14 files changed, 2518 insertions, 0 deletions
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE b/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE
new file mode 100644
index 0000000..a76efd7
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016, Adrien Descamps
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of yuv2rgb nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md b/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md
new file mode 100644
index 0000000..21191e9
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md
@@ -0,0 +1,63 @@
+From: https://github.com/descampsa/yuv2rgb
+# yuv2rgb
+C library for fast image conversion between yuv420p and rgb24.
+This is a simple library for optimized image conversion between YUV420p and rgb24.
+It was done mainly as an exercise to learn to use sse intrinsics, so there may still be room for optimization.
+For each conversion, a standard optimized C function and two SSE functions (for aligned and unaligned memory) are implemented.
+The sse version requires only SSE2, which is available on any reasonably recent CPU.
+The library also supports the three different YUV (YCrCb to be correct) color spaces that exist (see comments in code), and others can be added simply.
+There is a simple test program that converts a raw YUV file to RGB ppm format and measures the computation time.
+Optionally, it also compares the result and computation time with the ffmpeg implementation (that uses MMX), and with the IPP functions.
+To compile, simply do :
+    mkdir build
+    cd build
+    cmake -DCMAKE_BUILD_TYPE=Release ..
+    make
+The test program only supports raw YUV files for the YUV420 format, and ppm for the RGB24 format.
+To generate a raw yuv file, you can use avconv:
+    avconv -i example.jpg -c:v rawvideo -pix_fmt yuv420p example.yuv
+To generate the rgb file, you can use the ImageMagick convert program:
+    convert example.jpg example.ppm
+Then, for YUV420 to RGB24 conversion, use the test program like this:
+    ./test_yuv_rgb yuv2rgb image.yuv 4096 2160 image
+  
+The second and third parameters are the image width and height (which are needed because they are not stored in the raw YUV file), and the fourth parameter is the output filename template (several output files will be generated, named for example output_sse.ppm, output_av.ppm, etc.)
+Similarly, for RGB24 to YUV420 conversion:
+    ./test_yuv_rgb rgb2yuv image.ppm image
+On my computer, the test program on a 4K image gives the following for yuv2rgb:
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.630193 sec
+    Processing time (sse2_unaligned) : 0.704394 sec
+    Processing time (ffmpeg_unaligned) : 1.221432 sec
+    Processing time (ipp_unaligned) : 0.636274 sec
+    Processing time (sse2_aligned) : 0.606648 sec
+    Processing time (ffmpeg_aligned) : 1.227100 sec
+    Processing time (ipp_aligned) : 0.636951 sec
+And for rgb2yuv:
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.588675 sec
+    Processing time (sse2_unaligned) : 0.676625 sec
+    Processing time (ffmpeg_unaligned) : 3.385816 sec
+    Processing time (ipp_unaligned) : 0.593890 sec
+    Processing time (sse2_aligned) : 0.640630 sec
+    Processing time (ffmpeg_aligned) : 3.397952 sec
+    Processing time (ipp_aligned) : 0.579043 sec
+configuration : gcc 4.9.2, swscale 3.0.0, IPP 9.0.1, intel i7-5500U
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h
new file mode 100644
index 0000000..c359316
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h
@@ -0,0 +1,33 @@
+#ifndef YUV_RGB_H_
+#define YUV_RGB_H_
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+// There are a few slightly different variations of the YCbCr color space with different parameters that
+// change the conversion matrix.
+// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a 2 factor
+// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
+// is suboptimal for image quality, but by far the fastest method.
+// For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
+// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
+/*#include <stdint.h>*/
+// yuv to rgb, standard c implementation
+#include "yuv_rgb_std.h"
+// yuv to rgb, sse2 implementation
+#include "yuv_rgb_sse.h"
+// yuv to rgb, lsx implementation
+#include "yuv_rgb_lsx.h"
+#endif /* YUV_RGB_H_ */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h
new file mode 100644
index 0000000..a4ef8ea
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h
@@ -0,0 +1,15 @@
+#ifndef YUV_RGB_COMMON_H_
+#define YUV_RGB_COMMON_H_
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+// Selects the YCbCr variant (matrix and range) used for a conversion.
+// The enumerator order is significant: the value is used directly as an index
+// into the YUV2RGB[] and RGB2YUV[] parameter tables in yuv_rgb_internal.h, so
+// new entries must be added in the same position in both places.
+typedef enum
+{
+    YCBCR_601_FULL,       // table entry 0 (labelled "ITU-T T.871 (JPEG)" in the tables)
+    YCBCR_601_LIMITED,    // table entry 1 (labelled "ITU-R BT.601-7", y_shift 16)
+    YCBCR_709_FULL,       // table entry 2 (labelled "ITU-R BT.709-6 full range")
+    YCBCR_709_LIMITED,    // table entry 3 (labelled "ITU-R BT.709-6", y_shift 16)
+    YCBCR_2020_NCL_FULL,  // table entry 4 (labelled "ITU-R BT.2020 10-bit full range")
+} YCbCrType;
+#endif /* YUV_RGB_COMMON_H_ */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h
new file mode 100644
index 0000000..d5939ed
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h
@@ -0,0 +1,85 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "yuv_rgb.h"
+// Number of fractional bits used by the fixed-point conversion coefficients.
+#define PRECISION 6
+#define PRECISION_FACTOR (1<<PRECISION)
+// Fixed-point parameters for one RGB -> YUV conversion variant.
+// matrix[][] entries are the real coefficients multiplied by PRECISION_FACTOR.
+typedef struct
+{
+        uint8_t y_shift;
+        int16_t matrix[3][3];
+} RGB2YUVParam;
+// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
+// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
+// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
+// Fixed-point parameters for one YUV -> RGB conversion variant.
+// All *_factor fields are the real coefficients multiplied by PRECISION_FACTOR.
+typedef struct
+{
+        uint8_t y_shift;
+        int16_t y_factor;
+        int16_t v_r_factor;
+        int16_t u_g_factor;
+        int16_t v_g_factor;
+        int16_t u_b_factor;
+} YUV2RGBParam;
+// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
+// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
+// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 26451)
+#endif
+// Converts a non-negative real coefficient to fixed point with PRECISION
+// fractional bits, rounding to nearest (add 0.5, then truncate via the cast).
+// Negative coefficients are written as -V(x) in the tables so that the rounding
+// is applied to the magnitude. The argument and the whole expansion are
+// parenthesized so the macro is safe with compound arguments and in any
+// surrounding expression.
+// NOTE(review): V stays defined for every file that includes this header.
+#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5))
+// for ITU-T T.871, values can be found in section 7
+// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// for ITU-R BT.2020 values are assuming RGB is encoded using full 10-bit range ([0-1]<->[0-1023])
+// all values are rounded to the fourth decimal
+// Both tables are indexed by a YCbCrType value, so the entry order must match
+// the enumerator order in yuv_rgb_common.h.
+static const YUV2RGBParam YUV2RGB[] = {
+        // ITU-T T.871 (JPEG)
+        {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
+        // ITU-R BT.601-7
+        {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
+        // ITU-R BT.709-6 full range
+        {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.581), /*.u_g_factor=*/ -V(0.1881), /*.v_g_factor=*/ -V(0.47), /*.u_b_factor=*/ V(1.8629)},
+        // ITU-R BT.709-6
+        {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)},
+        // ITU-R BT.2020 10-bit full range
+        {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.4760), /*.u_g_factor=*/ -V(0.1647), /*.v_g_factor=*/ -V(0.5719), /*.u_b_factor=*/ V(1.8832) }
+};
+static const RGB2YUVParam RGB2YUV[] = {
+        // ITU-T T.871 (JPEG)
+        {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
+        // ITU-R BT.601-7
+        {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
+        // ITU-R BT.709-6 full range
+        {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.2126), V(0.7152), V(0.0722)}, {-V(0.1141), -V(0.3839), V(0.498)}, {V(0.498), -V(0.4524), -V(0.0457)}}},
+        // ITU-R BT.709-6
+        {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}},
+        // ITU-R BT.2020 10-bit full range
+        {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.2627), V(0.6780), V(0.0593)}, {-V(0.1395), -V(0.3600), V(0.4995)}, {V(0.4995), -V(0.4593), -V(0.0402)}}},
+};
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+/* The various layouts of YUV data we support */
+/* These values are compared against the YUV_FORMAT template parameter with
+ * preprocessor #if tests in the *_func.h templates, so they must stay distinct
+ * integer constants. */
+#define YUV_FORMAT_420  1
+#define YUV_FORMAT_422  2
+#define YUV_FORMAT_NV12 3
+/* The various formats of RGB pixel that we support */
+/* Likewise compared against the RGB_FORMAT template parameter with #if tests;
+ * must stay distinct and non-zero. */
+#define RGB_FORMAT_RGB565       1
+#define RGB_FORMAT_RGB24        2
+#define RGB_FORMAT_RGBA         3
+#define RGB_FORMAT_BGRA         4
+#define RGB_FORMAT_ARGB         5
+#define RGB_FORMAT_ABGR         6
+#define RGB_FORMAT_XBGR2101010 7
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c
new file mode 100644
index 0000000..250ff37
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c
@@ -0,0 +1,43 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "SDL_internal.h"
+#ifdef SDL_HAVE_YUV
+#include "yuv_rgb_lsx.h"
+#include "yuv_rgb_internal.h"
+#ifdef SDL_LSX_INTRINSICS
+// Each group below instantiates one converter: it sets the four template
+// parameters and then includes yuv_rgb_lsx_func.h, which defines a function
+// named LSX_FUNCTION_NAME and #undefs the parameters again at its end.
+//   LSX_FUNCTION_NAME - name of the generated LSX function
+//   STD_FUNCTION_NAME - scalar routine the generated function calls for the
+//                       pixels its 32-pixel-wide vector loop does not cover
+//   YUV_FORMAT        - input layout (only YUV_FORMAT_420 is implemented there)
+//   RGB_FORMAT        - output pixel format
+#define LSX_FUNCTION_NAME       yuv420_rgb24_lsx
+#define STD_FUNCTION_NAME       yuv420_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_lsx_func.h"
+#define LSX_FUNCTION_NAME       yuv420_rgba_lsx
+#define STD_FUNCTION_NAME       yuv420_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_lsx_func.h"
+#define LSX_FUNCTION_NAME       yuv420_bgra_lsx
+#define STD_FUNCTION_NAME       yuv420_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_lsx_func.h"
+#define LSX_FUNCTION_NAME       yuv420_argb_lsx
+#define STD_FUNCTION_NAME       yuv420_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_lsx_func.h"
+#define LSX_FUNCTION_NAME       yuv420_abgr_lsx
+#define STD_FUNCTION_NAME       yuv420_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_lsx_func.h"
+#endif  // SDL_LSX_INTRINSICS
+#endif // SDL_HAVE_YUV
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h
new file mode 100644
index 0000000..1347a31
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h
@@ -0,0 +1,36 @@
+#ifdef SDL_LSX_INTRINSICS
+#include "yuv_rgb_common.h"
+// yuv420 (planar) to packed rgb, lsx implementations.
+// All five functions share the same contract:
+//   width, height      - image size in pixels
+//   y, u, v            - start of the Y, U and V planes, in that order (this
+//                        matches the parameter order of the definitions
+//                        generated from yuv_rgb_lsx_func.h)
+//   y_stride           - byte distance between consecutive rows of the Y plane
+//   uv_stride          - byte distance between consecutive rows of the U and V planes
+//   rgb, rgb_stride    - destination buffer and its byte distance between rows
+//   yuv_type           - selects the conversion parameters
+void yuv420_rgb24_lsx(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgba_lsx(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_bgra_lsx(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_argb_lsx(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_abgr_lsx(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+#endif  //SDL_LSX_INTRINSICS
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
new file mode 100644
index 0000000..89d582a
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
@@ -0,0 +1,372 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include <lsxintrin.h>
+#if YUV_FORMAT == YUV_FORMAT_420
+// READ_Y loads one 128-bit vector (16 luma bytes) from y_ptr into `y`.
+// READ_UV loads one 128-bit vector each from u_ptr and v_ptr into
+// `u_temp` / `v_temp`. Both expand to complete statements and rely on those
+// variables being declared at the expansion site.
+// The last line of each macro must NOT end in a backslash: a dangling
+// continuation splices the following directive into the macro body.
+#define READ_Y(y_ptr)                                     \
+    y = __lsx_vld(y_ptr, 0);
+#define READ_UV                                           \
+    u_temp = __lsx_vld(u_ptr, 0);                         \
+    v_temp = __lsx_vld(v_ptr, 0);
+#else
+#error READ_UV unimplemented
+#endif
+// Interleaves four 8-bit channel planes into 4-byte pixels.
+// The "1" arguments (R1, G1, B1, A1) hold 16 pixels and produce RGB1..RGB4;
+// the "2" arguments hold the next 16 pixels and produce RGB5..RGB8, so the
+// eight outputs together cover 32 pixels (128 bytes).
+// The parameter names describe slots, not colors: callers pass the channels in
+// whatever order the target format needs.
+// NOTE(review): from the interleave order, each pixel's bytes look to be stored
+// in memory as A, B, G, R (slot names) - confirm against the LSX vilvl/vilvh docs.
+// The expansion is a braced block, so no trailing semicolon is needed.
+#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2,       \
+                     RGB3, RGB4, RGB5, RGB6, RGB7, RGB8)               \
+{                                       \
+    __m128i ab_l, ab_h, gr_l, gr_h;     \
+    ab_l = __lsx_vilvl_b(B1, A1);       \
+    ab_h = __lsx_vilvh_b(B1, A1);       \
+    gr_l = __lsx_vilvl_b(R1, G1);       \
+    gr_h = __lsx_vilvh_b(R1, G1);       \
+    RGB1 = __lsx_vilvl_h(gr_l, ab_l);   \
+    RGB2 = __lsx_vilvh_h(gr_l, ab_l);   \
+    RGB3 = __lsx_vilvl_h(gr_h, ab_h);   \
+    RGB4 = __lsx_vilvh_h(gr_h, ab_h);   \
+    ab_l = __lsx_vilvl_b(B2, A2);       \
+    ab_h = __lsx_vilvh_b(B2, A2);       \
+    gr_l = __lsx_vilvl_b(R2, G2);       \
+    gr_h = __lsx_vilvh_b(R2, G2);       \
+    RGB5 = __lsx_vilvl_h(gr_l, ab_l);   \
+    RGB6 = __lsx_vilvh_h(gr_l, ab_l);   \
+    RGB7 = __lsx_vilvl_h(gr_h, ab_h);   \
+    RGB8 = __lsx_vilvh_h(gr_h, ab_h);   \
+}
+// PACK_RGB24_32_STEP packs 16 pixels given as three byte planes (R, G, B) into
+// three consecutive 128-bit vectors (48 bytes) of 3-byte pixels, using the
+// byte-shuffle masks mask1..mask5 that must be in scope at the expansion site.
+// PACK_RGB24_32 applies the step twice to cover 32 pixels (RGB1..RGB6).
+// Both expand to complete statements. The final line of each macro carries no
+// backslash so that the next directive is not spliced into the macro body.
+#define PACK_RGB24_32_STEP(R, G, B, RGB1, RGB2, RGB3)        \
+    RGB1 = __lsx_vilvl_b(G, R);                              \
+    RGB1 = __lsx_vshuf_b(B, RGB1, mask1);                    \
+    RGB2 = __lsx_vshuf_b(B, G, mask2);                       \
+    RGB2 = __lsx_vshuf_b(R, RGB2, mask3);                    \
+    RGB3 = __lsx_vshuf_b(R, B, mask4);                       \
+    RGB3 = __lsx_vshuf_b(G, RGB3, mask5);
+#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)  \
+    PACK_RGB24_32_STEP(R1, G1, B1, RGB1, RGB2, RGB3);                              \
+    PACK_RGB24_32_STEP(R2, G2, B2, RGB4, RGB5, RGB6);
+// PACK_PIXEL turns the planar byte vectors produced by YUV2RGB_32
+// (r/g/b_8_<row><half>) into packed output vectors: rgb_1.. for the first row
+// followed by the vectors for the second row. It declares those rgb_N
+// variables itself, so it must be expanded inside a block scope.
+// For the 4-byte formats the alpha slot is filled from `a`, a vector built with
+// __lsx_vldi(0xFF); the channel argument order selects the output byte order.
+// No variant ends in a backslash, so the following #elif/#else directive is
+// not spliced into the macro body.
+#if RGB_FORMAT == RGB_FORMAT_RGB24
+#define PACK_PIXEL                                                             \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6;                          \
+    __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12;                       \
+    PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12,              \
+                  rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6)                    \
+    PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22,              \
+                  rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#else
+#error PACK_PIXEL unimplemented
+#endif
+// Stores two 128-bit vectors: in0 at pdst and in1 at pdst + stride.
+// pdst is a byte pointer in every use in this file, so stride is in bytes.
+// Expands to a braced block; pdst is evaluated twice.
+#define LSX_ST_UB2(in0, in1, pdst, stride)                      \
+{                                                               \
+    __lsx_vst(in0, pdst, 0);                                    \
+    __lsx_vst(in1, pdst + stride, 0);                           \
+}
+// SAVE_LINE1 / SAVE_LINE2 write the packed vectors created by PACK_PIXEL for
+// the first / second row to rgb_ptr1 / rgb_ptr2, 16 bytes per vector:
+//   RGB24      : 6 vectors per row (32 pixels * 3 bytes = 96 bytes)
+//   4-byte fmt : 8 vectors per row (32 pixels * 4 bytes = 128 bytes)
+// Neither the #if/#elif lines nor the last line of a macro may end in a
+// backslash, otherwise the next line is spliced into the condition or body.
+#if RGB_FORMAT == RGB_FORMAT_RGB24
+#define SAVE_LINE1                                              \
+    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16);                     \
+    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16);                \
+    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16);
+#define SAVE_LINE2                                              \
+    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr2, 16);                     \
+    LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2 + 32, 16);               \
+    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 64, 16);
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA ||  \
+    RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+#define SAVE_LINE1                                              \
+    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16);                     \
+    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16);                \
+    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16);                \
+    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr1 + 96, 16);
+#define SAVE_LINE2                                              \
+    LSX_ST_UB2(rgb_9,  rgb_10, rgb_ptr2, 16);                   \
+    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 32, 16);              \
+    LSX_ST_UB2(rgb_13, rgb_14, rgb_ptr2 + 64, 16);              \
+    LSX_ST_UB2(rgb_15, rgb_16, rgb_ptr2 + 96, 16);
+#else
+#error SAVE_LINE unimplemented
+#endif
+// Chroma contribution for 16 pixels from 8 (U, V) pairs held as 16-bit lanes:
+//   r = v*v2r   g = u*u2g + v*v2g   b = u*u2b
+// Each product is then duplicated lane-wise (interleaving a vector with
+// itself), so every chroma sample feeds two horizontally adjacent pixels:
+// R1/G1/B1 receive the low half, R2/G2/B2 the high half.
+// Uses r_temp, g_temp, b_temp and the v2r/u2g/v2g/u2b coefficient vectors from
+// the expansion site. Expands to complete statements; the last line carries no
+// backslash so nothing following is spliced into the macro.
+#define UV2RGB_16(U, V, R1, G1, B1, R2, G2, B2)     \
+    r_temp = __lsx_vmul_h(V, v2r);                  \
+    g_temp = __lsx_vmul_h(U, u2g);                  \
+    g_temp = __lsx_vmadd_h(g_temp, V, v2g);         \
+    b_temp = __lsx_vmul_h(U, u2b);                  \
+    R1     = __lsx_vilvl_h(r_temp, r_temp);         \
+    G1     = __lsx_vilvl_h(g_temp, g_temp);         \
+    B1     = __lsx_vilvl_h(b_temp, b_temp);         \
+    R2     = __lsx_vilvh_h(r_temp, r_temp);         \
+    G2     = __lsx_vilvh_h(g_temp, g_temp);         \
+    B2     = __lsx_vilvh_h(b_temp, b_temp);
+// Adds the luma term to the chroma contributions and drops the fixed-point
+// fraction, for two vectors of 16-bit lanes:
+//   Y = (Y - shift) * yf
+//   R = (R + Y) >> PRECISION, G = (G + Y) >> PRECISION, B = (B + Y) >> PRECISION
+// The shift is arithmetic, so negative intermediate values stay negative and
+// are clamped later by CLIP. Y1/Y2 are modified in place.
+// Uses the `shift` and `yf` vectors from the expansion site. Expands to
+// complete statements; the last line carries no backslash so the following
+// directive is not spliced into the macro.
+#define ADD_Y2RGB_16(Y1, Y2, R1, G1, B1, R2, G2, B2)        \
+    Y1 = __lsx_vsub_h(Y1, shift);                           \
+    Y2 = __lsx_vsub_h(Y2, shift);                           \
+    Y1 = __lsx_vmul_h(Y1, yf);                              \
+    Y2 = __lsx_vmul_h(Y2, yf);                              \
+    R1 = __lsx_vadd_h(R1, Y1);                              \
+    G1 = __lsx_vadd_h(G1, Y1);                              \
+    B1 = __lsx_vadd_h(B1, Y1);                              \
+    R2 = __lsx_vadd_h(R2, Y2);                              \
+    G2 = __lsx_vadd_h(G2, Y2);                              \
+    B2 = __lsx_vadd_h(B2, Y2);                              \
+    R1 = __lsx_vsrai_h(R1, PRECISION);                      \
+    G1 = __lsx_vsrai_h(G1, PRECISION);                      \
+    B1 = __lsx_vsrai_h(B1, PRECISION);                      \
+    R2 = __lsx_vsrai_h(R2, PRECISION);                      \
+    G2 = __lsx_vsrai_h(G2, PRECISION);                      \
+    B2 = __lsx_vsrai_h(B2, PRECISION);
+// Clamps the 16-bit lanes of six vectors, in place, before they are narrowed
+// to bytes: first a signed max against 0 removes negative values, then an
+// unsigned saturate limits the upper end.
+// NOTE(review): the immediate 7 presumably selects an 8-bit range (0..255) -
+// confirm against the LSX vsat documentation.
+// Expands to a braced block.
+#define CLIP(in0, in1, in2, in3, in4, in5)       \
+{                                                \
+    in0 = __lsx_vmaxi_h(in0, 0);                 \
+    in1 = __lsx_vmaxi_h(in1, 0);                 \
+    in2 = __lsx_vmaxi_h(in2, 0);                 \
+    in3 = __lsx_vmaxi_h(in3, 0);                 \
+    in4 = __lsx_vmaxi_h(in4, 0);                 \
+    in5 = __lsx_vmaxi_h(in5, 0);                 \
+    in0 = __lsx_vsat_hu(in0, 7);                 \
+    in1 = __lsx_vsat_hu(in1, 7);                 \
+    in2 = __lsx_vsat_hu(in2, 7);                 \
+    in3 = __lsx_vsat_hu(in3, 7);                 \
+    in4 = __lsx_vsat_hu(in4, 7);                 \
+    in5 = __lsx_vsat_hu(in5, 7);                 \
+}
+// Converts one tile of 32 pixels x 2 rows from YUV to planar 8-bit R, G, B.
+// It loads one vector of U and one of V (READ_UV), and four vectors of Y
+// (two rows x two 16-pixel halves). The chroma contribution of each half is
+// computed once and reused for both rows. Results are left in
+//   r_8_<row><half>, g_8_<row><half>, b_8_<row><half>   (row, half in {1, 2})
+// for PACK_PIXEL to consume.
+// The macro declares all of its working variables, so it must be expanded at
+// most once per block scope. It expects these names at the expansion site:
+// y_ptr1, y_ptr2, u_ptr, v_ptr, y_pixel_stride, zero, bias, shift, yf,
+// v2r, v2g, u2g, u2b.
+// The final line carries no backslash; a dangling continuation here would
+// splice the function definition that follows into the macro body.
+#define YUV2RGB_32                                            \
+    __m128i y, u_temp, v_temp;                                \
+    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21;   \
+    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22;   \
+    __m128i u, v, r_temp, g_temp, b_temp;                     \
+    __m128i r_1, g_1, b_1, r_2, g_2, b_2;                     \
+    __m128i y_1, y_2;                                         \
+    __m128i r_uv_1, g_uv_1, b_uv_1, r_uv_2, g_uv_2, b_uv_2;   \
+                                                              \
+    READ_UV                                                   \
+                                                              \
+    /* process first 16 pixels of first line */               \
+    u = __lsx_vilvl_b(zero, u_temp);                          \
+    v = __lsx_vilvl_b(zero, v_temp);                          \
+    u = __lsx_vsub_h(u, bias);                                \
+    v = __lsx_vsub_h(v, bias);                                \
+    UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2);            \
+    r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1;                 \
+    r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2;                 \
+    READ_Y(y_ptr1)                                            \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_11 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_11 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_11 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+    /* process first 16 pixels of second line */              \
+    r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1;                 \
+    r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2;                 \
+                                                              \
+    READ_Y(y_ptr2)                                            \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_21 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_21 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_21 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+    /* process last 16 pixels of first line */                \
+    u = __lsx_vilvh_b(zero, u_temp);                          \
+    v = __lsx_vilvh_b(zero, v_temp);                          \
+    u = __lsx_vsub_h(u, bias);                                \
+    v = __lsx_vsub_h(v, bias);                                \
+    UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2);            \
+    r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1;                 \
+    r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2;                 \
+    READ_Y(y_ptr1 + 16 * y_pixel_stride)                      \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_12 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_12 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_12 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+   /* process last 16 pixels of second line */                \
+    r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1;                 \
+    r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2;                 \
+                                                              \
+    READ_Y(y_ptr2 + 16 * y_pixel_stride)                      \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_22 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_22 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_22 = __lsx_vpickev_b(b_2, b_1);
+/*
+ * Template body of one LSX YUV -> RGB converter; the including file chooses
+ * the function name and formats through LSX_FUNCTION_NAME, STD_FUNCTION_NAME,
+ * YUV_FORMAT and RGB_FORMAT.
+ *
+ * width, height   image size in pixels
+ * Y, U, V         start of the three input planes
+ * Y_stride        byte distance between rows of the Y plane
+ * UV_stride       byte distance between rows of the U and V planes
+ * RGB, RGB_stride destination buffer and its byte distance between rows
+ * yuv_type        index into the YUV2RGB[] parameter table
+ *
+ * The vector loop converts tiles of 32 pixels x 2 rows. Everything it cannot
+ * cover is delegated to STD_FUNCTION_NAME: the last row when height is odd,
+ * and the rightmost (width % 32) columns of every row.
+ */
+void LSX_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y,
+                       const uint8_t *U, const uint8_t *V, uint32_t Y_stride,
+                       uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride,
+                       YCbCrType yuv_type)
+{
+    const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+    const int y_pixel_stride = 1;
+    const int uv_pixel_stride = 1;
+    const int uv_x_sample_interval = 2;
+    const int uv_y_sample_interval = 2;
+#endif
+    /* Every format is compared explicitly: a bare RGB_FORMAT_xxx operand is a
+       non-zero constant and would make the #elif true for any RGB_FORMAT,
+       leaving the #error below unreachable. */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+    const int rgb_pixel_stride = 2;
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+    const int rgb_pixel_stride = 3;
+    /* byte-shuffle selectors used by PACK_RGB24_32_STEP */
+    __m128i mask1 = {0x0504110302100100, 0x0A14090813070612};
+    __m128i mask2 = {0x1808170716061505, 0x00000000000A1909};
+    __m128i mask3 = {0x0504170302160100, 0x0A1A090819070618};
+    __m128i mask4 = {0x1E0D1D0C1C0B1B0A, 0x00000000000F1F0E};
+    __m128i mask5 = {0x05041C03021B0100, 0x0A1F09081E07061D};
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+    RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+    const int rgb_pixel_stride = 4;
+#else
+#error Unknown RGB pixel size
+#endif
+    uint32_t xpos, ypos;
+    /* conversion coefficients broadcast to every 16-bit lane */
+    __m128i v2r   = __lsx_vreplgr2vr_h(param->v_r_factor);
+    __m128i v2g   = __lsx_vreplgr2vr_h(param->v_g_factor);
+    __m128i u2g   = __lsx_vreplgr2vr_h(param->u_g_factor);
+    __m128i u2b   = __lsx_vreplgr2vr_h(param->u_b_factor);
+    __m128i bias  = __lsx_vreplgr2vr_h(128);
+    __m128i shift = __lsx_vreplgr2vr_h(param->y_shift);
+    __m128i yf    = __lsx_vreplgr2vr_h(param->y_factor);
+    __m128i zero  = __lsx_vldi(0);
+    /* Nothing to convert. This also keeps the unsigned "height - 1" terms
+       below from wrapping around when height is 0. */
+    if (width == 0 || height == 0) {
+        return;
+    }
+    if (width >= 32) {
+        /* two rows per iteration; stops before a trailing odd row */
+        for (ypos = 0; ypos < (height - (uv_y_sample_interval - 1)); ypos += uv_y_sample_interval) {
+            const uint8_t *y_ptr1 = Y + ypos * Y_stride,
+                          *y_ptr2 = Y + (ypos + 1) * Y_stride,
+                          *u_ptr  = U + (ypos/uv_y_sample_interval) * UV_stride,
+                          *v_ptr  = V + (ypos/uv_y_sample_interval) * UV_stride;
+            uint8_t *rgb_ptr1 = RGB + ypos * RGB_stride,
+                    *rgb_ptr2 = RGB + (ypos + 1) * RGB_stride;
+            /* width >= 32 here, so width - 31 cannot wrap */
+            for (xpos = 0; xpos < (width - 31); xpos += 32){
+                YUV2RGB_32
+                {
+                    PACK_PIXEL
+                    SAVE_LINE1
+                    if (uv_y_sample_interval > 1)
+                    {
+                        SAVE_LINE2
+                    }
+                }
+                y_ptr1   += 32 * y_pixel_stride;
+                y_ptr2   += 32 * y_pixel_stride;
+                u_ptr    += 32 * uv_pixel_stride/uv_x_sample_interval;
+                v_ptr    += 32 * uv_pixel_stride/uv_x_sample_interval;
+                rgb_ptr1 += 32 * rgb_pixel_stride;
+                rgb_ptr2 += 32 * rgb_pixel_stride;
+            }
+        }
+        /* odd height: the last row has no partner row, convert it with the scalar routine */
+        if (uv_y_sample_interval == 2 && ypos == (height - 1)) {
+            const uint8_t *y_ptr = Y + ypos * Y_stride,
+                          *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
+                          *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
+            uint8_t *rgb_ptr = RGB + ypos * RGB_stride;
+            STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+        }
+    }
+    {
+        /* columns already handled by the vector loop (0 when width < 32);
+           kept unsigned so the comparison with width has matching types */
+        const uint32_t converted = (width & ~(uint32_t)31);
+        if (converted != width)
+        {
+            const uint8_t *y_ptr = Y + converted * y_pixel_stride,
+                          *u_ptr = U + converted * uv_pixel_stride / uv_x_sample_interval,
+                          *v_ptr = V + converted * uv_pixel_stride / uv_x_sample_interval;
+            uint8_t *rgb_ptr = RGB + converted * rgb_pixel_stride;
+            STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+        }
+    }
+}
+// Drop the template parameters and every helper macro defined by this file so
+// that it can be included again with a different configuration.
+#undef LSX_FUNCTION_NAME
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef LSX_ALIGNED
+#undef LSX_ST_UB2
+#undef UV2RGB_16
+#undef ADD_Y2RGB_16
+#undef CLIP
+#undef PACK_RGB24_32_STEP
+#undef PACK_RGB24_32
+#undef PACK_PIXEL
+#undef PACK_RGBA_32
+#undef SAVE_LINE1
+#undef SAVE_LINE2
+#undef READ_Y
+#undef READ_UV
+#undef YUV2RGB_32
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c
new file mode 100644
index 0000000..37fe7e4
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c
@@ -0,0 +1,460 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "SDL_internal.h"
+#ifdef SDL_HAVE_YUV
+#include "yuv_rgb_internal.h"
+#ifdef SDL_SSE2_INTRINSICS
+/* SDL doesn't use these atm and compiling them adds seconds onto the build.  --ryan.
+#define SSE_FUNCTION_NAME       yuv420_rgb565_sse
+#define STD_FUNCTION_NAME       yuv420_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_rgb24_sse
+#define STD_FUNCTION_NAME       yuv420_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_rgba_sse
+#define STD_FUNCTION_NAME       yuv420_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_bgra_sse
+#define STD_FUNCTION_NAME       yuv420_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_argb_sse
+#define STD_FUNCTION_NAME       yuv420_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_abgr_sse
+#define STD_FUNCTION_NAME       yuv420_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_rgb565_sse
+#define STD_FUNCTION_NAME       yuv422_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_rgb24_sse
+#define STD_FUNCTION_NAME       yuv422_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_rgba_sse
+#define STD_FUNCTION_NAME       yuv422_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_bgra_sse
+#define STD_FUNCTION_NAME       yuv422_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_argb_sse
+#define STD_FUNCTION_NAME       yuv422_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_abgr_sse
+#define STD_FUNCTION_NAME       yuv422_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_rgb565_sse
+#define STD_FUNCTION_NAME       yuvnv12_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_rgb24_sse
+#define STD_FUNCTION_NAME       yuvnv12_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_rgba_sse
+#define STD_FUNCTION_NAME       yuvnv12_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_bgra_sse
+#define STD_FUNCTION_NAME       yuvnv12_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_argb_sse
+#define STD_FUNCTION_NAME       yuvnv12_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_abgr_sse
+#define STD_FUNCTION_NAME       yuvnv12_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+*/
+/* Unaligned ("sseu") YUV -> RGB converters.
+ * Each group below sets the four macros that yuv_rgb_sse_func.h asks for
+ * (SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT, RGB_FORMAT) and then
+ * includes that header once; the header is expected to emit one function
+ * named SSE_FUNCTION_NAME per inclusion. STD_FUNCTION_NAME names the matching
+ * non-SIMD routine (presumably the fallback for pixels the SIMD loop does not
+ * cover -- confirm in yuv_rgb_sse_func.h). None of these groups defines
+ * SSE_ALIGNED. */
+/* --- YUV_FORMAT_420 sources --- */
+#define SSE_FUNCTION_NAME       yuv420_rgb565_sseu
+#define STD_FUNCTION_NAME       yuv420_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_rgb24_sseu
+#define STD_FUNCTION_NAME       yuv420_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_rgba_sseu
+#define STD_FUNCTION_NAME       yuv420_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_bgra_sseu
+#define STD_FUNCTION_NAME       yuv420_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_argb_sseu
+#define STD_FUNCTION_NAME       yuv420_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv420_abgr_sseu
+#define STD_FUNCTION_NAME       yuv420_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+/* --- YUV_FORMAT_422 sources --- */
+#define SSE_FUNCTION_NAME       yuv422_rgb565_sseu
+#define STD_FUNCTION_NAME       yuv422_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_rgb24_sseu
+#define STD_FUNCTION_NAME       yuv422_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_rgba_sseu
+#define STD_FUNCTION_NAME       yuv422_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_bgra_sseu
+#define STD_FUNCTION_NAME       yuv422_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_argb_sseu
+#define STD_FUNCTION_NAME       yuv422_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuv422_abgr_sseu
+#define STD_FUNCTION_NAME       yuv422_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+/* --- YUV_FORMAT_NV12 sources --- */
+#define SSE_FUNCTION_NAME       yuvnv12_rgb565_sseu
+#define STD_FUNCTION_NAME       yuvnv12_rgb565_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_rgb24_sseu
+#define STD_FUNCTION_NAME       yuvnv12_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_rgba_sseu
+#define STD_FUNCTION_NAME       yuvnv12_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_bgra_sseu
+#define STD_FUNCTION_NAME       yuvnv12_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_argb_sseu
+#define STD_FUNCTION_NAME       yuvnv12_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+#define SSE_FUNCTION_NAME       yuvnv12_abgr_sseu
+#define STD_FUNCTION_NAME       yuvnv12_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+/* SDL doesn't use these atm and compiling them adds seconds onto the build.  --ryan.
+#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
+R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
+G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
+G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
+B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
+B2 = _mm_unpackhi_epi8(RGB3, RGB6);
+#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+RGB1 = _mm_unpacklo_epi8(R1, G2); \
+RGB2 = _mm_unpackhi_epi8(R1, G2); \
+RGB3 = _mm_unpacklo_epi8(R2, B1); \
+RGB4 = _mm_unpackhi_epi8(R2, B1); \
+RGB5 = _mm_unpacklo_epi8(G1, B2); \
+RGB6 = _mm_unpackhi_epi8(G1, B2); \
+#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+#define RGB2YUV_16(R, G, B, Y, U, V) \
+Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
+                _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
+Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
+Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \
+Y = _mm_srai_epi16(Y, PRECISION); \
+U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
+                _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
+U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
+U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \
+U = _mm_srai_epi16(U, PRECISION); \
+V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
+                _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
+V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
+V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \
+V = _mm_srai_epi16(V, PRECISION);
+*/
+#if 0  // SDL doesn't use these atm and compiling them adds seconds onto the build.  --ryan.
+/* RGB2YUV_32: statement-sequence macro forming one inner-loop step of the
+ * (commented-out) rgb24_yuv420_sse / rgb24_yuv420_sseu loops below.
+ *
+ * Per expansion it
+ *   - loads 96 bytes from each of rgb_ptr1 and rgb_ptr2 (offsets 0..80, six
+ *     16-byte loads per line),
+ *   - stores 32 bytes to each of y_ptr1 and y_ptr2 (offsets 0 and 16),
+ *   - stores 16 bytes to each of u_ptr and v_ptr, after averaging the chroma
+ *     values vertically (line 1 vs line 2) and then horizontally.
+ *
+ * It declares its own __m128i locals, so it can appear only once per scope.
+ * The expansion site must provide rgb_ptr1, rgb_ptr2, y_ptr1, y_ptr2, u_ptr,
+ * v_ptr, LOAD_SI128 and SAVE_SI128, plus `param` (used through RGB2YUV_16).
+ *
+ * NOTE(review): UNPACK_RGB24_32 and RGB2YUV_16, both invoked here, live in
+ * the commented-out section above; switching this "#if 0" to "#if 1" alone is
+ * therefore not enough for a use of this macro to compile. */
+#define RGB2YUV_32 \
+        __m128i r1, r2, b1, b2, g1, g2; \
+        __m128i r_16, g_16, b_16; \
+        __m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
+        __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
+                rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
+                rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
+                rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
+                rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
+                rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
+        /* unpack rgb24 data to r, g and b data in separate channels*/ \
+        UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+        /* process pixels of first line */ \
+        r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+        g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+        b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+        r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+        g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+        b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+        y = _mm_packus_epi16(y1_16, y2_16); \
+        u1 = _mm_packus_epi16(u1_16, u2_16); \
+        v1 = _mm_packus_epi16(v1_16, v2_16); \
+        /* save Y values */ \
+        SAVE_SI128((__m128i*)(y_ptr1), y); \
+        /* process pixels of second line */ \
+        r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+        g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+        b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+        r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+        g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+        b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+        y = _mm_packus_epi16(y1_16, y2_16); \
+        u2 = _mm_packus_epi16(u1_16, u2_16); \
+        v2 = _mm_packus_epi16(v1_16, v2_16); \
+        /* save Y values */ \
+        SAVE_SI128((__m128i*)(y_ptr2), y); \
+        /* vertical subsampling of u/v values */ \
+        u1_tmp = _mm_avg_epu8(u1, u2); \
+        v1_tmp = _mm_avg_epu8(v1, v2); \
+        /* do the same again with next data */ \
+        rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
+        rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
+        rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
+        rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
+        rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
+        rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
+        /* unpack rgb24 data to r, g and b data in separate channels*/ \
+        UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+        /* process pixels of first line */ \
+        r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+        g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+        b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+        r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+        g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+        b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+        y = _mm_packus_epi16(y1_16, y2_16); \
+        u1 = _mm_packus_epi16(u1_16, u2_16); \
+        v1 = _mm_packus_epi16(v1_16, v2_16); \
+        /* save Y values */ \
+        SAVE_SI128((__m128i*)(y_ptr1+16), y); \
+        /* process pixels of second line */ \
+        r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+        g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+        b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+        r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+        g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+        b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+        RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+        y = _mm_packus_epi16(y1_16, y2_16); \
+        u2 = _mm_packus_epi16(u1_16, u2_16); \
+        v2 = _mm_packus_epi16(v1_16, v2_16); \
+        /* save Y values */ \
+        SAVE_SI128((__m128i*)(y_ptr2+16), y); \
+        /* vertical subsampling of u/v values */ \
+        u2_tmp = _mm_avg_epu8(u1, u2); \
+        v2_tmp = _mm_avg_epu8(v1, v2); \
+        /* horizontal subsampling of u/v values */ \
+        u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
+        v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
+        u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
+        v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
+        u1 = _mm_avg_epu8(u1, u2); \
+        v1 = _mm_avg_epu8(v1, v2); \
+        SAVE_SI128((__m128i*)(u_ptr), u1); \
+        SAVE_SI128((__m128i*)(v_ptr), v1);
+#endif
+/* SDL doesn't use these atm and compiling them adds seconds onto the build.  --ryan.
+void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height,
+        const uint8_t *RGB, uint32_t RGB_stride,
+        uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+        YCbCrType yuv_type)
+{
+        #define LOAD_SI128 _mm_load_si128
+        #define SAVE_SI128 _mm_stream_si128
+        const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+        uint32_t xpos, ypos;
+        for(ypos=0; ypos<(height-1); ypos+=2)
+        {
+                const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+                        *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
+                uint8_t *y_ptr1=Y+ypos*Y_stride,
+                        *y_ptr2=Y+(ypos+1)*Y_stride,
+                        *u_ptr=U+(ypos/2)*UV_stride,
+                        *v_ptr=V+(ypos/2)*UV_stride;
+                for(xpos=0; xpos<(width-31); xpos+=32)
+                {
+                        RGB2YUV_32
+                        rgb_ptr1+=96;
+                        rgb_ptr2+=96;
+                        y_ptr1+=32;
+                        y_ptr2+=32;
+                        u_ptr+=16;
+                        v_ptr+=16;
+                }
+        }
+        #undef LOAD_SI128
+        #undef SAVE_SI128
+}
+void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height,
+        const uint8_t *RGB, uint32_t RGB_stride,
+        uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+        YCbCrType yuv_type)
+{
+        #define LOAD_SI128 _mm_loadu_si128
+        #define SAVE_SI128 _mm_storeu_si128
+        const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+        uint32_t xpos, ypos;
+        for(ypos=0; ypos<(height-1); ypos+=2)
+        {
+                const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+                        *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
+                uint8_t *y_ptr1=Y+ypos*Y_stride,
+                        *y_ptr2=Y+(ypos+1)*Y_stride,
+                        *u_ptr=U+(ypos/2)*UV_stride,
+                        *v_ptr=V+(ypos/2)*UV_stride;
+                for(xpos=0; xpos<(width-31); xpos+=32)
+                {
+                        RGB2YUV_32
+                        rgb_ptr1+=96;
+                        rgb_ptr2+=96;
+                        y_ptr1+=32;
+                        y_ptr2+=32;
+                        u_ptr+=16;
+                        v_ptr+=16;
+                }
+        }
+        #undef LOAD_SI128
+        #undef SAVE_SI128
+}
+*/
+#endif // SDL_SSE2_INTRINSICS
+#endif // SDL_HAVE_YUV
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h
new file mode 100644
index 0000000..bfad856
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h
@@ -0,0 +1,241 @@
+#ifdef SDL_SSE2_INTRINSICS
+#include "yuv_rgb_common.h"
+// yuv to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+// Names follow <yuv layout>_<rgb layout>_sse; all 18 variants share one
+// parameter list (image size, the three source plane pointers with their
+// strides, the destination pointer with its stride, and the YCbCrType).
+// NOTE(review): the matching instantiations in yuv_rgb_sse.c are currently
+// commented out, so these prototypes appear to have no definition there --
+// verify before calling any of them.
+void yuv420_rgb565_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgb24_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgba_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_bgra_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_argb_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_abgr_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb565_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb24_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgba_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_bgra_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_argb_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_abgr_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb565_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb24_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgba_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_bgra_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_argb_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_abgr_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+// yuv to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+// Names follow <yuv layout>_<rgb layout>_sseu. These are the variants that
+// yuv_rgb_sse.c instantiates (one inclusion of yuv_rgb_sse_func.h each).
+void yuv420_rgb565_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgb24_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgba_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_bgra_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_argb_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_abgr_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb565_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb24_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgba_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_bgra_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_argb_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_abgr_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb565_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb24_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgba_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_bgra_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_argb_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_abgr_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+// rgb to yuv, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+// NOTE(review): the body of this function is inside a commented-out section
+// of yuv_rgb_sse.c, so the symbol appears to be declared but not defined.
+void rgb24_yuv420_sse(
+        uint32_t width, uint32_t height,
+        const uint8_t *rgb, uint32_t rgb_stride,
+        uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        YCbCrType yuv_type);
+// rgb to yuv, sse implementation
+// pointers do not need to be 16 byte aligned
+// NOTE(review): likewise commented out in yuv_rgb_sse.c at present.
+void rgb24_yuv420_sseu(
+        uint32_t width, uint32_t height,
+        const uint8_t *rgb, uint32_t rgb_stride,
+        uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        YCbCrType yuv_type);
+#endif
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h
new file mode 100644
index 0000000..cbd751d
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h
@@ -0,0 +1,529 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+/* You need to define the following macros before including this file:
+        SSE_FUNCTION_NAME
+        STD_FUNCTION_NAME
+        YUV_FORMAT
+        RGB_FORMAT
+*/
+/* You may define the following macro, which affects generated code:
+        SSE_ALIGNED
+*/
+#ifdef SSE_ALIGNED
+/* Both branches currently select the unaligned intrinsics: unaligned instructions seemed at least as fast, even on aligned data. The aligned load / streaming store pair is kept disabled below for reference. */
+/*
+#define LOAD_SI128 _mm_load_si128
+#define SAVE_SI128 _mm_stream_si128
+*/
+#define LOAD_SI128 _mm_loadu_si128 /* 128-bit load used by the READ_* macros */
+#define SAVE_SI128 _mm_storeu_si128 /* 128-bit store used by the SAVE_LINE* macros */
+#else
+#define LOAD_SI128 _mm_loadu_si128 /* identical to the SSE_ALIGNED branch */
+#define SAVE_SI128 _mm_storeu_si128
+#endif
+/* UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2)
+ * Computes the chroma contribution to red, green and blue for the eight 16-bit
+ * chroma lanes in U and V (the caller has already added -128 to them), using the
+ * fixed-point factors stored in `param`.
+ * Each product is then duplicated into two adjacent lanes (a register unpacked
+ * with itself): R1/G1/B1 receive products 0..3 twice each, R2/G2/B2 products
+ * 4..7 twice each, so that one chroma sample serves two horizontal pixels.
+ * Expects `param`, `r_tmp`, `g_tmp` and `b_tmp` to exist at the expansion site.
+ * The last line deliberately has no trailing backslash: a continuation there
+ * would splice whatever follows this definition into the macro body. */
+#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
+        r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
+        g_tmp = _mm_add_epi16( \
+                _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
+                _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
+        b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
+        R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
+        G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
+        B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
+        R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
+        G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
+        B2 = _mm_unpackhi_epi16(b_tmp, b_tmp);
+/* ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2)
+ * Adds the luma term to the chroma contributions produced by UV2RGB_16.
+ * Y1 and Y2 (eight 16-bit luma lanes each) are overwritten with
+ * (Y - param->y_shift) * param->y_factor; every colour register then becomes
+ * (colour + luma) >> PRECISION using an arithmetic shift, i.e. the result is
+ * still signed 16-bit and is saturated to 8 bits later by the caller.
+ * Expects `param` to exist at the expansion site.
+ * The last line deliberately has no trailing backslash so that the following
+ * directive is not spliced into this macro. */
+#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
+        Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+        Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+        \
+        R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
+        G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
+        B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
+        R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
+        G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
+        B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION);
+#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) /* packs 32 pixels of 8-bit R/G/B (16 per input register) into 32 16-bit RGB565 words: RGB1/RGB2 come from R1,G1,B1 and RGB3/RGB4 from R2,G2,B2 */ \
+{ \
+        __m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
+\
+        red_mask = _mm_set1_epi16((unsigned short)0xF800); /* bits 11..15 of every 16-bit lane */ \
+        RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); /* zero is the low byte, so red sits in bits 8..15; the mask keeps its top five bits */ \
+        RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
+        RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
+        RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
+        tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); /* green widened to 16 bits, reduced to its top six bits, moved to bits 5..10 */ \
+        tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
+        tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
+        tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
+        RGB1 = _mm_or_si128(RGB1, tmp1); \
+        RGB2 = _mm_or_si128(RGB2, tmp2); \
+        RGB3 = _mm_or_si128(RGB3, tmp3); \
+        RGB4 = _mm_or_si128(RGB4, tmp4); \
+        tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); /* blue widened to 16 bits and reduced to its top five bits, left in bits 0..4 */ \
+        tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
+        tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
+        tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
+        RGB1 = _mm_or_si128(RGB1, tmp1); \
+        RGB2 = _mm_or_si128(RGB2, tmp2); \
+        RGB3 = _mm_or_si128(RGB3, tmp3); \
+        RGB4 = _mm_or_si128(RGB4, tmp4); \
+}
+/* PACK_RGB24_32_STEP1: one stage of the byte shuffle used by PACK_RGB24_32.
+ * Treating each source register as eight 16-bit lanes, RGB1..RGB3 collect the
+ * low byte of every lane of (R1,R2), (G1,G2) and (B1,B2) respectively, and
+ * RGB4..RGB6 collect the high byte of every lane of the same pairs.
+ * The last line deliberately has no trailing backslash so that the following
+ * directive is not spliced into this macro. */
+#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
+RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
+RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
+RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
+RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
+RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8));
+/* PACK_RGB24_32_STEP2: the companion stage of PACK_RGB24_32_STEP1, shuffling
+ * in the opposite direction (from RGB1..RGB6 back into R1,R2,G1,G2,B1,B2).
+ * R1, R2 and G1 collect the low byte of every 16-bit lane of (RGB1,RGB2),
+ * (RGB3,RGB4) and (RGB5,RGB6); G2, B1 and B2 collect the high bytes of the
+ * same pairs. Note that it overwrites its "input" colour registers.
+ * The last line deliberately has no trailing backslash so that the following
+ * directive is not spliced into this macro. */
+#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
+R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
+G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
+G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
+B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
+B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8));
+/* PACK_RGB24_32: turns 32 pixels held as separate 8-bit R, G and B registers
+ * (R1/G1/B1 = 16 pixels, R2/G2/B2 = the next 16) into six output registers
+ * RGB1..RGB6 (96 bytes) by alternating the two shuffle stages five times.
+ * The outputs are stored back to back by SAVE_LINE*, so they are presumably in
+ * interleaved R,G,B byte order (not re-derived here; confirm if it matters).
+ * The colour input registers are clobbered in the process.
+ * The last line deliberately has no trailing backslash so that the following
+ * directive is not spliced into this macro. */
+#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)
+#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) /* interleaves four 8-bit channel registers into 32-bit pixels; in memory each pixel is the byte sequence A,B,G,R (macro parameter names), 4 pixels per output register */ \
+{ \
+        __m128i lo_ab, hi_ab, lo_gr, hi_gr; \
+\
+        lo_ab = _mm_unpacklo_epi8( A1, B1 ); /* bytes A,B,A,B,... for pixels 0..7 */ \
+        hi_ab = _mm_unpackhi_epi8( A1, B1 ); /* same for pixels 8..15 */ \
+        lo_gr = _mm_unpacklo_epi8( G1, R1 ); /* bytes G,R,G,R,... for pixels 0..7 */ \
+        hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
+        RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); /* pixels 0..3 */ \
+        RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); /* pixels 4..7 */ \
+        RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); /* pixels 8..11 */ \
+        RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); /* pixels 12..15 */ \
+\
+        lo_ab = _mm_unpacklo_epi8( A2, B2 ); /* second group of 16 pixels, same scheme */ \
+        hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
+        lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
+        hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
+        RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
+        RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
+        RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
+        RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
+}
+/* PACK_PIXEL: declares the rgb_N output registers needed by the selected
+ * RGB_FORMAT and fills them from the 8-bit colour registers produced by
+ * YUV2RGB_32 (r_8_11/r_8_12 = first/last 16 pixels of line 1, r_8_21/r_8_22 =
+ * first/last 16 pixels of line 2, likewise for g and b).
+ * Because it starts with declarations it has to be expanded at the beginning
+ * of a block.
+ * For the 32-bit formats the channel registers are passed to PACK_RGBA_32 in a
+ * permuted order; PACK_RGBA_32 emits its parameters as the memory byte
+ * sequence A,B,G,R, which gives the per-pixel byte orders noted below.
+ * The final line of every variant deliberately has no trailing backslash:
+ * a continuation there would splice the following #elif/#else into the macro
+ * and break the format selection. */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+        \
+        PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
+        \
+        PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8)
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
+        __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
+        \
+        PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
+        \
+        PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+/* memory byte order per pixel: a, b, g, r */
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+        __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+        __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
+        \
+        PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+        \
+        PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+/* memory byte order per pixel: a, r, g, b */
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+        __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+        __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
+        \
+        PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+        \
+        PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+/* memory byte order per pixel: b, g, r, a */
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+        __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+        __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
+        \
+        PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+        \
+        PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+/* memory byte order per pixel: r, g, b, a */
+#define PACK_PIXEL \
+        __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+        __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+        __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
+        \
+        PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+        \
+        PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)
+#else
+#error PACK_PIXEL unimplemented
+#endif
+/* SAVE_LINE1 / SAVE_LINE2: store the registers filled by PACK_PIXEL for the
+ * first / second output line through rgb_ptr1 / rgb_ptr2, 16 bytes per store:
+ *   RGB565     4 stores =  64 bytes = 32 pixels * 2 bytes
+ *   RGB24      6 stores =  96 bytes = 32 pixels * 3 bytes
+ *   32-bit     8 stores = 128 bytes = 32 pixels * 4 bytes
+ * The final line of every macro deliberately has no trailing backslash:
+ * a continuation there would splice the following #define/#elif/#else into
+ * the macro body. */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+#define SAVE_LINE1 \
+        SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4);
+#define SAVE_LINE2 \
+        SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8);
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+#define SAVE_LINE1 \
+        SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6);
+#define SAVE_LINE2 \
+        SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12);
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+#define SAVE_LINE1 \
+        SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
+        SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8);
+#define SAVE_LINE2 \
+        SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
+        SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16);
+#else
+#error SAVE_LINE unimplemented
+#endif
+/* READ_Y(y_ptr) / READ_UV: load 16 luma samples into `y`, respectively 16 U
+ * and 16 V samples into `u` and `v`, for the selected YUV_FORMAT.
+ *   YUV_FORMAT_420   samples are contiguous: one 16-byte load each.
+ *   YUV_FORMAT_422   READ_Y loads 32 bytes and keeps the low byte of every
+ *                    16-bit pair; READ_UV loads 64 bytes from each of u_ptr
+ *                    and v_ptr and keeps the lowest byte of every 32-bit group.
+ *   YUV_FORMAT_NV12  luma is contiguous; READ_UV loads 32 bytes from each of
+ *                    u_ptr and v_ptr and keeps the low byte of every 16-bit
+ *                    pair.
+ * The "shift left, then shift right" pairs clear the unwanted upper bytes of
+ * each lane before the saturating packs.
+ * `y`, `u`, `v`, `u_ptr` and `v_ptr` must exist at the expansion site.
+ * The single-statement variants deliberately end without a trailing
+ * backslash: a continuation there would splice the following #define/#elif
+ * into the macro body. */
+#if YUV_FORMAT == YUV_FORMAT_420
+#define READ_Y(y_ptr) \
+        y = LOAD_SI128((const __m128i*)(y_ptr));
+#define READ_UV \
+        u = LOAD_SI128((const __m128i*)(u_ptr)); \
+        v = LOAD_SI128((const __m128i*)(v_ptr));
+#elif YUV_FORMAT == YUV_FORMAT_422
+#define READ_Y(y_ptr) \
+{ \
+        __m128i y1, y2; \
+        y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
+        y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
+        y = _mm_packus_epi16(y1, y2); \
+}
+#define READ_UV \
+{ \
+        __m128i u1, u2, u3, u4, v1, v2, v3, v4; \
+        u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
+        u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
+        u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
+        u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
+        u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
+        v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
+        v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
+        v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
+        v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
+        v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
+}
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+#define READ_Y(y_ptr) \
+        y = LOAD_SI128((const __m128i*)(y_ptr));
+#define READ_UV \
+{ \
+        __m128i u1, u2, v1, v2; \
+        u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
+        u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
+        u = _mm_packus_epi16(u1, u2); \
+        v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
+        v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
+        v = _mm_packus_epi16(v1, v2); \
+}
+#else
+#error READ_UV unimplemented
+#endif
+#define YUV2RGB_32 \
+        __m128i r_tmp, g_tmp, b_tmp; \
+        __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
+        __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
+        __m128i y_16_1, y_16_2; \
+        __m128i y, u, v, u_16, v_16; \
+    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
+    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
+        \
+        READ_UV \
+        \
+        /* process first 16 pixels of first line */\
+        u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
+        v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
+        u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+        v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+        \
+        UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+        r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+        \
+        READ_Y(y_ptr1) \
+        y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+        y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+        \
+        ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        \
+        r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
+        g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
+        b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
+        \
+        /* process first 16 pixels of second line */\
+        r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+        r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+        \
+        READ_Y(y_ptr2) \
+        y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+        y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+        \
+        ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        \
+        r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
+        g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
+        b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
+        \
+        /* process last 16 pixels of first line */\
+        u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
+        v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
+        u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+        v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+        \
+        UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+        r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+        \
+        READ_Y(y_ptr1+16*y_pixel_stride) \
+        y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+        y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+        \
+        ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        \
+        r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
+        g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
+        b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
+        \
+        /* process last 16 pixels of second line */\
+        r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+        r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+        \
+        READ_Y(y_ptr2+16*y_pixel_stride) \
+        y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+        y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+        \
+        ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+        \
+        r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
+        g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
+        b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
+        \
+/* Template body of one SSE2 YUV -> RGB converter. The including file chooses
+ * the function name, the scalar fallback and the formats through
+ * SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT and RGB_FORMAT.
+ *
+ *   width, height        image size in pixels
+ *   Y, U, V              source pointers (layout depends on YUV_FORMAT)
+ *   Y_stride, UV_stride  distance in bytes between consecutive source rows
+ *   RGB, RGB_stride      destination and its row pitch in bytes
+ *   yuv_type             index into the YUV2RGB[] coefficient table
+ *
+ * The SSE loop converts tiles of 32 pixels (two rows at a time when chroma is
+ * vertically subsampled). Rows and columns the SSE loop must not or cannot
+ * touch are delegated to STD_FUNCTION_NAME: the last row, the right-hand
+ * remainder of each row, and for NV12 the last 32-pixel tile.
+ * An image with a zero dimension is a no-op. */
+void SDL_TARGETING("sse2") SSE_FUNCTION_NAME(uint32_t width, uint32_t height,
+        const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+        uint8_t *RGB, uint32_t RGB_stride,
+        YCbCrType yuv_type)
+{
+        const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+        const int y_pixel_stride = 1;
+        const int uv_pixel_stride = 1;
+        const int uv_x_sample_interval = 2;
+        const int uv_y_sample_interval = 2;
+#elif YUV_FORMAT == YUV_FORMAT_422
+        const int y_pixel_stride = 2;
+        const int uv_pixel_stride = 4;
+        const int uv_x_sample_interval = 2;
+        const int uv_y_sample_interval = 1;
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+        const int y_pixel_stride = 1;
+        const int uv_pixel_stride = 2;
+        const int uv_x_sample_interval = 2;
+        const int uv_y_sample_interval = 2;
+#endif
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+        const int rgb_pixel_stride = 2;
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+        const int rgb_pixel_stride = 3;
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+        const int rgb_pixel_stride = 4;
+#else
+#error Unknown RGB pixel size
+#endif
+#if YUV_FORMAT == YUV_FORMAT_NV12
+        /* For NV12 formats (where U/V are interleaved),
+         * SSE READ_UV does an invalid read access at the very last pixel.
+         * As a workaround, make sure not to decode the last column using assembly but with the STD fallback path.
+         * see https://github.com/libsdl-org/SDL/issues/4841
+         */
+        const int fix_read_nv12 = ((width & 31) == 0);
+#else
+        const int fix_read_nv12 = 0;
+#endif
+#if YUV_FORMAT == YUV_FORMAT_422
+        /* Avoid invalid read on last line: YUV2RGB_32 always reads a second luma row (y_ptr2) */
+        const int fix_read_422 = 1;
+#else
+        const int fix_read_422 = 0;
+#endif
+        /* The bounds below are computed in unsigned arithmetic: with height == 0
+         * the row bound wraps around, and with width == 0 the NV12 fix-up makes
+         * `converted` wrap around, both of which would lead to out-of-bounds
+         * accesses. An empty image needs no work, so leave early. */
+        if (width == 0 || height == 0) {
+                return;
+        }
+        if (width >= 32) {
+                uint32_t xpos, ypos;
+                for(ypos=0; ypos<(height-(uv_y_sample_interval-1)) - fix_read_422; ypos+=uv_y_sample_interval)
+                {
+                        const uint8_t *y_ptr1=Y+ypos*Y_stride,
+                                *y_ptr2=Y+(ypos+1)*Y_stride,
+                                *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+                                *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+
+                        uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+                                *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
+
+                        /* width >= 32 here, so (width-31) cannot wrap */
+                        for(xpos=0; xpos<(width-31) - fix_read_nv12; xpos+=32)
+                        {
+                                YUV2RGB_32
+                                {
+                                        PACK_PIXEL
+                                        SAVE_LINE1
+                                        /* the second line is only stored when chroma is shared between two rows */
+                                        if (uv_y_sample_interval > 1)
+                                        {
+                                                SAVE_LINE2
+                                        }
+                                }
+                                y_ptr1+=32*y_pixel_stride;
+                                y_ptr2+=32*y_pixel_stride;
+                                u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+                                v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+                                rgb_ptr1+=32*rgb_pixel_stride;
+                                rgb_ptr2+=32*rgb_pixel_stride;
+                        }
+                }
+                /* 4:2:2: the row excluded from the SSE loop is converted by the scalar fallback */
+                if (fix_read_422) {
+                        const uint8_t *y_ptr=Y+ypos*Y_stride,
+                                *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+                                *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+                        uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
+                        STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+                        ypos += uv_y_sample_interval;
+                }
+                /* Catch the last line, if needed */
+                if (uv_y_sample_interval == 2 && ypos == (height-1))
+                {
+                        const uint8_t *y_ptr=Y+ypos*Y_stride,
+                                *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+                                *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+
+                        uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
+                        STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+                }
+        }
+        /* Catch the right column, if needed */
+        {
+                /* number of leading pixels per row already handled by the SSE loop */
+                uint32_t converted = (width & ~31);
+                if (fix_read_nv12) {
+                        /* width is a non-zero multiple of 32 here, so this cannot wrap */
+                        converted -= 32;
+                }
+                if (converted != width)
+                {
+                        const uint8_t *y_ptr=Y+converted*y_pixel_stride,
+                                *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
+                                *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
+
+                        uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
+                        STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+                }
+        }
+}
+/* Undefine the template parameters and every helper macro so that this file
+ * can be included again with a different configuration. */
+#undef SSE_FUNCTION_NAME
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef SSE_ALIGNED
+#undef LOAD_SI128
+#undef SAVE_SI128
+#undef UV2RGB_16
+#undef ADD_Y2RGB_16
+/* PACK_RGB565_32 was the only helper left defined after the template ended */
+#undef PACK_RGB565_32
+#undef PACK_RGB24_32_STEP1
+#undef PACK_RGB24_32_STEP2
+#undef PACK_RGB24_32
+#undef PACK_RGBA_32
+#undef PACK_PIXEL
+#undef SAVE_LINE1
+#undef SAVE_LINE2
+#undef READ_Y
+#undef READ_UV
+#undef YUV2RGB_32
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c
new file mode 100644
index 0000000..0fa900d
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c
@@ -0,0 +1,200 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "SDL_internal.h"
+#ifdef SDL_HAVE_YUV
+#include "yuv_rgb_internal.h"
+// divide by PRECISION_FACTOR (done as a right shift by PRECISION; assumes PRECISION_FACTOR == 1<<PRECISION, both defined outside this file) and clamp to [0:255] interval
+// input must be in the [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range; the final "& 511" only keeps the table index in bounds, so inputs outside that range wrap around instead of saturating
+static uint8_t clampU8(int32_t v)
+{
+    static const uint8_t lut[512] = /* zeros for negative results, then the identity ramp 0..255, then 255 for results above 255 */
+            {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+             0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
+             47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,
+             91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+             126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,
+             159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
+             192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+             225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
+             255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+             255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+             255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+             255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+            };
+    return lut[((v+128*PRECISION_FACTOR)>>PRECISION)&511]; /* the +128*PRECISION_FACTOR bias makes table index 128 correspond to a scaled value of 0 */
+}
+/* Scale a fixed-point value down by PRECISION bits, then saturate the result
+ * to the 10-bit range [0, 1023]. */
+static uint16_t clamp10(int32_t v)
+{
+    const int32_t scaled = v >> PRECISION;
+    if (scaled > 1023) {
+        return 1023;
+    }
+    return (uint16_t)((scaled < 0) ? 0 : scaled);
+}
+#define YUV_BITS    8 /* all instantiations up to the #undef below use this value */
+#define STD_FUNCTION_NAME       yuv420_rgb565_std /* each group sets the template parameters, then includes the template once */
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h" /* the three parameters are redefined below without #undef, so the template presumably undefines them itself (confirm in yuv_rgb_std_func.h) */
+#define STD_FUNCTION_NAME       yuv420_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv420_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv420_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv420_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv420_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_420
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_rgb565_std /* YUV_FORMAT_422 family starts here */
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuv422_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_422
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_rgb565_std /* YUV_FORMAT_NV12 family starts here */
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_rgb24_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_rgba_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_bgra_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_argb_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+#define STD_FUNCTION_NAME       yuvnv12_abgr_std
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+#undef YUV_BITS
+#define YUV_BITS    10 /* the remaining instantiation is generated with YUV_BITS set to 10 */
+#define STD_FUNCTION_NAME       yuvp010_xbgr2101010_std /* reuses YUV_FORMAT_NV12 together with YUV_BITS == 10 */
+#define YUV_FORMAT                      YUV_FORMAT_NV12
+#define RGB_FORMAT                      RGB_FORMAT_XBGR2101010
+#include "yuv_rgb_std_func.h"
+/* Convert a packed 3-bytes-per-pixel RGB image to YUV 4:2:0 with separate Y, U
+ * and V outputs, using the fixed-point matrix RGB2YUV[yuv_type].
+ *
+ *   width, height        image size in pixels
+ *   RGB, RGB_stride      source pixels and the distance in bytes between rows
+ *   Y, U, V              destinations: one Y byte per pixel, one U and one V
+ *                        byte per 2x2 pixel block
+ *   Y_stride, UV_stride  distance in bytes between destination rows
+ *   yuv_type             index into the RGB2YUV[] coefficient table
+ *
+ * Pixels are processed in 2x2 blocks; a trailing odd row or column is left
+ * untouched, and an image with a zero dimension is a no-op. */
+void rgb24_yuv420_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *RGB, uint32_t RGB_stride,
+        uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+        YCbCrType yuv_type)
+{
+    const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+    uint32_t x, y;
+    // "y+1 < height" rather than "y < height-1": the subtraction wraps around for
+    // height == 0 and would make the loop run far past the end of the buffers
+    for(y=0; y+1<height; y+=2)
+    {
+        const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+                *rgb_ptr2=RGB+(y+1)*RGB_stride;
+        uint8_t *y_ptr1=Y+y*Y_stride,
+                *y_ptr2=Y+(y+1)*Y_stride,
+                *u_ptr=U+(y/2)*UV_stride,
+                *v_ptr=V+(y/2)*UV_stride;
+        // same reasoning as above for width == 0
+        for(x=0; x+1<width; x+=2)
+        {
+            // compute yuv for the four pixels, u and v values are summed
+            int32_t y_tmp, u_tmp, v_tmp;
+            // top-left pixel
+            y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
+            u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
+            v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
+            y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+            // top-right pixel
+            y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
+            u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
+            v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
+            y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+            // bottom-left pixel
+            y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
+            u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
+            v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
+            y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+            // bottom-right pixel
+            y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
+            u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
+            v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
+            y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+            // chroma: average of the four summed values, re-centred on 128
+            u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
+            v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
+            rgb_ptr1 += 6;
+            rgb_ptr2 += 6;
+            y_ptr1 += 2;
+            y_ptr2 += 2;
+            u_ptr += 1;
+            v_ptr += 1;
+        }
+    }
+}
+#endif /* SDL_HAVE_YUV */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h
new file mode 100644
index 0000000..c9f856b
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h
@@ -0,0 +1,143 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+// There are a few slightly different variations of the YCbCr color space with different parameters that
+// change the conversion matrix.
+// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a factor of 2
+// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
+// is suboptimal for image quality, but by far the fastest method.
+// For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
+// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
+/*#include <stdint.h>*/
+#include "yuv_rgb_common.h"
+/* yuv to rgb, standard C implementation.
+ * Functions are named yuv<source layout>_<destination format>_std.
+ *   width, height       image size in pixels
+ *   y, u, v             source sample pointers
+ *   y_stride, uv_stride source row pitch in bytes
+ *   rgb, rgb_stride     destination buffer and its row pitch in bytes
+ *   yuv_type            selects the conversion parameters
+ * NOTE(review): parameter meanings are taken from the shared template in
+ * yuv_rgb_std_func.h, assuming these functions are generated from it --
+ * confirm in yuv_rgb_std.c.
+ */
+void yuv420_rgb565_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgb24_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_rgba_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_bgra_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_argb_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv420_abgr_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb565_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgb24_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_rgba_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_bgra_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_argb_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuv422_abgr_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb565_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgb24_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_rgba_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_bgra_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_argb_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvnv12_abgr_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+void yuvp010_xbgr2101010_std(
+        uint32_t width, uint32_t height,
+        const uint16_t *y, const uint16_t *u, const uint16_t *v, uint32_t y_stride, uint32_t uv_stride,
+        uint8_t *rgb, uint32_t rgb_stride,
+        YCbCrType yuv_type);
+/* rgb to yuv, standard C implementation.
+ * NOTE(review): presumably reads packed 3-byte RGB pixels and writes separate
+ * Y, U and V planes, with one U/V sample per 2x2 pixel group -- confirm
+ * against the definition in yuv_rgb_std.c.
+ */
+void rgb24_yuv420_std(
+        uint32_t width, uint32_t height,
+        const uint8_t *rgb, uint32_t rgb_stride,
+        uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+        YCbCrType yuv_type);
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h
new file mode 100644
index 0000000..8091ea9
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h
@@ -0,0 +1,271 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
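+/* You need to define the following macros before including this file:
+        STD_FUNCTION_NAME  (name of the generated function)
+        YUV_FORMAT         (YUV_FORMAT_420, YUV_FORMAT_422 or YUV_FORMAT_NV12)
+        RGB_FORMAT         (one of the RGB_FORMAT_* layouts handled by PACK_PIXEL)
+        YUV_BITS           (bits per YUV sample; values above 8 select 16-bit input)
+*/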
+/* PACK_PIXEL(rgb_ptr) stores one pixel at rgb_ptr in the layout selected by
+ * RGB_FORMAT and then advances rgb_ptr past it. The colour is built from
+ * y_tmp, r_tmp, g_tmp and b_tmp, which must be in scope at the point of use.
+ * The last line of each definition must NOT end in a backslash: a
+ * backslash-newline would splice the following #elif/#else directive into the
+ * macro body and break the conditional chain. */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint16 *)rgb_ptr = \
+                ((((Uint16)clampU8(y_tmp+r_tmp)) << 8 ) & 0xF800) | \
+                ((((Uint16)clampU8(y_tmp+g_tmp)) << 3) & 0x07E0) | \
+                (((Uint16)clampU8(y_tmp+b_tmp)) >> 3); \
+        rgb_ptr += 2;
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+#define PACK_PIXEL(rgb_ptr) \
+        rgb_ptr[0] = clampU8(y_tmp+r_tmp); \
+        rgb_ptr[1] = clampU8(y_tmp+g_tmp); \
+        rgb_ptr[2] = clampU8(y_tmp+b_tmp); \
+        rgb_ptr += 3;
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint32 *)rgb_ptr = \
+                (((Uint32)clampU8(y_tmp+r_tmp)) << 24) | \
+                (((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+                (((Uint32)clampU8(y_tmp+b_tmp)) << 8) | \
+                0x000000FF; \
+        rgb_ptr += 4;
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint32 *)rgb_ptr = \
+                (((Uint32)clampU8(y_tmp+b_tmp)) << 24) | \
+                (((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+                (((Uint32)clampU8(y_tmp+r_tmp)) << 8) | \
+                0x000000FF; \
+        rgb_ptr += 4;
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint32 *)rgb_ptr = \
+                0xFF000000 | \
+                (((Uint32)clampU8(y_tmp+r_tmp)) << 16) | \
+                (((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+                (((Uint32)clampU8(y_tmp+b_tmp)) << 0); \
+        rgb_ptr += 4;
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint32 *)rgb_ptr = \
+                0xFF000000 | \
+                (((Uint32)clampU8(y_tmp+b_tmp)) << 16) | \
+                (((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+                (((Uint32)clampU8(y_tmp+r_tmp)) << 0); \
+        rgb_ptr += 4;
+#elif RGB_FORMAT == RGB_FORMAT_XBGR2101010
+#define PACK_PIXEL(rgb_ptr) \
+        *(Uint32 *)rgb_ptr = \
+                0xC0000000 | \
+                (((Uint32)clamp10(y_tmp+b_tmp)) << 20) | \
+                (((Uint32)clamp10(y_tmp+g_tmp)) << 10) | \
+                (((Uint32)clamp10(y_tmp+r_tmp)) << 0); \
+        rgb_ptr += 4;
+#else
+#error PACK_PIXEL unimplemented
+#endif
+#ifdef _MSC_VER /* Visual Studio analyzer can't tell that we're building this with different constants */
+#pragma warning(push)
+#pragma warning(disable : 6239)
+#endif
+#undef YUV_TYPE /* element type of the Y/U/V source buffers: 16-bit when YUV_BITS > 8, otherwise 8-bit */
+#if YUV_BITS > 8
+#define YUV_TYPE        uint16_t
+#else
+#define YUV_TYPE        uint8_t
+#endif
+#undef UV_OFFSET /* value subtracted from U and V samples: 2^(YUV_BITS-1), i.e. half of the YUV_BITS-bit range */
+#define UV_OFFSET       (1 << ((YUV_BITS)-1))
+#undef GET /* GET(X): shifts right by 6 for 10-bit input (presumably the sample sits in the upper bits of a 16-bit word -- confirm), identity otherwise */
+#if YUV_BITS == 10
+#define GET(X)  ((X) >> 6)
+#else
+#define GET(X)  (X)
+#endif
+/*
+ * Template body of a YUV -> RGB converter. The including file selects the
+ * concrete function name and the pixel formats through STD_FUNCTION_NAME,
+ * YUV_FORMAT, RGB_FORMAT and YUV_BITS.
+ *
+ *   width, height        image size in pixels; nothing is converted when
+ *                        either one is zero
+ *   Y, U, V              source sample pointers
+ *   Y_stride, UV_stride  source row pitch in bytes (converted to elements below)
+ *   RGB, RGB_stride      destination buffer and its row pitch in bytes
+ *   yuv_type             index into the YUV2RGB parameter table
+ *
+ * Pixels are processed in groups that share one U/V sample
+ * (uv_x_sample_interval columns by uv_y_sample_interval rows). An odd trailing
+ * column or row is handled by the "catch the last ..." branches.
+ */
+void STD_FUNCTION_NAME(
+        uint32_t width, uint32_t height,
+        const YUV_TYPE *Y, const YUV_TYPE *U, const YUV_TYPE *V, uint32_t Y_stride, uint32_t UV_stride,
+        uint8_t *RGB, uint32_t RGB_stride,
+        YCbCrType yuv_type)
+{
+        const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+        #define y_pixel_stride 1
+        #define uv_pixel_stride 1
+        #define uv_x_sample_interval 2
+        #define uv_y_sample_interval 2
+#elif YUV_FORMAT == YUV_FORMAT_422
+        #define y_pixel_stride 2
+        #define uv_pixel_stride 4
+        #define uv_x_sample_interval 2
+        #define uv_y_sample_interval 1
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+        #define y_pixel_stride 1
+        #define uv_pixel_stride 2
+        #define uv_x_sample_interval 2
+        #define uv_y_sample_interval 2
+#endif
+        /* width and height are unsigned: with a zero dimension the loop bounds
+         * below (width-1 / height-1) would wrap around to UINT32_MAX and the
+         * loops would run far past the end of the buffers. */
+        if (width == 0 || height == 0)
+        {
+                return;
+        }
+        /* Strides arrive in bytes; the source pointers are indexed in elements */
+        Y_stride /= sizeof(YUV_TYPE);
+        UV_stride /= sizeof(YUV_TYPE);
+        uint32_t x, y;
+        for(y=0; y<(height-(uv_y_sample_interval-1)); y+=uv_y_sample_interval)
+        {
+                const YUV_TYPE *y_ptr1=Y+y*Y_stride,
+                        *u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+                        *v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+                #if uv_y_sample_interval > 1
+                const YUV_TYPE *y_ptr2=Y+(y+1)*Y_stride;
+                #endif
+                uint8_t *rgb_ptr1=RGB+y*RGB_stride;
+                #if uv_y_sample_interval > 1
+                uint8_t *rgb_ptr2=RGB+(y+1)*RGB_stride;
+                #endif
+                for(x=0; x<(width-(uv_x_sample_interval-1)); x+=uv_x_sample_interval)
+                {
+                        // Compute U and V contributions, common to the four pixels
+                        int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
+                        int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
+                        int32_t r_tmp = (v_tmp*param->v_r_factor);
+                        int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+                        int32_t b_tmp = (u_tmp*param->u_b_factor);
+                        // Compute the Y contribution for each pixel
+                        int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                        y_tmp = (GET(y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                        #if uv_y_sample_interval > 1
+                        y_tmp = (GET(y_ptr2[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr2);
+                        y_tmp = (GET(y_ptr2[y_pixel_stride]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr2);
+                        #endif
+                        y_ptr1+=2*y_pixel_stride;
+                        #if uv_y_sample_interval > 1
+                        y_ptr2+=2*y_pixel_stride;
+                        #endif
+                        u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+                        v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+                }
+                /* Catch the last pixel, if needed */
+                if (uv_x_sample_interval == 2 && x == (width-1))
+                {
+                        // Compute U and V contributions, common to the remaining pixels of this column
+                        int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
+                        int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
+                        int32_t r_tmp = (v_tmp*param->v_r_factor);
+                        int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+                        int32_t b_tmp = (u_tmp*param->u_b_factor);
+                        // Compute the Y contribution for each pixel
+                        int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                        #if uv_y_sample_interval > 1
+                        y_tmp = (GET(y_ptr2[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr2);
+                        #endif
+                }
+        }
+        /* Catch the last line, if needed */
+        if (uv_y_sample_interval == 2 && y == (height-1))
+        {
+                const YUV_TYPE *y_ptr1=Y+y*Y_stride,
+                        *u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+                        *v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+                uint8_t *rgb_ptr1=RGB+y*RGB_stride;
+                for(x=0; x<(width-(uv_x_sample_interval-1)); x+=uv_x_sample_interval)
+                {
+                        // Compute U and V contributions, common to the two pixels of this row
+                        int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
+                        int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
+                        int32_t r_tmp = (v_tmp*param->v_r_factor);
+                        int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+                        int32_t b_tmp = (u_tmp*param->u_b_factor);
+                        // Compute the Y contribution for each pixel
+                        int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                        y_tmp = (GET(y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                        y_ptr1+=2*y_pixel_stride;
+                        u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+                        v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+                }
+                /* Catch the last pixel, if needed */
+                if (uv_x_sample_interval == 2 && x == (width-1))
+                {
+                        // Compute U and V contributions for the single remaining pixel
+                        int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
+                        int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
+                        int32_t r_tmp = (v_tmp*param->v_r_factor);
+                        int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+                        int32_t b_tmp = (u_tmp*param->u_b_factor);
+                        // Compute the Y contribution for each pixel
+                        int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
+                        PACK_PIXEL(rgb_ptr1);
+                }
+        }
+        #undef y_pixel_stride
+        #undef uv_pixel_stride
+        #undef uv_x_sample_interval
+        #undef uv_y_sample_interval
+}
+#ifdef _MSC_VER /* matches the warning(push) issued before the function definition */
+#pragma warning(pop)
+#endif
+#undef STD_FUNCTION_NAME /* drop the per-inclusion settings; presumably so this file can be included again with different ones */
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef PACK_PIXEL