summaryrefslogtreecommitdiff
path: root/contrib/SDL-3.2.8/src/video/yuv2rgb
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/SDL-3.2.8/src/video/yuv2rgb')
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE27
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/README.md63
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h33
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h15
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h85
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c43
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h36
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h372
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c460
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h241
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h529
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c200
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h143
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h271
14 files changed, 2518 insertions, 0 deletions
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE b/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE
new file mode 100644
index 0000000..a76efd7
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/LICENSE
@@ -0,0 +1,27 @@
1Copyright (c) 2016, Adrien Descamps
2All rights reserved.
3
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are met:
6
7* Redistributions of source code must retain the above copyright notice, this
8 list of conditions and the following disclaimer.
9
10* Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation
12 and/or other materials provided with the distribution.
13
14* Neither the name of yuv2rgb nor the names of its
15 contributors may be used to endorse or promote products derived from
16 this software without specific prior written permission.
17
18THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md b/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md
new file mode 100644
index 0000000..21191e9
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/README.md
@@ -0,0 +1,63 @@
1From: https://github.com/descampsa/yuv2rgb
2# yuv2rgb
3C library for fast image conversion between yuv420p and rgb24.
4
5This is a simple library for optimized image conversion between YUV420p and rgb24.
6It was done mainly as an exercise to learn to use sse intrinsics, so there may still be room for optimization.
7
8For each conversion, a standard c optimized function and two sse functions (for aligned and unaligned memory) are implemented.
9The sse version requires only SSE2, which is available on any reasonably recent CPU.
10The library also supports the three different YUV (YCrCb to be correct) color spaces that exist (see comments in code), and others can be added simply.
11
12There is a simple test program that converts a raw YUV file to rgb ppm format and measures computation time.
13Optionally, it also compares the result and computation time with the ffmpeg implementation (that uses MMX), and with the IPP functions.
14
15To compile, simply do :
16
17 mkdir build
18 cd build
19 cmake -DCMAKE_BUILD_TYPE=Release ..
20 make
21
22The test program only supports raw YUV files for the YUV420 format, and ppm for the RGB24 format.
23To generate a raw yuv file, you can use avconv:
24
25 avconv -i example.jpg -c:v rawvideo -pix_fmt yuv420p example.yuv
26
27To generate the rgb file, you can use the ImageMagick convert program:
28
29 convert example.jpg example.ppm
30
31Then, for YUV420 to RGB24 conversion, use the test program like that:
32
33 ./test_yuv_rgb yuv2rgb image.yuv 4096 2160 image
34
35The second and third parameters are image width and height (that are needed because not available in the raw YUV file), and fourth parameter is the output filename template (several output files will be generated, named for example output_sse.ppm, output_av.ppm, etc.)
36
37Similarly, for RGB24 to YUV420 conversion:
38
39 ./test_yuv_rgb rgb2yuv image.ppm image
40
41On my computer, the test program on a 4K image gives the following for yuv2rgb:
42
43 Time will be measured in each configuration for 100 iterations...
44 Processing time (std) : 2.630193 sec
45 Processing time (sse2_unaligned) : 0.704394 sec
46 Processing time (ffmpeg_unaligned) : 1.221432 sec
47 Processing time (ipp_unaligned) : 0.636274 sec
48 Processing time (sse2_aligned) : 0.606648 sec
49 Processing time (ffmpeg_aligned) : 1.227100 sec
50 Processing time (ipp_aligned) : 0.636951 sec
51
52And for rgb2yuv:
53
54 Time will be measured in each configuration for 100 iterations...
55 Processing time (std) : 2.588675 sec
56 Processing time (sse2_unaligned) : 0.676625 sec
57 Processing time (ffmpeg_unaligned) : 3.385816 sec
58 Processing time (ipp_unaligned) : 0.593890 sec
59 Processing time (sse2_aligned) : 0.640630 sec
60 Processing time (ffmpeg_aligned) : 3.397952 sec
61 Processing time (ipp_aligned) : 0.579043 sec
62
63configuration : gcc 4.9.2, swscale 3.0.0, IPP 9.0.1, intel i7-5500U
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h
new file mode 100644
index 0000000..c359316
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb.h
@@ -0,0 +1,33 @@
1#ifndef YUV_RGB_H_
2#define YUV_RGB_H_
3
4// Copyright 2016 Adrien Descamps
5// Distributed under BSD 3-Clause License
6
7// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
8
9// There are a few slightly different variations of the YCbCr color space with different parameters that
10// change the conversion matrix.
11// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
12// See the respective standards for details
13// The matrix values used are derived from http://www.equasys.de/colorconversion.html
14
15// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a 2 factor
16// For conversion from yuv to rgb, no interpolation is done, and the same UV value are used for 4 rgb pixels. This
17// is suboptimal for image quality, but by far the fastest method.
18
19// For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
20// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
21
22/*#include <stdint.h>*/
23
24// yuv to rgb, standard c implementation
25#include "yuv_rgb_std.h"
26
27// yuv to rgb, sse2 implementation
28#include "yuv_rgb_sse.h"
29
30// yuv to rgb, lsx implementation
31#include "yuv_rgb_lsx.h"
32
33#endif /* YUV_RGB_H_ */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h
new file mode 100644
index 0000000..a4ef8ea
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_common.h
@@ -0,0 +1,15 @@
1#ifndef YUV_RGB_COMMON_H_
2#define YUV_RGB_COMMON_H_
3// Copyright 2016 Adrien Descamps
4// Distributed under BSD 3-Clause License
5
6typedef enum /* Selects the YCbCr variant; the value is used directly as an index into the YUV2RGB[] table in yuv_rgb_internal.h, so enumerator order must match the table's row order. */
7{
8 YCBCR_601_FULL, /* row 0, commented "ITU-T T.871 (JPEG)" in the table */
9 YCBCR_601_LIMITED, /* row 1, commented "ITU-R BT.601-7" */
10 YCBCR_709_FULL, /* row 2, commented "ITU-R BT.709-6 full range" */
11 YCBCR_709_LIMITED, /* row 3, commented "ITU-R BT.709-6" */
12 YCBCR_2020_NCL_FULL, /* row 4, commented "ITU-R BT.2020 10-bit full range" */
13} YCbCrType;
14
15#endif /* YUV_RGB_COMMON_H_ */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h
new file mode 100644
index 0000000..d5939ed
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_internal.h
@@ -0,0 +1,85 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "yuv_rgb.h"
4
5#define PRECISION 6 /* number of fractional bits in the fixed-point coefficients; results are shifted right by this amount */
6#define PRECISION_FACTOR (1<<PRECISION) /* scale applied by V() when converting a real coefficient to fixed point */
7
8typedef struct /* Fixed-point parameters for one RGB -> YUV conversion variant (see the formula below). */
9{
10 uint8_t y_shift; /* offset added to the Y result */
11 int16_t matrix[3][3]; /* row-major coefficients, pre-scaled by PRECISION_FACTOR via V() */
12} RGB2YUVParam;
13// |Y| |y_shift| |matrix[0][0] matrix[0][1] matrix[0][2]| |R|
14// |U| = | 128 | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
15// |V| | 128 | |matrix[2][0] matrix[2][1] matrix[2][2]| |B|
16
17typedef struct /* Fixed-point parameters for one YUV -> RGB conversion variant (see the formula below). */
18{
19 uint8_t y_shift; /* subtracted from Y before scaling */
20 int16_t y_factor; /* luma gain, pre-scaled by PRECISION_FACTOR */
21 int16_t v_r_factor; /* contribution of (V-128) to R */
22 int16_t u_g_factor; /* contribution of (U-128) to G (negative in the tables) */
23 int16_t v_g_factor; /* contribution of (V-128) to G (negative in the tables) */
24 int16_t u_b_factor; /* contribution of (U-128) to B */
25} YUV2RGBParam;
26// |R| |y_factor 0 v_r_factor| |Y-y_shift|
27// |G| = 1/PRECISION_FACTOR * |y_factor u_g_factor v_g_factor| * | U-128 |
28// |B| |y_factor u_b_factor 0 | | V-128 |
29
30#ifdef _MSC_VER
31#pragma warning(push)
32#pragma warning(disable : 26451)
33#endif
34
35#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5)) /* real -> fixed point with PRECISION fractional bits; the +0.5 rounds correctly only for non-negative input, so negative coefficients are written as -V(x) */
36
37// for ITU-T T.871, values can be found in section 7
38// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
39// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
40// for ITU-R BT.2020 values are assuming RGB is encoded using full 10-bit range ([0-1]<->[0-1023])
41// all values are rounded to the fourth decimal
42
43static const YUV2RGBParam YUV2RGB[] = { /* indexed by YCbCrType (YUV2RGB[yuv_type]); row order must match the enum order */
44 // [YCBCR_601_FULL] ITU-T T.871 (JPEG)
45 {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
46 // [YCBCR_601_LIMITED] ITU-R BT.601-7
47 {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
48 // [YCBCR_709_FULL] ITU-R BT.709-6 full range
49 {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.581), /*.u_g_factor=*/ -V(0.1881), /*.v_g_factor=*/ -V(0.47), /*.u_b_factor=*/ V(1.8629)},
50 // [YCBCR_709_LIMITED] ITU-R BT.709-6
51 {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)},
52 // [YCBCR_2020_NCL_FULL] ITU-R BT.2020 10-bit full range
53 {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.4760), /*.u_g_factor=*/ -V(0.1647), /*.v_g_factor=*/ -V(0.5719), /*.u_b_factor=*/ V(1.8832) }
54};
55
56static const RGB2YUVParam RGB2YUV[] = { /* rows follow the same order as YUV2RGB[]; presumably also indexed by YCbCrType -- confirm at the call sites */
57 // ITU-T T.871 (JPEG)
58 {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
59 // ITU-R BT.601-7
60 {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
61 // ITU-R BT.709-6 full range
62 {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.2126), V(0.7152), V(0.0722)}, {-V(0.1141), -V(0.3839), V(0.498)}, {V(0.498), -V(0.4524), -V(0.0457)}}},
63 // ITU-R BT.709-6
64 {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}},
65 // ITU-R BT.2020 10-bit full range
66 {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.2627), V(0.6780), V(0.0593)}, {-V(0.1395), -V(0.3600), V(0.4995)}, {V(0.4995), -V(0.4593), -V(0.0402)}}},
67};
68
69#ifdef _MSC_VER
70#pragma warning(pop)
71#endif
72
73/* The various layouts of YUV data we support; a template user sets YUV_FORMAT to one of these before including a *_func.h file */
74#define YUV_FORMAT_420 1
75#define YUV_FORMAT_422 2
76#define YUV_FORMAT_NV12 3
77
78/* The various formats of RGB pixel that we support; a template user sets RGB_FORMAT to one of these before including a *_func.h file */
79#define RGB_FORMAT_RGB565 1
80#define RGB_FORMAT_RGB24 2
81#define RGB_FORMAT_RGBA 3
82#define RGB_FORMAT_BGRA 4
83#define RGB_FORMAT_ARGB 5
84#define RGB_FORMAT_ABGR 6
85#define RGB_FORMAT_XBGR2101010 7
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c
new file mode 100644
index 0000000..250ff37
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.c
@@ -0,0 +1,43 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "SDL_internal.h"
4
5#ifdef SDL_HAVE_YUV
6#include "yuv_rgb_lsx.h"
7#include "yuv_rgb_internal.h"
8
9#ifdef SDL_LSX_INTRINSICS
10
11#define LSX_FUNCTION_NAME yuv420_rgb24_lsx /* name given to the function generated by the template below */
12#define STD_FUNCTION_NAME yuv420_rgb24_std /* scalar converter the generated function calls for leftover columns and an odd last row */
13#define YUV_FORMAT YUV_FORMAT_420 /* input layout selector */
14#define RGB_FORMAT RGB_FORMAT_RGB24 /* output pixel format selector */
15#include "yuv_rgb_lsx_func.h" /* emits the function, then #undefs the four macros above so they can be redefined */
16
17#define LSX_FUNCTION_NAME yuv420_rgba_lsx
18#define STD_FUNCTION_NAME yuv420_rgba_std
19#define YUV_FORMAT YUV_FORMAT_420
20#define RGB_FORMAT RGB_FORMAT_RGBA
21#include "yuv_rgb_lsx_func.h"
22
23#define LSX_FUNCTION_NAME yuv420_bgra_lsx
24#define STD_FUNCTION_NAME yuv420_bgra_std
25#define YUV_FORMAT YUV_FORMAT_420
26#define RGB_FORMAT RGB_FORMAT_BGRA
27#include "yuv_rgb_lsx_func.h"
28
29#define LSX_FUNCTION_NAME yuv420_argb_lsx
30#define STD_FUNCTION_NAME yuv420_argb_std
31#define YUV_FORMAT YUV_FORMAT_420
32#define RGB_FORMAT RGB_FORMAT_ARGB
33#include "yuv_rgb_lsx_func.h"
34
35#define LSX_FUNCTION_NAME yuv420_abgr_lsx
36#define STD_FUNCTION_NAME yuv420_abgr_std
37#define YUV_FORMAT YUV_FORMAT_420
38#define RGB_FORMAT RGB_FORMAT_ABGR
39#include "yuv_rgb_lsx_func.h"
40
41#endif // SDL_LSX_INTRINSICS
42
43#endif // SDL_HAVE_YUV
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h
new file mode 100644
index 0000000..1347a31
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx.h
@@ -0,0 +1,36 @@
1#ifdef SDL_LSX_INTRINSICS
2
3#include "yuv_rgb_common.h"
4
5// yuv420 to rgb24, lsx implementation (y/u/v are separate planes; strides are in bytes)
6void yuv420_rgb24_lsx(
7 uint32_t width, uint32_t height,
8 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
9 uint8_t *rgb, uint32_t rgb_stride,
10 YCbCrType yuv_type);
11
12void yuv420_rgba_lsx( /* yuv420 to rgba, lsx implementation; plane arguments are in Y, U, V order, matching the definition */
13 uint32_t width, uint32_t height,
14 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
15 uint8_t *rgb, uint32_t rgb_stride,
16 YCbCrType yuv_type);
17
18void yuv420_bgra_lsx( /* yuv420 to bgra, lsx implementation */
19 uint32_t width, uint32_t height,
20 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
21 uint8_t *rgb, uint32_t rgb_stride,
22 YCbCrType yuv_type);
23
24void yuv420_argb_lsx( /* yuv420 to argb, lsx implementation */
25 uint32_t width, uint32_t height,
26 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
27 uint8_t *rgb, uint32_t rgb_stride,
28 YCbCrType yuv_type);
29
30void yuv420_abgr_lsx( /* yuv420 to abgr, lsx implementation */
31 uint32_t width, uint32_t height,
32 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
33 uint8_t *rgb, uint32_t rgb_stride,
34 YCbCrType yuv_type);
35
36#endif //SDL_LSX_INTRINSICS
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
new file mode 100644
index 0000000..89d582a
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
@@ -0,0 +1,372 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3
4#include <lsxintrin.h>
5
6#if YUV_FORMAT == YUV_FORMAT_420
7
8#define READ_Y(y_ptr) /* load one vector of luma bytes from y_ptr into the local 'y' */ \
9 y = __lsx_vld(y_ptr, 0); \
10
11#define READ_UV /* load one vector each from u_ptr and v_ptr (separate planes) into u_temp / v_temp */ \
12 u_temp = __lsx_vld(u_ptr, 0); \
13 v_temp = __lsx_vld(v_ptr, 0); \
14
15#else
16#error READ_UV unimplemented
17#endif
18
19#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, \
20 RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
21{ /* interleave four byte-per-channel vectors into 4-byte pixels: RGB1..RGB4 from the *1 inputs, RGB5..RGB8 from the *2 inputs */ \
22 __m128i ab_l, ab_h, gr_l, gr_h; \
23 ab_l = __lsx_vilvl_b(B1, A1); /* pair the B and A bytes (low half) */ \
24 ab_h = __lsx_vilvh_b(B1, A1); /* pair the B and A bytes (high half) */ \
25 gr_l = __lsx_vilvl_b(R1, G1); /* pair the R and G bytes (low half) */ \
26 gr_h = __lsx_vilvh_b(R1, G1); /* pair the R and G bytes (high half) */ \
27 RGB1 = __lsx_vilvl_h(gr_l, ab_l); /* combine the 2-byte pairs into 4-byte pixels */ \
28 RGB2 = __lsx_vilvh_h(gr_l, ab_l); \
29 RGB3 = __lsx_vilvl_h(gr_h, ab_h); \
30 RGB4 = __lsx_vilvh_h(gr_h, ab_h); \
31 ab_l = __lsx_vilvl_b(B2, A2); /* same sequence for the second set of inputs */ \
32 ab_h = __lsx_vilvh_b(B2, A2); \
33 gr_l = __lsx_vilvl_b(R2, G2); \
34 gr_h = __lsx_vilvh_b(R2, G2); \
35 RGB5 = __lsx_vilvl_h(gr_l, ab_l); \
36 RGB6 = __lsx_vilvh_h(gr_l, ab_l); \
37 RGB7 = __lsx_vilvl_h(gr_h, ab_h); \
38 RGB8 = __lsx_vilvh_h(gr_h, ab_h); \
39}
40
41#define PACK_RGB24_32_STEP(R, G, B, RGB1, RGB2, RGB3) /* shuffle one R, G, B vector triple into three output vectors; mask1..mask5 must be in scope at the expansion site */ \
42 RGB1 = __lsx_vilvl_b(G, R); \
43 RGB1 = __lsx_vshuf_b(B, RGB1, mask1); \
44 RGB2 = __lsx_vshuf_b(B, G, mask2); \
45 RGB2 = __lsx_vshuf_b(R, RGB2, mask3); \
46 RGB3 = __lsx_vshuf_b(R, B, mask4); \
47 RGB3 = __lsx_vshuf_b(G, RGB3, mask5); \
48
49#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) /* two STEPs: the *1 inputs fill RGB1..RGB3, the *2 inputs fill RGB4..RGB6 */ \
50 PACK_RGB24_32_STEP(R1, G1, B1, RGB1, RGB2, RGB3); \
51 PACK_RGB24_32_STEP(R2, G2, B2, RGB4, RGB5, RGB6); \
52
53#if RGB_FORMAT == RGB_FORMAT_RGB24
54
55#define PACK_PIXEL /* declares rgb_1..rgb_12: rgb_1..6 hold the first row (r_8_1x inputs), rgb_7..12 the second row (r_8_2x inputs) */ \
56 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
57 __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
58 PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
59 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
60 PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
61 rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
62
63#elif RGB_FORMAT == RGB_FORMAT_RGBA
64
65#define PACK_PIXEL /* declares rgb_1..rgb_16: rgb_1..8 hold the first row, rgb_9..16 the second row; 'a' is the constant alpha vector */ \
66 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
67 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
68 __m128i a = __lsx_vldi(0xFF); \
69 PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, \
70 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
71 PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, \
72 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
73
74#elif RGB_FORMAT == RGB_FORMAT_BGRA
75
76#define PACK_PIXEL /* same as the RGBA variant with the red and blue inputs swapped */ \
77 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
78 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
79 __m128i a = __lsx_vldi(0xFF); \
80 PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, \
81 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
82 PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, \
83 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
84
85#elif RGB_FORMAT == RGB_FORMAT_ARGB
86
87#define PACK_PIXEL /* channel arguments are rotated by one slot so that alpha occupies PACK_RGBA_32's first (R) position */ \
88 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
89 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
90 __m128i a = __lsx_vldi(0xFF); \
91 PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
92 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
93 PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
94 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
95
96#elif RGB_FORMAT == RGB_FORMAT_ABGR
97
98#define PACK_PIXEL /* same as the ARGB variant with the red and blue inputs swapped */ \
99 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
100 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
101 __m128i a = __lsx_vldi(0xFF); \
102 PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, \
103 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
104 PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, \
105 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
106
107#else
108#error PACK_PIXEL unimplemented
109#endif
110
111#define LSX_ST_UB2(in0, in1, pdst, stride) /* store in0 at pdst and in1 at pdst + stride */ \
112{ \
113 __lsx_vst(in0, pdst, 0); \
114 __lsx_vst(in1, pdst + stride, 0); \
115}
116
117#if RGB_FORMAT == RGB_FORMAT_RGB24
118
119#define SAVE_LINE1 /* first row: six vectors stored back to back (16-byte steps) starting at rgb_ptr1 */ \
120 LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16); \
121 LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16); \
122 LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16); \
123
124#define SAVE_LINE2 /* second row: six vectors starting at rgb_ptr2 */ \
125 LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr2, 16); \
126 LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2 + 32, 16); \
127 LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 64, 16); \
128
129#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
130 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
131
132#define SAVE_LINE1 /* first row: eight vectors stored back to back (16-byte steps) starting at rgb_ptr1 */ \
133 LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16); \
134 LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16); \
135 LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16); \
136 LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr1 + 96, 16); \
137
138#define SAVE_LINE2 /* second row: eight vectors starting at rgb_ptr2 */ \
139 LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2, 16); \
140 LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 32, 16); \
141 LSX_ST_UB2(rgb_13, rgb_14, rgb_ptr2 + 64, 16); \
142 LSX_ST_UB2(rgb_15, rgb_16, rgb_ptr2 + 96, 16); \
143
144#else
145#error SAVE_LINE unimplemented
146#endif
147
148// r=v*vr g=u*ug+v*vg b=u*ub; each result lane is then interleaved with itself so R1/G1/B1 and R2/G2/B2 carry every chroma term twice
149#define UV2RGB_16(U, V, R1, G1, B1, R2, G2, B2) /* uses r_temp/g_temp/b_temp and the v2r/u2g/v2g/u2b vectors from the expansion site */ \
150 r_temp = __lsx_vmul_h(V, v2r); \
151 g_temp = __lsx_vmul_h(U, u2g); \
152 g_temp = __lsx_vmadd_h(g_temp, V, v2g); \
153 b_temp = __lsx_vmul_h(U, u2b); \
154 R1 = __lsx_vilvl_h(r_temp, r_temp); \
155 G1 = __lsx_vilvl_h(g_temp, g_temp); \
156 B1 = __lsx_vilvl_h(b_temp, b_temp); \
157 R2 = __lsx_vilvh_h(r_temp, r_temp); \
158 G2 = __lsx_vilvh_h(g_temp, g_temp); \
159 B2 = __lsx_vilvh_h(b_temp, b_temp); \
160
161// Y=(Y-shift)*yf R=(Y+R)>>PRECISION, G=(Y+G)>>PRECISION, B=(B+Y)>>PRECISION
162#define ADD_Y2RGB_16(Y1, Y2, R1, G1, B1, R2, G2, B2) /* modifies Y1 and Y2 in place; uses the shift and yf vectors from the expansion site */ \
163 Y1 = __lsx_vsub_h(Y1, shift); \
164 Y2 = __lsx_vsub_h(Y2, shift); \
165 Y1 = __lsx_vmul_h(Y1, yf); \
166 Y2 = __lsx_vmul_h(Y2, yf); \
167 R1 = __lsx_vadd_h(R1, Y1); \
168 G1 = __lsx_vadd_h(G1, Y1); \
169 B1 = __lsx_vadd_h(B1, Y1); \
170 R2 = __lsx_vadd_h(R2, Y2); \
171 G2 = __lsx_vadd_h(G2, Y2); \
172 B2 = __lsx_vadd_h(B2, Y2); \
173 R1 = __lsx_vsrai_h(R1, PRECISION); /* drop the fixed-point fraction */ \
174 G1 = __lsx_vsrai_h(G1, PRECISION); \
175 B1 = __lsx_vsrai_h(B1, PRECISION); \
176 R2 = __lsx_vsrai_h(R2, PRECISION); \
177 G2 = __lsx_vsrai_h(G2, PRECISION); \
178 B2 = __lsx_vsrai_h(B2, PRECISION); \
179
180#define CLIP(in0, in1, in2, in3, in4, in5) /* clamp six vectors of 16-bit lanes in place before they are narrowed to bytes */ \
181{ \
182 in0 = __lsx_vmaxi_h(in0, 0); /* lower bound: max with 0 */ \
183 in1 = __lsx_vmaxi_h(in1, 0); \
184 in2 = __lsx_vmaxi_h(in2, 0); \
185 in3 = __lsx_vmaxi_h(in3, 0); \
186 in4 = __lsx_vmaxi_h(in4, 0); \
187 in5 = __lsx_vmaxi_h(in5, 0); \
188 in0 = __lsx_vsat_hu(in0, 7); /* upper bound: unsigned saturate (presumably to 8 bits, i.e. 255 -- confirm against the LSX reference) */ \
189 in1 = __lsx_vsat_hu(in1, 7); \
190 in2 = __lsx_vsat_hu(in2, 7); \
191 in3 = __lsx_vsat_hu(in3, 7); \
192 in4 = __lsx_vsat_hu(in4, 7); \
193 in5 = __lsx_vsat_hu(in5, 7); \
194}
195
196#define YUV2RGB_32 /* convert a 32-pixel x 2-row tile; reads y_ptr1, y_ptr2, u_ptr, v_ptr and leaves byte-per-channel results in r_8_RC / g_8_RC / b_8_RC */ \
197 __m128i y, u_temp, v_temp; \
198 __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; /* suffix RC: R = row (1 or 2), C = 16-pixel half (1 = first, 2 = last) */ \
199 __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
200 __m128i u, v, r_temp, g_temp, b_temp; \
201 __m128i r_1, g_1, b_1, r_2, g_2, b_2; \
202 __m128i y_1, y_2; \
203 __m128i r_uv_1, g_uv_1, b_uv_1, r_uv_2, g_uv_2, b_uv_2; /* saved chroma terms, reused for the second row */ \
204 \
205 READ_UV \
206 \
207 /* process first 16 pixels of first line */ \
208 u = __lsx_vilvl_b(zero, u_temp); \
209 v = __lsx_vilvl_b(zero, v_temp); \
210 u = __lsx_vsub_h(u, bias); \
211 v = __lsx_vsub_h(v, bias); \
212 UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2); \
213 r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1; \
214 r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2; \
215 READ_Y(y_ptr1) \
216 y_1 = __lsx_vilvl_b(zero, y); \
217 y_2 = __lsx_vilvh_b(zero, y); \
218 ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2) \
219 CLIP(r_1, g_1, b_1, r_2, g_2, b_2); \
220 r_8_11 = __lsx_vpickev_b(r_2, r_1); \
221 g_8_11 = __lsx_vpickev_b(g_2, g_1); \
222 b_8_11 = __lsx_vpickev_b(b_2, b_1); \
223 \
224 /* process first 16 pixels of second line */ \
225 r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1; \
226 r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2; \
227 \
228 READ_Y(y_ptr2) \
229 y_1 = __lsx_vilvl_b(zero, y); \
230 y_2 = __lsx_vilvh_b(zero, y); \
231 ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2) \
232 CLIP(r_1, g_1, b_1, r_2, g_2, b_2); \
233 r_8_21 = __lsx_vpickev_b(r_2, r_1); \
234 g_8_21 = __lsx_vpickev_b(g_2, g_1); \
235 b_8_21 = __lsx_vpickev_b(b_2, b_1); \
236 \
237 /* process last 16 pixels of first line */ \
238 u = __lsx_vilvh_b(zero, u_temp); \
239 v = __lsx_vilvh_b(zero, v_temp); \
240 u = __lsx_vsub_h(u, bias); \
241 v = __lsx_vsub_h(v, bias); \
242 UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2); \
243 r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1; \
244 r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2; \
245 READ_Y(y_ptr1 + 16 * y_pixel_stride) \
246 y_1 = __lsx_vilvl_b(zero, y); \
247 y_2 = __lsx_vilvh_b(zero, y); \
248 ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2) \
249 CLIP(r_1, g_1, b_1, r_2, g_2, b_2); \
250 r_8_12 = __lsx_vpickev_b(r_2, r_1); \
251 g_8_12 = __lsx_vpickev_b(g_2, g_1); \
252 b_8_12 = __lsx_vpickev_b(b_2, b_1); \
253 \
254 /* process last 16 pixels of second line */ \
255 r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1; \
256 r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2; \
257 \
258 READ_Y(y_ptr2 + 16 * y_pixel_stride) \
259 y_1 = __lsx_vilvl_b(zero, y); \
260 y_2 = __lsx_vilvh_b(zero, y); \
261 ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2) \
262 CLIP(r_1, g_1, b_1, r_2, g_2, b_2); \
263 r_8_22 = __lsx_vpickev_b(r_2, r_1); \
264 g_8_22 = __lsx_vpickev_b(g_2, g_1); \
265 b_8_22 = __lsx_vpickev_b(b_2, b_1); \
266 \
267
268void LSX_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y,
269 const uint8_t *U, const uint8_t *V, uint32_t Y_stride,
270 uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride,
271 YCbCrType yuv_type) /* Converts a width x height image from separate Y, U, V planes to packed RGB_FORMAT pixels; all strides are byte offsets between rows. */
272{
273 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); /* fixed-point coefficients for the requested YCbCr variant */
274#if YUV_FORMAT == YUV_FORMAT_420
275 const int y_pixel_stride = 1;
276 const int uv_pixel_stride = 1;
277 const int uv_x_sample_interval = 2; /* one U/V sample per two pixels horizontally */
278 const int uv_y_sample_interval = 2; /* one U/V row per two luma rows */
279#endif
280
281#if RGB_FORMAT == RGB_FORMAT_RGB565
282 const int rgb_pixel_stride = 2;
283#elif RGB_FORMAT == RGB_FORMAT_RGB24
284 const int rgb_pixel_stride = 3;
285 __m128i mask1 = {0x0504110302100100, 0x0A14090813070612}; /* mask1..mask5: byte-shuffle patterns consumed by PACK_RGB24_32_STEP */
286 __m128i mask2 = {0x1808170716061505, 0x00000000000A1909};
287 __m128i mask3 = {0x0504170302160100, 0x0A1A090819070618};
288 __m128i mask4 = {0x1E0D1D0C1C0B1B0A, 0x00000000000F1F0E};
289 __m128i mask5 = {0x05041C03021B0100, 0x0A1F09081E07061D};
290#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
291 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
292 const int rgb_pixel_stride = 4; /* every format is compared explicitly so an unknown RGB_FORMAT reaches the #error below */
293#else
294#error Unknown RGB pixel size
295#endif
296
297 uint32_t xpos, ypos;
298 __m128i v2r = __lsx_vreplgr2vr_h(param->v_r_factor); /* broadcast each scalar coefficient to all 16-bit lanes */
299 __m128i v2g = __lsx_vreplgr2vr_h(param->v_g_factor);
300 __m128i u2g = __lsx_vreplgr2vr_h(param->u_g_factor);
301 __m128i u2b = __lsx_vreplgr2vr_h(param->u_b_factor);
302 __m128i bias = __lsx_vreplgr2vr_h(128); /* chroma zero point subtracted from U and V */
303 __m128i shift = __lsx_vreplgr2vr_h(param->y_shift);
304 __m128i yf = __lsx_vreplgr2vr_h(param->y_factor);
305 __m128i zero = __lsx_vldi(0);
306
307 if (width >= 32) { /* vector path: tiles of 32 pixels x 2 rows */
308 for (ypos = 0; ypos + (uv_y_sample_interval - 1) < height; ypos += uv_y_sample_interval) { /* written without "height - 1" so height == 0 cannot wrap around */
309 const uint8_t *y_ptr1 = Y + ypos * Y_stride,
310 *y_ptr2 = Y + (ypos + 1) * Y_stride,
311 *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
312 *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
313 uint8_t *rgb_ptr1 = RGB + ypos * RGB_stride,
314 *rgb_ptr2 = RGB + (ypos + 1) * RGB_stride;
315
316 for (xpos = 0; xpos < (width - 31); xpos += 32){
317 YUV2RGB_32
318 {
319 PACK_PIXEL
320 SAVE_LINE1
321 if (uv_y_sample_interval > 1)
322 {
323 SAVE_LINE2
324 }
325 }
326 y_ptr1 += 32 * y_pixel_stride;
327 y_ptr2 += 32 * y_pixel_stride;
328 u_ptr += 32 * uv_pixel_stride/uv_x_sample_interval;
329 v_ptr += 32 * uv_pixel_stride/uv_x_sample_interval;
330 rgb_ptr1 += 32 * rgb_pixel_stride;
331 rgb_ptr2 += 32 * rgb_pixel_stride;
332 }
333 }
334 if (uv_y_sample_interval == 2 && ypos == (height - 1)) { /* odd height: the last row has no partner row, so the scalar converter handles it */
335 const uint8_t *y_ptr = Y + ypos * Y_stride,
336 *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
337 *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
338 uint8_t *rgb_ptr = RGB + ypos * RGB_stride;
339 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
340 }
341 }
342 { /* leftover columns (width % 32, or the whole image when width < 32) go to the scalar converter */
343 uint32_t converted = (width & ~31u);
344 if (converted != width)
345 {
346 const uint8_t *y_ptr = Y + converted * y_pixel_stride,
347 *u_ptr = U + converted * uv_pixel_stride / uv_x_sample_interval,
348 *v_ptr = V + converted * uv_pixel_stride / uv_x_sample_interval;
349 uint8_t *rgb_ptr = RGB + converted * rgb_pixel_stride;
350
351 STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
352 }
353 }
354}
355
356#undef LSX_FUNCTION_NAME /* template parameters are cleared so the includer can redefine them for the next instantiation */
357#undef STD_FUNCTION_NAME
358#undef YUV_FORMAT
359#undef RGB_FORMAT
360#undef LSX_ALIGNED /* NOTE(review): LSX_ALIGNED is not defined anywhere in this header or in yuv_rgb_lsx.c; CLIP is defined above but never #undef'd -- confirm whether that is intended */
361#undef LSX_ST_UB2
362#undef UV2RGB_16
363#undef ADD_Y2RGB_16
364#undef PACK_RGB24_32_STEP
365#undef PACK_RGB24_32
366#undef PACK_PIXEL
367#undef PACK_RGBA_32
368#undef SAVE_LINE1
369#undef SAVE_LINE2
370#undef READ_Y
371#undef READ_UV
372#undef YUV2RGB_32
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c
new file mode 100644
index 0000000..37fe7e4
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.c
@@ -0,0 +1,460 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "SDL_internal.h"
4
5#ifdef SDL_HAVE_YUV
6#include "yuv_rgb_internal.h"
7
8#ifdef SDL_SSE2_INTRINSICS
9
10/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
11#define SSE_FUNCTION_NAME yuv420_rgb565_sse
12#define STD_FUNCTION_NAME yuv420_rgb565_std
13#define YUV_FORMAT YUV_FORMAT_420
14#define RGB_FORMAT RGB_FORMAT_RGB565
15#define SSE_ALIGNED
16#include "yuv_rgb_sse_func.h"
17
18#define SSE_FUNCTION_NAME yuv420_rgb24_sse
19#define STD_FUNCTION_NAME yuv420_rgb24_std
20#define YUV_FORMAT YUV_FORMAT_420
21#define RGB_FORMAT RGB_FORMAT_RGB24
22#define SSE_ALIGNED
23#include "yuv_rgb_sse_func.h"
24
25#define SSE_FUNCTION_NAME yuv420_rgba_sse
26#define STD_FUNCTION_NAME yuv420_rgba_std
27#define YUV_FORMAT YUV_FORMAT_420
28#define RGB_FORMAT RGB_FORMAT_RGBA
29#define SSE_ALIGNED
30#include "yuv_rgb_sse_func.h"
31
32#define SSE_FUNCTION_NAME yuv420_bgra_sse
33#define STD_FUNCTION_NAME yuv420_bgra_std
34#define YUV_FORMAT YUV_FORMAT_420
35#define RGB_FORMAT RGB_FORMAT_BGRA
36#define SSE_ALIGNED
37#include "yuv_rgb_sse_func.h"
38
39#define SSE_FUNCTION_NAME yuv420_argb_sse
40#define STD_FUNCTION_NAME yuv420_argb_std
41#define YUV_FORMAT YUV_FORMAT_420
42#define RGB_FORMAT RGB_FORMAT_ARGB
43#define SSE_ALIGNED
44#include "yuv_rgb_sse_func.h"
45
46#define SSE_FUNCTION_NAME yuv420_abgr_sse
47#define STD_FUNCTION_NAME yuv420_abgr_std
48#define YUV_FORMAT YUV_FORMAT_420
49#define RGB_FORMAT RGB_FORMAT_ABGR
50#define SSE_ALIGNED
51#include "yuv_rgb_sse_func.h"
52
53#define SSE_FUNCTION_NAME yuv422_rgb565_sse
54#define STD_FUNCTION_NAME yuv422_rgb565_std
55#define YUV_FORMAT YUV_FORMAT_422
56#define RGB_FORMAT RGB_FORMAT_RGB565
57#define SSE_ALIGNED
58#include "yuv_rgb_sse_func.h"
59
60#define SSE_FUNCTION_NAME yuv422_rgb24_sse
61#define STD_FUNCTION_NAME yuv422_rgb24_std
62#define YUV_FORMAT YUV_FORMAT_422
63#define RGB_FORMAT RGB_FORMAT_RGB24
64#define SSE_ALIGNED
65#include "yuv_rgb_sse_func.h"
66
67#define SSE_FUNCTION_NAME yuv422_rgba_sse
68#define STD_FUNCTION_NAME yuv422_rgba_std
69#define YUV_FORMAT YUV_FORMAT_422
70#define RGB_FORMAT RGB_FORMAT_RGBA
71#define SSE_ALIGNED
72#include "yuv_rgb_sse_func.h"
73
74#define SSE_FUNCTION_NAME yuv422_bgra_sse
75#define STD_FUNCTION_NAME yuv422_bgra_std
76#define YUV_FORMAT YUV_FORMAT_422
77#define RGB_FORMAT RGB_FORMAT_BGRA
78#define SSE_ALIGNED
79#include "yuv_rgb_sse_func.h"
80
81#define SSE_FUNCTION_NAME yuv422_argb_sse
82#define STD_FUNCTION_NAME yuv422_argb_std
83#define YUV_FORMAT YUV_FORMAT_422
84#define RGB_FORMAT RGB_FORMAT_ARGB
85#define SSE_ALIGNED
86#include "yuv_rgb_sse_func.h"
87
88#define SSE_FUNCTION_NAME yuv422_abgr_sse
89#define STD_FUNCTION_NAME yuv422_abgr_std
90#define YUV_FORMAT YUV_FORMAT_422
91#define RGB_FORMAT RGB_FORMAT_ABGR
92#define SSE_ALIGNED
93#include "yuv_rgb_sse_func.h"
94
95#define SSE_FUNCTION_NAME yuvnv12_rgb565_sse
96#define STD_FUNCTION_NAME yuvnv12_rgb565_std
97#define YUV_FORMAT YUV_FORMAT_NV12
98#define RGB_FORMAT RGB_FORMAT_RGB565
99#define SSE_ALIGNED
100#include "yuv_rgb_sse_func.h"
101
102#define SSE_FUNCTION_NAME yuvnv12_rgb24_sse
103#define STD_FUNCTION_NAME yuvnv12_rgb24_std
104#define YUV_FORMAT YUV_FORMAT_NV12
105#define RGB_FORMAT RGB_FORMAT_RGB24
106#define SSE_ALIGNED
107#include "yuv_rgb_sse_func.h"
108
109#define SSE_FUNCTION_NAME yuvnv12_rgba_sse
110#define STD_FUNCTION_NAME yuvnv12_rgba_std
111#define YUV_FORMAT YUV_FORMAT_NV12
112#define RGB_FORMAT RGB_FORMAT_RGBA
113#define SSE_ALIGNED
114#include "yuv_rgb_sse_func.h"
115
116#define SSE_FUNCTION_NAME yuvnv12_bgra_sse
117#define STD_FUNCTION_NAME yuvnv12_bgra_std
118#define YUV_FORMAT YUV_FORMAT_NV12
119#define RGB_FORMAT RGB_FORMAT_BGRA
120#define SSE_ALIGNED
121#include "yuv_rgb_sse_func.h"
122
123#define SSE_FUNCTION_NAME yuvnv12_argb_sse
124#define STD_FUNCTION_NAME yuvnv12_argb_std
125#define YUV_FORMAT YUV_FORMAT_NV12
126#define RGB_FORMAT RGB_FORMAT_ARGB
127#define SSE_ALIGNED
128#include "yuv_rgb_sse_func.h"
129
130#define SSE_FUNCTION_NAME yuvnv12_abgr_sse
131#define STD_FUNCTION_NAME yuvnv12_abgr_std
132#define YUV_FORMAT YUV_FORMAT_NV12
133#define RGB_FORMAT RGB_FORMAT_ABGR
134#define SSE_ALIGNED
135#include "yuv_rgb_sse_func.h"
136*/
137
/*
 * Instantiate the "_sseu" YUV -> RGB converters.
 *
 * Each group below defines the four parameters that yuv_rgb_sse_func.h asks
 * for (SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT, RGB_FORMAT) and then
 * includes that header once. The groups cover three YUV_FORMAT values
 * (420, 422, NV12) times six RGB_FORMAT values (RGB565, RGB24, RGBA, BGRA,
 * ARGB, ABGR). None of these groups define the optional SSE_ALIGNED macro.
 *
 * NOTE(review): redefining the same parameter names in every group relies on
 * yuv_rgb_sse_func.h undefining them at its end; that part of the header is
 * not visible here -- confirm before adding or reordering groups.
 */
138#define SSE_FUNCTION_NAME yuv420_rgb565_sseu
139#define STD_FUNCTION_NAME yuv420_rgb565_std
140#define YUV_FORMAT YUV_FORMAT_420
141#define RGB_FORMAT RGB_FORMAT_RGB565
142#include "yuv_rgb_sse_func.h"
143
144#define SSE_FUNCTION_NAME yuv420_rgb24_sseu
145#define STD_FUNCTION_NAME yuv420_rgb24_std
146#define YUV_FORMAT YUV_FORMAT_420
147#define RGB_FORMAT RGB_FORMAT_RGB24
148#include "yuv_rgb_sse_func.h"
149
150#define SSE_FUNCTION_NAME yuv420_rgba_sseu
151#define STD_FUNCTION_NAME yuv420_rgba_std
152#define YUV_FORMAT YUV_FORMAT_420
153#define RGB_FORMAT RGB_FORMAT_RGBA
154#include "yuv_rgb_sse_func.h"
155
156#define SSE_FUNCTION_NAME yuv420_bgra_sseu
157#define STD_FUNCTION_NAME yuv420_bgra_std
158#define YUV_FORMAT YUV_FORMAT_420
159#define RGB_FORMAT RGB_FORMAT_BGRA
160#include "yuv_rgb_sse_func.h"
161
162#define SSE_FUNCTION_NAME yuv420_argb_sseu
163#define STD_FUNCTION_NAME yuv420_argb_std
164#define YUV_FORMAT YUV_FORMAT_420
165#define RGB_FORMAT RGB_FORMAT_ARGB
166#include "yuv_rgb_sse_func.h"
167
168#define SSE_FUNCTION_NAME yuv420_abgr_sseu
169#define STD_FUNCTION_NAME yuv420_abgr_std
170#define YUV_FORMAT YUV_FORMAT_420
171#define RGB_FORMAT RGB_FORMAT_ABGR
172#include "yuv_rgb_sse_func.h"
173
174#define SSE_FUNCTION_NAME yuv422_rgb565_sseu
175#define STD_FUNCTION_NAME yuv422_rgb565_std
176#define YUV_FORMAT YUV_FORMAT_422
177#define RGB_FORMAT RGB_FORMAT_RGB565
178#include "yuv_rgb_sse_func.h"
179
180#define SSE_FUNCTION_NAME yuv422_rgb24_sseu
181#define STD_FUNCTION_NAME yuv422_rgb24_std
182#define YUV_FORMAT YUV_FORMAT_422
183#define RGB_FORMAT RGB_FORMAT_RGB24
184#include "yuv_rgb_sse_func.h"
185
186#define SSE_FUNCTION_NAME yuv422_rgba_sseu
187#define STD_FUNCTION_NAME yuv422_rgba_std
188#define YUV_FORMAT YUV_FORMAT_422
189#define RGB_FORMAT RGB_FORMAT_RGBA
190#include "yuv_rgb_sse_func.h"
191
192#define SSE_FUNCTION_NAME yuv422_bgra_sseu
193#define STD_FUNCTION_NAME yuv422_bgra_std
194#define YUV_FORMAT YUV_FORMAT_422
195#define RGB_FORMAT RGB_FORMAT_BGRA
196#include "yuv_rgb_sse_func.h"
197
198#define SSE_FUNCTION_NAME yuv422_argb_sseu
199#define STD_FUNCTION_NAME yuv422_argb_std
200#define YUV_FORMAT YUV_FORMAT_422
201#define RGB_FORMAT RGB_FORMAT_ARGB
202#include "yuv_rgb_sse_func.h"
203
204#define SSE_FUNCTION_NAME yuv422_abgr_sseu
205#define STD_FUNCTION_NAME yuv422_abgr_std
206#define YUV_FORMAT YUV_FORMAT_422
207#define RGB_FORMAT RGB_FORMAT_ABGR
208#include "yuv_rgb_sse_func.h"
209
210#define SSE_FUNCTION_NAME yuvnv12_rgb565_sseu
211#define STD_FUNCTION_NAME yuvnv12_rgb565_std
212#define YUV_FORMAT YUV_FORMAT_NV12
213#define RGB_FORMAT RGB_FORMAT_RGB565
214#include "yuv_rgb_sse_func.h"
215
216#define SSE_FUNCTION_NAME yuvnv12_rgb24_sseu
217#define STD_FUNCTION_NAME yuvnv12_rgb24_std
218#define YUV_FORMAT YUV_FORMAT_NV12
219#define RGB_FORMAT RGB_FORMAT_RGB24
220#include "yuv_rgb_sse_func.h"
221
222#define SSE_FUNCTION_NAME yuvnv12_rgba_sseu
223#define STD_FUNCTION_NAME yuvnv12_rgba_std
224#define YUV_FORMAT YUV_FORMAT_NV12
225#define RGB_FORMAT RGB_FORMAT_RGBA
226#include "yuv_rgb_sse_func.h"
227
228#define SSE_FUNCTION_NAME yuvnv12_bgra_sseu
229#define STD_FUNCTION_NAME yuvnv12_bgra_std
230#define YUV_FORMAT YUV_FORMAT_NV12
231#define RGB_FORMAT RGB_FORMAT_BGRA
232#include "yuv_rgb_sse_func.h"
233
234#define SSE_FUNCTION_NAME yuvnv12_argb_sseu
235#define STD_FUNCTION_NAME yuvnv12_argb_std
236#define YUV_FORMAT YUV_FORMAT_NV12
237#define RGB_FORMAT RGB_FORMAT_ARGB
238#include "yuv_rgb_sse_func.h"
239
240#define SSE_FUNCTION_NAME yuvnv12_abgr_sseu
241#define STD_FUNCTION_NAME yuvnv12_abgr_std
242#define YUV_FORMAT YUV_FORMAT_NV12
243#define RGB_FORMAT RGB_FORMAT_ABGR
244#include "yuv_rgb_sse_func.h"
245
246
247/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
248#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
249R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
250R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
251G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
252G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
253B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
254B2 = _mm_unpackhi_epi8(RGB3, RGB6);
255
256#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
257RGB1 = _mm_unpacklo_epi8(R1, G2); \
258RGB2 = _mm_unpackhi_epi8(R1, G2); \
259RGB3 = _mm_unpacklo_epi8(R2, B1); \
260RGB4 = _mm_unpackhi_epi8(R2, B1); \
261RGB5 = _mm_unpacklo_epi8(G1, B2); \
262RGB6 = _mm_unpackhi_epi8(G1, B2); \
263
264#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
265UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
266UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
267UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
268UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
269UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
270
271#define RGB2YUV_16(R, G, B, Y, U, V) \
272Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
273 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
274Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
275Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \
276Y = _mm_srai_epi16(Y, PRECISION); \
277U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
278 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
279U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
280U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \
281U = _mm_srai_epi16(U, PRECISION); \
282V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
283 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
284V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
285V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \
286V = _mm_srai_epi16(V, PRECISION);
287*/
288
289#if 0 // SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
290#define RGB2YUV_32 \
291 __m128i r1, r2, b1, b2, g1, g2; \
292 __m128i r_16, g_16, b_16; \
293 __m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
294 __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
295 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
296 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
297 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
298 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
299 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
300 /* unpack rgb24 data to r, g and b data in separate channels*/ \
301 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
302 /* process pixels of first line */ \
303 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
304 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
305 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
306 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
307 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
308 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
309 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
310 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
311 y = _mm_packus_epi16(y1_16, y2_16); \
312 u1 = _mm_packus_epi16(u1_16, u2_16); \
313 v1 = _mm_packus_epi16(v1_16, v2_16); \
314 /* save Y values */ \
315 SAVE_SI128((__m128i*)(y_ptr1), y); \
316 /* process pixels of second line */ \
317 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
318 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
319 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
320 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
321 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
322 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
323 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
324 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
325 y = _mm_packus_epi16(y1_16, y2_16); \
326 u2 = _mm_packus_epi16(u1_16, u2_16); \
327 v2 = _mm_packus_epi16(v1_16, v2_16); \
328 /* save Y values */ \
329 SAVE_SI128((__m128i*)(y_ptr2), y); \
330 /* vertical subsampling of u/v values */ \
331 u1_tmp = _mm_avg_epu8(u1, u2); \
332 v1_tmp = _mm_avg_epu8(v1, v2); \
333 /* do the same again with next data */ \
334 rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
335 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
336 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
337 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
338 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
339 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
340 /* unpack rgb24 data to r, g and b data in separate channels*/ \
341 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
342 /* process pixels of first line */ \
343 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
344 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
345 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
346 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
347 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
348 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
349 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
350 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
351 y = _mm_packus_epi16(y1_16, y2_16); \
352 u1 = _mm_packus_epi16(u1_16, u2_16); \
353 v1 = _mm_packus_epi16(v1_16, v2_16); \
354 /* save Y values */ \
355 SAVE_SI128((__m128i*)(y_ptr1+16), y); \
356 /* process pixels of second line */ \
357 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
358 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
359 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
360 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
361 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
362 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
363 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
364 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
365 y = _mm_packus_epi16(y1_16, y2_16); \
366 u2 = _mm_packus_epi16(u1_16, u2_16); \
367 v2 = _mm_packus_epi16(v1_16, v2_16); \
368 /* save Y values */ \
369 SAVE_SI128((__m128i*)(y_ptr2+16), y); \
370 /* vertical subsampling of u/v values */ \
371 u2_tmp = _mm_avg_epu8(u1, u2); \
372 v2_tmp = _mm_avg_epu8(v1, v2); \
373 /* horizontal subsampling of u/v values */ \
374 u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
375 v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
376 u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
377 v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
378 u1 = _mm_avg_epu8(u1, u2); \
379 v1 = _mm_avg_epu8(v1, v2); \
380 SAVE_SI128((__m128i*)(u_ptr), u1); \
381 SAVE_SI128((__m128i*)(v_ptr), v1);
382#endif
383
384/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
385void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height,
386 const uint8_t *RGB, uint32_t RGB_stride,
387 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
388 YCbCrType yuv_type)
389{
390 #define LOAD_SI128 _mm_load_si128
391 #define SAVE_SI128 _mm_stream_si128
392 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
393
394 uint32_t xpos, ypos;
395 for(ypos=0; ypos<(height-1); ypos+=2)
396 {
397 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
398 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
399
400 uint8_t *y_ptr1=Y+ypos*Y_stride,
401 *y_ptr2=Y+(ypos+1)*Y_stride,
402 *u_ptr=U+(ypos/2)*UV_stride,
403 *v_ptr=V+(ypos/2)*UV_stride;
404
405 for(xpos=0; xpos<(width-31); xpos+=32)
406 {
407 RGB2YUV_32
408
409 rgb_ptr1+=96;
410 rgb_ptr2+=96;
411 y_ptr1+=32;
412 y_ptr2+=32;
413 u_ptr+=16;
414 v_ptr+=16;
415 }
416 }
417 #undef LOAD_SI128
418 #undef SAVE_SI128
419}
420
421void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height,
422 const uint8_t *RGB, uint32_t RGB_stride,
423 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
424 YCbCrType yuv_type)
425{
426 #define LOAD_SI128 _mm_loadu_si128
427 #define SAVE_SI128 _mm_storeu_si128
428 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
429
430 uint32_t xpos, ypos;
431 for(ypos=0; ypos<(height-1); ypos+=2)
432 {
433 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
434 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
435
436 uint8_t *y_ptr1=Y+ypos*Y_stride,
437 *y_ptr2=Y+(ypos+1)*Y_stride,
438 *u_ptr=U+(ypos/2)*UV_stride,
439 *v_ptr=V+(ypos/2)*UV_stride;
440
441 for(xpos=0; xpos<(width-31); xpos+=32)
442 {
443 RGB2YUV_32
444
445 rgb_ptr1+=96;
446 rgb_ptr2+=96;
447 y_ptr1+=32;
448 y_ptr2+=32;
449 u_ptr+=16;
450 v_ptr+=16;
451 }
452 }
453 #undef LOAD_SI128
454 #undef SAVE_SI128
455}
456*/
457
458#endif // SDL_SSE2_INTRINSICS
459
460#endif // SDL_HAVE_YUV
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h
new file mode 100644
index 0000000..bfad856
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse.h
@@ -0,0 +1,241 @@
1#ifdef SDL_SSE2_INTRINSICS
2
3#include "yuv_rgb_common.h"
4
5// yuv to rgb, sse implementation
6// pointers must be 16 byte aligned, and strides must be divisible by 16
7void yuv420_rgb565_sse(
8 uint32_t width, uint32_t height,
9 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
10 uint8_t *rgb, uint32_t rgb_stride,
11 YCbCrType yuv_type);
12
13void yuv420_rgb24_sse(
14 uint32_t width, uint32_t height,
15 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
16 uint8_t *rgb, uint32_t rgb_stride,
17 YCbCrType yuv_type);
18
19void yuv420_rgba_sse(
20 uint32_t width, uint32_t height,
21 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
22 uint8_t *rgb, uint32_t rgb_stride,
23 YCbCrType yuv_type);
24
25void yuv420_bgra_sse(
26 uint32_t width, uint32_t height,
27 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
28 uint8_t *rgb, uint32_t rgb_stride,
29 YCbCrType yuv_type);
30
31void yuv420_argb_sse(
32 uint32_t width, uint32_t height,
33 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
34 uint8_t *rgb, uint32_t rgb_stride,
35 YCbCrType yuv_type);
36
37void yuv420_abgr_sse(
38 uint32_t width, uint32_t height,
39 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
40 uint8_t *rgb, uint32_t rgb_stride,
41 YCbCrType yuv_type);
42
43void yuv422_rgb565_sse(
44 uint32_t width, uint32_t height,
45 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
46 uint8_t *rgb, uint32_t rgb_stride,
47 YCbCrType yuv_type);
48
49void yuv422_rgb24_sse(
50 uint32_t width, uint32_t height,
51 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
52 uint8_t *rgb, uint32_t rgb_stride,
53 YCbCrType yuv_type);
54
55void yuv422_rgba_sse(
56 uint32_t width, uint32_t height,
57 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
58 uint8_t *rgb, uint32_t rgb_stride,
59 YCbCrType yuv_type);
60
61void yuv422_bgra_sse(
62 uint32_t width, uint32_t height,
63 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
64 uint8_t *rgb, uint32_t rgb_stride,
65 YCbCrType yuv_type);
66
67void yuv422_argb_sse(
68 uint32_t width, uint32_t height,
69 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
70 uint8_t *rgb, uint32_t rgb_stride,
71 YCbCrType yuv_type);
72
73void yuv422_abgr_sse(
74 uint32_t width, uint32_t height,
75 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
76 uint8_t *rgb, uint32_t rgb_stride,
77 YCbCrType yuv_type);
78
79void yuvnv12_rgb565_sse(
80 uint32_t width, uint32_t height,
81 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
82 uint8_t *rgb, uint32_t rgb_stride,
83 YCbCrType yuv_type);
84
85void yuvnv12_rgb24_sse(
86 uint32_t width, uint32_t height,
87 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
88 uint8_t *rgb, uint32_t rgb_stride,
89 YCbCrType yuv_type);
90
91void yuvnv12_rgba_sse(
92 uint32_t width, uint32_t height,
93 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
94 uint8_t *rgb, uint32_t rgb_stride,
95 YCbCrType yuv_type);
96
97void yuvnv12_bgra_sse(
98 uint32_t width, uint32_t height,
99 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
100 uint8_t *rgb, uint32_t rgb_stride,
101 YCbCrType yuv_type);
102
103void yuvnv12_argb_sse(
104 uint32_t width, uint32_t height,
105 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
106 uint8_t *rgb, uint32_t rgb_stride,
107 YCbCrType yuv_type);
108
109void yuvnv12_abgr_sse(
110 uint32_t width, uint32_t height,
111 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
112 uint8_t *rgb, uint32_t rgb_stride,
113 YCbCrType yuv_type);
114
115// yuv to rgb, sse implementation
116// pointers do not need to be 16 byte aligned
117void yuv420_rgb565_sseu(
118 uint32_t width, uint32_t height,
119 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
120 uint8_t *rgb, uint32_t rgb_stride,
121 YCbCrType yuv_type);
122
123void yuv420_rgb24_sseu(
124 uint32_t width, uint32_t height,
125 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
126 uint8_t *rgb, uint32_t rgb_stride,
127 YCbCrType yuv_type);
128
129void yuv420_rgba_sseu(
130 uint32_t width, uint32_t height,
131 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
132 uint8_t *rgb, uint32_t rgb_stride,
133 YCbCrType yuv_type);
134
135void yuv420_bgra_sseu(
136 uint32_t width, uint32_t height,
137 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
138 uint8_t *rgb, uint32_t rgb_stride,
139 YCbCrType yuv_type);
140
141void yuv420_argb_sseu(
142 uint32_t width, uint32_t height,
143 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
144 uint8_t *rgb, uint32_t rgb_stride,
145 YCbCrType yuv_type);
146
147void yuv420_abgr_sseu(
148 uint32_t width, uint32_t height,
149 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
150 uint8_t *rgb, uint32_t rgb_stride,
151 YCbCrType yuv_type);
152
153void yuv422_rgb565_sseu(
154 uint32_t width, uint32_t height,
155 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
156 uint8_t *rgb, uint32_t rgb_stride,
157 YCbCrType yuv_type);
158
159void yuv422_rgb24_sseu(
160 uint32_t width, uint32_t height,
161 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
162 uint8_t *rgb, uint32_t rgb_stride,
163 YCbCrType yuv_type);
164
165void yuv422_rgba_sseu(
166 uint32_t width, uint32_t height,
167 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
168 uint8_t *rgb, uint32_t rgb_stride,
169 YCbCrType yuv_type);
170
171void yuv422_bgra_sseu(
172 uint32_t width, uint32_t height,
173 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
174 uint8_t *rgb, uint32_t rgb_stride,
175 YCbCrType yuv_type);
176
177void yuv422_argb_sseu(
178 uint32_t width, uint32_t height,
179 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
180 uint8_t *rgb, uint32_t rgb_stride,
181 YCbCrType yuv_type);
182
183void yuv422_abgr_sseu(
184 uint32_t width, uint32_t height,
185 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
186 uint8_t *rgb, uint32_t rgb_stride,
187 YCbCrType yuv_type);
188
189void yuvnv12_rgb565_sseu(
190 uint32_t width, uint32_t height,
191 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
192 uint8_t *rgb, uint32_t rgb_stride,
193 YCbCrType yuv_type);
194
195void yuvnv12_rgb24_sseu(
196 uint32_t width, uint32_t height,
197 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
198 uint8_t *rgb, uint32_t rgb_stride,
199 YCbCrType yuv_type);
200
201void yuvnv12_rgba_sseu(
202 uint32_t width, uint32_t height,
203 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
204 uint8_t *rgb, uint32_t rgb_stride,
205 YCbCrType yuv_type);
206
207void yuvnv12_bgra_sseu(
208 uint32_t width, uint32_t height,
209 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
210 uint8_t *rgb, uint32_t rgb_stride,
211 YCbCrType yuv_type);
212
213void yuvnv12_argb_sseu(
214 uint32_t width, uint32_t height,
215 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
216 uint8_t *rgb, uint32_t rgb_stride,
217 YCbCrType yuv_type);
218
219void yuvnv12_abgr_sseu(
220 uint32_t width, uint32_t height,
221 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
222 uint8_t *rgb, uint32_t rgb_stride,
223 YCbCrType yuv_type);
224
225
226// rgb to yuv, sse implementation
227// pointers must be 16 byte aligned, and strides must be divisible by 16
228void rgb24_yuv420_sse(
229 uint32_t width, uint32_t height,
230 const uint8_t *rgb, uint32_t rgb_stride,
231 uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
232 YCbCrType yuv_type);
233
234// rgb to yuv, sse implementation
235// pointers do not need to be 16 byte aligned
236void rgb24_yuv420_sseu(
237 uint32_t width, uint32_t height,
238 const uint8_t *rgb, uint32_t rgb_stride,
239 uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
240 YCbCrType yuv_type);
241#endif
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h
new file mode 100644
index 0000000..cbd751d
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_sse_func.h
@@ -0,0 +1,529 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3
4/* You need to define the following macros before including this file:
5 SSE_FUNCTION_NAME
6 STD_FUNCTION_NAME
7 YUV_FORMAT
8 RGB_FORMAT
9*/
10/* You may define the following macro, which affects generated code:
11 SSE_ALIGNED
12*/
13
14#ifdef SSE_ALIGNED
15/* Unaligned instructions seem faster, even on aligned data? */
16/*
17#define LOAD_SI128 _mm_load_si128
18#define SAVE_SI128 _mm_stream_si128
19*/
20#define LOAD_SI128 _mm_loadu_si128
21#define SAVE_SI128 _mm_storeu_si128
22#else
23#define LOAD_SI128 _mm_loadu_si128
24#define SAVE_SI128 _mm_storeu_si128
25#endif
26
/* UV2RGB_16: compute the chroma-only part of R, G and B for 16 pixels.
 * U and V are registers of eight 16-bit chroma values. The macro forms
 *   r = V * v_r_factor
 *   g = U * u_g_factor + V * v_g_factor
 *   b = U * u_b_factor
 * (low 16 bits of each product), then unpacks every result register with
 * itself so that each 16-bit lane appears twice in a row: R1/G1/B1 receive
 * the four low lanes duplicated, R2/G2/B2 the four high lanes duplicated.
 * One chroma sample therefore feeds two adjacent output lanes.
 * Relies on `param`, `r_tmp`, `g_tmp` and `b_tmp` existing at the expansion site. */
27#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
28 r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
29 g_tmp = _mm_add_epi16( \
30 _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
31 _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
32 b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
33 R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
34 G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
35 B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
36 R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
37 G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
38 B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
39
/* ADD_Y2RGB_16: add the luma term to the chroma terms and drop the fixed-point scale.
 * Y1 and Y2 (eight 16-bit luma values each) are overwritten in place with
 * (Y - y_shift) * y_factor. Each of R1..B2 is then replaced by
 * (channel + scaled Y) arithmetically shifted right by PRECISION.
 * Results stay as signed 16-bit lanes; no clamping to [0,255] happens here.
 * Relies on `param` existing at the expansion site. */
40#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
41 Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
42 Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
43 \
44 R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
45 G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
46 B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
47 R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
48 G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
49 B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
50
/* PACK_RGB565_32: pack 32 pixels of 8-bit R, G, B into 16-bit 5:6:5 pixels.
 * R1/G1/B1 hold the channel bytes of pixels 0..15 and R2/G2/B2 those of
 * pixels 16..31; RGB1..RGB4 each receive eight 16-bit pixels.
 *   - red:   unpacking (zero, R) puts the red byte in the high byte of each
 *            16-bit lane; masking with 0xF800 keeps its top 5 bits (bits 11..15).
 *   - green: zero-extended, >>2 then <<5 keeps the top 6 bits at bits 5..10.
 *   - blue:  zero-extended, >>3 keeps the top 5 bits at bits 0..4.
 * The three fields are OR-ed together. */
51#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
52{ \
53 __m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
54\
55 red_mask = _mm_set1_epi16((unsigned short)0xF800); \
56 RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
57 RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
58 RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
59 RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
60 tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
61 tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
62 tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
63 tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
64 RGB1 = _mm_or_si128(RGB1, tmp1); \
65 RGB2 = _mm_or_si128(RGB2, tmp2); \
66 RGB3 = _mm_or_si128(RGB3, tmp3); \
67 RGB4 = _mm_or_si128(RGB4, tmp4); \
68 tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
69 tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
70 tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
71 tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
72 RGB1 = _mm_or_si128(RGB1, tmp1); \
73 RGB2 = _mm_or_si128(RGB2, tmp2); \
74 RGB3 = _mm_or_si128(RGB3, tmp3); \
75 RGB4 = _mm_or_si128(RGB4, tmp4); \
76}
77
78#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
79RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
80RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
81RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
82RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
83RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
84RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
85
86#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
87R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
88R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
89G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
90G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
91B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
92B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
93
94#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
95PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
96PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
97PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
98PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
99PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
100
/* PACK_RGBA_32: interleave four 8-bit channel planes into 32 four-byte pixels.
 * The *1 arguments carry pixels 0..15 (written to RGB1..RGB4, four pixels per
 * register) and the *2 arguments pixels 16..31 (written to RGB5..RGB8).
 * Bytes are first interleaved pairwise (A with B, G with R) and the resulting
 * 16-bit pairs are interleaved again, so each output pixel is laid out in
 * memory, lowest address first, as the bytes passed for A, B, G, R.
 * Callers choose the actual channel order by which plane they pass in which
 * parameter position; the parameter names are only labels. */
101#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
102{ \
103 __m128i lo_ab, hi_ab, lo_gr, hi_gr; \
104\
105 lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
106 hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
107 lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
108 hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
109 RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
110 RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
111 RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
112 RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
113\
114 lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
115 hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
116 lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
117 hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
118 RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
119 RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
120 RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
121 RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
122}
123
124#if RGB_FORMAT == RGB_FORMAT_RGB565
125
126#define PACK_PIXEL \
127 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
128 \
129 PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
130 \
131 PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
132
133#elif RGB_FORMAT == RGB_FORMAT_RGB24
134
135#define PACK_PIXEL \
136 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
137 __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
138 \
139 PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
140 \
141 PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
142
143#elif RGB_FORMAT == RGB_FORMAT_RGBA
144
145#define PACK_PIXEL \
146 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
147 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
148 __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
149 \
150 PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
151 \
152 PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
153
154#elif RGB_FORMAT == RGB_FORMAT_BGRA
155
156#define PACK_PIXEL \
157 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
158 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
159 __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
160 \
161 PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
162 \
163 PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
164
165#elif RGB_FORMAT == RGB_FORMAT_ARGB
166
167#define PACK_PIXEL \
168 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
169 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
170 __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
171 \
172 PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
173 \
174 PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
175
176#elif RGB_FORMAT == RGB_FORMAT_ABGR
177
178#define PACK_PIXEL \
179 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
180 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
181 __m128i a = _mm_set1_epi8((unsigned char)0xFF); \
182 \
183 PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
184 \
185 PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
186
187#else
188#error PACK_PIXEL unimplemented
189#endif
190
191#if RGB_FORMAT == RGB_FORMAT_RGB565
192
193#define SAVE_LINE1 \
194 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
195 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
196 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
197 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
198
199#define SAVE_LINE2 \
200 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
201 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
202 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
203 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
204
205#elif RGB_FORMAT == RGB_FORMAT_RGB24
206
207#define SAVE_LINE1 \
208 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
209 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
210 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
211 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
212 SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
213 SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
214
215#define SAVE_LINE2 \
216 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
217 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
218 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
219 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
220 SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
221 SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
222
223#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
224 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
225
226#define SAVE_LINE1 \
227 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
228 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
229 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
230 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
231 SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
232 SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
233 SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
234 SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
235
236#define SAVE_LINE2 \
237 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
238 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
239 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
240 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
241 SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
242 SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
243 SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
244 SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
245
246#else
247#error SAVE_LINE unimplemented
248#endif
249
250#if YUV_FORMAT == YUV_FORMAT_420
251
252#define READ_Y(y_ptr) \
253 y = LOAD_SI128((const __m128i*)(y_ptr)); \
254
255#define READ_UV \
256 u = LOAD_SI128((const __m128i*)(u_ptr)); \
257 v = LOAD_SI128((const __m128i*)(v_ptr)); \
258
259#elif YUV_FORMAT == YUV_FORMAT_422
260
261#define READ_Y(y_ptr) \
262{ \
263 __m128i y1, y2; \
264 y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
265 y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
266 y = _mm_packus_epi16(y1, y2); \
267}
268
269#define READ_UV \
270{ \
271 __m128i u1, u2, u3, u4, v1, v2, v3, v4; \
272 u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
273 u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
274 u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
275 u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
276 u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
277 v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
278 v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
279 v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
280 v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
281 v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
282}
283
284#elif YUV_FORMAT == YUV_FORMAT_NV12
285
286#define READ_Y(y_ptr) \
287 y = LOAD_SI128((const __m128i*)(y_ptr)); \
288
289#define READ_UV \
290{ \
291 __m128i u1, u2, v1, v2; \
292 u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
293 u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
294 u = _mm_packus_epi16(u1, u2); \
295 v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
296 v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
297 v = _mm_packus_epi16(v1, v2); \
298}
299
300#else
301#error READ_UV unimplemented
302#endif
303
304#define YUV2RGB_32 \
305 __m128i r_tmp, g_tmp, b_tmp; \
306 __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
307 __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
308 __m128i y_16_1, y_16_2; \
309 __m128i y, u, v, u_16, v_16; \
310 __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
311 __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
312 \
313 READ_UV \
314 \
315 /* process first 16 pixels of first line */\
316 u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
317 v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
318 u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
319 v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
320 \
321 UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
322 r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
323 r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
324 \
325 READ_Y(y_ptr1) \
326 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
327 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
328 \
329 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
330 \
331 r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
332 g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
333 b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
334 \
335 /* process first 16 pixels of second line */\
336 r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
337 r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
338 \
339 READ_Y(y_ptr2) \
340 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
341 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
342 \
343 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
344 \
345 r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
346 g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
347 b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
348 \
349 /* process last 16 pixels of first line */\
350 u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
351 v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
352 u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
353 v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
354 \
355 UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
356 r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
357 r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
358 \
359 READ_Y(y_ptr1+16*y_pixel_stride) \
360 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
361 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
362 \
363 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
364 \
365 r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
366 g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
367 b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
368 \
369 /* process last 16 pixels of second line */\
370 r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
371 r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
372 \
373 READ_Y(y_ptr2+16*y_pixel_stride) \
374 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
375 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
376 \
377 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
378 \
379 r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
380 g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
381 b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
382 \
383
384
385void SDL_TARGETING("sse2") SSE_FUNCTION_NAME(uint32_t width, uint32_t height,
386 const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
387 uint8_t *RGB, uint32_t RGB_stride,
388 YCbCrType yuv_type)
389{
390 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
391#if YUV_FORMAT == YUV_FORMAT_420
392 const int y_pixel_stride = 1;
393 const int uv_pixel_stride = 1;
394 const int uv_x_sample_interval = 2;
395 const int uv_y_sample_interval = 2;
396#elif YUV_FORMAT == YUV_FORMAT_422
397 const int y_pixel_stride = 2;
398 const int uv_pixel_stride = 4;
399 const int uv_x_sample_interval = 2;
400 const int uv_y_sample_interval = 1;
401#elif YUV_FORMAT == YUV_FORMAT_NV12
402 const int y_pixel_stride = 1;
403 const int uv_pixel_stride = 2;
404 const int uv_x_sample_interval = 2;
405 const int uv_y_sample_interval = 2;
406#endif
407#if RGB_FORMAT == RGB_FORMAT_RGB565
408 const int rgb_pixel_stride = 2;
409#elif RGB_FORMAT == RGB_FORMAT_RGB24
410 const int rgb_pixel_stride = 3;
411#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
412 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
413 const int rgb_pixel_stride = 4;
414#else
415#error Unknown RGB pixel size
416#endif
417
418#if YUV_FORMAT == YUV_FORMAT_NV12
419 /* For NV12 formats (where U/V are interleaved)
420 * SSE READ_UV does an invalid read access at the very last pixel.
421 * As a workaround. Make sure not to decode the last column using assembly but with STD fallback path.
422 * see https://github.com/libsdl-org/SDL/issues/4841
423 */
424 const int fix_read_nv12 = ((width & 31) == 0);
425#else
426 const int fix_read_nv12 = 0;
427#endif
428
429#if YUV_FORMAT == YUV_FORMAT_422
430 /* Avoid invalid read on last line */
431 const int fix_read_422 = 1;
432#else
433 const int fix_read_422 = 0;
434#endif
435
436
437 if (width >= 32) {
438 uint32_t xpos, ypos;
439 for(ypos=0; ypos<(height-(uv_y_sample_interval-1)) - fix_read_422; ypos+=uv_y_sample_interval)
440 {
441 const uint8_t *y_ptr1=Y+ypos*Y_stride,
442 *y_ptr2=Y+(ypos+1)*Y_stride,
443 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
444 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
445
446 uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
447 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
448
449 for(xpos=0; xpos<(width-31) - fix_read_nv12; xpos+=32)
450 {
451 YUV2RGB_32
452 {
453 PACK_PIXEL
454 SAVE_LINE1
455 if (uv_y_sample_interval > 1)
456 {
457 SAVE_LINE2
458 }
459 }
460
461 y_ptr1+=32*y_pixel_stride;
462 y_ptr2+=32*y_pixel_stride;
463 u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
464 v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
465 rgb_ptr1+=32*rgb_pixel_stride;
466 rgb_ptr2+=32*rgb_pixel_stride;
467 }
468 }
469
470 if (fix_read_422) {
471 const uint8_t *y_ptr=Y+ypos*Y_stride,
472 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
473 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
474 uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
475 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
476 ypos += uv_y_sample_interval;
477 }
478
479 /* Catch the last line, if needed */
480 if (uv_y_sample_interval == 2 && ypos == (height-1))
481 {
482 const uint8_t *y_ptr=Y+ypos*Y_stride,
483 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
484 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
485
486 uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
487
488 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
489 }
490 }
491
492 /* Catch the right column, if needed */
493 {
494 uint32_t converted = (width & ~31);
495 if (fix_read_nv12) {
496 converted -= 32;
497 }
498 if (converted != width)
499 {
500 const uint8_t *y_ptr=Y+converted*y_pixel_stride,
501 *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
502 *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
503
504 uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
505
506 STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
507 }
508 }
509}
510
/* Clean up every macro this template consumed or defined, so the file can be
 * included again with a different SSE_FUNCTION_NAME / format selection. */
#undef SSE_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef SSE_ALIGNED
#undef LOAD_SI128
#undef SAVE_SI128
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef PACK_RGB565_32
#undef PACK_RGB24_32_STEP1
#undef PACK_RGB24_32_STEP2
#undef PACK_RGB24_32
#undef PACK_RGBA_32
#undef PACK_PIXEL
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c
new file mode 100644
index 0000000..0fa900d
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.c
@@ -0,0 +1,200 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "SDL_internal.h"
4
5#ifdef SDL_HAVE_YUV
6
7#include "yuv_rgb_internal.h"
8
9// divide by PRECISION_FACTOR and clamp to [0:255] interval
10// input must be in the [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range
11static uint8_t clampU8(int32_t v)
12{
13 static const uint8_t lut[512] =
14 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
15 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
16 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
17 47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,
18 91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
19 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,
20 159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
21 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,
22 225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
23 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
24 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
25 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
26 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
27 };
28 return lut[((v+128*PRECISION_FACTOR)>>PRECISION)&511];
29}
30
31static uint16_t clamp10(int32_t v)
32{
33 v >>= PRECISION;
34 if (v < 0) {
35 return 0;
36 } else if (v > 1023) {
37 return 1023;
38 } else {
39 return (uint16_t)v;
40 }
41}
42
43#define YUV_BITS 8
44
45#define STD_FUNCTION_NAME yuv420_rgb565_std
46#define YUV_FORMAT YUV_FORMAT_420
47#define RGB_FORMAT RGB_FORMAT_RGB565
48#include "yuv_rgb_std_func.h"
49
50#define STD_FUNCTION_NAME yuv420_rgb24_std
51#define YUV_FORMAT YUV_FORMAT_420
52#define RGB_FORMAT RGB_FORMAT_RGB24
53#include "yuv_rgb_std_func.h"
54
55#define STD_FUNCTION_NAME yuv420_rgba_std
56#define YUV_FORMAT YUV_FORMAT_420
57#define RGB_FORMAT RGB_FORMAT_RGBA
58#include "yuv_rgb_std_func.h"
59
60#define STD_FUNCTION_NAME yuv420_bgra_std
61#define YUV_FORMAT YUV_FORMAT_420
62#define RGB_FORMAT RGB_FORMAT_BGRA
63#include "yuv_rgb_std_func.h"
64
65#define STD_FUNCTION_NAME yuv420_argb_std
66#define YUV_FORMAT YUV_FORMAT_420
67#define RGB_FORMAT RGB_FORMAT_ARGB
68#include "yuv_rgb_std_func.h"
69
70#define STD_FUNCTION_NAME yuv420_abgr_std
71#define YUV_FORMAT YUV_FORMAT_420
72#define RGB_FORMAT RGB_FORMAT_ABGR
73#include "yuv_rgb_std_func.h"
74
75#define STD_FUNCTION_NAME yuv422_rgb565_std
76#define YUV_FORMAT YUV_FORMAT_422
77#define RGB_FORMAT RGB_FORMAT_RGB565
78#include "yuv_rgb_std_func.h"
79
80#define STD_FUNCTION_NAME yuv422_rgb24_std
81#define YUV_FORMAT YUV_FORMAT_422
82#define RGB_FORMAT RGB_FORMAT_RGB24
83#include "yuv_rgb_std_func.h"
84
85#define STD_FUNCTION_NAME yuv422_rgba_std
86#define YUV_FORMAT YUV_FORMAT_422
87#define RGB_FORMAT RGB_FORMAT_RGBA
88#include "yuv_rgb_std_func.h"
89
90#define STD_FUNCTION_NAME yuv422_bgra_std
91#define YUV_FORMAT YUV_FORMAT_422
92#define RGB_FORMAT RGB_FORMAT_BGRA
93#include "yuv_rgb_std_func.h"
94
95#define STD_FUNCTION_NAME yuv422_argb_std
96#define YUV_FORMAT YUV_FORMAT_422
97#define RGB_FORMAT RGB_FORMAT_ARGB
98#include "yuv_rgb_std_func.h"
99
100#define STD_FUNCTION_NAME yuv422_abgr_std
101#define YUV_FORMAT YUV_FORMAT_422
102#define RGB_FORMAT RGB_FORMAT_ABGR
103#include "yuv_rgb_std_func.h"
104
105#define STD_FUNCTION_NAME yuvnv12_rgb565_std
106#define YUV_FORMAT YUV_FORMAT_NV12
107#define RGB_FORMAT RGB_FORMAT_RGB565
108#include "yuv_rgb_std_func.h"
109
110#define STD_FUNCTION_NAME yuvnv12_rgb24_std
111#define YUV_FORMAT YUV_FORMAT_NV12
112#define RGB_FORMAT RGB_FORMAT_RGB24
113#include "yuv_rgb_std_func.h"
114
115#define STD_FUNCTION_NAME yuvnv12_rgba_std
116#define YUV_FORMAT YUV_FORMAT_NV12
117#define RGB_FORMAT RGB_FORMAT_RGBA
118#include "yuv_rgb_std_func.h"
119
120#define STD_FUNCTION_NAME yuvnv12_bgra_std
121#define YUV_FORMAT YUV_FORMAT_NV12
122#define RGB_FORMAT RGB_FORMAT_BGRA
123#include "yuv_rgb_std_func.h"
124
125#define STD_FUNCTION_NAME yuvnv12_argb_std
126#define YUV_FORMAT YUV_FORMAT_NV12
127#define RGB_FORMAT RGB_FORMAT_ARGB
128#include "yuv_rgb_std_func.h"
129
130#define STD_FUNCTION_NAME yuvnv12_abgr_std
131#define YUV_FORMAT YUV_FORMAT_NV12
132#define RGB_FORMAT RGB_FORMAT_ABGR
133#include "yuv_rgb_std_func.h"
134
135#undef YUV_BITS
136#define YUV_BITS 10
137
138#define STD_FUNCTION_NAME yuvp010_xbgr2101010_std
139#define YUV_FORMAT YUV_FORMAT_NV12
140#define RGB_FORMAT RGB_FORMAT_XBGR2101010
141#include "yuv_rgb_std_func.h"
142
143void rgb24_yuv420_std(
144 uint32_t width, uint32_t height,
145 const uint8_t *RGB, uint32_t RGB_stride,
146 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
147 YCbCrType yuv_type)
148{
149 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
150
151 uint32_t x, y;
152 for(y=0; y<(height-1); y+=2)
153 {
154 const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
155 *rgb_ptr2=RGB+(y+1)*RGB_stride;
156
157 uint8_t *y_ptr1=Y+y*Y_stride,
158 *y_ptr2=Y+(y+1)*Y_stride,
159 *u_ptr=U+(y/2)*UV_stride,
160 *v_ptr=V+(y/2)*UV_stride;
161
162 for(x=0; x<(width-1); x+=2)
163 {
164 // compute yuv for the four pixels, u and v values are summed
165 int32_t y_tmp, u_tmp, v_tmp;
166
167 y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
168 u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
169 v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
170 y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
171
172 y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
173 u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
174 v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
175 y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
176
177 y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
178 u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
179 v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
180 y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
181
182 y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
183 u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
184 v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
185 y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
186
187 u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
188 v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
189
190 rgb_ptr1 += 6;
191 rgb_ptr2 += 6;
192 y_ptr1 += 2;
193 y_ptr2 += 2;
194 u_ptr += 1;
195 v_ptr += 1;
196 }
197 }
198}
199
200#endif /* SDL_HAVE_YUV */
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h
new file mode 100644
index 0000000..c9f856b
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std.h
@@ -0,0 +1,143 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3
4// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
5
6// There are a few slightly different variations of the YCbCr color space with different parameters that
7// change the conversion matrix.
8// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
9// See the respective standards for details
10// The matrix values used are derived from http://www.equasys.de/colorconversion.html
11
12// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a factor of 2.
13// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
14// is suboptimal for image quality, but by far the fastest method.
15
16// For all methods, width and height should be even; if not, the last row/column of the result image won't be affected.
17// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
18
19/*#include <stdint.h>*/
20
21#include "yuv_rgb_common.h"
22
23// yuv to rgb, standard c implementation
24void yuv420_rgb565_std(
25 uint32_t width, uint32_t height,
26 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
27 uint8_t *rgb, uint32_t rgb_stride,
28 YCbCrType yuv_type);
29
30void yuv420_rgb24_std(
31 uint32_t width, uint32_t height,
32 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
33 uint8_t *rgb, uint32_t rgb_stride,
34 YCbCrType yuv_type);
35
36void yuv420_rgba_std(
37 uint32_t width, uint32_t height,
38 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
39 uint8_t *rgb, uint32_t rgb_stride,
40 YCbCrType yuv_type);
41
42void yuv420_bgra_std(
43 uint32_t width, uint32_t height,
44 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
45 uint8_t *rgb, uint32_t rgb_stride,
46 YCbCrType yuv_type);
47
48void yuv420_argb_std(
49 uint32_t width, uint32_t height,
50 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
51 uint8_t *rgb, uint32_t rgb_stride,
52 YCbCrType yuv_type);
53
54void yuv420_abgr_std(
55 uint32_t width, uint32_t height,
56 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
57 uint8_t *rgb, uint32_t rgb_stride,
58 YCbCrType yuv_type);
59
60void yuv422_rgb565_std(
61 uint32_t width, uint32_t height,
62 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
63 uint8_t *rgb, uint32_t rgb_stride,
64 YCbCrType yuv_type);
65
66void yuv422_rgb24_std(
67 uint32_t width, uint32_t height,
68 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
69 uint8_t *rgb, uint32_t rgb_stride,
70 YCbCrType yuv_type);
71
72void yuv422_rgba_std(
73 uint32_t width, uint32_t height,
74 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
75 uint8_t *rgb, uint32_t rgb_stride,
76 YCbCrType yuv_type);
77
78void yuv422_bgra_std(
79 uint32_t width, uint32_t height,
80 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
81 uint8_t *rgb, uint32_t rgb_stride,
82 YCbCrType yuv_type);
83
84void yuv422_argb_std(
85 uint32_t width, uint32_t height,
86 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
87 uint8_t *rgb, uint32_t rgb_stride,
88 YCbCrType yuv_type);
89
90void yuv422_abgr_std(
91 uint32_t width, uint32_t height,
92 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
93 uint8_t *rgb, uint32_t rgb_stride,
94 YCbCrType yuv_type);
95
96void yuvnv12_rgb565_std(
97 uint32_t width, uint32_t height,
98 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
99 uint8_t *rgb, uint32_t rgb_stride,
100 YCbCrType yuv_type);
101
102void yuvnv12_rgb24_std(
103 uint32_t width, uint32_t height,
104 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
105 uint8_t *rgb, uint32_t rgb_stride,
106 YCbCrType yuv_type);
107
108void yuvnv12_rgba_std(
109 uint32_t width, uint32_t height,
110 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
111 uint8_t *rgb, uint32_t rgb_stride,
112 YCbCrType yuv_type);
113
114void yuvnv12_bgra_std(
115 uint32_t width, uint32_t height,
116 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
117 uint8_t *rgb, uint32_t rgb_stride,
118 YCbCrType yuv_type);
119
120void yuvnv12_argb_std(
121 uint32_t width, uint32_t height,
122 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
123 uint8_t *rgb, uint32_t rgb_stride,
124 YCbCrType yuv_type);
125
126void yuvnv12_abgr_std(
127 uint32_t width, uint32_t height,
128 const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
129 uint8_t *rgb, uint32_t rgb_stride,
130 YCbCrType yuv_type);
131
132void yuvp010_xbgr2101010_std(
133 uint32_t width, uint32_t height,
134 const uint16_t *y, const uint16_t *u, const uint16_t *v, uint32_t y_stride, uint32_t uv_stride,
135 uint8_t *rgb, uint32_t rgb_stride,
136 YCbCrType yuv_type);
137
138// rgb to yuv, standard c implementation
139void rgb24_yuv420_std(
140 uint32_t width, uint32_t height,
141 const uint8_t *rgb, uint32_t rgb_stride,
142 uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
143 YCbCrType yuv_type);
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h
new file mode 100644
index 0000000..8091ea9
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_std_func.h
@@ -0,0 +1,271 @@
1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3
4/* You need to define the following macros before including this file:
5 STD_FUNCTION_NAME
6 YUV_FORMAT
7 RGB_FORMAT
8*/
9
/*
 * PACK_PIXEL(rgb_ptr)
 *
 * Stores one destination pixel at rgb_ptr in the layout selected by
 * RGB_FORMAT, then advances rgb_ptr past the pixel that was written.
 *
 * The macro reads the variables y_tmp, r_tmp, g_tmp and b_tmp from the scope
 * in which it is expanded. rgb_ptr must be a modifiable lvalue; it is
 * evaluated more than once, so pass a plain pointer variable.
 *
 * Every variant is wrapped in do { } while (0) so that it expands to exactly
 * one statement and can be used safely as the body of an unbraced if/else.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB565

/* 16-bit pixel: R in bits 15..11, G in bits 10..5, B in bits 4..0. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint16 *)(rgb_ptr) = \
            ((((Uint16)clampU8(y_tmp+r_tmp)) << 8 ) & 0xF800) | \
            ((((Uint16)clampU8(y_tmp+g_tmp)) << 3) & 0x07E0) | \
            (((Uint16)clampU8(y_tmp+b_tmp)) >> 3); \
        (rgb_ptr) += 2; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_RGB24

/* Three consecutive bytes: R, G, B. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        (rgb_ptr)[0] = clampU8(y_tmp+r_tmp); \
        (rgb_ptr)[1] = clampU8(y_tmp+g_tmp); \
        (rgb_ptr)[2] = clampU8(y_tmp+b_tmp); \
        (rgb_ptr) += 3; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_RGBA

/* 32-bit pixel: R<<24 | G<<16 | B<<8, low byte forced to 0xFF. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint32 *)(rgb_ptr) = \
            (((Uint32)clampU8(y_tmp+r_tmp)) << 24) | \
            (((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
            (((Uint32)clampU8(y_tmp+b_tmp)) << 8) | \
            0x000000FF; \
        (rgb_ptr) += 4; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_BGRA

/* 32-bit pixel: B<<24 | G<<16 | R<<8, low byte forced to 0xFF. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint32 *)(rgb_ptr) = \
            (((Uint32)clampU8(y_tmp+b_tmp)) << 24) | \
            (((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
            (((Uint32)clampU8(y_tmp+r_tmp)) << 8) | \
            0x000000FF; \
        (rgb_ptr) += 4; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_ARGB

/* 32-bit pixel: high byte forced to 0xFF, then R<<16 | G<<8 | B. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint32 *)(rgb_ptr) = \
            0xFF000000 | \
            (((Uint32)clampU8(y_tmp+r_tmp)) << 16) | \
            (((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
            (((Uint32)clampU8(y_tmp+b_tmp)) << 0); \
        (rgb_ptr) += 4; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_ABGR

/* 32-bit pixel: high byte forced to 0xFF, then B<<16 | G<<8 | R. */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint32 *)(rgb_ptr) = \
            0xFF000000 | \
            (((Uint32)clampU8(y_tmp+b_tmp)) << 16) | \
            (((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
            (((Uint32)clampU8(y_tmp+r_tmp)) << 0); \
        (rgb_ptr) += 4; \
    } while (0)

#elif RGB_FORMAT == RGB_FORMAT_XBGR2101010

/* 32-bit pixel: top two bits set, then 10-bit B<<20 | G<<10 | R (clamp10). */
#define PACK_PIXEL(rgb_ptr) \
    do { \
        *(Uint32 *)(rgb_ptr) = \
            0xC0000000 | \
            (((Uint32)clamp10(y_tmp+b_tmp)) << 20) | \
            (((Uint32)clamp10(y_tmp+g_tmp)) << 10) | \
            (((Uint32)clamp10(y_tmp+r_tmp)) << 0); \
        (rgb_ptr) += 4; \
    } while (0)

#else
#error PACK_PIXEL unimplemented
#endif
80
81
#if defined(_MSC_VER)
/* Visual Studio analyzer can't tell that we're building this with different constants */
#pragma warning(push)
#pragma warning(disable : 6239)
#endif

/* Storage type of one source sample: 8-bit unless YUV_BITS says more than 8. */
#undef YUV_TYPE
#if YUV_BITS <= 8
#define YUV_TYPE uint8_t
#else
#define YUV_TYPE uint16_t
#endif

/* Value subtracted from the U/V samples: half of the YUV_BITS-wide range. */
#undef UV_OFFSET
#define UV_OFFSET (1 << ((YUV_BITS)-1))

/* GET(X): identity, except for YUV_BITS == 10 where the value is shifted right by 6. */
#undef GET
#if YUV_BITS != 10
#define GET(X) (X)
#else
#define GET(X) ((X) >> 6)
#endif
102
103void STD_FUNCTION_NAME(
104 uint32_t width, uint32_t height,
105 const YUV_TYPE *Y, const YUV_TYPE *U, const YUV_TYPE *V, uint32_t Y_stride, uint32_t UV_stride,
106 uint8_t *RGB, uint32_t RGB_stride,
107 YCbCrType yuv_type)
108{
109 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
110#if YUV_FORMAT == YUV_FORMAT_420
111 #define y_pixel_stride 1
112 #define uv_pixel_stride 1
113 #define uv_x_sample_interval 2
114 #define uv_y_sample_interval 2
115#elif YUV_FORMAT == YUV_FORMAT_422
116 #define y_pixel_stride 2
117 #define uv_pixel_stride 4
118 #define uv_x_sample_interval 2
119 #define uv_y_sample_interval 1
120#elif YUV_FORMAT == YUV_FORMAT_NV12
121 #define y_pixel_stride 1
122 #define uv_pixel_stride 2
123 #define uv_x_sample_interval 2
124 #define uv_y_sample_interval 2
125#endif
126
127 Y_stride /= sizeof(YUV_TYPE);
128 UV_stride /= sizeof(YUV_TYPE);
129
130 uint32_t x, y;
131 for(y=0; y<(height-(uv_y_sample_interval-1)); y+=uv_y_sample_interval)
132 {
133 const YUV_TYPE *y_ptr1=Y+y*Y_stride,
134 *u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
135 *v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
136
137 #if uv_y_sample_interval > 1
138 const YUV_TYPE *y_ptr2=Y+(y+1)*Y_stride;
139 #endif
140
141 uint8_t *rgb_ptr1=RGB+y*RGB_stride;
142
143 #if uv_y_sample_interval > 1
144 uint8_t *rgb_ptr2=RGB+(y+1)*RGB_stride;
145 #endif
146
147 for(x=0; x<(width-(uv_x_sample_interval-1)); x+=uv_x_sample_interval)
148 {
149 // Compute U and V contributions, common to the four pixels
150
151 int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
152 int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
153
154 int32_t r_tmp = (v_tmp*param->v_r_factor);
155 int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
156 int32_t b_tmp = (u_tmp*param->u_b_factor);
157
158 // Compute the Y contribution for each pixel
159
160 int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
161 PACK_PIXEL(rgb_ptr1);
162
163 y_tmp = (GET(y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
164 PACK_PIXEL(rgb_ptr1);
165
166 #if uv_y_sample_interval > 1
167 y_tmp = (GET(y_ptr2[0]-param->y_shift)*param->y_factor);
168 PACK_PIXEL(rgb_ptr2);
169
170 y_tmp = (GET(y_ptr2[y_pixel_stride]-param->y_shift)*param->y_factor);
171 PACK_PIXEL(rgb_ptr2);
172 #endif
173
174 y_ptr1+=2*y_pixel_stride;
175 #if uv_y_sample_interval > 1
176 y_ptr2+=2*y_pixel_stride;
177 #endif
178 u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
179 v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
180 }
181
182 /* Catch the last pixel, if needed */
183 if (uv_x_sample_interval == 2 && x == (width-1))
184 {
185 // Compute U and V contributions, common to the four pixels
186
187 int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
188 int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
189
190 int32_t r_tmp = (v_tmp*param->v_r_factor);
191 int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
192 int32_t b_tmp = (u_tmp*param->u_b_factor);
193
194 // Compute the Y contribution for each pixel
195
196 int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
197 PACK_PIXEL(rgb_ptr1);
198
199 #if uv_y_sample_interval > 1
200 y_tmp = (GET(y_ptr2[0]-param->y_shift)*param->y_factor);
201 PACK_PIXEL(rgb_ptr2);
202 #endif
203 }
204 }
205
206 /* Catch the last line, if needed */
207 if (uv_y_sample_interval == 2 && y == (height-1))
208 {
209 const YUV_TYPE *y_ptr1=Y+y*Y_stride,
210 *u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
211 *v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
212
213 uint8_t *rgb_ptr1=RGB+y*RGB_stride;
214
215 for(x=0; x<(width-(uv_x_sample_interval-1)); x+=uv_x_sample_interval)
216 {
217 // Compute U and V contributions, common to the four pixels
218
219 int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
220 int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
221
222 int32_t r_tmp = (v_tmp*param->v_r_factor);
223 int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
224 int32_t b_tmp = (u_tmp*param->u_b_factor);
225
226 // Compute the Y contribution for each pixel
227
228 int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
229 PACK_PIXEL(rgb_ptr1);
230
231 y_tmp = (GET(y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
232 PACK_PIXEL(rgb_ptr1);
233
234 y_ptr1+=2*y_pixel_stride;
235 u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
236 v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
237 }
238
239 /* Catch the last pixel, if needed */
240 if (uv_x_sample_interval == 2 && x == (width-1))
241 {
242 // Compute U and V contributions, common to the four pixels
243
244 int32_t u_tmp = (GET(*u_ptr)-UV_OFFSET);
245 int32_t v_tmp = (GET(*v_ptr)-UV_OFFSET);
246
247 int32_t r_tmp = (v_tmp*param->v_r_factor);
248 int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
249 int32_t b_tmp = (u_tmp*param->u_b_factor);
250
251 // Compute the Y contribution for each pixel
252
253 int32_t y_tmp = (GET(y_ptr1[0]-param->y_shift)*param->y_factor);
254 PACK_PIXEL(rgb_ptr1);
255 }
256 }
257
258 #undef y_pixel_stride
259 #undef uv_pixel_stride
260 #undef uv_x_sample_interval
261 #undef uv_y_sample_interval
262}
263
264#ifdef _MSC_VER
265#pragma warning(pop)
266#endif
267
268#undef STD_FUNCTION_NAME
269#undef YUV_FORMAT
270#undef RGB_FORMAT
271#undef PACK_PIXEL