summaryrefslogtreecommitdiff
path: root/contrib/SDL-3.2.8/src/video/SDL_stretch.c
diff options
context:
space:
mode:
author3gg <3gg@shellblade.net>2025-12-27 12:03:39 -0800
committer3gg <3gg@shellblade.net>2025-12-27 12:03:39 -0800
commit5a079a2d114f96d4847d1ee305d5b7c16eeec50e (patch)
tree8926ab44f168acf787d8e19608857b3af0f82758 /contrib/SDL-3.2.8/src/video/SDL_stretch.c
Initial commit
Diffstat (limited to 'contrib/SDL-3.2.8/src/video/SDL_stretch.c')
-rw-r--r--contrib/SDL-3.2.8/src/video/SDL_stretch.c978
1 files changed, 978 insertions, 0 deletions
diff --git a/contrib/SDL-3.2.8/src/video/SDL_stretch.c b/contrib/SDL-3.2.8/src/video/SDL_stretch.c
new file mode 100644
index 0000000..c893cc3
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/SDL_stretch.c
@@ -0,0 +1,978 @@
1/*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20*/
21#include "SDL_internal.h"
22
23#include "SDL_surface_c.h"
24
25static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
26static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
27
28bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
29{
30 bool result;
31 int src_locked;
32 int dst_locked;
33 SDL_Rect full_src;
34 SDL_Rect full_dst;
35
36 if (!src) {
37 return SDL_InvalidParamError("src");
38 }
39 if (!dst) {
40 return SDL_InvalidParamError("dst");
41 }
42
43 if (src->format != dst->format) {
44 // Slow!
45 SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
46 if (!src_tmp) {
47 return false;
48 }
49 result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
50 SDL_DestroySurface(src_tmp);
51 return result;
52 }
53
54 if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
55 // Slow!
56 if (!dstrect) {
57 full_dst.x = 0;
58 full_dst.y = 0;
59 full_dst.w = dst->w;
60 full_dst.h = dst->h;
61 dstrect = &full_dst;
62 }
63
64 SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
65 SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
66 if (src_tmp && dst_tmp) {
67 result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
68 if (result) {
69 result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
70 dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
71 dst_tmp->pixels, dst_tmp->pitch,
72 dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
73 (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
74 }
75 } else {
76 result = false;
77 }
78 SDL_DestroySurface(src_tmp);
79 SDL_DestroySurface(dst_tmp);
80 return result;
81 }
82
83 if (scaleMode != SDL_SCALEMODE_NEAREST && scaleMode != SDL_SCALEMODE_LINEAR) {
84 return SDL_InvalidParamError("scaleMode");
85 }
86
87 if (scaleMode != SDL_SCALEMODE_NEAREST) {
88 scaleMode = SDL_SCALEMODE_LINEAR;
89 }
90
91 if (scaleMode == SDL_SCALEMODE_LINEAR) {
92 if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
93 return SDL_SetError("Wrong format");
94 }
95 }
96
97 // Verify the blit rectangles
98 if (srcrect) {
99 if ((srcrect->x < 0) || (srcrect->y < 0) ||
100 ((srcrect->x + srcrect->w) > src->w) ||
101 ((srcrect->y + srcrect->h) > src->h)) {
102 return SDL_SetError("Invalid source blit rectangle");
103 }
104 } else {
105 full_src.x = 0;
106 full_src.y = 0;
107 full_src.w = src->w;
108 full_src.h = src->h;
109 srcrect = &full_src;
110 }
111 if (dstrect) {
112 if ((dstrect->x < 0) || (dstrect->y < 0) ||
113 ((dstrect->x + dstrect->w) > dst->w) ||
114 ((dstrect->y + dstrect->h) > dst->h)) {
115 return SDL_SetError("Invalid destination blit rectangle");
116 }
117 } else {
118 full_dst.x = 0;
119 full_dst.y = 0;
120 full_dst.w = dst->w;
121 full_dst.h = dst->h;
122 dstrect = &full_dst;
123 }
124
125 if (dstrect->w <= 0 || dstrect->h <= 0) {
126 return true;
127 }
128
129 if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
130 dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
131 return SDL_SetError("Size too large for scaling");
132 }
133
134 // Lock the destination if it's in hardware
135 dst_locked = 0;
136 if (SDL_MUSTLOCK(dst)) {
137 if (!SDL_LockSurface(dst)) {
138 return SDL_SetError("Unable to lock destination surface");
139 }
140 dst_locked = 1;
141 }
142 // Lock the source if it's in hardware
143 src_locked = 0;
144 if (SDL_MUSTLOCK(src)) {
145 if (!SDL_LockSurface(src)) {
146 if (dst_locked) {
147 SDL_UnlockSurface(dst);
148 }
149 return SDL_SetError("Unable to lock source surface");
150 }
151 src_locked = 1;
152 }
153
154 if (scaleMode == SDL_SCALEMODE_NEAREST) {
155 result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
156 } else {
157 result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
158 }
159
160 // We need to unlock the surfaces if they're locked
161 if (dst_locked) {
162 SDL_UnlockSurface(dst);
163 }
164 if (src_locked) {
165 SDL_UnlockSurface(src);
166 }
167
168 return result;
169}
170
/* bilinear interpolation precision must be < 8
   Because with SSE: add-multiply: _mm_madd_epi16 works with signed int
   so pixels 0xb1...... are negatives and false the result
   same in NEON probably */
#define PRECISION 7

/* 16.16 fixed-point helpers. SRC_INDEX() yields the integer source pixel
   index; FRAC() yields the interpolation weight, reduced to PRECISION bits
   so channel products fit in signed 16-bit SIMD lanes. */
#define FIXED_POINT(i) ((Uint32)(i) << 16)
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)
#define FRAC(fp) ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)

/* Shared prologue for all bilinear scalers. Declares the loop state and
   computes, per axis, the fixed-point start/step plus the count of
   destination pixels at each edge whose sample point falls outside the
   source (left_pad/right_pad) and must be clamped.
   Expects src_w/src_h/dst_w/dst_h/dst_pitch in scope. */
#define BILINEAR___START \
    int i; \
    Sint64 fp_sum_h; \
    int fp_step_h, left_pad_h, right_pad_h; \
    Sint64 fp_sum_w; \
    int fp_step_w, left_pad_w, right_pad_w; \
    Sint64 fp_sum_w_init; \
    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init; \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w; \
    left_pad_w_init = left_pad_w; \
    right_pad_w_init = right_pad_w; \
    dst_gap = dst_pitch - 4 * dst_w; \
    middle_init = dst_w - left_pad_w - right_pad_w;

/* Per-row setup for the bilinear scalers. Picks the two source rows to
   blend (src_h0/src_h1) and their vertical weights (frac_h0/frac_h1),
   clamping to the first/last row inside the top/bottom padding, then
   resets the horizontal state for the new row. */
#define BILINEAR___HEIGHT \
    int index_h, frac_h0, frac_h1, middle; \
    const Uint32 *src_h0, *src_h1; \
    int no_padding; \
    Uint64 incr_h0, incr_h1; \
    \
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h); \
    index_h = SRC_INDEX(fp_sum_h); \
    frac_h0 = FRAC(fp_sum_h); \
    \
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1); \
    frac_h0 = no_padding ? frac_h0 : 0; \
    incr_h1 = no_padding ? src_pitch : 0; \
    incr_h0 = (Uint64)index_h * src_pitch; \
    \
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0); \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1); \
    \
    fp_sum_h += fp_step_h; \
    \
    frac_h1 = FRAC_ONE - frac_h0; \
    fp_sum_w = fp_sum_w_init; \
    right_pad_w = right_pad_w_init; \
    left_pad_w = left_pad_w_init; \
    middle = middle_init;
226
/* Compute, for one axis, the 16.16 fixed-point start position (*fp_start)
   and per-destination-pixel step (*fp_step) of a bilinear scale, plus the
   number of destination pixels at each edge (*left_pad / *right_pad) whose
   sample point lands outside the source and must be clamped.
   The rounding below intentionally matches pixman bit-for-bit, so the
   exact order of operations matters. */
#ifdef __clang__
// Remove inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
{

    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
    int x0 = FP_ONE / 2; // dst first pixel center at 0.5 in fixed point
    Sint64 fp_sum;
    int i;
#if 0
    // scale to source coordinates
    x0 *= src_nb;
    x0 /= dst_nb; // x0 == step / 2
#else
    // Use this code for perfect match with pixman
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
#endif
    // -= 0.5, get back the pixel origin, in source coordinates
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    // Walk every destination pixel and count the ones that sample before
    // the first source pixel pair (left) or past the last pair (right).
    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}
275
/* One 32-bit pixel viewed as four independent 8-bit channels. The channel
   order is irrelevant here: all four channels are interpolated the same way. */
typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
// Debug helper: dump 8 bytes of an arbitrary value (disabled).
static void printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t*) var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif
292
293/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
294
295static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
296{
297 const color_t *c0 = (const color_t *)src_x0;
298 const color_t *c1 = (const color_t *)src_x1;
299 color_t *cx = (color_t *)dst;
300#if 0
301 cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
302 cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
303 cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
304 cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
305#else
306 cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
307 cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
308 cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
309 cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
310#endif
311}
312
313static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
314{
315 Uint32 tmp[2];
316 unsigned int frac_w1 = FRAC_ONE - frac_w0;
317
318 // Vertical first, store to 'tmp'
319 INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
320 INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);
321
322 // Horizontal, store to 'dst'
323 INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
324}
325
/* Portable (non-SIMD) bilinear scaler for 4-bytes-per-pixel surfaces.
   Destination pixels whose sample point falls outside the source
   (the left/right padding computed by get_scaler_datas) are clamped
   to the edge columns; rows are clamped by BILINEAR___HEIGHT. */
static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        // Clamp to the left edge: horizontal weight fixed at zero.
        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
               .       .         .
               .       x         .
               .       .         .
               .       .         .
               x10 ... x1_ ..... x11
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        // Clamp to the right edge: sample the last column pair with full weight.
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        // Skip any padding bytes at the end of the destination row.
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
373
/* Vector-type casts used around the NEON loads/stores below; defined empty
   on MSVC, which does not accept these cast forms for NEON vector types. */
#ifdef SDL_NEON_INTRINSICS
#define CAST_uint8x8_t (uint8x8_t)
#define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(_MSC_VER)
#ifdef SDL_NEON_INTRINSICS
#undef CAST_uint8x8_t
#undef CAST_uint32x2_t
#define CAST_uint8x8_t
#define CAST_uint32x2_t
#endif
#endif
387
#ifdef SDL_SSE2_INTRINSICS

#if 0
// Debug helper: dump a __m128i as eight 16-bit lanes (disabled).
static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t*) &var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif
398
399static SDL_INLINE int hasSSE2(void)
400{
401 static int val = -1;
402 if (val != -1) {
403 return val;
404 }
405 val = SDL_HasSSE2();
406 return val;
407}
408
/* Bilinear interpolation of one destination pixel with SSE2.
   s0/s1 point at two horizontally adjacent pixels on two adjacent rows;
   frac_w is the horizontal weight, v_frac_h0/v_frac_h1 hold the vertical
   weights replicated across all 16-bit lanes. */
static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    // Interleaved (frac_w, 1 - frac_w) pairs for the madd below.
    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    // Interpolation vertical
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    // For a bit-exact match with the scalar path, the fractional part
    // could be cleared here if needed:
    /*
    k0 = _mm_srli_epi16(k0, PRECISION);
    k0 = _mm_slli_epi16(k0, PRECISION);
    */

    // Interpolation horizontal
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    // Store 1 pixel
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}
445
/* SSE2 bilinear scaler for 4-bytes-per-pixel surfaces. Same algorithm as
   scale_mat(), but the middle of each row is processed two destination
   pixels at a time; edge padding is clamped like the scalar version. */
static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        // Vertical weights for this row, replicated across all lanes.
        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
        zero = _mm_setzero_si128();

        // Clamp to the left edge: horizontal weight fixed at zero.
        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        // Two destination pixels per iteration.
        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
                .      .        .     .      .      .
                j0     f0       j1    j2     f1     j3
                .      .        .     .      .      .
                .      .        .     .      .      .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            // Interpolation vertical
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            // Interpolation horizontal
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            // Store 1 pixel
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            // Store 1 pixel
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        // Last point
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        // Clamp to the right edge: sample the last column pair with full weight.
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        // Skip any padding bytes at the end of the destination row.
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
564#endif
565
566#ifdef SDL_NEON_INTRINSICS
567
568static SDL_INLINE int hasNEON(void)
569{
570 static int val = -1;
571 if (val != -1) {
572 return val;
573 }
574 val = SDL_HasNEON();
575 return val;
576}
577
/* Bilinear interpolation of one destination pixel with NEON.
   s0/s1 point at two horizontally adjacent pixels on two adjacent rows;
   frac_w is the horizontal weight, v_frac_h0/v_frac_h1 hold the vertical
   weights splatted across all 8-bit lanes. */
static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    // k0 now contains 2 interpolated pixels { j0, j1 }
    // Horizontal pass: l0 = j0 * (FRAC_ONE - frac_w) + j1 * frac_w
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    // Shift and narrow
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));

    // Narrow again
    e0 = vmovn_u16(d0);

    // Store 1 pixel
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}
609
/* NEON bilinear scaler for 4-bytes-per-pixel surfaces. Same algorithm as
   scale_mat(), but the middle of each row is processed four destination
   pixels at a time (then 2, then 1); edge padding is clamped like the
   scalar version. */
static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        // Vertical weights for this row, splatted across all lanes.
        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        // Clamp to the left edge: horizontal weight fixed at zero.
        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        // Four destination pixels per iteration.
        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }
            // k2 now contains 2 interpolated pixels { j4, j5 }
            // k3 now contains 2 interpolated pixels { j6, j7 }

            // Horizontal pass: lN = jEven * (FRAC_ONE - frac_w) + jOdd * frac_w
            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            // shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
            // narrow again
            e0 = vmovn_u16(d0);

            // Shift and narrow
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
            // Narrow again
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            // Store 4 pixels
            vst1q_u32(dst, f0);

            dst += 4;
        }

        // Two remaining destination pixels.
        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
                .      .        .     .      .      .
                j0   dest0      j1    j2   dest1    j3
                .      .        .     .      .      .
                .      .        .     .      .      .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            // Shift and narrow

            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));

            // Narrow again
            e0 = vmovn_u16(d0);

            // Store 2 pixels
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        // Last point (fp_sum_w need not be advanced; it is reset every row)
        if (middle & 0x1) {
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        // Clamp to the right edge: sample the last column pair with full weight.
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        // Skip any padding bytes at the end of the destination row.
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
826#endif
827
828bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
829{
830 bool result = false;
831 int src_w = srcrect->w;
832 int src_h = srcrect->h;
833 int dst_w = dstrect->w;
834 int dst_h = dstrect->h;
835 int src_pitch = s->pitch;
836 int dst_pitch = d->pitch;
837 Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
838 Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);
839
840#ifdef SDL_NEON_INTRINSICS
841 if (!result && hasNEON()) {
842 result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
843 }
844#endif
845
846#ifdef SDL_SSE2_INTRINSICS
847 if (!result && hasSSE2()) {
848 result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
849 }
850#endif
851
852 if (!result) {
853 result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
854 }
855
856 return result;
857}
858
/* Shared prologue for the nearest-neighbor scalers: 16.16 fixed-point
   stepping along both axes, starting half a step in so each destination
   pixel samples the source pixel nearest its center.
   Expects src_w/src_h/dst_w/dst_h/dst_pitch/bpp in scope. */
#define SDL_SCALE_NEAREST__START \
    int i; \
    Uint64 posy, incy; \
    Uint64 posx, incx; \
    Uint64 srcy, srcx; \
    int dst_gap, n; \
    const Uint32 *src_h0; \
    incy = ((Uint64)src_h << 16) / dst_h; \
    incx = ((Uint64)src_w << 16) / dst_w; \
    dst_gap = dst_pitch - bpp * dst_w; \
    posy = incy / 2;

/* Per-row setup: pick the source row for this destination row, then reset
   the horizontal position and the column counter. */
#define SDL_SCALE_NEAREST__HEIGHT \
    srcy = (posy >> 16); \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy; \
    posx = incx / 2; \
    n = dst_w;
877
878static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
879{
880 Uint32 bpp = 1;
881 SDL_SCALE_NEAREST__START
882 for (i = 0; i < dst_h; i++) {
883 SDL_SCALE_NEAREST__HEIGHT
884 while (n--) {
885 const Uint8 *src;
886 srcx = bpp * (posx >> 16);
887 posx += incx;
888 src = (const Uint8 *)src_h0 + srcx;
889 *(Uint8 *)dst = *src;
890 dst = (Uint32 *)((Uint8 *)dst + bpp);
891 }
892 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
893 }
894 return true;
895}
896
897static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
898{
899 Uint32 bpp = 2;
900 SDL_SCALE_NEAREST__START
901 for (i = 0; i < dst_h; i++) {
902 SDL_SCALE_NEAREST__HEIGHT
903 while (n--) {
904 const Uint16 *src;
905 srcx = bpp * (posx >> 16);
906 posx += incx;
907 src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
908 *(Uint16 *)dst = *src;
909 dst = (Uint32 *)((Uint8 *)dst + bpp);
910 }
911 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
912 }
913 return true;
914}
915
916static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
917{
918 Uint32 bpp = 3;
919 SDL_SCALE_NEAREST__START
920 for (i = 0; i < dst_h; i++) {
921 SDL_SCALE_NEAREST__HEIGHT
922 while (n--) {
923 const Uint8 *src;
924 srcx = bpp * (posx >> 16);
925 posx += incx;
926 src = (const Uint8 *)src_h0 + srcx;
927 ((Uint8 *)dst)[0] = src[0];
928 ((Uint8 *)dst)[1] = src[1];
929 ((Uint8 *)dst)[2] = src[2];
930 dst = (Uint32 *)((Uint8 *)dst + bpp);
931 }
932 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
933 }
934 return true;
935}
936
937static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
938{
939 Uint32 bpp = 4;
940 SDL_SCALE_NEAREST__START
941 for (i = 0; i < dst_h; i++) {
942 SDL_SCALE_NEAREST__HEIGHT
943 while (n--) {
944 const Uint32 *src;
945 srcx = bpp * (posx >> 16);
946 posx += incx;
947 src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
948 *dst = *src;
949 dst = (Uint32 *)((Uint8 *)dst + bpp);
950 }
951 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
952 }
953 return true;
954}
955
956bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
957{
958 int src_w = srcrect->w;
959 int src_h = srcrect->h;
960 int dst_w = dstrect->w;
961 int dst_h = dstrect->h;
962 int src_pitch = s->pitch;
963 int dst_pitch = d->pitch;
964 int bpp = SDL_BYTESPERPIXEL(d->format);
965
966 Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
967 Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);
968
969 if (bpp == 4) {
970 return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
971 } else if (bpp == 3) {
972 return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
973 } else if (bpp == 2) {
974 return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
975 } else {
976 return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
977 }
978}