From 30f41c02aec763d32e62351452da9ef582bc3472 Mon Sep 17 00:00:00 2001
From: 3gg <3gg@shellblade.net>
Date: Fri, 6 Mar 2026 13:30:59 -0800
Subject: Move contrib libraries to contrib repo

---
 contrib/SDL-3.2.8/src/video/SDL_stretch.c | 978 ------------------------------
 1 file changed, 978 deletions(-)
 delete mode 100644 contrib/SDL-3.2.8/src/video/SDL_stretch.c

(limited to 'contrib/SDL-3.2.8/src/video/SDL_stretch.c')

diff --git a/contrib/SDL-3.2.8/src/video/SDL_stretch.c b/contrib/SDL-3.2.8/src/video/SDL_stretch.c
deleted file mode 100644
index c893cc3..0000000
--- a/contrib/SDL-3.2.8/src/video/SDL_stretch.c
+++ /dev/null
@@ -1,978 +0,0 @@
-/*
-  Simple DirectMedia Layer
-  Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-#include "SDL_internal.h"
-
-#include "SDL_surface_c.h"
-
-static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
-static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
-
-bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
-{
-    bool result;
-    int src_locked;
-    int dst_locked;
-    SDL_Rect full_src;
-    SDL_Rect full_dst;
-
-    if (!src) {
-        return SDL_InvalidParamError("src");
-    }
-    if (!dst) {
-        return SDL_InvalidParamError("dst");
-    }
-
-    if (src->format != dst->format) {
-        // Slow!
-        SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
-        if (!src_tmp) {
-            return false;
-        }
-        result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
-        SDL_DestroySurface(src_tmp);
-        return result;
-    }
-
-    if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
-        // Slow!
-        if (!dstrect) {
-            full_dst.x = 0;
-            full_dst.y = 0;
-            full_dst.w = dst->w;
-            full_dst.h = dst->h;
-            dstrect = &full_dst;
-        }
-
-        SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
-        SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
-        if (src_tmp && dst_tmp) {
-            result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
-            if (result) {
-                result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
-                            dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
-                            dst_tmp->pixels, dst_tmp->pitch,
-                            dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
-                            (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
-            }
-        } else {
-            result = false;
-        }
-        SDL_DestroySurface(src_tmp);
-        SDL_DestroySurface(dst_tmp);
-        return result;
-    }
-
-    if (scaleMode != SDL_SCALEMODE_NEAREST && scaleMode != SDL_SCALEMODE_LINEAR) {
-        return SDL_InvalidParamError("scaleMode");
-    }
-
-    if (scaleMode != SDL_SCALEMODE_NEAREST) {
-        scaleMode = SDL_SCALEMODE_LINEAR;
-    }
-
-    if (scaleMode == SDL_SCALEMODE_LINEAR) {
-        if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
-            return SDL_SetError("Wrong format");
-        }
-    }
-
-    // Verify the blit rectangles
-    if (srcrect) {
-        if ((srcrect->x < 0) || (srcrect->y < 0) ||
-            ((srcrect->x + srcrect->w) > src->w) ||
-            ((srcrect->y + srcrect->h) > src->h)) {
-            return SDL_SetError("Invalid source blit rectangle");
-        }
-    } else {
-        full_src.x = 0;
-        full_src.y = 0;
-        full_src.w = src->w;
-        full_src.h = src->h;
-        srcrect = &full_src;
-    }
-    if (dstrect) {
-        if ((dstrect->x < 0) || (dstrect->y < 0) ||
-            ((dstrect->x + dstrect->w) > dst->w) ||
-            ((dstrect->y + dstrect->h) > dst->h)) {
-            return SDL_SetError("Invalid destination blit rectangle");
-        }
-    } else {
-        full_dst.x = 0;
-        full_dst.y = 0;
-        full_dst.w = dst->w;
-        full_dst.h = dst->h;
-        dstrect = &full_dst;
-    }
-
-    if (dstrect->w <= 0 || dstrect->h <= 0) {
-        return true;
-    }
-
-    if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
-        dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
-        return SDL_SetError("Size too large for scaling");
-    }
-
-    // Lock the destination if it's in hardware
-    dst_locked = 0;
-    if (SDL_MUSTLOCK(dst)) {
-        if (!SDL_LockSurface(dst)) {
-            return SDL_SetError("Unable to lock destination surface");
-        }
-        dst_locked = 1;
-    }
-    // Lock the source if it's in hardware
-    src_locked = 0;
-    if (SDL_MUSTLOCK(src)) {
-        if (!SDL_LockSurface(src)) {
-            if (dst_locked) {
-                SDL_UnlockSurface(dst);
-            }
-            return SDL_SetError("Unable to lock source surface");
-        }
-        src_locked = 1;
-    }
-
-    if (scaleMode == SDL_SCALEMODE_NEAREST) {
-        result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
-    } else {
-        result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
-    }
-
-    // We need to unlock the surfaces if they're locked
-    if (dst_locked) {
-        SDL_UnlockSurface(dst);
-    }
-    if (src_locked) {
-        SDL_UnlockSurface(src);
-    }
-
-    return result;
-}
-
-/* bilinear interpolation precision must be < 8
-   Because with SSE: add-multiply: _mm_madd_epi16 works with signed int
-   so pixels 0xb1...... are negatives and false the result
-   same in NEON probably */
-#define PRECISION 7
-
-#define FIXED_POINT(i) ((Uint32)(i) << 16)
-#define SRC_INDEX(fp)  ((Uint32)(fp) >> 16)
-#define INTEGER(fp)    ((Uint32)(fp) >> PRECISION)
-#define FRAC(fp)       ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
-#define FRAC_ZERO      0
-#define FRAC_ONE       (1 << PRECISION)
-#define FP_ONE         FIXED_POINT(1)
-
-#define BILINEAR___START                                                              \
-    int i;                                                                            \
-    Sint64 fp_sum_h;                                                                  \
-    int fp_step_h, left_pad_h, right_pad_h;                                           \
-    Sint64 fp_sum_w;                                                                  \
-    int fp_step_w, left_pad_w, right_pad_w;                                           \
-    Sint64 fp_sum_w_init;                                                             \
-    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init;                      \
-    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
-    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
-    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w;                                \
-    left_pad_w_init = left_pad_w;                                                     \
-    right_pad_w_init = right_pad_w;                                                   \
-    dst_gap = dst_pitch - 4 * dst_w;                                                  \
-    middle_init = dst_w - left_pad_w - right_pad_w;
-
-#define BILINEAR___HEIGHT                                              \
-    int index_h, frac_h0, frac_h1, middle;                             \
-    const Uint32 *src_h0, *src_h1;                                     \
-    int no_padding;                                                    \
-    Uint64 incr_h0, incr_h1;                                           \
-                                                                       \
-    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h);     \
-    index_h = SRC_INDEX(fp_sum_h);                                     \
-    frac_h0 = FRAC(fp_sum_h);                                          \
-                                                                       \
-    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1); \
-    frac_h0 = no_padding ? frac_h0 : 0;                                \
-    incr_h1 = no_padding ? src_pitch : 0;                              \
-    incr_h0 = (Uint64)index_h * src_pitch;                             \
-                                                                       \
-    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0);           \
-    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1);        \
-                                                                       \
-    fp_sum_h += fp_step_h;                                             \
-                                                                       \
-    frac_h1 = FRAC_ONE - frac_h0;                                      \
-    fp_sum_w = fp_sum_w_init;                                          \
-    right_pad_w = right_pad_w_init;                                    \
-    left_pad_w = left_pad_w_init;                                      \
-    middle = middle_init;
-
-#ifdef __clang__
-// Remove inlining of this function
-// Compiler crash with clang 9.0.8 / android-ndk-r21d
-// Compiler crash with clang 11.0.3 / Xcode
-// OK with clang 11.0.5 / android-ndk-22
-// OK with clang 12.0.0 / Xcode
-__attribute__((noinline))
-#endif
-static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
-{
-
-    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
-    int x0 = FP_ONE / 2;                       // dst first pixel center at 0.5 in fixed point
-    Sint64 fp_sum;
-    int i;
-#if 0
-    // scale to source coordinates
-    x0 *= src_nb;
-    x0 /= dst_nb; // x0 == step / 2
-#else
-    // Use this code for perfect match with pixman
-    Sint64 tmp[2];
-    tmp[0] = (Sint64)step * (x0 >> 16);
-    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
-    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
-#endif
-    // -= 0.5, get back the pixel origin, in source coordinates
-    x0 -= FP_ONE / 2;
-
-    *fp_start = x0;
-    *fp_step = step;
-    *left_pad = 0;
-    *right_pad = 0;
-
-    fp_sum = x0;
-    for (i = 0; i < dst_nb; i++) {
-        if (fp_sum < 0) {
-            *left_pad += 1;
-        } else {
-            int index = SRC_INDEX(fp_sum);
-            if (index > src_nb - 2) {
-                *right_pad += 1;
-            }
-        }
-        fp_sum += step;
-    }
-    //    SDL_Log("%d -> %d  x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
-}
-
-typedef struct color_t
-{
-    Uint8 a;
-    Uint8 b;
-    Uint8 c;
-    Uint8 d;
-} color_t;
-
-#if 0
-static void printf_64(const char *str, void *var)
-{
-    uint8_t *val = (uint8_t*) var;
-    printf(" *   %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
-           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
-}
-#endif
-
-/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
-
-static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
-{
-    const color_t *c0 = (const color_t *)src_x0;
-    const color_t *c1 = (const color_t *)src_x1;
-    color_t *cx = (color_t *)dst;
-#if 0
-    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
-    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
-    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
-    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
-#else
-    cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
-    cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
-    cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
-    cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
-#endif
-}
-
-static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
-{
-    Uint32 tmp[2];
-    unsigned int frac_w1 = FRAC_ONE - frac_w0;
-
-    // Vertical first, store to 'tmp'
-    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
-    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);
-
-    // Horizontal, store to 'dst'
-    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
-}
-
-static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    BILINEAR___START
-
-    for (i = 0; i < dst_h; i++) {
-
-        BILINEAR___HEIGHT
-
-        while (left_pad_w--) {
-            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
-            dst += 1;
-        }
-
-        while (middle--) {
-            const Uint32 *s_00_01;
-            const Uint32 *s_10_11;
-            int index_w = 4 * SRC_INDEX(fp_sum_w);
-            int frac_w = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-
-            /*
-                        x00 ... x0_ ..... x01
-                        .       .         .
-                        .       x         .
-                        .       .         .
-                        .       .         .
-                        x10 ... x1_ ..... x11
-            */
-            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-
-            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);
-
-            dst += 1;
-        }
-
-        while (right_pad_w--) {
-            int index_w = 4 * (src_w - 2);
-            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
-            dst += 1;
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-
-#ifdef SDL_NEON_INTRINSICS
-#define CAST_uint8x8_t       (uint8x8_t)
-#define CAST_uint32x2_t      (uint32x2_t)
-#endif
-
-#if defined(_MSC_VER)
-#ifdef SDL_NEON_INTRINSICS
-#undef CAST_uint8x8_t
-#undef CAST_uint32x2_t
-#define CAST_uint8x8_t
-#define CAST_uint32x2_t
-#endif
-#endif
-
-#ifdef SDL_SSE2_INTRINSICS
-
-#if 0
-static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
-{
-    uint16_t *val = (uint16_t*) &var;
-    printf(" *   %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
-           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
-}
-#endif
-
-static SDL_INLINE int hasSSE2(void)
-{
-    static int val = -1;
-    if (val != -1) {
-        return val;
-    }
-    val = SDL_HasSSE2();
-    return val;
-}
-
-static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
-{
-    __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
-    __m128i v_frac_w0, k0, l0, d0, e0;
-
-    int f, f2;
-    f = frac_w;
-    f2 = FRAC_ONE - frac_w;
-    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);
-
-    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
-    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);
-
-    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
-
-    // Interpolation vertical
-    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
-    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
-    k0 = _mm_add_epi16(k0, l0);
-
-    // For perfect match, clear the factionnal part eventually.
-    /*
-    k0 = _mm_srli_epi16(k0, PRECISION);
-    k0 = _mm_slli_epi16(k0, PRECISION);
-    */
-
-    // Interpolation horizontal
-    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
-    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
-
-    // Store 1 pixel
-    d0 = _mm_srli_epi32(k0, PRECISION * 2);
-    e0 = _mm_packs_epi32(d0, d0);
-    e0 = _mm_packus_epi16(e0, e0);
-    *dst = _mm_cvtsi128_si32(e0);
-}
-
-static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    BILINEAR___START
-
-    for (i = 0; i < dst_h; i++) {
-        int nb_block2;
-        __m128i v_frac_h0;
-        __m128i v_frac_h1;
-        __m128i zero;
-
-        BILINEAR___HEIGHT
-
-        nb_block2 = middle / 2;
-
-        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
-        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
-        zero = _mm_setzero_si128();
-
-        while (left_pad_w--) {
-            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
-            dst += 1;
-        }
-
-        while (nb_block2--) {
-            int index_w_0, frac_w_0;
-            int index_w_1, frac_w_1;
-
-            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;
-
-            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
-            __m128i v_frac_w0, k0, l0, d0, e0;
-            __m128i v_frac_w1, k1, l1, d1, e1;
-
-            int f, f2;
-            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_0 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_1 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            /*
-                        x00............ x01   x02...........x03
-                        .      .         .     .       .     .
-                        j0     f0        j1    j2      f1    j3
-                        .      .         .     .       .     .
-                        .      .         .     .       .     .
-                        .      .         .     .       .     .
-                        x10............ x11   x12...........x13
-             */
-            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
-            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
-            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
-            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
-
-            f = frac_w_0;
-            f2 = FRAC_ONE - frac_w_0;
-            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);
-
-            f = frac_w_1;
-            f2 = FRAC_ONE - frac_w_1;
-            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);
-
-            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
-            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
-            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
-            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);
-
-            // Interpolation vertical
-            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
-            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
-            k0 = _mm_add_epi16(k0, l0);
-            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
-            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
-            k1 = _mm_add_epi16(k1, l1);
-
-            // Interpolation horizontal
-            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
-            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
-            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
-            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);
-
-            // Store 1 pixel
-            d0 = _mm_srli_epi32(k0, PRECISION * 2);
-            e0 = _mm_packs_epi32(d0, d0);
-            e0 = _mm_packus_epi16(e0, e0);
-            *dst++ = _mm_cvtsi128_si32(e0);
-
-            // Store 1 pixel
-            d1 = _mm_srli_epi32(k1, PRECISION * 2);
-            e1 = _mm_packs_epi32(d1, d1);
-            e1 = _mm_packus_epi16(e1, e1);
-            *dst++ = _mm_cvtsi128_si32(e1);
-        }
-
-        // Last point
-        if (middle & 0x1) {
-            const Uint32 *s_00_01;
-            const Uint32 *s_10_11;
-            int index_w = 4 * SRC_INDEX(fp_sum_w);
-            int frac_w = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
-            dst += 1;
-        }
-
-        while (right_pad_w--) {
-            int index_w = 4 * (src_w - 2);
-            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
-            dst += 1;
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-#endif
-
-#ifdef SDL_NEON_INTRINSICS
-
-static SDL_INLINE int hasNEON(void)
-{
-    static int val = -1;
-    if (val != -1) {
-        return val;
-    }
-    val = SDL_HasNEON();
-    return val;
-}
-
-static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
-{
-    uint8x8_t x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
-    uint16x8_t k0;
-    uint32x4_t l0;
-    uint16x8_t d0;
-    uint8x8_t e0;
-
-    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
-    x_10_11 = CAST_uint8x8_t vld1_u32(s1);
-
-    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
-    k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac)    */
-    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac          */
-
-    // k0 now contains 2 interpolated pixels { j0, j1 }
-    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
-    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
-    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);
-
-    // Shift and narrow
-    d0 = vcombine_u16(
-        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
-        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));
-
-    // Narrow again
-    e0 = vmovn_u16(d0);
-
-    // Store 1 pixel
-    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
-}
-
-static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    BILINEAR___START
-
-    for (i = 0; i < dst_h; i++) {
-        int nb_block4;
-        uint8x8_t v_frac_h0, v_frac_h1;
-
-        BILINEAR___HEIGHT
-
-        nb_block4 = middle / 4;
-
-        v_frac_h0 = vmov_n_u8(frac_h0);
-        v_frac_h1 = vmov_n_u8(frac_h1);
-
-        while (left_pad_w--) {
-            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
-            dst += 1;
-        }
-
-        while (nb_block4--) {
-            int index_w_0, frac_w_0;
-            int index_w_1, frac_w_1;
-            int index_w_2, frac_w_2;
-            int index_w_3, frac_w_3;
-
-            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
-            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;
-
-            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
-            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;
-
-            uint16x8_t k0, k1, k2, k3;
-            uint32x4_t l0, l1, l2, l3;
-            uint16x8_t d0, d1;
-            uint8x8_t e0, e1;
-            uint32x4_t f0;
-
-            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_0 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_1 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_2 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_3 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-
-            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
-            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
-            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
-            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
-            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
-            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
-            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
-            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);
-
-            // Interpolation vertical
-            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
-            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
-            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
-            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
-            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
-            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
-            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
-            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);
-
-            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
-            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac)    */
-            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac          */
-
-            k1 = vmull_u8(x_02_03, v_frac_h1);
-            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);
-
-            k2 = vmull_u8(x_04_05, v_frac_h1);
-            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);
-
-            k3 = vmull_u8(x_06_07, v_frac_h1);
-            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);
-
-            // k0 now contains 2 interpolated pixels { j0, j1 }
-            // k1 now contains 2 interpolated pixels { j2, j3 }
-            // k2 now contains 2 interpolated pixels { j4, j5 }
-            // k3 now contains 2 interpolated pixels { j6, j7 }
-
-            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
-            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
-            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);
-
-            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
-            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
-            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);
-
-            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
-            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
-            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);
-
-            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
-            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
-            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);
-
-            // shift and narrow
-            d0 = vcombine_u16(
-                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
-                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
-            // narrow again
-            e0 = vmovn_u16(d0);
-
-            // Shift and narrow
-            d1 = vcombine_u16(
-                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
-                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
-            // Narrow again
-            e1 = vmovn_u16(d1);
-
-            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
-            // Store 4 pixels
-            vst1q_u32(dst, f0);
-
-            dst += 4;
-        }
-
-        if (middle & 0x2) {
-            int index_w_0, frac_w_0;
-            int index_w_1, frac_w_1;
-            const Uint32 *s_00_01, *s_02_03;
-            const Uint32 *s_10_11, *s_12_13;
-            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
-            uint16x8_t k0, k1;
-            uint32x4_t l0, l1;
-            uint16x8_t d0;
-            uint8x8_t e0;
-
-            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_0 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
-            frac_w_1 = FRAC(fp_sum_w);
-            fp_sum_w += fp_step_w;
-            /*
-                        x00............ x01   x02...........x03
-                        .      .         .     .       .     .
-                        j0   dest0       j1    j2    dest1   j3
-                        .      .         .     .       .     .
-                        .      .         .     .       .     .
-                        .      .         .     .       .     .
-                        x10............ x11   x12...........x13
-            */
-            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
-            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
-            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
-            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
-
-            // Interpolation vertical
-            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
-            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
-            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
-            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
-
-            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
-            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac)    */
-            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac          */
-
-            k1 = vmull_u8(x_02_03, v_frac_h1);
-            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);
-
-            // k0 now contains 2 interpolated pixels { j0, j1 }
-            // k1 now contains 2 interpolated pixels { j2, j3 }
-
-            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
-            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
-            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);
-
-            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
-            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
-            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);
-
-            // Shift and narrow
-
-            d0 = vcombine_u16(
-                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
-                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
-
-            // Narrow again
-            e0 = vmovn_u16(d0);
-
-            // Store 2 pixels
-            vst1_u32(dst, CAST_uint32x2_t e0);
-            dst += 2;
-        }
-
-        // Last point
-        if (middle & 0x1) {
-            int index_w = 4 * SRC_INDEX(fp_sum_w);
-            int frac_w = FRAC(fp_sum_w);
-            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
-            dst += 1;
-        }
-
-        while (right_pad_w--) {
-            int index_w = 4 * (src_w - 2);
-            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
-            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
-            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
-            dst += 1;
-        }
-
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-#endif
-
-bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
-{
-    bool result = false;
-    int src_w = srcrect->w;
-    int src_h = srcrect->h;
-    int dst_w = dstrect->w;
-    int dst_h = dstrect->h;
-    int src_pitch = s->pitch;
-    int dst_pitch = d->pitch;
-    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
-    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);
-
-#ifdef SDL_NEON_INTRINSICS
-    if (!result && hasNEON()) {
-        result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    }
-#endif
-
-#ifdef SDL_SSE2_INTRINSICS
-    if (!result && hasSSE2()) {
-        result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    }
-#endif
-
-    if (!result) {
-        result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    }
-
-    return result;
-}
-
-#define SDL_SCALE_NEAREST__START          \
-    int i;                                \
-    Uint64 posy, incy;                    \
-    Uint64 posx, incx;                    \
-    Uint64 srcy, srcx;                    \
-    int dst_gap, n;                       \
-    const Uint32 *src_h0;                 \
-    incy = ((Uint64)src_h << 16) / dst_h; \
-    incx = ((Uint64)src_w << 16) / dst_w; \
-    dst_gap = dst_pitch - bpp * dst_w;    \
-    posy = incy / 2;
-
-#define SDL_SCALE_NEAREST__HEIGHT                                         \
-    srcy = (posy >> 16);                                                  \
-    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
-    posy += incy;                                                         \
-    posx = incx / 2;                                                      \
-    n = dst_w;
-
-static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    Uint32 bpp = 1;
-    SDL_SCALE_NEAREST__START
-    for (i = 0; i < dst_h; i++) {
-        SDL_SCALE_NEAREST__HEIGHT
-        while (n--) {
-            const Uint8 *src;
-            srcx = bpp * (posx >> 16);
-            posx += incx;
-            src = (const Uint8 *)src_h0 + srcx;
-            *(Uint8 *)dst = *src;
-            dst = (Uint32 *)((Uint8 *)dst + bpp);
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-
-static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    Uint32 bpp = 2;
-    SDL_SCALE_NEAREST__START
-    for (i = 0; i < dst_h; i++) {
-        SDL_SCALE_NEAREST__HEIGHT
-        while (n--) {
-            const Uint16 *src;
-            srcx = bpp * (posx >> 16);
-            posx += incx;
-            src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
-            *(Uint16 *)dst = *src;
-            dst = (Uint32 *)((Uint8 *)dst + bpp);
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-
-static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    Uint32 bpp = 3;
-    SDL_SCALE_NEAREST__START
-    for (i = 0; i < dst_h; i++) {
-        SDL_SCALE_NEAREST__HEIGHT
-        while (n--) {
-            const Uint8 *src;
-            srcx = bpp * (posx >> 16);
-            posx += incx;
-            src = (const Uint8 *)src_h0 + srcx;
-            ((Uint8 *)dst)[0] = src[0];
-            ((Uint8 *)dst)[1] = src[1];
-            ((Uint8 *)dst)[2] = src[2];
-            dst = (Uint32 *)((Uint8 *)dst + bpp);
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-
-static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
-{
-    Uint32 bpp = 4;
-    SDL_SCALE_NEAREST__START
-    for (i = 0; i < dst_h; i++) {
-        SDL_SCALE_NEAREST__HEIGHT
-        while (n--) {
-            const Uint32 *src;
-            srcx = bpp * (posx >> 16);
-            posx += incx;
-            src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
-            *dst = *src;
-            dst = (Uint32 *)((Uint8 *)dst + bpp);
-        }
-        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
-    }
-    return true;
-}
-
-bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
-{
-    int src_w = srcrect->w;
-    int src_h = srcrect->h;
-    int dst_w = dstrect->w;
-    int dst_h = dstrect->h;
-    int src_pitch = s->pitch;
-    int dst_pitch = d->pitch;
-    int bpp = SDL_BYTESPERPIXEL(d->format);
-
-    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
-    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);
-
-    if (bpp == 4) {
-        return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    } else if (bpp == 3) {
-        return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    } else if (bpp == 2) {
-        return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    } else {
-        return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
-    }
-}
-- 
cgit v1.2.3