#include "SDL_internal.h"
#ifdef SDL_HAVE_BLIT_A
#include "SDL_pixels_c.h"
#include "SDL_surface_c.h"
// Blits an N-byte-per-pixel RGB source onto an 8-bit paletted destination,
// blending every pixel with the single surface alpha value info->a.
//
// For each pixel the current destination index is expanded through the
// destination palette, blended with the source color, and packed back into a
// 3-3-2 (R-G-B) bit value. When info->table is non-NULL that value is
// translated through the table before being stored; otherwise it is stored
// directly.
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_Color *dstpal = info->dst_pal->colors;
    Uint8 *palmap = info->table;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    const unsigned A = info->a;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB;
    unsigned dR, dG, dB;

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            const SDL_Color *under = &dstpal[*dst];
            unsigned packed;

            DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
            dR = under->r;
            dG = under->g;
            dB = under->b;
            ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);

            // Take the top 3 bits of red and green and the top 2 bits of
            // blue from the low byte of each blended channel.
            packed = (dR & 0xe0) | ((dG & 0xe0) >> 3) | ((dB & 0xc0) >> 6);
            *dst = palmap ? palmap[packed] : (Uint8)packed;

            dst++;
            src += srcbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// Blits an N-byte-per-pixel RGBA source onto an 8-bit paletted destination,
// blending each pixel with its own source alpha.
//
// The destination index is expanded through the destination palette, blended
// with the source color, and packed back into a 3-3-2 (R-G-B) bit value,
// which is optionally translated through info->table.
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_Color *dstpal = info->dst_pal->colors;
    Uint8 *palmap = info->table;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB, sA;
    unsigned dR, dG, dB;

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            const SDL_Color *under = &dstpal[*dst];
            unsigned packed;

            DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
            dR = under->r;
            dG = under->g;
            dB = under->b;
            ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);

            // Take the top 3 bits of red and green and the top 2 bits of
            // blue from the low byte of each blended channel.
            packed = (dR & 0xe0) | ((dG & 0xe0) >> 3) | ((dB & 0xc0) >> 6);
            *dst = palmap ? palmap[packed] : (Uint8)packed;

            dst++;
            src += srcbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// Blits an N-byte-per-pixel RGB source onto an 8-bit paletted destination
// with a constant surface alpha (info->a) and a source color key.
//
// Source pixels whose raw value equals info->colorkey leave the destination
// untouched. Every other pixel is blended against the palette color of the
// current destination index and packed back into a 3-3-2 (R-G-B) bit value,
// optionally translated through info->table.
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_Color *dstpal = info->dst_pal->colors;
    Uint8 *palmap = info->table;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    const Uint32 ckey = info->colorkey;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    const unsigned A = info->a;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB;
    unsigned dR, dG, dB;

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
            if ( Pixel != ckey ) {
                const SDL_Color *under = &dstpal[*dst];
                unsigned packed;

                dR = under->r;
                dG = under->g;
                dB = under->b;
                ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);

                // Take the top 3 bits of red and green and the top 2 bits
                // of blue from the low byte of each blended channel.
                packed = (dR & 0xe0) | ((dG & 0xe0) >> 3) | ((dB & 0xc0) >> 6);
                *dst = palmap ? palmap[packed] : (Uint8)packed;
            }
            dst++;
            src += srcbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
#ifdef SDL_SSE2_INTRINSICS
// SSE2 blit of a 32-bit RGB surface onto a 32-bit destination using the
// constant surface alpha info->a. Four pixels (16 bytes) are blended per
// vector iteration; the remaining 0-3 pixels of each row are blended one at a
// time with FACTOR_BLEND_8888. The top byte of every written pixel is forced
// to 0xff.
//
// info->src_skip / info->dst_skip are added (in bytes) after every row.
static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    Uint8 alpha = info->a;

    const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000);
    const __m128i srcA = _mm_set1_epi16(alpha); // alpha replicated into every 16-bit lane

    while (height--) {
        int i = 0;

        for (; i + 4 <= width; i += 4) {
            // Load 4 source and 4 destination pixels
            __m128i src128 = _mm_loadu_si128((__m128i *)src);
            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);

            // Widen every byte to a 16-bit lane (low and high halves)
            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());

            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());

            // dst = ((src - dst) * alpha) + ((dst << 8) - dst)
            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
                                   _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
                                   _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));

            // dst += 1
            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));

            // dst = (dst + (dst >> 8)) >> 8, i.e. an approximate divide by 255
            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);

            // Narrow back to bytes and force the top byte of each pixel to 0xff
            dst128 = _mm_packus_epi16(dst_lo, dst_hi);
            dst128 = _mm_or_si128(dst128, alpha_fill_mask);

            _mm_storeu_si128((__m128i *)dst, dst128);

            src += 16;
            dst += 16;
        }

        // Scalar tail for the last width % 4 pixels of the row
        for (; i < width; ++i) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;

            FACTOR_BLEND_8888(src32, dst32, alpha);

            // dst is a byte pointer: store the whole 32-bit pixel. Writing
            // through *dst would only update a single byte of the result.
            *(Uint32 *)dst = dst32 | 0xff000000;

            src += 4;
            dst += 4;
        }

        src += srcskip;
        dst += dstskip;
    }
}
#endif
#ifdef SDL_LSX_INTRINSICS
// LSX version of the 8888 -> 8888 per-pixel-alpha blit that also reorders
// channels between the source and destination layouts. Four pixels (16 bytes)
// are handled per vector iteration; the rest of each row goes through the
// scalar ALPHA_BLEND_SWIZZLE_8888 macro. When the destination format has no
// alpha mask, the bits reported by SDL_Get8888AlphaMaskAndShift are OR-ed
// into every written pixel.
static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
Uint8 *src = info->src;
// src_skip / dst_skip are added to the byte pointers after every row
int srcskip = info->src_skip;
Uint8 *dst = info->dst;
int dstskip = info->dst_skip;
const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
// true when the destination format has no alpha channel
bool fill_alpha = !dstfmt->Amask;
Uint32 dstAmask, dstAshift;
// Base byte index of each of the four pixels, repeated for every byte of the pixel
const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00);
const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128);
// 32641 == 1 + 128 * 255
const __m128i const_32641 = __lsx_vreplgr2vr_h(32641);
const __m128i const_257 = __lsx_vreplgr2vr_h(257);
const __m128i mask_offsets = __lsx_vld(offsets, 0);
// Shuffle control for the swizzle: at each destination channel's bit position
// it holds the byte index (shift >> 3) of that channel in the source pixel,
// plus the per-pixel base offset.
const __m128i convert_mask = __lsx_vadd_w(
__lsx_vreplgr2vr_w(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);
// Shuffle control selecting each pixel's source alpha byte for all four of its bytes
const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets);
const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask);
while (height--) {
int i = 0;
for (; i + 4 <= width; i += 4) {
// Load 4 source and 4 destination pixels
__m128i src128 = __lsx_vld(src, 0);
__m128i dst128 = __lsx_vld(dst, 0);
// Extract the source alpha, then rearrange the source channels
__m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask);
src128 = __lsx_vshuf_b(src128, src128, convert_mask);
// Set the destination-alpha bits in the rearranged source pixels
src128 = __lsx_vor_v(src128, alpha_fill_mask);
// Interleave alpha with itself, then flip the upper byte of every 16-bit
// lane so each lane holds the byte pair (A, A ^ 0xff) = (A, 255 - A)
__m128i srca_lo = __lsx_vilvl_b(srcA, srcA);
__m128i srca_hi = __lsx_vilvh_b(srcA, srcA);
srca_lo = __lsx_vxor_v(srca_lo, const_0xff00);
srca_hi = __lsx_vxor_v(srca_hi, const_0xff00);
// Bias both inputs by -128 before the multiplies below
src128 = __lsx_vsub_b(src128, const_128);
dst128 = __lsx_vsub_b(dst128, const_128);
// NOTE(review): the even/odd widening multiplies summed with a saturating
// add presumably compute A * (src - 128) + (255 - A) * (dst - 128) per
// channel, mirroring the maddubs step of the SSE4.1 variant - confirm
// against the LSX intrinsic reference.
__m128i tmp = __lsx_vilvl_b(dst128, src128);
__m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp));
tmp = __lsx_vilvh_b(dst128, src128);
__m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp));
// Add 1 + 128 * 255 (presumably the rounding term plus removal of the -128 bias)
dst_lo = __lsx_vadd_h(dst_lo, const_32641);
dst_hi = __lsx_vadd_h(dst_hi, const_32641);
// Multiply by 257 and keep the high half (presumably the approximate divide by 255)
dst_lo = __lsx_vmuh_hu(dst_lo, const_257);
dst_hi = __lsx_vmuh_hu(dst_hi, const_257);
// Narrow the 16-bit results back to bytes
dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0);
if (fill_alpha) {
dst128 = __lsx_vor_v(dst128, alpha_fill_mask);
}
__lsx_vst(dst128, dst, 0);
src += 16;
dst += 16;
}
// Scalar tail for the last width % 4 pixels of the row
for (; i < width; ++i) {
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;
ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
if (fill_alpha) {
dst32 |= dstAmask;
}
*(Uint32 *)dst = dst32;
src += 4;
dst += 4;
}
src += srcskip;
dst += dstskip;
}
}
#endif
// 50% blend of one 32-bit RGB surface onto another with the same layout.
//
// Each pixel is averaged channel-by-channel without unpacking: the lowest bit
// of every color byte is masked off before the add (so the >> 1 cannot leak
// across bytes), and the bit that both inputs share in that position is added
// back afterwards. The top byte of every result is forced to 0xff.
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
{
    Uint32 *srcp = (Uint32 *)info->src;
    Uint32 *dstp = (Uint32 *)info->dst;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip >> 2; // byte skip converted to 32-bit pixels
    const int dst_row_pad = info->dst_skip >> 2;
    int rows_left = info->dst_h;

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            const Uint32 src_px = *srcp;
            const Uint32 dst_px = *dstp;
            const Uint32 half_sum = ((src_px & 0x00fefefe) + (dst_px & 0x00fefefe)) >> 1;
            const Uint32 shared_low_bits = src_px & dst_px & 0x00010101;

            *dstp = (half_sum + shared_low_bits) | 0xff000000;
            ++srcp;
            ++dstp;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// Blends one 32-bit RGB surface onto another with the same layout using the
// constant surface alpha info->a. An alpha of exactly 128 is delegated to
// BlitRGBtoRGBSurfaceAlpha128; every other value goes through
// FACTOR_BLEND_8888. The top byte of every written pixel is forced to 0xff.
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    Uint32 *srcp;
    Uint32 *dstp;
    int src_row_pad;
    int dst_row_pad;
    int row_width;
    int rows_left;
    Uint32 s;
    Uint32 d;

    if (alpha == 128) {
        BlitRGBtoRGBSurfaceAlpha128(info);
        return;
    }

    srcp = (Uint32 *)info->src;
    dstp = (Uint32 *)info->dst;
    src_row_pad = info->src_skip >> 2; // byte skip converted to 32-bit pixels
    dst_row_pad = info->dst_skip >> 2;
    row_width = info->dst_w;
    rows_left = info->dst_h;

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            s = *srcp++;
            d = *dstp;
            FACTOR_BLEND_8888(s, d, alpha);
            *dstp++ = d | 0xff000000;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// 50% blend of two 16-bit pixels. `mask` selects every bit except the lowest
// bit of each color channel (this file passes 0xf7de for 565 and 0xfbde for
// 555): the masked values are summed and halved, then the low bits that both
// inputs have in common are added back.
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

// The same 50% blend applied to two 16-bit pixels packed in one 32-bit word.
// The mask is widened to Uint32 before it is shifted: a 16-bit mask promotes
// to int, and `mask << 16` then overflows a signed int (undefined behavior)
// whenever bit 15 of the mask is set.
#define BLEND2x16_50(d, s, mask)                                      \
    ((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +       \
     (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +       \
     ((s) & (d) & (~((Uint32)(mask) | ((Uint32)(mask) << 16)))))
// 50% blend between two 16-bit surfaces that share a pixel layout.
// `mask` is handed to BLEND16_50 / BLEND2x16_50; callers in this file pass
// 0xf7de for 565 and 0xfbde for 555.
//
// Pixels are processed two at a time through 32-bit loads and stores. Each
// row picks one of two paths depending on whether the source and destination
// pointers agree in bit 1 of their addresses (i.e. share 32-bit alignment).
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
int width = info->dst_w;
int height = info->dst_h;
Uint16 *srcp = (Uint16 *)info->src;
// byte skips converted to 16-bit pixels
int srcskip = info->src_skip >> 1;
Uint16 *dstp = (Uint16 *)info->dst;
int dstskip = info->dst_skip >> 1;
while (height--) {
if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
// Source and destination differ in 32-bit alignment: read aligned source
// words and stitch each pair of pixels together from two consecutive words.
Uint32 prev_sw;
int w = width;
// Blend one leading pixel if the destination is not 32-bit aligned
if ((uintptr_t)dstp & 2) {
Uint16 d = *dstp, s = *srcp;
*dstp = BLEND16_50(d, s, mask);
dstp++;
srcp++;
w--;
}
// Step the source one pixel ahead (assuming 2-byte aligned pointers, this
// makes it 32-bit aligned) and prime the pipeline with the word that ends
// at the new position.
srcp++;
prev_sw = ((Uint32 *)srcp)[-1];
while (w > 1) {
Uint32 sw, dw, s;
sw = *(Uint32 *)srcp;
dw = *(Uint32 *)dstp;
// Combine one half of the previous word with one half of the current word;
// which halves depends on the byte order.
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
s = (prev_sw << 16) + (sw >> 16);
#else
s = (prev_sw >> 16) + (sw << 16);
#endif
prev_sw = sw;
*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
dstp += 2;
srcp += 2;
w -= 2;
}
// Trailing pixel, if any: its source value is still held in prev_sw
if (w) {
Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
s = (Uint16)prev_sw;
#else
s = (Uint16)(prev_sw >> 16);
#endif
*dstp = BLEND16_50(d, s, mask);
srcp++;
dstp++;
}
// srcp ran one pixel ahead (the extra srcp++ above), hence the -1
srcp += srcskip - 1;
dstp += dstskip;
} else {
// Source and destination share 32-bit alignment
int w = width;
// Blend one leading pixel if the pointers are not 32-bit aligned
if ((uintptr_t)srcp & 2) {
Uint16 d = *dstp, s = *srcp;
*dstp = BLEND16_50(d, s, mask);
srcp++;
dstp++;
w--;
}
// Two pixels per iteration
while (w > 1) {
Uint32 sw = *(Uint32 *)srcp;
Uint32 dw = *(Uint32 *)dstp;
*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
srcp += 2;
dstp += 2;
w -= 2;
}
// Trailing pixel, if any
if (w) {
Uint16 d = *dstp, s = *srcp;
*dstp = BLEND16_50(d, s, mask);
srcp++;
dstp++;
}
srcp += srcskip;
dstp += dstskip;
}
}
}
#ifdef SDL_MMX_INTRINSICS
// MMX blit between two RGB565 surfaces using the constant surface alpha
// info->a. An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128.
//
// Otherwise DUFFS_LOOP_124 is given three bodies that process one, two and
// four pixels. The one- and two-pixel bodies use the scalar "spread the
// channels over 32 bits" blend; the four-pixel body blends each channel of
// four pixels at once with MMX.
static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // byte skip converted to 16-bit pixels
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        // NOTE(review): the 5-bit reduction of alpha below only happens when
        // USE_DUFFS_LOOP is defined, yet the scalar bodies always shift by 5.
        // Confirm USE_DUFFS_LOOP is always defined for this file.
#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);             // drop the low 3 bits so MMX and scalar paths agree
        mm_alpha = _mm_set_pi32(0, alpha);
        alpha >>= 3;                       // 5-bit alpha for the scalar bodies

        // Replicate alpha into all four 16-bit lanes
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
        // Pre-shift alpha so the same constant works with both mullo (red and
        // blue, followed by >> 11) and mulhi (green, which stays in place)
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // 565 channel masks (red is isolated by shifting instead)
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);
#endif
        while (height--) {
            DUFFS_LOOP_124(
            {
                // one pixel
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // two pixels
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // four pixels
                src1 = *(__m64 *)srcp;
                dst1 = *(__m64 *)dstp;

                // red: move to the low bits, blend, move back
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_slli_pi16(dst2, 11);
                mm_res = dst2;

                // green: blended in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask);
                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);
                mm_res = _mm_or_si64(mm_res, dst2);

                // blue
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask);
                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, bmask);
                mm_res = _mm_or_si64(mm_res, dst2);

                *(__m64 *)dstp = mm_res;

                srcp += 4;
                dstp += 4;
            }, width);
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
// MMX blit between two RGB555 surfaces using the constant surface alpha
// info->a. An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128.
//
// Otherwise DUFFS_LOOP_124 is given three bodies that process one, two and
// four pixels. The one- and two-pixel bodies use the scalar "spread the
// channels over 32 bits" blend; the four-pixel body blends each channel of
// four pixels at once with MMX.
static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // byte skip converted to 16-bit pixels
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        // NOTE(review): the 5-bit reduction of alpha below only happens when
        // USE_DUFFS_LOOP is defined, yet the scalar bodies always shift by 5.
        // Confirm USE_DUFFS_LOOP is always defined for this file.
#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);             // drop the low 3 bits so MMX and scalar paths agree
        mm_alpha = _mm_set_pi32(0, alpha);
        alpha >>= 3;                       // 5-bit alpha for the scalar bodies

        // Replicate alpha into all four 16-bit lanes
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
        // Pre-shift alpha so the same constant works with both mulhi (red and
        // green, which stay in place) and mullo (blue, followed by >> 11)
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // 555 channel masks
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);
#endif
        while (height--) {
            DUFFS_LOOP_124(
            {
                // one pixel
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // two pixels
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // four pixels
                src1 = *(__m64 *)srcp;
                dst1 = *(__m64 *)dstp;

                // red: blended in place
                src2 = src1;
                src2 = _mm_and_si64(src2, rmask);
                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, rmask);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, rmask);
                mm_res = dst2;

                // green: blended in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask);
                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);
                mm_res = _mm_or_si64(mm_res, dst2);

                // blue
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask);
                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask);
                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, bmask);
                mm_res = _mm_or_si64(mm_res, dst2);

                *(__m64 *)dstp = mm_res;

                srcp += 4;
                dstp += 4;
            }, width);
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
#endif
// Scalar blit between two RGB565 surfaces using the constant surface alpha
// info->a. An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128.
//
// Otherwise alpha is reduced to 5 bits and each pixel is spread over a 32-bit
// word (green moved to the upper half, red and blue kept in the lower half)
// so that all three channels are interpolated with one multiply.
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    Uint16 *srcp;
    Uint16 *dstp;
    int src_row_pad;
    int dst_row_pad;
    int row_width;
    int rows_left;

    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
        return;
    }

    srcp = (Uint16 *)info->src;
    dstp = (Uint16 *)info->dst;
    src_row_pad = info->src_skip >> 1; // byte skip converted to 16-bit pixels
    dst_row_pad = info->dst_skip >> 1;
    row_width = info->dst_w;
    rows_left = info->dst_h;
    alpha >>= 3; // 8-bit alpha -> 5-bit alpha

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            Uint32 src_px = *srcp;
            Uint32 dst_px = *dstp;

            src_px = (src_px | (src_px << 16)) & 0x07e0f81f;
            dst_px = (dst_px | (dst_px << 16)) & 0x07e0f81f;
            dst_px = (dst_px + (((src_px - dst_px) * alpha) >> 5)) & 0x07e0f81f;
            *dstp = (Uint16)(dst_px | (dst_px >> 16));

            ++srcp;
            ++dstp;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// Scalar blit between two RGB555 surfaces using the constant surface alpha
// info->a. An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128.
//
// Otherwise alpha is reduced to 5 bits and each pixel is spread over a 32-bit
// word (green moved to the upper half, red and blue kept in the lower half)
// so that all three channels are interpolated with one multiply.
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    Uint16 *srcp;
    Uint16 *dstp;
    int src_row_pad;
    int dst_row_pad;
    int row_width;
    int rows_left;

    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
        return;
    }

    srcp = (Uint16 *)info->src;
    dstp = (Uint16 *)info->dst;
    src_row_pad = info->src_skip >> 1; // byte skip converted to 16-bit pixels
    dst_row_pad = info->dst_skip >> 1;
    row_width = info->dst_w;
    rows_left = info->dst_h;
    alpha >>= 3; // 8-bit alpha -> 5-bit alpha

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            Uint32 src_px = *srcp;
            Uint32 dst_px = *dstp;

            src_px = (src_px | (src_px << 16)) & 0x03e07c1f;
            dst_px = (dst_px | (dst_px << 16)) & 0x03e07c1f;
            dst_px = (dst_px + (((src_px - dst_px) * alpha) >> 5)) & 0x03e07c1f;
            *dstp = (Uint16)(dst_px | (dst_px >> 16));

            ++srcp;
            ++dstp;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// Blits 32-bit pixels that carry alpha in their top byte onto an RGB565
// surface, blending with per-pixel alpha at 5-bit precision.
//
// The top 5 bits of each source pixel are the blend factor: 0 leaves the
// destination untouched, SDL_ALPHA_OPAQUE >> 3 copies the converted source
// pixel, and anything in between interpolates all three channels at once in a
// 32-bit word with green moved to the upper half.
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
{
    Uint32 *srcp = (Uint32 *)info->src;
    Uint16 *dstp = (Uint16 *)info->dst;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip >> 2; // byte skip converted to 32-bit pixels
    const int dst_row_pad = info->dst_skip >> 1; // byte skip converted to 16-bit pixels
    int rows_left = info->dst_h;

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            Uint32 s = *srcp;
            const unsigned alpha = s >> 27;

            if (alpha == 0) {
                // transparent at 5-bit precision: destination stays as it is
            } else if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
                *dstp = (Uint16)(((s >> 8) & 0xf800) + ((s >> 5) & 0x7e0) + ((s >> 3) & 0x1f));
            } else {
                Uint32 d = *dstp;

                // Reduce the source to 565 already spread out like d below
                s = ((s & 0xfc00) << 11) + ((s >> 8) & 0xf800) + ((s >> 3) & 0x1f);
                d = (d | (d << 16)) & 0x07e0f81f;
                d = (d + (((s - d) * alpha) >> 5)) & 0x07e0f81f;
                *dstp = (Uint16)(d | (d >> 16));
            }
            srcp++;
            dstp++;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// Blits 32-bit pixels that carry alpha in their top byte onto an RGB555
// surface, blending with per-pixel alpha at 5-bit precision.
//
// The top 5 bits of each source pixel are the blend factor: 0 leaves the
// destination untouched, SDL_ALPHA_OPAQUE >> 3 copies the converted source
// pixel, and anything in between interpolates all three channels at once in a
// 32-bit word with green moved to the upper half.
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
{
    Uint32 *srcp = (Uint32 *)info->src;
    Uint16 *dstp = (Uint16 *)info->dst;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip >> 2; // byte skip converted to 32-bit pixels
    const int dst_row_pad = info->dst_skip >> 1; // byte skip converted to 16-bit pixels
    int rows_left = info->dst_h;

    while (rows_left-- != 0) {
        DUFFS_LOOP({
            Uint32 s = *srcp;
            const unsigned alpha = s >> 27;

            if (alpha == 0) {
                // transparent at 5-bit precision: destination stays as it is
            } else if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
                *dstp = (Uint16)(((s >> 9) & 0x7c00) + ((s >> 6) & 0x3e0) + ((s >> 3) & 0x1f));
            } else {
                Uint32 d = *dstp;

                // Reduce the source to 555 already spread out like d below
                s = ((s & 0xf800) << 10) + ((s >> 9) & 0x7c00) + ((s >> 3) & 0x1f);
                d = (d | (d << 16)) & 0x03e07c1f;
                d = (d + (((s - d) * alpha) >> 5)) & 0x03e07c1f;
                *dstp = (Uint16)(d | (d >> 16));
            }
            srcp++;
            dstp++;
        }, row_width);
        srcp += src_row_pad;
        dstp += dst_row_pad;
    }
}
// General blit between arbitrary pixel formats using the constant surface
// alpha info->a. Nothing is written when that alpha is zero.
//
// Each source pixel is split into R/G/B, the destination pixel into R/G/B/A,
// the two are combined with ALPHA_BLEND_RGBA, and the result is written back
// in the destination format.
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    int dstbpp = dstfmt->bytes_per_pixel;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    const unsigned sA = info->a;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB;
    unsigned dR, dG, dB, dA;

    if (!sA) {
        return;
    }

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
            src += srcbpp;
            dst += dstbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// General blit between arbitrary pixel formats using the constant surface
// alpha info->a together with a source color key.
//
// The raw source pixel is always fetched. It is blended into the destination
// only when the surface alpha is non-zero and the raw value differs from
// info->colorkey.
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    int dstbpp = dstfmt->bytes_per_pixel;
    const Uint32 ckey = info->colorkey;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    const unsigned sA = info->a;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB;
    unsigned dR, dG, dB, dA;

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
            if (Pixel != ckey && sA) {
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
                DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
                ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
                ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
            }
            src += srcbpp;
            dst += dstbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// Scalar per-pixel-alpha blit between two 32-bit surfaces; the dispatcher in
// this file selects it when source and destination formats are identical.
// Every pixel is blended with ALPHA_BLEND_8888 using the source format
// description.
static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    int rows_left = info->dst_h;

    while (rows_left-- != 0) {
        int remaining;

        for (remaining = row_width; remaining > 0; --remaining) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;

            ALPHA_BLEND_8888(src32, dst32, srcfmt);
            *(Uint32 *)dst = dst32;

            src += 4;
            dst += 4;
        }
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// Scalar per-pixel-alpha blit between two 32-bit surfaces whose channel
// orders may differ. Every pixel is blended with ALPHA_BLEND_SWIZZLE_8888.
// When the destination format has no alpha mask, the bits reported by
// SDL_Get8888AlphaMaskAndShift are OR-ed into each written pixel.
static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    int rows_left = info->dst_h;
    Uint32 dstAmask, dstAshift;
    Uint32 forced_alpha_bits;

    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
    // Bits to force on in every output pixel: none if the destination has a real alpha channel
    forced_alpha_bits = dstfmt->Amask ? 0 : dstAmask;

    while (rows_left-- != 0) {
        int remaining;

        for (remaining = row_width; remaining > 0; --remaining) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;

            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
            *(Uint32 *)dst = dst32 | forced_alpha_bits;

            src += 4;
            dst += 4;
        }
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
#ifdef SDL_SSE4_1_INTRINSICS
// SSE4.1 version of the 8888 -> 8888 per-pixel-alpha blit that also reorders
// channels between the source and destination layouts. Four pixels (16 bytes)
// are handled per vector iteration; the rest of each row goes through the
// scalar ALPHA_BLEND_SWIZZLE_8888 macro. When the destination format has no
// alpha mask, the bits reported by SDL_Get8888AlphaMaskAndShift are OR-ed
// into every written pixel.
static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
Uint8 *src = info->src;
// src_skip / dst_skip are added to the byte pointers after every row
int srcskip = info->src_skip;
Uint8 *dst = info->dst;
int dstskip = info->dst_skip;
const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
// true when the destination format has no alpha channel
bool fill_alpha = !dstfmt->Amask;
Uint32 dstAmask, dstAshift;
SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
// Base byte index of each of the four pixels, repeated for every byte of the pixel
const __m128i mask_offsets = _mm_set_epi8(
12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
// Shuffle control for the swizzle: at each destination channel's bit position
// it holds the byte index (shift >> 3) of that channel in the source pixel,
// plus the per-pixel base offset.
const __m128i convert_mask = _mm_add_epi32(
_mm_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);
// Shuffle control selecting each pixel's source alpha byte for all four of its bytes
const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstAmask);
while (height--) {
int i = 0;
for (; i + 4 <= width; i += 4) {
// Load 4 source and 4 destination pixels
__m128i src128 = _mm_loadu_si128((__m128i *)src);
__m128i dst128 = _mm_loadu_si128((__m128i *)dst);
// Extract the source alpha, then rearrange the source channels
__m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
src128 = _mm_shuffle_epi8(src128, convert_mask);
// Set the destination-alpha bits in the rearranged source pixels
src128 = _mm_or_si128(src128, alpha_fill_mask);
// Interleave alpha with itself, then flip the upper byte of every 16-bit
// lane so each lane holds the byte pair (A, A ^ 0xff) = (A, 255 - A)
__m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA);
__m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA);
srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00));
srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00));
// maddubs multiplies unsigned bytes by signed bytes, so both pixel
// operands are biased by -128 to fit the signed range
src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)128));
dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)128));
// Per channel: A * (src - 128) + (255 - A) * (dst - 128)
__m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128));
__m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128));
// Add 1, plus 128 * 255 to undo the bias (A * 128 + (255 - A) * 128 == 128 * 255)
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128 * 255));
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128 * 255));
// High half of value * 257, i.e. (value * 257) >> 16: approximate divide by 255
dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257));
dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257));
// Narrow the 16-bit results back to bytes
dst128 = _mm_packus_epi16(dst_lo, dst_hi);
if (fill_alpha) {
dst128 = _mm_or_si128(dst128, alpha_fill_mask);
}
_mm_storeu_si128((__m128i *)dst, dst128);
src += 16;
dst += 16;
}
// Scalar tail for the last width % 4 pixels of the row
for (; i < width; ++i) {
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;
ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
if (fill_alpha) {
dst32 |= dstAmask;
}
*(Uint32 *)dst = dst32;
src += 4;
dst += 4;
}
src += srcskip;
dst += dstskip;
}
}
#endif
#ifdef SDL_AVX2_INTRINSICS
// AVX2 version of the 8888 -> 8888 per-pixel-alpha blit that also reorders
// channels between the source and destination layouts. Eight pixels
// (32 bytes) are handled per vector iteration; the rest of each row goes
// through the scalar ALPHA_BLEND_SWIZZLE_8888 macro. When the destination
// format has no alpha mask, the bits reported by SDL_Get8888AlphaMaskAndShift
// are OR-ed into every written pixel.
static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
Uint8 *src = info->src;
// src_skip / dst_skip are added to the byte pointers after every row
int srcskip = info->src_skip;
Uint8 *dst = info->dst;
int dstskip = info->dst_skip;
const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
// true when the destination format has no alpha channel
bool fill_alpha = !dstfmt->Amask;
Uint32 dstAmask, dstAshift;
SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
// Base byte index of each of the eight pixels, repeated for every byte of the pixel.
// NOTE(review): the unpack/pack/shuffle steps below presumably operate per
// 128-bit lane, so lane-local ordering is consistent end to end - confirm
// against the intrinsics reference.
const __m256i mask_offsets = _mm256_set_epi8(
28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
// Shuffle control for the swizzle: at each destination channel's bit position
// it holds the byte index (shift >> 3) of that channel in the source pixel,
// plus the per-pixel base offset.
const __m256i convert_mask = _mm256_add_epi32(
_mm256_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);
// Shuffle control selecting each pixel's source alpha byte for all four of its bytes
const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstAmask);
while (height--) {
int i = 0;
for (; i + 8 <= width; i += 8) {
// Load 8 source and 8 destination pixels
__m256i src256 = _mm256_loadu_si256((__m256i *)src);
__m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
// Extract the source alpha, then rearrange the source channels
__m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
src256 = _mm256_shuffle_epi8(src256, convert_mask);
// Set the destination-alpha bits in the rearranged source pixels
src256 = _mm256_or_si256(src256, alpha_fill_mask);
// Interleave alpha with itself, then flip the upper byte of every 16-bit
// lane so each lane holds the byte pair (A, A ^ 0xff) = (A, 255 - A)
__m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA);
__m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA);
alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00));
alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00));
// maddubs multiplies unsigned bytes by signed bytes, so both pixel
// operands are biased by -128 to fit the signed range
src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128));
dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128));
// Per channel: A * (src - 128) + (255 - A) * (dst - 128)
__m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256));
__m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256));
// Add 1, plus 128 * 255 to undo the bias (A * 128 + (255 - A) * 128 == 128 * 255)
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128 * 255));
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128 * 255));
// High half of value * 257, i.e. (value * 257) >> 16: approximate divide by 255
dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257));
dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257));
// Narrow the 16-bit results back to bytes
dst256 = _mm256_packus_epi16(dst_lo, dst_hi);
if (fill_alpha) {
dst256 = _mm256_or_si256(dst256, alpha_fill_mask);
}
_mm256_storeu_si256((__m256i *)dst, dst256);
src += 32;
dst += 32;
}
// Scalar tail for the last width % 8 pixels of the row
for (; i < width; ++i) {
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;
ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
if (fill_alpha) {
dst32 |= dstAmask;
}
*(Uint32 *)dst = dst32;
src += 4;
dst += 4;
}
src += srcskip;
dst += dstskip;
}
}
#endif
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
// NEON (AArch64) version of the 8888 -> 8888 per-pixel-alpha blit that also
// reorders channels between the source and destination layouts. Four pixels
// (16 bytes) are handled per vector iteration; the remaining pixels of each
// row are blended one at a time with the 64-bit NEON forms of the same
// operations. When the destination format has no alpha mask, the bits
// reported by SDL_Get8888AlphaMaskAndShift are OR-ed into every written pixel.
static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
Uint8 *src = info->src;
// src_skip / dst_skip are added to the byte pointers after every row
int srcskip = info->src_skip;
Uint8 *dst = info->dst;
int dstskip = info->dst_skip;
const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
// true when the destination format has no alpha channel
bool fill_alpha = !dstfmt->Amask;
Uint32 dstAmask, dstAshift;
SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
// Base byte index of each of the four pixels (0, 4, 8, 12), repeated for every byte of the pixel
const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64(
vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808)));
// Table-lookup indices for the swizzle: at each destination channel's bit
// position they hold the byte index (shift >> 3) of that channel in the
// source pixel, plus the per-pixel base offset.
const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32(
vreinterpretq_u32_u8(mask_offsets),
vdupq_n_u32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift))));
// Table-lookup indices selecting each pixel's source alpha byte for all four of its bytes
const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets);
const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstAmask));
while (height--) {
int i = 0;
for (; i + 4 <= width; i += 4) {
// Load 4 source and 4 destination pixels
uint8x16_t src128 = vld1q_u8(src);
uint8x16_t dst128 = vld1q_u8(dst);
// Extract the source alpha, then rearrange the source channels
uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask);
src128 = vqtbl1q_u8(src128, convert_mask);
// Set the destination-alpha bits in the rearranged source pixels
src128 = vorrq_u8(src128, alpha_fill_mask);
// Bitwise NOT of a byte is 255 - A
uint8x16_t srcInvA = vmvnq_u8(srcA);
// res = 1 + A * src + (255 - A) * dst, accumulated in 16-bit lanes
uint16x8_t res_lo = vdupq_n_u16(1);
uint16x8_t res_hi = vdupq_n_u16(1);
res_lo = vmlal_u8(res_lo, vget_low_u8(srcA), vget_low_u8(src128));
res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128));
res_hi = vmlal_high_u8(res_hi, srcA, src128);
res_hi = vmlal_high_u8(res_hi, srcInvA, dst128);
// Add res >> 8 to res and keep the high byte of each sum:
// (res + (res >> 8)) >> 8, an approximate divide by 255
uint8x8_t temp;
temp = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8));
dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8));
if (fill_alpha) {
dst128 = vorrq_u8(dst128, alpha_fill_mask);
}
vst1q_u8(dst, dst128);
src += 16;
dst += 16;
}
// Tail: one pixel at a time. The pixel is loaded into both 32-bit lanes of a
// 64-bit vector and only lane 0 is stored back.
for (; i < width; ++i) {
uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)src));
uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)dst));
uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask));
src32 = vtbl1_u8(src32, vget_low_u8(convert_mask));
src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask));
uint8x8_t srcInvA = vmvn_u8(srcA);
uint16x8_t res = vdupq_n_u16(1);
res = vmlal_u8(res, srcA, src32);
res = vmlal_u8(res, srcInvA, dst32);
dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8));
if (fill_alpha) {
dst32 = vorr_u8(dst32, vget_low_u8(alpha_fill_mask));
}
vst1_lane_u32((Uint32 *)dst, vreinterpret_u32_u8(dst32), 0);
src += 4;
dst += 4;
}
src += srcskip;
dst += dstskip;
}
}
#endif
// General blit between arbitrary pixel formats using per-pixel source alpha.
//
// Each source pixel is split into R/G/B/A. Pixels whose alpha is zero leave
// the destination untouched; all others are combined with the destination
// through ALPHA_BLEND_RGBA and written back in the destination format.
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
{
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    Uint8 *src = info->src;
    Uint8 *dst = info->dst;
    int srcbpp = srcfmt->bytes_per_pixel;
    int dstbpp = dstfmt->bytes_per_pixel;
    const int row_width = info->dst_w;
    const int src_row_pad = info->src_skip; // bytes between the end of one row and the next
    const int dst_row_pad = info->dst_skip;
    int rows_left = info->dst_h;
    Uint32 Pixel;
    unsigned sR, sG, sB, sA;
    unsigned dR, dG, dB, dA;

    while (rows_left-- != 0) {
        DUFFS_LOOP(
        {
            DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
            if (sA) {
                DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
                ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
                ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
            }
            src += srcbpp;
            dst += dstbpp;
        },
        row_width);
        src += src_row_pad;
        dst += dst_row_pad;
    }
}
// Chooses the alpha-blit routine for `surface` from its blit-map flags (with
// the RLE bits masked off) and the source/destination pixel formats.
//
// Three flag combinations are handled:
//   - SDL_COPY_BLEND: per-pixel alpha
//   - SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: constant surface alpha,
//     only when the source format has no alpha mask
//   - SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: surface
//     alpha plus color key, only when the source format has no alpha mask
//
// Returns NULL when no routine in this file applies.
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
{
    const SDL_PixelFormatDetails *sf = surface->fmt;
    const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt;

    switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) {
    case SDL_COPY_BLEND:
        // Per-pixel alpha
        if (df->bytes_per_pixel == 1) {
            // Without a destination palette the 8-bit target goes through the generic path
            return surface->map.info.dst_pal ? BlitNto1PixelAlpha : BlitNtoNPixelAlpha;
        }

        if (df->bytes_per_pixel == 2) {
            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 &&
                ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
                if (df->Gmask == 0x7e0) {
                    return BlitARGBto565PixelAlpha;
                }
                if (df->Gmask == 0x3e0 && !df->Amask) {
                    return BlitARGBto555PixelAlpha;
                }
            }
            return BlitNtoNPixelAlpha;
        }

        if (df->bytes_per_pixel == 4 &&
            SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
            SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
#ifdef SDL_AVX2_INTRINSICS
            if (SDL_HasAVX2()) {
                return Blit8888to8888PixelAlphaSwizzleAVX2;
            }
#endif
#ifdef SDL_SSE4_1_INTRINSICS
            if (SDL_HasSSE41()) {
                return Blit8888to8888PixelAlphaSwizzleSSE41;
            }
#endif
#ifdef SDL_LSX_INTRINSICS
            if (SDL_HasLSX()) {
                return Blit8888to8888PixelAlphaSwizzleLSX;
            }
#endif
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
            // Reference the scalar routines so they do not count as unused in this build
            (void)Blit8888to8888PixelAlpha;
            (void)Blit8888to8888PixelAlphaSwizzle;
            return Blit8888to8888PixelAlphaSwizzleNEON;
#else
            return (sf->format == df->format) ? Blit8888to8888PixelAlpha
                                              : Blit8888to8888PixelAlphaSwizzle;
#endif
        }

        // 3 bytes per pixel and everything not specialized above
        return BlitNtoNPixelAlpha;

    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
        // Constant surface alpha; only for sources without an alpha channel
        if (sf->Amask == 0) {
            switch (df->bytes_per_pixel) {
            case 1:
                return surface->map.info.dst_pal ? BlitNto1SurfaceAlpha : BlitNtoNSurfaceAlpha;

            case 2:
                if (surface->map.identity && df->Gmask == 0x7e0) {
#ifdef SDL_MMX_INTRINSICS
                    if (SDL_HasMMX()) {
                        return Blit565to565SurfaceAlphaMMX;
                    }
#endif
                    return Blit565to565SurfaceAlpha;
                }
                if (surface->map.identity && df->Gmask == 0x3e0) {
#ifdef SDL_MMX_INTRINSICS
                    if (SDL_HasMMX()) {
                        return Blit555to555SurfaceAlphaMMX;
                    }
#endif
                    return Blit555to555SurfaceAlpha;
                }
                return BlitNtoNSurfaceAlpha;

            case 4:
                if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask &&
                    sf->bytes_per_pixel == 4) {
#ifdef SDL_SSE2_INTRINSICS
                    if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) {
                        return Blit888to888SurfaceAlphaSSE2;
                    }
#endif
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
                        return BlitRGBtoRGBSurfaceAlpha;
                    }
                }
                return BlitNtoNSurfaceAlpha;

            default:
                // 3 bytes per pixel and any other size
                return BlitNtoNSurfaceAlpha;
            }
        }
        break;

    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
        // Surface alpha plus color key; only for sources without an alpha channel
        if (sf->Amask == 0) {
            const bool to_palette = (df->bytes_per_pixel == 1) && surface->map.info.dst_pal;
            return to_palette ? BlitNto1SurfaceAlphaKey : BlitNtoNSurfaceAlphaKey;
        }
        break;

    default:
        break;
    }

    return NULL;
}
#endif