#include "ccap_convert_neon.h"
#include "ccap_convert.h"
#include <cstring>
namespace ccap {
// Reports whether the NEON code paths may be used.
// With ENABLE_NEON_IMP off the answer is always false; otherwise the result
// of hasNEON_() is evaluated on the first call and reused afterwards.
bool hasNEON() {
#if !ENABLE_NEON_IMP
    return false;
#else
    static const bool kNeonAvailable = hasNEON_();
    return kNeonAvailable;
#endif
}
#if ENABLE_NEON_IMP
template <int inputChannels, int outputChannels, int swapRB>
void colorShuffle_neon(const uint8_t* src, int srcStride,
uint8_t* dst, int dstStride,
int width, int height) {
static_assert((inputChannels == 3 || inputChannels == 4) &&
(outputChannels == 3 || outputChannels == 4),
"inputChannels and outputChannels must be 3 or 4");
constexpr int idxR = swapRB ? 2 : 0;
constexpr int idxG = 1;
constexpr int idxB = swapRB ? 0 : 2;
constexpr int idxA = 3;
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
for (int y = 0; y < height; ++y) {
const uint8_t* srcRow = src + y * srcStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
if constexpr (inputChannels == 4 && outputChannels == 4) {
for (; x + 16 <= width; x += 16) {
uint8x16x4_t rgba = vld4q_u8(srcRow + x * 4);
if constexpr (swapRB) {
uint8x16_t tmp = rgba.val[0];
rgba.val[0] = rgba.val[2];
rgba.val[2] = tmp;
}
vst4q_u8(dstRow + x * 4, rgba);
}
} else if constexpr (inputChannels == 3 && outputChannels == 3) {
for (; x + 16 <= width; x += 16) {
uint8x16x3_t rgb = vld3q_u8(srcRow + x * 3);
if constexpr (swapRB) {
uint8x16_t tmp = rgb.val[0];
rgb.val[0] = rgb.val[2];
rgb.val[2] = tmp;
}
vst3q_u8(dstRow + x * 3, rgb);
}
} else if constexpr (inputChannels == 3 && outputChannels == 4) {
for (; x + 16 <= width; x += 16) {
uint8x16x3_t rgb = vld3q_u8(srcRow + x * 3);
if constexpr (swapRB) {
uint8x16_t tmp = rgb.val[0];
rgb.val[0] = rgb.val[2];
rgb.val[2] = tmp;
}
uint8x16_t a = vdupq_n_u8(255);
uint8x16x4_t rgba;
rgba.val[0] = rgb.val[0];
rgba.val[1] = rgb.val[1];
rgba.val[2] = rgb.val[2];
rgba.val[3] = a;
vst4q_u8(dstRow + x * 4, rgba);
}
} else { for (; x + 16 <= width; x += 16) {
uint8x16x4_t rgba = vld4q_u8(srcRow + x * 4);
uint8x16x3_t rgb;
if constexpr (swapRB) {
rgb.val[0] = rgba.val[2];
rgb.val[1] = rgba.val[1];
rgb.val[2] = rgba.val[0];
} else {
rgb.val[0] = rgba.val[0];
rgb.val[1] = rgba.val[1];
rgb.val[2] = rgba.val[2];
}
vst3q_u8(dstRow + x * 3, rgb);
}
}
if constexpr (inputChannels == 4 && outputChannels == 4) {
for (; x + 8 <= width; x += 8) {
uint8x8x4_t rgba = vld4_u8(srcRow + x * 4);
if constexpr (swapRB) {
uint8x8_t tmp = rgba.val[0];
rgba.val[0] = rgba.val[2];
rgba.val[2] = tmp;
}
vst4_u8(dstRow + x * 4, rgba);
}
} else if constexpr (inputChannels == 3 && outputChannels == 3) {
for (; x + 8 <= width; x += 8) {
uint8x8x3_t rgb = vld3_u8(srcRow + x * 3);
if constexpr (swapRB) {
uint8x8_t tmp = rgb.val[0];
rgb.val[0] = rgb.val[2];
rgb.val[2] = tmp;
}
vst3_u8(dstRow + x * 3, rgb);
}
} else if constexpr (inputChannels == 3 && outputChannels == 4) {
for (; x + 8 <= width; x += 8) {
uint8x8x3_t rgb = vld3_u8(srcRow + x * 3);
if constexpr (swapRB) {
uint8x8_t tmp = rgb.val[0];
rgb.val[0] = rgb.val[2];
rgb.val[2] = tmp;
}
uint8x8_t a = vdup_n_u8(255);
uint8x8x4_t rgba;
rgba.val[0] = rgb.val[0];
rgba.val[1] = rgb.val[1];
rgba.val[2] = rgb.val[2];
rgba.val[3] = a;
vst4_u8(dstRow + x * 4, rgba);
}
} else { for (; x + 8 <= width; x += 8) {
uint8x8x4_t rgba = vld4_u8(srcRow + x * 4);
uint8x8x3_t rgb;
if constexpr (swapRB) {
rgb.val[0] = rgba.val[2];
rgb.val[1] = rgba.val[1];
rgb.val[2] = rgba.val[0];
} else {
rgb.val[0] = rgba.val[0];
rgb.val[1] = rgba.val[1];
rgb.val[2] = rgba.val[2];
}
vst3_u8(dstRow + x * 3, rgb);
}
}
for (; x < width; ++x) {
if constexpr (outputChannels == 4 && inputChannels == 4) {
const uint8_t* s = srcRow + x * 4;
uint8_t* d = dstRow + x * 4;
d[0] = s[idxR];
d[1] = s[idxG];
d[2] = s[idxB];
d[3] = s[idxA];
} else if constexpr (inputChannels == 3 && outputChannels == 3) {
const uint8_t* s = srcRow + x * 3;
uint8_t* d = dstRow + x * 3;
d[0] = s[idxR];
d[1] = s[idxG];
d[2] = s[idxB];
} else if constexpr (inputChannels == 3 && outputChannels == 4) {
const uint8_t* s = srcRow + x * 3;
uint8_t* d = dstRow + x * 4;
d[0] = s[idxR];
d[1] = s[idxG];
d[2] = s[idxB];
d[3] = 255;
} else { const uint8_t* s = srcRow + x * 4;
uint8_t* d = dstRow + x * 3;
d[0] = s[idxR];
d[1] = s[idxG];
d[2] = s[idxB];
}
}
}
}
// Explicit instantiations: every 3/4-channel input/output pairing, each
// without (0) and with (1) the channel 0 <-> channel 2 exchange.

// 4 channels in, 4 channels out
template void colorShuffle_neon<4, 4, 0>(const uint8_t*, int, uint8_t*, int, int, int);
template void colorShuffle_neon<4, 4, 1>(const uint8_t*, int, uint8_t*, int, int, int);

// 4 channels in, 3 channels out
template void colorShuffle_neon<4, 3, 0>(const uint8_t*, int, uint8_t*, int, int, int);
template void colorShuffle_neon<4, 3, 1>(const uint8_t*, int, uint8_t*, int, int, int);

// 3 channels in, 4 channels out
template void colorShuffle_neon<3, 4, 0>(const uint8_t*, int, uint8_t*, int, int, int);
template void colorShuffle_neon<3, 4, 1>(const uint8_t*, int, uint8_t*, int, int, int);

// 3 channels in, 3 channels out
template void colorShuffle_neon<3, 3, 0>(const uint8_t*, int, uint8_t*, int, int, int);
template void colorShuffle_neon<3, 3, 1>(const uint8_t*, int, uint8_t*, int, int, int);
// Forward declaration of the overload that does the actual table lookup; it
// is defined further down in this file.
inline void getYuvToRgbCoefficients_neon(bool isBT601, bool isFullRange,
                                         int& cy, int& cr, int& cgu, int& cgv,
                                         int& cb, int& y_offset);

// Convenience overload: reads the BT601 and FullRange bits out of `flag` and
// forwards to the (bool, bool, ...) overload, which fills the six outputs.
inline void getYuvToRgbCoefficients_neon(ConvertFlag flag,
                                         int& cy, int& cr, int& cgu, int& cgv,
                                         int& cb, int& y_offset) {
    getYuvToRgbCoefficients_neon((flag & ConvertFlag::BT601) != 0,
                                 (flag & ConvertFlag::FullRange) != 0,
                                 cy, cr, cgu, cgv, cb, y_offset);
}
// Scalar YUV -> RGB conversion for one pixel using 6-bit fixed-point
// coefficients.
//
//   y, u, v              already-offset samples (callers in this file pass
//                        Y - y_offset and U/V - 128)
//   r, g, b              receive the result, each clamped to 0..255
//   cy, cr, cgu, cgv, cb coefficients scaled by 64
inline void yuv2rgbGeneric_int(int y, int u, int v, int& r, int& g, int& b,
                               int cy, int cr, int cgu, int cgv, int cb) {
    // Round (+32 is half of 64), drop the 6 fractional bits, clamp to a byte.
    const auto toByteRange = [](int fixed) {
        const int value = (fixed + 32) >> 6;
        if (value < 0) {
            return 0;
        }
        if (value > 255) {
            return 255;
        }
        return value;
    };

    const int luma = cy * y;
    r = toByteRange(luma + cr * v);
    g = toByteRange(luma - cgu * u - cgv * v);
    b = toByteRange(luma + cb * u);
}
// Fills the fixed-point YUV -> RGB coefficients (scaled by 64, matching the
// `>> 6` used by the converters in this file) and the luma offset for the
// requested matrix / range combination.
//
//   isBT601      true selects the first pair of rows below, false the second
//   isFullRange  true selects cy = 64 and y_offset = 0,
//                false selects cy = 75 and y_offset = 16
inline void getYuvToRgbCoefficients_neon(bool isBT601, bool isFullRange, int& cy, int& cr, int& cgu, int& cgv, int& cb, int& y_offset) {
    struct Coefficients {
        int cy;
        int cr;
        int cgu;
        int cgv;
        int cb;
        int yOffset;
    };

    // Row index = (isBT601 ? 0 : 2) + (isFullRange ? 0 : 1).
    static constexpr Coefficients kTable[4] = {
        { 64, 88, 22, 45, 111, 0 },   // isBT601,  full range
        { 75, 102, 25, 52, 129, 16 }, // isBT601,  limited range
        { 64, 101, 12, 30, 119, 0 },  // !isBT601, full range
        { 75, 115, 14, 34, 135, 16 }, // !isBT601, limited range
    };

    const Coefficients& chosen = kTable[(isBT601 ? 0 : 2) + (isFullRange ? 0 : 1)];
    cy = chosen.cy;
    cr = chosen.cr;
    cgu = chosen.cgu;
    cgv = chosen.cgv;
    cb = chosen.cb;
    y_offset = chosen.yOffset;
}
template <bool isBGRA>
void nv12ToRgbaColor_neon_imp(const uint8_t* srcY, int srcYStride,
const uint8_t* srcUV, int srcUVStride,
uint8_t* dst, int dstStride,
int width, int height, bool is601, bool isFullRange) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
int cy, cr, cgu, cgv, cb, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy, cr, cgu, cgv, cb, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* yRow = srcY + y * srcYStride;
const uint8_t* uvRow = srcUV + (y / 2) * srcUVStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
for (; x + 16 <= width; x += 16) {
uint8x16_t y_vals = vld1q_u8(yRow + x);
uint8x16_t uv_vals = vld1q_u8(uvRow + x);
uint8x8x2_t uv_deint = vuzp_u8(vget_low_u8(uv_vals), vget_high_u8(uv_vals));
uint8x8_t u_vals = uv_deint.val[0]; uint8x8_t v_vals = uv_deint.val[1];
uint8x8x2_t u_dup = vzip_u8(u_vals, u_vals);
uint8x8x2_t v_dup = vzip_u8(v_vals, v_vals);
uint8x16_t u_expanded = vcombine_u8(u_dup.val[0], u_dup.val[1]);
uint8x16_t v_expanded = vcombine_u8(v_dup.val[0], v_dup.val[1]);
int16x8_t y_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t y_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t u_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(u_expanded), vdup_n_u8(128)));
int16x8_t u_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(u_expanded), vdup_n_u8(128)));
int16x8_t v_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_expanded), vdup_n_u8(128)));
int16x8_t v_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_expanded), vdup_n_u8(128)));
int16x8_t c_y = vdupq_n_s16(cy);
int16x8_t c_r = vdupq_n_s16(cr);
int16x8_t c_gu = vdupq_n_s16(cgu);
int16x8_t c_gv = vdupq_n_s16(cgv);
int16x8_t c_b = vdupq_n_s16(cb);
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, c_y);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, c_r));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, c_gu));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, c_gv));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, c_b));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, c_y);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, c_r));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, c_gu));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, c_gv));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, c_b));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16_t a8 = vdupq_n_u8(255);
if constexpr (isBGRA) {
uint8x16x4_t bgra;
bgra.val[0] = b8;
bgra.val[1] = g8;
bgra.val[2] = r8;
bgra.val[3] = a8;
vst4q_u8(dstRow + x * 4, bgra);
} else {
uint8x16x4_t rgba;
rgba.val[0] = r8;
rgba.val[1] = g8;
rgba.val[2] = b8;
rgba.val[3] = a8;
vst4q_u8(dstRow + x * 4, rgba);
}
}
for (; x < width; x += 2) {
int y0 = yRow[x] - y_offset;
int y1 = (x + 1 < width) ? yRow[x + 1] - y_offset : y0;
int u = uvRow[x] - 128; int v = uvRow[x + 1] - 128;
int r0 = (cy * y0 + cr * v) >> 6;
int g0 = (cy * y0 - cgu * u - cgv * v) >> 6;
int b0 = (cy * y0 + cb * u) >> 6;
int r1 = (cy * y1 + cr * v) >> 6;
int g1 = (cy * y1 - cgu * u - cgv * v) >> 6;
int b1 = (cy * y1 + cb * u) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGRA) {
dstRow[x * 4 + 0] = b0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = r0;
dstRow[x * 4 + 3] = 255;
if (x + 1 < width) {
dstRow[(x + 1) * 4 + 0] = b1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = r1;
dstRow[(x + 1) * 4 + 3] = 255;
}
} else {
dstRow[x * 4 + 0] = r0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = b0;
dstRow[x * 4 + 3] = 255;
if (x + 1 < width) {
dstRow[(x + 1) * 4 + 0] = r1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = b1;
dstRow[(x + 1) * 4 + 3] = 255;
}
}
}
}
}
template <bool isBGR>
void _nv12ToRgbColor_neon_imp(const uint8_t* srcY, int srcYStride,
const uint8_t* srcUV, int srcUVStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
const bool is601 = (flag & ConvertFlag::BT601) != 0;
const bool isFullRange = (flag & ConvertFlag::FullRange) != 0;
int cy, cr, cgu, cgv, cb, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy, cr, cgu, cgv, cb, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* yRow = srcY + y * srcYStride;
const uint8_t* uvRow = srcUV + (y / 2) * srcUVStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
for (; x + 16 <= width; x += 16) {
uint8x16_t y_vals = vld1q_u8(yRow + x);
uint8x16_t uv_vals = vld1q_u8(uvRow + x);
uint8x8x2_t uv_deint = vuzp_u8(vget_low_u8(uv_vals), vget_high_u8(uv_vals));
uint8x8_t u_vals = uv_deint.val[0]; uint8x8_t v_vals = uv_deint.val[1];
uint8x8x2_t u_dup = vzip_u8(u_vals, u_vals);
uint8x8x2_t v_dup = vzip_u8(v_vals, v_vals);
uint8x16_t u_expanded = vcombine_u8(u_dup.val[0], u_dup.val[1]);
uint8x16_t v_expanded = vcombine_u8(v_dup.val[0], v_dup.val[1]);
int16x8_t y_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t y_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t u_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(u_expanded), vdup_n_u8(128)));
int16x8_t u_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(u_expanded), vdup_n_u8(128)));
int16x8_t v_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_expanded), vdup_n_u8(128)));
int16x8_t v_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_expanded), vdup_n_u8(128)));
int16x8_t c_y = vdupq_n_s16(cy);
int16x8_t c_r = vdupq_n_s16(cr);
int16x8_t c_gu = vdupq_n_s16(cgu);
int16x8_t c_gv = vdupq_n_s16(cgv);
int16x8_t c_b = vdupq_n_s16(cb);
int16x8_t r_lo = vaddq_s16(vmulq_s16(c_y, y_lo), vmulq_s16(c_r, v_lo));
int16x8_t r_hi = vaddq_s16(vmulq_s16(c_y, y_hi), vmulq_s16(c_r, v_hi));
int16x8_t g_lo = vsubq_s16(vmulq_s16(c_y, y_lo), vaddq_s16(vmulq_s16(c_gu, u_lo), vmulq_s16(c_gv, v_lo)));
int16x8_t g_hi = vsubq_s16(vmulq_s16(c_y, y_hi), vaddq_s16(vmulq_s16(c_gu, u_hi), vmulq_s16(c_gv, v_hi)));
int16x8_t b_lo = vaddq_s16(vmulq_s16(c_y, y_lo), vmulq_s16(c_b, u_lo));
int16x8_t b_hi = vaddq_s16(vmulq_s16(c_y, y_hi), vmulq_s16(c_b, u_hi));
r_lo = vshrq_n_s16(r_lo, 6);
r_hi = vshrq_n_s16(r_hi, 6);
g_lo = vshrq_n_s16(g_lo, 6);
g_hi = vshrq_n_s16(g_hi, 6);
b_lo = vshrq_n_s16(b_lo, 6);
b_hi = vshrq_n_s16(b_hi, 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
if constexpr (isBGR) {
uint8x16x3_t bgr;
bgr.val[0] = b8;
bgr.val[1] = g8;
bgr.val[2] = r8;
vst3q_u8(dstRow + x * 3, bgr);
} else {
uint8x16x3_t rgb;
rgb.val[0] = r8;
rgb.val[1] = g8;
rgb.val[2] = b8;
vst3q_u8(dstRow + x * 3, rgb);
}
}
for (; x < width; x += 2) {
int y0 = yRow[x] - y_offset;
int y1 = (x + 1 < width) ? yRow[x + 1] - y_offset : y0;
int u = uvRow[x] - 128; int v = uvRow[x + 1] - 128;
int r0 = (cy * y0 + cr * v) >> 6;
int g0 = (cy * y0 - cgu * u - cgv * v) >> 6;
int b0 = (cy * y0 + cb * u) >> 6;
int r1 = (cy * y1 + cr * v) >> 6;
int g1 = (cy * y1 - cgu * u - cgv * v) >> 6;
int b1 = (cy * y1 + cb * u) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = b1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = r1;
}
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = r1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = b1;
}
}
}
}
}
template <bool isBGRA>
void _i420ToRgba_neon_imp(const uint8_t* srcY, int srcYStride,
const uint8_t* srcU, int srcUStride,
const uint8_t* srcV, int srcVStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
int cy, cr, cgu, cgv, cb, y_offset;
getYuvToRgbCoefficients_neon(flag, cy, cr, cgu, cgv, cb, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* yRow = srcY + y * srcYStride;
const uint8_t* uRow = srcU + (y / 2) * srcUStride;
const uint8_t* vRow = srcV + (y / 2) * srcVStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
for (; x + 16 <= width; x += 16) {
uint8x16_t y_vals = vld1q_u8(yRow + x);
uint8x8_t u_vals = vld1_u8(uRow + x / 2);
uint8x8_t v_vals = vld1_u8(vRow + x / 2);
uint8x8x2_t u_dup = vzip_u8(u_vals, u_vals);
uint8x8x2_t v_dup = vzip_u8(v_vals, v_vals);
uint8x16_t u_expanded = vcombine_u8(u_dup.val[0], u_dup.val[1]);
uint8x16_t v_expanded = vcombine_u8(v_dup.val[0], v_dup.val[1]);
int16x8_t y_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t y_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t u_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(u_expanded), vdup_n_u8(128)));
int16x8_t u_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(u_expanded), vdup_n_u8(128)));
int16x8_t v_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_expanded), vdup_n_u8(128)));
int16x8_t v_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_expanded), vdup_n_u8(128)));
int16x8_t cy_coeff = vdupq_n_s16(static_cast<int16_t>(cy));
int16x8_t cr_coeff = vdupq_n_s16(static_cast<int16_t>(cr));
int16x8_t cgu_coeff = vdupq_n_s16(static_cast<int16_t>(cgu));
int16x8_t cgv_coeff = vdupq_n_s16(static_cast<int16_t>(cgv));
int16x8_t cb_coeff = vdupq_n_s16(static_cast<int16_t>(cb));
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, cy_coeff);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, cr_coeff));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, cgu_coeff));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, cgv_coeff));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, cb_coeff));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, cy_coeff);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, cr_coeff));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, cgu_coeff));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, cgv_coeff));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, cb_coeff));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16_t a8 = vdupq_n_u8(255);
if constexpr (isBGRA) {
uint8x16x4_t bgra;
bgra.val[0] = b8;
bgra.val[1] = g8;
bgra.val[2] = r8;
bgra.val[3] = a8;
vst4q_u8(dstRow + x * 4, bgra);
} else {
uint8x16x4_t rgba;
rgba.val[0] = r8;
rgba.val[1] = g8;
rgba.val[2] = b8;
rgba.val[3] = a8;
vst4q_u8(dstRow + x * 4, rgba);
}
}
for (; x < width; x += 2) {
int y0 = yRow[x] - y_offset;
int y1 = (x + 1 < width) ? yRow[x + 1] - y_offset : y0;
int u = uRow[x / 2] - 128;
int v = vRow[x / 2] - 128;
int r0, g0, b0, r1, g1, b1;
yuv2rgbGeneric_int(y0, u, v, r0, g0, b0, cy, cr, cgu, cgv, cb);
yuv2rgbGeneric_int(y1, u, v, r1, g1, b1, cy, cr, cgu, cgv, cb);
if constexpr (isBGRA) {
dstRow[x * 4 + 0] = b0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = r0;
dstRow[x * 4 + 3] = 255;
if (x + 1 < width) {
dstRow[(x + 1) * 4 + 0] = b1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = r1;
dstRow[(x + 1) * 4 + 3] = 255;
}
} else {
dstRow[x * 4 + 0] = r0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = b0;
dstRow[x * 4 + 3] = 255;
if (x + 1 < width) {
dstRow[(x + 1) * 4 + 0] = r1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = b1;
dstRow[(x + 1) * 4 + 3] = 255;
}
}
}
}
}
template <bool isBGR>
void _i420ToRgb_neon_imp(const uint8_t* srcY, int srcYStride,
const uint8_t* srcU, int srcUStride,
const uint8_t* srcV, int srcVStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
int cy, cr, cgu, cgv, cb, y_offset;
getYuvToRgbCoefficients_neon(flag, cy, cr, cgu, cgv, cb, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* yRow = srcY + y * srcYStride;
const uint8_t* uRow = srcU + (y / 2) * srcUStride;
const uint8_t* vRow = srcV + (y / 2) * srcVStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
for (; x + 16 <= width; x += 16) {
uint8x16_t y_vals = vld1q_u8(yRow + x);
uint8x8_t u_vals = vld1_u8(uRow + x / 2);
uint8x8_t v_vals = vld1_u8(vRow + x / 2);
uint8x8x2_t u_dup = vzip_u8(u_vals, u_vals);
uint8x8x2_t v_dup = vzip_u8(v_vals, v_vals);
uint8x16_t u_expanded = vcombine_u8(u_dup.val[0], u_dup.val[1]);
uint8x16_t v_expanded = vcombine_u8(v_dup.val[0], v_dup.val[1]);
int16x8_t y_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t y_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t u_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(u_expanded), vdup_n_u8(128)));
int16x8_t u_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(u_expanded), vdup_n_u8(128)));
int16x8_t v_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_expanded), vdup_n_u8(128)));
int16x8_t v_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_expanded), vdup_n_u8(128)));
int16x8_t cy_coeff = vdupq_n_s16(static_cast<int16_t>(cy));
int16x8_t cr_coeff = vdupq_n_s16(static_cast<int16_t>(cr));
int16x8_t cgu_coeff = vdupq_n_s16(static_cast<int16_t>(cgu));
int16x8_t cgv_coeff = vdupq_n_s16(static_cast<int16_t>(cgv));
int16x8_t cb_coeff = vdupq_n_s16(static_cast<int16_t>(cb));
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, cy_coeff);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, cr_coeff));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, cgu_coeff));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, cgv_coeff));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, cb_coeff));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, cy_coeff);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, cr_coeff));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, cgu_coeff));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, cgv_coeff));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, cb_coeff));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16x3_t interleaved;
if constexpr (isBGR) {
interleaved.val[0] = b8;
interleaved.val[1] = g8;
interleaved.val[2] = r8;
} else {
interleaved.val[0] = r8;
interleaved.val[1] = g8;
interleaved.val[2] = b8;
}
vst3q_u8(dstRow + x * 3, interleaved);
}
for (; x < width; x += 2) {
int y0 = yRow[x] - y_offset;
int y1 = (x + 1 < width) ? yRow[x + 1] - y_offset : y0;
int u = uRow[x / 2] - 128;
int v = vRow[x / 2] - 128;
int r0, g0, b0, r1, g1, b1;
yuv2rgbGeneric_int(y0, u, v, r0, g0, b0, cy, cr, cgu, cgv, cb);
yuv2rgbGeneric_int(y1, u, v, r1, g1, b1, cy, cr, cgu, cgv, cb);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = b1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = r1;
}
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = r1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = b1;
}
}
}
}
}
// NV12 -> 32-bit BGRA. Reads the BT601 and FullRange bits from `flag` and
// forwards everything to the shared NV12 kernel in B,G,R,A channel order.
void nv12ToBgra32_neon(const uint8_t* srcY, int srcYStride,
                       const uint8_t* srcUV, int srcUVStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGRA = true;
    nv12ToRgbaColor_neon_imp<kWriteBGRA>(srcY, srcYStride, srcUV, srcUVStride,
                                         dst, dstStride, width, height,
                                         (flag & ConvertFlag::BT601) != 0,
                                         (flag & ConvertFlag::FullRange) != 0);
}
// NV12 -> 32-bit RGBA. Reads the BT601 and FullRange bits from `flag` and
// forwards everything to the shared NV12 kernel in R,G,B,A channel order.
void nv12ToRgba32_neon(const uint8_t* srcY, int srcYStride,
                       const uint8_t* srcUV, int srcUVStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGRA = false;
    nv12ToRgbaColor_neon_imp<kWriteBGRA>(srcY, srcYStride, srcUV, srcUVStride,
                                         dst, dstStride, width, height,
                                         (flag & ConvertFlag::BT601) != 0,
                                         (flag & ConvertFlag::FullRange) != 0);
}
// NV12 -> 24-bit BGR. Forwards all arguments unchanged to the shared NV12
// 24-bit kernel in B,G,R channel order.
void nv12ToBgr24_neon(const uint8_t* srcY, int srcYStride,
                      const uint8_t* srcUV, int srcUVStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGR = true;
    _nv12ToRgbColor_neon_imp<kWriteBGR>(srcY, srcYStride,
                                        srcUV, srcUVStride,
                                        dst, dstStride,
                                        width, height, flag);
}
// NV12 -> 24-bit RGB. Forwards all arguments unchanged to the shared NV12
// 24-bit kernel in R,G,B channel order.
void nv12ToRgb24_neon(const uint8_t* srcY, int srcYStride,
                      const uint8_t* srcUV, int srcUVStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGR = false;
    _nv12ToRgbColor_neon_imp<kWriteBGR>(srcY, srcYStride,
                                        srcUV, srcUVStride,
                                        dst, dstStride,
                                        width, height, flag);
}
// I420 -> 32-bit BGRA. Forwards all arguments unchanged to the shared I420
// 32-bit kernel in B,G,R,A channel order.
void i420ToBgra32_neon(const uint8_t* srcY, int srcYStride,
                       const uint8_t* srcU, int srcUStride,
                       const uint8_t* srcV, int srcVStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGRA = true;
    _i420ToRgba_neon_imp<kWriteBGRA>(srcY, srcYStride,
                                     srcU, srcUStride,
                                     srcV, srcVStride,
                                     dst, dstStride,
                                     width, height, flag);
}
// I420 -> 32-bit RGBA. Forwards all arguments unchanged to the shared I420
// 32-bit kernel in R,G,B,A channel order.
void i420ToRgba32_neon(const uint8_t* srcY, int srcYStride,
                       const uint8_t* srcU, int srcUStride,
                       const uint8_t* srcV, int srcVStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGRA = false;
    _i420ToRgba_neon_imp<kWriteBGRA>(srcY, srcYStride,
                                     srcU, srcUStride,
                                     srcV, srcVStride,
                                     dst, dstStride,
                                     width, height, flag);
}
// I420 -> 24-bit BGR. Forwards all arguments unchanged to the shared I420
// 24-bit kernel in B,G,R channel order.
void i420ToBgr24_neon(const uint8_t* srcY, int srcYStride,
                      const uint8_t* srcU, int srcUStride,
                      const uint8_t* srcV, int srcVStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGR = true;
    _i420ToRgb_neon_imp<kWriteBGR>(srcY, srcYStride,
                                   srcU, srcUStride,
                                   srcV, srcVStride,
                                   dst, dstStride,
                                   width, height, flag);
}
// I420 -> 24-bit RGB. Forwards all arguments unchanged to the shared I420
// 24-bit kernel in R,G,B channel order.
void i420ToRgb24_neon(const uint8_t* srcY, int srcYStride,
                      const uint8_t* srcU, int srcUStride,
                      const uint8_t* srcV, int srcVStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kWriteBGR = false;
    _i420ToRgb_neon_imp<kWriteBGR>(srcY, srcYStride,
                                   srcU, srcUStride,
                                   srcV, srcVStride,
                                   dst, dstStride,
                                   width, height, flag);
}
template <bool isBGRA>
void _yuyvToRgba_neon_imp(const uint8_t* src, int srcStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
const bool is601 = (flag & ConvertFlag::BT601) != 0;
const bool isFullRange = (flag & ConvertFlag::FullRange) != 0;
int cy, cr, cgu, cgv, cb, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy, cr, cgu, cgv, cb, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* srcRow = src + y * srcStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
for (; x + 16 <= width; x += 16) {
uint8x16x2_t yuyv_data = vld2q_u8(srcRow + x * 2);
uint8x16_t y_vals = yuyv_data.val[0]; uint8x16_t uv_vals = yuyv_data.val[1];
uint8x8x2_t uv_deint = vuzp_u8(vget_low_u8(uv_vals), vget_high_u8(uv_vals));
uint8x8_t u_vals = uv_deint.val[0];
uint8x8_t v_vals = uv_deint.val[1];
uint8x8x2_t u_dup = vzip_u8(u_vals, u_vals);
uint8x8x2_t v_dup = vzip_u8(v_vals, v_vals);
uint8x16_t u_expanded = vcombine_u8(u_dup.val[0], u_dup.val[1]);
uint8x16_t v_expanded = vcombine_u8(v_dup.val[0], v_dup.val[1]);
int16x8_t y_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t y_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(y_vals), vdup_n_u8(y_offset)));
int16x8_t u_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(u_expanded), vdup_n_u8(128)));
int16x8_t u_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(u_expanded), vdup_n_u8(128)));
int16x8_t v_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_expanded), vdup_n_u8(128)));
int16x8_t v_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_expanded), vdup_n_u8(128)));
int16x8_t c_y = vdupq_n_s16(cy);
int16x8_t c_r = vdupq_n_s16(cr);
int16x8_t c_gu = vdupq_n_s16(cgu);
int16x8_t c_gv = vdupq_n_s16(cgv);
int16x8_t c_b = vdupq_n_s16(cb);
int16x8_t r_lo = vaddq_s16(vmulq_s16(c_y, y_lo), vmulq_s16(c_r, v_lo));
int16x8_t r_hi = vaddq_s16(vmulq_s16(c_y, y_hi), vmulq_s16(c_r, v_hi));
int16x8_t g_lo = vsubq_s16(vmulq_s16(c_y, y_lo), vaddq_s16(vmulq_s16(c_gu, u_lo), vmulq_s16(c_gv, v_lo)));
int16x8_t g_hi = vsubq_s16(vmulq_s16(c_y, y_hi), vaddq_s16(vmulq_s16(c_gu, u_hi), vmulq_s16(c_gv, v_hi)));
int16x8_t b_lo = vaddq_s16(vmulq_s16(c_y, y_lo), vmulq_s16(c_b, u_lo));
int16x8_t b_hi = vaddq_s16(vmulq_s16(c_y, y_hi), vmulq_s16(c_b, u_hi));
r_lo = vshrq_n_s16(r_lo, 6);
r_hi = vshrq_n_s16(r_hi, 6);
g_lo = vshrq_n_s16(g_lo, 6);
g_hi = vshrq_n_s16(g_hi, 6);
b_lo = vshrq_n_s16(b_lo, 6);
b_hi = vshrq_n_s16(b_hi, 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16_t a8 = vdupq_n_u8(255);
if constexpr (isBGRA) {
uint8x16x4_t bgra;
bgra.val[0] = b8;
bgra.val[1] = g8;
bgra.val[2] = r8;
bgra.val[3] = a8;
vst4q_u8(dstRow + x * 4, bgra);
} else {
uint8x16x4_t rgba;
rgba.val[0] = r8;
rgba.val[1] = g8;
rgba.val[2] = b8;
rgba.val[3] = a8;
vst4q_u8(dstRow + x * 4, rgba);
}
}
for (; x < width; x += 2) {
if (x + 1 >= width) break;
uint8_t y0 = srcRow[x * 2];
uint8_t u = srcRow[x * 2 + 1];
uint8_t y1 = srcRow[x * 2 + 2];
uint8_t v = srcRow[x * 2 + 3];
int c_y0 = (int)y0 - y_offset;
int c_y1 = (int)y1 - y_offset;
int c_u = (int)u - 128;
int c_v = (int)v - 128;
int r0 = (cy * c_y0 + cr * c_v + 32) >> 6;
int g0 = (cy * c_y0 - cgu * c_u - cgv * c_v + 32) >> 6;
int b0 = (cy * c_y0 + cb * c_u + 32) >> 6;
int r1 = (cy * c_y1 + cr * c_v + 32) >> 6;
int g1 = (cy * c_y1 - cgu * c_u - cgv * c_v + 32) >> 6;
int b1 = (cy * c_y1 + cb * c_u + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGRA) {
dstRow[x * 4] = b0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = r0;
dstRow[x * 4 + 3] = 255;
dstRow[(x + 1) * 4] = b1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = r1;
dstRow[(x + 1) * 4 + 3] = 255;
} else {
dstRow[x * 4] = r0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = b0;
dstRow[x * 4 + 3] = 255;
dstRow[(x + 1) * 4] = r1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = b1;
dstRow[(x + 1) * 4 + 3] = 255;
}
}
}
}
template <bool isBGR>
void _yuyvToRgb_neon_imp(const uint8_t* src, int srcStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
const bool is601 = (flag & ConvertFlag::BT601) != 0;
const bool isFullRange = (flag & ConvertFlag::FullRange) != 0;
int cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* srcRow = src + y * srcStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
if (width >= 16) {
for (; x + 16 <= width; x += 16) {
uint8x16_t yuv1 = vld1q_u8(srcRow + x * 2); uint8x16_t yuv2 = vld1q_u8(srcRow + x * 2 + 16);
uint8x8_t y_low = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0E0C0A0806040200ULL)); uint8x8_t y_high = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0E0C0A0806040200ULL)); uint8x16_t y_vals = vcombine_u8(y_low, y_high);
uint8x8_t u_packed = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0D0D090905050101ULL)); uint8x8_t u_packed2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0D0D090905050101ULL)); uint8x16_t u_vals = vcombine_u8(u_packed, u_packed2);
uint8x8_t v_packed = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0F0F0B0B07070303ULL)); uint8x8_t v_packed2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0F0F0B0B07070303ULL)); uint8x16_t v_vals = vcombine_u8(v_packed, v_packed2);
int16x8_t y_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t y_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t u_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u_vals))), vdupq_n_s16(128));
int16x8_t u_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u_vals))), vdupq_n_s16(128));
int16x8_t v_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_vals))), vdupq_n_s16(128));
int16x8_t v_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_vals))), vdupq_n_s16(128));
int16x8_t cy_coeff = vdupq_n_s16(static_cast<int16_t>(cy_i));
int16x8_t cr_coeff = vdupq_n_s16(static_cast<int16_t>(cr_i));
int16x8_t cgu_coeff = vdupq_n_s16(static_cast<int16_t>(cgu_i));
int16x8_t cgv_coeff = vdupq_n_s16(static_cast<int16_t>(cgv_i));
int16x8_t cb_coeff = vdupq_n_s16(static_cast<int16_t>(cb_i));
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, cy_coeff);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, cr_coeff));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, cgu_coeff));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, cgv_coeff));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, cb_coeff));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, cy_coeff);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, cr_coeff));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, cgu_coeff));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, cgv_coeff));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, cb_coeff));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
if constexpr (isBGR) {
uint8x16x3_t bgr;
bgr.val[0] = b8;
bgr.val[1] = g8;
bgr.val[2] = r8;
vst3q_u8(dstRow + x * 3, bgr);
} else {
uint8x16x3_t rgb;
rgb.val[0] = r8;
rgb.val[1] = g8;
rgb.val[2] = b8;
vst3q_u8(dstRow + x * 3, rgb);
}
}
}
for (; x + 2 <= width; x += 2) {
uint8_t y0 = srcRow[x * 2];
uint8_t u = srcRow[x * 2 + 1];
uint8_t y1 = srcRow[x * 2 + 2];
uint8_t v = srcRow[x * 2 + 3];
int y0c = static_cast<int>(y0) - y_offset;
int y1c = static_cast<int>(y1) - y_offset;
int uc = static_cast<int>(u) - 128;
int vc = static_cast<int>(v) - 128;
int r0 = (cy_i * y0c + cr_i * vc + 32) >> 6;
int g0 = (cy_i * y0c - cgu_i * uc - cgv_i * vc + 32) >> 6;
int b0 = (cy_i * y0c + cb_i * uc + 32) >> 6;
int r1 = (cy_i * y1c + cr_i * vc + 32) >> 6;
int g1 = (cy_i * y1c - cgu_i * uc - cgv_i * vc + 32) >> 6;
int b1 = (cy_i * y1c + cb_i * uc + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = b1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = r1;
}
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
if (x + 1 < width) {
dstRow[(x + 1) * 3 + 0] = r1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = b1;
}
}
}
if (x < width) {
uint8_t y0 = srcRow[x * 2];
uint8_t u = srcRow[x * 2 + 1];
uint8_t y1 = srcRow[x * 2 + 2];
uint8_t v = srcRow[x * 2 + 3];
int y0c = static_cast<int>(y0) - y_offset;
int uc = static_cast<int>(u) - 128;
int vc = static_cast<int>(v) - 128;
int r0 = (cy_i * y0c + cr_i * vc + 32) >> 6;
int g0 = (cy_i * y0c - cgu_i * uc - cgv_i * vc + 32) >> 6;
int b0 = (cy_i * y0c + cb_i * uc + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
}
}
}
}
template <bool isBGRA>
void _uyvyToRgba_neon_imp(const uint8_t* src, int srcStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
const bool is601 = (flag & ConvertFlag::BT601) != 0;
const bool isFullRange = (flag & ConvertFlag::FullRange) != 0;
int cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* srcRow = src + y * srcStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
if (width >= 16) {
for (; x + 16 <= width; x += 16) {
uint8x16_t yuv1 = vld1q_u8(srcRow + x * 2);
uint8x16_t yuv2 = vld1q_u8(srcRow + x * 2 + 16);
uint8x8_t y_low = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0F0D0B0907050301ULL));
uint8x8_t y_high = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0F0D0B0907050301ULL));
uint8x16_t y_vals = vcombine_u8(y_low, y_high);
uint8x8_t u_packed1 = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0C0C080804040000ULL));
uint8x8_t u_packed2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0C0C080804040000ULL));
uint8x16_t u_vals = vcombine_u8(u_packed1, u_packed2);
uint8x8_t v_packed1 = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0E0E0A0A06060202ULL));
uint8x8_t v_packed2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0E0E0A0A06060202ULL));
uint8x16_t v_vals = vcombine_u8(v_packed1, v_packed2);
int16x8_t y_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t y_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t u_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u_vals))), vdupq_n_s16(128));
int16x8_t u_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u_vals))), vdupq_n_s16(128));
int16x8_t v_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_vals))), vdupq_n_s16(128));
int16x8_t v_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_vals))), vdupq_n_s16(128));
int16x8_t cy_c = vdupq_n_s16(static_cast<int16_t>(cy_i));
int16x8_t cr_c = vdupq_n_s16(static_cast<int16_t>(cr_i));
int16x8_t cgu_c = vdupq_n_s16(static_cast<int16_t>(cgu_i));
int16x8_t cgv_c = vdupq_n_s16(static_cast<int16_t>(cgv_i));
int16x8_t cb_c = vdupq_n_s16(static_cast<int16_t>(cb_i));
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, cy_c);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, cr_c));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, cgu_c));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, cgv_c));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, cb_c));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, cy_c);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, cr_c));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, cgu_c));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, cgv_c));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, cb_c));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16_t a8 = vdupq_n_u8(255);
uint8x16x4_t px;
if constexpr (isBGRA) {
px.val[0] = b8;
px.val[1] = g8;
px.val[2] = r8;
px.val[3] = a8;
} else {
px.val[0] = r8;
px.val[1] = g8;
px.val[2] = b8;
px.val[3] = a8;
}
vst4q_u8(dstRow + x * 4, px);
}
}
for (; x + 2 <= width; x += 2) {
uint8_t u = srcRow[x * 2];
uint8_t y0 = srcRow[x * 2 + 1];
uint8_t v = srcRow[x * 2 + 2];
uint8_t y1 = srcRow[x * 2 + 3];
int c_y0 = static_cast<int>(y0) - y_offset;
int c_y1 = static_cast<int>(y1) - y_offset;
int c_u = static_cast<int>(u) - 128;
int c_v = static_cast<int>(v) - 128;
int r0 = (cy_i * c_y0 + cr_i * c_v + 32) >> 6;
int g0 = (cy_i * c_y0 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b0 = (cy_i * c_y0 + cb_i * c_u + 32) >> 6;
int r1 = (cy_i * c_y1 + cr_i * c_v + 32) >> 6;
int g1 = (cy_i * c_y1 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b1 = (cy_i * c_y1 + cb_i * c_u + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGRA) {
dstRow[x * 4 + 0] = b0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = r0;
dstRow[x * 4 + 3] = 255;
dstRow[(x + 1) * 4 + 0] = b1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = r1;
dstRow[(x + 1) * 4 + 3] = 255;
} else {
dstRow[x * 4 + 0] = r0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = b0;
dstRow[x * 4 + 3] = 255;
dstRow[(x + 1) * 4 + 0] = r1;
dstRow[(x + 1) * 4 + 1] = g1;
dstRow[(x + 1) * 4 + 2] = b1;
dstRow[(x + 1) * 4 + 3] = 255;
}
}
if (x < width) {
uint8_t u = srcRow[x * 2 + 0];
uint8_t y0 = srcRow[x * 2 + 1];
uint8_t v = srcRow[x * 2 + 2];
int c_y0 = static_cast<int>(y0) - y_offset;
int c_u = static_cast<int>(u) - 128;
int c_v = static_cast<int>(v) - 128;
int r0 = (cy_i * c_y0 + cr_i * c_v + 32) >> 6;
int g0 = (cy_i * c_y0 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b0 = (cy_i * c_y0 + cb_i * c_u + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
if constexpr (isBGRA) {
dstRow[x * 4 + 0] = b0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = r0;
dstRow[x * 4 + 3] = 255;
} else {
dstRow[x * 4 + 0] = r0;
dstRow[x * 4 + 1] = g0;
dstRow[x * 4 + 2] = b0;
dstRow[x * 4 + 3] = 255;
}
}
}
}
template <bool isBGR>
void _uyvyToRgb_neon_imp(const uint8_t* src, int srcStride,
uint8_t* dst, int dstStride,
int width, int height, ConvertFlag flag) {
if (height < 0) {
height = -height;
dst = dst + (height - 1) * dstStride;
dstStride = -dstStride;
}
const bool is601 = (flag & ConvertFlag::BT601) != 0;
const bool isFullRange = (flag & ConvertFlag::FullRange) != 0;
int cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset;
getYuvToRgbCoefficients_neon(is601, isFullRange, cy_i, cr_i, cgu_i, cgv_i, cb_i, y_offset);
for (int y = 0; y < height; ++y) {
const uint8_t* srcRow = src + y * srcStride;
uint8_t* dstRow = dst + y * dstStride;
int x = 0;
if (width >= 16) {
for (; x + 16 <= width; x += 16) {
uint8x16_t yuv1 = vld1q_u8(srcRow + x * 2);
uint8x16_t yuv2 = vld1q_u8(srcRow + x * 2 + 16);
uint8x8_t y_low = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0F0D0B0907050301ULL));
uint8x8_t y_high = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0F0D0B0907050301ULL));
uint8x16_t y_vals = vcombine_u8(y_low, y_high);
uint8x8_t u_p1 = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0C0C080804040000ULL));
uint8x8_t u_p2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0C0C080804040000ULL));
uint8x16_t u_vals = vcombine_u8(u_p1, u_p2);
uint8x8_t v_p1 = vtbl2_u8({ vget_low_u8(yuv1), vget_high_u8(yuv1) },
vcreate_u8(0x0E0E0A0A06060202ULL));
uint8x8_t v_p2 = vtbl2_u8({ vget_low_u8(yuv2), vget_high_u8(yuv2) },
vcreate_u8(0x0E0E0A0A06060202ULL));
uint8x16_t v_vals = vcombine_u8(v_p1, v_p2);
int16x8_t y_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t y_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals))), vdupq_n_s16(y_offset));
int16x8_t u_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u_vals))), vdupq_n_s16(128));
int16x8_t u_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u_vals))), vdupq_n_s16(128));
int16x8_t v_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_vals))), vdupq_n_s16(128));
int16x8_t v_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_vals))), vdupq_n_s16(128));
int16x8_t cy_c = vdupq_n_s16(static_cast<int16_t>(cy_i));
int16x8_t cr_c = vdupq_n_s16(static_cast<int16_t>(cr_i));
int16x8_t cgu_c = vdupq_n_s16(static_cast<int16_t>(cgu_i));
int16x8_t cgv_c = vdupq_n_s16(static_cast<int16_t>(cgv_i));
int16x8_t cb_c = vdupq_n_s16(static_cast<int16_t>(cb_i));
int16x8_t c32 = vdupq_n_s16(32);
int16x8_t y_scaled_lo = vmulq_s16(y_lo, cy_c);
int16x8_t r_lo = vaddq_s16(y_scaled_lo, vmulq_s16(v_lo, cr_c));
r_lo = vshrq_n_s16(vaddq_s16(r_lo, c32), 6);
int16x8_t g_lo = vsubq_s16(y_scaled_lo, vmulq_s16(u_lo, cgu_c));
g_lo = vsubq_s16(g_lo, vmulq_s16(v_lo, cgv_c));
g_lo = vshrq_n_s16(vaddq_s16(g_lo, c32), 6);
int16x8_t b_lo = vaddq_s16(y_scaled_lo, vmulq_s16(u_lo, cb_c));
b_lo = vshrq_n_s16(vaddq_s16(b_lo, c32), 6);
int16x8_t y_scaled_hi = vmulq_s16(y_hi, cy_c);
int16x8_t r_hi = vaddq_s16(y_scaled_hi, vmulq_s16(v_hi, cr_c));
r_hi = vshrq_n_s16(vaddq_s16(r_hi, c32), 6);
int16x8_t g_hi = vsubq_s16(y_scaled_hi, vmulq_s16(u_hi, cgu_c));
g_hi = vsubq_s16(g_hi, vmulq_s16(v_hi, cgv_c));
g_hi = vshrq_n_s16(vaddq_s16(g_hi, c32), 6);
int16x8_t b_hi = vaddq_s16(y_scaled_hi, vmulq_s16(u_hi, cb_c));
b_hi = vshrq_n_s16(vaddq_s16(b_hi, c32), 6);
uint8x8_t r8_lo = vqmovun_s16(r_lo);
uint8x8_t g8_lo = vqmovun_s16(g_lo);
uint8x8_t b8_lo = vqmovun_s16(b_lo);
uint8x8_t r8_hi = vqmovun_s16(r_hi);
uint8x8_t g8_hi = vqmovun_s16(g_hi);
uint8x8_t b8_hi = vqmovun_s16(b_hi);
uint8x16_t r8 = vcombine_u8(r8_lo, r8_hi);
uint8x16_t g8 = vcombine_u8(g8_lo, g8_hi);
uint8x16_t b8 = vcombine_u8(b8_lo, b8_hi);
uint8x16x3_t px3;
if constexpr (isBGR) {
px3.val[0] = b8;
px3.val[1] = g8;
px3.val[2] = r8;
} else {
px3.val[0] = r8;
px3.val[1] = g8;
px3.val[2] = b8;
}
vst3q_u8(dstRow + x * 3, px3);
}
}
for (; x + 2 <= width; x += 2) {
uint8_t u = srcRow[x * 2];
uint8_t y0 = srcRow[x * 2 + 1];
uint8_t v = srcRow[x * 2 + 2];
uint8_t y1 = srcRow[x * 2 + 3];
int c_y0 = static_cast<int>(y0) - y_offset;
int c_y1 = static_cast<int>(y1) - y_offset;
int c_u = static_cast<int>(u) - 128;
int c_v = static_cast<int>(v) - 128;
int r0 = (cy_i * c_y0 + cr_i * c_v + 32) >> 6;
int g0 = (cy_i * c_y0 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b0 = (cy_i * c_y0 + cb_i * c_u + 32) >> 6;
int r1 = (cy_i * c_y1 + cr_i * c_v + 32) >> 6;
int g1 = (cy_i * c_y1 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b1 = (cy_i * c_y1 + cb_i * c_u + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
r1 = r1 < 0 ? 0 : (r1 > 255 ? 255 : r1);
g1 = g1 < 0 ? 0 : (g1 > 255 ? 255 : g1);
b1 = b1 < 0 ? 0 : (b1 > 255 ? 255 : b1);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
dstRow[(x + 1) * 3 + 0] = b1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = r1;
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
dstRow[(x + 1) * 3 + 0] = r1;
dstRow[(x + 1) * 3 + 1] = g1;
dstRow[(x + 1) * 3 + 2] = b1;
}
}
if (x < width) {
uint8_t u = srcRow[x * 2 + 0];
uint8_t y0 = srcRow[x * 2 + 1];
uint8_t v = srcRow[x * 2 + 2];
int c_y0 = static_cast<int>(y0) - y_offset;
int c_u = static_cast<int>(u) - 128;
int c_v = static_cast<int>(v) - 128;
int r0 = (cy_i * c_y0 + cr_i * c_v + 32) >> 6;
int g0 = (cy_i * c_y0 - cgu_i * c_u - cgv_i * c_v + 32) >> 6;
int b0 = (cy_i * c_y0 + cb_i * c_u + 32) >> 6;
r0 = r0 < 0 ? 0 : (r0 > 255 ? 255 : r0);
g0 = g0 < 0 ? 0 : (g0 > 255 ? 255 : g0);
b0 = b0 < 0 ? 0 : (b0 > 255 ? 255 : b0);
if constexpr (isBGR) {
dstRow[x * 3 + 0] = b0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = r0;
} else {
dstRow[x * 3 + 0] = r0;
dstRow[x * 3 + 1] = g0;
dstRow[x * 3 + 2] = b0;
}
}
}
}
/// YUYV to 3-byte BGR: forwards every argument unchanged to
/// _yuyvToRgb_neon_imp instantiated with isBGR = true.
void yuyvToBgr24_neon(const uint8_t* src, int srcStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = true;
    _yuyvToRgb_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// YUYV to 3-byte RGB: forwards every argument unchanged to
/// _yuyvToRgb_neon_imp instantiated with isBGR = false.
void yuyvToRgb24_neon(const uint8_t* src, int srcStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = false;
    _yuyvToRgb_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// YUYV to 4-byte BGRA entry point: forwards every argument unchanged to
/// _yuyvToRgba_neon_imp with its boolean template argument set to true.
void yuyvToBgra32_neon(const uint8_t* src, int srcStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = true;
    _yuyvToRgba_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// YUYV to 4-byte RGBA entry point: forwards every argument unchanged to
/// _yuyvToRgba_neon_imp with its boolean template argument set to false.
void yuyvToRgba32_neon(const uint8_t* src, int srcStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = false;
    _yuyvToRgba_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// UYVY to 3-byte BGR: forwards every argument unchanged to
/// _uyvyToRgb_neon_imp instantiated with isBGR = true.
void uyvyToBgr24_neon(const uint8_t* src, int srcStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = true;
    _uyvyToRgb_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// UYVY to 3-byte RGB: forwards every argument unchanged to
/// _uyvyToRgb_neon_imp instantiated with isBGR = false.
void uyvyToRgb24_neon(const uint8_t* src, int srcStride,
                      uint8_t* dst, int dstStride,
                      int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = false;
    _uyvyToRgb_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// UYVY to 4-byte BGRA (alpha 255): forwards every argument unchanged to
/// _uyvyToRgba_neon_imp instantiated with isBGRA = true.
void uyvyToBgra32_neon(const uint8_t* src, int srcStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = true;
    _uyvyToRgba_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
/// UYVY to 4-byte RGBA (alpha 255): forwards every argument unchanged to
/// _uyvyToRgba_neon_imp instantiated with isBGRA = false.
void uyvyToRgba32_neon(const uint8_t* src, int srcStride,
                       uint8_t* dst, int dstStride,
                       int width, int height, ConvertFlag flag) {
    constexpr bool kBlueFirst = false;
    _uyvyToRgba_neon_imp<kBlueFirst>(src, srcStride, dst, dstStride, width, height, flag);
}
#endif }