#include "ccap_convert_avx2.h"
#include <cassert>
#include <cstring>
#if ENABLE_AVX2_IMP
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#define AVX2_TARGET __attribute__((target("avx2,fma")))
#else
#define AVX2_TARGET
#endif
#include <immintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>
/// Runtime AVX2 probe for MSVC builds.
///
/// Returns true only when all of the following hold:
///   1. CPUID leaf 1 reports OSXSAVE (ECX bit 27) and AVX (ECX bit 28),
///   2. XCR0 has bits 1 and 2 set (mask 0x6), i.e. the OS saves the wide registers,
///   3. CPUID leaf 7 / sub-leaf 0 exists and reports AVX2 (EBX bit 5).
inline bool hasAVX2_() {
    int cpuInfo[4] = { 0, 0, 0, 0 };

    // Leaf 0 returns the highest supported basic leaf in EAX; validate it before
    // querying leaves 1 and 7, mirroring the checks done in the GCC/Clang branch.
    __cpuid(cpuInfo, 0);
    const int maxBasicLeaf = cpuInfo[0];
    if (maxBasicLeaf < 1) return false;

    __cpuid(cpuInfo, 1);
    const bool osxsave = (cpuInfo[2] & (1 << 27)) != 0;
    const bool avx = (cpuInfo[2] & (1 << 28)) != 0;
    if (!(osxsave && avx)) return false;

    // XGETBV is only legal once OSXSAVE has been confirmed above.
    const unsigned long long xcrFeatureMask = _xgetbv(0);
    if ((xcrFeatureMask & 0x6) != 0x6) return false;

    if (maxBasicLeaf < 7) return false;
    // Leaf 7 is sub-leafed: request sub-leaf 0 explicitly.
    __cpuidex(cpuInfo, 7, 0);
    return (cpuInfo[1] & (1 << 5)) != 0;
}
#elif defined(__GNUC__) || defined(__clang__)
#include <cpuid.h>
/// Runtime AVX2 probe for GCC/Clang builds.
///
/// The answer is true only if the CPU advertises OSXSAVE and AVX (leaf 1),
/// XCR0 has both bits of mask 0x6 set, and leaf 7 / sub-leaf 0 advertises
/// AVX2 (EBX bit 5). Any failing CPUID query yields false.
inline bool hasAVX2_() {
    unsigned int regA = 0, regB = 0, regC = 0, regD = 0;

    // Leaf 0: EAX holds the highest supported basic leaf.
    if (__get_cpuid(0, &regA, &regB, &regC, &regD) == 0) return false;
    if (regA < 1) return false;

    // Leaf 1: feature bits live in ECX.
    if (__get_cpuid(1, &regA, &regB, &regC, &regD) == 0) return false;
    const unsigned int kOsxsaveBit = 1u << 27;
    const unsigned int kAvxBit = 1u << 28;
    if ((regC & kOsxsaveBit) == 0 || (regC & kAvxBit) == 0) return false;

    // Read XCR0 (index 0 in ECX); only the low half is inspected.
    unsigned int xcr0Low = 0, xcr0High = 0;
    asm volatile("xgetbv"
                 : "=a"(xcr0Low), "=d"(xcr0High)
                 : "c"(0));
    const unsigned int kRequiredState = 0x6;
    if ((xcr0Low & kRequiredState) != kRequiredState) return false;

    // Re-read leaf 0 and make sure leaf 7 exists before querying it.
    if (__get_cpuid(0, &regA, &regB, &regC, &regD) == 0) return false;
    if (regA < 7) return false;

    if (__get_cpuid_count(7, 0, &regA, &regB, &regC, &regD) == 0) return false;
    const unsigned int kAvx2Bit = 1u << 5;
    return (regB & kAvx2Bit) != 0;
}
#else
/// Fallback for compilers with no CPUID path in this file: AVX2 is reported as absent.
inline bool hasAVX2_() {
    return false;
}
#endif
#endif
namespace ccap {
// Software switch for the AVX2 code paths. Written by enableAVX2(), read by
// canUseAVX2() and getAVX2SupportInfo(). Starts out enabled.
bool sEnableAVX2 = true;
/// Sets the software AVX2 switch.
/// @param enable new value for the switch.
/// @return the result of hasAVX2(); this does not depend on `enable`.
bool enableAVX2(bool enable) {
    sEnableAVX2 = enable;
    const bool hardwareSupported = hasAVX2();
    return hardwareSupported;
}
/// Reports whether AVX2 is usable on this machine.
/// When ENABLE_AVX2_IMP is on, hasAVX2_() is evaluated once on first use and the
/// result is reused afterwards; otherwise the answer is always false.
bool hasAVX2() {
#if ENABLE_AVX2_IMP
    static const bool kProbeResult = hasAVX2_();
    return kProbeResult;
#else
    return false;
#endif
}
/// True when AVX2 is both available (hasAVX2()) and switched on (sEnableAVX2).
bool canUseAVX2() {
    if (!hasAVX2()) {
        return false;
    }
    return sEnableAVX2;
}
/// Returns a human-readable description of the current AVX2 status.
///
/// The message is derived from the current state on every call, so it follows
/// later enableAVX2() toggles. Caching the first answer would report a stale
/// "enabled"/"disabled" state after the switch changes.
///
/// @return pointer to a string literal (static storage; never null).
const char* getAVX2SupportInfo() {
#if ENABLE_AVX2_IMP
    if (!hasAVX2()) {
        return "AVX2: Not supported by hardware or OS";
    }
    return sEnableAVX2 ? "AVX2: Hardware supported and enabled"
                       : "AVX2: Hardware supported but disabled by software";
#else
    return "AVX2: Disabled at compile time";
#endif
}
#if ENABLE_AVX2_IMP
/**
 * Re-packs 8-bit interleaved pixels: optionally swaps channel 0 with channel 2,
 * and/or converts between 3 and 4 bytes per pixel.
 *
 * @tparam inputChannels  bytes per source pixel (3 or 4).
 * @tparam outputChannels bytes per destination pixel (3 or 4).
 * @tparam swapRB         non-zero to exchange byte 0 and byte 2 of every pixel.
 *
 * @param src        first source row.
 * @param srcStride  byte distance between source rows.
 * @param dst        first destination row.
 * @param dstStride  byte distance between destination rows.
 * @param width      pixels per row; non-positive widths are a no-op.
 * @param height     row count; a negative value writes the rows bottom-up into dst.
 *
 * When a 4th output byte is produced from 3-byte input it is set to 0xFF.
 * Neither src nor dst needs any particular alignment.
 */
template <int inputChannels, int outputChannels, int swapRB>
AVX2_TARGET void colorShuffle_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width,
                                   int height) {
    static_assert((inputChannels == 3 || inputChannels == 4) && (outputChannels == 3 || outputChannels == 4),
                  "inputChannels and outputChannels must be 3 or 4");
    static_assert(inputChannels != outputChannels || swapRB, "swapRB must be true when inputChannels == outputChannels");

    // A negative width would become a huge unsigned bound below.
    if (width <= 0) {
        return;
    }

    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    // Byte-shuffle table. Zero-initialised so the full-register loads below never
    // read indeterminate bytes (the 3->3 case only fills 15 of the first 16 entries).
    alignas(32) uint8_t shuffleData[32] = {};

    constexpr uint32_t inputPatchSize = inputChannels == 4 ? 8 : (inputChannels == 3 && outputChannels == 3 ? 5 : 10);
    constexpr uint32_t outputPatchSize = outputChannels == 4 ? 8 : (inputChannels == 3 && outputChannels == 3 ? 5 : 10);
    // Pixels consumed per vector iteration.
    constexpr uint32_t patchSize = inputPatchSize < outputPatchSize ? inputPatchSize : outputPatchSize;

    for (uint32_t i = 0; i < patchSize; ++i) {
        const uint32_t idx1 = i * outputChannels; // first output byte of pixel i
        const uint32_t idx2 = i * inputChannels;  // first input byte of pixel i

        shuffleData[idx1 + 0] = static_cast<uint8_t>(idx2 + (swapRB ? 2 : 0));
        shuffleData[idx1 + 1] = static_cast<uint8_t>(idx2 + 1);
        shuffleData[idx1 + 2] = static_cast<uint8_t>(idx2 + (swapRB ? 0 : 2));

        if constexpr (outputChannels == 4) {
            if constexpr (inputChannels == 4) {
                shuffleData[idx1 + 3] = static_cast<uint8_t>(idx2 + 3);
            } else {
                // High bit set: the shuffle writes zero here; alpha is OR-ed in later.
                shuffleData[idx1 + 3] = 0xFF;
            }
        }
    }

    __m256i shuffle256 = _mm256_setzero_si256();
    __m128i shuffle128 = _mm_setzero_si128();
    if constexpr (inputChannels == 4 && outputChannels == 4) {
        shuffle256 = _mm256_load_si256(reinterpret_cast<const __m256i*>(shuffleData));
    } else {
        shuffle128 = _mm_load_si128(reinterpret_cast<const __m128i*>(shuffleData));
    }

    // The 16-byte loads of 3-byte pixels read past the pixels actually converted,
    // so the vector loop must stop early enough to stay inside the row.
    constexpr uint32_t loopBoundary = (inputChannels == 3 && outputChannels == 4) ? (patchSize + 2) :
                                      (inputChannels == 3 && outputChannels == 3) ? (patchSize + 1) :
                                                                                    patchSize;

    const uint32_t rowPixels = static_cast<uint32_t>(width);

    for (int y = 0; y < height; ++y) {
        const uint8_t* srcRow = src + y * srcStride;
        uint8_t* dstRow = dst + y * dstStride;

        uint32_t x = 0;
        while (x + loopBoundary <= rowPixels) {
            const uint8_t* in = srcRow + x * inputChannels;
            uint8_t* out = dstRow + x * outputChannels;

            if constexpr (outputChannels == 4 && inputChannels == 3) {
                // Two groups of 4 pixels (12 input bytes each) -> 2 x 16 output bytes.
                const __m128i pixels_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
                const __m128i pixels_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + 12));
                const __m128i alpha_mask = _mm_set1_epi32(static_cast<int>(0xFF000000u));
                const __m128i result_lo = _mm_or_si128(_mm_shuffle_epi8(pixels_lo, shuffle128), alpha_mask);
                const __m128i result_hi = _mm_or_si128(_mm_shuffle_epi8(pixels_hi, shuffle128), alpha_mask);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out), result_lo);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 16), result_hi);
            } else if constexpr (outputChannels == 3 && inputChannels == 4) {
                // Unaligned loads: the source row pointer carries no alignment guarantee.
                const __m128i pixels_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
                const __m128i pixels_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + 16));
                const __m128i result_lo = _mm_shuffle_epi8(pixels_lo, shuffle128);
                const __m128i result_hi = _mm_shuffle_epi8(pixels_hi, shuffle128);

                // Only the first 12 bytes of each result are valid. The 4 junk bytes of
                // result_lo are overwritten by the 12-byte copy of result_hi.
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out), result_lo);
                alignas(16) uint8_t remainBuffer[16];
                _mm_store_si128(reinterpret_cast<__m128i*>(remainBuffer), result_hi);
                std::memcpy(out + 12, remainBuffer, 12);
            } else if constexpr (inputChannels == 3 && outputChannels == 3) {
                // 5 pixels per step; the 16th stored byte is rewritten by the next step
                // or by the scalar tail.
                const __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
                const __m128i result = _mm_shuffle_epi8(pixels, shuffle128);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out), result);
            } else {
                // 4 -> 4: eight pixels per 256-bit register, shuffled within each 128-bit lane.
                const __m256i pixels = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
                const __m256i result = _mm256_shuffle_epi8(pixels, shuffle256);
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), result);
            }
            x += patchSize;
        }

        // Scalar tail: entry c of the table (pixel 0) is the source channel for output channel c.
        for (; x < rowPixels; ++x) {
            for (int c = 0; c < outputChannels; ++c) {
                if (inputChannels == 3 && c == 3) {
                    dstRow[x * outputChannels + c] = 0xFF;
                } else {
                    assert(shuffleData[c] <= 3);
                    dstRow[x * outputChannels + c] = srcRow[x * inputChannels + shuffleData[c]];
                }
            }
        }
    }
}
// Explicit instantiations of colorShuffle_avx2 for the <inputChannels, outputChannels, swapRB>
// combinations emitted by this translation unit.
template void colorShuffle_avx2<4, 4, true>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
template void colorShuffle_avx2<4, 3, true>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
template void colorShuffle_avx2<4, 3, false>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
template void colorShuffle_avx2<3, 4, true>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
template void colorShuffle_avx2<3, 4, false>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
template void colorShuffle_avx2<3, 3, true>(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height);
/// Looks up the integer YUV->RGB multipliers for a colour matrix / range combination.
///
/// @param isBT601     true selects the BT.601 set, false the other set.
/// @param isFullRange true selects the full-range set, false the limited-range set.
/// @param cy   [out] multiplier for luma.
/// @param cr   [out] multiplier of V in the red channel.
/// @param cgu  [out] multiplier of U subtracted in the green channel.
/// @param cgv  [out] multiplier of V subtracted in the green channel.
/// @param cb   [out] multiplier of U in the blue channel.
///
/// The kernels in this file apply these with a rounding add of 32 and a right shift by 6.
inline void getYuvToRgbCoefficients(bool isBT601, bool isFullRange, int& cy, int& cr, int& cgu, int& cgv, int& cb) {
    struct Coefficients {
        int y;
        int r;
        int gu;
        int gv;
        int b;
    };

    // Indexed as [isBT601][isFullRange].
    static constexpr Coefficients kTable[2][2] = {
        {
            { 75, 115, 14, 34, 135 }, // not BT.601, limited range
            { 64, 101, 12, 30, 119 }, // not BT.601, full range
        },
        {
            { 75, 102, 25, 52, 129 }, // BT.601, limited range
            { 64, 88, 22, 45, 111 },  // BT.601, full range
        },
    };

    const Coefficients& selected = kTable[isBT601 ? 1 : 0][isFullRange ? 1 : 0];
    cy = selected.y;
    cr = selected.r;
    cgu = selected.gu;
    cgv = selected.gv;
    cb = selected.b;
}
/**
 * Converts a Y plane plus an interleaved U/V plane (U first, one U/V row per two Y rows)
 * into 4-byte pixels with alpha fixed at 255.
 *
 * @tparam isBGRA      true writes B,G,R,A byte order; false writes R,G,B,A.
 * @tparam isFullRange false subtracts 16 from Y before scaling.
 * @param srcY, srcYStride    luma plane and its row pitch in bytes.
 * @param srcUV, srcUVStride  interleaved chroma plane and its row pitch in bytes.
 * @param dst, dstStride      destination and its row pitch in bytes.
 * @param width   pixels per row. An odd width is supported; the last pixel uses the final U/V pair.
 * @param height  row count; negative writes the rows bottom-up into dst.
 * @param is601   forwarded to getYuvToRgbCoefficients / getYuvToRgbFunc.
 */
template <bool isBGRA, bool isFullRange>
AVX2_TARGET void nv12ToRgbaColor_avx2_imp(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride,
                                          int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    const __m256i c32 = _mm256_set1_epi16(32); // rounding term for the >> 6 below
    const __m256i zero = _mm256_setzero_si256();
    const __m256i maxv = _mm256_set1_epi16(255);
    const __m128i a8 = _mm_set1_epi8((char)255);

    // Scalar converter for the pixels the 16-wide loop cannot cover.
    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* yRow = srcY + y * srcYStride;
        const uint8_t* uvRow = srcUV + (y / 2) * srcUVStride;
        uint8_t* dstRow = dst + y * dstStride;

        int x = 0;
        for (; x + 16 <= width; x += 16) {
            const __m128i y_vals = _mm_loadu_si128((const __m128i*)(yRow + x));
            const __m128i uv_vals = _mm_loadu_si128((const __m128i*)(uvRow + x));

            // Split U (even bytes) and V (odd bytes), then duplicate each sample for its two pixels.
            __m128i u8 = _mm_and_si128(uv_vals, _mm_set1_epi16(0x00FF));
            __m128i v8 = _mm_srli_epi16(uv_vals, 8);
            u8 = _mm_packus_epi16(u8, _mm_setzero_si128());
            v8 = _mm_packus_epi16(v8, _mm_setzero_si128());
            const __m128i u_dup = _mm_unpacklo_epi8(u8, u8);
            const __m128i v_dup = _mm_unpacklo_epi8(v8, v8);

            __m256i u_16 = _mm256_cvtepu8_epi16(u_dup);
            __m256i v_16 = _mm256_cvtepu8_epi16(v_dup);
            __m256i y_16 = _mm256_cvtepu8_epi16(y_vals);

            u_16 = _mm256_sub_epi16(u_16, c128);
            v_16 = _mm256_sub_epi16(v_16, c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, _mm256_set1_epi16(16));
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds: with the limited-range coefficients the blue sum can exceed
            // INT16_MAX (e.g. 239*75 + 127*129 = 34308). A wrapping add would turn that
            // into a negative value that then clamps to 0 instead of 255.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, c32), 6);

            __m256i g = _mm256_sub_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_sub_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, c32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, c32), 6);

            r = _mm256_max_epi16(zero, _mm256_min_epi16(r, maxv));
            g = _mm256_max_epi16(zero, _mm256_min_epi16(g, maxv));
            b = _mm256_max_epi16(zero, _mm256_min_epi16(b, maxv));

            const __m128i r8 = _mm_packus_epi16(_mm256_castsi256_si128(r), _mm256_extracti128_si256(r, 1));
            const __m128i g8 = _mm_packus_epi16(_mm256_castsi256_si128(g), _mm256_extracti128_si256(g, 1));
            const __m128i b8 = _mm_packus_epi16(_mm256_castsi256_si128(b), _mm256_extracti128_si256(b, 1));

            // Interleave planar bytes into 4-byte pixels: (c0,g) and (c2,a) pairs, then 16-bit merge.
            const __m128i c0 = isBGRA ? b8 : r8;
            const __m128i c2 = isBGRA ? r8 : b8;
            const __m128i c0g_lo = _mm_unpacklo_epi8(c0, g8);
            const __m128i c2a_lo = _mm_unpacklo_epi8(c2, a8);
            const __m128i c0g_hi = _mm_unpackhi_epi8(c0, g8);
            const __m128i c2a_hi = _mm_unpackhi_epi8(c2, a8);

            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 0), _mm_unpacklo_epi16(c0g_lo, c2a_lo));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 16), _mm_unpackhi_epi16(c0g_lo, c2a_lo));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 32), _mm_unpacklo_epi16(c0g_hi, c2a_hi));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 48), _mm_unpackhi_epi16(c0g_hi, c2a_hi));
        }

        // Scalar tail, one U/V pair at a time. For an odd width only the first pixel of the
        // last pair exists, so the second one must not be read or written.
        for (; x < width; x += 2) {
            int u = uvRow[x];
            int v = uvRow[x + 1];
            const int pixelsInPair = (x + 1 < width) ? 2 : 1;
            for (int p = 0; p < pixelsInPair; ++p) {
                int luma = yRow[x + p];
                int r, g, b;
                convertFunc(luma, u, v, r, g, b);
                uint8_t* px = dstRow + (x + p) * 4;
                px[0] = isBGRA ? b : r;
                px[1] = g;
                px[2] = isBGRA ? r : b;
                px[3] = 255;
            }
        }
    }
}
/**
 * Converts a Y plane plus an interleaved U/V plane (U first, one U/V row per two Y rows)
 * into 3-byte pixels.
 *
 * @tparam isBGR       true writes B,G,R byte order; false writes R,G,B.
 * @tparam isFullRange false subtracts 16 from Y before scaling.
 * @param srcY, srcYStride    luma plane and its row pitch in bytes.
 * @param srcUV, srcUVStride  interleaved chroma plane and its row pitch in bytes.
 * @param dst, dstStride      destination and its row pitch in bytes.
 * @param width   pixels per row. An odd width is supported; the last pixel uses the final U/V pair.
 * @param height  row count; negative writes the rows bottom-up into dst.
 * @param is601   forwarded to getYuvToRgbCoefficients / getYuvToRgbFunc.
 */
template <bool isBGR, bool isFullRange>
AVX2_TARGET void _nv12ToRgbColor_avx2_imp(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride,
                                          int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    const __m256i c32 = _mm256_set1_epi16(32); // rounding term for the >> 6 below
    const __m256i zero = _mm256_setzero_si256();
    const __m256i maxv = _mm256_set1_epi16(255);

    // Scalar converter for the pixels the 16-wide loop cannot cover.
    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* yRow = srcY + y * srcYStride;
        const uint8_t* uvRow = srcUV + (y / 2) * srcUVStride;
        uint8_t* dstRow = dst + y * dstStride;

        int x = 0;
        for (; x + 16 <= width; x += 16) {
            const __m128i y_vals = _mm_loadu_si128((const __m128i*)(yRow + x));
            const __m128i uv_vals = _mm_loadu_si128((const __m128i*)(uvRow + x));

            // Split U (even bytes) and V (odd bytes), then duplicate each sample for its two pixels.
            __m128i u8 = _mm_and_si128(uv_vals, _mm_set1_epi16(0x00FF));
            __m128i v8 = _mm_srli_epi16(uv_vals, 8);
            u8 = _mm_packus_epi16(u8, _mm_setzero_si128());
            v8 = _mm_packus_epi16(v8, _mm_setzero_si128());
            const __m128i u_dup = _mm_unpacklo_epi8(u8, u8);
            const __m128i v_dup = _mm_unpacklo_epi8(v8, v8);

            __m256i u_16 = _mm256_cvtepu8_epi16(u_dup);
            __m256i v_16 = _mm256_cvtepu8_epi16(v_dup);
            __m256i y_16 = _mm256_cvtepu8_epi16(y_vals);

            u_16 = _mm256_sub_epi16(u_16, c128);
            v_16 = _mm256_sub_epi16(v_16, c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, _mm256_set1_epi16(16));
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds: with the limited-range coefficients the blue sum can exceed
            // INT16_MAX (e.g. 239*75 + 127*129 = 34308). A wrapping add would turn that
            // into a negative value that then clamps to 0 instead of 255.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, c32), 6);

            __m256i g = _mm256_sub_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_sub_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, c32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, c32), 6);

            r = _mm256_max_epi16(zero, _mm256_min_epi16(r, maxv));
            g = _mm256_max_epi16(zero, _mm256_min_epi16(g, maxv));
            b = _mm256_max_epi16(zero, _mm256_min_epi16(b, maxv));

            // 3-byte pixels do not map onto a simple unpack, so spill and interleave in scalar code.
            alignas(32) uint16_t b_arr[16], g_arr[16], r_arr[16];
            _mm256_store_si256((__m256i*)b_arr, b);
            _mm256_store_si256((__m256i*)g_arr, g);
            _mm256_store_si256((__m256i*)r_arr, r);

            const uint16_t* c0 = isBGR ? b_arr : r_arr;
            const uint16_t* c2 = isBGR ? r_arr : b_arr;
            uint8_t* px = dstRow + x * 3;
            for (int i = 0; i < 16; ++i, px += 3) {
                px[0] = (uint8_t)c0[i];
                px[1] = (uint8_t)g_arr[i];
                px[2] = (uint8_t)c2[i];
            }
        }

        // Scalar tail, one U/V pair at a time. For an odd width only the first pixel of the
        // last pair exists, so the second one must not be read or written.
        for (; x < width; x += 2) {
            int u = uvRow[x];
            int v = uvRow[x + 1];
            const int pixelsInPair = (x + 1 < width) ? 2 : 1;
            for (int p = 0; p < pixelsInPair; ++p) {
                int luma = yRow[x + p];
                int r, g, b;
                convertFunc(luma, u, v, r, g, b);
                uint8_t* px = dstRow + (x + p) * 3;
                px[0] = isBGR ? b : r;
                px[1] = g;
                px[2] = isBGR ? r : b;
            }
        }
    }
}
/**
 * Converts three separate planes (Y, U, V; one U/V sample per 2x2 block of Y samples)
 * into 4-byte pixels with alpha fixed at 255.
 *
 * @tparam isBGRA      true writes B,G,R,A byte order; false writes R,G,B,A.
 * @tparam isFullRange false subtracts 16 from Y before scaling.
 * @param srcY, srcYStride  luma plane and its row pitch in bytes.
 * @param srcU, srcUStride  U plane and its row pitch in bytes.
 * @param srcV, srcVStride  V plane and its row pitch in bytes.
 * @param dst, dstStride    destination and its row pitch in bytes.
 * @param width   pixels per row. An odd width is supported; the last pixel uses the final U/V sample.
 * @param height  row count; negative writes the rows bottom-up into dst.
 * @param is601   forwarded to getYuvToRgbCoefficients / getYuvToRgbFunc.
 */
template <bool isBGRA, bool isFullRange>
AVX2_TARGET void _i420ToRgba_avx2_imp(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                                      uint8_t* dst, int dstStride, int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    const __m256i c32 = _mm256_set1_epi16(32); // rounding term for the >> 6 below
    const __m256i zero = _mm256_setzero_si256();
    const __m256i maxv = _mm256_set1_epi16(255);
    const __m128i a8 = _mm_set1_epi8((char)255);

    // Scalar converter for the pixels the 16-wide loop cannot cover.
    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* yRow = srcY + y * srcYStride;
        const uint8_t* uRow = srcU + (y / 2) * srcUStride;
        const uint8_t* vRow = srcV + (y / 2) * srcVStride;
        uint8_t* dstRow = dst + y * dstStride;

        int x = 0;
        for (; x + 16 <= width; x += 16) {
            const __m128i y_vals = _mm_loadu_si128((const __m128i*)(yRow + x));
            // 8 chroma samples cover 16 pixels; duplicate each one for its two pixels.
            const __m128i u_half = _mm_loadl_epi64((const __m128i*)(uRow + x / 2));
            const __m128i v_half = _mm_loadl_epi64((const __m128i*)(vRow + x / 2));
            const __m128i u_dup = _mm_unpacklo_epi8(u_half, u_half);
            const __m128i v_dup = _mm_unpacklo_epi8(v_half, v_half);

            __m256i u_16 = _mm256_cvtepu8_epi16(u_dup);
            __m256i v_16 = _mm256_cvtepu8_epi16(v_dup);
            __m256i y_16 = _mm256_cvtepu8_epi16(y_vals);

            u_16 = _mm256_sub_epi16(u_16, c128);
            v_16 = _mm256_sub_epi16(v_16, c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, _mm256_set1_epi16(16));
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds: with the limited-range coefficients the blue sum can exceed
            // INT16_MAX (e.g. 239*75 + 127*129 = 34308). A wrapping add would turn that
            // into a negative value that then clamps to 0 instead of 255.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, c32), 6);

            __m256i g = _mm256_sub_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_sub_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, c32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, c32), 6);

            r = _mm256_max_epi16(zero, _mm256_min_epi16(r, maxv));
            g = _mm256_max_epi16(zero, _mm256_min_epi16(g, maxv));
            b = _mm256_max_epi16(zero, _mm256_min_epi16(b, maxv));

            const __m128i b8 = _mm_packus_epi16(_mm256_castsi256_si128(b), _mm256_extracti128_si256(b, 1));
            const __m128i g8 = _mm_packus_epi16(_mm256_castsi256_si128(g), _mm256_extracti128_si256(g, 1));
            const __m128i r8 = _mm_packus_epi16(_mm256_castsi256_si128(r), _mm256_extracti128_si256(r, 1));

            // Interleave planar bytes into 4-byte pixels: (c0,g) and (c2,a) pairs, then 16-bit merge.
            const __m128i c0 = isBGRA ? b8 : r8;
            const __m128i c2 = isBGRA ? r8 : b8;
            const __m128i c0g_lo = _mm_unpacklo_epi8(c0, g8);
            const __m128i c2a_lo = _mm_unpacklo_epi8(c2, a8);
            const __m128i c0g_hi = _mm_unpackhi_epi8(c0, g8);
            const __m128i c2a_hi = _mm_unpackhi_epi8(c2, a8);

            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 0), _mm_unpacklo_epi16(c0g_lo, c2a_lo));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 16), _mm_unpackhi_epi16(c0g_lo, c2a_lo));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 32), _mm_unpacklo_epi16(c0g_hi, c2a_hi));
            _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 48), _mm_unpackhi_epi16(c0g_hi, c2a_hi));
        }

        // Scalar tail, one chroma sample at a time. For an odd width only the first pixel of
        // the last pair exists, so the second one must not be read or written.
        for (; x < width; x += 2) {
            int u = uRow[x / 2];
            int v = vRow[x / 2];
            const int pixelsInPair = (x + 1 < width) ? 2 : 1;
            for (int p = 0; p < pixelsInPair; ++p) {
                int luma = yRow[x + p];
                int r, g, b;
                convertFunc(luma, u, v, r, g, b);
                uint8_t* px = dstRow + (x + p) * 4;
                px[0] = isBGRA ? b : r;
                px[1] = g;
                px[2] = isBGRA ? r : b;
                px[3] = 255;
            }
        }
    }
}
/**
 * Converts three separate planes (Y, U, V; one U/V sample per 2x2 block of Y samples)
 * into 3-byte pixels.
 *
 * @tparam isBGR       true writes B,G,R byte order; false writes R,G,B.
 * @tparam isFullRange false subtracts 16 from Y before scaling.
 * @param srcY, srcYStride  luma plane and its row pitch in bytes.
 * @param srcU, srcUStride  U plane and its row pitch in bytes.
 * @param srcV, srcVStride  V plane and its row pitch in bytes.
 * @param dst, dstStride    destination and its row pitch in bytes.
 * @param width   pixels per row. An odd width is supported; the last pixel uses the final U/V sample.
 * @param height  row count; negative writes the rows bottom-up into dst.
 * @param is601   forwarded to getYuvToRgbCoefficients / getYuvToRgbFunc.
 */
template <bool isBGR, bool isFullRange>
AVX2_TARGET void _i420ToRgb_avx2_imp(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                                     uint8_t* dst, int dstStride, int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    const __m256i c32 = _mm256_set1_epi16(32); // rounding term for the >> 6 below
    const __m256i zero = _mm256_setzero_si256();
    const __m256i maxv = _mm256_set1_epi16(255);

    // Scalar converter for the pixels the 16-wide loop cannot cover.
    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* yRow = srcY + y * srcYStride;
        const uint8_t* uRow = srcU + (y / 2) * srcUStride;
        const uint8_t* vRow = srcV + (y / 2) * srcVStride;
        uint8_t* dstRow = dst + y * dstStride;

        int x = 0;
        for (; x + 16 <= width; x += 16) {
            const __m128i y_vals = _mm_loadu_si128((const __m128i*)(yRow + x));
            // 8 chroma samples cover 16 pixels; duplicate each one for its two pixels.
            const __m128i u_half = _mm_loadl_epi64((const __m128i*)(uRow + x / 2));
            const __m128i v_half = _mm_loadl_epi64((const __m128i*)(vRow + x / 2));
            const __m128i u_dup = _mm_unpacklo_epi8(u_half, u_half);
            const __m128i v_dup = _mm_unpacklo_epi8(v_half, v_half);

            __m256i u_16 = _mm256_cvtepu8_epi16(u_dup);
            __m256i v_16 = _mm256_cvtepu8_epi16(v_dup);
            __m256i y_16 = _mm256_cvtepu8_epi16(y_vals);

            u_16 = _mm256_sub_epi16(u_16, c128);
            v_16 = _mm256_sub_epi16(v_16, c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, _mm256_set1_epi16(16));
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds: with the limited-range coefficients the blue sum can exceed
            // INT16_MAX (e.g. 239*75 + 127*129 = 34308). A wrapping add would turn that
            // into a negative value that then clamps to 0 instead of 255.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, c32), 6);

            __m256i g = _mm256_sub_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_sub_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, c32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, c32), 6);

            r = _mm256_max_epi16(zero, _mm256_min_epi16(r, maxv));
            g = _mm256_max_epi16(zero, _mm256_min_epi16(g, maxv));
            b = _mm256_max_epi16(zero, _mm256_min_epi16(b, maxv));

            // 3-byte pixels do not map onto a simple unpack, so spill and interleave in scalar code.
            alignas(32) uint16_t b_arr[16], g_arr[16], r_arr[16];
            _mm256_store_si256((__m256i*)b_arr, b);
            _mm256_store_si256((__m256i*)g_arr, g);
            _mm256_store_si256((__m256i*)r_arr, r);

            const uint16_t* c0 = isBGR ? b_arr : r_arr;
            const uint16_t* c2 = isBGR ? r_arr : b_arr;
            uint8_t* px = dstRow + x * 3;
            for (int i = 0; i < 16; ++i, px += 3) {
                px[0] = (uint8_t)c0[i];
                px[1] = (uint8_t)g_arr[i];
                px[2] = (uint8_t)c2[i];
            }
        }

        // Scalar tail, one chroma sample at a time. For an odd width only the first pixel of
        // the last pair exists, so the second one must not be read or written.
        for (; x < width; x += 2) {
            int u = uRow[x / 2];
            int v = vRow[x / 2];
            const int pixelsInPair = (x + 1 < width) ? 2 : 1;
            for (int p = 0; p < pixelsInPair; ++p) {
                int luma = yRow[x + p];
                int r, g, b;
                convertFunc(luma, u, v, r, g, b);
                uint8_t* px = dstRow + (x + p) * 3;
                px[0] = isBGR ? b : r;
                px[1] = g;
                px[2] = isBGR ? r : b;
            }
        }
    }
}
/// NV12 -> 4-byte BGRA. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void nv12ToBgra32_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride, int width,
                       int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        nv12ToRgbaColor_avx2_imp<true, false>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    nv12ToRgbaColor_avx2_imp<true, true>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
}
/// NV12 -> 4-byte RGBA. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void nv12ToRgba32_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride, int width,
                       int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        nv12ToRgbaColor_avx2_imp<false, false>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    nv12ToRgbaColor_avx2_imp<false, true>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
}
/// NV12 -> 3-byte BGR. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void nv12ToBgr24_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride, int width,
                      int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _nv12ToRgbColor_avx2_imp<true, false>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _nv12ToRgbColor_avx2_imp<true, true>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
}
/// NV12 -> 3-byte RGB. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void nv12ToRgb24_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcUV, int srcUVStride, uint8_t* dst, int dstStride, int width,
                      int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _nv12ToRgbColor_avx2_imp<false, false>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _nv12ToRgbColor_avx2_imp<false, true>(srcY, srcYStride, srcUV, srcUVStride, dst, dstStride, width, height, useBT601);
}
/// Planar Y/U/V -> 4-byte BGRA. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void i420ToBgra32_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                       uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _i420ToRgba_avx2_imp<true, false>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _i420ToRgba_avx2_imp<true, true>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
}
/// Planar Y/U/V -> 4-byte RGBA. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void i420ToRgba32_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                       uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _i420ToRgba_avx2_imp<false, false>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _i420ToRgba_avx2_imp<false, true>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
}
/// Planar Y/U/V -> 3-byte BGR. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void i420ToBgr24_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                      uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _i420ToRgb_avx2_imp<true, false>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _i420ToRgb_avx2_imp<true, true>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
}
/// Planar Y/U/V -> 3-byte RGB. Picks the full-range or limited-range kernel from `flag`
/// and forwards the BT601 bit as `is601`.
AVX2_TARGET
void i420ToRgb24_avx2(const uint8_t* srcY, int srcYStride, const uint8_t* srcU, int srcUStride, const uint8_t* srcV, int srcVStride,
                      uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBT601 = (flag & ConvertFlag::BT601) != 0;
    const bool useFullRange = (flag & ConvertFlag::FullRange) != 0;

    if (!useFullRange) {
        _i420ToRgb_avx2_imp<false, false>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
        return;
    }
    _i420ToRgb_avx2_imp<false, true>(srcY, srcYStride, srcU, srcUStride, srcV, srcVStride, dst, dstStride, width, height, useBT601);
}
/**
 * Converts packed 4:2:2 data laid out as Y0 U Y1 V (4 bytes per 2 pixels) into
 * 3- or 4-byte RGB pixels.
 *
 * @tparam isBgrColor  true writes B,G,R byte order; false writes R,G,B.
 * @tparam hasAlpha    true writes a 4th byte fixed at 255.
 * @tparam isFullRange false subtracts 16 from Y before scaling.
 * @param src, srcStride  packed source and its row pitch in bytes.
 * @param dst, dstStride  destination and its row pitch in bytes.
 * @param width   pixels per row. Pixels are processed in pairs; with an odd width the
 *                final pixel of each row is left unwritten.
 * @param height  row count; negative writes the rows bottom-up into dst.
 * @param is601   forwarded to getYuvToRgbCoefficients / getYuvToRgbFunc.
 */
template <bool isBgrColor, bool hasAlpha, bool isFullRange>
AVX2_TARGET void yuyvToRgb_avx2_imp(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    const __m256i c32 = _mm256_set1_epi16(32); // rounding term for the >> 6 below
    const __m256i zero = _mm256_setzero_si256();
    const __m256i maxv = _mm256_set1_epi16(255);

    // Per-lane byte selectors (loop-invariant). Each 128-bit lane holds 8 pixels:
    // Y at even bytes, U at bytes 1,5,9,13 and V at bytes 3,7,11,15. U and V are
    // selected twice so every pixel gets its own copy; -1 zeroes the unused upper half.
    const __m256i shuffle_y = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
        0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i shuffle_u = _mm256_setr_epi8(
        1, 1, 5, 5, 9, 9, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1,
        1, 1, 5, 5, 9, 9, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i shuffle_v = _mm256_setr_epi8(
        3, 3, 7, 7, 11, 11, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1,
        3, 3, 7, 7, 11, 11, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1);

    constexpr int channels = hasAlpha ? 4 : 3;
    constexpr int vectorWidth = 16; // pixels per vector iteration (32 source bytes)

    // Scalar converter for the pixels the vector loop cannot cover.
    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* srcRow = src + y * srcStride;
        uint8_t* dstRow = dst + y * dstStride;

        int x = 0;
        for (; x + vectorWidth <= width; x += vectorWidth) {
            const __m256i yuyv_data = _mm256_loadu_si256((const __m256i*)(srcRow + x * 2));

            const __m256i y_shuffled = _mm256_shuffle_epi8(yuyv_data, shuffle_y);
            const __m256i u_shuffled = _mm256_shuffle_epi8(yuyv_data, shuffle_u);
            const __m256i v_shuffled = _mm256_shuffle_epi8(yuyv_data, shuffle_v);

            // Join the low 8 bytes of both lanes into one 16-byte vector per component.
            const __m128i y_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(y_shuffled), _mm256_extracti128_si256(y_shuffled, 1));
            const __m128i u_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(u_shuffled), _mm256_extracti128_si256(u_shuffled, 1));
            const __m128i v_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(v_shuffled), _mm256_extracti128_si256(v_shuffled, 1));

            __m256i y_16 = _mm256_cvtepu8_epi16(y_final);
            __m256i u_16 = _mm256_cvtepu8_epi16(u_final);
            __m256i v_16 = _mm256_cvtepu8_epi16(v_final);

            u_16 = _mm256_sub_epi16(u_16, c128);
            v_16 = _mm256_sub_epi16(v_16, c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, _mm256_set1_epi16(16));
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds: with the limited-range coefficients the blue sum can exceed
            // INT16_MAX (e.g. 239*75 + 127*129 = 34308). A wrapping add would turn that
            // into a negative value that then clamps to 0 instead of 255.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, c32), 6);

            __m256i g = _mm256_sub_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_sub_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, c32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, c32), 6);

            r = _mm256_max_epi16(zero, _mm256_min_epi16(r, maxv));
            g = _mm256_max_epi16(zero, _mm256_min_epi16(g, maxv));
            b = _mm256_max_epi16(zero, _mm256_min_epi16(b, maxv));

            const __m128i r8 = _mm_packus_epi16(_mm256_castsi256_si128(r), _mm256_extracti128_si256(r, 1));
            const __m128i g8 = _mm_packus_epi16(_mm256_castsi256_si128(g), _mm256_extracti128_si256(g, 1));
            const __m128i b8 = _mm_packus_epi16(_mm256_castsi256_si128(b), _mm256_extracti128_si256(b, 1));

            const __m128i c0 = isBgrColor ? b8 : r8;
            const __m128i c2 = isBgrColor ? r8 : b8;

            if constexpr (hasAlpha) {
                // Interleave planar bytes into 4-byte pixels: (c0,g) and (c2,a) pairs, then 16-bit merge.
                const __m128i a8 = _mm_set1_epi8((char)255);
                const __m128i c0g_lo = _mm_unpacklo_epi8(c0, g8);
                const __m128i c2a_lo = _mm_unpacklo_epi8(c2, a8);
                const __m128i c0g_hi = _mm_unpackhi_epi8(c0, g8);
                const __m128i c2a_hi = _mm_unpackhi_epi8(c2, a8);

                _mm_storeu_si128((__m128i*)(dstRow + x * 4), _mm_unpacklo_epi16(c0g_lo, c2a_lo));
                _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 16), _mm_unpackhi_epi16(c0g_lo, c2a_lo));
                _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 32), _mm_unpacklo_epi16(c0g_hi, c2a_hi));
                _mm_storeu_si128((__m128i*)(dstRow + x * 4 + 48), _mm_unpackhi_epi16(c0g_hi, c2a_hi));
            } else {
                // 3-byte pixels: spill and interleave in scalar code. All 16 pixels are in
                // range here because the loop condition guarantees x + 16 <= width.
                uint8_t c0_vals[16], g_vals[16], c2_vals[16];
                _mm_storeu_si128((__m128i*)c0_vals, c0);
                _mm_storeu_si128((__m128i*)g_vals, g8);
                _mm_storeu_si128((__m128i*)c2_vals, c2);

                uint8_t* px = dstRow + x * 3;
                for (int i = 0; i < 16; ++i, px += 3) {
                    px[0] = c0_vals[i];
                    px[1] = g_vals[i];
                    px[2] = c2_vals[i];
                }
            }
        }

        // Scalar tail over complete pixel pairs only. A trailing single pixel is skipped
        // because converting it would need bytes beyond width * 2 of the source row.
        for (; x + 1 < width; x += 2) {
            const uint8_t* group = srcRow + x * 2;
            int y0 = group[0];
            int u = group[1];
            int y1 = group[2];
            int v = group[3];

            int r0, g0, b0, r1, g1, b1;
            convertFunc(y0, u, v, r0, g0, b0);
            convertFunc(y1, u, v, r1, g1, b1);

            uint8_t* px0 = dstRow + x * channels;
            uint8_t* px1 = dstRow + (x + 1) * channels;

            px0[0] = isBgrColor ? b0 : r0;
            px0[1] = g0;
            px0[2] = isBgrColor ? r0 : b0;
            px1[0] = isBgrColor ? b1 : r1;
            px1[1] = g1;
            px1[2] = isBgrColor ? r1 : b1;

            if constexpr (hasAlpha) {
                px0[3] = 255;
                px1[3] = 255;
            }
        }
    }
}
/**
 * @brief Converts a packed UYVY (4:2:2, byte order U Y0 V Y1) image to RGB/BGR using AVX2.
 *
 * @tparam isBgrColor  true: bytes are written B,G,R(,A); false: R,G,B(,A).
 * @tparam hasAlpha    true: 4 bytes per pixel with alpha fixed to 255; false: 3 bytes per pixel.
 * @tparam isFullRange false: 16 is subtracted from luma before scaling; true: luma is used as-is.
 *
 * @param src        First byte of the source image.
 * @param srcStride  Byte distance between consecutive source rows.
 * @param dst        First byte of the destination image.
 * @param dstStride  Byte distance between consecutive destination rows.
 * @param width      Width in pixels.
 * @param height     Height in rows. A negative value flips the output vertically:
 *                   source row 0 is written to the last destination row.
 * @param is601      Forwarded to getYuvToRgbCoefficients() / getYuvToRgbFunc() to pick the matrix.
 *
 * Sixteen pixels are processed per vector iteration in 16-bit fixed point with 6 fractional
 * bits. Remaining pixels are converted two at a time with the scalar function returned by
 * getYuvToRgbFunc(). A trailing unpaired pixel (odd width) is not written.
 */
template <bool isBgrColor, bool hasAlpha, bool isFullRange>
AVX2_TARGET void uyvyToRgb_avx2_imp(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, bool is601) {
    // Negative height: start at the last destination row and walk upwards.
    if (height < 0) {
        height = -height;
        dst = dst + (height - 1) * dstStride;
        dstStride = -dstStride;
    }

    int cy, cr, cgu, cgv, cb;
    getYuvToRgbCoefficients(is601, isFullRange, cy, cr, cgu, cgv, cb);

    // Loop-invariant vector constants.
    const __m256i c_y = _mm256_set1_epi16(cy);
    const __m256i c_r = _mm256_set1_epi16(cr);
    const __m256i c_gu = _mm256_set1_epi16(cgu);
    const __m256i c_gv = _mm256_set1_epi16(cgv);
    const __m256i c_b = _mm256_set1_epi16(cb);
    const __m256i c128 = _mm256_set1_epi16(128);
    [[maybe_unused]] const __m256i c16 = _mm256_set1_epi16(16);
    const __m256i round32 = _mm256_set1_epi16(32); // 0.5 in 6-bit fixed point
    [[maybe_unused]] const __m128i a8 = _mm_set1_epi8(static_cast<char>(255));

    // Per-128-bit-lane byte shuffles. Each lane holds 8 pixels as U Y V Y ...;
    // the selected bytes land in the low 8 bytes of the lane, the rest are zeroed (-1).
    const __m256i shuffle_y = _mm256_setr_epi8(
        1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1,
        1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
    // Every chroma sample is duplicated so that both pixels of a pair share it.
    const __m256i shuffle_u = _mm256_setr_epi8(
        0, 0, 4, 4, 8, 8, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1,
        0, 0, 4, 4, 8, 8, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i shuffle_v = _mm256_setr_epi8(
        2, 2, 6, 6, 10, 10, 14, 14, -1, -1, -1, -1, -1, -1, -1, -1,
        2, 2, 6, 6, 10, 10, 14, 14, -1, -1, -1, -1, -1, -1, -1, -1);

    constexpr int channels = hasAlpha ? 4 : 3;
    constexpr int vectorWidth = 16;
    // Byte offsets of red and blue inside one output pixel; green is always at offset 1.
    constexpr int kRedIdx = isBgrColor ? 2 : 0;
    constexpr int kBlueIdx = isBgrColor ? 0 : 2;

    YuvToRgbFunc convertFunc = getYuvToRgbFunc(is601, isFullRange);

    for (int y = 0; y < height; ++y) {
        const uint8_t* srcRow = src + y * srcStride;
        uint8_t* dstRow = dst + y * dstStride;
        int x = 0;

        // ---- Vector path: 16 pixels (32 source bytes) per iteration ----
        for (; x + vectorWidth <= width; x += vectorWidth) {
            const __m256i uyvy_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcRow + x * 2));

            const __m256i y_shuffled = _mm256_shuffle_epi8(uyvy_data, shuffle_y);
            const __m256i u_shuffled = _mm256_shuffle_epi8(uyvy_data, shuffle_u);
            const __m256i v_shuffled = _mm256_shuffle_epi8(uyvy_data, shuffle_v);

            // Join the low 8 bytes of both lanes into one 16-byte vector per component.
            const __m128i y_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(y_shuffled), _mm256_extracti128_si256(y_shuffled, 1));
            const __m128i u_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(u_shuffled), _mm256_extracti128_si256(u_shuffled, 1));
            const __m128i v_final = _mm_unpacklo_epi64(_mm256_castsi256_si128(v_shuffled), _mm256_extracti128_si256(v_shuffled, 1));

            // Widen to 16 bit and remove the offsets.
            __m256i y_16 = _mm256_cvtepu8_epi16(y_final);
            const __m256i u_16 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(u_final), c128);
            const __m256i v_16 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(v_final), c128);
            if constexpr (!isFullRange) {
                y_16 = _mm256_sub_epi16(y_16, c16);
            }

            const __m256i y_scaled = _mm256_mullo_epi16(y_16, c_y);

            // Saturating adds/subs: identical to wrapping arithmetic while the sum fits in
            // int16, but a sum that exceeds the int16 range (bright luma combined with extreme
            // chroma, depending on the coefficients) now pins to the limit and ends up as
            // 255 / 0 after packing, instead of wrapping around to the opposite extreme.
            __m256i r = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(v_16, c_r));
            r = _mm256_srai_epi16(_mm256_adds_epi16(r, round32), 6);

            __m256i g = _mm256_subs_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_gu));
            g = _mm256_subs_epi16(g, _mm256_mullo_epi16(v_16, c_gv));
            g = _mm256_srai_epi16(_mm256_adds_epi16(g, round32), 6);

            __m256i b = _mm256_adds_epi16(y_scaled, _mm256_mullo_epi16(u_16, c_b));
            b = _mm256_srai_epi16(_mm256_adds_epi16(b, round32), 6);

            // _mm_packus_epi16 saturates signed 16-bit input to [0, 255], so no separate
            // min/max clamp is required before packing.
            const __m128i r8 = _mm_packus_epi16(_mm256_castsi256_si128(r), _mm256_extracti128_si256(r, 1));
            const __m128i g8 = _mm_packus_epi16(_mm256_castsi256_si128(g), _mm256_extracti128_si256(g, 1));
            const __m128i b8 = _mm_packus_epi16(_mm256_castsi256_si128(b), _mm256_extracti128_si256(b, 1));

            if constexpr (hasAlpha) {
                // Channel order only decides which plane is byte 0 and which is byte 2.
                __m128i first8;
                __m128i third8;
                if constexpr (isBgrColor) {
                    first8 = b8;
                    third8 = r8;
                } else {
                    first8 = r8;
                    third8 = b8;
                }

                // Interleave planes: bytes -> (c0,g) / (c2,a) pairs -> 4-byte pixels.
                const __m128i c0g_lo = _mm_unpacklo_epi8(first8, g8);
                const __m128i c2a_lo = _mm_unpacklo_epi8(third8, a8);
                const __m128i c0g_hi = _mm_unpackhi_epi8(first8, g8);
                const __m128i c2a_hi = _mm_unpackhi_epi8(third8, a8);

                uint8_t* out = dstRow + x * 4;
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm_unpacklo_epi16(c0g_lo, c2a_lo));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 16), _mm_unpackhi_epi16(c0g_lo, c2a_lo));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 32), _mm_unpacklo_epi16(c0g_hi, c2a_hi));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 48), _mm_unpackhi_epi16(c0g_hi, c2a_hi));
            } else {
                // 3-byte pixels: spill the planes and interleave with scalar stores.
                uint8_t r_vals[16], g_vals[16], b_vals[16];
                _mm_storeu_si128(reinterpret_cast<__m128i*>(r_vals), r8);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(g_vals), g8);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(b_vals), b8);

                // The enclosing loop condition guarantees all 16 pixels are inside the row.
                uint8_t* out = dstRow + x * 3;
                for (int i = 0; i < vectorWidth; ++i, out += 3) {
                    out[kRedIdx] = r_vals[i];
                    out[1] = g_vals[i];
                    out[kBlueIdx] = b_vals[i];
                }
            }
        }

        // ---- Scalar tail: complete pixel pairs only; an odd trailing pixel is skipped ----
        for (; x + 1 < width; x += 2) {
            const uint8_t* in = srcRow + x * 2;
            int u = in[0];
            int y0 = in[1];
            int v = in[2];
            int y1 = in[3];

            int r0, g0, b0, r1, g1, b1;
            convertFunc(y0, u, v, r0, g0, b0);
            convertFunc(y1, u, v, r1, g1, b1);

            uint8_t* p0 = dstRow + x * channels;
            uint8_t* p1 = p0 + channels;

            p0[kRedIdx] = static_cast<uint8_t>(r0);
            p0[1] = static_cast<uint8_t>(g0);
            p0[kBlueIdx] = static_cast<uint8_t>(b0);

            p1[kRedIdx] = static_cast<uint8_t>(r1);
            p1[1] = static_cast<uint8_t>(g1);
            p1[kBlueIdx] = static_cast<uint8_t>(b1);

            if constexpr (hasAlpha) {
                p0[3] = 255;
                p1[3] = 255;
            }
        }
    }
}
/// YUYV -> 24-bit BGR entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// yuyvToRgb_avx2_imp instantiation; all other arguments are passed through unchanged.
AVX2_TARGET
void yuyvToBgr24_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        yuyvToRgb_avx2_imp<true, false, false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    yuyvToRgb_avx2_imp<true, false, true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// YUYV -> 24-bit RGB entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// yuyvToRgb_avx2_imp instantiation; all other arguments are passed through unchanged.
AVX2_TARGET
void yuyvToRgb24_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        yuyvToRgb_avx2_imp<false, false, false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    yuyvToRgb_avx2_imp<false, false, true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// YUYV -> 32-bit BGRA entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// yuyvToRgb_avx2_imp instantiation; all other arguments are passed through unchanged.
AVX2_TARGET
void yuyvToBgra32_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        yuyvToRgb_avx2_imp<true, true, false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    yuyvToRgb_avx2_imp<true, true, true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// YUYV -> 32-bit RGBA entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// yuyvToRgb_avx2_imp instantiation; all other arguments are passed through unchanged.
AVX2_TARGET
void yuyvToRgba32_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        yuyvToRgb_avx2_imp<false, true, false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    yuyvToRgb_avx2_imp<false, true, true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// UYVY -> 24-bit BGR entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// uyvyToRgb_avx2_imp<isBgrColor, hasAlpha, isFullRange> instantiation.
AVX2_TARGET
void uyvyToBgr24_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        uyvyToRgb_avx2_imp</*isBgrColor=*/true, /*hasAlpha=*/false, /*isFullRange=*/false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    uyvyToRgb_avx2_imp</*isBgrColor=*/true, /*hasAlpha=*/false, /*isFullRange=*/true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// UYVY -> 24-bit RGB entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// uyvyToRgb_avx2_imp<isBgrColor, hasAlpha, isFullRange> instantiation.
AVX2_TARGET
void uyvyToRgb24_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        uyvyToRgb_avx2_imp</*isBgrColor=*/false, /*hasAlpha=*/false, /*isFullRange=*/false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    uyvyToRgb_avx2_imp</*isBgrColor=*/false, /*hasAlpha=*/false, /*isFullRange=*/true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// UYVY -> 32-bit BGRA entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// uyvyToRgb_avx2_imp<isBgrColor, hasAlpha, isFullRange> instantiation.
AVX2_TARGET
void uyvyToBgra32_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        uyvyToRgb_avx2_imp</*isBgrColor=*/true, /*hasAlpha=*/true, /*isFullRange=*/false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    uyvyToRgb_avx2_imp</*isBgrColor=*/true, /*hasAlpha=*/true, /*isFullRange=*/true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
/// UYVY -> 32-bit RGBA entry point.
/// Reads the BT601 and FullRange bits from `flag` and forwards to the matching
/// uyvyToRgb_avx2_imp<isBgrColor, hasAlpha, isFullRange> instantiation.
AVX2_TARGET
void uyvyToRgba32_avx2(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride, int width, int height, ConvertFlag flag) {
    const bool useBt601 = (flag & ConvertFlag::BT601) != 0;
    const bool fullRange = (flag & ConvertFlag::FullRange) != 0;

    // The range is a template argument, so it has to be selected with a branch.
    if (!fullRange) {
        uyvyToRgb_avx2_imp</*isBgrColor=*/false, /*hasAlpha=*/true, /*isFullRange=*/false>(src, srcStride, dst, dstStride, width, height, useBt601);
        return;
    }
    uyvyToRgb_avx2_imp</*isBgrColor=*/false, /*hasAlpha=*/true, /*isFullRange=*/true>(src, srcStride, dst, dstStride, width, height, useBt601);
}
#endif }