#![allow(unsafe_op_in_unsafe_fn)]
#![allow(clippy::too_many_arguments)]
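//! SIMD-accelerated planar 4:2:0 YUV to packed RGB conversion (BT.601
//! limited range), with AVX2 and NEON paths gated behind archmage tokens.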
use archmage::prelude::*;
use imgref::ImgVec;
use rgb::RGB8;
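/// Converts planar 8-bit 4:2:0 YUV (BT.601 limited range) to packed RGB with
/// AVX2, processing two rows of 32 pixels per inner step with a scalar
/// fallback for the edges.
///
/// Strides are in bytes. The luma plane needs `height` rows of `y_stride`
/// bytes; each chroma plane needs `(height + 1) / 2` rows with at least
/// `(width + 1) / 2` valid bytes per row. Undersized planes panic.
///
/// A minimal call sketch (illustrative sizes; how the `Desktop64` token is
/// obtained depends on the caller):
///
/// ```ignore
/// let (w, h) = (1920usize, 1080usize);
/// // Tightly packed planes: luma stride w, chroma strides w / 2.
/// let img = yuv420_to_rgb8_fast(token, &y, w, &u, w / 2, &v, w / 2, w, h);
/// assert_eq!((img.width(), img.height()), (w, h));
/// ```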
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn yuv420_to_rgb8_fast(
token: Desktop64,
y_plane: &[u8],
y_stride: usize,
u_plane: &[u8],
u_stride: usize,
v_plane: &[u8],
v_stride: usize,
width: usize,
height: usize,
) -> ImgVec<RGB8> {
let mut out = vec![RGB8::default(); width * height];
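    // BT.601 limited-range coefficients in Q2.13 fixed point:
    // 9539 ≈ 1.164 * 8192 (luma scale), 13075 ≈ 1.596, 16525 ≈ 2.017,
    // 6660 ≈ 0.813, 3209 ≈ 0.392.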
    let y_coef: i16 = 9539;
    let cr_coef: i16 = 13075;
    let cb_coef: i16 = 16525;
    let g_coef_1: i16 = 6660;
    let g_coef_2: i16 = 3209;
let y_bias: i16 = 16;
let uv_bias: i16 = 128;
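    // 4:2:0: each pass converts two luma rows that share one chroma row.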
for y in (0..height).step_by(2) {
let y0_row = y;
let y1_row = (y + 1).min(height - 1);
let chroma_row = y / 2;
for x in (0..width).step_by(32) {
let pixels_remaining = (width - x).min(32);
            // Scalar fallback: narrow tail chunks, plus the unpaired final row
            // of odd-height frames (y0_row == y1_row), which the mutable split
            // below cannot represent.
            if pixels_remaining < 32 || y0_row == y1_row {
                for i in 0..pixels_remaining {
                    // `y0_row..=y1_row` visits one row when the pair is
                    // clamped, two otherwise.
                    for row in y0_row..=y1_row {
let px = x + i;
let chroma_x = px / 2;
let y_val = y_plane[row * y_stride + px] as i32 - y_bias as i32;
let u_val =
u_plane[chroma_row * u_stride + chroma_x] as i32 - uv_bias as i32;
let v_val =
v_plane[chroma_row * v_stride + chroma_x] as i32 - uv_bias as i32;
let y_scaled = (y_val * y_coef as i32) >> 13;
let r = y_scaled + ((v_val * cr_coef as i32) >> 13);
let g =
y_scaled - ((v_val * g_coef_1 as i32 + u_val * g_coef_2 as i32) >> 13);
let b = y_scaled + ((u_val * cb_coef as i32) >> 13);
out[row * width + px] = RGB8 {
r: r.clamp(0, 255) as u8,
g: g.clamp(0, 255) as u8,
b: b.clamp(0, 255) as u8,
};
}
}
continue;
}
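            // Two distinct output rows are guaranteed here (the clamped case
            // went through the scalar path), so split the buffer at the start
            // of the second row to borrow both rows mutably at once.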
let split_point = y1_row * width;
let (top_rows, bottom_rows) = out.split_at_mut(split_point);
let row0_out = &mut top_rows[y0_row * width + x..];
let row1_out = &mut bottom_rows[x..];
process_32_pixels_420(
token,
&y_plane[y0_row * y_stride + x..],
&y_plane[y1_row * y_stride + x..],
&u_plane[chroma_row * u_stride + x / 2..],
&v_plane[chroma_row * v_stride + x / 2..],
row0_out,
row1_out,
y_coef,
cr_coef,
cb_coef,
g_coef_1,
g_coef_2,
y_bias,
uv_bias,
);
}
}
ImgVec::new(out, width, height)
}
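/// Converts a 32-pixel strip of two luma rows sharing one chroma row.
/// Expects at least 32 luma bytes, 16 chroma bytes, and 32 output pixels per
/// slice; shorter slices panic.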
#[cfg(target_arch = "x86_64")]
#[rite]
fn process_32_pixels_420(
_token: Desktop64,
y0: &[u8],
y1: &[u8],
u: &[u8],
v: &[u8],
out0: &mut [RGB8],
out1: &mut [RGB8],
y_coef: i16,
cr_coef: i16,
cb_coef: i16,
g_coef_1: i16,
g_coef_2: i16,
y_bias: i16,
uv_bias: i16,
) {
use core::arch::x86_64::*;
    // Slice to exact vector widths up front so the raw loads and stores below
    // cannot touch memory out of range (a panic instead of UB).
    let y0 = &y0[..32];
    let y1 = &y1[..32];
    let u = &u[..16];
    let v = &v[..16];
    let out0 = &mut out0[..32];
    let out1 = &mut out1[..32];
unsafe {
let y0_vals = _mm256_loadu_si256(y0.as_ptr() as *const __m256i);
let y1_vals = _mm256_loadu_si256(y1.as_ptr() as *const __m256i);
let u_vals = _mm_loadu_si128(u.as_ptr() as *const __m128i);
let v_vals = _mm_loadu_si128(v.as_ptr() as *const __m128i);
let y_corr = _mm256_set1_epi8(y_bias as i8);
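        // 128 in the widened fixed-point form the expand helpers produce:
        // (128 << 2) | (128 >> 6).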
let uv_corr = _mm256_set1_epi16((uv_bias << 2) | (uv_bias >> 6));
let v_y_coef = _mm256_set1_epi16(y_coef);
let v_cr_coef = _mm256_set1_epi16(cr_coef);
let v_cb_coef = _mm256_set1_epi16(cb_coef);
let v_g_coef_1 = _mm256_set1_epi16(g_coef_1);
let v_g_coef_2 = _mm256_set1_epi16(g_coef_2);
let y0_sub = _mm256_subs_epu8(y0_vals, y_corr);
let y1_sub = _mm256_subs_epu8(y1_vals, y_corr);
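        // Duplicate each chroma byte so one 4:2:0 sample covers two pixels.
        // After broadcasting the 128-bit chroma vector to both lanes, lane 0
        // expands samples 0..8 and lane 1 expands samples 8..16, matching the
        // per-lane unpack order of the luma expansion below.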
let shuf_expand = _mm256_setr_epi8(
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
13, 14, 14, 15, 15,
);
let u_256 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(u_vals), u_vals);
let v_256 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(v_vals), v_vals);
let u_expanded = _mm256_shuffle_epi8(u_256, shuf_expand);
let v_expanded = _mm256_shuffle_epi8(v_256, shuf_expand);
let y0_lo = expand_u8_to_i16_lo(y0_sub);
let y0_hi = expand_u8_to_i16_hi(y0_sub);
let y1_lo = expand_u8_to_i16_lo(y1_sub);
let y1_hi = expand_u8_to_i16_hi(y1_sub);
let u_lo = expand_u8_to_i16_lo(u_expanded);
let u_hi = expand_u8_to_i16_hi(u_expanded);
let v_lo = expand_u8_to_i16_lo(v_expanded);
let v_hi = expand_u8_to_i16_hi(v_expanded);
let u_lo = _mm256_sub_epi16(u_lo, uv_corr);
let u_hi = _mm256_sub_epi16(u_hi, uv_corr);
let v_lo = _mm256_sub_epi16(v_lo, uv_corr);
let v_hi = _mm256_sub_epi16(v_hi, uv_corr);
let (r0_lo, g0_lo, b0_lo) = yuv_to_rgb_i16(
y0_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r0_hi, g0_hi, b0_hi) = yuv_to_rgb_i16(
y0_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r1_lo, g1_lo, b1_lo) = yuv_to_rgb_i16(
y1_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r1_hi, g1_hi, b1_hi) = yuv_to_rgb_i16(
y1_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let r0 = _mm256_packus_epi16(r0_lo, r0_hi);
let g0 = _mm256_packus_epi16(g0_lo, g0_hi);
let b0 = _mm256_packus_epi16(b0_lo, b0_hi);
let r1 = _mm256_packus_epi16(r1_lo, r1_hi);
let g1 = _mm256_packus_epi16(g1_lo, g1_hi);
let b1 = _mm256_packus_epi16(b1_lo, b1_hi);
store_rgb_row(out0, r0, g0, b0);
store_rgb_row(out1, r1, g1, b1);
}
}
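/// Widens the low 8 bytes of each 128-bit lane to 16 bits in a ~10-bit
/// fixed-point form: duplicating byte `b` into both halves gives `b * 257`,
/// and `>> 6` leaves `(b << 2) | (b >> 6)`.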
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn expand_u8_to_i16_lo(v: __m256i) -> __m256i {
use core::arch::x86_64::*;
let v_dup = _mm256_unpacklo_epi8(v, v);
_mm256_srli_epi16::<6>(v_dup)
}
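/// High-half counterpart of `expand_u8_to_i16_lo`.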
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn expand_u8_to_i16_hi(v: __m256i) -> __m256i {
use core::arch::x86_64::*;
let v_dup = _mm256_unpackhi_epi8(v, v);
_mm256_srli_epi16::<6>(v_dup)
}
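/// Applies the Q2.13 conversion matrix with `_mm256_mulhrs_epi16`, which
/// computes `round(a * b / 2^15)`; with inputs in the widened ~`v << 2` form
/// this matches the scalar `(v * coef) >> 13`.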
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn yuv_to_rgb_i16(
y: __m256i,
u: __m256i,
v: __m256i,
y_coef: __m256i,
cr_coef: __m256i,
cb_coef: __m256i,
g_coef_1: __m256i,
g_coef_2: __m256i,
) -> (__m256i, __m256i, __m256i) {
use core::arch::x86_64::*;
let y_scaled = _mm256_mulhrs_epi16(y, y_coef);
let v_cr = _mm256_mulhrs_epi16(v, cr_coef);
let u_cb = _mm256_mulhrs_epi16(u, cb_coef);
let v_g = _mm256_mulhrs_epi16(v, g_coef_1);
let u_g = _mm256_mulhrs_epi16(u, g_coef_2);
let r = _mm256_add_epi16(y_scaled, v_cr);
let b = _mm256_add_epi16(y_scaled, u_cb);
let g = _mm256_sub_epi16(y_scaled, _mm256_add_epi16(v_g, u_g));
(r, g, b)
}
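/// Spills each channel vector to a stack array and interleaves the RGB bytes
/// scalarly; simple and bounds-checked, at the cost of not staying in
/// registers like `interleave_rgb_avx2` below.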
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn store_rgb_row(out: &mut [RGB8], r: __m256i, g: __m256i, b: __m256i) {
use core::arch::x86_64::*;
let mut r_arr = [0u8; 32];
let mut g_arr = [0u8; 32];
let mut b_arr = [0u8; 32];
_mm256_storeu_si256(r_arr.as_mut_ptr() as *mut __m256i, r);
_mm256_storeu_si256(g_arr.as_mut_ptr() as *mut __m256i, g);
_mm256_storeu_si256(b_arr.as_mut_ptr() as *mut __m256i, b);
for i in 0..32 {
out[i] = RGB8 {
r: r_arr[i],
g: g_arr[i],
b: b_arr[i],
};
}
}
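/// Unused in-register alternative to `store_rgb_row`: the classic AVX2
/// shuffle/blend/permute pattern for interleaving three byte planes into
/// packed triples, with the `r` and `b` inputs fed into the mask set's b/r
/// slots so the output lands in RGB byte order.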
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
unsafe fn interleave_rgb_avx2(r: __m256i, g: __m256i, b: __m256i) -> (__m256i, __m256i, __m256i) {
use core::arch::x86_64::*;
let sh_b = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
9, 4, 15, 10, 5,
);
let sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
14, 9, 4, 15, 10,
);
let sh_r = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
3, 14, 9, 4, 15,
);
let b0 = _mm256_shuffle_epi8(r, sh_b);
let g0 = _mm256_shuffle_epi8(g, sh_g);
let r0 = _mm256_shuffle_epi8(b, sh_r);
let m0 = _mm256_setr_epi8(
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
0, 0, -1, 0, 0,
);
let m1 = _mm256_setr_epi8(
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0,
);
let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
    let rgb0 = _mm256_permute2x128_si256::<0x20>(p0, p1);
    let rgb1 = _mm256_permute2x128_si256::<0x30>(p2, p0);
    let rgb2 = _mm256_permute2x128_si256::<0x31>(p1, p2);
(rgb0, rgb1, rgb2)
}
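/// NEON port of `yuv420_to_rgb8_fast`: same BT.601 limited-range conversion,
/// two rows of 16 pixels per inner step; see the AVX2 version for the plane
/// size requirements.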
#[cfg(target_arch = "aarch64")]
#[arcane]
pub fn yuv420_to_rgb8_fast_neon(
token: NeonToken,
y_plane: &[u8],
y_stride: usize,
u_plane: &[u8],
u_stride: usize,
v_plane: &[u8],
v_stride: usize,
width: usize,
height: usize,
) -> ImgVec<RGB8> {
let mut out = vec![RGB8::default(); width * height];
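    // Same BT.601 limited-range Q2.13 constants as the AVX2 path.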
    let y_coef: i16 = 9539;
    let cr_coef: i16 = 13075;
    let cb_coef: i16 = 16525;
    let g_coef_1: i16 = 6660;
    let g_coef_2: i16 = 3209;
let y_bias: i16 = 16;
let uv_bias: i16 = 128;
for y in (0..height).step_by(2) {
let y0_row = y;
let y1_row = (y + 1).min(height - 1);
let chroma_row = y / 2;
for x in (0..width).step_by(16) {
let pixels_remaining = (width - x).min(16);
            // Scalar fallback: narrow tail chunks, plus the unpaired final row
            // of odd-height frames (y0_row == y1_row); mirrors the AVX2 path.
            if pixels_remaining < 16 || y0_row == y1_row {
                for i in 0..pixels_remaining {
                    for row in y0_row..=y1_row {
let px = x + i;
let chroma_x = px / 2;
let y_val = y_plane[row * y_stride + px] as i32 - y_bias as i32;
let u_val =
u_plane[chroma_row * u_stride + chroma_x] as i32 - uv_bias as i32;
let v_val =
v_plane[chroma_row * v_stride + chroma_x] as i32 - uv_bias as i32;
let y_scaled = (y_val * y_coef as i32) >> 13;
let r = y_scaled + ((v_val * cr_coef as i32) >> 13);
let g =
y_scaled - ((v_val * g_coef_1 as i32 + u_val * g_coef_2 as i32) >> 13);
let b = y_scaled + ((u_val * cb_coef as i32) >> 13);
out[row * width + px] = RGB8 {
r: r.clamp(0, 255) as u8,
g: g.clamp(0, 255) as u8,
b: b.clamp(0, 255) as u8,
};
}
}
continue;
}
let split_point = y1_row * width;
let (top_rows, bottom_rows) = out.split_at_mut(split_point);
let row0_out = &mut top_rows[y0_row * width + x..];
let row1_out = &mut bottom_rows[x..];
process_16_pixels_420_neon(
token,
&y_plane[y0_row * y_stride + x..],
&y_plane[y1_row * y_stride + x..],
&u_plane[chroma_row * u_stride + x / 2..],
&v_plane[chroma_row * v_stride + x / 2..],
row0_out,
row1_out,
y_coef,
cr_coef,
cb_coef,
g_coef_1,
g_coef_2,
y_bias,
uv_bias,
);
}
}
ImgVec::new(out, width, height)
}
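/// NEON analogue of `process_32_pixels_420`: converts a 16-pixel strip of two
/// luma rows sharing one chroma row. The `safe_unaligned_simd` loads panic on
/// undersized slices.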
#[cfg(target_arch = "aarch64")]
#[rite]
fn process_16_pixels_420_neon(
_token: NeonToken,
y0: &[u8],
y1: &[u8],
u: &[u8],
v: &[u8],
out0: &mut [RGB8],
out1: &mut [RGB8],
y_coef: i16,
cr_coef: i16,
cb_coef: i16,
g_coef_1: i16,
g_coef_2: i16,
y_bias: i16,
uv_bias: i16,
) {
use core::arch::aarch64::*;
let out0 = &mut out0[..16];
let out1 = &mut out1[..16];
let y0_vals = safe_unaligned_simd::aarch64::vld1q_u8(y0[..16].try_into().unwrap());
let y1_vals = safe_unaligned_simd::aarch64::vld1q_u8(y1[..16].try_into().unwrap());
let u_vals = safe_unaligned_simd::aarch64::vld1_u8(u[..8].try_into().unwrap());
let v_vals = safe_unaligned_simd::aarch64::vld1_u8(v[..8].try_into().unwrap());
let y_corr = vdupq_n_u8(y_bias as u8);
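    // 128 in the widened (b << 2) | (b >> 6) form, as in the AVX2 path.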
let uv_corr = vdupq_n_s16((uv_bias << 2) | (uv_bias >> 6));
let v_y_coef = vdupq_n_s16(y_coef);
let v_cr_coef = vdupq_n_s16(cr_coef);
let v_cb_coef = vdupq_n_s16(cb_coef);
let v_g_coef_1 = vdupq_n_s16(g_coef_1);
let v_g_coef_2 = vdupq_n_s16(g_coef_2);
let y0_sub = vqsubq_u8(y0_vals, y_corr);
let y1_sub = vqsubq_u8(y1_vals, y_corr);
let u_expanded_lo = vzip1_u8(u_vals, u_vals);
let u_expanded_hi = vzip2_u8(u_vals, u_vals);
let u_expanded = vcombine_u8(u_expanded_lo, u_expanded_hi);
let v_expanded_lo = vzip1_u8(v_vals, v_vals);
let v_expanded_hi = vzip2_u8(v_vals, v_vals);
let v_expanded = vcombine_u8(v_expanded_lo, v_expanded_hi);
let y0_lo = expand_u8_to_i16_lo_neon(y0_sub);
let y0_hi = expand_u8_to_i16_hi_neon(y0_sub);
let y1_lo = expand_u8_to_i16_lo_neon(y1_sub);
let y1_hi = expand_u8_to_i16_hi_neon(y1_sub);
let u_lo = expand_u8_to_i16_lo_neon(u_expanded);
let u_hi = expand_u8_to_i16_hi_neon(u_expanded);
let v_lo = expand_u8_to_i16_lo_neon(v_expanded);
let v_hi = expand_u8_to_i16_hi_neon(v_expanded);
let u_lo = vsubq_s16(u_lo, uv_corr);
let u_hi = vsubq_s16(u_hi, uv_corr);
let v_lo = vsubq_s16(v_lo, uv_corr);
let v_hi = vsubq_s16(v_hi, uv_corr);
let (r0_lo, g0_lo, b0_lo) = yuv_to_rgb_i16_neon(
y0_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r0_hi, g0_hi, b0_hi) = yuv_to_rgb_i16_neon(
y0_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r1_lo, g1_lo, b1_lo) = yuv_to_rgb_i16_neon(
y1_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let (r1_hi, g1_hi, b1_hi) = yuv_to_rgb_i16_neon(
y1_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
);
let r0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
let g0 = vcombine_u8(vqmovun_s16(g0_lo), vqmovun_s16(g0_hi));
let b0 = vcombine_u8(vqmovun_s16(b0_lo), vqmovun_s16(b0_hi));
let r1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
let g1 = vcombine_u8(vqmovun_s16(g1_lo), vqmovun_s16(g1_hi));
let b1 = vcombine_u8(vqmovun_s16(b1_lo), vqmovun_s16(b1_hi));
store_rgb_row_neon(out0, r0, g0, b0);
store_rgb_row_neon(out1, r1, g1, b1);
}
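/// Widens the low 8 bytes to the same `(b * 257) >> 6` fixed-point form as
/// the AVX2 helper: zipping a vector with itself duplicates each byte into
/// both halves of a 16-bit lane.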
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn expand_u8_to_i16_lo_neon(v: uint8x16_t) -> int16x8_t {
let lo = vget_low_u8(v);
let dup = vzip1q_u8(vcombine_u8(lo, lo), vcombine_u8(lo, lo));
let as_u16 = vreinterpretq_u16_u8(dup);
vreinterpretq_s16_u16(vshrq_n_u16::<6>(as_u16))
}
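/// High-half counterpart of `expand_u8_to_i16_lo_neon`.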
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn expand_u8_to_i16_hi_neon(v: uint8x16_t) -> int16x8_t {
let hi = vget_high_u8(v);
let dup = vzip1q_u8(vcombine_u8(hi, hi), vcombine_u8(hi, hi));
let as_u16 = vreinterpretq_u16_u8(dup);
vreinterpretq_s16_u16(vshrq_n_u16::<6>(as_u16))
}
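/// `vqrdmulhq_s16` computes `round(a * b / 2^15)` (saturating), the NEON
/// counterpart of `_mm256_mulhrs_epi16`, so the arithmetic matches the AVX2
/// path.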
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn yuv_to_rgb_i16_neon(
y: int16x8_t,
u: int16x8_t,
v: int16x8_t,
y_coef: int16x8_t,
cr_coef: int16x8_t,
cb_coef: int16x8_t,
g_coef_1: int16x8_t,
g_coef_2: int16x8_t,
) -> (int16x8_t, int16x8_t, int16x8_t) {
let y_scaled = vqrdmulhq_s16(y, y_coef);
let v_cr = vqrdmulhq_s16(v, cr_coef);
let u_cb = vqrdmulhq_s16(u, cb_coef);
let v_g = vqrdmulhq_s16(v, g_coef_1);
let u_g = vqrdmulhq_s16(u, g_coef_2);
let r = vaddq_s16(y_scaled, v_cr);
let b = vaddq_s16(y_scaled, u_cb);
let g = vsubq_s16(y_scaled, vaddq_s16(v_g, u_g));
(r, g, b)
}
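/// Stores the three channel vectors to stack arrays and interleaves scalarly,
/// mirroring the AVX2 `store_rgb_row`.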
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn store_rgb_row_neon(out: &mut [RGB8], r: uint8x16_t, g: uint8x16_t, b: uint8x16_t) {
let mut r_arr = [0u8; 16];
let mut g_arr = [0u8; 16];
let mut b_arr = [0u8; 16];
safe_unaligned_simd::aarch64::vst1q_u8(&mut r_arr, r);
safe_unaligned_simd::aarch64::vst1q_u8(&mut g_arr, g);
safe_unaligned_simd::aarch64::vst1q_u8(&mut b_arr, b);
for i in 0..16 {
out[i] = RGB8 {
r: r_arr[i],
g: g_arr[i],
b: b_arr[i],
};
}
}