#![forbid(unsafe_code)]
#![allow(clippy::too_many_arguments)]
#![allow(dead_code)]
use crate::yuv_convert::{YuvMatrix, YuvRange};
use archmage::prelude::*; use imgref::ImgVec;
use rgb::RGB8;
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64::_mm_loadl_epi64;
// Fixed-point coefficients for the YUV -> RGB conversion in this file.
// Every channel sum is shifted right by 6 at the end, so the chroma weights
// and the biases carry 6 fractional bits (scale factor 64).
// NOTE(review): the values look like libyuv's BT.709 constant set — confirm the origin.
// NOTE(review): a luma gain of ~1.164 combined with a negative luma bias looks like
// limited-range (16..235) scaling, yet the converters in this file only accept
// `YuvRange::Full` — confirm which range is actually intended.
/// Luma gain. Applied to `y * 0x0101` and followed by `>> 16`, so the
/// effective gain after the final `>> 6` is `0x0101 * YG / 65536 / 64`, about 1.164.
const YG: i32 = 18997;
/// Luma bias, folded into each channel bias below (`-1160 / 64` is about -18.1).
const YGB: i32 = -1160;
/// U weight for blue. The product is subtracted, so the negative value adds `u * 128`.
const UB: i32 = -128;
/// U weight for green (subtracted).
const UG: i32 = 14;
/// V weight for green (subtracted).
const VG: i32 = 34;
/// V weight for red. The product is subtracted, so the negative value adds `v * 115`.
const VR: i32 = -115;
// Per-channel biases. Multiplying the chroma weights by 128 re-centres u and v
// around 128: `-(u * UB) + BB == -((u - 128) * UB) + YGB`, and likewise for G and R.
/// Blue bias.
const BB: i32 = UB * 128 + YGB;
/// Green bias.
const BG: i32 = UG * 128 + VG * 128 + YGB;
/// Red bias.
const BR: i32 = VR * 128 + YGB;
/// Scalar reference conversion of a single `(y, u, v)` sample to [`RGB8`].
///
/// Uses the same fixed-point equations as the SIMD kernels in this file:
/// the luma term is `(y * 0x0101 * YG) >> 16`, each channel adds its bias and
/// subtracts its weighted chroma, and the sum is shifted right by 6 and
/// clamped to `0..=255`.
#[inline(always)]
fn yuv_pixel(y: u8, u: u8, v: u8) -> RGB8 {
    let cb = i32::from(u);
    let cr = i32::from(v);

    // `y * 0x0101` replicates the byte into 16 bits; the product with YG
    // stays below 2^31 for every 8-bit input, so the cast back to i32 is lossless.
    let luma = ((u32::from(y) * 0x0101 * (YG as u32)) >> 16) as i32;

    // Drop the 6 fractional bits, then saturate to the u8 range.
    let finish = |sum: i32| (sum >> 6).clamp(0, 255) as u8;

    RGB8 {
        r: finish(luma + BR - cr * VR),
        g: finish(luma + BG - (cb * UG + cr * VG)),
        b: finish(luma + BB - cb * UB),
    }
}
/// Converts a planar YUV 4:2:0 image to packed RGB8 using the AVX2 kernel.
///
/// Each row is converted in blocks of 8 pixels by `process_8_pixels_avx2`;
/// the remaining `width % 8` pixels of the row go through the scalar
/// `yuv_pixel`. Every chroma sample is shared by a 2x2 block of luma samples
/// (`row / 2`, `x / 2`), with no interpolation.
///
/// # Parameters
/// * `token` - capability token forwarded to the AVX2 kernel.
/// * `y_plane`, `u_plane`, `v_plane` - sample planes; the chroma planes need
///   `(height + 1) / 2` rows holding at least `(width + 1) / 2` samples each.
/// * `y_stride`, `u_stride`, `v_stride` - distance in samples between the
///   starts of consecutive rows of the respective plane.
/// * `width`, `height` - image size in pixels.
/// * `range`, `matrix` - only `(YuvRange::Full, YuvMatrix::Bt709)` is handled.
///
/// # Returns
/// `None` for any other range/matrix combination, otherwise the converted
/// image with a stride equal to `width`.
///
/// # Panics
/// Panics if a plane is shorter than the strides and dimensions require
/// (plain slice indexing is used throughout).
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn yuv420_to_rgb8_simd(
    token: Desktop64,
    y_plane: &[u8],
    y_stride: usize,
    u_plane: &[u8],
    u_stride: usize,
    v_plane: &[u8],
    v_stride: usize,
    width: usize,
    height: usize,
    range: YuvRange,
    matrix: YuvMatrix,
) -> Option<ImgVec<RGB8>> {
    if !matches!((range, matrix), (YuvRange::Full, YuvMatrix::Bt709)) {
        return None;
    }

    let mut out = vec![RGB8::default(); width * height];

    // Columns [0, simd_width) are covered by whole 8-pixel blocks.
    let simd_width = width - width % 8;

    // Walk every row exactly once. The previous pairing of rows clamped the
    // second row index to `height - 1`, which made its `< height` guards
    // always true and converted the last row twice whenever `height` was odd.
    for row in 0..height {
        let chroma_row = row / 2;
        let y_base = row * y_stride;
        let u_base = chroma_row * u_stride;
        let v_base = chroma_row * v_stride;
        let out_base = row * width;

        for x in (0..simd_width).step_by(8) {
            process_8_pixels_avx2(
                token,
                &y_plane[y_base + x..],
                &u_plane[u_base + x / 2..],
                &v_plane[v_base + x / 2..],
                &mut out[out_base + x..],
            );
        }

        // Scalar tail for the last `width % 8` pixels of the row.
        for x in simd_width..width {
            let chroma_x = x / 2;
            out[out_base + x] = yuv_pixel(
                y_plane[y_base + x],
                u_plane[u_base + chroma_x],
                v_plane[v_base + chroma_x],
            );
        }
    }

    Some(ImgVec::new(out, width, height))
}
/// Converts 8 horizontally adjacent pixels of one row with AVX2.
///
/// Reads `y[..8]`, `u[..4]` and `v[..4]` (each chroma sample covers two
/// neighbouring pixels) and writes `out[..8]`. Panics if any slice is shorter
/// than that. The arithmetic mirrors the scalar `yuv_pixel`, one pixel per
/// 32-bit lane.
///
/// `_token` is not read in the body; presumably `#[rite]` uses it as proof
/// that the required CPU features are available — confirm against archmage docs.
#[cfg(target_arch = "x86_64")]
#[rite]
fn process_8_pixels_avx2(
_token: Desktop64, y: &[u8],
u: &[u8],
v: &[u8],
out: &mut [RGB8],
) {
// Broadcast every coefficient into all eight 32-bit lanes.
let yg_vec = _mm256_set1_epi32(YG);
let ub_vec = _mm256_set1_epi32(UB);
let ug_vec = _mm256_set1_epi32(UG);
let vg_vec = _mm256_set1_epi32(VG);
let vr_vec = _mm256_set1_epi32(VR);
let bb_vec = _mm256_set1_epi32(BB);
let bg_vec = _mm256_set1_epi32(BG);
let br_vec = _mm256_set1_epi32(BR);
let c0x0101 = _mm256_set1_epi32(0x0101);
// The load helper is handed a reference to a 16-byte array, so the 8 luma
// bytes are staged in a zero-padded buffer; only the low 64 bits are loaded.
let mut y_padded = [0u8; 16];
y_padded[..8].copy_from_slice(&y[..8]);
// Zero-extend the 8 luma bytes to eight i32 lanes.
let y_vals = _mm_loadl_epi64(&y_padded); let y_8xi32 = _mm256_cvtepu8_epi32(y_vals);
// The slices are exactly 4 long, so these conversions cannot fail.
let u_arr: &[u8; 4] = (&u[..4]).try_into().unwrap();
let v_arr: &[u8; 4] = (&v[..4]).try_into().unwrap();
// Move the 4 chroma bytes into the low 32 bits of a vector register.
let u_vals_4 = _mm_cvtsi32_si128(u32::from_le_bytes(*u_arr) as i32);
let v_vals_4 = _mm_cvtsi32_si128(u32::from_le_bytes(*v_arr) as i32);
// Interleaving a register with itself duplicates each byte:
// c0 c0 c1 c1 c2 c2 c3 c3, i.e. one chroma sample per pixel pair.
let u_dup = _mm_unpacklo_epi8(u_vals_4, u_vals_4);
let v_dup = _mm_unpacklo_epi8(v_vals_4, v_vals_4);
let u_8xi32 = _mm256_cvtepu8_epi32(u_dup);
let v_8xi32 = _mm256_cvtepu8_epi32(v_dup);
// Luma term: (y * 0x0101 * YG) >> 16.
let y1 = _mm256_srai_epi32(
_mm256_mullo_epi32(_mm256_mullo_epi32(y_8xi32, c0x0101), yg_vec),
16,
);
// Blue: (y1 - u * UB + BB) >> 6.
let b_i32 = _mm256_srai_epi32(
_mm256_add_epi32(
_mm256_sub_epi32(y1, _mm256_mullo_epi32(u_8xi32, ub_vec)),
bb_vec,
),
6,
);
// Green: (y1 - (u * UG + v * VG) + BG) >> 6.
let g_i32 = _mm256_srai_epi32(
_mm256_add_epi32(
_mm256_sub_epi32(
y1,
_mm256_add_epi32(
_mm256_mullo_epi32(u_8xi32, ug_vec),
_mm256_mullo_epi32(v_8xi32, vg_vec),
),
),
bg_vec,
),
6,
);
// Red: (y1 - v * VR + BR) >> 6.
let r_i32 = _mm256_srai_epi32(
_mm256_add_epi32(
_mm256_sub_epi32(y1, _mm256_mullo_epi32(v_8xi32, vr_vec)),
br_vec,
),
6,
);
let zero = _mm256_setzero_si256();
// Narrow i32 -> i16 with signed saturation. The pack works per 128-bit lane,
// so the layout becomes [p0..p3, 0 x4 | p4..p7, 0 x4].
let r_i16_lane = _mm256_packs_epi32(r_i32, zero);
let g_i16_lane = _mm256_packs_epi32(g_i32, zero);
let b_i16_lane = _mm256_packs_epi32(b_i32, zero);
// Pull dwords 0,1 (p0..p3) and 4,5 (p4..p7) together so the eight i16
// values sit contiguously in the low 128 bits.
let perm = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
let r_i16 = _mm256_permutevar8x32_epi32(r_i16_lane, perm);
let g_i16 = _mm256_permutevar8x32_epi32(g_i16_lane, perm);
let b_i16 = _mm256_permutevar8x32_epi32(b_i16_lane, perm);
// Narrow i16 -> u8 with unsigned saturation; this is the clamp to 0..=255.
let r_u8 = _mm256_packus_epi16(r_i16, zero);
let g_u8 = _mm256_packus_epi16(g_i16, zero);
let b_u8 = _mm256_packus_epi16(b_i16, zero);
// The low 64 bits now hold the eight result bytes of each channel.
let r_64 = _mm256_extract_epi64(r_u8, 0);
let g_64 = _mm256_extract_epi64(g_u8, 0);
let b_64 = _mm256_extract_epi64(b_u8, 0);
// Byte `i` of each 64-bit word is the channel value of pixel `i`.
for (i, px) in out[..8].iter_mut().enumerate() {
*px = RGB8 {
r: ((r_64 >> (i * 8)) & 0xFF) as u8,
g: ((g_64 >> (i * 8)) & 0xFF) as u8,
b: ((b_64 >> (i * 8)) & 0xFF) as u8,
};
}
}
/// Converts a planar YUV 4:2:0 image to packed RGB8 using the NEON kernel.
///
/// Each row is converted in blocks of 8 pixels by `process_8_pixels_neon`;
/// the remaining `width % 8` pixels of the row go through the scalar
/// `yuv_pixel`. Every chroma sample is shared by a 2x2 block of luma samples
/// (`row / 2`, `x / 2`), with no interpolation.
///
/// # Parameters
/// * `token` - capability token forwarded to the NEON kernel.
/// * `y_plane`, `u_plane`, `v_plane` - sample planes; the chroma planes need
///   `(height + 1) / 2` rows holding at least `(width + 1) / 2` samples each.
/// * `y_stride`, `u_stride`, `v_stride` - distance in samples between the
///   starts of consecutive rows of the respective plane.
/// * `width`, `height` - image size in pixels.
/// * `range`, `matrix` - only `(YuvRange::Full, YuvMatrix::Bt709)` is handled.
///
/// # Returns
/// `None` for any other range/matrix combination, otherwise the converted
/// image with a stride equal to `width`.
///
/// # Panics
/// Panics if a plane is shorter than the strides and dimensions require
/// (plain slice indexing is used throughout).
#[cfg(target_arch = "aarch64")]
#[arcane]
pub fn yuv420_to_rgb8_simd_neon(
    token: NeonToken,
    y_plane: &[u8],
    y_stride: usize,
    u_plane: &[u8],
    u_stride: usize,
    v_plane: &[u8],
    v_stride: usize,
    width: usize,
    height: usize,
    range: YuvRange,
    matrix: YuvMatrix,
) -> Option<ImgVec<RGB8>> {
    if !matches!((range, matrix), (YuvRange::Full, YuvMatrix::Bt709)) {
        return None;
    }

    let mut out = vec![RGB8::default(); width * height];

    // Columns [0, simd_width) are covered by whole 8-pixel blocks.
    let simd_width = width - width % 8;

    // Walk every row exactly once. The previous pairing of rows clamped the
    // second row index to `height - 1`, which made its `< height` guards
    // always true and converted the last row twice whenever `height` was odd.
    for row in 0..height {
        let chroma_row = row / 2;
        let y_base = row * y_stride;
        let u_base = chroma_row * u_stride;
        let v_base = chroma_row * v_stride;
        let out_base = row * width;

        for x in (0..simd_width).step_by(8) {
            process_8_pixels_neon(
                token,
                &y_plane[y_base + x..],
                &u_plane[u_base + x / 2..],
                &v_plane[v_base + x / 2..],
                &mut out[out_base + x..],
            );
        }

        // Scalar tail for the last `width % 8` pixels of the row.
        for x in simd_width..width {
            let chroma_x = x / 2;
            out[out_base + x] = yuv_pixel(
                y_plane[y_base + x],
                u_plane[u_base + chroma_x],
                v_plane[v_base + chroma_x],
            );
        }
    }

    Some(ImgVec::new(out, width, height))
}
/// Converts 8 horizontally adjacent pixels of one row with NEON.
///
/// Reads `y[..8]`, `u[..4]` and `v[..4]` (each chroma sample covers two
/// neighbouring pixels) and writes `out[..8]`. Panics if any slice is shorter
/// than that. The eight pixels are processed as two groups of four i32 lanes
/// through `yuv_to_rgb_neon_i32`.
#[cfg(target_arch = "aarch64")]
#[rite]
fn process_8_pixels_neon(_token: NeonToken, y: &[u8], u: &[u8], v: &[u8], out: &mut [RGB8]) {
// The slice is exactly 8 long, so the conversion cannot fail.
let y_arr: [u8; 8] = y[..8].try_into().unwrap();
// Widen luma u8 -> u16 -> u32, split into pixels 0..3 (lo) and 4..7 (hi).
let y_u8 = vld1_u8(&y_arr); let y_u16 = vmovl_u8(y_u8);
let y_lo_u16 = vget_low_u16(y_u16);
let y_hi_u16 = vget_high_u16(y_u16);
let y_lo = vreinterpretq_s32_u32(vmovl_u16(y_lo_u16));
let y_hi = vreinterpretq_s32_u32(vmovl_u16(y_hi_u16));
// Pack the 4 U bytes into one u32 and broadcast it, so the low four bytes
// of the 8-byte vector are u0..u3.
let u_arr: [u8; 4] = u[..4].try_into().unwrap();
let u_word = u32::from_le_bytes(u_arr);
let u_raw = vld1_dup_u32(&u_word); let u_u8 = vreinterpret_u8_u32(u_raw);
// Zipping the vector with itself duplicates each of the low four bytes:
// u0 u0 u1 u1 u2 u2 u3 u3, i.e. one chroma sample per pixel pair.
let u_dup = vzip1_u8(u_u8, u_u8); let u_u16 = vmovl_u8(u_dup);
let u_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_u16)));
let u_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(u_u16)));
// Same treatment for V.
let v_arr: [u8; 4] = v[..4].try_into().unwrap();
let v_word = u32::from_le_bytes(v_arr);
let v_raw = vld1_dup_u32(&v_word);
let v_u8 = vreinterpret_u8_u32(v_raw);
let v_dup = vzip1_u8(v_u8, v_u8);
let v_u16 = vmovl_u8(v_dup);
let v_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_u16)));
let v_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_u16)));
// Broadcast every coefficient into all four 32-bit lanes.
let yg_vec = vdupq_n_s32(YG);
let ub_vec = vdupq_n_s32(UB);
let ug_vec = vdupq_n_s32(UG);
let vg_vec = vdupq_n_s32(VG);
let vr_vec = vdupq_n_s32(VR);
let bb_vec = vdupq_n_s32(BB);
let bg_vec = vdupq_n_s32(BG);
let br_vec = vdupq_n_s32(BR);
let c0x0101 = vdupq_n_s32(0x0101);
// Pixels 0..3.
let (r_lo, g_lo, b_lo) = yuv_to_rgb_neon_i32(
_token, y_lo, u_lo, v_lo, yg_vec, c0x0101, ub_vec, ug_vec, vg_vec, vr_vec, bb_vec, bg_vec,
br_vec,
);
// Pixels 4..7.
let (r_hi, g_hi, b_hi) = yuv_to_rgb_neon_i32(
_token, y_hi, u_hi, v_hi, yg_vec, c0x0101, ub_vec, ug_vec, vg_vec, vr_vec, bb_vec, bg_vec,
br_vec,
);
// Narrow i32 -> i16 and join both halves into one 8-lane vector. This narrow
// truncates rather than saturates; presumably fine because the channel sums
// stay well inside the i16 range for 8-bit inputs — re-check if the
// coefficients ever change.
let r_i16_lo = vmovn_s32(r_lo);
let r_i16 = vmovn_high_s32(r_i16_lo, r_hi);
let g_i16_lo = vmovn_s32(g_lo);
let g_i16 = vmovn_high_s32(g_i16_lo, g_hi);
let b_i16_lo = vmovn_s32(b_lo);
let b_i16 = vmovn_high_s32(b_i16_lo, b_hi);
// Narrow i16 -> u8 with unsigned saturation; this is the clamp to 0..=255.
let r_u8 = vqmovun_s16(r_i16);
let g_u8 = vqmovun_s16(g_i16);
let b_u8 = vqmovun_s16(b_i16);
// Spill each channel to a small array, then interleave into RGB pixels.
let mut r_out = [0u8; 8];
let mut g_out = [0u8; 8];
let mut b_out = [0u8; 8];
vst1_u8(&mut r_out, r_u8);
vst1_u8(&mut g_out, g_u8);
vst1_u8(&mut b_out, b_u8);
for (i, px) in out[..8].iter_mut().enumerate() {
*px = RGB8 {
r: r_out[i],
g: g_out[i],
b: b_out[i],
};
}
}
/// Evaluates the fixed-point YUV -> RGB equations for four pixels at once.
///
/// `y`, `u` and `v` hold one sample per 32-bit lane; the remaining vector
/// arguments are the coefficients and biases, already broadcast to all lanes.
/// Returns `(r, g, b)` as i32 lanes after the final `>> 6`; the values are
/// not clamped here.
#[cfg(target_arch = "aarch64")]
#[rite]
fn yuv_to_rgb_neon_i32(
    _token: NeonToken,
    y: int32x4_t,
    u: int32x4_t,
    v: int32x4_t,
    yg: int32x4_t,
    c0x0101: int32x4_t,
    ub: int32x4_t,
    ug: int32x4_t,
    vg: int32x4_t,
    vr: int32x4_t,
    bb: int32x4_t,
    bg: int32x4_t,
    br: int32x4_t,
) -> (int32x4_t, int32x4_t, int32x4_t) {
    // Luma term: (y * 0x0101 * yg) >> 16.
    let widened_luma = vmulq_s32(y, c0x0101);
    let luma = vshrq_n_s32::<16>(vmulq_s32(widened_luma, yg));

    // Each channel: luma minus its weighted chroma, plus its bias.
    let red_sum = vaddq_s32(vsubq_s32(luma, vmulq_s32(v, vr)), br);
    let green_chroma = vaddq_s32(vmulq_s32(u, ug), vmulq_s32(v, vg));
    let green_sum = vaddq_s32(vsubq_s32(luma, green_chroma), bg);
    let blue_sum = vaddq_s32(vsubq_s32(luma, vmulq_s32(u, ub)), bb);

    // Drop the 6 fractional bits.
    (
        vshrq_n_s32::<6>(red_sum),
        vshrq_n_s32::<6>(green_sum),
        vshrq_n_s32::<6>(blue_sum),
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `convert` on two images and checks every output pixel against the
    /// scalar reference `yuv_pixel`.
    ///
    /// `convert` receives `(y_plane, y_stride, u_plane, u_stride, v_plane,
    /// v_stride, width, height)` and is expected to bind the capability token
    /// and the range/matrix itself.
    ///
    /// Cases:
    /// * flat 16x16 image, also pinned to the known result (230, 185, 135);
    /// * 19x5 image with varying samples, which exercises the scalar tail
    ///   (`width % 8 != 0`) and an odd height.
    fn check_converter<F>(convert: F)
    where
        F: Fn(&[u8], usize, &[u8], usize, &[u8], usize, usize, usize) -> Option<ImgVec<RGB8>>,
    {
        for &(width, height, flat) in &[(16usize, 16usize, true), (19, 5, false)] {
            // Chroma planes are half size, rounded up.
            let chroma_width = width / 2 + width % 2;
            let chroma_height = height / 2 + height % 2;

            let sample = |flat_value: u8, index: usize, step: usize, offset: usize| -> u8 {
                if flat {
                    flat_value
                } else {
                    ((index * step + offset) % 256) as u8
                }
            };
            let y_plane: Vec<u8> = (0..width * height)
                .map(|i| sample(180, i, 37, 5))
                .collect();
            let u_plane: Vec<u8> = (0..chroma_width * chroma_height)
                .map(|i| sample(100, i, 53, 17))
                .collect();
            let v_plane: Vec<u8> = (0..chroma_width * chroma_height)
                .map(|i| sample(150, i, 29, 91))
                .collect();

            let result = convert(
                &y_plane,
                width,
                &u_plane,
                chroma_width,
                &v_plane,
                chroma_width,
                width,
                height,
            )
            .expect("full-range BT.709 input must be accepted");

            assert_eq!(result.buf().len(), width * height);
            for (i, pixel) in result.buf().iter().enumerate() {
                if flat {
                    assert_eq!(pixel.r, 230, "R at {}", i);
                    assert_eq!(pixel.g, 185, "G at {}", i);
                    assert_eq!(pixel.b, 135, "B at {}", i);
                }
                let (row, col) = (i / width, i % width);
                let chroma = (row / 2) * chroma_width + col / 2;
                let expected = yuv_pixel(y_plane[i], u_plane[chroma], v_plane[chroma]);
                assert_eq!(
                    *pixel, expected,
                    "pixel {} (row {}, col {}) of {}x{} image",
                    i, row, col, width, height
                );
            }
        }
    }

    /// The SIMD converter available on this target must agree with the scalar
    /// path pixel for pixel. The body is skipped when the CPU lacks the
    /// required features (no token can be summoned).
    #[test]
    fn test_simd_matches_scalar() {
        #[cfg(target_arch = "x86_64")]
        if let Some(token) = Desktop64::summon() {
            check_converter(|y, y_stride, u, u_stride, v, v_stride, width, height| {
                yuv420_to_rgb8_simd(
                    token,
                    y,
                    y_stride,
                    u,
                    u_stride,
                    v,
                    v_stride,
                    width,
                    height,
                    YuvRange::Full,
                    YuvMatrix::Bt709,
                )
            });
        }
        #[cfg(target_arch = "aarch64")]
        if let Some(token) = NeonToken::summon() {
            check_converter(|y, y_stride, u, u_stride, v, v_stride, width, height| {
                yuv420_to_rgb8_simd_neon(
                    token,
                    y,
                    y_stride,
                    u,
                    u_stride,
                    v,
                    v_stride,
                    width,
                    height,
                    YuvRange::Full,
                    YuvMatrix::Bt709,
                )
            });
        }
    }
}