#![allow(dead_code)]
use archmage::autoversion;
use wide::f32x8;
#[cfg(target_arch = "x86_64")]
use archmage::{SimdToken, arcane, rite};
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
use core::arch::x86_64::{
__m128, __m128i, __m256, _mm_cvtepu8_epi32, _mm_fmadd_ps, _mm_loadu_si128, _mm_mul_ps,
_mm_set1_ps, _mm_setr_epi8, _mm_shuffle_epi8, _mm_storeu_ps,
};
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64 as safe_simd;
use crate::foundation::consts::{
YCBCR_B_TO_CB, YCBCR_B_TO_CR, YCBCR_B_TO_Y, YCBCR_G_TO_CB, YCBCR_G_TO_CR, YCBCR_G_TO_Y,
YCBCR_R_TO_CB, YCBCR_R_TO_CR, YCBCR_R_TO_Y,
};
#[inline(always)]
fn load_f32x8(slice: &[f32], offset: usize) -> f32x8 {
<[f32; 8]>::try_from(&slice[offset..offset + 8])
.unwrap()
.into()
}
/// Writes the eight lanes of `value` into `slice` starting at `offset`.
/// Panics if fewer than eight slots remain.
#[inline(always)]
fn store_f32x8(slice: &mut [f32], offset: usize, value: f32x8) {
    let dst = &mut slice[offset..offset + 8];
    dst.copy_from_slice(value.as_array());
}
/// Box-filters `plane` 2:1 in both dimensions: each output pixel is the
/// average of a 2x2 input block. Writes `new_width * new_height` values
/// into `result`. Odd `width`/`height` are handled by replicating the
/// last column/row (clamped indices), so every output still averages
/// four samples.
#[autoversion]
pub fn downsample_2x2_simd_inplace(plane: &[f32], width: usize, height: usize, result: &mut [f32]) {
    let new_width = (width + 1) / 2;
    let new_height = (height + 1) / 2;
    debug_assert!(result.len() >= new_width * new_height);
    let scale = f32x8::splat(0.25);
    // Count of 8-output SIMD chunks whose 16-float input window is fully
    // inside one row (conservative bound: chunk * 16 + 16 <= width).
    let safe_chunks = if width >= 16 { (width - 15) / 16 } else { 0 };
    for y in 0..new_height {
        let y0 = y * 2;
        // Clamp the second source row at the bottom edge (odd heights).
        let y1 = (y0 + 1).min(height - 1);
        let out_row_start = y * new_width;
        for chunk in 0..safe_chunks {
            let out_x = chunk * 8;
            let in_x = out_x * 2;
            let row0_idx = y0 * width + in_x;
            let row1_idx = y1 * width + in_x;
            // De-interleave 16 consecutive floats into even/odd lanes,
            // i.e. the left/right pixels of eight horizontal pairs.
            let (p00, p10) = gather_even_odd_x8(plane, row0_idx, width);
            let (p01, p11) = gather_even_odd_x8(plane, row1_idx, width);
            let sum = p00 + p10 + p01 + p11;
            let avg = sum * scale;
            store_f32x8(result, out_row_start + out_x, avg);
        }
        // Scalar tail for outputs past the SIMD-safe region, with
        // right-edge clamping for odd widths.
        for out_x in (safe_chunks * 8)..new_width {
            let x0 = out_x * 2;
            let x1 = (x0 + 1).min(width - 1);
            let p00 = plane[y0 * width + x0];
            let p10 = plane[y0 * width + x1];
            let p01 = plane[y1 * width + x0];
            let p11 = plane[y1 * width + x1];
            result[out_row_start + out_x] = (p00 + p10 + p01 + p11) * 0.25;
        }
    }
}
/// Horizontally downsamples `plane` by 2 (averaging adjacent column pairs);
/// row count is unchanged. Writes `new_width * height` values into `result`.
/// Odd widths replicate the last column via index clamping.
pub fn downsample_2x1_simd_inplace(plane: &[f32], width: usize, height: usize, result: &mut [f32]) {
    let new_width = (width + 1) / 2;
    debug_assert!(result.len() >= new_width * height);
    let scale = f32x8::splat(0.5);
    // 8-output chunks whose 16-float input window stays inside the row.
    let safe_chunks = if width >= 16 { (width - 15) / 16 } else { 0 };
    for y in 0..height {
        let out_row_start = y * new_width;
        let in_row_start = y * width;
        for chunk in 0..safe_chunks {
            let out_x = chunk * 8;
            let in_x = out_x * 2;
            // Even lanes = left pixel of each pair, odd lanes = right pixel.
            let (p0, p1) = gather_even_odd_x8(plane, in_row_start + in_x, width);
            let avg = (p0 + p1) * scale;
            store_f32x8(result, out_row_start + out_x, avg);
        }
        // Scalar tail with right-edge clamping.
        for out_x in (safe_chunks * 8)..new_width {
            let x0 = out_x * 2;
            let x1 = (x0 + 1).min(width - 1);
            let p0 = plane[in_row_start + x0];
            let p1 = plane[in_row_start + x1];
            result[out_row_start + out_x] = (p0 + p1) * 0.5;
        }
    }
}
/// Vertically downsamples `plane` by 2 (averaging adjacent row pairs);
/// column count is unchanged. Writes `width * new_height` values into
/// `result`. Odd heights replicate the bottom row via index clamping.
pub fn downsample_1x2_simd_inplace(plane: &[f32], width: usize, height: usize, result: &mut [f32]) {
    let new_height = (height + 1) / 2;
    debug_assert!(result.len() >= width * new_height);
    let half = f32x8::splat(0.5);
    let vec_width = (width / 8) * 8;
    for out_y in 0..new_height {
        let top = out_y * 2;
        // Clamp the second source row at the bottom edge (odd heights).
        let bot = (top + 1).min(height - 1);
        let dst_row = out_y * width;
        let mut x = 0;
        // Vectorized span: average eight column values from both rows.
        while x < vec_width {
            let upper = load_f32x8(plane, top * width + x);
            let lower = load_f32x8(plane, bot * width + x);
            store_f32x8(result, dst_row + x, (upper + lower) * half);
            x += 8;
        }
        // Scalar remainder (fewer than eight columns left).
        for x in vec_width..width {
            let upper = plane[top * width + x];
            let lower = plane[bot * width + x];
            result[dst_row + x] = (upper + lower) * 0.5;
        }
    }
}
/// Test-only reference implementation: splits the first 16 floats of `data`
/// into even-indexed and odd-indexed halves.
#[cfg(test)]
#[inline]
fn gather_even_odd_scalar(data: &[f32]) -> ([f32; 8], [f32; 8]) {
    debug_assert!(data.len() >= 16);
    let mut evens = [0.0f32; 8];
    let mut odds = [0.0f32; 8];
    for lane in 0..8 {
        evens[lane] = data[lane * 2];
        odds[lane] = data[lane * 2 + 1];
    }
    (evens, odds)
}
/// AVX2 de-interleave: splits 16 consecutive floats into two vectors of the
/// even-indexed and odd-indexed elements. Requires an `X64V3Token` proving
/// AVX2 availability (consumed by the `#[arcane]` wrapper).
#[cfg(target_arch = "x86_64")]
#[arcane]
#[inline]
fn gather_even_odd_x8_avx2(_token: archmage::X64V3Token, data: &[f32; 16]) -> (f32x8, f32x8) {
    use std::arch::x86_64::*;
    let lo = safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&data[..8]).unwrap());
    let hi = safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&data[8..]).unwrap());
    // 0x88 selects elements {0,2} of each source within each 128-bit lane,
    // gathering the even-indexed floats (in cross-lane-scrambled order).
    let v2020 = _mm256_shuffle_ps(lo, hi, 0x88);
    // permute4x64 with 0xD8 (= lane order [0,2,1,3]) repairs the cross-lane
    // ordering left by the in-lane shuffle.
    let evens_raw = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(v2020), 0xD8));
    // 0xDD selects elements {1,3}, gathering the odd-indexed floats.
    let v3131 = _mm256_shuffle_ps(lo, hi, 0xDD);
    let odds_raw = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(v3131), 0xD8));
    (
        bytemuck::cast::<__m256, f32x8>(evens_raw),
        bytemuck::cast::<__m256, f32x8>(odds_raw),
    )
}
/// Portable de-interleave of the first 16 floats of `slice` into
/// (even-indexed, odd-indexed) vectors. Panics if `slice.len() < 16`.
#[inline(always)]
fn gather_even_odd_x8_scalar(slice: &[f32]) -> (f32x8, f32x8) {
    let mut evens = [0.0f32; 8];
    let mut odds = [0.0f32; 8];
    for (lane, pair) in slice[..16].chunks_exact(2).enumerate() {
        evens[lane] = pair[0];
        odds[lane] = pair[1];
    }
    (f32x8::from(evens), f32x8::from(odds))
}
/// De-interleave fallback for windows that run past the end of `plane`:
/// out-of-range elements are replaced with the last element of `plane`
/// (edge replication). Panics if `plane` is empty.
#[inline(always)]
fn gather_even_odd_x8_boundary(plane: &[f32], start_idx: usize) -> (f32x8, f32x8) {
    let last = plane.len() - 1;
    // Clamp any index past the end to the final element.
    let fetch = |offset: usize| plane[(start_idx + offset).min(last)];
    let mut evens = [0.0f32; 8];
    let mut odds = [0.0f32; 8];
    for lane in 0..8 {
        evens[lane] = fetch(lane * 2);
        odds[lane] = fetch(lane * 2 + 1);
    }
    (f32x8::from(evens), f32x8::from(odds))
}
/// De-interleaves 16 floats at `plane[start_idx..]` into (evens, odds).
/// Dispatches to the AVX2 kernel when the CPU supports it, otherwise to the
/// portable version; windows running past the end of `plane` take the
/// edge-replicating boundary path. `_width` is accepted for call-site
/// symmetry but unused.
#[inline(always)]
fn gather_even_odd_x8(plane: &[f32], start_idx: usize, _width: usize) -> (f32x8, f32x8) {
    let end = start_idx + 16;
    if end > plane.len() {
        return gather_even_odd_x8_boundary(plane, start_idx);
    }
    let window = &plane[start_idx..end];
    #[cfg(target_arch = "x86_64")]
    {
        match archmage::X64V3Token::summon() {
            Some(token) => gather_even_odd_x8_avx2(token, window.try_into().unwrap()),
            None => gather_even_odd_x8_scalar(window),
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        gather_even_odd_x8_scalar(window)
    }
}
/// SSSE3 shuffle extracting the R bytes of four packed RGB24 pixels into the
/// low 4 bytes of the result; remaining bytes are zeroed (-1 mask entries).
#[cfg(target_arch = "x86_64")]
#[rite]
fn extract_r_ssse3(_token: archmage::X64V3Token, rgb: __m128i) -> __m128i {
    // R bytes live at offsets 0, 3, 6, 9 within the 12 bytes of 4 pixels.
    let mask = _mm_setr_epi8(0, 3, 6, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    _mm_shuffle_epi8(rgb, mask)
}
/// SSSE3 shuffle extracting the G bytes of four packed RGB24 pixels into the
/// low 4 bytes of the result; remaining bytes are zeroed (-1 mask entries).
#[cfg(target_arch = "x86_64")]
#[rite]
fn extract_g_ssse3(_token: archmage::X64V3Token, rgb: __m128i) -> __m128i {
    // G bytes live at offsets 1, 4, 7, 10 within the 12 bytes of 4 pixels.
    let mask = _mm_setr_epi8(1, 4, 7, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    _mm_shuffle_epi8(rgb, mask)
}
/// SSSE3 shuffle extracting the B bytes of four packed RGB24 pixels into the
/// low 4 bytes of the result; remaining bytes are zeroed (-1 mask entries).
#[cfg(target_arch = "x86_64")]
#[rite]
fn extract_b_ssse3(_token: archmage::X64V3Token, rgb: __m128i) -> __m128i {
    // B bytes live at offsets 2, 5, 8, 11 within the 12 bytes of 4 pixels.
    let mask = _mm_setr_epi8(2, 5, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    _mm_shuffle_epi8(rgb, mask)
}
/// Widens the low four `u8` values of `v` to `i32` (SSE4.1 zero-extend) and
/// converts them to four `f32` values.
#[cfg(target_arch = "x86_64")]
#[rite]
fn u8x4_to_f32x4_sse41(_token: archmage::X64V3Token, v: __m128i) -> __m128 {
    use core::arch::x86_64::_mm_cvtepi32_ps;
    let i32_vec = _mm_cvtepu8_epi32(v);
    _mm_cvtepi32_ps(i32_vec)
}
/// Converts eight packed RGB24 pixels to planar Y/Cb/Cr using SSSE3 shuffles
/// plus FMA. Cb/Cr are offset by +128 (JPEG convention).
///
/// IMPORTANT: this reads `rgb_data[..28]` — the second 16-byte load starts
/// at byte offset 12, so 28 bytes must be available even though 8 pixels
/// only occupy 24. The function panics (slice bounds) if
/// `rgb_data.len() < 28`; callers must guarantee the 4 slack bytes.
#[cfg(target_arch = "x86_64")]
#[arcane]
#[inline]
pub(crate) fn rgb_to_ycbcr_8px_fma(
    _token: archmage::X64V3Token,
    rgb_data: &[u8],
    y_out: &mut [f32; 8],
    cb_out: &mut [f32; 8],
    cr_out: &mut [f32; 8],
) {
    // rgb0 covers pixels 0..4 (bytes 0..12 of 0..16); rgb1 covers pixels
    // 4..8 (bytes 12..24 of 12..28). Each load carries 4 unused tail bytes.
    let rgb0 = safe_simd::_mm_loadu_si128(<&[u8; 16]>::try_from(&rgb_data[..16]).unwrap());
    let rgb1 = safe_simd::_mm_loadu_si128(<&[u8; 16]>::try_from(&rgb_data[12..28]).unwrap());
    let r0_bytes = extract_r_ssse3(_token, rgb0);
    let g0_bytes = extract_g_ssse3(_token, rgb0);
    let b0_bytes = extract_b_ssse3(_token, rgb0);
    let r1_bytes = extract_r_ssse3(_token, rgb1);
    let g1_bytes = extract_g_ssse3(_token, rgb1);
    let b1_bytes = extract_b_ssse3(_token, rgb1);
    // Widen each 4-byte channel group to four f32 values.
    let r0: __m128 = u8x4_to_f32x4_sse41(_token, r0_bytes);
    let g0: __m128 = u8x4_to_f32x4_sse41(_token, g0_bytes);
    let b0: __m128 = u8x4_to_f32x4_sse41(_token, b0_bytes);
    let r1: __m128 = u8x4_to_f32x4_sse41(_token, r1_bytes);
    let g1: __m128 = u8x4_to_f32x4_sse41(_token, g1_bytes);
    let b1: __m128 = u8x4_to_f32x4_sse41(_token, b1_bytes);
    let r_to_y = _mm_set1_ps(YCBCR_R_TO_Y);
    let g_to_y = _mm_set1_ps(YCBCR_G_TO_Y);
    let b_to_y = _mm_set1_ps(YCBCR_B_TO_Y);
    let r_to_cb = _mm_set1_ps(YCBCR_R_TO_CB);
    let g_to_cb = _mm_set1_ps(YCBCR_G_TO_CB);
    let b_to_cb = _mm_set1_ps(YCBCR_B_TO_CB);
    let r_to_cr = _mm_set1_ps(YCBCR_R_TO_CR);
    let g_to_cr = _mm_set1_ps(YCBCR_G_TO_CR);
    let b_to_cr = _mm_set1_ps(YCBCR_B_TO_CR);
    let offset_128 = _mm_set1_ps(128.0);
    // y = r*Ry + g*Gy + b*By; cb/cr additionally fold in the +128 offset.
    let y0 = _mm_fmadd_ps(b0, b_to_y, _mm_fmadd_ps(g0, g_to_y, _mm_mul_ps(r0, r_to_y)));
    let cb0 = _mm_fmadd_ps(
        b0,
        b_to_cb,
        _mm_fmadd_ps(g0, g_to_cb, _mm_fmadd_ps(r0, r_to_cb, offset_128)),
    );
    let cr0 = _mm_fmadd_ps(
        b0,
        b_to_cr,
        _mm_fmadd_ps(g0, g_to_cr, _mm_fmadd_ps(r0, r_to_cr, offset_128)),
    );
    let y1 = _mm_fmadd_ps(b1, b_to_y, _mm_fmadd_ps(g1, g_to_y, _mm_mul_ps(r1, r_to_y)));
    let cb1 = _mm_fmadd_ps(
        b1,
        b_to_cb,
        _mm_fmadd_ps(g1, g_to_cb, _mm_fmadd_ps(r1, r_to_cb, offset_128)),
    );
    let cr1 = _mm_fmadd_ps(
        b1,
        b_to_cr,
        _mm_fmadd_ps(g1, g_to_cr, _mm_fmadd_ps(r1, r_to_cr, offset_128)),
    );
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut y_out[..4]).unwrap(), y0);
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut y_out[4..]).unwrap(), y1);
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut cb_out[..4]).unwrap(), cb0);
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut cb_out[4..]).unwrap(), cb1);
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut cr_out[..4]).unwrap(), cr0);
    safe_simd::_mm_storeu_ps(<&mut [f32; 4]>::try_from(&mut cr_out[4..]).unwrap(), cr1);
}
/// Test-only scalar reference for the RGB24 → planar YCbCr conversion.
/// Uses the same `mul_add` nesting as the SIMD kernels so results are
/// bit-comparable up to FMA rounding.
#[cfg(all(test, target_arch = "x86_64"))]
fn rgb_to_ycbcr_scalar(
    rgb_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    for px in 0..num_pixels {
        let base = px * 3;
        let r = rgb_data[base] as f32;
        let g = rgb_data[base + 1] as f32;
        let b = rgb_data[base + 2] as f32;
        y_plane[px] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
        cb_plane[px] =
            YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
        cr_plane[px] =
            YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    }
}
/// Converts packed RGB24 pixels to planar Y/Cb/Cr `f32` values
/// (`num_pixels` entries per plane; Cb/Cr offset by +128).
///
/// Uses the AVX2+FMA 8-pixel kernel when the CPU supports it, otherwise
/// the portable `wide`-based fallback.
#[inline]
pub fn rgb_to_ycbcr_planes_simd_inplace(
    rgb_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    debug_assert!(rgb_data.len() >= num_pixels * 3);
    debug_assert!(y_plane.len() >= num_pixels);
    debug_assert!(cb_plane.len() >= num_pixels);
    debug_assert!(cr_plane.len() >= num_pixels);
    #[cfg(target_arch = "x86_64")]
    {
        if let Some(token) = archmage::X64V3Token::summon() {
            // `rgb_to_ycbcr_8px_fma` loads two 16-byte vectors at offsets 0
            // and 12, so each chunk consumes 28 bytes even though 8 pixels
            // only own 24. If `rgb_data` is exactly `num_pixels * 3` bytes
            // long, the final SIMD chunk would read 4 bytes past the end and
            // panic — hand that chunk to the scalar tail instead.
            let mut chunks = num_pixels / 8;
            while chunks > 0 && (chunks - 1) * 24 + 28 > rgb_data.len() {
                chunks -= 1;
            }
            for chunk in 0..chunks {
                let pixel_idx = chunk * 8;
                let rgb_idx = pixel_idx * 3;
                rgb_to_ycbcr_8px_fma(
                    token,
                    &rgb_data[rgb_idx..],
                    <&mut [f32; 8]>::try_from(&mut y_plane[pixel_idx..pixel_idx + 8]).unwrap(),
                    <&mut [f32; 8]>::try_from(&mut cb_plane[pixel_idx..pixel_idx + 8]).unwrap(),
                    <&mut [f32; 8]>::try_from(&mut cr_plane[pixel_idx..pixel_idx + 8]).unwrap(),
                );
            }
            // Scalar tail: leftover pixels, including any final chunk that
            // was backed off above to avoid the 4-byte overread.
            for i in (chunks * 8)..num_pixels {
                let rgb_idx = i * 3;
                let r = rgb_data[rgb_idx] as f32;
                let g = rgb_data[rgb_idx + 1] as f32;
                let b = rgb_data[rgb_idx + 2] as f32;
                y_plane[i] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
                cb_plane[i] = YCBCR_R_TO_CB
                    .mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
                cr_plane[i] = YCBCR_R_TO_CR
                    .mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
            }
            return;
        }
    }
    rgb_to_ycbcr_planes_simd_inplace_fallback(rgb_data, y_plane, cb_plane, cr_plane, num_pixels);
}
/// Portable RGB24 → planar YCbCr conversion using `wide` vectors.
/// Eight pixels per vector iteration, scalar remainder; Cb/Cr offset +128.
#[autoversion]
fn rgb_to_ycbcr_planes_simd_inplace_fallback(
    rgb_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
    let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
    let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
    let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
    let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
    let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
    let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
    let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
    let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
    let offset_128 = f32x8::splat(128.0);
    let vec_pixels = (num_pixels / 8) * 8;
    let mut pixel_idx = 0;
    while pixel_idx < vec_pixels {
        let base = pixel_idx * 3;
        // De-interleave eight RGB triples into per-channel lane arrays.
        let mut r_lanes = [0.0f32; 8];
        let mut g_lanes = [0.0f32; 8];
        let mut b_lanes = [0.0f32; 8];
        for lane in 0..8 {
            let p = base + lane * 3;
            r_lanes[lane] = rgb_data[p] as f32;
            g_lanes[lane] = rgb_data[p + 1] as f32;
            b_lanes[lane] = rgb_data[p + 2] as f32;
        }
        let r = f32x8::from(r_lanes);
        let g = f32x8::from(g_lanes);
        let b = f32x8::from(b_lanes);
        let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
        let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
        let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
        store_f32x8(y_plane, pixel_idx, y);
        store_f32x8(cb_plane, pixel_idx, cb);
        store_f32x8(cr_plane, pixel_idx, cr);
        pixel_idx += 8;
    }
    // Scalar remainder (fewer than eight pixels left).
    for i in vec_pixels..num_pixels {
        let base = i * 3;
        let r = rgb_data[base] as f32;
        let g = rgb_data[base + 1] as f32;
        let b = rgb_data[base + 2] as f32;
        y_plane[i] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
        cb_plane[i] =
            YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
        cr_plane[i] =
            YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    }
}
/// RGBA32 → planar YCbCr conversion (alpha ignored) using `wide` vectors.
/// Eight pixels per vector iteration, scalar remainder; Cb/Cr offset +128.
#[autoversion]
pub fn rgba_to_ycbcr_planes_simd_inplace(
    rgba_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    debug_assert!(rgba_data.len() >= num_pixels * 4);
    debug_assert!(y_plane.len() >= num_pixels);
    debug_assert!(cb_plane.len() >= num_pixels);
    debug_assert!(cr_plane.len() >= num_pixels);
    let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
    let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
    let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
    let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
    let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
    let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
    let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
    let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
    let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
    let offset_128 = f32x8::splat(128.0);
    let vec_pixels = (num_pixels / 8) * 8;
    let mut pixel_idx = 0;
    while pixel_idx < vec_pixels {
        let base = pixel_idx * 4;
        // De-interleave eight RGBA quads; the alpha byte (p + 3) is skipped.
        let mut r_lanes = [0.0f32; 8];
        let mut g_lanes = [0.0f32; 8];
        let mut b_lanes = [0.0f32; 8];
        for lane in 0..8 {
            let p = base + lane * 4;
            r_lanes[lane] = rgba_data[p] as f32;
            g_lanes[lane] = rgba_data[p + 1] as f32;
            b_lanes[lane] = rgba_data[p + 2] as f32;
        }
        let r = f32x8::from(r_lanes);
        let g = f32x8::from(g_lanes);
        let b = f32x8::from(b_lanes);
        let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
        let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
        let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
        store_f32x8(y_plane, pixel_idx, y);
        store_f32x8(cb_plane, pixel_idx, cb);
        store_f32x8(cr_plane, pixel_idx, cr);
        pixel_idx += 8;
    }
    // Scalar remainder (fewer than eight pixels left).
    for i in vec_pixels..num_pixels {
        let base = i * 4;
        let r = rgba_data[base] as f32;
        let g = rgba_data[base + 1] as f32;
        let b = rgba_data[base + 2] as f32;
        y_plane[i] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
        cb_plane[i] =
            YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
        cr_plane[i] =
            YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    }
}
/// BGR24 → planar YCbCr conversion using `wide` vectors. Identical math to
/// the RGB variant; only the in-memory channel order differs (B first).
#[autoversion]
pub fn bgr_to_ycbcr_planes_simd_inplace(
    bgr_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    debug_assert!(bgr_data.len() >= num_pixels * 3);
    debug_assert!(y_plane.len() >= num_pixels);
    debug_assert!(cb_plane.len() >= num_pixels);
    debug_assert!(cr_plane.len() >= num_pixels);
    let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
    let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
    let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
    let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
    let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
    let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
    let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
    let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
    let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
    let offset_128 = f32x8::splat(128.0);
    let vec_pixels = (num_pixels / 8) * 8;
    let mut pixel_idx = 0;
    while pixel_idx < vec_pixels {
        let base = pixel_idx * 3;
        // De-interleave eight BGR triples: B at +0, G at +1, R at +2.
        let mut r_lanes = [0.0f32; 8];
        let mut g_lanes = [0.0f32; 8];
        let mut b_lanes = [0.0f32; 8];
        for lane in 0..8 {
            let p = base + lane * 3;
            b_lanes[lane] = bgr_data[p] as f32;
            g_lanes[lane] = bgr_data[p + 1] as f32;
            r_lanes[lane] = bgr_data[p + 2] as f32;
        }
        let r = f32x8::from(r_lanes);
        let g = f32x8::from(g_lanes);
        let b = f32x8::from(b_lanes);
        let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
        let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
        let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
        store_f32x8(y_plane, pixel_idx, y);
        store_f32x8(cb_plane, pixel_idx, cb);
        store_f32x8(cr_plane, pixel_idx, cr);
        pixel_idx += 8;
    }
    // Scalar remainder (fewer than eight pixels left).
    for i in vec_pixels..num_pixels {
        let base = i * 3;
        let b = bgr_data[base] as f32;
        let g = bgr_data[base + 1] as f32;
        let r = bgr_data[base + 2] as f32;
        y_plane[i] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
        cb_plane[i] =
            YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
        cr_plane[i] =
            YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    }
}
/// BGRA32 → planar YCbCr conversion (alpha ignored) using `wide` vectors.
/// Identical math to the RGBA variant; only the channel order differs.
#[autoversion]
pub fn bgra_to_ycbcr_planes_simd_inplace(
    bgra_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    num_pixels: usize,
) {
    debug_assert!(bgra_data.len() >= num_pixels * 4);
    debug_assert!(y_plane.len() >= num_pixels);
    debug_assert!(cb_plane.len() >= num_pixels);
    debug_assert!(cr_plane.len() >= num_pixels);
    let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
    let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
    let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
    let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
    let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
    let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
    let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
    let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
    let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
    let offset_128 = f32x8::splat(128.0);
    let vec_pixels = (num_pixels / 8) * 8;
    let mut pixel_idx = 0;
    while pixel_idx < vec_pixels {
        let base = pixel_idx * 4;
        // De-interleave eight BGRA quads: B at +0, G at +1, R at +2;
        // the alpha byte (p + 3) is skipped.
        let mut r_lanes = [0.0f32; 8];
        let mut g_lanes = [0.0f32; 8];
        let mut b_lanes = [0.0f32; 8];
        for lane in 0..8 {
            let p = base + lane * 4;
            b_lanes[lane] = bgra_data[p] as f32;
            g_lanes[lane] = bgra_data[p + 1] as f32;
            r_lanes[lane] = bgra_data[p + 2] as f32;
        }
        let r = f32x8::from(r_lanes);
        let g = f32x8::from(g_lanes);
        let b = f32x8::from(b_lanes);
        let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
        let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
        let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
        store_f32x8(y_plane, pixel_idx, y);
        store_f32x8(cb_plane, pixel_idx, cb);
        store_f32x8(cr_plane, pixel_idx, cr);
        pixel_idx += 8;
    }
    // Scalar remainder (fewer than eight pixels left).
    for i in vec_pixels..num_pixels {
        let base = i * 4;
        let b = bgra_data[base] as f32;
        let g = bgra_data[base + 1] as f32;
        let r = bgra_data[base + 2] as f32;
        y_plane[i] = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
        cb_plane[i] =
            YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
        cr_plane[i] =
            YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    }
}
/// RGB/RGBA → planar YCbCr with a row stride on the Y plane.
///
/// `y_plane` rows are `y_stride` wide (>= `width`); Cb/Cr rows are exactly
/// `width` wide. `bpp` is 3 (RGB) or 4 (RGBA; alpha ignored).
///
/// With the `yuv` feature the whole conversion is delegated to
/// `fast_yuv`; otherwise a tightly-packed Y plane (`y_stride == width`)
/// takes the fast planar path, and the general case converts row by row.
#[autoversion]
pub fn rgb_to_ycbcr_strided_inplace(
    rgb_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    width: usize,
    height: usize,
    y_stride: usize,
    bpp: usize,
) {
    debug_assert!(rgb_data.len() >= width * height * bpp);
    debug_assert!(y_plane.len() >= y_stride * height);
    debug_assert!(cb_plane.len() >= width * height);
    debug_assert!(cr_plane.len() >= width * height);
    #[cfg(feature = "yuv")]
    {
        crate::color::fast_yuv::rgb_to_ycbcr_strided_fast(
            rgb_data, y_plane, cb_plane, cr_plane, width, height, y_stride, bpp,
        );
    }
    // Fast path: no Y-plane padding, so the image is one contiguous run.
    #[cfg(not(feature = "yuv"))]
    if y_stride == width {
        let num_pixels = width * height;
        match bpp {
            3 => {
                rgb_to_ycbcr_planes_simd_inplace(rgb_data, y_plane, cb_plane, cr_plane, num_pixels)
            }
            4 => {
                rgba_to_ycbcr_planes_simd_inplace(rgb_data, y_plane, cb_plane, cr_plane, num_pixels)
            }
            // Unsupported pixel layouts are silently ignored.
            _ => return,
        }
        return;
    }
    // General path: convert each row independently so Y-plane padding can
    // be filled per row.
    #[cfg(not(feature = "yuv"))]
    {
        let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
        let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
        let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
        let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
        let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
        let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
        let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
        let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
        let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
        let offset_128 = f32x8::splat(128.0);
        for row in 0..height {
            let rgb_row_start = row * width * bpp;
            let y_row_start = row * y_stride;
            let cbcr_row_start = row * width;
            let chunks = width / 8;
            for chunk in 0..chunks {
                let px = chunk * 8;
                let rgb_idx = rgb_row_start + px * bpp;
                // Gather eight pixels per channel; stride depends on bpp.
                let (r, g, b) = if bpp == 3 {
                    (
                        f32x8::from([
                            rgb_data[rgb_idx] as f32,
                            rgb_data[rgb_idx + 3] as f32,
                            rgb_data[rgb_idx + 6] as f32,
                            rgb_data[rgb_idx + 9] as f32,
                            rgb_data[rgb_idx + 12] as f32,
                            rgb_data[rgb_idx + 15] as f32,
                            rgb_data[rgb_idx + 18] as f32,
                            rgb_data[rgb_idx + 21] as f32,
                        ]),
                        f32x8::from([
                            rgb_data[rgb_idx + 1] as f32,
                            rgb_data[rgb_idx + 4] as f32,
                            rgb_data[rgb_idx + 7] as f32,
                            rgb_data[rgb_idx + 10] as f32,
                            rgb_data[rgb_idx + 13] as f32,
                            rgb_data[rgb_idx + 16] as f32,
                            rgb_data[rgb_idx + 19] as f32,
                            rgb_data[rgb_idx + 22] as f32,
                        ]),
                        f32x8::from([
                            rgb_data[rgb_idx + 2] as f32,
                            rgb_data[rgb_idx + 5] as f32,
                            rgb_data[rgb_idx + 8] as f32,
                            rgb_data[rgb_idx + 11] as f32,
                            rgb_data[rgb_idx + 14] as f32,
                            rgb_data[rgb_idx + 17] as f32,
                            rgb_data[rgb_idx + 20] as f32,
                            rgb_data[rgb_idx + 23] as f32,
                        ]),
                    )
                } else {
                    (
                        f32x8::from([
                            rgb_data[rgb_idx] as f32,
                            rgb_data[rgb_idx + 4] as f32,
                            rgb_data[rgb_idx + 8] as f32,
                            rgb_data[rgb_idx + 12] as f32,
                            rgb_data[rgb_idx + 16] as f32,
                            rgb_data[rgb_idx + 20] as f32,
                            rgb_data[rgb_idx + 24] as f32,
                            rgb_data[rgb_idx + 28] as f32,
                        ]),
                        f32x8::from([
                            rgb_data[rgb_idx + 1] as f32,
                            rgb_data[rgb_idx + 5] as f32,
                            rgb_data[rgb_idx + 9] as f32,
                            rgb_data[rgb_idx + 13] as f32,
                            rgb_data[rgb_idx + 17] as f32,
                            rgb_data[rgb_idx + 21] as f32,
                            rgb_data[rgb_idx + 25] as f32,
                            rgb_data[rgb_idx + 29] as f32,
                        ]),
                        f32x8::from([
                            rgb_data[rgb_idx + 2] as f32,
                            rgb_data[rgb_idx + 6] as f32,
                            rgb_data[rgb_idx + 10] as f32,
                            rgb_data[rgb_idx + 14] as f32,
                            rgb_data[rgb_idx + 18] as f32,
                            rgb_data[rgb_idx + 22] as f32,
                            rgb_data[rgb_idx + 26] as f32,
                            rgb_data[rgb_idx + 30] as f32,
                        ]),
                    )
                };
                let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
                let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
                let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
                store_f32x8(y_plane, y_row_start + px, y);
                store_f32x8(cb_plane, cbcr_row_start + px, cb);
                store_f32x8(cr_plane, cbcr_row_start + px, cr);
            }
            // Scalar remainder of the row.
            for px in (chunks * 8)..width {
                let rgb_idx = rgb_row_start + px * bpp;
                let r = rgb_data[rgb_idx] as f32;
                let g = rgb_data[rgb_idx + 1] as f32;
                let b = rgb_data[rgb_idx + 2] as f32;
                y_plane[y_row_start + px] =
                    YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
                cb_plane[cbcr_row_start + px] = YCBCR_R_TO_CB
                    .mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
                cr_plane[cbcr_row_start + px] = YCBCR_R_TO_CR
                    .mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
            }
            // Fill Y-plane padding by replicating the rightmost pixel.
            if width < y_stride {
                let edge_val = y_plane[y_row_start + width - 1];
                for px in width..y_stride {
                    y_plane[y_row_start + px] = edge_val;
                }
            }
        }
    }
}
/// Thin forwarder to `fast_yuv::rgb_to_ycbcr_strided_reuse`, which converts
/// strided RGB data while reusing caller-provided `yuv_temp_*` scratch
/// buffers to avoid per-call allocations. Only available with the `yuv`
/// feature.
#[cfg(feature = "yuv")]
pub fn rgb_to_ycbcr_strided_reuse(
    rgb_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    yuv_temp_y: &mut [u8],
    yuv_temp_cb: &mut [u8],
    yuv_temp_cr: &mut [u8],
    width: usize,
    height: usize,
    y_stride: usize,
    bpp: usize,
) {
    crate::color::fast_yuv::rgb_to_ycbcr_strided_reuse(
        rgb_data,
        y_plane,
        cb_plane,
        cr_plane,
        yuv_temp_y,
        yuv_temp_cb,
        yuv_temp_cr,
        width,
        height,
        y_stride,
        bpp,
    );
}
/// Thin forwarder to `fast_yuv::bgr_to_ycbcr_strided_reuse` — the BGR
/// counterpart of [`rgb_to_ycbcr_strided_reuse`], reusing caller-provided
/// scratch buffers. Only available with the `yuv` feature.
#[cfg(feature = "yuv")]
pub fn bgr_to_ycbcr_strided_reuse(
    bgr_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    yuv_temp_y: &mut [u8],
    yuv_temp_cb: &mut [u8],
    yuv_temp_cr: &mut [u8],
    width: usize,
    height: usize,
    y_stride: usize,
    bpp: usize,
) {
    crate::color::fast_yuv::bgr_to_ycbcr_strided_reuse(
        bgr_data,
        y_plane,
        cb_plane,
        cr_plane,
        yuv_temp_y,
        yuv_temp_cb,
        yuv_temp_cr,
        width,
        height,
        y_stride,
        bpp,
    );
}
/// BGR/BGRA → planar YCbCr with a row stride on the Y plane; the BGR
/// counterpart of [`rgb_to_ycbcr_strided_inplace`].
///
/// `y_plane` rows are `y_stride` wide (>= `width`); Cb/Cr rows are exactly
/// `width` wide. `bpp` is 3 (BGR) or 4 (BGRA; alpha ignored). With the
/// `yuv` feature the conversion is delegated to `fast_yuv`.
#[autoversion]
pub fn bgr_to_ycbcr_strided_inplace(
    bgr_data: &[u8],
    y_plane: &mut [f32],
    cb_plane: &mut [f32],
    cr_plane: &mut [f32],
    width: usize,
    height: usize,
    y_stride: usize,
    bpp: usize,
) {
    debug_assert!(bgr_data.len() >= width * height * bpp);
    debug_assert!(y_plane.len() >= y_stride * height);
    debug_assert!(cb_plane.len() >= width * height);
    debug_assert!(cr_plane.len() >= width * height);
    #[cfg(feature = "yuv")]
    {
        crate::color::fast_yuv::bgr_to_ycbcr_strided_fast(
            bgr_data, y_plane, cb_plane, cr_plane, width, height, y_stride, bpp,
        );
    }
    // Fast path: no Y-plane padding, so the image is one contiguous run.
    #[cfg(not(feature = "yuv"))]
    if y_stride == width {
        let num_pixels = width * height;
        match bpp {
            3 => {
                bgr_to_ycbcr_planes_simd_inplace(bgr_data, y_plane, cb_plane, cr_plane, num_pixels)
            }
            4 => {
                bgra_to_ycbcr_planes_simd_inplace(bgr_data, y_plane, cb_plane, cr_plane, num_pixels)
            }
            // Unsupported pixel layouts are silently ignored.
            _ => return,
        }
        return;
    }
    // General path: convert each row independently so Y-plane padding can
    // be filled per row.
    #[cfg(not(feature = "yuv"))]
    {
        let r_to_y = f32x8::splat(YCBCR_R_TO_Y);
        let g_to_y = f32x8::splat(YCBCR_G_TO_Y);
        let b_to_y = f32x8::splat(YCBCR_B_TO_Y);
        let r_to_cb = f32x8::splat(YCBCR_R_TO_CB);
        let g_to_cb = f32x8::splat(YCBCR_G_TO_CB);
        let b_to_cb = f32x8::splat(YCBCR_B_TO_CB);
        let r_to_cr = f32x8::splat(YCBCR_R_TO_CR);
        let g_to_cr = f32x8::splat(YCBCR_G_TO_CR);
        let b_to_cr = f32x8::splat(YCBCR_B_TO_CR);
        let offset_128 = f32x8::splat(128.0);
        for row in 0..height {
            let bgr_row_start = row * width * bpp;
            let y_row_start = row * y_stride;
            let cbcr_row_start = row * width;
            let chunks = width / 8;
            for chunk in 0..chunks {
                let px = chunk * 8;
                let bgr_idx = bgr_row_start + px * bpp;
                // Gather eight pixels per channel; B is at offset 0, G at 1,
                // R at 2, with a per-pixel stride of bpp.
                let (r, g, b) = if bpp == 3 {
                    (
                        f32x8::from([
                            bgr_data[bgr_idx + 2] as f32,
                            bgr_data[bgr_idx + 5] as f32,
                            bgr_data[bgr_idx + 8] as f32,
                            bgr_data[bgr_idx + 11] as f32,
                            bgr_data[bgr_idx + 14] as f32,
                            bgr_data[bgr_idx + 17] as f32,
                            bgr_data[bgr_idx + 20] as f32,
                            bgr_data[bgr_idx + 23] as f32,
                        ]),
                        f32x8::from([
                            bgr_data[bgr_idx + 1] as f32,
                            bgr_data[bgr_idx + 4] as f32,
                            bgr_data[bgr_idx + 7] as f32,
                            bgr_data[bgr_idx + 10] as f32,
                            bgr_data[bgr_idx + 13] as f32,
                            bgr_data[bgr_idx + 16] as f32,
                            bgr_data[bgr_idx + 19] as f32,
                            bgr_data[bgr_idx + 22] as f32,
                        ]),
                        f32x8::from([
                            bgr_data[bgr_idx] as f32,
                            bgr_data[bgr_idx + 3] as f32,
                            bgr_data[bgr_idx + 6] as f32,
                            bgr_data[bgr_idx + 9] as f32,
                            bgr_data[bgr_idx + 12] as f32,
                            bgr_data[bgr_idx + 15] as f32,
                            bgr_data[bgr_idx + 18] as f32,
                            bgr_data[bgr_idx + 21] as f32,
                        ]),
                    )
                } else {
                    (
                        f32x8::from([
                            bgr_data[bgr_idx + 2] as f32,
                            bgr_data[bgr_idx + 6] as f32,
                            bgr_data[bgr_idx + 10] as f32,
                            bgr_data[bgr_idx + 14] as f32,
                            bgr_data[bgr_idx + 18] as f32,
                            bgr_data[bgr_idx + 22] as f32,
                            bgr_data[bgr_idx + 26] as f32,
                            bgr_data[bgr_idx + 30] as f32,
                        ]),
                        f32x8::from([
                            bgr_data[bgr_idx + 1] as f32,
                            bgr_data[bgr_idx + 5] as f32,
                            bgr_data[bgr_idx + 9] as f32,
                            bgr_data[bgr_idx + 13] as f32,
                            bgr_data[bgr_idx + 17] as f32,
                            bgr_data[bgr_idx + 21] as f32,
                            bgr_data[bgr_idx + 25] as f32,
                            bgr_data[bgr_idx + 29] as f32,
                        ]),
                        f32x8::from([
                            bgr_data[bgr_idx] as f32,
                            bgr_data[bgr_idx + 4] as f32,
                            bgr_data[bgr_idx + 8] as f32,
                            bgr_data[bgr_idx + 12] as f32,
                            bgr_data[bgr_idx + 16] as f32,
                            bgr_data[bgr_idx + 20] as f32,
                            bgr_data[bgr_idx + 24] as f32,
                            bgr_data[bgr_idx + 28] as f32,
                        ]),
                    )
                };
                let y = r_to_y.mul_add(r, g_to_y.mul_add(g, b_to_y * b));
                let cb = r_to_cb.mul_add(r, g_to_cb.mul_add(g, b_to_cb.mul_add(b, offset_128)));
                let cr = r_to_cr.mul_add(r, g_to_cr.mul_add(g, b_to_cr.mul_add(b, offset_128)));
                store_f32x8(y_plane, y_row_start + px, y);
                store_f32x8(cb_plane, cbcr_row_start + px, cb);
                store_f32x8(cr_plane, cbcr_row_start + px, cr);
            }
            // Scalar remainder of the row.
            for px in (chunks * 8)..width {
                let bgr_idx = bgr_row_start + px * bpp;
                let b = bgr_data[bgr_idx] as f32;
                let g = bgr_data[bgr_idx + 1] as f32;
                let r = bgr_data[bgr_idx + 2] as f32;
                y_plane[y_row_start + px] =
                    YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
                cb_plane[cbcr_row_start + px] = YCBCR_R_TO_CB
                    .mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
                cr_plane[cbcr_row_start + px] = YCBCR_R_TO_CR
                    .mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
            }
            // Fill Y-plane padding by replicating the rightmost pixel.
            if width < y_stride {
                let edge_val = y_plane[y_row_start + width - 1];
                for px in width..y_stride {
                    y_plane[y_row_start + px] = edge_val;
                }
            }
        }
    }
}
/// Copies the 8x8 block at block coordinates (`bx`, `by`) out of `plane`,
/// rescaling each sample from [0, 1] to the level-shifted [-128, 127] range
/// (`v * 255 - 128`). Blocks that overhang the right/bottom edge replicate
/// the last column/row via clamped indices.
#[inline]
pub fn extract_block_xyb_simd(
    plane: &[f32],
    width: usize,
    height: usize,
    bx: usize,
    by: usize,
) -> [f32; 64] {
    let x0 = bx * 8;
    let y0 = by * 8;
    let mut block = [0.0f32; 64];
    if x0 + 8 <= width && y0 + 8 <= height {
        // Interior block: each of the 8 rows is one contiguous SIMD load.
        let scale = f32x8::splat(255.0);
        let level_shift = f32x8::splat(128.0);
        for row in 0..8 {
            let src = (y0 + row) * width + x0;
            let lanes: [f32; 8] = plane[src..src + 8].try_into().unwrap();
            let shifted = f32x8::from(lanes) * scale - level_shift;
            block[row * 8..(row + 1) * 8].copy_from_slice(shifted.as_array());
        }
    } else {
        // Edge block: clamp sample coordinates to replicate the border.
        for row in 0..8 {
            let sy = (y0 + row).min(height - 1);
            for col in 0..8 {
                let sx = (x0 + col).min(width - 1);
                block[row * 8 + col] = plane[sy * width + sx] * 255.0 - 128.0;
            }
        }
    }
    block
}
#[cfg(test)]
mod tests {
    use super::*;
    // Tolerance for single comparisons.
    const EPSILON: f32 = 1e-4;
    // Looser tolerance for FMA-vs-separate-rounding differences.
    #[cfg(target_arch = "x86_64")]
    const EPSILON_ACCUMULATED: f32 = 5e-4;
    // Checks the even/odd de-interleave dispatcher at two offsets against
    // hand-written expected lanes.
    #[test]
    fn test_gather_even_odd_x8_correctness() {
        let data: Vec<f32> = (0..32).map(|i| i as f32).collect();
        let (evens, odds) = gather_even_odd_x8(&data, 0, 32);
        let evens_arr: [f32; 8] = evens.into();
        let odds_arr: [f32; 8] = odds.into();
        let expected_evens = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0];
        let expected_odds = [1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0];
        for i in 0..8 {
            assert!(
                (evens_arr[i] - expected_evens[i]).abs() < EPSILON,
                "evens[{}]: got {}, expected {}",
                i,
                evens_arr[i],
                expected_evens[i]
            );
            assert!(
                (odds_arr[i] - expected_odds[i]).abs() < EPSILON,
                "odds[{}]: got {}, expected {}",
                i,
                odds_arr[i],
                expected_odds[i]
            );
        }
        // Repeat with a non-zero start offset to exercise unaligned reads.
        let (evens2, odds2) = gather_even_odd_x8(&data, 4, 32);
        let evens2_arr: [f32; 8] = evens2.into();
        let odds2_arr: [f32; 8] = odds2.into();
        let expected_evens2 = [4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0];
        let expected_odds2 = [5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0];
        for i in 0..8 {
            assert!(
                (evens2_arr[i] - expected_evens2[i]).abs() < EPSILON,
                "evens2[{}]: got {}, expected {}",
                i,
                evens2_arr[i],
                expected_evens2[i]
            );
            assert!(
                (odds2_arr[i] - expected_odds2[i]).abs() < EPSILON,
                "odds2[{}]: got {}, expected {}",
                i,
                odds2_arr[i],
                expected_odds2[i]
            );
        }
    }
    // Compares the AVX2+FMA 8-pixel kernel against the scalar reference on
    // a single pseudo-random pixel run. Skipped when AVX2 is unavailable.
    // Note the 28-byte buffer: the kernel overreads 4 bytes past 8 pixels.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_rgb_to_ycbcr_avx2_matches_scalar() {
        let Some(token) = archmage::X64V3Token::summon() else {
            return;
        };
        let rgb_data: Vec<u8> = (0..28).map(|i| ((i * 17 + 5) % 256) as u8).collect();
        let mut y_avx2 = [0.0f32; 8];
        let mut cb_avx2 = [0.0f32; 8];
        let mut cr_avx2 = [0.0f32; 8];
        rgb_to_ycbcr_8px_fma(token, &rgb_data, &mut y_avx2, &mut cb_avx2, &mut cr_avx2);
        let mut y_scalar = vec![0.0f32; 8];
        let mut cb_scalar = vec![0.0f32; 8];
        let mut cr_scalar = vec![0.0f32; 8];
        rgb_to_ycbcr_scalar(&rgb_data, &mut y_scalar, &mut cb_scalar, &mut cr_scalar, 8);
        for i in 0..8 {
            let y_diff = (y_avx2[i] - y_scalar[i]).abs();
            let cb_diff = (cb_avx2[i] - cb_scalar[i]).abs();
            let cr_diff = (cr_avx2[i] - cr_scalar[i]).abs();
            assert!(
                y_diff < EPSILON_ACCUMULATED,
                "Y mismatch at {}: AVX2={}, scalar={}, diff={}",
                i,
                y_avx2[i],
                y_scalar[i],
                y_diff
            );
            assert!(
                cb_diff < EPSILON_ACCUMULATED,
                "Cb mismatch at {}: AVX2={}, scalar={}, diff={}",
                i,
                cb_avx2[i],
                cb_scalar[i],
                cb_diff
            );
            assert!(
                cr_diff < EPSILON_ACCUMULATED,
                "Cr mismatch at {}: AVX2={}, scalar={}, diff={}",
                i,
                cr_avx2[i],
                cr_scalar[i],
                cr_diff
            );
        }
    }
    // Sweeps a coarse grid of base colors (with per-pixel perturbation) and
    // tracks the worst AVX2-vs-scalar deviation across all channels.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_rgb_to_ycbcr_avx2_brute_force() {
        let Some(token) = archmage::X64V3Token::summon() else {
            return;
        };
        let mut max_y_diff = 0.0f32;
        let mut max_cb_diff = 0.0f32;
        let mut max_cr_diff = 0.0f32;
        for r_base in (0u8..=255).step_by(16) {
            for g_base in (0u8..=255).step_by(64) {
                for b_base in (0u8..=255).step_by(64) {
                    // 28 bytes: 8 pixels plus the kernel's 4-byte slack.
                    let mut rgb_data = vec![0u8; 28];
                    for p in 0..8 {
                        let r = r_base.wrapping_add((p * 2) as u8);
                        let g = g_base.wrapping_add((p * 3) as u8);
                        let b = b_base.wrapping_add((p * 5) as u8);
                        rgb_data[p * 3] = r;
                        rgb_data[p * 3 + 1] = g;
                        rgb_data[p * 3 + 2] = b;
                    }
                    let mut y_avx2 = [0.0f32; 8];
                    let mut cb_avx2 = [0.0f32; 8];
                    let mut cr_avx2 = [0.0f32; 8];
                    rgb_to_ycbcr_8px_fma(token, &rgb_data, &mut y_avx2, &mut cb_avx2, &mut cr_avx2);
                    let mut y_scalar = vec![0.0f32; 8];
                    let mut cb_scalar = vec![0.0f32; 8];
                    let mut cr_scalar = vec![0.0f32; 8];
                    rgb_to_ycbcr_scalar(
                        &rgb_data,
                        &mut y_scalar,
                        &mut cb_scalar,
                        &mut cr_scalar,
                        8,
                    );
                    for i in 0..8 {
                        max_y_diff = max_y_diff.max((y_avx2[i] - y_scalar[i]).abs());
                        max_cb_diff = max_cb_diff.max((cb_avx2[i] - cb_scalar[i]).abs());
                        max_cr_diff = max_cr_diff.max((cr_avx2[i] - cr_scalar[i]).abs());
                    }
                }
            }
        }
        assert!(
            max_y_diff < EPSILON_ACCUMULATED,
            "Max Y diff too large: {}",
            max_y_diff
        );
        assert!(
            max_cb_diff < EPSILON_ACCUMULATED,
            "Max Cb diff too large: {}",
            max_cb_diff
        );
        assert!(
            max_cr_diff < EPSILON_ACCUMULATED,
            "Max Cr diff too large: {}",
            max_cr_diff
        );
    }
}