#![allow(dead_code)]
use crate::error::Result;
use crate::foundation::alloc::{checked_size, checked_size_2d, try_alloc_zeroed};
use crate::foundation::consts::{
YCBCR_B_TO_CB, YCBCR_B_TO_CR, YCBCR_B_TO_Y, YCBCR_CB_TO_B, YCBCR_CB_TO_G, YCBCR_CB_TO_R,
YCBCR_CR_TO_B, YCBCR_CR_TO_G, YCBCR_CR_TO_R, YCBCR_G_TO_CB, YCBCR_G_TO_CR, YCBCR_G_TO_Y,
YCBCR_R_TO_CB, YCBCR_R_TO_CR, YCBCR_R_TO_Y, YCBCR_Y_TO_B, YCBCR_Y_TO_G, YCBCR_Y_TO_R,
};
use crate::types::PixelFormat;
use archmage::prelude::*;
use magetypes::simd::generic::f32x8 as GenericF32x8;
use magetypes::simd::generic::i32x4 as GenericI32x4;
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64 as safe_simd;
/// Converts one RGB pixel to full-range YCbCr (coefficients from
/// `foundation::consts`), rounding each channel to the nearest integer.
///
/// Chroma channels are biased by +128 so the result fits in `u8`.
#[inline]
#[must_use]
pub fn rgb_to_ycbcr(r: u8, g: u8, b: u8) -> (u8, u8, u8) {
    // Round-to-nearest, then saturate into the u8 range.
    let quantize = |v: f32| v.round().clamp(0.0, 255.0) as u8;
    let (rf, gf, bf) = (f32::from(r), f32::from(g), f32::from(b));
    let luma = YCBCR_R_TO_Y.mul_add(rf, YCBCR_G_TO_Y.mul_add(gf, YCBCR_B_TO_Y * bf));
    let chroma_b = YCBCR_R_TO_CB.mul_add(
        rf,
        YCBCR_G_TO_CB.mul_add(gf, YCBCR_B_TO_CB.mul_add(bf, 128.0)),
    );
    let chroma_r = YCBCR_R_TO_CR.mul_add(
        rf,
        YCBCR_G_TO_CR.mul_add(gf, YCBCR_B_TO_CR.mul_add(bf, 128.0)),
    );
    (quantize(luma), quantize(chroma_b), quantize(chroma_r))
}
/// Converts one full-range YCbCr pixel back to RGB (coefficients from
/// `foundation::consts`), rounding each channel to the nearest integer.
#[inline]
#[must_use]
pub fn ycbcr_to_rgb(y: u8, cb: u8, cr: u8) -> (u8, u8, u8) {
    // Round-to-nearest, then saturate into the u8 range.
    let quantize = |v: f32| v.round().clamp(0.0, 255.0) as u8;
    let yf = f32::from(y);
    // Remove the +128 chroma bias before applying the matrix.
    let cbf = f32::from(cb) - 128.0;
    let crf = f32::from(cr) - 128.0;
    let red = YCBCR_Y_TO_R.mul_add(yf, YCBCR_CB_TO_R.mul_add(cbf, YCBCR_CR_TO_R * crf));
    let green = YCBCR_Y_TO_G.mul_add(yf, YCBCR_CB_TO_G.mul_add(cbf, YCBCR_CR_TO_G * crf));
    let blue = YCBCR_Y_TO_B.mul_add(yf, YCBCR_CB_TO_B.mul_add(cbf, YCBCR_CR_TO_B * crf));
    (quantize(red), quantize(green), quantize(blue))
}
/// RGB -> YCbCr in `f32`, without rounding or clamping.
///
/// Chroma outputs carry the +128 bias; callers quantize as needed.
#[inline]
#[must_use]
pub fn rgb_to_ycbcr_f32(r: f32, g: f32, b: f32) -> (f32, f32, f32) {
    let luma = YCBCR_R_TO_Y.mul_add(r, YCBCR_G_TO_Y.mul_add(g, YCBCR_B_TO_Y * b));
    let chroma_b =
        YCBCR_R_TO_CB.mul_add(r, YCBCR_G_TO_CB.mul_add(g, YCBCR_B_TO_CB.mul_add(b, 128.0)));
    let chroma_r =
        YCBCR_R_TO_CR.mul_add(r, YCBCR_G_TO_CR.mul_add(g, YCBCR_B_TO_CR.mul_add(b, 128.0)));
    (luma, chroma_b, chroma_r)
}
/// YCbCr -> RGB in `f32`, without rounding or clamping.
///
/// Uses fused multiply-adds for consistency with `ycbcr_to_rgb` and the other
/// conversion helpers in this module (the original used plain mul/add here,
/// which has slightly worse rounding behavior and diverged from its siblings).
#[inline]
#[must_use]
pub fn ycbcr_to_rgb_f32(y: f32, cb: f32, cr: f32) -> (f32, f32, f32) {
    // Remove the +128 chroma bias before applying the matrix.
    let cbf = cb - 128.0;
    let crf = cr - 128.0;
    let r = YCBCR_Y_TO_R.mul_add(y, YCBCR_CB_TO_R.mul_add(cbf, YCBCR_CR_TO_R * crf));
    let g = YCBCR_Y_TO_G.mul_add(y, YCBCR_CB_TO_G.mul_add(cbf, YCBCR_CR_TO_G * crf));
    let b = YCBCR_Y_TO_B.mul_add(y, YCBCR_CB_TO_B.mul_add(cbf, YCBCR_CR_TO_B * crf));
    (r, g, b)
}
/// Converts a packed RGB24 buffer to packed YCbCr in place.
///
/// # Panics
/// Panics if `buffer.len()` is not a multiple of 3.
pub fn convert_rgb_to_ycbcr_buffer(buffer: &mut [u8]) {
    assert!(buffer.len() % 3 == 0, "Buffer length must be multiple of 3");
    for px in buffer.chunks_exact_mut(3) {
        let (y, cb, cr) = rgb_to_ycbcr(px[0], px[1], px[2]);
        px.copy_from_slice(&[y, cb, cr]);
    }
}
/// Converts a packed YCbCr buffer to packed RGB24 in place.
///
/// # Panics
/// Panics if `buffer.len()` is not a multiple of 3.
pub fn convert_ycbcr_to_rgb_buffer(buffer: &mut [u8]) {
    assert!(buffer.len() % 3 == 0, "Buffer length must be multiple of 3");
    for px in buffer.chunks_exact_mut(3) {
        let (r, g, b) = ycbcr_to_rgb(px[0], px[1], px[2]);
        px.copy_from_slice(&[r, g, b]);
    }
}
mod simd {
    //! Four-pixel batch helpers used by the planar conversion routines.
    //! Despite the module name these are scalar loops; the batch-of-4 shape
    //! matches the call sites and lets the compiler unroll/vectorize.
    use super::*;
    /// RGB -> YCbCr for four pixels at once (same math as `rgb_to_ycbcr`).
    #[inline]
    pub fn rgb_to_ycbcr_x4(r: [u8; 4], g: [u8; 4], b: [u8; 4]) -> ([u8; 4], [u8; 4], [u8; 4]) {
        let quantize = |v: f32| v.round().clamp(0.0, 255.0) as u8;
        let mut ys = [0u8; 4];
        let mut cbs = [0u8; 4];
        let mut crs = [0u8; 4];
        for lane in 0..4 {
            let rf = r[lane] as f32;
            let gf = g[lane] as f32;
            let bf = b[lane] as f32;
            ys[lane] =
                quantize(YCBCR_R_TO_Y.mul_add(rf, YCBCR_G_TO_Y.mul_add(gf, YCBCR_B_TO_Y * bf)));
            cbs[lane] = quantize(YCBCR_R_TO_CB.mul_add(
                rf,
                YCBCR_G_TO_CB.mul_add(gf, YCBCR_B_TO_CB.mul_add(bf, 128.0)),
            ));
            crs[lane] = quantize(YCBCR_R_TO_CR.mul_add(
                rf,
                YCBCR_G_TO_CR.mul_add(gf, YCBCR_B_TO_CR.mul_add(bf, 128.0)),
            ));
        }
        (ys, cbs, crs)
    }
    /// YCbCr -> RGB for four pixels at once (same math as `ycbcr_to_rgb`).
    #[inline]
    pub fn ycbcr_to_rgb_x4(y: [u8; 4], cb: [u8; 4], cr: [u8; 4]) -> ([u8; 4], [u8; 4], [u8; 4]) {
        let quantize = |v: f32| v.round().clamp(0.0, 255.0) as u8;
        let mut rs = [0u8; 4];
        let mut gs = [0u8; 4];
        let mut bs = [0u8; 4];
        for lane in 0..4 {
            let yf = y[lane] as f32;
            let cbf = cb[lane] as f32 - 128.0;
            let crf = cr[lane] as f32 - 128.0;
            rs[lane] =
                quantize(YCBCR_Y_TO_R.mul_add(yf, YCBCR_CB_TO_R.mul_add(cbf, YCBCR_CR_TO_R * crf)));
            gs[lane] =
                quantize(YCBCR_Y_TO_G.mul_add(yf, YCBCR_CB_TO_G.mul_add(cbf, YCBCR_CR_TO_G * crf)));
            bs[lane] =
                quantize(YCBCR_Y_TO_B.mul_add(yf, YCBCR_CB_TO_B.mul_add(cbf, YCBCR_CR_TO_B * crf)));
        }
        (rs, gs, bs)
    }
}
/// Splits packed RGB24 into three planar YCbCr buffers of `width * height`
/// bytes each.
///
/// # Errors
/// Returns an error if the size computation overflows or allocation fails.
///
/// # Panics
/// Panics if `rgb.len() != width * height * 3`.
pub fn rgb_to_ycbcr_planes(
    rgb: &[u8],
    width: usize,
    height: usize,
) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>)> {
    let num_pixels = checked_size_2d(width, height)?;
    let expected_len = checked_size(width, height, 3)?;
    assert_eq!(rgb.len(), expected_len);
    let mut y_plane = try_alloc_zeroed(num_pixels, "YCbCr Y plane")?;
    let mut cb_plane = try_alloc_zeroed(num_pixels, "YCbCr Cb plane")?;
    let mut cr_plane = try_alloc_zeroed(num_pixels, "YCbCr Cr plane")?;
    // Main loop: de-interleave four pixels at a time and convert as a batch.
    let full_batches = num_pixels / 4;
    for batch in 0..full_batches {
        let px = batch * 4;
        let src = &rgb[px * 3..px * 3 + 12];
        let mut r = [0u8; 4];
        let mut g = [0u8; 4];
        let mut b = [0u8; 4];
        for lane in 0..4 {
            r[lane] = src[lane * 3];
            g[lane] = src[lane * 3 + 1];
            b[lane] = src[lane * 3 + 2];
        }
        let (y, cb, cr) = simd::rgb_to_ycbcr_x4(r, g, b);
        y_plane[px..px + 4].copy_from_slice(&y);
        cb_plane[px..px + 4].copy_from_slice(&cb);
        cr_plane[px..px + 4].copy_from_slice(&cr);
    }
    // Tail: up to three pixels converted one at a time.
    for px in (full_batches * 4)..num_pixels {
        let (y, cb, cr) = rgb_to_ycbcr(rgb[px * 3], rgb[px * 3 + 1], rgb[px * 3 + 2]);
        y_plane[px] = y;
        cb_plane[px] = cb;
        cr_plane[px] = cr;
    }
    Ok((y_plane, cb_plane, cr_plane))
}
/// Recombines three planar YCbCr buffers into a packed RGB24 buffer.
///
/// # Errors
/// Returns an error if the size computation overflows or allocation fails.
///
/// # Panics
/// Panics if any plane's length differs from `width * height`.
pub fn ycbcr_planes_to_rgb(
    y_plane: &[u8],
    cb_plane: &[u8],
    cr_plane: &[u8],
    width: usize,
    height: usize,
) -> Result<Vec<u8>> {
    let num_pixels = checked_size_2d(width, height)?;
    assert_eq!(y_plane.len(), num_pixels);
    assert_eq!(cb_plane.len(), num_pixels);
    assert_eq!(cr_plane.len(), num_pixels);
    let rgb_size = checked_size(width, height, 3)?;
    let mut rgb = try_alloc_zeroed(rgb_size, "RGB output buffer")?;
    // Main loop: convert four pixels as a batch, then interleave into RGB24.
    let full_batches = num_pixels / 4;
    for batch in 0..full_batches {
        let px = batch * 4;
        let y = *<&[u8; 4]>::try_from(&y_plane[px..px + 4]).unwrap();
        let cb = *<&[u8; 4]>::try_from(&cb_plane[px..px + 4]).unwrap();
        let cr = *<&[u8; 4]>::try_from(&cr_plane[px..px + 4]).unwrap();
        let (r, g, b) = simd::ycbcr_to_rgb_x4(y, cb, cr);
        let dst = &mut rgb[px * 3..px * 3 + 12];
        for lane in 0..4 {
            dst[lane * 3] = r[lane];
            dst[lane * 3 + 1] = g[lane];
            dst[lane * 3 + 2] = b[lane];
        }
    }
    // Tail: up to three pixels converted one at a time.
    for px in (full_batches * 4)..num_pixels {
        let (r, g, b) = ycbcr_to_rgb(y_plane[px], cb_plane[px], cr_plane[px]);
        rgb[px * 3] = r;
        rgb[px * 3 + 1] = g;
        rgb[px * 3 + 2] = b;
    }
    Ok(rgb)
}
/// Converts planar f32 YCbCr into packed RGB24 (`u8`), dispatching to the
/// best available SIMD backend via `incant!`.
///
/// The planes are expected to be level-shifted (centered around 0; +128 is
/// added during conversion) — presumably JPEG-style; confirm with callers.
pub fn ycbcr_planes_f32_to_rgb_u8(
    y_plane: &[f32],
    cb_plane: &[f32],
    cr_plane: &[f32],
    rgb: &mut [u8],
) {
    incant!(ycbcr_planes_f32_to_rgb_u8_impl(
        y_plane, cb_plane, cr_plane, rgb
    ));
}
/// SIMD core for `ycbcr_planes_f32_to_rgb_u8`: 8 pixels per iteration, scalar tail.
///
/// Uses hard-coded BT.601-style chroma coefficients (1.402 / -0.344136 /
/// -0.714136 / 1.772) rather than the `YCBCR_*` consts used elsewhere.
/// Output values are clamped to [0, 255] then truncated (not rounded) to u8.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn ycbcr_planes_f32_to_rgb_u8_impl(
    token: Token,
    y_plane: &[f32],
    cb_plane: &[f32],
    cr_plane: &[f32],
    rgb: &mut [u8],
) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(y_plane.len(), cb_plane.len());
    debug_assert_eq!(y_plane.len(), cr_plane.len());
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    let num_pixels = y_plane.len();
    let cr_to_r = f32x8::splat(token, 1.402);
    let cb_to_g = f32x8::splat(token, -0.344136);
    let cr_to_g = f32x8::splat(token, -0.714136);
    let cb_to_b = f32x8::splat(token, 1.772);
    // +128 undoes the level shift applied to the planes.
    let offset = f32x8::splat(token, 128.0);
    let zero = f32x8::splat(token, 0.0);
    let max_val = f32x8::splat(token, 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let i = chunk * 8;
        let y = f32x8::from_array(token, *<&[f32; 8]>::try_from(&y_plane[i..i + 8]).unwrap());
        let cb = f32x8::from_array(token, *<&[f32; 8]>::try_from(&cb_plane[i..i + 8]).unwrap());
        let cr = f32x8::from_array(token, *<&[f32; 8]>::try_from(&cr_plane[i..i + 8]).unwrap());
        let y_off = y + offset;
        let r = cr_to_r.mul_add(cr, y_off).max(zero).min(max_val);
        let g = cb_to_g
            .mul_add(cb, cr_to_g.mul_add(cr, y_off))
            .max(zero)
            .min(max_val);
        let b = cb_to_b.mul_add(cb, y_off).max(zero).min(max_val);
        let r_arr = r.to_array();
        let g_arr = g.to_array();
        let b_arr = b.to_array();
        // Interleave the 8 converted pixels into 24 bytes of RGB24 output.
        let rgb_chunk = &mut rgb[i * 3..(i + 8) * 3];
        for j in 0..8 {
            rgb_chunk[j * 3] = r_arr[j] as u8;
            rgb_chunk[j * 3 + 1] = g_arr[j] as u8;
            rgb_chunk[j * 3 + 2] = b_arr[j] as u8;
        }
    }
    // Scalar tail for the final < 8 pixels.
    // NOTE(review): the tail adds 128 after the fma while the SIMD path folds
    // it into y first — results can differ in the last ulp; confirm intended.
    let start = chunks * 8;
    for i in start..num_pixels {
        let y = y_plane[i];
        let cb = cb_plane[i];
        let cr = cr_plane[i];
        let r = 1.402f32.mul_add(cr, y);
        let g = (-0.344136f32).mul_add(cb, (-0.714136f32).mul_add(cr, y));
        let b_val = 1.772f32.mul_add(cb, y);
        rgb[i * 3] = (r + 128.0).clamp(0.0, 255.0) as u8;
        rgb[i * 3 + 1] = (g + 128.0).clamp(0.0, 255.0) as u8;
        rgb[i * 3 + 2] = (b_val + 128.0).clamp(0.0, 255.0) as u8;
    }
}
/// Converts planar f32 YCbCr into packed RGB normalized to roughly [0, 1]
/// (value + 128, divided by 255, no clamping), dispatching via `incant!`.
pub fn ycbcr_planes_f32_to_rgb_f32(
    y_plane: &[f32],
    cb_plane: &[f32],
    cr_plane: &[f32],
    rgb: &mut [f32],
) {
    incant!(ycbcr_planes_f32_to_rgb_f32_impl(
        y_plane, cb_plane, cr_plane, rgb
    ));
}
/// SIMD core for `ycbcr_planes_f32_to_rgb_f32`: 8 pixels per iteration, scalar tail.
///
/// Output is `(value + 128) / 255` per channel with NO clamping, so results
/// may fall slightly outside [0, 1] for out-of-gamut inputs.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn ycbcr_planes_f32_to_rgb_f32_impl(
    token: Token,
    y_plane: &[f32],
    cb_plane: &[f32],
    cr_plane: &[f32],
    rgb: &mut [f32],
) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(y_plane.len(), cb_plane.len());
    debug_assert_eq!(y_plane.len(), cr_plane.len());
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    let num_pixels = y_plane.len();
    let cr_to_r = f32x8::splat(token, 1.402);
    let cb_to_g = f32x8::splat(token, -0.344136);
    let cr_to_g = f32x8::splat(token, -0.714136);
    let cb_to_b = f32x8::splat(token, 1.772);
    // +128 undoes the level shift; * (1/255) normalizes to ~[0, 1].
    let offset = f32x8::splat(token, 128.0);
    let scale = f32x8::splat(token, 1.0 / 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let base = chunk * 8;
        let y = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&y_plane[base..base + 8]).unwrap(),
        );
        let cb = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&cb_plane[base..base + 8]).unwrap(),
        );
        let cr = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&cr_plane[base..base + 8]).unwrap(),
        );
        let y_off = y + offset;
        let r = cr_to_r.mul_add(cr, y_off) * scale;
        let g = cb_to_g.mul_add(cb, cr_to_g.mul_add(cr, y_off)) * scale;
        let b = cb_to_b.mul_add(cb, y_off) * scale;
        let r_arr = r.to_array();
        let g_arr = g.to_array();
        let b_arr = b.to_array();
        // Interleave into packed RGB.
        for j in 0..8 {
            let idx = (base + j) * 3;
            rgb[idx] = r_arr[j];
            rgb[idx + 1] = g_arr[j];
            rgb[idx + 2] = b_arr[j];
        }
    }
    // Scalar tail for the final < 8 pixels.
    // NOTE(review): tail divides by 255.0 while SIMD multiplies by (1/255) and
    // folds +128 into y before the fma — can differ in the last ulp.
    for i in (chunks * 8)..num_pixels {
        let y = y_plane[i];
        let cb = cb_plane[i];
        let cr = cr_plane[i];
        let r = 1.402f32.mul_add(cr, y);
        let g = (-0.344136f32).mul_add(cb, (-0.714136f32).mul_add(cr, y));
        let b = 1.772f32.mul_add(cb, y);
        let idx = i * 3;
        rgb[idx] = (r + 128.0) / 255.0;
        rgb[idx + 1] = (g + 128.0) / 255.0;
        rgb[idx + 2] = (b + 128.0) / 255.0;
    }
}
/// Expands a level-shifted grayscale f32 plane into packed RGB24 with
/// R = G = B, dispatching via `incant!`.
pub fn gray_f32_to_rgb_u8(y_plane: &[f32], rgb: &mut [u8]) {
    incant!(gray_f32_to_rgb_u8_impl(y_plane, rgb));
}
/// SIMD core for `gray_f32_to_rgb_u8`: adds 128 (undo level shift), clamps to
/// [0, 255], truncates to u8, and replicates the value into R, G and B.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn gray_f32_to_rgb_u8_impl(token: Token, y_plane: &[f32], rgb: &mut [u8]) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    let num_pixels = y_plane.len();
    let offset = f32x8::splat(token, 128.0);
    let zero = f32x8::splat(token, 0.0);
    let max_val = f32x8::splat(token, 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let base = chunk * 8;
        let y = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&y_plane[base..base + 8]).unwrap(),
        );
        let val = (y + offset).max(zero).min(max_val);
        let arr = val.to_array();
        // Broadcast the gray value to all three channels.
        for j in 0..8 {
            let idx = (base + j) * 3;
            let v = arr[j] as u8;
            rgb[idx] = v;
            rgb[idx + 1] = v;
            rgb[idx + 2] = v;
        }
    }
    // Scalar tail for the final < 8 pixels.
    for i in (chunks * 8)..num_pixels {
        let val = (y_plane[i] + 128.0).clamp(0.0, 255.0) as u8;
        let idx = i * 3;
        rgb[idx] = val;
        rgb[idx + 1] = val;
        rgb[idx + 2] = val;
    }
}
/// Expands a level-shifted grayscale f32 plane into packed RGB normalized as
/// `(value + 128) / 255` with R = G = B, dispatching via `incant!`.
pub fn gray_f32_to_rgb_f32(y_plane: &[f32], rgb: &mut [f32]) {
    incant!(gray_f32_to_rgb_f32_impl(y_plane, rgb));
}
/// SIMD core for `gray_f32_to_rgb_f32`: `(value + 128) / 255`, no clamping,
/// replicated into R, G and B.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn gray_f32_to_rgb_f32_impl(token: Token, y_plane: &[f32], rgb: &mut [f32]) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    let num_pixels = y_plane.len();
    let offset = f32x8::splat(token, 128.0);
    let scale = f32x8::splat(token, 1.0 / 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let base = chunk * 8;
        let y = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&y_plane[base..base + 8]).unwrap(),
        );
        let val = (y + offset) * scale;
        let arr = val.to_array();
        // Broadcast the gray value to all three channels.
        for j in 0..8 {
            let idx = (base + j) * 3;
            rgb[idx] = arr[j];
            rgb[idx + 1] = arr[j];
            rgb[idx + 2] = arr[j];
        }
    }
    // Scalar tail for the final < 8 pixels.
    for i in (chunks * 8)..num_pixels {
        let val = (y_plane[i] + 128.0) / 255.0;
        let idx = i * 3;
        rgb[idx] = val;
        rgb[idx + 1] = val;
        rgb[idx + 2] = val;
    }
}
/// Converts a level-shifted grayscale f32 plane to u8 (adds 128, clamps,
/// truncates), dispatching via `incant!`.
pub fn gray_f32_to_gray_u8(y_plane: &[f32], output: &mut [u8]) {
    incant!(gray_f32_to_gray_u8_impl(y_plane, output));
}
/// SIMD core for `gray_f32_to_gray_u8`: adds 128 (undo level shift), clamps
/// to [0, 255] and truncates to u8.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn gray_f32_to_gray_u8_impl(token: Token, y_plane: &[f32], output: &mut [u8]) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(y_plane.len(), output.len());
    let num_pixels = y_plane.len();
    let offset = f32x8::splat(token, 128.0);
    let zero = f32x8::splat(token, 0.0);
    let max_val = f32x8::splat(token, 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let base = chunk * 8;
        let y = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&y_plane[base..base + 8]).unwrap(),
        );
        let val = (y + offset).max(zero).min(max_val);
        let arr = val.to_array();
        for j in 0..8 {
            output[base + j] = arr[j] as u8;
        }
    }
    // Scalar tail for the final < 8 pixels.
    for i in (chunks * 8)..num_pixels {
        output[i] = (y_plane[i] + 128.0).clamp(0.0, 255.0) as u8;
    }
}
/// Normalizes a level-shifted grayscale f32 plane as `(value + 128) / 255`
/// (no clamping), dispatching via `incant!`.
pub fn gray_f32_to_gray_f32(y_plane: &[f32], output: &mut [f32]) {
    incant!(gray_f32_to_gray_f32_impl(y_plane, output));
}
/// SIMD core for `gray_f32_to_gray_f32`: `(value + 128) / 255`, no clamping.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn gray_f32_to_gray_f32_impl(token: Token, y_plane: &[f32], output: &mut [f32]) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    debug_assert_eq!(y_plane.len(), output.len());
    let num_pixels = y_plane.len();
    let offset = f32x8::splat(token, 128.0);
    let scale = f32x8::splat(token, 1.0 / 255.0);
    let chunks = num_pixels / 8;
    for chunk in 0..chunks {
        let base = chunk * 8;
        let y = f32x8::from_array(
            token,
            *<&[f32; 8]>::try_from(&y_plane[base..base + 8]).unwrap(),
        );
        let val = (y + offset) * scale;
        let arr = val.to_array();
        output[base..base + 8].copy_from_slice(&arr);
    }
    // Scalar tail for the final < 8 pixels.
    for i in (chunks * 8)..num_pixels {
        output[i] = (y_plane[i] + 128.0) / 255.0;
    }
}
/// Reorders one BGR pixel into RGB byte order.
#[inline]
pub fn bgr_to_rgb(bgr: &[u8; 3]) -> [u8; 3] {
    let [b, g, r] = *bgr;
    [r, g, b]
}
/// Reorders one BGRA pixel into RGBA byte order; alpha is preserved.
#[inline]
pub fn bgra_to_rgba(bgra: &[u8; 4]) -> [u8; 4] {
    let [b, g, r, a] = *bgra;
    [r, g, b, a]
}
/// Swaps the R and B bytes of every pixel in a packed RGB24 buffer in place
/// (RGB <-> BGR).
#[cfg(feature = "decoder")]
pub fn rgb_u8_swap_rb_inplace(data: &mut [u8]) {
    debug_assert_eq!(data.len() % 3, 0);
    for px in data.chunks_exact_mut(3) {
        let first = px[0];
        px[0] = px[2];
        px[2] = first;
    }
}
/// Expands packed RGB24 into RGBA32 with an opaque (255) alpha byte.
/// `dst` must hold at least `src.len() / 3 * 4` bytes.
#[cfg(feature = "decoder")]
pub fn rgb_u8_to_rgba_u8(src: &[u8], dst: &mut [u8]) {
    debug_assert_eq!(src.len() % 3, 0);
    let npixels = src.len() / 3;
    debug_assert!(dst.len() >= npixels * 4);
    for (rgb, rgba) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
        rgba[..3].copy_from_slice(rgb);
        rgba[3] = 255;
    }
}
/// Expands packed RGB24 into BGRA32 with an opaque (255) alpha byte.
/// `dst` must hold at least `src.len() / 3 * 4` bytes.
#[cfg(feature = "decoder")]
pub fn rgb_u8_to_bgra_u8(src: &[u8], dst: &mut [u8]) {
    debug_assert_eq!(src.len() % 3, 0);
    let npixels = src.len() / 3;
    debug_assert!(dst.len() >= npixels * 4);
    for (rgb, bgra) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
        bgra[0] = rgb[2];
        bgra[1] = rgb[1];
        bgra[2] = rgb[0];
        bgra[3] = 255;
    }
}
/// Expands packed RGB24 into BGRX32: identical to `rgb_u8_to_bgra_u8`, with
/// the X (padding) byte set to 255.
#[cfg(feature = "decoder")]
#[inline]
pub fn rgb_u8_to_bgrx_u8(src: &[u8], dst: &mut [u8]) {
    rgb_u8_to_bgra_u8(src, dst);
}
/// Converts one CMYK pixel (standard, non-inverted convention) to RGB using
/// the naive formula `channel = 255 * (1 - c) * (1 - k)`.
#[inline]
#[must_use]
pub fn cmyk_to_rgb(c: u8, m: u8, y: u8, k: u8) -> (u8, u8, u8) {
    let quantize = |v: f32| v.round().clamp(0.0, 255.0) as u8;
    // Normalize all inputs to [0, 1].
    let norm = |v: u8| v as f32 / 255.0;
    let key = 1.0 - norm(k);
    let r = 255.0 * (1.0 - norm(c)) * key;
    let g = 255.0 * (1.0 - norm(m)) * key;
    let b = 255.0 * (1.0 - norm(y)) * key;
    (quantize(r), quantize(g), quantize(b))
}
/// Converts one RGB pixel to CMYK (standard, non-inverted convention).
///
/// Pure black maps to `(0, 0, 0, 255)` directly; otherwise K is derived from
/// the brightest channel and C/M/Y are scaled by `1 / (1 - k)`.
#[inline]
#[must_use]
pub fn rgb_to_cmyk(r: u8, g: u8, b: u8) -> (u8, u8, u8, u8) {
    let rf = r as f32 / 255.0;
    let gf = g as f32 / 255.0;
    let bf = b as f32 / 255.0;
    let k = 1.0 - rf.max(gf).max(bf);
    // Pure black: the general formula would divide by zero.
    if k >= 1.0 {
        return (0, 0, 0, 255);
    }
    let inv = 1.0 - k;
    let quantize = |v: f32| (v * 255.0).round() as u8;
    (
        quantize((1.0 - rf - k) / inv),
        quantize((1.0 - gf - k) / inv),
        quantize((1.0 - bf - k) / inv),
        quantize(k),
    )
}
/// Converts one Adobe-convention (inverted) CMYK pixel to RGB using integer
/// arithmetic: `channel * k / 255`, rounded to nearest (+127 before divide).
#[inline]
#[must_use]
pub fn cmyk_adobe_to_rgb(c: u8, m: u8, y: u8, k: u8) -> (u8, u8, u8) {
    let k32 = k as u32;
    // Round-to-nearest fixed-point multiply by k/255.
    let scale = |ch: u8| ((ch as u32 * k32 + 127) / 255) as u8;
    (scale(c), scale(m), scale(y))
}
/// Converts one YCCK pixel (Adobe JPEG: YCbCr-encoded inverted CMY plus a K
/// channel) to RGB.
#[inline]
#[must_use]
pub fn ycck_to_rgb(y: u8, cb: u8, cr: u8, k: u8) -> (u8, u8, u8) {
    // The YCbCr transform recovers the inverted C/M/Y channels.
    let (c_inv, m_inv, y_inv) = ycbcr_to_rgb(y, cb, cr);
    let k32 = k as u32;
    // (255 - channel) * k / 255, rounded to nearest (+127 before divide).
    let apply_k = |ch: u8| (((255 - ch as u32) * k32 + 127) / 255) as u8;
    (apply_k(c_inv), apply_k(m_inv), apply_k(y_inv))
}
/// Converts four level-shifted f32 CMYK planes (Adobe convention) into packed
/// RGB24. All planes must have equal length; `rgb` must be 3x that length.
pub fn cmyk_planes_to_rgb_u8(
    c_plane: &[f32],
    m_plane: &[f32],
    y_plane: &[f32],
    k_plane: &[f32],
    rgb: &mut [u8],
) {
    debug_assert_eq!(c_plane.len(), m_plane.len());
    debug_assert_eq!(c_plane.len(), y_plane.len());
    debug_assert_eq!(c_plane.len(), k_plane.len());
    debug_assert_eq!(rgb.len(), c_plane.len() * 3);
    // Undo the -128 level shift and quantize a coefficient back to u8.
    let to_u8 = |v: f32| (v + 128.0).round().clamp(0.0, 255.0) as u8;
    for px in 0..c_plane.len() {
        let (r, g, b) = cmyk_adobe_to_rgb(
            to_u8(c_plane[px]),
            to_u8(m_plane[px]),
            to_u8(y_plane[px]),
            to_u8(k_plane[px]),
        );
        rgb[px * 3] = r;
        rgb[px * 3 + 1] = g;
        rgb[px * 3 + 2] = b;
    }
}
/// Converts four level-shifted f32 YCCK planes (Adobe JPEG) into packed
/// RGB24. All planes must have equal length; `rgb` must be 3x that length.
pub fn ycck_planes_to_rgb_u8(
    y_plane: &[f32],
    cb_plane: &[f32],
    cr_plane: &[f32],
    k_plane: &[f32],
    rgb: &mut [u8],
) {
    debug_assert_eq!(y_plane.len(), cb_plane.len());
    debug_assert_eq!(y_plane.len(), cr_plane.len());
    debug_assert_eq!(y_plane.len(), k_plane.len());
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    // Undo the -128 level shift and quantize a coefficient back to u8.
    let to_u8 = |v: f32| (v + 128.0).round().clamp(0.0, 255.0) as u8;
    for px in 0..y_plane.len() {
        let (r, g, b) = ycck_to_rgb(
            to_u8(y_plane[px]),
            to_u8(cb_plane[px]),
            to_u8(cr_plane[px]),
            to_u8(k_plane[px]),
        );
        rgb[px * 3] = r;
        rgb[px * 3 + 1] = g;
        rgb[px * 3 + 2] = b;
    }
}
/// Copies one channel of interleaved pixel data into a new buffer.
/// Trailing bytes that do not form a whole pixel are ignored.
///
/// # Errors
/// Returns an error if the output allocation fails.
///
/// # Panics
/// Panics if `channel >= format.bytes_per_pixel()` (index out of bounds).
pub fn extract_channel(data: &[u8], format: PixelFormat, channel: usize) -> Result<Vec<u8>> {
    let bpp = format.bytes_per_pixel();
    let num_pixels = data.len() / bpp;
    let mut result = try_alloc_zeroed(num_pixels, "channel extraction buffer")?;
    for (dst, px) in result.iter_mut().zip(data.chunks_exact(bpp)) {
        *dst = px[channel];
    }
    Ok(result)
}
// Q14 fixed-point YCbCr -> RGB coefficients: each value is the floating-point
// coefficient multiplied by 2^14 (16384) and rounded. Results are shifted
// right by 14 after accumulation. (Originally crammed onto a single line.)
const Y_CF_INT: i32 = 16384; //  1.0      * 16384
const CR_TO_R_INT: i32 = 22970; //  1.402    * 16384
const CB_TO_B_INT: i32 = 29032; //  1.772    * 16384
const CR_TO_G_INT: i32 = -11700; // -0.714136 * 16384
const CB_TO_G_INT: i32 = -5638; // -0.344136 * 16384
const YUV_ROUND: i32 = 8192; //  0.5 in Q14, added for round-to-nearest
/// Converts a 16-pixel block of YCbCr (i16 samples, u8 value range) to packed
/// RGB24, writing 48 bytes starting at `rgb[*offset]` and advancing `offset`
/// by 48.
#[inline]
pub fn ycbcr_to_rgb_i16_x16(
    y: &[i16; 16],
    cb: &[i16; 16],
    cr: &[i16; 16],
    rgb: &mut [u8],
    offset: &mut usize,
) {
    // Fast path: hand-written AVX2 kernel when the CPU supports it.
    #[cfg(target_arch = "x86_64")]
    {
        if let Some(token) = archmage::X64V3Token::summon() {
            ycbcr_to_rgb_i16_x16_avx2(token, y, cb, cr, rgb, offset);
            return;
        }
    }
    // Portable path: two 8-pixel halves (8 pixels * 3 bytes = 24-byte stride).
    incant!(ycbcr_to_rgb_i16_x8_generic(y, cb, cr, rgb, *offset));
    incant!(ycbcr_to_rgb_i16_x8_generic(
        &y[8..],
        &cb[8..],
        &cr[8..],
        rgb,
        *offset + 24
    ));
    *offset += 48;
}
/// Scalar Q14 fixed-point reference path: 16 YCbCr pixels -> 48 RGB bytes at
/// `rgb[*offset..]`; advances `offset` by 48.
#[inline]
fn ycbcr_to_rgb_i16_x16_scalar(
    y: &[i16; 16],
    cb: &[i16; 16],
    cr: &[i16; 16],
    rgb: &mut [u8],
    offset: &mut usize,
) {
    let mut idx = *offset;
    for lane in 0..16 {
        // Scale luma into Q14 and pre-add the rounding constant.
        let lum = i32::from(y[lane]) * Y_CF_INT + YUV_ROUND;
        // Chroma is biased by +128; center it before applying coefficients.
        let cb_c = i32::from(cb[lane]) - 128;
        let cr_c = i32::from(cr[lane]) - 128;
        let r = (lum + cr_c * CR_TO_R_INT) >> 14;
        let g = (lum + cr_c * CR_TO_G_INT + cb_c * CB_TO_G_INT) >> 14;
        let b = (lum + cb_c * CB_TO_B_INT) >> 14;
        rgb[idx] = r.clamp(0, 255) as u8;
        rgb[idx + 1] = g.clamp(0, 255) as u8;
        rgb[idx + 2] = b.clamp(0, 255) as u8;
        idx += 3;
    }
    *offset += 48;
}
/// Portable SIMD core: 8 YCbCr pixels (Q14 fixed point) -> 24 RGB bytes at
/// `rgb[base..]`. Processes two i32x4 halves; slices must hold >= 8 samples.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn ycbcr_to_rgb_i16_x8_generic(
    token: Token,
    y: &[i16],
    cb: &[i16],
    cr: &[i16],
    rgb: &mut [u8],
    base: usize,
) {
    #[allow(non_camel_case_types)]
    type i32x4 = GenericI32x4<Token>;
    let y_coeff = i32x4::splat(token, Y_CF_INT);
    let rounding = i32x4::splat(token, YUV_ROUND);
    // Chroma samples carry a +128 bias that must be removed first.
    let bias = i32x4::splat(token, 128);
    let zero = i32x4::zero(token);
    let max255 = i32x4::splat(token, 255);
    let cr_to_r = i32x4::splat(token, CR_TO_R_INT);
    let cr_to_g = i32x4::splat(token, CR_TO_G_INT);
    let cb_to_g = i32x4::splat(token, CB_TO_G_INT);
    let cb_to_b = i32x4::splat(token, CB_TO_B_INT);
    // Two 4-pixel halves per call.
    for half in 0..2 {
        let off = half * 4;
        // Widen i16 samples to i32 lanes.
        let y4 = i32x4::from_array(
            token,
            [
                i32::from(y[off]),
                i32::from(y[off + 1]),
                i32::from(y[off + 2]),
                i32::from(y[off + 3]),
            ],
        );
        let cb4 = i32x4::from_array(
            token,
            [
                i32::from(cb[off]),
                i32::from(cb[off + 1]),
                i32::from(cb[off + 2]),
                i32::from(cb[off + 3]),
            ],
        ) - bias;
        let cr4 = i32x4::from_array(
            token,
            [
                i32::from(cr[off]),
                i32::from(cr[off + 1]),
                i32::from(cr[off + 2]),
                i32::from(cr[off + 3]),
            ],
        ) - bias;
        // Q14 accumulate, round, then shift back down by 14.
        let y_scaled = y4 * y_coeff + rounding;
        let r = (y_scaled + cr4 * cr_to_r).shr_arithmetic::<14>();
        let g = (y_scaled + cr4 * cr_to_g + cb4 * cb_to_g).shr_arithmetic::<14>();
        let b = (y_scaled + cb4 * cb_to_b).shr_arithmetic::<14>();
        // Saturate into the u8 range.
        let r = r.max(zero).min(max255);
        let g = g.max(zero).min(max255);
        let b = b.max(zero).min(max255);
        let ra = r.to_array();
        let ga = g.to_array();
        let ba = b.to_array();
        // Interleave the four pixels into packed RGB24.
        let idx = base + off * 3;
        for i in 0..4 {
            rgb[idx + i * 3] = ra[i] as u8;
            rgb[idx + i * 3 + 1] = ga[i] as u8;
            rgb[idx + i * 3 + 2] = ba[i] as u8;
        }
    }
}
/// AVX2 kernel: 16 YCbCr pixels (Q14 fixed point, i16) -> 48 packed RGB bytes
/// at `rgb[*offset..]`; advances `offset` by 48. Requires an X64V3 token.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ycbcr_to_rgb_i16_x16_avx2(
    _token: archmage::X64V3Token,
    y: &[i16; 16],
    cb: &[i16; 16],
    cr: &[i16; 16],
    rgb: &mut [u8],
    offset: &mut usize,
) {
    use core::arch::x86_64::*;
    // Load 16 i16 samples per plane.
    let (y_vec, cb_vec, cr_vec) = (
        safe_simd::_mm256_loadu_si256(y),
        safe_simd::_mm256_loadu_si256(cb),
        safe_simd::_mm256_loadu_si256(cr),
    );
    // Remove the +128 chroma bias while still in i16 lanes.
    let bias = _mm256_set1_epi16(128);
    let cb_centered = _mm256_sub_epi16(cb_vec, bias);
    let cr_centered = _mm256_sub_epi16(cr_vec, bias);
    let y_coeff = _mm256_set1_epi32(Y_CF_INT);
    let rounding = _mm256_set1_epi32(YUV_ROUND);
    let zero = _mm256_setzero_si256();
    // Zero-extend luma to i32 (luma is non-negative) and scale into Q14.
    let y_lo = _mm256_unpacklo_epi16(y_vec, zero);
    let y_hi = _mm256_unpackhi_epi16(y_vec, zero);
    let y_scaled_lo = _mm256_add_epi32(_mm256_mullo_epi32(y_lo, y_coeff), rounding);
    let y_scaled_hi = _mm256_add_epi32(_mm256_mullo_epi32(y_hi, y_coeff), rounding);
    // Sign-extend centered chroma to i32 by pairing each i16 with its sign.
    let cb_sign = _mm256_srai_epi16(cb_centered, 15); let cr_sign = _mm256_srai_epi16(cr_centered, 15);
    let cb_lo = _mm256_unpacklo_epi16(cb_centered, cb_sign);
    let cb_hi = _mm256_unpackhi_epi16(cb_centered, cb_sign);
    let cr_lo = _mm256_unpacklo_epi16(cr_centered, cr_sign);
    let cr_hi = _mm256_unpackhi_epi16(cr_centered, cr_sign);
    // Per-channel Q14 accumulate and >> 14 (R = y + cr*coef, etc.).
    let r_lo = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_lo,
            _mm256_mullo_epi32(cr_lo, _mm256_set1_epi32(CR_TO_R_INT)),
        ),
        14,
    );
    let r_hi = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_hi,
            _mm256_mullo_epi32(cr_hi, _mm256_set1_epi32(CR_TO_R_INT)),
        ),
        14,
    );
    let g_lo = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_lo,
            _mm256_add_epi32(
                _mm256_mullo_epi32(cr_lo, _mm256_set1_epi32(CR_TO_G_INT)),
                _mm256_mullo_epi32(cb_lo, _mm256_set1_epi32(CB_TO_G_INT)),
            ),
        ),
        14,
    );
    let g_hi = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_hi,
            _mm256_add_epi32(
                _mm256_mullo_epi32(cr_hi, _mm256_set1_epi32(CR_TO_G_INT)),
                _mm256_mullo_epi32(cb_hi, _mm256_set1_epi32(CB_TO_G_INT)),
            ),
        ),
        14,
    );
    let b_lo = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_lo,
            _mm256_mullo_epi32(cb_lo, _mm256_set1_epi32(CB_TO_B_INT)),
        ),
        14,
    );
    let b_hi = _mm256_srai_epi32(
        _mm256_add_epi32(
            y_scaled_hi,
            _mm256_mullo_epi32(cb_hi, _mm256_set1_epi32(CB_TO_B_INT)),
        ),
        14,
    );
    // Narrow i32 -> i16 (signed saturate) -> u8 (unsigned saturate = clamp).
    let r_16 = _mm256_packs_epi32(r_lo, r_hi);
    let g_16 = _mm256_packs_epi32(g_lo, g_hi);
    let b_16 = _mm256_packs_epi32(b_lo, b_hi);
    let r_8 = _mm256_packus_epi16(r_16, _mm256_setzero_si256());
    let g_8 = _mm256_packus_epi16(g_16, _mm256_setzero_si256());
    let b_8 = _mm256_packus_epi16(b_16, _mm256_setzero_si256());
    // packs/packus operate per 128-bit lane; restore pixel order.
    let r_8 = _mm256_permute4x64_epi64(r_8, 0b11_01_10_00);
    let g_8 = _mm256_permute4x64_epi64(g_8, 0b11_01_10_00);
    let b_8 = _mm256_permute4x64_epi64(b_8, 0b11_01_10_00);
    // Byte shuffles + alternating blend masks interleave the three planes
    // into RGBRGB... within the two 128-bit lanes.
    let sh_r = _mm256_setr_epi8(
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
        9, 4, 15, 10, 5,
    );
    let sh_g = _mm256_setr_epi8(
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
        14, 9, 4, 15, 10,
    );
    let sh_b = _mm256_setr_epi8(
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
        3, 14, 9, 4, 15,
    );
    let r0 = _mm256_shuffle_epi8(r_8, sh_r);
    let g0 = _mm256_shuffle_epi8(g_8, sh_g);
    let b0 = _mm256_shuffle_epi8(b_8, sh_b);
    let m0 = _mm256_setr_epi8(
        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
        0, 0, -1, 0, 0,
    );
    let m1 = _mm256_setr_epi8(
        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
        -1, 0, 0, -1, 0,
    );
    let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
    let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
    let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
    // Assemble the final 48 contiguous output bytes and store 32 + 16.
    let rgb0 = _mm256_permute2x128_si256(p0, p1, 0x20);
    let rgb1 = _mm256_permute2x128_si256(p2, p0, 0x30);
    safe_simd::_mm256_storeu_si256(
        <&mut [u8; 32]>::try_from(&mut rgb[*offset..*offset + 32]).unwrap(),
        rgb0,
    );
    safe_simd::_mm_storeu_si128(
        <&mut [u8; 16]>::try_from(&mut rgb[*offset + 32..*offset + 48]).unwrap(),
        _mm256_castsi256_si128(rgb1),
    );
    *offset += 48;
}
/// Planar Q14 fixed-point YCbCr -> planar R/G/B (u8), written as a simple
/// loop so the compiler's auto-vectorizer (via `autoversion`) can widen it.
#[archmage::autoversion]
fn ycbcr_to_rgb_planes_autovec(
    y_plane: &[i16],
    cb_plane: &[i16],
    cr_plane: &[i16],
    r_out: &mut [u8],
    g_out: &mut [u8],
    b_out: &mut [u8],
) {
    for (px, &y16) in y_plane.iter().enumerate() {
        // Scale luma into Q14 and pre-add the rounding constant.
        let lum = i32::from(y16) * Y_CF_INT + YUV_ROUND;
        // Remove the +128 chroma bias.
        let cb_c = i32::from(cb_plane[px]) - 128;
        let cr_c = i32::from(cr_plane[px]) - 128;
        r_out[px] = ((lum + cr_c * CR_TO_R_INT) >> 14).clamp(0, 255) as u8;
        g_out[px] = ((lum + cr_c * CR_TO_G_INT + cb_c * CB_TO_G_INT) >> 14).clamp(0, 255) as u8;
        b_out[px] = ((lum + cb_c * CB_TO_B_INT) >> 14).clamp(0, 255) as u8;
    }
}
/// Interleaves planar R/G/B bytes into a packed RGB24 buffer; iterates over
/// `r.len()` pixels.
#[archmage::autoversion]
fn interleave_rgb_planes(r: &[u8], g: &[u8], b: &[u8], rgb: &mut [u8]) {
    for (px, &red) in r.iter().enumerate() {
        let dst = px * 3;
        rgb[dst] = red;
        rgb[dst + 1] = g[px];
        rgb[dst + 2] = b[px];
    }
}
/// Converts planar i16 YCbCr (Q14 fixed-point path, u8 value range) into
/// packed RGB24, picking AVX-512, then AVX2, then a scalar loop.
pub fn ycbcr_planes_i16_to_rgb_u8(
    y_plane: &[i16],
    cb_plane: &[i16],
    cr_plane: &[i16],
    rgb: &mut [u8],
) {
    debug_assert_eq!(y_plane.len(), cb_plane.len());
    debug_assert_eq!(y_plane.len(), cr_plane.len());
    debug_assert_eq!(rgb.len(), y_plane.len() * 3);
    let len = y_plane.len();
    // Prefer the widest available x86 kernel; tokens prove CPU support.
    #[cfg(target_arch = "x86_64")]
    {
        if let Some(token) = archmage::X64V4Token::summon() {
            ycbcr_planes_i16_to_rgb_u8_avx512(token, y_plane, cb_plane, cr_plane, rgb);
            return;
        }
    }
    #[cfg(target_arch = "x86_64")]
    {
        if let Some(token) = archmage::X64V3Token::summon() {
            ycbcr_planes_i16_to_rgb_u8_avx2(token, y_plane, cb_plane, cr_plane, rgb);
            return;
        }
    }
    // Scalar fallback: same Q14 math as the SIMD kernels.
    for i in 0..len {
        let y_val = i32::from(y_plane[i]);
        let cb_val = i32::from(cb_plane[i]) - 128;
        let cr_val = i32::from(cr_plane[i]) - 128;
        let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
        let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
        let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
        let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
        let idx = i * 3;
        rgb[idx] = r.clamp(0, 255) as u8;
        rgb[idx + 1] = g.clamp(0, 255) as u8;
        rgb[idx + 2] = b.clamp(0, 255) as u8;
    }
}
/// AVX2 kernel for `ycbcr_planes_i16_to_rgb_u8`: 16 pixels per iteration with
/// a scalar tail. Same pipeline as `ycbcr_to_rgb_i16_x16_avx2`, driven over
/// whole planes via `chunks_exact`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ycbcr_planes_i16_to_rgb_u8_avx2(
    _token: archmage::X64V3Token,
    y_plane: &[i16],
    cb_plane: &[i16],
    cr_plane: &[i16],
    rgb: &mut [u8],
) {
    use core::arch::x86_64::*;
    let len = y_plane.len();
    // Q14 coefficients and the +128 chroma bias, broadcast once.
    let bias = _mm256_set1_epi16(128);
    let y_coeff = _mm256_set1_epi32(Y_CF_INT);
    let rounding = _mm256_set1_epi32(YUV_ROUND);
    let cr_to_r = _mm256_set1_epi32(CR_TO_R_INT);
    let cr_to_g = _mm256_set1_epi32(CR_TO_G_INT);
    let cb_to_g = _mm256_set1_epi32(CB_TO_G_INT);
    let cb_to_b = _mm256_set1_epi32(CB_TO_B_INT);
    let zero = _mm256_setzero_si256();
    // Byte shuffles + alternating blend masks used to interleave the three
    // u8 planes into packed RGB within each 128-bit lane.
    let sh_r = _mm256_setr_epi8(
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
        9, 4, 15, 10, 5,
    );
    let sh_g = _mm256_setr_epi8(
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
        14, 9, 4, 15, 10,
    );
    let sh_b = _mm256_setr_epi8(
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
        3, 14, 9, 4, 15,
    );
    let m0 = _mm256_setr_epi8(
        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
        0, 0, -1, 0, 0,
    );
    let m1 = _mm256_setr_epi8(
        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
        -1, 0, 0, -1, 0,
    );
    // Capture the remainder length before the iterator is consumed below.
    let y_chunks = y_plane.chunks_exact(16);
    let remainder_len = y_chunks.remainder().len();
    for ((y_chunk, cb_chunk), (cr_chunk, rgb_chunk)) in y_chunks
        .zip(cb_plane.chunks_exact(16))
        .zip(cr_plane.chunks_exact(16).zip(rgb.chunks_exact_mut(48)))
    {
        // Load 16 i16 samples per plane.
        let (y_vec, cb_vec, cr_vec) = (
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(y_chunk).unwrap()),
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(cb_chunk).unwrap()),
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(cr_chunk).unwrap()),
        );
        // Center chroma at zero while still in i16 lanes.
        let cb_centered = _mm256_sub_epi16(cb_vec, bias);
        let cr_centered = _mm256_sub_epi16(cr_vec, bias);
        // Zero-extend luma to i32 and scale into Q14 with rounding pre-added.
        let y_lo = _mm256_unpacklo_epi16(y_vec, zero);
        let y_hi = _mm256_unpackhi_epi16(y_vec, zero);
        let y_scaled_lo = _mm256_add_epi32(_mm256_mullo_epi32(y_lo, y_coeff), rounding);
        let y_scaled_hi = _mm256_add_epi32(_mm256_mullo_epi32(y_hi, y_coeff), rounding);
        // Sign-extend centered chroma to i32 by pairing with its sign bits.
        let cb_sign = _mm256_srai_epi16(cb_centered, 15);
        let cr_sign = _mm256_srai_epi16(cr_centered, 15);
        let cb_lo = _mm256_unpacklo_epi16(cb_centered, cb_sign);
        let cb_hi = _mm256_unpackhi_epi16(cb_centered, cb_sign);
        let cr_lo = _mm256_unpacklo_epi16(cr_centered, cr_sign);
        let cr_hi = _mm256_unpackhi_epi16(cr_centered, cr_sign);
        // Per-channel Q14 accumulate and >> 14.
        let r_lo = _mm256_srai_epi32(
            _mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cr_lo, cr_to_r)),
            14,
        );
        let r_hi = _mm256_srai_epi32(
            _mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cr_hi, cr_to_r)),
            14,
        );
        let g_lo = _mm256_srai_epi32(
            _mm256_add_epi32(
                y_scaled_lo,
                _mm256_add_epi32(
                    _mm256_mullo_epi32(cr_lo, cr_to_g),
                    _mm256_mullo_epi32(cb_lo, cb_to_g),
                ),
            ),
            14,
        );
        let g_hi = _mm256_srai_epi32(
            _mm256_add_epi32(
                y_scaled_hi,
                _mm256_add_epi32(
                    _mm256_mullo_epi32(cr_hi, cr_to_g),
                    _mm256_mullo_epi32(cb_hi, cb_to_g),
                ),
            ),
            14,
        );
        let b_lo = _mm256_srai_epi32(
            _mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cb_lo, cb_to_b)),
            14,
        );
        let b_hi = _mm256_srai_epi32(
            _mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cb_hi, cb_to_b)),
            14,
        );
        // Narrow i32 -> i16 (signed saturate) -> u8 (unsigned saturate), then
        // fix lane order (packs operate per 128-bit lane).
        let r_16 = _mm256_packs_epi32(r_lo, r_hi);
        let g_16 = _mm256_packs_epi32(g_lo, g_hi);
        let b_16 = _mm256_packs_epi32(b_lo, b_hi);
        let r_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_16, zero), 0b11_01_10_00);
        let g_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(g_16, zero), 0b11_01_10_00);
        let b_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(b_16, zero), 0b11_01_10_00);
        // Shuffle/blend the planes into interleaved RGB and store 32 + 16 bytes.
        let r0 = _mm256_shuffle_epi8(r_8, sh_r);
        let g0 = _mm256_shuffle_epi8(g_8, sh_g);
        let b0 = _mm256_shuffle_epi8(b_8, sh_b);
        let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
        let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
        let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
        let rgb0 = _mm256_permute2x128_si256(p0, p1, 0x20);
        let rgb1 = _mm256_permute2x128_si256(p2, p0, 0x30);
        let (rgb_lo, rgb_hi) = rgb_chunk.split_at_mut(32);
        safe_simd::_mm256_storeu_si256(<&mut [u8; 32]>::try_from(rgb_lo).unwrap(), rgb0);
        safe_simd::_mm_storeu_si128(
            <&mut [u8; 16]>::try_from(rgb_hi).unwrap(),
            _mm256_castsi256_si128(rgb1),
        );
    }
    // Scalar tail for the final < 16 pixels (same Q14 math).
    let remainder_start = len - remainder_len;
    for i in remainder_start..len {
        let y_val = i32::from(y_plane[i]);
        let cb_val = i32::from(cb_plane[i]) - 128;
        let cr_val = i32::from(cr_plane[i]) - 128;
        let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
        let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
        let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
        let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
        let idx = i * 3;
        rgb[idx] = r.clamp(0, 255) as u8;
        rgb[idx + 1] = g.clamp(0, 255) as u8;
        rgb[idx + 2] = b.clamp(0, 255) as u8;
    }
}
/// AVX-512 kernel: converts planar YCbCr samples stored as `i16` into
/// interleaved RGB24 bytes, 16 pixels per loop iteration, with a scalar
/// tail for the remaining `len % 16` pixels.
///
/// All arithmetic is Q14 fixed point — `(y * Y_CF_INT + YUV_ROUND +
/// chroma_terms) >> 14` — so SIMD and scalar pixels are bit-identical.
/// NOTE(review): the slice indexing assumes `cb_plane`/`cr_plane` are at
/// least as long as `y_plane` and `rgb.len() >= 3 * y_plane.len()`;
/// confirm the (unseen) dispatcher guarantees this, else this panics.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ycbcr_planes_i16_to_rgb_u8_avx512(
_token: archmage::X64V4Token,
y_plane: &[i16],
cb_plane: &[i16],
cr_plane: &[i16],
rgb: &mut [u8],
) {
use core::arch::x86_64::*;
let len = y_plane.len();
// 16 pixels per iteration: one 256-bit (16 x i16) load per plane.
let chunks = len / 16;
// Q14 fixed-point coefficients broadcast across all 16 lanes.
let y_coeff = _mm512_set1_epi32(Y_CF_INT);
let rounding = _mm512_set1_epi32(YUV_ROUND);
let cr_to_r = _mm512_set1_epi32(CR_TO_R_INT);
let cr_to_g = _mm512_set1_epi32(CR_TO_G_INT);
let cb_to_g = _mm512_set1_epi32(CB_TO_G_INT);
let cb_to_b = _mm512_set1_epi32(CB_TO_B_INT);
// Chroma samples carry a +128 bias; removed while still in i16.
let bias_16 = _mm256_set1_epi16(128);
let zero_256 = _mm256_setzero_si256();
// Byte-shuffle patterns and blend masks used below to interleave the
// three planar byte vectors into R,G,B,R,G,B,... output bytes.
let sh_r = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
9, 4, 15, 10, 5,
);
let sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
14, 9, 4, 15, 10,
);
let sh_b = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
3, 14, 9, 4, 15,
);
let m0 = _mm256_setr_epi8(
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
0, 0, -1, 0, 0,
);
let m1 = _mm256_setr_epi8(
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0,
);
for chunk in 0..chunks {
// 16 input samples per plane -> 48 interleaved RGB bytes out.
let in_offset = chunk * 16;
let out_offset = chunk * 48;
let y_vec = safe_simd::_mm256_loadu_si256(
<&[i16; 16]>::try_from(&y_plane[in_offset..in_offset + 16]).unwrap(),
);
let cb_vec = safe_simd::_mm256_loadu_si256(
<&[i16; 16]>::try_from(&cb_plane[in_offset..in_offset + 16]).unwrap(),
);
let cr_vec = safe_simd::_mm256_loadu_si256(
<&[i16; 16]>::try_from(&cr_plane[in_offset..in_offset + 16]).unwrap(),
);
// Center chroma around zero, then sign-extend everything to i32 lanes.
let cb_centered = _mm256_sub_epi16(cb_vec, bias_16);
let cr_centered = _mm256_sub_epi16(cr_vec, bias_16);
let y_32 = _mm512_cvtepi16_epi32(y_vec);
let cb_32 = _mm512_cvtepi16_epi32(cb_centered);
let cr_32 = _mm512_cvtepi16_epi32(cr_centered);
// Shared term: Y * coeff + rounding bias, reused by all three channels.
let y_scaled = _mm512_add_epi32(_mm512_mullo_epi32(y_32, y_coeff), rounding);
let r_32 = _mm512_srai_epi32(
_mm512_add_epi32(y_scaled, _mm512_mullo_epi32(cr_32, cr_to_r)),
14,
);
let g_32 = _mm512_srai_epi32(
_mm512_add_epi32(
y_scaled,
_mm512_add_epi32(
_mm512_mullo_epi32(cr_32, cr_to_g),
_mm512_mullo_epi32(cb_32, cb_to_g),
),
),
14,
);
let b_32 = _mm512_srai_epi32(
_mm512_add_epi32(y_scaled, _mm512_mullo_epi32(cb_32, cb_to_b)),
14,
);
// Saturating narrow i32 -> i16, then i16 -> u8 (packus clamps to 0..=255,
// matching the scalar tail's `clamp(0, 255)`).
let r_16 = _mm512_cvtsepi32_epi16(r_32);
let g_16 = _mm512_cvtsepi32_epi16(g_32);
let b_16 = _mm512_cvtsepi32_epi16(b_32);
// permute4x64 undoes the per-128-bit-lane ordering of packus.
let r_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_16, zero_256), 0b11_01_10_00);
let g_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(g_16, zero_256), 0b11_01_10_00);
let b_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(b_16, zero_256), 0b11_01_10_00);
// Shuffle each plane into position, then blend the three planes into
// interleaved RGB triplets.
let r0 = _mm256_shuffle_epi8(r_8, sh_r);
let g0 = _mm256_shuffle_epi8(g_8, sh_g);
let b0 = _mm256_shuffle_epi8(b_8, sh_b);
let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
let rgb0 = _mm256_permute2x128_si256(p0, p1, 0x20);
let rgb1 = _mm256_permute2x128_si256(p2, p0, 0x30);
// 48 output bytes: one 32-byte store plus one 16-byte store.
safe_simd::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut rgb[out_offset..out_offset + 32]).unwrap(),
rgb0,
);
safe_simd::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut rgb[out_offset + 32..out_offset + 48]).unwrap(),
_mm256_castsi256_si128(rgb1),
);
}
// Scalar tail: identical Q14 math for the last `len % 16` pixels.
let remainder_start = chunks * 16;
for i in remainder_start..len {
let y_val = i32::from(y_plane[i]);
let cb_val = i32::from(cb_plane[i]) - 128;
let cr_val = i32::from(cr_plane[i]) - 128;
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
let idx = i * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
}
/// Fused H2V2 "box" chroma upsample + YCbCr→RGB for one output row.
///
/// With box upsampling each chroma sample is replicated to the two
/// horizontally adjacent pixels it covers, so each sample's chroma terms
/// (`cr_r`, `chroma_g`, `cb_b`) are computed once and reused for both the
/// even ("left") and odd ("right") luma samples. All arithmetic is Q14
/// fixed point (multiply by `*_INT`, shift right 14), matching the scalar
/// tail at the bottom bit-for-bit. Instantiated for several backends by
/// `#[magetypes]` — presumably one per listed target; confirm with the
/// macro's documentation.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn fused_h2v2_box_ycbcr_to_rgb_u8_generic(
token: Token,
y_row: &[i16],
cb_row: &[i16],
cr_row: &[i16],
rgb: &mut [u8],
width: usize,
) {
#[allow(non_camel_case_types)]
type i32x4 = GenericI32x4<Token>;
let y_coeff = i32x4::splat(token, Y_CF_INT);
let rounding = i32x4::splat(token, YUV_ROUND);
let bias = i32x4::splat(token, 128);
let zero = i32x4::zero(token);
let max255 = i32x4::splat(token, 255);
let cr_to_r = i32x4::splat(token, CR_TO_R_INT);
let cr_to_g = i32x4::splat(token, CR_TO_G_INT);
let cb_to_g = i32x4::splat(token, CB_TO_G_INT);
let cb_to_b = i32x4::splat(token, CB_TO_B_INT);
let chroma_width = (width + 1) / 2;
// A vector iteration reads y_row[px_base..=px_base + 7], so only chroma
// indices whose full 8-pixel span fits inside `width` are vectorized;
// everything past that falls through to the scalar loop below.
let safe_chroma = if width >= 8 { (width - 7) / 2 } else { 0 };
let chunks = safe_chroma / 4;
for chunk in 0..chunks {
// 4 chroma samples -> 8 output pixels per iteration.
let cx_base = chunk * 4;
let cb4 = i32x4::from_array(
token,
[
i32::from(cb_row[cx_base]),
i32::from(cb_row[cx_base + 1]),
i32::from(cb_row[cx_base + 2]),
i32::from(cb_row[cx_base + 3]),
],
) - bias;
let cr4 = i32x4::from_array(
token,
[
i32::from(cr_row[cx_base]),
i32::from(cr_row[cx_base + 1]),
i32::from(cr_row[cx_base + 2]),
i32::from(cr_row[cx_base + 3]),
],
) - bias;
// Chroma contributions are shared by the left and right pixel of each
// chroma pair (that is the "box" part).
let cr_r = cr4 * cr_to_r;
let cr_g = cr4 * cr_to_g;
let cb_g = cb4 * cb_to_g;
let cb_b = cb4 * cb_to_b;
let chroma_g = cr_g + cb_g;
let px_base = cx_base * 2;
// Even-indexed luma samples: the left pixel of each chroma pair.
let y_left = i32x4::from_array(
token,
[
i32::from(y_row[px_base]),
i32::from(y_row[px_base + 2]),
i32::from(y_row[px_base + 4]),
i32::from(y_row[px_base + 6]),
],
);
let ys_left = y_left * y_coeff + rounding;
let rl = (ys_left + cr_r)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let gl = (ys_left + chroma_g)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let bl = (ys_left + cb_b)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
// Odd-indexed luma samples: the right pixel of each chroma pair.
let y_right = i32x4::from_array(
token,
[
i32::from(y_row[px_base + 1]),
i32::from(y_row[px_base + 3]),
i32::from(y_row[px_base + 5]),
i32::from(y_row[px_base + 7]),
],
);
let ys_right = y_right * y_coeff + rounding;
let rr = (ys_right + cr_r)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let gr = (ys_right + chroma_g)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let br = (ys_right + cb_b)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let rla = rl.to_array();
let gla = gl.to_array();
let bla = bl.to_array();
let rra = rr.to_array();
let gra = gr.to_array();
let bra = br.to_array();
// Interleave left/right pixels into the RGB24 output: 6 bytes per
// chroma sample (two pixels).
for i in 0..4 {
let idx = (px_base + i * 2) * 3;
rgb[idx] = rla[i] as u8;
rgb[idx + 1] = gla[i] as u8;
rgb[idx + 2] = bla[i] as u8;
rgb[idx + 3] = rra[i] as u8;
rgb[idx + 4] = gra[i] as u8;
rgb[idx + 5] = bra[i] as u8;
}
}
// Scalar tail: remaining chroma samples; the `px < width` guards handle
// odd widths where the last chroma sample covers only one pixel.
for cx in (chunks * 4)..chroma_width {
let cb_val = i32::from(cb_row[cx]) - 128;
let cr_val = i32::from(cr_row[cx]) - 128;
let px0 = cx * 2;
if px0 < width {
let y_val = i32::from(y_row[px0]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
let idx = px0 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
let px1 = cx * 2 + 1;
if px1 < width {
let y_val = i32::from(y_row[px1]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
let idx = px1 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
}
}
/// Public entry point for the fused H2V2 box-upsample YCbCr→RGB row
/// conversion. Picks the hand-tuned AVX2 kernel when the CPU supports it,
/// otherwise dispatches to the best generic SIMD instantiation.
pub fn fused_h2v2_box_ycbcr_to_rgb_u8(
    y_row: &[i16],
    cb_row: &[i16],
    cr_row: &[i16],
    rgb: &mut [u8],
    width: usize,
) {
    // Callers must size the planes for `width` pixels (chroma is half-width,
    // rounded up) and the output for 3 bytes per pixel.
    debug_assert!(y_row.len() >= width);
    debug_assert!(cb_row.len() >= (width + 1) / 2);
    debug_assert!(cr_row.len() >= (width + 1) / 2);
    debug_assert!(rgb.len() >= width * 3);
    // Fast path: runtime-detected AVX2 (x86-64 "v3") kernel.
    #[cfg(target_arch = "x86_64")]
    if let Some(token) = archmage::X64V3Token::summon() {
        return fused_h2v2_box_ycbcr_to_rgb_u8_avx2(token, y_row, cb_row, cr_row, rgb, width);
    }
    // Portable path: let `incant!` select the generic backend.
    incant!(fused_h2v2_box_ycbcr_to_rgb_u8_generic(
        y_row, cb_row, cr_row, rgb, width
    ));
}
/// Fused H2V2 "fancy" (triangular) horizontal chroma upsample + YCbCr→RGB
/// for one output row, generic SIMD version.
///
/// Each output pixel's chroma is a 3:1 weighted average of its own chroma
/// sample and the nearest neighbor — `(3*curr + neighbor + 2) >> 2`, the
/// `+ 2` being the rounding term — with the boundary sample replicated at
/// the row edges. The color conversion itself is the same Q14 fixed-point
/// math used everywhere else in this file.
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn fused_h2v2_hfancy_ycbcr_to_rgb_u8_generic(
token: Token,
y_row: &[i16],
cb_row: &[i16],
cr_row: &[i16],
rgb: &mut [u8],
width: usize,
) {
#[allow(non_camel_case_types)]
type i32x4 = GenericI32x4<Token>;
let y_coeff = i32x4::splat(token, Y_CF_INT);
let rounding = i32x4::splat(token, YUV_ROUND);
let zero = i32x4::zero(token);
let max255 = i32x4::splat(token, 255);
let cr_to_r = i32x4::splat(token, CR_TO_R_INT);
let cr_to_g = i32x4::splat(token, CR_TO_G_INT);
let cb_to_g = i32x4::splat(token, CB_TO_G_INT);
let cb_to_b = i32x4::splat(token, CB_TO_B_INT);
let chroma_width = (width + 1) / 2;
// Staging buffers: entries 0..4 hold the left-interpolated chroma (even
// output pixels), entries 4..8 the right-interpolated chroma (odd
// pixels); `y_buf` is laid out the same way.
let mut cb_buf = [0i32; 8]; let mut cr_buf = [0i32; 8];
let mut y_buf = [0i32; 8];
// A vector iteration touches pixels up to px 8*chunk+7, so restrict the
// SIMD loop to chroma indices fully inside `width`; the rest is scalar.
let safe_chroma = if width >= 8 { (width - 7) / 2 } else { 0 };
let chunks = safe_chroma / 4;
for chunk in 0..chunks {
let cx_base = chunk * 4;
for j in 0..4 {
let cx = cx_base + j;
let curr_cb = i32::from(cb_row[cx]);
let curr_cr = i32::from(cr_row[cx]);
// Edge handling: replicate the boundary sample when there is no
// left/right neighbor.
let left_cb = if cx > 0 {
i32::from(cb_row[cx - 1])
} else {
curr_cb
};
let left_cr = if cx > 0 {
i32::from(cr_row[cx - 1])
} else {
curr_cr
};
let right_cb = if cx + 1 < chroma_width {
i32::from(cb_row[cx + 1])
} else {
curr_cb
};
let right_cr = if cx + 1 < chroma_width {
i32::from(cr_row[cx + 1])
} else {
curr_cr
};
// 3:1 weighting toward the owning sample, rounded, then the -128
// chroma bias removed.
cb_buf[j] = ((3 * curr_cb + left_cb + 2) >> 2) - 128; cr_buf[j] = ((3 * curr_cr + left_cr + 2) >> 2) - 128;
cb_buf[4 + j] = ((3 * curr_cb + right_cb + 2) >> 2) - 128; cr_buf[4 + j] = ((3 * curr_cr + right_cr + 2) >> 2) - 128;
let px_base = cx * 2;
y_buf[j] = i32::from(y_row[px_base]); y_buf[4 + j] = i32::from(y_row[px_base + 1]); }
// half 0 converts the even output pixels, half 1 the odd ones.
for half in 0..2u32 {
let off = (half * 4) as usize;
let cb4 = i32x4::from_array(
token,
[
cb_buf[off],
cb_buf[off + 1],
cb_buf[off + 2],
cb_buf[off + 3],
],
);
let cr4 = i32x4::from_array(
token,
[
cr_buf[off],
cr_buf[off + 1],
cr_buf[off + 2],
cr_buf[off + 3],
],
);
let y4 = i32x4::from_array(
token,
[y_buf[off], y_buf[off + 1], y_buf[off + 2], y_buf[off + 3]],
);
// Q14 conversion with clamping to 0..=255.
let ys = y4 * y_coeff + rounding;
let r = (ys + cr4 * cr_to_r)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let g = (ys + cr4 * cr_to_g + cb4 * cb_to_g)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let b = (ys + cb4 * cb_to_b)
.shr_arithmetic::<14>()
.max(zero)
.min(max255);
let ra = r.to_array();
let ga = g.to_array();
let ba = b.to_array();
for j in 0..4 {
let cx = cx_base + j;
let px = cx * 2 + half as usize;
if px < width {
let idx = px * 3;
rgb[idx] = ra[j] as u8;
rgb[idx + 1] = ga[j] as u8;
rgb[idx + 2] = ba[j] as u8;
}
}
}
}
// Scalar tail: same interpolation and conversion for remaining chroma
// samples; `px < width` guards cover odd widths.
for cx in (chunks * 4)..chroma_width {
let curr_cb = i32::from(cb_row[cx]);
let curr_cr = i32::from(cr_row[cx]);
let left_cb = if cx > 0 {
i32::from(cb_row[cx - 1])
} else {
curr_cb
};
let left_cr = if cx > 0 {
i32::from(cr_row[cx - 1])
} else {
curr_cr
};
let right_cb = if cx + 1 < chroma_width {
i32::from(cb_row[cx + 1])
} else {
curr_cb
};
let right_cr = if cx + 1 < chroma_width {
i32::from(cr_row[cx + 1])
} else {
curr_cr
};
let cb_l = ((3 * curr_cb + left_cb + 2) >> 2) - 128;
let cr_l = ((3 * curr_cr + left_cr + 2) >> 2) - 128;
let px0 = cx * 2;
if px0 < width {
let y_val = i32::from(y_row[px0]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_l * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_l * CR_TO_G_INT + cb_l * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_l * CB_TO_B_INT) >> 14;
let idx = px0 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
let cb_r = ((3 * curr_cb + right_cb + 2) >> 2) - 128;
let cr_r = ((3 * curr_cr + right_cr + 2) >> 2) - 128;
let px1 = cx * 2 + 1;
if px1 < width {
let y_val = i32::from(y_row[px1]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_r * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_r * CR_TO_G_INT + cb_r * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_r * CB_TO_B_INT) >> 14;
let idx = px1 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
}
}
/// Public entry point for the fused H2V2 fancy-upsample YCbCr→RGB row
/// conversion. Uses the AVX2 kernel when available at runtime, otherwise
/// the best generic SIMD instantiation.
pub fn fused_h2v2_hfancy_ycbcr_to_rgb_u8(
    y_row: &[i16],
    cb_row: &[i16],
    cr_row: &[i16],
    rgb: &mut [u8],
    width: usize,
) {
    // Callers must size the planes for `width` pixels (chroma is half-width,
    // rounded up) and the output for 3 bytes per pixel.
    debug_assert!(y_row.len() >= width);
    debug_assert!(cb_row.len() >= (width + 1) / 2);
    debug_assert!(cr_row.len() >= (width + 1) / 2);
    debug_assert!(rgb.len() >= width * 3);
    // Fast path: runtime-detected AVX2 (x86-64 "v3") kernel.
    #[cfg(target_arch = "x86_64")]
    if let Some(token) = archmage::X64V3Token::summon() {
        return fused_h2v2_hfancy_ycbcr_to_rgb_u8_avx2(token, y_row, cb_row, cr_row, rgb, width);
    }
    // Portable path: let `incant!` select the generic backend.
    incant!(fused_h2v2_hfancy_ycbcr_to_rgb_u8_generic(
        y_row, cb_row, cr_row, rgb, width
    ));
}
/// AVX2 kernel for the fancy (3:1 triangular) H2V2 horizontal chroma
/// upsample fused with YCbCr→RGB: 16 output pixels (8 chroma samples) per
/// iteration, scalar tail for the rest.
///
/// `prev_cb`/`prev_cr` carry the left neighbor across chunk boundaries so
/// the interpolation `(3*curr + neighbor + 2) >> 2` is seamless between
/// chunks; row edges replicate the boundary sample. Q14 fixed-point math,
/// identical to the scalar tail and the generic version.
///
/// Fix: returns early for `width == 0` — the unconditional `cb_row[0]`/
/// `cr_row[0]` seed below would panic on empty chroma rows, whereas the
/// generic fallback handles the empty case without panicking.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn fused_h2v2_hfancy_ycbcr_to_rgb_u8_avx2(
_token: archmage::X64V3Token,
y_row: &[i16],
cb_row: &[i16],
cr_row: &[i16],
rgb: &mut [u8],
width: usize,
) {
use core::arch::x86_64::*;
// Nothing to do for an empty row; also avoids indexing empty chroma rows.
if width == 0 {
return;
}
let chroma_width = (width + 1) / 2;
let chunks = width / 16;
let bias = _mm256_set1_epi16(128);
let y_coeff = _mm256_set1_epi32(Y_CF_INT);
let rounding = _mm256_set1_epi32(YUV_ROUND);
let cr_to_r = _mm256_set1_epi32(CR_TO_R_INT);
let cr_to_g = _mm256_set1_epi32(CR_TO_G_INT);
let cb_to_g = _mm256_set1_epi32(CB_TO_G_INT);
let cb_to_b = _mm256_set1_epi32(CB_TO_B_INT);
let zero = _mm256_setzero_si256();
// Interpolation constants: 3:1 weighting plus a +2 rounding term.
let three = _mm_set1_epi16(3);
let round2 = _mm_set1_epi16(2);
// Byte-shuffle patterns and blend masks that interleave the three planar
// byte vectors into R,G,B,R,G,B,... output bytes.
let sh_r = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
9, 4, 15, 10, 5,
);
let sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
14, 9, 4, 15, 10,
);
let sh_b = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
3, 14, 9, 4, 15,
);
let m0 = _mm256_setr_epi8(
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
0, 0, -1, 0, 0,
);
let m1 = _mm256_setr_epi8(
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0,
);
// Seed the left neighbor with sample 0 so the first pixel replicates the
// row edge.
let mut prev_cb = cb_row[0]; let mut prev_cr = cr_row[0];
for chunk in 0..chunks {
let c_offset = chunk * 8;
let y_offset = chunk * 16;
let out_offset = chunk * 48;
let y_vec = safe_simd::_mm256_loadu_si256(
<&[i16; 16]>::try_from(&y_row[y_offset..y_offset + 16]).unwrap(),
);
let cb_curr = safe_simd::_mm_loadu_si128(
<&[i16; 8]>::try_from(&cb_row[c_offset..c_offset + 8]).unwrap(),
);
let cr_curr = safe_simd::_mm_loadu_si128(
<&[i16; 8]>::try_from(&cr_row[c_offset..c_offset + 8]).unwrap(),
);
// Left-neighbor vector: shift lanes up by one i16 and insert the value
// carried over from the previous chunk into lane 0.
let cb_left = _mm_insert_epi16::<0>(_mm_slli_si128::<2>(cb_curr), prev_cb as i32);
let cr_left = _mm_insert_epi16::<0>(_mm_slli_si128::<2>(cr_curr), prev_cr as i32);
// Right-neighbor vector: the sample after this chunk, or the last
// in-chunk sample replicated at the row edge.
let next_cb = if c_offset + 8 < chroma_width {
cb_row[c_offset + 8]
} else {
cb_row[c_offset + 7] };
let next_cr = if c_offset + 8 < chroma_width {
cr_row[c_offset + 8]
} else {
cr_row[c_offset + 7]
};
let cb_right = _mm_insert_epi16::<7>(_mm_srli_si128::<2>(cb_curr), next_cb as i32);
let cr_right = _mm_insert_epi16::<7>(_mm_srli_si128::<2>(cr_curr), next_cr as i32);
prev_cb = cb_row[c_offset + 7];
prev_cr = cr_row[c_offset + 7];
// Fancy interpolation in i16: (3*curr + neighbor + 2) >> 2.
let three_cb = _mm_mullo_epi16(cb_curr, three);
let three_cr = _mm_mullo_epi16(cr_curr, three);
let cb_interp_l =
_mm_srai_epi16::<2>(_mm_add_epi16(_mm_add_epi16(three_cb, cb_left), round2));
let cr_interp_l =
_mm_srai_epi16::<2>(_mm_add_epi16(_mm_add_epi16(three_cr, cr_left), round2));
let cb_interp_r =
_mm_srai_epi16::<2>(_mm_add_epi16(_mm_add_epi16(three_cb, cb_right), round2));
let cr_interp_r =
_mm_srai_epi16::<2>(_mm_add_epi16(_mm_add_epi16(three_cr, cr_right), round2));
// Interleave left/right interpolants so lane order matches pixel order.
let cb_lo = _mm_unpacklo_epi16(cb_interp_l, cb_interp_r);
let cb_hi = _mm_unpackhi_epi16(cb_interp_l, cb_interp_r);
let cb_vec = _mm256_set_m128i(cb_hi, cb_lo);
let cr_lo = _mm_unpacklo_epi16(cr_interp_l, cr_interp_r);
let cr_hi = _mm_unpackhi_epi16(cr_interp_l, cr_interp_r);
let cr_vec = _mm256_set_m128i(cr_hi, cr_lo);
// Remove the +128 chroma bias, widen to i32 (sign via srai/unpack),
// then apply the Q14 conversion.
let cb_centered = _mm256_sub_epi16(cb_vec, bias);
let cr_centered = _mm256_sub_epi16(cr_vec, bias);
let y_lo = _mm256_unpacklo_epi16(y_vec, zero);
let y_hi = _mm256_unpackhi_epi16(y_vec, zero);
let y_scaled_lo = _mm256_add_epi32(_mm256_mullo_epi32(y_lo, y_coeff), rounding);
let y_scaled_hi = _mm256_add_epi32(_mm256_mullo_epi32(y_hi, y_coeff), rounding);
let cb_sign = _mm256_srai_epi16(cb_centered, 15);
let cr_sign = _mm256_srai_epi16(cr_centered, 15);
let cb_lo32 = _mm256_unpacklo_epi16(cb_centered, cb_sign);
let cb_hi32 = _mm256_unpackhi_epi16(cb_centered, cb_sign);
let cr_lo32 = _mm256_unpacklo_epi16(cr_centered, cr_sign);
let cr_hi32 = _mm256_unpackhi_epi16(cr_centered, cr_sign);
let r_lo = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cr_lo32, cr_to_r)),
14,
);
let r_hi = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cr_hi32, cr_to_r)),
14,
);
let g_lo = _mm256_srai_epi32(
_mm256_add_epi32(
y_scaled_lo,
_mm256_add_epi32(
_mm256_mullo_epi32(cr_lo32, cr_to_g),
_mm256_mullo_epi32(cb_lo32, cb_to_g),
),
),
14,
);
let g_hi = _mm256_srai_epi32(
_mm256_add_epi32(
y_scaled_hi,
_mm256_add_epi32(
_mm256_mullo_epi32(cr_hi32, cr_to_g),
_mm256_mullo_epi32(cb_hi32, cb_to_g),
),
),
14,
);
let b_lo = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cb_lo32, cb_to_b)),
14,
);
let b_hi = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cb_hi32, cb_to_b)),
14,
);
// Saturating narrow to i16 and then to u8 (packus clamps to 0..=255).
let r_16 = _mm256_packs_epi32(r_lo, r_hi);
let g_16 = _mm256_packs_epi32(g_lo, g_hi);
let b_16 = _mm256_packs_epi32(b_lo, b_hi);
let r_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_16, zero), 0b11_01_10_00);
let g_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(g_16, zero), 0b11_01_10_00);
let b_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(b_16, zero), 0b11_01_10_00);
// Shuffle + blend the three planes into interleaved RGB triplets.
let r0 = _mm256_shuffle_epi8(r_8, sh_r);
let g0 = _mm256_shuffle_epi8(g_8, sh_g);
let b0 = _mm256_shuffle_epi8(b_8, sh_b);
let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
let rgb0 = _mm256_permute2x128_si256(p0, p1, 0x20);
let rgb1 = _mm256_permute2x128_si256(p2, p0, 0x30);
safe_simd::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut rgb[out_offset..out_offset + 32]).unwrap(),
rgb0,
);
safe_simd::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut rgb[out_offset + 32..out_offset + 48]).unwrap(),
_mm256_castsi256_si128(rgb1),
);
}
// Scalar tail: same interpolation and Q14 conversion for the remaining
// chroma samples; `px < width` guards cover odd widths.
let c_remainder_start = chunks * 8;
for cx in c_remainder_start..chroma_width {
let curr_cb = i32::from(cb_row[cx]);
let curr_cr = i32::from(cr_row[cx]);
let left_cb = if cx > 0 {
i32::from(cb_row[cx - 1])
} else {
curr_cb
};
let left_cr = if cx > 0 {
i32::from(cr_row[cx - 1])
} else {
curr_cr
};
let cb_l = ((3 * curr_cb + left_cb + 2) >> 2) - 128;
let cr_l = ((3 * curr_cr + left_cr + 2) >> 2) - 128;
let px0 = cx * 2;
if px0 < width {
let y_val = i32::from(y_row[px0]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_l * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_l * CR_TO_G_INT + cb_l * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_l * CB_TO_B_INT) >> 14;
let idx = px0 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
let right_cb = if cx + 1 < chroma_width {
i32::from(cb_row[cx + 1])
} else {
curr_cb
};
let right_cr = if cx + 1 < chroma_width {
i32::from(cr_row[cx + 1])
} else {
curr_cr
};
let cb_r = ((3 * curr_cb + right_cb + 2) >> 2) - 128;
let cr_r = ((3 * curr_cr + right_cr + 2) >> 2) - 128;
let px1 = cx * 2 + 1;
if px1 < width {
let y_val = i32::from(y_row[px1]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_r * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_r * CR_TO_G_INT + cb_r * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_r * CB_TO_B_INT) >> 14;
let idx = px1 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
}
}
/// AVX2 kernel for the box (replicate) H2V2 horizontal chroma upsample
/// fused with YCbCr→RGB: 16 output pixels (8 chroma samples) per
/// iteration, scalar tail for the rest. Q14 fixed-point math, identical
/// to the scalar tail and the generic version.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn fused_h2v2_box_ycbcr_to_rgb_u8_avx2(
_token: archmage::X64V3Token,
y_row: &[i16],
cb_row: &[i16],
cr_row: &[i16],
rgb: &mut [u8],
width: usize,
) {
use core::arch::x86_64::*;
let chunks = width / 16;
let bias = _mm256_set1_epi16(128);
let y_coeff = _mm256_set1_epi32(Y_CF_INT);
let rounding = _mm256_set1_epi32(YUV_ROUND);
let cr_to_r = _mm256_set1_epi32(CR_TO_R_INT);
let cr_to_g = _mm256_set1_epi32(CR_TO_G_INT);
let cb_to_g = _mm256_set1_epi32(CB_TO_G_INT);
let cb_to_b = _mm256_set1_epi32(CB_TO_B_INT);
let zero = _mm256_setzero_si256();
// Byte-shuffle patterns and blend masks that interleave the three planar
// byte vectors into R,G,B,R,G,B,... output bytes.
let sh_r = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
9, 4, 15, 10, 5,
);
let sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
14, 9, 4, 15, 10,
);
let sh_b = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
3, 14, 9, 4, 15,
);
let m0 = _mm256_setr_epi8(
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
0, 0, -1, 0, 0,
);
let m1 = _mm256_setr_epi8(
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0,
);
for chunk in 0..chunks {
// 16 luma + 8 chroma samples in, 48 interleaved RGB bytes out.
let y_offset = chunk * 16;
let c_offset = chunk * 8;
let out_offset = chunk * 48;
let y_vec = safe_simd::_mm256_loadu_si256(
<&[i16; 16]>::try_from(&y_row[y_offset..y_offset + 16]).unwrap(),
);
let cb_half = safe_simd::_mm_loadu_si128(
<&[i16; 8]>::try_from(&cb_row[c_offset..c_offset + 8]).unwrap(),
);
let cr_half = safe_simd::_mm_loadu_si128(
<&[i16; 8]>::try_from(&cr_row[c_offset..c_offset + 8]).unwrap(),
);
// Box upsample: duplicate each chroma sample to its two pixels by
// unpacking the vector with itself.
let cb_lo = _mm_unpacklo_epi16(cb_half, cb_half); let cb_hi = _mm_unpackhi_epi16(cb_half, cb_half); let cb_vec = _mm256_set_m128i(cb_hi, cb_lo);
let cr_lo = _mm_unpacklo_epi16(cr_half, cr_half);
let cr_hi = _mm_unpackhi_epi16(cr_half, cr_half);
let cr_vec = _mm256_set_m128i(cr_hi, cr_lo);
// Remove the +128 chroma bias, widen to i32 (sign via srai/unpack),
// then apply the Q14 conversion.
let cb_centered = _mm256_sub_epi16(cb_vec, bias);
let cr_centered = _mm256_sub_epi16(cr_vec, bias);
let y_lo = _mm256_unpacklo_epi16(y_vec, zero);
let y_hi = _mm256_unpackhi_epi16(y_vec, zero);
let y_scaled_lo = _mm256_add_epi32(_mm256_mullo_epi32(y_lo, y_coeff), rounding);
let y_scaled_hi = _mm256_add_epi32(_mm256_mullo_epi32(y_hi, y_coeff), rounding);
let cb_sign = _mm256_srai_epi16(cb_centered, 15);
let cr_sign = _mm256_srai_epi16(cr_centered, 15);
let cb_lo32 = _mm256_unpacklo_epi16(cb_centered, cb_sign);
let cb_hi32 = _mm256_unpackhi_epi16(cb_centered, cb_sign);
let cr_lo32 = _mm256_unpacklo_epi16(cr_centered, cr_sign);
let cr_hi32 = _mm256_unpackhi_epi16(cr_centered, cr_sign);
let r_lo = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cr_lo32, cr_to_r)),
14,
);
let r_hi = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cr_hi32, cr_to_r)),
14,
);
let g_lo = _mm256_srai_epi32(
_mm256_add_epi32(
y_scaled_lo,
_mm256_add_epi32(
_mm256_mullo_epi32(cr_lo32, cr_to_g),
_mm256_mullo_epi32(cb_lo32, cb_to_g),
),
),
14,
);
let g_hi = _mm256_srai_epi32(
_mm256_add_epi32(
y_scaled_hi,
_mm256_add_epi32(
_mm256_mullo_epi32(cr_hi32, cr_to_g),
_mm256_mullo_epi32(cb_hi32, cb_to_g),
),
),
14,
);
let b_lo = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_lo, _mm256_mullo_epi32(cb_lo32, cb_to_b)),
14,
);
let b_hi = _mm256_srai_epi32(
_mm256_add_epi32(y_scaled_hi, _mm256_mullo_epi32(cb_hi32, cb_to_b)),
14,
);
// Saturating narrow to i16 and then to u8 (packus clamps to 0..=255).
let r_16 = _mm256_packs_epi32(r_lo, r_hi);
let g_16 = _mm256_packs_epi32(g_lo, g_hi);
let b_16 = _mm256_packs_epi32(b_lo, b_hi);
let r_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_16, zero), 0b11_01_10_00);
let g_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(g_16, zero), 0b11_01_10_00);
let b_8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(b_16, zero), 0b11_01_10_00);
// Shuffle + blend the three planes into interleaved RGB triplets.
let r0 = _mm256_shuffle_epi8(r_8, sh_r);
let g0 = _mm256_shuffle_epi8(g_8, sh_g);
let b0 = _mm256_shuffle_epi8(b_8, sh_b);
let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
let rgb0 = _mm256_permute2x128_si256(p0, p1, 0x20);
let rgb1 = _mm256_permute2x128_si256(p2, p0, 0x30);
safe_simd::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut rgb[out_offset..out_offset + 32]).unwrap(),
rgb0,
);
safe_simd::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut rgb[out_offset + 32..out_offset + 48]).unwrap(),
_mm256_castsi256_si128(rgb1),
);
}
// Scalar tail: remaining chroma samples; `px < width` guards cover odd
// widths where the last chroma sample covers only one pixel.
let c_remainder_start = chunks * 8;
let chroma_width = (width + 1) / 2;
for cx in c_remainder_start..chroma_width {
let cb_val = i32::from(cb_row[cx]) - 128;
let cr_val = i32::from(cr_row[cx]) - 128;
let px0 = cx * 2;
if px0 < width {
let y_val = i32::from(y_row[px0]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
let idx = px0 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
let px1 = cx * 2 + 1;
if px1 < width {
let y_val = i32::from(y_row[px1]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_val * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_val * CR_TO_G_INT + cb_val * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_val * CB_TO_B_INT) >> 14;
let idx = px1 * 3;
rgb[idx] = r.clamp(0, 255) as u8;
rgb[idx + 1] = g.clamp(0, 255) as u8;
rgb[idx + 2] = b.clamp(0, 255) as u8;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rgb_ycbcr_roundtrip() {
    // Forward then inverse conversion must agree within ±1 per channel.
    let test_colors = [
        (0u8, 0u8, 0u8),
        (255u8, 255u8, 255u8),
        (255u8, 0u8, 0u8),
        (0u8, 255u8, 0u8),
        (0u8, 0u8, 255u8),
        (128u8, 128u8, 128u8),
    ];
    for (r, g, b) in test_colors {
        let (y, cb, cr) = rgb_to_ycbcr(r, g, b);
        let (r2, g2, b2) = ycbcr_to_rgb(y, cb, cr);
        let diff = |a: u8, c: u8| (i16::from(a) - i16::from(c)).abs();
        assert!(diff(r, r2) <= 1, "R mismatch for ({},{},{})", r, g, b);
        assert!(diff(g, g2) <= 1, "G mismatch for ({},{},{})", r, g, b);
        assert!(diff(b, b2) <= 1, "B mismatch for ({},{},{})", r, g, b);
    }
}
#[test]
fn test_gray_ycbcr() {
    // Neutral grays map to Y == gray with both chroma channels at the
    // 128 midpoint (±1 for rounding).
    for gray in [0u8, 64, 128, 192, 255] {
        let (y, cb, cr) = rgb_to_ycbcr(gray, gray, gray);
        assert_eq!(y, gray);
        for chroma in [cb, cr] {
            assert!((i16::from(chroma) - 128).abs() <= 1);
        }
    }
}
#[test]
fn test_cmyk_rgb_roundtrip() {
    // Zero ink is paper white; full ink on every plate is black.
    assert_eq!(cmyk_to_rgb(0, 0, 0, 0), (255, 255, 255));
    assert_eq!(cmyk_to_rgb(255, 255, 255, 255), (0, 0, 0));
}
#[test]
fn test_bgr_conversion() {
    // Swapping B and R reverses the first/last bytes; alpha stays put.
    let rgb = bgr_to_rgb(&[1, 2, 3]);
    assert_eq!(rgb, [3, 2, 1]);
    let rgba = bgra_to_rgba(&[1, 2, 3, 4]);
    assert_eq!(rgba, [3, 2, 1, 4]);
}
#[cfg(feature = "decoder")]
#[test]
fn test_rgb_u8_swap_rb_inplace() {
    // R and B of every pixel are exchanged in place; G is untouched.
    let mut pixels = vec![10, 20, 30, 40, 50, 60];
    rgb_u8_swap_rb_inplace(&mut pixels);
    assert_eq!(pixels, [30, 20, 10, 60, 50, 40]);
}
#[cfg(feature = "decoder")]
#[test]
fn test_rgb_u8_to_rgba_u8() {
    // Each RGB triple gains an opaque (255) alpha byte.
    let src = [10, 20, 30, 40, 50, 60];
    let mut out = [0u8; 8];
    rgb_u8_to_rgba_u8(&src, &mut out);
    assert_eq!(out, [10, 20, 30, 255, 40, 50, 60, 255]);
}
#[cfg(feature = "decoder")]
#[test]
fn test_rgb_u8_to_bgra_u8() {
    // RGB is reordered to BGR and an opaque alpha byte is appended.
    let src = [10, 20, 30, 40, 50, 60];
    let mut out = [0u8; 8];
    rgb_u8_to_bgra_u8(&src, &mut out);
    assert_eq!(out, [30, 20, 10, 255, 60, 50, 40, 255]);
}
#[cfg(feature = "decoder")]
#[test]
fn test_rgb_u8_to_bgrx_u8() {
    // A single RGB pixel becomes B, G, R plus a 255 padding byte.
    let src = [10, 20, 30];
    let mut out = [0u8; 4];
    rgb_u8_to_bgrx_u8(&src, &mut out);
    assert_eq!(out, [30, 20, 10, 255]);
}
#[test]
fn test_simd_rgb_to_ycbcr_matches_scalar() {
    // The 4-wide SIMD converter must agree exactly with the scalar path.
    let test_colors = [
        (0u8, 0u8, 0u8),
        (255u8, 255u8, 255u8),
        (255u8, 0u8, 0u8),
        (0u8, 255u8, 0u8),
        (0u8, 0u8, 255u8),
        (128u8, 128u8, 128u8),
        (100u8, 150u8, 200u8),
        (33u8, 66u8, 99u8),
    ];
    for quad in test_colors.chunks_exact(4) {
        let r: [u8; 4] = core::array::from_fn(|i| quad[i].0);
        let g: [u8; 4] = core::array::from_fn(|i| quad[i].1);
        let b: [u8; 4] = core::array::from_fn(|i| quad[i].2);
        let (y_simd, cb_simd, cr_simd) = simd::rgb_to_ycbcr_x4(r, g, b);
        for i in 0..4 {
            let (y_scalar, cb_scalar, cr_scalar) = rgb_to_ycbcr(r[i], g[i], b[i]);
            assert_eq!(y_simd[i], y_scalar, "Y mismatch at {}", i);
            assert_eq!(cb_simd[i], cb_scalar, "Cb mismatch at {}", i);
            assert_eq!(cr_simd[i], cr_scalar, "Cr mismatch at {}", i);
        }
    }
}
#[test]
fn test_simd_ycbcr_to_rgb_matches_scalar() {
    // The 4-wide SIMD inverse must agree exactly with the scalar path.
    // (The trailing pair of samples is dropped by chunks_exact, just as the
    // original skipped short chunks.)
    let test_ycbcr = [
        (0u8, 128u8, 128u8),
        (255u8, 128u8, 128u8),
        (76u8, 85u8, 255u8),
        (150u8, 44u8, 21u8),
        (29u8, 255u8, 107u8),
        (128u8, 128u8, 128u8),
    ];
    for quad in test_ycbcr.chunks_exact(4) {
        let y: [u8; 4] = core::array::from_fn(|i| quad[i].0);
        let cb: [u8; 4] = core::array::from_fn(|i| quad[i].1);
        let cr: [u8; 4] = core::array::from_fn(|i| quad[i].2);
        let (r_simd, g_simd, b_simd) = simd::ycbcr_to_rgb_x4(y, cb, cr);
        for i in 0..4 {
            let (r_scalar, g_scalar, b_scalar) = ycbcr_to_rgb(y[i], cb[i], cr[i]);
            assert_eq!(r_simd[i], r_scalar, "R mismatch at {}", i);
            assert_eq!(g_simd[i], g_scalar, "G mismatch at {}", i);
            assert_eq!(b_simd[i], b_scalar, "B mismatch at {}", i);
        }
    }
}
#[test]
fn test_rgb_to_ycbcr_f32() {
    // Pure red should land near Y≈76, Cb≈85, Cr≈255 on the 0..=255 scale.
    let (y, cb, cr) = rgb_to_ycbcr_f32(255.0, 0.0, 0.0);
    assert!((y - 76.0).abs() < 1.0);
    assert!((cb - 85.0).abs() < 1.0);
    assert!((cr - 255.0).abs() < 1.0);
    // Luma checks for pure green (~150) and pure blue (~29).
    let (y_green, _cb, _cr) = rgb_to_ycbcr_f32(0.0, 255.0, 0.0);
    assert!((y_green - 150.0).abs() < 1.0);
    let (y_blue, _cb, _cr) = rgb_to_ycbcr_f32(0.0, 0.0, 255.0);
    assert!((y_blue - 29.0).abs() < 1.0);
}
#[test]
fn test_ycbcr_to_rgb_f32() {
    // Mid-gray YCbCr (128, 128, 128) maps back to mid-gray RGB.
    let (r, g, b) = ycbcr_to_rgb_f32(128.0, 128.0, 128.0);
    for channel in [r, g, b] {
        assert!((channel - 128.0).abs() < 1.0);
    }
}
#[test]
fn test_convert_rgb_to_ycbcr_buffer() {
    // In-place conversion: the first pixel is pure red, whose luma is ~76.
    let mut buffer = [255, 0, 0, 0, 255, 0, 0, 0, 255];
    convert_rgb_to_ycbcr_buffer(&mut buffer);
    assert!((i16::from(buffer[0]) - 76).abs() <= 1);
}
#[test]
fn test_convert_ycbcr_to_rgb_buffer() {
    // Mid-gray YCbCr converts in place to mid-gray RGB for the first pixel.
    let mut buffer = [128, 128, 128, 128, 128, 128];
    convert_ycbcr_to_rgb_buffer(&mut buffer);
    for i in 0..3 {
        assert!((i16::from(buffer[i]) - 128).abs() <= 1);
    }
}
#[test]
fn test_rgb_to_ycbcr_planes() {
    // 2x2 image: red, green, blue, gray.
    let rgb = vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 128, 128, 128];
    let (y, cb, cr) = rgb_to_ycbcr_planes(&rgb, 2, 2).unwrap();
    // One sample per pixel in each plane.
    for plane in [&y, &cb, &cr] {
        assert_eq!(plane.len(), 4);
    }
    // The red pixel's luma is ~76.
    assert!((i16::from(y[0]) - 76).abs() <= 1);
}
#[test]
fn test_ycbcr_planes_to_rgb() {
    // All-mid-gray planes produce an all-mid-gray 2x2 RGB image.
    let y = vec![128u8; 4];
    let cb = vec![128u8; 4];
    let cr = vec![128u8; 4];
    let rgb = ycbcr_planes_to_rgb(&y, &cb, &cr, 2, 2).unwrap();
    assert_eq!(rgb.len(), 12);
    for px in rgb.chunks(3) {
        assert!((i16::from(px[0]) - 128).abs() <= 1);
    }
}
#[test]
fn test_ycbcr_planes_f32_to_rgb_u8() {
    // Zero-centered f32 planes (Y = 0, chroma = 0) decode to mid-gray 128
    // in every output byte.
    let y = vec![0.0f32; 4];
    let cb = vec![0.0f32; 4];
    let cr = vec![0.0f32; 4];
    let mut rgb = vec![0u8; 12];
    ycbcr_planes_f32_to_rgb_u8(&y, &cb, &cr, &mut rgb);
    assert!(rgb.iter().all(|&v| v == 128));
}
#[test]
fn test_ycbcr_planes_f32_to_rgb_f32() {
    // Y = 0.0 maps to normalized mid-gray ≈ 128/255 ≈ 0.502 in red.
    let y = vec![0.0f32; 4];
    let cb = vec![0.0f32; 4];
    let cr = vec![0.0f32; 4];
    let mut rgb = vec![0.0f32; 12];
    ycbcr_planes_f32_to_rgb_f32(&y, &cb, &cr, &mut rgb);
    for px in rgb.chunks(3) {
        assert!((px[0] - 0.502).abs() < 0.01);
    }
}
#[test]
fn test_gray_f32_to_rgb_u8() {
    // Centered f32 gray: 0.0 -> 128 (midpoint), 127.0 -> 255, -128.0 -> 0;
    // the gray value is replicated into R, G and B.
    let gray = vec![0.0f32, 127.0, -128.0];
    let mut rgb = vec![0u8; 9];
    gray_f32_to_rgb_u8(&gray, &mut rgb);
    assert_eq!(rgb[0..3], [128, 128, 128]);
    assert_eq!(rgb[3], 255);
    assert_eq!(rgb[6], 0);
}
#[test]
fn test_gray_f32_to_rgb_f32() {
    // Centered 0.0 maps to the normalized midpoint 128/255 ~ 0.502 in
    // every replicated RGB channel.
    let gray = vec![0.0f32; 2];
    let mut rgb = vec![0.0f32; 6];
    gray_f32_to_rgb_f32(&gray, &mut rgb);
    for &channel in &rgb {
        assert!((channel - 0.502).abs() < 0.01);
    }
}
#[test]
fn test_gray_f32_to_gray_u8() {
    // 0.0 -> 128 (midpoint), 127.0 -> 255 (max), -128.0 -> 0 (min).
    let gray = vec![0.0f32, 127.0, -128.0];
    let mut out = vec![0u8; 3];
    gray_f32_to_gray_u8(&gray, &mut out);
    assert_eq!(out, [128, 255, 0]);
}
#[test]
fn test_gray_f32_to_gray_f32() {
    // 0.0 -> ~0.502 (normalized midpoint), 127.0 -> ~1.0 (full scale).
    let gray = vec![0.0f32, 127.0];
    let mut out = vec![0.0f32; 2];
    gray_f32_to_gray_f32(&gray, &mut out);
    assert!((out[0] - 0.502).abs() < 0.01);
    assert!((out[1] - 1.0).abs() < 0.01);
}
#[test]
fn test_rgb_to_cmyk() {
    // White carries no ink at all.
    assert_eq!(rgb_to_cmyk(255, 255, 255), (0, 0, 0, 0));
    // Black is pure key (K).
    let (_c, _m, _y, k) = rgb_to_cmyk(0, 0, 0);
    assert_eq!(k, 255);
    // Pure red contains no cyan.
    let (c, _m, _y, _k) = rgb_to_cmyk(255, 0, 0);
    assert_eq!(c, 0);
}
#[test]
fn test_extract_channel() {
    // Two interleaved RGB pixels: (10, 20, 30) and (40, 50, 60).
    let data = vec![10u8, 20, 30, 40, 50, 60];
    let r_plane = extract_channel(&data, PixelFormat::Rgb, 0).unwrap();
    let g_plane = extract_channel(&data, PixelFormat::Rgb, 1).unwrap();
    let b_plane = extract_channel(&data, PixelFormat::Rgb, 2).unwrap();
    // Each plane picks out one channel per pixel, in pixel order.
    assert_eq!(r_plane, vec![10, 40]);
    assert_eq!(g_plane, vec![20, 50]);
    assert_eq!(b_plane, vec![30, 60]);
}
#[test]
fn test_extract_channel_rgba() {
    // Two RGBA pixels; channel index 3 selects alpha.
    let data = vec![10u8, 20, 30, 255, 40, 50, 60, 128];
    let alpha = extract_channel(&data, PixelFormat::Rgba, 3).unwrap();
    assert_eq!(alpha, vec![255, 128]);
}
#[test]
fn test_ycbcr_to_rgb_i16_scalar() {
    // Neutral gray plus the YCbCr images of pure red, green and blue.
    let test_cases = [
        (128i16, 128i16, 128i16),
        (76, 85, 255),
        (150, 44, 21),
        (29, 255, 107),
    ];
    for (y, cb, cr) in test_cases {
        // Reference values from the scalar f32 converter (returns u8 channels).
        let (r_ref, g_ref, b_ref) = ycbcr_to_rgb(y as u8, cb as u8, cr as u8);
        // Broadcast each sample across all 16 lanes of the i16 kernel.
        let y_lanes = [y; 16];
        let cb_lanes = [cb; 16];
        let cr_lanes = [cr; 16];
        let mut rgb = vec![0u8; 48];
        let mut offset = 0;
        ycbcr_to_rgb_i16_x16_scalar(&y_lanes, &cb_lanes, &cr_lanes, &mut rgb, &mut offset);
        // Fixed-point arithmetic may deviate from the f32 path by up to 2 LSBs.
        assert!(
            (rgb[0] as i16 - r_ref as i16).abs() <= 2,
            "R mismatch: {} vs {} for Y={}, Cb={}, Cr={}",
            rgb[0],
            r_ref,
            y,
            cb,
            cr
        );
        assert!(
            (rgb[1] as i16 - g_ref as i16).abs() <= 2,
            "G mismatch: {} vs {} for Y={}, Cb={}, Cr={}",
            rgb[1],
            g_ref,
            y,
            cb,
            cr
        );
        assert!(
            (rgb[2] as i16 - b_ref as i16).abs() <= 2,
            "B mismatch: {} vs {} for Y={}, Cb={}, Cr={}",
            rgb[2],
            b_ref,
            y,
            cb,
            cr
        );
    }
}
#[test]
fn test_ycbcr_planes_i16_to_rgb_u8() {
    // 32 samples with small, mutually prime periodic offsets around gray.
    let y_plane: Vec<i16> = (0..32).map(|i| 128 + (i % 5) as i16).collect();
    let cb_plane: Vec<i16> = (0..32).map(|i| 128 + (i % 3) as i16).collect();
    let cr_plane: Vec<i16> = (0..32).map(|i| 128 + (i % 7) as i16).collect();
    let mut rgb = vec![0u8; 96];
    ycbcr_planes_i16_to_rgb_u8(&y_plane, &cb_plane, &cr_plane, &mut rgb);
    // Each interleaved pixel must agree with the scalar f32 reference
    // to within 2 LSBs of fixed-point rounding.
    for (i, px) in rgb.chunks_exact(3).enumerate() {
        let (r_ref, g_ref, b_ref) =
            ycbcr_to_rgb(y_plane[i] as u8, cb_plane[i] as u8, cr_plane[i] as u8);
        assert!(
            (px[0] as i16 - r_ref as i16).abs() <= 2,
            "R mismatch at {}: {} vs {}",
            i,
            px[0],
            r_ref
        );
        assert!(
            (px[1] as i16 - g_ref as i16).abs() <= 2,
            "G mismatch at {}: {} vs {}",
            i,
            px[1],
            g_ref
        );
        assert!(
            (px[2] as i16 - b_ref as i16).abs() <= 2,
            "B mismatch at {}: {} vs {}",
            i,
            px[2],
            b_ref
        );
    }
}
#[test]
fn test_fused_hfancy_avx2_matches_scalar() {
// Cross-check the fused H2V2 "fancy"-upsampling converter against an inline
// scalar reference across widths that are SIMD-exact multiples, odd widths
// that leave a scalar tail, and widths straddling lane boundaries.
for width in [16, 32, 48, 64, 100, 128, 255, 256, 300, 512] {
// H2V2 subsampling: one chroma sample per pair of luma pixels (round up).
let chroma_width = (width + 1) / 2;
// Synthetic rows with co-prime strides so neighboring samples differ.
let y_row: Vec<i16> = (0..width).map(|i| 16 + (i as i16 * 7) % 220).collect();
let cb_row: Vec<i16> = (0..chroma_width)
.map(|i| 30 + (i as i16 * 13) % 200)
.collect();
let cr_row: Vec<i16> = (0..chroma_width)
.map(|i| 50 + (i as i16 * 11) % 180)
.collect();
let mut rgb_scalar = vec![0u8; width * 3];
{
// Scalar reference: weighted horizontal chroma upsampling (3/4 current +
// 1/4 neighbor, +2 for rounding, >>2) followed by fixed-point YCbCr->RGB
// using the same Q14 coefficients (the >>14 below) as the fused kernel.
for cx in 0..chroma_width {
let curr_cb = i32::from(cb_row[cx]);
let curr_cr = i32::from(cr_row[cx]);
// At the left edge the missing neighbor is replicated from the
// current sample; elsewhere it is the previous chroma column.
let left_cb = if cx > 0 {
i32::from(cb_row[cx - 1])
} else {
curr_cb
};
let left_cr = if cx > 0 {
i32::from(cr_row[cx - 1])
} else {
curr_cr
};
// Interpolated chroma for the even (left) pixel, recentered to
// signed range by subtracting 128.
let cb_l = ((3 * curr_cb + left_cb + 2) >> 2) - 128;
let cr_l = ((3 * curr_cr + left_cr + 2) >> 2) - 128;
let px0 = cx * 2;
if px0 < width {
// Fixed-point conversion; Y_CF_INT/CR_TO_R_INT/... are the file's
// integer coefficients, YUV_ROUND the rounding bias, >>14 the
// Q14 descale.
let y_val = i32::from(y_row[px0]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_l * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_l * CR_TO_G_INT + cb_l * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_l * CB_TO_B_INT) >> 14;
let idx = px0 * 3;
rgb_scalar[idx] = r.clamp(0, 255) as u8;
rgb_scalar[idx + 1] = g.clamp(0, 255) as u8;
rgb_scalar[idx + 2] = b.clamp(0, 255) as u8;
}
// Same scheme for the odd (right) pixel, using the next chroma
// column as neighbor and replicating at the right edge.
let right_cb = if cx + 1 < chroma_width {
i32::from(cb_row[cx + 1])
} else {
curr_cb
};
let right_cr = if cx + 1 < chroma_width {
i32::from(cr_row[cx + 1])
} else {
curr_cr
};
let cb_r = ((3 * curr_cb + right_cb + 2) >> 2) - 128;
let cr_r = ((3 * curr_cr + right_cr + 2) >> 2) - 128;
let px1 = cx * 2 + 1;
// For odd widths the final chroma pair has no right pixel; skip it.
if px1 < width {
let y_val = i32::from(y_row[px1]);
let y_scaled = y_val * Y_CF_INT + YUV_ROUND;
let r = (y_scaled + cr_r * CR_TO_R_INT) >> 14;
let g = (y_scaled + cr_r * CR_TO_G_INT + cb_r * CB_TO_G_INT) >> 14;
let b = (y_scaled + cb_r * CB_TO_B_INT) >> 14;
let idx = px1 * 3;
rgb_scalar[idx] = r.clamp(0, 255) as u8;
rgb_scalar[idx + 1] = g.clamp(0, 255) as u8;
rgb_scalar[idx + 2] = b.clamp(0, 255) as u8;
}
}
}
// Run the fused kernel under test and demand bit-exact agreement.
let mut rgb_fused = vec![0u8; width * 3];
fused_h2v2_hfancy_ycbcr_to_rgb_u8(&y_row, &cb_row, &cr_row, &mut rgb_fused, width);
assert_eq!(
rgb_scalar,
rgb_fused,
"Mismatch at width={width}: first diff at pixel {}",
rgb_scalar
.iter()
.zip(rgb_fused.iter())
.position(|(a, b)| a != b)
.unwrap_or(0)
/ 3
);
}
}
}