#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
// Colour-conversion coefficients in fixed point with 7 fractional bits (value * 128).
// Every use site multiplies a centred chroma sample by one of these, adds 64 for
// rounding and shifts right by 7.
// NOTE(review): the values look like the usual JFIF YCbCr->RGB factors
// (1.402, 0.344, 0.714, 1.772) rounded to Q7 — confirm against the decoder spec.

/// Cr contribution to red: 179 / 128 ≈ 1.398.
const CR_TO_R_7: i16 = 179;
/// Cb contribution to green: -44 / 128 ≈ -0.344.
const CB_TO_G_7: i16 = -44;
/// Cr contribution to green: -91 / 128 ≈ -0.711.
const CR_TO_G_7: i16 = -91;
/// Cb contribution to blue: 227 / 128 ≈ 1.773.
const CB_TO_B_7: i16 = 227;
/// Converts one row of planar YCbCr samples to interleaved 8-bit RGB.
///
/// Pixel `i` is read from `y_row[i]`, `cb_row[i]` and `cr_row[i]` and written to
/// `output[3 * i..3 * i + 3]` in R, G, B order. Only the first `width` pixels are
/// processed; anything beyond that in the slices is ignored.
///
/// # Panics
///
/// Panics if any input row holds fewer than `width` samples, or if `output` holds
/// fewer than `width * 3` bytes.
pub fn ycbcr_to_rgb_sse2(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // The SIMD kernel accesses memory through unchecked raw pointers, so this safe
    // entry point has to validate the buffers itself; otherwise a short slice would
    // be read or written out of bounds.
    assert!(
        y_row.len() >= width && cb_row.len() >= width && cr_row.len() >= width,
        "ycbcr_to_rgb_sse2: every input row must hold at least `width` samples"
    );
    // Dividing the length (instead of multiplying `width`) cannot overflow.
    assert!(
        output.len() / 3 >= width,
        "ycbcr_to_rgb_sse2: output must hold at least `width * 3` bytes"
    );
    // SAFETY: the checks above guarantee `width` readable samples per row and
    // `width * 3` writable output bytes, which covers every pointer access made by
    // the kernel. SSE2 is part of the x86_64 baseline (assumes this file is only
    // built for x86_64, as its `std::arch::x86_64` import suggests).
    unsafe { ycbcr_to_rgb_sse2_inner(y_row, cb_row, cr_row, output, width) }
}
/// SSE2 kernel behind `ycbcr_to_rgb_sse2`.
///
/// Converts the row eight pixels at a time and hands the remaining `width % 8`
/// pixels to `scalar_tail_rgb`.
///
/// # Safety
///
/// The vector loop uses unchecked raw-pointer loads and stores. The caller must
/// ensure that `y_row`, `cb_row` and `cr_row` each contain at least
/// `width / 8 * 8` elements and that `output` contains at least three times that
/// many bytes. The CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn ycbcr_to_rgb_sse2_inner(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // Number of leading pixels covered by whole 8-pixel vectors.
    let vector_len = (width / 8) * 8;

    let zeros = _mm_setzero_si128();
    let chroma_bias = _mm_set1_epi16(128);
    let half = _mm_set1_epi16(64);
    let k_cr_r = _mm_set1_epi16(CR_TO_R_7);
    let k_cb_g = _mm_set1_epi16(CB_TO_G_7);
    let k_cr_g = _mm_set1_epi16(CR_TO_G_7);
    let k_cb_b = _mm_set1_epi16(CB_TO_B_7);

    for px in (0..vector_len).step_by(8) {
        // Load eight bytes per plane and zero-extend them to 16-bit lanes.
        let luma = _mm_unpacklo_epi8(
            _mm_loadl_epi64(y_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cb_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cb_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cr_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cr_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );

        // Centre the chroma samples on zero.
        let cb = _mm_sub_epi16(cb_wide, chroma_bias);
        let cr = _mm_sub_epi16(cr_wide, chroma_bias);

        // Per-channel offsets: scale by the Q7 coefficient, add one half, drop 7 bits.
        let red_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cr, k_cr_r), half));
        let green_sum = _mm_add_epi16(_mm_mullo_epi16(cb, k_cb_g), _mm_mullo_epi16(cr, k_cr_g));
        let green_delta = _mm_srai_epi16::<7>(_mm_add_epi16(green_sum, half));
        let blue_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cb, k_cb_b), half));

        // Add luma and narrow back to bytes; the pack saturates to 0..=255.
        let red = _mm_packus_epi16(_mm_add_epi16(luma, red_delta), zeros);
        let green = _mm_packus_epi16(_mm_add_epi16(luma, green_delta), zeros);
        let blue = _mm_packus_epi16(_mm_add_epi16(luma, blue_delta), zeros);

        store_rgb_interleaved(red, green, blue, output.as_mut_ptr().add(px * 3));
    }

    scalar_tail_rgb(y_row, cb_row, cr_row, output, vector_len, width);
}
/// Converts one row of planar YCbCr samples to interleaved 8-bit RGBA.
///
/// Pixel `i` is read from `y_row[i]`, `cb_row[i]` and `cr_row[i]` and written to
/// `output[4 * i..4 * i + 4]` in R, G, B, A order with alpha fixed at 255. Only the
/// first `width` pixels are processed; anything beyond that in the slices is ignored.
///
/// # Panics
///
/// Panics if any input row holds fewer than `width` samples, or if `output` holds
/// fewer than `width * 4` bytes.
pub fn ycbcr_to_rgba_sse2(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // The SIMD kernel accesses memory through unchecked raw pointers, so this safe
    // entry point has to validate the buffers itself; otherwise a short slice would
    // be read or written out of bounds.
    assert!(
        y_row.len() >= width && cb_row.len() >= width && cr_row.len() >= width,
        "ycbcr_to_rgba_sse2: every input row must hold at least `width` samples"
    );
    // Dividing the length (instead of multiplying `width`) cannot overflow.
    assert!(
        output.len() / 4 >= width,
        "ycbcr_to_rgba_sse2: output must hold at least `width * 4` bytes"
    );
    // SAFETY: the checks above guarantee `width` readable samples per row and
    // `width * 4` writable output bytes, which covers every pointer access made by
    // the kernel. SSE2 is part of the x86_64 baseline (assumes this file is only
    // built for x86_64, as its `std::arch::x86_64` import suggests).
    unsafe { ycbcr_to_rgba_sse2_inner(y_row, cb_row, cr_row, output, width) }
}
/// SSE2 kernel behind `ycbcr_to_rgba_sse2`.
///
/// Converts the row eight pixels at a time, writing R, G, B and a constant 0xFF
/// alpha byte per pixel, and hands the remaining `width % 8` pixels to
/// `scalar_tail_rgba`.
///
/// # Safety
///
/// The vector loop uses unchecked raw-pointer loads and stores. The caller must
/// ensure that `y_row`, `cb_row` and `cr_row` each contain at least
/// `width / 8 * 8` elements and that `output` contains at least four times that
/// many bytes. The CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn ycbcr_to_rgba_sse2_inner(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // Number of leading pixels covered by whole 8-pixel vectors.
    let vector_len = (width / 8) * 8;

    let zeros = _mm_setzero_si128();
    let chroma_bias = _mm_set1_epi16(128);
    let half = _mm_set1_epi16(64);
    let k_cr_r = _mm_set1_epi16(CR_TO_R_7);
    let k_cb_g = _mm_set1_epi16(CB_TO_G_7);
    let k_cr_g = _mm_set1_epi16(CR_TO_G_7);
    let k_cb_b = _mm_set1_epi16(CB_TO_B_7);
    // -1 as i8 is the byte 0xFF, i.e. fully opaque.
    let opaque = _mm_set1_epi8(-1i8);

    for px in (0..vector_len).step_by(8) {
        // Load eight bytes per plane and zero-extend them to 16-bit lanes.
        let luma = _mm_unpacklo_epi8(
            _mm_loadl_epi64(y_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cb_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cb_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cr_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cr_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );

        // Centre the chroma samples on zero.
        let cb = _mm_sub_epi16(cb_wide, chroma_bias);
        let cr = _mm_sub_epi16(cr_wide, chroma_bias);

        // Per-channel offsets: scale by the Q7 coefficient, add one half, drop 7 bits.
        let red_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cr, k_cr_r), half));
        let green_sum = _mm_add_epi16(_mm_mullo_epi16(cb, k_cb_g), _mm_mullo_epi16(cr, k_cr_g));
        let green_delta = _mm_srai_epi16::<7>(_mm_add_epi16(green_sum, half));
        let blue_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cb, k_cb_b), half));

        // Add luma and narrow back to bytes; the pack saturates to 0..=255.
        let red = _mm_packus_epi16(_mm_add_epi16(luma, red_delta), zeros);
        let green = _mm_packus_epi16(_mm_add_epi16(luma, green_delta), zeros);
        let blue = _mm_packus_epi16(_mm_add_epi16(luma, blue_delta), zeros);

        store_rgba_interleaved(red, green, blue, opaque, output.as_mut_ptr().add(px * 4));
    }

    scalar_tail_rgba(y_row, cb_row, cr_row, output, vector_len, width);
}
/// Converts one row of planar YCbCr samples to interleaved 8-bit BGRA.
///
/// Pixel `i` is read from `y_row[i]`, `cb_row[i]` and `cr_row[i]` and written to
/// `output[4 * i..4 * i + 4]` in B, G, R, A order with alpha fixed at 255. Only the
/// first `width` pixels are processed; anything beyond that in the slices is ignored.
///
/// # Panics
///
/// Panics if any input row holds fewer than `width` samples, or if `output` holds
/// fewer than `width * 4` bytes.
pub fn ycbcr_to_bgra_sse2(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // The SIMD kernel accesses memory through unchecked raw pointers, so this safe
    // entry point has to validate the buffers itself; otherwise a short slice would
    // be read or written out of bounds.
    assert!(
        y_row.len() >= width && cb_row.len() >= width && cr_row.len() >= width,
        "ycbcr_to_bgra_sse2: every input row must hold at least `width` samples"
    );
    // Dividing the length (instead of multiplying `width`) cannot overflow.
    assert!(
        output.len() / 4 >= width,
        "ycbcr_to_bgra_sse2: output must hold at least `width * 4` bytes"
    );
    // SAFETY: the checks above guarantee `width` readable samples per row and
    // `width * 4` writable output bytes, which covers every pointer access made by
    // the kernel. SSE2 is part of the x86_64 baseline (assumes this file is only
    // built for x86_64, as its `std::arch::x86_64` import suggests).
    unsafe { ycbcr_to_bgra_sse2_inner(y_row, cb_row, cr_row, output, width) }
}
/// SSE2 kernel behind `ycbcr_to_bgra_sse2`.
///
/// Converts the row eight pixels at a time, writing B, G, R and a constant 0xFF
/// alpha byte per pixel, and hands the remaining `width % 8` pixels to
/// `scalar_tail_bgra`.
///
/// # Safety
///
/// The vector loop uses unchecked raw-pointer loads and stores. The caller must
/// ensure that `y_row`, `cb_row` and `cr_row` each contain at least
/// `width / 8 * 8` elements and that `output` contains at least four times that
/// many bytes. The CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn ycbcr_to_bgra_sse2_inner(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    width: usize,
) {
    // Number of leading pixels covered by whole 8-pixel vectors.
    let vector_len = (width / 8) * 8;

    let zeros = _mm_setzero_si128();
    let chroma_bias = _mm_set1_epi16(128);
    let half = _mm_set1_epi16(64);
    let k_cr_r = _mm_set1_epi16(CR_TO_R_7);
    let k_cb_g = _mm_set1_epi16(CB_TO_G_7);
    let k_cr_g = _mm_set1_epi16(CR_TO_G_7);
    let k_cb_b = _mm_set1_epi16(CB_TO_B_7);
    // -1 as i8 is the byte 0xFF, i.e. fully opaque.
    let opaque = _mm_set1_epi8(-1i8);

    for px in (0..vector_len).step_by(8) {
        // Load eight bytes per plane and zero-extend them to 16-bit lanes.
        let luma = _mm_unpacklo_epi8(
            _mm_loadl_epi64(y_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cb_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cb_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );
        let cr_wide = _mm_unpacklo_epi8(
            _mm_loadl_epi64(cr_row.as_ptr().add(px) as *const __m128i),
            zeros,
        );

        // Centre the chroma samples on zero.
        let cb = _mm_sub_epi16(cb_wide, chroma_bias);
        let cr = _mm_sub_epi16(cr_wide, chroma_bias);

        // Per-channel offsets: scale by the Q7 coefficient, add one half, drop 7 bits.
        let red_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cr, k_cr_r), half));
        let green_sum = _mm_add_epi16(_mm_mullo_epi16(cb, k_cb_g), _mm_mullo_epi16(cr, k_cr_g));
        let green_delta = _mm_srai_epi16::<7>(_mm_add_epi16(green_sum, half));
        let blue_delta = _mm_srai_epi16::<7>(_mm_add_epi16(_mm_mullo_epi16(cb, k_cb_b), half));

        // Add luma and narrow back to bytes; the pack saturates to 0..=255.
        let red = _mm_packus_epi16(_mm_add_epi16(luma, red_delta), zeros);
        let green = _mm_packus_epi16(_mm_add_epi16(luma, green_delta), zeros);
        let blue = _mm_packus_epi16(_mm_add_epi16(luma, blue_delta), zeros);

        // Blue goes first: the shared interleaver emits its arguments in order.
        store_rgba_interleaved(blue, green, red, opaque, output.as_mut_ptr().add(px * 4));
    }

    scalar_tail_bgra(y_row, cb_row, cr_row, output, vector_len, width);
}
/// Interleaves four 8-byte channel planes into eight 4-byte pixels and writes the
/// resulting 32 bytes to `dst`.
///
/// Only the low eight bytes of each vector are used. Pixel `i` is laid out as
/// `r8[i], g8[i], b8[i], a8[i]`, so callers choose the channel order by the order
/// in which they pass the vectors.
///
/// # Safety
///
/// `dst` must be valid for writes of 32 bytes; no alignment is required because
/// unaligned stores are used.
#[inline(always)]
unsafe fn store_rgba_interleaved(r8: __m128i, g8: __m128i, b8: __m128i, a8: __m128i, dst: *mut u8) {
    // Byte-interleave pairs of channels: [c0 c1] x 8 pixels in each vector.
    let first_pair = _mm_unpacklo_epi8(r8, g8);
    let second_pair = _mm_unpacklo_epi8(b8, a8);

    // 16-bit interleave of the pairs gives whole pixels: low half = pixels 0..4,
    // high half = pixels 4..8.
    _mm_storeu_si128(
        dst.cast::<__m128i>(),
        _mm_unpacklo_epi16(first_pair, second_pair),
    );
    _mm_storeu_si128(
        dst.add(16).cast::<__m128i>(),
        _mm_unpackhi_epi16(first_pair, second_pair),
    );
}
/// Writes eight 3-byte pixels (24 bytes) to `dst`, taking channel `c` of pixel `i`
/// from byte `i` of the corresponding vector.
///
/// Only the low eight bytes of each vector are used. The interleave is done with
/// scalar byte stores after spilling each vector to a small stack array.
///
/// # Safety
///
/// `dst` must be valid for writes of 24 bytes.
#[inline(always)]
unsafe fn store_rgb_interleaved(r8: __m128i, g8: __m128i, b8: __m128i, dst: *mut u8) {
    // Spill the low 64 bits of every channel so the bytes can be addressed one by one.
    let mut red = [0u8; 8];
    let mut green = [0u8; 8];
    let mut blue = [0u8; 8];
    _mm_storel_epi64(red.as_mut_ptr().cast(), r8);
    _mm_storel_epi64(green.as_mut_ptr().cast(), g8);
    _mm_storel_epi64(blue.as_mut_ptr().cast(), b8);

    for (px, ((&r, &g), &b)) in red.iter().zip(green.iter()).zip(blue.iter()).enumerate() {
        let pixel = dst.add(px * 3);
        pixel.write(r);
        pixel.add(1).write(g);
        pixel.add(2).write(b);
    }
}
/// Converts pixels `start..width` to RGB with plain integer arithmetic.
///
/// Uses the same Q7 coefficients, rounding (+64, >> 7) and 0..=255 clamping as the
/// vector loop, and writes pixel `j` to `output[3 * j..3 * j + 3]` as R, G, B.
/// Pixels before `start` are left untouched; nothing happens when `start >= width`.
///
/// # Panics
///
/// Panics on the first index that falls outside one of the slices.
#[inline(always)]
fn scalar_tail_rgb(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    start: usize,
    width: usize,
) {
    // Rounds a Q7 product to the nearest integer: add one half (64), drop 7 bits.
    let descale = |scaled: i32| (scaled + 64) >> 7;
    let saturate = |value: i32| value.clamp(0, 255) as u8;

    for px in start..width {
        let luma = i32::from(y_row[px]);
        let cb = i32::from(cb_row[px]) - 128;
        let cr = i32::from(cr_row[px]) - 128;

        let red = saturate(luma + descale(cr * i32::from(CR_TO_R_7)));
        let green =
            saturate(luma + descale(cb * i32::from(CB_TO_G_7) + cr * i32::from(CR_TO_G_7)));
        let blue = saturate(luma + descale(cb * i32::from(CB_TO_B_7)));

        let at = px * 3;
        output[at] = red;
        output[at + 1] = green;
        output[at + 2] = blue;
    }
}
/// Converts pixels `start..width` to RGBA with plain integer arithmetic.
///
/// Uses the same Q7 coefficients, rounding (+64, >> 7) and 0..=255 clamping as the
/// vector loop, and writes pixel `j` to `output[4 * j..4 * j + 4]` as R, G, B, 255.
/// Pixels before `start` are left untouched; nothing happens when `start >= width`.
///
/// # Panics
///
/// Panics on the first index that falls outside one of the slices.
#[inline(always)]
fn scalar_tail_rgba(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    start: usize,
    width: usize,
) {
    // Rounds a Q7 product to the nearest integer: add one half (64), drop 7 bits.
    let descale = |scaled: i32| (scaled + 64) >> 7;
    let saturate = |value: i32| value.clamp(0, 255) as u8;

    for px in start..width {
        let luma = i32::from(y_row[px]);
        let cb = i32::from(cb_row[px]) - 128;
        let cr = i32::from(cr_row[px]) - 128;

        let red = saturate(luma + descale(cr * i32::from(CR_TO_R_7)));
        let green =
            saturate(luma + descale(cb * i32::from(CB_TO_G_7) + cr * i32::from(CR_TO_G_7)));
        let blue = saturate(luma + descale(cb * i32::from(CB_TO_B_7)));

        let at = px * 4;
        output[at] = red;
        output[at + 1] = green;
        output[at + 2] = blue;
        output[at + 3] = 255;
    }
}
/// Converts pixels `start..width` to BGRA with plain integer arithmetic.
///
/// Uses the same Q7 coefficients, rounding (+64, >> 7) and 0..=255 clamping as the
/// vector loop, and writes pixel `j` to `output[4 * j..4 * j + 4]` as B, G, R, 255.
/// Pixels before `start` are left untouched; nothing happens when `start >= width`.
///
/// # Panics
///
/// Panics on the first index that falls outside one of the slices.
#[inline(always)]
fn scalar_tail_bgra(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    output: &mut [u8],
    start: usize,
    width: usize,
) {
    // Rounds a Q7 product to the nearest integer: add one half (64), drop 7 bits.
    let descale = |scaled: i32| (scaled + 64) >> 7;
    let saturate = |value: i32| value.clamp(0, 255) as u8;

    for px in start..width {
        let luma = i32::from(y_row[px]);
        let cb = i32::from(cb_row[px]) - 128;
        let cr = i32::from(cr_row[px]) - 128;

        let red = saturate(luma + descale(cr * i32::from(CR_TO_R_7)));
        let green =
            saturate(luma + descale(cb * i32::from(CB_TO_G_7) + cr * i32::from(CR_TO_G_7)));
        let blue = saturate(luma + descale(cb * i32::from(CB_TO_B_7)));

        // Same channel values as the RGBA tail, stored blue-first.
        let at = px * 4;
        output[at] = blue;
        output[at + 1] = green;
        output[at + 2] = red;
        output[at + 3] = 255;
    }
}
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod tests {
    use super::{ycbcr_to_bgra_sse2, ycbcr_to_rgb_sse2, ycbcr_to_rgba_sse2};
    use super::{CB_TO_B_7, CB_TO_G_7, CR_TO_G_7, CR_TO_R_7};

    /// Row widths run through every parity test. Multiples of eight exercise only
    /// the vector loop, 13 exercises the vector loop followed by the scalar tail,
    /// 5 exercises the scalar tail alone and 0 exercises neither.
    const WIDTHS: [usize; 5] = [16, 13, 8, 5, 0];

    /// Returns a fixed 16-pixel row of (Y, Cb, Cr) planes used as test input.
    fn make_ycbcr_row() -> ([u8; 16], [u8; 16], [u8; 16]) {
        let y: [u8; 16] = [
            16, 40, 64, 80, 100, 120, 128, 140, 160, 180, 200, 210, 220, 230, 235, 200,
        ];
        let cb: [u8; 16] = [
            128, 100, 80, 64, 150, 200, 128, 90, 110, 170, 210, 128, 60, 220, 128, 140,
        ];
        let cr: [u8; 16] = [
            128, 200, 60, 180, 100, 128, 220, 128, 90, 160, 128, 50, 200, 128, 170, 100,
        ];
        (y, cb, cr)
    }

    /// Independent scalar reference for one pixel, using the same Q7 coefficients.
    fn scalar_rgb_7bit(y: u8, cb: u8, cr: u8) -> (u8, u8, u8) {
        let y = y as i32;
        let cb = cb as i32 - 128;
        let cr = cr as i32 - 128;
        let r = (y + ((cr * CR_TO_R_7 as i32 + 64) >> 7)).clamp(0, 255) as u8;
        let g = (y + ((cb * CB_TO_G_7 as i32 + cr * CR_TO_G_7 as i32 + 64) >> 7)).clamp(0, 255)
            as u8;
        let b = (y + ((cb * CB_TO_B_7 as i32 + 64) >> 7)).clamp(0, 255) as u8;
        (r, g, b)
    }

    /// Reports whether SSE2 can be used, logging a skip notice when it cannot.
    fn sse2_available() -> bool {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
        eprintln!("SIMD feature not available, skipping");
        false
    }

    /// Fails the test when one converted channel differs from the reference by
    /// more than one code value.
    fn assert_channel_parity(
        layout: &str,
        channel: char,
        width: usize,
        pixel: usize,
        scalar: u8,
        simd: u8,
    ) {
        let diff = (i32::from(scalar) - i32::from(simd)).abs();
        assert!(
            diff <= 1,
            "{layout} {channel} parity mismatch at pixel {pixel} (width {width}): scalar={scalar}, simd={simd}, diff={diff}"
        );
    }

    #[test]
    fn ycbcr_to_rgb_parity() {
        if !sse2_available() {
            return;
        }
        let (y, cb, cr) = make_ycbcr_row();
        for &width in WIDTHS.iter() {
            let mut simd_out = vec![0u8; width * 3];
            ycbcr_to_rgb_sse2(&y, &cb, &cr, &mut simd_out, width);
            for i in 0..width {
                let (sr, sg, sb) = scalar_rgb_7bit(y[i], cb[i], cr[i]);
                let px = &simd_out[i * 3..i * 3 + 3];
                assert_channel_parity("RGB", 'R', width, i, sr, px[0]);
                assert_channel_parity("RGB", 'G', width, i, sg, px[1]);
                assert_channel_parity("RGB", 'B', width, i, sb, px[2]);
            }
        }
    }

    #[test]
    fn ycbcr_to_rgba_parity() {
        if !sse2_available() {
            return;
        }
        let (y, cb, cr) = make_ycbcr_row();
        for &width in WIDTHS.iter() {
            let mut simd_out = vec![0u8; width * 4];
            ycbcr_to_rgba_sse2(&y, &cb, &cr, &mut simd_out, width);
            for i in 0..width {
                let (sr, sg, sb) = scalar_rgb_7bit(y[i], cb[i], cr[i]);
                let px = &simd_out[i * 4..i * 4 + 4];
                assert_channel_parity("RGBA", 'R', width, i, sr, px[0]);
                assert_channel_parity("RGBA", 'G', width, i, sg, px[1]);
                assert_channel_parity("RGBA", 'B', width, i, sb, px[2]);
                assert_eq!(
                    px[3], 255,
                    "RGBA alpha must be 255 at pixel {i} (width {width})"
                );
            }
        }
    }

    #[test]
    fn ycbcr_to_bgra_parity() {
        if !sse2_available() {
            return;
        }
        let (y, cb, cr) = make_ycbcr_row();
        for &width in WIDTHS.iter() {
            let mut simd_out = vec![0u8; width * 4];
            ycbcr_to_bgra_sse2(&y, &cb, &cr, &mut simd_out, width);
            for i in 0..width {
                let (sr, sg, sb) = scalar_rgb_7bit(y[i], cb[i], cr[i]);
                // BGRA stores blue first and red third.
                let px = &simd_out[i * 4..i * 4 + 4];
                assert_channel_parity("BGRA", 'B', width, i, sb, px[0]);
                assert_channel_parity("BGRA", 'G', width, i, sg, px[1]);
                assert_channel_parity("BGRA", 'R', width, i, sr, px[2]);
                assert_eq!(
                    px[3], 255,
                    "BGRA alpha must be 255 at pixel {i} (width {width})"
                );
            }
        }
    }
}