/// Splits a packed UYVY frame into separate Y, U and V planes.
///
/// On x86_64 the SSSE3 implementation is selected at runtime when the CPU
/// reports support for it; in every other case the portable scalar routine
/// does the work. The result is the tuple `(y, u, v)`.
///
/// # Panics
///
/// Panics if `src.len()` differs from `width * height * 2`, or if `width`
/// is odd.
pub fn uyvy_to_planar_simd(src: &[u8], width: usize, height: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let pixel_count = width * height;
    let expected_len = pixel_count * 2;
    assert_eq!(
        src.len(),
        expected_len,
        "UYVY src length mismatch: expected {}, got {}",
        expected_len,
        src.len()
    );
    assert_eq!(width % 2, 0, "width must be even for UYVY");

    // One U and one V sample are shared by every pair of pixels.
    let chroma_count = pixel_count / 2;

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            // SAFETY: the callee is compiled with `#[target_feature(enable = "ssse3")]`
            // and SSSE3 support has just been confirmed on the running CPU.
            #[allow(unsafe_code)]
            return unsafe { uyvy_to_planar_ssse3(src, pixel_count, chroma_count) };
        }
    }

    uyvy_to_planar_scalar(src, pixel_count, chroma_count)
}
/// Portable UYVY de-interleaver.
///
/// Walks `src` in 4-byte groups laid out as `U, Y0, V, Y1` and appends each
/// byte to its plane. Trailing bytes that do not form a complete group are
/// ignored. `npix` and `nchroma` are used only to pre-size the output vectors.
///
/// Returns `(y, u, v)`.
fn uyvy_to_planar_scalar(src: &[u8], npix: usize, nchroma: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let mut luma = Vec::with_capacity(npix);
    let mut cb = Vec::with_capacity(nchroma);
    let mut cr = Vec::with_capacity(nchroma);

    for group in src.chunks_exact(4) {
        cb.push(group[0]);
        luma.push(group[1]);
        cr.push(group[2]);
        luma.push(group[3]);
    }

    (luma, cb, cr)
}
/// SSSE3 UYVY de-interleaver that consumes 16 input bytes (8 pixels) per step.
///
/// Each 16-byte block is passed through three byte-shuffle masks that gather
/// the luma bytes (odd offsets), the U bytes (offsets 0, 4, 8, 12) and the
/// V bytes (offsets 2, 6, 10, 14) into the low lanes of a register. Whatever
/// input remains after the last full block is handed to
/// `uyvy_to_planar_scalar`.
///
/// `npix` and `nchroma` are used only to pre-size the returned `(y, u, v)`
/// vectors.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSSE3.
#[cfg(target_arch = "x86_64")]
#[allow(unsafe_code)]
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "ssse3")]
unsafe fn uyvy_to_planar_ssse3(
    src: &[u8],
    npix: usize,
    nchroma: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    use std::arch::x86_64::*;

    let mut luma = Vec::with_capacity(npix);
    let mut cb = Vec::with_capacity(nchroma);
    let mut cr = Vec::with_capacity(nchroma);

    // `_mm_set_epi8` lists lanes from the highest (15) down to the lowest (0).
    // A shuffle index with its top bit set (-1 here) produces a zero byte.
    #[rustfmt::skip]
    let pick_luma = _mm_set_epi8(
        -1, -1, -1, -1, -1, -1, -1, -1,
        15, 13, 11,  9,  7,  5,  3,  1,
    );
    #[rustfmt::skip]
    let pick_u = _mm_set_epi8(
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, 12,  8,  4,  0,
    );
    #[rustfmt::skip]
    let pick_v = _mm_set_epi8(
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, 14, 10,  6,  2,
    );

    let mut blocks = src.chunks_exact(16);
    for block in blocks.by_ref() {
        // SAFETY (load): `block` is exactly 16 readable bytes and the load is unaligned.
        let packed = _mm_loadu_si128(block.as_ptr().cast::<__m128i>());

        // SAFETY (stores): `lanes` is a 16-byte stack buffer and the stores are unaligned.
        // Only the low 8 (luma) or 4 (chroma) bytes of each shuffled vector are kept.
        let mut lanes = [0u8; 16];

        _mm_storeu_si128(
            lanes.as_mut_ptr().cast::<__m128i>(),
            _mm_shuffle_epi8(packed, pick_luma),
        );
        luma.extend_from_slice(&lanes[..8]);

        _mm_storeu_si128(
            lanes.as_mut_ptr().cast::<__m128i>(),
            _mm_shuffle_epi8(packed, pick_u),
        );
        cb.extend_from_slice(&lanes[..4]);

        _mm_storeu_si128(
            lanes.as_mut_ptr().cast::<__m128i>(),
            _mm_shuffle_epi8(packed, pick_v),
        );
        cr.extend_from_slice(&lanes[..4]);
    }

    // Fewer than 16 bytes may be left over; finish them with the scalar routine.
    let leftover = blocks.remainder();
    if !leftover.is_empty() {
        let (luma_tail, cb_tail, cr_tail) = uyvy_to_planar_scalar(
            leftover,
            src.len() / 2 - luma.len(),
            (src.len() / 4) - cb.len(),
        );
        luma.extend(luma_tail);
        cb.extend(cb_tail);
        cr.extend(cr_tail);
    }

    (luma, cb, cr)
}
/// Unpacks v210 words into separate 10-bit Y, Cb and Cr planes.
///
/// Every group of four 32-bit words carries six pixels. Each word holds three
/// 10-bit samples at bit offsets 0, 10 and 20:
///
/// | word | bits 0-9 | bits 10-19 | bits 20-29 |
/// |------|----------|------------|------------|
/// | 0    | Cb0      | Y0         | Cr0        |
/// | 1    | Y1       | Cb1        | Y2         |
/// | 2    | Cr1      | Y3         | Cb2        |
/// | 3    | Y4       | Cr2        | Y5         |
///
/// Returns `(y, cb, cr)`; every sample is masked to 10 bits.
///
/// # Panics
///
/// Panics if `width` is not a multiple of 6, or if `src.len()` differs from
/// `(width / 6) * 4 * height`.
pub fn v210_to_planar(src: &[u32], width: usize, height: usize) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
    assert_eq!(width % 6, 0, "v210 width must be a multiple of 6");
    let words_per_line = (width / 6) * 4;
    let expected_words = words_per_line * height;
    assert_eq!(
        src.len(),
        expected_words,
        "v210 src length mismatch: expected {}, got {}",
        expected_words,
        src.len()
    );

    let pixel_count = width * height;
    let chroma_count = pixel_count / 2;
    let mut luma = Vec::with_capacity(pixel_count);
    let mut blue = Vec::with_capacity(chroma_count);
    let mut red = Vec::with_capacity(chroma_count);

    // Pulls the 10-bit field that starts at bit `shift` out of `word`.
    let sample = |word: u32, shift: u32| ((word >> shift) & 0x3FF) as u16;

    for quad in src.chunks_exact(4) {
        let (w0, w1, w2, w3) = (quad[0], quad[1], quad[2], quad[3]);

        luma.extend_from_slice(&[
            sample(w0, 10),
            sample(w1, 0),
            sample(w1, 20),
            sample(w2, 10),
            sample(w3, 0),
            sample(w3, 20),
        ]);
        blue.extend_from_slice(&[sample(w0, 0), sample(w1, 10), sample(w2, 20)]);
        red.extend_from_slice(&[sample(w0, 20), sample(w2, 0), sample(w3, 10)]);
    }

    (luma, blue, red)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a deterministic packed UYVY buffer covering `npix` pixels.
    ///
    /// Pixel pair `i` is written as `U, Y0, V, Y1` with the values
    /// `i*7`, `i*13`, `i*19` and `i*23`, each reduced modulo 256.
    fn patterned_uyvy(npix: usize) -> Vec<u8> {
        let mut src = Vec::with_capacity(npix * 2);
        for i in 0..(npix / 2) {
            src.extend_from_slice(&[
                (i * 7 % 256) as u8,
                (i * 13 % 256) as u8,
                (i * 19 % 256) as u8,
                (i * 23 % 256) as u8,
            ]);
        }
        src
    }

    /// The dispatched implementation must agree with the scalar reference on
    /// an input whose byte length is a multiple of 16.
    #[test]
    fn test_uyvy_to_planar_simd_matches_scalar() {
        let width = 16usize;
        let height = 4usize;
        let npix = width * height;
        let src = patterned_uyvy(npix);
        let (y_scalar, u_scalar, v_scalar) = uyvy_to_planar_scalar(&src, npix, npix / 2);
        let (y_simd, u_simd, v_simd) = uyvy_to_planar_simd(&src, width, height);
        assert_eq!(y_simd, y_scalar, "Y planes differ");
        assert_eq!(u_simd, u_scalar, "U planes differ");
        assert_eq!(v_simd, v_scalar, "V planes differ");
    }

    /// Covers an input that is not a whole number of 16-byte blocks, so the
    /// vectorised routine has to hand the remainder to the scalar routine.
    #[test]
    fn test_uyvy_to_planar_simd_handles_partial_block() {
        // 6 x 3 pixels = 36 bytes: two full 16-byte blocks plus a 4-byte tail.
        let width = 6usize;
        let height = 3usize;
        let npix = width * height;
        let src = patterned_uyvy(npix);
        assert_eq!(src.len() % 16, 4, "fixture must leave a partial block");

        let (y_scalar, u_scalar, v_scalar) = uyvy_to_planar_scalar(&src, npix, npix / 2);
        let (y_simd, u_simd, v_simd) = uyvy_to_planar_simd(&src, width, height);
        assert_eq!(y_simd, y_scalar, "Y planes differ");
        assert_eq!(u_simd, u_scalar, "U planes differ");
        assert_eq!(v_simd, v_scalar, "V planes differ");

        // Check against the generating pattern as well, independently of the
        // scalar implementation.
        assert_eq!(y_simd.len(), npix);
        assert_eq!(u_simd.len(), npix / 2);
        assert_eq!(v_simd.len(), npix / 2);
        for i in 0..(npix / 2) {
            assert_eq!(u_simd[i], (i * 7 % 256) as u8, "U[{i}] mismatch");
            assert_eq!(y_simd[2 * i], (i * 13 % 256) as u8, "Y[{}] mismatch", 2 * i);
            assert_eq!(v_simd[i], (i * 19 % 256) as u8, "V[{i}] mismatch");
            assert_eq!(y_simd[2 * i + 1], (i * 23 % 256) as u8, "Y[{}] mismatch", 2 * i + 1);
        }
    }

    /// Packs twelve distinct 10-bit samples into one v210 group and checks
    /// that each one lands in the right plane and position.
    #[test]
    fn test_v210_to_planar_known_values() {
        let cb0: u32 = 0x100; let y0: u32 = 0x200; let cr0: u32 = 0x300;
        let y1: u32 = 0x040; let cb1: u32 = 0x080; let y2: u32 = 0x0C0;
        let cr1: u32 = 0x110; let y3: u32 = 0x150; let cb2: u32 = 0x190;
        let y4: u32 = 0x020; let cr2: u32 = 0x060; let y5: u32 = 0x0A0;
        let w0 = cb0 | (y0 << 10) | (cr0 << 20);
        let w1 = y1 | (cb1 << 10) | (y2 << 20);
        let w2 = cr1 | (y3 << 10) | (cb2 << 20);
        let w3 = y4 | (cr2 << 10) | (y5 << 20);
        let src = [w0, w1, w2, w3];
        let (y, cb, cr) = v210_to_planar(&src, 6, 1);
        assert_eq!(y[0], 0x200, "Y0 mismatch");
        assert_eq!(y[1], 0x040, "Y1 mismatch");
        assert_eq!(y[2], 0x0C0, "Y2 mismatch");
        assert_eq!(y[3], 0x150, "Y3 mismatch");
        assert_eq!(y[4], 0x020, "Y4 mismatch");
        assert_eq!(y[5], 0x0A0, "Y5 mismatch");
        assert_eq!(cb[0], 0x100, "Cb0 mismatch");
        assert_eq!(cb[1], 0x080, "Cb1 mismatch");
        assert_eq!(cb[2], 0x190, "Cb2 mismatch");
        assert_eq!(cr[0], 0x300, "Cr0 mismatch");
        assert_eq!(cr[1], 0x110, "Cr1 mismatch");
        assert_eq!(cr[2], 0x060, "Cr2 mismatch");
    }

    /// Plane lengths for UYVY: one Y per pixel, one U and one V per pixel pair.
    #[test]
    fn test_uyvy_planar_output_sizes() {
        let width = 8usize;
        let height = 2usize;
        let src = vec![0u8; width * height * 2];
        let (y, u, v) = uyvy_to_planar_simd(&src, width, height);
        assert_eq!(y.len(), width * height);
        assert_eq!(u.len(), width * height / 2);
        assert_eq!(v.len(), width * height / 2);
    }

    /// Plane lengths for v210: one Y per pixel, one Cb and one Cr per pixel pair.
    #[test]
    fn test_v210_planar_output_sizes() {
        let width = 6usize;
        let height = 2usize;
        let words_per_line = (width / 6) * 4;
        let src = vec![0u32; words_per_line * height];
        let (y, cb, cr) = v210_to_planar(&src, width, height);
        assert_eq!(y.len(), width * height);
        assert_eq!(cb.len(), width * height / 2);
        assert_eq!(cr.len(), width * height / 2);
    }

    /// With every 10-bit field saturated, no output sample may exceed 0x3FF.
    #[test]
    fn test_v210_10bit_values_in_range() {
        let w = 0x3FF | (0x3FF << 10) | (0x3FF << 20);
        let src = [w, w, w, w];
        let (y, cb, cr) = v210_to_planar(&src, 6, 1);
        for &val in y.iter().chain(cb.iter()).chain(cr.iter()) {
            assert!(val <= 0x3FF, "value {val} exceeds 10-bit range");
        }
    }
}