#![allow(unsafe_code)]
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use super::simd_pixel::YuvCoeffs;
/// Converts an NV12 image to packed RGB24, four pixels per step, using SSE4.1.
///
/// Layout, as implied by the indexing below:
/// * `y_plane` is read with a stride of `width` bytes per row.
/// * `uv_plane` holds interleaved `(U, V)` byte pairs; one chroma row of `width`
///   bytes is shared by two consecutive luma rows, and one `(U, V)` pair is
///   shared by two horizontally adjacent pixels.
/// * `rgb_out` receives `R, G, B` bytes with a stride of `width * 3` per row.
///
/// Each channel is computed in 32-bit integer arithmetic as
/// `((y - 16) * y_scale + coeff * (chroma - 128)) >> 14`, then clamped to
/// `0..=255`. Columns past the last multiple of four are delegated to
/// `nv12_pixel_scalar`.
///
/// NOTE(review): with an odd `width` the V sample of the last column is read at
/// `uv_row_base + width`, i.e. one byte past the chroma row — confirm callers
/// only pass even widths.
///
/// # Safety
///
/// The caller must guarantee that the running CPU supports SSE4.1, for example
/// by checking `is_x86_feature_detected!("sse4.1")` first.
///
/// # Panics
///
/// All plane accesses are bounds-checked, so this panics if `y_plane`,
/// `uv_plane` or `rgb_out` is too short for the given `width` and `height`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
pub unsafe fn nv12_to_rgb24_sse41(
    y_plane: &[u8],
    uv_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    height: usize,
    coeffs: &YuvCoeffs,
) {
    // Broadcast the fixed-point coefficients and the biases to all four lanes.
    let y_scale_v = _mm_set1_epi32(coeffs.y_scale);
    let cr_to_r_v = _mm_set1_epi32(coeffs.cr_to_r);
    let cb_to_g_v = _mm_set1_epi32(coeffs.cb_to_g);
    let cr_to_g_v = _mm_set1_epi32(coeffs.cr_to_g);
    let cb_to_b_v = _mm_set1_epi32(coeffs.cb_to_b);
    let bias16_v = _mm_set1_epi32(16);
    let bias128_v = _mm_set1_epi32(128);

    // Leading columns that form complete groups of four pixels.
    let cols4 = (width / 4) * 4;

    for row in 0..height {
        let y_row_base = row * width;
        let uv_row_base = (row / 2) * width;
        let rgb_row_base = row * width * 3;

        for col in (0..cols4).step_by(4) {
            // Four luma samples, widened to one 32-bit lane each.
            let y_at = y_row_base + col;
            let luma = &y_plane[y_at..y_at + 4];
            let y_vec = _mm_set_epi32(
                i32::from(luma[3]),
                i32::from(luma[2]),
                i32::from(luma[1]),
                i32::from(luma[0]),
            );

            // Two interleaved (U, V) pairs; each pair covers two adjacent pixels.
            let uv_at = uv_row_base + col;
            let chroma = &uv_plane[uv_at..uv_at + 4];
            let (u0, v0) = (i32::from(chroma[0]), i32::from(chroma[1]));
            let (u1, v1) = (i32::from(chroma[2]), i32::from(chroma[3]));
            let u_vec = _mm_set_epi32(u1, u1, u0, u0);
            let v_vec = _mm_set_epi32(v1, v1, v0, v0);

            let y_scaled = _mm_mullo_epi32(_mm_sub_epi32(y_vec, bias16_v), y_scale_v);
            let cb = _mm_sub_epi32(u_vec, bias128_v);
            let cr = _mm_sub_epi32(v_vec, bias128_v);

            let r_vec = _mm_srai_epi32(
                _mm_add_epi32(y_scaled, _mm_mullo_epi32(cr, cr_to_r_v)),
                14,
            );
            let g_vec = _mm_srai_epi32(
                _mm_add_epi32(
                    y_scaled,
                    _mm_add_epi32(
                        _mm_mullo_epi32(cb, cb_to_g_v),
                        _mm_mullo_epi32(cr, cr_to_g_v),
                    ),
                ),
                14,
            );
            let b_vec = _mm_srai_epi32(
                _mm_add_epi32(y_scaled, _mm_mullo_epi32(cb, cb_to_b_v)),
                14,
            );

            // Clamp to 0..=255. The first pack must saturate as *signed* 16-bit:
            // `_mm_packus_epi16` reads its input as signed, so an unsigned
            // 32->16 pack would let lanes >= 32768 look negative and collapse to
            // 0 instead of 255.
            let r_i16 = _mm_packs_epi32(r_vec, r_vec);
            let g_i16 = _mm_packs_epi32(g_vec, g_vec);
            let b_i16 = _mm_packs_epi32(b_vec, b_vec);

            // After the second pack the four results sit in the low 32 bits, in
            // pixel order; x86 is little-endian, so `to_le_bytes` yields them
            // as [p0, p1, p2, p3].
            let r = _mm_cvtsi128_si32(_mm_packus_epi16(r_i16, r_i16)).to_le_bytes();
            let g = _mm_cvtsi128_si32(_mm_packus_epi16(g_i16, g_i16)).to_le_bytes();
            let b = _mm_cvtsi128_si32(_mm_packus_epi16(b_i16, b_i16)).to_le_bytes();

            let out = rgb_row_base + col * 3;
            for (i, px) in rgb_out[out..out + 12].chunks_exact_mut(3).enumerate() {
                px[0] = r[i];
                px[1] = g[i];
                px[2] = b[i];
            }
        }

        // Up to three trailing columns that do not fill a vector.
        for col in cols4..width {
            nv12_pixel_scalar(
                y_plane,
                uv_plane,
                rgb_out,
                col,
                uv_row_base,
                y_row_base,
                rgb_row_base,
                coeffs,
            );
        }
    }
}
/// Converts the single pixel at column `col` of one row from NV12 to RGB24.
///
/// The caller supplies the start offsets of the current row in each buffer:
/// `y_row_base` into `y_plane`, `uv_row_base` into `uv_plane` and
/// `rgb_row_base` into `rgb_out`. The three output bytes are written in
/// `R, G, B` order at `rgb_row_base + col * 3`.
///
/// Panics if any of the computed indices is out of bounds for its slice.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn nv12_pixel_scalar(
    y_plane: &[u8],
    uv_plane: &[u8],
    rgb_out: &mut [u8],
    col: usize,
    uv_row_base: usize,
    y_row_base: usize,
    rgb_row_base: usize,
    coeffs: &YuvCoeffs,
) {
    let luma = i32::from(y_plane[y_row_base + col]);

    // Clearing the lowest bit makes an odd column reuse the (U, V) byte pair of
    // the even column to its left.
    let chroma_at = uv_row_base + (col & !1);
    let cb = i32::from(uv_plane[chroma_at]) - 128;
    let cr = i32::from(uv_plane[chroma_at + 1]) - 128;

    let luma_term = (luma - 16) * coeffs.y_scale;

    // Drop the 14 fractional bits, then saturate into the byte range.
    let to_byte = |acc: i32| (acc >> 14).clamp(0, 255) as u8;
    let red = to_byte(luma_term + coeffs.cr_to_r * cr);
    let green = to_byte(luma_term + coeffs.cb_to_g * cb + coeffs.cr_to_g * cr);
    let blue = to_byte(luma_term + coeffs.cb_to_b * cb);

    let dst = rgb_row_base + col * 3;
    rgb_out[dst] = red;
    rgb_out[dst + 1] = green;
    rgb_out[dst + 2] = blue;
}
// Tests comparing the SSE4.1 kernel (and the public dispatcher) against an
// independent scalar reference implemented inside this module.
#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::*;
    use crate::convert::simd_pixel::{nv12_to_rgb24, SimdColorMatrix, YuvCoeffs};

    // Reference conversion: one pixel at a time, plain integer arithmetic.
    fn scalar_nv12_to_rgb24(
        y_plane: &[u8],
        uv_plane: &[u8],
        width: usize,
        height: usize,
        coeffs: &YuvCoeffs,
    ) -> Vec<u8> {
        let to_byte = |acc: i32| (acc >> 14).clamp(0, 255) as u8;
        let mut rgb = vec![0u8; width * height * 3];
        // Output pixels are visited in row-major order, three bytes each.
        for (pixel, px) in rgb.chunks_exact_mut(3).enumerate() {
            let (row, col) = (pixel / width, pixel % width);
            let luma = i32::from(y_plane[row * width + col]);
            let chroma_at = (row / 2) * width + (col & !1);
            let cb = i32::from(uv_plane[chroma_at]) - 128;
            let cr = i32::from(uv_plane[chroma_at + 1]) - 128;
            let luma_term = (luma - 16) * coeffs.y_scale;
            px[0] = to_byte(luma_term + coeffs.cr_to_r * cr);
            px[1] = to_byte(luma_term + coeffs.cb_to_g * cb + coeffs.cr_to_g * cr);
            px[2] = to_byte(luma_term + coeffs.cb_to_b * cb);
        }
        rgb
    }

    // Asserts equal length and a per-byte difference of at most one.
    fn compare_within_1(label: &str, sse: &[u8], scalar: &[u8]) {
        let (sse_len, scalar_len) = (sse.len(), scalar.len());
        assert_eq!(
            sse_len, scalar_len,
            "{label}: length mismatch {sse_len} vs {scalar_len}"
        );
        for (i, (&a, &b)) in sse.iter().zip(scalar).enumerate() {
            let diff = if a > b { a - b } else { b - a };
            assert!(
                diff <= 1,
                "{label}: byte {i} differs by {diff}: sse={a} scalar={b}"
            );
        }
    }

    // Runs the SSE4.1 kernel when the CPU has it; otherwise returns the scalar
    // reference output so the tests still execute.
    fn run_sse41(
        y_plane: &[u8],
        uv_plane: &[u8],
        width: usize,
        height: usize,
        coeffs: &YuvCoeffs,
    ) -> Vec<u8> {
        if !is_x86_feature_detected!("sse4.1") {
            return scalar_nv12_to_rgb24(y_plane, uv_plane, width, height, coeffs);
        }
        let mut rgb = vec![0u8; width * height * 3];
        // SAFETY: SSE4.1 support was verified by the runtime check above.
        unsafe {
            nv12_to_rgb24_sse41(y_plane, uv_plane, &mut rgb, width, height, coeffs);
        }
        rgb
    }

    // Builds a plane of `len` bytes where byte `i` is `value_at(i)`.
    fn fill_plane(len: usize, value_at: impl Fn(usize) -> u8) -> Vec<u8> {
        (0..len).map(value_at).collect()
    }

    // Byte length the tests use for the interleaved chroma plane.
    fn chroma_len(width: usize, height: usize) -> usize {
        (width / 2) * (height / 2) * 2
    }

    // Converts via the SSE4.1 path and via the reference, then compares.
    fn check_against_scalar(
        label: &str,
        y_plane: &[u8],
        uv_plane: &[u8],
        width: usize,
        height: usize,
        coeffs: &YuvCoeffs,
    ) {
        let sse = run_sse41(y_plane, uv_plane, width, height, coeffs);
        let scalar = scalar_nv12_to_rgb24(y_plane, uv_plane, width, height, coeffs);
        compare_within_1(label, &sse, &scalar);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_uniform() {
        let (width, height) = (8usize, 4usize);
        let y_plane = vec![128u8; width * height];
        let uv_plane = vec![128u8; chroma_len(width, height)];
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt709);
        check_against_scalar("uniform_gray", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_gradient() {
        let (width, height) = (8usize, 4usize);
        let y_plane = fill_plane(width * height, |i| (i * 7 % 220 + 16) as u8);
        let uv_plane = vec![128u8; chroma_len(width, height)];
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt601);
        check_against_scalar("gradient", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_4x4() {
        let (width, height) = (4usize, 4usize);
        let y_plane = fill_plane(width * height, |i| (16 + i * 15) as u8);
        let uv_size = width * height / 2;
        let uv_plane = fill_plane(uv_size, |i| 100u8.wrapping_add(i as u8 * 10));
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt709);
        check_against_scalar("4x4", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_8x2() {
        let (width, height) = (8usize, 2usize);
        let y_plane = fill_plane(width * height, |i| (50 + i * 13) as u8);
        let uv_plane = fill_plane(chroma_len(width, height), |i| (100 + i * 17) as u8);
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt2020);
        check_against_scalar("8x2", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_16x16() {
        let (width, height) = (16usize, 16usize);
        let y_plane = fill_plane(width * height, |i| (i % 220 + 16) as u8);
        let uv_plane = fill_plane(chroma_len(width, height), |i| (64 + i * 5 % 128) as u8);
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt601);
        check_against_scalar("16x16", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_sse41_vs_scalar_remainder_cols() {
        // Width 6 leaves two columns for the scalar tail after one group of four.
        let (width, height) = (6usize, 4usize);
        let y_plane = fill_plane(width * height, |i| (30 + i * 11) as u8);
        let uv_plane = vec![128u8; chroma_len(width, height)];
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt709);
        check_against_scalar("6-wide remainder", &y_plane, &uv_plane, width, height, &coeffs);
    }

    #[test]
    fn test_nv12_to_rgb24_dispatch_matches_scalar() {
        let (width, height) = (8usize, 4usize);
        let y_plane = fill_plane(width * height, |i| (i % 220 + 16) as u8);
        let uv_plane = fill_plane(chroma_len(width, height), |i| (80 + i * 3 % 160) as u8);
        let coeffs = YuvCoeffs::for_matrix(SimdColorMatrix::Bt709);
        let dispatched = nv12_to_rgb24(&y_plane, &uv_plane, width, height, SimdColorMatrix::Bt709);
        let scalar = scalar_nv12_to_rgb24(&y_plane, &uv_plane, width, height, &coeffs);
        compare_within_1("dispatch", &dispatched, &scalar);
    }
}