use anyhow::{Result, bail};
use bytes::{Bytes, BytesMut};
use crate::frame::{ColorMetadata, ColorSpace, PixelFormat, TransferFn, VideoFrame};
use crate::tonemap::tonemap_yuv420p10le_bt2020_to_yuv420p_bt709;
/// Converts `frame` to an SDR BT.709 frame, tone-mapping when the input is HDR.
///
/// The tone-mapping path is taken only when the transfer function in
/// `color_metadata` is `St2084` or `AribStdB67` *and* the frame is
/// `Yuv420p10le`. Every other combination is forwarded to
/// [`convert_to_yuv420p_bt709`].
///
/// When mastering-display metadata is present, its `max_luminance` divided by
/// 10 000 is passed to the tone mapper as the peak white level; non-positive
/// results are treated as "unknown" (`None`).
///
/// # Errors
/// Propagates any error returned by the tone mapper or by
/// [`convert_to_yuv420p_bt709`].
pub fn convert_to_sdr_bt709(
    frame: &VideoFrame,
    color_metadata: &ColorMetadata,
) -> Result<VideoFrame> {
    let needs_tonemap = match color_metadata.transfer {
        TransferFn::St2084 | TransferFn::AribStdB67 => {
            matches!(frame.format, PixelFormat::Yuv420p10le)
        }
        _ => false,
    };
    if !needs_tonemap {
        return convert_to_yuv420p_bt709(frame);
    }

    // NOTE(review): the /10_000 scaling presumably converts the container's
    // luminance unit to nits — confirm against the `ColorMetadata` producer.
    let max_white_nits = color_metadata.mastering_display.as_ref().and_then(|display| {
        let nits = (display.max_luminance as f32) / 10_000.0;
        (nits > 0.0).then_some(nits)
    });

    tonemap_yuv420p10le_bt2020_to_yuv420p_bt709(frame, color_metadata.transfer, max_white_nits)
}
/// Converts `frame` to the pipeline's 4:2:0 planar layout.
///
/// Dispatch by input format:
/// * `Yuv420p10le` is returned unchanged (cloned).
/// * `Yuv422p10le`, `Yuv444p10le` and `Yuva444p10le` are chroma-downsampled and
///   stay 10-bit; no colour-matrix change is applied on these paths.
/// * `Rgb24` / `Rgba32` are converted straight to BT.709 `Yuv420p`.
/// * `Yuv420p`, `Nv12`, `Nv21`, `Yuv422p` and `Yuv444p` are first brought to
///   `Yuv420p`, then re-matrixed to BT.709 unless already tagged `Bt709`.
///
/// # Errors
/// Fails for `Yuv420p12le`, for any other unsupported format, and whenever one
/// of the per-format converters fails.
pub fn convert_to_yuv420p_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
    use PixelFormat::*;

    let planar_8bit = match frame.format {
        // 10-bit inputs: these return directly, without the recolor step below.
        Yuv420p10le => return Ok(frame.clone()),
        Yuv422p10le => return yuv422p10le_to_yuv420p10le(frame),
        Yuv444p10le | Yuva444p10le => return downsample_444_to_420_frame(frame),
        Yuv420p12le => bail!(
            "Yuv420p12le not yet supported in convert_to_yuv420p_bt709 \
             (no decoder in tree emits 12-bit; add a 12→10-bit dither \
             when a decoder lands that does)"
        ),

        // Packed RGB: the converter already emits BT.709.
        Rgb24 => return rgb_to_yuv420p_bt709(frame, false),
        Rgba32 => return rgb_to_yuv420p_bt709(frame, true),

        // 8-bit YUV: normalise the layout first; colour space is handled below.
        Yuv420p => frame.clone(),
        Nv12 => nv12_to_yuv420p(frame)?,
        Nv21 => nv21_to_yuv420p(frame)?,
        Yuv422p => yuv422p_to_yuv420p(frame)?,
        Yuv444p => downsample_444_to_420_frame(frame)?,
        other => bail!(
            "unsupported conversion: {:?}/{:?} → Yuv420p/Bt709",
            other,
            frame.color_space
        ),
    };

    if planar_8bit.color_space != ColorSpace::Bt709 {
        return recolor_yuv420p_bt601_to_bt709(&planar_8bit);
    }
    Ok(planar_8bit)
}
/// Converts an NV12 frame to planar `Yuv420p`.
///
/// Delegates to `deinterleave_semiplanar_to_yuv420p` with `v_first = false`,
/// i.e. each interleaved chroma pair is read as (U, V).
fn nv12_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
deinterleave_semiplanar_to_yuv420p(frame, false)
}
/// Converts an NV21 frame to planar `Yuv420p`.
///
/// Delegates to `deinterleave_semiplanar_to_yuv420p` with `v_first = true`,
/// i.e. each interleaved chroma pair is read as (V, U).
fn nv21_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
deinterleave_semiplanar_to_yuv420p(frame, true)
}
/// Splits a semi-planar frame (Y plane followed by interleaved chroma pairs)
/// into planar `Yuv420p` (Y, then U, then V).
///
/// `v_first` selects the order inside each interleaved pair: `true` means
/// (V, U), `false` means (U, V). Each chroma plane is sized as
/// `width * height / 4` samples.
///
/// # Errors
/// Fails when `frame.data` holds fewer than `width * height + 2 * (width * height / 4)` bytes.
fn deinterleave_semiplanar_to_yuv420p(frame: &VideoFrame, v_first: bool) -> Result<VideoFrame> {
    let w = frame.width as usize;
    let h = frame.height as usize;
    let luma_len = w * h;
    let chroma_len = luma_len / 4;
    let required = luma_len + 2 * chroma_len;
    if frame.data.len() < required {
        bail!(
            "{} frame too small for {}x{}: need {} bytes got {}",
            if v_first { "NV21" } else { "NV12" },
            w,
            h,
            required,
            frame.data.len()
        );
    }

    let (luma, trailing) = frame.data.split_at(luma_len);
    // Ignore anything past the chroma payload we actually need.
    let interleaved = &trailing[..2 * chroma_len];
    // Position of U and V inside each two-byte pair.
    let (u_pos, v_pos) = if v_first { (1, 0) } else { (0, 1) };

    let mut out = BytesMut::with_capacity(luma_len + chroma_len * 2);
    out.extend_from_slice(luma);
    out.extend(interleaved.chunks_exact(2).map(|pair| pair[u_pos]));
    out.extend(interleaved.chunks_exact(2).map(|pair| pair[v_pos]));

    Ok(VideoFrame::new(
        out.freeze(),
        frame.width,
        frame.height,
        PixelFormat::Yuv420p,
        frame.color_space,
        frame.pts,
    ))
}
/// Halves the vertical chroma resolution of an 8-bit 4:2:2 planar frame.
///
/// Each output chroma sample is the rounded average of two vertically adjacent
/// input samples; for an odd height the last input row is paired with itself.
/// The luma plane is copied unchanged.
///
/// # Errors
/// Fails when `frame.data` is shorter than the Y plane plus two
/// `ceil(width / 2) * height` chroma planes.
fn yuv422p_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
    let w = frame.width as usize;
    let h = frame.height as usize;
    let chroma_w = w.div_ceil(2);
    let out_rows = h.div_ceil(2);
    let luma_len = w * h;
    let chroma_in_len = chroma_w * h;
    let chroma_out_len = chroma_w * out_rows;
    let required = luma_len + 2 * chroma_in_len;
    if frame.data.len() < required {
        bail!(
            "Yuv422p frame too small for {}x{}: need {} bytes got {}",
            w,
            h,
            required,
            frame.data.len()
        );
    }

    let (luma, chroma) = frame.data.split_at(luma_len);
    let (cb_in, cr_in) = chroma.split_at(chroma_in_len);

    let mut out = BytesMut::with_capacity(luma_len + 2 * chroma_out_len);
    out.extend_from_slice(luma);
    for plane in [cb_in, cr_in] {
        for out_row in 0..out_rows {
            let upper = 2 * out_row;
            // Clamp so an odd-height frame re-uses its final row.
            let lower = (upper + 1).min(h - 1);
            let upper_row = &plane[upper * chroma_w..][..chroma_w];
            let lower_row = &plane[lower * chroma_w..][..chroma_w];
            out.extend(
                upper_row
                    .iter()
                    .zip(lower_row)
                    .map(|(&a, &b)| ((u16::from(a) + u16::from(b) + 1) >> 1) as u8),
            );
        }
    }

    Ok(VideoFrame::new(
        out.freeze(),
        frame.width,
        frame.height,
        PixelFormat::Yuv420p,
        frame.color_space,
        frame.pts,
    ))
}
/// Halves the vertical chroma resolution of a 16-bit-per-sample little-endian
/// 4:2:2 planar frame, producing `Yuv420p10le`.
///
/// Each output chroma sample is the rounded average of two vertically adjacent
/// input samples; for an odd height the last input row is paired with itself.
/// Luma samples are re-emitted unchanged.
///
/// # Errors
/// Fails when `frame.data` is shorter than two bytes per sample for the Y
/// plane plus two `ceil(width / 2) * height` chroma planes.
fn yuv422p10le_to_yuv420p10le(frame: &VideoFrame) -> Result<VideoFrame> {
    let w = frame.width as usize;
    let h = frame.height as usize;
    let chroma_w = w.div_ceil(2);
    let out_rows = h.div_ceil(2);
    let luma_samples = w * h;
    let chroma_in_samples = chroma_w * h;
    let chroma_out_samples = chroma_w * out_rows;
    let need_bytes = (luma_samples + 2 * chroma_in_samples) * 2;
    if frame.data.len() < need_bytes {
        bail!(
            "Yuv422p10le frame too small for {}x{}: need {} bytes got {}",
            w,
            h,
            need_bytes,
            frame.data.len()
        );
    }

    let samples = read_u16le(&frame.data[..need_bytes]);
    let (luma, chroma) = samples.split_at(luma_samples);
    let (cb_in, cr_in) = chroma.split_at(chroma_in_samples);

    let mut out = BytesMut::with_capacity((luma_samples + 2 * chroma_out_samples) * 2);
    write_u16le(&mut out, luma);
    for plane in [cb_in, cr_in] {
        for out_row in 0..out_rows {
            let upper = 2 * out_row;
            // Clamp so an odd-height frame re-uses its final row.
            let lower = (upper + 1).min(h - 1);
            let upper_row = &plane[upper * chroma_w..][..chroma_w];
            let lower_row = &plane[lower * chroma_w..][..chroma_w];
            for (&a, &b) in upper_row.iter().zip(lower_row) {
                let mean = ((u32::from(a) + u32::from(b) + 1) >> 1) as u16;
                out.extend_from_slice(&mean.to_le_bytes());
            }
        }
    }

    Ok(VideoFrame::new(
        out.freeze(),
        frame.width,
        frame.height,
        PixelFormat::Yuv420p10le,
        frame.color_space,
        frame.pts,
    ))
}
/// Converts packed 8-bit RGB (`Rgb24`) or RGBA (`Rgba32`, alpha ignored) to
/// planar `Yuv420p` tagged as BT.709.
///
/// Luma is computed per pixel and clamped to 16..=235. Each chroma sample is
/// derived from the rounded average of the 2x2 RGB neighbourhood (edge pixels
/// are repeated for odd dimensions) and clamped to 16..=240.
///
/// # Errors
/// Fails when `frame.data` holds fewer than `width * height * bytes_per_pixel` bytes.
fn rgb_to_yuv420p_bt709(frame: &VideoFrame, has_alpha: bool) -> Result<VideoFrame> {
    // Matrix coefficients scaled by 2^15; each row is applied with a
    // `+ (1 << 14)` rounding term and a 15-bit right shift.
    const Y_R: i32 = 5982;
    const Y_G: i32 = 20128;
    const Y_B: i32 = 2032;
    const CB_R: i32 = -3299;
    const CB_G: i32 = -11086;
    const CB_B: i32 = 14385;
    const CR_R: i32 = 14385;
    const CR_G: i32 = -13066;
    const CR_B: i32 = -1319;

    let w = frame.width as usize;
    let h = frame.height as usize;
    let bytes_per_pixel: usize = if has_alpha { 4 } else { 3 };
    let need = w * h * bytes_per_pixel;
    if frame.data.len() < need {
        bail!(
            "{} frame too small for {}x{}: need {} bytes got {}",
            if has_alpha { "Rgba32" } else { "Rgb24" },
            w,
            h,
            need,
            frame.data.len()
        );
    }

    let cw = w.div_ceil(2);
    let ch = h.div_ceil(2);
    let y_size = w * h;
    let chroma_size = cw * ch;
    let mut out = BytesMut::with_capacity(y_size + 2 * chroma_size);
    out.resize(y_size + 2 * chroma_size, 0);

    let pixels = &frame.data[..need];
    {
        let (luma_plane, chroma_planes) = out.split_at_mut(y_size);
        let (cb_plane, cr_plane) = chroma_planes.split_at_mut(chroma_size);

        // Full-resolution luma: one output byte per input pixel.
        for (px, luma) in pixels.chunks_exact(bytes_per_pixel).zip(luma_plane.iter_mut()) {
            let (r, g, b) = (i32::from(px[0]), i32::from(px[1]), i32::from(px[2]));
            let y_val = ((r * Y_R + g * Y_G + b * Y_B + (1 << 14)) >> 15) + 16;
            *luma = y_val.clamp(16, 235) as u8;
        }

        // Quarter-resolution chroma from averaged 2x2 RGB blocks.
        for cy in 0..ch {
            let rows = [2 * cy, (2 * cy + 1).min(h - 1)];
            for cx in 0..cw {
                let cols = [2 * cx, (2 * cx + 1).min(w - 1)];
                let (mut r_sum, mut g_sum, mut b_sum) = (0i32, 0i32, 0i32);
                for py in rows {
                    for px in cols {
                        let off = (py * w + px) * bytes_per_pixel;
                        r_sum += i32::from(pixels[off]);
                        g_sum += i32::from(pixels[off + 1]);
                        b_sum += i32::from(pixels[off + 2]);
                    }
                }
                let r = (r_sum + 2) >> 2;
                let g = (g_sum + 2) >> 2;
                let b = (b_sum + 2) >> 2;
                let cb = ((r * CB_R + g * CB_G + b * CB_B + (1 << 14)) >> 15) + 128;
                let cr = ((r * CR_R + g * CR_G + b * CR_B + (1 << 14)) >> 15) + 128;
                cb_plane[cy * cw + cx] = cb.clamp(16, 240) as u8;
                cr_plane[cy * cw + cx] = cr.clamp(16, 240) as u8;
            }
        }
    }

    Ok(VideoFrame::new(
        out.freeze(),
        frame.width,
        frame.height,
        PixelFormat::Yuv420p,
        ColorSpace::Bt709,
        frame.pts,
    ))
}
/// Number of fractional bits used by the fixed-point matrix coefficients below.
const Q15: i32 = 15;
/// Rounding term added before shifting a product right by [`Q15`] bits.
const Q15_ROUND: i32 = 1 << (Q15 - 1);

// BT.601 → BT.709 matrix entries scaled by 2^15. `M_<out>_<in>` is the
// contribution of input component `<in>` to output component `<out>`.

/// Luma-to-luma term (exactly 1.0); kept for documentation, never multiplied.
#[allow(dead_code)]
const M_Y_Y: i32 = 32768;
// NOTE(review): the two luma terms below truncate toward zero while the chroma
// terms use `.round()`; for `M_Y_CR` the two differ by one LSB. Confirm the
// truncation is intentional before changing it.
/// Cb contribution to the luma correction.
const M_Y_CB: i32 = (-0.11554975_f64 * 32768.0) as i32;
/// Cr contribution to the luma correction.
const M_Y_CR: i32 = (-0.20793764_f64 * 32768.0) as i32;
/// Cb contribution to the new Cb.
const M_CB_CB: i32 = (1.01863972_f64 * 32768.0).round() as i32;
/// Cr contribution to the new Cb.
const M_CB_CR: i32 = (0.11461795_f64 * 32768.0).round() as i32;
/// Cb contribution to the new Cr.
const M_CR_CB: i32 = (0.07504945_f64 * 32768.0).round() as i32;
/// Cr contribution to the new Cr.
const M_CR_CR: i32 = (1.02532707_f64 * 32768.0).round() as i32;
/// Saturates an 8-bit luma value to the 16..=235 range.
#[inline(always)]
fn clamp_y(v: i32) -> u8 {
    if v < 16 {
        16
    } else if v > 235 {
        235
    } else {
        v as u8
    }
}
/// Saturates an 8-bit chroma value to the 16..=240 range.
#[inline(always)]
fn clamp_c(v: i32) -> u8 {
    if v < 16 {
        16
    } else if v > 240 {
        240
    } else {
        v as u8
    }
}
/// Re-matrixes 8-bit 4:2:0 planes from BT.601 to BT.709 in place (portable path).
///
/// The luma pass runs first because it needs the *original* chroma values: each
/// luma sample receives a correction derived from the chroma sample covering
/// it. The chroma planes are then transformed pairwise.
///
/// Debug builds assert that `y` holds `width * height` samples and that `cb`
/// and `cr` each hold `(width / 2) * (height / 2)` samples.
fn bt601_to_bt709_scalar(y: &mut [u8], cb: &mut [u8], cr: &mut [u8], width: usize, height: usize) {
    debug_assert_eq!(y.len(), width * height);
    debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
    debug_assert_eq!(cr.len(), (width / 2) * (height / 2));

    let chroma_stride = width / 2;
    for row in 0..height {
        let chroma_row_start = (row >> 1) * chroma_stride;
        let luma_row_start = row * width;
        for col in 0..width {
            let chroma_idx = chroma_row_start + (col >> 1);
            let cb_centered = i32::from(cb[chroma_idx]) - 128;
            let cr_centered = i32::from(cr[chroma_idx]) - 128;
            let luma = &mut y[luma_row_start + col];
            let correction = (M_Y_CB * cb_centered + M_Y_CR * cr_centered + Q15_ROUND) >> Q15;
            *luma = clamp_y(i32::from(*luma) + correction);
        }
    }

    for (cb_px, cr_px) in cb.iter_mut().zip(cr.iter_mut()) {
        let cb_centered = i32::from(*cb_px) - 128;
        let cr_centered = i32::from(*cr_px) - 128;
        let cb_out = (M_CB_CB * cb_centered + M_CB_CR * cr_centered + Q15_ROUND) >> Q15;
        let cr_out = (M_CR_CB * cb_centered + M_CR_CR * cr_centered + Q15_ROUND) >> Q15;
        *cb_px = clamp_c(cb_out + 128);
        *cr_px = clamp_c(cr_out + 128);
    }
}
/// Public entry point to the portable (non-SIMD) 8-bit BT.601 → BT.709
/// conversion; forwards all arguments unchanged to `bt601_to_bt709_scalar`.
///
/// The planes are modified in place.
pub fn bt601_to_bt709_planes_scalar(
y: &mut [u8],
cb: &mut [u8],
cr: &mut [u8],
width: usize,
height: usize,
) {
bt601_to_bt709_scalar(y, cb, cr, width, height);
}
/// Re-matrixes 8-bit 4:2:0 planes from BT.601 to BT.709 in place, using the
/// AVX2 kernel when the CPU supports it and the scalar path otherwise.
///
/// `y` is expected to hold `width * height` samples and `cb` / `cr`
/// `(width / 2) * (height / 2)` samples each.
///
/// The AVX2 kernel accesses the planes through raw pointers without bounds
/// checks, so it is only selected when the slice lengths provably cover every
/// access it performs. Inputs that fail that test are routed to the scalar
/// path, which indexes with bounds checks (and therefore panics instead of
/// touching memory outside the slices).
pub fn bt601_to_bt709_planes(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let chroma_w = width / 2;
        let chroma_h = height / 2;
        // The kernel walks `chroma_h` row pairs of `width` luma samples and
        // `chroma_h` rows of `chroma_w` chroma samples, then sweeps the whole
        // of `cb` while reading `cr` at the same indices.
        let luma_covered = (2 * chroma_h)
            .checked_mul(width)
            .is_some_and(|needed| y.len() >= needed);
        let chroma_covered = chroma_w
            .checked_mul(chroma_h)
            .is_some_and(|needed| cb.len() >= needed && cr.len() >= needed);
        let planes_fit = luma_covered && chroma_covered && cr.len() >= cb.len();

        if planes_fit && std::is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 availability was just detected at runtime, and
            // `planes_fit` guarantees every unchecked load/store in the kernel
            // stays inside `y`, `cb` and `cr`.
            unsafe {
                bt601_to_bt709_avx2(y, cb, cr, width, height);
            }
            return;
        }
    }
    bt601_to_bt709_scalar(y, cb, cr, width, height);
}
/// AVX2 implementation of the in-place 8-bit BT.601 → BT.709 conversion.
///
/// Pass 1 corrects luma: for every chroma row, 16 chroma samples at a time are
/// turned into a luma delta that is applied to the 32 luma samples they cover
/// on each of the two corresponding luma rows. Pass 2 transforms the chroma
/// planes 16 samples at a time. Both passes finish with scalar tails.
///
/// Only `2 * (height / 2)` luma rows and `2 * (width / 2)` luma columns are
/// visited, so a trailing odd row/column is left untouched.
///
/// The vector code rounds each product separately (`_mm256_mulhrs_epi16`)
/// before adding, whereas the scalar tails round the sum once, so results may
/// differ by an LSB between the two.
///
/// # Safety
/// * The CPU must support AVX2.
/// * `y` must hold at least `2 * (height / 2) * width` samples.
/// * `cb` and `cr` must each hold at least `(width / 2) * (height / 2)`
///   samples, and `cr` must be at least as long as `cb`.
///
/// The vector loads and stores go through raw pointers and are not bounds
/// checked.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_avx2(
y: &mut [u8],
cb: &mut [u8],
cr: &mut [u8],
width: usize,
height: usize,
) {
unsafe {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
let cw = width / 2;
let ch = height / 2;
// Broadcast matrix coefficients. The two diagonal chroma terms exceed
// i16::MAX, so only their excess over 1.0 (32768) is stored; the identity
// part is added back explicitly in the chroma pass.
let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16); let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16); let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16); let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16); let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16); let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16);
let v_128 = _mm256_set1_epi16(128);
let v_chroma_lo = _mm256_set1_epi16(16);
let v_chroma_hi = _mm256_set1_epi16(240);
let v_luma_lo = _mm256_set1_epi16(16);
let v_luma_hi = _mm256_set1_epi16(235);
// Pass 1: luma correction. Must run before the chroma planes are rewritten,
// because it reads the original chroma values.
for cy_idx in 0..ch {
let y_row0 = cy_idx * 2 * width;
let y_row1 = y_row0 + width;
let c_row = cy_idx * cw;
let mut cx = 0usize;
while cx + 16 <= cw {
// Load 16 chroma bytes per plane and widen to 16-bit lanes.
let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(c_row + cx) as *const _);
let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(c_row + cx) as *const _);
let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
let cbl = _mm256_sub_epi16(cb_i16, v_128);
let crl = _mm256_sub_epi16(cr_i16, v_128);
let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);
// Duplicate each of the 16 deltas so it lines up with the two luma
// columns it covers; done through a stack buffer rather than a shuffle.
let mut dy_luma = [0i16; 32];
_mm256_storeu_si256(dy_luma.as_mut_ptr().add(0) as *mut _, dy_chroma);
let mut dy_luma_pair = [0i16; 32];
for i in 0..16 {
dy_luma_pair[i * 2] = dy_luma[i];
dy_luma_pair[i * 2 + 1] = dy_luma[i];
}
let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(0) as *const _);
let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);
for row_off in [y_row0, y_row1] {
// 32 luma bytes -> two vectors of 16-bit lanes.
let y_u8 = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
let y_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_u8));
let y_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(y_u8));
let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);
let y_lo_out =
_mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
let y_hi_out =
_mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);
// packus works per 128-bit lane, so the 64-bit quarters come out as
// lo0, hi0, lo1, hi1; the permute restores lo0, lo1, hi0, hi1.
let packed = _mm256_packus_epi16(y_lo_out, y_hi_out);
let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
_mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, packed);
}
cx += 16;
}
// Scalar tail for the remaining chroma columns of this row pair.
while cx < cw {
let cb_idx = c_row + cx;
let cbl = cb[cb_idx] as i32 - 128;
let crl = cr[cb_idx] as i32 - 128;
let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
let xi = cx * 2;
for row_off in [y_row0, y_row1] {
for sub in 0..2 {
let idx = row_off + xi + sub;
y[idx] = clamp_y(y[idx] as i32 + delta);
}
}
cx += 1;
}
}
// Pass 2: chroma transform over the whole of `cb` (and `cr` at the same
// indices), 16 samples per iteration.
let total_c = cb.len();
let mut i = 0usize;
while i + 16 <= total_c {
let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(i) as *const _);
let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(i) as *const _);
let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
let cbl = _mm256_sub_epi16(cb_i16, v_128);
let crl = _mm256_sub_epi16(cr_i16, v_128);
// new = x + x * (m_diag - 1.0) + other * m_cross
let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
let new_cb = _mm256_add_epi16(new_cb, v_128);
let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
let new_cr = _mm256_add_epi16(new_cr, v_128);
let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);
// Pack to bytes; the permute gathers the two useful 64-bit quarters into
// the low 128 bits, which is all that gets stored.
let cb_packed = _mm256_packus_epi16(new_cb, new_cb);
let cr_packed = _mm256_packus_epi16(new_cr, new_cr);
let cb_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cb_packed);
let cr_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cr_packed);
_mm_storeu_si128(
cb.as_mut_ptr().add(i) as *mut _,
_mm256_castsi256_si128(cb_packed),
);
_mm_storeu_si128(
cr.as_mut_ptr().add(i) as *mut _,
_mm256_castsi256_si128(cr_packed),
);
i += 16;
}
// Scalar tail for the last `total_c % 16` chroma samples.
while i < total_c {
let cbl = cb[i] as i32 - 128;
let crl = cr[i] as i32 - 128;
let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
cb[i] = clamp_c(new_cb + 128);
cr[i] = clamp_c(new_cr + 128);
i += 1;
}
}
}
/// Saturates a 10-bit luma value to the 64..=940 range.
#[inline(always)]
fn clamp_y_10bit(v: i32) -> u16 {
    if v < 64 {
        64
    } else if v > 940 {
        940
    } else {
        v as u16
    }
}
/// Saturates a 10-bit chroma value to the 64..=960 range.
#[inline(always)]
fn clamp_c_10bit(v: i32) -> u16 {
    if v < 64 {
        64
    } else if v > 960 {
        960
    } else {
        v as u16
    }
}
/// Offset subtracted from 10-bit chroma samples before the matrix is applied
/// and added back afterwards (the 10-bit counterpart of the 8-bit `128`).
const CHROMA_CENTER_10BIT: i32 = 512;
/// Re-matrixes 10-bit 4:2:0 planes from BT.601 to BT.709 in place (portable path).
///
/// The luma pass runs first because it needs the *original* chroma values: each
/// luma sample receives a correction derived from the chroma sample covering
/// it. The chroma planes are then transformed pairwise. Outputs are clamped to
/// 64..=940 (luma) and 64..=960 (chroma).
///
/// Debug builds assert that `y` holds `width * height` samples and that `cb`
/// and `cr` each hold `(width / 2) * (height / 2)` samples.
pub fn bt601_to_bt709_planes_10bit_scalar(
    y: &mut [u16],
    cb: &mut [u16],
    cr: &mut [u16],
    width: usize,
    height: usize,
) {
    debug_assert_eq!(y.len(), width * height);
    debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
    debug_assert_eq!(cr.len(), (width / 2) * (height / 2));

    let chroma_stride = width / 2;
    for row in 0..height {
        let chroma_row_start = (row >> 1) * chroma_stride;
        let luma_row_start = row * width;
        for col in 0..width {
            let chroma_idx = chroma_row_start + (col >> 1);
            let cb_centered = i32::from(cb[chroma_idx]) - CHROMA_CENTER_10BIT;
            let cr_centered = i32::from(cr[chroma_idx]) - CHROMA_CENTER_10BIT;
            let luma = &mut y[luma_row_start + col];
            let correction = (M_Y_CB * cb_centered + M_Y_CR * cr_centered + Q15_ROUND) >> Q15;
            *luma = clamp_y_10bit(i32::from(*luma) + correction);
        }
    }

    for (cb_px, cr_px) in cb.iter_mut().zip(cr.iter_mut()) {
        let cb_centered = i32::from(*cb_px) - CHROMA_CENTER_10BIT;
        let cr_centered = i32::from(*cr_px) - CHROMA_CENTER_10BIT;
        let cb_out = (M_CB_CB * cb_centered + M_CB_CR * cr_centered + Q15_ROUND) >> Q15;
        let cr_out = (M_CR_CB * cb_centered + M_CR_CR * cr_centered + Q15_ROUND) >> Q15;
        *cb_px = clamp_c_10bit(cb_out + CHROMA_CENTER_10BIT);
        *cr_px = clamp_c_10bit(cr_out + CHROMA_CENTER_10BIT);
    }
}
/// Re-matrixes 10-bit 4:2:0 planes from BT.601 to BT.709 in place, using the
/// AVX2 kernel when the CPU supports it and the scalar path otherwise.
///
/// `y` is expected to hold `width * height` samples and `cb` / `cr`
/// `(width / 2) * (height / 2)` samples each.
///
/// The AVX2 kernel accesses the planes through raw pointers without bounds
/// checks, so it is only selected when the slice lengths provably cover every
/// access it performs. Inputs that fail that test are routed to the scalar
/// path, which indexes with bounds checks (and therefore panics instead of
/// touching memory outside the slices).
pub fn bt601_to_bt709_planes_10bit(
    y: &mut [u16],
    cb: &mut [u16],
    cr: &mut [u16],
    width: usize,
    height: usize,
) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let chroma_w = width / 2;
        let chroma_h = height / 2;
        // The kernel walks `chroma_h` row pairs of `width` luma samples and
        // `chroma_h` rows of `chroma_w` chroma samples, then sweeps the whole
        // of `cb` while reading `cr` at the same indices.
        let luma_covered = (2 * chroma_h)
            .checked_mul(width)
            .is_some_and(|needed| y.len() >= needed);
        let chroma_covered = chroma_w
            .checked_mul(chroma_h)
            .is_some_and(|needed| cb.len() >= needed && cr.len() >= needed);
        let planes_fit = luma_covered && chroma_covered && cr.len() >= cb.len();

        if planes_fit && std::is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 availability was just detected at runtime, and
            // `planes_fit` guarantees every unchecked load/store in the kernel
            // stays inside `y`, `cb` and `cr`.
            unsafe {
                bt601_to_bt709_10bit_avx2(y, cb, cr, width, height);
            }
            return;
        }
    }
    bt601_to_bt709_planes_10bit_scalar(y, cb, cr, width, height);
}
/// AVX2 implementation of the in-place 10-bit BT.601 → BT.709 conversion.
///
/// Pass 1 corrects luma: for every chroma row, 16 chroma samples at a time are
/// turned into a luma delta that is applied to the 32 luma samples they cover
/// on each of the two corresponding luma rows. Pass 2 transforms the chroma
/// planes 16 samples at a time. Both passes finish with scalar tails.
///
/// Only `2 * (height / 2)` luma rows and `2 * (width / 2)` luma columns are
/// visited, so a trailing odd row/column is left untouched.
///
/// The vector code rounds each product separately (`_mm256_mulhrs_epi16`)
/// before adding, whereas the scalar tails round the sum once, so results may
/// differ by an LSB between the two.
///
/// # Safety
/// * The CPU must support AVX2.
/// * `y` must hold at least `2 * (height / 2) * width` samples.
/// * `cb` and `cr` must each hold at least `(width / 2) * (height / 2)`
///   samples, and `cr` must be at least as long as `cb`.
///
/// The vector loads and stores go through raw pointers and are not bounds
/// checked.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_10bit_avx2(
y: &mut [u16],
cb: &mut [u16],
cr: &mut [u16],
width: usize,
height: usize,
) {
unsafe {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
let cw = width / 2;
let ch = height / 2;
// Broadcast matrix coefficients. The two diagonal chroma terms exceed
// i16::MAX, so only their excess over 1.0 (32768) is stored; the identity
// part is added back explicitly in the chroma pass.
let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16);
let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16);
let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16);
let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16);
let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16);
let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16);
let v_chroma_center = _mm256_set1_epi16(CHROMA_CENTER_10BIT as i16);
let v_chroma_lo = _mm256_set1_epi16(64);
let v_chroma_hi = _mm256_set1_epi16(960);
let v_luma_lo = _mm256_set1_epi16(64);
let v_luma_hi = _mm256_set1_epi16(940);
// Pass 1: luma correction. Must run before the chroma planes are rewritten,
// because it reads the original chroma values.
for cy_idx in 0..ch {
let y_row0 = cy_idx * 2 * width;
let y_row1 = y_row0 + width;
let c_row = cy_idx * cw;
let mut cx = 0usize;
while cx + 16 <= cw {
// Samples are already 16-bit, so 16 of them fill one vector directly.
let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(c_row + cx) as *const _);
let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(c_row + cx) as *const _);
let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);
let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);
// Duplicate each of the 16 deltas so it lines up with the two luma
// columns it covers; done through a stack buffer rather than a shuffle.
let mut dy_luma = [0i16; 16];
_mm256_storeu_si256(dy_luma.as_mut_ptr() as *mut _, dy_chroma);
let mut dy_luma_pair = [0i16; 32];
for i in 0..16 {
dy_luma_pair[i * 2] = dy_luma[i];
dy_luma_pair[i * 2 + 1] = dy_luma[i];
}
let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr() as *const _);
let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);
for row_off in [y_row0, y_row1] {
// 32 luma samples per row, handled as two 16-lane vectors.
let y_lo = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
let y_hi =
_mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2 + 16) as *const _);
let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);
let y_lo_out =
_mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
let y_hi_out =
_mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);
_mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, y_lo_out);
_mm256_storeu_si256(
y.as_mut_ptr().add(row_off + cx * 2 + 16) as *mut _,
y_hi_out,
);
}
cx += 16;
}
// Scalar tail for the remaining chroma columns of this row pair.
while cx < cw {
let cb_idx = c_row + cx;
let cbl = cb[cb_idx] as i32 - CHROMA_CENTER_10BIT;
let crl = cr[cb_idx] as i32 - CHROMA_CENTER_10BIT;
let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
let xi = cx * 2;
for row_off in [y_row0, y_row1] {
for sub in 0..2 {
let idx = row_off + xi + sub;
y[idx] = clamp_y_10bit(y[idx] as i32 + delta);
}
}
cx += 1;
}
}
// Pass 2: chroma transform over the whole of `cb` (and `cr` at the same
// indices), 16 samples per iteration.
let total_c = cb.len();
let mut i = 0usize;
while i + 16 <= total_c {
let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(i) as *const _);
let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(i) as *const _);
let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);
// new = x + x * (m_diag - 1.0) + other * m_cross
let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
let new_cb = _mm256_add_epi16(new_cb, v_chroma_center);
let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
let new_cr = _mm256_add_epi16(new_cr, v_chroma_center);
let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);
_mm256_storeu_si256(cb.as_mut_ptr().add(i) as *mut _, new_cb);
_mm256_storeu_si256(cr.as_mut_ptr().add(i) as *mut _, new_cr);
i += 16;
}
// Scalar tail for the last `total_c % 16` chroma samples.
while i < total_c {
let cbl = cb[i] as i32 - CHROMA_CENTER_10BIT;
let crl = cr[i] as i32 - CHROMA_CENTER_10BIT;
let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
cb[i] = clamp_c_10bit(new_cb + CHROMA_CENTER_10BIT);
cr[i] = clamp_c_10bit(new_cr + CHROMA_CENTER_10BIT);
i += 1;
}
}
}
fn recolor_yuv420p_bt601_to_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
let w = frame.width as usize;
let h = frame.height as usize;
let y_size = w * h;
let c_size = y_size / 4;
if frame.data.len() < y_size + 2 * c_size {
bail!(
"frame data too short for yuv420p {}x{}: {} bytes",
w,
h,
frame.data.len()
);
}
if !w.is_multiple_of(2) || !h.is_multiple_of(2) {
bail!(
"BT.601→BT.709 requires even dimensions for 4:2:0 subsampling; got {}x{}",
w,
h
);
}
let mut y = frame.data[..y_size].to_vec();
let mut cb = frame.data[y_size..y_size + c_size].to_vec();
let mut cr = frame.data[y_size + c_size..y_size + 2 * c_size].to_vec();
bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
let mut out = BytesMut::with_capacity(y_size + 2 * c_size);
out.extend_from_slice(&y);
out.extend_from_slice(&cb);
out.extend_from_slice(&cr);
Ok(VideoFrame::new(
out.freeze(),
frame.width,
frame.height,
frame.format,
ColorSpace::Bt709,
frame.pts,
))
}
/// Box-filters 8-bit 4:4:4 chroma down to 4:2:0 and returns one contiguous
/// buffer: the Y plane unchanged, then the reduced Cb plane, then the reduced Cr
/// plane.
///
/// Each output chroma sample is the rounded mean of a 2x2 block; on an odd
/// width or height the last column/row is repeated to complete the block. The
/// reduced planes measure `ceil(width / 2) * ceil(height / 2)`.
///
/// Debug builds assert that all three input planes hold `width * height` samples.
pub fn downsample_chroma_444_to_420(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane size");
    debug_assert_eq!(cb.len(), width * height, "Cb plane size (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane size (4:4:4)");

    let out_w = width.div_ceil(2);
    let out_h = height.div_ceil(2);
    let mut out = Vec::with_capacity(width * height + 2 * out_w * out_h);
    out.extend_from_slice(y);

    for plane in [cb, cr] {
        for out_row in 0..out_h {
            let top = 2 * out_row * width;
            let bottom = (2 * out_row + 1).min(height - 1) * width;
            out.extend((0..out_w).map(|out_col| {
                let left = 2 * out_col;
                let right = (left + 1).min(width - 1);
                let sum = u16::from(plane[top + left])
                    + u16::from(plane[top + right])
                    + u16::from(plane[bottom + left])
                    + u16::from(plane[bottom + right])
                    + 2;
                (sum >> 2) as u8
            }));
        }
    }
    out
}
/// Box-filters 16-bit-per-sample 4:4:4 chroma down to 4:2:0 and returns one
/// contiguous little-endian byte buffer: the Y plane unchanged, then the reduced
/// Cb plane, then the reduced Cr plane.
///
/// Each output chroma sample is the rounded mean of a 2x2 block; on an odd
/// width or height the last column/row is repeated to complete the block. The
/// reduced planes measure `ceil(width / 2) * ceil(height / 2)` samples.
///
/// Debug builds assert that all three input planes hold `width * height` samples.
pub fn downsample_chroma_444_to_420_10bit(
    y: &[u16],
    cb: &[u16],
    cr: &[u16],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane samples");
    debug_assert_eq!(cb.len(), width * height, "Cb plane samples (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane samples (4:4:4)");

    let out_w = width.div_ceil(2);
    let out_h = height.div_ceil(2);
    let total_samples = width * height + 2 * out_w * out_h;
    let mut out = Vec::with_capacity(total_samples * 2);
    out.extend(y.iter().flat_map(|sample| sample.to_le_bytes()));

    for plane in [cb, cr] {
        for out_row in 0..out_h {
            let top = 2 * out_row * width;
            let bottom = (2 * out_row + 1).min(height - 1) * width;
            for out_col in 0..out_w {
                let left = 2 * out_col;
                let right = (left + 1).min(width - 1);
                let sum = u32::from(plane[top + left])
                    + u32::from(plane[top + right])
                    + u32::from(plane[bottom + left])
                    + u32::from(plane[bottom + right])
                    + 2;
                out.extend_from_slice(&((sum >> 2) as u16).to_le_bytes());
            }
        }
    }
    out
}
/// Downsamples a 4:4:4 planar frame to 4:2:0.
///
/// * `Yuv444p` becomes `Yuv420p`.
/// * `Yuv444p10le` and `Yuva444p10le` become `Yuv420p10le`; for the latter the
///   alpha plane is dropped and a warning is logged for every frame.
///
/// Colour space and timestamp are carried over unchanged.
///
/// # Errors
/// Fails for zero-sized frames, for buffers shorter than the planes the format
/// implies (including the alpha plane for `Yuva444p10le`), and for any other
/// pixel format.
pub fn downsample_444_to_420_frame(frame: &VideoFrame) -> Result<VideoFrame> {
    let w = frame.width as usize;
    let h = frame.height as usize;
    if w == 0 || h == 0 {
        bail!("zero-dimension frame");
    }

    let (payload, out_format) = match frame.format {
        PixelFormat::Yuv444p => {
            let plane_len = w * h;
            if frame.data.len() < 3 * plane_len {
                bail!(
                    "Yuv444p frame data too short for {}x{}: {} bytes",
                    w,
                    h,
                    frame.data.len()
                );
            }
            let (y, chroma) = frame.data[..3 * plane_len].split_at(plane_len);
            let (cb, cr) = chroma.split_at(plane_len);
            (
                downsample_chroma_444_to_420(y, cb, cr, w, h),
                PixelFormat::Yuv420p,
            )
        }
        PixelFormat::Yuv444p10le | PixelFormat::Yuva444p10le => {
            let plane_samples = w * h;
            let has_alpha = frame.format == PixelFormat::Yuva444p10le;
            let needed = if has_alpha {
                4 * plane_samples * 2
            } else {
                3 * plane_samples * 2
            };
            if frame.data.len() < needed {
                bail!(
                    "{:?} frame data too short for {}x{}: {} bytes (need {})",
                    frame.format,
                    w,
                    h,
                    frame.data.len(),
                    needed
                );
            }

            // Only the first three planes are decoded; a fourth (alpha) plane,
            // when present, is never read.
            let plane_bytes = plane_samples * 2;
            let y = read_u16le(&frame.data[..plane_bytes]);
            let cb = read_u16le(&frame.data[plane_bytes..2 * plane_bytes]);
            let cr = read_u16le(&frame.data[2 * plane_bytes..3 * plane_bytes]);
            if has_alpha {
                tracing::warn!(
                    pts = frame.pts,
                    "dropping alpha plane on 4:4:4→4:2:0 downsample (rav1e 0.7 has no alpha; pipeline target is Yuv420p10le)"
                );
            }
            (
                downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h),
                PixelFormat::Yuv420p10le,
            )
        }
        other => bail!(
            "downsample_444_to_420_frame: expected 4:4:4 input, got {:?}",
            other
        ),
    };

    Ok(VideoFrame::new(
        Bytes::from(payload),
        frame.width,
        frame.height,
        out_format,
        frame.color_space,
        frame.pts,
    ))
}
pub fn scale_frame(
frame: &VideoFrame,
target_width: u32,
target_height: u32,
) -> Result<VideoFrame> {
if frame.width == target_width && frame.height == target_height {
return Ok(frame.clone());
}
match frame.format {
PixelFormat::Yuv420p => scale_frame_8bit(frame, target_width, target_height),
PixelFormat::Yuv420p10le => scale_frame_10bit(frame, target_width, target_height),
_ => bail!(
"scaling only implemented for Yuv420p / Yuv420p10le; got {:?}",
frame.format
),
}
}
fn scale_frame_8bit(
frame: &VideoFrame,
target_width: u32,
target_height: u32,
) -> Result<VideoFrame> {
let src_w = frame.width as usize;
let src_h = frame.height as usize;
let dst_w = target_width as usize;
let dst_h = target_height as usize;
let src_y_size = src_w * src_h;
let dst_y_size = dst_w * dst_h;
let dst_uv_size = dst_y_size / 4;
let mut out = BytesMut::with_capacity(dst_y_size + dst_uv_size * 2);
let y_plane = &frame.data[..src_y_size];
out.extend(bilinear_scale_plane(y_plane, src_w, src_h, dst_w, dst_h));
let u_offset = src_y_size;
let u_plane = &frame.data[u_offset..u_offset + src_y_size / 4];
out.extend(bilinear_scale_plane(
u_plane,
src_w / 2,
src_h / 2,
dst_w / 2,
dst_h / 2,
));
let v_offset = u_offset + src_y_size / 4;
let v_plane = &frame.data[v_offset..v_offset + src_y_size / 4];
out.extend(bilinear_scale_plane(
v_plane,
src_w / 2,
src_h / 2,
dst_w / 2,
dst_h / 2,
));
Ok(VideoFrame::new(
out.freeze(),
target_width,
target_height,
frame.format,
frame.color_space,
frame.pts,
))
}
/// Bilinearly resizes a 4:2:0 planar frame stored as little-endian 16-bit samples.
///
/// The Y plane is scaled from `width x height` to the target size; the U and V
/// planes are scaled from `(width / 2) x (height / 2)` to
/// `(target_width / 2) x (target_height / 2)`.
///
/// # Errors
/// Fails when `frame.data` is too short to contain the three source planes.
fn scale_frame_10bit(
    frame: &VideoFrame,
    target_width: u32,
    target_height: u32,
) -> Result<VideoFrame> {
    const BYTES_PER_SAMPLE: usize = 2;

    let (src_w, src_h) = (frame.width as usize, frame.height as usize);
    let (dst_w, dst_h) = (target_width as usize, target_height as usize);
    let (src_cw, src_ch) = (src_w / 2, src_h / 2);
    let (dst_cw, dst_ch) = (dst_w / 2, dst_h / 2);

    let luma_bytes = src_w * src_h * BYTES_PER_SAMPLE;
    let chroma_bytes = src_cw * src_ch * BYTES_PER_SAMPLE;
    if frame.data.len() < luma_bytes + 2 * chroma_bytes {
        bail!(
            "10-bit frame data too short for {}x{}: {} bytes",
            src_w,
            src_h,
            frame.data.len()
        );
    }
    let dst_total_bytes = (dst_w * dst_h + 2 * dst_cw * dst_ch) * BYTES_PER_SAMPLE;

    // Carve the three planes out of the buffer and decode them to samples.
    let (luma_raw, after_luma) = frame.data.split_at(luma_bytes);
    let (u_raw, after_u) = after_luma.split_at(chroma_bytes);
    let v_raw = &after_u[..chroma_bytes];
    let y_src = read_u16le(luma_raw);
    let u_src = read_u16le(u_raw);
    let v_src = read_u16le(v_raw);

    let scale_chroma =
        |plane: &[u16]| bilinear_scale_plane_u16(plane, src_cw, src_ch, dst_cw, dst_ch);
    let y_dst = bilinear_scale_plane_u16(&y_src, src_w, src_h, dst_w, dst_h);
    let u_dst = scale_chroma(&u_src);
    let v_dst = scale_chroma(&v_src);

    let mut out = BytesMut::with_capacity(dst_total_bytes);
    for plane in [&y_dst, &u_dst, &v_dst] {
        write_u16le(&mut out, plane);
    }

    Ok(VideoFrame::new(
        out.freeze(),
        target_width,
        target_height,
        frame.format,
        frame.color_space,
        frame.pts,
    ))
}
/// Decodes a byte slice as consecutive little-endian `u16` values.
///
/// A trailing odd byte, if any, is ignored.
fn read_u16le(bytes: &[u8]) -> Vec<u16> {
    let mut samples = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        samples.push(u16::from(pair[0]) | (u16::from(pair[1]) << 8));
    }
    samples
}
/// Appends every sample to `out` as two little-endian bytes.
fn write_u16le(out: &mut BytesMut, samples: &[u16]) {
    samples
        .iter()
        .for_each(|sample| out.extend_from_slice(&sample.to_le_bytes()));
}
/// Portable bilinear resampler for a single plane of 16-bit samples.
///
/// Destination pixel `(dx, dy)` samples the source at
/// `(dx * src_w / dst_w, dy * src_h / dst_h)`, clamped to the last source
/// row/column, and blends the four surrounding samples in `f64`. Results are
/// rounded and clamped to 0..=1023.
///
/// Returns a `dst_w * dst_h` vector in row-major order.
pub fn bilinear_scale_plane_u16_scalar(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    let mut dst = vec![0u16; dst_w * dst_h];
    let x_ratio = src_w as f64 / dst_w as f64;
    let y_ratio = src_h as f64 / dst_h as f64;

    for out_row in 0..dst_h {
        let src_y = (out_row as f64 * y_ratio).min((src_h - 1) as f64);
        let row_above = src_y as usize;
        let row_below = (row_above + 1).min(src_h - 1);
        let weight_below = src_y - row_above as f64;
        let weight_above = 1.0 - weight_below;

        for out_col in 0..dst_w {
            let src_x = (out_col as f64 * x_ratio).min((src_w - 1) as f64);
            let col_left = src_x as usize;
            let col_right = (col_left + 1).min(src_w - 1);
            let weight_right = src_x - col_left as f64;
            let weight_left = 1.0 - weight_right;

            let top_left = f64::from(src[row_above * src_w + col_left]);
            let top_right = f64::from(src[row_above * src_w + col_right]);
            let bottom_left = f64::from(src[row_below * src_w + col_left]);
            let bottom_right = f64::from(src[row_below * src_w + col_right]);

            // Term order is kept fixed so floating-point results stay bit-stable.
            let blended = top_left * weight_left * weight_above
                + top_right * weight_right * weight_above
                + bottom_left * weight_left * weight_below
                + bottom_right * weight_right * weight_below;
            dst[out_row * dst_w + out_col] = blended.round().clamp(0.0, 1023.0) as u16;
        }
    }
    dst
}
/// Bilinearly resamples one plane of 16-bit samples, using the AVX2 kernel when
/// it is available and applicable and the scalar implementation otherwise.
///
/// The AVX2 kernel is selected only when all of the following hold:
/// * the CPU reports AVX2 and the destination is at least 16 samples wide;
/// * `dst_h` is non-zero (the kernel divides by it, whereas the scalar path
///   simply returns an empty plane);
/// * the source dimensions are non-zero and `src` holds at least
///   `src_w * src_h` samples (the kernel reads `src` without bounds checks).
///
/// Anything else goes through the bounds-checked scalar path.
pub fn bilinear_scale_plane_u16(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let src_covers_dims = src_w > 0
            && src_h > 0
            && src_w
                .checked_mul(src_h)
                .is_some_and(|needed| src.len() >= needed);
        if dst_w >= 16
            && dst_h > 0
            && src_covers_dims
            && std::is_x86_feature_detected!("avx2")
        {
            // SAFETY: AVX2 was detected at runtime. The kernel clamps every
            // source coordinate to `src_w - 1` / `src_h - 1`, so its largest
            // unchecked index is `src_w * src_h - 1`, which `src_covers_dims`
            // guarantees is in bounds; `dst_w` and `dst_h` are non-zero, so its
            // step divisions cannot fault.
            return unsafe { bilinear_scale_plane_u16_avx2(src, src_w, src_h, dst_w, dst_h) };
        }
    }
    bilinear_scale_plane_u16_scalar(src, src_w, src_h, dst_w, dst_h)
}
/// AVX2 bilinear resampler for one plane of 16-bit samples.
///
/// Source coordinates are stepped in 32.32 fixed point. Column indices and
/// horizontal weights are precomputed once per call; each row then processes
/// 16 destination samples per iteration, gathering the four neighbours with
/// scalar loads into stack buffers and blending them with Q15 multiplies
/// (`_mm256_mulhrs_epi16`). The complement weight is formed as `32767 - f`.
/// The last `dst_w % 16` samples of every row are computed in `f64` from the
/// same quantised weights. Results are clamped to 0..=1023.
///
/// Because the weights are quantised and each multiply rounds separately, the
/// output is not guaranteed to match the scalar resampler bit for bit.
///
/// # Safety
/// * The CPU must support AVX2.
/// * `src_w` and `src_h` must be non-zero and `src` must hold at least
///   `src_w * src_h` samples; neighbours are read with `get_unchecked`.
/// * `dst_w` and `dst_h` must be non-zero (both are used as divisors).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_u16_avx2(
src: &[u16],
src_w: usize,
src_h: usize,
dst_w: usize,
dst_h: usize,
) -> Vec<u16> {
unsafe {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
let mut dst = vec![0u16; dst_w * dst_h];
// Source advance per destination sample, in 32.32 fixed point.
let x_step = ((src_w as u64) << 32) / (dst_w as u64);
let y_step = ((src_h as u64) << 32) / (dst_h as u64);
// Per-column lookup tables: left/right source column and Q15 weights.
let mut x0s: Vec<u32> = vec![0; dst_w];
let mut x1s: Vec<u32> = vec![0; dst_w];
let mut fxs_q15: Vec<i16> = vec![0; dst_w];
let mut one_minus_fxs_q15: Vec<i16> = vec![0; dst_w];
for dx in 0..dst_w {
let sx_32_32 = (dx as u64) * x_step;
let x0_full = (sx_32_32 >> 32) as usize;
let x0 = x0_full.min(src_w - 1);
// Top 16 fractional bits, then halved to fit a non-negative i16.
let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u32;
let fx_q15 = ((fx_q16 as i32) >> 1).min(32767) as i16;
if x0 >= src_w - 1 {
// On the last column both taps point at it and the left tap gets
// all of the weight.
x0s[dx] = (src_w - 1) as u32;
x1s[dx] = (src_w - 1) as u32;
fxs_q15[dx] = 0;
one_minus_fxs_q15[dx] = 32767;
} else {
x0s[dx] = x0 as u32;
x1s[dx] = (x0 + 1) as u32;
fxs_q15[dx] = fx_q15;
one_minus_fxs_q15[dx] = 32767 - fx_q15;
}
}
let v_max = _mm256_set1_epi16(1023);
let v_zero = _mm256_setzero_si256();
for dy in 0..dst_h {
let sy_32_32 = (dy as u64) * y_step;
let y0_full = (sy_32_32 >> 32) as usize;
let y0 = y0_full.min(src_h - 1);
let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
let y1 = (y0 + 1).min(src_h - 1);
let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
let one_minus_fy_q15 = 32767i16 - fy_q15;
let row0 = y0 * src_w;
let row1 = y1 * src_w;
let dst_row = dy * dst_w;
let v_fy = _mm256_set1_epi16(fy_q15);
let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);
let mut dx = 0usize;
while dx + 16 <= dst_w {
// Gather the four neighbours of 16 destination samples with scalar
// loads; indices come from the clamped tables built above.
let mut p00_buf = [0u16; 16];
let mut p10_buf = [0u16; 16];
let mut p01_buf = [0u16; 16];
let mut p11_buf = [0u16; 16];
for i in 0..16 {
let x0 = x0s[dx + i] as usize;
let x1 = x1s[dx + i] as usize;
p00_buf[i] = *src.get_unchecked(row0 + x0);
p10_buf[i] = *src.get_unchecked(row0 + x1);
p01_buf[i] = *src.get_unchecked(row1 + x0);
p11_buf[i] = *src.get_unchecked(row1 + x1);
}
let p00 = _mm256_loadu_si256(p00_buf.as_ptr() as *const _);
let p10 = _mm256_loadu_si256(p10_buf.as_ptr() as *const _);
let p01 = _mm256_loadu_si256(p01_buf.as_ptr() as *const _);
let p11 = _mm256_loadu_si256(p11_buf.as_ptr() as *const _);
let v_fx = _mm256_loadu_si256(fxs_q15.as_ptr().add(dx) as *const _);
let v_one_minus_fx =
_mm256_loadu_si256(one_minus_fxs_q15.as_ptr().add(dx) as *const _);
// Horizontal blend of the upper and lower rows, then vertical blend.
let top = _mm256_add_epi16(
_mm256_mulhrs_epi16(p00, v_one_minus_fx),
_mm256_mulhrs_epi16(p10, v_fx),
);
let bottom = _mm256_add_epi16(
_mm256_mulhrs_epi16(p01, v_one_minus_fx),
_mm256_mulhrs_epi16(p11, v_fx),
);
let out_i16 = _mm256_add_epi16(
_mm256_mulhrs_epi16(top, v_one_minus_fy),
_mm256_mulhrs_epi16(bottom, v_fy),
);
let clamped = _mm256_min_epi16(_mm256_max_epi16(out_i16, v_zero), v_max);
_mm256_storeu_si256(dst.as_mut_ptr().add(dst_row + dx) as *mut _, clamped);
dx += 16;
}
// Row tail: floating-point blend using the same quantised weights.
while dx < dst_w {
let x0 = x0s[dx] as usize;
let x1 = x1s[dx] as usize;
let fx = fxs_q15[dx] as f64 / 32768.0;
let fy = fy_q15 as f64 / 32768.0;
let p00 = src[row0 + x0] as f64;
let p10 = src[row0 + x1] as f64;
let p01 = src[row1 + x0] as f64;
let p11 = src[row1 + x1] as f64;
let val = p00 * (1.0 - fx) * (1.0 - fy)
+ p10 * fx * (1.0 - fy)
+ p01 * (1.0 - fx) * fy
+ p11 * fx * fy;
dst[dst_row + dx] = val.round().clamp(0.0, 1023.0) as u16;
dx += 1;
}
}
dst
}
}
/// Resizes a tightly packed 8-bit plane with bilinear interpolation.
///
/// `src` is read as `src_h` rows of `src_w` samples and the result holds
/// `dst_h` rows of `dst_w` samples. On x86/x86_64 the AVX2 kernel is used when
/// the CPU reports AVX2 at run time, the output is at least 16 samples wide
/// and the inputs satisfy the kernel's preconditions; every other case goes
/// through [`bilinear_scale_plane_scalar`].
pub fn bilinear_scale_plane(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // The AVX2 kernel divides by the destination dimensions and reads
        // `src` without bounds checks. Degenerate or undersized inputs are
        // therefore routed to the scalar path, which uses checked indexing.
        let src_covers_plane = matches!(
            src_w.checked_mul(src_h),
            Some(needed) if needed > 0 && src.len() >= needed
        );
        if dst_w >= 16
            && dst_h > 0
            && src_covers_plane
            && std::is_x86_feature_detected!("avx2")
        {
            // SAFETY: AVX2 support was just detected at run time, and `src`
            // holds at least `src_w * src_h` samples with both dimensions
            // non-zero, so every index the kernel derives from them is in
            // bounds.
            return unsafe { bilinear_scale_plane_avx2(src, src_w, src_h, dst_w, dst_h) };
        }
    }
    bilinear_scale_plane_scalar(src, src_w, src_h, dst_w, dst_h)
}
/// Portable bilinear resize of a tightly packed 8-bit plane.
///
/// Output sample `(dx, dy)` is taken at source position
/// `(dx * src_w / dst_w, dy * src_h / dst_h)`, clamped to the last source
/// column/row, and blended from the four surrounding samples in `f64`.
///
/// Returns `dst_w * dst_h` samples in row-major order; an empty vector when
/// either destination dimension is zero.
///
/// # Panics
///
/// Panics if the output is non-empty and the source plane is empty, or if a
/// sample that has to be read lies outside `src`.
pub fn bilinear_scale_plane_scalar(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    let mut dst = vec![0u8; dst_w * dst_h];
    if dst.is_empty() {
        return dst;
    }
    // Without this check `src_w - 1` would wrap in release builds and the
    // loop below would read unrelated samples instead of failing.
    assert!(
        src_w > 0 && src_h > 0,
        "bilinear_scale_plane_scalar: source plane is empty"
    );
    let last_x = src_w - 1;
    let last_y = src_h - 1;
    let x_ratio = src_w as f64 / dst_w as f64;
    let y_ratio = src_h as f64 / dst_h as f64;

    // Horizontal taps depend only on the output column, so compute them once
    // instead of once per output row: (left column, right column, weight).
    let columns: Vec<(usize, usize, f64)> = (0..dst_w)
        .map(|dx| {
            let sx = (dx as f64 * x_ratio).min(last_x as f64);
            let x0 = sx as usize;
            (x0, (x0 + 1).min(last_x), sx - x0 as f64)
        })
        .collect();

    for (dy, out_row) in dst.chunks_exact_mut(dst_w).enumerate() {
        let sy = (dy as f64 * y_ratio).min(last_y as f64);
        let y0 = sy as usize;
        let y1 = (y0 + 1).min(last_y);
        let fy = sy - y0 as f64;
        let row0 = y0 * src_w;
        let row1 = y1 * src_w;
        for (out, &(x0, x1, fx)) in out_row.iter_mut().zip(&columns) {
            let p00 = src[row0 + x0] as f64;
            let p10 = src[row0 + x1] as f64;
            let p01 = src[row1 + x0] as f64;
            let p11 = src[row1 + x1] as f64;
            let val = p00 * (1.0 - fx) * (1.0 - fy)
                + p10 * fx * (1.0 - fy)
                + p01 * (1.0 - fx) * fy
                + p11 * fx * fy;
            *out = val.round() as u8;
        }
    }
    dst
}
/// AVX2 bilinear resize of a tightly packed 8-bit plane.
///
/// Source positions are stepped in 32.32 fixed point and blend weights are
/// held as 15-bit fractions. Sixteen output samples are produced per
/// iteration: the four taps are gathered with scalar loads, widened and
/// shifted left by 7, blended with `_mm256_mulhrs_epi16`, then rounded and
/// shifted back down to 8 bits. The `dst_w % 16` columns left over in each
/// row are computed with the floating-point formula.
///
/// Returns `dst_w * dst_h` samples in row-major order; an empty vector when
/// either destination dimension is zero.
///
/// # Panics
///
/// Panics if the output is non-empty and either source dimension is zero or
/// `src` holds fewer than `src_w * src_h` samples.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_avx2(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut dst = vec![0u8; dst_w * dst_h];
    if dst.is_empty() {
        // Nothing to write; this also keeps the step computation below from
        // dividing by a zero destination dimension.
        return dst;
    }
    // The unchecked reads below rely on both of these.
    assert!(
        src_w > 0 && src_h > 0,
        "bilinear_scale_plane_avx2: source plane is empty"
    );
    assert!(
        matches!(src_w.checked_mul(src_h), Some(needed) if src.len() >= needed),
        "bilinear_scale_plane_avx2: src holds {} samples, need {}x{}",
        src.len(),
        src_w,
        src_h
    );
    let last_x = src_w - 1;
    let last_y = src_h - 1;

    // 32.32 fixed-point source advance per output sample.
    let x_step = ((src_w as u64) << 32) / (dst_w as u64);
    let y_step = ((src_h as u64) << 32) / (dst_h as u64);

    // Per-column tables: left tap index, 16-bit fraction (used by the tail
    // loop) and the 15-bit weights consumed by the vector loop.
    let mut x0s: Vec<u32> = vec![0; dst_w];
    let mut fxs: Vec<u16> = vec![0; dst_w];
    let mut fx_q15: Vec<i16> = vec![0; dst_w];
    let mut one_minus_fx_q15: Vec<i16> = vec![0; dst_w];
    for dx in 0..dst_w {
        let sx_32_32 = (dx as u64) * x_step;
        let x0 = ((sx_32_32 >> 32) as usize).min(last_x);
        if x0 < last_x {
            x0s[dx] = x0 as u32;
            fxs[dx] = ((sx_32_32 >> 16) & 0xFFFF) as u16;
        } else {
            // On the last column both taps coincide, so drop the fraction.
            x0s[dx] = last_x as u32;
            fxs[dx] = 0;
        }
        let q15 = (fxs[dx] as i32 >> 1).min(32767) as i16;
        fx_q15[dx] = q15;
        one_minus_fx_q15[dx] = 32767 - q15;
    }

    // SAFETY: the caller guarantees AVX2. Every source index is
    // `row + x` with `row <= last_y * src_w` and `x <= last_x`, hence below
    // `src_w * src_h <= src.len()` (asserted above). Vector loads from the
    // weight tables and stores into `dst` cover `dx..dx + 16` with
    // `dx + 16 <= dst_w`, which stays inside the tables (length `dst_w`) and
    // inside row `dy` of `dst`.
    unsafe {
        let v_round = _mm256_set1_epi16(64);
        for dy in 0..dst_h {
            let sy_32_32 = (dy as u64) * y_step;
            let y0 = ((sy_32_32 >> 32) as usize).min(last_y);
            let y1 = (y0 + 1).min(last_y);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;
            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;
            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // Gather the four taps for 16 consecutive output columns.
                let mut p00_buf = [0u8; 16];
                let mut p10_buf = [0u8; 16];
                let mut p01_buf = [0u8; 16];
                let mut p11_buf = [0u8; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = (x0 + 1).min(last_x);
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }
                // Widen to 16 bits and scale by 128 so the fixed-point
                // multiplies keep fractional precision.
                let p00 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p00_buf.as_ptr() as *const _,
                )));
                let p10 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p10_buf.as_ptr() as *const _,
                )));
                let p01 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p01_buf.as_ptr() as *const _,
                )));
                let p11 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p11_buf.as_ptr() as *const _,
                )));
                let v_fx = _mm256_loadu_si256(fx_q15.as_ptr().add(dx) as *const _);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fx_q15.as_ptr().add(dx) as *const _);
                // Horizontal blend of the top and bottom row, then vertical.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );
                let out_q7 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );
                // Round to nearest and drop the 7 fractional bits.
                let shifted = _mm256_srai_epi16::<7>(_mm256_add_epi16(out_q7, v_round));
                // `packus` works per 128-bit lane; the permute pulls the two
                // useful 64-bit quarters into the low half.
                let packed = _mm256_packus_epi16(shifted, shifted);
                let packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(packed);
                _mm_storeu_si128(
                    dst.as_mut_ptr().add(dst_row + dx) as *mut _,
                    _mm256_castsi256_si128(packed),
                );
                dx += 16;
            }

            // Remaining columns: floating-point blend with 16-bit fractions.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = (x0 + 1).min(last_x);
                let fx = fxs[dx] as f64 / 65536.0;
                let fy = fy_q16 as f64 / 65536.0;
                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;
                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round() as u8;
                dx += 1;
            }
        }
    }
    dst
}
#[cfg(test)]
mod tests {
use super::*;
/// Builds deterministic 8-bit Y, Cb and Cr planes for a `w`x`h` frame with
/// half-resolution chroma. Luma values fall in 16..=235 and chroma values in
/// 16..=240.
fn synth_601_frame(w: usize, h: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    // Sample `idx` of a plane is `16 + (idx * stride) % modulus`.
    let pattern = |len: usize, stride: u32, modulus: u32| -> Vec<u8> {
        (0..len)
            .map(|idx| 16 + ((idx as u32 * stride) % modulus) as u8)
            .collect()
    };
    let chroma_len = (w / 2) * (h / 2);
    let luma = pattern(w * h, 17, 220);
    let cb = pattern(chroma_len, 13, 225);
    let cr = pattern(chroma_len, 23, 225);
    (luma, cb, cr)
}
/// With neutral chroma (Cb = Cr = 128) the scalar converter must leave all
/// three planes exactly as they were, for several luma levels.
#[test]
fn bt601_to_bt709_neutral_gray_roundtrips() {
    let (w, h) = (32usize, 16usize);
    let chroma_len = (w / 2) * (h / 2);
    for &y_val in [16u8, 64, 128, 200, 235].iter() {
        let mut y = vec![y_val; w * h];
        let mut cb = vec![128u8; chroma_len];
        let mut cr = vec![128u8; chroma_len];
        bt601_to_bt709_planes_scalar(&mut y, &mut cb, &mut cr, w, h);
        y.iter()
            .for_each(|v| assert_eq!(*v, y_val, "Y with neutral chroma must round-trip"));
        cb.iter().for_each(|v| assert_eq!(*v, 128));
        cr.iter().for_each(|v| assert_eq!(*v, 128));
    }
}
/// Limited-range black (16) and white (235) with neutral chroma must pass
/// through `bt601_to_bt709_planes` unchanged.
#[test]
fn bt601_to_bt709_black_and_white_round_trip() {
    let (w, h) = (64usize, 32usize);
    let chroma_len = (w / 2) * (h / 2);
    let cases = [(16u8, "black"), (235u8, "white")];
    for &(y_val, label) in cases.iter() {
        let mut y = vec![y_val; w * h];
        let mut cb = vec![128u8; chroma_len];
        let mut cr = vec![128u8; chroma_len];
        bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
        y.iter()
            .for_each(|v| assert_eq!(*v, y_val, "{} Y round-trip", label));
        cb.iter()
            .for_each(|v| assert_eq!(*v, 128, "{} Cb round-trip", label));
        cr.iter()
            .for_each(|v| assert_eq!(*v, 128, "{} Cr round-trip", label));
    }
}
/// The scalar converter and `bt601_to_bt709_planes` must agree to within one
/// code value per sample on a 256x256 synthetic frame.
#[test]
fn bt601_to_bt709_scalar_vs_avx2_agree_256x256() {
    let w = 256;
    let h = 256;
    let (y0, cb0, cr0) = synth_601_frame(w, h);

    let (mut y_s, mut cb_s, mut cr_s) = (y0.clone(), cb0.clone(), cr0.clone());
    bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);

    let (mut y_v, mut cb_v, mut cr_v) = (y0, cb0, cr0);
    bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);

    for i in 0..y_s.len() {
        let d = (y_s[i] as i32 - y_v[i] as i32).abs();
        assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, y_s[i], y_v[i]);
    }
    for i in 0..cb_s.len() {
        assert!(
            (cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1,
            "Cb[{}] scalar={} avx2={}",
            i,
            cb_s[i],
            cb_v[i]
        );
        assert!(
            (cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1,
            "Cr[{}] scalar={} avx2={}",
            i,
            cr_s[i],
            cr_v[i]
        );
    }
}
/// Scalar converter vs `bt601_to_bt709_planes` on a 34x16 frame; judging by
/// the test name the width is meant to leave a remainder after the
/// vectorised main loop.
#[test]
fn bt601_to_bt709_scalar_vs_avx2_agree_tail() {
    let (w, h) = (34, 16);
    let (y_src, cb_src, cr_src) = synth_601_frame(w, h);

    let (mut y_s, mut cb_s, mut cr_s) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);

    let (mut y_v, mut cb_v, mut cr_v) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);

    for (i, &s) in y_s.iter().enumerate() {
        let v = y_v[i];
        assert!(
            (s as i32 - v as i32).abs() <= 1,
            "Y[{}] scalar={} avx2={}",
            i,
            s,
            v
        );
    }
    for (i, &cb) in cb_s.iter().enumerate() {
        assert!((cb as i32 - cb_v[i] as i32).abs() <= 1);
        assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
    }
}
/// After conversion every chroma sample must lie in 16..=240 and every luma
/// sample in 16..=235.
#[test]
fn bt601_to_bt709_clamps_ranges() {
    let (w, h) = (32, 16);
    let (mut y, mut cb, mut cr) = synth_601_frame(w, h);
    bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
    for plane in [&cb, &cr].iter() {
        for &v in plane.iter() {
            assert!(matches!(v, 16..=240), "chroma {} out of limited range", v);
        }
    }
    for &v in &y {
        assert!(matches!(v, 16..=235), "luma {} out of limited range", v);
    }
}
/// Deterministic 8-bit test pattern: sample `i` is the low byte of
/// `i * 7 + i / w`.
fn make_ramp(w: usize, h: usize) -> Vec<u8> {
    let total = w * h;
    let mut ramp = Vec::with_capacity(total);
    for i in 0..total {
        let value = i * 7 + i / w;
        ramp.push((value & 0xff) as u8);
    }
    ramp
}
/// 2x upscale of a 64x32 ramp: `bilinear_scale_plane` must stay within one
/// code value of the scalar reference at every sample.
#[test]
fn bilinear_scalar_vs_avx2_agree_2x() {
    let src_w = 64;
    let src_h = 32;
    let src = make_ramp(src_w, src_h);
    let dst_w = 128;
    let dst_h = 64;
    let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
    assert_eq!(scalar.len(), simd.len());
    for (i, (&s, &v)) in scalar.iter().zip(simd.iter()).enumerate() {
        let d = (s as i32 - v as i32).abs();
        assert!(d <= 1, "bilinear mismatch at {}: scalar={} simd={}", i, s, v);
    }
}
/// 2x downscale of a 128x72 ramp: `bilinear_scale_plane` must stay within one
/// code value of the scalar reference.
#[test]
fn bilinear_scalar_vs_avx2_agree_downscale() {
    let (src_w, src_h) = (128, 72);
    let (dst_w, dst_h) = (64, 36);
    let src = make_ramp(src_w, src_h);
    let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
    for (i, &s) in scalar.iter().enumerate() {
        let v = simd[i];
        let d = (s as i32 - v as i32).abs();
        assert!(d <= 1, "bilinear mismatch at {}: scalar={} simd={}", i, s, v);
    }
}
/// Scaling a flat plane must reproduce the same value everywhere.
#[test]
fn bilinear_constant_input_yields_constant_output() {
    let flat = vec![42u8; 64 * 32];
    let scaled = bilinear_scale_plane(&flat, 64, 32, 128, 64);
    scaled
        .iter()
        .for_each(|&v| assert_eq!(v, 42, "constant input must yield constant output"));
}
/// Scaling to the same dimensions with the scalar path must return the input
/// unchanged.
#[test]
fn bilinear_identity_scale() {
    let (w, h) = (32, 32);
    let original = make_ramp(w, h);
    let scaled = bilinear_scale_plane_scalar(&original, w, h, w, h);
    assert_eq!(scaled, original);
}
/// Builds a `Yuv420p10le` / `Bt2020` frame whose luma samples are all `y_val`
/// and whose two chroma planes are all `c_val`, stored as little-endian
/// 16-bit words.
fn make_10bit_frame_planar(w: usize, h: usize, y_val: u16, c_val: u16) -> VideoFrame {
    let luma_count = w * h;
    // Both chroma planes are quarter-size and share the same fill value.
    let chroma_count = 2 * ((w / 2) * (h / 2));
    let mut buf = Vec::with_capacity((luma_count + chroma_count) * 2);
    buf.extend(
        std::iter::repeat(y_val)
            .take(luma_count)
            .flat_map(u16::to_le_bytes),
    );
    buf.extend(
        std::iter::repeat(c_val)
            .take(chroma_count)
            .flat_map(u16::to_le_bytes),
    );
    VideoFrame::new(
        bytes::Bytes::from(buf),
        w as u32,
        h as u32,
        PixelFormat::Yuv420p10le,
        ColorSpace::Bt2020,
        0,
    )
}
/// A `Yuv420p10le` frame must come back from `convert_to_yuv420p_bt709` with
/// the same format, size, bytes and colour space.
#[test]
fn convert_to_yuv420p_bt709_passthrough_10bit() {
    let input = make_10bit_frame_planar(16, 16, 600, 512);
    let out = convert_to_yuv420p_bt709(&input).expect("10-bit passthrough");
    assert_eq!(out.format, PixelFormat::Yuv420p10le);
    assert_eq!(out.width, 16);
    assert_eq!(out.height, 16);
    assert_eq!(out.data.len(), input.data.len());
    let (got, want) = (&out.data[..], &input.data[..]);
    assert_eq!(got, want, "10-bit data must be byte-identical (no tonemap)");
    assert_eq!(
        out.color_space,
        ColorSpace::Bt2020,
        "color space must not change"
    );
}
/// Halving a flat 10-bit frame must keep the format, produce the expected
/// byte count and leave luma and chroma values constant.
#[test]
fn scale_frame_10bit_constant_input_yields_constant_output() {
    let input = make_10bit_frame_planar(64, 64, 600, 400);
    let out = scale_frame(&input, 32, 32).expect("10-bit scale");
    assert_eq!(out.format, PixelFormat::Yuv420p10le);
    assert_eq!(out.width, 32);
    assert_eq!(out.height, 32);

    // Two bytes per sample; chroma planes are 16x16.
    let y_bytes = (32 * 32) * 2;
    let c_bytes = (16 * 16) * 2;
    let u_start = y_bytes;
    let v_start = u_start + c_bytes;
    let end = v_start + c_bytes;
    assert_eq!(out.data.len(), y_bytes + 2 * c_bytes);

    let y = read_u16le(&out.data[..y_bytes]);
    let u = read_u16le(&out.data[u_start..v_start]);
    let v = read_u16le(&out.data[v_start..end]);
    for &s in &y {
        assert_eq!(s, 600, "luma must be constant after bilinear");
    }
    for &s in u.iter().chain(v.iter()) {
        assert_eq!(s, 400, "chroma must be constant after bilinear");
    }
}
/// Scaling a 10-bit frame to its own size must return identical bytes.
#[test]
fn scale_frame_10bit_identity_yields_byte_identical() {
    let input = make_10bit_frame_planar(32, 32, 768, 256);
    let scaled = scale_frame(&input, 32, 32).expect("identity");
    let (got, want) = (&scaled.data[..], &input.data[..]);
    assert_eq!(got, want);
}
/// Upscaling a pattern that spans 0..=1023 with the scalar 16-bit path must
/// never produce a sample above 1023.
#[test]
fn bilinear_10bit_scalar_clamps_inside_10bit_range() {
    let src: Vec<u16> = (0..64usize * 32).map(|i| (i as u16) % 1024).collect();
    let scaled = bilinear_scale_plane_u16_scalar(&src, 64, 32, 128, 64);
    scaled
        .iter()
        .for_each(|&v| assert!(v <= 1023, "10-bit sample {} exceeds 1023", v));
}
/// Deterministic 10-bit test pattern: sample `i` is `(i * 7 + i / w) % 1024`.
fn make_10bit_ramp(w: usize, h: usize) -> Vec<u16> {
    let total = w * h;
    let mut ramp = Vec::with_capacity(total);
    for i in 0..total {
        let value = (i * 7 + i / w) % 1024;
        ramp.push(value as u16);
    }
    ramp
}
/// 2x upscale of a 64x32 10-bit ramp: `bilinear_scale_plane_u16` must stay
/// within one code value of the scalar reference at every sample.
#[test]
fn bilinear_10bit_scalar_vs_avx2_agree_2x_upscale() {
    let src_w = 64;
    let src_h = 32;
    let src = make_10bit_ramp(src_w, src_h);
    let dst_w = 128;
    let dst_h = 64;
    let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
    assert_eq!(scalar.len(), simd.len());
    for (i, (&s, &v)) in scalar.iter().zip(simd.iter()).enumerate() {
        let d = (s as i32 - v as i32).abs();
        assert!(
            d <= 1,
            "bilinear 10-bit mismatch at {}: scalar={} simd={}",
            i,
            s,
            v
        );
    }
}
/// 1920x1080 to 1280x720 on a 10-bit ramp: `bilinear_scale_plane_u16` must
/// stay within one code value of the scalar reference.
#[test]
fn bilinear_10bit_scalar_vs_avx2_agree_downscale_1080p_to_720p() {
    let (src_w, src_h) = (1920, 1080);
    let (dst_w, dst_h) = (1280, 720);
    let src = make_10bit_ramp(src_w, src_h);
    let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
    for (i, &s) in scalar.iter().enumerate() {
        let v = simd[i];
        let d = (s as i32 - v as i32).abs();
        assert!(
            d <= 1,
            "bilinear 10-bit mismatch at {}: scalar={} simd={}",
            i,
            s,
            v
        );
    }
}
/// Scaling a flat 10-bit plane must reproduce the same value everywhere.
#[test]
fn bilinear_10bit_avx2_constant_input_yields_constant_output() {
    let flat = vec![600u16; 128 * 64];
    let scaled = bilinear_scale_plane_u16(&flat, 128, 64, 256, 128);
    scaled.iter().for_each(|&v| {
        assert_eq!(v, 600, "constant 10-bit input must yield constant output")
    });
}
/// A plane filled with the 10-bit maximum must scale to exactly 1023
/// everywhere, never above it.
#[test]
fn bilinear_10bit_avx2_max_value_clamped() {
    let saturated = vec![1023u16; 64 * 32];
    let scaled = bilinear_scale_plane_u16(&saturated, 64, 32, 128, 64);
    scaled.iter().copied().for_each(|v| {
        assert!(v <= 1023, "10-bit AVX2 sample {} exceeds 1023", v);
        assert_eq!(v, 1023, "constant 1023 should stay 1023");
    });
}
/// For a 4-sample-wide output the dispatching entry point must return exactly
/// what the scalar implementation returns.
#[test]
fn bilinear_10bit_narrow_width_falls_back_to_scalar() {
    let (src_w, src_h) = (8, 8);
    let (dst_w, dst_h) = (4, 4);
    let src = make_10bit_ramp(src_w, src_h);
    let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
    let dispatched = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
    assert_eq!(
        scalar, dispatched,
        "narrow strip should match scalar exactly"
    );
}
/// Odd output dimensions (17x9): `bilinear_scale_plane_u16` must match the
/// scalar reference in length and stay within one code value per sample.
#[test]
fn bilinear_10bit_odd_dst_dims_handled() {
    let (src_w, src_h) = (32, 32);
    let (dst_w, dst_h) = (17, 9);
    let src = make_10bit_ramp(src_w, src_h);
    let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
    assert_eq!(scalar.len(), simd.len());
    for (i, (&s, &v)) in scalar.iter().zip(simd.iter()).enumerate() {
        let d = (s as i32 - v as i32).abs();
        assert!(d <= 1, "tail mismatch at {}: scalar={} simd={}", i, s, v);
    }
}
/// 16x512 to 16x256: `bilinear_scale_plane_u16` must stay within one code
/// value of the scalar reference on a tall, narrow plane.
#[test]
fn bilinear_10bit_tall_narrow_strip() {
    let (src_w, src_h) = (16, 512);
    let (dst_w, dst_h) = (16, 256);
    let src = make_10bit_ramp(src_w, src_h);
    let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
    let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
    for (i, &s) in scalar.iter().enumerate() {
        let d = (s as i32 - simd[i] as i32).abs();
        assert!(d <= 1, "tall strip mismatch at {}", i);
    }
}
/// Builds deterministic 10-bit Y, Cb and Cr planes for a `w`x`h` frame with
/// half-resolution chroma. Luma values fall in 64..=940 and chroma values in
/// 64..=960.
fn synth_601_frame_10bit(w: usize, h: usize) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
    // Sample `idx` of a plane is `64 + (idx * stride) % modulus`.
    let pattern = |len: usize, stride: u32, modulus: u32| -> Vec<u16> {
        (0..len)
            .map(|idx| 64 + ((idx as u32 * stride) % modulus) as u16)
            .collect()
    };
    let chroma_len = (w / 2) * (h / 2);
    let luma = pattern(w * h, 17, 877);
    let cb = pattern(chroma_len, 13, 897);
    let cr = pattern(chroma_len, 23, 897);
    (luma, cb, cr)
}
/// With neutral chroma (Cb = Cr = 512) the 10-bit scalar converter must leave
/// all three planes exactly as they were, for several luma levels.
#[test]
fn bt601_to_bt709_10bit_neutral_gray_roundtrips() {
    let (w, h) = (32usize, 16usize);
    let chroma_len = (w / 2) * (h / 2);
    for &y_val in [64u16, 256, 512, 800, 940].iter() {
        let mut y = vec![y_val; w * h];
        let mut cb = vec![512u16; chroma_len];
        let mut cr = vec![512u16; chroma_len];
        bt601_to_bt709_planes_10bit_scalar(&mut y, &mut cb, &mut cr, w, h);
        y.iter()
            .for_each(|v| assert_eq!(*v, y_val, "Y with neutral chroma must round-trip"));
        cb.iter().for_each(|v| assert_eq!(*v, 512));
        cr.iter().for_each(|v| assert_eq!(*v, 512));
    }
}
/// The 10-bit scalar converter and `bt601_to_bt709_planes_10bit` must agree
/// to within one code value per sample on a 256x256 synthetic frame.
#[test]
fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_256x256() {
    let (w, h) = (256, 256);
    let (y_src, cb_src, cr_src) = synth_601_frame_10bit(w, h);

    let (mut y_s, mut cb_s, mut cr_s) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);

    let (mut y_v, mut cb_v, mut cr_v) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);

    for (i, &s) in y_s.iter().enumerate() {
        let v = y_v[i];
        let d = (s as i32 - v as i32).abs();
        assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, s, v);
    }
    for (i, &cb) in cb_s.iter().enumerate() {
        let cb_diff = (cb as i32 - cb_v[i] as i32).abs();
        assert!(
            cb_diff <= 1,
            "Cb[{}] scalar={} avx2={}",
            i,
            cb,
            cb_v[i]
        );
        let cr_diff = (cr_s[i] as i32 - cr_v[i] as i32).abs();
        assert!(
            cr_diff <= 1,
            "Cr[{}] scalar={} avx2={}",
            i,
            cr_s[i],
            cr_v[i]
        );
    }
}
/// 10-bit scalar converter vs `bt601_to_bt709_planes_10bit` on a 34x16 frame;
/// judging by the test name the width is meant to leave a remainder after the
/// vectorised main loop.
#[test]
fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_tail() {
    let (w, h) = (34, 16);
    let (y_src, cb_src, cr_src) = synth_601_frame_10bit(w, h);

    let (mut y_s, mut cb_s, mut cr_s) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);

    let (mut y_v, mut cb_v, mut cr_v) = (y_src.clone(), cb_src.clone(), cr_src.clone());
    bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);

    for (i, &s) in y_s.iter().enumerate() {
        let v = y_v[i];
        assert!(
            (s as i32 - v as i32).abs() <= 1,
            "Y[{}] scalar={} avx2={}",
            i,
            s,
            v
        );
    }
    for (i, &cb) in cb_s.iter().enumerate() {
        assert!((cb as i32 - cb_v[i] as i32).abs() <= 1);
        assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
    }
}
/// After 10-bit conversion every chroma sample must lie in 64..=960 and every
/// luma sample in 64..=940.
#[test]
fn bt601_to_bt709_10bit_clamps_ranges() {
    let (w, h) = (32, 16);
    let (mut y, mut cb, mut cr) = synth_601_frame_10bit(w, h);
    bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
    for plane in [&cb, &cr].iter() {
        for &v in plane.iter() {
            assert!(
                matches!(v, 64..=960),
                "chroma {} out of 10-bit limited range",
                v
            );
        }
    }
    for &v in &y {
        assert!(
            matches!(v, 64..=940),
            "luma {} out of 10-bit limited range",
            v
        );
    }
}
/// Feeding the top of the limited range (Y = 940, Cb = Cr = 960) must not
/// push any output sample above those limits.
#[test]
fn bt601_to_bt709_10bit_extreme_chroma_clamped_at_high_end() {
    let (w, h) = (32usize, 16usize);
    let chroma_len = (w / 2) * (h / 2);
    let mut y = vec![940u16; w * h];
    let mut cb = vec![960u16; chroma_len];
    let mut cr = vec![960u16; chroma_len];
    bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
    y.iter()
        .for_each(|&v| assert!(v <= 940, "luma {} > 940 (clamp violated)", v));
    cb.iter()
        .chain(cr.iter())
        .for_each(|&v| assert!(v <= 960, "chroma {} > 960 (clamp violated)", v));
}
/// 4x4 planes with hand-computed 2x2 box averages: luma must be copied
/// verbatim and each chroma output must equal `(sum of 4 + 2) >> 2`.
#[test]
fn downsample_4x4_box_average_8bit_hand_verified() {
    let cb: Vec<u8> = (1..=16).map(|k| k * 10).collect();
    let cr: Vec<u8> = (0..16).map(|k| 5 + k * 10).collect();
    let y: Vec<u8> = (0..16).map(|i| i as u8 * 8).collect();
    let out = downsample_chroma_444_to_420(&y, &cb, &cr, 4, 4);
    assert_eq!(out.len(), 16 + 4 + 4);
    assert_eq!(&out[..16], y.as_slice(), "Y must round-trip verbatim");

    // (output index, expected value, assertion label)
    let expected: [(usize, u8, &str); 8] = [
        (16, 35, "Cb block (0,0)"),
        (17, 55, "Cb block (1,0)"),
        (18, 115, "Cb block (0,1)"),
        (19, 135, "Cb block (1,1)"),
        (20, 30, "Cr block (0,0): (5+15+45+55+2)>>2 = 30"),
        (21, 50, "Cr block (1,0): (25+35+65+75+2)>>2 = 50"),
        (22, 110, "Cr block (0,1): (85+95+125+135+2)>>2 = 110"),
        (23, 130, "Cr block (1,1): (105+115+145+155+2)>>2 = 130"),
    ];
    for &(idx, want, label) in expected.iter() {
        assert_eq!(out[idx], want, "{}", label);
    }
}
/// Flat 16x16 planes must downsample to flat output: 64 for luma, 128 for
/// both chroma planes.
#[test]
fn downsample_constant_input_8bit_yields_constant_output() {
    let (w, h) = (16usize, 16usize);
    let luma_len = w * h;
    let y = vec![64u8; luma_len];
    let cb = vec![128u8; luma_len];
    let cr = vec![128u8; luma_len];
    let out = downsample_chroma_444_to_420(&y, &cb, &cr, w, h);
    let cw = (w + 1) / 2;
    let ch = (h + 1) / 2;
    let total = luma_len + 2 * cw * ch;
    assert_eq!(out.len(), total);
    for (i, &v) in out[..luma_len].iter().enumerate() {
        assert_eq!(v, 64, "Y[{}] should be 64", i);
    }
    for (k, &v) in out[luma_len..total].iter().enumerate() {
        assert_eq!(v, 128, "chroma[{}] should be 128", k);
    }
}
/// 7x7 input: chroma planes must be rounded up to 4x4 and, for flat input,
/// every output sample must keep its plane's value.
#[test]
fn downsample_odd_dimensions_clamp_policy() {
    let (w, h) = (7usize, 7usize);
    let luma = vec![100u8; w * h];
    let cb = vec![128u8; w * h];
    let cr = vec![64u8; w * h];
    let out = downsample_chroma_444_to_420(&luma, &cb, &cr, w, h);
    let cw = (w + 1) / 2;
    let ch = (h + 1) / 2;
    assert_eq!(cw, 4);
    assert_eq!(ch, 4);
    assert_eq!(out.len(), w * h + 2 * cw * ch);
    for i in 0..w * h {
        assert_eq!(out[i], 100);
    }
    // Chroma planes follow the luma plane back to back.
    let cb_base = w * h;
    let cr_base = cb_base + cw * ch;
    for cx in 0..cw {
        for cy in 0..ch {
            assert_eq!(
                out[cb_base + cy * cw + cx],
                128,
                "Cb[{},{}] expected 128",
                cx,
                cy
            );
        }
    }
    for cx in 0..cw {
        for cy in 0..ch {
            assert_eq!(
                out[cr_base + cy * cw + cx],
                64,
                "Cr[{},{}] expected 64",
                cx,
                cy
            );
        }
    }
}
/// Flat 16x16 10-bit planes must downsample to flat output, read back as
/// little-endian 16-bit samples: 400 for luma, 512 for both chroma planes.
#[test]
fn downsample_10bit_constant_input_yields_constant_output() {
    let (w, h) = (16usize, 16usize);
    let y = vec![400u16; w * h];
    let cb = vec![512u16; w * h];
    let cr = vec![512u16; w * h];
    let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
    let cw = (w + 1) / 2;
    let ch = (h + 1) / 2;
    assert_eq!(out.len(), 2 * (w * h + 2 * cw * ch), "10-bit byte count");

    // Decodes the little-endian sample starting at `byte_off`.
    let sample_at = |byte_off: usize| u16::from_le_bytes([out[byte_off], out[byte_off + 1]]);
    for i in 0..w * h {
        assert_eq!(sample_at(i * 2), 400, "Y[{}] should be 400", i);
    }
    let cb_byte_off = w * h * 2;
    for i in 0..cw * ch {
        assert_eq!(sample_at(cb_byte_off + i * 2), 512, "Cb[{}] should be 512", i);
    }
    let cr_byte_off = cb_byte_off + cw * ch * 2;
    for i in 0..cw * ch {
        assert_eq!(sample_at(cr_byte_off + i * 2), 512, "Cr[{}] should be 512", i);
    }
}
/// Planes filled with 1023 must come out as 1023 everywhere, showing that the
/// box average does not overflow.
#[test]
fn downsample_10bit_max_value_no_overflow() {
    let (w, h) = (4usize, 4usize);
    let y = vec![1023u16; w * h];
    let cb = vec![1023u16; w * h];
    let cr = vec![1023u16; w * h];
    let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
    let cw = (w + 1) / 2;
    let ch = (h + 1) / 2;

    // Decodes the little-endian sample starting at `byte_off`.
    let sample_at = |byte_off: usize| u16::from_le_bytes([out[byte_off], out[byte_off + 1]]);
    for i in 0..w * h {
        assert_eq!(sample_at(i * 2), 1023, "Y[{}]", i);
    }
    let chroma_byte_off = w * h * 2;
    for i in 0..2 * cw * ch {
        assert_eq!(
            sample_at(chroma_byte_off + i * 2),
            1023,
            "chroma[{}] should be 1023 (no overflow)",
            i
        );
    }
}
/// 4x4 10-bit planes with hand-computed 2x2 box averages. Luma must be copied
/// verbatim; all four Cb and all four Cr outputs are checked.
#[test]
fn downsample_10bit_4x4_box_average_hand_verified() {
    let cb_u: Vec<u16> = vec![
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
    ];
    let cr_u: Vec<u16> = vec![
        500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800,
    ];
    let y_u: Vec<u16> = (0..16).map(|i| i as u16 * 50).collect();
    let out = downsample_chroma_444_to_420_10bit(&y_u, &cb_u, &cr_u, 4, 4);
    assert_eq!(out.len(), 32 + 8 + 8);

    // Decodes the little-endian sample starting at `byte_off`.
    let sample_at = |byte_off: usize| u16::from_le_bytes([out[byte_off], out[byte_off + 1]]);
    for i in 0..16 {
        assert_eq!(sample_at(i * 2), i as u16 * 50, "Y[{}]", i);
    }

    let cb_off = 32;
    let cb: Vec<u16> = (0..4).map(|k| sample_at(cb_off + 2 * k)).collect();
    assert_eq!(cb, [35u16, 55, 115, 135]);

    // Every Cr source row is identical, so the bottom blocks repeat the top
    // ones: (500+600+500+600+2)>>2 = 550 and (700+800+700+800+2)>>2 = 750.
    let cr_off = cb_off + 8;
    let cr: Vec<u16> = (0..4).map(|k| sample_at(cr_off + 2 * k)).collect();
    assert_eq!(cr, [550u16, 750, 550, 750]);
}
/// A flat `Yuv444p10le` frame must downsample to `Yuv420p10le` with the same
/// size, PTS and colour space, the expected byte count, and unchanged first
/// luma and Cb samples.
#[test]
fn downsample_frame_yuv444p10le_to_yuv420p10le() {
    let (w, h) = (16usize, 16usize);
    let plane = w * h;
    // Three full-resolution planes: Y = 500, Cb = 512, Cr = 512.
    let mut buf = Vec::with_capacity(3 * plane * 2);
    for fill in [500u16, 512, 512].iter() {
        for _ in 0..plane {
            buf.extend_from_slice(&fill.to_le_bytes());
        }
    }
    let frame = VideoFrame::new(
        bytes::Bytes::from(buf),
        w as u32,
        h as u32,
        PixelFormat::Yuv444p10le,
        ColorSpace::Bt2020,
        42,
    );
    let out = downsample_444_to_420_frame(&frame).expect("downsample");
    assert_eq!(out.format, PixelFormat::Yuv420p10le);
    assert_eq!(out.width, w as u32);
    assert_eq!(out.height, h as u32);
    assert_eq!(out.pts, 42, "PTS preserved");
    assert_eq!(out.color_space, ColorSpace::Bt2020, "color_space preserved");

    let (cw, ch) = (w / 2, h / 2);
    let expected_bytes = 2 * (w * h + 2 * cw * ch);
    assert_eq!(out.data.len(), expected_bytes);

    let first_y = u16::from_le_bytes([out.data[0], out.data[1]]);
    assert_eq!(first_y, 500);
    let cb_start = w * h * 2;
    let first_cb = u16::from_le_bytes([out.data[cb_start], out.data[cb_start + 1]]);
    assert_eq!(first_cb, 512);
}
/// A `Yuva444p10le` frame must downsample to three-plane `Yuv420p10le`: the
/// byte count excludes alpha and no output sample carries the alpha fill
/// value (65535) or anything outside the 10-bit range.
#[test]
fn downsample_frame_yuva444p10le_drops_alpha() {
    let w = 8;
    let h = 8;
    let plane = w * h;
    // Planes in order: Y = 600, Cb = 500, Cr = 500, alpha = 65535.
    let mut buf = Vec::with_capacity(4 * plane * 2);
    for fill in [600u16, 500, 500, 65535].iter() {
        for _ in 0..plane {
            buf.extend_from_slice(&fill.to_le_bytes());
        }
    }
    let frame = VideoFrame::new(
        bytes::Bytes::from(buf),
        w as u32,
        h as u32,
        PixelFormat::Yuva444p10le,
        ColorSpace::Bt2020,
        7,
    );
    let out = downsample_444_to_420_frame(&frame).expect("downsample with alpha");
    assert_eq!(out.format, PixelFormat::Yuv420p10le);
    let cw = w / 2;
    let ch = h / 2;
    let expected = 2 * (w * h + 2 * cw * ch);
    assert_eq!(out.data.len(), expected);
    for i in (0..out.data.len()).step_by(2) {
        let s = u16::from_le_bytes([out.data[i], out.data[i + 1]]);
        assert!(s < 1024, "stray alpha sample {} at {}", s, i);
        assert_ne!(s, 65535, "alpha plane leaked into output");
    }
}
/// `downsample_444_to_420_frame` must reject a `Yuv420p` frame with an error
/// mentioning the expected 4:4:4 input.
#[test]
fn downsample_frame_rejects_non_444() {
    let w = 16;
    let h = 16;
    let plane = w * h;
    // Y plane plus two quarter-size chroma planes, all filled with 128.
    let buf = vec![128u8; plane + 2 * (plane / 4)];
    let frame = VideoFrame::new(
        bytes::Bytes::from(buf),
        w as u32,
        h as u32,
        PixelFormat::Yuv420p,
        ColorSpace::Bt709,
        0,
    );
    let err = downsample_444_to_420_frame(&frame).unwrap_err();
    assert!(err.to_string().contains("expected 4:4:4 input"));
}
}