use anyhow::{Result, bail};
use bytes::BytesMut;
use crate::frame::{ColorSpace, VideoFrame};
/// Saturates `v` into the closed range `16..=235` and narrows it to a byte.
///
/// Used for luma samples: anything below 16 becomes 16, anything above 235
/// becomes 235, so the final narrowing cast can never truncate.
#[inline(always)]
fn clamp_y(v: i32) -> u8 {
    const LUMA_MIN: i32 = 16;
    const LUMA_MAX: i32 = 235;
    v.max(LUMA_MIN).min(LUMA_MAX) as u8
}
/// Saturates `v` into the closed range `16..=240` and narrows it to a byte.
///
/// Used for chroma samples: anything below 16 becomes 16, anything above 240
/// becomes 240, so the final narrowing cast can never truncate.
#[inline(always)]
fn clamp_c(v: i32) -> u8 {
    const CHROMA_MIN: i32 = 16;
    const CHROMA_MAX: i32 = 240;
    v.max(CHROMA_MIN).min(CHROMA_MAX) as u8
}
/// Scalar in-place BT.601 → BT.709 conversion of three 4:2:0 planes.
///
/// * `y` – luma plane, `width * height` bytes, row-major.
/// * `cb`, `cr` – chroma planes, `(width / 2) * (height / 2)` bytes each.
/// * `width`, `height` – luma dimensions in pixels.
///
/// The expected plane sizes are only checked with `debug_assert_eq!`; in
/// release builds a plane that is too short surfaces as an index panic.
///
/// Every luma sample is shifted by a Q15 fixed-point combination of the
/// centred (`- 128`) chroma pair covering it, then clamped with `clamp_y`.
/// Afterwards each chroma pair is replaced by its Q15 2×2 matrix product,
/// re-biased by 128 and clamped with `clamp_c`.
fn bt601_to_bt709_scalar(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    debug_assert_eq!(y.len(), width * height);
    debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
    debug_assert_eq!(cr.len(), (width / 2) * (height / 2));

    let chroma_stride = width / 2;

    // Luma pass runs first because it needs the chroma values as they were
    // before the chroma pass below overwrites them.
    for row in 0..height {
        let luma_base = row * width;
        let chroma_base = (row / 2) * chroma_stride;
        for col in 0..width {
            // One chroma sample is shared by a 2x2 block of luma samples.
            let chroma_at = chroma_base + col / 2;
            let u = i32::from(cb[chroma_at]) - 128;
            let v = i32::from(cr[chroma_at]) - 128;
            let luma_shift =
                (super::M_Y_CB * u + super::M_Y_CR * v + super::Q15_ROUND) >> super::Q15;
            let sample = &mut y[luma_base + col];
            *sample = clamp_y(i32::from(*sample) + luma_shift);
        }
    }

    // Chroma pass: both outputs depend on both inputs, so read the pair
    // before writing either component back.
    for (cb_sample, cr_sample) in cb.iter_mut().zip(cr.iter_mut()) {
        let u = i32::from(*cb_sample) - 128;
        let v = i32::from(*cr_sample) - 128;
        let cb_out = (super::M_CB_CB * u + super::M_CB_CR * v + super::Q15_ROUND) >> super::Q15;
        let cr_out = (super::M_CR_CB * u + super::M_CR_CR * v + super::Q15_ROUND) >> super::Q15;
        *cb_sample = clamp_c(cb_out + 128);
        *cr_sample = clamp_c(cr_out + 128);
    }
}
/// Converts the three planes in place using the scalar implementation only.
///
/// This is a public entry point that forwards every argument unchanged to
/// `bt601_to_bt709_scalar`; it performs no CPU-feature detection and never
/// takes a SIMD path.
///
/// * `y` – luma plane, modified in place.
/// * `cb`, `cr` – chroma planes, modified in place.
/// * `width`, `height` – luma dimensions passed through to the scalar routine.
pub fn bt601_to_bt709_planes_scalar(
y: &mut [u8],
cb: &mut [u8],
cr: &mut [u8],
width: usize,
height: usize,
) {
bt601_to_bt709_scalar(y, cb, cr, width, height);
}
/// Converts the three planes in place, using AVX2 when it is available and
/// safe to do so, and the scalar implementation otherwise.
///
/// * `y` – luma plane, expected to hold `width * height` bytes.
/// * `cb`, `cr` – chroma planes, expected to hold
///   `(width / 2) * (height / 2)` bytes each.
/// * `width`, `height` – luma dimensions in pixels.
///
/// The AVX2 kernel reads and writes the planes through raw pointers, so it is
/// only entered after checking that the slices are large enough for the
/// requested geometry. Inputs that fail that check are routed to the scalar
/// routine, whose indexing is bounds-checked and therefore panics instead of
/// touching memory outside the slices.
pub fn bt601_to_bt709_planes(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // `checked_mul` keeps a wrapped `width * height` from making an
        // undersized luma plane look acceptable. The vector chroma loop is
        // driven by `cb.len()` and touches `cr` at the same offsets, hence
        // the `cr.len() >= cb.len()` requirement.
        let planes_cover_geometry = width.checked_mul(height).is_some_and(|luma_len| {
            y.len() >= luma_len
                && cb.len() >= (width / 2) * (height / 2)
                && cr.len() >= cb.len()
        });
        if planes_cover_geometry && std::is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was just confirmed at runtime, and the
            // length checks above guarantee every offset the kernel derives
            // from `width`/`height`/`cb.len()` lies inside the slices.
            unsafe {
                bt601_to_bt709_avx2(y, cb, cr, width, height);
            }
            return;
        }
    }
    bt601_to_bt709_scalar(y, cb, cr, width, height);
}
/// AVX2 in-place BT.601 → BT.709 conversion of three 4:2:0 planes.
///
/// * `y` – luma plane, addressed as rows of `width` bytes.
/// * `cb`, `cr` – chroma planes, addressed as rows of `width / 2` bytes.
/// * `width`, `height` – luma dimensions in pixels.
///
/// The luma pass walks `height / 2` chroma rows and `width / 2` chroma
/// columns, updating the 2×2 luma block under each chroma sample; with an odd
/// `width` or `height` the trailing luma column/row is therefore left
/// untouched. The chroma pass then converts all `cb.len()` chroma pairs.
/// Groups of 16 chroma samples are handled with vector code; the remainder
/// uses the same integer formula as the scalar implementation.
///
/// NOTE(review): the vector code rounds each coefficient product separately
/// (`_mm256_mulhrs_epi16`) whereas the scalar tails round the summed products
/// once, so vector and tail results can differ by one LSB — confirm this is
/// acceptable.
///
/// # Panics
///
/// Panics if a plane is too short for the offsets implied by `width`,
/// `height` and `cb.len()`; every vector load/store goes through a
/// bounds-checked subslice.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_avx2(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    // SAFETY: AVX2 availability is the caller's obligation (see `# Safety`).
    // Every pointer handed to a load/store intrinsic comes from a subslice of
    // exactly the accessed length, so the accesses stay inside the planes.
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let cw = width / 2;
        let ch = height / 2;

        // Coefficients broadcast as 16-bit lanes for `mulhrs`.
        // NOTE(review): the `as i16` casts assume each value fits in i16 —
        // confirm against the constant definitions in the parent module.
        let v_m_y_cb = _mm256_set1_epi16(super::M_Y_CB as i16);
        let v_m_y_cr = _mm256_set1_epi16(super::M_Y_CR as i16);
        // The diagonal coefficients have 32768 (1.0 in Q15) subtracted here;
        // the chroma loop adds the unscaled input back to compensate.
        let v_m_cb_cb_corr = _mm256_set1_epi16((super::M_CB_CB - 32768) as i16);
        let v_m_cb_cr = _mm256_set1_epi16(super::M_CB_CR as i16);
        let v_m_cr_cb = _mm256_set1_epi16(super::M_CR_CB as i16);
        let v_m_cr_cr_corr = _mm256_set1_epi16((super::M_CR_CR - 32768) as i16);

        let v_128 = _mm256_set1_epi16(128);
        let v_chroma_lo = _mm256_set1_epi16(16);
        let v_chroma_hi = _mm256_set1_epi16(240);
        let v_luma_lo = _mm256_set1_epi16(16);
        let v_luma_hi = _mm256_set1_epi16(235);

        // Luma pass. Must run before the chroma pass because it needs the
        // not-yet-converted chroma values.
        for cy_idx in 0..ch {
            let y_row0 = cy_idx * 2 * width;
            let y_row1 = y_row0 + width;
            let c_row = cy_idx * cw;
            let mut cx = 0usize;

            // 16 chroma samples -> 32 luma samples on each of two rows.
            while cx + 16 <= cw {
                let c_at = c_row + cx;
                let cb_u8 = _mm_loadu_si128(cb[c_at..c_at + 16].as_ptr() as *const _);
                let cr_u8 = _mm_loadu_si128(cr[c_at..c_at + 16].as_ptr() as *const _);
                let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
                let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
                let cbl = _mm256_sub_epi16(cb_i16, v_128);
                let crl = _mm256_sub_epi16(cr_i16, v_128);
                let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
                let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
                let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);

                // Duplicate every delta so it lines up with the two luma
                // columns it covers. The unpacks work per 128-bit lane:
                //   dup_lo = d0 d0 .. d3 d3   | d8  d8  .. d11 d11
                //   dup_hi = d4 d4 .. d7 d7   | d12 d12 .. d15 d15
                // and the lane permutes regroup them into d0..d7 / d8..d15.
                let dup_lo = _mm256_unpacklo_epi16(dy_chroma, dy_chroma);
                let dup_hi = _mm256_unpackhi_epi16(dy_chroma, dy_chroma);
                let dy_luma_lo = _mm256_permute2x128_si256::<0x20>(dup_lo, dup_hi);
                let dy_luma_hi = _mm256_permute2x128_si256::<0x31>(dup_lo, dup_hi);

                for row_off in [y_row0, y_row1] {
                    let start = row_off + cx * 2;
                    let luma = &mut y[start..start + 32];
                    let y_u8 = _mm256_loadu_si256(luma.as_ptr() as *const _);
                    let y_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_u8));
                    let y_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(y_u8));
                    let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
                    let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);
                    let y_lo_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
                    let y_hi_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);
                    // `packus` interleaves the two inputs per 128-bit lane;
                    // the 64-bit permute restores ascending sample order.
                    let packed = _mm256_packus_epi16(y_lo_out, y_hi_out);
                    let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
                    _mm256_storeu_si256(luma.as_mut_ptr() as *mut _, packed);
                }
                cx += 16;
            }

            // Remaining chroma columns of this row, one at a time.
            while cx < cw {
                let cb_idx = c_row + cx;
                let cbl = cb[cb_idx] as i32 - 128;
                let crl = cr[cb_idx] as i32 - 128;
                let delta = (super::M_Y_CB * cbl + super::M_Y_CR * crl + super::Q15_ROUND)
                    >> super::Q15;
                let xi = cx * 2;
                for row_off in [y_row0, y_row1] {
                    for sub in 0..2 {
                        let idx = row_off + xi + sub;
                        y[idx] = clamp_y(y[idx] as i32 + delta);
                    }
                }
                cx += 1;
            }
        }

        // Chroma pass over the planes as flat arrays.
        let total_c = cb.len();
        let mut i = 0usize;
        while i + 16 <= total_c {
            let cb_u8 = _mm_loadu_si128(cb[i..i + 16].as_ptr() as *const _);
            let cr_u8 = _mm_loadu_si128(cr[i..i + 16].as_ptr() as *const _);
            let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
            let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
            let cbl = _mm256_sub_epi16(cb_i16, v_128);
            let crl = _mm256_sub_epi16(cr_i16, v_128);

            // new = in + in * (M_diag - 1.0) + other * M_cross, then re-bias.
            let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
            let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
            let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
            let new_cb = _mm256_add_epi16(new_cb, v_128);
            let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
            let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
            let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
            let new_cr = _mm256_add_epi16(new_cr, v_128);

            let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
            let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);

            // Packing a vector with itself leaves samples 0..8 in 64-bit
            // word 0 and samples 8..16 in word 2; the permute moves them
            // into the low 128 bits, which is all that gets stored.
            let cb_packed = _mm256_packus_epi16(new_cb, new_cb);
            let cr_packed = _mm256_packus_epi16(new_cr, new_cr);
            let cb_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cb_packed);
            let cr_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cr_packed);
            _mm_storeu_si128(
                cb[i..i + 16].as_mut_ptr() as *mut _,
                _mm256_castsi256_si128(cb_packed),
            );
            _mm_storeu_si128(
                cr[i..i + 16].as_mut_ptr() as *mut _,
                _mm256_castsi256_si128(cr_packed),
            );
            i += 16;
        }

        // Remaining chroma pairs, one at a time.
        while i < total_c {
            let cbl = cb[i] as i32 - 128;
            let crl = cr[i] as i32 - 128;
            let new_cb =
                (super::M_CB_CB * cbl + super::M_CB_CR * crl + super::Q15_ROUND) >> super::Q15;
            let new_cr =
                (super::M_CR_CB * cbl + super::M_CR_CR * crl + super::Q15_ROUND) >> super::Q15;
            cb[i] = clamp_c(new_cb + 128);
            cr[i] = clamp_c(new_cr + 128);
            i += 1;
        }
    }
}
/// Returns a copy of a planar YUV 4:2:0 `frame` recolored from BT.601 to
/// BT.709.
///
/// The frame data is read as a `width * height` luma plane followed by two
/// chroma planes of `width * height / 4` bytes each; any bytes beyond that
/// are ignored. The result carries the converted planes, the same
/// dimensions, format and `pts`, and is tagged `ColorSpace::Bt709`.
///
/// # Errors
///
/// Fails if the frame data is shorter than the three planes require
/// (including when that size is not representable in `usize`), or if either
/// dimension is odd.
pub(super) fn recolor_yuv420p_bt601_to_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
    let w = frame.width as usize;
    let h = frame.height as usize;

    // Checked arithmetic: a wrapped size could otherwise pass the length
    // test and turn into a slice-index panic further down.
    let required = w
        .checked_mul(h)
        .and_then(|luma| luma.checked_add(2 * (luma / 4)));
    let total = match required {
        Some(total) if frame.data.len() >= total => total,
        _ => bail!(
            "frame data too short for yuv420p {}x{}: {} bytes",
            w,
            h,
            frame.data.len()
        ),
    };
    if !w.is_multiple_of(2) || !h.is_multiple_of(2) {
        bail!(
            "BT.601→BT.709 requires even dimensions for 4:2:0 subsampling; got {}x{}",
            w,
            h
        );
    }

    // Cannot overflow: `total >= w * h` was computed with checked arithmetic.
    let y_size = w * h;
    let c_size = y_size / 4;

    // Copy the source once and convert inside the output buffer, instead of
    // copying each plane into its own Vec and then again into the output.
    let mut out = BytesMut::with_capacity(total);
    out.extend_from_slice(&frame.data[..total]);
    {
        let planes: &mut [u8] = &mut out[..];
        let (y, chroma) = planes.split_at_mut(y_size);
        let (cb, cr) = chroma.split_at_mut(c_size);
        bt601_to_bt709_planes(y, cb, cr, w, h);
    }

    Ok(VideoFrame::new(
        out.freeze(),
        frame.width,
        frame.height,
        frame.format,
        ColorSpace::Bt709,
        frame.pts,
    ))
}