use anyhow::{Result, bail};
use bytes::BytesMut;
use crate::frame::{PixelFormat, VideoFrame};
pub fn scale_frame(
frame: &VideoFrame,
target_width: u32,
target_height: u32,
) -> Result<VideoFrame> {
if frame.width == target_width && frame.height == target_height {
return Ok(frame.clone());
}
match frame.format {
PixelFormat::Yuv420p => scale_frame_8bit(frame, target_width, target_height),
PixelFormat::Yuv420p10le => scale_frame_10bit(frame, target_width, target_height),
_ => bail!(
"scaling only implemented for Yuv420p / Yuv420p10le; got {:?}",
frame.format
),
}
}
fn scale_frame_8bit(
frame: &VideoFrame,
target_width: u32,
target_height: u32,
) -> Result<VideoFrame> {
let src_w = frame.width as usize;
let src_h = frame.height as usize;
let dst_w = target_width as usize;
let dst_h = target_height as usize;
let src_y_size = src_w * src_h;
let dst_y_size = dst_w * dst_h;
let dst_uv_size = dst_y_size / 4;
let mut out = BytesMut::with_capacity(dst_y_size + dst_uv_size * 2);
let y_plane = &frame.data[..src_y_size];
out.extend(bilinear_scale_plane(y_plane, src_w, src_h, dst_w, dst_h));
let u_offset = src_y_size;
let u_plane = &frame.data[u_offset..u_offset + src_y_size / 4];
out.extend(bilinear_scale_plane(
u_plane,
src_w / 2,
src_h / 2,
dst_w / 2,
dst_h / 2,
));
let v_offset = u_offset + src_y_size / 4;
let v_plane = &frame.data[v_offset..v_offset + src_y_size / 4];
out.extend(bilinear_scale_plane(
v_plane,
src_w / 2,
src_h / 2,
dst_w / 2,
dst_h / 2,
));
Ok(VideoFrame::new(
out.freeze(),
target_width,
target_height,
frame.format,
frame.color_space,
frame.pts,
))
}
/// Scales a planar 4:2:0 frame whose samples are stored as 16-bit
/// little-endian words.
///
/// `frame.data` is read as a Y plane followed by U and V planes of
/// `(width / 2) * (height / 2)` samples each. Each plane is decoded with
/// `super::read_u16le`, resampled with [`bilinear_scale_plane_u16`], and
/// re-encoded with `super::write_u16le`. The output frame keeps the input's
/// format, color space and presentation timestamp.
///
/// # Errors
///
/// Returns an error when `frame.data` is shorter than the three planes
/// require.
fn scale_frame_10bit(
    frame: &VideoFrame,
    target_width: u32,
    target_height: u32,
) -> Result<VideoFrame> {
    const BYTES_PER_SAMPLE: usize = 2;

    let (src_w, src_h) = (frame.width as usize, frame.height as usize);
    let (dst_w, dst_h) = (target_width as usize, target_height as usize);
    let (src_cw, src_ch) = (src_w / 2, src_h / 2);
    let (dst_cw, dst_ch) = (dst_w / 2, dst_h / 2);

    // Byte offsets of the plane boundaries inside `frame.data`.
    let luma_bytes = src_w * src_h * BYTES_PER_SAMPLE;
    let chroma_bytes = src_cw * src_ch * BYTES_PER_SAMPLE;
    let u_start = luma_bytes;
    let v_start = u_start + chroma_bytes;
    let v_end = v_start + chroma_bytes;

    if frame.data.len() < v_end {
        bail!(
            "10-bit frame data too short for {}x{}: {} bytes",
            src_w,
            src_h,
            frame.data.len()
        );
    }

    let y_src = super::read_u16le(&frame.data[..u_start]);
    let u_src = super::read_u16le(&frame.data[u_start..v_start]);
    let v_src = super::read_u16le(&frame.data[v_start..v_end]);

    let y_scaled = bilinear_scale_plane_u16(&y_src, src_w, src_h, dst_w, dst_h);
    let u_scaled = bilinear_scale_plane_u16(&u_src, src_cw, src_ch, dst_cw, dst_ch);
    let v_scaled = bilinear_scale_plane_u16(&v_src, src_cw, src_ch, dst_cw, dst_ch);

    let out_bytes = (dst_w * dst_h + 2 * (dst_cw * dst_ch)) * BYTES_PER_SAMPLE;
    let mut out = BytesMut::with_capacity(out_bytes);
    for plane in [&y_scaled, &u_scaled, &v_scaled] {
        super::write_u16le(&mut out, plane);
    }

    Ok(VideoFrame::new(
        out.freeze(),
        target_width,
        target_height,
        frame.format,
        frame.color_space,
        frame.pts,
    ))
}
/// Portable bilinear resampler for a plane of 16-bit samples.
///
/// `src` is a row-major plane of `src_w * src_h` samples. Every output sample
/// blends the four source samples surrounding its position (scaled by
/// `src / dst` on each axis and clamped to the last row/column), is rounded to
/// the nearest integer and clamped to `0..=1023`.
///
/// Returns a row-major plane of `dst_w * dst_h` samples.
pub fn bilinear_scale_plane_u16_scalar(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    /// Maps an output index to its two clamped source taps and blend fraction.
    fn taps(out_index: usize, ratio: f64, src_len: usize) -> (usize, usize, f64) {
        let pos = (out_index as f64 * ratio).min((src_len - 1) as f64);
        let near = pos as usize;
        let far = (near + 1).min(src_len - 1);
        (near, far, pos - near as f64)
    }

    let step_x = src_w as f64 / dst_w as f64;
    let step_y = src_h as f64 / dst_h as f64;
    let mut scaled = Vec::with_capacity(dst_w * dst_h);

    for row in 0..dst_h {
        let (y0, y1, fy) = taps(row, step_y, src_h);
        let (upper, lower) = (y0 * src_w, y1 * src_w);
        for col in 0..dst_w {
            let (x0, x1, fx) = taps(col, step_x, src_w);
            let p00 = f64::from(src[upper + x0]);
            let p10 = f64::from(src[upper + x1]);
            let p01 = f64::from(src[lower + x0]);
            let p11 = f64::from(src[lower + x1]);
            let blended = p00 * (1.0 - fx) * (1.0 - fy)
                + p10 * fx * (1.0 - fy)
                + p01 * (1.0 - fx) * fy
                + p11 * fx * fy;
            scaled.push(blended.round().clamp(0.0, 1023.0) as u16);
        }
    }
    scaled
}
/// Bilinearly resamples a row-major plane of 16-bit samples.
///
/// On x86/x86_64 CPUs with AVX2, planes at least 16 samples wide are handled
/// by the vectorised kernel; everything else goes through
/// [`bilinear_scale_plane_u16_scalar`].
///
/// The AVX2 kernel reads `src` without bounds checks and divides by the
/// destination dimensions, so it is only selected when the destination is
/// non-empty, the source dimensions are non-zero and `src` holds at least
/// `src_w * src_h` samples. Inputs that fail these checks fall back to the
/// scalar path, which returns an empty plane for an empty destination and
/// panics (rather than reading out of bounds) on an undersized `src`.
pub fn bilinear_scale_plane_u16(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let src_covers_plane = src_w > 0
            && src_h > 0
            && matches!(src_w.checked_mul(src_h), Some(n) if n <= src.len());
        if dst_w >= 16
            && dst_h > 0
            && src_covers_plane
            && std::is_x86_feature_detected!("avx2")
        {
            // SAFETY: AVX2 support was just detected at runtime, and `src` was
            // verified to contain every sample the kernel can address.
            return unsafe {
                bilinear_scale_plane_u16_avx2(src, src_w, src_h, dst_w, dst_h)
            };
        }
    }
    bilinear_scale_plane_u16_scalar(src, src_w, src_h, dst_w, dst_h)
}
/// AVX2 bilinear resampler for a row-major plane of 16-bit samples.
///
/// Source positions are stepped in 32.32 fixed point. Interpolation weights
/// are Q15 values applied with `_mm256_mulhrs_epi16` (a signed 16-bit
/// multiply), producing 16 output samples per iteration; columns left over
/// after the last full group of 16 are computed in floating point from the
/// same Q15 weights. Every result is clamped to `0..=1023`.
///
/// Returns a row-major plane of `dst_w * dst_h` samples, which is empty when
/// either destination dimension is zero.
///
/// # Panics
///
/// Panics if `src_w` or `src_h` is zero, or if `src` holds fewer than
/// `src_w * src_h` samples (only checked for a non-empty destination).
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_u16_avx2(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    // Nothing to produce; returning here also keeps the step computation
    // below from dividing by zero.
    if dst_w == 0 || dst_h == 0 {
        return Vec::new();
    }
    // The gather loop uses unchecked indexing, so establish its bounds once.
    assert!(
        src_w > 0
            && src_h > 0
            && matches!(src_w.checked_mul(src_h), Some(n) if n <= src.len()),
        "source plane must be non-empty and hold at least src_w * src_h samples"
    );

    // SAFETY: AVX2 availability is the caller's obligation. Every unchecked
    // read uses `row + x` with `row <= (src_h - 1) * src_w` and
    // `x <= src_w - 1`, which is below `src_w * src_h <= src.len()` as asserted
    // above. Vector loads from the weight tables and stores into `dst` cover
    // 16 lanes starting at `dx`, and only run while `dx + 16 <= dst_w`.
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u16; dst_w * dst_h];
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per-column source taps and Q15 weights, shared by every row.
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut x1s: Vec<u32> = vec![0; dst_w];
        let mut fxs_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fxs_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step;
            let x0 = ((sx_32_32 >> 32) as usize).min(src_w - 1);
            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u32;
            let fx_q15 = ((fx_q16 as i32) >> 1).min(32767) as i16;
            if x0 >= src_w - 1 {
                // Last column: both taps coincide, so the fraction is dropped.
                x0s[dx] = (src_w - 1) as u32;
                x1s[dx] = (src_w - 1) as u32;
                fxs_q15[dx] = 0;
                one_minus_fxs_q15[dx] = 32767;
            } else {
                x0s[dx] = x0 as u32;
                x1s[dx] = (x0 + 1) as u32;
                fxs_q15[dx] = fx_q15;
                one_minus_fxs_q15[dx] = 32767 - fx_q15;
            }
        }

        let v_max = _mm256_set1_epi16(1023);
        let v_zero = _mm256_setzero_si256();

        for dy in 0..dst_h {
            let sy_32_32 = (dy as u64) * y_step;
            let y0 = ((sy_32_32 >> 32) as usize).min(src_h - 1);
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;
            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;
            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // Gather the four neighbours of 16 consecutive output columns.
                let mut p00_buf = [0u16; 16];
                let mut p10_buf = [0u16; 16];
                let mut p01_buf = [0u16; 16];
                let mut p11_buf = [0u16; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = x1s[dx + i] as usize;
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }
                let p00 = _mm256_loadu_si256(p00_buf.as_ptr() as *const __m256i);
                let p10 = _mm256_loadu_si256(p10_buf.as_ptr() as *const __m256i);
                let p01 = _mm256_loadu_si256(p01_buf.as_ptr() as *const __m256i);
                let p11 = _mm256_loadu_si256(p11_buf.as_ptr() as *const __m256i);
                let v_fx = _mm256_loadu_si256(fxs_q15.as_ptr().add(dx) as *const __m256i);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fxs_q15.as_ptr().add(dx) as *const __m256i);

                // Horizontal blend of each row pair, then vertical blend.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );
                let out_i16 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );
                let clamped = _mm256_min_epi16(_mm256_max_epi16(out_i16, v_zero), v_max);
                _mm256_storeu_si256(
                    dst.as_mut_ptr().add(dst_row + dx) as *mut __m256i,
                    clamped,
                );
                dx += 16;
            }

            // Remaining columns (fewer than 16), computed one at a time.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = x1s[dx] as usize;
                let fx = fxs_q15[dx] as f64 / 32768.0;
                let fy = fy_q15 as f64 / 32768.0;
                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;
                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round().clamp(0.0, 1023.0) as u16;
                dx += 1;
            }
        }
        dst
    }
}
/// Bilinearly resamples a row-major plane of 8-bit samples.
///
/// On x86/x86_64 CPUs with AVX2, planes at least 16 samples wide are handled
/// by the vectorised kernel; everything else goes through
/// [`bilinear_scale_plane_scalar`].
///
/// The AVX2 kernel reads `src` without bounds checks and divides by the
/// destination dimensions, so it is only selected when the destination is
/// non-empty, the source dimensions are non-zero and `src` holds at least
/// `src_w * src_h` samples. Inputs that fail these checks fall back to the
/// scalar path, which returns an empty plane for an empty destination and
/// panics (rather than reading out of bounds) on an undersized `src`.
pub fn bilinear_scale_plane(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let src_covers_plane = src_w > 0
            && src_h > 0
            && matches!(src_w.checked_mul(src_h), Some(n) if n <= src.len());
        if dst_w >= 16
            && dst_h > 0
            && src_covers_plane
            && std::is_x86_feature_detected!("avx2")
        {
            // SAFETY: AVX2 support was just detected at runtime, and `src` was
            // verified to contain every sample the kernel can address.
            return unsafe { bilinear_scale_plane_avx2(src, src_w, src_h, dst_w, dst_h) };
        }
    }
    bilinear_scale_plane_scalar(src, src_w, src_h, dst_w, dst_h)
}
/// Portable bilinear resampler for a plane of 8-bit samples.
///
/// `src` is a row-major plane of `src_w * src_h` samples. Every output sample
/// blends the four source samples surrounding its position (scaled by
/// `src / dst` on each axis and clamped to the last row/column) and is rounded
/// to the nearest integer.
///
/// Returns a row-major plane of `dst_w * dst_h` samples.
pub fn bilinear_scale_plane_scalar(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    /// Maps an output index to its two clamped source taps and blend fraction.
    fn taps(out_index: usize, ratio: f64, src_len: usize) -> (usize, usize, f64) {
        let pos = (out_index as f64 * ratio).min((src_len - 1) as f64);
        let near = pos as usize;
        let far = (near + 1).min(src_len - 1);
        (near, far, pos - near as f64)
    }

    let step_x = src_w as f64 / dst_w as f64;
    let step_y = src_h as f64 / dst_h as f64;
    let mut scaled = Vec::with_capacity(dst_w * dst_h);

    for row in 0..dst_h {
        let (y0, y1, fy) = taps(row, step_y, src_h);
        let (upper, lower) = (y0 * src_w, y1 * src_w);
        for col in 0..dst_w {
            let (x0, x1, fx) = taps(col, step_x, src_w);
            let p00 = f64::from(src[upper + x0]);
            let p10 = f64::from(src[upper + x1]);
            let p01 = f64::from(src[lower + x0]);
            let p11 = f64::from(src[lower + x1]);
            let blended = p00 * (1.0 - fx) * (1.0 - fy)
                + p10 * fx * (1.0 - fy)
                + p01 * (1.0 - fx) * fy
                + p11 * fx * fy;
            scaled.push(blended.round() as u8);
        }
    }
    scaled
}
/// AVX2 bilinear resampler for a row-major plane of 8-bit samples.
///
/// Source positions are stepped in 32.32 fixed point. Samples are widened to
/// 16 bits and shifted left by 7 for extra precision, blended with Q15
/// weights via `_mm256_mulhrs_epi16`, then rounded, shifted back and packed to
/// bytes with unsigned saturation - 16 output samples per iteration. Columns
/// left over after the last full group of 16 are computed in floating point
/// from the 16-bit fractional parts of the source positions.
///
/// Returns a row-major plane of `dst_w * dst_h` samples, which is empty when
/// either destination dimension is zero.
///
/// # Panics
///
/// Panics if `src_w` or `src_h` is zero, or if `src` holds fewer than
/// `src_w * src_h` samples (only checked for a non-empty destination).
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_avx2(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    // Nothing to produce; returning here also keeps the step computation
    // below from dividing by zero.
    if dst_w == 0 || dst_h == 0 {
        return Vec::new();
    }
    // The gather loop uses unchecked indexing, so establish its bounds once.
    assert!(
        src_w > 0
            && src_h > 0
            && matches!(src_w.checked_mul(src_h), Some(n) if n <= src.len()),
        "source plane must be non-empty and hold at least src_w * src_h samples"
    );

    // SAFETY: AVX2 availability is the caller's obligation. Every unchecked
    // read uses `row + x` with `row <= (src_h - 1) * src_w` and
    // `x <= src_w - 1`, which is below `src_w * src_h <= src.len()` as asserted
    // above. Vector loads from the weight tables and stores into `dst` cover
    // 16 lanes starting at `dx`, and only run while `dx + 16 <= dst_w`.
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u8; dst_w * dst_h];
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per-column left tap, 16-bit fraction (used by the scalar tail) and
        // Q15 weights (used by the vector body), shared by every row.
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut fxs: Vec<u16> = vec![0; dst_w];
        let mut fx_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fx_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step;
            let x0 = ((sx_32_32 >> 32) as usize).min(src_w - 1);
            if x0 >= src_w - 1 {
                // Last column: both taps coincide, so the fraction is dropped.
                x0s[dx] = (src_w - 1) as u32;
                fxs[dx] = 0;
            } else {
                x0s[dx] = x0 as u32;
                fxs[dx] = ((sx_32_32 >> 16) & 0xFFFF) as u16;
            }
            let q15 = ((fxs[dx] as i32) >> 1).min(32767) as i16;
            fx_q15[dx] = q15;
            one_minus_fx_q15[dx] = 32767 - q15;
        }

        for dy in 0..dst_h {
            let sy_32_32 = (dy as u64) * y_step;
            let y0 = ((sy_32_32 >> 32) as usize).min(src_h - 1);
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;
            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;
            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // Gather the four neighbours of 16 consecutive output columns.
                let mut p00_buf = [0u8; 16];
                let mut p10_buf = [0u8; 16];
                let mut p01_buf = [0u8; 16];
                let mut p11_buf = [0u8; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = (x0 + 1).min(src_w - 1);
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }

                // Widen to 16 bits and scale by 128 (255 << 7 still fits i16).
                let p00 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p00_buf.as_ptr() as *const __m128i,
                )));
                let p10 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p10_buf.as_ptr() as *const __m128i,
                )));
                let p01 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p01_buf.as_ptr() as *const __m128i,
                )));
                let p11 = _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm_loadu_si128(
                    p11_buf.as_ptr() as *const __m128i,
                )));

                let v_fx = _mm256_loadu_si256(fx_q15.as_ptr().add(dx) as *const __m256i);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fx_q15.as_ptr().add(dx) as *const __m256i);

                // Horizontal blend of each row pair, then vertical blend.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );
                let out_q7 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );

                // Undo the x128 scaling with round-to-nearest.
                let rounded = _mm256_add_epi16(out_q7, _mm256_set1_epi16(64));
                let shifted = _mm256_srai_epi16::<7>(rounded);
                // `packus` works per 128-bit lane, so the packed bytes for
                // columns 0..8 and 8..16 land in 64-bit lanes 0 and 2; the
                // permute moves them next to each other in the low half.
                let packed = _mm256_packus_epi16(shifted, shifted);
                let packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(packed);
                _mm_storeu_si128(
                    dst.as_mut_ptr().add(dst_row + dx) as *mut __m128i,
                    _mm256_castsi256_si128(packed),
                );
                dx += 16;
            }

            // Remaining columns (fewer than 16), computed one at a time.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = (x0 + 1).min(src_w - 1);
                let fx = fxs[dx] as f64 / 65536.0;
                let fy = fy_q16 as f64 / 65536.0;
                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;
                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round() as u8;
                dx += 1;
            }
        }
        dst
    }
}