use super::h264_params::{Pps, Sps};
use crate::VideoError;
/// Weaves a top and bottom RGB24 field into one progressive frame of
/// `height * 2` rows (top field on even rows, bottom field on odd rows).
///
/// Rows missing from a too-short field are left as zeros rather than
/// panicking, so truncated field buffers degrade to black lines.
pub fn deinterlace_fields(
    top_field: &[u8],
    bottom_field: &[u8],
    width: usize,
    height: usize,
) -> Vec<u8> {
    let stride = width * 3;
    let mut woven = vec![0u8; height * 2 * stride];
    // Zero-width frames have nothing to copy (and a zero chunk size is invalid).
    if stride == 0 {
        return woven;
    }
    for (field_row, pair) in woven.chunks_exact_mut(2 * stride).enumerate() {
        let (even_row, odd_row) = pair.split_at_mut(stride);
        let src = field_row * stride;
        if let Some(top) = top_field.get(src..src + stride) {
            even_row.copy_from_slice(top);
        }
        if let Some(bot) = bottom_field.get(src..src + stride) {
            odd_row.copy_from_slice(bot);
        }
    }
    woven
}
/// Builds the macroblock-to-slice-group map (H.264 FMO) from the PPS
/// slice-group parameters and the SPS picture dimensions.
///
/// Returns one byte per macroblock in raster order holding that MB's
/// slice-group id. Map types 3–5 are approximated by a simple two-group
/// split at the slice-group change rate.
pub fn generate_slice_group_map(pps: &Pps, sps: &Sps) -> Vec<u8> {
    let mb_width = sps.pic_width_in_mbs as usize;
    let mb_height = sps.pic_height_in_map_units as usize;
    let total = mb_width * mb_height;
    let mut map = vec![0u8; total];
    // With a single slice group every MB belongs to group 0.
    if pps.num_slice_groups <= 1 {
        return map;
    }
    let groups = pps.num_slice_groups as usize;
    match pps.slice_group_map_type {
        // Type 0, interleaved: groups claim consecutive runs round-robin.
        0 => {
            let mut idx = 0usize;
            'fill: while idx < total {
                for g in 0..groups {
                    let run = pps.run_length_minus1.get(g).map_or(1, |&r| r as usize + 1);
                    for _ in 0..run {
                        if idx >= total {
                            break 'fill;
                        }
                        map[idx] = g as u8;
                        idx += 1;
                    }
                }
            }
        }
        // Type 1, dispersed: scatter groups across the picture.
        1 => {
            for (idx, cell) in map.iter_mut().enumerate() {
                let x = idx % mb_width;
                let y = idx / mb_width;
                *cell = ((x + (y * groups) / 2) % groups) as u8;
            }
        }
        // Type 2, foreground rectangles over a background group.
        2 => {
            map.fill((groups - 1) as u8);
            // Lower-numbered groups take priority, so paint them last.
            for g in (0..groups.saturating_sub(1)).rev() {
                let (Some(&tl), Some(&br)) = (pps.top_left.get(g), pps.bottom_right.get(g))
                else {
                    continue;
                };
                let (tl, br) = (tl as usize, br as usize);
                let y_last = (br / mb_width).min(mb_height.saturating_sub(1));
                let x_last = (br % mb_width).min(mb_width.saturating_sub(1));
                for y in (tl / mb_width)..=y_last {
                    for x in (tl % mb_width)..=x_last {
                        let cell = y * mb_width + x;
                        if cell < total {
                            map[cell] = g as u8;
                        }
                    }
                }
            }
        }
        // Types 3–5 (box-out / raster / wipe): approximated as a two-group
        // split at the change rate.
        3..=5 => {
            let boundary = (pps.slice_group_change_rate as usize).min(total);
            for (idx, cell) in map.iter_mut().enumerate() {
                *cell = u8::from(idx >= boundary);
            }
        }
        // Type 6: explicit per-MB assignment from the PPS.
        6 => {
            for (cell, &gid) in map.iter_mut().zip(pps.slice_group_id.iter()) {
                *cell = gid as u8;
            }
        }
        // Unknown map types leave every MB in group 0.
        _ => {}
    }
    map
}
/// Returns the `(width, height)` of the chroma planes for a given
/// `chroma_format_idc` (H.264 semantics). Unknown values fall back to 4:2:0.
pub fn chroma_dimensions(width: usize, height: usize, chroma_format: u32) -> (usize, usize) {
    match chroma_format {
        // Monochrome: no chroma planes at all.
        0 => (0, 0),
        // 4:2:2: subsampled horizontally only.
        2 => (width / 2, height),
        // 4:4:4: full-resolution chroma.
        3 => (width, height),
        // 4:2:0 — the explicit idc == 1 case and the fallback: halved in both axes.
        _ => (width / 2, height / 2),
    }
}
/// Converts planar YUV 4:2:2 (chroma halved horizontally only) into packed
/// RGB24 using the same ×128 fixed-point coefficients as the other 8-bit paths.
///
/// # Errors
/// Returns `VideoError::Codec` when a plane is smaller than the frame needs.
pub fn yuv422_to_rgb8(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    width: usize,
    height: usize,
) -> Result<Vec<u8>, VideoError> {
    let expected_y = width * height;
    let expected_uv = (width / 2) * height;
    if y_plane.len() < expected_y {
        return Err(VideoError::Codec(format!(
            "Y plane too small: expected {expected_y}, got {}",
            y_plane.len()
        )));
    }
    if u_plane.len() < expected_uv || v_plane.len() < expected_uv {
        return Err(VideoError::Codec(format!(
            "UV planes too small for 4:2:2: expected {expected_uv}, got U={} V={}",
            u_plane.len(),
            v_plane.len()
        )));
    }
    let mut rgb = vec![0u8; width * height * 3];
    // Zero-width frames produce an empty buffer; bail before the zero-sized
    // chunk iteration below.
    if width == 0 {
        return Ok(rgb);
    }
    let uv_stride = width / 2;
    for (row, out_row) in rgb.chunks_exact_mut(width * 3).enumerate() {
        let y_row = &y_plane[row * width..(row + 1) * width];
        let u_row = &u_plane[row * uv_stride..];
        let v_row = &v_plane[row * uv_stride..];
        for (col, px) in out_row.chunks_exact_mut(3).enumerate() {
            let y_val = y_row[col] as i16;
            let u_val = u_row[col / 2] as i16 - 128;
            let v_val = v_row[col / 2] as i16 - 128;
            px[0] = (y_val + ((v_val * 179) >> 7)).clamp(0, 255) as u8;
            px[1] = (y_val - ((u_val * 44 + v_val * 91) >> 7)).clamp(0, 255) as u8;
            px[2] = (y_val + ((u_val * 227) >> 7)).clamp(0, 255) as u8;
        }
    }
    Ok(rgb)
}
/// Converts planar YUV 4:4:4 (one chroma sample per pixel) into packed RGB24.
///
/// # Errors
/// Returns `VideoError::Codec` when any plane holds fewer than
/// `width * height` samples.
pub fn yuv444_to_rgb8(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    width: usize,
    height: usize,
) -> Result<Vec<u8>, VideoError> {
    let expected = width * height;
    if y_plane.len() < expected {
        return Err(VideoError::Codec(format!(
            "Y plane too small: expected {expected}, got {}",
            y_plane.len()
        )));
    }
    if u_plane.len() < expected || v_plane.len() < expected {
        return Err(VideoError::Codec(format!(
            "UV planes too small for 4:4:4: expected {expected}, got U={} V={}",
            u_plane.len(),
            v_plane.len()
        )));
    }
    let mut rgb = vec![0u8; expected * 3];
    // Walk luma/chroma samples in lockstep with the output pixel triplets;
    // the zip naturally stops after `expected` pixels.
    for (((&luma, &cb), &cr), px) in y_plane
        .iter()
        .zip(u_plane.iter())
        .zip(v_plane.iter())
        .zip(rgb.chunks_exact_mut(3))
    {
        let y_val = luma as i16;
        let u_val = cb as i16 - 128;
        let v_val = cr as i16 - 128;
        px[0] = (y_val + ((v_val * 179) >> 7)).clamp(0, 255) as u8;
        px[1] = (y_val - ((u_val * 44 + v_val * 91) >> 7)).clamp(0, 255) as u8;
        px[2] = (y_val + ((u_val * 227) >> 7)).clamp(0, 255) as u8;
    }
    Ok(rgb)
}
/// Expands a monochrome (luma-only) plane into gray RGB24 triplets.
///
/// # Errors
/// Returns `VideoError::Codec` when `y_plane` holds fewer than
/// `width * height` samples.
pub fn mono_to_rgb8(y_plane: &[u8], width: usize, height: usize) -> Result<Vec<u8>, VideoError> {
    let expected = width * height;
    if y_plane.len() < expected {
        return Err(VideoError::Codec(format!(
            "Y plane too small: expected {expected}, got {}",
            y_plane.len()
        )));
    }
    // Gray pixels: replicate each luma sample into R, G and B.
    Ok(y_plane[..expected].iter().flat_map(|&v| [v, v, v]).collect())
}
/// Dispatches to the converter matching `chroma_format_idc` (H.264 semantics:
/// 0 = monochrome, 1 = 4:2:0, 2 = 4:2:2, 3 = 4:4:4). Unknown values are
/// treated as 4:2:0, the overwhelmingly common case.
pub(crate) fn yuv_to_rgb8_by_format(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    width: usize,
    height: usize,
    chroma_format_idc: u32,
) -> Result<Vec<u8>, VideoError> {
    match chroma_format_idc {
        0 => mono_to_rgb8(y_plane, width, height),
        2 => yuv422_to_rgb8(y_plane, u_plane, v_plane, width, height),
        3 => yuv444_to_rgb8(y_plane, u_plane, v_plane, width, height),
        // 4:2:0 covers both the explicit idc == 1 case and the fallback.
        _ => yuv420_to_rgb8(y_plane, u_plane, v_plane, width, height),
    }
}
/// Converts planar YUV with arbitrary chroma subsampling factors (`sub_w`,
/// `sub_h`; clamped to at least 1) into packed RGB24.
///
/// Out-of-range plane reads fall back to neutral values (128), so undersized
/// planes yield gray rather than panicking.
pub fn yuv_to_rgb8_generic(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    w: usize,
    h: usize,
    sub_w: usize,
    sub_h: usize,
) -> Vec<u8> {
    let (sub_w, sub_h) = (sub_w.max(1), sub_h.max(1));
    let uv_stride = w / sub_w;
    let mut rgb = vec![0u8; w * h * 3];
    // Safe sampler: anything past the end of a plane reads as neutral 128.
    let sample = |plane: &[u8], at: usize| plane.get(at).copied().unwrap_or(128) as i16;
    let mut idx = 0usize;
    for row in 0..h {
        let y_base = row * w;
        let uv_base = (row / sub_h) * uv_stride;
        for col in 0..w {
            let luma = sample(y, y_base + col);
            let cb = sample(u, uv_base + col / sub_w) - 128;
            let cr = sample(v, uv_base + col / sub_w) - 128;
            rgb[idx] = (luma + ((cr * 179) >> 7)).clamp(0, 255) as u8;
            rgb[idx + 1] = (luma - ((cb * 44 + cr * 91) >> 7)).clamp(0, 255) as u8;
            rgb[idx + 2] = (luma + ((cb * 227) >> 7)).clamp(0, 255) as u8;
            idx += 3;
        }
    }
    rgb
}
/// Converts high-bit-depth (up to 16-bit) planar YUV 4:2:0 stored in `u16`
/// samples into RGB48, shifting results up to full 16-bit range.
///
/// Coefficients are ×1024 fixed-point values (1613 ≈ 1.575·1024,
/// 192 ≈ 0.187·1024, 479 ≈ 0.468·1024, 1901 ≈ 1.856·1024).
/// NOTE(review): these match BT.709-style constants while the 8-bit paths in
/// this file use BT.601-style ones (179/128 ≈ 1.402) — confirm that split is
/// intended.
///
/// # Errors
/// Returns `VideoError::Codec` when `bit_depth` is outside `1..=16` or a
/// plane is smaller than the frame requires.
pub fn yuv420_p16_to_rgb16(
    y: &[u16],
    u: &[u16],
    v: &[u16],
    w: usize,
    h: usize,
    bit_depth: u8,
) -> Result<Vec<u16>, VideoError> {
    // Validate before shifting: bit_depth == 0 would underflow `bit_depth - 1`
    // and bit_depth >= 32 would overflow `1i32 << bit_depth` (both panic);
    // samples are u16 so anything above 16 is meaningless anyway.
    if bit_depth == 0 || bit_depth > 16 {
        return Err(VideoError::Codec(format!(
            "unsupported bit depth: {bit_depth}"
        )));
    }
    let expected_y = w * h;
    let expected_uv = (w / 2) * (h / 2);
    if y.len() < expected_y {
        return Err(VideoError::Codec(format!(
            "Y16 plane too small: expected {expected_y}, got {}",
            y.len()
        )));
    }
    if u.len() < expected_uv || v.len() < expected_uv {
        return Err(VideoError::Codec(format!(
            "UV16 planes too small: expected {expected_uv}, got U={} V={}",
            u.len(),
            v.len()
        )));
    }
    // Largest representable input sample, the amount to shift results up to
    // 16 bits, and the chroma mid-point for this depth.
    let max_in = (1i32 << bit_depth) - 1;
    let upshift = 16u32.saturating_sub(bit_depth as u32);
    let uv_stride = w / 2;
    let mid = 1i32 << (bit_depth - 1);
    let mut rgb = vec![0u16; w * h * 3];
    for row in 0..h {
        let y_off = row * w;
        let uv_off = (row / 2) * uv_stride;
        for col in 0..w {
            let y_val = y[y_off + col] as i32;
            let u_val = u[uv_off + col / 2] as i32 - mid;
            let v_val = v[uv_off + col / 2] as i32 - mid;
            let r = y_val + ((v_val * 1613) >> 10);
            let g = y_val - ((u_val * 192 + v_val * 479) >> 10);
            let b = y_val + ((u_val * 1901) >> 10);
            let idx = (row * w + col) * 3;
            // Clamp to the input range, then scale up to 16-bit output.
            rgb[idx] = (r.clamp(0, max_in) as u32).wrapping_shl(upshift) as u16;
            rgb[idx + 1] = (g.clamp(0, max_in) as u32).wrapping_shl(upshift) as u16;
            rgb[idx + 2] = (b.clamp(0, max_in) as u32).wrapping_shl(upshift) as u16;
        }
    }
    Ok(rgb)
}
/// Converts planar YUV 4:2:0 (I420) data into a freshly allocated packed
/// RGB24 buffer.
///
/// Frames at least 4 rows tall are converted row-parallel with rayon;
/// shorter frames are converted on the current thread (not worth the
/// scheduling overhead).
///
/// # Errors
/// Returns `VideoError::Codec` when a plane is smaller than
/// `width`/`height` require.
pub fn yuv420_to_rgb8(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    width: usize,
    height: usize,
) -> Result<Vec<u8>, VideoError> {
    let expected_y = width * height;
    let expected_uv = (width / 2) * (height / 2);
    if y_plane.len() < expected_y {
        return Err(VideoError::Codec(format!(
            "Y plane too small: expected {expected_y}, got {}",
            y_plane.len()
        )));
    }
    if u_plane.len() < expected_uv || v_plane.len() < expected_uv {
        return Err(VideoError::Codec(format!(
            "UV planes too small: expected {expected_uv}, got U={} V={}",
            u_plane.len(),
            v_plane.len()
        )));
    }
    let mut rgb = vec![0u8; width * height * 3];
    // A zero width would make `par_chunks_mut(0)` below panic; there is
    // nothing to convert anyway.
    if width == 0 {
        return Ok(rgb);
    }
    let uv_stride = width / 2;
    if height < 4 {
        yuv420_to_rgb8_rows(
            y_plane, u_plane, v_plane, &mut rgb, width, uv_stride, 0, height,
        );
    } else {
        use rayon::prelude::*;
        let row_bytes = width * 3;
        // Each output row is independent, so convert rows in parallel.
        rgb.par_chunks_mut(row_bytes)
            .enumerate()
            .for_each(|(row_idx, row_slice)| {
                yuv420_to_rgb8_rows(
                    y_plane,
                    u_plane,
                    v_plane,
                    row_slice,
                    width,
                    uv_stride,
                    row_idx,
                    row_idx + 1,
                );
            });
    }
    Ok(rgb)
}
/// Converts planar YUV 4:2:0 (I420) into packed RGB24, reusing the caller's
/// `rgb_out` allocation (resized to exactly `width * height * 3` bytes).
///
/// Frames at least 4 rows tall are converted row-parallel with rayon;
/// shorter frames are converted on the current thread.
///
/// # Errors
/// Returns `VideoError::Codec` when a plane is smaller than
/// `width`/`height` require.
pub fn yuv420_to_rgb8_into(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    width: usize,
    height: usize,
    rgb_out: &mut Vec<u8>,
) -> Result<(), VideoError> {
    let expected_y = width * height;
    let expected_uv = (width / 2) * (height / 2);
    if y_plane.len() < expected_y {
        return Err(VideoError::Codec(format!(
            "Y plane too small: expected {expected_y}, got {}",
            y_plane.len()
        )));
    }
    if u_plane.len() < expected_uv || v_plane.len() < expected_uv {
        return Err(VideoError::Codec(format!(
            "UV planes too small: expected {expected_uv}, got U={} V={}",
            u_plane.len(),
            v_plane.len()
        )));
    }
    let total = width * height * 3;
    rgb_out.resize(total, 0);
    // A zero width would make `par_chunks_mut(0)` below panic; the (empty)
    // output is already correctly sized.
    if width == 0 {
        return Ok(());
    }
    let uv_stride = width / 2;
    if height < 4 {
        yuv420_to_rgb8_rows(
            y_plane, u_plane, v_plane, rgb_out, width, uv_stride, 0, height,
        );
    } else {
        use rayon::prelude::*;
        let row_bytes = width * 3;
        // Each output row is independent, so convert rows in parallel.
        rgb_out
            .par_chunks_mut(row_bytes)
            .enumerate()
            .for_each(|(row_idx, row_slice)| {
                yuv420_to_rgb8_rows(
                    y_plane,
                    u_plane,
                    v_plane,
                    row_slice,
                    width,
                    uv_stride,
                    row_idx,
                    row_idx + 1,
                );
            });
    }
    Ok(())
}
/// Converts rows `start_row..end_row` of a YUV 4:2:0 frame into packed RGB24,
/// writing into `rgb_out` (whose first byte corresponds to `start_row`).
///
/// Dispatches at runtime to NEON (aarch64) or AVX2/SSE2 (x86_64) when the CPU
/// supports them, falling back to the scalar implementation otherwise. All
/// paths use the same fixed-point arithmetic, so results are identical.
#[inline]
#[allow(unsafe_code)]
fn yuv420_to_rgb8_rows(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    uv_stride: usize,
    start_row: usize,
    end_row: usize,
) {
    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("neon") {
            // SAFETY: NEON support was verified at runtime just above.
            // NOTE(review): the SIMD paths index the planes unchecked — callers
            // must pass planes large enough for the requested rows.
            unsafe {
                yuv420_to_rgb8_rows_neon(
                    y_plane, u_plane, v_plane, rgb_out, width, uv_stride, start_row, end_row,
                );
            }
            return;
        }
    }
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was verified at runtime just above.
            unsafe {
                yuv420_to_rgb8_rows_avx2(
                    y_plane, u_plane, v_plane, rgb_out, width, uv_stride, start_row, end_row,
                );
            }
            return;
        }
        if is_x86_feature_detected!("sse2") {
            // SAFETY: SSE2 support was verified at runtime just above.
            unsafe {
                yuv420_to_rgb8_rows_sse2(
                    y_plane, u_plane, v_plane, rgb_out, width, uv_stride, start_row, end_row,
                );
            }
            return;
        }
    }
    // Portable fallback (also the reference implementation for the SIMD paths).
    yuv420_to_rgb8_rows_scalar(
        y_plane, u_plane, v_plane, rgb_out, width, uv_stride, start_row, end_row,
    );
}
/// Scalar reference implementation: converts rows `start_row..end_row` of a
/// YUV 4:2:0 frame into packed RGB24, writing into `rgb_out` whose first
/// byte corresponds to `start_row`.
///
/// Coefficients are ×128 fixed-point (179 ≈ 1.40·128, 44 ≈ 0.34·128,
/// 91 ≈ 0.71·128, 227 ≈ 1.77·128); each chroma sample covers a 2×2 block.
#[inline]
fn yuv420_to_rgb8_rows_scalar(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    uv_stride: usize,
    start_row: usize,
    end_row: usize,
) {
    for (out_row, row) in (start_row..end_row).enumerate() {
        let y_base = row * width;
        let uv_base = (row / 2) * uv_stride;
        for col in 0..width {
            let luma = y_plane[y_base + col] as i16;
            let cb = u_plane[uv_base + col / 2] as i16 - 128;
            let cr = v_plane[uv_base + col / 2] as i16 - 128;
            let dst = (out_row * width + col) * 3;
            rgb_out[dst] = (luma + ((cr * 179) >> 7)).clamp(0, 255) as u8;
            rgb_out[dst + 1] = (luma - ((cb * 44 + cr * 91) >> 7)).clamp(0, 255) as u8;
            rgb_out[dst + 2] = (luma + ((cb * 227) >> 7)).clamp(0, 255) as u8;
        }
    }
}
/// NEON implementation of [`yuv420_to_rgb8_rows_scalar`]: converts rows
/// `start_row..end_row` of planar YUV 4:2:0 into packed RGB24 in `rgb_out`.
///
/// Processes 16 pixels per iteration (then an optional 8-pixel step and a
/// scalar tail). Uses the same ×128 fixed-point coefficients and shifts as
/// the scalar path, so output matches it exactly.
///
/// # Safety
/// Caller must ensure NEON is available, and that `y_plane`/`u_plane`/
/// `v_plane` cover the requested rows and `rgb_out` holds
/// `(end_row - start_row) * width * 3` bytes — pointer arithmetic below is
/// unchecked.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn yuv420_to_rgb8_rows_neon(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    uv_stride: usize,
    start_row: usize,
    end_row: usize,
) {
    use std::arch::aarch64::*;
    // Fixed-point conversion coefficients broadcast across all lanes.
    let c_179 = vdupq_n_s16(179); let c_44 = vdupq_n_s16(44); let c_91 = vdupq_n_s16(91); let c_227 = vdupq_n_s16(227); let c_128 = vdupq_n_s16(128);
    for row in start_row..end_row {
        let out_row = row - start_row;
        let y_row_ptr = y_plane.as_ptr().add(row * width);
        // 4:2:0 — two luma rows share one chroma row.
        let uv_row = (row / 2) * uv_stride;
        let u_row_ptr = u_plane.as_ptr().add(uv_row);
        let v_row_ptr = v_plane.as_ptr().add(uv_row);
        let rgb_row_ptr = rgb_out.as_mut_ptr().add(out_row * width * 3);
        let mut col = 0usize;
        // Main loop: 16 pixels per iteration.
        while col + 16 <= width {
            let y16 = vld1q_u8(y_row_ptr.add(col));
            let u8_vals = vld1_u8(u_row_ptr.add(col / 2));
            let v8_vals = vld1_u8(v_row_ptr.add(col / 2));
            // Duplicate each chroma sample so it covers its pixel pair.
            let u16_dup = vcombine_u8(vzip1_u8(u8_vals, u8_vals), vzip2_u8(u8_vals, u8_vals));
            let v16_dup = vcombine_u8(vzip1_u8(v8_vals, v8_vals), vzip2_u8(v8_vals, v8_vals));
            // Low 8 pixels: widen to i16, center chroma, apply coefficients.
            let y_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y16)));
            let u_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u16_dup))), c_128);
            let v_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v16_dup))), c_128);
            let r_lo = vaddq_s16(y_lo, vshrq_n_s16::<7>(vmulq_s16(v_lo, c_179)));
            let g_lo = vsubq_s16(
                y_lo,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_lo, c_44), vmulq_s16(v_lo, c_91))),
            );
            let b_lo = vaddq_s16(y_lo, vshrq_n_s16::<7>(vmulq_s16(u_lo, c_227)));
            // vqmovun saturates to 0..=255 — the SIMD equivalent of clamp().
            let r_lo_u8 = vqmovun_s16(r_lo);
            let g_lo_u8 = vqmovun_s16(g_lo);
            let b_lo_u8 = vqmovun_s16(b_lo);
            // High 8 pixels: same computation on the upper halves.
            let y_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y16)));
            let u_hi = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u16_dup))),
                c_128,
            );
            let v_hi = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v16_dup))),
                c_128,
            );
            let r_hi = vaddq_s16(y_hi, vshrq_n_s16::<7>(vmulq_s16(v_hi, c_179)));
            let g_hi = vsubq_s16(
                y_hi,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_hi, c_44), vmulq_s16(v_hi, c_91))),
            );
            let b_hi = vaddq_s16(y_hi, vshrq_n_s16::<7>(vmulq_s16(u_hi, c_227)));
            let r_hi_u8 = vqmovun_s16(r_hi);
            let g_hi_u8 = vqmovun_s16(g_hi);
            let b_hi_u8 = vqmovun_s16(b_hi);
            // vst3 interleaves the three planes into packed R,G,B bytes.
            let rgb_lo = uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8);
            vst3_u8(rgb_row_ptr.add(col * 3), rgb_lo);
            let rgb_hi = uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8);
            vst3_u8(rgb_row_ptr.add((col + 8) * 3), rgb_hi);
            col += 16;
        }
        // Optional 8-pixel step; chroma duplication done through a stack buffer.
        if col + 8 <= width {
            let y8_vals = vld1_u8(y_row_ptr.add(col));
            let u4_vals_raw = u_row_ptr.add(col / 2);
            let v4_vals_raw = v_row_ptr.add(col / 2);
            let mut u_buf = [0u8; 8];
            let mut v_buf = [0u8; 8];
            for i in 0..4 {
                u_buf[i * 2] = *u4_vals_raw.add(i);
                u_buf[i * 2 + 1] = *u4_vals_raw.add(i);
                v_buf[i * 2] = *v4_vals_raw.add(i);
                v_buf[i * 2 + 1] = *v4_vals_raw.add(i);
            }
            let u8_dup = vld1_u8(u_buf.as_ptr());
            let v8_dup = vld1_u8(v_buf.as_ptr());
            let y_i16 = vreinterpretq_s16_u16(vmovl_u8(y8_vals));
            let u_i16 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(u8_dup)), c_128);
            let v_i16 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(v8_dup)), c_128);
            let r = vaddq_s16(y_i16, vshrq_n_s16::<7>(vmulq_s16(v_i16, c_179)));
            let g = vsubq_s16(
                y_i16,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_i16, c_44), vmulq_s16(v_i16, c_91))),
            );
            let b = vaddq_s16(y_i16, vshrq_n_s16::<7>(vmulq_s16(u_i16, c_227)));
            let r_u8 = vqmovun_s16(r);
            let g_u8 = vqmovun_s16(g);
            let b_u8 = vqmovun_s16(b);
            let rgb = uint8x8x3_t(r_u8, g_u8, b_u8);
            vst3_u8(rgb_row_ptr.add(col * 3), rgb);
            col += 8;
        }
        // Scalar tail for the final < 8 pixels — identical math to the scalar path.
        while col < width {
            let y_val = *y_row_ptr.add(col) as i16;
            let u_val = *u_row_ptr.add(col / 2) as i16 - 128;
            let v_val = *v_row_ptr.add(col / 2) as i16 - 128;
            let r = y_val + ((v_val * 179) >> 7);
            let g = y_val - (((u_val * 44) + (v_val * 91)) >> 7);
            let b = y_val + ((u_val * 227) >> 7);
            let idx = col * 3;
            *rgb_row_ptr.add(idx) = r.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 1) = g.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 2) = b.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
/// AVX2 implementation of [`yuv420_to_rgb8_rows_scalar`]: converts rows
/// `start_row..end_row` of planar YUV 4:2:0 into packed RGB24 in `rgb_out`.
///
/// Works on 16 pixels per main iteration (widened to sixteen i16 lanes in a
/// 256-bit register), then an 8-pixel SSE-width step and a scalar tail.
/// Chroma duplication and the final RGB interleave go through small stack
/// buffers rather than shuffles. Same fixed-point math as the scalar path.
///
/// # Safety
/// Caller must ensure AVX2 is available, and that the planes cover the
/// requested rows and `rgb_out` holds `(end_row - start_row) * width * 3`
/// bytes — pointer arithmetic below is unchecked.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn yuv420_to_rgb8_rows_avx2(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    uv_stride: usize,
    start_row: usize,
    end_row: usize,
) {
    use std::arch::x86_64::*;
    // Fixed-point conversion coefficients broadcast across all lanes.
    let c_179 = _mm256_set1_epi16(179);
    let c_44 = _mm256_set1_epi16(44);
    let c_91 = _mm256_set1_epi16(91);
    let c_227 = _mm256_set1_epi16(227);
    let c_128 = _mm256_set1_epi16(128);
    let zero = _mm256_setzero_si256();
    for row in start_row..end_row {
        let out_row = row - start_row;
        let y_row_ptr = y_plane.as_ptr().add(row * width);
        // 4:2:0 — two luma rows share one chroma row.
        let uv_row = (row / 2) * uv_stride;
        let u_row_ptr = u_plane.as_ptr().add(uv_row);
        let v_row_ptr = v_plane.as_ptr().add(uv_row);
        let rgb_row_ptr = rgb_out.as_mut_ptr().add(out_row * width * 3);
        let mut col = 0usize;
        // Main loop: 16 pixels per iteration.
        while col + 16 <= width {
            let y16 = _mm_loadu_si128(y_row_ptr.add(col) as *const __m128i);
            let y_lo = _mm256_cvtepu8_epi16(y16);
            // Duplicate each chroma sample over its pixel pair via stack buffers.
            let mut u_buf = [0u8; 16];
            let mut v_buf = [0u8; 16];
            for i in 0..8 {
                u_buf[i * 2] = *u_row_ptr.add(col / 2 + i);
                u_buf[i * 2 + 1] = *u_row_ptr.add(col / 2 + i);
                v_buf[i * 2] = *v_row_ptr.add(col / 2 + i);
                v_buf[i * 2 + 1] = *v_row_ptr.add(col / 2 + i);
            }
            let u16_raw = _mm_loadu_si128(u_buf.as_ptr() as *const __m128i);
            let v16_raw = _mm_loadu_si128(v_buf.as_ptr() as *const __m128i);
            // Widen to i16 and center chroma around zero.
            let u_i16 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(u16_raw), c_128);
            let v_i16 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(v16_raw), c_128);
            let r = _mm256_add_epi16(
                y_lo,
                _mm256_srai_epi16::<7>(_mm256_mullo_epi16(v_i16, c_179)),
            );
            let g = _mm256_sub_epi16(
                y_lo,
                _mm256_srai_epi16::<7>(_mm256_add_epi16(
                    _mm256_mullo_epi16(u_i16, c_44),
                    _mm256_mullo_epi16(v_i16, c_91),
                )),
            );
            let b = _mm256_add_epi16(
                y_lo,
                _mm256_srai_epi16::<7>(_mm256_mullo_epi16(u_i16, c_227)),
            );
            // packus saturates to 0..=255; the 0xD8 permute undoes the
            // per-128-bit-lane ordering of _mm256_packus_epi16 so the 16
            // result bytes end up contiguous in the low half.
            let r_packed = _mm256_packus_epi16(r, zero);
            let g_packed = _mm256_packus_epi16(g, zero);
            let b_packed = _mm256_packus_epi16(b, zero);
            let r_perm = _mm256_permute4x64_epi64::<0xD8>(r_packed);
            let g_perm = _mm256_permute4x64_epi64::<0xD8>(g_packed);
            let b_perm = _mm256_permute4x64_epi64::<0xD8>(b_packed);
            let r_lo128 = _mm256_castsi256_si128(r_perm);
            let g_lo128 = _mm256_castsi256_si128(g_perm);
            let b_lo128 = _mm256_castsi256_si128(b_perm);
            let mut r_arr = [0u8; 16];
            let mut g_arr = [0u8; 16];
            let mut b_arr = [0u8; 16];
            _mm_storeu_si128(r_arr.as_mut_ptr() as *mut __m128i, r_lo128);
            _mm_storeu_si128(g_arr.as_mut_ptr() as *mut __m128i, g_lo128);
            _mm_storeu_si128(b_arr.as_mut_ptr() as *mut __m128i, b_lo128);
            // Interleave R/G/B on the stack, then copy 48 bytes out at once.
            let mut rgb_buf = [0u8; 48];
            for i in 0..16 {
                rgb_buf[i * 3] = r_arr[i];
                rgb_buf[i * 3 + 1] = g_arr[i];
                rgb_buf[i * 3 + 2] = b_arr[i];
            }
            std::ptr::copy_nonoverlapping(rgb_buf.as_ptr(), rgb_row_ptr.add(col * 3), 48);
            col += 16;
        }
        // 8-pixel step at SSE width for the remainder after the 16-wide loop.
        while col + 8 <= width {
            let y8 = _mm_loadl_epi64(y_row_ptr.add(col) as *const __m128i);
            let zero128 = _mm_setzero_si128();
            let y_i16 = _mm_unpacklo_epi8(y8, zero128);
            let c_179_128 = _mm_set1_epi16(179);
            let c_44_128 = _mm_set1_epi16(44);
            let c_91_128 = _mm_set1_epi16(91);
            let c_227_128 = _mm_set1_epi16(227);
            let c_128_128 = _mm_set1_epi16(128);
            let mut u_buf = [0u8; 8];
            let mut v_buf = [0u8; 8];
            for i in 0..4 {
                u_buf[i * 2] = *u_row_ptr.add(col / 2 + i);
                u_buf[i * 2 + 1] = *u_row_ptr.add(col / 2 + i);
                v_buf[i * 2] = *v_row_ptr.add(col / 2 + i);
                v_buf[i * 2 + 1] = *v_row_ptr.add(col / 2 + i);
            }
            let u8_dup = _mm_loadl_epi64(u_buf.as_ptr() as *const __m128i);
            let v8_dup = _mm_loadl_epi64(v_buf.as_ptr() as *const __m128i);
            let u_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(u8_dup, zero128), c_128_128);
            let v_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(v8_dup, zero128), c_128_128);
            let r = _mm_add_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_mullo_epi16(v_i16, c_179_128)),
            );
            let g = _mm_sub_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_add_epi16(
                    _mm_mullo_epi16(u_i16, c_44_128),
                    _mm_mullo_epi16(v_i16, c_91_128),
                )),
            );
            let b = _mm_add_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_mullo_epi16(u_i16, c_227_128)),
            );
            let r_u8 = _mm_packus_epi16(r, zero128);
            let g_u8 = _mm_packus_epi16(g, zero128);
            let b_u8 = _mm_packus_epi16(b, zero128);
            let mut r_arr = [0u8; 8];
            let mut g_arr = [0u8; 8];
            let mut b_arr = [0u8; 8];
            _mm_storel_epi64(r_arr.as_mut_ptr() as *mut __m128i, r_u8);
            _mm_storel_epi64(g_arr.as_mut_ptr() as *mut __m128i, g_u8);
            _mm_storel_epi64(b_arr.as_mut_ptr() as *mut __m128i, b_u8);
            let mut rgb_buf = [0u8; 24];
            for i in 0..8 {
                rgb_buf[i * 3] = r_arr[i];
                rgb_buf[i * 3 + 1] = g_arr[i];
                rgb_buf[i * 3 + 2] = b_arr[i];
            }
            std::ptr::copy_nonoverlapping(rgb_buf.as_ptr(), rgb_row_ptr.add(col * 3), 24);
            col += 8;
        }
        // Scalar tail for the final < 8 pixels — identical math to the scalar path.
        while col < width {
            let y_val = *y_row_ptr.add(col) as i16;
            let u_val = *u_row_ptr.add(col / 2) as i16 - 128;
            let v_val = *v_row_ptr.add(col / 2) as i16 - 128;
            let r = y_val + ((v_val * 179) >> 7);
            let g = y_val - (((u_val * 44) + (v_val * 91)) >> 7);
            let b = y_val + ((u_val * 227) >> 7);
            let idx = col * 3;
            *rgb_row_ptr.add(idx) = r.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 1) = g.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 2) = b.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
/// SSE2 implementation of [`yuv420_to_rgb8_rows_scalar`]: converts rows
/// `start_row..end_row` of planar YUV 4:2:0 into packed RGB24 in `rgb_out`.
///
/// Processes 8 pixels per iteration with a scalar tail; chroma duplication
/// and the RGB interleave go through small stack buffers. Same fixed-point
/// math as the scalar path.
///
/// # Safety
/// Caller must ensure SSE2 is available, and that the planes cover the
/// requested rows and `rgb_out` holds `(end_row - start_row) * width * 3`
/// bytes — pointer arithmetic below is unchecked.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn yuv420_to_rgb8_rows_sse2(
    y_plane: &[u8],
    u_plane: &[u8],
    v_plane: &[u8],
    rgb_out: &mut [u8],
    width: usize,
    uv_stride: usize,
    start_row: usize,
    end_row: usize,
) {
    use std::arch::x86_64::*;
    // Fixed-point conversion coefficients broadcast across all lanes.
    let c_179 = _mm_set1_epi16(179); let c_44 = _mm_set1_epi16(44); let c_91 = _mm_set1_epi16(91); let c_227 = _mm_set1_epi16(227); let c_128 = _mm_set1_epi16(128);
    let zero = _mm_setzero_si128();
    for row in start_row..end_row {
        let out_row = row - start_row;
        let y_row_ptr = y_plane.as_ptr().add(row * width);
        // 4:2:0 — two luma rows share one chroma row.
        let uv_row = (row / 2) * uv_stride;
        let u_row_ptr = u_plane.as_ptr().add(uv_row);
        let v_row_ptr = v_plane.as_ptr().add(uv_row);
        let rgb_row_ptr = rgb_out.as_mut_ptr().add(out_row * width * 3);
        let mut col = 0usize;
        // Main loop: 8 pixels per iteration.
        while col + 8 <= width {
            let y8 = _mm_loadl_epi64(y_row_ptr.add(col) as *const __m128i);
            let y_i16 = _mm_unpacklo_epi8(y8, zero);
            // Duplicate each chroma sample over its pixel pair via stack buffers.
            let mut u_buf = [0u8; 8];
            let mut v_buf = [0u8; 8];
            for i in 0..4 {
                u_buf[i * 2] = *u_row_ptr.add(col / 2 + i);
                u_buf[i * 2 + 1] = *u_row_ptr.add(col / 2 + i);
                v_buf[i * 2] = *v_row_ptr.add(col / 2 + i);
                v_buf[i * 2 + 1] = *v_row_ptr.add(col / 2 + i);
            }
            let u8_dup = _mm_loadl_epi64(u_buf.as_ptr() as *const __m128i);
            let v8_dup = _mm_loadl_epi64(v_buf.as_ptr() as *const __m128i);
            // Widen to i16 and center chroma around zero.
            let u_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(u8_dup, zero), c_128);
            let v_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(v8_dup, zero), c_128);
            let r = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(v_i16, c_179)));
            let g = _mm_sub_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_add_epi16(
                    _mm_mullo_epi16(u_i16, c_44),
                    _mm_mullo_epi16(v_i16, c_91),
                )),
            );
            let b = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(u_i16, c_227)));
            // packus saturates to 0..=255 — the SIMD equivalent of clamp().
            let r_u8 = _mm_packus_epi16(r, zero); let g_u8 = _mm_packus_epi16(g, zero);
            let b_u8 = _mm_packus_epi16(b, zero);
            let mut rgb_buf = [0u8; 24];
            let mut r_arr = [0u8; 8];
            let mut g_arr = [0u8; 8];
            let mut b_arr = [0u8; 8];
            _mm_storel_epi64(r_arr.as_mut_ptr() as *mut __m128i, r_u8);
            _mm_storel_epi64(g_arr.as_mut_ptr() as *mut __m128i, g_u8);
            _mm_storel_epi64(b_arr.as_mut_ptr() as *mut __m128i, b_u8);
            // Interleave R/G/B on the stack, then copy 24 bytes out at once.
            for i in 0..8 {
                rgb_buf[i * 3] = r_arr[i];
                rgb_buf[i * 3 + 1] = g_arr[i];
                rgb_buf[i * 3 + 2] = b_arr[i];
            }
            std::ptr::copy_nonoverlapping(rgb_buf.as_ptr(), rgb_row_ptr.add(col * 3), 24);
            col += 8;
        }
        // Scalar tail for the final < 8 pixels — identical math to the scalar path.
        while col < width {
            let y_val = *y_row_ptr.add(col) as i16;
            let u_val = *u_row_ptr.add(col / 2) as i16 - 128;
            let v_val = *v_row_ptr.add(col / 2) as i16 - 128;
            let r = y_val + ((v_val * 179) >> 7);
            let g = y_val - (((u_val * 44) + (v_val * 91)) >> 7);
            let b = y_val + ((u_val * 227) >> 7);
            let idx = col * 3;
            *rgb_row_ptr.add(idx) = r.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 1) = g.clamp(0, 255) as u8;
            *rgb_row_ptr.add(idx + 2) = b.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
/// Converts an NV12 frame (planar Y + interleaved UV at half vertical
/// resolution) into packed RGB24, preferring NEON/SSE2 when available.
///
/// # Errors
/// Returns `VideoError::Codec` when any buffer is too small for `w`×`h`,
/// or when `w` is odd — NV12 interleaves U/V per pixel pair, and an odd
/// width would make the per-pair V lookup (`col | 1`) read one byte past
/// the chroma row.
#[allow(unsafe_code)]
pub fn nv12_to_rgb8(
    y: &[u8],
    uv: &[u8],
    w: usize,
    h: usize,
    out: &mut [u8],
) -> Result<(), VideoError> {
    let expected_y = w * h;
    let expected_uv = w * (h / 2);
    let expected_out = w * h * 3;
    if y.len() < expected_y {
        return Err(VideoError::Codec(format!(
            "NV12: Y plane too small: expected {expected_y}, got {}",
            y.len()
        )));
    }
    if uv.len() < expected_uv {
        return Err(VideoError::Codec(format!(
            "NV12: UV plane too small: expected {expected_uv}, got {}",
            uv.len()
        )));
    }
    if out.len() < expected_out {
        return Err(VideoError::Codec(format!(
            "NV12: output buffer too small: expected {expected_out}, got {}",
            out.len()
        )));
    }
    // Reject odd widths up front (mirrors the YUYV path): the conversion
    // kernels read chroma in U/V byte pairs and would index out of bounds
    // on a minimally sized UV plane.
    if !w.is_multiple_of(2) {
        return Err(VideoError::Codec("NV12: width must be even".into()));
    }
    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("neon") {
            // SAFETY: NEON verified at runtime; buffer sizes validated above.
            unsafe {
                nv12_to_rgb8_neon(y, uv, w, h, out);
            }
            return Ok(());
        }
    }
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("sse2") {
            // SAFETY: SSE2 verified at runtime; buffer sizes validated above.
            unsafe {
                nv12_to_rgb8_sse2(y, uv, w, h, out);
            }
            return Ok(());
        }
    }
    nv12_to_rgb8_scalar(y, uv, w, h, out);
    Ok(())
}
/// Scalar NV12 → RGB24 conversion: planar Y plus a UV plane with U/V bytes
/// interleaved per pixel pair, at half vertical chroma resolution.
fn nv12_to_rgb8_scalar(y: &[u8], uv: &[u8], w: usize, h: usize, out: &mut [u8]) {
    for row in 0..h {
        let y_base = row * w;
        // Two luma rows share one interleaved chroma row.
        let uv_base = (row / 2) * w;
        let dst_base = row * w * 3;
        for col in 0..w {
            let luma = y[y_base + col] as i16;
            // Even byte of the pair is U, odd byte is V.
            let cb = uv[uv_base + (col & !1)] as i16 - 128;
            let cr = uv[uv_base + (col | 1)] as i16 - 128;
            let px = dst_base + col * 3;
            out[px] = (luma + ((cr * 179) >> 7)).clamp(0, 255) as u8;
            out[px + 1] = (luma - ((cb * 44 + cr * 91) >> 7)).clamp(0, 255) as u8;
            out[px + 2] = (luma + ((cb * 227) >> 7)).clamp(0, 255) as u8;
        }
    }
}
/// NEON implementation of [`nv12_to_rgb8_scalar`]: converts NV12 (planar Y +
/// interleaved UV at half vertical resolution) into packed RGB24.
///
/// Processes 16 pixels per iteration using `vld2` to deinterleave U/V, with
/// a scalar tail. Same fixed-point math as the scalar path.
///
/// # Safety
/// Caller must ensure NEON is available and that `y`, `uv` and `out` are
/// sized for a `w`×`h` frame (w*h, w*(h/2) and w*h*3 bytes respectively) —
/// pointer arithmetic below is unchecked.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn nv12_to_rgb8_neon(y: &[u8], uv: &[u8], w: usize, h: usize, out: &mut [u8]) {
    use std::arch::aarch64::*;
    // Fixed-point conversion coefficients broadcast across all lanes.
    let c_179 = vdupq_n_s16(179);
    let c_44 = vdupq_n_s16(44);
    let c_91 = vdupq_n_s16(91);
    let c_227 = vdupq_n_s16(227);
    let c_128 = vdupq_n_s16(128);
    for row in 0..h {
        let y_row = y.as_ptr().add(row * w);
        // Two luma rows share one interleaved chroma row.
        let uv_row = uv.as_ptr().add((row / 2) * w);
        let dst_row = out.as_mut_ptr().add(row * w * 3);
        let mut col = 0usize;
        while col + 16 <= w {
            let y16 = vld1q_u8(y_row.add(col));
            // vld2 splits the interleaved UV bytes into separate U and V vectors.
            let uv16 = vld2_u8(uv_row.add(col));
            let u8_vals = uv16.0; let v8_vals = uv16.1;
            // Duplicate each chroma sample so it covers its pixel pair.
            let u_dup = vcombine_u8(vzip1_u8(u8_vals, u8_vals), vzip2_u8(u8_vals, u8_vals));
            let v_dup = vcombine_u8(vzip1_u8(v8_vals, v8_vals), vzip2_u8(v8_vals, v8_vals));
            // Low 8 pixels: widen, center chroma, apply coefficients, saturate.
            let y_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y16)));
            let u_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u_dup))), c_128);
            let v_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_dup))), c_128);
            let r_lo = vaddq_s16(y_lo, vshrq_n_s16::<7>(vmulq_s16(v_lo, c_179)));
            let g_lo = vsubq_s16(
                y_lo,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_lo, c_44), vmulq_s16(v_lo, c_91))),
            );
            let b_lo = vaddq_s16(y_lo, vshrq_n_s16::<7>(vmulq_s16(u_lo, c_227)));
            let rgb_lo = uint8x8x3_t(vqmovun_s16(r_lo), vqmovun_s16(g_lo), vqmovun_s16(b_lo));
            vst3_u8(dst_row.add(col * 3), rgb_lo);
            // High 8 pixels: same computation on the upper halves.
            let y_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y16)));
            let u_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u_dup))), c_128);
            let v_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_dup))), c_128);
            let r_hi = vaddq_s16(y_hi, vshrq_n_s16::<7>(vmulq_s16(v_hi, c_179)));
            let g_hi = vsubq_s16(
                y_hi,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_hi, c_44), vmulq_s16(v_hi, c_91))),
            );
            let b_hi = vaddq_s16(y_hi, vshrq_n_s16::<7>(vmulq_s16(u_hi, c_227)));
            let rgb_hi = uint8x8x3_t(vqmovun_s16(r_hi), vqmovun_s16(g_hi), vqmovun_s16(b_hi));
            vst3_u8(dst_row.add((col + 8) * 3), rgb_hi);
            col += 16;
        }
        // Scalar tail — identical math to nv12_to_rgb8_scalar.
        while col < w {
            let y_val = *y_row.add(col) as i16;
            let u_val = *uv_row.add(col & !1) as i16 - 128;
            let v_val = *uv_row.add(col | 1) as i16 - 128;
            let r = y_val + ((v_val * 179) >> 7);
            let g = y_val - ((u_val * 44 + v_val * 91) >> 7);
            let b = y_val + ((u_val * 227) >> 7);
            let idx = col * 3;
            *dst_row.add(idx) = r.clamp(0, 255) as u8;
            *dst_row.add(idx + 1) = g.clamp(0, 255) as u8;
            *dst_row.add(idx + 2) = b.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
/// SSE2 implementation of [`nv12_to_rgb8_scalar`]: converts NV12 (planar Y +
/// interleaved UV at half vertical resolution) into packed RGB24.
///
/// Processes 8 pixels per iteration, deinterleaving and duplicating chroma
/// through small stack buffers, with a scalar tail. Same fixed-point math
/// as the scalar path.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `y`, `uv` and `out` are
/// sized for a `w`×`h` frame (w*h, w*(h/2) and w*h*3 bytes respectively) —
/// pointer arithmetic below is unchecked.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn nv12_to_rgb8_sse2(y: &[u8], uv: &[u8], w: usize, h: usize, out: &mut [u8]) {
    use std::arch::x86_64::*;
    // Fixed-point conversion coefficients broadcast across all lanes.
    let c_179 = _mm_set1_epi16(179);
    let c_44 = _mm_set1_epi16(44);
    let c_91 = _mm_set1_epi16(91);
    let c_227 = _mm_set1_epi16(227);
    let c_128 = _mm_set1_epi16(128);
    let zero = _mm_setzero_si128();
    for row in 0..h {
        let y_row = y.as_ptr().add(row * w);
        // Two luma rows share one interleaved chroma row.
        let uv_row = uv.as_ptr().add((row / 2) * w);
        let dst_row = out.as_mut_ptr().add(row * w * 3);
        let mut col = 0usize;
        while col + 8 <= w {
            let y8 = _mm_loadl_epi64(y_row.add(col) as *const __m128i);
            let y_i16 = _mm_unpacklo_epi8(y8, zero);
            // Deinterleave U/V and duplicate each sample over its pixel pair.
            let mut u_buf = [0u8; 8];
            let mut v_buf = [0u8; 8];
            for i in 0..4 {
                let u_val = *uv_row.add(col + i * 2);
                let v_val = *uv_row.add(col + i * 2 + 1);
                u_buf[i * 2] = u_val;
                u_buf[i * 2 + 1] = u_val;
                v_buf[i * 2] = v_val;
                v_buf[i * 2 + 1] = v_val;
            }
            let u8_dup = _mm_loadl_epi64(u_buf.as_ptr() as *const __m128i);
            let v8_dup = _mm_loadl_epi64(v_buf.as_ptr() as *const __m128i);
            // Widen to i16 and center chroma around zero.
            let u_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(u8_dup, zero), c_128);
            let v_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(v8_dup, zero), c_128);
            let r = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(v_i16, c_179)));
            let g = _mm_sub_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_add_epi16(
                    _mm_mullo_epi16(u_i16, c_44),
                    _mm_mullo_epi16(v_i16, c_91),
                )),
            );
            let b = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(u_i16, c_227)));
            // packus saturates to 0..=255 — the SIMD equivalent of clamp().
            let r_u8 = _mm_packus_epi16(r, zero);
            let g_u8 = _mm_packus_epi16(g, zero);
            let b_u8 = _mm_packus_epi16(b, zero);
            let mut r_arr = [0u8; 8];
            let mut g_arr = [0u8; 8];
            let mut b_arr = [0u8; 8];
            _mm_storel_epi64(r_arr.as_mut_ptr() as *mut __m128i, r_u8);
            _mm_storel_epi64(g_arr.as_mut_ptr() as *mut __m128i, g_u8);
            _mm_storel_epi64(b_arr.as_mut_ptr() as *mut __m128i, b_u8);
            // Interleave R/G/B on the stack, then copy 24 bytes out at once.
            let mut rgb_buf = [0u8; 24];
            for i in 0..8 {
                rgb_buf[i * 3] = r_arr[i];
                rgb_buf[i * 3 + 1] = g_arr[i];
                rgb_buf[i * 3 + 2] = b_arr[i];
            }
            std::ptr::copy_nonoverlapping(rgb_buf.as_ptr(), dst_row.add(col * 3), 24);
            col += 8;
        }
        // Scalar tail — identical math to nv12_to_rgb8_scalar.
        while col < w {
            let y_val = *y_row.add(col) as i16;
            let u_val = *uv_row.add(col & !1) as i16 - 128;
            let v_val = *uv_row.add(col | 1) as i16 - 128;
            let r = y_val + ((v_val * 179) >> 7);
            let g = y_val - ((u_val * 44 + v_val * 91) >> 7);
            let b = y_val + ((u_val * 227) >> 7);
            let idx = col * 3;
            *dst_row.add(idx) = r.clamp(0, 255) as u8;
            *dst_row.add(idx + 1) = g.clamp(0, 255) as u8;
            *dst_row.add(idx + 2) = b.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
/// Converts packed YUYV (YUY2) 4:2:2 data into RGB24, preferring NEON/SSE2
/// when the CPU supports them and falling back to the scalar path otherwise.
///
/// # Errors
/// Returns `VideoError::Codec` when the input or output buffer is too small
/// for `w`×`h`, or when `w` is odd (YUYV packs pixels in pairs).
#[allow(unsafe_code)]
pub fn yuyv_to_rgb8(data: &[u8], w: usize, h: usize, out: &mut [u8]) -> Result<(), VideoError> {
    let expected_in = w * h * 2;
    let expected_out = w * h * 3;
    if data.len() < expected_in {
        return Err(VideoError::Codec(format!(
            "YUYV: input too small: expected {expected_in}, got {}",
            data.len()
        )));
    }
    if out.len() < expected_out {
        return Err(VideoError::Codec(format!(
            "YUYV: output too small: expected {expected_out}, got {}",
            out.len()
        )));
    }
    if !w.is_multiple_of(2) {
        return Err(VideoError::Codec("YUYV: width must be even".into()));
    }
    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("neon") {
            // SAFETY: NEON verified at runtime; buffer sizes validated above.
            unsafe { yuyv_to_rgb8_neon(data, w, h, out) };
            return Ok(());
        }
    }
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("sse2") {
            // SAFETY: SSE2 verified at runtime; buffer sizes validated above.
            unsafe { yuyv_to_rgb8_sse2(data, w, h, out) };
            return Ok(());
        }
    }
    yuyv_to_rgb8_scalar(data, w, h, out);
    Ok(())
}
/// Scalar YUYV → RGB24 conversion: each 4-byte `Y0 U Y1 V` macropixel
/// expands to two RGB pixels that share the same chroma pair.
fn yuyv_to_rgb8_scalar(data: &[u8], w: usize, h: usize, out: &mut [u8]) {
    for row in 0..h {
        let src_base = row * w * 2;
        let dst_base = row * w * 3;
        for pair in 0..w / 2 {
            let quad = &data[src_base + pair * 4..src_base + pair * 4 + 4];
            let cb = quad[1] as i16 - 128;
            let cr = quad[3] as i16 - 128;
            // Chroma contributions are shared by both pixels of the pair.
            let dr = (cr * 179) >> 7;
            let dg = (cb * 44 + cr * 91) >> 7;
            let db = (cb * 227) >> 7;
            let px = dst_base + pair * 6;
            for (k, &luma) in [quad[0], quad[2]].iter().enumerate() {
                let luma = luma as i16;
                out[px + 3 * k] = (luma + dr).clamp(0, 255) as u8;
                out[px + 3 * k + 1] = (luma - dg).clamp(0, 255) as u8;
                out[px + 3 * k + 2] = (luma + db).clamp(0, 255) as u8;
            }
        }
    }
}
/// NEON-accelerated YUYV (YUY2) 4:2:2 → packed RGB24 conversion.
///
/// Processes 16 pixels per loop iteration (one `vld4_u8` load covers eight
/// `[Y0, U, Y1, V]` groups), then finishes any remainder with a scalar loop
/// over pixel pairs. Uses the same Q7 fixed-point coefficients as
/// `yuyv_to_rgb8_scalar` (179/128, 44/128, 91/128, 227/128 approximating
/// the usual 1.402 / 0.344 / 0.714 / 1.772 YCbCr→RGB constants), and
/// `vqmovun_s16` saturates to 0..=255 exactly like the scalar
/// `clamp(0, 255)`, so both paths produce identical output.
///
/// # Safety
/// The CPU must support NEON, `data` must hold at least `w * h * 2` bytes,
/// `out` must hold at least `w * h * 3` bytes, and `w` must be even — the
/// public dispatcher `yuyv_to_rgb8` verifies all of this before calling.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn yuyv_to_rgb8_neon(data: &[u8], w: usize, h: usize, out: &mut [u8]) {
    use std::arch::aarch64::*;
    // Q7 conversion constants broadcast across all eight i16 lanes.
    let c_179 = vdupq_n_s16(179);
    let c_44 = vdupq_n_s16(44);
    let c_91 = vdupq_n_s16(91);
    let c_227 = vdupq_n_s16(227);
    let c_128 = vdupq_n_s16(128);
    for row in 0..h {
        let src_row = data.as_ptr().add(row * w * 2);
        let dst_row = out.as_mut_ptr().add(row * w * 3);
        let mut col = 0usize;
        // SIMD main loop: 16 pixels (32 source bytes, 48 output bytes) per pass.
        while col + 16 <= w {
            // vld4_u8 deinterleaves 32 bytes into four 8-lane vectors:
            // even-pixel lumas, chroma U, odd-pixel lumas, chroma V.
            let yuyv = vld4_u8(src_row.add(col * 2));
            let y0_8 = yuyv.0;
            let u_8 = yuyv.1;
            let y1_8 = yuyv.2;
            let v_8 = yuyv.3;
            // Widen to i16 and center the chroma on zero (subtract 128).
            let y0_i16 = vreinterpretq_s16_u16(vmovl_u8(y0_8));
            let u_i16 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(u_8)), c_128);
            let v_i16 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(v_8)), c_128);
            // R/G/B for the even-indexed pixels; the arithmetic shift
            // (vshrq >> 7) undoes the Q7 scaling.
            let r0 = vaddq_s16(y0_i16, vshrq_n_s16::<7>(vmulq_s16(v_i16, c_179)));
            let g0 = vsubq_s16(
                y0_i16,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_i16, c_44), vmulq_s16(v_i16, c_91))),
            );
            let b0 = vaddq_s16(y0_i16, vshrq_n_s16::<7>(vmulq_s16(u_i16, c_227)));
            // R/G/B for the odd-indexed pixels, reusing the shared chroma.
            let y1_i16 = vreinterpretq_s16_u16(vmovl_u8(y1_8));
            let r1 = vaddq_s16(y1_i16, vshrq_n_s16::<7>(vmulq_s16(v_i16, c_179)));
            let g1 = vsubq_s16(
                y1_i16,
                vshrq_n_s16::<7>(vaddq_s16(vmulq_s16(u_i16, c_44), vmulq_s16(v_i16, c_91))),
            );
            let b1 = vaddq_s16(y1_i16, vshrq_n_s16::<7>(vmulq_s16(u_i16, c_227)));
            // Narrow to u8 with unsigned saturation — the vector clamp(0, 255).
            let r_lo = vqmovun_s16(r0);
            let r_hi = vqmovun_s16(r1);
            let g_lo = vqmovun_s16(g0);
            let g_hi = vqmovun_s16(g1);
            let b_lo = vqmovun_s16(b0);
            let b_hi = vqmovun_s16(b1);
            // Re-interleave even/odd pixels back into spatial order:
            // zip1 yields this chunk's pixels 0..8, zip2 yields pixels 8..16.
            let r_zip = vzip1_u8(r_lo, r_hi);
            let r_zip2 = vzip2_u8(r_lo, r_hi);
            let g_zip = vzip1_u8(g_lo, g_hi);
            let g_zip2 = vzip2_u8(g_lo, g_hi);
            let b_zip = vzip1_u8(b_lo, b_hi);
            let b_zip2 = vzip2_u8(b_lo, b_hi);
            // vst3_u8 interleaves the three planes into 8 RGB triplets per store.
            let rgb_first = uint8x8x3_t(r_zip, g_zip, b_zip);
            vst3_u8(dst_row.add(col * 3), rgb_first);
            let rgb_second = uint8x8x3_t(r_zip2, g_zip2, b_zip2);
            vst3_u8(dst_row.add((col + 8) * 3), rgb_second);
            col += 16;
        }
        // Scalar tail: converts the remaining (w % 16) pixels pair by pair,
        // mirroring yuyv_to_rgb8_scalar.
        while col + 2 <= w {
            let si = col * 2;
            let y0 = *src_row.add(si) as i16;
            let u_val = *src_row.add(si + 1) as i16 - 128;
            let y1 = *src_row.add(si + 2) as i16;
            let v_val = *src_row.add(si + 3) as i16 - 128;
            let r0 = y0 + ((v_val * 179) >> 7);
            let g0 = y0 - ((u_val * 44 + v_val * 91) >> 7);
            let b0 = y0 + ((u_val * 227) >> 7);
            let r1 = y1 + ((v_val * 179) >> 7);
            let g1 = y1 - ((u_val * 44 + v_val * 91) >> 7);
            let b1 = y1 + ((u_val * 227) >> 7);
            let di = col * 3;
            *dst_row.add(di) = r0.clamp(0, 255) as u8;
            *dst_row.add(di + 1) = g0.clamp(0, 255) as u8;
            *dst_row.add(di + 2) = b0.clamp(0, 255) as u8;
            *dst_row.add(di + 3) = r1.clamp(0, 255) as u8;
            *dst_row.add(di + 4) = g1.clamp(0, 255) as u8;
            *dst_row.add(di + 5) = b1.clamp(0, 255) as u8;
            col += 2;
        }
    }
}
/// SSE2-accelerated YUYV (YUY2) 4:2:2 → packed RGB24 conversion.
///
/// Processes 8 pixels (four `[Y0, U, Y1, V]` groups) per loop iteration,
/// then finishes any remainder with a scalar loop over pixel pairs. Uses the
/// same Q7 fixed-point coefficients as `yuyv_to_rgb8_scalar` (179/128,
/// 44/128, 91/128, 227/128 approximating the usual 1.402 / 0.344 / 0.714 /
/// 1.772 YCbCr→RGB constants), and `_mm_packus_epi16` saturates to 0..=255
/// exactly like the scalar `clamp(0, 255)`, so both paths produce identical
/// output.
///
/// # Safety
/// The CPU must support SSE2, `data` must hold at least `w * h * 2` bytes,
/// `out` must hold at least `w * h * 3` bytes, and `w` must be even — the
/// public dispatcher `yuyv_to_rgb8` verifies all of this before calling.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
unsafe fn yuyv_to_rgb8_sse2(data: &[u8], w: usize, h: usize, out: &mut [u8]) {
    use std::arch::x86_64::*;
    // Q7 conversion constants broadcast across all eight i16 lanes.
    let c_179 = _mm_set1_epi16(179);
    let c_44 = _mm_set1_epi16(44);
    let c_91 = _mm_set1_epi16(91);
    let c_227 = _mm_set1_epi16(227);
    let c_128_16 = _mm_set1_epi16(128);
    let zero = _mm_setzero_si128();
    for row in 0..h {
        let src_row = data.as_ptr().add(row * w * 2);
        let dst_row = out.as_mut_ptr().add(row * w * 3);
        let mut col = 0usize;
        // SIMD main loop: 8 pixels (16 source bytes, 24 output bytes) per pass.
        while col + 8 <= w {
            // Scalar gather through stack buffers: deinterleave the lumas and
            // duplicate each chroma sample for its pixel pair. (In-register
            // byte shuffles would need SSSE3's pshufb, which SSE2 lacks.)
            let mut y_buf = [0u8; 8];
            let mut u_buf = [0u8; 8];
            let mut v_buf = [0u8; 8];
            for i in 0..4 {
                let si = col * 2 + i * 4;
                y_buf[i * 2] = *src_row.add(si);
                y_buf[i * 2 + 1] = *src_row.add(si + 2);
                let u = *src_row.add(si + 1);
                let v = *src_row.add(si + 3);
                u_buf[i * 2] = u;
                u_buf[i * 2 + 1] = u;
                v_buf[i * 2] = v;
                v_buf[i * 2 + 1] = v;
            }
            let y8 = _mm_loadl_epi64(y_buf.as_ptr() as *const __m128i);
            let u8_dup = _mm_loadl_epi64(u_buf.as_ptr() as *const __m128i);
            let v8_dup = _mm_loadl_epi64(v_buf.as_ptr() as *const __m128i);
            // Zero-extend to i16 (unpack with zero) and center the chroma on
            // zero by subtracting 128.
            let y_i16 = _mm_unpacklo_epi8(y8, zero);
            let u_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(u8_dup, zero), c_128_16);
            let v_i16 = _mm_sub_epi16(_mm_unpacklo_epi8(v8_dup, zero), c_128_16);
            // R/G/B in i16; the arithmetic shift (srai >> 7) undoes the Q7 scaling.
            let r = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(v_i16, c_179)));
            let g = _mm_sub_epi16(
                y_i16,
                _mm_srai_epi16::<7>(_mm_add_epi16(
                    _mm_mullo_epi16(u_i16, c_44),
                    _mm_mullo_epi16(v_i16, c_91),
                )),
            );
            let b = _mm_add_epi16(y_i16, _mm_srai_epi16::<7>(_mm_mullo_epi16(u_i16, c_227)));
            // Pack to u8 with unsigned saturation — the vector clamp(0, 255).
            let r_u8 = _mm_packus_epi16(r, zero);
            let g_u8 = _mm_packus_epi16(g, zero);
            let b_u8 = _mm_packus_epi16(b, zero);
            // Scalar scatter: spill the three channel vectors, interleave them
            // into RGB triplets, then write all 24 bytes with a single copy.
            let mut r_arr = [0u8; 8];
            let mut g_arr = [0u8; 8];
            let mut b_arr = [0u8; 8];
            _mm_storel_epi64(r_arr.as_mut_ptr() as *mut __m128i, r_u8);
            _mm_storel_epi64(g_arr.as_mut_ptr() as *mut __m128i, g_u8);
            _mm_storel_epi64(b_arr.as_mut_ptr() as *mut __m128i, b_u8);
            let mut rgb_buf = [0u8; 24];
            for i in 0..8 {
                rgb_buf[i * 3] = r_arr[i];
                rgb_buf[i * 3 + 1] = g_arr[i];
                rgb_buf[i * 3 + 2] = b_arr[i];
            }
            std::ptr::copy_nonoverlapping(rgb_buf.as_ptr(), dst_row.add(col * 3), 24);
            col += 8;
        }
        // Scalar tail: converts the remaining (w % 8) pixels pair by pair,
        // mirroring yuyv_to_rgb8_scalar.
        while col + 2 <= w {
            let si = col * 2;
            let y0 = *src_row.add(si) as i16;
            let u_val = *src_row.add(si + 1) as i16 - 128;
            let y1 = *src_row.add(si + 2) as i16;
            let v_val = *src_row.add(si + 3) as i16 - 128;
            let r0 = y0 + ((v_val * 179) >> 7);
            let g0 = y0 - ((u_val * 44 + v_val * 91) >> 7);
            let b0 = y0 + ((u_val * 227) >> 7);
            let r1 = y1 + ((v_val * 179) >> 7);
            let g1 = y1 - ((u_val * 44 + v_val * 91) >> 7);
            let b1 = y1 + ((u_val * 227) >> 7);
            let di = col * 3;
            *dst_row.add(di) = r0.clamp(0, 255) as u8;
            *dst_row.add(di + 1) = g0.clamp(0, 255) as u8;
            *dst_row.add(di + 2) = b0.clamp(0, 255) as u8;
            *dst_row.add(di + 3) = r1.clamp(0, 255) as u8;
            *dst_row.add(di + 4) = g1.clamp(0, 255) as u8;
            *dst_row.add(di + 5) = b1.clamp(0, 255) as u8;
            col += 2;
        }
    }
}