#![allow(unsafe_code)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::too_many_lines)]
/// Converts planar YUV 4:2:0 to planar YUV 4:4:4.
///
/// Every chroma sample is copied to each luma position it covers (source
/// sample `(row / 2, col / 2)` feeds output `(row, col)`); no interpolation is
/// performed. `u` and `v` are read as tightly packed planes that are
/// `(width + 1) / 2` samples wide and `(height + 1) / 2` rows tall.
///
/// Returns `(y, u, v)` where each plane holds `width * height` bytes; the luma
/// plane is a copy of the first `width * height` bytes of `y`.
///
/// On x86_64 the AVX2 implementation is used when the CPU reports AVX2 at
/// runtime, on aarch64 the NEON implementation is always used, and everywhere
/// else the scalar implementation runs.
///
/// # Panics
///
/// Panics if `y` holds fewer than `width * height` bytes, or if `u` or `v`
/// holds fewer samples than the chroma plane size described above.
#[must_use]
pub fn yuv420_to_yuv444(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { yuv420_to_yuv444_avx2(y, u, v, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return yuv420_to_yuv444_neon(y, u, v, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
yuv420_to_yuv444_scalar(y, u, v, width, height)
}
/// Converts planar YUV 4:4:4 to planar YUV 4:2:0.
///
/// Each output chroma sample is the rounded mean of a 2x2 block of input
/// chroma samples. For odd dimensions the last column / row is repeated to
/// complete the block.
///
/// Returns `(y, u, v)`: `y` is a copy of the first `width * height` bytes of
/// the input luma plane, and `u` / `v` each hold
/// `((width + 1) / 2) * ((height + 1) / 2)` samples.
///
/// The AVX2 and NEON entry points selected here currently forward to the
/// scalar implementation.
///
/// # Panics
///
/// Panics if `y`, `u` or `v` holds fewer than `width * height` bytes.
#[must_use]
pub fn yuv444_to_yuv420(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { yuv444_to_yuv420_avx2(y, u, v, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return yuv444_to_yuv420_neon(y, u, v, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
yuv444_to_yuv420_scalar(y, u, v, width, height)
}
/// Converts planar YUV 4:2:2 to planar YUV 4:4:4.
///
/// Chroma is upsampled horizontally by replication: source sample
/// `(row, col / 2)` feeds output `(row, col)`. `u` and `v` are read as tightly
/// packed planes that are `(width + 1) / 2` samples wide and `height` rows
/// tall.
///
/// Returns `(y, u, v)` where each plane holds `width * height` bytes; the luma
/// plane is a copy of the first `width * height` bytes of `y`.
///
/// On x86_64 the AVX2 implementation is used when the CPU reports AVX2 at
/// runtime, on aarch64 the NEON implementation is always used, and everywhere
/// else the scalar implementation runs.
///
/// # Panics
///
/// Panics if `y` holds fewer than `width * height` bytes, or if `u` or `v`
/// holds fewer than `((width + 1) / 2) * height` samples.
#[must_use]
pub fn yuv422_to_yuv444(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { yuv422_to_yuv444_avx2(y, u, v, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return yuv422_to_yuv444_neon(y, u, v, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
yuv422_to_yuv444_scalar(y, u, v, width, height)
}
/// Converts planar YUV 4:4:4 to planar YUV 4:2:2.
///
/// Each output chroma sample is the rounded mean (`(a + b + 1) / 2`) of two
/// horizontally adjacent input samples. For odd widths the last column is
/// paired with itself.
///
/// Returns `(y, u, v)`: `y` is a copy of the first `width * height` bytes of
/// the input luma plane, and `u` / `v` each hold `((width + 1) / 2) * height`
/// samples.
///
/// On x86_64 the AVX2 implementation is used when the CPU reports AVX2 at
/// runtime, on aarch64 the NEON implementation is always used, and everywhere
/// else the scalar implementation runs.
///
/// # Panics
///
/// Panics if `y`, `u` or `v` holds fewer than `width * height` bytes.
#[must_use]
pub fn yuv444_to_yuv422(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { yuv444_to_yuv422_avx2(y, u, v, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return yuv444_to_yuv422_neon(y, u, v, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
yuv444_to_yuv422_scalar(y, u, v, width, height)
}
/// Splits an NV12 frame (luma plane plus interleaved `U, V, U, V, ...` plane)
/// into three separate planes.
///
/// `uv_interleaved` is read as one contiguous run of
/// `((width + 1) / 2) * ((height + 1) / 2)` byte pairs; the first byte of each
/// pair goes to the `u` output and the second to the `v` output.
///
/// Returns `(y, u, v)`: `y` is a copy of the first `width * height` bytes of
/// the input luma plane.
///
/// On x86_64 the AVX2 implementation is used when the CPU reports AVX2 at
/// runtime, on aarch64 the NEON implementation is always used, and everywhere
/// else the scalar implementation runs.
///
/// # Panics
///
/// Panics if `y` holds fewer than `width * height` bytes or if
/// `uv_interleaved` holds fewer than two bytes per chroma sample.
#[must_use]
pub fn nv12_to_i420(
y: &[u8],
uv_interleaved: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { nv12_to_i420_avx2(y, uv_interleaved, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return nv12_to_i420_neon(y, uv_interleaved, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
nv12_to_i420_scalar(y, uv_interleaved, width, height)
}
/// Merges separate `u` and `v` planes into a single interleaved
/// `U, V, U, V, ...` plane.
///
/// The first `((width + 1) / 2) * ((height + 1) / 2)` samples of `u` and `v`
/// are interleaved pairwise.
///
/// Returns `(y, uv)`: `y` is a copy of the first `width * height` bytes of the
/// input luma plane and `uv` holds two bytes per chroma sample.
///
/// On x86_64 the AVX2 implementation is used when the CPU reports AVX2 at
/// runtime, on aarch64 the NEON implementation is always used, and everywhere
/// else the scalar implementation runs.
///
/// # Panics
///
/// Panics if `y` holds fewer than `width * height` bytes, or if `u` or `v`
/// holds fewer samples than the chroma plane size described above.
#[must_use]
pub fn i420_to_nv12(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: the `avx2` target feature the callee is compiled with was
// detected at runtime on the line above.
return unsafe { i420_to_nv12_avx2(y, u, v, width, height) };
}
}
#[cfg(target_arch = "aarch64")]
{
return i420_to_nv12_neon(y, u, v, width, height);
}
// On aarch64 the block above always returns, so this call is unreachable
// there; the attribute silences the resulting lint.
#[allow(unreachable_code)]
i420_to_nv12_scalar(y, u, v, width, height)
}
/// Portable 4:2:0 -> 4:4:4 upsampling by nearest-neighbour replication.
///
/// Output chroma at `(row, col)` is the input chroma at `(row / 2, col / 2)`.
/// Returns `(y, u, v)`, each `width * height` bytes long.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * ((height + 1) / 2)`.
fn yuv420_to_yuv444_scalar(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let uv_width = (width + 1) / 2;
    let uv_height = (height + 1) / 2;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_width * uv_height);
    assert!(v.len() >= uv_width * uv_height);

    let luma_len = width * height;
    let mut u_full = Vec::with_capacity(luma_len);
    let mut v_full = Vec::with_capacity(luma_len);
    for row in 0..height {
        // Two consecutive luma rows share one chroma row.
        let chroma_base = (row / 2) * uv_width;
        u_full.extend((0..width).map(|col| u[chroma_base + col / 2]));
        v_full.extend((0..width).map(|col| v[chroma_base + col / 2]));
    }
    (y[..luma_len].to_vec(), u_full, v_full)
}
/// Portable 4:4:4 -> 4:2:0 downsampling with a rounded 2x2 box filter.
///
/// Each output sample is `(a + b + c + d + 2) / 4` over the 2x2 block whose
/// top-left corner is `(2 * uv_row, 2 * uv_col)`; at the right / bottom edge
/// of odd-sized frames the last column / row is used twice.
///
/// # Panics
///
/// Panics if `y`, `u` or `v` is shorter than `width * height`.
fn yuv444_to_yuv420_scalar(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    assert!(y.len() >= width * height);
    assert!(u.len() >= width * height);
    assert!(v.len() >= width * height);
    let uv_width = (width + 1) / 2;
    let uv_height = (height + 1) / 2;

    // Rounded mean of the block anchored at (top, left), clamped to the frame.
    // Only invoked when the frame is non-empty, so `- 1` cannot underflow.
    let box_mean = |plane: &[u8], top: usize, left: usize| -> u8 {
        let bottom = (top + 1).min(height - 1);
        let right = (left + 1).min(width - 1);
        let acc: u32 = [(top, left), (top, right), (bottom, left), (bottom, right)]
            .iter()
            .map(|&(r, c)| u32::from(plane[r * width + c]))
            .sum();
        ((acc + 2) / 4) as u8
    };

    let mut u_out = Vec::with_capacity(uv_width * uv_height);
    let mut v_out = Vec::with_capacity(uv_width * uv_height);
    for uv_row in 0..uv_height {
        for uv_col in 0..uv_width {
            u_out.push(box_mean(u, uv_row * 2, uv_col * 2));
            v_out.push(box_mean(v, uv_row * 2, uv_col * 2));
        }
    }
    (y[..width * height].to_vec(), u_out, v_out)
}
/// Portable 4:2:2 -> 4:4:4 upsampling by horizontal replication.
///
/// Output chroma at `(row, col)` is the input chroma at `(row, col / 2)`.
/// Returns `(y, u, v)`, each `width * height` bytes long.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * height`.
fn yuv422_to_yuv444_scalar(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let uv_width = (width + 1) / 2;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_width * height);
    assert!(v.len() >= uv_width * height);

    let luma_len = width * height;
    let mut u_full = Vec::with_capacity(luma_len);
    let mut v_full = Vec::with_capacity(luma_len);
    for row in 0..height {
        let u_line = &u[row * uv_width..(row + 1) * uv_width];
        let v_line = &v[row * uv_width..(row + 1) * uv_width];
        u_full.extend((0..width).map(|col| u_line[col / 2]));
        v_full.extend((0..width).map(|col| v_line[col / 2]));
    }
    (y[..luma_len].to_vec(), u_full, v_full)
}
/// Portable 4:4:4 -> 4:2:2 downsampling with a rounded horizontal pair mean.
///
/// Each output sample is `(a + b + 1) / 2` over columns `2 * uv_col` and
/// `2 * uv_col + 1`; for odd widths the final column is paired with itself.
///
/// # Panics
///
/// Panics if `y`, `u` or `v` is shorter than `width * height`.
fn yuv444_to_yuv422_scalar(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    assert!(y.len() >= width * height);
    assert!(u.len() >= width * height);
    assert!(v.len() >= width * height);
    let uv_width = (width + 1) / 2;

    // Rounded mean of `line[left]` and its right neighbour (clamped to the row).
    // Only invoked when `width >= 1`, so `width - 1` cannot underflow.
    let pair_mean = |line: &[u8], left: usize| -> u8 {
        let right = (left + 1).min(width - 1);
        ((u32::from(line[left]) + u32::from(line[right]) + 1) / 2) as u8
    };

    let mut u_out = Vec::with_capacity(uv_width * height);
    let mut v_out = Vec::with_capacity(uv_width * height);
    for row in 0..height {
        let u_line = &u[row * width..(row + 1) * width];
        let v_line = &v[row * width..(row + 1) * width];
        for uv_col in 0..uv_width {
            u_out.push(pair_mean(u_line, uv_col * 2));
            v_out.push(pair_mean(v_line, uv_col * 2));
        }
    }
    (y[..width * height].to_vec(), u_out, v_out)
}
/// Portable NV12 -> I420 conversion: splits the interleaved `U, V` plane into
/// separate `u` and `v` planes.
///
/// Only the first `uv_samples * 2` bytes of `uv_interleaved` are read, where
/// `uv_samples = ((width + 1) / 2) * ((height + 1) / 2)`.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or `uv_interleaved` is
/// shorter than `uv_samples * 2`.
fn nv12_to_i420_scalar(
    y: &[u8],
    uv_interleaved: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let uv_height = (height + 1) / 2;
    let uv_samples = ((width + 1) / 2) * uv_height;
    assert!(y.len() >= width * height);
    assert!(uv_interleaved.len() >= uv_samples * 2);

    let (u_out, v_out): (Vec<u8>, Vec<u8>) = uv_interleaved[..uv_samples * 2]
        .chunks_exact(2)
        .map(|pair| (pair[0], pair[1]))
        .unzip();
    (y[..width * height].to_vec(), u_out, v_out)
}
/// Portable I420 -> NV12 conversion: interleaves the `u` and `v` planes into
/// one `U, V, U, V, ...` plane.
///
/// Only the first `uv_samples` entries of `u` and `v` are read, where
/// `uv_samples = ((width + 1) / 2) * ((height + 1) / 2)`.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `uv_samples`.
fn i420_to_nv12_scalar(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>) {
    let uv_height = (height + 1) / 2;
    let uv_samples = ((width + 1) / 2) * uv_height;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_samples);
    assert!(v.len() >= uv_samples);

    let mut uv_out = Vec::with_capacity(uv_samples * 2);
    for (&cb, &cr) in u[..uv_samples].iter().zip(&v[..uv_samples]) {
        uv_out.push(cb);
        uv_out.push(cr);
    }
    (y[..width * height].to_vec(), uv_out)
}
/// Fixed-width AVX2 kernels used by the row loops of the x86_64 converters.
///
/// Every kernel works on raw pointers and processes exactly one vector's worth
/// of data; the caller is responsible for keeping all accesses in bounds.
#[cfg(target_arch = "x86_64")]
mod avx2_impl {
    use std::arch::x86_64::*;

    /// Narrows sixteen 16-bit lanes to sixteen bytes, preserving lane order.
    ///
    /// `packus` operates on each 128-bit half independently, which leaves the
    /// wanted bytes in quadwords 0 and 2; the permute moves them side by side
    /// into the low half, which is then extracted.
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn narrow_words(words: __m256i) -> __m128i {
        let packed = _mm256_packus_epi16(words, words);
        let ordered = _mm256_permute4x64_epi64::<0b_11_01_10_00>(packed);
        _mm256_extracti128_si256::<0>(ordered)
    }

    /// Doubles 16 U and 16 V samples horizontally into 32 bytes each.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2. `src_u` / `src_v` must be readable for 16
    /// bytes and `dst_u` / `dst_v` writable for 32 bytes.
    #[target_feature(enable = "avx2")]
    #[inline]
    pub(super) unsafe fn replicate_chroma_h2_avx2(
        src_u: *const u8,
        src_v: *const u8,
        dst_u: *mut u8,
        dst_v: *mut u8,
    ) {
        let u_in = _mm_loadu_si128(src_u.cast::<__m128i>());
        let v_in = _mm_loadu_si128(src_v.cast::<__m128i>());
        // Unpacking a register with itself yields each byte twice in a row.
        let u_wide = _mm256_set_m128i(
            _mm_unpackhi_epi8(u_in, u_in),
            _mm_unpacklo_epi8(u_in, u_in),
        );
        let v_wide = _mm256_set_m128i(
            _mm_unpackhi_epi8(v_in, v_in),
            _mm_unpacklo_epi8(v_in, v_in),
        );
        _mm256_storeu_si256(dst_u.cast::<__m256i>(), u_wide);
        _mm256_storeu_si256(dst_v.cast::<__m256i>(), v_wide);
    }

    /// Splits 16 interleaved `U, V` pairs into 16 U bytes and 16 V bytes.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2. `src_uv` must be readable for 32 bytes and
    /// `dst_u` / `dst_v` writable for 16 bytes.
    #[target_feature(enable = "avx2")]
    #[inline]
    pub(super) unsafe fn deinterleave_uv_avx2(src_uv: *const u8, dst_u: *mut u8, dst_v: *mut u8) {
        let pairs = _mm256_loadu_si256(src_uv.cast::<__m256i>());
        // Viewing each pair as a little-endian u16: U is the low byte, V the high byte.
        let low_byte = _mm256_set1_epi16(0x00FF_u16 as i16);
        let u_words = _mm256_and_si256(pairs, low_byte);
        let v_words = _mm256_srli_epi16::<8>(pairs);
        _mm_storeu_si128(dst_u.cast::<__m128i>(), narrow_words(u_words));
        _mm_storeu_si128(dst_v.cast::<__m128i>(), narrow_words(v_words));
    }

    /// Interleaves 16 U bytes and 16 V bytes into 32 bytes of `U, V` pairs.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2. `src_u` / `src_v` must be readable for 16
    /// bytes and `dst_uv` writable for 32 bytes.
    #[target_feature(enable = "avx2")]
    #[inline]
    pub(super) unsafe fn interleave_uv_avx2(src_u: *const u8, src_v: *const u8, dst_uv: *mut u8) {
        let u_in = _mm_loadu_si128(src_u.cast::<__m128i>());
        let v_in = _mm_loadu_si128(src_v.cast::<__m128i>());
        let first_eight = _mm_unpacklo_epi8(u_in, v_in);
        let last_eight = _mm_unpackhi_epi8(u_in, v_in);
        _mm256_storeu_si256(
            dst_uv.cast::<__m256i>(),
            _mm256_set_m128i(last_eight, first_eight),
        );
    }

    /// Writes the rounded mean `(a + b + 1) >> 1` of 16 adjacent byte pairs.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2. `src` must be readable for 32 bytes and
    /// `dst` writable for 16 bytes.
    #[target_feature(enable = "avx2")]
    #[inline]
    pub(super) unsafe fn avg_pairs_avx2(src: *const u8, dst: *mut u8) {
        let bytes = _mm256_loadu_si256(src.cast::<__m256i>());
        let low_byte = _mm256_set1_epi16(0x00FF_u16 as i16);
        let left = _mm256_and_si256(bytes, low_byte);
        let right = _mm256_srli_epi16::<8>(bytes);
        let biased = _mm256_add_epi16(_mm256_add_epi16(left, right), _mm256_set1_epi16(1));
        let mean = _mm256_srli_epi16::<1>(biased);
        _mm_storeu_si128(dst.cast::<__m128i>(), narrow_words(mean));
    }
}
/// AVX2 implementation of 4:2:0 -> 4:4:4 chroma upsampling.
///
/// Frames narrower than 32 pixels or shorter than 2 rows are handed to the
/// scalar implementation. Otherwise each row is processed 32 output pixels at
/// a time and the remaining columns are filled by a scalar tail loop.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
///
/// # Panics
///
/// Panics if `width * height` overflows `usize`, if `y` is shorter than
/// `width * height`, or if `u` / `v` is shorter than
/// `((width + 1) / 2) * ((height + 1) / 2)`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn yuv420_to_yuv444_avx2(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    if width < 32 || height < 2 {
        return yuv420_to_yuv444_scalar(y, u, v, width, height);
    }
    // A wrapped product would let the length checks below pass on undersized
    // buffers while the row loop still walks the full `width` x `height`
    // extent through raw pointers, so overflow must be rejected explicitly.
    let total = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    assert!(y.len() >= total);
    // `width` and `height` are both <= `total` <= `y.len()` here, so neither
    // `+ 1` nor the chroma product (which is <= `total`) can overflow.
    let uv_width = (width + 1) / 2;
    let uv_height = (height + 1) / 2;
    assert!(u.len() >= uv_width * uv_height);
    assert!(v.len() >= uv_width * uv_height);

    let mut u_out = vec![0u8; total];
    let mut v_out = vec![0u8; total];
    let y_out = y[..total].to_vec();
    let chunks = width / 32;
    let done = chunks * 32;
    for row in 0..height {
        let uv_row = row / 2;
        // SAFETY: `uv_row < uv_height` and `row < height`, so both row starts
        // lie inside their (length-checked) planes.
        let src_u_row = u.as_ptr().add(uv_row * uv_width);
        let src_v_row = v.as_ptr().add(uv_row * uv_width);
        let dst_u_row = u_out.as_mut_ptr().add(row * width);
        let dst_v_row = v_out.as_mut_ptr().add(row * width);
        for chunk in 0..chunks {
            // SAFETY: each call reads 16 source bytes ending at or before
            // `chunks * 16 <= uv_width` and writes 32 destination bytes ending
            // at or before `chunks * 32 <= width`, all within the current row.
            avx2_impl::replicate_chroma_h2_avx2(
                src_u_row.add(chunk * 16),
                src_v_row.add(chunk * 16),
                dst_u_row.add(chunk * 32),
                dst_v_row.add(chunk * 32),
            );
        }
        // Columns that do not fill a whole 32-pixel chunk.
        for col in done..width {
            let src = uv_row * uv_width + col / 2;
            u_out[row * width + col] = u[src];
            v_out[row * width + col] = v[src];
        }
    }
    (y_out, u_out, v_out)
}
/// AVX2 entry point for 4:4:4 -> 4:2:0 downsampling.
///
/// No vectorised kernel is used here: the call is forwarded unchanged to the
/// scalar implementation, so results and panics are exactly those of
/// `yuv444_to_yuv420_scalar`.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must
/// ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn yuv444_to_yuv420_avx2(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
yuv444_to_yuv420_scalar(y, u, v, width, height)
}
/// AVX2 implementation of 4:2:2 -> 4:4:4 chroma upsampling.
///
/// Frames narrower than 32 pixels are handed to the scalar implementation.
/// Otherwise each row is processed 32 output pixels at a time and the
/// remaining columns are filled by a scalar tail loop.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
///
/// # Panics
///
/// Panics if `width * height` overflows `usize`, if `y` is shorter than
/// `width * height`, or if `u` / `v` is shorter than
/// `((width + 1) / 2) * height`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn yuv422_to_yuv444_avx2(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    if width < 32 {
        return yuv422_to_yuv444_scalar(y, u, v, width, height);
    }
    // A wrapped product would let the length checks below pass on undersized
    // buffers while the row loop still walks the full `width` x `height`
    // extent through raw pointers, so overflow must be rejected explicitly.
    let total = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    // Same value as `(width + 1) / 2`, written so it cannot overflow even when
    // `height == 0` leaves `width` unconstrained by the luma length check.
    let uv_width = width / 2 + width % 2;
    assert!(y.len() >= total);
    // `uv_width <= width`, so this product is bounded by `total`.
    assert!(u.len() >= uv_width * height);
    assert!(v.len() >= uv_width * height);

    let mut u_out = vec![0u8; total];
    let mut v_out = vec![0u8; total];
    let y_out = y[..total].to_vec();
    let chunks = width / 32;
    let done = chunks * 32;
    for row in 0..height {
        // SAFETY: `row < height`, so both row starts lie inside their
        // (length-checked) planes.
        let src_u_row = u.as_ptr().add(row * uv_width);
        let src_v_row = v.as_ptr().add(row * uv_width);
        let dst_u_row = u_out.as_mut_ptr().add(row * width);
        let dst_v_row = v_out.as_mut_ptr().add(row * width);
        for chunk in 0..chunks {
            // SAFETY: each call reads 16 source bytes ending at or before
            // `chunks * 16 <= uv_width` and writes 32 destination bytes ending
            // at or before `chunks * 32 <= width`, all within the current row.
            avx2_impl::replicate_chroma_h2_avx2(
                src_u_row.add(chunk * 16),
                src_v_row.add(chunk * 16),
                dst_u_row.add(chunk * 32),
                dst_v_row.add(chunk * 32),
            );
        }
        // Columns that do not fill a whole 32-pixel chunk.
        for col in done..width {
            let src = row * uv_width + col / 2;
            u_out[row * width + col] = u[src];
            v_out[row * width + col] = v[src];
        }
    }
    (y_out, u_out, v_out)
}
/// AVX2 implementation of 4:4:4 -> 4:2:2 chroma downsampling.
///
/// Frames narrower than 32 pixels are handed to the scalar implementation.
/// Otherwise each row is processed 32 input pixels (16 output samples) at a
/// time and the remaining samples are produced by a scalar tail loop using
/// the same `(a + b + 1) / 2` rounding.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
///
/// # Panics
///
/// Panics if `width * height` overflows `usize` or if `y`, `u` or `v` is
/// shorter than `width * height`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn yuv444_to_yuv422_avx2(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    if width < 32 {
        return yuv444_to_yuv422_scalar(y, u, v, width, height);
    }
    // A wrapped product would let the length checks below pass on undersized
    // buffers while the row loop still walks the full `width` x `height`
    // extent through raw pointers, so overflow must be rejected explicitly.
    let total = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    assert!(y.len() >= total);
    assert!(u.len() >= total);
    assert!(v.len() >= total);
    // Same value as `(width + 1) / 2`, written so it cannot overflow even when
    // `height == 0` leaves `width` unconstrained by the length checks.
    let uv_width = width / 2 + width % 2;
    // `uv_width <= width`, so this product is bounded by `total`.
    let uv_total = uv_width * height;

    let mut u_out = vec![0u8; uv_total];
    let mut v_out = vec![0u8; uv_total];
    let y_out = y[..total].to_vec();
    let chunks = width / 32;
    let done_uv = chunks * 16;
    for row in 0..height {
        // SAFETY: `row < height`, so both row starts lie inside their
        // (length-checked) planes.
        let src_u_row = u.as_ptr().add(row * width);
        let src_v_row = v.as_ptr().add(row * width);
        let dst_u_row = u_out.as_mut_ptr().add(row * uv_width);
        let dst_v_row = v_out.as_mut_ptr().add(row * uv_width);
        for chunk in 0..chunks {
            // SAFETY: each call reads 32 source bytes ending at or before
            // `chunks * 32 <= width` and writes 16 destination bytes ending at
            // or before `chunks * 16 <= uv_width`, all within the current row.
            avx2_impl::avg_pairs_avx2(src_u_row.add(chunk * 32), dst_u_row.add(chunk * 16));
            avx2_impl::avg_pairs_avx2(src_v_row.add(chunk * 32), dst_v_row.add(chunk * 16));
        }
        // Output samples that do not fill a whole 16-sample chunk.
        for uv_col in done_uv..uv_width {
            let col0 = uv_col * 2;
            // For odd widths the final column is averaged with itself.
            let col1 = (col0 + 1).min(width - 1);
            let u_sum = u32::from(u[row * width + col0]) + u32::from(u[row * width + col1]);
            let v_sum = u32::from(v[row * width + col0]) + u32::from(v[row * width + col1]);
            u_out[row * uv_width + uv_col] = ((u_sum + 1) / 2) as u8;
            v_out[row * uv_width + uv_col] = ((v_sum + 1) / 2) as u8;
        }
    }
    (y_out, u_out, v_out)
}
/// AVX2 implementation of NV12 -> I420: de-interleaves the chroma plane 16
/// sample pairs at a time, then finishes the remainder with a scalar loop.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or `uv_interleaved` is
/// shorter than two bytes per chroma sample.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn nv12_to_i420_avx2(
    y: &[u8],
    uv_interleaved: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let uv_height = (height + 1) / 2;
    let uv_width = (width + 1) / 2;
    let uv_samples = uv_width * uv_height;
    assert!(y.len() >= width * height);
    assert!(uv_interleaved.len() >= uv_samples * 2);

    let mut u_out = vec![0u8; uv_samples];
    let mut v_out = vec![0u8; uv_samples];
    // Number of samples covered by whole 16-sample vectors.
    let vector_len = (uv_samples / 16) * 16;
    for base in (0..vector_len).step_by(16) {
        // SAFETY: `base + 16 <= uv_samples`, so the 32-byte read stays inside
        // the length-checked source and both 16-byte writes stay inside the
        // freshly allocated outputs.
        avx2_impl::deinterleave_uv_avx2(
            uv_interleaved.as_ptr().add(base * 2),
            u_out.as_mut_ptr().add(base),
            v_out.as_mut_ptr().add(base),
        );
    }
    // Scalar tail for the samples left after the last full vector.
    let tail_pairs = uv_interleaved[vector_len * 2..uv_samples * 2].chunks_exact(2);
    for (offset, pair) in tail_pairs.enumerate() {
        u_out[vector_len + offset] = pair[0];
        v_out[vector_len + offset] = pair[1];
    }
    (y[..width * height].to_vec(), u_out, v_out)
}
/// AVX2 implementation of I420 -> NV12: interleaves the chroma planes 16
/// samples at a time, then finishes the remainder with a scalar loop.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * ((height + 1) / 2)`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn i420_to_nv12_avx2(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>) {
    let uv_height = (height + 1) / 2;
    let uv_width = (width + 1) / 2;
    let uv_samples = uv_width * uv_height;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_samples);
    assert!(v.len() >= uv_samples);

    let mut uv_out = vec![0u8; uv_samples * 2];
    // Number of samples covered by whole 16-sample vectors.
    let vector_len = (uv_samples / 16) * 16;
    for base in (0..vector_len).step_by(16) {
        // SAFETY: `base + 16 <= uv_samples`, so both 16-byte reads stay inside
        // the length-checked sources and the 32-byte write stays inside
        // `uv_out`.
        avx2_impl::interleave_uv_avx2(
            u.as_ptr().add(base),
            v.as_ptr().add(base),
            uv_out.as_mut_ptr().add(base * 2),
        );
    }
    // Scalar tail for the samples left after the last full vector.
    let tail_pairs = uv_out[vector_len * 2..].chunks_exact_mut(2);
    let tail_u = &u[vector_len..uv_samples];
    let tail_v = &v[vector_len..uv_samples];
    for ((pair, &cb), &cr) in tail_pairs.zip(tail_u).zip(tail_v) {
        pair[0] = cb;
        pair[1] = cr;
    }
    (y[..width * height].to_vec(), uv_out)
}
/// NEON implementation of 4:2:0 -> 4:4:4 chroma upsampling.
///
/// Each row is processed 16 output pixels at a time by zipping an 8-byte
/// chroma vector with itself; leftover columns go through a scalar tail.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * ((height + 1) / 2)`.
#[cfg(target_arch = "aarch64")]
fn yuv420_to_yuv444_neon(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    use std::arch::aarch64::*;
    let uv_width = (width + 1) / 2;
    let uv_height = (height + 1) / 2;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_width * uv_height);
    assert!(v.len() >= uv_width * uv_height);

    let total = width * height;
    let mut u_out = vec![0u8; total];
    let mut v_out = vec![0u8; total];
    let y_out = y[..total].to_vec();
    // Columns covered by whole 16-pixel vectors.
    let simd_cols = (width / 16) * 16;
    for row in 0..height {
        // Two consecutive luma rows share one chroma row.
        let chroma_start = (row / 2) * uv_width;
        let u_line = &u[chroma_start..];
        let v_line = &v[chroma_start..];
        let out_start = row * width;
        let u_dst = &mut u_out[out_start..out_start + width];
        let v_dst = &mut v_out[out_start..out_start + width];
        for col in (0..simd_cols).step_by(16) {
            // SAFETY: `col + 16 <= width`, so the 16-byte stores stay inside
            // the `width`-long destination rows, and the 8-byte loads end at
            // `col / 2 + 8 <= uv_width`, inside the chroma row.
            unsafe {
                let u_src = vld1_u8(u_line.as_ptr().add(col / 2));
                let v_src = vld1_u8(v_line.as_ptr().add(col / 2));
                let u_dup = vzip_u8(u_src, u_src);
                let v_dup = vzip_u8(v_src, v_src);
                vst1q_u8(u_dst.as_mut_ptr().add(col), vcombine_u8(u_dup.0, u_dup.1));
                vst1q_u8(v_dst.as_mut_ptr().add(col), vcombine_u8(v_dup.0, v_dup.1));
            }
        }
        for col in simd_cols..width {
            u_dst[col] = u_line[col / 2];
            v_dst[col] = v_line[col / 2];
        }
    }
    (y_out, u_out, v_out)
}
/// NEON entry point for 4:4:4 -> 4:2:0 downsampling.
///
/// No vectorised kernel is used here: the call is forwarded unchanged to the
/// scalar implementation, so results and panics are exactly those of
/// `yuv444_to_yuv420_scalar`.
#[cfg(target_arch = "aarch64")]
fn yuv444_to_yuv420_neon(
y: &[u8],
u: &[u8],
v: &[u8],
width: usize,
height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
yuv444_to_yuv420_scalar(y, u, v, width, height)
}
/// NEON implementation of 4:2:2 -> 4:4:4 chroma upsampling.
///
/// Each row is processed 16 output pixels at a time by zipping an 8-byte
/// chroma vector with itself; leftover columns go through a scalar tail.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * height`.
#[cfg(target_arch = "aarch64")]
fn yuv422_to_yuv444_neon(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    use std::arch::aarch64::*;
    let uv_width = (width + 1) / 2;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_width * height);
    assert!(v.len() >= uv_width * height);

    let total = width * height;
    let mut u_out = vec![0u8; total];
    let mut v_out = vec![0u8; total];
    let y_out = y[..total].to_vec();
    // Columns covered by whole 16-pixel vectors.
    let simd_cols = (width / 16) * 16;
    for row in 0..height {
        let u_line = &u[row * uv_width..];
        let v_line = &v[row * uv_width..];
        let out_start = row * width;
        let u_dst = &mut u_out[out_start..out_start + width];
        let v_dst = &mut v_out[out_start..out_start + width];
        for col in (0..simd_cols).step_by(16) {
            // SAFETY: `col + 16 <= width`, so the 16-byte stores stay inside
            // the `width`-long destination rows, and the 8-byte loads end at
            // `col / 2 + 8 <= uv_width`, inside the chroma row.
            unsafe {
                let u_src = vld1_u8(u_line.as_ptr().add(col / 2));
                let v_src = vld1_u8(v_line.as_ptr().add(col / 2));
                let u_dup = vzip_u8(u_src, u_src);
                let v_dup = vzip_u8(v_src, v_src);
                vst1q_u8(u_dst.as_mut_ptr().add(col), vcombine_u8(u_dup.0, u_dup.1));
                vst1q_u8(v_dst.as_mut_ptr().add(col), vcombine_u8(v_dup.0, v_dup.1));
            }
        }
        for col in simd_cols..width {
            u_dst[col] = u_line[col / 2];
            v_dst[col] = v_line[col / 2];
        }
    }
    (y_out, u_out, v_out)
}
/// NEON implementation of 4:4:4 -> 4:2:2 chroma downsampling.
///
/// Each row is processed 16 input pixels (8 output samples) at a time using a
/// pairwise widening add followed by `(sum + 1) >> 1`; leftover samples go
/// through a scalar tail with the same rounding.
///
/// # Panics
///
/// Panics if `y`, `u` or `v` is shorter than `width * height`.
#[cfg(target_arch = "aarch64")]
fn yuv444_to_yuv422_neon(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    use std::arch::aarch64::*;
    assert!(y.len() >= width * height);
    assert!(u.len() >= width * height);
    assert!(v.len() >= width * height);
    let uv_width = (width + 1) / 2;
    let uv_total = uv_width * height;

    let mut u_out = vec![0u8; uv_total];
    let mut v_out = vec![0u8; uv_total];
    let y_out = y[..width * height].to_vec();
    // Output samples covered by whole 8-sample vectors.
    let simd_samples = (width / 16) * 8;
    for row in 0..height {
        let u_line = &u[row * width..row * width + width];
        let v_line = &v[row * width..row * width + width];
        let u_dst = &mut u_out[row * uv_width..row * uv_width + uv_width];
        let v_dst = &mut v_out[row * uv_width..row * uv_width + uv_width];
        for out_col in (0..simd_samples).step_by(8) {
            // SAFETY: `out_col * 2 + 16 <= width`, so the 16-byte loads stay
            // inside the `width`-long source rows, and `out_col + 8 <=
            // uv_width`, so the 8-byte stores stay inside the destination rows.
            unsafe {
                let u_px = vld1q_u8(u_line.as_ptr().add(out_col * 2));
                let v_px = vld1q_u8(v_line.as_ptr().add(out_col * 2));
                let bias = vdupq_n_u16(1);
                let u_mean = vshrq_n_u16(vaddq_u16(vpaddlq_u8(u_px), bias), 1);
                let v_mean = vshrq_n_u16(vaddq_u16(vpaddlq_u8(v_px), bias), 1);
                vst1_u8(u_dst.as_mut_ptr().add(out_col), vmovn_u16(u_mean));
                vst1_u8(v_dst.as_mut_ptr().add(out_col), vmovn_u16(v_mean));
            }
        }
        for uv_col in simd_samples..uv_width {
            let left = uv_col * 2;
            // For odd widths the final column is averaged with itself.
            let right = (left + 1).min(width - 1);
            let u_sum = u32::from(u_line[left]) + u32::from(u_line[right]);
            let v_sum = u32::from(v_line[left]) + u32::from(v_line[right]);
            u_dst[uv_col] = ((u_sum + 1) / 2) as u8;
            v_dst[uv_col] = ((v_sum + 1) / 2) as u8;
        }
    }
    (y_out, u_out, v_out)
}
/// NEON implementation of NV12 -> I420: de-interleaves the chroma plane 16
/// sample pairs at a time with a structured load, then finishes the remainder
/// with a scalar loop.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or `uv_interleaved` is
/// shorter than two bytes per chroma sample.
#[cfg(target_arch = "aarch64")]
fn nv12_to_i420_neon(
    y: &[u8],
    uv_interleaved: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    use std::arch::aarch64::*;
    let uv_height = (height + 1) / 2;
    let uv_width = (width + 1) / 2;
    let uv_samples = uv_width * uv_height;
    assert!(y.len() >= width * height);
    assert!(uv_interleaved.len() >= uv_samples * 2);

    let mut u_out = vec![0u8; uv_samples];
    let mut v_out = vec![0u8; uv_samples];
    // Number of samples covered by whole 16-sample vectors.
    let vector_len = (uv_samples / 16) * 16;
    for base in (0..vector_len).step_by(16) {
        // SAFETY: `base + 16 <= uv_samples`, so the 32-byte load stays inside
        // the length-checked source and both 16-byte stores stay inside the
        // outputs.
        unsafe {
            let planes = vld2q_u8(uv_interleaved.as_ptr().add(base * 2));
            vst1q_u8(u_out.as_mut_ptr().add(base), planes.0);
            vst1q_u8(v_out.as_mut_ptr().add(base), planes.1);
        }
    }
    // Scalar tail for the samples left after the last full vector.
    let tail_pairs = uv_interleaved[vector_len * 2..uv_samples * 2].chunks_exact(2);
    for (offset, pair) in tail_pairs.enumerate() {
        u_out[vector_len + offset] = pair[0];
        v_out[vector_len + offset] = pair[1];
    }
    (y[..width * height].to_vec(), u_out, v_out)
}
/// NEON implementation of I420 -> NV12: interleaves the chroma planes 16
/// samples at a time with a structured store, then finishes the remainder with
/// a scalar loop.
///
/// # Panics
///
/// Panics if `y` is shorter than `width * height` or if `u` / `v` is shorter
/// than `((width + 1) / 2) * ((height + 1) / 2)`.
#[cfg(target_arch = "aarch64")]
fn i420_to_nv12_neon(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    width: usize,
    height: usize,
) -> (Vec<u8>, Vec<u8>) {
    use std::arch::aarch64::*;
    let uv_height = (height + 1) / 2;
    let uv_width = (width + 1) / 2;
    let uv_samples = uv_width * uv_height;
    assert!(y.len() >= width * height);
    assert!(u.len() >= uv_samples);
    assert!(v.len() >= uv_samples);

    let mut uv_out = vec![0u8; uv_samples * 2];
    // Number of samples covered by whole 16-sample vectors.
    let vector_len = (uv_samples / 16) * 16;
    for base in (0..vector_len).step_by(16) {
        // SAFETY: `base + 16 <= uv_samples`, so both 16-byte loads stay inside
        // the length-checked sources and the 32-byte store stays inside
        // `uv_out`.
        unsafe {
            let cb = vld1q_u8(u.as_ptr().add(base));
            let cr = vld1q_u8(v.as_ptr().add(base));
            vst2q_u8(uv_out.as_mut_ptr().add(base * 2), uint8x16x2_t(cb, cr));
        }
    }
    // Scalar tail for the samples left after the last full vector.
    let tail_pairs = uv_out[vector_len * 2..].chunks_exact_mut(2);
    let tail_u = &u[vector_len..uv_samples];
    let tail_v = &v[vector_len..uv_samples];
    for ((pair, &cb), &cr) in tail_pairs.zip(tail_u).zip(tail_v) {
        pair[0] = cb;
        pair[1] = cr;
    }
    (y[..width * height].to_vec(), uv_out)
}
/// Unit tests for the public conversion entry points.
///
/// Besides size / luma / known-value checks, the `*_matches_scalar` tests
/// compare whatever implementation the dispatcher selects on the current
/// machine against the scalar reference, using dimensions that exercise both
/// full vector chunks and the scalar tail columns.
#[cfg(test)]
mod tests {
    use super::*;

    /// Dimensions wide enough for the vector paths, including odd sizes.
    const SIMD_DIMS: [(usize, usize); 5] = [(32, 2), (33, 3), (47, 5), (64, 4), (70, 9)];

    /// Builds deterministic, non-constant 4:4:4 planes.
    fn make_yuv444(width: usize, height: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
        let n = width * height;
        let y: Vec<u8> = (0..n).map(|i| (i % 235 + 16) as u8).collect();
        let u: Vec<u8> = (0..n).map(|i| (i % 200 + 16) as u8).collect();
        let v: Vec<u8> = (0..n).map(|i| (i % 180 + 40) as u8).collect();
        (y, u, v)
    }

    /// Builds deterministic, non-constant 4:2:0 planes.
    fn make_yuv420(width: usize, height: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
        let luma_n = width * height;
        let uv_w = (width + 1) / 2;
        let uv_h = (height + 1) / 2;
        let uv_n = uv_w * uv_h;
        let y: Vec<u8> = (0..luma_n).map(|i| (i % 235 + 16) as u8).collect();
        let u: Vec<u8> = (0..uv_n).map(|i| (i % 120 + 16) as u8).collect();
        let v: Vec<u8> = (0..uv_n).map(|i| (i % 100 + 50) as u8).collect();
        (y, u, v)
    }

    /// Builds deterministic, non-constant 4:2:2 planes.
    fn make_yuv422(width: usize, height: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
        let luma_n = width * height;
        let uv_w = (width + 1) / 2;
        let uv_n = uv_w * height;
        let y: Vec<u8> = (0..luma_n).map(|i| (i % 235 + 16) as u8).collect();
        let u: Vec<u8> = (0..uv_n).map(|i| (i % 120 + 16) as u8).collect();
        let v: Vec<u8> = (0..uv_n).map(|i| (i % 100 + 50) as u8).collect();
        (y, u, v)
    }

    #[test]
    fn yuv420_to_yuv444_output_size() {
        let (y, u, v) = make_yuv420(64, 48);
        let (y_o, u_o, v_o) = yuv420_to_yuv444(&y, &u, &v, 64, 48);
        assert_eq!(y_o.len(), 64 * 48);
        assert_eq!(u_o.len(), 64 * 48);
        assert_eq!(v_o.len(), 64 * 48);
    }

    #[test]
    fn yuv420_to_yuv444_luma_unchanged() {
        let (y, u, v) = make_yuv420(32, 32);
        let (y_o, _, _) = yuv420_to_yuv444(&y, &u, &v, 32, 32);
        assert_eq!(y_o, y);
    }

    #[test]
    fn yuv420_to_yuv444_gray_image() {
        let w = 8;
        let h = 8;
        let y = vec![128u8; w * h];
        let u = vec![128u8; (w / 2) * (h / 2)];
        let v = vec![128u8; (w / 2) * (h / 2)];
        let (_, u_o, v_o) = yuv420_to_yuv444(&y, &u, &v, w, h);
        assert!(u_o.iter().all(|&x| x == 128));
        assert!(v_o.iter().all(|&x| x == 128));
    }

    #[test]
    fn yuv420_to_yuv444_chroma_replication() {
        let w = 2;
        let h = 2;
        let y = vec![100u8; 4];
        let u = vec![77u8; 1];
        let v = vec![88u8; 1];
        let (_, u_o, v_o) = yuv420_to_yuv444(&y, &u, &v, w, h);
        assert!(u_o.iter().all(|&x| x == 77));
        assert!(v_o.iter().all(|&x| x == 88));
    }

    #[test]
    fn yuv420_to_yuv444_odd_dimensions() {
        let (y, u, v) = make_yuv420(7, 5);
        let (y_o, u_o, v_o) = yuv420_to_yuv444(&y, &u, &v, 7, 5);
        assert_eq!(y_o.len(), 35);
        assert_eq!(u_o.len(), 35);
        assert_eq!(v_o.len(), 35);
    }

    #[test]
    fn yuv420_to_yuv444_matches_scalar() {
        for &(w, h) in &SIMD_DIMS {
            let (y, u, v) = make_yuv420(w, h);
            assert_eq!(
                yuv420_to_yuv444(&y, &u, &v, w, h),
                yuv420_to_yuv444_scalar(&y, &u, &v, w, h),
                "mismatch for {}x{}",
                w,
                h
            );
        }
    }

    #[test]
    fn yuv444_to_yuv420_output_size() {
        let (y, u, v) = make_yuv444(64, 48);
        let (y_o, u_o, v_o) = yuv444_to_yuv420(&y, &u, &v, 64, 48);
        assert_eq!(y_o.len(), 64 * 48);
        assert_eq!(u_o.len(), 32 * 24);
        assert_eq!(v_o.len(), 32 * 24);
    }

    #[test]
    fn yuv444_to_yuv420_luma_unchanged() {
        let (y, u, v) = make_yuv444(16, 16);
        let (y_o, _, _) = yuv444_to_yuv420(&y, &u, &v, 16, 16);
        assert_eq!(y_o, y);
    }

    #[test]
    fn yuv444_to_yuv420_constant_chroma() {
        let w = 4;
        let h = 4;
        let y = vec![100u8; w * h];
        let u = vec![60u8; w * h];
        let v = vec![200u8; w * h];
        let (_, u_o, v_o) = yuv444_to_yuv420(&y, &u, &v, w, h);
        assert!(u_o.iter().all(|&x| x == 60));
        assert!(v_o.iter().all(|&x| x == 200));
    }

    #[test]
    fn yuv444_to_yuv420_averaging() {
        let w = 2;
        let h = 2;
        let y = vec![100u8; 4];
        let u = vec![10u8, 20u8, 30u8, 40u8];
        let v = vec![100u8, 110u8, 120u8, 130u8];
        let (_, u_o, v_o) = yuv444_to_yuv420(&y, &u, &v, w, h);
        assert_eq!(u_o.len(), 1);
        assert_eq!(u_o[0], 25);
        assert_eq!(v_o[0], 115);
    }

    #[test]
    fn yuv422_to_yuv444_output_size() {
        let (y, u, v) = make_yuv422(64, 48);
        let (y_o, u_o, v_o) = yuv422_to_yuv444(&y, &u, &v, 64, 48);
        assert_eq!(y_o.len(), 64 * 48);
        assert_eq!(u_o.len(), 64 * 48);
        assert_eq!(v_o.len(), 64 * 48);
    }

    #[test]
    fn yuv422_to_yuv444_luma_unchanged() {
        let (y, u, v) = make_yuv422(32, 16);
        let (y_o, _, _) = yuv422_to_yuv444(&y, &u, &v, 32, 16);
        assert_eq!(y_o, y);
    }

    #[test]
    fn yuv422_to_yuv444_horizontal_replication() {
        let w = 4;
        let h = 1;
        let y = vec![100u8; w * h];
        let u = vec![10u8, 20u8];
        let v = vec![30u8, 40u8];
        let (_, u_o, v_o) = yuv422_to_yuv444(&y, &u, &v, w, h);
        assert_eq!(u_o[0], 10);
        assert_eq!(u_o[1], 10);
        assert_eq!(u_o[2], 20);
        assert_eq!(u_o[3], 20);
        assert_eq!(v_o[0], 30);
        assert_eq!(v_o[3], 40);
    }

    #[test]
    fn yuv422_to_yuv444_matches_scalar() {
        for &(w, h) in &SIMD_DIMS {
            let (y, u, v) = make_yuv422(w, h);
            assert_eq!(
                yuv422_to_yuv444(&y, &u, &v, w, h),
                yuv422_to_yuv444_scalar(&y, &u, &v, w, h),
                "mismatch for {}x{}",
                w,
                h
            );
        }
    }

    #[test]
    fn yuv444_to_yuv422_output_size() {
        let (y, u, v) = make_yuv444(64, 48);
        let (y_o, u_o, v_o) = yuv444_to_yuv422(&y, &u, &v, 64, 48);
        assert_eq!(y_o.len(), 64 * 48);
        assert_eq!(u_o.len(), 32 * 48);
        assert_eq!(v_o.len(), 32 * 48);
    }

    #[test]
    fn yuv444_to_yuv422_luma_unchanged() {
        let (y, u, v) = make_yuv444(16, 8);
        let (y_o, _, _) = yuv444_to_yuv422(&y, &u, &v, 16, 8);
        assert_eq!(y_o, y);
    }

    #[test]
    fn yuv444_to_yuv422_horizontal_averaging() {
        let w = 4;
        let h = 1;
        let y = vec![100u8; w];
        let u = vec![10u8, 20u8, 30u8, 40u8];
        let v = vec![50u8, 60u8, 70u8, 80u8];
        let (_, u_o, v_o) = yuv444_to_yuv422(&y, &u, &v, w, h);
        assert_eq!(u_o[0], 15);
        assert_eq!(u_o[1], 35);
        assert_eq!(v_o[0], 55);
        assert_eq!(v_o[1], 75);
    }

    #[test]
    fn yuv444_to_yuv422_matches_scalar() {
        for &(w, h) in &SIMD_DIMS {
            let (y, u, v) = make_yuv444(w, h);
            assert_eq!(
                yuv444_to_yuv422(&y, &u, &v, w, h),
                yuv444_to_yuv422_scalar(&y, &u, &v, w, h),
                "mismatch for {}x{}",
                w,
                h
            );
        }
    }

    #[test]
    fn yuv422_444_422_roundtrip_constant() {
        let w = 32;
        let h = 16;
        let (y_orig, u_orig, v_orig) = make_yuv422(w, h);
        let (y444, u444, v444) = yuv422_to_yuv444(&y_orig, &u_orig, &v_orig, w, h);
        let (y_rt, u_rt, v_rt) = yuv444_to_yuv422(&y444, &u444, &v444, w, h);
        assert_eq!(y_rt, y_orig, "luma must survive round-trip");
        for ((&a, &b), (&c, &d)) in u_orig
            .iter()
            .zip(u_rt.iter())
            .zip(v_orig.iter().zip(v_rt.iter()))
        {
            let diff_u = (i32::from(a) - i32::from(b)).unsigned_abs();
            let diff_v = (i32::from(c) - i32::from(d)).unsigned_abs();
            assert!(diff_u <= 1, "U chroma diff {} exceeds 1", diff_u);
            assert!(diff_v <= 1, "V chroma diff {} exceeds 1", diff_v);
        }
    }

    #[test]
    fn nv12_to_i420_output_size() {
        let w = 64;
        let h = 48;
        let y = vec![128u8; w * h];
        let uv = vec![128u8; w * h / 2];
        let (y_o, u_o, v_o) = nv12_to_i420(&y, &uv, w, h);
        assert_eq!(y_o.len(), w * h);
        assert_eq!(u_o.len(), (w / 2) * (h / 2));
        assert_eq!(v_o.len(), (w / 2) * (h / 2));
    }

    #[test]
    fn nv12_to_i420_luma_unchanged() {
        let w = 16;
        let h = 16;
        let y: Vec<u8> = (0..w * h).map(|i| i as u8).collect();
        let uv = vec![128u8; w * h / 2];
        let (y_o, _, _) = nv12_to_i420(&y, &uv, w, h);
        assert_eq!(y_o, y);
    }

    #[test]
    fn nv12_to_i420_deinterleave_known_values() {
        let w = 4;
        let h = 4;
        let y = vec![0u8; w * h];
        let uv = vec![10u8, 20, 30, 40, 50, 60, 70, 80];
        let (_, u_o, v_o) = nv12_to_i420(&y, &uv, w, h);
        assert_eq!(u_o, [10, 30, 50, 70]);
        assert_eq!(v_o, [20, 40, 60, 80]);
    }

    #[test]
    fn nv12_to_i420_matches_scalar() {
        for &(w, h) in &SIMD_DIMS {
            let uv_n = ((w + 1) / 2) * ((h + 1) / 2);
            let y: Vec<u8> = (0..w * h).map(|i| (i % 251) as u8).collect();
            let uv: Vec<u8> = (0..uv_n * 2).map(|i| (i * 5 % 256) as u8).collect();
            assert_eq!(
                nv12_to_i420(&y, &uv, w, h),
                nv12_to_i420_scalar(&y, &uv, w, h),
                "mismatch for {}x{}",
                w,
                h
            );
        }
    }

    #[test]
    fn i420_to_nv12_output_size() {
        let w = 64;
        let h = 48;
        let y = vec![128u8; w * h];
        let u = vec![128u8; (w / 2) * (h / 2)];
        let v = vec![128u8; (w / 2) * (h / 2)];
        let (y_o, uv_o) = i420_to_nv12(&y, &u, &v, w, h);
        assert_eq!(y_o.len(), w * h);
        assert_eq!(uv_o.len(), (w / 2) * (h / 2) * 2);
    }

    #[test]
    fn i420_to_nv12_luma_unchanged() {
        let w = 16;
        let h = 16;
        let y: Vec<u8> = (0..w * h).map(|i| i as u8).collect();
        let u = vec![128u8; (w / 2) * (h / 2)];
        let v = vec![128u8; (w / 2) * (h / 2)];
        let (y_o, _) = i420_to_nv12(&y, &u, &v, w, h);
        assert_eq!(y_o, y);
    }

    #[test]
    fn i420_to_nv12_interleave_known_values() {
        let w = 4;
        let h = 4;
        let y = vec![0u8; w * h];
        let u = [10u8, 30, 50, 70];
        let v = [20u8, 40, 60, 80];
        let (_, uv_o) = i420_to_nv12(&y, &u, &v, w, h);
        assert_eq!(uv_o, [10, 20, 30, 40, 50, 60, 70, 80]);
    }

    #[test]
    fn i420_to_nv12_matches_scalar() {
        for &(w, h) in &SIMD_DIMS {
            let (y, u, v) = make_yuv420(w, h);
            assert_eq!(
                i420_to_nv12(&y, &u, &v, w, h),
                i420_to_nv12_scalar(&y, &u, &v, w, h),
                "mismatch for {}x{}",
                w,
                h
            );
        }
    }

    #[test]
    fn nv12_i420_nv12_roundtrip() {
        let w = 32;
        let h = 32;
        let y: Vec<u8> = (0..w * h).map(|i| (i % 235 + 16) as u8).collect();
        let u: Vec<u8> = (0..(w / 2) * (h / 2))
            .map(|i| (i % 120 + 16) as u8)
            .collect();
        let v: Vec<u8> = (0..(w / 2) * (h / 2))
            .map(|i| (i % 100 + 50) as u8)
            .collect();
        let (y_nv, uv_nv) = i420_to_nv12(&y, &u, &v, w, h);
        let (y_rt, u_rt, v_rt) = nv12_to_i420(&y_nv, &uv_nv, w, h);
        assert_eq!(y_rt, y);
        assert_eq!(u_rt, u);
        assert_eq!(v_rt, v);
    }

    #[test]
    fn nv12_i420_roundtrip_large() {
        let w = 128;
        let h = 128;
        let y: Vec<u8> = (0..w * h).map(|i| (i % 256) as u8).collect();
        let u: Vec<u8> = (0..(w / 2) * (h / 2)).map(|i| (i % 256) as u8).collect();
        let v: Vec<u8> = (0..(w / 2) * (h / 2))
            .map(|i| (255 - i % 256) as u8)
            .collect();
        let (_, uv) = i420_to_nv12(&y, &u, &v, w, h);
        let (_, u_rt, v_rt) = nv12_to_i420(&y, &uv, w, h);
        assert_eq!(u_rt, u);
        assert_eq!(v_rt, v);
    }

    #[test]
    fn yuv420_444_420_roundtrip_constant_chroma() {
        let w = 32;
        let h = 32;
        let y = vec![128u8; w * h];
        let u = vec![77u8; (w / 2) * (h / 2)];
        let v = vec![88u8; (w / 2) * (h / 2)];
        let (y444, u444, v444) = yuv420_to_yuv444(&y, &u, &v, w, h);
        let (y_rt, u_rt, v_rt) = yuv444_to_yuv420(&y444, &u444, &v444, w, h);
        assert_eq!(y_rt, y);
        assert_eq!(u_rt, u);
        assert_eq!(v_rt, v);
    }

    #[test]
    fn yuv420_to_yuv444_large_frame() {
        let w = 256;
        let h = 256;
        let (y, u, v) = make_yuv420(w, h);
        let (y_o, u_o, v_o) = yuv420_to_yuv444(&y, &u, &v, w, h);
        assert_eq!(y_o.len(), w * h);
        assert_eq!(u_o.len(), w * h);
        assert_eq!(v_o.len(), w * h);
    }
}