use std::arch::aarch64::*;
/// Horizontally upsamples one row of `width` bytes from `input` into
/// `2 * width` bytes of `output`, using the NEON kernel when possible.
///
/// Rows with `width < 2` are delegated unchanged to the scalar implementation
/// (`super::scalar::upsample_h2`). For wider rows the first and last output
/// samples are copies of the first and last input samples, and every adjacent
/// input pair `(a, b)` produces the two samples `(3a + b + 2) >> 2` and
/// `(a + 3b + 2) >> 2`.
///
/// # Panics
///
/// For `width >= 2`, panics if `input` holds fewer than `width` bytes or
/// `output` holds fewer than `2 * width` bytes. The NEON kernel reads and
/// writes through raw pointers, so these checks are what keep this safe
/// function from performing out-of-bounds accesses.
pub fn upsample_h2_neon(input: &[u8], output: &mut [u8], width: usize) {
    if width < 2 {
        super::scalar::upsample_h2(input, output, width);
        return;
    }

    // The input check comes first: once it passes, `width` is bounded by a
    // slice length, so `2 * width` below cannot overflow `usize`.
    assert!(
        input.len() >= width,
        "upsample_h2_neon: input has {} bytes but width is {width}",
        input.len()
    );
    assert!(
        output.len() >= 2 * width,
        "upsample_h2_neon: output has {} bytes but 2 * width is {}",
        output.len(),
        2 * width
    );

    // SAFETY: `width >= 2`, `input.len() >= width` and
    // `output.len() >= 2 * width` were verified above, which bounds every
    // unchecked vector load (highest index `width - 1`) and store (highest
    // index `2 * width - 2`) in the kernel. NEON availability is assumed from
    // the aarch64 build target; no runtime feature detection is done here.
    unsafe { upsample_h2_neon_inner(input, output, width) }
}
/// NEON kernel for the 2x horizontal upsample.
///
/// Writes `2 * width` bytes to `output`:
/// * `output[0]` is a copy of `input[0]`;
/// * each adjacent pair `(a, b) = (input[i], input[i + 1])` yields
///   `output[2i + 1] = (3a + b + 2) >> 2` and
///   `output[2i + 2] = (a + 3b + 2) >> 2`;
/// * `output[2 * width - 1]` is a copy of `input[width - 1]`.
///
/// Pairs are processed eight at a time with vector instructions; the
/// remaining pairs (fewer than eight) use the same formula in scalar code.
///
/// # Safety
///
/// The caller must ensure that:
/// * the CPU supports NEON;
/// * `width >= 1`;
/// * `input.len() >= width` and `output.len() >= 2 * width`, because the
///   vector loads and stores go through raw pointers and are not bounds
///   checked.
#[target_feature(enable = "neon")]
unsafe fn upsample_h2_neon_inner(input: &[u8], output: &mut [u8], width: usize) {
    // Left edge: there is no left neighbour to blend with.
    output[0] = input[0];

    let pair_count = width - 1;
    // Number of pairs handled by the vector loop (a multiple of eight).
    let simd_pairs = (pair_count / 8) * 8;
    // Rounding bias added before the divide-by-four shift.
    let rounding = vdupq_n_u16(2);

    for offset in (0..simd_pairs).step_by(8) {
        // Eight samples and their right-hand neighbours, widened to 16 bits
        // so the weighted sums (at most 4 * 255 + 2) cannot overflow.
        // The second load reads up to index `offset + 8 <= width - 1`.
        let near = vmovl_u8(vld1_u8(input.as_ptr().add(offset)));
        let far = vmovl_u8(vld1_u8(input.as_ptr().add(offset + 1)));

        let near_heavy = vaddq_u16(vmulq_n_u16(near, 3), far);
        let far_heavy = vaddq_u16(near, vmulq_n_u16(far, 3));

        let left = vmovn_u16(vshrq_n_u16::<2>(vaddq_u16(near_heavy, rounding)));
        let right = vmovn_u16(vshrq_n_u16::<2>(vaddq_u16(far_heavy, rounding)));

        // Interleaving store: left0, right0, left1, right1, ... (16 bytes),
        // ending at index `2 * offset + 16 <= 2 * width - 2`.
        vst2_u8(
            output.as_mut_ptr().add(2 * offset + 1),
            uint8x8x2_t(left, right),
        );
    }

    // Scalar tail for the pairs the vector loop did not cover.
    for idx in simd_pairs..pair_count {
        let near = u16::from(input[idx]);
        let far = u16::from(input[idx + 1]);
        output[2 * idx + 1] = ((near * 3 + far + 2) >> 2) as u8;
        output[2 * idx + 2] = ((near + far * 3 + 2) >> 2) as u8;
    }

    // Right edge: there is no right neighbour to blend with.
    output[2 * width - 1] = input[width - 1];
}
#[cfg(test)]
mod tests {
    use super::super::scalar::upsample_h2;
    use super::upsample_h2_neon;

    /// Sixteen-byte fixture row mixing ramps, the extremes 0 and 255, and a
    /// falling edge at the end.
    fn make_chroma_row() -> [u8; 16] {
        [
            0, 16, 32, 64, 80, 100, 128, 150, 170, 200, 220, 235, 255, 128, 64, 0,
        ]
    }

    /// Full 16-sample row: NEON output must stay within 1 of the scalar output.
    #[test]
    fn upsample_h2_parity_full() {
        if !std::arch::is_aarch64_feature_detected!("neon") {
            eprintln!("SIMD feature not available, skipping");
            return;
        }

        let width = 16usize;
        let input = make_chroma_row();
        let mut scalar_out = vec![0u8; width * 2];
        let mut simd_out = vec![0u8; width * 2];

        upsample_h2(&input, &mut scalar_out, width);
        upsample_h2_neon(&input, &mut simd_out, width);

        for (i, (&expected, &actual)) in scalar_out.iter().zip(&simd_out).enumerate() {
            let diff = (i32::from(expected) - i32::from(actual)).abs();
            assert!(
                diff <= 1,
                "parity mismatch at output index {i}: scalar={}, simd={}, diff={}",
                expected,
                actual,
                diff
            );
        }
    }

    /// Width 11: one full vector chunk followed by a scalar tail.
    #[test]
    fn upsample_h2_parity_partial_chunk() {
        if !std::arch::is_aarch64_feature_detected!("neon") {
            eprintln!("SIMD feature not available, skipping");
            return;
        }

        let width = 11usize;
        let input = make_chroma_row();
        let mut scalar_out = vec![0u8; width * 2];
        let mut simd_out = vec![0u8; width * 2];

        upsample_h2(&input[..width], &mut scalar_out, width);
        upsample_h2_neon(&input[..width], &mut simd_out, width);

        for (i, (&expected, &actual)) in scalar_out.iter().zip(&simd_out).enumerate() {
            let diff = (i32::from(expected) - i32::from(actual)).abs();
            assert!(
                diff <= 1,
                "partial parity mismatch at output index {i}: scalar={}, simd={}, diff={}",
                expected,
                actual,
                diff
            );
        }
    }

    /// Constant-valued row of 16 samples.
    #[test]
    fn upsample_h2_parity_uniform() {
        if !std::arch::is_aarch64_feature_detected!("neon") {
            eprintln!("SIMD feature not available, skipping");
            return;
        }

        let width = 16usize;
        let input = [128u8; 16];
        let mut scalar_out = vec![0u8; width * 2];
        let mut simd_out = vec![0u8; width * 2];

        upsample_h2(&input, &mut scalar_out, width);
        upsample_h2_neon(&input, &mut simd_out, width);

        for (i, (&expected, &actual)) in scalar_out.iter().zip(&simd_out).enumerate() {
            let diff = (i32::from(expected) - i32::from(actual)).abs();
            assert!(
                diff <= 1,
                "uniform parity mismatch at output index {i}: scalar={}, simd={}, diff={}",
                expected,
                actual,
                diff
            );
        }
    }
}