#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
use archmage::{Arm64, arcane, rite};
#[cfg(target_arch = "aarch64")]
use safe_unaligned_simd::aarch64 as safe_simd;
use super::itx_arm_neon_common::{
IADST4_COEFFS, IDCT_COEFFS, IDENTITY_SCALE, add_to_dst_4x4_8bpc, transpose_4x4h,
};
/// One-dimensional 4-point inverse DCT over four `int16x4_t` inputs.
///
/// Every operation is lane-wise, so the four lanes are transformed
/// independently. Products are accumulated in 32 bits, narrowed back to 16 bits
/// with a saturating rounding right shift by 12, and the final butterfly uses
/// saturating 16-bit add/subtract.
///
/// Returns `(out0, out1, out2, out3)`.
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
pub(crate) fn idct_4h(
    r0: int16x4_t,
    r1: int16x4_t,
    r2: int16x4_t,
    r3: int16x4_t,
) -> (int16x4_t, int16x4_t, int16x4_t, int16x4_t) {
    // Only lanes 0, 2 and 3 of the first four IDCT_COEFFS entries are used below.
    let c = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&IDCT_COEFFS[0..4]).unwrap());

    // Odd inputs: t3a = r1*c[3] + r3*c[2], t2a = r1*c[2] - r3*c[3].
    let odd_sum = vmlal_lane_s16::<2>(vmull_lane_s16::<3>(r1, c), r3, c);
    let odd_diff = vmlsl_lane_s16::<3>(vmull_lane_s16::<2>(r1, c), r3, c);
    let t3a = vqrshrn_n_s32::<12>(odd_sum);
    let t2a = vqrshrn_n_s32::<12>(odd_diff);

    // Even inputs: t0 = r0*c[0] + r2*c[0], t1 = r0*c[0] - r2*c[0].
    let even_sum = vmlal_lane_s16::<0>(vmull_lane_s16::<0>(r0, c), r2, c);
    let even_diff = vmlsl_lane_s16::<0>(vmull_lane_s16::<0>(r0, c), r2, c);
    let t0 = vqrshrn_n_s32::<12>(even_sum);
    let t1 = vqrshrn_n_s32::<12>(even_diff);

    // Final butterfly, saturating at the i16 range.
    (
        vqadd_s16(t0, t3a),
        vqadd_s16(t1, t2a),
        vqsub_s16(t1, t2a),
        vqsub_s16(t0, t3a),
    )
}
/// One-dimensional 4-point inverse ADST over four `int16x4_t` inputs.
///
/// All intermediate sums are kept in 32-bit lanes and only narrowed at the very
/// end, with a saturating rounding right shift by 12.
///
/// Returns `(o0, o1, o2, o3)`.
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
pub(crate) fn iadst_4h(
    in0: int16x4_t,
    in1: int16x4_t,
    in2: int16x4_t,
    in3: int16x4_t,
) -> (int16x4_t, int16x4_t, int16x4_t, int16x4_t) {
    let k = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&IADST4_COEFFS[0..4]).unwrap());

    // mix = in0 - in2 + in3, widened to 32 bits.
    let mix = vaddw_s16(vsubl_s16(in0, in2), in3);

    // acc_a = in0*k[0] + in2*k[1] + in3*k[2]
    let acc_a = vmull_lane_s16::<0>(in0, k);
    let acc_a = vmlal_lane_s16::<1>(acc_a, in2, k);
    let acc_a = vmlal_lane_s16::<2>(acc_a, in3, k);

    // acc_b = in0*k[2] - in2*k[0] - in3*k[1]
    let acc_b = vmull_lane_s16::<2>(in0, k);
    let acc_b = vmlsl_lane_s16::<0>(acc_b, in2, k);
    let acc_b = vmlsl_lane_s16::<1>(acc_b, in3, k);

    // mid = in1*k[3]
    let mid = vmull_lane_s16::<3>(in1, k);

    let wide0 = vaddq_s32(acc_a, mid);
    let wide1 = vaddq_s32(acc_b, mid);
    // NOTE(review): 3344 presumably mirrors a coefficient stored in
    // IADST4_COEFFS; confirm against the table definition.
    let wide2 = vmulq_s32(mix, vdupq_n_s32(3344));
    let wide3 = vsubq_s32(vaddq_s32(acc_a, acc_b), mid);

    (
        vqrshrn_n_s32::<12>(wide0),
        vqrshrn_n_s32::<12>(wide1),
        vqrshrn_n_s32::<12>(wide2),
        vqrshrn_n_s32::<12>(wide3),
    )
}
/// 4-point identity transform over four `int16x4_t` inputs.
///
/// Each lane `x` becomes `sat(x + sqrdmulh(x, IDENTITY_SCALE))`: the saturating
/// rounding doubling multiply-high yields the extra scaled term, which is then
/// added back onto the input with a saturating add.
///
/// Returns the four rows in the same order they were passed in.
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
pub(crate) fn identity_4h(
    v16: int16x4_t,
    v17: int16x4_t,
    v18: int16x4_t,
    v19: int16x4_t,
) -> (int16x4_t, int16x4_t, int16x4_t, int16x4_t) {
    let factor = vdup_n_s16(IDENTITY_SCALE);

    let row0 = vqadd_s16(v16, vqrdmulh_s16(v16, factor));
    let row1 = vqadd_s16(v17, vqrdmulh_s16(v17, factor));
    let row2 = vqadd_s16(v18, vqrdmulh_s16(v18, factor));
    let row3 = vqadd_s16(v19, vqrdmulh_s16(v19, factor));

    (row0, row1, row2, row3)
}
/// Selects which one-dimensional 4-point transform `apply_tx4` runs for a pass.
#[cfg(target_arch = "aarch64")]
#[derive(Clone, Copy)]
pub(crate) enum TxType4 {
/// Dispatched to `idct_4h`.
Dct,
/// Dispatched to `iadst_4h`.
Adst,
/// Dispatched to `iadst_4h`, with the four outputs returned in reverse order.
FlipAdst,
/// Dispatched to `identity_4h`.
Identity,
}
/// Runs the one-dimensional 4-point transform selected by `tx` on four vectors.
///
/// `FlipAdst` shares the `iadst_4h` kernel with `Adst`; the only difference is
/// that its four outputs are handed back in reverse order.
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn apply_tx4(
    tx: TxType4,
    v0: int16x4_t,
    v1: int16x4_t,
    v2: int16x4_t,
    v3: int16x4_t,
) -> (int16x4_t, int16x4_t, int16x4_t, int16x4_t) {
    match tx {
        TxType4::Identity => identity_4h(v0, v1, v2, v3),
        TxType4::Dct => idct_4h(v0, v1, v2, v3),
        TxType4::Adst | TxType4::FlipAdst => {
            let (a, b, c, d) = iadst_4h(v0, v1, v2, v3);
            if matches!(tx, TxType4::FlipAdst) {
                (d, c, b, a)
            } else {
                (a, b, c, d)
            }
        }
    }
}
/// Shared 4x4 8bpc inverse-transform-and-add driver behind the per-type entry points.
///
/// Steps, in order:
/// 1. If both transforms are `Dct` and `eob == 0`, defer to `dc_only_4x4_8bpc`
///    and return.
/// 2. Load `coeff[0..16]` as four 4-lane vectors and run `row_tx` on them.
/// 3. Transpose the 4x4 block and run `col_tx`.
/// 4. Pass the result to `add_to_dst_4x4_8bpc` along with `dst`, `dst_base`
///    and `dst_stride`.
/// 5. Zero `coeff[0..16]`.
///
/// `_token` and `_bitdepth_max` are not referenced by the body (`_token` is
/// presumably consumed by the `#[arcane]` attribute; confirm).
///
/// # Panics
///
/// Panics if `coeff` holds fewer than 16 elements (on the DC-only path: if it
/// is empty). The `dst` helpers may impose their own bounds requirements.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_4x4_8bpc_neon(
    _token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    _bitdepth_max: i32,
    row_tx: TxType4,
    col_tx: TxType4,
) {
    // DCT/DCT with eob == 0 takes the dedicated DC-only path.
    let both_dct = matches!((row_tx, col_tx), (TxType4::Dct, TxType4::Dct));
    if both_dct && eob == 0 {
        dc_only_4x4_8bpc(dst, dst_base, dst_stride, coeff);
        return;
    }

    // Four consecutive groups of four coefficients, one vector each.
    let in0 = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&coeff[..4]).unwrap());
    let in1 = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&coeff[4..8]).unwrap());
    let in2 = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&coeff[8..12]).unwrap());
    let in3 = safe_simd::vld1_s16(<&[i16; 4]>::try_from(&coeff[12..16]).unwrap());

    // First pass, transpose, second pass.
    let (a0, a1, a2, a3) = apply_tx4(row_tx, in0, in1, in2, in3);
    let (t0, t1, t2, t3) = transpose_4x4h(a0, a1, a2, a3);
    let (b0, b1, b2, b3) = apply_tx4(col_tx, t0, t1, t2, t3);

    add_to_dst_4x4_8bpc(dst, dst_base, dst_stride, b0, b1, b2, b3, true);

    // The coefficient block is cleared once it has been consumed.
    coeff[..16].fill(0);
}
/// DC-only shortcut: derives a single value from `coeff[0]` and adds it to the block.
///
/// `coeff[0]` is read and reset to zero. The value is broadcast, passed twice
/// through a saturating rounding doubling multiply-high by `2896 * 8`, and then
/// rounding-shifted right by 4 before being handed to `add_to_dst_4x4_8bpc`.
///
/// # Panics
///
/// Panics if `coeff` is empty.
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_4x4_8bpc(dst: &mut [u8], dst_base: usize, dst_stride: isize, coeff: &mut [i16]) {
    // Fetch the DC coefficient and clear it in one step.
    let dc = core::mem::take(&mut coeff[0]);

    let k = vdupq_n_s16((2896 * 8) as i16);
    let first_pass = vqrdmulhq_s16(vdupq_n_s16(dc), k);
    let second_pass = vqrdmulhq_s16(first_pass, k);
    let residual = vrshrq_n_s16::<4>(second_pass);

    // Every lane holds the same value, so both halves are interchangeable rows.
    let (lo, hi) = (vget_low_s16(residual), vget_high_s16(residual));

    // NOTE(review): the trailing `false` presumably tells the helper that the
    // final shift has already been applied here; confirm against the helper.
    add_to_dst_4x4_8bpc(dst, dst_base, dst_stride, lo, hi, lo, hi, false);
}
/// DCT/DCT entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Dct`
/// as `row_tx` and `TxType4::Dct` as `col_tx`. With this pair the callee takes
/// its DC-only shortcut when `eob == 0`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Dct, TxType4::Dct);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// ADST/ADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Adst`
/// as `row_tx` and `TxType4::Adst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_adst_adst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Adst, TxType4::Adst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// FlipADST/FlipADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::FlipAdst` as `row_tx` and `TxType4::FlipAdst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::FlipAdst, TxType4::FlipAdst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// Identity/Identity entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::Identity` as `row_tx` and `TxType4::Identity` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Identity, TxType4::Identity);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// DCT/ADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Dct`
/// as `row_tx` and `TxType4::Adst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_adst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Dct, TxType4::Adst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// ADST/DCT entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Adst`
/// as `row_tx` and `TxType4::Dct` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_adst_dct_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Adst, TxType4::Dct);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// DCT/FlipADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Dct`
/// as `row_tx` and `TxType4::FlipAdst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Dct, TxType4::FlipAdst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// FlipADST/DCT entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::FlipAdst` as `row_tx` and `TxType4::Dct` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::FlipAdst, TxType4::Dct);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// ADST/FlipADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Adst`
/// as `row_tx` and `TxType4::FlipAdst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Adst, TxType4::FlipAdst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// FlipADST/ADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::FlipAdst` as `row_tx` and `TxType4::Adst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::FlipAdst, TxType4::Adst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// DCT/Identity entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Dct`
/// as `row_tx` and `TxType4::Identity` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_identity_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Dct, TxType4::Identity);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// Identity/DCT entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::Identity` as `row_tx` and `TxType4::Dct` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_dct_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Identity, TxType4::Dct);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// ADST/Identity entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with `TxType4::Adst`
/// as `row_tx` and `TxType4::Identity` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_adst_identity_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Adst, TxType4::Identity);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// Identity/ADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::Identity` as `row_tx` and `TxType4::Adst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_adst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Identity, TxType4::Adst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// FlipADST/Identity entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::FlipAdst` as `row_tx` and `TxType4::Identity` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::FlipAdst, TxType4::Identity);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
/// Identity/FlipADST entry point for the 4x4 8bpc inverse transform-and-add.
///
/// Forwards every argument to `inv_txfm_add_4x4_8bpc_neon` with
/// `TxType4::Identity` as `row_tx` and `TxType4::FlipAdst` as `col_tx`.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner(
    token: Arm64,
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    bitdepth_max: i32,
) {
    let (row_tx, col_tx) = (TxType4::Identity, TxType4::FlipAdst);
    inv_txfm_add_4x4_8bpc_neon(
        token, dst, dst_base, dst_stride, coeff, eob, bitdepth_max, row_tx, col_tx,
    );
}
#[cfg(test)]
#[cfg(target_arch = "aarch64")]
mod tests {
    use super::*;
    use archmage::SimdToken;

    /// Runs `scalar_fn` and `neon_fn` on identical inputs and compares the results.
    ///
    /// For each coefficient pattern both functions are called with `eob = 1` and
    /// `bitdepth_max = 255` on a 64-byte destination with a stride of 16. The
    /// top-left 4x4 pixels must agree within `MAX_DIFF`, and the NEON function
    /// must leave its coefficient buffer all-zero. The check is skipped when no
    /// `Arm64` token can be summoned.
    fn run_neon_vs_scalar_test(
        neon_fn: fn(Arm64, &mut [u8], usize, isize, &mut [i16], i32, i32),
        scalar_fn: fn(&mut [u8], usize, isize, &mut [i16], i32, i32),
        test_name: &str,
    ) {
        // NOTE(review): 15 looks like a loose tolerance for an 8-bit result;
        // confirm it is intentional.
        const MAX_DIFF: i32 = 15;
        const STRIDE: usize = 16;
        const PATTERNS: [[i16; 16]; 4] = [
            [64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [50, -25, 15, -10, 20, -15, 10, -5, 8, -4, 2, -1, 4, -2, 1, 0],
            [32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ];

        let Some(token) = Arm64::summon() else {
            eprintln!("Skipping NEON test: Arm64 not available");
            return;
        };

        // Deterministic, non-uniform starting pixels shared by both runs.
        let background: [u8; 64] = core::array::from_fn(|j| ((j * 7 + 128) % 256) as u8);

        for (i, pattern) in PATTERNS.iter().enumerate() {
            let (mut coeff_scalar, mut coeff_neon) = (*pattern, *pattern);
            let (mut dst_scalar, mut dst_neon) = (background, background);

            scalar_fn(&mut dst_scalar, 0, STRIDE as isize, &mut coeff_scalar, 1, 255);
            neon_fn(token, &mut dst_neon, 0, STRIDE as isize, &mut coeff_neon, 1, 255);

            // Walk the first four pixels of each of the four rows in lock-step.
            let rows = dst_neon.chunks(STRIDE).zip(dst_scalar.chunks(STRIDE)).take(4);
            for (row, (neon_row, scalar_row)) in rows.enumerate() {
                for (col, (&n, &s)) in neon_row.iter().zip(scalar_row).take(4).enumerate() {
                    let diff = (i32::from(n) - i32::from(s)).abs();
                    assert!(
                        diff <= MAX_DIFF,
                        "{test_name}: Pixel ({col},{row}) diff {diff} exceeds \
                         tolerance {MAX_DIFF} in pattern {i}: neon={n}, scalar={s}",
                    );
                }
            }

            assert!(
                coeff_neon == [0i16; 16],
                "{test_name}: NEON coefficients not zeroed in test pattern {i}"
            );
        }
    }

    /// Expands to one `#[test]` per line, each comparing a NEON entry point with
    /// the same-named scalar function from `super::super::itx_arm`.
    macro_rules! neon_matches_scalar {
        ($($test_fn:ident: $neon:ident vs $scalar:ident, $label:literal;)+) => {
            $(
                #[test]
                fn $test_fn() {
                    run_neon_vs_scalar_test($neon, super::super::itx_arm::$scalar, $label);
                }
            )+
        };
    }

    neon_matches_scalar! {
        test_dct_dct_4x4_neon_matches_scalar:
            inv_txfm_add_dct_dct_4x4_8bpc_neon_inner
            vs inv_txfm_add_dct_dct_4x4_8bpc_inner,
            "DCT_DCT";
        test_adst_adst_4x4_neon_matches_scalar:
            inv_txfm_add_adst_adst_4x4_8bpc_neon_inner
            vs inv_txfm_add_adst_adst_4x4_8bpc_inner,
            "ADST_ADST";
        test_flipadst_flipadst_4x4_neon_matches_scalar:
            inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner
            vs inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner,
            "FLIPADST_FLIPADST";
        test_identity_identity_4x4_neon_matches_scalar:
            inv_txfm_add_identity_identity_4x4_8bpc_neon_inner
            vs inv_txfm_add_identity_identity_4x4_8bpc_inner,
            "IDENTITY_IDENTITY";
        test_dct_adst_4x4_neon_matches_scalar:
            inv_txfm_add_dct_adst_4x4_8bpc_neon_inner
            vs inv_txfm_add_dct_adst_4x4_8bpc_inner,
            "DCT_ADST";
        test_adst_dct_4x4_neon_matches_scalar:
            inv_txfm_add_adst_dct_4x4_8bpc_neon_inner
            vs inv_txfm_add_adst_dct_4x4_8bpc_inner,
            "ADST_DCT";
        test_dct_flipadst_4x4_neon_matches_scalar:
            inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner
            vs inv_txfm_add_dct_flipadst_4x4_8bpc_inner,
            "DCT_FLIPADST";
        test_flipadst_dct_4x4_neon_matches_scalar:
            inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner
            vs inv_txfm_add_flipadst_dct_4x4_8bpc_inner,
            "FLIPADST_DCT";
        test_adst_flipadst_4x4_neon_matches_scalar:
            inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner
            vs inv_txfm_add_adst_flipadst_4x4_8bpc_inner,
            "ADST_FLIPADST";
        test_flipadst_adst_4x4_neon_matches_scalar:
            inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner
            vs inv_txfm_add_flipadst_adst_4x4_8bpc_inner,
            "FLIPADST_ADST";
        test_dct_identity_4x4_neon_matches_scalar:
            inv_txfm_add_dct_identity_4x4_8bpc_neon_inner
            vs inv_txfm_add_dct_identity_4x4_8bpc_inner,
            "DCT_IDENTITY";
        test_identity_dct_4x4_neon_matches_scalar:
            inv_txfm_add_identity_dct_4x4_8bpc_neon_inner
            vs inv_txfm_add_identity_dct_4x4_8bpc_inner,
            "IDENTITY_DCT";
        test_adst_identity_4x4_neon_matches_scalar:
            inv_txfm_add_adst_identity_4x4_8bpc_neon_inner
            vs inv_txfm_add_adst_identity_4x4_8bpc_inner,
            "ADST_IDENTITY";
        test_identity_adst_4x4_neon_matches_scalar:
            inv_txfm_add_identity_adst_4x4_8bpc_neon_inner
            vs inv_txfm_add_identity_adst_4x4_8bpc_inner,
            "IDENTITY_ADST";
        test_flipadst_identity_4x4_neon_matches_scalar:
            inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner
            vs inv_txfm_add_flipadst_identity_4x4_8bpc_inner,
            "FLIPADST_IDENTITY";
        test_identity_flipadst_4x4_neon_matches_scalar:
            inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner
            vs inv_txfm_add_identity_flipadst_4x4_8bpc_inner,
            "IDENTITY_FLIPADST";
    }

    /// Exercises the DCT/DCT entry point with `eob = 0` for a range of DC values.
    ///
    /// Starting from a uniform destination of 128, the 4x4 block must stay
    /// uniform, move by less than 50 from 128, and the coefficients must end up
    /// all-zero.
    #[test]
    fn test_dct_dct_dc_only_4x4() {
        const STRIDE: usize = 16;
        const DC_VALUES: [i16; 9] = [0, 1, -1, 100, -100, 500, -500, 1000, -1000];

        let Some(token) = Arm64::summon() else {
            eprintln!("Skipping NEON test: Arm64 not available");
            return;
        };

        for dc in DC_VALUES {
            let mut coeff_neon = [0i16; 16];
            coeff_neon[0] = dc;
            let mut dst_neon = [128u8; 64];

            inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
                token,
                &mut dst_neon,
                0,
                STRIDE as isize,
                &mut coeff_neon,
                0,
                255,
            );

            // The top-left pixel is the reference every other pixel must match.
            let expected_val = dst_neon[0];
            for (row, line) in dst_neon.chunks(STRIDE).take(4).enumerate() {
                for (col, &px) in line.iter().take(4).enumerate() {
                    assert_eq!(
                        px,
                        expected_val,
                        "DC-only should produce uniform output for dc={dc}, \
                         pixel ({col},{row}): got {}, expected {expected_val}",
                        px,
                    );
                }
            }

            assert!(
                (i32::from(expected_val) - 128).abs() < 50,
                "DC value {expected_val} out of expected range for dc={dc}",
            );
            assert!(
                coeff_neon == [0i16; 16],
                "NEON coefficients not zeroed for dc={dc}",
            );
        }
    }
}