#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
use archmage::{Arm64, arcane, rite};
#[cfg(target_arch = "aarch64")]
use safe_unaligned_simd::aarch64 as safe_simd;
use super::itx_arm_neon_8x8::idct_8_q;
use super::itx_arm_neon_8x8::transpose_8x8h;
use super::itx_arm_neon_16x16::idct_16_q;
use super::itx_arm_neon_32::{
V16, horz_dct_32x8, idct32_odd_q, load_v16, rev128_s16, store_v16, vert_dct_add_8x32_8bpc,
vert_dct_add_8x32_16bpc,
};
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_large_rect_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
w: usize,
h: usize,
shift: i32,
) {
let dc = coeff[0];
coeff[0] = 0;
let scale = vdupq_n_s16((2896 * 8) as i16);
let v = vdupq_n_s16(dc);
let v = vqrdmulhq_s16(v, scale);
let v = vqrdmulhq_s16(v, scale);
let v = match shift {
1 => vrshrq_n_s16::<1>(v),
2 => vrshrq_n_s16::<2>(v),
_ => v,
};
let v = vqrdmulhq_s16(v, scale);
let v = vrshrq_n_s16::<4>(v);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..(w / 8) {
let off = row_off + half * 8;
let dst_bytes: [u8; 8] = dst[off..off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(v), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[off..off + 8].copy_from_slice(&out);
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_large_rect_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
w: usize,
h: usize,
shift: i32,
bitdepth_max: i32,
) {
let dc_val = coeff[0];
coeff[0] = 0;
let scale = 2896i32 * 8;
let mut dc = ((dc_val as i64 * scale as i64 + 16384) >> 15) as i32;
dc = ((dc as i64 * scale as i64 + 16384) >> 15) as i32;
dc = match shift {
1 => (dc + 1) >> 1,
2 => (dc + 2) >> 2,
_ => dc,
};
dc = ((dc as i64 * scale as i64 + 16384) >> 15) as i32;
dc = (dc + 8) >> 4;
let dc = dc.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
let dc_vec = vdupq_n_s16(dc);
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..(w / 8) {
let off = row_off + half * 8;
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, dc_vec);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out_arr = [0i16; 8];
safe_simd::vst1q_s16(&mut out_arr, clamped);
for j in 0..8 {
dst[off + j] = out_arr[j] as u16;
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn scale_input_q_8(v: &mut [int16x8_t; 8]) {
let scale = vdupq_n_s16((2896 * 8) as i16);
for vi in v.iter_mut() {
*vi = vqrdmulhq_s16(*vi, scale);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn scale_input_q_16(v: &mut [int16x8_t; 16]) {
let scale = vdupq_n_s16((2896 * 8) as i16);
for vi in v.iter_mut() {
*vi = vqrdmulhq_s16(*vi, scale);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn add_to_dst_8x8_8bpc(dst: &mut [u8], dst_base: usize, stride: isize, v: [int16x8_t; 8]) {
for (i, &row) in v.iter().enumerate() {
let row_off = dst_base.wrapping_add_signed(i as isize * stride);
let shifted = vrshrq_n_s16::<4>(row);
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn add_to_dst_8x16_8bpc(dst: &mut [u8], dst_base: usize, stride: isize, v: [int16x8_t; 16]) {
for (i, &row) in v.iter().enumerate() {
let row_off = dst_base.wrapping_add_signed(i as isize * stride);
let shifted = vrshrq_n_s16::<4>(row);
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn horz_dct8_for_8x32(
coeff: &mut [i16],
coeff_base: usize,
coeff_stride: usize,
scratch: &mut [i16],
scratch_base: usize,
scratch_stride: usize,
) {
let zero_vec = vdupq_n_s16(0);
let mut v = [zero_vec; 8];
for c in 0..8 {
let base = coeff_base + c * coeff_stride;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
scale_input_q_8(&mut v);
let (r0, r1, r2, r3, r4, r5, r6, r7) = idct_8_q(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let r0 = vrshrq_n_s16::<1>(r0);
let r1 = vrshrq_n_s16::<1>(r1);
let r2 = vrshrq_n_s16::<1>(r2);
let r3 = vrshrq_n_s16::<1>(r3);
let r4 = vrshrq_n_s16::<1>(r4);
let r5 = vrshrq_n_s16::<1>(r5);
let r6 = vrshrq_n_s16::<1>(r6);
let r7 = vrshrq_n_s16::<1>(r7);
let (t0, t1, t2, t3, t4, t5, t6, t7) = transpose_8x8h(r0, r1, r2, r3, r4, r5, r6, r7);
let rows = [t0, t1, t2, t3, t4, t5, t6, t7];
for (i, &row) in rows.iter().enumerate() {
store_v16(scratch, scratch_base + i * scratch_stride, row);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn horz_dct16_for_16x32(
coeff: &mut [i16],
coeff_base: usize,
coeff_stride: usize,
scratch: &mut [i16],
scratch_base: usize,
scratch_stride: usize,
) {
let zero_vec = vdupq_n_s16(0);
let mut v: V16 = [zero_vec; 16];
for c in 0..16 {
let base = coeff_base + c * coeff_stride;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
scale_input_q_16(&mut v);
let result = idct_16_q(v);
let mut shifted = [zero_vec; 16];
for i in 0..16 {
shifted[i] = vrshrq_n_s16::<1>(result[i]);
}
let (t0, t1, t2, t3, t4, t5, t6, t7) = transpose_8x8h(
shifted[0], shifted[1], shifted[2], shifted[3], shifted[4], shifted[5], shifted[6],
shifted[7],
);
let first = [t0, t1, t2, t3, t4, t5, t6, t7];
for (i, &row) in first.iter().enumerate() {
store_v16(scratch, scratch_base + i * scratch_stride, row);
}
let (t0, t1, t2, t3, t4, t5, t6, t7) = transpose_8x8h(
shifted[8],
shifted[9],
shifted[10],
shifted[11],
shifted[12],
shifted[13],
shifted[14],
shifted[15],
);
let second = [t0, t1, t2, t3, t4, t5, t6, t7];
for (i, &row) in second.iter().enumerate() {
store_v16(scratch, scratch_base + i * scratch_stride + 8, row);
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn vert_dct16_add_8x16_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
scratch: &[i16],
scratch_base: usize,
scratch_stride: usize,
) {
let zero_vec = vdupq_n_s16(0);
let mut v: V16 = [zero_vec; 16];
for i in 0..16 {
v[i] = load_v16(scratch, scratch_base + i * scratch_stride);
}
let result = idct_16_q(v);
add_to_dst_8x16_8bpc(dst, dst_base, dst_stride, result);
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn vert_dct16_add_8x16_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
scratch: &[i16],
scratch_base: usize,
scratch_stride: usize,
bitdepth_max: i32,
) {
let zero_vec = vdupq_n_s16(0);
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
let mut v: V16 = [zero_vec; 16];
for i in 0..16 {
v[i] = load_v16(scratch, scratch_base + i * scratch_stride);
}
let result = idct_16_q(v);
for i in 0..16 {
let shifted = vrshrq_n_s16::<4>(result[i]);
let row_off = dst_base.wrapping_add_signed(i as isize * dst_stride);
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[row_off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, shifted);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out = [0i16; 8];
safe_simd::vst1q_s16(&mut out, clamped);
for j in 0..8 {
dst[row_off + j] = out[j] as u16;
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn vert_dct8_add_8x8_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
scratch: &[i16],
scratch_base: usize,
scratch_stride: usize,
) {
let v0 = load_v16(scratch, scratch_base);
let v1 = load_v16(scratch, scratch_base + scratch_stride);
let v2 = load_v16(scratch, scratch_base + 2 * scratch_stride);
let v3 = load_v16(scratch, scratch_base + 3 * scratch_stride);
let v4 = load_v16(scratch, scratch_base + 4 * scratch_stride);
let v5 = load_v16(scratch, scratch_base + 5 * scratch_stride);
let v6 = load_v16(scratch, scratch_base + 6 * scratch_stride);
let v7 = load_v16(scratch, scratch_base + 7 * scratch_stride);
let (r0, r1, r2, r3, r4, r5, r6, r7) = idct_8_q(v0, v1, v2, v3, v4, v5, v6, v7);
add_to_dst_8x8_8bpc(dst, dst_base, dst_stride, [r0, r1, r2, r3, r4, r5, r6, r7]);
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn vert_dct8_add_8x8_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
scratch: &[i16],
scratch_base: usize,
scratch_stride: usize,
bitdepth_max: i32,
) {
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
let v0 = load_v16(scratch, scratch_base);
let v1 = load_v16(scratch, scratch_base + scratch_stride);
let v2 = load_v16(scratch, scratch_base + 2 * scratch_stride);
let v3 = load_v16(scratch, scratch_base + 3 * scratch_stride);
let v4 = load_v16(scratch, scratch_base + 4 * scratch_stride);
let v5 = load_v16(scratch, scratch_base + 5 * scratch_stride);
let v6 = load_v16(scratch, scratch_base + 6 * scratch_stride);
let v7 = load_v16(scratch, scratch_base + 7 * scratch_stride);
let (r0, r1, r2, r3, r4, r5, r6, r7) = idct_8_q(v0, v1, v2, v3, v4, v5, v6, v7);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for (i, &row) in rows.iter().enumerate() {
let shifted = vrshrq_n_s16::<4>(row);
let row_off = dst_base.wrapping_add_signed(i as isize * dst_stride);
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[row_off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, shifted);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out = [0i16; 8];
safe_simd::vst1q_s16(&mut out, clamped);
for j in 0..8 {
dst[row_off + j] = out[j] as u16;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_8x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_8bpc(dst, dst_base, dst_stride, coeff, 8, 32, 1);
return;
}
let eob_thresholds: [i32; 4] = [36, 136, 300, 1024];
let mut scratch = [0i16; 256];
for group in 0..4 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
horz_dct8_for_8x32(
coeff,
row_start,
32, &mut scratch,
row_start * 8, 8, );
}
vert_dct_add_8x32_8bpc(
dst, dst_base, dst_stride, &scratch, 0, 8, );
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_8x32_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_16bpc(dst, dst_base, dst_stride, coeff, 8, 32, 1, bitdepth_max);
return;
}
let mut scratch = [0i16; 256];
let eob_thresholds: [i32; 4] = [36, 136, 300, 1024];
for group in 0..4 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
for r in 0..8 {
let y = row_start + r;
let mut input = [0i32; 8];
for x in 0..8 {
input[x] = coeff[y + x * 32];
coeff[y + x * 32] = 0;
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct8_1d(&input);
for x in 0..8 {
let v = (out[x] + 1) >> 1;
scratch[(y * 8 + x) as usize] = v.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
}
}
vert_dct_add_8x32_16bpc(dst, dst_base, dst_stride, &scratch, 0, 8, bitdepth_max);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x8_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_8bpc(dst, dst_base, dst_stride, coeff, 32, 8, 1);
return;
}
let mut scratch = [0i16; 256];
horz_dct_32x8(
coeff,
0, 8, &mut scratch,
0, 1, );
scratch.fill(0);
horz_32x8_rect(coeff, 0, 8, &mut scratch, 0);
for group in 0..4 {
let col_start = group * 8;
vert_dct8_add_8x8_8bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
32, );
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn horz_32x8_rect(
coeff: &mut [i16],
coeff_base: usize,
coeff_stride: usize,
scratch: &mut [i16],
scratch_base: usize,
) {
let zero_vec = vdupq_n_s16(0);
let mut even_in: V16 = [zero_vec; 16];
for c in 0..16 {
let col = c * 2;
let base = coeff_base + col * coeff_stride;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
even_in[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let even_out = idct_16_q(even_in);
let (et0, et1, et2, et3, et4, et5, et6, et7) = transpose_8x8h(
even_out[0],
even_out[1],
even_out[2],
even_out[3],
even_out[4],
even_out[5],
even_out[6],
even_out[7],
);
let (et8, et9, et10, et11, et12, et13, et14, et15) = transpose_8x8h(
even_out[8],
even_out[9],
even_out[10],
even_out[11],
even_out[12],
even_out[13],
even_out[14],
even_out[15],
);
let even_t = [
[et0, et8],
[et1, et9],
[et2, et10],
[et3, et11],
[et4, et12],
[et5, et13],
[et6, et14],
[et7, et15],
];
let mut odd_in: V16 = [zero_vec; 16];
for c in 0..16 {
let col = c * 2 + 1;
let base = coeff_base + col * coeff_stride;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
odd_in[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let odd_out = idct32_odd_q(odd_in);
let (ot15, ot14, ot13, ot12, ot11, ot10, ot9, ot8) = transpose_8x8h(
odd_out[15],
odd_out[14],
odd_out[13],
odd_out[12],
odd_out[11],
odd_out[10],
odd_out[9],
odd_out[8],
);
let (ot7, ot6, ot5, ot4, ot3, ot2, ot1, ot0) = transpose_8x8h(
odd_out[7], odd_out[6], odd_out[5], odd_out[4], odd_out[3], odd_out[2], odd_out[1],
odd_out[0],
);
let odd_t_hi = [ot15, ot14, ot13, ot12, ot11, ot10, ot9, ot8];
let odd_t_lo = [ot7, ot6, ot5, ot4, ot3, ot2, ot1, ot0];
for row in 0..8 {
let e_lo = even_t[row][0];
let e_hi = even_t[row][1];
let o_hi = odd_t_hi[row];
let o_lo = odd_t_lo[row];
let first_lo = vqaddq_s16(e_lo, o_hi);
let first_hi = vqaddq_s16(e_hi, o_lo);
let sub_lo = vqsubq_s16(e_lo, o_hi);
let sub_hi = vqsubq_s16(e_hi, o_lo);
let rev_lo = rev128_s16(sub_hi);
let rev_hi = rev128_s16(sub_lo);
let r0 = vrshrq_n_s16::<1>(first_lo);
let r1 = vrshrq_n_s16::<1>(first_hi);
let r2 = vrshrq_n_s16::<1>(rev_lo);
let r3 = vrshrq_n_s16::<1>(rev_hi);
let soff = scratch_base + row * 32;
store_v16(scratch, soff, r0);
store_v16(scratch, soff + 8, r1);
store_v16(scratch, soff + 16, r2);
store_v16(scratch, soff + 24, r3);
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x8_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_16bpc(dst, dst_base, dst_stride, coeff, 32, 8, 1, bitdepth_max);
return;
}
let mut scratch = [0i16; 256];
for y in 0..8 {
let mut input = [0i32; 32];
for x in 0..32 {
input[x] = coeff[y + x * 8];
coeff[y + x * 8] = 0;
}
let out = scalar_dct32_1d(&input);
for x in 0..32 {
let v = (out[x] + 1) >> 1;
scratch[y * 32 + x] = v.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
}
for group in 0..4 {
let col_start = group * 8;
vert_dct8_add_8x8_16bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
32,
bitdepth_max,
);
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_16x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_8bpc(dst, dst_base, dst_stride, coeff, 16, 32, 1);
return;
}
let eob_thresholds: [i32; 4] = [36, 136, 300, 1024];
let mut scratch = [0i16; 512];
for group in 0..4 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
horz_dct16_for_16x32(
coeff,
row_start,
32, &mut scratch,
row_start * 16, 16, );
}
for group in 0..2 {
let col_start = group * 8;
vert_dct_add_8x32_8bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
16, );
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_16x32_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_16bpc(dst, dst_base, dst_stride, coeff, 16, 32, 1, bitdepth_max);
return;
}
let eob_thresholds: [i32; 4] = [36, 136, 300, 1024];
let mut scratch = [0i16; 512];
for group in 0..4 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
for r in 0..8 {
let y = row_start + r;
let mut input = [0i32; 16];
for x in 0..16 {
input[x] = coeff[y + x * 32];
coeff[y + x * 32] = 0;
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct16_1d(&input);
for x in 0..16 {
let v = (out[x] + 1) >> 1;
scratch[y * 16 + x] = v.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
}
}
for group in 0..2 {
let col_start = group * 8;
vert_dct_add_8x32_16bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
16,
bitdepth_max,
);
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x16_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_8bpc(dst, dst_base, dst_stride, coeff, 32, 16, 1);
return;
}
let eob_thresholds: [i32; 2] = [36, 512];
let mut scratch = [0i16; 512];
for group in 0..2 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
horz_32x16_rect(
coeff,
row_start,
16, &mut scratch,
row_start * 32,
);
}
for group in 0..4 {
let col_start = group * 8;
vert_dct16_add_8x16_8bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
32, );
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn horz_32x16_rect(
coeff: &mut [i16],
coeff_base: usize,
coeff_stride: usize,
scratch: &mut [i16],
scratch_base: usize,
) {
horz_32x8_rect(coeff, coeff_base, coeff_stride, scratch, scratch_base);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x16_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_large_rect_16bpc(dst, dst_base, dst_stride, coeff, 32, 16, 1, bitdepth_max);
return;
}
let eob_thresholds: [i32; 2] = [36, 512];
let mut scratch = [0i16; 512];
for group in 0..2 {
let row_start = group * 8;
if group > 0 && eob < eob_thresholds[group - 1] {
break;
}
for r in 0..8 {
let y = row_start + r;
let mut input = [0i32; 32];
for x in 0..32 {
input[x] = coeff[y + x * 16];
coeff[y + x * 16] = 0;
}
let out = scalar_dct32_1d(&input);
for x in 0..32 {
let v = (out[x] + 1) >> 1;
scratch[y * 32 + x] = v.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
}
}
for group in 0..4 {
let col_start = group * 8;
vert_dct16_add_8x16_16bpc(
dst,
dst_base + col_start,
dst_stride,
&scratch,
col_start,
32,
bitdepth_max,
);
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_8x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_thresholds: [i32; 4] = [36, 136, 300, 1024];
for rg in 0..4 {
if rg > 0 && eob < eob_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let base = c * 32 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off = dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride);
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_32x8_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
_eob: i32,
_bitdepth_max: i32,
) {
for cg in 0..4 {
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 8;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off = dst_base.wrapping_add_signed(r as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_16x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 4] = [36, 136, 300, 1024];
let eob_col_thresholds: [i32; 2] = [36, 512];
for rg in 0..4 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..2 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 32 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_32x16_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 2] = [36, 512];
let eob_col_thresholds: [i32; 4] = [36, 136, 300, 1024];
for rg in 0..2 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..4 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 16 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[allow(dead_code)]
fn scalar_dct8_1d(input: &[i32; 8]) -> [i32; 8] {
let s0 = input[0] + input[7];
let s1 = input[1] + input[6];
let s2 = input[2] + input[5];
let s3 = input[3] + input[4];
let s4 = input[0] - input[7];
let s5 = input[1] - input[6];
let s6 = input[2] - input[5];
let s7 = input[3] - input[4];
let a0 = s0 + s3;
let a1 = s1 + s2;
let a2 = s0 - s3;
let a3 = s1 - s2;
let e0 = a0 + a1;
let e1 = a0 - a1;
let e2 = (a2 * 1567 + a3 * 3784 + 2048) >> 12;
let e3 = (a2 * 3784 - a3 * 1567 + 2048) >> 12;
let t8a = (s4 * 799 + s7 * 4017 + 2048) >> 12;
let t15a = (s4 * 4017 - s7 * 799 + 2048) >> 12;
let t9a = (s5 * 3406 + s6 * 2276 + 2048) >> 12;
let t14a = (s5 * 2276 - s6 * 3406 + 2048) >> 12;
let t8 = t8a + t9a;
let t9 = t8a - t9a;
let t14 = t15a - t14a;
let t15 = t15a + t14a;
let t9a = (t14 * 2896 - t9 * 2896 + 2048) >> 12;
let t14a = (t14 * 2896 + t9 * 2896 + 2048) >> 12;
[
e0 + t15,
e2 + t14a,
e1 + t9a,
e3 + t8,
e3 - t8,
e1 - t9a,
e2 - t14a,
e0 - t15,
]
}
#[allow(dead_code)]
fn scalar_dct16_1d(input: &[i32; 16]) -> [i32; 16] {
let mut even = [0i32; 8];
for i in 0..8 {
even[i] = input[2 * i];
}
let even_out = scalar_dct8_1d(&even);
let c = [401i32, 4076, 3166, 2598, 1931, 3612, 3920, 1189];
let t8a = (input[1] * c[0] - input[15] * c[1] + 2048) >> 12;
let t15a = (input[1] * c[1] + input[15] * c[0] + 2048) >> 12;
let t9a = (input[9] * c[2] - input[7] * c[3] + 2048) >> 12;
let t14a = (input[9] * c[3] + input[7] * c[2] + 2048) >> 12;
let t10a = (input[5] * c[4] - input[11] * c[5] + 2048) >> 12;
let t13a = (input[5] * c[5] + input[11] * c[4] + 2048) >> 12;
let t11a = (input[13] * c[6] - input[3] * c[7] + 2048) >> 12;
let t12a = (input[13] * c[7] + input[3] * c[6] + 2048) >> 12;
let t8 = t8a + t9a;
let t9 = t8a - t9a;
let t10 = t11a - t10a;
let t11 = t11a + t10a;
let t12 = t12a + t13a;
let t13 = t12a - t13a;
let t14 = t15a - t14a;
let t15 = t15a + t14a;
let t9a = (t14 * 1567 - t9 * 3784 + 2048) >> 12;
let t14a = (t14 * 3784 + t9 * 1567 + 2048) >> 12;
let t13a = (t13 * 1567 - t10 * 3784 + 2048) >> 12;
let t10a = -((t13 * 3784 + t10 * 1567 + 2048) >> 12);
let t8a = t8 + t11;
let t11a = t8 - t11;
let t9b = t9a + t10a;
let t10b = t9a - t10a;
let t12a = t15 - t12;
let t15a = t15 + t12;
let t13b = t14a - t13a;
let t14b = t14a + t13a;
let t11_f = (t12a * 2896 - t11a * 2896 + 2048) >> 12;
let t12_f = (t12a * 2896 + t11a * 2896 + 2048) >> 12;
let t10_f = (t13b * 2896 - t10b * 2896 + 2048) >> 12;
let t13_f = (t13b * 2896 + t10b * 2896 + 2048) >> 12;
let mut out = [0i32; 16];
out[0] = even_out[0] + t15a;
out[1] = even_out[1] + t14b;
out[2] = even_out[2] + t13_f;
out[3] = even_out[3] + t12_f;
out[4] = even_out[4] + t11_f;
out[5] = even_out[5] + t10_f;
out[6] = even_out[6] + t9b;
out[7] = even_out[7] + t8a;
out[8] = even_out[7] - t8a;
out[9] = even_out[6] - t9b;
out[10] = even_out[5] - t10_f;
out[11] = even_out[4] - t11_f;
out[12] = even_out[3] - t12_f;
out[13] = even_out[2] - t13_f;
out[14] = even_out[1] - t14b;
out[15] = even_out[0] - t15a;
out
}
#[allow(dead_code)]
fn scalar_dct32_1d(input: &[i32; 32]) -> [i32; 32] {
let mut even = [0i32; 16];
for i in 0..16 {
even[i] = input[2 * i];
}
let even_out = scalar_dct16_1d(&even);
let c0 = [201i32, 4091, 3035, 2751, 1751, 3703, 3857, 1380];
let c1 = [995i32, 3973, 3513, 2106, 2440, 3290, 4052, 601];
let t16a = (input[1] * c0[0] - input[31] * c0[1] + 2048) >> 12;
let t31a = (input[1] * c0[1] + input[31] * c0[0] + 2048) >> 12;
let t17a = (input[17] * c0[2] - input[15] * c0[3] + 2048) >> 12;
let t30a = (input[17] * c0[3] + input[15] * c0[2] + 2048) >> 12;
let t18a = (input[9] * c0[4] - input[23] * c0[5] + 2048) >> 12;
let t29a = (input[9] * c0[5] + input[23] * c0[4] + 2048) >> 12;
let t19a = (input[25] * c0[6] - input[7] * c0[7] + 2048) >> 12;
let t28a = (input[25] * c0[7] + input[7] * c0[6] + 2048) >> 12;
let t20a = (input[5] * c1[0] - input[27] * c1[1] + 2048) >> 12;
let t27a = (input[5] * c1[1] + input[27] * c1[0] + 2048) >> 12;
let t21a = (input[21] * c1[2] - input[11] * c1[3] + 2048) >> 12;
let t26a = (input[21] * c1[3] + input[11] * c1[2] + 2048) >> 12;
let t22a = (input[13] * c1[4] - input[19] * c1[5] + 2048) >> 12;
let t25a = (input[13] * c1[5] + input[19] * c1[4] + 2048) >> 12;
let t23a = (input[29] * c1[6] - input[3] * c1[7] + 2048) >> 12;
let t24a = (input[29] * c1[7] + input[3] * c1[6] + 2048) >> 12;
let s16 = t16a + t17a;
let s17 = t16a - t17a;
let s18 = t19a - t18a;
let s19 = t19a + t18a;
let s20 = t20a + t21a;
let s21 = t20a - t21a;
let s22 = t23a - t22a;
let s23 = t23a + t22a;
let s24 = t24a + t25a;
let s25 = t24a - t25a;
let s26 = t27a - t26a;
let s27 = t27a + t26a;
let s28 = t28a + t29a;
let s29 = t28a - t29a;
let s30 = t31a - t30a;
let s31 = t31a + t30a;
let u17a = (s30 * 799 - s17 * 4017 + 2048) >> 12;
let u30a = (s30 * 4017 + s17 * 799 + 2048) >> 12;
let u18a = -((s29 * 4017 + s18 * 799 + 2048) >> 12);
let u29a = (s29 * 799 - s18 * 4017 + 2048) >> 12;
let u21a = (s26 * 3406 - s21 * 2276 + 2048) >> 12;
let u26a = (s26 * 2276 + s21 * 3406 + 2048) >> 12;
let u22a = -((s25 * 2276 + s22 * 3406 + 2048) >> 12);
let u25a = (s25 * 3406 - s22 * 2276 + 2048) >> 12;
let w16a = s16 + s19;
let w19a = s16 - s19;
let w17 = u17a + u18a;
let w18 = u17a - u18a;
let w20a = s23 - s20;
let w23a = s23 + s20;
let w21 = u22a - u21a;
let w22 = u22a + u21a;
let w24a = s24 + s27;
let w27a = s24 - s27;
let w25 = u25a + u26a;
let w26 = u25a - u26a;
let w28a = s31 - s28;
let w31a = s31 + s28;
let w29 = u30a - u29a;
let w30 = u30a + u29a;
let x18a = (w29 * 1567 - w18 * 3784 + 2048) >> 12;
let x29a = (w29 * 3784 + w18 * 1567 + 2048) >> 12;
let x19 = (w28a * 1567 - w19a * 3784 + 2048) >> 12;
let x28 = (w28a * 3784 + w19a * 1567 + 2048) >> 12;
let x20 = -((w27a * 3784 + w20a * 1567 + 2048) >> 12);
let x27 = (w27a * 1567 - w20a * 3784 + 2048) >> 12;
let x21a = -((w26 * 3784 + w21 * 1567 + 2048) >> 12);
let x26a = (w26 * 1567 - w21 * 3784 + 2048) >> 12;
let y16 = w16a + w23a;
let y23 = w16a - w23a;
let y17a = w17 + w22;
let y22a = w17 - w22;
let y18 = x18a + x21a;
let y21 = x18a - x21a;
let y19a = x19 + x20;
let y20a = x19 - x20;
let y24 = w31a - w24a;
let y31 = w31a + w24a;
let y25a = w30 - w25;
let y30a = w30 + w25;
let y26 = x29a - x26a;
let y29 = x29a + x26a;
let y27a = x28 - x27;
let y28a = x28 + x27;
let z20 = (y27a * 2896 - y20a * 2896 + 2048) >> 12;
let z27 = (y27a * 2896 + y20a * 2896 + 2048) >> 12;
let z21a = (y26 * 2896 - y21 * 2896 + 2048) >> 12;
let z26a = (y26 * 2896 + y21 * 2896 + 2048) >> 12;
let z22 = (y25a * 2896 - y22a * 2896 + 2048) >> 12;
let z25 = (y25a * 2896 + y22a * 2896 + 2048) >> 12;
let z23a = (y24 * 2896 - y23 * 2896 + 2048) >> 12;
let z24a = (y24 * 2896 + y23 * 2896 + 2048) >> 12;
let mut out = [0i32; 32];
let odd = [
y16, y17a, y18, y19a, z20, z21a, z22, z23a, z24a, z25, z26a, z27, y28a, y29, y30a, y31,
];
for i in 0..16 {
out[i] = even_out[i] + odd[15 - i];
out[31 - i] = even_out[i] - odd[15 - i];
}
out
}