#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
use archmage::{Arm64, arcane, rite};
#[cfg(target_arch = "aarch64")]
use safe_unaligned_simd::aarch64 as safe_simd;
use super::itx_arm_neon_8x8::transpose_8x8h;
#[inline(always)]
fn coeff_or_zero(coeff: &[i16], idx: usize) -> i32 {
if idx < coeff.len() {
coeff[idx] as i32
} else {
0
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_64x64_8bpc(dst: &mut [u8], dst_base: usize, dst_stride: isize, coeff: &mut [i16]) {
let dc = coeff[0];
coeff[0] = 0;
let scale = vdupq_n_s16((2896 * 8) as i16);
let v = vdupq_n_s16(dc);
let v = vqrdmulhq_s16(v, scale);
let v = vrshrq_n_s16::<2>(v);
let v = vqrdmulhq_s16(v, scale);
let v = vrshrq_n_s16::<4>(v);
for y in 0..64 {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..8 {
let off = row_off + half * 8;
let dst_bytes: [u8; 8] = dst[off..off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(v), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[off..off + 8].copy_from_slice(&out);
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_64x64_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
bitdepth_max: i32,
) {
let dc_val = coeff[0];
coeff[0] = 0;
let scale = 2896i64 * 8;
let mut dc = ((dc_val as i64 * scale + 16384) >> 15) as i32;
dc = (dc + 2) >> 2;
dc = ((dc as i64 * scale + 16384) >> 15) as i32;
dc = (dc + 8) >> 4;
let dc = dc.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
let dc_vec = vdupq_n_s16(dc);
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
for y in 0..64 {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..8 {
let off = row_off + half * 8;
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, dc_vec);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out = [0i16; 8];
safe_simd::vst1q_s16(&mut out, clamped);
for j in 0..8 {
dst[off + j] = out[j] as u16;
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_rect64_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
w: usize,
h: usize,
shift: i32,
) {
let dc = coeff[0];
coeff[0] = 0;
let scale = vdupq_n_s16((2896 * 8) as i16);
let v = vdupq_n_s16(dc);
let v = vqrdmulhq_s16(v, scale);
let v = vqrdmulhq_s16(v, scale);
let v = match shift {
1 => vrshrq_n_s16::<1>(v),
2 => vrshrq_n_s16::<2>(v),
_ => v,
};
let v = vqrdmulhq_s16(v, scale);
let v = vrshrq_n_s16::<4>(v);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..(w / 8) {
let off = row_off + half * 8;
let dst_bytes: [u8; 8] = dst[off..off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(v), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[off..off + 8].copy_from_slice(&out);
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn dc_only_rect64_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
w: usize,
h: usize,
shift: i32,
bitdepth_max: i32,
) {
let dc_val = coeff[0];
coeff[0] = 0;
let scale = 2896i64 * 8;
let mut dc = ((dc_val as i64 * scale + 16384) >> 15) as i32;
dc = ((dc as i64 * scale + 16384) >> 15) as i32;
dc = match shift {
1 => (dc + 1) >> 1,
2 => (dc + 2) >> 2,
_ => dc,
};
dc = ((dc as i64 * scale + 16384) >> 15) as i32;
dc = (dc + 8) >> 4;
let dc = dc.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
let dc_vec = vdupq_n_s16(dc);
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for half in 0..(w / 8) {
let off = row_off + half * 8;
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, dc_vec);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out = [0i16; 8];
safe_simd::vst1q_s16(&mut out, clamped);
for j in 0..8 {
dst[off + j] = out[j] as u16;
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn neon_add_to_dst_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
tmp: &[i32],
w: usize,
h: usize,
total_shift: i32,
) {
let rounding = 1i32 << (total_shift - 1);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
let mut x = 0;
while x + 8 <= w {
let mut vals = [0i16; 8];
for j in 0..8 {
vals[j] = ((tmp[y * w + x + j] + rounding) >> total_shift)
.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
let v = safe_simd::vld1q_s16(&vals);
let off = row_off + x;
let dst_bytes: [u8; 8] = dst[off..off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(v), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[off..off + 8].copy_from_slice(&out);
x += 8;
}
}
}
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
fn neon_add_to_dst_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
tmp: &[i32],
w: usize,
h: usize,
total_shift: i32,
bitdepth_max: i32,
) {
let rounding = 1i32 << (total_shift - 1);
let bd_max = vdupq_n_s16(bitdepth_max as i16);
let zero = vdupq_n_s16(0);
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
let mut x = 0;
while x + 8 <= w {
let mut vals = [0i16; 8];
for j in 0..8 {
vals[j] = ((tmp[y * w + x + j] + rounding) >> total_shift)
.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
}
let v = safe_simd::vld1q_s16(&vals);
let off = row_off + x;
let mut arr = [0i16; 8];
for j in 0..8 {
arr[j] = dst[off + j] as i16;
}
let d = safe_simd::vld1q_s16(&arr);
let sum = vqaddq_s16(d, v);
let clamped = vminq_s16(vmaxq_s16(sum, zero), bd_max);
let mut out = [0i16; 8];
safe_simd::vst1q_s16(&mut out, clamped);
for j in 0..8 {
dst[off + j] = out[j] as u16;
}
x += 8;
}
}
}
#[allow(dead_code)]
fn scalar_dct64_1d(input: &[i32; 64]) -> [i32; 64] {
let mut even = [0i32; 32];
for i in 0..32 {
even[i] = input[2 * i];
}
let even_out = scalar_dct32_1d(&even);
let mut odd = [0i32; 32];
for i in 0..32 {
odd[i] = input[2 * i + 1];
}
let odd_out = scalar_idct64_odd(&odd);
let mut out = [0i32; 64];
for i in 0..32 {
out[i] = even_out[i] + odd_out[31 - i];
out[63 - i] = even_out[i] - odd_out[31 - i];
}
out
}
#[allow(dead_code)]
const IDCT64_COEFFS: [[i32; 2]; 16] = [
[101, 4095],
[2967, 2824],
[1660, 3745],
[3884, 1474],
[897, 3996],
[3461, 2191],
[2359, 3349],
[4076, 700],
[501, 4065],
[3229, 2520],
[2019, 3564],
[3948, 1092],
[1285, 3889],
[3659, 1842],
[2675, 3102],
[4017, 301],
];
#[allow(dead_code)]
fn scalar_idct64_odd(input: &[i32; 32]) -> [i32; 32] {
let mut t = [0i32; 32];
for i in 0..16 {
let c0 = IDCT64_COEFFS[i][0];
let c1 = IDCT64_COEFFS[i][1];
t[2 * i] = (input[2 * i] * c0 - input[31 - 2 * i] * c1 + 2048) >> 12;
t[2 * i + 1] = (input[2 * i] * c1 + input[31 - 2 * i] * c0 + 2048) >> 12;
}
let mut s = [0i32; 32];
for i in 0..16 {
if i % 2 == 0 {
s[2 * i] = t[2 * i] + t[2 * i + 2];
s[2 * i + 1] = t[2 * i + 1] + t[2 * i + 3];
s[2 * i + 2] = t[2 * i] - t[2 * i + 2];
s[2 * i + 3] = t[2 * i + 1] - t[2 * i + 3];
}
}
let mut out = [0i32; 32];
let mut a = [0i32; 32];
for i in 0..16 {
let c0 = IDCT64_COEFFS[i][0];
let c1 = IDCT64_COEFFS[i][1];
a[i] = (input[i] * c0 - input[31 - i] * c1 + 2048) >> 12;
a[31 - i] = (input[i] * c1 + input[31 - i] * c0 + 2048) >> 12;
}
let mut b = [0i32; 32];
for i in (0..32).step_by(4) {
b[i] = a[i] + a[i + 1];
b[i + 1] = a[i] - a[i + 1];
b[i + 2] = a[i + 3] - a[i + 2];
b[i + 3] = a[i + 3] + a[i + 2];
}
let c32 = [
(201i32, 4091),
(3035, 2751),
(1751, 3703),
(3857, 1380),
(995, 3973),
(3513, 2106),
(2440, 3290),
(4052, 601),
];
let mut c = b;
for i in 0..8 {
let idx = 4 * i + 1;
let (co, si) = c32[i];
let v0 = b[idx];
let v1 = b[idx + 1];
c[idx] = (v0 * co - v1 * si + 2048) >> 12;
c[idx + 1] = (v0 * si + v1 * co + 2048) >> 12;
}
let mut d = [0i32; 32];
for i in (0..32).step_by(8) {
d[i] = c[i] + c[i + 2];
d[i + 1] = c[i + 1] + c[i + 3];
d[i + 2] = c[i] - c[i + 2];
d[i + 3] = c[i + 1] - c[i + 3];
d[i + 4] = c[i + 7] - c[i + 4];
d[i + 5] = c[i + 6] - c[i + 5];
d[i + 6] = c[i + 7] + c[i + 4];
d[i + 7] = c[i + 6] + c[i + 5];
}
let c16 = [(799i32, 4017), (3406, 2276), (2276, 3406), (4017, 799)];
let mut e = d;
for i in 0..4 {
let idx = 8 * i;
let v2 = d[idx + 2];
let v3 = d[idx + 3];
e[idx + 2] = (v2 * c16[i].0 - v3 * c16[i].1 + 2048) >> 12;
e[idx + 3] = (v2 * c16[i].1 + v3 * c16[i].0 + 2048) >> 12;
let v4 = d[idx + 4];
let v5 = d[idx + 5];
if i < 2 {
e[idx + 4] = -((v4 * c16[3 - i].1 + v5 * c16[3 - i].0 + 2048) >> 12);
e[idx + 5] = (v4 * c16[3 - i].0 - v5 * c16[3 - i].1 + 2048) >> 12;
} else {
e[idx + 4] = -((v4 * c16[3 - i].1 + v5 * c16[3 - i].0 + 2048) >> 12);
e[idx + 5] = (v4 * c16[3 - i].0 - v5 * c16[3 - i].1 + 2048) >> 12;
}
}
let mut f = [0i32; 32];
for i in (0..32).step_by(16) {
for j in 0..4 {
f[i + j] = e[i + j] + e[i + 4 + j];
f[i + 4 + j] = e[i + j] - e[i + 4 + j];
f[i + 8 + j] = e[i + 15 - j] - e[i + 11 - j];
f[i + 12 + j] = e[i + 15 - j] + e[i + 11 - j];
}
}
let mut g = f;
for i in 0..2 {
let idx = 16 * i;
for j in [4, 5, 6, 7] {
let _ = j; }
let v4 = f[idx + 4];
let v5 = f[idx + 5];
g[idx + 4] = (v4 * 1567 - v5 * 3784 + 2048) >> 12;
g[idx + 5] = (v4 * 3784 + v5 * 1567 + 2048) >> 12;
let v6 = f[idx + 6];
let v7 = f[idx + 7];
g[idx + 6] = (v6 * 1567 - v7 * 3784 + 2048) >> 12;
g[idx + 7] = (v6 * 3784 + v7 * 1567 + 2048) >> 12;
let v8 = f[idx + 8];
let v9 = f[idx + 9];
g[idx + 8] = -((v8 * 3784 + v9 * 1567 + 2048) >> 12);
g[idx + 9] = (v8 * 1567 - v9 * 3784 + 2048) >> 12;
let v10 = f[idx + 10];
let v11 = f[idx + 11];
g[idx + 10] = -((v10 * 3784 + v11 * 1567 + 2048) >> 12);
g[idx + 11] = (v10 * 1567 - v11 * 3784 + 2048) >> 12;
}
let mut h = [0i32; 32];
for j in 0..8 {
h[j] = g[j] + g[8 + j];
h[8 + j] = g[j] - g[8 + j];
h[16 + j] = g[31 - j] - g[23 - j];
h[24 + j] = g[31 - j] + g[23 - j];
}
for j in [8, 9, 10, 11, 12, 13, 14, 15] {
let v0 = h[j];
let v1 = h[31 - j + 8]; let _ = v0;
let _ = v1;
}
let mut result = h;
for j in 0..8 {
let a_val = h[8 + j];
let b_val = h[23 - j];
result[8 + j] = (b_val * 2896 - a_val * 2896 + 2048) >> 12;
result[23 - j] = (b_val * 2896 + a_val * 2896 + 2048) >> 12;
}
out = result;
out
}
#[allow(dead_code)]
fn scalar_dct32_1d(input: &[i32; 32]) -> [i32; 32] {
let mut even = [0i32; 16];
for i in 0..16 {
even[i] = input[2 * i];
}
let even_out = scalar_dct16_1d(&even);
let c0 = [201i32, 4091, 3035, 2751, 1751, 3703, 3857, 1380];
let c1 = [995i32, 3973, 3513, 2106, 2440, 3290, 4052, 601];
let t16a = (input[1] * c0[0] - input[31] * c0[1] + 2048) >> 12;
let t31a = (input[1] * c0[1] + input[31] * c0[0] + 2048) >> 12;
let t17a = (input[17] * c0[2] - input[15] * c0[3] + 2048) >> 12;
let t30a = (input[17] * c0[3] + input[15] * c0[2] + 2048) >> 12;
let t18a = (input[9] * c0[4] - input[23] * c0[5] + 2048) >> 12;
let t29a = (input[9] * c0[5] + input[23] * c0[4] + 2048) >> 12;
let t19a = (input[25] * c0[6] - input[7] * c0[7] + 2048) >> 12;
let t28a = (input[25] * c0[7] + input[7] * c0[6] + 2048) >> 12;
let t20a = (input[5] * c1[0] - input[27] * c1[1] + 2048) >> 12;
let t27a = (input[5] * c1[1] + input[27] * c1[0] + 2048) >> 12;
let t21a = (input[21] * c1[2] - input[11] * c1[3] + 2048) >> 12;
let t26a = (input[21] * c1[3] + input[11] * c1[2] + 2048) >> 12;
let t22a = (input[13] * c1[4] - input[19] * c1[5] + 2048) >> 12;
let t25a = (input[13] * c1[5] + input[19] * c1[4] + 2048) >> 12;
let t23a = (input[29] * c1[6] - input[3] * c1[7] + 2048) >> 12;
let t24a = (input[29] * c1[7] + input[3] * c1[6] + 2048) >> 12;
let s16 = t16a + t17a;
let s17 = t16a - t17a;
let s18 = t19a - t18a;
let s19 = t19a + t18a;
let s20 = t20a + t21a;
let s21 = t20a - t21a;
let s22 = t23a - t22a;
let s23 = t23a + t22a;
let s24 = t24a + t25a;
let s25 = t24a - t25a;
let s26 = t27a - t26a;
let s27 = t27a + t26a;
let s28 = t28a + t29a;
let s29 = t28a - t29a;
let s30 = t31a - t30a;
let s31 = t31a + t30a;
let u17a = (s30 * 799 - s17 * 4017 + 2048) >> 12;
let u30a = (s30 * 4017 + s17 * 799 + 2048) >> 12;
let u18a = -((s29 * 4017 + s18 * 799 + 2048) >> 12);
let u29a = (s29 * 799 - s18 * 4017 + 2048) >> 12;
let u21a = (s26 * 3406 - s21 * 2276 + 2048) >> 12;
let u26a = (s26 * 2276 + s21 * 3406 + 2048) >> 12;
let u22a = -((s25 * 2276 + s22 * 3406 + 2048) >> 12);
let u25a = (s25 * 3406 - s22 * 2276 + 2048) >> 12;
let w16a = s16 + s19;
let w19a = s16 - s19;
let w17 = u17a + u18a;
let w18 = u17a - u18a;
let w20a = s23 - s20;
let w23a = s23 + s20;
let w21 = u22a - u21a;
let w22 = u22a + u21a;
let w24a = s24 + s27;
let w27a = s24 - s27;
let w25 = u25a + u26a;
let w26 = u25a - u26a;
let w28a = s31 - s28;
let w31a = s31 + s28;
let w29 = u30a - u29a;
let w30 = u30a + u29a;
let x18a = (w29 * 1567 - w18 * 3784 + 2048) >> 12;
let x29a = (w29 * 3784 + w18 * 1567 + 2048) >> 12;
let x19 = (w28a * 1567 - w19a * 3784 + 2048) >> 12;
let x28 = (w28a * 3784 + w19a * 1567 + 2048) >> 12;
let x20 = -((w27a * 3784 + w20a * 1567 + 2048) >> 12);
let x27 = (w27a * 1567 - w20a * 3784 + 2048) >> 12;
let x21a = -((w26 * 3784 + w21 * 1567 + 2048) >> 12);
let x26a = (w26 * 1567 - w21 * 3784 + 2048) >> 12;
let y16 = w16a + w23a;
let y23 = w16a - w23a;
let y17a = w17 + w22;
let y22a = w17 - w22;
let y18 = x18a + x21a;
let y21 = x18a - x21a;
let y19a = x19 + x20;
let y20a = x19 - x20;
let y24 = w31a - w24a;
let y31 = w31a + w24a;
let y25a = w30 - w25;
let y30a = w30 + w25;
let y26 = x29a - x26a;
let y29 = x29a + x26a;
let y27a = x28 - x27;
let y28a = x28 + x27;
let z20 = (y27a * 2896 - y20a * 2896 + 2048) >> 12;
let z27 = (y27a * 2896 + y20a * 2896 + 2048) >> 12;
let z21a = (y26 * 2896 - y21 * 2896 + 2048) >> 12;
let z26a = (y26 * 2896 + y21 * 2896 + 2048) >> 12;
let z22 = (y25a * 2896 - y22a * 2896 + 2048) >> 12;
let z25 = (y25a * 2896 + y22a * 2896 + 2048) >> 12;
let z23a = (y24 * 2896 - y23 * 2896 + 2048) >> 12;
let z24a = (y24 * 2896 + y23 * 2896 + 2048) >> 12;
let mut out = [0i32; 32];
let odd = [
y16, y17a, y18, y19a, z20, z21a, z22, z23a, z24a, z25, z26a, z27, y28a, y29, y30a, y31,
];
for i in 0..16 {
out[i] = even_out[i] + odd[15 - i];
out[31 - i] = even_out[i] - odd[15 - i];
}
out
}
#[allow(dead_code)]
fn scalar_dct16_1d(input: &[i32; 16]) -> [i32; 16] {
let mut even = [0i32; 8];
for i in 0..8 {
even[i] = input[2 * i];
}
let even_out = scalar_dct8_1d(&even);
let c = [401i32, 4076, 3166, 2598, 1931, 3612, 3920, 1189];
let t8a = (input[1] * c[0] - input[15] * c[1] + 2048) >> 12;
let t15a = (input[1] * c[1] + input[15] * c[0] + 2048) >> 12;
let t9a = (input[9] * c[2] - input[7] * c[3] + 2048) >> 12;
let t14a = (input[9] * c[3] + input[7] * c[2] + 2048) >> 12;
let t10a = (input[5] * c[4] - input[11] * c[5] + 2048) >> 12;
let t13a = (input[5] * c[5] + input[11] * c[4] + 2048) >> 12;
let t11a = (input[13] * c[6] - input[3] * c[7] + 2048) >> 12;
let t12a = (input[13] * c[7] + input[3] * c[6] + 2048) >> 12;
let t8 = t8a + t9a;
let t9 = t8a - t9a;
let t10 = t11a - t10a;
let t11 = t11a + t10a;
let t12 = t12a + t13a;
let t13 = t12a - t13a;
let t14 = t15a - t14a;
let t15 = t15a + t14a;
let t9a = (t14 * 1567 - t9 * 3784 + 2048) >> 12;
let t14a = (t14 * 3784 + t9 * 1567 + 2048) >> 12;
let t13a = (t13 * 1567 - t10 * 3784 + 2048) >> 12;
let t10a = -((t13 * 3784 + t10 * 1567 + 2048) >> 12);
let t8a = t8 + t11;
let t11a = t8 - t11;
let t9b = t9a + t10a;
let t10b = t9a - t10a;
let t12a = t15 - t12;
let t15a = t15 + t12;
let t13b = t14a - t13a;
let t14b = t14a + t13a;
let t11_f = (t12a * 2896 - t11a * 2896 + 2048) >> 12;
let t12_f = (t12a * 2896 + t11a * 2896 + 2048) >> 12;
let t10_f = (t13b * 2896 - t10b * 2896 + 2048) >> 12;
let t13_f = (t13b * 2896 + t10b * 2896 + 2048) >> 12;
[
even_out[0] + t15a,
even_out[1] + t14b,
even_out[2] + t13_f,
even_out[3] + t12_f,
even_out[4] + t11_f,
even_out[5] + t10_f,
even_out[6] + t9b,
even_out[7] + t8a,
even_out[7] - t8a,
even_out[6] - t9b,
even_out[5] - t10_f,
even_out[4] - t11_f,
even_out[3] - t12_f,
even_out[2] - t13_f,
even_out[1] - t14b,
even_out[0] - t15a,
]
}
#[allow(dead_code)]
fn scalar_dct8_1d(input: &[i32; 8]) -> [i32; 8] {
let s0 = input[0] + input[7];
let s1 = input[1] + input[6];
let s2 = input[2] + input[5];
let s3 = input[3] + input[4];
let s4 = input[0] - input[7];
let s5 = input[1] - input[6];
let s6 = input[2] - input[5];
let s7 = input[3] - input[4];
let a0 = s0 + s3;
let a1 = s1 + s2;
let a2 = s0 - s3;
let a3 = s1 - s2;
let e0 = a0 + a1;
let e1 = a0 - a1;
let e2 = (a2 * 1567 + a3 * 3784 + 2048) >> 12;
let e3 = (a2 * 3784 - a3 * 1567 + 2048) >> 12;
let t8a = (s4 * 799 + s7 * 4017 + 2048) >> 12;
let t15a = (s4 * 4017 - s7 * 799 + 2048) >> 12;
let t9a = (s5 * 3406 + s6 * 2276 + 2048) >> 12;
let t14a = (s5 * 2276 - s6 * 3406 + 2048) >> 12;
let t8 = t8a + t9a;
let t9 = t8a - t9a;
let t14 = t15a - t14a;
let t15 = t15a + t14a;
let t9a = (t14 * 2896 - t9 * 2896 + 2048) >> 12;
let t14a = (t14 * 2896 + t9 * 2896 + 2048) >> 12;
[
e0 + t15,
e2 + t14a,
e1 + t9a,
e3 + t8,
e3 - t8,
e1 - t9a,
e2 - t14a,
e0 - t15,
]
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_64x64_8bpc(dst, dst_base, dst_stride, coeff);
return;
}
let mut tmp = vec![0i32; 4096];
for y in 0..64 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff_or_zero(coeff, y + x * 64);
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = out[x];
}
}
for x in 0..64 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_8bpc(dst, dst_base, dst_stride, &tmp, 64, 64, 12);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x64_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_64x64_16bpc(dst, dst_base, dst_stride, coeff, bitdepth_max);
return;
}
let mut tmp = vec![0i32; 4096];
for y in 0..64 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff.get(y + x * 64).copied().unwrap_or(0);
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = out[x];
}
}
for x in 0..64 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_16bpc(dst, dst_base, dst_stride, &tmp, 64, 64, 12, bitdepth_max);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_8bpc(dst, dst_base, dst_stride, coeff, 64, 32, 1);
return;
}
let mut tmp = vec![0i32; 2048];
for y in 0..32 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff_or_zero(coeff, y + x * 32);
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = out[x];
}
}
for x in 0..64 {
let mut input = [0i32; 32];
for y in 0..32 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct32_1d(&input);
for y in 0..32 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_8bpc(dst, dst_base, dst_stride, &tmp, 64, 32, 11);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x32_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_16bpc(dst, dst_base, dst_stride, coeff, 64, 32, 1, bitdepth_max);
return;
}
let mut tmp = vec![0i32; 2048];
for y in 0..32 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff.get(y + x * 32).copied().unwrap_or(0);
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = out[x];
}
}
for x in 0..64 {
let mut input = [0i32; 32];
for y in 0..32 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct32_1d(&input);
for y in 0..32 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_16bpc(dst, dst_base, dst_stride, &tmp, 64, 32, 11, bitdepth_max);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_8bpc(dst, dst_base, dst_stride, coeff, 32, 64, 1);
return;
}
let mut tmp = vec![0i32; 2048];
for y in 0..64 {
let mut input = [0i32; 32];
for x in 0..32 {
input[x] = coeff_or_zero(coeff, y + x * 64);
}
let out = scalar_dct32_1d(&input);
for x in 0..32 {
tmp[y * 32 + x] = out[x];
}
}
for x in 0..32 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 32 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 32 + x] = out[y];
}
}
neon_add_to_dst_8bpc(dst, dst_base, dst_stride, &tmp, 32, 64, 11);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_32x64_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_16bpc(dst, dst_base, dst_stride, coeff, 32, 64, 1, bitdepth_max);
return;
}
let mut tmp = vec![0i32; 2048];
for y in 0..64 {
let mut input = [0i32; 32];
for x in 0..32 {
input[x] = coeff.get(y + x * 64).copied().unwrap_or(0);
}
let out = scalar_dct32_1d(&input);
for x in 0..32 {
tmp[y * 32 + x] = out[x];
}
}
for x in 0..32 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 32 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 32 + x] = out[y];
}
}
neon_add_to_dst_16bpc(dst, dst_base, dst_stride, &tmp, 32, 64, 11, bitdepth_max);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_16x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_8bpc(dst, dst_base, dst_stride, coeff, 16, 64, 1);
return;
}
let mut tmp = vec![0i32; 1024];
for y in 0..64 {
let mut input = [0i32; 16];
for x in 0..16 {
input[x] = coeff_or_zero(coeff, y + x * 64);
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct16_1d(&input);
for x in 0..16 {
tmp[y * 16 + x] = (out[x] + 1) >> 1;
}
}
for x in 0..16 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 16 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 16 + x] = out[y];
}
}
neon_add_to_dst_8bpc(dst, dst_base, dst_stride, &tmp, 16, 64, 4);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_16x64_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_16bpc(dst, dst_base, dst_stride, coeff, 16, 64, 1, bitdepth_max);
return;
}
let mut tmp = vec![0i32; 1024];
for y in 0..64 {
let mut input = [0i32; 16];
for x in 0..16 {
input[x] = coeff.get(y + x * 64).copied().unwrap_or(0);
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct16_1d(&input);
for x in 0..16 {
tmp[y * 16 + x] = (out[x] + 1) >> 1;
}
}
for x in 0..16 {
let mut input = [0i32; 64];
for y in 0..64 {
input[y] = tmp[y * 16 + x];
}
let out = scalar_dct64_1d(&input);
for y in 0..64 {
tmp[y * 16 + x] = out[y];
}
}
neon_add_to_dst_16bpc(dst, dst_base, dst_stride, &tmp, 16, 64, 4, bitdepth_max);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x16_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_8bpc(dst, dst_base, dst_stride, coeff, 64, 16, 1);
return;
}
let mut tmp = vec![0i32; 1024];
for y in 0..16 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff_or_zero(coeff, y + x * 16);
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = (out[x] + 1) >> 1;
}
}
for x in 0..64 {
let mut input = [0i32; 16];
for y in 0..16 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct16_1d(&input);
for y in 0..16 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_8bpc(dst, dst_base, dst_stride, &tmp, 64, 16, 4);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_dct_dct_64x16_16bpc_neon_inner(
_token: Arm64,
dst: &mut [u16],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
) {
if eob == 0 {
dc_only_rect64_16bpc(dst, dst_base, dst_stride, coeff, 64, 16, 1, bitdepth_max);
return;
}
let mut tmp = vec![0i32; 1024];
for y in 0..16 {
let mut input = [0i32; 64];
for x in 0..64 {
input[x] = coeff.get(y + x * 16).copied().unwrap_or(0);
}
let scale = 2896i64 * 8;
for val in input.iter_mut() {
*val = ((*val as i64 * scale + 16384) >> 15) as i32;
}
let out = scalar_dct64_1d(&input);
for x in 0..64 {
tmp[y * 64 + x] = (out[x] + 1) >> 1;
}
}
for x in 0..64 {
let mut input = [0i32; 16];
for y in 0..16 {
input[y] = tmp[y * 64 + x];
}
let out = scalar_dct16_1d(&input);
for y in 0..16 {
tmp[y * 64 + x] = out[y];
}
}
neon_add_to_dst_16bpc(dst, dst_base, dst_stride, &tmp, 64, 16, 4, bitdepth_max);
coeff.fill(0);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_64x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_thresholds: [i32; 8] = [36, 136, 300, 1024, 1024, 1024, 1024, 4096];
for rg in 0..8 {
if rg > 0 && eob < eob_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..8 {
if cg > 0 && eob < eob_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 64 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_64x32_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 4] = [36, 136, 300, 1024];
let eob_col_thresholds: [i32; 8] = [36, 136, 300, 1024, 1024, 1024, 1024, 2048];
for rg in 0..4 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..8 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 32 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_32x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 8] = [36, 136, 300, 1024, 1024, 1024, 1024, 2048];
let eob_col_thresholds: [i32; 4] = [36, 136, 300, 1024];
for rg in 0..8 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..4 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 64 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_16x64_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 8] = [36, 136, 300, 1024, 1024, 1024, 1024, 1024];
let eob_col_thresholds: [i32; 2] = [36, 512];
for rg in 0..8 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..2 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 64 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
pub(crate) fn inv_txfm_add_identity_identity_64x16_8bpc_neon_inner(
_token: Arm64,
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
) {
let eob_row_thresholds: [i32; 2] = [36, 512];
let eob_col_thresholds: [i32; 8] = [36, 136, 300, 1024, 1024, 1024, 1024, 1024];
for rg in 0..2 {
if rg > 0 && eob < eob_row_thresholds[rg - 1] {
break;
}
let row_start = rg * 8;
for cg in 0..8 {
if cg > 0 && eob < eob_col_thresholds[cg - 1] {
break;
}
let col_start = cg * 8;
let zero_vec = vdupq_n_s16(0);
let mut v: [int16x8_t; 8] = [zero_vec; 8];
for c in 0..8 {
let col = col_start + c;
let base = col * 16 + row_start;
let arr: [i16; 8] = coeff[base..base + 8].try_into().unwrap();
v[c] = safe_simd::vld1q_s16(&arr);
coeff[base..base + 8].fill(0);
}
let (r0, r1, r2, r3, r4, r5, r6, r7) =
transpose_8x8h(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
let rows = [r0, r1, r2, r3, r4, r5, r6, r7];
for r in 0..8 {
let shifted = vrshrq_n_s16::<2>(rows[r]);
let row_off =
dst_base.wrapping_add_signed((row_start + r) as isize * dst_stride) + col_start;
let dst_bytes: [u8; 8] = dst[row_off..row_off + 8].try_into().unwrap();
let dst_u8 = safe_simd::vld1_u8(&dst_bytes);
let sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(shifted), dst_u8));
let result = vqmovun_s16(sum);
let mut out = [0u8; 8];
safe_simd::vst1_u8(&mut out, result);
dst[row_off..row_off + 8].copy_from_slice(&out);
}
}
}
}