#![allow(unused_imports)]
#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
use archmage::autoversion;
use std::ffi::c_int;
use crate::include::common::bitdepth::AsPrimitive;
use crate::include::common::bitdepth::BitDepth8;
use crate::include::common::bitdepth::BitDepth16;
use crate::include::common::bitdepth::DynCoef;
use crate::include::common::bitdepth::DynPixel;
use crate::include::common::intops::iclip;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
/// C-style spelling for a signed, pointer-sized offset; it is a plain alias for `isize`.
///
/// The lint is silenced because the lowercase name does not follow Rust's
/// `UpperCamelCase` convention for type names.
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;
/// Adds a two-pass 4x4 lifting transform (WHT, going by the function name) onto 8-bit pixels.
///
/// Pass one reads `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` for each
/// `r` in `0..4`, pre-shifts every value right by 2 and runs the butterfly; pass two
/// runs the same butterfly down each column of the intermediate. The resulting value is
/// added (without further rounding) to `dst[dst_base + row * dst_stride + col]` and
/// passed through `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed
/// before returning. `_eob` is not used.
///
/// Row offsets use `wrapping_add_signed`, so `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_wht_wht_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Shared 4-point lifting butterfly used by both passes.
    let butterfly = |in0: i32, in1: i32, in2: i32, in3: i32| -> [i32; 4] {
        let sum = in0 + in1;
        let diff = in2 - in3;
        let half = (sum - diff) >> 1;
        let lo = half - in3;
        let hi = half - in1;
        [sum - lo, lo, hi, diff + hi]
    };

    // Pass one: one butterfly per group of strided coefficients, stored row by row.
    let mut work = [0i32; 16];
    for (row, lane) in work.chunks_exact_mut(4).enumerate() {
        let line = butterfly(
            i32::from(coeff[row]) >> 2,
            i32::from(coeff[row + 4]) >> 2,
            i32::from(coeff[row + 8]) >> 2,
            i32::from(coeff[row + 12]) >> 2,
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: in-place butterfly down every column of the intermediate.
    for col in 0..4 {
        let [r0, r1, r2, r3] = butterfly(work[col], work[4 + col], work[8 + col], work[12 + col]);
        work[col] = r0;
        work[4 + col] = r1;
        work[8 + col] = r2;
        work[12 + col] = r3;
    }

    // Accumulate the residual into the destination, one row at a time.
    for (row, residuals) in work.chunks_exact(4).enumerate() {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride);
        for (col, &residual) in residuals.iter().enumerate() {
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Adds a two-pass 4x4 lifting transform (WHT, going by the function name) onto 16-bit pixels.
///
/// Pass one reads `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` for each
/// `r` in `0..4`, pre-shifts every value right by 2 and runs the butterfly; pass two
/// runs the same butterfly down each column of the intermediate. The resulting value is
/// added (without further rounding) to `dst[dst_base + row * dst_stride_u16 + col]` and
/// passed through `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed
/// before returning. `_eob` is not used.
///
/// `dst_stride_u16` is applied directly as an element offset into `dst` and may be
/// negative (offsets use `wrapping_add_signed`).
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_wht_wht_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Shared 4-point lifting butterfly used by both passes.
    let butterfly = |in0: i32, in1: i32, in2: i32, in3: i32| -> [i32; 4] {
        let sum = in0 + in1;
        let diff = in2 - in3;
        let half = (sum - diff) >> 1;
        let lo = half - in3;
        let hi = half - in1;
        [sum - lo, lo, hi, diff + hi]
    };

    // Pass one: one butterfly per group of strided coefficients, stored row by row.
    let mut work = [0i32; 16];
    for (row, lane) in work.chunks_exact_mut(4).enumerate() {
        let line = butterfly(
            coeff[row] >> 2,
            coeff[row + 4] >> 2,
            coeff[row + 8] >> 2,
            coeff[row + 12] >> 2,
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: in-place butterfly down every column of the intermediate.
    for col in 0..4 {
        let [r0, r1, r2, r3] = butterfly(work[col], work[4 + col], work[8 + col], work[12 + col]);
        work[col] = r0;
        work[4 + col] = r1;
        work[8 + col] = r2;
        work[12 + col] = r3;
    }

    // Accumulate the residual into the destination, one row at a time.
    for (row, residuals) in work.chunks_exact(4).enumerate() {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride_u16);
        for (col, &residual) in residuals.iter().enumerate() {
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
// 12-bit fixed-point constants (value * 4096) for the 4-point butterfly.
// 2896 ~= 4096 / sqrt(2); 1567 ~= 4096 * cos(3*pi/8); 3784 ~= 4096 * sin(3*pi/8).
// `DCT4_C1` and `DCT4_C2` are not referenced by `dct4_1d` below.
const DCT4_C1: i32 = 2896;
const DCT4_C2: i32 = 2896;
const DCT4_C3: i32 = 1567;
const DCT4_C4: i32 = 3784;

/// 4-point 1-D butterfly used by the 4x4 "dct" kernels.
///
/// The sum/difference pair `(in0 + in3, in1 + in2)` yields outputs 0 and 2 directly;
/// the pair `(in0 - in3, in1 - in2)` is rotated by the `DCT4_C3` / `DCT4_C4`
/// constants with rounding `(v + 2048) >> 12` to yield outputs 1 and 3.
///
/// Returns the four outputs in the order `[sum, rot_a, diff, rot_b]`.
#[inline(always)]
fn dct4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
    // Outer and inner pair sums / differences.
    let outer_sum = in0 + in3;
    let inner_sum = in1 + in2;
    let outer_diff = in0 - in3;
    let inner_diff = in1 - in2;

    // Rotation of the difference pair, rounded back from 12-bit fixed point.
    let rot_a = (outer_diff * DCT4_C3 + inner_diff * DCT4_C4 + 2048) >> 12;
    let rot_b = (outer_diff * DCT4_C4 - inner_diff * DCT4_C3 + 2048) >> 12;

    [outer_sum + inner_sum, rot_a, outer_sum - inner_sum, rot_b]
}
/// Two-pass 4x4 transform (`dct4_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_dct_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`dct4_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
// 12-bit fixed-point cosines: `COS_PI_k_16` ~= 4096 * cos(k * pi / 16).
// `COS_PI_4_16` is not referenced by `dct8_1d` below.
const COS_PI_1_16: i32 = 4017;
const COS_PI_2_16: i32 = 3784;
const COS_PI_3_16: i32 = 3406;
const COS_PI_4_16: i32 = 2896;
const COS_PI_5_16: i32 = 2276;
const COS_PI_6_16: i32 = 1567;
const COS_PI_7_16: i32 = 799;

/// 8-point 1-D butterfly used by the 8x8 "dct" kernels and by `dct16_1d`.
///
/// Mirrored input pairs `(input[k], input[7 - k])` are split into sums and
/// differences. The sums go through a 4-point even stage (outputs 0, 2, 4, 6);
/// the differences are rotated by the `COS_PI_*_16` constants and recombined into
/// outputs 1, 3, 5, 7. Every rotation is rounded with `(v + 2048) >> 12`.
#[inline(always)]
fn dct8_1d(input: &[i32; 8]) -> [i32; 8] {
    // Mirror-pair sums (`s*`) and differences (`d*`).
    let s0 = input[0] + input[7];
    let d0 = input[0] - input[7];
    let s1 = input[1] + input[6];
    let d1 = input[1] - input[6];
    let s2 = input[2] + input[5];
    let d2 = input[2] - input[5];
    let s3 = input[3] + input[4];
    let d3 = input[3] - input[4];

    // Even half: 4-point stage on the sums.
    let e0 = s0 + s3;
    let e3 = s0 - s3;
    let e1 = s1 + s2;
    let e2 = s1 - s2;
    let out0 = e0 + e1;
    let out4 = e0 - e1;
    let out2 = (e2 * COS_PI_6_16 + e3 * COS_PI_2_16 + 2048) >> 12;
    let out6 = (e3 * COS_PI_6_16 - e2 * COS_PI_2_16 + 2048) >> 12;

    // Odd half: rotate the differences, then combine.
    let o0 = (d3 * COS_PI_7_16 + d0 * COS_PI_1_16 + 2048) >> 12;
    let o7 = (d0 * COS_PI_7_16 - d3 * COS_PI_1_16 + 2048) >> 12;
    let o1 = (d2 * COS_PI_5_16 + d1 * COS_PI_3_16 + 2048) >> 12;
    let o6 = (d1 * COS_PI_5_16 - d2 * COS_PI_3_16 + 2048) >> 12;

    [out0, o0 + o1, out2, o7 - o6, out4, o7 + o6, out6, o0 - o1]
}
/// Two-pass 8x8 transform (`dct8_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one gathers `coeff[r + 8 * k]` (`k` in `0..8`) for each `r` in `0..8` and
/// stores the `dct8_1d` result as intermediate row `r`. Pass two runs `dct8_1d` down
/// each intermediate column, rounds with `(v + 32) >> 6`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_dct_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 64];
    for (row, lane) in stage.chunks_exact_mut(8).enumerate() {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = i32::from(coeff[row + k * 8]);
        }
        lane.copy_from_slice(&dct8_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..8 {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 8 + col];
        }
        let column = dct8_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 32) >> 6;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..64].fill(0);
}
/// Two-pass 8x8 transform (`dct8_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one gathers `coeff[r + 8 * k]` (`k` in `0..8`) for each `r` in `0..8` and
/// stores the `dct8_1d` result as intermediate row `r`. Pass two runs `dct8_1d` down
/// each intermediate column, rounds with `(v + 32) >> 6`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_dct_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 64];
    for (row, lane) in stage.chunks_exact_mut(8).enumerate() {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = coeff[row + k * 8];
        }
        lane.copy_from_slice(&dct8_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..8 {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 8 + col];
        }
        let column = dct8_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 32) >> 6;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..64].fill(0);
}
/// 16-point 1-D butterfly used by the 16x16 "dct" kernels.
///
/// Mirrored pairs `(input[k], input[15 - k])` are split into sums and differences.
/// The eight sums go through `dct8_1d` and land on the even output indices; the
/// eight differences are rotated (12-bit fixed point, rounding `(v + 2048) >> 12`),
/// combined, partly rotated again and land on the odd output indices.
#[inline(always)]
fn dct16_1d(input: &[i32; 16]) -> [i32; 16] {
    let round12 = |v: i32| (v + 2048) >> 12;

    // `sums[k]` = input[k] + input[15 - k]; `diffs[j]` = input[7 - j] - input[8 + j].
    let mut sums = [0i32; 8];
    let mut diffs = [0i32; 8];
    for k in 0..8 {
        let (near, far) = (input[k], input[15 - k]);
        sums[k] = near + far;
        diffs[7 - k] = near - far;
    }

    let even = dct8_1d(&sums);

    // First rotation layer on the mirrored difference pairs.
    let o0 = round12(diffs[0] * 401 + diffs[7] * 4076);
    let o15 = round12(diffs[7] * 401 - diffs[0] * 4076);
    let o1 = round12(diffs[1] * 1189 + diffs[6] * 3920);
    let o14 = round12(diffs[6] * 1189 - diffs[1] * 3920);
    let o2 = round12(diffs[2] * 1931 + diffs[5] * 3612);
    let o13 = round12(diffs[5] * 1931 - diffs[2] * 3612);
    let o3 = round12(diffs[3] * 2598 + diffs[4] * 3166);
    let o12 = round12(diffs[4] * 2598 - diffs[3] * 3166);

    // Pairwise sums and differences of the rotated values.
    let a0 = o0 + o1;
    let a1 = o0 - o1;
    let a2 = o2 + o3;
    let a3 = o2 - o3;
    let a4 = o12 + o13;
    let a5 = o12 - o13;
    let a6 = o14 + o15;
    let a7 = o14 - o15;

    // Second rotation layer.
    let b1 = round12(a1 * 1567 + a6 * 3784);
    let b6 = round12(a6 * 1567 - a1 * 3784);
    let b3 = round12(a3 * 3784 + a4 * 1567);
    let b4 = round12(a4 * 3784 - a3 * 1567);

    // Interleave: even indices from the 8-point stage, odd indices from above.
    [
        even[0],
        a0 + a2,
        even[1],
        b1 + b3,
        even[2],
        a7 + a5,
        even[3],
        b6 + b4,
        even[4],
        b6 - b4,
        even[5],
        a7 - a5,
        even[6],
        b1 - b3,
        even[7],
        a0 - a2,
    ]
}
/// Two-pass 16x16 transform (`dct16_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one gathers `coeff[r + 16 * k]` (`k` in `0..16`) for each `r` in `0..16` and
/// stores the `dct16_1d` result as intermediate row `r`. Pass two runs `dct16_1d`
/// down each intermediate column, rounds with `(v + 128) >> 8`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 256 coefficients are zeroed before
/// returning. `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_dct_16x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 256];
    for (row, lane) in stage.chunks_exact_mut(16).enumerate() {
        let mut gathered = [0i32; 16];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = i32::from(coeff[row + k * 16]);
        }
        lane.copy_from_slice(&dct16_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..16 {
        let mut gathered = [0i32; 16];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 16 + col];
        }
        let column = dct16_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 128) >> 8;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..256].fill(0);
}
/// Two-pass 16x16 transform (`dct16_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one gathers `coeff[r + 16 * k]` (`k` in `0..16`) for each `r` in `0..16` and
/// stores the `dct16_1d` result as intermediate row `r`. Pass two runs `dct16_1d`
/// down each intermediate column, rounds with `(v + 128) >> 8`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 256 coefficients are zeroed before
/// returning. `_eob` is not used. `dst_stride_u16` is an element offset and may be
/// negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_dct_16x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 256];
    for (row, lane) in stage.chunks_exact_mut(16).enumerate() {
        let mut gathered = [0i32; 16];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = coeff[row + k * 16];
        }
        lane.copy_from_slice(&dct16_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..16 {
        let mut gathered = [0i32; 16];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 16 + col];
        }
        let column = dct16_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 128) >> 8;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..256].fill(0);
}
/// 4x4 identity/identity kernel: scales each coefficient and adds it onto 8-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 4]` is
/// multiplied by 181 and rounded with `(v + 64) >> 7`, multiplied by 181 again and
/// rounded with `(v + 2048) >> 12`. The result is added to
/// `dst[dst_base + row * dst_stride + col]` and passed through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_identity_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // 181 / 128 ~= sqrt(2).
    const SQRT2_Q7: i32 = 181;

    for row in 0..4 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride);
        for col in 0..4 {
            let raw = i32::from(coeff[row + col * 4]);
            let twice_scaled = ((raw * SQRT2_Q7 + 64) >> 7) * SQRT2_Q7;
            let residual = (twice_scaled + 2048) >> 12;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// 4x4 identity/identity kernel: scales each coefficient and adds it onto 16-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 4]` is
/// multiplied by 181 and rounded with `(v + 64) >> 7`, multiplied by 181 again and
/// rounded with `(v + 2048) >> 12`. The result is added to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passed through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_identity_identity_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // 181 / 128 ~= sqrt(2).
    const SQRT2_Q7: i32 = 181;

    for row in 0..4 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride_u16);
        for col in 0..4 {
            let raw = coeff[row + col * 4];
            let twice_scaled = ((raw * SQRT2_Q7 + 64) >> 7) * SQRT2_Q7;
            let residual = (twice_scaled + 2048) >> 12;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// 8x8 identity/identity kernel: halves each coefficient and adds it onto 8-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 8]` is rounded
/// with `(c + 1) >> 1`, added to `dst[dst_base + row * dst_stride + col]` and passed
/// through `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed before
/// returning. `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_identity_identity_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    for row in 0..8 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride);
        for col in 0..8 {
            let raw = i32::from(coeff[row + col * 8]);
            let residual = (raw + 1) >> 1;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..64].fill(0);
}
/// 8x8 identity/identity kernel: halves each coefficient and adds it onto 16-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 8]` is rounded
/// with `(c + 1) >> 1`, added to `dst[dst_base + row * dst_stride_u16 + col]` and
/// passed through `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed
/// before returning. `_eob` is not used. `dst_stride_u16` is an element offset and may
/// be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_identity_identity_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    for row in 0..8 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride_u16);
        for col in 0..8 {
            let raw = coeff[row + col * 8];
            let residual = (raw + 1) >> 1;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..64].fill(0);
}
/// 16x16 identity/identity kernel: scales each coefficient and adds it onto 8-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 16]` is
/// multiplied by 181 and rounded with `(v + 64) >> 7`, then halved with
/// `(v + 1) >> 1`. The result is added to `dst[dst_base + row * dst_stride + col]`
/// and passed through `iclip(_, 0, bitdepth_max)`. The first 256 coefficients are
/// zeroed before returning. `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_identity_identity_16x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // 181 / 128 ~= sqrt(2).
    const SQRT2_Q7: i32 = 181;

    for row in 0..16 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride);
        for col in 0..16 {
            let raw = i32::from(coeff[row + col * 16]);
            let scaled = (raw * SQRT2_Q7 + 64) >> 7;
            let residual = (scaled + 1) >> 1;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..256].fill(0);
}
/// 16x16 identity/identity kernel: scales each coefficient and adds it onto 16-bit pixels.
///
/// For output position `(row, col)` the coefficient `coeff[row + col * 16]` is
/// multiplied by 181 and rounded with `(v + 64) >> 7`, then halved with
/// `(v + 1) >> 1`. The result is added to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passed through
/// `iclip(_, 0, bitdepth_max)`. The first 256 coefficients are zeroed before
/// returning. `_eob` is not used. `dst_stride_u16` is an element offset and may be
/// negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_identity_identity_16x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // 181 / 128 ~= sqrt(2).
    const SQRT2_Q7: i32 = 181;

    for row in 0..16 {
        let line_start = dst_base.wrapping_add_signed(row as isize * dst_stride_u16);
        for col in 0..16 {
            let raw = coeff[row + col * 16];
            let scaled = (raw * SQRT2_Q7 + 64) >> 7;
            let residual = (scaled + 1) >> 1;
            let pos = line_start + col;
            let pixel = i32::from(dst[pos]);
            dst[pos] = iclip(pixel + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..256].fill(0);
}
/// 4-point 1-D ADST-style kernel used by the 4x4 "adst" and "flipadst" kernels.
///
/// Forms four linear combinations of the inputs with the `SINPI_*_9` weights
/// (12-bit fixed point) and rounds each output with `(v + 2048) >> 12`.
#[inline(always)]
fn adst4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
    const SINPI_1_9: i32 = 1321;
    const SINPI_2_9: i32 = 2482;
    const SINPI_3_9: i32 = 3344;
    const SINPI_4_9: i32 = 3803;

    // Weighted combinations of in0 / in2 / in3.
    let acc_a = SINPI_1_9 * in0 + SINPI_4_9 * in2 + SINPI_2_9 * in3;
    let acc_b = SINPI_2_9 * in0 - SINPI_1_9 * in2 - SINPI_4_9 * in3;
    let acc_c = SINPI_3_9 * (in0 - in2 + in3);
    // in1 contributes only through this single term.
    let mid = SINPI_3_9 * in1;

    [acc_a + mid, acc_b + mid, acc_c, acc_a + acc_b - mid].map(|v| (v + 2048) >> 12)
}
/// Two-pass 4x4 transform (`adst4_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_adst_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`adst4_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_adst_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// 8-point 1-D ADST-style kernel used by the 8x8 "adst" kernels.
///
/// The inputs are consumed in the permuted order 7, 0, 5, 2, 3, 4, 1, 6, rotated in
/// pairs by the `COSPI_*` weights, then passed through add/subtract and further
/// rotation stages. Every rotation is rounded with `(v + 2048) >> 12` (12-bit fixed
/// point). Odd-indexed outputs are negated.
#[inline(always)]
fn adst8_1d(input: &[i32; 8]) -> [i32; 8] {
    const COSPI_2: i32 = 4091;
    const COSPI_6: i32 = 3973;
    const COSPI_10: i32 = 3703;
    const COSPI_14: i32 = 3290;
    const COSPI_18: i32 = 2751;
    const COSPI_22: i32 = 2106;
    const COSPI_26: i32 = 1380;
    const COSPI_30: i32 = 601;

    let round12 = |v: i32| (v + 2048) >> 12;
    let [i0, i1, i2, i3, i4, i5, i6, i7] = *input;

    // Stage 1: pairwise rotations of the permuted inputs.
    let a0 = round12(i7 * COSPI_2 + i0 * COSPI_30);
    let a1 = round12(i7 * COSPI_30 - i0 * COSPI_2);
    let a2 = round12(i5 * COSPI_10 + i2 * COSPI_22);
    let a3 = round12(i5 * COSPI_22 - i2 * COSPI_10);
    let a4 = round12(i3 * COSPI_18 + i4 * COSPI_14);
    let a5 = round12(i3 * COSPI_14 - i4 * COSPI_18);
    let a6 = round12(i1 * COSPI_26 + i6 * COSPI_6);
    let a7 = round12(i1 * COSPI_6 - i6 * COSPI_26);

    // Stage 2: sums and differences four positions apart.
    let b0 = a0 + a4;
    let b1 = a1 + a5;
    let b2 = a2 + a6;
    let b3 = a3 + a7;
    let b4 = a0 - a4;
    let b5 = a1 - a5;
    let b6 = a2 - a6;
    let b7 = a3 - a7;

    // Stage 3: rotate the difference half.
    let c4 = round12(b4 * 1567 + b5 * 3784);
    let c5 = round12(b4 * 3784 - b5 * 1567);
    let c6 = round12(-b6 * 3784 + b7 * 1567);
    let c7 = round12(b6 * 1567 + b7 * 3784);

    // Stage 4: sums and differences two positions apart. `d2` / `d3` are derived
    // from the already-summed `d0` / `d1` by subtracting the partner twice.
    let d0 = b0 + b2;
    let d1 = b1 + b3;
    let d2 = d0 - b2 - b2;
    let d3 = d1 - b3 - b3;
    let d4 = c4 + c6;
    let d5 = c5 + c7;
    let d6 = c4 - c6;
    let d7 = c5 - c7;

    // Stage 5: final rotations by 2896 (~= 4096 / sqrt(2)).
    let e2 = round12(d2 * 2896 + d3 * 2896);
    let e3 = round12(d2 * 2896 - d3 * 2896);
    let e6 = round12(d6 * 2896 + d7 * 2896);
    let e7 = round12(d6 * 2896 - d7 * 2896);

    [d0, -d4, e2, -e6, e3, -d5, e7, -d1]
}
/// Two-pass 8x8 transform (`adst8_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one gathers `coeff[r + 8 * k]` (`k` in `0..8`) for each `r` in `0..8` and
/// stores the `adst8_1d` result as intermediate row `r`. Pass two runs `adst8_1d`
/// down each intermediate column, rounds with `(v + 32) >> 6`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_adst_adst_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 64];
    for (row, lane) in stage.chunks_exact_mut(8).enumerate() {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = i32::from(coeff[row + k * 8]);
        }
        lane.copy_from_slice(&adst8_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..8 {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 8 + col];
        }
        let column = adst8_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 32) >> 6;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..64].fill(0);
}
/// Two-pass 8x8 transform (`adst8_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one gathers `coeff[r + 8 * k]` (`k` in `0..8`) for each `r` in `0..8` and
/// stores the `adst8_1d` result as intermediate row `r`. Pass two runs `adst8_1d`
/// down each intermediate column, rounds with `(v + 32) >> 6`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 64 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_adst_adst_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: strided gather from `coeff`, one intermediate row per transform.
    let mut stage = [0i32; 64];
    for (row, lane) in stage.chunks_exact_mut(8).enumerate() {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = coeff[row + k * 8];
        }
        lane.copy_from_slice(&adst8_1d(&gathered));
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..8 {
        let mut gathered = [0i32; 8];
        for (k, slot) in gathered.iter_mut().enumerate() {
            *slot = stage[k * 8 + col];
        }
        let column = adst8_1d(&gathered);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 32) >> 6;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..64].fill(0);
}
/// "Flipped" 4-point kernel: calls `adst4_1d` with the four inputs in reverse order.
///
/// Only the inputs are reversed here; the output array is returned exactly as
/// `adst4_1d` produced it.
#[inline(always)]
fn flipadst4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
adst4_1d(in3, in2, in1, in0)
}
/// Two-pass 4x4 transform (`flipadst4_1d` in both passes) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each
/// column of the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`flipadst4_1d` in both passes) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each
/// column of the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_flipadst_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`dct4_1d` first, `adst4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_dct_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`adst4_1d` first, `dct4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_adst_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`dct4_1d` first, `adst4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`adst4_1d` first, `dct4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of the
/// intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_adst_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`dct4_1d` first, `flipadst4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_dct_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`flipadst4_1d` first, `dct4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`dct4_1d` first, `flipadst4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `dct4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_dct_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = dct4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`flipadst4_1d` first, `dct4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `dct4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_flipadst_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = dct4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`adst4_1d` first, `flipadst4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_adst_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`flipadst4_1d` first, `adst4_1d` second) added onto 8-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride` may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(
            i32::from(coeff[row]),
            i32::from(coeff[row + 4]),
            i32::from(coeff[row + 8]),
            i32::from(coeff[row + 12]),
        );
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`adst4_1d` first, `flipadst4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `adst4_1d` for each `r` in `0..4`. Pass two runs `flipadst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_adst_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = adst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = flipadst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// Two-pass 4x4 transform (`flipadst4_1d` first, `adst4_1d` second) added onto 16-bit pixels.
///
/// Pass one feeds `coeff[r]`, `coeff[r + 4]`, `coeff[r + 8]`, `coeff[r + 12]` to
/// `flipadst4_1d` for each `r` in `0..4`. Pass two runs `adst4_1d` down each column of
/// the intermediate, rounds with `(v + 8) >> 4`, adds the result to
/// `dst[dst_base + row * dst_stride_u16 + col]` and passes it through
/// `iclip(_, 0, bitdepth_max)`. The first 16 coefficients are zeroed before returning.
/// `_eob` is not used. `dst_stride_u16` is an element offset and may be negative.
///
/// Panics if `coeff` or `dst` is too short for the indices touched.
#[autoversion]
fn inv_txfm_add_flipadst_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Pass one: transform each strided coefficient group into one intermediate row.
    let mut stage = [0i32; 16];
    for (row, lane) in stage.chunks_exact_mut(4).enumerate() {
        let line = flipadst4_1d(coeff[row], coeff[row + 4], coeff[row + 8], coeff[row + 12]);
        lane.copy_from_slice(&line);
    }

    // Pass two: transform each column and accumulate straight into `dst`.
    for col in 0..4 {
        let column = adst4_1d(stage[col], stage[4 + col], stage[8 + col], stage[12 + col]);
        for (row, &value) in column.iter().enumerate() {
            let pos = dst_base.wrapping_add_signed(row as isize * dst_stride_u16) + col;
            let pixel = i32::from(dst[pos]);
            let delta = (value + 8) >> 4;
            dst[pos] = iclip(pixel + delta, 0, bitdepth_max) as u16;
        }
    }

    coeff[..16].fill(0);
}
/// C-ABI wrapper for the 8 bpc `wht_wht` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_wht_wht_4x4_8bpc_inner`. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_wht_wht_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `wht_wht` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls `inv_txfm_add_wht_wht_4x4_16bpc_inner`.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_wht_wht_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `dct_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_dct_dct_4x4_8bpc_inner`. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_dct_dct_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `dct_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls `inv_txfm_add_dct_dct_4x4_16bpc_inner`.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_dct_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `dct_dct` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 8 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_dct_dct_8x8_8bpc_inner`. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i16`-aligned values, and
/// the `7 * |dst_stride| + 8` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_8x8::inv_txfm_add_dct_dct_8x8_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_dct_dct_8x8_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `dct_dct` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 8 rows of the block (for a negative stride the slice
/// starts at the last row), and calls `inv_txfm_add_dct_dct_8x8_16bpc_inner`.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `7 * |dst_stride / 2| + 8` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_dct_8x8_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `dct_dct` 16x16 inverse transform add.
///
/// Reinterprets `coeff` as 256 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 16 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls
/// `inv_txfm_add_dct_dct_16x16_8bpc_inner`; no separate NEON kernel is
/// dispatched here. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 256 readable and writable, `i16`-aligned values, and
/// the `15 * |dst_stride| + 16` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 16;
    const LAST_ROW: usize = 15;

    // SAFETY: the caller guarantees `coeff` addresses 256 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_dct_16x16_8bpc_inner(
        block,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 16 bpc `dct_dct` 16x16 inverse transform add.
///
/// Reinterprets `coeff` as 256 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 16 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_dct_dct_16x16_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 256 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `15 * |dst_stride / 2| + 16` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 16;
    const LAST_ROW: usize = 15;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 256 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_dct_16x16_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `identity_identity` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_identity_identity_4x4_8bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_identity_identity_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_identity_identity_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `identity_identity` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_identity_identity_4x4_16bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_identity_identity_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `identity_identity` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 8 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_identity_identity_8x8_8bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i16`-aligned values, and
/// the `7 * |dst_stride| + 8` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_8x8::inv_txfm_add_identity_identity_8x8_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_identity_identity_8x8_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `identity_identity` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 8 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_identity_identity_8x8_16bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `7 * |dst_stride / 2| + 8` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_identity_identity_8x8_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `identity_identity` 16x16 inverse transform add.
///
/// Reinterprets `coeff` as 256 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 16 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls
/// `inv_txfm_add_identity_identity_16x16_8bpc_inner`; no separate NEON kernel
/// is dispatched here. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 256 readable and writable, `i16`-aligned values, and
/// the `15 * |dst_stride| + 16` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 16;
    const LAST_ROW: usize = 15;

    // SAFETY: the caller guarantees `coeff` addresses 256 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_identity_identity_16x16_8bpc_inner(
        block,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 16 bpc `identity_identity` 16x16 inverse transform add.
///
/// Reinterprets `coeff` as 256 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 16 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_identity_identity_16x16_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 256 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `15 * |dst_stride / 2| + 16` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 16;
    const LAST_ROW: usize = 15;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 256 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_identity_identity_16x16_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `adst_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_adst_adst_4x4_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_adst_adst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_adst_adst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `adst_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_adst_adst_4x4_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_adst_adst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `adst_adst` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 8 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_adst_adst_8x8_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i16`-aligned values, and
/// the `7 * |dst_stride| + 8` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_8x8::inv_txfm_add_adst_adst_8x8_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_adst_adst_8x8_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `adst_adst` 8x8 inverse transform add.
///
/// Reinterprets `coeff` as 64 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 8 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_adst_adst_8x8_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 64 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `7 * |dst_stride / 2| + 8` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 8;
    const LAST_ROW: usize = 7;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 64 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_adst_adst_8x8_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `flipadst_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `flipadst_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_flipadst_flipadst_4x4_16bpc_inner`. `_coeff_len` and `_dst`
/// are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_flipadst_flipadst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `dct_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_dct_adst_4x4_8bpc_inner`. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_dct_adst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_dct_adst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `dct_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls `inv_txfm_add_dct_adst_4x4_16bpc_inner`.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_adst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `adst_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_adst_dct_4x4_8bpc_inner`. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_adst_dct_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_adst_dct_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `adst_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls `inv_txfm_add_adst_dct_4x4_16bpc_inner`.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_adst_dct_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `dct_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_dct_flipadst_4x4_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_dct_flipadst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `dct_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_dct_flipadst_4x4_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_dct_flipadst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `flipadst_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_flipadst_dct_4x4_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_flipadst_dct_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `flipadst_dct` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_flipadst_dct_4x4_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_flipadst_dct_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `adst_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_adst_flipadst_4x4_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_adst_flipadst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `adst_flipadst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_adst_flipadst_4x4_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_adst_flipadst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI wrapper for the 8 bpc `flipadst_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i16` values and `dst_ptr` as `u8` pixels,
/// builds a single slice covering all 4 rows of the block (for a negative
/// `dst_stride` the slice starts at the last row), and calls the NEON kernel
/// if `archmage::Arm64::summon()` returns a token, else
/// `inv_txfm_add_flipadst_adst_4x4_8bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i16`-aligned values, and
/// the `3 * |dst_stride| + 4` pixels spanning the block's rows, located via
/// `dst_ptr` (row 0), must be readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };

    let pixels = dst_ptr as *mut u8;
    let span = LAST_ROW * dst_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if dst_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner(
                token,
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_flipadst_adst_4x4_8bpc_inner(
                block,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI wrapper for the 16 bpc `flipadst_adst` 4x4 inverse transform add.
///
/// Reinterprets `coeff` as 16 `i32` values and `dst_ptr` as `u16` pixels,
/// halves `dst_stride` to get a stride in `u16` elements, builds a single
/// slice covering all 4 rows of the block (for a negative stride the slice
/// starts at the last row), and calls
/// `inv_txfm_add_flipadst_adst_4x4_16bpc_inner`. `_coeff_len` and `_dst` are
/// unused.
///
/// # Safety
///
/// `coeff` must point to 16 readable and writable, `i32`-aligned values.
/// `dst_ptr` (row 0) must be `u16`-aligned, and the
/// `3 * |dst_stride / 2| + 4` pixels spanning the block's rows must be
/// readable and writable for the whole call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const BLOCK_W: usize = 4;
    const LAST_ROW: usize = 3;

    // The inner routine indexes `u16` pixels, so halve the incoming stride.
    let px_stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses 16 writable `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };

    let pixels = dst_ptr as *mut u16;
    let span = LAST_ROW * px_stride.unsigned_abs();
    // With a negative stride row 0 is the highest-addressed row, so the slice
    // begins at the last row and row 0 sits `span` elements into it.
    let (first, origin) = if px_stride < 0 {
        // SAFETY: the caller guarantees the last row is part of the same buffer.
        (unsafe { pixels.offset(LAST_ROW as isize * px_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `span + BLOCK_W` pixels from `first` are valid.
    let block = unsafe { std::slice::from_raw_parts_mut(first, span + BLOCK_W) };

    inv_txfm_add_flipadst_adst_4x4_16bpc_inner(
        block,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Scalar 4x8 inverse transform (`dct4_1d` then `dct8_1d`) added onto 8-bit pixels.
///
/// The first pass feeds `dct4_1d` with `coeff[y + 8 * k]` (`k` in `0..4`) for
/// every `y` in `0..8` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the four columns, rounds
/// with `(v + 16) >> 5`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 32 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_4x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 4;
    const H: usize = 8;
    const SHIFT: u32 = 5;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 4-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let out = dct4_1d(
            coeff[y] as i32,
            coeff[y + H] as i32,
            coeff[y + 2 * H] as i32,
            coeff[y + 3 * H] as i32,
        );
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 8-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 4x8 inverse transform (`dct4_1d` then `dct8_1d`) added onto 16-bit pixels.
///
/// The first pass feeds `dct4_1d` with `coeff[y + 8 * k]` (`k` in `0..4`) for
/// every `y` in `0..8` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the four columns, rounds
/// with `(v + 16) >> 5`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 32 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_4x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 4;
    const H: usize = 8;
    const SHIFT: u32 = 5;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 4-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let out = dct4_1d(coeff[y], coeff[y + H], coeff[y + 2 * H], coeff[y + 3 * H]);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 8-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x4 inverse transform (`dct8_1d` then `dct4_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 4 * x]` (`x` in `0..8`) for every `y` in
/// `0..4`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct4_1d` down each of the eight columns,
/// rounds with `(v + 16) >> 5`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 32 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_8x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 4;
    const SHIFT: u32 = 5;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 8-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 4-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let out = dct4_1d(stage[x], stage[W + x], stage[2 * W + x], stage[3 * W + x]);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x4 inverse transform (`dct8_1d` then `dct4_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 4 * x]` (`x` in `0..8`) for every `y` in
/// `0..4`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct4_1d` down each of the eight columns,
/// rounds with `(v + 16) >> 5`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 32 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_8x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 4;
    const SHIFT: u32 = 5;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 8-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 4-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let out = dct4_1d(stage[x], stage[W + x], stage[2 * W + x], stage[3 * W + x]);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x16 inverse transform (`dct8_1d` then `dct16_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 16 * x]` (`x` in `0..8`) for every `y` in
/// `0..16`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct16_1d` down each of the eight columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 128 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_8x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 16;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 8-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x16 inverse transform (`dct8_1d` then `dct16_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 16 * x]` (`x` in `0..8`) for every `y` in
/// `0..16`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct16_1d` down each of the eight columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 128 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_8x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 16;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 8-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x8 inverse transform (`dct16_1d` then `dct8_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..16`) for every `y` in
/// `0..8`, runs `dct16_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the sixteen columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 128 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 8;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 8-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x8 inverse transform (`dct16_1d` then `dct8_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..16`) for every `y` in
/// `0..8`, runs `dct16_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the sixteen columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 128 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 8;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 8-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x8 inverse transform (`dct8_1d` then `adst8_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..8`) for every `y` in
/// `0..8`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `adst8_1d` down each of the eight columns,
/// rounds with `(v + 32) >> 6`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_adst_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 8;
    const SHIFT: u32 = 6;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: DCT over each gathered line of coefficients.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N] as i32;
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: ADST over each column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = adst8_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..N * N].fill(0);
}
/// Scalar 8x8 inverse transform (`adst8_1d` then `dct8_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..8`) for every `y` in
/// `0..8`, runs `adst8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the eight columns,
/// rounds with `(v + 32) >> 6`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_adst_dct_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 8;
    const SHIFT: u32 = 6;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: ADST over each gathered line of coefficients.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N] as i32;
        }
        let out = adst8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: DCT over each column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = dct8_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..N * N].fill(0);
}
/// Scalar 8x8 inverse transform (`dct8_1d` then `adst8_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..8`) for every `y` in
/// `0..8`, runs `dct8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `adst8_1d` down each of the eight columns,
/// rounds with `(v + 32) >> 6`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_adst_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 8;
    const SHIFT: u32 = 6;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: DCT over each gathered line of coefficients.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N];
        }
        let out = dct8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: ADST over each column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = adst8_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..N * N].fill(0);
}
/// Scalar 8x8 inverse transform (`adst8_1d` then `dct8_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 8 * x]` (`x` in `0..8`) for every `y` in
/// `0..8`, runs `adst8_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct8_1d` down each of the eight columns,
/// rounds with `(v + 32) >> 6`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_adst_dct_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 8;
    const SHIFT: u32 = 6;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: ADST over each gathered line of coefficients.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N];
        }
        let out = adst8_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: DCT over each column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = dct8_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..N * N].fill(0);
}
/// C-ABI entry point for the 4x8 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_4x8_8bpc_inner`. With a negative
/// `dst_stride` the slice starts at the address of the last row, and the index
/// of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i16` values. The
/// `7 * |dst_stride| + 4` `u8` elements spanning the eight rows reachable from
/// `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 32;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_4x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 4x8 `dct_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_4x8_16bpc_inner`. `dst_stride` is halved
/// before use (presumably a byte stride converted to `u16` elements — TODO
/// confirm against the caller). With a negative stride the slice starts at the
/// address of the last row, and the index of row 0 inside that slice is passed
/// as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i32` values. The
/// `7 * |dst_stride / 2| + 4` `u16` elements spanning the eight rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 32;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_4x8_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x4 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_8x4_8bpc_inner`. With a negative
/// `dst_stride` the slice starts at the address of the last row, and the index
/// of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i16` values. The
/// `3 * |dst_stride| + 8` `u8` elements spanning the four rows reachable from
/// `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 3;
    const COEFFS: usize = 32;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_8x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x4 `dct_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_8x4_16bpc_inner`. `dst_stride` is halved
/// before use (presumably a byte stride converted to `u16` elements — TODO
/// confirm against the caller). With a negative stride the slice starts at the
/// address of the last row, and the index of row 0 inside that slice is passed
/// as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i32` values. The
/// `3 * |dst_stride / 2| + 8` `u16` elements spanning the four rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 3;
    const COEFFS: usize = 32;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_8x4_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x16 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_8x16_8bpc_inner`. With a negative
/// `dst_stride` the slice starts at the address of the last row, and the index
/// of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i16` values. The
/// `15 * |dst_stride| + 8` `u8` elements spanning the sixteen rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 15;
    const COEFFS: usize = 128;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_8x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x16 `dct_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_8x16_16bpc_inner`. `dst_stride` is
/// halved before use (presumably a byte stride converted to `u16` elements —
/// TODO confirm against the caller). With a negative stride the slice starts
/// at the address of the last row, and the index of row 0 inside that slice is
/// passed as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i32` values. The
/// `15 * |dst_stride / 2| + 8` `u16` elements spanning the sixteen rows
/// reachable from `dst_ptr` must be valid for reads and writes and not
/// otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 15;
    const COEFFS: usize = 128;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_8x16_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 16x8 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_16x8_8bpc_inner`. With a negative
/// `dst_stride` the slice starts at the address of the last row, and the index
/// of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i16` values. The
/// `7 * |dst_stride| + 16` `u8` elements spanning the eight rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 128;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_16x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 16x8 `dct_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_16x8_16bpc_inner`. `dst_stride` is
/// halved before use (presumably a byte stride converted to `u16` elements —
/// TODO confirm against the caller). With a negative stride the slice starts
/// at the address of the last row, and the index of row 0 inside that slice is
/// passed as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i32` values. The
/// `7 * |dst_stride / 2| + 16` `u16` elements spanning the eight rows
/// reachable from `dst_ptr` must be valid for reads and writes and not
/// otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 128;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_16x8_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x8 `dct_adst` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices, then runs the
/// NEON kernel from `itx_arm_neon_8x8` when `archmage::Arm64::summon()` yields
/// a token and the scalar `inv_txfm_add_dct_adst_8x8_8bpc_inner` otherwise.
/// With a negative `dst_stride` the slice starts at the address of the last
/// row, and the index of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i16` values. The
/// `7 * |dst_stride| + 8` `u8` elements spanning the eight rows reachable from
/// `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 64;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_dct_adst_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    } else {
        inv_txfm_add_dct_adst_8x8_8bpc_inner(
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}
/// C-ABI entry point for the 8x8 `dct_adst` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_adst_8x8_16bpc_inner`. `dst_stride` is
/// halved before use (presumably a byte stride converted to `u16` elements —
/// TODO confirm against the caller). With a negative stride the slice starts
/// at the address of the last row, and the index of row 0 inside that slice is
/// passed as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. The
/// `7 * |dst_stride / 2| + 8` `u16` elements spanning the eight rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 64;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_adst_8x8_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 8x8 `adst_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices, then runs the
/// NEON kernel from `itx_arm_neon_8x8` when `archmage::Arm64::summon()` yields
/// a token and the scalar `inv_txfm_add_adst_dct_8x8_8bpc_inner` otherwise.
/// With a negative `dst_stride` the slice starts at the address of the last
/// row, and the index of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i16` values. The
/// `7 * |dst_stride| + 8` `u8` elements spanning the eight rows reachable from
/// `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 64;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_adst_dct_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    } else {
        inv_txfm_add_adst_dct_8x8_8bpc_inner(
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}
/// C-ABI entry point for the 8x8 `adst_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_adst_dct_8x8_16bpc_inner`. `dst_stride` is
/// halved before use (presumably a byte stride converted to `u16` elements —
/// TODO confirm against the caller). With a negative stride the slice starts
/// at the address of the last row, and the index of row 0 inside that slice is
/// passed as the base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. The
/// `7 * |dst_stride / 2| + 8` `u16` elements spanning the eight rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 7;
    const COEFFS: usize = 64;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_adst_dct_8x8_16bpc_inner(
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// 32-point 1-D transform stage layered on top of `dct16_1d`.
///
/// The input is folded into sums (`t[0..16]`) and differences (`t[16..32]`) of
/// mirrored element pairs. The sums are passed through `dct16_1d` and land in
/// the even output slots. The differences are combined pairwise with fixed
/// integer multipliers, rounded with `(v + 2048) >> 12`, and land in the odd
/// output slots.
///
/// NOTE(review): the second multiplier of each pair is derived as
/// `4096 - cos_val / 16` rather than read from its own table — confirm this is
/// the intended constant for the rotation.
#[inline(always)]
fn dct32_1d(input: &[i32; 32]) -> [i32; 32] {
    // Multipliers for the eight difference pairs, in pair order.
    const COS: [i32; 8] = [4091, 4076, 4017, 3920, 3784, 3612, 3406, 3166];

    // Butterfly: sums in the lower half, differences mirrored in the upper half.
    let mut t = [0i32; 32];
    for i in 0..16 {
        let (lo, hi) = (input[i], input[31 - i]);
        t[i] = lo + hi;
        t[31 - i] = lo - hi;
    }

    let mut even_input = [0i32; 16];
    even_input.copy_from_slice(&t[..16]);
    let even_out = dct16_1d(&even_input);

    let mut odd = [0i32; 16];
    for (i, &cos_val) in COS.iter().enumerate() {
        let sin_val = 4096 - cos_val / 16;
        let o0 = t[16 + i];
        let o1 = t[31 - i];
        odd[i] = ((o0 * cos_val + o1 * sin_val) + 2048) >> 12;
        odd[15 - i] = ((o1 * cos_val - o0 * sin_val) + 2048) >> 12;
    }

    // Interleave: even slots from the 16-point transform, odd slots from `odd`.
    let mut out = [0i32; 32];
    for (i, pair) in out.chunks_exact_mut(2).enumerate() {
        pair[0] = even_out[i];
        pair[1] = odd[i];
    }
    out
}
/// Scalar 32x32 inverse transform (`dct32_1d` in both passes) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 32 * x]` (`x` in `0..32`) for every `y`
/// in `0..32`, runs `dct32_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct32_1d` down each column,
/// rounds with `(v + 512) >> 10`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 1024 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_32x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 32;
    const SHIFT: u32 = 10;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 32-point transform per intermediate row.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N] as i32;
        }
        let out = dct32_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 32-point transform per column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = dct32_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..N * N].fill(0);
}
/// Scalar 32x32 inverse transform (`dct32_1d` in both passes) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 32 * x]` (`x` in `0..32`) for every `y`
/// in `0..32`, runs `dct32_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct32_1d` down each column,
/// rounds with `(v + 512) >> 10`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 1024 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_32x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const N: usize = 32;
    const SHIFT: u32 = 10;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 32-point transform per intermediate row.
    let mut stage = [0i32; N * N];
    for (y, row) in stage.chunks_exact_mut(N).enumerate() {
        let mut line = [0i32; N];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * N];
        }
        let out = dct32_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 32-point transform per column, accumulated into `dst`.
    for x in 0..N {
        let mut column = [0i32; N];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * N + x];
        }
        let out = dct32_1d(&column);
        for y in 0..N {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..N * N].fill(0);
}
/// Scalar 4x16 inverse transform (`dct4_1d` then `dct16_1d`) added onto 8-bit pixels.
///
/// The first pass feeds `dct4_1d` with `coeff[y + 16 * k]` (`k` in `0..4`) for
/// every `y` in `0..16` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct16_1d` down each of the four columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_4x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 4;
    const H: usize = 16;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 4-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let out = dct4_1d(
            coeff[y] as i32,
            coeff[y + H] as i32,
            coeff[y + 2 * H] as i32,
            coeff[y + 3 * H] as i32,
        );
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 4x16 inverse transform (`dct4_1d` then `dct16_1d`) added onto 16-bit pixels.
///
/// The first pass feeds `dct4_1d` with `coeff[y + 16 * k]` (`k` in `0..4`) for
/// every `y` in `0..16` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct16_1d` down each of the four columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_4x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 4;
    const H: usize = 16;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 4-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let out = dct4_1d(coeff[y], coeff[y + H], coeff[y + 2 * H], coeff[y + 3 * H]);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x4 inverse transform (`dct16_1d` then `dct4_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 4 * x]` (`x` in `0..16`) for every `y` in
/// `0..4`, runs `dct16_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct4_1d` down each of the sixteen columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 4;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 4-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let out = dct4_1d(stage[x], stage[W + x], stage[2 * W + x], stage[3 * W + x]);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x4 inverse transform (`dct16_1d` then `dct4_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 4 * x]` (`x` in `0..16`) for every `y` in
/// `0..4`, runs `dct16_1d` and stores the result as row `y` of an intermediate
/// buffer. The second pass runs `dct4_1d` down each of the sixteen columns,
/// rounds with `(v + 64) >> 7`, adds the value to the pixel at
/// `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 64 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 4;
    const SHIFT: u32 = 7;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 4-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let out = dct4_1d(stage[x], stage[W + x], stage[2 * W + x], stage[3 * W + x]);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x32 inverse transform (`dct16_1d` then `dct32_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 32 * x]` (`x` in `0..16`) for every `y`
/// in `0..32`, runs `dct16_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct32_1d` down each of the
/// sixteen columns, rounds with `(v + 256) >> 9`, adds the value to the pixel
/// at `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 512 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 32;
    const SHIFT: u32 = 9;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 32-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 16x32 inverse transform (`dct16_1d` then `dct32_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 32 * x]` (`x` in `0..16`) for every `y`
/// in `0..32`, runs `dct16_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct32_1d` down each of the
/// sixteen columns, rounds with `(v + 256) >> 9`, adds the value to the pixel
/// at `dst_base + y * dst_stride_u16 + x` and clamps to `0..=bitdepth_max`.
/// The first 512 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_16x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 16;
    const H: usize = 32;
    const SHIFT: u32 = 9;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 16-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct16_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 32-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x16 inverse transform (`dct32_1d` then `dct16_1d`) added onto 8-bit pixels.
///
/// The first pass gathers `coeff[y + 16 * x]` (`x` in `0..32`) for every `y`
/// in `0..16`, runs `dct32_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct16_1d` down each of the
/// thirty-two columns, rounds with `(v + 256) >> 9`, adds the value to the
/// pixel at `dst_base + y * dst_stride + x` and clamps to `0..=bitdepth_max`.
/// The first 512 coefficients are zeroed before returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_32x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 16;
    const SHIFT: u32 = 9;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 32-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H] as i32;
        }
        let out = dct32_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x16 inverse transform (`dct32_1d` then `dct16_1d`) added onto 16-bit pixels.
///
/// The first pass gathers `coeff[y + 16 * x]` (`x` in `0..32`) for every `y`
/// in `0..16`, runs `dct32_1d` and stores the result as row `y` of an
/// intermediate buffer. The second pass runs `dct16_1d` down each of the
/// thirty-two columns, rounds with `(v + 256) >> 9`, adds the value to the
/// pixel at `dst_base + y * dst_stride_u16 + x` and clamps to
/// `0..=bitdepth_max`. The first 512 coefficients are zeroed before
/// returning; `_eob` is unused.
#[autoversion]
fn inv_txfm_add_dct_dct_32x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 16;
    const SHIFT: u32 = 9;
    const ROUND: i32 = 1 << (SHIFT - 1);

    // First pass: one 32-point transform per intermediate row.
    let mut stage = [0i32; W * H];
    for (y, row) in stage.chunks_exact_mut(W).enumerate() {
        let mut line = [0i32; W];
        for (x, v) in line.iter_mut().enumerate() {
            *v = coeff[y + x * H];
        }
        let out = dct32_1d(&line);
        for (x, slot) in row.iter_mut().enumerate() {
            *slot = out[x];
        }
    }

    // Second pass: one 16-point transform per column, accumulated into `dst`.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, v) in column.iter_mut().enumerate() {
            *v = stage[y * W + x];
        }
        let out = dct16_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let sum = dst[idx] as i32 + ((out[y] + ROUND) >> SHIFT);
            dst[idx] = iclip(sum, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// C-ABI entry point for the 32x32 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and always
/// calls the NEON kernel from `itx_arm_neon_32`, using a token obtained from
/// `archmage::Arm64::forge_token_dangerously()`. With a negative `dst_stride`
/// the slice starts at the address of the last row, and the index of row 0
/// inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 1024 `i16` values. The
/// `31 * |dst_stride| + 32` `u8` elements spanning the thirty-two rows
/// reachable from `dst_ptr` must be valid for reads and writes and not
/// otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 31;
    const COEFFS: usize = 1024;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    // SAFETY: NOTE(review): the token is forged without a runtime feature
    // check, presumably because this wrapper is only built for aarch64 —
    // confirm NEON availability is guaranteed for every supported target.
    let token = unsafe { archmage::Arm64::forge_token_dangerously() };
    super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_8bpc_neon_inner(
        token,
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 32x32 `dct_dct` inverse transform on 16-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and always
/// calls the NEON kernel from `itx_arm_neon_32`, using a token obtained from
/// `archmage::Arm64::forge_token_dangerously()`. `dst_stride` is halved before
/// use (presumably a byte stride converted to `u16` elements — TODO confirm
/// against the caller). With a negative stride the slice starts at the address
/// of the last row, and the index of row 0 inside that slice is passed as the
/// base offset. `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 1024 `i32` values. The
/// `31 * |dst_stride / 2| + 32` `u16` elements spanning the thirty-two rows
/// reachable from `dst_ptr` must be valid for reads and writes and not
/// otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 31;
    const COEFFS: usize = 1024;

    let stride = dst_stride / 2;
    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u16;
    let (origin, base) = if stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    // SAFETY: NOTE(review): the token is forged without a runtime feature
    // check, presumably because this wrapper is only built for aarch64 —
    // confirm NEON availability is guaranteed for every supported target.
    let token = unsafe { archmage::Arm64::forge_token_dangerously() };
    super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_16bpc_neon_inner(
        token,
        dst_slice,
        base,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point for the 4x16 `dct_dct` inverse transform on 8-bit pixels.
///
/// Wraps the raw destination and coefficient pointers in slices and forwards
/// to the scalar `inv_txfm_add_dct_dct_4x16_8bpc_inner`. With a negative
/// `dst_stride` the slice starts at the address of the last row, and the index
/// of row 0 inside that slice is passed as the base offset.
/// `_coeff_len` and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i16` values. The
/// `15 * |dst_stride| + 4` `u8` elements spanning the sixteen rows reachable
/// from `dst_ptr` must be valid for reads and writes and not otherwise aliased.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 15;
    const COEFFS: usize = 64;

    // SAFETY: the caller guarantees `coeff` addresses `COEFFS` writable values.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, COEFFS) };

    // Distance (in elements) between the first and the last row.
    let span = LAST_ROW * dst_stride.unsigned_abs();
    let len = span + WIDTH;
    let pixels = dst_ptr as *mut u8;
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: per the caller contract the last row lives
        // `LAST_ROW * dst_stride` elements away from `dst_ptr`.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, span)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: the caller guarantees `len` elements starting at `origin` are
    // valid for reads and writes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };

    inv_txfm_add_dct_dct_4x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 4x16 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_4x16_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 16 rows of
/// 4 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 15;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 64 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_4x16_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 16x4 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_16x4_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 4 rows of
/// 16 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 3;
    // SAFETY: relies on the caller providing 64 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_16x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 16x4 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_16x4_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 4 rows of
/// 16 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 3;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 64 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_16x4_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 16x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_16x32_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 512 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 16 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 31;
    // SAFETY: relies on the caller providing 512 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_16x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 16x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_16x32_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 512 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 16 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 31;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 512 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_16x32_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 32x16 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x16_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 512 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 16 rows of
/// 32 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 15;
    // SAFETY: relies on the caller providing 512 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 32x16 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x16_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 512 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 16 rows of
/// 32 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 15;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 512 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x16_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// One-dimensional 64-point transform built on top of `dct32_1d`.
///
/// The input is folded into 32 sums and 32 differences. The sums go through
/// `dct32_1d` and land on the even output indices; the differences are
/// rotated pairwise and land on the odd output indices.
///
/// NOTE(review): the rotation weights below are linear ramps in the index
/// (`4096 - 64 * k` and `256 * k`) rather than values from a cosine/sine
/// table, so this looks like an approximation. Confirm against the reference
/// transform before relying on bit-exact output.
#[inline(always)]
fn dct64_1d(input: &[i32; 64]) -> [i32; 64] {
    // Butterfly stage: sums in the lower half, differences mirrored in the upper half.
    let mut folded = [0i32; 64];
    for k in 0..32 {
        let head = input[k];
        let tail = input[63 - k];
        folded[k] = head + tail;
        folded[63 - k] = head - tail;
    }

    let mut sums = [0i32; 32];
    sums.copy_from_slice(&folded[..32]);
    let even_half = dct32_1d(&sums);

    // Rotate each (32 + k, 63 - k) pair in Q12 fixed point with rounding.
    let mut odd_half = [0i32; 32];
    for k in 0..16 {
        let a = folded[32 + k];
        let b = folded[63 - k];
        let ramp = k as i32;
        let weight_c = 4096 - ramp * 64;
        let weight_s = ramp * 256;
        odd_half[k] = (a * weight_c + b * weight_s + 2048) >> 12;
        odd_half[31 - k] = (b * weight_c - a * weight_s + 2048) >> 12;
    }

    // Interleave: even indices from the 32-point half, odd indices from the rotations.
    let mut result = [0i32; 64];
    for k in 0..32 {
        result[2 * k] = even_half[k];
        result[2 * k + 1] = odd_half[k];
    }
    result
}
/// Scalar 64x64 DCT/DCT inverse transform added onto an 8 bpc destination.
///
/// Runs `dct64_1d` over every row, then over every column, rounds the result
/// with `(v + 2048) >> 12`, adds it to the destination pixel and clips to
/// `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride + x]`. The first 4096 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_64x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 64;
    const H: usize = 64;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H] as i32;
        }
        let transformed = dct64_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct64_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 2048) >> 12;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 64x64 DCT/DCT inverse transform added onto a 16 bpc destination.
///
/// Runs `dct64_1d` over every row, then over every column, rounds the result
/// with `(v + 2048) >> 12`, adds it to the destination pixel and clips to
/// `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride_u16 + x]`. The first 4096 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_64x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 64;
    const H: usize = 64;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H];
        }
        let transformed = dct64_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct64_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 2048) >> 12;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x32 DCT/DCT inverse transform added onto an 8 bpc destination.
///
/// Runs `dct8_1d` over each of the 32 rows, then `dct32_1d` over each of the
/// 8 columns, rounds the result with `(v + 256) >> 9`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride + x]`. The first 256 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_8x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 32;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H] as i32;
        }
        let transformed = dct8_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 256) >> 9;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 8x32 DCT/DCT inverse transform added onto a 16 bpc destination.
///
/// Runs `dct8_1d` over each of the 32 rows, then `dct32_1d` over each of the
/// 8 columns, rounds the result with `(v + 256) >> 9`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride_u16 + x]`. The first 256 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_8x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 8;
    const H: usize = 32;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H];
        }
        let transformed = dct8_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 256) >> 9;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x8 DCT/DCT inverse transform added onto an 8 bpc destination.
///
/// Runs `dct32_1d` over each of the 8 rows, then `dct8_1d` over each of the
/// 32 columns, rounds the result with `(v + 256) >> 9`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride + x]`. The first 256 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_32x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 8;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H] as i32;
        }
        let transformed = dct32_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 256) >> 9;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x8 DCT/DCT inverse transform added onto a 16 bpc destination.
///
/// Runs `dct32_1d` over each of the 8 rows, then `dct8_1d` over each of the
/// 32 columns, rounds the result with `(v + 256) >> 9`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride_u16 + x]`. The first 256 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_32x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 8;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H];
        }
        let transformed = dct32_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct8_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 256) >> 9;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x64 DCT/DCT inverse transform added onto an 8 bpc destination.
///
/// Runs `dct32_1d` over each of the 64 rows, then `dct64_1d` over each of the
/// 32 columns, rounds the result with `(v + 1024) >> 11`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride + x]`. The first 2048 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_32x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 64;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H] as i32;
        }
        let transformed = dct32_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct64_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 1024) >> 11;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 32x64 DCT/DCT inverse transform added onto a 16 bpc destination.
///
/// Runs `dct32_1d` over each of the 64 rows, then `dct64_1d` over each of the
/// 32 columns, rounds the result with `(v + 1024) >> 11`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride_u16 + x]`. The first 2048 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_32x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 64;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H];
        }
        let transformed = dct32_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct64_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 1024) >> 11;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 64x32 DCT/DCT inverse transform added onto an 8 bpc destination.
///
/// Runs `dct64_1d` over each of the 32 rows, then `dct32_1d` over each of the
/// 64 columns, rounds the result with `(v + 1024) >> 11`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride + x]`. The first 2048 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_64x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 64;
    const H: usize = 32;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H] as i32;
        }
        let transformed = dct64_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 1024) >> 11;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Scalar 64x32 DCT/DCT inverse transform added onto a 16 bpc destination.
///
/// Runs `dct64_1d` over each of the 32 rows, then `dct32_1d` over each of the
/// 64 columns, rounds the result with `(v + 1024) >> 11`, adds it to the
/// destination pixel and clips to `0..=bitdepth_max`. Pixel `(x, y)` lives at
/// `dst[dst_base + y * dst_stride_u16 + x]`. The first 2048 coefficients are
/// zeroed before returning. `_eob` is ignored.
#[autoversion]
fn inv_txfm_add_dct_dct_64x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 64;
    const H: usize = 32;
    let mut scratch = [0i32; W * H];

    // Row pass: row `y` is gathered from `coeff[y + x * H]`.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * H];
        }
        let transformed = dct64_1d(&line);
        scratch[y * W..(y + 1) * W].copy_from_slice(&transformed[..W]);
    }

    // Column pass followed by round, add and clip.
    for x in 0..W {
        let mut column = [0i32; H];
        for (y, slot) in column.iter_mut().enumerate() {
            *slot = scratch[y * W + x];
        }
        let transformed = dct32_1d(&column);
        for y in 0..H {
            let idx = dst_base.wrapping_add_signed(y as isize * dst_stride_u16) + x;
            let current = dst[idx] as i32;
            let residual = (transformed[y] + 1024) >> 11;
            dst[idx] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// `extern "C"` entry point for the 8 bpc 64x64 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_64x64_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 4096 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 64 rows of
/// 64 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 64;
    const LAST_ROW: usize = 63;
    // SAFETY: relies on the caller providing 4096 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 4096) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_64x64_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 64x64 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_64x64_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 4096 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 64 rows of
/// 64 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 64;
    const LAST_ROW: usize = 63;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 4096 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 4096) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_64x64_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 8x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_8x32_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 256 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 8 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 31;
    // SAFETY: relies on the caller providing 256 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_8x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 8x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_8x32_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 256 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 8 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 31;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 256 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_8x32_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 32x8 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x8_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 256 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 8 rows of
/// 32 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 7;
    // SAFETY: relies on the caller providing 256 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 32x8 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x8_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 256 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 8 rows of
/// 32 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 7;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 256 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x8_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 32x64 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x64_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 2048 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 64 rows of
/// 32 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 63;
    // SAFETY: relies on the caller providing 2048 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 2048) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x64_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 32x64 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_32x64_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 2048 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 64 rows of
/// 32 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 32;
    const LAST_ROW: usize = 63;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 2048 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 2048) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_32x64_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 8 bpc 64x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_64x32_8bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 2048 `i16` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 64 pixels placed `dst_stride` elements apart (negative strides are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 64;
    const LAST_ROW: usize = 31;
    // SAFETY: relies on the caller providing 2048 valid `i16` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 2048) };
    let pixels = dst_ptr as *mut u8;
    let reach = LAST_ROW * dst_stride.unsigned_abs();
    let (origin, base) = if dst_stride < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * dst_stride) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_64x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// `extern "C"` entry point for the 16 bpc 64x32 DCT/DCT inverse transform.
///
/// Builds slices over the raw coefficient and destination pointers and
/// forwards to `inv_txfm_add_dct_dct_64x32_16bpc_inner`. `_coeff_len` and
/// `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 2048 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 32 rows of
/// 64 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 64;
    const LAST_ROW: usize = 31;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 2048 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 2048) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    inv_txfm_add_dct_dct_64x32_16bpc_inner(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// In-place 4-point identity transform over `c[0]`, `c[stride]`,
/// `c[2 * stride]`, `c[3 * stride]`.
///
/// Each value is scaled by sqrt(2) in fixed point:
/// `v + round(v * 1697 / 4096)`, which equals `round(v * 5793 / 4096)`.
/// The previous form added `v * 181 / 256` (about 0.707 * v) to `v`, giving a
/// scale of about 1.707 instead of the 1.414 the AV1 identity4 requires.
///
/// Panics if `c` is shorter than `3 * stride + 1`.
#[inline(always)]
fn identity4_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..4 {
        let v = c[i * stride];
        // 1697 / 4096 is the fractional part of sqrt(2); 2048 rounds to nearest.
        c[i * stride] = v + ((v * 1697 + 2048) >> 12);
    }
}
/// In-place 8-point identity transform: doubles the eight values found at
/// `c[0]`, `c[stride]`, ..., `c[7 * stride]`.
///
/// Panics if `c` is shorter than `7 * stride + 1`.
#[inline(always)]
fn identity8_1d_arm(c: &mut [i32], stride: usize) {
    for k in 0..8 {
        let slot = &mut c[k * stride];
        *slot *= 2;
    }
}
/// In-place 16-point identity transform over `c[0]`, `c[stride]`, ...,
/// `c[15 * stride]`.
///
/// Each value is scaled by 2 * sqrt(2) in fixed point:
/// `2 * v + round(v * 1697 / 2048)`. The previous body was a copy of the
/// 8-point variant and only doubled each value, which is not the scale the
/// AV1 identity16 requires.
///
/// Panics if `c` is shorter than `15 * stride + 1`.
#[inline(always)]
fn identity16_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..16 {
        let v = c[i * stride];
        // 1697 / 2048 is the fractional part of 2 * sqrt(2); 1024 rounds to nearest.
        c[i * stride] = 2 * v + ((v * 1697 + 1024) >> 11);
    }
}
/// In-place 32-point identity transform: quadruples the 32 values found at
/// `c[0]`, `c[stride]`, ..., `c[31 * stride]`.
///
/// Panics if `c` is shorter than `31 * stride + 1`.
#[inline(always)]
fn identity32_1d_arm(c: &mut [i32], stride: usize) {
    for k in 0..32 {
        let slot = &mut c[k * stride];
        *slot *= 4;
    }
}
/// Scales `v` by 181/256 (about 1/sqrt(2)) in fixed point, rounding to
/// nearest via the `+ 128` bias before the arithmetic right shift.
#[inline(always)]
fn rect2_scale(v: i32) -> i32 {
    const NUMERATOR: i32 = 181;
    const HALF: i32 = 128;
    const SHIFT: u32 = 8;
    let scaled = v * NUMERATOR + HALF;
    scaled >> SHIFT
}
/// Generic `W`x`H` identity/identity inverse transform added onto an 8 bpc
/// destination.
///
/// Steps, in order:
/// 1. Each row `y` is gathered from `coeff[y + x * H]`, optionally passed
///    through `rect2_scale` when `is_rect2` is set, then handed to `row_fn`
///    with stride 1.
/// 2. `col_fn` is applied to every column of the intermediate buffer with
///    stride `W`.
/// 3. Each value is rounded with `(v + 8) >> 4`, added to the pixel at
///    `dst[dst_base + y * dst_stride + x]` and clipped to `0..=bitdepth_max`.
/// 4. The first `W * H` coefficients are zeroed.
///
/// `_eob` is ignored.
#[autoversion]
fn identity_rect_8bpc_inner<const W: usize, const H: usize>(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
    row_fn: fn(&mut [i32], usize),
    col_fn: fn(&mut [i32], usize),
    is_rect2: bool,
) {
    let mut scratch = vec![0i32; W * H];

    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            let raw = coeff[y + x * H] as i32;
            *slot = if is_rect2 { rect2_scale(raw) } else { raw };
        }
        row_fn(&mut line, 1);
        scratch[y * W..(y + 1) * W].copy_from_slice(&line);
    }

    for x in 0..W {
        col_fn(&mut scratch[x..], W);
    }

    for y in 0..H {
        let row_start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..W {
            let current = dst[row_start + x] as i32;
            let residual = (scratch[y * W + x] + 8) >> 4;
            dst[row_start + x] = iclip(current + residual, 0, bitdepth_max) as u8;
        }
    }

    coeff[..W * H].fill(0);
}
/// Generic `W`x`H` identity/identity inverse transform added onto a 16 bpc
/// destination.
///
/// Steps, in order:
/// 1. Each row `y` is gathered from `coeff[y + x * H]`, optionally passed
///    through `rect2_scale` when `is_rect2` is set, then handed to `row_fn`
///    with stride 1.
/// 2. `col_fn` is applied to every column of the intermediate buffer with
///    stride `W`.
/// 3. Each value is rounded with `(v + 8) >> 4`, added to the pixel at
///    `dst[dst_base + y * dst_stride_u16 + x]` and clipped to
///    `0..=bitdepth_max`.
/// 4. The first `W * H` coefficients are zeroed.
///
/// `_eob` is ignored.
#[autoversion]
fn identity_rect_16bpc_inner<const W: usize, const H: usize>(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
    row_fn: fn(&mut [i32], usize),
    col_fn: fn(&mut [i32], usize),
    is_rect2: bool,
) {
    let mut scratch = vec![0i32; W * H];

    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            let raw = coeff[y + x * H];
            *slot = if is_rect2 { rect2_scale(raw) } else { raw };
        }
        row_fn(&mut line, 1);
        scratch[y * W..(y + 1) * W].copy_from_slice(&line);
    }

    for x in 0..W {
        col_fn(&mut scratch[x..], W);
    }

    for y in 0..H {
        let row_start = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..W {
            let current = dst[row_start + x] as i32;
            let residual = (scratch[y * W + x] + 8) >> 4;
            dst[row_start + x] = iclip(current + residual, 0, bitdepth_max) as u16;
        }
    }

    coeff[..W * H].fill(0);
}
/// `extern "C"` entry point for the 16 bpc 4x8 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<4, 8>` with `identity4_1d_arm` for rows,
/// `identity8_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 8 rows of
/// 4 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 7;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 32 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<4, 8>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity4_1d_arm,
        identity8_1d_arm,
        true,
    );
}
/// `extern "C"` entry point for the 16 bpc 8x4 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<8, 4>` with `identity8_1d_arm` for rows,
/// `identity4_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 32 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 4 rows of
/// 8 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 3;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 32 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<8, 4>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity4_1d_arm,
        true,
    );
}
/// `extern "C"` entry point for the 16 bpc 4x16 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<4, 16>` with `identity4_1d_arm` for rows,
/// `identity16_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// NOTE(review): rect2 scaling is enabled here although the block is 4:1;
/// the AV1 specification applies that scale only to 2:1 blocks. Confirm that
/// `true` is intended.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 16 rows of
/// 4 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 4;
    const LAST_ROW: usize = 15;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 64 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<4, 16>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity4_1d_arm,
        identity16_1d_arm,
        true,
    );
}
/// `extern "C"` entry point for the 16 bpc 16x4 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<16, 4>` with `identity16_1d_arm` for rows,
/// `identity4_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// NOTE(review): rect2 scaling is enabled here although the block is 4:1;
/// the AV1 specification applies that scale only to 2:1 blocks. Confirm that
/// `true` is intended.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 64 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 4 rows of
/// 16 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 3;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 64 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<16, 4>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity4_1d_arm,
        true,
    );
}
/// `extern "C"` entry point for the 16 bpc 8x16 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<8, 16>` with `identity8_1d_arm` for rows,
/// `identity16_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 16 rows of
/// 8 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 8;
    const LAST_ROW: usize = 15;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 128 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<8, 16>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity16_1d_arm,
        true,
    );
}
/// `extern "C"` entry point for the 16 bpc 16x8 identity/identity inverse
/// transform.
///
/// Builds slices over the raw pointers and forwards to
/// `identity_rect_16bpc_inner::<16, 8>` with `identity16_1d_arm` for rows,
/// `identity8_1d_arm` for columns and rect2 scaling enabled. `_coeff_len`
/// and `_dst` are unused.
///
/// # Safety
///
/// `coeff` must be valid for reads and writes of 128 `i32` values. `dst_ptr`
/// must be valid for reads and writes over the whole span covering 8 rows of
/// 16 `u16` pixels placed `dst_stride / 2` elements apart (negative strides
/// are allowed).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    const WIDTH: usize = 16;
    const LAST_ROW: usize = 7;
    // Halve the stride so it counts `u16` elements (presumably it arrives in bytes).
    let stride_px = dst_stride / 2;
    // SAFETY: relies on the caller providing 128 valid `i32` coefficients.
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let pixels = dst_ptr as *mut u16;
    let reach = LAST_ROW * stride_px.unsigned_abs();
    let (origin, base) = if stride_px < 0 {
        // SAFETY: with a negative stride the last row has the lowest address;
        // the caller must guarantee it is part of the same allocation.
        (unsafe { pixels.offset(LAST_ROW as isize * stride_px) }, reach)
    } else {
        (pixels, 0usize)
    };
    // SAFETY: relies on the caller providing `reach + WIDTH` valid pixels from `origin`.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, reach + WIDTH) };
    identity_rect_16bpc_inner::<16, 8>(
        dst_slice,
        base,
        stride_px,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity8_1d_arm,
        true,
    );
}
/// C-ABI entry point: 8 bpc identity/identity inverse transform add, 8x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_8bpc_inner::<8, 32>`.
///
/// # Safety
///
/// `coeff` must point to 256 valid `i16` values and `dst_ptr` to 32 rows of at
/// least 8 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 31usize * dst_stride.unsigned_abs();
    let len = span + 8;
    // SAFETY: the caller guarantees `coeff` addresses 256 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 256) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(31 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    identity_rect_8bpc_inner::<8, 32>(
        pixels,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity32_1d_arm,
        true,
    );
}
/// C-ABI entry point: 16 bpc identity/identity inverse transform add, 8x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_16bpc_inner::<8, 32>`.
///
/// # Safety
///
/// `coeff` must point to 256 valid `i32` values and `dst_ptr` to 32 rows of at
/// least 8 aligned `u16` pixels spaced `dst_stride / 2` elements apart (the
/// stride may be negative). Both regions must be exclusively accessible for
/// the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 31usize * px_stride.unsigned_abs();
    let len = span + 8;
    // SAFETY: the caller guarantees `coeff` addresses 256 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 256) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(31 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    identity_rect_16bpc_inner::<8, 32>(
        pixels,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity32_1d_arm,
        true,
    );
}
/// C-ABI entry point: 8 bpc identity/identity inverse transform add, 32x8 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_8bpc_inner::<32, 8>`.
///
/// # Safety
///
/// `coeff` must point to 256 valid `i16` values and `dst_ptr` to 8 rows of at
/// least 32 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 7usize * dst_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 256 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 256) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(7 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    identity_rect_8bpc_inner::<32, 8>(
        pixels,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity8_1d_arm,
        true,
    );
}
/// C-ABI entry point: 16 bpc identity/identity inverse transform add, 32x8 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_16bpc_inner::<32, 8>`.
///
/// # Safety
///
/// `coeff` must point to 256 valid `i32` values and `dst_ptr` to 8 rows of at
/// least 32 aligned `u16` pixels spaced `dst_stride / 2` elements apart (the
/// stride may be negative). Both regions must be exclusively accessible for
/// the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 7usize * px_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 256 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 256) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(7 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    identity_rect_16bpc_inner::<32, 8>(
        pixels,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity8_1d_arm,
        true,
    );
}
/// C-ABI entry point: 8 bpc identity/identity inverse transform add, 16x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_8bpc_inner::<16, 32>`.
///
/// # Safety
///
/// `coeff` must point to 512 valid `i16` values and `dst_ptr` to 32 rows of at
/// least 16 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 31usize * dst_stride.unsigned_abs();
    let len = span + 16;
    // SAFETY: the caller guarantees `coeff` addresses 512 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 512) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(31 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    identity_rect_8bpc_inner::<16, 32>(
        pixels,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity32_1d_arm,
        true,
    );
}
/// C-ABI entry point: 16 bpc identity/identity inverse transform add, 16x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_16bpc_inner::<16, 32>`.
///
/// # Safety
///
/// `coeff` must point to 512 valid `i32` values and `dst_ptr` to 32 rows of at
/// least 16 aligned `u16` pixels spaced `dst_stride / 2` elements apart (the
/// stride may be negative). Both regions must be exclusively accessible for
/// the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 31usize * px_stride.unsigned_abs();
    let len = span + 16;
    // SAFETY: the caller guarantees `coeff` addresses 512 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 512) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(31 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    identity_rect_16bpc_inner::<16, 32>(
        pixels,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity32_1d_arm,
        true,
    );
}
/// C-ABI entry point: 8 bpc identity/identity inverse transform add, 32x16 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_8bpc_inner::<32, 16>`.
///
/// # Safety
///
/// `coeff` must point to 512 valid `i16` values and `dst_ptr` to 16 rows of at
/// least 32 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 15usize * dst_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 512 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 512) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(15 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    identity_rect_8bpc_inner::<32, 16>(
        pixels,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity16_1d_arm,
        true,
    );
}
/// C-ABI entry point: 16 bpc identity/identity inverse transform add, 32x16 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to `identity_rect_16bpc_inner::<32, 16>`.
///
/// # Safety
///
/// `coeff` must point to 512 valid `i32` values and `dst_ptr` to 16 rows of at
/// least 32 aligned `u16` pixels spaced `dst_stride / 2` elements apart (the
/// stride may be negative). Both regions must be exclusively accessible for
/// the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 15usize * px_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 512 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 512) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(15 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    identity_rect_16bpc_inner::<32, 16>(
        pixels,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity16_1d_arm,
        true,
    );
}
/// C-ABI entry point: 8 bpc identity/identity inverse transform add, 32x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and calls
/// `super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_8bpc_neon_inner`
/// with a forged `archmage::Arm64` token.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i16` values and `dst_ptr` to 32 rows of
/// at least 32 `u8` pixels spaced `dst_stride` elements apart (the stride may
/// be negative). Both regions must be exclusively accessible for the duration
/// of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 31usize * dst_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 1024) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(31 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    // NOTE(review): the capability token is forged instead of being obtained
    // from a runtime check; presumably NEON support is taken for granted on
    // this aarch64-only build path — confirm.
    let token = unsafe { archmage::Arm64::forge_token_dangerously() };
    super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_8bpc_neon_inner(
        token,
        pixels,
        origin,
        dst_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// C-ABI entry point: 16 bpc identity/identity inverse transform add, 32x32 block.
///
/// Builds slices over the raw coefficient and destination pointers and calls
/// `super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_16bpc_neon_inner`
/// with a forged `archmage::Arm64` token.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i32` values and `dst_ptr` to 32 rows of
/// at least 32 aligned `u16` pixels spaced `dst_stride / 2` elements apart
/// (the stride may be negative). Both regions must be exclusively accessible
/// for the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 31usize * px_stride.unsigned_abs();
    let len = span + 32;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 1024) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(31 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    // NOTE(review): the capability token is forged instead of being obtained
    // from a runtime check; presumably NEON support is taken for granted on
    // this aarch64-only build path — confirm.
    let token = unsafe { archmage::Arm64::forge_token_dangerously() };
    super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_16bpc_neon_inner(
        token,
        pixels,
        origin,
        px_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Scalar 4x4 8 bpc inverse transform add: `dct4_1d` first, then
/// `identity4_1d_arm`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_dct_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: scratch row `y` is the DCT of coefficients y, y+4, y+8, y+12.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let out = dct4_1d(
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        );
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = out[x];
        }
    }
    // Second pass: in-place identity transform down each column (stride 4).
    for x in 0..4 {
        identity4_1d_arm(&mut block[x..], 4);
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// Scalar 4x4 8 bpc inverse transform add: `identity4_1d_arm` first, then
/// `dct4_1d`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_identity_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: gather coefficients y, y+4, y+8, y+12 and run the identity
    // transform on them; the result becomes scratch row `y`.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let mut row: [i32; 4] = std::array::from_fn(|x| coeff[y + x * 4] as i32);
        identity4_1d_arm(&mut row, 1);
        line.copy_from_slice(&row);
    }
    // Second pass: DCT down each scratch column, written back in place.
    for x in 0..4 {
        let out = dct4_1d(block[x], block[4 + x], block[8 + x], block[12 + x]);
        for (y, slot) in block[x..].iter_mut().step_by(4).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// Scalar 4x4 8 bpc inverse transform add: `adst4_1d` first, then
/// `identity4_1d_arm`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_adst_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: scratch row `y` is the ADST of coefficients y, y+4, y+8, y+12.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let out = adst4_1d(
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        );
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = out[x];
        }
    }
    // Second pass: in-place identity transform down each column (stride 4).
    for x in 0..4 {
        identity4_1d_arm(&mut block[x..], 4);
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// Scalar 4x4 8 bpc inverse transform add: `identity4_1d_arm` first, then
/// `adst4_1d`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_identity_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: gather coefficients y, y+4, y+8, y+12 and run the identity
    // transform on them; the result becomes scratch row `y`.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let mut row: [i32; 4] = std::array::from_fn(|x| coeff[y + x * 4] as i32);
        identity4_1d_arm(&mut row, 1);
        line.copy_from_slice(&row);
    }
    // Second pass: ADST down each scratch column, written back in place.
    for x in 0..4 {
        let out = adst4_1d(block[x], block[4 + x], block[8 + x], block[12 + x]);
        for (y, slot) in block[x..].iter_mut().step_by(4).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// Scalar 4x4 8 bpc inverse transform add: `flipadst4_1d` first, then
/// `identity4_1d_arm`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: scratch row `y` is the flipped ADST of coefficients
    // y, y+4, y+8, y+12.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let out = flipadst4_1d(
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        );
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = out[x];
        }
    }
    // Second pass: in-place identity transform down each column (stride 4).
    for x in 0..4 {
        identity4_1d_arm(&mut block[x..], 4);
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// Scalar 4x4 8 bpc inverse transform add: `identity4_1d_arm` first, then
/// `flipadst4_1d`.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 16 coefficients; all of them are zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// Panics if `coeff` holds fewer than 16 values or a touched pixel lies
/// outside `dst`.
#[autoversion]
pub(crate) fn inv_txfm_add_identity_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut block = [0i32; 16];
    // First pass: gather coefficients y, y+4, y+8, y+12 and run the identity
    // transform on them; the result becomes scratch row `y`.
    for (y, line) in block.chunks_exact_mut(4).enumerate() {
        let mut row: [i32; 4] = std::array::from_fn(|x| coeff[y + x * 4] as i32);
        identity4_1d_arm(&mut row, 1);
        line.copy_from_slice(&row);
    }
    // Second pass: flipped ADST down each scratch column, written back in place.
    for x in 0..4 {
        let out = flipadst4_1d(block[x], block[4 + x], block[8 + x], block[12 + x]);
        for (y, slot) in block[x..].iter_mut().step_by(4).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in block.chunks_exact(4).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16].fill(0);
}
/// C-ABI entry point: 8 bpc dct/identity inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_dct_identity_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_dct_identity_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_dct_identity_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI entry point: 8 bpc identity/dct inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_identity_dct_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_identity_dct_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_identity_dct_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI entry point: 8 bpc adst/identity inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_adst_identity_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_adst_identity_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_adst_identity_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI entry point: 8 bpc identity/adst inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_identity_adst_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_identity_adst_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_identity_adst_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI entry point: 8 bpc flipadst/identity inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_flipadst_identity_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_flipadst_identity_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// C-ABI entry point: 8 bpc identity/flipadst inverse transform add, 4x4 block.
///
/// Wraps the raw pointers in slices, then runs the NEON kernel from
/// `super::itx_arm_neon_4x4` when `archmage::Arm64::summon()` yields a token
/// and the scalar `inv_txfm_add_identity_flipadst_4x4_8bpc_inner` otherwise.
///
/// # Safety
///
/// `coeff` must point to 16 valid `i16` values and `dst_ptr` to 4 rows of at
/// least 4 `u8` pixels spaced `dst_stride` elements apart (the stride may be
/// negative). Both regions must be exclusively accessible for the duration of
/// the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 3usize * dst_stride.unsigned_abs();
    let len = span + 4;
    // SAFETY: the caller guarantees `coeff` addresses 16 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 16) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(3 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    match archmage::Arm64::summon() {
        Some(token) => {
            super::itx_arm_neon_4x4::inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner(
                token,
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
        None => {
            inv_txfm_add_identity_flipadst_4x4_8bpc_inner(
                pixels,
                origin,
                dst_stride,
                coeffs,
                eob,
                bitdepth_max,
            );
        }
    }
}
/// Scalar 16x64 8 bpc dct/dct inverse transform add.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 1024 coefficients, read as `coeff[y + x * 64]`; all of them are
///   zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// NOTE(review): this differs from `inv_txfm_add_generic_8bpc` in the same
/// file, which for a 16x64 block would read only 32 coefficient rows (stride
/// 32), skip the 181/256 scaling (16x64 is not a 2:1 shape) and use the
/// intermediate shift 2 from `shift_for`. Here all 64 rows are read,
/// `rect2_scale` is always applied and the shift is 1 — confirm which is
/// intended.
#[autoversion]
fn inv_txfm_add_dct_dct_16x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16 * 64];
    // First pass: 16-point DCT over each of the 64 rows, rounded down by 1 bit.
    for (y, line) in tmp.chunks_exact_mut(16).enumerate() {
        let row: [i32; 16] = std::array::from_fn(|x| rect2_scale(coeff[y + x * 64] as i32));
        let out = dct16_1d(&row);
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = (out[x] + 1) >> 1;
        }
    }
    // Second pass: 64-point DCT down each of the 16 columns, in place.
    for x in 0..16 {
        let col: [i32; 64] = std::array::from_fn(|y| tmp[y * 16 + x]);
        let out = dct64_1d(&col);
        for (y, slot) in tmp[x..].iter_mut().step_by(16).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in tmp.chunks_exact(16).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..16 * 64].fill(0);
}
/// Scalar 16x64 16 bpc dct/dct inverse transform add.
///
/// * `dst` / `dst_base` / `dst_stride_u16` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 1024 coefficients, read as `coeff[y + x * 64]`; all of them are
///   zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// NOTE(review): this differs from `inv_txfm_add_generic_16bpc` in the same
/// file, which for a 16x64 block would read only 32 coefficient rows (stride
/// 32), skip the 181/256 scaling (16x64 is not a 2:1 shape), clip the
/// intermediates and use the shift 2 from `shift_for`. Here all 64 rows are
/// read, `rect2_scale` is always applied and the shift is 1 — confirm which
/// is intended.
#[autoversion]
fn inv_txfm_add_dct_dct_16x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16 * 64];
    // First pass: 16-point DCT over each of the 64 rows, rounded down by 1 bit.
    for (y, line) in tmp.chunks_exact_mut(16).enumerate() {
        let row: [i32; 16] = std::array::from_fn(|x| rect2_scale(coeff[y + x * 64]));
        let out = dct16_1d(&row);
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = (out[x] + 1) >> 1;
        }
    }
    // Second pass: 64-point DCT down each of the 16 columns, in place.
    for x in 0..16 {
        let col: [i32; 64] = std::array::from_fn(|y| tmp[y * 16 + x]);
        let out = dct64_1d(&col);
        for (y, slot) in tmp[x..].iter_mut().step_by(16).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in tmp.chunks_exact(16).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u16;
        }
    }
    coeff[..16 * 64].fill(0);
}
/// Scalar 64x16 8 bpc dct/dct inverse transform add.
///
/// * `dst` / `dst_base` / `dst_stride` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 1024 coefficients, read as `coeff[y + x * 16]`; all of them are
///   zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// NOTE(review): this differs from `inv_txfm_add_generic_8bpc` in the same
/// file, which for a 64x16 block would read only 32 coefficient columns, skip
/// the 181/256 scaling (64x16 is not a 2:1 shape) and use the intermediate
/// shift 2 from `shift_for`. Here all 64 columns are read, `rect2_scale` is
/// always applied and the shift is 1 — confirm which is intended.
#[autoversion]
fn inv_txfm_add_dct_dct_64x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64 * 16];
    // First pass: 64-point DCT over each of the 16 rows, rounded down by 1 bit.
    for (y, line) in tmp.chunks_exact_mut(64).enumerate() {
        let row: [i32; 64] = std::array::from_fn(|x| rect2_scale(coeff[y + x * 16] as i32));
        let out = dct64_1d(&row);
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = (out[x] + 1) >> 1;
        }
    }
    // Second pass: 16-point DCT down each of the 64 columns, in place.
    for x in 0..64 {
        let col: [i32; 16] = std::array::from_fn(|y| tmp[y * 64 + x]);
        let out = dct16_1d(&col);
        for (y, slot) in tmp[x..].iter_mut().step_by(64).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in tmp.chunks_exact(64).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u8;
        }
    }
    coeff[..64 * 16].fill(0);
}
/// Scalar 64x16 16 bpc dct/dct inverse transform add.
///
/// * `dst` / `dst_base` / `dst_stride_u16` — pixel buffer, index of the first
///   pixel of row 0, and the signed element distance between rows.
/// * `coeff` — 1024 coefficients, read as `coeff[y + x * 16]`; all of them are
///   zeroed before returning.
/// * `_eob` — unused.
/// * `bitdepth_max` — upper clip bound for the reconstructed pixels.
///
/// NOTE(review): this differs from `inv_txfm_add_generic_16bpc` in the same
/// file, which for a 64x16 block would read only 32 coefficient columns, skip
/// the 181/256 scaling (64x16 is not a 2:1 shape), clip the intermediates and
/// use the shift 2 from `shift_for`. Here all 64 columns are read,
/// `rect2_scale` is always applied and the shift is 1 — confirm which is
/// intended.
#[autoversion]
fn inv_txfm_add_dct_dct_64x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64 * 16];
    // First pass: 64-point DCT over each of the 16 rows, rounded down by 1 bit.
    for (y, line) in tmp.chunks_exact_mut(64).enumerate() {
        let row: [i32; 64] = std::array::from_fn(|x| rect2_scale(coeff[y + x * 16]));
        let out = dct64_1d(&row);
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = (out[x] + 1) >> 1;
        }
    }
    // Second pass: 16-point DCT down each of the 64 columns, in place.
    for x in 0..64 {
        let col: [i32; 16] = std::array::from_fn(|y| tmp[y * 64 + x]);
        let out = dct16_1d(&col);
        for (y, slot) in tmp[x..].iter_mut().step_by(64).enumerate() {
            *slot = out[y];
        }
    }
    // Round the residual by 4 bits, add it to the picture and clip.
    for (y, line) in tmp.chunks_exact(64).enumerate() {
        let start = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for (x, &v) in line.iter().enumerate() {
            let px = &mut dst[start + x];
            *px = iclip(*px as i32 + ((v + 8) >> 4), 0, bitdepth_max) as u16;
        }
    }
    coeff[..64 * 16].fill(0);
}
/// C-ABI entry point: 8 bpc dct/dct inverse transform add, 16x64 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to the scalar `inv_txfm_add_dct_dct_16x64_8bpc_inner`.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i16` values and `dst_ptr` to 64 rows of
/// at least 16 `u8` pixels spaced `dst_stride` elements apart (the stride may
/// be negative). Both regions must be exclusively accessible for the duration
/// of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 63usize * dst_stride.unsigned_abs();
    let len = span + 16;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 1024) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(63 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    inv_txfm_add_dct_dct_16x64_8bpc_inner(pixels, origin, dst_stride, coeffs, eob, bitdepth_max);
}
/// C-ABI entry point: 16 bpc dct/dct inverse transform add, 16x64 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to the scalar `inv_txfm_add_dct_dct_16x64_16bpc_inner`.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i32` values and `dst_ptr` to 64 rows of
/// at least 16 aligned `u16` pixels spaced `dst_stride / 2` elements apart
/// (the stride may be negative). Both regions must be exclusively accessible
/// for the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 63usize * px_stride.unsigned_abs();
    let len = span + 16;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 1024) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(63 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    inv_txfm_add_dct_dct_16x64_16bpc_inner(pixels, origin, px_stride, coeffs, eob, bitdepth_max);
}
/// C-ABI entry point: 8 bpc dct/dct inverse transform add, 64x16 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to the scalar `inv_txfm_add_dct_dct_64x16_8bpc_inner`.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i16` values and `dst_ptr` to 16 rows of
/// at least 64 `u8` pixels spaced `dst_stride` elements apart (the stride may
/// be negative). Both regions must be exclusively accessible for the duration
/// of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // Element distance between the first and the last row.
    let span = 15usize * dst_stride.unsigned_abs();
    let len = span + 64;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), 1024) };
    let (pixels, origin) = if dst_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u8>().offset(15 * dst_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), len) },
            0usize,
        )
    };
    inv_txfm_add_dct_dct_64x16_8bpc_inner(pixels, origin, dst_stride, coeffs, eob, bitdepth_max);
}
/// C-ABI entry point: 16 bpc dct/dct inverse transform add, 64x16 block.
///
/// Builds slices over the raw coefficient and destination pointers and hands
/// them to the scalar `inv_txfm_add_dct_dct_64x16_16bpc_inner`.
///
/// # Safety
///
/// `coeff` must point to 1024 valid `i32` values and `dst_ptr` to 16 rows of
/// at least 64 aligned `u16` pixels spaced `dst_stride / 2` elements apart
/// (the stride may be negative). Both regions must be exclusively accessible
/// for the duration of the call.
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // The inner routine indexes `u16` elements, so the incoming stride
    // (presumably given in bytes) is halved.
    let px_stride = dst_stride / 2;
    // Element distance between the first and the last row.
    let span = 15usize * px_stride.unsigned_abs();
    let len = span + 64;
    // SAFETY: the caller guarantees `coeff` addresses 1024 `i32` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i32>(), 1024) };
    let (pixels, origin) = if px_stride < 0 {
        // Rows descend in memory: the slice starts at the last row, which puts
        // the first row `span` elements into it.
        // SAFETY: the caller guarantees every row of the block is valid.
        let lowest = unsafe { dst_ptr.cast::<u16>().offset(15 * px_stride) };
        // SAFETY: `len` elements starting at the last row cover exactly the block.
        (unsafe { std::slice::from_raw_parts_mut(lowest, len) }, span)
    } else {
        // SAFETY: the caller guarantees every row of the block is valid.
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u16>(), len) },
            0usize,
        )
    };
    inv_txfm_add_dct_dct_64x16_16bpc_inner(pixels, origin, px_stride, coeffs, eob, bitdepth_max);
}
use crate::src::itx_1d::*;
use std::cmp;
use std::num::NonZeroUsize;
/// Function-pointer type of the 1-D transform kernels used by the generic
/// paths in this file.
///
/// The callers pass, in order: the working buffer starting at the first
/// element to transform, the distance between consecutive elements (1 for the
/// row pass, the block width for the column pass), and the minimum and maximum
/// clip bounds.
type Itx1dFn = fn(&mut [i32], NonZeroUsize, i32, i32);
fn inv_txfm_add_generic_8bpc(
dst: &mut [u8],
dst_base: usize,
dst_stride: isize,
coeff: &mut [i16],
eob: i32,
_bitdepth_max: i32,
w: usize,
h: usize,
shift: u8,
row_fn: Itx1dFn,
col_fn: Itx1dFn,
has_dc_only: bool,
) {
let is_rect2 = w * 2 == h || h * 2 == w;
let rnd = if shift > 0 { 1i32 << (shift - 1) } else { 0 };
if eob < has_dc_only as i32 {
let mut dc = coeff[0] as i32;
coeff[0] = 0;
if is_rect2 {
dc = (dc * 181 + 128) >> 8;
}
dc = (dc * 181 + 128) >> 8;
dc = (dc + rnd) >> shift;
dc = (dc * 181 + 128 + 2048) >> 12;
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for x in 0..w {
let p = dst[row_off + x] as i32 + dc;
dst[row_off + x] = p.max(0).min(255) as u8;
}
}
return;
}
let sh = cmp::min(h, 32);
let sw = cmp::min(w, 32);
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 64 * 64];
for y in 0..sh {
if is_rect2 {
for x in 0..sw {
tmp[y * w + x] = (coeff[y + x * sh] as i32 * 181 + 128) >> 8;
}
} else {
for x in 0..sw {
tmp[y * w + x] = coeff[y + x * sh] as i32;
}
}
row_fn(
&mut tmp[y * w..],
1.try_into().unwrap(),
row_clip_min,
row_clip_max,
);
}
for i in 0..(sh * sw) {
coeff[i] = 0;
}
for i in 0..(w * sh) {
tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
}
for x in 0..w {
col_fn(
&mut tmp[x..],
w.try_into().unwrap(),
col_clip_min,
col_clip_max,
);
}
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
for x in 0..w {
let p = dst[row_off + x] as i32 + ((tmp[y * w + x] + 8) >> 4);
dst[row_off + x] = p.max(0).min(255) as u8;
}
}
}
fn inv_txfm_add_generic_16bpc(
dst: &mut [u16],
dst_base: usize,
dst_stride_u16: isize,
coeff: &mut [i32],
eob: i32,
bitdepth_max: i32,
w: usize,
h: usize,
shift: u8,
row_fn: Itx1dFn,
col_fn: Itx1dFn,
has_dc_only: bool,
) {
let is_rect2 = w * 2 == h || h * 2 == w;
let rnd = if shift > 0 { 1i32 << (shift - 1) } else { 0 };
if eob < has_dc_only as i32 {
let mut dc = coeff[0];
coeff[0] = 0;
if is_rect2 {
dc = (dc * 181 + 128) >> 8;
}
dc = (dc * 181 + 128) >> 8;
dc = (dc + rnd) >> shift;
dc = (dc * 181 + 128 + 2048) >> 12;
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
for x in 0..w {
let p = dst[row_off + x] as i32 + dc;
dst[row_off + x] = p.max(0).min(bitdepth_max) as u16;
}
}
return;
}
let sh = cmp::min(h, 32);
let sw = cmp::min(w, 32);
let row_clip_min = (!bitdepth_max) << 7;
let row_clip_max = !row_clip_min;
let col_clip_min = (!bitdepth_max) << 5;
let col_clip_max = !col_clip_min;
let mut tmp = [0i32; 64 * 64];
for y in 0..sh {
if is_rect2 {
for x in 0..sw {
tmp[y * w + x] = (coeff[y + x * sh] * 181 + 128) >> 8;
}
} else {
for x in 0..sw {
tmp[y * w + x] = coeff[y + x * sh];
}
}
row_fn(
&mut tmp[y * w..],
1.try_into().unwrap(),
row_clip_min,
row_clip_max,
);
}
for i in 0..(sh * sw) {
coeff[i] = 0;
}
for i in 0..(w * sh) {
tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
}
for x in 0..w {
col_fn(
&mut tmp[x..],
w.try_into().unwrap(),
col_clip_min,
col_clip_max,
);
}
for y in 0..h {
let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
for x in 0..w {
let p = dst[row_off + x] as i32 + ((tmp[y * w + x] + 8) >> 4);
dst[row_off + x] = p.max(0).min(bitdepth_max) as u16;
}
}
}
/// Maps a transform family name and a length to the matching scalar 1-D kernel.
///
/// `txfm` is one of `"dct"`, `"adst"`, `"flipadst"` or `"identity"`; `n` is the transform
/// length. Supported lengths are 4/8/16/32/64 for `dct`, 4/8/16 for `adst` and
/// `flipadst`, and 4/8/16/32 for `identity`.
///
/// Panics (via `unreachable!`) for any other combination.
fn resolve_1d(txfm: &str, n: usize) -> Itx1dFn {
    // Shared failure path so every family reports the same message.
    let unsupported =
        || -> Itx1dFn { unreachable!("unsupported 1D transform: {} size {}", txfm, n) };

    match txfm {
        "dct" => match n {
            4 => rav1d_inv_dct4_1d_c,
            8 => rav1d_inv_dct8_1d_c,
            16 => rav1d_inv_dct16_1d_c,
            32 => rav1d_inv_dct32_1d_c,
            64 => rav1d_inv_dct64_1d_c,
            _ => unsupported(),
        },
        "adst" => match n {
            4 => rav1d_inv_adst4_1d_c,
            8 => rav1d_inv_adst8_1d_c,
            16 => rav1d_inv_adst16_1d_c,
            _ => unsupported(),
        },
        "flipadst" => match n {
            4 => rav1d_inv_flipadst4_1d_c,
            8 => rav1d_inv_flipadst8_1d_c,
            16 => rav1d_inv_flipadst16_1d_c,
            _ => unsupported(),
        },
        "identity" => match n {
            4 => rav1d_inv_identity4_1d_c,
            8 => rav1d_inv_identity8_1d_c,
            16 => rav1d_inv_identity16_1d_c,
            32 => rav1d_inv_identity32_1d_c,
            _ => unsupported(),
        },
        _ => unsupported(),
    }
}
/// Returns the rounding shift applied between the row and column passes for a
/// `w` x `h` transform block.
///
/// Panics (via `unreachable!`) for any size that is not in the table below.
fn shift_for(w: usize, h: usize) -> u8 {
    match (w, h) {
        (4, 4) | (4, 8) | (8, 4) => 0,
        (4, 16)
        | (8, 8)
        | (8, 16)
        | (16, 4)
        | (16, 8)
        | (16, 32)
        | (32, 16)
        | (32, 64)
        | (64, 32) => 1,
        (8, 32) | (16, 16) | (16, 64) | (32, 8) | (32, 32) | (64, 16) | (64, 64) => 2,
        _ => unreachable!(),
    }
}
/// Generates both the 8 bpc and the 16 bpc `extern "C"` entry points
/// `inv_txfm_add_<row>_<col>_<w>x<h>_{8,16}bpc_neon` for one transform pair and block size.
///
/// Each generated function rebuilds slices from the raw `dst_ptr`/`coeff` pointers and
/// forwards to `inv_txfm_add_generic_8bpc` / `inv_txfm_add_generic_16bpc`, using
/// `shift_for` and `resolve_1d` to select the shift and the 1-D kernels.
///
/// The generated functions are `unsafe`: the pointers are not validated, so the caller
/// must pass a destination that is valid for `$h` rows of `$w` pixels at `dst_stride`
/// and a coefficient buffer holding at least `$w * $h` entries.
macro_rules! gen_itx_arm {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _8bpc_neon>](
                dst_ptr: *mut DynPixel,
                dst_stride: isize,
                coeff: *mut DynCoef,
                eob: i32,
                bitdepth_max: i32,
                _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                let pixels = dst_ptr as *mut u8;
                let last_row = ($h - 1) as usize;
                // Distance in pixels between the first and the last row of the block.
                let span = last_row * dst_stride.unsigned_abs();
                let len = span + $w;
                // With a negative stride the last row has the lowest address, so the slice
                // starts there and the block's first pixel sits `span` elements in.
                let (origin, base) = if dst_stride < 0 {
                    // SAFETY: relies on the caller's destination covering all `$h` rows.
                    (unsafe { pixels.offset(last_row as isize * dst_stride) }, span)
                } else {
                    (pixels, 0usize)
                };
                // SAFETY: relies on the caller's destination being valid for `len` pixels
                // starting at `origin`.
                let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };
                // SAFETY: relies on `coeff` pointing at `$w * $h` valid `i16` values.
                let coeff_slice =
                    unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, $w * $h) };
                inv_txfm_add_generic_8bpc(
                    dst_slice,
                    base,
                    dst_stride,
                    coeff_slice,
                    eob,
                    bitdepth_max,
                    $w,
                    $h,
                    shift_for($w, $h),
                    resolve_1d(stringify!($row_name), $w),
                    resolve_1d(stringify!($col_name), $h),
                    $is_dct_dct,
                );
            }

            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _16bpc_neon>](
                dst_ptr: *mut DynPixel,
                dst_stride: isize,
                coeff: *mut DynCoef,
                eob: i32,
                bitdepth_max: i32,
                _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                // The incoming stride is halved to obtain a pitch in `u16` elements.
                let stride_u16 = dst_stride / 2;
                let pixels = dst_ptr as *mut u16;
                let last_row = ($h - 1) as usize;
                // Distance in pixels between the first and the last row of the block.
                let span = last_row * stride_u16.unsigned_abs();
                let len = span + $w;
                // With a negative stride the last row has the lowest address, so the slice
                // starts there and the block's first pixel sits `span` elements in.
                let (origin, base) = if stride_u16 < 0 {
                    // SAFETY: relies on the caller's destination covering all `$h` rows.
                    (unsafe { pixels.offset(last_row as isize * stride_u16) }, span)
                } else {
                    (pixels, 0usize)
                };
                // SAFETY: relies on the caller's destination being valid for `len` pixels
                // starting at `origin`.
                let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };
                // SAFETY: relies on `coeff` pointing at `$w * $h` valid `i32` values.
                let coeff_slice =
                    unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, $w * $h) };
                inv_txfm_add_generic_16bpc(
                    dst_slice,
                    base,
                    stride_u16,
                    coeff_slice,
                    eob,
                    bitdepth_max,
                    $w,
                    $h,
                    shift_for($w, $h),
                    resolve_1d(stringify!($row_name), $w),
                    resolve_1d(stringify!($col_name), $h),
                    $is_dct_dct,
                );
            }
        }
    };
}
/// Generates only the 8 bpc `extern "C"` entry point
/// `inv_txfm_add_<row>_<col>_<w>x<h>_8bpc_neon` for one transform pair and block size.
///
/// The generated function rebuilds slices from the raw `dst_ptr`/`coeff` pointers and
/// forwards to `inv_txfm_add_generic_8bpc`, using `shift_for` and `resolve_1d` to select
/// the shift and the 1-D kernels.
///
/// The generated function is `unsafe`: the pointers are not validated, so the caller
/// must pass a destination that is valid for `$h` rows of `$w` pixels at `dst_stride`
/// and a coefficient buffer holding at least `$w * $h` `i16` entries.
macro_rules! gen_itx_arm_8bpc {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _8bpc_neon>](
                dst_ptr: *mut DynPixel,
                dst_stride: isize,
                coeff: *mut DynCoef,
                eob: i32,
                bitdepth_max: i32,
                _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                let pixels = dst_ptr as *mut u8;
                let last_row = ($h - 1) as usize;
                // Distance in pixels between the first and the last row of the block.
                let span = last_row * dst_stride.unsigned_abs();
                let len = span + $w;
                // With a negative stride the last row has the lowest address, so the slice
                // starts there and the block's first pixel sits `span` elements in.
                let (origin, base) = if dst_stride < 0 {
                    // SAFETY: relies on the caller's destination covering all `$h` rows.
                    (unsafe { pixels.offset(last_row as isize * dst_stride) }, span)
                } else {
                    (pixels, 0usize)
                };
                // SAFETY: relies on the caller's destination being valid for `len` pixels
                // starting at `origin`.
                let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };
                // SAFETY: relies on `coeff` pointing at `$w * $h` valid `i16` values.
                let coeff_slice =
                    unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, $w * $h) };
                inv_txfm_add_generic_8bpc(
                    dst_slice,
                    base,
                    dst_stride,
                    coeff_slice,
                    eob,
                    bitdepth_max,
                    $w,
                    $h,
                    shift_for($w, $h),
                    resolve_1d(stringify!($row_name), $w),
                    resolve_1d(stringify!($col_name), $h),
                    $is_dct_dct,
                );
            }
        }
    };
}
/// Generates only the 16 bpc `extern "C"` entry point
/// `inv_txfm_add_<row>_<col>_<w>x<h>_16bpc_neon` for one transform pair and block size.
///
/// The generated function rebuilds slices from the raw `dst_ptr`/`coeff` pointers and
/// forwards to `inv_txfm_add_generic_16bpc`, using `shift_for` and `resolve_1d` to select
/// the shift and the 1-D kernels.
///
/// The generated function is `unsafe`: the pointers are not validated, so the caller
/// must pass a destination that is valid for `$h` rows of `$w` `u16` pixels at
/// `dst_stride / 2` elements per row and a coefficient buffer holding at least
/// `$w * $h` `i32` entries.
macro_rules! gen_itx_arm_16bpc {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _16bpc_neon>](
                dst_ptr: *mut DynPixel,
                dst_stride: isize,
                coeff: *mut DynCoef,
                eob: i32,
                bitdepth_max: i32,
                _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                // The incoming stride is halved to obtain a pitch in `u16` elements.
                let stride_u16 = dst_stride / 2;
                let pixels = dst_ptr as *mut u16;
                let last_row = ($h - 1) as usize;
                // Distance in pixels between the first and the last row of the block.
                let span = last_row * stride_u16.unsigned_abs();
                let len = span + $w;
                // With a negative stride the last row has the lowest address, so the slice
                // starts there and the block's first pixel sits `span` elements in.
                let (origin, base) = if stride_u16 < 0 {
                    // SAFETY: relies on the caller's destination covering all `$h` rows.
                    (unsafe { pixels.offset(last_row as isize * stride_u16) }, span)
                } else {
                    (pixels, 0usize)
                };
                // SAFETY: relies on the caller's destination being valid for `len` pixels
                // starting at `origin`.
                let dst_slice = unsafe { std::slice::from_raw_parts_mut(origin, len) };
                // SAFETY: relies on `coeff` pointing at `$w * $h` valid `i32` values.
                let coeff_slice =
                    unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, $w * $h) };
                inv_txfm_add_generic_16bpc(
                    dst_slice,
                    base,
                    stride_u16,
                    coeff_slice,
                    eob,
                    bitdepth_max,
                    $w,
                    $h,
                    shift_for($w, $h),
                    resolve_1d(stringify!($row_name), $w),
                    resolve_1d(stringify!($col_name), $h),
                    $is_dct_dct,
                );
            }
        }
    };
}
// Instantiations of the generic-fallback entry points.
//
// `gen_itx_arm!` emits both the 8 bpc and the 16 bpc function for a (row, column, w, h)
// combination; `gen_itx_arm_8bpc!` / `gen_itx_arm_16bpc!` emit only one of the two.
// The last argument is forwarded as `has_dc_only` and is `false` for every entry here.
// NOTE(review): sizes/bit depths missing from this list (e.g. 4x4 at 8 bpc, 8x8 for the
// adst/dct pairs) are presumably provided by other definitions — confirm.

// adst row kernel, adst column kernel.
gen_itx_arm!(adst, adst, 4, 8, false);
gen_itx_arm!(adst, adst, 8, 4, false);
gen_itx_arm!(adst, adst, 4, 16, false);
gen_itx_arm!(adst, adst, 16, 4, false);
gen_itx_arm!(adst, adst, 8, 16, false);
gen_itx_arm!(adst, adst, 16, 8, false);
gen_itx_arm!(adst, adst, 16, 16, false);
// adst row kernel, dct column kernel.
gen_itx_arm!(adst, dct, 4, 8, false);
gen_itx_arm!(adst, dct, 8, 4, false);
gen_itx_arm!(adst, dct, 4, 16, false);
gen_itx_arm!(adst, dct, 16, 4, false);
gen_itx_arm!(adst, dct, 8, 16, false);
gen_itx_arm!(adst, dct, 16, 8, false);
gen_itx_arm!(adst, dct, 16, 16, false);
// dct row kernel, adst column kernel.
gen_itx_arm!(dct, adst, 4, 8, false);
gen_itx_arm!(dct, adst, 8, 4, false);
gen_itx_arm!(dct, adst, 4, 16, false);
gen_itx_arm!(dct, adst, 16, 4, false);
gen_itx_arm!(dct, adst, 8, 16, false);
gen_itx_arm!(dct, adst, 16, 8, false);
gen_itx_arm!(dct, adst, 16, 16, false);
// flipadst row kernel, flipadst column kernel (8x8 included from here on).
gen_itx_arm!(flipadst, flipadst, 4, 8, false);
gen_itx_arm!(flipadst, flipadst, 8, 4, false);
gen_itx_arm!(flipadst, flipadst, 4, 16, false);
gen_itx_arm!(flipadst, flipadst, 16, 4, false);
gen_itx_arm!(flipadst, flipadst, 8, 8, false);
gen_itx_arm!(flipadst, flipadst, 8, 16, false);
gen_itx_arm!(flipadst, flipadst, 16, 8, false);
gen_itx_arm!(flipadst, flipadst, 16, 16, false);
// dct row kernel, flipadst column kernel.
gen_itx_arm!(dct, flipadst, 4, 8, false);
gen_itx_arm!(dct, flipadst, 8, 4, false);
gen_itx_arm!(dct, flipadst, 4, 16, false);
gen_itx_arm!(dct, flipadst, 16, 4, false);
gen_itx_arm!(dct, flipadst, 8, 8, false);
gen_itx_arm!(dct, flipadst, 8, 16, false);
gen_itx_arm!(dct, flipadst, 16, 8, false);
gen_itx_arm!(dct, flipadst, 16, 16, false);
// flipadst row kernel, dct column kernel.
gen_itx_arm!(flipadst, dct, 4, 8, false);
gen_itx_arm!(flipadst, dct, 8, 4, false);
gen_itx_arm!(flipadst, dct, 4, 16, false);
gen_itx_arm!(flipadst, dct, 16, 4, false);
gen_itx_arm!(flipadst, dct, 8, 8, false);
gen_itx_arm!(flipadst, dct, 8, 16, false);
gen_itx_arm!(flipadst, dct, 16, 8, false);
gen_itx_arm!(flipadst, dct, 16, 16, false);
// adst row kernel, flipadst column kernel.
gen_itx_arm!(adst, flipadst, 4, 8, false);
gen_itx_arm!(adst, flipadst, 8, 4, false);
gen_itx_arm!(adst, flipadst, 4, 16, false);
gen_itx_arm!(adst, flipadst, 16, 4, false);
gen_itx_arm!(adst, flipadst, 8, 8, false);
gen_itx_arm!(adst, flipadst, 8, 16, false);
gen_itx_arm!(adst, flipadst, 16, 8, false);
gen_itx_arm!(adst, flipadst, 16, 16, false);
// flipadst row kernel, adst column kernel.
gen_itx_arm!(flipadst, adst, 4, 8, false);
gen_itx_arm!(flipadst, adst, 8, 4, false);
gen_itx_arm!(flipadst, adst, 4, 16, false);
gen_itx_arm!(flipadst, adst, 16, 4, false);
gen_itx_arm!(flipadst, adst, 8, 8, false);
gen_itx_arm!(flipadst, adst, 8, 16, false);
gen_itx_arm!(flipadst, adst, 16, 8, false);
gen_itx_arm!(flipadst, adst, 16, 16, false);
// dct row kernel, identity column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(dct, identity, 4, 4, false);
gen_itx_arm!(dct, identity, 4, 8, false);
gen_itx_arm!(dct, identity, 8, 4, false);
gen_itx_arm!(dct, identity, 4, 16, false);
gen_itx_arm!(dct, identity, 16, 4, false);
gen_itx_arm!(dct, identity, 8, 8, false);
gen_itx_arm!(dct, identity, 8, 16, false);
gen_itx_arm!(dct, identity, 16, 8, false);
gen_itx_arm!(dct, identity, 16, 16, false);
// identity row kernel, dct column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(identity, dct, 4, 4, false);
gen_itx_arm!(identity, dct, 4, 8, false);
gen_itx_arm!(identity, dct, 8, 4, false);
gen_itx_arm!(identity, dct, 4, 16, false);
gen_itx_arm!(identity, dct, 16, 4, false);
gen_itx_arm!(identity, dct, 8, 8, false);
gen_itx_arm!(identity, dct, 8, 16, false);
gen_itx_arm!(identity, dct, 16, 8, false);
gen_itx_arm!(identity, dct, 16, 16, false);
// adst row kernel, identity column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(adst, identity, 4, 4, false);
gen_itx_arm!(adst, identity, 4, 8, false);
gen_itx_arm!(adst, identity, 8, 4, false);
gen_itx_arm!(adst, identity, 4, 16, false);
gen_itx_arm!(adst, identity, 16, 4, false);
gen_itx_arm!(adst, identity, 8, 8, false);
gen_itx_arm!(adst, identity, 8, 16, false);
gen_itx_arm!(adst, identity, 16, 8, false);
gen_itx_arm!(adst, identity, 16, 16, false);
// identity row kernel, adst column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(identity, adst, 4, 4, false);
gen_itx_arm!(identity, adst, 4, 8, false);
gen_itx_arm!(identity, adst, 8, 4, false);
gen_itx_arm!(identity, adst, 4, 16, false);
gen_itx_arm!(identity, adst, 16, 4, false);
gen_itx_arm!(identity, adst, 8, 8, false);
gen_itx_arm!(identity, adst, 8, 16, false);
gen_itx_arm!(identity, adst, 16, 8, false);
gen_itx_arm!(identity, adst, 16, 16, false);
// flipadst row kernel, identity column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(flipadst, identity, 4, 4, false);
gen_itx_arm!(flipadst, identity, 4, 8, false);
gen_itx_arm!(flipadst, identity, 8, 4, false);
gen_itx_arm!(flipadst, identity, 4, 16, false);
gen_itx_arm!(flipadst, identity, 16, 4, false);
gen_itx_arm!(flipadst, identity, 8, 8, false);
gen_itx_arm!(flipadst, identity, 8, 16, false);
gen_itx_arm!(flipadst, identity, 16, 8, false);
gen_itx_arm!(flipadst, identity, 16, 16, false);
// identity row kernel, flipadst column kernel (4x4 is generated for 16 bpc only).
gen_itx_arm_16bpc!(identity, flipadst, 4, 4, false);
gen_itx_arm!(identity, flipadst, 4, 8, false);
gen_itx_arm!(identity, flipadst, 8, 4, false);
gen_itx_arm!(identity, flipadst, 4, 16, false);
gen_itx_arm!(identity, flipadst, 16, 4, false);
gen_itx_arm!(identity, flipadst, 8, 8, false);
gen_itx_arm!(identity, flipadst, 8, 16, false);
gen_itx_arm!(identity, flipadst, 16, 8, false);
gen_itx_arm!(identity, flipadst, 16, 16, false);
// identity/identity: only the rectangular sizes, and only at 8 bpc.
gen_itx_arm_8bpc!(identity, identity, 4, 8, false);
gen_itx_arm_8bpc!(identity, identity, 8, 4, false);
gen_itx_arm_8bpc!(identity, identity, 4, 16, false);
gen_itx_arm_8bpc!(identity, identity, 16, 4, false);
gen_itx_arm_8bpc!(identity, identity, 8, 16, false);
gen_itx_arm_8bpc!(identity, identity, 16, 8, false);
use crate::include::common::bitdepth::BPC;
use crate::include::common::bitdepth::BitDepth;
use crate::src::levels::ADST_ADST;
use crate::src::levels::ADST_DCT;
use crate::src::levels::ADST_FLIPADST;
use crate::src::levels::DCT_ADST;
use crate::src::levels::DCT_DCT;
use crate::src::levels::DCT_FLIPADST;
use crate::src::levels::FLIPADST_ADST;
use crate::src::levels::FLIPADST_DCT;
use crate::src::levels::FLIPADST_FLIPADST;
use crate::src::levels::H_ADST;
use crate::src::levels::H_DCT;
use crate::src::levels::H_FLIPADST;
use crate::src::levels::IDTX;
use crate::src::levels::TxfmSize;
use crate::src::levels::TxfmType;
use crate::src::levels::V_ADST;
use crate::src::levels::V_DCT;
use crate::src::levels::V_FLIPADST;
use crate::src::levels::WHT_WHT;
use crate::src::strided::Strided as _;
/// Generates a function `$fn_name` that dispatches a `(tx_size, tx_type)` pair to the
/// matching `inv_txfm_add_<type>_<w>x<h>_<bpc>bpc_<ext>` function in `$mod_path`.
///
/// The generated function is compiled only with the `asm` feature. It returns `true`
/// after calling the selected function and `false` when no arm matches.
///
/// Macro arguments:
/// * `itx16` - sizes for which all 16 transform types are dispatched.
/// * `itx12` - sizes for which 12 types are dispatched (no `H_ADST`, `V_ADST`,
///   `H_FLIPADST`, `V_FLIPADST`).
/// * `itx2` - sizes for which only `DCT_DCT` and `IDTX` are dispatched.
/// * `itx1` - sizes for which only `DCT_DCT` is dispatched.
/// * `wht` - the size for which `WHT_WHT` is dispatched.
/// * `$bpc bpc`, `$ext` - bit depth and suffix spliced into the target function names.
/// * `h_*_fn` / `v_*_fn` - name fragments used for the `H_*` / `V_*` transform types.
///
/// Each size entry is `(constant, w, h)` where `constant` is one of the `s*`/`r*`
/// constants defined inside the generated function.
macro_rules! impl_itxfm_direct_dispatch {
(
fn $fn_name:ident, $mod_path:path,
itx16: [$(($sz16:expr, $w16:literal, $h16:literal)),* $(,)?],
itx12: [$(($sz12:expr, $w12:literal, $h12:literal)),* $(,)?],
itx2: [$(($sz2:expr, $w2:literal, $h2:literal)),* $(,)?],
itx1: [$(($sz1:expr, $w1:literal, $h1:literal)),* $(,)?],
wht: ($szw:expr, $ww:literal, $hw:literal),
$bpc:literal bpc, $ext:ident,
h_dct_fn: $h_dct_fn:ident, v_dct_fn: $v_dct_fn:ident,
h_adst_fn: $h_adst_fn:ident, v_adst_fn: $v_adst_fn:ident,
h_flipadst_fn: $h_flipadst_fn:ident, v_flipadst_fn: $v_flipadst_fn:ident
) => {
paste::paste! {
#[cfg(feature = "asm")]
#[allow(non_upper_case_globals)]
fn $fn_name(
tx_size: usize,
tx_type: usize,
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: i32,
bitdepth_max: i32,
coeff_len: u16,
dst: *const FFISafe<PicOffset>,
) -> bool {
use $mod_path as si;
// `c!(f)` calls `f` with the full argument list and reports success to the caller.
macro_rules! c {
($func:expr) => {{
unsafe { $func(dst_ptr, dst_stride, coeff, eob, bitdepth_max, coeff_len, dst) };
return true;
}};
}
// Lower-case aliases for the `TxfmSize` discriminants so they can be used as
// constant patterns in the match below (hence `non_upper_case_globals`).
const s4x4: usize = TxfmSize::S4x4 as usize;
const s8x8: usize = TxfmSize::S8x8 as usize;
const s16x16: usize = TxfmSize::S16x16 as usize;
const s32x32: usize = TxfmSize::S32x32 as usize;
const s64x64: usize = TxfmSize::S64x64 as usize;
const r4x8: usize = TxfmSize::R4x8 as usize;
const r8x4: usize = TxfmSize::R8x4 as usize;
const r8x16: usize = TxfmSize::R8x16 as usize;
const r16x8: usize = TxfmSize::R16x8 as usize;
const r16x32: usize = TxfmSize::R16x32 as usize;
const r32x16: usize = TxfmSize::R32x16 as usize;
const r32x64: usize = TxfmSize::R32x64 as usize;
const r64x32: usize = TxfmSize::R64x32 as usize;
const r4x16: usize = TxfmSize::R4x16 as usize;
const r16x4: usize = TxfmSize::R16x4 as usize;
const r8x32: usize = TxfmSize::R8x32 as usize;
const r32x8: usize = TxfmSize::R32x8 as usize;
const r16x64: usize = TxfmSize::R16x64 as usize;
const r64x16: usize = TxfmSize::R64x16 as usize;
// Note the naming: for the mixed types the function-name fragments appear in the
// opposite order to the `TxfmType` constant (e.g. `ADST_DCT` -> `dct_adst`).
match (tx_size, tx_type as TxfmType) {
($szw, WHT_WHT) => c!(si::[<inv_txfm_add_wht_wht_ $ww x $hw _ $bpc bpc_ $ext>]),
// Sizes with all 16 transform types.
$(
($sz16, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, ADST_DCT) => c!(si::[<inv_txfm_add_dct_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, DCT_ADST) => c!(si::[<inv_txfm_add_adst_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, ADST_ADST) => c!(si::[<inv_txfm_add_adst_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, FLIPADST_DCT) => c!(si::[<inv_txfm_add_dct_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, DCT_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, FLIPADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, ADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, FLIPADST_ADST) => c!(si::[<inv_txfm_add_adst_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, H_DCT) => c!(si::[<inv_txfm_add_ $h_dct_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, V_DCT) => c!(si::[<inv_txfm_add_ $v_dct_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, H_ADST) => c!(si::[<inv_txfm_add_ $h_adst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, V_ADST) => c!(si::[<inv_txfm_add_ $v_adst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, H_FLIPADST) => c!(si::[<inv_txfm_add_ $h_flipadst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
($sz16, V_FLIPADST) => c!(si::[<inv_txfm_add_ $v_flipadst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
)*
// Sizes with 12 transform types (no H/V ADST or FLIPADST).
$(
($sz12, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, ADST_DCT) => c!(si::[<inv_txfm_add_dct_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, DCT_ADST) => c!(si::[<inv_txfm_add_adst_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, ADST_ADST) => c!(si::[<inv_txfm_add_adst_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, FLIPADST_DCT) => c!(si::[<inv_txfm_add_dct_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, DCT_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, FLIPADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, ADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, FLIPADST_ADST) => c!(si::[<inv_txfm_add_adst_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, H_DCT) => c!(si::[<inv_txfm_add_ $h_dct_fn _ $w12 x $h12 _ $bpc bpc_ $ext>]),
($sz12, V_DCT) => c!(si::[<inv_txfm_add_ $v_dct_fn _ $w12 x $h12 _ $bpc bpc_ $ext>]),
)*
// Sizes with only DCT_DCT and IDTX.
$(
($sz2, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w2 x $h2 _ $bpc bpc_ $ext>]),
($sz2, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w2 x $h2 _ $bpc bpc_ $ext>]),
)*
// Sizes with only DCT_DCT.
$(
($sz1, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w1 x $h1 _ $bpc bpc_ $ext>]),
)*
// Anything else is not handled here.
_ => return false,
}
}
}
};
}
// Defines `itxfm_add_direct_arm_8bpc`, which dispatches to the
// `inv_txfm_add_*_8bpc_neon` functions in `crate::src::safe_simd::itx_arm`.
// The size lists say how many transform types each block size dispatches:
// 16 types, 12 types, DCT_DCT + IDTX, or DCT_DCT only.
impl_itxfm_direct_dispatch!(
fn itxfm_add_direct_arm_8bpc, crate::src::safe_simd::itx_arm,
itx16: [
(s4x4, 4, 4),
(s8x8, 8, 8),
(r4x8, 4, 8), (r8x4, 8, 4),
(r4x16, 4, 16), (r16x4, 16, 4),
(r8x16, 8, 16), (r16x8, 16, 8),
],
itx12: [
(s16x16, 16, 16),
],
itx2: [
(r8x32, 8, 32), (r32x8, 32, 8),
(r16x32, 16, 32), (r32x16, 32, 16),
(s32x32, 32, 32),
],
itx1: [
(r16x64, 16, 64), (r32x64, 32, 64),
(r64x16, 64, 16), (r64x32, 64, 32),
(s64x64, 64, 64),
],
wht: (s4x4, 4, 4),
8 bpc, neon,
h_dct_fn: dct_identity, v_dct_fn: identity_dct,
h_adst_fn: adst_identity, v_adst_fn: identity_adst,
h_flipadst_fn: flipadst_identity, v_flipadst_fn: identity_flipadst
);
// Defines `itxfm_add_direct_arm_16bpc`, which dispatches to the
// `inv_txfm_add_*_16bpc_neon` functions in `crate::src::safe_simd::itx_arm`.
// The size lists are the same as for the 8 bpc instantiation; only the bit depth
// spliced into the target function names differs.
impl_itxfm_direct_dispatch!(
fn itxfm_add_direct_arm_16bpc, crate::src::safe_simd::itx_arm,
itx16: [
(s4x4, 4, 4),
(s8x8, 8, 8),
(r4x8, 4, 8), (r8x4, 8, 4),
(r4x16, 4, 16), (r16x4, 16, 4),
(r8x16, 8, 16), (r16x8, 16, 8),
],
itx12: [
(s16x16, 16, 16),
],
itx2: [
(r8x32, 8, 32), (r32x8, 32, 8),
(r16x32, 16, 32), (r32x16, 32, 16),
(s32x32, 32, 32),
],
itx1: [
(r16x64, 16, 64), (r32x64, 32, 64),
(r64x16, 64, 16), (r64x32, 64, 32),
(s64x64, 64, 64),
],
wht: (s4x4, 4, 4),
16 bpc, neon,
h_dct_fn: dct_identity, v_dct_fn: identity_dct,
h_adst_fn: adst_identity, v_adst_fn: identity_adst,
h_flipadst_fn: flipadst_identity, v_flipadst_fn: identity_flipadst
);
/// Tries to run an inverse transform + add through an ARM-specific implementation.
///
/// Returns `true` when an implementation for `(tx_size, tx_type)` was found and run,
/// and `false` when nothing matched (the caller presumably falls back to another
/// implementation — confirm at the call sites).
///
/// Behaviour depends on the build configuration:
/// * Without the `asm` feature on aarch64: an `archmage::Arm64` token is summoned
///   (returning `false` if unavailable) and the safe `*_neon_inner` functions are
///   called. Only 8 bpc is handled, and only for the block sizes 4x4, 16x16, 32x32,
///   8x32, 32x8, 16x32, 32x16, 64x64, 64x32, 32x64, 16x64 and 64x16.
/// * Without the `asm` feature on other architectures: always returns `false`.
/// * With the `asm` feature: raw pointers are built and passed to
///   `itxfm_add_direct_arm_8bpc` / `itxfm_add_direct_arm_16bpc`.
///
/// Parameters:
/// * `tx_size` - numeric `TxfmSize` value.
/// * `tx_type` - numeric transform type, compared against the `levels::*` constants.
/// * `dst` - destination picture offset; a `w` x `h` strided region is borrowed from it.
/// * `coeff` - coefficient buffer, reinterpreted as `i16` on the non-`asm` 8 bpc path.
/// * `eob` - forwarded unchanged to the selected implementation.
/// * `bd` - bit-depth descriptor; `bd.into_c()` is forwarded as the bit-depth maximum.
///
/// Panics on the non-`asm` path if `coeff` cannot be reinterpreted as `[i16]`.
pub fn itxfm_add_dispatch<BD: BitDepth>(
tx_size: usize,
tx_type: usize,
dst: PicOffset,
coeff: &mut [BD::Coef],
eob: i32,
bd: BD,
) -> bool {
#[cfg(not(feature = "asm"))]
{
#[cfg(target_arch = "aarch64")]
{
use crate::include::common::bitdepth::BPC;
use crate::src::levels::{self, TxfmSize};
use crate::src::strided::Strided as _;
use zerocopy::IntoBytes;
use archmage::SimdToken;
// No token means the required CPU support is unavailable at runtime.
let Some(token) = archmage::Arm64::summon() else {
return false;
};
// Unknown size values are rejected rather than guessed.
let txfm = match TxfmSize::from_repr(tx_size) {
Some(t) => t,
None => return false,
};
let (w, h) = txfm.to_wh();
// Every branch below follows the same shape: borrow the destination region,
// view destination and coefficients as `u8` / `i16`, then match on the
// transform type. An unmatched type returns `false` without calling a kernel.
// 4x4, 8 bpc: all 16 transform types plus WHT_WHT.
if w == 4 && h == 4 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_4x4::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::ADST_ADST => inv_txfm_add_adst_adst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_FLIPADST => {
inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
)
}
levels::IDTX => inv_txfm_add_identity_identity_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::ADST_DCT => inv_txfm_add_dct_adst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::DCT_ADST => inv_txfm_add_adst_dct_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_DCT => inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::DCT_FLIPADST => inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::ADST_FLIPADST => inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_ADST => inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_DCT => inv_txfm_add_dct_identity_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_DCT => inv_txfm_add_identity_dct_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_ADST => inv_txfm_add_adst_identity_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_ADST => inv_txfm_add_identity_adst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_FLIPADST => inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_FLIPADST => inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
// The WHT kernel lives in its own module, so it is called by full path.
levels::WHT_WHT => {
super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
);
}
_ => return false,
}
return true;
}
// 16x16, 8 bpc: all 16 transform types.
if w == 16 && h == 16 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_16x16::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::ADST_ADST => inv_txfm_add_adst_adst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::ADST_DCT => inv_txfm_add_dct_adst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::DCT_ADST => inv_txfm_add_adst_dct_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_DCT => inv_txfm_add_dct_flipadst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::DCT_FLIPADST => inv_txfm_add_flipadst_dct_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_FLIPADST => {
inv_txfm_add_flipadst_flipadst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
)
}
levels::ADST_FLIPADST => inv_txfm_add_flipadst_adst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::FLIPADST_ADST => inv_txfm_add_adst_flipadst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_DCT => inv_txfm_add_dct_identity_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_DCT => inv_txfm_add_identity_dct_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_ADST => inv_txfm_add_adst_identity_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_ADST => inv_txfm_add_identity_adst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::H_FLIPADST => inv_txfm_add_flipadst_identity_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::V_FLIPADST => inv_txfm_add_identity_flipadst_16x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 32x32, 8 bpc: DCT_DCT and IDTX only.
if w == 32 && h == 32 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_32::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_32x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_32x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 8x32, 8 bpc: DCT_DCT and IDTX only.
if w == 8 && h == 32 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_large_rect::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_8x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_8x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 32x8, 8 bpc: DCT_DCT and IDTX only.
if w == 32 && h == 8 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_large_rect::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_32x8_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_32x8_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 16x32, 8 bpc: DCT_DCT and IDTX only.
if w == 16 && h == 32 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_large_rect::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_16x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_16x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 32x16, 8 bpc: DCT_DCT and IDTX only.
if w == 32 && h == 16 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_large_rect::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_32x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_32x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 64x64, 8 bpc: DCT_DCT and IDTX only.
if w == 64 && h == 64 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_64::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_64x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_64x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 64x32, 8 bpc: DCT_DCT and IDTX only.
if w == 64 && h == 32 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_64::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_64x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_64x32_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 32x64, 8 bpc: DCT_DCT and IDTX only.
if w == 32 && h == 64 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_64::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_32x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_32x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 16x64, 8 bpc: DCT_DCT and IDTX only.
if w == 16 && h == 64 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_64::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_16x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_16x64_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// 64x16, 8 bpc: DCT_DCT and IDTX only.
if w == 64 && h == 16 && BD::BPC == BPC::BPC8 {
let byte_stride_i = dst.stride();
let bd_c = bd.into_c();
let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
let dst_u8: &mut [u8] = guard.as_mut_bytes();
let coeff_i16: &mut [i16] =
zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
.expect("coeff alignment/size mismatch for i16 reinterpretation");
use super::itx_arm_neon_64::*;
let tx_t = tx_type as u8;
match tx_t {
levels::DCT_DCT => inv_txfm_add_dct_dct_64x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
levels::IDTX => inv_txfm_add_identity_identity_64x16_8bpc_neon_inner(
token,
dst_u8,
base,
byte_stride_i,
coeff_i16,
eob,
bd_c,
),
_ => return false,
}
return true;
}
// Any other size, or any bit depth other than 8 bpc, is not handled here.
return false;
}
#[cfg(not(target_arch = "aarch64"))]
{
// Consume the arguments so they do not trigger unused-variable warnings.
let _ = (tx_size, tx_type, &dst, coeff, eob, &bd);
return false;
}
}
#[cfg(feature = "asm")]
{
use crate::src::levels::TxfmSize;
use zerocopy::IntoBytes;
// NOTE(review): unlike the non-`asm` path, an unknown `tx_size` falls back to the
// default `TxfmSize` here instead of returning `false`; the region borrowed below
// is then sized for that default. Confirm this is intended.
let txfm = TxfmSize::from_repr(tx_size).unwrap_or_default();
let (w, h) = txfm.to_wh();
// NOTE(review): the pointer is taken at the start of the borrowed region and
// `_dst_base` is ignored. If `strided_slice_mut` can return a non-zero base
// (e.g. for negative strides), `dst_ptr` would not address the block's first
// pixel — confirm.
let (mut dst_guard, _dst_base) = dst.strided_slice_mut::<BD>(w, h);
let dst_ptr: *mut DynPixel = dst_guard.as_mut_bytes().as_mut_ptr() as *mut DynPixel;
let dst_stride = dst.stride();
let coeff_len = coeff.len() as u16;
let coeff_ptr = coeff.as_mut_ptr().cast();
let bd_c = bd.into_c();
let dst_ffi = FFISafe::new(&dst);
// The match is the block's tail expression, so the helper's `bool` is the result.
match BD::BPC {
BPC::BPC8 => itxfm_add_direct_arm_8bpc(
tx_size, tx_type, dst_ptr, dst_stride, coeff_ptr, eob, bd_c, coeff_len, dst_ffi,
),
BPC::BPC16 => itxfm_add_direct_arm_16bpc(
tx_size, tx_type, dst_ptr, dst_stride, coeff_ptr, eob, bd_c, coeff_len, dst_ffi,
),
}
}
}
#[cfg(all(test, target_arch = "aarch64"))]
mod bench_autoversion_vs_neon {
use archmage::SimdToken as _;
use std::time::Instant;
/// Times `f` and returns the median wall-clock duration of a single call, in nanoseconds.
///
/// `f` is first called 100 times untimed as warm-up, then `iters` more times with
/// every call timed individually. The median and mean of those samples are printed
/// to stdout, labelled with `name`.
///
/// When `iters` is 0 there are no samples to summarise: the warm-up still runs,
/// a note is printed, and `0` is returned instead of panicking on the empty
/// sample set.
fn bench_fn(name: &str, iters: u32, mut f: impl FnMut()) -> u64 {
    const WARMUP_CALLS: u32 = 100;
    for _ in 0..WARMUP_CALLS {
        f();
    }
    if iters == 0 {
        println!(" {name}: no timed iterations");
        return 0;
    }
    let mut times: Vec<u64> = (0..iters)
        .map(|_| {
            let start = Instant::now();
            f();
            // `as_nanos` yields u128; a single sample comfortably fits in u64.
            start.elapsed().as_nanos() as u64
        })
        .collect();
    // Plain integers: stability is irrelevant, so skip the allocation of a stable sort.
    times.sort_unstable();
    // For an even sample count this picks the upper of the two middle values.
    let median = times[times.len() / 2];
    let mean = times.iter().sum::<u64>() / times.len() as u64;
    println!(" {name}: median={median}ns mean={mean}ns");
    median
}
/// Generates an ignored `#[test]` that benchmarks an autoversioned 8-bit inverse
/// transform against a hand-written NEON implementation and then asserts that both
/// write identical destination pixels.
///
/// Arguments, in order: the test function name, block width, block height,
/// `autoversioned: <path>`, `neon: <path>` (called with a leading token argument),
/// and the number of coefficients in the block.
macro_rules! bench_transform {
    (
        $test_name:ident, $w:expr, $h:expr,
        autoversioned: $av_fn:path,
        neon: $neon_fn:path,
        $coeff_count:expr
    ) => {
        #[test]
        #[ignore]
        fn $test_name() {
            let token = archmage::Arm64::summon().expect("NEON required");

            // Each destination row carries 16 bytes of padding past the block width.
            let stride: isize = ($w + 16) as isize;
            let dst_len = ($h as usize) * (stride as usize) + $w;
            let base = 0usize;
            let eob = $coeff_count as i32 - 1;
            let bitdepth_max = 255;
            let iters = 100_000u32;

            // Deterministic coefficients in -256..=255, then a large value in slot 0.
            let mut template: Vec<i16> = (0usize..$coeff_count)
                .map(|i| (((i * 37 + 13) % 512) as i16) - 256)
                .collect();
            template[0] = 1000;

            let mut dst_av = vec![128u8; dst_len];
            let mut dst_neon = vec![128u8; dst_len];
            let mut coeff = template.clone();

            println!("\n=== {} ({}x{}) ===", stringify!($test_name), $w, $h);

            // Both timed closures reset the destination and coefficients first, so
            // every sample starts from the same state.
            let av_ns = bench_fn("autoversioned", iters, || {
                dst_av.fill(128);
                coeff.copy_from_slice(&template);
                $av_fn(&mut dst_av, base, stride, &mut coeff, eob, bitdepth_max);
            });
            let neon_ns = bench_fn("hand-written NEON", iters, || {
                dst_neon.fill(128);
                coeff.copy_from_slice(&template);
                $neon_fn(token, &mut dst_neon, base, stride, &mut coeff, eob, bitdepth_max);
            });

            // One untimed run of each implementation from a clean state to compare output.
            dst_av.fill(128);
            dst_neon.fill(128);
            let mut coeff_av = template.clone();
            let mut coeff_neon = template.clone();
            $av_fn(&mut dst_av, base, stride, &mut coeff_av, eob, bitdepth_max);
            $neon_fn(token, &mut dst_neon, base, stride, &mut coeff_neon, eob, bitdepth_max);
            assert_eq!(dst_av, dst_neon, "Output mismatch!");

            let ratio = av_ns as f64 / neon_ns as f64;
            println!(" ratio: {ratio:.2}x (autoversioned / NEON, <1 = autoversioned faster)");
        }
    };
}
// Benchmark instantiations. Each one pairs an autoversioned function from the parent
// module with the NEON function of the matching name from a sibling module; the
// trailing number is the coefficient count passed to the macro (width * height here).

// 4x4 blocks (16 coefficients): DCT, identity, ADST and WHT.
bench_transform!(bench_dct_dct_4x4, 4, 4,
autoversioned: super::inv_txfm_add_dct_dct_4x4_8bpc_inner,
neon: super::super::itx_arm_neon_4x4::inv_txfm_add_dct_dct_4x4_8bpc_neon_inner,
16
);
bench_transform!(bench_identity_4x4, 4, 4,
autoversioned: super::inv_txfm_add_identity_identity_4x4_8bpc_inner,
neon: super::super::itx_arm_neon_4x4::inv_txfm_add_identity_identity_4x4_8bpc_neon_inner,
16
);
bench_transform!(bench_adst_adst_4x4, 4, 4,
autoversioned: super::inv_txfm_add_adst_adst_4x4_8bpc_inner,
neon: super::super::itx_arm_neon_4x4::inv_txfm_add_adst_adst_4x4_8bpc_neon_inner,
16
);
bench_transform!(bench_wht_wht_4x4, 4, 4,
autoversioned: super::inv_txfm_add_wht_wht_4x4_8bpc_inner,
neon: super::super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner,
16
);
// 8x8 blocks (64 coefficients): DCT and identity.
bench_transform!(bench_dct_dct_8x8, 8, 8,
autoversioned: super::inv_txfm_add_dct_dct_8x8_8bpc_inner,
neon: super::super::itx_arm_neon_8x8::inv_txfm_add_dct_dct_8x8_8bpc_neon_inner,
64
);
bench_transform!(bench_identity_8x8, 8, 8,
autoversioned: super::inv_txfm_add_identity_identity_8x8_8bpc_inner,
neon: super::super::itx_arm_neon_8x8::inv_txfm_add_identity_identity_8x8_8bpc_neon_inner,
64
);
// 16x16 block (256 coefficients): DCT only.
bench_transform!(bench_dct_dct_16x16, 16, 16,
autoversioned: super::inv_txfm_add_dct_dct_16x16_8bpc_inner,
neon: super::super::itx_arm_neon_16x16::inv_txfm_add_dct_dct_16x16_8bpc_neon_inner,
256
);
// 32x32 block (1024 coefficients): DCT only.
bench_transform!(bench_dct_dct_32x32, 32, 32,
autoversioned: super::inv_txfm_add_dct_dct_32x32_8bpc_inner,
neon: super::super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_8bpc_neon_inner,
1024
);
}