rav1d-safe 0.5.5

//! Safe ARM NEON implementations for Inverse Transform (ITX)
//!
//! Implements the inverse transforms for AV1 decoding.
//! Transforms convert frequency-domain coefficients back to spatial-domain pixels.

#![allow(unused_imports)]
#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

use archmage::autoversion;
use std::ffi::c_int;

use crate::include::common::bitdepth::AsPrimitive;
use crate::include::common::bitdepth::BitDepth8;
use crate::include::common::bitdepth::BitDepth16;
use crate::include::common::bitdepth::DynCoef;
use crate::include::common::bitdepth::DynPixel;
use crate::include::common::intops::iclip;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;

// ============================================================================
// WHT_WHT 4x4 TRANSFORM
// ============================================================================

/// WHT 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_wht_wht_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform: load from column-major, store row-major
    for y in 0..4 {
        let in0 = coeff[y] as i32 >> 2;
        let in1 = coeff[y + 4] as i32 >> 2;
        let in2 = coeff[y + 8] as i32 >> 2;
        let in3 = coeff[y + 12] as i32 >> 2;

        let t0 = in0 + in1;
        let t2 = in2 - in3;
        let t4 = (t0 - t2) >> 1;
        let t3 = t4 - in3;
        let t1 = t4 - in1;

        tmp[y * 4 + 0] = t0 - t3;
        tmp[y * 4 + 1] = t3;
        tmp[y * 4 + 2] = t1;
        tmp[y * 4 + 3] = t2 + t1;
    }

    // Column transform
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let t0 = in0 + in1;
        let t2 = in2 - in3;
        let t4 = (t0 - t2) >> 1;
        let t3 = t4 - in3;
        let t1 = t4 - in1;

        tmp[0 * 4 + x] = t0 - t3;
        tmp[1 * 4 + x] = t3;
        tmp[2 * 4 + x] = t1;
        tmp[3 * 4 + x] = t2 + t1;
    }

    // Add to destination
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let c = tmp[y * 4 + x];
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    // Clear coefficients
    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// WHT 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_wht_wht_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform
    for y in 0..4 {
        let in0 = coeff[y] >> 2;
        let in1 = coeff[y + 4] >> 2;
        let in2 = coeff[y + 8] >> 2;
        let in3 = coeff[y + 12] >> 2;

        let t0 = in0 + in1;
        let t2 = in2 - in3;
        let t4 = (t0 - t2) >> 1;
        let t3 = t4 - in3;
        let t1 = t4 - in1;

        tmp[y * 4 + 0] = t0 - t3;
        tmp[y * 4 + 1] = t3;
        tmp[y * 4 + 2] = t1;
        tmp[y * 4 + 3] = t2 + t1;
    }

    // Column transform
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let t0 = in0 + in1;
        let t2 = in2 - in3;
        let t4 = (t0 - t2) >> 1;
        let t3 = t4 - in3;
        let t1 = t4 - in1;

        tmp[0 * 4 + x] = t0 - t3;
        tmp[1 * 4 + x] = t3;
        tmp[2 * 4 + x] = t1;
        tmp[3 * 4 + x] = t2 + t1;
    }

    // Add to destination
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let c = tmp[y * 4 + x];
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    // Clear coefficients
    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// DCT_DCT 4x4 TRANSFORM
// ============================================================================

/// DCT4 1D transform constants
const DCT4_C1: i32 = 2896; // cos(pi/8) * 4096
const DCT4_C2: i32 = 2896; // cos(3pi/8) * 4096
const DCT4_C3: i32 = 1567; // sin(pi/8) * 4096
const DCT4_C4: i32 = 3784; // sin(3pi/8) * 4096

/// DCT4 1D transform
#[inline(always)]
fn dct4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
    // Stage 1
    let t0 = in0 + in3;
    let t1 = in1 + in2;
    let t2 = in0 - in3;
    let t3 = in1 - in2;

    // Stage 2 (DCT2)
    let s0 = t0 + t1;
    let s1 = t0 - t1;

    // Rotation for t2, t3
    let s2 = ((t2 * 1567 + t3 * 3784) + 2048) >> 12;
    let s3 = ((t2 * 3784 - t3 * 1567) + 2048) >> 12;

    [s0, s2, s1, s3]
}

/// DCT 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_dct_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform (coefficients are in column-major order)
    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    // Column transform and add to dst
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        // Round and add to destination
        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            // Apply rounding: (val + 8) >> 4
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    // Clear coefficients
    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// DCT 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform
    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    // Column transform and add to dst
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    // Clear coefficients
    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// DCT_DCT 8x8 TRANSFORM
// ============================================================================

/// DCT8 1D transform constants
const COS_PI_1_16: i32 = 4017; // cos(pi/16) * 4096
const COS_PI_2_16: i32 = 3784; // cos(2*pi/16) * 4096
const COS_PI_3_16: i32 = 3406; // cos(3*pi/16) * 4096
const COS_PI_4_16: i32 = 2896; // cos(4*pi/16) * 4096 = sqrt(2)/2 * 4096
const COS_PI_5_16: i32 = 2276; // cos(5*pi/16) * 4096
const COS_PI_6_16: i32 = 1567; // cos(6*pi/16) * 4096
const COS_PI_7_16: i32 = 799; // cos(7*pi/16) * 4096

/// DCT8 1D transform
#[inline(always)]
fn dct8_1d(input: &[i32; 8]) -> [i32; 8] {
    // Stage 1: butterfly
    let t0 = input[0] + input[7];
    let t7 = input[0] - input[7];
    let t1 = input[1] + input[6];
    let t6 = input[1] - input[6];
    let t2 = input[2] + input[5];
    let t5 = input[2] - input[5];
    let t3 = input[3] + input[4];
    let t4 = input[3] - input[4];

    // Stage 2: DCT4 on even terms
    let e0 = t0 + t3;
    let e3 = t0 - t3;
    let e1 = t1 + t2;
    let e2 = t1 - t2;

    let out0 = e0 + e1;
    let out4 = e0 - e1;
    let out2 = ((e2 * 1567 + e3 * 3784) + 2048) >> 12;
    let out6 = ((e3 * 1567 - e2 * 3784) + 2048) >> 12;

    // Stage 2: Rotations on odd terms
    let o0 = ((t4 * 799 + t7 * 4017) + 2048) >> 12;
    let o7 = ((t7 * 799 - t4 * 4017) + 2048) >> 12;
    let o1 = ((t5 * 2276 + t6 * 3406) + 2048) >> 12;
    let o6 = ((t6 * 2276 - t5 * 3406) + 2048) >> 12;

    let out1 = o0 + o1;
    let out3 = o7 - o6;
    let out5 = o7 + o6;
    let out7 = o0 - o1;

    [out0, out1, out2, out3, out4, out5, out6, out7]
}

/// DCT 8x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    // Row transform (coefficients in column-major order)
    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    // Column transform and add to dst
    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    // Clear coefficients
    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// DCT 8x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8];
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

// ============================================================================
// DCT_DCT 16x16 TRANSFORM
// ============================================================================

/// DCT16 1D transform
#[inline(always)]
fn dct16_1d(input: &[i32; 16]) -> [i32; 16] {
    // Stage 1: butterfly
    let mut t = [0i32; 16];
    for i in 0..8 {
        t[i] = input[i] + input[15 - i];
        t[15 - i] = input[i] - input[15 - i];
    }

    // Apply DCT8 to even terms (t[0..8])
    let even_input = [t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7]];
    let even_out = dct8_1d(&even_input);

    // Odd terms need rotation pairs
    let o0 = ((t[8] * 401 + t[15] * 4076) + 2048) >> 12;
    let o15 = ((t[15] * 401 - t[8] * 4076) + 2048) >> 12;
    let o1 = ((t[9] * 1189 + t[14] * 3920) + 2048) >> 12;
    let o14 = ((t[14] * 1189 - t[9] * 3920) + 2048) >> 12;
    let o2 = ((t[10] * 1931 + t[13] * 3612) + 2048) >> 12;
    let o13 = ((t[13] * 1931 - t[10] * 3612) + 2048) >> 12;
    let o3 = ((t[11] * 2598 + t[12] * 3166) + 2048) >> 12;
    let o12 = ((t[12] * 2598 - t[11] * 3166) + 2048) >> 12;

    // Butterfly on odd terms
    let a0 = o0 + o1;
    let a1 = o0 - o1;
    let a2 = o2 + o3;
    let a3 = o2 - o3;
    let a4 = o12 + o13;
    let a5 = o12 - o13;
    let a6 = o14 + o15;
    let a7 = o14 - o15;

    // Final rotations
    let b1 = ((a1 * 1567 + a6 * 3784) + 2048) >> 12;
    let b6 = ((a6 * 1567 - a1 * 3784) + 2048) >> 12;
    let b3 = ((a3 * 3784 + a4 * 1567) + 2048) >> 12;
    let b4 = ((a4 * 3784 - a3 * 1567) + 2048) >> 12;

    let mut out = [0i32; 16];
    out[0] = even_out[0];
    out[1] = a0 + a2;
    out[2] = even_out[1];
    out[3] = b1 + b3;
    out[4] = even_out[2];
    out[5] = a7 + a5;
    out[6] = even_out[3];
    out[7] = b6 + b4;
    out[8] = even_out[4];
    out[9] = b6 - b4;
    out[10] = even_out[5];
    out[11] = a7 - a5;
    out[12] = even_out[6];
    out[13] = b1 - b3;
    out[14] = even_out[7];
    out[15] = a0 - a2;

    out
}

/// DCT 16x16 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    // Row transform
    for y in 0..16 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 16] as i32;
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    // Column transform and add to dst
    for x in 0..16 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 128) >> 8;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

/// DCT 16x16 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    for y in 0..16 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 16];
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    for x in 0..16 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 128) >> 8;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

// ============================================================================
// IDENTITY TRANSFORMS
// ============================================================================

/// Identity 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_identity_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let sqrt2 = 181i32; // sqrt(2) * 128

    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let c = coeff[y + x * 4] as i32;
            // Scale by sqrt(2)^2 / 16 = 2/16 = 1/8
            // Actually: c * sqrt2 * sqrt2 / (128 * 128 * 4) with proper rounding
            let scaled = ((c * sqrt2 + 64) >> 7) * sqrt2;
            let final_val = (scaled + 2048) >> 12;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// Identity 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_identity_identity_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let sqrt2 = 181i32;

    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..4 {
            let c = coeff[y + x * 4];
            let scaled = ((c * sqrt2 + 64) >> 7) * sqrt2;
            let final_val = (scaled + 2048) >> 12;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// Identity 8x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_identity_identity_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    for y in 0..8 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..8 {
            let c = coeff[y + x * 8] as i32;
            // For 8x8, scale is 2 (no sqrt2 multiplication needed)
            let final_val = (c + 1) >> 1;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// Identity 8x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_identity_identity_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    for y in 0..8 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..8 {
            let c = coeff[y + x * 8];
            let final_val = (c + 1) >> 1;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// Identity 16x16 transform for 8bpc
#[autoversion]
fn inv_txfm_add_identity_identity_16x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let sqrt2 = 181i32;

    for y in 0..16 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..16 {
            let c = coeff[y + x * 16] as i32;
            // 16x16 scale: 2*sqrt(2)
            let scaled = (c * sqrt2 + 64) >> 7;
            let final_val = (scaled + 1) >> 1;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

/// Identity 16x16 transform for 16bpc
#[autoversion]
fn inv_txfm_add_identity_identity_16x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let sqrt2 = 181i32;

    for y in 0..16 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..16 {
            let c = coeff[y + x * 16];
            let scaled = (c * sqrt2 + 64) >> 7;
            let final_val = (scaled + 1) >> 1;
            let d = dst[row_off + x] as i32;
            let result = iclip(d + final_val, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

// ============================================================================
// ADST TRANSFORMS
// ============================================================================

/// ADST4 1D transform
#[inline(always)]
fn adst4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
    const SINPI_1_9: i32 = 1321;
    const SINPI_2_9: i32 = 2482;
    const SINPI_3_9: i32 = 3344;
    const SINPI_4_9: i32 = 3803;

    let s0 = SINPI_1_9 * in0;
    let s1 = SINPI_2_9 * in0;
    let s2 = SINPI_3_9 * in1;
    let s3 = SINPI_4_9 * in2;
    let s4 = SINPI_1_9 * in2;
    let s5 = SINPI_2_9 * in3;
    let s6 = SINPI_4_9 * in3;

    let x0 = s0 + s3 + s5;
    let x1 = s1 - s4 - s6;
    let x2 = SINPI_3_9 * (in0 - in2 + in3);
    let x3 = s2;

    let s0 = x0 + x3;
    let s1 = x1 + x3;
    let s2 = x2;
    let s3 = x0 + x1 - x3;

    [
        (s0 + 2048) >> 12,
        (s1 + 2048) >> 12,
        (s2 + 2048) >> 12,
        (s3 + 2048) >> 12,
    ]
}

/// ADST 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_adst_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform
    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    // Column transform and add to dst
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// ADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_adst_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// ADST8 1D transform
#[inline(always)]
fn adst8_1d(input: &[i32; 8]) -> [i32; 8] {
    const COSPI_2: i32 = 4091;
    const COSPI_6: i32 = 3973;
    const COSPI_10: i32 = 3703;
    const COSPI_14: i32 = 3290;
    const COSPI_18: i32 = 2751;
    const COSPI_22: i32 = 2106;
    const COSPI_26: i32 = 1380;
    const COSPI_30: i32 = 601;

    let x0 = input[7];
    let x1 = input[0];
    let x2 = input[5];
    let x3 = input[2];
    let x4 = input[3];
    let x5 = input[4];
    let x6 = input[1];
    let x7 = input[6];

    // stage 1
    let s0 = ((x0 * COSPI_2 + x1 * COSPI_30) + 2048) >> 12;
    let s1 = ((x0 * COSPI_30 - x1 * COSPI_2) + 2048) >> 12;
    let s2 = ((x2 * COSPI_10 + x3 * COSPI_22) + 2048) >> 12;
    let s3 = ((x2 * COSPI_22 - x3 * COSPI_10) + 2048) >> 12;
    let s4 = ((x4 * COSPI_18 + x5 * COSPI_14) + 2048) >> 12;
    let s5 = ((x4 * COSPI_14 - x5 * COSPI_18) + 2048) >> 12;
    let s6 = ((x6 * COSPI_26 + x7 * COSPI_6) + 2048) >> 12;
    let s7 = ((x6 * COSPI_6 - x7 * COSPI_26) + 2048) >> 12;

    // stage 2
    let x0 = s0 + s4;
    let x1 = s1 + s5;
    let x2 = s2 + s6;
    let x3 = s3 + s7;
    let x4 = s0 - s4;
    let x5 = s1 - s5;
    let x6 = s2 - s6;
    let x7 = s3 - s7;

    // stage 3
    let s4 = ((x4 * 1567 + x5 * 3784) + 2048) >> 12;
    let s5 = ((x4 * 3784 - x5 * 1567) + 2048) >> 12;
    let s6 = ((-x6 * 3784 + x7 * 1567) + 2048) >> 12;
    let s7 = ((x6 * 1567 + x7 * 3784) + 2048) >> 12;

    // stage 4
    let x0 = x0 + x2;
    let x1 = x1 + x3;
    let x2_new = x0 - x2 - x2;
    let x3_new = x1 - x3 - x3;
    let x4 = s4 + s6;
    let x5 = s5 + s7;
    let x6 = s4 - s6;
    let x7 = s5 - s7;

    // stage 5
    let s2 = ((x2_new * 2896 + x3_new * 2896) + 2048) >> 12;
    let s3 = ((x2_new * 2896 - x3_new * 2896) + 2048) >> 12;
    let s6 = ((x6 * 2896 + x7 * 2896) + 2048) >> 12;
    let s7 = ((x6 * 2896 - x7 * 2896) + 2048) >> 12;

    [x0, -x4, s2, -s6, s3, -x5, s7, -x1]
}

/// ADST 8x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_adst_adst_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = adst8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = adst8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// ADST 8x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_adst_adst_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8];
        }
        let out = adst8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = adst8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

// ============================================================================
// FLIPADST TRANSFORMS
// ============================================================================

/// FlipADST4 1D transform (ADST with input flip)
#[inline(always)]
fn flipadst4_1d(in0: i32, in1: i32, in2: i32, in3: i32) -> [i32; 4] {
    adst4_1d(in3, in2, in1, in0)
}

/// FlipADST 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// FlipADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_flipadst_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// HYBRID TRANSFORMS (DCT/ADST combinations)
// ============================================================================

/// DCT-ADST 4x4 transform for 8bpc (DCT on rows, ADST on columns)
#[autoversion]
pub(crate) fn inv_txfm_add_dct_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform: DCT
    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    // Column transform: ADST
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// ADST-DCT 4x4 transform for 8bpc (ADST on rows, DCT on columns)
#[autoversion]
pub(crate) fn inv_txfm_add_adst_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    // Row transform: ADST
    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    // Column transform: DCT
    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// DCT-ADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// ADST-DCT 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_adst_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// DCT-FLIPADST AND FLIPADST-DCT HYBRID TRANSFORMS
// ============================================================================

/// DCT-FLIPADST 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_dct_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// FLIPADST-DCT 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// DCT-FLIPADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = dct4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// FLIPADST-DCT 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_flipadst_dct_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// ADST-FLIPADST AND FLIPADST-ADST TRANSFORMS
// ============================================================================

/// ADST-FLIPADST 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_adst_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// FLIPADST-ADST 4x4 transform for 8bpc
#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 4] as i32;
        let in2 = coeff[y + 8] as i32;
        let in3 = coeff[y + 12] as i32;

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// ADST-FLIPADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_adst_flipadst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = adst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = flipadst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

/// FLIPADST-ADST 4x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_flipadst_adst_4x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];

    for y in 0..4 {
        let in0 = coeff[y];
        let in1 = coeff[y + 4];
        let in2 = coeff[y + 8];
        let in3 = coeff[y + 12];

        let out = flipadst4_1d(in0, in1, in2, in3);
        tmp[y * 4 + 0] = out[0];
        tmp[y * 4 + 1] = out[1];
        tmp[y * 4 + 2] = out[2];
        tmp[y * 4 + 3] = out[3];
    }

    for x in 0..4 {
        let in0 = tmp[0 * 4 + x];
        let in1 = tmp[1 * 4 + x];
        let in2 = tmp[2 * 4 + x];
        let in3 = tmp[3 * 4 + x];

        let out = adst4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 8) >> 4;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..16 {
        coeff[i] = 0;
    }
}

// ============================================================================
// FFI WRAPPERS
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    // Use NEON implementation when available, fall back to scalar
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_wht_wht_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_wht_wht_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_dct_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// DCT_DCT 8x8 FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_dct_dct_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_dct_8x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// DCT_DCT 16x16 FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// IDENTITY FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_identity_identity_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_identity_identity_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_identity_identity_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_identity_identity_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_identity_identity_8x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_identity_identity_8x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_identity_identity_16x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_identity_identity_16x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ADST_ADST FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_adst_adst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_adst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_adst_adst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_adst_adst_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_adst_8x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_adst_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_adst_adst_8x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// FLIPADST_FLIPADST FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_flipadst_flipadst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_flipadst_flipadst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// DCT_ADST and ADST_DCT FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_dct_adst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_adst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_adst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_adst_dct_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_dct_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_adst_dct_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// DCT_FLIPADST and FLIPADST_DCT FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_flipadst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_flipadst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_flipadst_dct_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_dct_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_flipadst_dct_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ADST_FLIPADST and FLIPADST_ADST FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_flipadst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_flipadst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_adst_flipadst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_flipadst_adst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_adst_4x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_flipadst_adst_4x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 4x8 and 8x4
// ============================================================================

/// DCT 4x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_4x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 32];

    // Row transform (4 columns, 8 rows): DCT4 on rows
    for y in 0..8 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 8] as i32;
        let in2 = coeff[y + 16] as i32;
        let in3 = coeff[y + 24] as i32;

        let out = dct4_1d(in0, in1, in2, in3);
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }

    // Column transform: DCT8 on columns
    for x in 0..4 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 4 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 16) >> 5;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..32 {
        coeff[i] = 0;
    }
}

/// DCT 4x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_4x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 32];

    for y in 0..8 {
        let in0 = coeff[y];
        let in1 = coeff[y + 8];
        let in2 = coeff[y + 16];
        let in3 = coeff[y + 24];

        let out = dct4_1d(in0, in1, in2, in3);
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }

    for x in 0..4 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 4 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 16) >> 5;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..32 {
        coeff[i] = 0;
    }
}

/// DCT 8x4 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 32];

    // Row transform: DCT8 on rows
    for y in 0..4 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 4] as i32;
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    // Column transform: DCT4 on columns
    for x in 0..8 {
        let in0 = tmp[0 * 8 + x];
        let in1 = tmp[1 * 8 + x];
        let in2 = tmp[2 * 8 + x];
        let in3 = tmp[3 * 8 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 16) >> 5;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..32 {
        coeff[i] = 0;
    }
}

/// DCT 8x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 32];

    for y in 0..4 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 4];
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let in0 = tmp[0 * 8 + x];
        let in1 = tmp[1 * 8 + x];
        let in2 = tmp[2 * 8 + x];
        let in3 = tmp[3 * 8 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 16) >> 5;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..32 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 8x16 and 16x8
// ============================================================================

/// DCT 8x16 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 128];

    // Row transform: DCT8 on 16 rows
    for y in 0..16 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 16] as i32;
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    // Column transform: DCT16 on columns
    for x in 0..8 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..128 {
        coeff[i] = 0;
    }
}

/// DCT 8x16 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 128];

    for y in 0..16 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 16];
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..128 {
        coeff[i] = 0;
    }
}

/// DCT 16x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 128];

    // Row transform: DCT16 on 8 rows
    for y in 0..8 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    // Column transform: DCT8 on columns
    for x in 0..16 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..128 {
        coeff[i] = 0;
    }
}

/// DCT 16x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 128];

    for y in 0..8 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 8];
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    for x in 0..16 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..128 {
        coeff[i] = 0;
    }
}

// ============================================================================
// 8x8 HYBRID TRANSFORMS
// ============================================================================

/// DCT-ADST 8x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_adst_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = adst8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// ADST-DCT 8x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_adst_dct_8x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = adst8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// DCT-ADST 8x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_adst_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8];
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = adst8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// ADST-DCT 8x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_adst_dct_8x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..8 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 8];
        }
        let out = adst8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 32) >> 6;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR FFI WRAPPERS
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 32) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_4x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_4x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 32) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 128) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 128) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// 8x8 Hybrid FFI Wrappers
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_dct_adst_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_adst_8x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_adst_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_adst_8x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_8x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_8x8::inv_txfm_add_adst_dct_8x8_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_dct_8x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_dct_8x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_adst_dct_8x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// DCT 32x32 TRANSFORM
// ============================================================================

/// DCT32 1D transform
#[inline(always)]
fn dct32_1d(input: &[i32; 32]) -> [i32; 32] {
    // Stage 1: butterfly
    let mut t = [0i32; 32];
    for i in 0..16 {
        t[i] = input[i] + input[31 - i];
        t[31 - i] = input[i] - input[31 - i];
    }

    // DCT16 on even terms (t[0..16])
    let even_input = [
        t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], t[13],
        t[14], t[15],
    ];
    let even_out = dct16_1d(&even_input);

    // Odd terms need rotation pairs (simplified version)
    // Using approximate constants for 32-point DCT
    let c1 = 4091; // cos(pi/64) * 4096
    let c3 = 4076; // cos(3pi/64) * 4096
    let c5 = 4017; // cos(5pi/64) * 4096
    let c7 = 3920; // cos(7pi/64) * 4096
    let c9 = 3784; // cos(9pi/64) * 4096
    let c11 = 3612; // cos(11pi/64) * 4096
    let c13 = 3406; // cos(13pi/64) * 4096
    let c15 = 3166; // cos(15pi/64) * 4096

    // Simplified odd term processing
    let mut odd = [0i32; 16];
    for i in 0..8 {
        let idx = 16 + i;
        let o0 = t[idx];
        let o1 = t[31 - i];
        let cos_val = match i {
            0 => c1,
            1 => c3,
            2 => c5,
            3 => c7,
            4 => c9,
            5 => c11,
            6 => c13,
            7 => c15,
            _ => c1,
        };
        let sin_val = 4096 - cos_val / 16;
        odd[i] = ((o0 * cos_val + o1 * sin_val) + 2048) >> 12;
        odd[15 - i] = ((o1 * cos_val - o0 * sin_val) + 2048) >> 12;
    }

    // Interleave even and odd outputs
    let mut out = [0i32; 32];
    for i in 0..16 {
        out[2 * i] = even_out[i];
        out[2 * i + 1] = odd[i];
    }

    out
}

/// DCT 32x32 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 1024];

    // Row transform
    for y in 0..32 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 32] as i32;
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    // Column transform and add to dst
    for x in 0..32 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 512) >> 10;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..1024 {
        coeff[i] = 0;
    }
}

/// DCT 32x32 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 1024];

    for y in 0..32 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 32];
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    for x in 0..32 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 512) >> 10;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..1024 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 4x16 and 16x4
// ============================================================================

/// DCT 4x16 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_4x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    // Row transform: DCT4 on 16 rows
    for y in 0..16 {
        let in0 = coeff[y] as i32;
        let in1 = coeff[y + 16] as i32;
        let in2 = coeff[y + 32] as i32;
        let in3 = coeff[y + 48] as i32;

        let out = dct4_1d(in0, in1, in2, in3);
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }

    // Column transform: DCT16 on columns
    for x in 0..4 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 4 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// DCT 4x16 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_4x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..16 {
        let in0 = coeff[y];
        let in1 = coeff[y + 16];
        let in2 = coeff[y + 32];
        let in3 = coeff[y + 48];

        let out = dct4_1d(in0, in1, in2, in3);
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }

    for x in 0..4 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 4 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// DCT 16x4 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    // Row transform: DCT16 on 4 rows
    for y in 0..4 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 4] as i32;
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    // Column transform: DCT4 on columns
    for x in 0..16 {
        let in0 = tmp[0 * 16 + x];
        let in1 = tmp[1 * 16 + x];
        let in2 = tmp[2 * 16 + x];
        let in3 = tmp[3 * 16 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

/// DCT 16x4 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x4_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64];

    for y in 0..4 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 4];
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    for x in 0..16 {
        let in0 = tmp[0 * 16 + x];
        let in1 = tmp[1 * 16 + x];
        let in2 = tmp[2 * 16 + x];
        let in3 = tmp[3 * 16 + x];

        let out = dct4_1d(in0, in1, in2, in3);

        for y in 0..4 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 64) >> 7;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..64 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 16x32 and 32x16
// ============================================================================

/// DCT 16x32 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 512];

    // Row transform: DCT16 on 32 rows
    for y in 0..32 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 32] as i32;
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    // Column transform: DCT32 on columns
    for x in 0..16 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..512 {
        coeff[i] = 0;
    }
}

/// DCT 16x32 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_16x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 512];

    for y in 0..32 {
        let mut input = [0i32; 16];
        for x in 0..16 {
            input[x] = coeff[y + x * 32];
        }
        let out = dct16_1d(&input);
        for x in 0..16 {
            tmp[y * 16 + x] = out[x];
        }
    }

    for x in 0..16 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 16 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..512 {
        coeff[i] = 0;
    }
}

/// DCT 32x16 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 512];

    // Row transform: DCT32 on 16 rows
    for y in 0..16 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 16] as i32;
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    // Column transform: DCT16 on columns
    for x in 0..32 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..512 {
        coeff[i] = 0;
    }
}

/// DCT 32x16 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 512];

    for y in 0..16 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 16];
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    for x in 0..32 {
        let mut input = [0i32; 16];
        for y in 0..16 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct16_1d(&input);

        for y in 0..16 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..512 {
        coeff[i] = 0;
    }
}

// ============================================================================
// LARGER SIZE FFI WRAPPERS
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 1024) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    {
        let token = unsafe { archmage::Arm64::forge_token_dangerously() };
        super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 1024) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    {
        let token = unsafe { archmage::Arm64::forge_token_dangerously() };
        super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_16bpc_neon_inner(
            token,
            dst_slice,
            base,
            stride_u16 as isize,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_4x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_4x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 64) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x4_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x32_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// DCT 64x64 TRANSFORM
// ============================================================================

/// DCT64 1D transform (simplified - uses recursive approach)
#[inline(always)]
fn dct64_1d(input: &[i32; 64]) -> [i32; 64] {
    // Stage 1: butterfly
    let mut t = [0i32; 64];
    for i in 0..32 {
        t[i] = input[i] + input[63 - i];
        t[63 - i] = input[i] - input[63 - i];
    }

    // DCT32 on even terms
    let even_input: [i32; 32] = core::array::from_fn(|i| t[i]);
    let even_out = dct32_1d(&even_input);

    // Simplified odd term processing using rotation pairs
    let mut odd = [0i32; 32];
    for i in 0..16 {
        let idx = 32 + i;
        let o0 = t[idx];
        let o1 = t[63 - i];
        // Simplified rotation with approximate constants
        let cos_val = 4096 - (i as i32 * 64);
        let sin_val = i as i32 * 256;
        odd[i] = ((o0 * cos_val + o1 * sin_val) + 2048) >> 12;
        odd[31 - i] = ((o1 * cos_val - o0 * sin_val) + 2048) >> 12;
    }

    // Interleave even and odd outputs
    let mut out = [0i32; 64];
    for i in 0..32 {
        out[2 * i] = even_out[i];
        out[2 * i + 1] = odd[i];
    }

    out
}

/// DCT 64x64 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_64x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 4096];

    // Row transform
    for y in 0..64 {
        let mut input = [0i32; 64];
        for x in 0..64 {
            input[x] = coeff[y + x * 64] as i32;
        }
        let out = dct64_1d(&input);
        for x in 0..64 {
            tmp[y * 64 + x] = out[x];
        }
    }

    // Column transform and add to dst
    for x in 0..64 {
        let mut input = [0i32; 64];
        for y in 0..64 {
            input[y] = tmp[y * 64 + x];
        }
        let out = dct64_1d(&input);

        for y in 0..64 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 2048) >> 12;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..4096 {
        coeff[i] = 0;
    }
}

/// DCT 64x64 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_64x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 4096];

    for y in 0..64 {
        let mut input = [0i32; 64];
        for x in 0..64 {
            input[x] = coeff[y + x * 64];
        }
        let out = dct64_1d(&input);
        for x in 0..64 {
            tmp[y * 64 + x] = out[x];
        }
    }

    for x in 0..64 {
        let mut input = [0i32; 64];
        for y in 0..64 {
            input[y] = tmp[y * 64 + x];
        }
        let out = dct64_1d(&input);

        for y in 0..64 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 2048) >> 12;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..4096 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 8x32 and 32x8
// ============================================================================

/// DCT 8x32 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    // Row transform: DCT8 on 32 rows
    for y in 0..32 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 32] as i32;
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    // Column transform: DCT32 on columns
    for x in 0..8 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

/// DCT 8x32 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_8x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    for y in 0..32 {
        let mut input = [0i32; 8];
        for x in 0..8 {
            input[x] = coeff[y + x * 32];
        }
        let out = dct8_1d(&input);
        for x in 0..8 {
            tmp[y * 8 + x] = out[x];
        }
    }

    for x in 0..8 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 8 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

/// DCT 32x8 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x8_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    // Row transform: DCT32 on 8 rows
    for y in 0..8 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 8] as i32;
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    // Column transform: DCT8 on columns
    for x in 0..32 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

/// DCT 32x8 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x8_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 256];

    for y in 0..8 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 8];
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    for x in 0..32 {
        let mut input = [0i32; 8];
        for y in 0..8 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct8_1d(&input);

        for y in 0..8 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 256) >> 9;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..256 {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR TRANSFORMS - 32x64 and 64x32
// ============================================================================

/// DCT 32x64 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 2048];

    for y in 0..64 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 64] as i32;
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    for x in 0..32 {
        let mut input = [0i32; 64];
        for y in 0..64 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct64_1d(&input);

        for y in 0..64 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 1024) >> 11;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..2048 {
        coeff[i] = 0;
    }
}

/// DCT 32x64 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_32x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 2048];

    for y in 0..64 {
        let mut input = [0i32; 32];
        for x in 0..32 {
            input[x] = coeff[y + x * 64];
        }
        let out = dct32_1d(&input);
        for x in 0..32 {
            tmp[y * 32 + x] = out[x];
        }
    }

    for x in 0..32 {
        let mut input = [0i32; 64];
        for y in 0..64 {
            input[y] = tmp[y * 32 + x];
        }
        let out = dct64_1d(&input);

        for y in 0..64 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 1024) >> 11;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..2048 {
        coeff[i] = 0;
    }
}

/// DCT 64x32 transform for 8bpc
#[autoversion]
fn inv_txfm_add_dct_dct_64x32_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 2048];

    for y in 0..32 {
        let mut input = [0i32; 64];
        for x in 0..64 {
            input[x] = coeff[y + x * 32] as i32;
        }
        let out = dct64_1d(&input);
        for x in 0..64 {
            tmp[y * 64 + x] = out[x];
        }
    }

    for x in 0..64 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 64 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 1024) >> 11;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..2048 {
        coeff[i] = 0;
    }
}

/// DCT 64x32 transform for 16bpc
#[autoversion]
fn inv_txfm_add_dct_dct_64x32_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 2048];

    for y in 0..32 {
        let mut input = [0i32; 64];
        for x in 0..64 {
            input[x] = coeff[y + x * 32];
        }
        let out = dct64_1d(&input);
        for x in 0..64 {
            tmp[y * 64 + x] = out[x];
        }
    }

    for x in 0..64 {
        let mut input = [0i32; 32];
        for y in 0..32 {
            input[y] = tmp[y * 64 + x];
        }
        let out = dct32_1d(&input);

        for y in 0..32 {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            let d = dst[row_off + x] as i32;
            let c = (out[y] + 1024) >> 11;
            let result = iclip(d + c, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..2048 {
        coeff[i] = 0;
    }
}

// ============================================================================
// LARGE SIZE FFI WRAPPERS
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 4096) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x64_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 4096) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x64_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_8x32_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x8_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x8_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 2048) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x64_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 2048) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_32x64_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 2048) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x32_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 2048) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x32_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// IDENTITY 1D HELPERS
// ============================================================================

#[inline(always)]
fn identity4_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..4 {
        let v = c[i * stride];
        c[i * stride] = ((v * 181 + 128) >> 8) + v;
    }
}

#[inline(always)]
fn identity8_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..8 {
        c[i * stride] *= 2;
    }
}

#[inline(always)]
fn identity16_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..16 {
        c[i * stride] *= 2;
    }
}

#[inline(always)]
fn identity32_1d_arm(c: &mut [i32], stride: usize) {
    for i in 0..32 {
        c[i * stride] *= 4;
    }
}

/// rect2 scale: multiply by sqrt(2) for 2:1 rectangular transforms
#[inline(always)]
fn rect2_scale(v: i32) -> i32 {
    (v * 181 + 128) >> 8
}

// ============================================================================
// GENERIC RECTANGULAR IDENTITY INNER FUNCTIONS
// ============================================================================

/// Generic rectangular identity transform for 8bpc
#[autoversion]
fn identity_rect_8bpc_inner<const W: usize, const H: usize>(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
    row_fn: fn(&mut [i32], usize),
    col_fn: fn(&mut [i32], usize),
    is_rect2: bool,
) {
    let mut tmp = vec![0i32; W * H];

    for y in 0..H {
        let mut row = [0i32; W];
        for x in 0..W {
            let c = coeff[y + x * H] as i32;
            row[x] = if is_rect2 { rect2_scale(c) } else { c };
        }
        row_fn(&mut row, 1);
        for x in 0..W {
            tmp[y * W + x] = row[x];
        }
    }

    for x in 0..W {
        col_fn(&mut tmp[x..], W);
    }

    for y in 0..H {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..W {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * W + x] + 8) >> 4;
            let result = iclip(d + val, 0, bitdepth_max);
            dst[row_off + x] = result as u8;
        }
    }

    for i in 0..(W * H) {
        coeff[i] = 0;
    }
}

/// Generic rectangular identity transform for 16bpc
#[autoversion]
fn identity_rect_16bpc_inner<const W: usize, const H: usize>(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
    row_fn: fn(&mut [i32], usize),
    col_fn: fn(&mut [i32], usize),
    is_rect2: bool,
) {
    let mut tmp = vec![0i32; W * H];

    for y in 0..H {
        let mut row = [0i32; W];
        for x in 0..W {
            let c = coeff[y + x * H];
            row[x] = if is_rect2 { rect2_scale(c) } else { c };
        }
        row_fn(&mut row, 1);
        for x in 0..W {
            tmp[y * W + x] = row[x];
        }
    }

    for x in 0..W {
        col_fn(&mut tmp[x..], W);
    }

    for y in 0..H {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..W {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * W + x] + 8) >> 4;
            let result = iclip(d + val, 0, bitdepth_max);
            dst[row_off + x] = result as u16;
        }
    }

    for i in 0..(W * H) {
        coeff[i] = 0;
    }
}

// ============================================================================
// RECTANGULAR IDENTITY_IDENTITY FFI WRAPPERS - 16BPC ONLY
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<4, 8>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity4_1d_arm,
        identity8_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 32) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<8, 4>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity4_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<4, 16>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity4_1d_arm,
        identity16_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x4_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 64) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<16, 4>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity4_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<8, 16>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity16_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 128) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<16, 8>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity8_1d_arm,
        true,
    );
}

// ============================================================================
// RECTANGULAR IDENTITY_IDENTITY - 8BPC + 16BPC
// ============================================================================

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_8bpc_inner::<8, 32>(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity32_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 8;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 8;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<8, 32>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity8_1d_arm,
        identity32_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x8_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 256) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_8bpc_inner::<32, 8>(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity8_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x8_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 256) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 7usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<32, 8>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity8_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_8bpc_inner::<16, 32>(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity32_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<16, 32>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity16_1d_arm,
        identity32_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 512) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_8bpc_inner::<32, 16>(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity16_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 512) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    identity_rect_16bpc_inner::<32, 16>(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
        identity32_1d_arm,
        identity16_1d_arm,
        true,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x32_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 1024) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    {
        let token = unsafe { archmage::Arm64::forge_token_dangerously() };
        super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x32_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 1024) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 31usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 32;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 32;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    {
        let token = unsafe { archmage::Arm64::forge_token_dangerously() };
        super::itx_arm_neon_32::inv_txfm_add_identity_identity_32x32_16bpc_neon_inner(
            token,
            dst_slice,
            base,
            stride_u16 as isize,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    }
}

// ============================================================================
// HYBRID IDENTITY TRANSFORMS 4x4 8BPC
// ============================================================================

#[autoversion]
pub(crate) fn inv_txfm_add_dct_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let c0 = coeff[y] as i32;
        let c1 = coeff[y + 4] as i32;
        let c2 = coeff[y + 8] as i32;
        let c3 = coeff[y + 12] as i32;
        let out = dct4_1d(c0, c1, c2, c3);
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }
    for x in 0..4 {
        identity4_1d_arm(&mut tmp[x..], 4);
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[autoversion]
pub(crate) fn inv_txfm_add_identity_dct_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let mut row = [0i32; 4];
        for x in 0..4 {
            row[x] = coeff[y + x * 4] as i32;
        }
        identity4_1d_arm(&mut row, 1);
        for x in 0..4 {
            tmp[y * 4 + x] = row[x];
        }
    }
    for x in 0..4 {
        let out = dct4_1d(
            tmp[0 * 4 + x],
            tmp[1 * 4 + x],
            tmp[2 * 4 + x],
            tmp[3 * 4 + x],
        );
        for y in 0..4 {
            tmp[y * 4 + x] = out[y];
        }
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[autoversion]
pub(crate) fn inv_txfm_add_adst_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let out = adst4_1d(
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        );
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }
    for x in 0..4 {
        identity4_1d_arm(&mut tmp[x..], 4);
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[autoversion]
pub(crate) fn inv_txfm_add_identity_adst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let mut row = [0i32; 4];
        for x in 0..4 {
            row[x] = coeff[y + x * 4] as i32;
        }
        identity4_1d_arm(&mut row, 1);
        for x in 0..4 {
            tmp[y * 4 + x] = row[x];
        }
    }
    for x in 0..4 {
        let out = adst4_1d(
            tmp[0 * 4 + x],
            tmp[1 * 4 + x],
            tmp[2 * 4 + x],
            tmp[3 * 4 + x],
        );
        for y in 0..4 {
            tmp[y * 4 + x] = out[y];
        }
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[autoversion]
pub(crate) fn inv_txfm_add_flipadst_identity_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let out = flipadst4_1d(
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        );
        for x in 0..4 {
            tmp[y * 4 + x] = out[x];
        }
    }
    for x in 0..4 {
        identity4_1d_arm(&mut tmp[x..], 4);
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[autoversion]
pub(crate) fn inv_txfm_add_identity_flipadst_4x4_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16];
    for y in 0..4 {
        let mut row = [0i32; 4];
        for x in 0..4 {
            row[x] = coeff[y + x * 4] as i32;
        }
        identity4_1d_arm(&mut row, 1);
        for x in 0..4 {
            tmp[y * 4 + x] = row[x];
        }
    }
    for x in 0..4 {
        let out = flipadst4_1d(
            tmp[0 * 4 + x],
            tmp[1 * 4 + x],
            tmp[2 * 4 + x],
            tmp[3 * 4 + x],
        );
        for y in 0..4 {
            tmp[y * 4 + x] = out[y];
        }
    }
    for y in 0..4 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..4 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 4 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..16 {
        coeff[i] = 0;
    }
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_dct_identity_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_dct_identity_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_dct_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_identity_dct_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_identity_dct_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_adst_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_adst_identity_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_adst_identity_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_adst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_identity_adst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_identity_adst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_flipadst_identity_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_flipadst_identity_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_identity_flipadst_4x4_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 3usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 4;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 4;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    #[cfg(target_arch = "aarch64")]
    if let Some(token) = archmage::Arm64::summon() {
        super::itx_arm_neon_4x4::inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner(
            token,
            dst_slice,
            base,
            dst_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        return;
    }
    inv_txfm_add_identity_flipadst_4x4_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// DCT_DCT 16x64 and 64x16
// ============================================================================

#[autoversion]
fn inv_txfm_add_dct_dct_16x64_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16 * 64];
    for y in 0..64 {
        let mut row = [0i32; 16];
        for x in 0..16 {
            row[x] = rect2_scale(coeff[y + x * 64] as i32);
        }
        let out = dct16_1d(&row);
        for x in 0..16 {
            tmp[y * 16 + x] = (out[x] + 1) >> 1;
        }
    }
    for x in 0..16 {
        let mut col = [0i32; 64];
        for y in 0..64 {
            col[y] = tmp[y * 16 + x];
        }
        let out = dct64_1d(&col);
        for y in 0..64 {
            tmp[y * 16 + x] = out[y];
        }
    }
    for y in 0..64 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..16 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 16 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..(16 * 64) {
        coeff[i] = 0;
    }
}

#[autoversion]
fn inv_txfm_add_dct_dct_16x64_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 16 * 64];
    for y in 0..64 {
        let mut row = [0i32; 16];
        for x in 0..16 {
            row[x] = rect2_scale(coeff[y + x * 64]);
        }
        let out = dct16_1d(&row);
        for x in 0..16 {
            tmp[y * 16 + x] = (out[x] + 1) >> 1;
        }
    }
    for x in 0..16 {
        let mut col = [0i32; 64];
        for y in 0..64 {
            col[y] = tmp[y * 16 + x];
        }
        let out = dct64_1d(&col);
        for y in 0..64 {
            tmp[y * 16 + x] = out[y];
        }
    }
    for y in 0..64 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..16 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 16 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u16;
        }
    }
    for i in 0..(16 * 64) {
        coeff[i] = 0;
    }
}

#[autoversion]
fn inv_txfm_add_dct_dct_64x16_8bpc_inner(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64 * 16];
    for y in 0..16 {
        let mut row = [0i32; 64];
        for x in 0..64 {
            row[x] = rect2_scale(coeff[y + x * 16] as i32);
        }
        let out = dct64_1d(&row);
        for x in 0..64 {
            tmp[y * 64 + x] = (out[x] + 1) >> 1;
        }
    }
    for x in 0..64 {
        let mut col = [0i32; 16];
        for y in 0..16 {
            col[y] = tmp[y * 64 + x];
        }
        let out = dct16_1d(&col);
        for y in 0..16 {
            tmp[y * 64 + x] = out[y];
        }
    }
    for y in 0..16 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..64 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 64 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u8;
        }
    }
    for i in 0..(64 * 16) {
        coeff[i] = 0;
    }
}

#[autoversion]
fn inv_txfm_add_dct_dct_64x16_16bpc_inner(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut tmp = [0i32; 64 * 16];
    for y in 0..16 {
        let mut row = [0i32; 64];
        for x in 0..64 {
            row[x] = rect2_scale(coeff[y + x * 16]);
        }
        let out = dct64_1d(&row);
        for x in 0..64 {
            tmp[y * 64 + x] = (out[x] + 1) >> 1;
        }
    }
    for x in 0..64 {
        let mut col = [0i32; 16];
        for y in 0..16 {
            col[y] = tmp[y * 64 + x];
        }
        let out = dct16_1d(&col);
        for y in 0..16 {
            tmp[y * 64 + x] = out[y];
        }
    }
    for y in 0..16 {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..64 {
            let d = dst[row_off + x] as i32;
            let val = (tmp[y * 64 + x] + 8) >> 4;
            dst[row_off + x] = iclip(d + val, 0, bitdepth_max) as u16;
        }
    }
    for i in 0..(64 * 16) {
        coeff[i] = 0;
    }
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x64_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 1024) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x64_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x64_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 1024) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 63usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 16;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 16;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_16x64_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x16_8bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 1024) };
    let abs_stride = dst_stride.unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if dst_stride >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x16_8bpc_inner(
        dst_slice,
        base,
        dst_stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

#[cfg(all(feature = "asm", target_arch = "aarch64"))]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x16_16bpc_neon(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: i32,
    bitdepth_max: i32,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_u16 = dst_stride / 2;
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 1024) };
    let abs_stride = (stride_u16 as isize).unsigned_abs();
    let rows = 15usize;
    let (dst_slice, base) = if stride_u16 >= 0 {
        let len = rows * abs_stride + 64;
        (
            unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) },
            0usize,
        )
    } else {
        let len = rows * abs_stride + 64;
        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
        (
            unsafe { std::slice::from_raw_parts_mut(start, len) },
            rows * abs_stride,
        )
    };
    inv_txfm_add_dct_dct_64x16_16bpc_inner(
        dst_slice,
        base,
        stride_u16 as isize,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}

// ============================================================================
// GENERIC TRANSFORM ENGINE
// ============================================================================
//
// Instead of writing 234 individual inner functions for every (row_tx, col_tx, WxH, bpc)
// combination, we use the existing 1D transform functions from itx_1d.rs and compose them
// generically, exactly like the C fallback does.

use crate::src::itx_1d::*;
use std::cmp;
use std::num::NonZeroUsize;

type Itx1dFn = fn(&mut [i32], NonZeroUsize, i32, i32);

/// Generic 8bpc inverse transform: apply row_fn across rows, then col_fn down columns,
/// then add residuals to dst pixels.
fn inv_txfm_add_generic_8bpc(
    dst: &mut [u8],
    dst_base: usize,
    dst_stride: isize,
    coeff: &mut [i16],
    eob: i32,
    _bitdepth_max: i32,
    w: usize,
    h: usize,
    shift: u8,
    row_fn: Itx1dFn,
    col_fn: Itx1dFn,
    has_dc_only: bool,
) {
    let is_rect2 = w * 2 == h || h * 2 == w;
    let rnd = if shift > 0 { 1i32 << (shift - 1) } else { 0 };

    // DC-only fast path
    if eob < has_dc_only as i32 {
        let mut dc = coeff[0] as i32;
        coeff[0] = 0;
        if is_rect2 {
            dc = (dc * 181 + 128) >> 8;
        }
        dc = (dc * 181 + 128) >> 8;
        dc = (dc + rnd) >> shift;
        dc = (dc * 181 + 128 + 2048) >> 12;
        for y in 0..h {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
            for x in 0..w {
                let p = dst[row_off + x] as i32 + dc;
                dst[row_off + x] = p.max(0).min(255) as u8;
            }
        }
        return;
    }

    let sh = cmp::min(h, 32);
    let sw = cmp::min(w, 32);

    let row_clip_min = i16::MIN as i32;
    let row_clip_max = i16::MAX as i32;
    let col_clip_min = i16::MIN as i32;
    let col_clip_max = i16::MAX as i32;

    let mut tmp = [0i32; 64 * 64];

    // Row transforms
    for y in 0..sh {
        if is_rect2 {
            for x in 0..sw {
                tmp[y * w + x] = (coeff[y + x * sh] as i32 * 181 + 128) >> 8;
            }
        } else {
            for x in 0..sw {
                tmp[y * w + x] = coeff[y + x * sh] as i32;
            }
        }
        row_fn(
            &mut tmp[y * w..],
            1.try_into().unwrap(),
            row_clip_min,
            row_clip_max,
        );
    }

    // Clear coefficients
    for i in 0..(sh * sw) {
        coeff[i] = 0;
    }

    // Apply shift + clip
    for i in 0..(w * sh) {
        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
    }

    // Column transforms
    for x in 0..w {
        col_fn(
            &mut tmp[x..],
            w.try_into().unwrap(),
            col_clip_min,
            col_clip_max,
        );
    }

    // Add to dst
    for y in 0..h {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride);
        for x in 0..w {
            let p = dst[row_off + x] as i32 + ((tmp[y * w + x] + 8) >> 4);
            dst[row_off + x] = p.max(0).min(255) as u8;
        }
    }
}

/// Generic 16bpc inverse transform
fn inv_txfm_add_generic_16bpc(
    dst: &mut [u16],
    dst_base: usize,
    dst_stride_u16: isize,
    coeff: &mut [i32],
    eob: i32,
    bitdepth_max: i32,
    w: usize,
    h: usize,
    shift: u8,
    row_fn: Itx1dFn,
    col_fn: Itx1dFn,
    has_dc_only: bool,
) {
    let is_rect2 = w * 2 == h || h * 2 == w;
    let rnd = if shift > 0 { 1i32 << (shift - 1) } else { 0 };

    // DC-only fast path
    if eob < has_dc_only as i32 {
        let mut dc = coeff[0];
        coeff[0] = 0;
        if is_rect2 {
            dc = (dc * 181 + 128) >> 8;
        }
        dc = (dc * 181 + 128) >> 8;
        dc = (dc + rnd) >> shift;
        dc = (dc * 181 + 128 + 2048) >> 12;
        for y in 0..h {
            let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
            for x in 0..w {
                let p = dst[row_off + x] as i32 + dc;
                dst[row_off + x] = p.max(0).min(bitdepth_max) as u16;
            }
        }
        return;
    }

    let sh = cmp::min(h, 32);
    let sw = cmp::min(w, 32);

    let row_clip_min = (!bitdepth_max) << 7;
    let row_clip_max = !row_clip_min;
    let col_clip_min = (!bitdepth_max) << 5;
    let col_clip_max = !col_clip_min;

    let mut tmp = [0i32; 64 * 64];

    // Row transforms
    for y in 0..sh {
        if is_rect2 {
            for x in 0..sw {
                tmp[y * w + x] = (coeff[y + x * sh] * 181 + 128) >> 8;
            }
        } else {
            for x in 0..sw {
                tmp[y * w + x] = coeff[y + x * sh];
            }
        }
        row_fn(
            &mut tmp[y * w..],
            1.try_into().unwrap(),
            row_clip_min,
            row_clip_max,
        );
    }

    // Clear coefficients
    for i in 0..(sh * sw) {
        coeff[i] = 0;
    }

    // Apply shift + clip
    for i in 0..(w * sh) {
        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
    }

    // Column transforms
    for x in 0..w {
        col_fn(
            &mut tmp[x..],
            w.try_into().unwrap(),
            col_clip_min,
            col_clip_max,
        );
    }

    // Add to dst
    for y in 0..h {
        let row_off = dst_base.wrapping_add_signed(y as isize * dst_stride_u16);
        for x in 0..w {
            let p = dst[row_off + x] as i32 + ((tmp[y * w + x] + 8) >> 4);
            dst[row_off + x] = p.max(0).min(bitdepth_max) as u16;
        }
    }
}

/// Resolve a 1D transform function by type and size
fn resolve_1d(txfm: &str, n: usize) -> Itx1dFn {
    match (txfm, n) {
        ("dct", 4) => rav1d_inv_dct4_1d_c,
        ("dct", 8) => rav1d_inv_dct8_1d_c,
        ("dct", 16) => rav1d_inv_dct16_1d_c,
        ("dct", 32) => rav1d_inv_dct32_1d_c,
        ("dct", 64) => rav1d_inv_dct64_1d_c,
        ("adst", 4) => rav1d_inv_adst4_1d_c,
        ("adst", 8) => rav1d_inv_adst8_1d_c,
        ("adst", 16) => rav1d_inv_adst16_1d_c,
        ("flipadst", 4) => rav1d_inv_flipadst4_1d_c,
        ("flipadst", 8) => rav1d_inv_flipadst8_1d_c,
        ("flipadst", 16) => rav1d_inv_flipadst16_1d_c,
        ("identity", 4) => rav1d_inv_identity4_1d_c,
        ("identity", 8) => rav1d_inv_identity8_1d_c,
        ("identity", 16) => rav1d_inv_identity16_1d_c,
        ("identity", 32) => rav1d_inv_identity32_1d_c,
        _ => unreachable!("unsupported 1D transform: {} size {}", txfm, n),
    }
}

fn shift_for(w: usize, h: usize) -> u8 {
    match (w, h) {
        (4, 4) => 0,
        (4, 8) => 0,
        (4, 16) => 1,
        (8, 4) => 0,
        (8, 8) => 1,
        (8, 16) => 1,
        (8, 32) => 2,
        (16, 4) => 1,
        (16, 8) => 1,
        (16, 16) => 2,
        (16, 32) => 1,
        (16, 64) => 2,
        (32, 8) => 2,
        (32, 16) => 1,
        (32, 32) => 2,
        (32, 64) => 1,
        (64, 16) => 2,
        (64, 32) => 1,
        (64, 64) => 2,
        _ => unreachable!(),
    }
}

/// Macro to generate FFI wrappers for ARM ITX functions using the generic engine.
/// This avoids writing hundreds of individual inner functions.
///
/// Usage: gen_itx_arm!(row_tx, col_tx, W, H, is_dct_dct);
/// Generates both 8bpc and 16bpc FFI wrappers.
macro_rules! gen_itx_arm {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _8bpc_neon>](
                dst_ptr: *mut DynPixel, dst_stride: isize, coeff: *mut DynCoef,
                eob: i32, bitdepth_max: i32, _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                {
                    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, $w * $h) };
                    let abs_stride = dst_stride.unsigned_abs();
                    let rows = ($h - 1) as usize;
                    let (dst_slice, base) = if dst_stride >= 0 {
                        let len = rows * abs_stride + $w;
                        (unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) }, 0usize)
                    } else {
                        let len = rows * abs_stride + $w;
                        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
                        (unsafe { std::slice::from_raw_parts_mut(start, len) }, rows * abs_stride)
                    };
                    inv_txfm_add_generic_8bpc(
                        dst_slice, base, dst_stride, coeff_slice, eob, bitdepth_max,
                        $w, $h, shift_for($w, $h),
                        resolve_1d(stringify!($row_name), $w),
                        resolve_1d(stringify!($col_name), $h),
                        $is_dct_dct,
                    );
                }
            }

            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _16bpc_neon>](
                dst_ptr: *mut DynPixel, dst_stride: isize, coeff: *mut DynCoef,
                eob: i32, bitdepth_max: i32, _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                {
                    let stride_u16 = dst_stride / 2;
                    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, $w * $h) };
                    let abs_stride = (stride_u16 as isize).unsigned_abs();
                    let rows = ($h - 1) as usize;
                    let (dst_slice, base) = if stride_u16 >= 0 {
                        let len = rows * abs_stride + $w;
                        (unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) }, 0usize)
                    } else {
                        let len = rows * abs_stride + $w;
                        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
                        (unsafe { std::slice::from_raw_parts_mut(start, len) }, rows * abs_stride)
                    };
                    inv_txfm_add_generic_16bpc(
                        dst_slice, base, stride_u16 as isize, coeff_slice, eob, bitdepth_max,
                        $w, $h, shift_for($w, $h),
                        resolve_1d(stringify!($row_name), $w),
                        resolve_1d(stringify!($col_name), $h),
                        $is_dct_dct,
                    );
                }
            }
        }
    };
}

/// 8bpc-only variant (when 16bpc already exists as handwritten)
macro_rules! gen_itx_arm_8bpc {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _8bpc_neon>](
                dst_ptr: *mut DynPixel, dst_stride: isize, coeff: *mut DynCoef,
                eob: i32, bitdepth_max: i32, _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                {
                    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, $w * $h) };
                    let abs_stride = dst_stride.unsigned_abs();
                    let rows = ($h - 1) as usize;
                    let (dst_slice, base) = if dst_stride >= 0 {
                        let len = rows * abs_stride + $w;
                        (unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) }, 0usize)
                    } else {
                        let len = rows * abs_stride + $w;
                        let start = unsafe { (dst_ptr as *mut u8).offset(rows as isize * dst_stride) };
                        (unsafe { std::slice::from_raw_parts_mut(start, len) }, rows * abs_stride)
                    };
                    inv_txfm_add_generic_8bpc(
                        dst_slice, base, dst_stride, coeff_slice, eob, bitdepth_max,
                        $w, $h, shift_for($w, $h),
                        resolve_1d(stringify!($row_name), $w),
                        resolve_1d(stringify!($col_name), $h),
                        $is_dct_dct,
                    );
                }
            }
        }
    };
}

/// 16bpc-only variant (when 8bpc already exists as handwritten)
macro_rules! gen_itx_arm_16bpc {
    ($row_name:ident, $col_name:ident, $w:literal, $h:literal, $is_dct_dct:expr) => {
        paste::paste! {
            #[cfg(all(feature = "asm", target_arch = "aarch64"))]
            pub unsafe extern "C" fn [<inv_txfm_add_ $row_name _ $col_name _ $w x $h _16bpc_neon>](
                dst_ptr: *mut DynPixel, dst_stride: isize, coeff: *mut DynCoef,
                eob: i32, bitdepth_max: i32, _coeff_len: u16,
                _dst: *const FFISafe<PicOffset>,
            ) {
                {
                    let stride_u16 = dst_stride / 2;
                    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, $w * $h) };
                    let abs_stride = (stride_u16 as isize).unsigned_abs();
                    let rows = ($h - 1) as usize;
                    let (dst_slice, base) = if stride_u16 >= 0 {
                        let len = rows * abs_stride + $w;
                        (unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, len) }, 0usize)
                    } else {
                        let len = rows * abs_stride + $w;
                        let start = unsafe { (dst_ptr as *mut u16).offset(rows as isize * stride_u16) };
                        (unsafe { std::slice::from_raw_parts_mut(start, len) }, rows * abs_stride)
                    };
                    inv_txfm_add_generic_16bpc(
                        dst_slice, base, stride_u16 as isize, coeff_slice, eob, bitdepth_max,
                        $w, $h, shift_for($w, $h),
                        resolve_1d(stringify!($row_name), $w),
                        resolve_1d(stringify!($col_name), $h),
                        $is_dct_dct,
                    );
                }
            }
        }
    };
}

// Generate all missing ADST_ADST variants
gen_itx_arm!(adst, adst, 4, 8, false);
gen_itx_arm!(adst, adst, 8, 4, false);
gen_itx_arm!(adst, adst, 4, 16, false);
gen_itx_arm!(adst, adst, 16, 4, false);
gen_itx_arm!(adst, adst, 8, 16, false);
gen_itx_arm!(adst, adst, 16, 8, false);
gen_itx_arm!(adst, adst, 16, 16, false);

// ADST_DCT
gen_itx_arm!(adst, dct, 4, 8, false);
gen_itx_arm!(adst, dct, 8, 4, false);
gen_itx_arm!(adst, dct, 4, 16, false);
gen_itx_arm!(adst, dct, 16, 4, false);
gen_itx_arm!(adst, dct, 8, 16, false);
gen_itx_arm!(adst, dct, 16, 8, false);
gen_itx_arm!(adst, dct, 16, 16, false);

// DCT_ADST
gen_itx_arm!(dct, adst, 4, 8, false);
gen_itx_arm!(dct, adst, 8, 4, false);
gen_itx_arm!(dct, adst, 4, 16, false);
gen_itx_arm!(dct, adst, 16, 4, false);
gen_itx_arm!(dct, adst, 8, 16, false);
gen_itx_arm!(dct, adst, 16, 8, false);
gen_itx_arm!(dct, adst, 16, 16, false);

// FLIPADST_FLIPADST
gen_itx_arm!(flipadst, flipadst, 4, 8, false);
gen_itx_arm!(flipadst, flipadst, 8, 4, false);
gen_itx_arm!(flipadst, flipadst, 4, 16, false);
gen_itx_arm!(flipadst, flipadst, 16, 4, false);
gen_itx_arm!(flipadst, flipadst, 8, 8, false);
gen_itx_arm!(flipadst, flipadst, 8, 16, false);
gen_itx_arm!(flipadst, flipadst, 16, 8, false);
gen_itx_arm!(flipadst, flipadst, 16, 16, false);

// DCT_FLIPADST
gen_itx_arm!(dct, flipadst, 4, 8, false);
gen_itx_arm!(dct, flipadst, 8, 4, false);
gen_itx_arm!(dct, flipadst, 4, 16, false);
gen_itx_arm!(dct, flipadst, 16, 4, false);
gen_itx_arm!(dct, flipadst, 8, 8, false);
gen_itx_arm!(dct, flipadst, 8, 16, false);
gen_itx_arm!(dct, flipadst, 16, 8, false);
gen_itx_arm!(dct, flipadst, 16, 16, false);

// FLIPADST_DCT
gen_itx_arm!(flipadst, dct, 4, 8, false);
gen_itx_arm!(flipadst, dct, 8, 4, false);
gen_itx_arm!(flipadst, dct, 4, 16, false);
gen_itx_arm!(flipadst, dct, 16, 4, false);
gen_itx_arm!(flipadst, dct, 8, 8, false);
gen_itx_arm!(flipadst, dct, 8, 16, false);
gen_itx_arm!(flipadst, dct, 16, 8, false);
gen_itx_arm!(flipadst, dct, 16, 16, false);

// ADST_FLIPADST
gen_itx_arm!(adst, flipadst, 4, 8, false);
gen_itx_arm!(adst, flipadst, 8, 4, false);
gen_itx_arm!(adst, flipadst, 4, 16, false);
gen_itx_arm!(adst, flipadst, 16, 4, false);
gen_itx_arm!(adst, flipadst, 8, 8, false);
gen_itx_arm!(adst, flipadst, 8, 16, false);
gen_itx_arm!(adst, flipadst, 16, 8, false);
gen_itx_arm!(adst, flipadst, 16, 16, false);

// FLIPADST_ADST
gen_itx_arm!(flipadst, adst, 4, 8, false);
gen_itx_arm!(flipadst, adst, 8, 4, false);
gen_itx_arm!(flipadst, adst, 4, 16, false);
gen_itx_arm!(flipadst, adst, 16, 4, false);
gen_itx_arm!(flipadst, adst, 8, 8, false);
gen_itx_arm!(flipadst, adst, 8, 16, false);
gen_itx_arm!(flipadst, adst, 16, 8, false);
gen_itx_arm!(flipadst, adst, 16, 16, false);

// Identity hybrid transforms: dct_identity (H_DCT = row DCT, col identity)
// 4x4: 8bpc handwritten, generate 16bpc only
gen_itx_arm_16bpc!(dct, identity, 4, 4, false);
gen_itx_arm!(dct, identity, 4, 8, false);
gen_itx_arm!(dct, identity, 8, 4, false);
gen_itx_arm!(dct, identity, 4, 16, false);
gen_itx_arm!(dct, identity, 16, 4, false);
gen_itx_arm!(dct, identity, 8, 8, false);
gen_itx_arm!(dct, identity, 8, 16, false);
gen_itx_arm!(dct, identity, 16, 8, false);
gen_itx_arm!(dct, identity, 16, 16, false);

// identity_dct (V_DCT = row identity, col DCT)
gen_itx_arm_16bpc!(identity, dct, 4, 4, false);
gen_itx_arm!(identity, dct, 4, 8, false);
gen_itx_arm!(identity, dct, 8, 4, false);
gen_itx_arm!(identity, dct, 4, 16, false);
gen_itx_arm!(identity, dct, 16, 4, false);
gen_itx_arm!(identity, dct, 8, 8, false);
gen_itx_arm!(identity, dct, 8, 16, false);
gen_itx_arm!(identity, dct, 16, 8, false);
gen_itx_arm!(identity, dct, 16, 16, false);

// adst_identity (H_ADST = row ADST, col identity)
gen_itx_arm_16bpc!(adst, identity, 4, 4, false);
gen_itx_arm!(adst, identity, 4, 8, false);
gen_itx_arm!(adst, identity, 8, 4, false);
gen_itx_arm!(adst, identity, 4, 16, false);
gen_itx_arm!(adst, identity, 16, 4, false);
gen_itx_arm!(adst, identity, 8, 8, false);
gen_itx_arm!(adst, identity, 8, 16, false);
gen_itx_arm!(adst, identity, 16, 8, false);
gen_itx_arm!(adst, identity, 16, 16, false);

// identity_adst (V_ADST = row identity, col ADST)
gen_itx_arm_16bpc!(identity, adst, 4, 4, false);
gen_itx_arm!(identity, adst, 4, 8, false);
gen_itx_arm!(identity, adst, 8, 4, false);
gen_itx_arm!(identity, adst, 4, 16, false);
gen_itx_arm!(identity, adst, 16, 4, false);
gen_itx_arm!(identity, adst, 8, 8, false);
gen_itx_arm!(identity, adst, 8, 16, false);
gen_itx_arm!(identity, adst, 16, 8, false);
gen_itx_arm!(identity, adst, 16, 16, false);

// flipadst_identity (H_FLIPADST = row flipadst, col identity)
gen_itx_arm_16bpc!(flipadst, identity, 4, 4, false);
gen_itx_arm!(flipadst, identity, 4, 8, false);
gen_itx_arm!(flipadst, identity, 8, 4, false);
gen_itx_arm!(flipadst, identity, 4, 16, false);
gen_itx_arm!(flipadst, identity, 16, 4, false);
gen_itx_arm!(flipadst, identity, 8, 8, false);
gen_itx_arm!(flipadst, identity, 8, 16, false);
gen_itx_arm!(flipadst, identity, 16, 8, false);
gen_itx_arm!(flipadst, identity, 16, 16, false);

// identity_flipadst (V_FLIPADST = row identity, col flipadst)
gen_itx_arm_16bpc!(identity, flipadst, 4, 4, false);
gen_itx_arm!(identity, flipadst, 4, 8, false);
gen_itx_arm!(identity, flipadst, 8, 4, false);
gen_itx_arm!(identity, flipadst, 4, 16, false);
gen_itx_arm!(identity, flipadst, 16, 4, false);
gen_itx_arm!(identity, flipadst, 8, 8, false);
gen_itx_arm!(identity, flipadst, 8, 16, false);
gen_itx_arm!(identity, flipadst, 16, 8, false);
gen_itx_arm!(identity, flipadst, 16, 16, false);

// Rectangular IDENTITY_IDENTITY
// 4x8, 8x4, 4x16, 16x4, 8x16, 16x8: handwritten 16bpc exists, generate 8bpc only
gen_itx_arm_8bpc!(identity, identity, 4, 8, false);
gen_itx_arm_8bpc!(identity, identity, 8, 4, false);
gen_itx_arm_8bpc!(identity, identity, 4, 16, false);
gen_itx_arm_8bpc!(identity, identity, 16, 4, false);
gen_itx_arm_8bpc!(identity, identity, 8, 16, false);
gen_itx_arm_8bpc!(identity, identity, 16, 8, false);
// 8x32, 32x8, 16x32, 32x16, 32x32: handwritten 8bpc+16bpc both exist, skip

// ============================================================================
// DISPATCH: Safe entry point for ITX SIMD dispatch on ARM
// ============================================================================

use crate::include::common::bitdepth::BPC;
use crate::include::common::bitdepth::BitDepth;
use crate::src::levels::ADST_ADST;
use crate::src::levels::ADST_DCT;
use crate::src::levels::ADST_FLIPADST;
use crate::src::levels::DCT_ADST;
use crate::src::levels::DCT_DCT;
use crate::src::levels::DCT_FLIPADST;
use crate::src::levels::FLIPADST_ADST;
use crate::src::levels::FLIPADST_DCT;
use crate::src::levels::FLIPADST_FLIPADST;
use crate::src::levels::H_ADST;
use crate::src::levels::H_DCT;
use crate::src::levels::H_FLIPADST;
use crate::src::levels::IDTX;
use crate::src::levels::TxfmSize;
use crate::src::levels::TxfmType;
use crate::src::levels::V_ADST;
use crate::src::levels::V_DCT;
use crate::src::levels::V_FLIPADST;
use crate::src::levels::WHT_WHT;
use crate::src::strided::Strided as _;

/// Macro to generate the per-arch/bpc direct dispatch functions for ITX.
macro_rules! impl_itxfm_direct_dispatch {
    (
        fn $fn_name:ident, $mod_path:path,
        itx16: [$(($sz16:expr, $w16:literal, $h16:literal)),* $(,)?],
        itx12: [$(($sz12:expr, $w12:literal, $h12:literal)),* $(,)?],
        itx2: [$(($sz2:expr, $w2:literal, $h2:literal)),* $(,)?],
        itx1: [$(($sz1:expr, $w1:literal, $h1:literal)),* $(,)?],
        wht: ($szw:expr, $ww:literal, $hw:literal),
        $bpc:literal bpc, $ext:ident,
        h_dct_fn: $h_dct_fn:ident, v_dct_fn: $v_dct_fn:ident,
        h_adst_fn: $h_adst_fn:ident, v_adst_fn: $v_adst_fn:ident,
        h_flipadst_fn: $h_flipadst_fn:ident, v_flipadst_fn: $v_flipadst_fn:ident
    ) => {
        paste::paste! {
            #[cfg(feature = "asm")]
            #[allow(non_upper_case_globals)]
            fn $fn_name(
                tx_size: usize,
                tx_type: usize,
                dst_ptr: *mut DynPixel,
                dst_stride: isize,
                coeff: *mut DynCoef,
                eob: i32,
                bitdepth_max: i32,
                coeff_len: u16,
                dst: *const FFISafe<PicOffset>,
            ) -> bool {
                use $mod_path as si;

                macro_rules! c {
                    ($func:expr) => {{
                        unsafe { $func(dst_ptr, dst_stride, coeff, eob, bitdepth_max, coeff_len, dst) };
                        return true;
                    }};
                }

                const s4x4: usize = TxfmSize::S4x4 as usize;
                const s8x8: usize = TxfmSize::S8x8 as usize;
                const s16x16: usize = TxfmSize::S16x16 as usize;
                const s32x32: usize = TxfmSize::S32x32 as usize;
                const s64x64: usize = TxfmSize::S64x64 as usize;
                const r4x8: usize = TxfmSize::R4x8 as usize;
                const r8x4: usize = TxfmSize::R8x4 as usize;
                const r8x16: usize = TxfmSize::R8x16 as usize;
                const r16x8: usize = TxfmSize::R16x8 as usize;
                const r16x32: usize = TxfmSize::R16x32 as usize;
                const r32x16: usize = TxfmSize::R32x16 as usize;
                const r32x64: usize = TxfmSize::R32x64 as usize;
                const r64x32: usize = TxfmSize::R64x32 as usize;
                const r4x16: usize = TxfmSize::R4x16 as usize;
                const r16x4: usize = TxfmSize::R16x4 as usize;
                const r8x32: usize = TxfmSize::R8x32 as usize;
                const r32x8: usize = TxfmSize::R32x8 as usize;
                const r16x64: usize = TxfmSize::R16x64 as usize;
                const r64x16: usize = TxfmSize::R64x16 as usize;

                match (tx_size, tx_type as TxfmType) {
                    ($szw, WHT_WHT) => c!(si::[<inv_txfm_add_wht_wht_ $ww x $hw _ $bpc bpc_ $ext>]),
                    $(
                        ($sz16, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, ADST_DCT) => c!(si::[<inv_txfm_add_dct_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, DCT_ADST) => c!(si::[<inv_txfm_add_adst_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, ADST_ADST) => c!(si::[<inv_txfm_add_adst_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, FLIPADST_DCT) => c!(si::[<inv_txfm_add_dct_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, DCT_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_dct_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, FLIPADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, ADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_adst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, FLIPADST_ADST) => c!(si::[<inv_txfm_add_adst_flipadst_ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, H_DCT) => c!(si::[<inv_txfm_add_ $h_dct_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, V_DCT) => c!(si::[<inv_txfm_add_ $v_dct_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, H_ADST) => c!(si::[<inv_txfm_add_ $h_adst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, V_ADST) => c!(si::[<inv_txfm_add_ $v_adst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, H_FLIPADST) => c!(si::[<inv_txfm_add_ $h_flipadst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                        ($sz16, V_FLIPADST) => c!(si::[<inv_txfm_add_ $v_flipadst_fn _ $w16 x $h16 _ $bpc bpc_ $ext>]),
                    )*
                    $(
                        ($sz12, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, ADST_DCT) => c!(si::[<inv_txfm_add_dct_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, DCT_ADST) => c!(si::[<inv_txfm_add_adst_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, ADST_ADST) => c!(si::[<inv_txfm_add_adst_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, FLIPADST_DCT) => c!(si::[<inv_txfm_add_dct_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, DCT_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_dct_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, FLIPADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, ADST_FLIPADST) => c!(si::[<inv_txfm_add_flipadst_adst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, FLIPADST_ADST) => c!(si::[<inv_txfm_add_adst_flipadst_ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, H_DCT) => c!(si::[<inv_txfm_add_ $h_dct_fn _ $w12 x $h12 _ $bpc bpc_ $ext>]),
                        ($sz12, V_DCT) => c!(si::[<inv_txfm_add_ $v_dct_fn _ $w12 x $h12 _ $bpc bpc_ $ext>]),
                    )*
                    $(
                        ($sz2, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w2 x $h2 _ $bpc bpc_ $ext>]),
                        ($sz2, IDTX) => c!(si::[<inv_txfm_add_identity_identity_ $w2 x $h2 _ $bpc bpc_ $ext>]),
                    )*
                    $(
                        ($sz1, DCT_DCT) => c!(si::[<inv_txfm_add_dct_dct_ $w1 x $h1 _ $bpc bpc_ $ext>]),
                    )*
                    _ => return false,
                }
            }
        }
    };
}

// ARM 8bpc direct dispatch
impl_itxfm_direct_dispatch!(
    fn itxfm_add_direct_arm_8bpc, crate::src::safe_simd::itx_arm,
    itx16: [
        (s4x4, 4, 4),
        (s8x8, 8, 8),
        (r4x8, 4, 8), (r8x4, 8, 4),
        (r4x16, 4, 16), (r16x4, 16, 4),
        (r8x16, 8, 16), (r16x8, 16, 8),
    ],
    itx12: [
        (s16x16, 16, 16),
    ],
    itx2: [
        (r8x32, 8, 32), (r32x8, 32, 8),
        (r16x32, 16, 32), (r32x16, 32, 16),
        (s32x32, 32, 32),
    ],
    itx1: [
        (r16x64, 16, 64), (r32x64, 32, 64),
        (r64x16, 64, 16), (r64x32, 64, 32),
        (s64x64, 64, 64),
    ],
    wht: (s4x4, 4, 4),
    8 bpc, neon,
    h_dct_fn: dct_identity, v_dct_fn: identity_dct,
    h_adst_fn: adst_identity, v_adst_fn: identity_adst,
    h_flipadst_fn: flipadst_identity, v_flipadst_fn: identity_flipadst
);

// ARM 16bpc direct dispatch
impl_itxfm_direct_dispatch!(
    fn itxfm_add_direct_arm_16bpc, crate::src::safe_simd::itx_arm,
    itx16: [
        (s4x4, 4, 4),
        (s8x8, 8, 8),
        (r4x8, 4, 8), (r8x4, 8, 4),
        (r4x16, 4, 16), (r16x4, 16, 4),
        (r8x16, 8, 16), (r16x8, 16, 8),
    ],
    itx12: [
        (s16x16, 16, 16),
    ],
    itx2: [
        (r8x32, 8, 32), (r32x8, 32, 8),
        (r16x32, 16, 32), (r32x16, 32, 16),
        (s32x32, 32, 32),
    ],
    itx1: [
        (r16x64, 16, 64), (r32x64, 32, 64),
        (r64x16, 64, 16), (r64x32, 64, 32),
        (s64x64, 64, 64),
    ],
    wht: (s4x4, 4, 4),
    16 bpc, neon,
    h_dct_fn: dct_identity, v_dct_fn: identity_dct,
    h_adst_fn: adst_identity, v_adst_fn: identity_adst,
    h_flipadst_fn: flipadst_identity, v_flipadst_fn: identity_flipadst
);

/// Safe dispatch entry point for ITX SIMD on ARM.
pub fn itxfm_add_dispatch<BD: BitDepth>(
    tx_size: usize,
    tx_type: usize,
    dst: PicOffset,
    coeff: &mut [BD::Coef],
    eob: i32,
    bd: BD,
) -> bool {
    // When asm feature is not enabled, dispatch to NEON inner functions directly
    // for transform sizes that have been ported. Fall back to scalar for the rest.
    #[cfg(not(feature = "asm"))]
    {
        #[cfg(target_arch = "aarch64")]
        {
            use crate::include::common::bitdepth::BPC;
            use crate::src::levels::{self, TxfmSize};
            use crate::src::strided::Strided as _;
            use zerocopy::IntoBytes;

            use archmage::SimdToken;
            let Some(token) = archmage::Arm64::summon() else {
                return false;
            };

            let txfm = match TxfmSize::from_repr(tx_size) {
                Some(t) => t,
                None => return false,
            };
            let (w, h) = txfm.to_wh();

            // Only 4x4 8bpc transforms are ported to NEON so far
            if w == 4 && h == 4 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_4x4::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::ADST_ADST => inv_txfm_add_adst_adst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_FLIPADST => {
                        inv_txfm_add_flipadst_flipadst_4x4_8bpc_neon_inner(
                            token,
                            dst_u8,
                            base,
                            byte_stride_i,
                            coeff_i16,
                            eob,
                            bd_c,
                        )
                    }
                    levels::IDTX => inv_txfm_add_identity_identity_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::ADST_DCT => inv_txfm_add_dct_adst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::DCT_ADST => inv_txfm_add_adst_dct_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_DCT => inv_txfm_add_dct_flipadst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::DCT_FLIPADST => inv_txfm_add_flipadst_dct_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::ADST_FLIPADST => inv_txfm_add_flipadst_adst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_ADST => inv_txfm_add_adst_flipadst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_DCT => inv_txfm_add_dct_identity_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_DCT => inv_txfm_add_identity_dct_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_ADST => inv_txfm_add_adst_identity_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_ADST => inv_txfm_add_identity_adst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_FLIPADST => inv_txfm_add_flipadst_identity_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_FLIPADST => inv_txfm_add_identity_flipadst_4x4_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::WHT_WHT => {
                        super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner(
                            token,
                            dst_u8,
                            base,
                            byte_stride_i,
                            coeff_i16,
                            eob,
                            bd_c,
                        );
                    }
                    _ => return false,
                }
                return true;
            }

            // 16x16 8bpc transforms via NEON
            if w == 16 && h == 16 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_16x16::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::ADST_ADST => inv_txfm_add_adst_adst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::ADST_DCT => inv_txfm_add_dct_adst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::DCT_ADST => inv_txfm_add_adst_dct_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_DCT => inv_txfm_add_dct_flipadst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::DCT_FLIPADST => inv_txfm_add_flipadst_dct_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_FLIPADST => {
                        inv_txfm_add_flipadst_flipadst_16x16_8bpc_neon_inner(
                            token,
                            dst_u8,
                            base,
                            byte_stride_i,
                            coeff_i16,
                            eob,
                            bd_c,
                        )
                    }
                    levels::ADST_FLIPADST => inv_txfm_add_flipadst_adst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::FLIPADST_ADST => inv_txfm_add_adst_flipadst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_DCT => inv_txfm_add_dct_identity_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_DCT => inv_txfm_add_identity_dct_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_ADST => inv_txfm_add_adst_identity_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_ADST => inv_txfm_add_identity_adst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::H_FLIPADST => inv_txfm_add_flipadst_identity_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::V_FLIPADST => inv_txfm_add_identity_flipadst_16x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            // 32x32 8bpc transforms via NEON (DCT_DCT and IDTX only)
            if w == 32 && h == 32 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_32::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_32x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_32x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            // 8x32, 32x8 8bpc transforms via NEON (DCT_DCT and IDTX only)
            if w == 8 && h == 32 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_large_rect::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_8x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_8x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 32 && h == 8 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_large_rect::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_32x8_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_32x8_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            // 16x32, 32x16 8bpc transforms via NEON (DCT_DCT and IDTX only)
            if w == 16 && h == 32 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_large_rect::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_16x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_16x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 32 && h == 16 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_large_rect::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_32x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_32x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            // 64x64, 64x32, 32x64, 16x64, 64x16 8bpc transforms via NEON
            // DISABLED: these scalar fallback implementations index coeff[y + x*h]
            // where x goes up to w, but the coefficient buffer is only
            // min(w,32)*min(h,32) elements. Fall back to the generic scalar path
            // which handles the zero-fill correctly.
            // See: https://github.com/imazen/rav1d-safe/issues/1
            if w == 64 && h == 64 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_64::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_64x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_64x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 64 && h == 32 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_64::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_64x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_64x32_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 32 && h == 64 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_64::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_32x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_32x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 16 && h == 64 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_64::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_16x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_16x64_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            if w == 64 && h == 16 && BD::BPC == BPC::BPC8 {
                let byte_stride_i = dst.stride();
                let bd_c = bd.into_c();

                let (mut guard, base) = dst.strided_slice_mut::<BD>(w, h);
                let dst_u8: &mut [u8] = guard.as_mut_bytes();
                let coeff_i16: &mut [i16] =
                    zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                        .expect("coeff alignment/size mismatch for i16 reinterpretation");

                use super::itx_arm_neon_64::*;
                let tx_t = tx_type as u8;
                match tx_t {
                    levels::DCT_DCT => inv_txfm_add_dct_dct_64x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    levels::IDTX => inv_txfm_add_identity_identity_64x16_8bpc_neon_inner(
                        token,
                        dst_u8,
                        base,
                        byte_stride_i,
                        coeff_i16,
                        eob,
                        bd_c,
                    ),
                    _ => return false,
                }
                return true;
            }

            return false;
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            let _ = (tx_size, tx_type, &dst, coeff, eob, &bd);
            return false;
        }
    }

    #[cfg(feature = "asm")]
    {
        use crate::src::levels::TxfmSize;
        use zerocopy::IntoBytes;

        // Get transform dimensions for tracked guard
        let txfm = TxfmSize::from_repr(tx_size).unwrap_or_default();
        let (w, h) = txfm.to_wh();

        // Create tracked guard — ensures borrow tracker knows about this access
        let (mut dst_guard, _dst_base) = dst.strided_slice_mut::<BD>(w, h);
        let dst_ptr: *mut DynPixel = dst_guard.as_mut_bytes().as_mut_ptr() as *mut DynPixel;
        let dst_stride = dst.stride();
        let coeff_len = coeff.len() as u16;
        let coeff_ptr = coeff.as_mut_ptr().cast();
        let bd_c = bd.into_c();
        let dst_ffi = FFISafe::new(&dst);

        match BD::BPC {
            BPC::BPC8 => itxfm_add_direct_arm_8bpc(
                tx_size, tx_type, dst_ptr, dst_stride, coeff_ptr, eob, bd_c, coeff_len, dst_ffi,
            ),
            BPC::BPC16 => itxfm_add_direct_arm_16bpc(
                tx_size, tx_type, dst_ptr, dst_stride, coeff_ptr, eob, bd_c, coeff_len, dst_ffi,
            ),
        }
    }
}

// ============================================================================
// AUTOVERSION vs HAND-WRITTEN NEON BENCHMARK
// ============================================================================

/// Benchmark comparing autoversioned (LLVM auto-vectorized) scalar fallbacks
/// against hand-written NEON intrinsic implementations.
///
/// Run on aarch64: `cargo test --release -- --ignored bench_autoversion_vs_neon --nocapture`
#[cfg(all(test, target_arch = "aarch64"))]
mod bench_autoversion_vs_neon {
    use archmage::SimdToken as _;
    use std::time::Instant;

    fn bench_fn(name: &str, iters: u32, mut f: impl FnMut()) -> u64 {
        for _ in 0..100 {
            f();
        }
        let mut times = Vec::with_capacity(iters as usize);
        for _ in 0..iters {
            let start = Instant::now();
            f();
            times.push(start.elapsed().as_nanos() as u64);
        }
        times.sort();
        let median = times[times.len() / 2];
        let mean: u64 = times.iter().sum::<u64>() / times.len() as u64;
        println!("  {name}: median={median}ns mean={mean}ns");
        median
    }

    macro_rules! bench_transform {
        ($test_name:ident, $w:expr, $h:expr,
         autoversioned: $av_fn:path,
         neon: $neon_fn:path,
         $coeff_count:expr) => {
            #[test]
            #[ignore]
            fn $test_name() {
                let token = archmage::Arm64::summon().expect("NEON required");

                let stride: isize = ($w + 16) as isize;
                let dst_size = ($h as usize) * (stride as usize) + $w;
                let mut dst_av = vec![128u8; dst_size];
                let mut dst_neon = vec![128u8; dst_size];
                let base = 0usize;

                let mut coeff_template = vec![0i16; $coeff_count];
                for (i, c) in coeff_template.iter_mut().enumerate() {
                    *c = (((i * 37 + 13) % 512) as i16) - 256;
                }
                coeff_template[0] = 1000;

                let iters = 100_000u32;
                println!("\n=== {} ({}x{}) ===", stringify!($test_name), $w, $h);

                let mut coeff = coeff_template.clone();
                let av_ns = bench_fn("autoversioned", iters, || {
                    dst_av.fill(128);
                    coeff.copy_from_slice(&coeff_template);
                    $av_fn(
                        &mut dst_av,
                        base,
                        stride,
                        &mut coeff,
                        $coeff_count as i32 - 1,
                        255,
                    );
                });

                let neon_ns = bench_fn("hand-written NEON", iters, || {
                    dst_neon.fill(128);
                    coeff.copy_from_slice(&coeff_template);
                    $neon_fn(
                        token,
                        &mut dst_neon,
                        base,
                        stride,
                        &mut coeff,
                        $coeff_count as i32 - 1,
                        255,
                    );
                });

                // Verify correctness
                dst_av.fill(128);
                dst_neon.fill(128);
                let mut ca = coeff_template.clone();
                let mut cb = coeff_template.clone();
                $av_fn(
                    &mut dst_av,
                    base,
                    stride,
                    &mut ca,
                    $coeff_count as i32 - 1,
                    255,
                );
                $neon_fn(
                    token,
                    &mut dst_neon,
                    base,
                    stride,
                    &mut cb,
                    $coeff_count as i32 - 1,
                    255,
                );
                assert_eq!(dst_av, dst_neon, "Output mismatch!");

                let ratio = av_ns as f64 / neon_ns as f64;
                println!("  ratio: {ratio:.2}x (autoversioned / NEON, <1 = autoversioned faster)");
            }
        };
    }

    bench_transform!(bench_dct_dct_4x4, 4, 4,
        autoversioned: super::inv_txfm_add_dct_dct_4x4_8bpc_inner,
        neon: super::super::itx_arm_neon_4x4::inv_txfm_add_dct_dct_4x4_8bpc_neon_inner,
        16
    );
    bench_transform!(bench_identity_4x4, 4, 4,
        autoversioned: super::inv_txfm_add_identity_identity_4x4_8bpc_inner,
        neon: super::super::itx_arm_neon_4x4::inv_txfm_add_identity_identity_4x4_8bpc_neon_inner,
        16
    );
    bench_transform!(bench_adst_adst_4x4, 4, 4,
        autoversioned: super::inv_txfm_add_adst_adst_4x4_8bpc_inner,
        neon: super::super::itx_arm_neon_4x4::inv_txfm_add_adst_adst_4x4_8bpc_neon_inner,
        16
    );
    bench_transform!(bench_wht_wht_4x4, 4, 4,
        autoversioned: super::inv_txfm_add_wht_wht_4x4_8bpc_inner,
        neon: super::super::itx_arm_neon_wht::inv_txfm_add_wht_wht_4x4_8bpc_neon_inner,
        16
    );
    bench_transform!(bench_dct_dct_8x8, 8, 8,
        autoversioned: super::inv_txfm_add_dct_dct_8x8_8bpc_inner,
        neon: super::super::itx_arm_neon_8x8::inv_txfm_add_dct_dct_8x8_8bpc_neon_inner,
        64
    );
    bench_transform!(bench_identity_8x8, 8, 8,
        autoversioned: super::inv_txfm_add_identity_identity_8x8_8bpc_inner,
        neon: super::super::itx_arm_neon_8x8::inv_txfm_add_identity_identity_8x8_8bpc_neon_inner,
        64
    );
    bench_transform!(bench_dct_dct_16x16, 16, 16,
        autoversioned: super::inv_txfm_add_dct_dct_16x16_8bpc_inner,
        neon: super::super::itx_arm_neon_16x16::inv_txfm_add_dct_dct_16x16_8bpc_neon_inner,
        256
    );
    bench_transform!(bench_dct_dct_32x32, 32, 32,
        autoversioned: super::inv_txfm_add_dct_dct_32x32_8bpc_inner,
        neon: super::super::itx_arm_neon_32::inv_txfm_add_dct_dct_32x32_8bpc_neon_inner,
        1024
    );
}