// jxl-encoder-simd 0.3.0

// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
// Algorithms and constants derived from libjxl (BSD-3-Clause).
// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing

//! SIMD-accelerated 8x8 DCT and IDCT.
//!
//! The forward DCT produces coefficients in **transposed layout** (matching libjxl convention):
//! `output[cx * 8 + cy]` stores the coefficient for frequency `(cy, cx)`.
//!
//! The inverse DCT expects input in transposed layout and produces spatial-domain output
//! in row-major order.

// Constants copied from jxl_encoder/src/vardct/dct/constants.rs
// Generated by: `1.0 / (2 * cos((i + 0.5) * pi / N))` for i in 0..N/2
// From libjxl-tiny dct_scales.h

/// Square root of two; used by the "B transform" steps of the butterflies below
/// (and, as `1.0 / SQRT2`, by their inverses).
const SQRT2: f32 = core::f32::consts::SQRT_2;
/// Multipliers applied to the difference half of the 4-point butterfly
/// (the formula above with `N = 4`, `i` in `0..2`).
const WC_MULTIPLIERS_4: [f32; 2] = [0.541_196_1, 1.306_563];
/// Multipliers applied to the difference half of the 8-point butterfly
/// (the formula above with `N = 8`, `i` in `0..4`).
const WC_MULTIPLIERS_8: [f32; 4] = [0.509_795_6, 0.601_344_9, 0.899_976_2, 2.562_915_5];

/// Scaled 8x8 forward DCT, using the best implementation available at runtime.
///
/// * `input` — 64 samples in row-major order.
/// * `output` — receives 64 coefficients in transposed layout.
///
/// For the architecture the crate was compiled for, the matching SIMD token is
/// summoned; if that succeeds the SIMD kernel runs, otherwise the call falls
/// through to [`dct_8x8_scalar`].
#[inline]
pub fn dct_8x8(input: &[f32; 64], output: &mut [f32; 64]) {
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(avx2) = archmage::X64V3Token::summon() {
            return dct_8x8_avx2(avx2, input, output);
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(neon) = archmage::NeonToken::summon() {
            return dct_8x8_neon(neon, input, output);
        }
    }

    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(simd128) = archmage::Wasm128Token::summon() {
            return dct_8x8_wasm128(simd128, input, output);
        }
    }

    // No SIMD token could be obtained (or the target has no SIMD kernel).
    dct_8x8_scalar(input, output)
}

/// Scaled 8x8 inverse DCT, using the best implementation available at runtime.
///
/// * `input` — 64 coefficients in transposed layout.
/// * `output` — receives 64 samples in row-major order.
///
/// For the architecture the crate was compiled for, the matching SIMD token is
/// summoned; if that succeeds the SIMD kernel runs, otherwise the call falls
/// through to [`idct_8x8_scalar`].
#[inline]
pub fn idct_8x8(input: &[f32; 64], output: &mut [f32; 64]) {
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(avx2) = archmage::X64V3Token::summon() {
            return idct_8x8_avx2(avx2, input, output);
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(neon) = archmage::NeonToken::summon() {
            return idct_8x8_neon(neon, input, output);
        }
    }

    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(simd128) = archmage::Wasm128Token::summon() {
            return idct_8x8_wasm128(simd128, input, output);
        }
    }

    // No SIMD token could be obtained (or the target has no SIMD kernel).
    idct_8x8_scalar(input, output)
}

// ============================================================================
// Scalar fallback — matches the encoder's existing implementation exactly
// ============================================================================

/// Scalar 8x8 forward DCT (no SIMD).
///
/// Reads 64 row-major samples from `input` and writes 64 coefficients to
/// `output`. Every row is run through the 1-D 8-point DCT and scaled by 1/8,
/// the block is transposed, and the same transform-and-scale step is applied
/// to the rows of the transposed block. The result is not transposed back, so
/// `output` ends up in transposed layout.
#[inline]
pub fn dct_8x8_scalar(input: &[f32; 64], output: &mut [f32; 64]) {
    // First pass: transform and scale each row of the input.
    let mut row_pass = crate::scratch_buf::<64>();
    for start in (0..64).step_by(8) {
        let line = &mut row_pass[start..start + 8];
        line.copy_from_slice(&input[start..start + 8]);
        dct1d_8_scalar(line);
        for v in line.iter_mut() {
            *v *= 0.125; // 1/8
        }
    }

    // Swap rows and columns so the second pass can again work on contiguous rows.
    let mut col_pass = crate::scratch_buf::<64>();
    for col in 0..8 {
        for row in 0..8 {
            col_pass[col * 8 + row] = row_pass[row * 8 + col];
        }
    }

    // Second pass: the rows of the transposed block are the original columns.
    for start in (0..64).step_by(8) {
        let line = &mut col_pass[start..start + 8];
        dct1d_8_scalar(line);
        for v in line.iter_mut() {
            *v *= 0.125; // 1/8
        }
    }

    output.copy_from_slice(&col_pass);
}

/// Scalar 8x8 inverse DCT (no SIMD).
///
/// Reads 64 coefficients in transposed layout from `input` and writes 64
/// row-major samples to `output`. The 1-D 8-point inverse DCT is applied to
/// every row of the input as stored, the block is transposed, and the inverse
/// DCT is applied to every row again. No scaling is applied in this direction.
#[inline]
pub fn idct_8x8_scalar(input: &[f32; 64], output: &mut [f32; 64]) {
    let mut stage = crate::scratch_buf::<64>();
    stage.copy_from_slice(input);

    // First pass: each stored row holds one column of coefficients.
    for start in (0..64).step_by(8) {
        idct1d_8_scalar(&mut stage[start..start + 8]);
    }

    // Swap rows and columns so the second pass works on contiguous rows.
    let mut flipped = crate::scratch_buf::<64>();
    for col in 0..8 {
        for row in 0..8 {
            flipped[col * 8 + row] = stage[row * 8 + col];
        }
    }

    // Second pass: after this the data is in row-major spatial order.
    for start in (0..64).step_by(8) {
        idct1d_8_scalar(&mut flipped[start..start + 8]);
    }

    output.copy_from_slice(&flipped);
}

/// In-place 4-point forward DCT butterfly on `mem[0..4]`.
///
/// Even-indexed outputs come from the mirrored sums, odd-indexed outputs from
/// the mirrored differences. Panics if `mem` has fewer than four elements.
fn dct1d_4_scalar(mem: &mut [f32]) {
    // Fold the input around its centre.
    let sum_outer = mem[0] + mem[3];
    let sum_inner = mem[1] + mem[2];
    let diff_outer = mem[0] - mem[3];
    let diff_inner = mem[1] - mem[2];

    // 2-point DCT of the sums.
    let even0 = sum_outer + sum_inner;
    let even1 = sum_outer - sum_inner;

    // Weight the differences, then take their 2-point DCT.
    let weighted_outer = diff_outer * WC_MULTIPLIERS_4[0];
    let weighted_inner = diff_inner * WC_MULTIPLIERS_4[1];
    let odd_sum = weighted_outer + weighted_inner;
    let odd1 = weighted_outer - weighted_inner;

    // B transform: first odd term becomes sqrt(2) * sum + difference.
    let odd0 = SQRT2.mul_add(odd_sum, odd1);

    // Interleave even and odd results.
    mem[0] = even0;
    mem[2] = even1;
    mem[1] = odd0;
    mem[3] = odd1;
}

/// In-place 8-point forward DCT butterfly on `mem[0..8]`.
///
/// The input is folded into four mirrored sums and four mirrored differences;
/// each half goes through the 4-point butterfly, and the results are
/// interleaved (sums at even indices, differences at odd indices).
/// Panics if `mem` has fewer than eight elements.
fn dct1d_8_scalar(mem: &mut [f32]) {
    let mut sums = [0.0f32; 4];
    let mut diffs = [0.0f32; 4];
    for i in 0..4 {
        let (head, tail) = (mem[i], mem[7 - i]);
        sums[i] = head + tail;
        // The difference half is weighted before its 4-point DCT.
        diffs[i] = (head - tail) * WC_MULTIPLIERS_8[i];
    }

    dct1d_4_scalar(&mut sums);
    dct1d_4_scalar(&mut diffs);

    // B transform on the difference half. Order matters: each step must read
    // the next element before that element is updated.
    diffs[0] = SQRT2.mul_add(diffs[0], diffs[1]);
    diffs[1] += diffs[2];
    diffs[2] += diffs[3];

    for i in 0..4 {
        mem[2 * i] = sums[i];
        mem[2 * i + 1] = diffs[i];
    }
}

// IDCT scalar

/// In-place 4-point inverse DCT butterfly on `mem[0..4]`.
///
/// Undoes the steps of the forward 4-point butterfly in reverse order. The
/// 2-point stages are plain sum/difference pairs with no halving.
/// Panics if `mem` has fewer than four elements.
fn idct1d_4_scalar(mem: &mut [f32]) {
    // De-interleave: even indices form the first half, odd indices the second.
    let (even0, even1) = (mem[0], mem[2]);
    let (odd0, odd1) = (mem[1], mem[3]);

    // Undo the B transform on the odd half.
    let odd0 = (odd0 - odd1) * (1.0 / SQRT2);

    // 2-point stage on the odd half, then remove the forward weights.
    let back_outer = (odd0 + odd1) * (1.0 / WC_MULTIPLIERS_4[0]);
    let back_inner = (odd0 - odd1) * (1.0 / WC_MULTIPLIERS_4[1]);

    // 2-point stage on the even half.
    let base_outer = even0 + even1;
    let base_inner = even0 - even1;

    // Unfold the mirrored sums and differences.
    mem[0] = base_outer + back_outer;
    mem[3] = base_outer - back_outer;
    mem[1] = base_inner + back_inner;
    mem[2] = base_inner - back_inner;
}

/// In-place 8-point inverse DCT butterfly on `mem[0..8]`.
///
/// Splits the input into even- and odd-indexed halves, undoes the B transform
/// and the weights on the odd half, runs the 4-point inverse on both halves,
/// and unfolds the mirrored sums and differences.
/// Panics if `mem` has fewer than eight elements.
fn idct1d_8_scalar(mem: &mut [f32]) {
    // De-interleave into the two halves.
    let mut even = [0.0f32; 4];
    let mut odd = [0.0f32; 4];
    for i in 0..4 {
        even[i] = mem[2 * i];
        odd[i] = mem[2 * i + 1];
    }

    // Undo the B transform, last element first so each step sees the already
    // restored neighbour.
    odd[2] -= odd[3];
    odd[1] -= odd[2];
    odd[0] = (odd[0] - odd[1]) * (1.0 / SQRT2);

    // 4-point inverse on the odd half, then remove the forward weights.
    idct1d_4_scalar(&mut odd);
    for (value, weight) in odd.iter_mut().zip(WC_MULTIPLIERS_8.iter()) {
        *value *= 1.0 / *weight;
    }

    // 4-point inverse on the even half.
    idct1d_4_scalar(&mut even);

    // Unfold: element i is the sum, its mirror 7 - i the difference.
    for i in 0..4 {
        mem[i] = even[i] + odd[i];
        mem[7 - i] = even[i] - odd[i];
    }
}

// ============================================================================
// x86_64 AVX2+FMA implementation
// ============================================================================

/// 8x8 forward DCT for x86_64, working on eight `f32x8` registers.
///
/// `input` is 64 samples in row-major order; `output` receives 64 coefficients
/// in transposed layout (matching [`dct_8x8_scalar`]). Each of the two 1-D
/// passes is scaled by 1/8.
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_avx2(token: archmage::X64V3Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x8;

    let eighth = f32x8::splat(token, 0.125);

    // One register per input row: lane `x` of every register belongs to column `x`.
    let in0 = f32x8::from_slice(token, &input[0..]);
    let in1 = f32x8::from_slice(token, &input[8..]);
    let in2 = f32x8::from_slice(token, &input[16..]);
    let in3 = f32x8::from_slice(token, &input[24..]);
    let in4 = f32x8::from_slice(token, &input[32..]);
    let in5 = f32x8::from_slice(token, &input[40..]);
    let in6 = f32x8::from_slice(token, &input[48..]);
    let in7 = f32x8::from_slice(token, &input[56..]);

    // Column pass: the butterfly combines whole registers, so all eight
    // columns are transformed at once.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        vectorized_dct1d_8(token, in0, in1, in2, in3, in4, in5, in6, in7);

    // Scale by 1/8 and transpose so the second pass can again work across registers.
    let (t0, t1, t2, t3, t4, t5, t6, t7) = transpose_8x8_regs(
        token,
        v0 * eighth,
        v1 * eighth,
        v2 * eighth,
        v3 * eighth,
        v4 * eighth,
        v5 * eighth,
        v6 * eighth,
        v7 * eighth,
    );

    // Row pass on the transposed registers.
    let (k0, k1, k2, k3, k4, k5, k6, k7) =
        vectorized_dct1d_8(token, t0, t1, t2, t3, t4, t5, t6, t7);

    // Scale by 1/8 and store. The data is not transposed back, which yields
    // the transposed output layout.
    (k0 * eighth).store((&mut output[0..8]).try_into().unwrap());
    (k1 * eighth).store((&mut output[8..16]).try_into().unwrap());
    (k2 * eighth).store((&mut output[16..24]).try_into().unwrap());
    (k3 * eighth).store((&mut output[24..32]).try_into().unwrap());
    (k4 * eighth).store((&mut output[32..40]).try_into().unwrap());
    (k5 * eighth).store((&mut output[40..48]).try_into().unwrap());
    (k6 * eighth).store((&mut output[48..56]).try_into().unwrap());
    (k7 * eighth).store((&mut output[56..64]).try_into().unwrap());
}

/// 8x8 inverse DCT for x86_64, working on eight `f32x8` registers.
///
/// `input` is 64 coefficients in transposed layout; `output` receives 64
/// samples in row-major order. No scaling is applied in this direction.
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_avx2(token: archmage::X64V3Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x8;

    // One register per stored row of the (transposed) coefficient block.
    let k0 = f32x8::from_slice(token, &input[0..]);
    let k1 = f32x8::from_slice(token, &input[8..]);
    let k2 = f32x8::from_slice(token, &input[16..]);
    let k3 = f32x8::from_slice(token, &input[24..]);
    let k4 = f32x8::from_slice(token, &input[32..]);
    let k5 = f32x8::from_slice(token, &input[40..]);
    let k6 = f32x8::from_slice(token, &input[48..]);
    let k7 = f32x8::from_slice(token, &input[56..]);

    // First inverse pass (undoes the forward row pass).
    let (p0, p1, p2, p3, p4, p5, p6, p7) =
        vectorized_idct1d_8(token, k0, k1, k2, k3, k4, k5, k6, k7);

    // Transpose so the second pass can again work across registers.
    let (t0, t1, t2, t3, t4, t5, t6, t7) =
        transpose_8x8_regs(token, p0, p1, p2, p3, p4, p5, p6, p7);

    // Second inverse pass (undoes the forward column pass).
    let (y0, y1, y2, y3, y4, y5, y6, y7) =
        vectorized_idct1d_8(token, t0, t1, t2, t3, t4, t5, t6, t7);

    // Each register is now one row of spatial samples.
    y0.store((&mut output[0..8]).try_into().unwrap());
    y1.store((&mut output[8..16]).try_into().unwrap());
    y2.store((&mut output[16..24]).try_into().unwrap());
    y3.store((&mut output[24..32]).try_into().unwrap());
    y4.store((&mut output[32..40]).try_into().unwrap());
    y5.store((&mut output[40..48]).try_into().unwrap());
    y6.store((&mut output[48..56]).try_into().unwrap());
    y7.store((&mut output[56..64]).try_into().unwrap());
}

/// Vectorized 8-point forward DCT butterfly.
///
/// Each f32x8 register holds one "position" across 8 independent DCTs.
/// The butterfly operates across registers (cross-position), processing
/// all 8 DCTs simultaneously via SIMD element-wise operations.
///
/// The sequence of operations mirrors `dct1d_8_scalar`: `r0..r7` play the role
/// of `mem[0]..mem[7]`, and the returned tuple holds the results in output
/// order (mirrored-sum half at even positions, mirrored-difference half at odd
/// positions). No 1/8 scaling is applied here; callers do that themselves.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn vectorized_dct1d_8(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use magetypes::simd::f32x8;

    let sqrt2 = f32x8::splat(token, SQRT2);

    // ---- Level 0: 8-point split ----
    // AddReverse / SubReverse: pair position i with its mirror 7 - i.
    let a0 = r0 + r7;
    let a1 = r1 + r6;
    let a2 = r2 + r5;
    let a3 = r3 + r4;
    let s0 = r0 - r7;
    let s1 = r1 - r6;
    let s2 = r2 - r5;
    let s3 = r3 - r4;

    // ---- DCT-4 on first half {a0,a1,a2,a3} ----
    let b0 = a0 + a3;
    let b1 = a1 + a2;
    let b2 = a0 - a3;
    let b3 = a1 - a2;

    // DCT-2 on {b0, b1}
    let c0 = b0 + b1;
    let c1 = b0 - b1;

    // WcMultipliers_4 on {b2, b3}
    let b2 = b2 * f32x8::splat(token, WC_MULTIPLIERS_4[0]);
    let b3 = b3 * f32x8::splat(token, WC_MULTIPLIERS_4[1]);

    // DCT-2 on {b2, b3}
    let d0 = b2 + b3;
    let d1 = b2 - b3;

    // B transform: d0 = sqrt2 * d0 + d1
    let d0 = sqrt2.mul_add(d0, d1);

    // InverseEvenOdd: first_half = [c0, d0, c1, d1]
    let fh0 = c0;
    let fh1 = d0;
    let fh2 = c1;
    let fh3 = d1;

    // ---- WcMultipliers_8 on second half {s0,s1,s2,s3} ----
    let s0 = s0 * f32x8::splat(token, WC_MULTIPLIERS_8[0]);
    let s1 = s1 * f32x8::splat(token, WC_MULTIPLIERS_8[1]);
    let s2 = s2 * f32x8::splat(token, WC_MULTIPLIERS_8[2]);
    let s3 = s3 * f32x8::splat(token, WC_MULTIPLIERS_8[3]);

    // ---- DCT-4 on second half {s0,s1,s2,s3} ----
    // Same steps as the DCT-4 on the first half above.
    let e0 = s0 + s3;
    let e1 = s1 + s2;
    let e2 = s0 - s3;
    let e3 = s1 - s2;

    // DCT-2
    let f0 = e0 + e1;
    let f1 = e0 - e1;

    // WcMultipliers_4
    let e2 = e2 * f32x8::splat(token, WC_MULTIPLIERS_4[0]);
    let e3 = e3 * f32x8::splat(token, WC_MULTIPLIERS_4[1]);

    // DCT-2
    let g0 = e2 + e3;
    let g1 = e2 - e3;

    // B transform
    let g0 = sqrt2.mul_add(g0, g1);

    // InverseEvenOdd of inner DCT-4
    let sh0 = f0;
    let sh1 = g0;
    let sh2 = f1;
    let sh3 = g1;

    // ---- B transform on second half ----
    // IMPORTANT: must process top-down (0→1→2) so each step uses the ORIGINAL next value.
    // `let` creates new bindings, so sh0 = f(sh1_original) before sh1 is shadowed.
    let sh0 = sqrt2.mul_add(sh0, sh1);
    let sh1 = sh1 + sh2;
    let sh2 = sh2 + sh3;

    // ---- Final InverseEvenOdd: interleave first_half and second_half ----
    (fh0, sh0, fh1, sh1, fh2, sh2, fh3, sh3)
}

/// Vectorized 8-point inverse DCT butterfly.
///
/// Each f32x8 register holds one coefficient position across 8 independent
/// transforms; all arithmetic is element-wise across registers. The steps undo
/// those of `vectorized_dct1d_8` in reverse order and mirror
/// `idct1d_8_scalar`. The 2-point stages are plain sum/difference pairs with
/// no halving, and no other rescaling is applied.
///
/// `r0..r7` are the coefficients in the interleaved order produced by the
/// forward butterfly; the returned tuple is positions 0..7 in natural order.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn vectorized_idct1d_8(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use magetypes::simd::f32x8;

    let inv_sqrt2 = f32x8::splat(token, 1.0 / SQRT2);

    // ---- De-interleave (reverse of InverseEvenOdd) ----
    // Input order from forward: [fh0, sh0, fh1, sh1, fh2, sh2, fh3, sh3]
    let fh0 = r0;
    let sh0 = r1;
    let fh1 = r2;
    let sh1 = r3;
    let fh2 = r4;
    let sh2 = r5;
    let fh3 = r6;
    let sh3 = r7;

    // ---- Inverse B transform on second half ----
    // Forward was: sh0 = sqrt2*sh0 + sh1, sh1 += sh2, sh2 += sh3
    // Reverse (bottom-up, so each step uses the already-restored next value):
    let sh2 = sh2 - sh3;
    let sh1 = sh1 - sh2;
    let sh0 = (sh0 - sh1) * inv_sqrt2;

    // ---- Inverse InverseEvenOdd of inner DCT-4 ----
    let f0 = sh0;
    let g0 = sh1;
    let f1 = sh2;
    let g1 = sh3;

    // Inverse B transform on inner DCT-4
    // Forward was: g0 = sqrt2*g0 + g1
    let g0 = (g0 - g1) * inv_sqrt2;

    // Inverse DCT-2
    let e2p = g0 + g1;
    let e3p = g0 - g1;

    // Inverse WcMultipliers_4
    let e2 = e2p * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let e3 = e3p * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // Inverse DCT-2
    let e0p = f0 + f1;
    let e1p = f0 - f1;

    // Inverse add/sub reverse for inner 4-point
    let s0 = e0p + e2;
    let s3 = e0p - e2;
    let s1 = e1p + e3;
    let s2 = e1p - e3;

    // Inverse WcMultipliers_8
    let s0 = s0 * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let s1 = s1 * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let s2 = s2 * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let s3 = s3 * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // ---- First half: inverse InverseEvenOdd ----
    let c0 = fh0;
    let d0 = fh1;
    let c1 = fh2;
    let d1 = fh3;

    // Inverse B transform: d0 = sqrt2*d0_in + d1 → d0_in = (d0 - d1) / sqrt2
    let d0_in = (d0 - d1) * inv_sqrt2;

    // Inverse DCT-2 of {d0_in, d1}
    let b2p = d0_in + d1;
    let b3p = d0_in - d1;

    // Inverse WcMultipliers_4
    let b2 = b2p * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let b3 = b3p * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // Inverse DCT-2 of {c0, c1}
    let b0p = c0 + c1;
    let b1p = c0 - c1;

    // Inverse add/sub reverse
    let a0 = b0p + b2;
    let a3 = b0p - b2;
    let a1 = b1p + b3;
    let a2 = b1p - b3;

    // ---- Level 0: inverse add/sub reverse ----
    // Position i gets the sum, its mirror 7 - i gets the difference.
    let out0 = a0 + s0;
    let out7 = a0 - s0;
    let out1 = a1 + s1;
    let out6 = a1 - s1;
    let out2 = a2 + s2;
    let out5 = a2 - s2;
    let out3 = a3 + s3;
    let out4 = a3 - s3;

    (out0, out1, out2, out3, out4, out5, out6, out7)
}

/// In-register 8x8 transpose using AVX2 instructions.
///
/// Treats `r0..r7` as the rows of an 8x8 matrix of `f32` and returns its
/// columns: lane `j` of returned register `i` is lane `i` of input register `j`.
///
/// Safe: token proves AVX2 is available, `from_m256` is token-gated.
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn transpose_8x8_regs(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use core::arch::x86_64::*;
    use magetypes::simd::f32x8;

    // Unwrap to raw `__m256` values so the intrinsics can be applied directly.
    let r0 = r0.raw();
    let r1 = r1.raw();
    let r2 = r2.raw();
    let r3 = r3.raw();
    let r4 = r4.raw();
    let r5 = r5.raw();
    let r6 = r6.raw();
    let r7 = r7.raw();

    // Stage 1: interleave pairs
    // Within each 128-bit half, `unpacklo` interleaves elements 0-1 of the two
    // rows and `unpackhi` interleaves elements 2-3.
    let t0 = _mm256_unpacklo_ps(r0, r1);
    let t1 = _mm256_unpackhi_ps(r0, r1);
    let t2 = _mm256_unpacklo_ps(r2, r3);
    let t3 = _mm256_unpackhi_ps(r2, r3);
    let t4 = _mm256_unpacklo_ps(r4, r5);
    let t5 = _mm256_unpackhi_ps(r4, r5);
    let t6 = _mm256_unpacklo_ps(r6, r7);
    let t7 = _mm256_unpackhi_ps(r6, r7);

    // Stage 2: shuffle to form 4-element groups
    // 0x44 takes elements 0-1 of each operand, 0xEE takes elements 2-3 (per
    // 128-bit half). s0..s3 each hold one column of rows 0-3 in the low half
    // (columns 0..3) and another in the high half (columns 4..7); s4..s7 hold
    // the same for rows 4-7.
    let s0 = _mm256_shuffle_ps::<0x44>(t0, t2);
    let s1 = _mm256_shuffle_ps::<0xEE>(t0, t2);
    let s2 = _mm256_shuffle_ps::<0x44>(t1, t3);
    let s3 = _mm256_shuffle_ps::<0xEE>(t1, t3);
    let s4 = _mm256_shuffle_ps::<0x44>(t4, t6);
    let s5 = _mm256_shuffle_ps::<0xEE>(t4, t6);
    let s6 = _mm256_shuffle_ps::<0x44>(t5, t7);
    let s7 = _mm256_shuffle_ps::<0xEE>(t5, t7);

    // Stage 3: exchange 128-bit halves
    // 0x20 joins the low halves of both operands (columns 0..3), 0x31 joins
    // the high halves (columns 4..7), completing each column across rows 0-7.
    let c0 = _mm256_permute2f128_ps::<0x20>(s0, s4);
    let c1 = _mm256_permute2f128_ps::<0x20>(s1, s5);
    let c2 = _mm256_permute2f128_ps::<0x20>(s2, s6);
    let c3 = _mm256_permute2f128_ps::<0x20>(s3, s7);
    let c4 = _mm256_permute2f128_ps::<0x31>(s0, s4);
    let c5 = _mm256_permute2f128_ps::<0x31>(s1, s5);
    let c6 = _mm256_permute2f128_ps::<0x31>(s2, s6);
    let c7 = _mm256_permute2f128_ps::<0x31>(s3, s7);

    // Re-wrap the raw vectors in the token-gated SIMD type.
    (
        f32x8::from_m256(token, c0),
        f32x8::from_m256(token, c1),
        f32x8::from_m256(token, c2),
        f32x8::from_m256(token, c3),
        f32x8::from_m256(token, c4),
        f32x8::from_m256(token, c5),
        f32x8::from_m256(token, c6),
        f32x8::from_m256(token, c7),
    )
}

// ============================================================================
// aarch64 NEON implementation
// ============================================================================

/// NEON 8x8 forward DCT built from `f32x4` registers.
///
/// Every 8-wide row is handled as a left half (columns 0-3) and a right half
/// (columns 4-7), so each 1-D pass runs the butterfly twice. The 8x8 transpose
/// between the passes is assembled from four 4x4 transposes.
///
/// `input` is 64 samples in row-major order; `output` receives 64 coefficients
/// in transposed layout. Each 1-D pass is scaled by 1/8.
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_neon(token: archmage::NeonToken, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    let eighth = f32x4::splat(token, 0.125);

    // Column pass over columns 0-3: one register per input row.
    let in_l0 = f32x4::from_slice(token, &input[0..]);
    let in_l1 = f32x4::from_slice(token, &input[8..]);
    let in_l2 = f32x4::from_slice(token, &input[16..]);
    let in_l3 = f32x4::from_slice(token, &input[24..]);
    let in_l4 = f32x4::from_slice(token, &input[32..]);
    let in_l5 = f32x4::from_slice(token, &input[40..]);
    let in_l6 = f32x4::from_slice(token, &input[48..]);
    let in_l7 = f32x4::from_slice(token, &input[56..]);
    let (l0, l1, l2, l3, l4, l5, l6, l7) =
        neon_dct1d_8(token, in_l0, in_l1, in_l2, in_l3, in_l4, in_l5, in_l6, in_l7);

    // Column pass over columns 4-7.
    let in_r0 = f32x4::from_slice(token, &input[4..]);
    let in_r1 = f32x4::from_slice(token, &input[12..]);
    let in_r2 = f32x4::from_slice(token, &input[20..]);
    let in_r3 = f32x4::from_slice(token, &input[28..]);
    let in_r4 = f32x4::from_slice(token, &input[36..]);
    let in_r5 = f32x4::from_slice(token, &input[44..]);
    let in_r6 = f32x4::from_slice(token, &input[52..]);
    let in_r7 = f32x4::from_slice(token, &input[60..]);
    let (r0, r1, r2, r3, r4, r5, r6, r7) =
        neon_dct1d_8(token, in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7);

    // Scale by 1/8 and transpose the 8x8 block as four 4x4 quadrants. Names
    // refer to the quadrant's position in the TRANSPOSED block:
    //   rows 0-3 / left  -> top-left       rows 4-7 / left  -> top-right
    //   rows 0-3 / right -> bottom-left    rows 4-7 / right -> bottom-right
    let (tl0, tl1, tl2, tl3) =
        neon_transpose_4x4(token, l0 * eighth, l1 * eighth, l2 * eighth, l3 * eighth);
    let (bl0, bl1, bl2, bl3) =
        neon_transpose_4x4(token, r0 * eighth, r1 * eighth, r2 * eighth, r3 * eighth);
    let (tr0, tr1, tr2, tr3) =
        neon_transpose_4x4(token, l4 * eighth, l5 * eighth, l6 * eighth, l7 * eighth);
    let (br0, br1, br2, br3) =
        neon_transpose_4x4(token, r4 * eighth, r5 * eighth, r6 * eighth, r7 * eighth);

    // Row pass over the left halves of the eight transposed rows.
    let (out_l0, out_l1, out_l2, out_l3, out_l4, out_l5, out_l6, out_l7) =
        neon_dct1d_8(token, tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3);

    // Row pass over the right halves of the eight transposed rows.
    let (out_r0, out_r1, out_r2, out_r3, out_r4, out_r5, out_r6, out_r7) =
        neon_dct1d_8(token, tr0, tr1, tr2, tr3, br0, br1, br2, br3);

    // Scale by 1/8 and store each output row as left half followed by right half.
    (out_l0 * eighth).store((&mut output[0..4]).try_into().unwrap());
    (out_r0 * eighth).store((&mut output[4..8]).try_into().unwrap());
    (out_l1 * eighth).store((&mut output[8..12]).try_into().unwrap());
    (out_r1 * eighth).store((&mut output[12..16]).try_into().unwrap());
    (out_l2 * eighth).store((&mut output[16..20]).try_into().unwrap());
    (out_r2 * eighth).store((&mut output[20..24]).try_into().unwrap());
    (out_l3 * eighth).store((&mut output[24..28]).try_into().unwrap());
    (out_r3 * eighth).store((&mut output[28..32]).try_into().unwrap());
    (out_l4 * eighth).store((&mut output[32..36]).try_into().unwrap());
    (out_r4 * eighth).store((&mut output[36..40]).try_into().unwrap());
    (out_l5 * eighth).store((&mut output[40..44]).try_into().unwrap());
    (out_r5 * eighth).store((&mut output[44..48]).try_into().unwrap());
    (out_l6 * eighth).store((&mut output[48..52]).try_into().unwrap());
    (out_r6 * eighth).store((&mut output[52..56]).try_into().unwrap());
    (out_l7 * eighth).store((&mut output[56..60]).try_into().unwrap());
    (out_r7 * eighth).store((&mut output[60..64]).try_into().unwrap());
}

/// NEON 8x8 inverse DCT built from `f32x4` registers.
///
/// Every 8-wide row is handled as a left half (elements 0-3) and a right half
/// (elements 4-7), so each 1-D pass runs the inverse butterfly twice. The 8x8
/// transpose between the passes is assembled from four 4x4 transposes.
///
/// `input` is 64 coefficients in transposed layout; `output` receives 64
/// samples in row-major order. No scaling is applied in this direction.
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_neon(token: archmage::NeonToken, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    // Left halves of the eight stored rows.
    let in_l0 = f32x4::from_slice(token, &input[0..]);
    let in_l1 = f32x4::from_slice(token, &input[8..]);
    let in_l2 = f32x4::from_slice(token, &input[16..]);
    let in_l3 = f32x4::from_slice(token, &input[24..]);
    let in_l4 = f32x4::from_slice(token, &input[32..]);
    let in_l5 = f32x4::from_slice(token, &input[40..]);
    let in_l6 = f32x4::from_slice(token, &input[48..]);
    let in_l7 = f32x4::from_slice(token, &input[56..]);

    // Right halves of the eight stored rows.
    let in_r0 = f32x4::from_slice(token, &input[4..]);
    let in_r1 = f32x4::from_slice(token, &input[12..]);
    let in_r2 = f32x4::from_slice(token, &input[20..]);
    let in_r3 = f32x4::from_slice(token, &input[28..]);
    let in_r4 = f32x4::from_slice(token, &input[36..]);
    let in_r5 = f32x4::from_slice(token, &input[44..]);
    let in_r6 = f32x4::from_slice(token, &input[52..]);
    let in_r7 = f32x4::from_slice(token, &input[60..]);

    // First inverse pass, once per half.
    let (l0, l1, l2, l3, l4, l5, l6, l7) =
        neon_idct1d_8(token, in_l0, in_l1, in_l2, in_l3, in_l4, in_l5, in_l6, in_l7);
    let (r0, r1, r2, r3, r4, r5, r6, r7) =
        neon_idct1d_8(token, in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7);

    // Transpose the 8x8 block as four 4x4 quadrants. Names refer to the
    // quadrant's position in the TRANSPOSED block:
    //   rows 0-3 / left  -> top-left       rows 4-7 / left  -> top-right
    //   rows 0-3 / right -> bottom-left    rows 4-7 / right -> bottom-right
    let (tl0, tl1, tl2, tl3) = neon_transpose_4x4(token, l0, l1, l2, l3);
    let (bl0, bl1, bl2, bl3) = neon_transpose_4x4(token, r0, r1, r2, r3);
    let (tr0, tr1, tr2, tr3) = neon_transpose_4x4(token, l4, l5, l6, l7);
    let (br0, br1, br2, br3) = neon_transpose_4x4(token, r4, r5, r6, r7);

    // Second inverse pass over the left halves of the eight transposed rows.
    let (out_l0, out_l1, out_l2, out_l3, out_l4, out_l5, out_l6, out_l7) =
        neon_idct1d_8(token, tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3);

    // Second inverse pass over the right halves of the eight transposed rows.
    let (out_r0, out_r1, out_r2, out_r3, out_r4, out_r5, out_r6, out_r7) =
        neon_idct1d_8(token, tr0, tr1, tr2, tr3, br0, br1, br2, br3);

    // Store each output row as left half followed by right half (row-major).
    out_l0.store((&mut output[0..4]).try_into().unwrap());
    out_r0.store((&mut output[4..8]).try_into().unwrap());
    out_l1.store((&mut output[8..12]).try_into().unwrap());
    out_r1.store((&mut output[12..16]).try_into().unwrap());
    out_l2.store((&mut output[16..20]).try_into().unwrap());
    out_r2.store((&mut output[20..24]).try_into().unwrap());
    out_l3.store((&mut output[24..28]).try_into().unwrap());
    out_r3.store((&mut output[28..32]).try_into().unwrap());
    out_l4.store((&mut output[32..36]).try_into().unwrap());
    out_r4.store((&mut output[36..40]).try_into().unwrap());
    out_l5.store((&mut output[40..44]).try_into().unwrap());
    out_r5.store((&mut output[44..48]).try_into().unwrap());
    out_l6.store((&mut output[48..52]).try_into().unwrap());
    out_r6.store((&mut output[52..56]).try_into().unwrap());
    out_l7.store((&mut output[56..60]).try_into().unwrap());
    out_r7.store((&mut output[60..64]).try_into().unwrap());
}

/// NEON vectorized 8-point forward DCT butterfly (f32x4, 4 independent DCTs).
///
/// Each register holds one input position across four independent transforms;
/// all arithmetic is element-wise across registers. The sequence of operations
/// mirrors `dct1d_8_scalar`, with `r0..r7` in the role of `mem[0]..mem[7]`.
/// The returned tuple holds the results in output order (mirrored-sum half at
/// even positions, mirrored-difference half at odd positions). No 1/8 scaling
/// is applied here; callers do that themselves.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn neon_dct1d_8(
    token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let sqrt2 = f32x4::splat(token, SQRT2);

    // Level 0: AddReverse / SubReverse (pair position i with its mirror 7 - i)
    let a0 = r0 + r7;
    let a1 = r1 + r6;
    let a2 = r2 + r5;
    let a3 = r3 + r4;
    let s0 = r0 - r7;
    let s1 = r1 - r6;
    let s2 = r2 - r5;
    let s3 = r3 - r4;

    // DCT-4 on first half
    let b0 = a0 + a3;
    let b1 = a1 + a2;
    let b2 = a0 - a3;
    let b3 = a1 - a2;

    // DCT-2 on the sums
    let c0 = b0 + b1;
    let c1 = b0 - b1;

    // Weight the differences with the 4-point multipliers
    let b2 = b2 * f32x4::splat(token, WC_MULTIPLIERS_4[0]);
    let b3 = b3 * f32x4::splat(token, WC_MULTIPLIERS_4[1]);

    // DCT-2 on the weighted differences, then B transform: d0 = sqrt2 * d0 + d1
    let d0 = b2 + b3;
    let d1 = b2 - b3;
    let d0 = sqrt2.mul_add(d0, d1);

    // Interleave: first half = [c0, d0, c1, d1]
    let fh0 = c0;
    let fh1 = d0;
    let fh2 = c1;
    let fh3 = d1;

    // WcMultipliers_8 on second half
    let s0 = s0 * f32x4::splat(token, WC_MULTIPLIERS_8[0]);
    let s1 = s1 * f32x4::splat(token, WC_MULTIPLIERS_8[1]);
    let s2 = s2 * f32x4::splat(token, WC_MULTIPLIERS_8[2]);
    let s3 = s3 * f32x4::splat(token, WC_MULTIPLIERS_8[3]);

    // DCT-4 on second half (same steps as for the first half)
    let e0 = s0 + s3;
    let e1 = s1 + s2;
    let e2 = s0 - s3;
    let e3 = s1 - s2;

    let f0 = e0 + e1;
    let f1 = e0 - e1;

    let e2 = e2 * f32x4::splat(token, WC_MULTIPLIERS_4[0]);
    let e3 = e3 * f32x4::splat(token, WC_MULTIPLIERS_4[1]);

    let g0 = e2 + e3;
    let g1 = e2 - e3;
    let g0 = sqrt2.mul_add(g0, g1);

    // Interleave: second half = [f0, g0, f1, g1]
    let sh0 = f0;
    let sh1 = g0;
    let sh2 = f1;
    let sh3 = g1;

    // B transform on second half
    // Order matters: each `let` shadows its binding, so every step must read
    // the next value before that value is replaced.
    let sh0 = sqrt2.mul_add(sh0, sh1);
    let sh1 = sh1 + sh2;
    let sh2 = sh2 + sh3;

    // Final interleave of first and second half
    (fh0, sh0, fh1, sh1, fh2, sh2, fh3, sh3)
}

/// NEON vectorized 8-point inverse DCT butterfly (f32x4).
///
/// Each register holds one coefficient position across four independent
/// transforms; all arithmetic is element-wise across registers. The steps undo
/// those of `neon_dct1d_8` in reverse order and mirror `idct1d_8_scalar`. The
/// 2-point stages are plain sum/difference pairs with no halving, and no other
/// rescaling is applied.
///
/// `r0..r7` are the coefficients in the interleaved order produced by the
/// forward butterfly; the returned tuple is positions 0..7 in natural order.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn neon_idct1d_8(
    token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);

    // De-interleave: even positions are the first half, odd positions the second
    let fh0 = r0;
    let sh0 = r1;
    let fh1 = r2;
    let sh1 = r3;
    let fh2 = r4;
    let sh2 = r5;
    let fh3 = r6;
    let sh3 = r7;

    // Inverse B transform on second half
    // (bottom-up, so each step uses the already-restored next value)
    let sh2 = sh2 - sh3;
    let sh1 = sh1 - sh2;
    let sh0 = (sh0 - sh1) * inv_sqrt2;

    // De-interleave the inner 4-point transform of the second half
    let f0 = sh0;
    let g0 = sh1;
    let f1 = sh2;
    let g1 = sh3;

    // Inverse of the inner B transform (forward was g0 = sqrt2 * g0 + g1)
    let g0 = (g0 - g1) * inv_sqrt2;

    // 2-point stage on the odd pair
    let e2p = g0 + g1;
    let e3p = g0 - g1;

    // Remove the 4-point multipliers
    let e2 = e2p * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let e3 = e3p * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // 2-point stage on the even pair
    let e0p = f0 + f1;
    let e1p = f0 - f1;

    // Unfold the inner 4-point sums and differences
    let s0 = e0p + e2;
    let s3 = e0p - e2;
    let s1 = e1p + e3;
    let s2 = e1p - e3;

    // Remove the 8-point multipliers
    let s0 = s0 * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let s1 = s1 * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let s2 = s2 * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let s3 = s3 * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // First half
    let c0 = fh0;
    let d0 = fh1;
    let c1 = fh2;
    let d1 = fh3;

    // Inverse B transform (forward was d0 = sqrt2 * d0_in + d1)
    let d0_in = (d0 - d1) * inv_sqrt2;

    // 2-point stage on the odd pair
    let b2p = d0_in + d1;
    let b3p = d0_in - d1;

    // Remove the 4-point multipliers
    let b2 = b2p * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let b3 = b3p * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // 2-point stage on the even pair
    let b0p = c0 + c1;
    let b1p = c0 - c1;

    // Unfold the 4-point sums and differences
    let a0 = b0p + b2;
    let a3 = b0p - b2;
    let a1 = b1p + b3;
    let a2 = b1p - b3;

    // Level 0: inverse add/sub reverse
    // (position i gets the sum, its mirror 7 - i gets the difference)
    let out0 = a0 + s0;
    let out7 = a0 - s0;
    let out1 = a1 + s1;
    let out6 = a1 - s1;
    let out2 = a2 + s2;
    let out5 = a2 - s2;
    let out3 = a3 + s3;
    let out4 = a3 - s3;

    (out0, out1, out2, out3, out4, out5, out6, out7)
}

/// Transposes a 4x4 tile of `f32` held in four NEON vectors.
///
/// `r0..r3` are the rows of the tile; the returned tuple holds its columns, so
/// output `k` is `[r0[k], r1[k], r2[k], r3[k]]`. The work is done in two steps:
/// a 32-bit `vtrn` on each pair of rows, then a 64-bit `vtrn` across the pairs.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity)]
fn neon_transpose_4x4(
    _token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use core::arch::aarch64::*;
    use magetypes::simd::f32x4;

    let (row0, row1, row2, row3) = (r0.raw(), r1.raw(), r2.raw(), r3.raw());

    // 32-bit step, immediately viewed as two 64-bit halves for the next step:
    //   even01 = [r0[0], r1[0] | r0[2], r1[2]]
    //   odd01  = [r0[1], r1[1] | r0[3], r1[3]]
    // and likewise for rows 2/3.
    let even01 = vreinterpretq_f64_f32(vtrn1q_f32(row0, row1));
    let odd01 = vreinterpretq_f64_f32(vtrn2q_f32(row0, row1));
    let even23 = vreinterpretq_f64_f32(vtrn1q_f32(row2, row3));
    let odd23 = vreinterpretq_f64_f32(vtrn2q_f32(row2, row3));

    // 64-bit step: pair the low halves (columns 0 and 1) and the high halves
    // (columns 2 and 3) of the row-pair results.
    let col0 = vreinterpretq_f32_f64(vtrn1q_f64(even01, even23));
    let col1 = vreinterpretq_f32_f64(vtrn1q_f64(odd01, odd23));
    let col2 = vreinterpretq_f32_f64(vtrn2q_f64(even01, even23));
    let col3 = vreinterpretq_f32_f64(vtrn2q_f64(odd01, odd23));

    // Despite its leading underscore, `_token` is used: re-wrapping the raw
    // vectors requires passing the token to `from_float32x4_t`.
    (
        f32x4::from_float32x4_t(_token, col0),
        f32x4::from_float32x4_t(_token, col1),
        f32x4::from_float32x4_t(_token, col2),
        f32x4::from_float32x4_t(_token, col3),
    )
}

// ============================================================================
// wasm32 SIMD128 implementation
// ============================================================================

/// Forward 8x8 DCT for wasm32 SIMD128.
///
/// `input` is read as eight rows of eight `f32`. An `f32x4` covers half a row,
/// so each 1D pass runs twice: once on lanes 0..4 and once on lanes 4..8 of
/// every row. Each pass is followed by a multiplication by 1/8. Between the two
/// directions the block is transposed in registers, which is why the
/// coefficients written to `output` are transposed relative to `input`.
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_wasm128(token: archmage::Wasm128Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    // Normalisation applied after every 1D pass.
    let eighth = f32x4::splat(token, 0.125);

    // First direction, lanes 0..4 of each input row.
    let (p0, p1, p2, p3, p4, p5, p6, p7) = wasm128_dct1d_8(
        token,
        f32x4::from_slice(token, &input[0..]),
        f32x4::from_slice(token, &input[8..]),
        f32x4::from_slice(token, &input[16..]),
        f32x4::from_slice(token, &input[24..]),
        f32x4::from_slice(token, &input[32..]),
        f32x4::from_slice(token, &input[40..]),
        f32x4::from_slice(token, &input[48..]),
        f32x4::from_slice(token, &input[56..]),
    );
    let (p0, p1, p2, p3) = (p0 * eighth, p1 * eighth, p2 * eighth, p3 * eighth);
    let (p4, p5, p6, p7) = (p4 * eighth, p5 * eighth, p6 * eighth, p7 * eighth);

    // First direction, lanes 4..8 of each input row.
    let (q0, q1, q2, q3, q4, q5, q6, q7) = wasm128_dct1d_8(
        token,
        f32x4::from_slice(token, &input[4..]),
        f32x4::from_slice(token, &input[12..]),
        f32x4::from_slice(token, &input[20..]),
        f32x4::from_slice(token, &input[28..]),
        f32x4::from_slice(token, &input[36..]),
        f32x4::from_slice(token, &input[44..]),
        f32x4::from_slice(token, &input[52..]),
        f32x4::from_slice(token, &input[60..]),
    );
    let (q0, q1, q2, q3) = (q0 * eighth, q1 * eighth, q2 * eighth, q3 * eighth);
    let (q4, q5, q6, q7) = (q4 * eighth, q5 * eighth, q6 * eighth, q7 * eighth);

    // 8x8 transpose assembled from four 4x4 tile transposes. The two
    // off-diagonal tiles trade places: the tile built from q0..q3 becomes the
    // bottom-left of the transposed block, the one from p4..p7 the top-right.
    let (tl0, tl1, tl2, tl3) = wasm128_transpose_4x4(token, p0, p1, p2, p3);
    let (bl0, bl1, bl2, bl3) = wasm128_transpose_4x4(token, q0, q1, q2, q3);
    let (tr0, tr1, tr2, tr3) = wasm128_transpose_4x4(token, p4, p5, p6, p7);
    let (br0, br1, br2, br3) = wasm128_transpose_4x4(token, q4, q5, q6, q7);

    // Second direction, left four lanes of the transposed block.
    let (u0, u1, u2, u3, u4, u5, u6, u7) =
        wasm128_dct1d_8(token, tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3);
    // Second direction, right four lanes of the transposed block.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        wasm128_dct1d_8(token, tr0, tr1, tr2, tr3, br0, br1, br2, br3);

    // Output row `i` is the left vector `u_i` followed by the right vector `v_i`.
    let rows = [
        (u0 * eighth, v0 * eighth),
        (u1 * eighth, v1 * eighth),
        (u2 * eighth, v2 * eighth),
        (u3 * eighth, v3 * eighth),
        (u4 * eighth, v4 * eighth),
        (u5 * eighth, v5 * eighth),
        (u6 * eighth, v6 * eighth),
        (u7 * eighth, v7 * eighth),
    ];
    for (dst, (left, right)) in output.chunks_exact_mut(8).zip(rows) {
        let (dst_left, dst_right) = dst.split_at_mut(4);
        left.store(dst_left.try_into().unwrap());
        right.store(dst_right.try_into().unwrap());
    }
}

/// Inverse 8x8 DCT for wasm32 SIMD128.
///
/// Mirrors `dct_8x8_wasm128`: a 1D inverse pass over lanes 0..4 and 4..8 of the
/// eight input rows, an in-register 8x8 transpose, then a second 1D inverse
/// pass. No extra scaling is applied here. The result is written to `output`
/// one 8-float row at a time.
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_wasm128(token: archmage::Wasm128Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    // First inverse pass, lanes 0..4 of each input row.
    let (p0, p1, p2, p3, p4, p5, p6, p7) = wasm128_idct1d_8(
        token,
        f32x4::from_slice(token, &input[0..]),
        f32x4::from_slice(token, &input[8..]),
        f32x4::from_slice(token, &input[16..]),
        f32x4::from_slice(token, &input[24..]),
        f32x4::from_slice(token, &input[32..]),
        f32x4::from_slice(token, &input[40..]),
        f32x4::from_slice(token, &input[48..]),
        f32x4::from_slice(token, &input[56..]),
    );

    // First inverse pass, lanes 4..8 of each input row.
    let (q0, q1, q2, q3, q4, q5, q6, q7) = wasm128_idct1d_8(
        token,
        f32x4::from_slice(token, &input[4..]),
        f32x4::from_slice(token, &input[12..]),
        f32x4::from_slice(token, &input[20..]),
        f32x4::from_slice(token, &input[28..]),
        f32x4::from_slice(token, &input[36..]),
        f32x4::from_slice(token, &input[44..]),
        f32x4::from_slice(token, &input[52..]),
        f32x4::from_slice(token, &input[60..]),
    );

    // 8x8 transpose assembled from four 4x4 tile transposes. The two
    // off-diagonal tiles trade places: the tile built from q0..q3 becomes the
    // bottom-left of the transposed block, the one from p4..p7 the top-right.
    let (tl0, tl1, tl2, tl3) = wasm128_transpose_4x4(token, p0, p1, p2, p3);
    let (bl0, bl1, bl2, bl3) = wasm128_transpose_4x4(token, q0, q1, q2, q3);
    let (tr0, tr1, tr2, tr3) = wasm128_transpose_4x4(token, p4, p5, p6, p7);
    let (br0, br1, br2, br3) = wasm128_transpose_4x4(token, q4, q5, q6, q7);

    // Second inverse pass, left four lanes of the transposed block.
    let (u0, u1, u2, u3, u4, u5, u6, u7) =
        wasm128_idct1d_8(token, tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3);
    // Second inverse pass, right four lanes of the transposed block.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        wasm128_idct1d_8(token, tr0, tr1, tr2, tr3, br0, br1, br2, br3);

    // Output row `i` is the left vector `u_i` followed by the right vector `v_i`.
    let rows = [
        (u0, v0),
        (u1, v1),
        (u2, v2),
        (u3, v3),
        (u4, v4),
        (u5, v5),
        (u6, v6),
        (u7, v7),
    ];
    for (dst, (left, right)) in output.chunks_exact_mut(8).zip(rows) {
        let (dst_left, dst_right) = dst.split_at_mut(4);
        left.store(dst_left.try_into().unwrap());
        right.store(dst_right.try_into().unwrap());
    }
}

/// Unscaled 8-point forward DCT butterfly on `f32x4` vectors.
///
/// Every lane is an independent transform: lane `j` of `r0..r7` is the 8-sample
/// input of transform `j`, and lane `j` of the eight returned vectors holds its
/// coefficients 0..7 in order. Even-indexed coefficients come from a 4-point
/// butterfly on the mirrored sums `r[i] + r[7 - i]`; odd-indexed ones from a
/// 4-point butterfly on the weighted mirrored differences plus a final
/// neighbour-combining step.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn wasm128_dct1d_8(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    // Broadcast constants shared by both halves.
    let sqrt2 = f32x4::splat(token, SQRT2);
    let w4_0 = f32x4::splat(token, WC_MULTIPLIERS_4[0]);
    let w4_1 = f32x4::splat(token, WC_MULTIPLIERS_4[1]);
    let w8_0 = f32x4::splat(token, WC_MULTIPLIERS_8[0]);
    let w8_1 = f32x4::splat(token, WC_MULTIPLIERS_8[1]);
    let w8_2 = f32x4::splat(token, WC_MULTIPLIERS_8[2]);
    let w8_3 = f32x4::splat(token, WC_MULTIPLIERS_8[3]);

    // Mirrored sums feed the even coefficients, mirrored differences the odd ones.
    let (sum0, sum1, sum2, sum3) = (r0 + r7, r1 + r6, r2 + r5, r3 + r4);
    let (dif0, dif1, dif2, dif3) = (r0 - r7, r1 - r6, r2 - r5, r3 - r4);

    // ---- Even half: 4-point butterfly on the sums -> y0, y2, y4, y6 ----
    let ev_lo0 = sum0 + sum3;
    let ev_lo1 = sum1 + sum2;
    let ev_hi0 = (sum0 - sum3) * w4_0;
    let ev_hi1 = (sum1 - sum2) * w4_1;

    let y0 = ev_lo0 + ev_lo1;
    let y4 = ev_lo0 - ev_lo1;
    let y6 = ev_hi0 - ev_hi1;
    let y2 = sqrt2.mul_add(ev_hi0 + ev_hi1, y6);

    // ---- Odd half: weight the differences, then the same 4-point butterfly ----
    let wd0 = dif0 * w8_0;
    let wd1 = dif1 * w8_1;
    let wd2 = dif2 * w8_2;
    let wd3 = dif3 * w8_3;

    let od_lo0 = wd0 + wd3;
    let od_lo1 = wd1 + wd2;
    let od_hi0 = (wd0 - wd3) * w4_0;
    let od_hi1 = (wd1 - wd2) * w4_1;

    let h0 = od_lo0 + od_lo1;
    let h2 = od_lo0 - od_lo1;
    let h3 = od_hi0 - od_hi1;
    let h1 = sqrt2.mul_add(od_hi0 + od_hi1, h3);

    // Final step of the odd half: the first term goes through `mul_add` with
    // sqrt2, every other term absorbs its successor, and the last one passes
    // through unchanged.
    let y1 = sqrt2.mul_add(h0, h1);
    let y3 = h1 + h2;
    let y5 = h2 + h3;
    let y7 = h3;

    (y0, y1, y2, y3, y4, y5, y6, y7)
}

/// 8-point inverse DCT butterfly on `f32x4` vectors.
///
/// Every lane is an independent transform: `r0..r7` hold coefficients 0..7 and
/// the returned tuple holds samples 0..7. Even-indexed inputs rebuild the
/// mirrored sums and odd-indexed inputs the mirrored differences; the final
/// add/sub yields samples `i` and `7 - i`.
///
/// Sum/difference stages are undone without the factor 1/2. Over the three
/// butterfly levels this is a gain of 8 relative to a strict inverse of
/// `wasm128_dct1d_8`.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn wasm128_idct1d_8(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    // Reciprocals are formed in scalar arithmetic before broadcasting, so the
    // vector code only ever multiplies.
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let inv_w4_0 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let inv_w4_1 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);
    let inv_w8_0 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let inv_w8_1 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let inv_w8_2 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let inv_w8_3 = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // ---- Even coefficients (r0, r2, r4, r6) -> mirrored sums ----
    let ev_mid = (r2 - r6) * inv_sqrt2;
    let ev_hi0 = (ev_mid + r6) * inv_w4_0;
    let ev_hi1 = (ev_mid - r6) * inv_w4_1;
    let ev_lo0 = r0 + r4;
    let ev_lo1 = r0 - r4;

    let even0 = ev_lo0 + ev_hi0;
    let even3 = ev_lo0 - ev_hi0;
    let even1 = ev_lo1 + ev_hi1;
    let even2 = ev_lo1 - ev_hi1;

    // ---- Odd coefficients (r1, r3, r5, r7) -> mirrored differences ----
    // Undo the neighbour-combining step back to front; r7 passes through.
    let u2 = r5 - r7;
    let u1 = r3 - u2;
    let u0 = (r1 - u1) * inv_sqrt2;

    let od_mid = (u1 - r7) * inv_sqrt2;
    let od_hi0 = (od_mid + r7) * inv_w4_0;
    let od_hi1 = (od_mid - r7) * inv_w4_1;
    let od_lo0 = u0 + u2;
    let od_lo1 = u0 - u2;

    let odd0 = (od_lo0 + od_hi0) * inv_w8_0;
    let odd1 = (od_lo1 + od_hi1) * inv_w8_1;
    let odd2 = (od_lo1 - od_hi1) * inv_w8_2;
    let odd3 = (od_lo0 - od_hi0) * inv_w8_3;

    // Sample i = even_i + odd_i, sample 7 - i = even_i - odd_i.
    (
        even0 + odd0,
        even1 + odd1,
        even2 + odd2,
        even3 + odd3,
        even3 - odd3,
        even2 - odd2,
        even1 - odd1,
        even0 - odd0,
    )
}

/// Transposes a 4x4 tile of `f32` held in four wasm `v128` vectors.
///
/// `r0..r3` are the rows of the tile; the returned tuple holds its columns, so
/// output `k` is `[r0[k], r1[k], r2[k], r3[k]]`. Uses a 32-bit lane shuffle
/// followed by a 64-bit lane shuffle.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity)]
fn wasm128_transpose_4x4(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use core::arch::wasm32::*;
    use magetypes::simd::f32x4;

    let (row0, row1, row2, row3) = (r0.raw(), r1.raw(), r2.raw(), r3.raw());

    // 32-bit interleave of each row pair (shuffle indices 4..8 select from the
    // second operand):
    //   lo01 = [r0[0], r1[0], r0[1], r1[1]]    hi01 = [r0[2], r1[2], r0[3], r1[3]]
    //   lo23 = [r2[0], r3[0], r2[1], r3[1]]    hi23 = [r2[2], r3[2], r2[3], r3[3]]
    let lo01 = i32x4_shuffle::<0, 4, 1, 5>(row0, row1);
    let hi01 = i32x4_shuffle::<2, 6, 3, 7>(row0, row1);
    let lo23 = i32x4_shuffle::<0, 4, 1, 5>(row2, row3);
    let hi23 = i32x4_shuffle::<2, 6, 3, 7>(row2, row3);

    // 64-bit step: <0, 2> joins the low halves of both operands, <1, 3> the
    // high halves, producing one full column each.
    let col0 = i64x2_shuffle::<0, 2>(lo01, lo23);
    let col1 = i64x2_shuffle::<1, 3>(lo01, lo23);
    let col2 = i64x2_shuffle::<0, 2>(hi01, hi23);
    let col3 = i64x2_shuffle::<1, 3>(hi01, hi23);

    (
        f32x4::from_v128(token, col0),
        f32x4::from_v128(token, col1),
        f32x4::from_v128(token, col2),
        f32x4::from_v128(token, col3),
    )
}

#[cfg(test)]
mod tests {
    extern crate std;
    use super::*;

    /// Test block with `sin(0.1 * i)` at flat index `i`.
    fn sine_block() -> [f32; 64] {
        core::array::from_fn(|i| (i as f32 * 0.1).sin())
    }

    /// Test block with `cos(0.37 * i + 1.5)` at flat index `i`.
    fn cosine_block() -> [f32; 64] {
        core::array::from_fn(|i| ((i as f32) * 0.37 + 1.5).cos())
    }

    /// Scalar forward followed by scalar inverse reproduces the input.
    #[test]
    fn test_dct_8x8_scalar_roundtrip() {
        let input = sine_block();
        let mut coeffs = [0.0f32; 64];
        let mut restored = [0.0f32; 64];

        dct_8x8_scalar(&input, &mut coeffs);
        idct_8x8_scalar(&coeffs, &mut restored);

        for (i, (&want, &got)) in input.iter().zip(restored.iter()).enumerate() {
            assert!(
                (want - got).abs() < 1e-4,
                "Roundtrip mismatch at {}: {} vs {}",
                i,
                want,
                got
            );
        }
    }

    /// A block whose only non-zero samples sit in column 0 must transform
    /// identically through the dispatched and the scalar path.
    #[test]
    fn test_vectorized_butterfly_vs_scalar() {
        let column = [1.0f32, 0.5, -0.3, 0.8, -0.1, 0.6, -0.4, 0.9];

        let mut block = [0.0f32; 64];
        for (row, &sample) in block.chunks_exact_mut(8).zip(column.iter()) {
            row[0] = sample;
        }
        let mut expected = [0.0f32; 64];
        dct_8x8_scalar(&block, &mut expected);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut actual = [0.0f32; 64];
                dct_8x8(&block, &mut actual);
                for (i, (&simd, &scalar)) in actual.iter().zip(expected.iter()).enumerate() {
                    let diff = (simd - scalar).abs();
                    assert!(
                        diff < 1e-5,
                        "Mismatch at [{i}]: simd={:.6} scalar={:.6} diff={diff:.6} [{perm}]",
                        simd,
                        scalar,
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// Dispatched forward DCT agrees with the scalar reference on a dense block.
    #[test]
    fn test_dct_8x8_simd_matches_scalar() {
        let input = cosine_block();
        let mut expected = [0.0f32; 64];
        dct_8x8_scalar(&input, &mut expected);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut actual = [0.0f32; 64];
                dct_8x8(&input, &mut actual);
                for (i, (&scalar, &simd)) in expected.iter().zip(actual.iter()).enumerate() {
                    assert!(
                        (scalar - simd).abs() < 1e-5,
                        "DCT mismatch at {i}: scalar={} simd={} [{perm}]",
                        scalar,
                        simd
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// Dispatched inverse DCT agrees with the scalar reference on a dense block.
    #[test]
    fn test_idct_8x8_simd_matches_scalar() {
        let input = cosine_block();
        let mut expected = [0.0f32; 64];
        idct_8x8_scalar(&input, &mut expected);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut actual = [0.0f32; 64];
                idct_8x8(&input, &mut actual);
                for (i, (&scalar, &simd)) in expected.iter().zip(actual.iter()).enumerate() {
                    assert!(
                        (scalar - simd).abs() < 1e-5,
                        "IDCT mismatch at {i}: scalar={} simd={} [{perm}]",
                        scalar,
                        simd
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// Dispatched forward followed by dispatched inverse reproduces the input.
    #[test]
    fn test_dct_idct_simd_roundtrip() {
        let input = sine_block();

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut coeffs = [0.0f32; 64];
                let mut restored = [0.0f32; 64];
                dct_8x8(&input, &mut coeffs);
                idct_8x8(&coeffs, &mut restored);
                for (i, (&want, &got)) in input.iter().zip(restored.iter()).enumerate() {
                    assert!(
                        (want - got).abs() < 1e-4,
                        "SIMD roundtrip mismatch at {i}: {} vs {} [{perm}]",
                        want,
                        got
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }
}