#![allow(dead_code)]
use archmage::prelude::*;
use magetypes::simd::generic::i32x8 as GenericI32x8;
#[cfg(target_arch = "x86_64")]
use archmage::SimdToken;
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64 as safe_simd;
// Bias folded into the second IDCT pass before the final `>> 17`:
// `128 << 17` applies the +128 level shift and `65536` (1 << 16) rounds the
// 17-bit descale. NOTE(review): the additional `512` term equals the
// first-pass rounding constant — confirm it is intentional here.
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
/// Convert a real-valued DCT factor to 12-fractional-bit fixed point,
/// rounding half up (the stb-style `f2f` scheme).
#[inline]
const fn f2f(x: f32) -> i32 {
    let scaled = x * 4096.0 + 0.5;
    scaled as i32
}
/// Promote an integer to the 12-fractional-bit fixed-point scale
/// (multiply by 4096 via a left shift).
#[inline]
const fn fsh(x: i32) -> i32 {
    x.wrapping_shl(12)
}
/// Saturate a descaled sample into the 8-bit pixel range 0..=255,
/// returned as `i16` to match the output buffer element type.
#[inline]
fn clamp(a: i32) -> i16 {
    if a < 0 {
        0
    } else if a > 255 {
        255
    } else {
        a as i16
    }
}
/// Wrapping (two's-complement) addition helper used by the fixed-point IDCT.
#[inline(always)]
const fn wa(a: i32, b: i32) -> i32 {
    i32::wrapping_add(a, b)
}
/// Wrapping (two's-complement) subtraction helper used by the fixed-point IDCT.
#[inline(always)]
const fn ws(a: i32, b: i32) -> i32 {
    i32::wrapping_sub(a, b)
}
/// Wrapping (two's-complement) multiplication helper used by the fixed-point IDCT.
#[inline(always)]
const fn wm(a: i32, b: i32) -> i32 {
    i32::wrapping_mul(a, b)
}
/// Fast path for a block whose only nonzero coefficient is the DC term:
/// every output pixel is the same value, `((dc + 4 + 1024) >> 3)` clamped
/// to 0..=255, written as eight 8-sample rows spaced `stride` apart.
///
/// # Panics
/// Panics if `out_vector.len() < stride * 7 + 8`.
#[inline]
pub fn idct_int_dc_only(dc_coeff: i32, out_vector: &mut [i16], stride: usize) {
    // +4 rounds the >> 3 descale; +1024 bakes in the +128 level shift (128 << 3).
    let biased = dc_coeff.wrapping_add(4).wrapping_add(1024);
    let coeff = biased.wrapping_shr(3).clamp(0, 255) as i16;
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out = &mut out_vector[..min_len];
    let mut off = 0;
    for _ in 0..8 {
        out[off..off + 8].fill(coeff);
        off += stride;
    }
}
/// Returns `true` when every AC coefficient (indices 1..64) is zero,
/// i.e. the block can take the constant-output fast path.
#[inline]
pub fn is_dc_only_int(coeffs: &[i32; 64]) -> bool {
    !coeffs.iter().skip(1).any(|&c| c != 0)
}
// libjpeg-style fixed-point constants: FIX(x) = round(x * 2^13). Each value
// carries LJ_CONST_BITS fractional bits and the name encodes the real
// factor, e.g. LJ_FIX_0_298631336 == round(0.298631336 * 8192) == 2446.
const LJ_FIX_0_298631336: i64 = 2446;
const LJ_FIX_0_390180644: i64 = 3196;
const LJ_FIX_0_541196100: i64 = 4433;
const LJ_FIX_0_765366865: i64 = 6270;
const LJ_FIX_0_899976223: i64 = 7373;
const LJ_FIX_1_175875602: i64 = 9633;
const LJ_FIX_1_501321110: i64 = 12299;
const LJ_FIX_1_847759065: i64 = 15137;
const LJ_FIX_1_961570560: i64 = 16069;
const LJ_FIX_2_053119869: i64 = 16819;
const LJ_FIX_2_562915447: i64 = 20995;
const LJ_FIX_3_072711026: i64 = 25172;
// Fractional bits carried by the LJ_FIX_* constants above.
const LJ_CONST_BITS: u32 = 13;
// Extra precision bits retained between the column pass and the row pass.
const LJ_PASS1_BITS: u32 = 2;
/// Right-shift `x` by `n` bits with round-half-up behavior
/// (adds `2^(n-1)` before the arithmetic shift).
#[inline(always)]
const fn descale(x: i64, n: u32) -> i64 {
    let rounding = 1i64 << (n - 1);
    (x + rounding) >> n
}
#[allow(clippy::too_many_lines)]
/// 8x8 integer inverse DCT following libjpeg's `jpeg_idct_islow`
/// fixed-point structure: a column pass into an i64 workspace (keeping
/// `LJ_PASS1_BITS` extra fractional bits), then a row pass that applies the
/// +128 level shift and clamps output samples to 0..=255.
///
/// One 8-pixel output row is written every `stride` elements.
///
/// # Panics
/// Panics if `out_vector.len() < stride * 7 + 8`.
pub fn idct_int_libjpeg(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // DC-only blocks decode to a constant; skip both passes.
    if is_dc_only_int(in_vector) {
        return idct_int_dc_only(in_vector[0], out_vector, stride);
    }
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out_vector = &mut out_vector[..min_len];
    let mut workspace = [0i64; 64];
    // Pass 1: transform each column into the workspace.
    for col in 0..8 {
        // All-zero AC column: the whole column is the (scaled) DC value.
        if in_vector[col + 8] == 0
            && in_vector[col + 16] == 0
            && in_vector[col + 24] == 0
            && in_vector[col + 32] == 0
            && in_vector[col + 40] == 0
            && in_vector[col + 48] == 0
            && in_vector[col + 56] == 0
        {
            let dcval = (in_vector[col] as i64) << LJ_PASS1_BITS;
            workspace[col] = dcval;
            workspace[col + 8] = dcval;
            workspace[col + 16] = dcval;
            workspace[col + 24] = dcval;
            workspace[col + 32] = dcval;
            workspace[col + 40] = dcval;
            workspace[col + 48] = dcval;
            workspace[col + 56] = dcval;
            continue;
        }
        // Even part: coefficients 0, 2, 4, 6.
        let z2 = in_vector[col + 16] as i64;
        let z3 = in_vector[col + 48] as i64;
        let z1 = (z2 + z3) * LJ_FIX_0_541196100;
        let tmp2 = z1 + z3 * (-LJ_FIX_1_847759065);
        let tmp3 = z1 + z2 * LJ_FIX_0_765366865;
        let z2 = in_vector[col] as i64;
        let z3 = in_vector[col + 32] as i64;
        let tmp0 = (z2 + z3) << LJ_CONST_BITS;
        let tmp1 = (z2 - z3) << LJ_CONST_BITS;
        let tmp10 = tmp0 + tmp3;
        let tmp13 = tmp0 - tmp3;
        let tmp11 = tmp1 + tmp2;
        let tmp12 = tmp1 - tmp2;
        // Odd part: coefficients 1, 3, 5, 7.
        let mut tmp0 = in_vector[col + 56] as i64;
        let mut tmp1 = in_vector[col + 40] as i64;
        let mut tmp2 = in_vector[col + 24] as i64;
        let mut tmp3 = in_vector[col + 8] as i64;
        let z1 = tmp0 + tmp3;
        let z2 = tmp1 + tmp2;
        let z3 = tmp0 + tmp2;
        let z4 = tmp1 + tmp3;
        let z5 = (z3 + z4) * LJ_FIX_1_175875602;
        tmp0 *= LJ_FIX_0_298631336;
        tmp1 *= LJ_FIX_2_053119869;
        tmp2 *= LJ_FIX_3_072711026;
        tmp3 *= LJ_FIX_1_501321110;
        let z1 = z1 * (-LJ_FIX_0_899976223);
        let z2 = z2 * (-LJ_FIX_2_562915447);
        let z3 = z3 * (-LJ_FIX_1_961570560) + z5;
        let z4 = z4 * (-LJ_FIX_0_390180644) + z5;
        tmp0 += z1 + z3;
        tmp1 += z2 + z4;
        tmp2 += z2 + z3;
        tmp3 += z1 + z4;
        // Butterfly outputs, descaled to keep LJ_PASS1_BITS extra precision.
        workspace[col] = descale(tmp10 + tmp3, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 56] = descale(tmp10 - tmp3, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 8] = descale(tmp11 + tmp2, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 48] = descale(tmp11 - tmp2, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 16] = descale(tmp12 + tmp1, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 40] = descale(tmp12 - tmp1, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 24] = descale(tmp13 + tmp0, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[col + 32] = descale(tmp13 - tmp0, LJ_CONST_BITS - LJ_PASS1_BITS);
    }
    // Pass 2: transform each workspace row into clamped output pixels.
    let total_shift = LJ_CONST_BITS + LJ_PASS1_BITS + 3;
    for row in 0..8 {
        let base = row * 8;
        // All-zero AC row: constant output row.
        if workspace[base + 1] == 0
            && workspace[base + 2] == 0
            && workspace[base + 3] == 0
            && workspace[base + 4] == 0
            && workspace[base + 5] == 0
            && workspace[base + 6] == 0
            && workspace[base + 7] == 0
        {
            let dcval = (descale(workspace[base], LJ_PASS1_BITS + 3) + 128).clamp(0, 255) as i16;
            let out_base = row * stride;
            out_vector[out_base..out_base + 8].fill(dcval);
            continue;
        }
        // Even part.
        let z2 = workspace[base + 2];
        let z3 = workspace[base + 6];
        let z1 = (z2 + z3) * LJ_FIX_0_541196100;
        let tmp2 = z1 + z3 * (-LJ_FIX_1_847759065);
        let tmp3 = z1 + z2 * LJ_FIX_0_765366865;
        let tmp0 = (workspace[base] + workspace[base + 4]) << LJ_CONST_BITS;
        let tmp1 = (workspace[base] - workspace[base + 4]) << LJ_CONST_BITS;
        let tmp10 = tmp0 + tmp3;
        let tmp13 = tmp0 - tmp3;
        let tmp11 = tmp1 + tmp2;
        let tmp12 = tmp1 - tmp2;
        // Odd part.
        let mut tmp0 = workspace[base + 7];
        let mut tmp1 = workspace[base + 5];
        let mut tmp2 = workspace[base + 3];
        let mut tmp3 = workspace[base + 1];
        let z1 = tmp0 + tmp3;
        let z2 = tmp1 + tmp2;
        let z3 = tmp0 + tmp2;
        let z4 = tmp1 + tmp3;
        let z5 = (z3 + z4) * LJ_FIX_1_175875602;
        tmp0 *= LJ_FIX_0_298631336;
        tmp1 *= LJ_FIX_2_053119869;
        tmp2 *= LJ_FIX_3_072711026;
        tmp3 *= LJ_FIX_1_501321110;
        let z1 = z1 * (-LJ_FIX_0_899976223);
        let z2 = z2 * (-LJ_FIX_2_562915447);
        let z3 = z3 * (-LJ_FIX_1_961570560) + z5;
        let z4 = z4 * (-LJ_FIX_0_390180644) + z5;
        tmp0 += z1 + z3;
        tmp1 += z2 + z4;
        tmp2 += z2 + z3;
        tmp3 += z1 + z4;
        // Final descale, +128 level shift, clamp, and store.
        let out_base = row * stride;
        out_vector[out_base] = (descale(tmp10 + tmp3, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 7] = (descale(tmp10 - tmp3, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 1] = (descale(tmp11 + tmp2, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 6] = (descale(tmp11 - tmp2, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 2] = (descale(tmp12 + tmp1, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 5] = (descale(tmp12 - tmp1, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 3] = (descale(tmp13 + tmp0, total_shift) + 128).clamp(0, 255) as i16;
        out_vector[out_base + 4] = (descale(tmp13 - tmp0, total_shift) + 128).clamp(0, 255) as i16;
    }
}
#[allow(clippy::too_many_lines)]
/// 8x8 integer inverse DCT using 12-fractional-bit fixed point. The bare
/// literals (2217, -7567, 3135, ...) are precomputed `f2f(x)` values — note
/// pass 2 uses `f2f(1.175_875_6)` interchangeably with the literal 4816 used
/// in pass 1. All arithmetic is wrapping (`wa`/`ws`/`wm`).
///
/// Pass 1 transforms columns in place in `in_vector` (+512 rounding, then
/// `>> 10`); pass 2 transforms rows, adds `SCALE_BITS` (rounding plus the
/// +128 level shift), descales by `>> 17`, clamps to 0..=255, and writes one
/// 8-pixel row every `stride` elements.
///
/// # Panics
/// Panics via slice indexing if `out_vector.len() < stride * 7 + 8`
/// (there is no up-front length assert on the non-DC path).
pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // DC-only fast path: the block decodes to a single constant.
    if is_dc_only_int(in_vector) {
        return idct_int_dc_only(in_vector[0], out_vector, stride);
    }
    // Pass 1: one column per iteration, results written back into `in_vector`.
    for ptr in 0..8 {
        // Even part: coefficients 0, 2, 4, 6.
        let p2 = in_vector[ptr + 16];
        let p3 = in_vector[ptr + 48];
        let p1 = wm(wa(p2, p3), 2217);
        let t2 = wa(p1, wm(p3, -7567));
        let t3 = wa(p1, wm(p2, 3135));
        let p2 = in_vector[ptr];
        let p3 = in_vector[32 + ptr];
        let t0 = fsh(wa(p2, p3));
        let t1 = fsh(ws(p2, p3));
        // +512 rounds the upcoming >> 10.
        let x0 = wa(wa(t0, t3), 512);
        let x3 = wa(ws(t0, t3), 512);
        let x1 = wa(wa(t1, t2), 512);
        let x2 = wa(ws(t1, t2), 512);
        // Odd part: coefficients 1, 3, 5, 7.
        let mut t0 = in_vector[ptr + 56];
        let mut t1 = in_vector[ptr + 40];
        let mut t2 = in_vector[ptr + 24];
        let mut t3 = in_vector[ptr + 8];
        let p3 = wa(t0, t2);
        let p4 = wa(t1, t3);
        let p1 = wa(t0, t3);
        let p2 = wa(t1, t2);
        let p5 = wm(wa(p3, p4), 4816);
        t0 = wm(t0, 1223);
        t1 = wm(t1, 8410);
        t2 = wm(t2, 12586);
        t3 = wm(t3, 6149);
        let p1 = wa(p5, wm(p1, -3685));
        let p2 = wa(p5, wm(p2, -10497));
        let p3 = wm(p3, -8034);
        let p4 = wm(p4, -1597);
        t3 = wa(t3, wa(p1, p4));
        t2 = wa(t2, wa(p2, p3));
        t1 = wa(t1, wa(p2, p4));
        t0 = wa(t0, wa(p1, p3));
        // Butterfly outputs, descaled back into the coefficient buffer.
        in_vector[ptr] = wa(x0, t3) >> 10;
        in_vector[ptr + 8] = wa(x1, t2) >> 10;
        in_vector[ptr + 16] = wa(x2, t1) >> 10;
        in_vector[ptr + 24] = wa(x3, t0) >> 10;
        in_vector[ptr + 32] = ws(x3, t0) >> 10;
        in_vector[ptr + 40] = ws(x2, t1) >> 10;
        in_vector[ptr + 48] = ws(x1, t2) >> 10;
        in_vector[ptr + 56] = ws(x0, t3) >> 10;
    }
    // Pass 2: one row per iteration, producing clamped output pixels.
    let mut pos = 0;
    for i in (0..64).step_by(8) {
        // Even part.
        let p2 = in_vector[i + 2];
        let p3 = in_vector[i + 6];
        let p1 = wm(wa(p2, p3), 2217);
        let t2 = wa(p1, wm(p3, -7567));
        let t3 = wa(p1, wm(p2, 3135));
        let p2 = in_vector[i];
        let p3 = in_vector[i + 4];
        let t0 = fsh(wa(p2, p3));
        let t1 = fsh(ws(p2, p3));
        // SCALE_BITS bundles the >> 17 rounding and the +128 level shift.
        let x0 = wa(wa(t0, t3), SCALE_BITS);
        let x3 = wa(ws(t0, t3), SCALE_BITS);
        let x1 = wa(wa(t1, t2), SCALE_BITS);
        let x2 = wa(ws(t1, t2), SCALE_BITS);
        // Odd part.
        let mut t0 = in_vector[i + 7];
        let mut t1 = in_vector[i + 5];
        let mut t2 = in_vector[i + 3];
        let mut t3 = in_vector[i + 1];
        let p3 = wa(t0, t2);
        let p4 = wa(t1, t3);
        let p1 = wa(t0, t3);
        let p2 = wa(t1, t2);
        let p5 = wm(wa(p3, p4), f2f(1.175_875_6));
        t0 = wm(t0, 1223);
        t1 = wm(t1, 8410);
        t2 = wm(t2, 12586);
        t3 = wm(t3, 6149);
        let p1 = wa(p5, wm(p1, -3685));
        let p2 = wa(p5, wm(p2, -10497));
        let p3 = wm(p3, -8034);
        let p4 = wm(p4, -1597);
        t3 = wa(t3, wa(p1, p4));
        t2 = wa(t2, wa(p2, p3));
        t1 = wa(t1, wa(p2, p4));
        t0 = wa(t0, wa(p1, p3));
        out_vector[pos] = clamp(wa(x0, t3) >> 17);
        out_vector[pos + 1] = clamp(wa(x1, t2) >> 17);
        out_vector[pos + 2] = clamp(wa(x2, t1) >> 17);
        out_vector[pos + 3] = clamp(wa(x3, t0) >> 17);
        out_vector[pos + 4] = clamp(ws(x3, t0) >> 17);
        out_vector[pos + 5] = clamp(ws(x2, t1) >> 17);
        out_vector[pos + 6] = clamp(ws(x1, t2) >> 17);
        out_vector[pos + 7] = clamp(ws(x0, t3) >> 17);
        pos += stride;
    }
}
#[allow(clippy::too_many_lines)]
/// Reduced-cost 8x8 IDCT for blocks whose nonzero coefficients all lie in
/// the top-left 4x4 corner: the column pass only processes columns 0..4 and
/// the row pass only reads the first four entries of each row, so all other
/// coefficients are assumed zero. Scaling and clamping match `idct_int`
/// (wrapping math, +512 / >> 10 in pass 1, `SCALE_BITS` / >> 17 in pass 2,
/// clamp to 0..=255, one 8-pixel row every `stride` elements).
///
/// # Panics
/// Panics via slice indexing if `out_vector.len() < stride * 7 + 8`.
pub fn idct_int_4x4(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // Pass 1: columns 0..4 only; all 8 output rows are written back into
    // `in_vector`.
    for ptr in 0..4 {
        // Even part: coefficients 0 and 2 (4 and 6 assumed zero).
        let i0 = wa(fsh(in_vector[ptr]), 512);
        let i2 = in_vector[ptr + 16];
        let p1 = wm(i2, 2217);
        let p3 = wm(i2, 5352);
        let x0 = wa(i0, p3);
        let x1 = wa(i0, p1);
        let x2 = ws(i0, p1);
        let x3 = ws(i0, p3);
        // Odd part: coefficients 1 and 3 (5 and 7 assumed zero).
        let i4 = in_vector[ptr + 24];
        let i3 = in_vector[ptr + 8];
        let p5 = wm(wa(i4, i3), 4816);
        let p1 = wa(p5, wm(i3, -3685));
        let p2 = wa(p5, wm(i4, -10497));
        let t3 = wa(p5, wm(i3, 867));
        let t2 = wa(p5, wm(i4, -5945));
        let t1 = wa(p2, wm(i3, -1597));
        let t0 = wa(p1, wm(i4, -8034));
        in_vector[ptr] = wa(x0, t3) >> 10;
        in_vector[ptr + 8] = wa(x1, t2) >> 10;
        in_vector[ptr + 16] = wa(x2, t1) >> 10;
        in_vector[ptr + 24] = wa(x3, t0) >> 10;
        in_vector[ptr + 32] = ws(x3, t0) >> 10;
        in_vector[ptr + 40] = ws(x2, t1) >> 10;
        in_vector[ptr + 48] = ws(x1, t2) >> 10;
        in_vector[ptr + 56] = ws(x0, t3) >> 10;
    }
    // Pass 2: all 8 rows, reading only coefficients 0..4 of each.
    let mut pos = 0;
    for i in (0..64).step_by(8) {
        // Even part.
        let i2 = in_vector[i + 2];
        let i0 = in_vector[i];
        let t0 = wa(fsh(i0), SCALE_BITS);
        let t2 = wm(i2, 2217);
        let t3 = wm(i2, 5352);
        let x0 = wa(t0, t3);
        let x3 = ws(t0, t3);
        let x1 = wa(t0, t2);
        let x2 = ws(t0, t2);
        // Odd part.
        let i3 = in_vector[i + 3];
        let i1 = in_vector[i + 1];
        let p5 = wm(wa(i3, i1), f2f(1.175_875_6));
        let p1 = wa(p5, wm(i1, -3685));
        let p2 = wa(p5, wm(i3, -10497));
        let t3 = wa(p5, wm(i1, 867));
        let t2 = wa(p5, wm(i3, -5945));
        let t1 = wa(p2, wm(i1, -1597));
        let t0 = wa(p1, wm(i3, -8034));
        out_vector[pos] = clamp(wa(x0, t3) >> 17);
        out_vector[pos + 1] = clamp(wa(x1, t2) >> 17);
        out_vector[pos + 2] = clamp(wa(x2, t1) >> 17);
        out_vector[pos + 3] = clamp(wa(x3, t0) >> 17);
        out_vector[pos + 4] = clamp(ws(x3, t0) >> 17);
        out_vector[pos + 5] = clamp(ws(x2, t1) >> 17);
        out_vector[pos + 6] = clamp(ws(x1, t2) >> 17);
        out_vector[pos + 7] = clamp(ws(x0, t3) >> 17);
        pos += stride;
    }
    // Re-zero the slots in rows 4..7 (columns 0..4) that pass 1 wrote into.
    // NOTE(review): rows 0..3 are left modified — presumably callers only
    // rely on the lower rows staying zero; confirm against call sites.
    in_vector[32..36].fill(0);
    in_vector[40..44].fill(0);
    in_vector[48..52].fill(0);
    in_vector[56..60].fill(0);
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod avx2 {
use super::*;
use archmage::{arcane, rite};
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
use core::arch::x86_64::*;
/// Build an x86 shuffle-control immediate from four 2-bit lane selectors:
/// `w` fills bits 0..2, `x` bits 2..4, `y` bits 4..6, `z` bits 6..8.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    w | (x << 2) | (y << 4) | (z << 6)
}
/// Clamp every signed 16-bit lane of `reg` to the pixel range 0..=255
/// (max against 0, then min against 255).
#[rite]
fn clamp_avx(_token: archmage::X64V3Token, reg: __m256i) -> __m256i {
    let min_s = _mm256_set1_epi16(0);
    let max_s = _mm256_set1_epi16(255);
    let max_v = _mm256_max_epi16(reg, min_s);
    _mm256_min_epi16(max_v, max_s)
}
/// In-place 8x8 transpose of i32 lanes spread across eight `__m256i`
/// registers, via three interleave stages (32-bit unpack, 64-bit unpack,
/// 128-bit lane recombination). The `permute4x64` before each unpack stage
/// arranges the 64-bit chunks so the in-lane unpacks produce contiguous
/// transposed data.
#[rite]
fn transpose_8x8_i32(
    _token: archmage::X64V3Token,
    v0: &mut __m256i,
    v1: &mut __m256i,
    v2: &mut __m256i,
    v3: &mut __m256i,
    v4: &mut __m256i,
    v5: &mut __m256i,
    v6: &mut __m256i,
    v7: &mut __m256i,
) {
    // Stage 1: interleave 32-bit elements of adjacent row pairs.
    let va0 = _mm256_permute4x64_epi64(*v0, shuffle(3, 1, 2, 0));
    let vb0 = _mm256_permute4x64_epi64(*v1, shuffle(3, 1, 2, 0));
    let w0 = _mm256_unpacklo_epi32(va0, vb0);
    let w1 = _mm256_unpackhi_epi32(va0, vb0);
    let va2 = _mm256_permute4x64_epi64(*v2, shuffle(3, 1, 2, 0));
    let vb2 = _mm256_permute4x64_epi64(*v3, shuffle(3, 1, 2, 0));
    let w2 = _mm256_unpacklo_epi32(va2, vb2);
    let w3 = _mm256_unpackhi_epi32(va2, vb2);
    let va4 = _mm256_permute4x64_epi64(*v4, shuffle(3, 1, 2, 0));
    let vb4 = _mm256_permute4x64_epi64(*v5, shuffle(3, 1, 2, 0));
    let w4 = _mm256_unpacklo_epi32(va4, vb4);
    let w5 = _mm256_unpackhi_epi32(va4, vb4);
    let va6 = _mm256_permute4x64_epi64(*v6, shuffle(3, 1, 2, 0));
    let vb6 = _mm256_permute4x64_epi64(*v7, shuffle(3, 1, 2, 0));
    let w6 = _mm256_unpacklo_epi32(va6, vb6);
    let w7 = _mm256_unpackhi_epi32(va6, vb6);
    // Stage 2: interleave 64-bit pairs of the stage-1 results.
    let xa0 = _mm256_permute4x64_epi64(w0, shuffle(3, 1, 2, 0));
    let xb0 = _mm256_permute4x64_epi64(w2, shuffle(3, 1, 2, 0));
    let x0 = _mm256_unpacklo_epi64(xa0, xb0);
    let x1 = _mm256_unpackhi_epi64(xa0, xb0);
    let xa1 = _mm256_permute4x64_epi64(w1, shuffle(3, 1, 2, 0));
    let xb1 = _mm256_permute4x64_epi64(w3, shuffle(3, 1, 2, 0));
    let x2 = _mm256_unpacklo_epi64(xa1, xb1);
    let x3 = _mm256_unpackhi_epi64(xa1, xb1);
    let xa4 = _mm256_permute4x64_epi64(w4, shuffle(3, 1, 2, 0));
    let xb4 = _mm256_permute4x64_epi64(w6, shuffle(3, 1, 2, 0));
    let x4 = _mm256_unpacklo_epi64(xa4, xb4);
    let x5 = _mm256_unpackhi_epi64(xa4, xb4);
    let xa5 = _mm256_permute4x64_epi64(w5, shuffle(3, 1, 2, 0));
    let xb5 = _mm256_permute4x64_epi64(w7, shuffle(3, 1, 2, 0));
    let x6 = _mm256_unpacklo_epi64(xa5, xb5);
    let x7 = _mm256_unpackhi_epi64(xa5, xb5);
    // Stage 3: combine low/high 128-bit halves into the final rows.
    *v0 = _mm256_permute2x128_si256(x0, x4, shuffle(0, 2, 0, 0));
    *v1 = _mm256_permute2x128_si256(x0, x4, shuffle(0, 3, 0, 1));
    *v2 = _mm256_permute2x128_si256(x1, x5, shuffle(0, 2, 0, 0));
    *v3 = _mm256_permute2x128_si256(x1, x5, shuffle(0, 3, 0, 1));
    *v4 = _mm256_permute2x128_si256(x2, x6, shuffle(0, 2, 0, 0));
    *v5 = _mm256_permute2x128_si256(x2, x6, shuffle(0, 3, 0, 1));
    *v6 = _mm256_permute2x128_si256(x3, x7, shuffle(0, 2, 0, 0));
    *v7 = _mm256_permute2x128_si256(x3, x7, shuffle(0, 3, 0, 1));
}
#[arcane]
/// AVX2 8x8 integer IDCT: each `__m256i` holds one row of eight i32
/// coefficients, so a butterfly pass transforms all eight columns at once;
/// a transpose between passes swaps rows and columns. Constants and scaling
/// match the scalar `idct_int` path; output pixels are clamped to 0..=255
/// and stored one 8-sample row every `stride` elements.
///
/// # Panics
/// Panics if `out_vector.len() < stride * 7 + 8`.
#[allow(unused_assignments)] pub fn idct_int_avx2(
    _token: archmage::X64V3Token,
    in_vector: &mut [i32; 64],
    out_vector: &mut [i16],
    stride: usize,
) {
    assert!(out_vector.len() >= stride * 7 + 8);
    // Load the eight coefficient rows.
    let mut row0 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[0..8]).unwrap());
    let mut row1 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[8..16]).unwrap());
    let mut row2 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[16..24]).unwrap());
    let mut row3 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[24..32]).unwrap());
    let mut row4 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[32..40]).unwrap());
    let mut row5 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[40..48]).unwrap());
    let mut row6 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[48..56]).unwrap());
    let mut row7 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[56..64]).unwrap());
    // Coefficients 1..=8: OR-ing this with rows 1..=7 covers every AC term.
    let ac_check =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[1..9]).unwrap());
    let mut bitmap = _mm256_or_si256(row1, row2);
    bitmap = _mm256_or_si256(bitmap, row3);
    bitmap = _mm256_or_si256(bitmap, row4);
    bitmap = _mm256_or_si256(bitmap, row5);
    bitmap = _mm256_or_si256(bitmap, row6);
    bitmap = _mm256_or_si256(bitmap, row7);
    bitmap = _mm256_or_si256(bitmap, ac_check);
    // DC-only fast path: every output pixel is the same clamped value.
    if _mm256_testz_si256(bitmap, bitmap) == 1 {
        let coeff = ((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16;
        let idct_value = _mm_set1_epi16(coeff);
        let mut pos = 0;
        for _ in 0..8 {
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                idct_value,
            );
            pos += stride;
        }
        return;
    }
    // Broadcast the fixed-point constants (same literals as the scalar path).
    let c2217 = _mm256_set1_epi32(2217);
    let c3135 = _mm256_set1_epi32(3135);
    let cn7567 = _mm256_set1_epi32(-7567);
    let c4816 = _mm256_set1_epi32(4816);
    let c1223 = _mm256_set1_epi32(1223);
    let c8410 = _mm256_set1_epi32(8410);
    let c12586 = _mm256_set1_epi32(12586);
    let c6149 = _mm256_set1_epi32(6149);
    let cn3685 = _mm256_set1_epi32(-3685);
    let cn10497 = _mm256_set1_epi32(-10497);
    let cn8034 = _mm256_set1_epi32(-8034);
    let cn1597 = _mm256_set1_epi32(-1597);
    let c512 = _mm256_set1_epi32(512);
    let cscale = _mm256_set1_epi32(SCALE_BITS);
    // One IDCT butterfly pass over the 8 vectors; `$scale_bits` is the
    // pre-shift rounding/bias vector and `$shift` the descale amount.
    macro_rules! dct_pass {
        ($scale_bits:expr, $shift:expr) => {
            // Even part (lanes of rows 0, 2, 4, 6).
            let p1 = _mm256_mullo_epi32(_mm256_add_epi32(row2, row6), c2217);
            let t2 = _mm256_add_epi32(p1, _mm256_mullo_epi32(row6, cn7567));
            let t3 = _mm256_add_epi32(p1, _mm256_mullo_epi32(row2, c3135));
            let t0 = _mm256_slli_epi32(_mm256_add_epi32(row0, row4), 12);
            let t1 = _mm256_slli_epi32(_mm256_sub_epi32(row0, row4), 12);
            let x0 = _mm256_add_epi32(_mm256_add_epi32(t0, t3), $scale_bits);
            let x3 = _mm256_add_epi32(_mm256_sub_epi32(t0, t3), $scale_bits);
            let x1 = _mm256_add_epi32(_mm256_add_epi32(t1, t2), $scale_bits);
            let x2 = _mm256_add_epi32(_mm256_sub_epi32(t1, t2), $scale_bits);
            // Odd part (lanes of rows 1, 3, 5, 7).
            let p3 = _mm256_add_epi32(row7, row3);
            let p4 = _mm256_add_epi32(row5, row1);
            let p1 = _mm256_add_epi32(row7, row1);
            let p2 = _mm256_add_epi32(row5, row3);
            let p5 = _mm256_mullo_epi32(_mm256_add_epi32(p3, p4), c4816);
            let mut t0 = _mm256_mullo_epi32(row7, c1223);
            let mut t1 = _mm256_mullo_epi32(row5, c8410);
            let mut t2 = _mm256_mullo_epi32(row3, c12586);
            let mut t3 = _mm256_mullo_epi32(row1, c6149);
            let p1 = _mm256_add_epi32(p5, _mm256_mullo_epi32(p1, cn3685));
            let p2 = _mm256_add_epi32(p5, _mm256_mullo_epi32(p2, cn10497));
            let p3 = _mm256_mullo_epi32(p3, cn8034);
            let p4 = _mm256_mullo_epi32(p4, cn1597);
            t3 = _mm256_add_epi32(t3, _mm256_add_epi32(p1, p4));
            t2 = _mm256_add_epi32(t2, _mm256_add_epi32(p2, p3));
            t1 = _mm256_add_epi32(t1, _mm256_add_epi32(p2, p4));
            t0 = _mm256_add_epi32(t0, _mm256_add_epi32(p1, p3));
            // Butterfly outputs with arithmetic descale.
            row0 = _mm256_srai_epi32(_mm256_add_epi32(x0, t3), $shift);
            row1 = _mm256_srai_epi32(_mm256_add_epi32(x1, t2), $shift);
            row2 = _mm256_srai_epi32(_mm256_add_epi32(x2, t1), $shift);
            row3 = _mm256_srai_epi32(_mm256_add_epi32(x3, t0), $shift);
            row4 = _mm256_srai_epi32(_mm256_sub_epi32(x3, t0), $shift);
            row5 = _mm256_srai_epi32(_mm256_sub_epi32(x2, t1), $shift);
            row6 = _mm256_srai_epi32(_mm256_sub_epi32(x1, t2), $shift);
            row7 = _mm256_srai_epi32(_mm256_sub_epi32(x0, t3), $shift);
        };
    }
    // Column pass, transpose, row pass, transpose back to row order.
    dct_pass!(c512, 10);
    transpose_8x8_i32(
        _token, &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6,
        &mut row7,
    );
    dct_pass!(cscale, 17);
    transpose_8x8_i32(
        _token, &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6,
        &mut row7,
    );
    let mut pos = 0;
    // Pack two i32 rows into saturated i16 lanes, clamp to 0..=255, undo the
    // 128-bit lane interleave introduced by `packs`, then store both rows.
    macro_rules! pack_store {
        ($r0:expr, $r1:expr) => {
            let packed = _mm256_packs_epi32($r0, $r1);
            let clamped = clamp_avx(_token, packed);
            let reordered = _mm256_permute4x64_epi64(clamped, shuffle(3, 1, 2, 0));
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                _mm256_extracti128_si256::<0>(reordered),
            );
            pos += stride;
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                _mm256_extracti128_si256::<1>(reordered),
            );
            pos += stride;
        };
    }
    pack_store!(row0, row1);
    pack_store!(row2, row3);
    pack_store!(row4, row5);
    pack_store!(row6, row7);
    let _ = pos;
}
#[arcane]
/// Variant of [`idct_int_avx2`] that skips the 0..=255 clamp: values are
/// still narrowed with signed saturation by `_mm256_packs_epi32`, but are
/// not forced into the pixel range (the DC fast path likewise truncates
/// with `as i16` instead of clamping). Otherwise identical to the clamped
/// version — see that function for the algorithm commentary.
///
/// # Panics
/// Panics if `out_vector.len() < stride * 7 + 8`.
#[allow(unused_assignments)]
pub fn idct_int_avx2_unclamped(
    _token: archmage::X64V3Token,
    in_vector: &mut [i32; 64],
    out_vector: &mut [i16],
    stride: usize,
) {
    assert!(out_vector.len() >= stride * 7 + 8);
    // Load the eight coefficient rows.
    let mut row0 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[0..8]).unwrap());
    let mut row1 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[8..16]).unwrap());
    let mut row2 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[16..24]).unwrap());
    let mut row3 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[24..32]).unwrap());
    let mut row4 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[32..40]).unwrap());
    let mut row5 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[40..48]).unwrap());
    let mut row6 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[48..56]).unwrap());
    let mut row7 =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[56..64]).unwrap());
    // Coefficients 1..=8: OR-ing this with rows 1..=7 covers every AC term.
    let ac_check =
        safe_simd::_mm256_loadu_si256(<&[i32; 8]>::try_from(&in_vector[1..9]).unwrap());
    let mut bitmap = _mm256_or_si256(row1, row2);
    bitmap = _mm256_or_si256(bitmap, row3);
    bitmap = _mm256_or_si256(bitmap, row4);
    bitmap = _mm256_or_si256(bitmap, row5);
    bitmap = _mm256_or_si256(bitmap, row6);
    bitmap = _mm256_or_si256(bitmap, row7);
    bitmap = _mm256_or_si256(bitmap, ac_check);
    // DC-only fast path: constant (truncated, unclamped) output value.
    if _mm256_testz_si256(bitmap, bitmap) == 1 {
        let coeff = ((in_vector[0] + 4 + 1024) >> 3) as i16;
        let idct_value = _mm_set1_epi16(coeff);
        let mut pos = 0;
        for _ in 0..8 {
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                idct_value,
            );
            pos += stride;
        }
        return;
    }
    // Broadcast the fixed-point constants (same literals as the scalar path).
    let c2217 = _mm256_set1_epi32(2217);
    let c3135 = _mm256_set1_epi32(3135);
    let cn7567 = _mm256_set1_epi32(-7567);
    let c4816 = _mm256_set1_epi32(4816);
    let c1223 = _mm256_set1_epi32(1223);
    let c8410 = _mm256_set1_epi32(8410);
    let c12586 = _mm256_set1_epi32(12586);
    let c6149 = _mm256_set1_epi32(6149);
    let cn3685 = _mm256_set1_epi32(-3685);
    let cn10497 = _mm256_set1_epi32(-10497);
    let cn8034 = _mm256_set1_epi32(-8034);
    let cn1597 = _mm256_set1_epi32(-1597);
    let c512 = _mm256_set1_epi32(512);
    let cscale = _mm256_set1_epi32(SCALE_BITS);
    // One IDCT butterfly pass; `$scale_bits` is the rounding/bias vector and
    // `$shift` the descale amount (10 for pass 1, 17 for pass 2).
    macro_rules! dct_pass {
        ($scale_bits:expr, $shift:expr) => {
            // Even part.
            let p1 = _mm256_mullo_epi32(_mm256_add_epi32(row2, row6), c2217);
            let t2 = _mm256_add_epi32(p1, _mm256_mullo_epi32(row6, cn7567));
            let t3 = _mm256_add_epi32(p1, _mm256_mullo_epi32(row2, c3135));
            let t0 = _mm256_slli_epi32(_mm256_add_epi32(row0, row4), 12);
            let t1 = _mm256_slli_epi32(_mm256_sub_epi32(row0, row4), 12);
            let x0 = _mm256_add_epi32(_mm256_add_epi32(t0, t3), $scale_bits);
            let x3 = _mm256_add_epi32(_mm256_sub_epi32(t0, t3), $scale_bits);
            let x1 = _mm256_add_epi32(_mm256_add_epi32(t1, t2), $scale_bits);
            let x2 = _mm256_add_epi32(_mm256_sub_epi32(t1, t2), $scale_bits);
            // Odd part.
            let p3 = _mm256_add_epi32(row7, row3);
            let p4 = _mm256_add_epi32(row5, row1);
            let p1 = _mm256_add_epi32(row7, row1);
            let p2 = _mm256_add_epi32(row5, row3);
            let p5 = _mm256_mullo_epi32(_mm256_add_epi32(p3, p4), c4816);
            let mut t0 = _mm256_mullo_epi32(row7, c1223);
            let mut t1 = _mm256_mullo_epi32(row5, c8410);
            let mut t2 = _mm256_mullo_epi32(row3, c12586);
            let mut t3 = _mm256_mullo_epi32(row1, c6149);
            let p1 = _mm256_add_epi32(p5, _mm256_mullo_epi32(p1, cn3685));
            let p2 = _mm256_add_epi32(p5, _mm256_mullo_epi32(p2, cn10497));
            let p3 = _mm256_mullo_epi32(p3, cn8034);
            let p4 = _mm256_mullo_epi32(p4, cn1597);
            t3 = _mm256_add_epi32(t3, _mm256_add_epi32(p1, p4));
            t2 = _mm256_add_epi32(t2, _mm256_add_epi32(p2, p3));
            t1 = _mm256_add_epi32(t1, _mm256_add_epi32(p2, p4));
            t0 = _mm256_add_epi32(t0, _mm256_add_epi32(p1, p3));
            // Butterfly outputs with arithmetic descale.
            row0 = _mm256_srai_epi32(_mm256_add_epi32(x0, t3), $shift);
            row1 = _mm256_srai_epi32(_mm256_add_epi32(x1, t2), $shift);
            row2 = _mm256_srai_epi32(_mm256_add_epi32(x2, t1), $shift);
            row3 = _mm256_srai_epi32(_mm256_add_epi32(x3, t0), $shift);
            row4 = _mm256_srai_epi32(_mm256_sub_epi32(x3, t0), $shift);
            row5 = _mm256_srai_epi32(_mm256_sub_epi32(x2, t1), $shift);
            row6 = _mm256_srai_epi32(_mm256_sub_epi32(x1, t2), $shift);
            row7 = _mm256_srai_epi32(_mm256_sub_epi32(x0, t3), $shift);
        };
    }
    // Column pass, transpose, row pass, transpose back to row order.
    dct_pass!(c512, 10);
    transpose_8x8_i32(
        _token, &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6,
        &mut row7,
    );
    dct_pass!(cscale, 17);
    transpose_8x8_i32(
        _token, &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6,
        &mut row7,
    );
    let mut pos = 0;
    // Pack to i16 with signed saturation (no 0..=255 clamp), undo the lane
    // interleave introduced by `packs`, then store the two 8-sample rows.
    macro_rules! pack_store_unclamped {
        ($r0:expr, $r1:expr) => {
            let packed = _mm256_packs_epi32($r0, $r1);
            let reordered = _mm256_permute4x64_epi64(packed, shuffle(3, 1, 2, 0));
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                _mm256_extracti128_si256::<0>(reordered),
            );
            pos += stride;
            safe_simd::_mm_storeu_si128(
                <&mut [i16; 8]>::try_from(&mut out_vector[pos..pos + 8]).unwrap(),
                _mm256_extracti128_si256::<1>(reordered),
            );
            pos += stride;
        };
    }
    pack_store_unclamped!(row0, row1);
    pack_store_unclamped!(row2, row3);
    pack_store_unclamped!(row4, row5);
    pack_store_unclamped!(row6, row7);
    let _ = pos;
}
}
mod wide_simd {
use super::SCALE_BITS;
use archmage::prelude::*;
use magetypes::simd::generic::i32x8 as GenericI32x8;
// Fixed-point butterfly constants (12 fractional bits). These mirror the
// literal values used by the scalar `idct_int` path; e.g. C4816 equals
// `f2f(1.175_875_6)` as used in that function's row pass.
const C2217: i32 = 2217;
const C3135: i32 = 3135;
const CN7567: i32 = -7567;
const C4816: i32 = 4816;
const C1223: i32 = 1223;
const C8410: i32 = 8410;
const C12586: i32 = 12586;
const C6149: i32 = 6149;
const CN3685: i32 = -3685;
const CN10497: i32 = -10497;
const CN8034: i32 = -8034;
const CN1597: i32 = -1597;
/// Portable 8x8 integer IDCT: dispatches to the best SIMD backend
/// (AVX2 / NEON / WASM128 / scalar) via the `incant!` macro.
pub fn idct_int_wide(in_vector: &[i32; 64], out_vector: &mut [i16], stride: usize) {
    incant!(idct_int_wide_impl(in_vector, out_vector, stride));
}
/// Backend-generic body of [`idct_int_wide`], monomorphized per target by
/// the `#[magetypes(...)]` expansion. Each vector holds one coefficient
/// row, so the first pass transforms all eight columns lane-wise; the
/// transposes swap rows/columns between passes.
#[magetypes(v3, neon, wasm128, scalar)]
fn idct_int_wide_impl(
    token: Token,
    in_vector: &[i32; 64],
    out_vector: &mut [i16],
    stride: usize,
) {
    #[allow(non_camel_case_types)]
    type i32x8 = GenericI32x8<Token>;
    // Load the eight coefficient rows into backend vectors.
    let mut rows: [i32x8; 8] = core::array::from_fn(|i| {
        i32x8::from_array(
            token,
            *<&[i32; 8]>::try_from(&in_vector[i * 8..(i + 1) * 8]).unwrap(),
        )
    });
    // Column pass (+512, >> 10), transpose, row pass (SCALE_BITS, >> 17),
    // transpose back to row order.
    idct_pass_generic(token, &mut rows, i32x8::splat(token, 512), 10);
    transpose_i32x8(token, &mut rows);
    idct_pass_generic(token, &mut rows, i32x8::splat(token, SCALE_BITS), 17);
    transpose_i32x8(token, &mut rows);
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out = &mut out_vector[..min_len];
    let mut out_pos = 0;
    // SCALE_BITS already applied the +128 level shift; just clamp to 0..=255.
    for row in &rows {
        let arr = row.to_array();
        for (j, &val) in arr.iter().enumerate() {
            out[out_pos + j] = val.clamp(0, 255) as i16;
        }
        out_pos += stride;
    }
}
/// Transpose an 8x8 matrix held as eight 8-lane vectors by round-tripping
/// through scalar arrays (portable across all backends; no shuffle
/// intrinsics required).
#[inline(always)]
pub(super) fn transpose_i32x8<T: magetypes::simd::backends::I32x8Backend>(
    token: T,
    rows: &mut [GenericI32x8<T>; 8],
) {
    let r: [[i32; 8]; 8] = core::array::from_fn(|i| rows[i].to_array());
    for i in 0..8 {
        rows[i] = GenericI32x8::<T>::from_array(token, core::array::from_fn(|j| r[j][i]));
    }
}
/// One 1-D IDCT butterfly pass over 8 vectors (each lane is one column of
/// the 8x8 block). `scale_bits` is the pre-shift rounding/bias term (512
/// for the first pass, `SCALE_BITS` for the second) and `shift` the
/// matching descale amount. The `match` on `shift` exists because the
/// backend's arithmetic-shift API takes the amount as a const generic.
#[inline(always)]
pub(super) fn idct_pass_generic<T: magetypes::simd::backends::I32x8Backend>(
    token: T,
    rows: &mut [GenericI32x8<T>; 8],
    scale_bits: GenericI32x8<T>,
    shift: i32,
) {
    #[allow(non_camel_case_types)]
    type i32x8<U> = GenericI32x8<U>;
    // Even part (rows 0, 2, 4, 6).
    let p1 = (rows[2] + rows[6]) * i32x8::splat(token, C2217);
    let t2 = p1 + rows[6] * i32x8::splat(token, CN7567);
    let t3 = p1 + rows[2] * i32x8::splat(token, C3135);
    let t0 = (rows[0] + rows[4]).shl_const::<12>();
    let t1 = (rows[0] - rows[4]).shl_const::<12>();
    let x0 = t0 + t3 + scale_bits;
    let x3 = t0 - t3 + scale_bits;
    let x1 = t1 + t2 + scale_bits;
    let x2 = t1 - t2 + scale_bits;
    // Odd part (rows 1, 3, 5, 7).
    let p3 = rows[7] + rows[3];
    let p4 = rows[5] + rows[1];
    let p1_odd = rows[7] + rows[1];
    let p2_odd = rows[5] + rows[3];
    let p5 = (p3 + p4) * i32x8::splat(token, C4816);
    let mut t0 = rows[7] * i32x8::splat(token, C1223);
    let mut t1 = rows[5] * i32x8::splat(token, C8410);
    let mut t2 = rows[3] * i32x8::splat(token, C12586);
    let mut t3 = rows[1] * i32x8::splat(token, C6149);
    let p1_final = p5 + p1_odd * i32x8::splat(token, CN3685);
    let p2_final = p5 + p2_odd * i32x8::splat(token, CN10497);
    let p3_final = p3 * i32x8::splat(token, CN8034);
    let p4_final = p4 * i32x8::splat(token, CN1597);
    t3 = t3 + p1_final + p4_final;
    t2 = t2 + p2_final + p3_final;
    t1 = t1 + p2_final + p4_final;
    t0 = t0 + p1_final + p3_final;
    // Butterfly outputs with arithmetic descale (const-generic shift amount).
    match shift {
        10 => {
            rows[0] = (x0 + t3).shr_arithmetic_const::<10>();
            rows[1] = (x1 + t2).shr_arithmetic_const::<10>();
            rows[2] = (x2 + t1).shr_arithmetic_const::<10>();
            rows[3] = (x3 + t0).shr_arithmetic_const::<10>();
            rows[4] = (x3 - t0).shr_arithmetic_const::<10>();
            rows[5] = (x2 - t1).shr_arithmetic_const::<10>();
            rows[6] = (x1 - t2).shr_arithmetic_const::<10>();
            rows[7] = (x0 - t3).shr_arithmetic_const::<10>();
        }
        17 => {
            rows[0] = (x0 + t3).shr_arithmetic_const::<17>();
            rows[1] = (x1 + t2).shr_arithmetic_const::<17>();
            rows[2] = (x2 + t1).shr_arithmetic_const::<17>();
            rows[3] = (x3 + t0).shr_arithmetic_const::<17>();
            rows[4] = (x3 - t0).shr_arithmetic_const::<17>();
            rows[5] = (x2 - t1).shr_arithmetic_const::<17>();
            rows[6] = (x1 - t2).shr_arithmetic_const::<17>();
            rows[7] = (x0 - t3).shr_arithmetic_const::<17>();
        }
        _ => unreachable!("idct_pass_generic only supports shift=10 or shift=17"),
    }
}
}
/// Decode one 8x8 block with the fastest available integer IDCT: AVX2 when
/// an `X64V3Token` can be summoned at runtime (x86-64 only), otherwise the
/// portable wide-SIMD/scalar implementation.
#[inline]
pub fn idct_int_auto(coeffs: &mut [i32; 64], output: &mut [i16], stride: usize) {
    #[cfg(target_arch = "x86_64")]
    {
        if let Some(token) = archmage::X64V3Token::summon() {
            avx2::idct_int_avx2(token, coeffs, output, stride);
            return;
        }
    }
    wide_simd::idct_int_wide(coeffs, output, stride);
}
#[cfg(target_arch = "x86_64")]
/// Direct entry point to the AVX2 IDCT for callers that already hold a
/// capability token (skips the runtime feature probe in `idct_int_auto`).
#[inline]
pub fn idct_int_avx2_raw(
    token: archmage::X64V3Token,
    coeffs: &mut [i32; 64],
    output: &mut [i16],
    stride: usize,
) {
    avx2::idct_int_avx2(token, coeffs, output, stride);
}
/// IDCT dispatch keyed on the decoder's nonzero-coefficient count:
/// `coeff_count <= 1` takes the constant-block (DC-only) fast path,
/// otherwise the full transform runs on the best available backend
/// (AVX2 when a token can be summoned, else the portable path).
#[inline]
pub fn idct_int_tiered(coeffs: &mut [i32; 64], output: &mut [i16], stride: usize, coeff_count: u8) {
    if coeff_count <= 1 {
        idct_int_dc_only(coeffs[0], output, stride);
    } else {
        #[cfg(target_arch = "x86_64")]
        {
            if let Some(token) = archmage::X64V3Token::summon() {
                avx2::idct_int_avx2(token, coeffs, output, stride);
                return;
            }
        }
        wide_simd::idct_int_wide(coeffs, output, stride);
    }
}
/// Tiered libjpeg-flavored IDCT: takes the constant-block fast path when
/// the decoder reports at most one (DC) coefficient, otherwise runs the
/// full `idct_int_libjpeg` transform.
///
/// # Panics
/// Panics if `output.len() < stride * 7 + 8`.
#[inline] // consistency: the sibling `idct_int_tiered` dispatcher is also #[inline]
pub fn idct_int_tiered_libjpeg(
    coeffs: &mut [i32; 64],
    output: &mut [i16],
    stride: usize,
    coeff_count: u8,
) {
    if coeff_count <= 1 {
        // Only the DC term is present: the whole block is one value.
        idct_int_dc_only(coeffs[0], output, stride);
    } else {
        idct_int_libjpeg(coeffs, output, stride);
    }
}
/// Widen a block of i32 DCT coefficients to f32, element-wise.
#[inline]
pub fn coeffs_i32_to_f32(coeffs: &[i32; 64]) -> [f32; 64] {
    core::array::from_fn(|i| coeffs[i] as f32)
}
/// Convert i16 pixel samples to f32 centered around zero
/// (subtracts the 128 level shift from each sample).
#[inline]
pub fn pixels_i16_to_f32_centered(pixels: &[i16; 64]) -> [f32; 64] {
    core::array::from_fn(|i| f32::from(pixels[i]) - 128.0)
}
/// Variant of the DC-only fast path that skips the 0..=255 clamp: the
/// biased, descaled DC value is truncated straight to `i16` and broadcast
/// to all 64 output samples (eight 8-sample rows, `stride` apart).
///
/// # Panics
/// Panics if `out_vector.len() < stride * 7 + 8`.
#[inline]
pub fn idct_int_dc_only_unclamped(dc_coeff: i32, out_vector: &mut [i16], stride: usize) {
    // +4 rounds the >> 3 descale; +1024 bakes in the +128 level shift.
    let coeff = dc_coeff.wrapping_add(4).wrapping_add(1024).wrapping_shr(3) as i16;
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out = &mut out_vector[..min_len];
    let mut off = 0;
    for _ in 0..8 {
        out[off..off + 8].fill(coeff);
        off += stride;
    }
}
/// Portable 8x8 integer IDCT without the 0..=255 output clamp; dispatches
/// to the best SIMD backend via `incant!`.
fn idct_int_wide_unclamped(in_vector: &[i32; 64], out_vector: &mut [i16], stride: usize) {
    incant!(idct_int_wide_unclamped_impl(in_vector, out_vector, stride));
}
/// Backend-generic body of `idct_int_wide_unclamped`: identical to
/// `wide_simd::idct_int_wide_impl` except the final store truncates each
/// value with `as i16` instead of clamping to 0..=255.
#[magetypes(v3, neon, wasm128, scalar)]
fn idct_int_wide_unclamped_impl(
    token: Token,
    in_vector: &[i32; 64],
    out_vector: &mut [i16],
    stride: usize,
) {
    #[allow(non_camel_case_types)]
    type i32x8 = GenericI32x8<Token>;
    // Load the eight coefficient rows into backend vectors.
    let mut rows: [i32x8; 8] = core::array::from_fn(|i| {
        i32x8::from_array(
            token,
            *<&[i32; 8]>::try_from(&in_vector[i * 8..(i + 1) * 8]).unwrap(),
        )
    });
    // Column pass, transpose, row pass, transpose back to row order.
    wide_simd::idct_pass_generic(token, &mut rows, i32x8::splat(token, 512), 10);
    wide_simd::transpose_i32x8(token, &mut rows);
    wide_simd::idct_pass_generic(token, &mut rows, i32x8::splat(token, SCALE_BITS), 17);
    wide_simd::transpose_i32x8(token, &mut rows);
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out = &mut out_vector[..min_len];
    let mut out_pos = 0;
    // Truncating cast — out-of-range values wrap rather than saturate.
    for row in &rows {
        let arr = row.to_array();
        for (j, &val) in arr.iter().enumerate() {
            out[out_pos + j] = val as i16;
        }
        out_pos += stride;
    }
}
/// Fixed-point inverse DCT mirroring libjpeg's `jpeg_idct_islow` constants
/// (the `LJ_FIX_*` values), producing level-shifted (+128) but deliberately
/// UNCLAMPED `i16` samples.
///
/// Pass 1 transforms each column into a 64-entry `i64` workspace, descaled by
/// `CONST_BITS - PASS1_BITS`; pass 2 transforms each workspace row into
/// `out_vector`, descaled by `CONST_BITS + PASS1_BITS + 3`. A column/row whose
/// seven AC terms are all zero takes a DC-broadcast shortcut. `in_vector` is
/// never written; the `&mut` only mirrors the clamped variant's signature.
pub fn idct_int_libjpeg_unclamped(
    in_vector: &mut [i32; 64],
    out_vector: &mut [i16],
    stride: usize,
) {
    // Output rows are 8 samples wide and `stride` apart.
    let min_len = stride * 7 + 8;
    assert!(out_vector.len() >= min_len);
    let out_vector = &mut out_vector[..min_len];
    // i64 intermediates give plenty of headroom for large coefficients.
    let mut workspace = [0i64; 64];
    // ---- Pass 1: columns -> workspace. ----
    for col in 0..8 {
        let base = col;
        // All-zero AC column: DC term broadcasts down the column.
        if in_vector[base + 8] == 0
            && in_vector[base + 16] == 0
            && in_vector[base + 24] == 0
            && in_vector[base + 32] == 0
            && in_vector[base + 40] == 0
            && in_vector[base + 48] == 0
            && in_vector[base + 56] == 0
        {
            let dcval = (in_vector[base] as i64) << LJ_PASS1_BITS;
            for r in 0..8 {
                workspace[r * 8 + col] = dcval;
            }
            continue;
        }
        // Even part: coefficients 0, 2, 4, 6.
        let z2 = in_vector[base + 16] as i64;
        let z3 = in_vector[base + 48] as i64;
        let z1 = (z2 + z3) * LJ_FIX_0_541196100;
        let tmp2 = z1 + z3 * (-LJ_FIX_1_847759065);
        let tmp3 = z1 + z2 * LJ_FIX_0_765366865;
        let z2 = in_vector[base] as i64;
        let z3 = in_vector[base + 32] as i64;
        let tmp0 = (z2 + z3) << LJ_CONST_BITS;
        let tmp1 = (z2 - z3) << LJ_CONST_BITS;
        let tmp10 = tmp0 + tmp3;
        let tmp13 = tmp0 - tmp3;
        let tmp11 = tmp1 + tmp2;
        let tmp12 = tmp1 - tmp2;
        // Odd part: coefficients 1, 3, 5, 7.
        let tmp0 = in_vector[base + 56] as i64;
        let tmp1 = in_vector[base + 40] as i64;
        let tmp2 = in_vector[base + 24] as i64;
        let tmp3 = in_vector[base + 8] as i64;
        let z1 = tmp0 + tmp3;
        let z2 = tmp1 + tmp2;
        let z3 = tmp0 + tmp2;
        let z4 = tmp1 + tmp3;
        let z5 = (z3 + z4) * LJ_FIX_1_175875602;
        let tmp0 = tmp0 * LJ_FIX_0_298631336;
        let tmp1 = tmp1 * LJ_FIX_2_053119869;
        let tmp2 = tmp2 * LJ_FIX_3_072711026;
        let tmp3 = tmp3 * LJ_FIX_1_501321110;
        let z1 = z1 * (-LJ_FIX_0_899976223);
        let z2 = z2 * (-LJ_FIX_2_562915447);
        let z3 = z3 * (-LJ_FIX_1_961570560) + z5;
        let z4 = z4 * (-LJ_FIX_0_390180644) + z5;
        let tmp0 = tmp0 + z1 + z3;
        let tmp1 = tmp1 + z2 + z4;
        let tmp2 = tmp2 + z2 + z3;
        let tmp3 = tmp3 + z1 + z4;
        // Butterfly outputs, descaled to PASS1_BITS extra precision.
        workspace[col] = descale(tmp10 + tmp3, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[7 * 8 + col] = descale(tmp10 - tmp3, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[8 + col] = descale(tmp11 + tmp2, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[6 * 8 + col] = descale(tmp11 - tmp2, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[2 * 8 + col] = descale(tmp12 + tmp1, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[5 * 8 + col] = descale(tmp12 - tmp1, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[3 * 8 + col] = descale(tmp13 + tmp0, LJ_CONST_BITS - LJ_PASS1_BITS);
        workspace[4 * 8 + col] = descale(tmp13 - tmp0, LJ_CONST_BITS - LJ_PASS1_BITS);
    }
    // ---- Pass 2: workspace rows -> output, removing all remaining scaling
    // (the extra +3 accounts for the 1/8 factor of the 8-point transform). ----
    let total_shift = LJ_CONST_BITS + LJ_PASS1_BITS + 3;
    for row in 0..8 {
        let base = row * 8;
        // All-zero AC row: DC term broadcasts across the row.
        if workspace[base + 1] == 0
            && workspace[base + 2] == 0
            && workspace[base + 3] == 0
            && workspace[base + 4] == 0
            && workspace[base + 5] == 0
            && workspace[base + 6] == 0
            && workspace[base + 7] == 0
        {
            // +128 level shift; no [0, 255] clamp in this variant.
            let dcval = (descale(workspace[base], LJ_PASS1_BITS + 3) + 128) as i16;
            let out_base = row * stride;
            out_vector[out_base..out_base + 8].fill(dcval);
            continue;
        }
        // Even part (same structure as pass 1, now on workspace values).
        let z2 = workspace[base + 2];
        let z3 = workspace[base + 6];
        let z1 = (z2 + z3) * LJ_FIX_0_541196100;
        let tmp2 = z1 + z3 * (-LJ_FIX_1_847759065);
        let tmp3 = z1 + z2 * LJ_FIX_0_765366865;
        let z2 = workspace[base];
        let z3 = workspace[base + 4];
        let tmp0 = (z2 + z3) << LJ_CONST_BITS;
        let tmp1 = (z2 - z3) << LJ_CONST_BITS;
        let tmp10 = tmp0 + tmp3;
        let tmp13 = tmp0 - tmp3;
        let tmp11 = tmp1 + tmp2;
        let tmp12 = tmp1 - tmp2;
        // Odd part.
        let tmp0 = workspace[base + 7];
        let tmp1 = workspace[base + 5];
        let tmp2 = workspace[base + 3];
        let tmp3 = workspace[base + 1];
        let z1 = tmp0 + tmp3;
        let z2 = tmp1 + tmp2;
        let z3 = tmp0 + tmp2;
        let z4 = tmp1 + tmp3;
        let z5 = (z3 + z4) * LJ_FIX_1_175875602;
        let tmp0 = tmp0 * LJ_FIX_0_298631336;
        let tmp1 = tmp1 * LJ_FIX_2_053119869;
        let tmp2 = tmp2 * LJ_FIX_3_072711026;
        let tmp3 = tmp3 * LJ_FIX_1_501321110;
        let z1 = z1 * (-LJ_FIX_0_899976223);
        let z2 = z2 * (-LJ_FIX_2_562915447);
        let z3 = z3 * (-LJ_FIX_1_961570560) + z5;
        let z4 = z4 * (-LJ_FIX_0_390180644) + z5;
        let tmp0 = tmp0 + z1 + z3;
        let tmp1 = tmp1 + z2 + z4;
        let tmp2 = tmp2 + z2 + z3;
        let tmp3 = tmp3 + z1 + z4;
        let out_base = row * stride;
        // +128 level shift on every sample; the `as i16` cast truncates
        // instead of clamping (hence "unclamped").
        out_vector[out_base] = (descale(tmp10 + tmp3, total_shift) + 128) as i16;
        out_vector[out_base + 7] = (descale(tmp10 - tmp3, total_shift) + 128) as i16;
        out_vector[out_base + 1] = (descale(tmp11 + tmp2, total_shift) + 128) as i16;
        out_vector[out_base + 6] = (descale(tmp11 - tmp2, total_shift) + 128) as i16;
        out_vector[out_base + 2] = (descale(tmp12 + tmp1, total_shift) + 128) as i16;
        out_vector[out_base + 5] = (descale(tmp12 - tmp1, total_shift) + 128) as i16;
        out_vector[out_base + 3] = (descale(tmp13 + tmp0, total_shift) + 128) as i16;
        out_vector[out_base + 4] = (descale(tmp13 - tmp0, total_shift) + 128) as i16;
    }
}
/// Unclamped IDCT entry point: prefers the AVX2 kernel when the CPU supports
/// it (x86_64 only), otherwise falls back to the generic wide-SIMD path.
pub fn idct_int_auto_unclamped(coeffs: &mut [i32; 64], output: &mut [i16], stride: usize) {
    #[cfg(target_arch = "x86_64")]
    if let Some(token) = archmage::X64V3Token::summon() {
        return avx2::idct_int_avx2_unclamped(token, coeffs, output, stride);
    }
    idct_int_wide_unclamped(coeffs, output, stride);
}
pub fn idct_int_tiered_unclamped(
coeffs: &mut [i32; 64],
output: &mut [i16],
stride: usize,
coeff_count: u8,
) {
if coeff_count <= 1 {
idct_int_dc_only_unclamped(coeffs[0], output, stride);
} else {
#[cfg(target_arch = "x86_64")]
{
if let Some(token) = archmage::X64V3Token::summon() {
avx2::idct_int_avx2_unclamped(token, coeffs, output, stride);
return;
}
}
idct_int_wide_unclamped(coeffs, output, stride);
}
}
/// Unclamped libjpeg-path IDCT with a DC-only fast path, selected by the
/// decoder's non-zero coefficient count.
pub fn idct_int_tiered_libjpeg_unclamped(
    coeffs: &mut [i32; 64],
    output: &mut [i16],
    stride: usize,
    coeff_count: u8,
) {
    match coeff_count {
        // At most the DC term is present: broadcast it.
        0 | 1 => idct_int_dc_only_unclamped(coeffs[0], output, stride),
        // Otherwise run the full fixed-point transform.
        _ => idct_int_libjpeg_unclamped(coeffs, output, stride),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A DC-only block must decode to a perfectly uniform 8x8 output.
    #[test]
    fn test_dc_only() {
        let mut coeffs = [0i32; 64];
        coeffs[0] = 1024;
        let mut output = [0i16; 64];
        idct_int(&mut coeffs, &mut output, 8);
        let first = output[0];
        for &v in &output {
            assert_eq!(v, first, "DC-only should produce uniform output");
        }
    }

    /// `is_dc_only_int` is true iff all 63 AC coefficients are zero.
    /// (Fixes a mojibake `&not_dc_only` reference that did not compile.)
    #[test]
    fn test_is_dc_only_int() {
        let mut dc_only = [0i32; 64];
        dc_only[0] = 100;
        assert!(is_dc_only_int(&dc_only));
        let mut not_dc_only = dc_only;
        not_dc_only[1] = 50;
        assert!(!is_dc_only_int(&not_dc_only));
    }

    /// Clamped IDCT output must stay within [0, 255] for extreme DC values.
    #[test]
    fn test_output_range() {
        for dc in [-2000i32, -500, 0, 500, 2000] {
            let mut coeffs = [0i32; 64];
            coeffs[0] = dc;
            let mut output = [0i16; 64];
            idct_int(&mut coeffs, &mut output, 8);
            for &v in &output {
                assert!((0..=255).contains(&v), "Output {} out of range [0,255]", v);
            }
        }
    }

    /// Smoke test for the auto-dispatched entry point with a few AC terms.
    #[test]
    fn test_idct_int_auto() {
        let mut coeffs = [0i32; 64];
        coeffs[0] = 512;
        coeffs[1] = 100;
        coeffs[8] = -50;
        let mut output = [0i16; 64];
        // `coeffs` is not needed afterwards: pass it directly instead of
        // cloning into a throwaway temporary.
        idct_int_auto(&mut coeffs, &mut output, 8);
        for &v in &output {
            assert!((0..=255).contains(&v));
        }
    }

    /// The AVX2 kernel must agree bit-exactly with the scalar reference
    /// (silently skipped when the CPU lacks AVX2).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx2_matches_scalar() {
        let Some(token) = archmage::X64V3Token::summon() else {
            return;
        };
        let mut coeffs_scalar = [0i32; 64];
        let mut coeffs_avx2 = [0i32; 64];
        for i in 0..64 {
            let v = ((i as i32 * 17 + 31) % 256) - 128;
            coeffs_scalar[i] = v * 8;
            coeffs_avx2[i] = v * 8;
        }
        let mut output_scalar = [0i16; 64];
        let mut output_avx2 = [0i16; 64];
        idct_int(&mut coeffs_scalar, &mut output_scalar, 8);
        avx2::idct_int_avx2(token, &mut coeffs_avx2, &mut output_avx2, 8);
        for i in 0..64 {
            assert_eq!(
                output_scalar[i], output_avx2[i],
                "Mismatch at {}: scalar={}, avx2={}",
                i, output_scalar[i], output_avx2[i]
            );
        }
    }

    /// The portable wide-SIMD kernel must agree bit-exactly with scalar.
    #[test]
    fn test_wide_matches_scalar() {
        let mut coeffs_scalar = [0i32; 64];
        for i in 0..64 {
            let v = ((i as i32 * 17 + 31) % 256) - 128;
            coeffs_scalar[i] = v * 8;
        }
        let coeffs_wide: [i32; 64] = coeffs_scalar;
        let mut output_scalar = [0i16; 64];
        let mut output_wide = [0i16; 64];
        idct_int(&mut coeffs_scalar, &mut output_scalar, 8);
        wide_simd::idct_int_wide(&coeffs_wide, &mut output_wide, 8);
        for i in 0..64 {
            assert_eq!(
                output_scalar[i], output_wide[i],
                "Mismatch at {}: scalar={}, wide={}",
                i, output_scalar[i], output_wide[i]
            );
        }
    }

    /// Rows written at a wider stride must match the stride-8 layout.
    #[test]
    fn test_wide_with_stride() {
        let coeffs: [i32; 64] = core::array::from_fn(|i| {
            let v = ((i as i32 * 17 + 31) % 256) - 128;
            v * 8
        });
        let mut output_stride8 = [0i16; 64];
        let mut output_stride16 = [0i16; 128];
        wide_simd::idct_int_wide(&coeffs, &mut output_stride8, 8);
        wide_simd::idct_int_wide(&coeffs, &mut output_stride16, 16);
        for row in 0..8 {
            for col in 0..8 {
                assert_eq!(
                    output_stride8[row * 8 + col],
                    output_stride16[row * 16 + col],
                    "Stride mismatch at ({}, {})",
                    row,
                    col
                );
            }
        }
    }

    /// DC-only inputs must give in-range, (near-)uniform output blocks.
    #[test]
    fn test_wide_dc_patterns() {
        for dc in [-2000i32, -500, 0, 500, 1000, 2000] {
            let mut coeffs = [0i32; 64];
            coeffs[0] = dc;
            let mut output = [0i16; 64];
            wide_simd::idct_int_wide(&coeffs, &mut output, 8);
            let first = output[0];
            for (i, &v) in output.iter().enumerate() {
                assert!(
                    (0..=255).contains(&v),
                    "DC {} produced out-of-range {} at {}",
                    dc,
                    v,
                    i
                );
                assert!(
                    (v - first).abs() <= 1,
                    "DC {} non-uniform: {} vs {} at {}",
                    dc,
                    first,
                    v,
                    i
                );
            }
        }
    }

    /// Scalar and wide-SIMD paths must agree bit-exactly over many
    /// deterministic pseudo-random coefficient blocks.
    #[test]
    fn test_wide_exhaustive() {
        for seed in 0..100 {
            let coeffs: [i32; 64] = core::array::from_fn(|i| {
                let v = ((i as i32 * 17 + seed * 7 + 31) % 512) - 256;
                v * 4
            });
            let mut coeffs_scalar = coeffs;
            let mut output_scalar = [0i16; 64];
            let mut output_wide = [0i16; 64];
            idct_int(&mut coeffs_scalar, &mut output_scalar, 8);
            wide_simd::idct_int_wide(&coeffs, &mut output_wide, 8);
            for i in 0..64 {
                assert_eq!(
                    output_scalar[i], output_wide[i],
                    "Seed {}: Mismatch at {}: scalar={}, wide={}",
                    seed, i, output_scalar[i], output_wide[i]
                );
            }
        }
    }

    /// Tiered and direct libjpeg paths must agree, and huge coefficients
    /// must still come out clamped to [0, 255].
    #[test]
    fn test_libjpeg_idct_large_coefficients() {
        let mut coeffs = [0i32; 64];
        coeffs[0] = 40000;
        coeffs[1] = -35000;
        let mut output = [0i16; 64];
        idct_int_tiered_libjpeg(&mut coeffs, &mut output, 8, 2);
        let mut coeffs2 = [0i32; 64];
        coeffs2[0] = 40000;
        coeffs2[1] = -35000;
        let mut output2 = [0i16; 64];
        idct_int_libjpeg(&mut coeffs2, &mut output2, 8);
        assert_eq!(
            output, output2,
            "tiered and direct libjpeg IDCT should produce identical results"
        );
        for &v in &output {
            assert!(
                (0..=255).contains(&v),
                "IDCT output {v} out of [0, 255] range"
            );
        }
    }

    /// Textbook O(n^4) double-sum inverse DCT in f64, used as the golden
    /// reference; output is level-shifted (+128) and unclamped.
    fn reference_idct_f64(coeffs: &[i32; 64]) -> [f64; 64] {
        use core::f64::consts::PI;
        let mut output = [0.0f64; 64];
        for y in 0..8 {
            for x in 0..8 {
                let mut sum = 0.0f64;
                for v in 0..8 {
                    for u in 0..8 {
                        // DC basis functions carry a 1/sqrt(2) normalization.
                        let cu = if u == 0 {
                            1.0 / core::f64::consts::SQRT_2
                        } else {
                            1.0
                        };
                        let cv = if v == 0 {
                            1.0 / core::f64::consts::SQRT_2
                        } else {
                            1.0
                        };
                        let cos_x = ((2 * x + 1) as f64 * u as f64 * PI / 16.0).cos();
                        let cos_y = ((2 * y + 1) as f64 * v as f64 * PI / 16.0).cos();
                        sum += cu * cv * coeffs[v * 8 + u] as f64 * cos_x * cos_y;
                    }
                }
                output[y * 8 + x] = sum / 4.0 + 128.0;
            }
        }
        output
    }

    /// Cross-validates every integer IDCT against the f64 reference over
    /// randomized blocks at several coefficient magnitudes.
    #[test]
    fn test_idct_cross_validation_harness() {
        let magnitudes = [512, 2048, 4000, 8000, 16000];
        // Simple LCG keeps the harness deterministic with no extra crates.
        let mut rng = 0x1234_5678_9ABC_DEF0u64;
        let mut next = || -> i32 {
            rng = rng
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            (rng >> 33) as i32
        };
        let mut total_blocks = 0u64;
        let mut max_err_loeffler = 0.0f64;
        let mut max_err_zune = 0.0f64;
        let mut max_err_zune_simd = 0.0f64;
        for &mag in &magnitudes {
            for _trial in 0..200 {
                let coeffs: [i32; 64] = core::array::from_fn(|_| next() % (2 * mag + 1) - mag);
                let ref_output = reference_idct_f64(&coeffs);
                let mut coeffs_lj = coeffs;
                let mut out_lj = [0i16; 64];
                idct_int_libjpeg(&mut coeffs_lj, &mut out_lj, 8);
                let mut coeffs_zune = coeffs;
                let mut out_zune = [0i16; 64];
                idct_int(&mut coeffs_zune, &mut out_zune, 8);
                let mut out_wide = [0i16; 64];
                wide_simd::idct_int_wide(&coeffs, &mut out_wide, 8);
                for i in 0..64 {
                    let ref_clamped = ref_output[i].round().clamp(0.0, 255.0);
                    let err_lj = (out_lj[i] as f64 - ref_clamped).abs();
                    max_err_loeffler = max_err_loeffler.max(err_lj);
                    let err_zune = (out_zune[i] as f64 - ref_clamped).abs();
                    max_err_zune = max_err_zune.max(err_zune);
                    let err_wide = (out_wide[i] as f64 - ref_clamped).abs();
                    max_err_zune_simd = max_err_zune_simd.max(err_wide);
                    assert!(
                        err_lj <= 2.0,
                        "Loeffler i64 error {err_lj} at pos {i}, mag={mag}, \
                         ref={ref_clamped}, got={}",
                        out_lj[i]
                    );
                }
                assert_eq!(out_zune, out_wide, "zune scalar/SIMD mismatch at mag={mag}");
                total_blocks += 1;
            }
        }
        eprintln!(
            "IDCT harness: {total_blocks} blocks tested, \
             max_err: loeffler={max_err_loeffler:.1}, \
             zune={max_err_zune:.1}, zune_simd={max_err_zune_simd:.1}"
        );
    }

    /// Extreme same-sign and alternating-sign coefficient patterns must not
    /// push the clamped libjpeg path outside [0, 255].
    #[test]
    fn test_loeffler_extreme_coefficients() {
        for &mag in &[1000, 5000, 10000, 50000, 100000] {
            let coeffs_pos: [i32; 64] = [mag; 64];
            let mut coeffs = coeffs_pos;
            let mut output = [0i16; 64];
            idct_int_libjpeg(&mut coeffs, &mut output, 8);
            for (i, &v) in output.iter().enumerate() {
                assert!(
                    (0..=255).contains(&v),
                    "mag={mag} pos {i}: output {v} out of range"
                );
            }
            let coeffs_alt: [i32; 64] =
                core::array::from_fn(|i| if i % 2 == 0 { mag } else { -mag });
            let mut coeffs = coeffs_alt;
            let mut output = [0i16; 64];
            idct_int_libjpeg(&mut coeffs, &mut output, 8);
            for (i, &v) in output.iter().enumerate() {
                assert!(
                    (0..=255).contains(&v),
                    "mag={mag} alt pos {i}: output {v} out of range"
                );
            }
        }
    }
}