// Square root of two, used by the butterfly recombination steps below.
const SQRT2: f32 = core::f32::consts::SQRT_2;
// Per-element multipliers applied to the difference half of the 4-point butterfly.
// Numerically these match 1 / (2 * cos((2i + 1) * pi / 8)) for i = 0, 1.
const WC_MULTIPLIERS_4: [f32; 2] = [0.541_196_1, 1.306_563];
// Per-element multipliers applied to the difference half of the 8-point butterfly.
// Numerically these match 1 / (2 * cos((2i + 1) * pi / 16)) for i = 0..4.
const WC_MULTIPLIERS_8: [f32; 4] = [0.509_795_6, 0.601_344_9, 0.899_976_2, 2.562_915_5];
/// Forward 8x8 DCT with runtime dispatch.
///
/// Tries the architecture-specific implementation for the compilation target
/// (AVX2 on x86_64, NEON on aarch64, SIMD128 on wasm32) when the matching
/// `archmage` token can be summoned, and otherwise runs [`dct_8x8_scalar`].
///
/// `input` holds the 64 source samples; the 64 coefficients are written to
/// `output`.
#[inline]
pub fn dct_8x8(input: &[f32; 64], output: &mut [f32; 64]) {
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(avx2) = archmage::X64V3Token::summon() {
            return dct_8x8_avx2(avx2, input, output);
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(neon) = archmage::NeonToken::summon() {
            return dct_8x8_neon(neon, input, output);
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(simd128) = archmage::Wasm128Token::summon() {
            return dct_8x8_wasm128(simd128, input, output);
        }
    }
    // No SIMD token available (or unsupported target): portable fallback.
    dct_8x8_scalar(input, output)
}
/// Inverse 8x8 DCT with runtime dispatch.
///
/// Tries the architecture-specific implementation for the compilation target
/// (AVX2 on x86_64, NEON on aarch64, SIMD128 on wasm32) when the matching
/// `archmage` token can be summoned, and otherwise runs [`idct_8x8_scalar`].
///
/// `input` holds the 64 coefficients; the 64 reconstructed samples are
/// written to `output`.
#[inline]
pub fn idct_8x8(input: &[f32; 64], output: &mut [f32; 64]) {
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(avx2) = archmage::X64V3Token::summon() {
            return idct_8x8_avx2(avx2, input, output);
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(neon) = archmage::NeonToken::summon() {
            return idct_8x8_neon(neon, input, output);
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(simd128) = archmage::Wasm128Token::summon() {
            return idct_8x8_wasm128(simd128, input, output);
        }
    }
    // No SIMD token available (or unsupported target): portable fallback.
    idct_8x8_scalar(input, output)
}
/// Portable forward 8x8 DCT.
///
/// Runs the 8-point transform over each group of 8 consecutive values,
/// scales by 1/8, transposes the 8x8 block, then repeats the transform and
/// scaling on the transposed data. The result is copied to `output` in that
/// transposed arrangement (it is not transposed back).
#[inline]
pub fn dct_8x8_scalar(input: &[f32; 64], output: &mut [f32; 64]) {
    // First pass: transform every 8-value line of the input.
    let mut first_pass = crate::scratch_buf::<64>();
    for (line, src) in first_pass.chunks_exact_mut(8).zip(input.chunks_exact(8)) {
        line.copy_from_slice(src);
        dct1d_8_scalar(line);
        for value in line.iter_mut() {
            *value *= 0.125;
        }
    }

    // Transpose so the second pass works along the other axis.
    let mut second_pass = crate::scratch_buf::<64>();
    for (r, line) in first_pass.chunks_exact(8).enumerate() {
        for (c, &value) in line.iter().enumerate() {
            second_pass[c * 8 + r] = value;
        }
    }

    // Second pass: same transform and scaling on the transposed lines.
    for line in second_pass.chunks_exact_mut(8) {
        dct1d_8_scalar(line);
        for value in line.iter_mut() {
            *value *= 0.125;
        }
    }

    output.copy_from_slice(&second_pass);
}
/// Portable inverse 8x8 DCT.
///
/// Runs the inverse 8-point transform over each group of 8 consecutive
/// values, transposes the 8x8 block, then runs the inverse transform again on
/// the transposed data. No extra scaling is applied here. The result is
/// copied to `output` in that transposed arrangement.
#[inline]
pub fn idct_8x8_scalar(input: &[f32; 64], output: &mut [f32; 64]) {
    // First pass: inverse-transform every 8-value line.
    let mut first_pass = crate::scratch_buf::<64>();
    first_pass.copy_from_slice(input);
    for line in first_pass.chunks_exact_mut(8) {
        idct1d_8_scalar(line);
    }

    // Transpose so the second pass works along the other axis.
    let mut second_pass = crate::scratch_buf::<64>();
    for (r, line) in first_pass.chunks_exact(8).enumerate() {
        for (c, &value) in line.iter().enumerate() {
            second_pass[c * 8 + r] = value;
        }
    }

    // Second pass on the transposed lines.
    for line in second_pass.chunks_exact_mut(8) {
        idct1d_8_scalar(line);
    }

    output.copy_from_slice(&second_pass);
}
/// In-place unscaled 4-point forward butterfly on `mem[0..4]`.
///
/// Panics if `mem` has fewer than 4 elements.
fn dct1d_4_scalar(mem: &mut [f32]) {
    let (x0, x1, x2, x3) = (mem[0], mem[1], mem[2], mem[3]);

    // Mirror-pair sums feed the even outputs.
    let sum_outer = x0 + x3;
    let sum_inner = x1 + x2;

    // Mirror-pair differences are weighted before being combined.
    let diff_outer = (x0 - x3) * WC_MULTIPLIERS_4[0];
    let diff_inner = (x1 - x2) * WC_MULTIPLIERS_4[1];
    let odd_sum = diff_outer + diff_inner;
    let odd_diff = diff_outer - diff_inner;

    mem[0] = sum_outer + sum_inner;
    mem[1] = SQRT2.mul_add(odd_sum, odd_diff);
    mem[2] = sum_outer - sum_inner;
    mem[3] = odd_diff;
}
/// In-place unscaled 8-point forward transform on `mem[0..8]`.
///
/// Splits the input into mirror-pair sums and weighted mirror-pair
/// differences, runs the 4-point butterfly on each half, recombines the
/// difference half, and interleaves the two halves into even/odd output
/// positions. Callers apply the 1/8 scaling.
///
/// Panics if `mem` has fewer than 8 elements.
fn dct1d_8_scalar(mem: &mut [f32]) {
    let mut even = [0.0f32; 4];
    let mut odd = [0.0f32; 4];
    for i in 0..4 {
        let (front, back) = (mem[i], mem[7 - i]);
        even[i] = front + back;
        odd[i] = (front - back) * WC_MULTIPLIERS_8[i];
    }

    dct1d_4_scalar(&mut even);
    dct1d_4_scalar(&mut odd);

    // Recombine the odd half; each step reads the not-yet-updated neighbor.
    odd[0] = SQRT2.mul_add(odd[0], odd[1]);
    odd[1] += odd[2];
    odd[2] += odd[3];

    // Even half goes to even indices, odd half to odd indices.
    for (pair, (&e, &o)) in mem.chunks_exact_mut(2).zip(even.iter().zip(odd.iter())) {
        pair[0] = e;
        pair[1] = o;
    }
}
/// In-place 4-point inverse butterfly on `mem[0..4]`.
///
/// Reverses the steps of `dct1d_4_scalar` without the 1/2 factors of the
/// individual butterflies.
///
/// Panics if `mem` has fewer than 4 elements.
fn idct1d_4_scalar(mem: &mut [f32]) {
    let (c0, c1, c2, c3) = (mem[0], mem[1], mem[2], mem[3]);

    // Undo the sqrt(2) recombination of the odd pair, then the weighting.
    let odd_sum = (c1 - c3) * (1.0 / SQRT2);
    let odd_outer = (odd_sum + c3) * (1.0 / WC_MULTIPLIERS_4[0]);
    let odd_inner = (odd_sum - c3) * (1.0 / WC_MULTIPLIERS_4[1]);

    let even_outer = c0 + c2;
    let even_inner = c0 - c2;

    mem[0] = even_outer + odd_outer;
    mem[3] = even_outer - odd_outer;
    mem[1] = even_inner + odd_inner;
    mem[2] = even_inner - odd_inner;
}
/// In-place 8-point inverse transform on `mem[0..8]`.
///
/// De-interleaves even/odd positions, undoes the recombination of the odd
/// half, runs the 4-point inverse on both halves, removes the 8-point
/// weighting from the odd half and merges the halves into mirrored outputs.
///
/// Panics if `mem` has fewer than 8 elements.
fn idct1d_8_scalar(mem: &mut [f32]) {
    let mut even = [mem[0], mem[2], mem[4], mem[6]];
    let mut odd = [mem[1], mem[3], mem[5], mem[7]];

    // Reverse order of the forward recombination: last element first.
    odd[2] -= odd[3];
    odd[1] -= odd[2];
    odd[0] = (odd[0] - odd[1]) * (1.0 / SQRT2);

    idct1d_4_scalar(&mut odd);
    idct1d_4_scalar(&mut even);

    for (i, (&e, &o)) in even.iter().zip(odd.iter()).enumerate() {
        let weighted = o * (1.0 / WC_MULTIPLIERS_8[i]);
        mem[i] = e + weighted;
        mem[7 - i] = e - weighted;
    }
}
/// Forward 8x8 DCT using 8-lane vectors (x86_64, `X64V3Token`).
///
/// Loads eight vectors from `input` at offsets 0, 8, ..., 56, applies the
/// 8-point transform across the eight vectors, scales by 1/8, transposes,
/// applies the transform and scaling again, and stores the eight vectors to
/// consecutive 8-value groups of `output`.
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_avx2(token: archmage::X64V3Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x8;

    let eighth = f32x8::splat(token, 0.125);

    let v0 = f32x8::from_slice(token, &input[0..]);
    let v1 = f32x8::from_slice(token, &input[8..]);
    let v2 = f32x8::from_slice(token, &input[16..]);
    let v3 = f32x8::from_slice(token, &input[24..]);
    let v4 = f32x8::from_slice(token, &input[32..]);
    let v5 = f32x8::from_slice(token, &input[40..]);
    let v6 = f32x8::from_slice(token, &input[48..]);
    let v7 = f32x8::from_slice(token, &input[56..]);

    // First pass: transform across the eight vectors, then scale.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        vectorized_dct1d_8(token, v0, v1, v2, v3, v4, v5, v6, v7);
    let (v0, v1, v2, v3) = (v0 * eighth, v1 * eighth, v2 * eighth, v3 * eighth);
    let (v4, v5, v6, v7) = (v4 * eighth, v5 * eighth, v6 * eighth, v7 * eighth);

    // Swap axes so the second pass covers the other dimension.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        transpose_8x8_regs(token, v0, v1, v2, v3, v4, v5, v6, v7);

    // Second pass, same transform and scaling.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        vectorized_dct1d_8(token, v0, v1, v2, v3, v4, v5, v6, v7);
    let (v0, v1, v2, v3) = (v0 * eighth, v1 * eighth, v2 * eighth, v3 * eighth);
    let (v4, v5, v6, v7) = (v4 * eighth, v5 * eighth, v6 * eighth, v7 * eighth);

    // The sub-slices are exactly 8 long, so the array conversion cannot fail.
    v0.store((&mut output[0..8]).try_into().unwrap());
    v1.store((&mut output[8..16]).try_into().unwrap());
    v2.store((&mut output[16..24]).try_into().unwrap());
    v3.store((&mut output[24..32]).try_into().unwrap());
    v4.store((&mut output[32..40]).try_into().unwrap());
    v5.store((&mut output[40..48]).try_into().unwrap());
    v6.store((&mut output[48..56]).try_into().unwrap());
    v7.store((&mut output[56..64]).try_into().unwrap());
}
/// Inverse 8x8 DCT using 8-lane vectors (x86_64, `X64V3Token`).
///
/// Loads eight vectors from `input` at offsets 0, 8, ..., 56, applies the
/// inverse 8-point transform across them, transposes, applies the inverse
/// transform again, and stores the eight vectors to consecutive 8-value
/// groups of `output`. No extra scaling is applied.
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_avx2(token: archmage::X64V3Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x8;

    let v0 = f32x8::from_slice(token, &input[0..]);
    let v1 = f32x8::from_slice(token, &input[8..]);
    let v2 = f32x8::from_slice(token, &input[16..]);
    let v3 = f32x8::from_slice(token, &input[24..]);
    let v4 = f32x8::from_slice(token, &input[32..]);
    let v5 = f32x8::from_slice(token, &input[40..]);
    let v6 = f32x8::from_slice(token, &input[48..]);
    let v7 = f32x8::from_slice(token, &input[56..]);

    // Inverse pass, axis swap, inverse pass.
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        vectorized_idct1d_8(token, v0, v1, v2, v3, v4, v5, v6, v7);
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        transpose_8x8_regs(token, v0, v1, v2, v3, v4, v5, v6, v7);
    let (v0, v1, v2, v3, v4, v5, v6, v7) =
        vectorized_idct1d_8(token, v0, v1, v2, v3, v4, v5, v6, v7);

    // The sub-slices are exactly 8 long, so the array conversion cannot fail.
    v0.store((&mut output[0..8]).try_into().unwrap());
    v1.store((&mut output[8..16]).try_into().unwrap());
    v2.store((&mut output[16..24]).try_into().unwrap());
    v3.store((&mut output[24..32]).try_into().unwrap());
    v4.store((&mut output[32..40]).try_into().unwrap());
    v5.store((&mut output[40..48]).try_into().unwrap());
    v6.store((&mut output[48..56]).try_into().unwrap());
    v7.store((&mut output[56..64]).try_into().unwrap());
}
/// Unscaled 8-point forward transform applied lane-wise across eight vectors.
///
/// `r0..r7` are the eight inputs of the transform; every lane is an
/// independent transform. Performs the same arithmetic as `dct1d_8_scalar`
/// and returns the eight outputs in natural order (index 0 first).
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn vectorized_dct1d_8(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use magetypes::simd::f32x8;

    let sqrt2 = f32x8::splat(token, SQRT2);
    let w4_outer = f32x8::splat(token, WC_MULTIPLIERS_4[0]);
    let w4_inner = f32x8::splat(token, WC_MULTIPLIERS_4[1]);

    // Stage 1: mirror-pair sums and differences.
    let sum0 = r0 + r7;
    let sum1 = r1 + r6;
    let sum2 = r2 + r5;
    let sum3 = r3 + r4;
    let diff0 = r0 - r7;
    let diff1 = r1 - r6;
    let diff2 = r2 - r5;
    let diff3 = r3 - r4;

    // 4-point butterfly on the sums -> outputs 0, 2, 4, 6.
    let even_lo = sum0 + sum3;
    let even_hi = sum1 + sum2;
    let even_x = (sum0 - sum3) * w4_outer;
    let even_y = (sum1 - sum2) * w4_inner;
    let out0 = even_lo + even_hi;
    let out4 = even_lo - even_hi;
    let out6 = even_x - even_y;
    let out2 = sqrt2.mul_add(even_x + even_y, out6);

    // Weighted differences, then the same 4-point butterfly.
    let t0 = diff0 * f32x8::splat(token, WC_MULTIPLIERS_8[0]);
    let t1 = diff1 * f32x8::splat(token, WC_MULTIPLIERS_8[1]);
    let t2 = diff2 * f32x8::splat(token, WC_MULTIPLIERS_8[2]);
    let t3 = diff3 * f32x8::splat(token, WC_MULTIPLIERS_8[3]);
    let odd_lo = t0 + t3;
    let odd_hi = t1 + t2;
    let odd_x = (t0 - t3) * w4_outer;
    let odd_y = (t1 - t2) * w4_inner;
    let q0 = odd_lo + odd_hi;
    let q2 = odd_lo - odd_hi;
    let q3 = odd_x - odd_y;
    let q1 = sqrt2.mul_add(odd_x + odd_y, q3);

    // Recombine the odd half -> outputs 1, 3, 5, 7.
    let out1 = sqrt2.mul_add(q0, q1);
    let out3 = q1 + q2;
    let out5 = q2 + q3;
    let out7 = q3;

    (out0, out1, out2, out3, out4, out5, out6, out7)
}
/// 8-point inverse transform applied lane-wise across eight vectors.
///
/// `r0..r7` are the eight coefficients in natural order; every lane is an
/// independent transform. Performs the same arithmetic as `idct1d_8_scalar`
/// and returns the eight outputs in natural order.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn vectorized_idct1d_8(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use magetypes::simd::f32x8;

    let inv_sqrt2 = f32x8::splat(token, 1.0 / SQRT2);
    let inv_w4_outer = f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let inv_w4_inner = f32x8::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // Odd-indexed coefficients: undo the forward recombination, last first.
    let q3 = r7;
    let q2 = r5 - q3;
    let q1 = r3 - q2;
    let q0 = (r1 - q1) * inv_sqrt2;

    // 4-point inverse on the odd half, then remove the 8-point weighting.
    let odd_mid = (q1 - q3) * inv_sqrt2;
    let odd_x = (odd_mid + q3) * inv_w4_outer;
    let odd_y = (odd_mid - q3) * inv_w4_inner;
    let odd_lo = q0 + q2;
    let odd_hi = q0 - q2;
    let t0 = (odd_lo + odd_x) * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let t1 = (odd_hi + odd_y) * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let t2 = (odd_hi - odd_y) * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let t3 = (odd_lo - odd_x) * f32x8::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // 4-point inverse on the even-indexed coefficients.
    let even_mid = (r2 - r6) * inv_sqrt2;
    let even_x = (even_mid + r6) * inv_w4_outer;
    let even_y = (even_mid - r6) * inv_w4_inner;
    let even_lo = r0 + r4;
    let even_hi = r0 - r4;
    let u0 = even_lo + even_x;
    let u1 = even_hi + even_y;
    let u2 = even_hi - even_y;
    let u3 = even_lo - even_x;

    // Final butterfly: output i and its mirror 7 - i.
    (
        u0 + t0,
        u1 + t1,
        u2 + t2,
        u3 + t3,
        u3 - t3,
        u2 - t2,
        u1 - t1,
        u0 - t0,
    )
}
/// Transposes an 8x8 block held in eight 8-lane vectors.
///
/// Treating `r0..r7` as rows, returned vector `i` holds lane `i` of every
/// input row, in row order. Uses three stages: pairwise unpack, 4-element
/// shuffle within each 128-bit half, then a 128-bit half exchange.
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
pub(crate) fn transpose_8x8_regs(
    token: archmage::X64V3Token,
    r0: magetypes::simd::f32x8,
    r1: magetypes::simd::f32x8,
    r2: magetypes::simd::f32x8,
    r3: magetypes::simd::f32x8,
    r4: magetypes::simd::f32x8,
    r5: magetypes::simd::f32x8,
    r6: magetypes::simd::f32x8,
    r7: magetypes::simd::f32x8,
) -> (
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
    magetypes::simd::f32x8,
) {
    use core::arch::x86_64::*;
    use magetypes::simd::f32x8;

    let (row0, row1, row2, row3) = (r0.raw(), r1.raw(), r2.raw(), r3.raw());
    let (row4, row5, row6, row7) = (r4.raw(), r5.raw(), r6.raw(), r7.raw());

    // Stage 1: interleave adjacent row pairs element by element.
    let pair01_lo = _mm256_unpacklo_ps(row0, row1);
    let pair01_hi = _mm256_unpackhi_ps(row0, row1);
    let pair23_lo = _mm256_unpacklo_ps(row2, row3);
    let pair23_hi = _mm256_unpackhi_ps(row2, row3);
    let pair45_lo = _mm256_unpacklo_ps(row4, row5);
    let pair45_hi = _mm256_unpackhi_ps(row4, row5);
    let pair67_lo = _mm256_unpacklo_ps(row6, row7);
    let pair67_hi = _mm256_unpackhi_ps(row6, row7);

    // Stage 2: gather four rows' worth of one column per 128-bit half.
    let quad0_top = _mm256_shuffle_ps::<0x44>(pair01_lo, pair23_lo);
    let quad1_top = _mm256_shuffle_ps::<0xEE>(pair01_lo, pair23_lo);
    let quad2_top = _mm256_shuffle_ps::<0x44>(pair01_hi, pair23_hi);
    let quad3_top = _mm256_shuffle_ps::<0xEE>(pair01_hi, pair23_hi);
    let quad0_bot = _mm256_shuffle_ps::<0x44>(pair45_lo, pair67_lo);
    let quad1_bot = _mm256_shuffle_ps::<0xEE>(pair45_lo, pair67_lo);
    let quad2_bot = _mm256_shuffle_ps::<0x44>(pair45_hi, pair67_hi);
    let quad3_bot = _mm256_shuffle_ps::<0xEE>(pair45_hi, pair67_hi);

    // Stage 3: 0x20 joins the low halves (columns 0..4), 0x31 the high halves
    // (columns 4..8).
    let col0 = _mm256_permute2f128_ps::<0x20>(quad0_top, quad0_bot);
    let col1 = _mm256_permute2f128_ps::<0x20>(quad1_top, quad1_bot);
    let col2 = _mm256_permute2f128_ps::<0x20>(quad2_top, quad2_bot);
    let col3 = _mm256_permute2f128_ps::<0x20>(quad3_top, quad3_bot);
    let col4 = _mm256_permute2f128_ps::<0x31>(quad0_top, quad0_bot);
    let col5 = _mm256_permute2f128_ps::<0x31>(quad1_top, quad1_bot);
    let col6 = _mm256_permute2f128_ps::<0x31>(quad2_top, quad2_bot);
    let col7 = _mm256_permute2f128_ps::<0x31>(quad3_top, quad3_bot);

    (
        f32x8::from_m256(token, col0),
        f32x8::from_m256(token, col1),
        f32x8::from_m256(token, col2),
        f32x8::from_m256(token, col3),
        f32x8::from_m256(token, col4),
        f32x8::from_m256(token, col5),
        f32x8::from_m256(token, col6),
        f32x8::from_m256(token, col7),
    )
}
/// Forward 8x8 DCT using 4-lane vectors (aarch64, `NeonToken`).
///
/// The block is processed as two 4-wide halves: vectors loaded at offsets
/// 0, 8, ..., 56 and at offsets 4, 12, ..., 60. Each half gets the 8-point
/// transform across its eight vectors plus 1/8 scaling; the four 4x4
/// quadrants are transposed; the transform and scaling run again; and the
/// results are stored to `output` in 4-value groups.
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_neon(token: archmage::NeonToken, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    let eighth = f32x4::splat(token, 0.125);

    // First pass over the vectors that start each 8-value line.
    let lo0 = f32x4::from_slice(token, &input[0..]);
    let lo1 = f32x4::from_slice(token, &input[8..]);
    let lo2 = f32x4::from_slice(token, &input[16..]);
    let lo3 = f32x4::from_slice(token, &input[24..]);
    let lo4 = f32x4::from_slice(token, &input[32..]);
    let lo5 = f32x4::from_slice(token, &input[40..]);
    let lo6 = f32x4::from_slice(token, &input[48..]);
    let lo7 = f32x4::from_slice(token, &input[56..]);
    let (lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7) =
        neon_dct1d_8(token, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7);
    let (lo0, lo1, lo2, lo3) = (lo0 * eighth, lo1 * eighth, lo2 * eighth, lo3 * eighth);
    let (lo4, lo5, lo6, lo7) = (lo4 * eighth, lo5 * eighth, lo6 * eighth, lo7 * eighth);

    // First pass over the vectors that start 4 values into each line.
    let hi0 = f32x4::from_slice(token, &input[4..]);
    let hi1 = f32x4::from_slice(token, &input[12..]);
    let hi2 = f32x4::from_slice(token, &input[20..]);
    let hi3 = f32x4::from_slice(token, &input[28..]);
    let hi4 = f32x4::from_slice(token, &input[36..]);
    let hi5 = f32x4::from_slice(token, &input[44..]);
    let hi6 = f32x4::from_slice(token, &input[52..]);
    let hi7 = f32x4::from_slice(token, &input[60..]);
    let (hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7) =
        neon_dct1d_8(token, hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7);
    let (hi0, hi1, hi2, hi3) = (hi0 * eighth, hi1 * eighth, hi2 * eighth, hi3 * eighth);
    let (hi4, hi5, hi6, hi7) = (hi4 * eighth, hi5 * eighth, hi6 * eighth, hi7 * eighth);

    // Transpose each 4x4 quadrant (top/bottom = vectors 0..4 / 4..8).
    let (tl0, tl1, tl2, tl3) = neon_transpose_4x4(token, lo0, lo1, lo2, lo3);
    let (tr0, tr1, tr2, tr3) = neon_transpose_4x4(token, hi0, hi1, hi2, hi3);
    let (bl0, bl1, bl2, bl3) = neon_transpose_4x4(token, lo4, lo5, lo6, lo7);
    let (br0, br1, br2, br3) = neon_transpose_4x4(token, hi4, hi5, hi6, hi7);

    // Second pass: the top quadrants form one 8-vector transform, the bottom
    // quadrants the other.
    let (tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3) =
        neon_dct1d_8(token, tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3);
    let (tl0, tl1, tl2, tl3) = (tl0 * eighth, tl1 * eighth, tl2 * eighth, tl3 * eighth);
    let (tr0, tr1, tr2, tr3) = (tr0 * eighth, tr1 * eighth, tr2 * eighth, tr3 * eighth);
    let (bl0, bl1, bl2, bl3, br0, br1, br2, br3) =
        neon_dct1d_8(token, bl0, bl1, bl2, bl3, br0, br1, br2, br3);
    let (bl0, bl1, bl2, bl3) = (bl0 * eighth, bl1 * eighth, bl2 * eighth, bl3 * eighth);
    let (br0, br1, br2, br3) = (br0 * eighth, br1 * eighth, br2 * eighth, br3 * eighth);

    // Each 8-value output line is a "top" vector followed by a "bottom" one.
    tl0.store((&mut output[0..4]).try_into().unwrap());
    bl0.store((&mut output[4..8]).try_into().unwrap());
    tl1.store((&mut output[8..12]).try_into().unwrap());
    bl1.store((&mut output[12..16]).try_into().unwrap());
    tl2.store((&mut output[16..20]).try_into().unwrap());
    bl2.store((&mut output[20..24]).try_into().unwrap());
    tl3.store((&mut output[24..28]).try_into().unwrap());
    bl3.store((&mut output[28..32]).try_into().unwrap());
    tr0.store((&mut output[32..36]).try_into().unwrap());
    br0.store((&mut output[36..40]).try_into().unwrap());
    tr1.store((&mut output[40..44]).try_into().unwrap());
    br1.store((&mut output[44..48]).try_into().unwrap());
    tr2.store((&mut output[48..52]).try_into().unwrap());
    br2.store((&mut output[52..56]).try_into().unwrap());
    tr3.store((&mut output[56..60]).try_into().unwrap());
    br3.store((&mut output[60..64]).try_into().unwrap());
}
/// Inverse 8x8 DCT using 4-lane vectors (aarch64, `NeonToken`).
///
/// The block is processed as two 4-wide halves: vectors loaded at offsets
/// 0, 8, ..., 56 and at offsets 4, 12, ..., 60. Each half gets the inverse
/// 8-point transform across its eight vectors; the four 4x4 quadrants are
/// transposed; the inverse transform runs again; and the results are stored
/// to `output` in 4-value groups. No extra scaling is applied.
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_neon(token: archmage::NeonToken, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    // Vectors that start each 8-value line.
    let lo0 = f32x4::from_slice(token, &input[0..]);
    let lo1 = f32x4::from_slice(token, &input[8..]);
    let lo2 = f32x4::from_slice(token, &input[16..]);
    let lo3 = f32x4::from_slice(token, &input[24..]);
    let lo4 = f32x4::from_slice(token, &input[32..]);
    let lo5 = f32x4::from_slice(token, &input[40..]);
    let lo6 = f32x4::from_slice(token, &input[48..]);
    let lo7 = f32x4::from_slice(token, &input[56..]);

    // Vectors that start 4 values into each line.
    let hi0 = f32x4::from_slice(token, &input[4..]);
    let hi1 = f32x4::from_slice(token, &input[12..]);
    let hi2 = f32x4::from_slice(token, &input[20..]);
    let hi3 = f32x4::from_slice(token, &input[28..]);
    let hi4 = f32x4::from_slice(token, &input[36..]);
    let hi5 = f32x4::from_slice(token, &input[44..]);
    let hi6 = f32x4::from_slice(token, &input[52..]);
    let hi7 = f32x4::from_slice(token, &input[60..]);

    // First inverse pass on both halves.
    let (lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7) =
        neon_idct1d_8(token, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7);
    let (hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7) =
        neon_idct1d_8(token, hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7);

    // Transpose each 4x4 quadrant (top/bottom = vectors 0..4 / 4..8).
    let (tl0, tl1, tl2, tl3) = neon_transpose_4x4(token, lo0, lo1, lo2, lo3);
    let (tr0, tr1, tr2, tr3) = neon_transpose_4x4(token, hi0, hi1, hi2, hi3);
    let (bl0, bl1, bl2, bl3) = neon_transpose_4x4(token, lo4, lo5, lo6, lo7);
    let (br0, br1, br2, br3) = neon_transpose_4x4(token, hi4, hi5, hi6, hi7);

    // Second inverse pass.
    let (tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3) =
        neon_idct1d_8(token, tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3);
    let (bl0, bl1, bl2, bl3, br0, br1, br2, br3) =
        neon_idct1d_8(token, bl0, bl1, bl2, bl3, br0, br1, br2, br3);

    // Each 8-value output line is a "top" vector followed by a "bottom" one.
    tl0.store((&mut output[0..4]).try_into().unwrap());
    bl0.store((&mut output[4..8]).try_into().unwrap());
    tl1.store((&mut output[8..12]).try_into().unwrap());
    bl1.store((&mut output[12..16]).try_into().unwrap());
    tl2.store((&mut output[16..20]).try_into().unwrap());
    bl2.store((&mut output[20..24]).try_into().unwrap());
    tl3.store((&mut output[24..28]).try_into().unwrap());
    bl3.store((&mut output[28..32]).try_into().unwrap());
    tr0.store((&mut output[32..36]).try_into().unwrap());
    br0.store((&mut output[36..40]).try_into().unwrap());
    tr1.store((&mut output[40..44]).try_into().unwrap());
    br1.store((&mut output[44..48]).try_into().unwrap());
    tr2.store((&mut output[48..52]).try_into().unwrap());
    br2.store((&mut output[52..56]).try_into().unwrap());
    tr3.store((&mut output[56..60]).try_into().unwrap());
    br3.store((&mut output[60..64]).try_into().unwrap());
}
/// Unscaled 8-point forward transform applied lane-wise across eight
/// 4-lane vectors.
///
/// `r0..r7` are the eight inputs of the transform; every lane is an
/// independent transform. Performs the same arithmetic as `dct1d_8_scalar`
/// and returns the eight outputs in natural order (index 0 first).
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn neon_dct1d_8(
    token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let sqrt2 = f32x4::splat(token, SQRT2);
    let w4_outer = f32x4::splat(token, WC_MULTIPLIERS_4[0]);
    let w4_inner = f32x4::splat(token, WC_MULTIPLIERS_4[1]);

    // Stage 1: mirror-pair sums and differences.
    let sum0 = r0 + r7;
    let sum1 = r1 + r6;
    let sum2 = r2 + r5;
    let sum3 = r3 + r4;
    let diff0 = r0 - r7;
    let diff1 = r1 - r6;
    let diff2 = r2 - r5;
    let diff3 = r3 - r4;

    // 4-point butterfly on the sums -> outputs 0, 2, 4, 6.
    let even_lo = sum0 + sum3;
    let even_hi = sum1 + sum2;
    let even_x = (sum0 - sum3) * w4_outer;
    let even_y = (sum1 - sum2) * w4_inner;
    let out0 = even_lo + even_hi;
    let out4 = even_lo - even_hi;
    let out6 = even_x - even_y;
    let out2 = sqrt2.mul_add(even_x + even_y, out6);

    // Weighted differences, then the same 4-point butterfly.
    let t0 = diff0 * f32x4::splat(token, WC_MULTIPLIERS_8[0]);
    let t1 = diff1 * f32x4::splat(token, WC_MULTIPLIERS_8[1]);
    let t2 = diff2 * f32x4::splat(token, WC_MULTIPLIERS_8[2]);
    let t3 = diff3 * f32x4::splat(token, WC_MULTIPLIERS_8[3]);
    let odd_lo = t0 + t3;
    let odd_hi = t1 + t2;
    let odd_x = (t0 - t3) * w4_outer;
    let odd_y = (t1 - t2) * w4_inner;
    let q0 = odd_lo + odd_hi;
    let q2 = odd_lo - odd_hi;
    let q3 = odd_x - odd_y;
    let q1 = sqrt2.mul_add(odd_x + odd_y, q3);

    // Recombine the odd half -> outputs 1, 3, 5, 7.
    let out1 = sqrt2.mul_add(q0, q1);
    let out3 = q1 + q2;
    let out5 = q2 + q3;
    let out7 = q3;

    (out0, out1, out2, out3, out4, out5, out6, out7)
}
/// 8-point inverse transform applied lane-wise across eight 4-lane vectors.
///
/// `r0..r7` are the eight coefficients in natural order; every lane is an
/// independent transform. Performs the same arithmetic as `idct1d_8_scalar`
/// and returns the eight outputs in natural order.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn neon_idct1d_8(
    token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let inv_w4_outer = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let inv_w4_inner = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // Odd-indexed coefficients: undo the forward recombination, last first.
    let q3 = r7;
    let q2 = r5 - q3;
    let q1 = r3 - q2;
    let q0 = (r1 - q1) * inv_sqrt2;

    // 4-point inverse on the odd half, then remove the 8-point weighting.
    let odd_mid = (q1 - q3) * inv_sqrt2;
    let odd_x = (odd_mid + q3) * inv_w4_outer;
    let odd_y = (odd_mid - q3) * inv_w4_inner;
    let odd_lo = q0 + q2;
    let odd_hi = q0 - q2;
    let t0 = (odd_lo + odd_x) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let t1 = (odd_hi + odd_y) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let t2 = (odd_hi - odd_y) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let t3 = (odd_lo - odd_x) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // 4-point inverse on the even-indexed coefficients.
    let even_mid = (r2 - r6) * inv_sqrt2;
    let even_x = (even_mid + r6) * inv_w4_outer;
    let even_y = (even_mid - r6) * inv_w4_inner;
    let even_lo = r0 + r4;
    let even_hi = r0 - r4;
    let u0 = even_lo + even_x;
    let u1 = even_hi + even_y;
    let u2 = even_hi - even_y;
    let u3 = even_lo - even_x;

    // Final butterfly: output i and its mirror 7 - i.
    (
        u0 + t0,
        u1 + t1,
        u2 + t2,
        u3 + t3,
        u3 - t3,
        u2 - t2,
        u1 - t1,
        u0 - t0,
    )
}
/// Transposes a 4x4 block held in four 4-lane vectors.
///
/// Treating `r0..r3` as rows, returned vector `i` holds lane `i` of every
/// input row, in row order. Done with a 32-bit element transpose of row
/// pairs followed by a 64-bit element transpose of the results.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::type_complexity)]
fn neon_transpose_4x4(
    _token: archmage::NeonToken,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use core::arch::aarch64::*;
    use magetypes::simd::f32x4;

    let (row0, row1, row2, row3) = (r0.raw(), r1.raw(), r2.raw(), r3.raw());

    // 32-bit interleave: even lanes of each row pair, then odd lanes,
    // viewed as pairs of 64-bit elements for the next step.
    let even01 = vreinterpretq_f64_f32(vtrn1q_f32(row0, row1));
    let odd01 = vreinterpretq_f64_f32(vtrn2q_f32(row0, row1));
    let even23 = vreinterpretq_f64_f32(vtrn1q_f32(row2, row3));
    let odd23 = vreinterpretq_f64_f32(vtrn2q_f32(row2, row3));

    // 64-bit interleave joins the matching halves of the two row pairs.
    let col0 = vreinterpretq_f32_f64(vtrn1q_f64(even01, even23));
    let col1 = vreinterpretq_f32_f64(vtrn1q_f64(odd01, odd23));
    let col2 = vreinterpretq_f32_f64(vtrn2q_f64(even01, even23));
    let col3 = vreinterpretq_f32_f64(vtrn2q_f64(odd01, odd23));

    (
        f32x4::from_float32x4_t(_token, col0),
        f32x4::from_float32x4_t(_token, col1),
        f32x4::from_float32x4_t(_token, col2),
        f32x4::from_float32x4_t(_token, col3),
    )
}
/// Forward 8x8 DCT using 4-lane vectors (wasm32, `Wasm128Token`).
///
/// The block is processed as two 4-wide halves: vectors loaded at offsets
/// 0, 8, ..., 56 and at offsets 4, 12, ..., 60. Each half gets the 8-point
/// transform across its eight vectors plus 1/8 scaling; the four 4x4
/// quadrants are transposed; the transform and scaling run again; and the
/// results are stored to `output` in 4-value groups.
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
pub fn dct_8x8_wasm128(token: archmage::Wasm128Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    let eighth = f32x4::splat(token, 0.125);

    // First pass over the vectors that start each 8-value line.
    let lo0 = f32x4::from_slice(token, &input[0..]);
    let lo1 = f32x4::from_slice(token, &input[8..]);
    let lo2 = f32x4::from_slice(token, &input[16..]);
    let lo3 = f32x4::from_slice(token, &input[24..]);
    let lo4 = f32x4::from_slice(token, &input[32..]);
    let lo5 = f32x4::from_slice(token, &input[40..]);
    let lo6 = f32x4::from_slice(token, &input[48..]);
    let lo7 = f32x4::from_slice(token, &input[56..]);
    let (lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7) =
        wasm128_dct1d_8(token, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7);
    let (lo0, lo1, lo2, lo3) = (lo0 * eighth, lo1 * eighth, lo2 * eighth, lo3 * eighth);
    let (lo4, lo5, lo6, lo7) = (lo4 * eighth, lo5 * eighth, lo6 * eighth, lo7 * eighth);

    // First pass over the vectors that start 4 values into each line.
    let hi0 = f32x4::from_slice(token, &input[4..]);
    let hi1 = f32x4::from_slice(token, &input[12..]);
    let hi2 = f32x4::from_slice(token, &input[20..]);
    let hi3 = f32x4::from_slice(token, &input[28..]);
    let hi4 = f32x4::from_slice(token, &input[36..]);
    let hi5 = f32x4::from_slice(token, &input[44..]);
    let hi6 = f32x4::from_slice(token, &input[52..]);
    let hi7 = f32x4::from_slice(token, &input[60..]);
    let (hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7) =
        wasm128_dct1d_8(token, hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7);
    let (hi0, hi1, hi2, hi3) = (hi0 * eighth, hi1 * eighth, hi2 * eighth, hi3 * eighth);
    let (hi4, hi5, hi6, hi7) = (hi4 * eighth, hi5 * eighth, hi6 * eighth, hi7 * eighth);

    // Transpose each 4x4 quadrant (top/bottom = vectors 0..4 / 4..8).
    let (tl0, tl1, tl2, tl3) = wasm128_transpose_4x4(token, lo0, lo1, lo2, lo3);
    let (tr0, tr1, tr2, tr3) = wasm128_transpose_4x4(token, hi0, hi1, hi2, hi3);
    let (bl0, bl1, bl2, bl3) = wasm128_transpose_4x4(token, lo4, lo5, lo6, lo7);
    let (br0, br1, br2, br3) = wasm128_transpose_4x4(token, hi4, hi5, hi6, hi7);

    // Second pass: the top quadrants form one 8-vector transform, the bottom
    // quadrants the other.
    let (tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3) =
        wasm128_dct1d_8(token, tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3);
    let (tl0, tl1, tl2, tl3) = (tl0 * eighth, tl1 * eighth, tl2 * eighth, tl3 * eighth);
    let (tr0, tr1, tr2, tr3) = (tr0 * eighth, tr1 * eighth, tr2 * eighth, tr3 * eighth);
    let (bl0, bl1, bl2, bl3, br0, br1, br2, br3) =
        wasm128_dct1d_8(token, bl0, bl1, bl2, bl3, br0, br1, br2, br3);
    let (bl0, bl1, bl2, bl3) = (bl0 * eighth, bl1 * eighth, bl2 * eighth, bl3 * eighth);
    let (br0, br1, br2, br3) = (br0 * eighth, br1 * eighth, br2 * eighth, br3 * eighth);

    // Each 8-value output line is a "top" vector followed by a "bottom" one.
    tl0.store((&mut output[0..4]).try_into().unwrap());
    bl0.store((&mut output[4..8]).try_into().unwrap());
    tl1.store((&mut output[8..12]).try_into().unwrap());
    bl1.store((&mut output[12..16]).try_into().unwrap());
    tl2.store((&mut output[16..20]).try_into().unwrap());
    bl2.store((&mut output[20..24]).try_into().unwrap());
    tl3.store((&mut output[24..28]).try_into().unwrap());
    bl3.store((&mut output[28..32]).try_into().unwrap());
    tr0.store((&mut output[32..36]).try_into().unwrap());
    br0.store((&mut output[36..40]).try_into().unwrap());
    tr1.store((&mut output[40..44]).try_into().unwrap());
    br1.store((&mut output[44..48]).try_into().unwrap());
    tr2.store((&mut output[48..52]).try_into().unwrap());
    br2.store((&mut output[52..56]).try_into().unwrap());
    tr3.store((&mut output[56..60]).try_into().unwrap());
    br3.store((&mut output[60..64]).try_into().unwrap());
}
/// Inverse 8x8 DCT using 4-lane vectors (wasm32, `Wasm128Token`).
///
/// The block is processed as two 4-wide halves: vectors loaded at offsets
/// 0, 8, ..., 56 and at offsets 4, 12, ..., 60. Each half gets the inverse
/// 8-point transform across its eight vectors; the four 4x4 quadrants are
/// transposed; the inverse transform runs again; and the results are stored
/// to `output` in 4-value groups. No extra scaling is applied.
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
pub fn idct_8x8_wasm128(token: archmage::Wasm128Token, input: &[f32; 64], output: &mut [f32; 64]) {
    use magetypes::simd::f32x4;

    // Vectors that start each 8-value line.
    let lo0 = f32x4::from_slice(token, &input[0..]);
    let lo1 = f32x4::from_slice(token, &input[8..]);
    let lo2 = f32x4::from_slice(token, &input[16..]);
    let lo3 = f32x4::from_slice(token, &input[24..]);
    let lo4 = f32x4::from_slice(token, &input[32..]);
    let lo5 = f32x4::from_slice(token, &input[40..]);
    let lo6 = f32x4::from_slice(token, &input[48..]);
    let lo7 = f32x4::from_slice(token, &input[56..]);

    // Vectors that start 4 values into each line.
    let hi0 = f32x4::from_slice(token, &input[4..]);
    let hi1 = f32x4::from_slice(token, &input[12..]);
    let hi2 = f32x4::from_slice(token, &input[20..]);
    let hi3 = f32x4::from_slice(token, &input[28..]);
    let hi4 = f32x4::from_slice(token, &input[36..]);
    let hi5 = f32x4::from_slice(token, &input[44..]);
    let hi6 = f32x4::from_slice(token, &input[52..]);
    let hi7 = f32x4::from_slice(token, &input[60..]);

    // First inverse pass on both halves.
    let (lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7) =
        wasm128_idct1d_8(token, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7);
    let (hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7) =
        wasm128_idct1d_8(token, hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7);

    // Transpose each 4x4 quadrant (top/bottom = vectors 0..4 / 4..8).
    let (tl0, tl1, tl2, tl3) = wasm128_transpose_4x4(token, lo0, lo1, lo2, lo3);
    let (tr0, tr1, tr2, tr3) = wasm128_transpose_4x4(token, hi0, hi1, hi2, hi3);
    let (bl0, bl1, bl2, bl3) = wasm128_transpose_4x4(token, lo4, lo5, lo6, lo7);
    let (br0, br1, br2, br3) = wasm128_transpose_4x4(token, hi4, hi5, hi6, hi7);

    // Second inverse pass.
    let (tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3) =
        wasm128_idct1d_8(token, tl0, tl1, tl2, tl3, tr0, tr1, tr2, tr3);
    let (bl0, bl1, bl2, bl3, br0, br1, br2, br3) =
        wasm128_idct1d_8(token, bl0, bl1, bl2, bl3, br0, br1, br2, br3);

    // Each 8-value output line is a "top" vector followed by a "bottom" one.
    tl0.store((&mut output[0..4]).try_into().unwrap());
    bl0.store((&mut output[4..8]).try_into().unwrap());
    tl1.store((&mut output[8..12]).try_into().unwrap());
    bl1.store((&mut output[12..16]).try_into().unwrap());
    tl2.store((&mut output[16..20]).try_into().unwrap());
    bl2.store((&mut output[20..24]).try_into().unwrap());
    tl3.store((&mut output[24..28]).try_into().unwrap());
    bl3.store((&mut output[28..32]).try_into().unwrap());
    tr0.store((&mut output[32..36]).try_into().unwrap());
    br0.store((&mut output[36..40]).try_into().unwrap());
    tr1.store((&mut output[40..44]).try_into().unwrap());
    br1.store((&mut output[44..48]).try_into().unwrap());
    tr2.store((&mut output[48..52]).try_into().unwrap());
    br2.store((&mut output[52..56]).try_into().unwrap());
    tr3.store((&mut output[56..60]).try_into().unwrap());
    br3.store((&mut output[60..64]).try_into().unwrap());
}
/// Unscaled 8-point forward transform applied lane-wise across eight
/// 4-lane vectors.
///
/// `r0..r7` are the eight inputs of the transform; every lane is an
/// independent transform. Performs the same arithmetic as `dct1d_8_scalar`
/// and returns the eight outputs in natural order (index 0 first).
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn wasm128_dct1d_8(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let sqrt2 = f32x4::splat(token, SQRT2);
    let w4_outer = f32x4::splat(token, WC_MULTIPLIERS_4[0]);
    let w4_inner = f32x4::splat(token, WC_MULTIPLIERS_4[1]);

    // Stage 1: mirror-pair sums and differences.
    let sum0 = r0 + r7;
    let sum1 = r1 + r6;
    let sum2 = r2 + r5;
    let sum3 = r3 + r4;
    let diff0 = r0 - r7;
    let diff1 = r1 - r6;
    let diff2 = r2 - r5;
    let diff3 = r3 - r4;

    // 4-point butterfly on the sums -> outputs 0, 2, 4, 6.
    let even_lo = sum0 + sum3;
    let even_hi = sum1 + sum2;
    let even_x = (sum0 - sum3) * w4_outer;
    let even_y = (sum1 - sum2) * w4_inner;
    let out0 = even_lo + even_hi;
    let out4 = even_lo - even_hi;
    let out6 = even_x - even_y;
    let out2 = sqrt2.mul_add(even_x + even_y, out6);

    // Weighted differences, then the same 4-point butterfly.
    let t0 = diff0 * f32x4::splat(token, WC_MULTIPLIERS_8[0]);
    let t1 = diff1 * f32x4::splat(token, WC_MULTIPLIERS_8[1]);
    let t2 = diff2 * f32x4::splat(token, WC_MULTIPLIERS_8[2]);
    let t3 = diff3 * f32x4::splat(token, WC_MULTIPLIERS_8[3]);
    let odd_lo = t0 + t3;
    let odd_hi = t1 + t2;
    let odd_x = (t0 - t3) * w4_outer;
    let odd_y = (t1 - t2) * w4_inner;
    let q0 = odd_lo + odd_hi;
    let q2 = odd_lo - odd_hi;
    let q3 = odd_x - odd_y;
    let q1 = sqrt2.mul_add(odd_x + odd_y, q3);

    // Recombine the odd half -> outputs 1, 3, 5, 7.
    let out1 = sqrt2.mul_add(q0, q1);
    let out3 = q1 + q2;
    let out5 = q2 + q3;
    let out7 = q3;

    (out0, out1, out2, out3, out4, out5, out6, out7)
}
/// 8-point inverse transform applied lane-wise across eight 4-lane vectors.
///
/// `r0..r7` are the eight coefficients in natural order; every lane is an
/// independent transform. Performs the same arithmetic as `idct1d_8_scalar`
/// and returns the eight outputs in natural order.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn wasm128_idct1d_8(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
    r4: magetypes::simd::f32x4,
    r5: magetypes::simd::f32x4,
    r6: magetypes::simd::f32x4,
    r7: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use magetypes::simd::f32x4;

    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let inv_w4_outer = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[0]);
    let inv_w4_inner = f32x4::splat(token, 1.0 / WC_MULTIPLIERS_4[1]);

    // Odd-indexed coefficients: undo the forward recombination, last first.
    let q3 = r7;
    let q2 = r5 - q3;
    let q1 = r3 - q2;
    let q0 = (r1 - q1) * inv_sqrt2;

    // 4-point inverse on the odd half, then remove the 8-point weighting.
    let odd_mid = (q1 - q3) * inv_sqrt2;
    let odd_x = (odd_mid + q3) * inv_w4_outer;
    let odd_y = (odd_mid - q3) * inv_w4_inner;
    let odd_lo = q0 + q2;
    let odd_hi = q0 - q2;
    let t0 = (odd_lo + odd_x) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[0]);
    let t1 = (odd_hi + odd_y) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[1]);
    let t2 = (odd_hi - odd_y) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[2]);
    let t3 = (odd_lo - odd_x) * f32x4::splat(token, 1.0 / WC_MULTIPLIERS_8[3]);

    // 4-point inverse on the even-indexed coefficients.
    let even_mid = (r2 - r6) * inv_sqrt2;
    let even_x = (even_mid + r6) * inv_w4_outer;
    let even_y = (even_mid - r6) * inv_w4_inner;
    let even_lo = r0 + r4;
    let even_hi = r0 - r4;
    let u0 = even_lo + even_x;
    let u1 = even_hi + even_y;
    let u2 = even_hi - even_y;
    let u3 = even_lo - even_x;

    // Final butterfly: output i and its mirror 7 - i.
    (
        u0 + t0,
        u1 + t1,
        u2 + t2,
        u3 + t3,
        u3 - t3,
        u2 - t2,
        u1 - t1,
        u0 - t0,
    )
}
/// Transposes a 4x4 tile held as four `f32x4` vectors.
///
/// Output `k` gathers lane `k` of `r0`, `r1`, `r2` and `r3`, in that order.
/// The work is done on the raw `v128` values with two rounds of shuffles:
/// first 32-bit lanes of neighbouring rows are interleaved, then the 64-bit
/// halves of those interleaved pairs are combined.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::type_complexity)]
fn wasm128_transpose_4x4(
    token: archmage::Wasm128Token,
    r0: magetypes::simd::f32x4,
    r1: magetypes::simd::f32x4,
    r2: magetypes::simd::f32x4,
    r3: magetypes::simd::f32x4,
) -> (
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
    magetypes::simd::f32x4,
) {
    use core::arch::wasm32::{i32x4_shuffle, i64x2_shuffle};
    use magetypes::simd::f32x4;

    let (row_a, row_b, row_c, row_d) = (r0.raw(), r1.raw(), r2.raw(), r3.raw());

    // Round 1: interleave 32-bit lanes of each row pair.
    //   ab_lo = [a0 b0 a1 b1], ab_hi = [a2 b2 a3 b3] (likewise for c/d).
    let ab_lo = i32x4_shuffle::<0, 4, 1, 5>(row_a, row_b);
    let ab_hi = i32x4_shuffle::<2, 6, 3, 7>(row_a, row_b);
    let cd_lo = i32x4_shuffle::<0, 4, 1, 5>(row_c, row_d);
    let cd_hi = i32x4_shuffle::<2, 6, 3, 7>(row_c, row_d);

    // Round 2: stitch matching 64-bit halves together to form each column.
    let col0 = i64x2_shuffle::<0, 2>(ab_lo, cd_lo);
    let col1 = i64x2_shuffle::<1, 3>(ab_lo, cd_lo);
    let col2 = i64x2_shuffle::<0, 2>(ab_hi, cd_hi);
    let col3 = i64x2_shuffle::<1, 3>(ab_hi, cd_hi);

    (
        f32x4::from_v128(token, col0),
        f32x4::from_v128(token, col1),
        f32x4::from_v128(token, col2),
        f32x4::from_v128(token, col3),
    )
}
#[cfg(test)]
mod tests {
    extern crate std;
    use super::*;

    /// 64-element block whose entry `i` is `sin(0.1 * i)`.
    fn sine_block() -> [f32; 64] {
        core::array::from_fn(|i| (i as f32 * 0.1).sin())
    }

    /// 64-element block whose entry `i` is `cos(0.37 * i + 1.5)`.
    fn cosine_block() -> [f32; 64] {
        core::array::from_fn(|i| ((i as f32) * 0.37 + 1.5).cos())
    }

    /// Scalar forward transform followed by scalar inverse must reproduce the
    /// input to within `1e-4`.
    #[test]
    fn test_dct_8x8_scalar_roundtrip() {
        let input = sine_block();
        let mut dct_out = [0.0f32; 64];
        let mut idct_out = [0.0f32; 64];

        dct_8x8_scalar(&input, &mut dct_out);
        idct_8x8_scalar(&dct_out, &mut idct_out);

        for (i, (&expected, &actual)) in input.iter().zip(idct_out.iter()).enumerate() {
            assert!(
                (expected - actual).abs() < 1e-4,
                "Roundtrip mismatch at {}: {} vs {}",
                i,
                expected,
                actual
            );
        }
    }

    /// A block that is non-zero only in its first column must transform
    /// identically (within `1e-5`) through the dispatching and scalar paths.
    #[test]
    fn test_vectorized_butterfly_vs_scalar() {
        let column = [1.0f32, 0.5, -0.3, 0.8, -0.1, 0.6, -0.4, 0.9];
        let mut block = [0.0f32; 64];
        // Place one sample at the start of every 8-element row.
        for (row, &sample) in block.chunks_exact_mut(8).zip(column.iter()) {
            row[0] = sample;
        }

        let mut scalar_2d_out = [0.0f32; 64];
        dct_8x8_scalar(&block, &mut scalar_2d_out);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut dct_out = [0.0f32; 64];
                dct_8x8(&block, &mut dct_out);
                for (i, (&simd, &scalar)) in dct_out.iter().zip(scalar_2d_out.iter()).enumerate() {
                    let diff = (simd - scalar).abs();
                    assert!(
                        diff < 1e-5,
                        "Mismatch at [{i}]: simd={:.6} scalar={:.6} diff={diff:.6} [{perm}]",
                        simd,
                        scalar,
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// The dispatching forward transform must agree with the scalar one
    /// (within `1e-5`) for every token permutation.
    #[test]
    fn test_dct_8x8_simd_matches_scalar() {
        let input = cosine_block();
        let mut scalar_out = [0.0f32; 64];
        dct_8x8_scalar(&input, &mut scalar_out);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut simd_out = [0.0f32; 64];
                dct_8x8(&input, &mut simd_out);
                for (i, (&scalar, &simd)) in scalar_out.iter().zip(simd_out.iter()).enumerate() {
                    assert!(
                        (scalar - simd).abs() < 1e-5,
                        "DCT mismatch at {i}: scalar={} simd={} [{perm}]",
                        scalar,
                        simd
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// The dispatching inverse transform must agree with the scalar one
    /// (within `1e-5`) for every token permutation.
    #[test]
    fn test_idct_8x8_simd_matches_scalar() {
        let input = cosine_block();
        let mut scalar_out = [0.0f32; 64];
        idct_8x8_scalar(&input, &mut scalar_out);

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut simd_out = [0.0f32; 64];
                idct_8x8(&input, &mut simd_out);
                for (i, (&scalar, &simd)) in scalar_out.iter().zip(simd_out.iter()).enumerate() {
                    assert!(
                        (scalar - simd).abs() < 1e-5,
                        "IDCT mismatch at {i}: scalar={} simd={} [{perm}]",
                        scalar,
                        simd
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }

    /// Forward then inverse through the dispatching entry points must
    /// reproduce the input to within `1e-4` for every token permutation.
    #[test]
    fn test_dct_idct_simd_roundtrip() {
        let input = sine_block();

        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut dct_out = [0.0f32; 64];
                let mut idct_out = [0.0f32; 64];
                dct_8x8(&input, &mut dct_out);
                idct_8x8(&dct_out, &mut idct_out);
                for (i, (&expected, &actual)) in input.iter().zip(idct_out.iter()).enumerate() {
                    assert!(
                        (expected - actual).abs() < 1e-4,
                        "SIMD roundtrip mismatch at {i}: {} vs {} [{perm}]",
                        expected,
                        actual
                    );
                }
            },
        );
        std::eprintln!("{report}");
    }
}