// sqrt(2) as f32; the first odd-half term of each butterfly stage is divided by it.
const SQRT2: f32 = core::f32::consts::SQRT_2;
// Twiddle scale factors for the recursive even/odd IDCT decomposition.
// Numerically these match WC[k] = 1 / (2 * cos((2k + 1) * pi / (2 * N)))
// for transform size N; the scalar paths divide by them.
const WC_MULTIPLIERS_4: [f32; 2] = [0.541_196_1, 1.306_563];
const WC_MULTIPLIERS_8: [f32; 4] = [0.509_795_6, 0.601_344_9, 0.899_976_2, 2.562_915_5];
const WC_MULTIPLIERS_16: [f32; 8] = [
    0.502_419_3,
    0.522_498_6,
    0.566_944_06,
    0.646_821_8,
    0.788_154_65,
    1.060_677_7,
    1.722_447_1,
    5.101_148_6,
];
// Precomputed reciprocals of the tables above so the SIMD batch kernels can
// multiply instead of divide (division is much slower per element).
const INV_WC4: [f32; 2] = [1.0 / 0.541_196_1, 1.0 / 1.306_563];
const INV_WC8: [f32; 4] = [
    1.0 / 0.509_795_6,
    1.0 / 0.601_344_9,
    1.0 / 0.899_976_2,
    1.0 / 2.562_915_5,
];
const INV_WC16: [f32; 8] = [
    1.0 / 0.502_419_3,
    1.0 / 0.522_498_6,
    1.0 / 0.566_944_06,
    1.0 / 0.646_821_8,
    1.0 / 0.788_154_65,
    1.0 / 1.060_677_7,
    1.0 / 1.722_447_1,
    1.0 / 5.101_148_6,
];
#[inline]
pub fn idct_16x16(input: &[f32; 256], output: &mut [f32; 256]) {
    // 16x16 inverse DCT with runtime SIMD dispatch: on each architecture,
    // try to summon the capability token for the SIMD backend and use it;
    // otherwise fall through to the scalar reference implementation.
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::X64V3Token::summon() {
            idct_16x16_avx2(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::NeonToken::summon() {
            idct_16x16_neon(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::Wasm128Token::summon() {
            idct_16x16_wasm128(token, input, output);
            return;
        }
    }
    // No SIMD token available (or unsupported architecture): scalar fallback.
    idct_16x16_scalar(input, output);
}
#[inline]
pub fn idct_16x16_scalar(input: &[f32; 256], output: &mut [f32; 256]) {
    // Scalar 16x16 inverse DCT: 16-point row pass, transpose, 16-point pass
    // over the former columns.
    let mut tmp = crate::scratch_buf::<256>();
    for row in 0..16 {
        let start = row * 16;
        let line = &mut tmp[start..start + 16];
        line.copy_from_slice(&input[start..start + 16]);
        idct1d_16_scalar(line);
    }
    // Transpose so the column transform can reuse the row kernel.
    let mut transposed = crate::scratch_buf::<256>();
    for c in 0..16 {
        for r in 0..16 {
            transposed[c * 16 + r] = tmp[r * 16 + c];
        }
    }
    for row in 0..16 {
        let start = row * 16;
        let line = &mut output[start..start + 16];
        line.copy_from_slice(&transposed[start..start + 16]);
        idct1d_16_scalar(line);
    }
}
#[inline]
fn idct1d_2_scalar(mem: &mut [f32]) {
    // 2-point inverse butterfly with 1/2 normalization:
    // out0 = (a + b) / 2, out1 = (a - b) / 2.
    let (a, b) = (mem[0], mem[1]);
    let sum = a + b;
    let diff = a - b;
    mem[0] = sum * 0.5;
    mem[1] = diff * 0.5;
}
fn idct1d_4_scalar(mem: &mut [f32]) {
    // In-place 4-point IDCT over mem[0..4].
    // Deinterleave: even coefficients into tmp[0..2], odd into tmp[2..4].
    let mut tmp = [mem[0], mem[2], mem[1], mem[3]];
    // Odd half: difference fixup on the first term, 2-point IDCT, then
    // twiddle correction by the WC factors.
    tmp[2] = (tmp[2] - tmp[3]) / SQRT2;
    idct1d_2_scalar(&mut tmp[2..4]);
    tmp[2] /= WC_MULTIPLIERS_4[0];
    tmp[3] /= WC_MULTIPLIERS_4[1];
    // Even half: plain 2-point IDCT.
    idct1d_2_scalar(&mut tmp[0..2]);
    // Butterfly the halves back together; the second half is mirrored.
    mem[0] = (tmp[0] + tmp[2]) * 0.5;
    mem[3] = (tmp[0] - tmp[2]) * 0.5;
    mem[1] = (tmp[1] + tmp[3]) * 0.5;
    mem[2] = (tmp[1] - tmp[3]) * 0.5;
}
fn idct1d_8_core_scalar(mem: &mut [f32]) {
    // 8-point IDCT core (no input scaling), in place over mem[0..8].
    // Deinterleave: even coefficients to tmp[0..4], odd to tmp[4..8].
    let mut tmp = [0.0f32; 8];
    for i in 0..4 {
        tmp[i] = mem[2 * i];
        tmp[4 + i] = mem[2 * i + 1];
    }
    // Running difference over the odd half, plus sqrt(2) fixup on the first
    // term, as the recursive even/odd decomposition requires.
    tmp[6] -= tmp[7];
    tmp[5] -= tmp[6];
    tmp[4] = (tmp[4] - tmp[5]) / SQRT2;
    idct1d_4_scalar(&mut tmp[4..8]);
    // Twiddle correction for the odd half.
    for i in 0..4 {
        tmp[4 + i] /= WC_MULTIPLIERS_8[i];
    }
    idct1d_4_scalar(&mut tmp[0..4]);
    // Butterfly recombination; the upper half comes out mirrored.
    for i in 0..4 {
        mem[i] = (tmp[i] + tmp[4 + i]) * 0.5;
        mem[7 - i] = (tmp[i] - tmp[4 + i]) * 0.5;
    }
}
fn idct1d_16_scalar(mem: &mut [f32]) {
    // Full 16-point IDCT in place over mem[0..16]: undoes the 1/16 DCT
    // normalization first, then applies the recursive even/odd core.
    for x in mem.iter_mut().take(16) {
        *x *= 16.0;
    }
    // Deinterleave: even coefficients to tmp[0..8], odd to tmp[8..16].
    let mut tmp = [0.0f32; 16];
    for i in 0..8 {
        tmp[i] = mem[2 * i];
        tmp[8 + i] = mem[2 * i + 1];
    }
    // Running difference over the odd half (back to front), plus sqrt(2)
    // fixup on the first term.
    for i in (1..7).rev() {
        tmp[8 + i] -= tmp[8 + i + 1];
    }
    tmp[8] = (tmp[8] - tmp[9]) / SQRT2;
    idct1d_8_core_scalar(&mut tmp[8..16]);
    // Twiddle correction for the odd half.
    for i in 0..8 {
        tmp[8 + i] /= WC_MULTIPLIERS_16[i];
    }
    idct1d_8_core_scalar(&mut tmp[0..8]);
    // Butterfly recombination; the upper half comes out mirrored.
    for i in 0..8 {
        mem[i] = (tmp[i] + tmp[8 + i]) * 0.5;
        mem[15 - i] = (tmp[i] - tmp[8 + i]) * 0.5;
    }
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
fn gather_col(
    token: archmage::X64V3Token,
    data: &[f32],
    base_row: usize,
    j: usize,
) -> magetypes::simd::f32x8 {
    // Collect column `j` of eight consecutive rows (row stride 16) into one
    // vector, one row per lane.
    let lanes = core::array::from_fn(|r| data[(base_row + r) * 16 + j]);
    magetypes::simd::f32x8::from_array(token, lanes)
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn scatter_col(v: magetypes::simd::f32x8, data: &mut [f32], base_row: usize, j: usize) {
    // Write the vector's eight lanes back into column `j` (row stride 16),
    // one lane per row.
    let mut lanes = [0.0f32; 8];
    v.store(&mut lanes);
    for (r, &val) in lanes.iter().enumerate() {
        data[(base_row + r) * 16 + j] = val;
    }
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
fn idct1d_4_batch(token: archmage::X64V3Token, v: &mut [magetypes::simd::f32x8; 4]) {
    // Vectorized 4-point IDCT: same dataflow as `idct1d_4_scalar`, run on
    // eight independent transforms at once (one per SIMD lane).
    use magetypes::simd::f32x8;
    let half = f32x8::splat(token, 0.5);
    let inv_sqrt2 = f32x8::splat(token, 1.0 / SQRT2);
    let inv_wc4_0 = f32x8::splat(token, INV_WC4[0]);
    let inv_wc4_1 = f32x8::splat(token, INV_WC4[1]);
    // Deinterleave: even inputs (t0, t1), odd inputs (t2, t3).
    let t0 = v[0];
    let t1 = v[2];
    let t2 = v[1];
    let t3 = v[3];
    // Odd half: difference fixup, 2-point IDCT, then twiddle correction
    // (multiplication by reciprocals instead of the scalar division).
    let t2 = (t2 - t3) * inv_sqrt2;
    let s2 = (t2 + t3) * half;
    let s3 = (t2 - t3) * half;
    let s2 = s2 * inv_wc4_0;
    let s3 = s3 * inv_wc4_1;
    // Even half: 2-point IDCT.
    let s0 = (t0 + t1) * half;
    let s1 = (t0 - t1) * half;
    // Butterfly recombination; second half mirrored.
    v[0] = (s0 + s2) * half;
    v[3] = (s0 - s2) * half;
    v[1] = (s1 + s3) * half;
    v[2] = (s1 - s3) * half;
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
fn idct1d_8_core_batch(token: archmage::X64V3Token, v: &mut [magetypes::simd::f32x8; 8]) {
    // Vectorized 8-point IDCT core: mirrors `idct1d_8_core_scalar`, eight
    // independent transforms at once (one per SIMD lane).
    use magetypes::simd::f32x8;
    let half = f32x8::splat(token, 0.5);
    let inv_sqrt2 = f32x8::splat(token, 1.0 / SQRT2);
    // Deinterleave even/odd coefficients.
    let mut first_half = [v[0], v[2], v[4], v[6]];
    let mut second_half = [v[1], v[3], v[5], v[7]];
    // Running difference over the odd half plus sqrt(2) fixup.
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_4_batch(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x8::splat(token, INV_WC8[0]);
    second_half[1] *= f32x8::splat(token, INV_WC8[1]);
    second_half[2] *= f32x8::splat(token, INV_WC8[2]);
    second_half[3] *= f32x8::splat(token, INV_WC8[3]);
    idct1d_4_batch(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[7] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[6] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[5] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[4] = (first_half[3] - second_half[3]) * half;
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
pub(crate) fn idct1d_16_core_batch(
    token: archmage::X64V3Token,
    v: &mut [magetypes::simd::f32x8; 16],
) {
    // Vectorized 16-point IDCT core: mirrors the tail of `idct1d_16_scalar`
    // (everything after the *16 scaling), eight transforms at once.
    use magetypes::simd::f32x8;
    let half = f32x8::splat(token, 0.5);
    let inv_sqrt2 = f32x8::splat(token, 1.0 / SQRT2);
    // Deinterleave even/odd coefficients.
    let mut first_half = [v[0], v[2], v[4], v[6], v[8], v[10], v[12], v[14]];
    let mut second_half = [v[1], v[3], v[5], v[7], v[9], v[11], v[13], v[15]];
    // Running difference over the odd half (back to front) plus sqrt(2) fixup.
    second_half[6] -= second_half[7];
    second_half[5] -= second_half[6];
    second_half[4] -= second_half[5];
    second_half[3] -= second_half[4];
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_8_core_batch(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x8::splat(token, INV_WC16[0]);
    second_half[1] *= f32x8::splat(token, INV_WC16[1]);
    second_half[2] *= f32x8::splat(token, INV_WC16[2]);
    second_half[3] *= f32x8::splat(token, INV_WC16[3]);
    second_half[4] *= f32x8::splat(token, INV_WC16[4]);
    second_half[5] *= f32x8::splat(token, INV_WC16[5]);
    second_half[6] *= f32x8::splat(token, INV_WC16[6]);
    second_half[7] *= f32x8::splat(token, INV_WC16[7]);
    idct1d_8_core_batch(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[15] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[14] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[13] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[12] = (first_half[3] - second_half[3]) * half;
    v[4] = (first_half[4] + second_half[4]) * half;
    v[11] = (first_half[4] - second_half[4]) * half;
    v[5] = (first_half[5] + second_half[5]) * half;
    v[10] = (first_half[5] - second_half[5]) * half;
    v[6] = (first_half[6] + second_half[6]) * half;
    v[9] = (first_half[6] - second_half[6]) * half;
    v[7] = (first_half[7] + second_half[7]) * half;
    v[8] = (first_half[7] - second_half[7]) * half;
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
pub(crate) fn idct1d_16_batch(token: archmage::X64V3Token, v: &mut [magetypes::simd::f32x8; 16]) {
    use magetypes::simd::f32x8;
    // Undo the 1/16 DCT normalization, then run the shared 16-point core.
    let scale = f32x8::splat(token, 16.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_16_core_batch(token, v);
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x16_avx2(token: archmage::X64V3Token, input: &[f32; 256], output: &mut [f32; 256]) {
    use magetypes::simd::f32x8;
    // Pass 1: 16-point IDCT along every row, eight rows per batch (one row
    // per SIMD lane; each vector holds one column of the batch).
    let mut tmp = crate::scratch_buf::<256>();
    for base_row in [0usize, 8] {
        let mut v = [f32x8::zero(token); 16];
        for j in 0..16 {
            v[j] = gather_col(token, input, base_row, j);
        }
        idct1d_16_batch(token, &mut v);
        for j in 0..16 {
            scatter_col(v[j], &mut tmp, base_row, j);
        }
    }
    // Transpose so pass 2 can reuse the same column-gathering kernel.
    let mut transposed = crate::scratch_buf::<256>();
    for r in 0..16 {
        for c in 0..16 {
            transposed[c * 16 + r] = tmp[r * 16 + c];
        }
    }
    // Pass 2: identical transform over the transposed data, into `output`.
    for base_row in [0usize, 8] {
        let mut v = [f32x8::zero(token); 16];
        for j in 0..16 {
            v[j] = gather_col(token, &transposed, base_row, j);
        }
        idct1d_16_batch(token, &mut v);
        for j in 0..16 {
            scatter_col(v[j], output, base_row, j);
        }
    }
}
#[inline]
pub fn idct_16x8(input: &[f32; 128], output: &mut [f32; 128]) {
    // 16x8 (16 rows x 8 columns) inverse DCT with runtime SIMD dispatch;
    // falls back to the scalar path when no SIMD token is available.
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::X64V3Token::summon() {
            idct_16x8_avx2(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::NeonToken::summon() {
            idct_16x8_neon(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::Wasm128Token::summon() {
            idct_16x8_wasm128(token, input, output);
            return;
        }
    }
    idct_16x8_scalar(input, output);
}
#[inline]
pub fn idct_16x8_scalar(input: &[f32; 128], output: &mut [f32; 128]) {
    // Scalar 16x8 inverse DCT.
    // Pass 1: 8-point IDCT along each of the 16 rows.
    let mut tmp = crate::scratch_buf::<128>();
    for row in 0..16 {
        let start = row * 8;
        let line = &mut tmp[start..start + 8];
        line.copy_from_slice(&input[start..start + 8]);
        idct1d_8_scalar(line);
    }
    // Pass 2: 16-point IDCT down each of the 8 columns, staged through a
    // contiguous local buffer (every slot is rewritten per column).
    let mut col_buf = [0.0f32; 16];
    for col in 0..8 {
        for (row, slot) in col_buf.iter_mut().enumerate() {
            *slot = tmp[row * 8 + col];
        }
        idct1d_16_scalar(&mut col_buf);
        for (row, &val) in col_buf.iter().enumerate() {
            output[row * 8 + col] = val;
        }
    }
}
fn idct1d_8_scalar(mem: &mut [f32]) {
    // Undo the 1/8 DCT normalization over the first eight entries, then run
    // the shared 8-point core.
    for i in 0..8 {
        mem[i] *= 8.0;
    }
    idct1d_8_core_scalar(mem);
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
fn gather_col_s8(
    token: archmage::X64V3Token,
    data: &[f32],
    base_row: usize,
    j: usize,
) -> magetypes::simd::f32x8 {
    // Collect column `j` of eight consecutive rows (row stride 8) into one
    // vector, one row per lane.
    let lanes = core::array::from_fn(|r| data[(base_row + r) * 8 + j]);
    magetypes::simd::f32x8::from_array(token, lanes)
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn scatter_col_s8(v: magetypes::simd::f32x8, data: &mut [f32], base_row: usize, j: usize) {
    // Write the vector's eight lanes back into column `j` (row stride 8).
    let mut lanes = [0.0f32; 8];
    v.store(&mut lanes);
    for (r, &val) in lanes.iter().enumerate() {
        data[(base_row + r) * 8 + j] = val;
    }
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[inline(always)]
fn idct1d_8_batch(token: archmage::X64V3Token, v: &mut [magetypes::simd::f32x8; 8]) {
    use magetypes::simd::f32x8;
    // Undo the 1/8 DCT normalization, then run the shared 8-point core.
    let scale = f32x8::splat(token, 8.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_8_core_batch(token, v);
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x8_avx2(token: archmage::X64V3Token, input: &[f32; 128], output: &mut [f32; 128]) {
    use magetypes::simd::f32x8;
    // Pass 1: 8-point IDCT along each row, processed in two batches of
    // eight rows (one row per SIMD lane, one vector per column).
    let mut tmp = crate::scratch_buf::<128>();
    {
        let mut v = [f32x8::zero(token); 8];
        for j in 0..8 {
            v[j] = gather_col_s8(token, input, 0, j);
        }
        idct1d_8_batch(token, &mut v);
        for j in 0..8 {
            scatter_col_s8(v[j], &mut tmp, 0, j);
        }
    }
    {
        let mut v = [f32x8::zero(token); 8];
        for j in 0..8 {
            v[j] = gather_col_s8(token, input, 8, j);
        }
        idct1d_8_batch(token, &mut v);
        for j in 0..8 {
            scatter_col_s8(v[j], &mut tmp, 8, j);
        }
    }
    // Pass 2: 16-point IDCT down the columns. Each vector is a whole row of
    // eight floats, so all eight columns transform at once with no gather.
    {
        let mut v = [f32x8::zero(token); 16];
        for i in 0..16 {
            v[i] = f32x8::from_slice(token, &tmp[i * 8..i * 8 + 8]);
        }
        idct1d_16_batch(token, &mut v);
        for i in 0..16 {
            v[i].store((&mut output[i * 8..i * 8 + 8]).try_into().unwrap());
        }
    }
}
#[inline]
pub fn idct_8x16(input: &[f32; 128], output: &mut [f32; 128]) {
    // 8x16 (8 rows x 16 columns) inverse DCT with runtime SIMD dispatch;
    // falls back to the scalar path when no SIMD token is available.
    #[cfg(target_arch = "x86_64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::X64V3Token::summon() {
            idct_8x16_avx2(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::NeonToken::summon() {
            idct_8x16_neon(token, input, output);
            return;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use archmage::SimdToken;
        if let Some(token) = archmage::Wasm128Token::summon() {
            idct_8x16_wasm128(token, input, output);
            return;
        }
    }
    idct_8x16_scalar(input, output);
}
#[inline]
pub fn idct_8x16_scalar(input: &[f32; 128], output: &mut [f32; 128]) {
    // Scalar 8x16 inverse DCT.
    // Pass 1: 16-point IDCT along each of the 8 rows.
    let mut tmp = crate::scratch_buf::<128>();
    for row in 0..8 {
        let s = row * 16;
        tmp[s..s + 16].copy_from_slice(&input[s..s + 16]);
        idct1d_16_scalar(&mut tmp[s..s + 16]);
    }
    // Pass 2: 8-point IDCT down each of the 16 columns, staged through a
    // contiguous local buffer.
    for col in 0..16 {
        let mut col_buf = [0.0f32; 8];
        for row in 0..8 {
            col_buf[row] = tmp[row * 16 + col];
        }
        idct1d_8_scalar(&mut col_buf);
        for row in 0..8 {
            output[row * 16 + col] = col_buf[row];
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_8x16_avx2(token: archmage::X64V3Token, input: &[f32; 128], output: &mut [f32; 128]) {
    use magetypes::simd::f32x8;
    // Pass 1: 16-point IDCT along each of the 8 rows in a single batch
    // (one row per SIMD lane, one vector per column).
    let mut tmp = crate::scratch_buf::<128>();
    {
        let mut v = [f32x8::zero(token); 16];
        for j in 0..16 {
            v[j] = gather_col(token, input, 0, j);
        }
        idct1d_16_batch(token, &mut v);
        for j in 0..16 {
            scatter_col(v[j], &mut tmp, 0, j);
        }
    }
    // Pass 2: 8-point IDCT down the columns, eight columns per batch; each
    // vector is a contiguous half-row so no gather is needed.
    {
        let mut v = [f32x8::zero(token); 8];
        for i in 0..8 {
            v[i] = f32x8::from_slice(token, &tmp[i * 16..i * 16 + 8]);
        }
        idct1d_8_batch(token, &mut v);
        for i in 0..8 {
            v[i].store((&mut output[i * 16..i * 16 + 8]).try_into().unwrap());
        }
    }
    // Same transform for the right half (columns 8..16).
    {
        let mut v = [f32x8::zero(token); 8];
        for i in 0..8 {
            v[i] = f32x8::from_slice(token, &tmp[i * 16 + 8..i * 16 + 16]);
        }
        idct1d_8_batch(token, &mut v);
        for i in 0..8 {
            v[i].store((&mut output[i * 16 + 8..i * 16 + 16]).try_into().unwrap());
        }
    }
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn gather_col_neon(
    token: archmage::NeonToken,
    data: &[f32],
    base_row: usize,
    j: usize,
    s: usize,
) -> magetypes::simd::f32x4 {
    // Collect column `j` of four consecutive rows (row stride `s`) into one
    // NEON vector, one row per lane.
    let lanes = core::array::from_fn(|r| data[(base_row + r) * s + j]);
    magetypes::simd::f32x4::from_array(token, lanes)
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn scatter_col_neon(
    _token: archmage::NeonToken,
    v: magetypes::simd::f32x4,
    data: &mut [f32],
    base_row: usize,
    j: usize,
    s: usize,
) {
    // Write the vector's four lanes back into column `j` (row stride `s`).
    let mut lanes = [0.0f32; 4];
    v.store(&mut lanes);
    for (r, &val) in lanes.iter().enumerate() {
        data[(base_row + r) * s + j] = val;
    }
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn idct1d_4_batch_neon(token: archmage::NeonToken, v: &mut [magetypes::simd::f32x4; 4]) {
    // NEON mirror of `idct1d_4_batch`: 4-point IDCT over four independent
    // transforms at once (one per lane). Same dataflow as `idct1d_4_scalar`.
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let inv_wc4_0 = f32x4::splat(token, INV_WC4[0]);
    let inv_wc4_1 = f32x4::splat(token, INV_WC4[1]);
    // Deinterleave: even inputs (t0, t1), odd inputs (t2, t3).
    let t0 = v[0];
    let t1 = v[2];
    let t2 = v[1];
    let t3 = v[3];
    // Odd half: difference fixup, 2-point IDCT, twiddle correction.
    let t2 = (t2 - t3) * inv_sqrt2;
    let s2 = (t2 + t3) * half;
    let s3 = (t2 - t3) * half;
    let s2 = s2 * inv_wc4_0;
    let s3 = s3 * inv_wc4_1;
    // Even half: 2-point IDCT.
    let s0 = (t0 + t1) * half;
    let s1 = (t0 - t1) * half;
    // Butterfly recombination; second half mirrored.
    v[0] = (s0 + s2) * half;
    v[3] = (s0 - s2) * half;
    v[1] = (s1 + s3) * half;
    v[2] = (s1 - s3) * half;
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn idct1d_8_core_batch_neon(token: archmage::NeonToken, v: &mut [magetypes::simd::f32x4; 8]) {
    // NEON mirror of `idct1d_8_core_batch` (four lanes wide): 8-point IDCT
    // core with even/odd split, twiddle correction, and final butterfly.
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let mut first_half = [v[0], v[2], v[4], v[6]];
    let mut second_half = [v[1], v[3], v[5], v[7]];
    // Running difference over the odd half plus sqrt(2) fixup.
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_4_batch_neon(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x4::splat(token, INV_WC8[0]);
    second_half[1] *= f32x4::splat(token, INV_WC8[1]);
    second_half[2] *= f32x4::splat(token, INV_WC8[2]);
    second_half[3] *= f32x4::splat(token, INV_WC8[3]);
    idct1d_4_batch_neon(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[7] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[6] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[5] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[4] = (first_half[3] - second_half[3]) * half;
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn idct1d_8_batch_neon(token: archmage::NeonToken, v: &mut [magetypes::simd::f32x4; 8]) {
    use magetypes::simd::f32x4;
    // Undo the 1/8 DCT normalization, then run the shared 8-point core.
    let scale = f32x4::splat(token, 8.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_8_core_batch_neon(token, v);
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
pub(crate) fn idct1d_16_core_batch_neon(
    token: archmage::NeonToken,
    v: &mut [magetypes::simd::f32x4; 16],
) {
    // NEON mirror of `idct1d_16_core_batch` (four lanes wide): 16-point IDCT
    // core without the input scaling step.
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let mut first_half = [v[0], v[2], v[4], v[6], v[8], v[10], v[12], v[14]];
    let mut second_half = [v[1], v[3], v[5], v[7], v[9], v[11], v[13], v[15]];
    // Running difference over the odd half (back to front) plus sqrt(2) fixup.
    second_half[6] -= second_half[7];
    second_half[5] -= second_half[6];
    second_half[4] -= second_half[5];
    second_half[3] -= second_half[4];
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_8_core_batch_neon(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x4::splat(token, INV_WC16[0]);
    second_half[1] *= f32x4::splat(token, INV_WC16[1]);
    second_half[2] *= f32x4::splat(token, INV_WC16[2]);
    second_half[3] *= f32x4::splat(token, INV_WC16[3]);
    second_half[4] *= f32x4::splat(token, INV_WC16[4]);
    second_half[5] *= f32x4::splat(token, INV_WC16[5]);
    second_half[6] *= f32x4::splat(token, INV_WC16[6]);
    second_half[7] *= f32x4::splat(token, INV_WC16[7]);
    idct1d_8_core_batch_neon(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[15] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[14] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[13] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[12] = (first_half[3] - second_half[3]) * half;
    v[4] = (first_half[4] + second_half[4]) * half;
    v[11] = (first_half[4] - second_half[4]) * half;
    v[5] = (first_half[5] + second_half[5]) * half;
    v[10] = (first_half[5] - second_half[5]) * half;
    v[6] = (first_half[6] + second_half[6]) * half;
    v[9] = (first_half[6] - second_half[6]) * half;
    v[7] = (first_half[7] + second_half[7]) * half;
    v[8] = (first_half[7] - second_half[7]) * half;
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
pub(crate) fn idct1d_16_batch_neon(
    token: archmage::NeonToken,
    v: &mut [magetypes::simd::f32x4; 16],
) {
    use magetypes::simd::f32x4;
    // Undo the 1/16 DCT normalization, then run the shared 16-point core.
    let scale = f32x4::splat(token, 16.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_16_core_batch_neon(token, v);
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::needless_range_loop)]
fn neon_idct8_batch(
    token: archmage::NeonToken,
    data_in: &[f32],
    data_out: &mut [f32],
    base_row: usize,
    stride: usize,
) {
    // Run the 8-point IDCT across the row direction for four rows at once
    // (one row per NEON lane; one vector per column).
    let mut cols = [magetypes::simd::f32x4::zero(token); 8];
    for (j, col) in cols.iter_mut().enumerate() {
        *col = gather_col_neon(token, data_in, base_row, j, stride);
    }
    idct1d_8_batch_neon(token, &mut cols);
    for (j, col) in cols.iter().enumerate() {
        scatter_col_neon(token, *col, data_out, base_row, j, stride);
    }
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
#[allow(clippy::needless_range_loop)]
fn neon_idct16_batch(
    token: archmage::NeonToken,
    data_in: &[f32],
    data_out: &mut [f32],
    base_row: usize,
    stride: usize,
) {
    // Run the 16-point IDCT across the row direction for four rows at once
    // (one row per NEON lane; one vector per column).
    let mut v = [magetypes::simd::f32x4::zero(token); 16];
    for j in 0..16 {
        v[j] = gather_col_neon(token, data_in, base_row, j, stride);
    }
    idct1d_16_batch_neon(token, &mut v);
    for j in 0..16 {
        scatter_col_neon(token, v[j], data_out, base_row, j, stride);
    }
}
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x16_neon(token: archmage::NeonToken, input: &[f32; 256], output: &mut [f32; 256]) {
    // Pass 1: 16-point IDCT along every row, four rows per NEON batch.
    let mut tmp = crate::scratch_buf::<256>();
    for batch in 0..4 {
        neon_idct16_batch(token, input, &mut tmp, batch * 4, 16);
    }
    // Transpose so pass 2 can reuse the same row-batched kernel.
    let mut transposed = crate::scratch_buf::<256>();
    for r in 0..16 {
        for c in 0..16 {
            transposed[c * 16 + r] = tmp[r * 16 + c];
        }
    }
    // Pass 2: identical transform over the transposed data, into `output`.
    for batch in 0..4 {
        neon_idct16_batch(token, &transposed, output, batch * 4, 16);
    }
}
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x8_neon(token: archmage::NeonToken, input: &[f32; 128], output: &mut [f32; 128]) {
    // Pass 1: 8-point IDCT along each of the 16 rows, four rows per batch.
    let mut tmp = crate::scratch_buf::<128>();
    for batch in 0..4 {
        neon_idct8_batch(token, input, &mut tmp, batch * 4, 8);
    }
    // Pass 2: 16-point IDCT down the columns, four columns at a time (one
    // column per lane; one vector per row).
    for col_base in (0..8).step_by(4) {
        let mut v = [magetypes::simd::f32x4::zero(token); 16];
        for row in 0..16 {
            v[row] = magetypes::simd::f32x4::from_array(
                token,
                [
                    tmp[row * 8 + col_base],
                    tmp[row * 8 + col_base + 1],
                    tmp[row * 8 + col_base + 2],
                    tmp[row * 8 + col_base + 3],
                ],
            );
        }
        idct1d_16_batch_neon(token, &mut v);
        for row in 0..16 {
            let mut lane = [0.0f32; 4];
            v[row].store(&mut lane);
            output[row * 8 + col_base] = lane[0];
            output[row * 8 + col_base + 1] = lane[1];
            output[row * 8 + col_base + 2] = lane[2];
            output[row * 8 + col_base + 3] = lane[3];
        }
    }
}
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_8x16_neon(token: archmage::NeonToken, input: &[f32; 128], output: &mut [f32; 128]) {
    // Pass 1: 16-point IDCT along each of the 8 rows, four rows per batch.
    let mut tmp = crate::scratch_buf::<128>();
    for batch in 0..2 {
        neon_idct16_batch(token, input, &mut tmp, batch * 4, 16);
    }
    // Pass 2: 8-point IDCT down the columns, four columns at a time (one
    // column per lane; one vector per row).
    for col_base in (0..16).step_by(4) {
        let mut v = [magetypes::simd::f32x4::zero(token); 8];
        for row in 0..8 {
            v[row] = magetypes::simd::f32x4::from_array(
                token,
                [
                    tmp[row * 16 + col_base],
                    tmp[row * 16 + col_base + 1],
                    tmp[row * 16 + col_base + 2],
                    tmp[row * 16 + col_base + 3],
                ],
            );
        }
        idct1d_8_batch_neon(token, &mut v);
        for row in 0..8 {
            let mut lane = [0.0f32; 4];
            v[row].store(&mut lane);
            output[row * 16 + col_base] = lane[0];
            output[row * 16 + col_base + 1] = lane[1];
            output[row * 16 + col_base + 2] = lane[2];
            output[row * 16 + col_base + 3] = lane[3];
        }
    }
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn gather_col_wasm128(
    token: archmage::Wasm128Token,
    data: &[f32],
    base_row: usize,
    j: usize,
    s: usize,
) -> magetypes::simd::f32x4 {
    // Collect column `j` of four consecutive rows (row stride `s`) into one
    // SIMD128 vector, one row per lane.
    let lanes = core::array::from_fn(|r| data[(base_row + r) * s + j]);
    magetypes::simd::f32x4::from_array(token, lanes)
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn scatter_col_wasm128(
    _token: archmage::Wasm128Token,
    v: magetypes::simd::f32x4,
    data: &mut [f32],
    base_row: usize,
    j: usize,
    s: usize,
) {
    // Write the vector's four lanes back into column `j` (row stride `s`).
    let mut lanes = [0.0f32; 4];
    v.store(&mut lanes);
    for (r, &val) in lanes.iter().enumerate() {
        data[(base_row + r) * s + j] = val;
    }
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn idct1d_4_batch_wasm128(token: archmage::Wasm128Token, v: &mut [magetypes::simd::f32x4; 4]) {
    // WASM SIMD128 mirror of `idct1d_4_batch`: 4-point IDCT over four
    // independent transforms at once (one per lane).
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let inv_wc4_0 = f32x4::splat(token, INV_WC4[0]);
    let inv_wc4_1 = f32x4::splat(token, INV_WC4[1]);
    // Deinterleave: even inputs (t0, t1), odd inputs (t2, t3).
    let t0 = v[0];
    let t1 = v[2];
    let t2 = v[1];
    let t3 = v[3];
    // Odd half: difference fixup, 2-point IDCT, twiddle correction.
    let t2 = (t2 - t3) * inv_sqrt2;
    let s2 = (t2 + t3) * half;
    let s3 = (t2 - t3) * half;
    let s2 = s2 * inv_wc4_0;
    let s3 = s3 * inv_wc4_1;
    // Even half: 2-point IDCT.
    let s0 = (t0 + t1) * half;
    let s1 = (t0 - t1) * half;
    // Butterfly recombination; second half mirrored.
    v[0] = (s0 + s2) * half;
    v[3] = (s0 - s2) * half;
    v[1] = (s1 + s3) * half;
    v[2] = (s1 - s3) * half;
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn idct1d_8_core_batch_wasm128(token: archmage::Wasm128Token, v: &mut [magetypes::simd::f32x4; 8]) {
    // WASM SIMD128 mirror of `idct1d_8_core_batch` (four lanes wide).
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let mut first_half = [v[0], v[2], v[4], v[6]];
    let mut second_half = [v[1], v[3], v[5], v[7]];
    // Running difference over the odd half plus sqrt(2) fixup.
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_4_batch_wasm128(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x4::splat(token, INV_WC8[0]);
    second_half[1] *= f32x4::splat(token, INV_WC8[1]);
    second_half[2] *= f32x4::splat(token, INV_WC8[2]);
    second_half[3] *= f32x4::splat(token, INV_WC8[3]);
    idct1d_4_batch_wasm128(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[7] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[6] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[5] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[4] = (first_half[3] - second_half[3]) * half;
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn idct1d_8_batch_wasm128(token: archmage::Wasm128Token, v: &mut [magetypes::simd::f32x4; 8]) {
    use magetypes::simd::f32x4;
    // Undo the 1/8 DCT normalization, then run the shared 8-point core.
    let scale = f32x4::splat(token, 8.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_8_core_batch_wasm128(token, v);
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
pub(crate) fn idct1d_16_core_batch_wasm128(
    token: archmage::Wasm128Token,
    v: &mut [magetypes::simd::f32x4; 16],
) {
    // WASM SIMD128 mirror of `idct1d_16_core_batch` (four lanes wide):
    // 16-point IDCT core without the input scaling step.
    use magetypes::simd::f32x4;
    let half = f32x4::splat(token, 0.5);
    let inv_sqrt2 = f32x4::splat(token, 1.0 / SQRT2);
    let mut first_half = [v[0], v[2], v[4], v[6], v[8], v[10], v[12], v[14]];
    let mut second_half = [v[1], v[3], v[5], v[7], v[9], v[11], v[13], v[15]];
    // Running difference over the odd half (back to front) plus sqrt(2) fixup.
    second_half[6] -= second_half[7];
    second_half[5] -= second_half[6];
    second_half[4] -= second_half[5];
    second_half[3] -= second_half[4];
    second_half[2] -= second_half[3];
    second_half[1] -= second_half[2];
    second_half[0] = (second_half[0] - second_half[1]) * inv_sqrt2;
    idct1d_8_core_batch_wasm128(token, &mut second_half);
    // Twiddle correction (reciprocal multiply).
    second_half[0] *= f32x4::splat(token, INV_WC16[0]);
    second_half[1] *= f32x4::splat(token, INV_WC16[1]);
    second_half[2] *= f32x4::splat(token, INV_WC16[2]);
    second_half[3] *= f32x4::splat(token, INV_WC16[3]);
    second_half[4] *= f32x4::splat(token, INV_WC16[4]);
    second_half[5] *= f32x4::splat(token, INV_WC16[5]);
    second_half[6] *= f32x4::splat(token, INV_WC16[6]);
    second_half[7] *= f32x4::splat(token, INV_WC16[7]);
    idct1d_8_core_batch_wasm128(token, &mut first_half);
    // Butterfly recombination; upper half mirrored.
    v[0] = (first_half[0] + second_half[0]) * half;
    v[15] = (first_half[0] - second_half[0]) * half;
    v[1] = (first_half[1] + second_half[1]) * half;
    v[14] = (first_half[1] - second_half[1]) * half;
    v[2] = (first_half[2] + second_half[2]) * half;
    v[13] = (first_half[2] - second_half[2]) * half;
    v[3] = (first_half[3] + second_half[3]) * half;
    v[12] = (first_half[3] - second_half[3]) * half;
    v[4] = (first_half[4] + second_half[4]) * half;
    v[11] = (first_half[4] - second_half[4]) * half;
    v[5] = (first_half[5] + second_half[5]) * half;
    v[10] = (first_half[5] - second_half[5]) * half;
    v[6] = (first_half[6] + second_half[6]) * half;
    v[9] = (first_half[6] - second_half[6]) * half;
    v[7] = (first_half[7] + second_half[7]) * half;
    v[8] = (first_half[7] - second_half[7]) * half;
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
pub(crate) fn idct1d_16_batch_wasm128(
    token: archmage::Wasm128Token,
    v: &mut [magetypes::simd::f32x4; 16],
) {
    use magetypes::simd::f32x4;
    // Undo the 1/16 DCT normalization, then run the shared 16-point core.
    let scale = f32x4::splat(token, 16.0);
    for lane in v.iter_mut() {
        *lane = *lane * scale;
    }
    idct1d_16_core_batch_wasm128(token, v);
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::needless_range_loop)]
fn wasm128_idct8_batch(
    token: archmage::Wasm128Token,
    data_in: &[f32],
    data_out: &mut [f32],
    base_row: usize,
    stride: usize,
) {
    // Run the 8-point IDCT across the row direction for four rows at once
    // (one row per lane; one vector per column).
    let mut cols = [magetypes::simd::f32x4::zero(token); 8];
    for (j, col) in cols.iter_mut().enumerate() {
        *col = gather_col_wasm128(token, data_in, base_row, j, stride);
    }
    idct1d_8_batch_wasm128(token, &mut cols);
    for (j, col) in cols.iter().enumerate() {
        scatter_col_wasm128(token, *col, data_out, base_row, j, stride);
    }
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
#[allow(clippy::needless_range_loop)]
fn wasm128_idct16_batch(
    token: archmage::Wasm128Token,
    data_in: &[f32],
    data_out: &mut [f32],
    base_row: usize,
    stride: usize,
) {
    // Run the 16-point IDCT across the row direction for four rows at once
    // (one row per lane; one vector per column).
    let mut v = [magetypes::simd::f32x4::zero(token); 16];
    for j in 0..16 {
        v[j] = gather_col_wasm128(token, data_in, base_row, j, stride);
    }
    idct1d_16_batch_wasm128(token, &mut v);
    for j in 0..16 {
        scatter_col_wasm128(token, v[j], data_out, base_row, j, stride);
    }
}
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x16_wasm128(
    token: archmage::Wasm128Token,
    input: &[f32; 256],
    output: &mut [f32; 256],
) {
    // Pass 1: 16-point IDCT along every row, four rows per batch.
    let mut tmp = crate::scratch_buf::<256>();
    for batch in 0..4 {
        wasm128_idct16_batch(token, input, &mut tmp, batch * 4, 16);
    }
    // Transpose so pass 2 can reuse the same row-batched kernel.
    let mut transposed = crate::scratch_buf::<256>();
    for r in 0..16 {
        for c in 0..16 {
            transposed[c * 16 + r] = tmp[r * 16 + c];
        }
    }
    // Pass 2: identical transform over the transposed data, into `output`.
    for batch in 0..4 {
        wasm128_idct16_batch(token, &transposed, output, batch * 4, 16);
    }
}
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_16x8_wasm128(
    token: archmage::Wasm128Token,
    input: &[f32; 128],
    output: &mut [f32; 128],
) {
    // Pass 1: 8-point IDCT along each of the 16 rows, four rows per batch.
    let mut tmp = crate::scratch_buf::<128>();
    for batch in 0..4 {
        wasm128_idct8_batch(token, input, &mut tmp, batch * 4, 8);
    }
    // Pass 2: 16-point IDCT down the columns, four columns at a time (one
    // column per lane; one vector per row).
    for col_base in (0..8).step_by(4) {
        let mut v = [magetypes::simd::f32x4::zero(token); 16];
        for row in 0..16 {
            v[row] = magetypes::simd::f32x4::from_array(
                token,
                [
                    tmp[row * 8 + col_base],
                    tmp[row * 8 + col_base + 1],
                    tmp[row * 8 + col_base + 2],
                    tmp[row * 8 + col_base + 3],
                ],
            );
        }
        idct1d_16_batch_wasm128(token, &mut v);
        for row in 0..16 {
            let mut lane = [0.0f32; 4];
            v[row].store(&mut lane);
            output[row * 8 + col_base] = lane[0];
            output[row * 8 + col_base + 1] = lane[1];
            output[row * 8 + col_base + 2] = lane[2];
            output[row * 8 + col_base + 3] = lane[3];
        }
    }
}
#[cfg(target_arch = "wasm32")]
#[inline]
#[archmage::arcane]
#[allow(clippy::needless_range_loop)]
pub fn idct_8x16_wasm128(
    token: archmage::Wasm128Token,
    input: &[f32; 128],
    output: &mut [f32; 128],
) {
    // Pass 1: 16-point IDCT along each of the 8 rows, four rows per batch.
    let mut tmp = crate::scratch_buf::<128>();
    for batch in 0..2 {
        wasm128_idct16_batch(token, input, &mut tmp, batch * 4, 16);
    }
    // Pass 2: 8-point IDCT down the columns, four columns at a time (one
    // column per lane; one vector per row).
    for col_base in (0..16).step_by(4) {
        let mut v = [magetypes::simd::f32x4::zero(token); 8];
        for row in 0..8 {
            v[row] = magetypes::simd::f32x4::from_array(
                token,
                [
                    tmp[row * 16 + col_base],
                    tmp[row * 16 + col_base + 1],
                    tmp[row * 16 + col_base + 2],
                    tmp[row * 16 + col_base + 3],
                ],
            );
        }
        idct1d_8_batch_wasm128(token, &mut v);
        for row in 0..8 {
            let mut lane = [0.0f32; 4];
            v[row].store(&mut lane);
            output[row * 16 + col_base] = lane[0];
            output[row * 16 + col_base + 1] = lane[1];
            output[row * 16 + col_base + 2] = lane[2];
            output[row * 16 + col_base + 3] = lane[3];
        }
    }
}
#[cfg(test)]
mod tests {
    // Explicit `extern crate std` — presumably this crate is `no_std`;
    // tests need std for eprintln!.
    extern crate std;
    use super::*;
    // Runs `dispatch_fn` under every token permutation offered by the
    // archmage test harness and asserts its output stays within 1e-2 of the
    // scalar reference for a 256-element (16x16) block.
    fn assert_simd_matches_scalar_256(
        scalar_fn: fn(&[f32; 256], &mut [f32; 256]),
        dispatch_fn: fn(&[f32; 256], &mut [f32; 256]),
        input: &[f32; 256],
        label: &str,
    ) {
        let mut scalar_out = [0.0f32; 256];
        scalar_fn(input, &mut scalar_out);
        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut simd_out = [0.0f32; 256];
                dispatch_fn(input, &mut simd_out);
                // Track the worst absolute deviation for the failure message.
                let mut max_diff = 0.0f32;
                let mut max_idx = 0;
                for i in 0..256 {
                    let diff = (scalar_out[i] - simd_out[i]).abs();
                    if diff > max_diff {
                        max_diff = diff;
                        max_idx = i;
                    }
                }
                assert!(
                    max_diff < 1e-2,
                    "{label} max diff = {max_diff} at {max_idx} (scalar={}, simd={}) [{perm}]",
                    scalar_out[max_idx],
                    simd_out[max_idx],
                );
            },
        );
        std::eprintln!("{label}: {report}");
    }
    // Same as `assert_simd_matches_scalar_256` but for 128-element blocks
    // (the 16x8 and 8x16 layouts).
    fn assert_simd_matches_scalar_128(
        scalar_fn: fn(&[f32; 128], &mut [f32; 128]),
        dispatch_fn: fn(&[f32; 128], &mut [f32; 128]),
        input: &[f32; 128],
        label: &str,
    ) {
        let mut scalar_out = [0.0f32; 128];
        scalar_fn(input, &mut scalar_out);
        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |perm| {
                let mut simd_out = [0.0f32; 128];
                dispatch_fn(input, &mut simd_out);
                let mut max_diff = 0.0f32;
                let mut max_idx = 0;
                for i in 0..128 {
                    let diff = (scalar_out[i] - simd_out[i]).abs();
                    if diff > max_diff {
                        max_diff = diff;
                        max_idx = i;
                    }
                }
                assert!(
                    max_diff < 1e-2,
                    "{label} max diff = {max_diff} at {max_idx} (scalar={}, simd={}) [{perm}]",
                    scalar_out[max_idx],
                    simd_out[max_idx],
                );
            },
        );
        std::eprintln!("{label}: {report}");
    }
    #[test]
    fn test_idct_16x16_simd_matches_scalar() {
        // Ramp input exercises every coefficient position.
        let mut input = [0.0f32; 256];
        for (i, val) in input.iter_mut().enumerate() {
            *val = i as f32;
        }
        assert_simd_matches_scalar_256(idct_16x16_scalar, idct_16x16, &input, "IDCT16x16 seq");
    }
    #[test]
    fn test_idct_16x16_simd_matches_scalar_cosine_input() {
        // Oscillating input with both signs and non-trivial magnitudes.
        let mut input = [0.0f32; 256];
        for (i, val) in input.iter_mut().enumerate() {
            *val = ((i as f32) * 0.37 + 1.5).cos() * 100.0;
        }
        assert_simd_matches_scalar_256(idct_16x16_scalar, idct_16x16, &input, "IDCT16x16 cos");
    }
    #[test]
    fn test_idct_16x16_dc_only() {
        // DC-only block: output should be flat; checks the overall scaling.
        let mut input = [0.0f32; 256];
        input[0] = 128.0;
        assert_simd_matches_scalar_256(idct_16x16_scalar, idct_16x16, &input, "IDCT16x16 DC");
    }
    #[test]
    fn test_idct_16x16_single_ac_coefficient() {
        // Single AC coefficient: exercises the odd-coefficient path alone.
        let mut input = [0.0f32; 256];
        input[1] = 50.0;
        assert_simd_matches_scalar_256(idct_16x16_scalar, idct_16x16, &input, "IDCT16x16 AC");
    }
    #[test]
    fn test_idct_16x8_simd_matches_scalar() {
        let mut input = [0.0f32; 128];
        for (i, val) in input.iter_mut().enumerate() {
            *val = ((i as f32) * 0.43 + 2.1).cos() * 80.0;
        }
        assert_simd_matches_scalar_128(idct_16x8_scalar, idct_16x8, &input, "IDCT16x8");
    }
    #[test]
    fn test_idct_8x16_simd_matches_scalar() {
        let mut input = [0.0f32; 128];
        for (i, val) in input.iter_mut().enumerate() {
            *val = ((i as f32) * 0.29 + 0.7).sin() * 120.0;
        }
        assert_simd_matches_scalar_128(idct_8x16_scalar, idct_8x16, &input, "IDCT8x16");
    }
}