#![allow(clippy::too_many_arguments)]
use archmage::prelude::*;
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64::{
_mm_loadu_si128, _mm_storeu_si128, _mm256_loadu_si256, _mm256_storeu_si256,
};
#[cfg(target_arch = "wasm32")]
use safe_unaligned_simd::wasm32::{v128_load, v128_store};
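
// Inverse transforms and coefficient post-processing:
// HEVC-style inverse DCT for 8x8, 16x16 and 32x32 blocks, the 4x4 inverse
// DST, dequantization, and residual addition. Each routine has an x86_64
// AVX2 ("v3") kernel and a scalar fallback that delegates to
// `super::transform`; a subset also has wasm32 SIMD128 kernels.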
#[cfg(target_arch = "x86_64")]
const fn pack(a: i16, b: i16) -> i32 {
(a as i32 & 0xFFFF) | ((b as i32) << 16)
}
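/// Transposes an 8x8 block of `i16` values held as eight `__m128i` rows via
/// the usual unpack ladder (16-bit, then 32-bit, then 64-bit lanes).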
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn transpose_8x8(
_token: X64V3Token,
r0: __m128i,
r1: __m128i,
r2: __m128i,
r3: __m128i,
r4: __m128i,
r5: __m128i,
r6: __m128i,
r7: __m128i,
) -> (
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
) {
let t0 = _mm_unpacklo_epi16(r0, r1);
let t1 = _mm_unpackhi_epi16(r0, r1);
let t2 = _mm_unpacklo_epi16(r2, r3);
let t3 = _mm_unpackhi_epi16(r2, r3);
let t4 = _mm_unpacklo_epi16(r4, r5);
let t5 = _mm_unpackhi_epi16(r4, r5);
let t6 = _mm_unpacklo_epi16(r6, r7);
let t7 = _mm_unpackhi_epi16(r6, r7);
let u0 = _mm_unpacklo_epi32(t0, t2);
let u1 = _mm_unpackhi_epi32(t0, t2);
let u2 = _mm_unpacklo_epi32(t1, t3);
let u3 = _mm_unpackhi_epi32(t1, t3);
let u4 = _mm_unpacklo_epi32(t4, t6);
let u5 = _mm_unpackhi_epi32(t4, t6);
let u6 = _mm_unpacklo_epi32(t5, t7);
let u7 = _mm_unpackhi_epi32(t5, t7);
(
_mm_unpacklo_epi64(u0, u4),
_mm_unpackhi_epi64(u0, u4),
_mm_unpacklo_epi64(u1, u5),
_mm_unpackhi_epi64(u1, u5),
_mm_unpacklo_epi64(u2, u6),
_mm_unpackhi_epi64(u2, u6),
_mm_unpacklo_epi64(u3, u7),
_mm_unpackhi_epi64(u3, u7),
)
}
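/// One 1-D pass of the 8-point inverse DCT applied to all eight columns.
///
/// Odd outputs come from `madd` on the interleaved (r1, r3) and (r5, r7)
/// pairs, even outputs from the EE/EO butterfly (coefficients 64, 83, 36).
/// `add` and `shift` carry the per-pass rounding offset and right shift;
/// results are saturated back to `i16`.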
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn idct8_1d_columns(
_token: X64V3Token,
r0: __m128i,
r1: __m128i,
r2: __m128i,
r3: __m128i,
r4: __m128i,
r5: __m128i,
r6: __m128i,
r7: __m128i,
shift: __m128i,
add: __m256i,
) -> (
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
__m128i,
) {
let i13 = _mm256_set_m128i(_mm_unpackhi_epi16(r1, r3), _mm_unpacklo_epi16(r1, r3));
let i57 = _mm256_set_m128i(_mm_unpackhi_epi16(r5, r7), _mm_unpacklo_epi16(r5, r7));
let o0 = _mm256_add_epi32(
_mm256_madd_epi16(i13, _mm256_set1_epi32(pack(89, 75))),
_mm256_madd_epi16(i57, _mm256_set1_epi32(pack(50, 18))),
);
let o1 = _mm256_add_epi32(
_mm256_madd_epi16(i13, _mm256_set1_epi32(pack(75, -18))),
_mm256_madd_epi16(i57, _mm256_set1_epi32(pack(-89, -50))),
);
let o2 = _mm256_add_epi32(
_mm256_madd_epi16(i13, _mm256_set1_epi32(pack(50, -89))),
_mm256_madd_epi16(i57, _mm256_set1_epi32(pack(18, 75))),
);
let o3 = _mm256_add_epi32(
_mm256_madd_epi16(i13, _mm256_set1_epi32(pack(18, -50))),
_mm256_madd_epi16(i57, _mm256_set1_epi32(pack(75, -89))),
);
let i04 = _mm256_set_m128i(_mm_unpackhi_epi16(r0, r4), _mm_unpacklo_epi16(r0, r4));
let i26 = _mm256_set_m128i(_mm_unpackhi_epi16(r2, r6), _mm_unpacklo_epi16(r2, r6));
let ee0 = _mm256_madd_epi16(i04, _mm256_set1_epi32(pack(64, 64)));
let ee1 = _mm256_madd_epi16(i04, _mm256_set1_epi32(pack(64, -64)));
let eo0 = _mm256_madd_epi16(i26, _mm256_set1_epi32(pack(83, 36)));
let eo1 = _mm256_madd_epi16(i26, _mm256_set1_epi32(pack(36, -83)));
let e0 = _mm256_add_epi32(ee0, eo0);
let e1 = _mm256_add_epi32(ee1, eo1);
let e2 = _mm256_sub_epi32(ee1, eo1);
let e3 = _mm256_sub_epi32(ee0, eo0);
let d0 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32(e0, o0), add), shift);
let d1 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32(e1, o1), add), shift);
let d2 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32(e2, o2), add), shift);
let d3 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32(e3, o3), add), shift);
let d4 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32(e3, o3), add), shift);
let d5 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32(e2, o2), add), shift);
let d6 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32(e1, o1), add), shift);
let d7 = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32(e0, o0), add), shift);
let p01 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packs_epi32(d0, d1));
let p23 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packs_epi32(d2, d3));
let p45 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packs_epi32(d4, d5));
let p67 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packs_epi32(d6, d7));
(
_mm256_castsi256_si128(p01),
_mm256_extracti128_si256::<1>(p01),
_mm256_castsi256_si128(p23),
_mm256_extracti128_si256::<1>(p23),
_mm256_castsi256_si128(p45),
_mm256_extracti128_si256::<1>(p45),
_mm256_castsi256_si128(p67),
_mm256_extracti128_si256::<1>(p67),
)
}
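/// Full 2-D 8x8 inverse DCT: column pass (shift 7), transpose, row pass
/// (shift `20 - bit_depth`), transpose, store.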
#[cfg(target_arch = "x86_64")]
#[arcane]
pub(crate) fn idct8_v3(
_token: X64V3Token,
coeffs: &[i16; 64],
output: &mut [i16; 64],
bit_depth: u8,
) {
let r0 = _mm_loadu_si128::<[i16; 8]>(coeffs[0..8].try_into().unwrap());
let r1 = _mm_loadu_si128::<[i16; 8]>(coeffs[8..16].try_into().unwrap());
let r2 = _mm_loadu_si128::<[i16; 8]>(coeffs[16..24].try_into().unwrap());
let r3 = _mm_loadu_si128::<[i16; 8]>(coeffs[24..32].try_into().unwrap());
let r4 = _mm_loadu_si128::<[i16; 8]>(coeffs[32..40].try_into().unwrap());
let r5 = _mm_loadu_si128::<[i16; 8]>(coeffs[40..48].try_into().unwrap());
let r6 = _mm_loadu_si128::<[i16; 8]>(coeffs[48..56].try_into().unwrap());
let r7 = _mm_loadu_si128::<[i16; 8]>(coeffs[56..64].try_into().unwrap());
let shift1 = _mm_cvtsi32_si128(7);
let add1 = _mm256_set1_epi32(1 << 6);
let (d0, d1, d2, d3, d4, d5, d6, d7) =
idct8_1d_columns(_token, r0, r1, r2, r3, r4, r5, r6, r7, shift1, add1);
let (t0, t1, t2, t3, t4, t5, t6, t7) = transpose_8x8(_token, d0, d1, d2, d3, d4, d5, d6, d7);
let shift2 = 20 - bit_depth as i32;
let shift2_v = _mm_cvtsi32_si128(shift2);
let add2 = _mm256_set1_epi32(1 << (shift2 - 1));
let (e0, e1, e2, e3, e4, e5, e6, e7) =
idct8_1d_columns(_token, t0, t1, t2, t3, t4, t5, t6, t7, shift2_v, add2);
let (f0, f1, f2, f3, f4, f5, f6, f7) = transpose_8x8(_token, e0, e1, e2, e3, e4, e5, e6, e7);
let out01 = _mm256_set_m128i(f1, f0);
let out23 = _mm256_set_m128i(f3, f2);
let out45 = _mm256_set_m128i(f5, f4);
let out67 = _mm256_set_m128i(f7, f6);
_mm256_storeu_si256::<[i16; 16]>((&mut output[0..16]).try_into().unwrap(), out01);
_mm256_storeu_si256::<[i16; 16]>((&mut output[16..32]).try_into().unwrap(), out23);
_mm256_storeu_si256::<[i16; 16]>((&mut output[32..48]).try_into().unwrap(), out45);
_mm256_storeu_si256::<[i16; 16]>((&mut output[48..64]).try_into().unwrap(), out67);
}
pub(crate) fn idct8_scalar(
_token: ScalarToken,
coeffs: &[i16; 64],
output: &mut [i16; 64],
bit_depth: u8,
) {
super::transform::idct8_inner(coeffs, output, bit_depth);
}
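/// One 1-D pass of the 16-point inverse DCT over sixteen columns; each input
/// row is a `__m256i` of 16 coefficients, and the low/high column halves are
/// carried through as the `l`/`h` pairs. Uses the partial-butterfly split
/// into O (odd rows), EO, EEO and EEE terms.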
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn idct16_1d_columns(
_token: X64V3Token,
r: &[__m256i; 16],
shift: __m128i,
add: __m256i,
) -> [__m256i; 16] {
macro_rules! interleave_madd {
($ra:expr, $rb:expr, $ca:expr, $cb:expr) => {{
let lo = _mm256_unpacklo_epi16($ra, $rb);
let hi = _mm256_unpackhi_epi16($ra, $rb);
let coeff = _mm256_set1_epi32(pack($ca, $cb));
(_mm256_madd_epi16(lo, coeff), _mm256_madd_epi16(hi, coeff))
}};
}
macro_rules! sum4_pairs {
(($ra1:expr, $rb1:expr, $ca1:expr, $cb1:expr),
($ra2:expr, $rb2:expr, $ca2:expr, $cb2:expr),
($ra3:expr, $rb3:expr, $ca3:expr, $cb3:expr),
($ra4:expr, $rb4:expr, $ca4:expr, $cb4:expr)) => {{
let (l1, h1) = interleave_madd!($ra1, $rb1, $ca1, $cb1);
let (l2, h2) = interleave_madd!($ra2, $rb2, $ca2, $cb2);
let (l3, h3) = interleave_madd!($ra3, $rb3, $ca3, $cb3);
let (l4, h4) = interleave_madd!($ra4, $rb4, $ca4, $cb4);
(
_mm256_add_epi32(_mm256_add_epi32(l1, l2), _mm256_add_epi32(l3, l4)),
_mm256_add_epi32(_mm256_add_epi32(h1, h2), _mm256_add_epi32(h3, h4)),
)
}};
}
let (o0l, o0h) = sum4_pairs!(
(r[1], r[3], 90, 87),
(r[5], r[7], 80, 70),
(r[9], r[11], 57, 43),
(r[13], r[15], 25, 9)
);
let (o1l, o1h) = sum4_pairs!(
(r[1], r[3], 87, 57),
(r[5], r[7], 9, -43),
(r[9], r[11], -80, -90),
(r[13], r[15], -70, -25)
);
let (o2l, o2h) = sum4_pairs!(
(r[1], r[3], 80, 9),
(r[5], r[7], -70, -87),
(r[9], r[11], -25, 57),
(r[13], r[15], 90, 43)
);
let (o3l, o3h) = sum4_pairs!(
(r[1], r[3], 70, -43),
(r[5], r[7], -87, 9),
(r[9], r[11], 90, 25),
(r[13], r[15], -80, -57)
);
let (o4l, o4h) = sum4_pairs!(
(r[1], r[3], 57, -80),
(r[5], r[7], -25, 90),
(r[9], r[11], -9, -87),
(r[13], r[15], 43, 70)
);
let (o5l, o5h) = sum4_pairs!(
(r[1], r[3], 43, -90),
(r[5], r[7], 57, 25),
(r[9], r[11], -87, 70),
(r[13], r[15], 9, -80)
);
let (o6l, o6h) = sum4_pairs!(
(r[1], r[3], 25, -70),
(r[5], r[7], 90, -80),
(r[9], r[11], 43, 9),
(r[13], r[15], -57, 87)
);
let (o7l, o7h) = sum4_pairs!(
(r[1], r[3], 9, -25),
(r[5], r[7], 43, -57),
(r[9], r[11], 70, -80),
(r[13], r[15], 87, -90)
);
macro_rules! sum2_pairs {
(($ra1:expr, $rb1:expr, $ca1:expr, $cb1:expr),
($ra2:expr, $rb2:expr, $ca2:expr, $cb2:expr)) => {{
let (l1, h1) = interleave_madd!($ra1, $rb1, $ca1, $cb1);
let (l2, h2) = interleave_madd!($ra2, $rb2, $ca2, $cb2);
(_mm256_add_epi32(l1, l2), _mm256_add_epi32(h1, h2))
}};
}
let (eo0l, eo0h) = sum2_pairs!((r[2], r[6], 89, 75), (r[10], r[14], 50, 18));
let (eo1l, eo1h) = sum2_pairs!((r[2], r[6], 75, -18), (r[10], r[14], -89, -50));
let (eo2l, eo2h) = sum2_pairs!((r[2], r[6], 50, -89), (r[10], r[14], 18, 75));
let (eo3l, eo3h) = sum2_pairs!((r[2], r[6], 18, -50), (r[10], r[14], 75, -89));
let (eee0l, eee0h) = interleave_madd!(r[0], r[8], 64, 64);
let (eee1l, eee1h) = interleave_madd!(r[0], r[8], 64, -64);
let (eeo0l, eeo0h) = interleave_madd!(r[4], r[12], 83, 36);
let (eeo1l, eeo1h) = interleave_madd!(r[4], r[12], 36, -83);
let ee0l = _mm256_add_epi32(eee0l, eeo0l);
let ee0h = _mm256_add_epi32(eee0h, eeo0h);
let ee1l = _mm256_add_epi32(eee1l, eeo1l);
let ee1h = _mm256_add_epi32(eee1h, eeo1h);
let ee2l = _mm256_sub_epi32(eee1l, eeo1l);
let ee2h = _mm256_sub_epi32(eee1h, eeo1h);
let ee3l = _mm256_sub_epi32(eee0l, eeo0l);
let ee3h = _mm256_sub_epi32(eee0h, eeo0h);
let e0l = _mm256_add_epi32(ee0l, eo0l);
let e0h = _mm256_add_epi32(ee0h, eo0h);
let e1l = _mm256_add_epi32(ee1l, eo1l);
let e1h = _mm256_add_epi32(ee1h, eo1h);
let e2l = _mm256_add_epi32(ee2l, eo2l);
let e2h = _mm256_add_epi32(ee2h, eo2h);
let e3l = _mm256_add_epi32(ee3l, eo3l);
let e3h = _mm256_add_epi32(ee3h, eo3h);
let e4l = _mm256_sub_epi32(ee3l, eo3l);
let e4h = _mm256_sub_epi32(ee3h, eo3h);
let e5l = _mm256_sub_epi32(ee2l, eo2l);
let e5h = _mm256_sub_epi32(ee2h, eo2h);
let e6l = _mm256_sub_epi32(ee1l, eo1l);
let e6h = _mm256_sub_epi32(ee1h, eo1h);
let e7l = _mm256_sub_epi32(ee0l, eo0l);
let e7h = _mm256_sub_epi32(ee0h, eo0h);
macro_rules! butterfly_pack {
($el:expr, $eh:expr, $ol:expr, $oh:expr, add) => {{
let dl = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32($el, $ol), add), shift);
let dh = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32($eh, $oh), add), shift);
_mm256_packs_epi32(dl, dh)
}};
($el:expr, $eh:expr, $ol:expr, $oh:expr, sub) => {{
let dl = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32($el, $ol), add), shift);
let dh = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32($eh, $oh), add), shift);
_mm256_packs_epi32(dl, dh)
}};
}
[
butterfly_pack!(e0l, e0h, o0l, o0h, add),
butterfly_pack!(e1l, e1h, o1l, o1h, add),
butterfly_pack!(e2l, e2h, o2l, o2h, add),
butterfly_pack!(e3l, e3h, o3l, o3h, add),
butterfly_pack!(e4l, e4h, o4l, o4h, add),
butterfly_pack!(e5l, e5h, o5l, o5h, add),
butterfly_pack!(e6l, e6h, o6l, o6h, add),
butterfly_pack!(e7l, e7h, o7l, o7h, add),
butterfly_pack!(e7l, e7h, o7l, o7h, sub),
butterfly_pack!(e6l, e6h, o6l, o6h, sub),
butterfly_pack!(e5l, e5h, o5l, o5h, sub),
butterfly_pack!(e4l, e4h, o4l, o4h, sub),
butterfly_pack!(e3l, e3h, o3l, o3h, sub),
butterfly_pack!(e2l, e2h, o2l, o2h, sub),
butterfly_pack!(e1l, e1h, o1l, o1h, sub),
butterfly_pack!(e0l, e0h, o0l, o0h, sub),
]
}
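/// Transposes a 16x16 block of `i16` values (one `__m256i` per row) by
/// transposing its four 8x8 quadrants with `transpose_8x8` and recombining
/// them.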
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_16x16(token: X64V3Token, r: &[__m256i; 16]) -> [__m256i; 16] {
macro_rules! lo {
($v:expr) => {
_mm256_castsi256_si128($v)
};
}
macro_rules! hi {
($v:expr) => {
_mm256_extracti128_si256::<1>($v)
};
}
macro_rules! combine {
($l:expr, $h:expr) => {
_mm256_set_m128i($h, $l)
};
}
let (tl0, tl1, tl2, tl3, tl4, tl5, tl6, tl7) = transpose_8x8(
token,
lo!(r[0]),
lo!(r[1]),
lo!(r[2]),
lo!(r[3]),
lo!(r[4]),
lo!(r[5]),
lo!(r[6]),
lo!(r[7]),
);
let (tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7) = transpose_8x8(
token,
hi!(r[0]),
hi!(r[1]),
hi!(r[2]),
hi!(r[3]),
hi!(r[4]),
hi!(r[5]),
hi!(r[6]),
hi!(r[7]),
);
let (bl0, bl1, bl2, bl3, bl4, bl5, bl6, bl7) = transpose_8x8(
token,
lo!(r[8]),
lo!(r[9]),
lo!(r[10]),
lo!(r[11]),
lo!(r[12]),
lo!(r[13]),
lo!(r[14]),
lo!(r[15]),
);
let (br0, br1, br2, br3, br4, br5, br6, br7) = transpose_8x8(
token,
hi!(r[8]),
hi!(r[9]),
hi!(r[10]),
hi!(r[11]),
hi!(r[12]),
hi!(r[13]),
hi!(r[14]),
hi!(r[15]),
);
[
combine!(tl0, bl0),
combine!(tl1, bl1),
combine!(tl2, bl2),
combine!(tl3, bl3),
combine!(tl4, bl4),
combine!(tl5, bl5),
combine!(tl6, bl6),
combine!(tl7, bl7),
combine!(tr0, br0),
combine!(tr1, br1),
combine!(tr2, br2),
combine!(tr3, br3),
combine!(tr4, br4),
combine!(tr5, br5),
combine!(tr6, br6),
combine!(tr7, br7),
]
}
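/// Full 2-D 16x16 inverse DCT: column pass, transpose, row pass, transpose.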
#[cfg(target_arch = "x86_64")]
#[arcane]
pub(crate) fn idct16_v3(
_token: X64V3Token,
coeffs: &[i16; 256],
output: &mut [i16; 256],
bit_depth: u8,
) {
let mut r = [_mm256_setzero_si256(); 16];
for i in 0..16 {
r[i] = _mm256_loadu_si256::<[i16; 16]>(coeffs[i * 16..(i + 1) * 16].try_into().unwrap());
}
let shift1 = _mm_cvtsi32_si128(7);
let add1 = _mm256_set1_epi32(1 << 6);
let d = idct16_1d_columns(_token, &r, shift1, add1);
let t = transpose_16x16(_token, &d);
let shift2 = 20 - bit_depth as i32;
let shift2_v = _mm_cvtsi32_si128(shift2);
let add2 = _mm256_set1_epi32(1 << (shift2 - 1));
let e = idct16_1d_columns(_token, &t, shift2_v, add2);
let f = transpose_16x16(_token, &e);
for i in 0..16 {
_mm256_storeu_si256::<[i16; 16]>(
(&mut output[i * 16..(i + 1) * 16]).try_into().unwrap(),
f[i],
);
}
}
pub(crate) fn idct16_scalar(
_token: ScalarToken,
coeffs: &[i16; 256],
output: &mut [i16; 256],
bit_depth: u8,
) {
super::transform::idct16_inner(coeffs, output, bit_depth);
}
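/// One 1-D pass of the 32-point inverse DCT over sixteen columns at a time
/// (the caller splits the 32 columns into two halves). Odd outputs sum eight
/// `madd` pairs; the even half reuses the 16-point EO/EEO/EEEE/EEEO
/// structure.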
#[cfg(target_arch = "x86_64")]
#[rite]
fn idct32_1d_columns(
_token: X64V3Token,
r: &[__m256i; 32],
shift: __m128i,
add: __m256i,
) -> [__m256i; 32] {
macro_rules! interleave_madd {
($ra:expr, $rb:expr, $ca:expr, $cb:expr) => {{
let lo = _mm256_unpacklo_epi16($ra, $rb);
let hi = _mm256_unpackhi_epi16($ra, $rb);
let coeff = _mm256_set1_epi32(pack($ca, $cb));
(_mm256_madd_epi16(lo, coeff), _mm256_madd_epi16(hi, coeff))
}};
}
macro_rules! sum4_pairs {
(($ra1:expr, $rb1:expr, $ca1:expr, $cb1:expr),
($ra2:expr, $rb2:expr, $ca2:expr, $cb2:expr),
($ra3:expr, $rb3:expr, $ca3:expr, $cb3:expr),
($ra4:expr, $rb4:expr, $ca4:expr, $cb4:expr)) => {{
let (l1, h1) = interleave_madd!($ra1, $rb1, $ca1, $cb1);
let (l2, h2) = interleave_madd!($ra2, $rb2, $ca2, $cb2);
let (l3, h3) = interleave_madd!($ra3, $rb3, $ca3, $cb3);
let (l4, h4) = interleave_madd!($ra4, $rb4, $ca4, $cb4);
(
_mm256_add_epi32(_mm256_add_epi32(l1, l2), _mm256_add_epi32(l3, l4)),
_mm256_add_epi32(_mm256_add_epi32(h1, h2), _mm256_add_epi32(h3, h4)),
)
}};
}
macro_rules! sum8_pairs {
(($ra1:expr, $rb1:expr, $ca1:expr, $cb1:expr),
($ra2:expr, $rb2:expr, $ca2:expr, $cb2:expr),
($ra3:expr, $rb3:expr, $ca3:expr, $cb3:expr),
($ra4:expr, $rb4:expr, $ca4:expr, $cb4:expr),
($ra5:expr, $rb5:expr, $ca5:expr, $cb5:expr),
($ra6:expr, $rb6:expr, $ca6:expr, $cb6:expr),
($ra7:expr, $rb7:expr, $ca7:expr, $cb7:expr),
($ra8:expr, $rb8:expr, $ca8:expr, $cb8:expr)) => {{
let (l1, h1) = interleave_madd!($ra1, $rb1, $ca1, $cb1);
let (l2, h2) = interleave_madd!($ra2, $rb2, $ca2, $cb2);
let (l3, h3) = interleave_madd!($ra3, $rb3, $ca3, $cb3);
let (l4, h4) = interleave_madd!($ra4, $rb4, $ca4, $cb4);
let (l5, h5) = interleave_madd!($ra5, $rb5, $ca5, $cb5);
let (l6, h6) = interleave_madd!($ra6, $rb6, $ca6, $cb6);
let (l7, h7) = interleave_madd!($ra7, $rb7, $ca7, $cb7);
let (l8, h8) = interleave_madd!($ra8, $rb8, $ca8, $cb8);
let la = _mm256_add_epi32(_mm256_add_epi32(l1, l2), _mm256_add_epi32(l3, l4));
let lb = _mm256_add_epi32(_mm256_add_epi32(l5, l6), _mm256_add_epi32(l7, l8));
let ha = _mm256_add_epi32(_mm256_add_epi32(h1, h2), _mm256_add_epi32(h3, h4));
let hb = _mm256_add_epi32(_mm256_add_epi32(h5, h6), _mm256_add_epi32(h7, h8));
(_mm256_add_epi32(la, lb), _mm256_add_epi32(ha, hb))
}};
}
let (o0l, o0h) = sum8_pairs!(
(r[1], r[3], 90, 90),
(r[5], r[7], 88, 85),
(r[9], r[11], 82, 78),
(r[13], r[15], 73, 67),
(r[17], r[19], 61, 54),
(r[21], r[23], 46, 38),
(r[25], r[27], 31, 22),
(r[29], r[31], 13, 4)
);
let (o1l, o1h) = sum8_pairs!(
(r[1], r[3], 90, 82),
(r[5], r[7], 67, 46),
(r[9], r[11], 22, -4),
(r[13], r[15], -31, -54),
(r[17], r[19], -73, -85),
(r[21], r[23], -90, -88),
(r[25], r[27], -78, -61),
(r[29], r[31], -38, -13)
);
let (o2l, o2h) = sum8_pairs!(
(r[1], r[3], 88, 67),
(r[5], r[7], 31, -13),
(r[9], r[11], -54, -82),
(r[13], r[15], -90, -78),
(r[17], r[19], -46, -4),
(r[21], r[23], 38, 73),
(r[25], r[27], 90, 85),
(r[29], r[31], 61, 22)
);
let (o3l, o3h) = sum8_pairs!(
(r[1], r[3], 85, 46),
(r[5], r[7], -13, -67),
(r[9], r[11], -90, -73),
(r[13], r[15], -22, 38),
(r[17], r[19], 82, 88),
(r[21], r[23], 54, -4),
(r[25], r[27], -61, -90),
(r[29], r[31], -78, -31)
);
let (o4l, o4h) = sum8_pairs!(
(r[1], r[3], 82, 22),
(r[5], r[7], -54, -90),
(r[9], r[11], -61, 13),
(r[13], r[15], 78, 85),
(r[17], r[19], 31, -46),
(r[21], r[23], -90, -67),
(r[25], r[27], 4, 73),
(r[29], r[31], 88, 38)
);
let (o5l, o5h) = sum8_pairs!(
(r[1], r[3], 78, -4),
(r[5], r[7], -82, -73),
(r[9], r[11], 13, 85),
(r[13], r[15], 67, -22),
(r[17], r[19], -88, -61),
(r[21], r[23], 31, 90),
(r[25], r[27], 54, -38),
(r[29], r[31], -90, -46)
);
let (o6l, o6h) = sum8_pairs!(
(r[1], r[3], 73, -31),
(r[5], r[7], -90, -22),
(r[9], r[11], 78, 67),
(r[13], r[15], -38, -90),
(r[17], r[19], -13, 82),
(r[21], r[23], 61, -46),
(r[25], r[27], -88, -4),
(r[29], r[31], 85, 54)
);
let (o7l, o7h) = sum8_pairs!(
(r[1], r[3], 67, -54),
(r[5], r[7], -78, 38),
(r[9], r[11], 85, -22),
(r[13], r[15], -90, 4),
(r[17], r[19], 90, 13),
(r[21], r[23], -88, -31),
(r[25], r[27], 82, 46),
(r[29], r[31], -73, -61)
);
let (o8l, o8h) = sum8_pairs!(
(r[1], r[3], 61, -73),
(r[5], r[7], -46, 82),
(r[9], r[11], 31, -88),
(r[13], r[15], -13, 90),
(r[17], r[19], -4, -90),
(r[21], r[23], 22, 85),
(r[25], r[27], -38, -78),
(r[29], r[31], 54, 67)
);
let (o9l, o9h) = sum8_pairs!(
(r[1], r[3], 54, -85),
(r[5], r[7], -4, 88),
(r[9], r[11], -46, -61),
(r[13], r[15], 82, 13),
(r[17], r[19], -90, 38),
(r[21], r[23], 67, -78),
(r[25], r[27], -22, 90),
(r[29], r[31], -31, -73)
);
let (o10l, o10h) = sum8_pairs!(
(r[1], r[3], 46, -90),
(r[5], r[7], 38, 54),
(r[9], r[11], -90, 31),
(r[13], r[15], 61, -88),
(r[17], r[19], 22, 67),
(r[21], r[23], -85, 13),
(r[25], r[27], 73, -82),
(r[29], r[31], 4, 78)
);
let (o11l, o11h) = sum8_pairs!(
(r[1], r[3], 38, -88),
(r[5], r[7], 73, -4),
(r[9], r[11], -67, 90),
(r[13], r[15], -46, -31),
(r[17], r[19], 85, -78),
(r[21], r[23], 13, 61),
(r[25], r[27], -90, 54),
(r[29], r[31], 22, -82)
);
let (o12l, o12h) = sum8_pairs!(
(r[1], r[3], 31, -78),
(r[5], r[7], 90, -61),
(r[9], r[11], 4, 54),
(r[13], r[15], -88, 82),
(r[17], r[19], -38, -22),
(r[21], r[23], 73, -90),
(r[25], r[27], 67, -13),
(r[29], r[31], -46, 85)
);
let (o13l, o13h) = sum8_pairs!(
(r[1], r[3], 22, -61),
(r[5], r[7], 85, -90),
(r[9], r[11], 73, -38),
(r[13], r[15], -4, 46),
(r[17], r[19], -78, 90),
(r[21], r[23], -82, 54),
(r[25], r[27], -13, -31),
(r[29], r[31], 67, -88)
);
let (o14l, o14h) = sum8_pairs!(
(r[1], r[3], 13, -38),
(r[5], r[7], 61, -78),
(r[9], r[11], 88, -90),
(r[13], r[15], 85, -73),
(r[17], r[19], 54, -31),
(r[21], r[23], 4, 22),
(r[25], r[27], -46, 67),
(r[29], r[31], -82, 90)
);
let (o15l, o15h) = sum8_pairs!(
(r[1], r[3], 4, -13),
(r[5], r[7], 22, -31),
(r[9], r[11], 38, -46),
(r[13], r[15], 54, -61),
(r[17], r[19], 67, -73),
(r[21], r[23], 78, -82),
(r[25], r[27], 85, -88),
(r[29], r[31], 90, -90)
);
macro_rules! sum2_pairs {
(($ra1:expr, $rb1:expr, $ca1:expr, $cb1:expr),
($ra2:expr, $rb2:expr, $ca2:expr, $cb2:expr)) => {{
let (l1, h1) = interleave_madd!($ra1, $rb1, $ca1, $cb1);
let (l2, h2) = interleave_madd!($ra2, $rb2, $ca2, $cb2);
(_mm256_add_epi32(l1, l2), _mm256_add_epi32(h1, h2))
}};
}
let (eo0l, eo0h) = sum4_pairs!(
(r[2], r[6], 90, 87),
(r[10], r[14], 80, 70),
(r[18], r[22], 57, 43),
(r[26], r[30], 25, 9)
);
let (eo1l, eo1h) = sum4_pairs!(
(r[2], r[6], 87, 57),
(r[10], r[14], 9, -43),
(r[18], r[22], -80, -90),
(r[26], r[30], -70, -25)
);
let (eo2l, eo2h) = sum4_pairs!(
(r[2], r[6], 80, 9),
(r[10], r[14], -70, -87),
(r[18], r[22], -25, 57),
(r[26], r[30], 90, 43)
);
let (eo3l, eo3h) = sum4_pairs!(
(r[2], r[6], 70, -43),
(r[10], r[14], -87, 9),
(r[18], r[22], 90, 25),
(r[26], r[30], -80, -57)
);
let (eo4l, eo4h) = sum4_pairs!(
(r[2], r[6], 57, -80),
(r[10], r[14], -25, 90),
(r[18], r[22], -9, -87),
(r[26], r[30], 43, 70)
);
let (eo5l, eo5h) = sum4_pairs!(
(r[2], r[6], 43, -90),
(r[10], r[14], 57, 25),
(r[18], r[22], -87, 70),
(r[26], r[30], 9, -80)
);
let (eo6l, eo6h) = sum4_pairs!(
(r[2], r[6], 25, -70),
(r[10], r[14], 90, -80),
(r[18], r[22], 43, 9),
(r[26], r[30], -57, 87)
);
let (eo7l, eo7h) = sum4_pairs!(
(r[2], r[6], 9, -25),
(r[10], r[14], 43, -57),
(r[18], r[22], 70, -80),
(r[26], r[30], 87, -90)
);
let (eeo0l, eeo0h) = sum2_pairs!((r[4], r[12], 89, 75), (r[20], r[28], 50, 18));
let (eeo1l, eeo1h) = sum2_pairs!((r[4], r[12], 75, -18), (r[20], r[28], -89, -50));
let (eeo2l, eeo2h) = sum2_pairs!((r[4], r[12], 50, -89), (r[20], r[28], 18, 75));
let (eeo3l, eeo3h) = sum2_pairs!((r[4], r[12], 18, -50), (r[20], r[28], 75, -89));
let (eeee0l, eeee0h) = interleave_madd!(r[0], r[16], 64, 64);
let (eeee1l, eeee1h) = interleave_madd!(r[0], r[16], 64, -64);
let (eeeo0l, eeeo0h) = interleave_madd!(r[8], r[24], 83, 36);
let (eeeo1l, eeeo1h) = interleave_madd!(r[8], r[24], 36, -83);
let eee0l = _mm256_add_epi32(eeee0l, eeeo0l);
let eee0h = _mm256_add_epi32(eeee0h, eeeo0h);
let eee1l = _mm256_add_epi32(eeee1l, eeeo1l);
let eee1h = _mm256_add_epi32(eeee1h, eeeo1h);
let eee2l = _mm256_sub_epi32(eeee1l, eeeo1l);
let eee2h = _mm256_sub_epi32(eeee1h, eeeo1h);
let eee3l = _mm256_sub_epi32(eeee0l, eeeo0l);
let eee3h = _mm256_sub_epi32(eeee0h, eeeo0h);
let ee0l = _mm256_add_epi32(eee0l, eeo0l);
let ee0h = _mm256_add_epi32(eee0h, eeo0h);
let ee1l = _mm256_add_epi32(eee1l, eeo1l);
let ee1h = _mm256_add_epi32(eee1h, eeo1h);
let ee2l = _mm256_add_epi32(eee2l, eeo2l);
let ee2h = _mm256_add_epi32(eee2h, eeo2h);
let ee3l = _mm256_add_epi32(eee3l, eeo3l);
let ee3h = _mm256_add_epi32(eee3h, eeo3h);
let ee4l = _mm256_sub_epi32(eee3l, eeo3l);
let ee4h = _mm256_sub_epi32(eee3h, eeo3h);
let ee5l = _mm256_sub_epi32(eee2l, eeo2l);
let ee5h = _mm256_sub_epi32(eee2h, eeo2h);
let ee6l = _mm256_sub_epi32(eee1l, eeo1l);
let ee6h = _mm256_sub_epi32(eee1h, eeo1h);
let ee7l = _mm256_sub_epi32(eee0l, eeo0l);
let ee7h = _mm256_sub_epi32(eee0h, eeo0h);
let e0l = _mm256_add_epi32(ee0l, eo0l);
let e0h = _mm256_add_epi32(ee0h, eo0h);
let e1l = _mm256_add_epi32(ee1l, eo1l);
let e1h = _mm256_add_epi32(ee1h, eo1h);
let e2l = _mm256_add_epi32(ee2l, eo2l);
let e2h = _mm256_add_epi32(ee2h, eo2h);
let e3l = _mm256_add_epi32(ee3l, eo3l);
let e3h = _mm256_add_epi32(ee3h, eo3h);
let e4l = _mm256_add_epi32(ee4l, eo4l);
let e4h = _mm256_add_epi32(ee4h, eo4h);
let e5l = _mm256_add_epi32(ee5l, eo5l);
let e5h = _mm256_add_epi32(ee5h, eo5h);
let e6l = _mm256_add_epi32(ee6l, eo6l);
let e6h = _mm256_add_epi32(ee6h, eo6h);
let e7l = _mm256_add_epi32(ee7l, eo7l);
let e7h = _mm256_add_epi32(ee7h, eo7h);
let e8l = _mm256_sub_epi32(ee7l, eo7l);
let e8h = _mm256_sub_epi32(ee7h, eo7h);
let e9l = _mm256_sub_epi32(ee6l, eo6l);
let e9h = _mm256_sub_epi32(ee6h, eo6h);
let e10l = _mm256_sub_epi32(ee5l, eo5l);
let e10h = _mm256_sub_epi32(ee5h, eo5h);
let e11l = _mm256_sub_epi32(ee4l, eo4l);
let e11h = _mm256_sub_epi32(ee4h, eo4h);
let e12l = _mm256_sub_epi32(ee3l, eo3l);
let e12h = _mm256_sub_epi32(ee3h, eo3h);
let e13l = _mm256_sub_epi32(ee2l, eo2l);
let e13h = _mm256_sub_epi32(ee2h, eo2h);
let e14l = _mm256_sub_epi32(ee1l, eo1l);
let e14h = _mm256_sub_epi32(ee1h, eo1h);
let e15l = _mm256_sub_epi32(ee0l, eo0l);
let e15h = _mm256_sub_epi32(ee0h, eo0h);
macro_rules! butterfly_pack {
($el:expr, $eh:expr, $ol:expr, $oh:expr, add) => {{
let dl = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32($el, $ol), add), shift);
let dh = _mm256_sra_epi32(_mm256_add_epi32(_mm256_add_epi32($eh, $oh), add), shift);
_mm256_packs_epi32(dl, dh)
}};
($el:expr, $eh:expr, $ol:expr, $oh:expr, sub) => {{
let dl = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32($el, $ol), add), shift);
let dh = _mm256_sra_epi32(_mm256_add_epi32(_mm256_sub_epi32($eh, $oh), add), shift);
_mm256_packs_epi32(dl, dh)
}};
}
[
butterfly_pack!(e0l, e0h, o0l, o0h, add),
butterfly_pack!(e1l, e1h, o1l, o1h, add),
butterfly_pack!(e2l, e2h, o2l, o2h, add),
butterfly_pack!(e3l, e3h, o3l, o3h, add),
butterfly_pack!(e4l, e4h, o4l, o4h, add),
butterfly_pack!(e5l, e5h, o5l, o5h, add),
butterfly_pack!(e6l, e6h, o6l, o6h, add),
butterfly_pack!(e7l, e7h, o7l, o7h, add),
butterfly_pack!(e8l, e8h, o8l, o8h, add),
butterfly_pack!(e9l, e9h, o9l, o9h, add),
butterfly_pack!(e10l, e10h, o10l, o10h, add),
butterfly_pack!(e11l, e11h, o11l, o11h, add),
butterfly_pack!(e12l, e12h, o12l, o12h, add),
butterfly_pack!(e13l, e13h, o13l, o13h, add),
butterfly_pack!(e14l, e14h, o14l, o14h, add),
butterfly_pack!(e15l, e15h, o15l, o15h, add),
butterfly_pack!(e15l, e15h, o15l, o15h, sub),
butterfly_pack!(e14l, e14h, o14l, o14h, sub),
butterfly_pack!(e13l, e13h, o13l, o13h, sub),
butterfly_pack!(e12l, e12h, o12l, o12h, sub),
butterfly_pack!(e11l, e11h, o11l, o11h, sub),
butterfly_pack!(e10l, e10h, o10l, o10h, sub),
butterfly_pack!(e9l, e9h, o9l, o9h, sub),
butterfly_pack!(e8l, e8h, o8l, o8h, sub),
butterfly_pack!(e7l, e7h, o7l, o7h, sub),
butterfly_pack!(e6l, e6h, o6l, o6h, sub),
butterfly_pack!(e5l, e5h, o5l, o5h, sub),
butterfly_pack!(e4l, e4h, o4l, o4h, sub),
butterfly_pack!(e3l, e3h, o3l, o3h, sub),
butterfly_pack!(e2l, e2h, o2l, o2h, sub),
butterfly_pack!(e1l, e1h, o1l, o1h, sub),
butterfly_pack!(e0l, e0h, o0l, o0h, sub),
]
}
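/// Transposes a 32x32 block of `i16` values. `lo`/`hi` hold columns 0-15 and
/// 16-31 of each row; the four 16x16 quadrants are transposed and swapped
/// across the diagonal.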
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_32x32(
token: X64V3Token,
lo: &[__m256i; 32],
hi: &[__m256i; 32],
) -> ([__m256i; 32], [__m256i; 32]) {
let tl_arr: [__m256i; 16] = lo[0..16].try_into().unwrap();
let tr_arr: [__m256i; 16] = hi[0..16].try_into().unwrap();
let bl_arr: [__m256i; 16] = lo[16..32].try_into().unwrap();
let br_arr: [__m256i; 16] = hi[16..32].try_into().unwrap();
let tl_t = transpose_16x16(token, &tl_arr);
let tr_t = transpose_16x16(token, &tr_arr);
let bl_t = transpose_16x16(token, &bl_arr);
let br_t = transpose_16x16(token, &br_arr);
let mut out_lo = [_mm256_setzero_si256(); 32];
let mut out_hi = [_mm256_setzero_si256(); 32];
out_lo[..16].copy_from_slice(&tl_t);
out_lo[16..].copy_from_slice(&tr_t);
out_hi[..16].copy_from_slice(&bl_t);
out_hi[16..].copy_from_slice(&br_t);
(out_lo, out_hi)
}
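/// Full 2-D 32x32 inverse DCT, processing the block as left/right column
/// halves: column pass, transpose, row pass, transpose, store.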
#[cfg(target_arch = "x86_64")]
#[arcane]
pub(crate) fn idct32_v3(
_token: X64V3Token,
coeffs: &[i16; 1024],
output: &mut [i16; 1024],
bit_depth: u8,
) {
let mut lo = [_mm256_setzero_si256(); 32];
let mut hi = [_mm256_setzero_si256(); 32];
for i in 0..32 {
let base = i * 32;
lo[i] = _mm256_loadu_si256::<[i16; 16]>(coeffs[base..base + 16].try_into().unwrap());
hi[i] = _mm256_loadu_si256::<[i16; 16]>(coeffs[base + 16..base + 32].try_into().unwrap());
}
let shift1 = _mm_cvtsi32_si128(7);
let add1 = _mm256_set1_epi32(1 << 6);
let d_lo = idct32_1d_columns(_token, &lo, shift1, add1);
let d_hi = idct32_1d_columns(_token, &hi, shift1, add1);
let (t_lo, t_hi) = transpose_32x32(_token, &d_lo, &d_hi);
let shift2 = 20 - bit_depth as i32;
let shift2_v = _mm_cvtsi32_si128(shift2);
let add2 = _mm256_set1_epi32(1 << (shift2 - 1));
let e_lo = idct32_1d_columns(_token, &t_lo, shift2_v, add2);
let e_hi = idct32_1d_columns(_token, &t_hi, shift2_v, add2);
let (f_lo, f_hi) = transpose_32x32(_token, &e_lo, &e_hi);
for i in 0..32 {
let base = i * 32;
_mm256_storeu_si256::<[i16; 16]>(
(&mut output[base..base + 16]).try_into().unwrap(),
f_lo[i],
);
_mm256_storeu_si256::<[i16; 16]>(
(&mut output[base + 16..base + 32]).try_into().unwrap(),
f_hi[i],
);
}
}
pub(crate) fn idct32_scalar(
_token: ScalarToken,
coeffs: &[i16; 1024],
output: &mut [i16; 1024],
bit_depth: u8,
) {
super::transform::idct32_inner(coeffs, output, bit_depth);
}
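/// 4x4 inverse DST (coefficients 29/55/74/84): column pass with shift 7,
/// transpose, row pass with shift `20 - bit_depth`, transpose, store.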
#[cfg(target_arch = "x86_64")]
#[arcane]
pub(crate) fn idst4_v3(
_token: X64V3Token,
coeffs: &[i16; 16],
output: &mut [i16; 16],
bit_depth: u8,
) {
let load01 = _mm_loadu_si128::<[i16; 8]>(coeffs[0..8].try_into().unwrap());
let load23 = _mm_loadu_si128::<[i16; 8]>(coeffs[8..16].try_into().unwrap());
let row0 = _mm_cvtepi16_epi32(load01);
let row1 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(load01, load01));
let row2 = _mm_cvtepi16_epi32(load23);
let row3 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(load23, load23));
let add1 = _mm_set1_epi32(64);
let t0 = _mm_srai_epi32::<7>(_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(row0, _mm_set1_epi32(29)),
_mm_mullo_epi32(row1, _mm_set1_epi32(74)),
),
_mm_add_epi32(
_mm_mullo_epi32(row2, _mm_set1_epi32(84)),
_mm_mullo_epi32(row3, _mm_set1_epi32(55)),
),
),
add1,
));
let t1 = _mm_srai_epi32::<7>(_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(row0, _mm_set1_epi32(55)),
_mm_mullo_epi32(row1, _mm_set1_epi32(74)),
),
_mm_add_epi32(
_mm_mullo_epi32(row2, _mm_set1_epi32(-29)),
_mm_mullo_epi32(row3, _mm_set1_epi32(-84)),
),
),
add1,
));
let t2 = _mm_srai_epi32::<7>(_mm_add_epi32(
_mm_mullo_epi32(
_mm_add_epi32(_mm_sub_epi32(row0, row2), row3),
_mm_set1_epi32(74),
),
add1,
));
let t3 = _mm_srai_epi32::<7>(_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(row0, _mm_set1_epi32(84)),
_mm_mullo_epi32(row1, _mm_set1_epi32(-74)),
),
_mm_add_epi32(
_mm_mullo_epi32(row2, _mm_set1_epi32(55)),
_mm_mullo_epi32(row3, _mm_set1_epi32(-29)),
),
),
add1,
));
let packed01 = _mm_packs_epi32(t0, t1);
let packed23 = _mm_packs_epi32(t2, t3);
let a = _mm_unpacklo_epi16(packed01, packed23);
let b = _mm_unpackhi_epi16(packed01, packed23);
let tp_lo = _mm_unpacklo_epi16(a, b);
let tp_hi = _mm_unpackhi_epi16(a, b);
let r0 = _mm_cvtepi16_epi32(tp_lo);
let r1 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(tp_lo, tp_lo));
let r2 = _mm_cvtepi16_epi32(tp_hi);
let r3 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(tp_hi, tp_hi));
let shift2 = 20 - bit_depth as i32;
let add2 = _mm_set1_epi32(1i32 << (shift2 - 1));
let shift2_v = _mm_cvtsi32_si128(shift2);
let o0 = _mm_sra_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(r0, _mm_set1_epi32(29)),
_mm_mullo_epi32(r1, _mm_set1_epi32(74)),
),
_mm_add_epi32(
_mm_mullo_epi32(r2, _mm_set1_epi32(84)),
_mm_mullo_epi32(r3, _mm_set1_epi32(55)),
),
),
add2,
),
shift2_v,
);
let o1 = _mm_sra_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(r0, _mm_set1_epi32(55)),
_mm_mullo_epi32(r1, _mm_set1_epi32(74)),
),
_mm_add_epi32(
_mm_mullo_epi32(r2, _mm_set1_epi32(-29)),
_mm_mullo_epi32(r3, _mm_set1_epi32(-84)),
),
),
add2,
),
shift2_v,
);
let o2 = _mm_sra_epi32(
_mm_add_epi32(
_mm_mullo_epi32(_mm_add_epi32(_mm_sub_epi32(r0, r2), r3), _mm_set1_epi32(74)),
add2,
),
shift2_v,
);
let o3 = _mm_sra_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_add_epi32(
_mm_mullo_epi32(r0, _mm_set1_epi32(84)),
_mm_mullo_epi32(r1, _mm_set1_epi32(-74)),
),
_mm_add_epi32(
_mm_mullo_epi32(r2, _mm_set1_epi32(55)),
_mm_mullo_epi32(r3, _mm_set1_epi32(-29)),
),
),
add2,
),
shift2_v,
);
let out01 = _mm_packs_epi32(o0, o1);
let out23 = _mm_packs_epi32(o2, o3);
let a = _mm_unpacklo_epi16(out01, out23);
let b = _mm_unpackhi_epi16(out01, out23);
let final_lo = _mm_unpacklo_epi16(a, b);
let final_hi = _mm_unpackhi_epi16(a, b);
_mm_storeu_si128::<[i16; 8]>((&mut output[0..8]).try_into().unwrap(), final_lo);
_mm_storeu_si128::<[i16; 8]>((&mut output[8..16]).try_into().unwrap(), final_hi);
}
pub(crate) fn idst4_scalar(
_token: ScalarToken,
coeffs: &[i16; 16],
output: &mut [i16; 16],
bit_depth: u8,
) {
super::transform::idst4_inner(coeffs, output, bit_depth);
}
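/// Adds a `size` x `size` residual block to the reconstruction plane at
/// `(x0, y0)`, clamping every sample to `[0, max_val]`. Full 16-sample
/// chunks use AVX2; any tail is handled scalar.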
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[arcane]
pub(crate) fn add_residual_block_v3(
_token: X64V3Token,
plane: &mut [u16],
stride: usize,
x0: usize,
y0: usize,
residual: &[i16],
size: usize,
max_val: i32,
) {
let zero = _mm256_setzero_si256();
let max_v = _mm256_set1_epi16(max_val as i16);
for py in 0..size {
let row_start = (y0 + py) * stride + x0;
let row = &mut plane[row_start..row_start + size];
let res_row = &residual[py * size..(py + 1) * size];
let chunks = size / 16;
for c in 0..chunks {
let offset = c * 16;
let pred =
_mm256_loadu_si256::<[u16; 16]>(row[offset..offset + 16].try_into().unwrap());
let res =
_mm256_loadu_si256::<[i16; 16]>(res_row[offset..offset + 16].try_into().unwrap());
let sum = _mm256_add_epi16(pred, res);
let clamped = _mm256_min_epi16(_mm256_max_epi16(sum, zero), max_v);
_mm256_storeu_si256::<[u16; 16]>(
(&mut row[offset..offset + 16]).try_into().unwrap(),
clamped,
);
}
for i in (chunks * 16)..size {
let pred = row[i] as i32;
let r = res_row[i] as i32;
row[i] = (pred + r).clamp(0, max_val) as u16;
}
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn add_residual_block_scalar(
_token: ScalarToken,
plane: &mut [u16],
stride: usize,
x0: usize,
y0: usize,
residual: &[i16],
size: usize,
max_val: i32,
) {
for py in 0..size {
let row_start = (y0 + py) * stride + x0;
let row = &mut plane[row_start..row_start + size];
let res_row = &residual[py * size..(py + 1) * size];
for (out, &r) in row.iter_mut().zip(res_row.iter()) {
let pred = *out as i32;
*out = (pred + r as i32).clamp(0, max_val) as u16;
}
}
}
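/// Dequantizes coefficients in place as
/// `(coef * combined_scale + add) >> shift`, saturating to `i16`. Full
/// 16-coefficient chunks use AVX2; the tail is handled scalar.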
#[cfg(target_arch = "x86_64")]
#[arcane]
pub(crate) fn dequantize_v3(
_token: X64V3Token,
coeffs: &mut [i16],
combined_scale: i32,
shift: i32,
add: i32,
) {
let scale_v = _mm256_set1_epi32(combined_scale);
let add_v = _mm256_set1_epi32(add);
let shift_v = _mm_cvtsi32_si128(shift);
let chunks = coeffs.len() / 16;
for c in 0..chunks {
let offset = c * 16;
let src = _mm256_loadu_si256::<[i16; 16]>(coeffs[offset..offset + 16].try_into().unwrap());
let lo_128 = _mm256_castsi256_si128(src);
let hi_128 = _mm256_extracti128_si256::<1>(src);
let lo_32 = _mm256_cvtepi16_epi32(lo_128);
let hi_32 = _mm256_cvtepi16_epi32(hi_128);
let prod_lo = _mm256_mullo_epi32(lo_32, scale_v);
let prod_hi = _mm256_mullo_epi32(hi_32, scale_v);
let shifted_lo = _mm256_sra_epi32(_mm256_add_epi32(prod_lo, add_v), shift_v);
let shifted_hi = _mm256_sra_epi32(_mm256_add_epi32(prod_hi, add_v), shift_v);
let packed = _mm256_packs_epi32(shifted_lo, shifted_hi);
let result = _mm256_permute4x64_epi64::<0xD8>(packed);
_mm256_storeu_si256::<[i16; 16]>(
(&mut coeffs[offset..offset + 16]).try_into().unwrap(),
result,
);
}
for coef in coeffs.iter_mut().skip(chunks * 16) {
let value = (*coef as i32 * combined_scale + add) >> shift;
*coef = value.clamp(-32768, 32767) as i16;
}
}
pub(crate) fn dequantize_scalar(
_token: ScalarToken,
coeffs: &mut [i16],
combined_scale: i32,
shift: i32,
add: i32,
) {
for coef in coeffs.iter_mut() {
let value = (*coef as i32 * combined_scale + add) >> shift;
*coef = value.clamp(-32768, 32767) as i16;
}
}
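/// Portable residual addition used by the wasm32 entry point, written
/// against the generic `magetypes` `i16x8`/`u16x8` vectors: 8-sample chunks
/// plus a scalar tail.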
#[cfg(target_arch = "wasm32")]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn add_residual_block_generic<T>(
token: T,
plane: &mut [u16],
stride: usize,
x0: usize,
y0: usize,
residual: &[i16],
size: usize,
max_val: i32,
) where
T: magetypes::simd::backends::I16x8Bitcast,
{
use magetypes::simd::generic::{i16x8, u16x8};
let zero = i16x8::zero(token);
let max_v = i16x8::splat(token, max_val as i16);
for py in 0..size {
let row_start = (y0 + py) * stride + x0;
let row = &mut plane[row_start..row_start + size];
let res_row = &residual[py * size..(py + 1) * size];
let chunks = size / 8;
for c in 0..chunks {
let offset = c * 8;
let pred_u = u16x8::load(token, row[offset..offset + 8].try_into().unwrap());
let pred = pred_u.bitcast_i16x8();
let res = i16x8::load(token, res_row[offset..offset + 8].try_into().unwrap());
let sum = pred + res;
let clamped = sum.max(zero).min(max_v);
let clamped_u = clamped.bitcast_u16x8();
clamped_u.store((&mut row[offset..offset + 8]).try_into().unwrap());
}
for i in (chunks * 8)..size {
let pred = row[i] as i32;
let r = res_row[i] as i32;
row[i] = (pred + r).clamp(0, max_val) as u16;
}
}
}
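// Thin wrappers over the wasm32 SIMD intrinsics shared by the kernels below.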
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn wasm_extend_low_i16(v: v128) -> v128 {
i32x4_extend_low_i16x8(v)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn wasm_extend_high_i16(v: v128) -> v128 {
i32x4_extend_high_i16x8(v)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn wasm_narrow_i32_to_i16(lo: v128, hi: v128) -> v128 {
i16x8_narrow_i32x4(lo, hi)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn wasm_transpose_4x4_i16(ab: v128, cd: v128) -> (v128, v128) {
let a = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(ab, cd);
let b = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(ab, cd);
let lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a, b);
let hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a, b);
(lo, hi)
}
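/// wasm32 SIMD128 4x4 inverse DST; mirrors `idst4_v3`.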
#[cfg(target_arch = "wasm32")]
#[arcane]
pub(crate) fn idst4_wasm128(
_token: Wasm128Token,
coeffs: &[i16; 16],
output: &mut [i16; 16],
bit_depth: u8,
) {
let load01 = v128_load::<[i16; 8]>(coeffs[0..8].try_into().unwrap());
let load23 = v128_load::<[i16; 8]>(coeffs[8..16].try_into().unwrap());
let row0 = wasm_extend_low_i16(load01);
let row1 = wasm_extend_high_i16(load01);
let row2 = wasm_extend_low_i16(load23);
let row3 = wasm_extend_high_i16(load23);
let add1 = i32x4_splat(64);
let t0 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(row0, i32x4_splat(29)),
i32x4_mul(row1, i32x4_splat(74)),
),
i32x4_add(
i32x4_mul(row2, i32x4_splat(84)),
i32x4_mul(row3, i32x4_splat(55)),
),
),
add1,
),
7,
);
let t1 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(row0, i32x4_splat(55)),
i32x4_mul(row1, i32x4_splat(74)),
),
i32x4_add(
i32x4_mul(row2, i32x4_splat(-29)),
i32x4_mul(row3, i32x4_splat(-84)),
),
),
add1,
),
7,
);
let t2 = i32x4_shr(
i32x4_add(
i32x4_mul(i32x4_add(i32x4_sub(row0, row2), row3), i32x4_splat(74)),
add1,
),
7,
);
let t3 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(row0, i32x4_splat(84)),
i32x4_mul(row1, i32x4_splat(-74)),
),
i32x4_add(
i32x4_mul(row2, i32x4_splat(55)),
i32x4_mul(row3, i32x4_splat(-29)),
),
),
add1,
),
7,
);
let packed01 = wasm_narrow_i32_to_i16(t0, t1);
let packed23 = wasm_narrow_i32_to_i16(t2, t3);
let (tp_lo, tp_hi) = wasm_transpose_4x4_i16(packed01, packed23);
let r0 = wasm_extend_low_i16(tp_lo);
let r1 = wasm_extend_high_i16(tp_lo);
let r2 = wasm_extend_low_i16(tp_hi);
let r3 = wasm_extend_high_i16(tp_hi);
let shift2 = (20 - bit_depth as i32) as u32;
let add2 = i32x4_splat(1i32 << (shift2 - 1));
let o0 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(r0, i32x4_splat(29)),
i32x4_mul(r1, i32x4_splat(74)),
),
i32x4_add(
i32x4_mul(r2, i32x4_splat(84)),
i32x4_mul(r3, i32x4_splat(55)),
),
),
add2,
),
shift2,
);
let o1 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(r0, i32x4_splat(55)),
i32x4_mul(r1, i32x4_splat(74)),
),
i32x4_add(
i32x4_mul(r2, i32x4_splat(-29)),
i32x4_mul(r3, i32x4_splat(-84)),
),
),
add2,
),
shift2,
);
let o2 = i32x4_shr(
i32x4_add(
i32x4_mul(i32x4_add(i32x4_sub(r0, r2), r3), i32x4_splat(74)),
add2,
),
shift2,
);
let o3 = i32x4_shr(
i32x4_add(
i32x4_add(
i32x4_add(
i32x4_mul(r0, i32x4_splat(84)),
i32x4_mul(r1, i32x4_splat(-74)),
),
i32x4_add(
i32x4_mul(r2, i32x4_splat(55)),
i32x4_mul(r3, i32x4_splat(-29)),
),
),
add2,
),
shift2,
);
let out01 = wasm_narrow_i32_to_i16(o0, o1);
let out23 = wasm_narrow_i32_to_i16(o2, o3);
let (final_lo, final_hi) = wasm_transpose_4x4_i16(out01, out23);
v128_store::<[i16; 8]>((&mut output[0..8]).try_into().unwrap(), final_lo);
v128_store::<[i16; 8]>((&mut output[8..16]).try_into().unwrap(), final_hi);
}
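/// One 1-D pass of the 8-point inverse DCT over four columns, using widened
/// `i32x4` arithmetic instead of a `madd`-style pairing.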
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn idct8_1d_4col_wasm(src: [v128; 8], shift: u32, add: v128) -> [v128; 8] {
let o0 = i32x4_add(
i32x4_add(
i32x4_mul(src[1], i32x4_splat(89)),
i32x4_mul(src[3], i32x4_splat(75)),
),
i32x4_add(
i32x4_mul(src[5], i32x4_splat(50)),
i32x4_mul(src[7], i32x4_splat(18)),
),
);
let o1 = i32x4_add(
i32x4_add(
i32x4_mul(src[1], i32x4_splat(75)),
i32x4_mul(src[3], i32x4_splat(-18)),
),
i32x4_add(
i32x4_mul(src[5], i32x4_splat(-89)),
i32x4_mul(src[7], i32x4_splat(-50)),
),
);
let o2 = i32x4_add(
i32x4_add(
i32x4_mul(src[1], i32x4_splat(50)),
i32x4_mul(src[3], i32x4_splat(-89)),
),
i32x4_add(
i32x4_mul(src[5], i32x4_splat(18)),
i32x4_mul(src[7], i32x4_splat(75)),
),
);
let o3 = i32x4_add(
i32x4_add(
i32x4_mul(src[1], i32x4_splat(18)),
i32x4_mul(src[3], i32x4_splat(-50)),
),
i32x4_add(
i32x4_mul(src[5], i32x4_splat(75)),
i32x4_mul(src[7], i32x4_splat(-89)),
),
);
let r0_64 = i32x4_mul(src[0], i32x4_splat(64));
let r4_64 = i32x4_mul(src[4], i32x4_splat(64));
let ee0 = i32x4_add(r0_64, r4_64);
let ee1 = i32x4_sub(r0_64, r4_64);
let eo0 = i32x4_add(
i32x4_mul(src[2], i32x4_splat(83)),
i32x4_mul(src[6], i32x4_splat(36)),
);
let eo1 = i32x4_sub(
i32x4_mul(src[2], i32x4_splat(36)),
i32x4_mul(src[6], i32x4_splat(83)),
);
let e0 = i32x4_add(ee0, eo0);
let e1 = i32x4_add(ee1, eo1);
let e2 = i32x4_sub(ee1, eo1);
let e3 = i32x4_sub(ee0, eo0);
[
i32x4_shr(i32x4_add(i32x4_add(e0, o0), add), shift),
i32x4_shr(i32x4_add(i32x4_add(e1, o1), add), shift),
i32x4_shr(i32x4_add(i32x4_add(e2, o2), add), shift),
i32x4_shr(i32x4_add(i32x4_add(e3, o3), add), shift),
i32x4_shr(i32x4_add(i32x4_sub(e3, o3), add), shift),
i32x4_shr(i32x4_add(i32x4_sub(e2, o2), add), shift),
i32x4_shr(i32x4_add(i32x4_sub(e1, o1), add), shift),
i32x4_shr(i32x4_add(i32x4_sub(e0, o0), add), shift),
]
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn transpose_8x8_wasm(rows: &[v128; 8]) -> [v128; 8] {
let t0 = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(rows[0], rows[1]);
let t1 = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(rows[0], rows[1]);
let t2 = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(rows[2], rows[3]);
let t3 = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(rows[2], rows[3]);
let t4 = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(rows[4], rows[5]);
let t5 = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(rows[4], rows[5]);
let t6 = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(rows[6], rows[7]);
let t7 = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(rows[6], rows[7]);
let u0 = i32x4_shuffle::<0, 4, 1, 5>(t0, t2);
let u1 = i32x4_shuffle::<2, 6, 3, 7>(t0, t2);
let u2 = i32x4_shuffle::<0, 4, 1, 5>(t1, t3);
let u3 = i32x4_shuffle::<2, 6, 3, 7>(t1, t3);
let u4 = i32x4_shuffle::<0, 4, 1, 5>(t4, t6);
let u5 = i32x4_shuffle::<2, 6, 3, 7>(t4, t6);
let u6 = i32x4_shuffle::<0, 4, 1, 5>(t5, t7);
let u7 = i32x4_shuffle::<2, 6, 3, 7>(t5, t7);
[
i64x2_shuffle::<0, 2>(u0, u4),
i64x2_shuffle::<1, 3>(u0, u4),
i64x2_shuffle::<0, 2>(u1, u5),
i64x2_shuffle::<1, 3>(u1, u5),
i64x2_shuffle::<0, 2>(u2, u6),
i64x2_shuffle::<1, 3>(u2, u6),
i64x2_shuffle::<0, 2>(u3, u7),
i64x2_shuffle::<1, 3>(u3, u7),
]
}
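/// wasm32 SIMD128 8x8 inverse DCT: each pass widens the low and high column
/// halves, runs the 4-column kernel on both, narrows back to `i16`, and
/// transposes between passes.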
#[cfg(target_arch = "wasm32")]
#[arcane]
pub(crate) fn idct8_wasm128(
_token: Wasm128Token,
coeffs: &[i16; 64],
output: &mut [i16; 64],
bit_depth: u8,
) {
let rows: [v128; 8] = core::array::from_fn(|i| {
v128_load::<[i16; 8]>(coeffs[i * 8..(i + 1) * 8].try_into().unwrap())
});
let lo_src: [v128; 8] = core::array::from_fn(|i| wasm_extend_low_i16(rows[i]));
let add1 = i32x4_splat(1 << 6);
let lo_out = idct8_1d_4col_wasm(lo_src, 7, add1);
let hi_src: [v128; 8] = core::array::from_fn(|i| wasm_extend_high_i16(rows[i]));
let hi_out = idct8_1d_4col_wasm(hi_src, 7, add1);
let pass1: [v128; 8] = core::array::from_fn(|i| wasm_narrow_i32_to_i16(lo_out[i], hi_out[i]));
let transposed = transpose_8x8_wasm(&pass1);
let shift2 = (20 - bit_depth as i32) as u32;
let add2 = i32x4_splat(1i32 << (shift2 - 1));
let lo_src2: [v128; 8] = core::array::from_fn(|i| wasm_extend_low_i16(transposed[i]));
let lo_out2 = idct8_1d_4col_wasm(lo_src2, shift2, add2);
let hi_src2: [v128; 8] = core::array::from_fn(|i| wasm_extend_high_i16(transposed[i]));
let hi_out2 = idct8_1d_4col_wasm(hi_src2, shift2, add2);
let pass2: [v128; 8] = core::array::from_fn(|i| wasm_narrow_i32_to_i16(lo_out2[i], hi_out2[i]));
let final_rows = transpose_8x8_wasm(&pass2);
for i in 0..8 {
v128_store::<[i16; 8]>(
(&mut output[i * 8..(i + 1) * 8]).try_into().unwrap(),
final_rows[i],
);
}
}
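/// No SIMD128 kernel for the 16x16 inverse DCT yet; delegates to the scalar
/// implementation (as does `idct32_wasm128` below).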
#[cfg(target_arch = "wasm32")]
pub(crate) fn idct16_wasm128(
_token: Wasm128Token,
coeffs: &[i16; 256],
output: &mut [i16; 256],
bit_depth: u8,
) {
super::transform::idct16_inner(coeffs, output, bit_depth);
}
#[cfg(target_arch = "wasm32")]
pub(crate) fn idct32_wasm128(
_token: Wasm128Token,
coeffs: &[i16; 1024],
output: &mut [i16; 1024],
bit_depth: u8,
) {
super::transform::idct32_inner(coeffs, output, bit_depth);
}
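/// wasm32 SIMD128 dequantization; mirrors `dequantize_v3` with 8-coefficient
/// chunks and a scalar tail.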
#[cfg(target_arch = "wasm32")]
#[arcane]
pub(crate) fn dequantize_wasm128(
_token: Wasm128Token,
coeffs: &mut [i16],
combined_scale: i32,
shift: i32,
add: i32,
) {
let scale_v = i32x4_splat(combined_scale);
let add_v = i32x4_splat(add);
let shift_u = shift as u32;
let chunks = coeffs.len() / 8;
for c in 0..chunks {
let offset = c * 8;
let src: v128 = v128_load::<[i16; 8]>(coeffs[offset..offset + 8].try_into().unwrap());
let lo_32 = wasm_extend_low_i16(src);
let hi_32 = wasm_extend_high_i16(src);
let prod_lo = i32x4_mul(lo_32, scale_v);
let prod_hi = i32x4_mul(hi_32, scale_v);
let shifted_lo = i32x4_shr(i32x4_add(prod_lo, add_v), shift_u);
let shifted_hi = i32x4_shr(i32x4_add(prod_hi, add_v), shift_u);
let result = wasm_narrow_i32_to_i16(shifted_lo, shifted_hi);
v128_store::<[i16; 8]>(
(&mut coeffs[offset..offset + 8]).try_into().unwrap(),
result,
);
}
for coef in coeffs.iter_mut().skip(chunks * 8) {
let value = (*coef as i32 * combined_scale + add) >> shift;
*coef = value.clamp(-32768, 32767) as i16;
}
}
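/// wasm32 entry point for residual addition; forwards to the generic kernel.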
#[cfg(target_arch = "wasm32")]
#[allow(clippy::too_many_arguments)]
pub(crate) fn add_residual_block_wasm128(
token: Wasm128Token,
plane: &mut [u16],
stride: usize,
x0: usize,
y0: usize,
residual: &[i16],
size: usize,
max_val: i32,
) {
add_residual_block_generic(token, plane, stride, x0, y0, residual, size, max_val);
}