#![allow(clippy::too_many_arguments)]
#![cfg_attr(not(feature = "std"), allow(dead_code))]
#![cfg_attr(not(feature = "std"), allow(dead_code))]
use archmage::prelude::*;
#[cfg(target_arch = "aarch64")]
use archmage::intrinsics::aarch64 as simd_mem;
#[cfg(target_arch = "x86")]
use archmage::intrinsics::x86 as simd_mem;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem;
use core::convert::TryFrom;
#[inline(always)]
/// Borrow a flat 4x4 block as its four rows.
fn rows4<T>(a: &[T; 16]) -> (&[T; 4], &[T; 4], &[T; 4], &[T; 4]) {
    // Split the halves first, then each half into its two rows; the array
    // length is fixed, so every conversion is infallible.
    let (top, bottom) = a.split_at(8);
    let (r0, r1) = top.split_at(4);
    let (r2, r3) = bottom.split_at(4);
    (
        r0.try_into().unwrap(),
        r1.try_into().unwrap(),
        r2.try_into().unwrap(),
        r3.try_into().unwrap(),
    )
}
#[inline(always)]
/// Mutably borrow a flat 4x4 block as its four rows.
fn rows4_mut<T>(a: &mut [T; 16]) -> (&mut [T; 4], &mut [T; 4], &mut [T; 4], &mut [T; 4]) {
    // `split_at_mut` hands out disjoint borrows, so all four rows can be
    // returned simultaneously.
    let (top, bottom) = a.split_at_mut(8);
    let (r0, r1) = top.split_at_mut(4);
    let (r2, r3) = bottom.split_at_mut(4);
    (
        r0.try_into().unwrap(),
        r1.try_into().unwrap(),
        r2.try_into().unwrap(),
        r3.try_into().unwrap(),
    )
}
#[inline(always)]
/// Borrow a 16-element array as its low and high 8-element halves.
fn halves8<T>(a: &[T; 16]) -> (&[T; 8], &[T; 8]) {
    let (low, high) = a.split_at(8);
    (low.try_into().unwrap(), high.try_into().unwrap())
}
#[inline(always)]
/// Mutably borrow a 16-element array as its low and high 8-element halves.
fn halves8_mut<T>(a: &mut [T; 16]) -> (&mut [T; 8], &mut [T; 8]) {
    let (low, high) = a.split_at_mut(8);
    (low.try_into().unwrap(), high.try_into().unwrap())
}
// Fixed-point multipliers used by the scalar 4x4 (i)DCT below (Q16 shifts).
// NOTE(review): 20091 and 35468 look like the standard VP8 transform
// constants (sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) scaled by 2^16, with the
// integer part handled separately via the `x + ((x * CONST1) >> 16)` pattern)
// — confirm against the VP8 specification.
const CONST1: i64 = 20091;
const CONST2: i64 = 35468;
#[inline(always)]
/// DC-only inverse transform: when all AC coefficients are zero the whole
/// block decodes to the rounded DC value `(dc + 4) >> 3`.
pub(crate) fn idct4x4_dc(block: &mut [i32; 16]) {
    let dc = (block[0] + 4) >> 3;
    for cell in block.iter_mut() {
        *cell = dc;
    }
}
#[inline(always)]
/// 4x4 inverse DCT, in place; forwards to the SIMD-dispatched implementation.
pub(crate) fn idct4x4(block: &mut [i32; 16]) {
idct4x4_intrinsics(block);
}
#[inline(always)]
/// Token-taking variant of [`idct4x4`]. The token is currently unused by the
/// callee (see `idct4x4_intrinsics_with_token`), which re-dispatches anyway.
#[allow(dead_code)] pub(crate) fn idct4x4_with_token(
block: &mut [i32; 16],
simd_token: super::prediction::SimdTokenType,
) {
idct4x4_intrinsics_with_token(block, simd_token);
}
#[archmage::autoversion(cfg(simd))]
#[allow(dead_code)]
/// Scalar 4x4 inverse DCT, in place, using Q16 fixed-point multipliers
/// `CONST1`/`CONST2`. Pass 1 works on columns, pass 2 on rows; only the
/// second pass applies the final `(x + 4) >> 3` rounding.
pub(crate) fn idct4x4_scalar(block: &mut [i32; 16]) {
// Widen on read so products with the Q16 constants cannot overflow i32.
fn fetch(block: &[i32], idx: usize) -> i64 {
i64::from(block[idx])
}
// Pass 1: columns. `x + ((x * CONST1) >> 16)` realizes a multiplier with an
// integer part of 1 while keeping the constant in 16 bits.
for i in 0usize..4 {
let a1 = fetch(block, i) + fetch(block, 8 + i);
let b1 = fetch(block, i) - fetch(block, 8 + i);
let t1 = (fetch(block, 4 + i) * CONST2) >> 16;
let t2 = fetch(block, 12 + i) + ((fetch(block, 12 + i) * CONST1) >> 16);
let c1 = t1 - t2;
let t1 = fetch(block, 4 + i) + ((fetch(block, 4 + i) * CONST1) >> 16);
let t2 = (fetch(block, 12 + i) * CONST2) >> 16;
let d1 = t1 + t2;
// Note the butterfly write order: rows 2 and 3 receive the "minus"
// outputs in swapped positions.
block[i] = (a1 + d1) as i32;
block[4 + i] = (b1 + c1) as i32;
block[4 * 3 + i] = (a1 - d1) as i32;
block[4 * 2 + i] = (b1 - c1) as i32;
}
// Pass 2: rows, with the final rounding shift.
for i in 0usize..4 {
let a1 = fetch(block, 4 * i) + fetch(block, 4 * i + 2);
let b1 = fetch(block, 4 * i) - fetch(block, 4 * i + 2);
let t1 = (fetch(block, 4 * i + 1) * CONST2) >> 16;
let t2 = fetch(block, 4 * i + 3) + ((fetch(block, 4 * i + 3) * CONST1) >> 16);
let c1 = t1 - t2;
let t1 = fetch(block, 4 * i + 1) + ((fetch(block, 4 * i + 1) * CONST1) >> 16);
let t2 = (fetch(block, 4 * i + 3) * CONST2) >> 16;
let d1 = t1 + t2;
block[4 * i] = ((a1 + d1 + 4) >> 3) as i32;
block[4 * i + 3] = ((a1 - d1 + 4) >> 3) as i32;
block[4 * i + 1] = ((b1 + c1 + 4) >> 3) as i32;
block[4 * i + 2] = ((b1 - c1 + 4) >> 3) as i32;
}
}
#[archmage::autoversion(cfg(simd))]
/// Inverse Walsh-Hadamard transform of the 4x4 DC coefficient block, in
/// place. Vertical butterflies first, then horizontal butterflies with the
/// final `(x + 3) >> 3` rounding.
pub(crate) fn iwht4x4(block: &mut [i32; 16]) {
    // Pass 1: columns.
    for col in 0..4usize {
        let t0 = block[col] + block[col + 12];
        let t1 = block[col + 4] + block[col + 8];
        let t2 = block[col + 4] - block[col + 8];
        let t3 = block[col] - block[col + 12];
        block[col] = t0 + t1;
        block[col + 4] = t2 + t3;
        block[col + 8] = t0 - t1;
        block[col + 12] = t3 - t2;
    }
    // Pass 2: rows, with rounding.
    for row in block.chunks_exact_mut(4) {
        let s0 = row[0] + row[3];
        let s1 = row[1] + row[2];
        let s2 = row[1] - row[2];
        let s3 = row[0] - row[3];
        row[0] = (s0 + s1 + 3) >> 3;
        row[1] = (s2 + s3 + 3) >> 3;
        row[2] = (s0 - s1 + 3) >> 3;
        row[3] = (s3 - s2 + 3) >> 3;
    }
}
#[archmage::autoversion(cfg(simd))]
/// Forward Walsh-Hadamard transform of the 4x4 DC block, in place.
/// Horizontal butterflies first, then vertical butterflies whose outputs are
/// halved with a +1 bias applied only to positive values.
pub(crate) fn wht4x4(block: &mut [i32; 16]) {
    // Widen to i64 so intermediate sums cannot overflow.
    let get = |b: &[i32; 16], idx: usize| i64::from(b[idx]);
    // Pass 1: rows.
    for r in 0..4 {
        let base = r * 4;
        let a = get(block, base) + get(block, base + 3);
        let b = get(block, base + 1) + get(block, base + 2);
        let c = get(block, base + 1) - get(block, base + 2);
        let d = get(block, base) - get(block, base + 3);
        block[base] = (a + b) as i32;
        block[base + 1] = (c + d) as i32;
        block[base + 2] = (a - b) as i32;
        block[base + 3] = (d - c) as i32;
    }
    // Pass 2: columns; `halve` rounds positive values up by biasing before
    // the truncating division.
    for col in 0..4 {
        let a1 = get(block, col) + get(block, col + 12);
        let b1 = get(block, col + 4) + get(block, col + 8);
        let c1 = get(block, col + 4) - get(block, col + 8);
        let d1 = get(block, col) - get(block, col + 12);
        let halve = |v: i64| (v + i64::from(v > 0)) / 2;
        block[col] = halve(a1 + b1) as i32;
        block[col + 4] = halve(c1 + d1) as i32;
        block[col + 8] = halve(a1 - b1) as i32;
        block[col + 12] = halve(d1 - c1) as i32;
    }
}
#[inline(always)]
/// Forward 4x4 DCT, in place; forwards to the SIMD-dispatched implementation.
pub(crate) fn dct4x4(block: &mut [i32; 16]) {
dct4x4_intrinsics(block);
}
#[archmage::autoversion(cfg(simd))]
#[allow(dead_code)]
/// Scalar forward 4x4 DCT, in place. Rows first (inputs pre-scaled by 8),
/// then columns with the final `>> 16` descale.
/// NOTE(review): 2217/5352 and the rounding biases (14500/7500/12000/51000)
/// look like the standard VP8 forward-transform constants — confirm against
/// the reference implementation.
pub(crate) fn dct4x4_scalar(block: &mut [i32; 16]) {
// Widen on read so the multiplications cannot overflow i32.
fn fetch(block: &[i32], idx: usize) -> i64 {
i64::from(block[idx])
}
// Pass 1: rows.
for i in 0..4 {
let a = (fetch(block, i * 4) + fetch(block, i * 4 + 3)) * 8;
let b = (fetch(block, i * 4 + 1) + fetch(block, i * 4 + 2)) * 8;
let c = (fetch(block, i * 4 + 1) - fetch(block, i * 4 + 2)) * 8;
let d = (fetch(block, i * 4) - fetch(block, i * 4 + 3)) * 8;
block[i * 4] = (a + b) as i32;
block[i * 4 + 2] = (a - b) as i32;
block[i * 4 + 1] = ((c * 2217 + d * 5352 + 14500) >> 12) as i32;
block[i * 4 + 3] = ((d * 2217 - c * 5352 + 7500) >> 12) as i32;
}
// Pass 2: columns. The `if d != 0 { 1 }` term nudges odd rows; the SSE2
// path folds the same correction into its rounding constant.
for i in 0..4 {
let a = fetch(block, i) + fetch(block, i + 12);
let b = fetch(block, i + 4) + fetch(block, i + 8);
let c = fetch(block, i + 4) - fetch(block, i + 8);
let d = fetch(block, i) - fetch(block, i + 12);
block[i] = ((a + b + 7) >> 4) as i32;
block[i + 8] = ((a - b + 7) >> 4) as i32;
block[i + 4] = (((c * 2217 + d * 5352 + 12000) >> 16) + if d != 0 { 1 } else { 0 }) as i32;
block[i + 12] = ((d * 2217 - c * 5352 + 51000) >> 16) as i32;
}
}
#[inline(always)]
/// Runtime-dispatched forward DCT: `incant!` routes to one of the
/// `dct4x4_dispatch_*` arms below (x86-64 v3, NEON, wasm128, or scalar).
pub(crate) fn dct4x4_intrinsics(block: &mut [i32; 16]) {
incant!(dct4x4_dispatch(block), [v3, neon, wasm128, scalar]);
}
#[inline(always)]
/// Runtime-dispatched inverse DCT: `incant!` routes to one of the
/// `idct4x4_dispatch_*` arms below (x86-64 v3, NEON, wasm128, or scalar).
pub(crate) fn idct4x4_intrinsics(block: &mut [i32; 16]) {
incant!(idct4x4_dispatch(block), [v3, neon, wasm128, scalar]);
}
#[allow(dead_code)]
#[inline(always)]
/// Token-taking IDCT shim: the token is ignored and the call is re-dispatched
/// through [`idct4x4_intrinsics`].
/// NOTE(review): the token type is spelled `crate::common::prediction::...`
/// here but `super::prediction::...` at `idct4x4_with_token` — confirm both
/// paths name the same type.
pub(crate) fn idct4x4_intrinsics_with_token(
block: &mut [i32; 16],
_simd_token: crate::common::prediction::SimdTokenType,
) {
idct4x4_intrinsics(block);
}
#[allow(dead_code)]
/// Forward DCT of two blocks through a single dispatch, so the backend check
/// is paid once per pair.
pub(crate) fn dct4x4_two_intrinsics(block1: &mut [i32; 16], block2: &mut [i32; 16]) {
incant!(
dct4x4_two_dispatch(block1, block2),
[v3, neon, wasm128, scalar]
);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm for the forward DCT; forwards to the `#[arcane]`
/// entry point. (The `cfg` attribute was previously duplicated; one copy
/// suffices.)
fn dct4x4_dispatch_v3(token: X64V3Token, block: &mut [i32; 16]) {
    dct4x4_entry(token, block);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm for the forward DCT.
fn dct4x4_dispatch_neon(token: NeonToken, block: &mut [i32; 16]) {
dct4x4_neon(token, block);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
/// wasm128 `incant!` arm for the forward DCT.
fn dct4x4_dispatch_wasm128(token: Wasm128Token, block: &mut [i32; 16]) {
dct4x4_wasm(token, block);
}
#[inline(always)]
/// Portable fallback `incant!` arm for the forward DCT.
fn dct4x4_dispatch_scalar(_token: ScalarToken, block: &mut [i32; 16]) {
dct4x4_scalar(block);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm for the inverse DCT; forwards to the `#[arcane]`
/// entry point. (The `cfg` attribute was previously duplicated; one copy
/// suffices.)
fn idct4x4_dispatch_v3(token: X64V3Token, block: &mut [i32; 16]) {
    idct4x4_entry(token, block);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm for the inverse DCT.
fn idct4x4_dispatch_neon(token: NeonToken, block: &mut [i32; 16]) {
idct4x4_neon(token, block);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
/// wasm128 `incant!` arm for the inverse DCT.
fn idct4x4_dispatch_wasm128(token: Wasm128Token, block: &mut [i32; 16]) {
idct4x4_wasm(token, block);
}
#[inline(always)]
/// Portable fallback `incant!` arm for the inverse DCT.
fn idct4x4_dispatch_scalar(_token: ScalarToken, block: &mut [i32; 16]) {
idct4x4_scalar(block);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm transforming two blocks through one entry-point
/// call. (The `cfg` attribute was previously duplicated; one copy suffices.)
fn dct4x4_two_dispatch_v3(token: X64V3Token, block1: &mut [i32; 16], block2: &mut [i32; 16]) {
    dct4x4_two_entry(token, block1, block2);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm: transforms the two blocks one after the other.
fn dct4x4_two_dispatch_neon(token: NeonToken, block1: &mut [i32; 16], block2: &mut [i32; 16]) {
dct4x4_neon(token, block1);
dct4x4_neon(token, block2);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
/// wasm128 `incant!` arm: transforms the two blocks one after the other.
fn dct4x4_two_dispatch_wasm128(
token: Wasm128Token,
block1: &mut [i32; 16],
block2: &mut [i32; 16],
) {
dct4x4_wasm(token, block1);
dct4x4_wasm(token, block2);
}
#[inline(always)]
/// Portable fallback `incant!` arm: transforms the two blocks in turn.
fn dct4x4_two_dispatch_scalar(_token: ScalarToken, block1: &mut [i32; 16], block2: &mut [i32; 16]) {
dct4x4_scalar(block1);
dct4x4_scalar(block2);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
/// x86-64 entry point for the forward DCT; `#[arcane]` (archmage) presumably
/// establishes the target-feature context required to call the `#[rite]`
/// kernel — confirm against the archmage docs.
fn dct4x4_entry(_token: X64V3Token, block: &mut [i32; 16]) {
dct4x4_sse2(_token, block);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// SSE2 forward 4x4 DCT, in place. Inputs are narrowed to i16 on load (they
/// are assumed to fit) and sign-extended back to i32 on store.
fn dct4x4_sse2(_token: X64V3Token, block: &mut [i32; 16]) {
// Pack rows 0/1 and 2/3 into i16 lanes; the pair-swapped order in the high
// halves matches the shuffles performed by `ftransform_pass1_i16`.
let row01 = _mm_set_epi16(
block[7] as i16, block[6] as i16, block[3] as i16, block[2] as i16, block[5] as i16, block[4] as i16, block[1] as i16, block[0] as i16, );
let row23 = _mm_set_epi16(
block[15] as i16, block[14] as i16, block[11] as i16, block[10] as i16, block[13] as i16, block[12] as i16, block[9] as i16, block[8] as i16, );
let (v01, v32) = ftransform_pass1_i16(_token, row01, row23);
let mut out16 = [0i16; 16];
ftransform_pass2_i16(_token, &v01, &v32, &mut out16);
let zero = _mm_setzero_si128();
let (out16_lo, out16_hi) = halves8(&out16);
let out01 = simd_mem::_mm_loadu_si128(out16_lo);
let out23 = simd_mem::_mm_loadu_si128(out16_hi);
// Sign-extend i16 -> i32 by interleaving each lane with its sign mask
// (0xFFFF where the value is negative).
let sign01 = _mm_cmpgt_epi16(zero, out01);
let sign23 = _mm_cmpgt_epi16(zero, out23);
let out_0 = _mm_unpacklo_epi16(out01, sign01);
let out_1 = _mm_unpackhi_epi16(out01, sign01);
let out_2 = _mm_unpacklo_epi16(out23, sign23);
let out_3 = _mm_unpackhi_epi16(out23, sign23);
let (r0, r1, r2, r3) = rows4_mut(block);
simd_mem::_mm_storeu_si128(r0, out_0);
simd_mem::_mm_storeu_si128(r1, out_1);
simd_mem::_mm_storeu_si128(r2, out_2);
simd_mem::_mm_storeu_si128(r3, out_3);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// First pass of the SSE2 forward DCT over four rows held as i16 lanes.
/// Returns the intermediate values grouped as (rows 0/1, rows 3/2) — note
/// the reversed second pair, consumed as `v32` by the second pass.
/// NOTE(review): 2217/5352 match the multipliers in `dct4x4_scalar`; 937 and
/// 1812 are rounding biases applied before the `>> 9` — confirm against the
/// reference transform.
fn ftransform_pass1_i16(_token: X64V3Token, in01: __m128i, in23: __m128i) -> (__m128i, __m128i) {
let k937 = _mm_set1_epi32(937);
let k1812 = _mm_set1_epi32(1812);
let k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
let k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
let k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352);
let k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217);
// Swap the middle pair of the high halves, then regroup the rows so the
// add/sub below form the DCT butterflies lane-wise.
let shuf01_p = _mm_shufflehi_epi16(in01, 0b10_11_00_01); let shuf23_p = _mm_shufflehi_epi16(in23, 0b10_11_00_01);
let s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
let s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
let a01 = _mm_add_epi16(s01, s32);
let a32 = _mm_sub_epi16(s01, s32);
// `_mm_madd_epi16` multiplies adjacent i16 pairs and sums them into i32
// lanes, computing both butterfly outputs per constant vector.
let tmp0 = _mm_madd_epi16(a01, k88p);
let tmp2 = _mm_madd_epi16(a01, k88m);
let tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
let tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
let tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
let tmp3_2 = _mm_add_epi32(tmp3_1, k937);
let tmp1 = _mm_srai_epi32(tmp1_2, 9);
let tmp3 = _mm_srai_epi32(tmp3_2, 9);
// Narrow back to i16 and interleave into the output ordering.
let s03 = _mm_packs_epi32(tmp0, tmp2); let s12 = _mm_packs_epi32(tmp1, tmp3);
let s_lo = _mm_unpacklo_epi16(s03, s12); let s_hi = _mm_unpackhi_epi16(s03, s12); let v23 = _mm_unpackhi_epi32(s_lo, s_hi);
let out01 = _mm_unpacklo_epi32(s_lo, s_hi);
let out32 = _mm_shuffle_epi32(v23, 0b01_00_11_10);
(out01, out32)
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// Second pass of the SSE2 forward DCT; consumes the pass-1 intermediates
/// (`v01` = rows 0/1, `v32` = rows 3/2) and writes 16 i16 coefficients.
fn ftransform_pass2_i16(_token: X64V3Token, v01: &__m128i, v32: &__m128i, out: &mut [i16; 16]) {
let zero = _mm_setzero_si128();
let seven = _mm_set1_epi16(7);
let k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217);
let k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352);
// The `1 << 16` folds the scalar `+ if d != 0 { 1 }` correction (see
// `dct4x4_scalar`) into the rounding constant; `_mm_cmpeq_epi16(a32, zero)`
// below yields -1 exactly where d == 0, taking it back out.
let k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
let k51000 = _mm_set1_epi32(51000);
let a32 = _mm_sub_epi16(*v01, *v32);
let a22 = _mm_unpackhi_epi64(a32, a32);
let b23 = _mm_unpacklo_epi16(a22, a32);
let c1 = _mm_madd_epi16(b23, k5352_2217);
let c3 = _mm_madd_epi16(b23, k2217_5352);
let d1 = _mm_add_epi32(c1, k12000_plus_one);
let d3 = _mm_add_epi32(c3, k51000);
let e1 = _mm_srai_epi32(d1, 16);
let e3 = _mm_srai_epi32(d3, 16);
let f1 = _mm_packs_epi32(e1, e1);
let f3 = _mm_packs_epi32(e3, e3);
let g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
// Even rows: plain butterflies with `(x + 7) >> 4` rounding.
let a01 = _mm_add_epi16(*v01, *v32);
let a01_plus_7 = _mm_add_epi16(a01, seven);
let a11 = _mm_unpackhi_epi64(a01, a01);
let c0 = _mm_add_epi16(a01_plus_7, a11);
let c2 = _mm_sub_epi16(a01_plus_7, a11);
let d0 = _mm_srai_epi16(c0, 4);
let d2 = _mm_srai_epi16(c2, 4);
let d0_g1 = _mm_unpacklo_epi64(d0, g1);
let d2_f3 = _mm_unpacklo_epi64(d2, f3);
let (out_lo, out_hi) = halves8_mut(out);
simd_mem::_mm_storeu_si128(out_lo, d0_g1);
simd_mem::_mm_storeu_si128(out_hi, d2_f3);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
#[allow(dead_code)]
/// x86-64 entry point for the two-block forward DCT.
fn dct4x4_two_entry(_token: X64V3Token, block1: &mut [i32; 16], block2: &mut [i32; 16]) {
dct4x4_two_sse2(_token, block1, block2);
}
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(dead_code)]
/// SSE2 two-block forward DCT: runs the single-block kernel on each block.
fn dct4x4_two_sse2(_token: X64V3Token, block1: &mut [i32; 16], block2: &mut [i32; 16]) {
dct4x4_sse2(_token, block1);
dct4x4_sse2(_token, block2);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
/// x86-64 entry point for the two-block difference transform (see
/// `ftransform2_sse2` for the layout contract).
fn ftransform2_entry(
_token: X64V3Token,
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [i16; 32],
) {
ftransform2_sse2(_token, src, ref_, src_stride, ref_stride, out);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// SSE2 forward DCT of `src - ref_` for two horizontally adjacent 4x4 blocks
/// (an 8x4 pixel strip). The left block's 16 coefficients go to `out[..16]`,
/// the right block's to `out[16..]`.
///
/// Panics if `src`/`ref_` are too short to supply four 8-byte rows at their
/// respective strides.
pub(crate) fn ftransform2_sse2(
_token: X64V3Token,
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [i16; 32],
) {
// Bounds-check once up front; the per-row slicing below then cannot panic.
let src_min = src_stride * 3 + 8;
let ref_min = ref_stride * 3 + 8;
assert!(src.len() >= src_min && ref_.len() >= ref_min);
let zero = _mm_setzero_si128();
// Load 8 pixels per row (both blocks at once) from source and reference.
let src0 = simd_mem::_mm_loadu_si64(<&[u8; 8]>::try_from(&src[..8]).unwrap());
let src1 =
simd_mem::_mm_loadu_si64(<&[u8; 8]>::try_from(&src[src_stride..src_stride + 8]).unwrap());
let src2 = simd_mem::_mm_loadu_si64(
<&[u8; 8]>::try_from(&src[src_stride * 2..src_stride * 2 + 8]).unwrap(),
);
let src3 = simd_mem::_mm_loadu_si64(
<&[u8; 8]>::try_from(&src[src_stride * 3..src_stride * 3 + 8]).unwrap(),
);
let ref0 = simd_mem::_mm_loadu_si64(<&[u8; 8]>::try_from(&ref_[..8]).unwrap());
let ref1 =
simd_mem::_mm_loadu_si64(<&[u8; 8]>::try_from(&ref_[ref_stride..ref_stride + 8]).unwrap());
let ref2 = simd_mem::_mm_loadu_si64(
<&[u8; 8]>::try_from(&ref_[ref_stride * 2..ref_stride * 2 + 8]).unwrap(),
);
let ref3 = simd_mem::_mm_loadu_si64(
<&[u8; 8]>::try_from(&ref_[ref_stride * 3..ref_stride * 3 + 8]).unwrap(),
);
// Widen u8 -> i16, then form the per-pixel differences.
let src_0 = _mm_unpacklo_epi8(src0, zero);
let src_1 = _mm_unpacklo_epi8(src1, zero);
let src_2 = _mm_unpacklo_epi8(src2, zero);
let src_3 = _mm_unpacklo_epi8(src3, zero);
let ref_0 = _mm_unpacklo_epi8(ref0, zero);
let ref_1 = _mm_unpacklo_epi8(ref1, zero);
let ref_2 = _mm_unpacklo_epi8(ref2, zero);
let ref_3 = _mm_unpacklo_epi8(ref3, zero);
let diff0 = _mm_sub_epi16(src_0, ref_0);
let diff1 = _mm_sub_epi16(src_1, ref_1);
let diff2 = _mm_sub_epi16(src_2, ref_2);
let diff3 = _mm_sub_epi16(src_3, ref_3);
// Split into left (low 4 columns) and right (high 4 columns) blocks.
let shuf01l = _mm_unpacklo_epi32(diff0, diff1);
let shuf23l = _mm_unpacklo_epi32(diff2, diff3);
let shuf01h = _mm_unpackhi_epi32(diff0, diff1);
let shuf23h = _mm_unpackhi_epi32(diff2, diff3);
let (v01l, v32l) = ftransform_pass1_i16(_token, shuf01l, shuf23l);
let (v01h, v32h) = ftransform_pass1_i16(_token, shuf01h, shuf23h);
let mut out0 = [0i16; 16];
let mut out1 = [0i16; 16];
ftransform_pass2_i16(_token, &v01l, &v32l, &mut out0);
ftransform_pass2_i16(_token, &v01h, &v32h, &mut out1);
out[..16].copy_from_slice(&out0);
out[16..].copy_from_slice(&out1);
}
/// Forward DCT of the difference between `src` and `ref_` for two adjacent
/// 4x4 blocks; dispatched to the best available backend via `incant!`.
pub(crate) fn ftransform2_from_u8(
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [i16; 32],
) {
incant!(
ftransform2_dispatch(src, ref_, src_stride, ref_stride, out),
[v3, neon, wasm128, scalar]
);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm: forwards to the `#[arcane]` SSE2 entry point.
/// (The `cfg` attribute was previously duplicated; one copy suffices.)
fn ftransform2_dispatch_v3(
    token: X64V3Token,
    src: &[u8],
    ref_: &[u8],
    src_stride: usize,
    ref_stride: usize,
    out: &mut [i16; 32],
) {
    ftransform2_entry(token, src, ref_, src_stride, ref_stride, out);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm: builds each 4x4 difference block in scalar code, then
/// runs the NEON forward DCT on it.
fn ftransform2_dispatch_neon(
token: NeonToken,
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [i16; 32],
) {
// `block` selects the left (0) or right (1) 4x4 block of the 8x4 strip.
for block in 0..2 {
let mut block_data = [0i32; 16];
for y in 0..4 {
for x in 0..4 {
let src_val = src[y * src_stride + block * 4 + x] as i32;
let ref_val = ref_[y * ref_stride + block * 4 + x] as i32;
block_data[y * 4 + x] = src_val - ref_val;
}
}
neon_transform::dct4x4_neon(token, &mut block_data);
// Coefficients fit in i16 after the transform; narrow into `out`.
for (i, &val) in block_data.iter().enumerate() {
out[block * 16 + i] = val as i16;
}
}
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
/// wasm128 `incant!` arm: builds each 4x4 difference block in scalar code,
/// then runs the wasm SIMD forward DCT on it.
fn ftransform2_dispatch_wasm128(
token: Wasm128Token,
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [i16; 32],
) {
// `block` selects the left (0) or right (1) 4x4 block of the 8x4 strip.
for block in 0..2 {
let mut block_data = [0i32; 16];
for y in 0..4 {
for x in 0..4 {
let src_val = src[y * src_stride + block * 4 + x] as i32;
let ref_val = ref_[y * ref_stride + block * 4 + x] as i32;
block_data[y * 4 + x] = src_val - ref_val;
}
}
dct4x4_wasm(token, &mut block_data);
for (i, &val) in block_data.iter().enumerate() {
out[block * 16 + i] = val as i16;
}
}
}
#[inline(always)]
/// Portable fallback `incant!` arm: computes the forward DCT of `src - ref_`
/// for the two horizontally adjacent 4x4 blocks, 16 coefficients each.
fn ftransform2_dispatch_scalar(
    _token: ScalarToken,
    src: &[u8],
    ref_: &[u8],
    src_stride: usize,
    ref_stride: usize,
    out: &mut [i16; 32],
) {
    // `half` is 0 for the left 4x4 block, 1 for the right one.
    for (half, out_chunk) in out.chunks_exact_mut(16).enumerate() {
        let mut diff = [0i32; 16];
        for row in 0..4 {
            let s_base = row * src_stride + half * 4;
            let r_base = row * ref_stride + half * 4;
            for col in 0..4 {
                diff[row * 4 + col] =
                    i32::from(src[s_base + col]) - i32::from(ref_[r_base + col]);
            }
        }
        dct4x4_scalar(&mut diff);
        // Coefficients fit in i16 after the transform; narrow into `out`.
        for (dst, &v) in out_chunk.iter_mut().zip(diff.iter()) {
            *dst = v as i16;
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[arcane]
/// x86-64 entry point for the inverse DCT; wraps the `#[rite]` SSE2 kernel.
pub(crate) fn idct4x4_entry(_token: X64V3Token, block: &mut [i32; 16]) {
idct4x4_sse2(_token, block);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// SSE2 4x4 inverse DCT, in place. Coefficients are saturated to i16 on load
/// (`_mm_packs_epi32`) and sign-extended back to i32 on store.
pub(crate) fn idct4x4_sse2(_token: X64V3Token, block: &mut [i32; 16]) {
// Q16 multipliers: 20091 = CONST1; -30068 = 35468 (CONST2) - 65536 so the
// factor fits a signed 16-bit lane for `_mm_mulhi_epi16` — the missing +x
// is compensated inside the butterfly passes.
let k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068, 20091, 20091, 20091, 20091);
let k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091, -30068, -30068, -30068, -30068);
// Rounding bias for the second pass's `(x + 4) >> 3`.
let zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4);
let (r0, r1, r2, r3) = rows4(block);
let i32_0 = simd_mem::_mm_loadu_si128(r0);
let i32_1 = simd_mem::_mm_loadu_si128(r1);
let i32_2 = simd_mem::_mm_loadu_si128(r2);
let i32_3 = simd_mem::_mm_loadu_si128(r3);
let in01 = _mm_packs_epi32(i32_0, i32_1);
let in23 = _mm_packs_epi32(i32_2, i32_3);
let (t01, t23) = itransform_pass_sse2(_token, in01, in23, k1k2, k2k1);
let (out01, out23) = itransform_pass2_sse2(_token, t01, t23, k1k2, k2k1, zero_four);
let zero = _mm_setzero_si128();
// Sign-extend i16 -> i32 via sign masks, then store row by row.
let sign01_lo = _mm_cmpgt_epi16(zero, out01); let out_0 = _mm_unpacklo_epi16(out01, sign01_lo);
let out_1 = _mm_unpackhi_epi16(out01, sign01_lo);
let sign23_lo = _mm_cmpgt_epi16(zero, out23);
let out_2 = _mm_unpacklo_epi16(out23, sign23_lo);
let out_3 = _mm_unpackhi_epi16(out23, sign23_lo);
let (r0, r1, r2, r3) = rows4_mut(block);
simd_mem::_mm_storeu_si128(r0, out_0);
simd_mem::_mm_storeu_si128(r1, out_1);
simd_mem::_mm_storeu_si128(r2, out_2);
simd_mem::_mm_storeu_si128(r3, out_3);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// First pass of the SSE2 inverse DCT (columns), ending in a 4x4 transpose
/// so the second pass can reuse the same lane layout. `k1k2`/`k2k1` are the
/// Q16 multiplier vectors prepared by the caller (see `idct4x4_sse2`).
fn itransform_pass_sse2(
_token: X64V3Token,
in01: __m128i,
in23: __m128i,
k1k2: __m128i,
k2k1: __m128i,
) -> (__m128i, __m128i) {
let in1 = _mm_unpackhi_epi64(in01, in01);
let in3 = _mm_unpackhi_epi64(in23, in23);
let a_d3 = _mm_add_epi16(in01, in23);
let b_c3 = _mm_sub_epi16(in01, in23);
// `mulhi` yields (x * k) >> 16; the -65536-biased constant is corrected by
// the additive terms combined below (mirrors the scalar idct4x4_scalar).
let c1d1 = _mm_mulhi_epi16(in1, k2k1);
let c2d2 = _mm_mulhi_epi16(in3, k1k2);
let c3 = _mm_unpackhi_epi64(b_c3, b_c3);
let c4 = _mm_sub_epi16(c1d1, c2d2);
let c = _mm_add_epi16(c3, c4);
let d4u = _mm_add_epi16(c1d1, c2d2);
let du = _mm_add_epi16(a_d3, d4u);
let d = _mm_unpackhi_epi64(du, du);
let comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
let comb_dc = _mm_unpacklo_epi64(d, c);
let tmp01 = _mm_add_epi16(comb_ab, comb_dc);
let tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
let tmp23 = _mm_shuffle_epi32(tmp32, 0b01_00_11_10);
// Transpose the 4x4 of i16 rows back into column-major order.
let transpose_0 = _mm_unpacklo_epi16(tmp01, tmp23);
let transpose_1 = _mm_unpackhi_epi16(tmp01, tmp23);
let t01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
let t23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
(t01, t23)
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// Second pass of the SSE2 inverse DCT. Identical butterfly structure to
/// `itransform_pass_sse2`, plus the rounding bias (`zero_four`, added before
/// the butterflies) and the final `>> 3` descale.
fn itransform_pass2_sse2(
_token: X64V3Token,
t01: __m128i,
t23: __m128i,
k1k2: __m128i,
k2k1: __m128i,
zero_four: __m128i,
) -> (__m128i, __m128i) {
let t1 = _mm_unpackhi_epi64(t01, t01);
let t3 = _mm_unpackhi_epi64(t23, t23);
// Inject the +4 rounding bias once; it propagates to every output lane
// through the butterflies.
let dc = _mm_add_epi16(t01, zero_four);
let a_d3 = _mm_add_epi16(dc, t23);
let b_c3 = _mm_sub_epi16(dc, t23);
let c1d1 = _mm_mulhi_epi16(t1, k2k1);
let c2d2 = _mm_mulhi_epi16(t3, k1k2);
let c3 = _mm_unpackhi_epi64(b_c3, b_c3);
let c4 = _mm_sub_epi16(c1d1, c2d2);
let c = _mm_add_epi16(c3, c4);
let d4u = _mm_add_epi16(c1d1, c2d2);
let du = _mm_add_epi16(a_d3, d4u);
let d = _mm_unpackhi_epi64(du, du);
let comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
let comb_dc = _mm_unpacklo_epi64(d, c);
let tmp01 = _mm_add_epi16(comb_ab, comb_dc);
let tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
let tmp23 = _mm_shuffle_epi32(tmp32, 0b01_00_11_10);
// Final descale: arithmetic shift implements the `(x + 4) >> 3` rounding
// together with the bias added above.
let shifted01 = _mm_srai_epi16(tmp01, 3);
let shifted23 = _mm_srai_epi16(tmp23, 3);
let transpose_0 = _mm_unpacklo_epi16(shifted01, shifted23);
let transpose_1 = _mm_unpackhi_epi16(shifted01, shifted23);
let out01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
let out23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
(out01, out23)
}
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[arcane]
/// x86-64 entry point: inverse-transform `coeffs` and add the residual to
/// the prediction read from `pred_block` at (pred_y0, pred_x0), writing the
/// u8-saturated result to `out_block` at (out_y0, out_x0). `coeffs` is
/// zeroed afterwards (see the SSE2 kernel).
pub(crate) fn idct_add_residue_entry(
_token: X64V3Token,
coeffs: &mut [i32; 16],
pred_block: &[u8],
pred_stride: usize,
out_block: &mut [u8],
out_stride: usize,
pred_y0: usize,
pred_x0: usize,
out_y0: usize,
out_x0: usize,
) {
idct_add_residue_sse2(
_token,
coeffs,
pred_block,
pred_stride,
out_block,
out_stride,
pred_y0,
pred_x0,
out_y0,
out_x0,
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
/// SSE2 inverse DCT + residual add: reconstructs the 4x4 block as
/// `clamp(pred + idct(coeffs))` and zeroes `coeffs` for reuse.
///
/// Panics if the prediction/output regions starting at the given offsets do
/// not cover 4 rows of 4 pixels at their strides.
pub(crate) fn idct_add_residue_sse2(
_token: X64V3Token,
coeffs: &mut [i32; 16],
pred_block: &[u8],
pred_stride: usize,
out_block: &mut [u8],
out_stride: usize,
pred_y0: usize,
pred_x0: usize,
out_y0: usize,
out_x0: usize,
) {
// Slice the exact windows once so the per-row accesses below are in bounds.
let pred_base = pred_y0 * pred_stride + pred_x0;
let pred_region = &pred_block[pred_base..pred_base + 3 * pred_stride + 4];
let out_base = out_y0 * out_stride + out_x0;
let out_region = &mut out_block[out_base..out_base + 3 * out_stride + 4];
let ps = pred_stride;
let os = out_stride;
// Same constant setup as `idct4x4_sse2`.
let k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068, 20091, 20091, 20091, 20091);
let k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091, -30068, -30068, -30068, -30068);
let zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4);
let zero = _mm_setzero_si128();
let (c0, c1, c2, c3) = rows4(coeffs);
let i32_0 = simd_mem::_mm_loadu_si128(c0);
let i32_1 = simd_mem::_mm_loadu_si128(c1);
let i32_2 = simd_mem::_mm_loadu_si128(c2);
let i32_3 = simd_mem::_mm_loadu_si128(c3);
let in01 = _mm_packs_epi32(i32_0, i32_1);
let in23 = _mm_packs_epi32(i32_2, i32_3);
let (t01, t23) = itransform_pass_sse2(_token, in01, in23, k1k2, k2k1);
let (res01, res23) = itransform_pass2_sse2(_token, t01, t23, k1k2, k2k1, zero_four);
// Each result vector holds two rows; `$hi` selects the high 64-bit half
// (odd rows). Prediction bytes are widened, added, and re-packed with
// unsigned saturation (`_mm_packus_epi16` clamps to 0..=255).
macro_rules! process_row {
($res:expr, $pred_off:expr, $out_off:expr, $hi:expr) => {{
let residual = if $hi {
_mm_unpackhi_epi64($res, $res)
} else {
$res
};
let pred_bytes: [u8; 4] = pred_region[$pred_off..$pred_off + 4].try_into().unwrap();
let pred_vec = _mm_cvtsi32_si128(i32::from_ne_bytes(pred_bytes));
let pred_i16 = _mm_unpacklo_epi8(pred_vec, zero);
let sum = _mm_add_epi16(pred_i16, residual);
let packed = _mm_packus_epi16(sum, sum);
let result = _mm_cvtsi128_si32(packed) as u32;
out_region[$out_off..$out_off + 4].copy_from_slice(&result.to_ne_bytes());
}};
}
process_row!(res01, 0, 0, false);
process_row!(res01, ps, os, true);
process_row!(res23, ps * 2, os * 2, false);
process_row!(res23, ps * 3, os * 3, true);
// The coefficient block is consumed: clear it for the next block.
let (c0, c1, c2, c3) = rows4_mut(coeffs);
simd_mem::_mm_storeu_si128(c0, zero);
simd_mem::_mm_storeu_si128(c1, zero);
simd_mem::_mm_storeu_si128(c2, zero);
simd_mem::_mm_storeu_si128(c3, zero);
}
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[arcane]
/// x86-64 entry point for the DC-only reconstruction fast path (all AC
/// coefficients assumed zero; only `coeffs[0]` is read).
pub(crate) fn idct_add_residue_dc_entry(
_token: X64V3Token,
coeffs: &mut [i32; 16],
pred_block: &[u8],
pred_stride: usize,
out_block: &mut [u8],
out_stride: usize,
pred_y0: usize,
pred_x0: usize,
out_y0: usize,
out_x0: usize,
) {
idct_add_residue_dc_sse2(
_token,
coeffs,
pred_block,
pred_stride,
out_block,
out_stride,
pred_y0,
pred_x0,
out_y0,
out_x0,
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
/// SSE2 DC-only reconstruction: offsets every pixel of the predicted 4x4
/// block by the rounded DC value `(dc + 4) >> 3`, saturating to u8, then
/// zeroes `coeffs`. Only `coeffs[0]` is read.
pub(crate) fn idct_add_residue_dc_sse2(
_token: X64V3Token,
coeffs: &mut [i32; 16],
pred_block: &[u8],
pred_stride: usize,
out_block: &mut [u8],
out_stride: usize,
pred_y0: usize,
pred_x0: usize,
out_y0: usize,
out_x0: usize,
) {
// Slice the exact windows once so the per-row accesses below are in bounds.
let pred_base = pred_y0 * pred_stride + pred_x0;
let pred_region = &pred_block[pred_base..pred_base + 3 * pred_stride + 4];
let out_base = out_y0 * out_stride + out_x0;
let out_region = &mut out_block[out_base..out_base + 3 * out_stride + 4];
let ps = pred_stride;
let os = out_stride;
let dc = coeffs[0];
let dc_adj = ((dc + 4) >> 3) as i16;
let dc_vec = _mm_set1_epi16(dc_adj);
let zero = _mm_setzero_si128();
// Per row: widen 4 prediction bytes, add the DC offset, pack back with
// unsigned saturation (clamps to 0..=255).
for (pred_off, out_off) in [(0, 0), (ps, os), (ps * 2, os * 2), (ps * 3, os * 3)] {
let pred_bytes: [u8; 4] = pred_region[pred_off..pred_off + 4].try_into().unwrap();
let pred_vec = _mm_cvtsi32_si128(i32::from_ne_bytes(pred_bytes));
let pred_i16 = _mm_unpacklo_epi8(pred_vec, zero);
let sum = _mm_add_epi16(pred_i16, dc_vec);
let packed = _mm_packus_epi16(sum, sum);
let result = _mm_cvtsi128_si32(packed) as u32;
out_region[out_off..out_off + 4].copy_from_slice(&result.to_ne_bytes());
}
coeffs.fill(0);
}
#[cfg(target_arch = "x86_64")]
#[allow(dead_code, clippy::too_many_arguments)]
#[inline(always)]
/// Reconstruction front-end: picks the DC-only fast path or the full inverse
/// transform based on `dc_only`.
/// NOTE(review): callers presumably set `dc_only` only when all AC
/// coefficients are zero — the DC path never reads `coeffs[1..]`.
pub(crate) fn idct_add_residue_with_token(
token: X64V3Token,
coeffs: &mut [i32; 16],
pred_block: &[u8],
pred_stride: usize,
out_block: &mut [u8],
out_stride: usize,
pred_y0: usize,
pred_x0: usize,
out_y0: usize,
out_x0: usize,
dc_only: bool,
) {
if dc_only {
idct_add_residue_dc_entry(
token,
coeffs,
pred_block,
pred_stride,
out_block,
out_stride,
pred_y0,
pred_x0,
out_y0,
out_x0,
);
} else {
idct_add_residue_entry(
token,
coeffs,
pred_block,
pred_stride,
out_block,
out_stride,
pred_y0,
pred_x0,
out_y0,
out_x0,
);
}
}
#[cfg(target_arch = "x86_64")]
#[arcane]
/// x86-64 entry point for the in-place variant: the 4x4 region of `block` at
/// (y0, x0) serves as both prediction and output.
pub(crate) fn idct_add_residue_inplace_sse2(
_token: X64V3Token,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
idct_add_residue_inplace_sse2_inner(_token, coeffs, block, y0, x0, stride, dc_only);
}
#[cfg(target_arch = "x86_64")]
#[rite]
/// SSE2 in-place reconstruction: read-modify-write of the 4x4 region of
/// `block` at (y0, x0). `dc_only` selects the DC fast path (reads only
/// `coeffs[0]`); either way `coeffs` is zeroed on exit.
pub(crate) fn idct_add_residue_inplace_sse2_inner(
_token: X64V3Token,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
// One window covering all four rows; bounds-checked once.
let base = y0 * stride + x0;
let region = &mut block[base..base + 3 * stride + 4];
let s1 = stride;
let s2 = stride * 2;
let s3 = stride * 3;
if dc_only {
// DC fast path: every pixel is offset by the rounded DC value.
let dc = coeffs[0];
let dc_adj = ((dc + 4) >> 3) as i16;
let dc_vec = _mm_set1_epi16(dc_adj);
let zero = _mm_setzero_si128();
for &off in &[0, s1, s2, s3] {
let pred_bytes: [u8; 4] = region[off..off + 4].try_into().unwrap();
let pred_vec = _mm_cvtsi32_si128(i32::from_ne_bytes(pred_bytes));
let pred_i16 = _mm_unpacklo_epi8(pred_vec, zero);
let sum = _mm_add_epi16(pred_i16, dc_vec);
// `_mm_packus_epi16` saturates to 0..=255.
let packed = _mm_packus_epi16(sum, sum);
let result = _mm_cvtsi128_si32(packed) as u32;
region[off..off + 4].copy_from_slice(&result.to_ne_bytes());
}
} else {
// Full path: same transform pipeline as `idct_add_residue_sse2`.
let k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068, 20091, 20091, 20091, 20091);
let k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091, -30068, -30068, -30068, -30068);
let zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4);
let zero = _mm_setzero_si128();
let (c0, c1, c2, c3) = rows4(coeffs);
let i32_0 = simd_mem::_mm_loadu_si128(c0);
let i32_1 = simd_mem::_mm_loadu_si128(c1);
let i32_2 = simd_mem::_mm_loadu_si128(c2);
let i32_3 = simd_mem::_mm_loadu_si128(c3);
let in01 = _mm_packs_epi32(i32_0, i32_1);
let in23 = _mm_packs_epi32(i32_2, i32_3);
let (t01, t23) = itransform_pass_sse2(_token, in01, in23, k1k2, k2k1);
let (res01, res23) = itransform_pass2_sse2(_token, t01, t23, k1k2, k2k1, zero_four);
// `$hi` selects the high 64-bit half of a result vector (odd rows).
macro_rules! process_row {
($res:expr, $off:expr, $hi:expr) => {{
let residual = if $hi {
_mm_unpackhi_epi64($res, $res)
} else {
$res
};
let pred_bytes: [u8; 4] = region[$off..$off + 4].try_into().unwrap();
let pred_vec = _mm_cvtsi32_si128(i32::from_ne_bytes(pred_bytes));
let pred_i16 = _mm_unpacklo_epi8(pred_vec, zero);
let sum = _mm_add_epi16(pred_i16, residual);
let packed = _mm_packus_epi16(sum, sum);
let result = _mm_cvtsi128_si32(packed) as u32;
region[$off..$off + 4].copy_from_slice(&result.to_ne_bytes());
}};
}
process_row!(res01, 0, false);
process_row!(res01, s1, true);
process_row!(res23, s2, false);
process_row!(res23, s3, true);
}
// The coefficient block is consumed: clear it for the next block.
coeffs.fill(0);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// Token-taking wrapper around the in-place SSE2 reconstruction.
pub(crate) fn idct_add_residue_inplace_with_token(
token: X64V3Token,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
idct_add_residue_inplace_sse2(token, coeffs, block, y0, x0, stride, dc_only);
}
#[inline(always)]
/// In-place reconstruction of the 4x4 region of `block` at (y0, x0) from
/// `coeffs`; dispatched to the best available backend via `incant!`.
/// Zeroes `coeffs` (all dispatch arms do).
pub(crate) fn idct_add_residue_inplace(
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
incant!(
idct_add_residue_inplace_dispatch(coeffs, block, y0, x0, stride, dc_only),
[v3, neon, wasm128, scalar]
);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm: forwards to the SSE2 in-place reconstruction.
/// (The `cfg` attribute was previously duplicated; one copy suffices.)
fn idct_add_residue_inplace_dispatch_v3(
    token: X64V3Token,
    coeffs: &mut [i32; 16],
    block: &mut [u8],
    y0: usize,
    x0: usize,
    stride: usize,
    dc_only: bool,
) {
    idct_add_residue_inplace_sse2(token, coeffs, block, y0, x0, stride, dc_only);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm: forwards to the NEON in-place reconstruction
/// (defined elsewhere).
fn idct_add_residue_inplace_dispatch_neon(
token: NeonToken,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
idct_add_residue_inplace_neon(token, coeffs, block, y0, x0, stride, dc_only);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
/// wasm128 `incant!` arm: no SIMD implementation here, so it falls back to
/// the scalar path.
fn idct_add_residue_inplace_dispatch_wasm128(
_token: Wasm128Token,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
idct_add_residue_inplace_dispatch_scalar(ScalarToken, coeffs, block, y0, x0, stride, dc_only);
}
#[inline(always)]
/// Portable fallback: inverse-transform the residual and add it to the 4x4
/// region of `block` at (y0, x0), clamping each pixel to 0..=255. `coeffs`
/// is zeroed on exit.
fn idct_add_residue_inplace_dispatch_scalar(
    _token: ScalarToken,
    coeffs: &mut [i32; 16],
    block: &mut [u8],
    y0: usize,
    x0: usize,
    stride: usize,
    dc_only: bool,
) {
    if dc_only {
        // DC fast path: the residual is one rounded value for every pixel.
        let dc_adj = (coeffs[0] + 4) >> 3;
        for row in 0..4 {
            let base = (y0 + row) * stride + x0;
            for px in &mut block[base..base + 4] {
                *px = (i32::from(*px) + dc_adj).clamp(0, 255) as u8;
            }
        }
    } else {
        idct4x4_scalar(coeffs);
        for (row, residual) in coeffs.chunks_exact(4).enumerate() {
            let base = (y0 + row) * stride + x0;
            for (px, &r) in block[base..base + 4].iter_mut().zip(residual) {
                *px = (i32::from(*px) + r).clamp(0, 255) as u8;
            }
        }
    }
    // The coefficient block is consumed: clear it for the next block.
    coeffs.fill(0);
}
#[inline(always)]
/// Forward DCT of `src - ref_` for one 4x4 block, returning the coefficients;
/// dispatched to the best available backend via `incant!`.
pub(crate) fn ftransform_from_u8_4x4(src: &[u8; 16], ref_: &[u8; 16]) -> [i32; 16] {
incant!(
ftransform_from_u8_4x4_dispatch(src, ref_),
[v3, neon, wasm128, scalar]
)
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// x86-64 `incant!` arm for the single-block u8 forward transform.
/// (The `cfg` attribute was previously duplicated; one copy suffices.)
fn ftransform_from_u8_4x4_dispatch_v3(
    token: X64V3Token,
    src: &[u8; 16],
    ref_: &[u8; 16],
) -> [i32; 16] {
    ftransform_from_u8_4x4_entry(token, src, ref_)
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON `incant!` arm for the single-block u8 forward transform.
fn ftransform_from_u8_4x4_dispatch_neon(
token: NeonToken,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
ftransform_from_u8_4x4_neon(token, src, ref_)
}
/// wasm32 simd128 dispatch shim for `ftransform_from_u8_4x4`, selected by
/// `incant!`.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn ftransform_from_u8_4x4_dispatch_wasm128(
token: Wasm128Token,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
ftransform_from_u8_4x4_wasm(token, src, ref_)
}
/// Scalar dispatch shim for `ftransform_from_u8_4x4`; the token is ignored
/// and only exists to match the uniform dispatch signature.
#[inline(always)]
fn ftransform_from_u8_4x4_dispatch_scalar(
_token: ScalarToken,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
ftransform_from_u8_4x4_scalar(src, ref_)
}
/// Scalar reference path: compute the signed residual `src - ref_` (widened
/// to i32 per pixel) and apply the forward 4x4 DCT to it in place.
pub(crate) fn ftransform_from_u8_4x4_scalar(src: &[u8; 16], ref_: &[u8; 16]) -> [i32; 16] {
    let mut residual = [0i32; 16];
    for (out, (&s, &p)) in residual.iter_mut().zip(src.iter().zip(ref_.iter())) {
        *out = i32::from(s) - i32::from(p);
    }
    dct4x4_scalar(&mut residual);
    residual
}
/// `#[arcane]` entry point: establishes the v3 target-feature context so the
/// `#[rite]` SSE2 body below can be called safely.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ftransform_from_u8_4x4_entry(_token: X64V3Token, src: &[u8; 16], ref_: &[u8; 16]) -> [i32; 16] {
ftransform_from_u8_4x4_sse2(_token, src, ref_)
}
/// SSE2 forward 4x4 transform of the residual (`src` - `ref_`).
/// Widens bytes to i16, runs two 16-bit transform passes, then sign-extends
/// the results to the caller's i32 layout.
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn ftransform_from_u8_4x4_sse2(
_token: X64V3Token,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
let zero = _mm_setzero_si128();
// Load all 16 bytes of source and reference in one shot.
let src_all = simd_mem::_mm_loadu_si128(src);
let ref_all = simd_mem::_mm_loadu_si128(ref_);
// Widen u8 -> i16 by interleaving with zero (values are <= 255, so zero
// extension is also correct sign extension).
let src_lo = _mm_unpacklo_epi8(src_all, zero); let src_hi = _mm_unpackhi_epi8(src_all, zero); let ref_lo = _mm_unpacklo_epi8(ref_all, zero); let ref_hi = _mm_unpackhi_epi8(ref_all, zero);
// diff_lo holds rows 0-1, diff_hi rows 2-3 of the i16 residual.
let diff_lo = _mm_sub_epi16(src_lo, ref_lo); let diff_hi = _mm_sub_epi16(src_hi, ref_hi);
// Repack into the lane order expected by the pass helpers (defined
// elsewhere in this file): each vector interleaves the 32-bit pairs of its
// two rows.
let in01 = _mm_unpacklo_epi32(diff_lo, _mm_unpackhi_epi64(diff_lo, diff_lo));
let in23 = _mm_unpacklo_epi32(diff_hi, _mm_unpackhi_epi64(diff_hi, diff_hi));
let (v01, v32) = ftransform_pass1_i16(_token, in01, in23);
let mut out16 = [0i16; 16];
ftransform_pass2_i16(_token, &v01, &v32, &mut out16);
let (out16_lo, out16_hi) = halves8(&out16);
let out01 = simd_mem::_mm_loadu_si128(out16_lo);
let out23 = simd_mem::_mm_loadu_si128(out16_hi);
// Sign-extend i16 -> i32: compute a per-lane sign mask and interleave it in
// as the high 16 bits of each 32-bit lane.
let sign01 = _mm_cmpgt_epi16(zero, out01);
let sign23 = _mm_cmpgt_epi16(zero, out23);
let out_0 = _mm_unpacklo_epi16(out01, sign01);
let out_1 = _mm_unpackhi_epi16(out01, sign01);
let out_2 = _mm_unpacklo_epi16(out23, sign23);
let out_3 = _mm_unpackhi_epi16(out23, sign23);
let mut result = [0i32; 16];
let (r0, r1, r2, r3) = rows4_mut(&mut result);
simd_mem::_mm_storeu_si128(r0, out_0);
simd_mem::_mm_storeu_si128(r1, out_1);
simd_mem::_mm_storeu_si128(r2, out_2);
simd_mem::_mm_storeu_si128(r3, out_3);
result
}
/// Cross-checks of the SIMD transform kernels against the scalar reference
/// implementations.
#[cfg(test)]
mod tests_simd {
use super::*;
// Forward DCT: SIMD result must match the scalar reference bit-for-bit.
#[test]
fn test_dct_intrinsics_matches_scalar() {
let input: [i32; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
let mut scalar_block = input;
dct4x4_scalar(&mut scalar_block);
let mut intrinsics_block = input;
dct4x4_intrinsics(&mut intrinsics_block);
assert_eq!(
scalar_block, intrinsics_block,
"Intrinsics DCT doesn't match scalar.\nScalar: {:?}\nIntrinsics: {:?}",
scalar_block, intrinsics_block
);
}
// Inverse DCT: feed a forward-transformed block (so coefficients are in the
// range the IDCT expects) and compare SIMD against scalar.
#[test]
fn test_idct_intrinsics_matches_scalar() {
let mut input: [i32; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
dct4x4_scalar(&mut input);
let mut scalar_block = input;
idct4x4_scalar(&mut scalar_block);
let mut intrinsics_block = input;
idct4x4_intrinsics(&mut intrinsics_block);
assert_eq!(
scalar_block, intrinsics_block,
"Intrinsics IDCT doesn't match scalar.\nScalar: {:?}\nIntrinsics: {:?}",
scalar_block, intrinsics_block
);
}
// The fused two-block DCT must match two independent scalar transforms.
#[test]
fn test_dct_two_intrinsics() {
let input1: [i32; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
let input2: [i32; 16] = [
100, 50, 25, 75, 200, 150, 100, 50, 25, 75, 125, 175, 225, 200, 150, 100,
];
let mut scalar1 = input1;
let mut scalar2 = input2;
dct4x4_scalar(&mut scalar1);
dct4x4_scalar(&mut scalar2);
let mut intrinsics1 = input1;
let mut intrinsics2 = input2;
dct4x4_two_intrinsics(&mut intrinsics1, &mut intrinsics2);
assert_eq!(scalar1, intrinsics1, "Two-block intrinsics block1 mismatch");
assert_eq!(scalar2, intrinsics2, "Two-block intrinsics block2 mismatch");
}
// DCT then IDCT must be the identity for in-range pixel data.
#[test]
fn test_roundtrip() {
let original: [i32; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
let mut block = original;
dct4x4_intrinsics(&mut block);
idct4x4_intrinsics(&mut block);
assert_eq!(original, block, "Roundtrip failed");
}
// Fused residual+DCT against the hand-computed residual followed by the
// scalar DCT (smooth gradient input).
#[test]
fn test_ftransform_from_u8_4x4() {
let src: [u8; 16] = [
100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220,
];
let ref_: [u8; 16] = [128; 16];
let simd_result = ftransform_from_u8_4x4(&src, &ref_);
let mut expected = [0i32; 16];
for i in 0..16 {
expected[i] = src[i] as i32 - ref_[i] as i32;
}
dct4x4_scalar(&mut expected);
assert_eq!(
simd_result, expected,
"ftransform_from_u8_4x4 mismatch.\nSIMD: {:?}\nExpected: {:?}",
simd_result, expected
);
}
// Same check with a non-uniform reference so both signs of residual occur.
#[test]
fn test_ftransform_from_u8_4x4_varied() {
let src: [u8; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
let ref_: [u8; 16] = [
100, 50, 200, 80, 60, 130, 170, 140, 230, 210, 120, 220, 200, 20, 70, 110,
];
let simd_result = ftransform_from_u8_4x4(&src, &ref_);
let mut expected = [0i32; 16];
for i in 0..16 {
expected[i] = src[i] as i32 - ref_[i] as i32;
}
dct4x4_scalar(&mut expected);
assert_eq!(
simd_result, expected,
"ftransform_from_u8_4x4 varied mismatch.\nSIMD: {:?}\nExpected: {:?}",
simd_result, expected
);
}
// Strided two-block forward transform (two horizontally adjacent 4x4 blocks)
// against per-block scalar transforms, results packed as i16.
#[test]
fn test_ftransform2_from_u8() {
const STRIDE: usize = 16; let mut src = [128u8; STRIDE * 4];
let mut ref_ = [128u8; STRIDE * 4];
for y in 0..4 {
for x in 0..8 {
src[y * STRIDE + x] = (y * 8 + x) as u8 + 100;
ref_[y * STRIDE + x] = 128;
}
}
let mut out_simd = [0i16; 32];
ftransform2_from_u8(&src, &ref_, STRIDE, STRIDE, &mut out_simd);
let mut expected = [0i16; 32];
for block in 0..2 {
let mut block_data = [0i32; 16];
for y in 0..4 {
for x in 0..4 {
let src_val = src[y * STRIDE + block * 4 + x] as i32;
let ref_val = ref_[y * STRIDE + block * 4 + x] as i32;
block_data[y * 4 + x] = src_val - ref_val;
}
}
dct4x4_scalar(&mut block_data);
for (i, &val) in block_data.iter().enumerate() {
expected[block * 16 + i] = val as i16;
}
}
assert_eq!(
out_simd, expected,
"ftransform2_from_u8 mismatch.\nSIMD: {:?}\nExpected: {:?}",
out_simd, expected
);
}
}
/// Nightly `test::Bencher` micro-benchmarks for the transform kernels,
/// gated behind the private `_benchmarks` feature.
#[cfg(all(test, feature = "_benchmarks"))]
mod benchmarks {
use super::*;
use test::Bencher;
// Four fixed blocks covering smooth, blocky, periodic, and extreme inputs.
const TEST_BLOCKS: [[i32; 16]; 4] = [
[
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
],
[
100, 50, 25, 75, 200, 150, 100, 50, 25, 75, 125, 175, 225, 200, 150, 100,
],
[
12, 34, 56, 78, 90, 12, 34, 56, 78, 90, 12, 34, 56, 78, 90, 12,
],
[
255, 0, 128, 64, 192, 32, 224, 16, 240, 8, 248, 4, 252, 2, 254, 1,
],
];
#[bench]
fn bench_dct_intrinsics(b: &mut Bencher) {
b.iter(|| {
for input in &TEST_BLOCKS {
// Copy per iteration: the transform mutates in place.
let mut block = *input;
test::black_box(dct4x4_intrinsics(&mut block));
}
});
}
#[bench]
fn bench_idct_intrinsics(b: &mut Bencher) {
// Pre-transform once so the IDCT sees realistic coefficient magnitudes.
let mut dct_blocks = TEST_BLOCKS;
for block in &mut dct_blocks {
dct4x4_scalar(block);
}
b.iter(|| {
for input in &dct_blocks {
let mut block = *input;
test::black_box(idct4x4_intrinsics(&mut block));
}
});
}
#[bench]
fn bench_dct_two_intrinsics(b: &mut Bencher) {
b.iter(|| {
let mut block1 = TEST_BLOCKS[0];
let mut block2 = TEST_BLOCKS[1];
test::black_box(dct4x4_two_intrinsics(&mut block1, &mut block2));
let mut block3 = TEST_BLOCKS[2];
let mut block4 = TEST_BLOCKS[3];
test::black_box(dct4x4_two_intrinsics(&mut block3, &mut block4));
});
}
}
/// AArch64 NEON implementations of the 4x4 forward DCT, inverse DCT, and the
/// fused IDCT + add-residue reconstruction. The forward-pass constants
/// (5352, 2217, 937, 1812, 12000, 51000) and inverse constants (20091, 35468)
/// are the classic VP8/WebP transform constants, so outputs are expected to
/// match the scalar reference bit-for-bit.
#[cfg(target_arch = "aarch64")]
mod neon_transform {
use super::*;
/// Forward 4x4 DCT; `#[arcane]` establishes the NEON feature context for the
/// `#[rite]` body.
#[arcane]
pub(crate) fn dct4x4_neon(_token: NeonToken, block: &mut [i32; 16]) {
dct4x4_neon_inner(_token, block);
}
#[rite]
fn dct4x4_neon_inner(_token: NeonToken, block: &mut [i32; 16]) {
// Load the four i32 rows and narrow to i16 lanes; the forward passes run
// entirely in 16-bit arithmetic (inputs are pixel residuals, so they fit).
let (b0, b1, b2, b3) = rows4(block);
let r0 = simd_mem::vld1q_s32(b0);
let r1 = simd_mem::vld1q_s32(b1);
let r2 = simd_mem::vld1q_s32(b2);
let r3 = simd_mem::vld1q_s32(b3);
let d0 = vmovn_s32(r0);
let d1 = vmovn_s32(r1);
let d2 = vmovn_s32(r2);
let d3 = vmovn_s32(r3);
// Transpose so pass 1 works along columns; pass 1 re-transposes before
// pass 2 produces row-ordered, already widened i32 output.
let (t0t1, t3t2) = transpose_4x4_s16_neon(_token, d0, d1, d2, d3);
let (p0p1, p3p2) = forward_pass_1_neon(_token, t0t1, t3t2);
let out = forward_pass_2_neon(_token, p0p1, p3p2);
let (b0, b1, b2, b3) = rows4_mut(block);
simd_mem::vst1q_s32(b0, out[0]);
simd_mem::vst1q_s32(b1, out[1]);
simd_mem::vst1q_s32(b2, out[2]);
simd_mem::vst1q_s32(b3, out[3]);
}
/// Transposes a 4x4 i16 matrix given as four 4-lane vectors. Returns the
/// result packed as two 8-lane vectors: (row0|row1, row3|row2). Note the
/// second vector carries row 3 in its LOW half and row 2 in its HIGH half —
/// exactly the layout the butterfly passes consume.
#[rite]
fn transpose_4x4_s16_neon(
_token: NeonToken,
a: int16x4_t,
b: int16x4_t,
c: int16x4_t,
d: int16x4_t,
) -> (int16x8_t, int16x8_t) {
// Standard NEON transpose: interleave 16-bit lanes, then 32-bit pairs,
// then recombine 64-bit halves.
let ab = vtrn_s16(a, b);
let cd = vtrn_s16(c, d);
let tmp02 = vtrn_s32(vreinterpret_s32_s16(ab.0), vreinterpret_s32_s16(cd.0));
let tmp13 = vtrn_s32(vreinterpret_s32_s16(ab.1), vreinterpret_s32_s16(cd.1));
let out01 = vreinterpretq_s16_s64(vcombine_s64(
vreinterpret_s64_s32(tmp02.0),
vreinterpret_s64_s32(tmp13.0),
));
let out32 = vreinterpretq_s16_s64(vcombine_s64(
vreinterpret_s64_s32(tmp13.1),
vreinterpret_s64_s32(tmp02.1),
));
(out01, out32)
}
/// Forward DCT pass 1 (column direction; inputs are pre-transposed rows in
/// the (r0|r1, r3|r2) packing). Uses the WebP constants 5352/2217 with
/// rounding biases 1812/937 and a >>9 narrowing shift, then returns the
/// intermediate matrix transposed again for pass 2.
#[rite]
fn forward_pass_1_neon(
_token: NeonToken,
d0d1: int16x8_t,
d3d2: int16x8_t,
) -> (int16x8_t, int16x8_t) {
let k_cst937 = vdupq_n_s32(937);
let k_cst1812 = vdupq_n_s32(1812);
// Butterfly: a0..a3 computed pairwise thanks to the (r0|r1, r3|r2) layout.
let a0a1 = vaddq_s16(d0d1, d3d2);
let a3a2 = vsubq_s16(d0d1, d3d2);
// (a0 + a1) * 8 and (a0 - a1) * 8 for the even outputs.
let a0a1_2 = vshlq_n_s16::<3>(a0a1);
let tmp0 = vadd_s16(vget_low_s16(a0a1_2), vget_high_s16(a0a1_2));
let tmp2 = vsub_s16(vget_low_s16(a0a1_2), vget_high_s16(a0a1_2));
// Odd outputs: widening multiply-accumulate by 2217/5352, biased rounding,
// then narrow with >>9.
let a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
let a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
let a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
let a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
let tmp1 = vshrn_n_s32::<9>(vaddq_s32(a2_p_a3, k_cst1812));
let tmp3 = vshrn_n_s32::<9>(vaddq_s32(a3_m_a2, k_cst937));
transpose_4x4_s16_neon(_token, tmp0, tmp1, tmp2, tmp3)
}
/// Forward DCT pass 2 (row direction). Produces the final coefficients,
/// widened to i32 vectors in row order [out0, out1, out2, out3].
#[rite]
fn forward_pass_2_neon(_token: NeonToken, d0d1: int16x8_t, d3d2: int16x8_t) -> [int32x4_t; 4] {
// k_cst12000 carries an extra 1 << 16 so that vaddhn (which keeps the high
// halves) yields ((v + 12000) >> 16) + 1; see the a3_eq_0 correction below.
let k_cst12000 = vdupq_n_s32(12000 + (1 << 16));
let k_cst51000 = vdupq_n_s32(51000);
let a0a1 = vaddq_s16(d0d1, d3d2);
let a3a2 = vsubq_s16(d0d1, d3d2);
// Even outputs: (a0 + a1 + 7) >> 4 and (a0 - a1 + 7) >> 4.
let a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
let out0 = vshr_n_s16::<4>(vadd_s16(a0_k7, vget_high_s16(a0a1)));
let out2 = vshr_n_s16::<4>(vsub_s16(a0_k7, vget_high_s16(a0a1)));
let a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
let a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
let a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
let a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
let tmp1 = vaddhn_s32(a2_p_a3, k_cst12000);
let out3 = vaddhn_s32(a3_m_a2, k_cst51000);
// When a3 == 0 the comparison mask is all-ones (-1), cancelling the +1
// baked into k_cst12000 — net effect: out1 += (a3 != 0), as in the WebP
// reference transform.
let a3_eq_0 = vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
let out1 = vadd_s16(tmp1, a3_eq_0);
[
vmovl_s16(out0),
vmovl_s16(out1),
vmovl_s16(out2),
vmovl_s16(out3),
]
}
// IDCT constants: KC1 = 20091; KC2_HALF = 17734 = 35468 / 2 — halved because
// vqdmulh doubles the product, so vqdmulh(x, 17734) == (x * 35468) >> 16.
const KC1: i16 = 20091;
const KC2_HALF: i16 = 17734;
/// Inverse 4x4 DCT; `#[arcane]` wrapper around the `#[rite]` body.
#[arcane]
pub(crate) fn idct4x4_neon(_token: NeonToken, block: &mut [i32; 16]) {
idct4x4_neon_inner(_token, block);
}
#[rite]
fn idct4x4_neon_inner(_token: NeonToken, block: &mut [i32; 16]) {
// Narrow the i32 rows to i16 and pack two rows per vector.
let (b0, b1, b2, b3) = rows4(block);
let r0 = simd_mem::vld1q_s32(b0);
let r1 = simd_mem::vld1q_s32(b1);
let r2 = simd_mem::vld1q_s32(b2);
let r3 = simd_mem::vld1q_s32(b3);
let in01 = vcombine_s16(vmovn_s32(r0), vmovn_s32(r1));
let in23 = vcombine_s16(vmovn_s32(r2), vmovn_s32(r3));
// Two identical passes (each pass transposes its output), then the final
// rounding (+4) >> 3 before widening back to i32.
let (t01, t23) = itransform_pass_neon(_token, in01, in23);
let (res01, res23) = itransform_pass_neon(_token, t01, t23);
let four = vdupq_n_s16(4);
let res01_r = vshrq_n_s16::<3>(vaddq_s16(res01, four));
let res23_r = vshrq_n_s16::<3>(vaddq_s16(res23, four));
let (b0, b1, b2, b3) = rows4_mut(block);
simd_mem::vst1q_s32(b0, vmovl_s16(vget_low_s16(res01_r)));
simd_mem::vst1q_s32(b1, vmovl_s16(vget_high_s16(res01_r)));
simd_mem::vst1q_s32(b2, vmovl_s16(vget_low_s16(res23_r)));
simd_mem::vst1q_s32(b3, vmovl_s16(vget_high_s16(res23_r)));
}
/// One IDCT butterfly pass over a 4x4 block held as (row0|row1, row2|row3).
/// Returns the result transposed (so two passes complete the 2-D transform).
/// Saturating ops guard the 16-bit intermediate arithmetic.
#[rite]
fn itransform_pass_neon(
_token: NeonToken,
in01: int16x8_t,
in23: int16x8_t,
) -> (int16x8_t, int16x8_t) {
// b1 = odd rows (row1 | row3).
let b1 = vcombine_s16(vget_high_s16(in01), vget_high_s16(in23));
// c0 ~= x + (x * 20091) >> 16: vqdmulh doubles, the >>1 in vsra halves.
let c0 = vsraq_n_s16::<1>(b1, vqdmulhq_n_s16(b1, KC1));
// c1 = (x * 35468) >> 16 via the halved constant.
let c1 = vqdmulhq_n_s16(b1, KC2_HALF);
// Even/odd butterfly: a = r0 + r2, b = r0 - r2,
// c = c1(r1) - c0(r3), d = c0(r1) + c1(r3).
let a = vqadd_s16(vget_low_s16(in01), vget_low_s16(in23));
let b = vqsub_s16(vget_low_s16(in01), vget_low_s16(in23));
let c = vqsub_s16(vget_low_s16(c1), vget_high_s16(c0));
let d = vqadd_s16(vget_low_s16(c0), vget_high_s16(c1));
// Combine: e0 = (a+d | b+c), e1 = (b-c | a-d); the half swap keeps the
// output rows in 0,1,2,3 order after the zip-based transpose below.
let d0 = vcombine_s16(a, b); let d1 = vcombine_s16(d, c);
let e0 = vqaddq_s16(d0, d1); let e_tmp = vqsubq_s16(d0, d1); let e1 = vcombine_s16(vget_high_s16(e_tmp), vget_low_s16(e_tmp));
let tmp = vzipq_s16(e0, e1);
let out = vzipq_s16(tmp.0, tmp.1);
(out.0, out.1)
}
/// Fused IDCT + add-residue for one 4x4 block at (x0, y0) in `block`;
/// clears `coeffs` afterwards so the buffer can be reused.
#[arcane]
pub(crate) fn idct_add_residue_inplace_neon(
_token: NeonToken,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
idct_add_residue_inplace_neon_inner(_token, coeffs, block, y0, x0, stride, dc_only);
}
#[rite]
pub(crate) fn idct_add_residue_inplace_neon_inner(
_token: NeonToken,
coeffs: &mut [i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
dc_only: bool,
) {
if dc_only {
idct_add_residue_dc_neon(_token, coeffs, block, y0, x0, stride);
} else {
idct_add_residue_full_neon(_token, coeffs, block, y0, x0, stride);
}
coeffs.fill(0);
}
/// DC-only reconstruction: broadcast the rounded DC value and add it to each
/// of the four 4-pixel rows with u8 saturation.
#[rite]
fn idct_add_residue_dc_neon(
_token: NeonToken,
coeffs: &[i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
) {
// One bounds check up front for the whole 4-row window.
let base = y0 * stride + x0;
let region = &mut block[base..base + 3 * stride + 4];
let dc = coeffs[0];
let dc_adj = ((dc + 4) >> 3) as i16;
let dc_vec = vdupq_n_s16(dc_adj);
for &off in &[0, stride, stride * 2, stride * 3] {
// Load 4 prediction bytes via a u32, widen to i16, add, and narrow with
// unsigned saturation (vqmovun clamps to [0, 255]).
let pred_bytes: [u8; 4] = region[off..off + 4].try_into().unwrap();
let pred_u32 = u32::from_ne_bytes(pred_bytes);
let pred_v = vreinterpret_u8_u32(vmov_n_u32(pred_u32));
let pred_i16 = vreinterpretq_s16_u16(vmovl_u8(pred_v));
let sum = vaddq_s16(pred_i16, dc_vec);
let packed = vqmovun_s16(sum);
let result_u32 = vget_lane_u32::<0>(vreinterpret_u32_u8(packed));
region[off..off + 4].copy_from_slice(&result_u32.to_ne_bytes());
}
}
/// Full IDCT + add: runs the two butterfly passes (without the final
/// rounding shift), then folds the rounding into vrsra while accumulating
/// onto the prediction.
#[rite]
fn idct_add_residue_full_neon(
_token: NeonToken,
coeffs: &[i32; 16],
block: &mut [u8],
y0: usize,
x0: usize,
stride: usize,
) {
let base = y0 * stride + x0;
let region = &mut block[base..base + 3 * stride + 4];
let (c0, c1, c2, c3) = rows4(coeffs);
let r0 = simd_mem::vld1q_s32(c0);
let r1 = simd_mem::vld1q_s32(c1);
let r2 = simd_mem::vld1q_s32(c2);
let r3 = simd_mem::vld1q_s32(c3);
let in01 = vcombine_s16(vmovn_s32(r0), vmovn_s32(r1));
let in23 = vcombine_s16(vmovn_s32(r2), vmovn_s32(r3));
let (t01, t23) = itransform_pass_neon(_token, in01, in23);
let (res01, res23) = itransform_pass_neon(_token, t01, t23);
// Each packed vector carries two rows: select low/high half per row.
let res_rows: [(int16x8_t, bool); 4] =
[(res01, false), (res01, true), (res23, false), (res23, true)];
let offsets = [0, stride, stride * 2, stride * 3];
for (idx, &(res, use_high)) in res_rows.iter().enumerate() {
let residual = if use_high {
vcombine_s16(vget_high_s16(res), vget_high_s16(res))
} else {
vcombine_s16(vget_low_s16(res), vget_low_s16(res))
};
let off = offsets[idx];
let pred_bytes: [u8; 4] = region[off..off + 4].try_into().unwrap();
let pred_u32 = u32::from_ne_bytes(pred_bytes);
let pred_v = vreinterpret_u8_u32(vmov_n_u32(pred_u32));
let pred_i16 = vreinterpretq_s16_u16(vmovl_u8(pred_v));
// vrsra applies the rounding shift ((residual + 4) >> 3) while adding
// to the prediction in one instruction.
let out = vrsraq_n_s16::<3>(pred_i16, residual);
let packed = vqmovun_s16(out);
let result_u32 = vget_lane_u32::<0>(vreinterpret_u32_u8(packed));
region[off..off + 4].copy_from_slice(&result_u32.to_ne_bytes());
}
}
/// NEON forward transform of the residual (`src` - `ref_`) for one 4x4 block.
#[arcane]
pub(crate) fn ftransform_from_u8_4x4_neon(
_token: NeonToken,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
ftransform_from_u8_4x4_neon_inner(_token, src, ref_)
}
#[rite]
fn ftransform_from_u8_4x4_neon_inner(
_token: NeonToken,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
let s = simd_mem::vld1q_u8(src);
let r = simd_mem::vld1q_u8(ref_);
// vsubl_u8 performs a widening (wrapping) u16 subtraction; reinterpreting
// as i16 recovers the correct signed difference in [-255, 255].
let diff_01 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
let diff_23 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
let d0 = vget_low_s16(diff_01);
let d1 = vget_high_s16(diff_01);
let d2 = vget_low_s16(diff_23);
let d3 = vget_high_s16(diff_23);
// Same pipeline as dct4x4_neon_inner, minus the i32 round trip.
let (t0t1, t3t2) = transpose_4x4_s16_neon(_token, d0, d1, d2, d3);
let (p0p1, p3p2) = forward_pass_1_neon(_token, t0t1, t3t2);
let out = forward_pass_2_neon(_token, p0p1, p3p2);
let mut result = [0i32; 16];
let (r0, r1, r2, r3) = rows4_mut(&mut result);
simd_mem::vst1q_s32(r0, out[0]);
simd_mem::vst1q_s32(r1, out[1]);
simd_mem::vst1q_s32(r2, out[2]);
simd_mem::vst1q_s32(r3, out[3]);
result
}
/// Transforms two horizontally adjacent 4x4 blocks read from strided planes:
/// gathers each block into a flat 16-byte array, then reuses the single-block
/// NEON transform.
#[arcane]
pub(crate) fn ftransform2_neon(
_token: NeonToken,
src: &[u8],
ref_: &[u8],
src_stride: usize,
ref_stride: usize,
out: &mut [[i32; 16]; 2],
) {
for blk in 0..2 {
let sx = blk * 4;
let mut s_flat = [0u8; 16];
let mut r_flat = [0u8; 16];
for row in 0..4 {
s_flat[row * 4..row * 4 + 4].copy_from_slice(&src[row * src_stride + sx..][..4]);
r_flat[row * 4..row * 4 + 4].copy_from_slice(&ref_[row * ref_stride + sx..][..4]);
}
out[blk] = ftransform_from_u8_4x4_neon(_token, &s_flat, &r_flat);
}
}
/// Adds an already-inverse-transformed residual block to the prediction at
/// (x0, y0), saturating each pixel to [0, 255].
#[arcane]
pub(crate) fn add_residue_neon(
_token: NeonToken,
pblock: &mut [u8],
rblock: &[i32; 16],
y0: usize,
x0: usize,
stride: usize,
) {
add_residue_neon_inner(_token, pblock, rblock, y0, x0, stride);
}
#[rite]
fn add_residue_neon_inner(
_token: NeonToken,
pblock: &mut [u8],
rblock: &[i32; 16],
y0: usize,
x0: usize,
stride: usize,
) {
// Narrow each residual row i32 -> i16 (residuals fit in 16 bits).
let (b0, b1, b2, b3) = rows4(rblock);
let r0_i32 = simd_mem::vld1q_s32(b0);
let r1_i32 = simd_mem::vld1q_s32(b1);
let r2_i32 = simd_mem::vld1q_s32(b2);
let r3_i32 = simd_mem::vld1q_s32(b3);
let r0_i16 = vmovn_s32(r0_i32); let r1_i16 = vmovn_s32(r1_i32);
let r2_i16 = vmovn_s32(r2_i32);
let r3_i16 = vmovn_s32(r3_i32);
for (row, r_i16) in [(0usize, r0_i16), (1, r1_i16), (2, r2_i16), (3, r3_i16)] {
let offset = (y0 + row) * stride + x0;
// Only 4 pixels per row are live; the upper lanes are zero-padded and
// ignored on store.
let mut pred_bytes = [0u8; 8]; pred_bytes[..4].copy_from_slice(&pblock[offset..offset + 4]);
let pred_u8 = simd_mem::vld1_u8(&pred_bytes);
let pred_i16 = vreinterpretq_s16_u16(vmovl_u8(pred_u8));
let sum = vaddq_s16(pred_i16, vcombine_s16(r_i16, vdup_n_s16(0)));
// vqmovun saturates to [0, 255] while narrowing back to u8.
let result = vqmovun_s16(sum);
let mut out_bytes = [0u8; 8];
simd_mem::vst1_u8(&mut out_bytes, result);
pblock[offset..offset + 4].copy_from_slice(&out_bytes[..4]);
}
}
}
#[cfg(target_arch = "aarch64")]
pub(crate) use neon_transform::*;
/// wasm32 simd128 implementations of the 4x4 forward and inverse transforms.
/// Works in 32-bit lanes throughout (unlike the 16-bit NEON/SSE2 paths), so
/// it mirrors the scalar reference arithmetic closely. The inner
/// `#[cfg(target_arch = "wasm32")]` attributes are redundant with the module
/// gate but harmless.
#[cfg(target_arch = "wasm32")]
mod wasm_transform {
use super::*;
// NOTE(review): unlike `idct4x4_wasm` below, this wrapper is not marked
// `#[arcane]` even though it calls the `#[rite]` impl — confirm archmage
// permits this call pattern on wasm32, or add `#[arcane]` for consistency.
pub(crate) fn dct4x4_wasm(_token: Wasm128Token, block: &mut [i32; 16]) {
dct4x4_wasm_impl(_token, block);
}
/// Inverse 4x4 DCT; `#[arcane]` wrapper around the `#[rite]` body.
#[cfg(target_arch = "wasm32")]
#[arcane]
pub(crate) fn idct4x4_wasm(_token: Wasm128Token, block: &mut [i32; 16]) {
idct4x4_wasm_impl(_token, block);
}
/// Loads one 4-lane i32 row of the block into a v128.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_row(block: &[i32], row: usize) -> v128 {
let off = row * 4;
i32x4(block[off], block[off + 1], block[off + 2], block[off + 3])
}
/// Stores one v128 back as a 4-lane i32 row of the block.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_row(block: &mut [i32], row: usize, v: v128) {
let off = row * 4;
block[off] = i32x4_extract_lane::<0>(v);
block[off + 1] = i32x4_extract_lane::<1>(v);
block[off + 2] = i32x4_extract_lane::<2>(v);
block[off + 3] = i32x4_extract_lane::<3>(v);
}
/// Per-lane (v * c) >> 16 with full 64-bit intermediates (sign-preserving),
/// matching the scalar reference's i64 multiply-and-shift.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn mulhi32x4(v: v128, c: i32) -> v128 {
let c_vec = i32x4_splat(c);
let lo_prod = i64x2_extmul_low_i32x4(v, c_vec);
let hi_prod = i64x2_extmul_high_i32x4(v, c_vec);
let lo_shifted = i64x2_shr(lo_prod, 16);
let hi_shifted = i64x2_shr(hi_prod, 16);
i32x4(
i64x2_extract_lane::<0>(lo_shifted) as i32,
i64x2_extract_lane::<1>(lo_shifted) as i32,
i64x2_extract_lane::<0>(hi_shifted) as i32,
i64x2_extract_lane::<1>(hi_shifted) as i32,
)
}
/// Shuffle-based 4x4 i32 transpose: interleave 32-bit lanes, then recombine
/// 64-bit halves.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn transpose4x4(r0: v128, r1: v128, r2: v128, r3: v128) -> (v128, v128, v128, v128) {
let t0 = i32x4_shuffle::<0, 4, 1, 5>(r0, r1); let t1 = i32x4_shuffle::<2, 6, 3, 7>(r0, r1); let t2 = i32x4_shuffle::<0, 4, 1, 5>(r2, r3); let t3 = i32x4_shuffle::<2, 6, 3, 7>(r2, r3);
let o0 = i64x2_shuffle::<0, 2>(t0, t2); let o1 = i64x2_shuffle::<1, 3>(t0, t2); let o2 = i64x2_shuffle::<0, 2>(t1, t3); let o3 = i64x2_shuffle::<1, 3>(t1, t3);
(o0, o1, o2, o3)
}
// VP8 inverse-transform constants (same as the scalar reference's C1/C2).
const WASM_CONST1: i32 = 20091; const WASM_CONST2: i32 = 35468;
/// One IDCT butterfly over four parallel columns (same math as the scalar
/// reference, vectorized lane-wise).
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn idct_butterfly(in0: v128, in1: v128, in2: v128, in3: v128) -> (v128, v128, v128, v128) {
let a = i32x4_add(in0, in2);
let b = i32x4_sub(in0, in2);
let t1_c = mulhi32x4(in1, WASM_CONST2);
let t2_c = i32x4_add(in3, mulhi32x4(in3, WASM_CONST1));
let c = i32x4_sub(t1_c, t2_c);
let t1_d = i32x4_add(in1, mulhi32x4(in1, WASM_CONST1));
let t2_d = mulhi32x4(in3, WASM_CONST2);
let d = i32x4_add(t1_d, t2_d);
let out0 = i32x4_add(a, d);
let out1 = i32x4_add(b, c);
let out2 = i32x4_sub(b, c);
let out3 = i32x4_sub(a, d);
(out0, out1, out2, out3)
}
/// Inverse 4x4 DCT: column butterfly, transpose, row butterfly, transpose
/// back, then the final rounding (+4) >> 3.
#[cfg(target_arch = "wasm32")]
#[rite]
pub(crate) fn idct4x4_wasm_impl(_token: Wasm128Token, block: &mut [i32; 16]) {
let r0 = load_row(block, 0);
let r1 = load_row(block, 1);
let r2 = load_row(block, 2);
let r3 = load_row(block, 3);
let (v0, v1, v2, v3) = idct_butterfly(r0, r1, r2, r3);
let (c0, c1, c2, c3) = transpose4x4(v0, v1, v2, v3);
let (t0, t1, t2, t3) = idct_butterfly(c0, c1, c2, c3);
let (f0, f1, f2, f3) = transpose4x4(t0, t1, t2, t3);
let four = i32x4_splat(4);
let o0 = i32x4_shr(i32x4_add(f0, four), 3);
let o1 = i32x4_shr(i32x4_add(f1, four), 3);
let o2 = i32x4_shr(i32x4_add(f2, four), 3);
let o3 = i32x4_shr(i32x4_add(f3, four), 3);
store_row(block, 0, o0);
store_row(block, 1, o1);
store_row(block, 2, o2);
store_row(block, 3, o3);
}
/// Forward DCT pass 1: inputs scaled by 8, odd outputs use the WebP
/// constants 2217/5352 with biases 14500/7500 and a >>12 shift.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn dct_butterfly(in0: v128, in1: v128, in2: v128, in3: v128) -> (v128, v128, v128, v128) {
let eight = i32x4_splat(8);
let a = i32x4_mul(i32x4_add(in0, in3), eight);
let b = i32x4_mul(i32x4_add(in1, in2), eight);
let c = i32x4_mul(i32x4_sub(in1, in2), eight);
let d = i32x4_mul(i32x4_sub(in0, in3), eight);
let k2217 = i32x4_splat(2217);
let k5352 = i32x4_splat(5352);
let out0 = i32x4_add(a, b);
let k14500 = i32x4_splat(14500);
let out1 = i32x4_shr(
i32x4_add(i32x4_add(i32x4_mul(c, k2217), i32x4_mul(d, k5352)), k14500),
12,
);
let out2 = i32x4_sub(a, b);
let k7500 = i32x4_splat(7500);
let out3 = i32x4_shr(
i32x4_add(i32x4_sub(i32x4_mul(d, k2217), i32x4_mul(c, k5352)), k7500),
12,
);
(out0, out1, out2, out3)
}
/// Forward DCT pass 2: final descaling ((a ± b + 7) >> 4 for the even
/// outputs, biased >>16 for the odd ones).
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn dct_butterfly_pass2(in0: v128, in1: v128, in2: v128, in3: v128) -> (v128, v128, v128, v128) {
let a = i32x4_add(in0, in3);
let b = i32x4_add(in1, in2);
let c = i32x4_sub(in1, in2);
let d = i32x4_sub(in0, in3);
let k2217 = i32x4_splat(2217);
let k5352 = i32x4_splat(5352);
let k7 = i32x4_splat(7);
let k12000 = i32x4_splat(12000);
let k51000 = i32x4_splat(51000);
let out0 = i32x4_shr(i32x4_add(i32x4_add(a, b), k7), 4);
let out1_raw = i32x4_shr(
i32x4_add(i32x4_add(i32x4_mul(c, k2217), i32x4_mul(d, k5352)), k12000),
16,
);
// d_ne_0 is all-ones (-1) where d != 0; subtracting it implements the
// WebP reference's "out1 += (d != 0)" correction.
let d_ne_0 = v128_not(i32x4_eq(d, i32x4_splat(0)));
let out1 = i32x4_sub(out1_raw, d_ne_0);
let out2 = i32x4_shr(i32x4_add(i32x4_sub(a, b), k7), 4);
let out3 = i32x4_shr(
i32x4_add(i32x4_sub(i32x4_mul(d, k2217), i32x4_mul(c, k5352)), k51000),
16,
);
(out0, out1, out2, out3)
}
/// simd128 forward transform of the residual (`src` - `ref_`).
#[cfg(target_arch = "wasm32")]
#[arcane]
pub(crate) fn ftransform_from_u8_4x4_wasm(
_token: Wasm128Token,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
ftransform_from_u8_4x4_wasm_impl(_token, src, ref_)
}
#[cfg(target_arch = "wasm32")]
#[rite]
pub(crate) fn ftransform_from_u8_4x4_wasm_impl(
_token: Wasm128Token,
src: &[u8; 16],
ref_: &[u8; 16],
) -> [i32; 16] {
// Build the byte vectors lane-by-lane (no unaligned-load helper here).
let src_vec = u8x16(
src[0], src[1], src[2], src[3], src[4], src[5], src[6], src[7], src[8], src[9],
src[10], src[11], src[12], src[13], src[14], src[15],
);
let ref_vec = u8x16(
ref_[0], ref_[1], ref_[2], ref_[3], ref_[4], ref_[5], ref_[6], ref_[7], ref_[8],
ref_[9], ref_[10], ref_[11], ref_[12], ref_[13], ref_[14], ref_[15],
);
// Widen u8 -> u16; the wrapping i16 subtraction yields the correct signed
// difference, which is then sign-extended to i32 rows.
let src_lo = u16x8_extend_low_u8x16(src_vec);
let src_hi = u16x8_extend_high_u8x16(src_vec);
let ref_lo = u16x8_extend_low_u8x16(ref_vec);
let ref_hi = u16x8_extend_high_u8x16(ref_vec);
let diff_lo = i16x8_sub(src_lo, ref_lo);
let diff_hi = i16x8_sub(src_hi, ref_hi);
let r0 = i32x4_extend_low_i16x8(diff_lo);
let r1 = i32x4_extend_high_i16x8(diff_lo);
let r2 = i32x4_extend_low_i16x8(diff_hi);
let r3 = i32x4_extend_high_i16x8(diff_hi);
// Column pass, then row pass (transpose around each butterfly).
let (c0, c1, c2, c3) = transpose4x4(r0, r1, r2, r3);
let (v0, v1, v2, v3) = dct_butterfly(c0, c1, c2, c3);
let (t0, t1, t2, t3) = transpose4x4(v0, v1, v2, v3);
let (o0, o1, o2, o3) = dct_butterfly_pass2(t0, t1, t2, t3);
let mut result = [0i32; 16];
store_row(&mut result, 0, o0);
store_row(&mut result, 1, o1);
store_row(&mut result, 2, o2);
store_row(&mut result, 3, o3);
result
}
/// Forward 4x4 DCT on an i32 block in place (same pipeline as the residual
/// variant, minus the u8 widening front-end).
#[cfg(target_arch = "wasm32")]
#[rite]
pub(crate) fn dct4x4_wasm_impl(_token: Wasm128Token, block: &mut [i32; 16]) {
let r0 = load_row(block, 0);
let r1 = load_row(block, 1);
let r2 = load_row(block, 2);
let r3 = load_row(block, 3);
let (c0, c1, c2, c3) = transpose4x4(r0, r1, r2, r3);
let (v0, v1, v2, v3) = dct_butterfly(c0, c1, c2, c3);
let (t0, t1, t2, t3) = transpose4x4(v0, v1, v2, v3);
let (o0, o1, o2, o3) = dct_butterfly_pass2(t0, t1, t2, t3);
store_row(block, 0, o0);
store_row(block, 1, o1);
store_row(block, 2, o2);
store_row(block, 3, o3);
}
}
#[cfg(target_arch = "wasm32")]
pub(crate) use wasm_transform::*;
#[cfg(test)]
mod tests {
extern crate std;
use super::*;
// The dispatched DCT followed by the dispatched IDCT must be the identity
// for in-range pixel data (the transform pair is exact, not approximate).
#[test]
fn test_dct_inverse() {
const BLOCK: [i32; 16] = [
38, 6, 210, 107, 42, 125, 185, 151, 241, 224, 125, 233, 227, 8, 57, 96,
];
let mut dct_block = BLOCK;
dct4x4(&mut dct_block);
let mut inverse_dct_block = dct_block;
idct4x4(&mut inverse_dct_block);
assert_eq!(BLOCK, inverse_dct_block);
}
/// Reference scalar VP8 inverse 4x4 DCT, written directly from the spec
/// constants (C1 = 20091, C2 = 35468) with i64 intermediates. Pass 1 runs
/// down the columns; pass 2 runs across the rows and applies the final
/// (+4) >> 3 rounding.
fn reference_idct4x4(block: &mut [i32; 16]) {
    const C1: i64 = 20091;
    const C2: i64 = 35468;
    // Vertical (column) pass — no rounding yet.
    for col in 0..4usize {
        let x0 = i64::from(block[col]);
        let x1 = i64::from(block[col + 4]);
        let x2 = i64::from(block[col + 8]);
        let x3 = i64::from(block[col + 12]);
        let a = x0 + x2;
        let b = x0 - x2;
        let c = ((x1 * C2) >> 16) - (x3 + ((x3 * C1) >> 16));
        let d = (x1 + ((x1 * C1) >> 16)) + ((x3 * C2) >> 16);
        block[col] = (a + d) as i32;
        block[col + 4] = (b + c) as i32;
        block[col + 8] = (b - c) as i32;
        block[col + 12] = (a - d) as i32;
    }
    // Horizontal (row) pass with the final rounded descale.
    for row in 0..4usize {
        let base = row * 4;
        let x0 = i64::from(block[base]);
        let x1 = i64::from(block[base + 1]);
        let x2 = i64::from(block[base + 2]);
        let x3 = i64::from(block[base + 3]);
        let a = x0 + x2;
        let b = x0 - x2;
        let c = ((x1 * C2) >> 16) - (x3 + ((x3 * C1) >> 16));
        let d = (x1 + ((x1 * C1) >> 16)) + ((x3 * C2) >> 16);
        block[base] = ((a + d + 4) >> 3) as i32;
        block[base + 1] = ((b + c + 4) >> 3) as i32;
        block[base + 2] = ((b - c + 4) >> 3) as i32;
        block[base + 3] = ((a - d + 4) >> 3) as i32;
    }
}
/// Reference residual add: for each of the 16 coefficients, add it to the
/// corresponding prediction pixel in the 4x4 window at (x0, y0) and clamp
/// the result to the valid u8 range.
fn reference_add_residue(
    block: &mut [u8],
    coeffs: &[i32; 16],
    y0: usize,
    x0: usize,
    stride: usize,
) {
    for (i, &residual) in coeffs.iter().enumerate() {
        // Coefficients are stored row-major: i = row * 4 + col.
        let pos = (y0 + i / 4) * stride + x0 + i % 4;
        block[pos] = (i32::from(block[pos]) + residual).clamp(0, 255) as u8;
    }
}
// End-to-end check: the dispatched IDCT + add-residue path must reproduce
// the scalar reference (reference_idct4x4 + reference_add_residue) exactly,
// across DC-only, mixed, large, zero, AC-only, and near-saturating inputs.
#[test]
fn idct_add_residue_matches_reference() {
let test_cases: &[[i32; 16]] = &[
[100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[
200, 50, -30, 20, 40, -20, 10, -5, -10, 5, -3, 2, 8, -4, 2, -1,
],
[
500, -300, 200, -100, 150, -80, 60, -40, -50, 30, -20, 10, 25, -15, 10, -5,
],
[0; 16],
[0, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[
2047, -2048, 1000, -1000, 500, -500, 250, -250, 125, -125, 63, -63, 31, -31, 15,
-15,
],
];
// Offset (1, 1) inside a 12-wide buffer exercises unaligned placement.
let stride = 12; let y0 = 1;
let x0 = 1;
let _block_size = stride * 6;
for (case_idx, &coeffs_orig) in test_cases.iter().enumerate() {
let mut ref_coeffs = coeffs_orig;
reference_idct4x4(&mut ref_coeffs);
let mut ref_block = [128u8; 72]; reference_add_residue(&mut ref_block, &ref_coeffs, y0, x0, stride);
let mut disp_coeffs = coeffs_orig;
let mut disp_block = [128u8; 72];
crate::common::prediction::idct_add_residue_and_clear::<72>(
&mut disp_block,
&mut disp_coeffs,
y0,
x0,
stride,
);
// On mismatch, print a per-pixel diff before failing to ease debugging.
if ref_block != disp_block {
std::eprintln!("MISMATCH in test case {case_idx}: coeffs = {coeffs_orig:?}");
for row in 0..4 {
for col in 0..4 {
let pos = (y0 + row) * stride + x0 + col;
let r = ref_block[pos];
let d = disp_block[pos];
if r != d {
std::eprintln!(
" [{row},{col}] (pos {pos}): ref={r} dispatched={d} diff={}",
r as i32 - d as i32
);
}
}
}
}
assert_eq!(
ref_block, disp_block,
"test case {case_idx}: dispatched IDCT+add_residue differs from reference"
);
}
}
}