#![deny(unsafe_code)]
#![allow(dead_code)]
use core::arch::wasm32::*;
use crate::include::common::bitdepth::BitDepth;
use crate::include::dav1d::picture::PicOffset;
use crate::src::levels::TxfmSize;
use crate::src::safe_simd::pixel_access::{wasm_loadi32, wasm_storei32};
use crate::src::strided::Strided as _;
use zerocopy::IntoBytes;
use crate::src::levels::DCT_DCT;
/// Lane-wise 4-point inverse DCT butterfly on four `i32x4` vectors.
///
/// Each lane is processed independently: lane `i` of the four returned
/// vectors is the transform of lane `i` of `c0..c3`, so one call handles four
/// 1-D transforms at once. Every output lane is clamped to
/// `[clip_min, clip_max]`.
///
/// All arithmetic uses the 32-bit SIMD add/sub/mul and arithmetic right
/// shifts, with the same constants and rounding offsets as the scalar
/// formulas noted beside each step.
#[inline(always)]
fn dct4_4rows(
    c0: v128,
    c1: v128,
    c2: v128,
    c3: v128,
    clip_min: i32,
    clip_max: i32,
) -> (v128, v128, v128, v128) {
    let lo_bound = i32x4_splat(clip_min);
    let hi_bound = i32x4_splat(clip_max);

    // (v * 181 + 128) >> 8; 181/256 is presumably the Q8 approximation of 1/sqrt(2).
    let scale_181 = |v: v128| -> v128 {
        i32x4_shr(
            i32x4_add(i32x4_mul(v, i32x4_splat(181)), i32x4_splat(128)),
            8,
        )
    };
    // (v + 2048) >> 12: rounding shift for the 12-bit fixed-point products.
    let round_shift12 = |v: v128| -> v128 { i32x4_shr(i32x4_add(v, i32x4_splat(2048)), 12) };
    // Upper bound is applied first, then the lower bound.
    let clamp = |v: v128| -> v128 { i32x4_max(i32x4_min(v, hi_bound), lo_bound) };

    let k1567 = i32x4_splat(1567);
    // 3784 is stored with 4096 subtracted; the operand is added/subtracted
    // back after the shift to compensate.
    let k3784_off = i32x4_splat(3784 - 4096);

    // Even half: sum and difference of inputs 0 and 2.
    let even_sum = scale_181(i32x4_add(c0, c2));
    let even_diff = scale_181(i32x4_sub(c0, c2));

    // Odd half: rotation of inputs 1 and 3.
    // ((c1 * 1567 - c3 * (3784 - 4096) + 2048) >> 12) - c3
    let odd_a = i32x4_sub(
        round_shift12(i32x4_sub(i32x4_mul(c1, k1567), i32x4_mul(c3, k3784_off))),
        c3,
    );
    // ((c1 * (3784 - 4096) + c3 * 1567 + 2048) >> 12) + c1
    let odd_b = i32x4_add(
        round_shift12(i32x4_add(i32x4_mul(c1, k3784_off), i32x4_mul(c3, k1567))),
        c1,
    );

    // Final butterfly between the even and odd halves.
    (
        clamp(i32x4_add(even_sum, odd_b)),
        clamp(i32x4_add(even_diff, odd_a)),
        clamp(i32x4_sub(even_diff, odd_a)),
        clamp(i32x4_sub(even_sum, odd_b)),
    )
}
/// Transposes a 4x4 matrix of `i32` lanes held as four vectors.
///
/// Lane `j` of the `i`-th returned vector is lane `i` of the `j`-th input
/// vector.
#[inline(always)]
fn transpose_4x4(r0: v128, r1: v128, r2: v128, r3: v128) -> (v128, v128, v128, v128) {
    // Step 1: interleave the 32-bit lanes of each pair of inputs.
    //   pair01_low  = [r0[0], r1[0], r0[1], r1[1]]
    //   pair01_high = [r0[2], r1[2], r0[3], r1[3]]
    let pair01_low = i32x4_shuffle::<0, 4, 1, 5>(r0, r1);
    let pair01_high = i32x4_shuffle::<2, 6, 3, 7>(r0, r1);
    let pair23_low = i32x4_shuffle::<0, 4, 1, 5>(r2, r3);
    let pair23_high = i32x4_shuffle::<2, 6, 3, 7>(r2, r3);

    // Step 2: join matching 64-bit halves of the two interleaved pairs.
    (
        i64x2_shuffle::<0, 2>(pair01_low, pair23_low),
        i64x2_shuffle::<1, 3>(pair01_low, pair23_low),
        i64x2_shuffle::<0, 2>(pair01_high, pair23_high),
        i64x2_shuffle::<1, 3>(pair01_high, pair23_high),
    )
}
/// Inverse 4x4 DCT_DCT for 8-bit pixels: transforms `coeff`, adds the
/// rounded residual to `dst`, and zeroes the 16 consumed coefficients.
///
/// * `dst` - pixel bytes; row `r` occupies `dst[r * dst_stride..][..4]`.
/// * `dst_stride` - distance in bytes between the starts of consecutive rows.
/// * `coeff` - at least 16 coefficients. `coeff[4 * k..4 * k + 4]` is input
///   `k` of the first 1-D pass with one lane per block row, i.e. the
///   `coeff[y + x * 4]` arrangement (the same convention the 8x8 kernel in
///   this file uses with a factor of 8).
/// * `_eob` - unused; the full transform always runs.
/// * `bitdepth_max` - largest pixel value. `255` selects the `i16` range for
///   the intermediate clamps; any other value derives them from
///   `!bitdepth_max`.
///
/// # Panics
///
/// Panics if `coeff` has fewer than 16 elements or `dst` is too short for
/// four rows at `dst_stride`.
fn inv_txfm_add_dct_dct_4x4_8bpc(
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    // Widen four consecutive i16 coefficients into one i32x4.
    let load4 = |base: usize| -> v128 {
        i32x4(
            coeff[base] as i32,
            coeff[base + 1] as i32,
            coeff[base + 2] as i32,
            coeff[base + 3] as i32,
        )
    };
    let (c0, c1, c2, c3) = (load4(0), load4(4), load4(8), load4(12));

    // Intermediate clamp ranges for the first (row) and second (column) pass.
    let (row_clip_min, col_clip_min) = if bitdepth_max == 255 {
        (i16::MIN as i32, i16::MIN as i32)
    } else {
        ((!bitdepth_max) << 7, (!bitdepth_max) << 5)
    };
    let row_clip_max = !row_clip_min;
    let col_clip_max = !col_clip_min;

    // First pass: lane y of `r_k` is the k-th transformed value of block row y.
    let (r0, r1, r2, r3) = dct4_4rows(c0, c1, c2, c3, row_clip_min, row_clip_max);

    // Re-orient so that vector k is intermediate row k (lanes run along x),
    // which is exactly what the column pass needs as its k-th input.
    let (tc0, tc1, tc2, tc3) = transpose_4x4(r0, r1, r2, r3);
    let cmin = i32x4_splat(col_clip_min);
    let cmax = i32x4_splat(col_clip_max);
    let tc0 = i32x4_max(i32x4_min(tc0, cmax), cmin);
    let tc1 = i32x4_max(i32x4_min(tc1, cmax), cmin);
    let tc2 = i32x4_max(i32x4_min(tc2, cmax), cmin);
    let tc3 = i32x4_max(i32x4_min(tc3, cmax), cmin);

    // Second pass: `f_k` is output row k with lanes along x. No further
    // transpose is needed; transposing here would write the residual mirrored
    // across the diagonal.
    let (f0, f1, f2, f3) = dct4_4rows(tc0, tc1, tc2, tc3, col_clip_min, col_clip_max);

    let rnd = i32x4_splat(8);
    let zero = i32x4_splat(0);
    let max_val = i32x4_splat(bitdepth_max);
    for (row_idx, row) in [(0, f0), (1, f1), (2, f2), (3, f3)] {
        let off = row_idx * dst_stride;
        // Final rounding: (x + 8) >> 4.
        let residual = i32x4_shr(i32x4_add(row, rnd), 4);

        // Load four pixels and widen them to i32 lanes.
        let px = wasm_loadi32!(&dst[off..off + 4]);
        let px_wide = i32x4(
            u8x16_extract_lane::<0>(px) as i32,
            u8x16_extract_lane::<1>(px) as i32,
            u8x16_extract_lane::<2>(px) as i32,
            u8x16_extract_lane::<3>(px) as i32,
        );

        // Add the residual and clamp to the valid pixel range.
        let sum = i32x4_max(i32x4_min(i32x4_add(px_wide, residual), max_val), zero);

        // Narrow back to bytes; only the low four lanes are stored.
        let packed = u8x16(
            i32x4_extract_lane::<0>(sum) as u8,
            i32x4_extract_lane::<1>(sum) as u8,
            i32x4_extract_lane::<2>(sum) as u8,
            i32x4_extract_lane::<3>(sum) as u8,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
        );
        wasm_storei32!(&mut dst[off..off + 4], packed);
    }

    // The coefficients have been consumed; leave the buffer cleared.
    coeff[..16].fill(0);
}
/// Inverse 4x4 DCT_DCT for 16-bit pixel storage: transforms `coeff`, adds the
/// rounded residual to `dst`, and zeroes the 16 consumed coefficients.
///
/// * `dst` - pixels; row `r` occupies `dst[r * dst_stride_u16..][..4]`.
/// * `dst_stride_u16` - distance between row starts, in `u16` elements.
/// * `coeff` - at least 16 coefficients. `coeff[4 * k..4 * k + 4]` is input
///   `k` of the first 1-D pass with one lane per block row, i.e. the
///   `coeff[y + x * 4]` arrangement.
/// * `_eob` - unused; the full transform always runs.
/// * `bitdepth_max` - largest pixel value; also determines the intermediate
///   clamp ranges via `!bitdepth_max`.
///
/// # Panics
///
/// Panics if `coeff` has fewer than 16 elements or `dst` is too short for
/// four rows at `dst_stride_u16`.
fn inv_txfm_add_dct_dct_4x4_16bpc(
    dst: &mut [u16],
    dst_stride_u16: usize,
    coeff: &mut [i32],
    _eob: i32,
    bitdepth_max: i32,
) {
    let c0 = i32x4(coeff[0], coeff[1], coeff[2], coeff[3]);
    let c1 = i32x4(coeff[4], coeff[5], coeff[6], coeff[7]);
    let c2 = i32x4(coeff[8], coeff[9], coeff[10], coeff[11]);
    let c3 = i32x4(coeff[12], coeff[13], coeff[14], coeff[15]);

    // Intermediate clamp ranges for the first (row) and second (column) pass.
    let row_clip_min = (!bitdepth_max) << 7;
    let row_clip_max = !row_clip_min;
    let col_clip_min = (!bitdepth_max) << 5;
    let col_clip_max = !col_clip_min;

    // First pass: lane y of `r_k` is the k-th transformed value of block row y.
    let (r0, r1, r2, r3) = dct4_4rows(c0, c1, c2, c3, row_clip_min, row_clip_max);

    // Re-orient so that vector k is intermediate row k (lanes run along x).
    let (tc0, tc1, tc2, tc3) = transpose_4x4(r0, r1, r2, r3);
    let cmin = i32x4_splat(col_clip_min);
    let cmax = i32x4_splat(col_clip_max);
    let tc0 = i32x4_max(i32x4_min(tc0, cmax), cmin);
    let tc1 = i32x4_max(i32x4_min(tc1, cmax), cmin);
    let tc2 = i32x4_max(i32x4_min(tc2, cmax), cmin);
    let tc3 = i32x4_max(i32x4_min(tc3, cmax), cmin);

    // Second pass: `f_k` is output row k with lanes along x. No further
    // transpose is needed; transposing here would write the residual mirrored
    // across the diagonal.
    let (f0, f1, f2, f3) = dct4_4rows(tc0, tc1, tc2, tc3, col_clip_min, col_clip_max);

    let rnd = i32x4_splat(8);
    let zero = i32x4_splat(0);
    let max_val = i32x4_splat(bitdepth_max);
    for (row_idx, row) in [(0, f0), (1, f1), (2, f2), (3, f3)] {
        let off = row_idx * dst_stride_u16;
        // Final rounding: (x + 8) >> 4.
        let residual = i32x4_shr(i32x4_add(row, rnd), 4);

        let px = &mut dst[off..off + 4];
        let current = i32x4(px[0] as i32, px[1] as i32, px[2] as i32, px[3] as i32);

        // Add the residual and clamp to the valid pixel range.
        let sum = i32x4_max(i32x4_min(i32x4_add(current, residual), max_val), zero);
        px[0] = i32x4_extract_lane::<0>(sum) as u16;
        px[1] = i32x4_extract_lane::<1>(sum) as u16;
        px[2] = i32x4_extract_lane::<2>(sum) as u16;
        px[3] = i32x4_extract_lane::<3>(sum) as u16;
    }

    // The coefficients have been consumed; leave the buffer cleared.
    coeff[..16].fill(0);
}
/// Lane-wise 8-point inverse DCT on eight `i32x4` vectors.
///
/// Each lane is processed independently, so one call performs four 1-D
/// transforms. The even-indexed inputs go through [`dct4_4rows`]; the
/// odd-indexed inputs are rotated and then combined with that result in a
/// final butterfly. Intermediate sums and all outputs are clamped to
/// `[clip_min, clip_max]`.
#[inline(always)]
fn dct8_4rows(
    c0: v128,
    c1: v128,
    c2: v128,
    c3: v128,
    c4: v128,
    c5: v128,
    c6: v128,
    c7: v128,
    clip_min: i32,
    clip_max: i32,
) -> (v128, v128, v128, v128, v128, v128, v128, v128) {
    let lo_bound = i32x4_splat(clip_min);
    let hi_bound = i32x4_splat(clip_max);
    // Upper bound is applied first, then the lower bound.
    let clamp = |v: v128| -> v128 { i32x4_max(i32x4_min(v, hi_bound), lo_bound) };
    // (v + 2048) >> 12
    let round_shift12 = |v: v128| -> v128 { i32x4_shr(i32x4_add(v, i32x4_splat(2048)), 12) };
    // (v + 1024) >> 11
    let round_shift11 = |v: v128| -> v128 { i32x4_shr(i32x4_add(v, i32x4_splat(1024)), 11) };
    // (v * 181 + 128) >> 8
    let scale_181 = |v: v128| -> v128 {
        i32x4_shr(
            i32x4_add(i32x4_mul(v, i32x4_splat(181)), i32x4_splat(128)),
            8,
        )
    };

    // Even half: a 4-point transform over inputs 0, 2, 4 and 6.
    let (even0, even1, even2, even3) = dct4_4rows(c0, c2, c4, c6, clip_min, clip_max);

    let k799 = i32x4_splat(799);
    // 4017 is stored with 4096 subtracted; the operand is added/subtracted
    // back after the shift to compensate.
    let k4017_off = i32x4_splat(4017 - 4096);
    let k1703 = i32x4_splat(1703);
    let k1138 = i32x4_splat(1138);

    // Odd half, first stage: rotations of (c1, c7) and (c5, c3).
    // ((c1 * 799 - c7 * (4017 - 4096) + 2048) >> 12) - c7
    let rot4 = i32x4_sub(
        round_shift12(i32x4_sub(i32x4_mul(c1, k799), i32x4_mul(c7, k4017_off))),
        c7,
    );
    // ((c1 * (4017 - 4096) + c7 * 799 + 2048) >> 12) + c1
    let rot7 = i32x4_add(
        round_shift12(i32x4_add(i32x4_mul(c1, k4017_off), i32x4_mul(c7, k799))),
        c1,
    );
    // (c5 * 1703 - c3 * 1138 + 1024) >> 11
    let rot5 = round_shift11(i32x4_sub(i32x4_mul(c5, k1703), i32x4_mul(c3, k1138)));
    // (c5 * 1138 + c3 * 1703 + 1024) >> 11
    let rot6 = round_shift11(i32x4_add(i32x4_mul(c5, k1138), i32x4_mul(c3, k1703)));

    // Odd half, second stage: clamped sums and differences.
    let odd4 = clamp(i32x4_add(rot4, rot5));
    let diff45 = clamp(i32x4_sub(rot4, rot5));
    let odd7 = clamp(i32x4_add(rot7, rot6));
    let diff76 = clamp(i32x4_sub(rot7, rot6));

    // Odd half, third stage: scale the difference/sum of the two differences.
    let odd5 = scale_181(i32x4_sub(diff76, diff45));
    let odd6 = scale_181(i32x4_add(diff76, diff45));

    // Final butterfly: outputs k and 7 - k share one even/odd pair.
    (
        clamp(i32x4_add(even0, odd7)),
        clamp(i32x4_add(even1, odd6)),
        clamp(i32x4_add(even2, odd5)),
        clamp(i32x4_add(even3, odd4)),
        clamp(i32x4_sub(even3, odd4)),
        clamp(i32x4_sub(even2, odd5)),
        clamp(i32x4_sub(even1, odd6)),
        clamp(i32x4_sub(even0, odd7)),
    )
}
/// Transposes eight 4-lane vectors into four pairs of 4-lane vectors.
///
/// `c0..c3` and `c4..c7` are each transposed as a 4x4 block with
/// [`transpose_4x4`]. Pair `i` of the result is
/// `(lane i of c0..c3, lane i of c4..c7)`, i.e. the eight values found at
/// lane `i` across all inputs, split into a low and a high half.
#[inline(always)]
fn transpose_8x4_to_4x8(
    c0: v128,
    c1: v128,
    c2: v128,
    c3: v128,
    c4: v128,
    c5: v128,
    c6: v128,
    c7: v128,
) -> ((v128, v128), (v128, v128), (v128, v128), (v128, v128)) {
    let first_half = transpose_4x4(c0, c1, c2, c3);
    let second_half = transpose_4x4(c4, c5, c6, c7);
    (
        (first_half.0, second_half.0),
        (first_half.1, second_half.1),
        (first_half.2, second_half.2),
        (first_half.3, second_half.3),
    )
}
/// Inverse 8x8 DCT_DCT for 8-bit pixels: transforms `coeff`, adds the
/// rounded residual to `dst`, and zeroes the 64 consumed coefficients.
///
/// * `dst` - pixel bytes; row `r` occupies `dst[r * dst_stride..][..8]`.
/// * `dst_stride` - distance in bytes between the starts of consecutive rows.
/// * `coeff` - at least 64 coefficients. `coeff[8 * k..8 * k + 8]` is input
///   `k` of the first 1-D pass with one lane per block row, i.e. the
///   `coeff[y + x * 8]` arrangement.
/// * `_eob` - unused; the full transform always runs.
/// * `bitdepth_max` - largest pixel value. `255` selects the `i16` range for
///   the intermediate clamps; any other value derives them from
///   `!bitdepth_max`.
///
/// # Panics
///
/// Panics if `coeff` has fewer than 64 elements or `dst` is too short for
/// eight rows at `dst_stride`.
fn inv_txfm_add_dct_dct_8x8_8bpc(
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    /// Rounds one 8-wide output row with `(x + 8) >> 4`, adds it to the eight
    /// pixels starting at `dst[off]`, and clamps each result to `[0, bdmax]`.
    #[inline(always)]
    fn write_row_8bpc(dst: &mut [u8], off: usize, out_lo: v128, out_hi: v128, bdmax: i32) {
        let rnd = i32x4_splat(8);
        let out_lo = i32x4_shr(i32x4_add(out_lo, rnd), 4);
        let out_hi = i32x4_shr(i32x4_add(out_hi, rnd), 4);
        let residual = [
            i32x4_extract_lane::<0>(out_lo),
            i32x4_extract_lane::<1>(out_lo),
            i32x4_extract_lane::<2>(out_lo),
            i32x4_extract_lane::<3>(out_lo),
            i32x4_extract_lane::<0>(out_hi),
            i32x4_extract_lane::<1>(out_hi),
            i32x4_extract_lane::<2>(out_hi),
            i32x4_extract_lane::<3>(out_hi),
        ];
        for (px, r) in dst[off..off + 8].iter_mut().zip(residual) {
            *px = (i32::from(*px) + r).clamp(0, bdmax) as u8;
        }
    }

    // Intermediate clamp ranges for the first (row) and second (column) pass.
    let (row_clip_min, col_clip_min) = if bitdepth_max == 255 {
        (i16::MIN as i32, i16::MIN as i32)
    } else {
        ((!bitdepth_max) << 7, (!bitdepth_max) << 5)
    };
    let row_clip_max = !row_clip_min;
    let col_clip_max = !col_clip_min;

    // Widen four consecutive i16 coefficients into one i32x4.
    let load4 = |base: usize| -> v128 {
        i32x4(
            coeff[base] as i32,
            coeff[base + 1] as i32,
            coeff[base + 2] as i32,
            coeff[base + 3] as i32,
        )
    };

    // First pass. Each group of eight coefficients is split into block rows
    // 0..4 ("lo", offset 0) and block rows 4..8 ("hi", offset 4).
    let (r0_lo, r1_lo, r2_lo, r3_lo, r4_lo, r5_lo, r6_lo, r7_lo) = dct8_4rows(
        load4(0),
        load4(8),
        load4(16),
        load4(24),
        load4(32),
        load4(40),
        load4(48),
        load4(56),
        row_clip_min,
        row_clip_max,
    );
    let (r0_hi, r1_hi, r2_hi, r3_hi, r4_hi, r5_hi, r6_hi, r7_hi) = dct8_4rows(
        load4(4),
        load4(12),
        load4(20),
        load4(28),
        load4(36),
        load4(44),
        load4(52),
        load4(60),
        row_clip_min,
        row_clip_max,
    );

    // Inter-pass rounding shift, (x + 1) >> 1, followed by the column clamp.
    let one = i32x4_splat(1);
    let cmin = i32x4_splat(col_clip_min);
    let cmax = i32x4_splat(col_clip_max);
    let clamp_shift =
        |v: v128| -> v128 { i32x4_max(i32x4_min(i32x4_shr(i32x4_add(v, one), 1), cmax), cmin) };
    let [c0_lo, c1_lo, c2_lo, c3_lo, c4_lo, c5_lo, c6_lo, c7_lo] =
        [r0_lo, r1_lo, r2_lo, r3_lo, r4_lo, r5_lo, r6_lo, r7_lo].map(&clamp_shift);
    let [c0_hi, c1_hi, c2_hi, c3_hi, c4_hi, c5_hi, c6_hi, c7_hi] =
        [r0_hi, r1_hi, r2_hi, r3_hi, r4_hi, r5_hi, r6_hi, r7_hi].map(&clamp_shift);

    // Re-orient: each (lo, hi) pair is one intermediate row, split into the
    // left four and right four positions.
    let ((row0_lo, row0_hi), (row1_lo, row1_hi), (row2_lo, row2_hi), (row3_lo, row3_hi)) =
        transpose_8x4_to_4x8(c0_lo, c1_lo, c2_lo, c3_lo, c4_lo, c5_lo, c6_lo, c7_lo);
    let ((row4_lo, row4_hi), (row5_lo, row5_hi), (row6_lo, row6_hi), (row7_lo, row7_hi)) =
        transpose_8x4_to_4x8(c0_hi, c1_hi, c2_hi, c3_hi, c4_hi, c5_hi, c6_hi, c7_hi);

    // Second pass: left half of the block, then right half.
    let (f0_lo, f1_lo, f2_lo, f3_lo, f4_lo, f5_lo, f6_lo, f7_lo) = dct8_4rows(
        row0_lo,
        row1_lo,
        row2_lo,
        row3_lo,
        row4_lo,
        row5_lo,
        row6_lo,
        row7_lo,
        col_clip_min,
        col_clip_max,
    );
    let (f0_hi, f1_hi, f2_hi, f3_hi, f4_hi, f5_hi, f6_hi, f7_hi) = dct8_4rows(
        row0_hi,
        row1_hi,
        row2_hi,
        row3_hi,
        row4_hi,
        row5_hi,
        row6_hi,
        row7_hi,
        col_clip_min,
        col_clip_max,
    );

    // `f_k` is already output row k, so it is written straight to row k.
    for (row_idx, lo, hi) in [
        (0, f0_lo, f0_hi),
        (1, f1_lo, f1_hi),
        (2, f2_lo, f2_hi),
        (3, f3_lo, f3_hi),
        (4, f4_lo, f4_hi),
        (5, f5_lo, f5_hi),
        (6, f6_lo, f6_hi),
        (7, f7_lo, f7_hi),
    ] {
        write_row_8bpc(dst, row_idx * dst_stride, lo, hi, bitdepth_max);
    }

    // The coefficients have been consumed; leave the buffer cleared.
    coeff[..64].fill(0);
}
/// Runs a WASM SIMD inverse transform for the given block if this file has a
/// kernel for it.
///
/// Returns `true` when a kernel ran (the residual was added to `dst` and the
/// consumed coefficients were zeroed). Returns `false`, without touching
/// `dst` or `coeff`, when `tx_type` is not `DCT_DCT`, when `tx_size` is not a
/// valid [`TxfmSize`], or when no kernel exists for the size. Kernels
/// available: 4x4 and 8x8 at 8 bpc, 4x4 at 16 bpc.
///
/// * `tx_size` - numeric `TxfmSize` discriminant.
/// * `tx_type` - numeric transform type; only `DCT_DCT` is handled.
/// * `dst` - destination picture position.
/// * `coeff` - coefficient buffer, reinterpreted as `i16` (8 bpc) or `i32`
///   (16 bpc).
/// * `eob` - forwarded to the kernels, which currently ignore it.
/// * `bd` - bit-depth descriptor; `bd.into_c()` supplies the maximum pixel
///   value.
///
/// # Panics
///
/// Panics if the coefficient or pixel buffers cannot be reinterpreted as the
/// element type the kernel expects, or if a kernel indexes past the end of
/// its buffers.
pub fn itxfm_add_dispatch<BD: BitDepth>(
    tx_size: usize,
    tx_type: usize,
    dst: PicOffset,
    coeff: &mut [BD::Coef],
    eob: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    // Compare in `usize`: narrowing `tx_type` with `as u8` would let values
    // such as 256 wrap around and be mistaken for `DCT_DCT`.
    if tx_type != usize::from(DCT_DCT) {
        return false;
    }
    let txfm = match TxfmSize::from_repr(tx_size) {
        Some(t) => t,
        None => return false,
    };
    let (w, h) = txfm.to_wh();
    // NOTE(review): only the magnitude of the stride is used and the base
    // offset returned by `strided_slice_mut` is ignored below. Confirm this is
    // correct for pictures with a negative stride.
    let byte_stride_u = dst.stride().unsigned_abs() as usize;
    let bd_c = bd.into_c();

    match BD::BPC {
        BPC::BPC8 => {
            let coeff_i16: &mut [i16] = zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                .expect("coeff alignment/size mismatch for i16 reinterpretation");
            let (mut guard, _base) = dst.strided_slice_mut::<BD>(w, h);
            let dst_u8: &mut [u8] = guard.as_mut_bytes();
            match txfm {
                TxfmSize::S4x4 => {
                    inv_txfm_add_dct_dct_4x4_8bpc(dst_u8, byte_stride_u, coeff_i16, eob, bd_c);
                    true
                }
                TxfmSize::S8x8 => {
                    inv_txfm_add_dct_dct_8x8_8bpc(dst_u8, byte_stride_u, coeff_i16, eob, bd_c);
                    true
                }
                // No SIMD kernel for this size.
                _ => false,
            }
        }
        BPC::BPC16 => {
            let coeff_i32: &mut [i32] = zerocopy::FromBytes::mut_from_bytes(coeff.as_mut_bytes())
                .expect("coeff alignment/size mismatch for i32 reinterpretation");
            let (mut guard, _base) = dst.strided_slice_mut::<BD>(w, h);
            let dst_bytes: &mut [u8] = guard.as_mut_bytes();
            let dst_u16: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(dst_bytes)
                .expect("dst alignment/size mismatch for u16 reinterpretation");
            // The 16 bpc kernel indexes `u16` elements, so halve the byte stride.
            let stride_u16 = byte_stride_u / 2;
            match txfm {
                TxfmSize::S4x4 => {
                    inv_txfm_add_dct_dct_4x4_16bpc(dst_u16, stride_u16, coeff_i32, eob, bd_c);
                    true
                }
                // No SIMD kernel for this size.
                _ => false,
            }
        }
    }
}