use archmage::{Desktop64, Server64, SimdToken, arcane, rite};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::include::common::bitdepth::AsPrimitive;
use crate::include::common::bitdepth::BitDepth;
use crate::include::common::bitdepth::DynCoef;
use crate::include::common::bitdepth::DynPixel;
use crate::include::common::intops::iclip;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
use crate::src::safe_simd::pixel_access::Flex;
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, loadu_256, loadu_512, storei32, storei64, storeu_128, storeu_256,
storeu_512,
};
use std::ffi::c_int;
use std::num::NonZeroUsize;
use std::slice;
/// Shift amount (in bits) that pairs with a fixed-point multiplier: a right shift by 8
/// divides by 256.
/// NOTE(review): no use of this constant is visible in this part of the file — confirm
/// against its callers before relying on this description.
const SQRT2_BITS: i32 = 8;
/// 181 / 256 ≈ 0.7071, i.e. sqrt(2)/2 expressed with 8 fractional bits. The same literal
/// 181 multiplier (followed by `+ 128 >> 8`) appears in the DCT butterflies in this file.
const SQRT2_HALF: i32 = 181;
// Transposes an 8x8 matrix of 32-bit integers held in eight `__m256i` vectors.
//
// Input: an expression of type `[__m256i; 8]`; vector `i` holds elements `(i, 0..8)`.
// Output: `[__m256i; 8]` where vector `j` holds element `j` of every input vector,
// in input order.
//
// The three stages work around the fact that the AVX2 unpack instructions operate
// independently on each 128-bit half of the register:
//   1. 32-bit unpacks interleave neighbouring vector pairs,
//   2. 64-bit unpacks gather four inputs' elements into each 128-bit half,
//   3. 128-bit permutes stitch the matching halves of vectors 0-3 and 4-7 together.
#[cfg(target_arch = "x86_64")]
macro_rules! transpose_8x8_i32 {
($a:expr) => {{
let __cols: [__m256i; 8] = $a;
// Stage 1: interleave 32-bit elements of pairs (0,1), (2,3), (4,5), (6,7).
let __t0 = _mm256_unpacklo_epi32(__cols[0], __cols[1]);
let __t1 = _mm256_unpackhi_epi32(__cols[0], __cols[1]);
let __t2 = _mm256_unpacklo_epi32(__cols[2], __cols[3]);
let __t3 = _mm256_unpackhi_epi32(__cols[2], __cols[3]);
let __t4 = _mm256_unpacklo_epi32(__cols[4], __cols[5]);
let __t5 = _mm256_unpackhi_epi32(__cols[4], __cols[5]);
let __t6 = _mm256_unpacklo_epi32(__cols[6], __cols[7]);
let __t7 = _mm256_unpackhi_epi32(__cols[6], __cols[7]);
// Stage 2: interleave 64-bit pairs so each 128-bit half carries one element index
// from four consecutive inputs (0-3 in __u0..__u3, 4-7 in __u4..__u7).
let __u0 = _mm256_unpacklo_epi64(__t0, __t2);
let __u1 = _mm256_unpackhi_epi64(__t0, __t2);
let __u2 = _mm256_unpacklo_epi64(__t1, __t3);
let __u3 = _mm256_unpackhi_epi64(__t1, __t3);
let __u4 = _mm256_unpacklo_epi64(__t4, __t6);
let __u5 = _mm256_unpackhi_epi64(__t4, __t6);
let __u6 = _mm256_unpacklo_epi64(__t5, __t7);
let __u7 = _mm256_unpackhi_epi64(__t5, __t7);
// Stage 3: 0x20 joins the two low 128-bit halves (element indices 0-3),
// 0x31 joins the two high halves (element indices 4-7).
let __r0 = _mm256_permute2x128_si256::<0x20>(__u0, __u4);
let __r1 = _mm256_permute2x128_si256::<0x20>(__u1, __u5);
let __r2 = _mm256_permute2x128_si256::<0x20>(__u2, __u6);
let __r3 = _mm256_permute2x128_si256::<0x20>(__u3, __u7);
let __r4 = _mm256_permute2x128_si256::<0x31>(__u0, __u4);
let __r5 = _mm256_permute2x128_si256::<0x31>(__u1, __u5);
let __r6 = _mm256_permute2x128_si256::<0x31>(__u2, __u6);
let __r7 = _mm256_permute2x128_si256::<0x31>(__u3, __u7);
[__r0, __r1, __r2, __r3, __r4, __r5, __r6, __r7]
}};
}
// Fused "multiply two 16-bit inputs by two coefficients, add, round, shift" step.
//
// Arguments:
//   $paired  - `__m256i` whose 32-bit lanes each hold two signed 16-bit values
//              (low half / high half).
//   $coef_a  - multiplier for the low 16-bit half of every lane.
//   $coef_b  - multiplier for the high 16-bit half of every lane.
//   $rnd     - rounding constant added to each 32-bit sum (an `i32`).
//   $shift   - literal arithmetic right-shift amount applied after rounding.
//
// Per 32-bit lane the result is `(lo * coef_a + hi * coef_b + rnd) >> shift`.
// Both coefficients are truncated to their low 16 bits when packed, so they must fit
// in a signed 16-bit value to be represented faithfully.
#[cfg(target_arch = "x86_64")]
#[allow(unused_macros)]
macro_rules! itx_mul2x_pack {
($paired:expr, $coef_a:expr, $coef_b:expr, $rnd:expr, $shift:literal) => {{
let __coef_a: i32 = $coef_a as i32;
let __coef_b: i32 = $coef_b as i32;
// Pack coef_a into bits 0..16 and coef_b into bits 16..32 of every 32-bit lane so
// that `_mm256_madd_epi16` pairs them with the low/high halves of `$paired`.
let __packed_coef = _mm256_set1_epi32(
((__coef_a as u32) & 0xFFFF) as i32 | (((__coef_b as u32) & 0xFFFF) << 16) as i32,
);
let __prod = _mm256_madd_epi16($paired, __packed_coef);
let __rounded = _mm256_add_epi32(__prod, _mm256_set1_epi32($rnd));
_mm256_srai_epi32::<$shift>(__rounded)
}};
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_4x4_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{loadi32, storei32, storeu_128};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row0 = _mm_set_epi32(
coeff[12] as i32,
coeff[8] as i32,
coeff[4] as i32,
coeff[0] as i32,
);
let row1 = _mm_set_epi32(
coeff[13] as i32,
coeff[9] as i32,
coeff[5] as i32,
coeff[1] as i32,
);
let row2 = _mm_set_epi32(
coeff[14] as i32,
coeff[10] as i32,
coeff[6] as i32,
coeff[2] as i32,
);
let row3 = _mm_set_epi32(
coeff[15] as i32,
coeff[11] as i32,
coeff[7] as i32,
coeff[3] as i32,
);
let rows01 = _mm256_set_m128i(row1, row0);
let rows23 = _mm256_set_m128i(row3, row2);
let row_clip_min = if bitdepth_max == 255 {
i16::MIN as i32
} else {
(!bitdepth_max) << 7
};
let row_clip_max = !row_clip_min;
let col_clip_min = if bitdepth_max == 255 {
i16::MIN as i32
} else {
(!bitdepth_max) << 5
};
let col_clip_max = !col_clip_min;
let (rows01_out, rows23_out) =
dct4_2rows_avx2(_token, rows01, rows23, row_clip_min, row_clip_max);
let r0 = _mm256_castsi256_si128(rows01_out);
let r1 = _mm256_extracti128_si256(rows01_out, 1);
let r2 = _mm256_castsi256_si128(rows23_out);
let r3 = _mm256_extracti128_si256(rows23_out, 1);
let t01_lo = _mm_unpacklo_epi32(r0, r1);
let t01_hi = _mm_unpackhi_epi32(r0, r1);
let t23_lo = _mm_unpacklo_epi32(r2, r3);
let t23_hi = _mm_unpackhi_epi32(r2, r3);
let col0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let col1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let col2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let col3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
let cmin = _mm_set1_epi32(col_clip_min);
let cmax = _mm_set1_epi32(col_clip_max);
let col0 = _mm_max_epi32(_mm_min_epi32(col0, cmax), cmin);
let col1 = _mm_max_epi32(_mm_min_epi32(col1, cmax), cmin);
let col2 = _mm_max_epi32(_mm_min_epi32(col2, cmax), cmin);
let col3 = _mm_max_epi32(_mm_min_epi32(col3, cmax), cmin);
let cols01 = _mm256_set_m128i(col1, col0);
let cols23 = _mm256_set_m128i(col3, col2);
let (cols01_out, cols23_out) =
dct4_2rows_avx2(_token, cols01, cols23, col_clip_min, col_clip_max);
let rnd = _mm256_set1_epi32(8);
let cols01_scaled = _mm256_srai_epi32(_mm256_add_epi32(cols01_out, rnd), 4);
let cols23_scaled = _mm256_srai_epi32(_mm256_add_epi32(cols23_out, rnd), 4);
let c0 = _mm256_castsi256_si128(cols01_scaled);
let c1 = _mm256_extracti128_si256(cols01_scaled, 1);
let c2 = _mm256_castsi256_si128(cols23_scaled);
let c3 = _mm256_extracti128_si256(cols23_scaled, 1);
let u01_lo = _mm_unpacklo_epi32(c0, c1);
let u01_hi = _mm_unpackhi_epi32(c0, c1);
let u23_lo = _mm_unpacklo_epi32(c2, c3);
let u23_hi = _mm_unpackhi_epi32(c2, c3);
let final0 = _mm_unpacklo_epi64(u01_lo, u23_lo);
let final1 = _mm_unpackhi_epi64(u01_lo, u23_lo);
let final2 = _mm_unpacklo_epi64(u01_hi, u23_hi);
let final3 = _mm_unpackhi_epi64(u01_hi, u23_hi);
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let d0 = loadi32!(&dst[..4]);
let d0_16 = _mm_unpacklo_epi8(d0, zero);
let d0_32 = _mm_cvtepi16_epi32(d0_16);
let sum0 = _mm_add_epi32(d0_32, final0);
let sum0_16 = _mm_packs_epi32(sum0, sum0);
let sum0_clamped = _mm_max_epi16(_mm_min_epi16(sum0_16, max_val), zero);
let sum0_8 = _mm_packus_epi16(sum0_clamped, sum0_clamped);
storei32!(&mut dst[..4], sum0_8);
let off1 = dst_stride;
let d1 = loadi32!(&dst[off1..off1 + 4]);
let d1_16 = _mm_unpacklo_epi8(d1, zero);
let d1_32 = _mm_cvtepi16_epi32(d1_16);
let sum1 = _mm_add_epi32(d1_32, final1);
let sum1_16 = _mm_packs_epi32(sum1, sum1);
let sum1_clamped = _mm_max_epi16(_mm_min_epi16(sum1_16, max_val), zero);
let sum1_8 = _mm_packus_epi16(sum1_clamped, sum1_clamped);
storei32!(&mut dst[off1..off1 + 4], sum1_8);
let off2 = dst_stride * 2;
let d2 = loadi32!(&dst[off2..off2 + 4]);
let d2_16 = _mm_unpacklo_epi8(d2, zero);
let d2_32 = _mm_cvtepi16_epi32(d2_16);
let sum2 = _mm_add_epi32(d2_32, final2);
let sum2_16 = _mm_packs_epi32(sum2, sum2);
let sum2_clamped = _mm_max_epi16(_mm_min_epi16(sum2_16, max_val), zero);
let sum2_8 = _mm_packus_epi16(sum2_clamped, sum2_clamped);
storei32!(&mut dst[off2..off2 + 4], sum2_8);
let off3 = dst_stride * 3;
let d3 = loadi32!(&dst[off3..off3 + 4]);
let d3_16 = _mm_unpacklo_epi8(d3, zero);
let d3_32 = _mm_cvtepi16_epi32(d3_16);
let sum3 = _mm_add_epi32(d3_32, final3);
let sum3_16 = _mm_packs_epi32(sum3, sum3);
let sum3_clamped = _mm_max_epi16(_mm_min_epi16(sum3_16, max_val), zero);
let sum3_8 = _mm_packus_epi16(sum3_clamped, sum3_clamped);
storei32!(&mut dst[off3..off3 + 4], sum3_8);
let coeff_bytes = zerocopy::IntoBytes::as_mut_bytes(&mut *coeff);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[..16]).unwrap(),
_mm_setzero_si128()
);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[16..32]).unwrap(),
_mm_setzero_si128()
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn dct4_2rows_avx2(
_token: Desktop64,
rows01: __m256i,
rows23: __m256i,
clip_min: i32,
clip_max: i32,
) -> (__m256i, __m256i) {
let sqrt2 = _mm256_set1_epi32(181);
let rnd8 = _mm256_set1_epi32(128);
let c1567 = _mm256_set1_epi32(1567);
let c_312 = _mm256_set1_epi32(3784 - 4096);
let rnd12 = _mm256_set1_epi32(2048);
let in0_01 = _mm256_shuffle_epi32(rows01, 0b00_00_00_00);
let in1_01 = _mm256_shuffle_epi32(rows01, 0b01_01_01_01);
let in2_01 = _mm256_shuffle_epi32(rows01, 0b10_10_10_10);
let in3_01 = _mm256_shuffle_epi32(rows01, 0b11_11_11_11);
let sum02_01 = _mm256_add_epi32(in0_01, in2_01);
let t0_01 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_mullo_epi32(sum02_01, sqrt2), rnd8),
8,
);
let diff02_01 = _mm256_sub_epi32(in0_01, in2_01);
let t1_01 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_mullo_epi32(diff02_01, sqrt2), rnd8),
8,
);
let mul1_1567_01 = _mm256_mullo_epi32(in1_01, c1567);
let mul3_312_01 = _mm256_mullo_epi32(in3_01, c_312);
let t2_inner_01 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_sub_epi32(mul1_1567_01, mul3_312_01), rnd12),
12,
);
let t2_01 = _mm256_sub_epi32(t2_inner_01, in3_01);
let mul1_312_01 = _mm256_mullo_epi32(in1_01, c_312);
let mul3_1567_01 = _mm256_mullo_epi32(in3_01, c1567);
let t3_inner_01 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_add_epi32(mul1_312_01, mul3_1567_01), rnd12),
12,
);
let t3_01 = _mm256_add_epi32(t3_inner_01, in1_01);
let vmin = _mm256_set1_epi32(clip_min);
let vmax = _mm256_set1_epi32(clip_max);
let out0_01 = _mm256_max_epi32(_mm256_min_epi32(_mm256_add_epi32(t0_01, t3_01), vmax), vmin);
let out1_01 = _mm256_max_epi32(_mm256_min_epi32(_mm256_add_epi32(t1_01, t2_01), vmax), vmin);
let out2_01 = _mm256_max_epi32(_mm256_min_epi32(_mm256_sub_epi32(t1_01, t2_01), vmax), vmin);
let out3_01 = _mm256_max_epi32(_mm256_min_epi32(_mm256_sub_epi32(t0_01, t3_01), vmax), vmin);
let mask0 = _mm256_set_epi32(0, 0, 0, -1i32, 0, 0, 0, -1i32);
let mask1 = _mm256_set_epi32(0, 0, -1i32, 0, 0, 0, -1i32, 0);
let mask2 = _mm256_set_epi32(0, -1i32, 0, 0, 0, -1i32, 0, 0);
let mask3 = _mm256_set_epi32(-1i32, 0, 0, 0, -1i32, 0, 0, 0);
let rows01_out = _mm256_or_si256(
_mm256_or_si256(
_mm256_and_si256(out0_01, mask0),
_mm256_and_si256(_mm256_shuffle_epi32(out1_01, 0b00_00_00_01), mask1),
),
_mm256_or_si256(
_mm256_and_si256(_mm256_shuffle_epi32(out2_01, 0b00_00_10_00), mask2),
_mm256_and_si256(_mm256_shuffle_epi32(out3_01, 0b00_11_00_00), mask3),
),
);
let in0_23 = _mm256_shuffle_epi32(rows23, 0b00_00_00_00);
let in1_23 = _mm256_shuffle_epi32(rows23, 0b01_01_01_01);
let in2_23 = _mm256_shuffle_epi32(rows23, 0b10_10_10_10);
let in3_23 = _mm256_shuffle_epi32(rows23, 0b11_11_11_11);
let sum02_23 = _mm256_add_epi32(in0_23, in2_23);
let t0_23 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_mullo_epi32(sum02_23, sqrt2), rnd8),
8,
);
let diff02_23 = _mm256_sub_epi32(in0_23, in2_23);
let t1_23 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_mullo_epi32(diff02_23, sqrt2), rnd8),
8,
);
let mul1_1567_23 = _mm256_mullo_epi32(in1_23, c1567);
let mul3_312_23 = _mm256_mullo_epi32(in3_23, c_312);
let t2_inner_23 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_sub_epi32(mul1_1567_23, mul3_312_23), rnd12),
12,
);
let t2_23 = _mm256_sub_epi32(t2_inner_23, in3_23);
let mul1_312_23 = _mm256_mullo_epi32(in1_23, c_312);
let mul3_1567_23 = _mm256_mullo_epi32(in3_23, c1567);
let t3_inner_23 = _mm256_srai_epi32(
_mm256_add_epi32(_mm256_add_epi32(mul1_312_23, mul3_1567_23), rnd12),
12,
);
let t3_23 = _mm256_add_epi32(t3_inner_23, in1_23);
let out0_23 = _mm256_max_epi32(_mm256_min_epi32(_mm256_add_epi32(t0_23, t3_23), vmax), vmin);
let out1_23 = _mm256_max_epi32(_mm256_min_epi32(_mm256_add_epi32(t1_23, t2_23), vmax), vmin);
let out2_23 = _mm256_max_epi32(_mm256_min_epi32(_mm256_sub_epi32(t1_23, t2_23), vmax), vmin);
let out3_23 = _mm256_max_epi32(_mm256_min_epi32(_mm256_sub_epi32(t0_23, t3_23), vmax), vmin);
let rows23_out = _mm256_or_si256(
_mm256_or_si256(
_mm256_and_si256(out0_23, mask0),
_mm256_and_si256(_mm256_shuffle_epi32(out1_23, 0b00_00_00_01), mask1),
),
_mm256_or_si256(
_mm256_and_si256(_mm256_shuffle_epi32(out2_23, 0b00_00_10_00), mask2),
_mm256_and_si256(_mm256_shuffle_epi32(out3_23, 0b00_11_00_00), mask3),
),
);
(rows01_out, rows23_out)
}
/// C-ABI entry point for the 8-bit 4x4 DCT/DCT inverse transform-and-add.
///
/// Wraps the raw pointers in slices and forwards to
/// `inv_txfm_add_dct_dct_4x4_8bpc_avx2_inner`.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * `dst_stride` is reinterpreted as `usize`.
///   NOTE(review): a negative stride therefore yields an enormous length — confirm
///   callers never pass one.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_bytes = dst_stride as usize;
    let n_coeffs = _coeff_len as usize;
    let pixels =
        unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, n_coeffs * row_bytes + row_bytes) };
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, n_coeffs) };
    inv_txfm_add_dct_dct_4x4_8bpc_avx2_inner(token, pixels, row_bytes, coeffs, eob, bitdepth_max);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_4x4_16bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
dst_stride: usize, coeff: &mut [i32],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let stride_u16 = dst_stride / 2;
let row0 = _mm_set_epi32(
coeff[12] as i32,
coeff[8] as i32,
coeff[4] as i32,
coeff[0] as i32,
);
let row1 = _mm_set_epi32(
coeff[13] as i32,
coeff[9] as i32,
coeff[5] as i32,
coeff[1] as i32,
);
let row2 = _mm_set_epi32(
coeff[14] as i32,
coeff[10] as i32,
coeff[6] as i32,
coeff[2] as i32,
);
let row3 = _mm_set_epi32(
coeff[15] as i32,
coeff[11] as i32,
coeff[7] as i32,
coeff[3] as i32,
);
let rows01 = _mm256_set_m128i(row1, row0);
let rows23 = _mm256_set_m128i(row3, row2);
let row_clip_min = (!bitdepth_max) << 7;
let row_clip_max = !row_clip_min;
let col_clip_min = (!bitdepth_max) << 5;
let col_clip_max = !col_clip_min;
let (rows01_out, rows23_out) =
dct4_2rows_avx2(_token, rows01, rows23, row_clip_min, row_clip_max);
let r0 = _mm256_castsi256_si128(rows01_out);
let r1 = _mm256_extracti128_si256(rows01_out, 1);
let r2 = _mm256_castsi256_si128(rows23_out);
let r3 = _mm256_extracti128_si256(rows23_out, 1);
let t01_lo = _mm_unpacklo_epi32(r0, r1);
let t01_hi = _mm_unpackhi_epi32(r0, r1);
let t23_lo = _mm_unpacklo_epi32(r2, r3);
let t23_hi = _mm_unpackhi_epi32(r2, r3);
let c0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let c1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let c2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let c3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
let cmin = _mm_set1_epi32(col_clip_min);
let cmax = _mm_set1_epi32(col_clip_max);
let c0 = _mm_max_epi32(_mm_min_epi32(c0, cmax), cmin);
let c1 = _mm_max_epi32(_mm_min_epi32(c1, cmax), cmin);
let c2 = _mm_max_epi32(_mm_min_epi32(c2, cmax), cmin);
let c3 = _mm_max_epi32(_mm_min_epi32(c3, cmax), cmin);
let cols01 = _mm256_set_m128i(c1, c0);
let cols23 = _mm256_set_m128i(c3, c2);
let (cols01_out, cols23_out) =
dct4_2rows_avx2(_token, cols01, cols23, col_clip_min, col_clip_max);
let col0 = _mm256_castsi256_si128(cols01_out);
let col1 = _mm256_extracti128_si256(cols01_out, 1);
let col2 = _mm256_castsi256_si128(cols23_out);
let col3 = _mm256_extracti128_si256(cols23_out, 1);
let t01_lo = _mm_unpacklo_epi32(col0, col1);
let t01_hi = _mm_unpackhi_epi32(col0, col1);
let t23_lo = _mm_unpacklo_epi32(col2, col3);
let t23_hi = _mm_unpackhi_epi32(col2, col3);
let out0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let out1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let out2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let out3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
let rnd = _mm_set1_epi32(8);
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi32(bitdepth_max);
let dst0 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[..4]));
let dst0_32 = _mm_unpacklo_epi16(dst0, zero);
let scaled0 = _mm_srai_epi32(_mm_add_epi32(out0, rnd), 4);
let sum0 = _mm_add_epi32(dst0_32, scaled0);
let clamped0 = _mm_max_epi32(_mm_min_epi32(sum0, max_val), zero);
let packed0 = _mm_packus_epi32(clamped0, clamped0);
storei64!(zerocopy::IntoBytes::as_mut_bytes(&mut dst[..4]), packed0);
let dst_off1 = stride_u16;
let dst1 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[dst_off1..dst_off1 + 4]));
let dst1_32 = _mm_unpacklo_epi16(dst1, zero);
let scaled1 = _mm_srai_epi32(_mm_add_epi32(out1, rnd), 4);
let sum1 = _mm_add_epi32(dst1_32, scaled1);
let clamped1 = _mm_max_epi32(_mm_min_epi32(sum1, max_val), zero);
let packed1 = _mm_packus_epi32(clamped1, clamped1);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[dst_off1..dst_off1 + 4]),
packed1
);
let dst_off2 = stride_u16 * 2;
let dst2 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[dst_off2..dst_off2 + 4]));
let dst2_32 = _mm_unpacklo_epi16(dst2, zero);
let scaled2 = _mm_srai_epi32(_mm_add_epi32(out2, rnd), 4);
let sum2 = _mm_add_epi32(dst2_32, scaled2);
let clamped2 = _mm_max_epi32(_mm_min_epi32(sum2, max_val), zero);
let packed2 = _mm_packus_epi32(clamped2, clamped2);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[dst_off2..dst_off2 + 4]),
packed2
);
let dst_off3 = stride_u16 * 3;
let dst3 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[dst_off3..dst_off3 + 4]));
let dst3_32 = _mm_unpacklo_epi16(dst3, zero);
let scaled3 = _mm_srai_epi32(_mm_add_epi32(out3, rnd), 4);
let sum3 = _mm_add_epi32(dst3_32, scaled3);
let clamped3 = _mm_max_epi32(_mm_min_epi32(sum3, max_val), zero);
let packed3 = _mm_packus_epi32(clamped3, clamped3);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[dst_off3..dst_off3 + 4]),
packed3
);
coeff[..16].fill(0);
}
/// C-ABI entry point for the 16-bit 4x4 DCT/DCT inverse transform-and-add.
///
/// Wraps the raw pointers in slices and forwards to
/// `inv_txfm_add_dct_dct_4x4_16bpc_avx2_inner`, which takes 32-bit coefficients and a
/// byte stride (it halves the stride itself to index `u16` pixels).
///
/// # Safety
///
/// * `dst_ptr` must be `u16`-aligned and valid for reads and writes of
///   `_coeff_len * dst_stride` `u16` elements.
///   NOTE(review): this length is computed from the *byte* stride but counted in
///   `u16` elements, and differs from the 8-bit wrapper's formula — confirm the
///   intended bound against the caller.
/// * `coeff` must be `i32`-aligned and valid for reads and writes of `_coeff_len`
///   `i32` values.
/// * `dst_stride` is reinterpreted as `usize`; a negative stride is not supported here.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x4_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };
    let stride = dst_stride as usize;
    let dst_slice = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u16, _coeff_len as usize * stride)
    };
    // High-bit-depth coefficients are 32-bit: the inner kernel takes `&mut [i32]`, so
    // the pointer must be viewed as `i32` (an `i16` view would neither type-check nor
    // cover the right bytes).
    let coeff_slice =
        unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, _coeff_len as usize) };
    inv_txfm_add_dct_dct_4x4_16bpc_avx2_inner(
        _token,
        dst_slice,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_wht_wht_4x4_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let col0 = _mm_srai_epi32(
_mm_set_epi32(
coeff[3] as i32,
coeff[2] as i32,
coeff[1] as i32,
coeff[0] as i32,
),
2,
);
let col1 = _mm_srai_epi32(
_mm_set_epi32(
coeff[7] as i32,
coeff[6] as i32,
coeff[5] as i32,
coeff[4] as i32,
),
2,
);
let col2 = _mm_srai_epi32(
_mm_set_epi32(
coeff[11] as i32,
coeff[10] as i32,
coeff[9] as i32,
coeff[8] as i32,
),
2,
);
let col3 = _mm_srai_epi32(
_mm_set_epi32(
coeff[15] as i32,
coeff[14] as i32,
coeff[13] as i32,
coeff[12] as i32,
),
2,
);
let t0 = _mm_add_epi32(col0, col1);
let t2 = _mm_sub_epi32(col2, col3);
let t4 = _mm_srai_epi32(_mm_sub_epi32(t0, t2), 1);
let t3 = _mm_sub_epi32(t4, col3);
let t1 = _mm_sub_epi32(t4, col1);
let r0 = _mm_sub_epi32(t0, t3);
let r1 = t3;
let r2 = t1;
let r3 = _mm_add_epi32(t2, t1);
let t01_lo = _mm_unpacklo_epi32(r0, r1);
let t01_hi = _mm_unpackhi_epi32(r0, r1);
let t23_lo = _mm_unpacklo_epi32(r2, r3);
let t23_hi = _mm_unpackhi_epi32(r2, r3);
let row0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let row1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let row2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let row3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
let t0 = _mm_add_epi32(row0, row1);
let t2 = _mm_sub_epi32(row2, row3);
let t4 = _mm_srai_epi32(_mm_sub_epi32(t0, t2), 1);
let t3 = _mm_sub_epi32(t4, row3);
let t1 = _mm_sub_epi32(t4, row1);
let final0 = _mm_sub_epi32(t0, t3);
let final1 = t3;
let final2 = t1;
let final3 = _mm_add_epi32(t2, t1);
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let d0 = loadi32!(&dst[..4]);
let d0_32 = _mm_cvtepi16_epi32(_mm_unpacklo_epi8(d0, zero));
let sum0 = _mm_add_epi32(d0_32, final0);
let sum0_8 = _mm_packus_epi16(
_mm_max_epi16(_mm_min_epi16(_mm_packs_epi32(sum0, sum0), max_val), zero),
zero,
);
storei32!(&mut dst[..4], sum0_8);
let off1 = dst_stride;
let d1 = loadi32!(&dst[off1..off1 + 4]);
let d1_32 = _mm_cvtepi16_epi32(_mm_unpacklo_epi8(d1, zero));
let sum1 = _mm_add_epi32(d1_32, final1);
let sum1_8 = _mm_packus_epi16(
_mm_max_epi16(_mm_min_epi16(_mm_packs_epi32(sum1, sum1), max_val), zero),
zero,
);
storei32!(&mut dst[off1..off1 + 4], sum1_8);
let off2 = dst_stride * 2;
let d2 = loadi32!(&dst[off2..off2 + 4]);
let d2_32 = _mm_cvtepi16_epi32(_mm_unpacklo_epi8(d2, zero));
let sum2 = _mm_add_epi32(d2_32, final2);
let sum2_8 = _mm_packus_epi16(
_mm_max_epi16(_mm_min_epi16(_mm_packs_epi32(sum2, sum2), max_val), zero),
zero,
);
storei32!(&mut dst[off2..off2 + 4], sum2_8);
let off3 = dst_stride * 3;
let d3 = loadi32!(&dst[off3..off3 + 4]);
let d3_32 = _mm_cvtepi16_epi32(_mm_unpacklo_epi8(d3, zero));
let sum3 = _mm_add_epi32(d3_32, final3);
let sum3_8 = _mm_packus_epi16(
_mm_max_epi16(_mm_min_epi16(_mm_packs_epi32(sum3, sum3), max_val), zero),
zero,
);
storei32!(&mut dst[off3..off3 + 4], sum3_8);
let coeff_bytes = zerocopy::IntoBytes::as_mut_bytes(&mut *coeff);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[..16]).unwrap(),
_mm_setzero_si128()
);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[16..32]).unwrap(),
_mm_setzero_si128()
);
}
/// C-ABI entry point for the 8-bit 4x4 WHT/WHT inverse transform-and-add.
///
/// Forwards to `inv_txfm_add_wht_wht_4x4_8bpc_avx2_inner`. Row `y` of the block lives
/// at `dst_ptr + y * dst_stride`; `dst_stride` may be negative.
///
/// # Safety
///
/// * For every `y` in `0..4`, `dst_ptr + y * dst_stride` must be valid for reads and
///   writes of 4 bytes. For a non-negative stride the whole span of
///   `3 * dst_stride + 4` bytes starting at `dst_ptr` is borrowed as one slice.
/// * `coeff` must be `i16`-aligned and valid for reads and writes of 16 `i16` values.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, 16) };
    let base = dst_ptr as *mut u8;

    if dst_stride >= 0 {
        // Rows ascend in memory: borrow the exact span the kernel touches.
        let stride = dst_stride as usize;
        let dst_slice = unsafe { std::slice::from_raw_parts_mut(base, 3 * stride + 4) };
        inv_txfm_add_wht_wht_4x4_8bpc_avx2_inner(
            _token,
            dst_slice,
            stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    } else {
        // Rows descend in memory, which a forward-indexed slice cannot express (the
        // kernel would index past a slice anchored at `dst_ptr`). Stage the four rows
        // in a packed 4x4 buffer, transform there, then scatter the rows back.
        let mut block = [0u8; 16];
        for (y, row) in block.chunks_exact_mut(4).enumerate() {
            // SAFETY: the caller guarantees each row is readable for 4 bytes; `row` is
            // a distinct local buffer, so the ranges cannot overlap.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    base.offset(y as isize * dst_stride) as *const u8,
                    row.as_mut_ptr(),
                    4,
                );
            }
        }
        inv_txfm_add_wht_wht_4x4_8bpc_avx2_inner(
            _token,
            &mut block[..],
            4,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        for (y, row) in block.chunks_exact(4).enumerate() {
            // SAFETY: the caller guarantees each row is writable for 4 bytes.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    row.as_ptr(),
                    base.offset(y as isize * dst_stride),
                    4,
                );
            }
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_wht_wht_4x4_16bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
byte_stride: usize,
coeff: &mut [i32],
_eob: i32,
bitdepth_max: i32,
) {
let dst_stride_u16 = byte_stride / 2;
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let col0 = _mm_srai_epi32(loadu_128!(&coeff[0..4], [i32; 4]), 2);
let col1 = _mm_srai_epi32(loadu_128!(&coeff[4..8], [i32; 4]), 2);
let col2 = _mm_srai_epi32(loadu_128!(&coeff[8..12], [i32; 4]), 2);
let col3 = _mm_srai_epi32(loadu_128!(&coeff[12..16], [i32; 4]), 2);
let t0 = _mm_add_epi32(col0, col1);
let t2 = _mm_sub_epi32(col2, col3);
let t4 = _mm_srai_epi32(_mm_sub_epi32(t0, t2), 1);
let t3 = _mm_sub_epi32(t4, col3);
let t1 = _mm_sub_epi32(t4, col1);
let r0 = _mm_sub_epi32(t0, t3);
let r1 = t3;
let r2 = t1;
let r3 = _mm_add_epi32(t2, t1);
let t01_lo = _mm_unpacklo_epi32(r0, r1);
let t01_hi = _mm_unpackhi_epi32(r0, r1);
let t23_lo = _mm_unpacklo_epi32(r2, r3);
let t23_hi = _mm_unpackhi_epi32(r2, r3);
let row0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let row1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let row2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let row3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
let t0 = _mm_add_epi32(row0, row1);
let t2 = _mm_sub_epi32(row2, row3);
let t4 = _mm_srai_epi32(_mm_sub_epi32(t0, t2), 1);
let t3 = _mm_sub_epi32(t4, row3);
let t1 = _mm_sub_epi32(t4, row1);
let final0 = _mm_sub_epi32(t0, t3);
let final1 = t3;
let final2 = t1;
let final3 = _mm_add_epi32(t2, t1);
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi32(bitdepth_max);
let d0 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[..4]));
let d0_32 = _mm_cvtepu16_epi32(d0);
let sum0 = _mm_max_epi32(_mm_min_epi32(_mm_add_epi32(d0_32, final0), max_val), zero);
let sum0_16 = _mm_packus_epi32(sum0, sum0);
storei64!(zerocopy::IntoBytes::as_mut_bytes(&mut dst[..4]), sum0_16);
let off1 = dst_stride_u16;
let d1 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[off1..off1 + 4]));
let d1_32 = _mm_cvtepu16_epi32(d1);
let sum1 = _mm_max_epi32(_mm_min_epi32(_mm_add_epi32(d1_32, final1), max_val), zero);
let sum1_16 = _mm_packus_epi32(sum1, sum1);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[off1..off1 + 4]),
sum1_16
);
let off2 = dst_stride_u16 * 2;
let d2 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[off2..off2 + 4]));
let d2_32 = _mm_cvtepu16_epi32(d2);
let sum2 = _mm_max_epi32(_mm_min_epi32(_mm_add_epi32(d2_32, final2), max_val), zero);
let sum2_16 = _mm_packus_epi32(sum2, sum2);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[off2..off2 + 4]),
sum2_16
);
let off3 = dst_stride_u16 * 3;
let d3 = loadi64!(zerocopy::IntoBytes::as_bytes(&dst[off3..off3 + 4]));
let d3_32 = _mm_cvtepu16_epi32(d3);
let sum3 = _mm_max_epi32(_mm_min_epi32(_mm_add_epi32(d3_32, final3), max_val), zero);
let sum3_16 = _mm_packus_epi32(sum3, sum3);
storei64!(
zerocopy::IntoBytes::as_mut_bytes(&mut dst[off3..off3 + 4]),
sum3_16
);
let coeff_bytes = zerocopy::IntoBytes::as_mut_bytes(&mut *coeff);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[..16]).unwrap(),
_mm_setzero_si128()
);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[16..32]).unwrap(),
_mm_setzero_si128()
);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[32..48]).unwrap(),
_mm_setzero_si128()
);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut coeff_bytes[48..64]).unwrap(),
_mm_setzero_si128()
);
}
/// C-ABI entry point for the 16-bit 4x4 WHT/WHT inverse transform-and-add.
///
/// Forwards to `inv_txfm_add_wht_wht_4x4_16bpc_avx2_inner`. `dst_stride` is in bytes
/// and may be negative; row `y` of the block starts `y * (dst_stride / 2)` `u16`
/// elements from `dst_ptr`.
///
/// # Safety
///
/// * `dst_ptr` must be `u16`-aligned, and for every `y` in `0..4` the row at
///   `y * (dst_stride / 2)` elements must be valid for reads and writes of 4 `u16`
///   values. For a non-negative stride the whole span of `(3 * dst_stride + 8) / 2`
///   elements starting at `dst_ptr` is borrowed as one slice.
/// * `coeff` must be `i32`-aligned and valid for reads and writes of 16 `i32` values.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_wht_wht_4x4_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };
    let coeff_slice = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, 16) };
    let base = dst_ptr as *mut u16;

    if dst_stride >= 0 {
        // Rows ascend in memory: borrow the exact span the kernel touches.
        let byte_stride = dst_stride as usize;
        let len_u16 = (3 * byte_stride + 8) / 2;
        let dst_slice = unsafe { std::slice::from_raw_parts_mut(base, len_u16) };
        inv_txfm_add_wht_wht_4x4_16bpc_avx2_inner(
            _token,
            dst_slice,
            byte_stride,
            coeff_slice,
            eob,
            bitdepth_max,
        );
    } else {
        // Rows descend in memory, which a forward-indexed slice cannot express (the
        // kernel would index past a slice anchored at `dst_ptr`). Stage the four rows
        // in a packed 4x4 buffer (8-byte pitch), transform there, then scatter back.
        let pitch_u16 = dst_stride / 2;
        let mut block = [0u16; 16];
        for (y, row) in block.chunks_exact_mut(4).enumerate() {
            // SAFETY: the caller guarantees each row is readable for 4 `u16` values;
            // `row` is a distinct local buffer, so the ranges cannot overlap.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    base.offset(y as isize * pitch_u16) as *const u16,
                    row.as_mut_ptr(),
                    4,
                );
            }
        }
        inv_txfm_add_wht_wht_4x4_16bpc_avx2_inner(
            _token,
            &mut block[..],
            8,
            coeff_slice,
            eob,
            bitdepth_max,
        );
        for (y, row) in block.chunks_exact(4).enumerate() {
            // SAFETY: the caller guarantees each row is writable for 4 `u16` values.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    row.as_ptr(),
                    base.offset(y as isize * pitch_u16),
                    4,
                );
            }
        }
    }
}
/// Applies the 4-point identity scaling twice to a 4x4 coefficient block and adds the
/// rounded result to four 4-pixel rows of an 8-bit destination.
///
/// * `dst` / `dst_stride` - destination pixels; row `y` is read and written at
///   `dst[y * dst_stride .. y * dst_stride + 4]`.
/// * `coeff` - 16 coefficients; pixel `x` of row `y` uses `coeff[y + 4 * x]`. The
///   first 16 entries are zeroed before returning.
/// * `_eob` - unused.
/// * `bitdepth_max` - maximum pixel value; 255 selects the `i16` intermediate range.
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn inv_identity_add_4x4_8bpc_avx2(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let zero = _mm_setzero_si128();
    let pixel_max = _mm_set1_epi16(bitdepth_max as i16);

    // One identity step: v * (1 + 1697/4096), rounded.
    let identity4 = |v: i32| -> i32 { v + ((v * 1697 + 2048) >> 12) };

    for y in 0..4 {
        let off = y * dst_stride;
        let px = loadi32!(&dst[off..off + 4]);
        let px16 = _mm_unpacklo_epi8(px, zero);

        let raw = [
            coeff[y] as i32,
            coeff[y + 4] as i32,
            coeff[y + 8] as i32,
            coeff[y + 12] as i32,
        ];

        // Range applied between the two identity steps.
        let mid_min = if bitdepth_max == 255 {
            i16::MIN as i32
        } else {
            (!bitdepth_max) << 5
        };
        let mid_max = !mid_min;

        // Two identity steps with a clamp in between, then the (x + 8) >> 4 rounding.
        let [r0, r1, r2, r3] = raw.map(|c| {
            let first = identity4(c).clamp(mid_min, mid_max);
            (identity4(first) + 8) >> 4
        });

        let residual = _mm_set_epi32(r3, r2, r1, r0);
        let px32 = _mm_cvtepi16_epi32(px16);
        let sum = _mm_add_epi32(px32, residual);
        let sum16 = _mm_packs_epi32(sum, sum);
        let clamped = _mm_max_epi16(_mm_min_epi16(sum16, pixel_max), zero);
        let out = _mm_packus_epi16(clamped, clamped);
        storei32!(&mut dst[off..off + 4], out);
    }

    coeff[..16].fill(0);
}
/// C-ABI entry point for the 8-bit 4x4 identity/identity inverse transform-and-add.
///
/// Wraps the raw pointers in slices and forwards to `inv_identity_add_4x4_8bpc_avx2`.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * `dst_stride` is reinterpreted as `usize`.
///   NOTE(review): a negative stride therefore yields an enormous length — confirm
///   callers never pass one.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_4x4_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_bytes = dst_stride as usize;
    let n_coeffs = _coeff_len as usize;
    let pixels =
        unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, n_coeffs * row_bytes + row_bytes) };
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, n_coeffs) };
    inv_identity_add_4x4_8bpc_avx2(token, pixels, row_bytes, coeffs, eob, bitdepth_max);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn inv_identity_add_8x8_8bpc_avx2(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let one = _mm_set1_epi32(1);
let eight = _mm_set1_epi32(8);
for y in 0..8 {
let dst_off = y * dst_stride;
let d = loadi64!(&dst[dst_off..dst_off + 8]);
let d16 = _mm_unpacklo_epi8(d, zero);
let mut coeffs = [0i16; 8];
for x in 0..8 {
coeffs[x] = coeff[y + x * 8];
}
let c_vec = loadu_128!(<&[i16; 8]>::try_from(&coeffs[..]).unwrap());
let c_lo = _mm_cvtepi16_epi32(c_vec);
let c_hi = _mm_cvtepi16_epi32(_mm_srli_si128(c_vec, 8));
let row_lo = _mm_slli_epi32(c_lo, 1);
let row_hi = _mm_slli_epi32(c_hi, 1);
let inter_lo = _mm_srai_epi32(_mm_add_epi32(row_lo, one), 1);
let inter_hi = _mm_srai_epi32(_mm_add_epi32(row_hi, one), 1);
let col_lo = _mm_slli_epi32(inter_lo, 1);
let col_hi = _mm_slli_epi32(inter_hi, 1);
let res_lo = _mm_srai_epi32(_mm_add_epi32(col_lo, eight), 4);
let res_hi = _mm_srai_epi32(_mm_add_epi32(col_hi, eight), 4);
let res16 = _mm_packs_epi32(res_lo, res_hi);
let sum = _mm_add_epi16(d16, res16);
let clamped = _mm_max_epi16(_mm_min_epi16(sum, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei64!(&mut dst[dst_off..dst_off + 8], packed);
}
coeff[..64].fill(0);
}
/// C-ABI entry point for the 8-bit 8x8 identity/identity inverse transform-and-add.
///
/// Wraps the raw pointers in slices and forwards to `inv_identity_add_8x8_8bpc_avx2`.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * `dst_stride` is reinterpreted as `usize`.
///   NOTE(review): a negative stride therefore yields an enormous length — confirm
///   callers never pass one.
/// * The CPU must support AVX2 (the capability token is forged, not checked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x8_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_bytes = dst_stride as usize;
    let n_coeffs = _coeff_len as usize;
    let pixels =
        unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, n_coeffs * row_bytes + row_bytes) };
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, n_coeffs) };
    inv_identity_add_8x8_8bpc_avx2(token, pixels, row_bytes, coeffs, eob, bitdepth_max);
}
/// In-place 4-point DCT butterfly over the elements `c[0]`, `c[stride]`,
/// `c[2 * stride]` and `c[3 * stride]`.
///
/// Uses fixed-point arithmetic (181/256 for the even half, a 1567/3784 rotation with
/// 12 fractional bits for the odd half) and clamps every output to `[min, max]`.
///
/// # Panics
///
/// Panics if `c` is shorter than `3 * stride + 1` or if `min > max`.
#[inline]
fn dct4_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let [x0, x1, x2, x3] = [c[0], c[stride], c[2 * stride], c[3 * stride]];

    // Even half: sum and difference scaled by 181/256.
    let even_sum = ((x0 + x2) * 181 + 128) >> 8;
    let even_diff = ((x0 - x2) * 181 + 128) >> 8;

    // Odd half: 3784 is written as (3784 - 4096) with the 4096 term applied as a
    // plain +/- of the input after the shift.
    let odd_lo = ((x1 * 1567 - x3 * (3784 - 4096) + 2048) >> 12) - x3;
    let odd_hi = ((x1 * (3784 - 4096) + x3 * 1567 + 2048) >> 12) + x1;

    let outputs = [
        even_sum + odd_hi,
        even_diff + odd_lo,
        even_diff - odd_lo,
        even_sum - odd_hi,
    ];
    for (k, value) in outputs.into_iter().enumerate() {
        c[k * stride] = value.clamp(min, max);
    }
}
/// In-place 8-point DCT butterfly over the elements `c[k * stride]` for `k` in `0..8`.
///
/// The even-indexed inputs are transformed by `dct4_1d` at twice the stride; the
/// odd-indexed inputs go through two fixed-point rotations and a 181/256 stage. Every
/// intermediate sum/difference and every output is clamped to `[min, max]`.
///
/// # Panics
///
/// Panics if `c` is shorter than `7 * stride + 1` or if `min > max`.
#[inline]
fn dct8_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let clip = |v: i32| v.clamp(min, max);

    // Even half: results land in c[0], c[2 * stride], c[4 * stride], c[6 * stride].
    dct4_1d(c, stride * 2, min, max);

    // Odd half.
    let (x1, x3, x5, x7) = (c[stride], c[3 * stride], c[5 * stride], c[7 * stride]);
    let rot4 = ((x1 * 799 - x7 * (4017 - 4096) + 2048) >> 12) - x7;
    let rot5 = (x5 * 1703 - x3 * 1138 + 1024) >> 11;
    let rot6 = (x5 * 1138 + x3 * 1703 + 1024) >> 11;
    let rot7 = ((x1 * (4017 - 4096) + x7 * 799 + 2048) >> 12) + x1;

    let t4 = clip(rot4 + rot5);
    let diff45 = clip(rot4 - rot5);
    let t7 = clip(rot7 + rot6);
    let diff76 = clip(rot7 - rot6);

    let t5 = ((diff76 - diff45) * 181 + 128) >> 8;
    let t6 = ((diff76 + diff45) * 181 + 128) >> 8;

    // Combine: output k = even[k] + odd[k], output 7 - k = even[k] - odd[k].
    let even = [c[0], c[2 * stride], c[4 * stride], c[6 * stride]];
    let odd = [t7, t6, t5, t4];
    for k in 0..4 {
        c[k * stride] = clip(even[k] + odd[k]);
    }
    for k in (0..4).rev() {
        c[(7 - k) * stride] = clip(even[k] - odd[k]);
    }
}
/// In-place 4-point `adst4` butterfly over four elements of `c` spaced `stride` apart.
///
/// Three outputs are 12-bit fixed-point weighted sums of all four inputs; the third is
/// `209 * (in0 - in2 + in3)` scaled by 1/256. Each result is clamped to `min..=max`
/// before it is written back.
///
/// # Panics
///
/// Panics if `c` has fewer than `3 * stride + 1` elements or if `min > max`.
#[inline]
fn adst4_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    // Weights above 2048 are stored minus 4096; the matching input is added (or
    // subtracted) again after the shift to make up the difference.
    const W1321: i32 = 1321;
    const W2482: i32 = 2482 - 4096;
    const W3344: i32 = 3344 - 4096;
    const W3803: i32 = 3803 - 4096;

    let x0 = c[0];
    let x1 = c[stride];
    let x2 = c[2 * stride];
    let x3 = c[3 * stride];

    let y0 = ((W1321 * x0 + W3803 * x2 + W2482 * x3 + W3344 * x1 + 2048) >> 12) + x2 + x3 + x1;
    let y1 = ((W2482 * x0 - W1321 * x2 - W3803 * x3 + W3344 * x1 + 2048) >> 12) + x0 - x3 + x1;
    let y2 = (209 * (x0 - x2 + x3) + 128) >> 8;
    let y3 = ((W3803 * x0 + W2482 * x2 - W1321 * x3 - W3344 * x1 + 2048) >> 12) + x0 + x2 - x1;

    for (k, &y) in [y0, y1, y2, y3].iter().enumerate() {
        c[k * stride] = y.clamp(min, max);
    }
}
/// In-place 4-point flipped ADST: computes the same four values as `adst4_1d` and
/// stores them in reverse order (the last result lands in `c[0]`).
///
/// Each result is clamped to `min..=max` before it is written back.
///
/// # Panics
///
/// Panics if `c` has fewer than `3 * stride + 1` elements or if `min > max`.
#[inline]
fn flipadst4_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let x0 = c[0];
    let x1 = c[stride];
    let x2 = c[2 * stride];
    let x3 = c[3 * stride];

    // 12-bit fixed-point accumulators, including the rounding bias.
    let acc0 = 1321 * x0 + (3803 - 4096) * x2 + (2482 - 4096) * x3 + (3344 - 4096) * x1 + 2048;
    let acc1 = (2482 - 4096) * x0 - 1321 * x2 - (3803 - 4096) * x3 + (3344 - 4096) * x1 + 2048;
    let acc3 = (3803 - 4096) * x0 + (2482 - 4096) * x2 - 1321 * x3 - (3344 - 4096) * x1 + 2048;

    // Results in forward (non-flipped) order.
    let forward = [
        (acc0 >> 12) + x2 + x3 + x1,
        (acc1 >> 12) + x0 - x3 + x1,
        (209 * (x0 - x2 + x3) + 128) >> 8,
        (acc3 >> 12) + x0 + x2 - x1,
    ];

    for k in 0..4 {
        c[k * stride] = forward[3 - k].clamp(min, max);
    }
}
/// In-place 8-point `adst8` butterfly over eight elements of `c` spaced `stride` apart.
///
/// Four stages: input rotations, a clamped add/subtract stage, a second rotation on the
/// lower half, and a final clamped add/subtract whose middle outputs are scaled by
/// 181/256. Outputs 1, 3, 5 and 7 are negated. The scaled outputs (2 to 5) are not
/// clamped.
///
/// # Panics
///
/// Panics if `c` has fewer than `7 * stride + 1` elements or if `min > max`.
#[inline]
fn adst8_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let sat = |v: i32| v.clamp(min, max);
    let scale = |v: i32| (v * 181 + 128) >> 8;

    let mut x = [0i32; 8];
    for (k, slot) in x.iter_mut().enumerate() {
        *slot = c[k * stride];
    }
    let [x0, x1, x2, x3, x4, x5, x6, x7] = x;

    // Stage 1: input rotations.
    let a0 = (((4076 - 4096) * x7 + 401 * x0 + 2048) >> 12) + x7;
    let a1 = ((401 * x7 - (4076 - 4096) * x0 + 2048) >> 12) - x0;
    let a2 = (((3612 - 4096) * x5 + 1931 * x2 + 2048) >> 12) + x5;
    let a3 = ((1931 * x5 - (3612 - 4096) * x2 + 2048) >> 12) - x2;
    let a4 = (1299 * x3 + 1583 * x4 + 1024) >> 11;
    let a5 = (1583 * x3 - 1299 * x4 + 1024) >> 11;
    let a6 = ((1189 * x1 + (3920 - 4096) * x6 + 2048) >> 12) + x6;
    let a7 = (((3920 - 4096) * x1 - 1189 * x6 + 2048) >> 12) + x1;

    // Stage 2: clamped sums and differences.
    let b0 = sat(a0 + a4);
    let b1 = sat(a1 + a5);
    let b2 = sat(a2 + a6);
    let b3 = sat(a3 + a7);
    let b4 = sat(a0 - a4);
    let b5 = sat(a1 - a5);
    let b6 = sat(a2 - a6);
    let b7 = sat(a3 - a7);

    // Stage 3: rotate the difference half.
    let r4 = (((3784 - 4096) * b4 + 1567 * b5 + 2048) >> 12) + b4;
    let r5 = ((1567 * b4 - (3784 - 4096) * b5 + 2048) >> 12) - b5;
    let r6 = (((3784 - 4096) * b7 - 1567 * b6 + 2048) >> 12) + b7;
    let r7 = ((1567 * b7 + (3784 - 4096) * b6 + 2048) >> 12) + b6;

    // Stage 4: outer outputs directly, inner outputs through the 181/256 scale.
    let y0 = sat(b0 + b2);
    let y7 = -sat(b1 + b3);
    let d2 = sat(b0 - b2);
    let d3 = sat(b1 - b3);
    let y1 = -sat(r4 + r6);
    let y6 = sat(r5 + r7);
    let d6 = sat(r4 - r6);
    let d7 = sat(r5 - r7);

    let y3 = -scale(d2 + d3);
    let y4 = scale(d2 - d3);
    let y2 = scale(d6 + d7);
    let y5 = -scale(d6 - d7);

    for (k, &y) in [y0, y1, y2, y3, y4, y5, y6, y7].iter().enumerate() {
        c[k * stride] = y;
    }
}
/// In-place 8-point flipped ADST: computes the same eight values as `adst8_1d` and
/// stores them in reverse order (the last result lands in `c[0]`).
///
/// # Panics
///
/// Panics if `c` has fewer than `7 * stride + 1` elements or if `min > max`.
#[inline]
fn flipadst8_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let bound = |v: i32| v.clamp(min, max);

    let (x0, x1, x2, x3) = (c[0], c[stride], c[2 * stride], c[3 * stride]);
    let (x4, x5, x6, x7) = (c[4 * stride], c[5 * stride], c[6 * stride], c[7 * stride]);

    // Input rotations.
    let s0 = (((4076 - 4096) * x7 + 401 * x0 + 2048) >> 12) + x7;
    let s1 = ((401 * x7 - (4076 - 4096) * x0 + 2048) >> 12) - x0;
    let s2 = (((3612 - 4096) * x5 + 1931 * x2 + 2048) >> 12) + x5;
    let s3 = ((1931 * x5 - (3612 - 4096) * x2 + 2048) >> 12) - x2;
    let s4 = (1299 * x3 + 1583 * x4 + 1024) >> 11;
    let s5 = (1583 * x3 - 1299 * x4 + 1024) >> 11;
    let s6 = ((1189 * x1 + (3920 - 4096) * x6 + 2048) >> 12) + x6;
    let s7 = (((3920 - 4096) * x1 - 1189 * x6 + 2048) >> 12) + x1;

    // Clamped sums (upper half) and differences (lower half).
    let sum0 = bound(s0 + s4);
    let sum1 = bound(s1 + s5);
    let sum2 = bound(s2 + s6);
    let sum3 = bound(s3 + s7);
    let dif0 = bound(s0 - s4);
    let dif1 = bound(s1 - s5);
    let dif2 = bound(s2 - s6);
    let dif3 = bound(s3 - s7);

    // Rotate the difference half.
    let rot0 = (((3784 - 4096) * dif0 + 1567 * dif1 + 2048) >> 12) + dif0;
    let rot1 = ((1567 * dif0 - (3784 - 4096) * dif1 + 2048) >> 12) - dif1;
    let rot2 = (((3784 - 4096) * dif3 - 1567 * dif2 + 2048) >> 12) + dif3;
    let rot3 = ((1567 * dif3 + (3784 - 4096) * dif2 + 2048) >> 12) + dif2;

    // Values in forward (non-flipped) order.
    let f0 = bound(sum0 + sum2);
    let f7 = -bound(sum1 + sum3);
    let mid_a = bound(sum0 - sum2);
    let mid_b = bound(sum1 - sum3);
    let f1 = -bound(rot0 + rot2);
    let f6 = bound(rot1 + rot3);
    let mid_c = bound(rot0 - rot2);
    let mid_d = bound(rot1 - rot3);
    let f3 = -(((mid_a + mid_b) * 181 + 128) >> 8);
    let f4 = ((mid_a - mid_b) * 181 + 128) >> 8;
    let f2 = ((mid_c + mid_d) * 181 + 128) >> 8;
    let f5 = -(((mid_c - mid_d) * 181 + 128) >> 8);

    let forward = [f0, f1, f2, f3, f4, f5, f6, f7];
    for k in 0..8 {
        c[k * stride] = forward[7 - k];
    }
}
/// 8x8 DCT/DCT inverse transform for 8-bit pixels, added into `dst`.
///
/// Steps, as written below:
/// 1. The first 64 coefficients go through `dct8_row_pass_i16_simd`; each result is
///    rounded with `(v + 1) >> 1` and clamped to the `i16` range.
/// 2. `dct8_col_pass_i16` processes the 64 intermediates.
/// 3. For each of the 8 rows, `(v + 8) >> 4` is added to the 8 destination pixels
///    starting at `y * dst_stride`, and the sum is clamped to `0..=bitdepth_max`.
/// 4. The first 64 coefficients are zeroed.
///
/// `_eob` is not used.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_8x8_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
// The row-pass clip bounds are computed but not referenced anywhere below.
let _row_clip_min = i16::MIN as i32;
let _row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 64];
{
// Panics if fewer than 64 coefficients are supplied.
let coeff_arr: &[i16; 64] = coeff.as_slice()[..64].try_into().unwrap();
let raw = dct8_row_pass_i16_simd(_token, *coeff_arr);
// Intermediate rounding: add 1, arithmetic shift right by 1.
let rnd_v = _mm256_set1_epi32(1);
let col_min_v = _mm256_set1_epi32(col_clip_min);
let col_max_v = _mm256_set1_epi32(col_clip_max);
for y in 0..8 {
let v = loadu_256!(&raw[y * 8..y * 8 + 8], [i32; 8]);
let shifted = _mm256_srai_epi32::<1>(_mm256_add_epi32(v, rnd_v));
let clipped = _mm256_max_epi32(_mm256_min_epi32(shifted, col_max_v), col_min_v);
storeu_256!(&mut tmp[y * 8..y * 8 + 8], [i32; 8], clipped);
}
}
let col_out = dct8_col_pass_i16(_token, &tmp);
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
// Final rounding: add 8, arithmetic shift right by 4.
let rnd_final = _mm256_set1_epi32(8);
for y in 0..8 {
let dst_off = y * dst_stride;
// Load 8 pixels and zero-extend them to 16-bit lanes.
let d = loadi64!(&dst[dst_off..dst_off + 8]);
let d16 = _mm_unpacklo_epi8(d, zero);
let c_scaled = _mm256_srai_epi32(_mm256_add_epi32(col_out[y], rnd_final), 4);
// Narrow the eight 32-bit residuals to 16 bits with signed saturation.
let c_lo_scaled = _mm256_castsi256_si128(c_scaled);
let c_hi_scaled = _mm256_extracti128_si256(c_scaled, 1);
let c16 = _mm_packs_epi32(c_lo_scaled, c_hi_scaled);
let sum = _mm_add_epi16(d16, c16);
let clamped = _mm_max_epi16(_mm_min_epi16(sum, max_val), zero);
// Pack back to bytes; only the low 8 bytes are stored.
let packed = _mm_packus_epi16(clamped, clamped);
storei64!(&mut dst[dst_off..dst_off + 8], packed);
}
coeff[..64].fill(0);
}
/// C-ABI shim that rebuilds slices from raw pointers and forwards to
/// `inv_txfm_add_dct_dct_8x8_8bpc_avx2_inner`.
///
/// `_dst` is not used. `dst_stride` is converted with `as usize`, so a negative stride is
/// not handled here.
///
/// # Safety
///
/// * The CPU must support AVX2: the function is compiled with that target feature and
///   obtains its `Desktop64` token through `forge_token_dangerously`.
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * Neither region may be accessed through another pointer while the call runs.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };

    let row_bytes = dst_stride as usize;
    let n_coeffs = _coeff_len as usize;
    let n_pixels = n_coeffs * row_bytes + row_bytes;

    // SAFETY: the caller guarantees `dst_ptr` covers `n_pixels` bytes (see `# Safety`).
    let picture = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, n_pixels) };
    // SAFETY: the caller guarantees `coeff` covers `n_coeffs` `i16` values.
    let residual = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, n_coeffs) };

    inv_txfm_add_dct_dct_8x8_8bpc_avx2_inner(
        _token,
        picture,
        row_bytes,
        residual,
        eob,
        bitdepth_max,
    );
}
/// 8x8 DCT/DCT inverse transform for 16-bit pixel storage, added into `dst`.
///
/// Steps, as written below:
/// 1. Row pass: for each `y`, the eight coefficients `coeff[y + x * 8]` are gathered and
///    transformed by the scalar `dct8_1d`, then rounded with `(v + 1) >> 1` and clipped
///    to the column clip range.
/// 2. Column pass: the 64 intermediates are loaded as eight 256-bit vectors and passed
///    to `dct8_1d_cols8`.
/// 3. For each of the 8 rows, `(v + 8) >> 4` is added to the 8 destination pixels and
///    the sum is clamped to `0..=bitdepth_max`.
/// 4. The first 64 coefficients are zeroed.
///
/// `dst_stride` is halved before it is used to index `dst`. `_eob` is not used.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_8x8_16bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
dst_stride: usize, coeff: &mut [i32],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
// NOTE(review): halving suggests `dst_stride` is a byte count and `dst` is indexed in
// `u16` elements — confirm against callers.
let stride_u16 = dst_stride / 2;
// Clip ranges derived from `bitdepth_max`; each maximum is the bitwise NOT of its minimum.
let row_clip_min = (!bitdepth_max) << 7;
let row_clip_max = !row_clip_min;
let col_clip_min = (!bitdepth_max) << 5;
let col_clip_max = !col_clip_min;
let mut tmp = [0i32; 64];
// Intermediate rounding: add 1, shift right by 1.
let rnd = 1;
let shift = 1;
for y in 0..8 {
let mut scratch = [0i32; 8];
// Gather with a stride of 8 through the coefficient buffer.
for x in 0..8 {
scratch[x] = coeff[y + x * 8] as i32;
}
dct8_1d(&mut scratch[..8], 1, row_clip_min, row_clip_max);
for x in 0..8 {
tmp[y * 8 + x] = iclip((scratch[x] + rnd) >> shift, col_clip_min, col_clip_max);
}
}
{
// Column pass on eight 256-bit vectors, written back to `tmp` afterwards.
let min_v = _mm256_set1_epi32(col_clip_min);
let max_v = _mm256_set1_epi32(col_clip_max);
let mut v = [_mm256_setzero_si256(); 8];
for i in 0..8 {
v[i] = loadu_256!(&tmp[i * 8..i * 8 + 8], [i32; 8]);
}
dct8_1d_cols8(_token, &mut v, min_v, max_v);
for i in 0..8 {
storeu_256!(&mut tmp[i * 8..i * 8 + 8], [i32; 8], v[i]);
}
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi32(bitdepth_max);
// Final rounding: add 8, arithmetic shift right by 4.
let rnd_final = _mm_set1_epi32(8);
for y in 0..8 {
let dst_off = y * stride_u16;
let d = loadu_128!(<&[u16; 8]>::try_from(&dst[dst_off..dst_off + 8]).unwrap());
// Zero-extend the eight pixels to 32-bit lanes (low four, high four).
let d_lo = _mm_unpacklo_epi16(d, zero); let d_hi = _mm_unpackhi_epi16(d, zero);
let c_256 = loadu_256!(&tmp[y * 8..y * 8 + 8], [i32; 8]);
let c_lo = _mm256_castsi256_si128(c_256);
let c_hi = _mm256_extracti128_si256(c_256, 1);
let c_lo_scaled = _mm_srai_epi32(_mm_add_epi32(c_lo, rnd_final), 4);
let c_hi_scaled = _mm_srai_epi32(_mm_add_epi32(c_hi, rnd_final), 4);
let sum_lo = _mm_add_epi32(d_lo, c_lo_scaled);
let sum_hi = _mm_add_epi32(d_hi, c_hi_scaled);
let clamped_lo = _mm_max_epi32(_mm_min_epi32(sum_lo, max_val), zero);
let clamped_hi = _mm_max_epi32(_mm_min_epi32(sum_hi, max_val), zero);
// Narrow back to eight 16-bit pixels with unsigned saturation.
let packed = _mm_packus_epi32(clamped_lo, clamped_hi);
storeu_128!(
<&mut [u16; 8]>::try_from(&mut dst[dst_off..dst_off + 8]).unwrap(),
packed
);
}
coeff[..64].fill(0);
}
/// C-ABI shim that rebuilds slices from raw pointers and forwards to
/// `inv_txfm_add_dct_dct_8x8_16bpc_avx2_inner`.
///
/// `_dst` is not used. `dst_stride` is converted with `as usize`, so a negative stride is
/// not handled here.
///
/// # Safety
///
/// * The CPU must support AVX2: the function is compiled with that target feature and
///   obtains its `Desktop64` token through `forge_token_dangerously`.
/// * `dst_ptr` must be valid for reads and writes of `_coeff_len * dst_stride` `u16`
///   values.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i32` values.
/// * Neither region may be accessed through another pointer while the call runs.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x8_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };
    let stride = dst_stride as usize;
    // NOTE(review): the length is counted in `u16` elements using `stride` as-is, while
    // the inner routine halves `dst_stride` before indexing — confirm the intended units.
    // SAFETY: the caller guarantees `dst_ptr` covers this many `u16` values.
    let dst_slice = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u16, _coeff_len as usize * stride)
    };
    // The inner routine takes `&mut [i32]`, so the buffer must be viewed as `i32`;
    // an `i16` view does not type-check and would read the coefficients at half width.
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i32` values.
    let coeff_slice =
        unsafe { std::slice::from_raw_parts_mut(coeff as *mut i32, _coeff_len as usize) };
    inv_txfm_add_dct_dct_8x8_16bpc_avx2_inner(
        _token,
        dst_slice,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// 16x16 identity/identity inverse transform for 8-bit pixels, added into `dst`.
///
/// Every coefficient is processed independently: it is scaled by
/// `2 * v + ((v * 1697 + 1024) >> 11)`, rounded with `(v + 2) >> 2` and clamped to the
/// `i16` range, scaled the same way a second time, and finally rounded with
/// `(v + 8) >> 4`. Coefficients are read at `coeff[y + x * 16]` for output row `y`,
/// column `x`. Each row of 16 residuals is then added to the 16 pixels starting at
/// `y * dst_stride`, the sums are clamped to `0..=bitdepth_max`, and the first 256
/// coefficients are zeroed.
///
/// `_eob` is not used.
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn inv_identity_add_16x16_8bpc_avx2(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();

    // Both scaling passes and both roundings act on one element at a time, so they are
    // applied back to back here.
    let mut residual = [[0i32; 16]; 16];
    for y in 0..16 {
        for x in 0..16 {
            let input = coeff[y + x * 16] as i32;
            let first = 2 * input + ((input * 1697 + 1024) >> 11);
            let mid = ((first + 2) >> 2).clamp(i16::MIN as i32, i16::MAX as i32);
            let second = 2 * mid + ((mid * 1697 + 1024) >> 11);
            residual[y][x] = (second + 8) >> 4;
        }
    }

    let floor = _mm256_setzero_si256();
    let ceiling = _mm256_set1_epi16(bitdepth_max as i16);
    for (y, row) in residual.iter().enumerate() {
        let row_start = y * dst_stride;
        let pixels = loadu_128!(<&[u8; 16]>::try_from(&dst[row_start..row_start + 16]).unwrap());
        let widened = _mm256_cvtepu8_epi16(pixels);
        // `_mm256_set_epi16` takes its arguments from the highest lane down to lane 0.
        let delta = _mm256_set_epi16(
            row[15] as i16,
            row[14] as i16,
            row[13] as i16,
            row[12] as i16,
            row[11] as i16,
            row[10] as i16,
            row[9] as i16,
            row[8] as i16,
            row[7] as i16,
            row[6] as i16,
            row[5] as i16,
            row[4] as i16,
            row[3] as i16,
            row[2] as i16,
            row[1] as i16,
            row[0] as i16,
        );
        let summed = _mm256_add_epi16(widened, delta);
        let bounded = _mm256_max_epi16(_mm256_min_epi16(summed, ceiling), floor);
        // The pack works per 128-bit lane, so the two halves are stitched together by hand.
        let narrowed = _mm256_packus_epi16(bounded, bounded);
        let low_half = _mm256_castsi256_si128(narrowed);
        let high_half = _mm256_extracti128_si256(narrowed, 1);
        let merged = _mm_unpacklo_epi64(low_half, high_half);
        storeu_128!(
            <&mut [u8; 16]>::try_from(&mut dst[row_start..row_start + 16]).unwrap(),
            merged
        );
    }
    coeff[..256].fill(0);
}
/// C-ABI shim that rebuilds slices from raw pointers and forwards to
/// `inv_identity_add_16x16_8bpc_avx2`.
///
/// `_dst` is not used. `dst_stride` is converted with `as usize`, so a negative stride is
/// not handled here.
///
/// # Safety
///
/// * The CPU must support AVX2: the function is compiled with that target feature and
///   obtains its `Desktop64` token through `forge_token_dangerously`.
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * Neither region may be accessed through another pointer while the call runs.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x16_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let _token = unsafe { Desktop64::forge_token_dangerously() };

    let line_len = dst_stride as usize;
    let coeff_total = _coeff_len as usize;
    let byte_total = coeff_total * line_len + line_len;

    // SAFETY: the caller guarantees `dst_ptr` covers `byte_total` bytes (see `# Safety`).
    let frame = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_total) };
    // SAFETY: the caller guarantees `coeff` covers `coeff_total` `i16` values.
    let block = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_total) };

    inv_identity_add_16x16_8bpc_avx2(_token, frame, line_len, block, eob, bitdepth_max);
}
/// In-place 16-point inverse DCT over sixteen elements of `c` spaced `stride` apart.
///
/// The even-indexed inputs are first transformed by `dct8_1d` (called with twice the
/// stride). The odd-indexed inputs pass through a rotation stage, two clamped
/// add/subtract stages with a second rotation in between, and a 181/256 scaling of the
/// four middle terms. The halves are merged with a final clamped add/subtract.
///
/// # Panics
///
/// Panics if `c` has fewer than `15 * stride + 1` elements or if `min > max`.
#[inline]
fn dct16_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let sat = |v: i32| v.clamp(min, max);

    // Even half, computed in place at positions 0, 2, ..., 14.
    dct8_1d(c, stride * 2, min, max);

    let (x1, x3, x5, x7) = (c[stride], c[3 * stride], c[5 * stride], c[7 * stride]);
    let (x9, x11, x13, x15) = (c[9 * stride], c[11 * stride], c[13 * stride], c[15 * stride]);

    // Stage A: input rotations.
    let a8 = ((x1 * 401 - x15 * (4076 - 4096) + 2048) >> 12) - x15;
    let a9 = (x9 * 1583 - x7 * 1299 + 1024) >> 11;
    let a10 = ((x5 * 1931 - x11 * (3612 - 4096) + 2048) >> 12) - x11;
    let a11 = ((x13 * (3920 - 4096) - x3 * 1189 + 2048) >> 12) + x13;
    let a12 = ((x13 * 1189 + x3 * (3920 - 4096) + 2048) >> 12) + x3;
    let a13 = ((x5 * (3612 - 4096) + x11 * 1931 + 2048) >> 12) + x5;
    let a14 = (x9 * 1299 + x7 * 1583 + 1024) >> 11;
    let a15 = ((x1 * (4076 - 4096) + x15 * 401 + 2048) >> 12) + x1;

    // Stage B: clamped sums and differences of neighbouring pairs.
    let b8 = sat(a8 + a9);
    let b9 = sat(a8 - a9);
    let b10 = sat(a11 - a10);
    let b11 = sat(a11 + a10);
    let b12 = sat(a12 + a13);
    let b13 = sat(a12 - a13);
    let b14 = sat(a15 - a14);
    let b15 = sat(a15 + a14);

    // Stage C: rotate the inner pairs.
    let r9 = ((b14 * 1567 - b9 * (3784 - 4096) + 2048) >> 12) - b9;
    let r14 = ((b14 * (3784 - 4096) + b9 * 1567 + 2048) >> 12) + b14;
    let r10 = ((-(b13 * (3784 - 4096) + b10 * 1567) + 2048) >> 12) - b13;
    let r13 = ((b13 * 1567 - b10 * (3784 - 4096) + 2048) >> 12) - b10;

    // Stage D: second clamped add/subtract.
    let d8 = sat(b8 + b11);
    let d9 = sat(r9 + r10);
    let d10 = sat(r9 - r10);
    let d11 = sat(b8 - b11);
    let d12 = sat(b15 - b12);
    let d13 = sat(r14 - r13);
    let d14 = sat(r14 + r13);
    let d15 = sat(b15 + b12);

    // Stage E: the four middle terms are scaled by 181/256 with rounding.
    let e10 = ((d13 - d10) * 181 + 128) >> 8;
    let e13 = ((d13 + d10) * 181 + 128) >> 8;
    let e11 = ((d12 - d11) * 181 + 128) >> 8;
    let e12 = ((d12 + d11) * 181 + 128) >> 8;

    // Snapshot the even results before the output positions are overwritten.
    let mut even = [0i32; 8];
    for (k, slot) in even.iter_mut().enumerate() {
        *slot = c[2 * k * stride];
    }
    // `odd[k]` is the term paired with `even[k]`.
    let odd = [d15, d14, e13, e12, e11, e10, d9, d8];

    for k in 0..8 {
        c[k * stride] = sat(even[k] + odd[k]);
    }
    for k in 8..16 {
        c[k * stride] = sat(even[15 - k] - odd[15 - k]);
    }
}
/// In-place 16-point `adst16` butterfly over sixteen elements of `c` spaced `stride`
/// apart.
///
/// The network alternates rotation stages (12-bit and 11-bit fixed-point weights) with
/// clamped add/subtract stages; the eight middle outputs are scaled by 181/256 and are
/// not clamped. Outputs 1, 3, 5, 7, 9, 11, 13 and 15 are negated.
///
/// # Panics
///
/// Panics if `c` has fewer than `15 * stride + 1` elements or if `min > max`.
#[inline]
fn adst16_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    let sat = |v: i32| v.clamp(min, max);
    let scale = |v: i32| (v * 181 + 128) >> 8;

    let mut x = [0i32; 16];
    for (k, slot) in x.iter_mut().enumerate() {
        *slot = c[k * stride];
    }

    // Stage 1: input rotations.
    let p0 = ((x[15] * (4091 - 4096) + x[0] * 201 + 2048) >> 12) + x[15];
    let p1 = ((x[15] * 201 - x[0] * (4091 - 4096) + 2048) >> 12) - x[0];
    let p2 = ((x[13] * (3973 - 4096) + x[2] * 995 + 2048) >> 12) + x[13];
    let p3 = ((x[13] * 995 - x[2] * (3973 - 4096) + 2048) >> 12) - x[2];
    let p4 = ((x[11] * (3703 - 4096) + x[4] * 1751 + 2048) >> 12) + x[11];
    let p5 = ((x[11] * 1751 - x[4] * (3703 - 4096) + 2048) >> 12) - x[4];
    let p6 = (x[9] * 1645 + x[6] * 1220 + 1024) >> 11;
    let p7 = (x[9] * 1220 - x[6] * 1645 + 1024) >> 11;
    let p8 = ((x[7] * 2751 + x[8] * (3035 - 4096) + 2048) >> 12) + x[8];
    let p9 = ((x[7] * (3035 - 4096) - x[8] * 2751 + 2048) >> 12) + x[7];
    let p10 = ((x[5] * 2106 + x[10] * (3513 - 4096) + 2048) >> 12) + x[10];
    let p11 = ((x[5] * (3513 - 4096) - x[10] * 2106 + 2048) >> 12) + x[5];
    let p12 = ((x[3] * 1380 + x[12] * (3857 - 4096) + 2048) >> 12) + x[12];
    let p13 = ((x[3] * (3857 - 4096) - x[12] * 1380 + 2048) >> 12) + x[3];
    let p14 = ((x[1] * 601 + x[14] * (4052 - 4096) + 2048) >> 12) + x[14];
    let p15 = ((x[1] * (4052 - 4096) - x[14] * 601 + 2048) >> 12) + x[1];

    // Stage 2: clamped sums (q0..q7) and differences (q8..q15) at distance 8.
    let q0 = sat(p0 + p8);
    let q1 = sat(p1 + p9);
    let q2 = sat(p2 + p10);
    let q3 = sat(p3 + p11);
    let q4 = sat(p4 + p12);
    let q5 = sat(p5 + p13);
    let q6 = sat(p6 + p14);
    let q7 = sat(p7 + p15);
    let q8 = sat(p0 - p8);
    let q9 = sat(p1 - p9);
    let q10 = sat(p2 - p10);
    let q11 = sat(p3 - p11);
    let q12 = sat(p4 - p12);
    let q13 = sat(p5 - p13);
    let q14 = sat(p6 - p14);
    let q15 = sat(p7 - p15);

    // Stage 3: rotate the difference half.
    let u8_ = ((q8 * (4017 - 4096) + q9 * 799 + 2048) >> 12) + q8;
    let u9 = ((q8 * 799 - q9 * (4017 - 4096) + 2048) >> 12) - q9;
    let u10 = ((q10 * 2276 + q11 * (3406 - 4096) + 2048) >> 12) + q11;
    let u11 = ((q10 * (3406 - 4096) - q11 * 2276 + 2048) >> 12) + q10;
    let u12 = ((q13 * (4017 - 4096) - q12 * 799 + 2048) >> 12) + q13;
    let u13 = ((q13 * 799 + q12 * (4017 - 4096) + 2048) >> 12) + q12;
    let u14 = ((q15 * 2276 - q14 * (3406 - 4096) + 2048) >> 12) - q14;
    let u15 = ((q15 * (3406 - 4096) + q14 * 2276 + 2048) >> 12) + q15;

    // Stage 4: clamped sums and differences at distance 4 within each half.
    let v0 = sat(q0 + q4);
    let v1 = sat(q1 + q5);
    let v2 = sat(q2 + q6);
    let v3 = sat(q3 + q7);
    let v4 = sat(q0 - q4);
    let v5 = sat(q1 - q5);
    let v6 = sat(q2 - q6);
    let v7 = sat(q3 - q7);
    let v8 = sat(u8_ + u12);
    let v9 = sat(u9 + u13);
    let v10 = sat(u10 + u14);
    let v11 = sat(u11 + u15);
    let v12 = sat(u8_ - u12);
    let v13 = sat(u9 - u13);
    let v14 = sat(u10 - u14);
    let v15 = sat(u11 - u15);

    // Stage 5: rotate the difference quarters.
    let w4 = ((v4 * (3784 - 4096) + v5 * 1567 + 2048) >> 12) + v4;
    let w5 = ((v4 * 1567 - v5 * (3784 - 4096) + 2048) >> 12) - v5;
    let w6 = ((v7 * (3784 - 4096) - v6 * 1567 + 2048) >> 12) + v7;
    let w7 = ((v7 * 1567 + v6 * (3784 - 4096) + 2048) >> 12) + v6;
    let w12 = ((v12 * (3784 - 4096) + v13 * 1567 + 2048) >> 12) + v12;
    let w13 = ((v12 * 1567 - v13 * (3784 - 4096) + 2048) >> 12) - v13;
    let w14 = ((v15 * (3784 - 4096) - v14 * 1567 + 2048) >> 12) + v15;
    let w15 = ((v15 * 1567 + v14 * (3784 - 4096) + 2048) >> 12) + v14;

    // Stage 6: final sums are outputs; final differences feed the 181/256 scaling.
    let y0 = sat(v0 + v2);
    let y15 = -sat(v1 + v3);
    let z2 = sat(v0 - v2);
    let z3 = sat(v1 - v3);
    let y3 = -sat(w4 + w6);
    let y12 = sat(w5 + w7);
    let z6 = sat(w4 - w6);
    let z7 = sat(w5 - w7);
    let y1 = -sat(v8 + v10);
    let y14 = sat(v9 + v11);
    let z10 = sat(v8 - v10);
    let z11 = sat(v9 - v11);
    let y2 = sat(w12 + w14);
    let y13 = -sat(w13 + w15);
    let z14 = sat(w12 - w14);
    let z15 = sat(w13 - w15);

    let y7 = -scale(z2 + z3);
    let y8 = scale(z2 - z3);
    let y4 = scale(z6 + z7);
    let y11 = -scale(z6 - z7);
    let y6 = scale(z10 + z11);
    let y9 = -scale(z10 - z11);
    let y5 = -scale(z14 + z15);
    let y10 = scale(z14 - z15);

    // (output position, value) pairs, stored in this order.
    let stores = [
        (0usize, y0),
        (15, y15),
        (3, y3),
        (12, y12),
        (1, y1),
        (14, y14),
        (2, y2),
        (13, y13),
        (7, y7),
        (8, y8),
        (4, y4),
        (11, y11),
        (6, y6),
        (9, y9),
        (5, y5),
        (10, y10),
    ];
    for &(pos, value) in stores.iter() {
        c[pos * stride] = value;
    }
}
/// In-place 16-point flipped ADST: runs `adst16_1d` and then reverses the order of the
/// sixteen strided results.
#[inline]
fn flipadst16_1d(c: &mut [i32], stride: usize, min: i32, max: i32) {
    adst16_1d(c, stride, min, max);

    // Swap mirrored positions: (0, 15), (1, 14), ..., (7, 8).
    let (mut front, mut back) = (0usize, 15usize);
    while front < back {
        c.swap(front * stride, back * stride);
        front += 1;
        back -= 1;
    }
}
/// In-place 4-point identity transform: each of the four strided elements `v` becomes
/// `v + ((v * 1697 + 2048) >> 12)`.
///
/// `_min` and `_max` are accepted for signature parity with the other 1-D transforms
/// and are not used.
///
/// # Panics
///
/// Panics if `c` has fewer than `3 * stride + 1` elements.
#[inline]
fn identity4_1d(c: &mut [i32], stride: usize, _min: i32, _max: i32) {
    for offset in (0..4).map(|k| k * stride) {
        let value = c[offset];
        let scaled = (value * 1697 + 2048) >> 12;
        c[offset] = value + scaled;
    }
}
/// In-place 8-point identity transform: doubles each of the eight strided elements.
///
/// `_min` and `_max` are accepted for signature parity with the other 1-D transforms
/// and are not used.
///
/// # Panics
///
/// Panics if `c` has fewer than `7 * stride + 1` elements.
#[inline]
fn identity8_1d(c: &mut [i32], stride: usize, _min: i32, _max: i32) {
    let mut k = 0;
    while k < 8 {
        let slot = &mut c[k * stride];
        *slot = *slot * 2;
        k += 1;
    }
}
/// In-place 16-point identity transform: each of the sixteen strided elements `v`
/// becomes `2 * v + ((v * 1697 + 1024) >> 11)`.
///
/// `_min` and `_max` are accepted for signature parity with the other 1-D transforms
/// and are not used.
///
/// # Panics
///
/// Panics if `c` has fewer than `15 * stride + 1` elements.
#[inline]
fn identity16_1d(c: &mut [i32], stride: usize, _min: i32, _max: i32) {
    let scale = |v: i32| 2 * v + ((v * 1697 + 1024) >> 11);
    (0..16).for_each(|k| {
        let at = k * stride;
        c[at] = scale(c[at]);
    });
}