/// Inverse 16x32 transform-and-add for 8 bpc (AVX2 implementation).
///
/// The work happens in three stages:
/// 1. `row_dct16_8bpc_block` is called with `coeff` and a `16 * 32` scratch buffer of `i32`.
/// 2. A 32-point column pass runs in place over the scratch buffer: through
///    `dct32_cols_avx512` when `summon_avx512()` returns a token, otherwise through
///    `dct32_1d_cols8_i16` on two chunks of eight columns.
/// 3. Each of the 32 rows is rounded with `(v + 8) >> 4`, added to the 16 pixels at
///    `dst[y * dst_stride..]`, clamped to `0..=bitdepth_max` and written back.
///
/// Finally the first 512 entries of `coeff` are reset to zero. `_eob` is not used.
///
/// `dst` is indexed up to `31 * dst_stride + 16` and `coeff` up to 512, so both slices are
/// expected to be at least that long.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x32_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let row_clip_min = i16::MIN as i32;
    let row_clip_max = i16::MAX as i32;
    let col_clip_min = i16::MIN as i32;
    let col_clip_max = i16::MAX as i32;

    // Scratch buffer: 32 rows of 16 values, row-major (row stride 16).
    let mut tmp = [0i32; 16 * 32];
    {
        let coeff_slice = coeff.as_slice();
        row_dct16_8bpc_block(
            _token,
            coeff_slice,
            32,
            32,
            true,
            1,
            1,
            &mut tmp,
            row_clip_min,
            row_clip_max,
            col_clip_min,
            col_clip_max,
        );
    }

    // Column pass over the scratch buffer, in place.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        dct32_cols_avx512(t512, &mut tmp, 16, 32, col_clip_min, col_clip_max);
    } else {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        for cx_chunk in 0..2 {
            let cx = cx_chunk * 8;
            let mut v = [_mm256_setzero_si256(); 32];
            for i in 0..32 {
                v[i] = loadu_256!(&tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8]);
            }
            dct32_1d_cols8_i16(_token, &mut v, min_v, max_v);
            for i in 0..32 {
                storeu_256!(&mut tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8], v[i]);
            }
        }
    }

    // Add the rounded residual to the destination pixels, one 16-pixel row at a time.
    let zero = _mm256_setzero_si256();
    let max_val = _mm256_set1_epi16(bitdepth_max as i16);
    let rnd_final = _mm256_set1_epi32(8);
    for y in 0..32 {
        let dst_off = y * dst_stride;
        let row = y * 16;
        let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
        let d16 = _mm256_cvtepu8_epi16(d);
        // Two contiguous vector loads replace the element-by-element
        // `_mm256_set_epi32` gather; the lane order is the same.
        let c0 = loadu_256!(&tmp[row..row + 8], [i32; 8]);
        let c1 = loadu_256!(&tmp[row + 8..row + 16], [i32; 8]);
        let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
        let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
        // `packs` works per 128-bit half, so the 64-bit quarters come out as
        // c0[0..4], c1[0..4], c0[4..8], c1[4..8]; the permute restores linear order.
        let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
        let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
        let sum = _mm256_add_epi16(d16, c16);
        let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
        // Same per-half behaviour for `packus`: gather the two useful quarters
        // into the low 128 bits before storing.
        let packed = _mm256_packus_epi16(clamped, clamped);
        let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
        storeu_128!(
            <&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
            _mm256_castsi256_si128(packed)
        );
    }

    coeff[..512].fill(0);
}
/// C ABI entry point for the 16x32 DCT/DCT inverse transform-and-add (8 bpc, AVX2).
///
/// Builds slices from the raw pointers and forwards to
/// `inv_txfm_add_dct_dct_16x32_8bpc_avx2_inner`. `_dst` is not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `31 * dst_stride + 16` bytes
///   (32 rows of 16 pixels, `dst_stride` bytes apart) and not aliased for the
///   duration of the call.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values; the
///   inner routine zeroes the first 512 entries, so `_coeff_len` is expected to be
///   at least 512.
/// * The `Desktop64` token is forged rather than detected, so the caller must
///   guarantee that the CPU features it stands for are actually available.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x32_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features (see `# Safety`).
    let _token = unsafe { Desktop64::forge_token_dangerously() };
    let stride = dst_stride as usize;
    // The inner routine touches 32 rows of 16 pixels, so the last byte it accesses is
    // at `31 * stride + 15`. The slice must not claim more memory than that: a
    // length derived from the coefficient count could extend past the allocation.
    let dst_len = 31 * stride + 16;
    // SAFETY: the caller guarantees `dst_ptr` is valid for `dst_len` bytes.
    let dst_slice = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `coeff` is valid for `_coeff_len` `i16` values.
    let coeff_slice =
        unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
    inv_txfm_add_dct_dct_16x32_8bpc_avx2_inner(
        _token,
        dst_slice,
        stride,
        coeff_slice,
        eob,
        bitdepth_max,
    );
}
/// Row pass of an 8-point ADST over eight rows.
///
/// * Reads eight groups of 8 `i16` values from `coeff`, group `x` starting at
///   `y_base + x * coeff_h`, and widens them to `i32`.
/// * When `apply_rect2` is set, each value is scaled as `(v * 181 + 128) >> 8`.
/// * Runs `adst8_1d_cols8` with the `row_min`/`row_max` bounds, then reverses the
///   eight vectors when `flipped` is set.
/// * Adds `rnd` and shifts right arithmetically by `shift` (0 and 1 are honoured,
///   every other value shifts by 2), then clamps to `col_min..=col_max`.
/// * Transposes the 8x8 block and stores row `r` at `tmp[(y_base + r) * 8..][..8]`.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
#[allow(dead_code)]
fn simd_row_adst8_8bpc_8rows(
    token: Desktop64,
    coeff: &[i16],
    coeff_h: usize,
    y_base: usize,
    apply_rect2: bool,
    flipped: bool,
    rnd: i32,
    shift: i32,
    tmp: &mut [i32],
    row_min: i32,
    row_max: i32,
    col_min: i32,
    col_max: i32,
) {
    let rect2_mul = _mm256_set1_epi32(181);
    let rect2_bias = _mm256_set1_epi32(128);

    // Load and widen the eight input vectors.
    let mut lanes = [_mm256_setzero_si256(); 8];
    for (x, lane) in lanes.iter_mut().enumerate() {
        let start = y_base + x * coeff_h;
        let src: &[i16; 8] = (&coeff[start..start + 8]).try_into().unwrap();
        let wide = _mm256_cvtepi16_epi32(loadu_128!(src));
        *lane = if apply_rect2 {
            let scaled = _mm256_mullo_epi32(wide, rect2_mul);
            _mm256_srai_epi32::<8>(_mm256_add_epi32(scaled, rect2_bias))
        } else {
            wide
        };
    }

    let row_lo = _mm256_set1_epi32(row_min);
    let row_hi = _mm256_set1_epi32(row_max);
    adst8_1d_cols8(token, &mut lanes, row_lo, row_hi);
    if flipped {
        lanes.reverse();
    }

    // Round, shift and clamp to the column range.
    let round = _mm256_set1_epi32(rnd);
    let col_lo = _mm256_set1_epi32(col_min);
    let col_hi = _mm256_set1_epi32(col_max);
    for lane in lanes.iter_mut() {
        let biased = _mm256_add_epi32(*lane, round);
        let shifted = match shift {
            0 => biased,
            1 => _mm256_srai_epi32::<1>(biased),
            // 2 and every other shift value share the same path.
            _ => _mm256_srai_epi32::<2>(biased),
        };
        *lane = _mm256_max_epi32(_mm256_min_epi32(shifted, col_hi), col_lo);
    }

    // Transpose so that each vector is one output row, then store row by row.
    let rows = transpose_8x8_i32!(lanes);
    for r in 0..8 {
        let start = (y_base + r) * 8;
        storeu_256!(&mut tmp[start..start + 8], [i32; 8], rows[r]);
    }
}
/// Row pass of an 8-point DCT over eight rows.
///
/// Input vector `x` is the 8 `i16` values at `coeff[y_base + x * coeff_h..][..8]`,
/// widened to `i32` and, when `apply_rect2` is set, scaled as `(v * 181 + 128) >> 8`.
/// After `dct8_1d_cols8` (bounded by `row_min`/`row_max`) every value gets `rnd`
/// added, is shifted right by `shift` (0 or 1; any other value shifts by 2) and is
/// clamped to `col_min..=col_max`. The block is then transposed and row `r` is
/// written to `tmp[(y_base + r) * 8..][..8]`.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn simd_row_dct8_8bpc_8rows(
    token: Desktop64,
    coeff: &[i16],
    coeff_h: usize,
    y_base: usize,
    apply_rect2: bool,
    rnd: i32,
    shift: i32,
    tmp: &mut [i32],
    row_min: i32,
    row_max: i32,
    col_min: i32,
    col_max: i32,
) {
    let k181 = _mm256_set1_epi32(181);
    let k128 = _mm256_set1_epi32(128);

    // Gather the eight input vectors, widened to 32 bits.
    let mut vecs = [_mm256_setzero_si256(); 8];
    for (x, slot) in vecs.iter_mut().enumerate() {
        let begin = y_base + x * coeff_h;
        let narrow: &[i16; 8] = (&coeff[begin..begin + 8]).try_into().unwrap();
        let widened = _mm256_cvtepi16_epi32(loadu_128!(narrow));
        *slot = if apply_rect2 {
            _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(widened, k181), k128))
        } else {
            widened
        };
    }

    dct8_1d_cols8(
        token,
        &mut vecs,
        _mm256_set1_epi32(row_min),
        _mm256_set1_epi32(row_max),
    );

    // Rounding shift followed by the column clamp.
    let rounding = _mm256_set1_epi32(rnd);
    let floor = _mm256_set1_epi32(col_min);
    let ceil = _mm256_set1_epi32(col_max);
    for slot in vecs.iter_mut() {
        let sum = _mm256_add_epi32(*slot, rounding);
        let scaled = match shift {
            0 => sum,
            1 => _mm256_srai_epi32::<1>(sum),
            // Shift values other than 0 and 1 are all treated as 2.
            _ => _mm256_srai_epi32::<2>(sum),
        };
        *slot = _mm256_max_epi32(_mm256_min_epi32(scaled, ceil), floor);
    }

    // One transposed vector per output row.
    let rows = transpose_8x8_i32!(vecs);
    for r in 0..8 {
        let begin = (y_base + r) * 8;
        storeu_256!(&mut tmp[begin..begin + 8], [i32; 8], rows[r]);
    }
}
/// Interleaves the 16-bit lanes of `a` and `b` into a single 256-bit vector.
///
/// The low 128 bits hold `a0 b0 a1 b1 a2 b2 a3 b3` and the high 128 bits
/// `a4 b4 a5 b5 a6 b6 a7 b7`, so a `_mm256_madd_epi16` against a broadcast
/// coefficient pair produces `a[i] * c_lo + b[i] * c_hi` in 32-bit lane `i`.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn dct8_row_build_pair(_token: Desktop64, a: __m128i, b: __m128i) -> __m256i {
    _mm256_set_m128i(_mm_unpackhi_epi16(a, b), _mm_unpacklo_epi16(a, b))
}
/// Broadcasts a pair of 16-bit coefficients to every 32-bit lane.
///
/// `c_lo` occupies the low 16 bits of each lane and `c_hi` the high 16 bits, which is
/// the layout `_mm256_madd_epi16` pairs with the output of `dct8_row_build_pair`.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn dct8_row_coef_pack(_token: Desktop64, c_lo: i16, c_hi: i16) -> __m256i {
    // Go through `u16` so the sign bits of `c_lo` cannot leak into the upper half.
    let low_half = u32::from(c_lo as u16);
    let high_half = u32::from(c_hi as u16) << 16;
    _mm256_set1_epi32((high_half | low_half) as i32)
}
/// Row pass of an 8-point DCT over an 8x8 block of `i16` coefficients.
///
/// `coeff_col_major` is read as eight consecutive groups of eight values, one group
/// per input index. The first-stage rotations are evaluated with
/// `_mm256_madd_epi16` on interleaved input pairs; intermediate sums are clamped to
/// the `i16` range. The result is transposed and returned as 64 `i32` values, eight
/// per output row.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn dct8_row_pass_i16_simd(_token: Desktop64, coeff_col_major: [i16; 64]) -> [i32; 64] {
    // Load the eight input groups.
    let mut c = [_mm_setzero_si128(); 8];
    for (x, slot) in c.iter_mut().enumerate() {
        let src: &[i16; 8] = (&coeff_col_major[x * 8..x * 8 + 8]).try_into().unwrap();
        *slot = loadu_128!(src);
    }

    let bias12 = _mm256_set1_epi32(2048);
    let bias8 = _mm256_set1_epi32(128);
    let k181 = _mm256_set1_epi32(181);
    let lo = _mm256_set1_epi32(i16::MIN as i32);
    let hi = _mm256_set1_epi32(i16::MAX as i32);
    let clamp = |v: __m256i| _mm256_max_epi32(_mm256_min_epi32(v, hi), lo);

    let in17 = dct8_row_build_pair(_token, c[1], c[7]);
    let in53 = dct8_row_build_pair(_token, c[5], c[3]);
    let in26 = dct8_row_build_pair(_token, c[2], c[6]);
    let in04 = dct8_row_build_pair(_token, c[0], c[4]);

    // Even half (inputs 0, 4, 2, 6).
    let t0 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in04, dct8_row_coef_pack(_token, 181, 181)),
        bias8,
    ));
    let t1 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in04, dct8_row_coef_pack(_token, 181, -181)),
        bias8,
    ));
    let t2 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in26, dct8_row_coef_pack(_token, 1567, -3784)),
        bias12,
    ));
    let t3 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in26, dct8_row_coef_pack(_token, 3784, 1567)),
        bias12,
    ));
    let e0 = clamp(_mm256_add_epi32(t0, t3));
    let e1 = clamp(_mm256_add_epi32(t1, t2));
    let e2 = clamp(_mm256_sub_epi32(t1, t2));
    let e3 = clamp(_mm256_sub_epi32(t0, t3));

    // Odd half (inputs 1, 7, 5, 3).
    let t4a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in17, dct8_row_coef_pack(_token, 799, -4017)),
        bias12,
    ));
    let t7a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in17, dct8_row_coef_pack(_token, 4017, 799)),
        bias12,
    ));
    let t5a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in53, dct8_row_coef_pack(_token, 3406, -2276)),
        bias12,
    ));
    let t6a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in53, dct8_row_coef_pack(_token, 2276, 3406)),
        bias12,
    ));
    let t4 = clamp(_mm256_add_epi32(t4a, t5a));
    let t5b = clamp(_mm256_sub_epi32(t4a, t5a));
    let t7 = clamp(_mm256_add_epi32(t7a, t6a));
    let t6b = clamp(_mm256_sub_epi32(t7a, t6a));
    // t5 = ((t6b - t5b) * 181 + 128) >> 8, t6 = ((t6b + t5b) * 181 + 128) >> 8
    let diff = _mm256_sub_epi32(t6b, t5b);
    let t5 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(diff, k181), bias8));
    let total = _mm256_add_epi32(t6b, t5b);
    let t6 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(total, k181), bias8));

    // Final butterfly: output k pairs with output 7 - k.
    let cols: [__m256i; 8] = [
        clamp(_mm256_add_epi32(e0, t7)),
        clamp(_mm256_add_epi32(e1, t6)),
        clamp(_mm256_add_epi32(e2, t5)),
        clamp(_mm256_add_epi32(e3, t4)),
        clamp(_mm256_sub_epi32(e3, t4)),
        clamp(_mm256_sub_epi32(e2, t5)),
        clamp(_mm256_sub_epi32(e1, t6)),
        clamp(_mm256_sub_epi32(e0, t7)),
    ];

    let rows = transpose_8x8_i32!(cols);
    let mut out = [0i32; 64];
    for (y, chunk) in out.chunks_exact_mut(8).enumerate() {
        let dst: &mut [i32; 8] = chunk.try_into().unwrap();
        storeu_256!(dst, [i32; 8], rows[y]);
    }
    out
}
/// Column pass of an 8-point DCT over an 8x8 row-major block of `i32`.
///
/// Every input row is narrowed to `i16` with signed saturation so the rotations can
/// be evaluated with `_mm256_madd_epi16`; intermediate sums are clamped to the `i16`
/// range. Element `k` of the returned array is output row `k`, one 32-bit lane per
/// column.
#[cfg(target_arch = "x86_64")]
#[rite]
fn dct8_col_pass_i16(_token: Desktop64, tmp_row_major: &[i32; 64]) -> [__m256i; 8] {
    // Saturating i32 -> i16 narrowing of all eight lanes, kept in lane order.
    let narrow = |v: __m256i| {
        _mm_packs_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1))
    };

    let mut r = [_mm_setzero_si128(); 8];
    for (y, slot) in r.iter_mut().enumerate() {
        let wide = loadu_256!(&tmp_row_major[y * 8..y * 8 + 8], [i32; 8]);
        *slot = narrow(wide);
    }

    let bias12 = _mm256_set1_epi32(2048);
    let bias8 = _mm256_set1_epi32(128);
    let lo = _mm256_set1_epi32(i16::MIN as i32);
    let hi = _mm256_set1_epi32(i16::MAX as i32);
    let clamp = |v: __m256i| _mm256_max_epi32(_mm256_min_epi32(v, hi), lo);

    let in17 = dct8_row_build_pair(_token, r[1], r[7]);
    let in53 = dct8_row_build_pair(_token, r[5], r[3]);
    let in26 = dct8_row_build_pair(_token, r[2], r[6]);
    let in04 = dct8_row_build_pair(_token, r[0], r[4]);

    // Even half (rows 0, 4, 2, 6).
    let t0 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in04, dct8_row_coef_pack(_token, 181, 181)),
        bias8,
    ));
    let t1 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in04, dct8_row_coef_pack(_token, 181, -181)),
        bias8,
    ));
    let t2 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in26, dct8_row_coef_pack(_token, 1567, -3784)),
        bias12,
    ));
    let t3 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in26, dct8_row_coef_pack(_token, 3784, 1567)),
        bias12,
    ));
    let e0 = clamp(_mm256_add_epi32(t0, t3));
    let e1 = clamp(_mm256_add_epi32(t1, t2));
    let e2 = clamp(_mm256_sub_epi32(t1, t2));
    let e3 = clamp(_mm256_sub_epi32(t0, t3));

    // Odd half (rows 1, 7, 5, 3).
    let t4a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in17, dct8_row_coef_pack(_token, 799, -4017)),
        bias12,
    ));
    let t7a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in17, dct8_row_coef_pack(_token, 4017, 799)),
        bias12,
    ));
    let t5a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in53, dct8_row_coef_pack(_token, 3406, -2276)),
        bias12,
    ));
    let t6a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
        _mm256_madd_epi16(in53, dct8_row_coef_pack(_token, 2276, 3406)),
        bias12,
    ));
    let t4 = clamp(_mm256_add_epi32(t4a, t5a));
    let t5b = clamp(_mm256_sub_epi32(t4a, t5a));
    let t7 = clamp(_mm256_add_epi32(t7a, t6a));
    let t6b = clamp(_mm256_sub_epi32(t7a, t6a));

    // (t6b - t5b) * 181 and (t6b + t5b) * 181 via madd on the re-narrowed pair.
    let t5b_narrow = narrow(t5b);
    let t6b_narrow = narrow(t6b);
    let in65 = dct8_row_build_pair(_token, t6b_narrow, t5b_narrow);
    let t5 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in65, dct8_row_coef_pack(_token, 181, -181)),
        bias8,
    ));
    let t6 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
        _mm256_madd_epi16(in65, dct8_row_coef_pack(_token, 181, 181)),
        bias8,
    ));

    // Final butterfly: output k pairs with output 7 - k.
    [
        clamp(_mm256_add_epi32(e0, t7)),
        clamp(_mm256_add_epi32(e1, t6)),
        clamp(_mm256_add_epi32(e2, t5)),
        clamp(_mm256_add_epi32(e3, t4)),
        clamp(_mm256_sub_epi32(e3, t4)),
        clamp(_mm256_sub_epi32(e2, t5)),
        clamp(_mm256_sub_epi32(e1, t6)),
        clamp(_mm256_sub_epi32(e0, t7)),
    ]
}
/// Column pass of a 16-point DCT over a 16x16 row-major block of `i32`.
///
/// The block is processed as two chunks of eight columns. Within a chunk every input
/// row is narrowed to `i16` with signed saturation (`_mm_packs_epi32`) so that each
/// rotation can be evaluated with one `_mm256_madd_epi16` on an interleaved pair of
/// rows. Intermediate sums and the final outputs are clamped to the `i16` range.
///
/// Returns a new 16x16 row-major block; the input is not modified.
#[cfg(target_arch = "x86_64")]
#[rite]
fn dct16_col_pass_i16(_token: Desktop64, tmp_row_major: &[i32; 256]) -> [i32; 256] {
let mut result = [0i32; 256];
let col_min = i16::MIN as i32;
let col_max = i16::MAX as i32;
let col_min_v = _mm256_set1_epi32(col_min);
let col_max_v = _mm256_set1_epi32(col_max);
let clip = |v: __m256i| _mm256_max_epi32(_mm256_min_epi32(v, col_max_v), col_min_v);
// Rounding biases for the `>> 12` and `>> 8` rotations respectively.
let pd_2048 = _mm256_set1_epi32(2048);
let pd_128 = _mm256_set1_epi32(128);
for cx_chunk in 0..2u32 {
let cx = (cx_chunk * 8) as usize;
// Narrow the eight columns of this chunk, for all 16 rows, to saturated i16.
let mut row_xmm = [_mm_setzero_si128(); 16];
for y in 0..16 {
let v = loadu_256!(&tmp_row_major[y * 16 + cx..y * 16 + cx + 8], [i32; 8]);
let lo128 = _mm256_castsi256_si128(v);
let hi128 = _mm256_extracti128_si256(v, 1);
row_xmm[y] = _mm_packs_epi32(lo128, hi128);
}
// Even half, 4-point core: rows 0, 8, 4 and 12.
let pair_0_8 = dct8_row_build_pair(_token, row_xmm[0], row_xmm[8]);
let pair_4_12 = dct8_row_build_pair(_token, row_xmm[4], row_xmm[12]);
let e_t0 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_0_8, dct8_row_coef_pack(_token, 181, 181)),
pd_128,
));
let e_t1 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_0_8, dct8_row_coef_pack(_token, 181, -181)),
pd_128,
));
let e_t2 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_4_12, dct8_row_coef_pack(_token, 1567, -3784)),
pd_2048,
));
let e_t3 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_4_12, dct8_row_coef_pack(_token, 3784, 1567)),
pd_2048,
));
let dct4_0 = clip(_mm256_add_epi32(e_t0, e_t3));
let dct4_1 = clip(_mm256_add_epi32(e_t1, e_t2));
let dct4_2 = clip(_mm256_sub_epi32(e_t1, e_t2));
let dct4_3 = clip(_mm256_sub_epi32(e_t0, e_t3));
// Even half, 8-point stage: rows 2, 14, 10 and 6.
let pair_2_14 = dct8_row_build_pair(_token, row_xmm[2], row_xmm[14]);
let pair_10_6 = dct8_row_build_pair(_token, row_xmm[10], row_xmm[6]);
let t4a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_2_14, dct8_row_coef_pack(_token, 799, -4017)),
pd_2048,
));
let t7a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_2_14, dct8_row_coef_pack(_token, 4017, 799)),
pd_2048,
));
let t5a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_10_6, dct8_row_coef_pack(_token, 3406, -2276)),
pd_2048,
));
let t6a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_10_6, dct8_row_coef_pack(_token, 2276, 3406)),
pd_2048,
));
let e_t4 = clip(_mm256_add_epi32(t4a, t5a));
let e_t5a_n = clip(_mm256_sub_epi32(t4a, t5a));
let e_t7 = clip(_mm256_add_epi32(t7a, t6a));
let e_t6a_n = clip(_mm256_sub_epi32(t7a, t6a));
// Re-narrow the clipped values so the 181 rotation can use madd as well:
// e_t5 = ((t6a_n - t5a_n) * 181 + 128) >> 8, e_t6 = ((t6a_n + t5a_n) * 181 + 128) >> 8.
let e_t5a_n_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(e_t5a_n),
_mm256_extracti128_si256(e_t5a_n, 1),
);
let e_t6a_n_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(e_t6a_n),
_mm256_extracti128_si256(e_t6a_n, 1),
);
let pair_65 = dct8_row_build_pair(_token, e_t6a_n_xmm, e_t5a_n_xmm);
let e_t5 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_65, dct8_row_coef_pack(_token, 181, -181)),
pd_128,
));
let e_t6 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_65, dct8_row_coef_pack(_token, 181, 181)),
pd_128,
));
// Eight outputs of the even half.
let even_0 = clip(_mm256_add_epi32(dct4_0, e_t7));
let even_1 = clip(_mm256_add_epi32(dct4_1, e_t6));
let even_2 = clip(_mm256_add_epi32(dct4_2, e_t5));
let even_3 = clip(_mm256_add_epi32(dct4_3, e_t4));
let even_4 = clip(_mm256_sub_epi32(dct4_3, e_t4));
let even_5 = clip(_mm256_sub_epi32(dct4_2, e_t5));
let even_6 = clip(_mm256_sub_epi32(dct4_1, e_t6));
let even_7 = clip(_mm256_sub_epi32(dct4_0, e_t7));
// Odd half, first stage: rows 1, 15, 9, 7, 5, 11, 13 and 3.
let pair_1_15 = dct8_row_build_pair(_token, row_xmm[1], row_xmm[15]);
let pair_9_7 = dct8_row_build_pair(_token, row_xmm[9], row_xmm[7]);
let pair_5_11 = dct8_row_build_pair(_token, row_xmm[5], row_xmm[11]);
let pair_13_3 = dct8_row_build_pair(_token, row_xmm[13], row_xmm[3]);
let o_t8a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_1_15, dct8_row_coef_pack(_token, 401, -4076)),
pd_2048,
));
let o_t15a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_1_15, dct8_row_coef_pack(_token, 4076, 401)),
pd_2048,
));
let o_t9a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_9_7, dct8_row_coef_pack(_token, 3166, -2598)),
pd_2048,
));
let o_t14a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_9_7, dct8_row_coef_pack(_token, 2598, 3166)),
pd_2048,
));
let o_t10a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_5_11, dct8_row_coef_pack(_token, 1931, -3612)),
pd_2048,
));
let o_t13a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_5_11, dct8_row_coef_pack(_token, 3612, 1931)),
pd_2048,
));
let o_t11a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_3, dct8_row_coef_pack(_token, 3920, -1189)),
pd_2048,
));
let o_t12a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_3, dct8_row_coef_pack(_token, 1189, 3920)),
pd_2048,
));
// Odd half, second stage: sums and differences of neighbouring terms.
let o_t8 = clip(_mm256_add_epi32(o_t8a, o_t9a));
let o_t9 = clip(_mm256_sub_epi32(o_t8a, o_t9a));
let o_t10 = clip(_mm256_sub_epi32(o_t11a, o_t10a));
let o_t11 = clip(_mm256_add_epi32(o_t11a, o_t10a));
let o_t12 = clip(_mm256_add_epi32(o_t12a, o_t13a));
let o_t13 = clip(_mm256_sub_epi32(o_t12a, o_t13a));
let o_t14 = clip(_mm256_sub_epi32(o_t15a, o_t14a));
let o_t15 = clip(_mm256_add_epi32(o_t15a, o_t14a));
// Rotate (t14, t9) by the 1567/3784 pair; operands are re-narrowed to i16 for madd.
let o_t14_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t14),
_mm256_extracti128_si256(o_t14, 1),
);
let o_t9_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t9),
_mm256_extracti128_si256(o_t9, 1),
);
let pair_14_9 = dct8_row_build_pair(_token, o_t14_xmm, o_t9_xmm);
let o_t9a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_14_9, dct8_row_coef_pack(_token, 1567, -3784)),
pd_2048,
));
let o_t14a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_14_9, dct8_row_coef_pack(_token, 3784, 1567)),
pd_2048,
));
// Rotate (t13, t10); the first result is -(t13 * 3784 + t10 * 1567), hence both
// coefficients are negated.
let o_t13_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t13),
_mm256_extracti128_si256(o_t13, 1),
);
let o_t10_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t10),
_mm256_extracti128_si256(o_t10, 1),
);
let pair_13_10 = dct8_row_build_pair(_token, o_t13_xmm, o_t10_xmm);
let o_t10a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_10, dct8_row_coef_pack(_token, -3784, -1567)),
pd_2048,
));
let o_t13a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_10, dct8_row_coef_pack(_token, 1567, -3784)),
pd_2048,
));
// Odd half, third stage.
let o_t8a_f = clip(_mm256_add_epi32(o_t8, o_t11));
let o_t9_f = clip(_mm256_add_epi32(o_t9a_new, o_t10a_new));
let o_t10_f = clip(_mm256_sub_epi32(o_t9a_new, o_t10a_new));
let o_t11a_f = clip(_mm256_sub_epi32(o_t8, o_t11));
let o_t12a_f = clip(_mm256_sub_epi32(o_t15, o_t12));
let o_t13_f = clip(_mm256_sub_epi32(o_t14a_new, o_t13a_new));
let o_t14_f = clip(_mm256_add_epi32(o_t14a_new, o_t13a_new));
let o_t15a_f = clip(_mm256_add_epi32(o_t15, o_t12));
// Final 181 rotations of the odd half: (t13 -/+ t10) and (t12a -/+ t11a).
let o_t13_f_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t13_f),
_mm256_extracti128_si256(o_t13_f, 1),
);
let o_t10_f_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t10_f),
_mm256_extracti128_si256(o_t10_f, 1),
);
let pair_13f_10f = dct8_row_build_pair(_token, o_t13_f_xmm, o_t10_f_xmm);
let o_t10a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13f_10f, dct8_row_coef_pack(_token, 181, -181)),
pd_128,
));
let o_t13a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13f_10f, dct8_row_coef_pack(_token, 181, 181)),
pd_128,
));
let o_t12a_f_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t12a_f),
_mm256_extracti128_si256(o_t12a_f, 1),
);
let o_t11a_f_xmm = _mm_packs_epi32(
_mm256_castsi256_si128(o_t11a_f),
_mm256_extracti128_si256(o_t11a_f, 1),
);
let pair_12a_11a = dct8_row_build_pair(_token, o_t12a_f_xmm, o_t11a_f_xmm);
let o_t11_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_12a_11a, dct8_row_coef_pack(_token, 181, -181)),
pd_128,
));
let o_t12_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_12a_11a, dct8_row_coef_pack(_token, 181, 181)),
pd_128,
));
// `odd[k]` is the term combined with `even[k]`, i.e. t15a down to t8a.
let odd = [
o_t15a_f, o_t14_f, o_t13a_f, o_t12_f, o_t11_f, o_t10a_f, o_t9_f, o_t8a_f,
];
let even = [
even_0, even_1, even_2, even_3, even_4, even_5, even_6, even_7,
];
// Final butterfly: output k = even[k] + odd[k], output 15 - k = even[k] - odd[k].
let mut cols = [_mm256_setzero_si256(); 16];
for k in 0..8 {
cols[k] = clip(_mm256_add_epi32(even[k], odd[k]));
cols[15 - k] = clip(_mm256_sub_epi32(even[k], odd[k]));
}
// Write the 16 output rows of this 8-column chunk back in row-major order.
for y in 0..16 {
storeu_256!(&mut result[y * 16 + cx..y * 16 + cx + 8], [i32; 8], cols[y]);
}
}
result
}
/// Row pass of a 16-point DCT over a 16x16 block of `i16` coefficients.
///
/// The input is read column-major: input index `x` for rows `y_base..y_base + 8` is
/// the eight values at `coeff_col_major[y_base + x * 16..]`. Rows are processed in
/// two batches of eight; a batch whose coefficients are all zero is skipped and its
/// output stays zero. First-stage rotations use `_mm256_madd_epi16` on interleaved
/// `i16` pairs, later stages work on the clamped `i32` intermediates with
/// `_mm256_mullo_epi32`. Every sum/difference is clamped to the `i16` range.
///
/// Returns the result as a 16x16 row-major block of `i32`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn dct16_row_pass_i16_simd(_token: Desktop64, coeff_col_major: [i16; 256]) -> [i32; 256] {
let mut out = [0i32; 256];
let row_min = i16::MIN as i32;
let row_max = i16::MAX as i32;
let row_min_v = _mm256_set1_epi32(row_min);
let row_max_v = _mm256_set1_epi32(row_max);
let clip = |v: __m256i| _mm256_max_epi32(_mm256_min_epi32(v, row_max_v), row_min_v);
// Rounding biases for the `>> 12` and `>> 8` rotations respectively.
let pd_2048 = _mm256_set1_epi32(2048);
let pd_128 = _mm256_set1_epi32(128);
let c_181 = _mm256_set1_epi32(181);
for batch in 0..2u32 {
let y_base = (batch * 8) as usize;
// Load the 16 input vectors for this batch and OR them together to detect
// an all-zero batch.
let mut col_xmm = [_mm_setzero_si128(); 16];
let mut nz = _mm_setzero_si128();
for x in 0..16 {
let off = y_base + x * 16;
let arr: &[i16; 8] = (&coeff_col_major[off..off + 8]).try_into().unwrap();
col_xmm[x] = loadu_128!(arr);
nz = _mm_or_si128(nz, col_xmm[x]);
}
// `_mm_testz_si128(nz, nz)` returns 1 when `nz` is all zero: nothing to
// transform, and `out` is already zero for these rows.
if _mm_testz_si128(nz, nz) != 0 {
continue;
}
// Even half, 4-point core: inputs 0, 8, 4 and 12.
let pair_0_8 = dct8_row_build_pair(_token, col_xmm[0], col_xmm[8]);
let pair_4_12 = dct8_row_build_pair(_token, col_xmm[4], col_xmm[12]);
let e_t0 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_0_8, dct8_row_coef_pack(_token, 181, 181)),
pd_128,
));
let e_t1 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_0_8, dct8_row_coef_pack(_token, 181, -181)),
pd_128,
));
let e_t2 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_4_12, dct8_row_coef_pack(_token, 1567, -3784)),
pd_2048,
));
let e_t3 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_4_12, dct8_row_coef_pack(_token, 3784, 1567)),
pd_2048,
));
let dct4_0 = clip(_mm256_add_epi32(e_t0, e_t3));
let dct4_1 = clip(_mm256_add_epi32(e_t1, e_t2));
let dct4_2 = clip(_mm256_sub_epi32(e_t1, e_t2));
let dct4_3 = clip(_mm256_sub_epi32(e_t0, e_t3));
// Even half, 8-point stage: inputs 2, 14, 10 and 6.
let pair_2_14 = dct8_row_build_pair(_token, col_xmm[2], col_xmm[14]);
let pair_10_6 = dct8_row_build_pair(_token, col_xmm[10], col_xmm[6]);
let t4a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_2_14, dct8_row_coef_pack(_token, 799, -4017)),
pd_2048,
));
let t7a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_2_14, dct8_row_coef_pack(_token, 4017, 799)),
pd_2048,
));
let t5a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_10_6, dct8_row_coef_pack(_token, 3406, -2276)),
pd_2048,
));
let t6a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_10_6, dct8_row_coef_pack(_token, 2276, 3406)),
pd_2048,
));
let t4 = clip(_mm256_add_epi32(t4a, t5a));
let t5a_n = clip(_mm256_sub_epi32(t4a, t5a));
let t7 = clip(_mm256_add_epi32(t7a, t6a));
let t6a_n = clip(_mm256_sub_epi32(t7a, t6a));
// t5 = ((t6a_n - t5a_n) * 181 + 128) >> 8, t6 = ((t6a_n + t5a_n) * 181 + 128) >> 8.
let d_65 = _mm256_sub_epi32(t6a_n, t5a_n);
let t5 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(d_65, c_181), pd_128));
let s_65 = _mm256_add_epi32(t6a_n, t5a_n);
let t6 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(s_65, c_181), pd_128));
// Eight outputs of the even half.
let even = [
clip(_mm256_add_epi32(dct4_0, t7)),
clip(_mm256_add_epi32(dct4_1, t6)),
clip(_mm256_add_epi32(dct4_2, t5)),
clip(_mm256_add_epi32(dct4_3, t4)),
clip(_mm256_sub_epi32(dct4_3, t4)),
clip(_mm256_sub_epi32(dct4_2, t5)),
clip(_mm256_sub_epi32(dct4_1, t6)),
clip(_mm256_sub_epi32(dct4_0, t7)),
];
// Odd half, first stage: inputs 1, 15, 9, 7, 5, 11, 13 and 3.
let pair_1_15 = dct8_row_build_pair(_token, col_xmm[1], col_xmm[15]);
let pair_9_7 = dct8_row_build_pair(_token, col_xmm[9], col_xmm[7]);
let pair_5_11 = dct8_row_build_pair(_token, col_xmm[5], col_xmm[11]);
let pair_13_3 = dct8_row_build_pair(_token, col_xmm[13], col_xmm[3]);
let o_t8a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_1_15, dct8_row_coef_pack(_token, 401, -4076)),
pd_2048,
));
let o_t15a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_1_15, dct8_row_coef_pack(_token, 4076, 401)),
pd_2048,
));
let o_t9a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_9_7, dct8_row_coef_pack(_token, 3166, -2598)),
pd_2048,
));
let o_t14a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_9_7, dct8_row_coef_pack(_token, 2598, 3166)),
pd_2048,
));
let o_t10a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_5_11, dct8_row_coef_pack(_token, 1931, -3612)),
pd_2048,
));
let o_t13a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_5_11, dct8_row_coef_pack(_token, 3612, 1931)),
pd_2048,
));
let o_t11a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_3, dct8_row_coef_pack(_token, 3920, -1189)),
pd_2048,
));
let o_t12a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13_3, dct8_row_coef_pack(_token, 1189, 3920)),
pd_2048,
));
// Odd half, second stage. t9, t10, t13 and t14 are declared `mut` because
// they are overwritten with their third-stage values further down.
let o_t8 = clip(_mm256_add_epi32(o_t8a, o_t9a));
let mut o_t9 = clip(_mm256_sub_epi32(o_t8a, o_t9a));
let mut o_t10 = clip(_mm256_sub_epi32(o_t11a, o_t10a));
let o_t11 = clip(_mm256_add_epi32(o_t11a, o_t10a));
let o_t12 = clip(_mm256_add_epi32(o_t12a, o_t13a));
let mut o_t13 = clip(_mm256_sub_epi32(o_t12a, o_t13a));
let mut o_t14 = clip(_mm256_sub_epi32(o_t15a, o_t14a));
let o_t15 = clip(_mm256_add_epi32(o_t15a, o_t14a));
// 1567/3784 rotations of (t14, t9) and (t13, t10), done in 32-bit arithmetic.
let c_1567 = _mm256_set1_epi32(1567);
let c_3784 = _mm256_set1_epi32(3784);
let o_t9a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(o_t14, c_1567),
_mm256_mullo_epi32(o_t9, c_3784),
),
pd_2048,
));
let o_t14a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(o_t14, c_3784),
_mm256_mullo_epi32(o_t9, c_1567),
),
pd_2048,
));
// -(t13 * 3784 + t10 * 1567): the negation is written as `0 - (...)`.
let o_t10a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(o_t13, c_3784),
_mm256_mullo_epi32(o_t10, c_1567),
),
),
pd_2048,
));
let o_t13a_new = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(o_t13, c_1567),
_mm256_mullo_epi32(o_t10, c_3784),
),
pd_2048,
));
// Odd half, third stage.
let o_t8a_f = clip(_mm256_add_epi32(o_t8, o_t11));
o_t9 = clip(_mm256_add_epi32(o_t9a_new, o_t10a_new));
o_t10 = clip(_mm256_sub_epi32(o_t9a_new, o_t10a_new));
let o_t11a_f = clip(_mm256_sub_epi32(o_t8, o_t11));
let o_t12a_f = clip(_mm256_sub_epi32(o_t15, o_t12));
o_t13 = clip(_mm256_sub_epi32(o_t14a_new, o_t13a_new));
o_t14 = clip(_mm256_add_epi32(o_t14a_new, o_t13a_new));
let o_t15a_f = clip(_mm256_add_epi32(o_t15, o_t12));
// Final 181 rotations of the odd half: (t13 -/+ t10) and (t12a -/+ t11a).
let d2 = _mm256_sub_epi32(o_t13, o_t10);
let o_t10a_f =
_mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(d2, c_181), pd_128));
let s2 = _mm256_add_epi32(o_t13, o_t10);
let o_t13a_f =
_mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(s2, c_181), pd_128));
let d3 = _mm256_sub_epi32(o_t12a_f, o_t11a_f);
let o_t11_f =
_mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(d3, c_181), pd_128));
let s3 = _mm256_add_epi32(o_t12a_f, o_t11a_f);
let o_t12_f =
_mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(s3, c_181), pd_128));
// `odd[k]` is the term combined with `even[k]`, i.e. t15a down to t8a.
let odd = [
o_t15a_f, o_t14, o_t13a_f, o_t12_f, o_t11_f, o_t10a_f, o_t9, o_t8a_f,
];
// Final butterfly: output k = even[k] + odd[k], output 15 - k = even[k] - odd[k].
let mut cols = [_mm256_setzero_si256(); 16];
for k in 0..8 {
cols[k] = clip(_mm256_add_epi32(even[k], odd[k]));
cols[15 - k] = clip(_mm256_sub_epi32(even[k], odd[k]));
}
// `cols[k]` holds output index k for the eight rows of this batch. Transpose
// each group of eight outputs and store it into the row-major result.
for chunk in 0..2u32 {
let b = (chunk * 8) as usize;
let chunk_cols: [__m256i; 8] = [
cols[b],
cols[b + 1],
cols[b + 2],
cols[b + 3],
cols[b + 4],
cols[b + 5],
cols[b + 6],
cols[b + 7],
];
let rows = transpose_8x8_i32!(chunk_cols);
for r in 0..8 {
let dst_off = (y_base + r) * 16 + b;
let arr: &mut [i32; 8] = (&mut out[dst_off..dst_off + 8]).try_into().unwrap();
storeu_256!(arr, [i32; 8], rows[r]);
}
}
}
out
}
/// Row pass of a 32-point DCT butterfly network over a 32x32 block of `i16`
/// coefficients, processed eight rows at a time with 256-bit vectors.
///
/// * `coeff_col_major` is read column by column: element `(row, col)` is taken
///   from index `row + col * 32`.
/// * The returned array is row-major: element `(row, col)` is at
///   `row * 32 + col`.
///
/// Every butterfly output is clamped to the `i16` range. No rounding shift is
/// applied to the final outputs inside this function. Batches of eight rows
/// whose coefficients are all zero are skipped and stay zero in the output.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn dct32_row_pass_i16_simd(_token: Desktop64, coeff_col_major: [i16; 1024]) -> [i32; 1024] {
let mut out = [0i32; 1024];
// Local aliases for helpers defined elsewhere in the file.
// NOTE(review): `build_pair(a, b)` presumably interleaves two i16 vectors so that
// `_mm256_madd_epi16(pair, coef_pack(c0, c1))` yields `a * c0 + b * c1` per 32-bit
// lane — confirm against the helper definitions.
let build_pair = dct8_row_build_pair;
let coef_pack = dct8_row_coef_pack;
// Four batches of eight rows cover the 32 rows of the block.
for batch in 0..4u32 {
let y_base = (batch * 8) as usize;
// cx[x] holds column x for rows y_base..y_base + 8; nz accumulates the OR of
// all loaded values for the all-zero test below.
let mut cx = [_mm_setzero_si128(); 32];
let mut nz = _mm_setzero_si128();
for x in 0..32 {
let off = y_base + x * 32;
let arr: &[i16; 8] = (&coeff_col_major[off..off + 8]).try_into().unwrap();
cx[x] = loadu_128!(arr);
nz = _mm_or_si128(nz, cx[x]);
}
// `_mm_testz_si128(nz, nz)` is non-zero only when every bit of nz is clear:
// nothing to transform, and `out` is already zero for these rows.
if _mm_testz_si128(nz, nz) != 0 {
continue;
}
// Rounding biases for the `>> 12` and `>> 8` fixed-point multiplies.
let pd_2048 = _mm256_set1_epi32(2048);
let pd_128 = _mm256_set1_epi32(128);
let row_min_v = _mm256_set1_epi32(i16::MIN as i32);
let row_max_v = _mm256_set1_epi32(i16::MAX as i32);
// Clamp every 32-bit lane to the i16 range.
let clip = |v: __m256i| _mm256_max_epi32(_mm256_min_epi32(v, row_max_v), row_min_v);
let c_181 = _mm256_set1_epi32(181);
// ---- 4-point stage: inputs 0, 16 (even pair) and 8, 24 (odd pair) ----
let pair_04 = build_pair(_token, cx[0], cx[16]);
let pair_13 = build_pair(_token, cx[8], cx[24]);
let dct4_t0 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_04, coef_pack(_token, 181, 181)),
pd_128,
));
let dct4_t1 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_madd_epi16(pair_04, coef_pack(_token, 181, -181)),
pd_128,
));
let dct4_t2 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13, coef_pack(_token, 1567, -3784)),
pd_2048,
));
let dct4_t3 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_13, coef_pack(_token, 3784, 1567)),
pd_2048,
));
let dct4_o0 = clip(_mm256_add_epi32(dct4_t0, dct4_t3));
let dct4_o1 = clip(_mm256_add_epi32(dct4_t1, dct4_t2));
let dct4_o2 = clip(_mm256_sub_epi32(dct4_t1, dct4_t2));
let dct4_o3 = clip(_mm256_sub_epi32(dct4_t0, dct4_t3));
// ---- 8-point stage: adds inputs 4, 28, 20, 12 on top of the 4-point result ----
let pair_8_17 = build_pair(_token, cx[4], cx[28]);
let pair_8_53 = build_pair(_token, cx[20], cx[12]);
let t4a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_8_17, coef_pack(_token, 799, -4017)),
pd_2048,
));
let t7a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_8_17, coef_pack(_token, 4017, 799)),
pd_2048,
));
let t5a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_8_53, coef_pack(_token, 3406, -2276)),
pd_2048,
));
let t6a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_8_53, coef_pack(_token, 2276, 3406)),
pd_2048,
));
let t4 = clip(_mm256_add_epi32(t4a, t5a));
let t5a_n = clip(_mm256_sub_epi32(t4a, t5a));
let t7 = clip(_mm256_add_epi32(t7a, t6a));
let t6a_n = clip(_mm256_sub_epi32(t7a, t6a));
// t5 / t6 = ((t6a -/+ t5a) * 181 + 128) >> 8, done in 32-bit lanes.
let d_56 = _mm256_sub_epi32(t6a_n, t5a_n);
let t5 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(d_56, c_181), pd_128));
let s_56 = _mm256_add_epi32(t6a_n, t5a_n);
let t6 = _mm256_srai_epi32::<8>(_mm256_add_epi32(_mm256_mullo_epi32(s_56, c_181), pd_128));
let dct8_o0 = clip(_mm256_add_epi32(dct4_o0, t7));
let dct8_o1 = clip(_mm256_add_epi32(dct4_o1, t6));
let dct8_o2 = clip(_mm256_add_epi32(dct4_o2, t5));
let dct8_o3 = clip(_mm256_add_epi32(dct4_o3, t4));
let dct8_o4 = clip(_mm256_sub_epi32(dct4_o3, t4));
let dct8_o5 = clip(_mm256_sub_epi32(dct4_o2, t5));
let dct8_o6 = clip(_mm256_sub_epi32(dct4_o1, t6));
let dct8_o7 = clip(_mm256_sub_epi32(dct4_o0, t7));
// ---- 16-point stage: adds inputs 2, 30, 18, 14, 10, 22, 26, 6 ----
let pair_16_1_15 = build_pair(_token, cx[2], cx[30]);
let pair_16_9_7 = build_pair(_token, cx[18], cx[14]);
let pair_16_5_11 = build_pair(_token, cx[10], cx[22]);
let pair_16_13_3 = build_pair(_token, cx[26], cx[6]);
let t8a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_1_15, coef_pack(_token, 401, -4076)),
pd_2048,
));
let t15a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_1_15, coef_pack(_token, 4076, 401)),
pd_2048,
));
let t9a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_9_7, coef_pack(_token, 3166, -2598)),
pd_2048,
));
let t14a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_9_7, coef_pack(_token, 2598, 3166)),
pd_2048,
));
let t10a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_5_11, coef_pack(_token, 1931, -3612)),
pd_2048,
));
let t13a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_5_11, coef_pack(_token, 3612, 1931)),
pd_2048,
));
let t11a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_13_3, coef_pack(_token, 3920, -1189)),
pd_2048,
));
let t12a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_16_13_3, coef_pack(_token, 1189, 3920)),
pd_2048,
));
// First add/sub layer of the 16-point odd half. The `mut` values are
// overwritten by the later layers, and several `t*a` names are shadowed.
let t8 = clip(_mm256_add_epi32(t8a, t9a));
let mut t9 = clip(_mm256_sub_epi32(t8a, t9a));
let mut t10 = clip(_mm256_sub_epi32(t11a, t10a));
let t11 = clip(_mm256_add_epi32(t11a, t10a));
let t12 = clip(_mm256_add_epi32(t12a, t13a));
let mut t13 = clip(_mm256_sub_epi32(t12a, t13a));
let mut t14 = clip(_mm256_sub_epi32(t15a, t14a));
let t15 = clip(_mm256_add_epi32(t15a, t14a));
// The inputs are now 32-bit, so the 1567/3784 multiplies use
// `_mm256_mullo_epi32` instead of `_mm256_madd_epi16`.
let c1567 = _mm256_set1_epi32(1567);
let c3784 = _mm256_set1_epi32(3784);
let t9a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t14, c1567),
_mm256_mullo_epi32(t9, c3784),
),
pd_2048,
));
let t14a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(t14, c3784),
_mm256_mullo_epi32(t9, c1567),
),
pd_2048,
));
// 0 - (...) negates the sum before the rounding bias is added.
let t10a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(t13, c3784),
_mm256_mullo_epi32(t10, c1567),
),
),
pd_2048,
));
let t13a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t13, c1567),
_mm256_mullo_epi32(t10, c3784),
),
pd_2048,
));
// Second add/sub layer.
let t8a = clip(_mm256_add_epi32(t8, t11));
t9 = clip(_mm256_add_epi32(t9a, t10a));
t10 = clip(_mm256_sub_epi32(t9a, t10a));
let t11a = clip(_mm256_sub_epi32(t8, t11));
let t12a = clip(_mm256_sub_epi32(t15, t12));
t13 = clip(_mm256_sub_epi32(t14a, t13a));
t14 = clip(_mm256_add_epi32(t14a, t13a));
let t15a = clip(_mm256_add_epi32(t15, t12));
// Final (x -/+ y) * 181 + 128 >> 8 layer of the 16-point odd half.
let t10a = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t13, t10), c_181),
pd_128,
));
let t13a = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t13, t10), c_181),
pd_128,
));
let t11 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t12a, t11a), c_181),
pd_128,
));
let t12 = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t12a, t11a), c_181),
pd_128,
));
// 16-point outputs: the 8-point result plus/minus the odd half, mirrored.
let dct16_o = [
clip(_mm256_add_epi32(dct8_o0, t15a)),
clip(_mm256_add_epi32(dct8_o1, t14)),
clip(_mm256_add_epi32(dct8_o2, t13a)),
clip(_mm256_add_epi32(dct8_o3, t12)),
clip(_mm256_add_epi32(dct8_o4, t11)),
clip(_mm256_add_epi32(dct8_o5, t10a)),
clip(_mm256_add_epi32(dct8_o6, t9)),
clip(_mm256_add_epi32(dct8_o7, t8a)),
clip(_mm256_sub_epi32(dct8_o7, t8a)),
clip(_mm256_sub_epi32(dct8_o6, t9)),
clip(_mm256_sub_epi32(dct8_o5, t10a)),
clip(_mm256_sub_epi32(dct8_o4, t11)),
clip(_mm256_sub_epi32(dct8_o3, t12)),
clip(_mm256_sub_epi32(dct8_o2, t13a)),
clip(_mm256_sub_epi32(dct8_o1, t14)),
clip(_mm256_sub_epi32(dct8_o0, t15a)),
];
// ---- 32-point stage: all odd-indexed inputs ----
let pair_32_1_31 = build_pair(_token, cx[1], cx[31]);
let pair_32_17_15 = build_pair(_token, cx[17], cx[15]);
let pair_32_9_23 = build_pair(_token, cx[9], cx[23]);
let pair_32_25_7 = build_pair(_token, cx[25], cx[7]);
let pair_32_5_27 = build_pair(_token, cx[5], cx[27]);
let pair_32_21_11 = build_pair(_token, cx[21], cx[11]);
let pair_32_13_19 = build_pair(_token, cx[13], cx[19]);
let pair_32_29_3 = build_pair(_token, cx[29], cx[3]);
let t16a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_1_31, coef_pack(_token, 201, -4091)),
pd_2048,
));
let t31a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_1_31, coef_pack(_token, 4091, 201)),
pd_2048,
));
let t17a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_17_15, coef_pack(_token, 3035, -2751)),
pd_2048,
));
let t30a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_17_15, coef_pack(_token, 2751, 3035)),
pd_2048,
));
let t18a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_9_23, coef_pack(_token, 1751, -3703)),
pd_2048,
));
let t29a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_9_23, coef_pack(_token, 3703, 1751)),
pd_2048,
));
let t19a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_25_7, coef_pack(_token, 3857, -1380)),
pd_2048,
));
let t28a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_25_7, coef_pack(_token, 1380, 3857)),
pd_2048,
));
let t20a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_5_27, coef_pack(_token, 995, -3973)),
pd_2048,
));
let t27a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_5_27, coef_pack(_token, 3973, 995)),
pd_2048,
));
let t21a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_21_11, coef_pack(_token, 3513, -2106)),
pd_2048,
));
let t26a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_21_11, coef_pack(_token, 2106, 3513)),
pd_2048,
));
let t22a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_13_19, coef_pack(_token, 2440, -3290)),
pd_2048,
));
let t25a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_13_19, coef_pack(_token, 3290, 2440)),
pd_2048,
));
let t23a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_29_3, coef_pack(_token, 4052, -601)),
pd_2048,
));
let t24a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_madd_epi16(pair_32_29_3, coef_pack(_token, 601, 4052)),
pd_2048,
));
// First add/sub layer of the 32-point odd half.
let mut t16 = clip(_mm256_add_epi32(t16a, t17a));
let mut t17 = clip(_mm256_sub_epi32(t16a, t17a));
let mut t18 = clip(_mm256_sub_epi32(t19a, t18a));
let t19 = clip(_mm256_add_epi32(t19a, t18a));
let t20 = clip(_mm256_add_epi32(t20a, t21a));
let mut t21 = clip(_mm256_sub_epi32(t20a, t21a));
let mut t22 = clip(_mm256_sub_epi32(t23a, t22a));
let mut t23 = clip(_mm256_add_epi32(t23a, t22a));
let mut t24 = clip(_mm256_add_epi32(t24a, t25a));
let mut t25 = clip(_mm256_sub_epi32(t24a, t25a));
let mut t26 = clip(_mm256_sub_epi32(t27a, t26a));
let t27 = clip(_mm256_add_epi32(t27a, t26a));
let t28 = clip(_mm256_add_epi32(t28a, t29a));
let mut t29 = clip(_mm256_sub_epi32(t28a, t29a));
let mut t30 = clip(_mm256_sub_epi32(t31a, t30a));
let mut t31 = clip(_mm256_add_epi32(t31a, t30a));
// 799/4017 pairs use `>> 12` with bias 2048; 1703/1138 pairs use `>> 11`
// with bias 1024.
let c799 = _mm256_set1_epi32(799);
let c4017 = _mm256_set1_epi32(4017);
let c1703 = _mm256_set1_epi32(1703);
let c1138 = _mm256_set1_epi32(1138);
let pd_1024 = _mm256_set1_epi32(1024);
let t17a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t30, c799),
_mm256_mullo_epi32(t17, c4017),
),
pd_2048,
));
let t30a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(t30, c4017),
_mm256_mullo_epi32(t17, c799),
),
pd_2048,
));
let t18a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(t29, c4017),
_mm256_mullo_epi32(t18, c799),
),
),
pd_2048,
));
let t29a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t29, c799),
_mm256_mullo_epi32(t18, c4017),
),
pd_2048,
));
let t21a = _mm256_srai_epi32::<11>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t26, c1703),
_mm256_mullo_epi32(t21, c1138),
),
pd_1024,
));
let t26a = _mm256_srai_epi32::<11>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(t26, c1138),
_mm256_mullo_epi32(t21, c1703),
),
pd_1024,
));
let t22a = _mm256_srai_epi32::<11>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(t25, c1138),
_mm256_mullo_epi32(t22, c1703),
),
),
pd_1024,
));
let t25a = _mm256_srai_epi32::<11>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t25, c1703),
_mm256_mullo_epi32(t22, c1138),
),
pd_1024,
));
// Second add/sub layer.
let t16a = clip(_mm256_add_epi32(t16, t19));
t17 = clip(_mm256_add_epi32(t17a, t18a));
t18 = clip(_mm256_sub_epi32(t17a, t18a));
let t19a = clip(_mm256_sub_epi32(t16, t19));
let t20a = clip(_mm256_sub_epi32(t23, t20));
t21 = clip(_mm256_sub_epi32(t22a, t21a));
t22 = clip(_mm256_add_epi32(t22a, t21a));
let t23a = clip(_mm256_add_epi32(t23, t20));
let t24a = clip(_mm256_add_epi32(t24, t27));
t25 = clip(_mm256_add_epi32(t25a, t26a));
t26 = clip(_mm256_sub_epi32(t25a, t26a));
let t27a = clip(_mm256_sub_epi32(t24, t27));
let t28a = clip(_mm256_sub_epi32(t31, t28));
t29 = clip(_mm256_sub_epi32(t30a, t29a));
t30 = clip(_mm256_add_epi32(t30a, t29a));
let t31a = clip(_mm256_add_epi32(t31, t28));
// 1567/3784 multiplies on the 18/29, 19/28, 20/27 and 21/26 pairs.
let t18a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t29, c1567),
_mm256_mullo_epi32(t18, c3784),
),
pd_2048,
));
let t29a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(t29, c3784),
_mm256_mullo_epi32(t18, c1567),
),
pd_2048,
));
let t19 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t28a, c1567),
_mm256_mullo_epi32(t19a, c3784),
),
pd_2048,
));
let t28 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(t28a, c3784),
_mm256_mullo_epi32(t19a, c1567),
),
pd_2048,
));
let t20 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(t27a, c3784),
_mm256_mullo_epi32(t20a, c1567),
),
),
pd_2048,
));
let t27 = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t27a, c1567),
_mm256_mullo_epi32(t20a, c3784),
),
pd_2048,
));
let t21a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_setzero_si256(),
_mm256_add_epi32(
_mm256_mullo_epi32(t26, c3784),
_mm256_mullo_epi32(t21, c1567),
),
),
pd_2048,
));
let t26a = _mm256_srai_epi32::<12>(_mm256_add_epi32(
_mm256_sub_epi32(
_mm256_mullo_epi32(t26, c1567),
_mm256_mullo_epi32(t21, c3784),
),
pd_2048,
));
// Third add/sub layer.
t16 = clip(_mm256_add_epi32(t16a, t23a));
let t17a = clip(_mm256_add_epi32(t17, t22));
t18 = clip(_mm256_add_epi32(t18a, t21a));
let t19a = clip(_mm256_add_epi32(t19, t20));
let t20a = clip(_mm256_sub_epi32(t19, t20));
t21 = clip(_mm256_sub_epi32(t18a, t21a));
let t22a = clip(_mm256_sub_epi32(t17, t22));
t23 = clip(_mm256_sub_epi32(t16a, t23a));
t24 = clip(_mm256_sub_epi32(t31a, t24a));
let t25a = clip(_mm256_sub_epi32(t30, t25));
t26 = clip(_mm256_sub_epi32(t29a, t26a));
let t27a = clip(_mm256_sub_epi32(t28, t27));
let t28a = clip(_mm256_add_epi32(t28, t27));
t29 = clip(_mm256_add_epi32(t29a, t26a));
let t30a = clip(_mm256_add_epi32(t30, t25));
t31 = clip(_mm256_add_epi32(t31a, t24a));
// Final (x -/+ y) * 181 + 128 >> 8 layer for the middle terms 20..=27.
let t20_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t27a, t20a), c_181),
pd_128,
));
let t27_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t27a, t20a), c_181),
pd_128,
));
let t21a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t26, t21), c_181),
pd_128,
));
let t26a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t26, t21), c_181),
pd_128,
));
let t22_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t25a, t22a), c_181),
pd_128,
));
let t25_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t25a, t22a), c_181),
pd_128,
));
let t23a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_sub_epi32(t24, t23), c_181),
pd_128,
));
let t24a_f = _mm256_srai_epi32::<8>(_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(t24, t23), c_181),
pd_128,
));
// 32-point outputs: cols[k] = dct16_o[k] + odd term and
// cols[31 - k] = dct16_o[k] - odd term, each clamped.
let mut cols = [_mm256_setzero_si256(); 32];
cols[0] = clip(_mm256_add_epi32(dct16_o[0], t31));
cols[1] = clip(_mm256_add_epi32(dct16_o[1], t30a));
cols[2] = clip(_mm256_add_epi32(dct16_o[2], t29));
cols[3] = clip(_mm256_add_epi32(dct16_o[3], t28a));
cols[4] = clip(_mm256_add_epi32(dct16_o[4], t27_f));
cols[5] = clip(_mm256_add_epi32(dct16_o[5], t26a_f));
cols[6] = clip(_mm256_add_epi32(dct16_o[6], t25_f));
cols[7] = clip(_mm256_add_epi32(dct16_o[7], t24a_f));
cols[8] = clip(_mm256_add_epi32(dct16_o[8], t23a_f));
cols[9] = clip(_mm256_add_epi32(dct16_o[9], t22_f));
cols[10] = clip(_mm256_add_epi32(dct16_o[10], t21a_f));
cols[11] = clip(_mm256_add_epi32(dct16_o[11], t20_f));
cols[12] = clip(_mm256_add_epi32(dct16_o[12], t19a));
cols[13] = clip(_mm256_add_epi32(dct16_o[13], t18));
cols[14] = clip(_mm256_add_epi32(dct16_o[14], t17a));
cols[15] = clip(_mm256_add_epi32(dct16_o[15], t16));
cols[16] = clip(_mm256_sub_epi32(dct16_o[15], t16));
cols[17] = clip(_mm256_sub_epi32(dct16_o[14], t17a));
cols[18] = clip(_mm256_sub_epi32(dct16_o[13], t18));
cols[19] = clip(_mm256_sub_epi32(dct16_o[12], t19a));
cols[20] = clip(_mm256_sub_epi32(dct16_o[11], t20_f));
cols[21] = clip(_mm256_sub_epi32(dct16_o[10], t21a_f));
cols[22] = clip(_mm256_sub_epi32(dct16_o[9], t22_f));
cols[23] = clip(_mm256_sub_epi32(dct16_o[8], t23a_f));
cols[24] = clip(_mm256_sub_epi32(dct16_o[7], t24a_f));
cols[25] = clip(_mm256_sub_epi32(dct16_o[6], t25_f));
cols[26] = clip(_mm256_sub_epi32(dct16_o[5], t26a_f));
cols[27] = clip(_mm256_sub_epi32(dct16_o[4], t27_f));
cols[28] = clip(_mm256_sub_epi32(dct16_o[3], t28a));
cols[29] = clip(_mm256_sub_epi32(dct16_o[2], t29));
cols[30] = clip(_mm256_sub_epi32(dct16_o[1], t30a));
cols[31] = clip(_mm256_sub_epi32(dct16_o[0], t31));
// Each cols[k] holds output column k for the eight rows of this batch.
// Transposing 8x8 tiles lets every row be stored as contiguous groups of
// eight i32 values in the row-major output.
for chunk in 0..4 {
let b = chunk * 8;
let chunk_cols: [__m256i; 8] = [
cols[b],
cols[b + 1],
cols[b + 2],
cols[b + 3],
cols[b + 4],
cols[b + 5],
cols[b + 6],
cols[b + 7],
];
let rows = transpose_8x8_i32!(chunk_cols);
for row in 0..8 {
let y = y_base + row;
let arr: &mut [i32; 8] = (&mut out[y * 32 + b..y * 32 + b + 8]).try_into().unwrap();
storeu_256!(arr, [i32; 8], rows[row]);
}
}
}
out
}
/// Runs the 16-point ADST row transform on eight rows of `i16` coefficients
/// and writes the rounded, clamped results into `tmp`.
///
/// * `coeff` is read column by column: for column `x`, the eight values start
///   at `y_base + x * coeff_h`.
/// * `apply_rect2` pre-scales every input by `(v * 181 + 128) >> 8`.
/// * `flipped` reverses the order of the 16 transform outputs.
/// * Each output becomes `(v + rnd) >> shift` for `shift` 0 or 1; any other
///   `shift` value shifts by 2. The result is clamped to `[col_min, col_max]`.
/// * `row_min` / `row_max` are forwarded to `adst16_1d_cols8` as its clamp
///   bounds.
/// * `tmp` is written row-major with a row stride of 16, rows
///   `y_base..y_base + 8`.
///
/// Panics if `coeff` or `tmp` is too short for the ranges above.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn simd_row_adst16_8bpc_8rows(
    token: Desktop64,
    coeff: &[i16],
    coeff_h: usize,
    y_base: usize,
    apply_rect2: bool,
    flipped: bool,
    rnd: i32,
    shift: i32,
    tmp: &mut [i32],
    row_min: i32,
    row_max: i32,
    col_min: i32,
    col_max: i32,
) {
    const WIDTH: usize = 16;

    let row_lo = _mm256_set1_epi32(row_min);
    let row_hi = _mm256_set1_epi32(row_max);
    let col_lo = _mm256_set1_epi32(col_min);
    let col_hi = _mm256_set1_epi32(col_max);
    let scale_181 = _mm256_set1_epi32(181);
    let round_128 = _mm256_set1_epi32(128);
    let round_out = _mm256_set1_epi32(rnd);

    // Load one vector per column: eight rows, widened from i16 to i32.
    let mut lanes = [_mm256_setzero_si256(); WIDTH];
    for (x, lane) in lanes.iter_mut().enumerate() {
        let start = y_base + x * coeff_h;
        let src: &[i16; 8] = (&coeff[start..start + 8]).try_into().unwrap();
        let widened = _mm256_cvtepi16_epi32(loadu_128!(src));
        *lane = if apply_rect2 {
            let scaled = _mm256_mullo_epi32(widened, scale_181);
            _mm256_srai_epi32::<8>(_mm256_add_epi32(scaled, round_128))
        } else {
            widened
        };
    }

    adst16_1d_cols8(token, &mut lanes, row_lo, row_hi);
    if flipped {
        lanes.reverse();
    }

    // Inter-pass rounding shift followed by the clamp for the next pass.
    for lane in lanes.iter_mut() {
        let biased = _mm256_add_epi32(*lane, round_out);
        let shifted = match shift {
            0 => biased,
            1 => _mm256_srai_epi32::<1>(biased),
            _ => _mm256_srai_epi32::<2>(biased),
        };
        *lane = _mm256_max_epi32(_mm256_min_epi32(shifted, col_hi), col_lo);
    }

    // Transpose 8x8 tiles so each row can be stored contiguously.
    for tile in 0..WIDTH / 8 {
        let x0 = tile * 8;
        let tile_cols: [__m256i; 8] = std::array::from_fn(|k| lanes[x0 + k]);
        let rows = transpose_8x8_i32!(tile_cols);
        for r in 0..8 {
            let start = (y_base + r) * WIDTH + x0;
            storeu_256!(&mut tmp[start..start + 8], [i32; 8], rows[r]);
        }
    }
}
/// Runs the 16-point DCT row transform on eight rows of `i16` coefficients
/// and writes the rounded, clamped results into `tmp`.
///
/// * `coeff` is read column by column: for column `x`, the eight values start
///   at `y_base + x * coeff_h`.
/// * `apply_rect2` pre-scales every input by `(v * 181 + 128) >> 8`.
/// * Each output becomes `(v + rnd) >> shift` for `shift` 0 or 1; any other
///   `shift` value shifts by 2. The result is clamped to `[col_min, col_max]`.
/// * `row_min` / `row_max` are forwarded to `dct16_1d_cols8` as its clamp
///   bounds.
/// * `tmp` is written row-major with a row stride of 16, rows
///   `y_base..y_base + 8`.
///
/// Panics if `coeff` or `tmp` is too short for the ranges above.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn simd_row_dct16_8bpc_8rows(
    token: Desktop64,
    coeff: &[i16],
    coeff_h: usize,
    y_base: usize,
    apply_rect2: bool,
    rnd: i32,
    shift: i32,
    tmp: &mut [i32],
    row_min: i32,
    row_max: i32,
    col_min: i32,
    col_max: i32,
) {
    const WIDTH: usize = 16;

    let row_lo = _mm256_set1_epi32(row_min);
    let row_hi = _mm256_set1_epi32(row_max);
    let col_lo = _mm256_set1_epi32(col_min);
    let col_hi = _mm256_set1_epi32(col_max);
    let scale_181 = _mm256_set1_epi32(181);
    let round_128 = _mm256_set1_epi32(128);
    let round_out = _mm256_set1_epi32(rnd);

    // Load one vector per column: eight rows, widened from i16 to i32.
    let mut lanes = [_mm256_setzero_si256(); WIDTH];
    for (x, lane) in lanes.iter_mut().enumerate() {
        let start = y_base + x * coeff_h;
        let src: &[i16; 8] = (&coeff[start..start + 8]).try_into().unwrap();
        let widened = _mm256_cvtepi16_epi32(loadu_128!(src));
        *lane = if apply_rect2 {
            let scaled = _mm256_mullo_epi32(widened, scale_181);
            _mm256_srai_epi32::<8>(_mm256_add_epi32(scaled, round_128))
        } else {
            widened
        };
    }

    dct16_1d_cols8(token, &mut lanes, row_lo, row_hi);

    // Inter-pass rounding shift followed by the clamp for the next pass.
    for lane in lanes.iter_mut() {
        let biased = _mm256_add_epi32(*lane, round_out);
        let shifted = match shift {
            0 => biased,
            1 => _mm256_srai_epi32::<1>(biased),
            _ => _mm256_srai_epi32::<2>(biased),
        };
        *lane = _mm256_max_epi32(_mm256_min_epi32(shifted, col_hi), col_lo);
    }

    // Transpose 8x8 tiles so each row can be stored contiguously.
    for tile in 0..WIDTH / 8 {
        let x0 = tile * 8;
        let tile_cols: [__m256i; 8] = std::array::from_fn(|k| lanes[x0 + k]);
        let rows = transpose_8x8_i32!(tile_cols);
        for r in 0..8 {
            let start = (y_base + r) * WIDTH + x0;
            storeu_256!(&mut tmp[start..start + 8], [i32; 8], rows[r]);
        }
    }
}
/// Runs the 32-point DCT row transform on eight rows of `i16` coefficients
/// and writes the rounded, clamped results into `tmp`.
///
/// * `coeff` is read column by column: for column `x`, the eight values start
///   at `y_base + x * coeff_h`.
/// * `apply_rect2` pre-scales every input by `(v * 181 + 128) >> 8`.
/// * Each output becomes `(v + rnd) >> shift` for `shift` 1 or 2; any other
///   `shift` value only adds `rnd` without shifting. The result is clamped to
///   `[col_min, col_max]`.
/// * `row_min` / `row_max` are forwarded to `dct32_1d_cols8_i16` as its clamp
///   bounds.
/// * `tmp` is written row-major with a row stride of 32, rows
///   `y_base..y_base + 8`.
///
/// Panics if `coeff` or `tmp` is too short for the ranges above.
#[cfg(target_arch = "x86_64")]
#[rite]
#[inline(always)]
fn simd_row_dct32_8bpc_8rows(
    token: Desktop64,
    coeff: &[i16],
    coeff_h: usize,
    y_base: usize,
    apply_rect2: bool,
    rnd: i32,
    shift: i32,
    tmp: &mut [i32],
    row_min: i32,
    row_max: i32,
    col_min: i32,
    col_max: i32,
) {
    const WIDTH: usize = 32;

    let row_lo = _mm256_set1_epi32(row_min);
    let row_hi = _mm256_set1_epi32(row_max);
    let col_lo = _mm256_set1_epi32(col_min);
    let col_hi = _mm256_set1_epi32(col_max);
    let scale_181 = _mm256_set1_epi32(181);
    let round_128 = _mm256_set1_epi32(128);
    let round_out = _mm256_set1_epi32(rnd);

    // Load one vector per column: eight rows, widened from i16 to i32.
    let mut lanes = [_mm256_setzero_si256(); WIDTH];
    for (x, lane) in lanes.iter_mut().enumerate() {
        let start = y_base + x * coeff_h;
        let src: &[i16; 8] = (&coeff[start..start + 8]).try_into().unwrap();
        let widened = _mm256_cvtepi16_epi32(loadu_128!(src));
        *lane = if apply_rect2 {
            let scaled = _mm256_mullo_epi32(widened, scale_181);
            _mm256_srai_epi32::<8>(_mm256_add_epi32(scaled, round_128))
        } else {
            widened
        };
    }

    dct32_1d_cols8_i16(token, &mut lanes, row_lo, row_hi);

    // Inter-pass rounding shift followed by the clamp for the next pass.
    for lane in lanes.iter_mut() {
        let biased = _mm256_add_epi32(*lane, round_out);
        let shifted = match shift {
            1 => _mm256_srai_epi32::<1>(biased),
            2 => _mm256_srai_epi32::<2>(biased),
            _ => biased,
        };
        *lane = _mm256_max_epi32(_mm256_min_epi32(shifted, col_hi), col_lo);
    }

    // Transpose 8x8 tiles so each row can be stored contiguously.
    for tile in 0..WIDTH / 8 {
        let x0 = tile * 8;
        let tile_cols: [__m256i; 8] = std::array::from_fn(|k| lanes[x0 + k]);
        let rows = transpose_8x8_i32!(tile_cols);
        for r in 0..8 {
            let start = (y_base + r) * WIDTH + x0;
            storeu_256!(&mut tmp[start..start + 8], [i32; 8], rows[r]);
        }
    }
}
/// DCT/DCT inverse transform for a block 32 pixels wide and 16 rows tall at
/// 8 bits per pixel; the residual is added into `dst`.
///
/// Steps:
/// 1. Row pass: `row_dct32_8bpc_block` fills `tmp` (row-major, 32 values per
///    row) from `coeff`.
/// 2. Column pass: a 16-point DCT down every column of `tmp`, using the
///    AVX-512 helper when `summon_avx512()` yields a token and otherwise
///    `dct16_1d_cols8` on eight columns at a time.
/// 3. Add: each value becomes `(v + 8) >> 4`, is added to the matching `dst`
///    pixel, and the sum is clamped to `[0, bitdepth_max]`.
/// 4. The first 512 entries of `coeff` are zeroed.
///
/// `dst_stride` is the distance in bytes between consecutive rows of `dst`.
/// `_eob` is unused. Panics if `dst` is too short for 16 rows of 32 pixels at
/// the given stride.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_32x16_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let row_clip_min = i16::MIN as i32;
    let row_clip_max = i16::MAX as i32;
    let col_clip_min = i16::MIN as i32;
    let col_clip_max = i16::MAX as i32;
    let mut tmp = [0i32; 32 * 16];

    // Row pass. NOTE(review): the literal arguments presumably mean
    // coefficient column height 16, 16 rows, rect2 scaling on, rnd = 1,
    // shift = 1 — confirm against `row_dct32_8bpc_block`.
    {
        let coeff_slice = coeff.as_slice();
        row_dct32_8bpc_block(
            _token,
            coeff_slice,
            16,
            16,
            true,
            1,
            1,
            &mut tmp,
            row_clip_min,
            row_clip_max,
            col_clip_min,
            col_clip_max,
        );
    }

    // Column pass.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        dct16_cols_avx512(t512, &mut tmp, 32, 16, col_clip_min, col_clip_max);
    } else {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        for cx_chunk in 0..4 {
            let cx = cx_chunk * 8;
            let mut v = [_mm256_setzero_si256(); 16];
            for i in 0..16 {
                v[i] = loadu_256!(&tmp[i * 32 + cx..i * 32 + cx + 8], [i32; 8]);
            }
            dct16_1d_cols8(_token, &mut v, min_v, max_v);
            for i in 0..16 {
                storeu_256!(&mut tmp[i * 32 + cx..i * 32 + cx + 8], [i32; 8], v[i]);
            }
        }
    }

    // Add stage, AVX-512 variant. The enclosing function is already gated on
    // x86_64, so no extra cfg is needed on this statement.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 32, 32, 16, bitdepth_max);
        coeff[..512].fill(0);
        return;
    }

    // Add stage, AVX2 variant: 16 pixels per iteration.
    let zero = _mm256_setzero_si256();
    let max_val = _mm256_set1_epi16(bitdepth_max as i16);
    let rnd_final = _mm256_set1_epi32(8);
    for y in 0..16 {
        let dst_off = y * dst_stride;
        for chunk in 0..2 {
            let chunk_off = chunk * 16;
            let px = dst_off + chunk_off;
            let t = y * 32 + chunk_off;
            let d = loadu_128!(<&[u8; 16]>::try_from(&dst[px..px + 16]).unwrap());
            let d16 = _mm256_cvtepu8_epi16(d);
            // The residuals are contiguous in `tmp`, so load them directly
            // (lowest index in the lowest lane).
            let c0 = loadu_256!(&tmp[t..t + 8], [i32; 8]);
            let c1 = loadu_256!(&tmp[t + 8..t + 16], [i32; 8]);
            let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
            let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
            // The pack works per 128-bit lane; the permute restores element
            // order 0..16.
            let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
            let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
            let sum = _mm256_add_epi16(d16, c16);
            let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
            // Same lane fix-up after narrowing to u8; the low 128 bits then
            // hold the 16 output pixels.
            let packed = _mm256_packus_epi16(clamped, clamped);
            let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
            storeu_128!(
                <&mut [u8; 16]>::try_from(&mut dst[px..px + 16]).unwrap(),
                _mm256_castsi256_si128(packed)
            );
        }
    }
    coeff[..512].fill(0);
}
/// C-ABI entry point for the 32x16 DCT/DCT 8-bit inverse transform.
///
/// Turns the raw pointers into slices and forwards to
/// `inv_txfm_add_dct_dct_32x16_8bpc_avx2_inner`. `_dst` is unused.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes, because a slice of exactly
///   that length is created from it.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * The token is forged without a runtime check, so the caller must only
///   invoke this on a CPU that supports what `Desktop64` stands for.
/// * NOTE(review): `dst_stride` is converted with `as usize`, so a negative
///   stride would wrap to a huge value — presumably callers only pass
///   non-negative strides; confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x16_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features (see above).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_stride = dst_stride as usize;
    let coeff_count = _coeff_len as usize;
    let dst_len = coeff_count * row_stride + row_stride;
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), dst_len) };
    // SAFETY: the caller guarantees `coeff` covers `coeff_count` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), coeff_count) };
    inv_txfm_add_dct_dct_32x16_8bpc_avx2_inner(
        token,
        pixels,
        row_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Identity/identity inverse transform for a block 16 pixels wide and 32 rows
/// tall at 8 bits per pixel; the residual is added into `dst`.
///
/// Steps:
/// 1. Row pass (scalar): each coefficient, read from `coeff[y + x * 32]`, is
///    scaled by `(v * 181 + 128) >> 8`, run through `identity16_1d`, then
///    rounded with `(v + 1) >> 1` and clamped to the `i16` range.
/// 2. Column pass: every value is shifted left by 2, through the AVX-512
///    helper when `summon_avx512()` yields a token, otherwise with AVX2.
/// 3. Add: each value becomes `(v + 8) >> 4`, is added to the matching `dst`
///    pixel, and the sum is clamped to `[0, bitdepth_max]`.
/// 4. The first 512 entries of `coeff` are zeroed.
///
/// `dst_stride` is the distance in bytes between consecutive rows of `dst`.
/// `_eob` is unused. Panics if `dst` is too short for 32 rows of 16 pixels at
/// the given stride.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_identity_identity_16x32_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let clip_min = i16::MIN as i32;
    let clip_max = i16::MAX as i32;
    let mut tmp = [0i32; 16 * 32];
    let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
    let rnd = 1;
    let shift = 1;

    // Row pass, one row of 16 coefficients at a time.
    for y in 0..32 {
        let mut scratch = [0i32; 16];
        for x in 0..16 {
            scratch[x] = rect2_scale(coeff[y + x * 32] as i32);
        }
        identity16_1d(&mut scratch[..16], 1, clip_min, clip_max);
        for x in 0..16 {
            tmp[y * 16 + x] = iclip((scratch[x] + rnd) >> shift, clip_min, clip_max);
        }
    }

    // Column pass: a plain multiply by 4.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        identity_shift_cols_avx512::<2>(t512, &mut tmp, 16, 32);
    } else {
        for cx_chunk in 0..2 {
            let cx = cx_chunk * 8;
            for i in 0..32 {
                let v = loadu_256!(&tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8]);
                let result = _mm256_slli_epi32::<2>(v);
                storeu_256!(&mut tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8], result);
            }
        }
    }

    // Add stage: one full 16-pixel row per iteration.
    let zero = _mm256_setzero_si256();
    let max_val = _mm256_set1_epi16(bitdepth_max as i16);
    let rnd_final = _mm256_set1_epi32(8);
    for y in 0..32 {
        let dst_off = y * dst_stride;
        let t = y * 16;
        let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
        let d16 = _mm256_cvtepu8_epi16(d);
        // The residuals are contiguous in `tmp`, so load them directly
        // (lowest index in the lowest lane).
        let c0 = loadu_256!(&tmp[t..t + 8], [i32; 8]);
        let c1 = loadu_256!(&tmp[t + 8..t + 16], [i32; 8]);
        let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
        let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
        // The pack works per 128-bit lane; the permute restores element
        // order 0..16.
        let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
        let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
        let sum = _mm256_add_epi16(d16, c16);
        let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
        // Same lane fix-up after narrowing to u8; the low 128 bits then hold
        // the 16 output pixels.
        let packed = _mm256_packus_epi16(clamped, clamped);
        let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
        storeu_128!(
            <&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
            _mm256_castsi256_si128(packed)
        );
    }
    coeff[..512].fill(0);
}
/// C-ABI entry point for the 16x32 identity/identity 8-bit inverse transform.
///
/// Turns the raw pointers into slices and forwards to
/// `inv_txfm_add_identity_identity_16x32_8bpc_avx2_inner`. `_dst` is unused.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes, because a slice of exactly
///   that length is created from it.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * The token is forged without a runtime check, so the caller must only
///   invoke this on a CPU that supports what `Desktop64` stands for.
/// * NOTE(review): `dst_stride` is converted with `as usize`, so a negative
///   stride would wrap to a huge value — presumably callers only pass
///   non-negative strides; confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_16x32_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features (see above).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_stride = dst_stride as usize;
    let coeff_count = _coeff_len as usize;
    let dst_len = coeff_count * row_stride + row_stride;
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), dst_len) };
    // SAFETY: the caller guarantees `coeff` covers `coeff_count` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), coeff_count) };
    inv_txfm_add_identity_identity_16x32_8bpc_avx2_inner(
        token,
        pixels,
        row_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Identity/identity inverse transform for a block 32 pixels wide and 16 rows
/// tall at 8 bits per pixel; the residual is added into `dst`.
///
/// Steps:
/// 1. Row pass (scalar): each coefficient, read from `coeff[y + x * 16]`, is
///    scaled by `(v * 181 + 128) >> 8`, run through `identity32_1d`, then
///    rounded with `(v + 1) >> 1` and clamped to the `i16` range.
/// 2. Column pass: every value `v` becomes `2 * v + ((v * 1697 + 1024) >> 11)`,
///    through the AVX-512 helper when `summon_avx512()` yields a token,
///    otherwise with AVX2.
/// 3. Add: each value becomes `(v + 8) >> 4`, is added to the matching `dst`
///    pixel, and the sum is clamped to `[0, bitdepth_max]` (AVX-512 helper or
///    a scalar loop).
/// 4. The first 512 entries of `coeff` are zeroed.
///
/// `dst_stride` is the distance in bytes between consecutive rows of `dst`.
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_identity_identity_32x16_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    const W: usize = 32;
    const H: usize = 16;

    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let lo = i16::MIN as i32;
    let hi = i16::MAX as i32;
    let mut tmp = [0i32; W * H];

    // Row pass: gather one row from the column-major coefficients, apply the
    // rect2 scale, the 32-point identity, then the inter-pass rounding.
    for y in 0..H {
        let mut line = [0i32; W];
        for (x, slot) in line.iter_mut().enumerate() {
            let raw = coeff[y + x * H] as i32;
            *slot = (raw * 181 + 128) >> 8;
        }
        identity32_1d(&mut line[..], 1, lo, hi);
        for (x, &value) in line.iter().enumerate() {
            tmp[y * W + x] = iclip((value + 1) >> 1, lo, hi);
        }
    }

    // Column pass: 16-point identity scaling of every element.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        identity16_cols_avx512(t512, &mut tmp, 32, 16);
    } else {
        let mul_1697 = _mm256_set1_epi32(1697);
        let bias_1024 = _mm256_set1_epi32(1024);
        for group in 0..W / 8 {
            let x0 = group * 8;
            for row in 0..H {
                let at = row * W + x0;
                let v = loadu_256!(&tmp[at..at + 8], [i32; 8]);
                let frac = _mm256_srai_epi32::<11>(_mm256_add_epi32(
                    _mm256_mullo_epi32(v, mul_1697),
                    bias_1024,
                ));
                let scaled = _mm256_add_epi32(_mm256_slli_epi32::<1>(v), frac);
                storeu_256!(&mut tmp[at..at + 8], [i32; 8], scaled);
            }
        }
    }

    // Add stage, AVX-512 variant.
    #[cfg(target_arch = "x86_64")]
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 32, 32, 16, bitdepth_max);
        coeff[..512].fill(0);
        return;
    }

    // Add stage, scalar variant.
    for (y, residuals) in tmp.chunks_exact(W).enumerate() {
        let row_start = y * dst_stride;
        for (x, &residual) in residuals.iter().enumerate() {
            let at = row_start + x;
            let pixel = dst[at] as i32;
            let delta = (residual + 8) >> 4;
            dst[at] = iclip(pixel + delta, 0, bitdepth_max) as u8;
        }
    }
    coeff[..512].fill(0);
}
/// C-ABI entry point for the 32x16 identity/identity 8-bit inverse transform.
///
/// Turns the raw pointers into slices and forwards to
/// `inv_txfm_add_identity_identity_32x16_8bpc_avx2_inner`. `_dst` is unused.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes, because a slice of exactly
///   that length is created from it.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * The token is forged without a runtime check, so the caller must only
///   invoke this on a CPU that supports what `Desktop64` stands for.
/// * NOTE(review): `dst_stride` is converted with `as usize`, so a negative
///   stride would wrap to a huge value — presumably callers only pass
///   non-negative strides; confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x16_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features (see above).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_stride = dst_stride as usize;
    let coeff_count = _coeff_len as usize;
    let dst_len = coeff_count * row_stride + row_stride;
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr.cast::<u8>(), dst_len) };
    // SAFETY: the caller guarantees `coeff` covers `coeff_count` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), coeff_count) };
    inv_txfm_add_identity_identity_32x16_8bpc_avx2_inner(
        token,
        pixels,
        row_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 32x64 block (8bpc), added onto `dst`.
///
/// Steps, in order:
/// 1. row pass: `simd_row_dct32_8bpc_8rows` is called for row bases 0, 8, 16
///    and 24 and writes into `tmp`;
/// 2. rows 32..64 of `tmp` are zeroed;
/// 3. column pass: `dct64_1d` runs on each of the 32 columns (stride 32);
/// 4. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max`;
/// 5. the first 1024 coefficients are cleared.
///
/// Step 4 is handed to `add_to_dst_8bpc_avx512` when `summon_avx512()` returns
/// a token. `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_32x64_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 32 * 64];

    // Row pass, eight rows per call.
    {
        let coeff_slice = coeff.as_slice();
        for y_base in (0..32usize).step_by(8) {
            simd_row_dct32_8bpc_8rows(
                _token,
                coeff_slice,
                32,
                y_base,
                true,
                1,
                1,
                &mut tmp,
                row_clip_min,
                row_clip_max,
                col_clip_min,
                col_clip_max,
            );
        }
    }

    // Rows 32..64 are explicitly cleared before the column pass.
    tmp[32 * 32..].fill(0);

    // Column pass, one strided column at a time.
    for col in 0..32 {
        dct64_1d(&mut tmp[col..], 32, col_clip_min, col_clip_max);
    }

    #[cfg(target_arch = "x86_64")]
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 32, 32, 64, bitdepth_max);
        coeff[..1024].fill(0);
        return;
    }

    let floor = _mm256_setzero_si256();
    let ceil = _mm256_set1_epi16(bitdepth_max as i16);
    let round = _mm256_set1_epi32(8);
    for (y, row) in tmp.chunks_exact(32).enumerate() {
        let line = y * dst_stride;
        for (chunk, res) in row.chunks_exact(16).enumerate() {
            let at = line + chunk * 16;
            let px = loadu_128!(<&[u8; 16]>::try_from(&dst[at..at + 16]).unwrap());
            let lo = _mm256_set_epi32(
                res[7], res[6], res[5], res[4], res[3], res[2], res[1], res[0],
            );
            let hi = _mm256_set_epi32(
                res[15], res[14], res[13], res[12], res[11], res[10], res[9], res[8],
            );
            let lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(lo, round));
            let hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(hi, round));
            // `packs` works per 128-bit lane; the permute puts the 16 values back in order.
            let res16 = _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0b11_01_10_00);
            let sum = _mm256_add_epi16(_mm256_cvtepu8_epi16(px), res16);
            let sum = _mm256_max_epi16(_mm256_min_epi16(sum, ceil), floor);
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum, sum), 0b11_01_10_00);
            storeu_128!(
                <&mut [u8; 16]>::try_from(&mut dst[at..at + 16]).unwrap(),
                _mm256_castsi256_si128(bytes)
            );
        }
    }
    coeff[..1024].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 32x64 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_32x64_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x64_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_32x64_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 64x32 block (8bpc), added onto `dst`.
///
/// Steps, in order:
/// 1. row pass (scalar): for each of the 32 rows, the 32 coefficients read
///    from `coeff[y + x * 32]` are scaled with `(v * 181 + 128) >> 8`, the
///    remaining 32 inputs stay zero, `dct64_1d` runs in place and the result
///    is stored as `(v + 1) >> 1`, clipped to the `i16` range;
/// 2. column pass: `dct32_cols_avx512` when `summon_avx512()` returns a token,
///    otherwise `dct32_1d_cols8_i16` on eight columns at a time;
/// 3. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max`;
/// 4. the first 1024 coefficients are cleared.
///
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_64x32_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 64 * 32];

    // Row pass.
    for (y, out_row) in tmp.chunks_exact_mut(64).enumerate() {
        // Only the first 32 inputs are populated; the rest stay zero.
        let mut line = [0i32; 64];
        for (x, slot) in line[..32].iter_mut().enumerate() {
            let c = coeff[y + x * 32] as i32;
            *slot = (c * 181 + 128) >> 8;
        }
        dct64_1d(&mut line[..64], 1, row_clip_min, row_clip_max);
        for (out, &v) in out_row.iter_mut().zip(line.iter()) {
            *out = iclip((v + 1) >> 1, col_clip_min, col_clip_max);
        }
    }

    // Column pass.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        dct32_cols_avx512(t512, &mut tmp, 64, 32, col_clip_min, col_clip_max);
    } else {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        for cx in (0..64usize).step_by(8) {
            let mut cols = [_mm256_setzero_si256(); 32];
            for (i, lane) in cols.iter_mut().enumerate() {
                let at = i * 64 + cx;
                *lane = loadu_256!(&tmp[at..at + 8], [i32; 8]);
            }
            dct32_1d_cols8_i16(_token, &mut cols, min_v, max_v);
            for (i, lane) in cols.iter().enumerate() {
                let at = i * 64 + cx;
                storeu_256!(&mut tmp[at..at + 8], [i32; 8], *lane);
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 64, 64, 32, bitdepth_max);
        coeff[..1024].fill(0);
        return;
    }

    let floor = _mm256_setzero_si256();
    let ceil = _mm256_set1_epi16(bitdepth_max as i16);
    let round = _mm256_set1_epi32(8);
    for (y, row) in tmp.chunks_exact(64).enumerate() {
        let line = y * dst_stride;
        for (chunk, res) in row.chunks_exact(16).enumerate() {
            let at = line + chunk * 16;
            let px = loadu_128!(<&[u8; 16]>::try_from(&dst[at..at + 16]).unwrap());
            let lo = _mm256_set_epi32(
                res[7], res[6], res[5], res[4], res[3], res[2], res[1], res[0],
            );
            let hi = _mm256_set_epi32(
                res[15], res[14], res[13], res[12], res[11], res[10], res[9], res[8],
            );
            let lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(lo, round));
            let hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(hi, round));
            // `packs` works per 128-bit lane; the permute puts the 16 values back in order.
            let res16 = _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0b11_01_10_00);
            let sum = _mm256_add_epi16(_mm256_cvtepu8_epi16(px), res16);
            let sum = _mm256_max_epi16(_mm256_min_epi16(sum, ceil), floor);
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum, sum), 0b11_01_10_00);
            storeu_128!(
                <&mut [u8; 16]>::try_from(&mut dst[at..at + 16]).unwrap(),
                _mm256_castsi256_si128(bytes)
            );
        }
    }
    coeff[..1024].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 64x32 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_64x32_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x32_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_64x32_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 4x16 block (8bpc), added onto `dst`.
///
/// Entirely scalar:
/// 1. row pass: for each of the 16 rows, four coefficients are read from
///    `coeff[y + x * 16]`, `dct4_1d` runs in place, and the result is stored
///    as `(v + 1) >> 1`, clipped to the `i16` range;
/// 2. column pass: `dct16_1d` runs on each of the 4 columns (stride 4);
/// 3. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max`;
/// 4. the first 64 coefficients are cleared.
///
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_4x16_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 4 * 16];

    // Row pass.
    for (y, out_row) in tmp.chunks_exact_mut(4).enumerate() {
        let mut line = [0i32; 4];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * 16] as i32;
        }
        dct4_1d(&mut line[..4], 1, row_clip_min, row_clip_max);
        for (out, &v) in out_row.iter_mut().zip(line.iter()) {
            *out = iclip((v + 1) >> 1, col_clip_min, col_clip_max);
        }
    }

    // Column pass.
    for col in 0..4 {
        dct16_1d(&mut tmp[col..], 4, col_clip_min, col_clip_max);
    }

    // Round, add to the destination and clamp.
    for (y, row) in tmp.chunks_exact(4).enumerate() {
        let line = y * dst_stride;
        for (x, &res) in row.iter().enumerate() {
            let at = line + x;
            let sum = dst[at] as i32 + ((res + 8) >> 4);
            dst[at] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }
    coeff[..64].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 4x16 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_4x16_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x16_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_4x16_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 16x4 block (8bpc), added onto `dst`.
///
/// Steps, in order:
/// 1. row pass (scalar): for each of the 4 rows, 16 coefficients are read
///    from `coeff[y + x * 4]`, `dct16_1d` runs in place, and the result is
///    stored as `(v + 1) >> 1`, clipped to the `i16` range;
/// 2. column pass: `dct4_cols_avx512` when `summon_avx512()` returns a token,
///    otherwise `dct4_1d_cols8` on eight columns at a time;
/// 3. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max`;
/// 4. the first 64 coefficients are cleared.
///
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x4_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 16 * 4];

    // Row pass.
    for (y, out_row) in tmp.chunks_exact_mut(16).enumerate() {
        let mut line = [0i32; 16];
        for (x, slot) in line.iter_mut().enumerate() {
            *slot = coeff[y + x * 4] as i32;
        }
        dct16_1d(&mut line[..16], 1, row_clip_min, row_clip_max);
        for (out, &v) in out_row.iter_mut().zip(line.iter()) {
            *out = iclip((v + 1) >> 1, col_clip_min, col_clip_max);
        }
    }

    // Column pass.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        dct4_cols_avx512(t512, &mut tmp, 16, 4, col_clip_min, col_clip_max);
    } else {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        for cx in [0usize, 8] {
            let mut cols = [_mm256_setzero_si256(); 4];
            for (i, lane) in cols.iter_mut().enumerate() {
                let at = i * 16 + cx;
                *lane = loadu_256!(&tmp[at..at + 8], [i32; 8]);
            }
            dct4_1d_cols8(_token, &mut cols, min_v, max_v);
            for (i, lane) in cols.iter().enumerate() {
                let at = i * 16 + cx;
                storeu_256!(&mut tmp[at..at + 8], [i32; 8], *lane);
            }
        }
    }

    // Round, add to the destination and clamp, one 16-pixel row per iteration.
    let floor = _mm256_setzero_si256();
    let ceil = _mm256_set1_epi16(bitdepth_max as i16);
    let round = _mm256_set1_epi32(8);
    for (y, res) in tmp.chunks_exact(16).enumerate() {
        let at = y * dst_stride;
        let px = loadu_128!(<&[u8; 16]>::try_from(&dst[at..at + 16]).unwrap());
        let lo = _mm256_set_epi32(
            res[7], res[6], res[5], res[4], res[3], res[2], res[1], res[0],
        );
        let hi = _mm256_set_epi32(
            res[15], res[14], res[13], res[12], res[11], res[10], res[9], res[8],
        );
        let lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(lo, round));
        let hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(hi, round));
        // `packs` works per 128-bit lane; the permute puts the 16 values back in order.
        let res16 = _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0b11_01_10_00);
        let sum = _mm256_add_epi16(_mm256_cvtepu8_epi16(px), res16);
        let sum = _mm256_max_epi16(_mm256_min_epi16(sum, ceil), floor);
        let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum, sum), 0b11_01_10_00);
        storeu_128!(
            <&mut [u8; 16]>::try_from(&mut dst[at..at + 16]).unwrap(),
            _mm256_castsi256_si128(bytes)
        );
    }
    coeff[..64].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 16x4 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_16x4_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x4_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_16x4_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Generates a scalar 4x16 (8bpc) inverse-transform-and-add function.
///
/// * `$name`   – name of the generated function;
/// * `$row_fn` – 1-D transform applied to each row of 4 values;
/// * `$col_fn` – 1-D transform applied to each column of 16 values (stride 4).
///
/// The generated function reads coefficients from `coeff[y + x * 16]`, stores
/// the row result as `(v + 1) >> 1` clipped to the `i16` range, runs the
/// column transform, then adds `(v + 8) >> 4` to each destination pixel,
/// clamps to `0..=bitdepth_max` and clears the first 64 coefficients.
macro_rules! impl_4x16_transform {
    ($name:ident, $row_fn:ident, $col_fn:ident) => {
        #[cfg(target_arch = "x86_64")]
        #[arcane]
        fn $name(
            _token: Desktop64,
            dst: &mut [u8],
            dst_stride: usize,
            coeff: &mut [i16],
            _eob: i32,
            bitdepth_max: i32,
        ) {
            use crate::src::safe_simd::pixel_access::{
                loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
            };
            let mut dst = dst.flex_mut();
            let mut coeff = coeff.flex_mut();
            let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
            let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
            let mut tmp = [0i32; 4 * 16];

            // Row pass.
            for (y, out_row) in tmp.chunks_exact_mut(4).enumerate() {
                let mut line = [0i32; 4];
                for (x, slot) in line.iter_mut().enumerate() {
                    *slot = coeff[y + x * 16] as i32;
                }
                $row_fn(&mut line[..4], 1, row_clip_min, row_clip_max);
                for (out, &v) in out_row.iter_mut().zip(line.iter()) {
                    *out = iclip((v + 1) >> 1, col_clip_min, col_clip_max);
                }
            }

            // Column pass.
            for col in 0..4 {
                $col_fn(&mut tmp[col..], 4, col_clip_min, col_clip_max);
            }

            // Round, add to the destination and clamp.
            for (y, row) in tmp.chunks_exact(4).enumerate() {
                let line = y * dst_stride;
                for (x, &res) in row.iter().enumerate() {
                    let at = line + x;
                    let sum = dst[at] as i32 + ((res + 8) >> 4);
                    dst[at] = iclip(sum, 0, bitdepth_max) as u8;
                }
            }
            coeff[..64].fill(0);
        }
    };
}
/// Generates a 16x4 (8bpc) inverse-transform-and-add function.
///
/// * `$name`   – name of the generated function;
/// * `$row_fn` – 1-D transform applied to each row of 16 values;
/// * `$col_fn` – 1-D transform applied to each column of 4 values (stride 16).
///
/// The generated function reads coefficients from `coeff[y + x * 4]`, stores
/// the row result as `(v + 1) >> 1` clipped to the `i16` range, runs the
/// column transform, then adds `(v + 8) >> 4` to each destination pixel
/// (16 pixels per AVX2 iteration), clamps to `0..=bitdepth_max` and clears
/// the first 64 coefficients.
macro_rules! impl_16x4_transform {
    ($name:ident, $row_fn:ident, $col_fn:ident) => {
        #[cfg(target_arch = "x86_64")]
        #[arcane]
        fn $name(
            _token: Desktop64,
            dst: &mut [u8],
            dst_stride: usize,
            coeff: &mut [i16],
            _eob: i32,
            bitdepth_max: i32,
        ) {
            use crate::src::safe_simd::pixel_access::{
                loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
            };
            let mut dst = dst.flex_mut();
            let mut coeff = coeff.flex_mut();
            let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
            let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
            let mut tmp = [0i32; 16 * 4];

            // Row pass.
            for (y, out_row) in tmp.chunks_exact_mut(16).enumerate() {
                let mut line = [0i32; 16];
                for (x, slot) in line.iter_mut().enumerate() {
                    *slot = coeff[y + x * 4] as i32;
                }
                $row_fn(&mut line[..16], 1, row_clip_min, row_clip_max);
                for (out, &v) in out_row.iter_mut().zip(line.iter()) {
                    *out = iclip((v + 1) >> 1, col_clip_min, col_clip_max);
                }
            }

            // Column pass.
            for col in 0..16 {
                $col_fn(&mut tmp[col..], 16, col_clip_min, col_clip_max);
            }

            // Round, add to the destination and clamp, one 16-pixel row per iteration.
            let floor = _mm256_setzero_si256();
            let ceil = _mm256_set1_epi16(bitdepth_max as i16);
            let round = _mm256_set1_epi32(8);
            for (y, res) in tmp.chunks_exact(16).enumerate() {
                let at = y * dst_stride;
                let px = loadu_128!(<&[u8; 16]>::try_from(&dst[at..at + 16]).unwrap());
                let lo = _mm256_set_epi32(
                    res[7], res[6], res[5], res[4], res[3], res[2], res[1], res[0],
                );
                let hi = _mm256_set_epi32(
                    res[15], res[14], res[13], res[12], res[11], res[10], res[9], res[8],
                );
                let lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(lo, round));
                let hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(hi, round));
                // `packs` works per 128-bit lane; the permute restores element order.
                let res16 =
                    _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0b11_01_10_00);
                let sum = _mm256_add_epi16(_mm256_cvtepu8_epi16(px), res16);
                let sum = _mm256_max_epi16(_mm256_min_epi16(sum, ceil), floor);
                let bytes =
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(sum, sum), 0b11_01_10_00);
                storeu_128!(
                    <&mut [u8; 16]>::try_from(&mut dst[at..at + 16]).unwrap(),
                    _mm256_castsi256_si128(bytes)
                );
            }
            coeff[..64].fill(0);
        }
    };
}
// 4x16 inner functions for the ADST / flipADST / DCT combinations.
// Arguments: generated function name, row transform (4-point),
// column transform (16-point).
impl_4x16_transform!(
inv_txfm_add_adst_dct_4x16_8bpc_avx2_inner,
adst4_1d,
dct16_1d
);
impl_4x16_transform!(
inv_txfm_add_dct_adst_4x16_8bpc_avx2_inner,
dct4_1d,
adst16_1d
);
impl_4x16_transform!(
inv_txfm_add_adst_adst_4x16_8bpc_avx2_inner,
adst4_1d,
adst16_1d
);
impl_4x16_transform!(
inv_txfm_add_flipadst_dct_4x16_8bpc_avx2_inner,
flipadst4_1d,
dct16_1d
);
impl_4x16_transform!(
inv_txfm_add_dct_flipadst_4x16_8bpc_avx2_inner,
dct4_1d,
flipadst16_1d
);
impl_4x16_transform!(
inv_txfm_add_flipadst_flipadst_4x16_8bpc_avx2_inner,
flipadst4_1d,
flipadst16_1d
);
impl_4x16_transform!(
inv_txfm_add_adst_flipadst_4x16_8bpc_avx2_inner,
adst4_1d,
flipadst16_1d
);
impl_4x16_transform!(
inv_txfm_add_flipadst_adst_4x16_8bpc_avx2_inner,
flipadst4_1d,
adst16_1d
);
// 16x4 inner functions for the same combinations.
// Arguments: generated function name, row transform (16-point),
// column transform (4-point).
impl_16x4_transform!(
inv_txfm_add_adst_dct_16x4_8bpc_avx2_inner,
adst16_1d,
dct4_1d
);
impl_16x4_transform!(
inv_txfm_add_dct_adst_16x4_8bpc_avx2_inner,
dct16_1d,
adst4_1d
);
impl_16x4_transform!(
inv_txfm_add_adst_adst_16x4_8bpc_avx2_inner,
adst16_1d,
adst4_1d
);
impl_16x4_transform!(
inv_txfm_add_flipadst_dct_16x4_8bpc_avx2_inner,
flipadst16_1d,
dct4_1d
);
impl_16x4_transform!(
inv_txfm_add_dct_flipadst_16x4_8bpc_avx2_inner,
dct16_1d,
flipadst4_1d
);
impl_16x4_transform!(
inv_txfm_add_flipadst_flipadst_16x4_8bpc_avx2_inner,
flipadst16_1d,
flipadst4_1d
);
impl_16x4_transform!(
inv_txfm_add_adst_flipadst_16x4_8bpc_avx2_inner,
adst16_1d,
flipadst4_1d
);
impl_16x4_transform!(
inv_txfm_add_flipadst_adst_16x4_8bpc_avx2_inner,
flipadst16_1d,
adst4_1d
);
/// Generates the `extern "C"` entry point `$name` that forwards to the inner
/// 4x16 transform `$inner`.
///
/// The generated function rebuilds slices from the raw pointers: the
/// destination slice is `_coeff_len * dst_stride + dst_stride` bytes long and
/// the coefficient slice holds `_coeff_len` `i16` values. `_dst` is not used.
///
/// Safety contract of the generated function: both pointers must be valid for
/// reads and writes of those lengths. NOTE(review): `dst_stride` is cast
/// straight to `usize`, so a negative stride is not handled, and the SIMD
/// token is forged rather than detected — confirm both against the callers.
macro_rules! impl_4x16_ffi_wrapper {
    ($name:ident, $inner:ident) => {
        #[cfg(target_arch = "x86_64")]
        #[target_feature(enable = "avx2")]
        #[cfg(feature = "asm")]
        pub unsafe extern "C" fn $name(
            dst_ptr: *mut DynPixel,
            dst_stride: isize,
            coeff: *mut DynCoef,
            eob: c_int,
            bitdepth_max: c_int,
            _coeff_len: u16,
            _dst: *const FFISafe<PicOffset>,
        ) {
            let row_bytes = dst_stride as usize;
            let coeff_count = usize::from(_coeff_len);
            // SAFETY: CPU support is the caller's responsibility.
            let token = unsafe { Desktop64::forge_token_dangerously() };
            // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
            let pixels = unsafe {
                std::slice::from_raw_parts_mut(
                    dst_ptr as *mut u8,
                    coeff_count * row_bytes + row_bytes,
                )
            };
            // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
            let coeffs =
                unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
            $inner(token, pixels, row_bytes, coeffs, eob, bitdepth_max);
        }
    };
}
/// Generates the `extern "C"` entry point `$name` that forwards to the inner
/// 16x4 transform `$inner`.
///
/// The generated function rebuilds slices from the raw pointers: the
/// destination slice is `_coeff_len * dst_stride + dst_stride` bytes long and
/// the coefficient slice holds `_coeff_len` `i16` values. `_dst` is not used.
///
/// Safety contract of the generated function: both pointers must be valid for
/// reads and writes of those lengths. NOTE(review): `dst_stride` is cast
/// straight to `usize`, so a negative stride is not handled, and the SIMD
/// token is forged rather than detected — confirm both against the callers.
macro_rules! impl_16x4_ffi_wrapper {
    ($name:ident, $inner:ident) => {
        #[cfg(target_arch = "x86_64")]
        #[target_feature(enable = "avx2")]
        #[cfg(feature = "asm")]
        pub unsafe extern "C" fn $name(
            dst_ptr: *mut DynPixel,
            dst_stride: isize,
            coeff: *mut DynCoef,
            eob: c_int,
            bitdepth_max: c_int,
            _coeff_len: u16,
            _dst: *const FFISafe<PicOffset>,
        ) {
            let row_bytes = dst_stride as usize;
            let coeff_count = usize::from(_coeff_len);
            // SAFETY: CPU support is the caller's responsibility.
            let token = unsafe { Desktop64::forge_token_dangerously() };
            // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
            let pixels = unsafe {
                std::slice::from_raw_parts_mut(
                    dst_ptr as *mut u8,
                    coeff_count * row_bytes + row_bytes,
                )
            };
            // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
            let coeffs =
                unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
            $inner(token, pixels, row_bytes, coeffs, eob, bitdepth_max);
        }
    };
}
// `extern "C"` wrappers for the 4x16 ADST / flipADST / DCT combinations.
// Arguments: exported function name, inner function it forwards to.
impl_4x16_ffi_wrapper!(
inv_txfm_add_adst_dct_4x16_8bpc_avx2,
inv_txfm_add_adst_dct_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_dct_adst_4x16_8bpc_avx2,
inv_txfm_add_dct_adst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_adst_adst_4x16_8bpc_avx2,
inv_txfm_add_adst_adst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_flipadst_dct_4x16_8bpc_avx2,
inv_txfm_add_flipadst_dct_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_dct_flipadst_4x16_8bpc_avx2,
inv_txfm_add_dct_flipadst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_4x16_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_adst_flipadst_4x16_8bpc_avx2,
inv_txfm_add_adst_flipadst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_flipadst_adst_4x16_8bpc_avx2,
inv_txfm_add_flipadst_adst_4x16_8bpc_avx2_inner
);
// `extern "C"` wrappers for the 16x4 ADST / flipADST / DCT combinations.
impl_16x4_ffi_wrapper!(
inv_txfm_add_adst_dct_16x4_8bpc_avx2,
inv_txfm_add_adst_dct_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_dct_adst_16x4_8bpc_avx2,
inv_txfm_add_dct_adst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_adst_adst_16x4_8bpc_avx2,
inv_txfm_add_adst_adst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_flipadst_dct_16x4_8bpc_avx2,
inv_txfm_add_flipadst_dct_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_dct_flipadst_16x4_8bpc_avx2,
inv_txfm_add_dct_flipadst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_16x4_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_adst_flipadst_16x4_8bpc_avx2,
inv_txfm_add_adst_flipadst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_flipadst_adst_16x4_8bpc_avx2,
inv_txfm_add_flipadst_adst_16x4_8bpc_avx2_inner
);
// Identity/identity: inner functions followed by their wrappers.
// Transform-macro arguments: generated name, row transform, column transform.
impl_4x16_transform!(
inv_txfm_add_identity_identity_4x16_8bpc_avx2_inner,
identity4_1d,
identity16_1d
);
impl_16x4_transform!(
inv_txfm_add_identity_identity_16x4_8bpc_avx2_inner,
identity16_1d,
identity4_1d
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_identity_identity_4x16_8bpc_avx2,
inv_txfm_add_identity_identity_4x16_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_identity_identity_16x4_8bpc_avx2,
inv_txfm_add_identity_identity_16x4_8bpc_avx2_inner
);
// Identity combined with DCT, 4x16.
impl_4x16_transform!(
inv_txfm_add_identity_dct_4x16_8bpc_avx2_inner,
identity4_1d,
dct16_1d
);
impl_4x16_transform!(
inv_txfm_add_dct_identity_4x16_8bpc_avx2_inner,
dct4_1d,
identity16_1d
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_identity_dct_4x16_8bpc_avx2,
inv_txfm_add_identity_dct_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_dct_identity_4x16_8bpc_avx2,
inv_txfm_add_dct_identity_4x16_8bpc_avx2_inner
);
// Identity combined with DCT, 16x4.
impl_16x4_transform!(
inv_txfm_add_identity_dct_16x4_8bpc_avx2_inner,
identity16_1d,
dct4_1d
);
impl_16x4_transform!(
inv_txfm_add_dct_identity_16x4_8bpc_avx2_inner,
dct16_1d,
identity4_1d
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_identity_dct_16x4_8bpc_avx2,
inv_txfm_add_identity_dct_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_dct_identity_16x4_8bpc_avx2,
inv_txfm_add_dct_identity_16x4_8bpc_avx2_inner
);
// Identity combined with ADST / flipADST, 4x16.
impl_4x16_transform!(
inv_txfm_add_identity_adst_4x16_8bpc_avx2_inner,
identity4_1d,
adst16_1d
);
impl_4x16_transform!(
inv_txfm_add_adst_identity_4x16_8bpc_avx2_inner,
adst4_1d,
identity16_1d
);
impl_4x16_transform!(
inv_txfm_add_identity_flipadst_4x16_8bpc_avx2_inner,
identity4_1d,
flipadst16_1d
);
impl_4x16_transform!(
inv_txfm_add_flipadst_identity_4x16_8bpc_avx2_inner,
flipadst4_1d,
identity16_1d
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_identity_adst_4x16_8bpc_avx2,
inv_txfm_add_identity_adst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_adst_identity_4x16_8bpc_avx2,
inv_txfm_add_adst_identity_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_identity_flipadst_4x16_8bpc_avx2,
inv_txfm_add_identity_flipadst_4x16_8bpc_avx2_inner
);
impl_4x16_ffi_wrapper!(
inv_txfm_add_flipadst_identity_4x16_8bpc_avx2,
inv_txfm_add_flipadst_identity_4x16_8bpc_avx2_inner
);
// Identity combined with ADST / flipADST, 16x4.
impl_16x4_transform!(
inv_txfm_add_identity_adst_16x4_8bpc_avx2_inner,
identity16_1d,
adst4_1d
);
impl_16x4_transform!(
inv_txfm_add_adst_identity_16x4_8bpc_avx2_inner,
adst16_1d,
identity4_1d
);
impl_16x4_transform!(
inv_txfm_add_identity_flipadst_16x4_8bpc_avx2_inner,
identity16_1d,
flipadst4_1d
);
impl_16x4_transform!(
inv_txfm_add_flipadst_identity_16x4_8bpc_avx2_inner,
flipadst16_1d,
identity4_1d
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_identity_adst_16x4_8bpc_avx2,
inv_txfm_add_identity_adst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_adst_identity_16x4_8bpc_avx2,
inv_txfm_add_adst_identity_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_identity_flipadst_16x4_8bpc_avx2,
inv_txfm_add_identity_flipadst_16x4_8bpc_avx2_inner
);
impl_16x4_ffi_wrapper!(
inv_txfm_add_flipadst_identity_16x4_8bpc_avx2,
inv_txfm_add_flipadst_identity_16x4_8bpc_avx2_inner
);
/// Inverse DCT/DCT transform of an 8x32 block (8bpc), added onto `dst`.
///
/// Steps, in order:
/// 1. row pass: `simd_row_dct8_8bpc_8rows` is called for row bases 0, 8, 16
///    and 24 and writes into `tmp`;
/// 2. column pass: all 32 rows of 8 values are loaded into registers,
///    `dct32_1d_cols8_i16` transforms the eight columns together, and the
///    rows are stored back;
/// 3. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max` (scalar loop);
/// 4. the first 256 coefficients are cleared.
///
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_8x32_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 8 * 32];

    // Row pass, eight rows per call.
    {
        let coeff_slice = coeff.as_slice();
        for y_base in (0..32usize).step_by(8) {
            simd_row_dct8_8bpc_8rows(
                _token,
                coeff_slice,
                32,
                y_base,
                false,
                2,
                2,
                &mut tmp,
                row_clip_min,
                row_clip_max,
                col_clip_min,
                col_clip_max,
            );
        }
    }

    // Column pass: the block is 8 wide, so one register per row covers it.
    {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        let mut cols = [_mm256_setzero_si256(); 32];
        for (i, lane) in cols.iter_mut().enumerate() {
            let at = i * 8;
            *lane = loadu_256!(&tmp[at..at + 8], [i32; 8]);
        }
        dct32_1d_cols8_i16(_token, &mut cols, min_v, max_v);
        for (i, lane) in cols.iter().enumerate() {
            let at = i * 8;
            storeu_256!(&mut tmp[at..at + 8], [i32; 8], *lane);
        }
    }

    // Round, add to the destination and clamp.
    for (y, row) in tmp.chunks_exact(8).enumerate() {
        let line = y * dst_stride;
        for (x, &res) in row.iter().enumerate() {
            let at = line + x;
            let sum = dst[at] as i32 + ((res + 8) >> 4);
            dst[at] = iclip(sum, 0, bitdepth_max) as u8;
        }
    }
    coeff[..256].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 8x32 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_8x32_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x32_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_8x32_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 32x8 block (8bpc), added onto `dst`.
///
/// Steps, in order:
/// 1. row pass: a single `simd_row_dct32_8bpc_8rows` call (row base 0) writes
///    into `tmp`;
/// 2. column pass: `dct8_cols_avx512` when `summon_avx512()` returns a token,
///    otherwise `dct8_1d_cols8` on eight columns at a time;
/// 3. every value is rounded with `(v + 8) >> 4`, added to the destination
///    pixel and clamped to `0..=bitdepth_max`;
/// 4. the first 256 coefficients are cleared.
///
/// Step 3 is handed to `add_to_dst_8bpc_avx512` when a token is available.
/// `_eob` is unused.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_32x8_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 32 * 8];

    // Row pass: all eight rows in one call.
    {
        let coeff_slice = coeff.as_slice();
        simd_row_dct32_8bpc_8rows(
            _token,
            coeff_slice,
            8,
            0,
            false,
            2,
            2,
            &mut tmp,
            row_clip_min,
            row_clip_max,
            col_clip_min,
            col_clip_max,
        );
    }

    // Column pass.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        dct8_cols_avx512(t512, &mut tmp, 32, 8, col_clip_min, col_clip_max);
    } else {
        let min_v = _mm256_set1_epi32(col_clip_min);
        let max_v = _mm256_set1_epi32(col_clip_max);
        for cx in (0..32usize).step_by(8) {
            let mut cols = [_mm256_setzero_si256(); 8];
            for (i, lane) in cols.iter_mut().enumerate() {
                let at = i * 32 + cx;
                *lane = loadu_256!(&tmp[at..at + 8], [i32; 8]);
            }
            dct8_1d_cols8(_token, &mut cols, min_v, max_v);
            for (i, lane) in cols.iter().enumerate() {
                let at = i * 32 + cx;
                storeu_256!(&mut tmp[at..at + 8], [i32; 8], *lane);
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 32, 32, 8, bitdepth_max);
        coeff[..256].fill(0);
        return;
    }

    let floor = _mm256_setzero_si256();
    let ceil = _mm256_set1_epi16(bitdepth_max as i16);
    let round = _mm256_set1_epi32(8);
    for (y, row) in tmp.chunks_exact(32).enumerate() {
        let line = y * dst_stride;
        for (chunk, res) in row.chunks_exact(16).enumerate() {
            let at = line + chunk * 16;
            let px = loadu_128!(<&[u8; 16]>::try_from(&dst[at..at + 16]).unwrap());
            let lo = _mm256_set_epi32(
                res[7], res[6], res[5], res[4], res[3], res[2], res[1], res[0],
            );
            let hi = _mm256_set_epi32(
                res[15], res[14], res[13], res[12], res[11], res[10], res[9], res[8],
            );
            let lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(lo, round));
            let hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(hi, round));
            // `packs` works per 128-bit lane; the permute puts the 16 values back in order.
            let res16 = _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0b11_01_10_00);
            let sum = _mm256_add_epi16(_mm256_cvtepu8_epi16(px), res16);
            let sum = _mm256_max_epi16(_mm256_min_epi16(sum, ceil), floor);
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum, sum), 0b11_01_10_00);
            storeu_128!(
                <&mut [u8; 16]>::try_from(&mut dst[at..at + 16]).unwrap(),
                _mm256_castsi256_si128(bytes)
            );
        }
    }
    coeff[..256].fill(0);
}
/// `extern "C"` entry point for the 8bpc DCT/DCT 32x8 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_dct_dct_32x8_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_32x8_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_dct_dct_32x8_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_identity_identity_8x32_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let clip_min = i16::MIN as i32;
let clip_max = i16::MAX as i32;
let mut tmp = [0i32; 8 * 32];
let rnd = 2;
let shift = 2;
for y in 0..32 {
let mut scratch = [0i32; 8];
for x in 0..8 {
scratch[x] = coeff[y + x * 32] as i32;
}
identity8_1d(&mut scratch[..8], 1, clip_min, clip_max);
for x in 0..8 {
tmp[y * 8 + x] = iclip((scratch[x] + rnd) >> shift, clip_min, clip_max);
}
}
{
for i in 0..32 {
let v = loadu_256!(&tmp[i * 8..i * 8 + 8], [i32; 8]);
let result = _mm256_slli_epi32::<2>(v);
storeu_256!(&mut tmp[i * 8..i * 8 + 8], [i32; 8], result);
}
}
for y in 0..32 {
let dst_off = y * dst_stride;
for x in 0..8 {
let d = dst[dst_off + x] as i32;
let c = (tmp[y * 8 + x] + 8) >> 4;
let result = iclip(d + c, 0, bitdepth_max);
dst[dst_off + x] = result as u8;
}
}
coeff[..256].fill(0);
}
/// `extern "C"` entry point for the 8bpc identity/identity 8x32 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_identity_identity_8x32_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_8x32_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_identity_identity_8x32_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_identity_identity_32x8_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let clip_min = i16::MIN as i32;
let clip_max = i16::MAX as i32;
let mut tmp = [0i32; 32 * 8];
let rnd = 2;
let shift = 2;
for y in 0..8 {
let mut scratch = [0i32; 32];
for x in 0..32 {
scratch[x] = coeff[y + x * 8] as i32;
}
identity32_1d(&mut scratch[..32], 1, clip_min, clip_max);
for x in 0..32 {
tmp[y * 32 + x] = iclip((scratch[x] + rnd) >> shift, clip_min, clip_max);
}
}
if let Some(t512) = crate::src::cpu::summon_avx512() {
identity_shift_cols_avx512::<1>(t512, &mut tmp, 32, 8);
} else {
for cx_chunk in 0..4 {
let cx = cx_chunk * 8;
for i in 0..8 {
let v = loadu_256!(&tmp[i * 32 + cx..i * 32 + cx + 8], [i32; 8]);
let result = _mm256_slli_epi32::<1>(v);
storeu_256!(&mut tmp[i * 32 + cx..i * 32 + cx + 8], [i32; 8], result);
}
}
}
#[cfg(target_arch = "x86_64")]
if let Some(t512) = crate::src::cpu::summon_avx512() {
add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 32, 32, 8, bitdepth_max);
coeff[..256].fill(0);
return;
}
for y in 0..8 {
let dst_off = y * dst_stride;
for x in 0..32 {
let d = dst[dst_off + x] as i32;
let c = (tmp[y * 32 + x] + 8) >> 4;
let result = iclip(d + c, 0, bitdepth_max);
dst[dst_off + x] = result as u8;
}
}
coeff[..256].fill(0);
}
/// `extern "C"` entry point for the 8bpc identity/identity 32x8 inverse transform.
///
/// Rebuilds slices from the raw destination and coefficient pointers and
/// forwards them to `inv_txfm_add_identity_identity_32x8_8bpc_avx2_inner`.
/// `_dst` is not used.
///
/// # Safety
///
/// - `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes.
/// - `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// - `dst_stride` is cast straight to `usize`; NOTE(review): a negative stride
///   would wrap to a huge length — confirm callers never pass one.
/// - The SIMD token is forged rather than detected, so the caller is
///   presumably responsible for the CPU supporting this code path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_identity_identity_32x8_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    let row_bytes = dst_stride as usize;
    let coeff_count = usize::from(_coeff_len);
    // SAFETY: CPU support is the caller's responsibility (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(dst_ptr as *mut u8, coeff_count * row_bytes + row_bytes)
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` `i16` values.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, coeff_count) };
    inv_txfm_add_identity_identity_32x8_8bpc_avx2_inner(
        token,
        pixels,
        row_bytes,
        coeffs,
        eob,
        bitdepth_max,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x64_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 16 * 64];
{
let coeff_slice = coeff.as_slice();
for y_base in [0usize, 8, 16, 24] {
simd_row_dct16_8bpc_8rows(
_token,
coeff_slice,
32,
y_base,
false,
2,
2,
&mut tmp,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
}
}
for y in 32..64 {
for x in 0..16 {
tmp[y * 16 + x] = 0;
}
}
for x in 0..16 {
dct64_1d(&mut tmp[x..], 16, col_clip_min, col_clip_max);
}
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..64 {
let dst_off = y * dst_stride;
let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
let d16 = _mm256_cvtepu8_epi16(d);
let c0 = _mm256_set_epi32(
tmp[y * 16 + 7],
tmp[y * 16 + 6],
tmp[y * 16 + 5],
tmp[y * 16 + 4],
tmp[y * 16 + 3],
tmp[y * 16 + 2],
tmp[y * 16 + 1],
tmp[y * 16 + 0],
);
let c1 = _mm256_set_epi32(
tmp[y * 16 + 15],
tmp[y * 16 + 14],
tmp[y * 16 + 13],
tmp[y * 16 + 12],
tmp[y * 16 + 11],
tmp[y * 16 + 10],
tmp[y * 16 + 9],
tmp[y * 16 + 8],
);
let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
let sum = _mm256_add_epi16(d16, c16);
let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
let packed = _mm256_packus_epi16(clamped, clamped);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
_mm256_castsi256_si128(packed)
);
}
coeff[..512].fill(0);
}
/// C-ABI entry point for the 16x64 DCT/DCT inverse transform (8 bpc, AVX2).
///
/// Wraps the raw destination and coefficient pointers in slices and forwards to
/// `inv_txfm_add_dct_dct_16x64_8bpc_avx2_inner`. `_dst` is not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes, because a mutable slice of exactly
///   that length is created over it.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * The function is compiled with AVX2 enabled and forges its `Desktop64` token
///   without any check in this function, so it must only be called on a CPU that
///   supports what that token stands for.
///
/// NOTE(review): `dst_stride` is cast from `isize` to `usize`, so a negative stride
/// would wrap to a very large value — confirm callers never pass one.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x64_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: see the `# Safety` section; the caller vouches for CPU support.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_stride = dst_stride as usize;
    let coeff_count = _coeff_len as usize;
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(
            dst_ptr.cast::<u8>(),
            coeff_count * row_stride + row_stride,
        )
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` coefficients.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), coeff_count) };
    inv_txfm_add_dct_dct_16x64_8bpc_avx2_inner(
        token,
        pixels,
        row_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}
/// Inverse DCT/DCT transform of a 64-wide, 16-tall block, added onto 8-bit pixels.
///
/// Stages, in the order they run below:
/// 1. Row pass: for each of the 16 rows, the first 32 coefficients are gathered from
///    `coeff` (element `(x, y)` lives at index `x * 16 + y`), the remaining 32 inputs
///    stay zero, `dct64_1d` is applied, and each result is rounded with
///    `(v + 2) >> 2` and clipped to the `i16` range.
/// 2. Column pass: a 16-point DCT over every column, through the AVX-512 helper when
///    `summon_avx512` yields a token, otherwise eight columns at a time with AVX2.
/// 3. Output: `(v + 8) >> 4` is added to the destination pixels and clamped to
///    `[0, bitdepth_max]`, again via AVX-512 when available, else 16 pixels per
///    AVX2 step.
/// 4. The first 512 entries of `coeff` are reset to zero.
///
/// `dst_stride` is the distance, in bytes, between consecutive rows of `dst`.
/// `_eob` is accepted for signature parity but never read.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_64x16_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    coeff: &mut [i16],
    _eob: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mut coeff = coeff.flex_mut();
    let (row_clip_min, row_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let (col_clip_min, col_clip_max) = (i16::MIN as i32, i16::MAX as i32);
    let mut tmp = [0i32; 64 * 16];

    // Row pass.
    for (y, out_row) in tmp.chunks_exact_mut(64).enumerate() {
        // Freshly zeroed each row, so inputs 32..64 are zero without further work.
        let mut line = [0i32; 64];
        for (x, slot) in line.iter_mut().take(32).enumerate() {
            *slot = coeff[y + x * 16] as i32;
        }
        dct64_1d(&mut line[..64], 1, row_clip_min, row_clip_max);
        for (out, &v) in out_row.iter_mut().zip(line.iter()) {
            *out = iclip((v + 2) >> 2, col_clip_min, col_clip_max);
        }
    }

    // Column pass.
    match crate::src::cpu::summon_avx512() {
        Some(t512) => {
            dct16_cols_avx512(t512, &mut tmp, 64, 16, col_clip_min, col_clip_max);
        }
        None => {
            let min_v = _mm256_set1_epi32(col_clip_min);
            let max_v = _mm256_set1_epi32(col_clip_max);
            for cx in (0usize..64).step_by(8) {
                // Gather the 16 rows of this eight-column strip.
                let mut strip = [_mm256_setzero_si256(); 16];
                for (row, lanes) in strip.iter_mut().enumerate() {
                    let at = row * 64 + cx;
                    *lanes = loadu_256!(&tmp[at..at + 8], [i32; 8]);
                }
                dct16_1d_cols8(_token, &mut strip, min_v, max_v);
                for (row, lanes) in strip.iter().enumerate() {
                    let at = row * 64 + cx;
                    storeu_256!(&mut tmp[at..at + 8], [i32; 8], *lanes);
                }
            }
        }
    }

    // Output stage.
    if let Some(t512) = crate::src::cpu::summon_avx512() {
        add_to_dst_8bpc_avx512(t512, &mut *dst, dst_stride, &tmp, 64, 64, 16, bitdepth_max);
    } else {
        let zero = _mm256_setzero_si256();
        let pixel_max = _mm256_set1_epi16(bitdepth_max as i16);
        let round = _mm256_set1_epi32(8);
        for y in 0..16 {
            for x0 in (0usize..64).step_by(16) {
                let px = y * dst_stride + x0;
                let src = y * 64 + x0;

                let pixels = loadu_128!(<&[u8; 16]>::try_from(&dst[px..px + 16]).unwrap());
                let pixels16 = _mm256_cvtepu8_epi16(pixels);

                // Residuals 0..8 and 8..16 of this 16-pixel group, in memory order.
                let lo = loadu_256!(&tmp[src..src + 8], [i32; 8]);
                let hi = loadu_256!(&tmp[src + 8..src + 16], [i32; 8]);
                let lo = _mm256_srai_epi32(_mm256_add_epi32(lo, round), 4);
                let hi = _mm256_srai_epi32(_mm256_add_epi32(hi, round), 4);

                // The pack works per 128-bit lane; the permute restores element order.
                let residual = _mm256_packs_epi32(lo, hi);
                let residual = _mm256_permute4x64_epi64(residual, 0b11_01_10_00);

                let sum = _mm256_add_epi16(pixels16, residual);
                let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, pixel_max), zero);
                let bytes = _mm256_packus_epi16(clamped, clamped);
                let bytes = _mm256_permute4x64_epi64(bytes, 0b11_01_10_00);
                storeu_128!(
                    <&mut [u8; 16]>::try_from(&mut dst[px..px + 16]).unwrap(),
                    _mm256_castsi256_si128(bytes)
                );
            }
        }
    }

    coeff[..512].fill(0);
}
/// C-ABI entry point for the 64x16 DCT/DCT inverse transform (8 bpc, AVX2).
///
/// Wraps the raw destination and coefficient pointers in slices and forwards to
/// `inv_txfm_add_dct_dct_64x16_8bpc_avx2_inner`. `_dst` is not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `_coeff_len * dst_stride + dst_stride` bytes, because a mutable slice of exactly
///   that length is created over it.
/// * `coeff` must be valid for reads and writes of `_coeff_len` `i16` values.
/// * The function is compiled with AVX2 enabled and forges its `Desktop64` token
///   without any check in this function, so it must only be called on a CPU that
///   supports what that token stands for.
///
/// NOTE(review): `dst_stride` is cast from `isize` to `usize`, so a negative stride
/// would wrap to a very large value — confirm callers never pass one.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_64x16_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    coeff: *mut DynCoef,
    eob: c_int,
    bitdepth_max: c_int,
    _coeff_len: u16,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: see the `# Safety` section; the caller vouches for CPU support.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let row_stride = dst_stride as usize;
    let coeff_count = _coeff_len as usize;
    // SAFETY: the caller guarantees `dst_ptr` covers this many bytes.
    let pixels = unsafe {
        std::slice::from_raw_parts_mut(
            dst_ptr.cast::<u8>(),
            coeff_count * row_stride + row_stride,
        )
    };
    // SAFETY: the caller guarantees `coeff` covers `_coeff_len` coefficients.
    let coeffs = unsafe { std::slice::from_raw_parts_mut(coeff.cast::<i16>(), coeff_count) };
    inv_txfm_add_dct_dct_64x16_8bpc_avx2_inner(
        token,
        pixels,
        row_stride,
        coeffs,
        eob,
        bitdepth_max,
    );
}