#[allow(unused_macros)]
macro_rules! impl_16x16_transform {
($name:ident, $row_fn:ident, $col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 256];
inv_txfm_16x16_inner(
&mut tmp,
&*coeff,
$row_fn,
$col_fn,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
add_16x16_to_dst(
_token,
&mut *dst,
dst_stride,
&tmp,
&mut *coeff,
bitdepth_max,
);
}
};
}
macro_rules! impl_16x16_transform_simd_col {
($name:ident, $row_fn:ident, $simd_col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 256];
inv_txfm_16x16_row_pass_only(
&mut tmp,
&*coeff,
$row_fn,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
$simd_col_fn(_token, &mut tmp, col_clip_min, col_clip_max);
add_16x16_to_dst(
_token,
&mut *dst,
dst_stride,
&tmp,
&mut *coeff,
bitdepth_max,
);
}
};
}
macro_rules! impl_16x16_transform_simd_row_adst_col {
($name:ident, $flipped:expr, $simd_col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 256];
{
let coeff_slice = coeff.as_slice();
for y_base in [0usize, 8] {
simd_row_adst16_8bpc_8rows(
_token,
coeff_slice,
16,
y_base,
false,
$flipped,
2,
2,
&mut tmp,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
}
}
$simd_col_fn(_token, &mut tmp, col_clip_min, col_clip_max);
add_16x16_to_dst(
_token,
&mut *dst,
dst_stride,
&tmp,
&mut *coeff,
bitdepth_max,
);
}
};
}
macro_rules! impl_16x16_transform_simd_row_dct_col {
($name:ident, $simd_col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 256];
{
let coeff_slice = coeff.as_slice();
row_dct16_8bpc_block(
_token,
coeff_slice,
16,
16,
false,
2,
2,
&mut tmp,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
}
$simd_col_fn(_token, &mut tmp, col_clip_min, col_clip_max);
add_16x16_to_dst(
_token,
&mut *dst,
dst_stride,
&tmp,
&mut *coeff,
bitdepth_max,
);
}
};
}
macro_rules! impl_16x16_ffi_wrapper {
($wrapper:ident, $inner:ident) => {
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn $wrapper(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
_coeff_len as usize * stride + stride,
)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
$inner(_token, dst_slice, stride, coeff_slice, eob, bitdepth_max);
}
};
}
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_adst_dct_16x16_8bpc_avx2_inner,
false,
dct16x16_cols_simd
);
impl_16x16_transform_simd_row_dct_col!(
inv_txfm_add_dct_adst_16x16_8bpc_avx2_inner,
adst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_adst_adst_16x16_8bpc_avx2_inner,
false,
adst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_flipadst_dct_16x16_8bpc_avx2_inner,
true,
dct16x16_cols_simd
);
impl_16x16_transform_simd_row_dct_col!(
inv_txfm_add_dct_flipadst_16x16_8bpc_avx2_inner,
flipadst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_flipadst_flipadst_16x16_8bpc_avx2_inner,
true,
flipadst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_adst_flipadst_16x16_8bpc_avx2_inner,
false,
flipadst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_flipadst_adst_16x16_8bpc_avx2_inner,
true,
adst16x16_cols_simd
);
impl_16x16_transform_simd_col!(
inv_txfm_add_identity_dct_16x16_8bpc_avx2_inner,
identity16_1d,
dct16x16_cols_simd
);
impl_16x16_transform_simd_row_dct_col!(
inv_txfm_add_dct_identity_16x16_8bpc_avx2_inner,
identity16x16_cols_simd
);
impl_16x16_transform_simd_col!(
inv_txfm_add_identity_adst_16x16_8bpc_avx2_inner,
identity16_1d,
adst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_adst_identity_16x16_8bpc_avx2_inner,
false,
identity16x16_cols_simd
);
impl_16x16_transform_simd_col!(
inv_txfm_add_identity_flipadst_16x16_8bpc_avx2_inner,
identity16_1d,
flipadst16x16_cols_simd
);
impl_16x16_transform_simd_row_adst_col!(
inv_txfm_add_flipadst_identity_16x16_8bpc_avx2_inner,
true,
identity16x16_cols_simd
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_adst_dct_16x16_8bpc_avx2,
inv_txfm_add_adst_dct_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_dct_adst_16x16_8bpc_avx2,
inv_txfm_add_dct_adst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_adst_adst_16x16_8bpc_avx2,
inv_txfm_add_adst_adst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_flipadst_dct_16x16_8bpc_avx2,
inv_txfm_add_flipadst_dct_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_dct_flipadst_16x16_8bpc_avx2,
inv_txfm_add_dct_flipadst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_16x16_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_adst_flipadst_16x16_8bpc_avx2,
inv_txfm_add_adst_flipadst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_flipadst_adst_16x16_8bpc_avx2,
inv_txfm_add_flipadst_adst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_identity_dct_16x16_8bpc_avx2,
inv_txfm_add_identity_dct_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_dct_identity_16x16_8bpc_avx2,
inv_txfm_add_dct_identity_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_identity_adst_16x16_8bpc_avx2,
inv_txfm_add_identity_adst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_adst_identity_16x16_8bpc_avx2,
inv_txfm_add_adst_identity_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_identity_flipadst_16x16_8bpc_avx2,
inv_txfm_add_identity_flipadst_16x16_8bpc_avx2_inner
);
impl_16x16_ffi_wrapper!(
inv_txfm_add_flipadst_identity_16x16_8bpc_avx2,
inv_txfm_add_flipadst_identity_16x16_8bpc_avx2_inner
);
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x16_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let _row_clip_min = i16::MIN as i32;
let _row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 256];
{
let coeff_arr: &[i16; 256] = coeff.as_slice()[..256].try_into().unwrap();
let raw = dct16_row_pass_i16_simd(_token, *coeff_arr);
let rnd_v = _mm256_set1_epi32(2);
let col_min_v = _mm256_set1_epi32(col_clip_min);
let col_max_v = _mm256_set1_epi32(col_clip_max);
for y in 0..16 {
for chunk in 0..2u32 {
let b = (chunk * 8) as usize;
let off = y * 16 + b;
let v = loadu_256!(&raw[off..off + 8], [i32; 8]);
let shifted = _mm256_srai_epi32::<2>(_mm256_add_epi32(v, rnd_v));
let clamped = _mm256_max_epi32(_mm256_min_epi32(shifted, col_max_v), col_min_v);
storeu_256!(&mut tmp[off..off + 8], [i32; 8], clamped);
}
}
}
let col_out = dct16_col_pass_i16(_token, &tmp);
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..16 {
let dst_off = y * dst_stride;
let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
let d16 = _mm256_cvtepu8_epi16(d);
let c0 = loadu_256!(&col_out[y * 16..y * 16 + 8], [i32; 8]);
let c1 = loadu_256!(&col_out[y * 16 + 8..y * 16 + 16], [i32; 8]);
let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
let sum = _mm256_add_epi16(d16, c16);
let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
let packed = _mm256_packus_epi16(clamped, clamped);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
_mm256_castsi256_si128(packed)
);
}
coeff[..256].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, _coeff_len as usize * stride + stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_16x16_8bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x16_16bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
dst_stride: usize, coeff: &mut [i32],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let stride_u16 = dst_stride / 2;
let row_clip_min = (!bitdepth_max) << 7;
let row_clip_max = !row_clip_min;
let col_clip_min = (!bitdepth_max) << 5;
let col_clip_max = !col_clip_min;
let mut tmp = [0i32; 256];
let rnd = 2;
let shift = 2;
for y in 0..16 {
let mut scratch = [0i32; 16];
for x in 0..16 {
scratch[x] = coeff[y + x * 16] as i32;
}
dct16_1d(&mut scratch[..16], 1, row_clip_min, row_clip_max);
for x in 0..16 {
tmp[y * 16 + x] = iclip((scratch[x] + rnd) >> shift, col_clip_min, col_clip_max);
}
}
dct16x16_cols_simd(_token, &mut tmp, col_clip_min, col_clip_max);
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi32(bitdepth_max);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..16 {
let dst_off = y * stride_u16;
let d = loadu_256!(<&[u16; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
let d_lo = _mm256_unpacklo_epi16(d, _mm256_setzero_si256());
let d_hi = _mm256_unpackhi_epi16(d, _mm256_setzero_si256());
let d_0_4 = _mm256_permute2x128_si256(d_lo, d_hi, 0x20); let d_4_8 = _mm256_permute2x128_si256(d_lo, d_hi, 0x31);
let c0 = _mm256_set_epi32(
tmp[y * 16 + 7],
tmp[y * 16 + 6],
tmp[y * 16 + 5],
tmp[y * 16 + 4],
tmp[y * 16 + 3],
tmp[y * 16 + 2],
tmp[y * 16 + 1],
tmp[y * 16 + 0],
);
let c1 = _mm256_set_epi32(
tmp[y * 16 + 15],
tmp[y * 16 + 14],
tmp[y * 16 + 13],
tmp[y * 16 + 12],
tmp[y * 16 + 11],
tmp[y * 16 + 10],
tmp[y * 16 + 9],
tmp[y * 16 + 8],
);
let c0_scaled = _mm256_srai_epi32::<4>(_mm256_add_epi32(c0, rnd_final));
let c1_scaled = _mm256_srai_epi32::<4>(_mm256_add_epi32(c1, rnd_final));
let sum0 = _mm256_add_epi32(d_0_4, c0_scaled);
let sum1 = _mm256_add_epi32(d_4_8, c1_scaled);
let clamped0 = _mm256_max_epi32(_mm256_min_epi32(sum0, max_val), zero);
let clamped1 = _mm256_max_epi32(_mm256_min_epi32(sum1, max_val), zero);
let packed = _mm256_packus_epi32(clamped0, clamped1);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
storeu_256!(
<&mut [u16; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
packed
);
}
coeff[..256].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x16_16bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u16, _coeff_len as usize * stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_16x16_16bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_4x8_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 32];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
for y in 0..8 {
let mut scratch = [0i32; 4];
for x in 0..4 {
scratch[x] = rect2_scale(coeff[y + x * 8] as i32);
}
dct4_1d(&mut scratch[..4], 1, row_clip_min, row_clip_max);
for x in 0..4 {
tmp[y * 4 + x] = iclip(scratch[x], col_clip_min, col_clip_max);
}
}
for x in 0..4 {
dct8_1d(&mut tmp[x..], 4, col_clip_min, col_clip_max);
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
for y in 0..8 {
let dst_off = y * dst_stride;
let d = loadi32!(&dst[dst_off..dst_off + 4]);
let d16 = _mm_unpacklo_epi8(d, zero);
let d32 = _mm_cvtepi16_epi32(d16);
let c = _mm_set_epi32(
(tmp[y * 4 + 3] + 8) >> 4,
(tmp[y * 4 + 2] + 8) >> 4,
(tmp[y * 4 + 1] + 8) >> 4,
(tmp[y * 4 + 0] + 8) >> 4,
);
let sum = _mm_add_epi32(d32, c);
let sum16 = _mm_packs_epi32(sum, sum);
let clamped = _mm_max_epi16(_mm_min_epi16(sum16, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei32!(&mut dst[dst_off..dst_off + 4], packed);
}
coeff[..32].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_4x8_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, _coeff_len as usize * stride + stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_4x8_8bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_8x4_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 32];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
for y in 0..4 {
let mut scratch = [0i32; 8];
for x in 0..8 {
scratch[x] = rect2_scale(coeff[y + x * 4] as i32);
}
dct8_1d(&mut scratch[..8], 1, row_clip_min, row_clip_max);
for x in 0..8 {
tmp[y * 8 + x] = iclip(scratch[x], col_clip_min, col_clip_max);
}
}
{
let min_v = _mm256_set1_epi32(col_clip_min);
let max_v = _mm256_set1_epi32(col_clip_max);
let mut v = [_mm256_setzero_si256(); 4];
for i in 0..4 {
v[i] = loadu_256!(&tmp[i * 8..i * 8 + 8], [i32; 8]);
}
dct4_1d_cols8(_token, &mut v, min_v, max_v);
for i in 0..4 {
storeu_256!(&mut tmp[i * 8..i * 8 + 8], [i32; 8], v[i]);
}
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..4 {
let dst_off = y * dst_stride;
let d = loadi64!(&dst[dst_off..dst_off + 8]);
let d16 = _mm_unpacklo_epi8(d, zero);
let c_lo = _mm_set_epi32(
tmp[y * 8 + 3],
tmp[y * 8 + 2],
tmp[y * 8 + 1],
tmp[y * 8 + 0],
);
let c_hi = _mm_set_epi32(
tmp[y * 8 + 7],
tmp[y * 8 + 6],
tmp[y * 8 + 5],
tmp[y * 8 + 4],
);
let c_lo_256 = _mm256_set_m128i(c_hi, c_lo);
let c_scaled = _mm256_srai_epi32(_mm256_add_epi32(c_lo_256, rnd_final), 4);
let c_lo_scaled = _mm256_castsi256_si128(c_scaled);
let c_hi_scaled = _mm256_extracti128_si256(c_scaled, 1);
let c16 = _mm_packs_epi32(c_lo_scaled, c_hi_scaled);
let sum = _mm_add_epi16(d16, c16);
let clamped = _mm_max_epi16(_mm_min_epi16(sum, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei64!(&mut dst[dst_off..dst_off + 8], packed);
}
coeff[..32].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x4_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, _coeff_len as usize * stride + stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_8x4_8bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}
macro_rules! impl_4x8_transform {
($name:ident, $row_fn:ident, $col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 32];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
for y in 0..8 {
let mut scratch = [0i32; 4];
for x in 0..4 {
scratch[x] = rect2_scale(coeff[y + x * 8] as i32);
}
$row_fn(&mut scratch[..4], 1, row_clip_min, row_clip_max);
for x in 0..4 {
tmp[y * 4 + x] = iclip(scratch[x], col_clip_min, col_clip_max);
}
}
for x in 0..4 {
$col_fn(&mut tmp[x..], 4, col_clip_min, col_clip_max);
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
for y in 0..8 {
let dst_off = y * dst_stride;
let d = loadi32!(&dst[dst_off..dst_off + 4]);
let d16 = _mm_unpacklo_epi8(d, zero);
let d32 = _mm_cvtepi16_epi32(d16);
let c = _mm_set_epi32(
(tmp[y * 4 + 3] + 8) >> 4,
(tmp[y * 4 + 2] + 8) >> 4,
(tmp[y * 4 + 1] + 8) >> 4,
(tmp[y * 4 + 0] + 8) >> 4,
);
let sum = _mm_add_epi32(d32, c);
let sum16 = _mm_packs_epi32(sum, sum);
let clamped = _mm_max_epi16(_mm_min_epi16(sum16, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei32!(&mut dst[dst_off..dst_off + 4], packed);
}
coeff[..32].fill(0);
}
};
}
macro_rules! impl_8x4_transform {
($name:ident, $row_fn:ident, $col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 32];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
for y in 0..4 {
let mut scratch = [0i32; 8];
for x in 0..8 {
scratch[x] = rect2_scale(coeff[y + x * 4] as i32);
}
$row_fn(&mut scratch[..8], 1, row_clip_min, row_clip_max);
for x in 0..8 {
tmp[y * 8 + x] = iclip(scratch[x], col_clip_min, col_clip_max);
}
}
for x in 0..8 {
$col_fn(&mut tmp[x..], 8, col_clip_min, col_clip_max);
}
for y in 0..4 {
let dst_off = y * dst_stride;
for x in 0..8 {
let d = dst[dst_off + x] as i32;
let c = (tmp[y * 8 + x] + 8) >> 4;
let result = iclip(d + c, 0, bitdepth_max);
dst[dst_off + x] = result as u8;
}
}
coeff[..32].fill(0);
}
};
}
impl_4x8_transform!(inv_txfm_add_adst_dct_4x8_8bpc_avx2_inner, adst4_1d, dct8_1d);
impl_4x8_transform!(inv_txfm_add_dct_adst_4x8_8bpc_avx2_inner, dct4_1d, adst8_1d);
impl_4x8_transform!(
inv_txfm_add_adst_adst_4x8_8bpc_avx2_inner,
adst4_1d,
adst8_1d
);
impl_4x8_transform!(
inv_txfm_add_flipadst_dct_4x8_8bpc_avx2_inner,
flipadst4_1d,
dct8_1d
);
impl_4x8_transform!(
inv_txfm_add_dct_flipadst_4x8_8bpc_avx2_inner,
dct4_1d,
flipadst8_1d
);
impl_4x8_transform!(
inv_txfm_add_flipadst_flipadst_4x8_8bpc_avx2_inner,
flipadst4_1d,
flipadst8_1d
);
impl_4x8_transform!(
inv_txfm_add_adst_flipadst_4x8_8bpc_avx2_inner,
adst4_1d,
flipadst8_1d
);
impl_4x8_transform!(
inv_txfm_add_flipadst_adst_4x8_8bpc_avx2_inner,
flipadst4_1d,
adst8_1d
);
impl_8x4_transform!(inv_txfm_add_adst_dct_8x4_8bpc_avx2_inner, adst8_1d, dct4_1d);
impl_8x4_transform!(inv_txfm_add_dct_adst_8x4_8bpc_avx2_inner, dct8_1d, adst4_1d);
impl_8x4_transform!(
inv_txfm_add_adst_adst_8x4_8bpc_avx2_inner,
adst8_1d,
adst4_1d
);
impl_8x4_transform!(
inv_txfm_add_flipadst_dct_8x4_8bpc_avx2_inner,
flipadst8_1d,
dct4_1d
);
impl_8x4_transform!(
inv_txfm_add_dct_flipadst_8x4_8bpc_avx2_inner,
dct8_1d,
flipadst4_1d
);
impl_8x4_transform!(
inv_txfm_add_flipadst_flipadst_8x4_8bpc_avx2_inner,
flipadst8_1d,
flipadst4_1d
);
impl_8x4_transform!(
inv_txfm_add_adst_flipadst_8x4_8bpc_avx2_inner,
adst8_1d,
flipadst4_1d
);
impl_8x4_transform!(
inv_txfm_add_flipadst_adst_8x4_8bpc_avx2_inner,
flipadst8_1d,
adst4_1d
);
macro_rules! impl_4x8_ffi_wrapper {
($name:ident, $inner:ident) => {
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
_coeff_len as usize * stride + stride,
)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
$inner(_token, dst_slice, stride, coeff_slice, eob, bitdepth_max);
}
};
}
impl_4x8_ffi_wrapper!(
inv_txfm_add_adst_dct_4x8_8bpc_avx2,
inv_txfm_add_adst_dct_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_dct_adst_4x8_8bpc_avx2,
inv_txfm_add_dct_adst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_adst_adst_4x8_8bpc_avx2,
inv_txfm_add_adst_adst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_flipadst_dct_4x8_8bpc_avx2,
inv_txfm_add_flipadst_dct_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_dct_flipadst_4x8_8bpc_avx2,
inv_txfm_add_dct_flipadst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_4x8_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_adst_flipadst_4x8_8bpc_avx2,
inv_txfm_add_adst_flipadst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_flipadst_adst_4x8_8bpc_avx2,
inv_txfm_add_flipadst_adst_4x8_8bpc_avx2_inner
);
macro_rules! impl_8x4_ffi_wrapper {
($name:ident, $inner:ident) => {
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
_coeff_len as usize * stride + stride,
)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
$inner(_token, dst_slice, stride, coeff_slice, eob, bitdepth_max);
}
};
}
impl_8x4_ffi_wrapper!(
inv_txfm_add_adst_dct_8x4_8bpc_avx2,
inv_txfm_add_adst_dct_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_dct_adst_8x4_8bpc_avx2,
inv_txfm_add_dct_adst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_adst_adst_8x4_8bpc_avx2,
inv_txfm_add_adst_adst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_flipadst_dct_8x4_8bpc_avx2,
inv_txfm_add_flipadst_dct_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_dct_flipadst_8x4_8bpc_avx2,
inv_txfm_add_dct_flipadst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_8x4_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_adst_flipadst_8x4_8bpc_avx2,
inv_txfm_add_adst_flipadst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_flipadst_adst_8x4_8bpc_avx2,
inv_txfm_add_flipadst_adst_8x4_8bpc_avx2_inner
);
impl_4x8_transform!(
inv_txfm_add_identity_identity_4x8_8bpc_avx2_inner,
identity4_1d,
identity8_1d
);
impl_8x4_transform!(
inv_txfm_add_identity_identity_8x4_8bpc_avx2_inner,
identity8_1d,
identity4_1d
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_identity_identity_4x8_8bpc_avx2,
inv_txfm_add_identity_identity_4x8_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_identity_identity_8x4_8bpc_avx2,
inv_txfm_add_identity_identity_8x4_8bpc_avx2_inner
);
impl_4x8_transform!(
inv_txfm_add_identity_dct_4x8_8bpc_avx2_inner,
identity4_1d,
dct8_1d
);
impl_4x8_transform!(
inv_txfm_add_dct_identity_4x8_8bpc_avx2_inner,
dct4_1d,
identity8_1d
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_identity_dct_4x8_8bpc_avx2,
inv_txfm_add_identity_dct_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_dct_identity_4x8_8bpc_avx2,
inv_txfm_add_dct_identity_4x8_8bpc_avx2_inner
);
impl_8x4_transform!(
inv_txfm_add_identity_dct_8x4_8bpc_avx2_inner,
identity8_1d,
dct4_1d
);
impl_8x4_transform!(
inv_txfm_add_dct_identity_8x4_8bpc_avx2_inner,
dct8_1d,
identity4_1d
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_identity_dct_8x4_8bpc_avx2,
inv_txfm_add_identity_dct_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_dct_identity_8x4_8bpc_avx2,
inv_txfm_add_dct_identity_8x4_8bpc_avx2_inner
);
impl_4x8_transform!(
inv_txfm_add_identity_adst_4x8_8bpc_avx2_inner,
identity4_1d,
adst8_1d
);
impl_4x8_transform!(
inv_txfm_add_adst_identity_4x8_8bpc_avx2_inner,
adst4_1d,
identity8_1d
);
impl_4x8_transform!(
inv_txfm_add_identity_flipadst_4x8_8bpc_avx2_inner,
identity4_1d,
flipadst8_1d
);
impl_4x8_transform!(
inv_txfm_add_flipadst_identity_4x8_8bpc_avx2_inner,
flipadst4_1d,
identity8_1d
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_identity_adst_4x8_8bpc_avx2,
inv_txfm_add_identity_adst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_adst_identity_4x8_8bpc_avx2,
inv_txfm_add_adst_identity_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_identity_flipadst_4x8_8bpc_avx2,
inv_txfm_add_identity_flipadst_4x8_8bpc_avx2_inner
);
impl_4x8_ffi_wrapper!(
inv_txfm_add_flipadst_identity_4x8_8bpc_avx2,
inv_txfm_add_flipadst_identity_4x8_8bpc_avx2_inner
);
impl_8x4_transform!(
inv_txfm_add_identity_adst_8x4_8bpc_avx2_inner,
identity8_1d,
adst4_1d
);
impl_8x4_transform!(
inv_txfm_add_adst_identity_8x4_8bpc_avx2_inner,
adst8_1d,
identity4_1d
);
impl_8x4_transform!(
inv_txfm_add_identity_flipadst_8x4_8bpc_avx2_inner,
identity8_1d,
flipadst4_1d
);
impl_8x4_transform!(
inv_txfm_add_flipadst_identity_8x4_8bpc_avx2_inner,
flipadst8_1d,
identity4_1d
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_identity_adst_8x4_8bpc_avx2,
inv_txfm_add_identity_adst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_adst_identity_8x4_8bpc_avx2,
inv_txfm_add_adst_identity_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_identity_flipadst_8x4_8bpc_avx2,
inv_txfm_add_identity_flipadst_8x4_8bpc_avx2_inner
);
impl_8x4_ffi_wrapper!(
inv_txfm_add_flipadst_identity_8x4_8bpc_avx2,
inv_txfm_add_flipadst_identity_8x4_8bpc_avx2_inner
);
macro_rules! impl_8x16_transform {
($name:ident, $row_fn:ident, $col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 128];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
let rnd = 1;
let shift = 1;
for y in 0..16 {
let mut scratch = [0i32; 8];
for x in 0..8 {
scratch[x] = rect2_scale(coeff[y + x * 16] as i32);
}
$row_fn(&mut scratch[..8], 1, row_clip_min, row_clip_max);
for x in 0..8 {
tmp[y * 8 + x] = iclip((scratch[x] + rnd) >> shift, col_clip_min, col_clip_max);
}
}
for x in 0..8 {
$col_fn(&mut tmp[x..], 8, col_clip_min, col_clip_max);
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..16 {
let dst_off = y * dst_stride;
let d = loadi64!(&dst[dst_off..dst_off + 8]);
let d16 = _mm_unpacklo_epi8(d, zero);
let c_lo = _mm_set_epi32(
tmp[y * 8 + 3],
tmp[y * 8 + 2],
tmp[y * 8 + 1],
tmp[y * 8 + 0],
);
let c_hi = _mm_set_epi32(
tmp[y * 8 + 7],
tmp[y * 8 + 6],
tmp[y * 8 + 5],
tmp[y * 8 + 4],
);
let c_lo_256 = _mm256_set_m128i(c_hi, c_lo);
let c_scaled = _mm256_srai_epi32(_mm256_add_epi32(c_lo_256, rnd_final), 4);
let c_lo_scaled = _mm256_castsi256_si128(c_scaled);
let c_hi_scaled = _mm256_extracti128_si256(c_scaled, 1);
let c16 = _mm_packs_epi32(c_lo_scaled, c_hi_scaled);
let sum = _mm_add_epi16(d16, c16);
let clamped = _mm_max_epi16(_mm_min_epi16(sum, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei64!(&mut dst[dst_off..dst_off + 8], packed);
}
coeff[..128].fill(0);
}
};
}
macro_rules! impl_16x8_transform {
($name:ident, $row_fn:ident, $col_fn:ident) => {
#[cfg(target_arch = "x86_64")]
#[arcane]
fn $name(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
use crate::src::safe_simd::pixel_access::{
loadi32, loadi64, loadu_128, storei32, storei64, storeu_128,
};
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 128];
let rect2_scale = |v: i32| (v * 181 + 128) >> 8;
let rnd = 1;
let shift = 1;
for y in 0..8 {
let mut scratch = [0i32; 16];
for x in 0..16 {
scratch[x] = rect2_scale(coeff[y + x * 8] as i32);
}
$row_fn(&mut scratch[..16], 1, row_clip_min, row_clip_max);
for x in 0..16 {
tmp[y * 16 + x] =
iclip((scratch[x] + rnd) >> shift, col_clip_min, col_clip_max);
}
}
for x in 0..16 {
$col_fn(&mut tmp[x..], 16, col_clip_min, col_clip_max);
}
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..8 {
let dst_off = y * dst_stride;
let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
let d16 = _mm256_cvtepu8_epi16(d);
let c0 = _mm256_set_epi32(
tmp[y * 16 + 7],
tmp[y * 16 + 6],
tmp[y * 16 + 5],
tmp[y * 16 + 4],
tmp[y * 16 + 3],
tmp[y * 16 + 2],
tmp[y * 16 + 1],
tmp[y * 16 + 0],
);
let c1 = _mm256_set_epi32(
tmp[y * 16 + 15],
tmp[y * 16 + 14],
tmp[y * 16 + 13],
tmp[y * 16 + 12],
tmp[y * 16 + 11],
tmp[y * 16 + 10],
tmp[y * 16 + 9],
tmp[y * 16 + 8],
);
let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
let sum = _mm256_add_epi16(d16, c16);
let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
let packed = _mm256_packus_epi16(clamped, clamped);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
_mm256_castsi256_si128(packed)
);
}
coeff[..128].fill(0);
}
};
}
impl_8x16_transform!(
inv_txfm_add_adst_dct_8x16_8bpc_avx2_inner,
adst8_1d,
dct16_1d
);
impl_8x16_transform!(
inv_txfm_add_dct_adst_8x16_8bpc_avx2_inner,
dct8_1d,
adst16_1d
);
impl_8x16_transform!(
inv_txfm_add_adst_adst_8x16_8bpc_avx2_inner,
adst8_1d,
adst16_1d
);
impl_8x16_transform!(
inv_txfm_add_flipadst_dct_8x16_8bpc_avx2_inner,
flipadst8_1d,
dct16_1d
);
impl_8x16_transform!(
inv_txfm_add_dct_flipadst_8x16_8bpc_avx2_inner,
dct8_1d,
flipadst16_1d
);
impl_8x16_transform!(
inv_txfm_add_flipadst_flipadst_8x16_8bpc_avx2_inner,
flipadst8_1d,
flipadst16_1d
);
impl_8x16_transform!(
inv_txfm_add_adst_flipadst_8x16_8bpc_avx2_inner,
adst8_1d,
flipadst16_1d
);
impl_8x16_transform!(
inv_txfm_add_flipadst_adst_8x16_8bpc_avx2_inner,
flipadst8_1d,
adst16_1d
);
impl_16x8_transform!(
inv_txfm_add_adst_dct_16x8_8bpc_avx2_inner,
adst16_1d,
dct8_1d
);
impl_16x8_transform!(
inv_txfm_add_dct_adst_16x8_8bpc_avx2_inner,
dct16_1d,
adst8_1d
);
impl_16x8_transform!(
inv_txfm_add_adst_adst_16x8_8bpc_avx2_inner,
adst16_1d,
adst8_1d
);
impl_16x8_transform!(
inv_txfm_add_flipadst_dct_16x8_8bpc_avx2_inner,
flipadst16_1d,
dct8_1d
);
impl_16x8_transform!(
inv_txfm_add_dct_flipadst_16x8_8bpc_avx2_inner,
dct16_1d,
flipadst8_1d
);
impl_16x8_transform!(
inv_txfm_add_flipadst_flipadst_16x8_8bpc_avx2_inner,
flipadst16_1d,
flipadst8_1d
);
impl_16x8_transform!(
inv_txfm_add_adst_flipadst_16x8_8bpc_avx2_inner,
adst16_1d,
flipadst8_1d
);
impl_16x8_transform!(
inv_txfm_add_flipadst_adst_16x8_8bpc_avx2_inner,
flipadst16_1d,
adst8_1d
);
macro_rules! impl_8x16_ffi_wrapper {
($name:ident, $inner:ident) => {
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
_coeff_len as usize * stride + stride,
)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
$inner(_token, dst_slice, stride, coeff_slice, eob, bitdepth_max);
}
};
}
macro_rules! impl_16x8_ffi_wrapper {
($name:ident, $inner:ident) => {
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
_coeff_len as usize * stride + stride,
)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
$inner(_token, dst_slice, stride, coeff_slice, eob, bitdepth_max);
}
};
}
impl_8x16_ffi_wrapper!(
inv_txfm_add_adst_dct_8x16_8bpc_avx2,
inv_txfm_add_adst_dct_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_dct_adst_8x16_8bpc_avx2,
inv_txfm_add_dct_adst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_adst_adst_8x16_8bpc_avx2,
inv_txfm_add_adst_adst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_flipadst_dct_8x16_8bpc_avx2,
inv_txfm_add_flipadst_dct_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_dct_flipadst_8x16_8bpc_avx2,
inv_txfm_add_dct_flipadst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_8x16_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_adst_flipadst_8x16_8bpc_avx2,
inv_txfm_add_adst_flipadst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_flipadst_adst_8x16_8bpc_avx2,
inv_txfm_add_flipadst_adst_8x16_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_adst_dct_16x8_8bpc_avx2,
inv_txfm_add_adst_dct_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_dct_adst_16x8_8bpc_avx2,
inv_txfm_add_dct_adst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_adst_adst_16x8_8bpc_avx2,
inv_txfm_add_adst_adst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_flipadst_dct_16x8_8bpc_avx2,
inv_txfm_add_flipadst_dct_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_dct_flipadst_16x8_8bpc_avx2,
inv_txfm_add_dct_flipadst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_flipadst_flipadst_16x8_8bpc_avx2,
inv_txfm_add_flipadst_flipadst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_adst_flipadst_16x8_8bpc_avx2,
inv_txfm_add_adst_flipadst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_flipadst_adst_16x8_8bpc_avx2,
inv_txfm_add_flipadst_adst_16x8_8bpc_avx2_inner
);
impl_8x16_transform!(
inv_txfm_add_identity_identity_8x16_8bpc_avx2_inner,
identity8_1d,
identity16_1d
);
impl_16x8_transform!(
inv_txfm_add_identity_identity_16x8_8bpc_avx2_inner,
identity16_1d,
identity8_1d
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_identity_identity_8x16_8bpc_avx2,
inv_txfm_add_identity_identity_8x16_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_identity_identity_16x8_8bpc_avx2,
inv_txfm_add_identity_identity_16x8_8bpc_avx2_inner
);
impl_8x16_transform!(
inv_txfm_add_identity_dct_8x16_8bpc_avx2_inner,
identity8_1d,
dct16_1d
);
impl_8x16_transform!(
inv_txfm_add_dct_identity_8x16_8bpc_avx2_inner,
dct8_1d,
identity16_1d
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_identity_dct_8x16_8bpc_avx2,
inv_txfm_add_identity_dct_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_dct_identity_8x16_8bpc_avx2,
inv_txfm_add_dct_identity_8x16_8bpc_avx2_inner
);
impl_16x8_transform!(
inv_txfm_add_identity_dct_16x8_8bpc_avx2_inner,
identity16_1d,
dct8_1d
);
impl_16x8_transform!(
inv_txfm_add_dct_identity_16x8_8bpc_avx2_inner,
dct16_1d,
identity8_1d
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_identity_dct_16x8_8bpc_avx2,
inv_txfm_add_identity_dct_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_dct_identity_16x8_8bpc_avx2,
inv_txfm_add_dct_identity_16x8_8bpc_avx2_inner
);
impl_8x16_transform!(
inv_txfm_add_identity_adst_8x16_8bpc_avx2_inner,
identity8_1d,
adst16_1d
);
impl_8x16_transform!(
inv_txfm_add_adst_identity_8x16_8bpc_avx2_inner,
adst8_1d,
identity16_1d
);
impl_8x16_transform!(
inv_txfm_add_identity_flipadst_8x16_8bpc_avx2_inner,
identity8_1d,
flipadst16_1d
);
impl_8x16_transform!(
inv_txfm_add_flipadst_identity_8x16_8bpc_avx2_inner,
flipadst8_1d,
identity16_1d
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_identity_adst_8x16_8bpc_avx2,
inv_txfm_add_identity_adst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_adst_identity_8x16_8bpc_avx2,
inv_txfm_add_adst_identity_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_identity_flipadst_8x16_8bpc_avx2,
inv_txfm_add_identity_flipadst_8x16_8bpc_avx2_inner
);
impl_8x16_ffi_wrapper!(
inv_txfm_add_flipadst_identity_8x16_8bpc_avx2,
inv_txfm_add_flipadst_identity_8x16_8bpc_avx2_inner
);
impl_16x8_transform!(
inv_txfm_add_identity_adst_16x8_8bpc_avx2_inner,
identity16_1d,
adst8_1d
);
impl_16x8_transform!(
inv_txfm_add_adst_identity_16x8_8bpc_avx2_inner,
adst16_1d,
identity8_1d
);
impl_16x8_transform!(
inv_txfm_add_identity_flipadst_16x8_8bpc_avx2_inner,
identity16_1d,
flipadst8_1d
);
impl_16x8_transform!(
inv_txfm_add_flipadst_identity_16x8_8bpc_avx2_inner,
flipadst16_1d,
identity8_1d
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_identity_adst_16x8_8bpc_avx2,
inv_txfm_add_identity_adst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_adst_identity_16x8_8bpc_avx2,
inv_txfm_add_adst_identity_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_identity_flipadst_16x8_8bpc_avx2,
inv_txfm_add_identity_flipadst_16x8_8bpc_avx2_inner
);
impl_16x8_ffi_wrapper!(
inv_txfm_add_flipadst_identity_16x8_8bpc_avx2,
inv_txfm_add_flipadst_identity_16x8_8bpc_avx2_inner
);
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_8x16_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 128];
{
let coeff_slice = coeff.as_slice();
row_dct8_8bpc_block(
_token,
coeff_slice,
16,
16,
true,
1,
1,
&mut tmp,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
}
{
let min_v = _mm256_set1_epi32(col_clip_min);
let max_v = _mm256_set1_epi32(col_clip_max);
let mut v = [_mm256_setzero_si256(); 16];
for i in 0..16 {
v[i] = loadu_256!(&tmp[i * 8..i * 8 + 8], [i32; 8]);
}
dct16_1d_cols8(_token, &mut v, min_v, max_v);
for i in 0..16 {
storeu_256!(&mut tmp[i * 8..i * 8 + 8], [i32; 8], v[i]);
}
}
let zero = _mm_setzero_si128();
let max_val = _mm_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..16 {
let dst_off = y * dst_stride;
let d = loadi64!(&dst[dst_off..dst_off + 8]);
let d16 = _mm_unpacklo_epi8(d, zero);
let c_lo = _mm_set_epi32(
tmp[y * 8 + 3],
tmp[y * 8 + 2],
tmp[y * 8 + 1],
tmp[y * 8 + 0],
);
let c_hi = _mm_set_epi32(
tmp[y * 8 + 7],
tmp[y * 8 + 6],
tmp[y * 8 + 5],
tmp[y * 8 + 4],
);
let c_lo_256 = _mm256_set_m128i(c_hi, c_lo);
let c_scaled = _mm256_srai_epi32(_mm256_add_epi32(c_lo_256, rnd_final), 4);
let c_lo_scaled = _mm256_castsi256_si128(c_scaled);
let c_hi_scaled = _mm256_extracti128_si256(c_scaled, 1);
let c16 = _mm_packs_epi32(c_lo_scaled, c_hi_scaled);
let sum = _mm_add_epi16(d16, c16);
let clamped = _mm_max_epi16(_mm_min_epi16(sum, max_val), zero);
let packed = _mm_packus_epi16(clamped, clamped);
storei64!(&mut dst[dst_off..dst_off + 8], packed);
}
coeff[..128].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_8x16_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, _coeff_len as usize * stride + stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_8x16_8bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn inv_txfm_add_dct_dct_16x8_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
coeff: &mut [i16],
_eob: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mut coeff = coeff.flex_mut();
let row_clip_min = i16::MIN as i32;
let row_clip_max = i16::MAX as i32;
let col_clip_min = i16::MIN as i32;
let col_clip_max = i16::MAX as i32;
let mut tmp = [0i32; 128];
{
let coeff_slice = coeff.as_slice();
simd_row_dct16_8bpc_8rows(
_token,
coeff_slice,
8,
0,
true,
1,
1,
&mut tmp,
row_clip_min,
row_clip_max,
col_clip_min,
col_clip_max,
);
}
if let Some(t512) = crate::src::cpu::summon_avx512() {
dct8_cols_avx512(t512, &mut tmp, 16, 8, col_clip_min, col_clip_max);
} else {
let min_v = _mm256_set1_epi32(col_clip_min);
let max_v = _mm256_set1_epi32(col_clip_max);
for cx_chunk in 0..2 {
let cx = cx_chunk * 8;
let mut v = [_mm256_setzero_si256(); 8];
for i in 0..8 {
v[i] = loadu_256!(&tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8]);
}
dct8_1d_cols8(_token, &mut v, min_v, max_v);
for i in 0..8 {
storeu_256!(&mut tmp[i * 16 + cx..i * 16 + cx + 8], [i32; 8], v[i]);
}
}
}
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi16(bitdepth_max as i16);
let rnd_final = _mm256_set1_epi32(8);
for y in 0..8 {
let dst_off = y * dst_stride;
let d = loadu_128!(<&[u8; 16]>::try_from(&dst[dst_off..dst_off + 16]).unwrap());
let d16 = _mm256_cvtepu8_epi16(d);
let c0 = _mm256_set_epi32(
tmp[y * 16 + 7],
tmp[y * 16 + 6],
tmp[y * 16 + 5],
tmp[y * 16 + 4],
tmp[y * 16 + 3],
tmp[y * 16 + 2],
tmp[y * 16 + 1],
tmp[y * 16 + 0],
);
let c1 = _mm256_set_epi32(
tmp[y * 16 + 15],
tmp[y * 16 + 14],
tmp[y * 16 + 13],
tmp[y * 16 + 12],
tmp[y * 16 + 11],
tmp[y * 16 + 10],
tmp[y * 16 + 9],
tmp[y * 16 + 8],
);
let c0_scaled = _mm256_srai_epi32(_mm256_add_epi32(c0, rnd_final), 4);
let c1_scaled = _mm256_srai_epi32(_mm256_add_epi32(c1, rnd_final), 4);
let c16 = _mm256_packs_epi32(c0_scaled, c1_scaled);
let c16 = _mm256_permute4x64_epi64(c16, 0b11_01_10_00);
let sum = _mm256_add_epi16(d16, c16);
let clamped = _mm256_max_epi16(_mm256_min_epi16(sum, max_val), zero);
let packed = _mm256_packus_epi16(clamped, clamped);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[dst_off..dst_off + 16]).unwrap(),
_mm256_castsi256_si128(packed)
);
}
coeff[..128].fill(0);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(feature = "asm")]
pub unsafe extern "C" fn inv_txfm_add_dct_dct_16x8_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
coeff: *mut DynCoef,
eob: c_int,
bitdepth_max: c_int,
_coeff_len: u16,
_dst: *const FFISafe<PicOffset>,
) {
let _token = unsafe { Desktop64::forge_token_dangerously() };
let stride = dst_stride as usize;
let dst_slice = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, _coeff_len as usize * stride + stride)
};
let coeff_slice =
unsafe { std::slice::from_raw_parts_mut(coeff as *mut i16, _coeff_len as usize) };
inv_txfm_add_dct_dct_16x8_8bpc_avx2_inner(
_token,
dst_slice,
stride,
coeff_slice,
eob,
bitdepth_max,
);
}