#[cfg(target_arch = "x86")]
use core::arch::x86::{
__m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
_mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
_mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
_mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
_mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
_mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
_mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
_mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
_mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
};
// Fixed-point parameters and multiplier constants for the accurate
// (libjpeg "islow"-style) integer forward DCT.
//
// Number of fraction bits in the FIX(x) multiplier constants below.
const CONST_BITS: i32 = 13;
// Extra scaling bits carried through pass 1 and removed in pass 2.
const PASS1_BITS: i32 = 2;
// F_a_b = FIX(a.b…) = round(a.b… * 2^CONST_BITS); same values as libjpeg's
// jfdctint (e.g. 2446 = round(0.298631336 * 8192)).
const F_0_298: i16 = 2446; // FIX(0.298631336)
const F_0_390: i16 = 3196; // FIX(0.390180644)
const F_0_541: i16 = 4433; // FIX(0.541196100)
const F_0_765: i16 = 6270; // FIX(0.765366865)
const F_0_899: i16 = 7373; // FIX(0.899976223)
const F_1_175: i16 = 9633; // FIX(1.175875602)
const F_1_501: i16 = 12299; // FIX(1.501321110)
const F_1_847: i16 = 15137; // FIX(1.847759065)
const F_1_961: i16 = 16069; // FIX(1.961570560)
const F_2_053: i16 = 16819; // FIX(2.053119869)
const F_2_562: i16 = 20995; // FIX(2.562915447)
const F_3_072: i16 = 25172; // FIX(3.072711026)
// Right-shift amounts that remove the CONST_BITS scaling after the
// multiply-accumulate of each pass (pass 1 keeps PASS1_BITS extra bits).
const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;
/// In-place forward DCT of an 8x8 block of `i16` samples using AVX2.
///
/// `data` is the block in row-major order; on return it holds the DCT
/// coefficients produced by [`fdct_avx2_internal`].
///
/// NOTE(review): this safe wrapper performs no runtime CPU-feature check
/// before calling the `#[target_feature(enable = "avx2")]` implementation;
/// executing it on a CPU without AVX2 would be undefined behavior. It
/// presumably relies on the caller having done feature detection — confirm
/// at the call sites (a `std`-level `is_x86_feature_detected!` guard cannot
/// be added here if this crate must stay `core`-only).
#[inline(always)]
pub fn fdct_avx2(data: &mut [i16; 64]) {
    unsafe {
        fdct_avx2_internal(data);
    }
}
/// AVX2 body of the forward DCT (see [`fdct_avx2`]).
///
/// Port of libjpeg-turbo's `jfdctint-avx2` accurate integer FDCT: two 1-D
/// 8-point DCT passes, each preceded by a transpose/shuffle, processing the
/// whole 8x8 block at once with each `__m256i` holding two 8-sample
/// rows/columns (one per 128-bit lane). `data` is row-major on entry and
/// holds the coefficients on return.
#[target_feature(enable = "avx2")]
fn fdct_avx2_internal(data: &mut [i16; 64]) {
    // The PW_* helpers build interleaved multiplier pairs for
    // _mm256_madd_epi16. Each 256-bit constant carries a different word pair
    // per 128-bit lane (named low-lane pair first, high-lane pair second),
    // mirroring the paired constants of libjpeg-turbo's jfdctint-avx2.

    // Low lane pairs (F130, F054) = (F_0_541 + F_0_765, F_0_541);
    // high lane pairs (MF130, F054) = (F_0_541 - F_1_847, F_0_541).
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PW_F130_F054_MF130_F054() -> __m256i {
        _mm256_set_epi16(
            F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
            F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
            F_0_541, F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765,
            F_0_541, F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765,
        )
    }

    // Low lane pairs (MF078, F117) = (F_1_175 - F_1_961, F_1_175);
    // high lane pairs (F078, F117) = (F_1_175 - F_0_390, F_1_175).
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PW_MF078_F117_F078_F117() -> __m256i {
        _mm256_set_epi16(
            F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
            F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
            F_1_175, F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961,
            F_1_175, F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961,
        )
    }

    // Low lane pairs (MF060, MF089) = (F_0_298 - F_0_899, -F_0_899);
    // high lane pairs (MF050, MF256) = (F_2_053 - F_2_562, -F_2_562).
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PW_MF060_MF089_MF050_MF256() -> __m256i {
        _mm256_set_epi16(
            -F_2_562, F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562,
            -F_2_562, F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562,
            -F_0_899, F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899,
            -F_0_899, F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899,
        )
    }

    // Low lane pairs (F050, MF256) = (F_3_072 - F_2_562, -F_2_562);
    // high lane pairs (F060, MF089) = (F_1_501 - F_0_899, -F_0_899).
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PW_F050_MF256_F060_MF089() -> __m256i {
        _mm256_set_epi16(
            -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
            -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
            -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
            -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
        )
    }

    // Rounding bias for the 32-bit descale: 1 << (DESCALE_Px - 1) in every
    // dword lane. Added before the arithmetic shift in `descale`.
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PD_DESCALE_P(first_pass: bool) -> __m256i {
        const BIAS_P1: i32 = 1 << (DESCALE_P1 - 1);
        const BIAS_P2: i32 = 1 << (DESCALE_P2 - 1);
        if first_pass {
            _mm256_set_epi32(
                BIAS_P1, BIAS_P1, BIAS_P1, BIAS_P1, BIAS_P1, BIAS_P1, BIAS_P1, BIAS_P1,
            )
        } else {
            _mm256_set_epi32(
                BIAS_P2, BIAS_P2, BIAS_P2, BIAS_P2, BIAS_P2, BIAS_P2, BIAS_P2, BIAS_P2,
            )
        }
    }

    // Word-lane rounding bias for the pass-2 `>> PASS1_BITS` descale of the
    // coefficient-0/4 terms.
    //
    // BUG FIX: this constant is consumed by `_mm256_add_epi16`, so the bias
    // must be present in ALL sixteen 16-bit lanes (libjpeg-turbo defines it
    // as `times 16 dw 1 << (PASS1_BITS - 1)`). The previous implementation
    // built it with `_mm256_set_epi32`, which left every odd word lane
    // zero, dropping the rounding bias from half of the outputs.
    #[target_feature(enable = "avx2")]
    #[allow(non_snake_case)]
    #[inline]
    fn PW_DESCALE_P2X() -> __m256i {
        const BIAS: i16 = 1 << (PASS1_BITS - 1);
        _mm256_set_epi16(
            BIAS, BIAS, BIAS, BIAS, BIAS, BIAS, BIAS, BIAS,
            BIAS, BIAS, BIAS, BIAS, BIAS, BIAS, BIAS, BIAS,
        )
    }

    // Add the per-pass rounding bias, then arithmetic-shift right by the
    // pass's descale amount (DESCALE_P1 in pass 1, DESCALE_P2 in pass 2).
    #[target_feature(enable = "avx2")]
    #[inline]
    fn descale(first_pass: bool, v: __m256i) -> __m256i {
        let v = _mm256_add_epi32(v, PD_DESCALE_P(first_pass));
        if first_pass {
            _mm256_srai_epi32(v, DESCALE_P1)
        } else {
            _mm256_srai_epi32(v, DESCALE_P2)
        }
    }

    // 16-bit transpose of the 8x8 block held in four row-pair registers.
    // Word/dword unpacks transpose within each 128-bit lane; the final qword
    // permutes (0x8D, 0x8D, 0xD8, 0xD8 — matching libjpeg-turbo's
    // dotranspose macro) arrange the columns across lanes for `do_dct`.
    #[target_feature(enable = "avx2")]
    #[inline]
    fn do_transpose(
        i1: __m256i,
        i2: __m256i,
        i3: __m256i,
        i4: __m256i,
    ) -> (__m256i, __m256i, __m256i, __m256i) {
        // Phase 1: interleave 16-bit elements of adjacent inputs.
        let t5 = _mm256_unpacklo_epi16(i1, i2);
        let t6 = _mm256_unpackhi_epi16(i1, i2);
        let t7 = _mm256_unpacklo_epi16(i3, i4);
        let t8 = _mm256_unpackhi_epi16(i3, i4);
        // Phase 2: interleave the resulting 32-bit pairs.
        let t1 = _mm256_unpacklo_epi32(t5, t7);
        let t2 = _mm256_unpackhi_epi32(t5, t7);
        let t3 = _mm256_unpacklo_epi32(t6, t8);
        let t4 = _mm256_unpackhi_epi32(t6, t8);
        // Phase 3: reorder 64-bit quarters into the cross-lane layout the
        // DCT pass expects.
        (
            _mm256_permute4x64_epi64(t1, 0x8D),
            _mm256_permute4x64_epi64(t2, 0x8D),
            _mm256_permute4x64_epi64(t3, 0xD8),
            _mm256_permute4x64_epi64(t4, 0xD8),
        )
    }

    // One pass of the 8-point 1-D DCT over all eight rows (pass 1) or
    // columns (pass 2) at once; layout mirrors libjpeg-turbo's dodct macro.
    // Pass 1 leaves results scaled up by PASS1_BITS; pass 2 removes that
    // scaling with rounded shifts.
    #[target_feature(enable = "avx2")]
    #[inline]
    fn do_dct(
        first_pass: bool,
        i1: __m256i,
        i2: __m256i,
        i3: __m256i,
        i4: __m256i,
    ) -> (__m256i, __m256i, __m256i, __m256i) {
        // Initial butterfly: sums feed the even part, differences the odd.
        let t5 = _mm256_sub_epi16(i1, i4);
        let t6 = _mm256_add_epi16(i1, i4);
        let t7 = _mm256_add_epi16(i2, i3);
        let t8 = _mm256_sub_epi16(i2, i3);

        // Even part, coefficients 0 and 4.
        let t6 = _mm256_permute2x128_si256(t6, t6, 0x01); // swap 128-bit lanes
        let t1 = _mm256_add_epi16(t6, t7);
        let t6 = _mm256_sub_epi16(t6, t7);
        let t7 = _mm256_permute2x128_si256(t1, t1, 0x01);
        // Negate the upper lane so the cross-lane add yields a sum in one
        // lane and a difference in the other.
        let t1 = _mm256_sign_epi16(
            t1,
            _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1),
        );
        let t7 = _mm256_add_epi16(t7, t1);
        let t1 = if first_pass {
            // Pass 1: scale up by PASS1_BITS.
            _mm256_slli_epi16(t7, PASS1_BITS)
        } else {
            // Pass 2: rounded removal of the pass-1 scaling.
            let t7 = _mm256_add_epi16(t7, PW_DESCALE_P2X());
            _mm256_srai_epi16(t7, PASS1_BITS)
        };

        // Even part, coefficients 2 and 6 (rotation by the 0.541 constants).
        let t7 = _mm256_permute2x128_si256(t6, t6, 0x01);
        let t2 = _mm256_unpacklo_epi16(t6, t7);
        let t6 = _mm256_unpackhi_epi16(t6, t7);
        let t2 = _mm256_madd_epi16(t2, PW_F130_F054_MF130_F054());
        let t6 = _mm256_madd_epi16(t6, PW_F130_F054_MF130_F054());
        let t2 = descale(first_pass, t2);
        let t6 = descale(first_pass, t6);
        let t3 = _mm256_packs_epi32(t2, t6);

        // Odd part: common term rotated by 1.175 (z3/z4 in jfdctint).
        let t7 = _mm256_add_epi16(t8, t5);
        let t2 = _mm256_permute2x128_si256(t7, t7, 0x01);
        let t6 = _mm256_unpacklo_epi16(t7, t2);
        let t7 = _mm256_unpackhi_epi16(t7, t2);
        let t6 = _mm256_madd_epi16(t6, PW_MF078_F117_F078_F117());
        let t7 = _mm256_madd_epi16(t7, PW_MF078_F117_F078_F117());

        // Odd coefficients, first pair.
        let t4 = _mm256_permute2x128_si256(t5, t5, 0x01);
        let t2 = _mm256_unpacklo_epi16(t8, t4);
        let t4 = _mm256_unpackhi_epi16(t8, t4);
        let t2 = _mm256_madd_epi16(t2, PW_MF060_MF089_MF050_MF256());
        let t4 = _mm256_madd_epi16(t4, PW_MF060_MF089_MF050_MF256());
        let t2 = _mm256_add_epi32(t2, t6);
        let t4 = _mm256_add_epi32(t4, t7);
        let t2 = descale(first_pass, t2);
        let t4 = descale(first_pass, t4);
        let t4 = _mm256_packs_epi32(t2, t4);

        // Odd coefficients, second pair.
        let t2 = _mm256_permute2x128_si256(t8, t8, 0x01);
        let t8 = _mm256_unpacklo_epi16(t5, t2);
        let t5 = _mm256_unpackhi_epi16(t5, t2);
        let t8 = _mm256_madd_epi16(t8, PW_F050_MF256_F060_MF089());
        let t5 = _mm256_madd_epi16(t5, PW_F050_MF256_F060_MF089());
        let t8 = _mm256_add_epi32(t8, t6);
        let t5 = _mm256_add_epi32(t5, t7);
        let t8 = descale(first_pass, t8);
        let t5 = descale(first_pass, t5);
        let t2 = _mm256_packs_epi32(t8, t5);

        (t1, t2, t3, t4)
    }

    // Load the block as four row pairs: (r0,r1), (r2,r3), (r4,r5), (r6,r7).
    let ymm4 = avx_load(&data[0..16]);
    let ymm5 = avx_load(&data[16..32]);
    let ymm6 = avx_load(&data[32..48]);
    let ymm7 = avx_load(&data[48..64]);

    // Regroup into (r0,r4), (r1,r5), (r2,r6), (r3,r7) for the transpose.
    let ymm0 = _mm256_permute2x128_si256(ymm4, ymm6, 0x20);
    let ymm1 = _mm256_permute2x128_si256(ymm4, ymm6, 0x31);
    let ymm2 = _mm256_permute2x128_si256(ymm5, ymm7, 0x20);
    let ymm3 = _mm256_permute2x128_si256(ymm5, ymm7, 0x31);

    // Pass 1: transpose so each row's samples span the four registers,
    // then transform the rows.
    let (ymm0, ymm1, ymm2, ymm3) = do_transpose(ymm0, ymm1, ymm2, ymm3);
    let (ymm0, ymm1, ymm2, ymm3) = do_dct(true, ymm0, ymm1, ymm2, ymm3);

    // Rearrange the pass-1 output rows for the second transpose.
    let ymm4 = _mm256_permute2x128_si256(ymm1, ymm3, 0x20);
    let ymm1 = _mm256_permute2x128_si256(ymm1, ymm3, 0x31);

    // Pass 2: transform the columns.
    let (ymm0, ymm1, ymm2, ymm4) = do_transpose(ymm0, ymm1, ymm2, ymm4);
    let (ymm0, ymm1, ymm2, ymm4) = do_dct(false, ymm0, ymm1, ymm2, ymm4);

    // Interleave back into row-major coefficient order and store in place.
    let ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x30);
    let ymm5 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20);
    let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31);
    let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21);
    avx_store(ymm3, &mut data[0..16]);
    avx_store(ymm5, &mut data[16..32]);
    avx_store(ymm6, &mut data[32..48]);
    avx_store(ymm7, &mut data[48..64]);
}
/// Reinterpret a 16-element `i16` slice as a 256-bit vector.
///
/// The load is unaligned (`loadu`), so `input` needs no particular
/// alignment; it must be exactly 16 elements long.
#[target_feature(enable = "avx2")]
#[inline]
fn avx_load(input: &[i16]) -> __m256i {
    assert!(input.len() == 16);
    assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
    let src = input.as_ptr().cast::<__m256i>();
    // SAFETY: the length assert above guarantees `input` covers exactly the
    // 32 bytes read by the unaligned load.
    unsafe { _mm256_loadu_si256(src) }
}
/// Write a 256-bit vector into a 16-element `i16` slice.
///
/// The store is unaligned (`storeu`), so `output` needs no particular
/// alignment; it must be exactly 16 elements long.
#[target_feature(enable = "avx2")]
#[inline]
fn avx_store(input: __m256i, output: &mut [i16]) {
    assert!(output.len() == 16);
    assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
    let dst = output.as_mut_ptr().cast::<__m256i>();
    // SAFETY: the length assert above guarantees `output` provides exactly
    // the 32 bytes written by the unaligned store.
    unsafe { _mm256_storeu_si256(dst, input) }
}