use std::arch::x86_64::*;
#[inline(always)]
pub(crate) fn _mm256_fma_pd<const FMA: bool>(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
unsafe {
if FMA {
_mm256_fmadd_pd(b, c, a)
} else {
_mm256_add_pd(_mm256_mul_pd(b, c), a)
}
}
}
#[inline(always)]
pub(crate) fn _mm_fma_pd<const FMA: bool>(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
unsafe {
if FMA {
_mm_fmadd_pd(b, c, a)
} else {
_mm_add_pd(_mm_mul_pd(b, c), a)
}
}
}
#[inline(always)]
pub(crate) fn _mm256_fma_ps<const FMA: bool>(a: __m256, b: __m256, c: __m256) -> __m256 {
unsafe {
if FMA {
_mm256_fma_psx(a, b, c)
} else {
_mm256_add_ps(_mm256_mul_ps(b, c), a)
}
}
}
#[inline(always)]
fn _mm256_fma_psx(a: __m256, b: __m256, c: __m256) -> __m256 {
unsafe { _mm256_fmadd_ps(b, c, a) }
}
#[inline(always)]
pub(crate) const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
}
#[inline(always)]
pub(crate) fn _mm256_select_si256(
mask: __m256i,
true_vals: __m256i,
false_vals: __m256i,
) -> __m256i {
unsafe { _mm256_blendv_epi8(false_vals, true_vals, mask) }
}
#[inline(always)]
pub(crate) fn _mm256_selecti_ps(mask: __m256i, true_vals: __m256, false_vals: __m256) -> __m256 {
unsafe { _mm256_blendv_ps(false_vals, true_vals, _mm256_castsi256_ps(mask)) }
}
#[inline(always)]
pub(crate) fn avx2_div_by255(v: __m256i) -> __m256i {
unsafe {
let addition = _mm256_set1_epi16(127);
let j0 = _mm256_add_epi16(v, addition);
let j1 = _mm256_srli_epi16::<8>(v);
_mm256_srli_epi16::<8>(_mm256_add_epi16(j0, j1))
}
}
#[cfg(feature = "nightly_f16")]
#[inline(always)]
pub(crate) fn avx_interleave_rgba_epi16(
a: __m256i,
b: __m256i,
c: __m256i,
d: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
unsafe {
let bg0 = _mm256_unpacklo_epi16(a, b);
let bg1 = _mm256_unpackhi_epi16(a, b);
let ra0 = _mm256_unpacklo_epi16(c, d);
let ra1 = _mm256_unpackhi_epi16(c, d);
let bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
let bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
let bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
let bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
let bgra0 = _mm256_permute2x128_si256::<32>(bgra0_, bgra1_);
let bgra2 = _mm256_permute2x128_si256::<49>(bgra0_, bgra1_);
let bgra1 = _mm256_permute2x128_si256::<32>(bgra2_, bgra3_);
let bgra3 = _mm256_permute2x128_si256::<49>(bgra2_, bgra3_);
(bgra0, bgra1, bgra2, bgra3)
}
}
#[cfg(feature = "nightly_f16")]
#[inline(always)]
pub(crate) fn avx_deinterleave_rgba_epi16(
a: __m256i,
b: __m256i,
c: __m256i,
d: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
unsafe {
let sh = _mm256_setr_epi8(
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5,
12, 13, 6, 7, 14, 15,
);
let p0 = _mm256_shuffle_epi8(a, sh);
let p1 = _mm256_shuffle_epi8(b, sh);
let p2 = _mm256_shuffle_epi8(c, sh);
let p3 = _mm256_shuffle_epi8(d, sh);
let p01l = _mm256_unpacklo_epi32(p0, p1);
let p01h = _mm256_unpackhi_epi32(p0, p1);
let p23l = _mm256_unpacklo_epi32(p2, p3);
let p23h = _mm256_unpackhi_epi32(p2, p3);
let pll = _mm256_permute2x128_si256::<32>(p01l, p23l);
let plh = _mm256_permute2x128_si256::<49>(p01l, p23l);
let phl = _mm256_permute2x128_si256::<32>(p01h, p23h);
let phh = _mm256_permute2x128_si256::<49>(p01h, p23h);
let b0 = _mm256_unpacklo_epi32(pll, plh);
let g0 = _mm256_unpackhi_epi32(pll, plh);
let r0 = _mm256_unpacklo_epi32(phl, phh);
let a0 = _mm256_unpackhi_epi32(phl, phh);
(b0, g0, r0, a0)
}
}
#[inline(always)]
pub(crate) fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i {
unsafe {
let packed = _mm256_packus_epi32(s_1, s_2);
const MASK: i32 = shuffle(3, 1, 2, 0);
_mm256_permute4x64_epi64::<MASK>(packed)
}
}
#[inline(always)]
pub(crate) fn avx2_pack_s32(s_1: __m256i, s_2: __m256i) -> __m256i {
unsafe {
let packed = _mm256_packs_epi32(s_1, s_2);
const MASK: i32 = shuffle(3, 1, 2, 0);
_mm256_permute4x64_epi64::<MASK>(packed)
}
}
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 {
unsafe { _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi) }
}
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i {
unsafe {
_mm256_castps_si256(_mm256_insertf128_ps::<1>(
_mm256_castps128_ps256(_mm_castsi128_ps(lo)),
_mm_castsi128_ps(hi),
))
}
}
#[inline]
pub(crate) fn _mm256_srai_epi64x<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
let m = _mm256_set1_epi64x(1 << (64 - 1));
let x = _mm256_srli_epi64::<IMM8>(a);
_mm256_sub_epi64(_mm256_xor_si256(x, m), m)
}
}
#[inline(always)]
pub(crate) fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
unsafe {
const SHUFFLE_1: i32 = shuffle(2, 0, 2, 0);
let combined =
_mm256_shuffle_ps::<SHUFFLE_1>(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b));
const SHUFFLE_2: i32 = shuffle(3, 1, 2, 0);
let ordered = _mm256_permute4x64_pd::<SHUFFLE_2>(_mm256_castps_pd(combined));
_mm256_castpd_si256(ordered)
}
}
#[inline]
#[allow(dead_code)]
pub(crate) fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i {
unsafe {
let vf = _mm256_castsi256_ps(v);
let hi = _mm256_extractf128_ps::<1>(vf);
let lo = _mm256_castps256_ps128(vf);
const FLAGS: i32 = shuffle(2, 0, 2, 0);
let packed = _mm_shuffle_ps::<FLAGS>(lo, hi);
_mm_castps_si128(packed)
}
}
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn _mm256_dot16_avx_epi32<const HAS_DOT: bool>(
a: __m256i,
b: __m256i,
c: __m256i,
) -> __m256i {
unsafe {
#[cfg(feature = "avx512")]
{
if HAS_DOT {
_mm256_dpwssd_avx_epi32(a, b, c)
} else {
_mm256_add_epi32(a, _mm256_madd_epi16(b, c))
}
}
#[cfg(not(feature = "avx512"))]
{
_mm256_add_epi32(a, _mm256_madd_epi16(b, c))
}
}
}
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn _mm_dot16_avx_epi32<const HAS_DOT: bool>(
a: __m128i,
b: __m128i,
c: __m128i,
) -> __m128i {
unsafe {
#[cfg(feature = "avx512")]
{
if HAS_DOT {
_mm_dpwssd_avx_epi32(a, b, c)
} else {
_mm_add_epi32(a, _mm_madd_epi16(b, c))
}
}
#[cfg(not(feature = "avx512"))]
{
_mm_add_epi32(a, _mm_madd_epi16(b, c))
}
}
}
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn _mm_udot8_epi16<const DOT: bool>(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe {
#[cfg(feature = "avx512")]
if DOT {
_mm_dpbusd_avx_epi32(a, b, c)
} else {
_mm_adds_epi16(a, _mm_maddubs_epi16(b, c))
}
#[cfg(not(feature = "avx512"))]
{
_mm_adds_epi16(a, _mm_maddubs_epi16(b, c))
}
}
}
#[inline(always)]
pub(crate) fn _mm256_prefer_fma_ps<const FMA: bool>(a: __m256, b: __m256, c: __m256) -> __m256 {
unsafe {
if FMA {
_mm256_fmadd_ps(b, c, a)
} else {
_mm256_add_ps(_mm256_mul_ps(b, c), a)
}
}
}
#[inline(always)]
pub(crate) fn _mm_prefer_fma_ps<const FMA: bool>(a: __m128, b: __m128, c: __m128) -> __m128 {
unsafe {
if FMA {
_mm_fmadd_ps(b, c, a)
} else {
_mm_add_ps(_mm_mul_ps(b, c), a)
}
}
}
#[inline(always)]
pub(crate) fn _mm_prefer_fma_pd<const FMA: bool>(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
unsafe {
if FMA {
_mm_fmadd_pd(b, c, a)
} else {
_mm_add_pd(_mm_mul_pd(b, c), a)
}
}
}
#[inline(always)]
pub(crate) fn _mm_reduce_r_epi32<const PRECISION: i32>(x: __m128i) -> __m128i {
unsafe {
const FIRST_MASK: i32 = shuffle(1, 0, 3, 2);
let hi64 = _mm_shuffle_epi32::<FIRST_MASK>(x);
let sum64 = _mm_add_epi32(hi64, x);
const SM: i32 = shuffle(1, 0, 3, 2);
let hi32 = _mm_shufflelo_epi16::<SM>(sum64);
_mm_srai_epi32::<PRECISION>(_mm_add_epi32(sum64, hi32))
}
}
#[inline(always)]
pub(crate) fn _mm_hsum_ps(v: __m128) -> __m128 {
unsafe {
let mut shuf = _mm_movehdup_ps(v);
let sums = _mm_add_ps(v, shuf);
shuf = _mm_movehl_ps(shuf, sums);
_mm_add_ss(sums, shuf)
}
}
#[inline(always)]
pub(crate) fn _mm_hsum_pd(v: __m128d) -> __m128d {
unsafe {
let undef = _mm_undefined_ps();
let shuftmp = _mm_movehl_ps(undef, _mm_castpd_ps(v));
let shuf = _mm_castps_pd(shuftmp);
_mm_add_sd(v, shuf)
}
}