use crate::sse::shuffle;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Computes `a + b * c` on each of the four `f32` lanes.
///
/// When the const parameter `FMA` is `true` the work is delegated to
/// [`_mm_fma_psx`]; otherwise a separate multiply followed by an add is used.
#[inline]
pub(crate) fn _mm_prefer_fma_ps<const FMA: bool>(a: __m128, b: __m128, c: __m128) -> __m128 {
    if FMA {
        return _mm_fma_psx(a, b, c);
    }
    // SAFETY: both intrinsics operate purely on register values; no memory is touched.
    unsafe {
        let product = _mm_mul_ps(b, c);
        _mm_add_ps(product, a)
    }
}
/// Fused multiply-add: returns `b * c + a` for every `f32` lane.
///
/// Note the argument order: the addend `a` comes first here, whereas
/// `_mm_fmadd_ps` takes it last.
#[inline]
fn _mm_fma_psx(a: __m128, b: __m128, c: __m128) -> __m128 {
    // SAFETY: register-only operation.
    // NOTE(review): `_mm_fmadd_ps` needs the FMA extension at runtime; this wrapper
    // does not check for it, so callers are presumably gated on detection - confirm.
    let fused = unsafe { _mm_fmadd_ps(b, c, a) };
    fused
}
/// Transposes a 4x4 block of `f32` values.
///
/// Output `k` holds lane `k` of `v0`, `v1`, `v2`, `v3` (in that order), so four
/// vectors that each carry one interleaved 4-channel pixel come back as four
/// per-channel vectors.
#[inline(always)]
pub(crate) fn sse_deinterleave_rgba_ps(
    v0: __m128,
    v1: __m128,
    v2: __m128,
    v3: __m128,
) -> (__m128, __m128, __m128, __m128) {
    // SAFETY: register-only SSE shuffles; no memory is accessed.
    unsafe {
        // First pass: pair up inputs 0/2 and 1/3.
        let even_lo = _mm_unpacklo_ps(v0, v2);
        let even_hi = _mm_unpackhi_ps(v0, v2);
        let odd_lo = _mm_unpacklo_ps(v1, v3);
        let odd_hi = _mm_unpackhi_ps(v1, v3);
        // Second pass: merge the pairs so every output collects one lane index.
        (
            _mm_unpacklo_ps(even_lo, odd_lo),
            _mm_unpackhi_ps(even_lo, odd_lo),
            _mm_unpacklo_ps(even_hi, odd_hi),
            _mm_unpackhi_ps(even_hi, odd_hi),
        )
    }
}
/// Interleaves four per-channel `f32` vectors into four 4-channel pixels.
///
/// Output `k` is `[v0[k], v1[k], v2[k], v3[k]]`, i.e. the 4x4 transpose of the
/// inputs.
#[inline(always)]
pub(crate) fn sse_interleave_rgba_ps(
    v0: __m128,
    v1: __m128,
    v2: __m128,
    v3: __m128,
) -> (__m128, __m128, __m128, __m128) {
    // SAFETY: register-only SSE shuffles; no memory is accessed.
    unsafe {
        // Lanes 0 and 1 of every channel end up in the `*_lo` temporaries,
        // lanes 2 and 3 in the `*_hi` ones.
        let first_third_lo = _mm_unpacklo_ps(v0, v2);
        let first_third_hi = _mm_unpackhi_ps(v0, v2);
        let second_fourth_lo = _mm_unpacklo_ps(v1, v3);
        let second_fourth_hi = _mm_unpackhi_ps(v1, v3);

        let pixel0 = _mm_unpacklo_ps(first_third_lo, second_fourth_lo);
        let pixel1 = _mm_unpackhi_ps(first_third_lo, second_fourth_lo);
        let pixel2 = _mm_unpacklo_ps(first_third_hi, second_fourth_hi);
        let pixel3 = _mm_unpackhi_ps(first_third_hi, second_fourth_hi);
        (pixel0, pixel1, pixel2, pixel3)
    }
}
/// Splits 64 interleaved bytes (16 four-byte pixels spread over four vectors)
/// into four planar vectors.
///
/// Treating the inputs as one contiguous 64-byte sequence, output `k` contains
/// bytes `k`, `k + 4`, `k + 8`, ... of that sequence, in order.
#[inline(always)]
pub(crate) fn sse_deinterleave_rgba(
    rgba0: __m128i,
    rgba1: __m128i,
    rgba2: __m128i,
    rgba3: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i) {
    // SAFETY: register-only shuffles; no memory is accessed.
    // NOTE(review): `_mm_shuffle_epi8` needs SSSE3 at runtime and this wrapper does
    // not check for it - callers are presumably feature-gated; confirm.
    unsafe {
        // 8-bit interleave of neighbouring input vectors.
        let b01_lo = _mm_unpacklo_epi8(rgba0, rgba1);
        let b01_hi = _mm_unpackhi_epi8(rgba0, rgba1);
        let b23_lo = _mm_unpacklo_epi8(rgba2, rgba3);
        let b23_hi = _mm_unpackhi_epi8(rgba2, rgba3);

        // 16-bit interleave: each 32-bit group now holds a single channel.
        let w0 = _mm_unpacklo_epi16(b01_lo, b23_lo);
        let w1 = _mm_unpackhi_epi16(b01_lo, b23_lo);
        let w2 = _mm_unpacklo_epi16(b01_hi, b23_hi);
        let w3 = _mm_unpackhi_epi16(b01_hi, b23_hi);

        // 32-bit interleaves gather the groups belonging to the same channel.
        let d02_lo = _mm_unpacklo_epi32(w0, w2);
        let d02_hi = _mm_unpackhi_epi32(w0, w2);
        let d13_lo = _mm_unpacklo_epi32(w1, w3);
        let d13_hi = _mm_unpackhi_epi32(w1, w3);

        // Final byte permutation (a 4x4 byte transpose) restores pixel order
        // inside every channel vector.
        #[rustfmt::skip]
        let restore_order = _mm_setr_epi8(
            0, 4,  8, 12,
            1, 5,  9, 13,
            2, 6, 10, 14,
            3, 7, 11, 15,
        );
        let plane0 = _mm_shuffle_epi8(_mm_unpacklo_epi32(d02_lo, d13_lo), restore_order);
        let plane1 = _mm_shuffle_epi8(_mm_unpackhi_epi32(d02_lo, d13_lo), restore_order);
        let plane2 = _mm_shuffle_epi8(_mm_unpacklo_epi32(d02_hi, d13_hi), restore_order);
        let plane3 = _mm_shuffle_epi8(_mm_unpackhi_epi32(d02_hi, d13_hi), restore_order);
        (plane0, plane1, plane2, plane3)
    }
}
/// Interleaves four planar byte vectors into 16 four-byte pixels.
///
/// Viewing the four returned vectors as one 64-byte sequence, byte `4 * i + k`
/// is lane `i` of the `k`-th argument (`r`, `g`, `b`, `a`).
#[inline(always)]
pub(crate) fn sse_interleave_rgba(
    r: __m128i,
    g: __m128i,
    b: __m128i,
    a: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i) {
    // SAFETY: register-only SSE2 shuffles; no memory is accessed.
    unsafe {
        // Byte pairs (r, g) and (b, a) for pixels 0..8 and 8..16.
        let first_pair_low = _mm_unpacklo_epi8(r, g);
        let second_pair_low = _mm_unpacklo_epi8(b, a);
        let first_pair_high = _mm_unpackhi_epi8(r, g);
        let second_pair_high = _mm_unpackhi_epi8(b, a);
        // Merging the pairs as 16-bit units yields complete 4-byte pixels.
        (
            _mm_unpacklo_epi16(first_pair_low, second_pair_low),
            _mm_unpackhi_epi16(first_pair_low, second_pair_low),
            _mm_unpacklo_epi16(first_pair_high, second_pair_high),
            _mm_unpackhi_epi16(first_pair_high, second_pair_high),
        )
    }
}
/// Horizontal sum of the four `f32` lanes of `v`.
///
/// The additions are grouped as `(v[0] + v[1]) + (v[2] + v[3])`.
#[inline(always)]
pub(crate) fn _mm_hsum_ps(v: __m128) -> f32 {
    // SAFETY: register-only operations; no memory is accessed.
    // NOTE(review): `_mm_movehdup_ps` needs SSE3 at runtime; not checked here.
    unsafe {
        // [v1, v1, v3, v3]
        let odd_lanes = _mm_movehdup_ps(v);
        // lane 0 = v0 + v1, lane 2 = v2 + v3
        let pair_sums = _mm_add_ps(v, odd_lanes);
        // Bring the (v2 + v3) partial sum down to lane 0.
        let upper_sum = _mm_movehl_ps(odd_lanes, pair_sums);
        _mm_cvtss_f32(_mm_add_ss(pair_sums, upper_sum))
    }
}
/// Splits 32 interleaved 16-bit values (8 four-channel pixels spread over four
/// vectors) into four planar vectors.
///
/// Treating the inputs as one contiguous sequence of 32 lanes, output `k`
/// contains lanes `k`, `k + 4`, `k + 8`, ... of that sequence, in order.
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn sse_deinterleave_rgba_epi16(
    rgba0: __m128i,
    rgba1: __m128i,
    rgba2: __m128i,
    rgba3: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i) {
    // SAFETY: register-only SSE2 shuffles; no memory is accessed.
    unsafe {
        // Stage 1: interleave vectors that are two apart (0 with 2, 1 with 3).
        let s1_02_lo = _mm_unpacklo_epi16(rgba0, rgba2);
        let s1_02_hi = _mm_unpackhi_epi16(rgba0, rgba2);
        let s1_13_lo = _mm_unpacklo_epi16(rgba1, rgba3);
        let s1_13_hi = _mm_unpackhi_epi16(rgba1, rgba3);

        // Stage 2: each result now carries two channels, four lanes apiece.
        let s2_even_lo = _mm_unpacklo_epi16(s1_02_lo, s1_13_lo);
        let s2_odd_lo = _mm_unpacklo_epi16(s1_02_hi, s1_13_hi);
        let s2_even_hi = _mm_unpackhi_epi16(s1_02_lo, s1_13_lo);
        let s2_odd_hi = _mm_unpackhi_epi16(s1_02_hi, s1_13_hi);

        // Stage 3: one full channel per vector.
        (
            _mm_unpacklo_epi16(s2_even_lo, s2_odd_lo),
            _mm_unpackhi_epi16(s2_even_lo, s2_odd_lo),
            _mm_unpacklo_epi16(s2_even_hi, s2_odd_hi),
            _mm_unpackhi_epi16(s2_even_hi, s2_odd_hi),
        )
    }
}
/// Interleaves four planar vectors of 16-bit lanes into 8 four-channel pixels.
///
/// Viewing the four returned vectors as one sequence of 32 lanes, lane
/// `4 * i + k` is lane `i` of the `k`-th argument (`a`, `b`, `c`, `d`).
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn sse_interleave_rgba_epi16(
    a: __m128i,
    b: __m128i,
    c: __m128i,
    d: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i) {
    // SAFETY: register-only SSE2 shuffles; no memory is accessed.
    unsafe {
        // Pair channel 0 with channel 2 and channel 1 with channel 3 ...
        let ac_lo = _mm_unpacklo_epi16(a, c);
        let ac_hi = _mm_unpackhi_epi16(a, c);
        let bd_lo = _mm_unpacklo_epi16(b, d);
        let bd_hi = _mm_unpackhi_epi16(b, d);

        // ... then weave the pairs together so the channels alternate a, b, c, d.
        let pixels_01 = _mm_unpacklo_epi16(ac_lo, bd_lo);
        let pixels_23 = _mm_unpackhi_epi16(ac_lo, bd_lo);
        let pixels_45 = _mm_unpacklo_epi16(ac_hi, bd_hi);
        let pixels_67 = _mm_unpackhi_epi16(ac_hi, bd_hi);
        (pixels_01, pixels_23, pixels_45, pixels_67)
    }
}
/// Horizontal sum of the eight 16-bit lanes of `x`.
///
/// The intermediate additions are performed on 16-bit lanes and the final
/// value is truncated to `i16`.
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn _mm_hsum_epi16(x: __m128i) -> i16 {
    // Immediate for the 32-bit lane shuffle below.
    // NOTE(review): `shuffle` is defined in `crate::sse`; presumably it follows the
    // `_MM_SHUFFLE(z, y, x, w)` argument order - confirm.
    const LANE_SELECT: i32 = shuffle(0, 0, 0, 1);
    // SAFETY: register-only operations; no memory is accessed.
    // NOTE(review): `_mm_hadd_epi16` needs SSSE3 at runtime; not checked here.
    unsafe {
        // Adjacent pairs: four partial sums, repeated in both 64-bit halves.
        let pair_sums = _mm_hadd_epi16(x, x);
        let rearranged = _mm_shuffle_epi32::<LANE_SELECT>(pair_sums);
        let quad_sums = _mm_add_epi16(pair_sums, rearranged);
        // One more pairwise add leaves the total in lane 0.
        let total = _mm_hadd_epi16(quad_sums, quad_sums);
        _mm_extract_epi16::<0>(total) as i16
    }
}
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn _mm_hsum_epi32(x: __m128i) -> i32 {
unsafe {
const FIRST_MASK: i32 = shuffle(1, 0, 3, 2);
let hi64 = _mm_shuffle_epi32::<FIRST_MASK>(x);
let sum64 = _mm_add_epi32(hi64, x);
const SM: i32 = shuffle(1, 0, 3, 2);
let hi32 = _mm_shufflelo_epi16::<SM>(sum64);
let sum32 = _mm_add_epi32(sum64, hi32);
_mm_cvtsi128_si32(sum32)
}
}
/// Widening multiply-accumulate: `a + madd(b, c)`.
///
/// `b` and `c` are multiplied as signed 16-bit lanes, adjacent products are
/// summed into 32-bit lanes (`_mm_madd_epi16`), and the result is added to the
/// 32-bit lanes of `a`.
#[inline(always)]
pub(crate) fn _mm_muladd_wide_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    // SAFETY: register-only SSE2 operations; no memory is accessed.
    unsafe {
        let pair_products = _mm_madd_epi16(b, c);
        _mm_add_epi32(a, pair_products)
    }
}
/// Arithmetic (sign-preserving) right shift of both 64-bit lanes by `IMM8` bits.
///
/// SSE2 only offers a logical 64-bit shift, so the sign extension is emulated:
/// after the logical shift the original sign bit sits at position `63 - IMM8`;
/// XOR-ing with a mask holding exactly that bit and then subtracting the mask
/// propagates it into all higher bits.
///
/// For `IMM8 >= 64` the logical shift produces zero for both the value and the
/// mask, so the result is zero (not a sign fill).
#[inline(always)]
pub(crate) fn _mm_srai_epi64x<const IMM8: i32>(a: __m128i) -> __m128i {
    // SAFETY: register-only SSE2 operations; no memory is accessed.
    unsafe {
        let sign_bit = _mm_set1_epi64x(i64::MIN);
        let shifted = _mm_srli_epi64::<IMM8>(a);
        // The mask must follow the sign bit to its post-shift position; an
        // unshifted mask cancels out and leaves a plain logical shift.
        let shifted_sign = _mm_srli_epi64::<IMM8>(sign_bit);
        _mm_sub_epi64(_mm_xor_si128(shifted, shifted_sign), shifted_sign)
    }
}
/// Packs the 64-bit lanes of `a` and `b` into one vector of four 32-bit lanes.
///
/// NOTE(review): despite the `packus` name no saturation is performed in this
/// function; assuming `shuffle` follows the `_MM_SHUFFLE` argument order, each
/// output lane is simply the low 32 bits of the corresponding 64-bit lane
/// (`a` lanes first, then `b`) - confirm against `crate::sse::shuffle`.
#[inline]
pub(crate) fn _mm_packus_epi64(a: __m128i, b: __m128i) -> __m128i {
    const GATHER_MASK: i32 = shuffle(3, 1, 2, 0);
    // SAFETY: register-only SSE/SSE2 operations; no memory is accessed.
    unsafe {
        let a_gathered = _mm_castsi128_ps(_mm_shuffle_epi32::<GATHER_MASK>(a));
        let b_gathered = _mm_castsi128_ps(_mm_shuffle_epi32::<GATHER_MASK>(b));
        // Low 64 bits of `a_gathered` followed by low 64 bits of `b_gathered`.
        let joined = _mm_movelh_ps(a_gathered, b_gathered);
        _mm_castps_si128(joined)
    }
}
/// Extracts one 64-bit lane of `d` as an `i64`.
///
/// `IMM == 0` selects the low lane; any other accepted value selects the high
/// lane.
///
/// * On `x86_64` lane 0 is read with `_mm_cvtsi128_si64` and lane 1 with
///   `_mm_extract_epi64` (the latter needs SSE4.1 at runtime, which is not
///   checked here).
/// * On 32-bit `x86`, where neither intrinsic exists, the lane is rebuilt from
///   its two 32-bit halves.
#[inline(always)]
pub(crate) fn _mm_extract_epi64x<const IMM: i32>(d: __m128i) -> i64 {
    // SAFETY: register-only operations; no memory is accessed.
    unsafe {
        #[cfg(target_arch = "x86_64")]
        {
            if IMM == 0 {
                _mm_cvtsi128_si64(d)
            } else {
                _mm_extract_epi64::<IMM>(d)
            }
        }
        #[cfg(target_arch = "x86")]
        {
            // Byte-shift the wanted 32-bit half down to lane 0, then read it.
            let (low, high) = if IMM == 0 {
                (
                    _mm_cvtsi128_si32(d),
                    _mm_cvtsi128_si32(_mm_srli_si128::<4>(d)),
                )
            } else {
                (
                    _mm_cvtsi128_si32(_mm_srli_si128::<8>(d)),
                    _mm_cvtsi128_si32(_mm_srli_si128::<12>(d)),
                )
            };
            // The low half must be zero-extended: `low as i64` would sign-extend
            // and overwrite the high half with ones whenever bit 31 is set.
            ((high as i64) << 32) | (low as u32 as i64)
        }
    }
}
/// Stores the three lowest 16-bit lanes of `a` to `ptr` (6 bytes in total).
///
/// Lanes 0 and 1 are written together as one unaligned 32-bit store, lane 2 as
/// a separate unaligned 16-bit store at element offset 2. `ptr` does not need
/// to be aligned.
///
/// NOTE(review): this is a safe function that writes through a raw pointer;
/// callers must guarantee that `ptr` is valid for writing three `u16` values.
/// `_mm_extract_epi32` additionally needs SSE4.1 at runtime.
#[inline]
pub(crate) fn _mm_store3_u16(ptr: *mut u16, a: __m128i) {
    // SAFETY: relies on the caller-provided pointer being valid for 6 bytes of
    // writes; both stores are explicitly unaligned.
    unsafe {
        let first_two = _mm_extract_epi32::<0>(a);
        let third = _mm_extract_epi16::<2>(a) as i16;
        ptr.cast::<i32>().write_unaligned(first_two);
        ptr.cast::<i16>().add(2).write_unaligned(third);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `_mm_hsum_epi16` must agree with a scalar sum over the same eight lanes.
    #[test]
    fn test_horizontal_add() {
        // Skip on CPUs without SSE4.1.
        if !std::arch::is_x86_feature_detected!("sse4.1") {
            return;
        }
        let lanes: [i16; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
        let expected: i16 = lanes.iter().sum();
        // SAFETY: `lanes` provides 16 readable bytes and the load is unaligned.
        let packed = unsafe { _mm_loadu_si128(lanes.as_ptr() as *const __m128i) };
        let reduced = _mm_hsum_epi16(packed);
        assert_eq!(reduced, expected);
    }
}