use core::arch::x86_64::*;
/// Multiplies two pairs of complex numbers stored as interleaved
/// `[re0, im0, re1, im1]` lanes.
///
/// For every pair the result is
/// `(l.re * r.re - l.im * r.im, l.im * r.re + l.re * r.im)`.
#[target_feature(enable = "sse2")]
pub(crate) fn complex_mul_sse2(left: __m128, right: __m128) -> __m128 {
    // Broadcast the real / imaginary part of each `right` element across its pair.
    let rhs_real = _mm_shuffle_ps::<0b10_10_00_00>(right, right);
    let rhs_imag = _mm_shuffle_ps::<0b11_11_01_01>(right, right);
    // `left` with re and im exchanged inside each pair: [im0, re0, im1, re1].
    let lhs_flipped = _mm_shuffle_ps::<0b10_11_00_01>(left, left);

    let direct = _mm_mul_ps(left, rhs_real);
    let crossed = _mm_mul_ps(lhs_flipped, rhs_imag);

    // Even lanes (real parts) need the difference, odd lanes (imaginary parts) the sum.
    let difference = _mm_sub_ps(direct, crossed);
    let total = _mm_add_ps(direct, crossed);

    // All-ones in lanes 1 and 3, zero in lanes 0 and 2.
    let odd_lanes = _mm_castsi128_ps(_mm_setr_epi32(0, -1, 0, -1));
    let imag_part = _mm_and_ps(odd_lanes, total);
    let real_part = _mm_andnot_ps(odd_lanes, difference);
    _mm_or_ps(imag_part, real_part)
}
/// Builds a vector whose odd lanes (1 and 3) hold `-0.0` and whose even lanes
/// hold `+0.0`, i.e. only the sign bit of every odd lane is set.
///
/// XOR-ing a vector with this mask flips the sign of its odd lanes.
#[target_feature(enable = "sse2")]
pub(crate) fn load_neg_imag_mask_sse2() -> __m128 {
    // Arguments are given in memory order (lane 0 first).
    _mm_setr_ps(0.0, -0.0, 0.0, -0.0)
}
/// Returns a vector with `1 / sqrt(2)` broadcast to all four lanes.
#[target_feature(enable = "sse2")]
pub(crate) fn load_scale_sse2() -> __m128 {
    _mm_set1_ps(core::f32::consts::FRAC_1_SQRT_2)
}
/// Rotates each interleaved complex value by a quarter turn.
///
/// The real and imaginary lanes of every pair are exchanged, then the result
/// is XOR-ed with `neg_imag_mask`, which flips the sign of whichever lanes
/// have their sign bit set in the mask.
///
/// NOTE(review): with a mask carrying sign bits in the odd lanes (what
/// `load_neg_imag_mask_sse2` produces) each `(re, im)` becomes `(im, -re)`,
/// i.e. a multiplication by `-i`; a mask with sign bits in the even lanes
/// yields `(-im, re)`, i.e. a multiplication by `+i`. Confirm the intended
/// convention against the callers.
#[target_feature(enable = "sse2")]
pub(crate) fn complex_mul_i_sse2(vec: __m128, neg_imag_mask: __m128) -> __m128 {
    // [re0, im0, re1, im1] -> [im0, re0, im1, re1]
    let pair_flipped = _mm_shuffle_ps::<0b10_11_00_01>(vec, vec);
    _mm_xor_ps(pair_flipped, neg_imag_mask)
}
/// Exchanges the real and imaginary lanes of each interleaved complex value
/// and scales them by `+sqrt3_2` (even lanes) and `-sqrt3_2` (odd lanes).
///
/// Each `(re, im)` pair therefore becomes `(sqrt3_2 * im, -sqrt3_2 * re)`.
///
/// NOTE(review): that is the product with `-i * sqrt3_2`, not `+i * sqrt3_2`;
/// confirm the sign convention expected by the callers. The parameter name
/// suggests the caller passes `sqrt(3) / 2`, but any factor works.
#[target_feature(enable = "sse2")]
pub(crate) fn complex_mul_sqrt3_i_sse2(vec: __m128, sqrt3_2: f32) -> __m128 {
    // Per-lane factors in memory order: [+s, -s, +s, -s].
    let factors = _mm_setr_ps(sqrt3_2, -sqrt3_2, sqrt3_2, -sqrt3_2);
    // [re0, im0, re1, im1] -> [im0, re0, im1, re1]
    let pair_flipped = _mm_shuffle_ps::<0b10_11_00_01>(vec, vec);
    _mm_mul_ps(pair_flipped, factors)
}
/// Computes `scale * (xy + (swap(xy) ^ sign_mask))` lane-wise, where
/// `swap` exchanges the two lanes of every `(x, y)` pair.
///
/// With a `sign_mask` that has sign bits in the odd lanes, each pair `(x, y)`
/// becomes `scale * (x + y, y - x)`; if `scale` is `1 / sqrt(2)` this equals
/// multiplying `x + iy` by `(1 - i) / sqrt(2)`.
#[target_feature(enable = "sse2")]
pub(crate) fn w8x_sse2(xy: __m128, sign_mask: __m128, scale: __m128) -> __m128 {
    // [x0, y0, x1, y1] -> [y0, x0, y1, x1], then flip the signs chosen by the mask.
    let flipped = _mm_shuffle_ps::<0b10_11_00_01>(xy, xy);
    let signed = _mm_xor_ps(flipped, sign_mask);
    _mm_mul_ps(scale, _mm_add_ps(xy, signed))
}
/// Computes `scale * ((swap(xy) ^ sign_mask) - xy)` lane-wise, where
/// `swap` exchanges the two lanes of every `(x, y)` pair.
///
/// With a `sign_mask` that has sign bits in the odd lanes, each pair `(x, y)`
/// becomes `scale * (y - x, -x - y)`; if `scale` is `1 / sqrt(2)` this equals
/// multiplying `x + iy` by `(-1 - i) / sqrt(2)`.
#[target_feature(enable = "sse2")]
pub(crate) fn v8x_sse2(xy: __m128, sign_mask: __m128, scale: __m128) -> __m128 {
    // [x0, y0, x1, y1] -> [y0, x0, y1, x1], then flip the signs chosen by the mask.
    let flipped = _mm_shuffle_ps::<0b10_11_00_01>(xy, xy);
    let signed = _mm_xor_ps(flipped, sign_mask);
    _mm_mul_ps(scale, _mm_sub_ps(signed, xy))
}