use std::arch::x86_64::*;
use std::iter::Sum;
use std::ops::{Add, Div, Index, Mul, Neg, Sub};
use crate::*;
/// Four `f32` lanes stored as a 16-byte-aligned array.
///
/// The `align(16)` representation is what allows the aligned SSE
/// load/store intrinsics (`_mm_load_ps` / `_mm_store_ps`) used by
/// `Vec4::load` and `Vec4::store` to operate directly on the inner array.
#[derive(Clone, Copy)]
#[repr(C, align(16))]
pub struct Vec4(pub [f32; 4]);
impl std::fmt::Debug for Vec4 {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_tuple("Vec4").field(&self.0).finish()
}
}
impl PartialEq for Vec4 {
fn eq(&self, other: &Self) -> bool {
self.0 == other.0
}
}
/// Core constructors and lane-wise math for [`Vec4`].
///
/// NOTE(review): several methods call SSE4.1 (`_mm_round_ps`) or FMA
/// (`_mm_fmadd_ps`) intrinsics from safe functions; presumably the crate is
/// only built with those target features enabled — confirm.
impl Vec4 {
    /// Vector with every lane equal to `0.0`.
    pub const ZERO: Self = Vec4([0.0; 4]);
    /// Vector with every lane equal to `1.0`.
    pub const ONE: Self = Vec4([1.0; 4]);

    /// Reads the four lanes into an SSE register.
    #[inline(always)]
    pub(crate) fn load(&self) -> __m128 {
        let lanes: *const f32 = self.0.as_ptr();
        // SAFETY: `Vec4` is `repr(C, align(16))`, so the pointer is 16-byte
        // aligned and addresses four readable `f32`s.
        unsafe { _mm_load_ps(lanes) }
    }

    /// Writes an SSE register out as a `Vec4`.
    #[inline(always)]
    pub(crate) fn store(v: __m128) -> Self {
        let mut out = Self::ZERO;
        // SAFETY: `out` is 16-byte aligned and holds exactly four `f32`s.
        unsafe { _mm_store_ps(out.0.as_mut_ptr(), v) };
        out
    }

    /// Broadcasts `val` to all four lanes.
    #[inline(always)]
    pub const fn splat(val: f32) -> Self {
        Vec4([val, val, val, val])
    }

    /// Lane-wise fused multiply-add: `self * a + b`.
    #[inline(always)]
    pub fn mul_add(self, a: Self, b: Self) -> Self {
        let (x, y, z) = (self.load(), a.load(), b.load());
        unsafe { Self::store(_mm_fmadd_ps(x, y, z)) }
    }

    /// Lane-wise absolute value, implemented by clearing each sign bit.
    #[inline(always)]
    pub fn abs(self) -> Self {
        let x = self.load();
        unsafe {
            // `!sign_bit & x` keeps every bit except the sign.
            let sign_bit = _mm_set1_ps(-0.0);
            Self::store(_mm_andnot_ps(sign_bit, x))
        }
    }

    /// Lane-wise square root.
    #[inline(always)]
    pub fn sqrt(self) -> Self {
        let x = self.load();
        unsafe { Self::store(_mm_sqrt_ps(x)) }
    }

    /// Lane-wise rounding toward negative infinity.
    #[inline(always)]
    pub fn floor(self) -> Self {
        const MODE: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
        let x = self.load();
        unsafe { Self::store(_mm_round_ps::<MODE>(x)) }
    }

    /// Lane-wise rounding toward positive infinity.
    #[inline(always)]
    pub fn ceil(self) -> Self {
        const MODE: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
        let x = self.load();
        unsafe { Self::store(_mm_round_ps::<MODE>(x)) }
    }

    /// Lane-wise rounding to the nearest integer, with ties going away from
    /// zero.
    ///
    /// Each lane is truncated toward zero first; where the discarded
    /// fraction has magnitude `>= 0.5`, a `1.0` carrying the input's sign is
    /// added to the truncated value.
    #[inline(always)]
    pub fn round(self) -> Self {
        const TRUNCATE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        unsafe {
            let sign_bit = _mm_set1_ps(-0.0);
            let x = self.load();
            let truncated = _mm_round_ps::<TRUNCATE>(x);
            // Magnitude of the part that truncation threw away.
            let fraction = _mm_andnot_ps(sign_bit, _mm_sub_ps(x, truncated));
            let bump = _mm_cmpge_ps(fraction, _mm_set1_ps(0.5));
            // +1.0 or -1.0, matching the sign of the input lane.
            let step = _mm_or_ps(_mm_set1_ps(1.0), _mm_and_ps(x, sign_bit));
            let bumped = _mm_add_ps(truncated, step);
            let chosen = _mm_or_ps(
                _mm_and_ps(bump, bumped),
                _mm_andnot_ps(bump, truncated),
            );
            Self::store(chosen)
        }
    }
}
/// Higher-accuracy math for [`Vec4`]: the `*_u10` trig kernels, `expf_sse`,
/// and reductions that accumulate in `f64` before narrowing back to `f32`.
impl crate::precise::PreciseMath for Vec4 {
    /// Lane-wise sine via `sinf_u10_sse`.
    fn sin(self) -> Self {
        let x = self.load();
        unsafe { Self::store(sinf_u10_sse(x)) }
    }

    /// Lane-wise cosine via `cosf_u10_sse`.
    fn cos(self) -> Self {
        let x = self.load();
        unsafe { Self::store(cosf_u10_sse(x)) }
    }

    /// Lane-wise exponential via `expf_sse`.
    fn exp(self) -> Self {
        let x = self.load();
        unsafe { Self::store(expf_sse(x)) }
    }

    /// Sum of the four lanes, accumulated in `f64`.
    fn sum(self) -> f32 {
        unsafe { reduce_f64x4_to_f32(_mm256_cvtps_pd(self.load())) }
    }

    /// Dot product: lanes are widened to `f64`, multiplied, then summed.
    fn dot(self, other: Self) -> f32 {
        unsafe {
            let lhs = _mm256_cvtps_pd(self.load());
            let rhs = _mm256_cvtps_pd(other.load());
            reduce_f64x4_to_f32(_mm256_mul_pd(lhs, rhs))
        }
    }
}

/// Adds the four `f64` lanes of `v` as `(v0 + v2) + (v1 + v3)` and narrows
/// the total to `f32`.
#[inline(always)]
unsafe fn reduce_f64x4_to_f32(v: __m256d) -> f32 {
    let upper = _mm256_extractf128_pd::<1>(v);
    let lower = _mm256_castpd256_pd128(v);
    let pairs = _mm_add_pd(lower, upper);
    let odd = _mm_unpackhi_pd(pairs, pairs);
    let total = _mm_add_sd(pairs, odd);
    _mm_cvtss_f32(_mm_cvtsd_ss(_mm_setzero_ps(), total))
}
/// Faster math for [`Vec4`]: the `*_u35` trig kernels and reductions done
/// directly in `f32`.
impl crate::fast::FastMath for Vec4 {
    /// Lane-wise sine via `sinf_u35_sse`.
    fn sin(self) -> Self {
        let x = self.load();
        unsafe { Self::store(sinf_u35_sse(x)) }
    }

    /// Lane-wise cosine via `cosf_u35_sse`.
    fn cos(self) -> Self {
        let x = self.load();
        unsafe { Self::store(cosf_u35_sse(x)) }
    }

    /// Sum of the four lanes.
    fn sum(self) -> f32 {
        unsafe { reduce_f32x4(self.load()) }
    }

    /// Dot product: lane-wise multiply followed by a horizontal sum.
    fn dot(self, other: Self) -> f32 {
        unsafe { reduce_f32x4(_mm_mul_ps(self.load(), other.load())) }
    }
}

/// Adds the four `f32` lanes of `v` as `(v0 + v1) + (v2 + v3)`.
#[inline(always)]
unsafe fn reduce_f32x4(v: __m128) -> f32 {
    // [v1, v1, v3, v3]
    let odd = _mm_movehdup_ps(v);
    // lane 0 = v0 + v1, lane 2 = v2 + v3
    let pairs = _mm_add_ps(v, odd);
    // bring lane 2 down to lane 0
    let upper = _mm_movehl_ps(pairs, pairs);
    _mm_cvtss_f32(_mm_add_ss(pairs, upper))
}
/// Lane-wise sum of an iterator of vectors, starting from [`Vec4::ZERO`].
impl Sum for Vec4 {
    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
        let mut total = Self::ZERO;
        for item in iter {
            total = total + item;
        }
        total
    }
}
impl Index<usize> for Vec4 {
type Output = f32;
fn index(&self, index: usize) -> &Self::Output {
&self.0[index]
}
}
/// Lane-wise negation, implemented by flipping each sign bit.
impl Neg for Vec4 {
    type Output = Vec4;

    fn neg(self) -> Self::Output {
        let lanes = self.load();
        unsafe {
            // `i32::MIN` is 0x8000_0000: only the sign bit set, the same
            // bit pattern as `-0.0_f32`.
            let sign_bits = _mm_castsi128_ps(_mm_set1_epi32(i32::MIN));
            Self::store(_mm_xor_ps(lanes, sign_bits))
        }
    }
}
/// Lane-wise `Vec4 + Vec4`.
impl Add<Vec4> for Vec4 {
    type Output = Vec4;

    fn add(self, rhs: Vec4) -> Self::Output {
        let (left, right) = (self.load(), rhs.load());
        unsafe { Self::store(_mm_add_ps(left, right)) }
    }
}
/// Lane-wise `Vec4 - Vec4`.
impl Sub<Vec4> for Vec4 {
    type Output = Vec4;

    fn sub(self, rhs: Vec4) -> Self::Output {
        let (left, right) = (self.load(), rhs.load());
        unsafe { Self::store(_mm_sub_ps(left, right)) }
    }
}
/// Lane-wise `Vec4 * Vec4`.
impl Mul<Vec4> for Vec4 {
    type Output = Vec4;

    fn mul(self, rhs: Vec4) -> Self::Output {
        let (left, right) = (self.load(), rhs.load());
        unsafe { Self::store(_mm_mul_ps(left, right)) }
    }
}
/// Lane-wise `Vec4 / Vec4`.
impl Div<Vec4> for Vec4 {
    type Output = Vec4;

    fn div(self, rhs: Vec4) -> Self::Output {
        let (numerator, denominator) = (self.load(), rhs.load());
        unsafe { Self::store(_mm_div_ps(numerator, denominator)) }
    }
}
/// `Vec4 + f32`: the scalar is broadcast and added to every lane.
impl Add<f32> for Vec4 {
    type Output = Vec4;

    fn add(self, rhs: f32) -> Self::Output {
        let lanes = self.load();
        unsafe {
            let broadcast = _mm_set1_ps(rhs);
            Self::store(_mm_add_ps(lanes, broadcast))
        }
    }
}
/// `Vec4 - f32`: the scalar is broadcast and subtracted from every lane.
impl Sub<f32> for Vec4 {
    type Output = Vec4;

    fn sub(self, rhs: f32) -> Self::Output {
        let lanes = self.load();
        unsafe {
            let broadcast = _mm_set1_ps(rhs);
            Self::store(_mm_sub_ps(lanes, broadcast))
        }
    }
}
/// `Vec4 * f32`: every lane is multiplied by the broadcast scalar.
impl Mul<f32> for Vec4 {
    type Output = Vec4;

    fn mul(self, rhs: f32) -> Self::Output {
        let lanes = self.load();
        unsafe {
            let broadcast = _mm_set1_ps(rhs);
            Self::store(_mm_mul_ps(lanes, broadcast))
        }
    }
}
/// `Vec4 / f32`: every lane is divided by the broadcast scalar.
impl Div<f32> for Vec4 {
    type Output = Vec4;

    fn div(self, rhs: f32) -> Self::Output {
        let lanes = self.load();
        unsafe {
            let broadcast = _mm_set1_ps(rhs);
            Self::store(_mm_div_ps(lanes, broadcast))
        }
    }
}
/// `f32 + Vec4`: the scalar is broadcast and used as the left operand.
impl Add<Vec4> for f32 {
    type Output = Vec4;

    fn add(self, rhs: Vec4) -> Self::Output {
        let lanes = rhs.load();
        unsafe {
            let broadcast = _mm_set1_ps(self);
            Vec4::store(_mm_add_ps(broadcast, lanes))
        }
    }
}
/// `f32 - Vec4`: each result lane is `scalar - lane`.
impl Sub<Vec4> for f32 {
    type Output = Vec4;

    fn sub(self, rhs: Vec4) -> Self::Output {
        let lanes = rhs.load();
        unsafe {
            let broadcast = _mm_set1_ps(self);
            Vec4::store(_mm_sub_ps(broadcast, lanes))
        }
    }
}
/// `f32 * Vec4`: the scalar is broadcast and used as the left operand.
impl Mul<Vec4> for f32 {
    type Output = Vec4;

    fn mul(self, rhs: Vec4) -> Self::Output {
        let lanes = rhs.load();
        unsafe {
            let broadcast = _mm_set1_ps(self);
            Vec4::store(_mm_mul_ps(broadcast, lanes))
        }
    }
}
/// `f32 / Vec4`: each result lane is `scalar / lane`.
impl Div<Vec4> for f32 {
    type Output = Vec4;

    fn div(self, rhs: Vec4) -> Self::Output {
        let lanes = rhs.load();
        unsafe {
            let broadcast = _mm_set1_ps(self);
            Vec4::store(_mm_div_ps(broadcast, lanes))
        }
    }
}
impl From<[f32; 4]> for Vec4 {
fn from(arr: [f32; 4]) -> Self {
Vec4(arr)
}
}
impl From<Vec4> for [f32; 4] {
fn from(v: Vec4) -> Self {
v.0
}
}
/// Four lanes of two-part `f32` values: every lane carries a `hi` and a `lo`
/// component, which the `df_*` helpers in this file operate on together.
///
/// NOTE(review): presumably the represented value of a lane is `hi + lo`,
/// with `lo` holding the rounding error that `hi` could not represent
/// (inferred from how the `df_*` helpers combine the fields) — confirm.
#[derive(Clone, Copy)]
struct F2x4 {
// Leading (large) component of each lane.
hi: __m128,
// Trailing (small) component of each lane.
lo: __m128,
}
/// Re-splits a two-part value: the new `hi` is `a.hi + a.lo` and the new
/// `lo` is `(a.hi - hi) + a.lo`, i.e. what the rounded sum left behind.
#[inline(always)]
unsafe fn df_normalize_sse(a: F2x4) -> F2x4 {
    let F2x4 { hi: head, lo: tail } = a;
    let sum = _mm_add_ps(head, tail);
    let leftover = _mm_add_ps(_mm_sub_ps(head, sum), tail);
    F2x4 {
        hi: sum,
        lo: leftover,
    }
}
#[inline(always)]
unsafe fn df_add2_f_f_sse(x: __m128, y: __m128) -> F2x4 {
let s = _mm_add_ps(x, y);
let v = _mm_sub_ps(s, x);
F2x4 {
hi: s,
lo: _mm_add_ps(_mm_sub_ps(x, _mm_sub_ps(s, v)), _mm_sub_ps(y, v)),
}
}
/// Adds a plain vector `y` to a two-part value `x`.
///
/// `hi` is `x.hi + y`; `lo` is `((x.hi - hi) + y) + x.lo`.
/// NOTE(review): this shorter correction presumably relies on
/// `|x.hi| >= |y|` — confirm at the call sites.
#[inline(always)]
unsafe fn df_add_f2_f_sse(x: F2x4, y: __m128) -> F2x4 {
    let F2x4 { hi: head, lo: tail } = x;
    let sum = _mm_add_ps(head, y);
    let head_err = _mm_add_ps(_mm_sub_ps(head, sum), y);
    F2x4 {
        hi: sum,
        lo: _mm_add_ps(head_err, tail),
    }
}
/// Adds a plain vector `y` to a two-part value `x` using the longer
/// correction `(x.hi - (s - v)) + (y - v)` (with `s = x.hi + y`,
/// `v = s - x.hi`), then folds `x.lo` into the low part.
#[inline(always)]
unsafe fn df_add2_f2_f_sse(x: F2x4, y: __m128) -> F2x4 {
    let F2x4 { hi: head, lo: tail } = x;
    let sum = _mm_add_ps(head, y);
    let y_seen = _mm_sub_ps(sum, head);
    let head_seen = _mm_sub_ps(sum, y_seen);
    let head_err = _mm_sub_ps(head, head_seen);
    let y_err = _mm_sub_ps(y, y_seen);
    let err = _mm_add_ps(head_err, y_err);
    F2x4 {
        hi: sum,
        lo: _mm_add_ps(err, tail),
    }
}
/// Adds two two-part values.
///
/// The high parts are summed with the correction
/// `(x.hi - (s - v)) + (y.hi - v)` (with `s = x.hi + y.hi`, `v = s - x.hi`);
/// the low part of the result is that correction plus `x.lo + y.lo`.
#[inline(always)]
unsafe fn df_add2_f2_f2_sse(x: F2x4, y: F2x4) -> F2x4 {
    let sum = _mm_add_ps(x.hi, y.hi);
    let y_seen = _mm_sub_ps(sum, x.hi);
    let x_seen = _mm_sub_ps(sum, y_seen);
    let x_err = _mm_sub_ps(x.hi, x_seen);
    let y_err = _mm_sub_ps(y.hi, y_seen);
    let err = _mm_add_ps(x_err, y_err);
    let tails = _mm_add_ps(x.lo, y.lo);
    F2x4 {
        hi: sum,
        lo: _mm_add_ps(err, tails),
    }
}
/// Adds a two-part value `y` to a plain vector `x`.
///
/// `hi` is `x + y.hi`; `lo` is `((x - hi) + y.hi) + y.lo`.
/// NOTE(review): this shorter correction presumably relies on
/// `|x| >= |y.hi|` — confirm at the call sites.
#[inline(always)]
unsafe fn df_add_f_f2_sse(x: __m128, y: F2x4) -> F2x4 {
    let F2x4 { hi: head, lo: tail } = y;
    let sum = _mm_add_ps(x, head);
    let x_err = _mm_add_ps(_mm_sub_ps(x, sum), head);
    F2x4 {
        hi: sum,
        lo: _mm_add_ps(x_err, tail),
    }
}
/// Multiplies two two-part values.
///
/// `hi` is the rounded product `x.hi * y.hi`. `lo` collects, through fused
/// multiply-adds, the rounding error of that product
/// (`fmsub(x.hi, y.hi, hi)`) plus the cross terms `x.lo * y.hi` and
/// `x.hi * y.lo`; the `x.lo * y.lo` term is dropped.
#[inline(always)]
unsafe fn df_mul_f2_f2_sse(x: F2x4, y: F2x4) -> F2x4 {
    let product = _mm_mul_ps(x.hi, y.hi);
    let product_err = _mm_fmsub_ps(x.hi, y.hi, product);
    let with_lo_hi = _mm_fmadd_ps(x.lo, y.hi, product_err);
    let with_hi_lo = _mm_fmadd_ps(x.hi, y.lo, with_lo_hi);
    F2x4 {
        hi: product,
        lo: with_hi_lo,
    }
}
/// Squares a two-part value.
///
/// `hi` is the rounded `x.hi * x.hi`; `lo` is the rounding error of that
/// product plus the cross term `(x.hi + x.hi) * x.lo`, combined with a fused
/// multiply-add.
#[inline(always)]
unsafe fn df_squ_f2_sse(x: F2x4) -> F2x4 {
    let F2x4 { hi: head, lo: tail } = x;
    let square = _mm_mul_ps(head, head);
    let square_err = _mm_fmsub_ps(head, head, square);
    let doubled_head = _mm_add_ps(head, head);
    F2x4 {
        hi: square,
        lo: _mm_fmadd_ps(doubled_head, tail, square_err),
    }
}
/// Multiplies two two-part values and returns the product as a single
/// plain vector: `x.hi * y.hi + (x.lo * y.hi + x.hi * y.lo)`, evaluated with
/// fused multiply-adds from the innermost term outward.
#[inline(always)]
unsafe fn df_to_f_sse(x: F2x4, y: F2x4) -> __m128 {
    let hi_lo = _mm_mul_ps(x.hi, y.lo);
    let cross = _mm_fmadd_ps(x.lo, y.hi, hi_lo);
    _mm_fmadd_ps(x.hi, y.hi, cross)
}
/// Builds `2^q` per lane by writing `q + 127` into the `f32` exponent field
/// (bits 23 and up) and reinterpreting the integer lanes as floats.
///
/// Only meaningful while `q + 127` fits the 8-bit exponent of a normal
/// number; no range check is performed.
#[inline(always)]
unsafe fn vpow2i_sse(q: __m128i) -> __m128 {
    let biased = _mm_add_epi32(q, _mm_set1_epi32(0x7F));
    let exponent_bits = _mm_slli_epi32::<23>(biased);
    _mm_castsi128_ps(exponent_bits)
}
/// Scales `d` by `2^e` per lane in two steps: the exponent is split into
/// `e >> 1` (arithmetic shift) and the remainder `e - (e >> 1)`, and `d` is
/// multiplied by each power of two in turn.
#[inline(always)]
unsafe fn vldexp2_sse(d: __m128, e: __m128i) -> __m128 {
    let first_half = _mm_srai_epi32::<1>(e);
    let second_half = _mm_sub_epi32(e, first_half);
    let partly_scaled = _mm_mul_ps(d, vpow2i_sse(first_half));
    _mm_mul_ps(partly_scaled, vpow2i_sse(second_half))
}
/// Applies the scalar `rempif_scalar` to each of the four lanes of `d` and
/// repacks the three per-lane outputs: the first two become the `hi`/`lo`
/// parts of an [`F2x4`], the third becomes a lane of the integer vector.
///
/// NOTE(review): `rempif_scalar` is defined outside this file; presumably it
/// is a large-argument reduction returning a two-part remainder and a
/// quadrant number — confirm against its definition.
#[inline(always)]
unsafe fn rempif_sse(d: __m128) -> (F2x4, __m128i) {
    // 16-byte-aligned scratch buffers so aligned loads/stores can be used.
    #[repr(C, align(16))]
    struct FloatLanes([f32; 4]);
    #[repr(C, align(16))]
    struct IntLanes([i32; 4]);

    let mut input = FloatLanes([0.0; 4]);
    _mm_store_ps(input.0.as_mut_ptr(), d);

    let mut hi_out = FloatLanes([0.0; 4]);
    let mut lo_out = FloatLanes([0.0; 4]);
    let mut q_out = IntLanes([0; 4]);
    for (lane, &x) in input.0.iter().enumerate() {
        let (hi, lo, q) = rempif_scalar(x);
        hi_out.0[lane] = hi;
        lo_out.0[lane] = lo;
        q_out.0[lane] = q;
    }

    let reduced = F2x4 {
        hi: _mm_load_ps(hi_out.0.as_ptr()),
        lo: _mm_load_ps(lo_out.0.as_ptr()),
    };
    let quadrants = _mm_load_si128(q_out.0.as_ptr() as *const __m128i);
    (reduced, quadrants)
}
/// Flips the sign of each lane of `x` where the matching lane of `y` has its
/// sign bit set (equivalent to multiplying `x` by the sign of `y`).
#[inline(always)]
unsafe fn vmulsign_sse(x: __m128, y: __m128) -> __m128 {
    let sign_bit = _mm_set1_ps(-0.0);
    let sign_of_y = _mm_and_ps(y, sign_bit);
    _mm_xor_ps(x, sign_of_y)
}
/// Returns a per-lane mask that is all-ones exactly where `d` is `-0.0`.
///
/// The test compares raw bit patterns. A floating-point equality compare
/// cannot be used here because IEEE 754 treats `+0.0 == -0.0` as true, which
/// would flag positive zeros as well.
#[inline(always)]
unsafe fn visnegzero_sse(d: __m128) -> __m128 {
    let bits = _mm_castps_si128(d);
    let neg_zero_bits = _mm_castps_si128(_mm_set1_ps(-0.0));
    _mm_castsi128_ps(_mm_cmpeq_epi32(bits, neg_zero_bits))
}
/// Per-lane sine kernel working on two-part (`F2x4`) intermediates;
/// `PreciseMath::sin` for `Vec4` forwards here.
///
/// Outline: reduce the argument by a multiple of pi, evaluate an odd
/// polynomial around the reduced argument, then restore the sign from the
/// parity of the multiple. Lanes with `|d| >= TRIGRANGEMAX2F` (including NaN,
/// for which the `<` test is false) take a slower reduction via `rempif_sse`.
///
/// NOTE(review): the `u10` suffix presumably denotes a 1.0-ULP error target,
/// and `M_1_PI_F` / `PI_A2F` / `PI_B2F` / `PI_C2F` / `TRIGRANGEMAX2F` are
/// defined outside this file (presumably 1/pi, a three-part split of pi and
/// the fast-path limit) — confirm.
/// NOTE(review): uses `_mm_round_ps` and FMA-based helpers; the caller
/// presumably must guarantee the CPU supports them — confirm how the crate
/// gates this.
unsafe fn sinf_u10_sse(d: __m128) -> __m128 {
let neg_zero = _mm_set1_ps(-0.0);
let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFF_FFFF));
// Fast reduction: u = nearest integer to d * M_1_PI_F, q = the same as i32.
let u = _mm_round_ps(
_mm_mul_ps(d, _mm_set1_ps(M_1_PI_F)),
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
);
let q = _mm_cvtps_epi32(u);
// s = d - u*PI_A2F - u*PI_B2F - u*PI_C2F, carried as a hi/lo pair.
let v = _mm_fmadd_ps(u, _mm_set1_ps(-PI_A2F), d);
let mut s = df_add2_f_f_sse(v, _mm_mul_ps(u, _mm_set1_ps(-PI_B2F)));
s = df_add_f2_f_sse(s, _mm_mul_ps(u, _mm_set1_ps(-PI_C2F)));
let abs_d = _mm_and_ps(d, abs_mask);
let in_range = _mm_cmplt_ps(abs_d, _mm_set1_ps(TRIGRANGEMAX2F));
// The slow path is skipped only when all four lanes are in range.
let all_in_range = _mm_movemask_ps(in_range) == 0xF;
let (mut final_s, mut final_q) = (s, q);
if !all_in_range {
let (dfi, q2_raw) = rempif_sse(d);
// q2 = ((q2_raw & 3) * 2 + (dfi.hi > 0 ? 2 : 1)) >> 2
let q2_and = _mm_and_si128(q2_raw, _mm_set1_epi32(3));
let dfi_x_gt0 = _mm_cmpgt_ps(dfi.hi, _mm_setzero_ps());
let sel = _mm_castps_si128(dfi_x_gt0);
let mut q2 = _mm_add_epi32(
_mm_add_epi32(q2_and, q2_and),
_mm_or_si128(
_mm_and_si128(sel, _mm_set1_epi32(2)),
_mm_andnot_si128(sel, _mm_set1_epi32(1)),
),
);
q2 = _mm_srai_epi32(q2, 2);
// Where q2_raw is odd, add the two-part constant below (its sign flipped
// by the sign of dfi.hi) to the reduced argument.
let odd = _mm_cmpeq_epi32(_mm_and_si128(q2_raw, _mm_set1_epi32(1)), _mm_set1_epi32(1));
let odd_f = _mm_castsi128_ps(odd);
let half_pi_hi = _mm_set1_ps(3.1415927410125732422 * -0.5);
let half_pi_lo = _mm_set1_ps(-8.7422776573475857731e-08 * -0.5);
let pi_adj = F2x4 {
hi: vmulsign_sse(half_pi_hi, dfi.hi),
lo: vmulsign_sse(half_pi_lo, dfi.hi),
};
let adj = df_add2_f2_f2_sse(dfi, pi_adj);
let t_hi = _mm_or_ps(_mm_and_ps(odd_f, adj.hi), _mm_andnot_ps(odd_f, dfi.hi));
let t_lo = _mm_or_ps(_mm_and_ps(odd_f, adj.lo), _mm_andnot_ps(odd_f, dfi.lo));
let mut t = df_normalize_sse(F2x4 { hi: t_hi, lo: t_lo });
// Infinite or NaN inputs: OR-ing in the all-ones mask turns the lane into a
// NaN bit pattern, which then propagates through the polynomial.
let is_bad = _mm_or_ps(
_mm_cmpeq_ps(_mm_and_ps(d, abs_mask), _mm_set1_ps(f32::INFINITY)),
_mm_cmpunord_ps(d, d),
);
t.hi = _mm_or_ps(t.hi, is_bad);
// Per lane: keep the fast-path result where in range, else the slow one.
let in_range_i = _mm_castps_si128(in_range);
final_q = _mm_or_si128(
_mm_and_si128(in_range_i, q),
_mm_andnot_si128(in_range_i, q2),
);
final_s = F2x4 {
hi: _mm_or_ps(_mm_and_ps(in_range, s.hi), _mm_andnot_ps(in_range, t.hi)),
lo: _mm_or_ps(_mm_and_ps(in_range, s.lo), _mm_andnot_ps(in_range, t.lo)),
};
}
// t = reduced argument, s2 = its square (both two-part).
let t = final_s;
let s2 = df_squ_f2_sse(final_s);
// Horner evaluation in s2.hi of the polynomial's higher-order coefficients.
let mut u = _mm_set1_ps(2.6083159809786593541503e-06);
u = _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(-0.0001981069071916863322258));
u = _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(0.00833307858556509017944336));
let inner = F2x4 {
hi: _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(-0.166666597127914428710938)),
lo: _mm_setzero_ps(),
};
// x = 1 + inner * s2; result = t * x collapsed to a single f32 per lane.
let x = df_add_f_f2_sse(_mm_set1_ps(1.0), df_mul_f2_f2_sse(inner, s2));
let mut result = df_to_f_sse(t, x);
// Negate lanes whose final_q is odd.
let q_and_1 = _mm_cmpeq_epi32(_mm_and_si128(final_q, _mm_set1_epi32(1)), _mm_set1_epi32(1));
let sign_flip = _mm_and_ps(_mm_castsi128_ps(q_and_1), neg_zero);
result = _mm_xor_ps(result, sign_flip);
// Lanes flagged by visnegzero_sse return the input itself, so a zero input
// keeps its sign.
let is_neg_zero = visnegzero_sse(d);
result = _mm_or_ps(
_mm_and_ps(is_neg_zero, d),
_mm_andnot_ps(is_neg_zero, result),
);
result
}
/// Per-lane cosine kernel working on two-part (`F2x4`) intermediates;
/// `PreciseMath::cos` for `Vec4` forwards here.
///
/// Outline: reduce the argument by an odd multiple `dq` of the halved pi
/// constants, evaluate the same odd polynomial used by `sinf_u10_sse` around
/// the reduced argument, then pick the sign from bit 1 of the multiple.
/// Lanes with `|d| >= TRIGRANGEMAX2F` (including NaN) take a slower
/// reduction via `rempif_sse`.
///
/// NOTE(review): the `u10` suffix presumably denotes a 1.0-ULP error target,
/// and `M_1_PI_F` / `PI_A2F` / `PI_B2F` / `PI_C2F` / `TRIGRANGEMAX2F` are
/// defined outside this file — confirm their meaning there.
/// NOTE(review): uses `_mm_round_ps` and FMA intrinsics; the caller
/// presumably must guarantee the CPU supports them — confirm.
unsafe fn cosf_u10_sse(d: __m128) -> __m128 {
let neg_zero = _mm_set1_ps(-0.0);
let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFF_FFFF));
// dq = 2 * round(d * M_1_PI_F - 0.5) + 1, i.e. always an odd integer.
let dq = _mm_fmadd_ps(
_mm_round_ps(
_mm_fmadd_ps(d, _mm_set1_ps(M_1_PI_F), _mm_set1_ps(-0.5)),
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
),
_mm_set1_ps(2.0),
_mm_set1_ps(1.0),
);
let q = _mm_cvtps_epi32(dq);
// s = d - dq*(PI_A2F/2) - dq*(PI_B2F/2) - dq*(PI_C2F/2), as a hi/lo pair.
let mut s = df_add2_f_f_sse(d, _mm_mul_ps(dq, _mm_set1_ps(-PI_A2F * 0.5)));
s = df_add2_f2_f_sse(s, _mm_mul_ps(dq, _mm_set1_ps(-PI_B2F * 0.5)));
s = df_add2_f2_f_sse(s, _mm_mul_ps(dq, _mm_set1_ps(-PI_C2F * 0.5)));
let abs_d = _mm_and_ps(d, abs_mask);
let in_range = _mm_cmplt_ps(abs_d, _mm_set1_ps(TRIGRANGEMAX2F));
// The slow path is skipped only when all four lanes are in range.
let all_in_range = _mm_movemask_ps(in_range) == 0xF;
let (mut final_s, mut final_q) = (s, q);
if !all_in_range {
let (dfi, q2_raw) = rempif_sse(d);
// q2 = ((q2_raw & 3) * 2 + (dfi.hi > 0 ? 8 : 7)) >> 1
let q2_and = _mm_and_si128(q2_raw, _mm_set1_epi32(3));
let dfi_x_gt0 = _mm_cmpgt_ps(dfi.hi, _mm_setzero_ps());
let sel = _mm_castps_si128(dfi_x_gt0);
let mut q2 = _mm_add_epi32(
_mm_add_epi32(q2_and, q2_and),
_mm_or_si128(
_mm_and_si128(sel, _mm_set1_epi32(8)),
_mm_andnot_si128(sel, _mm_set1_epi32(7)),
),
);
q2 = _mm_srai_epi32(q2, 1);
// Where q2_raw is even, add the two-part constant below to the reduced
// argument; its sign is flipped when dfi.hi is not > 0 (y = -1.0 there,
// y = +0.0 otherwise, and only y's sign bit is used).
let even = _mm_cmpeq_epi32(
_mm_and_si128(q2_raw, _mm_set1_epi32(1)),
_mm_setzero_si128(),
);
let even_f = _mm_castsi128_ps(even);
let y = _mm_or_ps(
_mm_and_ps(dfi_x_gt0, _mm_setzero_ps()),
_mm_andnot_ps(dfi_x_gt0, _mm_set1_ps(-1.0)),
);
let half_pi_hi = _mm_set1_ps(3.1415927410125732422 * -0.5);
let half_pi_lo = _mm_set1_ps(-8.7422776573475857731e-08 * -0.5);
let pi_adj = F2x4 {
hi: vmulsign_sse(half_pi_hi, y),
lo: vmulsign_sse(half_pi_lo, y),
};
let adj = df_add2_f2_f2_sse(dfi, pi_adj);
let t_hi = _mm_or_ps(_mm_and_ps(even_f, adj.hi), _mm_andnot_ps(even_f, dfi.hi));
let t_lo = _mm_or_ps(_mm_and_ps(even_f, adj.lo), _mm_andnot_ps(even_f, dfi.lo));
let mut t = df_normalize_sse(F2x4 { hi: t_hi, lo: t_lo });
// Infinite or NaN inputs: OR-ing in the all-ones mask turns the lane into a
// NaN bit pattern, which then propagates through the polynomial.
let is_bad = _mm_or_ps(
_mm_cmpeq_ps(abs_d, _mm_set1_ps(f32::INFINITY)),
_mm_cmpunord_ps(d, d),
);
t.hi = _mm_or_ps(t.hi, is_bad);
// Per lane: keep the fast-path result where in range, else the slow one.
let in_range_i = _mm_castps_si128(in_range);
final_q = _mm_or_si128(
_mm_and_si128(in_range_i, q),
_mm_andnot_si128(in_range_i, q2),
);
final_s = F2x4 {
hi: _mm_or_ps(_mm_and_ps(in_range, s.hi), _mm_andnot_ps(in_range, t.hi)),
lo: _mm_or_ps(_mm_and_ps(in_range, s.lo), _mm_andnot_ps(in_range, t.lo)),
};
}
// t = reduced argument, s2 = its square (both two-part).
let t = final_s;
let s2 = df_squ_f2_sse(final_s);
// Horner evaluation in s2.hi of the polynomial's higher-order coefficients.
let mut u = _mm_set1_ps(2.6083159809786593541503e-06);
u = _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(-0.0001981069071916863322258));
u = _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(0.00833307858556509017944336));
let inner = F2x4 {
hi: _mm_fmadd_ps(u, s2.hi, _mm_set1_ps(-0.166666597127914428710938)),
lo: _mm_setzero_ps(),
};
// x = 1 + inner * s2; result = t * x collapsed to a single f32 per lane.
let x = df_add_f_f2_sse(_mm_set1_ps(1.0), df_mul_f2_f2_sse(inner, s2));
let mut result = df_to_f_sse(t, x);
// Negate lanes where bit 1 of final_q is clear.
let q_and_2_is_0 = _mm_cmpeq_epi32(
_mm_and_si128(final_q, _mm_set1_epi32(2)),
_mm_setzero_si128(),
);
let sign_flip = _mm_and_ps(_mm_castsi128_ps(q_and_2_is_0), neg_zero);
result = _mm_xor_ps(result, sign_flip);
result
}
/// Per-lane `e^d`; `PreciseMath::exp` for `Vec4` forwards here.
///
/// Steps: pick the integer `k = round(d * R_LN2F)`, subtract `k * L2UF` and
/// `k * L2LF` from `d` to get a small remainder `r`, evaluate a polynomial
/// in `r`, then scale the result by `2^k` with `vldexp2_sse`. Lanes with
/// `d < -104` are forced to `0.0`, lanes with `d > 100` to `+inf`.
///
/// NOTE(review): `R_LN2F`, `L2UF` and `L2LF` are defined outside this file
/// (presumably 1/ln 2 and a two-part split of ln 2) — confirm.
unsafe fn expf_sse(d: __m128) -> __m128 {
    // Polynomial coefficients after the leading one, highest degree first.
    const LOWER_COEFFS: [f32; 5] = [
        0.00139304355252534151077271,
        0.00833336077630519866943359,
        0.0416664853692054748535156,
        0.166666671633720397949219,
        0.5,
    ];

    // `_mm_cvtps_epi32` rounds using the current MXCSR rounding mode.
    let k = _mm_cvtps_epi32(_mm_mul_ps(d, _mm_set1_ps(R_LN2F)));
    let kf = _mm_cvtepi32_ps(k);
    let r_upper = _mm_fmadd_ps(kf, _mm_set1_ps(-L2UF), d);
    let r = _mm_fmadd_ps(kf, _mm_set1_ps(-L2LF), r_upper);

    // Horner evaluation of the polynomial in `r`.
    let mut poly = _mm_set1_ps(0.000198527617612853646278381);
    for &coeff in LOWER_COEFFS.iter() {
        poly = _mm_fmadd_ps(poly, r, _mm_set1_ps(coeff));
    }

    // 1 + (r + r^2 * poly), then multiply by 2^k.
    let r_squared = _mm_mul_ps(r, r);
    let mantissa = _mm_add_ps(_mm_set1_ps(1.0), _mm_fmadd_ps(r_squared, poly, r));
    let scaled = vldexp2_sse(mantissa, k);

    // Clamp the extremes: very negative inputs give 0, large inputs give +inf.
    let too_small = _mm_cmplt_ps(d, _mm_set1_ps(-104.0));
    let flushed = _mm_andnot_ps(too_small, scaled);
    let too_large = _mm_cmplt_ps(_mm_set1_ps(100.0), d);
    _mm_or_ps(
        _mm_and_ps(too_large, _mm_set1_ps(f32::INFINITY)),
        _mm_andnot_ps(too_large, flushed),
    )
}
/// Per-lane sine kernel using plain `f32` arithmetic; `FastMath::sin` for
/// `Vec4` forwards here.
///
/// The argument is reduced by an integer multiple `q` of pi using one of
/// three schemes, chosen per lane by magnitude:
///   * `|r| < TRIGRANGEMAX2F`: three-constant reduction (`PI_A2F..PI_C2F`);
///   * `|r| < TRIGRANGEMAXF`: four-constant reduction (`PI_AF..PI_DF`);
///   * otherwise (including NaN): `rempif_sse`.
/// The slower schemes are only evaluated when at least one lane needs them.
///
/// NOTE(review): the `u35` suffix presumably denotes a 3.5-ULP error target,
/// and the `PI_*` / `TRIGRANGEMAX*` / `M_1_PI_F` constants are defined
/// outside this file — confirm.
/// NOTE(review): uses FMA intrinsics; the caller presumably must guarantee
/// CPU support — confirm how the crate gates this.
unsafe fn sinf_u35_sse(d: __m128) -> __m128 {
let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFF_FFFF));
// Keep the original input; `d` is shadowed by the reduced argument below.
let r = d;
// q = d * M_1_PI_F converted to i32 (rounded per the current MXCSR mode).
let q = _mm_cvtps_epi32(_mm_mul_ps(d, _mm_set1_ps(M_1_PI_F)));
let u = _mm_cvtepi32_ps(q);
// First scheme: d - u*PI_A2F - u*PI_B2F - u*PI_C2F.
let mut d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_A2F), d);
d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_B2F), d);
d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_C2F), d);
let abs_r = _mm_and_ps(r, abs_mask);
let in_range1 = _mm_cmplt_ps(abs_r, _mm_set1_ps(TRIGRANGEMAX2F));
let all_in_range1 = _mm_movemask_ps(in_range1) == 0xF;
let (mut final_d, mut final_q) = (d, q);
if !all_in_range1 {
// Second scheme: same q, four-constant reduction from the original input.
let s = _mm_cvtepi32_ps(q);
let mut u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_AF), r);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_BF), u2);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_CF), u2);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_DF), u2);
final_d = _mm_or_ps(
_mm_and_ps(in_range1, d),
_mm_andnot_ps(in_range1, u2),
);
let in_range2 = _mm_cmplt_ps(abs_r, _mm_set1_ps(TRIGRANGEMAXF));
let all_in_range2 = _mm_movemask_ps(in_range2) == 0xF;
if !all_in_range2 {
// Third scheme: per-lane reduction through rempif_sse.
let (dfi, q2_raw) = rempif_sse(r);
// q2 = ((q2_raw & 3) * 2 + (dfi.hi > 0 ? 2 : 1)) >> 2
let q2_and = _mm_and_si128(q2_raw, _mm_set1_epi32(3));
let dfi_x_gt0 = _mm_cmpgt_ps(dfi.hi, _mm_setzero_ps());
let sel = _mm_castps_si128(dfi_x_gt0);
let mut q2 = _mm_add_epi32(
_mm_add_epi32(q2_and, q2_and),
_mm_or_si128(
_mm_and_si128(sel, _mm_set1_epi32(2)),
_mm_andnot_si128(sel, _mm_set1_epi32(1)),
),
);
q2 = _mm_srai_epi32(q2, 2);
// Where q2_raw is odd, add the two-part constant below (its sign flipped
// by the sign of dfi.hi) to the reduced argument.
let odd = _mm_cmpeq_epi32(
_mm_and_si128(q2_raw, _mm_set1_epi32(1)),
_mm_set1_epi32(1),
);
let odd_f = _mm_castsi128_ps(odd);
let half_pi_hi = _mm_set1_ps(3.1415927410125732422 * -0.5);
let half_pi_lo = _mm_set1_ps(-8.7422776573475857731e-08 * -0.5);
let pi_adj = F2x4 {
hi: vmulsign_sse(half_pi_hi, dfi.hi),
lo: vmulsign_sse(half_pi_lo, dfi.hi),
};
let adj = df_add2_f2_f2_sse(dfi, pi_adj);
let t_hi = _mm_or_ps(_mm_and_ps(odd_f, adj.hi), _mm_andnot_ps(odd_f, dfi.hi));
let t_lo = _mm_or_ps(_mm_and_ps(odd_f, adj.lo), _mm_andnot_ps(odd_f, dfi.lo));
// Collapse the two-part remainder to a single f32 per lane.
let mut u3 = _mm_add_ps(t_hi, t_lo);
// Infinite or NaN inputs: OR-ing in the all-ones mask yields a NaN bit
// pattern for that lane.
let is_bad = _mm_or_ps(
_mm_cmpeq_ps(abs_r, _mm_set1_ps(f32::INFINITY)),
_mm_cmpunord_ps(r, r),
);
u3 = _mm_or_ps(u3, is_bad);
// Only lanes outside the second range take the third scheme's q and d.
let in_range2_i = _mm_castps_si128(in_range2);
final_q = _mm_or_si128(
_mm_and_si128(in_range2_i, final_q),
_mm_andnot_si128(in_range2_i, q2),
);
final_d = _mm_or_ps(
_mm_and_ps(in_range2, final_d),
_mm_andnot_ps(in_range2, u3),
);
}
}
// s = reduced argument squared (taken before the sign flip below).
let s = _mm_mul_ps(final_d, final_d);
// Negate the reduced argument in lanes whose final_q is odd.
let q_and_1 = _mm_cmpeq_epi32(
_mm_and_si128(final_q, _mm_set1_epi32(1)),
_mm_set1_epi32(1),
);
let sign_flip = _mm_and_ps(_mm_castsi128_ps(q_and_1), _mm_set1_ps(-0.0));
final_d = _mm_xor_ps(final_d, sign_flip);
// Horner evaluation in s, then result = final_d + s * (u * final_d).
let mut u = _mm_set1_ps(2.6083159809786593541503e-06);
u = _mm_fmadd_ps(u, s, _mm_set1_ps(-0.0001981069071916863322258));
u = _mm_fmadd_ps(u, s, _mm_set1_ps(0.00833307858556509017944336));
u = _mm_fmadd_ps(u, s, _mm_set1_ps(-0.166666597127914428710938));
u = _mm_add_ps(_mm_mul_ps(s, _mm_mul_ps(u, final_d)), final_d);
// Lanes flagged by visnegzero_sse return the original input itself, so a
// zero input keeps its sign.
let is_neg_zero = visnegzero_sse(r);
u = _mm_or_ps(
_mm_and_ps(is_neg_zero, r),
_mm_andnot_ps(is_neg_zero, u),
);
u
}
/// Per-lane cosine kernel using plain `f32` arithmetic; `FastMath::cos` for
/// `Vec4` forwards here.
///
/// The argument is reduced by an odd integer multiple `q` of the halved pi
/// constants using one of three schemes, chosen per lane by magnitude:
///   * `|r| < TRIGRANGEMAX2F`: three-constant reduction (`PI_A2F..PI_C2F`);
///   * `|r| < TRIGRANGEMAXF`: four-constant reduction (`PI_AF..PI_DF`);
///   * otherwise (including NaN): `rempif_sse`.
/// The slower schemes are only evaluated when at least one lane needs them.
/// The same odd polynomial as in `sinf_u35_sse` is then applied.
///
/// NOTE(review): the `u35` suffix presumably denotes a 3.5-ULP error target,
/// and the `PI_*` / `TRIGRANGEMAX*` / `M_1_PI_F` constants are defined
/// outside this file — confirm.
/// NOTE(review): uses FMA intrinsics; the caller presumably must guarantee
/// CPU support — confirm how the crate gates this.
unsafe fn cosf_u35_sse(d: __m128) -> __m128 {
let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFF_FFFF));
// Keep the original input; `d` is shadowed by the reduced argument below.
let r = d;
// q = 2 * int(d * M_1_PI_F - 0.5) + 1, i.e. always an odd integer.
let q = _mm_cvtps_epi32(_mm_sub_ps(
_mm_mul_ps(d, _mm_set1_ps(M_1_PI_F)),
_mm_set1_ps(0.5),
));
let q = _mm_add_epi32(_mm_add_epi32(q, q), _mm_set1_epi32(1));
let u = _mm_cvtepi32_ps(q);
// First scheme: d - u*(PI_A2F/2) - u*(PI_B2F/2) - u*(PI_C2F/2).
let mut d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_A2F * 0.5), d);
d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_B2F * 0.5), d);
d = _mm_fmadd_ps(u, _mm_set1_ps(-PI_C2F * 0.5), d);
let abs_r = _mm_and_ps(r, abs_mask);
let in_range1 = _mm_cmplt_ps(abs_r, _mm_set1_ps(TRIGRANGEMAX2F));
let all_in_range1 = _mm_movemask_ps(in_range1) == 0xF;
let (mut final_d, mut final_q) = (d, q);
if !all_in_range1 {
// Second scheme: same q, four-constant reduction from the original input.
let s = _mm_cvtepi32_ps(q);
let mut u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_AF * 0.5), r);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_BF * 0.5), u2);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_CF * 0.5), u2);
u2 = _mm_fmadd_ps(s, _mm_set1_ps(-PI_DF * 0.5), u2);
final_d = _mm_or_ps(
_mm_and_ps(in_range1, d),
_mm_andnot_ps(in_range1, u2),
);
let in_range2 = _mm_cmplt_ps(abs_r, _mm_set1_ps(TRIGRANGEMAXF));
let all_in_range2 = _mm_movemask_ps(in_range2) == 0xF;
if !all_in_range2 {
// Third scheme: per-lane reduction through rempif_sse.
let (dfi, q2_raw) = rempif_sse(r);
// q2 = ((q2_raw & 3) * 2 + (dfi.hi > 0 ? 8 : 7)) >> 1
let q2_and = _mm_and_si128(q2_raw, _mm_set1_epi32(3));
let dfi_x_gt0 = _mm_cmpgt_ps(dfi.hi, _mm_setzero_ps());
let sel = _mm_castps_si128(dfi_x_gt0);
let mut q2 = _mm_add_epi32(
_mm_add_epi32(q2_and, q2_and),
_mm_or_si128(
_mm_and_si128(sel, _mm_set1_epi32(8)),
_mm_andnot_si128(sel, _mm_set1_epi32(7)),
),
);
q2 = _mm_srai_epi32(q2, 1);
// Where q2_raw is even, add the two-part constant below to the reduced
// argument; its sign is flipped when dfi.hi is not > 0 (y = -1.0 there,
// y = +0.0 otherwise, and only y's sign bit is used).
let even = _mm_cmpeq_epi32(
_mm_and_si128(q2_raw, _mm_set1_epi32(1)),
_mm_setzero_si128(),
);
let even_f = _mm_castsi128_ps(even);
let y = _mm_or_ps(
_mm_and_ps(dfi_x_gt0, _mm_setzero_ps()),
_mm_andnot_ps(dfi_x_gt0, _mm_set1_ps(-1.0)),
);
let half_pi_hi = _mm_set1_ps(3.1415927410125732422 * -0.5);
let half_pi_lo = _mm_set1_ps(-8.7422776573475857731e-08 * -0.5);
let pi_adj = F2x4 {
hi: vmulsign_sse(half_pi_hi, y),
lo: vmulsign_sse(half_pi_lo, y),
};
let adj = df_add2_f2_f2_sse(dfi, pi_adj);
let t_hi = _mm_or_ps(_mm_and_ps(even_f, adj.hi), _mm_andnot_ps(even_f, dfi.hi));
let t_lo = _mm_or_ps(_mm_and_ps(even_f, adj.lo), _mm_andnot_ps(even_f, dfi.lo));
// Collapse the two-part remainder to a single f32 per lane.
let mut u3 = _mm_add_ps(t_hi, t_lo);
// Infinite or NaN inputs: OR-ing in the all-ones mask yields a NaN bit
// pattern for that lane.
let is_bad = _mm_or_ps(
_mm_cmpeq_ps(abs_r, _mm_set1_ps(f32::INFINITY)),
_mm_cmpunord_ps(r, r),
);
u3 = _mm_or_ps(u3, is_bad);
// Only lanes outside the second range take the third scheme's q and d.
let in_range2_i = _mm_castps_si128(in_range2);
final_q = _mm_or_si128(
_mm_and_si128(in_range2_i, final_q),
_mm_andnot_si128(in_range2_i, q2),
);
final_d = _mm_or_ps(
_mm_and_ps(in_range2, final_d),
_mm_andnot_ps(in_range2, u3),
);
}
}
// s = reduced argument squared (taken before the sign flip below).
let s = _mm_mul_ps(final_d, final_d);
// Negate the reduced argument in lanes where bit 1 of final_q is clear.
let q_and_2_is_0 = _mm_cmpeq_epi32(
_mm_and_si128(final_q, _mm_set1_epi32(2)),
_mm_setzero_si128(),
);
let sign_flip = _mm_and_ps(_mm_castsi128_ps(q_and_2_is_0), _mm_set1_ps(-0.0));
final_d = _mm_xor_ps(final_d, sign_flip);
// Horner evaluation in s, then result = final_d + s * (u * final_d).
let mut u = _mm_set1_ps(2.6083159809786593541503e-06);
u = _mm_fmadd_ps(u, s, _mm_set1_ps(-0.0001981069071916863322258));
u = _mm_fmadd_ps(u, s, _mm_set1_ps(0.00833307858556509017944336));
u = _mm_fmadd_ps(u, s, _mm_set1_ps(-0.166666597127914428710938));
u = _mm_add_ps(_mm_mul_ps(s, _mm_mul_ps(u, final_d)), final_d);
u
}