use crate::common::{f_fmla, f_fmlaf};
/// Shared arctangent kernel for `f32`, generic over the `f64` multiply-add primitive.
///
/// `fma(a, b, c)` is called for every `a * b + c` step of the polynomial evaluation, so the
/// caller decides whether a hardware fused multiply-add or a software fallback is used.
///
/// The evaluation path is chosen from the raw bit pattern of `x`:
///
/// * NaN: returns `x + x`.
/// * `|x|` with bits `>= 0x4c700518` (about `6.29e7`, including infinities): returns pi/2
///   rounded to `f32`, carrying the sign of `x`.
/// * `x == +0.0` or `x == -0.0`: returned unchanged.
/// * `|x| < 2^-25`: returns `f_fmlaf(-x, |x|, x)`, i.e. `x - x*|x|`.
/// * `|x| < 2^-13`: returns `x - c*x^3` with `c` close to `1/3` (the first two terms of the
///   arctangent series).
/// * otherwise: a rational approximation `z * P(z^2) / Q(z^2)` evaluated in `f64`, with
///   `z = x` for `|x| < 1` and `z = 1/x` for `|x| >= 1`; in the latter case the result is
///   reflected through `sign(x) * pi/2`.
#[inline(always)]
fn atanf_gen_impl<Q: Fn(f64, f64, f64) -> f64>(x: f32, fma: Q) -> f32 {
    /// Evaluates `c[0] + c[1]*w + ... + c[6]*w^6` where `w = z2`, `w^2 = z4`, `w^4 = z8`.
    ///
    /// Terms are paired so that each step is exactly one `fma` call; the call sequence is
    /// the same for numerator and denominator.
    #[inline(always)]
    fn eval_poly<F: Fn(f64, f64, f64) -> f64>(
        c: &[u64; 7],
        z2: f64,
        z4: f64,
        z8: f64,
        fma: &F,
    ) -> f64 {
        let p01 = fma(z2, f64::from_bits(c[1]), f64::from_bits(c[0]));
        let p23 = fma(z2, f64::from_bits(c[3]), f64::from_bits(c[2]));
        let p45 = fma(z2, f64::from_bits(c[5]), f64::from_bits(c[4]));
        let p6 = f64::from_bits(c[6]);
        let p0123 = fma(z4, p23, p01);
        let p456 = fma(z4, p6, p45);
        fma(z8, p456, p0123)
    }

    // pi/2 as a single f64, used for the saturated result.
    const HALF_PI: f64 = f64::from_bits(0x3ff921fb54442d18);
    // pi/2 split into a coarse head (1.5625) and a small tail (~0.0082963); the tail is
    // combined with the quotient first, the head is added last.
    const PI_OVER2_H: f64 = f64::from_bits(0x3ff9000000000000);
    const PI_OVER2_L: f64 = f64::from_bits(0x3f80fdaa22168c23);
    // Bit patterns of the numerator coefficients (lowest power first).
    const NUM: [u64; 7] = [
        0x3fd51eccde075d67,
        0x3fea76bb5637f2f2,
        0x3fe81e0eed20de88,
        0x3fd376c8ca67d11d,
        0x3faaec7b69202ac6,
        0x3f69561899acc73e,
        0x3efbf9fa5b67e600,
    ];
    // Bit patterns of the denominator coefficients (lowest power first).
    const DEN: [u64; 7] = [
        0x3fd51eccde075d66,
        0x3fedfbdd7b392d28,
        0x3ff0000000000000,
        0x3fdfd22bf0e89b54,
        0x3fbd91ff8b576282,
        0x3f8653ea99fc9bb0,
        0x3f31e7fcc202340a,
    ];

    let bits = x.to_bits();
    let abs_bits = bits & 0x7fff_ffff;
    let biased_exp = (bits >> 23) & 0xff;

    // Huge magnitudes, infinities and NaN.
    if abs_bits >= 0x4c70_0518u32 {
        return if abs_bits > 0x7f80_0000u32 {
            // NaN: the addition propagates it.
            x + x
        } else {
            f32::copysign(HALF_PI as f32, x)
        };
    }

    // |x| < 2^-25, including both zeros.
    if biased_exp < 127 - 25 {
        return if abs_bits == 0 {
            // Keeps the sign of zero.
            x
        } else {
            // NOTE(review): f_fmlaf is presumably an f32 multiply-add a*b+c; confirm in
            // crate::common.
            f_fmlaf(-x, x.abs(), x)
        };
    }

    // 2^-25 <= |x| < 2^-13: x - c*x^3 with c close to 1/3.
    if biased_exp < 127 - 13 {
        let neg_third = (-f64::from_bits(0x3fd5555560000000)) as f32;
        return f_fmlaf(neg_third * x, x * x, x);
    }

    // For |x| >= 1 the approximation is evaluated at the reciprocal.
    let use_reciprocal = biased_exp >= 127;
    let wide = x as f64;
    let z = if use_reciprocal { 1.0 / wide } else { wide };

    let z2 = z * z;
    let z4 = z2 * z2;
    let z8 = z4 * z4;

    let numerator = eval_poly(&NUM, z2, z4, z8, &fma) * z;
    let denominator = eval_poly(&DEN, z2, z4, z8, &fma);
    let r = numerator / denominator;

    if use_reciprocal {
        // atan(x) = sign(x) * pi/2 - atan(1/x); z has the same sign as x.
        let tail = f64::copysign(PI_OVER2_L, z);
        let head = f64::copysign(PI_OVER2_H, z);
        ((tail - r) + head) as f32
    } else {
        r as f32
    }
}
/// x86 / x86_64 variant of the arctangent kernel: forwards to `atanf_gen_impl` with
/// `f64::mul_add` as the multiply-add primitive, compiled with the `avx` and `fma`
/// target features enabled for this function.
///
/// # Safety
///
/// The function is built with `avx` and `fma` enabled, so it must only be called on a CPU
/// that supports both features (for example after a successful runtime check with
/// `is_x86_feature_detected!`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn atanf_fma_impl(x: f32) -> f32 {
atanf_gen_impl(x, f64::mul_add)
}
/// Computes the arctangent of `x` in single precision.
///
/// On targets other than x86 / x86_64 this calls `atanf_gen_impl` with `f_fmla` directly.
/// On x86 / x86_64 the first call checks at runtime for the `avx` and `fma` CPU features,
/// caches the chosen implementation in a `OnceLock`, and every call dispatches through the
/// cached function pointer.
#[inline]
pub fn f_atanf(x: f32) -> f32 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        atanf_gen_impl(x, f_fmla)
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;

        type AtanfKernel = unsafe fn(f32) -> f32;

        // Portable fallback used when the CPU lacks AVX or FMA.
        fn def_atanf(x: f32) -> f32 {
            atanf_gen_impl(x, f_fmla)
        }

        static EXECUTOR: OnceLock<AtanfKernel> = OnceLock::new();

        let kernel = *EXECUTOR.get_or_init(|| {
            let has_avx_fma = std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma");
            if has_avx_fma {
                atanf_fma_impl
            } else {
                def_atanf
            }
        });

        // SAFETY: `atanf_fma_impl` is stored only after both `avx` and `fma` were detected
        // on the running CPU; the other possible value is the safe `def_atanf`.
        unsafe { kernel(x) }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `f_atanf` against reference arctangent values with an absolute
    /// tolerance of `1e-6`.
    #[test]
    fn f_atan_test() {
        // (input, expected arctangent)
        let cases: [(f32, f32); 3] = [
            (1.0, std::f32::consts::PI / 4f32),
            (2f32, 1.107148717794090503017065f32),
            (5f32, 1.3734007669450158608612719264f32),
        ];

        for (input, expected) in cases {
            let got = f_atanf(input);
            assert!((got - expected).abs() < 1e-6, "Invalid result {}", got);
        }
    }
}