use crate::common::f_fmla;
/// Constant term added to the scaled arctangent, in units of π.
///
/// Indexed by `4 * sign_bit(y) + 2 * sign_bit(x) + (|y| > |x|)`.
static OFF: [f32; 8] = [0.0, 0.5, 1.0, 0.5, -0.0, -0.5, -1.0, -0.5];
/// `+1.0` / `-1.0` in single precision, indexed by a sign bit.
static SGNF: [f32; 2] = [1., -1.];
/// `+1.0` / `-1.0` in double precision, indexed by the `|y| > |x|` flag.
static SGN: [f64; 2] = [1., -1.];

/// Generic kernel computing `atan2(y, x) / π` for `f32` inputs.
///
/// `fma(a, b, c)` must return `a * b + c`; callers in this file pass either
/// `f64::mul_add` or the crate's `f_fmla`. Every multiply-add of the rational
/// approximation — numerator and denominator alike — goes through `fma`, so
/// the caller's choice applies uniformly.
///
/// Inputs that are answered without evaluating the approximation:
/// - either input NaN: returns `x + y`;
/// - both infinite: `±0.25` (`x > 0`) or `±0.75` (`x < 0`), signed like `y`;
/// - only `x` infinite: `±0` (`x > 0`) or `±1` (`x < 0`), signed like `y`;
/// - only `y` infinite: `±0.5`, signed like `y`;
/// - both zero: `0`, `1`, `-0` or `-1` depending on the two sign bits;
/// - `y` zero and `x` positive: `±0`, signed like `y`;
/// - `|x| == |y|`: `±0.25` or `±0.75` depending on the two sign bits.
#[inline(always)]
fn atan2pif_gen_impl<Q: Fn(f64, f64, f64) -> f64>(y: f32, x: f32, fma: Q) -> f32 {
    const ABS_MASK: u32 = 0x7fff_ffff;
    // Bit pattern of +infinity; anything larger (after masking the sign) is NaN.
    const INF_BITS: u32 = 0xff << 23;

    let ux: u32 = x.to_bits();
    let uy: u32 = y.to_bits();
    let ax = ux & ABS_MASK;
    let ay = uy & ABS_MASK;
    let x_sign = (ux >> 31) as usize;
    let y_sign = (uy >> 31) as usize;
    let x_neg = x_sign != 0;

    if ay >= INF_BITS || ax >= INF_BITS {
        if ay > INF_BITS || ax > INF_BITS {
            // At least one NaN: let the addition propagate it.
            return x + y;
        }
        let yinf = ay == INF_BITS;
        let xinf = ax == INF_BITS;
        if yinf && xinf {
            return if x_neg {
                0.75 * SGNF[y_sign]
            } else {
                0.25 * SGNF[y_sign]
            };
        }
        if xinf {
            return if x_neg {
                SGNF[y_sign]
            } else {
                0.0 * SGNF[y_sign]
            };
        }
        if yinf {
            return 0.5 * SGNF[y_sign];
        }
    }

    if ay == 0 {
        if ax == 0 {
            return OFF[y_sign * 4 + x_sign * 2];
        }
        if !x_neg {
            return 0.0 * SGNF[y_sign];
        }
        // y == ±0 with x < 0 falls through; the general path yields ±1.
    }

    if ax == ay {
        static S: [f32; 4] = [0.25, 0.75, -0.25, -0.75];
        return S[y_sign * 2 + x_sign];
    }

    // `gt` selects which ratio is formed so that |z| < 1:
    // z = y / x when |y| < |x|, z = x / y otherwise.
    let gt: usize = (ay > ax) as usize;
    let quadrant = y_sign * 4 + x_sign * 2 + gt;

    let zx = x as f64;
    let zy = y as f64;
    // Branch-free selection of numerator and denominator via 0/1 weights.
    static M: [f64; 2] = [0., 1.];
    let mut z = fma(M[gt], zx, M[1 - gt] * zy) / fma(M[gt], zy, M[1 - gt] * zx);

    // Numerator coefficients of the rational approximation in z^2.
    // CN[0] is 1/π, which is also the value of the ratio for tiny z.
    const CN: [u64; 7] = [
        0x3fd45f306dc9c883,
        0x3fe988d83a142ada,
        0x3fe747bebf492057,
        0x3fd2cc5645094ff3,
        0x3faa0521c711ab66,
        0x3f6881b8058b9a0d,
        0x3efb16ff514a0af0,
    ];
    // Denominator coefficients of the rational approximation in z^2.
    const CD: [u64; 7] = [
        0x3ff0000000000000,
        0x4006b8b143a3f6da,
        0x4008421201d18ed5,
        0x3ff8221d086914eb,
        0x3fd670657e3a07ba,
        0x3fa0f4951fd1e72d,
        0x3f4b3874b8798286,
    ];

    let mut r = f64::from_bits(CN[0]);
    let z2 = z * z;
    // The swapped ratio enters with a negative sign.
    z *= SGN[gt];

    // Only evaluate the full ratio when z^2 > 2^-54; below that `r` keeps its
    // constant term (presumably this also sidesteps underflow in z^4 / z^8).
    if z2 > f64::from_bits(0x3c90000000000000) {
        let z4 = z2 * z2;
        let z8 = z4 * z4;

        let mut cn0 = fma(z2, f64::from_bits(CN[1]), r);
        let cn2 = fma(z2, f64::from_bits(CN[3]), f64::from_bits(CN[2]));
        let mut cn4 = fma(z2, f64::from_bits(CN[5]), f64::from_bits(CN[4]));
        let cn6 = f64::from_bits(CN[6]);
        // Combine the partial sums with `fma`, mirroring the denominator below.
        cn0 = fma(z4, cn2, cn0);
        cn4 = fma(z4, cn6, cn4);
        cn0 = fma(z8, cn4, cn0);

        let mut cd0 = fma(z2, f64::from_bits(CD[1]), f64::from_bits(CD[0]));
        let cd2 = fma(z2, f64::from_bits(CD[3]), f64::from_bits(CD[2]));
        let mut cd4 = fma(z2, f64::from_bits(CD[5]), f64::from_bits(CD[4]));
        let cd6 = f64::from_bits(CD[6]);
        cd0 = fma(z4, cd2, cd0);
        cd4 = fma(z4, cd6, cd4);
        cd0 = fma(z8, cd4, cd0);

        r = cn0 / cd0;
    }

    fma(z, r, OFF[quadrant] as f64) as f32
}
/// Instantiation of the generic kernel compiled with AVX and FMA enabled,
/// using `f64::mul_add` as the multiply-add primitive.
///
/// # Safety
///
/// The caller must ensure the running CPU supports both the `avx` and `fma`
/// target features before calling this function.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn atan2pif_fma_impl(y: f32, x: f32) -> f32 {
    // Bind the function item itself (not a pointer) so the generic kernel is
    // still monomorphised directly over `f64::mul_add`.
    let fused_mul_add = f64::mul_add;
    atan2pif_gen_impl(y, x, fused_mul_add)
}
/// Computes `atan2(y, x) / π` in single precision.
///
/// On x86 / x86_64 the implementation is chosen once at run time: the
/// AVX+FMA build of the kernel when the CPU reports both features, otherwise
/// the portable build using `f_fmla`. Other architectures always use the
/// portable build.
#[inline]
pub fn f_atan2pif(y: f32, x: f32) -> f32 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        atan2pif_gen_impl(y, x, f_fmla)
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;

        type Kernel = unsafe fn(f32, f32) -> f32;

        /// Portable fallback used when AVX or FMA is unavailable.
        fn portable_atan2pif(y: f32, x: f32) -> f32 {
            atan2pif_gen_impl(y, x, f_fmla)
        }

        // Selected on first use and cached for every later call.
        static SELECTED: OnceLock<Kernel> = OnceLock::new();
        let kernel = SELECTED.get_or_init(|| {
            let has_avx_fma = std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma");
            let chosen: Kernel = if has_avx_fma {
                atan2pif_fma_impl
            } else {
                portable_atan2pif
            };
            chosen
        });
        // SAFETY: `atan2pif_fma_impl` is stored only after both `avx` and
        // `fma` were detected on this CPU; the other candidate is a safe fn.
        unsafe { kernel(y, x) }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks `f_atan2pif` against fixed `(y, x, expected)` triples.
    #[test]
    fn test_atan2pif() {
        let cases: [(f32, f32, f32); 3] = [
            (0.32131, 0.987565, 0.10012555),
            (532.32131, 12.987565, 0.49223542),
            (-754.32131, 12.987565, -0.494520042),
        ];
        for (y, x, expected) in cases {
            assert_eq!(f_atan2pif(y, x), expected);
        }
    }
}