use crate::common::f_fmla;
#[inline(always)]
fn rapshon_refine_inv_cbrt(x: f64, a: f64) -> f64 {
x * f_fmla(-1. / 3. * a, x * x * x, 4. / 3.)
}
#[inline(always)]
#[allow(unused)]
fn rapshon_refine_inv_cbrt_fma(x: f64, a: f64) -> f64 {
x * f64::mul_add(-1. / 3. * a, x * x * x, 4. / 3.)
}
#[inline(always)]
fn halleys_div_free(x: f64, a: f64) -> f64 {
const K3: f64 = 2. / 9.;
const K2: f64 = 7. / 9.;
const K1: f64 = 14. / 9.;
let c = a * x * x * x;
let mut y = f_fmla(-K3, c, K2);
y = f_fmla(-c, y, K1);
y * x
}
#[inline(always)]
#[allow(unused)]
fn halleys_div_free_fma(x: f64, a: f64) -> f64 {
const K3: f64 = 2. / 9.;
const K2: f64 = 7. / 9.;
const K1: f64 = 14. / 9.;
let c = a * x * x * x;
let mut y = f64::mul_add(-K3, c, K2);
y = f64::mul_add(-c, y, K1);
y * x
}
#[inline(always)]
fn rcbrtf_gen_impl<Halley: Fn(f64, f64) -> f64, NewtonRaphson: Fn(f64, f64) -> f64>(
x: f32,
halley: Halley,
rapshon: NewtonRaphson,
) -> f32 {
let u = x.to_bits();
let au = u.wrapping_shl(1);
if au < (1u32 << 24) || au >= (0xffu32 << 24) {
if x.is_infinite() {
return if x.is_sign_negative() { -0.0 } else { 0.0 };
}
if au >= (0xffu32 << 24) {
return x + x;
}
if x == 0. {
return if x.is_sign_positive() {
f32::INFINITY
} else {
f32::NEG_INFINITY
};
}
}
let mut ui: u32 = x.to_bits();
let mut hx: u32 = ui & 0x7fffffff;
if hx < 0x00800000 {
if hx == 0 {
return x;
}
const TWO_EXP_24: f32 = f32::from_bits(0x4b800000);
ui = (x * TWO_EXP_24).to_bits();
hx = ui & 0x7fffffff;
const B: u32 = 0x54a21d2au32 + (8u32 << 23);
hx = B.wrapping_sub(hx / 3);
} else {
hx = 0x54a21d2au32.wrapping_sub(hx / 3);
}
ui &= 0x80000000;
ui |= hx;
let t = f32::from_bits(ui) as f64;
let dx = x as f64;
let mut t = halley(t, dx);
t = halley(t, dx);
t = rapshon(t, dx);
t as f32
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn rcbrtf_fma_impl(x: f32) -> f32 {
rcbrtf_gen_impl(x, halleys_div_free_fma, rapshon_refine_inv_cbrt_fma)
}
#[inline]
pub fn f_rcbrtf(x: f32) -> f32 {
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
{
rcbrtf_gen_impl(x, halleys_div_free, rapshon_refine_inv_cbrt)
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
use std::sync::OnceLock;
static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
let q = EXECUTOR.get_or_init(|| {
if std::arch::is_x86_feature_detected!("avx")
&& std::arch::is_x86_feature_detected!("fma")
{
rcbrtf_fma_impl
} else {
fn def_rcbrtf(x: f32) -> f32 {
rcbrtf_gen_impl(x, halleys_div_free, rapshon_refine_inv_cbrt)
}
def_rcbrtf
}
});
unsafe { q(x) }
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fcbrtf() {
assert_eq!(f_rcbrtf(0.0), f32::INFINITY);
assert_eq!(f_rcbrtf(-0.0), f32::NEG_INFINITY);
assert_eq!(f_rcbrtf(-27.0), -1. / 3.);
assert_eq!(f_rcbrtf(27.0), 1. / 3.);
assert_eq!(f_rcbrtf(64.0), 0.25);
assert_eq!(f_rcbrtf(-64.0), -0.25);
assert_eq!(f_rcbrtf(f32::NEG_INFINITY), -0.0);
assert_eq!(f_rcbrtf(f32::INFINITY), 0.0);
assert!(f_rcbrtf(f32::NAN).is_nan());
}
}