use crate::bits::get_exponent_f32;
use crate::common::f_fmla;
use crate::rounding::CpuRound;
/// Trigonometric argument reducer: computes `(y, k)` such that
/// `x * 32/pi ~= k + y` with `k` an integer and `y` the small residual,
/// so callers can index a `pi/32`-granularity table with `k` and evaluate
/// a polynomial in `y`.
#[derive(Debug, Copy, Clone)]
pub(crate) struct ArgumentReducer {
    // The argument to reduce. The reduction constants carry limited
    // precision, so `x` is presumably an f32 promoted to f64 (its
    // significand fits 24 bits) — TODO confirm against callers.
    pub(crate) x: f64,
    // f32 bit pattern of |x| (sign bit cleared); `reduce`/`reduce_fma`
    // compare it against f32 bit-pattern thresholds and feed it to
    // `get_exponent_f32`.
    pub(crate) x_abs: u32,
}
impl ArgumentReducer {
    /// Small-range reduction, hardware-FMA path.
    ///
    /// Computes `k = round(x * 32/pi)` and the residual
    /// `y = x * 32/pi - k`, returning `(y, k)`. `32/pi` is stored as
    /// three f64 parts of descending magnitude; the fused multiply-add
    /// recovers the rounding error of `x * hi - k` exactly, so only the
    /// first two parts are needed here.
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    ))]
    #[inline]
    pub(crate) fn reduce_small(self) -> (f64, i64) {
        // f64 bit patterns of a three-part (hi, mid, lo) split of 32/pi.
        const THIRTYTWO_OVER_PI: [u64; 3] =
            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).cpu_round();
        // Exact residual of x*hi - kd via FMA, then refine with the mid part.
        let mut y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
        (y, kd as i64)
    }

    /// Small-range reduction, no-FMA path.
    ///
    /// The 32/pi parts here carry fewer significant bits each (note the
    /// trailing zero bytes), presumably so each product `x * part` stays
    /// exact without a fused multiply-add — TODO confirm `f_fmla`'s
    /// software-emulation contract.
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    )))]
    #[inline(always)]
    pub(crate) fn reduce_small(self) -> (f64, i64) {
        const THIRTYTWO_OVER_PI: [u64; 3] =
            [0x40245f306e000000, 0xbe3b1bbeae000000, 0x3c63f84eb0000000];
        let prod = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
        let kd = prod.cpu_round();
        let mut y = prod - kd;
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
        // Safe saturating cast instead of `unsafe { to_int_unchecked }`:
        // identical for in-range values, defined (saturating) otherwise,
        // and consistent with the FMA variant above.
        (y, kd as i64)
    }

    /// Small-range reduction forced through `f64::mul_add`/`f64::round`
    /// regardless of target features (dispatched from `reduce_fma`).
    /// Same algorithm and constants as the hardware-FMA `reduce_small`.
    #[inline(always)]
    pub(crate) fn reduce_small_fma(self) -> (f64, i64) {
        const THIRTYTWO_OVER_PI: [u64; 3] =
            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).round();
        let mut y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
        // Safe saturating cast instead of `unsafe { to_int_unchecked }`:
        // identical for in-range values, defined (saturating) otherwise.
        (y, kd as i64)
    }

    /// Large-range reduction, hardware-FMA path.
    ///
    /// Uses a five-part split of 32/pi. `exponent` is the f32 exponent
    /// of |x| and selects which part carries the leading contribution and
    /// how much of the high product's low mantissa must be masked off
    /// before rounding. Note only `k_lo` is returned; the integer `k_hi`
    /// is discarded — presumably its contribution vanishes modulo the
    /// consumer's table period at these magnitudes (TODO confirm).
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    ))]
    #[inline]
    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
        const THIRTYTWO_OVER_PI: [u64; 5] = [
            0x40245f306dc9c883,
            0xbcc6b01ec5417056,
            0xb966447e493ad4ce,
            0x360e21c820ff28b2,
            0xb29508510ea79237,
        ];
        if exponent < 99 {
            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
            // For mid-size inputs, zero the low 12 mantissa bits of the
            // high product; for larger exponents keep all bits.
            prod_hi = f64::from_bits(
                prod_hi.to_bits()
                    & (if exponent < 55 {
                        0xfffffffffffff000
                    } else {
                        0xffffffffffffffff
                    }),
            );
            let k_hi = prod_hi.cpu_round();
            // Exact residual of x*hi - k_hi, then a second round/residual
            // pass against the mid part.
            let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
            let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
            let k_lo = prod_lo.cpu_round();
            let mut y = f_fmla(
                self.x,
                f64::from_bits(THIRTYTWO_OVER_PI[1]),
                truncated_prod - k_lo,
            );
            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
            return (y, k_lo as i64);
        }
        // exponent >= 99: x*hi is far above the integer part that matters,
        // so start from the second part of the split instead.
        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
        prod_hi = f64::from_bits(
            prod_hi.to_bits()
                & (if exponent < 110 {
                    0xfffffffffffff000
                } else {
                    0xffffffffffffffff
                }),
        );
        let k_hi = prod_hi.cpu_round();
        let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
        let k_lo = prod_lo.cpu_round();
        let mut y = f_fmla(
            self.x,
            f64::from_bits(THIRTYTWO_OVER_PI[2]),
            truncated_prod - k_lo,
        );
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
        (y, k_lo as i64)
    }

    /// Large-range reduction forced through `f64::mul_add`/`f64::round`
    /// (dispatched from `reduce_fma`). Mirrors the hardware-FMA
    /// `reduce_large` exactly; see that method for the algorithm notes.
    #[inline(always)]
    #[allow(unused)]
    pub(crate) fn reduce_large_fma(self, exponent: i32) -> (f64, i64) {
        const THIRTYTWO_OVER_PI: [u64; 5] = [
            0x40245f306dc9c883,
            0xbcc6b01ec5417056,
            0xb966447e493ad4ce,
            0x360e21c820ff28b2,
            0xb29508510ea79237,
        ];
        if exponent < 99 {
            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
            prod_hi = f64::from_bits(
                prod_hi.to_bits()
                    & (if exponent < 55 {
                        0xfffffffffffff000
                    } else {
                        0xffffffffffffffff
                    }),
            );
            let k_hi = prod_hi.round();
            let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
            let prod_lo =
                f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
            let k_lo = prod_lo.round();
            let mut y = f64::mul_add(
                self.x,
                f64::from_bits(THIRTYTWO_OVER_PI[1]),
                truncated_prod - k_lo,
            );
            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
            return (y, k_lo as i64);
        }
        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
        prod_hi = f64::from_bits(
            prod_hi.to_bits()
                & (if exponent < 110 {
                    0xfffffffffffff000
                } else {
                    0xffffffffffffffff
                }),
        );
        let k_hi = prod_hi.round();
        let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
        let prod_lo = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
        let k_lo = prod_lo.round();
        let mut y = f64::mul_add(
            self.x,
            f64::from_bits(THIRTYTWO_OVER_PI[2]),
            truncated_prod - k_lo,
        );
        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
        (y, k_lo as i64)
    }

    /// Large-range reduction, no-FMA path.
    ///
    /// 32/pi is split into eight narrower chunks; `idx` skips leading
    /// chunks whose product with `x` has its least-significant bits above
    /// the range that matters, then a two-stage round/residual pass runs
    /// on the next four chunks. Here both `k_hi` and `k_lo` contribute to
    /// the returned quotient.
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    )))]
    #[inline]
    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
        static THIRTYTWO_OVER_PI: [u64; 8] = [
            0x40245f306e000000,
            0xbe3b1bbeae000000,
            0x3c63f84eb0000000,
            0xba87056592000000,
            0x38bc0db62a000000,
            0xb6e4cd8778000000,
            0xb51bef806c000000,
            0x33363abdebbc561b,
        ];
        let mut idx = 0;
        // f32 significand width: the exponent of x's least-significant bit
        // is `exponent - FRACTION_LEN`.
        const FRACTION_LEN: i32 = 23;
        let x_lsb_exp_m4 = exponent - FRACTION_LEN;
        // Exponent of each chunk's least-significant bit.
        static THIRTYTWO_OVER_PI_28_LSB_EXP: [i32; 8] =
            [-24, -55, -81, -114, -143, -170, -200, -230];
        // Advance past chunks whose product with x cannot affect the result.
        while x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5 {
            idx += 1;
        }
        let prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[idx]);
        let k_hi = prod_hi.cpu_round();
        let frac = prod_hi - k_hi;
        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 1]), frac);
        let k_lo = prod_lo.cpu_round();
        let mut y = prod_lo - k_lo;
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 2]), y);
        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 3]), y);
        (y, (k_hi as i64).wrapping_add(k_lo as i64))
    }

    /// Dispatch: pick the small or large reduction based on |x|.
    /// The bound differs per path: 2^45 (f32 bits 0x5600_0000) when
    /// hardware FMA is available, 2^22 (0x4a80_0000) otherwise.
    #[inline(always)]
    pub(crate) fn reduce(self) -> (f64, i64) {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            target_arch = "aarch64"
        ))]
        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32; // f32 bits of 2^45
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            target_arch = "aarch64"
        )))]
        const SMALL_PASS_BOUND: u32 = 0x4a80_0000u32; // f32 bits of 2^22
        if self.x_abs < SMALL_PASS_BOUND {
            self.reduce_small()
        } else {
            self.reduce_large(get_exponent_f32(f32::from_bits(self.x_abs)))
        }
    }

    /// Dispatch for the forced-`mul_add` variants (`*_fma` methods),
    /// always using the hardware-FMA bound of 2^45.
    #[inline(always)]
    #[allow(unused)]
    pub(crate) fn reduce_fma(self) -> (f64, i64) {
        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32; // f32 bits of 2^45
        if self.x_abs < SMALL_PASS_BOUND {
            self.reduce_small_fma()
        } else {
            self.reduce_large_fma(get_exponent_f32(f32::from_bits(self.x_abs)))
        }
    }
}