use crate::common::{dd_fmla, dyad_fmla, f_fmla};
use crate::double_double::DoubleDouble;
use crate::err::erf_poly::{ERF_POLY, ERF_POLY_C2};
use crate::rounding::CpuFloor;
// 2/sqrt(pi) = 1.1283791670955126... represented as a double-double.
// NOTE: per usage elsewhere in this file (e.g. `DoubleDouble::new(l, p.hi)`),
// the first argument is the low-order correction and the second the high word.
const TWO_OVER_SQRT_PI: DoubleDouble = DoubleDouble::new(
f64::from_bits(0x3c71ae3a914fed80), // low part: ~1.5e-17 correction term
f64::from_bits(0x3ff20dd750429b6d), // high part: 1.1283791670955126
);
/// Result of the fast `erf` path: an approximation plus its error bound.
pub(crate) struct Erf {
/// Double-double approximation of erf(x).
pub(crate) result: DoubleDouble,
/// Relative error bound used by the caller's rounding test
/// (see the `left`/`right` comparison in `f_erf`).
pub(crate) err: f64,
}
#[cold]
/// Accurate path for small arguments (reached from `erf_accurate` when
/// x < 0.125). Evaluates an odd polynomial
///   erf(x) ~= x * (c1 + c3*x^2 + c5*x^4 + ... + c21*x^20)
/// by Horner's scheme in z2 = x^2, carrying a double-double (h, l)
/// accumulator once the terms become significant.
fn cr_erf_accurate_tiny(x: f64) -> DoubleDouble {
// Coefficient table. Layout:
//   P[0..=7]  : four double-double pairs (hi, lo) for the coefficients of
//               degrees 1, 3, 5, 7 (consumed by the third loop below);
//   P[8..=14] : seven plain doubles for degrees 9, 11, 13, 15, 17, 19, 21
//               (indexed as P[a / 2 + 4] for odd degree `a`).
static P: [u64; 15] = [
0x3ff20dd750429b6d,
0x3c71ae3a914fed80,
0xbfd812746b0379e7,
0x3c6ee12e49ca96ba,
0x3fbce2f21a042be2,
0xbc52871bc0a0a0d0,
0xbf9b82ce31288b51,
0x3c21003accf1355c,
0x3f7565bcd0e6a53f,
0xbf4c02db40040cc3,
0x3f1f9a326fa3cf50,
0xbeef4d25e3c73ce9,
0x3ebb9eb332b31646,
0xbe864a4bd5eca4d7,
0x3e6c0acc2502e94e,
];
let z2 = x * x;
// Start at the top coefficient: 21 / 2 + 4 == 14 (degree-21 term).
let mut h = f64::from_bits(P[21 / 2 + 4]);
// `(12..=19).rev().step_by(2)` yields a = 19, 17, 15, 13, i.e. indices
// P[13], P[12], P[11], P[10]: plain-double Horner steps in z2 for the
// degree 19..13 coefficients.
for a in (12..=19).rev().step_by(2) {
h = dd_fmla(h, z2, f64::from_bits(P[(a / 2 + 4) as usize]))
}
let mut l = 0.;
// a = 11, 9 -> P[9], P[8]. Each iteration multiplies the (h, l)
// accumulator by x twice (one z2 step) and adds a plain-double
// coefficient with an exact add, keeping the low correction in `l`.
for a in (8..=11).rev().step_by(2) {
let mut t = DoubleDouble::from_exact_mult(h, x);
t.lo = dd_fmla(l, x, t.lo);
let mut k = DoubleDouble::from_exact_mult(t.hi, x);
k.lo = dd_fmla(t.lo, x, k.lo);
let p = DoubleDouble::from_exact_add(f64::from_bits(P[(a / 2 + 4) as usize]), k.hi);
l = k.lo + p.lo;
h = p.hi;
}
// a = 7, 5, 3, 1 -> double-double coefficient pairs (P[a-1], P[a]).
// Same z2 Horner step, but the coefficient's low word P[a] is folded
// into the running correction.
for a in (1..=7).rev().step_by(2) {
let mut t = DoubleDouble::from_exact_mult(h, x);
t.lo = dd_fmla(l, x, t.lo);
let mut k = DoubleDouble::from_exact_mult(t.hi, x);
k.lo = dd_fmla(t.lo, x, k.lo);
let p = DoubleDouble::from_exact_add(f64::from_bits(P[a - 1]), k.hi);
l = k.lo + p.lo + f64::from_bits(P[a]);
h = p.hi;
}
// Final multiplication by x turns the even polynomial into the odd one.
let p = DoubleDouble::from_exact_mult(h, x);
l = dd_fmla(l, x, p.lo);
// NOTE: DoubleDouble::new takes (lo, hi) per the usage pattern here.
DoubleDouble::new(l, p.hi)
}
#[cold]
#[inline(never)]
/// Accurate (slow) path, used when the fast path's rounding test fails.
/// Expects x >= 0 (callers pass |x|); for x < 0.125 it defers to the
/// tiny-argument polynomial, otherwise it evaluates a per-interval
/// polynomial from ERF_POLY_C2 around the interval midpoint.
pub(crate) fn erf_accurate(x: f64) -> DoubleDouble {
if x < 0.125
{
return cr_erf_accurate_tiny(x);
}
// Interval selection: i = floor(8x); intervals are 0.125 wide and the
// polynomial argument z is centered, z = x - (0.0625 + 0.125 * floor(8x)),
// so z is in roughly [-0.0625, 0.0625].
let v = (8.0 * x).cpu_floor();
let i: u32 = (8.0 * x) as u32;
let z = (x - 0.0625) - 0.125 * v;
// Table row for this interval. Layout (inferred from the indexing below):
//   p[0..=15]  : eight double-double pairs (hi at p[2a], lo at p[2a+1])
//                for the degree 0..7 coefficients;
//   p[16..=26] : plain doubles for degrees 8..18 (accessed as p[8 + a]).
let p = ERF_POLY_C2[(i - 1) as usize];
// Plain-double Horner for the high-degree tail, a = 17 down to 11.
let mut h = f64::from_bits(p[26]);
for a in (11..=17).rev() {
h = dd_fmla(h, z, f64::from_bits(p[(8 + a) as usize]));
}
let mut l: f64 = 0.;
// Degrees 10..8: switch to a double-double accumulator (h, l).
// The inner `let p = ...` shadows the coefficient table only for the
// remainder of the loop body.
for a in (8..=10).rev() {
let mut t = DoubleDouble::from_exact_mult(h, z);
t.lo = dd_fmla(l, z, t.lo);
let p = DoubleDouble::from_exact_add(f64::from_bits(p[(8 + a) as usize]), t.hi);
h = p.hi;
l = p.lo + t.lo;
}
// Degrees 7..0: coefficients are double-double pairs; the low word
// p[2a + 1] is folded into the running correction.
for a in (0..=7).rev() {
let mut t = DoubleDouble::from_exact_mult(h, z);
t.lo = dd_fmla(l, z, t.lo);
let v = DoubleDouble::from_exact_add(f64::from_bits(p[(2 * a) as usize]), t.hi);
h = v.hi;
l = v.lo + t.lo + f64::from_bits(p[(2 * a + 1) as usize]);
}
// NOTE: DoubleDouble::new takes (lo, hi) per the usage pattern here.
DoubleDouble::new(l, h)
}
#[inline]
/// Fast path: a double-double approximation of erf(x) for x >= 0 together
/// with a relative error bound, which `f_erf` uses in a Ziv-style rounding
/// test before falling back to `erf_accurate`.
pub(crate) fn erf_fast(x: f64) -> Erf {
if x < 0.0625
{
// Small arguments: a single odd minimax polynomial in x^2 covers the
// whole interval, no table lookup needed.
let z2 = DoubleDouble::from_exact_mult(x, x);
// Coefficients of the odd polynomial: C[0]/C[1] are the hi/lo words of
// the degree-1 coefficient, C[2]/C[3] of degree 3; C[4..=7] are plain
// doubles for degrees 5, 7, 9, 11.
const C: [u64; 8] = [
0x3ff20dd750429b6d,
0x3c71ae3a7862d9c4,
0xbfd812746b0379e7,
0x3c6f1a64d72722a2,
0x3fbce2f21a042b7f,
0xbf9b82ce31189904,
0x3f7565bbf8a0fe0b,
0xbf4bf9f8d2c202e4,
];
let z4 = z2.hi * z2.hi;
// Estrin-style pairing of the high-degree terms in plain doubles.
let c9 = dd_fmla(f64::from_bits(C[7]), z2.hi, f64::from_bits(C[6]));
let mut c5 = dd_fmla(f64::from_bits(C[5]), z2.hi, f64::from_bits(C[4]));
c5 = dd_fmla(c9, z4, c5);
// Double-double Horner for the low-degree coefficients.
let mut t = DoubleDouble::from_exact_mult(z2.hi, c5);
let mut v = DoubleDouble::from_exact_add(f64::from_bits(C[2]), t.hi);
v.lo += t.lo + f64::from_bits(C[3]);
t = DoubleDouble::from_exact_mult(z2.hi, v.hi);
let h_c = v.hi;
t.lo += dd_fmla(z2.hi, v.lo, f64::from_bits(C[1]));
v = DoubleDouble::from_exact_add(f64::from_bits(C[0]), t.hi);
// Account for the low word of x*x (z2.lo) against the saved coefficient.
v.lo += dd_fmla(z2.lo, h_c, t.lo);
// Final multiply by x gives the odd polynomial.
v = DoubleDouble::quick_mult_f64(v, x);
return Erf {
result: v,
// Error bound for this branch (~2^-69 region; bit pattern is the
// precomputed bound for the rounding test).
err: f64::from_bits(0x3ba7800000000000),
};
}
// Table-driven path: intervals of width 0.0625, i = floor(16x), with the
// polynomial argument centered: z = x - (0.03125 + 0.0625 * floor(16x)).
let v = (16.0 * x).cpu_floor();
let i: u32 = (16.0 * x) as u32;
let z = (x - 0.03125) - 0.0625 * v;
// Row layout (inferred from the indexing below): c[0]/c[1], c[2]/c[3],
// c[4] with correction c[5]/c[6] handling degrees 0..3; c[7..=12] plain
// doubles for the higher degrees.
let c = ERF_POLY[(i - 1) as usize];
let z2 = z * z;
let z4 = z2 * z2;
// Estrin evaluation of the high-degree tail in plain doubles.
let c9 = dd_fmla(f64::from_bits(c[12]), z, f64::from_bits(c[11]));
let mut c7 = dd_fmla(f64::from_bits(c[10]), z, f64::from_bits(c[9]));
let c5 = dd_fmla(f64::from_bits(c[8]), z, f64::from_bits(c[7]));
let mut c3 = DoubleDouble::from_exact_add(f64::from_bits(c[5]), z * f64::from_bits(c[6]));
c7 = dd_fmla(c9, z2, c7);
// Fold the tail into the double-double degree-3 coefficient.
let p = DoubleDouble::from_exact_add(c3.hi, c5 * z2);
c3.hi = p.hi;
c3.lo += p.lo;
let p = DoubleDouble::from_exact_add(c3.hi, c7 * z4);
c3.hi = p.hi;
c3.lo += p.lo;
// Double-double Horner down through degrees 2, 1, 0.
let mut t = DoubleDouble::from_exact_mult(z, c3.hi);
let mut c2 = DoubleDouble::from_exact_add(f64::from_bits(c[4]), t.hi);
c2.lo += dd_fmla(z, c3.lo, t.lo);
t = DoubleDouble::from_exact_mult(z, c2.hi);
let mut v = DoubleDouble::from_exact_add(f64::from_bits(c[2]), t.hi);
v.lo += t.lo + dd_fmla(z, c2.lo, f64::from_bits(c[3]));
t = DoubleDouble::from_exact_mult(z, v.hi);
t.lo = dd_fmla(z, v.lo, t.lo);
v = DoubleDouble::from_exact_add(f64::from_bits(c[0]), t.hi);
v.lo += t.lo + f64::from_bits(c[1]);
Erf {
result: v,
// Precomputed error bound for the table-driven branch.
err: f64::from_bits(0x3ba1100000000000),
}
}
/// Error function erf(x).
///
/// Dispatch:
///   * NaN           -> NaN (propagated via x + x)
///   * +/-inf        -> +/-1 exactly
///   * |x| above the saturation threshold -> +/-1 with directed rounding
///     toward the true value (1 - tiny, with sign)
///   * |x| < 2^-61   -> linear approximation 2/sqrt(pi) * x, computed with
///     scaling to get the low-order correction right
///   * otherwise     -> fast path with a rounding test, falling back to the
///     accurate path when the result cannot be guaranteed correctly rounded
pub fn f_erf(x: f64) -> f64 {
// z = |x|; erf is odd, so everything below works on the magnitude and the
// sign is re-applied at the end.
let z = f64::from_bits(x.to_bits() & 0x7fff_ffff_ffff_ffff);
let mut t = z.to_bits();
let ux = t;
// 0x4017afb48dc96626 is the largest |x| for which erf(x) does not round
// to 1 (about 5.92); above it the result saturates.
if ux > 0x4017afb48dc96626
{
let os = f64::copysign(1.0, x);
const MASK: u64 = 0x7ff0000000000000u64;
if ux > MASK {
// NaN: propagate (and quiet signaling NaNs) via x + x.
return x + x;
}
if ux == MASK {
// +/-infinity: erf saturates exactly at +/-1.
return os;
}
// Finite saturating argument: return os - tiny*os, i.e. the value just
// inside +/-1, which also raises the inexact flag.
return f_fmla(-f64::from_bits(0x3c90000000000000), os, os);
}
// |x| < 2^-61: erf(x) ~= 2/sqrt(pi) * x to full precision.
if z < f64::from_bits(0x3c20000000000000) {
if x == 0. {
// Preserve signed zero.
return x;
}
let y = TWO_OVER_SQRT_PI.hi * x;
// Scale x by 2^106 (0x469...) so the double-double product and its
// correction are computed well away from the subnormal range, then
// scale the correction back down by 2^-106 (0x395...).
let sx = x * f64::from_bits(0x4690000000000000);
let mut p = DoubleDouble::quick_mult_f64(TWO_OVER_SQRT_PI, sx);
p.lo += f_fmla(-y, f64::from_bits(0x4690000000000000), p.hi);
let res = dyad_fmla(p.lo, f64::from_bits(0x3950000000000000), y);
return res;
}
// Main path: compute on |x| then transfer the sign bit onto both words
// of the double-double result.
let result = erf_fast(z);
let mut u = result.result.hi.to_bits();
let mut v = result.result.lo.to_bits();
t = x.to_bits();
const SIGN_MASK: u64 = 0x8000000000000000u64;
u ^= t & SIGN_MASK;
v ^= t & SIGN_MASK;
// Ziv rounding test: round (hi + lo -/+ err*|hi|) both ways; if both
// endpoints of the error interval round to the same double, that double
// is the correctly rounded result.
let left = f64::from_bits(u) + f_fmla(result.err, -f64::from_bits(u), f64::from_bits(v));
let right = f64::from_bits(u) + f_fmla(result.err, f64::from_bits(u), f64::from_bits(v));
if left == right {
return left;
}
// Ambiguous: recompute with the accurate path and apply the sign by
// negating both double-double words for x < 0 (erf is odd).
let a_results = erf_accurate(z);
if x >= 0. {
a_results.to_f64()
} else {
(-a_results.hi) + (-a_results.lo)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_erf() {
// Subnormal-range argument: exercises the |x| < 2^-61 linear path and
// its 2^106 scaling of the low-order correction.
assert_eq!(f_erf(0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000009456563898732),
0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010670589695636709);
// Exact zero is returned unchanged.
assert_eq!(f_erf(0.), 0.);
// Table-driven fast path, mid-range value.
assert_eq!(f_erf(1.), 0.8427007929497149);
// Symmetric pair: checks that erf is odd (sign transfer on the result).
assert_eq!(f_erf(0.49866735123), 0.5193279892991808);
assert_eq!(f_erf(-0.49866735123), -0.5193279892991808);
// Special values: NaN propagates, infinities saturate to +/-1.
assert!(f_erf(f64::NAN).is_nan());
assert_eq!(f_erf(f64::INFINITY), 1.0);
assert_eq!(f_erf(f64::NEG_INFINITY), -1.0);
}
}