use crate::common::f_fmla;
/// Polynomial coefficient table consumed by `erff_gen`.
///
/// Every entry is the raw IEEE-754 bit pattern of an `f64`; `erff_gen` decodes
/// it with `f64::from_bits`. The table has 32 rows of 8 coefficients.
///
/// `erff_gen` only reaches the table for `|x| < 4` and selects the row with
/// index `trunc(8 * |x|)` (it adds 3 to the exponent field of `|x|` and
/// truncates the resulting float to an integer), so row `i` serves
/// `i/8 <= |x| < (i + 1)/8`.
///
/// Within a row, element `k` is the coefficient of `x^(2k)`; the polynomial in
/// `x^2` is finally multiplied by `x`:
/// `x * (c[0] + c[1]*x^2 + c[2]*x^4 + ... + c[7]*x^14)`.
///
/// NOTE(review): the last two entries of row 0 do not follow the
/// sign/magnitude pattern of the other rows. They are only ever scaled by
/// `x^12` and `x^14` with `|x| < 1/8`; confirm against the generator before
/// "fixing" them.
/// NOTE(review): how the values were produced is not recorded in this file.
static COEFFS: [[u64; 8]; 32] = [
[
0x3ff20dd750429b6d,
0xbfd812746b037753,
0x3fbce2f219e8596a,
0xbf9b82cdacb78fda,
0x3f756479297dfda5,
0xbf48b3ac5455ef02,
0xbf7126fcac367e3b,
0x3fb2d0bdb3ba4984,
],
[
0x3ff20dd750429b6d,
0xbfd812746b0379a8,
0x3fbce2f21a03cf2a,
0xbf9b82ce30de083e,
0x3f7565bcad3eb60f,
0xbf4c02c66f659256,
0x3f1f92f673385229,
0xbeedef402648ae90,
],
[
0x3ff20dd750429b34,
0xbfd812746b032dce,
0x3fbce2f219d84aae,
0xbf9b82ce22dcf139,
0x3f7565b9efcd4af1,
0xbf4c021f1af414bc,
0x3f1f7c6d177eff82,
0xbeec9e4410dcf865,
],
[
0x3ff20dd750426eab,
0xbfd812746ae592c7,
0x3fbce2f211525f14,
0xbf9b82ccc125e63f,
0x3f756596f261cfd3,
0xbf4bfde1ff8eeecf,
0x3f1f31a9d15dc5d8,
0xbeea5a4362844b3c,
],
[
0x3ff20dd75039c705,
0xbfd812746777e74d,
0x3fbce2f17af98a1b,
0xbf9b82be4b817cbe,
0x3f7564bec2e2962e,
0xbf4bee86f9da3558,
0x3f1e9443689dc0cc,
0xbee79c0f230805d8,
],
[
0x3ff20dd74f811211,
0xbfd81274371a3e8f,
0x3fbce2ec038262e5,
0xbf9b8265b82c5e1f,
0x3f75615a2e239267,
0xbf4bc63ae023dceb,
0x3f1d87c2102f7e06,
0xbee49584bea41d62,
],
[
0x3ff20dd746d063e3,
0xbfd812729a8a950f,
0x3fbce2cb0a2df232,
0xbf9b80eca1f51278,
0x3f75572e26c46815,
0xbf4b715e5638b65e,
0x3f1bfbb195484968,
0xbee177a565c15c52,
],
[
0x3ff20dd701b44486,
0xbfd812691145f237,
0x3fbce23a06b8cfd9,
0xbf9b7c1dc7245288,
0x3f753e92f7f397dd,
0xbf4ad97cc4acf0b2,
0x3f19f028b2b09b71,
0xbedcdc4da08da8c1,
],
[
0x3ff20dd5715ac332,
0xbfd8123e680bd0eb,
0x3fbce0457aded691,
0xbf9b6f52d52bed40,
0x3f750c291b84414c,
0xbf49ea246b1ad4a9,
0x3f177654674e0ca0,
0xbed737c11a1bcebb,
],
[
0x3ff20dce6593e114,
0xbfd811a59c02eadc,
0x3fbcdab53c7cd7d5,
0xbf9b526d2e321eed,
0x3f74b1d32cd8b994,
0xbf48963143ec0a1e,
0x3f14ad5700e4db91,
0xbed231e100e43ef2,
],
[
0x3ff20db48bfd5a62,
0xbfd80fdd84f9e308,
0x3fbccd340d462983,
0xbf9b196a29287680,
0x3f74210c2c13a0f7,
0xbf46dbdfb4ff71ae,
0x3f11bca2d17fbd71,
0xbecbca36f90c7cf5,
],
[
0x3ff20d64b2f8f508,
0xbfd80b4d4f19fa8b,
0x3fbcb088197262e3,
0xbf9ab51fd02e5b99,
0x3f734e1e5e81a632,
0xbf44c66377b502ce,
0x3f0d9ad25066213c,
0xbec4b0df7dd0cfa1,
],
[
0x3ff20c8fc1243576,
0xbfd8010cb2009e27,
0x3fbc7a47e9299315,
0xbf9a155be5683654,
0x3f7233502694997b,
0xbf426c94b7d81300,
0x3f08094f1de25fb9,
0xbebe0e3d776c6eef,
],
[
0x3ff20a9bd1611bc1,
0xbfd7ec7fbce83f90,
0x3fbc1d757d7317b7,
0xbf992c160cd589f0,
0x3f70d307269cc5c2,
0xbf3fda5b0d2d1879,
0x3f02fdd7b3b14a7f,
0xbeb54eed4a26af5a,
],
[
0x3ff20682834f943d,
0xbfd7c73f747bf5a9,
0x3fbb8c2db4a9ffd1,
0xbf97f0e4ffe989ec,
0x3f6e7061eae4166e,
0xbf3ad36e873fff2d,
0x3efd39222396128e,
0xbead83dacec5ea6b,
],
[
0x3ff1feb8d12676d7,
0xbfd7898347284afe,
0x3fbaba3466b34451,
0xbf9663adc573e2f9,
0x3f6ae99fb17c3e08,
0xbf3602f950ad5535,
0x3ef5e9717490609d,
0xbea3fca107bbc8d5,
],
[
0x3ff1f12fe3c536fa,
0xbfd72b1d1f22e6d3,
0x3fb99fc0eed4a896,
0xbf948db0a87bd8c6,
0x3f673e368895aa61,
0xbf319b35d5301fc8,
0x3ef007987e4bb033,
0xbe9a7edcd4c2dc70,
],
[
0x3ff1db7b0df84d5d,
0xbfd6a4e4a41cde02,
0x3fb83bbded16455d,
0xbf92809b3b36977e,
0x3f639c08bab44679,
0xbf2b7b45a70ed119,
0x3ee6e99b36410e7b,
0xbe913619bb7ebc0c,
],
[
0x3ff1bb1c85c4a527,
0xbfd5f23b99a249a3,
0x3fb694c91fa0d12c,
0xbf9053e1ce11c72d,
0x3f602bf72c50ea78,
0xbf24f478fb56cb02,
0x3ee005f80ecbe213,
0xbe85f2446bde7f5b,
],
[
0x3ff18dec3bd51f9d,
0xbfd5123f58346186,
0x3fb4b8a1ca536ab4,
0xbf8c4243015cc723,
0x3f5a1a8a01d351ef,
0xbf1f466b34f1d86b,
0x3ed5f835eea0bf6a,
0xbe7b83165b939234,
],
[
0x3ff152804c3369f4,
0xbfd4084cd4afd4bc,
0x3fb2ba2e836e47aa,
0xbf8800f2dfc6904b,
0x3f54a6daf0669c59,
0xbf16e326ab872317,
0x3ecd9761a6a755a5,
0xbe70fca33f9dd4b5,
],
[
0x3ff1087ad68356aa,
0xbfd2dbb044707459,
0x3fb0aea8ceaa0384,
0xbf840b516d52b3d2,
0x3f500c9e05f01d22,
0xbf1076afb0dc0ff7,
0x3ec39fadec400657,
0xbe64b5761352e7e3,
],
[
0x3ff0b0a7a8ba4a22,
0xbfd196990d22d4a1,
0x3fad5551e6ac0c4d,
0xbf807cce1770bd1a,
0x3f4890347b8848bf,
0xbf0757ec96750b6a,
0x3eb9b258a1e06bce,
0xbe58fc6d22da7572,
],
[
0x3ff04ce2be70fb47,
0xbfd0449e4b0b9cac,
0x3fa97f7424f4b0e7,
0xbf7ac825439c42f4,
0x3f428f5f65426dfb,
0xbf005b699a90f90f,
0x3eb0a888eecf4593,
0xbe4deace2b32bb31,
],
[
0x3fefbf9fb0e11cc8,
0xbfcde2640856545a,
0x3fa5f5b1f47f8510,
0xbf7588bc71eb41b9,
0x3f3bc6a0a772f56d,
0xbef6b9fad1f1657a,
0x3ea573204ba66504,
0xbe41d38065c94e44,
],
[
0x3feed8f18c99e031,
0xbfcb4cb6acd903b4,
0x3fa2c7f3dddd6fc1,
0xbf713052067df4e0,
0x3f34a5027444082f,
0xbeef672bab0e2554,
0x3e9b83c756348cc9,
0xbe3534f1a1079499,
],
[
0x3fedebd33044166d,
0xbfc8d7cd9053f7d8,
0x3f9ff9957fb3d6e7,
0xbf6b50be55de0f36,
0x3f2e92c8ec53a628,
0xbee5a4b88d508007,
0x3e91a27737559e26,
0xbe2942ae62cb2c14,
],
[
0x3fecfdbf0386f3bd,
0xbfc68e33d93b0dc4,
0x3f9b2683d58f53de,
0xbf65a9174e70d26f,
0x3f269ddd326d49cd,
0xbeddd8f397a8219c,
0x3e86a755016ad4dd,
0xbe1e366e0139187d,
],
[
0x3fec132adb8d7464,
0xbfc475a899f61b46,
0x3f970a431397a77c,
0xbf612e3d35beeee2,
0x3f20c16b05738333,
0xbed4a47f873e144e,
0x3e7d3d494c698c02,
0xbe12302c59547fe5,
],
[
0x3feb2f5fd05555e7,
0xbfc28feefbe03ec7,
0x3f93923acbb3a676,
0xbf5b4ff793cd6358,
0x3f18ea0eb8c913bc,
0xbeccb31ec2baceb1,
0x3e730011e7e80c04,
0xbe0617710635cb1d,
],
[
0x3fea54853cd9593e,
0xbfc0dbdbaea4dc8e,
0x3f90a93e2c20a0fd,
0xbf55c969ff401ea8,
0x3f129e0cc64fe627,
0xbec4160d8e9d3c2a,
0x3e68e7b67594624a,
0xbdfb1cf2c975b09b,
],
[
0x3fe983ceece09ff8,
0xbfbeacc78f7a2d00,
0x3f8c74418410655f,
0xbf51756a050e441e,
0x3f0bff3650f7f548,
0xbebc56c0217d3ada,
0x3e607b4918d0b489,
0xbdf0d4be8c1c50f8,
],
];
/// Multiply-add primitive that `erff_gen` is generic over.
///
/// Implementors combine the three operands as `x * y + z`; the rounding
/// behaviour (fused or not) is decided by the concrete backend.
trait ErffBackend {
    /// Combines the operands as `x * y + z`.
    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
}
/// Portable backend: forwards every multiply-add to the crate helper `f_fmla`.
///
/// NOTE(review): `f_fmla` lives in `crate::common` and is not visible here;
/// presumably it chooses between a fused and an unfused multiply-add for the
/// build target — confirm in that module.
struct GenErffBackend {}

impl ErffBackend for GenErffBackend {
    /// Delegates to `f_fmla`, passing the operands through in the same order.
    #[inline(always)]
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
        f_fmla(x, y, z)
    }
}
/// x86/x86_64 backend built on `f64::mul_add`.
///
/// `erff_fma_impl` instantiates `erff_gen` with this backend inside a function
/// that enables the `avx` and `fma` target features.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
struct FmaErffBackend {}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl ErffBackend for FmaErffBackend {
    /// Computes `x * y + z` with a single rounding via `f64::mul_add`.
    #[inline(always)]
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
        x.mul_add(y, z)
    }
}
/// Shared `erff` kernel, generic over the multiply-add backend.
///
/// * `|x| >= 4`: finite inputs yield `±1` plus a `2^-25` nudge towards zero
///   (evaluated in `f32`), infinities yield exactly `±1`, and NaN inputs are
///   returned unchanged.
/// * `|x| < 4`: row `trunc(8 * |x|)` of `COEFFS` is selected and
///   `x * (c0 + c1*x^2 + ... + c7*x^14)` is evaluated in `f64` before the
///   result is narrowed to `f32`.
#[inline(always)]
fn erff_gen<B: ErffBackend>(x: f32, backend: B) -> f32 {
    // Bit patterns (sign cleared) of 4.0 and +infinity.
    const FOUR_BITS: u32 = 0x4080_0000;
    const INF_BITS: u32 = 0x7f80_0000;
    // Adding 3 to the biased exponent field scales a float by 2^3.
    const TIMES_EIGHT: u32 = 3 << 23;

    let magnitude = x.to_bits() & 0x7fff_ffff;

    if magnitude >= FOUR_BITS {
        if magnitude > INF_BITS {
            // NaN: hand the input straight back.
            return x;
        }
        // Saturation value and a 2^-25 correction of the opposite sign.
        let (limit, nudge) = if x.is_sign_negative() {
            (-1.0f32, f32::from_bits(0x3300_0000))
        } else {
            (1.0f32, f32::from_bits(0xb300_0000))
        };
        return if magnitude == INF_BITS {
            limit
        } else {
            limit + nudge
        };
    }

    let scaled = f32::from_bits(magnitude.wrapping_add(TIMES_EIGHT));
    // SAFETY: `magnitude < 0x4080_0000` on this path, so the bits of `scaled`
    // are below `0x4200_0000` (32.0): it is finite, non-negative and its
    // truncation fits in `usize`.
    let row_index = unsafe { scaled.to_int_unchecked::<usize>() };
    let row = &COEFFS[row_index];
    let coeff = |k: usize| f64::from_bits(row[k]);

    let xd = x as f64;
    let x2 = xd * xd;
    let x4 = x2 * x2;
    let x8 = x4 * x4;

    // Pair neighbouring coefficients first, then combine the pairs with x^4
    // and x^8.
    let q0 = backend.fma(x2, coeff(1), coeff(0));
    let q1 = backend.fma(x2, coeff(3), coeff(2));
    let q2 = backend.fma(x2, coeff(5), coeff(4));
    let q3 = backend.fma(x2, coeff(7), coeff(6));

    let low = backend.fma(x4, q1, q0);
    let high = backend.fma(x4, q3, q2);

    (xd * backend.fma(x8, high, low)) as f32
}
/// `erff_gen` instantiated with `FmaErffBackend` and compiled with the `avx`
/// and `fma` target features enabled.
///
/// # Safety
///
/// The caller must make sure the executing CPU supports both `avx` and `fma`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn erff_fma_impl(x: f32) -> f32 {
    let backend = FmaErffBackend {};
    erff_gen(x, backend)
}
/// Single-precision error function `erf(x)`.
///
/// On x86/x86_64 the implementation is chosen once, on first use: the
/// `avx` + `fma` kernel when the CPU reports both features, otherwise the
/// portable kernel. The choice is cached in a `OnceLock`. On every other
/// architecture the portable kernel is called directly.
#[inline]
pub fn f_erff(x: f32) -> f32 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        erff_gen(x, GenErffBackend {})
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;

        type Kernel = unsafe fn(f32) -> f32;

        /// Portable kernel with the same shape as the FMA one.
        fn portable_kernel(v: f32) -> f32 {
            erff_gen(v, GenErffBackend {})
        }

        /// Picks the kernel matching the features of the running CPU.
        fn select_kernel() -> Kernel {
            let has_fma = std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma");
            if has_fma {
                erff_fma_impl
            } else {
                portable_kernel
            }
        }

        static KERNEL: OnceLock<Kernel> = OnceLock::new();
        let kernel = *KERNEL.get_or_init(select_kernel);
        // SAFETY: `erff_fma_impl` is only stored after `avx` and `fma` were
        // detected at run time; the other candidate is a safe function.
        unsafe { kernel(x) }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks `f_erff` at zero, two interior points, both infinities and NaN.
    #[test]
    fn f_erff_test() {
        let exact_cases: [(f32, f32); 5] = [
            (0.0, 0.0),
            (1.0, 0.8427008),
            (0.5, 0.5204999),
            (f32::INFINITY, 1.0),
            (f32::NEG_INFINITY, -1.0),
        ];
        for &(input, expected) in exact_cases.iter() {
            assert_eq!(f_erff(input), expected, "f_erff({})", input);
        }
        assert!(f_erff(f32::NAN).is_nan());
    }
}