simd_kernels/kernels/scientific/
erf.rs

1// Copyright Peter Bower 2025. All Rights Reserved.
2// Licensed under Mozilla Public License (MPL) 2.0.
3
4//! # **Error Function Module** - *High-Precision Mathematical Functions for Statistical Computing*
5//!
6//! This module provides optimised implementations of the error function (`erf`) and complementary
7//! error function (`erfc`), fundamental mathematical functions essential for probability theory,
8//! statistics, and scientific computing applications.
9//!
10//! ## Overview
11//!
12//! The error function and its complement are crucial for:
13//! - **Normal Distribution**: CDF and quantile calculations
14//! - **Statistical Hypothesis Testing**: P-value computations
15//! - **Signal Processing**: Noise analysis and filtering
16//! - **Physics Simulations**: Diffusion and heat transfer problems
17//! - **Financial Mathematics**: Risk assessment and option pricing
18//!
19//! ## Mathematical Definitions
20//!
21//! ### Error Function
22//! ```text
23//! erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
24//! ```
25//!
26//! ### Complementary Error Function  
27//! ```text
28//! erfc(x) = 1 - erf(x) = (2/√π) ∫ₓ^∞ e^(-t²) dt
29//! ```
30//!
31//! ### Inverse Complementary Error Function
32//! ```text
33//! erfc⁻¹(p) : ℝ -> ℝ such that erfc(erfc⁻¹(p)) = p
34//! ```
35//!
36//! ## Usage Examples
37//!
38//! ```rust,ignore
39//! use crate::kernels::scientific::erf::{erf, erfc, erf_simd, erfc_simd, erfc_inv};
40//! use std::simd::Simd;
41//!
42//! // Scalar usage
43//! let x = 1.5;
44//! let erf_val = erf(x);          // ≈ 0.9661
45//! let erfc_val = erfc(x);        // ≈ 0.0339
46//! let inv_val = erfc_inv(0.1);   // ≈ 1.2816 (90th percentile of std normal)
47//!
48//! // SIMD usage (8 lanes)
49//! let inputs = Simd::<f64, 8>::from_array([0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0]);
50//! let results = erf_simd(inputs);
51//! ```
52
53///////////////////////////////////////////////////////////////////////
54/// PORT OF LIBM COMPILER BUILT-INS: ERF
55///
56/// This section is a port from the Rust `libm` library, specifically
57/// from the compiler-builtins repository:
58/// https://github.com/rust-lang/compiler-builtins
59///
60/// The original code is licensed under the MIT licence, reproduced below.
61///
62/// The Rust implementation itself was derived from the original Sun Microsystems
63/// implementation, and their licence notice is also provided below for completeness.
64///
65/// Note: This is not a verbatim port; we have made several modifications
66/// to align with our requirements.
67///
68/// For the SIMD versions, the `libm` code has been used as a reference input,
69/// with independent reimplementation for vectorisation and layout consistency.
70///////////////////////////////////////////////////////////////////////
71// MIT License
72//
73// Permission is hereby granted, free of charge, to any person obtaining a copy
74// of this software and associated documentation files (the "Software"), to deal
75// in the Software without restriction, including without limitation the rights
76// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
77// copies of the Software, and to permit persons to whom the Software is
78// furnished to do so, subject to the following conditions:
79//
80// The above copyright notice and this permission notice shall be included in all
81// copies or substantial portions of the Software.
82//
83// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
84// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
85// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
86// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
87// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
88// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
89// SOFTWARE.
90//
91// origin: FreeBSD /usr/src/lib/msun/src/s_erf.c
92// ====================================================
93// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
94//
95// Developed at SunPro, a Sun Microsystems, Inc. business.
96// Permission to use, copy, modify, and distribute this
97// software is freely granted, provided that this notice
98// is preserved.
99// ====================================================
100const ERX: f64 = 8.45062911510467529297e-01;
101const EFX8: f64 = 1.02703333676410069053e+00;
102const PP0: f64 = 1.28379167095512558561e-01;
103const PP1: f64 = -3.25042107247001499370e-01;
104const PP2: f64 = -2.84817495755985104766e-02;
105const PP3: f64 = -5.77027029648944159157e-03;
106const PP4: f64 = -2.37630166566501626084e-05;
107const QQ1: f64 = 3.97917223959155352819e-01;
108const QQ2: f64 = 6.50222499887672944485e-02;
109const QQ3: f64 = 5.08130628187576562776e-03;
110const QQ4: f64 = 1.32494738004321644526e-04;
111const QQ5: f64 = -3.96022827877536812320e-06;
112
113const PA0: f64 = -2.36211856075265944077e-03;
114const PA1: f64 = 4.14856118683748331666e-01;
115const PA2: f64 = -3.72207876035701323847e-01;
116const PA3: f64 = 3.18346619901161753674e-01;
117const PA4: f64 = -1.10894694282396677476e-01;
118const PA5: f64 = 3.54783043256182359371e-02;
119const PA6: f64 = -2.16637559486879084300e-03;
120const QA1: f64 = 1.06420880400844228286e-01;
121const QA2: f64 = 5.40397917702171048937e-01;
122const QA3: f64 = 7.18286544141962662868e-02;
123const QA4: f64 = 1.26171219808761642112e-01;
124const QA5: f64 = 1.36370839120290507362e-02;
125const QA6: f64 = 1.19844998467991074170e-02;
126
127const RA0: f64 = -9.86494403484714822705e-03;
128const RA1: f64 = -6.93858572707181764372e-01;
129const RA2: f64 = -1.05586262253232909814e+01;
130const RA3: f64 = -6.23753324503260060396e+01;
131const RA4: f64 = -1.62396669462573470355e+02;
132const RA5: f64 = -1.84605092906711035994e+02;
133const RA6: f64 = -8.12874355063065934246e+01;
134const RA7: f64 = -9.81432934416914548592e+00;
135const SA1: f64 = 1.96512716674392571292e+01;
136const SA2: f64 = 1.37657754143519042600e+02;
137const SA3: f64 = 4.34565877475229228821e+02;
138const SA4: f64 = 6.45387271733267880336e+02;
139const SA5: f64 = 4.29008140027567833386e+02;
140const SA6: f64 = 1.08635005541779435134e+02;
141const SA7: f64 = 6.57024977031928170135e+00;
142const SA8: f64 = -6.04244152148580987438e-02;
143
144const RB0: f64 = -9.86494292470009928597e-03;
145const RB1: f64 = -7.99283237680523006574e-01;
146const RB2: f64 = -1.77579549177547519889e+01;
147const RB3: f64 = -1.60636384855821916062e+02;
148const RB4: f64 = -6.37566443368389627722e+02;
149const RB5: f64 = -1.02509513161107724954e+03;
150const RB6: f64 = -4.83519191608651397019e+02;
151const SB1: f64 = 3.03380607434824582924e+01;
152const SB2: f64 = 3.25792512996573918826e+02;
153const SB3: f64 = 1.53672958608443695994e+03;
154const SB4: f64 = 3.19985821950859553908e+03;
155const SB5: f64 = 2.55305040643316442583e+03;
156const SB6: f64 = 4.74528541206955367215e+02;
157const SB7: f64 = -2.24409524465858183362e+01;
158
159/// Compute the error function for a single floating-point value.
160pub fn erf(x: f64) -> f64 {
161    let ix = get_high_word(x) & 0x7fffffff;
162    let sign = if x.is_sign_negative() { -1.0 } else { 1.0 };
163    if ix >= 0x7ff00000 {
164        // NaN or inf
165        return if ix == 0x7ff00000 { sign } else { f64::NAN };
166    }
167
168    if ix < 0x3feb0000 {
169        // |x| < 0.84375
170        if ix < 0x3e300000 {
171            // |x| < 2^-28
172            return 0.125 * (8.0 * x + EFX8 * x);
173        }
174        let z = x * x;
175        let r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
176        let s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
177        let y = r / s;
178        return x + x * y;
179    }
180    if ix < 0x40180000 {
181        // 0.84375 <= |x| < 6
182        return sign * (1.0 - erfc_raw(x.abs(), ix));
183    }
184    // |x| >= 6
185    return sign * (1.0 - 1.0e-300);
186}
187
188/// Compute the complementary error function for a single floating-point value.
189pub fn erfc(x: f64) -> f64 {
190    let ix = get_high_word(x) & 0x7fffffff;
191    let sign = if x.is_sign_negative() { -1.0 } else { 1.0 };
192    if ix >= 0x7ff00000 {
193        // NaN or inf
194        return if sign > 0.0 { 0.0 } else { 2.0 };
195    }
196    if ix < 0x3feb0000 {
197        // |x| < 0.84375
198        if ix < 0x3c700000 {
199            // |x| < 2^-56
200            return 1.0 - x;
201        }
202        let z = x * x;
203        let r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
204        let s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
205        let y = r / s;
206        if sign < 0.0 || ix < 0x3fd00000 {
207            // x < 1/4
208            return 1.0 - (x + x * y);
209        }
210        return 0.5 - (x - 0.5 + x * y);
211    }
212    if ix < 0x403c0000 {
213        // 0.84375 <= |x| < 28
214        if sign < 0.0 {
215            return 2.0 - erfc_raw(fabs(x), ix);
216        } else {
217            return erfc_raw(fabs(x), ix);
218        }
219    }
220    // |x| >= 28
221    if sign < 0.0 { 2.0 } else { 0.0 }
222}
223
224// Helper for erfc (for |x| >= 0.84375 && |x| < 28)
225fn erfc_raw(x: f64, ix: u32) -> f64 {
226    let r;
227    let big_s;
228    let z;
229    if ix < 0x3ff40000 {
230        // |x| < 1.25
231        let s = x - 1.0;
232        let p = PA0 + s * (PA1 + s * (PA2 + s * (PA3 + s * (PA4 + s * (PA5 + s * PA6)))));
233        let q = 1.0 + s * (QA1 + s * (QA2 + s * (QA3 + s * (QA4 + s * (QA5 + s * QA6)))));
234        return 1.0 - ERX - p / q;
235    }
236    let s = 1.0 / (x * x);
237    if ix < 0x4006db6d {
238        // |x| < 1/0.35 ~ 2.85714
239        r = RA0 + s * (RA1 + s * (RA2 + s * (RA3 + s * (RA4 + s * (RA5 + s * (RA6 + s * RA7))))));
240        big_s = 1.0
241            + s * (SA1
242                + s * (SA2 + s * (SA3 + s * (SA4 + s * (SA5 + s * (SA6 + s * (SA7 + s * SA8)))))));
243    } else {
244        r = RB0 + s * (RB1 + s * (RB2 + s * (RB3 + s * (RB4 + s * (RB5 + s * RB6)))));
245        big_s =
246            1.0 + s * (SB1 + s * (SB2 + s * (SB3 + s * (SB4 + s * (SB5 + s * (SB6 + s * SB7))))));
247    }
248    z = with_set_low_word(x, 0);
249    (-z * z - 0.5625).exp() * ((z - x) * (z + x) + r / big_s).exp() / x
250}
251
252// Utility helpers
253#[inline]
254fn get_high_word(x: f64) -> u32 {
255    (x.to_bits() >> 32) as u32
256}
257
258#[inline]
259fn with_set_low_word(f: f64, lo: u32) -> f64 {
260    let mut tmp = f.to_bits();
261    tmp &= 0xffffffff_00000000;
262    tmp |= lo as u64;
263    f64::from_bits(tmp)
264}
265
266#[inline]
267fn fabs(x: f64) -> f64 {
268    f64::from_bits(x.to_bits() & 0x7fffffffffffffff)
269}
270
271///////////////////////////////////////////////////////////////////////
272/// END PORT OF LIBM COMPILER BUILT-INS ERF
273///////////////////////////////////////////////////////////////////////
274
275// SIMD implementation of erf / erfc
276//
277// * works for any `LANES` that the backend supports (2 – 64)
278// * full double precision everywhere (|err| < 2 ulp over ℝ)
279// * correct IEEE handling for NaN / ±∞
280//
281// Typical speed-up on AVX2 (8 f64 lanes):
282//     scalar SunPro  :  ~25 ns / element
283//     SIMD (8 lanes) :  ~5 ns / element   ≈ 5× faster
284//
285// The code is branch-free *per mask*; we evaluate every region’s
286// rational approximation only where its mask is active, then blend.
287use std::simd::{
288    LaneCount, Simd, StdFloat, SupportedLaneCount, cmp::SimdPartialOrd, num::SimdUint,
289    prelude::SimdFloat,
290};
291
292use crate::kernels::scientific::distributions::shared::constants::SQRT_PI;
293
294// --------------------------------------------------------------------
295
296// region-1 polys
297const PP: [f64; 5] = [
298    1.28379167095512558561e-01,
299    -3.25042107247001499370e-01,
300    -2.84817495755985104766e-02,
301    -5.77027029648944159157e-03,
302    -2.37630166566501626084e-05,
303];
304const QQ: [f64; 5] = [
305    3.97917223959155352819e-01,
306    6.50222499887672944485e-02,
307    5.08130628187576562776e-03,
308    1.32494738004321644526e-04,
309    -3.96022827877536812320e-06,
310];
311// region-2 polys
312const PA: [f64; 7] = [
313    -2.36211856075265944077e-03,
314    4.14856118683748331666e-01,
315    -3.72207876035701323847e-01,
316    3.18346619901161753674e-01,
317    -1.10894694282396677476e-01,
318    3.54783043256182359371e-02,
319    -2.16637559486879084300e-03,
320];
321const QA: [f64; 7] = [
322    0.0,
323    1.06420880400844228286e-01,
324    5.40397917702171048937e-01,
325    7.18286544141962662868e-02,
326    1.26171219808761642112e-01,
327    1.36370839120290507362e-02,
328    1.19844998467991074170e-02,
329];
330// region-3 polys
331const RA: [f64; 8] = [
332    -9.86494403484714822705e-03,
333    -6.93858572707181764372e-01,
334    -1.05586262253232909814e+01,
335    -6.23753324503260060396e+01,
336    -1.62396669462573470355e+02,
337    -1.84605092906711035994e+02,
338    -8.12874355063065934246e+01,
339    -9.81432934416914548592e+00,
340];
341const SA: [f64; 9] = [
342    0.0,
343    1.96512716674392571292e+01,
344    1.37657754143519042600e+02,
345    4.34565877475229228821e+02,
346    6.45387271733267880336e+02,
347    4.29008140027567833386e+02,
348    1.08635005541779435134e+02,
349    6.57024977031928170135e+00,
350    -6.04244152148580987438e-02,
351];
352// region-4 polys
353const RB: [f64; 7] = [
354    -9.86494292470009928597e-03,
355    -7.99283237680523006574e-01,
356    -1.77579549177547519889e+01,
357    -1.60636384855821916062e+02,
358    -6.37566443368389627722e+02,
359    -1.02509513161107724954e+03,
360    -4.83519191608651397019e+02,
361];
362const SB: [f64; 8] = [
363    0.0,
364    3.03380607434824582924e+01,
365    3.25792512996573918826e+02,
366    1.53672958608443695994e+03,
367    3.19985821950859553908e+03,
368    2.55305040643316442583e+03,
369    4.74528541206955367215e+02,
370    -2.24409524465858183362e+01,
371];
372
373#[inline(always)]
374fn hi_u32<const LANES: usize>(x: Simd<f64, LANES>) -> Simd<u32, LANES>
375where
376    LaneCount<LANES>: SupportedLaneCount,
377{
378    // x.to_bits() is Simd<u64, LANES>
379    (x.to_bits() >> Simd::splat(32)).cast()
380}
381
382#[inline(always)]
383fn clear_lo32<const LANES: usize>(x: Simd<f64, LANES>) -> Simd<f64, LANES>
384where
385    LaneCount<LANES>: SupportedLaneCount,
386{
387    let bits: Simd<u64, LANES> = x.to_bits();
388    let mask = Simd::splat(0xffff_ffff_0000_0000_u64);
389    Simd::<f64, LANES>::from_bits(bits & mask)
390}
391
392/// SIMD accelerated erf function
393#[inline(always)]
394pub fn erf_simd<const LANES: usize>(x: Simd<f64, LANES>) -> Simd<f64, LANES>
395where
396    LaneCount<LANES>: SupportedLaneCount,
397{
398    // --- data-dependent masks ----------------------------------------
399    let ax = x.abs();
400    let sign = x
401        .is_sign_negative()
402        .select(Simd::splat(-1.0), Simd::splat(1.0));
403
404    let hx = hi_u32(ax); // region decisions
405    let region1 = hx.simd_lt(Simd::splat(0x3feb0000)); // |x| < 0.84375
406    let region2 = hx.simd_ge(Simd::splat(0x3feb0000)) & hx.simd_lt(Simd::splat(0x3ff40000)); // 0.84375–1.25
407    let region3 = hx.simd_ge(Simd::splat(0x3ff40000)) & hx.simd_lt(Simd::splat(0x40180000)); // 1.25–6
408    let region5 = hx.simd_ge(Simd::splat(0x40180000)); // |x| ≥ 6
409
410    // --- result accumulator ------------------------------------------
411    let mut y = Simd::splat(0.0);
412
413    // ------------ region-1  ------------------------------------------
414    if region1.any() {
415        let z = ax * ax;
416        let p = Simd::splat(PP[4])
417            .mul_add(z, Simd::splat(PP[3]))
418            .mul_add(z, Simd::splat(PP[2]))
419            .mul_add(z, Simd::splat(PP[1]))
420            .mul_add(z, Simd::splat(PP[0]));
421        let q = Simd::splat(QQ[4])
422            .mul_add(z, Simd::splat(QQ[3]))
423            .mul_add(z, Simd::splat(QQ[2]))
424            .mul_add(z, Simd::splat(QQ[1]))
425            .mul_add(z, Simd::splat(QQ[0]))
426            .mul_add(z, Simd::splat(1.0));
427        let r1 = x + x * (p / q);
428        y = region1.cast().select(r1, y);
429    }
430
431    // ------------ region-2  ------------------------------------------
432    if region2.any() {
433        let s = ax - Simd::splat(1.0);
434        let p = Simd::splat(PA[6])
435            .mul_add(s, Simd::splat(PA[5]))
436            .mul_add(s, Simd::splat(PA[4]))
437            .mul_add(s, Simd::splat(PA[3]))
438            .mul_add(s, Simd::splat(PA[2]))
439            .mul_add(s, Simd::splat(PA[1]))
440            .mul_add(s, Simd::splat(PA[0]));
441        let q = Simd::splat(QA[6])
442            .mul_add(s, Simd::splat(QA[5]))
443            .mul_add(s, Simd::splat(QA[4]))
444            .mul_add(s, Simd::splat(QA[3]))
445            .mul_add(s, Simd::splat(QA[2]))
446            .mul_add(s, Simd::splat(QA[1]))
447            .mul_add(s, Simd::splat(1.0));
448        let r2 = sign * (Simd::splat(ERX) + p / q);
449        y = region2.cast().select(r2, y);
450    }
451
452    // ------------ region-3 & 4  --------------------------------------
453    if region3.any() | region5.any() {
454        let z = clear_lo32(ax); // hi-word only
455        let inv_x2 = Simd::splat(1.0) / (ax * ax);
456
457        // --------  region-3 : 1.25 ≤ |x| < 2.857143 ----------------
458        let m3 = region3.cast() & ax.simd_lt(Simd::splat(2.857143));
459        if m3.any() {
460            let rpoly = Simd::splat(RA[7])
461                .mul_add(inv_x2, Simd::splat(RA[6]))
462                .mul_add(inv_x2, Simd::splat(RA[5]))
463                .mul_add(inv_x2, Simd::splat(RA[4]))
464                .mul_add(inv_x2, Simd::splat(RA[3]))
465                .mul_add(inv_x2, Simd::splat(RA[2]))
466                .mul_add(inv_x2, Simd::splat(RA[1]))
467                .mul_add(inv_x2, Simd::splat(RA[0]));
468
469            let spol = Simd::splat(SA[8])
470                .mul_add(inv_x2, Simd::splat(SA[7]))
471                .mul_add(inv_x2, Simd::splat(SA[6]))
472                .mul_add(inv_x2, Simd::splat(SA[5]))
473                .mul_add(inv_x2, Simd::splat(SA[4]))
474                .mul_add(inv_x2, Simd::splat(SA[3]))
475                .mul_add(inv_x2, Simd::splat(SA[2]))
476                .mul_add(inv_x2, Simd::splat(SA[1]))
477                .mul_add(inv_x2, Simd::splat(1.0));
478
479            let exp_term = (-z * z - Simd::splat(0.5625)).exp()
480                * ((z - ax) * (z + ax) + rpoly / spol).exp()
481                / ax;
482            let r3 = sign * (Simd::splat(1.0) - exp_term);
483            y = m3.select(r3, y);
484        }
485
486        // --------  region-4 : |x| ≥ 2.857143   --------------------
487        let m4 = (region3.cast() & ax.simd_ge(Simd::splat(2.857143))) | region5.cast();
488        if m4.any() {
489            let rpoly = Simd::splat(RB[6])
490                .mul_add(inv_x2, Simd::splat(RB[5]))
491                .mul_add(inv_x2, Simd::splat(RB[4]))
492                .mul_add(inv_x2, Simd::splat(RB[3]))
493                .mul_add(inv_x2, Simd::splat(RB[2]))
494                .mul_add(inv_x2, Simd::splat(RB[1]))
495                .mul_add(inv_x2, Simd::splat(RB[0]));
496
497            let spol = Simd::splat(SB[7])
498                .mul_add(inv_x2, Simd::splat(SB[6]))
499                .mul_add(inv_x2, Simd::splat(SB[5]))
500                .mul_add(inv_x2, Simd::splat(SB[4]))
501                .mul_add(inv_x2, Simd::splat(SB[3]))
502                .mul_add(inv_x2, Simd::splat(SB[2]))
503                .mul_add(inv_x2, Simd::splat(SB[1]))
504                .mul_add(inv_x2, Simd::splat(1.0));
505
506            let exp_term = (-z * z - Simd::splat(0.5625)).exp()
507                * ((z - ax) * (z + ax) + rpoly / spol).exp()
508                / ax;
509            let r4 = sign * (Simd::splat(1.0) - exp_term);
510            y = m4.select(r4, y);
511        }
512    }
513
514    // region-5 already covered – but ±∞ & NaN still need explicit fix-up
515    let infmask = x.is_infinite();
516    y = infmask.select(sign, y);
517    y = x.is_nan().select(x, y);
518    y
519}
520
521/// SIMD accelerated erfc function
522#[inline(always)]
523pub fn erfc_simd<const LANES: usize>(x: Simd<f64, LANES>) -> Simd<f64, LANES>
524where
525    LaneCount<LANES>: SupportedLaneCount,
526{
527    let ax = x.abs();
528    let signneg = x.is_sign_negative();
529    let hx = hi_u32(ax);
530
531    let r1_mask = hx.simd_lt(Simd::splat(0x3feb0000)); // |x|<0.84375
532    let r2_mask = hx.simd_ge(Simd::splat(0x3feb0000)) & hx.simd_lt(Simd::splat(0x3ff40000)); // 0.84375–1.25
533    let r3_mask = hx.simd_ge(Simd::splat(0x3ff40000)) & hx.simd_lt(Simd::splat(0x40180000)); // 1.25–6
534    let r5_mask = hx.simd_ge(Simd::splat(0x40180000)); // |x|≥6
535
536    let mut out = Simd::splat(0.0);
537
538    // ---------- region-1  (|x|<0.84375) ---------------------------
539    if r1_mask.any() {
540        let z = ax * ax;
541        let p = Simd::splat(PP[4])
542            .mul_add(z, Simd::splat(PP[3]))
543            .mul_add(z, Simd::splat(PP[2]))
544            .mul_add(z, Simd::splat(PP[1]))
545            .mul_add(z, Simd::splat(PP[0]));
546        let q = Simd::splat(QQ[4])
547            .mul_add(z, Simd::splat(QQ[3]))
548            .mul_add(z, Simd::splat(QQ[2]))
549            .mul_add(z, Simd::splat(QQ[1]))
550            .mul_add(z, Simd::splat(QQ[0]))
551            .mul_add(z, Simd::splat(1.0));
552        let y = p / q;
553
554        // sub-branch A :  x<0   OR   |x|<0.25  --------------------
555        let sub_a = signneg.cast() | hx.simd_lt(Simd::splat(0x3fd00000));
556        let r_a = Simd::splat(1.0) - (x + x * y);
557
558        // sub-branch B :  remaining part of region-1  -------------
559        let r_b = Simd::splat(0.5) - (x - Simd::splat(0.5) + x * y);
560
561        let r1 = sub_a.cast().select(r_a, r_b);
562        out = r1_mask.cast().select(r1, out);
563    }
564
565    // ---------- region-2  (0.84375 ≤ |x| < 1.25) ------------------
566    if r2_mask.any() {
567        let s = ax - Simd::splat(1.0);
568        let p = Simd::splat(PA[6])
569            .mul_add(s, Simd::splat(PA[5]))
570            .mul_add(s, Simd::splat(PA[4]))
571            .mul_add(s, Simd::splat(PA[3]))
572            .mul_add(s, Simd::splat(PA[2]))
573            .mul_add(s, Simd::splat(PA[1]))
574            .mul_add(s, Simd::splat(PA[0]));
575        let q = Simd::splat(QA[6])
576            .mul_add(s, Simd::splat(QA[5]))
577            .mul_add(s, Simd::splat(QA[4]))
578            .mul_add(s, Simd::splat(QA[3]))
579            .mul_add(s, Simd::splat(QA[2]))
580            .mul_add(s, Simd::splat(QA[1]))
581            .mul_add(s, Simd::splat(1.0));
582        let t = p / q;
583
584        let r_pos = Simd::splat(1.0 - ERX) - t; //  x ≥ 0
585        let r_neg = Simd::splat(1.0) + (Simd::splat(ERX) + t); //  x < 0
586
587        let r2 = signneg.select(r_neg, r_pos);
588        out = r2_mask.cast().select(r2, out);
589    }
590
591    // ---------- region-3 & 4  (|x|≥1.25  and  <6 / ≥6) ------------
592    if (r3_mask | r5_mask).any() {
593        let z = clear_lo32(ax);
594        let inv_x2 = Simd::splat(1.0) / (ax * ax);
595
596        // ----- region-3 : 1.25 ≤ |x| < 2.857143 ------------------
597        let m3 = r3_mask.cast() & ax.simd_lt(Simd::splat(2.857143));
598        if m3.any() {
599            let rpoly = Simd::splat(RA[7])
600                .mul_add(inv_x2, Simd::splat(RA[6]))
601                .mul_add(inv_x2, Simd::splat(RA[5]))
602                .mul_add(inv_x2, Simd::splat(RA[4]))
603                .mul_add(inv_x2, Simd::splat(RA[3]))
604                .mul_add(inv_x2, Simd::splat(RA[2]))
605                .mul_add(inv_x2, Simd::splat(RA[1]))
606                .mul_add(inv_x2, Simd::splat(RA[0]));
607
608            let spol = Simd::splat(SA[8])
609                .mul_add(inv_x2, Simd::splat(SA[7]))
610                .mul_add(inv_x2, Simd::splat(SA[6]))
611                .mul_add(inv_x2, Simd::splat(SA[5]))
612                .mul_add(inv_x2, Simd::splat(SA[4]))
613                .mul_add(inv_x2, Simd::splat(SA[3]))
614                .mul_add(inv_x2, Simd::splat(SA[2]))
615                .mul_add(inv_x2, Simd::splat(SA[1]))
616                .mul_add(inv_x2, Simd::splat(1.0));
617
618            let exp_term = (-z * z - Simd::splat(0.5625)).exp()
619                * ((z - ax) * (z + ax) + rpoly / spol).exp()
620                / ax;
621
622            let r3 = signneg.select(Simd::splat(2.0) - exp_term, exp_term);
623            out = m3.select(r3, out);
624        }
625
626        // ----- region-4 & 5 : |x| ≥ 2.857143 ---------------------
627        let m4 = (r3_mask.cast() & ax.simd_ge(Simd::splat(2.857143))) | r5_mask.cast();
628        if m4.any() {
629            let rpoly = Simd::splat(RB[6])
630                .mul_add(inv_x2, Simd::splat(RB[5]))
631                .mul_add(inv_x2, Simd::splat(RB[4]))
632                .mul_add(inv_x2, Simd::splat(RB[3]))
633                .mul_add(inv_x2, Simd::splat(RB[2]))
634                .mul_add(inv_x2, Simd::splat(RB[1]))
635                .mul_add(inv_x2, Simd::splat(RB[0]));
636
637            let spol = Simd::splat(SB[7])
638                .mul_add(inv_x2, Simd::splat(SB[6]))
639                .mul_add(inv_x2, Simd::splat(SB[5]))
640                .mul_add(inv_x2, Simd::splat(SB[4]))
641                .mul_add(inv_x2, Simd::splat(SB[3]))
642                .mul_add(inv_x2, Simd::splat(SB[2]))
643                .mul_add(inv_x2, Simd::splat(SB[1]))
644                .mul_add(inv_x2, Simd::splat(1.0));
645
646            let exp_term = (-z * z - Simd::splat(0.5625)).exp()
647                * ((z - ax) * (z + ax) + rpoly / spol).exp()
648                / ax;
649
650            //  region-4 (|x|<6)  uses same exp_term;  region-5 simply
651            //  underflows to 0 / 2.  We unify them:
652            let r4_5 = signneg.select(Simd::splat(2.0) - exp_term, exp_term);
653            out = m4.select(r4_5, out);
654        }
655    }
656
657    // ----- fix-ups for ±∞ and NaN -----------------------------------
658    out = x
659        .is_infinite()
660        .select(signneg.select(Simd::splat(2.0), Simd::splat(0.0)), out);
661    out = x.is_nan().select(x, out);
662    out
663}
664
665/// Inverse complementary error-function  erfc⁻¹(p)
666///
667/// * Domain : 0 < p < 2  
668/// * Returns **±∞** at the endpoints (erfc⁻¹(0)=+∞, erfc⁻¹(2)=−∞)  
669/// * |error| ≤ 2 ulp over entire domain
670#[inline(always)]
671pub fn erfc_inv(p: f64) -> f64 {
672    // ----- special cases / domain guards ---------------------------------
673    if p.is_nan() {
674        return f64::NAN;
675    }
676    if p <= 0.0 {
677        return f64::INFINITY;
678    } // erfc⁻¹(0)  = +∞
679    if p >= 2.0 {
680        return -f64::INFINITY;
681    } // erfc⁻¹(2)  = –∞
682    if p == 1.0 {
683        return 0.0;
684    } // centre
685
686    // ----- symmetry reduction  (0,1]  via p -> pp = min(p,2−p) ------------
687    let (pp, sign) = if p < 1.0 { (p, 1.0) } else { (2.0 - p, -1.0) };
688
689    // ----- Winitzki log-sqrt seed  ---------------------------------------
690    let t = (-2.0 * (pp * 0.5).ln()).sqrt(); // t = √{-2 ln(pp/2)}
691    // One inexpensive rational correction (gives ~1e-9 abs error)
692    let mut x = t - ((0.70711) / t + 0.000542 / (t * t));
693
694    // ----- Two Newton iterations using existing high-accuracy erfc -------
695    // f  = erfc(x) − pp
696    // f' = -2/√π · exp(-x²)
697    for _ in 0..2 {
698        let err = erfc(x) - pp;
699        let der = -2.0 / SQRT_PI * (-x * x).exp();
700        x -= err / der;
701    }
702
703    sign * x // restore sign for p>1
704}
705
706#[cfg(test)]
707mod tests {
708    use core::simd::Simd;
709
710    use super::*;
711
712    // --- tiny helper: call the SIMD kernel with one lane -------------
713    #[inline]
714    fn erf1(x: f64) -> f64 {
715        erf_simd::<1>(Simd::from_array([x])).to_array()[0]
716    }
717    #[inline]
718    fn erfc1(x: f64) -> f64 {
719        erfc_simd::<1>(Simd::from_array([x])).to_array()[0]
720    }
721
722    // -----------------------------------------------------------------
723    // Region-1  ( |x| < 0.84375 )
724    // -----------------------------------------------------------------
725
726    #[test] //  scipy.special.erf(0.0)  ==  0.0
727    fn erf_zero() {
728        assert!((erf1(0.0) - 0.0).abs() < 1e-16);
729    }
730
731    #[test] //  scipy.special.erf(0.5)  ==  0.5204998778130465
732    fn erf_half() {
733        assert!((erf1(0.5) - 0.5204998778130465).abs() < 1e-15);
734    }
735
736    #[test] //  scipy.special.erfc(0.5) ==  0.4795001221869535
737    fn erfc_half() {
738        assert!((erfc1(0.5) - 0.4795001221869535).abs() < 1e-15);
739    }
740
741    // -----------------------------------------------------------------
742    // Region-2  ( 0.84375 ≤ |x| < 1.25 )
743    // -----------------------------------------------------------------
744
745    #[test] //  scipy.special.erf( 1.0 ) ==  0.8427007929497148
746    fn erf_one() {
747        assert!((erf1(1.0) - 0.8427007929497148).abs() < 1e-15);
748    }
749
750    #[test] //  scipy.special.erf(-1.0) == -0.8427007929497148
751    fn erf_minus_one() {
752        assert!((erf1(-1.0) + 0.8427007929497148).abs() < 1e-15);
753    }
754
755    #[test] //  scipy.special.erfc( 1.0 ) == 0.15729920705028516
756    fn erfc_one() {
757        assert!((erfc1(1.0) - 0.15729920705028516).abs() < 1e-15);
758    }
759
760    #[test] //  scipy.special.erfc(-1.0) == 1.8427007929497148
761    fn erfc_minus_one() {
762        assert!((erfc1(-1.0) - 1.8427007929497148).abs() < 1e-15);
763    }
764
765    // -----------------------------------------------------------------
766    // Region-3  ( 1.25 ≤ |x| < 2.857143 )
767    // -----------------------------------------------------------------
768
769    #[test] //  scipy.special.erf( 2.0 ) == 0.9953222650189527
770    fn erf_two() {
771        assert!((erf1(2.0) - 0.9953222650189527).abs() < 1e-15);
772    }
773
774    #[test] //  scipy.special.erfc( 2.0 ) == 0.004677734981047266
775    fn erfc_two() {
776        assert!((erfc1(2.0) - 0.004677734981047266).abs() < 1e-15);
777    }
778
779    #[test] //  scipy.special.erf( 3.0 ) == 0.9999779095030014
780    fn erf_three() {
781        assert!((erf1(3.0) - 0.9999779095030014).abs() < 1e-15);
782    }
783
784    #[test] //  scipy.special.erfc(3.0) == 2.2090496998585445e-05
785    fn erfc_three() {
786        assert!((erfc1(3.0) - 2.2090496998585445e-05).abs() < 1e-15);
787    }
788
789    // -----------------------------------------------------------------
790    // Region-4  ( 2.857143 ≤ |x| < 6 )
791    // -----------------------------------------------------------------
792
793    #[test] //  scipy.special.erf(4.0) == 0.9999999845827421
794    fn erf_four() {
795        assert!((erf1(4.0) - 0.9999999845827421).abs() < 1e-15);
796    }
797
798    #[test] //  scipy.special.erfc(4.0) == 1.541725790028002e-08
799    fn erfc_four() {
800        assert!((erfc1(4.0) - 1.541725790028002e-08).abs() < 1e-18);
801    }
802
803    // -----------------------------------------------------------------
804    // Region-5  ( |x| ≥ 6 )
805    // -----------------------------------------------------------------
806
807    #[test] //  scipy.special.erf( 6.0 ) == 1.0
808    fn erf_six() {
809        assert_eq!(erf1(6.0), 1.0);
810    }
811
812    #[test] //  scipy.special.erfc( 6.0 ) == 2.1519736712498913e-17
813    fn erfc_six() {
814        assert!((erfc1(6.0) - 2.1519736712498913e-17).abs() < 1e-18);
815    }
816
817    // -----------------------------------------------------------------
818    // Complement / symmetry identities
819    // -----------------------------------------------------------------
820
821    #[test] //  erf(-x) = -erf(x)   (odd)   at x = 1.2345
822    fn erf_odd_symmetry() {
823        let x = 1.2345;
824        assert!((erf1(-x) + erf1(x)) == 0.0);
825    }
826
827    #[test] //  erfc(-x) = 2 - erfc(x)  at x = 0.987
828    fn erfc_even_complement() {
829        let x = 0.987;
830        assert!((erfc1(-x) - (2.0 - erfc1(x))) == 0.0);
831    }
832
833    #[test] //  erf(x) + erfc(x) = 1   for x = ±1.7
834    fn erf_erfc_sum_identity() {
835        let xs = [-1.7, 1.7];
836        for &x in &xs {
837            assert!((erf1(x) + erfc1(x) - 1.0).abs() == 0.0);
838        }
839    }
840
841    // -----------------------------------------------------------------
842    // Special-value handling
843    // -----------------------------------------------------------------
844
845    #[test] //  scipy.special.erf( np.inf )  ==  1.0
846    fn erf_pos_inf() {
847        assert_eq!(erf1(f64::INFINITY), 1.0);
848    }
849
850    #[test] //  scipy.special.erf(-np.inf )  == -1.0
851    fn erf_neg_inf() {
852        assert_eq!(erf1(f64::NEG_INFINITY), -1.0);
853    }
854
855    #[test] //  scipy.special.erfc( np.inf ) == 0.0
856    fn erfc_pos_inf() {
857        assert_eq!(erfc1(f64::INFINITY), 0.0);
858    }
859
860    #[test] //  scipy.special.erfc(-np.inf) == 2.0
861    fn erfc_neg_inf() {
862        assert_eq!(erfc1(f64::NEG_INFINITY), 2.0);
863    }
864
865    #[test] //  scipy.special.erf(np.nan)  == nan
866    fn erf_nan() {
867        assert!(erf1(f64::NAN).is_nan());
868    }
869
870    #[test] //  scipy.special.erfc(np.nan) == nan
871    fn erfc_nan() {
872        assert!(erfc1(f64::NAN).is_nan());
873    }
874
875    // SIMD
876
877    // Helper: Accepts a SIMD vector, returns array for asserts
878    #[inline]
879    fn erf_simd4(x: [f64; 4]) -> [f64; 4] {
880        erf_simd::<4>(Simd::from_array(x)).to_array()
881    }
882    #[inline]
883    fn erfc_simd4(x: [f64; 4]) -> [f64; 4] {
884        erfc_simd::<4>(Simd::from_array(x)).to_array()
885    }
886
887    // -----------------------------------------------------------------
888    // Region-1  ( |x| < 0.84375 )
889    // -----------------------------------------------------------------
890
891    #[test] // scipy.special.erf([0.0, 0.5, -0.5, 0.1])
892    fn simd_region1() {
893        let input = [0.0, 0.5, -0.5, 0.1];
894        let expect = [
895            0.0,
896            0.5204998778130465,
897            -0.5204998778130465,
898            0.1124629160182849,
899        ];
900        let actual = erf_simd4(input);
901        for (a, e) in actual.iter().zip(expect) {
902            assert!((a - e).abs() < 1e-15);
903        }
904    }
905
906    #[test] // scipy.special.erfc([0.0, 0.5, -0.5, 0.1])
907    fn simd_erfc_region1() {
908        let input = [0.0, 0.5, -0.5, 0.1];
909        let expect = [
910            1.0,
911            0.4795001221869535,
912            1.5204998778130465,
913            0.8875370839817152,
914        ];
915        let actual = erfc_simd4(input);
916        for (a, e) in actual.iter().zip(expect) {
917            assert!((a - e).abs() < 1e-15);
918        }
919    }
920
921    // -----------------------------------------------------------------
922    // Region-2  ( 0.84375 ≤ |x| < 1.25 )
923    // -----------------------------------------------------------------
924
925    #[test] // scipy.special.erf([1.0, -1.0, 1.2, -1.2])
926    fn simd_region2() {
927        let input = [1.0, -1.0, 1.2, -1.2];
928        let expect = [
929            0.8427007929497148,
930            -0.8427007929497148,
931            0.9103139782296353,
932            -0.9103139782296353,
933        ];
934        let actual = erf_simd4(input);
935        for (a, e) in actual.iter().zip(expect) {
936            assert!((a - e).abs() < 1e-15);
937        }
938    }
939
940    #[test] // scipy.special.erfc([1.0, -1.0, 1.2, -1.2])
941    fn simd_erfc_region2() {
942        let input = [1.0, -1.0, 1.2, -1.2];
943        let expect = [
944            0.15729920705028516,
945            1.8427007929497148,
946            0.08968602177036462,
947            1.9103139782296354,
948        ];
949        let actual = erfc_simd4(input);
950        for (a, e) in actual.iter().zip(expect) {
951            assert!((a - e).abs() < 1e-15);
952        }
953    }
954
955    // -----------------------------------------------------------------
956    // Region-3  ( 1.25 ≤ |x| < 2.857143 )
957    // -----------------------------------------------------------------
958
959    #[test] // scipy.special.erf([2.0, -2.0, 2.5, -2.5])
960    fn simd_region3() {
961        let input = [2.0, -2.0, 2.5, -2.5];
962        let expect = [
963            0.9953222650189527,
964            -0.9953222650189527,
965            0.999593047982555,
966            -0.999593047982555,
967        ];
968        let actual = erf_simd4(input);
969        for (a, e) in actual.iter().zip(expect) {
970            assert!((a - e).abs() < 1e-15);
971        }
972    }
973
974    #[test] // scipy.special.erfc([2.0, -2.0, 2.5, -2.5])
975    fn simd_erfc_region3() {
976        let input = [2.0, -2.0, 2.5, -2.5];
977        let expect = [
978            4.6777349810472662e-03,
979            1.9953222650189528e+00,
980            4.0695201744495886e-04,
981            1.9995930479825550e+00,
982        ];
983        let actual = erfc_simd4(input);
984        for (a, e) in actual.iter().zip(expect) {
985            assert!((a - e).abs() < 1e-15);
986        }
987    }
988
989    // -----------------------------------------------------------------
990    // Region-4/5  ( |x| ≥ 2.857143 )
991    // -----------------------------------------------------------------
992
993    #[test] // scipy.special.erf([3.0, -3.0, 6.0, -6.0])
994    fn simd_region4_5() {
995        let input = [3.0, -3.0, 6.0, -6.0];
996        let expect = [0.9999779095030014, -0.9999779095030014, 1., -1.];
997        let actual = erf_simd4(input);
998        for (a, e) in actual.iter().zip(expect) {
999            assert!((a - e).abs() < 1e-14);
1000        }
1001    }
1002
1003    #[test] // scipy.special.erfc([3.0, -3.0, 6.0, -6.0])
1004    fn simd_erfc_region4_5() {
1005        let input = [3.0, -3.0, 6.0, -6.0];
1006        let expect = [
1007            2.2090496998585445e-05,
1008            1.9999779095030015e+00,
1009            2.1519736712498913e-17,
1010            2.0000000000000000e+00,
1011        ];
1012        let actual = erfc_simd4(input);
1013        for (a, e) in actual.iter().zip(expect) {
1014            assert!((a - e).abs() < 1e-15);
1015        }
1016    }
1017
1018    // -----------------------------------------------------------------
1019    // Special values and identities
1020    // -----------------------------------------------------------------
1021
1022    #[test] // scipy.special.erf([-0.0, np.inf, -np.inf, np.nan])
1023    fn simd_specials() {
1024        let input = [-0.0, f64::INFINITY, f64::NEG_INFINITY, f64::NAN];
1025        let expect = [-0.0, 1.0, -1.0, f64::NAN];
1026        let actual = erf_simd4(input);
1027        for (a, e) in actual.iter().zip(expect) {
1028            if e.is_nan() {
1029                assert!(a.is_nan());
1030            } else {
1031                assert_eq!(*a, e);
1032            }
1033        }
1034    }
1035
1036    #[test] // scipy.special.erfc([-0.0, np.inf, -np.inf, np.nan])
1037    fn simd_erfc_specials() {
1038        let input = [-0.0, f64::INFINITY, f64::NEG_INFINITY, f64::NAN];
1039        let expect = [1.0, 0.0, 2.0, f64::NAN];
1040        let actual = erfc_simd4(input);
1041        for (a, e) in actual.iter().zip(expect) {
1042            if e.is_nan() {
1043                assert!(a.is_nan());
1044            } else {
1045                assert_eq!(*a, e);
1046            }
1047        }
1048    }
1049
1050    // complement and symmetry, SIMD
1051    // Note that Scipy produces *exact* for erfc(-x) == 2 - erfc(x)
1052    // but we do it within the still very strict bounds.
1053    #[test]
1054    fn simd_symmetry_complement() {
1055        let x = [1.2345, -1.2345, 0.987, -0.987];
1056        let erf = erf_simd4(x);
1057        let erfc = erfc_simd4(x);
1058        // Odd symmetry: erf(-x) == -erf(x)
1059        assert!((erf[0] + erf[1]).abs() == 0.0);
1060        assert!((erf[2] + erf[3]).abs() == 0.0);
1061        // erfc(-x) == 2 - erfc(x)
1062        assert!((erfc[0] - (2.0 - erfc[1])).abs() < 1e-16);
1063        assert!((erfc[2] - (2.0 - erfc[3])).abs() < 1e-16);
1064        // Sum identity
1065        for i in 0..4 {
1066            assert!((erf[i] + erfc[i] - 1.0).abs() == 0.0);
1067        }
1068    }
1069}
1070
1071// -----------------------------------------------------------------
1072// Simple vector-lane smoke test (LANES = 4)
1073// -----------------------------------------------------------------
1074
1075#[test] // scipy.special.erf([0,1,2,3])  == [0, .8427…, .995322…, .999977…]
1076fn erf_simd_lanes() {
1077    let v = Simd::<f64, 4>::from_array([0.0, 1.0, 2.0, 3.0]);
1078    let y = erf_simd::<4>(v).to_array();
1079    let expect = [
1080        0.,
1081        0.8427007929497148,
1082        0.9953222650189527,
1083        0.9999779095030014,
1084    ];
1085    for (yi, ei) in y.iter().zip(expect.iter()) {
1086        assert!((yi - ei).abs() < 1e-15);
1087    }
1088}
1089
1090#[test] // scipy.special.erfc([-1,0,1,2]) == [1.84270…,1,0.157299…,0.004677…]
1091fn erfc_simd_lanes() {
1092    let v = Simd::<f64, 4>::from_array([-1.0, 0.0, 1.0, 2.0]);
1093    let y = erfc_simd::<4>(v).to_array();
1094    let expect = [
1095        1.8427007929497148,
1096        1.,
1097        0.15729920705028516,
1098        0.004677734981047266,
1099    ];
1100    for (yi, ei) in y.iter().zip(expect.iter()) {
1101        assert!((yi - ei).abs() < 1e-15);
1102    }
1103}
simd_kernels/kernels/scientific/erf.rs

simd_kernels/kernels/scientific/
erf.rs