pxfm 0.1.29

Fast and accurate math
Documentation
/*
 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::exponents::expf::{ExpfBackend, GenericExpfBackend};

#[inline(always)]
fn expm1f_gen<B: ExpfBackend>(x: f32, backend: B) -> f32 {
    let x_u: u32 = x.to_bits();
    let x_abs = x_u & 0x7fff_ffffu32;

    // When |x| > 25*log(2), or nan
    if x_abs >= 0x418a_a123u32 {
        // x < log(2^-25)
        if x.is_sign_negative() {
            // exp(-Inf) = 0
            if x.is_infinite() {
                return -1.0;
            }
            // exp(nan) = nan
            if x.is_nan() {
                return x;
            }
            return -1.0;
        } else {
            // x >= 89 or nan
            if x_u >= 0x42b2_0000 {
                return x + f32::INFINITY;
            }
        }
    }

    // |x| < 2^-4
    if x_abs < 0x3d80_0000u32 {
        // |x| < 2^-25
        if x_abs < 0x3300_0000u32 {
            // x = -0.0f
            if x_u == 0x8000_0000u32 {
                return x;
            }
            // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
            // is:
            //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
            //                               = |x|
            //                               < 2^-25
            //                               < epsilon(1)/2.
            // To simplify the rounding decision and make it more efficient, we use
            //   fma(x, x, x) ~ x + x^2 instead.
            // Note: to use the formula x + x^2 to decide the correct rounding, we
            // do need fma(x, x, x) to prevent underflow caused by x*x when |x| <
            // 2^-76. For targets without FMA instructions, we simply use double for
            // intermediate results as it is more efficient than using an emulated
            // version of FMA.
            #[cfg(any(
                all(
                    any(target_arch = "x86", target_arch = "x86_64"),
                    target_feature = "fma"
                ),
                target_arch = "aarch64"
            ))]
            {
                return backend.fmaf(x, x, x);
            }
            #[cfg(not(any(
                all(
                    any(target_arch = "x86", target_arch = "x86_64"),
                    target_feature = "fma"
                ),
                target_arch = "aarch64"
            )))]
            {
                let xd = x as f64;
                return backend.fma(xd, xd, xd) as f32;
            }
        }

        const C: [u64; 7] = [
            0x3fe0000000000000,
            0x3fc55555555557dd,
            0x3fa55555555552fa,
            0x3f8111110fcd58b7,
            0x3f56c16c1717660b,
            0x3f2a0241f0006d62,
            0x3efa01e3f8d3c060,
        ];

        // 2^-25 <= |x| < 2^-4
        let xd = x as f64;
        let xsq = xd * xd;
        // Degree-8 minimax polynomial generated by Sollya with:
        // > display = hexadecimal;
        // > P = fpminimax((expm1(x) - x)/x^2, 6, [|D...|], [-2^-4, 2^-4]);

        return backend.fma(
            backend.polyeval7(
                xd,
                f64::from_bits(C[0]),
                f64::from_bits(C[1]),
                f64::from_bits(C[2]),
                f64::from_bits(C[3]),
                f64::from_bits(C[4]),
                f64::from_bits(C[5]),
                f64::from_bits(C[6]),
            ),
            xsq,
            xd,
        ) as f32;
    }

    // For -104 < x < 89, to compute expm1(x), we perform the following range
    // reduction: find hi, mid, lo such that:
    //   x = hi + mid + lo, in which
    //     hi is an integer,
    //     mid * 2^7 is an integer
    //     -2^(-8) <= lo < 2^-8.
    // In particular,
    //   hi + mid = round(x * 2^7) * 2^(-7).
    // Then,
    //   expm1(x) = expm1(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo) - 1.
    // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
    // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
    // generated by Sollya.

    // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
    let kf = backend.roundf(x * 128.);
    // Subtract (hi + mid) from x to get lo.
    let xd = backend.fmaf(kf, -0.0078125 /* - 1/128 */, x) as f64;
    let mut x_hi = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
    x_hi += 104 << 7;
    // hi = x_hi >> 7
    let exp_hi = f64::from_bits(crate::exponents::expf::EXP_M1[(x_hi >> 7) as usize]);
    // mid * 2^7 = x_hi & 0x0000'007fU;
    let exp_mid = f64::from_bits(crate::exponents::expf::EXP_M2[(x_hi & 0x7f) as usize]);
    // Degree-4 minimax polynomial generated by Sollya with the following
    // commands:
    // d = [-2^-8, 2^-8];
    // f_exp = expm1(x)/x;
    // Q = fpminimax(f_exp, 3, [|D...|], [-2^-8, 2^-8]);
    let p = backend.polyeval5(
        xd,
        1.,
        f64::from_bits(0x3feffffffffff777),
        f64::from_bits(0x3fe000000000071c),
        f64::from_bits(0x3fc555566668e5e7),
        f64::from_bits(0x3fa55555555ef243),
    );
    backend.fma(p * exp_hi, exp_mid, -1.) as f32
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn expm1f_fma_impl(x: f32) -> f32 {
    use crate::exponents::expf::FmaBackend;
    expm1f_gen(x, FmaBackend {})
}

/// Computes e^x - 1
///
/// Max ULP 0.5
#[inline]
pub fn f_expm1f(x: f32) -> f32 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        expm1f_gen(x, GenericExpfBackend {})
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;
        static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
        let q = EXECUTOR.get_or_init(|| {
            if std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma")
            {
                expm1f_fma_impl
            } else {
                fn def_expm1f(x: f32) -> f32 {
                    expm1f_gen(x, GenericExpfBackend {})
                }
                def_expm1f
            }
        });
        unsafe { q(x) }
    }
}

#[cfg(test)]
mod tests {
    use crate::f_expm1f;

    #[test]
    fn test_expm1f() {
        assert_eq!(f_expm1f(-3.0923562e-5), -3.0923085e-5);
        assert_eq!(f_expm1f(2.213121), 8.144211);
        assert_eq!(f_expm1f(-3.213121), -0.9597691);
        assert_eq!(f_expm1f(-2.35099e-38), -2.35099e-38);
        assert_eq!(
            f_expm1f(0.00000000000000000000000000000000000004355616),
            0.00000000000000000000000000000000000004355616
        );
        assert_eq!(f_expm1f(25.12315), 81441420000.0);
        assert_eq!(f_expm1f(12.986543), 436498.6);
        assert_eq!(f_expm1f(-12.986543), -0.99999774);
        assert_eq!(f_expm1f(-25.12315), -1.0);
        assert_eq!(f_expm1f(f32::INFINITY), f32::INFINITY);
        assert_eq!(f_expm1f(f32::NEG_INFINITY), -1.);
        assert!(f_expm1f(f32::NAN).is_nan());
    }
}