pxfm 0.1.29 - Docs.rs

/*
 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::common::{f_fmla, f_fmlaf, pow2if};
use crate::exponents::expf::{ExpfBackend, GenericExpfBackend};
use std::hint::black_box;

const TBLSIZE: usize = 64;

#[repr(align(64))]
struct Exp2Table([(u32, u32); TBLSIZE]);

#[rustfmt::skip]
static EXP2FT: Exp2Table = Exp2Table([(0x3F3504F3, 0xB2D4175E),(0x3F36FD92, 0x3268D5EF),(0x3F38FBAF, 0xB30E8719),(0x3F3AFF5B, 0x3319E7DA),(0x3F3D08A4, 0x333CD82F),(0x3F3F179A, 0x330E1902),(0x3F412C4D, 0x32CCF4D7),(0x3F4346CD, 0x328F330E),(0x3F45672A, 0xB201B5B7),(0x3F478D75, 0x32CCCE34),(0x3F49B9BE, 0x335E937C),(0x3F4BEC15, 0x2FF41909),(0x3F4E248C, 0xB21760EA),(0x3F506334, 0x3283628B),(0x3F52A81E, 0x3340F500),(0x3F54F35B, 0x331202BD),(0x3F5744FD, 0x32B66A3E),(0x3F599D16, 0x32D0D9B1),(0x3F5BFBB8, 0x332ED93F),(0x3F5E60F5, 0x3350A709),(0x3F60CCDF, 0x32025744),(0x3F633F89, 0xB33A7C4D),(0x3F65B907, 0x321DA4E9),(0x3F68396A, 0xB2FF36A7),(0x3F6AC0C7, 0x3217E40E),(0x3F6D4F30, 0xB2400CBB),(0x3F6FE4BA, 0x331A2ACC),(0x3F728177, 0xB2B7D3E5),(0x3F75257D, 0xB1FED2BE),(0x3F77D0DF, 0xB32B73BA),(0x3F7A83B3, 0x32579081),(0x3F7D3E0C, 0xB19726B5),(0x3F800000, 0x00000000),(0x3F8164D2, 0x320C09FB),(0x3F82CD87, 0x3391E031),(0x3F843A29, 0x33287EEF),(0x3F85AAC3, 0xB38F6665),(0x3F871F62, 0x339004AB),(0x3F88980F, 0x33AC4561),(0x3F8A14D5, 0xB39CDAEA),(0x3F8B95C2, 0x32949D5C),(0x3F8D1ADF, 0xB36F79FA),(0x3F8EA43A, 0x33971DC2),(0x3F9031DC, 0xB32BD022),(0x3F91C3D3, 0xB3928952),(0x3F935A2B, 0xB2EBFECF),(0x3F94F4F0, 0x3357B8BB),(0x3F96942D, 0xB307353B),(0x3F9837F0, 0xB345DFE9),(0x3F99E046, 0x3382A804),(0x3F9B8D3A, 0x3326993E),(0x3F9D3EDA, 0x3350A029),(0x3F9EF532, 0xB3605F62),(0x3FA0B051, 0xB210909B),(0x3FA27043, 0xB0DDC369),(0x3FA43516, 0x33385844),(0x3FA5FED7, 0x33400757),(0x3FA7CD94, 0x3325446E),(0x3FA9A15B, 0x33237A50),(0x3FAB7A3A, 0x33201CA4),(0x3FAD583F, 0x32394687),(0x3FAF3B79, 0x332E1225),(0x3FB123F6, 0x33838969),(0x3FB311C4, 0xB219F2BA)]);

/**
Generated by SageMath:
```python
print("[")
for k in range(64):
    k = RealField(150)(2)**(RealField(150)(k) / RealField(150)(64))
    print(double_to_hex(k) + ",")
print("];")
```
**/
pub(crate) static EXP2F_TABLE: [u64; 64] = [
    0x3ff0000000000000,
    0x3ff02c9a3e778061,
    0x3ff059b0d3158574,
    0x3ff0874518759bc8,
    0x3ff0b5586cf9890f,
    0x3ff0e3ec32d3d1a2,
    0x3ff11301d0125b51,
    0x3ff1429aaea92de0,
    0x3ff172b83c7d517b,
    0x3ff1a35beb6fcb75,
    0x3ff1d4873168b9aa,
    0x3ff2063b88628cd6,
    0x3ff2387a6e756238,
    0x3ff26b4565e27cdd,
    0x3ff29e9df51fdee1,
    0x3ff2d285a6e4030b,
    0x3ff306fe0a31b715,
    0x3ff33c08b26416ff,
    0x3ff371a7373aa9cb,
    0x3ff3a7db34e59ff7,
    0x3ff3dea64c123422,
    0x3ff4160a21f72e2a,
    0x3ff44e086061892d,
    0x3ff486a2b5c13cd0,
    0x3ff4bfdad5362a27,
    0x3ff4f9b2769d2ca7,
    0x3ff5342b569d4f82,
    0x3ff56f4736b527da,
    0x3ff5ab07dd485429,
    0x3ff5e76f15ad2148,
    0x3ff6247eb03a5585,
    0x3ff6623882552225,
    0x3ff6a09e667f3bcd,
    0x3ff6dfb23c651a2f,
    0x3ff71f75e8ec5f74,
    0x3ff75feb564267c9,
    0x3ff7a11473eb0187,
    0x3ff7e2f336cf4e62,
    0x3ff82589994cce13,
    0x3ff868d99b4492ed,
    0x3ff8ace5422aa0db,
    0x3ff8f1ae99157736,
    0x3ff93737b0cdc5e5,
    0x3ff97d829fde4e50,
    0x3ff9c49182a3f090,
    0x3ffa0c667b5de565,
    0x3ffa5503b23e255d,
    0x3ffa9e6b5579fdbf,
    0x3ffae89f995ad3ad,
    0x3ffb33a2b84f15fb,
    0x3ffb7f76f2fb5e47,
    0x3ffbcc1e904bc1d2,
    0x3ffc199bdd85529c,
    0x3ffc67f12e57d14b,
    0x3ffcb720dcef9069,
    0x3ffd072d4a07897c,
    0x3ffd5818dcfba487,
    0x3ffda9e603db3285,
    0x3ffdfc97337b9b5f,
    0x3ffe502ee78b3ff6,
    0x3ffea4afa2a490da,
    0x3ffefa1bee615a27,
    0x3fff50765b6e4540,
    0x3fffa7c1819e90d8,
];

/* ULP 0.508 method
  let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;

  let ui = f32::to_bits(d + redux);
  let mut i0 = ui;
  i0 = i0.wrapping_add(TBLSIZE as u32 / 2);
  let k = i0 / TBLSIZE as u32;
  i0 &= TBLSIZE as u32 - 1;
  let mut uf = f32::from_bits(ui);
  uf -= redux;

  let item = EXP2FT.0[i0 as usize];
  let z0: f32 = f32::from_bits(item.0);
  let z1: f32 = f32::from_bits(item.1);

  let f: f32 = d - uf - z1;

  let mut u = 0.055504108664458832;
  u = f_fmlaf(u, f, 0.24022650695908768);
  u = f_fmlaf(u, f, 0.69314718055994973);
  u *= f;

  let i2 = pow2if(k as i32);
  f_fmlaf(u, z0, z0) * i2
*/

#[inline(always)]
fn exp2f_gen<B: ExpfBackend>(x: f32, backend: B) -> f32 {
    let mut t = x.to_bits();

    if (t & 0xffff) == 0 {
        // x maybe integer
        let k: i32 = (((t >> 23) & 0xff) as i32).wrapping_sub(127); // 2^k <= |x| < 2^(k+1)
        if k >= 0 && k < 9 && (t << (9i32.wrapping_add(k))) == 0 {
            // x integer, with 1 <= |x| < 2^9
            let msk = (t as i32) >> 31;
            let mut m: i32 = (((t & 0x7fffff) | (1 << 23)) >> (23 - k)) as i32;
            m = (m ^ msk).wrapping_sub(msk).wrapping_add(127);
            if m > 0 && m < 255 {
                t = (m as u32).wrapping_shl(23);
                return f32::from_bits(t);
            } else if m <= 0 && m > -23 {
                t = 1i32.wrapping_shl(22i32.wrapping_add(m) as u32) as u32;
                return f32::from_bits(t);
            }
        }
    }
    let ux = t.wrapping_shl(1);

    if ux >= 0x86000000u32 || ux < 0x65000000u32 {
        // |x| >= 128 or x=nan or |x| < 0x1p-26
        if ux < 0x65000000u32 {
            return 1.0 + x;
        } // |x| < 0x1p-26
        // if x < -149 or 128 <= x is special
        if !(t >= 0xc3000000u32 && t < 0xc3150000u32) {
            if ux >= 0xffu32 << 24 {
                // x is inf or nan
                if ux > (0xffu32 << 24) {
                    return x + x;
                } // x = nan
                static IR: [f32; 2] = [f32::INFINITY, 0.];
                return IR[(t >> 31) as usize]; // x = +-inf
            }
            if t >= 0xc3150000u32 {
                // x < -149
                let z = x;
                let mut y = f_fmla(
                    z as f64 + 149.,
                    f64::from_bits(0x3690000000000000),
                    f64::from_bits(0x36a0000000000000),
                );
                y = y.max(f64::from_bits(0x3680000000000000));
                return y as f32;
            }
            // now x >= 128
            let r = black_box(f64::from_bits(0x47e0000000000000))
                * black_box(f64::from_bits(0x47e0000000000000));
            return r as f32;
        }
    }

    if ux <= 0x7a000000u32 {
        // |x| < 1/32

        // Generated by Sollya exp2 on range [-1/32;1/32]:
        // d = [-1/32, 1/32];
        // f_exp2f = (2^y - 1)/y;
        // Q = fpminimax(f_exp2f, 5, [|D...|], d, relative, floating);

        // See ./notes/exp2f_small.sollya
        const C: [u64; 6] = [
            0x3fe62e42fefa39f3,
            0x3fcebfbdff82c57b,
            0x3fac6b08d6f2d7aa,
            0x3f83b2ab6fc92f5d,
            0x3f55d897cfe27125,
            0x3f243090e61e6af1,
        ];
        let xd = x as f64;
        let p = backend.polyeval6(
            xd,
            f64::from_bits(C[0]),
            f64::from_bits(C[1]),
            f64::from_bits(C[2]),
            f64::from_bits(C[3]),
            f64::from_bits(C[4]),
            f64::from_bits(C[5]),
        );
        return backend.fma(p, xd, 1.) as f32;
    }

    let x_d = x as f64;
    let kf = backend.round(x_d * 64.);
    let k = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
    // dx = lo = x - (hi + mid) = x - kf * 2^(-6)
    let dx = backend.fma(f64::from_bits(0xbf90000000000000), kf, x_d);

    const TABLE_BITS: u32 = 6;
    const TABLE_MASK: u64 = (1u64 << TABLE_BITS) - 1;

    // hi = floor(kf * 2^(-5))
    // exp_hi = shift hi to the exponent field of double precision.
    let exp_hi: i64 = ((k >> TABLE_BITS) as i64).wrapping_shl(52);

    // mh = 2^hi * 2^mid
    // mh_bits = bit field of mh
    let mh_bits = (EXP2F_TABLE[((k as u64) & TABLE_MASK) as usize] as i64).wrapping_add(exp_hi);
    let mh = f64::from_bits(mh_bits as u64);

    // Degree-4 polynomial approximating (2^x - 1)/x generated by Sollya with:
    // > P = fpminimax((2^y - 1)/y, 4, [|D...|], [-1/64. 1/64]);
    // see ./notes/exp2f.sollya
    const C: [u64; 5] = [
        0x3fe62e42fefa39ef,
        0x3fcebfbdff8131c4,
        0x3fac6b08d7061695,
        0x3f83b2b1bee74b2a,
        0x3f55d88091198529,
    ];
    let dx_sq = dx * dx;
    let c1 = backend.fma(dx, f64::from_bits(C[0]), 1.0);
    let c2 = backend.fma(dx, f64::from_bits(C[2]), f64::from_bits(C[1]));
    let c3 = backend.fma(dx, f64::from_bits(C[4]), f64::from_bits(C[3]));
    let p = backend.fma(dx_sq, c3, c2);
    // 2^x = 2^(hi + mid + lo)
    //     = 2^(hi + mid) * 2^lo
    //     ~ mh * (1 + lo * P(lo))
    //     = mh + (mh*lo) * P(lo)
    backend.fma(p, dx_sq * mh, c1 * mh) as f32
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn exp2f_fma_impl(x: f32) -> f32 {
    use crate::exponents::expf::FmaBackend;
    exp2f_gen(x, FmaBackend {})
}

/// Computing exp2f
///
/// ULP 0.4999994
#[inline]
pub fn f_exp2f(x: f32) -> f32 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        exp2f_gen(x, GenericExpfBackend {})
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;
        static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
        let q = EXECUTOR.get_or_init(|| {
            if std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma")
            {
                exp2f_fma_impl
            } else {
                fn def_exp2f(x: f32) -> f32 {
                    exp2f_gen(x, GenericExpfBackend {})
                }
                def_exp2f
            }
        });
        unsafe { q(x) }
    }
}

#[inline]
pub(crate) fn dirty_exp2f(d: f32) -> f32 {
    let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;

    let ui = f32::to_bits(d + redux);
    let mut i0 = ui;
    i0 = i0.wrapping_add(TBLSIZE as u32 / 2);
    let k = i0 / TBLSIZE as u32;
    i0 &= TBLSIZE as u32 - 1;
    let mut uf = f32::from_bits(ui);
    uf -= redux;

    let item = EXP2FT.0[i0 as usize];
    let z0: f32 = f32::from_bits(item.0);

    let f: f32 = d - uf;

    let mut u = 0.24022650695908768;
    u = f_fmlaf(u, f, 0.69314718055994973);
    u *= f;

    let i2 = pow2if(k as i32);
    f_fmlaf(u, z0, z0) * i2
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_exp2f() {
        assert!(f_exp2f(f32::from_bits(0x7fc0_0000)).is_nan());
        assert_eq!(f_exp2f(1. / 64.), 1.0108893);
        assert_eq!(f_exp2f(2.0), 4.0);
        assert_eq!(f_exp2f(3.0), 8.0);
        assert_eq!(f_exp2f(4.0), 16.0);
        assert_eq!(f_exp2f(10.0), 1024.0);
        assert_eq!(f_exp2f(-10.0), 0.0009765625);
        assert!(f_exp2f(f32::NAN).is_nan());
        assert_eq!(f_exp2f(-0.35), 0.7845841);
        assert_eq!(f_exp2f(0.35), 1.2745606);
        assert!(f_exp2f(f32::INFINITY).is_infinite());
        assert_eq!(f_exp2f(f32::NEG_INFINITY), 0.0);
    }

    #[test]
    fn test_dirty_exp2f() {
        assert!((dirty_exp2f(0.35f32) - 0.35f32.exp2()).abs() < 1e-5);
        assert!((dirty_exp2f(-0.6f32) - (-0.6f32).exp2()).abs() < 1e-5);
    }
}