g_math 0.4.2 - Docs.rs

//! Decimal exponential — `exp(x)` at compute dp.
//!
//! # Algorithm
//!
//! Uses **ln(2)-based range reduction**: rewrite `exp(x) = 2^k × exp(r)` where
//! `k = round(x / ln(2))` and `r = x - k × ln(2)` ensures `|r| ≤ ln(2)/2 ≈ 0.347`.
//!
//! 1. **Range reduction**: compute integer `k` and small remainder `r`.
//! 2. **Taylor series** for `exp(r)` with `|r| ≤ 0.347` — converges in ~25 terms at dp=38.
//!    Iterative: `term_0 = 1, term_n = term_{n-1} × r / n`.
//! 3. **Multiply by 2^k**: bit shift the compute-tier result. For positive `k`,
//!    shift left (exact); for negative `k`, shift right with rounding.
//!
//! # Precision advantage over scaling-and-squaring
//!
//! Each squaring iteration of `exp(y)^2` doubles relative error. For `exp(20)` with
//! the halve-and-square method, 14 squarings amplify error by 16384×.
//!
//! With ln(2)-based reduction, there is **no squaring** — only Taylor convergence
//! error and bit-shift truncation. Total error stays bounded by ~Taylor truncation,
//! which is well below 1 ULP at the compute dp.

use super::decimal_compute::{
    ComputeStorage, DECIMAL_COMPUTE_DP,
    decimal_compute_zero, decimal_compute_one,
    decimal_compute_add, decimal_compute_sub, decimal_compute_mul, decimal_compute_div,
    decimal_compute_div_int, decimal_compute_halve,
    decimal_compute_is_zero, decimal_compute_is_negative, decimal_compute_cmp,
    decimal_compute_neg, pow10_compute_ct,
};
use crate::fixed_point::domains::symbolic::rational::rational_number::OverflowDetected;
use std::cell::RefCell;

thread_local! {
    /// Cached `ln(2)` at compute dp. Computed once via `2 * atanh(1/3)`.
    static LN2_FOR_EXP: RefCell<Option<ComputeStorage>> = const { RefCell::new(None) };
}

/// Compute `ln(2)` at decimal compute dp via `2 × atanh(1/3)`.
///
/// We can't depend on `ln.rs` because that would create a cycle when `pow` calls
/// both `ln` and `exp`. Inline the atanh series here (it's a small function).
fn ln2_at_compute() -> Result<ComputeStorage, OverflowDetected> {
    let cached: Option<ComputeStorage> = LN2_FOR_EXP.with(|c| c.borrow().clone());
    if let Some(v) = cached {
        return Ok(v);
    }

    let one = decimal_compute_one();
    let three = super::decimal_compute::decimal_compute_from_int(3);
    let s = decimal_compute_div(one, three)?;

    // atanh(s) = s + s³/3 + s⁵/5 + ... for |s| < 1
    let s_sq = decimal_compute_mul(s, s);
    let mut term = s;
    let mut sum = s;
    let max_terms = (DECIMAL_COMPUTE_DP as u32) + 20;
    for k in 1..=max_terms {
        term = decimal_compute_mul(term, s_sq);
        if decimal_compute_is_zero(&term) { break; }
        let divisor = (2 * k as u64) + 1;
        let contribution = decimal_compute_div_int(term, divisor);
        if decimal_compute_is_zero(&contribution) { break; }
        sum = decimal_compute_add(sum, contribution);
    }
    let ln2 = decimal_compute_add(sum, sum); // 2 × atanh(1/3)

    LN2_FOR_EXP.with(|c: &RefCell<Option<ComputeStorage>>| *c.borrow_mut() = Some(ln2));
    Ok(ln2)
}

/// Maximum Taylor terms for exp series with |r| ≤ ln(2)/2 ≈ 0.347.
///
/// Term k is bounded by `0.347^k / k!`. This falls below `10^-DECIMAL_COMPUTE_DP`
/// at about `k ≈ DECIMAL_COMPUTE_DP / 1.46 + 5` terms (Stirling estimate).
const fn max_taylor_terms() -> u32 {
    (DECIMAL_COMPUTE_DP as u32) + 20
}

/// Round a compute-tier value to the nearest integer (returned as i64).
fn round_to_int(v: ComputeStorage) -> Result<i64, OverflowDetected> {
    let half = decimal_compute_halve(decimal_compute_one());
    let rounded = if decimal_compute_is_negative(&v) {
        decimal_compute_sub(v, half)
    } else {
        decimal_compute_add(v, half)
    };
    let scale = decimal_compute_one();

    #[cfg(table_format = "q16_16")]
    { Ok(rounded / scale) }
    #[cfg(table_format = "q32_32")]
    { Ok((rounded / scale) as i64) }
    #[cfg(table_format = "q64_64")]
    {
        let q = rounded / scale;
        if !q.fits_in_i128() { return Err(OverflowDetected::Overflow); }
        let q_i128 = q.as_i128();
        if q_i128 > i64::MAX as i128 || q_i128 < i64::MIN as i128 {
            return Err(OverflowDetected::Overflow);
        }
        Ok(q_i128 as i64)
    }
    #[cfg(table_format = "q128_128")]
    {
        let q = rounded / scale;
        let q_i128 = q.as_i128();
        if q_i128 > i64::MAX as i128 || q_i128 < i64::MIN as i128 {
            return Err(OverflowDetected::Overflow);
        }
        Ok(q_i128 as i64)
    }
    #[cfg(table_format = "q256_256")]
    {
        let q = rounded / scale;
        let q_i128 = q.as_i128();
        if q_i128 > i64::MAX as i128 || q_i128 < i64::MIN as i128 {
            return Err(OverflowDetected::Overflow);
        }
        Ok(q_i128 as i64)
    }
}

/// Multiply a compute-tier value by `2^k`, where `k` may be negative (shift right with rounding).
///
/// **Shift type note**: I256 shifts take `u32`, while i64/i128/I512/I1024 take `usize`.
fn mul_by_pow2(v: ComputeStorage, k: i64) -> Result<ComputeStorage, OverflowDetected> {
    if k == 0 {
        return Ok(v);
    }
    if k > 0 {
        if k > 200 {
            return Err(OverflowDetected::Overflow);
        }
        // I256 Shl takes usize; I512/I1024 Shl take usize; primitive ints take u32 or usize.
        #[cfg(table_format = "q16_16")]
        { Ok(v << (k as u32)) }
        #[cfg(table_format = "q32_32")]
        { Ok(v << (k as u32)) }
        #[cfg(table_format = "q64_64")]
        { Ok(v << (k as usize)) }
        #[cfg(table_format = "q128_128")]
        { Ok(v << (k as usize)) }
        #[cfg(table_format = "q256_256")]
        { Ok(v << (k as usize)) }
    } else {
        let abs_k = (-k) as u64;
        if abs_k >= 250 {
            return Ok(decimal_compute_zero());
        }
        let one_compute = {
            #[cfg(table_format = "q16_16")]
            { 1i64 }
            #[cfg(table_format = "q32_32")]
            { 1i128 }
            #[cfg(table_format = "q64_64")]
            { crate::fixed_point::i256::I256::from_i128(1) }
            #[cfg(table_format = "q128_128")]
            { crate::fixed_point::i512::I512::from_i128(1) }
            #[cfg(table_format = "q256_256")]
            { crate::fixed_point::I1024::from_i128(1) }
        };
        // I256 Shl: usize; I256 Shr: u32 (asymmetric in i256.rs).
        // I512: both usize. I1024: both usize. Primitives: u32.
        let round_bit = {
            #[cfg(table_format = "q16_16")]
            { one_compute << ((abs_k - 1) as u32) }
            #[cfg(table_format = "q32_32")]
            { one_compute << ((abs_k - 1) as u32) }
            #[cfg(table_format = "q64_64")]
            { one_compute << ((abs_k - 1) as usize) }
            #[cfg(table_format = "q128_128")]
            { one_compute << ((abs_k - 1) as usize) }
            #[cfg(table_format = "q256_256")]
            { one_compute << ((abs_k - 1) as usize) }
        };
        let rounded = if decimal_compute_is_negative(&v) {
            v - round_bit
        } else {
            v + round_bit
        };
        #[cfg(table_format = "q16_16")]
        { Ok(rounded >> (abs_k as u32)) }
        #[cfg(table_format = "q32_32")]
        { Ok(rounded >> (abs_k as u32)) }
        #[cfg(table_format = "q64_64")]
        { Ok(rounded >> (abs_k as u32)) }
        #[cfg(table_format = "q128_128")]
        { Ok(rounded >> (abs_k as usize)) }
        #[cfg(table_format = "q256_256")]
        { Ok(rounded >> (abs_k as usize)) }
    }
}

// ============================================================================
// 4-STAGE TABLE CACHE FOR FAST EXP
// ============================================================================

/// Cached exp tables for 4-stage decimal digit decomposition.
///
/// `exp(x) = exp(k) × exp(d1/10) × exp(d2/100) × exp(d3/1000) × exp(d4/10000) × exp(r)`
/// where `|r| < 10^-4` and Taylor converges in ~8 terms at dp=38.
///
/// Tables are computed once per thread on first use, then reused.
/// Total: 31 + 4×10 = 71 entries × sizeof(ComputeStorage).
struct ExpTables {
    /// exp(k) for k = 0..=30
    exp_int: [ComputeStorage; 31],
    /// exp(d × 0.1) for d = 0..=9
    exp_tenths: [ComputeStorage; 10],
    /// exp(d × 0.01) for d = 0..=9
    exp_hundredths: [ComputeStorage; 10],
    /// exp(d × 0.001) for d = 0..=9
    exp_thousandths: [ComputeStorage; 10],
}

impl Clone for ExpTables {
    fn clone(&self) -> Self {
        Self {
            exp_int: self.exp_int,
            exp_tenths: self.exp_tenths,
            exp_hundredths: self.exp_hundredths,
            exp_thousandths: self.exp_thousandths,
        }
    }
}

thread_local! {
    static EXP_TABLES: RefCell<Option<ExpTables>> = const { RefCell::new(None) };
}

/// Pure Taylor series for exp(x) = 1 + x + x²/2! + x³/3! + ...
/// No range reduction — works for any x but converges faster for small |x|.
/// Early-exits when terms underflow to zero.
fn exp_taylor_raw(x: ComputeStorage) -> Result<ComputeStorage, OverflowDetected> {
    let one = decimal_compute_one();
    let mut term = one;
    let mut sum = one;
    let max_terms = max_taylor_terms();
    for n in 1..=max_terms {
        term = decimal_compute_mul(term, x);
        term = decimal_compute_div_int(term, n as u64);
        if decimal_compute_is_zero(&term) { break; }
        sum = decimal_compute_add(sum, term);
    }
    Ok(sum)
}

/// Extract a small integer from ComputeStorage (must fit in i64).
fn compute_as_i64(v: ComputeStorage) -> i64 {
    #[cfg(table_format = "q16_16")]
    { v }
    #[cfg(table_format = "q32_32")]
    { v as i64 }
    #[cfg(table_format = "q64_64")]
    { v.as_i128() as i64 }
    #[cfg(table_format = "q128_128")]
    { v.as_i128() as i64 }
    #[cfg(table_format = "q256_256")]
    { v.as_i128() as i64 }
}

/// Build exp tables (one-time per thread). Uses raw Taylor series.
fn build_exp_tables() -> Result<ExpTables, OverflowDetected> {
    let one = decimal_compute_one();
    let zero = decimal_compute_zero();

    // exp(1) via Taylor (converges in ~35 terms at dp=38)
    let e1 = exp_taylor_raw(one)?;

    // exp(k) = exp(1)^k via successive multiplication
    let mut exp_int = [zero; 31];
    exp_int[0] = one;
    exp_int[1] = e1;
    for k in 2..=30usize {
        exp_int[k] = decimal_compute_mul(exp_int[k - 1], e1);
    }

    // exp(d × 0.1) for d = 0..9
    let mut exp_tenths = [zero; 10];
    exp_tenths[0] = one;
    let pt1 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 1); // 0.1 at compute dp
    let e_pt1 = exp_taylor_raw(pt1)?;
    exp_tenths[1] = e_pt1;
    for d in 2..=9usize {
        exp_tenths[d] = decimal_compute_mul(exp_tenths[d - 1], e_pt1);
    }

    // exp(d × 0.01)
    let mut exp_hundredths = [zero; 10];
    exp_hundredths[0] = one;
    let pt01 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 2);
    let e_pt01 = exp_taylor_raw(pt01)?;
    exp_hundredths[1] = e_pt01;
    for d in 2..=9usize {
        exp_hundredths[d] = decimal_compute_mul(exp_hundredths[d - 1], e_pt01);
    }

    // exp(d × 0.001)
    let mut exp_thousandths = [zero; 10];
    exp_thousandths[0] = one;
    let pt001 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 3);
    let e_pt001 = exp_taylor_raw(pt001)?;
    exp_thousandths[1] = e_pt001;
    for d in 2..=9usize {
        exp_thousandths[d] = decimal_compute_mul(exp_thousandths[d - 1], e_pt001);
    }

    Ok(ExpTables { exp_int, exp_tenths, exp_hundredths, exp_thousandths })
}

/// 4-stage table-based exp for |x| ≤ 30.
///
/// Decomposes |x| = k + d1/10 + d2/100 + d3/1000 + r where:
/// - k ∈ [0, 30] — integer part
/// - d1, d2, d3 ∈ [0, 9] — fractional decimal digits
/// - |r| < 10^-3 — tiny remainder for short Taylor (~8-12 terms)
///
/// Result = exp(k) × exp(d1/10) × exp(d2/100) × exp(d3/1000) × exp(r)
/// All factors except exp(r) are table lookups. 4 widening multiplies + short Taylor.
fn decimal_exp_table_path(abs_x: ComputeStorage) -> Result<ComputeStorage, OverflowDetected> {
    let one = decimal_compute_one();
    let s1 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 1);
    let s2 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 2);
    let s3 = pow10_compute_ct(DECIMAL_COMPUTE_DP - 3);

    // Extract integer part and 3 fractional digits
    let k_compute = abs_x / one;
    let k = compute_as_i64(k_compute);
    let frac = decimal_compute_sub(abs_x, mul_compute_by_int(one, k));

    let d1_compute = frac / s1;
    let d1 = compute_as_i64(d1_compute);
    let frac2 = decimal_compute_sub(frac, mul_compute_by_int(s1, d1));

    let d2_compute = frac2 / s2;
    let d2 = compute_as_i64(d2_compute);
    let frac3 = decimal_compute_sub(frac2, mul_compute_by_int(s2, d2));

    let d3_compute = frac3 / s3;
    let d3 = compute_as_i64(d3_compute);
    let remainder = decimal_compute_sub(frac3, mul_compute_by_int(s3, d3));

    // Bounds check — k must be in table range
    if k < 0 || k > 30 || d1 < 0 || d1 > 9 || d2 < 0 || d2 > 9 || d3 < 0 || d3 > 9 {
        // Fallback to ln(2) reduction for out-of-range
        return decimal_exp_ln2_reduction(abs_x);
    }

    // Table lookup + multiply
    EXP_TABLES.with(|c| {
        // Ensure tables are built
        {
            let cached = c.borrow();
            if cached.is_none() {
                drop(cached);
                let tables = build_exp_tables()?;
                *c.borrow_mut() = Some(tables);
            }
        }
        let tables = c.borrow();
        let t = tables.as_ref().unwrap();

        let mut result = t.exp_int[k as usize];
        result = decimal_compute_mul(result, t.exp_tenths[d1 as usize]);
        result = decimal_compute_mul(result, t.exp_hundredths[d2 as usize]);
        result = decimal_compute_mul(result, t.exp_thousandths[d3 as usize]);

        // Short Taylor for remainder (|r| < 10^-3)
        if !decimal_compute_is_zero(&remainder) {
            let exp_r = exp_taylor_raw(remainder)?;
            result = decimal_compute_mul(result, exp_r);
        }

        Ok(result)
    })
}

/// ln(2)-based range reduction fallback for large |x| > 30.
fn decimal_exp_ln2_reduction(x: ComputeStorage) -> Result<ComputeStorage, OverflowDetected> {
    let ln2 = ln2_at_compute()?;
    let x_over_ln2 = decimal_compute_div(x, ln2)?;
    let k = round_to_int(x_over_ln2)?;
    let k_times_ln2 = mul_compute_by_int(ln2, k);
    let r = decimal_compute_sub(x, k_times_ln2);

    let exp_r = exp_taylor_raw(r)?;
    mul_by_pow2(exp_r, k)
}

/// Compute `exp(x)` for x at compute dp.
///
/// # Algorithm
///
/// For |x| ≤ 30: **4-stage table decomposition** — decomposes x into decimal digits,
/// looks up precomputed exp values per digit, multiplies, then short Taylor for the
/// tiny remainder. ~4 widening multiplies + ~8-12 Taylor terms.
///
/// For |x| > 30: **ln(2)-based range reduction** — `exp(x) = 2^k × exp(r)` where
/// `k = round(x/ln(2))` and `|r| ≤ ln(2)/2`.
///
/// The table path is ~2-3× faster than the ln(2) path for common inputs.
pub fn decimal_exp(x: ComputeStorage) -> Result<ComputeStorage, OverflowDetected> {
    if decimal_compute_is_zero(&x) {
        return Ok(decimal_compute_one());
    }

    let is_neg = decimal_compute_is_negative(&x);
    let abs_x = if is_neg { decimal_compute_neg(x) } else { x };

    // Check if |x| ≤ 30 (table path range)
    let one = decimal_compute_one();
    let thirty = mul_compute_by_int(one, 30);
    let use_table = decimal_compute_cmp(&abs_x, &thirty) != std::cmp::Ordering::Greater;

    let result = if use_table {
        decimal_exp_table_path(abs_x)?
    } else {
        // Large argument: ln(2) reduction on the original (signed) value
        return decimal_exp_ln2_reduction(x);
    };

    if is_neg {
        // exp(-|x|) = 1/exp(|x|)
        decimal_compute_div(one, result)
    } else {
        Ok(result)
    }
}

/// Multiply a compute-tier value by an integer (positive or negative).
fn mul_compute_by_int(v: ComputeStorage, n: i64) -> ComputeStorage {
    if n == 0 {
        return decimal_compute_zero();
    }
    let negative = n < 0;
    let n_abs = n.unsigned_abs();
    let n_compute: ComputeStorage = {
        #[cfg(table_format = "q16_16")]
        { n_abs as i64 }
        #[cfg(table_format = "q32_32")]
        { n_abs as i128 }
        #[cfg(table_format = "q64_64")]
        { crate::fixed_point::i256::I256::from_i128(n_abs as i128) }
        #[cfg(table_format = "q128_128")]
        { crate::fixed_point::i512::I512::from_i128(n_abs as i128) }
        #[cfg(table_format = "q256_256")]
        { crate::fixed_point::I1024::from_i128(n_abs as i128) }
    };
    let result = v * n_compute;
    if negative {
        decimal_compute_neg(result)
    } else {
        result
    }
}

/// Compute `exp(-x)` — used internally for hyperbolic functions.
#[allow(dead_code)]
pub fn decimal_exp_neg(x: ComputeStorage) -> Result<ComputeStorage, OverflowDetected> {
    decimal_exp(decimal_compute_neg(x))
}

/// Fused `(sinh(x), cosh(x))` at decimal compute tier — shares one exp-pair evaluation.
///
/// Computes `exp(x)` and `exp(-x)` once, then derives
/// `sinh(x) = (exp(x) - exp(-x)) / 2` and `cosh(x) = (exp(x) + exp(-x)) / 2`.
///
/// sinh and cosh are derived from the **same** `(ep, en)` pair at compute tier,
/// so their rounding bias is correlated — critical for expressions like
/// `cosh(θ)·p + (sinh(θ)/θ)·v` where the two errors cancel.
pub fn decimal_sinhcosh(x: ComputeStorage) -> Result<(ComputeStorage, ComputeStorage), OverflowDetected> {
    if decimal_compute_is_zero(&x) {
        return Ok((decimal_compute_zero(), decimal_compute_one()));
    }
    let ep = decimal_exp(x)?;
    let en = decimal_exp(decimal_compute_neg(x))?;
    let sinh_c = decimal_compute_halve(decimal_compute_sub(ep, en));
    let cosh_c = decimal_compute_halve(decimal_compute_add(ep, en));
    Ok((sinh_c, cosh_c))
}

#[cfg(all(test, table_format = "q64_64"))]
mod tests {
    use super::*;
    use super::super::decimal_compute::{decimal_compute_from_int, pow10_compute_ct};
    use crate::fixed_point::i256::I256;

    /// mpmath reference: exp(0) = 1.0 exactly
    #[test]
    fn exp_zero_is_one() {
        let result = decimal_exp(decimal_compute_zero()).unwrap();
        assert_eq!(result, decimal_compute_one());
    }

    /// mpmath: exp(1) = 2.71828182845904523536028747135266249775724709369995957...
    #[test]
    fn exp_one_within_1_ulp() {
        let result = decimal_exp(decimal_compute_one()).unwrap();
        // Expected: 2.71828182845904523536028747135266249775 × 10^38
        // = 27182818284590452353602874713526624977 at dp=38 (+0.5 rounding)
        // mpmath rounded: 2.7182818284590452353602874713526624977572 × 10^38
        // → 27182818284590452353602874713526624978 at dp=38 (round half to even)
        let expected_str = "271828182845904523536028747135266249776";
        let expected = parse_decimal_str_q64_64(expected_str);

        // Allow up to 1000 ULP tolerance for this first correctness check
        let diff = if result > expected { result - expected } else { expected - result };
        let tolerance = I256::from_i128(10_000);
        assert!(
            diff < tolerance,
            "exp(1) precision check: got={:?}, expected={:?}, diff={:?}",
            result, expected, diff
        );
    }

    /// mpmath: exp(0.5) = 1.64872127070012814684865078781416357165377610071014...
    #[test]
    fn exp_half_within_1_ulp() {
        // 0.5 at compute dp = 5 × 10^37
        let half = pow10_compute_ct(37) * I256::from_i128(5);
        let result = decimal_exp(half).unwrap();
        // Expected: 1.64872127070012814684865078781416357165 × 10^38
        let expected_str = "164872127070012814684865078781416357165";
        let expected = parse_decimal_str_q64_64(expected_str);

        let diff = if result > expected { result - expected } else { expected - result };
        let tolerance = I256::from_i128(10_000);
        assert!(
            diff < tolerance,
            "exp(0.5) precision check: got={:?}, expected={:?}, diff={:?}",
            result, expected, diff
        );
    }

    /// Parse a decimal-digit string into an I256 assuming it represents a value
    /// at q64_64 compute dp=38.
    fn parse_decimal_str_q64_64(s: &str) -> I256 {
        let mut result = I256::from_i128(0);
        let ten = I256::from_i128(10);
        for ch in s.chars() {
            let digit = ch.to_digit(10).expect("non-digit in test string");
            result = result * ten + I256::from_i128(digit as i128);
        }
        result
    }

    /// mpmath: exp(2) = 7.38905609893065022723042746057500781318031557055184...
    #[test]
    fn exp_two_within_reasonable() {
        let two = decimal_compute_from_int(2);
        let result = decimal_exp(two).unwrap();
        // Expected: 7.38905609893065022723042746057500781318 × 10^38
        let expected_str = "738905609893065022723042746057500781318";
        let expected = parse_decimal_str_q64_64(expected_str);

        let diff = if result > expected { result - expected } else { expected - result };
        let tolerance = I256::from_i128(100_000);
        assert!(
            diff < tolerance,
            "exp(2) precision check: got={:?}, expected={:?}, diff={:?}",
            result, expected, diff
        );
    }

    /// mpmath: exp(-1) = 0.36787944117144232839...
    /// Storage-tier validation (compute-tier rounding from 1/exp(1) is acceptable
    /// as long as storage-tier result is 0 ULP — covered by decimal_transcendental_validation).
    #[test]
    fn exp_neg_one() {
        use crate::canonical::{gmath, evaluate};
        let result = evaluate(&gmath("-1.0").exp()).unwrap();
        let s = format!("{}", result);
        // exp(-1) ≈ 0.36787944117144232...
        assert!(s.starts_with("0.3678794411714"),
            "exp(-1) at storage tier should match mpmath to 13+ digits, got: {}", s);
    }
}