// nanobook 0.10.0

//! Statistical functions for quantitative analysis.
//!
//! Provides Spearman rank correlation and quintile spread analysis,
//! replacing direct scipy/numpy calls in qtrade.
//!
//! # References
//!
//! - SciPy `spearmanr`: <https://github.com/scipy/scipy/blob/main/scipy/stats/_correlation.py>
//! - Average-rank tie-breaking follows the standard convention.
//! - Welford 1962, "Note on a method for calculating corrected sums of squares
//!   and products," *Technometrics* 4 (3): 419–420.

// ---------------------------------------------------------------------------
// Numerically-stable moment accumulation (Welford)
// ---------------------------------------------------------------------------

/// Single-pass mean and sum of squared deviations (Welford's recurrence).
///
/// Returns `(mean, m2)` with `m2 = sum_i (x_i - mean)^2`. The caller
/// chooses the variance convention:
/// - population variance (ddof = 0): `m2 / n`
/// - sample variance (ddof = 1):     `m2 / (n - 1)`
///
/// An empty slice yields `(0.0, 0.0)`; a one-element slice yields
/// `(slice[0], 0.0)`.
///
/// The recurrence exists to sidestep the catastrophic cancellation of the
/// textbook `sum_sq - sum^2 / n` formula on high-mean, low-variance data.
/// For a series such as `[1000.0 + 1e-9 * i]` the textbook formula rounds
/// the variance to roughly zero, and a downstream `.max(0.0).sqrt()` then
/// silently reports 0. Updating `(mean, m2)` incrementally resists that
/// loss of precision.
#[inline]
pub(crate) fn welford_mean_m2(slice: &[f64]) -> (f64, f64) {
    // Accumulator layout: (running mean, running m2, samples seen so far).
    let (mean, m2, _seen) = slice.iter().fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(running_mean, sum_sq_dev, seen), &sample| {
            let count = seen + 1.0;
            // Deviation from the mean *before* this sample is folded in.
            let dev_before = sample - running_mean;
            let updated_mean = running_mean + dev_before / count;
            // Deviation from the mean *after* the update; the product of the
            // two deviations is this sample's contribution to m2.
            let dev_after = sample - updated_mean;
            (updated_mean, sum_sq_dev + dev_before * dev_after, count)
        },
    );
    (mean, m2)
}

// ---------------------------------------------------------------------------
// Ranking
// ---------------------------------------------------------------------------

/// Rank the input 1..N, giving tied values the mean of the ranks they span
/// (the "average" method, which is scipy's default).
///
/// The result has the same length and order as `values`: `result[i]` is the
/// rank of `values[i]`. An empty input yields an empty vector.
///
/// # NaN handling
///
/// A single NaN anywhere in `values` makes the whole result NaN (one NaN per
/// input element), mirroring
/// `scipy.stats.rankdata(values, nan_policy='propagate')`. Earlier nanobook
/// versions ranked NaN inputs through the
/// `partial_cmp().unwrap_or(Ordering::Equal)` fallback, which produced
/// incorrect Spearman correlations and quintile spreads downstream. Callers
/// that want NaN treated as an ordinary value must filter or substitute
/// before calling.
fn rankdata(values: &[f64]) -> Vec<f64> {
    if values.is_empty() {
        return Vec::new();
    }
    let count = values.len();

    // Propagate NaN up front: letting it reach the sort would place it at an
    // arbitrary position and quietly corrupt every rank.
    if values.iter().any(|v| v.is_nan()) {
        return vec![f64::NAN; count];
    }

    // Positions of `values` in ascending order of value. NaN has been ruled
    // out, so `partial_cmp` always returns `Some` and the `Equal` fallback
    // is never taken.
    let mut order: Vec<usize> = (0..count).collect();
    order.sort_by(|&lhs, &rhs| {
        values[lhs]
            .partial_cmp(&values[rhs])
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let mut ranks = vec![0.0_f64; count];
    let mut start = 0;
    while start < count {
        // Length of the run of equal values beginning at `start` (>= 1).
        let head = values[order[start]];
        let run_len = order[start..]
            .iter()
            .take_while(|&&idx| values[idx] == head)
            .count();
        let end = start + run_len;

        // The run occupies 1-based ranks start+1 ..= end; their mean is
        // (start + 1 + end) / 2.
        let shared_rank = (start + 1 + end) as f64 / 2.0;
        for &idx in &order[start..end] {
            ranks[idx] = shared_rank;
        }
        start = end;
    }

    ranks
}

/// Pearson correlation coefficient between two equal-length slices.
///
/// # Returns
///
/// The sample correlation `cov(x, y) / (sd(x) * sd(y))`, or NaN when
/// - the slices differ in length,
/// - fewer than two observations are supplied, or
/// - either slice has zero variance (the coefficient is undefined).
///
/// NaN inputs propagate to a NaN result through the arithmetic.
///
/// # Numerical notes
///
/// The denominator is formed as `sqrt(var_x) * sqrt(var_y)` rather than
/// `sqrt(var_x * var_y)`: the product of the two sums of squares can
/// overflow to infinity (or underflow to zero) even when each factor is
/// comfortably representable, which would collapse the coefficient to 0
/// (or blow it up) for perfectly well-behaved data at extreme scales.
fn pearson(x: &[f64], y: &[f64]) -> f64 {
    // A length mismatch used to panic (y shorter) or silently use a wrong
    // mean for y (y longer); treat it as invalid input instead.
    if x.len() != y.len() || x.len() < 2 {
        return f64::NAN;
    }

    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;

    // Accumulate the centred cross-product and both sums of squares in a
    // single pass over the paired observations.
    let (cov, var_x, var_y) = x.iter().zip(y).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(cov, var_x, var_y), (&xi, &yi)| {
            let dx = xi - mean_x;
            let dy = yi - mean_y;
            (cov + dx * dy, var_x + dx * dx, var_y + dy * dy)
        },
    );

    // A constant series has no defined correlation.
    if var_x == 0.0 || var_y == 0.0 {
        return f64::NAN;
    }

    cov / (var_x.sqrt() * var_y.sqrt())
}

// ---------------------------------------------------------------------------
// t-distribution CDF (for p-value computation)
// ---------------------------------------------------------------------------

/// Regularized incomplete beta function `I_x(a, b)`.
///
/// `x <= 0` maps to 0 and `x >= 1` maps to 1. In between, the value is
/// obtained from the continued-fraction evaluation in
/// `regularized_incomplete_beta_cf`. When `x` lies above the switch point
/// `(a + 1) / (a + b + 2)` the symmetry relation
/// `I_x(a, b) = 1 - I_{1-x}(b, a)` is applied first, so the continued
/// fraction is always evaluated on the side where it converges quickly.
fn regularized_incomplete_beta(x: f64, a: f64, b: f64) -> f64 {
    // Closed-form values at and beyond the ends of the unit interval.
    if x <= 0.0 {
        return 0.0;
    }
    if x >= 1.0 {
        return 1.0;
    }

    let switch_point = (a + 1.0) / (a + b + 2.0);
    if x > switch_point {
        // Evaluate the mirrored problem and reflect the result.
        1.0 - regularized_incomplete_beta_cf(1.0 - x, b, a)
    } else {
        regularized_incomplete_beta_cf(x, a, b)
    }
}

/// Direct evaluation of `I_x(a, b)` as `prefactor * betacf(a, b, x)`.
///
/// The prefactor is `x^a * (1 - x)^b / (a * B(a, b))`, assembled in log
/// space (with `ln B(a, b)` built from `ln_gamma`) and exponentiated once.
/// No symmetry switch is applied here; that is the caller's job.
fn regularized_incomplete_beta_cf(x: f64, a: f64, b: f64) -> f64 {
    // ln B(a, b) = ln Γ(a) + ln Γ(b) - ln Γ(a + b)
    let log_beta_ab = ln_gamma(a) + ln_gamma(b) - ln_gamma(a + b);
    let log_prefactor = x.ln() * a + (1.0 - x).ln() * b - log_beta_ab;
    let prefactor = log_prefactor.exp() / a;
    prefactor * betacf(a, b, x)
}

/// Continued-fraction factor of the incomplete beta function, evaluated with
/// the modified Lentz recurrence (the `betacf` routine of Numerical Recipes).
///
/// Each pass of the outer loop applies one even-indexed and one odd-indexed
/// coefficient of the fraction. Iteration stops once the multiplicative
/// update produced by the odd step is within `3e-14` of 1, or after 200
/// passes, in which case the current estimate is returned as-is.
fn betacf(a: f64, b: f64, x: f64) -> f64 {
    const FLOOR: f64 = 1e-30;
    const TOLERANCE: f64 = 3e-14;
    const MAX_PASSES: u32 = 200;

    // Lentz's safeguard: a term that is (nearly) zero is replaced by a tiny
    // positive value so the following division stays finite.
    let away_from_zero = |v: f64| if v.abs() < FLOOR { FLOOR } else { v };

    let a_plus_b = a + b;
    let a_plus_1 = a + 1.0;
    let a_minus_1 = a - 1.0;

    // `numer` and `inv_denom` are the C and D = 1/denominator terms of the
    // recurrence; `fraction` is the running value of the continued fraction.
    let mut numer = 1.0_f64;
    let mut inv_denom = 1.0 / away_from_zero(1.0 - a_plus_b * x / a_plus_1);
    let mut fraction = inv_denom;

    for pass in 1..=MAX_PASSES {
        let m = f64::from(pass);
        let two_m = 2.0 * m;

        let even_coef = m * (b - m) * x / ((a_minus_1 + two_m) * (a + two_m));
        let odd_coef = -(a + m) * (a_plus_b + m) * x / ((a + two_m) * (a_plus_1 + two_m));

        // Apply the even step, then the odd step; after the loop
        // `last_factor` holds the update contributed by the odd step.
        let mut last_factor = 1.0_f64;
        for coef in [even_coef, odd_coef] {
            inv_denom = 1.0 / away_from_zero(1.0 + coef * inv_denom);
            numer = away_from_zero(1.0 + coef / numer);
            last_factor = inv_denom * numer;
            fraction *= last_factor;
        }

        if (last_factor - 1.0).abs() <= TOLERANCE {
            return fraction;
        }
    }

    // Pass budget exhausted: hand back the best estimate so far.
    fraction
}

/// Natural log of the absolute value of the gamma function, `ln |Γ(x)|`.
///
/// For `x >= 0.5` the value comes from the Lanczos approximation
/// (g = 7, nine coefficients). For `x < 0.5` the reflection formula
/// `Γ(x) Γ(1 - x) = π / sin(πx)` reduces the problem to an argument
/// `>= 0.5`.
///
/// The reflection branch takes the absolute value of `sin(πx)` before the
/// logarithm. Γ is negative on alternating intervals of the negative axis,
/// and without the absolute value those inputs (e.g. `-0.5`, `-2.5`)
/// produced NaN instead of `ln |Γ(x)|`.
///
/// At `x = 0` the result is `+inf`. At negative integers the computed
/// `sin(πx)` is only approximately zero, so the result is a large finite
/// number rather than `+inf`.
fn ln_gamma(x: f64) -> f64 {
    // Lanczos coefficients for g = 7. The trailing digits matter for
    // accuracy, hence the lint exemption.
    #[allow(clippy::excessive_precision)]
    const LANCZOS_COEFS: [f64; 9] = [
        0.999_999_999_999_809_93,
        676.520_368_121_885_1,
        -1259.139_216_722_402_8,
        771.323_428_777_653_08,
        -176.615_029_162_140_6,
        12.507_343_278_686_905,
        -0.138_571_095_265_720_12,
        9.984_369_578_019_572e-6,
        1.505_632_735_149_311_6e-7,
    ];

    if x < 0.5 {
        // Reflection: ln|Γ(x)| = ln π - ln|sin(πx)| - ln Γ(1 - x).
        // 1 - x > 0.5 here, so the recursion is exactly one level deep.
        let pi = std::f64::consts::PI;
        return pi.ln() - (pi * x).sin().abs().ln() - ln_gamma(1.0 - x);
    }

    // Lanczos series is written in terms of z = x - 1.
    let x = x - 1.0;
    let mut sum = LANCZOS_COEFS[0];
    for (i, &c) in LANCZOS_COEFS[1..].iter().enumerate() {
        sum += c / (x + i as f64 + 1.0);
    }

    let t = x + 7.5; // g + 0.5
    0.5 * (2.0 * std::f64::consts::PI).ln() + (t.ln() * (x + 0.5)) - t + sum.ln()
}

/// Two-tailed p-value of a Student-t statistic with `df` degrees of freedom.
///
/// Uses the identity that the one-tail probability equals
/// `0.5 * I_x(df / 2, 1 / 2)` with `x = df / (df + t^2)`, then doubles it.
/// Returns NaN when `df <= 0`.
fn t_distribution_two_tailed_p(t_stat: f64, df: f64) -> f64 {
    if df <= 0.0 {
        return f64::NAN;
    }

    let t_squared = t_stat * t_stat;
    let beta_arg = df / (df + t_squared);
    let single_tail = 0.5 * regularized_incomplete_beta(beta_arg, df / 2.0, 0.5);
    2.0 * single_tail
}

// ---------------------------------------------------------------------------
// Public functions
// ---------------------------------------------------------------------------

/// Spearman rank correlation with its two-tailed p-value.
///
/// Both inputs are converted to average-tie ranks and the Pearson
/// correlation of the ranks is returned. The p-value comes from a
/// t-distribution with `n - 2` degrees of freedom using
/// `t = r * sqrt((n - 2) / (1 - r^2))`, which is the approach taken by
/// `scipy.stats.spearmanr`.
///
/// # Arguments
///
/// * `x`, `y` — Equal-length slices of paired observations.
///
/// # Returns
///
/// `(correlation, p_value)`. The pair is `(NaN, NaN)` when
/// - the slices differ in length or hold fewer than 3 observations, or
/// - the rank correlation itself is NaN (for example a NaN input or a
///   constant series).
///
/// A correlation within `1e-15` of `±1` is reported with a p-value of
/// exactly `0.0`.
pub fn spearman(x: &[f64], y: &[f64]) -> (f64, f64) {
    const UNDEFINED: (f64, f64) = (f64::NAN, f64::NAN);

    let n = x.len();
    if n < 3 || y.len() != n {
        return UNDEFINED;
    }

    let rho = pearson(&rankdata(x), &rankdata(y));
    if rho.is_nan() {
        return UNDEFINED;
    }

    // Rounding can push |rho| a hair past 1, which would make 1 - rho^2
    // negative under the square root below.
    let rho = rho.clamp(-1.0, 1.0);
    let is_perfect = (rho.abs() - 1.0).abs() < 1e-15;
    if is_perfect {
        // The t-statistic is infinite here; the p-value is 0 by definition.
        return (rho, 0.0);
    }

    let df = n as f64 - 2.0;
    let t_stat = rho * (df / (1.0 - rho * rho)).sqrt();
    (rho, t_distribution_two_tailed_p(t_stat, df))
}

/// Long-short quantile spread: mean return of the top-scored group minus the
/// mean return of the bottom-scored group.
///
/// Observations are ordered by `scores` (ascending, stable), and the first
/// and last `floor(n / n_quantiles)` of them form the bottom and top groups.
///
/// # Arguments
///
/// * `scores` — Factor scores (higher = better expected return).
/// * `returns` — Realized return for each score, in the same order.
/// * `n_quantiles` — Number of groups (5 for quintiles).
///
/// # Returns
///
/// `top_mean - bottom_mean`. NaN is returned when the slices differ in
/// length, when `n_quantiles` is zero or exceeds the number of
/// observations, or when any input value is NaN.
///
/// # Group sizing when `n` is not divisible by `n_quantiles`
///
/// Both extreme groups hold exactly `floor(n / n_quantiles)` elements;
/// everything in between is **left out of the spread**. With `n = 23` and
/// `n_quantiles = 5` the group size is 4, so the 4 lowest-scored and 4
/// highest-scored observations define the spread and the 15 in the middle
/// are ignored. This follows the factor-research convention in which only
/// the extreme quantiles enter the long-short portfolio.
///
/// # NaN handling
///
/// A NaN anywhere in `scores` or `returns` yields NaN. Sorting NaN scores
/// would drop them into arbitrary groups (`partial_cmp` returns `None` and
/// the `Ordering::Equal` fallback is arbitrary), making the spread
/// meaningless. Callers who need such positions included must substitute a
/// sentinel beforehand.
pub fn quintile_spread(scores: &[f64], returns: &[f64], n_quantiles: usize) -> f64 {
    let n = scores.len();
    let shape_ok = n == returns.len() && n >= n_quantiles && n_quantiles != 0;
    if !shape_ok {
        return f64::NAN;
    }

    // Any NaN, in either input, poisons the result.
    let has_nan = scores.iter().any(|v| v.is_nan()) || returns.iter().any(|v| v.is_nan());
    if has_nan {
        return f64::NAN;
    }

    // Observation positions in ascending score order. NaN is excluded
    // above, so the `Equal` fallback is never taken.
    let mut by_score: Vec<usize> = (0..n).collect();
    by_score.sort_by(|&lhs, &rhs| {
        scores[lhs]
            .partial_cmp(&scores[rhs])
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let group_size = n / n_quantiles;
    if group_size == 0 {
        return f64::NAN;
    }

    // Mean realized return over a group of observation positions.
    let mean_return = |members: &[usize]| -> f64 {
        members.iter().map(|&i| returns[i]).sum::<f64>() / group_size as f64
    };

    let bottom_mean = mean_return(&by_score[..group_size]);
    let top_mean = mean_return(&by_score[n - group_size..]);

    top_mean - bottom_mean
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

/// Unit and property tests for the ranking, correlation, spread and
/// special-function helpers defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Welford accumulation --------------------------------------------

    #[test]
    fn welford_empty_and_single() {
        assert_eq!(welford_mean_m2(&[]), (0.0, 0.0));
        assert_eq!(welford_mean_m2(&[7.5]), (7.5, 0.0));
    }

    #[test]
    fn welford_known_values() {
        // mean = 2.5, squared deviations 2.25 + 0.25 + 0.25 + 2.25 = 5.
        let (mean, m2) = welford_mean_m2(&[1.0, 2.0, 3.0, 4.0]);
        assert!((mean - 2.5).abs() < 1e-12);
        assert!((m2 - 5.0).abs() < 1e-12);
    }

    // --- Ranking ---------------------------------------------------------

    #[test]
    fn rankdata_no_ties() {
        let values = [3.0, 1.0, 2.0];
        let ranks = rankdata(&values);
        assert!((ranks[0] - 3.0).abs() < 1e-10);
        assert!((ranks[1] - 1.0).abs() < 1e-10);
        assert!((ranks[2] - 2.0).abs() < 1e-10);
    }

    #[test]
    fn rankdata_with_ties() {
        let values = [1.0, 2.0, 2.0, 4.0];
        let ranks = rankdata(&values);
        assert!((ranks[0] - 1.0).abs() < 1e-10);
        assert!((ranks[1] - 2.5).abs() < 1e-10); // tied → average
        assert!((ranks[2] - 2.5).abs() < 1e-10);
        assert!((ranks[3] - 4.0).abs() < 1e-10);
    }

    #[test]
    fn rankdata_empty() {
        let ranks = rankdata(&[]);
        assert!(ranks.is_empty());
    }

    // --- Spearman --------------------------------------------------------

    #[test]
    fn spearman_perfect_positive() {
        let x: Vec<f64> = (0..50).map(|i| i as f64).collect();
        let y: Vec<f64> = (0..50).map(|i| i as f64).collect();
        let (r, p) = spearman(&x, &y);
        assert!((r - 1.0).abs() < 1e-10, "expected r=1.0, got {r}");
        assert!(p < 1e-10, "expected p≈0, got {p}");
    }

    #[test]
    fn spearman_perfect_negative() {
        let x: Vec<f64> = (0..50).map(|i| i as f64).collect();
        let y: Vec<f64> = (0..50).rev().map(|i| i as f64).collect();
        let (r, p) = spearman(&x, &y);
        assert!((r - (-1.0)).abs() < 1e-10, "expected r=-1.0, got {r}");
        assert!(p < 1e-10, "expected p≈0, got {p}");
    }

    #[test]
    fn spearman_too_few() {
        let x = [1.0, 2.0];
        let y = [3.0, 4.0];
        let (r, p) = spearman(&x, &y);
        assert!(r.is_nan());
        assert!(p.is_nan());
    }

    #[test]
    fn spearman_unequal_length() {
        let x = [1.0, 2.0, 3.0];
        let y = [1.0, 2.0];
        let (r, _) = spearman(&x, &y);
        assert!(r.is_nan());
    }

    // --- Quantile spread -------------------------------------------------

    #[test]
    fn quintile_spread_basic() {
        // Scores: 1..10, Returns match scores → positive spread
        let scores: Vec<f64> = (1..=10).map(|i| i as f64).collect();
        let returns: Vec<f64> = (1..=10).map(|i| i as f64 * 0.01).collect();
        let spread = quintile_spread(&scores, &returns, 5);
        assert!(spread > 0.0, "expected positive spread, got {spread}");
    }

    #[test]
    fn quintile_spread_negative_for_inverse_relation() {
        // Scores inversely related to returns → negative spread
        let scores: Vec<f64> = (1..=10).map(|i| i as f64).collect();
        let returns: Vec<f64> = (1..=10).rev().map(|i| i as f64 * 0.01).collect();
        let spread = quintile_spread(&scores, &returns, 5);
        assert!(spread < 0.0, "expected negative spread, got {spread}");
    }

    #[test]
    fn quintile_spread_invalid() {
        // Fewer observations than groups.
        let scores = [1.0, 2.0];
        let returns = [0.01, 0.02];
        let spread = quintile_spread(&scores, &returns, 5);
        assert!(spread.is_nan());
    }

    // --- NaN propagation (N2-series numerical fix) -----------------------

    #[test]
    fn rankdata_nan_input_propagates() {
        let ranks = rankdata(&[1.0, f64::NAN, 2.0, 3.0]);
        assert_eq!(ranks.len(), 4);
        for (i, r) in ranks.iter().enumerate() {
            assert!(r.is_nan(), "rankdata[{i}] = {r}, expected NaN");
        }
    }

    #[test]
    fn rankdata_multiple_nan_propagates() {
        let ranks = rankdata(&[f64::NAN, f64::NAN, f64::NAN]);
        assert!(ranks.iter().all(|r| r.is_nan()));
    }

    #[test]
    fn rankdata_positive_infinity_is_ordered_not_nan() {
        // +Inf is not NaN; it should rank at the top, not trigger NaN
        // propagation.
        let ranks = rankdata(&[1.0, 2.0, f64::INFINITY, 3.0]);
        assert!(ranks.iter().all(|r| !r.is_nan()), "infinity ≠ NaN");
        assert!(ranks[2] > ranks[0], "+Inf should rank above finite values");
        assert!(ranks[2] > ranks[3], "+Inf should rank above finite values");
    }

    #[test]
    fn spearman_nan_input_propagates_to_result() {
        let x = [1.0, 2.0, 3.0, f64::NAN, 5.0];
        let y = [1.0, 2.0, 3.0, 4.0, 5.0];
        let (r, p) = spearman(&x, &y);
        assert!(r.is_nan(), "spearman r on NaN input should be NaN, got {r}");
        assert!(p.is_nan(), "spearman p on NaN input should be NaN, got {p}");
    }

    #[test]
    fn spearman_nan_in_either_input_propagates() {
        let finite = [1.0, 2.0, 3.0, 4.0, 5.0];
        let with_nan = [1.0, 2.0, f64::NAN, 4.0, 5.0];
        let (rx, _) = spearman(&with_nan, &finite);
        let (ry, _) = spearman(&finite, &with_nan);
        assert!(rx.is_nan(), "NaN in x must propagate");
        assert!(ry.is_nan(), "NaN in y must propagate");
    }

    #[test]
    fn quintile_spread_nan_score_propagates() {
        let mut scores: Vec<f64> = (1..=10).map(|i| i as f64).collect();
        scores[3] = f64::NAN;
        let returns: Vec<f64> = (1..=10).map(|i| i as f64 * 0.01).collect();
        let spread = quintile_spread(&scores, &returns, 5);
        assert!(spread.is_nan(), "NaN score must propagate, got {spread}");
    }

    #[test]
    fn quintile_spread_nan_return_propagates() {
        let scores: Vec<f64> = (1..=10).map(|i| i as f64).collect();
        let mut returns: Vec<f64> = (1..=10).map(|i| i as f64 * 0.01).collect();
        returns[7] = f64::NAN;
        let spread = quintile_spread(&scores, &returns, 5);
        assert!(spread.is_nan(), "NaN return must propagate, got {spread}");
    }

    // Property tests: randomised coverage of the NaN contract beyond the
    // hand-picked unit tests above.
    proptest::proptest! {
        // A NaN at any position must turn the whole rank vector into NaN.
        #[test]
        fn prop_rankdata_nan_anywhere_produces_all_nan(
            len in 1usize..50,
            nan_idx in 0usize..50,
            values in proptest::collection::vec(-1_000.0f64..1_000.0, 1..50),
        ) {
            let mut xs = values;
            let len = len.min(xs.len());
            xs.truncate(len);
            let nan_idx = nan_idx % xs.len();
            xs[nan_idx] = f64::NAN;

            let ranks = rankdata(&xs);
            proptest::prop_assert_eq!(ranks.len(), xs.len());
            for r in &ranks {
                proptest::prop_assert!(r.is_nan());
            }
        }

        // Finite input must never produce a NaN rank.
        #[test]
        fn prop_rankdata_finite_input_produces_no_nan(
            values in proptest::collection::vec(-1_000.0f64..1_000.0, 2..50),
        ) {
            let ranks = rankdata(&values);
            for (i, r) in ranks.iter().enumerate() {
                proptest::prop_assert!(
                    !r.is_nan(),
                    "ranks[{}] = {} on finite input {:?}",
                    i, r, values
                );
            }
        }
    }

    // --- Special functions -----------------------------------------------

    #[test]
    fn ln_gamma_known_values() {
        // ln(Gamma(1)) = 0
        assert!(ln_gamma(1.0).abs() < 1e-10);
        // ln(Gamma(2)) = 0 (since Gamma(2) = 1! = 1)
        assert!(ln_gamma(2.0).abs() < 1e-10);
        // Gamma(5) = 24, ln(24) ≈ 3.178
        assert!((ln_gamma(5.0) - 24.0_f64.ln()).abs() < 1e-8);
    }

    #[test]
    fn regularized_incomplete_beta_uniform_case() {
        // Boundary values.
        assert_eq!(regularized_incomplete_beta(0.0, 2.0, 3.0), 0.0);
        assert_eq!(regularized_incomplete_beta(1.0, 2.0, 3.0), 1.0);
        // I_x(1, 1) = x; 0.3 takes the direct branch, 0.8 the symmetry branch.
        assert!((regularized_incomplete_beta(0.3, 1.0, 1.0) - 0.3).abs() < 1e-10);
        assert!((regularized_incomplete_beta(0.8, 1.0, 1.0) - 0.8).abs() < 1e-10);
    }

    #[test]
    fn t_distribution_two_tailed_known_values() {
        // df = 1 is the Cauchy distribution: P(|T| > 1) = 1/2.
        let p = t_distribution_two_tailed_p(1.0, 1.0);
        assert!((p - 0.5).abs() < 1e-8, "expected p=0.5, got {p}");
        // t = 0 puts all the mass in the tails.
        let p0 = t_distribution_two_tailed_p(0.0, 10.0);
        assert!((p0 - 1.0).abs() < 1e-12, "expected p=1, got {p0}");
        // Non-positive degrees of freedom are invalid.
        assert!(t_distribution_two_tailed_p(1.0, 0.0).is_nan());
    }
}