quant-metrics 0.7.0

//! Engle-Granger cointegration test and pair-trading statistics.
//!
//! Pure math — no I/O, no async, no external dependencies beyond std + rust_decimal.
//!
//! ## Algorithm
//!
//! 1. OLS regression: `y = alpha + beta * x + residuals`
//! 2. ADF test on residuals: tests if the spread is mean-reverting
//! 3. Half-life via Ornstein-Uhlenbeck: `hl = -ln(2) / ln(1 + theta)`
//! 4. Pearson correlation between the two price series

use rust_decimal::Decimal;
use thiserror::Error;

/// Minimum number of observations for a meaningful cointegration test.
pub const MIN_OBSERVATIONS: usize = 30;

/// Errors from cointegration computation.
#[derive(Debug, Error)]
pub enum CointegrationMathError {
    #[error("insufficient data: need at least {minimum} observations, got {actual}")]
    InsufficientData { actual: usize, minimum: usize },

    #[error("series length mismatch: {len_a} vs {len_b}")]
    LengthMismatch { len_a: usize, len_b: usize },

    #[error("degenerate data: {0}")]
    DegenerateData(String),
}

/// Result of an Engle-Granger cointegration test.
#[derive(Debug, Clone)]
pub struct EngleGrangerResult {
    /// ADF test statistic on the residuals.
    pub adf_statistic: f64,

    /// Approximate p-value for the ADF test.
    /// Uses MacKinnon critical values for two-variable cointegration.
    pub p_value: f64,

    /// OLS regression coefficient (hedge ratio): y = alpha + beta * x.
    pub beta: f64,

    /// OLS intercept.
    pub alpha: f64,

    /// Half-life of mean reversion in periods (e.g., days).
    /// Derived from Ornstein-Uhlenbeck process fit on the spread.
    /// `None` if the spread is not mean-reverting (theta >= 0).
    pub half_life: Option<f64>,

    /// Pearson correlation between the two series.
    pub correlation: f64,

    /// Mean of the spread (residuals).
    pub spread_mean: f64,

    /// Standard deviation of the spread.
    pub spread_std: f64,
}

/// Run an Engle-Granger cointegration test on two price series.
///
/// `prices_a` and `prices_b` are close prices in chronological order.
/// Returns the ADF statistic, p-value, hedge ratio, half-life, and correlation.
pub fn engle_granger(
    prices_a: &[f64],
    prices_b: &[f64],
) -> Result<EngleGrangerResult, CointegrationMathError> {
    let n = prices_a.len();

    if n != prices_b.len() {
        return Err(CointegrationMathError::LengthMismatch {
            len_a: n,
            len_b: prices_b.len(),
        });
    }

    if n < MIN_OBSERVATIONS {
        return Err(CointegrationMathError::InsufficientData {
            actual: n,
            minimum: MIN_OBSERVATIONS,
        });
    }

    // Step 1: OLS regression y = alpha + beta * x
    let (alpha, beta) = ols_regression(prices_a, prices_b)?;

    // Step 2: Compute residuals (spread)
    let residuals: Vec<f64> = prices_b
        .iter()
        .zip(prices_a.iter())
        .map(|(y, x)| y - alpha - beta * x)
        .collect();

    let spread_mean = mean(&residuals);
    let spread_std = std_dev(&residuals, spread_mean);

    if spread_std < 1e-15 {
        return Err(CointegrationMathError::DegenerateData(
            "spread has zero variance".to_string(),
        ));
    }

    // Step 3: ADF test on residuals
    let adf_statistic = adf_test_statistic(&residuals)?;

    // Step 4: Approximate p-value using MacKinnon critical values
    let p_value = adf_p_value(adf_statistic, n);

    // Step 5: Half-life via Ornstein-Uhlenbeck
    let half_life = ornstein_uhlenbeck_half_life(&residuals);

    // Step 6: Pearson correlation
    let correlation = pearson_correlation(prices_a, prices_b)?;

    Ok(EngleGrangerResult {
        adf_statistic,
        p_value,
        beta,
        alpha,
        half_life,
        correlation,
        spread_mean,
        spread_std,
    })
}

/// Convert spread mean and std from f64 to Decimal for the port layer.
pub fn spread_stats_to_decimal(mean: f64, std: f64) -> (Decimal, Decimal) {
    let ratio_mean = Decimal::from_f64_retain(mean).unwrap_or(Decimal::ZERO);
    let ratio_std = Decimal::from_f64_retain(std).unwrap_or(Decimal::ZERO);
    (ratio_mean, ratio_std)
}

// ── OLS regression ──────────────────────────────────────────────────────────

/// Simple OLS: y = alpha + beta * x.  Returns (alpha, beta).
///
/// Uses mean-centered formula to avoid catastrophic cancellation with
/// large values (e.g., BTC prices at 50,000+).
fn ols_regression(x: &[f64], y: &[f64]) -> Result<(f64, f64), CointegrationMathError> {
    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;

    let mut sum_dx_dy: f64 = 0.0;
    let mut sum_dx2: f64 = 0.0;

    for (xi, yi) in x.iter().zip(y.iter()) {
        let dx = xi - mean_x;
        let dy = yi - mean_y;
        sum_dx_dy += dx * dy;
        sum_dx2 += dx * dx;
    }

    if sum_dx2 < 1e-15 {
        return Err(CointegrationMathError::DegenerateData(
            "x series has zero variance".to_string(),
        ));
    }

    let beta = sum_dx_dy / sum_dx2;
    let alpha = mean_y - beta * mean_x;

    Ok((alpha, beta))
}

// ── ADF test ────────────────────────────────────────────────────────────────

/// Augmented Dickey-Fuller test statistic on a residual series.
///
/// Tests: delta_r_t = theta * r_{t-1} + epsilon_t
/// Under H0 (unit root / no cointegration): theta = 0
/// Under H1 (stationarity / cointegration): theta < 0
///
/// Returns the t-statistic for theta.
fn adf_test_statistic(residuals: &[f64]) -> Result<f64, CointegrationMathError> {
    let n = residuals.len();
    if n < 3 {
        return Err(CointegrationMathError::InsufficientData {
            actual: n,
            minimum: 3,
        });
    }

    // delta_r = r_t - r_{t-1}
    // Regress delta_r on r_{t-1} (no constant, as the spread already has mean removed)
    let mut sum_xy: f64 = 0.0;
    let mut sum_x2: f64 = 0.0;
    let mut sum_e2: f64 = 0.0;

    for t in 1..n {
        let x = residuals[t - 1];
        let y = residuals[t] - residuals[t - 1];
        sum_xy += x * y;
        sum_x2 += x * x;
    }

    if sum_x2 < 1e-15 {
        return Err(CointegrationMathError::DegenerateData(
            "lagged residuals have zero variance".to_string(),
        ));
    }

    let theta = sum_xy / sum_x2;

    // Compute residuals of the ADF regression to get SE(theta)
    for t in 1..n {
        let x = residuals[t - 1];
        let y = residuals[t] - residuals[t - 1];
        let e = y - theta * x;
        sum_e2 += e * e;
    }

    let m = (n - 1) as f64; // number of observations in the ADF regression
    let sigma2 = sum_e2 / (m - 1.0); // variance of ADF residuals
    let se_theta = (sigma2 / sum_x2).sqrt();

    if se_theta < 1e-15 {
        return Err(CointegrationMathError::DegenerateData(
            "standard error of theta is zero".to_string(),
        ));
    }

    Ok(theta / se_theta)
}

/// Approximate p-value for ADF test using MacKinnon critical values.
///
/// For Engle-Granger two-variable cointegration test (asymptotic):
///   1% critical value ≈ -3.90
///   5% critical value ≈ -3.34
///   10% critical value ≈ -3.04
///
/// This is a rough interpolation suitable for decision-making.
fn adf_p_value(t_stat: f64, _n: usize) -> f64 {
    // MacKinnon (1994) critical values for two-variable case
    if t_stat <= -3.90 {
        0.01
    } else if t_stat <= -3.34 {
        // Linear interpolation between 1% and 5%
        0.01 + (0.04) * (t_stat - (-3.90)) / ((-3.34) - (-3.90))
    } else if t_stat <= -3.04 {
        // Linear interpolation between 5% and 10%
        0.05 + (0.05) * (t_stat - (-3.34)) / ((-3.04) - (-3.34))
    } else if t_stat <= -2.50 {
        // Linear interpolation between 10% and ~25%
        0.10 + (0.15) * (t_stat - (-3.04)) / ((-2.50) - (-3.04))
    } else {
        // Not significant
        0.50_f64
            .min(0.25 + (0.25) * (t_stat - (-2.50)) / ((-1.50) - (-2.50)))
            .max(0.25)
    }
}

// ── Ornstein-Uhlenbeck half-life ────────────────────────────────────────────

/// Half-life of mean reversion from an AR(1) fit on the spread.
///
/// Fits: r_t = phi * r_{t-1} + epsilon
/// theta = phi - 1 (speed of mean reversion)
/// half_life = -ln(2) / ln(phi)
///
/// Returns `None` if the spread is not mean-reverting (phi >= 1).
fn ornstein_uhlenbeck_half_life(residuals: &[f64]) -> Option<f64> {
    let n = residuals.len();
    if n < 3 {
        return None;
    }

    // AR(1) regression: r_t = phi * r_{t-1}
    let mut sum_xy: f64 = 0.0;
    let mut sum_x2: f64 = 0.0;

    for t in 1..n {
        sum_xy += residuals[t] * residuals[t - 1];
        sum_x2 += residuals[t - 1] * residuals[t - 1];
    }

    if sum_x2 < 1e-15 {
        return None;
    }

    let phi = sum_xy / sum_x2;

    if phi >= 1.0 || phi <= 0.0 {
        return None; // Not mean-reverting or explosive
    }

    let half_life = -f64::ln(2.0) / f64::ln(phi);

    if half_life.is_finite() && half_life > 0.0 {
        Some(half_life)
    } else {
        None
    }
}

// ── Pearson correlation ─────────────────────────────────────────────────────

/// Pearson correlation coefficient between two series.
pub fn pearson_correlation(x: &[f64], y: &[f64]) -> Result<f64, CointegrationMathError> {
    let n = x.len() as f64;
    let mean_x = mean(x);
    let mean_y = mean(y);

    let mut sum_xy: f64 = 0.0;
    let mut sum_x2: f64 = 0.0;
    let mut sum_y2: f64 = 0.0;

    for (xi, yi) in x.iter().zip(y.iter()) {
        let dx = xi - mean_x;
        let dy = yi - mean_y;
        sum_xy += dx * dy;
        sum_x2 += dx * dx;
        sum_y2 += dy * dy;
    }

    let denom = (sum_x2 * sum_y2).sqrt();
    if denom < 1e-15 * n {
        return Ok(0.0);
    }

    Ok(sum_xy / denom)
}

// ── Helpers ─────────────────────────────────────────────────────────────────

fn mean(data: &[f64]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    data.iter().sum::<f64>() / data.len() as f64
}

fn std_dev(data: &[f64], m: f64) -> f64 {
    if data.len() < 2 {
        return 0.0;
    }
    let variance = data.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (data.len() - 1) as f64;
    variance.sqrt()
}

#[cfg(test)]
mod tests;