Skip to main content

quant_metrics/cointegration/
mod.rs

1//! Engle-Granger cointegration test and pair-trading statistics.
2//!
3//! Pure math — no I/O, no async, no external dependencies beyond std + rust_decimal.
4//!
5//! ## Algorithm
6//!
7//! 1. OLS regression: `y = alpha + beta * x + residuals`
8//! 2. ADF test on residuals: tests if the spread is mean-reverting
9//! 3. Half-life via Ornstein-Uhlenbeck: `hl = -ln(2) / ln(1 + theta)`
10//! 4. Pearson correlation between the two price series
11
12use rust_decimal::Decimal;
13use thiserror::Error;
14
15/// Minimum number of observations for a meaningful cointegration test.
16pub const MIN_OBSERVATIONS: usize = 30;
17
18/// Errors from cointegration computation.
19#[derive(Debug, Error)]
20pub enum CointegrationMathError {
21    #[error("insufficient data: need at least {minimum} observations, got {actual}")]
22    InsufficientData { actual: usize, minimum: usize },
23
24    #[error("series length mismatch: {len_a} vs {len_b}")]
25    LengthMismatch { len_a: usize, len_b: usize },
26
27    #[error("degenerate data: {0}")]
28    DegenerateData(String),
29}
30
31/// Result of an Engle-Granger cointegration test.
32#[derive(Debug, Clone)]
33pub struct EngleGrangerResult {
34    /// ADF test statistic on the residuals.
35    pub adf_statistic: f64,
36
37    /// Approximate p-value for the ADF test.
38    /// Uses MacKinnon critical values for two-variable cointegration.
39    pub p_value: f64,
40
41    /// OLS regression coefficient (hedge ratio): y = alpha + beta * x.
42    pub beta: f64,
43
44    /// OLS intercept.
45    pub alpha: f64,
46
47    /// Half-life of mean reversion in periods (e.g., days).
48    /// Derived from Ornstein-Uhlenbeck process fit on the spread.
49    /// `None` if the spread is not mean-reverting (theta >= 0).
50    pub half_life: Option<f64>,
51
52    /// Pearson correlation between the two series.
53    pub correlation: f64,
54
55    /// Mean of the spread (residuals).
56    pub spread_mean: f64,
57
58    /// Standard deviation of the spread.
59    pub spread_std: f64,
60}
61
62/// Run an Engle-Granger cointegration test on two price series.
63///
64/// `prices_a` and `prices_b` are close prices in chronological order.
65/// Returns the ADF statistic, p-value, hedge ratio, half-life, and correlation.
66pub fn engle_granger(
67    prices_a: &[f64],
68    prices_b: &[f64],
69) -> Result<EngleGrangerResult, CointegrationMathError> {
70    let n = prices_a.len();
71
72    if n != prices_b.len() {
73        return Err(CointegrationMathError::LengthMismatch {
74            len_a: n,
75            len_b: prices_b.len(),
76        });
77    }
78
79    if n < MIN_OBSERVATIONS {
80        return Err(CointegrationMathError::InsufficientData {
81            actual: n,
82            minimum: MIN_OBSERVATIONS,
83        });
84    }
85
86    // Step 1: OLS regression y = alpha + beta * x
87    let (alpha, beta) = ols_regression(prices_a, prices_b)?;
88
89    // Step 2: Compute residuals (spread)
90    let residuals: Vec<f64> = prices_b
91        .iter()
92        .zip(prices_a.iter())
93        .map(|(y, x)| y - alpha - beta * x)
94        .collect();
95
96    let spread_mean = mean(&residuals);
97    let spread_std = std_dev(&residuals, spread_mean);
98
99    if spread_std < 1e-15 {
100        return Err(CointegrationMathError::DegenerateData(
101            "spread has zero variance".to_string(),
102        ));
103    }
104
105    // Step 3: ADF test on residuals
106    let adf_statistic = adf_test_statistic(&residuals)?;
107
108    // Step 4: Approximate p-value using MacKinnon critical values
109    let p_value = adf_p_value(adf_statistic, n);
110
111    // Step 5: Half-life via Ornstein-Uhlenbeck
112    let half_life = ornstein_uhlenbeck_half_life(&residuals);
113
114    // Step 6: Pearson correlation
115    let correlation = pearson_correlation(prices_a, prices_b)?;
116
117    Ok(EngleGrangerResult {
118        adf_statistic,
119        p_value,
120        beta,
121        alpha,
122        half_life,
123        correlation,
124        spread_mean,
125        spread_std,
126    })
127}
128
129/// Convert spread mean and std from f64 to Decimal for the port layer.
130pub fn spread_stats_to_decimal(mean: f64, std: f64) -> (Decimal, Decimal) {
131    let ratio_mean = Decimal::from_f64_retain(mean).unwrap_or(Decimal::ZERO);
132    let ratio_std = Decimal::from_f64_retain(std).unwrap_or(Decimal::ZERO);
133    (ratio_mean, ratio_std)
134}
135
136// ── OLS regression ──────────────────────────────────────────────────────────
137
138/// Simple OLS: y = alpha + beta * x.  Returns (alpha, beta).
139///
140/// Uses mean-centered formula to avoid catastrophic cancellation with
141/// large values (e.g., BTC prices at 50,000+).
142fn ols_regression(x: &[f64], y: &[f64]) -> Result<(f64, f64), CointegrationMathError> {
143    let n = x.len() as f64;
144    let mean_x = x.iter().sum::<f64>() / n;
145    let mean_y = y.iter().sum::<f64>() / n;
146
147    let mut sum_dx_dy: f64 = 0.0;
148    let mut sum_dx2: f64 = 0.0;
149
150    for (xi, yi) in x.iter().zip(y.iter()) {
151        let dx = xi - mean_x;
152        let dy = yi - mean_y;
153        sum_dx_dy += dx * dy;
154        sum_dx2 += dx * dx;
155    }
156
157    if sum_dx2 < 1e-15 {
158        return Err(CointegrationMathError::DegenerateData(
159            "x series has zero variance".to_string(),
160        ));
161    }
162
163    let beta = sum_dx_dy / sum_dx2;
164    let alpha = mean_y - beta * mean_x;
165
166    Ok((alpha, beta))
167}
168
169// ── ADF test ────────────────────────────────────────────────────────────────
170
171/// Augmented Dickey-Fuller test statistic on a residual series.
172///
173/// Tests: delta_r_t = theta * r_{t-1} + epsilon_t
174/// Under H0 (unit root / no cointegration): theta = 0
175/// Under H1 (stationarity / cointegration): theta < 0
176///
177/// Returns the t-statistic for theta.
178fn adf_test_statistic(residuals: &[f64]) -> Result<f64, CointegrationMathError> {
179    let n = residuals.len();
180    if n < 3 {
181        return Err(CointegrationMathError::InsufficientData {
182            actual: n,
183            minimum: 3,
184        });
185    }
186
187    // delta_r = r_t - r_{t-1}
188    // Regress delta_r on r_{t-1} (no constant, as the spread already has mean removed)
189    let mut sum_xy: f64 = 0.0;
190    let mut sum_x2: f64 = 0.0;
191    let mut sum_e2: f64 = 0.0;
192
193    for t in 1..n {
194        let x = residuals[t - 1];
195        let y = residuals[t] - residuals[t - 1];
196        sum_xy += x * y;
197        sum_x2 += x * x;
198    }
199
200    if sum_x2 < 1e-15 {
201        return Err(CointegrationMathError::DegenerateData(
202            "lagged residuals have zero variance".to_string(),
203        ));
204    }
205
206    let theta = sum_xy / sum_x2;
207
208    // Compute residuals of the ADF regression to get SE(theta)
209    for t in 1..n {
210        let x = residuals[t - 1];
211        let y = residuals[t] - residuals[t - 1];
212        let e = y - theta * x;
213        sum_e2 += e * e;
214    }
215
216    let m = (n - 1) as f64; // number of observations in the ADF regression
217    let sigma2 = sum_e2 / (m - 1.0); // variance of ADF residuals
218    let se_theta = (sigma2 / sum_x2).sqrt();
219
220    if se_theta < 1e-15 {
221        return Err(CointegrationMathError::DegenerateData(
222            "standard error of theta is zero".to_string(),
223        ));
224    }
225
226    Ok(theta / se_theta)
227}
228
229/// Approximate p-value for ADF test using MacKinnon critical values.
230///
231/// For Engle-Granger two-variable cointegration test (asymptotic):
232///   1% critical value ≈ -3.90
233///   5% critical value ≈ -3.34
234///   10% critical value ≈ -3.04
235///
236/// This is a rough interpolation suitable for decision-making.
237fn adf_p_value(t_stat: f64, _n: usize) -> f64 {
238    // MacKinnon (1994) critical values for two-variable case
239    if t_stat <= -3.90 {
240        0.01
241    } else if t_stat <= -3.34 {
242        // Linear interpolation between 1% and 5%
243        0.01 + (0.04) * (t_stat - (-3.90)) / ((-3.34) - (-3.90))
244    } else if t_stat <= -3.04 {
245        // Linear interpolation between 5% and 10%
246        0.05 + (0.05) * (t_stat - (-3.34)) / ((-3.04) - (-3.34))
247    } else if t_stat <= -2.50 {
248        // Linear interpolation between 10% and ~25%
249        0.10 + (0.15) * (t_stat - (-3.04)) / ((-2.50) - (-3.04))
250    } else {
251        // Not significant
252        0.50_f64
253            .min(0.25 + (0.25) * (t_stat - (-2.50)) / ((-1.50) - (-2.50)))
254            .max(0.25)
255    }
256}
257
258// ── Ornstein-Uhlenbeck half-life ────────────────────────────────────────────
259
260/// Half-life of mean reversion from an AR(1) fit on the spread.
261///
262/// Fits: r_t = phi * r_{t-1} + epsilon
263/// theta = phi - 1 (speed of mean reversion)
264/// half_life = -ln(2) / ln(phi)
265///
266/// Returns `None` if the spread is not mean-reverting (phi >= 1).
267fn ornstein_uhlenbeck_half_life(residuals: &[f64]) -> Option<f64> {
268    let n = residuals.len();
269    if n < 3 {
270        return None;
271    }
272
273    // AR(1) regression: r_t = phi * r_{t-1}
274    let mut sum_xy: f64 = 0.0;
275    let mut sum_x2: f64 = 0.0;
276
277    for t in 1..n {
278        sum_xy += residuals[t] * residuals[t - 1];
279        sum_x2 += residuals[t - 1] * residuals[t - 1];
280    }
281
282    if sum_x2 < 1e-15 {
283        return None;
284    }
285
286    let phi = sum_xy / sum_x2;
287
288    if phi >= 1.0 || phi <= 0.0 {
289        return None; // Not mean-reverting or explosive
290    }
291
292    let half_life = -f64::ln(2.0) / f64::ln(phi);
293
294    if half_life.is_finite() && half_life > 0.0 {
295        Some(half_life)
296    } else {
297        None
298    }
299}
300
301// ── Pearson correlation ─────────────────────────────────────────────────────
302
303/// Pearson correlation coefficient between two series.
304pub fn pearson_correlation(x: &[f64], y: &[f64]) -> Result<f64, CointegrationMathError> {
305    let n = x.len() as f64;
306    let mean_x = mean(x);
307    let mean_y = mean(y);
308
309    let mut sum_xy: f64 = 0.0;
310    let mut sum_x2: f64 = 0.0;
311    let mut sum_y2: f64 = 0.0;
312
313    for (xi, yi) in x.iter().zip(y.iter()) {
314        let dx = xi - mean_x;
315        let dy = yi - mean_y;
316        sum_xy += dx * dy;
317        sum_x2 += dx * dx;
318        sum_y2 += dy * dy;
319    }
320
321    let denom = (sum_x2 * sum_y2).sqrt();
322    if denom < 1e-15 * n {
323        return Ok(0.0);
324    }
325
326    Ok(sum_xy / denom)
327}
328
329// ── Helpers ─────────────────────────────────────────────────────────────────
330
331fn mean(data: &[f64]) -> f64 {
332    if data.is_empty() {
333        return 0.0;
334    }
335    data.iter().sum::<f64>() / data.len() as f64
336}
337
338fn std_dev(data: &[f64], m: f64) -> f64 {
339    if data.len() < 2 {
340        return 0.0;
341    }
342    let variance = data.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (data.len() - 1) as f64;
343    variance.sqrt()
344}
345
346#[cfg(test)]
347mod tests;