u_analytics/
regression.rs

1//! Regression analysis.
2//!
3//! Simple and multiple linear regression with OLS, R², coefficient testing,
4//! ANOVA, VIF, and residual diagnostics.
5//!
6//! # Examples
7//!
8//! ```
9//! use u_analytics::regression::simple_linear_regression;
10//!
11//! let x = [1.0, 2.0, 3.0, 4.0, 5.0];
12//! let y = [2.1, 3.9, 6.1, 7.9, 10.1];
13//! let result = simple_linear_regression(&x, &y).unwrap();
14//! assert!((result.slope - 2.0).abs() < 0.1);
15//! assert!((result.intercept - 0.1).abs() < 0.2);
16//! assert!(result.r_squared > 0.99);
17//! ```
18
19use u_numflow::matrix::Matrix;
20use u_numflow::special;
21use u_numflow::stats;
22
23/// Result of a simple linear regression: y = intercept + slope · x.
24#[derive(Debug, Clone)]
25pub struct SimpleRegressionResult {
26    /// Slope coefficient (β₁).
27    pub slope: f64,
28    /// Intercept (β₀).
29    pub intercept: f64,
30    /// Coefficient of determination (R²).
31    pub r_squared: f64,
32    /// Adjusted R² = 1 - (1-R²)(n-1)/(n-2).
33    pub adjusted_r_squared: f64,
34    /// Standard error of the slope.
35    pub slope_se: f64,
36    /// Standard error of the intercept.
37    pub intercept_se: f64,
38    /// t-statistic for slope (H₀: β₁ = 0).
39    pub slope_t: f64,
40    /// t-statistic for intercept (H₀: β₀ = 0).
41    pub intercept_t: f64,
42    /// p-value for slope.
43    pub slope_p: f64,
44    /// p-value for intercept.
45    pub intercept_p: f64,
46    /// Residual standard error (√(SSE/(n-2))).
47    pub residual_se: f64,
48    /// F-statistic (= t² for simple regression).
49    pub f_statistic: f64,
50    /// p-value for F-statistic.
51    pub f_p_value: f64,
52    /// Residuals (yᵢ - ŷᵢ).
53    pub residuals: Vec<f64>,
54    /// Fitted values (ŷᵢ).
55    pub fitted: Vec<f64>,
56    /// Sample size.
57    pub n: usize,
58}
59
60/// Result of a multiple linear regression: y = Xβ + ε.
61#[derive(Debug, Clone)]
62pub struct MultipleRegressionResult {
63    /// Coefficient vector [β₀, β₁, ..., βₚ] (intercept first).
64    pub coefficients: Vec<f64>,
65    /// Standard errors of coefficients.
66    pub std_errors: Vec<f64>,
67    /// t-statistics for each coefficient.
68    pub t_statistics: Vec<f64>,
69    /// p-values for each coefficient.
70    pub p_values: Vec<f64>,
71    /// Coefficient of determination (R²).
72    pub r_squared: f64,
73    /// Adjusted R² = 1 - (1-R²)(n-1)/(n-p-1).
74    pub adjusted_r_squared: f64,
75    /// F-statistic for overall significance.
76    pub f_statistic: f64,
77    /// p-value for F-statistic.
78    pub f_p_value: f64,
79    /// Residual standard error.
80    pub residual_se: f64,
81    /// Residuals.
82    pub residuals: Vec<f64>,
83    /// Fitted values.
84    pub fitted: Vec<f64>,
85    /// VIF (Variance Inflation Factor) for each predictor (excludes intercept).
86    pub vif: Vec<f64>,
87    /// Sample size.
88    pub n: usize,
89    /// Number of predictors (excluding intercept).
90    pub p: usize,
91}
92
93// ---------------------------------------------------------------------------
94// Simple Linear Regression
95// ---------------------------------------------------------------------------
96
97/// Computes simple linear regression (OLS closed-form).
98///
99/// # Algorithm
100///
101/// β₁ = cov(x,y) / var(x)
102/// β₀ = ȳ - β₁·x̄
103///
104/// # Returns
105///
106/// `None` if fewer than 3 observations, slices differ in length, x has zero
107/// variance, or inputs contain non-finite values.
108///
109/// # References
110///
111/// Draper & Smith (1998). "Applied Regression Analysis", 3rd edition.
112///
113/// # Examples
114///
115/// ```
116/// use u_analytics::regression::simple_linear_regression;
117///
118/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
119/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
120/// let r = simple_linear_regression(&x, &y).unwrap();
121/// assert!((r.slope - 2.0).abs() < 1e-10);
122/// assert!((r.intercept).abs() < 1e-10);
123/// assert!((r.r_squared - 1.0).abs() < 1e-10);
124/// ```
125pub fn simple_linear_regression(x: &[f64], y: &[f64]) -> Option<SimpleRegressionResult> {
126    let n = x.len();
127    if n < 3 || n != y.len() {
128        return None;
129    }
130    if x.iter().any(|v| !v.is_finite()) || y.iter().any(|v| !v.is_finite()) {
131        return None;
132    }
133
134    let x_mean = stats::mean(x)?;
135    let y_mean = stats::mean(y)?;
136    let x_var = stats::variance(x)?;
137    let cov = stats::covariance(x, y)?;
138
139    if x_var < 1e-300 {
140        return None; // zero variance in x
141    }
142
143    let slope = cov / x_var;
144    let intercept = y_mean - slope * x_mean;
145
146    // Fitted values and residuals
147    let fitted: Vec<f64> = x.iter().map(|&xi| intercept + slope * xi).collect();
148    let residuals: Vec<f64> = y
149        .iter()
150        .zip(fitted.iter())
151        .map(|(&yi, &fi)| yi - fi)
152        .collect();
153
154    // Sum of squares
155    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
156    let ss_tot: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
157
158    let nf = n as f64;
159    let df_res = nf - 2.0;
160
161    let r_squared = if ss_tot > 1e-300 {
162        1.0 - ss_res / ss_tot
163    } else {
164        1.0
165    };
166    let adjusted_r_squared = 1.0 - (1.0 - r_squared) * (nf - 1.0) / df_res;
167
168    // Residual standard error
169    let mse = ss_res / df_res;
170    let residual_se = mse.sqrt();
171
172    // Standard errors of coefficients
173    let ss_x: f64 = x.iter().map(|&xi| (xi - x_mean).powi(2)).sum();
174    let slope_se = (mse / ss_x).sqrt();
175    let intercept_se = (mse * (1.0 / nf + x_mean * x_mean / ss_x)).sqrt();
176
177    // t-statistics
178    let slope_t = if slope_se > 1e-300 {
179        slope / slope_se
180    } else {
181        f64::INFINITY
182    };
183    let intercept_t = if intercept_se > 1e-300 {
184        intercept / intercept_se
185    } else {
186        f64::INFINITY
187    };
188
189    // p-values via t-distribution
190    let slope_p = 2.0 * (1.0 - special::t_distribution_cdf(slope_t.abs(), df_res));
191    let intercept_p = 2.0 * (1.0 - special::t_distribution_cdf(intercept_t.abs(), df_res));
192
193    // F-statistic (= t² for simple regression)
194    let f_statistic = slope_t * slope_t;
195    let f_p_value = if f_statistic.is_infinite() {
196        0.0
197    } else {
198        1.0 - special::f_distribution_cdf(f_statistic, 1.0, df_res)
199    };
200
201    Some(SimpleRegressionResult {
202        slope,
203        intercept,
204        r_squared,
205        adjusted_r_squared,
206        slope_se,
207        intercept_se,
208        slope_t,
209        intercept_t,
210        slope_p,
211        intercept_p,
212        residual_se,
213        f_statistic,
214        f_p_value,
215        residuals,
216        fitted,
217        n,
218    })
219}
220
221// ---------------------------------------------------------------------------
222// Multiple Linear Regression
223// ---------------------------------------------------------------------------
224
225/// Computes multiple linear regression via OLS (Cholesky solve).
226///
227/// # Arguments
228///
229/// * `predictors` — Slice of predictor variable slices. Each inner slice is
230///   one predictor's observations. All must have the same length.
231/// * `y` — Response variable observations.
232///
233/// # Algorithm
234///
235/// Constructs the design matrix X = [1 | x₁ | x₂ | ... | xₚ] (intercept column prepended),
236/// then solves the normal equations X'Xβ = X'y via Cholesky decomposition.
237///
238/// # Returns
239///
240/// `None` if n < p+2, predictor lengths differ, inputs contain non-finite
241/// values, or the system is singular.
242///
243/// # References
244///
245/// Draper & Smith (1998). "Applied Regression Analysis", 3rd edition.
246/// Montgomery, Peck & Vining (2012). "Introduction to Linear Regression Analysis", 5th edition.
247///
248/// # Examples
249///
250/// ```
251/// use u_analytics::regression::multiple_linear_regression;
252///
253/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
254/// let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
255/// let y  = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
256/// let result = multiple_linear_regression(&[&x1, &x2], &y).unwrap();
257/// assert!(result.r_squared > 0.95);
258/// assert_eq!(result.coefficients.len(), 3); // intercept + 2 predictors
259/// ```
260pub fn multiple_linear_regression(
261    predictors: &[&[f64]],
262    y: &[f64],
263) -> Option<MultipleRegressionResult> {
264    let p = predictors.len(); // number of predictors
265    let n = y.len();
266
267    if p == 0 || n < p + 2 {
268        return None;
269    }
270
271    // Validate lengths and finite values
272    for pred in predictors {
273        if pred.len() != n {
274            return None;
275        }
276        if pred.iter().any(|v| !v.is_finite()) {
277            return None;
278        }
279    }
280    if y.iter().any(|v| !v.is_finite()) {
281        return None;
282    }
283
284    let ncols = p + 1; // intercept + predictors
285
286    // Build design matrix X (n × ncols, row-major)
287    let mut x_data = Vec::with_capacity(n * ncols);
288    for i in 0..n {
289        x_data.push(1.0); // intercept
290        for pred in predictors {
291            x_data.push(pred[i]);
292        }
293    }
294    let x_mat = Matrix::new(n, ncols, x_data).ok()?;
295
296    // X'X (ncols × ncols)
297    let xt = x_mat.transpose();
298    let xtx = xt.mul_mat(&x_mat).ok()?;
299
300    // X'y (ncols × 1)
301    let xty = xt.mul_vec(y).ok()?;
302
303    // Solve via Cholesky: β = (X'X)⁻¹ X'y
304    let coefficients = xtx.cholesky_solve(&xty).ok()?;
305
306    // Fitted values and residuals
307    let fitted = x_mat.mul_vec(&coefficients).ok()?;
308    let residuals: Vec<f64> = y
309        .iter()
310        .zip(fitted.iter())
311        .map(|(&yi, &fi)| yi - fi)
312        .collect();
313
314    // R² and adjusted R²
315    let y_mean = stats::mean(y)?;
316    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
317    let ss_tot: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
318
319    let nf = n as f64;
320    let pf = p as f64;
321    let df_res = nf - pf - 1.0;
322
323    let r_squared = if ss_tot > 1e-300 {
324        1.0 - ss_res / ss_tot
325    } else {
326        1.0
327    };
328    let adjusted_r_squared = 1.0 - (1.0 - r_squared) * (nf - 1.0) / df_res;
329
330    // Residual standard error
331    let mse = ss_res / df_res;
332    let residual_se = mse.sqrt();
333
334    // Standard errors: SE(β) = sqrt(diag((X'X)⁻¹) · MSE)
335    let xtx_inv = xtx.inverse().ok()?;
336    let mut std_errors = Vec::with_capacity(ncols);
337    let mut t_statistics = Vec::with_capacity(ncols);
338    let mut p_values = Vec::with_capacity(ncols);
339
340    for (j, &coeff_j) in coefficients.iter().enumerate() {
341        let se = (xtx_inv.get(j, j) * mse).sqrt();
342        std_errors.push(se);
343        let t = if se > 1e-300 {
344            coeff_j / se
345        } else {
346            f64::INFINITY
347        };
348        t_statistics.push(t);
349        let pv = 2.0 * (1.0 - special::t_distribution_cdf(t.abs(), df_res));
350        p_values.push(pv);
351    }
352
353    // F-statistic: (SS_reg / p) / (SS_res / (n-p-1))
354    let ss_reg = ss_tot - ss_res;
355    let f_statistic = if pf > 0.0 && mse > 1e-300 {
356        (ss_reg / pf) / mse
357    } else {
358        0.0
359    };
360    let f_p_value = if f_statistic.is_infinite() || f_statistic.is_nan() {
361        0.0
362    } else {
363        1.0 - special::f_distribution_cdf(f_statistic, pf, df_res)
364    };
365
366    // VIF for each predictor: VIF_j = 1/(1 - R²_j) where R²_j is from
367    // regressing x_j on all other predictors
368    let vif = compute_vif(predictors);
369
370    Some(MultipleRegressionResult {
371        coefficients,
372        std_errors,
373        t_statistics,
374        p_values,
375        r_squared,
376        adjusted_r_squared,
377        f_statistic,
378        f_p_value,
379        residual_se,
380        residuals,
381        fitted,
382        vif,
383        n,
384        p,
385    })
386}
387
388/// Computes VIF for each predictor by regressing each on all others.
389fn compute_vif(predictors: &[&[f64]]) -> Vec<f64> {
390    let p = predictors.len();
391    if p < 2 {
392        return vec![1.0; p];
393    }
394
395    let mut vif = Vec::with_capacity(p);
396    for j in 0..p {
397        // Regress x_j on all other predictors
398        let y_j = predictors[j];
399        let others: Vec<&[f64]> = predictors
400            .iter()
401            .enumerate()
402            .filter(|&(i, _)| i != j)
403            .map(|(_, v)| *v)
404            .collect();
405
406        if let Some(result) = multiple_linear_regression(&others, y_j) {
407            let r2 = result.r_squared;
408            if r2 < 1.0 - 1e-15 {
409                vif.push(1.0 / (1.0 - r2));
410            } else {
411                vif.push(f64::INFINITY); // perfect multicollinearity
412            }
413        } else {
414            vif.push(f64::NAN);
415        }
416    }
417    vif
418}
419
420/// Predicts y values given new x data and a simple regression result.
421///
422/// # Examples
423///
424/// ```
425/// use u_analytics::regression::{simple_linear_regression, predict_simple};
426///
427/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
428/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
429/// let model = simple_linear_regression(&x, &y).unwrap();
430/// let pred = predict_simple(&model, &[6.0, 7.0]);
431/// assert!((pred[0] - 12.0).abs() < 1e-10);
432/// assert!((pred[1] - 14.0).abs() < 1e-10);
433/// ```
434pub fn predict_simple(model: &SimpleRegressionResult, x_new: &[f64]) -> Vec<f64> {
435    x_new
436        .iter()
437        .map(|&xi| model.intercept + model.slope * xi)
438        .collect()
439}
440
441/// Predicts y values given new predictor data and a multiple regression result.
442///
443/// # Arguments
444///
445/// * `model` — Multiple regression result.
446/// * `predictors_new` — Slice of predictor slices (same order as training).
447///
448/// Returns `None` if predictor count doesn't match or lengths differ.
449pub fn predict_multiple(
450    model: &MultipleRegressionResult,
451    predictors_new: &[&[f64]],
452) -> Option<Vec<f64>> {
453    if predictors_new.len() != model.p {
454        return None;
455    }
456    let n = predictors_new.first()?.len();
457    for pred in predictors_new {
458        if pred.len() != n {
459            return None;
460        }
461    }
462
463    let mut result = Vec::with_capacity(n);
464    for i in 0..n {
465        let mut yi = model.coefficients[0]; // intercept
466        for (j, pred) in predictors_new.iter().enumerate() {
467            yi += model.coefficients[j + 1] * pred[i];
468        }
469        result.push(yi);
470    }
471    Some(result)
472}
473
474// ---------------------------------------------------------------------------
475// Multicollinearity diagnostics (public)
476// ---------------------------------------------------------------------------
477
478/// Computes the Variance Inflation Factor for each predictor.
479///
480/// `VIF_j = 1 / (1 - R²_j)`, where `R²_j` is the coefficient of determination
481/// from regressing predictor `j` on all other predictors. Standard thresholds
482/// are `VIF > 5` (moderate multicollinearity) and `VIF > 10` (severe).
483///
484/// # Returns
485///
486/// `None` if `predictors` is empty, lengths differ, fewer than `p+1`
487/// observations, or any predictor contains a non-finite value. With a single
488/// predictor, returns `Some(vec![1.0])` since VIF is undefined for one
489/// regressor.
490///
491/// # References
492///
493/// Marquardt (1970). "Generalized inverses, ridge regression."
494/// Belsley, Kuh, Welsch (1980). *Regression Diagnostics*.
495///
496/// # Examples
497///
498/// ```
499/// use u_analytics::regression::vif;
500///
501/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
502/// let x2 = [8.0, 1.0, 6.0, 3.0, 5.0, 7.0, 2.0, 4.0];
503/// let v = vif(&[&x1, &x2]).unwrap();
504/// assert!(v[0] < 2.0); // independent predictors
505/// ```
506pub fn vif(predictors: &[&[f64]]) -> Option<Vec<f64>> {
507    let p = predictors.len();
508    if p == 0 {
509        return None;
510    }
511    let n = predictors[0].len();
512    if !predictors.iter().all(|c| c.len() == n) {
513        return None;
514    }
515    if n <= p {
516        return None;
517    }
518    if predictors.iter().any(|c| c.iter().any(|v| !v.is_finite())) {
519        return None;
520    }
521    Some(compute_vif(predictors))
522}
523
524/// Computes the 2-norm condition number of the sample covariance matrix.
525///
526/// `cond(Σ) = λ_max / λ_min`, where `λ` are the eigenvalues of the unbiased
527/// sample covariance matrix of the (column-major) predictors. Standard
528/// threshold: `cond > 30` indicates multicollinearity worth investigating;
529/// `cond > 100` is severe (Belsley 1991).
530///
531/// # Returns
532///
533/// `None` for empty input, length mismatch, fewer than `p+1` observations,
534/// non-finite values, or eigenvalue decomposition failure. Returns
535/// `f64::INFINITY` when the smallest eigenvalue is below `1e-15`
536/// (numerically singular).
537///
538/// # References
539///
540/// Belsley (1991). *Conditioning Diagnostics*.
541///
542/// # Examples
543///
544/// ```
545/// use u_analytics::regression::condition_number;
546///
547/// let x1 = [1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
548/// let x2 = [1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0];
549/// let c = condition_number(&[&x1, &x2]).unwrap();
550/// assert!(c < 5.0); // near-orthogonal columns: cond should be small
551/// ```
552pub fn condition_number(predictors: &[&[f64]]) -> Option<f64> {
553    use u_numflow::matrix::Matrix;
554    let p = predictors.len();
555    if p == 0 {
556        return None;
557    }
558    let n = predictors[0].len();
559    if !predictors.iter().all(|c| c.len() == n) {
560        return None;
561    }
562    if n <= p {
563        return None;
564    }
565    if predictors.iter().any(|c| c.iter().any(|v| !v.is_finite())) {
566        return None;
567    }
568
569    // Sample covariance matrix (unbiased, denominator n-1)
570    let mut means = vec![0.0_f64; p];
571    for (j, col) in predictors.iter().enumerate() {
572        means[j] = col.iter().sum::<f64>() / n as f64;
573    }
574    let mut cov = vec![0.0_f64; p * p];
575    let denom = (n - 1) as f64;
576    for r in 0..p {
577        let col_r = predictors[r];
578        let mean_r = means[r];
579        for c in r..p {
580            let col_c = predictors[c];
581            let mean_c = means[c];
582            let sum: f64 = col_r
583                .iter()
584                .zip(col_c.iter())
585                .map(|(&a, &b)| (a - mean_r) * (b - mean_c))
586                .sum();
587            let v = sum / denom;
588            cov[r * p + c] = v;
589            cov[c * p + r] = v;
590        }
591    }
592
593    let mat = Matrix::new(p, p, cov).ok()?;
594    let (eigenvalues, _eigenvectors) = mat.eigen_symmetric().ok()?;
595    let lambda_max = eigenvalues
596        .iter()
597        .copied()
598        .fold(f64::NEG_INFINITY, f64::max);
599    let lambda_min = eigenvalues.iter().copied().fold(f64::INFINITY, f64::min);
600
601    if !lambda_max.is_finite() || lambda_max <= 0.0 {
602        return None;
603    }
604    if lambda_min < 1e-15 {
605        return Some(f64::INFINITY);
606    }
607    Some(lambda_max / lambda_min)
608}
609
610#[cfg(test)]
611mod tests {
612    use super::*;
613
614    // -----------------------------------------------------------------------
615    // Simple regression tests
616    // -----------------------------------------------------------------------
617
618    #[test]
619    fn simple_perfect_fit() {
620        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
621        let y = [3.0, 5.0, 7.0, 9.0, 11.0]; // y = 1 + 2x
622        let r = simple_linear_regression(&x, &y).expect("should compute");
623        assert!((r.slope - 2.0).abs() < 1e-10);
624        assert!((r.intercept - 1.0).abs() < 1e-10);
625        assert!((r.r_squared - 1.0).abs() < 1e-10);
626    }
627
628    #[test]
629    fn simple_with_noise() {
630        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
631        let y = [2.1, 3.9, 6.1, 7.9, 10.1]; // y ≈ 2x + 0.1
632        let r = simple_linear_regression(&x, &y).expect("should compute");
633        assert!((r.slope - 2.0).abs() < 0.1);
634        assert!(r.r_squared > 0.99);
635        assert_eq!(r.residuals.len(), 5);
636        assert_eq!(r.fitted.len(), 5);
637    }
638
639    #[test]
640    fn simple_residuals_sum_to_zero() {
641        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
642        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
643        let r = simple_linear_regression(&x, &y).expect("should compute");
644        let sum: f64 = r.residuals.iter().sum();
645        assert!(sum.abs() < 1e-10, "residuals sum = {sum}");
646    }
647
648    #[test]
649    fn simple_f_equals_t_squared() {
650        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
651        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
652        let r = simple_linear_regression(&x, &y).expect("should compute");
653        assert!(
654            (r.f_statistic - r.slope_t * r.slope_t).abs() < 1e-8,
655            "F = {}, t² = {}",
656            r.f_statistic,
657            r.slope_t * r.slope_t
658        );
659    }
660
661    #[test]
662    fn simple_significant_slope() {
663        let x: Vec<f64> = (0..20).map(|i| i as f64).collect();
664        let y: Vec<f64> = x.iter().map(|&xi| 3.0 + 2.0 * xi).collect();
665        let r = simple_linear_regression(&x, &y).expect("should compute");
666        assert!(r.slope_p < 1e-10, "slope p = {}", r.slope_p);
667        assert!(r.f_p_value < 1e-10, "F p = {}", r.f_p_value);
668    }
669
670    #[test]
671    fn simple_negative_slope() {
672        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
673        let y = [10.0, 8.0, 6.0, 4.0, 2.0]; // y = 12 - 2x
674        let r = simple_linear_regression(&x, &y).expect("should compute");
675        assert!((r.slope + 2.0).abs() < 1e-10);
676        assert!((r.intercept - 12.0).abs() < 1e-10);
677    }
678
679    #[test]
680    fn simple_edge_cases() {
681        assert!(simple_linear_regression(&[1.0, 2.0], &[3.0, 4.0]).is_none()); // n < 3
682        assert!(simple_linear_regression(&[1.0, 2.0, 3.0], &[4.0, 5.0]).is_none()); // mismatch
683        assert!(simple_linear_regression(&[5.0, 5.0, 5.0], &[1.0, 2.0, 3.0]).is_none()); // zero var
684        assert!(simple_linear_regression(&[1.0, f64::NAN, 3.0], &[4.0, 5.0, 6.0]).is_none());
685    }
686
687    /// Verifies exact OLS numeric reference from the spec.
688    ///
689    /// x = [1,2,3,4,5], y = [2,4,5,4,5]
690    /// x̄=3, ȳ=4
691    /// Σ(xᵢ-x̄)(yᵢ-ȳ) = 6, Σ(xᵢ-x̄)² = 10
692    /// β̂₁ = 6/10 = 0.6, β̂₀ = 4 - 0.6·3 = 2.2
693    /// SS_tot = 6, SS_res = 2.4, R² = 1 - 2.4/6 = 0.6
694    ///
695    /// Reference: Draper & Smith (1998), Applied Regression Analysis, 3rd ed.
696    #[test]
697    fn simple_numeric_reference_ols() {
698        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
699        let y = [2.0, 4.0, 5.0, 4.0, 5.0];
700        let r = simple_linear_regression(&x, &y).expect("should compute");
701
702        assert!(
703            (r.slope - 0.6).abs() < 1e-10,
704            "β̂₁ expected 0.6, got {}",
705            r.slope
706        );
707        assert!(
708            (r.intercept - 2.2).abs() < 1e-10,
709            "β̂₀ expected 2.2, got {}",
710            r.intercept
711        );
712        assert!(
713            (r.r_squared - 0.6).abs() < 1e-3,
714            "R² expected 0.6, got {}",
715            r.r_squared
716        );
717
718        // Verify fitted values
719        // ŷ₁=2.8, ŷ₂=3.4, ŷ₃=4.0, ŷ₄=4.6, ŷ₅=5.2
720        let expected_fitted = [2.8, 3.4, 4.0, 4.6, 5.2];
721        for (i, (&fi, &ef)) in r.fitted.iter().zip(expected_fitted.iter()).enumerate() {
722            assert!(
723                (fi - ef).abs() < 1e-10,
724                "ŷ_{} expected {}, got {}",
725                i + 1,
726                ef,
727                fi
728            );
729        }
730    }
731
732    #[test]
733    fn simple_adjusted_r_squared() {
734        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
735        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
736        let r = simple_linear_regression(&x, &y).expect("should compute");
737        // For perfect fit, adjusted R² ≈ R²
738        assert!(r.adjusted_r_squared <= r.r_squared);
739        assert!(r.adjusted_r_squared > 0.98);
740    }
741
742    // -----------------------------------------------------------------------
743    // Multiple regression tests
744    // -----------------------------------------------------------------------
745
746    #[test]
747    fn multiple_perfect_fit() {
748        // y = 1 + 2*x1 + 3*x2
749        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
750        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
751        let y: Vec<f64> = x1
752            .iter()
753            .zip(x2.iter())
754            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
755            .collect();
756        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
757
758        assert!(
759            (r.coefficients[0] - 1.0).abs() < 1e-8,
760            "β₀ = {}",
761            r.coefficients[0]
762        );
763        assert!(
764            (r.coefficients[1] - 2.0).abs() < 1e-8,
765            "β₁ = {}",
766            r.coefficients[1]
767        );
768        assert!(
769            (r.coefficients[2] - 3.0).abs() < 1e-8,
770            "β₂ = {}",
771            r.coefficients[2]
772        );
773        assert!((r.r_squared - 1.0).abs() < 1e-8);
774    }
775
776    #[test]
777    fn multiple_with_noise() {
778        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
779        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
780        let y = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
781        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
782        assert!(r.r_squared > 0.95);
783        assert_eq!(r.coefficients.len(), 3);
784        assert_eq!(r.std_errors.len(), 3);
785        assert_eq!(r.residuals.len(), 8);
786        assert_eq!(r.vif.len(), 2);
787    }
788
789    #[test]
790    fn multiple_residuals_sum_to_zero() {
791        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
792        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
793        let y = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
794        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
795        let sum: f64 = r.residuals.iter().sum();
796        assert!(sum.abs() < 1e-8, "residuals sum = {sum}");
797    }
798
799    #[test]
800    fn multiple_single_predictor_matches_simple() {
801        let x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
802        let y: Vec<f64> = x
803            .iter()
804            .map(|&xi| 3.0 + 2.5 * xi + 0.1 * (xi - 5.0))
805            .collect();
806
807        let simple = simple_linear_regression(&x, &y).expect("simple");
808        let multi = multiple_linear_regression(&[&x], &y).expect("multiple");
809
810        assert!(
811            (simple.slope - multi.coefficients[1]).abs() < 1e-8,
812            "slope: {} vs {}",
813            simple.slope,
814            multi.coefficients[1]
815        );
816        assert!(
817            (simple.intercept - multi.coefficients[0]).abs() < 1e-8,
818            "intercept: {} vs {}",
819            simple.intercept,
820            multi.coefficients[0]
821        );
822        assert!((simple.r_squared - multi.r_squared).abs() < 1e-8);
823    }
824
825    #[test]
826    fn multiple_edge_cases() {
827        let x1 = [1.0, 2.0];
828        let y = [3.0, 4.0];
829        assert!(multiple_linear_regression(&[&x1], &y).is_none()); // n < p+2
830
831        let x2 = [1.0, 2.0, 3.0];
832        let y2 = [4.0, 5.0];
833        assert!(multiple_linear_regression(&[&x2], &y2).is_none()); // length mismatch
834    }
835
836    #[test]
837    fn multiple_vif_independent_predictors() {
838        // Independent predictors → VIF ≈ 1
839        let x1 = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0];
840        let x2 = [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
841        let y: Vec<f64> = x1
842            .iter()
843            .zip(x2.iter())
844            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
845            .collect();
846        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
847        for (i, &v) in r.vif.iter().enumerate() {
848            assert!(
849                v < 2.0,
850                "VIF[{i}] = {v}, expected near 1.0 for independent predictors"
851            );
852        }
853    }
854
855    #[test]
856    fn multiple_vif_correlated_predictors() {
857        // Highly but not perfectly correlated predictors → higher VIF
858        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
859        // Add small perturbation to break perfect collinearity
860        let noise = [
861            0.1, -0.2, 0.3, -0.1, 0.2, -0.3, 0.1, -0.1, 0.2, -0.2, 0.3, -0.1, 0.1, -0.2, 0.3, -0.3,
862            0.1, -0.1, 0.2, -0.2,
863        ];
864        let x2: Vec<f64> = x1
865            .iter()
866            .zip(noise.iter())
867            .map(|(&v, &n)| v * 0.9 + 1.0 + n)
868            .collect();
869        let y: Vec<f64> = x1
870            .iter()
871            .zip(x2.iter())
872            .map(|(&a, &b)| 1.0 + a + b)
873            .collect();
874        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
875        // VIF > 5 for highly correlated predictors
876        assert!(
877            r.vif[0] > 5.0,
878            "VIF[0] = {}, expected > 5.0 for correlated predictors",
879            r.vif[0]
880        );
881    }
882
883    // -----------------------------------------------------------------------
884    // Prediction tests
885    // -----------------------------------------------------------------------
886
887    #[test]
888    fn predict_simple_basic() {
889        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
890        let y = [3.0, 5.0, 7.0, 9.0, 11.0]; // y = 1 + 2x
891        let model = simple_linear_regression(&x, &y).expect("should compute");
892        let pred = predict_simple(&model, &[6.0, 7.0]);
893        assert!((pred[0] - 13.0).abs() < 1e-10);
894        assert!((pred[1] - 15.0).abs() < 1e-10);
895    }
896
897    #[test]
898    fn predict_multiple_basic() {
899        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
900        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
901        let y: Vec<f64> = x1
902            .iter()
903            .zip(x2.iter())
904            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
905            .collect();
906        let model = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
907
908        let new_x1 = [11.0];
909        let new_x2 = [6.0];
910        let pred = predict_multiple(&model, &[&new_x1, &new_x2]).expect("should predict");
911        let expected = 1.0 + 2.0 * 11.0 + 3.0 * 6.0;
912        assert!(
913            (pred[0] - expected).abs() < 1e-6,
914            "pred = {}, expected = {}",
915            pred[0],
916            expected
917        );
918    }
919
920    // -----------------------------------------------------------------------
921    // Multicollinearity diagnostics — public vif() / condition_number()
922    // -----------------------------------------------------------------------
923
924    #[test]
925    fn vif_pub_independent_predictors() {
926        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
927        let x2 = [8.0, 1.0, 6.0, 3.0, 5.0, 7.0, 2.0, 4.0];
928        let v = vif(&[&x1, &x2]).expect("should compute");
929        for r in &v {
930            assert!(*r < 2.0, "VIF should be near 1.0 for independent, got {r}");
931        }
932    }
933
934    #[test]
935    fn vif_pub_collinear_predictors() {
936        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
937        let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 0.001).collect();
938        let v = vif(&[&x1[..], &x2[..]]).expect("should compute");
939        assert!(
940            v[0] > 50.0,
941            "VIF should explode under near-collinearity, got {}",
942            v[0]
943        );
944    }
945
946    #[test]
947    fn vif_pub_rejects_short_input() {
948        let x = [1.0, 2.0];
949        assert!(vif(&[&x[..], &x[..]]).is_none());
950    }
951
952    #[test]
953    fn vif_pub_rejects_non_finite() {
954        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
955        let x2 = [1.0, 2.0, f64::NAN, 4.0, 5.0, 6.0];
956        assert!(vif(&[&x1[..], &x2[..]]).is_none());
957    }
958
959    #[test]
960    fn condition_number_orthogonal() {
961        let x1 = [1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
962        let x2 = [1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0];
963        let c = condition_number(&[&x1[..], &x2[..]]).expect("should compute");
964        assert!(c < 5.0, "near-orthogonal: cond should be small, got {c}");
965    }
966
967    #[test]
968    fn condition_number_collinear_explodes() {
969        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
970        let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 1e-6).collect();
971        let c = condition_number(&[&x1[..], &x2[..]]).expect("should compute");
972        assert!(
973            c > 1e5 || c.is_infinite(),
974            "collinear: cond should be huge, got {c}"
975        );
976    }
977
978    #[test]
979    fn condition_number_rejects_short_input() {
980        let x = [1.0, 2.0];
981        assert!(condition_number(&[&x[..], &x[..]]).is_none());
982    }
983
984    #[test]
985    fn predict_multiple_wrong_predictors() {
986        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
987        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
988        let y: Vec<f64> = x1.iter().zip(x2.iter()).map(|(&a, &b)| a + b).collect();
989        let model = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
990
991        // Wrong number of predictors
992        assert!(predict_multiple(&model, &[&[1.0]]).is_none());
993    }
994}
995
996#[cfg(test)]
997mod proptests {
998    use super::*;
999    use proptest::prelude::*;
1000
1001    proptest! {
1002        #[test]
1003        fn simple_r_squared_bounded(
1004            data in proptest::collection::vec(-1e3_f64..1e3, 5..=30)
1005                .prop_flat_map(|x| {
1006                    let n = x.len();
1007                    (Just(x), proptest::collection::vec(-1e3_f64..1e3, n..=n))
1008                })
1009        ) {
1010            let (x, y) = data;
1011            if let Some(r) = simple_linear_regression(&x, &y) {
1012                prop_assert!(r.r_squared >= -0.01 && r.r_squared <= 1.01,
1013                    "R² = {}", r.r_squared);
1014            }
1015        }
1016
1017        #[test]
1018        fn simple_residuals_orthogonal_to_x(
1019            data in proptest::collection::vec(-1e3_f64..1e3, 5..=30)
1020                .prop_flat_map(|x| {
1021                    let n = x.len();
1022                    (Just(x), proptest::collection::vec(-1e3_f64..1e3, n..=n))
1023                })
1024        ) {
1025            let (x, y) = data;
1026            if let Some(r) = simple_linear_regression(&x, &y) {
1027                // Σ(xᵢ · eᵢ) should be near zero (OLS normal equation)
1028                let dot: f64 = x.iter().zip(r.residuals.iter()).map(|(&xi, &ei)| xi * ei).sum();
1029                let norm = r.residuals.iter().map(|e| e * e).sum::<f64>().sqrt();
1030                let x_norm = x.iter().map(|xi| xi * xi).sum::<f64>().sqrt();
1031                if norm > 1e-10 && x_norm > 1e-10 {
1032                    prop_assert!((dot / (norm * x_norm)).abs() < 1e-6,
1033                        "residuals not orthogonal to x: dot={dot}");
1034                }
1035            }
1036        }
1037
1038        #[test]
1039        fn multiple_r_squared_bounded(
1040            x1 in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
1041            x2_seed in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
1042            y_seed in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
1043        ) {
1044            let n = x1.len().min(x2_seed.len()).min(y_seed.len());
1045            let x2 = &x2_seed[..n];
1046            let y = &y_seed[..n];
1047            let x1 = &x1[..n];
1048            if let Some(r) = multiple_linear_regression(&[x1, x2], y) {
1049                prop_assert!(r.r_squared >= -0.01 && r.r_squared <= 1.01,
1050                    "R² = {}", r.r_squared);
1051            }
1052        }
1053    }
1054}
u_analytics/regression.rs

u_analytics/
regression.rs