// u_analytics/regression.rs

//! Regression analysis.
//!
//! Simple and multiple linear regression with OLS, R², coefficient testing,
//! ANOVA, VIF, and residual diagnostics.
//!
//! # Examples
//!
//! ```
//! use u_analytics::regression::simple_linear_regression;
//!
//! let x = [1.0, 2.0, 3.0, 4.0, 5.0];
//! let y = [2.1, 3.9, 6.1, 7.9, 10.1];
//! let result = simple_linear_regression(&x, &y).unwrap();
//! assert!((result.slope - 2.0).abs() < 0.1);
//! assert!((result.intercept - 0.1).abs() < 0.2);
//! assert!(result.r_squared > 0.99);
//! ```

use u_numflow::matrix::Matrix;
use u_numflow::special;
use u_numflow::stats;

/// Result of a simple linear regression: y = intercept + slope · x.
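///
/// # Examples
///
/// A reading sketch for the inference fields (data as in the module example;
/// the thresholds below are illustrative, not part of the API contract):
///
/// ```
/// use u_analytics::regression::simple_linear_regression;
///
/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
/// let y = [2.1, 3.9, 6.1, 7.9, 10.1];
/// let r = simple_linear_regression(&x, &y).unwrap();
/// assert!(r.slope_p < 0.01);          // slope significantly nonzero
/// assert!(r.residual_se < 0.5);       // small residual scatter
/// assert_eq!(r.residuals.len(), r.n);
/// ```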
#[derive(Debug, Clone)]
pub struct SimpleRegressionResult {
    /// Slope coefficient (β₁).
    pub slope: f64,
    /// Intercept (β₀).
    pub intercept: f64,
    /// Coefficient of determination (R²).
    pub r_squared: f64,
    /// Adjusted R² = 1 - (1-R²)(n-1)/(n-2).
    pub adjusted_r_squared: f64,
    /// Standard error of the slope.
    pub slope_se: f64,
    /// Standard error of the intercept.
    pub intercept_se: f64,
    /// t-statistic for slope (H₀: β₁ = 0).
    pub slope_t: f64,
    /// t-statistic for intercept (H₀: β₀ = 0).
    pub intercept_t: f64,
    /// p-value for slope.
    pub slope_p: f64,
    /// p-value for intercept.
    pub intercept_p: f64,
    /// Residual standard error (√(SSE/(n-2))).
    pub residual_se: f64,
    /// F-statistic (= t² for simple regression).
    pub f_statistic: f64,
    /// p-value for F-statistic.
    pub f_p_value: f64,
    /// Residuals (yᵢ - ŷᵢ).
    pub residuals: Vec<f64>,
    /// Fitted values (ŷᵢ).
    pub fitted: Vec<f64>,
    /// Sample size.
    pub n: usize,
}

/// Result of a multiple linear regression: y = Xβ + ε.
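///
/// # Examples
///
/// A reading sketch (data as in [`multiple_linear_regression`]'s example);
/// the two equalities are structural invariants of the result:
///
/// ```
/// use u_analytics::regression::multiple_linear_regression;
///
/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
/// let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
/// let y  = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
/// let r = multiple_linear_regression(&[&x1, &x2], &y).unwrap();
/// assert_eq!(r.coefficients.len(), r.p + 1); // intercept + p predictors
/// assert_eq!(r.vif.len(), r.p);              // VIF excludes the intercept
/// ```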
#[derive(Debug, Clone)]
pub struct MultipleRegressionResult {
    /// Coefficient vector [β₀, β₁, ..., βₚ] (intercept first).
    pub coefficients: Vec<f64>,
    /// Standard errors of coefficients.
    pub std_errors: Vec<f64>,
    /// t-statistics for each coefficient.
    pub t_statistics: Vec<f64>,
    /// p-values for each coefficient.
    pub p_values: Vec<f64>,
    /// Coefficient of determination (R²).
    pub r_squared: f64,
    /// Adjusted R² = 1 - (1-R²)(n-1)/(n-p-1).
    pub adjusted_r_squared: f64,
    /// F-statistic for overall significance.
    pub f_statistic: f64,
    /// p-value for F-statistic.
    pub f_p_value: f64,
    /// Residual standard error.
    pub residual_se: f64,
    /// Residuals.
    pub residuals: Vec<f64>,
    /// Fitted values.
    pub fitted: Vec<f64>,
    /// VIF (Variance Inflation Factor) for each predictor (excludes intercept).
    pub vif: Vec<f64>,
    /// Sample size.
    pub n: usize,
    /// Number of predictors (excluding intercept).
    pub p: usize,
}

// ---------------------------------------------------------------------------
// Simple Linear Regression
// ---------------------------------------------------------------------------

/// Computes simple linear regression (OLS closed-form).
///
/// # Algorithm
///
/// β₁ = cov(x,y) / var(x)
///
/// β₀ = ȳ - β₁·x̄
///
/// # Returns
///
/// `None` if fewer than 3 observations, slices differ in length, x has zero
/// variance, or inputs contain non-finite values.
///
/// # References
///
/// Draper & Smith (1998). "Applied Regression Analysis", 3rd edition.
///
/// # Examples
///
/// ```
/// use u_analytics::regression::simple_linear_regression;
///
/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
/// let r = simple_linear_regression(&x, &y).unwrap();
/// assert!((r.slope - 2.0).abs() < 1e-10);
/// assert!((r.intercept).abs() < 1e-10);
/// assert!((r.r_squared - 1.0).abs() < 1e-10);
/// ```
pub fn simple_linear_regression(x: &[f64], y: &[f64]) -> Option<SimpleRegressionResult> {
    let n = x.len();
    if n < 3 || n != y.len() {
        return None;
    }
    if x.iter().any(|v| !v.is_finite()) || y.iter().any(|v| !v.is_finite()) {
        return None;
    }

    let x_mean = stats::mean(x)?;
    let y_mean = stats::mean(y)?;
    let x_var = stats::variance(x)?;
    let cov = stats::covariance(x, y)?;

    if x_var < 1e-300 {
        return None; // zero variance in x
    }

    let slope = cov / x_var;
    let intercept = y_mean - slope * x_mean;

    // Fitted values and residuals
    let fitted: Vec<f64> = x.iter().map(|&xi| intercept + slope * xi).collect();
    let residuals: Vec<f64> = y
        .iter()
        .zip(fitted.iter())
        .map(|(&yi, &fi)| yi - fi)
        .collect();

    // Sum of squares
    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
    let ss_tot: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();

    let nf = n as f64;
    let df_res = nf - 2.0;

    let r_squared = if ss_tot > 1e-300 {
        1.0 - ss_res / ss_tot
    } else {
        1.0
    };
    let adjusted_r_squared = 1.0 - (1.0 - r_squared) * (nf - 1.0) / df_res;

    // Residual standard error
    let mse = ss_res / df_res;
    let residual_se = mse.sqrt();

    // Standard errors of coefficients
    let ss_x: f64 = x.iter().map(|&xi| (xi - x_mean).powi(2)).sum();
    let slope_se = (mse / ss_x).sqrt();
    let intercept_se = (mse * (1.0 / nf + x_mean * x_mean / ss_x)).sqrt();

    // t-statistics
    let slope_t = if slope_se > 1e-300 {
        slope / slope_se
    } else {
        f64::INFINITY
    };
    let intercept_t = if intercept_se > 1e-300 {
        intercept / intercept_se
    } else {
        f64::INFINITY
    };

    // p-values via t-distribution
    let slope_p = 2.0 * (1.0 - special::t_distribution_cdf(slope_t.abs(), df_res));
    let intercept_p = 2.0 * (1.0 - special::t_distribution_cdf(intercept_t.abs(), df_res));

    // F-statistic (= t² for simple regression)
    let f_statistic = slope_t * slope_t;
    let f_p_value = if f_statistic.is_infinite() {
        0.0
    } else {
        1.0 - special::f_distribution_cdf(f_statistic, 1.0, df_res)
    };

    Some(SimpleRegressionResult {
        slope,
        intercept,
        r_squared,
        adjusted_r_squared,
        slope_se,
        intercept_se,
        slope_t,
        intercept_t,
        slope_p,
        intercept_p,
        residual_se,
        f_statistic,
        f_p_value,
        residuals,
        fitted,
        n,
    })
}

// ---------------------------------------------------------------------------
// Multiple Linear Regression
// ---------------------------------------------------------------------------

/// Computes multiple linear regression via OLS (Cholesky solve).
///
/// # Arguments
///
/// * `predictors` — Slice of predictor variable slices. Each inner slice is
///   one predictor's observations. All must have the same length.
/// * `y` — Response variable observations.
///
/// # Algorithm
///
/// Constructs the design matrix X = [1 | x₁ | x₂ | ... | xₚ] (intercept column prepended),
/// then solves the normal equations X'Xβ = X'y via Cholesky decomposition.
///
/// # Returns
///
/// `None` if n < p+2, predictor lengths differ, inputs contain non-finite
/// values, or the system is singular.
///
/// # References
///
/// Draper & Smith (1998). "Applied Regression Analysis", 3rd edition.
/// Montgomery, Peck & Vining (2012). "Introduction to Linear Regression Analysis", 5th edition.
///
/// # Examples
///
/// ```
/// use u_analytics::regression::multiple_linear_regression;
///
/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
/// let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
/// let y  = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
/// let result = multiple_linear_regression(&[&x1, &x2], &y).unwrap();
/// assert!(result.r_squared > 0.95);
/// assert_eq!(result.coefficients.len(), 3); // intercept + 2 predictors
/// ```
pub fn multiple_linear_regression(
    predictors: &[&[f64]],
    y: &[f64],
) -> Option<MultipleRegressionResult> {
    let p = predictors.len(); // number of predictors
    let n = y.len();

    if p == 0 || n < p + 2 {
        return None;
    }

    // Validate lengths and finite values
    for pred in predictors {
        if pred.len() != n {
            return None;
        }
        if pred.iter().any(|v| !v.is_finite()) {
            return None;
        }
    }
    if y.iter().any(|v| !v.is_finite()) {
        return None;
    }

    let ncols = p + 1; // intercept + predictors

    // Build design matrix X (n × ncols, row-major)
    let mut x_data = Vec::with_capacity(n * ncols);
    for i in 0..n {
        x_data.push(1.0); // intercept
        for pred in predictors {
            x_data.push(pred[i]);
        }
    }
    let x_mat = Matrix::new(n, ncols, x_data).ok()?;

    // X'X (ncols × ncols)
    let xt = x_mat.transpose();
    let xtx = xt.mul_mat(&x_mat).ok()?;

    // X'y (ncols × 1)
    let xty = xt.mul_vec(y).ok()?;

    // Solve via Cholesky: β = (X'X)⁻¹ X'y
    let coefficients = xtx.cholesky_solve(&xty).ok()?;

    // Fitted values and residuals
    let fitted = x_mat.mul_vec(&coefficients).ok()?;
    let residuals: Vec<f64> = y
        .iter()
        .zip(fitted.iter())
        .map(|(&yi, &fi)| yi - fi)
        .collect();

    // R² and adjusted R²
    let y_mean = stats::mean(y)?;
    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
    let ss_tot: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();

    let nf = n as f64;
    let pf = p as f64;
    let df_res = nf - pf - 1.0;

    let r_squared = if ss_tot > 1e-300 {
        1.0 - ss_res / ss_tot
    } else {
        1.0
    };
    let adjusted_r_squared = 1.0 - (1.0 - r_squared) * (nf - 1.0) / df_res;

    // Residual standard error
    let mse = ss_res / df_res;
    let residual_se = mse.sqrt();

    // Standard errors: SE(β) = sqrt(diag((X'X)⁻¹) · MSE)
    let xtx_inv = xtx.inverse().ok()?;
    let mut std_errors = Vec::with_capacity(ncols);
    let mut t_statistics = Vec::with_capacity(ncols);
    let mut p_values = Vec::with_capacity(ncols);

    for (j, &coeff_j) in coefficients.iter().enumerate() {
        let se = (xtx_inv.get(j, j) * mse).sqrt();
        std_errors.push(se);
        let t = if se > 1e-300 {
            coeff_j / se
        } else {
            f64::INFINITY
        };
        t_statistics.push(t);
        let pv = 2.0 * (1.0 - special::t_distribution_cdf(t.abs(), df_res));
        p_values.push(pv);
    }

    // F-statistic: (SS_reg / p) / (SS_res / (n-p-1))
    let ss_reg = ss_tot - ss_res;
    let f_statistic = if pf > 0.0 && mse > 1e-300 {
        (ss_reg / pf) / mse
    } else {
        0.0
    };
    let f_p_value = if f_statistic.is_infinite() || f_statistic.is_nan() {
        0.0
    } else {
        1.0 - special::f_distribution_cdf(f_statistic, pf, df_res)
    };

    // VIF for each predictor: VIF_j = 1/(1 - R²_j) where R²_j is from
    // regressing x_j on all other predictors
    let vif = compute_vif(predictors);

    Some(MultipleRegressionResult {
        coefficients,
        std_errors,
        t_statistics,
        p_values,
        r_squared,
        adjusted_r_squared,
        f_statistic,
        f_p_value,
        residual_se,
        residuals,
        fitted,
        vif,
        n,
        p,
    })
}

/// Computes VIF for each predictor by regressing each on all others.
fn compute_vif(predictors: &[&[f64]]) -> Vec<f64> {
    let p = predictors.len();
    if p < 2 {
        return vec![1.0; p];
    }

    let mut vif = Vec::with_capacity(p);
    for j in 0..p {
        // Regress x_j on all other predictors
        let y_j = predictors[j];
        let others: Vec<&[f64]> = predictors
            .iter()
            .enumerate()
            .filter(|&(i, _)| i != j)
            .map(|(_, v)| *v)
            .collect();

        if let Some(result) = multiple_linear_regression(&others, y_j) {
            let r2 = result.r_squared;
            if r2 < 1.0 - 1e-15 {
                vif.push(1.0 / (1.0 - r2));
            } else {
                vif.push(f64::INFINITY); // perfect multicollinearity
            }
        } else {
            vif.push(f64::NAN);
        }
    }
    vif
}

/// Predicts y values given new x data and a simple regression result.
///
/// # Examples
///
/// ```
/// use u_analytics::regression::{simple_linear_regression, predict_simple};
///
/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
/// let model = simple_linear_regression(&x, &y).unwrap();
/// let pred = predict_simple(&model, &[6.0, 7.0]);
/// assert!((pred[0] - 12.0).abs() < 1e-10);
/// assert!((pred[1] - 14.0).abs() < 1e-10);
/// ```
pub fn predict_simple(model: &SimpleRegressionResult, x_new: &[f64]) -> Vec<f64> {
    x_new
        .iter()
        .map(|&xi| model.intercept + model.slope * xi)
        .collect()
}

/// Predicts y values given new predictor data and a multiple regression result.
///
/// # Arguments
///
/// * `model` — Multiple regression result.
/// * `predictors_new` — Slice of predictor slices (same order as training).
///
/// Returns `None` if predictor count doesn't match or lengths differ.
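///
/// # Examples
///
/// A usage sketch; the model is fit on y = 1 + 2·x₁ + 3·x₂, as in this
/// module's prediction tests:
///
/// ```
/// use u_analytics::regression::{multiple_linear_regression, predict_multiple};
///
/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
/// let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
/// let y: Vec<f64> = x1
///     .iter()
///     .zip(x2.iter())
///     .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
///     .collect();
/// let model = multiple_linear_regression(&[&x1, &x2], &y).unwrap();
///
/// let pred = predict_multiple(&model, &[&[11.0][..], &[6.0][..]]).unwrap();
/// assert!((pred[0] - 41.0).abs() < 1e-6); // 1 + 2·11 + 3·6 = 41
/// ```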
pub fn predict_multiple(
    model: &MultipleRegressionResult,
    predictors_new: &[&[f64]],
) -> Option<Vec<f64>> {
    if predictors_new.len() != model.p {
        return None;
    }
    let n = predictors_new.first()?.len();
    for pred in predictors_new {
        if pred.len() != n {
            return None;
        }
    }

    let mut result = Vec::with_capacity(n);
    for i in 0..n {
        let mut yi = model.coefficients[0]; // intercept
        for (j, pred) in predictors_new.iter().enumerate() {
            yi += model.coefficients[j + 1] * pred[i];
        }
        result.push(yi);
    }
    Some(result)
}

// ---------------------------------------------------------------------------
// Multicollinearity diagnostics (public)
// ---------------------------------------------------------------------------

/// Computes the Variance Inflation Factor for each predictor.
///
/// `VIF_j = 1 / (1 - R²_j)`, where `R²_j` is the coefficient of determination
/// from regressing predictor `j` on all other predictors. Standard thresholds
/// are `VIF > 5` (moderate multicollinearity) and `VIF > 10` (severe).
///
/// # Returns
///
/// `None` if `predictors` is empty, lengths differ, fewer than `p+1`
/// observations, or any predictor contains a non-finite value. With a single
/// predictor, returns `Some(vec![1.0])` since VIF is undefined for one
/// regressor.
///
/// # References
///
/// Marquardt (1970). "Generalized Inverses, Ridge Regression, Biased Linear Estimation, and Nonlinear Estimation".
/// Belsley, Kuh & Welsch (1980). "Regression Diagnostics: Identifying Influential Data and Sources of Collinearity".
///
/// # Examples
///
/// ```
/// use u_analytics::regression::vif;
///
/// let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
/// let x2 = [8.0, 1.0, 6.0, 3.0, 5.0, 7.0, 2.0, 4.0];
/// let v = vif(&[&x1, &x2]).unwrap();
/// assert!(v[0] < 2.0); // independent predictors
/// ```
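///
/// (Numerically) collinear predictors inflate the VIF well past the
/// thresholds above; a sketch mirroring this module's collinearity tests:
///
/// ```
/// use u_analytics::regression::vif;
///
/// let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
/// let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 0.001).collect();
/// let v = vif(&[&x1[..], &x2[..]]).unwrap();
/// assert!(v[0] > 50.0); // far beyond the "severe" threshold of 10
/// ```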
pub fn vif(predictors: &[&[f64]]) -> Option<Vec<f64>> {
    let p = predictors.len();
    if p == 0 {
        return None;
    }
    let n = predictors[0].len();
    if !predictors.iter().all(|c| c.len() == n) {
        return None;
    }
    if n <= p {
        return None;
    }
    if predictors
        .iter()
        .any(|c| c.iter().any(|v| !v.is_finite()))
    {
        return None;
    }
    Some(compute_vif(predictors))
}

/// Computes the 2-norm condition number of the sample covariance matrix.
///
/// `cond(Σ) = λ_max / λ_min`, where `λ` are the eigenvalues of the unbiased
/// sample covariance matrix of the (column-major) predictors. Standard
/// threshold: `cond > 30` indicates multicollinearity worth investigating;
/// `cond > 100` is severe (Belsley 1991).
///
/// # Returns
///
/// `None` for empty input, length mismatch, fewer than `p+1` observations,
/// non-finite values, or eigenvalue decomposition failure. Returns
/// `f64::INFINITY` when the smallest eigenvalue is below `1e-15`
/// (numerically singular).
///
/// # References
///
/// Belsley (1991). "Conditioning Diagnostics: Collinearity and Weak Data in Regression".
///
/// # Examples
///
/// ```
/// use u_analytics::regression::condition_number;
///
/// let x1 = [1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
/// let x2 = [1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0];
/// let c = condition_number(&[&x1, &x2]).unwrap();
/// assert!(c < 5.0); // orthogonal, equal-variance columns: cond ≈ 1
/// ```
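///
/// Collinear predictors drive the condition number up; a sketch mirroring
/// this module's collinearity tests:
///
/// ```
/// use u_analytics::regression::condition_number;
///
/// let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
/// let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 1e-6).collect();
/// let c = condition_number(&[&x1[..], &x2[..]]).unwrap();
/// assert!(c > 1e5 || c.is_infinite()); // numerically singular covariance
/// ```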
pub fn condition_number(predictors: &[&[f64]]) -> Option<f64> {
    let p = predictors.len();
    if p == 0 {
        return None;
    }
    let n = predictors[0].len();
    if !predictors.iter().all(|c| c.len() == n) {
        return None;
    }
    if n <= p {
        return None;
    }
    if predictors
        .iter()
        .any(|c| c.iter().any(|v| !v.is_finite()))
    {
        return None;
    }

    // Sample covariance matrix (unbiased, denominator n-1)
    let mut means = vec![0.0_f64; p];
    for (j, col) in predictors.iter().enumerate() {
        means[j] = col.iter().sum::<f64>() / n as f64;
    }
    let mut cov = vec![0.0_f64; p * p];
    let denom = (n - 1) as f64;
    for r in 0..p {
        let col_r = predictors[r];
        let mean_r = means[r];
        for c in r..p {
            let col_c = predictors[c];
            let mean_c = means[c];
            let sum: f64 = col_r
                .iter()
                .zip(col_c.iter())
                .map(|(&a, &b)| (a - mean_r) * (b - mean_c))
                .sum();
            let v = sum / denom;
            cov[r * p + c] = v;
            cov[c * p + r] = v;
        }
    }

    let mat = Matrix::new(p, p, cov).ok()?;
    let (eigenvalues, _eigenvectors) = mat.eigen_symmetric().ok()?;
    let lambda_max = eigenvalues
        .iter()
        .copied()
        .fold(f64::NEG_INFINITY, f64::max);
    let lambda_min = eigenvalues.iter().copied().fold(f64::INFINITY, f64::min);

    if !lambda_max.is_finite() || lambda_max <= 0.0 {
        return None;
    }
    if lambda_min < 1e-15 {
        return Some(f64::INFINITY);
    }
    Some(lambda_max / lambda_min)
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Simple regression tests
    // -----------------------------------------------------------------------

    #[test]
    fn simple_perfect_fit() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [3.0, 5.0, 7.0, 9.0, 11.0]; // y = 1 + 2x
        let r = simple_linear_regression(&x, &y).expect("should compute");
        assert!((r.slope - 2.0).abs() < 1e-10);
        assert!((r.intercept - 1.0).abs() < 1e-10);
        assert!((r.r_squared - 1.0).abs() < 1e-10);
    }

    #[test]
    fn simple_with_noise() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.1, 3.9, 6.1, 7.9, 10.1]; // y ≈ 2x + 0.1
        let r = simple_linear_regression(&x, &y).expect("should compute");
        assert!((r.slope - 2.0).abs() < 0.1);
        assert!(r.r_squared > 0.99);
        assert_eq!(r.residuals.len(), 5);
        assert_eq!(r.fitted.len(), 5);
    }

    #[test]
    fn simple_residuals_sum_to_zero() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
        let r = simple_linear_regression(&x, &y).expect("should compute");
        let sum: f64 = r.residuals.iter().sum();
        assert!(sum.abs() < 1e-10, "residuals sum = {sum}");
    }

    #[test]
    fn simple_f_equals_t_squared() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
        let r = simple_linear_regression(&x, &y).expect("should compute");
        assert!(
            (r.f_statistic - r.slope_t * r.slope_t).abs() < 1e-8,
            "F = {}, t² = {}",
            r.f_statistic,
            r.slope_t * r.slope_t
        );
    }

    #[test]
    fn simple_significant_slope() {
        let x: Vec<f64> = (0..20).map(|i| i as f64).collect();
        let y: Vec<f64> = x.iter().map(|&xi| 3.0 + 2.0 * xi).collect();
        let r = simple_linear_regression(&x, &y).expect("should compute");
        assert!(r.slope_p < 1e-10, "slope p = {}", r.slope_p);
        assert!(r.f_p_value < 1e-10, "F p = {}", r.f_p_value);
    }

    #[test]
    fn simple_negative_slope() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [10.0, 8.0, 6.0, 4.0, 2.0]; // y = 12 - 2x
        let r = simple_linear_regression(&x, &y).expect("should compute");
        assert!((r.slope + 2.0).abs() < 1e-10);
        assert!((r.intercept - 12.0).abs() < 1e-10);
    }

    #[test]
    fn simple_edge_cases() {
        assert!(simple_linear_regression(&[1.0, 2.0], &[3.0, 4.0]).is_none()); // n < 3
        assert!(simple_linear_regression(&[1.0, 2.0, 3.0], &[4.0, 5.0]).is_none()); // mismatch
        assert!(simple_linear_regression(&[5.0, 5.0, 5.0], &[1.0, 2.0, 3.0]).is_none()); // zero var
        assert!(simple_linear_regression(&[1.0, f64::NAN, 3.0], &[4.0, 5.0, 6.0]).is_none());
    }

    /// Verifies exact OLS numeric reference from the spec.
    ///
    /// x = [1,2,3,4,5], y = [2,4,5,4,5]
    /// x̄=3, ȳ=4
    /// Σ(xᵢ-x̄)(yᵢ-ȳ) = 6, Σ(xᵢ-x̄)² = 10
    /// β̂₁ = 6/10 = 0.6, β̂₀ = 4 - 0.6·3 = 2.2
    /// SS_tot = 6, SS_res = 2.4, R² = 1 - 2.4/6 = 0.6
    ///
    /// Reference: Draper & Smith (1998), Applied Regression Analysis, 3rd ed.
    #[test]
    fn simple_numeric_reference_ols() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.0, 4.0, 5.0, 4.0, 5.0];
        let r = simple_linear_regression(&x, &y).expect("should compute");

        assert!(
            (r.slope - 0.6).abs() < 1e-10,
            "β̂₁ expected 0.6, got {}",
            r.slope
        );
        assert!(
            (r.intercept - 2.2).abs() < 1e-10,
            "β̂₀ expected 2.2, got {}",
            r.intercept
        );
        assert!(
            (r.r_squared - 0.6).abs() < 1e-3,
            "R² expected 0.6, got {}",
            r.r_squared
        );

        // Verify fitted values
        // ŷ₁=2.8, ŷ₂=3.4, ŷ₃=4.0, ŷ₄=4.6, ŷ₅=5.2
        let expected_fitted = [2.8, 3.4, 4.0, 4.6, 5.2];
        for (i, (&fi, &ef)) in r.fitted.iter().zip(expected_fitted.iter()).enumerate() {
            assert!(
                (fi - ef).abs() < 1e-10,
                "ŷ_{} expected {}, got {}",
                i + 1,
                ef,
                fi
            );
        }
    }

    #[test]
    fn simple_adjusted_r_squared() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.1, 3.9, 6.1, 7.9, 10.1];
        let r = simple_linear_regression(&x, &y).expect("should compute");
        // Adjusted R² never exceeds R² and stays close to it for a good fit
        assert!(r.adjusted_r_squared <= r.r_squared);
        assert!(r.adjusted_r_squared > 0.98);
    }

    // -----------------------------------------------------------------------
    // Multiple regression tests
    // -----------------------------------------------------------------------

    #[test]
    fn multiple_perfect_fit() {
        // y = 1 + 2*x1 + 3*x2
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
        let y: Vec<f64> = x1
            .iter()
            .zip(x2.iter())
            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
            .collect();
        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");

        assert!(
            (r.coefficients[0] - 1.0).abs() < 1e-8,
            "β₀ = {}",
            r.coefficients[0]
        );
        assert!(
            (r.coefficients[1] - 2.0).abs() < 1e-8,
            "β₁ = {}",
            r.coefficients[1]
        );
        assert!(
            (r.coefficients[2] - 3.0).abs() < 1e-8,
            "β₂ = {}",
            r.coefficients[2]
        );
        assert!((r.r_squared - 1.0).abs() < 1e-8);
    }

    #[test]
    fn multiple_with_noise() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
        let y = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
        assert!(r.r_squared > 0.95);
        assert_eq!(r.coefficients.len(), 3);
        assert_eq!(r.std_errors.len(), 3);
        assert_eq!(r.residuals.len(), 8);
        assert_eq!(r.vif.len(), 2);
    }

    #[test]
    fn multiple_residuals_sum_to_zero() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0];
        let y = [5.1, 5.0, 9.2, 8.9, 13.1, 12.0, 17.2, 15.9];
        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
        let sum: f64 = r.residuals.iter().sum();
        assert!(sum.abs() < 1e-8, "residuals sum = {sum}");
    }

    #[test]
    fn multiple_single_predictor_matches_simple() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let y: Vec<f64> = x
            .iter()
            .map(|&xi| 3.0 + 2.5 * xi + 0.1 * (xi - 5.0))
            .collect();

        let simple = simple_linear_regression(&x, &y).expect("simple");
        let multi = multiple_linear_regression(&[&x], &y).expect("multiple");

        assert!(
            (simple.slope - multi.coefficients[1]).abs() < 1e-8,
            "slope: {} vs {}",
            simple.slope,
            multi.coefficients[1]
        );
        assert!(
            (simple.intercept - multi.coefficients[0]).abs() < 1e-8,
            "intercept: {} vs {}",
            simple.intercept,
            multi.coefficients[0]
        );
        assert!((simple.r_squared - multi.r_squared).abs() < 1e-8);
    }

    #[test]
    fn multiple_edge_cases() {
        let x1 = [1.0, 2.0];
        let y = [3.0, 4.0];
        assert!(multiple_linear_regression(&[&x1], &y).is_none()); // n < p+2

        let x2 = [1.0, 2.0, 3.0];
        let y2 = [4.0, 5.0];
        assert!(multiple_linear_regression(&[&x2], &y2).is_none()); // length mismatch
    }

    #[test]
    fn multiple_vif_independent_predictors() {
        // Independent predictors → VIF ≈ 1
        let x1 = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0];
        let x2 = [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let y: Vec<f64> = x1
            .iter()
            .zip(x2.iter())
            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
            .collect();
        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
        for (i, &v) in r.vif.iter().enumerate() {
            assert!(
                v < 2.0,
                "VIF[{i}] = {v}, expected near 1.0 for independent predictors"
            );
        }
    }

    #[test]
    fn multiple_vif_correlated_predictors() {
        // Highly but not perfectly correlated predictors → higher VIF
        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
        // Add small perturbation to break perfect collinearity
        let noise = [
            0.1, -0.2, 0.3, -0.1, 0.2, -0.3, 0.1, -0.1, 0.2, -0.2, 0.3, -0.1, 0.1, -0.2, 0.3, -0.3,
            0.1, -0.1, 0.2, -0.2,
        ];
        let x2: Vec<f64> = x1
            .iter()
            .zip(noise.iter())
            .map(|(&v, &n)| v * 0.9 + 1.0 + n)
            .collect();
        let y: Vec<f64> = x1
            .iter()
            .zip(x2.iter())
            .map(|(&a, &b)| 1.0 + a + b)
            .collect();
        let r = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");
        // VIF > 5 for highly correlated predictors
        assert!(
            r.vif[0] > 5.0,
            "VIF[0] = {}, expected > 5.0 for correlated predictors",
            r.vif[0]
        );
    }

    // -----------------------------------------------------------------------
    // Prediction tests
    // -----------------------------------------------------------------------

    #[test]
    fn predict_simple_basic() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [3.0, 5.0, 7.0, 9.0, 11.0]; // y = 1 + 2x
        let model = simple_linear_regression(&x, &y).expect("should compute");
        let pred = predict_simple(&model, &[6.0, 7.0]);
        assert!((pred[0] - 13.0).abs() < 1e-10);
        assert!((pred[1] - 15.0).abs() < 1e-10);
    }

    #[test]
    fn predict_multiple_basic() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
        let y: Vec<f64> = x1
            .iter()
            .zip(x2.iter())
            .map(|(&a, &b)| 1.0 + 2.0 * a + 3.0 * b)
            .collect();
        let model = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");

        let new_x1 = [11.0];
        let new_x2 = [6.0];
        let pred = predict_multiple(&model, &[&new_x1, &new_x2]).expect("should predict");
        let expected = 1.0 + 2.0 * 11.0 + 3.0 * 6.0;
        assert!(
            (pred[0] - expected).abs() < 1e-6,
            "pred = {}, expected = {}",
            pred[0],
            expected
        );
    }

    // -----------------------------------------------------------------------
    // Multicollinearity diagnostics — public vif() / condition_number()
    // -----------------------------------------------------------------------

    #[test]
    fn vif_pub_independent_predictors() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let x2 = [8.0, 1.0, 6.0, 3.0, 5.0, 7.0, 2.0, 4.0];
        let v = vif(&[&x1, &x2]).expect("should compute");
        for r in &v {
            assert!(*r < 2.0, "VIF should be near 1.0 for independent, got {r}");
        }
    }

    #[test]
    fn vif_pub_collinear_predictors() {
        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
        let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 0.001).collect();
        let v = vif(&[&x1[..], &x2[..]]).expect("should compute");
        assert!(
            v[0] > 50.0,
            "VIF should explode under near-collinearity, got {}",
            v[0]
        );
    }

    #[test]
    fn vif_pub_rejects_short_input() {
        let x = [1.0, 2.0];
        assert!(vif(&[&x[..], &x[..]]).is_none());
    }

    #[test]
    fn vif_pub_rejects_non_finite() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let x2 = [1.0, 2.0, f64::NAN, 4.0, 5.0, 6.0];
        assert!(vif(&[&x1[..], &x2[..]]).is_none());
    }

    #[test]
    fn condition_number_orthogonal() {
        let x1 = [1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
        let x2 = [1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0];
        let c = condition_number(&[&x1[..], &x2[..]]).expect("should compute");
        assert!(c < 5.0, "orthogonal columns: cond should be ≈ 1, got {c}");
    }

    #[test]
    fn condition_number_collinear_explodes() {
        let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
        let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 1e-6).collect();
        let c = condition_number(&[&x1[..], &x2[..]]).expect("should compute");
        assert!(
            c > 1e5 || c.is_infinite(),
            "collinear: cond should be huge, got {c}"
        );
    }

    #[test]
    fn condition_number_rejects_short_input() {
        let x = [1.0, 2.0];
        assert!(condition_number(&[&x[..], &x[..]]).is_none());
    }

    #[test]
    fn predict_multiple_wrong_predictors() {
        let x1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let x2 = [2.0, 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
        let y: Vec<f64> = x1.iter().zip(x2.iter()).map(|(&a, &b)| a + b).collect();
        let model = multiple_linear_regression(&[&x1, &x2], &y).expect("should compute");

        // Wrong number of predictors
        assert!(predict_multiple(&model, &[&[1.0]]).is_none());
    }
}

#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn simple_r_squared_bounded(
            data in proptest::collection::vec(-1e3_f64..1e3, 5..=30)
                .prop_flat_map(|x| {
                    let n = x.len();
                    (Just(x), proptest::collection::vec(-1e3_f64..1e3, n..=n))
                })
        ) {
            let (x, y) = data;
            if let Some(r) = simple_linear_regression(&x, &y) {
                prop_assert!(r.r_squared >= -0.01 && r.r_squared <= 1.01,
                    "R² = {}", r.r_squared);
            }
        }

        #[test]
        fn simple_residuals_orthogonal_to_x(
            data in proptest::collection::vec(-1e3_f64..1e3, 5..=30)
                .prop_flat_map(|x| {
                    let n = x.len();
                    (Just(x), proptest::collection::vec(-1e3_f64..1e3, n..=n))
                })
        ) {
            let (x, y) = data;
            if let Some(r) = simple_linear_regression(&x, &y) {
                // Σ(xᵢ · eᵢ) should be near zero (OLS normal equation)
                let dot: f64 = x.iter().zip(r.residuals.iter()).map(|(&xi, &ei)| xi * ei).sum();
                let norm = r.residuals.iter().map(|e| e * e).sum::<f64>().sqrt();
                let x_norm = x.iter().map(|xi| xi * xi).sum::<f64>().sqrt();
                if norm > 1e-10 && x_norm > 1e-10 {
                    prop_assert!((dot / (norm * x_norm)).abs() < 1e-6,
                        "residuals not orthogonal to x: dot={dot}");
                }
            }
        }

        #[test]
        fn multiple_r_squared_bounded(
            x1 in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
            x2_seed in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
            y_seed in proptest::collection::vec(-1e3_f64..1e3, 8..=20),
        ) {
            let n = x1.len().min(x2_seed.len()).min(y_seed.len());
            let x2 = &x2_seed[..n];
            let y = &y_seed[..n];
            let x1 = &x1[..n];
            if let Some(r) = multiple_linear_regression(&[x1, x2], y) {
                prop_assert!(r.r_squared >= -0.01 && r.r_squared <= 1.01,
                    "R² = {}", r.r_squared);
            }
        }
    }
}