linreg_core/core.rs
//! Core OLS regression implementation.
//!
//! This module provides the main Ordinary Least Squares regression functionality
//! that can be used directly in Rust code. Functions accept native Rust slices
//! and return Result types for proper error handling.
//!
//! # Example
//!
//! ```
//! # use linreg_core::core::ols_regression;
//! # use linreg_core::Error;
//! let y = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
//! let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
//! let x2 = vec![2.0, 3.0, 3.5, 4.0, 4.5, 5.0];
//! let names = vec![
//!     "Intercept".to_string(),
//!     "X1".to_string(),
//!     "X2".to_string(),
//! ];
//!
//! let result = ols_regression(&y, &[x1, x2], &names)?;
//! # Ok::<(), Error>(())
//! ```

use crate::distributions::{fisher_snedecor_cdf, student_t_cdf, student_t_inverse_cdf};
use crate::error::{Error, Result};
use crate::linalg::{vec_dot, vec_mean, vec_sub, Matrix};
use serde::Serialize;

// ============================================================================
// Numerical Constants
// ============================================================================

/// Minimum threshold for the standardized-residual denominator to avoid division by zero.
/// When (1 - leverage) is very small, the observation has extremely high leverage
/// and standardized residuals may be unreliable.
const MIN_LEVERAGE_DENOM: f64 = 1e-10;

// ============================================================================
// Result Types
// ============================================================================
//
// Structs containing the output of regression computations.

/// Result of VIF (Variance Inflation Factor) calculation.
///
/// VIF measures how much the variance of an estimated regression coefficient
/// increases due to multicollinearity among the predictors.
///
/// # Fields
///
/// * `variable` - Name of the predictor variable
/// * `vif` - Variance Inflation Factor (VIF > 10 indicates high multicollinearity)
/// * `rsquared` - R-squared from regressing this predictor on all others
/// * `interpretation` - Human-readable interpretation of the VIF value
///
/// # Example
///
/// ```
/// # use linreg_core::core::VifResult;
/// let vif = VifResult {
///     variable: "X1".to_string(),
///     vif: 2.5,
///     rsquared: 0.6,
///     interpretation: "Low multicollinearity".to_string(),
/// };
/// assert_eq!(vif.variable, "X1");
/// ```
#[derive(Debug, Clone, Serialize)]
pub struct VifResult {
    /// Name of the predictor variable
    pub variable: String,
    /// Variance Inflation Factor (VIF > 10 indicates high multicollinearity)
    pub vif: f64,
    /// R-squared from regressing this predictor on all others
    pub rsquared: f64,
    /// Human-readable interpretation of the VIF value
    pub interpretation: String,
}

/// Complete output from OLS regression.
///
/// Contains all coefficients, statistics, diagnostics, and residuals from
/// an Ordinary Least Squares regression.
///
/// # Example
///
/// ```
/// # use linreg_core::core::ols_regression;
/// let y = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let names = vec!["Intercept".to_string(), "X1".to_string()];
///
/// let result = ols_regression(&y, &[x1], &names).unwrap();
/// assert!(result.r_squared > 0.0);
/// assert_eq!(result.coefficients.len(), 2); // intercept + 1 predictor
/// ```
#[derive(Debug, Clone, Serialize)]
pub struct RegressionOutput {
    /// Regression coefficients (including intercept)
    pub coefficients: Vec<f64>,
    /// Standard errors of coefficients
    pub std_errors: Vec<f64>,
    /// t-statistics for coefficient significance tests
    pub t_stats: Vec<f64>,
    /// Two-tailed p-values for coefficients
    pub p_values: Vec<f64>,
    /// Lower bounds of 95% confidence intervals
    pub conf_int_lower: Vec<f64>,
    /// Upper bounds of 95% confidence intervals
    pub conf_int_upper: Vec<f64>,
    /// R-squared (coefficient of determination)
    pub r_squared: f64,
    /// Adjusted R-squared (accounts for number of predictors)
    pub adj_r_squared: f64,
    /// F-statistic for overall model significance
    pub f_statistic: f64,
    /// P-value for F-statistic
    pub f_p_value: f64,
    /// Mean squared error of residuals
    pub mse: f64,
    /// Root mean squared error (prediction error in original units)
    pub rmse: f64,
    /// Mean absolute error of residuals
    pub mae: f64,
    /// Standard error of the regression (residual standard deviation)
    pub std_error: f64,
    /// Raw residuals (observed - predicted)
    pub residuals: Vec<f64>,
    /// Standardized residuals (accounting for leverage)
    pub standardized_residuals: Vec<f64>,
    /// Fitted/predicted values
    pub predictions: Vec<f64>,
    /// Leverage values for each observation (diagonal of hat matrix)
    pub leverage: Vec<f64>,
    /// Variance Inflation Factors for detecting multicollinearity
    pub vif: Vec<VifResult>,
    /// Number of observations
    pub n: usize,
    /// Number of predictor variables (excluding intercept)
    pub k: usize,
    /// Degrees of freedom for residuals (n - k - 1)
    pub df: usize,
    /// Names of variables (including intercept)
    pub variable_names: Vec<String>,
}

// ============================================================================
// Statistical Helper Functions
// ============================================================================
//
// Utility functions for computing p-values, critical values, and leverage.

/// Computes a two-tailed p-value from a t-statistic.
///
/// Uses the Student's t-distribution CDF to calculate the probability
/// of observing a t-statistic as extreme as the one provided.
///
/// # Arguments
///
/// * `t` - The t-statistic value
/// * `df` - Degrees of freedom
///
/// # Example
///
/// ```
/// # use linreg_core::core::two_tailed_p_value;
/// let p = two_tailed_p_value(2.0, 20.0);
/// assert!(p > 0.0 && p < 0.1);
/// ```
pub fn two_tailed_p_value(t: f64, df: f64) -> f64 {
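    // For very large |t| the CDF saturates to 1 within double precision, so
    // the two-tailed p-value is effectively zero.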
    if t.abs() > 100.0 {
        return 0.0;
    }

    let cdf = student_t_cdf(t, df);
    if t >= 0.0 {
        2.0 * (1.0 - cdf)
    } else {
        2.0 * cdf
    }
}

/// Computes the critical t-value for a given significance level and degrees of freedom.
///
/// Returns the t-value such that the area under the t-distribution curve
/// to the right of it equals alpha/2 (two-tailed test).
///
/// # Arguments
///
/// * `df` - Degrees of freedom
/// * `alpha` - Significance level (typically 0.05 for 95% confidence)
///
/// # Example
///
/// ```
/// # use linreg_core::core::t_critical_quantile;
/// let t_crit = t_critical_quantile(20.0, 0.05);
/// assert!(t_crit > 2.0); // approximately 2.086 for df=20, alpha=0.05
/// ```
pub fn t_critical_quantile(df: f64, alpha: f64) -> f64 {
    let p = 1.0 - alpha / 2.0;
    student_t_inverse_cdf(p, df)
}

/// Computes a p-value from an F-statistic.
///
/// Uses the F-distribution CDF to calculate the probability of observing
/// an F-statistic as extreme as the one provided.
///
/// # Arguments
///
/// * `f_stat` - The F-statistic value
/// * `df1` - Numerator degrees of freedom
/// * `df2` - Denominator degrees of freedom
///
/// # Example
///
/// ```
/// # use linreg_core::core::f_p_value;
/// let p = f_p_value(5.0, 2.0, 20.0);
/// assert!(p > 0.0 && p < 0.05);
/// ```
pub fn f_p_value(f_stat: f64, df1: f64, df2: f64) -> f64 {
    if f_stat <= 0.0 {
        return 1.0;
    }
    1.0 - fisher_snedecor_cdf(f_stat, df1, df2)
}

/// Computes leverage values from the design matrix and its inverse.
///
/// Leverage measures how far an observation's predictor values are from
/// the center of the predictor space. High leverage points can have
/// disproportionate influence on the regression results.
///
/// # Arguments
///
/// * `x` - Design matrix (including intercept column)
/// * `xtx_inv` - Inverse of X'X matrix
///
/// # Example
///
/// ```
/// # use linreg_core::core::compute_leverage;
/// # use linreg_core::linalg::Matrix;
/// // Design matrix with intercept: [[1, 1], [1, 2], [1, 3]]
/// let x = Matrix::new(3, 2, vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0]);
/// let xtx = x.transpose().matmul(&x);
/// let xtx_inv = xtx.invert().unwrap();
///
/// let leverage = compute_leverage(&x, &xtx_inv);
/// assert_eq!(leverage.len(), 3);
/// // Leverage values should sum to the number of parameters (2)
/// assert!((leverage.iter().sum::<f64>() - 2.0).abs() < 0.01);
/// ```
#[allow(clippy::needless_range_loop)]
pub fn compute_leverage(x: &Matrix, xtx_inv: &Matrix) -> Vec<f64> {
    let n = x.rows;
    let mut leverage = vec![0.0; n];
    for i in 0..n {
        // h_i = x_i * (X'X)^{-1} * x_i^T, where x_i is row i of the design
        // matrix: first temp = x_i * (X'X)^{-1}, then h_i = temp . x_i.

        // Manual row extraction and multiplication
        let mut row_vec = vec![0.0; x.cols];
        for j in 0..x.cols {
            row_vec[j] = x.get(i, j);
        }

        let mut temp_vec = vec![0.0; x.cols];
        for c in 0..x.cols {
            let mut sum = 0.0;
            for k in 0..x.cols {
                sum += row_vec[k] * xtx_inv.get(k, c);
            }
            temp_vec[c] = sum;
        }

        leverage[i] = vec_dot(&temp_vec, &row_vec);
    }
    leverage
}

// ============================================================================
// VIF Calculation
// ============================================================================
//
// Variance Inflation Factor analysis for detecting multicollinearity.

/// Calculates Variance Inflation Factors for all predictors.
///
/// VIF quantifies the severity of multicollinearity in a regression analysis.
/// A VIF > 10 indicates high multicollinearity that may need to be addressed.
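/// Formally, VIF_j = 1 / (1 - R²_j), where R²_j is the R-squared obtained by
/// regressing predictor j on the remaining predictors.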
///
/// # Arguments
///
/// * `x_vars` - Predictor variables (each of length n)
/// * `names` - Variable names (including intercept as first element)
/// * `n` - Number of observations
///
/// # Returns
///
/// Vector of VIF results for each predictor (excluding intercept).
///
/// # Example
///
/// ```
/// # use linreg_core::core::calculate_vif;
/// let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let x2 = vec![2.0, 4.0, 6.0, 8.0, 10.0];
/// let names = vec!["Intercept".to_string(), "X1".to_string(), "X2".to_string()];
///
/// let vif_results = calculate_vif(&[x1, x2], &names, 5);
/// assert_eq!(vif_results.len(), 2);
/// // Perfectly collinear variables will have VIF = infinity
/// ```
pub fn calculate_vif(x_vars: &[Vec<f64>], names: &[String], n: usize) -> Vec<VifResult> {
    let k = x_vars.len();
    if k <= 1 {
        return vec![];
    }

    // Standardize predictors (Z-score)
    let mut z_data = vec![0.0; n * k];

    for (j, var) in x_vars.iter().enumerate() {
        let mean = vec_mean(var);
        let variance = var.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / ((n - 1) as f64);
        let std_dev = variance.sqrt();

        // A constant predictor has zero variance, so its correlations with
        // the others are undefined; flag every predictor and bail out.
        if std_dev.abs() < 1e-10 {
            return names
                .iter()
                .skip(1)
                .map(|name| VifResult {
                    variable: name.clone(),
                    vif: f64::INFINITY,
                    rsquared: 1.0,
                    interpretation: "Constant variable (undefined correlation)".to_string(),
                })
                .collect();
        }

        for i in 0..n {
            z_data[i * k + j] = (var[i] - mean) / std_dev;
        }
    }

    let z = Matrix::new(n, k, z_data);

    // Correlation matrix R = (1/(n-1)) * Z^T * Z
    let z_t = z.transpose();
    let zt_z = z_t.matmul(&z);

    // Scale by 1/(n-1) in place
    let mut r_corr = zt_z;
    let factor = 1.0 / ((n - 1) as f64);
    for val in &mut r_corr.data {
        *val *= factor;
    }

    // Invert the correlation matrix via QR: A = QR  =>  A^{-1} = R^{-1} Q^T.
    // (Cholesky would be cheaper since the matrix is symmetric positive
    // definite, but QR reuses the upper-triangular inversion we already have.)
    let (q_corr, r_corr_tri) = r_corr.qr();

    let r_inv_opt = r_corr_tri.invert_upper_triangular();

    let r_inv = match r_inv_opt {
        Some(inv) => inv.matmul(&q_corr.transpose()),
        None => {
            return names
                .iter()
                .skip(1)
                .map(|name| VifResult {
                    variable: name.clone(),
                    vif: f64::INFINITY,
                    rsquared: 1.0,
                    interpretation: "Perfect multicollinearity (singular matrix)".to_string(),
                })
                .collect();
        },
    };

    // VIF_j is the j-th diagonal element of the inverse correlation matrix.
    let mut results = vec![];
    for j in 0..k {
        let vif = r_inv.get(j, j);
        let vif = if vif < 1.0 { 1.0 } else { vif };
        let rsquared = 1.0 - 1.0 / vif;

        let interpretation = if vif.is_infinite() {
            "Perfect multicollinearity".to_string()
        } else if vif > 10.0 {
            "High multicollinearity - consider removing".to_string()
        } else if vif > 5.0 {
            "Moderate multicollinearity".to_string()
        } else {
            "Low multicollinearity".to_string()
        };

        results.push(VifResult {
            variable: names[j + 1].clone(),
            vif,
            rsquared,
            interpretation,
        });
    }

    results
}

// ============================================================================
// OLS Regression
// ============================================================================
//
// Ordinary Least Squares regression implementation using QR decomposition.

/// Performs Ordinary Least Squares regression using QR decomposition.
///
/// Uses a numerically stable QR decomposition approach to solve the normal
/// equations. Returns comprehensive statistics including coefficients,
/// standard errors, t-statistics, p-values, and diagnostic measures.
///
/// # Arguments
///
/// * `y` - Response variable (n observations)
/// * `x_vars` - Predictor variables (each of length n)
/// * `variable_names` - Names for variables (including intercept)
///
/// # Returns
///
/// A [`RegressionOutput`] containing all regression statistics and diagnostics.
///
/// # Errors
///
/// Returns [`Error::InsufficientData`] if n ≤ k + 1.
/// Returns [`Error::SingularMatrix`] if perfect multicollinearity exists.
/// Returns [`Error::InvalidInput`] if coefficients are NaN.
///
/// # Example
///
/// ```
/// # use linreg_core::core::ols_regression;
/// # use linreg_core::Error;
/// let y = vec![2.5, 3.7, 4.2, 5.1, 6.3, 7.0];
/// let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
/// let x2 = vec![2.0, 4.0, 5.0, 4.0, 3.0, 2.0];
/// let names = vec![
///     "Intercept".to_string(),
///     "Temperature".to_string(),
///     "Pressure".to_string(),
/// ];
///
/// let result = ols_regression(&y, &[x1, x2], &names)?;
/// println!("R-squared: {}", result.r_squared);
/// # Ok::<(), Error>(())
/// ```
#[allow(clippy::needless_range_loop)]
pub fn ols_regression(
    y: &[f64],
    x_vars: &[Vec<f64>],
    variable_names: &[String],
) -> Result<RegressionOutput> {
    let n = y.len();
    let k = x_vars.len();
    let p = k + 1;

    // Validate inputs
    if n <= k + 1 {
        return Err(Error::InsufficientData {
            required: k + 2,
            available: n,
        });
    }

    // Validate dimensions and finite values using shared helper
    crate::diagnostics::validate_regression_data(y, x_vars)?;

    // Prepare variable names, generating placeholders if too few were given
    let mut names = variable_names.to_vec();
    while names.len() <= k {
        names.push(format!("X{}", names.len()));
    }

    // Create design matrix with an intercept column of ones
    let mut x_data = vec![0.0; n * p];
    for row in 0..n {
        x_data[row * p] = 1.0; // intercept
        for (col, x_var) in x_vars.iter().enumerate() {
            x_data[row * p + col + 1] = x_var[row];
        }
    }

    let x_matrix = Matrix::new(n, p, x_data);

    // QR decomposition
    let (q, r) = x_matrix.qr();

    // Solve R * beta = Q^T * y:
    // extract the upper p x p part of R
    let mut r_upper = Matrix::zeros(p, p);
    for i in 0..p {
        for j in 0..p {
            r_upper.set(i, j, r.get(i, j));
        }
    }

    // Q^T * y
    let q_t = q.transpose();
    let qty = q_t.mul_vec(y);

    // Take the first p elements of qty
    let rhs_vec = qty[0..p].to_vec();
    let rhs_mat = Matrix::new(p, 1, rhs_vec); // column vector

    // Invert R_upper
    let r_inv = match r_upper.invert_upper_triangular() {
        Some(inv) => inv,
        None => return Err(Error::SingularMatrix),
    };

    let beta_mat = r_inv.matmul(&rhs_mat);
    let beta = beta_mat.data;

    // Validate coefficients
    if beta.iter().any(|&b| b.is_nan()) {
        return Err(Error::InvalidInput("Coefficients contain NaN".to_string()));
    }

    // Compute predictions and residuals
    let predictions = x_matrix.mul_vec(&beta);
    let residuals = vec_sub(y, &predictions);

    // Compute sums of squares
    let y_mean = vec_mean(y);
    let ss_total: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
    let ss_residual: f64 = residuals.iter().map(|&r| r.powi(2)).sum();
    let ss_regression = ss_total - ss_residual;

    // R-squared and adjusted R-squared
    let r_squared = if ss_total == 0.0 {
        f64::NAN
    } else {
        1.0 - ss_residual / ss_total
    };

    let adj_r_squared = if ss_total == 0.0 {
        f64::NAN
    } else {
        1.0 - (1.0 - r_squared) * ((n - 1) as f64 / (n - k - 1) as f64)
    };

    // Mean squared error and standard error
    let df = n - k - 1;
    let mse = ss_residual / df as f64;
    let std_error = mse.sqrt();

    // Standard errors using (X'X)^-1 = R^-1 * (R^-1)^T
    let xtx_inv = r_inv.matmul(&r_inv.transpose());

    let mut std_errors = vec![0.0; k + 1];
    for i in 0..=k {
        std_errors[i] = (xtx_inv.get(i, i) * mse).sqrt();
        if std_errors[i].is_nan() {
            return Err(Error::InvalidInput(format!(
                "Standard error for coefficient {} is NaN",
                i
            )));
        }
    }

    // t-statistics and p-values
    let t_stats: Vec<f64> = beta
        .iter()
        .zip(&std_errors)
        .map(|(&b, &se)| b / se)
        .collect();
    let p_values: Vec<f64> = t_stats
        .iter()
        .map(|&t| two_tailed_p_value(t, df as f64))
        .collect();

    // Confidence intervals
    let alpha = 0.05;
    let t_critical = t_critical_quantile(df as f64, alpha);

    let conf_int_lower: Vec<f64> = beta
        .iter()
        .zip(&std_errors)
        .map(|(&b, &se)| b - t_critical * se)
        .collect();
    let conf_int_upper: Vec<f64> = beta
        .iter()
        .zip(&std_errors)
        .map(|(&b, &se)| b + t_critical * se)
        .collect();

    // Leverage
    let leverage = compute_leverage(&x_matrix, &xtx_inv);

    // Standardized residuals
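    // r_std_i = r_i / (s * sqrt(1 - h_i)), where s is the regression standard
    // error and h_i the leverage; MIN_LEVERAGE_DENOM guards tiny denominators.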
    let standardized_residuals: Vec<f64> = residuals
        .iter()
        .zip(&leverage)
        .map(|(&r, &h)| {
            let factor = (1.0 - h).max(MIN_LEVERAGE_DENOM).sqrt();
            let denom = std_error * factor;
            if denom > MIN_LEVERAGE_DENOM {
                r / denom
            } else {
                0.0
            }
        })
        .collect();

    // F-statistic
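    // F = (SS_regression / k) / MSE, referred to an F(k, n - k - 1) distribution.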
    let f_statistic = (ss_regression / k as f64) / mse;
    let f_p_value = f_p_value(f_statistic, k as f64, df as f64);

    // RMSE and MAE
    let rmse = std_error;
    let mae: f64 = residuals.iter().map(|&r| r.abs()).sum::<f64>() / n as f64;

    // VIF
    let vif = calculate_vif(x_vars, &names, n);

    Ok(RegressionOutput {
        coefficients: beta,
        std_errors,
        t_stats,
        p_values,
        conf_int_lower,
        conf_int_upper,
        r_squared,
        adj_r_squared,
        f_statistic,
        f_p_value,
        mse,
        rmse,
        mae,
        std_error,
        residuals,
        standardized_residuals,
        predictions,
        leverage,
        vif,
        n,
        k,
        df,
        variable_names: names,
    })
}
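
// ============================================================================
// Tests
// ============================================================================
//
// A minimal sanity-check sketch (not part of the original file): it exercises
// `ols_regression` on the small dataset from the doc example above. The
// numeric tolerances are illustrative assumptions, not verified guarantees.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ols_recovers_simple_trend() {
        // Nearly linear data: y grows roughly as 0.89 * x + 1.68.
        let y = vec![2.5, 3.7, 4.2, 5.1, 6.3, 7.0];
        let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let names = vec!["Intercept".to_string(), "X1".to_string()];

        let result = ols_regression(&y, &[x1], &names).unwrap();

        // Shape checks: intercept plus one predictor, df = n - k - 1.
        assert_eq!(result.coefficients.len(), 2);
        assert_eq!(result.n, 6);
        assert_eq!(result.k, 1);
        assert_eq!(result.df, 4);

        // A strong linear trend should yield a high R-squared.
        assert!(result.r_squared > 0.9);

        // Leverage values sum to the number of fitted parameters (here 2).
        let h_sum: f64 = result.leverage.iter().sum();
        assert!((h_sum - 2.0).abs() < 1e-6);
    }

    #[test]
    fn rejects_insufficient_data() {
        // n = 2 observations cannot support an intercept plus one predictor.
        let y = vec![1.0, 2.0];
        let x1 = vec![1.0, 2.0];
        let names = vec!["Intercept".to_string(), "X1".to_string()];
        assert!(ols_regression(&y, &[x1], &names).is_err());
    }
}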