use crate::distributions::{fisher_snedecor_cdf, student_t_cdf, student_t_inverse_cdf};
use crate::error::{Error, Result};
use crate::impl_serialization;
use crate::linalg::{vec_dot, vec_mean, vec_sub, Matrix};
use crate::serialization::types::ModelType;
use serde::{Deserialize, Serialize};
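/// Floor for denominators built from leverage terms (e.g. `1 - h_ii`), so
/// standardized residuals stay finite for points with leverage near 1.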
const MIN_LEVERAGE_DENOM: f64 = 1e-10;
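/// Variance inflation factor diagnostics for a single predictor.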
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VifResult {
pub variable: String,
pub vif: f64,
pub rsquared: f64,
pub interpretation: String,
}
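/// Complete OLS fit summary. Note that `rmse` here is the residual standard
/// error `sqrt(SSR / df)` (identical to `std_error`), not `sqrt(SSR / n)`.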
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionOutput {
pub coefficients: Vec<f64>,
pub std_errors: Vec<f64>,
pub t_stats: Vec<f64>,
pub p_values: Vec<f64>,
pub conf_int_lower: Vec<f64>,
pub conf_int_upper: Vec<f64>,
pub r_squared: f64,
pub adj_r_squared: f64,
pub f_statistic: f64,
pub f_p_value: f64,
pub mse: f64,
pub rmse: f64,
pub mae: f64,
pub std_error: f64,
pub residuals: Vec<f64>,
pub standardized_residuals: Vec<f64>,
pub predictions: Vec<f64>,
pub leverage: Vec<f64>,
pub vif: Vec<VifResult>,
pub n: usize,
pub k: usize,
pub df: usize,
pub variable_names: Vec<String>,
pub log_likelihood: f64,
pub aic: f64,
pub bic: f64,
}
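/// Two-tailed p-value for a t-statistic with `df` degrees of freedom:
/// `p = 2 * P(T >= |t|)`. Very large `|t|` short-circuits to 0.0 because the
/// tail probability underflows.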
pub fn two_tailed_p_value(t: f64, df: f64) -> f64 {
if t.abs() > 100.0 {
return 0.0;
}
let cdf = student_t_cdf(t, df);
if t >= 0.0 {
2.0 * (1.0 - cdf)
} else {
2.0 * cdf
}
}
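/// Critical value for a two-sided test at level `alpha`: the `1 - alpha / 2`
/// quantile of the t-distribution with `df` degrees of freedom.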
pub fn t_critical_quantile(df: f64, alpha: f64) -> f64 {
let p = 1.0 - alpha / 2.0;
student_t_inverse_cdf(p, df)
}
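/// Upper-tail p-value for an F-statistic: `p = 1 - F(f; df1, df2)`.
/// Nonpositive statistics return 1.0.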
pub fn f_p_value(f_stat: f64, df1: f64, df2: f64) -> f64 {
if f_stat <= 0.0 {
return 1.0;
}
1.0 - fisher_snedecor_cdf(f_stat, df1, df2)
}
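/// Leverage (hat) values: `h_ii = x_i (X'X)^{-1} x_i'`, the diagonal of the
/// hat matrix `H = X (X'X)^{-1} X'`.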
#[allow(clippy::needless_range_loop)]
pub fn compute_leverage(x: &Matrix, xtx_inv: &Matrix) -> Vec<f64> {
let n = x.rows;
let mut leverage = vec![0.0; n];
for i in 0..n {
let mut row_vec = vec![0.0; x.cols];
for j in 0..x.cols {
row_vec[j] = x.get(i, j);
}
let mut temp_vec = vec![0.0; x.cols];
for c in 0..x.cols {
let mut sum = 0.0;
for k in 0..x.cols {
sum += row_vec[k] * xtx_inv.get(k, c);
}
temp_vec[c] = sum;
}
leverage[i] = vec_dot(&temp_vec, &row_vec);
}
leverage
}
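/// Gaussian log-likelihood concentrated at the MLE `sigma^2 = SSR / n`:
/// `ll = -(n / 2) * (ln(2 * pi * SSR / n) + 1)`, matching R's `logLik` for
/// `lm` fits. `_mse` is kept for signature compatibility but is unused.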
pub fn log_likelihood(n: usize, _mse: f64, ssr: f64) -> f64 {
let n_f64 = n as f64;
let two_pi = 2.0 * std::f64::consts::PI;
-0.5 * n_f64 * (two_pi * ssr / n_f64).ln() - n_f64 / 2.0
}
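/// AIC following R's convention, where the error variance counts as an
/// estimated parameter: `k = n_coef + 1`, `AIC = 2 * k - 2 * ll`.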
pub fn aic(log_likelihood: f64, n_coef: usize) -> f64 {
let k = n_coef + 1;
2.0 * k as f64 - 2.0 * log_likelihood
}
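/// BIC following R's convention (`k = n_coef + 1`):
/// `BIC = k * ln(n_obs) - 2 * ll`.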
pub fn bic(log_likelihood: f64, n_coef: usize, n_obs: usize) -> f64 {
let k = n_coef + 1;
k as f64 * (n_obs as f64).ln() - 2.0 * log_likelihood
}
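/// AIC with the Python convention: the error variance is not counted as a
/// parameter, so `AIC = 2 * n_coef - 2 * ll` (exactly 2 less than the
/// R-style `aic` for the same fit).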
pub fn aic_python(log_likelihood: f64, n_coef: usize) -> f64 {
2.0 * n_coef as f64 - 2.0 * log_likelihood
}
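/// BIC with the Python convention: `BIC = n_coef * ln(n_obs) - 2 * ll`,
/// omitting the error-variance parameter.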
pub fn bic_python(log_likelihood: f64, n_coef: usize, n_obs: usize) -> f64 {
n_coef as f64 * (n_obs as f64).ln() - 2.0 * log_likelihood
}
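/// Variance inflation factors: `VIF_j = 1 / (1 - R_j^2)`, read off the
/// diagonal of the inverse of the predictor correlation matrix. Requires at
/// least two predictors; a constant column or a singular correlation matrix
/// yields infinite VIFs for every predictor.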
pub fn calculate_vif(x_vars: &[Vec<f64>], names: &[String], n: usize) -> Vec<VifResult> {
let k = x_vars.len();
if k <= 1 {
return vec![];
}
let mut z_data = vec![0.0; n * k];
for (j, var) in x_vars.iter().enumerate() {
let mean = vec_mean(var);
let variance = var.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / ((n - 1) as f64);
let std_dev = variance.sqrt();
if std_dev.abs() < 1e-10 {
return names
.iter()
.skip(1)
.map(|name| VifResult {
variable: name.clone(),
vif: f64::INFINITY,
rsquared: 1.0,
interpretation: "Constant variable (undefined correlation)".to_string(),
})
.collect();
}
for i in 0..n {
z_data[i * k + j] = (var[i] - mean) / std_dev;
}
}
let z = Matrix::new(n, k, z_data);
let z_t = z.transpose();
let zt_z = z_t.matmul(&z);
let mut r_corr = zt_z;
let factor = 1.0 / ((n - 1) as f64);
for val in &mut r_corr.data {
*val *= factor;
}
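// Invert the correlation matrix via QR: (Q R)^{-1} = R^{-1} Q'.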
let (q_corr, r_corr_tri) = r_corr.qr();
let r_inv_opt = r_corr_tri.invert_upper_triangular();
let r_inv = match r_inv_opt {
Some(inv) => inv.matmul(&q_corr.transpose()),
None => {
return names
.iter()
.skip(1)
.map(|name| VifResult {
variable: name.clone(),
vif: f64::INFINITY,
rsquared: 1.0,
interpretation: "Perfect multicollinearity (singular matrix)".to_string(),
})
.collect();
},
};
let mut results = vec![];
for j in 0..k {
let vif = r_inv.get(j, j);
let vif = if vif < 1.0 { 1.0 } else { vif };
let rsquared = 1.0 - 1.0 / vif;
let interpretation = if vif.is_infinite() {
"Perfect multicollinearity".to_string()
} else if vif > 10.0 {
"High multicollinearity - consider removing".to_string()
} else if vif > 5.0 {
"Moderate multicollinearity".to_string()
} else {
"Low multicollinearity".to_string()
};
results.push(VifResult {
variable: names[j + 1].clone(),
vif,
rsquared,
interpretation,
});
}
results
}
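/// Ordinary least squares via rank-revealing pivoted QR, mirroring R's `lm`:
/// aliased (linearly dependent) columns get `NaN` coefficients and are
/// excluded from inference. `x_vars` holds one column per predictor; the
/// intercept column is added internally.
///
/// A minimal usage sketch (marked `ignore` since the crate path is not
/// shown here):
///
/// ```ignore
/// let y = vec![2.0, 4.0, 5.0, 4.0, 5.0];
/// let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let names = vec!["Intercept".to_string(), "X1".to_string()];
/// let fit = ols_regression(&y, &[x1], &names).unwrap();
/// assert_eq!(fit.coefficients.len(), 2);
/// ```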
#[allow(clippy::needless_range_loop)]
pub fn ols_regression(
y: &[f64],
x_vars: &[Vec<f64>],
variable_names: &[String],
) -> Result<RegressionOutput> {
let n = y.len();
let k = x_vars.len();
let p = k + 1;
if n <= k + 1 {
return Err(Error::InsufficientData {
required: k + 2,
available: n,
});
}
crate::diagnostics::validate_regression_data(y, x_vars)?;
let mut names = variable_names.to_vec();
while names.len() <= k {
names.push(format!("X{}", names.len()));
}
let mut x_data = vec![0.0; n * p];
for row in 0..n {
x_data[row * p] = 1.0;
for (col, x_var) in x_vars.iter().enumerate() {
x_data[row * p + col + 1] = x_var[row];
}
}
let x_matrix = Matrix::new(n, p, x_data);
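// Rank-revealing QR with column pivoting (LINPACK convention: 1-based pivot
// indices); aliased columns come back from the solver as NaN coefficients.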
let qr_result = x_matrix.qr_linpack(None);
let rank = qr_result.rank;
let beta = match x_matrix.qr_solve_linpack(&qr_result, y) {
Some(b) => b,
None => return Err(Error::SingularMatrix),
};
let active: Vec<bool> = beta.iter().map(|b| !b.is_nan()).collect();
let n_active = active.iter().filter(|&&a| a).count();
let beta_for_pred: Vec<f64> = beta.iter().map(|&b| if b.is_nan() { 0.0 } else { b }).collect();
let predictions = x_matrix.mul_vec(&beta_for_pred);
let residuals = vec_sub(y, &predictions);
let df = n - rank;
let y_mean = vec_mean(y);
let ss_total: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
let ss_residual: f64 = residuals.iter().map(|&r| r.powi(2)).sum();
let ss_regression = ss_total - ss_residual;
let k_effective = rank.saturating_sub(1);
let r_squared = if ss_total == 0.0 {
f64::NAN
} else {
1.0 - ss_residual / ss_total
};
let adj_r_squared = if ss_total == 0.0 || df == 0 {
f64::NAN
} else {
1.0 - (1.0 - r_squared) * ((n - 1) as f64 / df as f64)
};
let mse = if df > 0 { ss_residual / df as f64 } else { f64::NAN };
let std_error = mse.sqrt();
let mut std_errors = vec![f64::NAN; p];
let mut t_stats = vec![f64::NAN; p];
let mut p_values = vec![f64::NAN; p];
let mut xtx_inv = Matrix::zeros(p, p);
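// Rebuild (X'X)^{-1} = R^{-1} R^{-T} from the rank x rank upper-triangular
// block of the pivoted QR, then un-permute rows and columns via the pivot map.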
if rank > 0 && df > 0 {
let mut r_sub = Matrix::zeros(rank, rank);
for i in 0..rank {
for j in i..rank {
r_sub.set(i, j, qr_result.qr.get(i, j));
}
}
if let Some(r_inv_sub) = r_sub.invert_upper_triangular() {
let xtx_inv_pivoted = r_inv_sub.matmul(&r_inv_sub.transpose());
for i in 0..rank {
let orig_i = qr_result.pivot[i] - 1;
for j in 0..rank {
let orig_j = qr_result.pivot[j] - 1;
xtx_inv.set(orig_i, orig_j, xtx_inv_pivoted.get(i, j));
}
}
for col in 0..p {
if active[col] {
let se = (xtx_inv.get(col, col) * mse).sqrt();
if !se.is_nan() {
std_errors[col] = se;
t_stats[col] = beta[col] / se;
p_values[col] = two_tailed_p_value(t_stats[col], df as f64);
}
}
}
}
}
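// Two-sided 95% confidence intervals (alpha = 0.05).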
let alpha = 0.05;
let t_critical = if df > 0 { t_critical_quantile(df as f64, alpha) } else { f64::NAN };
let conf_int_lower: Vec<f64> = beta
.iter()
.zip(&std_errors)
.map(|(&b, &se)| if b.is_nan() || se.is_nan() { f64::NAN } else { b - t_critical * se })
.collect();
let conf_int_upper: Vec<f64> = beta
.iter()
.zip(&std_errors)
.map(|(&b, &se)| if b.is_nan() || se.is_nan() { f64::NAN } else { b + t_critical * se })
.collect();
let leverage = compute_leverage(&x_matrix, &xtx_inv);
// Standardized residuals: r / (s * sqrt(1 - h)), floored so that leverage
// values near 1 cannot produce a zero denominator.
let standardized_residuals: Vec<f64> = residuals
.iter()
.zip(&leverage)
.map(|(&r, &h)| {
let factor = (1.0 - h).max(MIN_LEVERAGE_DENOM).sqrt();
let denom = std_error * factor;
if denom > MIN_LEVERAGE_DENOM {
r / denom
} else {
0.0
}
})
.collect();
let f_statistic = if k_effective > 0 && df > 0 {
(ss_regression / k_effective as f64) / mse
} else {
f64::NAN
};
let f_p_value = if f_statistic.is_nan() {
f64::NAN
} else {
f_p_value(f_statistic, k_effective as f64, df as f64)
};
let rmse = std_error;
let mae: f64 = residuals.iter().map(|&r| r.abs()).sum::<f64>() / n as f64;
let vif = calculate_vif(x_vars, &names, n);
let ll = log_likelihood(n, mse, ss_residual);
let n_coef = n_active;
let aic_val = aic(ll, n_coef);
let bic_val = bic(ll, n_coef, n);
Ok(RegressionOutput {
coefficients: beta,
std_errors,
t_stats,
p_values,
conf_int_lower,
conf_int_upper,
r_squared,
adj_r_squared,
f_statistic,
f_p_value,
mse,
rmse,
mae,
std_error,
residuals,
standardized_residuals,
predictions,
leverage,
vif,
n,
k,
df,
variable_names: names,
log_likelihood: ll,
aic: aic_val,
bic: bic_val,
})
}
impl_serialization!(RegressionOutput, ModelType::OLS, "OLS");
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_aic_bic_formulas_known_values() {
let ll = -100.0;
let n_coef = 3;
let n_obs = 100;
let aic_val = aic(ll, n_coef);
let bic_val = bic(ll, n_coef, n_obs);
assert!((aic_val - 208.0).abs() < 1e-10);
let expected_bic = 4.0 * (100.0_f64).ln() + 200.0;
assert!((bic_val - expected_bic).abs() < 1e-10);
}
#[test]
fn test_bic_greater_than_aic_for_reasonable_n() {
let ll = -50.0;
let n_coef = 2;
let aic_val = aic(ll, n_coef);
let bic_val = bic(ll, n_coef, 100);
assert!(bic_val > aic_val);
}
#[test]
fn test_log_likelihood_returns_finite() {
let n = 10;
let mse = 4.0;
let ssr = mse * (n - 2) as f64;
let ll = log_likelihood(n, mse, ssr);
assert!(ll.is_finite());
}
#[test]
fn test_log_likelihood_increases_with_better_fit() {
let n = 10;
let ll_worse = log_likelihood(n, 10.0, 80.0);
let ll_better = log_likelihood(n, 2.0, 16.0);
assert!(ll_better > ll_worse);
}
#[test]
fn test_model_selection_criteria_present_in_output() {
let y = vec![2.0, 4.0, 5.0, 4.0, 5.0];
let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let names = vec!["Intercept".to_string(), "X1".to_string()];
let result = ols_regression(&y, &[x1], &names).unwrap();
assert!(result.log_likelihood.is_finite());
assert!(result.aic.is_finite());
assert!(result.bic.is_finite());
assert!(result.aic > 0.0);
assert!(result.bic > 0.0);
}
#[test]
fn test_regression_output_aic_bic_match_formulas() {
let y = vec![3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0];
let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let x2 = vec![3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0];
let names = vec!["Intercept".into(), "X1".into(), "X2".into()];
let result = ols_regression(&y, &[x1, x2], &names).unwrap();
let n_coef = 3;
let k = n_coef + 1;
let expected_aic = 2.0 * k as f64 - 2.0 * result.log_likelihood;
assert!((result.aic - expected_aic).abs() < 1e-10);
let expected_bic = k as f64 * (result.n as f64).ln() - 2.0 * result.log_likelihood;
assert!((result.bic - expected_bic).abs() < 1e-10);
}
#[test]
fn test_aic_python_convention() {
let ll = -100.0;
let n_coef = 3;
let aic_py = aic_python(ll, n_coef);
assert!((aic_py - 206.0).abs() < 1e-10);
}
#[test]
fn test_bic_python_convention() {
let ll = -100.0;
let n_coef = 3;
let n_obs = 100;
let bic_py = bic_python(ll, n_coef, n_obs);
let expected_bic = 3.0 * (100.0_f64).ln() + 200.0;
assert!((bic_py - expected_bic).abs() < 1e-10);
}
#[test]
fn test_python_aic_smaller_than_r_aic() {
let ll = -50.0;
let n_coef = 2;
let aic_r = aic(ll, n_coef);
let aic_py = aic_python(ll, n_coef);
assert_eq!(aic_r - aic_py, 2.0);
}
#[test]
fn test_log_likelihood_formula_matches_r() {
let n = 100;
let ssr = 450.0;
let mse = ssr / (n as f64 - 2.0);
let ll = log_likelihood(n, mse, ssr);
let two_pi = 2.0 * std::f64::consts::PI;
let expected = -0.5 * n as f64 * (two_pi * ssr / n as f64).ln() - n as f64 / 2.0;
assert!((ll - expected).abs() < 1e-10);
}
#[test]
fn test_aic_bic_with_perfect_fit() {
let n = 10;
let ssr = 0.001;
let mse = ssr / (n as f64 - 2.0);
let ll = log_likelihood(n, mse, ssr);
let aic_val = aic(ll, 2);
let bic_val = bic(ll, 2, n);
assert!(ll > 0.0);
assert!(aic_val.is_finite());
assert!(bic_val.is_finite());
}
#[test]
fn test_aic_bic_model_selection() {
let n = 100;
let ll_simple = -150.0;
let aic_simple = aic(ll_simple, 2);
let bic_simple = bic(ll_simple, 2, n);
let ll_complex = -148.0;
let aic_complex = aic(ll_complex, 5);
let bic_complex = bic(ll_complex, 5, n);
assert!(aic_simple < aic_complex);
assert!(bic_simple < bic_complex);
}
#[test]
fn test_log_likelihood_per_observation_consistency() {
let ssr_per_obs = 1.0;
let n1 = 50;
let ssr1 = ssr_per_obs * n1 as f64;
let ll1 = log_likelihood(n1, ssr1 / (n1 as f64 - 2.0), ssr1);
let n2 = 100;
let ssr2 = ssr_per_obs * n2 as f64;
let ll2 = log_likelihood(n2, ssr2 / (n2 as f64 - 2.0), ssr2);
assert!(ll2 < ll1);
let ll_per_obs1 = ll1 / n1 as f64;
let ll_per_obs2 = ll2 / n2 as f64;
assert!((ll_per_obs1 - ll_per_obs2).abs() < 0.1);
}
#[test]
fn test_regularized_regression_has_model_selection_criteria() {
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let x_data = vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0, 5.0];
let x = crate::linalg::Matrix::new(5, 2, x_data);
let options = crate::regularized::ridge::RidgeFitOptions {
lambda: 0.1,
intercept: true,
standardize: false,
..Default::default()
};
let fit = crate::regularized::ridge::ridge_fit(&x, &y, &options).unwrap();
assert!(fit.log_likelihood.is_finite());
assert!(fit.aic.is_finite());
assert!(fit.bic.is_finite());
}
#[test]
fn test_elastic_net_regression_has_model_selection_criteria() {
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let x_data = vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0, 5.0];
let x = crate::linalg::Matrix::new(5, 2, x_data);
let options = crate::regularized::elastic_net::ElasticNetOptions {
lambda: 0.1,
alpha: 0.5,
intercept: true,
standardize: false,
..Default::default()
};
let fit = crate::regularized::elastic_net::elastic_net_fit(&x, &y, &options).unwrap();
assert!(fit.log_likelihood.is_finite());
assert!(fit.aic.is_finite());
assert!(fit.bic.is_finite());
}
}