use crate::dataframe::DataFrame;
use crate::error::{Error, Result};
use crate::stats::LinearRegressionResult;
/// Fits an ordinary least-squares linear regression of `y_column` on the
/// predictor columns `x_columns` using the data in `df`.
///
/// Returns a [`LinearRegressionResult`] containing the intercept, slope
/// coefficients, R², adjusted R², p-values, fitted values and residuals.
///
/// # Errors
///
/// Returns an error if a referenced column is missing, `x_columns` is empty,
/// column lengths differ, or the normal-equation matrix is singular.
pub fn linear_regression(
df: &DataFrame,
y_column: &str,
x_columns: &[&str],
) -> Result<LinearRegressionResult> {
// Thin public wrapper; the crate-visible impl is also exercised directly
// by the unit tests below.
linear_regression_impl(df, y_column, x_columns)
}
pub(crate) fn linear_regression_impl(
df: &DataFrame,
y_column: &str,
x_columns: &[&str],
) -> Result<LinearRegressionResult> {
if !df.contains_column(y_column) {
return Err(Error::ColumnNotFound(y_column.to_string()));
}
for &x_col in x_columns {
if !df.contains_column(x_col) {
return Err(Error::ColumnNotFound(x_col.to_string()));
}
}
if x_columns.is_empty() {
return Err(Error::InvalidOperation(
"Regression analysis requires at least one predictor variable".into(),
));
}
let y_values: Vec<f64> = if let Ok(y_series) = df.get_column::<f64>(y_column) {
y_series.values().to_vec()
} else if let Ok(y_series) = df.get_column::<String>(y_column) {
y_series
.values()
.iter()
.map(|s| s.parse::<f64>().unwrap_or(0.0))
.collect()
} else {
return Err(Error::ColumnNotFound(y_column.to_string()));
};
let mut x_matrix: Vec<Vec<f64>> = Vec::with_capacity(x_columns.len() + 1);
let n = y_values.len();
let intercept_col = vec![1.0; n];
x_matrix.push(intercept_col);
for &x_col in x_columns {
let x_values: Vec<f64> = if let Ok(x_series) = df.get_column::<f64>(x_col) {
x_series.values().to_vec()
} else if let Ok(x_series) = df.get_column::<String>(x_col) {
x_series
.values()
.iter()
.map(|s| s.parse::<f64>().unwrap_or(0.0))
.collect()
} else {
return Err(Error::ColumnNotFound(x_col.to_string()));
};
if x_values.len() != n {
return Err(Error::DimensionMismatch(format!(
"Regression analysis: Column lengths do not match: y={}, {}={}",
n,
x_col,
x_values.len()
)));
}
x_matrix.push(x_values);
}
let xt_x = matrix_multiply_transpose(&x_matrix, &x_matrix);
let xt_x_inv = matrix_inverse(&xt_x)?;
let xt_y = vec_multiply_transpose(&x_matrix, &y_values);
let mut coefficients = vec![0.0; x_matrix.len()];
for i in 0..coefficients.len() {
let mut sum = 0.0;
for j in 0..xt_y.len() {
sum += xt_x_inv[i][j] * xt_y[j];
}
coefficients[i] = sum;
}
let intercept = coefficients[0];
let beta_coefs = coefficients[1..].to_vec();
let mut fitted_values = vec![0.0; n];
for i in 0..n {
fitted_values[i] = intercept;
for j in 0..beta_coefs.len() {
fitted_values[i] += beta_coefs[j] * x_matrix[j + 1][i];
}
}
let residuals: Vec<f64> = y_values
.iter()
.zip(fitted_values.iter())
.map(|(&y, &y_hat)| y - y_hat)
.collect();
let y_mean = y_values.iter().sum::<f64>() / n as f64;
let ss_total = y_values.iter().map(|&y| (y - y_mean).powi(2)).sum::<f64>();
let ss_residual = residuals.iter().map(|&r| r.powi(2)).sum::<f64>();
let r_squared = 1.0 - ss_residual / ss_total;
let p = x_columns.len();
let adj_r_squared = 1.0 - (1.0 - r_squared) * (n - 1) as f64 / (n - p - 1) as f64;
let mut p_values = vec![0.0; p + 1];
let std_errors = calculate_std_errors(&xt_x_inv, ss_residual, n, p)?;
for i in 0..p_values.len() {
let t_value = coefficients[i] / std_errors[i];
p_values[i] = 2.0 * (1.0 - normal_cdf(t_value.abs()));
}
Ok(LinearRegressionResult {
intercept,
coefficients: beta_coefs,
r_squared,
adj_r_squared,
p_values,
fitted_values,
residuals,
})
}
/// Computes the product A · Bᵀ for row-major matrices `a` and `b`:
/// `result[i][j] = Σ_k a[i][k] * b[j][k]`.
///
/// Used to form XᵀX when the design matrix is stored column-major, since each
/// stored "row" is then a column of X.
fn matrix_multiply_transpose(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    a.iter()
        .map(|a_row| {
            b.iter()
                // Index over a_row's length (as the original did) so a ragged
                // `b` still panics rather than silently truncating.
                .map(|b_row| (0..a_row.len()).map(|k| a_row[k] * b_row[k]).sum())
                .collect()
        })
        .collect()
}
/// Computes the matrix-vector product A · y for a row-major matrix `a`:
/// `result[i] = Σ_k a[i][k] * y[k]`.
///
/// Used to form Xᵀy when the design matrix is stored column-major.
fn vec_multiply_transpose(a: &[Vec<f64>], y: &[f64]) -> Vec<f64> {
    a.iter()
        // Index the row by y's positions (as the original did) so a short row
        // still panics rather than silently truncating.
        .map(|row| y.iter().enumerate().map(|(k, &yk)| row[k] * yk).sum())
        .collect()
}
/// Approximates the standard normal CDF Φ(z).
///
/// Uses the Abramowitz & Stegun 7.1.26 rational approximation of erf
/// (maximum absolute error ≈ 1.5e-7), with Φ(z) = (1 + erf(z/√2)) / 2.
fn normal_cdf(z: f64) -> f64 {
    // A&S 7.1.26 polynomial coefficients, highest degree first for Horner.
    const COEFFS: [f64; 5] = [
        1.061405429,
        -1.453152027,
        1.421413741,
        -0.284496736,
        0.254829592,
    ];
    const P: f64 = 0.3275911;
    let x = z.abs() / std::f64::consts::SQRT_2;
    let t = 1.0 / (1.0 + P * x);
    // Horner evaluation of the degree-5 polynomial in t.
    let poly = COEFFS.iter().fold(0.0, |acc, &c| acc * t + c) * t;
    let erf = 1.0 - poly * (-x * x).exp();
    // Fold the sign of z back in: erf is odd, so Φ(-z) = 1 - Φ(z).
    if z < 0.0 {
        0.5 * (1.0 - erf)
    } else {
        0.5 * (1.0 + erf)
    }
}
/// Inverts a square matrix via Gauss-Jordan elimination with partial pivoting.
///
/// # Errors
///
/// * `InvalidOperation` — the matrix is empty.
/// * `DimensionMismatch` — the matrix is not square.
/// * `Computation` — the matrix is numerically singular (best available pivot
///   magnitude below 1e-10).
fn matrix_inverse(matrix: &[Vec<f64>]) -> Result<Vec<Vec<f64>>> {
    let n = matrix.len();
    if n == 0 {
        return Err(Error::InvalidOperation("Matrix is empty".into()));
    }
    for row in matrix {
        if row.len() != n {
            return Err(Error::DimensionMismatch("Matrix must be square".into()));
        }
    }
    // Build the augmented matrix [A | I]; the right half becomes A⁻¹.
    let mut augmented = Vec::with_capacity(n);
    for i in 0..n {
        let mut row = Vec::with_capacity(2 * n);
        row.extend_from_slice(&matrix[i]);
        for j in 0..n {
            row.push(if i == j { 1.0 } else { 0.0 });
        }
        augmented.push(row);
    }
    for i in 0..n {
        // Partial pivoting: pick the row with the largest absolute value in
        // column i for numerical stability.
        let mut max_row = i;
        let mut max_val = augmented[i][i].abs();
        for j in i + 1..n {
            let abs_val = augmented[j][i].abs();
            if abs_val > max_val {
                max_row = j;
                max_val = abs_val;
            }
        }
        // The original duplicated this return under #[cfg(test)] and
        // #[cfg(not(test))] with an inner 1e-8 check that was always true
        // inside this branch; a single check is equivalent in both builds.
        if max_val < 1e-10 {
            return Err(Error::Computation(
                "Matrix is singular (inverse does not exist)".into(),
            ));
        }
        if max_row != i {
            augmented.swap(i, max_row);
        }
        // Normalize the pivot row, then eliminate column i from all others.
        let pivot = augmented[i][i];
        for j in 0..2 * n {
            augmented[i][j] /= pivot;
        }
        for j in 0..n {
            if j != i {
                let factor = augmented[j][i];
                for k in 0..2 * n {
                    augmented[j][k] -= factor * augmented[i][k];
                }
            }
        }
    }
    // Extract the right half of the augmented matrix as the inverse.
    let mut inverse = vec![vec![0.0; n]; n];
    for i in 0..n {
        for j in 0..n {
            inverse[i][j] = augmented[i][j + n];
        }
    }
    Ok(inverse)
}
/// Computes the standard error of each regression coefficient:
/// `SE(beta_i) = sqrt(MSE * (XᵀX)⁻¹[i][i])`, where MSE is the residual sum of
/// squares divided by the residual degrees of freedom `n - p - 1`.
///
/// # Errors
///
/// Returns `InsufficientData` when `n <= p + 1`, i.e. the residual degrees of
/// freedom would be zero or negative.
fn calculate_std_errors(
    xt_x_inv: &[Vec<f64>],
    ss_residual: f64,
    n: usize,
    p: usize,
) -> Result<Vec<f64>> {
    // checked_sub avoids the usize underflow the original `n - p - 1` hit when
    // n <= p (its `df <= 0` test could also never see a negative value).
    let df = match n.checked_sub(p + 1) {
        Some(df) if df > 0 => df,
        _ => {
            return Err(Error::InsufficientData(
                "Degrees of freedom is 0 or negative. More data points needed.".into(),
            ))
        }
    };
    // Mean squared error of the residuals.
    let mse = ss_residual / df as f64;
    Ok(xt_x_inv
        .iter()
        .enumerate()
        .map(|(i, row)| (mse * row[i]).sqrt())
        .collect())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dataframe::DataFrame;
    use crate::series::Series;

    /// Builds a DataFrame containing the given named f64 columns, in order.
    fn build_frame(columns: &[(&str, Vec<f64>)]) -> DataFrame {
        let mut frame = DataFrame::new();
        for (name, values) in columns {
            let series = Series::new(values.clone(), Some((*name).to_string()))
                .expect("operation should succeed");
            frame
                .add_column((*name).to_string(), series)
                .expect("operation should succeed");
        }
        frame
    }

    /// Single predictor: y ≈ 2x, so expect slope ≈ 2, intercept ≈ 0.
    #[test]
    fn test_simple_regression() {
        let frame = build_frame(&[
            ("x", vec![1.0, 2.0, 3.0, 4.0, 5.0]),
            ("y", vec![2.1, 4.05, 5.9, 8.1, 9.95]),
        ]);
        let result =
            linear_regression_impl(&frame, "y", &["x"]).expect("operation should succeed");
        assert!((result.intercept - 0.0).abs() < 0.3);
        assert!((result.coefficients[0] - 2.0).abs() < 0.1);
        assert!((result.r_squared - 0.99).abs() < 0.01);
    }

    /// Two predictors: y ≈ 1 + x1 + x2 with generous tolerances.
    #[test]
    fn test_multiple_regression() {
        let frame = build_frame(&[
            ("x1", vec![1.0, 2.0, 3.0, 4.0, 5.0]),
            ("x2", vec![1.0, 3.0, 2.0, 5.0, 4.0]),
            ("y", vec![3.0, 6.0, 6.0, 10.0, 10.0]),
        ]);
        let result =
            linear_regression_impl(&frame, "y", &["x1", "x2"]).expect("operation should succeed");
        assert!(
            (result.intercept - 1.0).abs() < 1.0,
            "Intercept was {}, expected ~1.0",
            result.intercept
        );
        assert!(
            (result.coefficients[0] - 1.0).abs() < 1.0,
            "First coefficient was {}, expected ~1.0",
            result.coefficients[0]
        );
        assert!(
            (result.coefficients[1] - 1.0).abs() < 1.0,
            "Second coefficient was {}, expected ~1.0",
            result.coefficients[1]
        );
        assert!(
            result.r_squared > 0.95,
            "R-squared was {}, expected > 0.95",
            result.r_squared
        );
    }
}