use crate::error::{Result, Error};
use crate::dataframe::DataFrame;
use crate::stats::LinearRegressionResult;
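/// Ordinary least-squares regression of `y_column` on `x_columns`.
///
/// Builds a design matrix with an intercept column of ones and solves the
/// normal equations `b = (X'X)^-1 X'y` via explicit matrix inversion. The
/// reported p-values rely on the rough t-distribution approximation in
/// `t_distribution_cdf` below, so they are indicative rather than exact.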
pub(crate) fn linear_regression_impl(
df: &DataFrame,
y_column: &str,
x_columns: &[&str],
) -> Result<LinearRegressionResult> {
if !df.has_column(y_column) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", y_column)));
}
for &col_name in x_columns {
if !df.has_column(col_name) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", col_name)));
}
}
let y_series = df.get_column(y_column)?;
let y_data = y_series.as_f64()?;
if y_data.is_empty() {
return Err(Error::EmptyData("Regression analysis requires data".into()));
}
let n_samples = y_data.len();
let n_features = x_columns.len();
    // Need strictly more samples than estimated parameters (features plus
    // the intercept), otherwise the residual degrees of freedom would be zero.
    if n_samples <= n_features + 1 {
        return Err(Error::InsufficientData(
            format!("Regression analysis requires more samples than parameters (samples: {}, parameters including intercept: {})", n_samples, n_features + 1)
        ));
    }
    let mut x_matrix = vec![vec![1.0; n_samples]];
    for &col_name in x_columns {
let x_series = df.get_column(col_name)?;
let x_data = x_series.as_f64()?;
if x_data.len() != n_samples {
return Err(Error::DimensionMismatch(
format!("Column '{}' has length {} but expected {}", col_name, x_data.len(), n_samples)
));
}
x_matrix.push(x_data);
}
let mut x_transpose = vec![vec![0.0; n_features + 1]; n_samples];
for i in 0..n_samples {
for j in 0..(n_features + 1) {
x_transpose[i][j] = x_matrix[j][i];
}
}
let mut xtx = vec![vec![0.0; n_features + 1]; n_features + 1];
for i in 0..(n_features + 1) {
for j in 0..(n_features + 1) {
for k in 0..n_samples {
xtx[i][j] += x_matrix[i][k] * x_matrix[j][k];
}
}
}
let mut xty = vec![0.0; n_features + 1];
for i in 0..(n_features + 1) {
for k in 0..n_samples {
xty[i] += x_matrix[i][k] * y_data[k];
}
}
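    // Solve the normal equations: coeffs = (X'X)^-1 (X'y).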
let xtx_inv = invert_matrix(&xtx)?;
let mut coeffs = vec![0.0; n_features + 1];
for i in 0..(n_features + 1) {
for j in 0..(n_features + 1) {
coeffs[i] += xtx_inv[i][j] * xty[j];
}
}
let mut fitted_values = vec![0.0; n_samples];
let mut residuals = vec![0.0; n_samples];
for i in 0..n_samples {
        fitted_values[i] = coeffs[0];
        for j in 0..n_features {
fitted_values[i] += coeffs[j + 1] * x_transpose[i][j + 1];
}
residuals[i] = y_data[i] - fitted_values[i];
}
let y_mean = y_data.iter().sum::<f64>() / n_samples as f64;
let total_sum_squares: f64 = y_data.iter().map(|&y| (y - y_mean).powi(2)).sum();
let residual_sum_squares: f64 = residuals.iter().map(|&r| r.powi(2)).sum();
let r_squared = 1.0 - (residual_sum_squares / total_sum_squares);
let adj_r_squared = 1.0 - ((1.0 - r_squared) * (n_samples as f64 - 1.0) / (n_samples as f64 - n_features as f64 - 1.0));
let residual_mean_square = residual_sum_squares / (n_samples - n_features - 1) as f64;
let mut std_errors = vec![0.0; n_features + 1];
for i in 0..(n_features + 1) {
std_errors[i] = (xtx_inv[i][i] * residual_mean_square).sqrt();
}
    let mut p_values = vec![0.0; n_features + 1];
    let df = n_samples - n_features - 1;
    for i in 0..(n_features + 1) {
        let t_stat = coeffs[i] / std_errors[i];
        p_values[i] = 2.0 * (1.0 - t_distribution_cdf(t_stat.abs(), df));
    }
    let feature_coeffs = coeffs[1..].to_vec();
Ok(LinearRegressionResult {
intercept: coeffs[0],
coefficients: feature_coeffs,
r_squared,
adj_r_squared,
p_values,
fitted_values,
residuals,
})
}
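/// Rough CDF of Student's t-distribution with `df` degrees of freedom.
///
/// For df > 30 it falls back to the normal CDF; otherwise it uses a crude
/// one-term stand-in for the regularized incomplete beta function, so tail
/// probabilities should be treated as approximate.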
fn t_distribution_cdf(t: f64, df: usize) -> f64 {
if df > 30 {
return normal_cdf(t);
}
let df_f64 = df as f64;
let x = df_f64 / (df_f64 + t * t);
let a = 0.5 * df_f64;
    if t > 0.0 {
        1.0 - 0.5 * x.powf(a)
    } else {
        0.5 * x.powf(a)
    }
}
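/// Standard normal CDF via the Abramowitz & Stegun 7.1.26 polynomial
/// approximation of the error function (absolute error around 1.5e-7).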
fn normal_cdf(z: f64) -> f64 {
const A1: f64 = 0.254829592;
const A2: f64 = -0.284496736;
const A3: f64 = 1.421413741;
const A4: f64 = -1.453152027;
const A5: f64 = 1.061405429;
const P: f64 = 0.3275911;
let sign = if z < 0.0 { -1.0 } else { 1.0 };
let x = z.abs() / (2.0_f64).sqrt();
let t = 1.0 / (1.0 + P * x);
let y = 1.0 - (((((A5 * t + A4) * t) + A3) * t + A2) * t + A1) * t * (-x * x).exp();
0.5 * (1.0 + sign * y)
}
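/// Inverts a square matrix by Gauss-Jordan elimination with partial
/// pivoting on the augmented system [A | I]. Returns an error for
/// non-square input or when a pivot drops below 1e-10, which is treated
/// as singular.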
fn invert_matrix(matrix: &[Vec<f64>]) -> Result<Vec<Vec<f64>>> {
let n = matrix.len();
for row in matrix {
if row.len() != n {
return Err(Error::InvalidInput("Matrix must be square for inversion".into()));
}
}
let mut augmented = vec![vec![0.0; 2 * n]; n];
for i in 0..n {
for j in 0..n {
augmented[i][j] = matrix[i][j];
}
        augmented[i][i + n] = 1.0;
    }
for i in 0..n {
let mut max_row = i;
let mut max_val = augmented[i][i].abs();
for k in (i + 1)..n {
if augmented[k][i].abs() > max_val {
max_row = k;
max_val = augmented[k][i].abs();
}
}
if max_val < 1e-10 {
return Err(Error::ComputationError("Matrix is singular or nearly singular".into()));
}
        if max_row != i {
            augmented.swap(i, max_row);
        }
let pivot = augmented[i][i];
for j in 0..(2 * n) {
augmented[i][j] /= pivot;
}
for k in 0..n {
if k != i {
let factor = augmented[k][i];
for j in 0..(2 * n) {
augmented[k][j] -= factor * augmented[i][j];
}
}
}
}
let mut inverse = vec![vec![0.0; n]; n];
for i in 0..n {
for j in 0..n {
inverse[i][j] = augmented[i][j + n];
}
}
Ok(inverse)
}
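/// Fits `y = c0 + c1*x + ... + c_d*x^d` by ordinary least squares on a
/// Vandermonde design matrix, using the same normal-equation approach as
/// `linear_regression_impl`. Note that high degrees make `X'X` badly
/// conditioned, so inversion may fail or lose precision for large `degree`.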
pub fn polynomial_regression(
x: &[f64],
y: &[f64],
degree: usize
) -> Result<LinearRegressionResult> {
if x.is_empty() || y.is_empty() {
return Err(Error::EmptyData("Regression analysis requires data".into()));
}
if x.len() != y.len() {
return Err(Error::DimensionMismatch(
format!("X and Y must have the same length: x={}, y={}", x.len(), y.len())
));
}
let n = x.len();
    // Need strictly more points than coefficients so the residual degrees
    // of freedom (n - degree - 1) stay positive.
    if n <= degree + 1 {
        return Err(Error::InsufficientData(
            format!("Polynomial regression of degree {} requires at least {} data points",
                degree, degree + 2)
        ));
    }
let mut x_matrix = Vec::with_capacity(degree + 1);
x_matrix.push(vec![1.0; n]);
for d in 1..=degree {
let col: Vec<f64> = x.iter().map(|&xi| xi.powi(d as i32)).collect();
x_matrix.push(col);
}
let mut xtx = vec![vec![0.0; degree + 1]; degree + 1];
for i in 0..(degree + 1) {
for j in 0..(degree + 1) {
for k in 0..n {
xtx[i][j] += x_matrix[i][k] * x_matrix[j][k];
}
}
}
let mut xty = vec![0.0; degree + 1];
for i in 0..(degree + 1) {
for k in 0..n {
xty[i] += x_matrix[i][k] * y[k];
}
}
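    // Solve the normal equations: coeffs = (X'X)^-1 (X'y).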
let xtx_inv = invert_matrix(&xtx)?;
let mut coeffs = vec![0.0; degree + 1];
for i in 0..(degree + 1) {
for j in 0..(degree + 1) {
coeffs[i] += xtx_inv[i][j] * xty[j];
}
}
let mut fitted_values = vec![0.0; n];
let mut residuals = vec![0.0; n];
for i in 0..n {
        fitted_values[i] = coeffs[0];
        for d in 1..=degree {
fitted_values[i] += coeffs[d] * x[i].powi(d as i32);
}
residuals[i] = y[i] - fitted_values[i];
}
let y_mean = y.iter().sum::<f64>() / n as f64;
let total_sum_squares: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
let residual_sum_squares: f64 = residuals.iter().map(|&r| r.powi(2)).sum();
let r_squared = 1.0 - (residual_sum_squares / total_sum_squares);
let adj_r_squared = 1.0 - ((1.0 - r_squared) * (n as f64 - 1.0) / (n as f64 - degree as f64 - 1.0));
let residual_mean_square = residual_sum_squares / (n - degree - 1) as f64;
let mut std_errors = vec![0.0; degree + 1];
for i in 0..(degree + 1) {
std_errors[i] = (xtx_inv[i][i] * residual_mean_square).sqrt();
}
    let mut p_values = vec![0.0; degree + 1];
    let df = n - degree - 1;
    for i in 0..(degree + 1) {
        let t_stat = coeffs[i] / std_errors[i];
        p_values[i] = 2.0 * (1.0 - t_distribution_cdf(t_stat.abs(), df));
    }
    let feature_coeffs = coeffs[1..].to_vec();
Ok(LinearRegressionResult {
intercept: coeffs[0],
coefficients: feature_coeffs,
r_squared,
adj_r_squared,
p_values,
fitted_values,
residuals,
})
}
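/// Straight-line fit as a degree-1 polynomial regression: `intercept` is
/// the intercept and `coefficients[0]` the slope.
///
/// # Example
///
/// A minimal sketch (marked `ignore` because the crate path is omitted):
///
/// ```ignore
/// let x = vec![1.0, 2.0, 3.0, 4.0];
/// let y = vec![3.0, 5.0, 7.0, 9.0]; // y = 1 + 2x
/// let fit = simple_linear_regression(&x, &y)?;
/// assert!((fit.coefficients[0] - 2.0).abs() < 1e-9); // slope
/// assert!((fit.intercept - 1.0).abs() < 1e-9);
/// ```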
pub fn simple_linear_regression(x: &[f64], y: &[f64]) -> Result<LinearRegressionResult> {
polynomial_regression(x, y, 1)
}
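/// Computes residual diagnostics, keyed by name: "mean", "variance",
/// "durbin_watson" (values near 2 suggest no first-order autocorrelation),
/// "skewness", "kurtosis" (excess), "jarque_bera" and "jb_p_value", plus
/// the approximate "shapiro_w" and "shapiro_p_value". The normality
/// p-values use the approximations below and are indicative only.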
pub fn residual_diagnostics(residuals: &[f64]) -> Result<std::collections::HashMap<String, f64>> {
if residuals.is_empty() {
return Err(Error::EmptyData("Residual diagnostics requires data".into()));
}
let n = residuals.len();
if n < 3 {
return Err(Error::InsufficientData("Residual diagnostics requires at least 3 data points".into()));
}
let mut diagnostics = std::collections::HashMap::new();
let mean = residuals.iter().sum::<f64>() / n as f64;
let variance = residuals.iter()
.map(|&r| (r - mean).powi(2))
.sum::<f64>() / (n as f64);
diagnostics.insert("mean".to_string(), mean);
diagnostics.insert("variance".to_string(), variance);
    // Durbin-Watson statistic for first-order autocorrelation.
    let dw_num: f64 = residuals.windows(2).map(|w| (w[1] - w[0]).powi(2)).sum();
    let dw_denom: f64 = residuals.iter().map(|&r| r.powi(2)).sum();
    let durbin_watson = dw_num / dw_denom;
diagnostics.insert("durbin_watson".to_string(), durbin_watson);
let skewness = residuals.iter()
.map(|&r| (r - mean).powi(3))
.sum::<f64>() / (n as f64 * variance.powf(1.5));
let kurtosis = residuals.iter()
.map(|&r| (r - mean).powi(4))
.sum::<f64>() / (n as f64 * variance.powi(2)) - 3.0;
let jarque_bera = n as f64 * (skewness.powi(2) / 6.0 + kurtosis.powi(2) / 24.0);
let jb_p_value = 1.0 - chi2_cdf(jarque_bera, 2);
diagnostics.insert("skewness".to_string(), skewness);
diagnostics.insert("kurtosis".to_string(), kurtosis);
diagnostics.insert("jarque_bera".to_string(), jarque_bera);
diagnostics.insert("jb_p_value".to_string(), jb_p_value);
let mut sorted = residuals.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let shapiro_w = approximate_shapiro_wilk(&sorted);
let shapiro_p = approximate_shapiro_p(shapiro_w, n);
diagnostics.insert("shapiro_w".to_string(), shapiro_w);
diagnostics.insert("shapiro_p_value".to_string(), shapiro_p);
Ok(diagnostics)
}
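/// Chi-squared CDF, computed from the closed-form series that is exact for
/// even degrees of freedom; this module only calls it with df = 2 (the
/// Jarque-Bera statistic).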
fn chi2_cdf(chi2: f64, df: usize) -> f64 {
    if chi2 <= 0.0 {
        return 0.0;
    }
    // Closed form for even degrees of freedom:
    // CDF(x; df) = 1 - exp(-x/2) * sum_{i=0}^{df/2 - 1} (x/2)^i / i!
    let k = df / 2;
    let x = chi2 / 2.0;
    let mut term = 1.0;
    let mut sum = 1.0;
    for i in 1..k {
        term *= x / i as f64;
        sum += term;
    }
    let p = 1.0 - (-x).exp() * sum;
    p.min(1.0).max(0.0)
}
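/// Approximate Shapiro-Wilk W statistic. The exact test needs tabulated
/// expected normal order statistics; this sketch substitutes normalized
/// Blom-style normal scores, which is adequate for a rough normality
/// screen but not for exact inference. Expects `sorted` in ascending order.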
fn approximate_shapiro_wilk(sorted: &[f64]) -> f64 {
let n = sorted.len();
let mean = sorted.iter().sum::<f64>() / n as f64;
let variance = sorted.iter()
.map(|&x| (x - mean).powi(2))
.sum::<f64>() / (n as f64);
if variance.abs() < 1e-10 {
        return 0.0;
    }
let mut a = Vec::with_capacity(n);
for i in 0..n {
let p = (i as f64 + 0.375) / (n as f64 + 0.25);
let z = if p < 0.5 {
-(-2.0 * p.ln()).sqrt()
} else {
(-2.0 * (1.0 - p).ln()).sqrt()
};
a.push(z);
}
let a_sq_sum: f64 = a.iter().map(|&x| x.powi(2)).sum();
a.iter_mut().for_each(|x| *x /= a_sq_sum.sqrt());
let numerator: f64 = a.iter().zip(sorted.iter())
.map(|(&ai, &xi)| ai * xi)
.sum::<f64>().powi(2);
let denominator: f64 = sorted.iter()
.map(|&x| (x - mean).powi(2))
.sum::<f64>();
numerator / denominator
}
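/// Approximate p-value for the W statistic via a Royston-style log-normal
/// transformation of `1 - W`; the constants target moderate sample sizes
/// and the result should be read as indicative only.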
fn approximate_shapiro_p(w: f64, n: usize) -> f64 {
if w >= 1.0 {
return 1.0;
}
if w <= 0.0 {
return 0.0;
}
    let ln_one_minus_w = (1.0 - w).ln();
    let n_f64 = n as f64;
    let ln_n = n_f64.ln();
    // Royston-style normalization: the quadratic term is in (ln n)^2,
    // not ln(n^2).
    let mu = -1.5861 - 0.31082 * ln_n - 0.083751 * ln_n.powi(2);
    let sigma = 0.6897 + 0.0113 * ln_n;
    let z = (ln_one_minus_w - mu) / sigma;
    1.0 - normal_cdf(z)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dataframe::DataFrame;
use crate::series::Series;
#[test]
fn test_invert_matrix() {
let matrix = vec![
vec![4.0, 7.0],
vec![2.0, 6.0]
];
let inverse = invert_matrix(&matrix).expect("operation should succeed");
assert!((inverse[0][0] - 0.6).abs() < 1e-10);
assert!((inverse[0][1] + 0.7).abs() < 1e-10);
assert!((inverse[1][0] + 0.2).abs() < 1e-10);
assert!((inverse[1][1] - 0.4).abs() < 1e-10);
let identity = vec![
vec![1.0, 0.0],
vec![0.0, 1.0]
];
let identity_inv = invert_matrix(&identity).expect("operation should succeed");
assert!((identity_inv[0][0] - 1.0).abs() < 1e-10);
assert!((identity_inv[0][1] - 0.0).abs() < 1e-10);
assert!((identity_inv[1][0] - 0.0).abs() < 1e-10);
assert!((identity_inv[1][1] - 1.0).abs() < 1e-10);
}
#[test]
fn test_simple_linear_regression() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let result = simple_linear_regression(&x, &y).expect("operation should succeed");
assert!((result.intercept - 0.0).abs() < 1e-10);
assert!((result.coefficients[0] - 2.0).abs() < 1e-10);
assert!((result.r_squared - 1.0).abs() < 1e-10);
let y_noisy = vec![2.1, 3.9, 6.2, 7.8, 10.1];
let result_noisy = simple_linear_regression(&x, &y_noisy).expect("operation should succeed");
assert!(result_noisy.intercept.abs() < 0.5);
assert!((result_noisy.coefficients[0] - 2.0).abs() < 0.2);
assert!(result_noisy.r_squared > 0.99);
}
#[test]
fn test_polynomial_regression() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![1.0, 4.0, 9.0, 16.0, 25.0];
        let linear_result = polynomial_regression(&x, &y, 1).expect("operation should succeed");
        // A straight line cannot fully capture y = x^2; for this data the
        // linear fit gives R^2 of roughly 0.963.
        assert!(linear_result.r_squared < 0.97);
let quad_result = polynomial_regression(&x, &y, 2).expect("operation should succeed");
assert!(quad_result.intercept.abs() < 1e-8);
        assert!(quad_result.coefficients[0].abs() < 1e-8);
        assert!((quad_result.coefficients[1] - 1.0).abs() < 1e-8);
        assert!((quad_result.r_squared - 1.0).abs() < 1e-10);
}
#[test]
fn test_residual_diagnostics() {
let normal_residuals = vec![0.1, -0.2, 0.3, -0.15, 0.25, -0.1, 0.05, -0.3];
let normal_diag = residual_diagnostics(&normal_residuals).expect("operation should succeed");
assert!(normal_diag["mean"].abs() < 0.1);
assert!((normal_diag["durbin_watson"] - 2.0).abs() < 1.0);
assert!(normal_diag["shapiro_p_value"] > 0.05);
let skewed_residuals = vec![0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 1.2, 1.5, 2.0];
let skewed_diag = residual_diagnostics(&skewed_residuals).expect("operation should succeed");
assert!(skewed_diag["skewness"] > 0.5);
}
#[test]
fn test_linear_regression_implementation() {
let mut df = DataFrame::new();
let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let x2 = vec![2.0, 3.0, 5.0, 4.0, 8.0];
let y = vec![5.1, 8.9, 12.7, 14.8, 19.2];
df.add_column("x1".to_string(), Series::new(x1, Some("x1".to_string())).expect("operation should succeed")).expect("operation should succeed");
df.add_column("x2".to_string(), Series::new(x2, Some("x2".to_string())).expect("operation should succeed")).expect("operation should succeed");
df.add_column("y".to_string(), Series::new(y, Some("y".to_string())).expect("operation should succeed")).expect("operation should succeed");
let result = linear_regression_impl(&df, "y", &["x1", "x2"]).expect("operation should succeed");
        // The least-squares solution for this data is approximately
        // y = 1.70 + 2.87 * x1 + 0.42 * x2.
        assert!((result.coefficients[0] - 2.87).abs() < 0.1);
        assert!((result.coefficients[1] - 0.42).abs() < 0.1);
        assert!((result.intercept - 1.70).abs() < 0.1);
assert!(result.r_squared > 0.95);
}
}