#![allow(clippy::needless_range_loop)]
use super::types::DfbetasResult;
use crate::error::{Error, Result};
use crate::linalg::{vec_sub, Matrix};
use std::collections::HashMap;
pub fn dfbetas_test(y: &[f64], x_vars: &[Vec<f64>]) -> Result<DfbetasResult> {
let n = y.len();
let k = x_vars.len();
let p = k + 1;
if n <= p {
return Err(Error::InsufficientData {
required: p + 1,
available: n,
});
}
super::helpers::validate_regression_data(y, x_vars)?;
let mut x_data = vec![1.0; n * p];
for row in 0..n {
for (col, x_var) in x_vars.iter().enumerate() {
x_data[row * p + col + 1] = x_var[row];
}
}
let x_full = Matrix::new(n, p, x_data);
let (q, r) = x_full.qr();
let mut r_upper = Matrix::zeros(p, p);
for i in 0..p {
for j in 0..p {
r_upper.set(i, j, r.get(i, j));
}
}
let q_t = q.transpose();
let qty = q_t.mul_vec(y);
let rhs_vec = qty[0..p].to_vec();
let rhs_mat = Matrix::new(p, 1, rhs_vec);
let r_inv = match r_upper.invert_upper_triangular() {
Some(inv) => inv,
None => return Err(Error::SingularMatrix),
};
let beta = r_inv.matmul(&rhs_mat);
let beta_data = beta.data;
if beta_data.iter().any(|&b| b.is_nan()) {
return Err(Error::InvalidInput("Coefficients contain NaN".to_string()));
}
let predictions = x_full.mul_vec(&beta_data);
let residuals = vec_sub(y, &predictions);
let rss: f64 = residuals.iter().map(|&r| r * r).sum();
let df_residual = n - p;
let mse = rss / (df_residual as f64);
let _sigma = mse.sqrt();
let xtx_inv = r_inv.matmul(&r_inv.transpose());
let mut leverage = vec![0.0; n];
for i in 0..n {
let mut h_ii = 0.0;
for j in 0..p {
let x_ij = if j == 0 { 1.0 } else { x_vars[j - 1][i] };
for k in 0..p {
let x_ik = if k == 0 { 1.0 } else { x_vars[k - 1][i] };
h_ii += x_ij * xtx_inv.get(j, k) * x_ik;
}
}
leverage[i] = h_ii;
}
let mut dfbetas = vec![vec![0.0; p]; n];
let mut xtx_inv_diag_sqrt = vec![0.0; p];
for j in 0..p {
let diag_val = xtx_inv.get(j, j);
xtx_inv_diag_sqrt[j] = diag_val.sqrt();
}
for i in 0..n {
let r_i = residuals[i];
let h_ii = leverage[i];
let one_minus_h = (1.0 - h_ii).max(f64::EPSILON);
let loo_rss = (rss - r_i * r_i / one_minus_h).max(0.0);
let sigma_i = (loo_rss / (df_residual - 1) as f64).sqrt();
for j in 0..p {
let xtx_inv_x = {
let mut sum = 0.0;
for k in 0..p {
let x_ik = if k == 0 { 1.0 } else { x_vars[k - 1][i] };
sum += xtx_inv.get(j, k) * x_ik;
}
sum
};
let dfbeta = if sigma_i > 0.0 && xtx_inv_diag_sqrt[j] > 0.0 {
r_i * xtx_inv_x / (one_minus_h * sigma_i * xtx_inv_diag_sqrt[j])
} else {
0.0
};
dfbetas[i][j] = dfbeta;
}
}
let threshold = 2.0 / (n as f64).sqrt();
let mut influential_observations: HashMap<usize, Vec<usize>> = HashMap::new();
for j in 0..p {
let mut influential: Vec<usize> = Vec::new();
for i in 0..n {
if dfbetas[i][j].abs() > threshold {
influential.push(i + 1); }
}
if !influential.is_empty() {
influential_observations.insert(j + 1, influential); }
}
let mut max_dfbetas = 0.0;
let mut max_obs = 0;
let mut max_coef = 0;
for i in 0..n {
for j in 0..p {
let abs_val = dfbetas[i][j].abs();
if abs_val > max_dfbetas {
max_dfbetas = abs_val;
max_obs = i + 1;
max_coef = j + 1;
}
}
}
let total_influential: usize = influential_observations
.values()
.map(|v| v.len())
.sum();
let interpretation = if total_influential == 0 {
format!(
"Maximum |DFBETAS| is {:.4} (observation {}, coefficient {}). No observations exceed the threshold of {:.4}. No highly influential observations detected.",
max_dfbetas, max_obs, max_coef, threshold
)
} else {
let coef_names: Vec<String> = influential_observations
.keys()
.map(|k| if *k == 1 { "intercept".to_string() } else { format!("X{}", k - 1) })
.collect();
format!(
"Maximum |DFBETAS| is {:.4} (observation {}, coefficient {}). {} observation(s) exceed the threshold of {:.4} for coefficient(s): {:?}. These observations have high influence on specific coefficients.",
max_dfbetas, max_obs, max_coef, total_influential, threshold, coef_names
)
};
let guidance = if total_influential == 0 {
"No influential observations detected. The model coefficients appear stable with respect to individual observations.".to_string()
} else if total_influential > n / 4 {
format!(
"Multiple observations ({}) exceed the DFBETAS threshold. This may indicate model misspecification or the presence of multiple outliers. Consider: 1) examining residual plots, 2) checking data accuracy, 3) testing alternative model specifications.",
total_influential
)
} else {
let mut details = Vec::new();
for (coef_idx, obs_list) in &influential_observations {
let coef_name = if *coef_idx == 1 { "intercept" } else { &format!("X{}", coef_idx - 1) };
details.push(format!("{}: {:?}", coef_name, obs_list));
}
format!(
"Some observations have high influence on specific coefficients ({}). Investigate these observations: {}. Consider verifying data accuracy and assessing model fit without these points.",
details.join(", "),
total_influential
)
};
Ok(DfbetasResult {
test_name: "DFBETAS".to_string(),
dfbetas,
n,
p,
threshold,
influential_observations,
interpretation,
guidance,
})
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dfbetas_simple() {
        // Perfectly linear data: check output shape, finiteness of every
        // DFBETAS entry, and the 2 / sqrt(n) threshold.
        let y: Vec<f64> = (1..=5).map(f64::from).collect();
        let x1 = y.clone();
        let result = dfbetas_test(&y, &[x1]).unwrap();

        assert_eq!(result.dfbetas.len(), 5);
        assert_eq!(result.dfbetas[0].len(), 2);
        assert!(result.dfbetas.iter().flatten().all(|v| v.is_finite()));
        assert!((result.threshold - 2.0 / 5.0_f64.sqrt()).abs() < 1e-10);
    }

    #[test]
    fn test_dfbetas_with_outlier() {
        // The last response is a gross outlier; its total |DFBETAS| across
        // coefficients should exceed that of a typical observation.
        let y = vec![1.0, 2.0, 3.0, 4.0, 20.0];
        let x1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = dfbetas_test(&y, &[x1]).unwrap();

        assert!(result.dfbetas.iter().flatten().all(|v| v.is_finite()));

        let influence =
            |row: usize| -> f64 { result.dfbetas[row].iter().map(|v| v.abs()).sum() };
        assert!(
            influence(4) > influence(1),
            "Outlier should have higher DFBETAS influence than typical observation"
        );
    }

    #[test]
    fn test_dfbetas_insufficient_data() {
        // Two points cannot support intercept + slope influence estimates.
        assert!(dfbetas_test(&[1.0, 2.0], &[vec![1.0, 2.0]]).is_err());
    }
}