use super::types::CooksDistanceResult;
use crate::core::compute_leverage;
use crate::error::{Error, Result};
use crate::linalg::{vec_sub, Matrix};
/// Computes Cook's distance for every observation of an OLS regression of
/// `y` on `x_vars` (an intercept column is added automatically).
///
/// Cook's distance `D_i = r_i^2 * h_i / (p * MSE * (1 - h_i)^2)` measures how
/// strongly observation `i` influences the fitted values. The result carries
/// the per-observation distances along with the common 4/n, 4/df and D > 1
/// influence thresholds and the observations (1-based indices) exceeding each.
///
/// # Errors
/// * `Error::InsufficientData` when `n <= p` (where `p` = predictors + 1).
/// * `Error::SingularMatrix` when the design matrix is rank-deficient.
/// * `Error::InvalidInput` when validation fails or the fitted coefficients
///   contain NaN.
pub fn cooks_distance_test(y: &[f64], x_vars: &[Vec<f64>]) -> Result<CooksDistanceResult> {
    let n = y.len();
    let k = x_vars.len();
    let p = k + 1; // intercept + one coefficient per predictor
    if n <= p {
        return Err(Error::InsufficientData {
            required: p + 1,
            available: n,
        });
    }
    super::helpers::validate_regression_data(y, x_vars)?;

    // Row-major n x p design matrix [1 | x_1 | ... | x_k]. The buffer is
    // pre-filled with 1.0, so the intercept column needs no extra write.
    let mut x_data = vec![1.0; n * p];
    for row in 0..n {
        for (col, x_var) in x_vars.iter().enumerate() {
            x_data[row * p + col + 1] = x_var[row];
        }
    }
    let x_full = Matrix::new(n, p, x_data);

    // Least-squares fit via QR: X = QR, beta = R^{-1} (Q^T y).
    let (q, r) = x_full.qr();
    let mut r_upper = Matrix::zeros(p, p);
    for i in 0..p {
        for j in 0..p {
            r_upper.set(i, j, r.get(i, j));
        }
    }
    let qty = q.transpose().mul_vec(y);
    let rhs_mat = Matrix::new(p, 1, qty[0..p].to_vec());
    let r_inv = match r_upper.invert_upper_triangular() {
        Some(inv) => inv,
        None => return Err(Error::SingularMatrix),
    };
    let beta = r_inv.matmul(&rhs_mat);
    let beta_data = beta.data;
    if beta_data.iter().any(|&b| b.is_nan()) {
        return Err(Error::InvalidInput("Coefficients contain NaN".to_string()));
    }

    let predictions = x_full.mul_vec(&beta_data);
    let residuals = vec_sub(y, &predictions);
    let rss: f64 = residuals.iter().map(|&r| r * r).sum();
    let df_residual = n - p;
    let mse = rss / (df_residual as f64);

    // (X^T X)^{-1} = R^{-1} R^{-T}. Reuse the inverse already computed for the
    // coefficient solve; previously a second, redundant QR of the same matrix
    // was performed here.
    let xtx_inv = r_inv.matmul(&r_inv.transpose());
    let leverage = compute_leverage(&x_full, &xtx_inv);

    // Sample variance of y. The mean is hoisted out of the loop (it was
    // previously recomputed per element, making this O(n^2)).
    let y_mean = y.iter().sum::<f64>() / n as f64;
    let y_variance: f64 =
        y.iter().map(|&yi| (yi - y_mean).powi(2)).sum::<f64>() / (n - 1) as f64;

    // With a (near-)perfect fit the distances are numerically meaningless,
    // so report them all as zero.
    let mse_is_effectively_zero = mse < y_variance * f64::EPSILON.sqrt() || mse < 1e-20;
    let distances: Vec<f64> = if mse_is_effectively_zero {
        vec![0.0; n]
    } else {
        residuals
            .iter()
            .zip(leverage.iter())
            .map(|(&r_i, &h_i)| {
                // Clamp 1 - h away from zero so high-leverage points do not
                // produce infinities.
                let one_minus_h = (1.0 - h_i).max(f64::EPSILON);
                let numerator = r_i * r_i * h_i;
                let denominator = (p as f64) * mse * one_minus_h * one_minus_h;
                if denominator > 0.0 {
                    let d_i = numerator / denominator;
                    // Guard against NaN / negative artifacts from roundoff.
                    if d_i.is_finite() && d_i >= 0.0 {
                        d_i
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            })
            .collect()
    };

    let threshold_4_over_n = 4.0 / (n as f64);
    // df_residual >= 1 is guaranteed by the n <= p check above; the guard is
    // kept for defensive clarity.
    let threshold_4_over_df = if df_residual > 0 {
        4.0 / (df_residual as f64)
    } else {
        0.0
    };
    let threshold_1 = 1.0;

    let influential_4_over_n = influential_indices(&distances, threshold_4_over_n);
    let influential_4_over_df = influential_indices(&distances, threshold_4_over_df);
    let influential_1 = influential_indices(&distances, threshold_1);

    let max_d = distances.iter().cloned().fold(0.0, f64::max);
    // 1-based index of the largest distance (0 only when distances is empty,
    // which cannot happen given n > p).
    let max_idx = distances
        .iter()
        .enumerate()
        .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
        .map(|(i, _)| i + 1)
        .unwrap_or(0);

    let interpretation = if influential_1.is_empty() && max_d < 1.0 {
        format!(
            "Maximum Cook's distance is {:.4} (observation {}). No observations exceed D > 1 threshold. No highly influential observations detected.",
            max_d, max_idx
        )
    } else if !influential_1.is_empty() {
        format!(
            "Maximum Cook's distance is {:.4} (observation {}). {} observation(s) exceed D > 1 threshold: {:?}. These observations have high influence on the model.",
            max_d, max_idx, influential_1.len(), influential_1
        )
    } else {
        format!(
            "Maximum Cook's distance is {:.4} (observation {}). {} observation(s) exceed 4/n threshold ({:.4}): {:?}",
            max_d, max_idx, influential_4_over_n.len(), threshold_4_over_n, influential_4_over_n
        )
    };
    let guidance = if !influential_1.is_empty() {
        format!(
            "Highly influential observations detected (D > 1). Investigate observations {:?}. Consider: 1) verifying data accuracy, 2) checking if these observations represent a different population, 3) assessing model fit without these points, 4) using robust regression methods.",
            influential_1
        )
    } else if (influential_4_over_n.len() as f64) > n as f64 / 5.0 {
        format!(
            "Multiple observations ({}) exceed the 4/n threshold. This may indicate model misspecification or outliers. Consider examining residual plots and checking model assumptions.",
            influential_4_over_n.len()
        )
    } else if !influential_4_over_n.is_empty() {
        format!(
            "Some observations ({:?}) are moderately influential. Examine these points to ensure they are valid and not data entry errors.",
            influential_4_over_n
        )
    } else {
        "No influential observations detected. The model appears stable with respect to individual observations.".to_string()
    };
    Ok(CooksDistanceResult {
        test_name: "Cook's Distance".to_string(),
        distances,
        p,
        mse,
        threshold_4_over_n,
        threshold_4_over_df,
        threshold_1,
        influential_4_over_n,
        influential_4_over_df,
        influential_1,
        interpretation,
        guidance,
    })
}

/// 1-based indices of observations whose Cook's distance exceeds `threshold`.
fn influential_indices(distances: &[f64], threshold: f64) -> Vec<usize> {
    distances
        .iter()
        .enumerate()
        .filter(|(_, &d)| d > threshold)
        .map(|(i, _)| i + 1)
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Perfectly linear data: distances must be finite and non-negative, and
    /// no observation may cross the D > 1 threshold.
    #[test]
    fn test_cooks_distance_simple() {
        let response = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let predictor = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = cooks_distance_test(&response, &[predictor]).unwrap();
        assert_eq!(result.distances.len(), 5);
        assert!(result.distances.iter().all(|&d| d.is_finite() && d >= 0.0));
        assert!(result.influential_1.is_empty());
    }

    /// A gross outlier in the last observation should produce the largest
    /// Cook's distance at that position.
    #[test]
    fn test_cooks_distance_with_outlier() {
        let response = vec![1.0, 2.0, 3.0, 4.0, 100.0];
        let predictor = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = cooks_distance_test(&response, &[predictor]).unwrap();
        assert_eq!(result.distances.len(), 5);
        let (argmax, _) = result
            .distances
            .iter()
            .enumerate()
            .fold((0usize, f64::MIN), |best, (i, &d)| {
                if d > best.1 { (i, d) } else { best }
            });
        assert_eq!(argmax, 4);
    }

    /// Two observations cannot support an intercept + slope fit.
    #[test]
    fn test_cooks_distance_insufficient_data() {
        let too_few_y = vec![1.0, 2.0];
        let too_few_x = vec![1.0, 2.0];
        assert!(cooks_distance_test(&too_few_y, &[too_few_x]).is_err());
    }
}