1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
use crate::error::GreenersError;
use crate::CovarianceType; // Needed to call OLS fit
use crate::OLS; // We reuse OLS for the Breusch-Pagan auxiliary regression
use ndarray::{Array1, Array2};
use ndarray_linalg::{Inverse, SVD};
use statrs::distribution::{ChiSquared, ContinuousCDF};
/// Namespace for post-estimation regression diagnostics: residual normality
/// (Jarque-Bera), heteroskedasticity (Breusch-Pagan), autocorrelation
/// (Durbin-Watson), multicollinearity (VIF, condition number) and influence
/// measures (leverage, Cook's distance).
///
/// This is a unit struct; all functionality is exposed as associated functions.
pub struct Diagnostics;
impl Diagnostics {
    /// Jarque-Bera test for Normality of Residuals.
    /// H0: Residuals are normally distributed.
    ///
    /// The statistic combines sample skewness (S) and excess kurtosis (K - 3):
    /// JB = (n/6) * (S^2 + (K - 3)^2 / 4), asymptotically Chi-squared(2) under H0.
    ///
    /// Returns: (JB-Statistic, p-value). For degenerate inputs (fewer than two
    /// observations, or zero-variance residuals) the moments are undefined, so
    /// (NaN, NaN) is returned instead of silently propagating a 0/0 division
    /// through the Chi-squared CDF.
    pub fn jarque_bera(residuals: &Array1<f64>) -> Result<(f64, f64), GreenersError> {
        let n = residuals.len() as f64;
        let mean = residuals.mean().unwrap_or(0.0);
        // Central moments: m2 (variance), m3, m4.
        let m2 = residuals.mapv(|r| (r - mean).powi(2)).sum() / n;
        let m3 = residuals.mapv(|r| (r - mean).powi(3)).sum() / n;
        let m4 = residuals.mapv(|r| (r - mean).powi(4)).sum() / n;
        // Guard: with n < 2 or constant residuals, m2 is 0 (or NaN for n == 0),
        // making S and K 0/0. Report "undefined" explicitly.
        // (`!(n >= 2.0)` also catches a NaN n, unlike `n < 2.0`.)
        if !(n >= 2.0) || !(m2 > 0.0) {
            return Ok((f64::NAN, f64::NAN));
        }
        // Skewness (S) and Kurtosis (K).
        let skewness = m3 / m2.powf(1.5);
        let kurtosis = m4 / m2.powi(2);
        // JB = (n/6) * (S^2 + (K - 3)^2 / 4)
        let jb_stat = (n / 6.0) * (skewness.powi(2) + (kurtosis - 3.0).powi(2) / 4.0);
        // Under H0 the statistic is asymptotically Chi-squared with 2 df.
        let chi2 = ChiSquared::new(2.0).map_err(|_| GreenersError::OptimizationFailed)?;
        let p_value = 1.0 - chi2.cdf(jb_stat);
        Ok((jb_stat, p_value))
    }
    /// Breusch-Pagan test for Heteroskedasticity.
    /// H0: Homoskedasticity (Variance is constant).
    ///
    /// Steps:
    /// 1. Get squared residuals (u^2).
    /// 2. Run auxiliary regression: u^2 = alpha + delta*X + error.
    /// 3. LM Statistic = n * R_squared_aux.
    ///
    /// Returns: (LM-Statistic, p-value)
    pub fn breusch_pagan(
        residuals: &Array1<f64>,
        x: &Array2<f64>,
    ) -> Result<(f64, f64), GreenersError> {
        let n = residuals.len() as f64;
        // 1. Auxiliary dependent variable: squared residuals.
        let u_sq = residuals.mapv(|r| r.powi(2));
        // 2. Auxiliary regression of u^2 on X. NonRobust covariance suffices
        //    because only the R^2 of this regression is used.
        let aux_model = OLS::fit(&u_sq, x, CovarianceType::NonRobust)?;
        // 3. Lagrange Multiplier statistic: LM = n * R^2.
        let lm_stat = n * aux_model.r_squared;
        // Degrees of freedom = number of explanatory variables in the auxiliary
        // regression; assuming X carries an intercept column, that is k - 1.
        // saturating_sub prevents a usize underflow panic when X has no columns.
        let df = x.ncols().saturating_sub(1) as f64;
        // ChiSquared requires strictly positive df; fall back to 1 when X holds
        // only an intercept (or nothing at all).
        let df_safe = if df <= 0.0 { 1.0 } else { df };
        let chi2 = ChiSquared::new(df_safe).map_err(|_| GreenersError::OptimizationFailed)?;
        let p_value = 1.0 - chi2.cdf(lm_stat);
        Ok((lm_stat, p_value))
    }
    /// Durbin-Watson Test for Autocorrelation of Residuals.
    ///
    /// DW = sum((e_t - e_{t-1})^2) / sum(e_t^2). Range: [0, 4].
    /// - 2.0: No autocorrelation.
    /// - 0 to <2: Positive autocorrelation (Common in time series).
    /// - >2 to 4: Negative autocorrelation.
    ///
    /// Degenerate inputs (fewer than two residuals, or all-zero residuals)
    /// return the 0.0 sentinel since the statistic is undefined there.
    pub fn durbin_watson(residuals: &Array1<f64>) -> f64 {
        let n = residuals.len();
        // At least two observations are needed to form a first difference.
        if n < 2 {
            return 0.0;
        }
        // Numerator: sum of squared first differences, sum((e_t - e_{t-1})^2).
        let numerator: f64 = residuals
            .iter()
            .zip(residuals.iter().skip(1))
            .map(|(prev, curr)| (curr - prev).powi(2))
            .sum();
        // Denominator: sum of squared residuals, sum(e_t^2).
        let denominator = residuals.mapv(|r| r.powi(2)).sum();
        if denominator == 0.0 {
            // All residuals are exactly zero: DW is undefined; keep 0.0 sentinel.
            0.0
        } else {
            numerator / denominator
        }
    }
    /// Variance Inflation Factor (VIF) for each predictor.
    ///
    /// VIF measures how much the variance of an estimated regression coefficient
    /// increases due to multicollinearity. For variable j:
    ///
    /// VIF_j = 1 / (1 - R²_j)
    ///
    /// where R²_j is the R-squared from regressing X_j on all other predictors.
    ///
    /// **Interpretation:**
    /// - VIF = 1: No correlation with other predictors
    /// - VIF < 5: Acceptable multicollinearity
    /// - VIF 5-10: Moderate multicollinearity (caution needed)
    /// - VIF > 10: High multicollinearity (problematic)
    ///
    /// **Note:** If X includes an intercept column (all 1s), VIF is undefined
    /// for that column. This function returns NaN for constant columns.
    ///
    /// # Arguments
    /// * `x` - Design matrix (n × k), typically including intercept
    ///
    /// # Returns
    /// Array of VIF values for each column. Intercept column will have VIF = NaN.
    pub fn vif(x: &Array2<f64>) -> Result<Array1<f64>, GreenersError> {
        let n = x.nrows();
        let k = x.ncols();
        let mut vif_values = Array1::<f64>::zeros(k);
        for j in 0..k {
            let col_j = x.column(j);
            // Constant column (likely intercept): VIF is undefined.
            let col_mean = col_j.mean().unwrap_or(0.0);
            let col_var = col_j.mapv(|v| (v - col_mean).powi(2)).sum();
            if col_var < 1e-12 {
                vif_values[j] = f64::NAN;
                continue;
            }
            // With a single predictor there is nothing to regress against.
            if k == 1 {
                vif_values[j] = 1.0;
                continue;
            }
            // Build X_{-j} (all columns except j), copying each column directly
            // into place instead of materializing a Vec of owned columns first.
            let mut x_minus_j = Array2::<f64>::zeros((n, k - 1));
            let mut dest = 0;
            for i in 0..k {
                if i != j {
                    x_minus_j.column_mut(dest).assign(&x.column(i));
                    dest += 1;
                }
            }
            // Auxiliary regression: X_j = X_{-j} * beta + error.
            let y_j = col_j.to_owned();
            match OLS::fit(&y_j, &x_minus_j, CovarianceType::NonRobust) {
                Ok(aux_result) => {
                    let r_squared = aux_result.r_squared;
                    // VIF = 1 / (1 - R²); R² ≈ 1 means (near-)perfect
                    // collinearity, so clamp to infinity instead of exploding.
                    vif_values[j] = if r_squared >= 0.9999 {
                        f64::INFINITY
                    } else {
                        1.0 / (1.0 - r_squared)
                    };
                }
                // A failed auxiliary regression (e.g. exact collinearity) is
                // reported as infinite VIF rather than failing the whole call.
                Err(_) => vif_values[j] = f64::INFINITY,
            }
        }
        Ok(vif_values)
    }
    /// Leverage values (diagonal elements of hat matrix H = X(X'X)^-1X').
    ///
    /// Leverage measures how far an observation's predictor values are from
    /// the mean of the predictor values. High leverage points have the potential
    /// to be influential.
    ///
    /// **Interpretation:**
    /// - Average leverage: h̄ = k/n (where k = number of parameters, n = observations)
    /// - High leverage threshold: h_i > 2k/n or h_i > 3k/n
    /// - Range: 0 ≤ h_i ≤ 1
    ///
    /// **Note:** High leverage alone doesn't mean the point is influential.
    /// Use Cook's distance to identify truly influential observations.
    ///
    /// # Arguments
    /// * `x` - Design matrix (n × k)
    ///
    /// # Returns
    /// Array of leverage values (one per observation)
    ///
    /// # Errors
    /// Propagates the inversion error when X'X is singular.
    pub fn leverage(x: &Array2<f64>) -> Result<Array1<f64>, GreenersError> {
        let x_t = x.t();
        let xtx = x_t.dot(x);
        let xtx_inv = xtx.inv()?;
        // H = X(X'X)^-1X'. Only the diagonal is needed, so compute each
        // h_i = x_i' (X'X)^-1 x_i individually instead of forming the full
        // n × n hat matrix.
        let n = x.nrows();
        let mut h_values = Array1::<f64>::zeros(n);
        for i in 0..n {
            let x_i = x.row(i);
            let temp = xtx_inv.dot(&x_i);
            h_values[i] = x_i.dot(&temp);
        }
        Ok(h_values)
    }
    /// Cook's Distance for detecting influential observations.
    ///
    /// Cook's D measures the influence of each observation on the fitted values.
    /// It combines leverage and residual size to identify observations that
    /// significantly affect the regression results.
    ///
    /// Formula: D_i = (e_i² / (k * MSE)) * (h_i / (1 - h_i)²)
    ///
    /// where:
    /// - e_i = residual for observation i
    /// - k = number of parameters (including intercept)
    /// - MSE = mean squared error
    /// - h_i = leverage for observation i
    ///
    /// **Interpretation:**
    /// - D_i > 1: Highly influential (investigate!)
    /// - D_i > 4/n: Potentially influential (common threshold)
    /// - D_i > 0.5: Worth examining
    ///
    /// **Rule of thumb:** D_i > 4/(n-k-1) suggests influence
    ///
    /// # Arguments
    /// * `residuals` - Residuals from the regression
    /// * `x` - Design matrix (n × k)
    /// * `mse` - Mean squared error (σ²)
    ///
    /// # Returns
    /// Array of Cook's distances (one per observation). Entries are infinite
    /// when h_i ≈ 1 and NaN when the statistic is undefined (k = 0 or mse ≤ 0).
    pub fn cooks_distance(
        residuals: &Array1<f64>,
        x: &Array2<f64>,
        mse: f64,
    ) -> Result<Array1<f64>, GreenersError> {
        let n = residuals.len();
        let k = x.ncols();
        // Leverage values h_i (hat-matrix diagonal).
        let h_values = Self::leverage(x)?;
        let mut cook_d = Array1::<f64>::zeros(n);
        for i in 0..n {
            let e_i = residuals[i];
            let h_i = h_values[i];
            // h_i ≈ 1 would put (1 - h_i)^2 ≈ 0 in the denominator.
            if h_i >= 0.9999 {
                cook_d[i] = f64::INFINITY;
                continue;
            }
            // D_i = (e_i² / (k * MSE)) * (h_i / (1 - h_i)²)
            let numerator = e_i.powi(2) * h_i;
            let denominator = (k as f64) * mse * (1.0 - h_i).powi(2);
            cook_d[i] = if denominator > 0.0 {
                numerator / denominator
            } else {
                // k == 0 or mse <= 0: statistic undefined; avoid 0/0 and x/0.
                f64::NAN
            };
        }
        Ok(cook_d)
    }
    /// Condition Number of the design matrix.
    ///
    /// The condition number measures multicollinearity by computing the ratio
    /// of the largest to smallest singular value of X:
    ///
    /// κ(X) = σ_max / σ_min
    ///
    /// **Interpretation:**
    /// - κ < 10: No multicollinearity
    /// - κ 10-30: Moderate multicollinearity
    /// - κ 30-100: Strong multicollinearity (caution)
    /// - κ > 100: Severe multicollinearity (problematic)
    ///
    /// **Advantage over VIF:** Single number summarizing overall collinearity
    ///
    /// **Note:** Automatically handles intercept and scaling issues via SVD.
    ///
    /// # Arguments
    /// * `x` - Design matrix (n × k)
    ///
    /// # Returns
    /// Condition number (scalar). Infinity for near-singular X; NaN when X is
    /// empty (no singular values exist, so κ is undefined).
    pub fn condition_number(x: &Array2<f64>) -> Result<f64, GreenersError> {
        // Singular values only; U and V^T are not needed (hence false, false).
        let (_u, s, _vt) = x.svd(false, false)?;
        // Guard: an empty matrix has no singular values. Without this check the
        // folds below would yield -inf / inf = NaN by accident; make it explicit.
        if s.is_empty() {
            return Ok(f64::NAN);
        }
        // Condition number = max(σ) / min(σ).
        let sigma_max = s.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let sigma_min = s.iter().cloned().fold(f64::INFINITY, f64::min);
        if sigma_min < 1e-12 {
            // Near-singular matrix.
            Ok(f64::INFINITY)
        } else {
            Ok(sigma_max / sigma_min)
        }
    }
}