use crate::error::GreenersError;
use crate::CovarianceType;
use crate::{DataFrame, Formula};
use ndarray::{Array1, Array2};
use ndarray_linalg::Inverse;
use statrs::distribution::{ContinuousCDF, StudentsT};
use std::fmt;
use ndarray as nd;
/// Results of an IV (2SLS) regression, as produced by `IV::fit`.
#[derive(Debug)]
pub struct IvResult {
    /// Estimated coefficients, one per column of the design matrix X.
    pub params: Array1<f64>,
    /// Standard errors computed under the estimator recorded in `cov_type`.
    pub std_errors: Array1<f64>,
    /// t-statistics: `params / std_errors` (element-wise).
    pub t_values: Array1<f64>,
    /// Two-sided p-values from a Student-t with `df_resid` degrees of freedom.
    pub p_values: Array1<f64>,
    /// Coefficient of determination, computed from residuals against the
    /// original (not first-stage fitted) regressors.
    pub r_squared: f64,
    /// Number of observations used in the fit.
    pub n_obs: usize,
    /// Residual degrees of freedom: n_obs minus the number of regressors.
    pub df_resid: usize,
    /// Residual standard error: sqrt(SSR / df_resid).
    pub sigma: f64,
    /// Covariance estimator used for `std_errors` (moved in from the caller).
    pub cov_type: CovarianceType,
}
impl fmt::Display for IvResult {
    /// Renders a fixed-width (78-column) regression summary table.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use std::collections::HashSet;
        // Human-readable label for the covariance estimator in use.
        let cov_str = match &self.cov_type {
            CovarianceType::NonRobust => String::from("Non-Robust"),
            CovarianceType::HC1 => String::from("Robust (HC1)"),
            CovarianceType::HC2 => String::from("Robust (HC2)"),
            CovarianceType::HC3 => String::from("Robust (HC3)"),
            CovarianceType::HC4 => String::from("Robust (HC4)"),
            CovarianceType::NeweyWest(lags) => format!("HAC (Newey-West, L={})", lags),
            CovarianceType::Clustered(clusters) => {
                // Count the distinct cluster ids.
                let distinct: HashSet<_> = clusters.iter().collect();
                format!("Clustered ({} clusters)", distinct.len())
            }
            CovarianceType::ClusteredTwoWay(clusters1, clusters2) => {
                let distinct1: HashSet<_> = clusters1.iter().collect();
                let distinct2: HashSet<_> = clusters2.iter().collect();
                format!("Two-Way Clustered ({}×{})", distinct1.len(), distinct2.len())
            }
        };
        // Banner plus a two-column header block of label/value pairs.
        writeln!(f, "\n{:=^78}", " IV (2SLS) Regression Results ")?;
        writeln!(f, "{:<20} {:>15} || {:<20} {:>15.4}", "Dep. Variable:", "y", "R-squared:", self.r_squared)?;
        writeln!(f, "{:<20} {:>15} || {:<20} {:>15.4}", "Estimator:", "2SLS", "Sigma:", self.sigma)?;
        writeln!(f, "{:<20} {:>15} || {:<20} {:>15}", "Covariance Type:", cov_str, "No. Observations:", self.n_obs)?;
        // Coefficient table: one row per parameter, labelled x0, x1, ...
        writeln!(f, "\n{:-^78}", "")?;
        writeln!(f, "{:<10} | {:>10} | {:>10} | {:>8} | {:>8}", "Variable", "coef", "std err", "t", "P>|t|")?;
        writeln!(f, "{:-^78}", "")?;
        for (i, coef) in self.params.iter().enumerate() {
            writeln!(
                f,
                "x{:<9} | {:>10.4} | {:>10.4} | {:>8.3} | {:>8.3}",
                i, coef, self.std_errors[i], self.t_values[i], self.p_values[i]
            )?;
        }
        writeln!(f, "{:=^78}", "")
    }
}
impl IvResult {
    /// Linear prediction for new observations: `X_new · β`.
    pub fn predict(&self, x_new: &Array2<f64>) -> Array1<f64> {
        // Identical computation to `fitted_values`; delegate to keep one code path.
        self.fitted_values(x_new)
    }

    /// Fitted values `X · β` for the supplied design matrix.
    pub fn fitted_values(&self, x: &Array2<f64>) -> Array1<f64> {
        x.dot(&self.params)
    }

    /// Residuals `y − X · β` for the supplied data.
    pub fn residuals(&self, y: &Array1<f64>, x: &Array2<f64>) -> Array1<f64> {
        let fitted = self.fitted_values(x);
        y - &fitted
    }
}
/// Instrumental-variables (2SLS) estimator. A unit struct serving as a
/// namespace for the associated functions `from_formula` and `fit`.
pub struct IV;
impl IV {
    /// Builds `y`, the design matrix `X`, and the instrument matrix `Z` from
    /// formulas, then estimates the model via [`IV::fit`].
    ///
    /// * `endog_formula` — the structural equation (dependent ~ regressors).
    /// * `instrument_formula` — the instruments used to build `Z`; its
    ///   `intercept` flag controls whether a constant column is prepended.
    /// * `data` — data frame supplying all referenced columns.
    /// * `cov_type` — covariance estimator for the standard errors.
    ///
    /// # Errors
    /// Propagates design-matrix construction and column-lookup failures,
    /// plus anything returned by [`IV::fit`].
    pub fn from_formula(
        endog_formula: &Formula,
        instrument_formula: &Formula,
        data: &DataFrame,
        cov_type: CovarianceType,
    ) -> Result<IvResult, GreenersError> {
        let (y, x) = data.to_design_matrix(endog_formula)?;
        // Assemble Z column by column. When an intercept is requested it
        // occupies column 0 and shifts every instrument one column right;
        // `offset` folds both cases into a single construction path
        // (previously duplicated in two near-identical branches).
        let n_rows = data.n_rows();
        let offset = if instrument_formula.intercept { 1 } else { 0 };
        let n_cols = instrument_formula.independents.len() + offset;
        let mut z = Array2::<f64>::zeros((n_rows, n_cols));
        if offset == 1 {
            for i in 0..n_rows {
                z[[i, 0]] = 1.0;
            }
        }
        for (j, var_name) in instrument_formula.independents.iter().enumerate() {
            let col_data = data.get(var_name)?;
            for i in 0..n_rows {
                z[[i, j + offset]] = col_data[i];
            }
        }
        Self::fit(&y, &x, &z, cov_type)
    }

    /// Two-Stage Least Squares.
    ///
    /// Stage 1 projects X onto the instrument space, X̂ = Z(Z′Z)⁻¹Z′X;
    /// stage 2 runs OLS of y on X̂. Residuals are computed against the
    /// *original* X (standard 2SLS practice), and the requested covariance
    /// estimator is wrapped around the bread (X̂′X̂)⁻¹.
    ///
    /// # Errors
    /// * `ShapeMismatch` when row counts disagree, when the order condition
    ///   fails (fewer instrument columns than regressors), or when there are
    ///   no residual degrees of freedom (n ≤ k).
    /// * Linear-algebra errors when Z′Z or X̂′X̂ is singular.
    pub fn fit(
        y: &Array1<f64>,
        x: &Array2<f64>,
        z: &Array2<f64>,
        cov_type: CovarianceType,
    ) -> Result<IvResult, GreenersError> {
        let n = x.nrows();
        let k = x.ncols();
        let l = z.ncols();
        if y.len() != n || z.nrows() != n {
            return Err(GreenersError::ShapeMismatch("Row count mismatch".into()));
        }
        if l < k {
            return Err(GreenersError::ShapeMismatch(format!(
                "Order Condition Failed: Not enough instruments. Z has {} cols, X has {} cols.",
                l, k
            )));
        }
        // Without this guard `n - k` underflows usize (panic in debug builds,
        // wrap-around in release) and every df-based quantity below divides
        // by zero when n == k.
        if n <= k {
            return Err(GreenersError::ShapeMismatch(format!(
                "Insufficient observations: n ({}) must exceed the number of regressors k ({}).",
                n, k
            )));
        }
        // First stage: X̂ = Z (Z'Z)^-1 Z'X.
        let z_t = z.t();
        let zt_z_inv = z_t.dot(z).inv()?;
        let first_stage_coeffs = zt_z_inv.dot(&z_t.dot(x));
        let x_hat = z.dot(&first_stage_coeffs);
        // Second stage: OLS of y on the fitted regressors X̂.
        let x_hat_t = x_hat.t();
        let xht_xh_inv = x_hat_t.dot(&x_hat).inv()?;
        let beta = xht_xh_inv.dot(&x_hat_t.dot(y));
        // 2SLS residuals use the ORIGINAL X, not X̂.
        let residuals = y - &x.dot(&beta);
        let ssr = residuals.dot(&residuals);
        let df_resid = n - k;
        let sigma2 = ssr / (df_resid as f64);
        let sigma = sigma2.sqrt();
        let cov_matrix = match cov_type {
            // Classical homoskedastic covariance: sigma^2 (X̂'X̂)^-1.
            CovarianceType::NonRobust => &xht_xh_inv * sigma2,
            CovarianceType::HC1 => {
                let u_squared = residuals.mapv(|r| r.powi(2));
                let meat = Self::weighted_meat(&x_hat, &u_squared);
                let sandwich = xht_xh_inv.dot(&meat).dot(&xht_xh_inv);
                // HC1 small-sample correction: n / (n - k).
                sandwich * ((n as f64) / (df_resid as f64))
            }
            CovarianceType::HC2 => {
                let leverage = Self::leverages(&x_hat, &xht_xh_inv);
                let mut u_adjusted = Array1::<f64>::zeros(n);
                for i in 0..n {
                    let h_i = leverage[i];
                    // Near-unit leverage would blow up 1/(1-h); fall back to u^2.
                    if h_i >= 0.9999 {
                        u_adjusted[i] = residuals[i].powi(2);
                    } else {
                        u_adjusted[i] = residuals[i].powi(2) / (1.0 - h_i);
                    }
                }
                let meat = Self::weighted_meat(&x_hat, &u_adjusted);
                xht_xh_inv.dot(&meat).dot(&xht_xh_inv)
            }
            CovarianceType::HC3 => {
                let leverage = Self::leverages(&x_hat, &xht_xh_inv);
                let mut u_adjusted = Array1::<f64>::zeros(n);
                for i in 0..n {
                    let h_i = leverage[i];
                    if h_i >= 0.9999 {
                        u_adjusted[i] = residuals[i].powi(2);
                    } else {
                        // HC3 divides by (1-h)^2, a jackknife-style adjustment.
                        u_adjusted[i] = residuals[i].powi(2) / (1.0 - h_i).powi(2);
                    }
                }
                let meat = Self::weighted_meat(&x_hat, &u_adjusted);
                xht_xh_inv.dot(&meat).dot(&xht_xh_inv)
            }
            CovarianceType::HC4 => {
                let leverage = Self::leverages(&x_hat, &xht_xh_inv);
                let mut u_adjusted = Array1::<f64>::zeros(n);
                for i in 0..n {
                    let h_i = leverage[i];
                    if h_i >= 0.9999 {
                        u_adjusted[i] = residuals[i].powi(2);
                    } else {
                        // HC4 exponent delta_i = min(4, n*h_i / k).
                        let delta_i = 4.0_f64.min((n as f64) * h_i / (k as f64));
                        u_adjusted[i] = residuals[i].powi(2) / (1.0 - h_i).powf(delta_i);
                    }
                }
                let meat = Self::weighted_meat(&x_hat, &u_adjusted);
                xht_xh_inv.dot(&meat).dot(&xht_xh_inv)
            }
            CovarianceType::NeweyWest(lags) => {
                // Lag-0 term is the plain heteroskedasticity meat.
                let u_squared = residuals.mapv(|r| r.powi(2));
                let mut meat = Self::weighted_meat(&x_hat, &u_squared);
                // Add Bartlett-kernel-weighted autocovariance terms.
                // (Renamed loop variable: it previously shadowed the instrument
                // count `l` above.)
                for lag in 1..=lags {
                    let weight = 1.0 - (lag as f64) / ((lags + 1) as f64);
                    let mut omega_l = Array2::<f64>::zeros((k, k));
                    for t in lag..n {
                        let scale = residuals[t] * residuals[t - lag];
                        let row_t = x_hat.row(t);
                        let row_prev = x_hat.row(t - lag);
                        for i in 0..k {
                            for j in 0..k {
                                omega_l[[i, j]] += scale * row_t[i] * row_prev[j];
                            }
                        }
                    }
                    // Symmetrize: Omega_l + Omega_l'.
                    let omega_l_t = omega_l.t();
                    let term = &omega_l + &omega_l_t;
                    meat = meat + (&term * weight);
                }
                let sandwich = xht_xh_inv.dot(&meat).dot(&xht_xh_inv);
                sandwich * ((n as f64) / (df_resid as f64))
            }
            CovarianceType::Clustered(ref cluster_ids) => {
                if cluster_ids.len() != n {
                    return Err(GreenersError::ShapeMismatch(format!(
                        "Cluster IDs length ({}) must match number of observations ({})",
                        cluster_ids.len(),
                        n
                    )));
                }
                let n_clusters = cluster_ids
                    .iter()
                    .collect::<std::collections::HashSet<_>>()
                    .len();
                let meat = Self::cluster_meat(&x_hat, &residuals, cluster_ids, k);
                let sandwich = xht_xh_inv.dot(&meat).dot(&xht_xh_inv);
                // Stata-style finite-sample corrections: G/(G-1) * (n-1)/(n-k).
                // NOTE(review): with a single cluster this divides by zero —
                // callers should supply at least two clusters.
                let g_correction = (n_clusters as f64) / ((n_clusters - 1) as f64);
                let df_correction = ((n - 1) as f64) / (df_resid as f64);
                sandwich * g_correction * df_correction
            }
            CovarianceType::ClusteredTwoWay(ref cluster_ids_1, ref cluster_ids_2) => {
                if cluster_ids_1.len() != n || cluster_ids_2.len() != n {
                    return Err(GreenersError::ShapeMismatch(format!(
                        "Both cluster ID vectors must match number of observations ({})",
                        n
                    )));
                }
                // Cameron–Gelbach–Miller: V = V_1 + V_2 - V_12, where V_12
                // clusters on the intersection of the two groupings.
                let meat_1 = Self::cluster_meat(&x_hat, &residuals, cluster_ids_1, k);
                let meat_2 = Self::cluster_meat(&x_hat, &residuals, cluster_ids_2, k);
                // Encode each (c1, c2) pair as a single unique intersection id.
                let max_cluster2 = cluster_ids_2.iter().max().unwrap_or(&0) + 1;
                let intersection_ids: Vec<usize> = cluster_ids_1
                    .iter()
                    .zip(cluster_ids_2.iter())
                    .map(|(&c1, &c2)| c1 * max_cluster2 + c2)
                    .collect();
                let meat_12 = Self::cluster_meat(&x_hat, &residuals, &intersection_ids, k);
                let meat = &meat_1 + &meat_2 - &meat_12;
                let sandwich = xht_xh_inv.dot(&meat).dot(&xht_xh_inv);
                use std::collections::HashSet;
                let n_clusters_1: HashSet<_> = cluster_ids_1.iter().collect();
                let n_clusters_2: HashSet<_> = cluster_ids_2.iter().collect();
                // Use the smaller cluster count for the finite-sample correction.
                let g = n_clusters_1.len().min(n_clusters_2.len());
                let g_correction = (g as f64) / ((g - 1) as f64);
                let df_correction = ((n - 1) as f64) / (df_resid as f64);
                sandwich * g_correction * df_correction
            }
        };
        let std_errors = cov_matrix.diag().mapv(f64::sqrt);
        let t_values = &beta / &std_errors;
        // Two-sided p-values from a Student-t with df_resid degrees of freedom
        // (df_resid > 0 is guaranteed by the n <= k guard above).
        let t_dist = StudentsT::new(0.0, 1.0, df_resid as f64)
            .map_err(|_| GreenersError::OptimizationFailed)?;
        let p_values = t_values.mapv(|t| 2.0 * (1.0 - t_dist.cdf(t.abs())));
        let y_mean = y.mean().unwrap_or(0.0);
        let sst = y.mapv(|val| (val - y_mean).powi(2)).sum();
        let r_squared = 1.0 - (ssr / sst);
        Ok(IvResult {
            params: beta,
            std_errors,
            t_values,
            p_values,
            r_squared,
            n_obs: n,
            df_resid,
            sigma,
            cov_type,
        })
    }

    /// Leverage values h_i = x̂_i' (X̂'X̂)⁻¹ x̂_i for every observation.
    /// (Private helper shared by the HC2/HC3/HC4 branches.)
    fn leverages(x_hat: &Array2<f64>, xht_xh_inv: &Array2<f64>) -> Array1<f64> {
        let n = x_hat.nrows();
        let mut leverage = Array1::<f64>::zeros(n);
        for i in 0..n {
            let xhat_i = x_hat.row(i);
            let temp = xht_xh_inv.dot(&xhat_i);
            leverage[i] = xhat_i.dot(&temp);
        }
        leverage
    }

    /// Meat matrix X̂' diag(w) X̂ for per-observation weights `w`.
    /// (Private helper shared by the heteroskedasticity-robust branches.)
    fn weighted_meat(x_hat: &Array2<f64>, weights: &Array1<f64>) -> Array2<f64> {
        let mut scaled = x_hat.clone();
        for (i, mut row) in scaled.axis_iter_mut(nd::Axis(0)).enumerate() {
            row *= weights[i];
        }
        x_hat.t().dot(&scaled)
    }

    /// Clustered meat matrix Σ_g s_g s_g' with per-cluster score s_g = Σ_{i∈g} u_i x̂_i.
    /// Mathematically identical to the pairwise double sum Σ_{i,j∈g} u_i u_j x̂_i x̂_j'
    /// but O(|g|·k) per cluster instead of O(|g|²·k²).
    /// (Private helper shared by the one-way and two-way clustered branches.)
    fn cluster_meat(
        x_hat: &Array2<f64>,
        residuals: &Array1<f64>,
        cluster_ids: &[usize],
        k: usize,
    ) -> Array2<f64> {
        use std::collections::HashMap;
        // Group observation indices by cluster id.
        let mut clusters: HashMap<usize, Vec<usize>> = HashMap::new();
        for (obs_idx, &cluster_id) in cluster_ids.iter().enumerate() {
            clusters.entry(cluster_id).or_default().push(obs_idx);
        }
        let mut meat = Array2::<f64>::zeros((k, k));
        for obs_indices in clusters.values() {
            // Score sum for this cluster.
            let mut score = Array1::<f64>::zeros(k);
            for &obs_idx in obs_indices {
                let row = x_hat.row(obs_idx);
                for p in 0..k {
                    score[p] += residuals[obs_idx] * row[p];
                }
            }
            // Accumulate the outer product s_g s_g'.
            for p in 0..k {
                for q in 0..k {
                    meat[[p, q]] += score[p] * score[q];
                }
            }
        }
        meat
    }
}