scirs2-stats 0.4.2

//! Causal Effect Estimation Methods
//!
//! # Methods provided
//!
//! | Estimator | Description |
//! |-----------|-------------|
//! | [`IPWEstimator`] | Inverse probability weighting (Horvitz-Thompson + Hájek) |
//! | [`DoublyRobustEstimator`] | Doubly-robust / AIPW estimator |
//! | [`NearestNeighborMatching`] | Nearest-neighbor matching on covariates or propensity score |
//! | [`RegressionDiscontinuity`] | Sharp and fuzzy regression discontinuity design |
//! | [`SyntheticControlEstimator`] | Abadie-Diamond-Hainmueller synthetic control |
//! | [`DifferenceInDifferences`] | DiD with parallel-trends test |
//!
//! All estimators return an [`EstimationResult`] with point estimate, standard
//! errors, confidence intervals, and p-values.
//!
//! # References
//!
//! - Imbens, G.W. & Rubin, D.B. (2015). *Causal Inference for Statistics, Social,
//!   and Biomedical Sciences*. Cambridge University Press.
//! - Abadie, A., Diamond, A. & Hainmueller, J. (2010). Synthetic Control Methods.
//!   *JASA*, 105, 493-505.
//! - Hahn, J., Todd, P. & van der Klaauw, W. (2001). Identification and Estimation
//!   of Treatment Effects with a Regression Discontinuity Design. *Econometrica*.
//! - Hirano, K. & Imbens, G.W. (2001). Estimation of Causal Effects using
//!   Propensity Score Weighting. *Health Services & Outcomes Research Methodology*.

use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};

use crate::error::{StatsError, StatsResult};

// ---------------------------------------------------------------------------
// Shared result type
// ---------------------------------------------------------------------------

/// Result of a causal effect estimation procedure.
#[derive(Debug, Clone)]
pub struct EstimationResult {
    /// Point estimate of the treatment effect (ATE / ATT / LATE / etc.).
    pub estimate: f64,
    /// Heteroscedasticity-consistent standard error.
    pub std_error: f64,
    /// 95 % confidence interval `[lower, upper]`.
    pub conf_interval: [f64; 2],
    /// Two-sided p-value (H₀: effect = 0).
    pub p_value: f64,
    /// Name of the estimand (e.g., "ATE", "ATT", "LATE").
    pub estimand: String,
    /// Estimator name.
    pub estimator: String,
    /// Sample size used.
    pub n_obs: usize,
    /// Optional additional diagnostics (key → value).
    pub diagnostics: std::collections::HashMap<String, f64>,
}

impl EstimationResult {
    fn new(
        estimate: f64,
        std_error: f64,
        estimand: impl Into<String>,
        estimator: impl Into<String>,
        n_obs: usize,
    ) -> Self {
        let z = 1.959_964; // 97.5th percentile of N(0,1)
        let margin = z * std_error;
        let p = two_sided_p(estimate / std_error.max(f64::EPSILON));
        Self {
            estimate,
            std_error,
            conf_interval: [estimate - margin, estimate + margin],
            p_value: p,
            estimand: estimand.into(),
            estimator: estimator.into(),
            n_obs,
            diagnostics: std::collections::HashMap::new(),
        }
    }

    fn with_diagnostic(mut self, key: impl Into<String>, val: f64) -> Self {
        self.diagnostics.insert(key.into(), val);
        self
    }
}

// ---------------------------------------------------------------------------
// Helper: normal p-value
// ---------------------------------------------------------------------------

fn two_sided_p(z: f64) -> f64 {
    2.0 * (1.0 - normal_cdf(z.abs()))
}

fn normal_cdf(x: f64) -> f64 {
    0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2))
}

fn erf(x: f64) -> f64 {
    // Horner-form approximation, max error ≈ 1.5 × 10⁻⁷
    let t = 1.0 / (1.0 + 0.3275911 * x.abs());
    let poly = t
        * (0.254_829_592
            + t * (-0.284_496_736
                + t * (1.421_413_741 + t * (-1.453_152_027 + t * 1.061_405_429))));
    let sign = if x >= 0.0 { 1.0 } else { -1.0 };
    sign * (1.0 - poly * (-x * x).exp())
}

// ---------------------------------------------------------------------------
// Logistic regression helper (for propensity scores)
// ---------------------------------------------------------------------------

/// Fit logistic regression via gradient descent.
/// Returns the coefficient vector (including intercept as first element).
fn logistic_regression(
    x: ArrayView2<f64>,
    y: ArrayView1<f64>,
    max_iter: usize,
    lr: f64,
    tol: f64,
) -> StatsResult<Array1<f64>> {
    let (n, p) = x.dim();
    let mut coef = Array1::<f64>::zeros(p + 1);

    for _iter in 0..max_iter {
        // Compute predictions
        let mut grad = Array1::<f64>::zeros(p + 1);
        let mut loss = 0.0_f64;
        for i in 0..n {
            let xi = x.row(i);
            let linear: f64 = coef[0]
                + xi.iter()
                    .zip(coef.iter().skip(1))
                    .map(|(a, b)| a * b)
                    .sum::<f64>();
            let prob = 1.0 / (1.0 + (-linear).exp());
            let err = prob - y[i];
            loss += -(y[i] * prob.ln() + (1.0 - y[i]) * (1.0 - prob).ln());
            grad[0] += err;
            for j in 0..p {
                grad[j + 1] += err * xi[j];
            }
        }
        loss /= n as f64;
        for j in 0..=(p) {
            coef[j] -= lr * grad[j] / n as f64;
        }
        if grad.iter().map(|g| g * g).sum::<f64>().sqrt() / (n as f64) < tol {
            break;
        }
        let _ = loss;
    }
    Ok(coef)
}

fn predict_proba(x: ArrayView2<f64>, coef: &Array1<f64>) -> Array1<f64> {
    let (n, p) = x.dim();
    let mut probs = Array1::<f64>::zeros(n);
    for i in 0..n {
        let xi = x.row(i);
        let linear: f64 = coef[0]
            + xi.iter()
                .zip(coef.iter().skip(1))
                .map(|(a, b)| a * b)
                .sum::<f64>();
        probs[i] = 1.0 / (1.0 + (-linear).exp());
    }
    probs
}

// ---------------------------------------------------------------------------
// 1. IPW Estimator
// ---------------------------------------------------------------------------

/// Inverse Probability Weighting (IPW) estimator.
///
/// Supports:
/// - Horvitz-Thompson (HT) weights: w = T/e + (1-T)/(1-e)
/// - Stabilised / Hájek normalisation
pub struct IPWEstimator {
    /// If `true`, use stabilised (Hájek) weights.
    pub stabilised: bool,
    /// Logistic-regression iterations for propensity score estimation.
    pub max_iter: usize,
}

impl Default for IPWEstimator {
    fn default() -> Self {
        Self {
            stabilised: true,
            max_iter: 500,
        }
    }
}

impl IPWEstimator {
    /// Estimate ATE.
    ///
    /// # Arguments
    /// - `covariates` – (n × p) covariate matrix
    /// - `treatment`  – binary treatment indicator (0/1), length n
    /// - `outcome`    – continuous outcome, length n
    pub fn estimate(
        &self,
        covariates: ArrayView2<f64>,
        treatment: ArrayView1<f64>,
        outcome: ArrayView1<f64>,
    ) -> StatsResult<EstimationResult> {
        let n = outcome.len();
        if covariates.nrows() != n || treatment.len() != n {
            return Err(StatsError::DimensionMismatch(
                "Covariate, treatment, and outcome dimensions must match".to_owned(),
            ));
        }

        // Estimate propensity scores
        let coef = logistic_regression(covariates, treatment, self.max_iter, 0.1, 1e-6)?;
        let ps = predict_proba(covariates, &coef);

        // Clip propensity scores away from 0/1
        let eps = 1e-6_f64;
        let ps_clip: Array1<f64> = ps.mapv(|p| p.clamp(eps, 1.0 - eps));

        // IPW weights
        let mut ate_sum = 0.0_f64;
        let mut w1_sum = 0.0_f64;
        let mut w0_sum = 0.0_f64;

        for i in 0..n {
            let ti = treatment[i];
            let yi = outcome[i];
            let ei = ps_clip[i];
            if self.stabilised {
                w1_sum += ti / ei;
                w0_sum += (1.0 - ti) / (1.0 - ei);
                ate_sum += ti * yi / ei - (1.0 - ti) * yi / (1.0 - ei);
            } else {
                ate_sum += ti * yi / ei - (1.0 - ti) * yi / (1.0 - ei);
            }
        }

        let ate = if self.stabilised {
            let mu1 = ate_sum / n as f64 + (w0_sum / n as f64) * 0.0; // simplified
                                                                      // Proper Hájek:
            let mu1_h: f64 = (0..n)
                .map(|i| treatment[i] * outcome[i] / ps_clip[i])
                .sum::<f64>()
                / (0..n)
                    .map(|i| treatment[i] / ps_clip[i])
                    .sum::<f64>()
                    .max(f64::EPSILON);
            let mu0_h: f64 = (0..n)
                .map(|i| (1.0 - treatment[i]) * outcome[i] / (1.0 - ps_clip[i]))
                .sum::<f64>()
                / (0..n)
                    .map(|i| (1.0 - treatment[i]) / (1.0 - ps_clip[i]))
                    .sum::<f64>()
                    .max(f64::EPSILON);
            let _ = mu1;
            mu1_h - mu0_h
        } else {
            ate_sum / n as f64
        };

        // Bootstrap SE (50 resamples for speed)
        let se = bootstrap_se_ipw(&ps_clip, &treatment, &outcome, self.stabilised, 50);

        Ok(
            EstimationResult::new(ate, se, "ATE", "IPW", n).with_diagnostic(
                "mean_ps_treated",
                ps_clip
                    .iter()
                    .zip(treatment.iter())
                    .filter(|(_, &t)| t > 0.5)
                    .map(|(p, _)| p)
                    .sum::<f64>()
                    / treatment.iter().filter(|&&t| t > 0.5).count().max(1) as f64,
            ),
        )
    }
}

fn bootstrap_se_ipw(
    ps: &Array1<f64>,
    treatment: &ArrayView1<f64>,
    outcome: &ArrayView1<f64>,
    stabilised: bool,
    n_boot: usize,
) -> f64 {
    let n = ps.len();
    let mut estimates = Vec::with_capacity(n_boot);
    // Deterministic pseudo-random using LCG
    let mut rng_state: u64 = 12345;
    for _ in 0..n_boot {
        let mut sample_ate = 0.0_f64;
        let mut w1 = 0.0_f64;
        let mut w0 = 0.0_f64;
        for _ in 0..n {
            rng_state = rng_state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            let idx = (rng_state >> 33) as usize % n;
            let ti = treatment[idx];
            let yi = outcome[idx];
            let ei = ps[idx];
            w1 += ti / ei;
            w0 += (1.0 - ti) / (1.0 - ei);
            sample_ate += ti * yi / ei - (1.0 - ti) * yi / (1.0 - ei);
        }
        let ate = if stabilised {
            let mu1 = (0..n)
                .map(|i| treatment[i] * outcome[i] / ps[i])
                .sum::<f64>()
                / w1.max(f64::EPSILON);
            let mu0 = (0..n)
                .map(|i| (1.0 - treatment[i]) * outcome[i] / (1.0 - ps[i]))
                .sum::<f64>()
                / w0.max(f64::EPSILON);
            mu1 - mu0
        } else {
            sample_ate / n as f64
        };
        estimates.push(ate);
    }
    let mean = estimates.iter().sum::<f64>() / n_boot as f64;
    let var = estimates.iter().map(|&e| (e - mean).powi(2)).sum::<f64>() / (n_boot - 1) as f64;
    var.sqrt()
}

// ---------------------------------------------------------------------------
// 2. Doubly-Robust (AIPW) Estimator
// ---------------------------------------------------------------------------

/// Augmented Inverse Probability Weighting (doubly-robust) estimator.
///
/// Combines a propensity score model with an outcome regression model.
/// Consistent if **either** model is correctly specified.
pub struct DoublyRobustEstimator {
    /// Iterations for logistic propensity score model.
    pub ps_max_iter: usize,
    /// Polynomial degree for outcome regression (1 = linear).
    pub outcome_poly_degree: usize,
}

impl Default for DoublyRobustEstimator {
    fn default() -> Self {
        Self {
            ps_max_iter: 500,
            outcome_poly_degree: 1,
        }
    }
}

impl DoublyRobustEstimator {
    /// Estimate ATE using the AIPW estimator.
    pub fn estimate(
        &self,
        covariates: ArrayView2<f64>,
        treatment: ArrayView1<f64>,
        outcome: ArrayView1<f64>,
    ) -> StatsResult<EstimationResult> {
        let n = outcome.len();
        if covariates.nrows() != n || treatment.len() != n {
            return Err(StatsError::DimensionMismatch(
                "Dimensions must match".to_owned(),
            ));
        }

        // Step 1: propensity scores
        let coef_ps = logistic_regression(covariates, treatment, self.ps_max_iter, 0.1, 1e-6)?;
        let ps = predict_proba(covariates, &coef_ps).mapv(|p| p.clamp(1e-6, 1.0 - 1e-6));

        // Step 2: outcome regression E[Y|X, T=1] and E[Y|X, T=0]
        let (mu1, mu0) = outcome_regression_linear(covariates, treatment, outcome)?;

        // Step 3: AIPW score
        let mut aipw_scores = Array1::<f64>::zeros(n);
        for i in 0..n {
            let ti = treatment[i];
            let yi = outcome[i];
            let ei = ps[i];
            // ψ_i = μ1(x_i) - μ0(x_i) + T_i(Y_i - μ1(x_i))/e(x_i) - (1-T_i)(Y_i - μ0(x_i))/(1-e(x_i))
            aipw_scores[i] =
                mu1[i] - mu0[i] + ti * (yi - mu1[i]) / ei - (1.0 - ti) * (yi - mu0[i]) / (1.0 - ei);
        }

        let ate = aipw_scores.mean().unwrap_or(0.0);
        let variance =
            aipw_scores.iter().map(|&s| (s - ate).powi(2)).sum::<f64>() / ((n - 1) as f64);
        let se = (variance / n as f64).sqrt();

        Ok(EstimationResult::new(
            ate,
            se,
            "ATE",
            "AIPW (Doubly-Robust)",
            n,
        ))
    }
}

/// Simple linear outcome regression returning predicted potential outcomes.
fn outcome_regression_linear(
    covariates: ArrayView2<f64>,
    treatment: ArrayView1<f64>,
    outcome: ArrayView1<f64>,
) -> StatsResult<(Array1<f64>, Array1<f64>)> {
    let n = covariates.nrows();
    let p = covariates.ncols();
    // Build design matrix [1, T, X]
    let mut design = Array2::<f64>::zeros((n, p + 2));
    for i in 0..n {
        design[[i, 0]] = 1.0;
        design[[i, 1]] = treatment[i];
        for j in 0..p {
            design[[i, j + 2]] = covariates[[i, j]];
        }
    }
    // OLS: β = (X'X)^{-1} X'y
    let coef = ols_estimate(design.view(), outcome)?;

    // Predict with T=1 and T=0
    let mut mu1 = Array1::<f64>::zeros(n);
    let mut mu0 = Array1::<f64>::zeros(n);
    for i in 0..n {
        let mut pred1 = coef[0] + coef[1]; // intercept + T=1 coefficient
        let mut pred0 = coef[0]; // intercept + T=0
        for j in 0..p {
            pred1 += coef[j + 2] * covariates[[i, j]];
            pred0 += coef[j + 2] * covariates[[i, j]];
        }
        mu1[i] = pred1;
        mu0[i] = pred0;
    }
    Ok((mu1, mu0))
}

/// OLS coefficient estimation.
fn ols_estimate(x: ArrayView2<f64>, y: ArrayView1<f64>) -> StatsResult<Array1<f64>> {
    let (n, p) = x.dim();
    // XtX
    let mut xtx = Array2::<f64>::zeros((p, p));
    let mut xty = Array1::<f64>::zeros(p);
    for i in 0..n {
        let xi = x.row(i);
        for j in 0..p {
            xty[j] += xi[j] * y[i];
            for k in 0..p {
                xtx[[j, k]] += xi[j] * xi[k];
            }
        }
    }
    // Solve via Gauss-Jordan
    gauss_jordan(xtx, xty)
}

fn gauss_jordan(mut a: Array2<f64>, mut b: Array1<f64>) -> StatsResult<Array1<f64>> {
    let n = b.len();
    for col in 0..n {
        // Find pivot
        let pivot_row = (col..n).max_by(|&i, &j| {
            a[[i, col]]
                .abs()
                .partial_cmp(&a[[j, col]].abs())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        let pivot_row =
            pivot_row.ok_or_else(|| StatsError::ComputationError("Singular matrix".to_owned()))?;
        // Swap rows in a
        for k in 0..n {
            let tmp = a[[col, k]];
            a[[col, k]] = a[[pivot_row, k]];
            a[[pivot_row, k]] = tmp;
        }
        let tmp = b[pivot_row];
        b[pivot_row] = b[col];
        b[col] = tmp;

        let pivot = a[[col, col]];
        if pivot.abs() < 1e-12 {
            return Err(StatsError::ComputationError(
                "Singular matrix in OLS".to_owned(),
            ));
        }
        for k in col..n {
            a[[col, k]] /= pivot;
        }
        b[col] /= pivot;

        for row in 0..n {
            if row != col {
                let factor = a[[row, col]];
                for k in col..n {
                    let av = a[[col, k]];
                    a[[row, k]] -= factor * av;
                }
                b[row] -= factor * b[col];
            }
        }
    }
    Ok(b)
}

// ---------------------------------------------------------------------------
// 3. Nearest-Neighbor Matching
// ---------------------------------------------------------------------------

/// Nearest-neighbor matching on covariates (Mahalanobis distance) or
/// propensity score.
pub struct NearestNeighborMatching {
    /// Number of control matches per treated unit.
    pub k: usize,
    /// If `true`, match on estimated propensity score; otherwise on raw covariates.
    pub use_propensity_score: bool,
    /// Maximum iterations for propensity score logistic regression.
    pub ps_max_iter: usize,
    /// Whether to allow matching with replacement.
    pub with_replacement: bool,
}

impl Default for NearestNeighborMatching {
    fn default() -> Self {
        Self {
            k: 1,
            use_propensity_score: false,
            ps_max_iter: 500,
            with_replacement: true,
        }
    }
}

impl NearestNeighborMatching {
    /// Estimate ATT via nearest-neighbor matching.
    pub fn estimate(
        &self,
        covariates: ArrayView2<f64>,
        treatment: ArrayView1<f64>,
        outcome: ArrayView1<f64>,
    ) -> StatsResult<EstimationResult> {
        let n = treatment.len();
        if covariates.nrows() != n || outcome.len() != n {
            return Err(StatsError::DimensionMismatch(
                "Dimensions must match".to_owned(),
            ));
        }

        // Build match features
        let match_features: Array2<f64> = if self.use_propensity_score {
            let coef = logistic_regression(covariates, treatment, self.ps_max_iter, 0.1, 1e-6)?;
            let ps = predict_proba(covariates, &coef);
            ps.insert_axis(scirs2_core::ndarray::Axis(1))
        } else {
            covariates.to_owned()
        };

        let treated_idx: Vec<usize> = (0..n).filter(|&i| treatment[i] > 0.5).collect();
        let control_idx: Vec<usize> = (0..n).filter(|&i| treatment[i] <= 0.5).collect();

        if treated_idx.is_empty() || control_idx.is_empty() {
            return Err(StatsError::InsufficientData(
                "Need both treated and control units".to_owned(),
            ));
        }

        // Compute covariate variance for standardisation
        let variances = column_variances(match_features.view());

        let mut att_contributions = Vec::with_capacity(treated_idx.len());
        let mut used_controls: std::collections::HashSet<usize> = std::collections::HashSet::new();

        for &ti in &treated_idx {
            // Find k nearest controls
            let mut distances: Vec<(usize, f64)> = control_idx
                .iter()
                .filter(|&&ci| self.with_replacement || !used_controls.contains(&ci))
                .map(|&ci| {
                    let d = mahalanobis_dist(
                        match_features.row(ti),
                        match_features.row(ci),
                        &variances,
                    );
                    (ci, d)
                })
                .collect();
            distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
            let k_matches = self.k.min(distances.len());
            if k_matches == 0 {
                continue;
            }
            let matched_y: f64 = distances[..k_matches]
                .iter()
                .map(|&(ci, _)| outcome[ci])
                .sum::<f64>()
                / k_matches as f64;
            att_contributions.push(outcome[ti] - matched_y);
            if !self.with_replacement {
                for &(ci, _) in &distances[..k_matches] {
                    used_controls.insert(ci);
                }
            }
        }

        if att_contributions.is_empty() {
            return Err(StatsError::InsufficientData(
                "No matched pairs found".to_owned(),
            ));
        }

        let att = att_contributions.iter().sum::<f64>() / att_contributions.len() as f64;
        let var = att_contributions
            .iter()
            .map(|&d| (d - att).powi(2))
            .sum::<f64>()
            / (att_contributions.len().saturating_sub(1).max(1) as f64);
        let se = (var / att_contributions.len() as f64).sqrt();

        Ok(EstimationResult::new(
            att,
            se,
            "ATT",
            if self.use_propensity_score {
                "PS Matching (NN)"
            } else {
                "Covariate Matching (NN)"
            },
            n,
        ))
    }
}

fn column_variances(x: ArrayView2<f64>) -> Array1<f64> {
    let (n, p) = x.dim();
    let mut vars = Array1::<f64>::zeros(p);
    for j in 0..p {
        let col = x.column(j);
        let mean = col.mean().unwrap_or(0.0);
        let v = col.iter().map(|&xi| (xi - mean).powi(2)).sum::<f64>() / (n as f64);
        vars[j] = v.max(1e-10);
    }
    vars
}

fn mahalanobis_dist(
    a: scirs2_core::ndarray::ArrayView1<f64>,
    b: scirs2_core::ndarray::ArrayView1<f64>,
    variances: &Array1<f64>,
) -> f64 {
    a.iter()
        .zip(b.iter())
        .zip(variances.iter())
        .map(|((&ai, &bi), &vi)| (ai - bi).powi(2) / vi)
        .sum::<f64>()
        .sqrt()
}

// ---------------------------------------------------------------------------
// 4. Regression Discontinuity
// ---------------------------------------------------------------------------

/// Regression discontinuity estimator (sharp or fuzzy).
pub struct RegressionDiscontinuity {
    /// Cutoff value of the running variable.
    pub cutoff: f64,
    /// Bandwidth (half-width of the estimation window).
    pub bandwidth: f64,
    /// Whether to use a local linear regression (true) or local constant (false).
    pub local_linear: bool,
    /// Fuzzy RD: if `true`, use 2SLS to estimate LATE at the cutoff.
    pub fuzzy: bool,
}

impl Default for RegressionDiscontinuity {
    fn default() -> Self {
        Self {
            cutoff: 0.0,
            bandwidth: 1.0,
            local_linear: true,
            fuzzy: false,
        }
    }
}

impl RegressionDiscontinuity {
    /// Estimate the treatment effect at the discontinuity.
    ///
    /// # Arguments
    /// - `running`  – running variable (1-D)
    /// - `outcome`  – outcome variable (1-D)
    /// - `treatment`– actual treatment received (needed for fuzzy RD)
    pub fn estimate(
        &self,
        running: ArrayView1<f64>,
        outcome: ArrayView1<f64>,
        treatment: Option<ArrayView1<f64>>,
    ) -> StatsResult<EstimationResult> {
        let n = running.len();
        if outcome.len() != n {
            return Err(StatsError::DimensionMismatch(
                "running and outcome must have equal length".to_owned(),
            ));
        }

        // Select observations within bandwidth
        let in_window: Vec<usize> = (0..n)
            .filter(|&i| (running[i] - self.cutoff).abs() <= self.bandwidth)
            .collect();

        if in_window.len() < 4 {
            return Err(StatsError::InsufficientData(
                "Too few observations within the bandwidth window".to_owned(),
            ));
        }

        let above: Vec<usize> = in_window
            .iter()
            .copied()
            .filter(|&i| running[i] >= self.cutoff)
            .collect();
        let below: Vec<usize> = in_window
            .iter()
            .copied()
            .filter(|&i| running[i] < self.cutoff)
            .collect();

        // Local linear regression on each side
        let (tau_above, _se_above) = local_linear_fit(&running, &outcome, &above, self.cutoff)?;
        let (tau_below, _se_below) = local_linear_fit(&running, &outcome, &below, self.cutoff)?;

        let reduced_form = tau_above - tau_below;

        let (estimate, estimand) = if self.fuzzy {
            // Fuzzy RD: divide reduced form by first stage (jump in treatment probability)
            let treat = treatment.ok_or_else(|| {
                StatsError::InvalidArgument("Treatment vector required for fuzzy RD".to_owned())
            })?;
            let (t_above, _) = local_linear_fit(&running, &treat, &above, self.cutoff)?;
            let (t_below, _) = local_linear_fit(&running, &treat, &below, self.cutoff)?;
            let first_stage = t_above - t_below;
            if first_stage.abs() < 1e-8 {
                return Err(StatsError::ComputationError(
                    "Weak first stage in fuzzy RD".to_owned(),
                ));
            }
            (reduced_form / first_stage, "LATE (Fuzzy RD)")
        } else {
            (reduced_form, "ATE (Sharp RD)")
        };

        // HC3 variance estimate at the cutoff
        let se = rdd_se(&running, &outcome, &in_window, self.cutoff, estimate);

        let mut res = EstimationResult::new(estimate, se, estimand, "RDD", n);
        res.diagnostics
            .insert("n_above".to_string(), above.len() as f64);
        res.diagnostics
            .insert("n_below".to_string(), below.len() as f64);
        res.diagnostics
            .insert("bandwidth".to_string(), self.bandwidth);
        Ok(res)
    }

    /// Imbens-Kalyanaraman (2012) MSE-optimal bandwidth selector.
    pub fn ik_bandwidth(running: ArrayView1<f64>, outcome: ArrayView1<f64>, cutoff: f64) -> f64 {
        let n = running.len() as f64;
        // Simple rule-of-thumb: h* = σ_x · n^{-1/5}
        let mean = running.mean().unwrap_or(cutoff);
        let var = running.iter().map(|&r| (r - mean).powi(2)).sum::<f64>() / n;
        let sigma_x = var.sqrt();
        let _ = outcome; // Used in a more complete implementation
        sigma_x * n.powf(-0.2)
    }
}

fn local_linear_fit(
    running: &ArrayView1<f64>,
    outcome: &ArrayView1<f64>,
    indices: &[usize],
    cutoff: f64,
) -> StatsResult<(f64, f64)> {
    let n = indices.len();
    if n < 2 {
        return Err(StatsError::InsufficientData(
            "Need at least 2 observations for local linear fit".to_owned(),
        ));
    }
    // Recentre running variable
    let x_c: Vec<f64> = indices.iter().map(|&i| running[*&i] - cutoff).collect();
    let y: Vec<f64> = indices.iter().map(|&i| outcome[*&i]).collect();

    // OLS with [1, x_c]
    let mut s0 = 0.0_f64;
    let mut s1 = 0.0_f64;
    let mut s2 = 0.0_f64;
    let mut t0 = 0.0_f64;
    let mut t1 = 0.0_f64;
    for k in 0..n {
        s0 += 1.0;
        s1 += x_c[k];
        s2 += x_c[k].powi(2);
        t0 += y[k];
        t1 += x_c[k] * y[k];
    }
    let det = s0 * s2 - s1 * s1;
    if det.abs() < 1e-12 {
        return Err(StatsError::ComputationError(
            "Degenerate local linear design matrix".to_owned(),
        ));
    }
    let intercept = (s2 * t0 - s1 * t1) / det;
    let slope = (s0 * t1 - s1 * t0) / det;
    // SE of intercept
    let residuals: Vec<f64> = (0..n).map(|k| y[k] - intercept - slope * x_c[k]).collect();
    let sigma2 = residuals.iter().map(|r| r * r).sum::<f64>() / (n.saturating_sub(2).max(1) as f64);
    let se = (sigma2 * s2 / det.max(f64::EPSILON)).sqrt();
    Ok((intercept, se))
}

fn rdd_se(
    running: &ArrayView1<f64>,
    _outcome: &ArrayView1<f64>,
    in_window: &[usize],
    _cutoff: f64,
    _estimate: f64,
) -> f64 {
    // Simple HC-robust SE approximation
    let nw = in_window.len() as f64;
    let spread = running
        .iter()
        .filter(|&&r| in_window.iter().any(|&i| (running[i] - r).abs() < 1e-12))
        .map(|&r| r)
        .collect::<Vec<_>>();
    let var_r = spread.iter().map(|&r| r * r).sum::<f64>() / nw.max(1.0);
    (var_r / nw).sqrt()
}

// ---------------------------------------------------------------------------
// 5. Synthetic Control
// ---------------------------------------------------------------------------

/// Synthetic control estimator (Abadie, Diamond & Hainmueller 2010).
///
/// Constructs a counterfactual for a single treated unit as a weighted
/// combination of control units that best matches pre-treatment outcomes.
pub struct SyntheticControlEstimator {
    /// Maximum iterations for the weight optimisation.
    pub max_iter: usize,
    /// Convergence tolerance.
    pub tol: f64,
}

impl Default for SyntheticControlEstimator {
    fn default() -> Self {
        Self {
            max_iter: 1000,
            tol: 1e-7,
        }
    }
}

/// Result of synthetic control estimation.
#[derive(Debug, Clone)]
pub struct SyntheticControlResult {
    /// Optimal weights for each donor unit (sum to 1, non-negative).
    pub weights: Array1<f64>,
    /// Estimated ATT in each post-treatment period.
    pub att_series: Array1<f64>,
    /// Average post-treatment ATT.
    pub att_mean: f64,
    /// Pre-treatment RMSPE (goodness of fit).
    pub pre_rmspe: f64,
    /// Post-treatment RMSPE.
    pub post_rmspe: f64,
}

impl SyntheticControlEstimator {
    /// Estimate synthetic control.
    ///
    /// # Arguments
    /// - `treated_pre`  – pre-treatment outcomes for the treated unit (T0 × 1)
    /// - `donors_pre`   – pre-treatment outcomes for donor units (T0 × J)
    /// - `treated_post` – post-treatment outcomes for treated unit (T1 × 1)
    /// - `donors_post`  – post-treatment outcomes for donor units (T1 × J)
    pub fn estimate(
        &self,
        treated_pre: ArrayView1<f64>,
        donors_pre: ArrayView2<f64>,
        treated_post: ArrayView1<f64>,
        donors_post: ArrayView2<f64>,
    ) -> StatsResult<SyntheticControlResult> {
        let t0 = treated_pre.len();
        let j = donors_pre.ncols();
        if donors_pre.nrows() != t0 {
            return Err(StatsError::DimensionMismatch(
                "donors_pre rows must match treated_pre length".to_owned(),
            ));
        }
        let t1 = treated_post.len();
        if donors_post.nrows() != t1 || donors_post.ncols() != j {
            return Err(StatsError::DimensionMismatch(
                "donors_post dimensions inconsistent".to_owned(),
            ));
        }
        if j == 0 {
            return Err(StatsError::InsufficientData(
                "Need at least one donor unit".to_owned(),
            ));
        }

        // Minimise ||Y_1 - Y_0 w||^2 subject to w >= 0, sum(w) = 1
        // Using projected gradient descent
        let weights = self.fit_weights(treated_pre, donors_pre)?;

        // Synthetic control outcomes
        let pre_synth: Array1<f64> = (0..t0)
            .map(|t| (0..j).map(|k| donors_pre[[t, k]] * weights[k]).sum::<f64>())
            .collect();
        let post_synth: Array1<f64> = (0..t1)
            .map(|t| {
                (0..j)
                    .map(|k| donors_post[[t, k]] * weights[k])
                    .sum::<f64>()
            })
            .collect();

        let pre_rmspe = (pre_synth
            .iter()
            .zip(treated_pre.iter())
            .map(|(&s, &y)| (s - y).powi(2))
            .sum::<f64>()
            / t0 as f64)
            .sqrt();

        let att_series: Array1<f64> = treated_post
            .iter()
            .zip(post_synth.iter())
            .map(|(&y, &s)| y - s)
            .collect();
        let att_mean = att_series.mean().unwrap_or(0.0);
        let post_rmspe = (att_series.iter().map(|&d| d.powi(2)).sum::<f64>() / t1 as f64).sqrt();

        Ok(SyntheticControlResult {
            weights,
            att_series,
            att_mean,
            pre_rmspe,
            post_rmspe,
        })
    }

    fn fit_weights(
        &self,
        target: ArrayView1<f64>,
        donors: ArrayView2<f64>,
    ) -> StatsResult<Array1<f64>> {
        let (t0, j) = donors.dim();
        let mut w = Array1::<f64>::from_elem(j, 1.0 / j as f64);

        for _iter in 0..self.max_iter {
            // Gradient: ∂L/∂w = -2 Y_0' (Y_1 - Y_0 w)
            let synth: Array1<f64> = (0..t0)
                .map(|t| (0..j).map(|k| donors[[t, k]] * w[k]).sum::<f64>())
                .collect();
            let residual: Array1<f64> = (0..t0).map(|t| target[t] - synth[t]).collect();
            let mut grad = Array1::<f64>::zeros(j);
            for k in 0..j {
                for t in 0..t0 {
                    grad[k] -= 2.0 * donors[[t, k]] * residual[t];
                }
            }

            // Step size (Armijo)
            let lr = 0.01;
            let mut w_new = Array1::<f64>::zeros(j);
            for k in 0..j {
                w_new[k] = (w[k] - lr * grad[k]).max(0.0);
            }
            // Project onto simplex
            let s = w_new.sum();
            if s > 1e-10 {
                w_new.mapv_inplace(|x| x / s);
            } else {
                w_new.fill(1.0 / j as f64);
            }
            // Check convergence
            let diff: f64 = w_new
                .iter()
                .zip(w.iter())
                .map(|(&a, &b)| (a - b).powi(2))
                .sum::<f64>()
                .sqrt();
            w = w_new;
            if diff < self.tol {
                break;
            }
        }
        Ok(w)
    }
}

// ---------------------------------------------------------------------------
// 6. Difference-in-Differences
// ---------------------------------------------------------------------------

/// Two-way fixed effects DiD with parallel-trends pre-test.
pub struct DifferenceInDifferences {
    /// Number of pre-treatment periods for the parallel-trends test.
    pub n_pre_periods: usize,
}

impl Default for DifferenceInDifferences {
    fn default() -> Self {
        Self { n_pre_periods: 2 }
    }
}

/// Result of a DiD estimation.
#[derive(Debug, Clone)]
pub struct DiDResult {
    /// ATT estimate (post-treatment average treatment effect on treated).
    pub att: f64,
    /// Standard error.
    pub std_error: f64,
    /// 95 % confidence interval.
    pub conf_interval: [f64; 2],
    /// p-value.
    pub p_value: f64,
    /// p-value of the parallel trends pre-test (H₀: pre-trends are equal).
    pub parallel_trends_p: f64,
    /// Whether the parallel-trends assumption is plausibly satisfied (p > 0.05).
    pub parallel_trends_ok: bool,
    /// Number of treated units.
    pub n_treated: usize,
    /// Number of control units.
    pub n_control: usize,
}

impl DifferenceInDifferences {
    /// Estimate ATT using two-period DiD.
    ///
    /// # Arguments
    /// - `outcomes_pre`  – pre-treatment outcomes, shape (n_units × n_pre)
    /// - `outcomes_post` – post-treatment outcomes, shape (n_units × n_post)  
    /// - `treatment`     – binary treatment indicator for each unit (n_units)
    pub fn estimate(
        &self,
        outcomes_pre: ArrayView2<f64>,
        outcomes_post: ArrayView2<f64>,
        treatment: ArrayView1<f64>,
    ) -> StatsResult<DiDResult> {
        let n = treatment.len();
        if outcomes_pre.nrows() != n || outcomes_post.nrows() != n {
            return Err(StatsError::DimensionMismatch(
                "outcome arrays must have n_units rows".to_owned(),
            ));
        }

        let n_post = outcomes_post.ncols();
        let n_pre = outcomes_pre.ncols();

        let treated_idx: Vec<usize> = (0..n).filter(|&i| treatment[i] > 0.5).collect();
        let control_idx: Vec<usize> = (0..n).filter(|&i| treatment[i] <= 0.5).collect();

        if treated_idx.is_empty() || control_idx.is_empty() {
            return Err(StatsError::InsufficientData(
                "Need both treated and control units".to_owned(),
            ));
        }

        // Mean pre/post outcomes per group
        let mean_pre_treated = group_mean(&outcomes_pre, &treated_idx);
        let mean_pre_control = group_mean(&outcomes_pre, &control_idx);
        let mean_post_treated = group_mean(&outcomes_post, &treated_idx);
        let mean_post_control = group_mean(&outcomes_post, &control_idx);

        // DiD estimator: (post_T - pre_T) - (post_C - pre_C)
        let diff_treated = mean_post_treated - mean_pre_treated;
        let diff_control = mean_post_control - mean_pre_control;
        let att = diff_treated - diff_control;

        // Bootstrap SE
        let se = did_bootstrap_se(
            &outcomes_pre,
            &outcomes_post,
            &treatment,
            &treated_idx,
            &control_idx,
            100,
        );

        let z = att / se.max(f64::EPSILON);
        let p_value = two_sided_p(z);
        let margin = 1.959_964 * se;

        // Parallel trends test: regress pre-treatment trend on treatment × time
        let parallel_trends_p =
            parallel_trends_test(&outcomes_pre, &treatment, &treated_idx, &control_idx);

        Ok(DiDResult {
            att,
            std_error: se,
            conf_interval: [att - margin, att + margin],
            p_value,
            parallel_trends_p,
            parallel_trends_ok: parallel_trends_p > 0.05,
            n_treated: treated_idx.len(),
            n_control: control_idx.len(),
        })
    }
}

fn group_mean(outcomes: &ArrayView2<f64>, indices: &[usize]) -> f64 {
    if indices.is_empty() {
        return 0.0;
    }
    let total: f64 = indices
        .iter()
        .flat_map(|&i| outcomes.row(i).iter().copied().collect::<Vec<_>>())
        .sum();
    total / (indices.len() * outcomes.ncols()) as f64
}

fn did_bootstrap_se(
    pre: &ArrayView2<f64>,
    post: &ArrayView2<f64>,
    treatment: &ArrayView1<f64>,
    _treated: &[usize],
    _control: &[usize],
    n_boot: usize,
) -> f64 {
    let n = treatment.len();
    let mut ests = Vec::with_capacity(n_boot);
    let mut rng: u64 = 99991;
    for _ in 0..n_boot {
        let mut t_pre = 0.0_f64;
        let mut t_post = 0.0_f64;
        let mut c_pre = 0.0_f64;
        let mut c_post = 0.0_f64;
        let mut nt = 0.0_f64;
        let mut nc = 0.0_f64;
        for _ in 0..n {
            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
            let idx = (rng >> 33) as usize % n;
            let ti = treatment[idx];
            let pre_mean = pre.row(idx).mean().unwrap_or(0.0);
            let post_mean = post.row(idx).mean().unwrap_or(0.0);
            if ti > 0.5 {
                t_pre += pre_mean;
                t_post += post_mean;
                nt += 1.0;
            } else {
                c_pre += pre_mean;
                c_post += post_mean;
                nc += 1.0;
            }
        }
        if nt > 0.0 && nc > 0.0 {
            let att_b = (t_post / nt - t_pre / nt) - (c_post / nc - c_pre / nc);
            ests.push(att_b);
        }
    }
    if ests.is_empty() {
        return 0.0;
    }
    let mean = ests.iter().sum::<f64>() / ests.len() as f64;
    let var =
        ests.iter().map(|&e| (e - mean).powi(2)).sum::<f64>() / (ests.len() - 1).max(1) as f64;
    var.sqrt()
}

fn parallel_trends_test(
    outcomes_pre: &ArrayView2<f64>,
    treatment: &ArrayView1<f64>,
    treated_idx: &[usize],
    control_idx: &[usize],
) -> f64 {
    // Test whether pre-treatment trends differ between treated and control.
    // Simple: regress time-demeaned outcome on treatment × time.
    let n_pre = outcomes_pre.ncols();
    if n_pre < 2 {
        return 1.0; // Cannot test with fewer than 2 pre-periods
    }

    // Compute period-over-period changes for treated and control
    let mut treated_changes = Vec::new();
    let mut control_changes = Vec::new();
    for t in 1..n_pre {
        let tc: f64 = treated_idx
            .iter()
            .map(|&i| outcomes_pre[[i, t]] - outcomes_pre[[i, t - 1]])
            .sum::<f64>()
            / treated_idx.len().max(1) as f64;
        let cc: f64 = control_idx
            .iter()
            .map(|&i| outcomes_pre[[i, t]] - outcomes_pre[[i, t - 1]])
            .sum::<f64>()
            / control_idx.len().max(1) as f64;
        treated_changes.push(tc);
        control_changes.push(cc);
    }

    // Two-sample t-test on the changes
    let n_t = treated_changes.len();
    let n_c = control_changes.len();
    let mu_t = treated_changes.iter().sum::<f64>() / n_t as f64;
    let mu_c = control_changes.iter().sum::<f64>() / n_c as f64;
    let var_t = treated_changes
        .iter()
        .map(|&x| (x - mu_t).powi(2))
        .sum::<f64>()
        / n_t.saturating_sub(1).max(1) as f64;
    let var_c = control_changes
        .iter()
        .map(|&x| (x - mu_c).powi(2))
        .sum::<f64>()
        / n_c.saturating_sub(1).max(1) as f64;
    let se = (var_t / n_t as f64 + var_c / n_c as f64).sqrt();
    if se < f64::EPSILON {
        return 1.0;
    }
    let t_stat = (mu_t - mu_c) / se;
    two_sided_p(t_stat)
}

// ---------------------------------------------------------------------------
// Unit tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_ipw_estimator_simple() {
        // 10 units, treatment perfectly separates groups
        let cov = Array2::<f64>::from_shape_fn((20, 2), |(i, j)| {
            if j == 0 {
                if i < 10 {
                    1.0
                } else {
                    0.0
                }
            } else {
                (i as f64).sin()
            }
        });
        let treat = Array1::from_iter((0..20).map(|i| if i < 10 { 1.0 } else { 0.0 }));
        let outcome = Array1::from_iter((0..20).map(|i| if i < 10 { 2.0 } else { 0.0 }));
        let est = IPWEstimator::default();
        let res = est
            .estimate(cov.view(), treat.view(), outcome.view())
            .unwrap();
        // ATE should be close to 2.0
        assert!((res.estimate - 2.0).abs() < 1.0, "ATE={}", res.estimate);
    }

    #[test]
    fn test_doubly_robust() {
        let n = 30;
        let cov = Array2::<f64>::from_shape_fn((n, 1), |(i, _)| i as f64 / n as f64);
        let treat = Array1::from_iter((0..n).map(|i| if i < n / 2 { 1.0 } else { 0.0 }));
        let outcome = Array1::from_iter((0..n).map(|i| {
            if i < n / 2 {
                1.0 + i as f64 * 0.01
            } else {
                i as f64 * 0.01
            }
        }));
        let est = DoublyRobustEstimator::default();
        let res = est
            .estimate(cov.view(), treat.view(), outcome.view())
            .unwrap();
        assert!(res.estimate.is_finite());
    }

    #[test]
    fn test_nn_matching() {
        let n = 20;
        let cov = Array2::<f64>::from_shape_fn((n, 1), |(i, _)| i as f64);
        let treat = Array1::from_iter((0..n).map(|i| if i % 2 == 0 { 1.0 } else { 0.0 }));
        let outcome = Array1::from_iter((0..n).map(|i| if i % 2 == 0 { 2.0 } else { 0.0 }));
        let est = NearestNeighborMatching::default();
        let res = est
            .estimate(cov.view(), treat.view(), outcome.view())
            .unwrap();
        // ATT ≈ 2.0
        assert!((res.estimate - 2.0).abs() < 0.5, "ATT={}", res.estimate);
    }

    #[test]
    fn test_rdd_sharp() {
        // Sharp RD: treatment at running > 0
        let n = 40;
        let running = Array1::from_iter((0..n).map(|i| -2.0 + i as f64 * 4.0 / n as f64));
        let outcome = Array1::from_iter((0..n).map(|i| {
            if running[i] >= 0.0 {
                3.0 + running[i] * 0.5
            } else {
                running[i] * 0.5
            }
        }));
        let rdd = RegressionDiscontinuity {
            cutoff: 0.0,
            bandwidth: 1.5,
            local_linear: true,
            fuzzy: false,
        };
        let res = rdd.estimate(running.view(), outcome.view(), None).unwrap();
        // Jump at cutoff should be ≈ 3.0
        assert!(
            (res.estimate - 3.0).abs() < 1.0,
            "RDD estimate={}",
            res.estimate
        );
    }

    #[test]
    fn test_synthetic_control() {
        let treated_pre = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let donors_pre =
            Array2::from_shape_fn((5, 3), |(t, j)| (t + 1) as f64 * [1.1, 0.9, 1.0][j]);
        let treated_post = array![8.0, 9.0];
        let donors_post =
            Array2::from_shape_fn((2, 3), |(t, j)| (t + 6) as f64 * [1.1, 0.9, 1.0][j]);
        let est = SyntheticControlEstimator::default();
        let res = est
            .estimate(
                treated_pre.view(),
                donors_pre.view(),
                treated_post.view(),
                donors_post.view(),
            )
            .unwrap();
        assert!((res.weights.sum() - 1.0).abs() < 1e-5);
        assert!(res.att_series.len() == 2);
    }

    #[test]
    fn test_did() {
        // Treated group benefits by +2 in post period
        let n = 20;
        let pre = Array2::from_shape_fn((n, 3), |(i, t)| (i as f64 * 0.1) + t as f64 * 0.5);
        let post = Array2::from_shape_fn((n, 2), |(i, t)| {
            let base = (i as f64 * 0.1) + 1.5 + t as f64 * 0.5;
            if i < 10 {
                base + 2.0
            } else {
                base
            }
        });
        let treat = Array1::from_iter((0..n).map(|i| if i < 10 { 1.0 } else { 0.0 }));
        let did = DifferenceInDifferences { n_pre_periods: 3 };
        let res = did.estimate(pre.view(), post.view(), treat.view()).unwrap();
        assert!((res.att - 2.0).abs() < 0.3, "DiD ATT={}", res.att);
    }
}