use crate::error::{StatsError, StatsResult};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1};
#[derive(Debug, Clone)]
pub struct RDDResult {
pub estimate: f64,
pub std_error: f64,
pub t_stat: f64,
pub p_value: f64,
pub conf_interval: [f64; 2],
pub bandwidth: f64,
pub n_left: usize,
pub n_right: usize,
pub poly_order: usize,
pub estimator: String,
}
#[derive(Debug, Clone)]
pub struct RDDPlot {
pub x_bins: Array1<f64>,
pub y_means: Array1<f64>,
pub y_se: Array1<f64>,
pub cutoff: f64,
pub n_bins_each_side: usize,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BandwidthMethod {
IK,
CCT,
CV,
}
fn triangular_kernel(u: f64) -> f64 {
if u.abs() < 1.0 {
1.0 - u.abs()
} else {
0.0
}
}
fn epanechnikov_kernel(u: f64) -> f64 {
if u.abs() < 1.0 {
0.75 * (1.0 - u * u)
} else {
0.0
}
}
fn uniform_kernel(u: f64) -> f64 {
if u.abs() <= 1.0 {
0.5
} else {
0.0
}
}
fn local_poly_fit(
x: &[f64],
y: &[f64],
weights: &[f64],
x0: f64,
poly_order: usize,
) -> StatsResult<(f64, f64)> {
let n = x.len();
if n < poly_order + 1 {
return Err(StatsError::InsufficientData(format!(
"Need at least {} points for poly_order={}, got {}",
poly_order + 1,
poly_order,
n
)));
}
let k = poly_order + 1;
let mut xmat = Array2::<f64>::zeros((n, k));
let mut y_vec = Array1::<f64>::zeros(n);
for (i, (&xi, (&yi, &wi))) in x.iter().zip(y.iter().zip(weights.iter())).enumerate() {
let sqrt_w = wi.sqrt();
y_vec[i] = yi * sqrt_w;
let dx = xi - x0;
let mut pow = 1.0_f64;
for j in 0..k {
xmat[[i, j]] = pow * sqrt_w;
pow *= dx;
}
}
let xtx = xmat.t().dot(&xmat);
let xty = xmat.t().dot(&y_vec);
let xtx_inv = cholesky_invert_rdd(&xtx.view())?;
let beta = xtx_inv.dot(&xty);
let fitted = xmat.dot(&beta);
let residuals = &y_vec - &fitted;
let rss: f64 = residuals.iter().map(|&r| r * r).sum();
let df = (n as f64) - k as f64;
let s2 = if df > 0.0 { rss / df } else { rss };
let var_intercept = xtx_inv[[0, 0]] * s2;
let se = var_intercept.max(0.0).sqrt();
Ok((beta[0], se))
}
fn cholesky_invert_rdd(a: &scirs2_core::ndarray::ArrayView2<f64>) -> StatsResult<Array2<f64>> {
let n = a.nrows();
let mut l = Array2::<f64>::zeros((n, n));
for i in 0..n {
for j in 0..=i {
let mut s = a[[i, j]];
for p in 0..j {
s -= l[[i, p]] * l[[j, p]];
}
if i == j {
if s <= 0.0 {
return Err(StatsError::ComputationError(
"Matrix not positive definite (RDD)".into(),
));
}
l[[i, j]] = s.sqrt();
} else {
l[[i, j]] = s / l[[j, j]];
}
}
}
let mut linv = Array2::<f64>::zeros((n, n));
for j in 0..n {
linv[[j, j]] = 1.0 / l[[j, j]];
for i in (j + 1)..n {
let mut s = 0.0_f64;
for p in j..i {
s += l[[i, p]] * linv[[p, j]];
}
linv[[i, j]] = -s / l[[i, i]];
}
}
Ok(linv.t().dot(&linv))
}
fn normal_p_value_rdd(z: f64) -> f64 {
2.0 * (1.0 - normal_cdf_rdd(z.abs()))
}
fn normal_cdf_rdd(x: f64) -> f64 {
0.5 * (1.0 + erf_approx(x / std::f64::consts::SQRT_2))
}
fn erf_approx(x: f64) -> f64 {
let t = 1.0 / (1.0 + 0.3275911 * x.abs());
let y = 1.0
- (0.254829592
+ (-0.284496736 + (1.421413741 + (-1.453152027 + 1.061405429 * t) * t) * t) * t)
* t
* (-x * x).exp();
if x >= 0.0 {
y
} else {
-y
}
}
pub struct BandwidthSelector;
impl BandwidthSelector {
pub fn select(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
method: BandwidthMethod,
poly_order: usize,
) -> StatsResult<f64> {
match method {
BandwidthMethod::IK => Self::ik_bandwidth(x, y, cutoff, poly_order),
BandwidthMethod::CCT => Self::cct_bandwidth(x, y, cutoff, poly_order),
BandwidthMethod::CV => Self::cv_bandwidth(x, y, cutoff, poly_order),
}
}
fn ik_bandwidth(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
poly_order: usize,
) -> StatsResult<f64> {
let n = x.len();
let x_std = std_dev(x);
let h_pilot = 1.84 * x_std * (n as f64).powf(-1.0 / 5.0);
let f_x = x
.iter()
.filter(|&&xi| (xi - cutoff).abs() < h_pilot)
.count() as f64
/ (n as f64 * 2.0 * h_pilot).max(1e-15);
if f_x < 1e-10 {
return Err(StatsError::ComputationError(
"No observations near cutoff; cannot select bandwidth".into(),
));
}
let (x_l, y_l, x_r, y_r) = split_at_cutoff(x, y, cutoff);
let var_l = conditional_variance_at_boundary(&x_l, &y_l, cutoff, h_pilot, poly_order)?;
let var_r = conditional_variance_at_boundary(&x_r, &y_r, cutoff, h_pilot, poly_order)?;
let sigma2 = (var_l + var_r) / 2.0;
let m2_l = estimate_m2(&x_l, &y_l, cutoff)?;
let m2_r = estimate_m2(&x_r, &y_r, cutoff)?;
let m2_sq = ((m2_r - m2_l) / 2.0).powi(2);
if m2_sq < 1e-15 {
return Ok(h_pilot);
}
let p = poly_order as f64;
let c_k = 3.4375_f64; let h_opt = c_k
* (sigma2 / (f_x * m2_sq)).powf(1.0 / (2.0 * p + 3.0))
* (n as f64).powf(-1.0 / (2.0 * p + 3.0));
Ok(h_opt.max(0.01 * x_std))
}
fn cct_bandwidth(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
poly_order: usize,
) -> StatsResult<f64> {
let h_ik = Self::ik_bandwidth(x, y, cutoff, poly_order)?;
Ok(h_ik * 1.2)
}
fn cv_bandwidth(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
poly_order: usize,
) -> StatsResult<f64> {
let x_std = std_dev(x);
let h_min = 0.1 * x_std;
let h_max = 1.5 * x_std;
let n_grid = 20_usize;
let mut best_h = h_min;
let mut best_cv = f64::INFINITY;
for k in 0..n_grid {
let h = h_min + (h_max - h_min) * k as f64 / (n_grid - 1) as f64;
let cv = loocv_rdd(x, y, cutoff, h, poly_order)?;
if cv < best_cv {
best_cv = cv;
best_h = h;
}
}
Ok(best_h)
}
}
fn loocv_rdd(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
h: f64,
poly_order: usize,
) -> StatsResult<f64> {
let n = x.len();
let mut cv_score = 0.0_f64;
let mut count = 0_usize;
for i in 0..n {
let xi = x[i];
let yi = y[i];
let side = xi >= cutoff;
let x_in: Vec<f64> = (0..n)
.filter(|&j| j != i && (x[j] >= cutoff) == side)
.filter(|&j| ((x[j] - cutoff) / h).abs() < 1.0)
.map(|j| x[j])
.collect();
let y_in: Vec<f64> = (0..n)
.filter(|&j| j != i && (x[j] >= cutoff) == side)
.filter(|&j| ((x[j] - cutoff) / h).abs() < 1.0)
.map(|j| y[j])
.collect();
let w_in: Vec<f64> = x_in
.iter()
.map(|&xj| triangular_kernel((xj - cutoff) / h))
.collect();
if x_in.len() < poly_order + 1 {
continue;
}
let (y_hat, _) = local_poly_fit(&x_in, &y_in, &w_in, xi, poly_order)?;
cv_score += (yi - y_hat).powi(2);
count += 1;
}
if count == 0 {
return Err(StatsError::InsufficientData(
"No valid observations for CV bandwidth selection".into(),
));
}
Ok(cv_score / count as f64)
}
fn std_dev(x: &ArrayView1<f64>) -> f64 {
let n = x.len();
if n < 2 {
return 1.0;
}
let mean = x.iter().sum::<f64>() / n as f64;
let var = x.iter().map(|&xi| (xi - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
var.sqrt().max(1e-15)
}
fn split_at_cutoff(
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
cutoff: f64,
) -> (Vec<f64>, Vec<f64>, Vec<f64>, Vec<f64>) {
let mut x_l = Vec::new();
let mut y_l = Vec::new();
let mut x_r = Vec::new();
let mut y_r = Vec::new();
for (&xi, &yi) in x.iter().zip(y.iter()) {
if xi < cutoff {
x_l.push(xi);
y_l.push(yi);
} else {
x_r.push(xi);
y_r.push(yi);
}
}
(x_l, y_l, x_r, y_r)
}
fn conditional_variance_at_boundary(
x: &[f64],
y: &[f64],
cutoff: f64,
h: f64,
poly_order: usize,
) -> StatsResult<f64> {
let weights: Vec<f64> = x
.iter()
.map(|&xi| triangular_kernel((xi - cutoff) / h))
.collect();
let in_bw: Vec<usize> = (0..x.len()).filter(|&i| weights[i] > 0.0).collect();
if in_bw.len() < poly_order + 2 {
return Ok(1.0); }
let xin: Vec<f64> = in_bw.iter().map(|&i| x[i]).collect();
let yin: Vec<f64> = in_bw.iter().map(|&i| y[i]).collect();
let win: Vec<f64> = in_bw.iter().map(|&i| weights[i]).collect();
let n = xin.len();
let k = poly_order + 1;
let mut xmat = Array2::<f64>::zeros((n, k));
let mut y_vec = Array1::<f64>::zeros(n);
for (i, (&xi, (&yi, &wi))) in xin.iter().zip(yin.iter().zip(win.iter())).enumerate() {
let sqrt_w = wi.sqrt();
y_vec[i] = yi * sqrt_w;
let dx = xi - cutoff;
let mut pow = 1.0_f64;
for j in 0..k {
xmat[[i, j]] = pow * sqrt_w;
pow *= dx;
}
}
let xtx = xmat.t().dot(&xmat);
let xty = xmat.t().dot(&y_vec);
let xtx_inv = cholesky_invert_rdd(&xtx.view()).unwrap_or_else(|_| Array2::eye(k));
let beta = xtx_inv.dot(&xty);
let fitted = xmat.dot(&beta);
let resid = &y_vec - &fitted;
let df = (n - k) as f64;
Ok(if df > 0.0 {
resid.iter().map(|&r| r * r).sum::<f64>() / df
} else {
1.0
})
}
fn estimate_m2(x: &[f64], y: &[f64], cutoff: f64) -> StatsResult<f64> {
let n = x.len();
if n < 4 {
return Ok(0.0);
}
let k = 3_usize;
let mut xmat = Array2::<f64>::zeros((n, k));
let mut y_vec = Array1::<f64>::zeros(n);
for (i, (&xi, &yi)) in x.iter().zip(y.iter()).enumerate() {
let dx = xi - cutoff;
y_vec[i] = yi;
xmat[[i, 0]] = 1.0;
xmat[[i, 1]] = dx;
xmat[[i, 2]] = dx * dx;
}
let xtx = xmat.t().dot(&xmat);
let xty = xmat.t().dot(&y_vec);
let xtx_inv = cholesky_invert_rdd(&xtx.view()).unwrap_or_else(|_| Array2::eye(k));
let beta = xtx_inv.dot(&xty);
Ok(2.0 * beta[2])
}
pub struct RDD {
pub cutoff: f64,
pub poly_order: usize,
pub kernel: String,
}
impl RDD {
pub fn new(cutoff: f64, poly_order: usize, kernel: &str) -> Self {
Self {
cutoff,
poly_order,
kernel: kernel.to_string(),
}
}
pub fn estimate(
&self,
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
bandwidth: Option<f64>,
) -> StatsResult<RDDResult> {
let n = x.len();
if n < 4 {
return Err(StatsError::InsufficientData(
"Need at least 4 observations".into(),
));
}
if y.len() != n {
return Err(StatsError::DimensionMismatch(
"x and y must have the same length".into(),
));
}
let h = match bandwidth {
Some(bw) => bw,
None => {
BandwidthSelector::select(x, y, self.cutoff, BandwidthMethod::IK, self.poly_order)?
}
};
let k_fn: Box<dyn Fn(f64) -> f64> = match self.kernel.as_str() {
"epanechnikov" => Box::new(epanechnikov_kernel),
"uniform" => Box::new(uniform_kernel),
_ => Box::new(triangular_kernel),
};
let (x_l, y_l, w_l): (Vec<f64>, Vec<f64>, Vec<f64>) = x
.iter()
.zip(y.iter())
.filter(|(&xi, _)| xi < self.cutoff && (xi - self.cutoff).abs() <= h)
.map(|(&xi, &yi)| {
let w = k_fn((xi - self.cutoff) / h);
(xi, yi, w)
})
.fold(
(Vec::new(), Vec::new(), Vec::new()),
|(mut ax, mut ay, mut aw), (xi, yi, wi)| {
ax.push(xi);
ay.push(yi);
aw.push(wi);
(ax, ay, aw)
},
);
let (x_r, y_r, w_r): (Vec<f64>, Vec<f64>, Vec<f64>) = x
.iter()
.zip(y.iter())
.filter(|(&xi, _)| xi >= self.cutoff && (xi - self.cutoff).abs() <= h)
.map(|(&xi, &yi)| {
let w = k_fn((xi - self.cutoff) / h);
(xi, yi, w)
})
.fold(
(Vec::new(), Vec::new(), Vec::new()),
|(mut ax, mut ay, mut aw), (xi, yi, wi)| {
ax.push(xi);
ay.push(yi);
aw.push(wi);
(ax, ay, aw)
},
);
let n_left = x_l.len();
let n_right = x_r.len();
if n_left < self.poly_order + 1 {
return Err(StatsError::InsufficientData(format!(
"Insufficient observations left of cutoff ({n_left}) for poly_order={}",
self.poly_order
)));
}
if n_right < self.poly_order + 1 {
return Err(StatsError::InsufficientData(format!(
"Insufficient observations right of cutoff ({n_right}) for poly_order={}",
self.poly_order
)));
}
let (mu_l, se_l) = local_poly_fit(&x_l, &y_l, &w_l, self.cutoff, self.poly_order)?;
let (mu_r, se_r) = local_poly_fit(&x_r, &y_r, &w_r, self.cutoff, self.poly_order)?;
let estimate = mu_r - mu_l;
let std_error = (se_l * se_l + se_r * se_r).sqrt();
let t_stat = if std_error > 0.0 {
estimate / std_error
} else {
0.0
};
let p_value = normal_p_value_rdd(t_stat);
let ci = [estimate - 1.96 * std_error, estimate + 1.96 * std_error];
Ok(RDDResult {
estimate,
std_error,
t_stat,
p_value,
conf_interval: ci,
bandwidth: h,
n_left,
n_right,
poly_order: self.poly_order,
estimator: "Sharp-RDD".into(),
})
}
pub fn plot_data(
&self,
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
n_bins_each_side: usize,
) -> StatsResult<RDDPlot> {
let n = x.len();
if n < 2 {
return Err(StatsError::InsufficientData(
"Need at least 2 observations".into(),
));
}
let x_min = x.iter().cloned().fold(f64::INFINITY, f64::min);
let x_max = x.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
let mut x_bins = Vec::with_capacity(2 * n_bins_each_side);
let mut y_means = Vec::with_capacity(2 * n_bins_each_side);
let mut y_se = Vec::with_capacity(2 * n_bins_each_side);
let step_l = (self.cutoff - x_min).max(1e-10) / n_bins_each_side as f64;
for b in 0..n_bins_each_side {
let lo = x_min + b as f64 * step_l;
let hi = lo + step_l;
let mid = (lo + hi) / 2.0;
let vals: Vec<f64> = x
.iter()
.zip(y.iter())
.filter(|(&xi, _)| xi >= lo && xi < hi)
.map(|(_, &yi)| yi)
.collect();
if vals.is_empty() {
x_bins.push(mid);
y_means.push(f64::NAN);
y_se.push(f64::NAN);
} else {
let m = vals.iter().sum::<f64>() / vals.len() as f64;
let se = if vals.len() > 1 {
let var = vals.iter().map(|&v| (v - m).powi(2)).sum::<f64>()
/ (vals.len() * (vals.len() - 1)) as f64;
var.sqrt()
} else {
0.0
};
x_bins.push(mid);
y_means.push(m);
y_se.push(se);
}
}
let step_r = (x_max - self.cutoff).max(1e-10) / n_bins_each_side as f64;
for b in 0..n_bins_each_side {
let lo = self.cutoff + b as f64 * step_r;
let hi = lo + step_r;
let mid = (lo + hi) / 2.0;
let vals: Vec<f64> = x
.iter()
.zip(y.iter())
.filter(|(&xi, _)| xi >= lo && xi < hi)
.map(|(_, &yi)| yi)
.collect();
if vals.is_empty() {
x_bins.push(mid);
y_means.push(f64::NAN);
y_se.push(f64::NAN);
} else {
let m = vals.iter().sum::<f64>() / vals.len() as f64;
let se = if vals.len() > 1 {
let var = vals.iter().map(|&v| (v - m).powi(2)).sum::<f64>()
/ (vals.len() * (vals.len() - 1)) as f64;
var.sqrt()
} else {
0.0
};
x_bins.push(mid);
y_means.push(m);
y_se.push(se);
}
}
Ok(RDDPlot {
x_bins: Array1::from_vec(x_bins),
y_means: Array1::from_vec(y_means),
y_se: Array1::from_vec(y_se),
cutoff: self.cutoff,
n_bins_each_side,
})
}
}
pub struct FuzzyRDD {
pub cutoff: f64,
pub poly_order: usize,
pub kernel: String,
}
impl FuzzyRDD {
pub fn new(cutoff: f64, poly_order: usize, kernel: &str) -> Self {
Self {
cutoff,
poly_order,
kernel: kernel.to_string(),
}
}
pub fn estimate(
&self,
x: &ArrayView1<f64>,
y: &ArrayView1<f64>,
d: &ArrayView1<f64>,
bandwidth: Option<f64>,
) -> StatsResult<RDDResult> {
let n = x.len();
if y.len() != n || d.len() != n {
return Err(StatsError::DimensionMismatch(
"x, y, d must all have the same length".into(),
));
}
let h = match bandwidth {
Some(bw) => bw,
None => {
BandwidthSelector::select(x, y, self.cutoff, BandwidthMethod::IK, self.poly_order)?
}
};
let rdd_y = RDD::new(self.cutoff, self.poly_order, &self.kernel);
let res_y = rdd_y.estimate(x, y, Some(h))?;
let rdd_d = RDD::new(self.cutoff, self.poly_order, &self.kernel);
let res_d = rdd_d.estimate(x, d, Some(h))?;
let first_stage = res_d.estimate;
if first_stage.abs() < 1e-8 {
return Err(StatsError::ComputationError(
"First stage near zero; no discontinuity in treatment probability".into(),
));
}
let estimate = res_y.estimate / first_stage;
let d_hat = first_stage;
let y_hat = res_y.estimate;
let var_wald =
(res_y.std_error / d_hat).powi(2) + (y_hat / d_hat.powi(2) * res_d.std_error).powi(2);
let std_error = var_wald.sqrt();
let t_stat = if std_error > 0.0 {
estimate / std_error
} else {
0.0
};
let p_value = normal_p_value_rdd(t_stat);
let ci = [estimate - 1.96 * std_error, estimate + 1.96 * std_error];
Ok(RDDResult {
estimate,
std_error,
t_stat,
p_value,
conf_interval: ci,
bandwidth: h,
n_left: res_y.n_left,
n_right: res_y.n_right,
poly_order: self.poly_order,
estimator: "Fuzzy-RDD".into(),
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array1;
fn make_rdd_data(n: usize, cutoff: f64, effect: f64) -> (Array1<f64>, Array1<f64>) {
let x: Array1<f64> = (0..n)
.map(|i| (i as f64) / (n as f64) * 2.0 - 1.0)
.collect();
let y: Array1<f64> = x
.iter()
.map(|&xi| {
let base = 1.0 + 0.5 * xi;
let jump = if xi >= cutoff { effect } else { 0.0 };
base + jump
})
.collect();
(x, y)
}
#[test]
fn test_sharp_rdd_recovers_effect() {
let (x, y) = make_rdd_data(500, 0.0, 2.0);
let rdd = RDD::new(0.0, 1, "triangular");
let res = rdd
.estimate(&x.view(), &y.view(), None)
.expect("Sharp RDD should succeed");
assert!(
(res.estimate - 2.0).abs() < 0.3,
"Expected effect≈2.0, got {}",
res.estimate
);
assert_eq!(res.estimator, "Sharp-RDD");
}
#[test]
fn test_rdd_bandwidth_ik() {
let (x, y) = make_rdd_data(300, 0.0, 1.5);
let h = BandwidthSelector::select(&x.view(), &y.view(), 0.0, BandwidthMethod::IK, 1)
.expect("IK bandwidth should succeed");
assert!(h > 0.0, "Bandwidth must be positive");
}
#[test]
fn test_rdd_plot_data() {
let (x, y) = make_rdd_data(200, 0.0, 1.0);
let rdd = RDD::new(0.0, 1, "triangular");
let plot = rdd
.plot_data(&x.view(), &y.view(), 5)
.expect("Plot data should succeed");
assert_eq!(plot.x_bins.len(), 10);
assert_eq!(plot.n_bins_each_side, 5);
}
#[test]
fn test_fuzzy_rdd() {
let n = 400_usize;
let cutoff = 0.0_f64;
let x: Array1<f64> = (0..n).map(|i| (i as f64) / n as f64 * 2.0 - 1.0).collect();
let d: Array1<f64> = x
.iter()
.map(|&xi| if xi >= cutoff { 1.0 } else { 0.0 })
.collect();
let y: Array1<f64> = x
.iter()
.zip(d.iter())
.map(|(&xi, &di)| 1.0 + 0.5 * xi + 2.0 * di)
.collect();
let frdd = FuzzyRDD::new(cutoff, 1, "triangular");
let res = frdd
.estimate(&x.view(), &y.view(), &d.view(), None)
.expect("Fuzzy RDD should succeed");
assert!(
(res.estimate - 2.0).abs() < 0.5,
"Expected LATE≈2.0, got {}",
res.estimate
);
}
}