use crate::error::{StatsError, StatsResult};
/// Returns an ascending-sorted clone of `x`, leaving the input untouched.
///
/// `partial_cmp` returns `None` only when NaN is involved; such pairs are
/// treated as equal, so NaN positions in the output are unspecified.
fn sorted_copy(x: &[f64]) -> Vec<f64> {
    let mut out = Vec::from(x);
    out.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));
    out
}
/// Median of a slice that is already sorted ascending.
///
/// Returns NaN for an empty slice; averages the two central elements when
/// the length is even.
fn median_of_sorted(sorted: &[f64]) -> f64 {
    match sorted.len() {
        0 => f64::NAN,
        n if n % 2 == 1 => sorted[n / 2],
        n => (sorted[n / 2 - 1] + sorted[n / 2]) * 0.5,
    }
}
/// Median of an arbitrary (unsorted) slice; copies and sorts internally.
///
/// Returns NaN for an empty slice; averages the two central values when
/// the length is even.
fn median_of(x: &[f64]) -> f64 {
    if x.is_empty() {
        return f64::NAN;
    }
    let mut v = x.to_vec();
    v.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let n = v.len();
    if n % 2 == 1 {
        v[n / 2]
    } else {
        (v[n / 2 - 1] + v[n / 2]) * 0.5
    }
}
/// Huber M-estimator of location via iteratively reweighted means.
///
/// Starts from the sample median. Each iteration standardizes residuals by
/// a MAD-based scale recomputed about the current estimate, applies Huber
/// weights w(u) = min(1, k/|u|), and takes the weighted mean. Stops when
/// successive estimates differ by at most `tol`.
///
/// # Errors
/// * `InsufficientData` if `data` is empty.
/// * `InvalidArgument` if `k <= 0`.
/// * `ConvergenceError` if the weight sum degenerates or `max_iter`
///   iterations pass without convergence.
pub fn huber_location(data: &[f64], k: f64, tol: f64, max_iter: usize) -> StatsResult<f64> {
    if data.is_empty() {
        return Err(StatsError::InsufficientData(
            "huber_location requires at least one observation".into(),
        ));
    }
    if k <= 0.0 {
        return Err(StatsError::InvalidArgument(
            "tuning constant k must be positive".into(),
        ));
    }
    let mut center = median_of(data);
    for _ in 0..max_iter {
        // Robust scale: MAD about the current center, divided by 0.6745 so
        // it is comparable to a standard deviation under normality.
        let abs_devs: Vec<f64> = data.iter().map(|&x| (x - center).abs()).collect();
        let scale = median_of(&abs_devs) / 0.6745;
        if scale < f64::EPSILON {
            // Degenerate spread: more than half the data sits at `center`.
            return Ok(center);
        }
        let mut weighted_sum = 0.0_f64;
        let mut weight_total = 0.0_f64;
        for &x in data {
            let u = (x - center) / scale;
            let w = if u.abs() <= k { 1.0 } else { k / u.abs() };
            weighted_sum += w * x;
            weight_total += w;
        }
        if weight_total < f64::EPSILON {
            return Err(StatsError::ConvergenceError(
                "huber_location: weight sum near zero".into(),
            ));
        }
        let updated = weighted_sum / weight_total;
        let step = (updated - center).abs();
        center = updated;
        if step <= tol {
            return Ok(center);
        }
    }
    Err(StatsError::ConvergenceError(format!(
        "huber_location did not converge after {} iterations",
        max_iter
    )))
}
/// Tukey biweight (bisquare) estimate of location, computed in one pass
/// about the sample median.
///
/// Observations more than `c` MAD units from the median receive zero
/// weight; the rest are weighted by (1 - (u/c)^2)^2. Falls back to the
/// plain median when the MAD, or the total weight, is effectively zero.
///
/// # Errors
/// * `InsufficientData` if `data` is empty.
/// * `InvalidArgument` if `c <= 0`.
pub fn biweight_location(data: &[f64], c: f64) -> StatsResult<f64> {
    if data.is_empty() {
        return Err(StatsError::InsufficientData(
            "biweight_location requires at least one observation".into(),
        ));
    }
    if c <= 0.0 {
        return Err(StatsError::InvalidArgument(
            "tuning constant c must be positive".into(),
        ));
    }
    let center = median_of(data);
    let abs_devs: Vec<f64> = data.iter().map(|&x| (x - center).abs()).collect();
    let spread = median_of(&abs_devs);
    if spread < f64::EPSILON {
        // Zero MAD: majority of points coincide with the median.
        return Ok(center);
    }
    let mut numerator = 0.0_f64;
    let mut denominator = 0.0_f64;
    for &x in data {
        let u = (x - center) / spread;
        if u.abs() <= c {
            let shrink = 1.0 - (u / c).powi(2);
            let w = shrink * shrink;
            numerator += w * x;
            denominator += w;
        }
    }
    if denominator < f64::EPSILON {
        // Every point was rejected as an outlier; the median is the best answer.
        return Ok(center);
    }
    Ok(numerator / denominator)
}
/// Mean of the data after discarding the lowest and highest
/// `proportiontocut` fraction of the sorted observations
/// (`floor(n * proportiontocut)` values removed from each tail).
///
/// # Errors
/// * `InsufficientData` if `data` is empty or trimming removes everything.
/// * `InvalidArgument` unless `proportiontocut` lies in [0, 0.5).
pub fn trimmed_mean(data: &[f64], proportiontocut: f64) -> StatsResult<f64> {
    if data.is_empty() {
        return Err(StatsError::InsufficientData(
            "trimmed_mean requires at least one observation".into(),
        ));
    }
    if !(0.0..0.5).contains(&proportiontocut) {
        return Err(StatsError::InvalidArgument(
            "proportiontocut must be in [0, 0.5)".into(),
        ));
    }
    let sorted = sorted_copy(data);
    let n = sorted.len();
    let per_tail = (n as f64 * proportiontocut).floor() as usize;
    let (lo, hi) = (per_tail, n - per_tail);
    if lo >= hi {
        return Err(StatsError::InsufficientData(
            "trimmed_mean: proportiontocut removes all observations".into(),
        ));
    }
    let kept = &sorted[lo..hi];
    Ok(kept.iter().sum::<f64>() / kept.len() as f64)
}
/// Winsorized mean: instead of discarding tail observations, values below
/// the lower boundary order statistic or above the upper one are clamped
/// to those boundaries before averaging. `limits` gives the (lower, upper)
/// fractions to winsorize; `floor(n * fraction)` values per tail.
///
/// # Errors
/// * `InsufficientData` if `data` is empty or the limits clip everything.
/// * `InvalidArgument` if either limit is negative or they sum to >= 1.
pub fn winsorized_mean(data: &[f64], limits: (f64, f64)) -> StatsResult<f64> {
    if data.is_empty() {
        return Err(StatsError::InsufficientData(
            "winsorized_mean requires at least one observation".into(),
        ));
    }
    let (lo_frac, hi_frac) = limits;
    if lo_frac < 0.0 || hi_frac < 0.0 || lo_frac + hi_frac >= 1.0 {
        return Err(StatsError::InvalidArgument(
            "limits must be non-negative and sum to less than 1".into(),
        ));
    }
    let sorted = sorted_copy(data);
    let n = sorted.len();
    // Boundary order statistics; everything outside them gets clamped.
    let lower_pos = (n as f64 * lo_frac).floor() as usize;
    let upper_pos = n - (n as f64 * hi_frac).floor() as usize - 1;
    if lower_pos > upper_pos {
        return Err(StatsError::InsufficientData(
            "winsorized_mean: limits clip all observations".into(),
        ));
    }
    let floor_val = sorted[lower_pos];
    let ceil_val = sorted[upper_pos];
    let total: f64 = sorted.iter().map(|&x| x.clamp(floor_val, ceil_val)).sum();
    Ok(total / n as f64)
}
/// Median absolute deviation about the median, scaled by ~1.4826 so the
/// result is a consistent estimator of the standard deviation when the
/// data are normally distributed.
///
/// # Errors
/// * `InsufficientData` if `data` is empty.
pub fn mad(data: &[f64]) -> StatsResult<f64> {
    // 1 / Phi^{-1}(3/4): normal-consistency factor for the MAD.
    const NORMAL_CONSISTENCY: f64 = 1.4826022185056018;
    if data.is_empty() {
        return Err(StatsError::InsufficientData(
            "mad requires at least one observation".into(),
        ));
    }
    let center = median_of(data);
    let abs_devs: Vec<f64> = data.iter().map(|&x| (x - center).abs()).collect();
    Ok(NORMAL_CONSISTENCY * median_of(&abs_devs))
}
/// Rousseeuw–Croux Qn scale estimator (naive O(n^2) time and space).
///
/// Qn is the k-th smallest of the n(n-1)/2 pairwise absolute differences,
/// with h = floor(n/2) + 1 and k = h(h-1)/2, multiplied by the asymptotic
/// normal-consistency constant 2.2219.
///
/// # Errors
/// * `InsufficientData` for fewer than 2 observations.
pub fn qn_scale(data: &[f64]) -> StatsResult<f64> {
    const CONSISTENCY: f64 = 2.2219;
    let n = data.len();
    if n < 2 {
        return Err(StatsError::InsufficientData(
            "qn_scale requires at least 2 observations".into(),
        ));
    }
    let mut pairwise: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
    for (i, &a) in data.iter().enumerate() {
        for &b in &data[i + 1..] {
            pairwise.push((a - b).abs());
        }
    }
    pairwise.sort_by(|p, q| p.partial_cmp(q).unwrap_or(std::cmp::Ordering::Equal));
    let h = n / 2 + 1;
    let k = h * (h - 1) / 2;
    // k never exceeds the pair count for n >= 2, but clamp defensively.
    let order_idx = k.min(pairwise.len()) - 1;
    Ok(CONSISTENCY * pairwise[order_idx])
}
/// Rousseeuw–Croux Sn scale estimator (naive O(n^2) implementation).
///
/// Sn = 1.1926 * lowmed_i( highmed_j |x_i - x_j| ): for each point, take
/// the high median of its distances to every other point, then take the
/// low median of those n per-point values.
///
/// # Errors
/// * `InsufficientData` for fewer than 2 observations.
pub fn sn_scale(data: &[f64]) -> StatsResult<f64> {
    const CONSISTENCY: f64 = 1.1926;
    let n = data.len();
    if n < 2 {
        return Err(StatsError::InsufficientData(
            "sn_scale requires at least 2 observations".into(),
        ));
    }
    let mut per_point: Vec<f64> = Vec::with_capacity(n);
    for i in 0..n {
        let mut dists: Vec<f64> = (0..n)
            .filter(|&j| j != i)
            .map(|j| (data[i] - data[j]).abs())
            .collect();
        dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        // High median of the n-1 distances: 0-based index floor((n-1)/2).
        per_point.push(dists[dists.len() / 2]);
    }
    per_point.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    // Low median of the n per-point values: 0-based index (n+1)/2 - 1.
    Ok(CONSISTENCY * per_point[(n + 1) / 2 - 1])
}
/// Theil–Sen robust simple linear regression model: the slope is the
/// median of all pairwise slopes and the intercept is the median of
/// `y_i - slope * x_i`. Built via [`TheilSen::fit`].
#[derive(Debug, Clone)]
pub struct TheilSen {
// Fitted slope (median of pairwise slopes).
slope: f64,
// Fitted intercept (median of per-point intercepts).
intercept: f64,
}
impl TheilSen {
    /// Fits a Theil–Sen line to paired observations.
    ///
    /// The slope is the median over all point pairs with distinct x
    /// values; the intercept is the median of `y_i - slope * x_i`.
    ///
    /// # Errors
    /// * `DimensionMismatch` if `x` and `y` differ in length.
    /// * `InsufficientData` for fewer than 2 observations.
    /// * `ComputationError` if every pair shares the same x value.
    pub fn fit(x: &[f64], y: &[f64]) -> StatsResult<Self> {
        if x.len() != y.len() {
            return Err(StatsError::DimensionMismatch(format!(
                "x and y must have the same length, got {} and {}",
                x.len(),
                y.len()
            )));
        }
        if x.len() < 2 {
            return Err(StatsError::InsufficientData(
                "TheilSen requires at least 2 observations".into(),
            ));
        }
        let n = x.len();
        let mut pair_slopes: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
        for i in 0..n {
            for j in (i + 1)..n {
                let run = x[j] - x[i];
                // Pairs with (near-)identical x contribute no usable slope.
                if run.abs() > f64::EPSILON {
                    pair_slopes.push((y[j] - y[i]) / run);
                }
            }
        }
        if pair_slopes.is_empty() {
            return Err(StatsError::ComputationError(
                "TheilSen: all x values are identical; slope undefined".into(),
            ));
        }
        let slope = median_of(&pair_slopes);
        let point_intercepts: Vec<f64> = x
            .iter()
            .zip(y.iter())
            .map(|(&xi, &yi)| yi - slope * xi)
            .collect();
        Ok(TheilSen {
            slope,
            intercept: median_of(&point_intercepts),
        })
    }
    /// Predicted values `slope * x_i + intercept` for each input point.
    pub fn predict(&self, x: &[f64]) -> Vec<f64> {
        x.iter().map(|&xi| self.slope * xi + self.intercept).collect()
    }
    /// Fitted slope.
    pub fn slope(&self) -> f64 {
        self.slope
    }
    /// Fitted intercept.
    pub fn intercept(&self) -> f64 {
        self.intercept
    }
}
/// Passing–Bablok method-comparison regression line, built via
/// [`PassingBablok::fit`]. Used to compare two measurement methods:
/// slope ~1 and intercept ~0 indicate agreement.
#[derive(Debug, Clone)]
pub struct PassingBablok {
// Fitted slope.
slope: f64,
// Fitted intercept (median of `y_i - slope * x_i`).
intercept: f64,
}
impl PassingBablok {
    /// Fits a Passing–Bablok method-comparison regression.
    ///
    /// All pairwise slopes S_ij are computed; pairs of identical points are
    /// skipped and slopes exactly equal to -1 are discarded, following
    /// Passing & Bablok (1983). The slope estimate is the median of the
    /// remaining sorted slopes shifted upward by K, where K is the number
    /// of slopes below -1 — the shift is what makes the estimator invariant
    /// to exchanging x and y. The intercept is the median of
    /// `y_i - slope * x_i`.
    ///
    /// Bug fix vs. the previous revision: K was counted as the number of
    /// slopes *equal to* -1 and, worse, the shift terms algebraically
    /// cancelled out of both index computations, so the code silently
    /// returned the plain (unshifted) median of the slopes.
    ///
    /// # Errors
    /// * `DimensionMismatch` if `x` and `y` differ in length.
    /// * `InsufficientData` for fewer than 3 observations.
    /// * `ComputationError` if no valid slope pairs remain.
    pub fn fit(x: &[f64], y: &[f64]) -> StatsResult<Self> {
        if x.len() != y.len() {
            return Err(StatsError::DimensionMismatch(format!(
                "x and y must have the same length, got {} and {}",
                x.len(),
                y.len()
            )));
        }
        if x.len() < 3 {
            return Err(StatsError::InsufficientData(
                "PassingBablok requires at least 3 observations".into(),
            ));
        }
        let n = x.len();
        let mut slopes: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
        for i in 0..n {
            for j in (i + 1)..n {
                let dx = x[j] - x[i];
                let dy = y[j] - y[i];
                // Identical points carry no slope information.
                if dx.abs() < f64::EPSILON && dy.abs() < f64::EPSILON {
                    continue;
                }
                let s = if dx.abs() < f64::EPSILON {
                    // Vertical pair: signed infinity keeps it at the correct
                    // end of the sorted slope list.
                    if dy > 0.0 { f64::INFINITY } else { f64::NEG_INFINITY }
                } else {
                    dy / dx
                };
                // Slopes of exactly -1 are excluded by the method.
                if (s + 1.0).abs() < f64::EPSILON {
                    continue;
                }
                slopes.push(s);
            }
        }
        if slopes.is_empty() {
            return Err(StatsError::ComputationError(
                "PassingBablok: insufficient valid slope pairs".into(),
            ));
        }
        slopes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let m = slopes.len();
        // Offset K: number of slopes below -1. The slope estimate is the
        // K-shifted median: 1-based s_{(m+1)/2 + K} for odd m, and the mean
        // of s_{m/2 + K} and s_{m/2 + 1 + K} for even m (clamped in-range).
        let k = slopes.iter().filter(|&&s| s < -1.0).count();
        let slope = if m % 2 == 1 {
            slopes[((m - 1) / 2 + k).min(m - 1)]
        } else {
            let lo = (m / 2 - 1 + k).min(m - 1);
            let hi = (m / 2 + k).min(m - 1);
            (slopes[lo] + slopes[hi]) * 0.5
        };
        let intercepts: Vec<f64> = x
            .iter()
            .zip(y.iter())
            .map(|(&xi, &yi)| yi - slope * xi)
            .collect();
        let intercept = median_of(&intercepts);
        Ok(PassingBablok { slope, intercept })
    }
    /// Fitted slope.
    pub fn slope(&self) -> f64 {
        self.slope
    }
    /// Fitted intercept.
    pub fn intercept(&self) -> f64 {
        self.intercept
    }
}
#[cfg(test)]
mod tests {
use super::*;
// ---- Location estimators ----
#[test]
fn test_huber_location_clean_data() {
// Symmetric 1..=9: the estimate should land at the center, 5.
let data: Vec<f64> = (1..=9).map(|i| i as f64).collect();
let loc = huber_location(&data, 1.345, 1e-8, 200).expect("converged");
assert!((loc - 5.0).abs() < 0.05, "loc={loc}");
}
#[test]
fn test_huber_location_outlier_resistant() {
// One huge outlier should barely move the robust estimate.
let mut data: Vec<f64> = (1..=9).map(|i| i as f64).collect();
data.push(1000.0); let loc = huber_location(&data, 1.345, 1e-8, 200).expect("converged");
assert!(loc < 20.0, "loc={loc} should be robust to outlier");
}
#[test]
fn test_huber_location_empty() {
assert!(huber_location(&[], 1.345, 1e-6, 100).is_err());
}
#[test]
fn test_huber_location_invalid_k() {
// Non-positive tuning constant must be rejected.
assert!(huber_location(&[1.0, 2.0], -1.0, 1e-6, 100).is_err());
}
#[test]
fn test_biweight_location_symmetric() {
// Symmetric data: biweight agrees with the median/mean.
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let loc = biweight_location(&data, 6.0).expect("ok");
assert!((loc - 3.0).abs() < 0.01, "loc={loc}");
}
#[test]
fn test_biweight_location_outlier() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
let loc = biweight_location(&data, 6.0).expect("ok");
assert!(loc < 10.0, "biweight should resist outlier, got {loc}");
}
#[test]
fn test_biweight_location_empty() {
assert!(biweight_location(&[], 6.0).is_err());
}
#[test]
fn test_trimmed_mean_basic() {
// Cutting 20% per tail of 5 points drops 1.0 and 100.0; mean of 2,3,4 = 3.
let data = vec![1.0, 2.0, 3.0, 4.0, 100.0];
let tm = trimmed_mean(&data, 0.2).expect("ok");
assert!((tm - 3.0).abs() < 1e-10);
}
#[test]
fn test_trimmed_mean_zero_cut() {
// Zero trimming reduces to the plain arithmetic mean.
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let tm = trimmed_mean(&data, 0.0).expect("ok");
let mean = data.iter().sum::<f64>() / 5.0;
assert!((tm - mean).abs() < 1e-10);
}
#[test]
fn test_trimmed_mean_invalid_cut() {
// Proportion outside [0, 0.5) is rejected on both sides.
let data = vec![1.0, 2.0, 3.0];
assert!(trimmed_mean(&data, 0.6).is_err());
assert!(trimmed_mean(&data, -0.1).is_err());
}
#[test]
fn test_winsorized_mean_upper_tail() {
// 100.0 is clamped down to 4.0: (1+2+3+4+4)/5 = 2.8.
let data = vec![1.0, 2.0, 3.0, 4.0, 100.0];
let wm = winsorized_mean(&data, (0.0, 0.2)).expect("ok");
assert!((wm - 2.8).abs() < 1e-10, "wm={wm}");
}
#[test]
fn test_winsorized_mean_both_tails() {
// One value clamped per tail: (1+1+2+3+4+4)/6 = 2.5.
let data = vec![-100.0, 1.0, 2.0, 3.0, 4.0, 100.0];
let wm = winsorized_mean(&data, (1.0 / 6.0, 1.0 / 6.0)).expect("ok");
assert!((wm - 2.5).abs() < 1e-6, "wm={wm}");
}
// ---- Scale estimators ----
#[test]
fn test_mad_normal_scale() {
let data = vec![
-2.0, -1.0, -0.5, -0.2, 0.0, 0.1, 0.3, 0.7, 1.0, 1.5,
];
let m = mad(&data).expect("ok");
assert!(m > 0.0, "MAD should be positive");
}
#[test]
fn test_mad_constant_data() {
// Identical observations have zero spread.
let data = vec![3.0, 3.0, 3.0, 3.0];
let m = mad(&data).expect("ok");
assert_eq!(m, 0.0);
}
#[test]
fn test_mad_empty() {
assert!(mad(&[]).is_err());
}
#[test]
fn test_qn_scale_positive() {
let data: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let qn = qn_scale(&data).expect("ok");
assert!(qn > 0.0, "qn={qn}");
}
#[test]
fn test_qn_scale_constant() {
// All pairwise differences are zero for constant data.
let data = vec![5.0; 10];
let qn = qn_scale(&data).expect("ok");
assert_eq!(qn, 0.0);
}
#[test]
fn test_qn_scale_single() {
// Pairwise estimators need at least two observations.
assert!(qn_scale(&[1.0]).is_err());
}
#[test]
fn test_sn_scale_positive() {
let data: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let sn = sn_scale(&data).expect("ok");
assert!(sn > 0.0, "sn={sn}");
}
#[test]
fn test_sn_scale_constant() {
let data = vec![7.0; 8];
let sn = sn_scale(&data).expect("ok");
assert_eq!(sn, 0.0);
}
#[test]
fn test_sn_scale_single() {
assert!(sn_scale(&[1.0]).is_err());
}
// ---- Robust regression ----
#[test]
fn test_theilsen_perfect_linear() {
// Exact line y = 3x + 1 must be recovered exactly.
let x: Vec<f64> = (0..=5).map(|i| i as f64).collect();
let y: Vec<f64> = x.iter().map(|&xi| 3.0 * xi + 1.0).collect();
let model = TheilSen::fit(&x, &y).expect("ok");
assert!((model.slope() - 3.0).abs() < 1e-10, "slope={}", model.slope());
assert!((model.intercept() - 1.0).abs() < 1e-10, "intercept={}", model.intercept());
}
#[test]
fn test_theilsen_outlier_resistant() {
// Corrupting one response should not drag the median-based slope far.
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let mut y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; y[4] = 100.0; let model = TheilSen::fit(&x, &y).expect("ok");
assert!((model.slope() - 2.0).abs() < 1.0, "slope={}", model.slope());
}
#[test]
fn test_theilsen_predict() {
// Fit y = 2x + 1 and predict at new points.
let x = vec![0.0, 1.0, 2.0];
let y = vec![1.0, 3.0, 5.0];
let model = TheilSen::fit(&x, &y).expect("ok");
let preds = model.predict(&[3.0, 4.0]);
assert!((preds[0] - 7.0).abs() < 1e-8, "pred0={}", preds[0]);
assert!((preds[1] - 9.0).abs() < 1e-8, "pred1={}", preds[1]);
}
#[test]
fn test_theilsen_length_mismatch() {
assert!(TheilSen::fit(&[1.0, 2.0], &[1.0]).is_err());
}
#[test]
fn test_theilsen_insufficient_data() {
assert!(TheilSen::fit(&[1.0], &[2.0]).is_err());
}
#[test]
fn test_passingbablok_identical_methods() {
// Two identical methods: slope near 1, intercept near 0.
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
let y = x.clone();
let model = PassingBablok::fit(&x, &y).expect("ok");
assert!((model.slope() - 1.0).abs() < 0.1, "slope={}", model.slope());
assert!(model.intercept().abs() < 0.5, "intercept={}", model.intercept());
}
#[test]
fn test_passingbablok_scaled_methods() {
// Method B reads exactly twice method A: slope near 2.
let x: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let y: Vec<f64> = x.iter().map(|&xi| 2.0 * xi).collect();
let model = PassingBablok::fit(&x, &y).expect("ok");
assert!((model.slope() - 2.0).abs() < 0.1, "slope={}", model.slope());
}
#[test]
fn test_passingbablok_insufficient_data() {
assert!(PassingBablok::fit(&[1.0, 2.0], &[1.0, 2.0]).is_err());
}
#[test]
fn test_passingbablok_length_mismatch() {
assert!(PassingBablok::fit(&[1.0, 2.0, 3.0], &[1.0, 2.0]).is_err());
}
}