use crate::error::{AnomalyError, AnomalyResult};
use crate::handle::LcgRng;
#[derive(Debug, Clone)]
pub struct FastMcdConfig {
pub support_frac: f32,
pub n_starts: usize,
pub max_iter: usize,
pub seed: u64,
pub ridge: f32,
}
impl Default for FastMcdConfig {
fn default() -> Self {
Self {
support_frac: 0.5,
n_starts: 10,
max_iter: 50,
seed: 42,
ridge: 1e-5,
}
}
}
#[derive(Debug, Clone)]
pub struct FastMcdFit {
pub location: Vec<f32>,
pub cov_inv: Vec<f32>,
pub support: Vec<usize>,
pub n_features: usize,
pub n_samples_fitted: usize,
}
pub fn fast_mcd_fit(
data: &[f32],
n_samples: usize,
n_features: usize,
cfg: &FastMcdConfig,
) -> AnomalyResult<FastMcdFit> {
if n_features == 0 {
return Err(AnomalyError::InvalidFeatureCount { n: 0 });
}
if n_samples == 0 {
return Err(AnomalyError::EmptyInput);
}
if data.len() != n_samples * n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * n_features,
got: data.len(),
});
}
let h = (cfg.support_frac * n_samples as f32).floor() as usize + 1;
let h = h.min(n_samples);
if h <= n_features {
return Err(AnomalyError::InsufficientSamples {
need: n_features + 2,
got: h,
});
}
let mut rng = LcgRng::new(cfg.seed);
let mut best_log_det = f32::INFINITY;
let mut best_support: Vec<usize> = (0..h).collect();
let mut best_location = vec![0.0_f32; n_features];
let mut best_cov_inv = vec![0.0_f32; n_features * n_features];
let mut found = false;
for _ in 0..cfg.n_starts {
let mut idx: Vec<usize> = (0..n_samples).collect();
for i in 0..h {
let j = i + rng.next_usize(n_samples - i);
idx.swap(i, j);
}
let mut support: Vec<usize> = idx[..h].to_vec();
let mut prev_log_det = f32::INFINITY;
for _ in 0..cfg.max_iter {
let (loc, cov) = compute_location_cov(data, &support, n_features, cfg.ridge);
let log_det = cholesky_log_det(&cov, n_features).unwrap_or(f32::INFINITY);
if (log_det - prev_log_det).abs() < 1e-10 * (1.0 + prev_log_det.abs()) {
if log_det < best_log_det
&& let Some(cov_inv) = gauss_jordan_inv(&cov, n_features)
{
best_log_det = log_det;
best_support = support.clone();
best_location = loc;
best_cov_inv = cov_inv;
found = true;
}
break;
}
prev_log_det = log_det;
let cov_inv_opt = gauss_jordan_inv(&cov, n_features);
let cov_inv = match cov_inv_opt {
Some(ci) => ci,
None => break, };
let mut mahal2: Vec<(usize, f32)> = (0..n_samples)
.map(|i| {
let row = &data[i * n_features..(i + 1) * n_features];
let d2 = mahal_sq(row, &loc, &cov_inv, n_features);
(i, d2)
})
.collect();
mahal2.sort_unstable_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});
support = mahal2[..h].iter().map(|(i, _)| *i).collect();
}
}
if !found {
return Err(AnomalyError::SingularCovariance);
}
Ok(FastMcdFit {
location: best_location,
cov_inv: best_cov_inv,
support: best_support,
n_features,
n_samples_fitted: n_samples,
})
}
pub fn fast_mcd_score(fit: &FastMcdFit, x: &[f32]) -> AnomalyResult<f32> {
if x.len() != fit.n_features {
return Err(AnomalyError::FeatureCountMismatch {
expected: fit.n_features,
got: x.len(),
});
}
let d2 = mahal_sq(x, &fit.location, &fit.cov_inv, fit.n_features);
Ok(d2.max(0.0))
}
pub fn fast_mcd_score_batch(
fit: &FastMcdFit,
x: &[f32],
n_samples: usize,
) -> AnomalyResult<Vec<f32>> {
if x.len() != n_samples * fit.n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * fit.n_features,
got: x.len(),
});
}
let mut out = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let row = &x[i * fit.n_features..(i + 1) * fit.n_features];
out.push(fast_mcd_score(fit, row)?);
}
Ok(out)
}
pub fn fast_mcd_predict(
fit: &FastMcdFit,
x: &[f32],
n_samples: usize,
threshold: f32,
) -> AnomalyResult<Vec<bool>> {
let scores = fast_mcd_score_batch(fit, x, n_samples)?;
Ok(scores.iter().map(|&s| s > threshold).collect())
}
fn compute_location_cov(
data: &[f32],
support: &[usize],
d: usize,
ridge: f32,
) -> (Vec<f32>, Vec<f32>) {
let h = support.len();
let mut loc = vec![0.0_f32; d];
for &i in support {
let row = &data[i * d..(i + 1) * d];
for (j, &v) in row.iter().enumerate() {
loc[j] += v;
}
}
let inv_h = 1.0 / h as f32;
for v in &mut loc {
*v *= inv_h;
}
let mut cov = vec![0.0_f32; d * d];
for &i in support {
let row = &data[i * d..(i + 1) * d];
for r in 0..d {
let dr = row[r] - loc[r];
for c in 0..d {
let dc = row[c] - loc[c];
cov[r * d + c] += dr * dc;
}
}
}
let denom = if h > 1 { (h - 1) as f32 } else { 1.0 };
for v in &mut cov {
*v /= denom;
}
for i in 0..d {
cov[i * d + i] += ridge;
}
(loc, cov)
}
fn mahal_sq(x: &[f32], mu: &[f32], s_inv: &[f32], d: usize) -> f32 {
let mut diff = vec![0.0_f32; d];
for i in 0..d {
diff[i] = x[i] - mu[i];
}
let mut temp = vec![0.0_f32; d];
for i in 0..d {
let row = &s_inv[i * d..(i + 1) * d];
temp[i] = row.iter().zip(diff.iter()).map(|(a, b)| a * b).sum();
}
diff.iter()
.zip(temp.iter())
.map(|(a, b)| a * b)
.sum::<f32>()
}
fn cholesky_log_det(m: &[f32], d: usize) -> Option<f32> {
let mut l = vec![0.0_f32; d * d];
for i in 0..d {
for j in 0..=i {
let mut s = m[i * d + j];
for k in 0..j {
s -= l[i * d + k] * l[j * d + k];
}
if i == j {
if s <= 0.0 {
return None; }
l[i * d + j] = s.sqrt();
} else {
l[i * d + j] = s / l[j * d + j];
}
}
}
let log_det: f32 = (0..d).map(|i| l[i * d + i].ln()).sum::<f32>() * 2.0;
Some(log_det)
}
fn gauss_jordan_inv(m: &[f32], d: usize) -> Option<Vec<f32>> {
let w = 2 * d;
let mut aug = vec![0.0_f32; d * w];
for i in 0..d {
for j in 0..d {
aug[i * w + j] = m[i * d + j];
}
aug[i * w + d + i] = 1.0;
}
for col in 0..d {
let mut max_row = col;
let mut max_val = aug[col * w + col].abs();
for row in (col + 1)..d {
let v = aug[row * w + col].abs();
if v > max_val {
max_val = v;
max_row = row;
}
}
if max_row != col {
for k in 0..w {
aug.swap(col * w + k, max_row * w + k);
}
}
let pivot = aug[col * w + col];
if pivot.abs() < 1e-10 {
return None;
}
let inv_p = 1.0 / pivot;
for k in 0..w {
aug[col * w + k] *= inv_p;
}
for row in 0..d {
if row == col {
continue;
}
let factor = aug[row * w + col];
if factor.abs() < 1e-30 {
continue;
}
for k in 0..w {
let sub = factor * aug[col * w + k];
aug[row * w + k] -= sub;
}
}
}
let mut inv = vec![0.0_f32; d * d];
for i in 0..d {
for j in 0..d {
inv[i * d + j] = aug[i * w + d + j];
}
}
Some(inv)
}
#[cfg(test)]
mod tests {
use super::*;
fn lcg_normal(state: &mut u64) -> f32 {
*state = state
.wrapping_mul(6_364_136_223_846_793_005)
.wrapping_add(1_442_695_040_888_963_407);
let u1 = ((*state >> 33) as f32 / u32::MAX as f32).max(1e-12);
*state = state
.wrapping_mul(6_364_136_223_846_793_005)
.wrapping_add(1_442_695_040_888_963_407);
let u2 = (*state >> 33) as f32 / u32::MAX as f32;
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f32::consts::PI * u2).cos()
}
fn gaussian_2d(n: usize, cx: f32, cy: f32, sigma: f32, seed: u64) -> Vec<f32> {
let mut state = seed;
(0..n)
.flat_map(|_| {
let x = lcg_normal(&mut state) * sigma + cx;
let y = lcg_normal(&mut state) * sigma + cy;
[x, y]
})
.collect()
}
#[test]
fn test_fit_and_score_basic() {
let data = gaussian_2d(30, 0.0, 0.0, 1.0, 1);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 30, 2, &cfg).expect("fast MCD fit should succeed");
let s = fast_mcd_score(&fit, &[0.0_f32, 0.0]).expect("MCD score at origin should succeed");
assert!(s.is_finite(), "score={s}");
}
#[test]
fn test_support_size_correct() {
let data = gaussian_2d(40, 0.0, 0.0, 1.0, 2);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
let expected_h = (0.5 * 40.0_f32).floor() as usize + 1; assert_eq!(fit.support.len(), expected_h, "support size mismatch");
}
#[test]
fn test_inlier_vs_outlier_scores() {
let mut data = gaussian_2d(40, 0.0, 0.0, 0.5, 3);
for _ in 0..5 {
data.extend_from_slice(&[50.0_f32, 50.0]);
}
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 45, 2, &cfg).expect("FastMCD fit should succeed");
let inlier_scores: Vec<f32> = (0..40)
.map(|i| {
fast_mcd_score(&fit, &data[i * 2..(i + 1) * 2])
.expect("FastMCD score in iterator should succeed")
})
.collect();
let outlier_scores: Vec<f32> = (40..45)
.map(|i| {
fast_mcd_score(&fit, &data[i * 2..(i + 1) * 2])
.expect("FastMCD score in iterator should succeed")
})
.collect();
let max_inlier = inlier_scores
.iter()
.cloned()
.fold(f32::NEG_INFINITY, f32::max);
let min_outlier = outlier_scores.iter().cloned().fold(f32::INFINITY, f32::min);
assert!(
min_outlier > max_inlier,
"min outlier score {min_outlier} must exceed max inlier score {max_inlier}"
);
}
#[test]
fn test_robust_location_vs_contaminated_mean() {
let mut data = gaussian_2d(30, 0.0, 0.0, 0.5, 5);
for _ in 0..10 {
data.extend_from_slice(&[100.0_f32, 100.0]);
}
let n = 40;
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, n, 2, &cfg).expect("FastMCD fit should succeed");
let emp_mean_x: f32 = (0..n).map(|i| data[i * 2]).sum::<f32>() / n as f32;
let emp_mean_y: f32 = (0..n).map(|i| data[i * 2 + 1]).sum::<f32>() / n as f32;
let mcd_dist = (fit.location[0].powi(2) + fit.location[1].powi(2)).sqrt();
let emp_dist = (emp_mean_x.powi(2) + emp_mean_y.powi(2)).sqrt();
assert!(
mcd_dist < emp_dist,
"MCD location dist {mcd_dist:.2} should be closer to origin than empirical {emp_dist:.2}"
);
}
#[test]
fn test_location_close_to_true_mean_clean_data() {
let data = gaussian_2d(60, 5.0, 5.0, 1.0, 7);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 60, 2, &cfg).expect("FastMCD fit should succeed");
let dist = ((fit.location[0] - 5.0).powi(2) + (fit.location[1] - 5.0).powi(2)).sqrt();
assert!(
dist < 2.0,
"location too far from true mean: dist={dist:.3}"
);
}
#[test]
fn test_score_near_zero_for_center() {
let data = gaussian_2d(40, 0.0, 0.0, 1.0, 8);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
let s = fast_mcd_score(&fit, &fit.location.clone())
.expect("FastMCD score at location should succeed");
assert!(s < 1e-3, "score at location estimate should be ~0, got {s}");
}
#[test]
fn test_score_grows_with_distance() {
let data = gaussian_2d(40, 0.0, 0.0, 0.3, 9);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
let s0 = fast_mcd_score(&fit, &fit.location.clone())
.expect("FastMCD score at location should succeed");
let s1 = fast_mcd_score(&fit, &[fit.location[0] + 2.0, fit.location[1]])
.expect("FastMCD score should succeed");
let s2 = fast_mcd_score(&fit, &[fit.location[0] + 10.0, fit.location[1]])
.expect("FastMCD score should succeed");
assert!(
s0 < s1,
"score at center({s0:.3}) should be < score at +2 ({s1:.3})"
);
assert!(
s1 < s2,
"score at +2 ({s1:.3}) should be < score at +10 ({s2:.3})"
);
}
#[test]
fn test_predict_flags_outliers() {
let mut data = gaussian_2d(30, 0.0, 0.0, 0.5, 10);
let n_outliers = 5;
for _ in 0..n_outliers {
data.extend_from_slice(&[30.0_f32, 30.0]);
}
let n = 35;
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, n, 2, &cfg).expect("FastMCD fit should succeed");
let inlier_max = (0..30)
.map(|i| {
fast_mcd_score(&fit, &data[i * 2..(i + 1) * 2])
.expect("FastMCD score in iterator should succeed")
})
.fold(f32::NEG_INFINITY, f32::max);
let threshold = inlier_max * 10.0;
let predictions =
fast_mcd_predict(&fit, &data, n, threshold).expect("FastMCD predict should succeed");
let flagged_outliers = predictions[30..].iter().filter(|&&b| b).count();
assert!(
flagged_outliers >= 4,
"should flag at least 4/5 outliers, got {flagged_outliers}"
);
}
#[test]
fn test_reproducibility_with_seed() {
let data = gaussian_2d(40, 0.0, 0.0, 1.0, 11);
let cfg = FastMcdConfig {
seed: 123,
..Default::default()
};
let fit1 = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
let fit2 = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
assert_eq!(
fit1.location, fit2.location,
"same seed must give same location"
);
}
#[test]
fn test_dimension_mismatch_score() {
let data = gaussian_2d(20, 0.0, 0.0, 1.0, 12);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 20, 2, &cfg).expect("FastMCD fit should succeed");
let result = fast_mcd_score(&fit, &[0.0_f32, 0.0, 0.0]); assert!(matches!(
result,
Err(AnomalyError::FeatureCountMismatch { .. })
));
}
#[test]
fn test_insufficient_samples() {
let data = vec![0.0_f32; 3 * 5];
let cfg = FastMcdConfig::default();
let result = fast_mcd_fit(&data, 3, 5, &cfg);
assert!(matches!(
result,
Err(AnomalyError::InsufficientSamples { .. })
));
}
#[test]
fn test_high_dimensional() {
let mut state = 77u64;
let data: Vec<f32> = (0..40 * 5).map(|_| lcg_normal(&mut state)).collect();
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 40, 5, &cfg).expect("FastMCD fit should succeed");
assert_eq!(fit.n_features, 5);
assert!(fit.location.len() == 5);
}
#[test]
fn test_single_feature() {
let data: Vec<f32> = (0..20).map(|i| i as f32).collect();
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 20, 1, &cfg).expect("FastMCD fit should succeed");
let s = fast_mcd_score(&fit, &fit.location.clone())
.expect("FastMCD score at location should succeed");
assert!(s < 1e-2, "1D score at location={s:.4}");
}
#[test]
fn test_support_has_smaller_mahal_avg() {
let data = gaussian_2d(40, 0.0, 0.0, 1.0, 14);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 40, 2, &cfg).expect("FastMCD fit should succeed");
let support_set: std::collections::HashSet<usize> = fit.support.iter().cloned().collect();
let support_avg: f32 = fit
.support
.iter()
.map(|&i| {
fast_mcd_score(&fit, &data[i * 2..(i + 1) * 2])
.expect("FastMCD score in iterator should succeed")
})
.sum::<f32>()
/ fit.support.len() as f32;
let nonsupport: Vec<usize> = (0..40).filter(|i| !support_set.contains(i)).collect();
if nonsupport.is_empty() {
return; }
let nonsupport_avg: f32 = nonsupport
.iter()
.map(|&i| {
fast_mcd_score(&fit, &data[i * 2..(i + 1) * 2])
.expect("FastMCD score in iterator should succeed")
})
.sum::<f32>()
/ nonsupport.len() as f32;
assert!(
support_avg <= nonsupport_avg,
"support avg Mahal ({support_avg:.3}) should be ≤ non-support avg ({nonsupport_avg:.3})"
);
}
#[test]
fn test_n_starts_one() {
let data = gaussian_2d(30, 0.0, 0.0, 1.0, 15);
let cfg = FastMcdConfig {
n_starts: 1,
..Default::default()
};
let fit = fast_mcd_fit(&data, 30, 2, &cfg).expect("FastMCD fit should succeed");
assert_eq!(fit.n_features, 2);
}
#[test]
fn test_score_batch_consistent() {
let data = gaussian_2d(30, 0.0, 0.0, 1.0, 16);
let cfg = FastMcdConfig::default();
let fit = fast_mcd_fit(&data, 30, 2, &cfg).expect("FastMCD fit should succeed");
let queries = vec![0.0_f32, 0.0, 1.0, 1.0, 5.0, 5.0];
let batch =
fast_mcd_score_batch(&fit, &queries, 3).expect("FastMCD batch score should succeed");
for i in 0..3 {
let single = fast_mcd_score(&fit, &queries[i * 2..(i + 1) * 2])
.expect("FastMCD single score should match batch score");
assert!(
(batch[i] - single).abs() < 1e-6,
"batch[{i}]={} single={}",
batch[i],
single
);
}
}
}