use crate::error::{AnomalyError, AnomalyResult};
use crate::handle::LcgRng;
#[derive(Debug, Clone)]
pub struct GmmConfig {
pub n_components: usize,
pub max_iter: usize,
pub tol: f64,
pub reg_covar: f64,
pub contamination: f64,
pub init_seed: u64,
}
impl Default for GmmConfig {
fn default() -> Self {
Self {
n_components: 2,
max_iter: 100,
tol: 1e-4,
reg_covar: 1e-6,
contamination: 0.1,
init_seed: 42,
}
}
}
pub struct GmmModel {
pub weights: Vec<f64>,
pub means: Vec<Vec<f64>>,
pub covars: Vec<Vec<f64>>,
pub chol_l: Vec<Vec<f64>>,
pub log_det: Vec<f64>,
pub threshold: f64,
pub n_features: usize,
pub n_components: usize,
pub n_iter: usize,
pub converged: bool,
pub log_likelihood: f64,
}
fn cholesky(a: &[f64], d: usize) -> AnomalyResult<Vec<f64>> {
let mut l = vec![0.0_f64; d * d];
for i in 0..d {
for j in 0..=i {
let mut s: f64 = a[i * d + j];
for k in 0..j {
s -= l[i * d + k] * l[j * d + k];
}
if i == j {
if s <= 0.0 {
return Err(AnomalyError::SingularCovariance);
}
l[i * d + j] = s.sqrt();
} else {
l[i * d + j] = s / l[j * d + j];
}
}
}
Ok(l)
}
#[inline]
fn log_det_from_chol(l: &[f64], d: usize) -> f64 {
let mut ld = 0.0_f64;
for i in 0..d {
ld += l[i * d + i].ln();
}
2.0 * ld
}
fn solve_chol(l: &[f64], b: &[f64], d: usize) -> f64 {
let mut y = vec![0.0_f64; d];
for i in 0..d {
let mut s = b[i];
for k in 0..i {
s -= l[i * d + k] * y[k];
}
y[i] = s / l[i * d + i];
}
y.iter().map(|v| v * v).sum()
}
#[inline]
fn log_gaussian(x: &[f64], mu: &[f64], chol: &[f64], log_det: f64, d: usize) -> f64 {
let diff: Vec<f64> = x.iter().zip(mu.iter()).map(|(xi, mi)| xi - mi).collect();
let maha_sq = solve_chol(chol, &diff, d);
-0.5 * (d as f64 * std::f64::consts::TAU.ln() + log_det + maha_sq)
}
fn log_sum_exp(a: &[f64]) -> f64 {
let max = a.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
if max.is_infinite() {
return max;
}
let sum: f64 = a.iter().map(|&v| (v - max).exp()).sum();
max + sum.ln()
}
fn kmeans_plus_plus(
data: &[f64],
n: usize,
d: usize,
k: usize,
rng: &mut LcgRng,
) -> AnomalyResult<Vec<Vec<f64>>> {
if n < k {
return Err(AnomalyError::InsufficientSamples { need: k, got: n });
}
let mut centres: Vec<Vec<f64>> = Vec::with_capacity(k);
let first = rng.next_usize(n);
centres.push(data[first * d..(first + 1) * d].to_vec());
let mut dists = vec![f64::INFINITY; n];
for _ in 1..k {
let last_centre = centres.last().expect("centres non-empty");
for i in 0..n {
let row = &data[i * d..(i + 1) * d];
let sq: f64 = row
.iter()
.zip(last_centre.iter())
.map(|(xi, ci)| (xi - ci) * (xi - ci))
.sum();
if sq < dists[i] {
dists[i] = sq;
}
}
let total: f64 = dists.iter().sum();
if total <= 0.0 {
centres.push(data[rng.next_usize(n) * d..rng.next_usize(n) * d + d].to_vec());
continue;
}
let threshold = (rng.next_u32() as f64 / (u32::MAX as f64 + 1.0)) * total;
let mut cumsum = 0.0_f64;
let mut chosen = n - 1;
for (i, &d_i) in dists.iter().enumerate() {
cumsum += d_i;
if cumsum >= threshold {
chosen = i;
break;
}
}
centres.push(data[chosen * d..(chosen + 1) * d].to_vec());
}
Ok(centres)
}
fn e_step(
data: &[f64],
n: usize,
d: usize,
weights: &[f64],
means: &[Vec<f64>],
chol_l: &[Vec<f64>],
log_det: &[f64],
n_components: usize,
log_resp: &mut [f64],
) -> f64 {
let mut log_lik = 0.0_f64;
for i in 0..n {
let xi = &data[i * d..(i + 1) * d];
let offset = i * n_components;
for k in 0..n_components {
let log_pi_k = weights[k].ln();
let log_p_k = log_gaussian(xi, &means[k], &chol_l[k], log_det[k], d);
log_resp[offset + k] = log_pi_k + log_p_k;
}
let lse = log_sum_exp(&log_resp[offset..offset + n_components]);
log_lik += lse;
for k in 0..n_components {
log_resp[offset + k] -= lse;
}
}
log_lik
}
fn m_step(
data: &[f64],
n: usize,
d: usize,
log_resp: &[f64],
n_components: usize,
reg_covar: f64,
weights: &mut [f64],
means: &mut [Vec<f64>],
covars: &mut [Vec<f64>],
chol_l: &mut [Vec<f64>],
log_det: &mut [f64],
) -> AnomalyResult<()> {
let mut nk = vec![0.0_f64; n_components];
for i in 0..n {
for k in 0..n_components {
nk[k] += log_resp[i * n_components + k].exp();
}
}
let nk_min = 1e-8;
for nk_val in &mut nk {
if *nk_val < nk_min {
*nk_val = nk_min;
}
}
let n_f = n as f64;
for k in 0..n_components {
weights[k] = nk[k] / n_f;
}
for k in 0..n_components {
let mu = &mut means[k];
for v in mu.iter_mut() {
*v = 0.0;
}
for i in 0..n {
let r_ik = log_resp[i * n_components + k].exp();
let xi = &data[i * d..(i + 1) * d];
for j in 0..d {
mu[j] += r_ik * xi[j];
}
}
let inv_nk = nk[k].recip();
for v in mu.iter_mut() {
*v *= inv_nk;
}
}
for k in 0..n_components {
let cov = &mut covars[k];
for v in cov.iter_mut() {
*v = 0.0;
}
let mu_k = &means[k];
for i in 0..n {
let r_ik = log_resp[i * n_components + k].exp();
let xi = &data[i * d..(i + 1) * d];
for r in 0..d {
let diff_r = xi[r] - mu_k[r];
for c in 0..d {
let diff_c = xi[c] - mu_k[c];
cov[r * d + c] += r_ik * diff_r * diff_c;
}
}
}
let inv_nk = nk[k].recip();
for v in cov.iter_mut() {
*v *= inv_nk;
}
for j in 0..d {
cov[j * d + j] += reg_covar;
}
chol_l[k] = cholesky(cov, d)?;
log_det[k] = log_det_from_chol(&chol_l[k], d);
}
Ok(())
}
pub fn gmm_fit(
x: &[f64],
n_samples: usize,
n_features: usize,
cfg: &GmmConfig,
rng: &mut LcgRng,
) -> AnomalyResult<GmmModel> {
if n_samples == 0 {
return Err(AnomalyError::EmptyInput);
}
if n_features == 0 {
return Err(AnomalyError::InvalidFeatureCount { n: 0 });
}
if x.len() != n_samples * n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * n_features,
got: x.len(),
});
}
if cfg.n_components == 0 {
return Err(AnomalyError::Internal {
msg: "n_components must be > 0".into(),
});
}
if cfg.n_components > n_samples {
return Err(AnomalyError::InsufficientSamples {
need: cfg.n_components,
got: n_samples,
});
}
if !(cfg.contamination > 0.0 && cfg.contamination < 0.5) {
return Err(AnomalyError::Internal {
msg: format!(
"contamination must be in (0, 0.5), got {}",
cfg.contamination
),
});
}
let n = n_samples;
let d = n_features;
let k = cfg.n_components;
let mut init_rng = LcgRng::new(cfg.init_seed);
let init_means = kmeans_plus_plus(x, n, d, k, &mut init_rng)?;
let mut weights: Vec<f64> = vec![1.0 / k as f64; k];
let mut means: Vec<Vec<f64>> = init_means;
let mut covars: Vec<Vec<f64>> = (0..k)
.map(|_| {
let mut cov = vec![0.0_f64; d * d];
for j in 0..d {
cov[j * d + j] = 1.0 + cfg.reg_covar;
}
cov
})
.collect();
let mut chol_l: Vec<Vec<f64>> = covars
.iter()
.map(|c| cholesky(c, d))
.collect::<AnomalyResult<Vec<_>>>()?;
let mut log_det: Vec<f64> = chol_l.iter().map(|l| log_det_from_chol(l, d)).collect();
let mut log_resp = vec![0.0_f64; n * k];
let mut prev_log_lik = f64::NEG_INFINITY;
let mut final_log_lik = f64::NEG_INFINITY;
let mut converged = false;
let mut n_iter = 0usize;
for iter in 0..cfg.max_iter {
n_iter = iter + 1;
let log_lik = e_step(
x,
n,
d,
&weights,
&means,
&chol_l,
&log_det,
k,
&mut log_resp,
);
final_log_lik = log_lik;
if iter > 0 && (log_lik - prev_log_lik).abs() < cfg.tol {
converged = true;
break;
}
prev_log_lik = log_lik;
m_step(
x,
n,
d,
&log_resp,
k,
cfg.reg_covar,
&mut weights,
&mut means,
&mut covars,
&mut chol_l,
&mut log_det,
)?;
}
let train_scores: Vec<f64> =
compute_neg_log_lik(x, n, d, &weights, &means, &chol_l, &log_det, k);
let mut sorted = train_scores.clone();
sorted.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let rank = ((1.0 - cfg.contamination) * n as f64).floor() as usize;
let rank = rank.min(n - 1);
let threshold = sorted[rank];
let _ = rng.next_u32();
Ok(GmmModel {
weights,
means,
covars,
chol_l,
log_det,
threshold,
n_features: d,
n_components: k,
n_iter,
converged,
log_likelihood: final_log_lik,
})
}
fn compute_neg_log_lik(
data: &[f64],
n: usize,
d: usize,
weights: &[f64],
means: &[Vec<f64>],
chol_l: &[Vec<f64>],
log_det: &[f64],
k: usize,
) -> Vec<f64> {
(0..n)
.map(|i| {
let xi = &data[i * d..(i + 1) * d];
let log_probs: Vec<f64> = (0..k)
.map(|kk| {
weights[kk].ln() + log_gaussian(xi, &means[kk], &chol_l[kk], log_det[kk], d)
})
.collect();
-log_sum_exp(&log_probs)
})
.collect()
}
pub fn gmm_log_likelihood(
model: &GmmModel,
x: &[f64],
n_samples: usize,
) -> AnomalyResult<Vec<f64>> {
if model.weights.is_empty() {
return Err(AnomalyError::NotFitted);
}
if n_samples == 0 {
return Err(AnomalyError::EmptyInput);
}
if x.len() != n_samples * model.n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * model.n_features,
got: x.len(),
});
}
let lls: Vec<f64> = (0..n_samples)
.map(|i| {
let xi = &x[i * model.n_features..(i + 1) * model.n_features];
let log_probs: Vec<f64> = (0..model.n_components)
.map(|k| {
model.weights[k].ln()
+ log_gaussian(
xi,
&model.means[k],
&model.chol_l[k],
model.log_det[k],
model.n_features,
)
})
.collect();
log_sum_exp(&log_probs)
})
.collect();
Ok(lls)
}
pub fn gmm_score(model: &GmmModel, x: &[f64], n_samples: usize) -> AnomalyResult<Vec<f64>> {
let lls = gmm_log_likelihood(model, x, n_samples)?;
Ok(lls.into_iter().map(|v| -v).collect())
}
pub fn gmm_predict(model: &GmmModel, x: &[f64], n_samples: usize) -> AnomalyResult<Vec<i32>> {
let scores = gmm_score(model, x, n_samples)?;
Ok(scores
.into_iter()
.map(|s| if s >= model.threshold { -1 } else { 1 })
.collect())
}
pub fn gmm_sample(model: &GmmModel, n: usize, rng: &mut LcgRng) -> AnomalyResult<Vec<f64>> {
if model.weights.is_empty() {
return Err(AnomalyError::NotFitted);
}
if n == 0 {
return Ok(Vec::new());
}
let d = model.n_features;
let k = model.n_components;
let mut out = Vec::with_capacity(n * d);
for _ in 0..n {
let u = (rng.next_u32() as f64) / (u32::MAX as f64 + 1.0);
let mut cumsum = 0.0_f64;
let mut comp = k - 1;
for kk in 0..k {
cumsum += model.weights[kk];
if u < cumsum {
comp = kk;
break;
}
}
let mu = &model.means[comp];
let l = &model.chol_l[comp];
let mut z = vec![0.0_f64; d];
let mut idx = 0;
while idx + 1 < d {
let (a, b) = rng.next_normal_pair();
z[idx] = a as f64;
z[idx + 1] = b as f64;
idx += 2;
}
if idx < d {
z[idx] = rng.next_normal() as f64;
}
for row in 0..d {
let mut val = mu[row];
for col in 0..=row {
val += l[row * d + col] * z[col];
}
out.push(val);
}
}
Ok(out)
}
#[cfg(test)]
mod tests {
use super::*;
fn gaussian_cloud(n: usize, mu: [f64; 2], sigma: f64, seed: u64) -> Vec<f64> {
let mut rng = LcgRng::new(seed);
let mut data = Vec::with_capacity(n * 2);
for _ in 0..n {
data.push(mu[0] + rng.next_normal() as f64 * sigma);
data.push(mu[1] + rng.next_normal() as f64 * sigma);
}
data
}
#[test]
fn gmm_fit_runs() {
let data = gaussian_cloud(50, [0.0, 0.0], 1.0, 1);
let cfg = GmmConfig {
n_components: 2,
..GmmConfig::default()
};
let mut rng = LcgRng::new(1);
let model = gmm_fit(&data, 50, 2, &cfg, &mut rng);
assert!(model.is_ok(), "gmm_fit failed: {:?}", model.err());
}
#[test]
fn gmm_weights_sum_to_1() {
let data = gaussian_cloud(80, [0.0, 0.0], 1.0, 2);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(2);
let model =
gmm_fit(&data, 80, 2, &cfg, &mut rng).expect("GMM fit on 80 samples should succeed");
let sum: f64 = model.weights.iter().sum();
assert!(
(sum - 1.0).abs() < 1e-10,
"weights sum = {sum}, expected 1.0"
);
}
#[test]
fn gmm_log_lik_shape() {
let data = gaussian_cloud(60, [0.0, 0.0], 1.0, 3);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(3);
let model =
gmm_fit(&data, 60, 2, &cfg, &mut rng).expect("GMM fit on 60 samples should succeed");
let lls = gmm_log_likelihood(&model, &data, 60)
.expect("log-likelihood computation should succeed");
assert_eq!(lls.len(), 60);
}
#[test]
fn gmm_log_lik_finite() {
let data = gaussian_cloud(60, [0.0, 0.0], 1.0, 4);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(4);
let model =
gmm_fit(&data, 60, 2, &cfg, &mut rng).expect("GMM fit on 60 samples should succeed");
let lls = gmm_log_likelihood(&model, &data, 60)
.expect("log-likelihood computation should succeed");
for (i, &v) in lls.iter().enumerate() {
assert!(v.is_finite(), "log_lik[{i}] = {v} is not finite");
}
}
#[test]
fn gmm_score_higher_for_outlier() {
let mut data = gaussian_cloud(60, [0.0, 0.0], 0.5, 5);
data.push(100.0);
data.push(100.0);
let n = 61;
let cfg = GmmConfig {
n_components: 1, max_iter: 100,
tol: 1e-4,
reg_covar: 1e-6,
contamination: 0.05,
init_seed: 5,
};
let mut rng = LcgRng::new(5);
let model = gmm_fit(&data, n, 2, &cfg, &mut rng)
.expect("GMM fit with outlier appended should succeed");
let scores =
gmm_score(&model, &data, n).expect("GMM score should succeed on training data");
let outlier_score = scores[n - 1];
let inlier_mean: f64 = scores[..n - 1].iter().sum::<f64>() / (n - 1) as f64;
assert!(
outlier_score > inlier_mean,
"outlier score {outlier_score:.4} should > inlier mean {inlier_mean:.4}"
);
}
#[test]
fn gmm_predict_labels() {
let data = gaussian_cloud(80, [0.0, 0.0], 1.0, 6);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(6);
let model =
gmm_fit(&data, 80, 2, &cfg, &mut rng).expect("GMM fit on 80 samples should succeed");
let labels =
gmm_predict(&model, &data, 80).expect("GMM predict on training data should succeed");
for &l in &labels {
assert!(l == 1 || l == -1, "label={l} not in {{-1, +1}}");
}
}
#[test]
fn gmm_sample_shape() {
let data = gaussian_cloud(60, [0.0, 0.0], 1.0, 7);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(7);
let model =
gmm_fit(&data, 60, 2, &cfg, &mut rng).expect("GMM fit on 60 samples should succeed");
let n_gen = 30usize;
let samples = gmm_sample(&model, n_gen, &mut rng)
.expect("GMM sample from fitted model should succeed");
assert_eq!(
samples.len(),
n_gen * 2,
"expected {} elements, got {}",
n_gen * 2,
samples.len()
);
}
#[test]
fn gmm_2d_cluster_separation() {
let mut data = gaussian_cloud(80, [-10.0, -10.0], 0.3, 8);
data.extend(gaussian_cloud(80, [10.0, 10.0], 0.3, 81));
let n = 160usize;
let cfg = GmmConfig {
n_components: 2,
max_iter: 200,
tol: 1e-6,
reg_covar: 1e-6,
contamination: 0.1,
init_seed: 8,
};
let mut rng = LcgRng::new(8);
let model =
gmm_fit(&data, n, 2, &cfg, &mut rng).expect("GMM fit on 2-cluster data should succeed");
let targets = [[-10.0_f64, -10.0], [10.0, 10.0]];
for mu in &model.means {
let nearest_dist = targets
.iter()
.map(|t| {
let dx = mu[0] - t[0];
let dy = mu[1] - t[1];
(dx * dx + dy * dy).sqrt()
})
.fold(f64::INFINITY, f64::min);
assert!(
nearest_dist < 3.0,
"mean {:?} is not near any cluster center (nearest dist {nearest_dist:.3})",
mu
);
}
}
#[test]
fn gmm_single_component() {
let data = gaussian_cloud(100, [5.0, -3.0], 2.0, 9);
let cfg = GmmConfig {
n_components: 1,
max_iter: 50,
tol: 1e-6,
reg_covar: 1e-6,
contamination: 0.1,
init_seed: 9,
};
let mut rng = LcgRng::new(9);
let model = gmm_fit(&data, 100, 2, &cfg, &mut rng)
.expect("single-component GMM fit should succeed");
assert_eq!(model.weights.len(), 1);
assert!(
(model.weights[0] - 1.0).abs() < 1e-10,
"weight[0] must be 1"
);
let mu = &model.means[0];
assert!((mu[0] - 5.0).abs() < 0.5, "μ[0]={:.3} expected ≈5", mu[0]);
assert!((mu[1] - -3.0).abs() < 0.5, "μ[1]={:.3} expected ≈-3", mu[1]);
}
#[test]
fn gmm_convergence_flag() {
let data = gaussian_cloud(100, [0.0, 0.0], 0.01, 10);
let cfg = GmmConfig {
n_components: 2,
max_iter: 500,
tol: 1e-6,
reg_covar: 1e-6,
contamination: 0.1,
init_seed: 10,
};
let mut rng = LcgRng::new(10);
let model = gmm_fit(&data, 100, 2, &cfg, &mut rng)
.expect("GMM fit on tight cluster should succeed");
assert!(
model.converged || model.n_iter < cfg.max_iter,
"expected convergence or early stop, n_iter={}",
model.n_iter
);
}
#[test]
fn gmm_sample_values_finite() {
let data = gaussian_cloud(60, [0.0, 0.0], 1.0, 11);
let cfg = GmmConfig::default();
let mut rng = LcgRng::new(11);
let model =
gmm_fit(&data, 60, 2, &cfg, &mut rng).expect("GMM fit on 60 samples should succeed");
let samples =
gmm_sample(&model, 50, &mut rng).expect("GMM sample should produce 50 valid points");
for (i, &v) in samples.iter().enumerate() {
assert!(v.is_finite(), "sample[{i}]={v} is not finite");
}
}
#[test]
fn gmm_high_dimensional() {
let d = 5usize;
let n = 100usize;
let mut rng_gen = LcgRng::new(12);
let data: Vec<f64> = (0..n * d).map(|_| rng_gen.next_normal() as f64).collect();
let cfg = GmmConfig {
n_components: 3,
max_iter: 100,
tol: 1e-4,
reg_covar: 1e-4,
contamination: 0.1,
init_seed: 12,
};
let mut rng = LcgRng::new(12);
let model =
gmm_fit(&data, n, d, &cfg, &mut rng).expect("GMM fit on 5-D data should succeed");
let scores = gmm_score(&model, &data, n).expect("GMM score on 5-D data should succeed");
assert_eq!(scores.len(), n);
assert!(
scores.iter().all(|s| s.is_finite()),
"all scores should be finite"
);
}
#[test]
fn cholesky_identity_matrix() {
let id = vec![1.0_f64, 0.0, 0.0, 1.0];
let l = cholesky(&id, 2).expect("Cholesky of identity matrix should succeed");
assert!((l[0] - 1.0).abs() < 1e-12);
assert!(l[1].abs() < 1e-12);
assert!(l[2].abs() < 1e-12);
assert!((l[3] - 1.0).abs() < 1e-12);
assert!((log_det_from_chol(&l, 2)).abs() < 1e-12, "log|I|=0");
}
}