use crate::error::{AnomalyError, AnomalyResult};
use crate::handle::LcgRng;
#[derive(Debug, Clone)]
pub struct CblofConfig {
pub n_clusters: usize,
pub alpha: f32,
pub beta: f32,
pub max_iter: usize,
pub contamination: f32,
pub use_cluster_weight: bool,
pub init_seed: u64,
}
impl Default for CblofConfig {
fn default() -> Self {
Self {
n_clusters: 8,
alpha: 0.9,
beta: 5.0,
max_iter: 100,
contamination: 0.1,
use_cluster_weight: true,
init_seed: 42,
}
}
}
pub struct CblofFit {
pub centroids: Vec<f32>,
pub labels: Vec<usize>,
pub large_cluster_ids: Vec<usize>,
pub cluster_sizes: Vec<usize>,
pub d: usize,
pub contamination: f32,
pub use_cluster_weight: bool,
}
#[inline]
fn sq_dist(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| (x - y) * (x - y)).sum()
}
#[inline]
fn eucl_dist(a: &[f32], b: &[f32]) -> f32 {
sq_dist(a, b).sqrt()
}
fn kmeans_plus_plus_init(data: &[f32], n: usize, d: usize, k: usize, rng: &mut LcgRng) -> Vec<f32> {
let mut centroids: Vec<f32> = Vec::with_capacity(k * d);
let first = rng.next_usize(n);
centroids.extend_from_slice(&data[first * d..(first + 1) * d]);
let mut dists = vec![f32::INFINITY; n];
for c_idx in 1..k {
let last_c = ¢roids[(c_idx - 1) * d..c_idx * d];
for i in 0..n {
let xi = &data[i * d..(i + 1) * d];
let sd = sq_dist(xi, last_c);
if sd < dists[i] {
dists[i] = sd;
}
}
let total: f32 = dists.iter().sum();
if total <= 0.0 {
let idx = rng.next_usize(n);
centroids.extend_from_slice(&data[idx * d..(idx + 1) * d]);
continue;
}
let threshold = rng.next_f32() * total;
let mut cumsum = 0.0_f32;
let mut chosen = n - 1;
for (i, &d_i) in dists.iter().enumerate() {
cumsum += d_i;
if cumsum >= threshold {
chosen = i;
break;
}
}
centroids.extend_from_slice(&data[chosen * d..(chosen + 1) * d]);
}
centroids
}
fn kmeans(
data: &[f32],
n: usize,
d: usize,
k: usize,
max_iter: usize,
rng: &mut LcgRng,
) -> (Vec<f32>, Vec<usize>) {
let mut centroids = kmeans_plus_plus_init(data, n, d, k, rng);
let mut labels = vec![0_usize; n];
for _ in 0..max_iter {
let mut changed = false;
for i in 0..n {
let xi = &data[i * d..(i + 1) * d];
let nearest = (0..k)
.min_by(|&a, &b| {
let da = sq_dist(xi, ¢roids[a * d..(a + 1) * d]);
let db = sq_dist(xi, ¢roids[b * d..(b + 1) * d]);
da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal)
})
.unwrap_or(0);
if labels[i] != nearest {
labels[i] = nearest;
changed = true;
}
}
if !changed {
break;
}
let mut sums = vec![0.0_f32; k * d];
let mut counts = vec![0_usize; k];
for i in 0..n {
let c = labels[i];
counts[c] += 1;
let xi = &data[i * d..(i + 1) * d];
for j in 0..d {
sums[c * d + j] += xi[j];
}
}
for c in 0..k {
if counts[c] > 0 {
let inv = 1.0 / counts[c] as f32;
for j in 0..d {
centroids[c * d + j] = sums[c * d + j] * inv;
}
}
}
}
(centroids, labels)
}
fn find_large_clusters(
cluster_sizes: &[usize],
k: usize,
alpha: f32,
beta: f32,
n: usize,
) -> Vec<usize> {
let mut order: Vec<usize> = (0..k).collect();
order.sort_unstable_by(|&a, &b| cluster_sizes[b].cmp(&cluster_sizes[a]));
let alpha_threshold = (alpha * n as f32).ceil() as usize;
let mut cumulative = 0_usize;
let mut lc_prefix_len = 1;
cumulative += cluster_sizes[order[0]];
for pos in 1..k {
if cumulative >= alpha_threshold {
let size_last_lc = cluster_sizes[order[pos - 1]];
let size_next = cluster_sizes[order[pos]];
if size_next == 0 || (size_last_lc as f32 / size_next as f32) >= beta {
lc_prefix_len = pos;
break;
}
}
cumulative += cluster_sizes[order[pos]];
lc_prefix_len = pos + 1;
}
order[..lc_prefix_len].to_vec()
}
pub fn cblof_fit(data: &[f32], n: usize, d: usize, cfg: CblofConfig) -> AnomalyResult<CblofFit> {
if n == 0 {
return Err(AnomalyError::EmptyInput);
}
if data.len() != n * d {
return Err(AnomalyError::DimensionMismatch {
expected: n * d,
got: data.len(),
});
}
if cfg.n_clusters == 0 {
return Err(AnomalyError::InvalidK { k: 0 });
}
if cfg.n_clusters > n {
return Err(AnomalyError::InvalidK { k: cfg.n_clusters });
}
let k = cfg.n_clusters;
let mut rng = LcgRng::new(cfg.init_seed);
let (centroids, labels) = kmeans(data, n, d, k, cfg.max_iter, &mut rng);
let mut cluster_sizes = vec![0_usize; k];
for &l in &labels {
cluster_sizes[l] += 1;
}
let large_cluster_ids = find_large_clusters(&cluster_sizes, k, cfg.alpha, cfg.beta, n);
if large_cluster_ids.is_empty() {
return Err(AnomalyError::Internal {
msg: "LC/SC split produced no large clusters".into(),
});
}
Ok(CblofFit {
centroids,
labels,
large_cluster_ids,
cluster_sizes,
d,
contamination: cfg.contamination,
use_cluster_weight: cfg.use_cluster_weight,
})
}
pub fn cblof_score(fit: &CblofFit, data: &[f32], n: usize) -> AnomalyResult<Vec<f32>> {
if n == 0 {
return Err(AnomalyError::EmptyInput);
}
let expected_len = n * fit.d;
if data.len() != expected_len {
return Err(AnomalyError::DimensionMismatch {
expected: expected_len,
got: data.len(),
});
}
let k = fit.cluster_sizes.len();
let mut scores = Vec::with_capacity(n);
for i in 0..n {
let xi = &data[i * fit.d..(i + 1) * fit.d];
let (nearest_c, nearest_dist) = (0..k)
.map(|c| {
let mu_c = &fit.centroids[c * fit.d..(c + 1) * fit.d];
(c, eucl_dist(xi, mu_c))
})
.min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or((0, 0.0));
let is_large = fit.large_cluster_ids.contains(&nearest_c);
let score = if is_large {
if fit.use_cluster_weight {
fit.cluster_sizes[nearest_c] as f32 * nearest_dist
} else {
nearest_dist
}
} else {
let min_lc_dist = fit
.large_cluster_ids
.iter()
.map(|&lc| {
let mu_lc = &fit.centroids[lc * fit.d..(lc + 1) * fit.d];
eucl_dist(xi, mu_lc)
})
.fold(f32::INFINITY, f32::min);
if fit.use_cluster_weight {
fit.cluster_sizes[nearest_c] as f32 * min_lc_dist
} else {
min_lc_dist
}
};
scores.push(score);
}
Ok(scores)
}
pub fn cblof_predict(fit: &CblofFit, data: &[f32], n: usize) -> AnomalyResult<Vec<bool>> {
let scores = cblof_score(fit, data, n)?;
let n_outliers = ((fit.contamination * n as f32).ceil() as usize).min(n);
let mut indexed: Vec<(usize, f32)> = scores.iter().enumerate().map(|(i, &s)| (i, s)).collect();
indexed.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let mut result = vec![false; n];
for &(i, _) in indexed.iter().take(n_outliers) {
result[i] = true;
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::handle::LcgRng;
fn default_cfg() -> CblofConfig {
CblofConfig::default()
}
fn cluster_2d(n: usize, cx: f32, cy: f32, sigma: f32, seed: u64) -> Vec<f32> {
let mut rng = LcgRng::new(seed);
(0..n)
.flat_map(|_| {
[
cx + rng.next_normal() * sigma,
cy + rng.next_normal() * sigma,
]
})
.collect()
}
#[test]
fn two_blobs_outlier_top_ranked() {
let mut data = cluster_2d(40, 0.0, 0.0, 0.3, 1);
data.extend(cluster_2d(40, 10.0, 10.0, 0.3, 2));
data.extend_from_slice(&[100.0_f32, 100.0]); let n = 81;
let cfg = CblofConfig {
n_clusters: 3,
alpha: 0.8,
beta: 3.0,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let scores = cblof_score(&fit, &data, n).expect("CBLOF score should succeed");
let max_idx = scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.map(|(i, _)| i)
.expect("scores vec is non-empty");
assert_eq!(
max_idx, 80,
"outlier at index 80 should have highest CBLOF score"
);
}
#[test]
fn outlier_score_exceeds_inlier() {
let mut data = cluster_2d(50, 0.0, 0.0, 0.5, 3);
let outlier = vec![200.0_f32, 200.0];
data.extend_from_slice(&outlier);
let n = 51;
let cfg = CblofConfig {
n_clusters: 4,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let scores = cblof_score(&fit, &data, n).expect("CBLOF score should succeed");
let out_score = scores[50];
let max_in = scores[..50]
.iter()
.cloned()
.fold(f32::NEG_INFINITY, f32::max);
assert!(
out_score > max_in,
"outlier score {out_score} should exceed max inlier score {max_in}"
);
}
#[test]
fn cluster_weight_scales_score() {
let data = cluster_2d(30, 0.0, 0.0, 0.2, 4);
let test_pt = vec![5.0_f32, 5.0];
let fit_w = cblof_fit(
&data,
30,
2,
CblofConfig {
use_cluster_weight: true,
n_clusters: 3,
..default_cfg()
},
)
.expect("CBLOF fit should succeed");
let fit_nw = cblof_fit(
&data,
30,
2,
CblofConfig {
use_cluster_weight: false,
n_clusters: 3,
..default_cfg()
},
)
.expect("CBLOF fit should succeed");
let score_w = cblof_score(&fit_w, &test_pt, 1).expect("CBLOF score should succeed")[0];
let score_nw = cblof_score(&fit_nw, &test_pt, 1).expect("CBLOF score should succeed")[0];
assert!(
score_w >= score_nw,
"weighted score {score_w} should ≥ unweighted score {score_nw}"
);
}
#[test]
fn large_cluster_split_alpha() {
let n = 90;
let mut data = cluster_2d(30, 0.0, 0.0, 0.1, 5);
data.extend(cluster_2d(30, 10.0, 0.0, 0.1, 6));
data.extend(cluster_2d(30, 0.0, 10.0, 0.1, 7));
let cfg = CblofConfig {
n_clusters: 3,
alpha: 0.5,
beta: 100.0, ..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
assert!(
!fit.large_cluster_ids.is_empty(),
"should have at least one large cluster"
);
}
#[test]
fn large_cluster_split_beta() {
let mut data = cluster_2d(30, 0.0, 0.0, 0.2, 8);
data.extend(cluster_2d(30, 5.0, 5.0, 0.2, 9));
let n = 60;
let cfg = CblofConfig {
n_clusters: 2,
alpha: 1.0,
beta: 1.0,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
assert_eq!(
fit.large_cluster_ids.len(),
2,
"both clusters should be large when alpha=1.0 and beta=1.0"
);
}
#[test]
fn k_equals_one_all_large() {
let data = cluster_2d(20, 1.0, 1.0, 0.5, 10);
let cfg = CblofConfig {
n_clusters: 1,
..default_cfg()
};
let fit = cblof_fit(&data, 20, 2, cfg).expect("CBLOF fit should succeed");
assert_eq!(
fit.large_cluster_ids,
vec![0],
"k=1 → single large cluster at index 0"
);
}
#[test]
fn k_equals_n_each_point_own_cluster() {
let n = 5;
let data: Vec<f32> = (0..n).flat_map(|i| [i as f32 * 10.0, 0.0_f32]).collect();
let cfg = CblofConfig {
n_clusters: n,
alpha: 0.9,
beta: 2.0,
max_iter: 200,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let total: usize = fit.cluster_sizes.iter().sum();
assert_eq!(total, n);
}
#[test]
fn invalid_k_zero() {
let data = cluster_2d(10, 0.0, 0.0, 1.0, 11);
let cfg = CblofConfig {
n_clusters: 0,
..default_cfg()
};
assert!(matches!(
cblof_fit(&data, 10, 2, cfg),
Err(AnomalyError::InvalidK { .. })
));
}
#[test]
fn invalid_k_exceeds_n() {
let data = cluster_2d(5, 0.0, 0.0, 1.0, 12);
let cfg = CblofConfig {
n_clusters: 10, ..default_cfg()
};
assert!(matches!(
cblof_fit(&data, 5, 2, cfg),
Err(AnomalyError::InvalidK { .. })
));
}
#[test]
fn dim_mismatch_score() {
let data = cluster_2d(20, 0.0, 0.0, 1.0, 13);
let cfg = CblofConfig {
n_clusters: 3,
..default_cfg()
};
let fit = cblof_fit(&data, 20, 2, cfg).expect("CBLOF fit should succeed");
let bad = vec![1.0_f32; 15];
assert!(matches!(
cblof_score(&fit, &bad, 5),
Err(AnomalyError::DimensionMismatch { .. })
));
}
#[test]
fn dim_mismatch_fit() {
let data = vec![1.0_f32; 11];
let cfg = CblofConfig {
n_clusters: 2,
..default_cfg()
};
assert!(matches!(
cblof_fit(&data, 5, 3, cfg),
Err(AnomalyError::DimensionMismatch { .. })
));
}
#[test]
fn empty_input() {
assert!(matches!(
cblof_fit(&[], 0, 2, default_cfg()),
Err(AnomalyError::EmptyInput)
));
}
#[test]
fn predict_marks_correct_fraction() {
let data = cluster_2d(20, 0.0, 0.0, 0.5, 14);
let cfg = CblofConfig {
n_clusters: 3,
contamination: 0.1,
..default_cfg()
};
let fit = cblof_fit(&data, 20, 2, cfg).expect("CBLOF fit should succeed");
let preds = cblof_predict(&fit, &data, 20).expect("CBLOF predict should succeed");
let n_out = preds.iter().filter(|&&p| p).count();
assert_eq!(n_out, 2, "expected 2 predicted outliers, got {n_out}");
}
#[test]
fn centroids_count_equals_k() {
let data = cluster_2d(30, 0.0, 0.0, 1.0, 15);
let k = 5;
let cfg = CblofConfig {
n_clusters: k,
..default_cfg()
};
let fit = cblof_fit(&data, 30, 2, cfg).expect("CBLOF fit should succeed");
assert_eq!(
fit.centroids.len(),
k * 2,
"expected {} centroid elements (k={k}, d=2), got {}",
k * 2,
fit.centroids.len()
);
}
#[test]
fn labels_valid_range() {
let data = cluster_2d(40, 1.0, -1.0, 1.0, 16);
let k = 4;
let cfg = CblofConfig {
n_clusters: k,
..default_cfg()
};
let fit = cblof_fit(&data, 40, 2, cfg).expect("CBLOF fit should succeed");
for (i, &l) in fit.labels.iter().enumerate() {
assert!(l < k, "label[{i}] = {l} out of range [0, {k})");
}
}
#[test]
fn cluster_sizes_sum_to_n() {
let n = 50;
let data = cluster_2d(n, 0.0, 0.0, 2.0, 17);
let cfg = CblofConfig {
n_clusters: 5,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let total: usize = fit.cluster_sizes.iter().sum();
assert_eq!(total, n, "cluster sizes should sum to n={n}, got {total}");
}
#[test]
fn deterministic_with_same_seed() {
let data = cluster_2d(30, 0.0, 0.0, 1.0, 18);
let cfg1 = CblofConfig {
n_clusters: 4,
init_seed: 77,
..default_cfg()
};
let cfg2 = cfg1.clone();
let fit1 = cblof_fit(&data, 30, 2, cfg1).expect("CBLOF fit should succeed");
let fit2 = cblof_fit(&data, 30, 2, cfg2).expect("CBLOF fit should succeed");
for (i, (&a, &b)) in fit1.centroids.iter().zip(fit2.centroids.iter()).enumerate() {
assert_eq!(a, b, "centroid[{i}] differs across runs with same seed");
}
}
#[test]
fn alpha_1_all_large() {
let data = cluster_2d(30, 0.0, 0.0, 1.0, 19);
let k = 3;
let cfg = CblofConfig {
n_clusters: k,
alpha: 1.0,
beta: 1000.0, ..default_cfg()
};
let fit = cblof_fit(&data, 30, 2, cfg).expect("CBLOF fit should succeed");
assert_eq!(
fit.large_cluster_ids.len(),
k,
"alpha=1.0 should make all {k} clusters large"
);
}
#[test]
fn inlier_score_lower_than_outlier() {
let data = cluster_2d(40, 0.0, 0.0, 0.5, 20);
let cfg = CblofConfig {
n_clusters: 4,
..default_cfg()
};
let fit = cblof_fit(&data, 40, 2, cfg).expect("CBLOF fit should succeed");
let inlier = vec![0.0_f32, 0.0]; let outlier = vec![100.0_f32, 100.0];
let s_in = cblof_score(&fit, &inlier, 1).expect("CBLOF score should succeed")[0];
let s_out = cblof_score(&fit, &outlier, 1).expect("CBLOF score should succeed")[0];
assert!(
s_out > s_in,
"outlier score {s_out} should exceed inlier score {s_in}"
);
}
#[test]
fn large_cluster_ids_nonempty() {
let data = cluster_2d(20, 0.0, 0.0, 1.0, 21);
let fit = cblof_fit(&data, 20, 2, default_cfg()).expect("CBLOF fit should succeed");
assert!(
!fit.large_cluster_ids.is_empty(),
"must have at least 1 large cluster"
);
}
#[test]
fn small_cluster_score_uses_nearest_lc() {
let mut data = cluster_2d(40, 0.0, 0.0, 0.1, 22);
data.extend(cluster_2d(40, 20.0, 0.0, 0.1, 23));
data.extend_from_slice(&[100.0_f32, 0.0]);
let n = 81;
let cfg = CblofConfig {
n_clusters: 3,
alpha: 0.95,
beta: 3.0,
contamination: 0.05,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let scores = cblof_score(&fit, &data, n).expect("CBLOF score should succeed");
let isolated_score = scores[80];
let mean_cluster_score: f32 = scores[..80].iter().sum::<f32>() / 80.0;
assert!(
isolated_score > mean_cluster_score,
"isolated SC point score {isolated_score} should > cluster mean {mean_cluster_score}"
);
}
#[test]
fn kmeans_converges_on_obvious_clusters() {
let n = 90;
let mut data = cluster_2d(30, 0.0, 0.0, 0.05, 24);
data.extend(cluster_2d(30, 100.0, 0.0, 0.05, 25));
data.extend(cluster_2d(30, 0.0, 100.0, 0.05, 26));
let cfg = CblofConfig {
n_clusters: 3,
alpha: 0.9,
beta: 2.0,
max_iter: 200,
..default_cfg()
};
let fit = cblof_fit(&data, n, 2, cfg).expect("CBLOF fit should succeed");
let mut sizes = fit.cluster_sizes.clone();
sizes.sort_unstable();
for &sz in &sizes {
assert_eq!(
sz, 30,
"each cluster should have exactly 30 points, got {sz}"
);
}
}
#[test]
fn score_is_nonneg() {
let data = cluster_2d(30, 0.0, 0.0, 1.0, 27);
let cfg = CblofConfig {
n_clusters: 4,
..default_cfg()
};
let fit = cblof_fit(&data, 30, 2, cfg).expect("CBLOF fit should succeed");
let scores = cblof_score(&fit, &data, 30).expect("CBLOF score should succeed");
for &s in &scores {
assert!(s >= 0.0, "CBLOF score should be non-negative, got {s}");
}
}
}