use turbovec::IdMapIndex;
pub const TIER2_K: usize = 5;
pub enum DetectionResult {
Normal,
Anomaly {
score: f32,
nearest_incidents: Vec<u64>,
},
}
pub struct AnomalyDetector {
frozen_centroids: Vec<Vec<f32>>,
anomaly_threshold: f32,
}
fn euclidean(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b)
.map(|(x, y)| (x - y) * (x - y))
.sum::<f32>()
.sqrt()
}
impl AnomalyDetector {
pub fn new(frozen_centroids: Vec<Vec<f32>>, anomaly_threshold: f32) -> Self {
assert!(
!frozen_centroids.is_empty(),
"At least one centroid is required"
);
Self {
frozen_centroids,
anomaly_threshold,
}
}
pub fn fit(normal_vectors: &[f32], dim: usize, k: usize, anomaly_threshold: f32) -> Self {
assert!(dim > 0 && !normal_vectors.is_empty());
assert!(
normal_vectors.len().is_multiple_of(dim),
"Input vectors must form a flat n×dim array"
);
let n = normal_vectors.len() / dim;
let k = k.clamp(1, n);
let row = |i: usize| &normal_vectors[i * dim..(i + 1) * dim];
let mut centroids: Vec<Vec<f32>> = (0..k).map(|c| row(c * n / k).to_vec()).collect();
let mut assignment = vec![0usize; n];
for _ in 0..16 {
for (i, assign) in assignment.iter_mut().enumerate().take(n) {
let mut best = (f32::INFINITY, 0usize);
for (c, centroid) in centroids.iter().enumerate() {
let d = euclidean(row(i), centroid);
if d < best.0 {
best = (d, c);
}
}
*assign = best.1;
}
let mut sums = vec![vec![0f32; dim]; k];
let mut counts = vec![0usize; k];
for i in 0..n {
counts[assignment[i]] += 1;
for (s, v) in sums[assignment[i]].iter_mut().zip(row(i)) {
*s += v;
}
}
for c in 0..k {
if counts[c] > 0 {
for s in sums[c].iter_mut() {
*s /= counts[c] as f32;
}
centroids[c] = sums[c].clone();
}
}
}
Self::new(centroids, anomaly_threshold)
}
pub fn fit_auto(normal_vectors: &[f32], dim: usize, k: usize, floor: f32) -> Self {
const AUTO_THRESHOLD_MARGIN: f32 = 1.25;
let mut detector = Self::fit(normal_vectors, dim, k, floor);
let n = normal_vectors.len() / dim;
let mut distances: Vec<f32> = (0..n)
.map(|i| detector.min_distance(&normal_vectors[i * dim..(i + 1) * dim]))
.collect();
distances.sort_by(f32::total_cmp);
let p99 = distances[((distances.len() - 1) as f32 * 0.99) as usize];
detector.anomaly_threshold = (p99 * AUTO_THRESHOLD_MARGIN).max(floor);
detector
}
pub fn threshold(&self) -> f32 {
self.anomaly_threshold
}
pub fn min_distance(&self, vector: &[f32]) -> f32 {
self.frozen_centroids
.iter()
.map(|c| euclidean(c, vector))
.fold(f32::INFINITY, f32::min)
}
pub fn detect(&self, vector: &[f32], search_index: &IdMapIndex) -> DetectionResult {
self.detect_filtered(vector, search_index, None)
}
pub fn detect_filtered(
&self,
vector: &[f32],
search_index: &IdMapIndex,
allowlist: Option<&[u64]>,
) -> DetectionResult {
let score = self.min_distance(vector);
if score <= self.anomaly_threshold {
return DetectionResult::Normal;
}
DetectionResult::Anomaly {
score,
nearest_incidents: tier2_context(vector, search_index, allowlist),
}
}
}
fn tier2_context(vector: &[f32], index: &IdMapIndex, allowlist: Option<&[u64]>) -> Vec<u64> {
if index.is_empty() || index.dim() != vector.len() {
return Vec::new();
}
match allowlist {
Some(ids) => {
let present: Vec<u64> = ids
.iter()
.copied()
.filter(|&id| index.contains(id))
.collect();
if present.is_empty() {
return Vec::new();
}
index
.search_with_allowlist(vector, TIER2_K, Some(&present))
.1
}
None => index.search(vector, TIER2_K).1,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn fit_two_clusters_and_min_distance() {
let mut data = Vec::new();
for j in 0..10 {
let eps = j as f32 * 0.01;
data.extend_from_slice(&[1.0, eps, 0.0, 0.0]);
data.extend_from_slice(&[0.0, 0.0, 1.0, eps]);
}
let det = AnomalyDetector::fit(&data, 4, 2, 0.5);
assert!(
det.min_distance(&[1.0, 0.0, 0.0, 0.0]) < 0.2,
"Cluster centers should be close to their respective members"
);
assert!(
det.min_distance(&[-1.0, 0.0, 0.0, 0.0]) > 1.0,
"Opposite directions should yield large distances"
);
}
#[test]
fn fit_auto_derives_threshold_from_spread() {
let mut data = Vec::new();
for j in 0..50 {
let eps = j as f32 * 0.01; data.extend_from_slice(&[1.0, eps, 0.0, 0.0]);
}
let det = AnomalyDetector::fit_auto(&data, 4, 1, 0.05);
assert!(
det.threshold() > 0.05,
"p99-derived threshold should exceed the floor: {}",
det.threshold()
);
assert!(det.min_distance(&[1.0, 0.2, 0.0, 0.0]) <= det.threshold());
let tight: Vec<f32> = [1.0f32, 0.0, 0.0, 0.0].repeat(20);
let det = AnomalyDetector::fit_auto(&tight, 4, 1, 0.5);
assert_eq!(det.threshold(), 0.5, "floor must dominate when p99 ≈ 0");
}
}