use anofox_ml_core::{FitUnsupervised, Result, RustMlError};
use ndarray::{Array1, Array2};
#[derive(Debug, Clone)]
pub struct Hdbscan {
pub min_samples: usize,
pub min_cluster_size: usize,
}
impl Hdbscan {
pub fn new(min_samples: usize, min_cluster_size: usize) -> Self {
Self {
min_samples,
min_cluster_size,
}
}
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FittedHdbscan {
pub labels: Array1<f64>,
pub n_clusters: usize,
}
fn euclid(a: &[f64], b: &[f64]) -> f64 {
a.iter()
.zip(b.iter())
.map(|(x, y)| (x - y).powi(2))
.sum::<f64>()
.sqrt()
}
struct DSU {
parent: Vec<usize>,
size: Vec<usize>,
}
impl DSU {
fn new(n: usize) -> Self {
Self {
parent: (0..n).collect(),
size: vec![1; n],
}
}
fn find(&mut self, mut x: usize) -> usize {
while self.parent[x] != x {
self.parent[x] = self.parent[self.parent[x]];
x = self.parent[x];
}
x
}
fn union(&mut self, a: usize, b: usize) -> usize {
let ra = self.find(a);
let rb = self.find(b);
if ra == rb {
return ra;
}
let (big, small) = if self.size[ra] >= self.size[rb] {
(ra, rb)
} else {
(rb, ra)
};
self.parent[small] = big;
self.size[big] += self.size[small];
big
}
}
impl FitUnsupervised<f64> for Hdbscan {
type Fitted = FittedHdbscan;
fn fit(&self, x: &Array2<f64>) -> Result<Self::Fitted> {
let n = x.nrows();
if n == 0 {
return Err(RustMlError::EmptyInput("empty input".into()));
}
if self.min_samples < 1 || self.min_cluster_size < 2 {
return Err(RustMlError::InvalidParameter(
"min_samples >= 1, min_cluster_size >= 2".into(),
));
}
let mut d = vec![vec![0.0_f64; n]; n];
for i in 0..n {
let xi: Vec<f64> = x.row(i).iter().copied().collect();
for j in (i + 1)..n {
let xj: Vec<f64> = x.row(j).iter().copied().collect();
let val = euclid(&xi, &xj);
d[i][j] = val;
d[j][i] = val;
}
}
let mut core = vec![0.0_f64; n];
for i in 0..n {
let mut nb: Vec<f64> = (0..n).filter(|&j| j != i).map(|j| d[i][j]).collect();
nb.sort_by(|a, b| a.partial_cmp(b).unwrap());
let k = (self.min_samples - 1).min(nb.len().saturating_sub(1));
core[i] = if nb.is_empty() { 0.0 } else { nb[k] };
}
let mut mr = vec![vec![0.0_f64; n]; n];
for i in 0..n {
for j in 0..n {
if i != j {
mr[i][j] = d[i][j].max(core[i]).max(core[j]);
}
}
}
let mut in_tree = vec![false; n];
let mut min_dist = vec![f64::INFINITY; n];
let mut closest = vec![0_usize; n];
in_tree[0] = true;
for j in 1..n {
min_dist[j] = mr[0][j];
closest[j] = 0;
}
let mut edges: Vec<(usize, usize, f64)> = Vec::with_capacity(n - 1);
for _ in 1..n {
let mut best = f64::INFINITY;
let mut bi = 0;
for j in 0..n {
if !in_tree[j] && min_dist[j] < best {
best = min_dist[j];
bi = j;
}
}
edges.push((closest[bi], bi, best));
in_tree[bi] = true;
for j in 0..n {
if !in_tree[j] && mr[bi][j] < min_dist[j] {
min_dist[j] = mr[bi][j];
closest[j] = bi;
}
}
}
edges.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap());
let dsu = DSU::new(n);
let _ = dsu;
let mut dsu2 = DSU::new(n);
let mut cluster_label = vec![-1.0_f64; n];
let mut frozen_as_noise = vec![false; n];
let mut next_id = 0.0_f64;
let mut finalised: std::collections::HashSet<usize> = std::collections::HashSet::new();
for &(a, b, _) in &edges {
let ra = dsu2.find(a);
let rb = dsu2.find(b);
if ra == rb {
continue;
}
let a_fin = finalised.contains(&ra);
let b_fin = finalised.contains(&rb);
let sa = dsu2.size[ra];
let sb = dsu2.size[rb];
if a_fin && b_fin {
continue;
} else if a_fin || b_fin {
let join_root = if a_fin { rb } else { ra };
for i in 0..n {
if dsu2.find(i) == join_root && cluster_label[i] < 0.0 {
frozen_as_noise[i] = true;
}
}
let old_fin_root = if a_fin { ra } else { rb };
dsu2.union(a, b);
let new_root = dsu2.find(a);
if new_root != old_fin_root {
finalised.remove(&old_fin_root);
finalised.insert(new_root);
}
} else {
let a_big = sa >= self.min_cluster_size;
let b_big = sb >= self.min_cluster_size;
if a_big && b_big {
for r in [ra, rb] {
let label = next_id;
next_id += 1.0;
for i in 0..n {
if dsu2.find(i) == r && !frozen_as_noise[i] {
cluster_label[i] = label;
}
}
finalised.insert(r);
}
} else {
dsu2.union(a, b);
}
}
}
let mut labels: Vec<f64> = cluster_label
.iter()
.enumerate()
.map(|(i, &l)| if frozen_as_noise[i] { -1.0 } else { l })
.collect();
let mut n_clusters = next_id as usize;
if n_clusters == 0 {
let mut sizes = std::collections::HashMap::<usize, usize>::new();
for i in 0..n {
if !frozen_as_noise[i] {
*sizes.entry(dsu2.find(i)).or_insert(0) += 1;
}
}
if let Some((&big_root, &sz)) = sizes.iter().max_by_key(|(_, s)| *s) {
if sz >= self.min_cluster_size {
for i in 0..n {
if !frozen_as_noise[i] && dsu2.find(i) == big_root {
labels[i] = 0.0;
} else {
labels[i] = -1.0;
}
}
n_clusters = 1;
}
}
}
Ok(FittedHdbscan {
labels: Array1::from_vec(labels),
n_clusters,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use ndarray::array;
#[test]
fn test_hdbscan_two_blobs_with_noise() {
let x = array![
[0.0_f64, 0.0],
[0.1, 0.1],
[-0.1, 0.2],
[0.05, -0.1],
[0.0, 0.15],
[10.0, 10.0],
[10.1, 9.9],
[9.8, 10.2],
[10.05, 9.95],
[10.0, 10.1],
[50.0, 50.0],
];
let fitted = Hdbscan::new(2, 3).fit(&x).unwrap();
let l0 = fitted.labels[0];
for i in 1..5 {
assert_eq!(fitted.labels[i], l0);
}
let l5 = fitted.labels[5];
for i in 6..10 {
assert_eq!(fitted.labels[i], l5);
}
assert_ne!(l0, l5);
assert_eq!(
fitted.labels[10], -1.0,
"outlier should be noise, got {}",
fitted.labels[10]
);
}
}