use crate::dataset::Dataset;
use crate::distance::{cosine_distance, euclidean_sq, manhattan};
use crate::error::{Result, ScryLearnError};
use crate::neighbors::DistanceMetric;
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct Hdbscan {
min_cluster_size: usize,
min_samples: Option<usize>,
metric: DistanceMetric,
labels: Vec<i32>,
n_clusters: usize,
outlier_scores: Vec<f64>,
fitted: bool,
#[cfg_attr(feature = "serde", serde(default))]
_schema_version: u32,
}
impl Hdbscan {
pub fn new() -> Self {
Self {
min_cluster_size: 5,
min_samples: None,
metric: DistanceMetric::Euclidean,
labels: Vec::new(),
n_clusters: 0,
outlier_scores: Vec::new(),
fitted: false,
_schema_version: crate::version::SCHEMA_VERSION,
}
}
pub fn min_cluster_size(mut self, size: usize) -> Self {
self.min_cluster_size = size;
self
}
pub fn min_samples(mut self, k: usize) -> Self {
self.min_samples = Some(k);
self
}
pub fn metric(mut self, m: DistanceMetric) -> Self {
self.metric = m;
self
}
pub fn labels(&self) -> &[i32] {
&self.labels
}
pub fn n_clusters(&self) -> usize {
self.n_clusters
}
pub fn n_noise(&self) -> usize {
self.labels.iter().filter(|&&l| l == -1).count()
}
pub fn outlier_scores(&self) -> &[f64] {
&self.outlier_scores
}
pub fn fit(&mut self, data: &Dataset) -> Result<()> {
data.validate_finite()?;
let n = data.n_samples();
if n == 0 {
return Err(ScryLearnError::EmptyDataset);
}
if n < self.min_cluster_size {
self.labels = vec![-1; n];
self.n_clusters = 0;
self.outlier_scores = vec![1.0; n];
self.fitted = true;
return Ok(());
}
let rows = data.feature_matrix(); let k = self.min_samples.unwrap_or(self.min_cluster_size);
let k = k.min(n - 1).max(1);
let dist = pairwise_distances(&rows, self.metric);
let core_dist = core_distances(&dist, k);
let mr_dist = mutual_reachability(&dist, &core_dist);
let mst = prim_mst(&mr_dist);
let mut sorted_edges = mst;
sorted_edges.sort_by(|a, b| a.2.total_cmp(&b.2));
let (labels, n_clusters, outlier_scores) =
extract_clusters(&sorted_edges, n, self.min_cluster_size);
self.labels = labels;
self.n_clusters = n_clusters;
self.outlier_scores = outlier_scores;
self.fitted = true;
Ok(())
}
pub fn fit_predict(&mut self, data: &Dataset) -> Result<Vec<i32>> {
self.fit(data)?;
Ok(self.labels.clone())
}
}
impl Default for Hdbscan {
fn default() -> Self {
Self::new()
}
}
fn pairwise_distances(rows: &[Vec<f64>], metric: DistanceMetric) -> Vec<Vec<f64>> {
let n = rows.len();
let mut dist = vec![vec![0.0; n]; n];
for i in 0..n {
for j in (i + 1)..n {
let d = match metric {
DistanceMetric::Euclidean => euclidean_sq(&rows[i], &rows[j]).sqrt(),
DistanceMetric::Manhattan => manhattan(&rows[i], &rows[j]),
DistanceMetric::Cosine => cosine_distance(&rows[i], &rows[j]),
};
dist[i][j] = d;
dist[j][i] = d;
}
}
dist
}
fn core_distances(dist: &[Vec<f64>], k: usize) -> Vec<f64> {
let n = dist.len();
let mut core = Vec::with_capacity(n);
for i in 0..n {
let mut dists: Vec<f64> = (0..n).filter(|&j| j != i).map(|j| dist[i][j]).collect();
dists.sort_unstable_by(|a, b| a.total_cmp(b));
let kth = (k - 1).min(dists.len() - 1);
core.push(dists[kth]);
}
core
}
fn mutual_reachability(dist: &[Vec<f64>], core_dist: &[f64]) -> Vec<Vec<f64>> {
let n = dist.len();
let mut mr = vec![vec![0.0; n]; n];
for i in 0..n {
for j in (i + 1)..n {
let d = dist[i][j].max(core_dist[i]).max(core_dist[j]);
mr[i][j] = d;
mr[j][i] = d;
}
}
mr
}
fn prim_mst(dist: &[Vec<f64>]) -> Vec<(usize, usize, f64)> {
let n = dist.len();
if n <= 1 {
return Vec::new();
}
let mut in_tree = vec![false; n];
let mut min_edge = vec![f64::INFINITY; n];
let mut edge_from = vec![0usize; n];
let mut edges = Vec::with_capacity(n - 1);
in_tree[0] = true;
for j in 1..n {
min_edge[j] = dist[0][j];
edge_from[j] = 0;
}
for _ in 0..(n - 1) {
let mut best = usize::MAX;
let mut best_w = f64::INFINITY;
for j in 0..n {
if !in_tree[j] && min_edge[j] < best_w {
best = j;
best_w = min_edge[j];
}
}
if best == usize::MAX {
break;
}
in_tree[best] = true;
edges.push((edge_from[best], best, best_w));
for j in 0..n {
if !in_tree[j] && dist[best][j] < min_edge[j] {
min_edge[j] = dist[best][j];
edge_from[j] = best;
}
}
}
edges
}
fn extract_clusters(
sorted_edges: &[(usize, usize, f64)],
n: usize,
min_cluster_size: usize,
) -> (Vec<i32>, usize, Vec<f64>) {
let mut parent: Vec<usize> = (0..n).collect();
let mut size: Vec<usize> = vec![1; n];
let mut members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
let mut point_lambda = vec![0.0_f64; n];
let mut cluster_assignments: Vec<i32> = vec![-1; n];
let mut next_cluster = 0i32;
for &(u, v, w) in sorted_edges {
let ru = find(&parent, u);
let rv = find(&parent, v);
if ru == rv {
continue;
}
let lambda = if w > 0.0 { 1.0 / w } else { f64::INFINITY };
for &pt in &members[ru] {
point_lambda[pt] = lambda;
}
for &pt in &members[rv] {
point_lambda[pt] = lambda;
}
let (small, big) = if size[ru] < size[rv] {
(ru, rv)
} else {
(rv, ru)
};
parent[small] = big;
size[big] += size[small];
let small_members = std::mem::take(&mut members[small]);
members[big].extend(small_members);
}
let mut uf_parent: Vec<usize> = (0..n).collect();
let mut uf_size: Vec<usize> = vec![1; n];
let mut uf_members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
let mut finalized: Vec<bool> = vec![false; n];
for &(u, v, _w) in sorted_edges {
let ru = find(&uf_parent, u);
let rv = find(&uf_parent, v);
if ru == rv {
continue;
}
let size_u = uf_size[ru];
let size_v = uf_size[rv];
if size_u >= min_cluster_size && size_v >= min_cluster_size {
if !finalized[ru] {
for &pt in &uf_members[ru] {
cluster_assignments[pt] = next_cluster;
}
next_cluster += 1;
finalized[ru] = true;
}
if !finalized[rv] {
for &pt in &uf_members[rv] {
cluster_assignments[pt] = next_cluster;
}
next_cluster += 1;
finalized[rv] = true;
}
}
let (small, big) = if uf_size[ru] < uf_size[rv] {
(ru, rv)
} else {
(rv, ru)
};
uf_parent[small] = big;
uf_size[big] += uf_size[small];
let small_members = std::mem::take(&mut uf_members[small]);
uf_members[big].extend(small_members);
if finalized[small] || finalized[big] {
finalized[big] = true;
}
}
if next_cluster == 0 && n >= min_cluster_size {
cluster_assignments.fill(0);
next_cluster = 1;
}
let mut outlier_scores = vec![0.0_f64; n];
if next_cluster > 0 {
let mut cluster_max_lambda = vec![0.0_f64; next_cluster as usize];
for i in 0..n {
let c = cluster_assignments[i];
if c >= 0 {
let cu = c as usize;
if point_lambda[i] > cluster_max_lambda[cu] {
cluster_max_lambda[cu] = point_lambda[i];
}
}
}
for i in 0..n {
let c = cluster_assignments[i];
if c < 0 {
outlier_scores[i] = 1.0;
} else {
let max_l = cluster_max_lambda[c as usize];
if max_l > 0.0 {
outlier_scores[i] = 1.0 - (point_lambda[i] / max_l).min(1.0);
}
}
}
} else {
outlier_scores.fill(1.0);
}
(cluster_assignments, next_cluster as usize, outlier_scores)
}
fn find(parent: &[usize], mut x: usize) -> usize {
while parent[x] != x {
x = parent[x];
}
x
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hdbscan_two_clusters() {
let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 10],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
hdb.fit(&data).unwrap();
assert_eq!(hdb.n_clusters(), 2, "should find 2 clusters");
let labels = hdb.labels();
let cluster_a = labels[0];
let cluster_b = labels[5];
assert!(cluster_a >= 0, "first cluster should not be noise");
assert!(cluster_b >= 0, "second cluster should not be noise");
assert_ne!(cluster_a, cluster_b, "clusters should be different");
}
#[test]
fn test_hdbscan_with_noise() {
let mut f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let mut f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
f1.push(100.0);
f2.push(100.0);
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 11],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
hdb.fit(&data).unwrap();
assert_eq!(hdb.n_clusters(), 2);
let labels = hdb.labels();
assert_eq!(labels[10], -1, "outlier should be noise");
}
#[test]
fn test_hdbscan_all_same() {
let f1 = vec![1.0; 10];
let f2 = vec![1.0; 10];
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 10],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
hdb.fit(&data).unwrap();
assert_eq!(hdb.n_clusters(), 1);
assert_eq!(hdb.n_noise(), 0);
}
#[test]
fn test_hdbscan_min_cluster_size_respected() {
let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 10],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
hdb.fit(&data).unwrap();
let labels = hdb.labels();
let mut counts = std::collections::HashMap::new();
for &l in labels {
if l >= 0 {
*counts.entry(l).or_insert(0usize) += 1;
}
}
for (&cluster, &count) in &counts {
assert!(
count >= 3,
"cluster {} has {} members, less than min_cluster_size=3",
cluster,
count
);
}
}
#[test]
fn test_hdbscan_empty_dataset() {
let data = Dataset::new(Vec::<Vec<f64>>::new(), Vec::new(), Vec::new(), "label");
let mut hdb = Hdbscan::new();
assert!(hdb.fit(&data).is_err());
}
#[test]
fn test_hdbscan_outlier_scores() {
let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 11],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
hdb.fit(&data).unwrap();
let scores = hdb.outlier_scores();
assert_eq!(scores.len(), 11);
assert!(
(scores[10] - 1.0).abs() < 1e-6,
"noise point outlier score should be 1.0, got {}",
scores[10]
);
for &s in &scores[..5] {
assert!(
s < 1.0,
"cluster point should have outlier score < 1.0, got {s}"
);
}
}
#[test]
fn test_hdbscan_cosine_metric() {
use crate::neighbors::DistanceMetric;
let n_per = 5;
let mut f1 = Vec::new();
let mut f2 = Vec::new();
for i in 0..n_per {
let angle = 0.05 * i as f64;
f1.push(angle.cos());
f2.push(angle.sin());
}
for i in 0..n_per {
let angle = std::f64::consts::FRAC_PI_2 + 0.05 * i as f64;
f1.push(angle.cos());
f2.push(angle.sin());
}
let data = Dataset::new(
vec![f1, f2],
vec![0.0; n_per * 2],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new()
.min_cluster_size(3)
.metric(DistanceMetric::Cosine);
hdb.fit(&data).unwrap();
assert_eq!(
hdb.n_clusters(),
2,
"should find 2 clusters with cosine metric"
);
let labels = hdb.labels();
assert_ne!(
labels[0], labels[n_per],
"groups should have different labels"
);
}
#[test]
fn test_hdbscan_fit_predict() {
let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
let data = Dataset::new(
vec![f1, f2],
vec![0.0; 10],
vec!["x".into(), "y".into()],
"label",
);
let mut hdb = Hdbscan::new().min_cluster_size(3);
let labels = hdb.fit_predict(&data).unwrap();
assert_eq!(labels.len(), 10);
assert!(labels.iter().any(|&l| l >= 0), "should have some clusters");
}
}