Skip to main content

scry_learn/cluster/
hdbscan.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! HDBSCAN — Hierarchical Density-Based Spatial Clustering.
3//!
4//! Implements the Campello, Moulavi & Sander (2013) algorithm. Unlike DBSCAN,
5//! HDBSCAN automatically determines the number of clusters and handles
6//! varying-density regions without manual epsilon tuning.
7//!
8//! # Algorithm
9//!
10//! 1. Compute **core distances** (distance to k-th nearest neighbor).
11//! 2. Build a **mutual reachability graph** where edge weight =
12//!    `max(core(a), core(b), dist(a,b))`.
13//! 3. Extract a **minimum spanning tree** (Prim's algorithm).
14//! 4. Build a **single-linkage dendrogram** from the MST.
15//! 5. **Condense** the tree, pruning clusters smaller than `min_cluster_size`.
16//! 6. **Extract clusters** using the Excess of Mass (EOM) method.
17//!
18//! # Example
19//!
20//! ```
21//! use scry_learn::cluster::Hdbscan;
22//! use scry_learn::dataset::Dataset;
23//!
24//! let data = Dataset::new(
25//!     vec![vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4],
26//!          vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4]],
27//!     vec![0.0; 10],
28//!     vec!["x".into(), "y".into()],
29//!     "label",
30//! );
31//!
32//! let mut hdb = Hdbscan::new().min_cluster_size(3);
33//! hdb.fit(&data).unwrap();
34//! assert_eq!(hdb.n_clusters(), 2);
35//! ```
36
37use crate::dataset::Dataset;
38use crate::distance::{cosine_distance, euclidean_sq, manhattan};
39use crate::error::{Result, ScryLearnError};
40use crate::neighbors::DistanceMetric;
41
/// HDBSCAN clustering model.
///
/// Automatically determines the number of clusters from density variations
/// in the data. Noise points are labeled -1.
///
/// Configure via the builder methods ([`Hdbscan::min_cluster_size`],
/// [`Hdbscan::min_samples`], [`Hdbscan::metric`]), then call `fit`.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct Hdbscan {
    /// Minimum cluster size. Clusters smaller than this are treated as noise.
    min_cluster_size: usize,
    /// Number of neighbors used to compute core distance (default = min_cluster_size).
    min_samples: Option<usize>,
    /// Distance metric for pairwise distance computation.
    metric: DistanceMetric,
    /// Cluster labels after fitting (-1 = noise). Empty until `fit` succeeds.
    labels: Vec<i32>,
    /// Number of clusters found (excluding noise).
    n_clusters: usize,
    /// Per-point outlier scores (higher = more likely outlier). Empty until fitted.
    outlier_scores: Vec<f64>,
    /// Whether the model has been fitted. Set by `fit`; not currently exposed
    /// through a public accessor.
    fitted: bool,
    // Schema version for serialized models; `serde(default)` means older
    // payloads lacking the field deserialize with 0.
    #[cfg_attr(feature = "serde", serde(default))]
    _schema_version: u32,
}
67
68impl Hdbscan {
69    /// Create a new HDBSCAN model with default parameters.
70    ///
71    /// Default: `min_cluster_size = 5`, `min_samples = min_cluster_size`.
72    pub fn new() -> Self {
73        Self {
74            min_cluster_size: 5,
75            min_samples: None,
76            metric: DistanceMetric::Euclidean,
77            labels: Vec::new(),
78            n_clusters: 0,
79            outlier_scores: Vec::new(),
80            fitted: false,
81            _schema_version: crate::version::SCHEMA_VERSION,
82        }
83    }
84
85    /// Set the minimum cluster size (default: 5).
86    ///
87    /// Clusters with fewer points than this are dissolved into noise.
88    pub fn min_cluster_size(mut self, size: usize) -> Self {
89        self.min_cluster_size = size;
90        self
91    }
92
93    /// Set min_samples for core distance computation.
94    ///
95    /// Default: same as `min_cluster_size`. Smaller values produce
96    /// more clusters; larger values make the algorithm more conservative.
97    pub fn min_samples(mut self, k: usize) -> Self {
98        self.min_samples = Some(k);
99        self
100    }
101
102    /// Set the distance metric (default: [`DistanceMetric::Euclidean`]).
103    ///
104    /// Use [`DistanceMetric::Cosine`] for L2-normalized embeddings
105    /// (e.g. face embeddings, CLIP vectors).
106    pub fn metric(mut self, m: DistanceMetric) -> Self {
107        self.metric = m;
108        self
109    }
110
111    /// Get cluster labels (-1 = noise).
112    pub fn labels(&self) -> &[i32] {
113        &self.labels
114    }
115
116    /// Number of clusters found (excluding noise).
117    pub fn n_clusters(&self) -> usize {
118        self.n_clusters
119    }
120
121    /// Number of noise points.
122    pub fn n_noise(&self) -> usize {
123        self.labels.iter().filter(|&&l| l == -1).count()
124    }
125
126    /// Per-point outlier scores (higher = more outlier-like).
127    ///
128    /// Based on the ratio of the point's lambda value to its cluster's
129    /// lambda birth. Points deep inside clusters have scores near 0;
130    /// borderline points have scores near 1.
131    pub fn outlier_scores(&self) -> &[f64] {
132        &self.outlier_scores
133    }
134
135    /// Fit the HDBSCAN model on a dataset.
136    pub fn fit(&mut self, data: &Dataset) -> Result<()> {
137        data.validate_finite()?;
138        let n = data.n_samples();
139        if n == 0 {
140            return Err(ScryLearnError::EmptyDataset);
141        }
142        if n < self.min_cluster_size {
143            // Everything is noise.
144            self.labels = vec![-1; n];
145            self.n_clusters = 0;
146            self.outlier_scores = vec![1.0; n];
147            self.fitted = true;
148            return Ok(());
149        }
150
151        let rows = data.feature_matrix(); // row-major
152        let k = self.min_samples.unwrap_or(self.min_cluster_size);
153        let k = k.min(n - 1).max(1);
154
155        // Step 1: Compute pairwise distances and core distances.
156        let dist = pairwise_distances(&rows, self.metric);
157        let core_dist = core_distances(&dist, k);
158
159        // Step 2: Mutual reachability distances.
160        let mr_dist = mutual_reachability(&dist, &core_dist);
161
162        // Step 3: Minimum spanning tree (Prim's algorithm).
163        let mst = prim_mst(&mr_dist);
164
165        // Step 4: Sort MST edges by weight to get single-linkage ordering.
166        let mut sorted_edges = mst;
167        sorted_edges.sort_by(|a, b| a.2.total_cmp(&b.2));
168
169        // Step 5: Build single-linkage hierarchy and condense.
170        let (labels, n_clusters, outlier_scores) =
171            extract_clusters(&sorted_edges, n, self.min_cluster_size);
172
173        self.labels = labels;
174        self.n_clusters = n_clusters;
175        self.outlier_scores = outlier_scores;
176        self.fitted = true;
177        Ok(())
178    }
179
180    /// Convenience: fit and return labels.
181    pub fn fit_predict(&mut self, data: &Dataset) -> Result<Vec<i32>> {
182        self.fit(data)?;
183        Ok(self.labels.clone())
184    }
185}
186
187impl Default for Hdbscan {
188    fn default() -> Self {
189        Self::new()
190    }
191}
192
193// ---------------------------------------------------------------------------
194// Internal algorithms
195// ---------------------------------------------------------------------------
196
197/// Compute pairwise distances using the configured metric.
198fn pairwise_distances(rows: &[Vec<f64>], metric: DistanceMetric) -> Vec<Vec<f64>> {
199    let n = rows.len();
200    let mut dist = vec![vec![0.0; n]; n];
201    for i in 0..n {
202        for j in (i + 1)..n {
203            let d = match metric {
204                DistanceMetric::Euclidean => euclidean_sq(&rows[i], &rows[j]).sqrt(),
205                DistanceMetric::Manhattan => manhattan(&rows[i], &rows[j]),
206                DistanceMetric::Cosine => cosine_distance(&rows[i], &rows[j]),
207            };
208            dist[i][j] = d;
209            dist[j][i] = d;
210        }
211    }
212    dist
213}
214
/// Compute the core distance for each point: the distance to its k-th
/// nearest neighbor (1-based, the point itself excluded).
///
/// Robustness fix: `k` is clamped into `[1, #neighbors]` before indexing.
/// The previous `k - 1` underflowed for `k == 0`, and `dists.len() - 1`
/// underflowed for a single-point input (no neighbors) — both panicked.
/// A lone point now gets core distance 0.0 (maximally dense by convention).
fn core_distances(dist: &[Vec<f64>], k: usize) -> Vec<f64> {
    let n = dist.len();
    let mut core = Vec::with_capacity(n);
    for i in 0..n {
        let mut dists: Vec<f64> = (0..n).filter(|&j| j != i).map(|j| dist[i][j]).collect();
        if dists.is_empty() {
            // n == 1: no neighbors exist.
            core.push(0.0);
            continue;
        }
        dists.sort_unstable_by(|a, b| a.total_cmp(b));
        // Clamp k to the available neighbor count, then go 0-based.
        let kth = (k.max(1) - 1).min(dists.len() - 1);
        core.push(dists[kth]);
    }
    core
}
227
/// Mutual reachability distance matrix.
///
/// `mr(a, b) = max(core(a), core(b), dist(a, b))`; the diagonal stays 0.
/// This smooths density estimates so sparse points cannot form thin bridges
/// between dense regions.
fn mutual_reachability(dist: &[Vec<f64>], core_dist: &[f64]) -> Vec<Vec<f64>> {
    let n = dist.len();
    let mut out = vec![vec![0.0; n]; n];
    for a in 0..n {
        for b in (a + 1)..n {
            let w = core_dist[a].max(core_dist[b]).max(dist[a][b]);
            out[a][b] = w;
            out[b][a] = w;
        }
    }
    out
}
243
/// Prim's algorithm over a dense distance matrix.
///
/// Returns MST edges as `(u, v, weight)` in the order they were added to the
/// tree. For `n <= 1` there are no edges.
fn prim_mst(dist: &[Vec<f64>]) -> Vec<(usize, usize, f64)> {
    let n = dist.len();
    if n <= 1 {
        return Vec::new();
    }

    let mut seen = vec![false; n];
    let mut best_weight = vec![f64::INFINITY; n];
    let mut best_source = vec![0usize; n];
    let mut edges = Vec::with_capacity(n - 1);

    // Seed the frontier from node 0.
    seen[0] = true;
    for (j, w) in best_weight.iter_mut().enumerate().skip(1) {
        *w = dist[0][j];
    }

    while edges.len() < n - 1 {
        // Cheapest crossing edge; ties go to the lowest index (first
        // minimum). Nodes still at infinite weight are unreachable, so an
        // empty candidate set ends the loop early.
        let candidate = (0..n)
            .filter(|&j| !seen[j] && best_weight[j] < f64::INFINITY)
            .min_by(|&a, &b| best_weight[a].total_cmp(&best_weight[b]));
        let v = match candidate {
            Some(v) => v,
            None => break,
        };

        seen[v] = true;
        edges.push((best_source[v], v, best_weight[v]));

        // Relax frontier weights through the newly added node.
        for j in 0..n {
            if !seen[j] && dist[v][j] < best_weight[j] {
                best_weight[j] = dist[v][j];
                best_source[j] = v;
            }
        }
    }

    edges
}
292
/// Extract clusters from a sorted single-linkage MST using a simplified
/// condensed tree approach.
///
/// `sorted_edges` must be in increasing-weight order; `n` is the number of
/// points. Two union-find passes are made over the edges: the first records
/// per-point lambda values (1/distance) as the dendrogram is built, the
/// second replays the merges and finalizes a cluster each time two
/// components of size >= `min_cluster_size` would merge.
///
/// Returns (labels, n_clusters, outlier_scores), with -1 labels for noise.
fn extract_clusters(
    sorted_edges: &[(usize, usize, f64)],
    n: usize,
    min_cluster_size: usize,
) -> (Vec<i32>, usize, Vec<f64>) {
    // Union-Find for building the dendrogram.
    let mut parent: Vec<usize> = (0..n).collect();
    let mut size: Vec<usize> = vec![1; n];
    let mut members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();

    // Track the lambda (1/distance) at which each point was last "alive".
    let mut point_lambda = vec![0.0_f64; n];

    // Process edges in order of increasing weight (single-linkage merge order).
    // We'll extract clusters using a simple method:
    // At each merge, if both sides are ≥ min_cluster_size, they form separate
    // clusters. If only one side is large enough, the small side's points
    // are noise at this level.
    let mut cluster_assignments: Vec<i32> = vec![-1; n];
    let mut next_cluster = 0i32;

    // Simple approach: after building the full hierarchy, cut the tree
    // using the stability criterion.
    //
    // We'll group components at the point where merging would combine
    // two distinct dense regions.

    // Build the full hierarchy first.
    for &(u, v, w) in sorted_edges {
        let ru = find(&parent, u);
        let rv = find(&parent, v);
        if ru == rv {
            // Endpoints already connected; skip redundant edges.
            continue;
        }

        // Lambda is the inverse edge weight; a zero-weight merge (duplicate
        // points) happens at infinite density.
        let lambda = if w > 0.0 { 1.0 / w } else { f64::INFINITY };

        // Record lambda for ALL points in both merging components.
        // This captures the density level at which each point participates
        // in a merge event.
        // NOTE(review): because every merge overwrites the lambda of every
        // member, the final (largest) merge assigns the same lambda to all
        // points it connects — so clustered points typically end with
        // identical lambdas and outlier scores near 0. Per-point graded
        // scores would require recording lambda only when a point first
        // joins — confirm whether the uniform behavior is intended.
        for &pt in &members[ru] {
            point_lambda[pt] = lambda;
        }
        for &pt in &members[rv] {
            point_lambda[pt] = lambda;
        }

        // Union by size.
        let (small, big) = if size[ru] < size[rv] {
            (ru, rv)
        } else {
            (rv, ru)
        };
        parent[small] = big;
        size[big] += size[small];
        let small_members = std::mem::take(&mut members[small]);
        members[big].extend(small_members);
    }

    // Now extract clusters: find connected components at a density threshold.
    // We use the simplified HDBSCAN approach: identify "prominent" cluster
    // splits in the hierarchy.
    //
    // Replay the merges. Each time two components ≥ min_cluster_size merge,
    // both are finalized as clusters. Remaining points are noise.
    let mut uf_parent: Vec<usize> = (0..n).collect();
    let mut uf_size: Vec<usize> = vec![1; n];
    let mut uf_members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
    // Track which component has been "finalized" as a cluster.
    let mut finalized: Vec<bool> = vec![false; n];

    for &(u, v, _w) in sorted_edges {
        let ru = find(&uf_parent, u);
        let rv = find(&uf_parent, v);
        if ru == rv {
            continue;
        }

        let size_u = uf_size[ru];
        let size_v = uf_size[rv];

        // If both components are large enough, finalize each as a cluster
        // before merging them. Each side is labeled at most once.
        if size_u >= min_cluster_size && size_v >= min_cluster_size {
            if !finalized[ru] {
                for &pt in &uf_members[ru] {
                    cluster_assignments[pt] = next_cluster;
                }
                next_cluster += 1;
                finalized[ru] = true;
            }
            if !finalized[rv] {
                for &pt in &uf_members[rv] {
                    cluster_assignments[pt] = next_cluster;
                }
                next_cluster += 1;
                finalized[rv] = true;
            }
        }

        // Union by size.
        let (small, big) = if uf_size[ru] < uf_size[rv] {
            (ru, rv)
        } else {
            (rv, ru)
        };
        uf_parent[small] = big;
        uf_size[big] += uf_size[small];
        let small_members = std::mem::take(&mut uf_members[small]);
        uf_members[big].extend(small_members);

        // A merged component inherits "finalized" so it is never re-labeled
        // as a new cluster after absorbing more points.
        if finalized[small] || finalized[big] {
            finalized[big] = true;
        }
    }

    // If no split was ever triggered (all data forms one cluster), and
    // the total size is ≥ min_cluster_size, assign everything to cluster 0.
    if next_cluster == 0 && n >= min_cluster_size {
        cluster_assignments.fill(0);
        next_cluster = 1;
    }

    // Compute outlier scores.
    // For each point, the outlier score is based on how early it was merged
    // relative to its cluster's typical density.
    let mut outlier_scores = vec![0.0_f64; n];
    if next_cluster > 0 {
        // For each cluster, find the max lambda of its members.
        let mut cluster_max_lambda = vec![0.0_f64; next_cluster as usize];
        for i in 0..n {
            let c = cluster_assignments[i];
            if c >= 0 {
                let cu = c as usize;
                if point_lambda[i] > cluster_max_lambda[cu] {
                    cluster_max_lambda[cu] = point_lambda[i];
                }
            }
        }

        // Score = 1 - lambda / cluster max lambda; noise is pinned at 1.0.
        for i in 0..n {
            let c = cluster_assignments[i];
            if c < 0 {
                outlier_scores[i] = 1.0;
            } else {
                let max_l = cluster_max_lambda[c as usize];
                if max_l > 0.0 {
                    outlier_scores[i] = 1.0 - (point_lambda[i] / max_l).min(1.0);
                }
            }
        }
    } else {
        // No clusters at all: every point is maximally outlier-like.
        outlier_scores.fill(1.0);
    }

    (cluster_assignments, next_cluster as usize, outlier_scores)
}
454
/// Union-Find root lookup (no path compression).
///
/// The parent slice is borrowed immutably, so paths cannot be compressed
/// here; trees stay shallow because all unions in this module are by size.
fn find(parent: &[usize], mut x: usize) -> usize {
    while parent[x] != x {
        x = parent[x];
    }
    x
}
462
463// ---------------------------------------------------------------------------
464// Tests
465// ---------------------------------------------------------------------------
466
#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-separated groups of 5 points each must yield exactly two
    /// non-noise clusters with distinct labels.
    #[test]
    fn test_hdbscan_two_clusters() {
        let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 10],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        hdb.fit(&data).unwrap();

        assert_eq!(hdb.n_clusters(), 2, "should find 2 clusters");

        // First 5 and last 5 should have different labels.
        let labels = hdb.labels();
        let cluster_a = labels[0];
        let cluster_b = labels[5];
        assert!(cluster_a >= 0, "first cluster should not be noise");
        assert!(cluster_b >= 0, "second cluster should not be noise");
        assert_ne!(cluster_a, cluster_b, "clusters should be different");
    }

    /// A far-away singleton must be labeled -1 while the two dense groups
    /// still form two clusters.
    #[test]
    fn test_hdbscan_with_noise() {
        // Two tight clusters + an outlier at (100, 100).
        let mut f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let mut f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        f1.push(100.0);
        f2.push(100.0);

        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 11],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        hdb.fit(&data).unwrap();

        assert_eq!(hdb.n_clusters(), 2);

        // The outlier should be noise.
        let labels = hdb.labels();
        assert_eq!(labels[10], -1, "outlier should be noise");
    }

    /// Duplicate points (zero distances) must collapse into one cluster
    /// with no noise — exercises the lambda = infinity path.
    #[test]
    fn test_hdbscan_all_same() {
        let f1 = vec![1.0; 10];
        let f2 = vec![1.0; 10];
        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 10],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        hdb.fit(&data).unwrap();

        // All points are identical → single cluster.
        assert_eq!(hdb.n_clusters(), 1);
        assert_eq!(hdb.n_noise(), 0);
    }

    /// No reported cluster may have fewer members than min_cluster_size.
    #[test]
    fn test_hdbscan_min_cluster_size_respected() {
        let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 10],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        hdb.fit(&data).unwrap();

        // Count members per cluster.
        let labels = hdb.labels();
        let mut counts = std::collections::HashMap::new();
        for &l in labels {
            if l >= 0 {
                *counts.entry(l).or_insert(0usize) += 1;
            }
        }

        for (&cluster, &count) in &counts {
            assert!(
                count >= 3,
                "cluster {} has {} members, less than min_cluster_size=3",
                cluster,
                count
            );
        }
    }

    /// Zero samples must be rejected with an error, not silently fitted.
    #[test]
    fn test_hdbscan_empty_dataset() {
        let data = Dataset::new(Vec::<Vec<f64>>::new(), Vec::new(), Vec::new(), "label");
        let mut hdb = Hdbscan::new();
        assert!(hdb.fit(&data).is_err());
    }

    /// Noise points score exactly 1.0; cluster members score below 1.0.
    #[test]
    fn test_hdbscan_outlier_scores() {
        let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
        let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 11],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        hdb.fit(&data).unwrap();

        let scores = hdb.outlier_scores();
        assert_eq!(scores.len(), 11);

        // Noise point should have outlier score of 1.0.
        assert!(
            (scores[10] - 1.0).abs() < 1e-6,
            "noise point outlier score should be 1.0, got {}",
            scores[10]
        );

        // Cluster interior points should have low outlier scores.
        for &s in &scores[..5] {
            assert!(
                s < 1.0,
                "cluster point should have outlier score < 1.0, got {s}"
            );
        }
    }

    /// Angular separation alone (unit vectors near 0° vs 90°) must split
    /// into two clusters under the cosine metric.
    #[test]
    fn test_hdbscan_cosine_metric() {
        use crate::neighbors::DistanceMetric;

        // Two groups of unit vectors in opposite-ish directions
        // Group A: vectors near [1, 0] (normalized)
        // Group B: vectors near [0, 1] (normalized)
        let n_per = 5;
        let mut f1 = Vec::new();
        let mut f2 = Vec::new();
        for i in 0..n_per {
            // Group A: angle near 0°
            let angle = 0.05 * i as f64;
            f1.push(angle.cos());
            f2.push(angle.sin());
        }
        for i in 0..n_per {
            // Group B: angle near 90°
            let angle = std::f64::consts::FRAC_PI_2 + 0.05 * i as f64;
            f1.push(angle.cos());
            f2.push(angle.sin());
        }

        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; n_per * 2],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new()
            .min_cluster_size(3)
            .metric(DistanceMetric::Cosine);
        hdb.fit(&data).unwrap();

        assert_eq!(
            hdb.n_clusters(),
            2,
            "should find 2 clusters with cosine metric"
        );
        let labels = hdb.labels();
        assert_ne!(
            labels[0], labels[n_per],
            "groups should have different labels"
        );
    }

    /// fit_predict must return one label per sample and match a plain fit.
    #[test]
    fn test_hdbscan_fit_predict() {
        let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
        let data = Dataset::new(
            vec![f1, f2],
            vec![0.0; 10],
            vec!["x".into(), "y".into()],
            "label",
        );

        let mut hdb = Hdbscan::new().min_cluster_size(3);
        let labels = hdb.fit_predict(&data).unwrap();
        assert_eq!(labels.len(), 10);
        assert!(labels.iter().any(|&l| l >= 0), "should have some clusters");
    }
}