1use crate::dataset::Dataset;
38use crate::distance::{cosine_distance, euclidean_sq, manhattan};
39use crate::error::{Result, ScryLearnError};
40use crate::neighbors::DistanceMetric;
41
42#[derive(Clone, Debug)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48#[non_exhaustive]
49pub struct Hdbscan {
50 min_cluster_size: usize,
52 min_samples: Option<usize>,
54 metric: DistanceMetric,
56 labels: Vec<i32>,
58 n_clusters: usize,
60 outlier_scores: Vec<f64>,
62 fitted: bool,
64 #[cfg_attr(feature = "serde", serde(default))]
65 _schema_version: u32,
66}
67
68impl Hdbscan {
69 pub fn new() -> Self {
73 Self {
74 min_cluster_size: 5,
75 min_samples: None,
76 metric: DistanceMetric::Euclidean,
77 labels: Vec::new(),
78 n_clusters: 0,
79 outlier_scores: Vec::new(),
80 fitted: false,
81 _schema_version: crate::version::SCHEMA_VERSION,
82 }
83 }
84
85 pub fn min_cluster_size(mut self, size: usize) -> Self {
89 self.min_cluster_size = size;
90 self
91 }
92
93 pub fn min_samples(mut self, k: usize) -> Self {
98 self.min_samples = Some(k);
99 self
100 }
101
102 pub fn metric(mut self, m: DistanceMetric) -> Self {
107 self.metric = m;
108 self
109 }
110
111 pub fn labels(&self) -> &[i32] {
113 &self.labels
114 }
115
116 pub fn n_clusters(&self) -> usize {
118 self.n_clusters
119 }
120
121 pub fn n_noise(&self) -> usize {
123 self.labels.iter().filter(|&&l| l == -1).count()
124 }
125
126 pub fn outlier_scores(&self) -> &[f64] {
132 &self.outlier_scores
133 }
134
135 pub fn fit(&mut self, data: &Dataset) -> Result<()> {
137 data.validate_finite()?;
138 let n = data.n_samples();
139 if n == 0 {
140 return Err(ScryLearnError::EmptyDataset);
141 }
142 if n < self.min_cluster_size {
143 self.labels = vec![-1; n];
145 self.n_clusters = 0;
146 self.outlier_scores = vec![1.0; n];
147 self.fitted = true;
148 return Ok(());
149 }
150
151 let rows = data.feature_matrix(); let k = self.min_samples.unwrap_or(self.min_cluster_size);
153 let k = k.min(n - 1).max(1);
154
155 let dist = pairwise_distances(&rows, self.metric);
157 let core_dist = core_distances(&dist, k);
158
159 let mr_dist = mutual_reachability(&dist, &core_dist);
161
162 let mst = prim_mst(&mr_dist);
164
165 let mut sorted_edges = mst;
167 sorted_edges.sort_by(|a, b| a.2.total_cmp(&b.2));
168
169 let (labels, n_clusters, outlier_scores) =
171 extract_clusters(&sorted_edges, n, self.min_cluster_size);
172
173 self.labels = labels;
174 self.n_clusters = n_clusters;
175 self.outlier_scores = outlier_scores;
176 self.fitted = true;
177 Ok(())
178 }
179
180 pub fn fit_predict(&mut self, data: &Dataset) -> Result<Vec<i32>> {
182 self.fit(data)?;
183 Ok(self.labels.clone())
184 }
185}
186
187impl Default for Hdbscan {
188 fn default() -> Self {
189 Self::new()
190 }
191}
192
193fn pairwise_distances(rows: &[Vec<f64>], metric: DistanceMetric) -> Vec<Vec<f64>> {
199 let n = rows.len();
200 let mut dist = vec![vec![0.0; n]; n];
201 for i in 0..n {
202 for j in (i + 1)..n {
203 let d = match metric {
204 DistanceMetric::Euclidean => euclidean_sq(&rows[i], &rows[j]).sqrt(),
205 DistanceMetric::Manhattan => manhattan(&rows[i], &rows[j]),
206 DistanceMetric::Cosine => cosine_distance(&rows[i], &rows[j]),
207 };
208 dist[i][j] = d;
209 dist[j][i] = d;
210 }
211 }
212 dist
213}
214
215fn core_distances(dist: &[Vec<f64>], k: usize) -> Vec<f64> {
217 let n = dist.len();
218 let mut core = Vec::with_capacity(n);
219 for i in 0..n {
220 let mut dists: Vec<f64> = (0..n).filter(|&j| j != i).map(|j| dist[i][j]).collect();
221 dists.sort_unstable_by(|a, b| a.total_cmp(b));
222 let kth = (k - 1).min(dists.len() - 1);
223 core.push(dists[kth]);
224 }
225 core
226}
227
228fn mutual_reachability(dist: &[Vec<f64>], core_dist: &[f64]) -> Vec<Vec<f64>> {
232 let n = dist.len();
233 let mut mr = vec![vec![0.0; n]; n];
234 for i in 0..n {
235 for j in (i + 1)..n {
236 let d = dist[i][j].max(core_dist[i]).max(core_dist[j]);
237 mr[i][j] = d;
238 mr[j][i] = d;
239 }
240 }
241 mr
242}
243
244fn prim_mst(dist: &[Vec<f64>]) -> Vec<(usize, usize, f64)> {
246 let n = dist.len();
247 if n <= 1 {
248 return Vec::new();
249 }
250
251 let mut in_tree = vec![false; n];
252 let mut min_edge = vec![f64::INFINITY; n];
253 let mut edge_from = vec![0usize; n];
254 let mut edges = Vec::with_capacity(n - 1);
255
256 in_tree[0] = true;
258 for j in 1..n {
259 min_edge[j] = dist[0][j];
260 edge_from[j] = 0;
261 }
262
263 for _ in 0..(n - 1) {
264 let mut best = usize::MAX;
266 let mut best_w = f64::INFINITY;
267 for j in 0..n {
268 if !in_tree[j] && min_edge[j] < best_w {
269 best = j;
270 best_w = min_edge[j];
271 }
272 }
273
274 if best == usize::MAX {
275 break;
276 }
277
278 in_tree[best] = true;
279 edges.push((edge_from[best], best, best_w));
280
281 for j in 0..n {
283 if !in_tree[j] && dist[best][j] < min_edge[j] {
284 min_edge[j] = dist[best][j];
285 edge_from[j] = best;
286 }
287 }
288 }
289
290 edges
291}
292
293fn extract_clusters(
298 sorted_edges: &[(usize, usize, f64)],
299 n: usize,
300 min_cluster_size: usize,
301) -> (Vec<i32>, usize, Vec<f64>) {
302 let mut parent: Vec<usize> = (0..n).collect();
304 let mut size: Vec<usize> = vec![1; n];
305 let mut members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
306
307 let mut point_lambda = vec![0.0_f64; n];
309
310 let mut cluster_assignments: Vec<i32> = vec![-1; n];
316 let mut next_cluster = 0i32;
317
318 for &(u, v, w) in sorted_edges {
326 let ru = find(&parent, u);
327 let rv = find(&parent, v);
328 if ru == rv {
329 continue;
330 }
331
332 let lambda = if w > 0.0 { 1.0 / w } else { f64::INFINITY };
333
334 for &pt in &members[ru] {
338 point_lambda[pt] = lambda;
339 }
340 for &pt in &members[rv] {
341 point_lambda[pt] = lambda;
342 }
343
344 let (small, big) = if size[ru] < size[rv] {
346 (ru, rv)
347 } else {
348 (rv, ru)
349 };
350 parent[small] = big;
351 size[big] += size[small];
352 let small_members = std::mem::take(&mut members[small]);
353 members[big].extend(small_members);
354 }
355
356 let mut uf_parent: Vec<usize> = (0..n).collect();
363 let mut uf_size: Vec<usize> = vec![1; n];
364 let mut uf_members: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
365 let mut finalized: Vec<bool> = vec![false; n];
367
368 for &(u, v, _w) in sorted_edges {
369 let ru = find(&uf_parent, u);
370 let rv = find(&uf_parent, v);
371 if ru == rv {
372 continue;
373 }
374
375 let size_u = uf_size[ru];
376 let size_v = uf_size[rv];
377
378 if size_u >= min_cluster_size && size_v >= min_cluster_size {
381 if !finalized[ru] {
382 for &pt in &uf_members[ru] {
383 cluster_assignments[pt] = next_cluster;
384 }
385 next_cluster += 1;
386 finalized[ru] = true;
387 }
388 if !finalized[rv] {
389 for &pt in &uf_members[rv] {
390 cluster_assignments[pt] = next_cluster;
391 }
392 next_cluster += 1;
393 finalized[rv] = true;
394 }
395 }
396
397 let (small, big) = if uf_size[ru] < uf_size[rv] {
399 (ru, rv)
400 } else {
401 (rv, ru)
402 };
403 uf_parent[small] = big;
404 uf_size[big] += uf_size[small];
405 let small_members = std::mem::take(&mut uf_members[small]);
406 uf_members[big].extend(small_members);
407
408 if finalized[small] || finalized[big] {
409 finalized[big] = true;
410 }
411 }
412
413 if next_cluster == 0 && n >= min_cluster_size {
416 cluster_assignments.fill(0);
417 next_cluster = 1;
418 }
419
420 let mut outlier_scores = vec![0.0_f64; n];
424 if next_cluster > 0 {
425 let mut cluster_max_lambda = vec![0.0_f64; next_cluster as usize];
427 for i in 0..n {
428 let c = cluster_assignments[i];
429 if c >= 0 {
430 let cu = c as usize;
431 if point_lambda[i] > cluster_max_lambda[cu] {
432 cluster_max_lambda[cu] = point_lambda[i];
433 }
434 }
435 }
436
437 for i in 0..n {
438 let c = cluster_assignments[i];
439 if c < 0 {
440 outlier_scores[i] = 1.0;
441 } else {
442 let max_l = cluster_max_lambda[c as usize];
443 if max_l > 0.0 {
444 outlier_scores[i] = 1.0 - (point_lambda[i] / max_l).min(1.0);
445 }
446 }
447 }
448 } else {
449 outlier_scores.fill(1.0);
450 }
451
452 (cluster_assignments, next_cluster as usize, outlier_scores)
453}
454
455fn find(parent: &[usize], mut x: usize) -> usize {
457 while parent[x] != x {
458 x = parent[x];
459 }
460 x
461}
462
463#[cfg(test)]
468mod tests {
469 use super::*;
470
471 #[test]
472 fn test_hdbscan_two_clusters() {
473 let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
474 let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
475 let data = Dataset::new(
476 vec![f1, f2],
477 vec![0.0; 10],
478 vec!["x".into(), "y".into()],
479 "label",
480 );
481
482 let mut hdb = Hdbscan::new().min_cluster_size(3);
483 hdb.fit(&data).unwrap();
484
485 assert_eq!(hdb.n_clusters(), 2, "should find 2 clusters");
486
487 let labels = hdb.labels();
489 let cluster_a = labels[0];
490 let cluster_b = labels[5];
491 assert!(cluster_a >= 0, "first cluster should not be noise");
492 assert!(cluster_b >= 0, "second cluster should not be noise");
493 assert_ne!(cluster_a, cluster_b, "clusters should be different");
494 }
495
496 #[test]
497 fn test_hdbscan_with_noise() {
498 let mut f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
500 let mut f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
501 f1.push(100.0);
502 f2.push(100.0);
503
504 let data = Dataset::new(
505 vec![f1, f2],
506 vec![0.0; 11],
507 vec!["x".into(), "y".into()],
508 "label",
509 );
510
511 let mut hdb = Hdbscan::new().min_cluster_size(3);
512 hdb.fit(&data).unwrap();
513
514 assert_eq!(hdb.n_clusters(), 2);
515
516 let labels = hdb.labels();
518 assert_eq!(labels[10], -1, "outlier should be noise");
519 }
520
521 #[test]
522 fn test_hdbscan_all_same() {
523 let f1 = vec![1.0; 10];
524 let f2 = vec![1.0; 10];
525 let data = Dataset::new(
526 vec![f1, f2],
527 vec![0.0; 10],
528 vec!["x".into(), "y".into()],
529 "label",
530 );
531
532 let mut hdb = Hdbscan::new().min_cluster_size(3);
533 hdb.fit(&data).unwrap();
534
535 assert_eq!(hdb.n_clusters(), 1);
537 assert_eq!(hdb.n_noise(), 0);
538 }
539
540 #[test]
541 fn test_hdbscan_min_cluster_size_respected() {
542 let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
543 let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
544 let data = Dataset::new(
545 vec![f1, f2],
546 vec![0.0; 10],
547 vec!["x".into(), "y".into()],
548 "label",
549 );
550
551 let mut hdb = Hdbscan::new().min_cluster_size(3);
552 hdb.fit(&data).unwrap();
553
554 let labels = hdb.labels();
556 let mut counts = std::collections::HashMap::new();
557 for &l in labels {
558 if l >= 0 {
559 *counts.entry(l).or_insert(0usize) += 1;
560 }
561 }
562
563 for (&cluster, &count) in &counts {
564 assert!(
565 count >= 3,
566 "cluster {} has {} members, less than min_cluster_size=3",
567 cluster,
568 count
569 );
570 }
571 }
572
573 #[test]
574 fn test_hdbscan_empty_dataset() {
575 let data = Dataset::new(Vec::<Vec<f64>>::new(), Vec::new(), Vec::new(), "label");
576 let mut hdb = Hdbscan::new();
577 assert!(hdb.fit(&data).is_err());
578 }
579
580 #[test]
581 fn test_hdbscan_outlier_scores() {
582 let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
583 let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4, 100.0];
584 let data = Dataset::new(
585 vec![f1, f2],
586 vec![0.0; 11],
587 vec!["x".into(), "y".into()],
588 "label",
589 );
590
591 let mut hdb = Hdbscan::new().min_cluster_size(3);
592 hdb.fit(&data).unwrap();
593
594 let scores = hdb.outlier_scores();
595 assert_eq!(scores.len(), 11);
596
597 assert!(
599 (scores[10] - 1.0).abs() < 1e-6,
600 "noise point outlier score should be 1.0, got {}",
601 scores[10]
602 );
603
604 for &s in &scores[..5] {
606 assert!(
607 s < 1.0,
608 "cluster point should have outlier score < 1.0, got {s}"
609 );
610 }
611 }
612
613 #[test]
614 fn test_hdbscan_cosine_metric() {
615 use crate::neighbors::DistanceMetric;
616
617 let n_per = 5;
621 let mut f1 = Vec::new();
622 let mut f2 = Vec::new();
623 for i in 0..n_per {
624 let angle = 0.05 * i as f64;
626 f1.push(angle.cos());
627 f2.push(angle.sin());
628 }
629 for i in 0..n_per {
630 let angle = std::f64::consts::FRAC_PI_2 + 0.05 * i as f64;
632 f1.push(angle.cos());
633 f2.push(angle.sin());
634 }
635
636 let data = Dataset::new(
637 vec![f1, f2],
638 vec![0.0; n_per * 2],
639 vec!["x".into(), "y".into()],
640 "label",
641 );
642
643 let mut hdb = Hdbscan::new()
644 .min_cluster_size(3)
645 .metric(DistanceMetric::Cosine);
646 hdb.fit(&data).unwrap();
647
648 assert_eq!(
649 hdb.n_clusters(),
650 2,
651 "should find 2 clusters with cosine metric"
652 );
653 let labels = hdb.labels();
654 assert_ne!(
655 labels[0], labels[n_per],
656 "groups should have different labels"
657 );
658 }
659
660 #[test]
661 fn test_hdbscan_fit_predict() {
662 let f1 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
663 let f2 = vec![0.0, 0.1, 0.2, 0.3, 0.4, 10.0, 10.1, 10.2, 10.3, 10.4];
664 let data = Dataset::new(
665 vec![f1, f2],
666 vec![0.0; 10],
667 vec!["x".into(), "y".into()],
668 "label",
669 );
670
671 let mut hdb = Hdbscan::new().min_cluster_size(3);
672 let labels = hdb.fit_predict(&data).unwrap();
673 assert_eq!(labels.len(), 10);
674 assert!(labels.iter().any(|&l| l >= 0), "should have some clusters");
675 }
676}