1use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
15use scirs2_core::numeric::{Float, FromPrimitive};
16use std::collections::BinaryHeap;
17use std::fmt::Debug;
18
19use crate::error::{ClusteringError, Result};
20
21fn dist_sq<F: Float>(a: ArrayView1<F>, b: ArrayView1<F>) -> F {
27 let mut s = F::zero();
28 for i in 0..a.len().min(b.len()) {
29 let d = a[i] - b[i];
30 s = s + d * d;
31 }
32 s
33}
34
35fn pairwise_distances<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> Array2<F> {
37 let n = data.shape()[0];
38 let mut dists = Array2::<F>::zeros((n, n));
39 for i in 0..n {
40 for j in (i + 1)..n {
41 let d = dist_sq(data.row(i), data.row(j)).sqrt();
42 dists[[i, j]] = d;
43 dists[[j, i]] = d;
44 }
45 }
46 dists
47}
48
49fn knn_distances<F: Float + FromPrimitive + Debug>(
51 dist_mat: &Array2<F>,
52 k: usize,
53) -> Vec<Vec<(usize, F)>> {
54 let n = dist_mat.shape()[0];
55 let mut result = Vec::with_capacity(n);
56 for i in 0..n {
57 let mut dists: Vec<(usize, F)> = (0..n)
58 .filter(|&j| j != i)
59 .map(|j| (j, dist_mat[[i, j]]))
60 .collect();
61 dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
62 dists.truncate(k);
63 result.push(dists);
64 }
65 result
66}
67
68#[derive(Debug, Clone)]
74pub struct HdbscanStarConfig {
75 pub min_cluster_size: usize,
77 pub min_samples: usize,
79 pub compute_probabilities: bool,
81}
82
83impl Default for HdbscanStarConfig {
84 fn default() -> Self {
85 Self {
86 min_cluster_size: 5,
87 min_samples: 5,
88 compute_probabilities: true,
89 }
90 }
91}
92
93#[derive(Debug, Clone)]
95pub struct HdbscanStarResult<F: Float> {
96 pub labels: Array1<i32>,
98 pub probabilities: Option<Array1<F>>,
100 pub cluster_stabilities: Vec<F>,
102 pub n_clusters: usize,
104 pub outlier_scores: Array1<F>,
106}
107
108#[derive(Debug, Clone)]
110struct MstEdge<F: Float> {
111 i: usize,
112 j: usize,
113 weight: F,
114}
115
116pub fn hdbscan_star<F: Float + FromPrimitive + Debug>(
124 data: ArrayView2<F>,
125 config: &HdbscanStarConfig,
126) -> Result<HdbscanStarResult<F>> {
127 let n = data.shape()[0];
128
129 if n == 0 {
130 return Err(ClusteringError::InvalidInput("Empty input data".into()));
131 }
132 if config.min_cluster_size < 2 {
133 return Err(ClusteringError::InvalidInput(
134 "min_cluster_size must be >= 2".into(),
135 ));
136 }
137 if config.min_samples < 1 {
138 return Err(ClusteringError::InvalidInput(
139 "min_samples must be >= 1".into(),
140 ));
141 }
142
143 let dist_mat = pairwise_distances(data);
144 let mpts = config.min_samples;
145
146 let mut core_dists = Array1::<F>::zeros(n);
148 for i in 0..n {
149 let mut dists: Vec<F> = (0..n)
150 .filter(|&j| j != i)
151 .map(|j| dist_mat[[i, j]])
152 .collect();
153 dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
154 let k = (mpts - 1).min(dists.len().saturating_sub(1));
155 core_dists[i] = if k < dists.len() {
156 dists[k]
157 } else {
158 F::infinity()
159 };
160 }
161
162 let mut mrd = Array2::<F>::zeros((n, n));
165 for i in 0..n {
166 for j in (i + 1)..n {
167 let d = dist_mat[[i, j]];
168 let mr = d.max(core_dists[i]).max(core_dists[j]);
169 mrd[[i, j]] = mr;
170 mrd[[j, i]] = mr;
171 }
172 }
173
174 let mst = prims_mst(&mrd, n);
176
177 let mut sorted_edges = mst;
179 sorted_edges.sort_by(|a, b| {
180 a.weight
181 .partial_cmp(&b.weight)
182 .unwrap_or(std::cmp::Ordering::Equal)
183 });
184
185 let min_cs = config.min_cluster_size;
187 let (labels, stabilities) = extract_stable_clusters(&sorted_edges, n, min_cs);
188
189 let n_clusters = if stabilities.is_empty() {
190 0
191 } else {
192 labels
193 .iter()
194 .filter(|&&l| l >= 0)
195 .map(|&l| l)
196 .max()
197 .map(|m| m as usize + 1)
198 .unwrap_or(0)
199 };
200
201 let mut outlier_scores = Array1::<F>::zeros(n);
203 for i in 0..n {
204 if labels[i] < 0 {
205 outlier_scores[i] = F::one();
206 } else {
207 let ci = labels[i] as usize;
209 let mut min_core_dist = F::infinity();
210 for j in 0..n {
211 if j != i && labels[j] == labels[i] {
212 let d = mrd[[i, j]];
213 if d < min_core_dist {
214 min_core_dist = d;
215 }
216 }
217 }
218 if min_core_dist < F::infinity() {
219 let max_core = core_dists
220 .iter()
221 .copied()
222 .filter(|&d| d < F::infinity())
223 .fold(F::zero(), |a, b| a.max(b));
224 if max_core > F::epsilon() {
225 outlier_scores[i] = min_core_dist / max_core;
226 }
227 }
228 }
229 }
230
231 let probabilities = if config.compute_probabilities {
233 let mut probs = Array1::<F>::zeros(n);
234 for i in 0..n {
235 probs[i] = F::one() - outlier_scores[i].min(F::one());
236 }
237 Some(probs)
238 } else {
239 None
240 };
241
242 Ok(HdbscanStarResult {
243 labels,
244 probabilities,
245 cluster_stabilities: stabilities,
246 n_clusters,
247 outlier_scores,
248 })
249}
250
251fn prims_mst<F: Float>(dist_mat: &Array2<F>, n: usize) -> Vec<MstEdge<F>> {
253 if n <= 1 {
254 return Vec::new();
255 }
256
257 let mut in_tree = vec![false; n];
258 let mut key = vec![F::infinity(); n]; let mut parent = vec![0usize; n];
260 let mut edges = Vec::with_capacity(n - 1);
261
262 key[0] = F::zero();
263
264 for _ in 0..n {
265 let mut u = None;
267 let mut min_key = F::infinity();
268 for v in 0..n {
269 if !in_tree[v] && key[v] < min_key {
270 min_key = key[v];
271 u = Some(v);
272 }
273 }
274 let u = match u {
275 Some(v) => v,
276 None => break,
277 };
278 in_tree[u] = true;
279
280 if key[u] > F::zero() {
281 edges.push(MstEdge {
282 i: parent[u],
283 j: u,
284 weight: key[u],
285 });
286 }
287
288 for v in 0..n {
290 if !in_tree[v] && dist_mat[[u, v]] < key[v] {
291 key[v] = dist_mat[[u, v]];
292 parent[v] = u;
293 }
294 }
295 }
296
297 edges
298}
299
300fn extract_stable_clusters<F: Float + FromPrimitive + Debug>(
302 sorted_edges: &[MstEdge<F>],
303 n: usize,
304 min_cluster_size: usize,
305) -> (Array1<i32>, Vec<F>) {
306 let mut parent_uf: Vec<usize> = (0..n).collect();
308 let mut size: Vec<usize> = vec![1; n];
309
310 fn find(parent: &mut Vec<usize>, x: usize) -> usize {
311 let mut root = x;
312 while parent[root] != root {
313 root = parent[root];
314 }
315 let mut cur = x;
317 while cur != root {
318 let next = parent[cur];
319 parent[cur] = root;
320 cur = next;
321 }
322 root
323 }
324
325 fn union(parent: &mut Vec<usize>, size: &mut Vec<usize>, a: usize, b: usize) -> usize {
326 let ra = find(parent, a);
327 let rb = find(parent, b);
328 if ra == rb {
329 return ra;
330 }
331 if size[ra] < size[rb] {
332 parent[ra] = rb;
333 size[rb] += size[ra];
334 rb
335 } else {
336 parent[rb] = ra;
337 size[ra] += size[rb];
338 ra
339 }
340 }
341
342 let mut component_birth: Vec<Option<F>> = vec![None; n]; for edge in sorted_edges {
346 let ri = find(&mut parent_uf, edge.i);
347 let rj = find(&mut parent_uf, edge.j);
348 if ri == rj {
349 continue;
350 }
351
352 let lambda = if edge.weight > F::epsilon() {
353 F::one() / edge.weight
354 } else {
355 F::infinity()
356 };
357
358 let new_root = union(&mut parent_uf, &mut size, ri, rj);
360 if size[new_root] >= min_cluster_size && component_birth[new_root].is_none() {
361 component_birth[new_root] = Some(lambda);
362 }
363 }
364
365 let mut final_parent: Vec<usize> = (0..n).collect();
368 let mut final_size: Vec<usize> = vec![1; n];
369
370 for edge in sorted_edges {
371 let _root = union(&mut final_parent, &mut final_size, edge.i, edge.j);
372 }
373
374 let mut cluster_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
376 let mut next_label = 0i32;
377 let mut labels = Array1::from_elem(n, -1i32);
378
379 for i in 0..n {
380 let root = find(&mut final_parent, i);
381 if final_size[root] >= min_cluster_size {
382 let label = cluster_map.entry(root).or_insert_with(|| {
383 let l = next_label;
384 next_label += 1;
385 l
386 });
387 labels[i] = *label;
388 }
389 }
390
391 let mut stabilities = Vec::new();
393 for (&root, &label) in &cluster_map {
394 let birth = component_birth[root].unwrap_or_else(|| F::zero());
395 let sz = F::from(final_size[root]).unwrap_or_else(|| F::one());
396 stabilities.push(birth * sz);
397 }
398
399 (labels, stabilities)
400}
401
402#[derive(Debug, Clone)]
408pub struct AutoEpsilonConfig {
409 pub k: usize,
411 pub min_samples: usize,
413 pub sensitivity: f64,
415}
416
417impl Default for AutoEpsilonConfig {
418 fn default() -> Self {
419 Self {
420 k: 5,
421 min_samples: 5,
422 sensitivity: 1.0,
423 }
424 }
425}
426
427#[derive(Debug, Clone)]
429pub struct AutoEpsilonResult<F: Float> {
430 pub labels: Array1<i32>,
432 pub epsilon: F,
434 pub k_distances: Vec<F>,
436 pub elbow_index: usize,
438 pub n_clusters: usize,
440}
441
442pub fn auto_epsilon_dbscan<F: Float + FromPrimitive + Debug>(
447 data: ArrayView2<F>,
448 config: &AutoEpsilonConfig,
449) -> Result<AutoEpsilonResult<F>> {
450 let n = data.shape()[0];
451 if n == 0 {
452 return Err(ClusteringError::InvalidInput("Empty input data".into()));
453 }
454 if config.k == 0 {
455 return Err(ClusteringError::InvalidInput("k must be > 0".into()));
456 }
457
458 let dist_mat = pairwise_distances(data);
459 let k = config.k.min(n - 1);
460
461 let mut k_dists: Vec<F> = Vec::with_capacity(n);
463 for i in 0..n {
464 let mut row_dists: Vec<F> = (0..n)
465 .filter(|&j| j != i)
466 .map(|j| dist_mat[[i, j]])
467 .collect();
468 row_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
469 let kd = if k <= row_dists.len() {
470 row_dists[k - 1]
471 } else {
472 *row_dists.last().unwrap_or(&F::zero())
473 };
474 k_dists.push(kd);
475 }
476
477 k_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
479
480 let elbow_idx = find_elbow(&k_dists, config.sensitivity);
482 let epsilon = k_dists[elbow_idx];
483
484 let labels = run_dbscan(data, &dist_mat, epsilon, config.min_samples);
486
487 let n_clusters = labels
488 .iter()
489 .filter(|&&l| l >= 0)
490 .map(|&l| l)
491 .max()
492 .map(|m| m as usize + 1)
493 .unwrap_or(0);
494
495 Ok(AutoEpsilonResult {
496 labels,
497 epsilon,
498 k_distances: k_dists,
499 elbow_index: elbow_idx,
500 n_clusters,
501 })
502}
503
504fn find_elbow<F: Float + FromPrimitive + Debug>(values: &[F], sensitivity: f64) -> usize {
506 let n = values.len();
507 if n < 3 {
508 return n / 2;
509 }
510
511 let x0 = 0.0f64;
513 let y0 = values[0].to_f64().unwrap_or(0.0);
514 let x1 = (n - 1) as f64;
515 let y1 = values[n - 1].to_f64().unwrap_or(0.0);
516
517 let dx = x1 - x0;
518 let dy = y1 - y0;
519 let line_len = (dx * dx + dy * dy).sqrt().max(1e-15);
520
521 let mut max_dist = 0.0f64;
522 let mut elbow_idx = n / 2;
523
524 for i in 1..(n - 1) {
525 let xi = i as f64;
526 let yi = values[i].to_f64().unwrap_or(0.0);
527 let dist = ((dy * xi - dx * yi + x1 * y0 - y1 * x0).abs()) / line_len;
529 let adjusted = dist * sensitivity;
530 if adjusted > max_dist {
531 max_dist = adjusted;
532 elbow_idx = i;
533 }
534 }
535
536 elbow_idx
537}
538
539fn run_dbscan<F: Float + FromPrimitive + Debug>(
541 data: ArrayView2<F>,
542 dist_mat: &Array2<F>,
543 eps: F,
544 min_samples: usize,
545) -> Array1<i32> {
546 let n = data.shape()[0];
547 let mut labels = vec![-2i32; n]; let mut cluster_id = 0i32;
549
550 for i in 0..n {
551 if labels[i] != -2 {
552 continue;
553 }
554 let neighbors: Vec<usize> = (0..n).filter(|&j| dist_mat[[i, j]] <= eps).collect();
555
556 if neighbors.len() < min_samples {
557 labels[i] = -1; continue;
559 }
560
561 labels[i] = cluster_id;
562 let mut queue = neighbors;
563 let mut head = 0;
564 while head < queue.len() {
565 let cur = queue[head];
566 head += 1;
567 if labels[cur] == -1 {
568 labels[cur] = cluster_id;
569 continue;
570 }
571 if labels[cur] != -2 {
572 continue;
573 }
574 labels[cur] = cluster_id;
575
576 let cur_nb: Vec<usize> = (0..n).filter(|&j| dist_mat[[cur, j]] <= eps).collect();
577 if cur_nb.len() >= min_samples {
578 for nb in cur_nb {
579 if labels[nb] == -2 || labels[nb] == -1 {
580 queue.push(nb);
581 }
582 }
583 }
584 }
585 cluster_id += 1;
586 }
587
588 Array1::from_vec(labels)
589}
590
591#[derive(Debug, Clone)]
597pub struct SnnConfig {
598 pub k: usize,
600 pub snn_threshold: usize,
602 pub min_snn_neighbors: usize,
604}
605
606impl Default for SnnConfig {
607 fn default() -> Self {
608 Self {
609 k: 10,
610 snn_threshold: 3,
611 min_snn_neighbors: 3,
612 }
613 }
614}
615
616#[derive(Debug, Clone)]
618pub struct SnnResult<F: Float> {
619 pub labels: Array1<i32>,
621 pub snn_similarity: Array2<F>,
623 pub n_clusters: usize,
625}
626
627pub fn snn_clustering<F: Float + FromPrimitive + Debug>(
633 data: ArrayView2<F>,
634 config: &SnnConfig,
635) -> Result<SnnResult<F>> {
636 let n = data.shape()[0];
637 if n == 0 {
638 return Err(ClusteringError::InvalidInput("Empty input data".into()));
639 }
640 if config.k == 0 {
641 return Err(ClusteringError::InvalidInput("k must be > 0".into()));
642 }
643
644 let dist_mat = pairwise_distances(data);
645 let k = config.k.min(n - 1);
646 let knn = knn_distances(&dist_mat, k);
647
648 let mut snn_sim = Array2::<F>::zeros((n, n));
650 for i in 0..n {
651 let knn_i: std::collections::HashSet<usize> = knn[i].iter().map(|&(j, _)| j).collect();
652 for j in (i + 1)..n {
653 let knn_j: std::collections::HashSet<usize> =
654 knn[j].iter().map(|&(jj, _)| jj).collect();
655 let shared = knn_i.intersection(&knn_j).count();
656 let sim = F::from(shared).unwrap_or_else(|| F::zero());
657 snn_sim[[i, j]] = sim;
658 snn_sim[[j, i]] = sim;
659 }
660 }
661
662 let threshold = F::from(config.snn_threshold).unwrap_or_else(|| F::one());
664 let min_nb = config.min_snn_neighbors;
665
666 let mut labels = vec![-2i32; n];
667 let mut cluster_id = 0i32;
668
669 for i in 0..n {
670 if labels[i] != -2 {
671 continue;
672 }
673 let neighbors: Vec<usize> = (0..n)
674 .filter(|&j| j != i && snn_sim[[i, j]] >= threshold)
675 .collect();
676
677 if neighbors.len() < min_nb {
678 labels[i] = -1;
679 continue;
680 }
681
682 labels[i] = cluster_id;
683 let mut queue = neighbors;
684 let mut head = 0;
685 while head < queue.len() {
686 let cur = queue[head];
687 head += 1;
688 if labels[cur] == -1 {
689 labels[cur] = cluster_id;
690 continue;
691 }
692 if labels[cur] != -2 {
693 continue;
694 }
695 labels[cur] = cluster_id;
696
697 let cur_nb: Vec<usize> = (0..n)
698 .filter(|&j| j != cur && snn_sim[[cur, j]] >= threshold)
699 .collect();
700 if cur_nb.len() >= min_nb {
701 for nb in cur_nb {
702 if labels[nb] == -2 || labels[nb] == -1 {
703 queue.push(nb);
704 }
705 }
706 }
707 }
708 cluster_id += 1;
709 }
710
711 let n_clusters = labels
712 .iter()
713 .filter(|&&l| l >= 0)
714 .max()
715 .map(|&m| m as usize + 1)
716 .unwrap_or(0);
717
718 Ok(SnnResult {
719 labels: Array1::from_vec(labels),
720 snn_similarity: snn_sim,
721 n_clusters,
722 })
723}
724
725#[derive(Debug, Clone, Copy, PartialEq, Eq)]
731pub enum KdeKernel {
732 Gaussian,
734 Epanechnikov,
736 Uniform,
738}
739
740#[derive(Debug, Clone)]
742pub struct KdcConfig {
743 pub kernel: KdeKernel,
745 pub bandwidth: f64,
747 pub density_threshold: f64,
749 pub merge_distance: f64,
751}
752
753impl Default for KdcConfig {
754 fn default() -> Self {
755 Self {
756 kernel: KdeKernel::Gaussian,
757 bandwidth: 0.0,
758 density_threshold: 0.1,
759 merge_distance: 0.0,
760 }
761 }
762}
763
764#[derive(Debug, Clone)]
766pub struct KdcResult<F: Float> {
767 pub labels: Array1<i32>,
769 pub densities: Array1<F>,
771 pub bandwidth: F,
773 pub n_clusters: usize,
775}
776
777pub fn kernel_density_clustering<F: Float + FromPrimitive + Debug>(
783 data: ArrayView2<F>,
784 config: &KdcConfig,
785) -> Result<KdcResult<F>> {
786 let (n, d) = (data.shape()[0], data.shape()[1]);
787 if n == 0 {
788 return Err(ClusteringError::InvalidInput("Empty input data".into()));
789 }
790
791 let h = if config.bandwidth <= 0.0 {
793 silverman_bandwidth(data)
794 } else {
795 F::from(config.bandwidth).unwrap_or_else(|| F::one())
796 };
797
798 if h <= F::epsilon() {
799 return Err(ClusteringError::ComputationError(
800 "Bandwidth too small".into(),
801 ));
802 }
803
804 let mut densities = Array1::<F>::zeros(n);
806 let n_f = F::from(n).unwrap_or_else(|| F::one());
807 let h_d = h.powi(d as i32);
808 let norm_factor = n_f * h_d;
809
810 for i in 0..n {
811 let mut dens = F::zero();
812 for j in 0..n {
813 let u_sq = dist_sq(data.row(i), data.row(j)) / (h * h);
814 let kval = match config.kernel {
815 KdeKernel::Gaussian => {
816 let neg_half = F::from(-0.5).unwrap_or_else(|| F::zero());
817 (neg_half * u_sq).exp()
818 }
819 KdeKernel::Epanechnikov => {
820 if u_sq < F::one() {
821 F::one() - u_sq
822 } else {
823 F::zero()
824 }
825 }
826 KdeKernel::Uniform => {
827 if u_sq < F::one() {
828 F::one()
829 } else {
830 F::zero()
831 }
832 }
833 };
834 dens = dens + kval;
835 }
836 densities[i] = dens / norm_factor;
837 }
838
839 let max_density = densities.iter().copied().fold(F::zero(), |a, b| a.max(b));
841 let threshold = F::from(config.density_threshold).unwrap_or_else(|| F::zero()) * max_density;
842
843 let mut peak_assignments = vec![-1i32; n];
845 let mut peaks: Vec<usize> = Vec::new();
846
847 let mut local_max = vec![0usize; n];
849 for i in 0..n {
850 let mut current = i;
851 for _ in 0..100 {
852 let mut best = current;
854 let mut best_dens = densities[current];
855 for j in 0..n {
856 if dist_sq(data.row(current), data.row(j)).sqrt()
857 <= h * F::from(2.0).unwrap_or_else(|| F::one())
858 {
859 if densities[j] > best_dens {
860 best_dens = densities[j];
861 best = j;
862 }
863 }
864 }
865 if best == current {
866 break;
867 }
868 current = best;
869 }
870 local_max[i] = current;
871 }
872
873 let merge_dist = if config.merge_distance > 0.0 {
875 F::from(config.merge_distance).unwrap_or_else(|| h)
876 } else {
877 h
878 };
879
880 let mut peak_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
882 let mut next_label = 0i32;
883
884 for i in 0..n {
885 if densities[i] < threshold {
886 peak_assignments[i] = -1;
887 continue;
888 }
889 let peak = local_max[i];
890
891 let mut merged_label = None;
893 for (&existing_peak, &label) in &peak_map {
894 if dist_sq(data.row(peak), data.row(existing_peak)).sqrt() <= merge_dist {
895 merged_label = Some(label);
896 break;
897 }
898 }
899
900 let label = match merged_label {
901 Some(l) => l,
902 None => {
903 let l = next_label;
904 peak_map.insert(peak, l);
905 peaks.push(peak);
906 next_label += 1;
907 l
908 }
909 };
910 peak_assignments[i] = label;
911 peak_map.entry(peak).or_insert(label);
912 }
913
914 let n_clusters = next_label as usize;
915
916 Ok(KdcResult {
917 labels: Array1::from_vec(peak_assignments),
918 densities,
919 bandwidth: h,
920 n_clusters,
921 })
922}
923
924fn silverman_bandwidth<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> F {
926 let (n, d) = (data.shape()[0], data.shape()[1]);
927 if n < 2 {
928 return F::one();
929 }
930
931 let mut total_std = 0.0f64;
933 for dim in 0..d {
934 let mean = (0..n)
935 .map(|i| data[[i, dim]].to_f64().unwrap_or(0.0))
936 .sum::<f64>()
937 / n as f64;
938 let var = (0..n)
939 .map(|i| {
940 let diff = data[[i, dim]].to_f64().unwrap_or(0.0) - mean;
941 diff * diff
942 })
943 .sum::<f64>()
944 / (n - 1) as f64;
945 total_std += var.sqrt();
946 }
947 let avg_std = total_std / d as f64;
948
949 let h = 1.06 * avg_std * (n as f64).powf(-0.2);
952 F::from(h.max(1e-10)).unwrap_or_else(|| F::one())
953}
954
955#[derive(Debug, Clone)]
961pub struct LofConfig {
962 pub k: usize,
964 pub outlier_threshold: f64,
966}
967
968impl Default for LofConfig {
969 fn default() -> Self {
970 Self {
971 k: 5,
972 outlier_threshold: 1.5,
973 }
974 }
975}
976
977#[derive(Debug, Clone)]
979pub struct LofResult<F: Float> {
980 pub lof_scores: Array1<F>,
982 pub is_outlier: Vec<bool>,
984 pub n_outliers: usize,
986 pub k_distances: Array1<F>,
988 pub lrd: Array1<F>,
990}
991
992pub fn local_outlier_factor<F: Float + FromPrimitive + Debug>(
998 data: ArrayView2<F>,
999 config: &LofConfig,
1000) -> Result<LofResult<F>> {
1001 let n = data.shape()[0];
1002 if n == 0 {
1003 return Err(ClusteringError::InvalidInput("Empty input data".into()));
1004 }
1005 if config.k == 0 || config.k >= n {
1006 return Err(ClusteringError::InvalidInput(
1007 "k must be in [1, n_samples)".into(),
1008 ));
1009 }
1010
1011 let dist_mat = pairwise_distances(data);
1012 let k = config.k;
1013 let knn = knn_distances(&dist_mat, k);
1014
1015 let mut k_dist = Array1::<F>::zeros(n);
1017 for i in 0..n {
1018 k_dist[i] = if knn[i].len() >= k {
1019 knn[i][k - 1].1
1020 } else if let Some(last) = knn[i].last() {
1021 last.1
1022 } else {
1023 F::zero()
1024 };
1025 }
1026
1027 let mut lrd = Array1::<F>::zeros(n);
1030 for i in 0..n {
1031 let mut sum_reach = F::zero();
1032 let nb_count = knn[i].len();
1033 for &(j, d_ij) in &knn[i] {
1034 let reach = d_ij.max(k_dist[j]);
1035 sum_reach = sum_reach + reach;
1036 }
1037 if nb_count > 0 && sum_reach > F::epsilon() {
1038 lrd[i] = F::from(nb_count).unwrap_or_else(|| F::one()) / sum_reach;
1039 } else {
1040 lrd[i] = F::one(); }
1042 }
1043
1044 let mut lof_scores = Array1::<F>::zeros(n);
1046 for i in 0..n {
1047 let nb_count = knn[i].len();
1048 if nb_count == 0 || lrd[i] <= F::epsilon() {
1049 lof_scores[i] = F::one();
1050 continue;
1051 }
1052 let mut sum = F::zero();
1053 for &(j, _) in &knn[i] {
1054 sum = sum + lrd[j] / lrd[i];
1055 }
1056 lof_scores[i] = sum / F::from(nb_count).unwrap_or_else(|| F::one());
1057 }
1058
1059 let threshold = F::from(config.outlier_threshold)
1060 .unwrap_or_else(|| F::from(1.5).unwrap_or_else(|| F::one()));
1061 let is_outlier: Vec<bool> = lof_scores.iter().map(|&s| s > threshold).collect();
1062 let n_outliers = is_outlier.iter().filter(|&&o| o).count();
1063
1064 Ok(LofResult {
1065 lof_scores,
1066 is_outlier,
1067 n_outliers,
1068 k_distances: k_dist,
1069 lrd,
1070 })
1071}
1072
1073#[cfg(test)]
1078mod tests {
1079 use super::*;
1080 use scirs2_core::ndarray::Array2;
1081
1082 fn make_clustered_data() -> Array2<f64> {
1083 let mut data = Vec::new();
1084 for i in 0..15 {
1086 let n = (i as f64 * 0.073).sin() * 0.2;
1087 data.push(1.0 + n);
1088 data.push(1.0 + n * 0.5);
1089 }
1090 for i in 0..15 {
1092 let n = (i as f64 * 0.131).sin() * 0.2;
1093 data.push(5.0 + n);
1094 data.push(5.0 + n * 0.5);
1095 }
1096 data.push(10.0);
1098 data.push(10.0);
1099 Array2::from_shape_vec((31, 2), data).expect("shape failed")
1100 }
1101
1102 #[test]
1105 fn test_hdbscan_star_basic() {
1106 let data = make_clustered_data();
1107 let config = HdbscanStarConfig {
1108 min_cluster_size: 3,
1109 min_samples: 3,
1110 compute_probabilities: true,
1111 };
1112 let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
1113 assert_eq!(result.labels.len(), 31);
1114 assert!(result.n_clusters >= 1);
1115 assert!(result.probabilities.is_some());
1116 assert_eq!(result.outlier_scores.len(), 31);
1117 }
1118
1119 #[test]
1120 fn test_hdbscan_star_empty() {
1121 let data = Array2::<f64>::zeros((0, 3));
1122 let config = HdbscanStarConfig::default();
1123 assert!(hdbscan_star(data.view(), &config).is_err());
1124 }
1125
1126 #[test]
1127 fn test_hdbscan_star_invalid_params() {
1128 let data = make_clustered_data();
1129 let config = HdbscanStarConfig {
1130 min_cluster_size: 1,
1131 ..Default::default()
1132 };
1133 assert!(hdbscan_star(data.view(), &config).is_err());
1134 }
1135
1136 #[test]
1137 fn test_hdbscan_star_small_data() {
1138 let data = Array2::from_shape_vec(
1139 (5, 2),
1140 vec![0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 5.0, 5.0, 5.1, 5.1],
1141 )
1142 .expect("shape");
1143 let config = HdbscanStarConfig {
1144 min_cluster_size: 2,
1145 min_samples: 2,
1146 compute_probabilities: false,
1147 };
1148 let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
1149 assert_eq!(result.labels.len(), 5);
1150 assert!(result.probabilities.is_none());
1151 }
1152
1153 #[test]
1156 fn test_auto_epsilon_basic() {
1157 let data = make_clustered_data();
1158 let config = AutoEpsilonConfig {
1159 k: 3,
1160 min_samples: 3,
1161 sensitivity: 1.0,
1162 };
1163 let result = auto_epsilon_dbscan(data.view(), &config).expect("auto-eps failed");
1164 assert_eq!(result.labels.len(), 31);
1165 assert!(result.epsilon > 0.0);
1166 assert!(!result.k_distances.is_empty());
1167 }
1168
1169 #[test]
1170 fn test_auto_epsilon_empty() {
1171 let data = Array2::<f64>::zeros((0, 2));
1172 let config = AutoEpsilonConfig::default();
1173 assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
1174 }
1175
1176 #[test]
1177 fn test_auto_epsilon_invalid_k() {
1178 let data = make_clustered_data();
1179 let config = AutoEpsilonConfig {
1180 k: 0,
1181 ..Default::default()
1182 };
1183 assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
1184 }
1185
1186 #[test]
1187 fn test_find_elbow() {
1188 let values = vec![0.1, 0.2, 0.3, 0.4, 0.8, 1.5, 3.0, 5.0, 8.0, 12.0];
1189 let idx = find_elbow(&values, 1.0);
1190 assert!(idx > 0 && idx < values.len() - 1);
1192 }
1193
1194 #[test]
1197 fn test_snn_basic() {
1198 let data = make_clustered_data();
1199 let config = SnnConfig {
1200 k: 5,
1201 snn_threshold: 2,
1202 min_snn_neighbors: 2,
1203 };
1204 let result = snn_clustering(data.view(), &config).expect("snn failed");
1205 assert_eq!(result.labels.len(), 31);
1206 let n = result.snn_similarity.shape()[0];
1208 for i in 0..n {
1209 for j in 0..n {
1210 let diff = (result.snn_similarity[[i, j]] - result.snn_similarity[[j, i]]).abs();
1211 assert!(diff < 1e-10);
1212 }
1213 }
1214 }
1215
1216 #[test]
1217 fn test_snn_empty() {
1218 let data = Array2::<f64>::zeros((0, 2));
1219 let config = SnnConfig::default();
1220 assert!(snn_clustering(data.view(), &config).is_err());
1221 }
1222
1223 #[test]
1226 fn test_kde_clustering_basic() {
1227 let data = make_clustered_data();
1228 let config = KdcConfig {
1229 kernel: KdeKernel::Gaussian,
1230 bandwidth: 1.0,
1231 density_threshold: 0.05,
1232 merge_distance: 1.0,
1233 };
1234 let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1235 assert_eq!(result.labels.len(), 31);
1236 assert_eq!(result.densities.len(), 31);
1237 assert!(result.bandwidth > 0.0);
1238 }
1239
1240 #[test]
1241 fn test_kde_clustering_auto_bandwidth() {
1242 let data = make_clustered_data();
1243 let config = KdcConfig {
1244 bandwidth: 0.0, ..Default::default()
1246 };
1247 let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1248 assert!(result.bandwidth > 0.0);
1249 }
1250
1251 #[test]
1252 fn test_kde_clustering_epanechnikov() {
1253 let data = make_clustered_data();
1254 let config = KdcConfig {
1255 kernel: KdeKernel::Epanechnikov,
1256 bandwidth: 2.0,
1257 density_threshold: 0.01,
1258 merge_distance: 1.0,
1259 };
1260 let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1261 assert_eq!(result.labels.len(), 31);
1262 }
1263
1264 #[test]
1265 fn test_kde_clustering_empty() {
1266 let data = Array2::<f64>::zeros((0, 2));
1267 let config = KdcConfig::default();
1268 assert!(kernel_density_clustering(data.view(), &config).is_err());
1269 }
1270
1271 #[test]
1272 fn test_silverman_bandwidth() {
1273 let data = Array2::from_shape_vec(
1274 (10, 2),
1275 vec![
1276 1.0, 2.0, 1.1, 2.1, 1.2, 1.9, 0.9, 2.2, 1.3, 1.8, 5.0, 6.0, 5.1, 6.1, 5.2, 5.9,
1277 4.9, 6.2, 5.3, 5.8,
1278 ],
1279 )
1280 .expect("shape");
1281 let h = silverman_bandwidth(data.view());
1282 assert!(h > 0.0);
1283 }
1284
1285 #[test]
1288 fn test_lof_basic() {
1289 let data = make_clustered_data();
1290 let config = LofConfig {
1291 k: 5,
1292 outlier_threshold: 1.5,
1293 };
1294 let result = local_outlier_factor(data.view(), &config).expect("lof failed");
1295 assert_eq!(result.lof_scores.len(), 31);
1296 assert_eq!(result.is_outlier.len(), 31);
1297 assert_eq!(result.k_distances.len(), 31);
1298 assert_eq!(result.lrd.len(), 31);
1299
1300 let outlier_score = result.lof_scores[30];
1302 assert!(
1303 outlier_score > 1.0,
1304 "Outlier LOF score should be > 1, got {}",
1305 outlier_score
1306 );
1307 }
1308
1309 #[test]
1310 fn test_lof_empty() {
1311 let data = Array2::<f64>::zeros((0, 2));
1312 let config = LofConfig::default();
1313 assert!(local_outlier_factor(data.view(), &config).is_err());
1314 }
1315
1316 #[test]
1317 fn test_lof_invalid_k() {
1318 let data = make_clustered_data();
1319 let config = LofConfig {
1320 k: 0,
1321 ..Default::default()
1322 };
1323 assert!(local_outlier_factor(data.view(), &config).is_err());
1324
1325 let config2 = LofConfig {
1326 k: 100, ..Default::default()
1328 };
1329 assert!(local_outlier_factor(data.view(), &config2).is_err());
1330 }
1331
1332 #[test]
1333 fn test_lof_uniform_data() {
1334 let data = Array2::from_shape_vec(
1336 (10, 2),
1337 vec![
1338 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1339 1.0, 1.0, 1.0, 1.0,
1340 ],
1341 )
1342 .expect("shape");
1343 let config = LofConfig {
1344 k: 3,
1345 outlier_threshold: 1.5,
1346 };
1347 let result = local_outlier_factor(data.view(), &config).expect("lof failed");
1348 for &score in result.lof_scores.iter() {
1350 assert!(
1351 (score - 1.0).abs() < 0.5,
1352 "LOF for identical points should be ~1, got {}",
1353 score
1354 );
1355 }
1356 }
1357
1358 #[test]
1361 fn test_prims_mst() {
1362 let mut dist = Array2::<f64>::zeros((4, 4));
1363 dist[[0, 1]] = 1.0;
1364 dist[[1, 0]] = 1.0;
1365 dist[[0, 2]] = 4.0;
1366 dist[[2, 0]] = 4.0;
1367 dist[[0, 3]] = 3.0;
1368 dist[[3, 0]] = 3.0;
1369 dist[[1, 2]] = 2.0;
1370 dist[[2, 1]] = 2.0;
1371 dist[[1, 3]] = 5.0;
1372 dist[[3, 1]] = 5.0;
1373 dist[[2, 3]] = 1.0;
1374 dist[[3, 2]] = 1.0;
1375
1376 let mst = prims_mst(&dist, 4);
1377 assert_eq!(mst.len(), 3);
1378 let total_weight: f64 = mst.iter().map(|e| e.weight).sum();
1379 assert!((total_weight - 4.0).abs() < 1e-10); }
1381
1382 #[test]
1385 fn test_dist_sq() {
1386 let a = Array1::from_vec(vec![1.0, 2.0]);
1387 let b = Array1::from_vec(vec![4.0, 6.0]);
1388 let d = dist_sq(a.view(), b.view());
1389 assert!((d - 25.0).abs() < 1e-10);
1390 }
1391
1392 #[test]
1393 fn test_knn_distances() {
1394 let dist = Array2::from_shape_vec(
1395 (4, 4),
1396 vec![
1397 0.0, 1.0, 3.0, 5.0, 1.0, 0.0, 2.0, 4.0, 3.0, 2.0, 0.0, 1.0, 5.0, 4.0, 1.0, 0.0,
1398 ],
1399 )
1400 .expect("shape");
1401 let knn = knn_distances(&dist, 2);
1402 assert_eq!(knn.len(), 4);
1403 assert_eq!(knn[0][0].0, 1);
1405 assert!((knn[0][0].1 - 1.0).abs() < 1e-10);
1406 }
1407}