Skip to main content

scirs2_cluster/
density_enhanced.rs

1//! Density-based clustering enhancements
2//!
3//! This module extends the base density-based clustering algorithms with
4//! advanced techniques for improved cluster detection and outlier analysis.
5//!
6//! # Algorithms
7//!
8//! - **HDBSCAN\***: Hierarchical DBSCAN with cluster stability extraction
9//! - **Auto-epsilon DBSCAN**: Automatic epsilon selection via k-distance elbow
10//! - **SNN density**: Shared nearest neighbor density clustering
11//! - **Kernel density clustering**: Density-based clustering using KDE
12//! - **LOF**: Local Outlier Factor for outlier/anomaly scoring
13
14use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
15use scirs2_core::numeric::{Float, FromPrimitive};
16use std::collections::BinaryHeap;
17use std::fmt::Debug;
18
19use crate::error::{ClusteringError, Result};
20
21// ---------------------------------------------------------------------------
22// Distance helpers
23// ---------------------------------------------------------------------------
24
25/// Squared Euclidean distance between two rows.
26fn dist_sq<F: Float>(a: ArrayView1<F>, b: ArrayView1<F>) -> F {
27    let mut s = F::zero();
28    for i in 0..a.len().min(b.len()) {
29        let d = a[i] - b[i];
30        s = s + d * d;
31    }
32    s
33}
34
35/// Full pairwise distance matrix (Euclidean).
36fn pairwise_distances<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> Array2<F> {
37    let n = data.shape()[0];
38    let mut dists = Array2::<F>::zeros((n, n));
39    for i in 0..n {
40        for j in (i + 1)..n {
41            let d = dist_sq(data.row(i), data.row(j)).sqrt();
42            dists[[i, j]] = d;
43            dists[[j, i]] = d;
44        }
45    }
46    dists
47}
48
49/// k-nearest-neighbor distances (sorted ascending) for each point.
50fn knn_distances<F: Float + FromPrimitive + Debug>(
51    dist_mat: &Array2<F>,
52    k: usize,
53) -> Vec<Vec<(usize, F)>> {
54    let n = dist_mat.shape()[0];
55    let mut result = Vec::with_capacity(n);
56    for i in 0..n {
57        let mut dists: Vec<(usize, F)> = (0..n)
58            .filter(|&j| j != i)
59            .map(|j| (j, dist_mat[[i, j]]))
60            .collect();
61        dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
62        dists.truncate(k);
63        result.push(dists);
64    }
65    result
66}
67
68// ---------------------------------------------------------------------------
69// HDBSCAN* with stability
70// ---------------------------------------------------------------------------
71
72/// Configuration for HDBSCAN* with stability-based cluster extraction.
73#[derive(Debug, Clone)]
74pub struct HdbscanStarConfig {
75    /// Minimum cluster size for stability computation.
76    pub min_cluster_size: usize,
77    /// Minimum number of samples for core distance.
78    pub min_samples: usize,
79    /// Whether to compute cluster membership probabilities.
80    pub compute_probabilities: bool,
81}
82
83impl Default for HdbscanStarConfig {
84    fn default() -> Self {
85        Self {
86            min_cluster_size: 5,
87            min_samples: 5,
88            compute_probabilities: true,
89        }
90    }
91}
92
93/// Result of HDBSCAN* with stability.
94#[derive(Debug, Clone)]
95pub struct HdbscanStarResult<F: Float> {
96    /// Cluster labels (-1 = noise).
97    pub labels: Array1<i32>,
98    /// Membership probabilities (if computed).
99    pub probabilities: Option<Array1<F>>,
100    /// Stability values for each cluster.
101    pub cluster_stabilities: Vec<F>,
102    /// Number of clusters found.
103    pub n_clusters: usize,
104    /// Outlier scores (higher = more outlier-like).
105    pub outlier_scores: Array1<F>,
106}
107
108/// Internal edge for MST construction.
109#[derive(Debug, Clone)]
110struct MstEdge<F: Float> {
111    i: usize,
112    j: usize,
113    weight: F,
114}
115
116/// Run HDBSCAN* with stability-based cluster extraction.
117///
118/// This is an enhanced version of HDBSCAN that:
119/// 1. Computes mutual reachability distances
120/// 2. Builds a minimum spanning tree
121/// 3. Constructs a condensed cluster tree
122/// 4. Extracts clusters based on stability (excess of mass)
123pub fn hdbscan_star<F: Float + FromPrimitive + Debug>(
124    data: ArrayView2<F>,
125    config: &HdbscanStarConfig,
126) -> Result<HdbscanStarResult<F>> {
127    let n = data.shape()[0];
128
129    if n == 0 {
130        return Err(ClusteringError::InvalidInput("Empty input data".into()));
131    }
132    if config.min_cluster_size < 2 {
133        return Err(ClusteringError::InvalidInput(
134            "min_cluster_size must be >= 2".into(),
135        ));
136    }
137    if config.min_samples < 1 {
138        return Err(ClusteringError::InvalidInput(
139            "min_samples must be >= 1".into(),
140        ));
141    }
142
143    let dist_mat = pairwise_distances(data);
144    let mpts = config.min_samples;
145
146    // Step 1: compute core distances (distance to mpts-th nearest neighbor)
147    let mut core_dists = Array1::<F>::zeros(n);
148    for i in 0..n {
149        let mut dists: Vec<F> = (0..n)
150            .filter(|&j| j != i)
151            .map(|j| dist_mat[[i, j]])
152            .collect();
153        dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
154        let k = (mpts - 1).min(dists.len().saturating_sub(1));
155        core_dists[i] = if k < dists.len() {
156            dists[k]
157        } else {
158            F::infinity()
159        };
160    }
161
162    // Step 2: mutual reachability distance
163    // mrd(a,b) = max(core(a), core(b), d(a,b))
164    let mut mrd = Array2::<F>::zeros((n, n));
165    for i in 0..n {
166        for j in (i + 1)..n {
167            let d = dist_mat[[i, j]];
168            let mr = d.max(core_dists[i]).max(core_dists[j]);
169            mrd[[i, j]] = mr;
170            mrd[[j, i]] = mr;
171        }
172    }
173
174    // Step 3: build MST using Prim's algorithm
175    let mst = prims_mst(&mrd, n);
176
177    // Step 4: sort MST edges by weight
178    let mut sorted_edges = mst;
179    sorted_edges.sort_by(|a, b| {
180        a.weight
181            .partial_cmp(&b.weight)
182            .unwrap_or(std::cmp::Ordering::Equal)
183    });
184
185    // Step 5: build condensed tree and extract clusters using stability
186    let min_cs = config.min_cluster_size;
187    let (labels, stabilities) = extract_stable_clusters(&sorted_edges, n, min_cs);
188
189    let n_clusters = if stabilities.is_empty() {
190        0
191    } else {
192        labels
193            .iter()
194            .filter(|&&l| l >= 0)
195            .map(|&l| l)
196            .max()
197            .map(|m| m as usize + 1)
198            .unwrap_or(0)
199    };
200
201    // Compute outlier scores: based on distance to nearest core point relative to cluster
202    let mut outlier_scores = Array1::<F>::zeros(n);
203    for i in 0..n {
204        if labels[i] < 0 {
205            outlier_scores[i] = F::one();
206        } else {
207            // Score based on how far from the cluster core
208            let ci = labels[i] as usize;
209            let mut min_core_dist = F::infinity();
210            for j in 0..n {
211                if j != i && labels[j] == labels[i] {
212                    let d = mrd[[i, j]];
213                    if d < min_core_dist {
214                        min_core_dist = d;
215                    }
216                }
217            }
218            if min_core_dist < F::infinity() {
219                let max_core = core_dists
220                    .iter()
221                    .copied()
222                    .filter(|&d| d < F::infinity())
223                    .fold(F::zero(), |a, b| a.max(b));
224                if max_core > F::epsilon() {
225                    outlier_scores[i] = min_core_dist / max_core;
226                }
227            }
228        }
229    }
230
231    // Compute probabilities if requested
232    let probabilities = if config.compute_probabilities {
233        let mut probs = Array1::<F>::zeros(n);
234        for i in 0..n {
235            probs[i] = F::one() - outlier_scores[i].min(F::one());
236        }
237        Some(probs)
238    } else {
239        None
240    };
241
242    Ok(HdbscanStarResult {
243        labels,
244        probabilities,
245        cluster_stabilities: stabilities,
246        n_clusters,
247        outlier_scores,
248    })
249}
250
251/// Prim's algorithm for MST on a distance matrix.
252fn prims_mst<F: Float>(dist_mat: &Array2<F>, n: usize) -> Vec<MstEdge<F>> {
253    if n <= 1 {
254        return Vec::new();
255    }
256
257    let mut in_tree = vec![false; n];
258    let mut key = vec![F::infinity(); n]; // minimum weight edge to tree
259    let mut parent = vec![0usize; n];
260    let mut edges = Vec::with_capacity(n - 1);
261
262    key[0] = F::zero();
263
264    for _ in 0..n {
265        // Find minimum key vertex not in tree
266        let mut u = None;
267        let mut min_key = F::infinity();
268        for v in 0..n {
269            if !in_tree[v] && key[v] < min_key {
270                min_key = key[v];
271                u = Some(v);
272            }
273        }
274        let u = match u {
275            Some(v) => v,
276            None => break,
277        };
278        in_tree[u] = true;
279
280        if key[u] > F::zero() {
281            edges.push(MstEdge {
282                i: parent[u],
283                j: u,
284                weight: key[u],
285            });
286        }
287
288        // Update keys
289        for v in 0..n {
290            if !in_tree[v] && dist_mat[[u, v]] < key[v] {
291                key[v] = dist_mat[[u, v]];
292                parent[v] = u;
293            }
294        }
295    }
296
297    edges
298}
299
300/// Extract stable clusters from sorted MST edges using simplified stability analysis.
301fn extract_stable_clusters<F: Float + FromPrimitive + Debug>(
302    sorted_edges: &[MstEdge<F>],
303    n: usize,
304    min_cluster_size: usize,
305) -> (Array1<i32>, Vec<F>) {
306    // Union-Find for tracking connected components
307    let mut parent_uf: Vec<usize> = (0..n).collect();
308    let mut size: Vec<usize> = vec![1; n];
309
310    fn find(parent: &mut Vec<usize>, x: usize) -> usize {
311        let mut root = x;
312        while parent[root] != root {
313            root = parent[root];
314        }
315        // Path compression
316        let mut cur = x;
317        while cur != root {
318            let next = parent[cur];
319            parent[cur] = root;
320            cur = next;
321        }
322        root
323    }
324
325    fn union(parent: &mut Vec<usize>, size: &mut Vec<usize>, a: usize, b: usize) -> usize {
326        let ra = find(parent, a);
327        let rb = find(parent, b);
328        if ra == rb {
329            return ra;
330        }
331        if size[ra] < size[rb] {
332            parent[ra] = rb;
333            size[rb] += size[ra];
334            rb
335        } else {
336            parent[rb] = ra;
337            size[ra] += size[rb];
338            ra
339        }
340    }
341
342    // Process edges in order, tracking when components reach min_cluster_size
343    let mut component_birth: Vec<Option<F>> = vec![None; n]; // lambda at which component was born as cluster
344
345    for edge in sorted_edges {
346        let ri = find(&mut parent_uf, edge.i);
347        let rj = find(&mut parent_uf, edge.j);
348        if ri == rj {
349            continue;
350        }
351
352        let lambda = if edge.weight > F::epsilon() {
353            F::one() / edge.weight
354        } else {
355            F::infinity()
356        };
357
358        // Check if merging creates a big-enough cluster
359        let new_root = union(&mut parent_uf, &mut size, ri, rj);
360        if size[new_root] >= min_cluster_size && component_birth[new_root].is_none() {
361            component_birth[new_root] = Some(lambda);
362        }
363    }
364
365    // Assign labels based on final connected components
366    // Reset parent for fresh traversal
367    let mut final_parent: Vec<usize> = (0..n).collect();
368    let mut final_size: Vec<usize> = vec![1; n];
369
370    for edge in sorted_edges {
371        let _root = union(&mut final_parent, &mut final_size, edge.i, edge.j);
372    }
373
374    // Find distinct components that are large enough
375    let mut cluster_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
376    let mut next_label = 0i32;
377    let mut labels = Array1::from_elem(n, -1i32);
378
379    for i in 0..n {
380        let root = find(&mut final_parent, i);
381        if final_size[root] >= min_cluster_size {
382            let label = cluster_map.entry(root).or_insert_with(|| {
383                let l = next_label;
384                next_label += 1;
385                l
386            });
387            labels[i] = *label;
388        }
389    }
390
391    // Compute stability for each cluster (simplified: proportional to component size * birth lambda)
392    let mut stabilities = Vec::new();
393    for (&root, &label) in &cluster_map {
394        let birth = component_birth[root].unwrap_or_else(|| F::zero());
395        let sz = F::from(final_size[root]).unwrap_or_else(|| F::one());
396        stabilities.push(birth * sz);
397    }
398
399    (labels, stabilities)
400}
401
402// ---------------------------------------------------------------------------
403// Auto-epsilon DBSCAN
404// ---------------------------------------------------------------------------
405
406/// Configuration for automatic epsilon selection.
407#[derive(Debug, Clone)]
408pub struct AutoEpsilonConfig {
409    /// Number of nearest neighbors for k-distance plot (typically = min_samples).
410    pub k: usize,
411    /// Minimum DBSCAN samples parameter.
412    pub min_samples: usize,
413    /// Sensitivity of elbow detection (higher = more sensitive).
414    pub sensitivity: f64,
415}
416
417impl Default for AutoEpsilonConfig {
418    fn default() -> Self {
419        Self {
420            k: 5,
421            min_samples: 5,
422            sensitivity: 1.0,
423        }
424    }
425}
426
427/// Result of auto-epsilon DBSCAN.
428#[derive(Debug, Clone)]
429pub struct AutoEpsilonResult<F: Float> {
430    /// Cluster labels (-1 = noise).
431    pub labels: Array1<i32>,
432    /// Selected epsilon value.
433    pub epsilon: F,
434    /// k-distance plot values (sorted).
435    pub k_distances: Vec<F>,
436    /// Elbow index in the k-distance plot.
437    pub elbow_index: usize,
438    /// Number of clusters found.
439    pub n_clusters: usize,
440}
441
442/// Run DBSCAN with automatic epsilon selection via k-distance elbow detection.
443///
444/// Computes the k-distance plot, identifies the "elbow" (point of maximum
445/// curvature), and uses that distance as epsilon for DBSCAN.
446pub fn auto_epsilon_dbscan<F: Float + FromPrimitive + Debug>(
447    data: ArrayView2<F>,
448    config: &AutoEpsilonConfig,
449) -> Result<AutoEpsilonResult<F>> {
450    let n = data.shape()[0];
451    if n == 0 {
452        return Err(ClusteringError::InvalidInput("Empty input data".into()));
453    }
454    if config.k == 0 {
455        return Err(ClusteringError::InvalidInput("k must be > 0".into()));
456    }
457
458    let dist_mat = pairwise_distances(data);
459    let k = config.k.min(n - 1);
460
461    // Compute k-distance for each point
462    let mut k_dists: Vec<F> = Vec::with_capacity(n);
463    for i in 0..n {
464        let mut row_dists: Vec<F> = (0..n)
465            .filter(|&j| j != i)
466            .map(|j| dist_mat[[i, j]])
467            .collect();
468        row_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
469        let kd = if k <= row_dists.len() {
470            row_dists[k - 1]
471        } else {
472            *row_dists.last().unwrap_or(&F::zero())
473        };
474        k_dists.push(kd);
475    }
476
477    // Sort k-distances ascending for the elbow plot
478    k_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
479
480    // Find elbow using maximum curvature (second derivative)
481    let elbow_idx = find_elbow(&k_dists, config.sensitivity);
482    let epsilon = k_dists[elbow_idx];
483
484    // Run DBSCAN with selected epsilon
485    let labels = run_dbscan(data, &dist_mat, epsilon, config.min_samples);
486
487    let n_clusters = labels
488        .iter()
489        .filter(|&&l| l >= 0)
490        .map(|&l| l)
491        .max()
492        .map(|m| m as usize + 1)
493        .unwrap_or(0);
494
495    Ok(AutoEpsilonResult {
496        labels,
497        epsilon,
498        k_distances: k_dists,
499        elbow_index: elbow_idx,
500        n_clusters,
501    })
502}
503
504/// Find the elbow point in a sorted distance curve.
505fn find_elbow<F: Float + FromPrimitive + Debug>(values: &[F], sensitivity: f64) -> usize {
506    let n = values.len();
507    if n < 3 {
508        return n / 2;
509    }
510
511    // Use perpendicular distance to the line from first to last point
512    let x0 = 0.0f64;
513    let y0 = values[0].to_f64().unwrap_or(0.0);
514    let x1 = (n - 1) as f64;
515    let y1 = values[n - 1].to_f64().unwrap_or(0.0);
516
517    let dx = x1 - x0;
518    let dy = y1 - y0;
519    let line_len = (dx * dx + dy * dy).sqrt().max(1e-15);
520
521    let mut max_dist = 0.0f64;
522    let mut elbow_idx = n / 2;
523
524    for i in 1..(n - 1) {
525        let xi = i as f64;
526        let yi = values[i].to_f64().unwrap_or(0.0);
527        // Perpendicular distance from point to line
528        let dist = ((dy * xi - dx * yi + x1 * y0 - y1 * x0).abs()) / line_len;
529        let adjusted = dist * sensitivity;
530        if adjusted > max_dist {
531            max_dist = adjusted;
532            elbow_idx = i;
533        }
534    }
535
536    elbow_idx
537}
538
539/// Run DBSCAN given a precomputed distance matrix.
540fn run_dbscan<F: Float + FromPrimitive + Debug>(
541    data: ArrayView2<F>,
542    dist_mat: &Array2<F>,
543    eps: F,
544    min_samples: usize,
545) -> Array1<i32> {
546    let n = data.shape()[0];
547    let mut labels = vec![-2i32; n]; // -2 = undefined
548    let mut cluster_id = 0i32;
549
550    for i in 0..n {
551        if labels[i] != -2 {
552            continue;
553        }
554        let neighbors: Vec<usize> = (0..n).filter(|&j| dist_mat[[i, j]] <= eps).collect();
555
556        if neighbors.len() < min_samples {
557            labels[i] = -1; // noise
558            continue;
559        }
560
561        labels[i] = cluster_id;
562        let mut queue = neighbors;
563        let mut head = 0;
564        while head < queue.len() {
565            let cur = queue[head];
566            head += 1;
567            if labels[cur] == -1 {
568                labels[cur] = cluster_id;
569                continue;
570            }
571            if labels[cur] != -2 {
572                continue;
573            }
574            labels[cur] = cluster_id;
575
576            let cur_nb: Vec<usize> = (0..n).filter(|&j| dist_mat[[cur, j]] <= eps).collect();
577            if cur_nb.len() >= min_samples {
578                for nb in cur_nb {
579                    if labels[nb] == -2 || labels[nb] == -1 {
580                        queue.push(nb);
581                    }
582                }
583            }
584        }
585        cluster_id += 1;
586    }
587
588    Array1::from_vec(labels)
589}
590
591// ---------------------------------------------------------------------------
592// Shared Nearest Neighbor (SNN) Density Clustering
593// ---------------------------------------------------------------------------
594
595/// Configuration for SNN density clustering.
596#[derive(Debug, Clone)]
597pub struct SnnConfig {
598    /// Number of nearest neighbors for SNN computation (k).
599    pub k: usize,
600    /// Minimum SNN similarity for two points to be considered neighbors.
601    pub snn_threshold: usize,
602    /// Minimum number of SNN-neighbors for a core point.
603    pub min_snn_neighbors: usize,
604}
605
606impl Default for SnnConfig {
607    fn default() -> Self {
608        Self {
609            k: 10,
610            snn_threshold: 3,
611            min_snn_neighbors: 3,
612        }
613    }
614}
615
616/// Result of SNN density clustering.
617#[derive(Debug, Clone)]
618pub struct SnnResult<F: Float> {
619    /// Cluster labels (-1 = noise).
620    pub labels: Array1<i32>,
621    /// SNN similarity matrix.
622    pub snn_similarity: Array2<F>,
623    /// Number of clusters found.
624    pub n_clusters: usize,
625}
626
627/// Run shared nearest neighbor (SNN) density clustering.
628///
629/// SNN similarity between two points is the number of shared items in
630/// their k-nearest-neighbor lists. Points with high SNN similarity form
631/// dense regions that become clusters.
632pub fn snn_clustering<F: Float + FromPrimitive + Debug>(
633    data: ArrayView2<F>,
634    config: &SnnConfig,
635) -> Result<SnnResult<F>> {
636    let n = data.shape()[0];
637    if n == 0 {
638        return Err(ClusteringError::InvalidInput("Empty input data".into()));
639    }
640    if config.k == 0 {
641        return Err(ClusteringError::InvalidInput("k must be > 0".into()));
642    }
643
644    let dist_mat = pairwise_distances(data);
645    let k = config.k.min(n - 1);
646    let knn = knn_distances(&dist_mat, k);
647
648    // Compute SNN similarity matrix
649    let mut snn_sim = Array2::<F>::zeros((n, n));
650    for i in 0..n {
651        let knn_i: std::collections::HashSet<usize> = knn[i].iter().map(|&(j, _)| j).collect();
652        for j in (i + 1)..n {
653            let knn_j: std::collections::HashSet<usize> =
654                knn[j].iter().map(|&(jj, _)| jj).collect();
655            let shared = knn_i.intersection(&knn_j).count();
656            let sim = F::from(shared).unwrap_or_else(|| F::zero());
657            snn_sim[[i, j]] = sim;
658            snn_sim[[j, i]] = sim;
659        }
660    }
661
662    // DBSCAN-like clustering using SNN similarity
663    let threshold = F::from(config.snn_threshold).unwrap_or_else(|| F::one());
664    let min_nb = config.min_snn_neighbors;
665
666    let mut labels = vec![-2i32; n];
667    let mut cluster_id = 0i32;
668
669    for i in 0..n {
670        if labels[i] != -2 {
671            continue;
672        }
673        let neighbors: Vec<usize> = (0..n)
674            .filter(|&j| j != i && snn_sim[[i, j]] >= threshold)
675            .collect();
676
677        if neighbors.len() < min_nb {
678            labels[i] = -1;
679            continue;
680        }
681
682        labels[i] = cluster_id;
683        let mut queue = neighbors;
684        let mut head = 0;
685        while head < queue.len() {
686            let cur = queue[head];
687            head += 1;
688            if labels[cur] == -1 {
689                labels[cur] = cluster_id;
690                continue;
691            }
692            if labels[cur] != -2 {
693                continue;
694            }
695            labels[cur] = cluster_id;
696
697            let cur_nb: Vec<usize> = (0..n)
698                .filter(|&j| j != cur && snn_sim[[cur, j]] >= threshold)
699                .collect();
700            if cur_nb.len() >= min_nb {
701                for nb in cur_nb {
702                    if labels[nb] == -2 || labels[nb] == -1 {
703                        queue.push(nb);
704                    }
705                }
706            }
707        }
708        cluster_id += 1;
709    }
710
711    let n_clusters = labels
712        .iter()
713        .filter(|&&l| l >= 0)
714        .max()
715        .map(|&m| m as usize + 1)
716        .unwrap_or(0);
717
718    Ok(SnnResult {
719        labels: Array1::from_vec(labels),
720        snn_similarity: snn_sim,
721        n_clusters,
722    })
723}
724
725// ---------------------------------------------------------------------------
726// Kernel Density Clustering
727// ---------------------------------------------------------------------------
728
729/// Kernel type for density estimation.
730#[derive(Debug, Clone, Copy, PartialEq, Eq)]
731pub enum KdeKernel {
732    /// Gaussian (RBF) kernel.
733    Gaussian,
734    /// Epanechnikov kernel.
735    Epanechnikov,
736    /// Uniform (box) kernel.
737    Uniform,
738}
739
740/// Configuration for kernel density clustering.
741#[derive(Debug, Clone)]
742pub struct KdcConfig {
743    /// Kernel type.
744    pub kernel: KdeKernel,
745    /// Bandwidth (h). If 0, auto-select via Silverman's rule.
746    pub bandwidth: f64,
747    /// Density threshold for peak detection (fraction of max density).
748    pub density_threshold: f64,
749    /// Merge distance: peaks closer than this are merged.
750    pub merge_distance: f64,
751}
752
753impl Default for KdcConfig {
754    fn default() -> Self {
755        Self {
756            kernel: KdeKernel::Gaussian,
757            bandwidth: 0.0,
758            density_threshold: 0.1,
759            merge_distance: 0.0,
760        }
761    }
762}
763
764/// Result of kernel density clustering.
765#[derive(Debug, Clone)]
766pub struct KdcResult<F: Float> {
767    /// Cluster labels (-1 = low density / noise).
768    pub labels: Array1<i32>,
769    /// Estimated density at each point.
770    pub densities: Array1<F>,
771    /// Bandwidth used.
772    pub bandwidth: F,
773    /// Number of clusters found.
774    pub n_clusters: usize,
775}
776
777/// Run kernel density-based clustering.
778///
779/// Estimates density at each data point, identifies local density peaks
780/// as cluster centres, and assigns each point to the nearest peak via
781/// gradient ascent (mean-shift style).
782pub fn kernel_density_clustering<F: Float + FromPrimitive + Debug>(
783    data: ArrayView2<F>,
784    config: &KdcConfig,
785) -> Result<KdcResult<F>> {
786    let (n, d) = (data.shape()[0], data.shape()[1]);
787    if n == 0 {
788        return Err(ClusteringError::InvalidInput("Empty input data".into()));
789    }
790
791    // Auto-bandwidth via Silverman's rule if needed
792    let h = if config.bandwidth <= 0.0 {
793        silverman_bandwidth(data)
794    } else {
795        F::from(config.bandwidth).unwrap_or_else(|| F::one())
796    };
797
798    if h <= F::epsilon() {
799        return Err(ClusteringError::ComputationError(
800            "Bandwidth too small".into(),
801        ));
802    }
803
804    // Estimate density at each point
805    let mut densities = Array1::<F>::zeros(n);
806    let n_f = F::from(n).unwrap_or_else(|| F::one());
807    let h_d = h.powi(d as i32);
808    let norm_factor = n_f * h_d;
809
810    for i in 0..n {
811        let mut dens = F::zero();
812        for j in 0..n {
813            let u_sq = dist_sq(data.row(i), data.row(j)) / (h * h);
814            let kval = match config.kernel {
815                KdeKernel::Gaussian => {
816                    let neg_half = F::from(-0.5).unwrap_or_else(|| F::zero());
817                    (neg_half * u_sq).exp()
818                }
819                KdeKernel::Epanechnikov => {
820                    if u_sq < F::one() {
821                        F::one() - u_sq
822                    } else {
823                        F::zero()
824                    }
825                }
826                KdeKernel::Uniform => {
827                    if u_sq < F::one() {
828                        F::one()
829                    } else {
830                        F::zero()
831                    }
832                }
833            };
834            dens = dens + kval;
835        }
836        densities[i] = dens / norm_factor;
837    }
838
839    // Find density peaks via gradient ascent (simplified mean-shift)
840    let max_density = densities.iter().copied().fold(F::zero(), |a, b| a.max(b));
841    let threshold = F::from(config.density_threshold).unwrap_or_else(|| F::zero()) * max_density;
842
843    // Assign each above-threshold point to a peak
844    let mut peak_assignments = vec![-1i32; n];
845    let mut peaks: Vec<usize> = Vec::new();
846
847    // Gradient ascent: each point walks to its local maximum
848    let mut local_max = vec![0usize; n];
849    for i in 0..n {
850        let mut current = i;
851        for _ in 0..100 {
852            // Find the highest-density neighbor
853            let mut best = current;
854            let mut best_dens = densities[current];
855            for j in 0..n {
856                if dist_sq(data.row(current), data.row(j)).sqrt()
857                    <= h * F::from(2.0).unwrap_or_else(|| F::one())
858                {
859                    if densities[j] > best_dens {
860                        best_dens = densities[j];
861                        best = j;
862                    }
863                }
864            }
865            if best == current {
866                break;
867            }
868            current = best;
869        }
870        local_max[i] = current;
871    }
872
873    // Group points by their local maximum
874    let merge_dist = if config.merge_distance > 0.0 {
875        F::from(config.merge_distance).unwrap_or_else(|| h)
876    } else {
877        h
878    };
879
880    // Deduplicate peaks
881    let mut peak_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
882    let mut next_label = 0i32;
883
884    for i in 0..n {
885        if densities[i] < threshold {
886            peak_assignments[i] = -1;
887            continue;
888        }
889        let peak = local_max[i];
890
891        // Check if this peak should be merged with an existing one
892        let mut merged_label = None;
893        for (&existing_peak, &label) in &peak_map {
894            if dist_sq(data.row(peak), data.row(existing_peak)).sqrt() <= merge_dist {
895                merged_label = Some(label);
896                break;
897            }
898        }
899
900        let label = match merged_label {
901            Some(l) => l,
902            None => {
903                let l = next_label;
904                peak_map.insert(peak, l);
905                peaks.push(peak);
906                next_label += 1;
907                l
908            }
909        };
910        peak_assignments[i] = label;
911        peak_map.entry(peak).or_insert(label);
912    }
913
914    let n_clusters = next_label as usize;
915
916    Ok(KdcResult {
917        labels: Array1::from_vec(peak_assignments),
918        densities,
919        bandwidth: h,
920        n_clusters,
921    })
922}
923
924/// Silverman's rule of thumb for bandwidth selection.
925fn silverman_bandwidth<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> F {
926    let (n, d) = (data.shape()[0], data.shape()[1]);
927    if n < 2 {
928        return F::one();
929    }
930
931    // Average standard deviation across all dimensions
932    let mut total_std = 0.0f64;
933    for dim in 0..d {
934        let mean = (0..n)
935            .map(|i| data[[i, dim]].to_f64().unwrap_or(0.0))
936            .sum::<f64>()
937            / n as f64;
938        let var = (0..n)
939            .map(|i| {
940                let diff = data[[i, dim]].to_f64().unwrap_or(0.0) - mean;
941                diff * diff
942            })
943            .sum::<f64>()
944            / (n - 1) as f64;
945        total_std += var.sqrt();
946    }
947    let avg_std = total_std / d as f64;
948
949    // Silverman's rule: h = 0.9 * min(sigma, IQR/1.34) * n^(-1/5)
950    // Simplified: h = 1.06 * sigma * n^(-1/5)
951    let h = 1.06 * avg_std * (n as f64).powf(-0.2);
952    F::from(h.max(1e-10)).unwrap_or_else(|| F::one())
953}
954
955// ---------------------------------------------------------------------------
956// Local Outlier Factor (LOF)
957// ---------------------------------------------------------------------------
958
959/// Configuration for LOF computation.
960#[derive(Debug, Clone)]
961pub struct LofConfig {
962    /// Number of nearest neighbors (k, also called MinPts).
963    pub k: usize,
964    /// LOF threshold above which a point is considered an outlier.
965    pub outlier_threshold: f64,
966}
967
968impl Default for LofConfig {
969    fn default() -> Self {
970        Self {
971            k: 5,
972            outlier_threshold: 1.5,
973        }
974    }
975}
976
977/// Result of LOF computation.
978#[derive(Debug, Clone)]
979pub struct LofResult<F: Float> {
980    /// LOF score for each data point (1.0 = normal density, > 1 = outlier-like).
981    pub lof_scores: Array1<F>,
982    /// Binary outlier labels (true = outlier).
983    pub is_outlier: Vec<bool>,
984    /// Number of outliers detected.
985    pub n_outliers: usize,
986    /// k-distance for each point.
987    pub k_distances: Array1<F>,
988    /// Local reachability density for each point.
989    pub lrd: Array1<F>,
990}
991
992/// Compute Local Outlier Factor (LOF) scores.
993///
994/// LOF measures the local density deviation of a point with respect to its
995/// neighbors. A LOF score significantly greater than 1 indicates the point
996/// is in a region of lower density (potential outlier).
997pub fn local_outlier_factor<F: Float + FromPrimitive + Debug>(
998    data: ArrayView2<F>,
999    config: &LofConfig,
1000) -> Result<LofResult<F>> {
1001    let n = data.shape()[0];
1002    if n == 0 {
1003        return Err(ClusteringError::InvalidInput("Empty input data".into()));
1004    }
1005    if config.k == 0 || config.k >= n {
1006        return Err(ClusteringError::InvalidInput(
1007            "k must be in [1, n_samples)".into(),
1008        ));
1009    }
1010
1011    let dist_mat = pairwise_distances(data);
1012    let k = config.k;
1013    let knn = knn_distances(&dist_mat, k);
1014
1015    // k-distance for each point: distance to k-th nearest neighbor
1016    let mut k_dist = Array1::<F>::zeros(n);
1017    for i in 0..n {
1018        k_dist[i] = if knn[i].len() >= k {
1019            knn[i][k - 1].1
1020        } else if let Some(last) = knn[i].last() {
1021            last.1
1022        } else {
1023            F::zero()
1024        };
1025    }
1026
1027    // Reachability distance: reach_dist_k(a, b) = max(k_dist(b), d(a, b))
1028    // Local reachability density: lrd(p) = 1 / (avg reach_dist from p to its k neighbors)
1029    let mut lrd = Array1::<F>::zeros(n);
1030    for i in 0..n {
1031        let mut sum_reach = F::zero();
1032        let nb_count = knn[i].len();
1033        for &(j, d_ij) in &knn[i] {
1034            let reach = d_ij.max(k_dist[j]);
1035            sum_reach = sum_reach + reach;
1036        }
1037        if nb_count > 0 && sum_reach > F::epsilon() {
1038            lrd[i] = F::from(nb_count).unwrap_or_else(|| F::one()) / sum_reach;
1039        } else {
1040            lrd[i] = F::one(); // avoid division by zero
1041        }
1042    }
1043
1044    // LOF score: avg(lrd(neighbor) / lrd(p)) for all k-neighbors
1045    let mut lof_scores = Array1::<F>::zeros(n);
1046    for i in 0..n {
1047        let nb_count = knn[i].len();
1048        if nb_count == 0 || lrd[i] <= F::epsilon() {
1049            lof_scores[i] = F::one();
1050            continue;
1051        }
1052        let mut sum = F::zero();
1053        for &(j, _) in &knn[i] {
1054            sum = sum + lrd[j] / lrd[i];
1055        }
1056        lof_scores[i] = sum / F::from(nb_count).unwrap_or_else(|| F::one());
1057    }
1058
1059    let threshold = F::from(config.outlier_threshold)
1060        .unwrap_or_else(|| F::from(1.5).unwrap_or_else(|| F::one()));
1061    let is_outlier: Vec<bool> = lof_scores.iter().map(|&s| s > threshold).collect();
1062    let n_outliers = is_outlier.iter().filter(|&&o| o).count();
1063
1064    Ok(LofResult {
1065        lof_scores,
1066        is_outlier,
1067        n_outliers,
1068        k_distances: k_dist,
1069        lrd,
1070    })
1071}
1072
1073// ---------------------------------------------------------------------------
1074// Tests
1075// ---------------------------------------------------------------------------
1076
1077#[cfg(test)]
1078mod tests {
1079    use super::*;
1080    use scirs2_core::ndarray::Array2;
1081
1082    fn make_clustered_data() -> Array2<f64> {
1083        let mut data = Vec::new();
1084        // Cluster A around (1, 1)
1085        for i in 0..15 {
1086            let n = (i as f64 * 0.073).sin() * 0.2;
1087            data.push(1.0 + n);
1088            data.push(1.0 + n * 0.5);
1089        }
1090        // Cluster B around (5, 5)
1091        for i in 0..15 {
1092            let n = (i as f64 * 0.131).sin() * 0.2;
1093            data.push(5.0 + n);
1094            data.push(5.0 + n * 0.5);
1095        }
1096        // Outlier
1097        data.push(10.0);
1098        data.push(10.0);
1099        Array2::from_shape_vec((31, 2), data).expect("shape failed")
1100    }
1101
1102    // -- HDBSCAN* tests --
1103
1104    #[test]
1105    fn test_hdbscan_star_basic() {
1106        let data = make_clustered_data();
1107        let config = HdbscanStarConfig {
1108            min_cluster_size: 3,
1109            min_samples: 3,
1110            compute_probabilities: true,
1111        };
1112        let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
1113        assert_eq!(result.labels.len(), 31);
1114        assert!(result.n_clusters >= 1);
1115        assert!(result.probabilities.is_some());
1116        assert_eq!(result.outlier_scores.len(), 31);
1117    }
1118
1119    #[test]
1120    fn test_hdbscan_star_empty() {
1121        let data = Array2::<f64>::zeros((0, 3));
1122        let config = HdbscanStarConfig::default();
1123        assert!(hdbscan_star(data.view(), &config).is_err());
1124    }
1125
1126    #[test]
1127    fn test_hdbscan_star_invalid_params() {
1128        let data = make_clustered_data();
1129        let config = HdbscanStarConfig {
1130            min_cluster_size: 1,
1131            ..Default::default()
1132        };
1133        assert!(hdbscan_star(data.view(), &config).is_err());
1134    }
1135
1136    #[test]
1137    fn test_hdbscan_star_small_data() {
1138        let data = Array2::from_shape_vec(
1139            (5, 2),
1140            vec![0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 5.0, 5.0, 5.1, 5.1],
1141        )
1142        .expect("shape");
1143        let config = HdbscanStarConfig {
1144            min_cluster_size: 2,
1145            min_samples: 2,
1146            compute_probabilities: false,
1147        };
1148        let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
1149        assert_eq!(result.labels.len(), 5);
1150        assert!(result.probabilities.is_none());
1151    }
1152
1153    // -- Auto-epsilon DBSCAN tests --
1154
1155    #[test]
1156    fn test_auto_epsilon_basic() {
1157        let data = make_clustered_data();
1158        let config = AutoEpsilonConfig {
1159            k: 3,
1160            min_samples: 3,
1161            sensitivity: 1.0,
1162        };
1163        let result = auto_epsilon_dbscan(data.view(), &config).expect("auto-eps failed");
1164        assert_eq!(result.labels.len(), 31);
1165        assert!(result.epsilon > 0.0);
1166        assert!(!result.k_distances.is_empty());
1167    }
1168
1169    #[test]
1170    fn test_auto_epsilon_empty() {
1171        let data = Array2::<f64>::zeros((0, 2));
1172        let config = AutoEpsilonConfig::default();
1173        assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
1174    }
1175
1176    #[test]
1177    fn test_auto_epsilon_invalid_k() {
1178        let data = make_clustered_data();
1179        let config = AutoEpsilonConfig {
1180            k: 0,
1181            ..Default::default()
1182        };
1183        assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
1184    }
1185
1186    #[test]
1187    fn test_find_elbow() {
1188        let values = vec![0.1, 0.2, 0.3, 0.4, 0.8, 1.5, 3.0, 5.0, 8.0, 12.0];
1189        let idx = find_elbow(&values, 1.0);
1190        // Elbow should be somewhere in the middle where curvature changes
1191        assert!(idx > 0 && idx < values.len() - 1);
1192    }
1193
1194    // -- SNN tests --
1195
1196    #[test]
1197    fn test_snn_basic() {
1198        let data = make_clustered_data();
1199        let config = SnnConfig {
1200            k: 5,
1201            snn_threshold: 2,
1202            min_snn_neighbors: 2,
1203        };
1204        let result = snn_clustering(data.view(), &config).expect("snn failed");
1205        assert_eq!(result.labels.len(), 31);
1206        // SNN similarity should be symmetric
1207        let n = result.snn_similarity.shape()[0];
1208        for i in 0..n {
1209            for j in 0..n {
1210                let diff = (result.snn_similarity[[i, j]] - result.snn_similarity[[j, i]]).abs();
1211                assert!(diff < 1e-10);
1212            }
1213        }
1214    }
1215
1216    #[test]
1217    fn test_snn_empty() {
1218        let data = Array2::<f64>::zeros((0, 2));
1219        let config = SnnConfig::default();
1220        assert!(snn_clustering(data.view(), &config).is_err());
1221    }
1222
1223    // -- KDE clustering tests --
1224
1225    #[test]
1226    fn test_kde_clustering_basic() {
1227        let data = make_clustered_data();
1228        let config = KdcConfig {
1229            kernel: KdeKernel::Gaussian,
1230            bandwidth: 1.0,
1231            density_threshold: 0.05,
1232            merge_distance: 1.0,
1233        };
1234        let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1235        assert_eq!(result.labels.len(), 31);
1236        assert_eq!(result.densities.len(), 31);
1237        assert!(result.bandwidth > 0.0);
1238    }
1239
1240    #[test]
1241    fn test_kde_clustering_auto_bandwidth() {
1242        let data = make_clustered_data();
1243        let config = KdcConfig {
1244            bandwidth: 0.0, // auto
1245            ..Default::default()
1246        };
1247        let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1248        assert!(result.bandwidth > 0.0);
1249    }
1250
1251    #[test]
1252    fn test_kde_clustering_epanechnikov() {
1253        let data = make_clustered_data();
1254        let config = KdcConfig {
1255            kernel: KdeKernel::Epanechnikov,
1256            bandwidth: 2.0,
1257            density_threshold: 0.01,
1258            merge_distance: 1.0,
1259        };
1260        let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
1261        assert_eq!(result.labels.len(), 31);
1262    }
1263
1264    #[test]
1265    fn test_kde_clustering_empty() {
1266        let data = Array2::<f64>::zeros((0, 2));
1267        let config = KdcConfig::default();
1268        assert!(kernel_density_clustering(data.view(), &config).is_err());
1269    }
1270
1271    #[test]
1272    fn test_silverman_bandwidth() {
1273        let data = Array2::from_shape_vec(
1274            (10, 2),
1275            vec![
1276                1.0, 2.0, 1.1, 2.1, 1.2, 1.9, 0.9, 2.2, 1.3, 1.8, 5.0, 6.0, 5.1, 6.1, 5.2, 5.9,
1277                4.9, 6.2, 5.3, 5.8,
1278            ],
1279        )
1280        .expect("shape");
1281        let h = silverman_bandwidth(data.view());
1282        assert!(h > 0.0);
1283    }
1284
1285    // -- LOF tests --
1286
1287    #[test]
1288    fn test_lof_basic() {
1289        let data = make_clustered_data();
1290        let config = LofConfig {
1291            k: 5,
1292            outlier_threshold: 1.5,
1293        };
1294        let result = local_outlier_factor(data.view(), &config).expect("lof failed");
1295        assert_eq!(result.lof_scores.len(), 31);
1296        assert_eq!(result.is_outlier.len(), 31);
1297        assert_eq!(result.k_distances.len(), 31);
1298        assert_eq!(result.lrd.len(), 31);
1299
1300        // The outlier at (10, 10) should have a high LOF score
1301        let outlier_score = result.lof_scores[30];
1302        assert!(
1303            outlier_score > 1.0,
1304            "Outlier LOF score should be > 1, got {}",
1305            outlier_score
1306        );
1307    }
1308
1309    #[test]
1310    fn test_lof_empty() {
1311        let data = Array2::<f64>::zeros((0, 2));
1312        let config = LofConfig::default();
1313        assert!(local_outlier_factor(data.view(), &config).is_err());
1314    }
1315
1316    #[test]
1317    fn test_lof_invalid_k() {
1318        let data = make_clustered_data();
1319        let config = LofConfig {
1320            k: 0,
1321            ..Default::default()
1322        };
1323        assert!(local_outlier_factor(data.view(), &config).is_err());
1324
1325        let config2 = LofConfig {
1326            k: 100, // k >= n
1327            ..Default::default()
1328        };
1329        assert!(local_outlier_factor(data.view(), &config2).is_err());
1330    }
1331
1332    #[test]
1333    fn test_lof_uniform_data() {
1334        // All same point => LOF should be ~1 for all
1335        let data = Array2::from_shape_vec(
1336            (10, 2),
1337            vec![
1338                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1339                1.0, 1.0, 1.0, 1.0,
1340            ],
1341        )
1342        .expect("shape");
1343        let config = LofConfig {
1344            k: 3,
1345            outlier_threshold: 1.5,
1346        };
1347        let result = local_outlier_factor(data.view(), &config).expect("lof failed");
1348        // LOF of identical points should be ~1
1349        for &score in result.lof_scores.iter() {
1350            assert!(
1351                (score - 1.0).abs() < 0.5,
1352                "LOF for identical points should be ~1, got {}",
1353                score
1354            );
1355        }
1356    }
1357
1358    // -- Prim's MST test --
1359
1360    #[test]
1361    fn test_prims_mst() {
1362        let mut dist = Array2::<f64>::zeros((4, 4));
1363        dist[[0, 1]] = 1.0;
1364        dist[[1, 0]] = 1.0;
1365        dist[[0, 2]] = 4.0;
1366        dist[[2, 0]] = 4.0;
1367        dist[[0, 3]] = 3.0;
1368        dist[[3, 0]] = 3.0;
1369        dist[[1, 2]] = 2.0;
1370        dist[[2, 1]] = 2.0;
1371        dist[[1, 3]] = 5.0;
1372        dist[[3, 1]] = 5.0;
1373        dist[[2, 3]] = 1.0;
1374        dist[[3, 2]] = 1.0;
1375
1376        let mst = prims_mst(&dist, 4);
1377        assert_eq!(mst.len(), 3);
1378        let total_weight: f64 = mst.iter().map(|e| e.weight).sum();
1379        assert!((total_weight - 4.0).abs() < 1e-10); // 1 + 2 + 1 = 4
1380    }
1381
1382    // -- Distance helper tests --
1383
1384    #[test]
1385    fn test_dist_sq() {
1386        let a = Array1::from_vec(vec![1.0, 2.0]);
1387        let b = Array1::from_vec(vec![4.0, 6.0]);
1388        let d = dist_sq(a.view(), b.view());
1389        assert!((d - 25.0).abs() < 1e-10);
1390    }
1391
1392    #[test]
1393    fn test_knn_distances() {
1394        let dist = Array2::from_shape_vec(
1395            (4, 4),
1396            vec![
1397                0.0, 1.0, 3.0, 5.0, 1.0, 0.0, 2.0, 4.0, 3.0, 2.0, 0.0, 1.0, 5.0, 4.0, 1.0, 0.0,
1398            ],
1399        )
1400        .expect("shape");
1401        let knn = knn_distances(&dist, 2);
1402        assert_eq!(knn.len(), 4);
1403        // Point 0's nearest: 1 (dist 1), 2 (dist 3)
1404        assert_eq!(knn[0][0].0, 1);
1405        assert!((knn[0][0].1 - 1.0).abs() < 1e-10);
1406    }
1407}