Skip to main content

scirs2_cluster/density/
mod.rs

1//! Density-based clustering algorithms
2//!
3//! This module provides implementations of density-based clustering algorithms such as:
4//! - DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
5//! - OPTICS (Ordering Points To Identify the Clustering Structure)
6//! - HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
7//!
8//! These algorithms are particularly useful for discovering clusters of arbitrary shape
9//! and for identifying noise points in the data. Density-based methods are also capable
10//! of finding clusters of arbitrary shapes and handling clusters of different densities.
11
12pub mod hdbscan;
13pub mod optics;
14
15use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
16use scirs2_core::numeric::{Float, FromPrimitive};
17// Directly implement distance functions
18mod distance {
19    use scirs2_core::numeric::Float;
20
21    /// Calculates the Euclidean distance (L2 norm) between two vectors
22    pub fn euclidean<F: Float>(a: &[F], b: &[F]) -> F {
23        let mut sum = F::zero();
24        for (a_i, b_i) in a.iter().zip(b.iter()) {
25            let diff = *a_i - *b_i;
26            sum = sum + diff * diff;
27        }
28        sum.sqrt()
29    }
30
31    /// Calculates the Manhattan distance (L1 norm) between two vectors
32    pub fn manhattan<F: Float>(a: &[F], b: &[F]) -> F {
33        let mut sum = F::zero();
34        for (a_i, b_i) in a.iter().zip(b.iter()) {
35            sum = sum + (*a_i - *b_i).abs();
36        }
37        sum
38    }
39
40    /// Calculates the Chebyshev distance (L∞ norm) between two vectors
41    pub fn chebyshev<F: Float>(a: &[F], b: &[F]) -> F {
42        let mut max = F::zero();
43        for (a_i, b_i) in a.iter().zip(b.iter()) {
44            let abs_diff = (*a_i - *b_i).abs();
45            if abs_diff > max {
46                max = abs_diff;
47            }
48        }
49        max
50    }
51
52    /// Calculates the Minkowski distance between two vectors with power p
53    pub fn minkowski<F: Float>(a: &[F], b: &[F], p: F) -> F {
54        let mut sum = F::zero();
55        for (a_i, b_i) in a.iter().zip(b.iter()) {
56            sum = sum + ((*a_i - *b_i).abs()).powf(p);
57        }
58        sum.powf(F::one() / p)
59    }
60}
61use std::collections::HashSet;
62use std::fmt::Debug;
63
64use crate::error::{ClusteringError, Result};
65
66/// Labels for DBSCAN clusters
67pub mod labels {
68    /// Noise point label (-1)
69    pub const NOISE: i32 = -1;
70    /// Undefined point label (not yet processed)
71    pub const UNDEFINED: i32 = -2;
72}
73
74/// Distance metric enumeration for clustering algorithms
75#[derive(Debug, Clone, Copy, PartialEq, Eq)]
76pub enum DistanceMetric {
77    /// Euclidean distance (L2 norm)
78    Euclidean,
79
80    /// Manhattan distance (L1 norm)
81    Manhattan,
82
83    /// Chebyshev distance (L∞ norm)
84    Chebyshev,
85
86    /// Minkowski distance with p=3
87    Minkowski,
88}
89
90/// DBSCAN clustering algorithm (Density-Based Spatial Clustering of Applications with Noise)
91///
92/// # Arguments
93///
94/// * `data` - The input data as a 2D array (n_samples x n_features)
95/// * `eps` - The maximum distance between two samples for them to be considered neighbors
96/// * `min_samples` - The minimum number of samples in a neighborhood for a point to be a core point
97/// * `metric` - The distance metric to use (default: Euclidean)
98///
99/// # Returns
100///
101/// * `Result<Array1<i32>>` - Cluster labels for each point in the dataset.
102///   Labels are integers starting from 0, with -1 indicating noise points.
103///
104/// # Examples
105///
106/// ```
107/// use scirs2_core::ndarray::{Array2, ArrayView2};
108/// use scirs2_cluster::density::{dbscan, DistanceMetric};
109///
110/// // Example data with two clusters and noise
111/// let data = Array2::from_shape_vec((8, 2), vec![
112///     1.0, 2.0,   // Cluster 1
113///     1.5, 1.8,   // Cluster 1
114///     1.3, 1.9,   // Cluster 1
115///     5.0, 7.0,   // Cluster 2
116///     5.1, 6.8,   // Cluster 2
117///     5.2, 7.1,   // Cluster 2
118///     0.0, 10.0,  // Noise
119///     10.0, 0.0,  // Noise
120/// ]).expect("Operation failed");
121///
122/// // Run DBSCAN with eps=0.8 and min_samples=2
123/// let labels = dbscan(data.view(), 0.8, 2, Some(DistanceMetric::Euclidean)).expect("Operation failed");
124///
125/// // Print the results
126/// println!("Cluster assignments: {:?}", labels);
127/// ```
128#[allow(dead_code)]
129pub fn dbscan<F: Float + FromPrimitive + Debug + PartialOrd>(
130    data: ArrayView2<F>,
131    eps: F,
132    min_samples: usize,
133    metric: Option<DistanceMetric>,
134) -> Result<Array1<i32>> {
135    let n_samples = data.shape()[0];
136
137    if n_samples == 0 {
138        return Err(ClusteringError::InvalidInput("Empty input data".into()));
139    }
140
141    if eps <= F::zero() {
142        return Err(ClusteringError::InvalidInput("eps must be positive".into()));
143    }
144
145    if min_samples < 1 {
146        return Err(ClusteringError::InvalidInput(
147            "min_samples must be at least 1".into(),
148        ));
149    }
150
151    // Initialize labels to undefined
152    let mut labels = vec![labels::UNDEFINED; n_samples];
153
154    // Keep track of the current cluster label
155    let mut cluster_label: i32 = 0;
156
157    // Calculate pairwise distances
158    let mut distances = Array2::<F>::zeros((n_samples, n_samples));
159
160    // Fill distance matrix
161    for i in 0..n_samples {
162        for j in (i + 1)..n_samples {
163            let point1 = data.row(i).to_vec();
164            let point2 = data.row(j).to_vec();
165
166            let dist = match metric.unwrap_or(DistanceMetric::Euclidean) {
167                DistanceMetric::Euclidean => distance::euclidean(&point1, &point2),
168                DistanceMetric::Manhattan => distance::manhattan(&point1, &point2),
169                DistanceMetric::Chebyshev => distance::chebyshev(&point1, &point2),
170                DistanceMetric::Minkowski => distance::minkowski(
171                    &point1,
172                    &point2,
173                    F::from(3.0).expect("Failed to convert constant to float"),
174                ),
175            };
176
177            distances[[i, j]] = dist;
178            distances[[j, i]] = dist; // Distance matrix is symmetric
179        }
180    }
181
182    // Create a map of neighbor indices for each point
183    let mut neighborhoods: Vec<Vec<usize>> = Vec::with_capacity(n_samples);
184
185    for i in 0..n_samples {
186        let mut neighbors = Vec::new();
187        for j in 0..n_samples {
188            if i != j && distances[[i, j]] <= eps {
189                neighbors.push(j);
190            }
191        }
192        neighborhoods.push(neighbors);
193    }
194
195    // Main DBSCAN algorithm
196    for point_idx in 0..n_samples {
197        // Skip already processed points
198        if labels[point_idx] != labels::UNDEFINED {
199            continue;
200        }
201
202        // Check if point has enough neighbors to be a core point
203        let neighbors = &neighborhoods[point_idx];
204
205        if neighbors.len() + 1 < min_samples {
206            // Mark as noise (may be assigned to a cluster later as a border point)
207            labels[point_idx] = labels::NOISE;
208            continue;
209        }
210
211        // Start a new cluster
212        labels[point_idx] = cluster_label;
213
214        // Process neighbors using a queue (breadth-first search)
215        let mut queue: Vec<usize> = neighbors.clone();
216        let mut processed = HashSet::new();
217        processed.insert(point_idx);
218
219        let mut idx = 0;
220        while idx < queue.len() {
221            let current = queue[idx];
222            idx += 1;
223
224            // Skip already processed points
225            if processed.contains(&current) {
226                continue;
227            }
228            processed.insert(current);
229
230            // If was previously noise, add to cluster
231            if labels[current] == labels::NOISE {
232                labels[current] = cluster_label;
233                continue;
234            }
235
236            // If already part of another cluster, skip
237            if labels[current] != labels::UNDEFINED {
238                continue;
239            }
240
241            // Add to current cluster
242            labels[current] = cluster_label;
243
244            // If it's a core point, add its neighbors to the queue
245            let current_neighbors = &neighborhoods[current];
246            if current_neighbors.len() + 1 >= min_samples {
247                for &neighbor in current_neighbors {
248                    if !processed.contains(&neighbor) {
249                        queue.push(neighbor);
250                    }
251                }
252            }
253        }
254
255        // Move to next cluster
256        cluster_label += 1;
257    }
258
259    Ok(Array1::from(labels))
260}
261
262/// Runs the OPTICS clustering algorithm
263///
264/// This is a wrapper around the more detailed implementation in the optics module.
265/// See `optics::optics` for more details.
266///
267/// # Arguments
268///
269/// * `data` - The input data as a 2D array (n_samples x n_features)
270/// * `min_samples` - The minimum number of samples required for a point to be a core point
271/// * `max_eps` - The maximum distance for neighborhood consideration (default: infinity)
272/// * `metric` - Distance metric to use (default: Euclidean)
273///
274/// # Returns
275///
276/// * `Result<Array1<i32>>` - Cluster labels from a default extraction (using DBSCAN-like extraction
277///   with a reasonable epsilon value), or an error if the computation fails.
278#[allow(dead_code)]
279pub fn optics<F: Float + FromPrimitive + Debug + PartialOrd>(
280    data: ArrayView2<F>,
281    min_samples: usize,
282    max_eps: Option<F>,
283    metric: Option<DistanceMetric>,
284) -> Result<Array1<i32>> {
285    let result = optics::optics(data, min_samples, max_eps, metric)?;
286
287    // Convert max_eps to f64 for use with default extraction
288    let default_eps = match max_eps {
289        Some(_eps) => _eps.to_f64().unwrap_or(0.5) / 10.0,
290        None => 0.5,
291    };
292
293    Ok(optics::extract_dbscan_clustering(&result, default_eps))
294}