scirs2_cluster/density/mod.rs
1//! Density-based clustering algorithms
2//!
3//! This module provides implementations of density-based clustering algorithms such as:
4//! - DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
5//! - OPTICS (Ordering Points To Identify the Clustering Structure)
6//! - HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
7//!
8//! These algorithms are particularly useful for discovering clusters of arbitrary shape
9//! and for identifying noise points in the data. Density-based methods are also capable
10//! of finding clusters of arbitrary shapes and handling clusters of different densities.
11
12pub mod hdbscan;
13pub mod optics;
14
15use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
16use scirs2_core::numeric::{Float, FromPrimitive};
17// Directly implement distance functions
18mod distance {
19 use scirs2_core::numeric::Float;
20
21 /// Calculates the Euclidean distance (L2 norm) between two vectors
22 pub fn euclidean<F: Float>(a: &[F], b: &[F]) -> F {
23 let mut sum = F::zero();
24 for (a_i, b_i) in a.iter().zip(b.iter()) {
25 let diff = *a_i - *b_i;
26 sum = sum + diff * diff;
27 }
28 sum.sqrt()
29 }
30
31 /// Calculates the Manhattan distance (L1 norm) between two vectors
32 pub fn manhattan<F: Float>(a: &[F], b: &[F]) -> F {
33 let mut sum = F::zero();
34 for (a_i, b_i) in a.iter().zip(b.iter()) {
35 sum = sum + (*a_i - *b_i).abs();
36 }
37 sum
38 }
39
40 /// Calculates the Chebyshev distance (L∞ norm) between two vectors
41 pub fn chebyshev<F: Float>(a: &[F], b: &[F]) -> F {
42 let mut max = F::zero();
43 for (a_i, b_i) in a.iter().zip(b.iter()) {
44 let abs_diff = (*a_i - *b_i).abs();
45 if abs_diff > max {
46 max = abs_diff;
47 }
48 }
49 max
50 }
51
52 /// Calculates the Minkowski distance between two vectors with power p
53 pub fn minkowski<F: Float>(a: &[F], b: &[F], p: F) -> F {
54 let mut sum = F::zero();
55 for (a_i, b_i) in a.iter().zip(b.iter()) {
56 sum = sum + ((*a_i - *b_i).abs()).powf(p);
57 }
58 sum.powf(F::one() / p)
59 }
60}
61use std::collections::HashSet;
62use std::fmt::Debug;
63
64use crate::error::{ClusteringError, Result};
65
66/// Labels for DBSCAN clusters
67pub mod labels {
68 /// Noise point label (-1)
69 pub const NOISE: i32 = -1;
70 /// Undefined point label (not yet processed)
71 pub const UNDEFINED: i32 = -2;
72}
73
74/// Distance metric enumeration for clustering algorithms
75#[derive(Debug, Clone, Copy, PartialEq, Eq)]
76pub enum DistanceMetric {
77 /// Euclidean distance (L2 norm)
78 Euclidean,
79
80 /// Manhattan distance (L1 norm)
81 Manhattan,
82
83 /// Chebyshev distance (L∞ norm)
84 Chebyshev,
85
86 /// Minkowski distance with p=3
87 Minkowski,
88}
89
90/// DBSCAN clustering algorithm (Density-Based Spatial Clustering of Applications with Noise)
91///
92/// # Arguments
93///
94/// * `data` - The input data as a 2D array (n_samples x n_features)
95/// * `eps` - The maximum distance between two samples for them to be considered neighbors
96/// * `min_samples` - The minimum number of samples in a neighborhood for a point to be a core point
97/// * `metric` - The distance metric to use (default: Euclidean)
98///
99/// # Returns
100///
101/// * `Result<Array1<i32>>` - Cluster labels for each point in the dataset.
102/// Labels are integers starting from 0, with -1 indicating noise points.
103///
104/// # Examples
105///
106/// ```
107/// use scirs2_core::ndarray::{Array2, ArrayView2};
108/// use scirs2_cluster::density::{dbscan, DistanceMetric};
109///
110/// // Example data with two clusters and noise
111/// let data = Array2::from_shape_vec((8, 2), vec![
112/// 1.0, 2.0, // Cluster 1
113/// 1.5, 1.8, // Cluster 1
114/// 1.3, 1.9, // Cluster 1
115/// 5.0, 7.0, // Cluster 2
116/// 5.1, 6.8, // Cluster 2
117/// 5.2, 7.1, // Cluster 2
118/// 0.0, 10.0, // Noise
119/// 10.0, 0.0, // Noise
120/// ]).expect("Operation failed");
121///
122/// // Run DBSCAN with eps=0.8 and min_samples=2
123/// let labels = dbscan(data.view(), 0.8, 2, Some(DistanceMetric::Euclidean)).expect("Operation failed");
124///
125/// // Print the results
126/// println!("Cluster assignments: {:?}", labels);
127/// ```
128#[allow(dead_code)]
129pub fn dbscan<F: Float + FromPrimitive + Debug + PartialOrd>(
130 data: ArrayView2<F>,
131 eps: F,
132 min_samples: usize,
133 metric: Option<DistanceMetric>,
134) -> Result<Array1<i32>> {
135 let n_samples = data.shape()[0];
136
137 if n_samples == 0 {
138 return Err(ClusteringError::InvalidInput("Empty input data".into()));
139 }
140
141 if eps <= F::zero() {
142 return Err(ClusteringError::InvalidInput("eps must be positive".into()));
143 }
144
145 if min_samples < 1 {
146 return Err(ClusteringError::InvalidInput(
147 "min_samples must be at least 1".into(),
148 ));
149 }
150
151 // Initialize labels to undefined
152 let mut labels = vec![labels::UNDEFINED; n_samples];
153
154 // Keep track of the current cluster label
155 let mut cluster_label: i32 = 0;
156
157 // Calculate pairwise distances
158 let mut distances = Array2::<F>::zeros((n_samples, n_samples));
159
160 // Fill distance matrix
161 for i in 0..n_samples {
162 for j in (i + 1)..n_samples {
163 let point1 = data.row(i).to_vec();
164 let point2 = data.row(j).to_vec();
165
166 let dist = match metric.unwrap_or(DistanceMetric::Euclidean) {
167 DistanceMetric::Euclidean => distance::euclidean(&point1, &point2),
168 DistanceMetric::Manhattan => distance::manhattan(&point1, &point2),
169 DistanceMetric::Chebyshev => distance::chebyshev(&point1, &point2),
170 DistanceMetric::Minkowski => distance::minkowski(
171 &point1,
172 &point2,
173 F::from(3.0).expect("Failed to convert constant to float"),
174 ),
175 };
176
177 distances[[i, j]] = dist;
178 distances[[j, i]] = dist; // Distance matrix is symmetric
179 }
180 }
181
182 // Create a map of neighbor indices for each point
183 let mut neighborhoods: Vec<Vec<usize>> = Vec::with_capacity(n_samples);
184
185 for i in 0..n_samples {
186 let mut neighbors = Vec::new();
187 for j in 0..n_samples {
188 if i != j && distances[[i, j]] <= eps {
189 neighbors.push(j);
190 }
191 }
192 neighborhoods.push(neighbors);
193 }
194
195 // Main DBSCAN algorithm
196 for point_idx in 0..n_samples {
197 // Skip already processed points
198 if labels[point_idx] != labels::UNDEFINED {
199 continue;
200 }
201
202 // Check if point has enough neighbors to be a core point
203 let neighbors = &neighborhoods[point_idx];
204
205 if neighbors.len() + 1 < min_samples {
206 // Mark as noise (may be assigned to a cluster later as a border point)
207 labels[point_idx] = labels::NOISE;
208 continue;
209 }
210
211 // Start a new cluster
212 labels[point_idx] = cluster_label;
213
214 // Process neighbors using a queue (breadth-first search)
215 let mut queue: Vec<usize> = neighbors.clone();
216 let mut processed = HashSet::new();
217 processed.insert(point_idx);
218
219 let mut idx = 0;
220 while idx < queue.len() {
221 let current = queue[idx];
222 idx += 1;
223
224 // Skip already processed points
225 if processed.contains(¤t) {
226 continue;
227 }
228 processed.insert(current);
229
230 // If was previously noise, add to cluster
231 if labels[current] == labels::NOISE {
232 labels[current] = cluster_label;
233 continue;
234 }
235
236 // If already part of another cluster, skip
237 if labels[current] != labels::UNDEFINED {
238 continue;
239 }
240
241 // Add to current cluster
242 labels[current] = cluster_label;
243
244 // If it's a core point, add its neighbors to the queue
245 let current_neighbors = &neighborhoods[current];
246 if current_neighbors.len() + 1 >= min_samples {
247 for &neighbor in current_neighbors {
248 if !processed.contains(&neighbor) {
249 queue.push(neighbor);
250 }
251 }
252 }
253 }
254
255 // Move to next cluster
256 cluster_label += 1;
257 }
258
259 Ok(Array1::from(labels))
260}
261
262/// Runs the OPTICS clustering algorithm
263///
264/// This is a wrapper around the more detailed implementation in the optics module.
265/// See `optics::optics` for more details.
266///
267/// # Arguments
268///
269/// * `data` - The input data as a 2D array (n_samples x n_features)
270/// * `min_samples` - The minimum number of samples required for a point to be a core point
271/// * `max_eps` - The maximum distance for neighborhood consideration (default: infinity)
272/// * `metric` - Distance metric to use (default: Euclidean)
273///
274/// # Returns
275///
276/// * `Result<Array1<i32>>` - Cluster labels from a default extraction (using DBSCAN-like extraction
277/// with a reasonable epsilon value), or an error if the computation fails.
278#[allow(dead_code)]
279pub fn optics<F: Float + FromPrimitive + Debug + PartialOrd>(
280 data: ArrayView2<F>,
281 min_samples: usize,
282 max_eps: Option<F>,
283 metric: Option<DistanceMetric>,
284) -> Result<Array1<i32>> {
285 let result = optics::optics(data, min_samples, max_eps, metric)?;
286
287 // Convert max_eps to f64 for use with default extraction
288 let default_eps = match max_eps {
289 Some(_eps) => _eps.to_f64().unwrap_or(0.5) / 10.0,
290 None => 0.5,
291 };
292
293 Ok(optics::extract_dbscan_clustering(&result, default_eps))
294}