Skip to main content

oxigdal_algorithms/vector/clustering/
dbscan.rs

1//! DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
2//!
3//! Finds clusters of arbitrary shape based on density connectivity.
4
5use crate::error::{AlgorithmError, Result};
6use oxigdal_core::vector::Point;
7use std::collections::{HashMap, HashSet, VecDeque};
8
9/// Options for DBSCAN clustering
10#[derive(Debug, Clone)]
11pub struct DbscanOptions {
12    /// Maximum distance for neighborhood (epsilon)
13    pub epsilon: f64,
14    /// Minimum number of points to form a dense region
15    pub min_points: usize,
16    /// Distance metric
17    pub metric: DistanceMetric,
18}
19
20impl Default for DbscanOptions {
21    fn default() -> Self {
22        Self {
23            epsilon: 0.5,
24            min_points: 5,
25            metric: DistanceMetric::Euclidean,
26        }
27    }
28}
29
30/// Distance metric for clustering
31#[derive(Debug, Clone, Copy, PartialEq, Eq)]
32pub enum DistanceMetric {
33    /// Euclidean distance
34    Euclidean,
35    /// Manhattan distance
36    Manhattan,
37    /// Haversine distance (for geographic coordinates)
38    Haversine,
39}
40
41/// Result of DBSCAN clustering
42#[derive(Debug, Clone)]
43pub struct DbscanResult {
44    /// Cluster assignments (-1 for noise points)
45    pub labels: Vec<i32>,
46    /// Number of clusters found
47    pub num_clusters: usize,
48    /// Indices of noise points
49    pub noise_points: Vec<usize>,
50    /// Cluster statistics
51    pub cluster_sizes: HashMap<i32, usize>,
52}
53
54/// Perform DBSCAN clustering on points
55///
56/// # Arguments
57///
58/// * `points` - Points to cluster
59/// * `options` - DBSCAN options
60///
61/// # Returns
62///
63/// Clustering result with labels for each point
64///
65/// # Examples
66///
67/// ```
68/// use oxigdal_algorithms::vector::clustering::{dbscan_cluster, DbscanOptions};
69/// use oxigdal_algorithms::Point;
70/// # use oxigdal_algorithms::error::Result;
71///
72/// # fn main() -> Result<()> {
73/// let points = vec![
74///     Point::new(0.0, 0.0),
75///     Point::new(0.1, 0.1),
76///     Point::new(0.2, 0.1),
77///     Point::new(5.0, 5.0),
78///     Point::new(5.1, 5.0),
79/// ];
80///
81/// let options = DbscanOptions {
82///     epsilon: 0.5,
83///     min_points: 2,
84///     ..Default::default()
85/// };
86///
87/// let result = dbscan_cluster(&points, &options)?;
88/// assert!(result.num_clusters >= 1);
89/// # Ok(())
90/// # }
91/// ```
92pub fn dbscan_cluster(points: &[Point], options: &DbscanOptions) -> Result<DbscanResult> {
93    if points.is_empty() {
94        return Err(AlgorithmError::InvalidInput(
95            "Cannot cluster empty point set".to_string(),
96        ));
97    }
98
99    let n = points.len();
100    let mut labels = vec![-1; n]; // -1 = unvisited, 0 = noise, >0 = cluster ID
101    let mut cluster_id = 0;
102
103    for i in 0..n {
104        if labels[i] != -1 {
105            continue; // Already processed
106        }
107
108        let neighbors = range_query(points, i, options.epsilon, options.metric);
109
110        if neighbors.len() < options.min_points {
111            labels[i] = 0; // Mark as noise
112            continue;
113        }
114
115        // Start new cluster
116        cluster_id += 1;
117        expand_cluster(points, i, &neighbors, cluster_id, &mut labels, options)?;
118    }
119
120    // Build result
121    let mut noise_points = Vec::new();
122    let mut cluster_sizes: HashMap<i32, usize> = HashMap::new();
123
124    for (idx, &label) in labels.iter().enumerate() {
125        if label == 0 {
126            noise_points.push(idx);
127        } else if label > 0 {
128            *cluster_sizes.entry(label).or_insert(0) += 1;
129        }
130    }
131
132    Ok(DbscanResult {
133        labels,
134        num_clusters: cluster_id as usize,
135        noise_points,
136        cluster_sizes,
137    })
138}
139
140/// Expand a cluster from a core point
141fn expand_cluster(
142    points: &[Point],
143    point_idx: usize,
144    neighbors: &[usize],
145    cluster_id: i32,
146    labels: &mut [i32],
147    options: &DbscanOptions,
148) -> Result<()> {
149    labels[point_idx] = cluster_id;
150
151    let mut queue = VecDeque::from(neighbors.to_vec());
152    let mut processed = HashSet::new();
153    processed.insert(point_idx);
154
155    while let Some(neighbor_idx) = queue.pop_front() {
156        if processed.contains(&neighbor_idx) {
157            continue;
158        }
159        processed.insert(neighbor_idx);
160
161        if labels[neighbor_idx] == 0 {
162            // Change noise to border point
163            labels[neighbor_idx] = cluster_id;
164        }
165
166        if labels[neighbor_idx] != -1 {
167            continue; // Already in a cluster
168        }
169
170        labels[neighbor_idx] = cluster_id;
171
172        let neighbor_neighbors = range_query(points, neighbor_idx, options.epsilon, options.metric);
173
174        if neighbor_neighbors.len() >= options.min_points {
175            // This is also a core point
176            for &nn in &neighbor_neighbors {
177                if !processed.contains(&nn) {
178                    queue.push_back(nn);
179                }
180            }
181        }
182    }
183
184    Ok(())
185}
186
187/// Find all points within epsilon distance of a given point
188fn range_query(
189    points: &[Point],
190    point_idx: usize,
191    epsilon: f64,
192    metric: DistanceMetric,
193) -> Vec<usize> {
194    let query_point = &points[point_idx];
195    let mut neighbors = Vec::new();
196
197    for (i, point) in points.iter().enumerate() {
198        let dist = calculate_distance(query_point, point, metric);
199        if dist <= epsilon {
200            neighbors.push(i);
201        }
202    }
203
204    neighbors
205}
206
207/// Calculate distance between two points
208pub fn calculate_distance(p1: &Point, p2: &Point, metric: DistanceMetric) -> f64 {
209    match metric {
210        DistanceMetric::Euclidean => {
211            let dx = p1.coord.x - p2.coord.x;
212            let dy = p1.coord.y - p2.coord.y;
213            (dx * dx + dy * dy).sqrt()
214        }
215        DistanceMetric::Manhattan => {
216            let dx = (p1.coord.x - p2.coord.x).abs();
217            let dy = (p1.coord.y - p2.coord.y).abs();
218            dx + dy
219        }
220        DistanceMetric::Haversine => haversine_distance(p1, p2),
221    }
222}
223
224/// Calculate Haversine distance between two geographic points (in meters)
225fn haversine_distance(p1: &Point, p2: &Point) -> f64 {
226    const EARTH_RADIUS: f64 = 6371000.0; // meters
227
228    let lat1 = p1.coord.y.to_radians();
229    let lat2 = p2.coord.y.to_radians();
230    let delta_lat = (p2.coord.y - p1.coord.y).to_radians();
231    let delta_lon = (p2.coord.x - p1.coord.x).to_radians();
232
233    let a =
234        (delta_lat / 2.0).sin().powi(2) + lat1.cos() * lat2.cos() * (delta_lon / 2.0).sin().powi(2);
235
236    let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());
237
238    EARTH_RADIUS * c
239}
240
241#[cfg(test)]
242mod tests {
243    use super::*;
244
245    #[test]
246    fn test_dbscan_simple() {
247        let points = vec![
248            Point::new(0.0, 0.0),
249            Point::new(0.1, 0.1),
250            Point::new(0.2, 0.1),
251            Point::new(5.0, 5.0),
252            Point::new(5.1, 5.0),
253        ];
254
255        let options = DbscanOptions {
256            epsilon: 0.5,
257            min_points: 2,
258            ..Default::default()
259        };
260
261        let result = dbscan_cluster(&points, &options);
262        assert!(result.is_ok());
263
264        let clustering = result.expect("Clustering failed");
265        assert!(clustering.num_clusters >= 1);
266    }
267
268    #[test]
269    fn test_dbscan_all_noise() {
270        let points = vec![
271            Point::new(0.0, 0.0),
272            Point::new(10.0, 10.0),
273            Point::new(20.0, 20.0),
274        ];
275
276        let options = DbscanOptions {
277            epsilon: 0.5,
278            min_points: 2,
279            ..Default::default()
280        };
281
282        let result = dbscan_cluster(&points, &options);
283        assert!(result.is_ok());
284
285        let clustering = result.expect("Clustering failed");
286        assert_eq!(clustering.num_clusters, 0);
287        assert_eq!(clustering.noise_points.len(), 3);
288    }
289
290    #[test]
291    fn test_haversine_distance() {
292        let p1 = Point::new(0.0, 0.0);
293        let p2 = Point::new(0.0, 0.01); // ~1.11 km
294
295        let dist = haversine_distance(&p1, &p2);
296        assert!(dist > 1000.0 && dist < 1200.0); // Approximately 1.11 km
297    }
298
299    #[test]
300    fn test_euclidean_distance() {
301        let p1 = Point::new(0.0, 0.0);
302        let p2 = Point::new(3.0, 4.0);
303
304        let dist = calculate_distance(&p1, &p2, DistanceMetric::Euclidean);
305        assert!((dist - 5.0).abs() < 1e-6);
306    }
307
308    #[test]
309    fn test_manhattan_distance() {
310        let p1 = Point::new(0.0, 0.0);
311        let p2 = Point::new(3.0, 4.0);
312
313        let dist = calculate_distance(&p1, &p2, DistanceMetric::Manhattan);
314        assert!((dist - 7.0).abs() < 1e-6);
315    }
316}