oxigdal_algorithms/vector/clustering/
dbscan.rs1use crate::error::{AlgorithmError, Result};
6use oxigdal_core::vector::Point;
7use std::collections::{HashMap, HashSet, VecDeque};
8
9#[derive(Debug, Clone)]
11pub struct DbscanOptions {
12 pub epsilon: f64,
14 pub min_points: usize,
16 pub metric: DistanceMetric,
18}
19
20impl Default for DbscanOptions {
21 fn default() -> Self {
22 Self {
23 epsilon: 0.5,
24 min_points: 5,
25 metric: DistanceMetric::Euclidean,
26 }
27 }
28}
29
30#[derive(Debug, Clone, Copy, PartialEq, Eq)]
32pub enum DistanceMetric {
33 Euclidean,
35 Manhattan,
37 Haversine,
39}
40
41#[derive(Debug, Clone)]
43pub struct DbscanResult {
44 pub labels: Vec<i32>,
46 pub num_clusters: usize,
48 pub noise_points: Vec<usize>,
50 pub cluster_sizes: HashMap<i32, usize>,
52}
53
54pub fn dbscan_cluster(points: &[Point], options: &DbscanOptions) -> Result<DbscanResult> {
93 if points.is_empty() {
94 return Err(AlgorithmError::InvalidInput(
95 "Cannot cluster empty point set".to_string(),
96 ));
97 }
98
99 let n = points.len();
100 let mut labels = vec![-1; n]; let mut cluster_id = 0;
102
103 for i in 0..n {
104 if labels[i] != -1 {
105 continue; }
107
108 let neighbors = range_query(points, i, options.epsilon, options.metric);
109
110 if neighbors.len() < options.min_points {
111 labels[i] = 0; continue;
113 }
114
115 cluster_id += 1;
117 expand_cluster(points, i, &neighbors, cluster_id, &mut labels, options)?;
118 }
119
120 let mut noise_points = Vec::new();
122 let mut cluster_sizes: HashMap<i32, usize> = HashMap::new();
123
124 for (idx, &label) in labels.iter().enumerate() {
125 if label == 0 {
126 noise_points.push(idx);
127 } else if label > 0 {
128 *cluster_sizes.entry(label).or_insert(0) += 1;
129 }
130 }
131
132 Ok(DbscanResult {
133 labels,
134 num_clusters: cluster_id as usize,
135 noise_points,
136 cluster_sizes,
137 })
138}
139
140fn expand_cluster(
142 points: &[Point],
143 point_idx: usize,
144 neighbors: &[usize],
145 cluster_id: i32,
146 labels: &mut [i32],
147 options: &DbscanOptions,
148) -> Result<()> {
149 labels[point_idx] = cluster_id;
150
151 let mut queue = VecDeque::from(neighbors.to_vec());
152 let mut processed = HashSet::new();
153 processed.insert(point_idx);
154
155 while let Some(neighbor_idx) = queue.pop_front() {
156 if processed.contains(&neighbor_idx) {
157 continue;
158 }
159 processed.insert(neighbor_idx);
160
161 if labels[neighbor_idx] == 0 {
162 labels[neighbor_idx] = cluster_id;
164 }
165
166 if labels[neighbor_idx] != -1 {
167 continue; }
169
170 labels[neighbor_idx] = cluster_id;
171
172 let neighbor_neighbors = range_query(points, neighbor_idx, options.epsilon, options.metric);
173
174 if neighbor_neighbors.len() >= options.min_points {
175 for &nn in &neighbor_neighbors {
177 if !processed.contains(&nn) {
178 queue.push_back(nn);
179 }
180 }
181 }
182 }
183
184 Ok(())
185}
186
187fn range_query(
189 points: &[Point],
190 point_idx: usize,
191 epsilon: f64,
192 metric: DistanceMetric,
193) -> Vec<usize> {
194 let query_point = &points[point_idx];
195 let mut neighbors = Vec::new();
196
197 for (i, point) in points.iter().enumerate() {
198 let dist = calculate_distance(query_point, point, metric);
199 if dist <= epsilon {
200 neighbors.push(i);
201 }
202 }
203
204 neighbors
205}
206
207pub fn calculate_distance(p1: &Point, p2: &Point, metric: DistanceMetric) -> f64 {
209 match metric {
210 DistanceMetric::Euclidean => {
211 let dx = p1.coord.x - p2.coord.x;
212 let dy = p1.coord.y - p2.coord.y;
213 (dx * dx + dy * dy).sqrt()
214 }
215 DistanceMetric::Manhattan => {
216 let dx = (p1.coord.x - p2.coord.x).abs();
217 let dy = (p1.coord.y - p2.coord.y).abs();
218 dx + dy
219 }
220 DistanceMetric::Haversine => haversine_distance(p1, p2),
221 }
222}
223
224fn haversine_distance(p1: &Point, p2: &Point) -> f64 {
226 const EARTH_RADIUS: f64 = 6371000.0; let lat1 = p1.coord.y.to_radians();
229 let lat2 = p2.coord.y.to_radians();
230 let delta_lat = (p2.coord.y - p1.coord.y).to_radians();
231 let delta_lon = (p2.coord.x - p1.coord.x).to_radians();
232
233 let a =
234 (delta_lat / 2.0).sin().powi(2) + lat1.cos() * lat2.cos() * (delta_lon / 2.0).sin().powi(2);
235
236 let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());
237
238 EARTH_RADIUS * c
239}
240
241#[cfg(test)]
242mod tests {
243 use super::*;
244
245 #[test]
246 fn test_dbscan_simple() {
247 let points = vec![
248 Point::new(0.0, 0.0),
249 Point::new(0.1, 0.1),
250 Point::new(0.2, 0.1),
251 Point::new(5.0, 5.0),
252 Point::new(5.1, 5.0),
253 ];
254
255 let options = DbscanOptions {
256 epsilon: 0.5,
257 min_points: 2,
258 ..Default::default()
259 };
260
261 let result = dbscan_cluster(&points, &options);
262 assert!(result.is_ok());
263
264 let clustering = result.expect("Clustering failed");
265 assert!(clustering.num_clusters >= 1);
266 }
267
268 #[test]
269 fn test_dbscan_all_noise() {
270 let points = vec![
271 Point::new(0.0, 0.0),
272 Point::new(10.0, 10.0),
273 Point::new(20.0, 20.0),
274 ];
275
276 let options = DbscanOptions {
277 epsilon: 0.5,
278 min_points: 2,
279 ..Default::default()
280 };
281
282 let result = dbscan_cluster(&points, &options);
283 assert!(result.is_ok());
284
285 let clustering = result.expect("Clustering failed");
286 assert_eq!(clustering.num_clusters, 0);
287 assert_eq!(clustering.noise_points.len(), 3);
288 }
289
290 #[test]
291 fn test_haversine_distance() {
292 let p1 = Point::new(0.0, 0.0);
293 let p2 = Point::new(0.0, 0.01); let dist = haversine_distance(&p1, &p2);
296 assert!(dist > 1000.0 && dist < 1200.0); }
298
299 #[test]
300 fn test_euclidean_distance() {
301 let p1 = Point::new(0.0, 0.0);
302 let p2 = Point::new(3.0, 4.0);
303
304 let dist = calculate_distance(&p1, &p2, DistanceMetric::Euclidean);
305 assert!((dist - 5.0).abs() < 1e-6);
306 }
307
308 #[test]
309 fn test_manhattan_distance() {
310 let p1 = Point::new(0.0, 0.0);
311 let p2 = Point::new(3.0, 4.0);
312
313 let dist = calculate_distance(&p1, &p2, DistanceMetric::Manhattan);
314 assert!((dist - 7.0).abs() < 1e-6);
315 }
316}