Skip to main content

oxigdal_analytics/clustering/
mod.rs

1//! Spatial Clustering Module
2//!
3//! This module provides clustering algorithms for geospatial data including:
4//! - K-means clustering for image classification
5//! - DBSCAN for spatial outlier detection
6//! - Cluster validation and quality metrics
7
8pub mod dbscan;
9pub mod kmeans;
10
11pub use dbscan::{DbscanClusterer, DbscanResult};
12pub use kmeans::{KMeansClusterer, KMeansResult};
13
14use crate::error::{AnalyticsError, Result};
15use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
16
17/// Cluster assignment and statistics
18#[derive(Debug, Clone)]
19pub struct ClusterInfo {
20    /// Cluster assignments for each point (-1 for noise/unassigned)
21    pub labels: Array1<i32>,
22    /// Number of clusters found
23    pub n_clusters: usize,
24    /// Cluster centers (if applicable)
25    pub centers: Option<Array2<f64>>,
26}
27
28/// Calculate silhouette score for clustering quality
29///
30/// Silhouette coefficient is a measure of how similar an object is to its own cluster
31/// compared to other clusters. Range: [-1, 1], higher is better.
32///
33/// # Arguments
34/// * `data` - Feature matrix (n_samples × n_features)
35/// * `labels` - Cluster assignments
36///
37/// # Errors
38/// Returns error if computation fails or data is invalid
39pub fn silhouette_score(data: &ArrayView2<f64>, labels: &Array1<i32>) -> Result<f64> {
40    let n_samples = data.nrows();
41
42    if n_samples != labels.len() {
43        return Err(AnalyticsError::dimension_mismatch(
44            format!("{}", n_samples),
45            format!("{}", labels.len()),
46        ));
47    }
48
49    if n_samples < 2 {
50        return Err(AnalyticsError::insufficient_data(
51            "Need at least 2 samples for silhouette score",
52        ));
53    }
54
55    // Find unique clusters (excluding noise label -1)
56    let mut unique_labels: Vec<i32> = labels.iter().copied().filter(|&x| x >= 0).collect();
57    unique_labels.sort_unstable();
58    unique_labels.dedup();
59
60    if unique_labels.len() < 2 {
61        return Err(AnalyticsError::insufficient_data(
62            "Need at least 2 clusters for silhouette score",
63        ));
64    }
65
66    let mut silhouette_sum = 0.0;
67    let mut n_valid = 0;
68
69    for i in 0..n_samples {
70        let label_i = labels[i];
71        if label_i < 0 {
72            continue; // Skip noise points
73        }
74
75        // Calculate a(i): average distance to points in same cluster
76        let mut a_sum = 0.0;
77        let mut a_count = 0;
78        for j in 0..n_samples {
79            if i != j && labels[j] == label_i {
80                a_sum += euclidean_distance(&data.row(i), &data.row(j))?;
81                a_count += 1;
82            }
83        }
84        let a_i = if a_count > 0 {
85            a_sum / (a_count as f64)
86        } else {
87            0.0
88        };
89
90        // Calculate b(i): minimum average distance to points in other clusters
91        let mut b_i = f64::INFINITY;
92        for &other_label in &unique_labels {
93            if other_label == label_i {
94                continue;
95            }
96
97            let mut b_sum = 0.0;
98            let mut b_count = 0;
99            for j in 0..n_samples {
100                if labels[j] == other_label {
101                    b_sum += euclidean_distance(&data.row(i), &data.row(j))?;
102                    b_count += 1;
103                }
104            }
105
106            if b_count > 0 {
107                let avg_dist = b_sum / (b_count as f64);
108                b_i = b_i.min(avg_dist);
109            }
110        }
111
112        // Calculate silhouette coefficient for point i
113        if b_i.is_finite() && a_i.is_finite() {
114            let max_val = a_i.max(b_i);
115            if max_val > f64::EPSILON {
116                silhouette_sum += (b_i - a_i) / max_val;
117                n_valid += 1;
118            }
119        }
120    }
121
122    if n_valid == 0 {
123        return Err(AnalyticsError::clustering_error(
124            "No valid silhouette coefficients computed",
125        ));
126    }
127
128    Ok(silhouette_sum / (n_valid as f64))
129}
130
131/// Calculate euclidean distance between two points
132fn euclidean_distance(
133    p1: &scirs2_core::ndarray::ArrayView1<f64>,
134    p2: &scirs2_core::ndarray::ArrayView1<f64>,
135) -> Result<f64> {
136    if p1.len() != p2.len() {
137        return Err(AnalyticsError::dimension_mismatch(
138            format!("{}", p1.len()),
139            format!("{}", p2.len()),
140        ));
141    }
142
143    let dist_sq: f64 = p1.iter().zip(p2.iter()).map(|(a, b)| (a - b).powi(2)).sum();
144
145    Ok(dist_sq.sqrt())
146}
147
148/// Calculate Davies-Bouldin Index for clustering quality
149///
150/// Lower values indicate better clustering.
151///
152/// # Arguments
153/// * `data` - Feature matrix (n_samples × n_features)
154/// * `labels` - Cluster assignments
155/// * `centers` - Cluster centers
156///
157/// # Errors
158/// Returns error if computation fails
159pub fn davies_bouldin_index(
160    data: &ArrayView2<f64>,
161    labels: &Array1<i32>,
162    centers: &Array2<f64>,
163) -> Result<f64> {
164    let n_clusters = centers.nrows();
165    if n_clusters < 2 {
166        return Err(AnalyticsError::insufficient_data(
167            "Need at least 2 clusters for Davies-Bouldin index",
168        ));
169    }
170
171    // Calculate average within-cluster distances
172    let mut s = vec![0.0; n_clusters];
173    let mut counts = vec![0; n_clusters];
174
175    for (i, &label) in labels.iter().enumerate() {
176        if label >= 0 {
177            let cluster_idx = label as usize;
178            if cluster_idx < n_clusters {
179                let dist = euclidean_distance(&data.row(i), &centers.row(cluster_idx))?;
180                s[cluster_idx] += dist;
181                counts[cluster_idx] += 1;
182            }
183        }
184    }
185
186    for i in 0..n_clusters {
187        if counts[i] > 0 {
188            s[i] /= counts[i] as f64;
189        }
190    }
191
192    // Calculate Davies-Bouldin index
193    let mut db_sum = 0.0;
194    for i in 0..n_clusters {
195        let mut max_ratio: f64 = 0.0;
196        for j in 0..n_clusters {
197            if i != j {
198                let m_ij = euclidean_distance(&centers.row(i), &centers.row(j))?;
199                if m_ij > f64::EPSILON {
200                    let ratio = (s[i] + s[j]) / m_ij;
201                    max_ratio = max_ratio.max(ratio);
202                }
203            }
204        }
205        db_sum += max_ratio;
206    }
207
208    Ok(db_sum / (n_clusters as f64))
209}
210
211#[cfg(test)]
212mod tests {
213    use super::*;
214    use approx::assert_abs_diff_eq;
215    use scirs2_core::ndarray::array;
216
217    #[test]
218    fn test_euclidean_distance() {
219        let p1 = array![0.0, 0.0];
220        let p2 = array![3.0, 4.0];
221        let dist = euclidean_distance(&p1.view(), &p2.view())
222            .expect("Euclidean distance calculation should succeed");
223        assert_abs_diff_eq!(dist, 5.0, epsilon = 1e-10);
224    }
225
226    #[test]
227    fn test_silhouette_score() {
228        // Create simple 2-cluster data
229        let data = array![[0.0, 0.0], [0.1, 0.1], [10.0, 10.0], [10.1, 10.1],];
230        let labels = array![0, 0, 1, 1];
231
232        let score = silhouette_score(&data.view(), &labels)
233            .expect("Silhouette score calculation should succeed");
234        assert!(score > 0.5); // Should have good separation
235    }
236
237    #[test]
238    fn test_davies_bouldin_index() {
239        let data = array![[0.0, 0.0], [0.1, 0.1], [10.0, 10.0], [10.1, 10.1],];
240        let labels = array![0, 0, 1, 1];
241        let centers = array![[0.05, 0.05], [10.05, 10.05]];
242
243        let db_index = davies_bouldin_index(&data.view(), &labels, &centers)
244            .expect("Davies-Bouldin index calculation should succeed");
245        assert!(db_index < 1.0); // Should be relatively low for well-separated clusters
246    }
247}