oxigdal_analytics/clustering/
mod.rs1pub mod dbscan;
9pub mod kmeans;
10
11pub use dbscan::{DbscanClusterer, DbscanResult};
12pub use kmeans::{KMeansClusterer, KMeansResult};
13
14use crate::error::{AnalyticsError, Result};
15use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
16
17#[derive(Debug, Clone)]
19pub struct ClusterInfo {
20 pub labels: Array1<i32>,
22 pub n_clusters: usize,
24 pub centers: Option<Array2<f64>>,
26}
27
28pub fn silhouette_score(data: &ArrayView2<f64>, labels: &Array1<i32>) -> Result<f64> {
40 let n_samples = data.nrows();
41
42 if n_samples != labels.len() {
43 return Err(AnalyticsError::dimension_mismatch(
44 format!("{}", n_samples),
45 format!("{}", labels.len()),
46 ));
47 }
48
49 if n_samples < 2 {
50 return Err(AnalyticsError::insufficient_data(
51 "Need at least 2 samples for silhouette score",
52 ));
53 }
54
55 let mut unique_labels: Vec<i32> = labels.iter().copied().filter(|&x| x >= 0).collect();
57 unique_labels.sort_unstable();
58 unique_labels.dedup();
59
60 if unique_labels.len() < 2 {
61 return Err(AnalyticsError::insufficient_data(
62 "Need at least 2 clusters for silhouette score",
63 ));
64 }
65
66 let mut silhouette_sum = 0.0;
67 let mut n_valid = 0;
68
69 for i in 0..n_samples {
70 let label_i = labels[i];
71 if label_i < 0 {
72 continue; }
74
75 let mut a_sum = 0.0;
77 let mut a_count = 0;
78 for j in 0..n_samples {
79 if i != j && labels[j] == label_i {
80 a_sum += euclidean_distance(&data.row(i), &data.row(j))?;
81 a_count += 1;
82 }
83 }
84 let a_i = if a_count > 0 {
85 a_sum / (a_count as f64)
86 } else {
87 0.0
88 };
89
90 let mut b_i = f64::INFINITY;
92 for &other_label in &unique_labels {
93 if other_label == label_i {
94 continue;
95 }
96
97 let mut b_sum = 0.0;
98 let mut b_count = 0;
99 for j in 0..n_samples {
100 if labels[j] == other_label {
101 b_sum += euclidean_distance(&data.row(i), &data.row(j))?;
102 b_count += 1;
103 }
104 }
105
106 if b_count > 0 {
107 let avg_dist = b_sum / (b_count as f64);
108 b_i = b_i.min(avg_dist);
109 }
110 }
111
112 if b_i.is_finite() && a_i.is_finite() {
114 let max_val = a_i.max(b_i);
115 if max_val > f64::EPSILON {
116 silhouette_sum += (b_i - a_i) / max_val;
117 n_valid += 1;
118 }
119 }
120 }
121
122 if n_valid == 0 {
123 return Err(AnalyticsError::clustering_error(
124 "No valid silhouette coefficients computed",
125 ));
126 }
127
128 Ok(silhouette_sum / (n_valid as f64))
129}
130
131fn euclidean_distance(
133 p1: &scirs2_core::ndarray::ArrayView1<f64>,
134 p2: &scirs2_core::ndarray::ArrayView1<f64>,
135) -> Result<f64> {
136 if p1.len() != p2.len() {
137 return Err(AnalyticsError::dimension_mismatch(
138 format!("{}", p1.len()),
139 format!("{}", p2.len()),
140 ));
141 }
142
143 let dist_sq: f64 = p1.iter().zip(p2.iter()).map(|(a, b)| (a - b).powi(2)).sum();
144
145 Ok(dist_sq.sqrt())
146}
147
148pub fn davies_bouldin_index(
160 data: &ArrayView2<f64>,
161 labels: &Array1<i32>,
162 centers: &Array2<f64>,
163) -> Result<f64> {
164 let n_clusters = centers.nrows();
165 if n_clusters < 2 {
166 return Err(AnalyticsError::insufficient_data(
167 "Need at least 2 clusters for Davies-Bouldin index",
168 ));
169 }
170
171 let mut s = vec![0.0; n_clusters];
173 let mut counts = vec![0; n_clusters];
174
175 for (i, &label) in labels.iter().enumerate() {
176 if label >= 0 {
177 let cluster_idx = label as usize;
178 if cluster_idx < n_clusters {
179 let dist = euclidean_distance(&data.row(i), ¢ers.row(cluster_idx))?;
180 s[cluster_idx] += dist;
181 counts[cluster_idx] += 1;
182 }
183 }
184 }
185
186 for i in 0..n_clusters {
187 if counts[i] > 0 {
188 s[i] /= counts[i] as f64;
189 }
190 }
191
192 let mut db_sum = 0.0;
194 for i in 0..n_clusters {
195 let mut max_ratio: f64 = 0.0;
196 for j in 0..n_clusters {
197 if i != j {
198 let m_ij = euclidean_distance(¢ers.row(i), ¢ers.row(j))?;
199 if m_ij > f64::EPSILON {
200 let ratio = (s[i] + s[j]) / m_ij;
201 max_ratio = max_ratio.max(ratio);
202 }
203 }
204 }
205 db_sum += max_ratio;
206 }
207
208 Ok(db_sum / (n_clusters as f64))
209}
210
211#[cfg(test)]
212mod tests {
213 use super::*;
214 use approx::assert_abs_diff_eq;
215 use scirs2_core::ndarray::array;
216
217 #[test]
218 fn test_euclidean_distance() {
219 let p1 = array![0.0, 0.0];
220 let p2 = array![3.0, 4.0];
221 let dist = euclidean_distance(&p1.view(), &p2.view())
222 .expect("Euclidean distance calculation should succeed");
223 assert_abs_diff_eq!(dist, 5.0, epsilon = 1e-10);
224 }
225
226 #[test]
227 fn test_silhouette_score() {
228 let data = array![[0.0, 0.0], [0.1, 0.1], [10.0, 10.0], [10.1, 10.1],];
230 let labels = array![0, 0, 1, 1];
231
232 let score = silhouette_score(&data.view(), &labels)
233 .expect("Silhouette score calculation should succeed");
234 assert!(score > 0.5); }
236
237 #[test]
238 fn test_davies_bouldin_index() {
239 let data = array![[0.0, 0.0], [0.1, 0.1], [10.0, 10.0], [10.1, 10.1],];
240 let labels = array![0, 0, 1, 1];
241 let centers = array![[0.05, 0.05], [10.05, 10.05]];
242
243 let db_index = davies_bouldin_index(&data.view(), &labels, ¢ers)
244 .expect("Davies-Bouldin index calculation should succeed");
245 assert!(db_index < 1.0); }
247}