1use super::MLError;
4use crate::DataFrame;
/// Stateful driver that runs a clustering algorithm over a `DataFrame`
/// and caches the most recent results.
pub struct ClusterAnalyzer {
    /// Requested number of clusters (DBSCAN ignores this and discovers its own).
    pub n_clusters: usize,
    /// Algorithm dispatched by `cluster()`.
    pub algorithm: ClusteringAlgorithm,
    /// Cluster centers from the most recent k-means run, if any.
    pub centers: Option<Vec<Vec<f64>>>,
    /// Per-point assignments from the most recent k-means run, if any.
    pub assignments: Option<Vec<usize>>,
    /// Full results of the most recent `cluster()` call, if any.
    pub results: Option<ClusteringResults>,
}
21
/// Clustering algorithms supported by [`ClusterAnalyzer`].
#[derive(Debug, Clone, PartialEq)]
pub enum ClusteringAlgorithm {
    /// Lloyd-style k-means (centroid based, fixed k).
    KMeans,
    /// Density-based clustering; number of clusters is discovered, not fixed.
    DBSCAN,
    /// Agglomerative hierarchical clustering with single linkage
    /// (merge distance is the minimum pairwise point distance).
    Hierarchical,
    /// Expectation-maximisation over a mixture of Gaussians.
    GaussianMixture,
}
34
/// Output of a single clustering run.
#[derive(Debug, Clone)]
pub struct ClusteringResults {
    /// Cluster index for each input row; DBSCAN marks noise as `usize::MAX`.
    pub assignments: Vec<usize>,
    /// Cluster centers when the algorithm produces them (k-means, GMM);
    /// `None` for DBSCAN and hierarchical.
    pub centers: Option<Vec<Vec<f64>>>,
    /// Number of clusters actually produced.
    pub n_clusters: usize,
    /// Mean silhouette coefficient; only computed for k-means.
    pub silhouette_score: Option<f64>,
    /// Sum of squared point-to-center distances; only computed for k-means.
    pub inertia: Option<f64>,
    /// Per-cluster summary statistics.
    pub cluster_stats: Vec<ClusterStats>,
}
51
/// Summary statistics for one cluster.
#[derive(Debug, Clone)]
pub struct ClusterStats {
    /// Index of the cluster these statistics describe.
    pub cluster_id: usize,
    /// Number of points assigned to the cluster.
    pub size: usize,
    /// Center (centroid) the distance statistics below are measured against.
    pub center: Vec<f64>,
    /// Mean distance from member points to the center.
    pub avg_distance: f64,
    /// Largest distance from a member point to the center.
    pub max_distance: f64,
    /// Smallest distance from a member point to the center.
    pub min_distance: f64,
}
68
/// Configuration bundle for a clustering run.
///
/// NOTE(review): this struct is not consumed anywhere in this file —
/// `ClusterAnalyzer` hard-codes its own iteration limits and tolerances.
/// Confirm whether external callers use it, or wire it into `cluster()`.
#[derive(Debug, Clone)]
pub struct ClusteringConfig {
    /// Desired number of clusters.
    pub n_clusters: usize,
    /// Algorithm to run.
    pub algorithm: ClusteringAlgorithm,
    /// Maximum iterations for iterative algorithms.
    pub max_iterations: usize,
    /// Convergence tolerance for iterative algorithms.
    pub tolerance: f64,
    /// Optional RNG seed for reproducibility.
    pub random_seed: Option<u64>,
}
83
84impl ClusterAnalyzer {
85 pub fn new(n_clusters: usize, algorithm: ClusteringAlgorithm) -> Self {
87 Self {
88 n_clusters,
89 algorithm,
90 centers: None,
91 assignments: None,
92 results: None,
93 }
94 }
95
96 pub fn cluster(&mut self, data: &DataFrame) -> Result<ClusteringResults, MLError> {
98 if data.height() < self.n_clusters {
99 return Err(MLError::InsufficientData(
100 format!(
101 "Need at least {} data points for {} clusters",
102 self.n_clusters, self.n_clusters
103 ),
104 ));
105 }
106
107 let points = self.extract_points(data)?;
108
109 let results = match self.algorithm {
110 ClusteringAlgorithm::KMeans => self.kmeans_clustering(&points)?,
111 ClusteringAlgorithm::DBSCAN => self.dbscan_clustering(&points)?,
112 ClusteringAlgorithm::Hierarchical => self.hierarchical_clustering(&points)?,
113 ClusteringAlgorithm::GaussianMixture => self.gaussian_mixture_clustering(&points)?,
114 };
115
116 self.results = Some(results.clone());
117 Ok(results)
118 }
119
120 fn extract_points(&self, data: &DataFrame) -> Result<Vec<Vec<f64>>, MLError> {
122 let mut points = Vec::new();
123 let columns = data.get_columns();
124
125 let mut numeric_columns = Vec::new();
127 for column in columns {
128 if let Ok(series) = column.f64() {
129 numeric_columns.push(series);
130 }
131 }
132
133 if numeric_columns.is_empty() {
134 return Err(MLError::InvalidData(
135 "No numeric columns found for clustering".to_string(),
136 ));
137 }
138
139 let n_rows = numeric_columns[0].len();
140 for i in 0..n_rows {
141 let mut point = Vec::new();
142 for series in &numeric_columns {
143 if let Some(value) = series.get(i) {
144 point.push(value);
145 } else {
146 return Err(MLError::InvalidData(
147 "Missing values in numeric columns".to_string(),
148 ));
149 }
150 }
151 points.push(point);
152 }
153
154 Ok(points)
155 }
156
157 fn kmeans_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
159 let n_points = points.len();
160 let n_features = points[0].len();
161 let max_iterations = 100;
162 let tolerance = 1e-4;
163
164 let mut centers = self.initialize_centers(points, self.n_clusters);
166 let mut assignments = vec![0; n_points];
167 let mut prev_inertia = f64::INFINITY;
168
169 for _iteration in 0..max_iterations {
170 for (i, point) in points.iter().enumerate() {
172 let mut min_distance = f64::INFINITY;
173 let mut best_cluster = 0;
174
175 for (j, center) in centers.iter().enumerate() {
176 let distance = self.euclidean_distance(point, center);
177 if distance < min_distance {
178 min_distance = distance;
179 best_cluster = j;
180 }
181 }
182 assignments[i] = best_cluster;
183 }
184
185 let mut new_centers = vec![vec![0.0; n_features]; self.n_clusters];
187 let mut cluster_counts = vec![0; self.n_clusters];
188
189 for (i, point) in points.iter().enumerate() {
190 let cluster = assignments[i];
191 for (j, &value) in point.iter().enumerate() {
192 new_centers[cluster][j] += value;
193 }
194 cluster_counts[cluster] += 1;
195 }
196
197 for i in 0..self.n_clusters {
198 if cluster_counts[i] > 0 {
199 for j in 0..n_features {
200 new_centers[i][j] /= cluster_counts[i] as f64;
201 }
202 }
203 }
204
205 let inertia = self.calculate_inertia(points, &assignments, &new_centers);
207
208 if (prev_inertia - inertia).abs() < tolerance {
210 break;
211 }
212
213 centers = new_centers;
214 prev_inertia = inertia;
215 }
216
217 self.centers = Some(centers.clone());
218 self.assignments = Some(assignments.clone());
219
220 let cluster_stats = self.calculate_cluster_stats(points, &assignments, ¢ers);
222 let silhouette_score = self.calculate_silhouette_score(points, &assignments);
223
224 Ok(ClusteringResults {
225 assignments,
226 centers: Some(centers),
227 n_clusters: self.n_clusters,
228 silhouette_score: Some(silhouette_score),
229 inertia: Some(prev_inertia),
230 cluster_stats,
231 })
232 }
233
234 fn dbscan_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
236 let eps = 0.5; let min_points = 3; let mut assignments = vec![usize::MAX; points.len()]; let mut cluster_id = 0;
241
242 for i in 0..points.len() {
243 if assignments[i] != usize::MAX {
244 continue; }
246
247 let neighbors = self.find_neighbors(points, i, eps);
248 if neighbors.len() < min_points {
249 assignments[i] = usize::MAX; continue;
251 }
252
253 assignments[i] = cluster_id;
255 let mut seed_set = neighbors;
256
257 while let Some(point_idx) = seed_set.pop() {
258 if assignments[point_idx] == usize::MAX {
259 assignments[point_idx] = cluster_id;
260 }
261 if assignments[point_idx] != usize::MAX {
262 continue;
263 }
264
265 assignments[point_idx] = cluster_id;
266 let point_neighbors = self.find_neighbors(points, point_idx, eps);
267 if point_neighbors.len() >= min_points {
268 seed_set.extend(point_neighbors);
269 }
270 }
271
272 cluster_id += 1;
273 }
274
275 let n_clusters = cluster_id;
276 let cluster_stats = self.calculate_dbscan_cluster_stats(points, &assignments, n_clusters);
277
278 Ok(ClusteringResults {
279 assignments,
280 centers: None,
281 n_clusters,
282 silhouette_score: None,
283 inertia: None,
284 cluster_stats,
285 })
286 }
287
288 fn hierarchical_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
290 let n_points = points.len();
292 let mut assignments = vec![0; n_points];
293
294 let mut clusters: Vec<Vec<usize>> = (0..n_points).map(|i| vec![i]).collect();
296
297 while clusters.len() > self.n_clusters {
298 let mut min_distance = f64::INFINITY;
300 let mut best_pair = (0, 1);
301
302 for i in 0..clusters.len() {
303 for j in (i + 1)..clusters.len() {
304 let distance = self.cluster_distance(&clusters[i], &clusters[j], points);
305 if distance < min_distance {
306 min_distance = distance;
307 best_pair = (i, j);
308 }
309 }
310 }
311
312 let (i, j) = best_pair;
314 let mut merged_cluster = clusters[i].clone();
315 merged_cluster.extend(clusters[j].clone());
316 clusters[i] = merged_cluster;
317 clusters.remove(j);
318 }
319
320 for (cluster_id, cluster) in clusters.iter().enumerate() {
322 for &point_idx in cluster {
323 assignments[point_idx] = cluster_id;
324 }
325 }
326
327 let cluster_stats = self.calculate_hierarchical_cluster_stats(points, &assignments, clusters.len());
328
329 Ok(ClusteringResults {
330 assignments,
331 centers: None,
332 n_clusters: clusters.len(),
333 silhouette_score: None,
334 inertia: None,
335 cluster_stats,
336 })
337 }
338
339 fn gaussian_mixture_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
341 let n_points = points.len();
343 let n_features = points[0].len();
344 let max_iterations = 50;
345
346 let mut means = self.initialize_centers(points, self.n_clusters);
348 let covariances = vec![vec![vec![1.0; n_features]; n_features]; self.n_clusters];
349 let mut weights = vec![1.0 / self.n_clusters as f64; self.n_clusters];
350 let mut assignments = vec![0; n_points];
351
352 for _ in 0..max_iterations {
353 let mut responsibilities = vec![vec![0.0; self.n_clusters]; n_points];
355
356 for (i, point) in points.iter().enumerate() {
357 let mut total_prob = 0.0;
358 for k in 0..self.n_clusters {
359 let prob = self.gaussian_probability(point, &means[k], &covariances[k]) * weights[k];
360 responsibilities[i][k] = prob;
361 total_prob += prob;
362 }
363
364 if total_prob > 0.0 {
366 for k in 0..self.n_clusters {
367 responsibilities[i][k] /= total_prob;
368 }
369 }
370 }
371
372 for k in 0..self.n_clusters {
374 let mut sum_resp = 0.0;
375 for i in 0..n_points {
376 sum_resp += responsibilities[i][k];
377 }
378
379 if sum_resp > 0.0 {
380 for j in 0..n_features {
382 means[k][j] = 0.0;
383 for i in 0..n_points {
384 means[k][j] += responsibilities[i][k] * points[i][j];
385 }
386 means[k][j] /= sum_resp;
387 }
388
389 weights[k] = sum_resp / n_points as f64;
391 }
392 }
393
394 for (i, _point) in points.iter().enumerate() {
396 let mut max_prob = 0.0;
397 let mut best_cluster = 0;
398 for k in 0..self.n_clusters {
399 if responsibilities[i][k] > max_prob {
400 max_prob = responsibilities[i][k];
401 best_cluster = k;
402 }
403 }
404 assignments[i] = best_cluster;
405 }
406 }
407
408 let cluster_stats = self.calculate_cluster_stats(points, &assignments, &means);
409
410 Ok(ClusteringResults {
411 assignments,
412 centers: Some(means),
413 n_clusters: self.n_clusters,
414 silhouette_score: None,
415 inertia: None,
416 cluster_stats,
417 })
418 }
419
420 fn initialize_centers(&self, points: &[Vec<f64>], n_clusters: usize) -> Vec<Vec<f64>> {
422 let n_features = points[0].len();
423 let mut centers = Vec::new();
424
425 for _ in 0..n_clusters {
426 let mut center = Vec::new();
427 for j in 0..n_features {
428 let min_val = points.iter().map(|p| p[j]).fold(f64::INFINITY, f64::min);
429 let max_val = points.iter().map(|p| p[j]).fold(f64::NEG_INFINITY, f64::max);
430 let random_val = min_val + (max_val - min_val) * 0.5; center.push(random_val);
432 }
433 centers.push(center);
434 }
435
436 centers
437 }
438
439 fn euclidean_distance(&self, a: &[f64], b: &[f64]) -> f64 {
441 a.iter()
442 .zip(b.iter())
443 .map(|(x, y)| (x - y).powi(2))
444 .sum::<f64>()
445 .sqrt()
446 }
447
448 fn calculate_inertia(&self, points: &[Vec<f64>], assignments: &[usize], centers: &[Vec<f64>]) -> f64 {
450 let mut inertia = 0.0;
451 for (i, point) in points.iter().enumerate() {
452 let cluster = assignments[i];
453 let distance = self.euclidean_distance(point, ¢ers[cluster]);
454 inertia += distance * distance;
455 }
456 inertia
457 }
458
459 fn find_neighbors(&self, points: &[Vec<f64>], point_idx: usize, eps: f64) -> Vec<usize> {
461 let mut neighbors = Vec::new();
462 let point = &points[point_idx];
463
464 for (i, other_point) in points.iter().enumerate() {
465 if i != point_idx {
466 let distance = self.euclidean_distance(point, other_point);
467 if distance <= eps {
468 neighbors.push(i);
469 }
470 }
471 }
472
473 neighbors
474 }
475
476 fn cluster_distance(&self, cluster1: &[usize], cluster2: &[usize], points: &[Vec<f64>]) -> f64 {
478 let mut min_distance = f64::INFINITY;
479
480 for &i in cluster1 {
481 for &j in cluster2 {
482 let distance = self.euclidean_distance(&points[i], &points[j]);
483 if distance < min_distance {
484 min_distance = distance;
485 }
486 }
487 }
488
489 min_distance
490 }
491
492 fn gaussian_probability(&self, point: &[f64], mean: &[f64], covariance: &[Vec<f64>]) -> f64 {
494 let diff = point[0] - mean[0];
496 let variance = covariance[0][0];
497 (-0.5 * diff * diff / variance).exp() / (2.0 * std::f64::consts::PI * variance).sqrt()
498 }
499
500 fn calculate_cluster_stats(
502 &self,
503 points: &[Vec<f64>],
504 assignments: &[usize],
505 centers: &[Vec<f64>],
506 ) -> Vec<ClusterStats> {
507 let mut cluster_stats = Vec::new();
508
509 for cluster_id in 0..self.n_clusters {
510 let cluster_points: Vec<&Vec<f64>> = points
511 .iter()
512 .zip(assignments.iter())
513 .filter(|(_, &assignment)| assignment == cluster_id)
514 .map(|(point, _)| point)
515 .collect();
516
517 if cluster_points.is_empty() {
518 continue;
519 }
520
521 let size = cluster_points.len();
522 let center = centers[cluster_id].clone();
523
524 let mut distances = Vec::new();
525 for point in &cluster_points {
526 let distance = self.euclidean_distance(point, ¢er);
527 distances.push(distance);
528 }
529
530 distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
531
532 cluster_stats.push(ClusterStats {
533 cluster_id,
534 size,
535 center,
536 avg_distance: distances.iter().sum::<f64>() / size as f64,
537 max_distance: *distances.last().unwrap(),
538 min_distance: *distances.first().unwrap(),
539 });
540 }
541
542 cluster_stats
543 }
544
545 fn calculate_dbscan_cluster_stats(
547 &self,
548 points: &[Vec<f64>],
549 assignments: &[usize],
550 n_clusters: usize,
551 ) -> Vec<ClusterStats> {
552 let mut cluster_stats = Vec::new();
553
554 for cluster_id in 0..n_clusters {
555 let cluster_points: Vec<&Vec<f64>> = points
556 .iter()
557 .zip(assignments.iter())
558 .filter(|(_, &assignment)| assignment == cluster_id)
559 .map(|(point, _)| point)
560 .collect();
561
562 if cluster_points.is_empty() {
563 continue;
564 }
565
566 let size = cluster_points.len();
567
568 let n_features = cluster_points[0].len();
570 let mut center = vec![0.0; n_features];
571 for point in &cluster_points {
572 for (j, &value) in point.iter().enumerate() {
573 center[j] += value;
574 }
575 }
576 for j in 0..n_features {
577 center[j] /= size as f64;
578 }
579
580 let mut distances = Vec::new();
582 for point in &cluster_points {
583 let distance = self.euclidean_distance(point, ¢er);
584 distances.push(distance);
585 }
586
587 distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
588
589 cluster_stats.push(ClusterStats {
590 cluster_id,
591 size,
592 center,
593 avg_distance: distances.iter().sum::<f64>() / size as f64,
594 max_distance: *distances.last().unwrap(),
595 min_distance: *distances.first().unwrap(),
596 });
597 }
598
599 cluster_stats
600 }
601
602 fn calculate_hierarchical_cluster_stats(
604 &self,
605 points: &[Vec<f64>],
606 assignments: &[usize],
607 n_clusters: usize,
608 ) -> Vec<ClusterStats> {
609 self.calculate_dbscan_cluster_stats(points, assignments, n_clusters)
610 }
611
612 fn calculate_silhouette_score(&self, points: &[Vec<f64>], assignments: &[usize]) -> f64 {
614 let mut total_silhouette = 0.0;
615 let mut count = 0;
616
617 for (i, point) in points.iter().enumerate() {
618 let cluster = assignments[i];
619
620 let mut intra_cluster_dist = 0.0;
622 let mut intra_cluster_count = 0;
623
624 for (j, other_point) in points.iter().enumerate() {
625 if i != j && assignments[j] == cluster {
626 intra_cluster_dist += self.euclidean_distance(point, other_point);
627 intra_cluster_count += 1;
628 }
629 }
630
631 if intra_cluster_count == 0 {
632 continue;
633 }
634
635 intra_cluster_dist /= intra_cluster_count as f64;
636
637 let mut min_inter_cluster_dist = f64::INFINITY;
639
640 for other_cluster in 0..self.n_clusters {
641 if other_cluster == cluster {
642 continue;
643 }
644
645 let mut inter_cluster_dist = 0.0;
646 let mut inter_cluster_count = 0;
647
648 for (j, other_point) in points.iter().enumerate() {
649 if assignments[j] == other_cluster {
650 inter_cluster_dist += self.euclidean_distance(point, other_point);
651 inter_cluster_count += 1;
652 }
653 }
654
655 if inter_cluster_count > 0 {
656 inter_cluster_dist /= inter_cluster_count as f64;
657 if inter_cluster_dist < min_inter_cluster_dist {
658 min_inter_cluster_dist = inter_cluster_dist;
659 }
660 }
661 }
662
663 if min_inter_cluster_dist != f64::INFINITY {
664 let silhouette = (min_inter_cluster_dist - intra_cluster_dist) /
665 min_inter_cluster_dist.max(intra_cluster_dist);
666 total_silhouette += silhouette;
667 count += 1;
668 }
669 }
670
671 if count > 0 {
672 total_silhouette / count as f64
673 } else {
674 0.0
675 }
676 }
677}