leptos_helios/intelligence/
clustering.rs

1//! Clustering Analysis Module
2
3use super::MLError;
4use crate::DataFrame;
5// use polars::prelude::*; // Currently unused
6// use std::collections::HashMap; // Currently unused
7
8/// Cluster analyzer for grouping similar data points
9pub struct ClusterAnalyzer {
10    /// Number of clusters to create
11    pub n_clusters: usize,
12    /// Clustering algorithm to use
13    pub algorithm: ClusteringAlgorithm,
14    /// Cluster centers (for K-means)
15    pub centers: Option<Vec<Vec<f64>>>,
16    /// Cluster assignments
17    pub assignments: Option<Vec<usize>>,
18    /// Clustering results
19    pub results: Option<ClusteringResults>,
20}
21
22/// Clustering algorithms supported
23#[derive(Debug, Clone, PartialEq)]
24pub enum ClusteringAlgorithm {
25    /// K-Means clustering
26    KMeans,
27    /// DBSCAN clustering
28    DBSCAN,
29    /// Hierarchical clustering
30    Hierarchical,
31    /// Gaussian Mixture Model
32    GaussianMixture,
33}
34
35/// Clustering results
36#[derive(Debug, Clone)]
37pub struct ClusteringResults {
38    /// Cluster assignments for each data point
39    pub assignments: Vec<usize>,
40    /// Cluster centers (for centroid-based algorithms)
41    pub centers: Option<Vec<Vec<f64>>>,
42    /// Number of clusters found
43    pub n_clusters: usize,
44    /// Silhouette score (clustering quality metric)
45    pub silhouette_score: Option<f64>,
46    /// Inertia (for K-means)
47    pub inertia: Option<f64>,
48    /// Cluster statistics
49    pub cluster_stats: Vec<ClusterStats>,
50}
51
52/// Statistics for each cluster
53#[derive(Debug, Clone)]
54pub struct ClusterStats {
55    /// Cluster ID
56    pub cluster_id: usize,
57    /// Number of points in cluster
58    pub size: usize,
59    /// Center of the cluster
60    pub center: Vec<f64>,
61    /// Average distance to center
62    pub avg_distance: f64,
63    /// Maximum distance to center
64    pub max_distance: f64,
65    /// Minimum distance to center
66    pub min_distance: f64,
67}
68
69/// Clustering configuration
70#[derive(Debug, Clone)]
71pub struct ClusteringConfig {
72    /// Number of clusters
73    pub n_clusters: usize,
74    /// Algorithm to use
75    pub algorithm: ClusteringAlgorithm,
76    /// Maximum iterations (for iterative algorithms)
77    pub max_iterations: usize,
78    /// Tolerance for convergence
79    pub tolerance: f64,
80    /// Random seed for reproducibility
81    pub random_seed: Option<u64>,
82}
83
84impl ClusterAnalyzer {
85    /// Create a new cluster analyzer
86    pub fn new(n_clusters: usize, algorithm: ClusteringAlgorithm) -> Self {
87        Self {
88            n_clusters,
89            algorithm,
90            centers: None,
91            assignments: None,
92            results: None,
93        }
94    }
95
96    /// Perform clustering on the data
97    pub fn cluster(&mut self, data: &DataFrame) -> Result<ClusteringResults, MLError> {
98        if data.height() < self.n_clusters {
99            return Err(MLError::InsufficientData(
100                format!(
101                    "Need at least {} data points for {} clusters",
102                    self.n_clusters, self.n_clusters
103                ),
104            ));
105        }
106
107        let points = self.extract_points(data)?;
108
109        let results = match self.algorithm {
110            ClusteringAlgorithm::KMeans => self.kmeans_clustering(&points)?,
111            ClusteringAlgorithm::DBSCAN => self.dbscan_clustering(&points)?,
112            ClusteringAlgorithm::Hierarchical => self.hierarchical_clustering(&points)?,
113            ClusteringAlgorithm::GaussianMixture => self.gaussian_mixture_clustering(&points)?,
114        };
115
116        self.results = Some(results.clone());
117        Ok(results)
118    }
119
120    /// Extract data points from DataFrame
121    fn extract_points(&self, data: &DataFrame) -> Result<Vec<Vec<f64>>, MLError> {
122        let mut points = Vec::new();
123        let columns = data.get_columns();
124
125        // Find numeric columns
126        let mut numeric_columns = Vec::new();
127        for column in columns {
128            if let Ok(series) = column.f64() {
129                numeric_columns.push(series);
130            }
131        }
132
133        if numeric_columns.is_empty() {
134            return Err(MLError::InvalidData(
135                "No numeric columns found for clustering".to_string(),
136            ));
137        }
138
139        let n_rows = numeric_columns[0].len();
140        for i in 0..n_rows {
141            let mut point = Vec::new();
142            for series in &numeric_columns {
143                if let Some(value) = series.get(i) {
144                    point.push(value);
145                } else {
146                    return Err(MLError::InvalidData(
147                        "Missing values in numeric columns".to_string(),
148                    ));
149                }
150            }
151            points.push(point);
152        }
153
154        Ok(points)
155    }
156
157    /// Perform K-means clustering
158    fn kmeans_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
159        let n_points = points.len();
160        let n_features = points[0].len();
161        let max_iterations = 100;
162        let tolerance = 1e-4;
163
164        // Initialize cluster centers randomly
165        let mut centers = self.initialize_centers(points, self.n_clusters);
166        let mut assignments = vec![0; n_points];
167        let mut prev_inertia = f64::INFINITY;
168
169        for _iteration in 0..max_iterations {
170            // Assign points to nearest centers
171            for (i, point) in points.iter().enumerate() {
172                let mut min_distance = f64::INFINITY;
173                let mut best_cluster = 0;
174
175                for (j, center) in centers.iter().enumerate() {
176                    let distance = self.euclidean_distance(point, center);
177                    if distance < min_distance {
178                        min_distance = distance;
179                        best_cluster = j;
180                    }
181                }
182                assignments[i] = best_cluster;
183            }
184
185            // Update centers
186            let mut new_centers = vec![vec![0.0; n_features]; self.n_clusters];
187            let mut cluster_counts = vec![0; self.n_clusters];
188
189            for (i, point) in points.iter().enumerate() {
190                let cluster = assignments[i];
191                for (j, &value) in point.iter().enumerate() {
192                    new_centers[cluster][j] += value;
193                }
194                cluster_counts[cluster] += 1;
195            }
196
197            for i in 0..self.n_clusters {
198                if cluster_counts[i] > 0 {
199                    for j in 0..n_features {
200                        new_centers[i][j] /= cluster_counts[i] as f64;
201                    }
202                }
203            }
204
205            // Calculate inertia
206            let inertia = self.calculate_inertia(points, &assignments, &new_centers);
207
208            // Check for convergence
209            if (prev_inertia - inertia).abs() < tolerance {
210                break;
211            }
212
213            centers = new_centers;
214            prev_inertia = inertia;
215        }
216
217        self.centers = Some(centers.clone());
218        self.assignments = Some(assignments.clone());
219
220        // Calculate cluster statistics
221        let cluster_stats = self.calculate_cluster_stats(points, &assignments, &centers);
222        let silhouette_score = self.calculate_silhouette_score(points, &assignments);
223
224        Ok(ClusteringResults {
225            assignments,
226            centers: Some(centers),
227            n_clusters: self.n_clusters,
228            silhouette_score: Some(silhouette_score),
229            inertia: Some(prev_inertia),
230            cluster_stats,
231        })
232    }
233
234    /// Perform DBSCAN clustering
235    fn dbscan_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
236        let eps = 0.5; // Distance threshold
237        let min_points = 3; // Minimum points for core point
238
239        let mut assignments = vec![usize::MAX; points.len()]; // -1 for noise
240        let mut cluster_id = 0;
241
242        for i in 0..points.len() {
243            if assignments[i] != usize::MAX {
244                continue; // Already processed
245            }
246
247            let neighbors = self.find_neighbors(points, i, eps);
248            if neighbors.len() < min_points {
249                assignments[i] = usize::MAX; // Noise point
250                continue;
251            }
252
253            // Start new cluster
254            assignments[i] = cluster_id;
255            let mut seed_set = neighbors;
256
257            while let Some(point_idx) = seed_set.pop() {
258                if assignments[point_idx] == usize::MAX {
259                    assignments[point_idx] = cluster_id;
260                }
261                if assignments[point_idx] != usize::MAX {
262                    continue;
263                }
264
265                assignments[point_idx] = cluster_id;
266                let point_neighbors = self.find_neighbors(points, point_idx, eps);
267                if point_neighbors.len() >= min_points {
268                    seed_set.extend(point_neighbors);
269                }
270            }
271
272            cluster_id += 1;
273        }
274
275        let n_clusters = cluster_id;
276        let cluster_stats = self.calculate_dbscan_cluster_stats(points, &assignments, n_clusters);
277
278        Ok(ClusteringResults {
279            assignments,
280            centers: None,
281            n_clusters,
282            silhouette_score: None,
283            inertia: None,
284            cluster_stats,
285        })
286    }
287
288    /// Perform hierarchical clustering
289    fn hierarchical_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
290        // Simplified hierarchical clustering using single linkage
291        let n_points = points.len();
292        let mut assignments = vec![0; n_points];
293
294        // Start with each point as its own cluster
295        let mut clusters: Vec<Vec<usize>> = (0..n_points).map(|i| vec![i]).collect();
296
297        while clusters.len() > self.n_clusters {
298            // Find closest pair of clusters
299            let mut min_distance = f64::INFINITY;
300            let mut best_pair = (0, 1);
301
302            for i in 0..clusters.len() {
303                for j in (i + 1)..clusters.len() {
304                    let distance = self.cluster_distance(&clusters[i], &clusters[j], points);
305                    if distance < min_distance {
306                        min_distance = distance;
307                        best_pair = (i, j);
308                    }
309                }
310            }
311
312            // Merge clusters
313            let (i, j) = best_pair;
314            let mut merged_cluster = clusters[i].clone();
315            merged_cluster.extend(clusters[j].clone());
316            clusters[i] = merged_cluster;
317            clusters.remove(j);
318        }
319
320        // Assign cluster IDs
321        for (cluster_id, cluster) in clusters.iter().enumerate() {
322            for &point_idx in cluster {
323                assignments[point_idx] = cluster_id;
324            }
325        }
326
327        let cluster_stats = self.calculate_hierarchical_cluster_stats(points, &assignments, clusters.len());
328
329        Ok(ClusteringResults {
330            assignments,
331            centers: None,
332            n_clusters: clusters.len(),
333            silhouette_score: None,
334            inertia: None,
335            cluster_stats,
336        })
337    }
338
339    /// Perform Gaussian Mixture Model clustering
340    fn gaussian_mixture_clustering(&mut self, points: &[Vec<f64>]) -> Result<ClusteringResults, MLError> {
341        // Simplified GMM implementation
342        let n_points = points.len();
343        let n_features = points[0].len();
344        let max_iterations = 50;
345
346        // Initialize means randomly
347        let mut means = self.initialize_centers(points, self.n_clusters);
348        let covariances = vec![vec![vec![1.0; n_features]; n_features]; self.n_clusters];
349        let mut weights = vec![1.0 / self.n_clusters as f64; self.n_clusters];
350        let mut assignments = vec![0; n_points];
351
352        for _ in 0..max_iterations {
353            // E-step: Calculate responsibilities
354            let mut responsibilities = vec![vec![0.0; self.n_clusters]; n_points];
355
356            for (i, point) in points.iter().enumerate() {
357                let mut total_prob = 0.0;
358                for k in 0..self.n_clusters {
359                    let prob = self.gaussian_probability(point, &means[k], &covariances[k]) * weights[k];
360                    responsibilities[i][k] = prob;
361                    total_prob += prob;
362                }
363
364                // Normalize
365                if total_prob > 0.0 {
366                    for k in 0..self.n_clusters {
367                        responsibilities[i][k] /= total_prob;
368                    }
369                }
370            }
371
372            // M-step: Update parameters
373            for k in 0..self.n_clusters {
374                let mut sum_resp = 0.0;
375                for i in 0..n_points {
376                    sum_resp += responsibilities[i][k];
377                }
378
379                if sum_resp > 0.0 {
380                    // Update mean
381                    for j in 0..n_features {
382                        means[k][j] = 0.0;
383                        for i in 0..n_points {
384                            means[k][j] += responsibilities[i][k] * points[i][j];
385                        }
386                        means[k][j] /= sum_resp;
387                    }
388
389                    // Update weight
390                    weights[k] = sum_resp / n_points as f64;
391                }
392            }
393
394            // Assign points to clusters
395            for (i, _point) in points.iter().enumerate() {
396                let mut max_prob = 0.0;
397                let mut best_cluster = 0;
398                for k in 0..self.n_clusters {
399                    if responsibilities[i][k] > max_prob {
400                        max_prob = responsibilities[i][k];
401                        best_cluster = k;
402                    }
403                }
404                assignments[i] = best_cluster;
405            }
406        }
407
408        let cluster_stats = self.calculate_cluster_stats(points, &assignments, &means);
409
410        Ok(ClusteringResults {
411            assignments,
412            centers: Some(means),
413            n_clusters: self.n_clusters,
414            silhouette_score: None,
415            inertia: None,
416            cluster_stats,
417        })
418    }
419
420    /// Initialize cluster centers randomly
421    fn initialize_centers(&self, points: &[Vec<f64>], n_clusters: usize) -> Vec<Vec<f64>> {
422        let n_features = points[0].len();
423        let mut centers = Vec::new();
424
425        for _ in 0..n_clusters {
426            let mut center = Vec::new();
427            for j in 0..n_features {
428                let min_val = points.iter().map(|p| p[j]).fold(f64::INFINITY, f64::min);
429                let max_val = points.iter().map(|p| p[j]).fold(f64::NEG_INFINITY, f64::max);
430                let random_val = min_val + (max_val - min_val) * 0.5; // Simplified random
431                center.push(random_val);
432            }
433            centers.push(center);
434        }
435
436        centers
437    }
438
439    /// Calculate Euclidean distance between two points
440    fn euclidean_distance(&self, a: &[f64], b: &[f64]) -> f64 {
441        a.iter()
442            .zip(b.iter())
443            .map(|(x, y)| (x - y).powi(2))
444            .sum::<f64>()
445            .sqrt()
446    }
447
448    /// Calculate inertia (sum of squared distances to centers)
449    fn calculate_inertia(&self, points: &[Vec<f64>], assignments: &[usize], centers: &[Vec<f64>]) -> f64 {
450        let mut inertia = 0.0;
451        for (i, point) in points.iter().enumerate() {
452            let cluster = assignments[i];
453            let distance = self.euclidean_distance(point, &centers[cluster]);
454            inertia += distance * distance;
455        }
456        inertia
457    }
458
459    /// Find neighbors within eps distance
460    fn find_neighbors(&self, points: &[Vec<f64>], point_idx: usize, eps: f64) -> Vec<usize> {
461        let mut neighbors = Vec::new();
462        let point = &points[point_idx];
463
464        for (i, other_point) in points.iter().enumerate() {
465            if i != point_idx {
466                let distance = self.euclidean_distance(point, other_point);
467                if distance <= eps {
468                    neighbors.push(i);
469                }
470            }
471        }
472
473        neighbors
474    }
475
476    /// Calculate distance between two clusters (single linkage)
477    fn cluster_distance(&self, cluster1: &[usize], cluster2: &[usize], points: &[Vec<f64>]) -> f64 {
478        let mut min_distance = f64::INFINITY;
479
480        for &i in cluster1 {
481            for &j in cluster2 {
482                let distance = self.euclidean_distance(&points[i], &points[j]);
483                if distance < min_distance {
484                    min_distance = distance;
485                }
486            }
487        }
488
489        min_distance
490    }
491
492    /// Calculate Gaussian probability
493    fn gaussian_probability(&self, point: &[f64], mean: &[f64], covariance: &[Vec<f64>]) -> f64 {
494        // Simplified 1D Gaussian for now
495        let diff = point[0] - mean[0];
496        let variance = covariance[0][0];
497        (-0.5 * diff * diff / variance).exp() / (2.0 * std::f64::consts::PI * variance).sqrt()
498    }
499
500    /// Calculate cluster statistics
501    fn calculate_cluster_stats(
502        &self,
503        points: &[Vec<f64>],
504        assignments: &[usize],
505        centers: &[Vec<f64>],
506    ) -> Vec<ClusterStats> {
507        let mut cluster_stats = Vec::new();
508
509        for cluster_id in 0..self.n_clusters {
510            let cluster_points: Vec<&Vec<f64>> = points
511                .iter()
512                .zip(assignments.iter())
513                .filter(|(_, &assignment)| assignment == cluster_id)
514                .map(|(point, _)| point)
515                .collect();
516
517            if cluster_points.is_empty() {
518                continue;
519            }
520
521            let size = cluster_points.len();
522            let center = centers[cluster_id].clone();
523
524            let mut distances = Vec::new();
525            for point in &cluster_points {
526                let distance = self.euclidean_distance(point, &center);
527                distances.push(distance);
528            }
529
530            distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
531
532            cluster_stats.push(ClusterStats {
533                cluster_id,
534                size,
535                center,
536                avg_distance: distances.iter().sum::<f64>() / size as f64,
537                max_distance: *distances.last().unwrap(),
538                min_distance: *distances.first().unwrap(),
539            });
540        }
541
542        cluster_stats
543    }
544
545    /// Calculate DBSCAN cluster statistics
546    fn calculate_dbscan_cluster_stats(
547        &self,
548        points: &[Vec<f64>],
549        assignments: &[usize],
550        n_clusters: usize,
551    ) -> Vec<ClusterStats> {
552        let mut cluster_stats = Vec::new();
553
554        for cluster_id in 0..n_clusters {
555            let cluster_points: Vec<&Vec<f64>> = points
556                .iter()
557                .zip(assignments.iter())
558                .filter(|(_, &assignment)| assignment == cluster_id)
559                .map(|(point, _)| point)
560                .collect();
561
562            if cluster_points.is_empty() {
563                continue;
564            }
565
566            let size = cluster_points.len();
567
568            // Calculate center
569            let n_features = cluster_points[0].len();
570            let mut center = vec![0.0; n_features];
571            for point in &cluster_points {
572                for (j, &value) in point.iter().enumerate() {
573                    center[j] += value;
574                }
575            }
576            for j in 0..n_features {
577                center[j] /= size as f64;
578            }
579
580            // Calculate distances
581            let mut distances = Vec::new();
582            for point in &cluster_points {
583                let distance = self.euclidean_distance(point, &center);
584                distances.push(distance);
585            }
586
587            distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
588
589            cluster_stats.push(ClusterStats {
590                cluster_id,
591                size,
592                center,
593                avg_distance: distances.iter().sum::<f64>() / size as f64,
594                max_distance: *distances.last().unwrap(),
595                min_distance: *distances.first().unwrap(),
596            });
597        }
598
599        cluster_stats
600    }
601
602    /// Calculate hierarchical cluster statistics
603    fn calculate_hierarchical_cluster_stats(
604        &self,
605        points: &[Vec<f64>],
606        assignments: &[usize],
607        n_clusters: usize,
608    ) -> Vec<ClusterStats> {
609        self.calculate_dbscan_cluster_stats(points, assignments, n_clusters)
610    }
611
612    /// Calculate silhouette score
613    fn calculate_silhouette_score(&self, points: &[Vec<f64>], assignments: &[usize]) -> f64 {
614        let mut total_silhouette = 0.0;
615        let mut count = 0;
616
617        for (i, point) in points.iter().enumerate() {
618            let cluster = assignments[i];
619            
620            // Calculate average distance to points in same cluster
621            let mut intra_cluster_dist = 0.0;
622            let mut intra_cluster_count = 0;
623            
624            for (j, other_point) in points.iter().enumerate() {
625                if i != j && assignments[j] == cluster {
626                    intra_cluster_dist += self.euclidean_distance(point, other_point);
627                    intra_cluster_count += 1;
628                }
629            }
630            
631            if intra_cluster_count == 0 {
632                continue;
633            }
634            
635            intra_cluster_dist /= intra_cluster_count as f64;
636            
637            // Calculate minimum average distance to other clusters
638            let mut min_inter_cluster_dist = f64::INFINITY;
639            
640            for other_cluster in 0..self.n_clusters {
641                if other_cluster == cluster {
642                    continue;
643                }
644                
645                let mut inter_cluster_dist = 0.0;
646                let mut inter_cluster_count = 0;
647                
648                for (j, other_point) in points.iter().enumerate() {
649                    if assignments[j] == other_cluster {
650                        inter_cluster_dist += self.euclidean_distance(point, other_point);
651                        inter_cluster_count += 1;
652                    }
653                }
654                
655                if inter_cluster_count > 0 {
656                    inter_cluster_dist /= inter_cluster_count as f64;
657                    if inter_cluster_dist < min_inter_cluster_dist {
658                        min_inter_cluster_dist = inter_cluster_dist;
659                    }
660                }
661            }
662            
663            if min_inter_cluster_dist != f64::INFINITY {
664                let silhouette = (min_inter_cluster_dist - intra_cluster_dist) / 
665                               min_inter_cluster_dist.max(intra_cluster_dist);
666                total_silhouette += silhouette;
667                count += 1;
668            }
669        }
670        
671        if count > 0 {
672            total_silhouette / count as f64
673        } else {
674            0.0
675        }
676    }
677}