// ghostflow_ml/clustering.rs

1//! Clustering algorithms - KMeans, DBSCAN, Agglomerative
2
3use ghostflow_core::Tensor;
4use rayon::prelude::*;
5use rand::prelude::*;
6
/// K-Means clustering using Lloyd's algorithm
pub struct KMeans {
    /// Number of clusters (k).
    pub n_clusters: usize,
    /// Maximum Lloyd iterations per run.
    pub max_iter: usize,
    /// Convergence threshold on the summed center shift between iterations.
    pub tol: f32,
    /// Number of independent restarts; the lowest-inertia run is kept.
    pub n_init: usize,
    /// Center initialization strategy.
    pub init: KMeansInit,
    /// Fitted centers: `n_clusters` rows of `n_features` values (`None` until `fit`).
    pub cluster_centers_: Option<Vec<Vec<f32>>>,
    /// Cluster index for each training sample (`None` until `fit`).
    pub labels_: Option<Vec<usize>>,
    /// Sum of squared distances of samples to their closest center (`None` until `fit`).
    pub inertia_: Option<f32>,
    /// Iterations taken by the best run.
    pub n_iter_: usize,
}
19
/// How the initial cluster centers are chosen.
#[derive(Clone, Copy, Debug)]
pub enum KMeansInit {
    /// Pick `n_clusters` distinct samples uniformly at random.
    Random,
    /// k-means++: spread centers by sampling proportional to squared
    /// distance from the nearest already-chosen center.
    KMeansPlusPlus,
}
25
26impl KMeans {
27    pub fn new(n_clusters: usize) -> Self {
28        KMeans {
29            n_clusters,
30            max_iter: 300,
31            tol: 1e-4,
32            n_init: 10,
33            init: KMeansInit::KMeansPlusPlus,
34            cluster_centers_: None,
35            labels_: None,
36            inertia_: None,
37            n_iter_: 0,
38        }
39    }
40
41    pub fn max_iter(mut self, n: usize) -> Self {
42        self.max_iter = n;
43        self
44    }
45
46    pub fn n_init(mut self, n: usize) -> Self {
47        self.n_init = n;
48        self
49    }
50
51    pub fn init(mut self, init: KMeansInit) -> Self {
52        self.init = init;
53        self
54    }
55
56    fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
57        a.iter()
58            .zip(b.iter())
59            .map(|(&ai, &bi)| (ai - bi).powi(2))
60            .sum::<f32>()
61            .sqrt()
62    }
63
64    fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
65        a.iter()
66            .zip(b.iter())
67            .map(|(&ai, &bi)| (ai - bi).powi(2))
68            .sum::<f32>()
69    }
70
71    fn init_random(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
72        let mut rng = thread_rng();
73        let indices: Vec<usize> = (0..n_samples)
74            .choose_multiple(&mut rng, self.n_clusters)
75            .into_iter()
76            .collect();
77
78        indices.iter()
79            .map(|&i| x[i * n_features..(i + 1) * n_features].to_vec())
80            .collect()
81    }
82
83    fn init_kmeans_plusplus(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
84        let mut rng = thread_rng();
85        let mut centers = Vec::with_capacity(self.n_clusters);
86
87        // Choose first center randomly
88        let first_idx = rng.gen_range(0..n_samples);
89        centers.push(x[first_idx * n_features..(first_idx + 1) * n_features].to_vec());
90
91        // Choose remaining centers
92        for _ in 1..self.n_clusters {
93            // Compute distances to nearest center
94            let distances: Vec<f32> = (0..n_samples)
95                .map(|i| {
96                    let point = &x[i * n_features..(i + 1) * n_features];
97                    centers.iter()
98                        .map(|c| Self::euclidean_distance_squared(point, c))
99                        .fold(f32::INFINITY, f32::min)
100                })
101                .collect();
102
103            // Sample proportional to distance squared
104            let total: f32 = distances.iter().sum();
105            let threshold = rng.gen::<f32>() * total;
106            
107            let mut cumsum = 0.0f32;
108            let mut chosen_idx = 0;
109            for (i, &d) in distances.iter().enumerate() {
110                cumsum += d;
111                if cumsum >= threshold {
112                    chosen_idx = i;
113                    break;
114                }
115            }
116
117            centers.push(x[chosen_idx * n_features..(chosen_idx + 1) * n_features].to_vec());
118        }
119
120        centers
121    }
122
123    fn assign_labels(&self, x: &[f32], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> Vec<usize> {
124        (0..n_samples)
125            .into_par_iter()
126            .map(|i| {
127                let point = &x[i * n_features..(i + 1) * n_features];
128                centers.iter()
129                    .enumerate()
130                    .map(|(c, center)| (c, Self::euclidean_distance_squared(point, center)))
131                    .min_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap())
132                    .map(|(c, _)| c)
133                    .unwrap_or(0)
134            })
135            .collect()
136    }
137
138    fn update_centers(&self, x: &[f32], labels: &[usize], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
139        let mut new_centers = vec![vec![0.0f32; n_features]; self.n_clusters];
140        let mut counts = vec![0usize; self.n_clusters];
141
142        for i in 0..n_samples {
143            let label = labels[i];
144            counts[label] += 1;
145            for j in 0..n_features {
146                new_centers[label][j] += x[i * n_features + j];
147            }
148        }
149
150        for c in 0..self.n_clusters {
151            if counts[c] > 0 {
152                for j in 0..n_features {
153                    new_centers[c][j] /= counts[c] as f32;
154                }
155            }
156        }
157
158        new_centers
159    }
160
161    fn compute_inertia(&self, x: &[f32], labels: &[usize], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> f32 {
162        (0..n_samples)
163            .map(|i| {
164                let point = &x[i * n_features..(i + 1) * n_features];
165                Self::euclidean_distance_squared(point, &centers[labels[i]])
166            })
167            .sum()
168    }
169
170    fn single_run(&self, x: &[f32], n_samples: usize, n_features: usize) -> (Vec<Vec<f32>>, Vec<usize>, f32, usize) {
171        // Initialize centers
172        let mut centers = match self.init {
173            KMeansInit::Random => self.init_random(x, n_samples, n_features),
174            KMeansInit::KMeansPlusPlus => self.init_kmeans_plusplus(x, n_samples, n_features),
175        };
176
177        let mut labels = self.assign_labels(x, &centers, n_samples, n_features);
178        let mut n_iter = 0;
179
180        for iter in 0..self.max_iter {
181            n_iter = iter + 1;
182
183            // Update centers
184            let new_centers = self.update_centers(x, &labels, n_samples, n_features);
185
186            // Check convergence
187            let center_shift: f32 = centers.iter()
188                .zip(new_centers.iter())
189                .map(|(old, new)| Self::euclidean_distance(old, new))
190                .sum();
191
192            centers = new_centers;
193
194            if center_shift < self.tol {
195                break;
196            }
197
198            // Reassign labels
199            labels = self.assign_labels(x, &centers, n_samples, n_features);
200        }
201
202        let inertia = self.compute_inertia(x, &labels, &centers, n_samples, n_features);
203
204        (centers, labels, inertia, n_iter)
205    }
206
207    pub fn fit(&mut self, x: &Tensor) {
208        let x_data = x.data_f32();
209        let n_samples = x.dims()[0];
210        let n_features = x.dims()[1];
211
212        let mut best_centers = None;
213        let mut best_labels = None;
214        let mut best_inertia = f32::INFINITY;
215        let mut best_n_iter = 0;
216
217        for _ in 0..self.n_init {
218            let (centers, labels, inertia, n_iter) = self.single_run(&x_data, n_samples, n_features);
219
220            if inertia < best_inertia {
221                best_inertia = inertia;
222                best_centers = Some(centers);
223                best_labels = Some(labels);
224                best_n_iter = n_iter;
225            }
226        }
227
228        self.cluster_centers_ = best_centers;
229        self.labels_ = best_labels;
230        self.inertia_ = Some(best_inertia);
231        self.n_iter_ = best_n_iter;
232    }
233
234    pub fn predict(&self, x: &Tensor) -> Tensor {
235        let x_data = x.data_f32();
236        let n_samples = x.dims()[0];
237        let n_features = x.dims()[1];
238
239        let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
240        let labels = self.assign_labels(&x_data, centers, n_samples, n_features);
241
242        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
243        Tensor::from_slice(&labels_f32, &[n_samples]).unwrap()
244    }
245
246    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
247        self.fit(x);
248        let labels = self.labels_.as_ref().unwrap();
249        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
250        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
251    }
252
253    pub fn transform(&self, x: &Tensor) -> Tensor {
254        let x_data = x.data_f32();
255        let n_samples = x.dims()[0];
256        let n_features = x.dims()[1];
257
258        let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
259
260        let mut distances = Vec::with_capacity(n_samples * self.n_clusters);
261        for i in 0..n_samples {
262            let point = &x_data[i * n_features..(i + 1) * n_features];
263            for center in centers {
264                distances.push(Self::euclidean_distance(point, center));
265            }
266        }
267
268        Tensor::from_slice(&distances, &[n_samples, self.n_clusters]).unwrap()
269    }
270}
271
272
/// DBSCAN - Density-Based Spatial Clustering of Applications with Noise
pub struct DBSCAN {
    /// Neighborhood radius: two points are neighbors if distance <= `eps`.
    pub eps: f32,
    /// Minimum neighborhood size (including the point itself) for a core point.
    pub min_samples: usize,
    /// Distance metric used for neighborhood queries.
    pub metric: DistanceMetric,
    /// Cluster id per sample after `fit`; -1 marks noise (`None` until `fit`).
    pub labels_: Option<Vec<i32>>,
    /// Indices of the core samples found during `fit` (`None` until `fit`).
    pub core_sample_indices_: Option<Vec<usize>>,
}
281
/// Pairwise distance functions supported by the clustering models.
#[derive(Clone, Copy, Debug)]
pub enum DistanceMetric {
    /// sqrt(sum((a_i - b_i)^2))
    Euclidean,
    /// sum(|a_i - b_i|)
    Manhattan,
    /// 1 - cosine similarity (an epsilon guards zero-norm vectors).
    Cosine,
}
288
/// Helper struct to group clustering parameters
struct ClusterParams<'a> {
    /// Row-major sample matrix (`n_samples` x `n_features`).
    x: &'a [f32],
    /// Per-sample cluster labels; -1 means unassigned/noise.
    labels: &'a mut [i32],
    /// Per-sample flag: true once the point's neighborhood has been queried.
    visited: &'a mut [bool],
    /// Number of rows in `x`.
    n_samples: usize,
    /// Number of columns per row in `x`.
    n_features: usize,
}
297
298impl DBSCAN {
299    pub fn new(eps: f32, min_samples: usize) -> Self {
300        DBSCAN {
301            eps,
302            min_samples,
303            metric: DistanceMetric::Euclidean,
304            labels_: None,
305            core_sample_indices_: None,
306        }
307    }
308
309    pub fn metric(mut self, metric: DistanceMetric) -> Self {
310        self.metric = metric;
311        self
312    }
313
314    fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
315        match self.metric {
316            DistanceMetric::Euclidean => {
317                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
318            }
319            DistanceMetric::Manhattan => {
320                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
321            }
322            DistanceMetric::Cosine => {
323                let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
324                let norm_a: f32 = a.iter().map(|&x| x * x).sum::<f32>().sqrt();
325                let norm_b: f32 = b.iter().map(|&x| x * x).sum::<f32>().sqrt();
326                1.0 - dot / (norm_a * norm_b + 1e-10)
327            }
328        }
329    }
330
331    fn region_query(&self, x: &[f32], point_idx: usize, n_samples: usize, n_features: usize) -> Vec<usize> {
332        let point = &x[point_idx * n_features..(point_idx + 1) * n_features];
333        
334        (0..n_samples)
335            .filter(|&i| {
336                let other = &x[i * n_features..(i + 1) * n_features];
337                self.distance(point, other) <= self.eps
338            })
339            .collect()
340    }
341
342    fn expand_cluster(
343        &self,
344        cluster_params: &mut ClusterParams,
345        point_idx: usize,
346        neighbors: Vec<usize>,
347        cluster_id: i32,
348    ) {
349        cluster_params.labels[point_idx] = cluster_id;
350
351        let mut seeds = neighbors;
352        let mut i = 0;
353
354        while i < seeds.len() {
355            let q = seeds[i];
356
357            if !cluster_params.visited[q] {
358                cluster_params.visited[q] = true;
359                let q_neighbors = self.region_query(
360                    cluster_params.x,
361                    q,
362                    cluster_params.n_samples,
363                    cluster_params.n_features
364                );
365
366                if q_neighbors.len() >= self.min_samples {
367                    for &neighbor in &q_neighbors {
368                        if !seeds.contains(&neighbor) {
369                            seeds.push(neighbor);
370                        }
371                    }
372                }
373            }
374
375            if cluster_params.labels[q] == -1 {
376                cluster_params.labels[q] = cluster_id;
377            }
378
379            i += 1;
380        }
381    }
382
383    pub fn fit(&mut self, x: &Tensor) {
384        let x_data = x.data_f32();
385        let n_samples = x.dims()[0];
386        let n_features = x.dims()[1];
387
388        let mut labels = vec![-1i32; n_samples];
389        let mut visited = vec![false; n_samples];
390        let mut core_samples = Vec::new();
391        let mut cluster_id = 0i32;
392
393        for i in 0..n_samples {
394            if visited[i] {
395                continue;
396            }
397
398            visited[i] = true;
399            let neighbors = self.region_query(&x_data, i, n_samples, n_features);
400
401            if neighbors.len() >= self.min_samples {
402                core_samples.push(i);
403                let mut params = ClusterParams {
404                    x: &x_data,
405                    labels: &mut labels,
406                    visited: &mut visited,
407                    n_samples,
408                    n_features,
409                };
410                self.expand_cluster(
411                    &mut params,
412                    i,
413                    neighbors,
414                    cluster_id,
415                );
416                cluster_id += 1;
417            }
418        }
419
420        self.labels_ = Some(labels);
421        self.core_sample_indices_ = Some(core_samples);
422    }
423
424    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
425        self.fit(x);
426        let labels = self.labels_.as_ref().unwrap();
427        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
428        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
429    }
430}
431
/// Agglomerative (Hierarchical) Clustering
pub struct AgglomerativeClustering {
    /// Target number of clusters; merging stops when this count is reached.
    pub n_clusters: usize,
    /// Linkage criterion used to score candidate merges.
    pub linkage: Linkage,
    /// Distance metric for the pairwise distance matrix.
    pub metric: DistanceMetric,
    /// Cluster index per sample (`None` until `fit`).
    pub labels_: Option<Vec<usize>>,
    /// Number of leaves (input samples) in the hierarchy.
    pub n_leaves_: usize,
    /// Number of clusters remaining after fitting.
    pub n_connected_components_: usize,
}
441
/// Linkage criteria for agglomerative clustering.
#[derive(Clone, Copy, Debug)]
pub enum Linkage {
    /// Minimum pairwise distance between the two clusters.
    Single,
    /// Maximum pairwise distance between the two clusters.
    Complete,
    /// Mean pairwise distance between the two clusters.
    Average,
    /// Ward's criterion: prefer the merge that minimizes the increase
    /// in within-cluster variance.
    Ward,
}
449
450impl AgglomerativeClustering {
451    pub fn new(n_clusters: usize) -> Self {
452        AgglomerativeClustering {
453            n_clusters,
454            linkage: Linkage::Ward,
455            metric: DistanceMetric::Euclidean,
456            labels_: None,
457            n_leaves_: 0,
458            n_connected_components_: 1,
459        }
460    }
461
462    pub fn linkage(mut self, linkage: Linkage) -> Self {
463        self.linkage = linkage;
464        self
465    }
466
467    fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
468        a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
469    }
470
471    fn compute_distance_matrix(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
472        let mut dist_matrix = vec![vec![0.0f32; n_samples]; n_samples];
473
474        for i in 0..n_samples {
475            for j in (i + 1)..n_samples {
476                let point_i = &x[i * n_features..(i + 1) * n_features];
477                let point_j = &x[j * n_features..(j + 1) * n_features];
478                let dist = Self::euclidean_distance(point_i, point_j);
479                dist_matrix[i][j] = dist;
480                dist_matrix[j][i] = dist;
481            }
482        }
483
484        dist_matrix
485    }
486
487    fn cluster_distance(
488        &self,
489        cluster_a: &[usize],
490        cluster_b: &[usize],
491        dist_matrix: &[Vec<f32>],
492        x: &[f32],
493        n_features: usize,
494    ) -> f32 {
495        match self.linkage {
496            Linkage::Single => {
497                let mut min_dist = f32::INFINITY;
498                for &i in cluster_a {
499                    for &j in cluster_b {
500                        min_dist = min_dist.min(dist_matrix[i][j]);
501                    }
502                }
503                min_dist
504            }
505            Linkage::Complete => {
506                let mut max_dist = 0.0f32;
507                for &i in cluster_a {
508                    for &j in cluster_b {
509                        max_dist = max_dist.max(dist_matrix[i][j]);
510                    }
511                }
512                max_dist
513            }
514            Linkage::Average => {
515                let mut sum = 0.0f32;
516                let count = cluster_a.len() * cluster_b.len();
517                for &i in cluster_a {
518                    for &j in cluster_b {
519                        sum += dist_matrix[i][j];
520                    }
521                }
522                sum / count as f32
523            }
524            Linkage::Ward => {
525                // Ward's method: minimize within-cluster variance
526                let n_a = cluster_a.len() as f32;
527                let n_b = cluster_b.len() as f32;
528
529                // Compute centroids
530                let mut centroid_a = vec![0.0f32; n_features];
531                let mut centroid_b = vec![0.0f32; n_features];
532
533                for &i in cluster_a {
534                    for j in 0..n_features {
535                        centroid_a[j] += x[i * n_features + j];
536                    }
537                }
538                for &i in cluster_b {
539                    for j in 0..n_features {
540                        centroid_b[j] += x[i * n_features + j];
541                    }
542                }
543
544                for j in 0..n_features {
545                    centroid_a[j] /= n_a;
546                    centroid_b[j] /= n_b;
547                }
548
549                let dist = Self::euclidean_distance(&centroid_a, &centroid_b);
550                (n_a * n_b / (n_a + n_b)) * dist * dist
551            }
552        }
553    }
554
555    pub fn fit(&mut self, x: &Tensor) {
556        let x_data = x.data_f32();
557        let n_samples = x.dims()[0];
558        let n_features = x.dims()[1];
559
560        self.n_leaves_ = n_samples;
561
562        // Compute initial distance matrix
563        let dist_matrix = self.compute_distance_matrix(&x_data, n_samples, n_features);
564
565        // Initialize clusters (each point is its own cluster)
566        let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
567
568        // Agglomerative clustering
569        while clusters.len() > self.n_clusters {
570            // Find closest pair of clusters
571            let mut min_dist = f32::INFINITY;
572            let mut merge_i = 0;
573            let mut merge_j = 1;
574
575            for i in 0..clusters.len() {
576                for j in (i + 1)..clusters.len() {
577                    let dist = self.cluster_distance(
578                        &clusters[i],
579                        &clusters[j],
580                        &dist_matrix,
581                        &x_data,
582                        n_features,
583                    );
584                    if dist < min_dist {
585                        min_dist = dist;
586                        merge_i = i;
587                        merge_j = j;
588                    }
589                }
590            }
591
592            // Merge clusters
593            let cluster_j = clusters.remove(merge_j);
594            clusters[merge_i].extend(cluster_j);
595        }
596
597        // Assign labels
598        let mut labels = vec![0usize; n_samples];
599        for (cluster_id, cluster) in clusters.iter().enumerate() {
600            for &point_idx in cluster {
601                labels[point_idx] = cluster_id;
602            }
603        }
604
605        self.labels_ = Some(labels);
606        self.n_connected_components_ = clusters.len();
607    }
608
609    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
610        self.fit(x);
611        let labels = self.labels_.as_ref().unwrap();
612        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
613        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
614    }
615}
616
#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-separated 2-D blobs: k=2 should recover them with tiny inertia.
    #[test]
    fn test_kmeans() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
            0.1, 0.1,
            0.2, 0.0,
            5.0, 5.0,
            5.1, 5.1,
            5.0, 5.2,
        ], &[6, 2]).unwrap();

        let mut kmeans = KMeans::new(2).n_init(3);
        let labels = kmeans.fit_predict(&x);

        // One label per sample; near-perfect clusters keep inertia small.
        assert_eq!(labels.dims(), &[6]);
        assert!(kmeans.inertia_.unwrap() < 1.0);
    }

    /// Two dense groups with eps=0.5: every point has a close neighbor,
    /// so fit_predict should emit one label per sample.
    #[test]
    fn test_dbscan() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
            0.1, 0.1,
            0.2, 0.0,
            5.0, 5.0,
            5.1, 5.1,
        ], &[5, 2]).unwrap();

        let mut dbscan = DBSCAN::new(0.5, 2);
        let labels = dbscan.fit_predict(&x);

        assert_eq!(labels.dims(), &[5]);
    }

    /// Merging four points down to two clusters yields one label per sample.
    #[test]
    fn test_agglomerative() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
            0.1, 0.1,
            5.0, 5.0,
            5.1, 5.1,
        ], &[4, 2]).unwrap();

        let mut agg = AgglomerativeClustering::new(2);
        let labels = agg.fit_predict(&x);

        assert_eq!(labels.dims(), &[4]);
    }
}
667
668