// ghostflow_ml/clustering.rs

//! Clustering algorithms - KMeans, DBSCAN, Agglomerative

use std::collections::HashSet;

use ghostflow_core::Tensor;
use rand::prelude::*;
use rayon::prelude::*;
/// K-Means clustering using Lloyd's algorithm
pub struct KMeans {
    /// Number of clusters (k) to form.
    pub n_clusters: usize,
    /// Maximum Lloyd iterations per single run.
    pub max_iter: usize,
    /// Convergence threshold on the total center movement between iterations.
    pub tol: f32,
    /// Number of independent restarts; the run with the lowest inertia wins.
    pub n_init: usize,
    /// Center initialization strategy.
    pub init: KMeansInit,
    /// Fitted centers: `n_clusters` rows of `n_features` values (set by `fit`).
    pub cluster_centers_: Option<Vec<Vec<f32>>>,
    /// Cluster index assigned to each training sample (set by `fit`).
    pub labels_: Option<Vec<usize>>,
    /// Sum of squared distances from samples to their assigned center (set by `fit`).
    pub inertia_: Option<f32>,
    /// Number of iterations used by the best run.
    pub n_iter_: usize,
}
19
#[derive(Clone, Copy, Debug)]
pub enum KMeansInit {
    /// Pick `n_clusters` distinct samples uniformly at random as initial centers.
    Random,
    /// k-means++ seeding: new centers are sampled proportional to squared
    /// distance from the nearest already-chosen center.
    KMeansPlusPlus,
}
25
26impl KMeans {
27    pub fn new(n_clusters: usize) -> Self {
28        KMeans {
29            n_clusters,
30            max_iter: 300,
31            tol: 1e-4,
32            n_init: 10,
33            init: KMeansInit::KMeansPlusPlus,
34            cluster_centers_: None,
35            labels_: None,
36            inertia_: None,
37            n_iter_: 0,
38        }
39    }
40
41    pub fn max_iter(mut self, n: usize) -> Self {
42        self.max_iter = n;
43        self
44    }
45
46    pub fn n_init(mut self, n: usize) -> Self {
47        self.n_init = n;
48        self
49    }
50
51    pub fn init(mut self, init: KMeansInit) -> Self {
52        self.init = init;
53        self
54    }
55
56    fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
57        a.iter()
58            .zip(b.iter())
59            .map(|(&ai, &bi)| (ai - bi).powi(2))
60            .sum::<f32>()
61            .sqrt()
62    }
63
64    fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
65        a.iter()
66            .zip(b.iter())
67            .map(|(&ai, &bi)| (ai - bi).powi(2))
68            .sum::<f32>()
69    }
70
71    fn init_random(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
72        let mut rng = thread_rng();
73        let indices: Vec<usize> = (0..n_samples)
74            .choose_multiple(&mut rng, self.n_clusters)
75            .into_iter()
76            .collect();
77
78        indices.iter()
79            .map(|&i| x[i * n_features..(i + 1) * n_features].to_vec())
80            .collect()
81    }
82
83    fn init_kmeans_plusplus(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
84        let mut rng = thread_rng();
85        let mut centers = Vec::with_capacity(self.n_clusters);
86
87        // Choose first center randomly
88        let first_idx = rng.gen_range(0..n_samples);
89        centers.push(x[first_idx * n_features..(first_idx + 1) * n_features].to_vec());
90
91        // Choose remaining centers
92        for _ in 1..self.n_clusters {
93            // Compute distances to nearest center
94            let distances: Vec<f32> = (0..n_samples)
95                .map(|i| {
96                    let point = &x[i * n_features..(i + 1) * n_features];
97                    centers.iter()
98                        .map(|c| Self::euclidean_distance_squared(point, c))
99                        .fold(f32::INFINITY, f32::min)
100                })
101                .collect();
102
103            // Sample proportional to distance squared
104            let total: f32 = distances.iter().sum();
105            let threshold = rng.gen::<f32>() * total;
106            
107            let mut cumsum = 0.0f32;
108            let mut chosen_idx = 0;
109            for (i, &d) in distances.iter().enumerate() {
110                cumsum += d;
111                if cumsum >= threshold {
112                    chosen_idx = i;
113                    break;
114                }
115            }
116
117            centers.push(x[chosen_idx * n_features..(chosen_idx + 1) * n_features].to_vec());
118        }
119
120        centers
121    }
122
123    fn assign_labels(&self, x: &[f32], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> Vec<usize> {
124        (0..n_samples)
125            .into_par_iter()
126            .map(|i| {
127                let point = &x[i * n_features..(i + 1) * n_features];
128                centers.iter()
129                    .enumerate()
130                    .map(|(c, center)| (c, Self::euclidean_distance_squared(point, center)))
131                    .min_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap())
132                    .map(|(c, _)| c)
133                    .unwrap_or(0)
134            })
135            .collect()
136    }
137
138    fn update_centers(&self, x: &[f32], labels: &[usize], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
139        let mut new_centers = vec![vec![0.0f32; n_features]; self.n_clusters];
140        let mut counts = vec![0usize; self.n_clusters];
141
142        for i in 0..n_samples {
143            let label = labels[i];
144            counts[label] += 1;
145            for j in 0..n_features {
146                new_centers[label][j] += x[i * n_features + j];
147            }
148        }
149
150        for c in 0..self.n_clusters {
151            if counts[c] > 0 {
152                for j in 0..n_features {
153                    new_centers[c][j] /= counts[c] as f32;
154                }
155            }
156        }
157
158        new_centers
159    }
160
161    fn compute_inertia(&self, x: &[f32], labels: &[usize], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> f32 {
162        (0..n_samples)
163            .map(|i| {
164                let point = &x[i * n_features..(i + 1) * n_features];
165                Self::euclidean_distance_squared(point, &centers[labels[i]])
166            })
167            .sum()
168    }
169
170    fn single_run(&self, x: &[f32], n_samples: usize, n_features: usize) -> (Vec<Vec<f32>>, Vec<usize>, f32, usize) {
171        // Initialize centers
172        let mut centers = match self.init {
173            KMeansInit::Random => self.init_random(x, n_samples, n_features),
174            KMeansInit::KMeansPlusPlus => self.init_kmeans_plusplus(x, n_samples, n_features),
175        };
176
177        let mut labels = self.assign_labels(x, &centers, n_samples, n_features);
178        let mut n_iter = 0;
179
180        for iter in 0..self.max_iter {
181            n_iter = iter + 1;
182
183            // Update centers
184            let new_centers = self.update_centers(x, &labels, n_samples, n_features);
185
186            // Check convergence
187            let center_shift: f32 = centers.iter()
188                .zip(new_centers.iter())
189                .map(|(old, new)| Self::euclidean_distance(old, new))
190                .sum();
191
192            centers = new_centers;
193
194            if center_shift < self.tol {
195                break;
196            }
197
198            // Reassign labels
199            labels = self.assign_labels(x, &centers, n_samples, n_features);
200        }
201
202        let inertia = self.compute_inertia(x, &labels, &centers, n_samples, n_features);
203
204        (centers, labels, inertia, n_iter)
205    }
206
207    pub fn fit(&mut self, x: &Tensor) {
208        let x_data = x.data_f32();
209        let n_samples = x.dims()[0];
210        let n_features = x.dims()[1];
211
212        let mut best_centers = None;
213        let mut best_labels = None;
214        let mut best_inertia = f32::INFINITY;
215        let mut best_n_iter = 0;
216
217        for _ in 0..self.n_init {
218            let (centers, labels, inertia, n_iter) = self.single_run(&x_data, n_samples, n_features);
219
220            if inertia < best_inertia {
221                best_inertia = inertia;
222                best_centers = Some(centers);
223                best_labels = Some(labels);
224                best_n_iter = n_iter;
225            }
226        }
227
228        self.cluster_centers_ = best_centers;
229        self.labels_ = best_labels;
230        self.inertia_ = Some(best_inertia);
231        self.n_iter_ = best_n_iter;
232    }
233
234    pub fn predict(&self, x: &Tensor) -> Tensor {
235        let x_data = x.data_f32();
236        let n_samples = x.dims()[0];
237        let n_features = x.dims()[1];
238
239        let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
240        let labels = self.assign_labels(&x_data, centers, n_samples, n_features);
241
242        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
243        Tensor::from_slice(&labels_f32, &[n_samples]).unwrap()
244    }
245
246    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
247        self.fit(x);
248        let labels = self.labels_.as_ref().unwrap();
249        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
250        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
251    }
252
253    pub fn transform(&self, x: &Tensor) -> Tensor {
254        let x_data = x.data_f32();
255        let n_samples = x.dims()[0];
256        let n_features = x.dims()[1];
257
258        let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
259
260        let mut distances = Vec::with_capacity(n_samples * self.n_clusters);
261        for i in 0..n_samples {
262            let point = &x_data[i * n_features..(i + 1) * n_features];
263            for center in centers {
264                distances.push(Self::euclidean_distance(point, center));
265            }
266        }
267
268        Tensor::from_slice(&distances, &[n_samples, self.n_clusters]).unwrap()
269    }
270}
271
272
/// DBSCAN - Density-Based Spatial Clustering of Applications with Noise
pub struct DBSCAN {
    /// Neighborhood radius: points within distance `eps` are neighbors.
    pub eps: f32,
    /// Minimum neighborhood size (the query point counts itself) for a core point.
    pub min_samples: usize,
    /// Distance metric used for neighborhood queries.
    pub metric: DistanceMetric,
    /// Per-sample cluster ids after `fit`; -1 marks noise.
    pub labels_: Option<Vec<i32>>,
    /// Indices of core samples discovered by `fit`.
    pub core_sample_indices_: Option<Vec<usize>>,
}
281
#[derive(Clone, Copy, Debug)]
pub enum DistanceMetric {
    /// L2 (straight-line) distance.
    Euclidean,
    /// L1 (city-block) distance.
    Manhattan,
    /// Cosine distance: 1 - cosine similarity.
    Cosine,
}
288
289impl DBSCAN {
290    pub fn new(eps: f32, min_samples: usize) -> Self {
291        DBSCAN {
292            eps,
293            min_samples,
294            metric: DistanceMetric::Euclidean,
295            labels_: None,
296            core_sample_indices_: None,
297        }
298    }
299
300    pub fn metric(mut self, metric: DistanceMetric) -> Self {
301        self.metric = metric;
302        self
303    }
304
305    fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
306        match self.metric {
307            DistanceMetric::Euclidean => {
308                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
309            }
310            DistanceMetric::Manhattan => {
311                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
312            }
313            DistanceMetric::Cosine => {
314                let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
315                let norm_a: f32 = a.iter().map(|&x| x * x).sum::<f32>().sqrt();
316                let norm_b: f32 = b.iter().map(|&x| x * x).sum::<f32>().sqrt();
317                1.0 - dot / (norm_a * norm_b + 1e-10)
318            }
319        }
320    }
321
322    fn region_query(&self, x: &[f32], point_idx: usize, n_samples: usize, n_features: usize) -> Vec<usize> {
323        let point = &x[point_idx * n_features..(point_idx + 1) * n_features];
324        
325        (0..n_samples)
326            .filter(|&i| {
327                let other = &x[i * n_features..(i + 1) * n_features];
328                self.distance(point, other) <= self.eps
329            })
330            .collect()
331    }
332
333    fn expand_cluster(
334        &self,
335        x: &[f32],
336        labels: &mut [i32],
337        point_idx: usize,
338        neighbors: Vec<usize>,
339        cluster_id: i32,
340        n_samples: usize,
341        n_features: usize,
342        visited: &mut [bool],
343    ) {
344        labels[point_idx] = cluster_id;
345
346        let mut seeds = neighbors;
347        let mut i = 0;
348
349        while i < seeds.len() {
350            let q = seeds[i];
351
352            if !visited[q] {
353                visited[q] = true;
354                let q_neighbors = self.region_query(x, q, n_samples, n_features);
355
356                if q_neighbors.len() >= self.min_samples {
357                    for &neighbor in &q_neighbors {
358                        if !seeds.contains(&neighbor) {
359                            seeds.push(neighbor);
360                        }
361                    }
362                }
363            }
364
365            if labels[q] == -1 {
366                labels[q] = cluster_id;
367            }
368
369            i += 1;
370        }
371    }
372
373    pub fn fit(&mut self, x: &Tensor) {
374        let x_data = x.data_f32();
375        let n_samples = x.dims()[0];
376        let n_features = x.dims()[1];
377
378        let mut labels = vec![-1i32; n_samples];
379        let mut visited = vec![false; n_samples];
380        let mut core_samples = Vec::new();
381        let mut cluster_id = 0i32;
382
383        for i in 0..n_samples {
384            if visited[i] {
385                continue;
386            }
387
388            visited[i] = true;
389            let neighbors = self.region_query(&x_data, i, n_samples, n_features);
390
391            if neighbors.len() >= self.min_samples {
392                core_samples.push(i);
393                self.expand_cluster(
394                    &x_data,
395                    &mut labels,
396                    i,
397                    neighbors,
398                    cluster_id,
399                    n_samples,
400                    n_features,
401                    &mut visited,
402                );
403                cluster_id += 1;
404            }
405        }
406
407        self.labels_ = Some(labels);
408        self.core_sample_indices_ = Some(core_samples);
409    }
410
411    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
412        self.fit(x);
413        let labels = self.labels_.as_ref().unwrap();
414        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
415        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
416    }
417}
418
/// Agglomerative (Hierarchical) Clustering
pub struct AgglomerativeClustering {
    /// Number of clusters to stop merging at.
    pub n_clusters: usize,
    /// Linkage criterion used to measure inter-cluster distance.
    pub linkage: Linkage,
    /// Distance metric (defaults to Euclidean).
    pub metric: DistanceMetric,
    /// Per-sample cluster ids after `fit`.
    pub labels_: Option<Vec<usize>>,
    /// Number of leaves (input samples), set by `fit`.
    pub n_leaves_: usize,
    /// Number of clusters remaining after `fit`.
    pub n_connected_components_: usize,
}
428
#[derive(Clone, Copy, Debug)]
pub enum Linkage {
    /// Minimum pairwise distance between members (nearest neighbor).
    Single,
    /// Maximum pairwise distance between members (furthest neighbor).
    Complete,
    /// Mean of all pairwise member distances.
    Average,
    /// Minimize the increase in within-cluster variance (centroid-based).
    Ward,
}
436
437impl AgglomerativeClustering {
438    pub fn new(n_clusters: usize) -> Self {
439        AgglomerativeClustering {
440            n_clusters,
441            linkage: Linkage::Ward,
442            metric: DistanceMetric::Euclidean,
443            labels_: None,
444            n_leaves_: 0,
445            n_connected_components_: 1,
446        }
447    }
448
449    pub fn linkage(mut self, linkage: Linkage) -> Self {
450        self.linkage = linkage;
451        self
452    }
453
454    fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
455        a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
456    }
457
458    fn compute_distance_matrix(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
459        let mut dist_matrix = vec![vec![0.0f32; n_samples]; n_samples];
460
461        for i in 0..n_samples {
462            for j in (i + 1)..n_samples {
463                let point_i = &x[i * n_features..(i + 1) * n_features];
464                let point_j = &x[j * n_features..(j + 1) * n_features];
465                let dist = Self::euclidean_distance(point_i, point_j);
466                dist_matrix[i][j] = dist;
467                dist_matrix[j][i] = dist;
468            }
469        }
470
471        dist_matrix
472    }
473
474    fn cluster_distance(
475        &self,
476        cluster_a: &[usize],
477        cluster_b: &[usize],
478        dist_matrix: &[Vec<f32>],
479        x: &[f32],
480        n_features: usize,
481    ) -> f32 {
482        match self.linkage {
483            Linkage::Single => {
484                let mut min_dist = f32::INFINITY;
485                for &i in cluster_a {
486                    for &j in cluster_b {
487                        min_dist = min_dist.min(dist_matrix[i][j]);
488                    }
489                }
490                min_dist
491            }
492            Linkage::Complete => {
493                let mut max_dist = 0.0f32;
494                for &i in cluster_a {
495                    for &j in cluster_b {
496                        max_dist = max_dist.max(dist_matrix[i][j]);
497                    }
498                }
499                max_dist
500            }
501            Linkage::Average => {
502                let mut sum = 0.0f32;
503                let count = cluster_a.len() * cluster_b.len();
504                for &i in cluster_a {
505                    for &j in cluster_b {
506                        sum += dist_matrix[i][j];
507                    }
508                }
509                sum / count as f32
510            }
511            Linkage::Ward => {
512                // Ward's method: minimize within-cluster variance
513                let n_a = cluster_a.len() as f32;
514                let n_b = cluster_b.len() as f32;
515
516                // Compute centroids
517                let mut centroid_a = vec![0.0f32; n_features];
518                let mut centroid_b = vec![0.0f32; n_features];
519
520                for &i in cluster_a {
521                    for j in 0..n_features {
522                        centroid_a[j] += x[i * n_features + j];
523                    }
524                }
525                for &i in cluster_b {
526                    for j in 0..n_features {
527                        centroid_b[j] += x[i * n_features + j];
528                    }
529                }
530
531                for j in 0..n_features {
532                    centroid_a[j] /= n_a;
533                    centroid_b[j] /= n_b;
534                }
535
536                let dist = Self::euclidean_distance(&centroid_a, &centroid_b);
537                (n_a * n_b / (n_a + n_b)) * dist * dist
538            }
539        }
540    }
541
542    pub fn fit(&mut self, x: &Tensor) {
543        let x_data = x.data_f32();
544        let n_samples = x.dims()[0];
545        let n_features = x.dims()[1];
546
547        self.n_leaves_ = n_samples;
548
549        // Compute initial distance matrix
550        let dist_matrix = self.compute_distance_matrix(&x_data, n_samples, n_features);
551
552        // Initialize clusters (each point is its own cluster)
553        let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
554
555        // Agglomerative clustering
556        while clusters.len() > self.n_clusters {
557            // Find closest pair of clusters
558            let mut min_dist = f32::INFINITY;
559            let mut merge_i = 0;
560            let mut merge_j = 1;
561
562            for i in 0..clusters.len() {
563                for j in (i + 1)..clusters.len() {
564                    let dist = self.cluster_distance(
565                        &clusters[i],
566                        &clusters[j],
567                        &dist_matrix,
568                        &x_data,
569                        n_features,
570                    );
571                    if dist < min_dist {
572                        min_dist = dist;
573                        merge_i = i;
574                        merge_j = j;
575                    }
576                }
577            }
578
579            // Merge clusters
580            let cluster_j = clusters.remove(merge_j);
581            clusters[merge_i].extend(cluster_j);
582        }
583
584        // Assign labels
585        let mut labels = vec![0usize; n_samples];
586        for (cluster_id, cluster) in clusters.iter().enumerate() {
587            for &point_idx in cluster {
588                labels[point_idx] = cluster_id;
589            }
590        }
591
592        self.labels_ = Some(labels);
593        self.n_connected_components_ = clusters.len();
594    }
595
596    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
597        self.fit(x);
598        let labels = self.labels_.as_ref().unwrap();
599        let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
600        Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
601    }
602}
603
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_kmeans() {
        // Two tight, well-separated groups of three points each.
        let data = [
            0.0, 0.0, 0.1, 0.1, 0.2, 0.0, // group near the origin
            5.0, 5.0, 5.1, 5.1, 5.0, 5.2, // group near (5, 5)
        ];
        let x = Tensor::from_slice(&data, &[6, 2]).unwrap();

        let mut model = KMeans::new(2).n_init(3);
        let assignments = model.fit_predict(&x);

        assert_eq!(assignments.dims(), &[6]);
        // Tight groups => tiny within-cluster scatter.
        assert!(model.inertia_.unwrap() < 1.0);
    }

    #[test]
    fn test_dbscan() {
        // Three dense points near the origin plus a distant pair.
        let data = [0.0, 0.0, 0.1, 0.1, 0.2, 0.0, 5.0, 5.0, 5.1, 5.1];
        let x = Tensor::from_slice(&data, &[5, 2]).unwrap();

        let mut model = DBSCAN::new(0.5, 2);
        let assignments = model.fit_predict(&x);

        assert_eq!(assignments.dims(), &[5]);
    }

    #[test]
    fn test_agglomerative() {
        // Two pairs of nearby points, far apart from each other.
        let data = [0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1];
        let x = Tensor::from_slice(&data, &[4, 2]).unwrap();

        let mut model = AgglomerativeClustering::new(2);
        let assignments = model.fit_predict(&x);

        assert_eq!(assignments.dims(), &[4]);
    }
}