1use ghostflow_core::Tensor;
4use rayon::prelude::*;
5use rand::prelude::*;
6
/// K-Means clustering (Lloyd's algorithm) with configurable initialization
/// and multiple random restarts.
///
/// Fields suffixed with `_` follow the scikit-learn convention: they are
/// `None`/zero until `fit` has been called.
pub struct KMeans {
    /// Number of clusters (k) to form.
    pub n_clusters: usize,
    /// Maximum Lloyd iterations per single run.
    pub max_iter: usize,
    /// Convergence threshold on the summed center movement between iterations.
    pub tol: f32,
    /// Number of independent runs; the run with the lowest inertia wins.
    pub n_init: usize,
    /// Center-initialization strategy.
    pub init: KMeansInit,
    /// Fitted cluster centers, one `Vec<f32>` of length n_features per cluster.
    pub cluster_centers_: Option<Vec<Vec<f32>>>,
    /// Cluster index assigned to each training sample.
    pub labels_: Option<Vec<usize>>,
    /// Sum of squared distances of samples to their closest center (best run).
    pub inertia_: Option<f32>,
    /// Iterations executed by the best run.
    pub n_iter_: usize,
}
19
/// Strategy used to pick the initial cluster centers.
#[derive(Clone, Copy, Debug)]
pub enum KMeansInit {
    /// Uniform-random choice of distinct sample rows.
    Random,
    /// k-means++ seeding: later centers are sampled proportionally to their
    /// squared distance from the nearest already-chosen center.
    KMeansPlusPlus,
}
25
26impl KMeans {
27 pub fn new(n_clusters: usize) -> Self {
28 KMeans {
29 n_clusters,
30 max_iter: 300,
31 tol: 1e-4,
32 n_init: 10,
33 init: KMeansInit::KMeansPlusPlus,
34 cluster_centers_: None,
35 labels_: None,
36 inertia_: None,
37 n_iter_: 0,
38 }
39 }
40
41 pub fn max_iter(mut self, n: usize) -> Self {
42 self.max_iter = n;
43 self
44 }
45
46 pub fn n_init(mut self, n: usize) -> Self {
47 self.n_init = n;
48 self
49 }
50
51 pub fn init(mut self, init: KMeansInit) -> Self {
52 self.init = init;
53 self
54 }
55
56 fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
57 a.iter()
58 .zip(b.iter())
59 .map(|(&ai, &bi)| (ai - bi).powi(2))
60 .sum::<f32>()
61 .sqrt()
62 }
63
64 fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
65 a.iter()
66 .zip(b.iter())
67 .map(|(&ai, &bi)| (ai - bi).powi(2))
68 .sum::<f32>()
69 }
70
71 fn init_random(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
72 let mut rng = thread_rng();
73 let indices: Vec<usize> = (0..n_samples)
74 .choose_multiple(&mut rng, self.n_clusters)
75 .into_iter()
76 .collect();
77
78 indices.iter()
79 .map(|&i| x[i * n_features..(i + 1) * n_features].to_vec())
80 .collect()
81 }
82
83 fn init_kmeans_plusplus(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
84 let mut rng = thread_rng();
85 let mut centers = Vec::with_capacity(self.n_clusters);
86
87 let first_idx = rng.gen_range(0..n_samples);
89 centers.push(x[first_idx * n_features..(first_idx + 1) * n_features].to_vec());
90
91 for _ in 1..self.n_clusters {
93 let distances: Vec<f32> = (0..n_samples)
95 .map(|i| {
96 let point = &x[i * n_features..(i + 1) * n_features];
97 centers.iter()
98 .map(|c| Self::euclidean_distance_squared(point, c))
99 .fold(f32::INFINITY, f32::min)
100 })
101 .collect();
102
103 let total: f32 = distances.iter().sum();
105 let threshold = rng.gen::<f32>() * total;
106
107 let mut cumsum = 0.0f32;
108 let mut chosen_idx = 0;
109 for (i, &d) in distances.iter().enumerate() {
110 cumsum += d;
111 if cumsum >= threshold {
112 chosen_idx = i;
113 break;
114 }
115 }
116
117 centers.push(x[chosen_idx * n_features..(chosen_idx + 1) * n_features].to_vec());
118 }
119
120 centers
121 }
122
123 fn assign_labels(&self, x: &[f32], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> Vec<usize> {
124 (0..n_samples)
125 .into_par_iter()
126 .map(|i| {
127 let point = &x[i * n_features..(i + 1) * n_features];
128 centers.iter()
129 .enumerate()
130 .map(|(c, center)| (c, Self::euclidean_distance_squared(point, center)))
131 .min_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap())
132 .map(|(c, _)| c)
133 .unwrap_or(0)
134 })
135 .collect()
136 }
137
138 fn update_centers(&self, x: &[f32], labels: &[usize], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
139 let mut new_centers = vec![vec![0.0f32; n_features]; self.n_clusters];
140 let mut counts = vec![0usize; self.n_clusters];
141
142 for i in 0..n_samples {
143 let label = labels[i];
144 counts[label] += 1;
145 for j in 0..n_features {
146 new_centers[label][j] += x[i * n_features + j];
147 }
148 }
149
150 for c in 0..self.n_clusters {
151 if counts[c] > 0 {
152 for j in 0..n_features {
153 new_centers[c][j] /= counts[c] as f32;
154 }
155 }
156 }
157
158 new_centers
159 }
160
161 fn compute_inertia(&self, x: &[f32], labels: &[usize], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> f32 {
162 (0..n_samples)
163 .map(|i| {
164 let point = &x[i * n_features..(i + 1) * n_features];
165 Self::euclidean_distance_squared(point, ¢ers[labels[i]])
166 })
167 .sum()
168 }
169
170 fn single_run(&self, x: &[f32], n_samples: usize, n_features: usize) -> (Vec<Vec<f32>>, Vec<usize>, f32, usize) {
171 let mut centers = match self.init {
173 KMeansInit::Random => self.init_random(x, n_samples, n_features),
174 KMeansInit::KMeansPlusPlus => self.init_kmeans_plusplus(x, n_samples, n_features),
175 };
176
177 let mut labels = self.assign_labels(x, ¢ers, n_samples, n_features);
178 let mut n_iter = 0;
179
180 for iter in 0..self.max_iter {
181 n_iter = iter + 1;
182
183 let new_centers = self.update_centers(x, &labels, n_samples, n_features);
185
186 let center_shift: f32 = centers.iter()
188 .zip(new_centers.iter())
189 .map(|(old, new)| Self::euclidean_distance(old, new))
190 .sum();
191
192 centers = new_centers;
193
194 if center_shift < self.tol {
195 break;
196 }
197
198 labels = self.assign_labels(x, ¢ers, n_samples, n_features);
200 }
201
202 let inertia = self.compute_inertia(x, &labels, ¢ers, n_samples, n_features);
203
204 (centers, labels, inertia, n_iter)
205 }
206
207 pub fn fit(&mut self, x: &Tensor) {
208 let x_data = x.data_f32();
209 let n_samples = x.dims()[0];
210 let n_features = x.dims()[1];
211
212 let mut best_centers = None;
213 let mut best_labels = None;
214 let mut best_inertia = f32::INFINITY;
215 let mut best_n_iter = 0;
216
217 for _ in 0..self.n_init {
218 let (centers, labels, inertia, n_iter) = self.single_run(&x_data, n_samples, n_features);
219
220 if inertia < best_inertia {
221 best_inertia = inertia;
222 best_centers = Some(centers);
223 best_labels = Some(labels);
224 best_n_iter = n_iter;
225 }
226 }
227
228 self.cluster_centers_ = best_centers;
229 self.labels_ = best_labels;
230 self.inertia_ = Some(best_inertia);
231 self.n_iter_ = best_n_iter;
232 }
233
234 pub fn predict(&self, x: &Tensor) -> Tensor {
235 let x_data = x.data_f32();
236 let n_samples = x.dims()[0];
237 let n_features = x.dims()[1];
238
239 let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
240 let labels = self.assign_labels(&x_data, centers, n_samples, n_features);
241
242 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
243 Tensor::from_slice(&labels_f32, &[n_samples]).unwrap()
244 }
245
246 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
247 self.fit(x);
248 let labels = self.labels_.as_ref().unwrap();
249 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
250 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
251 }
252
253 pub fn transform(&self, x: &Tensor) -> Tensor {
254 let x_data = x.data_f32();
255 let n_samples = x.dims()[0];
256 let n_features = x.dims()[1];
257
258 let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
259
260 let mut distances = Vec::with_capacity(n_samples * self.n_clusters);
261 for i in 0..n_samples {
262 let point = &x_data[i * n_features..(i + 1) * n_features];
263 for center in centers {
264 distances.push(Self::euclidean_distance(point, center));
265 }
266 }
267
268 Tensor::from_slice(&distances, &[n_samples, self.n_clusters]).unwrap()
269 }
270}
271
272
/// Density-Based Spatial Clustering of Applications with Noise.
///
/// Fields suffixed with `_` are populated by `fit`; noise points receive
/// label -1.
pub struct DBSCAN {
    /// Neighborhood radius: two points are neighbors when distance <= eps.
    pub eps: f32,
    /// Minimum neighborhood size (including the point itself) for a core point.
    pub min_samples: usize,
    /// Distance metric used by neighborhood queries.
    pub metric: DistanceMetric,
    /// Cluster label per sample after `fit` (-1 = noise).
    pub labels_: Option<Vec<i32>>,
    /// Indices of the core samples found by `fit`.
    pub core_sample_indices_: Option<Vec<usize>>,
}
281
/// Pairwise distance metric used by DBSCAN (and available to other estimators).
#[derive(Clone, Copy, Debug)]
pub enum DistanceMetric {
    /// L2 distance.
    Euclidean,
    /// L1 (city-block) distance.
    Manhattan,
    /// 1 - cosine similarity (small epsilon guards division by zero).
    Cosine,
}
288
289impl DBSCAN {
290 pub fn new(eps: f32, min_samples: usize) -> Self {
291 DBSCAN {
292 eps,
293 min_samples,
294 metric: DistanceMetric::Euclidean,
295 labels_: None,
296 core_sample_indices_: None,
297 }
298 }
299
300 pub fn metric(mut self, metric: DistanceMetric) -> Self {
301 self.metric = metric;
302 self
303 }
304
305 fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
306 match self.metric {
307 DistanceMetric::Euclidean => {
308 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
309 }
310 DistanceMetric::Manhattan => {
311 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
312 }
313 DistanceMetric::Cosine => {
314 let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
315 let norm_a: f32 = a.iter().map(|&x| x * x).sum::<f32>().sqrt();
316 let norm_b: f32 = b.iter().map(|&x| x * x).sum::<f32>().sqrt();
317 1.0 - dot / (norm_a * norm_b + 1e-10)
318 }
319 }
320 }
321
322 fn region_query(&self, x: &[f32], point_idx: usize, n_samples: usize, n_features: usize) -> Vec<usize> {
323 let point = &x[point_idx * n_features..(point_idx + 1) * n_features];
324
325 (0..n_samples)
326 .filter(|&i| {
327 let other = &x[i * n_features..(i + 1) * n_features];
328 self.distance(point, other) <= self.eps
329 })
330 .collect()
331 }
332
333 fn expand_cluster(
334 &self,
335 x: &[f32],
336 labels: &mut [i32],
337 point_idx: usize,
338 neighbors: Vec<usize>,
339 cluster_id: i32,
340 n_samples: usize,
341 n_features: usize,
342 visited: &mut [bool],
343 ) {
344 labels[point_idx] = cluster_id;
345
346 let mut seeds = neighbors;
347 let mut i = 0;
348
349 while i < seeds.len() {
350 let q = seeds[i];
351
352 if !visited[q] {
353 visited[q] = true;
354 let q_neighbors = self.region_query(x, q, n_samples, n_features);
355
356 if q_neighbors.len() >= self.min_samples {
357 for &neighbor in &q_neighbors {
358 if !seeds.contains(&neighbor) {
359 seeds.push(neighbor);
360 }
361 }
362 }
363 }
364
365 if labels[q] == -1 {
366 labels[q] = cluster_id;
367 }
368
369 i += 1;
370 }
371 }
372
373 pub fn fit(&mut self, x: &Tensor) {
374 let x_data = x.data_f32();
375 let n_samples = x.dims()[0];
376 let n_features = x.dims()[1];
377
378 let mut labels = vec![-1i32; n_samples];
379 let mut visited = vec![false; n_samples];
380 let mut core_samples = Vec::new();
381 let mut cluster_id = 0i32;
382
383 for i in 0..n_samples {
384 if visited[i] {
385 continue;
386 }
387
388 visited[i] = true;
389 let neighbors = self.region_query(&x_data, i, n_samples, n_features);
390
391 if neighbors.len() >= self.min_samples {
392 core_samples.push(i);
393 self.expand_cluster(
394 &x_data,
395 &mut labels,
396 i,
397 neighbors,
398 cluster_id,
399 n_samples,
400 n_features,
401 &mut visited,
402 );
403 cluster_id += 1;
404 }
405 }
406
407 self.labels_ = Some(labels);
408 self.core_sample_indices_ = Some(core_samples);
409 }
410
411 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
412 self.fit(x);
413 let labels = self.labels_.as_ref().unwrap();
414 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
415 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
416 }
417}
418
/// Bottom-up hierarchical clustering: starts with one cluster per sample and
/// repeatedly merges the closest pair until `n_clusters` remain.
///
/// Fields suffixed with `_` are populated by `fit`.
pub struct AgglomerativeClustering {
    /// Number of clusters to stop merging at.
    pub n_clusters: usize,
    /// Linkage criterion used to score candidate merges.
    pub linkage: Linkage,
    /// Distance metric (currently only Euclidean is used internally).
    pub metric: DistanceMetric,
    /// Cluster label per sample after `fit`.
    pub labels_: Option<Vec<usize>>,
    /// Number of leaves in the hierarchy (= n_samples).
    pub n_leaves_: usize,
    /// Number of clusters actually produced by `fit`.
    pub n_connected_components_: usize,
}
428
/// Linkage criterion: how the distance between two clusters is defined.
#[derive(Clone, Copy, Debug)]
pub enum Linkage {
    /// Minimum pairwise point distance.
    Single,
    /// Maximum pairwise point distance.
    Complete,
    /// Mean pairwise point distance.
    Average,
    /// Weighted squared distance between cluster centroids.
    Ward,
}
436
437impl AgglomerativeClustering {
438 pub fn new(n_clusters: usize) -> Self {
439 AgglomerativeClustering {
440 n_clusters,
441 linkage: Linkage::Ward,
442 metric: DistanceMetric::Euclidean,
443 labels_: None,
444 n_leaves_: 0,
445 n_connected_components_: 1,
446 }
447 }
448
449 pub fn linkage(mut self, linkage: Linkage) -> Self {
450 self.linkage = linkage;
451 self
452 }
453
454 fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
455 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
456 }
457
458 fn compute_distance_matrix(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
459 let mut dist_matrix = vec![vec![0.0f32; n_samples]; n_samples];
460
461 for i in 0..n_samples {
462 for j in (i + 1)..n_samples {
463 let point_i = &x[i * n_features..(i + 1) * n_features];
464 let point_j = &x[j * n_features..(j + 1) * n_features];
465 let dist = Self::euclidean_distance(point_i, point_j);
466 dist_matrix[i][j] = dist;
467 dist_matrix[j][i] = dist;
468 }
469 }
470
471 dist_matrix
472 }
473
474 fn cluster_distance(
475 &self,
476 cluster_a: &[usize],
477 cluster_b: &[usize],
478 dist_matrix: &[Vec<f32>],
479 x: &[f32],
480 n_features: usize,
481 ) -> f32 {
482 match self.linkage {
483 Linkage::Single => {
484 let mut min_dist = f32::INFINITY;
485 for &i in cluster_a {
486 for &j in cluster_b {
487 min_dist = min_dist.min(dist_matrix[i][j]);
488 }
489 }
490 min_dist
491 }
492 Linkage::Complete => {
493 let mut max_dist = 0.0f32;
494 for &i in cluster_a {
495 for &j in cluster_b {
496 max_dist = max_dist.max(dist_matrix[i][j]);
497 }
498 }
499 max_dist
500 }
501 Linkage::Average => {
502 let mut sum = 0.0f32;
503 let count = cluster_a.len() * cluster_b.len();
504 for &i in cluster_a {
505 for &j in cluster_b {
506 sum += dist_matrix[i][j];
507 }
508 }
509 sum / count as f32
510 }
511 Linkage::Ward => {
512 let n_a = cluster_a.len() as f32;
514 let n_b = cluster_b.len() as f32;
515
516 let mut centroid_a = vec![0.0f32; n_features];
518 let mut centroid_b = vec![0.0f32; n_features];
519
520 for &i in cluster_a {
521 for j in 0..n_features {
522 centroid_a[j] += x[i * n_features + j];
523 }
524 }
525 for &i in cluster_b {
526 for j in 0..n_features {
527 centroid_b[j] += x[i * n_features + j];
528 }
529 }
530
531 for j in 0..n_features {
532 centroid_a[j] /= n_a;
533 centroid_b[j] /= n_b;
534 }
535
536 let dist = Self::euclidean_distance(¢roid_a, ¢roid_b);
537 (n_a * n_b / (n_a + n_b)) * dist * dist
538 }
539 }
540 }
541
542 pub fn fit(&mut self, x: &Tensor) {
543 let x_data = x.data_f32();
544 let n_samples = x.dims()[0];
545 let n_features = x.dims()[1];
546
547 self.n_leaves_ = n_samples;
548
549 let dist_matrix = self.compute_distance_matrix(&x_data, n_samples, n_features);
551
552 let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
554
555 while clusters.len() > self.n_clusters {
557 let mut min_dist = f32::INFINITY;
559 let mut merge_i = 0;
560 let mut merge_j = 1;
561
562 for i in 0..clusters.len() {
563 for j in (i + 1)..clusters.len() {
564 let dist = self.cluster_distance(
565 &clusters[i],
566 &clusters[j],
567 &dist_matrix,
568 &x_data,
569 n_features,
570 );
571 if dist < min_dist {
572 min_dist = dist;
573 merge_i = i;
574 merge_j = j;
575 }
576 }
577 }
578
579 let cluster_j = clusters.remove(merge_j);
581 clusters[merge_i].extend(cluster_j);
582 }
583
584 let mut labels = vec![0usize; n_samples];
586 for (cluster_id, cluster) in clusters.iter().enumerate() {
587 for &point_idx in cluster {
588 labels[point_idx] = cluster_id;
589 }
590 }
591
592 self.labels_ = Some(labels);
593 self.n_connected_components_ = clusters.len();
594 }
595
596 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
597 self.fit(x);
598 let labels = self.labels_.as_ref().unwrap();
599 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
600 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
601 }
602}
603
#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-separated 2-D blobs: KMeans with k=2 should recover them
    /// with near-zero inertia.
    #[test]
    fn test_kmeans() {
        let data = [
            0.0, 0.0, 0.1, 0.1, 0.2, 0.0, // blob near the origin
            5.0, 5.0, 5.1, 5.1, 5.0, 5.2, // blob near (5, 5)
        ];
        let x = Tensor::from_slice(&data, &[6, 2]).unwrap();

        let mut model = KMeans::new(2).n_init(3);
        let labels = model.fit_predict(&x);

        assert_eq!(labels.dims(), &[6]);
        assert!(model.inertia_.unwrap() < 1.0);
    }

    /// DBSCAN on two tight groups: output shape must match the sample count.
    #[test]
    fn test_dbscan() {
        let data = [
            0.0, 0.0, 0.1, 0.1, 0.2, 0.0, // dense group A
            5.0, 5.0, 5.1, 5.1, // dense group B
        ];
        let x = Tensor::from_slice(&data, &[5, 2]).unwrap();

        let mut model = DBSCAN::new(0.5, 2);
        let labels = model.fit_predict(&x);

        assert_eq!(labels.dims(), &[5]);
    }

    /// Agglomerative clustering down to 2 clusters on two point pairs.
    #[test]
    fn test_agglomerative() {
        let data = [
            0.0, 0.0, 0.1, 0.1, // pair near the origin
            5.0, 5.0, 5.1, 5.1, // pair near (5, 5)
        ];
        let x = Tensor::from_slice(&data, &[4, 2]).unwrap();

        let mut model = AgglomerativeClustering::new(2);
        let labels = model.fit_predict(&x);

        assert_eq!(labels.dims(), &[4]);
    }
}