use std::collections::HashSet;

use ghostflow_core::Tensor;
use rand::prelude::*;
use rayon::prelude::*;
6
7pub struct KMeans {
9 pub n_clusters: usize,
10 pub max_iter: usize,
11 pub tol: f32,
12 pub n_init: usize,
13 pub init: KMeansInit,
14 pub cluster_centers_: Option<Vec<Vec<f32>>>,
15 pub labels_: Option<Vec<usize>>,
16 pub inertia_: Option<f32>,
17 pub n_iter_: usize,
18}
19
/// Strategy for choosing the initial cluster centers.
#[derive(Clone, Copy, Debug)]
pub enum KMeansInit {
    /// Pick k distinct samples uniformly at random.
    Random,
    /// k-means++ seeding: sample centers far apart, weighted by squared
    /// distance to the nearest already-chosen center.
    KMeansPlusPlus,
}
25
26impl KMeans {
27 pub fn new(n_clusters: usize) -> Self {
28 KMeans {
29 n_clusters,
30 max_iter: 300,
31 tol: 1e-4,
32 n_init: 10,
33 init: KMeansInit::KMeansPlusPlus,
34 cluster_centers_: None,
35 labels_: None,
36 inertia_: None,
37 n_iter_: 0,
38 }
39 }
40
41 pub fn max_iter(mut self, n: usize) -> Self {
42 self.max_iter = n;
43 self
44 }
45
46 pub fn n_init(mut self, n: usize) -> Self {
47 self.n_init = n;
48 self
49 }
50
51 pub fn init(mut self, init: KMeansInit) -> Self {
52 self.init = init;
53 self
54 }
55
56 fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
57 a.iter()
58 .zip(b.iter())
59 .map(|(&ai, &bi)| (ai - bi).powi(2))
60 .sum::<f32>()
61 .sqrt()
62 }
63
64 fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
65 a.iter()
66 .zip(b.iter())
67 .map(|(&ai, &bi)| (ai - bi).powi(2))
68 .sum::<f32>()
69 }
70
71 fn init_random(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
72 let mut rng = thread_rng();
73 let indices: Vec<usize> = (0..n_samples)
74 .choose_multiple(&mut rng, self.n_clusters)
75 .into_iter()
76 .collect();
77
78 indices.iter()
79 .map(|&i| x[i * n_features..(i + 1) * n_features].to_vec())
80 .collect()
81 }
82
83 fn init_kmeans_plusplus(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
84 let mut rng = thread_rng();
85 let mut centers = Vec::with_capacity(self.n_clusters);
86
87 let first_idx = rng.gen_range(0..n_samples);
89 centers.push(x[first_idx * n_features..(first_idx + 1) * n_features].to_vec());
90
91 for _ in 1..self.n_clusters {
93 let distances: Vec<f32> = (0..n_samples)
95 .map(|i| {
96 let point = &x[i * n_features..(i + 1) * n_features];
97 centers.iter()
98 .map(|c| Self::euclidean_distance_squared(point, c))
99 .fold(f32::INFINITY, f32::min)
100 })
101 .collect();
102
103 let total: f32 = distances.iter().sum();
105 let threshold = rng.gen::<f32>() * total;
106
107 let mut cumsum = 0.0f32;
108 let mut chosen_idx = 0;
109 for (i, &d) in distances.iter().enumerate() {
110 cumsum += d;
111 if cumsum >= threshold {
112 chosen_idx = i;
113 break;
114 }
115 }
116
117 centers.push(x[chosen_idx * n_features..(chosen_idx + 1) * n_features].to_vec());
118 }
119
120 centers
121 }
122
123 fn assign_labels(&self, x: &[f32], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> Vec<usize> {
124 (0..n_samples)
125 .into_par_iter()
126 .map(|i| {
127 let point = &x[i * n_features..(i + 1) * n_features];
128 centers.iter()
129 .enumerate()
130 .map(|(c, center)| (c, Self::euclidean_distance_squared(point, center)))
131 .min_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap())
132 .map(|(c, _)| c)
133 .unwrap_or(0)
134 })
135 .collect()
136 }
137
138 fn update_centers(&self, x: &[f32], labels: &[usize], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
139 let mut new_centers = vec![vec![0.0f32; n_features]; self.n_clusters];
140 let mut counts = vec![0usize; self.n_clusters];
141
142 for i in 0..n_samples {
143 let label = labels[i];
144 counts[label] += 1;
145 for j in 0..n_features {
146 new_centers[label][j] += x[i * n_features + j];
147 }
148 }
149
150 for c in 0..self.n_clusters {
151 if counts[c] > 0 {
152 for j in 0..n_features {
153 new_centers[c][j] /= counts[c] as f32;
154 }
155 }
156 }
157
158 new_centers
159 }
160
161 fn compute_inertia(&self, x: &[f32], labels: &[usize], centers: &[Vec<f32>], n_samples: usize, n_features: usize) -> f32 {
162 (0..n_samples)
163 .map(|i| {
164 let point = &x[i * n_features..(i + 1) * n_features];
165 Self::euclidean_distance_squared(point, ¢ers[labels[i]])
166 })
167 .sum()
168 }
169
170 fn single_run(&self, x: &[f32], n_samples: usize, n_features: usize) -> (Vec<Vec<f32>>, Vec<usize>, f32, usize) {
171 let mut centers = match self.init {
173 KMeansInit::Random => self.init_random(x, n_samples, n_features),
174 KMeansInit::KMeansPlusPlus => self.init_kmeans_plusplus(x, n_samples, n_features),
175 };
176
177 let mut labels = self.assign_labels(x, ¢ers, n_samples, n_features);
178 let mut n_iter = 0;
179
180 for iter in 0..self.max_iter {
181 n_iter = iter + 1;
182
183 let new_centers = self.update_centers(x, &labels, n_samples, n_features);
185
186 let center_shift: f32 = centers.iter()
188 .zip(new_centers.iter())
189 .map(|(old, new)| Self::euclidean_distance(old, new))
190 .sum();
191
192 centers = new_centers;
193
194 if center_shift < self.tol {
195 break;
196 }
197
198 labels = self.assign_labels(x, ¢ers, n_samples, n_features);
200 }
201
202 let inertia = self.compute_inertia(x, &labels, ¢ers, n_samples, n_features);
203
204 (centers, labels, inertia, n_iter)
205 }
206
207 pub fn fit(&mut self, x: &Tensor) {
208 let x_data = x.data_f32();
209 let n_samples = x.dims()[0];
210 let n_features = x.dims()[1];
211
212 let mut best_centers = None;
213 let mut best_labels = None;
214 let mut best_inertia = f32::INFINITY;
215 let mut best_n_iter = 0;
216
217 for _ in 0..self.n_init {
218 let (centers, labels, inertia, n_iter) = self.single_run(&x_data, n_samples, n_features);
219
220 if inertia < best_inertia {
221 best_inertia = inertia;
222 best_centers = Some(centers);
223 best_labels = Some(labels);
224 best_n_iter = n_iter;
225 }
226 }
227
228 self.cluster_centers_ = best_centers;
229 self.labels_ = best_labels;
230 self.inertia_ = Some(best_inertia);
231 self.n_iter_ = best_n_iter;
232 }
233
234 pub fn predict(&self, x: &Tensor) -> Tensor {
235 let x_data = x.data_f32();
236 let n_samples = x.dims()[0];
237 let n_features = x.dims()[1];
238
239 let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
240 let labels = self.assign_labels(&x_data, centers, n_samples, n_features);
241
242 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
243 Tensor::from_slice(&labels_f32, &[n_samples]).unwrap()
244 }
245
246 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
247 self.fit(x);
248 let labels = self.labels_.as_ref().unwrap();
249 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
250 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
251 }
252
253 pub fn transform(&self, x: &Tensor) -> Tensor {
254 let x_data = x.data_f32();
255 let n_samples = x.dims()[0];
256 let n_features = x.dims()[1];
257
258 let centers = self.cluster_centers_.as_ref().expect("Model not fitted");
259
260 let mut distances = Vec::with_capacity(n_samples * self.n_clusters);
261 for i in 0..n_samples {
262 let point = &x_data[i * n_features..(i + 1) * n_features];
263 for center in centers {
264 distances.push(Self::euclidean_distance(point, center));
265 }
266 }
267
268 Tensor::from_slice(&distances, &[n_samples, self.n_clusters]).unwrap()
269 }
270}
271
272
273pub struct DBSCAN {
275 pub eps: f32,
276 pub min_samples: usize,
277 pub metric: DistanceMetric,
278 pub labels_: Option<Vec<i32>>,
279 pub core_sample_indices_: Option<Vec<usize>>,
280}
281
/// Pairwise distance functions available to the clustering models.
#[derive(Clone, Copy, Debug)]
pub enum DistanceMetric {
    /// L2 distance.
    Euclidean,
    /// L1 (city-block) distance.
    Manhattan,
    /// 1 - cosine similarity.
    Cosine,
}
288
/// Borrowed working state threaded through DBSCAN's cluster expansion,
/// bundling the input data with the mutable label/visited buffers.
struct ClusterParams<'a> {
    /// Flattened row-major input data (n_samples * n_features).
    x: &'a [f32],
    /// Cluster id per sample; -1 means noise/unassigned.
    labels: &'a mut [i32],
    /// Samples whose neighborhood has already been queried.
    visited: &'a mut [bool],
    /// Number of rows in `x`.
    n_samples: usize,
    /// Number of columns per row in `x`.
    n_features: usize,
}
297
298impl DBSCAN {
299 pub fn new(eps: f32, min_samples: usize) -> Self {
300 DBSCAN {
301 eps,
302 min_samples,
303 metric: DistanceMetric::Euclidean,
304 labels_: None,
305 core_sample_indices_: None,
306 }
307 }
308
309 pub fn metric(mut self, metric: DistanceMetric) -> Self {
310 self.metric = metric;
311 self
312 }
313
314 fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
315 match self.metric {
316 DistanceMetric::Euclidean => {
317 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
318 }
319 DistanceMetric::Manhattan => {
320 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
321 }
322 DistanceMetric::Cosine => {
323 let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
324 let norm_a: f32 = a.iter().map(|&x| x * x).sum::<f32>().sqrt();
325 let norm_b: f32 = b.iter().map(|&x| x * x).sum::<f32>().sqrt();
326 1.0 - dot / (norm_a * norm_b + 1e-10)
327 }
328 }
329 }
330
331 fn region_query(&self, x: &[f32], point_idx: usize, n_samples: usize, n_features: usize) -> Vec<usize> {
332 let point = &x[point_idx * n_features..(point_idx + 1) * n_features];
333
334 (0..n_samples)
335 .filter(|&i| {
336 let other = &x[i * n_features..(i + 1) * n_features];
337 self.distance(point, other) <= self.eps
338 })
339 .collect()
340 }
341
342 fn expand_cluster(
343 &self,
344 cluster_params: &mut ClusterParams,
345 point_idx: usize,
346 neighbors: Vec<usize>,
347 cluster_id: i32,
348 ) {
349 cluster_params.labels[point_idx] = cluster_id;
350
351 let mut seeds = neighbors;
352 let mut i = 0;
353
354 while i < seeds.len() {
355 let q = seeds[i];
356
357 if !cluster_params.visited[q] {
358 cluster_params.visited[q] = true;
359 let q_neighbors = self.region_query(
360 cluster_params.x,
361 q,
362 cluster_params.n_samples,
363 cluster_params.n_features
364 );
365
366 if q_neighbors.len() >= self.min_samples {
367 for &neighbor in &q_neighbors {
368 if !seeds.contains(&neighbor) {
369 seeds.push(neighbor);
370 }
371 }
372 }
373 }
374
375 if cluster_params.labels[q] == -1 {
376 cluster_params.labels[q] = cluster_id;
377 }
378
379 i += 1;
380 }
381 }
382
383 pub fn fit(&mut self, x: &Tensor) {
384 let x_data = x.data_f32();
385 let n_samples = x.dims()[0];
386 let n_features = x.dims()[1];
387
388 let mut labels = vec![-1i32; n_samples];
389 let mut visited = vec![false; n_samples];
390 let mut core_samples = Vec::new();
391 let mut cluster_id = 0i32;
392
393 for i in 0..n_samples {
394 if visited[i] {
395 continue;
396 }
397
398 visited[i] = true;
399 let neighbors = self.region_query(&x_data, i, n_samples, n_features);
400
401 if neighbors.len() >= self.min_samples {
402 core_samples.push(i);
403 let mut params = ClusterParams {
404 x: &x_data,
405 labels: &mut labels,
406 visited: &mut visited,
407 n_samples,
408 n_features,
409 };
410 self.expand_cluster(
411 &mut params,
412 i,
413 neighbors,
414 cluster_id,
415 );
416 cluster_id += 1;
417 }
418 }
419
420 self.labels_ = Some(labels);
421 self.core_sample_indices_ = Some(core_samples);
422 }
423
424 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
425 self.fit(x);
426 let labels = self.labels_.as_ref().unwrap();
427 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
428 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
429 }
430}
431
432pub struct AgglomerativeClustering {
434 pub n_clusters: usize,
435 pub linkage: Linkage,
436 pub metric: DistanceMetric,
437 pub labels_: Option<Vec<usize>>,
438 pub n_leaves_: usize,
439 pub n_connected_components_: usize,
440}
441
/// Inter-cluster distance definitions for agglomerative clustering.
#[derive(Clone, Copy, Debug)]
pub enum Linkage {
    /// Minimum pairwise member distance.
    Single,
    /// Maximum pairwise member distance.
    Complete,
    /// Mean pairwise member distance.
    Average,
    /// Ward's criterion: squared centroid distance weighted by cluster sizes.
    Ward,
}
449
450impl AgglomerativeClustering {
451 pub fn new(n_clusters: usize) -> Self {
452 AgglomerativeClustering {
453 n_clusters,
454 linkage: Linkage::Ward,
455 metric: DistanceMetric::Euclidean,
456 labels_: None,
457 n_leaves_: 0,
458 n_connected_components_: 1,
459 }
460 }
461
462 pub fn linkage(mut self, linkage: Linkage) -> Self {
463 self.linkage = linkage;
464 self
465 }
466
467 fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
468 a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
469 }
470
471 fn compute_distance_matrix(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<Vec<f32>> {
472 let mut dist_matrix = vec![vec![0.0f32; n_samples]; n_samples];
473
474 for i in 0..n_samples {
475 for j in (i + 1)..n_samples {
476 let point_i = &x[i * n_features..(i + 1) * n_features];
477 let point_j = &x[j * n_features..(j + 1) * n_features];
478 let dist = Self::euclidean_distance(point_i, point_j);
479 dist_matrix[i][j] = dist;
480 dist_matrix[j][i] = dist;
481 }
482 }
483
484 dist_matrix
485 }
486
487 fn cluster_distance(
488 &self,
489 cluster_a: &[usize],
490 cluster_b: &[usize],
491 dist_matrix: &[Vec<f32>],
492 x: &[f32],
493 n_features: usize,
494 ) -> f32 {
495 match self.linkage {
496 Linkage::Single => {
497 let mut min_dist = f32::INFINITY;
498 for &i in cluster_a {
499 for &j in cluster_b {
500 min_dist = min_dist.min(dist_matrix[i][j]);
501 }
502 }
503 min_dist
504 }
505 Linkage::Complete => {
506 let mut max_dist = 0.0f32;
507 for &i in cluster_a {
508 for &j in cluster_b {
509 max_dist = max_dist.max(dist_matrix[i][j]);
510 }
511 }
512 max_dist
513 }
514 Linkage::Average => {
515 let mut sum = 0.0f32;
516 let count = cluster_a.len() * cluster_b.len();
517 for &i in cluster_a {
518 for &j in cluster_b {
519 sum += dist_matrix[i][j];
520 }
521 }
522 sum / count as f32
523 }
524 Linkage::Ward => {
525 let n_a = cluster_a.len() as f32;
527 let n_b = cluster_b.len() as f32;
528
529 let mut centroid_a = vec![0.0f32; n_features];
531 let mut centroid_b = vec![0.0f32; n_features];
532
533 for &i in cluster_a {
534 for j in 0..n_features {
535 centroid_a[j] += x[i * n_features + j];
536 }
537 }
538 for &i in cluster_b {
539 for j in 0..n_features {
540 centroid_b[j] += x[i * n_features + j];
541 }
542 }
543
544 for j in 0..n_features {
545 centroid_a[j] /= n_a;
546 centroid_b[j] /= n_b;
547 }
548
549 let dist = Self::euclidean_distance(¢roid_a, ¢roid_b);
550 (n_a * n_b / (n_a + n_b)) * dist * dist
551 }
552 }
553 }
554
555 pub fn fit(&mut self, x: &Tensor) {
556 let x_data = x.data_f32();
557 let n_samples = x.dims()[0];
558 let n_features = x.dims()[1];
559
560 self.n_leaves_ = n_samples;
561
562 let dist_matrix = self.compute_distance_matrix(&x_data, n_samples, n_features);
564
565 let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
567
568 while clusters.len() > self.n_clusters {
570 let mut min_dist = f32::INFINITY;
572 let mut merge_i = 0;
573 let mut merge_j = 1;
574
575 for i in 0..clusters.len() {
576 for j in (i + 1)..clusters.len() {
577 let dist = self.cluster_distance(
578 &clusters[i],
579 &clusters[j],
580 &dist_matrix,
581 &x_data,
582 n_features,
583 );
584 if dist < min_dist {
585 min_dist = dist;
586 merge_i = i;
587 merge_j = j;
588 }
589 }
590 }
591
592 let cluster_j = clusters.remove(merge_j);
594 clusters[merge_i].extend(cluster_j);
595 }
596
597 let mut labels = vec![0usize; n_samples];
599 for (cluster_id, cluster) in clusters.iter().enumerate() {
600 for &point_idx in cluster {
601 labels[point_idx] = cluster_id;
602 }
603 }
604
605 self.labels_ = Some(labels);
606 self.n_connected_components_ = clusters.len();
607 }
608
609 pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
610 self.fit(x);
611 let labels = self.labels_.as_ref().unwrap();
612 let labels_f32: Vec<f32> = labels.iter().map(|&l| l as f32).collect();
613 Tensor::from_slice(&labels_f32, &[labels.len()]).unwrap()
614 }
615}
616
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_kmeans() {
        // Two well-separated blobs around (0, 0) and (5, 5).
        let x = Tensor::from_slice(&[0.0f32, 0.0,
                                     0.1, 0.1,
                                     0.2, 0.0,
                                     5.0, 5.0,
                                     5.1, 5.1,
                                     5.0, 5.2,
        ], &[6, 2]).unwrap();

        let mut kmeans = KMeans::new(2).n_init(3);
        let labels = kmeans.fit_predict(&x);

        assert_eq!(labels.dims(), &[6]);
        // Tight blobs: total within-cluster scatter must be small.
        assert!(kmeans.inertia_.unwrap() < 1.0);
    }

    #[test]
    fn test_dbscan() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
                                     0.1, 0.1,
                                     0.2, 0.0,
                                     5.0, 5.0,
                                     5.1, 5.1,
        ], &[5, 2]).unwrap();

        let mut dbscan = DBSCAN::new(0.5, 2);
        let labels = dbscan.fit_predict(&x);

        assert_eq!(labels.dims(), &[5]);
    }

    #[test]
    fn test_agglomerative() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
                                     0.1, 0.1,
                                     5.0, 5.0,
                                     5.1, 5.1,
        ], &[4, 2]).unwrap();

        let mut agg = AgglomerativeClustering::new(2);
        let labels = agg.fit_predict(&x);

        assert_eq!(labels.dims(), &[4]);
    }
}
667
668