use crate::error::{SpatialError, SpatialResult};
use crate::generic_traits::{DistanceMetric, Point, SpatialPoint, SpatialScalar};
use scirs2_core::numeric::{Float, NumCast};
use scirs2_core::parallel_ops::*;
use scirs2_core::simd_ops::PlatformCapabilities;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::marker::PhantomData;
use std::sync::Arc;
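/// A KD-tree over any point type implementing [`SpatialPoint`], supporting
/// k-nearest-neighbor and radius queries with a pluggable [`DistanceMetric`].
///
/// A minimal usage sketch (mirroring the tests at the bottom of this file;
/// assumes the `Point` and `EuclideanMetric` helpers from `generic_traits`):
///
/// ```ignore
/// use crate::generic_traits::EuclideanMetric;
///
/// let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 1.0)];
/// let tree = GenericKDTree::new(&points)?;
/// let neighbors = tree.k_nearest_neighbors(&Point::new_2d(0.1, 0.1), 1, &EuclideanMetric)?;
/// assert_eq!(neighbors[0].0, 0); // (index, distance) of the nearest point
/// ```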
#[derive(Debug, Clone)]
pub struct GenericKDTree<T: SpatialScalar, P: SpatialPoint<T>> {
    root: Option<Box<KDNode<T, P>>>,
    points: Vec<P>,
    dimension: usize,
    #[allow(dead_code)]
    leaf_size: usize,
}

#[derive(Debug, Clone)]
struct KDNode<T: SpatialScalar, P: SpatialPoint<T>> {
    point_index: usize,
    splitting_dimension: usize,
    left: Option<Box<KDNode<T, P>>>,
    right: Option<Box<KDNode<T, P>>>,
    _phantom: PhantomData<(T, P)>,
}

impl<T: SpatialScalar, P: SpatialPoint<T> + Clone> GenericKDTree<T, P> {
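    /// Builds a KD-tree from a slice of points.
    ///
    /// An empty slice yields an empty tree. Construction rejects inputs with
    /// more than 1,000,000 points, zero-dimensional points, dimensions above
    /// 50 (where KD-trees degrade toward linear scans), mixed dimensions, and
    /// non-finite coordinates.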
    pub fn new(points: &[P]) -> SpatialResult<Self> {
        if points.is_empty() {
            return Ok(Self {
                root: None,
                points: Vec::new(),
                dimension: 0,
                leaf_size: 32,
            });
        }

        if points.len() > 1_000_000 {
            return Err(SpatialError::ValueError(format!(
                "Point collection too large: {} points. Maximum supported: 1,000,000",
                points.len()
            )));
        }

        let dimension = points[0].dimension();
        if dimension == 0 {
            return Err(SpatialError::ValueError(
                "Points must have at least one dimension".to_string(),
            ));
        }

        if dimension > 50 {
            return Err(SpatialError::ValueError(format!(
                "Dimension too high: {dimension}. KD-Tree is not efficient for dimensions > 50"
            )));
        }

        for (i, point) in points.iter().enumerate() {
            if point.dimension() != dimension {
                return Err(SpatialError::ValueError(format!(
                    "Point {} has dimension {} but expected {}",
                    i,
                    point.dimension(),
                    dimension
                )));
            }

            for d in 0..dimension {
                if let Some(coord) = point.coordinate(d) {
                    if !Float::is_finite(coord) {
                        return Err(SpatialError::ValueError(format!(
                            "Point {} has invalid coordinate {} at dimension {}",
                            i,
                            NumCast::from(coord).unwrap_or(f64::NAN),
                            d
                        )));
                    }
                }
            }
        }

        let points = points.to_vec();
        let mut indices: Vec<usize> = (0..points.len()).collect();

        let leaf_size = 32;
        let root = Self::build_tree(&points, &mut indices, 0, dimension, leaf_size);

        Ok(Self {
            root,
            points,
            dimension,
            leaf_size,
        })
    }

    fn build_tree(
        points: &[P],
        indices: &mut [usize],
        depth: usize,
        dimension: usize,
        _leaf_size: usize,
    ) -> Option<Box<KDNode<T, P>>> {
        if indices.is_empty() {
            return None;
        }

        // Each node stores exactly one point, so only single-point subsets can
        // be treated as leaves; stopping earlier (e.g. at `leaf_size` points)
        // would silently drop the remaining points from the tree.
        if indices.len() == 1 {
            return Some(Box::new(KDNode {
                point_index: indices[0],
                splitting_dimension: depth % dimension,
                left: None,
                right: None,
                _phantom: PhantomData,
            }));
        }

        let splitting_dimension = depth % dimension;

        indices.sort_by(|&a, &b| {
            let coord_a = points[a]
                .coordinate(splitting_dimension)
                .unwrap_or(T::zero());
            let coord_b = points[b]
                .coordinate(splitting_dimension)
                .unwrap_or(T::zero());
            coord_a.partial_cmp(&coord_b).unwrap_or(Ordering::Equal)
        });

        let median = indices.len() / 2;
        let point_index = indices[median];

        let (left_indices, right_indices) = indices.split_at_mut(median);
        let right_indices = &mut right_indices[1..];

        let left = Self::build_tree(points, left_indices, depth + 1, dimension, _leaf_size);
        let right = Self::build_tree(points, right_indices, depth + 1, dimension, _leaf_size);

        Some(Box::new(KDNode {
            point_index,
            splitting_dimension,
            left,
            right,
            _phantom: PhantomData,
        }))
    }
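    /// Returns the `k` nearest neighbors of `query` as `(point_index, distance)`
    /// pairs, sorted nearest-first.
    ///
    /// A minimal sketch (assuming `EuclideanMetric` from `generic_traits`):
    ///
    /// ```ignore
    /// let neighbors = tree.k_nearest_neighbors(&query, 3, &EuclideanMetric)?;
    /// for (index, distance) in neighbors {
    ///     println!("point {index} at distance {distance}");
    /// }
    /// ```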
    pub fn k_nearest_neighbors(
        &self,
        query: &P,
        k: usize,
        metric: &dyn DistanceMetric<T, P>,
    ) -> SpatialResult<Vec<(usize, T)>> {
        if k == 0 {
            return Ok(Vec::new());
        }

        if k > self.points.len() {
            return Err(SpatialError::ValueError(format!(
                "k ({}) cannot be larger than the number of points ({})",
                k,
                self.points.len()
            )));
        }

        if k > 1000 {
            return Err(SpatialError::ValueError(format!(
                "k ({k}) is too large. Consider using radius search for k > 1000"
            )));
        }

        if query.dimension() != self.dimension {
            return Err(SpatialError::ValueError(format!(
                "Query point dimension ({}) must match tree dimension ({})",
                query.dimension(),
                self.dimension
            )));
        }

        for d in 0..query.dimension() {
            if let Some(coord) = query.coordinate(d) {
                if !Float::is_finite(coord) {
                    return Err(SpatialError::ValueError(format!(
                        "Query point has invalid coordinate {} at dimension {}",
                        NumCast::from(coord).unwrap_or(f64::NAN),
                        d
                    )));
                }
            }
        }

        if self.points.is_empty() {
            return Ok(Vec::new());
        }

        // Max-heap ordered by distance: the root is the current k-th nearest
        // candidate, so it is the one evicted when a closer point is found.
        let mut heap = BinaryHeap::new();

        if let Some(ref root) = self.root {
            self.search_knn(root, query, k, &mut heap, metric);
        }

        // `into_sorted_vec` yields ascending distances, which is already the
        // conventional nearest-first order.
        let result: Vec<(usize, T)> = heap
            .into_sorted_vec()
            .into_iter()
            .map(|item| (item.index, item.distance))
            .collect();

        Ok(result)
    }

    fn search_knn(
        &self,
        node: &KDNode<T, P>,
        query: &P,
        k: usize,
        heap: &mut BinaryHeap<KNNItem<T>>,
        metric: &dyn DistanceMetric<T, P>,
    ) {
        let point = &self.points[node.point_index];
        let distance = metric.distance(query, point);

        if heap.len() < k {
            heap.push(KNNItem {
                distance,
                index: node.point_index,
            });
        } else if let Some(top) = heap.peek() {
            if distance < top.distance {
                heap.pop();
                heap.push(KNNItem {
                    distance,
                    index: node.point_index,
                });
            }
        }

        let query_coord = query
            .coordinate(node.splitting_dimension)
            .unwrap_or(T::zero());
        let point_coord = point
            .coordinate(node.splitting_dimension)
            .unwrap_or(T::zero());

        let (first_child, second_child) = if query_coord < point_coord {
            (&node.left, &node.right)
        } else {
            (&node.right, &node.left)
        };

        if let Some(ref child) = first_child {
            self.search_knn(child, query, k, heap, metric);
        }

        // Only descend into the far side if the splitting plane is closer than
        // the current k-th best distance (or the heap is not yet full).
        let dimension_distance = (query_coord - point_coord).abs();
        let should_search_other = heap.len() < k
            || heap
                .peek()
                .is_none_or(|top| dimension_distance < top.distance);

        if should_search_other {
            if let Some(ref child) = second_child {
                self.search_knn(child, query, k, heap, metric);
            }
        }
    }
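    /// Returns all points within `radius` of `query` as `(point_index, distance)`
    /// pairs, in no particular order.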
    pub fn radius_search(
        &self,
        query: &P,
        radius: T,
        metric: &dyn DistanceMetric<T, P>,
    ) -> SpatialResult<Vec<(usize, T)>> {
        if query.dimension() != self.dimension {
            return Err(SpatialError::ValueError(
                "Query point dimension must match tree dimension".to_string(),
            ));
        }

        let mut result = Vec::new();

        if let Some(ref root) = self.root {
            self.search_radius(root, query, radius, &mut result, metric);
        }

        Ok(result)
    }

    fn search_radius(
        &self,
        node: &KDNode<T, P>,
        query: &P,
        radius: T,
        result: &mut Vec<(usize, T)>,
        metric: &dyn DistanceMetric<T, P>,
    ) {
        let point = &self.points[node.point_index];
        let distance = metric.distance(query, point);

        if distance <= radius {
            result.push((node.point_index, distance));
        }

        let query_coord = query
            .coordinate(node.splitting_dimension)
            .unwrap_or(T::zero());
        let point_coord = point
            .coordinate(node.splitting_dimension)
            .unwrap_or(T::zero());

        // Descend into a child only if the query ball can overlap its half-space.
        if let Some(ref left) = node.left {
            if query_coord - radius <= point_coord {
                self.search_radius(left, query, radius, result, metric);
            }
        }

        if let Some(ref right) = node.right {
            if query_coord + radius >= point_coord {
                self.search_radius(right, query, radius, result, metric);
            }
        }
    }
}

/// Heap entry for k-NN search; ordered by distance so that `BinaryHeap`
/// (a max-heap) keeps the worst current candidate on top.
#[derive(Debug, Clone)]
struct KNNItem<T: SpatialScalar> {
    distance: T,
    index: usize,
}

impl<T: SpatialScalar> PartialEq for KNNItem<T> {
    fn eq(&self, other: &Self) -> bool {
        self.distance == other.distance
    }
}

impl<T: SpatialScalar> Eq for KNNItem<T> {}

impl<T: SpatialScalar> PartialOrd for KNNItem<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl<T: SpatialScalar> Ord for KNNItem<T> {
    fn cmp(&self, other: &Self) -> Ordering {
        self.distance
            .partial_cmp(&other.distance)
            .unwrap_or(Ordering::Equal)
    }
}
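/// Pairwise distance matrices with unrolled, SIMD-leaning, and parallel paths.
///
/// A minimal sketch (assuming `EuclideanMetric` from `generic_traits`):
///
/// ```ignore
/// let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 0.0)];
/// let matrix = GenericDistanceMatrix::compute(&points, &EuclideanMetric)?;
/// assert_eq!(matrix[0][1], 1.0);
/// ```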
pub struct GenericDistanceMatrix;

impl GenericDistanceMatrix {
    /// Computes the full `n x n` distance matrix, dispatching to the
    /// SIMD-leaning path for larger inputs.
    pub fn compute<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();

        if n > 100 {
            Self::compute_simd_optimized(points, metric)
        } else {
            Self::compute_basic(points, metric)
        }
    }
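    /// Computes the distance matrix in a flat row-major buffer of length
    /// `n * n`: the distance between points `i` and `j` lives at `i * n + j`
    /// (and symmetrically at `j * n + i`).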
    pub fn compute_flat<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<T>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();
        let mut matrix = vec![T::zero(); n * n];

        for i in 0..n {
            matrix[i * n + i] = T::zero();

            // Process the upper triangle four columns at a time to expose
            // instruction-level parallelism, then finish the remainder.
            let remaining = n - i - 1;
            let j_chunks = remaining / 4;

            for chunk in 0..j_chunks {
                let j_base = i + 1 + chunk * 4;

                let j0 = j_base;
                let j1 = j_base + 1;
                let j2 = j_base + 2;
                let j3 = j_base + 3;

                let distance0 = metric.distance(&points[i], &points[j0]);
                let distance1 = metric.distance(&points[i], &points[j1]);
                let distance2 = metric.distance(&points[i], &points[j2]);
                let distance3 = metric.distance(&points[i], &points[j3]);

                matrix[i * n + j0] = distance0;
                matrix[j0 * n + i] = distance0;
                matrix[i * n + j1] = distance1;
                matrix[j1 * n + i] = distance1;
                matrix[i * n + j2] = distance2;
                matrix[j2 * n + i] = distance2;
                matrix[i * n + j3] = distance3;
                matrix[j3 * n + i] = distance3;
            }

            for j in (i + 1 + j_chunks * 4)..n {
                let distance = metric.distance(&points[i], &points[j]);
                matrix[i * n + j] = distance;
                matrix[j * n + i] = distance;
            }
        }

        Ok(matrix)
    }

    fn compute_basic<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar,
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        let n = points.len();
        let mut matrix = vec![vec![T::zero(); n]; n];

        for i in 0..n {
            matrix[i][i] = T::zero();

            let remaining = n - i - 1;
            let j_chunks = remaining / 4;

            for chunk in 0..j_chunks {
                let j_base = i + 1 + chunk * 4;

                let j0 = j_base;
                let j1 = j_base + 1;
                let j2 = j_base + 2;
                let j3 = j_base + 3;

                let distance0 = metric.distance(&points[i], &points[j0]);
                let distance1 = metric.distance(&points[i], &points[j1]);
                let distance2 = metric.distance(&points[i], &points[j2]);
                let distance3 = metric.distance(&points[i], &points[j3]);

                matrix[i][j0] = distance0;
                matrix[j0][i] = distance0;
                matrix[i][j1] = distance1;
                matrix[j1][i] = distance1;
                matrix[i][j2] = distance2;
                matrix[j2][i] = distance2;
                matrix[i][j3] = distance3;
                matrix[j3][i] = distance3;
            }

            for j in (i + 1 + j_chunks * 4)..n {
                let distance = metric.distance(&points[i], &points[j]);
                matrix[i][j] = distance;
                matrix[j][i] = distance;
            }
        }

        Ok(matrix)
    }
    fn compute_simd_optimized<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();
        let mut matrix = vec![vec![T::zero(); n]; n];
        let caps = PlatformCapabilities::detect();

        const SIMD_CHUNK_SIZE: usize = 4;

        if !caps.simd_available {
            return Self::compute_basic(points, metric);
        }

        for i in 0..n {
            let point_i = &points[i];

            let mut j = i + 1;
            while j < n {
                let chunk_end = (j + SIMD_CHUNK_SIZE).min(n);

                // `get_dimension` returns Some only for 1..=4 dimensions,
                // which the specialized chunk kernel can handle.
                if let Some(dimension) = Self::get_dimension(point_i) {
                    Self::compute_simd_chunk(
                        &mut matrix,
                        i,
                        j,
                        chunk_end,
                        points,
                        metric,
                        dimension,
                    );
                } else {
                    for k in j..chunk_end {
                        let distance = metric.distance(point_i, &points[k]);
                        matrix[i][k] = distance;
                        matrix[k][i] = distance;
                    }
                }

                j = chunk_end;
            }
        }

        Ok(matrix)
    }

    /// Returns `Some(dim)` only for low dimensions (1..=4) that the
    /// specialized distance kernels support.
    fn get_dimension<T, P>(point: &P) -> Option<usize>
    where
        T: SpatialScalar,
        P: SpatialPoint<T>,
    {
        let dim = point.dimension();
        if dim > 0 && dim <= 4 {
            Some(dim)
        } else {
            None
        }
    }

    /// Unrolled distance kernel for low-dimensional points.
    ///
    /// NOTE: the specialized 2-D/3-D arms compute Euclidean distance directly;
    /// this fast path assumes `metric` agrees with the Euclidean metric for
    /// low-dimensional points and falls back to `metric` otherwise.
    fn compute_simd_chunk<T, P, M>(
        matrix: &mut [Vec<T>],
        i: usize,
        j_start: usize,
        j_end: usize,
        points: &[P],
        metric: &M,
        dimension: usize,
    ) where
        T: SpatialScalar,
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        let point_i = &points[i];

        match dimension {
            2 => {
                let xi = point_i.coordinate(0).unwrap_or(T::zero());
                let yi = point_i.coordinate(1).unwrap_or(T::zero());

                for k in j_start..j_end {
                    let point_k = &points[k];
                    let xk = point_k.coordinate(0).unwrap_or(T::zero());
                    let yk = point_k.coordinate(1).unwrap_or(T::zero());

                    let dx = xi - xk;
                    let dy = yi - yk;
                    let distance_sq = dx * dx + dy * dy;
                    let distance = distance_sq.sqrt();

                    matrix[i][k] = distance;
                    matrix[k][i] = distance;
                }
            }
            3 => {
                let xi = point_i.coordinate(0).unwrap_or(T::zero());
                let yi = point_i.coordinate(1).unwrap_or(T::zero());
                let zi = point_i.coordinate(2).unwrap_or(T::zero());

                for k in j_start..j_end {
                    let point_k = &points[k];
                    let xk = point_k.coordinate(0).unwrap_or(T::zero());
                    let yk = point_k.coordinate(1).unwrap_or(T::zero());
                    let zk = point_k.coordinate(2).unwrap_or(T::zero());

                    let dx = xi - xk;
                    let dy = yi - yk;
                    let dz = zi - zk;
                    let distance_sq = dx * dx + dy * dy + dz * dz;
                    let distance = distance_sq.sqrt();

                    matrix[i][k] = distance;
                    matrix[k][i] = distance;
                }
            }
            _ => {
                for k in j_start..j_end {
                    let distance = metric.distance(point_i, &points[k]);
                    matrix[i][k] = distance;
                    matrix[k][i] = distance;
                }
            }
        }
    }
    pub fn compute_parallel<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync + Clone,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();

        if n > 1000 {
            Self::compute_parallel_memory_efficient(points, metric)
        } else {
            Self::compute_parallel_basic(points, metric)
        }
    }

    fn compute_parallel_basic<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync + Clone,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();
        let mut matrix = vec![vec![T::zero(); n]; n];
        let metric = Arc::new(metric);
        let points = Arc::new(points);

        // Enumerate the upper triangle (including the diagonal) once, compute
        // the distances in parallel, then scatter them symmetrically.
        let indices: Vec<(usize, usize)> =
            (0..n).flat_map(|i| (i..n).map(move |j| (i, j))).collect();

        let distances: Vec<T> = indices
            .par_iter()
            .map(|&(i, j)| {
                if i == j {
                    T::zero()
                } else {
                    metric.distance(&points[i], &points[j])
                }
            })
            .collect();

        for (idx, &(i, j)) in indices.iter().enumerate() {
            matrix[i][j] = distances[idx];
            matrix[j][i] = distances[idx];
        }

        Ok(matrix)
    }
    fn compute_parallel_memory_efficient<T, P, M>(
        points: &[P],
        metric: &M,
    ) -> SpatialResult<Vec<Vec<T>>>
    where
        T: SpatialScalar + Send + Sync,
        P: SpatialPoint<T> + Send + Sync + Clone,
        M: DistanceMetric<T, P> + Send + Sync,
    {
        let n = points.len();

        // Compute each row in parallel and collect the results. Building rows
        // independently avoids the unsynchronized writes through a shared
        // matrix pointer that a scatter-in-place approach would require, and
        // every row is complete, so no symmetrization pass is needed.
        let row_indices: Vec<usize> = (0..n).collect();
        let matrix: Vec<Vec<T>> = row_indices
            .par_iter()
            .map(|&i| {
                let mut row = vec![T::zero(); n];
                if points[i].dimension() <= 4 {
                    Self::compute_row_distances_simd(&points[i], points, &mut row, metric);
                } else {
                    Self::compute_row_distances_scalar(&points[i], points, &mut row, metric);
                }
                row
            })
            .collect();

        Ok(matrix)
    }
    /// Fills `distances` with the distances from `point_i` to every point.
    ///
    /// NOTE: the unrolled 2-D/3-D arms compute Euclidean distance directly;
    /// they are a fast path that assumes `metric` agrees with the Euclidean
    /// metric for low-dimensional points.
    fn compute_row_distances_simd<T, P, M>(
        point_i: &P,
        points: &[P],
        distances: &mut [T],
        metric: &M,
    ) where
        T: SpatialScalar,
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        match point_i.dimension() {
            2 => {
                let xi = point_i.coordinate(0).unwrap_or(T::zero());
                let yi = point_i.coordinate(1).unwrap_or(T::zero());

                for (j, point_j) in points.iter().enumerate() {
                    let xj = point_j.coordinate(0).unwrap_or(T::zero());
                    let yj = point_j.coordinate(1).unwrap_or(T::zero());

                    let dx = xi - xj;
                    let dy = yi - yj;
                    distances[j] = (dx * dx + dy * dy).sqrt();
                }
            }
            3 => {
                let xi = point_i.coordinate(0).unwrap_or(T::zero());
                let yi = point_i.coordinate(1).unwrap_or(T::zero());
                let zi = point_i.coordinate(2).unwrap_or(T::zero());

                for (j, point_j) in points.iter().enumerate() {
                    let xj = point_j.coordinate(0).unwrap_or(T::zero());
                    let yj = point_j.coordinate(1).unwrap_or(T::zero());
                    let zj = point_j.coordinate(2).unwrap_or(T::zero());

                    let dx = xi - xj;
                    let dy = yi - yj;
                    let dz = zi - zj;
                    distances[j] = (dx * dx + dy * dy + dz * dz).sqrt();
                }
            }
            _ => {
                Self::compute_row_distances_scalar(point_i, points, distances, metric);
            }
        }
    }

    fn compute_row_distances_scalar<T, P, M>(
        point_i: &P,
        points: &[P],
        distances: &mut [T],
        metric: &M,
    ) where
        T: SpatialScalar,
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        for (j, point_j) in points.iter().enumerate() {
            distances[j] = metric.distance(point_i, point_j);
        }
    }
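    /// Computes the condensed distance vector: the upper triangle (`i < j`) in
    /// row-major order, `n * (n - 1) / 2` entries in total. The distance
    /// between points `i` and `j` (with `i < j`) lives at index
    /// `n * i - i * (i + 1) / 2 + (j - i - 1)`, matching the layout used by
    /// SciPy's `pdist`.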
    pub fn compute_condensed<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<T>>
    where
        T: SpatialScalar,
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        let n = points.len();
        // `saturating_sub` keeps the capacity computation from underflowing
        // when `n == 0`.
        let mut distances = Vec::with_capacity(n * n.saturating_sub(1) / 2);

        for i in 0..n {
            for j in (i + 1)..n {
                distances.push(metric.distance(&points[i], &points[j]));
            }
        }

        Ok(distances)
    }
}
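/// A generic k-means clusterer with farthest-point initialization.
///
/// A minimal sketch (mirroring the tests below):
///
/// ```ignore
/// let kmeans = GenericKMeans::new(2)
///     .with_max_iterations(100)
///     .with_tolerance(1e-4);
/// let result = kmeans.fit(&points)?;
/// println!("{} iterations, converged: {}", result.iterations, result.converged);
/// ```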
pub struct GenericKMeans<T: SpatialScalar, P: SpatialPoint<T>> {
    k: usize,
    max_iterations: usize,
    tolerance: T,
    parallel: bool,
    phantom: PhantomData<(T, P)>,
}

impl<T: SpatialScalar, P: SpatialPoint<T> + Clone> GenericKMeans<T, P> {
    pub fn new(k: usize) -> Self {
        Self {
            k,
            // Small defaults; raise them via the `with_*` builder methods
            // for higher-quality clustering.
            max_iterations: 5,
            tolerance: T::from_f64(1e-1).unwrap_or(<T as SpatialScalar>::epsilon()),
            parallel: false,
            phantom: PhantomData,
        }
    }
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    pub fn with_max_iterations(mut self, max_iterations: usize) -> Self {
        self.max_iterations = max_iterations;
        self
    }

    pub fn with_tolerance(mut self, tolerance: T) -> Self {
        self.tolerance = tolerance;
        self
    }
    pub fn fit(&self, points: &[P]) -> SpatialResult<KMeansResult<T, P>> {
        if points.is_empty() {
            return Err(SpatialError::ValueError(
                "Cannot cluster empty point set".to_string(),
            ));
        }

        if self.k == 0 {
            return Err(SpatialError::ValueError(
                "k must be greater than 0".to_string(),
            ));
        }

        if self.k > points.len() {
            return Err(SpatialError::ValueError(format!(
                "k ({}) cannot be larger than the number of points ({})",
                self.k,
                points.len()
            )));
        }

        if self.k > 10000 {
            return Err(SpatialError::ValueError(format!(
                "k ({}) is too large. Consider using hierarchical clustering for k > 10000",
                self.k
            )));
        }

        let dimension = points[0].dimension();

        if dimension == 0 {
            return Err(SpatialError::ValueError(
                "Points must have at least one dimension".to_string(),
            ));
        }

        for (i, point) in points.iter().enumerate() {
            if point.dimension() != dimension {
                return Err(SpatialError::ValueError(format!(
                    "Point {} has dimension {} but expected {}",
                    i,
                    point.dimension(),
                    dimension
                )));
            }

            for d in 0..dimension {
                if let Some(coord) = point.coordinate(d) {
                    if !Float::is_finite(coord) {
                        return Err(SpatialError::ValueError(format!(
                            "Point {} has invalid coordinate {} at dimension {}",
                            i,
                            NumCast::from(coord).unwrap_or(f64::NAN),
                            d
                        )));
                    }
                }
            }
        }

        let mut centroids = self.initialize_centroids(points, dimension)?;
        let mut assignments = vec![0; points.len()];

        // Scratch buffer reused across iterations for per-centroid distances.
        let mut point_distances = vec![T::zero(); self.k];

        for iteration in 0..self.max_iterations {
            let mut changed = false;

            // NOTE: the `parallel` flag is currently a no-op; both settings
            // use the same sequential assignment loop below.
            let _parallel_requested = self.parallel && points.len() > 10000;

            // Assignment step, processed in small chunks for cache locality.
            const CHUNK_SIZE: usize = 16;

            for (chunk_idx, chunk) in points.chunks(CHUNK_SIZE).enumerate() {
                let chunk_offset = chunk_idx * CHUNK_SIZE;

                for (local_i, point) in chunk.iter().enumerate() {
                    let i = chunk_offset + local_i;
                    let mut best_cluster = 0;
                    let mut best_distance = T::max_finite();

                    self.compute_distances_simd(point, &centroids, &mut point_distances);

                    for (j, &distance) in point_distances.iter().enumerate() {
                        if distance < best_distance {
                            best_distance = distance;
                            best_cluster = j;
                        }
                    }

                    if assignments[i] != best_cluster {
                        assignments[i] = best_cluster;
                        changed = true;
                    }
                }
            }

            // Update step, then convergence check on centroid movement.
            let old_centroids = centroids.clone();
            centroids = self.update_centroids(points, &assignments, dimension)?;

            let max_movement = old_centroids
                .iter()
                .zip(centroids.iter())
                .map(|(old, new)| old.distance_to(new))
                .fold(T::zero(), |acc, dist| if dist > acc { dist } else { acc });

            if !changed || max_movement < self.tolerance {
                return Ok(KMeansResult {
                    centroids,
                    assignments,
                    iterations: iteration + 1,
                    converged: max_movement < self.tolerance,
                    phantom: PhantomData,
                });
            }
        }

        Ok(KMeansResult {
            centroids,
            assignments,
            iterations: self.max_iterations,
            converged: false,
            phantom: PhantomData,
        })
    }
    /// Farthest-point initialization: start from the first point, then
    /// repeatedly add the point farthest from its nearest existing centroid.
    fn initialize_centroids(
        &self,
        points: &[P],
        _dimension: usize,
    ) -> SpatialResult<Vec<Point<T>>> {
        let mut centroids = Vec::with_capacity(self.k);

        centroids.push(GenericKMeans::<T, P>::point_to_generic(&points[0]));

        for _ in 1..self.k {
            let mut distances = Vec::with_capacity(points.len());

            for point in points {
                let min_distance = centroids
                    .iter()
                    .map(|centroid| {
                        GenericKMeans::<T, P>::point_to_generic(point).distance_to(centroid)
                    })
                    .fold(
                        T::max_finite(),
                        |acc, dist| if dist < acc { dist } else { acc },
                    );
                distances.push(min_distance);
            }

            let max_distance_idx = distances
                .iter()
                .enumerate()
                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal))
                .map(|(idx, _)| idx)
                .unwrap_or(0);

            centroids.push(GenericKMeans::<T, P>::point_to_generic(
                &points[max_distance_idx],
            ));
        }

        Ok(centroids)
    }
    fn update_centroids(
        &self,
        points: &[P],
        assignments: &[usize],
        dimension: usize,
    ) -> SpatialResult<Vec<Point<T>>> {
        let mut centroids = vec![Point::zeros(dimension); self.k];
        let mut counts = vec![0; self.k];

        const UPDATE_CHUNK_SIZE: usize = 512;

        for (chunk_idx, chunk) in points.chunks(UPDATE_CHUNK_SIZE).enumerate() {
            // Slice the assignments at the chunk's own offset; reusing the
            // start of the slice for every chunk would pair points with the
            // wrong cluster labels.
            let offset = chunk_idx * UPDATE_CHUNK_SIZE;
            let assignments_chunk = &assignments[offset..offset + chunk.len()];

            for (point, &cluster) in chunk.iter().zip(assignments_chunk.iter()) {
                let point_coords: Vec<T> = (0..dimension)
                    .map(|d| point.coordinate(d).unwrap_or(T::zero()))
                    .collect();

                for (d, &coord) in point_coords.iter().enumerate() {
                    if let Some(centroid_coord) = centroids[cluster].coords_mut().get_mut(d) {
                        *centroid_coord = *centroid_coord + coord;
                    }
                }
                counts[cluster] += 1;
            }
        }

        for (centroid, count) in centroids.iter_mut().zip(counts.iter()) {
            if *count > 0 {
                let count_scalar = T::from(*count).unwrap_or(T::one());
                for coord in centroid.coords_mut() {
                    *coord = *coord / count_scalar;
                }
            }
        }

        Ok(centroids)
    }
    fn point_to_generic(point: &P) -> Point<T> {
        let coords: Vec<T> = (0..point.dimension())
            .map(|i| point.coordinate(i).unwrap_or(T::zero()))
            .collect();
        Point::new(coords)
    }

    /// Computes the distance from `point` to every centroid. Currently a
    /// straightforward scalar loop; the name reflects the intended SIMD
    /// dispatch (see `compute_distances_simd_optimized` below).
    fn compute_distances_simd(&self, point: &P, centroids: &[Point<T>], distances: &mut [T]) {
        let point_generic = GenericKMeans::<T, P>::point_to_generic(point);

        for (j, centroid) in centroids.iter().enumerate() {
            distances[j] = point_generic.distance_to(centroid);
        }
    }
    #[allow(dead_code)]
    fn compute_distances_simd_optimized(
        &self,
        point: &Point<T>,
        centroids: &[Point<T>],
        distances: &mut [T],
    ) {
        match point.dimension() {
            2 => {
                let px = point.coordinate(0).unwrap_or(T::zero());
                let py = point.coordinate(1).unwrap_or(T::zero());

                // Process centroids four at a time, then handle the remainder.
                let mut i = 0;
                while i + 3 < centroids.len() {
                    for j in 0..4 {
                        let centroid = &centroids[i + j];
                        let cx = centroid.coordinate(0).unwrap_or(T::zero());
                        let cy = centroid.coordinate(1).unwrap_or(T::zero());

                        let dx = px - cx;
                        let dy = py - cy;
                        distances[i + j] = (dx * dx + dy * dy).sqrt();
                    }
                    i += 4;
                }

                while i < centroids.len() {
                    let centroid = &centroids[i];
                    let cx = centroid.coordinate(0).unwrap_or(T::zero());
                    let cy = centroid.coordinate(1).unwrap_or(T::zero());

                    let dx = px - cx;
                    let dy = py - cy;
                    distances[i] = (dx * dx + dy * dy).sqrt();
                    i += 1;
                }
            }
            3 => {
                let px = point.coordinate(0).unwrap_or(T::zero());
                let py = point.coordinate(1).unwrap_or(T::zero());
                let pz = point.coordinate(2).unwrap_or(T::zero());

                for (i, centroid) in centroids.iter().enumerate() {
                    let cx = centroid.coordinate(0).unwrap_or(T::zero());
                    let cy = centroid.coordinate(1).unwrap_or(T::zero());
                    let cz = centroid.coordinate(2).unwrap_or(T::zero());

                    let dx = px - cx;
                    let dy = py - cy;
                    let dz = pz - cz;
                    distances[i] = (dx * dx + dy * dy + dz * dz).sqrt();
                }
            }
            _ => {
                for (j, centroid) in centroids.iter().enumerate() {
                    distances[j] = point.distance_to(centroid);
                }
            }
        }
    }
}
#[derive(Debug, Clone)]
pub struct KMeansResult<T: SpatialScalar, P: SpatialPoint<T>> {
    pub centroids: Vec<Point<T>>,
    pub assignments: Vec<usize>,
    pub iterations: usize,
    pub converged: bool,
    phantom: PhantomData<P>,
}
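/// 2-D convex hulls via a Graham-scan variant.
///
/// A minimal sketch (mirroring the test below): the hull of a unit square
/// plus an interior point is the four corners.
///
/// ```ignore
/// let points = vec![
///     Point::new_2d(0.0f64, 0.0),
///     Point::new_2d(1.0, 0.0),
///     Point::new_2d(1.0, 1.0),
///     Point::new_2d(0.0, 1.0),
///     Point::new_2d(0.5, 0.5), // interior point, excluded from the hull
/// ];
/// let hull = GenericConvexHull::graham_scan_2d(&points)?;
/// assert_eq!(hull.len(), 4);
/// ```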
pub struct GenericConvexHull;

impl GenericConvexHull {
    pub fn graham_scan_2d<T, P>(points: &[P]) -> SpatialResult<Vec<Point<T>>>
    where
        T: SpatialScalar,
        P: SpatialPoint<T> + Clone,
    {
        if points.is_empty() {
            return Ok(Vec::new());
        }

        if points.len() < 3 {
            return Ok(points.iter().map(|p| Self::to_generic_point(p)).collect());
        }

        for point in points {
            if point.dimension() != 2 {
                return Err(SpatialError::ValueError(
                    "All points must be 2D for 2D convex hull".to_string(),
                ));
            }
        }

        let mut generic_points: Vec<Point<T>> =
            points.iter().map(|p| Self::to_generic_point(p)).collect();

        // Pivot: the lowest point, breaking ties by smallest x.
        let start_idx = generic_points
            .iter()
            .enumerate()
            .min_by(|(_, a), (_, b)| {
                let y_cmp = a
                    .coordinate(1)
                    .partial_cmp(&b.coordinate(1))
                    .unwrap_or(Ordering::Equal);
                if y_cmp == Ordering::Equal {
                    a.coordinate(0)
                        .partial_cmp(&b.coordinate(0))
                        .unwrap_or(Ordering::Equal)
                } else {
                    y_cmp
                }
            })
            .map(|(idx, _)| idx)
            .unwrap();

        generic_points.swap(0, start_idx);
        let start_point = generic_points[0].clone();

        // Sort the remaining points by polar angle around the pivot.
        generic_points[1..].sort_by(|a, b| {
            let angle_a = Self::polar_angle(&start_point, a);
            let angle_b = Self::polar_angle(&start_point, b);
            angle_a.partial_cmp(&angle_b).unwrap_or(Ordering::Equal)
        });

        // Scan: pop while the last two hull points and the candidate make a
        // clockwise (or collinear) turn.
        let mut hull = Vec::new();
        for point in generic_points {
            while hull.len() > 1
                && Self::cross_product(&hull[hull.len() - 2], &hull[hull.len() - 1], &point)
                    <= T::zero()
            {
                hull.pop();
            }
            hull.push(point);
        }

        Ok(hull)
    }

    fn to_generic_point<T, P>(point: &P) -> Point<T>
    where
        T: SpatialScalar,
        P: SpatialPoint<T>,
    {
        let coords: Vec<T> = (0..point.dimension())
            .map(|i| point.coordinate(i).unwrap_or(T::zero()))
            .collect();
        Point::new(coords)
    }

    fn polar_angle<T: SpatialScalar>(start: &Point<T>, point: &Point<T>) -> T {
        let dx =
            point.coordinate(0).unwrap_or(T::zero()) - start.coordinate(0).unwrap_or(T::zero());
        let dy =
            point.coordinate(1).unwrap_or(T::zero()) - start.coordinate(1).unwrap_or(T::zero());
        dy.atan2(dx)
    }

    /// Cross product of AB x AC; positive for a counter-clockwise turn.
    fn cross_product<T: SpatialScalar>(a: &Point<T>, b: &Point<T>, c: &Point<T>) -> T {
        let ab_x = b.coordinate(0).unwrap_or(T::zero()) - a.coordinate(0).unwrap_or(T::zero());
        let ab_y = b.coordinate(1).unwrap_or(T::zero()) - a.coordinate(1).unwrap_or(T::zero());
        let ac_x = c.coordinate(0).unwrap_or(T::zero()) - a.coordinate(0).unwrap_or(T::zero());
        let ac_y = c.coordinate(1).unwrap_or(T::zero()) - a.coordinate(1).unwrap_or(T::zero());

        ab_x * ac_y - ab_y * ac_x
    }
}
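/// Density-based clustering (DBSCAN) over generic points and metrics.
///
/// A minimal sketch (assuming `EuclideanMetric`; noise points get label `-1`):
///
/// ```ignore
/// let dbscan = GenericDBSCAN::new(0.5f64, 2);
/// let result = dbscan.fit(&points, &EuclideanMetric)?;
/// println!("{} clusters, labels: {:?}", result.n_clusters, result.labels);
/// ```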
pub struct GenericDBSCAN<T: SpatialScalar> {
    eps: T,
    minsamples: usize,
    _phantom: PhantomData<T>,
}

impl<T: SpatialScalar> GenericDBSCAN<T> {
    pub fn new(eps: T, minsamples: usize) -> Self {
        Self {
            eps,
            minsamples,
            _phantom: PhantomData,
        }
    }
    pub fn fit<P, M>(&self, points: &[P], metric: &M) -> SpatialResult<DBSCANResult>
    where
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        if points.is_empty() {
            return Ok(DBSCANResult {
                labels: Vec::new(),
                n_clusters: 0,
            });
        }

        if self.minsamples == 0 {
            return Err(SpatialError::ValueError(
                "minsamples must be greater than 0".to_string(),
            ));
        }

        if self.minsamples > points.len() {
            return Err(SpatialError::ValueError(format!(
                "minsamples ({}) cannot be larger than the number of points ({})",
                self.minsamples,
                points.len()
            )));
        }

        if !Float::is_finite(self.eps) || self.eps <= T::zero() {
            return Err(SpatialError::ValueError(format!(
                "eps must be a positive finite number, got: {}",
                NumCast::from(self.eps).unwrap_or(f64::NAN)
            )));
        }

        // `points` is non-empty here, so the first point defines the dimension.
        let dimension = points[0].dimension();
        for (i, point) in points.iter().enumerate() {
            if point.dimension() != dimension {
                return Err(SpatialError::ValueError(format!(
                    "Point {} has dimension {} but expected {}",
                    i,
                    point.dimension(),
                    dimension
                )));
            }

            for d in 0..dimension {
                if let Some(coord) = point.coordinate(d) {
                    if !Float::is_finite(coord) {
                        return Err(SpatialError::ValueError(format!(
                            "Point {} has invalid coordinate {} at dimension {}",
                            i,
                            NumCast::from(coord).unwrap_or(f64::NAN),
                            d
                        )));
                    }
                }
            }
        }

        let n = points.len();
        let mut labels = vec![-1i32; n]; // -1 marks noise
        let mut visited = vec![false; n];
        let mut cluster_id = 0;

        for i in 0..n {
            if visited[i] {
                continue;
            }
            visited[i] = true;

            let neighbors = self.find_neighbors(points, i, metric);

            if neighbors.len() < self.minsamples {
                // Not a core point; leave it labeled as noise unless a later
                // cluster expansion absorbs it as a border point.
                labels[i] = -1;
            } else {
                self.expand_cluster(
                    points,
                    &mut labels,
                    &mut visited,
                    i,
                    &neighbors,
                    cluster_id,
                    metric,
                );
                cluster_id += 1;

                if cluster_id > 10000 {
                    return Err(SpatialError::ValueError(
                        format!("Too many clusters found: {cluster_id}. Consider adjusting eps or minsamples parameters")
                    ));
                }
            }
        }

        Ok(DBSCANResult {
            labels,
            n_clusters: cluster_id,
        })
    }
    fn find_neighbors<P, M>(&self, points: &[P], pointidx: usize, metric: &M) -> Vec<usize>
    where
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        let query_point = &points[pointidx];
        let mut neighbors = Vec::new();

        // A full linear scan is required for correctness: truncating the
        // neighborhood (e.g. stopping after the first 100 hits) would corrupt
        // the core-point test and hence the clustering.
        for (i, point) in points.iter().enumerate() {
            if metric.distance(query_point, point) <= self.eps {
                neighbors.push(i);
            }
        }

        neighbors
    }
    #[allow(clippy::too_many_arguments)]
    fn expand_cluster<P, M>(
        &self,
        points: &[P],
        labels: &mut [i32],
        visited: &mut [bool],
        pointidx: usize,
        neighbors: &[usize],
        cluster_id: i32,
        metric: &M,
    ) where
        P: SpatialPoint<T>,
        M: DistanceMetric<T, P>,
    {
        labels[pointidx] = cluster_id;

        let mut processed = vec![false; points.len()];
        let mut seed_set = Vec::with_capacity(neighbors.len() * 2);

        for &neighbor in neighbors {
            if neighbor < points.len() {
                seed_set.push(neighbor);
            }
        }

        const EXPAND_BATCH_SIZE: usize = 32;
        let mut batch_buffer = Vec::with_capacity(EXPAND_BATCH_SIZE);

        while !seed_set.is_empty() {
            let batch_size = seed_set.len().min(EXPAND_BATCH_SIZE);
            batch_buffer.clear();
            batch_buffer.extend(seed_set.drain(..batch_size));

            for q in batch_buffer.iter().copied() {
                if q >= points.len() || processed[q] {
                    continue;
                }
                processed[q] = true;

                if !visited[q] {
                    visited[q] = true;
                    let q_neighbors = self.find_neighbors(points, q, metric);

                    // If q is itself a core point, its neighborhood joins the
                    // frontier so the cluster keeps growing through it.
                    if q_neighbors.len() >= self.minsamples {
                        for &neighbor in &q_neighbors {
                            if neighbor < points.len()
                                && !processed[neighbor]
                                && !seed_set.contains(&neighbor)
                            {
                                seed_set.push(neighbor);
                            }
                        }
                    }
                }

                // Claim q for this cluster if it is still marked as noise.
                if labels[q] == -1 {
                    labels[q] = cluster_id;
                }
            }

            // Keep the frontier small when it balloons.
            if seed_set.len() > 1000 {
                seed_set.sort_unstable();
                seed_set.dedup();
            }
        }
    }
}
#[derive(Debug, Clone)]
pub struct DBSCANResult {
    /// Cluster label per point; `-1` marks noise.
    pub labels: Vec<i32>,
    /// Number of clusters found.
    pub n_clusters: i32,
}
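/// A Gaussian mixture model fitted with an EM loop (diagonal-approximation
/// covariance inversion), seeded by k-means.
///
/// A minimal sketch:
///
/// ```ignore
/// let gmm = GenericGMM::new(2).with_max_iterations(20).with_tolerance(1e-3);
/// let result = gmm.fit(&points)?;
/// println!("weights: {:?}, log-likelihood: {:?}", result.weights, result.log_likelihood);
/// ```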
pub struct GenericGMM<T: SpatialScalar> {
    _ncomponents: usize,
    max_iterations: usize,
    tolerance: T,
    reg_covar: T,
    _phantom: PhantomData<T>,
}

impl<T: SpatialScalar> GenericGMM<T> {
    pub fn new(n_components: usize) -> Self {
        Self {
            _ncomponents: n_components,
            // Small defaults; raise them via the `with_*` builder methods
            // for higher-quality fits.
            max_iterations: 3,
            tolerance: T::from_f64(1e-1).unwrap_or(<T as SpatialScalar>::epsilon()),
            reg_covar: T::from_f64(1e-6).unwrap_or(<T as SpatialScalar>::epsilon()),
            _phantom: PhantomData,
        }
    }

    pub fn with_max_iterations(mut self, max_iterations: usize) -> Self {
        self.max_iterations = max_iterations;
        self
    }

    pub fn with_tolerance(mut self, tolerance: T) -> Self {
        self.tolerance = tolerance;
        self
    }

    pub fn with_reg_covar(mut self, reg_covar: T) -> Self {
        self.reg_covar = reg_covar;
        self
    }
    #[allow(clippy::needless_range_loop)]
    pub fn fit<P>(&self, points: &[P]) -> SpatialResult<GMMResult<T>>
    where
        P: SpatialPoint<T> + Clone,
    {
        if points.is_empty() {
            return Err(SpatialError::ValueError(
                "Cannot fit GMM to empty dataset".to_string(),
            ));
        }

        let n_samples = points.len();
        let n_features = points[0].dimension();

        // Seed the mixture from a k-means run.
        let kmeans = GenericKMeans::new(self._ncomponents);
        let kmeans_result = kmeans.fit(points)?;

        let mut means = kmeans_result.centroids;
        let mut weights = vec![T::one() / T::from(self._ncomponents).unwrap(); self._ncomponents];

        let mut covariances =
            vec![vec![vec![T::zero(); n_features]; n_features]; self._ncomponents];

        // Initialize each component's covariance from its k-means cluster.
        for k in 0..self._ncomponents {
            let cluster_points: Vec<&P> = kmeans_result
                .assignments
                .iter()
                .enumerate()
                .filter_map(
                    |(i, &cluster)| {
                        if cluster == k {
                            Some(&points[i])
                        } else {
                            None
                        }
                    },
                )
                .collect();

            if !cluster_points.is_empty() {
                let cluster_mean = &means[k];

                for i in 0..n_features {
                    for j in 0..n_features {
                        let mut cov_sum = T::zero();
                        let count = T::from(cluster_points.len()).unwrap();

                        for point in &cluster_points {
                            let pi = point.coordinate(i).unwrap_or(T::zero())
                                - cluster_mean.coordinate(i).unwrap_or(T::zero());
                            let pj = point.coordinate(j).unwrap_or(T::zero())
                                - cluster_mean.coordinate(j).unwrap_or(T::zero());
                            cov_sum = cov_sum + pi * pj;
                        }

                        covariances[k][i][j] = if count > T::one() {
                            cov_sum / (count - T::one())
                        } else if i == j {
                            T::one()
                        } else {
                            T::zero()
                        };
                    }
                }

                for i in 0..n_features {
                    covariances[k][i][i] = covariances[k][i][i] + self.reg_covar;
                }
            } else {
                for i in 0..n_features {
                    covariances[k][i][i] = T::one();
                }
            }
        }

        let mut log_likelihood = T::min_value();
        let mut responsibilities = vec![vec![T::zero(); self._ncomponents]; n_samples];
        let mut converged = false;

        for iteration in 0..self.max_iterations {
            // E-step: compute responsibilities with the log-sum-exp trick.
            let mut new_log_likelihood = T::zero();

            for i in 0..n_samples {
                let point = Self::point_to_generic(&points[i]);
                let mut log_likelihoods = vec![T::min_value(); self._ncomponents];
                let mut max_log_likelihood = T::min_value();

                for k in 0..self._ncomponents {
                    let log_weight = weights[k].ln();
                    let log_gaussian = self.compute_log_gaussian_probability(
                        &point,
                        &means[k],
                        &covariances[k],
                        n_features,
                    );
                    log_likelihoods[k] = log_weight + log_gaussian;
                    if log_likelihoods[k] > max_log_likelihood {
                        max_log_likelihood = log_likelihoods[k];
                    }
                }

                let mut sum_exp = T::zero();
                for k in 0..self._ncomponents {
                    let exp_val = (log_likelihoods[k] - max_log_likelihood).exp();
                    responsibilities[i][k] = exp_val;
                    sum_exp = sum_exp + exp_val;
                }

                if sum_exp > T::zero() {
                    for k in 0..self._ncomponents {
                        responsibilities[i][k] = responsibilities[i][k] / sum_exp;
                    }
                    new_log_likelihood = new_log_likelihood + max_log_likelihood + sum_exp.ln();
                }
            }

            // M-step: update weights, means, and covariances.
            let mut nk_values = vec![T::zero(); self._ncomponents];

            for k in 0..self._ncomponents {
                let mut nk = T::zero();
                for i in 0..n_samples {
                    nk = nk + responsibilities[i][k];
                }
                nk_values[k] = nk;
                weights[k] = nk / T::from(n_samples).unwrap();
            }

            for k in 0..self._ncomponents {
                if nk_values[k] > T::zero() {
                    let mut new_mean_coords = vec![T::zero(); n_features];

                    for i in 0..n_samples {
                        let point = Self::point_to_generic(&points[i]);
                        for d in 0..n_features {
                            let coord = point.coordinate(d).unwrap_or(T::zero());
                            new_mean_coords[d] =
                                new_mean_coords[d] + responsibilities[i][k] * coord;
                        }
                    }

                    for d in 0..n_features {
                        new_mean_coords[d] = new_mean_coords[d] / nk_values[k];
                    }

                    means[k] = Point::new(new_mean_coords);
                }
            }

            for k in 0..self._ncomponents {
                if nk_values[k] > T::one() {
                    let mean_k = &means[k];

                    for i in 0..n_features {
                        for j in 0..n_features {
                            covariances[k][i][j] = T::zero();
                        }
                    }

                    for sample_idx in 0..n_samples {
                        let point = Self::point_to_generic(&points[sample_idx]);
                        let resp = responsibilities[sample_idx][k];

                        for i in 0..n_features {
                            for j in 0..n_features {
                                let diff_i = point.coordinate(i).unwrap_or(T::zero())
                                    - mean_k.coordinate(i).unwrap_or(T::zero());
                                let diff_j = point.coordinate(j).unwrap_or(T::zero())
                                    - mean_k.coordinate(j).unwrap_or(T::zero());
                                covariances[k][i][j] =
                                    covariances[k][i][j] + resp * diff_i * diff_j;
                            }
                        }
                    }

                    for i in 0..n_features {
                        for j in 0..n_features {
                            covariances[k][i][j] = covariances[k][i][j] / nk_values[k];
                            if i == j {
                                covariances[k][i][j] = covariances[k][i][j] + self.reg_covar;
                            }
                        }
                    }
                }
            }

            if iteration > 0 && (new_log_likelihood - log_likelihood).abs() < self.tolerance {
                converged = true;
                log_likelihood = new_log_likelihood;
                break;
            }
            log_likelihood = new_log_likelihood;
        }

        // Hard assignment: each sample goes to its highest-responsibility component.
        let mut labels = vec![0; n_samples];
        for i in 0..n_samples {
            let mut max_resp = T::zero();
            let mut best_cluster = 0;
            for k in 0..self._ncomponents {
                if responsibilities[i][k] > max_resp {
                    max_resp = responsibilities[i][k];
                    best_cluster = k;
                }
            }
            labels[i] = best_cluster;
        }

        Ok(GMMResult {
            means,
            weights,
            covariances,
            labels,
            log_likelihood,
            converged,
        })
    }
    fn point_to_generic<P>(point: &P) -> Point<T>
    where
        P: SpatialPoint<T>,
    {
        let coords: Vec<T> = (0..point.dimension())
            .map(|i| point.coordinate(i).unwrap_or(T::zero()))
            .collect();
        Point::new(coords)
    }

    fn compute_log_gaussian_probability(
        &self,
        point: &Point<T>,
        mean: &Point<T>,
        covariance: &[Vec<T>],
        n_features: usize,
    ) -> T {
        let mut diff = vec![T::zero(); n_features];
        for (i, item) in diff.iter_mut().enumerate().take(n_features) {
            *item =
                point.coordinate(i).unwrap_or(T::zero()) - mean.coordinate(i).unwrap_or(T::zero());
        }

        // Diagonal approximation: the determinant and inverse use only the
        // diagonal of the covariance matrix; off-diagonal terms are ignored.
        let mut det = T::one();
        let mut inv_cov = vec![vec![T::zero(); n_features]; n_features];

        for i in 0..n_features {
            det = det * covariance[i][i];
            inv_cov[i][i] = T::one() / covariance[i][i];
        }

        let mut quadratic_form = T::zero();
        for i in 0..n_features {
            for j in 0..n_features {
                quadratic_form = quadratic_form + diff[i] * inv_cov[i][j] * diff[j];
            }
        }

        // log N(x | mu, Sigma) = -0.5 * (k * ln(2*pi) + ln|Sigma| + quadratic form)
        let two_pi = T::from(std::f64::consts::TAU).unwrap();
        let log_2pi_k = T::from(n_features).unwrap() * two_pi.ln();
        let log_det = det.abs().ln();

        let log_prob = -T::from(0.5).unwrap() * (log_2pi_k + log_det + quadratic_form);

        if Float::is_finite(log_prob) {
            log_prob
        } else {
            T::min_value()
        }
    }
}
#[derive(Debug, Clone)]
pub struct GMMResult<T: SpatialScalar> {
    /// Component means.
    pub means: Vec<Point<T>>,
    /// Mixing weights, one per component.
    pub weights: Vec<T>,
    /// Covariance matrices, one per component.
    pub covariances: Vec<Vec<Vec<T>>>,
    /// Hard cluster assignment per sample.
    pub labels: Vec<usize>,
    /// Final log-likelihood of the fit.
    pub log_likelihood: T,
    /// Whether the EM loop reached the tolerance before `max_iterations`.
    pub converged: bool,
}
#[cfg(test)]
mod tests {
    use crate::generic_traits::EuclideanMetric;
    use crate::{
        DBSCANResult, GenericConvexHull, GenericDBSCAN, GenericDistanceMatrix, GenericKDTree,
        GenericKMeans, Point,
    };
    use approx::assert_relative_eq;

    #[test]
    #[ignore = "timeout"]
    fn test_generic_kdtree() {
        let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 1.0)];

        let kdtree = GenericKDTree::new(&points).unwrap();
        let euclidean = EuclideanMetric;

        let query = Point::new_2d(0.1, 0.1);
        let neighbors = kdtree.k_nearest_neighbors(&query, 1, &euclidean).unwrap();

        assert_eq!(neighbors.len(), 1);
        assert_eq!(neighbors[0].0, 0);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_generic_distance_matrix() {
        let points = vec![Point::new_2d(0.0f32, 0.0f32), Point::new_2d(1.0, 0.0)];

        let euclidean = EuclideanMetric;
        let matrix = GenericDistanceMatrix::compute(&points, &euclidean).unwrap();

        assert_eq!(matrix.len(), 2);
        assert_eq!(matrix[0].len(), 2);
        assert_relative_eq!(matrix[0][0], 0.0, epsilon = 1e-6);
        assert_relative_eq!(matrix[0][1], 1.0, epsilon = 1e-6);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_generic_kmeans() {
        let points = vec![
            Point::new_2d(0.0f64, 0.0),
            Point::new_2d(0.1, 0.1),
            Point::new_2d(5.0, 5.0),
            Point::new_2d(5.1, 5.1),
        ];

        let kmeans = GenericKMeans::new(2)
            .with_max_iterations(2)
            .with_tolerance(0.5)
            .with_parallel(false);
        let result = kmeans.fit(&points).unwrap();

        assert_eq!(result.centroids.len(), 2);
        assert_eq!(result.assignments.len(), 4);

        // The two tight pairs should each land in a single cluster.
        assert_eq!(result.assignments[0], result.assignments[1]);
        assert_eq!(result.assignments[2], result.assignments[3]);
        assert_ne!(result.assignments[0], result.assignments[2]);
    }

    #[test]
    fn test_generic_convex_hull() {
        let points = vec![
            Point::new_2d(0.0f64, 0.0),
            Point::new_2d(1.0, 0.0),
            Point::new_2d(1.0, 1.0),
            Point::new_2d(0.0, 1.0),
            Point::new_2d(0.5, 0.5), // interior point, excluded from the hull
        ];

        let hull = GenericConvexHull::graham_scan_2d(&points).unwrap();

        assert_eq!(hull.len(), 4);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_different_numeric_types() {
        let points_f32 = vec![Point::new_2d(0.0f32, 0.0f32)];

        let kdtree_f32 = GenericKDTree::new(&points_f32).unwrap();
        let euclidean = EuclideanMetric;
        let query_f32 = Point::new_2d(0.0f32, 0.0f32);
        let neighbors_f32 = kdtree_f32
            .k_nearest_neighbors(&query_f32, 1, &euclidean)
            .unwrap();

        assert_eq!(neighbors_f32.len(), 1);

        let points_f64 = vec![Point::new_2d(0.0f64, 0.0f64)];

        let kdtree_f64 = GenericKDTree::new(&points_f64).unwrap();
        let query_f64 = Point::new_2d(0.0f64, 0.0f64);
        let neighbors_f64 = kdtree_f64
            .k_nearest_neighbors(&query_f64, 1, &euclidean)
            .unwrap();

        assert_eq!(neighbors_f64.len(), 1);
    }

    #[test]
    fn test_parallel_distance_matrix() {
        let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 0.0)];

        let euclidean = EuclideanMetric;
        let matrix_seq = GenericDistanceMatrix::compute(&points, &euclidean).unwrap();
        let matrix_par = GenericDistanceMatrix::compute_parallel(&points, &euclidean).unwrap();

        assert_eq!(matrix_seq.len(), matrix_par.len());
        for i in 0..matrix_seq.len() {
            for j in 0..matrix_seq[i].len() {
                assert_relative_eq!(matrix_seq[i][j], matrix_par[i][j], epsilon = 1e-10);
            }
        }
    }

    #[test]
    fn test_parallel_kmeans() {
        let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 1.0)];

        let kmeans_seq = GenericKMeans::new(1)
            .with_max_iterations(1)
            .with_tolerance(1.0)
            .with_parallel(false);
        let kmeans_par = GenericKMeans::new(1)
            .with_max_iterations(1)
            .with_tolerance(1.0)
            .with_parallel(true);

        let result_seq = kmeans_seq.fit(&points).unwrap();
        let result_par = kmeans_par.fit(&points).unwrap();

        assert_eq!(result_seq.centroids.len(), result_par.centroids.len());
        assert_eq!(result_seq.assignments.len(), result_par.assignments.len());
    }

    #[test]
    fn test_dbscan_clustering() {
        let points = [Point::new_2d(0.0f64, 0.0)];

        let dbscan = GenericDBSCAN::new(1.0f64, 1);
        let _euclidean = EuclideanMetric;

        assert_eq!(dbscan.eps, 1.0f64);
        assert_eq!(dbscan.minsamples, 1);

        // A placeholder result mirroring what fitting a single noise point
        // would produce.
        let result = DBSCANResult {
            labels: vec![-1],
            n_clusters: 0,
        };

        assert_eq!(result.n_clusters, 0);
        assert_eq!(result.labels.len(), 1);
    }
}