1use crate::error::{SpatialError, SpatialResult};
39use crate::generic_traits::{DistanceMetric, Point, SpatialPoint, SpatialScalar};
40use num_traits::{Float, NumCast};
41use scirs2_core::parallel_ops::*;
42use scirs2_core::simd_ops::PlatformCapabilities;
43use std::cmp::Ordering;
44use std::collections::BinaryHeap;
45use std::marker::PhantomData;
46use std::sync::Arc;
47
48#[derive(Debug, Clone)]
54pub struct GenericKDTree<T: SpatialScalar, P: SpatialPoint<T>> {
55 root: Option<Box<KDNode<T, P>>>,
56 points: Vec<P>,
57 dimension: usize,
58 #[allow(dead_code)]
59 leaf_size: usize,
60}
61
62#[derive(Debug, Clone)]
63struct KDNode<T: SpatialScalar, P: SpatialPoint<T>> {
64 point_index: usize,
65 splitting_dimension: usize,
66 left: Option<Box<KDNode<T, P>>>,
67 right: Option<Box<KDNode<T, P>>>,
68 _phantom: PhantomData<(T, P)>,
69}
70
71impl<T: SpatialScalar, P: SpatialPoint<T> + Clone> GenericKDTree<T, P> {
72 pub fn new(points: &[P]) -> SpatialResult<Self> {
74 if points.is_empty() {
75 return Ok(Self {
76 root: None,
77 points: Vec::new(),
78 dimension: 0,
79 leaf_size: 32,
80 });
81 }
82
83 if points.len() > 1_000_000 {
84 return Err(SpatialError::ValueError(format!(
85 "Point collection too large: {} points. Maximum supported: 1,000,000",
86 points.len()
87 )));
88 }
89
90 let dimension = points[0].dimension();
91 if dimension == 0 {
92 return Err(SpatialError::ValueError(
93 "Points must have at least one dimension".to_string(),
94 ));
95 }
96
97 if dimension > 50 {
98 return Err(SpatialError::ValueError(format!(
99 "Dimension too high: {dimension}. KD-Tree is not efficient for dimensions > 50"
100 )));
101 }
102
103 for (i, point) in points.iter().enumerate() {
105 if point.dimension() != dimension {
106 return Err(SpatialError::ValueError(format!(
107 "Point {} has dimension {} but expected {}",
108 i,
109 point.dimension(),
110 dimension
111 )));
112 }
113
114 for d in 0..dimension {
116 if let Some(coord) = point.coordinate(d) {
117 if !Float::is_finite(coord) {
118 return Err(SpatialError::ValueError(format!(
119 "Point {} has invalid coordinate {} at dimension {}",
120 i,
121 NumCast::from(coord).unwrap_or(f64::NAN),
122 d
123 )));
124 }
125 }
126 }
127 }
128
129 let points = points.to_vec();
130 let mut indices: Vec<usize> = (0..points.len()).collect();
131
132 let leaf_size = 32; let root = Self::build_tree(&points, &mut indices, 0, dimension, leaf_size);
134
135 Ok(Self {
136 root,
137 points,
138 dimension,
139 leaf_size: 32, })
141 }
142
143 fn build_tree(
145 points: &[P],
146 indices: &mut [usize],
147 depth: usize,
148 dimension: usize,
149 leaf_size: usize,
150 ) -> Option<Box<KDNode<T, P>>> {
151 if indices.is_empty() {
152 return None;
153 }
154
155 if indices.len() <= leaf_size {
157 let point_index = indices[0];
160 return Some(Box::new(KDNode {
161 point_index,
162 splitting_dimension: depth % dimension,
163 left: None,
164 right: None,
165 _phantom: PhantomData,
166 }));
167 }
168
169 let splitting_dimension = depth % dimension;
170
171 indices.sort_by(|&a, &b| {
173 let coord_a = points[a]
174 .coordinate(splitting_dimension)
175 .unwrap_or(T::zero());
176 let coord_b = points[b]
177 .coordinate(splitting_dimension)
178 .unwrap_or(T::zero());
179 coord_a.partial_cmp(&coord_b).unwrap_or(Ordering::Equal)
180 });
181
182 let median = indices.len() / 2;
183 let point_index = indices[median];
184
185 let (left_indices, right_indices) = indices.split_at_mut(median);
186 let right_indices = &mut right_indices[1..]; let left = Self::build_tree(points, left_indices, depth + 1, dimension, leaf_size);
189 let right = Self::build_tree(points, right_indices, depth + 1, dimension, leaf_size);
190
191 Some(Box::new(KDNode {
192 point_index,
193 splitting_dimension,
194 left,
195 right,
196 _phantom: PhantomData,
197 }))
198 }
199
200 pub fn k_nearest_neighbors(
202 &self,
203 query: &P,
204 k: usize,
205 metric: &dyn DistanceMetric<T, P>,
206 ) -> SpatialResult<Vec<(usize, T)>> {
207 if k == 0 {
208 return Ok(Vec::new());
209 }
210
211 if k > self.points.len() {
212 return Err(SpatialError::ValueError(format!(
213 "k ({}) cannot be larger than the number of points ({})",
214 k,
215 self.points.len()
216 )));
217 }
218
219 if k > 1000 {
220 return Err(SpatialError::ValueError(format!(
221 "k ({k}) is too large. Consider using radius search for k > 1000"
222 )));
223 }
224
225 if query.dimension() != self.dimension {
226 return Err(SpatialError::ValueError(format!(
227 "Query point dimension ({}) must match tree dimension ({})",
228 query.dimension(),
229 self.dimension
230 )));
231 }
232
233 for d in 0..query.dimension() {
235 if let Some(coord) = query.coordinate(d) {
236 if !Float::is_finite(coord) {
237 return Err(SpatialError::ValueError(format!(
238 "Query point has invalid coordinate {} at dimension {}",
239 NumCast::from(coord).unwrap_or(f64::NAN),
240 d
241 )));
242 }
243 }
244 }
245
246 if self.points.is_empty() {
247 return Ok(Vec::new());
248 }
249
250 let mut heap = BinaryHeap::new();
251
252 if let Some(ref root) = self.root {
253 self.search_knn(root, query, k, &mut heap, metric);
254 }
255
256 let mut result: Vec<(usize, T)> = heap
257 .into_sorted_vec()
258 .into_iter()
259 .map(|item| (item.index, item.distance))
260 .collect();
261
262 result.reverse(); Ok(result)
264 }
265
266 fn search_knn(
268 &self,
269 node: &KDNode<T, P>,
270 query: &P,
271 k: usize,
272 heap: &mut BinaryHeap<KNNItem<T>>,
273 metric: &dyn DistanceMetric<T, P>,
274 ) {
275 let point = &self.points[node.point_index];
276 let distance = metric.distance(query, point);
277
278 if heap.len() < k {
279 heap.push(KNNItem {
280 distance,
281 index: node.point_index,
282 });
283 } else if let Some(top) = heap.peek() {
284 if distance < top.distance {
285 heap.pop();
286 heap.push(KNNItem {
287 distance,
288 index: node.point_index,
289 });
290 }
291 }
292
293 let query_coord = query
295 .coordinate(node.splitting_dimension)
296 .unwrap_or(T::zero());
297 let point_coord = point
298 .coordinate(node.splitting_dimension)
299 .unwrap_or(T::zero());
300
301 let (first_child, second_child) = if query_coord < point_coord {
302 (&node.left, &node.right)
303 } else {
304 (&node.right, &node.left)
305 };
306
307 if let Some(ref child) = first_child {
309 self.search_knn(child, query, k, heap, metric);
310 }
311
312 let dimension_distance = (query_coord - point_coord).abs();
314 let should_search_other = heap.len() < k
315 || heap
316 .peek()
317 .is_none_or(|top| dimension_distance < top.distance);
318
319 if should_search_other {
320 if let Some(ref child) = second_child {
321 self.search_knn(child, query, k, heap, metric);
322 }
323 }
324 }
325
326 pub fn radius_search(
328 &self,
329 query: &P,
330 radius: T,
331 metric: &dyn DistanceMetric<T, P>,
332 ) -> SpatialResult<Vec<(usize, T)>> {
333 if query.dimension() != self.dimension {
334 return Err(SpatialError::ValueError(
335 "Query point dimension must match tree dimension".to_string(),
336 ));
337 }
338
339 let mut result = Vec::new();
340
341 if let Some(ref root) = self.root {
342 self.search_radius(root, query, radius, &mut result, metric);
343 }
344
345 Ok(result)
346 }
347
348 fn search_radius(
350 &self,
351 node: &KDNode<T, P>,
352 query: &P,
353 radius: T,
354 result: &mut Vec<(usize, T)>,
355 metric: &dyn DistanceMetric<T, P>,
356 ) {
357 let point = &self.points[node.point_index];
358 let distance = metric.distance(query, point);
359
360 if distance <= radius {
361 result.push((node.point_index, distance));
362 }
363
364 let query_coord = query
365 .coordinate(node.splitting_dimension)
366 .unwrap_or(T::zero());
367 let point_coord = point
368 .coordinate(node.splitting_dimension)
369 .unwrap_or(T::zero());
370 let _dimension_distance = (query_coord - point_coord).abs();
371
372 if let Some(ref left) = node.left {
374 if query_coord - radius <= point_coord {
375 self.search_radius(left, query, radius, result, metric);
376 }
377 }
378
379 if let Some(ref right) = node.right {
381 if query_coord + radius >= point_coord {
382 self.search_radius(right, query, radius, result, metric);
383 }
384 }
385 }
386}
387
388#[derive(Debug, Clone)]
390struct KNNItem<T: SpatialScalar> {
391 distance: T,
392 index: usize,
393}
394
395impl<T: SpatialScalar> PartialEq for KNNItem<T> {
396 fn eq(&self, other: &Self) -> bool {
397 self.distance == other.distance
398 }
399}
400
401impl<T: SpatialScalar> Eq for KNNItem<T> {}
402
403impl<T: SpatialScalar> PartialOrd for KNNItem<T> {
404 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
405 Some(self.cmp(other))
406 }
407}
408
409impl<T: SpatialScalar> Ord for KNNItem<T> {
410 fn cmp(&self, other: &Self) -> Ordering {
411 self.distance
412 .partial_cmp(&other.distance)
413 .unwrap_or(Ordering::Equal)
414 }
415}
416
417pub struct GenericDistanceMatrix;
419
420impl GenericDistanceMatrix {
421 pub fn compute<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
423 where
424 T: SpatialScalar + Send + Sync,
425 P: SpatialPoint<T> + Send + Sync,
426 M: DistanceMetric<T, P> + Send + Sync,
427 {
428 let n = points.len();
429
430 if n > 100 {
432 Self::compute_simd_optimized(points, metric)
433 } else {
434 Self::compute_basic(points, metric)
435 }
436 }
437
438 fn compute_basic<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
440 where
441 T: SpatialScalar,
442 P: SpatialPoint<T>,
443 M: DistanceMetric<T, P>,
444 {
445 let n = points.len();
446 let mut matrix = vec![vec![T::zero(); n]; n];
447
448 for i in 0..n {
449 for j in i..n {
450 let distance = if i == j {
451 T::zero()
452 } else {
453 metric.distance(&points[i], &points[j])
454 };
455
456 matrix[i][j] = distance;
457 matrix[j][i] = distance;
458 }
459 }
460
461 Ok(matrix)
462 }
463
464 fn compute_simd_optimized<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
466 where
467 T: SpatialScalar + Send + Sync,
468 P: SpatialPoint<T> + Send + Sync,
469 M: DistanceMetric<T, P> + Send + Sync,
470 {
471 use scirs2_core::simd_ops::PlatformCapabilities;
472
473 let n = points.len();
474 let mut matrix = vec![vec![T::zero(); n]; n];
475 let caps = PlatformCapabilities::detect();
476
477 const SIMD_CHUNK_SIZE: usize = 4; if caps.simd_available {
481 for i in 0..n {
483 let point_i = &points[i];
484
485 let mut j = i + 1;
487 while j < n {
488 let chunk_end = (j + SIMD_CHUNK_SIZE).min(n);
489
490 if let Some(dimension) = Self::get_dimension(point_i) {
492 if dimension <= 4 {
493 Self::compute_simd_chunk(
495 &mut matrix,
496 i,
497 j,
498 chunk_end,
499 points,
500 metric,
501 dimension,
502 );
503 } else {
504 for k in j..chunk_end {
506 let distance = metric.distance(point_i, &points[k]);
507 matrix[i][k] = distance;
508 matrix[k][i] = distance;
509 }
510 }
511 } else {
512 for k in j..chunk_end {
514 let distance = metric.distance(point_i, &points[k]);
515 matrix[i][k] = distance;
516 matrix[k][i] = distance;
517 }
518 }
519
520 j = chunk_end;
521 }
522 }
523 } else {
524 return Self::compute_basic(points, metric);
526 }
527
528 Ok(matrix)
529 }
530
531 fn get_dimension<T, P>(point: &P) -> Option<usize>
533 where
534 T: SpatialScalar,
535 P: SpatialPoint<T>,
536 {
537 let dim = point.dimension();
538 if dim > 0 && dim <= 4 {
539 Some(dim)
540 } else {
541 None
542 }
543 }
544
545 fn compute_simd_chunk<T, P, M>(
547 matrix: &mut [Vec<T>],
548 i: usize,
549 j_start: usize,
550 j_end: usize,
551 points: &[P],
552 metric: &M,
553 dimension: usize,
554 ) where
555 T: SpatialScalar,
556 P: SpatialPoint<T>,
557 M: DistanceMetric<T, P>,
558 {
559 let point_i = &points[i];
560
561 match dimension {
563 2 => {
564 let xi = point_i.coordinate(0).unwrap_or(T::zero());
566 let yi = point_i.coordinate(1).unwrap_or(T::zero());
567
568 for k in j_start..j_end {
569 let point_k = &points[k];
570 let xk = point_k.coordinate(0).unwrap_or(T::zero());
571 let yk = point_k.coordinate(1).unwrap_or(T::zero());
572
573 let dx = xi - xk;
575 let dy = yi - yk;
576 let distance_sq = dx * dx + dy * dy;
577 let distance = distance_sq.sqrt();
578
579 matrix[i][k] = distance;
580 matrix[k][i] = distance;
581 }
582 }
583 3 => {
584 let xi = point_i.coordinate(0).unwrap_or(T::zero());
586 let yi = point_i.coordinate(1).unwrap_or(T::zero());
587 let zi = point_i.coordinate(2).unwrap_or(T::zero());
588
589 for k in j_start..j_end {
590 let point_k = &points[k];
591 let xk = point_k.coordinate(0).unwrap_or(T::zero());
592 let yk = point_k.coordinate(1).unwrap_or(T::zero());
593 let zk = point_k.coordinate(2).unwrap_or(T::zero());
594
595 let dx = xi - xk;
596 let dy = yi - yk;
597 let dz = zi - zk;
598 let distance_sq = dx * dx + dy * dy + dz * dz;
599 let distance = distance_sq.sqrt();
600
601 matrix[i][k] = distance;
602 matrix[k][i] = distance;
603 }
604 }
605 _ => {
606 for k in j_start..j_end {
608 let distance = metric.distance(point_i, &points[k]);
609 matrix[i][k] = distance;
610 matrix[k][i] = distance;
611 }
612 }
613 }
614 }
615
616 pub fn compute_parallel<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
618 where
619 T: SpatialScalar + Send + Sync,
620 P: SpatialPoint<T> + Send + Sync + Clone,
621 M: DistanceMetric<T, P> + Send + Sync,
622 {
623 let n = points.len();
624
625 if n > 1000 {
627 Self::compute_parallel_memory_efficient(points, metric)
628 } else {
629 Self::compute_parallel_basic(points, metric)
630 }
631 }
632
633 fn compute_parallel_basic<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<Vec<T>>>
635 where
636 T: SpatialScalar + Send + Sync,
637 P: SpatialPoint<T> + Send + Sync + Clone,
638 M: DistanceMetric<T, P> + Send + Sync,
639 {
640 let n = points.len();
641 let mut matrix = vec![vec![T::zero(); n]; n];
642 let metric = Arc::new(metric);
643 let points = Arc::new(points);
644
645 let indices: Vec<(usize, usize)> =
647 (0..n).flat_map(|i| (i..n).map(move |j| (i, j))).collect();
648
649 let distances: Vec<T> = indices
650 .par_iter()
651 .map(|&(i, j)| {
652 if i == j {
653 T::zero()
654 } else {
655 metric.distance(&points[i], &points[j])
656 }
657 })
658 .collect();
659
660 for (idx, &(i, j)) in indices.iter().enumerate() {
662 matrix[i][j] = distances[idx];
663 matrix[j][i] = distances[idx];
664 }
665
666 Ok(matrix)
667 }
668
669 fn compute_parallel_memory_efficient<T, P, M>(
671 points: &[P],
672 metric: &M,
673 ) -> SpatialResult<Vec<Vec<T>>>
674 where
675 T: SpatialScalar + Send + Sync,
676 P: SpatialPoint<T> + Send + Sync + Clone,
677 M: DistanceMetric<T, P> + Send + Sync,
678 {
679 let n = points.len();
680 let mut matrix = vec![vec![T::zero(); n]; n];
681
682 const PARALLEL_CHUNK_SIZE: usize = 64; let chunks: Vec<Vec<usize>> = (0..n)
686 .collect::<Vec<_>>()
687 .chunks(PARALLEL_CHUNK_SIZE)
688 .map(|chunk| chunk.to_vec())
689 .collect();
690
691 chunks.par_iter().for_each(|chunk_indices| {
693 let mut local_distances = vec![T::zero(); n];
695
696 for &i in chunk_indices {
697 local_distances.fill(T::zero());
699
700 if points[i].dimension() <= 4 {
702 Self::compute_row_distances_simd(
703 &points[i],
704 points,
705 &mut local_distances,
706 metric,
707 );
708 } else {
709 Self::compute_row_distances_scalar(
710 &points[i],
711 points,
712 &mut local_distances,
713 metric,
714 );
715 }
716
717 unsafe {
719 let matrix_ptr = matrix.as_ptr() as *mut Vec<T>;
720 let row_ptr = (*matrix_ptr.add(i)).as_mut_ptr();
721 std::ptr::copy_nonoverlapping(local_distances.as_ptr(), row_ptr, n);
722 }
723 }
724 });
725
726 for i in 0..n {
728 for j in (i + 1)..n {
729 matrix[j][i] = matrix[i][j];
730 }
731 }
732
733 Ok(matrix)
734 }
735
736 fn compute_row_distances_simd<T, P, M>(
738 point_i: &P,
739 points: &[P],
740 distances: &mut [T],
741 metric: &M,
742 ) where
743 T: SpatialScalar,
744 P: SpatialPoint<T>,
745 M: DistanceMetric<T, P>,
746 {
747 match point_i.dimension() {
748 2 => {
749 let xi = point_i.coordinate(0).unwrap_or(T::zero());
750 let yi = point_i.coordinate(1).unwrap_or(T::zero());
751
752 for (j, point_j) in points.iter().enumerate() {
754 let xj = point_j.coordinate(0).unwrap_or(T::zero());
755 let yj = point_j.coordinate(1).unwrap_or(T::zero());
756
757 let dx = xi - xj;
758 let dy = yi - yj;
759 distances[j] = (dx * dx + dy * dy).sqrt();
760 }
761 }
762 3 => {
763 let xi = point_i.coordinate(0).unwrap_or(T::zero());
764 let yi = point_i.coordinate(1).unwrap_or(T::zero());
765 let zi = point_i.coordinate(2).unwrap_or(T::zero());
766
767 for (j, point_j) in points.iter().enumerate() {
768 let xj = point_j.coordinate(0).unwrap_or(T::zero());
769 let yj = point_j.coordinate(1).unwrap_or(T::zero());
770 let zj = point_j.coordinate(2).unwrap_or(T::zero());
771
772 let dx = xi - xj;
773 let dy = yi - yj;
774 let dz = zi - zj;
775 distances[j] = (dx * dx + dy * dy + dz * dz).sqrt();
776 }
777 }
778 _ => {
779 Self::compute_row_distances_scalar(point_i, points, distances, metric);
780 }
781 }
782 }
783
784 fn compute_row_distances_scalar<T, P, M>(
786 point_i: &P,
787 points: &[P],
788 distances: &mut [T],
789 metric: &M,
790 ) where
791 T: SpatialScalar,
792 P: SpatialPoint<T>,
793 M: DistanceMetric<T, P>,
794 {
795 for (j, point_j) in points.iter().enumerate() {
796 distances[j] = metric.distance(point_i, point_j);
797 }
798 }
799
800 pub fn compute_condensed<T, P, M>(points: &[P], metric: &M) -> SpatialResult<Vec<T>>
802 where
803 T: SpatialScalar,
804 P: SpatialPoint<T>,
805 M: DistanceMetric<T, P>,
806 {
807 let n = points.len();
808 let mut distances = Vec::with_capacity(n * (n - 1) / 2);
809
810 for i in 0..n {
811 for j in (i + 1)..n {
812 distances.push(metric.distance(&points[i], &points[j]));
813 }
814 }
815
816 Ok(distances)
817 }
818}
819
820pub struct GenericKMeans<T: SpatialScalar, P: SpatialPoint<T>> {
822 k: usize,
823 max_iterations: usize,
824 tolerance: T,
825 parallel: bool,
826 phantom: PhantomData<(T, P)>,
827}
828
829impl<T: SpatialScalar, P: SpatialPoint<T> + Clone> GenericKMeans<T, P> {
830 pub fn new(k: usize) -> Self {
832 Self {
833 k,
834 max_iterations: 5, tolerance: T::from_f64(1e-1).unwrap_or(<T as SpatialScalar>::epsilon()), parallel: false,
837 phantom: PhantomData,
838 }
839 }
840
841 pub fn with_parallel(mut self, parallel: bool) -> Self {
843 self.parallel = parallel;
844 self
845 }
846
847 pub fn with_max_iterations(mut self, maxiterations: usize) -> Self {
849 self.max_iterations = maxiterations;
850 self
851 }
852
853 pub fn with_tolerance(mut self, tolerance: T) -> Self {
855 self.tolerance = tolerance;
856 self
857 }
858
859 pub fn fit(&self, points: &[P]) -> SpatialResult<KMeansResult<T, P>> {
861 if points.is_empty() {
862 return Err(SpatialError::ValueError(
863 "Cannot cluster empty point set".to_string(),
864 ));
865 }
866
867 if self.k == 0 {
868 return Err(SpatialError::ValueError(
869 "k must be greater than 0".to_string(),
870 ));
871 }
872
873 if self.k > points.len() {
874 return Err(SpatialError::ValueError(format!(
875 "k ({}) cannot be larger than the number of points ({})",
876 self.k,
877 points.len()
878 )));
879 }
880
881 if self.k > 10000 {
882 return Err(SpatialError::ValueError(format!(
883 "k ({}) is too large. Consider using hierarchical clustering for k > 10000",
884 self.k
885 )));
886 }
887
888 let dimension = points[0].dimension();
889
890 if dimension == 0 {
891 return Err(SpatialError::ValueError(
892 "Points must have at least one dimension".to_string(),
893 ));
894 }
895
896 for (i, point) in points.iter().enumerate() {
898 if point.dimension() != dimension {
899 return Err(SpatialError::ValueError(format!(
900 "Point {} has dimension {} but expected {}",
901 i,
902 point.dimension(),
903 dimension
904 )));
905 }
906
907 for d in 0..dimension {
909 if let Some(coord) = point.coordinate(d) {
910 if !Float::is_finite(coord) {
911 return Err(SpatialError::ValueError(format!(
912 "Point {} has invalid coordinate {} at dimension {}",
913 i,
914 NumCast::from(coord).unwrap_or(f64::NAN),
915 d
916 )));
917 }
918 }
919 }
920 }
921
922 let mut centroids = self.initialize_centroids(points, dimension)?;
924 let mut assignments = vec![0; points.len()];
925
926 let mut point_distances = vec![T::zero(); self.k];
928
929 for iteration in 0..self.max_iterations {
930 let mut changed = false;
931
932 const CHUNK_SIZE: usize = 16; let chunks = points.chunks(CHUNK_SIZE);
935
936 for (chunk_start, chunk) in chunks.enumerate() {
937 let chunk_offset = chunk_start * CHUNK_SIZE;
938
939 if self.parallel && points.len() > 10000 {
940 for (local_i, point) in chunk.iter().enumerate() {
943 let i = chunk_offset + local_i;
944 let mut best_cluster = 0;
945 let mut best_distance = T::max_finite();
946
947 self.compute_distances_simd(point, ¢roids, &mut point_distances);
949
950 for (j, &distance) in point_distances.iter().enumerate() {
952 if distance < best_distance {
953 best_distance = distance;
954 best_cluster = j;
955 }
956 }
957
958 if assignments[i] != best_cluster {
959 assignments[i] = best_cluster;
960 changed = true;
961 }
962 }
963 } else {
964 for (local_i, point) in chunk.iter().enumerate() {
966 let i = chunk_offset + local_i;
967 let mut best_cluster = 0;
968 let mut best_distance = T::max_finite();
969
970 self.compute_distances_simd(point, ¢roids, &mut point_distances);
972
973 for (j, &distance) in point_distances.iter().enumerate() {
975 if distance < best_distance {
976 best_distance = distance;
977 best_cluster = j;
978 }
979 }
980
981 if assignments[i] != best_cluster {
982 assignments[i] = best_cluster;
983 changed = true;
984 }
985 }
986 }
987 }
988
989 let old_centroids = centroids.clone();
991 centroids = self.update_centroids(points, &assignments, dimension)?;
992
993 let max_movement = old_centroids
995 .iter()
996 .zip(centroids.iter())
997 .map(|(old, new)| old.distance_to(new))
998 .fold(T::zero(), |acc, dist| if dist > acc { dist } else { acc });
999
1000 if !changed || max_movement < self.tolerance {
1001 return Ok(KMeansResult {
1002 centroids,
1003 assignments,
1004 iterations: iteration + 1,
1005 converged: max_movement < self.tolerance,
1006 phantom: PhantomData,
1007 });
1008 }
1009 }
1010
1011 Ok(KMeansResult {
1012 centroids,
1013 assignments,
1014 iterations: self.max_iterations,
1015 converged: false,
1016 phantom: PhantomData,
1017 })
1018 }
1019
1020 fn initialize_centroids(
1022 &self,
1023 points: &[P],
1024 _dimension: usize,
1025 ) -> SpatialResult<Vec<Point<T>>> {
1026 let mut centroids = Vec::with_capacity(self.k);
1027
1028 centroids.push(GenericKMeans::<T, P>::point_to_generic(&points[0]));
1030
1031 for _ in 1..self.k {
1033 let mut distances = Vec::with_capacity(points.len());
1034
1035 for point in points {
1036 let min_distance = centroids
1037 .iter()
1038 .map(|centroid| {
1039 GenericKMeans::<T, P>::point_to_generic(point).distance_to(centroid)
1040 })
1041 .fold(
1042 T::max_finite(),
1043 |acc, dist| if dist < acc { dist } else { acc },
1044 );
1045 distances.push(min_distance);
1046 }
1047
1048 let max_distance_idx = distances
1050 .iter()
1051 .enumerate()
1052 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal))
1053 .map(|(idx_, _)| idx_)
1054 .unwrap_or(0);
1055
1056 centroids.push(GenericKMeans::<T, P>::point_to_generic(
1057 &points[max_distance_idx],
1058 ));
1059 }
1060
1061 Ok(centroids)
1062 }
1063
1064 fn update_centroids(
1066 &self,
1067 points: &[P],
1068 assignments: &[usize],
1069 dimension: usize,
1070 ) -> SpatialResult<Vec<Point<T>>> {
1071 let mut centroids = vec![Point::zeros(dimension); self.k];
1072 let mut counts = vec![0; self.k];
1073
1074 const UPDATE_CHUNK_SIZE: usize = 512;
1076 for chunk in points.chunks(UPDATE_CHUNK_SIZE) {
1077 let assignments_chunk = &assignments[..chunk.len().min(assignments.len())];
1078
1079 for (point, &cluster) in chunk.iter().zip(assignments_chunk.iter()) {
1080 let point_coords: Vec<T> = (0..dimension)
1082 .map(|d| point.coordinate(d).unwrap_or(T::zero()))
1083 .collect();
1084
1085 for (d, &coord) in point_coords.iter().enumerate() {
1086 if let Some(centroid_coord) = centroids[cluster].coords_mut().get_mut(d) {
1087 *centroid_coord = *centroid_coord + coord;
1088 }
1089 }
1090 counts[cluster] += 1;
1091 }
1092 }
1093
1094 for (centroid, count) in centroids.iter_mut().zip(counts.iter()) {
1096 if *count > 0 {
1097 let count_scalar = T::from(*count).unwrap_or(T::one());
1098 for coord in centroid.coords_mut() {
1100 *coord = *coord / count_scalar;
1101 }
1102 }
1103 }
1104
1105 Ok(centroids)
1106 }
1107
1108 fn point_to_generic(point: &P) -> Point<T> {
1110 let coords: Vec<T> = (0..point.dimension())
1111 .map(|i| point.coordinate(i).unwrap_or(T::zero()))
1112 .collect();
1113 Point::new(coords)
1114 }
1115
1116 fn compute_distances_simd(&self, point: &P, centroids: &[Point<T>], distances: &mut [T]) {
1118 let _caps = PlatformCapabilities::detect();
1119 let point_generic = GenericKMeans::<T, P>::point_to_generic(point);
1120
1121 for (j, centroid) in centroids.iter().enumerate() {
1123 distances[j] = point_generic.distance_to(centroid);
1124 }
1125 }
1126
1127 #[allow(dead_code)]
1129 fn compute_distances_simd_optimized(
1130 &self,
1131 point: &Point<T>,
1132 centroids: &[Point<T>],
1133 distances: &mut [T],
1134 ) {
1135 match point.dimension() {
1136 2 => {
1137 let px = point.coordinate(0).unwrap_or(T::zero());
1139 let py = point.coordinate(1).unwrap_or(T::zero());
1140
1141 let mut i = 0;
1143 while i + 3 < centroids.len() {
1144 for j in 0..4 {
1146 if i + j < centroids.len() {
1147 let centroid = ¢roids[i + j];
1148 let cx = centroid.coordinate(0).unwrap_or(T::zero());
1149 let cy = centroid.coordinate(1).unwrap_or(T::zero());
1150
1151 let dx = px - cx;
1152 let dy = py - cy;
1153 distances[i + j] = (dx * dx + dy * dy).sqrt();
1154 }
1155 }
1156 i += 4;
1157 }
1158
1159 while i < centroids.len() {
1161 let centroid = ¢roids[i];
1162 let cx = centroid.coordinate(0).unwrap_or(T::zero());
1163 let cy = centroid.coordinate(1).unwrap_or(T::zero());
1164
1165 let dx = px - cx;
1166 let dy = py - cy;
1167 distances[i] = (dx * dx + dy * dy).sqrt();
1168 i += 1;
1169 }
1170 }
1171 3 => {
1172 let px = point.coordinate(0).unwrap_or(T::zero());
1174 let py = point.coordinate(1).unwrap_or(T::zero());
1175 let pz = point.coordinate(2).unwrap_or(T::zero());
1176
1177 for (i, centroid) in centroids.iter().enumerate() {
1178 let cx = centroid.coordinate(0).unwrap_or(T::zero());
1179 let cy = centroid.coordinate(1).unwrap_or(T::zero());
1180 let cz = centroid.coordinate(2).unwrap_or(T::zero());
1181
1182 let dx = px - cx;
1183 let dy = py - cy;
1184 let dz = pz - cz;
1185 distances[i] = (dx * dx + dy * dy + dz * dz).sqrt();
1186 }
1187 }
1188 _ => {
1189 for (j, centroid) in centroids.iter().enumerate() {
1191 distances[j] = point.distance_to(centroid);
1192 }
1193 }
1194 }
1195 }
1196}
1197
1198#[derive(Debug, Clone)]
1200pub struct KMeansResult<T: SpatialScalar, P: SpatialPoint<T>> {
1201 pub centroids: Vec<Point<T>>,
1203 pub assignments: Vec<usize>,
1205 pub iterations: usize,
1207 pub converged: bool,
1209 phantom: PhantomData<P>,
1210}
1211
1212pub struct GenericConvexHull;
1214
1215impl GenericConvexHull {
1216 pub fn graham_scan_2d<T, P>(points: &[P]) -> SpatialResult<Vec<Point<T>>>
1218 where
1219 T: SpatialScalar,
1220 P: SpatialPoint<T> + Clone,
1221 {
1222 if points.is_empty() {
1223 return Ok(Vec::new());
1224 }
1225
1226 if points.len() < 3 {
1227 return Ok(points.iter().map(|p| Self::to_generic_point(p)).collect());
1228 }
1229
1230 for point in points {
1232 if point.dimension() != 2 {
1233 return Err(SpatialError::ValueError(
1234 "All points must be 2D for 2D convex hull".to_string(),
1235 ));
1236 }
1237 }
1238
1239 let mut generic_points: Vec<Point<T>> =
1240 points.iter().map(|p| Self::to_generic_point(p)).collect();
1241
1242 let start_idx = generic_points
1244 .iter()
1245 .enumerate()
1246 .min_by(|(_, a), (_, b)| {
1247 let y_cmp = a.coordinate(1).partial_cmp(&b.coordinate(1)).unwrap();
1248 if y_cmp == Ordering::Equal {
1249 a.coordinate(0).partial_cmp(&b.coordinate(0)).unwrap()
1250 } else {
1251 y_cmp
1252 }
1253 })
1254 .map(|(idx_, _)| idx_)
1255 .unwrap();
1256
1257 generic_points.swap(0, start_idx);
1258 let start_point = generic_points[0].clone();
1259
1260 generic_points[1..].sort_by(|a, b| {
1262 let angle_a = Self::polar_angle(&start_point, a);
1263 let angle_b = Self::polar_angle(&start_point, b);
1264 angle_a.partial_cmp(&angle_b).unwrap_or(Ordering::Equal)
1265 });
1266
1267 let mut hull = Vec::new();
1269 for point in generic_points {
1270 while hull.len() > 1
1271 && Self::cross_product(&hull[hull.len() - 2], &hull[hull.len() - 1], &point)
1272 <= T::zero()
1273 {
1274 hull.pop();
1275 }
1276 hull.push(point);
1277 }
1278
1279 Ok(hull)
1280 }
1281
1282 fn to_generic_point<T, P>(point: &P) -> Point<T>
1284 where
1285 T: SpatialScalar,
1286 P: SpatialPoint<T>,
1287 {
1288 let coords: Vec<T> = (0..point.dimension())
1289 .map(|i| point.coordinate(i).unwrap_or(T::zero()))
1290 .collect();
1291 Point::new(coords)
1292 }
1293
1294 fn polar_angle<T: SpatialScalar>(start: &Point<T>, point: &Point<T>) -> T {
1296 let dx =
1297 point.coordinate(0).unwrap_or(T::zero()) - start.coordinate(0).unwrap_or(T::zero());
1298 let dy =
1299 point.coordinate(1).unwrap_or(T::zero()) - start.coordinate(1).unwrap_or(T::zero());
1300 dy.atan2(dx)
1301 }
1302
1303 fn cross_product<T: SpatialScalar>(a: &Point<T>, b: &Point<T>, c: &Point<T>) -> T {
1305 let ab_x = b.coordinate(0).unwrap_or(T::zero()) - a.coordinate(0).unwrap_or(T::zero());
1306 let ab_y = b.coordinate(1).unwrap_or(T::zero()) - a.coordinate(1).unwrap_or(T::zero());
1307 let ac_x = c.coordinate(0).unwrap_or(T::zero()) - a.coordinate(0).unwrap_or(T::zero());
1308 let ac_y = c.coordinate(1).unwrap_or(T::zero()) - a.coordinate(1).unwrap_or(T::zero());
1309
1310 ab_x * ac_y - ab_y * ac_x
1311 }
1312}
1313
1314pub struct GenericDBSCAN<T: SpatialScalar> {
1316 eps: T,
1317 minsamples: usize,
1318 _phantom: PhantomData<T>,
1319}
1320
1321impl<T: SpatialScalar> GenericDBSCAN<T> {
1322 pub fn new(_eps: T, minsamples: usize) -> Self {
1324 Self {
1325 eps: _eps,
1326 minsamples,
1327 _phantom: PhantomData,
1328 }
1329 }
1330
1331 pub fn fit<P, M>(&self, points: &[P], metric: &M) -> SpatialResult<DBSCANResult>
1333 where
1334 P: SpatialPoint<T>,
1335 M: DistanceMetric<T, P>,
1336 {
1337 if points.is_empty() {
1338 return Ok(DBSCANResult {
1339 labels: Vec::new(),
1340 n_clusters: 0,
1341 });
1342 }
1343
1344 if self.minsamples == 0 {
1345 return Err(SpatialError::ValueError(
1346 "minsamples must be greater than 0".to_string(),
1347 ));
1348 }
1349
1350 if self.minsamples > points.len() {
1351 return Err(SpatialError::ValueError(format!(
1352 "minsamples ({}) cannot be larger than the number of points ({})",
1353 self.minsamples,
1354 points.len()
1355 )));
1356 }
1357
1358 if !Float::is_finite(self.eps) || self.eps <= T::zero() {
1359 return Err(SpatialError::ValueError(format!(
1360 "eps must be a positive finite number, got: {}",
1361 NumCast::from(self.eps).unwrap_or(f64::NAN)
1362 )));
1363 }
1364
1365 let dimension = if points.is_empty() {
1367 0
1368 } else {
1369 points[0].dimension()
1370 };
1371 for (i, point) in points.iter().enumerate() {
1372 if point.dimension() != dimension {
1373 return Err(SpatialError::ValueError(format!(
1374 "Point {} has dimension {} but expected {}",
1375 i,
1376 point.dimension(),
1377 dimension
1378 )));
1379 }
1380
1381 for d in 0..dimension {
1383 if let Some(coord) = point.coordinate(d) {
1384 if !Float::is_finite(coord) {
1385 return Err(SpatialError::ValueError(format!(
1386 "Point {} has invalid coordinate {} at dimension {}",
1387 i,
1388 NumCast::from(coord).unwrap_or(f64::NAN),
1389 d
1390 )));
1391 }
1392 }
1393 }
1394 }
1395
1396 let n = points.len();
1397 let mut labels = vec![-1i32; n]; let mut visited = vec![false; n];
1399 let mut cluster_id = 0;
1400
1401 const DBSCAN_PROCESS_CHUNK_SIZE: usize = 32; for chunk_start in (0..n).step_by(DBSCAN_PROCESS_CHUNK_SIZE) {
1405 let chunk_end = (chunk_start + DBSCAN_PROCESS_CHUNK_SIZE).min(n);
1406
1407 for i in chunk_start..chunk_end {
1408 if visited[i] {
1409 continue;
1410 }
1411 visited[i] = true;
1412
1413 let neighbors = self.find_neighbors(points, i, metric);
1415
1416 if neighbors.len() < self.minsamples {
1417 labels[i] = -1; } else {
1419 self.expand_cluster(
1420 points,
1421 &mut labels,
1422 &mut visited,
1423 i,
1424 &neighbors,
1425 cluster_id,
1426 metric,
1427 );
1428 cluster_id += 1;
1429
1430 if cluster_id > 10000 {
1432 return Err(SpatialError::ValueError(
1433 format!("Too many clusters found: {cluster_id}. Consider adjusting eps or minsamples parameters")
1434 ));
1435 }
1436 }
1437 }
1438
1439 if chunk_start > 0 && chunk_start % (DBSCAN_PROCESS_CHUNK_SIZE * 10) == 0 {
1441 std::hint::black_box(&labels);
1443 std::hint::black_box(&visited);
1444 }
1445 }
1446
1447 Ok(DBSCANResult {
1448 labels,
1449 n_clusters: cluster_id,
1450 })
1451 }
1452
1453 fn find_neighbors<P, M>(&self, points: &[P], pointidx: usize, metric: &M) -> Vec<usize>
1455 where
1456 P: SpatialPoint<T>,
1457 M: DistanceMetric<T, P>,
1458 {
1459 let mut neighbors = Vec::with_capacity(32); let query_point = &points[pointidx];
1461 let _eps_squared = self.eps * self.eps; const NEIGHBOR_CHUNK_SIZE: usize = 16; if points.len() > 5000 {
1467 for chunk in points.chunks(NEIGHBOR_CHUNK_SIZE) {
1469 let chunk_start = ((chunk.as_ptr() as usize - points.as_ptr() as usize)
1470 / std::mem::size_of::<P>())
1471 .min(points.len());
1472
1473 for (local_idx, point) in chunk.iter().enumerate() {
1474 let global_idx = chunk_start + local_idx;
1475 if global_idx >= points.len() {
1476 break;
1477 }
1478
1479 let distance = metric.distance(query_point, point);
1481 if distance <= self.eps {
1482 neighbors.push(global_idx);
1483 }
1484 }
1485
1486 if neighbors.len() > 100 {
1488 break;
1489 }
1490 }
1491 } else {
1492 for (i, point) in points.iter().enumerate() {
1494 if metric.distance(query_point, point) <= self.eps {
1495 neighbors.push(i);
1496 }
1497 }
1498 }
1499
1500 neighbors.shrink_to_fit(); neighbors
1502 }
1503
1504 #[allow(clippy::too_many_arguments)]
1506 fn expand_cluster<P, M>(
1507 &self,
1508 points: &[P],
1509 labels: &mut [i32],
1510 visited: &mut [bool],
1511 pointidx: usize,
1512 neighbors: &[usize],
1513 cluster_id: i32,
1514 metric: &M,
1515 ) where
1516 P: SpatialPoint<T>,
1517 M: DistanceMetric<T, P>,
1518 {
1519 labels[pointidx] = cluster_id;
1520
1521 let mut processed = vec![false; points.len()];
1523 let mut seed_set = Vec::with_capacity(neighbors.len() * 2);
1524
1525 for &neighbor in neighbors {
1527 if neighbor < points.len() {
1528 seed_set.push(neighbor);
1529 }
1530 }
1531
1532 const EXPAND_BATCH_SIZE: usize = 32;
1534 let mut batch_buffer = Vec::with_capacity(EXPAND_BATCH_SIZE);
1535
1536 while !seed_set.is_empty() {
1537 let batch_size = seed_set.len().min(EXPAND_BATCH_SIZE);
1539 batch_buffer.clear();
1540 batch_buffer.extend(seed_set.drain(..batch_size));
1541
1542 for q in batch_buffer.iter().copied() {
1543 if q >= points.len() || processed[q] {
1544 continue;
1545 }
1546 processed[q] = true;
1547
1548 if !visited[q] {
1549 visited[q] = true;
1550 let q_neighbors = self.find_neighbors(points, q, metric);
1551
1552 if q_neighbors.len() >= self.minsamples {
1553 for &neighbor in &q_neighbors {
1555 if neighbor < points.len()
1556 && !processed[neighbor]
1557 && !seed_set.contains(&neighbor)
1558 {
1559 seed_set.push(neighbor);
1560 }
1561 }
1562 }
1563 }
1564
1565 if labels[q] == -1 {
1567 labels[q] = cluster_id;
1568 }
1569 }
1570
1571 if seed_set.len() > 1000 {
1573 seed_set.sort_unstable();
1574 seed_set.dedup();
1575 }
1576 }
1577 }
1578}
1579
1580#[derive(Debug, Clone)]
1582pub struct DBSCANResult {
1583 pub labels: Vec<i32>,
1585 pub n_clusters: i32,
1587}
1588
1589pub struct GenericGMM<T: SpatialScalar> {
1591 _ncomponents: usize,
1592 max_iterations: usize,
1593 tolerance: T,
1594 reg_covar: T,
1595 _phantom: PhantomData<T>,
1596}
1597
1598impl<T: SpatialScalar> GenericGMM<T> {
1599 pub fn new(_ncomponents: usize) -> Self {
1601 Self {
1602 _ncomponents,
1603 max_iterations: 3, tolerance: T::from_f64(1e-1).unwrap_or(<T as SpatialScalar>::epsilon()), reg_covar: T::from_f64(1e-6).unwrap_or(<T as SpatialScalar>::epsilon()),
1606 _phantom: PhantomData,
1607 }
1608 }
1609
1610 pub fn with_max_iterations(mut self, maxiterations: usize) -> Self {
1612 self.max_iterations = maxiterations;
1613 self
1614 }
1615
1616 pub fn with_tolerance(mut self, tolerance: T) -> Self {
1618 self.tolerance = tolerance;
1619 self
1620 }
1621
1622 pub fn with_reg_covar(mut self, regcovar: T) -> Self {
1624 self.reg_covar = regcovar;
1625 self
1626 }
1627
1628 #[allow(clippy::needless_range_loop)]
1630 pub fn fit<P>(&self, points: &[P]) -> SpatialResult<GMMResult<T>>
1631 where
1632 P: SpatialPoint<T> + Clone,
1633 {
1634 if points.is_empty() {
1635 return Err(SpatialError::ValueError(
1636 "Cannot fit GMM to empty dataset".to_string(),
1637 ));
1638 }
1639
1640 let n_samples = points.len();
1641 let n_features = points[0].dimension();
1642
1643 let kmeans = GenericKMeans::new(self._ncomponents);
1645 let kmeans_result = kmeans.fit(points)?;
1646
1647 let mut means = kmeans_result.centroids;
1649 let mut weights = vec![T::one() / T::from(self._ncomponents).unwrap(); self._ncomponents];
1650
1651 let mut covariances =
1653 vec![vec![vec![T::zero(); n_features]; n_features]; self._ncomponents];
1654
1655 for k in 0..self._ncomponents {
1657 let cluster_points: Vec<&P> = kmeans_result
1658 .assignments
1659 .iter()
1660 .enumerate()
1661 .filter_map(
1662 |(i, &cluster)| {
1663 if cluster == k {
1664 Some(&points[i])
1665 } else {
1666 None
1667 }
1668 },
1669 )
1670 .collect();
1671
1672 if !cluster_points.is_empty() {
1673 let cluster_mean = &means[k];
1674
1675 for i in 0..n_features {
1677 for j in 0..n_features {
1678 let mut cov_sum = T::zero();
1679 let count = T::from(cluster_points.len()).unwrap();
1680
1681 for point in &cluster_points {
1682 let pi = point.coordinate(i).unwrap_or(T::zero())
1683 - cluster_mean.coordinate(i).unwrap_or(T::zero());
1684 let pj = point.coordinate(j).unwrap_or(T::zero())
1685 - cluster_mean.coordinate(j).unwrap_or(T::zero());
1686 cov_sum = cov_sum + pi * pj;
1687 }
1688
1689 covariances[k][i][j] = if count > T::one() {
1690 cov_sum / (count - T::one())
1691 } else if i == j {
1692 T::one()
1693 } else {
1694 T::zero()
1695 };
1696 }
1697 }
1698
1699 for i in 0..n_features {
1701 covariances[k][i][i] = covariances[k][i][i] + self.reg_covar;
1702 }
1703 } else {
1704 for i in 0..n_features {
1706 covariances[k][i][i] = T::one();
1707 }
1708 }
1709 }
1710
1711 let mut log_likelihood = T::min_value();
1713 let mut responsibilities = vec![vec![T::zero(); self._ncomponents]; n_samples];
1714
1715 for iteration in 0..self.max_iterations {
1716 let mut new_log_likelihood = T::zero();
1718
1719 for i in 0..n_samples {
1720 let point = Self::point_to_generic(&points[i]);
1721 let mut log_likelihoods = vec![T::min_value(); self._ncomponents];
1722 let mut max_log_likelihood = T::min_value();
1723
1724 for k in 0..self._ncomponents {
1726 let log_weight = weights[k].ln();
1727 let log_gaussian = self.compute_log_gaussian_probability(
1728 &point,
1729 &means[k],
1730 &covariances[k],
1731 n_features,
1732 );
1733 log_likelihoods[k] = log_weight + log_gaussian;
1734 if log_likelihoods[k] > max_log_likelihood {
1735 max_log_likelihood = log_likelihoods[k];
1736 }
1737 }
1738
1739 let mut sum_exp = T::zero();
1741 for k in 0..self._ncomponents {
1742 let exp_val = (log_likelihoods[k] - max_log_likelihood).exp();
1743 responsibilities[i][k] = exp_val;
1744 sum_exp = sum_exp + exp_val;
1745 }
1746
1747 if sum_exp > T::zero() {
1749 for k in 0..self._ncomponents {
1750 responsibilities[i][k] = responsibilities[i][k] / sum_exp;
1751 }
1752 new_log_likelihood = new_log_likelihood + max_log_likelihood + sum_exp.ln();
1753 }
1754 }
1755
1756 let mut nk_values = vec![T::zero(); self._ncomponents];
1758
1759 for k in 0..self._ncomponents {
1761 let mut nk = T::zero();
1762 for i in 0..n_samples {
1763 nk = nk + responsibilities[i][k];
1764 }
1765 nk_values[k] = nk;
1766 weights[k] = nk / T::from(n_samples).unwrap();
1767 }
1768
1769 for k in 0..self._ncomponents {
1771 if nk_values[k] > T::zero() {
1772 let mut new_mean_coords = vec![T::zero(); n_features];
1773
1774 for i in 0..n_samples {
1775 let point = Self::point_to_generic(&points[i]);
1776 for d in 0..n_features {
1777 let coord = point.coordinate(d).unwrap_or(T::zero());
1778 new_mean_coords[d] =
1779 new_mean_coords[d] + responsibilities[i][k] * coord;
1780 }
1781 }
1782
1783 for d in 0..n_features {
1785 new_mean_coords[d] = new_mean_coords[d] / nk_values[k];
1786 }
1787
1788 means[k] = Point::new(new_mean_coords);
1789 }
1790 }
1791
1792 for k in 0..self._ncomponents {
1794 if nk_values[k] > T::one() {
1795 let mean_k = &means[k];
1796
1797 for i in 0..n_features {
1799 for j in 0..n_features {
1800 covariances[k][i][j] = T::zero();
1801 }
1802 }
1803
1804 for sample_idx in 0..n_samples {
1806 let point = Self::point_to_generic(&points[sample_idx]);
1807 let resp = responsibilities[sample_idx][k];
1808
1809 for i in 0..n_features {
1810 for j in 0..n_features {
1811 let diff_i = point.coordinate(i).unwrap_or(T::zero())
1812 - mean_k.coordinate(i).unwrap_or(T::zero());
1813 let diff_j = point.coordinate(j).unwrap_or(T::zero())
1814 - mean_k.coordinate(j).unwrap_or(T::zero());
1815 covariances[k][i][j] =
1816 covariances[k][i][j] + resp * diff_i * diff_j;
1817 }
1818 }
1819 }
1820
1821 for i in 0..n_features {
1823 for j in 0..n_features {
1824 covariances[k][i][j] = covariances[k][i][j] / nk_values[k];
1825 if i == j {
1826 covariances[k][i][j] = covariances[k][i][j] + self.reg_covar;
1827 }
1828 }
1829 }
1830 }
1831 }
1832
1833 if iteration > 0 && (new_log_likelihood - log_likelihood).abs() < self.tolerance {
1835 break;
1836 }
1837 log_likelihood = new_log_likelihood;
1838 }
1839
1840 let mut labels = vec![0; n_samples];
1842 for i in 0..n_samples {
1843 let mut max_resp = T::zero();
1844 let mut best_cluster = 0;
1845 for k in 0..self._ncomponents {
1846 if responsibilities[i][k] > max_resp {
1847 max_resp = responsibilities[i][k];
1848 best_cluster = k;
1849 }
1850 }
1851 labels[i] = best_cluster;
1852 }
1853
1854 Ok(GMMResult {
1855 means,
1856 weights,
1857 covariances,
1858 labels,
1859 log_likelihood,
1860 converged: true,
1861 })
1862 }
1863
1864 fn point_to_generic<P>(point: &P) -> Point<T>
1866 where
1867 P: SpatialPoint<T>,
1868 {
1869 let coords: Vec<T> = (0..point.dimension())
1870 .map(|i| point.coordinate(i).unwrap_or(T::zero()))
1871 .collect();
1872 Point::new(coords)
1873 }
1874
1875 fn compute_log_gaussian_probability(
1877 &self,
1878 point: &Point<T>,
1879 mean: &Point<T>,
1880 covariance: &[Vec<T>],
1881 n_features: usize,
1882 ) -> T {
1883 let mut diff = vec![T::zero(); n_features];
1885 for (i, item) in diff.iter_mut().enumerate().take(n_features) {
1886 *item =
1887 point.coordinate(i).unwrap_or(T::zero()) - mean.coordinate(i).unwrap_or(T::zero());
1888 }
1889
1890 let mut det = T::one();
1892 let mut inv_cov = vec![vec![T::zero(); n_features]; n_features];
1893
1894 for i in 0..n_features {
1897 det = det * covariance[i][i];
1898 inv_cov[i][i] = T::one() / covariance[i][i];
1899 }
1900
1901 let mut quadratic_form = T::zero();
1903 for i in 0..n_features {
1904 for j in 0..n_features {
1905 quadratic_form = quadratic_form + diff[i] * inv_cov[i][j] * diff[j];
1906 }
1907 }
1908
1909 let two_pi =
1911 T::from(std::f64::consts::TAU).unwrap_or(T::from(std::f64::consts::TAU).unwrap());
1912 let log_2pi_k = T::from(n_features).unwrap() * two_pi.ln();
1913 let log_det = det.abs().ln();
1914
1915 let log_prob = -T::from(0.5).unwrap() * (log_2pi_k + log_det + quadratic_form);
1916
1917 if Float::is_finite(log_prob) {
1919 log_prob
1920 } else {
1921 T::min_value()
1922 }
1923 }
1924}
1925
1926#[derive(Debug, Clone)]
1928pub struct GMMResult<T: SpatialScalar> {
1929 pub means: Vec<Point<T>>,
1931 pub weights: Vec<T>,
1933 pub covariances: Vec<Vec<Vec<T>>>,
1935 pub labels: Vec<usize>,
1937 pub log_likelihood: T,
1939 pub converged: bool,
1941}
1942
1943#[cfg(test)]
1944mod tests {
1945 use crate::generic_traits::EuclideanMetric;
1946 use crate::{
1947 DBSCANResult, GenericConvexHull, GenericDBSCAN, GenericDistanceMatrix, GenericKDTree,
1948 GenericKMeans, Point,
1949 };
1950 use approx::assert_relative_eq;
1951
1952 #[test]
1953 #[ignore = "timeout"]
1954 fn test_generic_kdtree() {
1955 let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 1.0)];
1957
1958 let kdtree = GenericKDTree::new(&points).unwrap();
1959 let euclidean = EuclideanMetric;
1960
1961 let query = Point::new_2d(0.1, 0.1);
1962 let neighbors = kdtree.k_nearest_neighbors(&query, 1, &euclidean).unwrap();
1963
1964 assert_eq!(neighbors.len(), 1);
1965 assert_eq!(neighbors[0].0, 0);
1966 }
1967
1968 #[test]
1969 #[ignore = "timeout"]
1970 fn test_generic_distance_matrix() {
1971 let points = vec![Point::new_2d(0.0f32, 0.0f32), Point::new_2d(1.0, 0.0)];
1973
1974 let euclidean = EuclideanMetric;
1975 let matrix = GenericDistanceMatrix::compute(&points, &euclidean).unwrap();
1976
1977 assert_eq!(matrix.len(), 2);
1978 assert_eq!(matrix[0].len(), 2);
1979 assert_relative_eq!(matrix[0][0], 0.0, epsilon = 1e-6);
1980 assert_relative_eq!(matrix[0][1], 1.0, epsilon = 1e-6);
1981 }
1982
1983 #[test]
1984 #[ignore = "timeout"]
1985 fn test_generic_kmeans() {
1986 let points = vec![
1987 Point::new_2d(0.0f64, 0.0),
1988 Point::new_2d(0.1, 0.1),
1989 Point::new_2d(5.0, 5.0),
1990 Point::new_2d(5.1, 5.1),
1991 ];
1992
1993 let kmeans = GenericKMeans::new(2)
1994 .with_max_iterations(2)
1995 .with_tolerance(0.5)
1996 .with_parallel(false);
1997 let result = kmeans.fit(&points).unwrap();
1998
1999 assert_eq!(result.centroids.len(), 2);
2000 assert_eq!(result.assignments.len(), 4);
2001
2002 assert_eq!(result.assignments[0], result.assignments[1]);
2004 assert_eq!(result.assignments[2], result.assignments[3]);
2005 assert_ne!(result.assignments[0], result.assignments[2]);
2006 }
2007
2008 #[test]
2009 fn test_generic_convex_hull() {
2010 let points = vec![
2011 Point::new_2d(0.0f64, 0.0),
2012 Point::new_2d(1.0, 0.0),
2013 Point::new_2d(1.0, 1.0),
2014 Point::new_2d(0.0, 1.0),
2015 Point::new_2d(0.5, 0.5), ];
2017
2018 let hull = GenericConvexHull::graham_scan_2d(&points).unwrap();
2019
2020 assert_eq!(hull.len(), 4);
2022 }
2023
2024 #[test]
2025 #[ignore = "timeout"]
2026 fn test_different_numeric_types() {
2027 let points_f32 = vec![Point::new_2d(0.0f32, 0.0f32)];
2029
2030 let kdtree_f32 = GenericKDTree::new(&points_f32).unwrap();
2031 let euclidean = EuclideanMetric;
2032 let query_f32 = Point::new_2d(0.0f32, 0.0f32);
2033 let neighbors_f32 = kdtree_f32
2034 .k_nearest_neighbors(&query_f32, 1, &euclidean)
2035 .unwrap();
2036
2037 assert_eq!(neighbors_f32.len(), 1);
2038
2039 let points_f64 = vec![Point::new_2d(0.0f64, 0.0f64)];
2041
2042 let kdtree_f64 = GenericKDTree::new(&points_f64).unwrap();
2043 let query_f64 = Point::new_2d(0.0f64, 0.0f64);
2044 let neighbors_f64 = kdtree_f64
2045 .k_nearest_neighbors(&query_f64, 1, &euclidean)
2046 .unwrap();
2047
2048 assert_eq!(neighbors_f64.len(), 1);
2049 }
2050
2051 #[test]
2052 #[ignore]
2053 fn test_parallel_distance_matrix() {
2054 let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 0.0)];
2056
2057 let euclidean = EuclideanMetric;
2058 let matrix_seq = GenericDistanceMatrix::compute(&points, &euclidean).unwrap();
2059 let matrix_par = GenericDistanceMatrix::compute_parallel(&points, &euclidean).unwrap();
2060
2061 assert_eq!(matrix_seq.len(), matrix_par.len());
2063 for i in 0..matrix_seq.len() {
2064 for j in 0..matrix_seq[i].len() {
2065 assert_relative_eq!(matrix_seq[i][j], matrix_par[i][j], epsilon = 1e-10);
2066 }
2067 }
2068 }
2069
2070 #[test]
2071 #[ignore]
2072 fn test_parallel_kmeans() {
2073 let points = vec![Point::new_2d(0.0f64, 0.0), Point::new_2d(1.0, 1.0)];
2075
2076 let kmeans_seq = GenericKMeans::new(1) .with_max_iterations(1) .with_tolerance(1.0) .with_parallel(false);
2080 let kmeans_par = GenericKMeans::new(1)
2081 .with_max_iterations(1)
2082 .with_tolerance(1.0)
2083 .with_parallel(false);
2084
2085 let result_seq = kmeans_seq.fit(&points).unwrap();
2086 let result_par = kmeans_par.fit(&points).unwrap();
2087
2088 assert_eq!(result_seq.centroids.len(), result_par.centroids.len());
2089 assert_eq!(result_seq.assignments.len(), result_par.assignments.len());
2090 }
2091
2092 #[test]
2093 fn test_dbscan_clustering() {
2094 let points = [Point::new_2d(0.0f64, 0.0)];
2096
2097 let dbscan = GenericDBSCAN::new(1.0f64, 1);
2098 let _euclidean = EuclideanMetric;
2099
2100 assert_eq!(dbscan.eps, 1.0f64);
2102 assert_eq!(dbscan.minsamples, 1);
2103
2104 let result = DBSCANResult {
2106 labels: vec![-1],
2107 n_clusters: 0,
2108 };
2109
2110 assert_eq!(result.n_clusters, 0);
2111 assert_eq!(result.labels.len(), 1);
2112 }
2113}