scirs2_spatial/
simd_distance.rs

1//! SIMD-accelerated distance calculations for spatial operations
2//!
3//! This module provides high-performance implementations of distance calculations
4//! using the unified SIMD operations from scirs2-core. These optimized
5//! functions can provide significant performance improvements for operations
6//! involving large datasets.
7//!
8//! # Features
9//!
10//! - **SIMD-accelerated distance metrics**: Euclidean, Manhattan, Minkowski
11//! - **Parallel distance matrix computation**: Multi-threaded pdist and cdist
12//! - **Batch nearest neighbor queries**: Optimized for large point sets
13//! - **Memory-efficient operations**: Minimized memory allocation and copying
14//!
15//! # Architecture Support
16//!
17//! The SIMD operations are automatically detected and optimized by scirs2-core
18//! based on the available hardware capabilities.
19//!
20//! # Examples
21//!
22//! ```
23//! use scirs2_spatial::simd_distance::{simd_euclidean_distance_batch, parallel_pdist};
24//! use ndarray::array;
25//!
26//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
27//! // SIMD batch distance calculation
28//! let points1 = array![[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]];
29//! let points2 = array![[1.0, 0.0], [2.0, 1.0], [3.0, 2.0]];
30//!
31//! let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view())?;
32//! println!("Batch distances: {:?}", distances);
33//!
34//! // Parallel distance matrix
35//! let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
36//! let dist_matrix = parallel_pdist(&points.view(), "euclidean")?;
37//! println!("Distance matrix shape: {:?}", dist_matrix.shape());
38//! # Ok(())
39//! # }
40//! ```
41
42use crate::error::{SpatialError, SpatialResult};
43use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
44use scirs2_core::parallel_ops::*;
45use scirs2_core::simd_ops::SimdUnifiedOps;
46
47/// Supported distance metrics for SIMD operations
48#[derive(Debug, Clone, Copy)]
49pub enum SimdMetric {
50    /// Euclidean distance (L2 norm)
51    Euclidean,
52    /// Manhattan distance (L1 norm)  
53    Manhattan,
54    /// Squared Euclidean distance (no square root)
55    SquaredEuclidean,
56    /// Chebyshev distance (Lāˆž norm)
57    Chebyshev,
58}
59
60impl SimdMetric {
61    /// Get the string name of the metric
62    ///
63    /// # Returns
64    ///
65    /// * A string slice containing the name of the metric
66    ///
67    /// # Examples
68    ///
69    /// ```
70    /// use scirs2_spatial::simd_distance::SimdMetric;
71    ///
72    /// let metric = SimdMetric::Euclidean;
73    /// assert_eq!(metric.name(), "euclidean");
74    ///
75    /// let metric = SimdMetric::Manhattan;
76    /// assert_eq!(metric.name(), "manhattan");
77    /// ```
78    pub fn name(&self) -> &'static str {
79        match self {
80            SimdMetric::Euclidean => "euclidean",
81            SimdMetric::Manhattan => "manhattan",
82            SimdMetric::SquaredEuclidean => "sqeuclidean",
83            SimdMetric::Chebyshev => "chebyshev",
84        }
85    }
86}
87
88/// SIMD-accelerated Euclidean distance between two points
89///
90/// # Arguments
91/// * `a` - First point
92/// * `b` - Second point
93///
94/// # Returns
95/// * Euclidean distance between the points
96#[allow(dead_code)]
97pub fn simd_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
98    if a.len() != b.len() {
99        return Err(SpatialError::ValueError(
100            "Points must have the same dimension".to_string(),
101        ));
102    }
103
104    // Convert slices to ArrayView for core ops
105    let a_view = ArrayView1::from(a);
106    let b_view = ArrayView1::from(b);
107
108    // Compute squared distance using SIMD operations
109    let diff = f64::simd_sub(&a_view, &b_view);
110    let squared = f64::simd_mul(&diff.view(), &diff.view());
111    let sum = f64::simd_sum(&squared.view());
112    Ok(sum.sqrt())
113}
114
115/// SIMD-accelerated squared Euclidean distance between two points
116#[allow(dead_code)]
117pub fn simd_squared_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
118    if a.len() != b.len() {
119        return Err(SpatialError::ValueError(
120            "Points must have the same dimension".to_string(),
121        ));
122    }
123
124    let a_view = ArrayView1::from(a);
125    let b_view = ArrayView1::from(b);
126
127    // Compute squared distance using SIMD operations
128    let diff = f64::simd_sub(&a_view, &b_view);
129    let squared = f64::simd_mul(&diff.view(), &diff.view());
130    Ok(f64::simd_sum(&squared.view()))
131}
132
133/// SIMD-accelerated Manhattan distance between two points
134#[allow(dead_code)]
135pub fn simd_manhattan_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
136    if a.len() != b.len() {
137        return Err(SpatialError::ValueError(
138            "Points must have the same dimension".to_string(),
139        ));
140    }
141
142    let a_view = ArrayView1::from(a);
143    let b_view = ArrayView1::from(b);
144
145    // Compute Manhattan distance using SIMD operations
146    let diff = f64::simd_sub(&a_view, &b_view);
147    let abs_diff = f64::simd_abs(&diff.view());
148    Ok(f64::simd_sum(&abs_diff.view()))
149}
150
151/// SIMD-accelerated Chebyshev distance
152#[allow(dead_code)]
153pub fn simd_chebyshev_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
154    if a.len() != b.len() {
155        return Err(SpatialError::ValueError(
156            "Points must have the same dimension".to_string(),
157        ));
158    }
159
160    let a_view = ArrayView1::from(a);
161    let b_view = ArrayView1::from(b);
162
163    // Compute Chebyshev distance using SIMD operations
164    let diff = f64::simd_sub(&a_view, &b_view);
165    let abs_diff = f64::simd_abs(&diff.view());
166    Ok(f64::simd_max_element(&abs_diff.view()))
167}
168
169/// Batch SIMD-accelerated Euclidean distance calculation
170///
171/// Computes distances between corresponding rows of two matrices
172///
173/// # Arguments
174/// * `points1` - First set of points, shape (n, d)
175/// * `points2` - Second set of points, shape (n, d)
176///
177/// # Returns
178/// * Array of distances, shape (n,)
179#[allow(dead_code)]
180pub fn simd_euclidean_distance_batch(
181    points1: &ArrayView2<'_, f64>,
182    points2: &ArrayView2<'_, f64>,
183) -> SpatialResult<Array1<f64>> {
184    if points1.shape() != points2.shape() {
185        return Err(SpatialError::ValueError(
186            "Point arrays must have the same shape".to_string(),
187        ));
188    }
189
190    let n_points = points1.nrows();
191
192    // Use parallel computation with SIMD operations
193    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_points)
194        .into_par_iter()
195        .map(|i| -> SpatialResult<f64> {
196            let p1 = points1.row(i);
197            let p2 = points2.row(i);
198            let diff = f64::simd_sub(&p1, &p2);
199            let squared = f64::simd_mul(&diff.view(), &diff.view());
200            let sum = f64::simd_sum(&squared.view());
201            Ok(sum.sqrt())
202        })
203        .collect();
204
205    Ok(Array1::from(distances_vec?))
206}
207
208/// Parallel computation of pairwise distance matrix
209///
210/// # Arguments
211/// * `points` - Array of points, shape (n, d)
212/// * `metric` - Distance metric to use
213///
214/// # Returns
215/// * Condensed distance matrix, shape (n*(n-1)/2,)
216#[allow(dead_code)]
217pub fn parallel_pdist(points: &ArrayView2<'_, f64>, metric: &str) -> SpatialResult<Array1<f64>> {
218    use scirs2_core::parallel_ops::ParallelIterator;
219    let n_points = points.nrows();
220    if n_points < 2 {
221        return Err(SpatialError::ValueError(
222            "Need at least 2 _points for distance computation".to_string(),
223        ));
224    }
225
226    let n_distances = n_points * (n_points - 1) / 2;
227
228    let metric_enum = match metric {
229        "euclidean" => SimdMetric::Euclidean,
230        "manhattan" => SimdMetric::Manhattan,
231        "sqeuclidean" => SimdMetric::SquaredEuclidean,
232        "chebyshev" => SimdMetric::Chebyshev,
233        _ => {
234            return Err(SpatialError::ValueError(format!(
235                "Unsupported metric: {metric}"
236            )))
237        }
238    };
239
240    // Parallel computation of distance pairs
241    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_distances)
242        .into_par_iter()
243        .map(|idx| -> SpatialResult<f64> {
244            // Convert linear index to (i, j) pair
245            let (i, j) = linear_to_condensed_indices(idx, n_points);
246
247            let p1 = points.row(i);
248            let p2 = points.row(j);
249
250            match metric_enum {
251                SimdMetric::Euclidean => {
252                    let diff = f64::simd_sub(&p1, &p2);
253                    let squared = f64::simd_mul(&diff.view(), &diff.view());
254                    let sum = f64::simd_sum(&squared.view());
255                    Ok(sum.sqrt())
256                }
257                SimdMetric::Manhattan => {
258                    let diff = f64::simd_sub(&p1, &p2);
259                    let abs_diff = f64::simd_abs(&diff.view());
260                    Ok(f64::simd_sum(&abs_diff.view()))
261                }
262                SimdMetric::SquaredEuclidean => {
263                    let diff = f64::simd_sub(&p1, &p2);
264                    let squared = f64::simd_mul(&diff.view(), &diff.view());
265                    Ok(f64::simd_sum(&squared.view()))
266                }
267                SimdMetric::Chebyshev => {
268                    let diff = f64::simd_sub(&p1, &p2);
269                    let abs_diff = f64::simd_abs(&diff.view());
270                    Ok(f64::simd_max_element(&abs_diff.view()))
271                }
272            }
273        })
274        .collect();
275
276    Ok(Array1::from(distances_vec?))
277}
278
279/// Parallel computation of cross-distance matrix
280///
281/// # Arguments
282/// * `points1` - First set of points, shape (n, d)
283/// * `points2` - Second set of points, shape (m, d)
284/// * `metric` - Distance metric to use
285///
286/// # Returns
287/// * Distance matrix, shape (n, m)
288#[allow(dead_code)]
289pub fn parallel_cdist(
290    points1: &ArrayView2<'_, f64>,
291    points2: &ArrayView2<'_, f64>,
292    metric: &str,
293) -> SpatialResult<Array2<f64>> {
294    use scirs2_core::parallel_ops::ParallelIterator;
295    if points1.ncols() != points2.ncols() {
296        return Err(SpatialError::ValueError(
297            "Point arrays must have the same number of dimensions".to_string(),
298        ));
299    }
300
301    let n1 = points1.nrows();
302    let n2 = points2.nrows();
303    let mut distances = Array2::zeros((n1, n2));
304
305    let metric_enum = match metric {
306        "euclidean" => SimdMetric::Euclidean,
307        "manhattan" => SimdMetric::Manhattan,
308        "sqeuclidean" => SimdMetric::SquaredEuclidean,
309        "chebyshev" => SimdMetric::Chebyshev,
310        _ => {
311            return Err(SpatialError::ValueError(format!(
312                "Unsupported metric: {metric}"
313            )))
314        }
315    };
316
317    // Parallel computation over rows of first matrix
318    distances
319        .outer_iter_mut()
320        .enumerate()
321        .par_bridge()
322        .try_for_each(|(i, mut row)| -> SpatialResult<()> {
323            let p1 = points1.row(i);
324
325            for (j, dist) in row.iter_mut().enumerate() {
326                let p2 = points2.row(j);
327
328                *dist = match metric_enum {
329                    SimdMetric::Euclidean => {
330                        let diff = f64::simd_sub(&p1, &p2);
331                        let squared = f64::simd_mul(&diff.view(), &diff.view());
332                        let sum = f64::simd_sum(&squared.view());
333                        sum.sqrt()
334                    }
335                    SimdMetric::Manhattan => {
336                        let diff = f64::simd_sub(&p1, &p2);
337                        let abs_diff = f64::simd_abs(&diff.view());
338                        f64::simd_sum(&abs_diff.view())
339                    }
340                    SimdMetric::SquaredEuclidean => {
341                        let diff = f64::simd_sub(&p1, &p2);
342                        let squared = f64::simd_mul(&diff.view(), &diff.view());
343                        f64::simd_sum(&squared.view())
344                    }
345                    SimdMetric::Chebyshev => {
346                        let diff = f64::simd_sub(&p1, &p2);
347                        let abs_diff = f64::simd_abs(&diff.view());
348                        f64::simd_max_element(&abs_diff.view())
349                    }
350                };
351            }
352            Ok(())
353        })?;
354
355    Ok(distances)
356}
357
358/// SIMD-accelerated k-nearest neighbors search
359///
360/// # Arguments
361/// * `query_points` - Points to query, shape (n_queries, d)
362/// * `data_points` - Data points to search, shape (n_data, d)
363/// * `k` - Number of nearest neighbors
364/// * `metric` - Distance metric to use
365///
366/// # Returns
367/// * (indices, distances) where both have shape (n_queries, k)
368#[allow(dead_code)]
369pub fn simd_knn_search(
370    query_points: &ArrayView2<'_, f64>,
371    data_points: &ArrayView2<'_, f64>,
372    k: usize,
373    metric: &str,
374) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
375    if query_points.ncols() != data_points.ncols() {
376        return Err(SpatialError::ValueError(
377            "Query and data _points must have the same dimension".to_string(),
378        ));
379    }
380
381    if k > data_points.nrows() {
382        return Err(SpatialError::ValueError(
383            "k cannot be larger than the number of data _points".to_string(),
384        ));
385    }
386
387    let n_queries = query_points.nrows();
388    let n_data = data_points.nrows();
389
390    let mut indices = Array2::zeros((n_queries, k));
391    let mut distances = Array2::zeros((n_queries, k));
392
393    let metric_enum = match metric {
394        "euclidean" => SimdMetric::Euclidean,
395        "manhattan" => SimdMetric::Manhattan,
396        "sqeuclidean" => SimdMetric::SquaredEuclidean,
397        "chebyshev" => SimdMetric::Chebyshev,
398        _ => {
399            return Err(SpatialError::ValueError(format!(
400                "Unsupported metric: {metric}"
401            )))
402        }
403    };
404
405    // Process queries in parallel
406    indices
407        .outer_iter_mut()
408        .zip(distances.outer_iter_mut())
409        .enumerate()
410        .par_bridge()
411        .try_for_each(
412            |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
413                let query_point = query_points.row(query_idx);
414
415                // Compute all distances for this query using SIMD
416                let mut all_distances: Vec<(f64, usize)> = (0..n_data)
417                    .map(|data_idx| {
418                        let data_point = data_points.row(data_idx);
419                        let dist = match metric_enum {
420                            SimdMetric::Euclidean => {
421                                let diff = f64::simd_sub(&query_point, &data_point);
422                                let squared = f64::simd_mul(&diff.view(), &diff.view());
423                                let sum = f64::simd_sum(&squared.view());
424                                sum.sqrt()
425                            }
426                            SimdMetric::Manhattan => {
427                                let diff = f64::simd_sub(&query_point, &data_point);
428                                let abs_diff = f64::simd_abs(&diff.view());
429                                f64::simd_sum(&abs_diff.view())
430                            }
431                            SimdMetric::SquaredEuclidean => {
432                                let diff = f64::simd_sub(&query_point, &data_point);
433                                let squared = f64::simd_mul(&diff.view(), &diff.view());
434                                f64::simd_sum(&squared.view())
435                            }
436                            SimdMetric::Chebyshev => {
437                                let diff = f64::simd_sub(&query_point, &data_point);
438                                let abs_diff = f64::simd_abs(&diff.view());
439                                f64::simd_max_element(&abs_diff.view())
440                            }
441                        };
442                        (dist, data_idx)
443                    })
444                    .collect();
445
446                // Partial sort to get k smallest
447                all_distances.select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
448                all_distances[..k].sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
449
450                // Fill result arrays
451                for (i, (dist, idx)) in all_distances[..k].iter().enumerate() {
452                    dist_row[i] = *dist;
453                    idx_row[i] = *idx;
454                }
455
456                Ok(())
457            },
458        )?;
459
460    Ok((indices, distances))
461}
462
463/// Convert linear index to (i, j) indices for condensed distance matrix
464#[allow(dead_code)]
465fn linear_to_condensed_indices(_linearidx: usize, n: usize) -> (usize, usize) {
466    // For condensed distance matrix where entry (i,j) with i < j is stored
467    // at position (n-1-i)*(n-i)/2 + (j-i-1)
468    let mut k = _linearidx;
469    let mut i = 0;
470
471    while k >= n - i - 1 {
472        k -= n - i - 1;
473        i += 1;
474    }
475
476    let j = k + i + 1;
477    (i, j)
478}
479
480/// Advanced-optimized SIMD-accelerated clustering algorithms
481pub mod advanced_simd_clustering {
482    use crate::error::{SpatialError, SpatialResult};
483    use ndarray::{Array1, Array2, ArrayView2};
484    use scirs2_core::simd_ops::SimdUnifiedOps;
485
486    /// Advanced-optimized SIMD K-means implementation with vectorized operations
487    pub struct AdvancedSimdKMeans {
488        k: usize,
489        max_iterations: usize,
490        tolerance: f64,
491        use_mixed_precision: bool,
492        block_size: usize,
493    }
494
495    impl AdvancedSimdKMeans {
496        /// Create a new advanced-optimized SIMD K-means clusterer
497        pub fn new(k: usize) -> Self {
498            Self {
499                k,
500                max_iterations: 100,
501                tolerance: 1e-6,
502                use_mixed_precision: true,
503                block_size: 256, // Optimized for cache lines
504            }
505        }
506
507        /// Configure mixed precision (f32 for speed where possible)
508        pub fn with_mixed_precision(mut self, use_mixedprecision: bool) -> Self {
509            self.use_mixed_precision = use_mixedprecision;
510            self
511        }
512
513        /// Set block size for cache-optimized processing
514        pub fn with_block_size(mut self, blocksize: usize) -> Self {
515            self.block_size = blocksize;
516            self
517        }
518
519        /// Advanced-optimized SIMD K-means clustering
520        pub fn fit(
521            &self,
522            points: &ArrayView2<'_, f64>,
523        ) -> SpatialResult<(Array2<f64>, Array1<usize>)> {
524            let n_points = points.nrows();
525            let n_dims = points.ncols();
526
527            if n_points == 0 {
528                return Err(SpatialError::ValueError(
529                    "Cannot cluster empty dataset".to_string(),
530                ));
531            }
532
533            if self.k > n_points {
534                return Err(SpatialError::ValueError(format!(
535                    "k ({}) cannot be larger than number of points ({})",
536                    self.k, n_points
537                )));
538            }
539
540            // Initialize centroids using k-means++ with SIMD acceleration
541            let mut centroids = self.initialize_centroids_simd(points)?;
542            let mut assignments = Array1::zeros(n_points);
543            let mut prev_assignments = Array1::from_elem(n_points, usize::MAX);
544
545            // Pre-allocate memory-aligned buffers for SIMD operations
546            let mut distance_buffer = Array2::zeros((self.block_size, self.k));
547            let mut centroid_sums = Array2::zeros((self.k, n_dims));
548            let mut centroid_counts = Array1::zeros(self.k);
549
550            for iteration in 0..self.max_iterations {
551                // Phase 1: Vectorized assignment phase with block processing
552                self.assign_points_vectorized(
553                    points,
554                    &centroids.view(),
555                    &mut assignments.view_mut(),
556                    &mut distance_buffer.view_mut(),
557                )?;
558
559                // Check for convergence using SIMD comparison
560                if self.check_convergence_simd(&assignments.view(), &prev_assignments.view()) {
561                    break;
562                }
563                prev_assignments.assign(&assignments);
564
565                // Phase 2: Vectorized centroid update with FMA operations
566                self.update_centroids_vectorized(
567                    points,
568                    &assignments.view(),
569                    &mut centroids.view_mut(),
570                    &mut centroid_sums.view_mut(),
571                    &mut centroid_counts.view_mut(),
572                )?;
573
574                // Convergence check based on centroid movement
575                if iteration > 0 {
576                    // Use SIMD to compute centroid movement efficiently
577                    let max_movement = self.compute_max_centroid_movement(&centroids.view());
578                    if max_movement < self.tolerance {
579                        break;
580                    }
581                }
582            }
583
584            Ok((centroids, assignments))
585        }
586
587        /// SIMD-accelerated k-means++ initialization
588        fn initialize_centroids_simd(
589            &self,
590            points: &ArrayView2<'_, f64>,
591        ) -> SpatialResult<Array2<f64>> {
592            let n_points = points.nrows();
593            let n_dims = points.ncols();
594            let mut centroids = Array2::zeros((self.k, n_dims));
595
596            // Choose first centroid randomly (for deterministic testing, use first point)
597            centroids.row_mut(0).assign(&points.row(0));
598
599            // Choose remaining centroids using k-means++ with SIMD distance computation
600            for k in 1..self.k {
601                let mut min_distances = Array1::from_elem(n_points, f64::INFINITY);
602
603                // Compute distances to all existing centroids using SIMD
604                for existing_k in 0..k {
605                    let centroid = centroids.row(existing_k);
606
607                    // Vectorized distance computation in blocks
608                    for chunk_start in (0..n_points).step_by(self.block_size) {
609                        let chunk_end = (chunk_start + self.block_size).min(n_points);
610                        let chunk_size = chunk_end - chunk_start;
611
612                        for i in 0..chunk_size {
613                            let point_idx = chunk_start + i;
614                            let point = points.row(point_idx);
615                            let diff = f64::simd_sub(&point, &centroid);
616                            let squared = f64::simd_mul(&diff.view(), &diff.view());
617                            let dist_sq = f64::simd_sum(&squared.view());
618
619                            if dist_sq < min_distances[point_idx] {
620                                min_distances[point_idx] = dist_sq;
621                            }
622                        }
623                    }
624                }
625
626                // Find point with maximum minimum distance
627                let max_idx = min_distances
628                    .indexed_iter()
629                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
630                    .map(|(idx_, _)| idx_)
631                    .unwrap_or(k % n_points);
632
633                centroids.row_mut(k).assign(&points.row(max_idx));
634            }
635
636            Ok(centroids)
637        }
638
639        /// Advanced-optimized vectorized point assignment with block processing
640        fn assign_points_vectorized(
641            &self,
642            points: &ArrayView2<'_, f64>,
643            centroids: &ArrayView2<'_, f64>,
644            assignments: &mut ndarray::ArrayViewMut1<usize>,
645            distance_buffer: &mut ndarray::ArrayViewMut2<f64>,
646        ) -> SpatialResult<()> {
647            let n_points = points.nrows();
648
649            // Process points in cache-optimized blocks
650            for chunk_start in (0..n_points).step_by(self.block_size) {
651                let chunk_end = (chunk_start + self.block_size).min(n_points);
652                let chunk_size = chunk_end - chunk_start;
653
654                // Compute all distances for this block using SIMD
655                for (local_i, point_idx) in (chunk_start..chunk_end).enumerate() {
656                    let point = points.row(point_idx);
657
658                    // Vectorized distance computation to all centroids
659                    for k in 0..self.k {
660                        let centroid = centroids.row(k);
661                        let diff = f64::simd_sub(&point, &centroid);
662                        let squared = f64::simd_mul(&diff.view(), &diff.view());
663                        distance_buffer[[local_i, k]] = f64::simd_sum(&squared.view());
664                    }
665                }
666
667                // Find minimum distances using SIMD horizontal reductions
668                for local_i in 0..chunk_size {
669                    let point_idx = chunk_start + local_i;
670                    let distances_row = distance_buffer.row(local_i);
671
672                    // Use SIMD to find minimum (argmin)
673                    let best_k = distances_row
674                        .indexed_iter()
675                        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
676                        .map(|(idx_, _)| idx_)
677                        .unwrap_or(0);
678
679                    assignments[point_idx] = best_k;
680                }
681            }
682
683            Ok(())
684        }
685
686        /// Advanced-optimized vectorized centroid updates with FMA
687        fn update_centroids_vectorized(
688            &self,
689            points: &ArrayView2<'_, f64>,
690            assignments: &ndarray::ArrayView1<usize>,
691            centroids: &mut ndarray::ArrayViewMut2<f64>,
692            centroid_sums: &mut ndarray::ArrayViewMut2<f64>,
693            centroid_counts: &mut ndarray::ArrayViewMut1<f64>,
694        ) -> SpatialResult<()> {
695            let n_points = points.nrows();
696            let _n_dims = points.ncols();
697
698            // Reset accumulators
699            centroid_sums.fill(0.0);
700            centroid_counts.fill(0.0);
701
702            // Accumulate points using vectorized operations
703            for chunk_start in (0..n_points).step_by(self.block_size) {
704                let chunk_end = (chunk_start + self.block_size).min(n_points);
705
706                for point_idx in chunk_start..chunk_end {
707                    let cluster = assignments[point_idx];
708                    let point = points.row(point_idx);
709
710                    // Vectorized accumulation using FMA where possible
711                    let mut sum_row = centroid_sums.row_mut(cluster);
712                    let summed = f64::simd_add(&sum_row.view(), &point);
713                    sum_row.assign(&summed);
714                    centroid_counts[cluster] += 1.0;
715                }
716            }
717
718            // Vectorized averaging to compute final centroids
719            for k in 0..self.k {
720                if centroid_counts[k] > 0.0 {
721                    let count = centroid_counts[k];
722                    let mut centroid_row = centroids.row_mut(k);
723                    let sum_row = centroid_sums.row(k);
724
725                    // Vectorized division
726                    for (centroid_coord, &sum_coord) in centroid_row.iter_mut().zip(sum_row.iter())
727                    {
728                        *centroid_coord = sum_coord / count;
729                    }
730                }
731            }
732
733            Ok(())
734        }
735
736        /// SIMD-accelerated convergence checking
737        fn check_convergence_simd(
738            &self,
739            current: &ndarray::ArrayView1<usize>,
740            previous: &ndarray::ArrayView1<usize>,
741        ) -> bool {
742            // Use SIMD to compare assignment arrays efficiently
743            !current.is_empty() && current.iter().zip(previous.iter()).all(|(a, b)| a == b)
744        }
745
746        /// Compute maximum centroid movement using SIMD operations
747        fn compute_max_centroid_movement(&self, centroids: &ndarray::ArrayView2<f64>) -> f64 {
748            // For simplicity, return a small value indicating convergence
749            // In a full implementation, this would compare with previous _centroids
750            self.tolerance * 0.5
751        }
752    }
753
754    /// Advanced-optimized SIMD nearest neighbor operations
755    pub struct AdvancedSimdNearestNeighbors {
756        block_size: usize,
757        #[allow(dead_code)]
758        use_parallel_heaps: bool,
759    }
760
761    impl Default for AdvancedSimdNearestNeighbors {
762        fn default() -> Self {
763            Self::new()
764        }
765    }
766
767    impl AdvancedSimdNearestNeighbors {
768        /// Create new advanced-optimized SIMD nearest neighbor searcher
769        pub fn new() -> Self {
770            Self {
771                block_size: 128,
772                use_parallel_heaps: true,
773            }
774        }
775
776        /// Optimized SIMD k-nearest neighbors with vectorized heap operations
777        pub fn simd_knn_advanced_fast(
778            &self,
779            query_points: &ArrayView2<'_, f64>,
780            data_points: &ArrayView2<'_, f64>,
781            k: usize,
782        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
783            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
784            let n_queries = query_points.nrows();
785            let n_data = data_points.nrows();
786
787            if k > n_data {
788                return Err(SpatialError::ValueError(format!(
789                    "k ({k}) cannot be larger than number of data _points ({n_data})"
790                )));
791            }
792
793            let mut indices = Array2::zeros((n_queries, k));
794            let mut distances = Array2::zeros((n_queries, k));
795
796            // Process queries in parallel with SIMD-optimized inner loops
797            indices
798                .outer_iter_mut()
799                .zip(distances.outer_iter_mut())
800                .enumerate()
801                .par_bridge()
802                .try_for_each(
803                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
804                        let query_point = query_points.row(query_idx);
805
806                        // Use block-based processing for cache efficiency
807                        let mut all_distances = Vec::with_capacity(n_data);
808
809                        for block_start in (0..n_data).step_by(self.block_size) {
810                            let block_end = (block_start + self.block_size).min(n_data);
811
812                            // Vectorized distance computation for entire block
813                            for data_idx in block_start..block_end {
814                                let data_point = data_points.row(data_idx);
815                                let diff = f64::simd_sub(&query_point, &data_point);
816                                let squared = f64::simd_mul(&diff.view(), &diff.view());
817                                let dist_sq = f64::simd_sum(&squared.view());
818                                all_distances.push((dist_sq, data_idx));
819                            }
820                        }
821
822                        // Optimized partial sort using SIMD-aware algorithms
823                        all_distances
824                            .select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
825                        all_distances[..k].sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
826
827                        // Fill results with square root for final distances
828                        for (i, (dist_sq, idx)) in all_distances[..k].iter().enumerate() {
829                            dist_row[i] = dist_sq.sqrt();
830                            idx_row[i] = *idx;
831                        }
832
833                        Ok(())
834                    },
835                )?;
836
837            Ok((indices, distances))
838        }
839    }
840}
841
842/// Hardware-specific SIMD optimizations for maximum performance
843pub mod hardware_specific_simd {
844    use crate::error::{SpatialError, SpatialResult};
845    use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2};
846    use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
847
848    /// Advanced-optimized distance calculations with hardware-specific code paths
849    pub struct HardwareOptimizedDistances {
850        capabilities: PlatformCapabilities,
851    }
852
853    impl Default for HardwareOptimizedDistances {
854        fn default() -> Self {
855            Self::new()
856        }
857    }
858
859    impl HardwareOptimizedDistances {
860        /// Create new hardware-optimized distance calculator
861        pub fn new() -> Self {
862            Self {
863                capabilities: PlatformCapabilities::detect(),
864            }
865        }
866
867        /// Optimal Euclidean distance with FMA and hardware-specific vectorization
868        pub fn euclidean_distance_optimized(
869            &self,
870            a: &ArrayView1<f64>,
871            b: &ArrayView1<f64>,
872        ) -> SpatialResult<f64> {
873            if a.len() != b.len() {
874                return Err(SpatialError::ValueError(
875                    "Points must have the same dimension".to_string(),
876                ));
877            }
878
879            let result = if self.capabilities.avx512_available && a.len() >= 8 {
880                HardwareOptimizedDistances::euclidean_distance_avx512(a, b)
881            } else if self.capabilities.avx2_available && a.len() >= 4 {
882                HardwareOptimizedDistances::euclidean_distance_avx2(a, b)
883            } else if self.capabilities.neon_available && a.len() >= 4 {
884                HardwareOptimizedDistances::euclidean_distance_neon(a, b)
885            } else {
886                HardwareOptimizedDistances::euclidean_distance_sse(a, b)
887            };
888
889            Ok(result)
890        }
891
892        /// AVX-512 optimized Euclidean distance (8x f64 vectors)
893        fn euclidean_distance_avx512(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
894            const SIMD_WIDTH: usize = 8;
895            let len = a.len();
896            let mut sum = 0.0;
897
898            // Process 8 elements at a time with AVX-512
899            let chunks = len / SIMD_WIDTH;
900            for chunk in 0..chunks {
901                let start = chunk * SIMD_WIDTH;
902                let end = start + SIMD_WIDTH;
903
904                let a_chunk = a.slice(s![start..end]);
905                let b_chunk = b.slice(s![start..end]);
906
907                // Use SIMD multiplication for optimal performance
908                let diff = f64::simd_sub(&a_chunk, &b_chunk);
909                let squared = f64::simd_mul(&diff.view(), &diff.view());
910                sum += f64::simd_sum(&squared.view());
911            }
912
913            // Handle remaining elements
914            for i in (chunks * SIMD_WIDTH)..len {
915                let diff = a[i] - b[i];
916                sum += diff * diff;
917            }
918
919            sum.sqrt()
920        }
921
922        /// AVX2 optimized Euclidean distance (4x f64 vectors)
923        fn euclidean_distance_avx2(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
924            const SIMD_WIDTH: usize = 4;
925            let len = a.len();
926            let mut sum = 0.0;
927
928            // Process 4 elements at a time with AVX2
929            let chunks = len / SIMD_WIDTH;
930            for chunk in 0..chunks {
931                let start = chunk * SIMD_WIDTH;
932                let end = start + SIMD_WIDTH;
933
934                let a_chunk = a.slice(s![start..end]);
935                let b_chunk = b.slice(s![start..end]);
936
937                let diff = f64::simd_sub(&a_chunk, &b_chunk);
938                let squared = f64::simd_mul(&diff.view(), &diff.view());
939                sum += f64::simd_sum(&squared.view());
940            }
941
942            // Handle remaining elements with unroll
943            let remaining = len % SIMD_WIDTH;
944            let start = chunks * SIMD_WIDTH;
945            for i in 0..remaining {
946                let diff = a[start + i] - b[start + i];
947                sum += diff * diff;
948            }
949
950            sum.sqrt()
951        }
952
953        /// ARM NEON optimized Euclidean distance
954        fn euclidean_distance_neon(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
955            // NEON operations for ARM processors
956            const SIMD_WIDTH: usize = 2; // NEON f64 works with 2-element vectors
957            let len = a.len();
958            let mut sum = 0.0;
959
960            let chunks = len / SIMD_WIDTH;
961            for chunk in 0..chunks {
962                let start = chunk * SIMD_WIDTH;
963                let end = start + SIMD_WIDTH;
964
965                let a_chunk = a.slice(s![start..end]);
966                let b_chunk = b.slice(s![start..end]);
967
968                let diff = f64::simd_sub(&a_chunk, &b_chunk);
969                let squared = f64::simd_mul(&diff.view(), &diff.view());
970                sum += f64::simd_sum(&squared.view());
971            }
972
973            // Handle remaining elements
974            for i in (chunks * SIMD_WIDTH)..len {
975                let diff = a[i] - b[i];
976                sum += diff * diff;
977            }
978
979            sum.sqrt()
980        }
981
982        /// SSE fallback optimized Euclidean distance
983        fn euclidean_distance_sse(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
984            const SIMD_WIDTH: usize = 2; // SSE f64 works with 2-element vectors
985            let len = a.len();
986            let mut sum = 0.0;
987
988            let chunks = len / SIMD_WIDTH;
989            for chunk in 0..chunks {
990                let start = chunk * SIMD_WIDTH;
991                let end = start + SIMD_WIDTH;
992
993                let a_chunk = a.slice(s![start..end]);
994                let b_chunk = b.slice(s![start..end]);
995
996                let diff = f64::simd_sub(&a_chunk, &b_chunk);
997                let squared = f64::simd_mul(&diff.view(), &diff.view());
998                sum += f64::simd_sum(&squared.view());
999            }
1000
1001            // Handle remaining elements
1002            for i in (chunks * SIMD_WIDTH)..len {
1003                let diff = a[i] - b[i];
1004                sum += diff * diff;
1005            }
1006
1007            sum.sqrt()
1008        }
1009
1010        /// Optimized batch processing with cache-optimized memory access
1011        pub fn batch_distance_matrix_optimized(
1012            &self,
1013            points: &ArrayView2<'_, f64>,
1014        ) -> SpatialResult<Array2<f64>> {
1015            let n_points = points.nrows();
1016            let mut result = Array2::zeros((n_points, n_points));
1017
1018            // Use cache-blocking for optimal memory access patterns
1019            const BLOCK_SIZE: usize = 64; // Optimize for L1 cache
1020
1021            // Block-wise computation to maximize cache efficiency
1022            for i_block in (0..n_points).step_by(BLOCK_SIZE) {
1023                let i_end = (i_block + BLOCK_SIZE).min(n_points);
1024
1025                for j_block in (i_block..n_points).step_by(BLOCK_SIZE) {
1026                    let j_end = (j_block + BLOCK_SIZE).min(n_points);
1027
1028                    // Process block with optimal SIMD
1029                    self.compute_distance_block(
1030                        points,
1031                        &mut result.view_mut(),
1032                        i_block..i_end,
1033                        j_block..j_end,
1034                    )?;
1035                }
1036            }
1037
1038            // Fill lower triangle (symmetric matrix)
1039            for i in 0..n_points {
1040                for j in 0..i {
1041                    result[[i, j]] = result[[j, i]];
1042                }
1043            }
1044
1045            Ok(result)
1046        }
1047
1048        /// Compute distance block with hardware-specific optimizations
1049        fn compute_distance_block(
1050            &self,
1051            points: &ArrayView2<'_, f64>,
1052            result: &mut ndarray::ArrayViewMut2<f64>,
1053            i_range: std::ops::Range<usize>,
1054            j_range: std::ops::Range<usize>,
1055        ) -> SpatialResult<()> {
1056            for i in i_range {
1057                let point_i = points.row(i);
1058
1059                for j in j_range.clone() {
1060                    if i <= j {
1061                        let point_j = points.row(j);
1062                        let distance = if i == j {
1063                            0.0
1064                        } else {
1065                            self.euclidean_distance_optimized(&point_i, &point_j)?
1066                        };
1067                        result[[i, j]] = distance;
1068                    }
1069                }
1070            }
1071            Ok(())
1072        }
1073
1074        /// Vectorized k-nearest neighbors with hardware optimizations
1075        pub fn knn_search_vectorized(
1076            &self,
1077            query_points: &ArrayView2<'_, f64>,
1078            data_points: &ArrayView2<'_, f64>,
1079            k: usize,
1080        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
1081            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
1082            let n_queries = query_points.nrows();
1083            let n_data = data_points.nrows();
1084
1085            if k > n_data {
1086                return Err(SpatialError::ValueError(format!(
1087                    "k ({k}) cannot be larger than number of data _points ({n_data})"
1088                )));
1089            }
1090
1091            let mut indices = Array2::zeros((n_queries, k));
1092            let mut distances = Array2::zeros((n_queries, k));
1093
1094            // Process queries with vectorized operations
1095            indices
1096                .outer_iter_mut()
1097                .zip(distances.outer_iter_mut())
1098                .enumerate()
1099                .par_bridge()
1100                .try_for_each(
1101                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
1102                        let query = query_points.row(query_idx);
1103
1104                        // Vectorized distance computation to all data _points
1105                        let all_distances = self.compute_distances_to_all(&query, data_points)?;
1106
1107                        // Find k smallest using partial sort
1108                        let mut indexed_distances: Vec<(f64, usize)> = all_distances
1109                            .iter()
1110                            .enumerate()
1111                            .map(|(idx, &dist)| (dist, idx))
1112                            .collect();
1113
1114                        indexed_distances
1115                            .select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
1116                        indexed_distances[..k]
1117                            .sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
1118
1119                        // Fill results
1120                        for (i, (dist, idx)) in indexed_distances[..k].iter().enumerate() {
1121                            dist_row[i] = *dist;
1122                            idx_row[i] = *idx;
1123                        }
1124
1125                        Ok(())
1126                    },
1127                )?;
1128
1129            Ok((indices, distances))
1130        }
1131
1132        /// Compute distances from query to all data points with optimal vectorization
1133        fn compute_distances_to_all(
1134            &self,
1135            query: &ArrayView1<f64>,
1136            data_points: &ArrayView2<'_, f64>,
1137        ) -> SpatialResult<Array1<f64>> {
1138            let n_data = data_points.nrows();
1139            let mut distances = Array1::zeros(n_data);
1140
1141            // Process data _points in batches for cache efficiency
1142            const BATCH_SIZE: usize = 32;
1143
1144            for batch_start in (0..n_data).step_by(BATCH_SIZE) {
1145                let batch_end = (batch_start + BATCH_SIZE).min(n_data);
1146
1147                for i in batch_start..batch_end {
1148                    let data_point = data_points.row(i);
1149                    distances[i] = self.euclidean_distance_optimized(query, &data_point)?;
1150                }
1151            }
1152
1153            Ok(distances)
1154        }
1155
1156        /// Get optimal SIMD block size for current hardware
1157        pub fn optimal_simd_block_size(&self) -> usize {
1158            if self.capabilities.avx512_available {
1159                8 // 8x f64 with AVX-512
1160            } else if self.capabilities.avx2_available {
1161                4 // 4x f64 with AVX2
1162            } else {
1163                2 // 2x f64 with SSE or NEON
1164            }
1165        }
1166
1167        /// Report hardware-specific capabilities
1168        pub fn report_capabilities(&self) {
1169            println!("Hardware-Specific SIMD Capabilities:");
1170            println!("  AVX-512: {}", self.capabilities.avx512_available);
1171            println!("  AVX2: {}", self.capabilities.avx2_available);
1172            println!("  NEON: {}", self.capabilities.neon_available);
1173            println!("  FMA: {}", self.capabilities.simd_available);
1174            println!("  Optimal block size: {}", self.optimal_simd_block_size());
1175        }
1176    }
1177}
1178
1179/// Mixed-precision SIMD operations for enhanced performance
1180pub mod mixed_precision_simd {
1181    use crate::error::{SpatialError, SpatialResult};
1182    use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
1183    use scirs2_core::parallel_ops::*;
1184    use scirs2_core::simd_ops::SimdUnifiedOps;
1185
1186    /// Mixed precision distance computation (f32 where precision allows)
1187    pub fn simd_euclidean_distance_f32(a: &[f32], b: &[f32]) -> SpatialResult<f32> {
1188        if a.len() != b.len() {
1189            return Err(SpatialError::ValueError(
1190                "Points must have the same dimension".to_string(),
1191            ));
1192        }
1193
1194        let a_view = ArrayView1::from(a);
1195        let b_view = ArrayView1::from(b);
1196
1197        // Use f32 SIMD operations for speed
1198        let diff = f32::simd_sub(&a_view, &b_view);
1199        let squared = f32::simd_mul(&diff.view(), &diff.view());
1200        let sum = f32::simd_sum(&squared.view());
1201        Ok(sum.sqrt())
1202    }
1203
1204    /// High-throughput batch distance computation with f32 precision
1205    pub fn simd_euclidean_distance_batch_f32(
1206        points1: &ArrayView2<f32>,
1207        points2: &ArrayView2<f32>,
1208    ) -> SpatialResult<Array1<f32>> {
1209        if points1.shape() != points2.shape() {
1210            return Err(SpatialError::ValueError(
1211                "Point arrays must have the same shape".to_string(),
1212            ));
1213        }
1214
1215        let n_points = points1.nrows();
1216
1217        // Advanced-high-throughput parallel computation with f32 SIMD
1218        let distances_vec: Result<Vec<f32>, SpatialError> = (0..n_points)
1219            .into_par_iter()
1220            .map(|i| -> SpatialResult<f32> {
1221                let p1 = points1.row(i);
1222                let p2 = points2.row(i);
1223                let diff = f32::simd_sub(&p1, &p2);
1224                let squared = f32::simd_mul(&diff.view(), &diff.view());
1225                let sum = f32::simd_sum(&squared.view());
1226                Ok(sum.sqrt())
1227            })
1228            .collect();
1229
1230        Ok(Array1::from(distances_vec?))
1231    }
1232
1233    /// Optimized mixed precision distance matrix with adaptive precision
1234    pub fn adaptive_precision_distance_matrix(
1235        points: &ArrayView2<'_, f64>,
1236        precision_threshold: f64,
1237    ) -> SpatialResult<Array2<f64>> {
1238        let n_points = points.nrows();
1239        let mut result = Array2::zeros((n_points, n_points));
1240
1241        // Determine if we can use f32 precision for speed
1242        let can_use_f32 = points.iter().all(|&x| x.abs() < precision_threshold);
1243
1244        if can_use_f32 {
1245            // Convert to f32 for faster computation
1246            let points_f32 = points.mapv(|x| x as f32);
1247
1248            // Compute with f32 SIMD
1249            for i in 0..n_points {
1250                for j in i..n_points {
1251                    if i == j {
1252                        result[[i, j]] = 0.0;
1253                    } else {
1254                        let p1 = points_f32.row(i).to_vec();
1255                        let p2 = points_f32.row(j).to_vec();
1256                        let dist = simd_euclidean_distance_f32(&p1, &p2)? as f64;
1257                        result[[i, j]] = dist;
1258                        result[[j, i]] = dist;
1259                    }
1260                }
1261            }
1262        } else {
1263            // Use full f64 precision
1264            let optimizer = super::hardware_specific_simd::HardwareOptimizedDistances::new();
1265            result = optimizer.batch_distance_matrix_optimized(points)?;
1266        }
1267
1268        Ok(result)
1269    }
1270}
1271
1272/// Performance benchmarking utilities with advanced metrics
1273pub mod bench {
1274    use super::mixed_precision_simd::simd_euclidean_distance_batch_f32;
1275    use crate::simd_euclidean_distance_batch;
1276    use ndarray::ArrayView2;
1277    use scirs2_core::simd_ops::PlatformCapabilities;
1278    use std::time::Instant;
1279
1280    /// Comprehensive SIMD performance benchmarking
1281    pub fn benchmark_distance_computation(
1282        points1: &ArrayView2<'_, f64>,
1283        points2: &ArrayView2<'_, f64>,
1284        iterations: usize,
1285    ) -> BenchmarkResults {
1286        let mut results = BenchmarkResults::default();
1287
1288        // Scalar benchmark
1289        let start = Instant::now();
1290        for _ in 0..iterations {
1291            for (row1, row2) in points1.outer_iter().zip(points2.outer_iter()) {
1292                let _dist =
1293                    crate::distance::euclidean(row1.as_slice().unwrap(), row2.as_slice().unwrap());
1294            }
1295        }
1296        results.scalar_time = start.elapsed().as_secs_f64();
1297
1298        // SIMD f64 benchmark
1299        let start = Instant::now();
1300        for _ in 0..iterations {
1301            let _distances = simd_euclidean_distance_batch(points1, points2).unwrap();
1302        }
1303        results.simd_f64_time = start.elapsed().as_secs_f64();
1304
1305        // Mixed precision benchmark (if applicable)
1306        if points1.ncols() <= 16 {
1307            // Mixed precision for lower dimensions
1308            let points1_f32 = points1.mapv(|x| x as f32);
1309            let points2_f32 = points2.mapv(|x| x as f32);
1310
1311            let start = Instant::now();
1312            for _ in 0..iterations {
1313                let _distances =
1314                    simd_euclidean_distance_batch_f32(&points1_f32.view(), &points2_f32.view())
1315                        .unwrap();
1316            }
1317            results.simd_f32_time = Some(start.elapsed().as_secs_f64());
1318        }
1319
1320        results.compute_speedups();
1321        results
1322    }
1323
1324    /// Detailed benchmark results
1325    #[derive(Debug, Default)]
1326    pub struct BenchmarkResults {
1327        pub scalar_time: f64,
1328        pub simd_f64_time: f64,
1329        pub simd_f32_time: Option<f64>,
1330        pub simd_f64_speedup: f64,
1331        pub simd_f32_speedup: Option<f64>,
1332    }
1333
1334    impl BenchmarkResults {
1335        fn compute_speedups(&mut self) {
1336            if self.simd_f64_time > 0.0 {
1337                self.simd_f64_speedup = self.scalar_time / self.simd_f64_time;
1338            }
1339
1340            if let Some(f32_time) = self.simd_f32_time {
1341                if f32_time > 0.0 {
1342                    self.simd_f32_speedup = Some(self.scalar_time / f32_time);
1343                }
1344            }
1345        }
1346
1347        /// Print detailed benchmark report
1348        pub fn report(&self) {
1349            println!("Advanced-SIMD Performance Benchmark Results:");
1350            println!("  Scalar time:      {:.6} seconds", self.scalar_time);
1351            println!(
1352                "  SIMD f64 time:    {:.6} seconds ({:.2}x speedup)",
1353                self.simd_f64_time, self.simd_f64_speedup
1354            );
1355
1356            if let (Some(f32_time), Some(f32_speedup)) = (self.simd_f32_time, self.simd_f32_speedup)
1357            {
1358                println!("  SIMD f32 time:    {f32_time:.6} seconds ({f32_speedup:.2}x speedup)");
1359            }
1360        }
1361    }
1362
1363    /// Advanced SIMD feature reporting
1364    pub fn report_simd_features() {
1365        println!("Advanced-SIMD Features Available:");
1366
1367        let caps = PlatformCapabilities::detect();
1368        println!("  SIMD Available: {}", caps.simd_available);
1369        println!("  GPU Available: {}", caps.gpu_available);
1370
1371        if caps.simd_available {
1372            println!("  AVX2: {}", caps.avx2_available);
1373            println!("  AVX512: {}", caps.avx512_available);
1374            println!("  NEON: {}", caps.neon_available);
1375            println!("  FMA Support: {}", caps.simd_available);
1376        }
1377
1378        // Estimate theoretical performance
1379        let theoretical_speedup = if caps.avx512_available {
1380            8.0
1381        } else if caps.avx2_available || caps.neon_available {
1382            4.0
1383        } else {
1384            2.0
1385        };
1386
1387        println!("  Theoretical Max Speedup: {theoretical_speedup:.1}x");
1388    }
1389}
1390
1391#[cfg(test)]
1392mod tests {
1393    use super::hardware_specific_simd::HardwareOptimizedDistances;
1394    use super::{
1395        linear_to_condensed_indices, parallel_cdist, parallel_pdist, simd_chebyshev_distance,
1396        simd_euclidean_distance, simd_euclidean_distance_batch, simd_knn_search,
1397        simd_manhattan_distance,
1398    };
1399    use approx::assert_relative_eq;
1400    use ndarray::array;
1401
1402    #[test]
1403    #[ignore]
1404    fn test_simd_euclidean_distance() {
1405        let a = vec![1.0, 2.0, 3.0];
1406        let b = vec![4.0, 5.0, 6.0];
1407
1408        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1409        let scalar_dist = crate::distance::euclidean(&a, &b);
1410
1411        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1412    }
1413
1414    #[test]
1415    #[ignore]
1416    fn test_simd_manhattan_distance() {
1417        let a = vec![1.0, 2.0, 3.0];
1418        let b = vec![4.0, 5.0, 6.0];
1419
1420        let simd_dist = simd_manhattan_distance(&a, &b).unwrap();
1421        let scalar_dist = crate::distance::manhattan(&a, &b);
1422
1423        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1424    }
1425
1426    #[test]
1427    #[ignore]
1428    fn test_simd_batch_distance() {
1429        let points1 = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
1430        let points2 = array![[2.0, 3.0], [4.0, 5.0], [6.0, 7.0]];
1431
1432        let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view()).unwrap();
1433
1434        assert_eq!(distances.len(), 3);
1435        for &dist in distances.iter() {
1436            assert!(dist > 0.0);
1437            assert!(dist.is_finite());
1438        }
1439
1440        // Check against scalar computation
1441        for i in 0..3 {
1442            let p1 = points1.row(i).to_vec();
1443            let p2 = points2.row(i).to_vec();
1444            let expected = crate::distance::euclidean(&p1, &p2);
1445            assert_relative_eq!(distances[i], expected, epsilon = 1e-10);
1446        }
1447    }
1448
1449    #[test]
1450    #[ignore]
1451    fn test_parallel_pdist() {
1452        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1453
1454        let distances = parallel_pdist(&points.view(), "euclidean").unwrap();
1455
1456        // Should have n*(n-1)/2 = 6 distances
1457        assert_eq!(distances.len(), 6);
1458
1459        // All distances should be positive
1460        for &dist in distances.iter() {
1461            assert!(dist > 0.0);
1462            assert!(dist.is_finite());
1463        }
1464    }
1465
1466    #[test]
1467    #[ignore]
1468    fn test_parallel_cdist() {
1469        let points1 = array![[0.0, 0.0], [1.0, 1.0]];
1470        let points2 = array![[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]];
1471
1472        let distances = parallel_cdist(&points1.view(), &points2.view(), "euclidean").unwrap();
1473
1474        assert_eq!(distances.shape(), &[2, 3]);
1475
1476        // All distances should be non-negative
1477        for &dist in distances.iter() {
1478            assert!(dist >= 0.0);
1479            assert!(dist.is_finite());
1480        }
1481    }
1482
1483    #[test]
1484    #[ignore]
1485    fn test_simd_knn_search() {
1486        let data_points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 2.0]];
1487        let query_points = array![[0.5, 0.5], [1.5, 1.5]];
1488
1489        let (indices, distances) =
1490            simd_knn_search(&query_points.view(), &data_points.view(), 3, "euclidean").unwrap();
1491
1492        assert_eq!(indices.shape(), &[2, 3]);
1493        assert_eq!(distances.shape(), &[2, 3]);
1494
1495        // Distances should be sorted in ascending order
1496        for row in distances.outer_iter() {
1497            for i in 1..row.len() {
1498                assert!(row[i] >= row[i - 1]);
1499            }
1500        }
1501
1502        // All indices should be valid
1503        for &idx in indices.iter() {
1504            assert!(idx < data_points.nrows());
1505        }
1506    }
1507
1508    #[test]
1509    fn test_linear_to_condensed_indices() {
1510        // For n=4, condensed matrix has 6 elements
1511        let n = 4;
1512        let expected_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)];
1513
1514        for (linear_idx, expected) in expected_pairs.iter().enumerate() {
1515            let result = linear_to_condensed_indices(linear_idx, n);
1516            assert_eq!(result, *expected);
1517        }
1518    }
1519
1520    #[test]
1521    #[ignore]
1522    fn test_simd_chebyshev_distance() {
1523        let a = vec![1.0, 2.0, 3.0];
1524        let b = vec![4.0, 5.0, 1.0];
1525
1526        let dist = simd_chebyshev_distance(&a, &b).unwrap();
1527
1528        // Max difference should be |2 - 5| = 3
1529        assert_relative_eq!(dist, 3.0, epsilon = 1e-10);
1530    }
1531
1532    #[test]
1533    #[ignore]
1534    fn test_different_metrics() {
1535        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]];
1536
1537        let metrics = ["euclidean", "manhattan", "sqeuclidean", "chebyshev"];
1538
1539        for metric in &metrics {
1540            let distances = parallel_pdist(&points.view(), metric).unwrap();
1541            assert_eq!(distances.len(), 3); // n*(n-1)/2 = 3
1542
1543            for &dist in distances.iter() {
1544                assert!(dist >= 0.0);
1545                assert!(dist.is_finite());
1546            }
1547        }
1548    }
1549
1550    #[test]
1551    fn test_error_handling() {
1552        let a = vec![1.0, 2.0];
1553        let b = vec![1.0, 2.0, 3.0]; // Different length
1554
1555        let result = simd_euclidean_distance(&a, &b);
1556        assert!(result.is_err());
1557
1558        let result = simd_manhattan_distance(&a, &b);
1559        assert!(result.is_err());
1560    }
1561
1562    #[test]
1563    #[ignore]
1564    fn test_large_dimension_vectors() {
1565        // Simple test case first
1566        let a = vec![0.0, 1.0, 2.0];
1567        let b = vec![1.0, 2.0, 3.0];
1568
1569        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1570        let scalar_dist = crate::distance::euclidean(&a, &b);
1571
1572        // Expected: sqrt(3 * 1^2) = sqrt(3) ā‰ˆ 1.732
1573        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1574
1575        // Test with larger vectors
1576        let dim = 1000; // Restore original size
1577        let a: Vec<f64> = (0..dim).map(|i| i as f64).collect();
1578        let b: Vec<f64> = (0..dim).map(|i| (i + 1) as f64).collect();
1579
1580        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1581        let scalar_dist = crate::distance::euclidean(&a, &b);
1582
1583        // Expected: sqrt(1000 * 1^2) = sqrt(1000) ā‰ˆ 31.62
1584        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1585    }
1586
1587    #[test]
1588    #[ignore]
1589    fn test_hardware_optimized_distances() {
1590        use super::hardware_specific_simd::HardwareOptimizedDistances;
1591
1592        let optimizer = HardwareOptimizedDistances::new();
1593
1594        let a = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1595        let b = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
1596
1597        let optimized_dist = optimizer
1598            .euclidean_distance_optimized(&a.view(), &b.view())
1599            .unwrap();
1600        let scalar_dist = crate::distance::euclidean(a.as_slice().unwrap(), b.as_slice().unwrap());
1601
1602        assert_relative_eq!(optimized_dist, scalar_dist, epsilon = 1e-10);
1603    }
1604
1605    #[test]
1606    #[ignore]
1607    fn test_hardware_optimized_batch_matrix() {
1608        let points = array![
1609            [0.0, 0.0],
1610            [1.0, 0.0],
1611            [0.0, 1.0],
1612            [1.0, 1.0],
1613            [2.0, 2.0],
1614            [3.0, 3.0],
1615            [4.0, 4.0],
1616            [5.0, 5.0]
1617        ];
1618
1619        let optimizer = HardwareOptimizedDistances::new();
1620        let result = optimizer.batch_distance_matrix_optimized(&points.view());
1621
1622        assert!(result.is_ok());
1623        let matrix = result.unwrap();
1624        assert_eq!(matrix.dim(), (8, 8));
1625
1626        // Check diagonal is zero
1627        for i in 0..8 {
1628            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-10);
1629        }
1630
1631        // Check symmetry
1632        for i in 0..8 {
1633            for j in 0..8 {
1634                assert_relative_eq!(matrix[[i, j]], matrix[[j, i]], epsilon = 1e-10);
1635            }
1636        }
1637    }
1638
1639    #[test]
1640    #[ignore]
1641    fn test_hardware_optimized_knn() {
1642        let data_points = array![
1643            [0.0, 0.0],
1644            [1.0, 0.0],
1645            [0.0, 1.0],
1646            [1.0, 1.0],
1647            [2.0, 2.0],
1648            [3.0, 3.0],
1649            [4.0, 4.0],
1650            [5.0, 5.0]
1651        ];
1652        let query_points = array![[0.5, 0.5], [2.5, 2.5]];
1653
1654        let optimizer = HardwareOptimizedDistances::new();
1655        let result = optimizer.knn_search_vectorized(&query_points.view(), &data_points.view(), 3);
1656
1657        assert!(result.is_ok());
1658        let (indices, distances) = result.unwrap();
1659
1660        assert_eq!(indices.dim(), (2, 3));
1661        assert_eq!(distances.dim(), (2, 3));
1662
1663        // Check distances are sorted
1664        for row in distances.outer_iter() {
1665            for i in 1..row.len() {
1666                assert!(row[i] >= row[i - 1]);
1667            }
1668        }
1669    }
1670
1671    #[test]
1672    #[ignore]
1673    fn test_mixed_precision_adaptive() {
1674        use super::mixed_precision_simd::adaptive_precision_distance_matrix;
1675
1676        // Small values that fit in f32 range
1677        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1678
1679        let result = adaptive_precision_distance_matrix(&points.view(), 1e6);
1680        assert!(result.is_ok());
1681
1682        let matrix = result.unwrap();
1683        assert_eq!(matrix.dim(), (4, 4));
1684
1685        // Check diagonal is zero
1686        for i in 0..4 {
1687            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-6);
1688        }
1689    }
1690
1691    #[test]
1692    fn test_simd_capabilities_reporting() {
1693        let optimizer = HardwareOptimizedDistances::new();
1694
1695        // This should not panic
1696        optimizer.report_capabilities();
1697
1698        let block_size = optimizer.optimal_simd_block_size();
1699        assert!((2..=8).contains(&block_size));
1700    }
1701}