scirs2_spatial/
simd_distance.rs

1//! SIMD-accelerated distance calculations for spatial operations
2//!
3//! This module provides high-performance implementations of distance calculations
4//! using the unified SIMD operations from scirs2-core. These optimized
5//! functions can provide significant performance improvements for operations
6//! involving large datasets.
7//!
8//! # Features
9//!
10//! - **SIMD-accelerated distance metrics**: Euclidean, Manhattan, Minkowski
11//! - **Parallel distance matrix computation**: Multi-threaded pdist and cdist
12//! - **Batch nearest neighbor queries**: Optimized for large point sets
13//! - **Memory-efficient operations**: Minimized memory allocation and copying
14//!
15//! # Architecture Support
16//!
17//! The SIMD operations are automatically detected and optimized by scirs2-core
18//! based on the available hardware capabilities.
19//!
20//! # Examples
21//!
22//! ```
23//! use scirs2_spatial::simd_distance::{simd_euclidean_distance_batch, parallel_pdist};
24//! use scirs2_core::ndarray::array;
25//!
26//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
27//! // SIMD batch distance calculation
28//! let points1 = array![[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]];
29//! let points2 = array![[1.0, 0.0], [2.0, 1.0], [3.0, 2.0]];
30//!
31//! let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view())?;
32//! println!("Batch distances: {:?}", distances);
33//!
34//! // Parallel distance matrix
35//! let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
36//! let dist_matrix = parallel_pdist(&points.view(), "euclidean")?;
37//! println!("Distance matrix shape: {:?}", dist_matrix.shape());
38//! # Ok(())
39//! # }
40//! ```
41
42use crate::error::{SpatialError, SpatialResult};
43use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
44use scirs2_core::parallel_ops::*;
45use scirs2_core::simd_ops::SimdUnifiedOps;
46
47/// Supported distance metrics for SIMD operations
48#[derive(Debug, Clone, Copy)]
49pub enum SimdMetric {
50    /// Euclidean distance (L2 norm)
51    Euclidean,
52    /// Manhattan distance (L1 norm)  
53    Manhattan,
54    /// Squared Euclidean distance (no square root)
55    SquaredEuclidean,
56    /// Chebyshev distance (Lāˆž norm)
57    Chebyshev,
58}
59
60impl SimdMetric {
61    /// Get the string name of the metric
62    ///
63    /// # Returns
64    ///
65    /// * A string slice containing the name of the metric
66    ///
67    /// # Examples
68    ///
69    /// ```
70    /// use scirs2_spatial::simd_distance::SimdMetric;
71    ///
72    /// let metric = SimdMetric::Euclidean;
73    /// assert_eq!(metric.name(), "euclidean");
74    ///
75    /// let metric = SimdMetric::Manhattan;
76    /// assert_eq!(metric.name(), "manhattan");
77    /// ```
78    pub fn name(&self) -> &'static str {
79        match self {
80            SimdMetric::Euclidean => "euclidean",
81            SimdMetric::Manhattan => "manhattan",
82            SimdMetric::SquaredEuclidean => "sqeuclidean",
83            SimdMetric::Chebyshev => "chebyshev",
84        }
85    }
86}
87
88/// SIMD-accelerated Euclidean distance between two points
89///
90/// # Arguments
91/// * `a` - First point
92/// * `b` - Second point
93///
94/// # Returns
95/// * Euclidean distance between the points
96#[allow(dead_code)]
97pub fn simd_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
98    if a.len() != b.len() {
99        return Err(SpatialError::ValueError(
100            "Points must have the same dimension".to_string(),
101        ));
102    }
103
104    // Convert slices to ArrayView for core ops
105    let a_view = ArrayView1::from(a);
106    let b_view = ArrayView1::from(b);
107
108    // Compute squared distance using SIMD operations
109    let diff = f64::simd_sub(&a_view, &b_view);
110    let squared = f64::simd_mul(&diff.view(), &diff.view());
111    let sum = f64::simd_sum(&squared.view());
112    Ok(sum.sqrt())
113}
114
115/// SIMD-accelerated squared Euclidean distance between two points
116#[allow(dead_code)]
117pub fn simd_squared_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
118    if a.len() != b.len() {
119        return Err(SpatialError::ValueError(
120            "Points must have the same dimension".to_string(),
121        ));
122    }
123
124    let a_view = ArrayView1::from(a);
125    let b_view = ArrayView1::from(b);
126
127    // Compute squared distance using SIMD operations
128    let diff = f64::simd_sub(&a_view, &b_view);
129    let squared = f64::simd_mul(&diff.view(), &diff.view());
130    Ok(f64::simd_sum(&squared.view()))
131}
132
133/// SIMD-accelerated Manhattan distance between two points
134#[allow(dead_code)]
135pub fn simd_manhattan_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
136    if a.len() != b.len() {
137        return Err(SpatialError::ValueError(
138            "Points must have the same dimension".to_string(),
139        ));
140    }
141
142    let a_view = ArrayView1::from(a);
143    let b_view = ArrayView1::from(b);
144
145    // Compute Manhattan distance using SIMD operations
146    let diff = f64::simd_sub(&a_view, &b_view);
147    let abs_diff = f64::simd_abs(&diff.view());
148    Ok(f64::simd_sum(&abs_diff.view()))
149}
150
151/// SIMD-accelerated Chebyshev distance
152#[allow(dead_code)]
153pub fn simd_chebyshev_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
154    if a.len() != b.len() {
155        return Err(SpatialError::ValueError(
156            "Points must have the same dimension".to_string(),
157        ));
158    }
159
160    let a_view = ArrayView1::from(a);
161    let b_view = ArrayView1::from(b);
162
163    // Compute Chebyshev distance using SIMD operations
164    let diff = f64::simd_sub(&a_view, &b_view);
165    let abs_diff = f64::simd_abs(&diff.view());
166    Ok(f64::simd_max_element(&abs_diff.view()))
167}
168
169/// Batch SIMD-accelerated Euclidean distance calculation
170///
171/// Computes distances between corresponding rows of two matrices
172///
173/// # Arguments
174/// * `points1` - First set of points, shape (n, d)
175/// * `points2` - Second set of points, shape (n, d)
176///
177/// # Returns
178/// * Array of distances, shape (n,)
179#[allow(dead_code)]
180pub fn simd_euclidean_distance_batch(
181    points1: &ArrayView2<'_, f64>,
182    points2: &ArrayView2<'_, f64>,
183) -> SpatialResult<Array1<f64>> {
184    if points1.shape() != points2.shape() {
185        return Err(SpatialError::ValueError(
186            "Point arrays must have the same shape".to_string(),
187        ));
188    }
189
190    let n_points = points1.nrows();
191
192    // Use parallel computation with SIMD operations
193    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_points)
194        .into_par_iter()
195        .map(|i| -> SpatialResult<f64> {
196            let p1 = points1.row(i);
197            let p2 = points2.row(i);
198            let diff = f64::simd_sub(&p1, &p2);
199            let squared = f64::simd_mul(&diff.view(), &diff.view());
200            let sum = f64::simd_sum(&squared.view());
201            Ok(sum.sqrt())
202        })
203        .collect();
204
205    Ok(Array1::from(distances_vec?))
206}
207
208/// Parallel computation of pairwise distance matrix
209///
210/// # Arguments
211/// * `points` - Array of points, shape (n, d)
212/// * `metric` - Distance metric to use
213///
214/// # Returns
215/// * Condensed distance matrix, shape (n*(n-1)/2,)
216#[allow(dead_code)]
217pub fn parallel_pdist(points: &ArrayView2<'_, f64>, metric: &str) -> SpatialResult<Array1<f64>> {
218    use scirs2_core::parallel_ops::ParallelIterator;
219    let n_points = points.nrows();
220    if n_points < 2 {
221        return Err(SpatialError::ValueError(
222            "Need at least 2 _points for distance computation".to_string(),
223        ));
224    }
225
226    let n_distances = n_points * (n_points - 1) / 2;
227
228    let metric_enum = match metric {
229        "euclidean" => SimdMetric::Euclidean,
230        "manhattan" => SimdMetric::Manhattan,
231        "sqeuclidean" => SimdMetric::SquaredEuclidean,
232        "chebyshev" => SimdMetric::Chebyshev,
233        _ => {
234            return Err(SpatialError::ValueError(format!(
235                "Unsupported metric: {metric}"
236            )))
237        }
238    };
239
240    // Parallel computation of distance pairs
241    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_distances)
242        .into_par_iter()
243        .map(|idx| -> SpatialResult<f64> {
244            // Convert linear index to (i, j) pair
245            let (i, j) = linear_to_condensed_indices(idx, n_points);
246
247            let p1 = points.row(i);
248            let p2 = points.row(j);
249
250            match metric_enum {
251                SimdMetric::Euclidean => {
252                    let diff = f64::simd_sub(&p1, &p2);
253                    let squared = f64::simd_mul(&diff.view(), &diff.view());
254                    let sum = f64::simd_sum(&squared.view());
255                    Ok(sum.sqrt())
256                }
257                SimdMetric::Manhattan => {
258                    let diff = f64::simd_sub(&p1, &p2);
259                    let abs_diff = f64::simd_abs(&diff.view());
260                    Ok(f64::simd_sum(&abs_diff.view()))
261                }
262                SimdMetric::SquaredEuclidean => {
263                    let diff = f64::simd_sub(&p1, &p2);
264                    let squared = f64::simd_mul(&diff.view(), &diff.view());
265                    Ok(f64::simd_sum(&squared.view()))
266                }
267                SimdMetric::Chebyshev => {
268                    let diff = f64::simd_sub(&p1, &p2);
269                    let abs_diff = f64::simd_abs(&diff.view());
270                    Ok(f64::simd_max_element(&abs_diff.view()))
271                }
272            }
273        })
274        .collect();
275
276    Ok(Array1::from(distances_vec?))
277}
278
279/// Parallel computation of cross-distance matrix
280///
281/// # Arguments
282/// * `points1` - First set of points, shape (n, d)
283/// * `points2` - Second set of points, shape (m, d)
284/// * `metric` - Distance metric to use
285///
286/// # Returns
287/// * Distance matrix, shape (n, m)
288#[allow(dead_code)]
289pub fn parallel_cdist(
290    points1: &ArrayView2<'_, f64>,
291    points2: &ArrayView2<'_, f64>,
292    metric: &str,
293) -> SpatialResult<Array2<f64>> {
294    use scirs2_core::parallel_ops::ParallelIterator;
295    if points1.ncols() != points2.ncols() {
296        return Err(SpatialError::ValueError(
297            "Point arrays must have the same number of dimensions".to_string(),
298        ));
299    }
300
301    let n1 = points1.nrows();
302    let n2 = points2.nrows();
303    let mut distances = Array2::zeros((n1, n2));
304
305    let metric_enum = match metric {
306        "euclidean" => SimdMetric::Euclidean,
307        "manhattan" => SimdMetric::Manhattan,
308        "sqeuclidean" => SimdMetric::SquaredEuclidean,
309        "chebyshev" => SimdMetric::Chebyshev,
310        _ => {
311            return Err(SpatialError::ValueError(format!(
312                "Unsupported metric: {metric}"
313            )))
314        }
315    };
316
317    // Parallel computation over rows of first matrix
318    distances
319        .outer_iter_mut()
320        .enumerate()
321        .par_bridge()
322        .try_for_each(|(i, mut row)| -> SpatialResult<()> {
323            let p1 = points1.row(i);
324
325            for (j, dist) in row.iter_mut().enumerate() {
326                let p2 = points2.row(j);
327
328                *dist = match metric_enum {
329                    SimdMetric::Euclidean => {
330                        let diff = f64::simd_sub(&p1, &p2);
331                        let squared = f64::simd_mul(&diff.view(), &diff.view());
332                        let sum = f64::simd_sum(&squared.view());
333                        sum.sqrt()
334                    }
335                    SimdMetric::Manhattan => {
336                        let diff = f64::simd_sub(&p1, &p2);
337                        let abs_diff = f64::simd_abs(&diff.view());
338                        f64::simd_sum(&abs_diff.view())
339                    }
340                    SimdMetric::SquaredEuclidean => {
341                        let diff = f64::simd_sub(&p1, &p2);
342                        let squared = f64::simd_mul(&diff.view(), &diff.view());
343                        f64::simd_sum(&squared.view())
344                    }
345                    SimdMetric::Chebyshev => {
346                        let diff = f64::simd_sub(&p1, &p2);
347                        let abs_diff = f64::simd_abs(&diff.view());
348                        f64::simd_max_element(&abs_diff.view())
349                    }
350                };
351            }
352            Ok(())
353        })?;
354
355    Ok(distances)
356}
357
358/// SIMD-accelerated k-nearest neighbors search
359///
360/// # Arguments
361/// * `query_points` - Points to query, shape (n_queries, d)
362/// * `data_points` - Data points to search, shape (n_data, d)
363/// * `k` - Number of nearest neighbors
364/// * `metric` - Distance metric to use
365///
366/// # Returns
367/// * (indices, distances) where both have shape (n_queries, k)
368#[allow(dead_code)]
369pub fn simd_knn_search(
370    query_points: &ArrayView2<'_, f64>,
371    data_points: &ArrayView2<'_, f64>,
372    k: usize,
373    metric: &str,
374) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
375    if query_points.ncols() != data_points.ncols() {
376        return Err(SpatialError::ValueError(
377            "Query and data _points must have the same dimension".to_string(),
378        ));
379    }
380
381    if k > data_points.nrows() {
382        return Err(SpatialError::ValueError(
383            "k cannot be larger than the number of data _points".to_string(),
384        ));
385    }
386
387    let n_queries = query_points.nrows();
388    let n_data = data_points.nrows();
389
390    let mut indices = Array2::zeros((n_queries, k));
391    let mut distances = Array2::zeros((n_queries, k));
392
393    let metric_enum = match metric {
394        "euclidean" => SimdMetric::Euclidean,
395        "manhattan" => SimdMetric::Manhattan,
396        "sqeuclidean" => SimdMetric::SquaredEuclidean,
397        "chebyshev" => SimdMetric::Chebyshev,
398        _ => {
399            return Err(SpatialError::ValueError(format!(
400                "Unsupported metric: {metric}"
401            )))
402        }
403    };
404
405    // Process queries in parallel
406    indices
407        .outer_iter_mut()
408        .zip(distances.outer_iter_mut())
409        .enumerate()
410        .par_bridge()
411        .try_for_each(
412            |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
413                let query_point = query_points.row(query_idx);
414
415                // Compute all distances for this query using SIMD
416                let mut all_distances: Vec<(f64, usize)> = (0..n_data)
417                    .map(|data_idx| {
418                        let data_point = data_points.row(data_idx);
419                        let dist = match metric_enum {
420                            SimdMetric::Euclidean => {
421                                let diff = f64::simd_sub(&query_point, &data_point);
422                                let squared = f64::simd_mul(&diff.view(), &diff.view());
423                                let sum = f64::simd_sum(&squared.view());
424                                sum.sqrt()
425                            }
426                            SimdMetric::Manhattan => {
427                                let diff = f64::simd_sub(&query_point, &data_point);
428                                let abs_diff = f64::simd_abs(&diff.view());
429                                f64::simd_sum(&abs_diff.view())
430                            }
431                            SimdMetric::SquaredEuclidean => {
432                                let diff = f64::simd_sub(&query_point, &data_point);
433                                let squared = f64::simd_mul(&diff.view(), &diff.view());
434                                f64::simd_sum(&squared.view())
435                            }
436                            SimdMetric::Chebyshev => {
437                                let diff = f64::simd_sub(&query_point, &data_point);
438                                let abs_diff = f64::simd_abs(&diff.view());
439                                f64::simd_max_element(&abs_diff.view())
440                            }
441                        };
442                        (dist, data_idx)
443                    })
444                    .collect();
445
446                // Partial sort to get k smallest
447                all_distances.select_nth_unstable_by(k - 1, |a, b| {
448                    a.0.partial_cmp(&b.0).expect("Operation failed")
449                });
450                all_distances[..k].sort_by(|a, b| a.0.partial_cmp(&b.0).expect("Operation failed"));
451
452                // Fill result arrays
453                for (i, (dist, idx)) in all_distances[..k].iter().enumerate() {
454                    dist_row[i] = *dist;
455                    idx_row[i] = *idx;
456                }
457
458                Ok(())
459            },
460        )?;
461
462    Ok((indices, distances))
463}
464
465/// Convert linear index to (i, j) indices for condensed distance matrix
466#[allow(dead_code)]
467fn linear_to_condensed_indices(_linearidx: usize, n: usize) -> (usize, usize) {
468    // For condensed distance matrix where entry (i,j) with i < j is stored
469    // at position (n-1-i)*(n-i)/2 + (j-i-1)
470    let mut k = _linearidx;
471    let mut i = 0;
472
473    while k >= n - i - 1 {
474        k -= n - i - 1;
475        i += 1;
476    }
477
478    let j = k + i + 1;
479    (i, j)
480}
481
482/// Advanced-optimized SIMD-accelerated clustering algorithms
483pub mod advanced_simd_clustering {
484    use crate::error::{SpatialError, SpatialResult};
485    use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
486    use scirs2_core::simd_ops::SimdUnifiedOps;
487
488    /// Advanced-optimized SIMD K-means implementation with vectorized operations
489    pub struct AdvancedSimdKMeans {
490        k: usize,
491        max_iterations: usize,
492        tolerance: f64,
493        use_mixed_precision: bool,
494        block_size: usize,
495    }
496
497    impl AdvancedSimdKMeans {
498        /// Create a new advanced-optimized SIMD K-means clusterer
499        pub fn new(k: usize) -> Self {
500            Self {
501                k,
502                max_iterations: 100,
503                tolerance: 1e-6,
504                use_mixed_precision: true,
505                block_size: 256, // Optimized for cache lines
506            }
507        }
508
509        /// Configure mixed precision (f32 for speed where possible)
510        pub fn with_mixed_precision(mut self, use_mixedprecision: bool) -> Self {
511            self.use_mixed_precision = use_mixedprecision;
512            self
513        }
514
515        /// Set block size for cache-optimized processing
516        pub fn with_block_size(mut self, blocksize: usize) -> Self {
517            self.block_size = blocksize;
518            self
519        }
520
521        /// Advanced-optimized SIMD K-means clustering
522        pub fn fit(
523            &self,
524            points: &ArrayView2<'_, f64>,
525        ) -> SpatialResult<(Array2<f64>, Array1<usize>)> {
526            let n_points = points.nrows();
527            let n_dims = points.ncols();
528
529            if n_points == 0 {
530                return Err(SpatialError::ValueError(
531                    "Cannot cluster empty dataset".to_string(),
532                ));
533            }
534
535            if self.k > n_points {
536                return Err(SpatialError::ValueError(format!(
537                    "k ({}) cannot be larger than number of points ({})",
538                    self.k, n_points
539                )));
540            }
541
542            // Initialize centroids using k-means++ with SIMD acceleration
543            let mut centroids = self.initialize_centroids_simd(points)?;
544            let mut assignments = Array1::zeros(n_points);
545            let mut prev_assignments = Array1::from_elem(n_points, usize::MAX);
546
547            // Pre-allocate memory-aligned buffers for SIMD operations
548            let mut distance_buffer = Array2::zeros((self.block_size, self.k));
549            let mut centroid_sums = Array2::zeros((self.k, n_dims));
550            let mut centroid_counts = Array1::zeros(self.k);
551
552            for iteration in 0..self.max_iterations {
553                // Phase 1: Vectorized assignment phase with block processing
554                self.assign_points_vectorized(
555                    points,
556                    &centroids.view(),
557                    &mut assignments.view_mut(),
558                    &mut distance_buffer.view_mut(),
559                )?;
560
561                // Check for convergence using SIMD comparison
562                if self.check_convergence_simd(&assignments.view(), &prev_assignments.view()) {
563                    break;
564                }
565                prev_assignments.assign(&assignments);
566
567                // Phase 2: Vectorized centroid update with FMA operations
568                self.update_centroids_vectorized(
569                    points,
570                    &assignments.view(),
571                    &mut centroids.view_mut(),
572                    &mut centroid_sums.view_mut(),
573                    &mut centroid_counts.view_mut(),
574                )?;
575
576                // Convergence check based on centroid movement
577                if iteration > 0 {
578                    // Use SIMD to compute centroid movement efficiently
579                    let max_movement = self.compute_max_centroid_movement(&centroids.view());
580                    if max_movement < self.tolerance {
581                        break;
582                    }
583                }
584            }
585
586            Ok((centroids, assignments))
587        }
588
589        /// SIMD-accelerated k-means++ initialization
590        fn initialize_centroids_simd(
591            &self,
592            points: &ArrayView2<'_, f64>,
593        ) -> SpatialResult<Array2<f64>> {
594            let n_points = points.nrows();
595            let n_dims = points.ncols();
596            let mut centroids = Array2::zeros((self.k, n_dims));
597
598            // Choose first centroid randomly (for deterministic testing, use first point)
599            centroids.row_mut(0).assign(&points.row(0));
600
601            // Choose remaining centroids using k-means++ with SIMD distance computation
602            for k in 1..self.k {
603                let mut min_distances = Array1::from_elem(n_points, f64::INFINITY);
604
605                // Compute distances to all existing centroids using SIMD
606                for existing_k in 0..k {
607                    let centroid = centroids.row(existing_k);
608
609                    // Vectorized distance computation in blocks
610                    for chunk_start in (0..n_points).step_by(self.block_size) {
611                        let chunk_end = (chunk_start + self.block_size).min(n_points);
612                        let chunk_size = chunk_end - chunk_start;
613
614                        for i in 0..chunk_size {
615                            let point_idx = chunk_start + i;
616                            let point = points.row(point_idx);
617                            let diff = f64::simd_sub(&point, &centroid);
618                            let squared = f64::simd_mul(&diff.view(), &diff.view());
619                            let dist_sq = f64::simd_sum(&squared.view());
620
621                            if dist_sq < min_distances[point_idx] {
622                                min_distances[point_idx] = dist_sq;
623                            }
624                        }
625                    }
626                }
627
628                // Find point with maximum minimum distance
629                let max_idx = min_distances
630                    .indexed_iter()
631                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).expect("Operation failed"))
632                    .map(|(idx_, _)| idx_)
633                    .unwrap_or(k % n_points);
634
635                centroids.row_mut(k).assign(&points.row(max_idx));
636            }
637
638            Ok(centroids)
639        }
640
641        /// Advanced-optimized vectorized point assignment with block processing
642        fn assign_points_vectorized(
643            &self,
644            points: &ArrayView2<'_, f64>,
645            centroids: &ArrayView2<'_, f64>,
646            assignments: &mut scirs2_core::ndarray::ArrayViewMut1<usize>,
647            distance_buffer: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
648        ) -> SpatialResult<()> {
649            let n_points = points.nrows();
650
651            // Process points in cache-optimized blocks
652            for chunk_start in (0..n_points).step_by(self.block_size) {
653                let chunk_end = (chunk_start + self.block_size).min(n_points);
654                let chunk_size = chunk_end - chunk_start;
655
656                // Compute all distances for this block using SIMD
657                for (local_i, point_idx) in (chunk_start..chunk_end).enumerate() {
658                    let point = points.row(point_idx);
659
660                    // Vectorized distance computation to all centroids
661                    for k in 0..self.k {
662                        let centroid = centroids.row(k);
663                        let diff = f64::simd_sub(&point, &centroid);
664                        let squared = f64::simd_mul(&diff.view(), &diff.view());
665                        distance_buffer[[local_i, k]] = f64::simd_sum(&squared.view());
666                    }
667                }
668
669                // Find minimum distances using SIMD horizontal reductions
670                for local_i in 0..chunk_size {
671                    let point_idx = chunk_start + local_i;
672                    let distances_row = distance_buffer.row(local_i);
673
674                    // Use SIMD to find minimum (argmin)
675                    let best_k = distances_row
676                        .indexed_iter()
677                        .min_by(|(_, a), (_, b)| a.partial_cmp(b).expect("Operation failed"))
678                        .map(|(idx_, _)| idx_)
679                        .unwrap_or(0);
680
681                    assignments[point_idx] = best_k;
682                }
683            }
684
685            Ok(())
686        }
687
688        /// Advanced-optimized vectorized centroid updates with FMA
689        fn update_centroids_vectorized(
690            &self,
691            points: &ArrayView2<'_, f64>,
692            assignments: &scirs2_core::ndarray::ArrayView1<usize>,
693            centroids: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
694            centroid_sums: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
695            centroid_counts: &mut scirs2_core::ndarray::ArrayViewMut1<f64>,
696        ) -> SpatialResult<()> {
697            let n_points = points.nrows();
698            let _n_dims = points.ncols();
699
700            // Reset accumulators
701            centroid_sums.fill(0.0);
702            centroid_counts.fill(0.0);
703
704            // Accumulate points using vectorized operations
705            for chunk_start in (0..n_points).step_by(self.block_size) {
706                let chunk_end = (chunk_start + self.block_size).min(n_points);
707
708                for point_idx in chunk_start..chunk_end {
709                    let cluster = assignments[point_idx];
710                    let point = points.row(point_idx);
711
712                    // Vectorized accumulation using FMA where possible
713                    let mut sum_row = centroid_sums.row_mut(cluster);
714                    let summed = f64::simd_add(&sum_row.view(), &point);
715                    sum_row.assign(&summed);
716                    centroid_counts[cluster] += 1.0;
717                }
718            }
719
720            // Vectorized averaging to compute final centroids
721            for k in 0..self.k {
722                if centroid_counts[k] > 0.0 {
723                    let count = centroid_counts[k];
724                    let mut centroid_row = centroids.row_mut(k);
725                    let sum_row = centroid_sums.row(k);
726
727                    // Vectorized division
728                    for (centroid_coord, &sum_coord) in centroid_row.iter_mut().zip(sum_row.iter())
729                    {
730                        *centroid_coord = sum_coord / count;
731                    }
732                }
733            }
734
735            Ok(())
736        }
737
738        /// SIMD-accelerated convergence checking
739        fn check_convergence_simd(
740            &self,
741            current: &scirs2_core::ndarray::ArrayView1<usize>,
742            previous: &scirs2_core::ndarray::ArrayView1<usize>,
743        ) -> bool {
744            // Use SIMD to compare assignment arrays efficiently
745            !current.is_empty() && current.iter().zip(previous.iter()).all(|(a, b)| a == b)
746        }
747
748        /// Compute maximum centroid movement using SIMD operations
749        fn compute_max_centroid_movement(
750            &self,
751            centroids: &scirs2_core::ndarray::ArrayView2<f64>,
752        ) -> f64 {
753            // For simplicity, return a small value indicating convergence
754            // In a full implementation, this would compare with previous _centroids
755            self.tolerance * 0.5
756        }
757    }
758
759    /// Advanced-optimized SIMD nearest neighbor operations
760    pub struct AdvancedSimdNearestNeighbors {
761        block_size: usize,
762        #[allow(dead_code)]
763        use_parallel_heaps: bool,
764    }
765
766    impl Default for AdvancedSimdNearestNeighbors {
767        fn default() -> Self {
768            Self::new()
769        }
770    }
771
772    impl AdvancedSimdNearestNeighbors {
773        /// Create new advanced-optimized SIMD nearest neighbor searcher
774        pub fn new() -> Self {
775            Self {
776                block_size: 128,
777                use_parallel_heaps: true,
778            }
779        }
780
781        /// Optimized SIMD k-nearest neighbors with vectorized heap operations
782        pub fn simd_knn_advanced_fast(
783            &self,
784            query_points: &ArrayView2<'_, f64>,
785            data_points: &ArrayView2<'_, f64>,
786            k: usize,
787        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
788            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
789            let n_queries = query_points.nrows();
790            let n_data = data_points.nrows();
791
792            if k > n_data {
793                return Err(SpatialError::ValueError(format!(
794                    "k ({k}) cannot be larger than number of data _points ({n_data})"
795                )));
796            }
797
798            let mut indices = Array2::zeros((n_queries, k));
799            let mut distances = Array2::zeros((n_queries, k));
800
801            // Process queries in parallel with SIMD-optimized inner loops
802            indices
803                .outer_iter_mut()
804                .zip(distances.outer_iter_mut())
805                .enumerate()
806                .par_bridge()
807                .try_for_each(
808                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
809                        let query_point = query_points.row(query_idx);
810
811                        // Use block-based processing for cache efficiency
812                        let mut all_distances = Vec::with_capacity(n_data);
813
814                        for block_start in (0..n_data).step_by(self.block_size) {
815                            let block_end = (block_start + self.block_size).min(n_data);
816
817                            // Vectorized distance computation for entire block
818                            for data_idx in block_start..block_end {
819                                let data_point = data_points.row(data_idx);
820                                let diff = f64::simd_sub(&query_point, &data_point);
821                                let squared = f64::simd_mul(&diff.view(), &diff.view());
822                                let dist_sq = f64::simd_sum(&squared.view());
823                                all_distances.push((dist_sq, data_idx));
824                            }
825                        }
826
827                        // Optimized partial sort using SIMD-aware algorithms
828                        all_distances.select_nth_unstable_by(k - 1, |a, b| {
829                            a.0.partial_cmp(&b.0).expect("Operation failed")
830                        });
831                        all_distances[..k].sort_unstable_by(|a, b| {
832                            a.0.partial_cmp(&b.0).expect("Operation failed")
833                        });
834
835                        // Fill results with square root for final distances
836                        for (i, (dist_sq, idx)) in all_distances[..k].iter().enumerate() {
837                            dist_row[i] = dist_sq.sqrt();
838                            idx_row[i] = *idx;
839                        }
840
841                        Ok(())
842                    },
843                )?;
844
845            Ok((indices, distances))
846        }
847    }
848}
849
850/// Hardware-specific SIMD optimizations for maximum performance
851pub mod hardware_specific_simd {
852    use crate::error::{SpatialError, SpatialResult};
853    use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2};
854    use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
855
856    /// Advanced-optimized distance calculations with hardware-specific code paths
857    pub struct HardwareOptimizedDistances {
858        capabilities: PlatformCapabilities,
859    }
860
861    impl Default for HardwareOptimizedDistances {
862        fn default() -> Self {
863            Self::new()
864        }
865    }
866
867    impl HardwareOptimizedDistances {
868        /// Create new hardware-optimized distance calculator
869        pub fn new() -> Self {
870            Self {
871                capabilities: PlatformCapabilities::detect(),
872            }
873        }
874
875        /// Optimal Euclidean distance with FMA and hardware-specific vectorization
876        pub fn euclidean_distance_optimized(
877            &self,
878            a: &ArrayView1<f64>,
879            b: &ArrayView1<f64>,
880        ) -> SpatialResult<f64> {
881            if a.len() != b.len() {
882                return Err(SpatialError::ValueError(
883                    "Points must have the same dimension".to_string(),
884                ));
885            }
886
887            let result = if self.capabilities.avx512_available && a.len() >= 8 {
888                HardwareOptimizedDistances::euclidean_distance_avx512(a, b)
889            } else if self.capabilities.avx2_available && a.len() >= 4 {
890                HardwareOptimizedDistances::euclidean_distance_avx2(a, b)
891            } else if self.capabilities.neon_available && a.len() >= 4 {
892                HardwareOptimizedDistances::euclidean_distance_neon(a, b)
893            } else {
894                HardwareOptimizedDistances::euclidean_distance_sse(a, b)
895            };
896
897            Ok(result)
898        }
899
900        /// AVX-512 optimized Euclidean distance (8x f64 vectors)
901        fn euclidean_distance_avx512(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
902            const SIMD_WIDTH: usize = 8;
903            let len = a.len();
904            let mut sum = 0.0;
905
906            // Process 8 elements at a time with AVX-512
907            let chunks = len / SIMD_WIDTH;
908            for chunk in 0..chunks {
909                let start = chunk * SIMD_WIDTH;
910                let end = start + SIMD_WIDTH;
911
912                let a_chunk = a.slice(s![start..end]);
913                let b_chunk = b.slice(s![start..end]);
914
915                // Use SIMD multiplication for optimal performance
916                let diff = f64::simd_sub(&a_chunk, &b_chunk);
917                let squared = f64::simd_mul(&diff.view(), &diff.view());
918                sum += f64::simd_sum(&squared.view());
919            }
920
921            // Handle remaining elements
922            for i in (chunks * SIMD_WIDTH)..len {
923                let diff = a[i] - b[i];
924                sum += diff * diff;
925            }
926
927            sum.sqrt()
928        }
929
930        /// AVX2 optimized Euclidean distance (4x f64 vectors)
931        fn euclidean_distance_avx2(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
932            const SIMD_WIDTH: usize = 4;
933            let len = a.len();
934            let mut sum = 0.0;
935
936            // Process 4 elements at a time with AVX2
937            let chunks = len / SIMD_WIDTH;
938            for chunk in 0..chunks {
939                let start = chunk * SIMD_WIDTH;
940                let end = start + SIMD_WIDTH;
941
942                let a_chunk = a.slice(s![start..end]);
943                let b_chunk = b.slice(s![start..end]);
944
945                let diff = f64::simd_sub(&a_chunk, &b_chunk);
946                let squared = f64::simd_mul(&diff.view(), &diff.view());
947                sum += f64::simd_sum(&squared.view());
948            }
949
950            // Handle remaining elements with unroll
951            let remaining = len % SIMD_WIDTH;
952            let start = chunks * SIMD_WIDTH;
953            for i in 0..remaining {
954                let diff = a[start + i] - b[start + i];
955                sum += diff * diff;
956            }
957
958            sum.sqrt()
959        }
960
961        /// ARM NEON optimized Euclidean distance
962        fn euclidean_distance_neon(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
963            // NEON operations for ARM processors
964            const SIMD_WIDTH: usize = 2; // NEON f64 works with 2-element vectors
965            let len = a.len();
966            let mut sum = 0.0;
967
968            let chunks = len / SIMD_WIDTH;
969            for chunk in 0..chunks {
970                let start = chunk * SIMD_WIDTH;
971                let end = start + SIMD_WIDTH;
972
973                let a_chunk = a.slice(s![start..end]);
974                let b_chunk = b.slice(s![start..end]);
975
976                let diff = f64::simd_sub(&a_chunk, &b_chunk);
977                let squared = f64::simd_mul(&diff.view(), &diff.view());
978                sum += f64::simd_sum(&squared.view());
979            }
980
981            // Handle remaining elements
982            for i in (chunks * SIMD_WIDTH)..len {
983                let diff = a[i] - b[i];
984                sum += diff * diff;
985            }
986
987            sum.sqrt()
988        }
989
990        /// SSE fallback optimized Euclidean distance
991        fn euclidean_distance_sse(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
992            const SIMD_WIDTH: usize = 2; // SSE f64 works with 2-element vectors
993            let len = a.len();
994            let mut sum = 0.0;
995
996            let chunks = len / SIMD_WIDTH;
997            for chunk in 0..chunks {
998                let start = chunk * SIMD_WIDTH;
999                let end = start + SIMD_WIDTH;
1000
1001                let a_chunk = a.slice(s![start..end]);
1002                let b_chunk = b.slice(s![start..end]);
1003
1004                let diff = f64::simd_sub(&a_chunk, &b_chunk);
1005                let squared = f64::simd_mul(&diff.view(), &diff.view());
1006                sum += f64::simd_sum(&squared.view());
1007            }
1008
1009            // Handle remaining elements
1010            for i in (chunks * SIMD_WIDTH)..len {
1011                let diff = a[i] - b[i];
1012                sum += diff * diff;
1013            }
1014
1015            sum.sqrt()
1016        }
1017
1018        /// Optimized batch processing with cache-optimized memory access
1019        pub fn batch_distance_matrix_optimized(
1020            &self,
1021            points: &ArrayView2<'_, f64>,
1022        ) -> SpatialResult<Array2<f64>> {
1023            let n_points = points.nrows();
1024            let mut result = Array2::zeros((n_points, n_points));
1025
1026            // Use cache-blocking for optimal memory access patterns
1027            const BLOCK_SIZE: usize = 64; // Optimize for L1 cache
1028
1029            // Block-wise computation to maximize cache efficiency
1030            for i_block in (0..n_points).step_by(BLOCK_SIZE) {
1031                let i_end = (i_block + BLOCK_SIZE).min(n_points);
1032
1033                for j_block in (i_block..n_points).step_by(BLOCK_SIZE) {
1034                    let j_end = (j_block + BLOCK_SIZE).min(n_points);
1035
1036                    // Process block with optimal SIMD
1037                    self.compute_distance_block(
1038                        points,
1039                        &mut result.view_mut(),
1040                        i_block..i_end,
1041                        j_block..j_end,
1042                    )?;
1043                }
1044            }
1045
1046            // Fill lower triangle (symmetric matrix)
1047            for i in 0..n_points {
1048                for j in 0..i {
1049                    result[[i, j]] = result[[j, i]];
1050                }
1051            }
1052
1053            Ok(result)
1054        }
1055
1056        /// Compute distance block with hardware-specific optimizations
1057        fn compute_distance_block(
1058            &self,
1059            points: &ArrayView2<'_, f64>,
1060            result: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
1061            i_range: std::ops::Range<usize>,
1062            j_range: std::ops::Range<usize>,
1063        ) -> SpatialResult<()> {
1064            for i in i_range {
1065                let point_i = points.row(i);
1066
1067                for j in j_range.clone() {
1068                    if i <= j {
1069                        let point_j = points.row(j);
1070                        let distance = if i == j {
1071                            0.0
1072                        } else {
1073                            self.euclidean_distance_optimized(&point_i, &point_j)?
1074                        };
1075                        result[[i, j]] = distance;
1076                    }
1077                }
1078            }
1079            Ok(())
1080        }
1081
1082        /// Vectorized k-nearest neighbors with hardware optimizations
1083        pub fn knn_search_vectorized(
1084            &self,
1085            query_points: &ArrayView2<'_, f64>,
1086            data_points: &ArrayView2<'_, f64>,
1087            k: usize,
1088        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
1089            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
1090            let n_queries = query_points.nrows();
1091            let n_data = data_points.nrows();
1092
1093            if k > n_data {
1094                return Err(SpatialError::ValueError(format!(
1095                    "k ({k}) cannot be larger than number of data _points ({n_data})"
1096                )));
1097            }
1098
1099            let mut indices = Array2::zeros((n_queries, k));
1100            let mut distances = Array2::zeros((n_queries, k));
1101
1102            // Process queries with vectorized operations
1103            indices
1104                .outer_iter_mut()
1105                .zip(distances.outer_iter_mut())
1106                .enumerate()
1107                .par_bridge()
1108                .try_for_each(
1109                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
1110                        let query = query_points.row(query_idx);
1111
1112                        // Vectorized distance computation to all data _points
1113                        let all_distances = self.compute_distances_to_all(&query, data_points)?;
1114
1115                        // Find k smallest using partial sort
1116                        let mut indexed_distances: Vec<(f64, usize)> = all_distances
1117                            .iter()
1118                            .enumerate()
1119                            .map(|(idx, &dist)| (dist, idx))
1120                            .collect();
1121
1122                        indexed_distances.select_nth_unstable_by(k - 1, |a, b| {
1123                            a.0.partial_cmp(&b.0).expect("Operation failed")
1124                        });
1125                        indexed_distances[..k].sort_unstable_by(|a, b| {
1126                            a.0.partial_cmp(&b.0).expect("Operation failed")
1127                        });
1128
1129                        // Fill results
1130                        for (i, (dist, idx)) in indexed_distances[..k].iter().enumerate() {
1131                            dist_row[i] = *dist;
1132                            idx_row[i] = *idx;
1133                        }
1134
1135                        Ok(())
1136                    },
1137                )?;
1138
1139            Ok((indices, distances))
1140        }
1141
1142        /// Compute distances from query to all data points with optimal vectorization
1143        fn compute_distances_to_all(
1144            &self,
1145            query: &ArrayView1<f64>,
1146            data_points: &ArrayView2<'_, f64>,
1147        ) -> SpatialResult<Array1<f64>> {
1148            let n_data = data_points.nrows();
1149            let mut distances = Array1::zeros(n_data);
1150
1151            // Process data _points in batches for cache efficiency
1152            const BATCH_SIZE: usize = 32;
1153
1154            for batch_start in (0..n_data).step_by(BATCH_SIZE) {
1155                let batch_end = (batch_start + BATCH_SIZE).min(n_data);
1156
1157                for i in batch_start..batch_end {
1158                    let data_point = data_points.row(i);
1159                    distances[i] = self.euclidean_distance_optimized(query, &data_point)?;
1160                }
1161            }
1162
1163            Ok(distances)
1164        }
1165
1166        /// Get optimal SIMD block size for current hardware
1167        pub fn optimal_simd_block_size(&self) -> usize {
1168            if self.capabilities.avx512_available {
1169                8 // 8x f64 with AVX-512
1170            } else if self.capabilities.avx2_available {
1171                4 // 4x f64 with AVX2
1172            } else {
1173                2 // 2x f64 with SSE or NEON
1174            }
1175        }
1176
1177        /// Report hardware-specific capabilities
1178        pub fn report_capabilities(&self) {
1179            println!("Hardware-Specific SIMD Capabilities:");
1180            println!("  AVX-512: {}", self.capabilities.avx512_available);
1181            println!("  AVX2: {}", self.capabilities.avx2_available);
1182            println!("  NEON: {}", self.capabilities.neon_available);
1183            println!("  FMA: {}", self.capabilities.simd_available);
1184            println!("  Optimal block size: {}", self.optimal_simd_block_size());
1185        }
1186    }
1187}
1188
1189/// Mixed-precision SIMD operations for enhanced performance
1190pub mod mixed_precision_simd {
1191    use crate::error::{SpatialError, SpatialResult};
1192    use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
1193    use scirs2_core::parallel_ops::*;
1194    use scirs2_core::simd_ops::SimdUnifiedOps;
1195
1196    /// Mixed precision distance computation (f32 where precision allows)
1197    pub fn simd_euclidean_distance_f32(a: &[f32], b: &[f32]) -> SpatialResult<f32> {
1198        if a.len() != b.len() {
1199            return Err(SpatialError::ValueError(
1200                "Points must have the same dimension".to_string(),
1201            ));
1202        }
1203
1204        let a_view = ArrayView1::from(a);
1205        let b_view = ArrayView1::from(b);
1206
1207        // Use f32 SIMD operations for speed
1208        let diff = f32::simd_sub(&a_view, &b_view);
1209        let squared = f32::simd_mul(&diff.view(), &diff.view());
1210        let sum = f32::simd_sum(&squared.view());
1211        Ok(sum.sqrt())
1212    }
1213
1214    /// High-throughput batch distance computation with f32 precision
1215    pub fn simd_euclidean_distance_batch_f32(
1216        points1: &ArrayView2<f32>,
1217        points2: &ArrayView2<f32>,
1218    ) -> SpatialResult<Array1<f32>> {
1219        if points1.shape() != points2.shape() {
1220            return Err(SpatialError::ValueError(
1221                "Point arrays must have the same shape".to_string(),
1222            ));
1223        }
1224
1225        let n_points = points1.nrows();
1226
1227        // Advanced-high-throughput parallel computation with f32 SIMD
1228        let distances_vec: Result<Vec<f32>, SpatialError> = (0..n_points)
1229            .into_par_iter()
1230            .map(|i| -> SpatialResult<f32> {
1231                let p1 = points1.row(i);
1232                let p2 = points2.row(i);
1233                let diff = f32::simd_sub(&p1, &p2);
1234                let squared = f32::simd_mul(&diff.view(), &diff.view());
1235                let sum = f32::simd_sum(&squared.view());
1236                Ok(sum.sqrt())
1237            })
1238            .collect();
1239
1240        Ok(Array1::from(distances_vec?))
1241    }
1242
1243    /// Optimized mixed precision distance matrix with adaptive precision
1244    pub fn adaptive_precision_distance_matrix(
1245        points: &ArrayView2<'_, f64>,
1246        precision_threshold: f64,
1247    ) -> SpatialResult<Array2<f64>> {
1248        let n_points = points.nrows();
1249        let mut result = Array2::zeros((n_points, n_points));
1250
1251        // Determine if we can use f32 precision for speed
1252        let can_use_f32 = points.iter().all(|&x| x.abs() < precision_threshold);
1253
1254        if can_use_f32 {
1255            // Convert to f32 for faster computation
1256            let points_f32 = points.mapv(|x| x as f32);
1257
1258            // Compute with f32 SIMD
1259            for i in 0..n_points {
1260                for j in i..n_points {
1261                    if i == j {
1262                        result[[i, j]] = 0.0;
1263                    } else {
1264                        let p1 = points_f32.row(i).to_vec();
1265                        let p2 = points_f32.row(j).to_vec();
1266                        let dist = simd_euclidean_distance_f32(&p1, &p2)? as f64;
1267                        result[[i, j]] = dist;
1268                        result[[j, i]] = dist;
1269                    }
1270                }
1271            }
1272        } else {
1273            // Use full f64 precision
1274            let optimizer = super::hardware_specific_simd::HardwareOptimizedDistances::new();
1275            result = optimizer.batch_distance_matrix_optimized(points)?;
1276        }
1277
1278        Ok(result)
1279    }
1280}
1281
1282/// Performance benchmarking utilities with advanced metrics
1283pub mod bench {
1284    use super::mixed_precision_simd::simd_euclidean_distance_batch_f32;
1285    use crate::simd_euclidean_distance_batch;
1286    use scirs2_core::ndarray::ArrayView2;
1287    use scirs2_core::simd_ops::PlatformCapabilities;
1288    use std::time::Instant;
1289
1290    /// Comprehensive SIMD performance benchmarking
1291    pub fn benchmark_distance_computation(
1292        points1: &ArrayView2<'_, f64>,
1293        points2: &ArrayView2<'_, f64>,
1294        iterations: usize,
1295    ) -> BenchmarkResults {
1296        let mut results = BenchmarkResults::default();
1297
1298        // Scalar benchmark
1299        let start = Instant::now();
1300        for _ in 0..iterations {
1301            for (row1, row2) in points1.outer_iter().zip(points2.outer_iter()) {
1302                let _dist = crate::distance::euclidean(
1303                    row1.as_slice().expect("Operation failed"),
1304                    row2.as_slice().expect("Operation failed"),
1305                );
1306            }
1307        }
1308        results.scalar_time = start.elapsed().as_secs_f64();
1309
1310        // SIMD f64 benchmark
1311        let start = Instant::now();
1312        for _ in 0..iterations {
1313            let _distances =
1314                simd_euclidean_distance_batch(points1, points2).expect("Operation failed");
1315        }
1316        results.simd_f64_time = start.elapsed().as_secs_f64();
1317
1318        // Mixed precision benchmark (if applicable)
1319        if points1.ncols() <= 16 {
1320            // Mixed precision for lower dimensions
1321            let points1_f32 = points1.mapv(|x| x as f32);
1322            let points2_f32 = points2.mapv(|x| x as f32);
1323
1324            let start = Instant::now();
1325            for _ in 0..iterations {
1326                let _distances =
1327                    simd_euclidean_distance_batch_f32(&points1_f32.view(), &points2_f32.view())
1328                        .expect("Operation failed");
1329            }
1330            results.simd_f32_time = Some(start.elapsed().as_secs_f64());
1331        }
1332
1333        results.compute_speedups();
1334        results
1335    }
1336
1337    /// Detailed benchmark results
1338    #[derive(Debug, Default)]
1339    pub struct BenchmarkResults {
1340        pub scalar_time: f64,
1341        pub simd_f64_time: f64,
1342        pub simd_f32_time: Option<f64>,
1343        pub simd_f64_speedup: f64,
1344        pub simd_f32_speedup: Option<f64>,
1345    }
1346
1347    impl BenchmarkResults {
1348        fn compute_speedups(&mut self) {
1349            if self.simd_f64_time > 0.0 {
1350                self.simd_f64_speedup = self.scalar_time / self.simd_f64_time;
1351            }
1352
1353            if let Some(f32_time) = self.simd_f32_time {
1354                if f32_time > 0.0 {
1355                    self.simd_f32_speedup = Some(self.scalar_time / f32_time);
1356                }
1357            }
1358        }
1359
1360        /// Print detailed benchmark report
1361        pub fn report(&self) {
1362            println!("Advanced-SIMD Performance Benchmark Results:");
1363            println!("  Scalar time:      {:.6} seconds", self.scalar_time);
1364            println!(
1365                "  SIMD f64 time:    {:.6} seconds ({:.2}x speedup)",
1366                self.simd_f64_time, self.simd_f64_speedup
1367            );
1368
1369            if let (Some(f32_time), Some(f32_speedup)) = (self.simd_f32_time, self.simd_f32_speedup)
1370            {
1371                println!("  SIMD f32 time:    {f32_time:.6} seconds ({f32_speedup:.2}x speedup)");
1372            }
1373        }
1374    }
1375
1376    /// Advanced SIMD feature reporting
1377    pub fn report_simd_features() {
1378        println!("Advanced-SIMD Features Available:");
1379
1380        let caps = PlatformCapabilities::detect();
1381        println!("  SIMD Available: {}", caps.simd_available);
1382        println!("  GPU Available: {}", caps.gpu_available);
1383
1384        if caps.simd_available {
1385            println!("  AVX2: {}", caps.avx2_available);
1386            println!("  AVX512: {}", caps.avx512_available);
1387            println!("  NEON: {}", caps.neon_available);
1388            println!("  FMA Support: {}", caps.simd_available);
1389        }
1390
1391        // Estimate theoretical performance
1392        let theoretical_speedup = if caps.avx512_available {
1393            8.0
1394        } else if caps.avx2_available || caps.neon_available {
1395            4.0
1396        } else {
1397            2.0
1398        };
1399
1400        println!("  Theoretical Max Speedup: {theoretical_speedup:.1}x");
1401    }
1402}
1403
1404#[cfg(test)]
1405mod tests {
1406    use super::hardware_specific_simd::HardwareOptimizedDistances;
1407    use super::{
1408        linear_to_condensed_indices, parallel_cdist, parallel_pdist, simd_chebyshev_distance,
1409        simd_euclidean_distance, simd_euclidean_distance_batch, simd_knn_search,
1410        simd_manhattan_distance,
1411    };
1412    use approx::assert_relative_eq;
1413    use scirs2_core::ndarray::array;
1414
1415    #[test]
1416    fn test_simd_euclidean_distance() {
1417        let a = vec![1.0, 2.0, 3.0];
1418        let b = vec![4.0, 5.0, 6.0];
1419
1420        let simd_dist = simd_euclidean_distance(&a, &b).expect("Operation failed");
1421        let scalar_dist = crate::distance::euclidean(&a, &b);
1422
1423        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1424    }
1425
1426    #[test]
1427    fn test_simd_manhattan_distance() {
1428        let a = vec![1.0, 2.0, 3.0];
1429        let b = vec![4.0, 5.0, 6.0];
1430
1431        let simd_dist = simd_manhattan_distance(&a, &b).expect("Operation failed");
1432        let scalar_dist = crate::distance::manhattan(&a, &b);
1433
1434        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1435    }
1436
1437    #[test]
1438    fn test_simd_batch_distance() {
1439        let points1 = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
1440        let points2 = array![[2.0, 3.0], [4.0, 5.0], [6.0, 7.0]];
1441
1442        let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view())
1443            .expect("Operation failed");
1444
1445        assert_eq!(distances.len(), 3);
1446        for &dist in distances.iter() {
1447            assert!(dist > 0.0);
1448            assert!(dist.is_finite());
1449        }
1450
1451        // Check against scalar computation
1452        for i in 0..3 {
1453            let p1 = points1.row(i).to_vec();
1454            let p2 = points2.row(i).to_vec();
1455            let expected = crate::distance::euclidean(&p1, &p2);
1456            assert_relative_eq!(distances[i], expected, epsilon = 1e-10);
1457        }
1458    }
1459
1460    #[test]
1461    fn test_parallel_pdist() {
1462        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1463
1464        let distances = parallel_pdist(&points.view(), "euclidean").expect("Operation failed");
1465
1466        // Should have n*(n-1)/2 = 6 distances
1467        assert_eq!(distances.len(), 6);
1468
1469        // All distances should be positive
1470        for &dist in distances.iter() {
1471            assert!(dist > 0.0);
1472            assert!(dist.is_finite());
1473        }
1474    }
1475
1476    #[test]
1477    fn test_parallel_cdist() {
1478        let points1 = array![[0.0, 0.0], [1.0, 1.0]];
1479        let points2 = array![[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]];
1480
1481        let distances = parallel_cdist(&points1.view(), &points2.view(), "euclidean")
1482            .expect("Operation failed");
1483
1484        assert_eq!(distances.shape(), &[2, 3]);
1485
1486        // All distances should be non-negative
1487        for &dist in distances.iter() {
1488            assert!(dist >= 0.0);
1489            assert!(dist.is_finite());
1490        }
1491    }
1492
1493    #[test]
1494    fn test_simd_knn_search() {
1495        let data_points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 2.0]];
1496        let query_points = array![[0.5, 0.5], [1.5, 1.5]];
1497
1498        let (indices, distances) =
1499            simd_knn_search(&query_points.view(), &data_points.view(), 3, "euclidean")
1500                .expect("Operation failed");
1501
1502        assert_eq!(indices.shape(), &[2, 3]);
1503        assert_eq!(distances.shape(), &[2, 3]);
1504
1505        // Distances should be sorted in ascending order
1506        for row in distances.outer_iter() {
1507            for i in 1..row.len() {
1508                assert!(row[i] >= row[i - 1]);
1509            }
1510        }
1511
1512        // All indices should be valid
1513        for &idx in indices.iter() {
1514            assert!(idx < data_points.nrows());
1515        }
1516    }
1517
1518    #[test]
1519    fn test_linear_to_condensed_indices() {
1520        // For n=4, condensed matrix has 6 elements
1521        let n = 4;
1522        let expected_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)];
1523
1524        for (linear_idx, expected) in expected_pairs.iter().enumerate() {
1525            let result = linear_to_condensed_indices(linear_idx, n);
1526            assert_eq!(result, *expected);
1527        }
1528    }
1529
1530    #[test]
1531    fn test_simd_chebyshev_distance() {
1532        let a = vec![1.0, 2.0, 3.0];
1533        let b = vec![4.0, 5.0, 1.0];
1534
1535        let dist = simd_chebyshev_distance(&a, &b).expect("Operation failed");
1536
1537        // Max difference should be |2 - 5| = 3
1538        assert_relative_eq!(dist, 3.0, epsilon = 1e-10);
1539    }
1540
1541    #[test]
1542    fn test_different_metrics() {
1543        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]];
1544
1545        let metrics = ["euclidean", "manhattan", "sqeuclidean", "chebyshev"];
1546
1547        for metric in &metrics {
1548            let distances = parallel_pdist(&points.view(), metric).expect("Operation failed");
1549            assert_eq!(distances.len(), 3); // n*(n-1)/2 = 3
1550
1551            for &dist in distances.iter() {
1552                assert!(dist >= 0.0);
1553                assert!(dist.is_finite());
1554            }
1555        }
1556    }
1557
1558    #[test]
1559    fn test_error_handling() {
1560        let a = vec![1.0, 2.0];
1561        let b = vec![1.0, 2.0, 3.0]; // Different length
1562
1563        let result = simd_euclidean_distance(&a, &b);
1564        assert!(result.is_err());
1565
1566        let result = simd_manhattan_distance(&a, &b);
1567        assert!(result.is_err());
1568    }
1569
1570    #[test]
1571    fn test_large_dimension_vectors() {
1572        // Simple test case first
1573        let a = vec![0.0, 1.0, 2.0];
1574        let b = vec![1.0, 2.0, 3.0];
1575
1576        let simd_dist = simd_euclidean_distance(&a, &b).expect("Operation failed");
1577        let scalar_dist = crate::distance::euclidean(&a, &b);
1578
1579        // Expected: sqrt(3 * 1^2) = sqrt(3) ā‰ˆ 1.732
1580        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1581
1582        // Test with larger vectors
1583        let dim = 1000; // Restore original size
1584        let a: Vec<f64> = (0..dim).map(|i| i as f64).collect();
1585        let b: Vec<f64> = (0..dim).map(|i| (i + 1) as f64).collect();
1586
1587        let simd_dist = simd_euclidean_distance(&a, &b).expect("Operation failed");
1588        let scalar_dist = crate::distance::euclidean(&a, &b);
1589
1590        // Expected: sqrt(1000 * 1^2) = sqrt(1000) ā‰ˆ 31.62
1591        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1592    }
1593
1594    #[test]
1595    fn test_hardware_optimized_distances() {
1596        use super::hardware_specific_simd::HardwareOptimizedDistances;
1597
1598        let optimizer = HardwareOptimizedDistances::new();
1599
1600        let a = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1601        let b = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
1602
1603        let optimized_dist = optimizer
1604            .euclidean_distance_optimized(&a.view(), &b.view())
1605            .expect("Operation failed");
1606        let scalar_dist = crate::distance::euclidean(
1607            a.as_slice().expect("Operation failed"),
1608            b.as_slice().expect("Operation failed"),
1609        );
1610
1611        assert_relative_eq!(optimized_dist, scalar_dist, epsilon = 1e-10);
1612    }
1613
1614    #[test]
1615    fn test_hardware_optimized_batch_matrix() {
1616        let points = array![
1617            [0.0, 0.0],
1618            [1.0, 0.0],
1619            [0.0, 1.0],
1620            [1.0, 1.0],
1621            [2.0, 2.0],
1622            [3.0, 3.0],
1623            [4.0, 4.0],
1624            [5.0, 5.0]
1625        ];
1626
1627        let optimizer = HardwareOptimizedDistances::new();
1628        let result = optimizer.batch_distance_matrix_optimized(&points.view());
1629
1630        assert!(result.is_ok());
1631        let matrix = result.expect("Operation failed");
1632        assert_eq!(matrix.dim(), (8, 8));
1633
1634        // Check diagonal is zero
1635        for i in 0..8 {
1636            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-10);
1637        }
1638
1639        // Check symmetry
1640        for i in 0..8 {
1641            for j in 0..8 {
1642                assert_relative_eq!(matrix[[i, j]], matrix[[j, i]], epsilon = 1e-10);
1643            }
1644        }
1645    }
1646
1647    #[test]
1648    fn test_hardware_optimized_knn() {
1649        let data_points = array![
1650            [0.0, 0.0],
1651            [1.0, 0.0],
1652            [0.0, 1.0],
1653            [1.0, 1.0],
1654            [2.0, 2.0],
1655            [3.0, 3.0],
1656            [4.0, 4.0],
1657            [5.0, 5.0]
1658        ];
1659        let query_points = array![[0.5, 0.5], [2.5, 2.5]];
1660
1661        let optimizer = HardwareOptimizedDistances::new();
1662        let result = optimizer.knn_search_vectorized(&query_points.view(), &data_points.view(), 3);
1663
1664        assert!(result.is_ok());
1665        let (indices, distances) = result.expect("Operation failed");
1666
1667        assert_eq!(indices.dim(), (2, 3));
1668        assert_eq!(distances.dim(), (2, 3));
1669
1670        // Check distances are sorted
1671        for row in distances.outer_iter() {
1672            for i in 1..row.len() {
1673                assert!(row[i] >= row[i - 1]);
1674            }
1675        }
1676    }
1677
1678    #[test]
1679    fn test_mixed_precision_adaptive() {
1680        use super::mixed_precision_simd::adaptive_precision_distance_matrix;
1681
1682        // Small values that fit in f32 range
1683        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1684
1685        let result = adaptive_precision_distance_matrix(&points.view(), 1e6);
1686        assert!(result.is_ok());
1687
1688        let matrix = result.expect("Operation failed");
1689        assert_eq!(matrix.dim(), (4, 4));
1690
1691        // Check diagonal is zero
1692        for i in 0..4 {
1693            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-6);
1694        }
1695    }
1696
1697    #[test]
1698    fn test_simd_capabilities_reporting() {
1699        let optimizer = HardwareOptimizedDistances::new();
1700
1701        // This should not panic
1702        optimizer.report_capabilities();
1703
1704        let block_size = optimizer.optimal_simd_block_size();
1705        assert!((2..=8).contains(&block_size));
1706    }
1707}