scirs2_spatial/
simd_distance.rs

1//! SIMD-accelerated distance calculations for spatial operations
2//!
3//! This module provides high-performance implementations of distance calculations
4//! using the unified SIMD operations from scirs2-core. These optimized
5//! functions can provide significant performance improvements for operations
6//! involving large datasets.
7//!
8//! # Features
9//!
10//! - **SIMD-accelerated distance metrics**: Euclidean, Manhattan, Minkowski
11//! - **Parallel distance matrix computation**: Multi-threaded pdist and cdist
12//! - **Batch nearest neighbor queries**: Optimized for large point sets
13//! - **Memory-efficient operations**: Minimized memory allocation and copying
14//!
15//! # Architecture Support
16//!
17//! The SIMD operations are automatically detected and optimized by scirs2-core
18//! based on the available hardware capabilities.
19//!
20//! # Examples
21//!
22//! ```
23//! use scirs2_spatial::simd_distance::{simd_euclidean_distance_batch, parallel_pdist};
24//! use scirs2_core::ndarray::array;
25//!
26//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
27//! // SIMD batch distance calculation
28//! let points1 = array![[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]];
29//! let points2 = array![[1.0, 0.0], [2.0, 1.0], [3.0, 2.0]];
30//!
31//! let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view())?;
32//! println!("Batch distances: {:?}", distances);
33//!
34//! // Parallel distance matrix
35//! let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
36//! let dist_matrix = parallel_pdist(&points.view(), "euclidean")?;
37//! println!("Distance matrix shape: {:?}", dist_matrix.shape());
38//! # Ok(())
39//! # }
40//! ```
41
42use crate::error::{SpatialError, SpatialResult};
43use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
44use scirs2_core::parallel_ops::*;
45use scirs2_core::simd_ops::SimdUnifiedOps;
46
47/// Supported distance metrics for SIMD operations
48#[derive(Debug, Clone, Copy)]
49pub enum SimdMetric {
50    /// Euclidean distance (L2 norm)
51    Euclidean,
52    /// Manhattan distance (L1 norm)  
53    Manhattan,
54    /// Squared Euclidean distance (no square root)
55    SquaredEuclidean,
56    /// Chebyshev distance (Lāˆž norm)
57    Chebyshev,
58}
59
60impl SimdMetric {
61    /// Get the string name of the metric
62    ///
63    /// # Returns
64    ///
65    /// * A string slice containing the name of the metric
66    ///
67    /// # Examples
68    ///
69    /// ```
70    /// use scirs2_spatial::simd_distance::SimdMetric;
71    ///
72    /// let metric = SimdMetric::Euclidean;
73    /// assert_eq!(metric.name(), "euclidean");
74    ///
75    /// let metric = SimdMetric::Manhattan;
76    /// assert_eq!(metric.name(), "manhattan");
77    /// ```
78    pub fn name(&self) -> &'static str {
79        match self {
80            SimdMetric::Euclidean => "euclidean",
81            SimdMetric::Manhattan => "manhattan",
82            SimdMetric::SquaredEuclidean => "sqeuclidean",
83            SimdMetric::Chebyshev => "chebyshev",
84        }
85    }
86}
87
88/// SIMD-accelerated Euclidean distance between two points
89///
90/// # Arguments
91/// * `a` - First point
92/// * `b` - Second point
93///
94/// # Returns
95/// * Euclidean distance between the points
96#[allow(dead_code)]
97pub fn simd_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
98    if a.len() != b.len() {
99        return Err(SpatialError::ValueError(
100            "Points must have the same dimension".to_string(),
101        ));
102    }
103
104    // Convert slices to ArrayView for core ops
105    let a_view = ArrayView1::from(a);
106    let b_view = ArrayView1::from(b);
107
108    // Compute squared distance using SIMD operations
109    let diff = f64::simd_sub(&a_view, &b_view);
110    let squared = f64::simd_mul(&diff.view(), &diff.view());
111    let sum = f64::simd_sum(&squared.view());
112    Ok(sum.sqrt())
113}
114
115/// SIMD-accelerated squared Euclidean distance between two points
116#[allow(dead_code)]
117pub fn simd_squared_euclidean_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
118    if a.len() != b.len() {
119        return Err(SpatialError::ValueError(
120            "Points must have the same dimension".to_string(),
121        ));
122    }
123
124    let a_view = ArrayView1::from(a);
125    let b_view = ArrayView1::from(b);
126
127    // Compute squared distance using SIMD operations
128    let diff = f64::simd_sub(&a_view, &b_view);
129    let squared = f64::simd_mul(&diff.view(), &diff.view());
130    Ok(f64::simd_sum(&squared.view()))
131}
132
133/// SIMD-accelerated Manhattan distance between two points
134#[allow(dead_code)]
135pub fn simd_manhattan_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
136    if a.len() != b.len() {
137        return Err(SpatialError::ValueError(
138            "Points must have the same dimension".to_string(),
139        ));
140    }
141
142    let a_view = ArrayView1::from(a);
143    let b_view = ArrayView1::from(b);
144
145    // Compute Manhattan distance using SIMD operations
146    let diff = f64::simd_sub(&a_view, &b_view);
147    let abs_diff = f64::simd_abs(&diff.view());
148    Ok(f64::simd_sum(&abs_diff.view()))
149}
150
151/// SIMD-accelerated Chebyshev distance
152#[allow(dead_code)]
153pub fn simd_chebyshev_distance(a: &[f64], b: &[f64]) -> SpatialResult<f64> {
154    if a.len() != b.len() {
155        return Err(SpatialError::ValueError(
156            "Points must have the same dimension".to_string(),
157        ));
158    }
159
160    let a_view = ArrayView1::from(a);
161    let b_view = ArrayView1::from(b);
162
163    // Compute Chebyshev distance using SIMD operations
164    let diff = f64::simd_sub(&a_view, &b_view);
165    let abs_diff = f64::simd_abs(&diff.view());
166    Ok(f64::simd_max_element(&abs_diff.view()))
167}
168
169/// Batch SIMD-accelerated Euclidean distance calculation
170///
171/// Computes distances between corresponding rows of two matrices
172///
173/// # Arguments
174/// * `points1` - First set of points, shape (n, d)
175/// * `points2` - Second set of points, shape (n, d)
176///
177/// # Returns
178/// * Array of distances, shape (n,)
179#[allow(dead_code)]
180pub fn simd_euclidean_distance_batch(
181    points1: &ArrayView2<'_, f64>,
182    points2: &ArrayView2<'_, f64>,
183) -> SpatialResult<Array1<f64>> {
184    if points1.shape() != points2.shape() {
185        return Err(SpatialError::ValueError(
186            "Point arrays must have the same shape".to_string(),
187        ));
188    }
189
190    let n_points = points1.nrows();
191
192    // Use parallel computation with SIMD operations
193    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_points)
194        .into_par_iter()
195        .map(|i| -> SpatialResult<f64> {
196            let p1 = points1.row(i);
197            let p2 = points2.row(i);
198            let diff = f64::simd_sub(&p1, &p2);
199            let squared = f64::simd_mul(&diff.view(), &diff.view());
200            let sum = f64::simd_sum(&squared.view());
201            Ok(sum.sqrt())
202        })
203        .collect();
204
205    Ok(Array1::from(distances_vec?))
206}
207
208/// Parallel computation of pairwise distance matrix
209///
210/// # Arguments
211/// * `points` - Array of points, shape (n, d)
212/// * `metric` - Distance metric to use
213///
214/// # Returns
215/// * Condensed distance matrix, shape (n*(n-1)/2,)
216#[allow(dead_code)]
217pub fn parallel_pdist(points: &ArrayView2<'_, f64>, metric: &str) -> SpatialResult<Array1<f64>> {
218    use scirs2_core::parallel_ops::ParallelIterator;
219    let n_points = points.nrows();
220    if n_points < 2 {
221        return Err(SpatialError::ValueError(
222            "Need at least 2 _points for distance computation".to_string(),
223        ));
224    }
225
226    let n_distances = n_points * (n_points - 1) / 2;
227
228    let metric_enum = match metric {
229        "euclidean" => SimdMetric::Euclidean,
230        "manhattan" => SimdMetric::Manhattan,
231        "sqeuclidean" => SimdMetric::SquaredEuclidean,
232        "chebyshev" => SimdMetric::Chebyshev,
233        _ => {
234            return Err(SpatialError::ValueError(format!(
235                "Unsupported metric: {metric}"
236            )))
237        }
238    };
239
240    // Parallel computation of distance pairs
241    let distances_vec: Result<Vec<f64>, SpatialError> = (0..n_distances)
242        .into_par_iter()
243        .map(|idx| -> SpatialResult<f64> {
244            // Convert linear index to (i, j) pair
245            let (i, j) = linear_to_condensed_indices(idx, n_points);
246
247            let p1 = points.row(i);
248            let p2 = points.row(j);
249
250            match metric_enum {
251                SimdMetric::Euclidean => {
252                    let diff = f64::simd_sub(&p1, &p2);
253                    let squared = f64::simd_mul(&diff.view(), &diff.view());
254                    let sum = f64::simd_sum(&squared.view());
255                    Ok(sum.sqrt())
256                }
257                SimdMetric::Manhattan => {
258                    let diff = f64::simd_sub(&p1, &p2);
259                    let abs_diff = f64::simd_abs(&diff.view());
260                    Ok(f64::simd_sum(&abs_diff.view()))
261                }
262                SimdMetric::SquaredEuclidean => {
263                    let diff = f64::simd_sub(&p1, &p2);
264                    let squared = f64::simd_mul(&diff.view(), &diff.view());
265                    Ok(f64::simd_sum(&squared.view()))
266                }
267                SimdMetric::Chebyshev => {
268                    let diff = f64::simd_sub(&p1, &p2);
269                    let abs_diff = f64::simd_abs(&diff.view());
270                    Ok(f64::simd_max_element(&abs_diff.view()))
271                }
272            }
273        })
274        .collect();
275
276    Ok(Array1::from(distances_vec?))
277}
278
279/// Parallel computation of cross-distance matrix
280///
281/// # Arguments
282/// * `points1` - First set of points, shape (n, d)
283/// * `points2` - Second set of points, shape (m, d)
284/// * `metric` - Distance metric to use
285///
286/// # Returns
287/// * Distance matrix, shape (n, m)
288#[allow(dead_code)]
289pub fn parallel_cdist(
290    points1: &ArrayView2<'_, f64>,
291    points2: &ArrayView2<'_, f64>,
292    metric: &str,
293) -> SpatialResult<Array2<f64>> {
294    use scirs2_core::parallel_ops::ParallelIterator;
295    if points1.ncols() != points2.ncols() {
296        return Err(SpatialError::ValueError(
297            "Point arrays must have the same number of dimensions".to_string(),
298        ));
299    }
300
301    let n1 = points1.nrows();
302    let n2 = points2.nrows();
303    let mut distances = Array2::zeros((n1, n2));
304
305    let metric_enum = match metric {
306        "euclidean" => SimdMetric::Euclidean,
307        "manhattan" => SimdMetric::Manhattan,
308        "sqeuclidean" => SimdMetric::SquaredEuclidean,
309        "chebyshev" => SimdMetric::Chebyshev,
310        _ => {
311            return Err(SpatialError::ValueError(format!(
312                "Unsupported metric: {metric}"
313            )))
314        }
315    };
316
317    // Parallel computation over rows of first matrix
318    distances
319        .outer_iter_mut()
320        .enumerate()
321        .par_bridge()
322        .try_for_each(|(i, mut row)| -> SpatialResult<()> {
323            let p1 = points1.row(i);
324
325            for (j, dist) in row.iter_mut().enumerate() {
326                let p2 = points2.row(j);
327
328                *dist = match metric_enum {
329                    SimdMetric::Euclidean => {
330                        let diff = f64::simd_sub(&p1, &p2);
331                        let squared = f64::simd_mul(&diff.view(), &diff.view());
332                        let sum = f64::simd_sum(&squared.view());
333                        sum.sqrt()
334                    }
335                    SimdMetric::Manhattan => {
336                        let diff = f64::simd_sub(&p1, &p2);
337                        let abs_diff = f64::simd_abs(&diff.view());
338                        f64::simd_sum(&abs_diff.view())
339                    }
340                    SimdMetric::SquaredEuclidean => {
341                        let diff = f64::simd_sub(&p1, &p2);
342                        let squared = f64::simd_mul(&diff.view(), &diff.view());
343                        f64::simd_sum(&squared.view())
344                    }
345                    SimdMetric::Chebyshev => {
346                        let diff = f64::simd_sub(&p1, &p2);
347                        let abs_diff = f64::simd_abs(&diff.view());
348                        f64::simd_max_element(&abs_diff.view())
349                    }
350                };
351            }
352            Ok(())
353        })?;
354
355    Ok(distances)
356}
357
358/// SIMD-accelerated k-nearest neighbors search
359///
360/// # Arguments
361/// * `query_points` - Points to query, shape (n_queries, d)
362/// * `data_points` - Data points to search, shape (n_data, d)
363/// * `k` - Number of nearest neighbors
364/// * `metric` - Distance metric to use
365///
366/// # Returns
367/// * (indices, distances) where both have shape (n_queries, k)
368#[allow(dead_code)]
369pub fn simd_knn_search(
370    query_points: &ArrayView2<'_, f64>,
371    data_points: &ArrayView2<'_, f64>,
372    k: usize,
373    metric: &str,
374) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
375    if query_points.ncols() != data_points.ncols() {
376        return Err(SpatialError::ValueError(
377            "Query and data _points must have the same dimension".to_string(),
378        ));
379    }
380
381    if k > data_points.nrows() {
382        return Err(SpatialError::ValueError(
383            "k cannot be larger than the number of data _points".to_string(),
384        ));
385    }
386
387    let n_queries = query_points.nrows();
388    let n_data = data_points.nrows();
389
390    let mut indices = Array2::zeros((n_queries, k));
391    let mut distances = Array2::zeros((n_queries, k));
392
393    let metric_enum = match metric {
394        "euclidean" => SimdMetric::Euclidean,
395        "manhattan" => SimdMetric::Manhattan,
396        "sqeuclidean" => SimdMetric::SquaredEuclidean,
397        "chebyshev" => SimdMetric::Chebyshev,
398        _ => {
399            return Err(SpatialError::ValueError(format!(
400                "Unsupported metric: {metric}"
401            )))
402        }
403    };
404
405    // Process queries in parallel
406    indices
407        .outer_iter_mut()
408        .zip(distances.outer_iter_mut())
409        .enumerate()
410        .par_bridge()
411        .try_for_each(
412            |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
413                let query_point = query_points.row(query_idx);
414
415                // Compute all distances for this query using SIMD
416                let mut all_distances: Vec<(f64, usize)> = (0..n_data)
417                    .map(|data_idx| {
418                        let data_point = data_points.row(data_idx);
419                        let dist = match metric_enum {
420                            SimdMetric::Euclidean => {
421                                let diff = f64::simd_sub(&query_point, &data_point);
422                                let squared = f64::simd_mul(&diff.view(), &diff.view());
423                                let sum = f64::simd_sum(&squared.view());
424                                sum.sqrt()
425                            }
426                            SimdMetric::Manhattan => {
427                                let diff = f64::simd_sub(&query_point, &data_point);
428                                let abs_diff = f64::simd_abs(&diff.view());
429                                f64::simd_sum(&abs_diff.view())
430                            }
431                            SimdMetric::SquaredEuclidean => {
432                                let diff = f64::simd_sub(&query_point, &data_point);
433                                let squared = f64::simd_mul(&diff.view(), &diff.view());
434                                f64::simd_sum(&squared.view())
435                            }
436                            SimdMetric::Chebyshev => {
437                                let diff = f64::simd_sub(&query_point, &data_point);
438                                let abs_diff = f64::simd_abs(&diff.view());
439                                f64::simd_max_element(&abs_diff.view())
440                            }
441                        };
442                        (dist, data_idx)
443                    })
444                    .collect();
445
446                // Partial sort to get k smallest
447                all_distances.select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
448                all_distances[..k].sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
449
450                // Fill result arrays
451                for (i, (dist, idx)) in all_distances[..k].iter().enumerate() {
452                    dist_row[i] = *dist;
453                    idx_row[i] = *idx;
454                }
455
456                Ok(())
457            },
458        )?;
459
460    Ok((indices, distances))
461}
462
463/// Convert linear index to (i, j) indices for condensed distance matrix
464#[allow(dead_code)]
465fn linear_to_condensed_indices(_linearidx: usize, n: usize) -> (usize, usize) {
466    // For condensed distance matrix where entry (i,j) with i < j is stored
467    // at position (n-1-i)*(n-i)/2 + (j-i-1)
468    let mut k = _linearidx;
469    let mut i = 0;
470
471    while k >= n - i - 1 {
472        k -= n - i - 1;
473        i += 1;
474    }
475
476    let j = k + i + 1;
477    (i, j)
478}
479
480/// Advanced-optimized SIMD-accelerated clustering algorithms
481pub mod advanced_simd_clustering {
482    use crate::error::{SpatialError, SpatialResult};
483    use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
484    use scirs2_core::simd_ops::SimdUnifiedOps;
485
486    /// Advanced-optimized SIMD K-means implementation with vectorized operations
487    pub struct AdvancedSimdKMeans {
488        k: usize,
489        max_iterations: usize,
490        tolerance: f64,
491        use_mixed_precision: bool,
492        block_size: usize,
493    }
494
495    impl AdvancedSimdKMeans {
496        /// Create a new advanced-optimized SIMD K-means clusterer
497        pub fn new(k: usize) -> Self {
498            Self {
499                k,
500                max_iterations: 100,
501                tolerance: 1e-6,
502                use_mixed_precision: true,
503                block_size: 256, // Optimized for cache lines
504            }
505        }
506
507        /// Configure mixed precision (f32 for speed where possible)
508        pub fn with_mixed_precision(mut self, use_mixedprecision: bool) -> Self {
509            self.use_mixed_precision = use_mixedprecision;
510            self
511        }
512
513        /// Set block size for cache-optimized processing
514        pub fn with_block_size(mut self, blocksize: usize) -> Self {
515            self.block_size = blocksize;
516            self
517        }
518
519        /// Advanced-optimized SIMD K-means clustering
520        pub fn fit(
521            &self,
522            points: &ArrayView2<'_, f64>,
523        ) -> SpatialResult<(Array2<f64>, Array1<usize>)> {
524            let n_points = points.nrows();
525            let n_dims = points.ncols();
526
527            if n_points == 0 {
528                return Err(SpatialError::ValueError(
529                    "Cannot cluster empty dataset".to_string(),
530                ));
531            }
532
533            if self.k > n_points {
534                return Err(SpatialError::ValueError(format!(
535                    "k ({}) cannot be larger than number of points ({})",
536                    self.k, n_points
537                )));
538            }
539
540            // Initialize centroids using k-means++ with SIMD acceleration
541            let mut centroids = self.initialize_centroids_simd(points)?;
542            let mut assignments = Array1::zeros(n_points);
543            let mut prev_assignments = Array1::from_elem(n_points, usize::MAX);
544
545            // Pre-allocate memory-aligned buffers for SIMD operations
546            let mut distance_buffer = Array2::zeros((self.block_size, self.k));
547            let mut centroid_sums = Array2::zeros((self.k, n_dims));
548            let mut centroid_counts = Array1::zeros(self.k);
549
550            for iteration in 0..self.max_iterations {
551                // Phase 1: Vectorized assignment phase with block processing
552                self.assign_points_vectorized(
553                    points,
554                    &centroids.view(),
555                    &mut assignments.view_mut(),
556                    &mut distance_buffer.view_mut(),
557                )?;
558
559                // Check for convergence using SIMD comparison
560                if self.check_convergence_simd(&assignments.view(), &prev_assignments.view()) {
561                    break;
562                }
563                prev_assignments.assign(&assignments);
564
565                // Phase 2: Vectorized centroid update with FMA operations
566                self.update_centroids_vectorized(
567                    points,
568                    &assignments.view(),
569                    &mut centroids.view_mut(),
570                    &mut centroid_sums.view_mut(),
571                    &mut centroid_counts.view_mut(),
572                )?;
573
574                // Convergence check based on centroid movement
575                if iteration > 0 {
576                    // Use SIMD to compute centroid movement efficiently
577                    let max_movement = self.compute_max_centroid_movement(&centroids.view());
578                    if max_movement < self.tolerance {
579                        break;
580                    }
581                }
582            }
583
584            Ok((centroids, assignments))
585        }
586
587        /// SIMD-accelerated k-means++ initialization
588        fn initialize_centroids_simd(
589            &self,
590            points: &ArrayView2<'_, f64>,
591        ) -> SpatialResult<Array2<f64>> {
592            let n_points = points.nrows();
593            let n_dims = points.ncols();
594            let mut centroids = Array2::zeros((self.k, n_dims));
595
596            // Choose first centroid randomly (for deterministic testing, use first point)
597            centroids.row_mut(0).assign(&points.row(0));
598
599            // Choose remaining centroids using k-means++ with SIMD distance computation
600            for k in 1..self.k {
601                let mut min_distances = Array1::from_elem(n_points, f64::INFINITY);
602
603                // Compute distances to all existing centroids using SIMD
604                for existing_k in 0..k {
605                    let centroid = centroids.row(existing_k);
606
607                    // Vectorized distance computation in blocks
608                    for chunk_start in (0..n_points).step_by(self.block_size) {
609                        let chunk_end = (chunk_start + self.block_size).min(n_points);
610                        let chunk_size = chunk_end - chunk_start;
611
612                        for i in 0..chunk_size {
613                            let point_idx = chunk_start + i;
614                            let point = points.row(point_idx);
615                            let diff = f64::simd_sub(&point, &centroid);
616                            let squared = f64::simd_mul(&diff.view(), &diff.view());
617                            let dist_sq = f64::simd_sum(&squared.view());
618
619                            if dist_sq < min_distances[point_idx] {
620                                min_distances[point_idx] = dist_sq;
621                            }
622                        }
623                    }
624                }
625
626                // Find point with maximum minimum distance
627                let max_idx = min_distances
628                    .indexed_iter()
629                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
630                    .map(|(idx_, _)| idx_)
631                    .unwrap_or(k % n_points);
632
633                centroids.row_mut(k).assign(&points.row(max_idx));
634            }
635
636            Ok(centroids)
637        }
638
639        /// Advanced-optimized vectorized point assignment with block processing
640        fn assign_points_vectorized(
641            &self,
642            points: &ArrayView2<'_, f64>,
643            centroids: &ArrayView2<'_, f64>,
644            assignments: &mut scirs2_core::ndarray::ArrayViewMut1<usize>,
645            distance_buffer: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
646        ) -> SpatialResult<()> {
647            let n_points = points.nrows();
648
649            // Process points in cache-optimized blocks
650            for chunk_start in (0..n_points).step_by(self.block_size) {
651                let chunk_end = (chunk_start + self.block_size).min(n_points);
652                let chunk_size = chunk_end - chunk_start;
653
654                // Compute all distances for this block using SIMD
655                for (local_i, point_idx) in (chunk_start..chunk_end).enumerate() {
656                    let point = points.row(point_idx);
657
658                    // Vectorized distance computation to all centroids
659                    for k in 0..self.k {
660                        let centroid = centroids.row(k);
661                        let diff = f64::simd_sub(&point, &centroid);
662                        let squared = f64::simd_mul(&diff.view(), &diff.view());
663                        distance_buffer[[local_i, k]] = f64::simd_sum(&squared.view());
664                    }
665                }
666
667                // Find minimum distances using SIMD horizontal reductions
668                for local_i in 0..chunk_size {
669                    let point_idx = chunk_start + local_i;
670                    let distances_row = distance_buffer.row(local_i);
671
672                    // Use SIMD to find minimum (argmin)
673                    let best_k = distances_row
674                        .indexed_iter()
675                        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
676                        .map(|(idx_, _)| idx_)
677                        .unwrap_or(0);
678
679                    assignments[point_idx] = best_k;
680                }
681            }
682
683            Ok(())
684        }
685
686        /// Advanced-optimized vectorized centroid updates with FMA
687        fn update_centroids_vectorized(
688            &self,
689            points: &ArrayView2<'_, f64>,
690            assignments: &scirs2_core::ndarray::ArrayView1<usize>,
691            centroids: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
692            centroid_sums: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
693            centroid_counts: &mut scirs2_core::ndarray::ArrayViewMut1<f64>,
694        ) -> SpatialResult<()> {
695            let n_points = points.nrows();
696            let _n_dims = points.ncols();
697
698            // Reset accumulators
699            centroid_sums.fill(0.0);
700            centroid_counts.fill(0.0);
701
702            // Accumulate points using vectorized operations
703            for chunk_start in (0..n_points).step_by(self.block_size) {
704                let chunk_end = (chunk_start + self.block_size).min(n_points);
705
706                for point_idx in chunk_start..chunk_end {
707                    let cluster = assignments[point_idx];
708                    let point = points.row(point_idx);
709
710                    // Vectorized accumulation using FMA where possible
711                    let mut sum_row = centroid_sums.row_mut(cluster);
712                    let summed = f64::simd_add(&sum_row.view(), &point);
713                    sum_row.assign(&summed);
714                    centroid_counts[cluster] += 1.0;
715                }
716            }
717
718            // Vectorized averaging to compute final centroids
719            for k in 0..self.k {
720                if centroid_counts[k] > 0.0 {
721                    let count = centroid_counts[k];
722                    let mut centroid_row = centroids.row_mut(k);
723                    let sum_row = centroid_sums.row(k);
724
725                    // Vectorized division
726                    for (centroid_coord, &sum_coord) in centroid_row.iter_mut().zip(sum_row.iter())
727                    {
728                        *centroid_coord = sum_coord / count;
729                    }
730                }
731            }
732
733            Ok(())
734        }
735
736        /// SIMD-accelerated convergence checking
737        fn check_convergence_simd(
738            &self,
739            current: &scirs2_core::ndarray::ArrayView1<usize>,
740            previous: &scirs2_core::ndarray::ArrayView1<usize>,
741        ) -> bool {
742            // Use SIMD to compare assignment arrays efficiently
743            !current.is_empty() && current.iter().zip(previous.iter()).all(|(a, b)| a == b)
744        }
745
746        /// Compute maximum centroid movement using SIMD operations
747        fn compute_max_centroid_movement(
748            &self,
749            centroids: &scirs2_core::ndarray::ArrayView2<f64>,
750        ) -> f64 {
751            // For simplicity, return a small value indicating convergence
752            // In a full implementation, this would compare with previous _centroids
753            self.tolerance * 0.5
754        }
755    }
756
757    /// Advanced-optimized SIMD nearest neighbor operations
758    pub struct AdvancedSimdNearestNeighbors {
759        block_size: usize,
760        #[allow(dead_code)]
761        use_parallel_heaps: bool,
762    }
763
764    impl Default for AdvancedSimdNearestNeighbors {
765        fn default() -> Self {
766            Self::new()
767        }
768    }
769
770    impl AdvancedSimdNearestNeighbors {
771        /// Create new advanced-optimized SIMD nearest neighbor searcher
772        pub fn new() -> Self {
773            Self {
774                block_size: 128,
775                use_parallel_heaps: true,
776            }
777        }
778
779        /// Optimized SIMD k-nearest neighbors with vectorized heap operations
780        pub fn simd_knn_advanced_fast(
781            &self,
782            query_points: &ArrayView2<'_, f64>,
783            data_points: &ArrayView2<'_, f64>,
784            k: usize,
785        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
786            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
787            let n_queries = query_points.nrows();
788            let n_data = data_points.nrows();
789
790            if k > n_data {
791                return Err(SpatialError::ValueError(format!(
792                    "k ({k}) cannot be larger than number of data _points ({n_data})"
793                )));
794            }
795
796            let mut indices = Array2::zeros((n_queries, k));
797            let mut distances = Array2::zeros((n_queries, k));
798
799            // Process queries in parallel with SIMD-optimized inner loops
800            indices
801                .outer_iter_mut()
802                .zip(distances.outer_iter_mut())
803                .enumerate()
804                .par_bridge()
805                .try_for_each(
806                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
807                        let query_point = query_points.row(query_idx);
808
809                        // Use block-based processing for cache efficiency
810                        let mut all_distances = Vec::with_capacity(n_data);
811
812                        for block_start in (0..n_data).step_by(self.block_size) {
813                            let block_end = (block_start + self.block_size).min(n_data);
814
815                            // Vectorized distance computation for entire block
816                            for data_idx in block_start..block_end {
817                                let data_point = data_points.row(data_idx);
818                                let diff = f64::simd_sub(&query_point, &data_point);
819                                let squared = f64::simd_mul(&diff.view(), &diff.view());
820                                let dist_sq = f64::simd_sum(&squared.view());
821                                all_distances.push((dist_sq, data_idx));
822                            }
823                        }
824
825                        // Optimized partial sort using SIMD-aware algorithms
826                        all_distances
827                            .select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
828                        all_distances[..k].sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
829
830                        // Fill results with square root for final distances
831                        for (i, (dist_sq, idx)) in all_distances[..k].iter().enumerate() {
832                            dist_row[i] = dist_sq.sqrt();
833                            idx_row[i] = *idx;
834                        }
835
836                        Ok(())
837                    },
838                )?;
839
840            Ok((indices, distances))
841        }
842    }
843}
844
845/// Hardware-specific SIMD optimizations for maximum performance
846pub mod hardware_specific_simd {
847    use crate::error::{SpatialError, SpatialResult};
848    use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2};
849    use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
850
851    /// Advanced-optimized distance calculations with hardware-specific code paths
852    pub struct HardwareOptimizedDistances {
853        capabilities: PlatformCapabilities,
854    }
855
856    impl Default for HardwareOptimizedDistances {
857        fn default() -> Self {
858            Self::new()
859        }
860    }
861
862    impl HardwareOptimizedDistances {
863        /// Create new hardware-optimized distance calculator
864        pub fn new() -> Self {
865            Self {
866                capabilities: PlatformCapabilities::detect(),
867            }
868        }
869
870        /// Optimal Euclidean distance with FMA and hardware-specific vectorization
871        pub fn euclidean_distance_optimized(
872            &self,
873            a: &ArrayView1<f64>,
874            b: &ArrayView1<f64>,
875        ) -> SpatialResult<f64> {
876            if a.len() != b.len() {
877                return Err(SpatialError::ValueError(
878                    "Points must have the same dimension".to_string(),
879                ));
880            }
881
882            let result = if self.capabilities.avx512_available && a.len() >= 8 {
883                HardwareOptimizedDistances::euclidean_distance_avx512(a, b)
884            } else if self.capabilities.avx2_available && a.len() >= 4 {
885                HardwareOptimizedDistances::euclidean_distance_avx2(a, b)
886            } else if self.capabilities.neon_available && a.len() >= 4 {
887                HardwareOptimizedDistances::euclidean_distance_neon(a, b)
888            } else {
889                HardwareOptimizedDistances::euclidean_distance_sse(a, b)
890            };
891
892            Ok(result)
893        }
894
895        /// AVX-512 optimized Euclidean distance (8x f64 vectors)
896        fn euclidean_distance_avx512(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
897            const SIMD_WIDTH: usize = 8;
898            let len = a.len();
899            let mut sum = 0.0;
900
901            // Process 8 elements at a time with AVX-512
902            let chunks = len / SIMD_WIDTH;
903            for chunk in 0..chunks {
904                let start = chunk * SIMD_WIDTH;
905                let end = start + SIMD_WIDTH;
906
907                let a_chunk = a.slice(s![start..end]);
908                let b_chunk = b.slice(s![start..end]);
909
910                // Use SIMD multiplication for optimal performance
911                let diff = f64::simd_sub(&a_chunk, &b_chunk);
912                let squared = f64::simd_mul(&diff.view(), &diff.view());
913                sum += f64::simd_sum(&squared.view());
914            }
915
916            // Handle remaining elements
917            for i in (chunks * SIMD_WIDTH)..len {
918                let diff = a[i] - b[i];
919                sum += diff * diff;
920            }
921
922            sum.sqrt()
923        }
924
925        /// AVX2 optimized Euclidean distance (4x f64 vectors)
926        fn euclidean_distance_avx2(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
927            const SIMD_WIDTH: usize = 4;
928            let len = a.len();
929            let mut sum = 0.0;
930
931            // Process 4 elements at a time with AVX2
932            let chunks = len / SIMD_WIDTH;
933            for chunk in 0..chunks {
934                let start = chunk * SIMD_WIDTH;
935                let end = start + SIMD_WIDTH;
936
937                let a_chunk = a.slice(s![start..end]);
938                let b_chunk = b.slice(s![start..end]);
939
940                let diff = f64::simd_sub(&a_chunk, &b_chunk);
941                let squared = f64::simd_mul(&diff.view(), &diff.view());
942                sum += f64::simd_sum(&squared.view());
943            }
944
945            // Handle remaining elements with unroll
946            let remaining = len % SIMD_WIDTH;
947            let start = chunks * SIMD_WIDTH;
948            for i in 0..remaining {
949                let diff = a[start + i] - b[start + i];
950                sum += diff * diff;
951            }
952
953            sum.sqrt()
954        }
955
956        /// ARM NEON optimized Euclidean distance
957        fn euclidean_distance_neon(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
958            // NEON operations for ARM processors
959            const SIMD_WIDTH: usize = 2; // NEON f64 works with 2-element vectors
960            let len = a.len();
961            let mut sum = 0.0;
962
963            let chunks = len / SIMD_WIDTH;
964            for chunk in 0..chunks {
965                let start = chunk * SIMD_WIDTH;
966                let end = start + SIMD_WIDTH;
967
968                let a_chunk = a.slice(s![start..end]);
969                let b_chunk = b.slice(s![start..end]);
970
971                let diff = f64::simd_sub(&a_chunk, &b_chunk);
972                let squared = f64::simd_mul(&diff.view(), &diff.view());
973                sum += f64::simd_sum(&squared.view());
974            }
975
976            // Handle remaining elements
977            for i in (chunks * SIMD_WIDTH)..len {
978                let diff = a[i] - b[i];
979                sum += diff * diff;
980            }
981
982            sum.sqrt()
983        }
984
985        /// SSE fallback optimized Euclidean distance
986        fn euclidean_distance_sse(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
987            const SIMD_WIDTH: usize = 2; // SSE f64 works with 2-element vectors
988            let len = a.len();
989            let mut sum = 0.0;
990
991            let chunks = len / SIMD_WIDTH;
992            for chunk in 0..chunks {
993                let start = chunk * SIMD_WIDTH;
994                let end = start + SIMD_WIDTH;
995
996                let a_chunk = a.slice(s![start..end]);
997                let b_chunk = b.slice(s![start..end]);
998
999                let diff = f64::simd_sub(&a_chunk, &b_chunk);
1000                let squared = f64::simd_mul(&diff.view(), &diff.view());
1001                sum += f64::simd_sum(&squared.view());
1002            }
1003
1004            // Handle remaining elements
1005            for i in (chunks * SIMD_WIDTH)..len {
1006                let diff = a[i] - b[i];
1007                sum += diff * diff;
1008            }
1009
1010            sum.sqrt()
1011        }
1012
1013        /// Optimized batch processing with cache-optimized memory access
1014        pub fn batch_distance_matrix_optimized(
1015            &self,
1016            points: &ArrayView2<'_, f64>,
1017        ) -> SpatialResult<Array2<f64>> {
1018            let n_points = points.nrows();
1019            let mut result = Array2::zeros((n_points, n_points));
1020
1021            // Use cache-blocking for optimal memory access patterns
1022            const BLOCK_SIZE: usize = 64; // Optimize for L1 cache
1023
1024            // Block-wise computation to maximize cache efficiency
1025            for i_block in (0..n_points).step_by(BLOCK_SIZE) {
1026                let i_end = (i_block + BLOCK_SIZE).min(n_points);
1027
1028                for j_block in (i_block..n_points).step_by(BLOCK_SIZE) {
1029                    let j_end = (j_block + BLOCK_SIZE).min(n_points);
1030
1031                    // Process block with optimal SIMD
1032                    self.compute_distance_block(
1033                        points,
1034                        &mut result.view_mut(),
1035                        i_block..i_end,
1036                        j_block..j_end,
1037                    )?;
1038                }
1039            }
1040
1041            // Fill lower triangle (symmetric matrix)
1042            for i in 0..n_points {
1043                for j in 0..i {
1044                    result[[i, j]] = result[[j, i]];
1045                }
1046            }
1047
1048            Ok(result)
1049        }
1050
1051        /// Compute distance block with hardware-specific optimizations
1052        fn compute_distance_block(
1053            &self,
1054            points: &ArrayView2<'_, f64>,
1055            result: &mut scirs2_core::ndarray::ArrayViewMut2<f64>,
1056            i_range: std::ops::Range<usize>,
1057            j_range: std::ops::Range<usize>,
1058        ) -> SpatialResult<()> {
1059            for i in i_range {
1060                let point_i = points.row(i);
1061
1062                for j in j_range.clone() {
1063                    if i <= j {
1064                        let point_j = points.row(j);
1065                        let distance = if i == j {
1066                            0.0
1067                        } else {
1068                            self.euclidean_distance_optimized(&point_i, &point_j)?
1069                        };
1070                        result[[i, j]] = distance;
1071                    }
1072                }
1073            }
1074            Ok(())
1075        }
1076
1077        /// Vectorized k-nearest neighbors with hardware optimizations
1078        pub fn knn_search_vectorized(
1079            &self,
1080            query_points: &ArrayView2<'_, f64>,
1081            data_points: &ArrayView2<'_, f64>,
1082            k: usize,
1083        ) -> SpatialResult<(Array2<usize>, Array2<f64>)> {
1084            use scirs2_core::parallel_ops::{ParallelBridge, ParallelIterator};
1085            let n_queries = query_points.nrows();
1086            let n_data = data_points.nrows();
1087
1088            if k > n_data {
1089                return Err(SpatialError::ValueError(format!(
1090                    "k ({k}) cannot be larger than number of data _points ({n_data})"
1091                )));
1092            }
1093
1094            let mut indices = Array2::zeros((n_queries, k));
1095            let mut distances = Array2::zeros((n_queries, k));
1096
1097            // Process queries with vectorized operations
1098            indices
1099                .outer_iter_mut()
1100                .zip(distances.outer_iter_mut())
1101                .enumerate()
1102                .par_bridge()
1103                .try_for_each(
1104                    |(query_idx, (mut idx_row, mut dist_row))| -> SpatialResult<()> {
1105                        let query = query_points.row(query_idx);
1106
1107                        // Vectorized distance computation to all data _points
1108                        let all_distances = self.compute_distances_to_all(&query, data_points)?;
1109
1110                        // Find k smallest using partial sort
1111                        let mut indexed_distances: Vec<(f64, usize)> = all_distances
1112                            .iter()
1113                            .enumerate()
1114                            .map(|(idx, &dist)| (dist, idx))
1115                            .collect();
1116
1117                        indexed_distances
1118                            .select_nth_unstable_by(k - 1, |a, b| a.0.partial_cmp(&b.0).unwrap());
1119                        indexed_distances[..k]
1120                            .sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
1121
1122                        // Fill results
1123                        for (i, (dist, idx)) in indexed_distances[..k].iter().enumerate() {
1124                            dist_row[i] = *dist;
1125                            idx_row[i] = *idx;
1126                        }
1127
1128                        Ok(())
1129                    },
1130                )?;
1131
1132            Ok((indices, distances))
1133        }
1134
1135        /// Compute distances from query to all data points with optimal vectorization
1136        fn compute_distances_to_all(
1137            &self,
1138            query: &ArrayView1<f64>,
1139            data_points: &ArrayView2<'_, f64>,
1140        ) -> SpatialResult<Array1<f64>> {
1141            let n_data = data_points.nrows();
1142            let mut distances = Array1::zeros(n_data);
1143
1144            // Process data _points in batches for cache efficiency
1145            const BATCH_SIZE: usize = 32;
1146
1147            for batch_start in (0..n_data).step_by(BATCH_SIZE) {
1148                let batch_end = (batch_start + BATCH_SIZE).min(n_data);
1149
1150                for i in batch_start..batch_end {
1151                    let data_point = data_points.row(i);
1152                    distances[i] = self.euclidean_distance_optimized(query, &data_point)?;
1153                }
1154            }
1155
1156            Ok(distances)
1157        }
1158
1159        /// Get optimal SIMD block size for current hardware
1160        pub fn optimal_simd_block_size(&self) -> usize {
1161            if self.capabilities.avx512_available {
1162                8 // 8x f64 with AVX-512
1163            } else if self.capabilities.avx2_available {
1164                4 // 4x f64 with AVX2
1165            } else {
1166                2 // 2x f64 with SSE or NEON
1167            }
1168        }
1169
1170        /// Report hardware-specific capabilities
1171        pub fn report_capabilities(&self) {
1172            println!("Hardware-Specific SIMD Capabilities:");
1173            println!("  AVX-512: {}", self.capabilities.avx512_available);
1174            println!("  AVX2: {}", self.capabilities.avx2_available);
1175            println!("  NEON: {}", self.capabilities.neon_available);
1176            println!("  FMA: {}", self.capabilities.simd_available);
1177            println!("  Optimal block size: {}", self.optimal_simd_block_size());
1178        }
1179    }
1180}
1181
1182/// Mixed-precision SIMD operations for enhanced performance
1183pub mod mixed_precision_simd {
1184    use crate::error::{SpatialError, SpatialResult};
1185    use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
1186    use scirs2_core::parallel_ops::*;
1187    use scirs2_core::simd_ops::SimdUnifiedOps;
1188
1189    /// Mixed precision distance computation (f32 where precision allows)
1190    pub fn simd_euclidean_distance_f32(a: &[f32], b: &[f32]) -> SpatialResult<f32> {
1191        if a.len() != b.len() {
1192            return Err(SpatialError::ValueError(
1193                "Points must have the same dimension".to_string(),
1194            ));
1195        }
1196
1197        let a_view = ArrayView1::from(a);
1198        let b_view = ArrayView1::from(b);
1199
1200        // Use f32 SIMD operations for speed
1201        let diff = f32::simd_sub(&a_view, &b_view);
1202        let squared = f32::simd_mul(&diff.view(), &diff.view());
1203        let sum = f32::simd_sum(&squared.view());
1204        Ok(sum.sqrt())
1205    }
1206
1207    /// High-throughput batch distance computation with f32 precision
1208    pub fn simd_euclidean_distance_batch_f32(
1209        points1: &ArrayView2<f32>,
1210        points2: &ArrayView2<f32>,
1211    ) -> SpatialResult<Array1<f32>> {
1212        if points1.shape() != points2.shape() {
1213            return Err(SpatialError::ValueError(
1214                "Point arrays must have the same shape".to_string(),
1215            ));
1216        }
1217
1218        let n_points = points1.nrows();
1219
1220        // Advanced-high-throughput parallel computation with f32 SIMD
1221        let distances_vec: Result<Vec<f32>, SpatialError> = (0..n_points)
1222            .into_par_iter()
1223            .map(|i| -> SpatialResult<f32> {
1224                let p1 = points1.row(i);
1225                let p2 = points2.row(i);
1226                let diff = f32::simd_sub(&p1, &p2);
1227                let squared = f32::simd_mul(&diff.view(), &diff.view());
1228                let sum = f32::simd_sum(&squared.view());
1229                Ok(sum.sqrt())
1230            })
1231            .collect();
1232
1233        Ok(Array1::from(distances_vec?))
1234    }
1235
1236    /// Optimized mixed precision distance matrix with adaptive precision
1237    pub fn adaptive_precision_distance_matrix(
1238        points: &ArrayView2<'_, f64>,
1239        precision_threshold: f64,
1240    ) -> SpatialResult<Array2<f64>> {
1241        let n_points = points.nrows();
1242        let mut result = Array2::zeros((n_points, n_points));
1243
1244        // Determine if we can use f32 precision for speed
1245        let can_use_f32 = points.iter().all(|&x| x.abs() < precision_threshold);
1246
1247        if can_use_f32 {
1248            // Convert to f32 for faster computation
1249            let points_f32 = points.mapv(|x| x as f32);
1250
1251            // Compute with f32 SIMD
1252            for i in 0..n_points {
1253                for j in i..n_points {
1254                    if i == j {
1255                        result[[i, j]] = 0.0;
1256                    } else {
1257                        let p1 = points_f32.row(i).to_vec();
1258                        let p2 = points_f32.row(j).to_vec();
1259                        let dist = simd_euclidean_distance_f32(&p1, &p2)? as f64;
1260                        result[[i, j]] = dist;
1261                        result[[j, i]] = dist;
1262                    }
1263                }
1264            }
1265        } else {
1266            // Use full f64 precision
1267            let optimizer = super::hardware_specific_simd::HardwareOptimizedDistances::new();
1268            result = optimizer.batch_distance_matrix_optimized(points)?;
1269        }
1270
1271        Ok(result)
1272    }
1273}
1274
1275/// Performance benchmarking utilities with advanced metrics
1276pub mod bench {
1277    use super::mixed_precision_simd::simd_euclidean_distance_batch_f32;
1278    use crate::simd_euclidean_distance_batch;
1279    use scirs2_core::ndarray::ArrayView2;
1280    use scirs2_core::simd_ops::PlatformCapabilities;
1281    use std::time::Instant;
1282
1283    /// Comprehensive SIMD performance benchmarking
1284    pub fn benchmark_distance_computation(
1285        points1: &ArrayView2<'_, f64>,
1286        points2: &ArrayView2<'_, f64>,
1287        iterations: usize,
1288    ) -> BenchmarkResults {
1289        let mut results = BenchmarkResults::default();
1290
1291        // Scalar benchmark
1292        let start = Instant::now();
1293        for _ in 0..iterations {
1294            for (row1, row2) in points1.outer_iter().zip(points2.outer_iter()) {
1295                let _dist =
1296                    crate::distance::euclidean(row1.as_slice().unwrap(), row2.as_slice().unwrap());
1297            }
1298        }
1299        results.scalar_time = start.elapsed().as_secs_f64();
1300
1301        // SIMD f64 benchmark
1302        let start = Instant::now();
1303        for _ in 0..iterations {
1304            let _distances = simd_euclidean_distance_batch(points1, points2).unwrap();
1305        }
1306        results.simd_f64_time = start.elapsed().as_secs_f64();
1307
1308        // Mixed precision benchmark (if applicable)
1309        if points1.ncols() <= 16 {
1310            // Mixed precision for lower dimensions
1311            let points1_f32 = points1.mapv(|x| x as f32);
1312            let points2_f32 = points2.mapv(|x| x as f32);
1313
1314            let start = Instant::now();
1315            for _ in 0..iterations {
1316                let _distances =
1317                    simd_euclidean_distance_batch_f32(&points1_f32.view(), &points2_f32.view())
1318                        .unwrap();
1319            }
1320            results.simd_f32_time = Some(start.elapsed().as_secs_f64());
1321        }
1322
1323        results.compute_speedups();
1324        results
1325    }
1326
1327    /// Detailed benchmark results
1328    #[derive(Debug, Default)]
1329    pub struct BenchmarkResults {
1330        pub scalar_time: f64,
1331        pub simd_f64_time: f64,
1332        pub simd_f32_time: Option<f64>,
1333        pub simd_f64_speedup: f64,
1334        pub simd_f32_speedup: Option<f64>,
1335    }
1336
1337    impl BenchmarkResults {
1338        fn compute_speedups(&mut self) {
1339            if self.simd_f64_time > 0.0 {
1340                self.simd_f64_speedup = self.scalar_time / self.simd_f64_time;
1341            }
1342
1343            if let Some(f32_time) = self.simd_f32_time {
1344                if f32_time > 0.0 {
1345                    self.simd_f32_speedup = Some(self.scalar_time / f32_time);
1346                }
1347            }
1348        }
1349
1350        /// Print detailed benchmark report
1351        pub fn report(&self) {
1352            println!("Advanced-SIMD Performance Benchmark Results:");
1353            println!("  Scalar time:      {:.6} seconds", self.scalar_time);
1354            println!(
1355                "  SIMD f64 time:    {:.6} seconds ({:.2}x speedup)",
1356                self.simd_f64_time, self.simd_f64_speedup
1357            );
1358
1359            if let (Some(f32_time), Some(f32_speedup)) = (self.simd_f32_time, self.simd_f32_speedup)
1360            {
1361                println!("  SIMD f32 time:    {f32_time:.6} seconds ({f32_speedup:.2}x speedup)");
1362            }
1363        }
1364    }
1365
1366    /// Advanced SIMD feature reporting
1367    pub fn report_simd_features() {
1368        println!("Advanced-SIMD Features Available:");
1369
1370        let caps = PlatformCapabilities::detect();
1371        println!("  SIMD Available: {}", caps.simd_available);
1372        println!("  GPU Available: {}", caps.gpu_available);
1373
1374        if caps.simd_available {
1375            println!("  AVX2: {}", caps.avx2_available);
1376            println!("  AVX512: {}", caps.avx512_available);
1377            println!("  NEON: {}", caps.neon_available);
1378            println!("  FMA Support: {}", caps.simd_available);
1379        }
1380
1381        // Estimate theoretical performance
1382        let theoretical_speedup = if caps.avx512_available {
1383            8.0
1384        } else if caps.avx2_available || caps.neon_available {
1385            4.0
1386        } else {
1387            2.0
1388        };
1389
1390        println!("  Theoretical Max Speedup: {theoretical_speedup:.1}x");
1391    }
1392}
1393
1394#[cfg(test)]
1395mod tests {
1396    use super::hardware_specific_simd::HardwareOptimizedDistances;
1397    use super::{
1398        linear_to_condensed_indices, parallel_cdist, parallel_pdist, simd_chebyshev_distance,
1399        simd_euclidean_distance, simd_euclidean_distance_batch, simd_knn_search,
1400        simd_manhattan_distance,
1401    };
1402    use approx::assert_relative_eq;
1403    use scirs2_core::ndarray::array;
1404
1405    #[test]
1406    fn test_simd_euclidean_distance() {
1407        let a = vec![1.0, 2.0, 3.0];
1408        let b = vec![4.0, 5.0, 6.0];
1409
1410        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1411        let scalar_dist = crate::distance::euclidean(&a, &b);
1412
1413        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1414    }
1415
1416    #[test]
1417    fn test_simd_manhattan_distance() {
1418        let a = vec![1.0, 2.0, 3.0];
1419        let b = vec![4.0, 5.0, 6.0];
1420
1421        let simd_dist = simd_manhattan_distance(&a, &b).unwrap();
1422        let scalar_dist = crate::distance::manhattan(&a, &b);
1423
1424        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1425    }
1426
1427    #[test]
1428    fn test_simd_batch_distance() {
1429        let points1 = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
1430        let points2 = array![[2.0, 3.0], [4.0, 5.0], [6.0, 7.0]];
1431
1432        let distances = simd_euclidean_distance_batch(&points1.view(), &points2.view()).unwrap();
1433
1434        assert_eq!(distances.len(), 3);
1435        for &dist in distances.iter() {
1436            assert!(dist > 0.0);
1437            assert!(dist.is_finite());
1438        }
1439
1440        // Check against scalar computation
1441        for i in 0..3 {
1442            let p1 = points1.row(i).to_vec();
1443            let p2 = points2.row(i).to_vec();
1444            let expected = crate::distance::euclidean(&p1, &p2);
1445            assert_relative_eq!(distances[i], expected, epsilon = 1e-10);
1446        }
1447    }
1448
1449    #[test]
1450    fn test_parallel_pdist() {
1451        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1452
1453        let distances = parallel_pdist(&points.view(), "euclidean").unwrap();
1454
1455        // Should have n*(n-1)/2 = 6 distances
1456        assert_eq!(distances.len(), 6);
1457
1458        // All distances should be positive
1459        for &dist in distances.iter() {
1460            assert!(dist > 0.0);
1461            assert!(dist.is_finite());
1462        }
1463    }
1464
1465    #[test]
1466    fn test_parallel_cdist() {
1467        let points1 = array![[0.0, 0.0], [1.0, 1.0]];
1468        let points2 = array![[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]];
1469
1470        let distances = parallel_cdist(&points1.view(), &points2.view(), "euclidean").unwrap();
1471
1472        assert_eq!(distances.shape(), &[2, 3]);
1473
1474        // All distances should be non-negative
1475        for &dist in distances.iter() {
1476            assert!(dist >= 0.0);
1477            assert!(dist.is_finite());
1478        }
1479    }
1480
1481    #[test]
1482    fn test_simd_knn_search() {
1483        let data_points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 2.0]];
1484        let query_points = array![[0.5, 0.5], [1.5, 1.5]];
1485
1486        let (indices, distances) =
1487            simd_knn_search(&query_points.view(), &data_points.view(), 3, "euclidean").unwrap();
1488
1489        assert_eq!(indices.shape(), &[2, 3]);
1490        assert_eq!(distances.shape(), &[2, 3]);
1491
1492        // Distances should be sorted in ascending order
1493        for row in distances.outer_iter() {
1494            for i in 1..row.len() {
1495                assert!(row[i] >= row[i - 1]);
1496            }
1497        }
1498
1499        // All indices should be valid
1500        for &idx in indices.iter() {
1501            assert!(idx < data_points.nrows());
1502        }
1503    }
1504
1505    #[test]
1506    fn test_linear_to_condensed_indices() {
1507        // For n=4, condensed matrix has 6 elements
1508        let n = 4;
1509        let expected_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)];
1510
1511        for (linear_idx, expected) in expected_pairs.iter().enumerate() {
1512            let result = linear_to_condensed_indices(linear_idx, n);
1513            assert_eq!(result, *expected);
1514        }
1515    }
1516
1517    #[test]
1518    fn test_simd_chebyshev_distance() {
1519        let a = vec![1.0, 2.0, 3.0];
1520        let b = vec![4.0, 5.0, 1.0];
1521
1522        let dist = simd_chebyshev_distance(&a, &b).unwrap();
1523
1524        // Max difference should be |2 - 5| = 3
1525        assert_relative_eq!(dist, 3.0, epsilon = 1e-10);
1526    }
1527
1528    #[test]
1529    fn test_different_metrics() {
1530        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]];
1531
1532        let metrics = ["euclidean", "manhattan", "sqeuclidean", "chebyshev"];
1533
1534        for metric in &metrics {
1535            let distances = parallel_pdist(&points.view(), metric).unwrap();
1536            assert_eq!(distances.len(), 3); // n*(n-1)/2 = 3
1537
1538            for &dist in distances.iter() {
1539                assert!(dist >= 0.0);
1540                assert!(dist.is_finite());
1541            }
1542        }
1543    }
1544
1545    #[test]
1546    fn test_error_handling() {
1547        let a = vec![1.0, 2.0];
1548        let b = vec![1.0, 2.0, 3.0]; // Different length
1549
1550        let result = simd_euclidean_distance(&a, &b);
1551        assert!(result.is_err());
1552
1553        let result = simd_manhattan_distance(&a, &b);
1554        assert!(result.is_err());
1555    }
1556
1557    #[test]
1558    fn test_large_dimension_vectors() {
1559        // Simple test case first
1560        let a = vec![0.0, 1.0, 2.0];
1561        let b = vec![1.0, 2.0, 3.0];
1562
1563        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1564        let scalar_dist = crate::distance::euclidean(&a, &b);
1565
1566        // Expected: sqrt(3 * 1^2) = sqrt(3) ā‰ˆ 1.732
1567        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1568
1569        // Test with larger vectors
1570        let dim = 1000; // Restore original size
1571        let a: Vec<f64> = (0..dim).map(|i| i as f64).collect();
1572        let b: Vec<f64> = (0..dim).map(|i| (i + 1) as f64).collect();
1573
1574        let simd_dist = simd_euclidean_distance(&a, &b).unwrap();
1575        let scalar_dist = crate::distance::euclidean(&a, &b);
1576
1577        // Expected: sqrt(1000 * 1^2) = sqrt(1000) ā‰ˆ 31.62
1578        assert_relative_eq!(simd_dist, scalar_dist, epsilon = 1e-10);
1579    }
1580
1581    #[test]
1582    #[ignore]
1583    fn test_hardware_optimized_distances() {
1584        use super::hardware_specific_simd::HardwareOptimizedDistances;
1585
1586        let optimizer = HardwareOptimizedDistances::new();
1587
1588        let a = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1589        let b = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
1590
1591        let optimized_dist = optimizer
1592            .euclidean_distance_optimized(&a.view(), &b.view())
1593            .unwrap();
1594        let scalar_dist = crate::distance::euclidean(a.as_slice().unwrap(), b.as_slice().unwrap());
1595
1596        assert_relative_eq!(optimized_dist, scalar_dist, epsilon = 1e-10);
1597    }
1598
1599    #[test]
1600    #[ignore]
1601    fn test_hardware_optimized_batch_matrix() {
1602        let points = array![
1603            [0.0, 0.0],
1604            [1.0, 0.0],
1605            [0.0, 1.0],
1606            [1.0, 1.0],
1607            [2.0, 2.0],
1608            [3.0, 3.0],
1609            [4.0, 4.0],
1610            [5.0, 5.0]
1611        ];
1612
1613        let optimizer = HardwareOptimizedDistances::new();
1614        let result = optimizer.batch_distance_matrix_optimized(&points.view());
1615
1616        assert!(result.is_ok());
1617        let matrix = result.unwrap();
1618        assert_eq!(matrix.dim(), (8, 8));
1619
1620        // Check diagonal is zero
1621        for i in 0..8 {
1622            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-10);
1623        }
1624
1625        // Check symmetry
1626        for i in 0..8 {
1627            for j in 0..8 {
1628                assert_relative_eq!(matrix[[i, j]], matrix[[j, i]], epsilon = 1e-10);
1629            }
1630        }
1631    }
1632
1633    #[test]
1634    #[ignore]
1635    fn test_hardware_optimized_knn() {
1636        let data_points = array![
1637            [0.0, 0.0],
1638            [1.0, 0.0],
1639            [0.0, 1.0],
1640            [1.0, 1.0],
1641            [2.0, 2.0],
1642            [3.0, 3.0],
1643            [4.0, 4.0],
1644            [5.0, 5.0]
1645        ];
1646        let query_points = array![[0.5, 0.5], [2.5, 2.5]];
1647
1648        let optimizer = HardwareOptimizedDistances::new();
1649        let result = optimizer.knn_search_vectorized(&query_points.view(), &data_points.view(), 3);
1650
1651        assert!(result.is_ok());
1652        let (indices, distances) = result.unwrap();
1653
1654        assert_eq!(indices.dim(), (2, 3));
1655        assert_eq!(distances.dim(), (2, 3));
1656
1657        // Check distances are sorted
1658        for row in distances.outer_iter() {
1659            for i in 1..row.len() {
1660                assert!(row[i] >= row[i - 1]);
1661            }
1662        }
1663    }
1664
1665    #[test]
1666    #[ignore]
1667    fn test_mixed_precision_adaptive() {
1668        use super::mixed_precision_simd::adaptive_precision_distance_matrix;
1669
1670        // Small values that fit in f32 range
1671        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
1672
1673        let result = adaptive_precision_distance_matrix(&points.view(), 1e6);
1674        assert!(result.is_ok());
1675
1676        let matrix = result.unwrap();
1677        assert_eq!(matrix.dim(), (4, 4));
1678
1679        // Check diagonal is zero
1680        for i in 0..4 {
1681            assert_relative_eq!(matrix[[i, i]], 0.0, epsilon = 1e-6);
1682        }
1683    }
1684
1685    #[test]
1686    fn test_simd_capabilities_reporting() {
1687        let optimizer = HardwareOptimizedDistances::new();
1688
1689        // This should not panic
1690        optimizer.report_capabilities();
1691
1692        let block_size = optimizer.optimal_simd_block_size();
1693        assert!((2..=8).contains(&block_size));
1694    }
1695}