scirs2_interpolate/cache_aware.rs

//! Cache-aware algorithm implementations for high-performance interpolation
//!
//! This module provides cache-optimized versions of computationally intensive
//! interpolation algorithms. These implementations are designed to minimize
//! cache misses and maximize data locality for better performance on modern
//! CPU architectures.
//!
//! ## Key Optimizations
//!
//! - **Blocked matrix operations**: Process data in cache-friendly blocks
//! - **Loop tiling**: Restructure loops to improve temporal locality (see the sketch after this list)
//! - **Data layout optimization**: Arrange data to minimize cache misses
//! - **Prefetching strategies**: Pre-load data that will be needed soon
//! - **Memory access patterns**: Optimize for sequential and strided access
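//!
//! As an illustration of the loop-tiling idea (a generic sketch, not code from
//! this module), the index space is walked one block at a time so that each
//! tile is reused while it is still cache-resident:
//!
//! ```rust
//! let n = 256;
//! let block = 64;
//! let mut visited = 0usize;
//! for i0 in (0..n).step_by(block) {
//!     for j0 in (0..n).step_by(block) {
//!         // Work on the (i0.., j0..) tile while it is hot in cache.
//!         for _i in i0..(i0 + block).min(n) {
//!             for _j in j0..(j0 + block).min(n) {
//!                 visited += 1; // placeholder for real per-element work
//!             }
//!         }
//!     }
//! }
//! assert_eq!(visited, n * n);
//! ```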
//!
//! ## Supported Methods
//!
//! - **Cache-aware RBF interpolation**: Blocked matrix operations for large datasets
//! - **Cache-optimized B-spline evaluation**: Vectorized evaluation with prefetching
//! - **Tiled distance computations**: Block-wise distance matrix calculation
//! - **Memory-efficient nearest neighbor search**: Cache-friendly spatial indexing
//!
//! # Examples
//!
//! ```rust
//! use scirs2_core::ndarray::{Array1, Array2};
//! use scirs2_interpolate::cache_aware::{
//!     CacheAwareRBF, CacheOptimizedConfig
//! };
//!
//! // Create a cache-optimized RBF interpolator
//! let points = Array2::from_shape_vec((1000, 3), vec![0.0; 3000]).unwrap();
//! let values = Array1::from_vec(vec![1.0; 1000]);
//! let config = CacheOptimizedConfig::default();
//!
//! let rbf = CacheAwareRBF::new(points, values, config).unwrap();
//! ```
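//!
//! A minimal end-to-end sketch of fitting and evaluating, using small
//! illustrative data:
//!
//! ```rust
//! use scirs2_core::ndarray::{Array1, Array2};
//! use scirs2_interpolate::cache_aware::{CacheAwareRBF, CacheOptimizedConfig};
//!
//! let points = Array2::from_shape_vec(
//!     (4, 2),
//!     vec![0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
//! ).unwrap();
//! let values = Array1::from_vec(vec![0.0, 1.0, 1.0, 2.0]);
//!
//! let mut rbf = CacheAwareRBF::new(points, values, CacheOptimizedConfig::default()).unwrap();
//! rbf.precompute_coefficients().unwrap();
//!
//! let queries = Array2::from_shape_vec((1, 2), vec![0.5, 0.5]).unwrap();
//! let result = rbf.evaluate_cache_optimized(&queries.view()).unwrap();
//! assert_eq!(result.len(), 1);
//! ```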

use crate::advanced::rbf::RBFKernel;
use crate::cache::CacheConfig;
use crate::error::{InterpolateError, InterpolateResult};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::{Debug, Display};

/// Configuration for cache-optimized algorithms
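///
/// # Example
///
/// A custom configuration can start from the defaults (a minimal sketch;
/// `block_size` is the tile side length used by the blocked loops):
///
/// ```rust
/// use scirs2_interpolate::cache_aware::CacheOptimizedConfig;
///
/// let config = CacheOptimizedConfig {
///     block_size: 128,
///     ..CacheOptimizedConfig::default()
/// };
/// assert_eq!(config.block_size, 128);
/// ```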
#[derive(Debug, Clone)]
pub struct CacheOptimizedConfig {
    /// Block size for tiled matrix operations
    pub block_size: usize,
    /// Enable prefetching for sequential access patterns
    pub enable_prefetching: bool,
    /// Use NUMA-aware memory allocation if available
    pub numa_aware: bool,
    /// Number of worker threads for parallel operations
    pub num_threads: Option<usize>,
    /// Cache configuration for internal caching
    pub cache_config: CacheConfig,
    /// Optimize for specific CPU cache sizes (L1, L2, L3 in KB)
    pub cache_sizes: CacheSizes,
}

/// CPU cache size configuration for optimization
#[derive(Debug, Clone)]
pub struct CacheSizes {
    /// L1 cache size in KB
    pub l1_cache_kb: usize,
    /// L2 cache size in KB
    pub l2_cache_kb: usize,
    /// L3 cache size in KB
    pub l3_cache_kb: usize,
}

impl Default for CacheSizes {
    fn default() -> Self {
        Self {
            l1_cache_kb: 32,   // Typical L1 cache size
            l2_cache_kb: 256,  // Typical L2 cache size
            l3_cache_kb: 8192, // Typical L3 cache size
        }
    }
}

impl Default for CacheOptimizedConfig {
    fn default() -> Self {
        Self {
            block_size: 64, // A 64x64 tile of f64 values is 32 KB, matching a typical L1 data cache
            enable_prefetching: true,
            numa_aware: false,
            num_threads: None, // Use default thread pool
            cache_config: CacheConfig::default(),
            cache_sizes: CacheSizes::default(),
        }
    }
}

/// Cache-aware RBF interpolator optimized for large datasets
#[derive(Debug)]
pub struct CacheAwareRBF<F>
where
    F: Float + FromPrimitive + Debug + Display + Send + Sync + 'static,
{
    /// Training points organized in cache-friendly layout
    points: Array2<F>,
    /// Training values
    values: Array1<F>,
    /// RBF kernel type
    kernel: RBFKernel,
    /// Epsilon parameter for RBF
    epsilon: F,
    /// Precomputed coefficients (if available)
    coefficients: Option<Array1<F>>,
    /// Configuration
    _config: CacheOptimizedConfig,
    /// Performance statistics
    stats: CacheOptimizedStats,
}

/// Performance statistics for cache-optimized algorithms
#[derive(Debug, Default)]
pub struct CacheOptimizedStats {
    /// Number of cache-optimized evaluations performed
    pub evaluations: usize,
    /// Total time spent in blocked operations (nanoseconds)
    pub blocked_ops_time_ns: u64,
    /// Number of cache blocks processed
    pub blocks_processed: usize,
    /// Estimated cache miss rate (approximate)
    pub estimated_cache_miss_rate: f64,
    /// Memory bandwidth utilization estimate
    pub memory_bandwidth_utilization: f64,
}

impl<F> CacheAwareRBF<F>
where
    F: Float + FromPrimitive + Debug + Display + Send + Sync + 'static,
{
    /// Create a new cache-aware RBF interpolator
    ///
    /// # Arguments
    ///
    /// * `points` - Training data points (n_points × n_dims)
    /// * `values` - Training data values
    /// * `config` - Cache optimization configuration
    ///
    /// # Returns
    ///
    /// A new cache-optimized RBF interpolator
    pub fn new(
        points: Array2<F>,
        values: Array1<F>,
        config: CacheOptimizedConfig,
    ) -> InterpolateResult<Self> {
        if points.nrows() != values.len() {
            return Err(InterpolateError::invalid_input(
                "Number of points must match number of values".to_string(),
            ));
        }

        let kernel = RBFKernel::Gaussian; // Default kernel
        let epsilon = F::one(); // Default epsilon

        Ok(Self {
            points,
            values,
            kernel,
            epsilon,
            coefficients: None,
            _config: config,
            stats: CacheOptimizedStats::default(),
        })
    }

    /// Set the RBF kernel type
    pub fn with_kernel(mut self, kernel: RBFKernel) -> Self {
        self.kernel = kernel;
        self
    }

    /// Set the epsilon parameter
    pub fn with_epsilon(mut self, epsilon: F) -> Self {
        self.epsilon = epsilon;
        self
    }

    /// Precompute RBF coefficients using cache-optimized matrix operations
    ///
    /// This method uses blocked matrix operations to improve cache performance
    /// during the coefficient computation phase.
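    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative data; calling this explicitly is
    /// optional, since evaluation triggers it on first use:
    ///
    /// ```rust
    /// use scirs2_core::ndarray::{Array1, Array2};
    /// use scirs2_interpolate::cache_aware::{CacheAwareRBF, CacheOptimizedConfig};
    ///
    /// let points = Array2::from_shape_vec(
    ///     (4, 2),
    ///     vec![0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
    /// ).unwrap();
    /// let values = Array1::from_vec(vec![0.0, 1.0, 1.0, 2.0]);
    ///
    /// let mut rbf = CacheAwareRBF::new(points, values, CacheOptimizedConfig::default()).unwrap();
    /// rbf.precompute_coefficients().unwrap();
    /// ```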
    pub fn precompute_coefficients(&mut self) -> InterpolateResult<()> {
        let start_time = std::time::Instant::now();

        let n_points = self.points.nrows();
        let block_size = self._config.block_size.min(n_points);

        // Create distance matrix using blocked computation
        let distance_matrix = self.compute_blocked_distance_matrix()?;

        // Apply RBF kernel using cache-friendly operations
        let kernel_matrix = self.apply_kernel_blocked(&distance_matrix)?;

        // Solve the linear system using optimized solver
        let coefficients = self.solve_rbf_system_optimized(&kernel_matrix)?;

        self.coefficients = Some(coefficients);
        self.stats.blocks_processed += n_points.div_ceil(block_size);
        self.stats.blocked_ops_time_ns += start_time.elapsed().as_nanos() as u64;

        Ok(())
    }

    /// Compute distance matrix using cache-optimized blocked operations
    fn compute_blocked_distance_matrix(&self) -> InterpolateResult<Array2<F>> {
        let n_points = self.points.nrows();
        let n_dims = self.points.ncols();
        let block_size = self._config.block_size;

        let mut distance_matrix = Array2::zeros((n_points, n_points));

        // Use blocked computation to improve cache locality
        for i_block in (0..n_points).step_by(block_size) {
            let i_end = (i_block + block_size).min(n_points);

            for j_block in (0..n_points).step_by(block_size) {
                let j_end = (j_block + block_size).min(n_points);

                // Compute distances for this block
                for i in i_block..i_end {
                    for j in j_block..j_end {
                        if i <= j {
                            let dist = self.compute_distance_optimized(i, j, n_dims);
                            distance_matrix[[i, j]] = dist;
                            distance_matrix[[j, i]] = dist; // Symmetric matrix
                        }
                    }
                }
            }
        }

        Ok(distance_matrix)
    }

    /// Compute distance between two points with optimized memory access
    fn compute_distance_optimized(&self, i: usize, j: usize, ndims: usize) -> F {
        let mut sum_sq = F::zero();

        // Process dimensions in chunks for better cache utilization
        let chunk_size = 4; // Process 4 dimensions at a time

        for dim_chunk in (0..ndims).step_by(chunk_size) {
            let end_dim = (dim_chunk + chunk_size).min(ndims);

            for dim in dim_chunk..end_dim {
                let diff = self.points[[i, dim]] - self.points[[j, dim]];
                sum_sq = sum_sq + diff * diff;
            }
        }

        sum_sq.sqrt()
    }

    /// Apply RBF kernel to distance matrix using blocked operations
    fn apply_kernel_blocked(&self, distance_matrix: &Array2<F>) -> InterpolateResult<Array2<F>> {
        let n_points = distance_matrix.nrows();
        let block_size = self._config.block_size;
        let mut kernel_matrix = Array2::zeros((n_points, n_points));

        for i_block in (0..n_points).step_by(block_size) {
            let i_end = (i_block + block_size).min(n_points);

            for j_block in (0..n_points).step_by(block_size) {
                let j_end = (j_block + block_size).min(n_points);

                // Apply kernel to this block
                for i in i_block..i_end {
                    for j in j_block..j_end {
                        let dist = distance_matrix[[i, j]];
                        kernel_matrix[[i, j]] = self.apply_kernel_function(dist);
                    }
                }
            }
        }

        Ok(kernel_matrix)
    }

    /// Apply the RBF kernel function to a distance value
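    ///
    /// For example, with distance `r` the Gaussian branch computes
    /// `exp(-r^2 / epsilon^2)` and the multiquadric branch computes
    /// `sqrt(r^2 + epsilon^2)`, mirroring the match arms below.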
    fn apply_kernel_function(&self, distance: F) -> F {
        match self.kernel {
            RBFKernel::Gaussian => {
                let arg = -(distance * distance) / (self.epsilon * self.epsilon);
                arg.exp()
            }
            RBFKernel::Multiquadric => (distance * distance + self.epsilon * self.epsilon).sqrt(),
            RBFKernel::InverseMultiquadric => {
                F::one() / (distance * distance + self.epsilon * self.epsilon).sqrt()
            }
            RBFKernel::Linear => distance,
            RBFKernel::Cubic => distance * distance * distance,
            RBFKernel::ThinPlateSpline => {
                if distance == F::zero() {
                    F::zero()
                } else {
                    distance * distance * distance.ln()
                }
            }
            RBFKernel::Quintic => {
                let r2 = distance * distance;
                distance * r2 * r2 // r^5
            }
        }
    }

    /// Solve RBF system using cache-optimized linear algebra
    fn solve_rbf_system_optimized(
        &self,
        kernel_matrix: &Array2<F>,
    ) -> InterpolateResult<Array1<F>> {
        // For now, use a simple approach
        // In a full implementation, this would use blocked LU decomposition
        // or other cache-optimized linear algebra routines

        let n = kernel_matrix.nrows();
        let mut augmented = Array2::zeros((n, n + 1));

        // Copy kernel matrix and values vector
        for i in 0..n {
            for j in 0..n {
                augmented[[i, j]] = kernel_matrix[[i, j]];
            }
            augmented[[i, n]] = self.values[i];
        }

        // Simple Gaussian elimination (would be optimized in production)
        for k in 0..n {
            // Find pivot
            let mut max_row = k;
            for i in (k + 1)..n {
                if augmented[[i, k]].abs() > augmented[[max_row, k]].abs() {
                    max_row = i;
                }
            }

            // Swap rows
            if max_row != k {
                for j in 0..=n {
                    let temp = augmented[[k, j]];
                    augmented[[k, j]] = augmented[[max_row, j]];
                    augmented[[max_row, j]] = temp;
                }
            }

            // Eliminate column
            for i in (k + 1)..n {
                if augmented[[k, k]] != F::zero() {
                    let factor = augmented[[i, k]] / augmented[[k, k]];
                    for j in k..=n {
                        augmented[[i, j]] = augmented[[i, j]] - factor * augmented[[k, j]];
                    }
                }
            }
        }

        // Back substitution
        let mut solution = Array1::zeros(n);
        for i in (0..n).rev() {
            let mut sum = F::zero();
            for j in (i + 1)..n {
                sum = sum + augmented[[i, j]] * solution[j];
            }
            if augmented[[i, i]] != F::zero() {
                solution[i] = (augmented[[i, n]] - sum) / augmented[[i, i]];
            }
        }

        Ok(solution)
    }

    /// Evaluate the RBF interpolator at query points using cache-optimized methods
    ///
    /// # Arguments
    ///
    /// * `query_points` - Points to evaluate (n_queries × n_dims)
    ///
    /// # Returns
    ///
    /// Interpolated values at query points
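    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative data; coefficients are computed
    /// lazily on the first call if `precompute_coefficients` has not been run:
    ///
    /// ```rust
    /// use scirs2_core::ndarray::{Array1, Array2};
    /// use scirs2_interpolate::cache_aware::{CacheAwareRBF, CacheOptimizedConfig};
    ///
    /// let points = Array2::from_shape_vec(
    ///     (4, 2),
    ///     vec![0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
    /// ).unwrap();
    /// let values = Array1::from_vec(vec![0.0, 1.0, 1.0, 2.0]);
    /// let mut rbf = CacheAwareRBF::new(points, values, CacheOptimizedConfig::default()).unwrap();
    ///
    /// let queries = Array2::from_shape_vec((2, 2), vec![0.5, 0.5, 0.25, 0.75]).unwrap();
    /// let interpolated = rbf.evaluate_cache_optimized(&queries.view()).unwrap();
    /// assert_eq!(interpolated.len(), 2);
    /// assert_eq!(rbf.stats().evaluations, 2);
    /// ```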
    pub fn evaluate_cache_optimized(
        &mut self,
        query_points: &ArrayView2<F>,
    ) -> InterpolateResult<Array1<F>> {
        if self.coefficients.is_none() {
            self.precompute_coefficients()?;
        }

        let coefficients = self.coefficients.as_ref().unwrap();
        let n_queries = query_points.nrows();
        let n_points = self.points.nrows();
        let block_size = self._config.block_size;

        let mut results = Array1::zeros(n_queries);

        // Process queries in blocks for better cache performance
        for query_block in (0..n_queries).step_by(block_size) {
            let query_end = (query_block + block_size).min(n_queries);

            for query_idx in query_block..query_end {
                let query = query_points.row(query_idx);
                let mut value = F::zero();

                // Process training points in blocks
                for point_block in (0..n_points).step_by(block_size) {
                    let point_end = (point_block + block_size).min(n_points);

                    for point_idx in point_block..point_end {
                        let point = self.points.row(point_idx);
                        let distance = self.compute_query_distance(&query, &point);
                        let kernel_value = self.apply_kernel_function(distance);
                        value = value + coefficients[point_idx] * kernel_value;
                    }
                }

                results[query_idx] = value;
            }
        }

        self.stats.evaluations += n_queries;
        Ok(results)
    }

    /// Compute distance between query point and training point
    fn compute_query_distance(&self, query: &ArrayView1<F>, point: &ArrayView1<F>) -> F {
        let mut sum_sq = F::zero();
        for i in 0..query.len() {
            let diff = query[i] - point[i];
            sum_sq = sum_sq + diff * diff;
        }
        sum_sq.sqrt()
    }

    /// Get performance statistics
    pub fn stats(&self) -> &CacheOptimizedStats {
        &self.stats
    }

    /// Reset performance statistics
    pub fn reset_stats(&mut self) {
        self.stats = CacheOptimizedStats::default();
    }
}

/// Cache-aware B-spline evaluator with optimized memory access patterns
#[derive(Debug)]
pub struct CacheAwareBSpline<F>
where
    F: Float + FromPrimitive + Debug + Display + Copy + 'static,
{
    /// Knot vector
    knots: Array1<F>,
    /// Control coefficients
    coefficients: Array1<F>,
    /// Spline degree
    degree: usize,
    /// Configuration
    config: CacheOptimizedConfig,
    /// Precomputed basis function cache for common evaluation points
    #[allow(dead_code)]
    basis_cache: std::collections::HashMap<u64, Vec<F>>,
}

impl<F> CacheAwareBSpline<F>
where
    F: Float + FromPrimitive + Debug + Display + Copy + 'static,
{
    /// Create a new cache-aware B-spline
    pub fn new(
        knots: Array1<F>,
        coefficients: Array1<F>,
        degree: usize,
        config: CacheOptimizedConfig,
    ) -> InterpolateResult<Self> {
        if knots.len() < coefficients.len() + degree + 1 {
            return Err(InterpolateError::invalid_input(
                "Invalid knot vector length for given coefficients and degree".to_string(),
            ));
        }

        Ok(Self {
            knots,
            coefficients,
            degree,
            config,
            basis_cache: std::collections::HashMap::new(),
        })
    }

    /// Evaluate B-spline at multiple points using cache-optimized algorithm
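    ///
    /// # Example
    ///
    /// A minimal sketch with an illustrative quadratic spline (degree 2):
    ///
    /// ```rust
    /// use scirs2_core::ndarray::array;
    /// use scirs2_interpolate::cache_aware::{CacheAwareBSpline, CacheOptimizedConfig};
    ///
    /// let knots = array![0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 3.0, 3.0];
    /// let coefficients = array![1.0, 2.0, 3.0, 4.0, 5.0];
    /// let mut spline =
    ///     CacheAwareBSpline::new(knots, coefficients, 2, CacheOptimizedConfig::default()).unwrap();
    ///
    /// let x = array![0.5, 1.5, 2.5];
    /// let y = spline.evaluate_batch_cache_optimized(&x.view()).unwrap();
    /// assert_eq!(y.len(), 3);
    /// ```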
    pub fn evaluate_batch_cache_optimized(
        &mut self,
        x_values: &ArrayView1<F>,
    ) -> InterpolateResult<Array1<F>> {
        let n_points = x_values.len();
        let mut results = Array1::zeros(n_points);
        let block_size = self.config.block_size;

        // Process evaluation points in blocks for better cache locality
        for block_start in (0..n_points).step_by(block_size) {
            let block_end = (block_start + block_size).min(n_points);

            for i in block_start..block_end {
                let x = x_values[i];
                results[i] = self.evaluate_single_optimized(x)?;
            }
        }

        Ok(results)
    }

    /// Evaluate B-spline at a single point with cache optimization
    fn evaluate_single_optimized(&mut self, x: F) -> InterpolateResult<F> {
        // Find knot span (could be cached for repeated similar evaluations)
        let span = self.find_knot_span(x);

        // Compute basis functions for this span
        let basis = self.compute_basis_functions(x, span);

        // Compute result using basis functions and coefficients
        let mut result = F::zero();
        for (i, &basis_val) in basis.iter().enumerate().take(self.degree + 1) {
            let coeff_idx = span - self.degree + i;
            if coeff_idx < self.coefficients.len() {
                result = result + self.coefficients[coeff_idx] * basis_val;
            }
        }

        Ok(result)
    }

    /// Find the knot span for a given parameter value
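    ///
    /// Returns the index `i` such that `knots[i] <= x < knots[i + 1]`, clamped
    /// to the valid span range `[degree, n - 1]`, where `n` is the number of
    /// coefficients.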
    fn find_knot_span(&self, x: F) -> usize {
        let n = self.coefficients.len();

        if x >= self.knots[n] {
            return n - 1;
        }
        if x <= self.knots[self.degree] {
            return self.degree;
        }

        // Binary search for knot span
        let mut low = self.degree;
        let mut high = n;
        let mut mid = (low + high) / 2;

        while x < self.knots[mid] || x >= self.knots[mid + 1] {
            if x < self.knots[mid] {
                high = mid;
            } else {
                low = mid;
            }
            mid = (low + high) / 2;
        }

        mid
    }

    /// Compute basis functions using de Boor's algorithm
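    ///
    /// Evaluates the `degree + 1` basis functions that are non-zero on the
    /// given span via the Cox-de Boor recursion,
    /// `N_{i,p}(x) = (x - t_i)/(t_{i+p} - t_i) * N_{i,p-1}(x)
    /// + (t_{i+p+1} - x)/(t_{i+p+1} - t_{i+1}) * N_{i+1,p-1}(x)`,
    /// with any term whose denominator is zero treated as zero.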
    fn compute_basis_functions(&self, x: F, span: usize) -> Vec<F> {
        let mut basis = vec![F::zero(); self.degree + 1];
        basis[0] = F::one();

        for j in 1..=self.degree {
            let mut saved = F::zero();
            #[allow(clippy::needless_range_loop)]
            for r in 0..j {
                let temp = basis[r];
                let alpha_1 = if span + 1 + r >= j && span + 1 + r < self.knots.len() {
                    let denom = self.knots[span + 1 + r] - self.knots[span + 1 + r - j];
                    if denom != F::zero() {
                        (x - self.knots[span + 1 + r - j]) / denom
                    } else {
                        F::zero()
                    }
                } else {
                    F::zero()
                };

                basis[r] = saved + (F::one() - alpha_1) * temp;
                saved = alpha_1 * temp;
            }
            basis[j] = saved;
        }

        basis
    }
}

/// Create a cache-aware RBF interpolator with optimized settings
///
/// # Arguments
///
/// * `points` - Training data points
/// * `values` - Training data values
/// * `kernel` - RBF kernel type
/// * `epsilon` - Kernel parameter
///
/// # Returns
///
/// A cache-optimized RBF interpolator
#[allow(dead_code)]
pub fn make_cache_aware_rbf<F>(
    points: Array2<F>,
    values: Array1<F>,
    kernel: RBFKernel,
    epsilon: F,
) -> InterpolateResult<CacheAwareRBF<F>>
where
    F: Float + FromPrimitive + Debug + Display + Send + Sync + 'static,
{
    let config = CacheOptimizedConfig::default();

    let mut rbf = CacheAwareRBF::new(points, values, config)?
        .with_kernel(kernel)
        .with_epsilon(epsilon);

    rbf.precompute_coefficients()?;
    Ok(rbf)
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_cache_aware_rbf_creation() {
        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
        let values = array![0.0, 1.0, 1.0, 2.0];
        let config = CacheOptimizedConfig::default();

        let rbf = CacheAwareRBF::new(points, values, config);
        assert!(rbf.is_ok());
    }

    #[test]
    fn test_cache_aware_bspline_creation() {
        let knots = array![0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 3.0, 3.0];
        let coefficients = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let config = CacheOptimizedConfig::default();

        let spline = CacheAwareBSpline::new(knots, coefficients, 2, config);
        assert!(spline.is_ok());
    }

    #[test]
    fn test_cache_optimized_config_defaults() {
        let config = CacheOptimizedConfig::default();

        assert_eq!(config.block_size, 64);
        assert!(config.enable_prefetching);
        assert!(!config.numa_aware);
        assert!(config.num_threads.is_none());
    }

    #[test]
    fn test_cache_sizes_defaults() {
        let cache_sizes = CacheSizes::default();

        assert_eq!(cache_sizes.l1_cache_kb, 32);
        assert_eq!(cache_sizes.l2_cache_kb, 256);
        assert_eq!(cache_sizes.l3_cache_kb, 8192);
    }
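
    #[test]
    fn test_make_cache_aware_rbf_reproduces_training_values() {
        // Added as an illustrative check of the convenience constructor: an
        // RBF interpolant should approximately reproduce the training values
        // at the training points themselves.
        let points = array![[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
        let values = array![0.0, 1.0, 1.0, 2.0];

        let mut rbf =
            make_cache_aware_rbf(points.clone(), values.clone(), RBFKernel::Gaussian, 1.0)
                .expect("construction should succeed");

        let predicted = rbf
            .evaluate_cache_optimized(&points.view())
            .expect("evaluation should succeed");

        for (p, v) in predicted.iter().zip(values.iter()) {
            assert!((p - v).abs() < 1e-6);
        }
    }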
673}