scirs2_integrate/
autotuning.rs

1//! Auto-tuning for hardware configurations
2//!
3//! This module provides automatic parameter tuning based on detected hardware
4//! characteristics. It optimizes algorithm parameters for CPU cores, cache sizes,
5//! memory bandwidth, and other system properties to achieve optimal performance.
6//!
7//! # Hardware Detection
8//!
9//! The auto-tuning system detects:
10//! - Number of CPU cores and threads
11//! - Cache sizes (L1, L2, L3)
12//! - Memory bandwidth characteristics
13//! - SIMD instruction set availability
14//! - GPU presence and capabilities
15//!
16//! # Examples
17//!
18//! ```
19//! use scirs2_integrate::autotuning::{HardwareDetector, AutoTuner, TuningProfile};
20//!
21//! // Detect hardware automatically
22//! let detector = HardwareDetector;
23//! let hardware = detector.detect();
24//! println!("Detected {} CPU cores", hardware.cpu_cores);
25//!
26//! // Create auto-tuner with detected hardware
27//! let tuner = AutoTuner::new(hardware);
28//! let profile = tuner.tune_for_problemsize(1000);
29//! ```
30
31use crate::common::IntegrateFloat;
32use std::collections::HashMap;
33use std::sync::OnceLock;
34use std::time::{Duration, Instant};
35
36/// Hardware characteristics detected at runtime
37#[derive(Debug, Clone)]
38pub struct HardwareInfo {
39    /// Number of physical CPU cores
40    pub cpu_cores: usize,
41    /// Number of logical CPU threads
42    pub cpu_threads: usize,
43    /// CPU brand and model
44    pub cpu_model: String,
45    /// L1 cache size per core (bytes)
46    pub l1_cache_size: usize,
47    /// L2 cache size per core (bytes)
48    pub l2_cache_size: usize,
49    /// L3 cache size total (bytes)
50    pub l3_cache_size: usize,
51    /// Memory size (bytes)
52    pub memory_size: usize,
53    /// Available SIMD instruction sets
54    pub simd_features: Vec<SimdFeature>,
55    /// Estimated memory bandwidth (bytes/second)
56    pub memory_bandwidth: Option<f64>,
57    /// GPU information
58    pub gpu_info: Option<GpuInfo>,
59}
60
/// SIMD instruction-set extensions that hardware detection can report.
///
/// The type is comparable and hashable so that a feature list can be queried
/// directly, e.g. `features.contains(&SimdFeature::AVX2)`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SimdFeature {
    /// x86 SSE
    SSE,
    /// x86 SSE2
    SSE2,
    /// x86 SSE3
    SSE3,
    /// x86 SSSE3
    SSSE3,
    /// x86 SSE4.1
    SSE41,
    /// x86 SSE4.2
    SSE42,
    /// x86 AVX
    AVX,
    /// x86 AVX2
    AVX2,
    /// x86 AVX-512 Foundation
    AVX512F,
    /// x86 fused multiply-add
    FMA,
    /// ARM NEON
    NEON,
}
75
/// Description of a detected GPU device.
///
/// Comparable with `==` so that two detection results can be checked for
/// equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuInfo {
    /// Vendor label for the device
    pub vendor: String,
    /// Device / model name
    pub model: String,
    /// Device memory size in bytes
    pub memory_size: usize,
    /// Number of compute units reported for the device
    pub compute_units: usize,
}
83
/// Hardware detection utilities.
///
/// A stateless unit type; it derives the common traits so it can be printed,
/// copied and default-constructed like any other value.
#[derive(Debug, Clone, Copy, Default)]
pub struct HardwareDetector;
86
87impl HardwareDetector {
88    /// Detect hardware characteristics
89    pub fn detect(&self) -> HardwareInfo {
90        // Use cached detection result
91        static HARDWARE_INFO: OnceLock<HardwareInfo> = OnceLock::new();
92
93        HARDWARE_INFO.get_or_init(Self::detect_hardware).clone()
94    }
95
96    /// Perform actual hardware detection
97    fn detect_hardware() -> HardwareInfo {
98        let cpu_cores = Self::detect_cpu_cores();
99        let cpu_threads = Self::detect_cpu_threads();
100        let cpu_model = Self::detect_cpu_model();
101        let (l1_cache_size, l2_cache_size, l3_cache_size) = Self::detect_cache_sizes();
102        let memory_size = Self::detect_memory_size();
103        let simd_features = Self::detect_simd_features();
104        let memory_bandwidth = Self::estimate_memory_bandwidth();
105        let gpu_info = Self::detect_gpu();
106
107        HardwareInfo {
108            cpu_cores,
109            cpu_threads,
110            cpu_model,
111            l1_cache_size,
112            l2_cache_size,
113            l3_cache_size,
114            memory_size,
115            simd_features,
116            memory_bandwidth,
117            gpu_info,
118        }
119    }
120
121    /// Detect number of physical CPU cores
122    fn detect_cpu_cores() -> usize {
123        // Try to get physical core count
124        if let Some(cores) = std::thread::available_parallelism().ok().map(|n| n.get()) {
125            // This gives logical cores, estimate physical cores
126            cores / 2 // Rough estimate for hyperthreading
127        } else {
128            1
129        }
130        .max(1)
131    }
132
133    /// Detect number of logical CPU threads
134    fn detect_cpu_threads() -> usize {
135        std::thread::available_parallelism()
136            .ok()
137            .map(|n| n.get())
138            .unwrap_or(1)
139    }
140
141    /// Detect CPU model
142    fn detect_cpu_model() -> String {
143        format!("{} CPU", std::env::consts::ARCH)
144    }
145
146    /// Detect cache sizes
147    fn detect_cache_sizes() -> (usize, usize, usize) {
148        // Use reasonable defaults based on architecture
149        #[cfg(target_arch = "x86_64")]
150        {
151            // Modern x86_64 typical cache sizes
152            (32 * 1024, 256 * 1024, 8 * 1024 * 1024)
153        }
154
155        #[cfg(target_arch = "aarch64")]
156        {
157            // ARM64 typical cache sizes
158            (64 * 1024, 512 * 1024, 4 * 1024 * 1024)
159        }
160
161        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
162        {
163            // Conservative defaults for other architectures
164            (32 * 1024, 256 * 1024, 2 * 1024 * 1024)
165        }
166    }
167
168    /// Detect total memory size
169    fn detect_memory_size() -> usize {
170        // Simple heuristic based on available system memory
171        // In practice, you'd use platform-specific APIs
172        8 * 1024 * 1024 * 1024 // Default to 8GB
173    }
174
175    /// Detect available SIMD features
176    fn detect_simd_features() -> Vec<SimdFeature> {
177        let mut features = Vec::new();
178
179        #[cfg(target_arch = "x86_64")]
180        {
181            // Use std::is_x86_feature_detected! macro for runtime detection
182            if std::is_x86_feature_detected!("sse") {
183                features.push(SimdFeature::SSE);
184            }
185            if std::is_x86_feature_detected!("sse2") {
186                features.push(SimdFeature::SSE2);
187            }
188            if std::is_x86_feature_detected!("sse3") {
189                features.push(SimdFeature::SSE3);
190            }
191            if std::is_x86_feature_detected!("ssse3") {
192                features.push(SimdFeature::SSSE3);
193            }
194            if std::is_x86_feature_detected!("sse4.1") {
195                features.push(SimdFeature::SSE41);
196            }
197            if std::is_x86_feature_detected!("sse4.2") {
198                features.push(SimdFeature::SSE42);
199            }
200            if std::is_x86_feature_detected!("avx") {
201                features.push(SimdFeature::AVX);
202            }
203            if std::is_x86_feature_detected!("avx2") {
204                features.push(SimdFeature::AVX2);
205            }
206            if std::is_x86_feature_detected!("avx512f") {
207                features.push(SimdFeature::AVX512F);
208            }
209            if std::is_x86_feature_detected!("fma") {
210                features.push(SimdFeature::FMA);
211            }
212        }
213
214        #[cfg(target_arch = "aarch64")]
215        {
216            // NEON is standard on ARM64
217            features.push(SimdFeature::NEON);
218        }
219
220        features
221    }
222
223    /// Estimate memory bandwidth using a simple benchmark
224    fn estimate_memory_bandwidth() -> Option<f64> {
225        // Simple bandwidth estimation
226        let size = 10 * 1024 * 1024; // 10MB
227        let data: Vec<u64> = vec![1; size / 8];
228
229        let start = Instant::now();
230        let sum: u64 = data.iter().sum();
231        let duration = start.elapsed();
232
233        // Prevent optimization
234        let _ = sum;
235
236        if duration.as_nanos() > 0 {
237            let bytes_per_second = (size as f64) / duration.as_secs_f64();
238            Some(bytes_per_second)
239        } else {
240            None
241        }
242    }
243
244    /// Detect GPU information
245    fn detect_gpu() -> Option<GpuInfo> {
246        // Use scirs2-core's GPU detection functionality
247        let detection_result = scirs2_core::gpu::backends::detect_gpu_backends();
248
249        // Find the first non-CPU device
250        detection_result
251            .devices
252            .into_iter()
253            .find(|device| device.backend != scirs2_core::gpu::GpuBackend::Cpu)
254            .map(|device| GpuInfo {
255                vendor: match device.backend {
256                    scirs2_core::gpu::GpuBackend::Cuda => "NVIDIA".to_string(),
257                    scirs2_core::gpu::GpuBackend::Rocm => "AMD".to_string(),
258                    scirs2_core::gpu::GpuBackend::Metal => "Apple".to_string(),
259                    scirs2_core::gpu::GpuBackend::OpenCL => "Unknown".to_string(),
260                    scirs2_core::gpu::GpuBackend::Wgpu => "WebGPU".to_string(),
261                    scirs2_core::gpu::GpuBackend::Cpu => "CPU".to_string(),
262                },
263                model: device.device_name,
264                memory_size: device.memory_bytes.unwrap_or(0) as usize,
265                compute_units: if device.supports_tensors { 1 } else { 0 }, // Simplified
266            })
267    }
268}
269
/// Set of tuned algorithm parameters for one problem configuration.
///
/// Profiles can be compared with `==`, which is convenient for checking that
/// two tuning runs produced the same settings.
#[derive(Debug, Clone, PartialEq)]
pub struct TuningProfile {
    /// Number of threads parallel algorithms should use
    pub num_threads: usize,
    /// Block size for cache-friendly algorithms
    pub block_size: usize,
    /// Chunk size used when distributing parallel work
    pub chunk_size: usize,
    /// Whether SIMD optimizations should be used
    pub use_simd: bool,
    /// Size of the memory pool for frequent allocations
    pub memory_pool_size: usize,
    /// Tolerance for iterative algorithms
    pub default_tolerance: f64,
    /// Iteration cap for convergence loops
    pub max_iterations: usize,
    /// Whether GPU acceleration should be used if available
    pub use_gpu: bool,
}
290
291/// Auto-tuner for algorithm parameters
292pub struct AutoTuner {
293    hardware: HardwareInfo,
294    cache: HashMap<String, TuningProfile>,
295}
296
297impl AutoTuner {
298    /// Create new auto-tuner with detected hardware
299    pub fn new(hardware: HardwareInfo) -> Self {
300        Self {
301            hardware,
302            cache: HashMap::new(),
303        }
304    }
305
306    /// Create auto-tuner with automatic hardware detection
307    pub fn auto(&self) -> Self {
308        Self::new(HardwareDetector.detect())
309    }
310
311    /// Tune parameters for specific problem size
312    pub fn tune_for_problemsize(&self, problemsize: usize) -> TuningProfile {
313        let cache_key = format!("size_{problemsize}");
314
315        if let Some(cached) = self.cache.get(&cache_key) {
316            return cached.clone();
317        }
318
319        self.compute_tuning_profile(problemsize)
320    }
321
322    /// Compute optimal tuning profile for given problem size
323    fn compute_tuning_profile(&self, problemsize: usize) -> TuningProfile {
324        // Determine optimal thread count
325        let num_threads = self.optimal_thread_count(problemsize);
326
327        // Determine optimal block _size based on cache
328        let block_size = self.optimal_block_size(problemsize);
329
330        // Determine chunk _size for parallel distribution
331        let chunk_size = Self::optimal_chunk_size(problemsize, num_threads);
332
333        // Determine if SIMD should be used
334        let use_simd = !self.hardware.simd_features.is_empty() && problemsize >= 64;
335
336        // Determine memory pool _size
337        let memory_pool_size = self.optimal_memory_pool_size(problemsize);
338
339        // Determine tolerances based on problem _size
340        let (default_tolerance, max_iterations) = Self::optimal_tolerances(problemsize);
341
342        // Determine GPU usage
343        let use_gpu = self.hardware.gpu_info.is_some() && problemsize >= 10000;
344
345        TuningProfile {
346            num_threads,
347            block_size,
348            chunk_size,
349            use_simd,
350            memory_pool_size,
351            default_tolerance,
352            max_iterations,
353            use_gpu,
354        }
355    }
356
357    /// Determine optimal thread count
358    fn optimal_thread_count(&self, problemsize: usize) -> usize {
359        let max_threads = self.hardware.cpu_threads;
360
361        if problemsize < 1000 {
362            // Small problems don't benefit from parallelization
363            1
364        } else if problemsize < 10000 {
365            // Medium problems use moderate parallelization
366            (max_threads / 2).clamp(1, 4)
367        } else {
368            // Large problems can use all available threads
369            max_threads.min(problemsize / 1000)
370        }
371    }
372
373    /// Determine optimal block size for cache efficiency
374    fn optimal_block_size(&self, problemsize: usize) -> usize {
375        let l1_elements = self.hardware.l1_cache_size / 8; // Assume f64
376        let l2_elements = self.hardware.l2_cache_size / 8;
377
378        if problemsize <= l1_elements {
379            // Fits in L1 cache
380            problemsize
381        } else if problemsize <= l2_elements {
382            // Use L1-sized blocks
383            l1_elements / 4
384        } else {
385            // Use L2-sized blocks for large problems
386            l2_elements / 16
387        }
388    }
389
390    /// Determine optimal chunk size for parallel distribution
391    fn optimal_chunk_size(_problemsize: usize, numthreads: usize) -> usize {
392        if numthreads <= 1 {
393            _problemsize
394        } else {
395            // Balance between parallelization overhead and load balancing
396            let min_chunk = 100; // Minimum chunk to avoid excessive overhead
397            let ideal_chunk = _problemsize / (numthreads * 4); // 4x oversubscription
398            ideal_chunk.max(min_chunk)
399        }
400    }
401
402    /// Determine optimal memory pool size
403    fn optimal_memory_pool_size(&self, problemsize: usize) -> usize {
404        // Use a fraction of available memory based on problem _size
405        let base_size = problemsize * 8 * 4; // 4x problem _size in bytes
406        let max_pool = self.hardware.memory_size / 8; // Use up to 1/8 of system memory
407
408        base_size.min(max_pool).max(1024 * 1024) // At least 1MB
409    }
410
411    /// Determine optimal tolerances and iteration limits
412    fn optimal_tolerances(_problemsize: usize) -> (f64, usize) {
413        if _problemsize < 1000 {
414            (1e-12, 100) // High accuracy for small problems
415        } else if _problemsize < 100000 {
416            (1e-10, 500) // Moderate accuracy for medium problems
417        } else {
418            (1e-8, 1000) // Lower accuracy for large problems
419        }
420    }
421
422    /// Benchmark-based tuning for specific algorithms
423    pub fn benchmark_tune<F: IntegrateFloat>(
424        &mut self,
425        algorithm_name: &str,
426        benchmark_fn: impl Fn(&TuningProfile) -> Duration,
427        problemsize: usize,
428    ) -> TuningProfile {
429        let base_profile = self.tune_for_problemsize(problemsize);
430
431        // Try different parameter variations
432        let mut best_profile = base_profile.clone();
433        let mut best_time = benchmark_fn(&base_profile);
434
435        // Test different thread counts
436        for threads in [1, 2, 4, 8, 16] {
437            if threads <= self.hardware.cpu_threads {
438                let mut profile = base_profile.clone();
439                profile.num_threads = threads;
440                profile.chunk_size = Self::optimal_chunk_size(problemsize, threads);
441
442                let time = benchmark_fn(&profile);
443                if time < best_time {
444                    best_time = time;
445                    best_profile = profile;
446                }
447            }
448        }
449
450        // Test different block sizes
451        for &factor in &[0.5, 1.0, 2.0, 4.0] {
452            let mut profile = best_profile.clone();
453            profile.block_size = ((base_profile.block_size as f64) * factor) as usize;
454            profile.block_size = profile.block_size.max(32).min(problemsize);
455
456            let time = benchmark_fn(&profile);
457            if time < best_time {
458                best_time = time;
459                best_profile = profile;
460            }
461        }
462
463        // Cache the result
464        let cache_key = format!("{algorithm_name}_{problemsize}");
465        self.cache.insert(cache_key, best_profile.clone());
466
467        best_profile
468    }
469
470    /// Get hardware information
471    pub fn hardware_info(&self) -> &HardwareInfo {
472        &self.hardware
473    }
474}
475
/// Auto-tuning entry points for specific algorithm types.
///
/// A stateless unit type; it derives the common traits so it can be printed,
/// copied and default-constructed like any other value.
#[derive(Debug, Clone, Copy, Default)]
pub struct AlgorithmTuner;
478
479impl AlgorithmTuner {
480    /// Tune parameters for matrix operations
481    pub fn tune_matrix_operations(_hardware: &HardwareInfo, matrixsize: usize) -> TuningProfile {
482        let tuner = AutoTuner::new(_hardware.clone());
483
484        let mut profile = tuner.tune_for_problemsize(matrixsize * matrixsize);
485
486        // Matrix-specific adjustments
487        if matrixsize >= 1000 {
488            profile.block_size = 64; // Good block _size for matrix multiplication
489            profile.use_simd = true;
490        }
491
492        profile
493    }
494
495    /// Tune parameters for ODE solving
496    pub fn tune_ode_solver(
497        hardware: &HardwareInfo,
498        system_size: usize,
499        time_steps: usize,
500    ) -> TuningProfile {
501        let tuner = AutoTuner::new(hardware.clone());
502        let problemsize = system_size * time_steps;
503
504        let mut profile = tuner.tune_for_problemsize(problemsize);
505
506        // ODE-specific adjustments
507        if system_size > 100 {
508            profile.use_simd = true;
509            profile.default_tolerance = 1e-8; // Good balance for ODEs
510            profile.max_iterations = 50;
511        }
512
513        profile
514    }
515
516    /// Tune parameters for Monte Carlo integration
517    pub fn tune_monte_carlo(
518        hardware: &HardwareInfo,
519        dimensions: usize,
520        samples: usize,
521    ) -> TuningProfile {
522        let tuner = AutoTuner::new(hardware.clone());
523
524        let mut profile = tuner.tune_for_problemsize(samples);
525
526        // Monte Carlo specific adjustments
527        profile.num_threads = hardware.cpu_threads; // MC benefits from all threads
528        profile.chunk_size = (samples / (hardware.cpu_threads * 8)).max(1000);
529
530        if dimensions > 10 {
531            profile.use_gpu = hardware.gpu_info.is_some(); // High-D benefits from GPU
532        }
533
534        profile
535    }
536}
537
#[cfg(test)]
mod tests {
    use super::*;

    /// Detected hardware must satisfy basic sanity relations.
    #[test]
    fn test_hardware_detection() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        assert!(hardware.cpu_cores > 0);
        assert!(hardware.cpu_threads >= hardware.cpu_cores);
        assert!(hardware.l1_cache_size > 0);
        assert!(hardware.l2_cache_size >= hardware.l1_cache_size);
        assert!(hardware.l3_cache_size >= hardware.l2_cache_size);
        assert!(hardware.memory_size > 0);
    }

    /// Small problems stay single-threaded; large problems may use more
    /// threads, but only as many as the host actually has.
    #[test]
    fn test_auto_tuner() {
        let detector = HardwareDetector;
        let hardware = detector.detect();
        let available_threads = hardware.cpu_threads;
        let tuner = AutoTuner::new(hardware);

        // Test small problem
        let small_profile = tuner.tune_for_problemsize(100);
        assert_eq!(small_profile.num_threads, 1);

        // Test large problem
        let large_profile = tuner.tune_for_problemsize(100000);
        assert!(large_profile.num_threads >= 1);
        assert!(large_profile.num_threads <= available_threads);
        // Only multi-threaded hosts can be expected to parallelize; asserting
        // this unconditionally fails on single-threaded machines.
        if available_threads > 1 {
            assert!(large_profile.num_threads > 1);
        }
        assert!(large_profile.block_size > 0);
        assert!(large_profile.chunk_size > 0);
    }

    /// Each algorithm-specific tuner applies its own adjustments.
    #[test]
    fn test_algorithm_specific_tuning() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        // Test matrix operations tuning
        let matrix_profile = AlgorithmTuner::tune_matrix_operations(&hardware, 1000);
        assert_eq!(matrix_profile.block_size, 64);

        // Test ODE solver tuning
        let ode_profile = AlgorithmTuner::tune_ode_solver(&hardware, 100, 1000);
        assert!(ode_profile.max_iterations > 0);
        assert!(ode_profile.default_tolerance > 0.0);

        // Test Monte Carlo tuning
        let mc_profile = AlgorithmTuner::tune_monte_carlo(&hardware, 5, 1000000);
        assert!(mc_profile.chunk_size > 0);
    }
}