// scirs2_integrate/autotuning.rs

//! Auto-tuning for hardware configurations
//!
//! This module provides automatic parameter tuning based on detected hardware
//! characteristics. It optimizes algorithm parameters for CPU cores, cache sizes,
//! memory bandwidth, and other system properties to achieve optimal performance.
//!
//! # Hardware Detection
//!
//! The auto-tuning system detects:
//! - Number of CPU cores and threads
//! - Cache sizes (L1, L2, L3)
//! - Memory bandwidth characteristics
//! - SIMD instruction set availability
//! - GPU presence and capabilities
//!
//! # Examples
//!
//! ```
//! use scirs2_integrate::autotuning::{HardwareDetector, AutoTuner, TuningProfile};
//!
//! // Detect hardware automatically
//! let detector = HardwareDetector;
//! let hardware = detector.detect();
//! println!("Detected {} CPU cores", hardware.cpu_cores);
//!
//! // Create auto-tuner with detected hardware
//! let tuner = AutoTuner::new(hardware);
//! let profile = tuner.tune_for_problemsize(1000);
//! ```
use crate::common::IntegrateFloat;
use std::collections::HashMap;
use std::hint::black_box;
use std::sync::OnceLock;
use std::time::{Duration, Instant};
36/// Hardware characteristics detected at runtime
37#[derive(Debug, Clone)]
38pub struct HardwareInfo {
39    /// Number of physical CPU cores
40    pub cpu_cores: usize,
41    /// Number of logical CPU threads
42    pub cpu_threads: usize,
43    /// CPU brand and model
44    pub cpu_model: String,
45    /// L1 cache size per core (bytes)
46    pub l1_cache_size: usize,
47    /// L2 cache size per core (bytes)
48    pub l2_cache_size: usize,
49    /// L3 cache size total (bytes)
50    pub l3_cache_size: usize,
51    /// Memory size (bytes)
52    pub memory_size: usize,
53    /// Available SIMD instruction sets
54    pub simd_features: Vec<SimdFeature>,
55    /// Estimated memory bandwidth (bytes/second)
56    pub memory_bandwidth: Option<f64>,
57    /// GPU information
58    pub gpu_info: Option<GpuInfo>,
59}
60
/// SIMD instruction-set extensions that may be available on the host CPU.
///
/// Fieldless and cheap to pass around, so the full set of value-type derives
/// (`Copy`, `PartialEq`, `Eq`, `Hash`) is provided for comparisons and use as
/// set/map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SimdFeature {
    SSE,
    SSE2,
    SSE3,
    SSSE3,
    SSE41,
    SSE42,
    AVX,
    AVX2,
    AVX512F,
    FMA,
    /// ARM Advanced SIMD
    NEON,
}
/// Information about a detected GPU device.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuInfo {
    /// Vendor name derived from the backend (e.g. "NVIDIA", "AMD", "Apple")
    pub vendor: String,
    /// Device model / marketing name as reported by the backend
    pub model: String,
    /// Total device memory in bytes (0 if unknown)
    pub memory_size: usize,
    /// Number of compute units (simplified; may be 0 if unknown)
    pub compute_units: usize,
}
/// Hardware detection utilities.
///
/// Zero-sized handle; construct with `HardwareDetector` or
/// `HardwareDetector::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HardwareDetector;
87impl HardwareDetector {
88    /// Detect hardware characteristics
89    pub fn detect(&self) -> HardwareInfo {
90        // Use cached detection result
91        static HARDWARE_INFO: OnceLock<HardwareInfo> = OnceLock::new();
92
93        HARDWARE_INFO.get_or_init(Self::detect_hardware).clone()
94    }
95
96    /// Perform actual hardware detection
97    fn detect_hardware() -> HardwareInfo {
98        let cpu_cores = Self::detect_cpu_cores();
99        let cpu_threads = Self::detect_cpu_threads();
100        let cpu_model = Self::detect_cpu_model();
101        let (l1_cache_size, l2_cache_size, l3_cache_size) = Self::detect_cache_sizes();
102        let memory_size = Self::detect_memory_size();
103        let simd_features = Self::detect_simd_features();
104        let memory_bandwidth = Self::estimate_memory_bandwidth();
105        let gpu_info = Self::detect_gpu();
106
107        HardwareInfo {
108            cpu_cores,
109            cpu_threads,
110            cpu_model,
111            l1_cache_size,
112            l2_cache_size,
113            l3_cache_size,
114            memory_size,
115            simd_features,
116            memory_bandwidth,
117            gpu_info,
118        }
119    }
120
121    /// Detect number of physical CPU cores
122    fn detect_cpu_cores() -> usize {
123        // Try to get physical core count
124        if let Some(cores) = std::thread::available_parallelism().ok().map(|n| n.get()) {
125            // This gives logical cores, estimate physical cores
126            cores / 2 // Rough estimate for hyperthreading
127        } else {
128            1
129        }
130        .max(1)
131    }
132
133    /// Detect number of logical CPU threads
134    fn detect_cpu_threads() -> usize {
135        std::thread::available_parallelism()
136            .ok()
137            .map(|n| n.get())
138            .unwrap_or(1)
139    }
140
141    /// Detect CPU model
142    fn detect_cpu_model() -> String {
143        format!("{} CPU", std::env::consts::ARCH)
144    }
145
146    /// Detect cache sizes
147    fn detect_cache_sizes() -> (usize, usize, usize) {
148        // Use reasonable defaults based on architecture
149        #[cfg(target_arch = "x86_64")]
150        {
151            // Modern x86_64 typical cache sizes
152            (32 * 1024, 256 * 1024, 8 * 1024 * 1024)
153        }
154
155        #[cfg(target_arch = "aarch64")]
156        {
157            // ARM64 typical cache sizes
158            (64 * 1024, 512 * 1024, 4 * 1024 * 1024)
159        }
160
161        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
162        {
163            // Conservative defaults for other architectures
164            (32 * 1024, 256 * 1024, 2 * 1024 * 1024)
165        }
166    }
167
168    /// Detect total memory size
169    fn detect_memory_size() -> usize {
170        // Simple heuristic based on available system memory
171        // In practice, you'd use platform-specific APIs
172        #[cfg(target_pointer_width = "32")]
173        {
174            512 * 1024 * 1024
175        } // Default to 512MB for 32-bit
176        #[cfg(target_pointer_width = "64")]
177        {
178            8usize * 1024 * 1024 * 1024
179        } // Default to 8GB for 64-bit
180    }
181
182    /// Detect available SIMD features
183    fn detect_simd_features() -> Vec<SimdFeature> {
184        let mut features = Vec::new();
185
186        #[cfg(target_arch = "x86_64")]
187        {
188            // Use std::is_x86_feature_detected! macro for runtime detection
189            if std::is_x86_feature_detected!("sse") {
190                features.push(SimdFeature::SSE);
191            }
192            if std::is_x86_feature_detected!("sse2") {
193                features.push(SimdFeature::SSE2);
194            }
195            if std::is_x86_feature_detected!("sse3") {
196                features.push(SimdFeature::SSE3);
197            }
198            if std::is_x86_feature_detected!("ssse3") {
199                features.push(SimdFeature::SSSE3);
200            }
201            if std::is_x86_feature_detected!("sse4.1") {
202                features.push(SimdFeature::SSE41);
203            }
204            if std::is_x86_feature_detected!("sse4.2") {
205                features.push(SimdFeature::SSE42);
206            }
207            if std::is_x86_feature_detected!("avx") {
208                features.push(SimdFeature::AVX);
209            }
210            if std::is_x86_feature_detected!("avx2") {
211                features.push(SimdFeature::AVX2);
212            }
213            if std::is_x86_feature_detected!("avx512f") {
214                features.push(SimdFeature::AVX512F);
215            }
216            if std::is_x86_feature_detected!("fma") {
217                features.push(SimdFeature::FMA);
218            }
219        }
220
221        #[cfg(target_arch = "aarch64")]
222        {
223            // NEON is standard on ARM64
224            features.push(SimdFeature::NEON);
225        }
226
227        features
228    }
229
230    /// Estimate memory bandwidth using a simple benchmark
231    fn estimate_memory_bandwidth() -> Option<f64> {
232        // Simple bandwidth estimation
233        let size = 10 * 1024 * 1024; // 10MB
234        let data: Vec<u64> = vec![1; size / 8];
235
236        let start = Instant::now();
237        let sum: u64 = data.iter().sum();
238        let duration = start.elapsed();
239
240        // Prevent optimization
241        let _ = sum;
242
243        if duration.as_nanos() > 0 {
244            let bytes_per_second = (size as f64) / duration.as_secs_f64();
245            Some(bytes_per_second)
246        } else {
247            None
248        }
249    }
250
251    /// Detect GPU information
252    fn detect_gpu() -> Option<GpuInfo> {
253        // Use scirs2-core's GPU detection functionality
254        let detection_result = scirs2_core::gpu::backends::detect_gpu_backends();
255
256        // Find the first non-CPU device
257        detection_result
258            .devices
259            .into_iter()
260            .find(|device| device.backend != scirs2_core::gpu::GpuBackend::Cpu)
261            .map(|device| GpuInfo {
262                vendor: match device.backend {
263                    scirs2_core::gpu::GpuBackend::Cuda => "NVIDIA".to_string(),
264                    scirs2_core::gpu::GpuBackend::Rocm => "AMD".to_string(),
265                    scirs2_core::gpu::GpuBackend::Metal => "Apple".to_string(),
266                    scirs2_core::gpu::GpuBackend::OpenCL => "Unknown".to_string(),
267                    scirs2_core::gpu::GpuBackend::Wgpu => "WebGPU".to_string(),
268                    scirs2_core::gpu::GpuBackend::Cpu => "CPU".to_string(),
269                },
270                model: device.device_name,
271                memory_size: device.memory_bytes.unwrap_or(0) as usize,
272                compute_units: if device.supports_tensors { 1 } else { 0 }, // Simplified
273            })
274    }
275}
276
/// Tuning profile for specific problem characteristics.
///
/// All fields are plain-old-data, so `PartialEq` is derived to allow
/// comparing cached and freshly computed profiles.
#[derive(Debug, Clone, PartialEq)]
pub struct TuningProfile {
    /// Optimal number of threads for parallel algorithms
    pub num_threads: usize,
    /// Block size for cache-friendly algorithms
    pub block_size: usize,
    /// Chunk size for parallel work distribution
    pub chunk_size: usize,
    /// Whether to use SIMD optimizations
    pub use_simd: bool,
    /// Memory pool size for frequent allocations (bytes)
    pub memory_pool_size: usize,
    /// Tolerance for iterative algorithms
    pub default_tolerance: f64,
    /// Maximum iterations for convergence
    pub max_iterations: usize,
    /// Whether to use GPU acceleration if available
    pub use_gpu: bool,
}
298/// Auto-tuner for algorithm parameters
299pub struct AutoTuner {
300    hardware: HardwareInfo,
301    cache: HashMap<String, TuningProfile>,
302}
303
304impl AutoTuner {
305    /// Create new auto-tuner with detected hardware
306    pub fn new(hardware: HardwareInfo) -> Self {
307        Self {
308            hardware,
309            cache: HashMap::new(),
310        }
311    }
312
313    /// Create auto-tuner with automatic hardware detection
314    pub fn auto(&self) -> Self {
315        Self::new(HardwareDetector.detect())
316    }
317
318    /// Tune parameters for specific problem size
319    pub fn tune_for_problemsize(&self, problemsize: usize) -> TuningProfile {
320        let cache_key = format!("size_{problemsize}");
321
322        if let Some(cached) = self.cache.get(&cache_key) {
323            return cached.clone();
324        }
325
326        self.compute_tuning_profile(problemsize)
327    }
328
329    /// Compute optimal tuning profile for given problem size
330    fn compute_tuning_profile(&self, problemsize: usize) -> TuningProfile {
331        // Determine optimal thread count
332        let num_threads = self.optimal_thread_count(problemsize);
333
334        // Determine optimal block _size based on cache
335        let block_size = self.optimal_block_size(problemsize);
336
337        // Determine chunk _size for parallel distribution
338        let chunk_size = Self::optimal_chunk_size(problemsize, num_threads);
339
340        // Determine if SIMD should be used
341        let use_simd = !self.hardware.simd_features.is_empty() && problemsize >= 64;
342
343        // Determine memory pool _size
344        let memory_pool_size = self.optimal_memory_pool_size(problemsize);
345
346        // Determine tolerances based on problem _size
347        let (default_tolerance, max_iterations) = Self::optimal_tolerances(problemsize);
348
349        // Determine GPU usage
350        let use_gpu = self.hardware.gpu_info.is_some() && problemsize >= 10000;
351
352        TuningProfile {
353            num_threads,
354            block_size,
355            chunk_size,
356            use_simd,
357            memory_pool_size,
358            default_tolerance,
359            max_iterations,
360            use_gpu,
361        }
362    }
363
364    /// Determine optimal thread count
365    fn optimal_thread_count(&self, problemsize: usize) -> usize {
366        let max_threads = self.hardware.cpu_threads;
367
368        if problemsize < 1000 {
369            // Small problems don't benefit from parallelization
370            1
371        } else if problemsize < 10000 {
372            // Medium problems use moderate parallelization
373            (max_threads / 2).clamp(1, 4)
374        } else {
375            // Large problems can use all available threads
376            max_threads.min(problemsize / 1000)
377        }
378    }
379
380    /// Determine optimal block size for cache efficiency
381    fn optimal_block_size(&self, problemsize: usize) -> usize {
382        let l1_elements = self.hardware.l1_cache_size / 8; // Assume f64
383        let l2_elements = self.hardware.l2_cache_size / 8;
384
385        if problemsize <= l1_elements {
386            // Fits in L1 cache
387            problemsize
388        } else if problemsize <= l2_elements {
389            // Use L1-sized blocks
390            l1_elements / 4
391        } else {
392            // Use L2-sized blocks for large problems
393            l2_elements / 16
394        }
395    }
396
397    /// Determine optimal chunk size for parallel distribution
398    fn optimal_chunk_size(_problemsize: usize, numthreads: usize) -> usize {
399        if numthreads <= 1 {
400            _problemsize
401        } else {
402            // Balance between parallelization overhead and load balancing
403            let min_chunk = 100; // Minimum chunk to avoid excessive overhead
404            let ideal_chunk = _problemsize / (numthreads * 4); // 4x oversubscription
405            ideal_chunk.max(min_chunk)
406        }
407    }
408
409    /// Determine optimal memory pool size
410    fn optimal_memory_pool_size(&self, problemsize: usize) -> usize {
411        // Use a fraction of available memory based on problem _size
412        let base_size = problemsize * 8 * 4; // 4x problem _size in bytes
413        let max_pool = self.hardware.memory_size / 8; // Use up to 1/8 of system memory
414
415        base_size.min(max_pool).max(1024 * 1024) // At least 1MB
416    }
417
418    /// Determine optimal tolerances and iteration limits
419    fn optimal_tolerances(_problemsize: usize) -> (f64, usize) {
420        if _problemsize < 1000 {
421            (1e-12, 100) // High accuracy for small problems
422        } else if _problemsize < 100000 {
423            (1e-10, 500) // Moderate accuracy for medium problems
424        } else {
425            (1e-8, 1000) // Lower accuracy for large problems
426        }
427    }
428
429    /// Benchmark-based tuning for specific algorithms
430    pub fn benchmark_tune<F: IntegrateFloat>(
431        &mut self,
432        algorithm_name: &str,
433        benchmark_fn: impl Fn(&TuningProfile) -> Duration,
434        problemsize: usize,
435    ) -> TuningProfile {
436        let base_profile = self.tune_for_problemsize(problemsize);
437
438        // Try different parameter variations
439        let mut best_profile = base_profile.clone();
440        let mut best_time = benchmark_fn(&base_profile);
441
442        // Test different thread counts
443        for threads in [1, 2, 4, 8, 16] {
444            if threads <= self.hardware.cpu_threads {
445                let mut profile = base_profile.clone();
446                profile.num_threads = threads;
447                profile.chunk_size = Self::optimal_chunk_size(problemsize, threads);
448
449                let time = benchmark_fn(&profile);
450                if time < best_time {
451                    best_time = time;
452                    best_profile = profile;
453                }
454            }
455        }
456
457        // Test different block sizes
458        for &factor in &[0.5, 1.0, 2.0, 4.0] {
459            let mut profile = best_profile.clone();
460            profile.block_size = ((base_profile.block_size as f64) * factor) as usize;
461            profile.block_size = profile.block_size.max(32).min(problemsize);
462
463            let time = benchmark_fn(&profile);
464            if time < best_time {
465                best_time = time;
466                best_profile = profile;
467            }
468        }
469
470        // Cache the result
471        let cache_key = format!("{algorithm_name}_{problemsize}");
472        self.cache.insert(cache_key, best_profile.clone());
473
474        best_profile
475    }
476
477    /// Get hardware information
478    pub fn hardware_info(&self) -> &HardwareInfo {
479        &self.hardware
480    }
481}
482
/// Auto-tuning for specific algorithm types.
///
/// Zero-sized namespace for algorithm-specific tuning entry points.
#[derive(Debug, Clone, Copy, Default)]
pub struct AlgorithmTuner;
486impl AlgorithmTuner {
487    /// Tune parameters for matrix operations
488    pub fn tune_matrix_operations(_hardware: &HardwareInfo, matrixsize: usize) -> TuningProfile {
489        let tuner = AutoTuner::new(_hardware.clone());
490
491        let mut profile = tuner.tune_for_problemsize(matrixsize * matrixsize);
492
493        // Matrix-specific adjustments
494        if matrixsize >= 1000 {
495            profile.block_size = 64; // Good block _size for matrix multiplication
496            profile.use_simd = true;
497        }
498
499        profile
500    }
501
502    /// Tune parameters for ODE solving
503    pub fn tune_ode_solver(
504        hardware: &HardwareInfo,
505        system_size: usize,
506        time_steps: usize,
507    ) -> TuningProfile {
508        let tuner = AutoTuner::new(hardware.clone());
509        let problemsize = system_size * time_steps;
510
511        let mut profile = tuner.tune_for_problemsize(problemsize);
512
513        // ODE-specific adjustments
514        if system_size > 100 {
515            profile.use_simd = true;
516            profile.default_tolerance = 1e-8; // Good balance for ODEs
517            profile.max_iterations = 50;
518        }
519
520        profile
521    }
522
523    /// Tune parameters for Monte Carlo integration
524    pub fn tune_monte_carlo(
525        hardware: &HardwareInfo,
526        dimensions: usize,
527        samples: usize,
528    ) -> TuningProfile {
529        let tuner = AutoTuner::new(hardware.clone());
530
531        let mut profile = tuner.tune_for_problemsize(samples);
532
533        // Monte Carlo specific adjustments
534        profile.num_threads = hardware.cpu_threads; // MC benefits from all threads
535        profile.chunk_size = (samples / (hardware.cpu_threads * 8)).max(1000);
536
537        if dimensions > 10 {
538            profile.use_gpu = hardware.gpu_info.is_some(); // High-D benefits from GPU
539        }
540
541        profile
542    }
543}
544
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hardware_detection() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        assert!(hardware.cpu_cores > 0);
        assert!(hardware.cpu_threads >= hardware.cpu_cores);
        assert!(hardware.l1_cache_size > 0);
        assert!(hardware.l2_cache_size >= hardware.l1_cache_size);
        assert!(hardware.l3_cache_size >= hardware.l2_cache_size);
        assert!(hardware.memory_size > 0);
    }

    #[test]
    fn test_auto_tuner() {
        let detector = HardwareDetector;
        let hardware = detector.detect();
        // Remember the thread count before `hardware` moves into the tuner.
        let cpu_threads = hardware.cpu_threads;
        let tuner = AutoTuner::new(hardware);

        // Small problems should not be parallelized.
        let small_profile = tuner.tune_for_problemsize(100);
        assert_eq!(small_profile.num_threads, 1);

        // Large problems should parallelize, but only assert multi-threading
        // when the host actually has multiple threads (the unconditional
        // assert was flaky on single-core machines).
        let large_profile = tuner.tune_for_problemsize(100000);
        if cpu_threads > 1 {
            assert!(large_profile.num_threads > 1);
        }
        assert!(large_profile.block_size > 0);
        assert!(large_profile.chunk_size > 0);
    }

    #[test]
    fn test_algorithm_specific_tuning() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        // Matrix operations pin a 64-element block for large matrices.
        let matrix_profile = AlgorithmTuner::tune_matrix_operations(&hardware, 1000);
        assert_eq!(matrix_profile.block_size, 64);

        // ODE solver tuning produces sane convergence parameters.
        let ode_profile = AlgorithmTuner::tune_ode_solver(&hardware, 100, 1000);
        assert!(ode_profile.max_iterations > 0);
        assert!(ode_profile.default_tolerance > 0.0);

        // Monte Carlo tuning always yields a positive chunk size.
        let mc_profile = AlgorithmTuner::tune_monte_carlo(&hardware, 5, 1000000);
        assert!(mc_profile.chunk_size > 0);
    }
}