1use crate::common::IntegrateFloat;
32use std::collections::HashMap;
33use std::sync::OnceLock;
34use std::time::{Duration, Instant};
35
36#[derive(Debug, Clone)]
38pub struct HardwareInfo {
39 pub cpu_cores: usize,
41 pub cpu_threads: usize,
43 pub cpu_model: String,
45 pub l1_cache_size: usize,
47 pub l2_cache_size: usize,
49 pub l3_cache_size: usize,
51 pub memory_size: usize,
53 pub simd_features: Vec<SimdFeature>,
55 pub memory_bandwidth: Option<f64>,
57 pub gpu_info: Option<GpuInfo>,
59}
60
/// SIMD instruction-set extensions that the hardware detector can report.
///
/// The enum is a plain tag, so it is `Copy` and supports equality and
/// hashing; this lets callers write
/// `features.contains(&SimdFeature::AVX2)` or use it as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SimdFeature {
    /// x86 SSE.
    SSE,
    /// x86 SSE2.
    SSE2,
    /// x86 SSE3.
    SSE3,
    /// x86 SSSE3.
    SSSE3,
    /// x86 SSE4.1.
    SSE41,
    /// x86 SSE4.2.
    SSE42,
    /// x86 AVX.
    AVX,
    /// x86 AVX2.
    AVX2,
    /// x86 AVX-512 Foundation.
    AVX512F,
    /// x86 fused multiply-add.
    FMA,
    /// Arm NEON.
    NEON,
}
75
/// Summary of a detected GPU device.
#[derive(Clone, Debug)]
pub struct GpuInfo {
    /// Vendor name of the device.
    pub vendor: String,
    /// Device model name.
    pub model: String,
    /// Device memory in bytes.
    pub memory_size: usize,
    /// Compute-unit count recorded for the device.
    pub compute_units: usize,
}
83
84pub struct HardwareDetector;
86
87impl HardwareDetector {
88 pub fn detect(&self) -> HardwareInfo {
90 static HARDWARE_INFO: OnceLock<HardwareInfo> = OnceLock::new();
92
93 HARDWARE_INFO.get_or_init(Self::detect_hardware).clone()
94 }
95
96 fn detect_hardware() -> HardwareInfo {
98 let cpu_cores = Self::detect_cpu_cores();
99 let cpu_threads = Self::detect_cpu_threads();
100 let cpu_model = Self::detect_cpu_model();
101 let (l1_cache_size, l2_cache_size, l3_cache_size) = Self::detect_cache_sizes();
102 let memory_size = Self::detect_memory_size();
103 let simd_features = Self::detect_simd_features();
104 let memory_bandwidth = Self::estimate_memory_bandwidth();
105 let gpu_info = Self::detect_gpu();
106
107 HardwareInfo {
108 cpu_cores,
109 cpu_threads,
110 cpu_model,
111 l1_cache_size,
112 l2_cache_size,
113 l3_cache_size,
114 memory_size,
115 simd_features,
116 memory_bandwidth,
117 gpu_info,
118 }
119 }
120
121 fn detect_cpu_cores() -> usize {
123 if let Some(cores) = std::thread::available_parallelism().ok().map(|n| n.get()) {
125 cores / 2 } else {
128 1
129 }
130 .max(1)
131 }
132
133 fn detect_cpu_threads() -> usize {
135 std::thread::available_parallelism()
136 .ok()
137 .map(|n| n.get())
138 .unwrap_or(1)
139 }
140
141 fn detect_cpu_model() -> String {
143 format!("{} CPU", std::env::consts::ARCH)
144 }
145
146 fn detect_cache_sizes() -> (usize, usize, usize) {
148 #[cfg(target_arch = "x86_64")]
150 {
151 (32 * 1024, 256 * 1024, 8 * 1024 * 1024)
153 }
154
155 #[cfg(target_arch = "aarch64")]
156 {
157 (64 * 1024, 512 * 1024, 4 * 1024 * 1024)
159 }
160
161 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
162 {
163 (32 * 1024, 256 * 1024, 2 * 1024 * 1024)
165 }
166 }
167
168 fn detect_memory_size() -> usize {
170 #[cfg(target_pointer_width = "32")]
173 {
174 512 * 1024 * 1024
175 } #[cfg(target_pointer_width = "64")]
177 {
178 8usize * 1024 * 1024 * 1024
179 } }
181
182 fn detect_simd_features() -> Vec<SimdFeature> {
184 let mut features = Vec::new();
185
186 #[cfg(target_arch = "x86_64")]
187 {
188 if std::is_x86_feature_detected!("sse") {
190 features.push(SimdFeature::SSE);
191 }
192 if std::is_x86_feature_detected!("sse2") {
193 features.push(SimdFeature::SSE2);
194 }
195 if std::is_x86_feature_detected!("sse3") {
196 features.push(SimdFeature::SSE3);
197 }
198 if std::is_x86_feature_detected!("ssse3") {
199 features.push(SimdFeature::SSSE3);
200 }
201 if std::is_x86_feature_detected!("sse4.1") {
202 features.push(SimdFeature::SSE41);
203 }
204 if std::is_x86_feature_detected!("sse4.2") {
205 features.push(SimdFeature::SSE42);
206 }
207 if std::is_x86_feature_detected!("avx") {
208 features.push(SimdFeature::AVX);
209 }
210 if std::is_x86_feature_detected!("avx2") {
211 features.push(SimdFeature::AVX2);
212 }
213 if std::is_x86_feature_detected!("avx512f") {
214 features.push(SimdFeature::AVX512F);
215 }
216 if std::is_x86_feature_detected!("fma") {
217 features.push(SimdFeature::FMA);
218 }
219 }
220
221 #[cfg(target_arch = "aarch64")]
222 {
223 features.push(SimdFeature::NEON);
225 }
226
227 features
228 }
229
230 fn estimate_memory_bandwidth() -> Option<f64> {
232 let size = 10 * 1024 * 1024; let data: Vec<u64> = vec![1; size / 8];
235
236 let start = Instant::now();
237 let sum: u64 = data.iter().sum();
238 let duration = start.elapsed();
239
240 let _ = sum;
242
243 if duration.as_nanos() > 0 {
244 let bytes_per_second = (size as f64) / duration.as_secs_f64();
245 Some(bytes_per_second)
246 } else {
247 None
248 }
249 }
250
251 fn detect_gpu() -> Option<GpuInfo> {
253 let detection_result = scirs2_core::gpu::backends::detect_gpu_backends();
255
256 detection_result
258 .devices
259 .into_iter()
260 .find(|device| device.backend != scirs2_core::gpu::GpuBackend::Cpu)
261 .map(|device| GpuInfo {
262 vendor: match device.backend {
263 scirs2_core::gpu::GpuBackend::Cuda => "NVIDIA".to_string(),
264 scirs2_core::gpu::GpuBackend::Rocm => "AMD".to_string(),
265 scirs2_core::gpu::GpuBackend::Metal => "Apple".to_string(),
266 scirs2_core::gpu::GpuBackend::OpenCL => "Unknown".to_string(),
267 scirs2_core::gpu::GpuBackend::Wgpu => "WebGPU".to_string(),
268 scirs2_core::gpu::GpuBackend::Cpu => "CPU".to_string(),
269 },
270 model: device.device_name,
271 memory_size: device.memory_bytes.unwrap_or(0) as usize,
272 compute_units: if device.supports_tensors { 1 } else { 0 }, })
274 }
275}
276
/// Set of tuning parameters chosen for a given problem.
#[derive(Clone, Debug)]
pub struct TuningProfile {
    /// Number of worker threads to use.
    pub num_threads: usize,
    /// Block size, in elements, for blocked algorithms.
    pub block_size: usize,
    /// Amount of work handed to a thread at a time.
    pub chunk_size: usize,
    /// Whether SIMD code paths should be used.
    pub use_simd: bool,
    /// Memory pool size in bytes.
    pub memory_pool_size: usize,
    /// Default numerical tolerance.
    pub default_tolerance: f64,
    /// Upper bound on the number of iterations.
    pub max_iterations: usize,
    /// Whether GPU acceleration should be used.
    pub use_gpu: bool,
}
297
298pub struct AutoTuner {
300 hardware: HardwareInfo,
301 cache: HashMap<String, TuningProfile>,
302}
303
304impl AutoTuner {
305 pub fn new(hardware: HardwareInfo) -> Self {
307 Self {
308 hardware,
309 cache: HashMap::new(),
310 }
311 }
312
313 pub fn auto(&self) -> Self {
315 Self::new(HardwareDetector.detect())
316 }
317
318 pub fn tune_for_problemsize(&self, problemsize: usize) -> TuningProfile {
320 let cache_key = format!("size_{problemsize}");
321
322 if let Some(cached) = self.cache.get(&cache_key) {
323 return cached.clone();
324 }
325
326 self.compute_tuning_profile(problemsize)
327 }
328
329 fn compute_tuning_profile(&self, problemsize: usize) -> TuningProfile {
331 let num_threads = self.optimal_thread_count(problemsize);
333
334 let block_size = self.optimal_block_size(problemsize);
336
337 let chunk_size = Self::optimal_chunk_size(problemsize, num_threads);
339
340 let use_simd = !self.hardware.simd_features.is_empty() && problemsize >= 64;
342
343 let memory_pool_size = self.optimal_memory_pool_size(problemsize);
345
346 let (default_tolerance, max_iterations) = Self::optimal_tolerances(problemsize);
348
349 let use_gpu = self.hardware.gpu_info.is_some() && problemsize >= 10000;
351
352 TuningProfile {
353 num_threads,
354 block_size,
355 chunk_size,
356 use_simd,
357 memory_pool_size,
358 default_tolerance,
359 max_iterations,
360 use_gpu,
361 }
362 }
363
364 fn optimal_thread_count(&self, problemsize: usize) -> usize {
366 let max_threads = self.hardware.cpu_threads;
367
368 if problemsize < 1000 {
369 1
371 } else if problemsize < 10000 {
372 (max_threads / 2).clamp(1, 4)
374 } else {
375 max_threads.min(problemsize / 1000)
377 }
378 }
379
380 fn optimal_block_size(&self, problemsize: usize) -> usize {
382 let l1_elements = self.hardware.l1_cache_size / 8; let l2_elements = self.hardware.l2_cache_size / 8;
384
385 if problemsize <= l1_elements {
386 problemsize
388 } else if problemsize <= l2_elements {
389 l1_elements / 4
391 } else {
392 l2_elements / 16
394 }
395 }
396
397 fn optimal_chunk_size(_problemsize: usize, numthreads: usize) -> usize {
399 if numthreads <= 1 {
400 _problemsize
401 } else {
402 let min_chunk = 100; let ideal_chunk = _problemsize / (numthreads * 4); ideal_chunk.max(min_chunk)
406 }
407 }
408
409 fn optimal_memory_pool_size(&self, problemsize: usize) -> usize {
411 let base_size = problemsize * 8 * 4; let max_pool = self.hardware.memory_size / 8; base_size.min(max_pool).max(1024 * 1024) }
417
418 fn optimal_tolerances(_problemsize: usize) -> (f64, usize) {
420 if _problemsize < 1000 {
421 (1e-12, 100) } else if _problemsize < 100000 {
423 (1e-10, 500) } else {
425 (1e-8, 1000) }
427 }
428
429 pub fn benchmark_tune<F: IntegrateFloat>(
431 &mut self,
432 algorithm_name: &str,
433 benchmark_fn: impl Fn(&TuningProfile) -> Duration,
434 problemsize: usize,
435 ) -> TuningProfile {
436 let base_profile = self.tune_for_problemsize(problemsize);
437
438 let mut best_profile = base_profile.clone();
440 let mut best_time = benchmark_fn(&base_profile);
441
442 for threads in [1, 2, 4, 8, 16] {
444 if threads <= self.hardware.cpu_threads {
445 let mut profile = base_profile.clone();
446 profile.num_threads = threads;
447 profile.chunk_size = Self::optimal_chunk_size(problemsize, threads);
448
449 let time = benchmark_fn(&profile);
450 if time < best_time {
451 best_time = time;
452 best_profile = profile;
453 }
454 }
455 }
456
457 for &factor in &[0.5, 1.0, 2.0, 4.0] {
459 let mut profile = best_profile.clone();
460 profile.block_size = ((base_profile.block_size as f64) * factor) as usize;
461 profile.block_size = profile.block_size.max(32).min(problemsize);
462
463 let time = benchmark_fn(&profile);
464 if time < best_time {
465 best_time = time;
466 best_profile = profile;
467 }
468 }
469
470 let cache_key = format!("{algorithm_name}_{problemsize}");
472 self.cache.insert(cache_key, best_profile.clone());
473
474 best_profile
475 }
476
477 pub fn hardware_info(&self) -> &HardwareInfo {
479 &self.hardware
480 }
481}
482
483pub struct AlgorithmTuner;
485
486impl AlgorithmTuner {
487 pub fn tune_matrix_operations(_hardware: &HardwareInfo, matrixsize: usize) -> TuningProfile {
489 let tuner = AutoTuner::new(_hardware.clone());
490
491 let mut profile = tuner.tune_for_problemsize(matrixsize * matrixsize);
492
493 if matrixsize >= 1000 {
495 profile.block_size = 64; profile.use_simd = true;
497 }
498
499 profile
500 }
501
502 pub fn tune_ode_solver(
504 hardware: &HardwareInfo,
505 system_size: usize,
506 time_steps: usize,
507 ) -> TuningProfile {
508 let tuner = AutoTuner::new(hardware.clone());
509 let problemsize = system_size * time_steps;
510
511 let mut profile = tuner.tune_for_problemsize(problemsize);
512
513 if system_size > 100 {
515 profile.use_simd = true;
516 profile.default_tolerance = 1e-8; profile.max_iterations = 50;
518 }
519
520 profile
521 }
522
523 pub fn tune_monte_carlo(
525 hardware: &HardwareInfo,
526 dimensions: usize,
527 samples: usize,
528 ) -> TuningProfile {
529 let tuner = AutoTuner::new(hardware.clone());
530
531 let mut profile = tuner.tune_for_problemsize(samples);
532
533 profile.num_threads = hardware.cpu_threads; profile.chunk_size = (samples / (hardware.cpu_threads * 8)).max(1000);
536
537 if dimensions > 10 {
538 profile.use_gpu = hardware.gpu_info.is_some(); }
540
541 profile
542 }
543}
544
#[cfg(test)]
mod tests {
    use super::*;

    /// Detected hardware must satisfy basic sanity relations.
    #[test]
    fn test_hardware_detection() {
        let hardware = HardwareDetector.detect();

        assert!(hardware.cpu_cores > 0);
        assert!(hardware.cpu_threads >= hardware.cpu_cores);
        assert!(hardware.l1_cache_size > 0);
        assert!(hardware.l2_cache_size >= hardware.l1_cache_size);
        assert!(hardware.l3_cache_size >= hardware.l2_cache_size);
        assert!(hardware.memory_size > 0);
    }

    /// Small problems stay single-threaded; large ones scale with the host.
    #[test]
    fn test_auto_tuner() {
        let tuner = AutoTuner::new(HardwareDetector.detect());
        let available_threads = tuner.hardware_info().cpu_threads;

        let small_profile = tuner.tune_for_problemsize(100);
        assert_eq!(small_profile.num_threads, 1);

        let large_profile = tuner.tune_for_problemsize(100000);
        assert!(large_profile.num_threads >= 1);
        assert!(large_profile.num_threads <= available_threads);
        // Multi-threading can only be expected when the host has more than
        // one hardware thread; single-CPU machines and containers must not
        // fail this test.
        if available_threads > 1 {
            assert!(large_profile.num_threads > 1);
        }
        assert!(large_profile.block_size > 0);
        assert!(large_profile.chunk_size > 0);
    }

    /// Algorithm presets apply their specific overrides.
    #[test]
    fn test_algorithm_specific_tuning() {
        let hardware = HardwareDetector.detect();

        let matrix_profile = AlgorithmTuner::tune_matrix_operations(&hardware, 1000);
        assert_eq!(matrix_profile.block_size, 64);

        let ode_profile = AlgorithmTuner::tune_ode_solver(&hardware, 100, 1000);
        assert!(ode_profile.max_iterations > 0);
        assert!(ode_profile.default_tolerance > 0.0);

        let mc_profile = AlgorithmTuner::tune_monte_carlo(&hardware, 5, 1000000);
        assert!(mc_profile.chunk_size > 0);
    }
}