use std::collections::HashMap;
use std::sync::OnceLock;
use std::time::{Duration, Instant};

/// Summary of the hardware the process is running on, used to pick
/// tuning parameters.
#[derive(Debug, Clone)]
pub struct HardwareInfo {
    /// Physical CPU cores (estimated).
    pub cpu_cores: usize,
    /// Logical CPU threads.
    pub cpu_threads: usize,
    /// CPU model description.
    pub cpu_model: String,
    /// L1 data cache size in bytes (per core).
    pub l1_cache_size: usize,
    /// L2 cache size in bytes.
    pub l2_cache_size: usize,
    /// L3 cache size in bytes.
    pub l3_cache_size: usize,
    /// Total system memory in bytes.
    pub memory_size: usize,
    /// SIMD instruction sets available at runtime.
    pub simd_features: Vec<SimdFeature>,
    /// Estimated memory bandwidth in bytes per second, if measured.
    pub memory_bandwidth: Option<f64>,
    /// GPU details, if a usable GPU backend was found.
    pub gpu_info: Option<GpuInfo>,
}

/// SIMD instruction-set extensions that can be detected at runtime.
#[derive(Debug, Clone)]
pub enum SimdFeature {
    SSE,
    SSE2,
    SSE3,
    SSSE3,
    SSE41,
    SSE42,
    AVX,
    AVX2,
    AVX512F,
    FMA,
    NEON,
}

/// Basic description of a detected GPU.
#[derive(Debug, Clone)]
pub struct GpuInfo {
    /// Vendor name ("NVIDIA", "AMD", "Apple", ...).
    pub vendor: String,
    /// Device model string as reported by the backend.
    pub model: String,
    /// Device memory in bytes (0 if unknown).
    pub memory_size: usize,
    /// Rough compute-unit count (backend-dependent).
    pub compute_units: usize,
}

/// Detects hardware characteristics of the current machine.
pub struct HardwareDetector;

impl HardwareDetector {
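    /// Detects the hardware once; later calls return a cached copy.
    ///
    /// A minimal usage sketch (the surrounding module path is assumed, so
    /// the example is not compiled as a doctest):
    ///
    /// ```ignore
    /// let hardware = HardwareDetector.detect();
    /// println!(
    ///     "{} threads, {} KiB L2 cache",
    ///     hardware.cpu_threads,
    ///     hardware.l2_cache_size / 1024
    /// );
    /// ```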
    pub fn detect(&self) -> HardwareInfo {
        // Run detection once per process and cache the result.
        static HARDWARE_INFO: OnceLock<HardwareInfo> = OnceLock::new();

        HARDWARE_INFO.get_or_init(Self::detect_hardware).clone()
    }

    fn detect_hardware() -> HardwareInfo {
        let cpu_cores = Self::detect_cpu_cores();
        let cpu_threads = Self::detect_cpu_threads();
        let cpu_model = Self::detect_cpu_model();
        let (l1_cache_size, l2_cache_size, l3_cache_size) = Self::detect_cache_sizes();
        let memory_size = Self::detect_memory_size();
        let simd_features = Self::detect_simd_features();
        let memory_bandwidth = Self::estimate_memory_bandwidth();
        let gpu_info = Self::detect_gpu();

        HardwareInfo {
            cpu_cores,
            cpu_threads,
            cpu_model,
            l1_cache_size,
            l2_cache_size,
            l3_cache_size,
            memory_size,
            simd_features,
            memory_bandwidth,
            gpu_info,
        }
    }

    fn detect_cpu_cores() -> usize {
        // `available_parallelism` reports logical threads; assume two
        // threads per physical core (SMT) and never report zero cores.
        std::thread::available_parallelism()
            .map(|n| n.get() / 2)
            .unwrap_or(1)
            .max(1)
    }

    fn detect_cpu_threads() -> usize {
        std::thread::available_parallelism()
            .ok()
            .map(|n| n.get())
            .unwrap_or(1)
    }

    fn detect_cpu_model() -> String {
        // Placeholder: report the target architecture rather than probing
        // the OS for the exact model string.
        format!("{} CPU", std::env::consts::ARCH)
    }

    fn detect_cache_sizes() -> (usize, usize, usize) {
        // Conservative per-architecture defaults; the OS is not queried.
        #[cfg(target_arch = "x86_64")]
        {
            (32 * 1024, 256 * 1024, 8 * 1024 * 1024)
        }

        #[cfg(target_arch = "aarch64")]
        {
            (64 * 1024, 512 * 1024, 4 * 1024 * 1024)
        }

        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            (32 * 1024, 256 * 1024, 2 * 1024 * 1024)
        }
    }

    fn detect_memory_size() -> usize {
        // Conservative default: assume 8 GiB of system memory.
        8 * 1024 * 1024 * 1024
    }

    fn detect_simd_features() -> Vec<SimdFeature> {
        // `mut` is unused on targets with no detectable SIMD features.
        #[allow(unused_mut)]
        let mut features = Vec::new();

        #[cfg(target_arch = "x86_64")]
        {
            if std::is_x86_feature_detected!("sse") {
                features.push(SimdFeature::SSE);
            }
            if std::is_x86_feature_detected!("sse2") {
                features.push(SimdFeature::SSE2);
            }
            if std::is_x86_feature_detected!("sse3") {
                features.push(SimdFeature::SSE3);
            }
            if std::is_x86_feature_detected!("ssse3") {
                features.push(SimdFeature::SSSE3);
            }
            if std::is_x86_feature_detected!("sse4.1") {
                features.push(SimdFeature::SSE41);
            }
            if std::is_x86_feature_detected!("sse4.2") {
                features.push(SimdFeature::SSE42);
            }
            if std::is_x86_feature_detected!("avx") {
                features.push(SimdFeature::AVX);
            }
            if std::is_x86_feature_detected!("avx2") {
                features.push(SimdFeature::AVX2);
            }
            if std::is_x86_feature_detected!("avx512f") {
                features.push(SimdFeature::AVX512F);
            }
            if std::is_x86_feature_detected!("fma") {
                features.push(SimdFeature::FMA);
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            // NEON is mandatory on AArch64, so no runtime check is needed.
            features.push(SimdFeature::NEON);
        }

        features
    }

    fn estimate_memory_bandwidth() -> Option<f64> {
        // Crude single-pass estimate: time a sequential read of 10 MiB.
        let size = 10 * 1024 * 1024;
        let data: Vec<u64> = vec![1; size / 8];

        let start = Instant::now();
        let sum: u64 = data.iter().sum();
        let duration = start.elapsed();

        // Keep `sum` observable so the optimizer cannot elide the reads.
        std::hint::black_box(sum);

        if duration.as_nanos() > 0 {
            let bytes_per_second = (size as f64) / duration.as_secs_f64();
            Some(bytes_per_second)
        } else {
            None
        }
    }

    fn detect_gpu() -> Option<GpuInfo> {
        let detection_result = scirs2_core::gpu::backends::detect_gpu_backends();

        // Report the first non-CPU device, if any.
        detection_result
            .devices
            .into_iter()
            .find(|device| device.backend != scirs2_core::gpu::GpuBackend::Cpu)
            .map(|device| GpuInfo {
                vendor: match device.backend {
                    scirs2_core::gpu::GpuBackend::Cuda => "NVIDIA".to_string(),
                    scirs2_core::gpu::GpuBackend::Rocm => "AMD".to_string(),
                    scirs2_core::gpu::GpuBackend::Metal => "Apple".to_string(),
                    scirs2_core::gpu::GpuBackend::OpenCL => "Unknown".to_string(),
                    scirs2_core::gpu::GpuBackend::Wgpu => "WebGPU".to_string(),
                    scirs2_core::gpu::GpuBackend::Cpu => "CPU".to_string(),
                },
                model: device.device_name,
                memory_size: device.memory_bytes.unwrap_or(0) as usize,
                // The backend does not expose a compute-unit count; use
                // tensor support as a coarse stand-in.
                compute_units: if device.supports_tensors { 1 } else { 0 },
            })
    }
}
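/// Tuning parameters chosen for a particular problem size and machine.
///
/// A sketch of how a caller might consume a profile (the solver methods
/// shown are hypothetical, not part of this module):
///
/// ```ignore
/// let profile = AutoTuner::auto().tune_for_problemsize(n);
/// solver.set_threads(profile.num_threads);   // hypothetical API
/// solver.set_block_size(profile.block_size); // hypothetical API
/// ```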
#[derive(Debug, Clone)]
pub struct TuningProfile {
    /// Worker threads to use.
    pub num_threads: usize,
    /// Cache-blocking size in elements.
    pub block_size: usize,
    /// Work-chunk size per thread in elements.
    pub chunk_size: usize,
    /// Whether to use SIMD kernels.
    pub use_simd: bool,
    /// Suggested memory-pool size in bytes.
    pub memory_pool_size: usize,
    /// Default convergence tolerance.
    pub default_tolerance: f64,
    /// Iteration cap for iterative methods.
    pub max_iterations: usize,
    /// Whether to offload to a GPU.
    pub use_gpu: bool,
}
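/// Chooses tuning parameters from detected hardware and caches the results
/// of benchmark-driven tuning.
///
/// A minimal usage sketch (module path assumed, so not compiled as a
/// doctest):
///
/// ```ignore
/// let tuner = AutoTuner::auto();
/// let profile = tuner.tune_for_problemsize(50_000);
/// assert!(profile.num_threads >= 1);
/// ```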
pub struct AutoTuner {
    hardware: HardwareInfo,
    cache: HashMap<String, TuningProfile>,
}

impl AutoTuner {
    /// Creates a tuner for the given hardware description.
    pub fn new(hardware: HardwareInfo) -> Self {
        Self {
            hardware,
            cache: HashMap::new(),
        }
    }

    /// Builds a tuner from freshly detected hardware.
    pub fn auto() -> Self {
        Self::new(HardwareDetector.detect())
    }

    /// Returns a tuning profile for the given problem size, reusing a
    /// cached profile from a previous `benchmark_tune` run when available.
    pub fn tune_for_problemsize(&self, problemsize: usize) -> TuningProfile {
        let cache_key = format!("size_{problemsize}");

        if let Some(cached) = self.cache.get(&cache_key) {
            return cached.clone();
        }

        self.compute_tuning_profile(problemsize)
    }

    fn compute_tuning_profile(&self, problemsize: usize) -> TuningProfile {
        let num_threads = self.optimal_thread_count(problemsize);
        let block_size = self.optimal_block_size(problemsize);
        let chunk_size = Self::optimal_chunk_size(problemsize, num_threads);

        // SIMD only pays off once there is enough data to vectorize.
        let use_simd = !self.hardware.simd_features.is_empty() && problemsize >= 64;

        let memory_pool_size = self.optimal_memory_pool_size(problemsize);
        let (default_tolerance, max_iterations) = Self::optimal_tolerances(problemsize);

        // GPU offload only pays off for large problems.
        let use_gpu = self.hardware.gpu_info.is_some() && problemsize >= 10000;

        TuningProfile {
            num_threads,
            block_size,
            chunk_size,
            use_simd,
            memory_pool_size,
            default_tolerance,
            max_iterations,
            use_gpu,
        }
    }

    fn optimal_thread_count(&self, problemsize: usize) -> usize {
        let max_threads = self.hardware.cpu_threads;

        if problemsize < 1000 {
            // Too small to amortize threading overhead.
            1
        } else if problemsize < 10000 {
            (max_threads / 2).clamp(1, 4)
        } else {
            // Keep at least ~1000 elements of work per thread.
            max_threads.min(problemsize / 1000)
        }
    }

    fn optimal_block_size(&self, problemsize: usize) -> usize {
        // Element counts assume 8-byte (f64) values.
        let l1_elements = self.hardware.l1_cache_size / 8;
        let l2_elements = self.hardware.l2_cache_size / 8;

        if problemsize <= l1_elements {
            problemsize
        } else if problemsize <= l2_elements {
            l1_elements / 4
        } else {
            l2_elements / 16
        }
    }

    fn optimal_chunk_size(problemsize: usize, numthreads: usize) -> usize {
        if numthreads <= 1 {
            problemsize
        } else {
            // Aim for roughly four chunks per thread for load balancing,
            // but keep chunks at least 100 elements to bound overhead.
            let min_chunk = 100;
            let ideal_chunk = problemsize / (numthreads * 4);
            ideal_chunk.max(min_chunk)
        }
    }

    fn optimal_memory_pool_size(&self, problemsize: usize) -> usize {
        // Room for about four f64 working arrays, capped at one eighth of
        // system memory, with a 1 MiB floor.
        let base_size = problemsize * 8 * 4;
        let max_pool = self.hardware.memory_size / 8;
        base_size.min(max_pool).max(1024 * 1024)
    }

    fn optimal_tolerances(problemsize: usize) -> (f64, usize) {
        if problemsize < 1000 {
            // Small problems: tight tolerance, few iterations needed.
            (1e-12, 100)
        } else if problemsize < 100000 {
            (1e-10, 500)
        } else {
            // Large problems: relax tolerance, allow more iterations.
            (1e-8, 1000)
        }
    }
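    /// Benchmarks a few thread counts and block sizes around the heuristic
    /// profile and caches the fastest variant.
    ///
    /// A sketch of a caller-supplied benchmark closure (the timed kernel is
    /// hypothetical, and the example is not compiled as a doctest):
    ///
    /// ```ignore
    /// let profile = tuner.benchmark_tune(
    ///     "dot_product",
    ///     |p| {
    ///         let start = std::time::Instant::now();
    ///         run_dot_product_with(p); // hypothetical kernel
    ///         start.elapsed()
    ///     },
    ///     100_000,
    /// );
    /// ```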
    pub fn benchmark_tune(
        &mut self,
        algorithm_name: &str,
        benchmark_fn: impl Fn(&TuningProfile) -> Duration,
        problemsize: usize,
    ) -> TuningProfile {
        let base_profile = self.tune_for_problemsize(problemsize);

        let mut best_profile = base_profile.clone();
        let mut best_time = benchmark_fn(&base_profile);

        // Try a few thread counts around the heuristic choice.
        for threads in [1, 2, 4, 8, 16] {
            if threads <= self.hardware.cpu_threads {
                let mut profile = base_profile.clone();
                profile.num_threads = threads;
                profile.chunk_size = Self::optimal_chunk_size(problemsize, threads);

                let time = benchmark_fn(&profile);
                if time < best_time {
                    best_time = time;
                    best_profile = profile;
                }
            }
        }

        // Scale the block size up and down around the heuristic choice.
        for &factor in &[0.5, 1.0, 2.0, 4.0] {
            let mut profile = best_profile.clone();
            profile.block_size = ((base_profile.block_size as f64) * factor) as usize;
            profile.block_size = profile.block_size.max(32).min(problemsize);

            let time = benchmark_fn(&profile);
            if time < best_time {
                best_time = time;
                best_profile = profile;
            }
        }

        // Cache under both the per-algorithm key and the generic size key so
        // `tune_for_problemsize` can reuse the benchmarked result.
        let cache_key = format!("{algorithm_name}_{problemsize}");
        self.cache.insert(cache_key, best_profile.clone());
        self.cache
            .insert(format!("size_{problemsize}"), best_profile.clone());

        best_profile
    }

    /// Returns the hardware description this tuner was built from.
    pub fn hardware_info(&self) -> &HardwareInfo {
        &self.hardware
    }
}
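/// Algorithm-specific presets layered on top of `AutoTuner`.
///
/// A minimal usage sketch (module path assumed, so not compiled as a
/// doctest):
///
/// ```ignore
/// let hardware = HardwareDetector.detect();
/// let profile = AlgorithmTuner::tune_monte_carlo(&hardware, 5, 1_000_000);
/// assert!(profile.chunk_size >= 1000);
/// ```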
pub struct AlgorithmTuner;

impl AlgorithmTuner {
    pub fn tune_matrix_operations(hardware: &HardwareInfo, matrixsize: usize) -> TuningProfile {
        let tuner = AutoTuner::new(hardware.clone());
        let mut profile = tuner.tune_for_problemsize(matrixsize * matrixsize);

        if matrixsize >= 1000 {
            // Large matrices: fixed cache-friendly blocking plus SIMD.
            profile.block_size = 64;
            profile.use_simd = true;
        }

        profile
    }

    pub fn tune_ode_solver(
        hardware: &HardwareInfo,
        system_size: usize,
        time_steps: usize,
    ) -> TuningProfile {
        let tuner = AutoTuner::new(hardware.clone());
        let problemsize = system_size * time_steps;

        let mut profile = tuner.tune_for_problemsize(problemsize);

        if system_size > 100 {
            // Large ODE systems: vectorize, and trade per-step tolerance
            // for a tighter iteration cap.
            profile.use_simd = true;
            profile.default_tolerance = 1e-8;
            profile.max_iterations = 50;
        }

        profile
    }

    pub fn tune_monte_carlo(
        hardware: &HardwareInfo,
        dimensions: usize,
        samples: usize,
    ) -> TuningProfile {
        let tuner = AutoTuner::new(hardware.clone());
        let mut profile = tuner.tune_for_problemsize(samples);

        // Monte Carlo is embarrassingly parallel: use every thread, about
        // eight chunks per thread, and at least 1000 samples per chunk.
        profile.num_threads = hardware.cpu_threads;
        profile.chunk_size = (samples / (hardware.cpu_threads * 8)).max(1000);

        if dimensions > 10 {
            // High-dimensional integrands benefit most from GPU offload.
            profile.use_gpu = hardware.gpu_info.is_some();
        }

        profile
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hardware_detection() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        assert!(hardware.cpu_cores > 0);
        assert!(hardware.cpu_threads >= hardware.cpu_cores);
        assert!(hardware.l1_cache_size > 0);
        assert!(hardware.l2_cache_size >= hardware.l1_cache_size);
        assert!(hardware.l3_cache_size >= hardware.l2_cache_size);
        assert!(hardware.memory_size > 0);
    }

    #[test]
    fn test_auto_tuner() {
        let detector = HardwareDetector;
        let hardware = detector.detect();
        let tuner = AutoTuner::new(hardware.clone());

        let small_profile = tuner.tune_for_problemsize(100);
        assert_eq!(small_profile.num_threads, 1);

        let large_profile = tuner.tune_for_problemsize(100000);
        // Only expect multiple threads when the machine actually has them.
        if hardware.cpu_threads > 1 {
            assert!(large_profile.num_threads > 1);
        }
        assert!(large_profile.block_size > 0);
        assert!(large_profile.chunk_size > 0);
    }
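
    // A sketch of benchmark-driven tuning: the "benchmark" just sums a
    // vector in chunks, standing in for a real kernel so the test stays
    // fast and self-contained.
    #[test]
    fn test_benchmark_tune() {
        let detector = HardwareDetector;
        let hardware = detector.detect();
        let mut tuner = AutoTuner::new(hardware);

        let problemsize = 10_000;
        let data: Vec<f64> = vec![1.0; problemsize];

        let profile = tuner.benchmark_tune(
            "sum",
            |p| {
                let start = Instant::now();
                let sum: f64 = data
                    .chunks(p.chunk_size.max(1))
                    .map(|c| c.iter().sum::<f64>())
                    .sum();
                std::hint::black_box(sum);
                start.elapsed()
            },
            problemsize,
        );

        assert!(profile.num_threads >= 1);
        assert!(profile.block_size >= 32);
    }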

    #[test]
    fn test_algorithm_specific_tuning() {
        let detector = HardwareDetector;
        let hardware = detector.detect();

        let matrix_profile = AlgorithmTuner::tune_matrix_operations(&hardware, 1000);
        assert_eq!(matrix_profile.block_size, 64);

        let ode_profile = AlgorithmTuner::tune_ode_solver(&hardware, 100, 1000);
        assert!(ode_profile.max_iterations > 0);
        assert!(ode_profile.default_tolerance > 0.0);

        let mc_profile = AlgorithmTuner::tune_monte_carlo(&hardware, 5, 1000000);
        assert!(mc_profile.chunk_size > 0);
    }
}