1use crate::gpu::{GpuBackend, GpuError, GpuKernelHandle};
7use rand::Rng;
8use std::collections::HashMap;
9use std::sync::{Arc, Mutex};
10use std::time::{Duration, Instant};
11use thiserror::Error;
12
13#[derive(Error, Debug)]
15pub enum AutoTuningError {
16 #[error("No tuning configurations available for kernel: {0}")]
18 NoConfigurations(String),
19
20 #[error("Auto-tuning failed: {0}")]
22 TuningFailed(String),
23
24 #[error("Invalid parameter configuration: {0}")]
26 InvalidConfiguration(String),
27
28 #[error("Benchmark execution failed: {0}")]
30 BenchmarkFailed(String),
31
32 #[error("GPU error: {0}")]
34 GpuError(#[from] GpuError),
35}
36
37#[derive(Debug, Clone, PartialEq)]
39pub struct KernelParameters {
40 pub work_group_size: [u32; 3],
42 pub global_work_size: [u32; 3],
44 pub local_memory_size: usize,
46 pub register_usage: Option<usize>,
48 pub cacheconfig: CacheConfig,
50 pub custom_params: HashMap<String, ParameterValue>,
52}
53
54impl Default for KernelParameters {
55 fn default() -> Self {
56 Self {
57 work_group_size: [16, 16, 1],
58 global_work_size: [1024, 1024, 1],
59 local_memory_size: 0,
60 register_usage: None,
61 cacheconfig: CacheConfig::Balanced,
62 custom_params: HashMap::new(),
63 }
64 }
65}
66
/// A dynamically typed value for a custom kernel parameter.
#[derive(Debug, Clone, PartialEq)]
pub enum ParameterValue {
    /// Signed 64-bit integer.
    Int(i64),
    /// 64-bit floating point number.
    Float(f64),
    /// Owned text.
    String(String),
    /// Boolean flag.
    Bool(bool),
    /// List of signed 64-bit integers.
    IntArray(Vec<i64>),
    /// List of 64-bit floating point numbers.
    FloatArray(Vec<f64>),
}

impl ParameterValue {
    /// Returns the value as an integer.
    ///
    /// `Int` is returned unchanged; `Float` is converted with an `as` cast
    /// (fraction dropped, out-of-range values saturate, NaN becomes 0).
    /// Every other variant yields `None`.
    pub fn as_int(&self) -> Option<i64> {
        match *self {
            Self::Int(number) => Some(number),
            Self::Float(number) => Some(number as i64),
            _ => None,
        }
    }

    /// Returns the value as a float.
    ///
    /// `Float` is returned unchanged; `Int` is converted with an `as` cast.
    /// Every other variant yields `None`.
    pub fn as_float(&self) -> Option<f64> {
        match *self {
            Self::Float(number) => Some(number),
            Self::Int(number) => Some(number as f64),
            _ => None,
        }
    }

    /// Renders the value as an owned string.
    ///
    /// Scalars use their `Display` form and `String` is cloned; the array
    /// variants fall back to the `Debug` rendering of the whole value
    /// (e.g. `IntArray([1, 2])`).
    pub fn as_string(&self) -> String {
        match self {
            Self::String(text) => text.to_owned(),
            Self::Int(number) => number.to_string(),
            Self::Float(number) => number.to_string(),
            Self::Bool(flag) => flag.to_string(),
            Self::IntArray(_) | Self::FloatArray(_) => format!("{self:?}"),
        }
    }
}
114
/// Cache preference attached to a kernel configuration.
///
/// NOTE(review): nothing in this file translates a variant into a backend
/// setting; the per-variant descriptions below are inferred from the names
/// and should be confirmed against the backend that consumes them.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CacheConfig {
    /// Presumably favours L1 cache capacity.
    PreferL1,
    /// Presumably favours shared (local) memory capacity.
    PreferShared,
    /// Presumably an even split; used as the default for `KernelParameters`.
    Balanced,
    /// Presumably a read-only data path.
    ReadOnly,
    /// Presumably a write-through policy.
    WriteThrough,
}
129
130#[derive(Debug, Clone)]
132pub struct PerformanceMetrics {
133 pub execution_time: Duration,
135 pub throughput: f64,
137 pub memorybandwidth_util: f64,
139 pub compute_utilization: f64,
141 pub energy_efficiency: Option<f64>,
143 pub cache_metrics: CacheMetrics,
145}
146
147impl Default for PerformanceMetrics {
148 fn default() -> Self {
149 Self {
150 execution_time: Duration::from_millis(0),
151 throughput: 0.0,
152 memorybandwidth_util: 0.0,
153 compute_utilization: 0.0,
154 energy_efficiency: None,
155 cache_metrics: CacheMetrics::default(),
156 }
157 }
158}
159
/// Cache and memory-system counters attached to `PerformanceMetrics`.
///
/// All fields default to zero. NOTE(review): units and value ranges are not
/// established anywhere in this file — confirm with whichever code fills
/// these in.
#[derive(Clone, Debug, Default)]
pub struct CacheMetrics {
    /// L1 cache hit rate.
    pub l1_hit_rate: f64,
    /// L2 cache hit rate.
    pub l2_hit_rate: f64,
    /// Count of shared-memory conflicts.
    pub shared_memory_conflicts: usize,
    /// Memory-coalescing efficiency.
    pub coalescing_efficiency: f64,
    /// Memory throughput.
    pub memory_throughput: f64,
    /// Cache pressure indicator.
    pub cache_pressure: f64,
}
176
177#[derive(Debug, Clone)]
179pub struct TuningStrategy {
180 pub search_algorithm: SearchAlgorithm,
182 pub max_evaluations: usize,
184 pub time_budget: Duration,
186 pub benchmark_runs: usize,
188 pub convergence_threshold: f64,
190 pub use_history: bool,
192}
193
194impl Default for TuningStrategy {
195 fn default() -> Self {
196 Self {
197 search_algorithm: SearchAlgorithm::GridSearch,
198 max_evaluations: 100,
199 time_budget: Duration::from_secs(60),
200 benchmark_runs: 3,
201 convergence_threshold: 0.01, use_history: true,
203 }
204 }
205}
206
/// Strategy used to enumerate candidate configurations.
///
/// Only `GridSearch` and `RandomSearch` have dedicated generators in
/// `AutoTuner`; every other variant currently falls back to grid search.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SearchAlgorithm {
    /// Exhaustive enumeration of the tuning space.
    GridSearch,
    /// Uniform random sampling of the tuning space.
    RandomSearch,
    /// Bayesian optimisation (falls back to grid search).
    BayesianOptimization,
    /// Genetic algorithm (falls back to grid search).
    GeneticAlgorithm,
    /// Simulated annealing (falls back to grid search).
    SimulatedAnnealing,
    /// Differential evolution (falls back to grid search).
    DifferentialEvolution,
    /// Particle swarm optimisation (falls back to grid search).
    ParticleSwarm,
}
225
226#[derive(Debug, Clone)]
228pub struct TuningSpace {
229 pub work_group_sizes: Vec<[u32; 3]>,
231 pub local_memory_sizes: Vec<usize>,
233 pub cache_configs: Vec<CacheConfig>,
235 pub custom_spaces: HashMap<String, Vec<ParameterValue>>,
237}
238
239impl Default for TuningSpace {
240 fn default() -> Self {
241 Self {
242 work_group_sizes: vec![
243 [8, 8, 1],
244 [16, 16, 1],
245 [32, 32, 1],
246 [64, 8, 1],
247 [8, 64, 1],
248 [128, 1, 1],
249 [256, 1, 1],
250 [512, 1, 1],
251 ],
252 local_memory_sizes: vec![0, 1024, 2048, 4096, 8192, 16384],
253 cache_configs: vec![
254 CacheConfig::Balanced,
255 CacheConfig::PreferL1,
256 CacheConfig::PreferShared,
257 CacheConfig::ReadOnly,
258 ],
259 custom_spaces: HashMap::new(),
260 }
261 }
262}
263
264#[derive(Debug, Clone)]
266pub struct TuningResult {
267 pub best_params: KernelParameters,
269 pub best_performance: PerformanceMetrics,
271 pub evaluations: usize,
273 pub tuning_time: Duration,
275 pub converged: bool,
277 pub improvement_factor: f64,
279}
280
281#[derive(Debug)]
283pub struct AutoTuner {
284 backend: GpuBackend,
285 strategy: TuningStrategy,
286 tuning_cache: Arc<Mutex<HashMap<String, TuningResult>>>,
287 device_info: DeviceInfo,
288}
289
/// Device limits the tuner uses to validate configurations and to build
/// cache keys.
#[derive(Clone, Debug)]
struct DeviceInfo {
    /// Device generation label; included in cache keys.
    compute_capability: String,
    // Recorded for completeness; no code in this file reads it yet.
    #[allow(dead_code)]
    memory_size: usize,
    /// Maximum number of threads in one work-group.
    max_work_group_size: usize,
    /// Maximum local memory one configuration may request.
    max_local_memory_size: usize,
    // Recorded for completeness; no code in this file reads it yet.
    #[allow(dead_code)]
    warp_size: usize,
}
301
302impl AutoTuner {
303 pub fn new(backend: GpuBackend, strategy: TuningStrategy) -> Result<Self, AutoTuningError> {
305 let device_info = Self::detect_device_info(backend)?;
306
307 Ok(Self {
308 backend,
309 strategy,
310 tuning_cache: Arc::new(Mutex::new(HashMap::new())),
311 device_info,
312 })
313 }
314
315 pub fn tune(
317 &self,
318 kernel: &GpuKernelHandle,
319 kernel_name: &str,
320 problemsize: &[usize],
321 tuning_space: TuningSpace,
322 ) -> Result<TuningResult, AutoTuningError> {
323 let cache_key = self.generate_cache_key(kernel_name, problemsize);
324
325 if self.strategy.use_history {
327 if let Some(cached_result) = self
328 .tuning_cache
329 .lock()
330 .expect("Operation failed")
331 .get(&cache_key)
332 {
333 return Ok(cached_result.clone());
334 }
335 }
336
337 let start_time = Instant::now();
338 let mut best_params = KernelParameters::default();
339 let mut best_performance: Option<PerformanceMetrics> = None;
340 let mut evaluations = 0;
341
342 let configurations = self.generate_configurations(&tuning_space)?;
344
345 for (i, params) in configurations.iter().enumerate() {
346 if start_time.elapsed() > self.strategy.time_budget {
347 break;
348 }
349
350 if evaluations >= self.strategy.max_evaluations {
351 break;
352 }
353
354 match self.benchmark_configuration(kernel, params, problemsize) {
356 Ok(metrics) => {
357 evaluations += 1;
358
359 if best_performance.is_none()
360 || metrics.throughput
361 > best_performance
362 .as_ref()
363 .expect("Operation failed")
364 .throughput
365 {
366 best_params = params.clone();
367 best_performance = Some(metrics);
368 }
369
370 if let Some(ref best) = best_performance {
372 if self.check_convergence(best, i) {
373 break;
374 }
375 }
376 }
377 Err(e) => {
378 eprintln!("Benchmark failed for configuration {params:?}: {e}");
380 }
381 }
382 }
383
384 let best_performance = best_performance.ok_or_else(|| {
385 AutoTuningError::TuningFailed("No successful configurations".to_string())
386 })?;
387
388 let tuning_time = start_time.elapsed();
389 let improvement_factor = 1.0; let result = TuningResult {
392 best_params,
393 best_performance,
394 evaluations,
395 tuning_time,
396 converged: evaluations < self.strategy.max_evaluations,
397 improvement_factor,
398 };
399
400 self.tuning_cache
402 .lock()
403 .expect("Operation failed")
404 .insert(cache_key, result.clone());
405
406 Ok(result)
407 }
408
409 pub fn get_cached_results(&self) -> HashMap<String, TuningResult> {
411 self.tuning_cache.lock().expect("Operation failed").clone()
412 }
413
414 pub fn clear_cache(&self) {
416 self.tuning_cache.lock().expect("Operation failed").clear();
417 }
418
419 fn generate_configurations(
421 &self,
422 space: &TuningSpace,
423 ) -> Result<Vec<KernelParameters>, AutoTuningError> {
424 match self.strategy.search_algorithm {
425 SearchAlgorithm::GridSearch => self.grid_search_configurations(space),
426 SearchAlgorithm::RandomSearch => self.random_search_configurations(space),
427 _ => {
428 self.grid_search_configurations(space)
430 }
431 }
432 }
433
434 fn grid_search_configurations(
436 &self,
437 space: &TuningSpace,
438 ) -> Result<Vec<KernelParameters>, AutoTuningError> {
439 let mut configurations = Vec::new();
440
441 for &work_group_size in &space.work_group_sizes {
442 for &local_memory_size in &space.local_memory_sizes {
443 for &cache_config in &space.cache_configs {
444 if self.is_valid_configuration(work_group_size, local_memory_size) {
446 configurations.push(KernelParameters {
447 work_group_size,
448 global_work_size: [1024, 1024, 1], local_memory_size,
450 register_usage: None,
451 cacheconfig: cache_config,
452 custom_params: HashMap::new(),
453 });
454 }
455 }
456 }
457 }
458
459 Ok(configurations)
460 }
461
462 fn random_search_configurations(
464 &self,
465 space: &TuningSpace,
466 ) -> Result<Vec<KernelParameters>, AutoTuningError> {
467 let mut configurations = Vec::new();
468 let num_samples = self.strategy.max_evaluations.min(100);
469
470 for _ in 0..num_samples {
471 let work_group_size =
472 space.work_group_sizes[rand::rng().random_range(0..space.work_group_sizes.len())];
473 let local_memory_size = space.local_memory_sizes
474 [rand::rng().random_range(0..space.local_memory_sizes.len())];
475 let cache_config =
476 space.cache_configs[rand::rng().random_range(0..space.cache_configs.len())];
477
478 if self.is_valid_configuration(work_group_size, local_memory_size) {
479 configurations.push(KernelParameters {
480 work_group_size,
481 global_work_size: [1024, 1024, 1],
482 local_memory_size,
483 register_usage: None,
484 cacheconfig: cache_config,
485 custom_params: HashMap::new(),
486 });
487 }
488 }
489
490 Ok(configurations)
491 }
492
493 fn is_valid_configuration(&self, work_group_size: [u32; 3], local_memorysize: usize) -> bool {
495 let total_threads = work_group_size[0] * work_group_size[1] * work_group_size[2];
496
497 total_threads <= self.device_info.max_work_group_size as u32
498 && local_memorysize <= self.device_info.max_local_memory_size
499 }
500
501 fn benchmark_configuration(
503 &self,
504 kernel: &GpuKernelHandle,
505 params: &KernelParameters,
506 problemsize: &[usize],
507 ) -> Result<PerformanceMetrics, AutoTuningError> {
508 let mut execution_times = Vec::new();
509
510 for _ in 0..self.strategy.benchmark_runs {
512 let start = Instant::now();
513
514 kernel.dispatch(params.work_group_size);
516
517 let execution_time = start.elapsed();
524 execution_times.push(execution_time);
525 }
526
527 let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
529
530 let total_ops = problemsize.iter().product::<usize>() as f64;
532 let throughput = total_ops / avg_time.as_secs_f64();
533
534 Ok(PerformanceMetrics {
535 execution_time: avg_time,
536 throughput,
537 memorybandwidth_util: 0.8, compute_utilization: 0.9, energy_efficiency: None,
540 cache_metrics: CacheMetrics::default(),
541 })
542 }
543
544 fn check_convergence(&self, performance: &PerformanceMetrics, iteration: usize) -> bool {
546 iteration > 10 && iteration % 10 == 0
549 }
550
551 fn generate_cache_key(&self, kernel_name: &str, problemsize: &[usize]) -> String {
553 format!(
554 "{}_{}_{}_{:?}",
555 self.backend, self.device_info.compute_capability, kernel_name, problemsize
556 )
557 }
558
559 fn detect_device_info(backend: GpuBackend) -> Result<DeviceInfo, AutoTuningError> {
561 match backend {
563 GpuBackend::Cuda => Ok(DeviceInfo {
564 compute_capability: "8.0".to_string(),
565 memory_size: 12 * 1024 * 1024 * 1024, max_work_group_size: 1024,
567 max_local_memory_size: 48 * 1024, warp_size: 32,
569 }),
570 GpuBackend::Rocm => Ok(DeviceInfo {
571 compute_capability: "RDNA2".to_string(),
572 memory_size: 16 * 1024 * 1024 * 1024, max_work_group_size: 1024,
574 max_local_memory_size: 64 * 1024, warp_size: 64, }),
577 _ => Ok(DeviceInfo {
578 compute_capability: "Unknown".to_string(),
579 memory_size: 8 * 1024 * 1024 * 1024, max_work_group_size: 256,
581 max_local_memory_size: 16 * 1024, warp_size: 32,
583 }),
584 }
585 }
586}
587
588pub mod presets {
590 use super::*;
591
592 pub fn matrix_multiply_space() -> TuningSpace {
594 TuningSpace {
595 work_group_sizes: vec![
596 [16, 16, 1],
597 [32, 32, 1],
598 [8, 32, 1],
599 [32, 8, 1],
600 [64, 4, 1],
601 [4, 64, 1],
602 [128, 2, 1],
603 [2, 128, 1],
604 ],
605 local_memory_sizes: vec![0, 2048, 4096, 8192, 16384],
606 cache_configs: vec![CacheConfig::PreferShared, CacheConfig::Balanced],
607 custom_spaces: HashMap::new(),
608 }
609 }
610
611 pub fn convolution_space() -> TuningSpace {
613 TuningSpace {
614 work_group_sizes: vec![
615 [8, 8, 1],
616 [16, 16, 1],
617 [32, 8, 1],
618 [8, 32, 1],
619 [64, 1, 1],
620 [32, 4, 1],
621 [4, 32, 1],
622 ],
623 local_memory_sizes: vec![1024, 2048, 4096, 8192],
624 cache_configs: vec![CacheConfig::PreferL1, CacheConfig::ReadOnly],
625 custom_spaces: HashMap::new(),
626 }
627 }
628
629 pub fn reduction_space() -> TuningSpace {
631 TuningSpace {
632 work_group_sizes: vec![
633 [64, 1, 1],
634 [128, 1, 1],
635 [256, 1, 1],
636 [512, 1, 1],
637 [1024, 1, 1],
638 [32, 2, 1],
639 [16, 4, 1],
640 ],
641 local_memory_sizes: vec![512, 1024, 2048, 4096],
642 cache_configs: vec![CacheConfig::PreferShared],
643 custom_spaces: HashMap::new(),
644 }
645 }
646}
647
#[cfg(test)]
mod tests {
    use super::*;

    /// `Int` and `Float` convert into each other; the float-to-int direction
    /// drops the fraction.
    #[test]
    fn test_parameter_value_conversion() {
        let integer = ParameterValue::Int(42);
        assert_eq!(Some(42), integer.as_int());
        assert_eq!(Some(42.0), integer.as_float());

        let real = ParameterValue::Float(3.5);
        assert_eq!(Some(3.5), real.as_float());
        assert_eq!(Some(3), real.as_int());
    }

    /// The default configuration is a 16x16x1 work-group without local memory.
    #[test]
    fn test_kernel_parameters_default() {
        let KernelParameters {
            work_group_size,
            local_memory_size,
            ..
        } = KernelParameters::default();

        assert_eq!(work_group_size, [16, 16, 1]);
        assert_eq!(local_memory_size, 0);
    }

    /// The default strategy is grid search capped at 100 evaluations.
    #[test]
    fn test_tuning_strategy_default() {
        let defaults = TuningStrategy::default();

        assert_eq!(defaults.search_algorithm, SearchAlgorithm::GridSearch);
        assert_eq!(defaults.max_evaluations, 100);
    }

    /// The default tuning space offers work-group sizes and cache preferences.
    #[test]
    fn test_tuning_space_default() {
        let defaults = TuningSpace::default();

        assert!(!defaults.work_group_sizes.is_empty());
        assert!(!defaults.cache_configs.is_empty());
    }

    /// The matrix-multiply preset contains the 16x16 tile and prefers shared memory.
    #[test]
    fn testmatrix_multiply_preset() {
        let preset = presets::matrix_multiply_space();

        assert!(preset.work_group_sizes.contains(&[16, 16, 1]));
        assert!(preset.cache_configs.contains(&CacheConfig::PreferShared));
    }

    /// Device detection succeeds for CUDA and reports non-zero limits.
    #[test]
    fn test_device_info_detection() {
        let detected = AutoTuner::detect_device_info(GpuBackend::Cuda);
        assert!(detected.is_ok());

        let DeviceInfo {
            max_work_group_size,
            max_local_memory_size,
            ..
        } = detected.expect("Operation failed");
        assert!(max_work_group_size > 0);
        assert!(max_local_memory_size > 0);
    }
}