use crate::gpu::{GpuBackend, GpuError, GpuKernelHandle};
use rand::{Rng, RngExt};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use thiserror::Error;
/// Errors produced while auto-tuning a GPU kernel.
#[derive(Error, Debug)]
pub enum AutoTuningError {
    /// The tuning space yielded no candidate configurations for the named kernel.
    #[error("No tuning configurations available for kernel: {0}")]
    NoConfigurations(String),
    /// The search ran but produced no usable result (e.g. every benchmark failed).
    #[error("Auto-tuning failed: {0}")]
    TuningFailed(String),
    /// A candidate parameter set was rejected as invalid.
    #[error("Invalid parameter configuration: {0}")]
    InvalidConfiguration(String),
    /// A single benchmark execution failed.
    #[error("Benchmark execution failed: {0}")]
    BenchmarkFailed(String),
    /// An underlying GPU backend error, converted automatically via `From`.
    #[error("GPU error: {0}")]
    GpuError(#[from] GpuError),
}
/// A concrete set of launch parameters for one GPU kernel dispatch.
#[derive(Debug, Clone, PartialEq)]
pub struct KernelParameters {
    /// Threads per work-group along (x, y, z).
    pub work_group_size: [u32; 3],
    /// Total global work items along (x, y, z).
    pub global_work_size: [u32; 3],
    /// Local/shared memory requested per work-group, in bytes.
    pub local_memory_size: usize,
    /// Optional per-thread register budget, when the backend supports one.
    pub register_usage: Option<usize>,
    /// Cache preference hint for the launch.
    // NOTE(review): name breaks snake_case (`cache_config`); renaming a pub
    // field is an API break, so it is left unchanged here.
    pub cacheconfig: CacheConfig,
    /// Backend- or kernel-specific extra tuning parameters.
    pub custom_params: HashMap<String, ParameterValue>,
}
impl Default for KernelParameters {
fn default() -> Self {
Self {
work_group_size: [16, 16, 1],
global_work_size: [1024, 1024, 1],
local_memory_size: 0,
register_usage: None,
cacheconfig: CacheConfig::Balanced,
custom_params: HashMap::new(),
}
}
}
/// A dynamically-typed value for kernel-specific tuning parameters.
#[derive(Debug, Clone, PartialEq)]
pub enum ParameterValue {
    Int(i64),
    Float(f64),
    String(String),
    Bool(bool),
    IntArray(Vec<i64>),
    FloatArray(Vec<f64>),
}

impl ParameterValue {
    /// Returns the value as an integer when it is numeric.
    ///
    /// Floats are truncated toward zero by the `as` cast; all non-numeric
    /// variants yield `None`.
    pub fn as_int(&self) -> Option<i64> {
        match self {
            ParameterValue::Int(val) => Some(*val),
            ParameterValue::Float(val) => Some(*val as i64),
            _ => None,
        }
    }

    /// Returns the value as a float when it is numeric; all non-numeric
    /// variants yield `None`.
    pub fn as_float(&self) -> Option<f64> {
        match self {
            ParameterValue::Float(val) => Some(*val),
            ParameterValue::Int(val) => Some(*val as f64),
            _ => None,
        }
    }

    /// Renders the value as a display string.
    ///
    /// Scalar variants use their natural `to_string`/clone form; the array
    /// variants fall back to the `Debug` representation.
    pub fn as_string(&self) -> String {
        match self {
            ParameterValue::String(val) => val.clone(),
            ParameterValue::Int(val) => val.to_string(),
            ParameterValue::Float(val) => val.to_string(),
            ParameterValue::Bool(val) => val.to_string(),
            // Explicit arms instead of a catch-all so that adding a new
            // variant forces this match to be revisited by the compiler.
            ParameterValue::IntArray(_) | ParameterValue::FloatArray(_) => format!("{self:?}"),
        }
    }
}
/// Cache configuration hint applied when launching a kernel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheConfig {
    /// Favor L1 cache capacity over shared memory.
    PreferL1,
    /// Favor shared (local) memory capacity over L1.
    PreferShared,
    /// Split cache resources evenly between L1 and shared memory.
    Balanced,
    /// Hint that accesses are read-only.
    ReadOnly,
    /// Hint for a write-through caching policy.
    WriteThrough,
}
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
pub execution_time: Duration,
pub throughput: f64,
pub memorybandwidth_util: f64,
pub compute_utilization: f64,
pub energy_efficiency: Option<f64>,
pub cache_metrics: CacheMetrics,
}
impl Default for PerformanceMetrics {
fn default() -> Self {
Self {
execution_time: Duration::from_millis(0),
throughput: 0.0,
memorybandwidth_util: 0.0,
compute_utilization: 0.0,
energy_efficiency: None,
cache_metrics: CacheMetrics::default(),
}
}
}
/// Cache behavior statistics gathered (or estimated) for a kernel run.
#[derive(Debug, Clone, Default)]
pub struct CacheMetrics {
    /// L1 cache hit rate.
    pub l1_hit_rate: f64,
    /// L2 cache hit rate.
    pub l2_hit_rate: f64,
    /// Number of shared-memory bank conflicts observed.
    pub shared_memory_conflicts: usize,
    /// Memory-access coalescing efficiency.
    pub coalescing_efficiency: f64,
    /// Achieved memory throughput.
    pub memory_throughput: f64,
    /// Relative pressure on the cache hierarchy.
    pub cache_pressure: f64,
}
/// Controls how the auto-tuner searches the configuration space.
#[derive(Debug, Clone)]
pub struct TuningStrategy {
    /// Which search algorithm generates candidate configurations.
    pub search_algorithm: SearchAlgorithm,
    /// Hard cap on the number of benchmarked configurations.
    pub max_evaluations: usize,
    /// Wall-clock budget for the whole tuning run.
    pub time_budget: Duration,
    /// Benchmark repetitions per configuration (results are averaged).
    pub benchmark_runs: usize,
    /// Relative improvement below which progress counts as converged.
    pub convergence_threshold: f64,
    /// Reuse cached results for previously tuned (kernel, problem size) pairs.
    pub use_history: bool,
}
impl Default for TuningStrategy {
fn default() -> Self {
Self {
search_algorithm: SearchAlgorithm::GridSearch,
max_evaluations: 100,
time_budget: Duration::from_secs(60),
benchmark_runs: 3,
convergence_threshold: 0.01, use_history: true,
}
}
}
/// Search algorithm used to explore the tuning space.
///
/// NOTE(review): only `GridSearch` and `RandomSearch` currently have dedicated
/// implementations; the remaining variants fall back to grid search (see
/// `AutoTuner::generate_configurations`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchAlgorithm {
    GridSearch,
    RandomSearch,
    BayesianOptimization,
    GeneticAlgorithm,
    SimulatedAnnealing,
    DifferentialEvolution,
    ParticleSwarm,
}
/// The set of candidate values the tuner may combine into configurations.
#[derive(Debug, Clone)]
pub struct TuningSpace {
    /// Candidate work-group sizes (x, y, z).
    pub work_group_sizes: Vec<[u32; 3]>,
    /// Candidate local-memory allocations, in bytes.
    pub local_memory_sizes: Vec<usize>,
    /// Candidate cache configuration hints.
    pub cache_configs: Vec<CacheConfig>,
    /// Candidate values for kernel-specific custom parameters, keyed by name.
    pub custom_spaces: HashMap<String, Vec<ParameterValue>>,
}
impl Default for TuningSpace {
fn default() -> Self {
Self {
work_group_sizes: vec![
[8, 8, 1],
[16, 16, 1],
[32, 32, 1],
[64, 8, 1],
[8, 64, 1],
[128, 1, 1],
[256, 1, 1],
[512, 1, 1],
],
local_memory_sizes: vec![0, 1024, 2048, 4096, 8192, 16384],
cache_configs: vec![
CacheConfig::Balanced,
CacheConfig::PreferL1,
CacheConfig::PreferShared,
CacheConfig::ReadOnly,
],
custom_spaces: HashMap::new(),
}
}
}
/// Outcome of a completed tuning run.
#[derive(Debug, Clone)]
pub struct TuningResult {
    /// The best-performing parameter set found.
    pub best_params: KernelParameters,
    /// Measured performance of `best_params`.
    pub best_performance: PerformanceMetrics,
    /// Number of configurations that were successfully benchmarked.
    pub evaluations: usize,
    /// Total wall-clock time spent tuning.
    pub tuning_time: Duration,
    /// Whether the search stopped because progress plateaued rather than
    /// because a budget was exhausted.
    pub converged: bool,
    /// Throughput of the best configuration relative to a baseline.
    pub improvement_factor: f64,
}
/// Auto-tuner that benchmarks candidate kernel configurations on a GPU
/// backend and caches the best result per (kernel, problem size).
#[derive(Debug)]
pub struct AutoTuner {
    /// GPU backend the kernels are dispatched on.
    backend: GpuBackend,
    /// Search strategy and budgets for tuning runs.
    strategy: TuningStrategy,
    /// Shared cache of finished tuning results, keyed by `generate_cache_key`.
    tuning_cache: Arc<Mutex<HashMap<String, TuningResult>>>,
    /// Capabilities of the device tuning is performed for.
    device_info: DeviceInfo,
}
/// Static device capabilities used to validate candidate configurations.
#[derive(Debug, Clone)]
struct DeviceInfo {
    /// Backend-specific capability identifier (e.g. CUDA compute capability).
    compute_capability: String,
    /// Device memory size in bytes (not yet consulted by the tuner).
    #[allow(dead_code)]
    memory_size: usize,
    /// Maximum threads per work-group the device supports.
    max_work_group_size: usize,
    /// Maximum local/shared memory per work-group, in bytes.
    max_local_memory_size: usize,
    /// SIMD width (warp/wavefront size; not yet consulted by the tuner).
    #[allow(dead_code)]
    warp_size: usize,
}
impl AutoTuner {
pub fn new(backend: GpuBackend, strategy: TuningStrategy) -> Result<Self, AutoTuningError> {
let device_info = Self::detect_device_info(backend)?;
Ok(Self {
backend,
strategy,
tuning_cache: Arc::new(Mutex::new(HashMap::new())),
device_info,
})
}
pub fn tune(
&self,
kernel: &GpuKernelHandle,
kernel_name: &str,
problemsize: &[usize],
tuning_space: TuningSpace,
) -> Result<TuningResult, AutoTuningError> {
let cache_key = self.generate_cache_key(kernel_name, problemsize);
if self.strategy.use_history {
if let Some(cached_result) = self
.tuning_cache
.lock()
.expect("Operation failed")
.get(&cache_key)
{
return Ok(cached_result.clone());
}
}
let start_time = Instant::now();
let mut best_params = KernelParameters::default();
let mut best_performance: Option<PerformanceMetrics> = None;
let mut evaluations = 0;
let configurations = self.generate_configurations(&tuning_space)?;
for (i, params) in configurations.iter().enumerate() {
if start_time.elapsed() > self.strategy.time_budget {
break;
}
if evaluations >= self.strategy.max_evaluations {
break;
}
match self.benchmark_configuration(kernel, params, problemsize) {
Ok(metrics) => {
evaluations += 1;
if best_performance.is_none()
|| metrics.throughput
> best_performance
.as_ref()
.expect("Operation failed")
.throughput
{
best_params = params.clone();
best_performance = Some(metrics);
}
if let Some(ref best) = best_performance {
if self.check_convergence(best, i) {
break;
}
}
}
Err(e) => {
eprintln!("Benchmark failed for configuration {params:?}: {e}");
}
}
}
let best_performance = best_performance.ok_or_else(|| {
AutoTuningError::TuningFailed("No successful configurations".to_string())
})?;
let tuning_time = start_time.elapsed();
let improvement_factor = 1.0;
let result = TuningResult {
best_params,
best_performance,
evaluations,
tuning_time,
converged: evaluations < self.strategy.max_evaluations,
improvement_factor,
};
self.tuning_cache
.lock()
.expect("Operation failed")
.insert(cache_key, result.clone());
Ok(result)
}
pub fn get_cached_results(&self) -> HashMap<String, TuningResult> {
self.tuning_cache.lock().expect("Operation failed").clone()
}
pub fn clear_cache(&self) {
self.tuning_cache.lock().expect("Operation failed").clear();
}
fn generate_configurations(
&self,
space: &TuningSpace,
) -> Result<Vec<KernelParameters>, AutoTuningError> {
match self.strategy.search_algorithm {
SearchAlgorithm::GridSearch => self.grid_search_configurations(space),
SearchAlgorithm::RandomSearch => self.random_search_configurations(space),
_ => {
self.grid_search_configurations(space)
}
}
}
fn grid_search_configurations(
&self,
space: &TuningSpace,
) -> Result<Vec<KernelParameters>, AutoTuningError> {
let mut configurations = Vec::new();
for &work_group_size in &space.work_group_sizes {
for &local_memory_size in &space.local_memory_sizes {
for &cache_config in &space.cache_configs {
if self.is_valid_configuration(work_group_size, local_memory_size) {
configurations.push(KernelParameters {
work_group_size,
global_work_size: [1024, 1024, 1], local_memory_size,
register_usage: None,
cacheconfig: cache_config,
custom_params: HashMap::new(),
});
}
}
}
}
Ok(configurations)
}
fn random_search_configurations(
&self,
space: &TuningSpace,
) -> Result<Vec<KernelParameters>, AutoTuningError> {
let mut configurations = Vec::new();
let num_samples = self.strategy.max_evaluations.min(100);
for _ in 0..num_samples {
let work_group_size =
space.work_group_sizes[rand::rng().random_range(0..space.work_group_sizes.len())];
let local_memory_size = space.local_memory_sizes
[rand::rng().random_range(0..space.local_memory_sizes.len())];
let cache_config =
space.cache_configs[rand::rng().random_range(0..space.cache_configs.len())];
if self.is_valid_configuration(work_group_size, local_memory_size) {
configurations.push(KernelParameters {
work_group_size,
global_work_size: [1024, 1024, 1],
local_memory_size,
register_usage: None,
cacheconfig: cache_config,
custom_params: HashMap::new(),
});
}
}
Ok(configurations)
}
fn is_valid_configuration(&self, work_group_size: [u32; 3], local_memorysize: usize) -> bool {
let total_threads = work_group_size[0] * work_group_size[1] * work_group_size[2];
total_threads <= self.device_info.max_work_group_size as u32
&& local_memorysize <= self.device_info.max_local_memory_size
}
fn benchmark_configuration(
&self,
kernel: &GpuKernelHandle,
params: &KernelParameters,
problemsize: &[usize],
) -> Result<PerformanceMetrics, AutoTuningError> {
let mut execution_times = Vec::new();
for _ in 0..self.strategy.benchmark_runs {
let start = Instant::now();
kernel.dispatch(params.work_group_size);
let execution_time = start.elapsed();
execution_times.push(execution_time);
}
let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
let total_ops = problemsize.iter().product::<usize>() as f64;
let throughput = total_ops / avg_time.as_secs_f64();
Ok(PerformanceMetrics {
execution_time: avg_time,
throughput,
memorybandwidth_util: 0.8, compute_utilization: 0.9, energy_efficiency: None,
cache_metrics: CacheMetrics::default(),
})
}
fn check_convergence(&self, performance: &PerformanceMetrics, iteration: usize) -> bool {
iteration > 10 && iteration % 10 == 0
}
fn generate_cache_key(&self, kernel_name: &str, problemsize: &[usize]) -> String {
format!(
"{}_{}_{}_{:?}",
self.backend, self.device_info.compute_capability, kernel_name, problemsize
)
}
fn detect_device_info(backend: GpuBackend) -> Result<DeviceInfo, AutoTuningError> {
match backend {
GpuBackend::Cuda => Ok(DeviceInfo {
compute_capability: "8.0".to_string(),
memory_size: (12u64 * 1024 * 1024 * 1024) as usize, max_work_group_size: 1024,
max_local_memory_size: 48 * 1024, warp_size: 32,
}),
GpuBackend::Rocm => Ok(DeviceInfo {
compute_capability: "RDNA2".to_string(),
memory_size: (16u64 * 1024 * 1024 * 1024) as usize, max_work_group_size: 1024,
max_local_memory_size: 64 * 1024, warp_size: 64, }),
_ => Ok(DeviceInfo {
compute_capability: "Unknown".to_string(),
memory_size: (8u64 * 1024 * 1024 * 1024) as usize, max_work_group_size: 256,
max_local_memory_size: 16 * 1024, warp_size: 32,
}),
}
}
}
/// Pre-built tuning spaces for common kernel classes.
pub mod presets {
    use super::*;

    /// Tuning space for dense matrix multiplication: square and skewed 2-D
    /// tiles, with shared-memory-leaning cache hints.
    pub fn matrix_multiply_space() -> TuningSpace {
        TuningSpace {
            work_group_sizes: vec![
                [16, 16, 1],
                [32, 32, 1],
                [8, 32, 1],
                [32, 8, 1],
                [64, 4, 1],
                [4, 64, 1],
                [128, 2, 1],
                [2, 128, 1],
            ],
            local_memory_sizes: vec![0, 2048, 4096, 8192, 16384],
            cache_configs: vec![CacheConfig::PreferShared, CacheConfig::Balanced],
            custom_spaces: HashMap::new(),
        }
    }

    /// Tuning space for convolution kernels: smaller tiles, non-zero local
    /// memory, and read-favoring cache hints.
    pub fn convolution_space() -> TuningSpace {
        TuningSpace {
            work_group_sizes: vec![
                [8, 8, 1],
                [16, 16, 1],
                [32, 8, 1],
                [8, 32, 1],
                [64, 1, 1],
                [32, 4, 1],
                [4, 32, 1],
            ],
            local_memory_sizes: vec![1024, 2048, 4096, 8192],
            cache_configs: vec![CacheConfig::PreferL1, CacheConfig::ReadOnly],
            custom_spaces: HashMap::new(),
        }
    }

    /// Tuning space for reduction kernels: mostly 1-D work-groups with
    /// shared memory for partial results.
    pub fn reduction_space() -> TuningSpace {
        TuningSpace {
            work_group_sizes: vec![
                [64, 1, 1],
                [128, 1, 1],
                [256, 1, 1],
                [512, 1, 1],
                [1024, 1, 1],
                [32, 2, 1],
                [16, 4, 1],
            ],
            local_memory_sizes: vec![512, 1024, 2048, 4096],
            cache_configs: vec![CacheConfig::PreferShared],
            custom_spaces: HashMap::new(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric variants convert between int and float; floats truncate.
    #[test]
    fn test_parameter_value_conversion() {
        let int_val = ParameterValue::Int(42);
        assert_eq!(int_val.as_int(), Some(42));
        assert_eq!(int_val.as_float(), Some(42.0));
        let float_val = ParameterValue::Float(3.5);
        assert_eq!(float_val.as_float(), Some(3.5));
        // `as_int` truncates toward zero.
        assert_eq!(float_val.as_int(), Some(3));
    }

    /// Default parameters use a 16x16 work-group with no local memory.
    #[test]
    fn test_kernel_parameters_default() {
        let params = KernelParameters::default();
        assert_eq!(params.work_group_size, [16, 16, 1]);
        assert_eq!(params.local_memory_size, 0);
    }

    /// Default strategy is grid search capped at 100 evaluations.
    #[test]
    fn test_tuning_strategy_default() {
        let strategy = TuningStrategy::default();
        assert_eq!(strategy.search_algorithm, SearchAlgorithm::GridSearch);
        assert_eq!(strategy.max_evaluations, 100);
    }

    /// The default tuning space is non-empty in every dimension we check.
    #[test]
    fn test_tuning_space_default() {
        let space = TuningSpace::default();
        assert!(!space.work_group_sizes.is_empty());
        assert!(!space.cache_configs.is_empty());
    }

    /// The matmul preset includes the classic 16x16 tile and shared-memory hint.
    #[test]
    fn testmatrix_multiply_preset() {
        let space = presets::matrix_multiply_space();
        assert!(space.work_group_sizes.contains(&[16, 16, 1]));
        assert!(space.cache_configs.contains(&CacheConfig::PreferShared));
    }

    /// Device detection returns sane (non-zero) limits for the CUDA backend.
    #[test]
    fn test_device_info_detection() {
        let device_info = AutoTuner::detect_device_info(GpuBackend::Cuda);
        assert!(device_info.is_ok());
        let info = device_info.expect("Operation failed");
        assert!(info.max_work_group_size > 0);
        assert!(info.max_local_memory_size > 0);
    }
}