use crate::{DType, Device, Result, TensorError};
use std::collections::HashMap;
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant};
#[cfg(feature = "serialize")]
use serde::{Deserialize, Serialize};
/// Measurements collected for one profiled GPU operation.
///
/// A value is created by `GpuPerformanceOptimizer::start_profiling`, filled in by
/// the `record_*` methods and finalized by `finish_profiling`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct GpuOpMetrics {
/// Name of the profiled operation; also the key under which history is stored.
pub operation_name: String,
/// Caller-supplied device identifier.
pub device_id: usize,
/// Shapes of the operation's inputs, as supplied by the caller.
pub input_shapes: Vec<Vec<usize>>,
/// Element type supplied by the caller.
pub dtype: DType,
/// Kernel execution time reported to `record_kernel_execution`.
pub kernel_time: Duration,
/// "h2d" transfer time reported to `record_memory_transfer`
/// (presumably host-to-device).
pub h2d_transfer_time: Duration,
/// "d2h" transfer time reported to `record_memory_transfer`
/// (presumably device-to-host).
pub d2h_transfer_time: Duration,
/// Wall-clock time between `start_profiling` and `finish_profiling`.
pub total_time: Duration,
/// Transfer bandwidth in GB/s (bytes / 1e9 / transfer seconds); `None` until a
/// transfer with non-zero duration has been recorded.
pub memory_bandwidth: Option<f64>,
/// `kernel_time / total_time` expressed as a percentage; `None` until profiling
/// finishes with a non-zero total time.
pub gpu_utilization: Option<f64>,
/// Set to the byte count passed to `record_memory_transfer`.
pub memory_usage: u64,
/// Estimated floating-point operations per second achieved by the kernel;
/// `None` when no estimate is available for the operation.
pub achieved_flops: Option<f64>,
/// Device peak in FLOP/s (`peak_compute_tflops * 1e12` when set by `start_profiling`).
pub peak_flops: Option<f64>,
/// Element count reported to `record_kernel_execution`.
pub elements_processed: usize,
/// Workgroup dimensions reported to `record_kernel_execution`.
pub workgroup_config: WorkgroupConfig,
}
/// Three-dimensional workgroup extent used when dispatching a kernel.
#[derive(Debug, Clone, Copy)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct WorkgroupConfig {
    /// Extent along the x dimension.
    pub x: u32,
    /// Extent along the y dimension.
    pub y: u32,
    /// Extent along the z dimension.
    pub z: u32,
}

impl Default for WorkgroupConfig {
    /// Returns the one-dimensional `256 x 1 x 1` configuration.
    fn default() -> Self {
        WorkgroupConfig { x: 256, y: 1, z: 1 }
    }
}
/// One finding produced by bottleneck analysis of a single operation's metrics.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct BottleneckAnalysis {
    /// Category of the detected bottleneck.
    pub bottleneck_type: BottleneckType,
    /// Severity score; larger means more severe.
    pub severity: f64,
    /// Human-readable summary of the finding.
    pub description: String,
    /// Suggested actions for addressing the bottleneck.
    pub recommendations: Vec<String>,
    /// Heuristic estimate of the achievable improvement.
    pub potential_improvement: f64,
}
/// Categories of performance bottleneck.
///
/// Only some variants are produced by `GpuPerformanceOptimizer::analyze_bottlenecks`
/// in this file; the rest are noted as such below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub enum BottleneckType {
/// Reported when measured bandwidth exceeds 80% of the device's bandwidth.
MemoryBandwidth,
/// Reported when achieved FLOP/s is below 30% of the device peak.
ComputeBound,
/// Not produced by the analysis in this file.
LatencyBound,
/// Reported when GPU utilization is below 50%.
SynchronizationOverhead,
/// Not produced by the analysis in this file.
InsufficientParallelism,
/// Reported when the workgroup size is smaller than, or not a multiple of,
/// the device's warp size.
SuboptimalWorkgroupSize,
/// Not produced by the analysis in this file.
MemoryCoalescingIssues,
/// Not produced by the analysis in this file.
RegisterPressure,
}
/// Static description of a GPU, used as the reference point for the heuristics
/// in this module.
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
    /// Display name of the device.
    pub device_name: String,
    /// Number of compute units.
    pub max_compute_units: u32,
    /// Upper bound on the workgroup size; caps the auto-tuning candidates.
    pub max_workgroup_size: u32,
    /// Memory bandwidth in GB/s; measured bandwidth is compared against it.
    pub memory_bandwidth_gb_s: f64,
    /// Peak compute throughput in TFLOP/s.
    pub peak_compute_tflops: f64,
    /// Device memory size in GB.
    pub memory_size_gb: f64,
    /// Warp size; workgroup sizes are checked for being multiples of it.
    pub warp_size: u32,
    /// Shared memory available per workgroup (presumably in bytes).
    pub shared_memory_per_workgroup: u32,
    /// Whether FP16 is supported.
    pub supports_fp16: bool,
    /// Whether INT8 is supported.
    pub supports_int8: bool,
}
/// Switches and limits that steer the optimizer.
///
/// Within this file only `enable_auto_tuning` is consulted (by
/// `auto_tune_workgroup`); the other fields are stored for use elsewhere.
#[derive(Debug, Clone)]
pub struct OptimizationConfig {
    /// Enables kernel fusion.
    pub enable_kernel_fusion: bool,
    /// Enables memory-coalescing optimization.
    pub enable_memory_coalescing: bool,
    /// Enables asynchronous execution.
    pub enable_async_execution: bool,
    /// Target memory utilization (the default is `0.8`).
    pub target_memory_utilization: f64,
    /// Minimum batch size.
    pub min_batch_size: usize,
    /// When `false`, `auto_tune_workgroup` returns the default configuration
    /// for operations that have no cached result.
    pub enable_auto_tuning: bool,
    /// Upper bound on tuning iterations.
    pub max_tuning_iterations: usize,
}

impl Default for OptimizationConfig {
    /// Enables every optimization, with a 0.8 memory-utilization target, a
    /// minimum batch size of 32 and at most 10 tuning iterations.
    fn default() -> Self {
        OptimizationConfig {
            enable_kernel_fusion: true,
            enable_memory_coalescing: true,
            enable_async_execution: true,
            enable_auto_tuning: true,
            target_memory_utilization: 0.8,
            min_batch_size: 32,
            max_tuning_iterations: 10,
        }
    }
}
/// Profiles GPU operations, keeps a per-operation history of their metrics and
/// derives bottleneck analyses, recommendations and workgroup configurations.
///
/// All mutable state sits behind locks, so every method takes `&self`.
pub struct GpuPerformanceOptimizer {
// Device handle; stored but not read by the methods in this file.
device: Arc<wgpu::Device>,
// Queue handle; stored but not read by the methods in this file.
queue: Arc<wgpu::Queue>,
// Reference figures (bandwidth, peak FLOP/s, warp size, ...) for the heuristics.
capabilities: GpuCapabilities,
// Optimization switches; `enable_auto_tuning` gates `auto_tune_workgroup`.
config: OptimizationConfig,
// Finished metrics, grouped by operation name in insertion order.
performance_history: RwLock<HashMap<String, Vec<GpuOpMetrics>>>,
// Cache of auto-tuned workgroup configurations, keyed by operation name.
optimal_configs: RwLock<HashMap<String, WorkgroupConfig>>,
// The profiling session currently in progress, if any (at most one).
active_profiling: Mutex<Option<ProfilingSession>>,
}
/// State of the profiling session that is currently in progress.
struct ProfilingSession {
// Name of the operation being profiled (also stored in `metrics.operation_name`).
operation_name: String,
// Instant captured by `start_profiling`; used to compute the total time.
start_time: Instant,
// Metrics accumulated so far for this session.
metrics: GpuOpMetrics,
}
impl GpuPerformanceOptimizer {
/// Creates an optimizer with empty history, an empty workgroup-config cache and
/// no active profiling session.
pub fn new(
    device: Arc<wgpu::Device>,
    queue: Arc<wgpu::Queue>,
    capabilities: GpuCapabilities,
    config: OptimizationConfig,
) -> Self {
    // Build the lock-protected state first so the struct literal stays compact.
    let performance_history = RwLock::new(HashMap::new());
    let optimal_configs = RwLock::new(HashMap::new());
    let active_profiling = Mutex::new(None);

    Self {
        device,
        queue,
        capabilities,
        config,
        performance_history,
        optimal_configs,
        active_profiling,
    }
}
/// Begins a profiling session for `operation_name`.
///
/// The session starts with zeroed timings and the device's peak FLOP/s
/// (`peak_compute_tflops * 1e12`). Any session that is already active is
/// replaced without being recorded.
///
/// # Panics
///
/// Panics if the profiling mutex is poisoned.
pub fn start_profiling(
    &self,
    operation_name: &str,
    device_id: usize,
    input_shapes: Vec<Vec<usize>>,
    dtype: DType,
) {
    let peak_flops = self.capabilities.peak_compute_tflops * 1e12;
    let initial_metrics = GpuOpMetrics {
        operation_name: String::from(operation_name),
        device_id,
        input_shapes,
        dtype,
        kernel_time: Duration::ZERO,
        h2d_transfer_time: Duration::ZERO,
        d2h_transfer_time: Duration::ZERO,
        total_time: Duration::ZERO,
        memory_bandwidth: None,
        gpu_utilization: None,
        memory_usage: 0,
        achieved_flops: None,
        peak_flops: Some(peak_flops),
        elements_processed: 0,
        workgroup_config: WorkgroupConfig::default(),
    };

    // The clock starts only after the metrics record has been built.
    let new_session = ProfilingSession {
        operation_name: String::from(operation_name),
        start_time: Instant::now(),
        metrics: initial_metrics,
    };

    let mut slot = self
        .active_profiling
        .lock()
        .expect("lock should not be poisoned");
    *slot = Some(new_session);
}
/// Records transfer timings and the transferred byte count on the active session.
///
/// Bandwidth is stored in GB/s as `bytes / (1e9 * (h2d + d2h seconds))` and is
/// only updated when the combined transfer time is non-zero. Does nothing when
/// no session is active.
///
/// # Panics
///
/// Panics if the profiling mutex is poisoned.
pub fn record_memory_transfer(
    &self,
    h2d_time: Duration,
    d2h_time: Duration,
    bytes_transferred: u64,
) {
    let mut guard = self
        .active_profiling
        .lock()
        .expect("lock should not be poisoned");
    let recorded = match guard.as_mut() {
        Some(session) => &mut session.metrics,
        None => return,
    };

    recorded.h2d_transfer_time = h2d_time;
    recorded.d2h_transfer_time = d2h_time;
    recorded.memory_usage = bytes_transferred;

    let transfer_secs = (h2d_time + d2h_time).as_secs_f64();
    if transfer_secs > 0.0 {
        recorded.memory_bandwidth = Some((bytes_transferred as f64) / (1e9 * transfer_secs));
    }
}
/// Records kernel timing, element count and workgroup configuration on the
/// active session, and derives the achieved FLOP/s.
///
/// `achieved_flops` is the estimated operation count divided by the kernel time
/// in seconds. It is `None` when no estimate exists for the operation or when
/// `kernel_time` is zero, so that a zero duration never stores an infinite or
/// NaN rate. Does nothing when no session is active.
///
/// # Panics
///
/// Panics if the profiling mutex is poisoned.
pub fn record_kernel_execution(
    &self,
    kernel_time: Duration,
    elements_processed: usize,
    workgroup_config: WorkgroupConfig,
) {
    let mut guard = self
        .active_profiling
        .lock()
        .expect("lock should not be poisoned");
    if let Some(session) = guard.as_mut() {
        session.metrics.kernel_time = kernel_time;
        session.metrics.elements_processed = elements_processed;
        session.metrics.workgroup_config = workgroup_config;

        let kernel_secs = kernel_time.as_secs_f64();
        // Mirror the zero-duration guard used for transfer bandwidth: a rate is
        // undefined when no time elapsed.
        session.metrics.achieved_flops = if kernel_secs > 0.0 {
            self.estimate_flops(&session.metrics.operation_name, elements_processed)
                .map(|flops| flops / kernel_secs)
        } else {
            None
        };
    }
}
/// Ends the active profiling session, appends its metrics to the history and
/// returns them.
///
/// `total_time` is the wall-clock time since `start_profiling`, and
/// `gpu_utilization` is `kernel_time / total_time` as a percentage (left unset
/// when the total time is zero). Returns `None` when no session is active.
///
/// # Panics
///
/// Panics if the profiling mutex or the history lock is poisoned.
pub fn finish_profiling(&self) -> Option<GpuOpMetrics> {
    // Take the session in its own statement so the mutex guard is released
    // before the history write lock is acquired (no nested locking).
    let session = self
        .active_profiling
        .lock()
        .expect("lock should not be poisoned")
        .take()?;

    let total_time = session.start_time.elapsed();
    let mut metrics = session.metrics;
    metrics.total_time = total_time;

    let total_secs = total_time.as_secs_f64();
    if total_secs > 0.0 {
        let compute_ratio = metrics.kernel_time.as_secs_f64() / total_secs;
        metrics.gpu_utilization = Some(compute_ratio * 100.0);
    }

    self.performance_history
        .write()
        .expect("write lock should not be poisoned")
        .entry(metrics.operation_name.clone())
        .or_default()
        .push(metrics.clone());

    Some(metrics)
}
/// Inspects one operation's metrics and returns every bottleneck it detects.
///
/// Checks performed, in order:
/// * memory bandwidth above 80% of the device bandwidth,
/// * achieved FLOP/s below 30% of the peak,
/// * GPU utilization below 50%,
/// * workgroup size smaller than, or not a multiple of, the warp size.
///
/// A ratio check is skipped when its reference value (device bandwidth, peak
/// FLOP/s or warp size) is not positive, so a degenerate capability description
/// cannot cause a division by zero or an "inf%" finding.
pub fn analyze_bottlenecks(&self, metrics: &GpuOpMetrics) -> Vec<BottleneckAnalysis> {
    let caps = &self.capabilities;
    let mut bottlenecks = Vec::new();

    if let Some(bandwidth) = metrics.memory_bandwidth {
        if caps.memory_bandwidth_gb_s > 0.0 {
            let bandwidth_utilization = bandwidth / caps.memory_bandwidth_gb_s;
            if bandwidth_utilization > 0.8 {
                bottlenecks.push(BottleneckAnalysis {
                    bottleneck_type: BottleneckType::MemoryBandwidth,
                    severity: bandwidth_utilization.min(1.0),
                    description: format!(
                        "Memory bandwidth utilization: {:.1}%",
                        bandwidth_utilization * 100.0
                    ),
                    recommendations: vec![
                        "Enable memory coalescing optimization".to_string(),
                        "Consider kernel fusion to reduce memory traffic".to_string(),
                        "Use async execution to overlap computation and memory transfers"
                            .to_string(),
                    ],
                    potential_improvement: (bandwidth_utilization - 0.8) * 0.5,
                });
            }
        }
    }

    if let (Some(achieved_flops), Some(peak_flops)) =
        (metrics.achieved_flops, metrics.peak_flops)
    {
        if peak_flops > 0.0 {
            let compute_utilization = achieved_flops / peak_flops;
            if compute_utilization < 0.3 {
                bottlenecks.push(BottleneckAnalysis {
                    bottleneck_type: BottleneckType::ComputeBound,
                    severity: 1.0 - compute_utilization,
                    description: format!(
                        "Low compute utilization: {:.1}%",
                        compute_utilization * 100.0
                    ),
                    recommendations: vec![
                        "Optimize workgroup size for better occupancy".to_string(),
                        "Consider data layout optimization".to_string(),
                        "Use vectorized operations where possible".to_string(),
                    ],
                    potential_improvement: (0.3 - compute_utilization) * 2.0,
                });
            }
        }
    }

    if let Some(gpu_util) = metrics.gpu_utilization {
        if gpu_util < 50.0 {
            bottlenecks.push(BottleneckAnalysis {
                bottleneck_type: BottleneckType::SynchronizationOverhead,
                severity: (50.0 - gpu_util) / 50.0,
                description: format!("Low GPU utilization: {:.1}%", gpu_util),
                recommendations: vec![
                    "Reduce synchronization points".to_string(),
                    "Use async execution patterns".to_string(),
                    "Batch operations to amortize overhead".to_string(),
                ],
                potential_improvement: (50.0 - gpu_util) / 100.0,
            });
        }
    }

    // Widen to u64 and saturate so extreme dimensions cannot overflow.
    let dims = metrics.workgroup_config;
    let workgroup_size = u64::from(dims.x)
        .saturating_mul(u64::from(dims.y))
        .saturating_mul(u64::from(dims.z));
    let warp_size = u64::from(caps.warp_size);
    // A zero warp size would make the remainder below panic; skip the check.
    if warp_size > 0 && (workgroup_size < warp_size || workgroup_size % warp_size != 0) {
        bottlenecks.push(BottleneckAnalysis {
            bottleneck_type: BottleneckType::SuboptimalWorkgroupSize,
            severity: 0.7,
            description: format!("Suboptimal workgroup size: {}", workgroup_size),
            recommendations: vec![
                format!(
                    "Use workgroup sizes that are multiples of {}",
                    caps.warp_size
                ),
                "Consider larger workgroup sizes for better occupancy".to_string(),
            ],
            potential_improvement: 0.2,
        });
    }

    bottlenecks
}
/// Returns textual optimization hints for `operation_name`.
///
/// When at least three runs are recorded, the three most recent are averaged:
/// mean bandwidth above 70% of the device bandwidth and mean GPU utilization
/// below 60% each add a hint. Each mean is taken over the runs that actually
/// carry the measurement; if none do, that check is skipped, so a missing value
/// is never treated as 0. Name-based hints (substring match on "matmul",
/// "conv", "add"/"mul") are appended afterwards.
///
/// # Panics
///
/// Panics if the history lock is poisoned.
pub fn get_optimization_recommendations(&self, operation_name: &str) -> Vec<String> {
    /// Arithmetic mean of `values`, or `None` when the iterator is empty.
    fn mean(values: impl Iterator<Item = f64>) -> Option<f64> {
        let (sum, count) = values.fold((0.0_f64, 0_usize), |(s, n), v| (s + v, n + 1));
        if count == 0 {
            None
        } else {
            Some(sum / count as f64)
        }
    }

    let mut recommendations = Vec::new();
    let history = self
        .performance_history
        .read()
        .expect("read lock should not be poisoned");

    if let Some(metrics_history) = history.get(operation_name) {
        if metrics_history.len() >= 3 {
            let recent_metrics = &metrics_history[metrics_history.len() - 3..];

            if let Some(avg_bandwidth) =
                mean(recent_metrics.iter().filter_map(|m| m.memory_bandwidth))
            {
                if avg_bandwidth / self.capabilities.memory_bandwidth_gb_s > 0.7 {
                    recommendations.push(
                        "Operation is memory bandwidth bound - consider kernel fusion".to_string(),
                    );
                }
            }

            if let Some(avg_gpu_util) =
                mean(recent_metrics.iter().filter_map(|m| m.gpu_utilization))
            {
                if avg_gpu_util < 60.0 {
                    recommendations.push(
                        "Low GPU utilization - consider async execution or batching".to_string(),
                    );
                }
            }
        }
    }

    // The first matching arm wins, so "matmul" names do not fall through to the
    // "mul" arm.
    match operation_name {
        name if name.contains("matmul") => {
            recommendations
                .push("Consider tiled matrix multiplication for large matrices".to_string());
            recommendations
                .push("Use tensor cores if available for mixed precision".to_string());
        }
        name if name.contains("conv") => {
            recommendations.push(
                "Consider Winograd or FFT convolution for appropriate filter sizes".to_string(),
            );
            recommendations
                .push("Use depthwise separable convolutions for mobile efficiency".to_string());
        }
        name if name.contains("add") || name.contains("mul") => {
            recommendations.push(
                "Consider fusing with following operations (e.g., activation functions)"
                    .to_string(),
            );
        }
        _ => {}
    }

    recommendations
}
/// Returns a workgroup configuration for `operation_name`.
///
/// A cached configuration is returned when present. Otherwise, if auto-tuning
/// is disabled, the default configuration is returned without caching. If it
/// is enabled, every candidate is scored with the performance heuristic, the
/// best one is cached under the operation name and returned.
///
/// Note that the cache key is the operation name only, so the `tensor_size` of
/// the first call determines the cached result.
///
/// # Panics
///
/// Panics if the configuration lock is poisoned.
pub fn auto_tune_workgroup(&self, operation_name: &str, tensor_size: usize) -> WorkgroupConfig {
    {
        // Scoped so the read guard is gone before the write lock is requested.
        let cache = self
            .optimal_configs
            .read()
            .expect("read lock should not be poisoned");
        if let Some(cached) = cache.get(operation_name) {
            return *cached;
        }
    }

    if !self.config.enable_auto_tuning {
        return WorkgroupConfig::default();
    }

    // Start from the default at score 0.0; only a strictly better score
    // replaces the current winner, so the earliest candidate wins ties.
    let (best_config, _best_score) = self
        .generate_workgroup_candidates(tensor_size)
        .into_iter()
        .fold(
            (WorkgroupConfig::default(), 0.0),
            |(winner, winner_score), candidate| {
                let score = self.estimate_workgroup_performance(candidate, tensor_size);
                if score > winner_score {
                    (candidate, score)
                } else {
                    (winner, winner_score)
                }
            },
        );

    self.optimal_configs
        .write()
        .expect("write lock should not be poisoned")
        .insert(operation_name.to_string(), best_config);
    best_config
}
/// Lists the workgroup configurations considered during auto-tuning.
///
/// One-dimensional sizes 64, 128, 256 and 512 come first, followed (only for
/// tensors with more than 10000 elements) by the two-dimensional tiles 16x8,
/// 16x16, 32x8 and 32x16. Entries larger than the device's maximum workgroup
/// size (itself capped at 1024) are left out.
fn generate_workgroup_candidates(&self, tensor_size: usize) -> Vec<WorkgroupConfig> {
    let size_limit = self.capabilities.max_workgroup_size.min(1024);

    let mut candidates: Vec<WorkgroupConfig> = [64u32, 128, 256, 512]
        .iter()
        .copied()
        .filter(|&width| width <= size_limit)
        .map(|width| WorkgroupConfig {
            x: width,
            y: 1,
            z: 1,
        })
        .collect();

    if tensor_size > 10000 {
        // Same order as nested loops over x in [16, 32] and y in [8, 16].
        let tiles = [(16u32, 8u32), (16, 16), (32, 8), (32, 16)];
        candidates.extend(
            tiles
                .iter()
                .filter(|&&(x, y)| x * y <= size_limit)
                .map(|&(x, y)| WorkgroupConfig { x, y, z: 1 }),
        );
    }

    candidates
}
/// Heuristic score for running `tensor_size` elements with `config`; higher is
/// better.
///
/// The score is `workgroup_size * warp_efficiency * occupancy`, where
/// `warp_efficiency` is 1.0 when the workgroup size is a multiple of the warp
/// size and 0.8 otherwise, and `occupancy` is `tensor_size / workgroup_size`
/// capped at 1.0.
///
/// A zero warp size is treated as "not aligned" rather than panicking on the
/// remainder, the size product is widened and saturated so it cannot overflow,
/// and an empty workgroup scores 0.0.
fn estimate_workgroup_performance(&self, config: WorkgroupConfig, tensor_size: usize) -> f64 {
    let workgroup_size = u64::from(config.x)
        .saturating_mul(u64::from(config.y))
        .saturating_mul(u64::from(config.z));
    if workgroup_size == 0 {
        return 0.0;
    }

    // `checked_rem` yields `None` for a zero divisor instead of panicking.
    let warp_size = u64::from(self.capabilities.warp_size);
    let warp_efficiency = if workgroup_size.checked_rem(warp_size) == Some(0) {
        1.0
    } else {
        0.8
    };

    let occupancy = (tensor_size as f64 / workgroup_size as f64).min(1.0);
    workgroup_size as f64 * warp_efficiency * occupancy
}
/// Roughly estimates the floating-point operation count of an operation.
///
/// The operation is classified by substring match on its name and charged a
/// fixed cost per element; the result is `elements * cost`. Returns `None` for
/// names that match no known category.
///
/// These are coarse per-element figures (for example 2 per element for matmul),
/// not exact operation counts.
fn estimate_flops(&self, operation_name: &str, elements: usize) -> Option<f64> {
    let flops_per_element = match operation_name {
        // Must precede the "mul" arm: "matmul" contains "mul" and would
        // otherwise never reach its own arm.
        name if name.contains("matmul") => 2.0,
        name if name.contains("add") || name.contains("sub") => 1.0,
        name if name.contains("mul") || name.contains("div") => 1.0,
        name if name.contains("conv") => 5.0,
        name if name.contains("relu") => 1.0,
        name if name.contains("sigmoid") || name.contains("tanh") => 8.0,
        name if name.contains("gelu") => 15.0,
        _ => return None,
    };
    // The element count is applied exactly once, here.
    Some(elements as f64 * flops_per_element)
}
/// Builds an aggregate report over every recorded operation.
///
/// The report contains the operation count, the summed total time, mean GPU
/// utilization and mean memory bandwidth (each averaged over the runs that
/// actually carry that measurement, 0.0 when none do), a per-type bottleneck
/// count and the global recommendations.
///
/// The history read lock is released before the global recommendations are
/// generated, because that step takes the same lock itself and re-entrant
/// reads on `std::sync::RwLock` may deadlock or panic. The recommendations can
/// therefore reflect runs recorded after the aggregates were computed.
///
/// # Panics
///
/// Panics if the history lock is poisoned.
pub fn generate_performance_report(&self) -> PerformanceReport {
    let mut total_operations = 0;
    let mut total_time = Duration::ZERO;
    let mut gpu_util_sum = 0.0;
    let mut gpu_util_samples = 0_usize;
    let mut bandwidth_sum = 0.0;
    let mut bandwidth_samples = 0_usize;
    let mut bottleneck_summary = HashMap::new();

    {
        let history = self
            .performance_history
            .read()
            .expect("read lock should not be poisoned");
        for metrics in history.values().flatten() {
            total_operations += 1;
            total_time += metrics.total_time;
            if let Some(gpu_util) = metrics.gpu_utilization {
                gpu_util_sum += gpu_util;
                gpu_util_samples += 1;
            }
            if let Some(bandwidth) = metrics.memory_bandwidth {
                bandwidth_sum += bandwidth;
                bandwidth_samples += 1;
            }
            for bottleneck in self.analyze_bottlenecks(metrics) {
                *bottleneck_summary
                    .entry(bottleneck.bottleneck_type)
                    .or_insert(0) += 1;
            }
        }
    } // read guard dropped here

    // Average over the samples that exist; an absent measurement is not a zero.
    let mean = |sum: f64, samples: usize| {
        if samples > 0 {
            sum / samples as f64
        } else {
            0.0
        }
    };

    PerformanceReport {
        total_operations,
        total_time,
        avg_gpu_utilization: mean(gpu_util_sum, gpu_util_samples),
        avg_memory_bandwidth: mean(bandwidth_sum, bandwidth_samples),
        peak_memory_bandwidth: self.capabilities.memory_bandwidth_gb_s,
        bottleneck_summary,
        recommendations: self.generate_global_recommendations(),
    }
}
/// Derives recommendations from the whole recorded history.
///
/// Suggests async execution when more than 30% of runs have GPU utilization
/// below 50% (runs without a value count as 100%), and kernel fusion when more
/// than 40% of runs use over 70% of the device bandwidth (runs without a value
/// count as 0).
///
/// # Panics
///
/// Panics if the history lock is poisoned.
fn generate_global_recommendations(&self) -> Vec<String> {
    let history = self
        .performance_history
        .read()
        .expect("read lock should not be poisoned");
    let device_bandwidth = self.capabilities.memory_bandwidth_gb_s;

    // Single pass: count all runs plus the two categories of interest.
    let mut total_ops = 0_usize;
    let mut low_util_ops = 0_usize;
    let mut memory_bound_ops = 0_usize;
    for run in history.values().flatten() {
        total_ops += 1;
        if run.gpu_utilization.unwrap_or(100.0) < 50.0 {
            low_util_ops += 1;
        }
        if run.memory_bandwidth.unwrap_or(0.0) / device_bandwidth > 0.7 {
            memory_bound_ops += 1;
        }
    }

    let mut recommendations = Vec::new();
    if total_ops == 0 {
        return recommendations;
    }

    let share = |count: usize| count as f64 / total_ops as f64;
    if share(low_util_ops) > 0.3 {
        recommendations
            .push("Consider enabling async execution to improve GPU utilization".to_string());
    }
    if share(memory_bound_ops) > 0.4 {
        recommendations.push(
            "Many operations are memory bandwidth bound - enable kernel fusion".to_string(),
        );
    }
    recommendations
}
}
/// Aggregate summary produced by `GpuPerformanceOptimizer::generate_performance_report`.
#[derive(Debug)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct PerformanceReport {
/// Number of recorded operation runs across all operation names.
pub total_operations: usize,
/// Sum of the `total_time` of every recorded run.
pub total_time: Duration,
/// Average GPU utilization percentage over the recorded runs.
pub avg_gpu_utilization: f64,
/// Average measured memory bandwidth (GB/s) over the recorded runs.
pub avg_memory_bandwidth: f64,
/// The device's memory bandwidth (GB/s), copied from the capabilities.
pub peak_memory_bandwidth: f64,
/// How many times each bottleneck type was detected across all runs.
pub bottleneck_summary: HashMap<BottleneckType, usize>,
/// History-wide recommendations.
pub recommendations: Vec<String>,
}
/// Returns a capability description for `device`.
///
/// NOTE(review): the device is not queried. Every value is a fixed placeholder
/// ("Generic GPU", 32 compute units, workgroup size 1024, 500 GB/s, 10 TFLOP/s,
/// 8 GB, warp size 32, 48 KiB shared memory, FP16 and INT8 supported).
pub fn detect_gpu_capabilities(device: &wgpu::Device) -> GpuCapabilities {
    // Accepted for API shape only; nothing is read from the handle.
    let _ = device;

    let device_name = String::from("Generic GPU");
    let shared_memory_per_workgroup = 48 * 1024;

    GpuCapabilities {
        device_name,
        max_compute_units: 32,
        max_workgroup_size: 1024,
        memory_bandwidth_gb_s: 500.0,
        peak_compute_tflops: 10.0,
        memory_size_gb: 8.0,
        warp_size: 32,
        shared_memory_per_workgroup,
        supports_fp16: true,
        supports_int8: true,
    }
}
// Unit tests. They avoid constructing a `GpuPerformanceOptimizer`, which would
// need real `wgpu` device and queue handles.
#[cfg(test)]
mod tests {
use super::*;
// The default workgroup configuration is 256 x 1 x 1.
#[test]
fn test_workgroup_config_default() {
let config = WorkgroupConfig::default();
assert_eq!(config.x, 256);
assert_eq!(config.y, 1);
assert_eq!(config.z, 1);
}
// Checks that 45% GPU utilization is flagged as synchronization overhead.
//
// NOTE(review): this exercises a local copy of the analysis rather than
// `GpuPerformanceOptimizer::analyze_bottlenecks`. The copy omits the compute and
// workgroup-size checks and uses a different severity formula for low
// utilization, so it can drift from production behavior.
#[test]
fn test_bottleneck_analysis() {
let metrics = GpuOpMetrics {
operation_name: "test_op".to_string(),
device_id: 0,
input_shapes: vec![vec![1024, 1024]],
dtype: DType::Float32,
kernel_time: Duration::from_millis(10),
h2d_transfer_time: Duration::from_millis(5),
d2h_transfer_time: Duration::from_millis(5),
total_time: Duration::from_millis(20),
// 400 of 500 GB/s is exactly 80%, which does not exceed the 0.8 threshold;
// 45% utilization is below the 50% threshold.
memory_bandwidth: Some(400.0), gpu_utilization: Some(45.0), memory_usage: 4 * 1024 * 1024,
achieved_flops: Some(1e9),
peak_flops: Some(10e12),
elements_processed: 1024 * 1024,
workgroup_config: WorkgroupConfig::default(),
};
let capabilities = GpuCapabilities {
device_name: "Test GPU".to_string(),
max_compute_units: 32,
max_workgroup_size: 1024,
memory_bandwidth_gb_s: 500.0,
peak_compute_tflops: 10.0,
memory_size_gb: 8.0,
warp_size: 32,
shared_memory_per_workgroup: 48 * 1024,
supports_fp16: true,
supports_int8: true,
};
// Stand-alone mirror of the bandwidth and utilization checks, parameterized
// on the capabilities instead of an optimizer instance.
fn analyze_bottlenecks_logic(
capabilities: &GpuCapabilities,
metrics: &GpuOpMetrics,
) -> Vec<BottleneckAnalysis> {
let mut bottlenecks = Vec::new();
if let Some(bandwidth) = metrics.memory_bandwidth {
let bandwidth_utilization = bandwidth / capabilities.memory_bandwidth_gb_s;
if bandwidth_utilization > 0.8 {
bottlenecks.push(BottleneckAnalysis {
bottleneck_type: BottleneckType::MemoryBandwidth,
severity: bandwidth_utilization.min(1.0),
description: format!(
"Memory bandwidth utilization: {:.1}%",
bandwidth_utilization * 100.0
),
recommendations: vec![
"Enable memory coalescing optimization".to_string(),
"Consider kernel fusion to reduce memory traffic".to_string(),
"Use async execution to overlap computation and memory transfers"
.to_string(),
],
potential_improvement: (bandwidth_utilization - 0.8) * 0.5,
});
}
}
if let Some(utilization) = metrics.gpu_utilization {
if utilization < 50.0 {
bottlenecks.push(BottleneckAnalysis {
bottleneck_type: BottleneckType::SynchronizationOverhead,
// Differs from the production formula, which is (50 - util) / 50.
severity: 1.0 - (utilization / 100.0),
description: format!("Low GPU utilization: {:.1}%", utilization),
recommendations: vec![
"Increase batch size to improve GPU utilization".to_string(),
"Use async execution to reduce synchronization overhead".to_string(),
],
potential_improvement: (50.0 - utilization) / 100.0,
});
}
}
bottlenecks
}
let bottlenecks = analyze_bottlenecks_logic(&capabilities, &metrics);
// Only the presence of the low-utilization finding is asserted.
assert!(bottlenecks
.iter()
.any(|b| b.bottleneck_type == BottleneckType::SynchronizationOverhead));
}
}