use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Duration;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
pub kernel_name: String,
pub grid_size: (u32, u32, u32),
pub block_size: (u32, u32, u32),
pub shared_memory_bytes: usize,
pub registers_per_thread: u32,
pub occupancy: f64,
pub execution_time: Duration,
pub memory_bandwidth_gb_s: f64,
pub compute_utilization: f64,
pub stream_id: i32,
}
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
#[allow(dead_code)]
device_count: i32,
pub(crate) active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
memory_pools: HashMap<i32, GpuMemoryPool>,
}
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
#[allow(dead_code)]
device_id: i32,
total_memory: usize,
free_memory: usize,
fragmentation_score: f64,
}
impl GpuProfiler {
pub fn new() -> Result<Self> {
Ok(Self {
device_count: 1, active_streams: HashMap::new(),
memory_pools: HashMap::new(),
})
}
pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
self.active_streams
.entry(kernel_profile.stream_id)
.or_default()
.push(kernel_profile);
}
pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
if let Some(kernels) = self.active_streams.get(&device_id) {
if kernels.is_empty() {
0.0
} else {
kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
}
} else {
0.0
}
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
pub total_kernels: usize,
pub total_execution_time: Duration,
pub avg_occupancy: f64,
pub avg_compute_utilization: f64,
pub slowest_kernels: Vec<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gpu_profiler_new() {
let profiler = GpuProfiler::new();
assert!(profiler.is_ok());
}
#[test]
fn test_gpu_profiler_utilization_empty() {
let profiler = GpuProfiler::new().expect("should create profiler");
assert!((profiler.get_gpu_utilization(0) - 0.0).abs() < 1e-9);
}
#[test]
fn test_gpu_profiler_profile_kernel() {
let mut profiler = GpuProfiler::new().expect("should create profiler");
let kernel = GpuKernelProfile {
kernel_name: "matmul".to_string(),
grid_size: (128, 1, 1),
block_size: (256, 1, 1),
shared_memory_bytes: 4096,
registers_per_thread: 32,
occupancy: 0.85,
execution_time: Duration::from_micros(500),
memory_bandwidth_gb_s: 300.0,
compute_utilization: 0.9,
stream_id: 0,
};
profiler.profile_kernel(kernel);
let util = profiler.get_gpu_utilization(0);
assert!((util - 0.9).abs() < 1e-9);
}
#[test]
fn test_gpu_profiler_multiple_kernels_avg() {
let mut profiler = GpuProfiler::new().expect("should create profiler");
for util in [0.8, 0.6] {
profiler.profile_kernel(GpuKernelProfile {
kernel_name: "kern".to_string(),
grid_size: (1, 1, 1),
block_size: (1, 1, 1),
shared_memory_bytes: 0,
registers_per_thread: 0,
occupancy: 0.5,
execution_time: Duration::from_micros(100),
memory_bandwidth_gb_s: 0.0,
compute_utilization: util,
stream_id: 0,
});
}
let avg = profiler.get_gpu_utilization(0);
assert!((avg - 0.7).abs() < 1e-9);
}
}