trustformers_debug/profiler/
gpu.rs1use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::time::Duration;
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct GpuKernelProfile {
11 pub kernel_name: String,
12 pub grid_size: (u32, u32, u32),
13 pub block_size: (u32, u32, u32),
14 pub shared_memory_bytes: usize,
15 pub registers_per_thread: u32,
16 pub occupancy: f64,
17 pub execution_time: Duration,
18 pub memory_bandwidth_gb_s: f64,
19 pub compute_utilization: f64,
20 pub stream_id: i32,
21}
22
23#[derive(Debug)]
25#[allow(dead_code)]
26pub struct GpuProfiler {
27 #[allow(dead_code)]
28 device_count: i32,
29 pub(crate) active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
30 memory_pools: HashMap<i32, GpuMemoryPool>,
31}
32
33#[allow(dead_code)]
34#[derive(Debug)]
35pub struct GpuMemoryPool {
36 #[allow(dead_code)]
37 device_id: i32,
38 total_memory: usize,
39 free_memory: usize,
40 fragmentation_score: f64,
41}
42
43impl GpuProfiler {
44 pub fn new() -> Result<Self> {
45 Ok(Self {
47 device_count: 1, active_streams: HashMap::new(),
49 memory_pools: HashMap::new(),
50 })
51 }
52
53 pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
54 self.active_streams
55 .entry(kernel_profile.stream_id)
56 .or_default()
57 .push(kernel_profile);
58 }
59
60 pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
61 if let Some(kernels) = self.active_streams.get(&device_id) {
63 if kernels.is_empty() {
64 0.0
65 } else {
66 kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
67 }
68 } else {
69 0.0
70 }
71 }
72}
73
74#[derive(Debug, Serialize, Deserialize)]
75pub struct GpuKernelSummary {
76 pub total_kernels: usize,
77 pub total_execution_time: Duration,
78 pub avg_occupancy: f64,
79 pub avg_compute_utilization: f64,
80 pub slowest_kernels: Vec<String>,
81}
82
83#[cfg(test)]
84mod tests {
85 use super::*;
86
87 #[test]
88 fn test_gpu_profiler_new() {
89 let profiler = GpuProfiler::new();
90 assert!(profiler.is_ok());
91 }
92
93 #[test]
94 fn test_gpu_profiler_utilization_empty() {
95 let profiler = GpuProfiler::new().expect("should create profiler");
96 assert!((profiler.get_gpu_utilization(0) - 0.0).abs() < 1e-9);
97 }
98
99 #[test]
100 fn test_gpu_profiler_profile_kernel() {
101 let mut profiler = GpuProfiler::new().expect("should create profiler");
102 let kernel = GpuKernelProfile {
103 kernel_name: "matmul".to_string(),
104 grid_size: (128, 1, 1),
105 block_size: (256, 1, 1),
106 shared_memory_bytes: 4096,
107 registers_per_thread: 32,
108 occupancy: 0.85,
109 execution_time: Duration::from_micros(500),
110 memory_bandwidth_gb_s: 300.0,
111 compute_utilization: 0.9,
112 stream_id: 0,
113 };
114 profiler.profile_kernel(kernel);
115 let util = profiler.get_gpu_utilization(0);
116 assert!((util - 0.9).abs() < 1e-9);
117 }
118
119 #[test]
120 fn test_gpu_profiler_multiple_kernels_avg() {
121 let mut profiler = GpuProfiler::new().expect("should create profiler");
122 for util in [0.8, 0.6] {
123 profiler.profile_kernel(GpuKernelProfile {
124 kernel_name: "kern".to_string(),
125 grid_size: (1, 1, 1),
126 block_size: (1, 1, 1),
127 shared_memory_bytes: 0,
128 registers_per_thread: 0,
129 occupancy: 0.5,
130 execution_time: Duration::from_micros(100),
131 memory_bandwidth_gb_s: 0.0,
132 compute_utilization: util,
133 stream_id: 0,
134 });
135 }
136 let avg = profiler.get_gpu_utilization(0);
137 assert!((avg - 0.7).abs() < 1e-9);
138 }
139}