1use std::collections::HashMap;
7use std::sync::{Arc, Mutex};
8use std::time::{Duration, Instant};
9
/// One sample of occupancy-related metrics for a single kernel dispatch.
///
/// Samples are stored by `PerformanceMonitor::record_kernel_occupancy`, grouped by
/// `kernel_name`, and averaged by `PerformanceMonitor::generate_occupancy_report`.
#[derive(Debug, Clone)]
pub struct KernelOccupancyStats {
    /// Kernel identifier; used as the grouping key when the sample is recorded.
    pub kernel_name: String,
    /// Stored with the sample; not read by any code in this file.
    /// NOTE(review): presumably threads per workgroup — confirm with the producer.
    pub workgroup_size: u32,
    /// Stored with the sample; not read by any code in this file.
    /// NOTE(review): presumably the number of workgroups launched — confirm with the producer.
    pub workgroups_dispatched: u32,
    /// Stored with the sample; not read by any code in this file.
    /// NOTE(review): presumably on the same scale as `achieved_occupancy` — confirm.
    pub theoretical_occupancy: f32,
    /// Averaged per kernel; the occupancy report prints it with a `%` suffix and warns
    /// when the average is below `50.0`.
    pub achieved_occupancy: f32,
    /// Averaged per kernel; the occupancy report prints it with a `%` suffix and warns
    /// when the average is below `70.0`.
    pub efficiency_ratio: f32,
    /// Averaged per kernel; the occupancy report prints it with a `%` suffix and warns
    /// when the average is below `60.0`.
    pub memory_bandwidth_utilization: f32,
    /// Averaged per kernel and printed without a unit in the occupancy report.
    pub arithmetic_intensity: f32,
}
22
/// Collects operation timings, allocation counters, and kernel occupancy samples.
///
/// All state lives in a single `PerformanceMonitorInner` behind `Arc<Mutex<..>>`, which is
/// why every method takes `&self` and locks internally.
#[derive(Debug)]
pub struct PerformanceMonitor {
    /// Mutex-guarded state; locked for the duration of each method call.
    inner: Arc<Mutex<PerformanceMonitorInner>>,
}
28
/// Mutable state of a `PerformanceMonitor`; only ever accessed while its mutex is held.
#[derive(Debug)]
struct PerformanceMonitorInner {
    /// Every duration recorded for an operation label, in recording order.
    operation_timings: HashMap<String, Vec<Duration>>,
    /// Size passed to the most recent `record_allocation` call for each label (a later
    /// call with the same label overwrites the entry). Written and cleared only; nothing
    /// in this file reads it back.
    memory_usage: HashMap<String, usize>,
    /// Number of `record_allocation` calls since creation or the last `clear`.
    total_allocations: usize,
    /// Number of `record_deallocation` calls since creation or the last `clear`.
    total_deallocations: usize,
    /// Highest value `current_memory` has reached since creation or the last `clear`.
    peak_memory: usize,
    /// Sum of recorded allocation sizes minus recorded deallocation sizes (floored at zero).
    current_memory: usize,
    /// Every occupancy sample recorded for a kernel name, in recording order.
    kernel_occupancy: HashMap<String, Vec<KernelOccupancyStats>>,
}
39
40impl Default for PerformanceMonitor {
41 fn default() -> Self {
42 Self::new()
43 }
44}
45
46impl PerformanceMonitor {
47 pub fn new() -> Self {
49 Self {
50 inner: Arc::new(Mutex::new(PerformanceMonitorInner {
51 operation_timings: HashMap::new(),
52 memory_usage: HashMap::new(),
53 total_allocations: 0,
54 total_deallocations: 0,
55 peak_memory: 0,
56 current_memory: 0,
57 kernel_occupancy: HashMap::new(),
58 })),
59 }
60 }
61
62 pub fn record_operation_time(&self, operation: &str, duration: Duration) {
64 if let Ok(mut inner) = self.inner.lock() {
65 inner
66 .operation_timings
67 .entry(operation.to_string())
68 .or_default()
69 .push(duration);
70 }
71 }
72
73 pub fn record_allocation(&self, operation: &str, size: usize) {
75 if let Ok(mut inner) = self.inner.lock() {
76 inner.memory_usage.insert(operation.to_string(), size);
77 inner.total_allocations += 1;
78 inner.current_memory += size;
79 if inner.current_memory > inner.peak_memory {
80 inner.peak_memory = inner.current_memory;
81 }
82 }
83 }
84
85 pub fn record_deallocation(&self, size: usize) {
87 if let Ok(mut inner) = self.inner.lock() {
88 inner.total_deallocations += 1;
89 inner.current_memory = inner.current_memory.saturating_sub(size);
90 }
91 }
92
93 pub fn get_average_time(&self, operation: &str) -> Option<Duration> {
95 if let Ok(inner) = self.inner.lock() {
96 if let Some(times) = inner.operation_timings.get(operation) {
97 if !times.is_empty() {
98 let total: Duration = times.iter().sum();
99 return Some(total / times.len() as u32);
100 }
101 }
102 }
103 None
104 }
105
106 pub fn get_all_operation_times(&self) -> HashMap<String, Vec<Duration>> {
108 if let Ok(inner) = self.inner.lock() {
109 inner.operation_timings.clone()
110 } else {
111 HashMap::new()
112 }
113 }
114
115 pub fn get_current_memory(&self) -> usize {
117 if let Ok(inner) = self.inner.lock() {
118 inner.current_memory
119 } else {
120 0
121 }
122 }
123
124 pub fn get_peak_memory(&self) -> usize {
126 if let Ok(inner) = self.inner.lock() {
127 inner.peak_memory
128 } else {
129 0
130 }
131 }
132
133 pub fn get_allocation_stats(&self) -> (usize, usize) {
135 if let Ok(inner) = self.inner.lock() {
136 (inner.total_allocations, inner.total_deallocations)
137 } else {
138 (0, 0)
139 }
140 }
141
142 pub fn generate_report(&self) -> String {
144 if let Ok(inner) = self.inner.lock() {
145 let mut report = String::new();
146 report.push_str("=== Performance Monitor Report ===\n\n");
147
148 report.push_str("Memory Statistics:\n");
149 report.push_str(&format!(
150 " Current Memory: {} bytes\n",
151 inner.current_memory
152 ));
153 report.push_str(&format!(" Peak Memory: {} bytes\n", inner.peak_memory));
154 report.push_str(&format!(
155 " Total Allocations: {}\n",
156 inner.total_allocations
157 ));
158 report.push_str(&format!(
159 " Total Deallocations: {}\n",
160 inner.total_deallocations
161 ));
162 report.push('\n');
163
164 report.push_str("Operation Timings:\n");
165 for (operation, times) in &inner.operation_timings {
166 if !times.is_empty() {
167 let total: Duration = times.iter().sum();
168 let avg = total / times.len() as u32;
169 let min = times.iter().min().copied().unwrap_or_default();
170 let max = times.iter().max().copied().unwrap_or_default();
171
172 report.push_str(&format!(" {operation}:\n"));
173 report.push_str(&format!(" Count: {}\n", times.len()));
174 report.push_str(&format!(" Average: {avg:?}\n"));
175 report.push_str(&format!(" Min: {min:?}\n"));
176 report.push_str(&format!(" Max: {max:?}\n"));
177 report.push_str(&format!(" Total: {total:?}\n"));
178 }
179 }
180
181 report
182 } else {
183 "Failed to generate report".to_string()
184 }
185 }
186
187 pub fn record_kernel_occupancy(&self, stats: KernelOccupancyStats) {
189 if let Ok(mut inner) = self.inner.lock() {
190 inner
191 .kernel_occupancy
192 .entry(stats.kernel_name.clone())
193 .or_default()
194 .push(stats);
195 }
196 }
197
198 pub fn get_kernel_occupancy(&self, kernel_name: &str) -> Vec<KernelOccupancyStats> {
200 if let Ok(inner) = self.inner.lock() {
201 inner
202 .kernel_occupancy
203 .get(kernel_name)
204 .cloned()
205 .unwrap_or_default()
206 } else {
207 Vec::new()
208 }
209 }
210
211 pub fn get_all_kernel_occupancy(&self) -> HashMap<String, Vec<KernelOccupancyStats>> {
213 if let Ok(inner) = self.inner.lock() {
214 inner.kernel_occupancy.clone()
215 } else {
216 HashMap::new()
217 }
218 }
219
220 pub fn get_average_kernel_occupancy(&self, kernel_name: &str) -> Option<f32> {
222 if let Ok(inner) = self.inner.lock() {
223 if let Some(stats) = inner.kernel_occupancy.get(kernel_name) {
224 if !stats.is_empty() {
225 let total: f32 = stats.iter().map(|s| s.achieved_occupancy).sum();
226 return Some(total / stats.len() as f32);
227 }
228 }
229 }
230 None
231 }
232
233 pub fn generate_occupancy_report(&self) -> String {
235 if let Ok(inner) = self.inner.lock() {
236 let mut report = String::new();
237 report.push_str("=== Kernel Occupancy Analysis ===\n\n");
238
239 for (kernel_name, stats_vec) in &inner.kernel_occupancy {
240 if !stats_vec.is_empty() {
241 let avg_occupancy: f32 =
242 stats_vec.iter().map(|s| s.achieved_occupancy).sum::<f32>()
243 / stats_vec.len() as f32;
244 let avg_efficiency: f32 =
245 stats_vec.iter().map(|s| s.efficiency_ratio).sum::<f32>()
246 / stats_vec.len() as f32;
247 let avg_bandwidth: f32 = stats_vec
248 .iter()
249 .map(|s| s.memory_bandwidth_utilization)
250 .sum::<f32>()
251 / stats_vec.len() as f32;
252 let avg_intensity: f32 = stats_vec
253 .iter()
254 .map(|s| s.arithmetic_intensity)
255 .sum::<f32>()
256 / stats_vec.len() as f32;
257
258 report.push_str(&format!("Kernel: {kernel_name}\n"));
259 report.push_str(&format!(" Invocations: {}\n", stats_vec.len()));
260 report.push_str(&format!(" Average Occupancy: {avg_occupancy:.2}%\n"));
261 report.push_str(&format!(" Average Efficiency: {avg_efficiency:.2}%\n"));
262 report.push_str(&format!(
263 " Average Bandwidth Utilization: {avg_bandwidth:.2}%\n"
264 ));
265 report.push_str(&format!(
266 " Average Arithmetic Intensity: {avg_intensity:.2}\n"
267 ));
268
269 if avg_occupancy < 50.0 {
271 report.push_str(
272 " ⚠️ Low occupancy detected. Consider increasing workgroup size.\n",
273 );
274 }
275 if avg_efficiency < 70.0 {
276 report.push_str(
277 " ⚠️ Low efficiency. Check for thread divergence or memory issues.\n",
278 );
279 }
280 if avg_bandwidth < 60.0 {
281 report.push_str(" ⚠️ Low memory bandwidth utilization. Consider memory access optimization.\n");
282 }
283
284 report.push('\n');
285 }
286 }
287
288 report
289 } else {
290 "Failed to generate occupancy report".to_string()
291 }
292 }
293
294 pub fn clear(&self) {
296 if let Ok(mut inner) = self.inner.lock() {
297 inner.operation_timings.clear();
298 inner.memory_usage.clear();
299 inner.total_allocations = 0;
300 inner.total_deallocations = 0;
301 inner.peak_memory = 0;
302 inner.current_memory = 0;
303 inner.kernel_occupancy.clear();
304 }
305 }
306}
307
308pub struct OperationTimer {
310 operation: String,
311 start: Instant,
312 monitor: Arc<PerformanceMonitor>,
313}
314
315impl OperationTimer {
316 pub fn new(operation: String, monitor: Arc<PerformanceMonitor>) -> Self {
318 Self {
319 operation,
320 start: Instant::now(),
321 monitor,
322 }
323 }
324}
325
326impl Drop for OperationTimer {
327 fn drop(&mut self) {
328 let duration = self.start.elapsed();
329 self.monitor
330 .record_operation_time(&self.operation, duration);
331 }
332}
333
/// Process-wide monitor instance; empty until the first call to `global_monitor` or
/// `global_monitor_arc`, which initialize it via `OnceLock::get_or_init`.
static GLOBAL_MONITOR: std::sync::OnceLock<Arc<PerformanceMonitor>> = std::sync::OnceLock::new();
336
337pub fn global_monitor() -> &'static PerformanceMonitor {
339 GLOBAL_MONITOR.get_or_init(|| Arc::new(PerformanceMonitor::new()))
340}
341
342pub fn global_monitor_arc() -> Arc<PerformanceMonitor> {
344 GLOBAL_MONITOR
345 .get_or_init(|| Arc::new(PerformanceMonitor::new()))
346 .clone()
347}
348
/// Runs `$code` and records how long it took under the label `$name` on the global monitor.
///
/// Expands to a block that fetches the global monitor handle, creates an
/// `OperationTimer` bound to `_timer`, and then evaluates `$code`; the value of the
/// expansion is the value of `$code`. The timer is a local of that block, so it is
/// dropped — and the elapsed time recorded — when the block ends.
///
/// `$name` only needs a `to_string()` method.
///
/// NOTE(review): the expansion hard-codes the path `$crate::memory::tracking`; confirm
/// this file is mounted at that module path.
#[macro_export]
macro_rules! time_operation {
    ($name:expr, $code:block) => {{
        // Fetch the handle first, then start the timer immediately before `$code` runs.
        let monitor = $crate::memory::tracking::global_monitor_arc();
        let _timer = $crate::memory::tracking::OperationTimer::new($name.to_string(), monitor);
        $code
    }};
}
358
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread;

    // Timing averages and allocation/deallocation counters on a private monitor.
    #[test]
    fn test_performance_monitor() {
        let monitor = PerformanceMonitor::new();

        monitor.record_operation_time("test_op", Duration::from_millis(100));
        monitor.record_operation_time("test_op", Duration::from_millis(200));

        let avg_time = monitor
            .get_average_time("test_op")
            .expect("test: get_average_time should succeed");
        assert_eq!(avg_time, Duration::from_millis(150));

        monitor.record_allocation("tensor_alloc", 1024);
        assert_eq!(monitor.get_current_memory(), 1024);
        assert_eq!(monitor.get_peak_memory(), 1024);

        monitor.record_allocation("another_alloc", 512);
        assert_eq!(monitor.get_current_memory(), 1536);
        assert_eq!(monitor.get_peak_memory(), 1536);

        monitor.record_deallocation(512);
        assert_eq!(monitor.get_current_memory(), 1024);
        // The peak must not drop when memory is released.
        assert_eq!(monitor.get_peak_memory(), 1536);

        let (allocs, deallocs) = monitor.get_allocation_stats();
        assert_eq!(allocs, 2);
        assert_eq!(deallocs, 1);
    }

    // Dropping an `OperationTimer` records the elapsed time on its monitor.
    #[test]
    fn test_operation_timer() {
        let monitor = Arc::new(PerformanceMonitor::new());

        {
            // The timer lives only for this scope; leaving it triggers the recording.
            let _timer = OperationTimer::new("sleep_test".to_string(), monitor.clone());
            thread::sleep(Duration::from_millis(10));
        }

        let avg_time = monitor
            .get_average_time("sleep_test")
            .expect("test: get_average_time should succeed");
        // 9 ms leaves a small margin below the 10 ms sleep.
        assert!(avg_time >= Duration::from_millis(9));
    }

    // The text report contains the header, the memory figures and the operation label.
    #[test]
    fn test_report_generation() {
        let monitor = PerformanceMonitor::new();
        monitor.record_operation_time("op1", Duration::from_millis(100));
        monitor.record_allocation("alloc1", 1024);

        let report = monitor.generate_report();
        assert!(report.contains("Performance Monitor Report"));
        assert!(report.contains("Current Memory: 1024 bytes"));
        assert!(report.contains("op1:"));
    }

    // `global_monitor` always returns the same instance, and state is shared through it.
    #[test]
    fn test_global_monitor() {
        let monitor1 = global_monitor();
        let monitor2 = global_monitor();

        assert!(std::ptr::eq(monitor1, monitor2));

        // The global may already hold data, so compare against a baseline
        // instead of an absolute value.
        let initial_memory = monitor1.get_current_memory();

        monitor1.record_allocation("global_test", 512);
        let final_memory = monitor2.get_current_memory();

        assert_eq!(final_memory - initial_memory, 512);
    }

    // A single occupancy sample is returned as the average and shows up in the report.
    #[test]
    fn test_kernel_occupancy() {
        let monitor = PerformanceMonitor::new();

        let stats = KernelOccupancyStats {
            kernel_name: "test_kernel".to_string(),
            workgroup_size: 256,
            workgroups_dispatched: 100,
            theoretical_occupancy: 100.0,
            achieved_occupancy: 85.0,
            efficiency_ratio: 90.0,
            memory_bandwidth_utilization: 75.0,
            arithmetic_intensity: 2.5,
        };

        monitor.record_kernel_occupancy(stats);

        let avg_occupancy = monitor
            .get_average_kernel_occupancy("test_kernel")
            .expect("test: get_average_kernel_occupancy should succeed");
        // Exact comparison is safe: the mean of one sample is that sample.
        assert_eq!(avg_occupancy, 85.0);

        let occupancy_report = monitor.generate_occupancy_report();
        assert!(occupancy_report.contains("Kernel Occupancy Analysis"));
        assert!(occupancy_report.contains("test_kernel"));
    }

    // `clear` resets the memory counter and forgets recorded timings.
    #[test]
    fn test_clear_statistics() {
        let monitor = PerformanceMonitor::new();

        monitor.record_operation_time("op", Duration::from_millis(100));
        monitor.record_allocation("alloc", 1024);

        assert_eq!(monitor.get_current_memory(), 1024);
        assert!(monitor.get_average_time("op").is_some());

        monitor.clear();

        assert_eq!(monitor.get_current_memory(), 0);
        assert!(monitor.get_average_time("op").is_none());
    }
}