1use crate::error::Result;
11use parking_lot::RwLock;
12use std::collections::HashMap;
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15use wgpu::{Device, Queue};
16
17#[derive(Clone)]
19pub struct GpuProfiler {
20 #[allow(dead_code)]
22 device: Arc<Device>,
23 #[allow(dead_code)]
25 queue: Arc<Queue>,
26 metrics: Arc<RwLock<ProfilingMetrics>>,
27 config: ProfilingConfig,
28 #[allow(dead_code)]
30 query_sets: Arc<RwLock<Vec<wgpu::QuerySet>>>,
31 timestamp_period: f32,
32}
33
34impl GpuProfiler {
35 pub fn new(device: Arc<Device>, queue: Arc<Queue>, config: ProfilingConfig) -> Result<Self> {
37 let timestamp_period = queue.get_timestamp_period();
39
40 Ok(Self {
41 device,
42 queue,
43 metrics: Arc::new(RwLock::new(ProfilingMetrics::default())),
44 config,
45 query_sets: Arc::new(RwLock::new(Vec::new())),
46 timestamp_period,
47 })
48 }
49
50 pub fn begin_profile(&self, label: &str) -> ProfileSession {
52 let start = Instant::now();
53 ProfileSession {
54 label: label.to_string(),
55 start,
56 profiler: self.clone(),
57 gpu_start_query: None,
58 gpu_end_query: None,
59 }
60 }
61
62 pub fn record_kernel_execution(
64 &self,
65 label: &str,
66 duration: Duration,
67 memory_bytes: u64,
68 compute_units: u32,
69 ) {
70 let mut metrics = self.metrics.write();
71 metrics.record_kernel(label, duration, memory_bytes, compute_units);
72 }
73
74 pub fn record_memory_transfer(&self, bytes: u64, duration: Duration, host_to_device: bool) {
76 let mut metrics = self.metrics.write();
77 metrics.record_transfer(bytes, duration, host_to_device);
78 }
79
80 pub fn get_metrics(&self) -> ProfilingMetrics {
82 self.metrics.read().clone()
83 }
84
85 pub fn generate_report(&self) -> ProfilingReport {
87 let metrics = self.metrics.read();
88 ProfilingReport::from_metrics(&metrics)
89 }
90
91 pub fn reset(&self) {
93 let mut metrics = self.metrics.write();
94 *metrics = ProfilingMetrics::default();
95 }
96
97 pub fn timestamp_period(&self) -> f32 {
99 self.timestamp_period
100 }
101
102 pub fn detect_bottlenecks(&self) -> Vec<PerformanceBottleneck> {
104 let metrics = self.metrics.read();
105 let mut bottlenecks = Vec::new();
106
107 if let Some(bandwidth_gbs) = metrics.average_memory_bandwidth_gbs() {
109 if bandwidth_gbs < self.config.min_expected_bandwidth_gbs {
110 bottlenecks.push(PerformanceBottleneck {
111 kind: BottleneckKind::MemoryBandwidth,
112 severity: BottleneckSeverity::High,
113 description: format!(
114 "Memory bandwidth {:.2} GB/s is below expected {:.2} GB/s",
115 bandwidth_gbs, self.config.min_expected_bandwidth_gbs
116 ),
117 suggestion: "Consider batching transfers or using compression".to_string(),
118 });
119 }
120 }
121
122 for (label, stats) in &metrics.kernel_stats {
124 if let Some(avg_duration) = stats.average_duration() {
125 if avg_duration > self.config.max_kernel_duration {
126 bottlenecks.push(PerformanceBottleneck {
127 kind: BottleneckKind::KernelExecution,
128 severity: BottleneckSeverity::Medium,
129 description: format!(
130 "Kernel '{}' average duration {:?} exceeds threshold {:?}",
131 label, avg_duration, self.config.max_kernel_duration
132 ),
133 suggestion: "Consider optimizing shader or reducing workload".to_string(),
134 });
135 }
136 }
137 }
138
139 let total_time = metrics.total_duration();
141 let transfer_time = metrics.total_transfer_duration();
142 if total_time > Duration::ZERO {
143 let transfer_ratio = transfer_time.as_secs_f64() / total_time.as_secs_f64();
144 if transfer_ratio > self.config.max_transfer_ratio {
145 bottlenecks.push(PerformanceBottleneck {
146 kind: BottleneckKind::TransferOverhead,
147 severity: BottleneckSeverity::High,
148 description: format!(
149 "Memory transfer overhead {:.1}% exceeds threshold {:.1}%",
150 transfer_ratio * 100.0,
151 self.config.max_transfer_ratio * 100.0
152 ),
153 suggestion: "Reduce data transfers or overlap with computation".to_string(),
154 });
155 }
156 }
157
158 bottlenecks
159 }
160}
161
162pub struct ProfileSession {
164 label: String,
165 start: Instant,
166 profiler: GpuProfiler,
167 #[allow(dead_code)]
169 gpu_start_query: Option<u32>,
170 #[allow(dead_code)]
172 gpu_end_query: Option<u32>,
173}
174
175impl ProfileSession {
176 pub fn end(self, memory_bytes: u64, compute_units: u32) {
178 let duration = self.start.elapsed();
179 self.profiler
180 .record_kernel_execution(&self.label, duration, memory_bytes, compute_units);
181 }
182
183 pub fn end_with_duration(self, duration: Duration, memory_bytes: u64, compute_units: u32) {
185 self.profiler
186 .record_kernel_execution(&self.label, duration, memory_bytes, compute_units);
187 }
188}
189
/// Thresholds and switches used by `GpuProfiler`.
#[derive(Debug, Clone)]
pub struct ProfilingConfig {
    /// Detail switch; no code in this file reads it.
    pub detailed: bool,
    /// Average transfer bandwidth (GB/s) below which `detect_bottlenecks`
    /// reports a memory-bandwidth bottleneck.
    pub min_expected_bandwidth_gbs: f64,
    /// Average kernel duration above which a kernel is reported as a bottleneck.
    pub max_kernel_duration: Duration,
    /// Fraction of total recorded time (0.0..=1.0 scale) that transfers may
    /// take before transfer overhead is reported.
    pub max_transfer_ratio: f64,
    /// Power-tracking switch; no code in this file reads it.
    pub track_power: bool,
}

impl Default for ProfilingConfig {
    /// Detailed profiling on, power tracking off, 100 GB/s minimum bandwidth,
    /// 100 ms maximum average kernel duration, 30% maximum transfer share.
    fn default() -> Self {
        const MIN_BANDWIDTH_GBS: f64 = 100.0;
        const MAX_KERNEL_MILLIS: u64 = 100;
        const MAX_TRANSFER_RATIO: f64 = 0.3;

        Self {
            detailed: true,
            track_power: false,
            min_expected_bandwidth_gbs: MIN_BANDWIDTH_GBS,
            max_kernel_duration: Duration::from_millis(MAX_KERNEL_MILLIS),
            max_transfer_ratio: MAX_TRANSFER_RATIO,
        }
    }
}
216
217#[derive(Debug, Clone, Default)]
219pub struct ProfilingMetrics {
220 pub kernel_stats: HashMap<String, KernelStats>,
222 pub transfer_stats: TransferStats,
224 pub overall: OverallMetrics,
226}
227
228impl ProfilingMetrics {
229 fn record_kernel(
231 &mut self,
232 label: &str,
233 duration: Duration,
234 memory_bytes: u64,
235 compute_units: u32,
236 ) {
237 let stats = self.kernel_stats.entry(label.to_string()).or_default();
238 stats.record(duration, memory_bytes, compute_units);
239 self.overall.total_kernel_time += duration;
240 self.overall.total_kernels += 1;
241 }
242
243 fn record_transfer(&mut self, bytes: u64, duration: Duration, host_to_device: bool) {
245 if host_to_device {
246 self.transfer_stats.host_to_device.record(bytes, duration);
247 } else {
248 self.transfer_stats.device_to_host.record(bytes, duration);
249 }
250 self.overall.total_transfer_time += duration;
251 self.overall.total_transfers += 1;
252 self.overall.total_bytes_transferred += bytes;
253 }
254
255 fn average_memory_bandwidth_gbs(&self) -> Option<f64> {
257 let total_bytes = self.overall.total_bytes_transferred;
258 let total_time = self.overall.total_transfer_time;
259
260 if total_time > Duration::ZERO && total_bytes > 0 {
261 let bytes_per_sec = total_bytes as f64 / total_time.as_secs_f64();
262 Some(bytes_per_sec / 1_000_000_000.0)
263 } else {
264 None
265 }
266 }
267
268 fn total_duration(&self) -> Duration {
270 self.overall.total_kernel_time + self.overall.total_transfer_time
271 }
272
273 fn total_transfer_duration(&self) -> Duration {
275 self.overall.total_transfer_time
276 }
277}
278
/// Accumulated statistics for all executions recorded under one kernel label.
#[derive(Debug, Clone, Default)]
pub struct KernelStats {
    /// Number of recorded executions.
    pub executions: u64,
    /// Sum of all recorded durations.
    pub total_duration: Duration,
    /// Shortest recorded duration; `None` until the first sample.
    pub min_duration: Option<Duration>,
    /// Longest recorded duration; `None` until the first sample.
    pub max_duration: Option<Duration>,
    /// Sum of the `memory_bytes` values passed with each sample.
    pub total_memory_bytes: u64,
    /// Sum of the `compute_units` values passed with each sample.
    pub total_compute_units: u64,
}

impl KernelStats {
    /// Folds one execution sample into the running totals and extremes.
    fn record(&mut self, duration: Duration, memory_bytes: u64, compute_units: u32) {
        self.executions += 1;
        self.total_duration += duration;
        self.total_memory_bytes += memory_bytes;
        self.total_compute_units += u64::from(compute_units);

        self.min_duration = Some(self.min_duration.map_or(duration, |min| min.min(duration)));
        self.max_duration = Some(self.max_duration.map_or(duration, |max| max.max(duration)));
    }

    /// Mean duration per execution, truncated to whole nanoseconds, or `None`
    /// when nothing has been recorded.
    ///
    /// The division is done on the full `u64` execution count. Narrowing the
    /// count to `u32` (as `Duration / u32` requires) would give a wrong mean
    /// above `u32::MAX` executions and divide by zero at multiples of 2^32.
    pub fn average_duration(&self) -> Option<Duration> {
        const NANOS_PER_SEC: u128 = 1_000_000_000;

        if self.executions == 0 {
            return None;
        }

        let avg_nanos = self.total_duration.as_nanos() / u128::from(self.executions);
        // The mean never exceeds the total, whose whole seconds fit in `u64`;
        // the remainder is below 10^9 and fits in `u32`. Neither cast truncates.
        let secs = (avg_nanos / NANOS_PER_SEC) as u64;
        let subsec_nanos = (avg_nanos % NANOS_PER_SEC) as u32;
        Some(Duration::new(secs, subsec_nanos))
    }

    /// Memory throughput in GB/s (1 GB = 10^9 bytes): total recorded bytes
    /// divided by total recorded duration. `None` when either total is zero.
    pub fn bandwidth_gbs(&self) -> Option<f64> {
        if self.total_duration > Duration::ZERO && self.total_memory_bytes > 0 {
            let bytes_per_sec = self.total_memory_bytes as f64 / self.total_duration.as_secs_f64();
            Some(bytes_per_sec / 1_000_000_000.0)
        } else {
            None
        }
    }
}
334
335#[derive(Debug, Clone, Default)]
337pub struct TransferStats {
338 pub host_to_device: DirectionalTransferStats,
340 pub device_to_host: DirectionalTransferStats,
342}
343
/// Accumulated statistics for memory transfers in a single direction.
#[derive(Debug, Clone, Default)]
pub struct DirectionalTransferStats {
    /// Number of recorded transfers.
    pub count: u64,
    /// Sum of bytes over all recorded transfers.
    pub total_bytes: u64,
    /// Sum of durations over all recorded transfers.
    pub total_duration: Duration,
    /// Shortest recorded transfer; `None` until the first sample.
    pub min_duration: Option<Duration>,
    /// Longest recorded transfer; `None` until the first sample.
    pub max_duration: Option<Duration>,
}

impl DirectionalTransferStats {
    /// Folds one transfer sample into the running totals and extremes.
    fn record(&mut self, bytes: u64, duration: Duration) {
        self.count += 1;
        self.total_bytes += bytes;
        self.total_duration += duration;

        self.min_duration = match self.min_duration {
            Some(shortest) if shortest <= duration => Some(shortest),
            _ => Some(duration),
        };
        self.max_duration = match self.max_duration {
            Some(longest) if longest >= duration => Some(longest),
            _ => Some(duration),
        };
    }

    /// Throughput in GB/s (1 GB = 10^9 bytes) over all recorded transfers, or
    /// `None` when no bytes or no time were recorded.
    pub fn bandwidth_gbs(&self) -> Option<f64> {
        if self.total_duration.is_zero() || self.total_bytes == 0 {
            return None;
        }

        let bytes_per_sec = self.total_bytes as f64 / self.total_duration.as_secs_f64();
        Some(bytes_per_sec / 1_000_000_000.0)
    }
}
387
/// Running totals across every kernel and transfer recorded so far.
#[derive(Debug, Clone, Default)]
pub struct OverallMetrics {
    /// Sum of all recorded kernel durations.
    pub total_kernel_time: Duration,
    /// Sum of all recorded transfer durations (both directions).
    pub total_transfer_time: Duration,
    /// Number of recorded kernel executions.
    pub total_kernels: u64,
    /// Number of recorded transfers.
    pub total_transfers: u64,
    /// Sum of bytes over all recorded transfers.
    pub total_bytes_transferred: u64,
}
402
403#[derive(Debug, Clone)]
405pub struct ProfilingReport {
406 pub summary: ReportSummary,
408 pub kernel_details: Vec<KernelDetail>,
410 pub transfer_details: TransferDetail,
412 pub bottlenecks: Vec<PerformanceBottleneck>,
414}
415
416impl ProfilingReport {
417 fn from_metrics(metrics: &ProfilingMetrics) -> Self {
418 let mut kernel_details = Vec::new();
419 for (label, stats) in &metrics.kernel_stats {
420 kernel_details.push(KernelDetail {
421 name: label.clone(),
422 executions: stats.executions,
423 total_time: stats.total_duration,
424 avg_time: stats.average_duration().unwrap_or_default(),
425 min_time: stats.min_duration.unwrap_or_default(),
426 max_time: stats.max_duration.unwrap_or_default(),
427 bandwidth_gbs: stats.bandwidth_gbs(),
428 });
429 }
430
431 kernel_details.sort_by_key(|x| std::cmp::Reverse(x.total_time));
433
434 Self {
435 summary: ReportSummary {
436 total_duration: metrics.total_duration(),
437 kernel_time: metrics.overall.total_kernel_time,
438 transfer_time: metrics.overall.total_transfer_time,
439 total_kernels: metrics.overall.total_kernels,
440 total_transfers: metrics.overall.total_transfers,
441 average_bandwidth_gbs: metrics.average_memory_bandwidth_gbs(),
442 },
443 kernel_details,
444 transfer_details: TransferDetail {
445 host_to_device_count: metrics.transfer_stats.host_to_device.count,
446 host_to_device_bytes: metrics.transfer_stats.host_to_device.total_bytes,
447 host_to_device_bandwidth: metrics.transfer_stats.host_to_device.bandwidth_gbs(),
448 device_to_host_count: metrics.transfer_stats.device_to_host.count,
449 device_to_host_bytes: metrics.transfer_stats.device_to_host.total_bytes,
450 device_to_host_bandwidth: metrics.transfer_stats.device_to_host.bandwidth_gbs(),
451 },
452 bottlenecks: Vec::new(),
453 }
454 }
455
456 pub fn print(&self) {
458 println!("=== GPU Profiling Report ===");
459 println!("\nSummary:");
460 println!(" Total Duration: {:?}", self.summary.total_duration);
461 println!(
462 " Kernel Time: {:?} ({:.1}%)",
463 self.summary.kernel_time,
464 100.0 * self.summary.kernel_time.as_secs_f64()
465 / self.summary.total_duration.as_secs_f64()
466 );
467 println!(
468 " Transfer Time: {:?} ({:.1}%)",
469 self.summary.transfer_time,
470 100.0 * self.summary.transfer_time.as_secs_f64()
471 / self.summary.total_duration.as_secs_f64()
472 );
473 println!(" Total Kernels: {}", self.summary.total_kernels);
474 println!(" Total Transfers: {}", self.summary.total_transfers);
475 if let Some(bw) = self.summary.average_bandwidth_gbs {
476 println!(" Average Bandwidth: {:.2} GB/s", bw);
477 }
478
479 println!("\nTop Kernels by Time:");
480 for detail in self.kernel_details.iter().take(10) {
481 println!(
482 " {} ({} execs): {:?} total, {:?} avg",
483 detail.name, detail.executions, detail.total_time, detail.avg_time
484 );
485 if let Some(bw) = detail.bandwidth_gbs {
486 println!(" Bandwidth: {:.2} GB/s", bw);
487 }
488 }
489
490 println!("\nMemory Transfers:");
491 println!(
492 " Host->Device: {} transfers, {} bytes ({:.2} GB/s)",
493 self.transfer_details.host_to_device_count,
494 self.transfer_details.host_to_device_bytes,
495 self.transfer_details
496 .host_to_device_bandwidth
497 .unwrap_or(0.0)
498 );
499 println!(
500 " Device->Host: {} transfers, {} bytes ({:.2} GB/s)",
501 self.transfer_details.device_to_host_count,
502 self.transfer_details.device_to_host_bytes,
503 self.transfer_details
504 .device_to_host_bandwidth
505 .unwrap_or(0.0)
506 );
507
508 if !self.bottlenecks.is_empty() {
509 println!("\nPerformance Bottlenecks:");
510 for bottleneck in &self.bottlenecks {
511 println!(
512 " [{:?}] {:?}: {}",
513 bottleneck.severity, bottleneck.kind, bottleneck.description
514 );
515 println!(" Suggestion: {}", bottleneck.suggestion);
516 }
517 }
518 }
519}
520
/// Aggregate totals shown in the report's summary section.
#[derive(Debug, Clone)]
pub struct ReportSummary {
    /// Kernel time plus transfer time.
    pub total_duration: Duration,
    /// Sum of all recorded kernel durations.
    pub kernel_time: Duration,
    /// Sum of all recorded transfer durations.
    pub transfer_time: Duration,
    /// Number of recorded kernel executions.
    pub total_kernels: u64,
    /// Number of recorded transfers.
    pub total_transfers: u64,
    /// Average transfer bandwidth in GB/s; `None` when it could not be computed.
    pub average_bandwidth_gbs: Option<f64>,
}
537
/// Per-kernel line item of a profiling report.
#[derive(Debug, Clone)]
pub struct KernelDetail {
    /// Kernel label.
    pub name: String,
    /// Number of recorded executions.
    pub executions: u64,
    /// Sum of all recorded durations.
    pub total_time: Duration,
    /// Mean duration per execution (zero when unavailable).
    pub avg_time: Duration,
    /// Shortest recorded duration (zero when unavailable).
    pub min_time: Duration,
    /// Longest recorded duration (zero when unavailable).
    pub max_time: Duration,
    /// Memory throughput in GB/s; `None` when it could not be computed.
    pub bandwidth_gbs: Option<f64>,
}
556
/// Per-direction transfer totals shown in a profiling report.
#[derive(Debug, Clone)]
pub struct TransferDetail {
    /// Number of host-to-device transfers.
    pub host_to_device_count: u64,
    /// Bytes moved host-to-device.
    pub host_to_device_bytes: u64,
    /// Host-to-device bandwidth in GB/s; `None` when it could not be computed.
    pub host_to_device_bandwidth: Option<f64>,
    /// Number of device-to-host transfers.
    pub device_to_host_count: u64,
    /// Bytes moved device-to-host.
    pub device_to_host_bytes: u64,
    /// Device-to-host bandwidth in GB/s; `None` when it could not be computed.
    pub device_to_host_bandwidth: Option<f64>,
}
573
574#[derive(Debug, Clone)]
576pub struct PerformanceBottleneck {
577 pub kind: BottleneckKind,
579 pub severity: BottleneckSeverity,
581 pub description: String,
583 pub suggestion: String,
585}
586
/// Category of a detected performance bottleneck.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BottleneckKind {
    /// Average transfer bandwidth is below the configured minimum.
    MemoryBandwidth,
    /// A kernel's average duration exceeds the configured maximum.
    KernelExecution,
    /// Transfers take more than the configured share of total time.
    TransferOverhead,
    /// Synchronization-related bottleneck; not produced by any code in this file.
    Synchronization,
}
599
/// Severity of a detected bottleneck.
///
/// Variants are declared from least to most severe, so the derived ordering
/// gives `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum BottleneckSeverity {
    /// Least severe level.
    Low,
    /// Used for kernels exceeding the duration threshold.
    Medium,
    /// Used for bandwidth and transfer-overhead findings.
    High,
    /// Most severe level; not produced by any code in this file.
    Critical,
}
612
#[cfg(test)]
mod tests {
    use super::*;

    /// Two kernel samples must accumulate the execution count, the byte and
    /// compute-unit totals, and the shortest/longest durations.
    #[test]
    fn test_kernel_stats() {
        let short = Duration::from_millis(10);
        let long = Duration::from_millis(20);

        let mut stats = KernelStats::default();
        for (elapsed, bytes, units) in [(short, 1024, 8), (long, 2048, 16)] {
            stats.record(elapsed, bytes, units);
        }

        assert_eq!(stats.executions, 2);
        assert_eq!(stats.total_memory_bytes, 3072);
        assert_eq!(stats.total_compute_units, 24);
        assert_eq!(stats.min_duration, Some(short));
        assert_eq!(stats.max_duration, Some(long));
    }

    /// Two transfer samples must accumulate count and bytes and yield a bandwidth.
    #[test]
    fn test_transfer_stats() {
        let samples = [
            (1024, Duration::from_micros(10)),
            (2048, Duration::from_micros(20)),
        ];

        let mut stats = DirectionalTransferStats::default();
        for (bytes, elapsed) in samples {
            stats.record(bytes, elapsed);
        }

        assert_eq!(stats.count, 2);
        assert_eq!(stats.total_bytes, 3072);
        assert!(stats.bandwidth_gbs().is_some());
    }
}