oxigdal_gpu_advanced/
profiling.rs

//! GPU profiling and performance metrics.
//!
//! This module provides GPU profiling capabilities including:
//! - Kernel execution time measurement
//! - Memory bandwidth tracking
//! - GPU utilization metrics
//! - Performance bottleneck detection
//! - Power consumption monitoring (when available)
//!
//! Durations are currently measured on the host with [`Instant`]; the device,
//! queue and query-set handles held by [`GpuProfiler`] are reserved for GPU
//! timestamp queries.
9
10use crate::error::Result;
11use parking_lot::RwLock;
12use std::collections::HashMap;
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15use wgpu::{Device, Queue};
16
17/// GPU profiling manager
18#[derive(Clone)]
19pub struct GpuProfiler {
20    /// Device for GPU timestamp queries (reserved for GPU profiling)
21    #[allow(dead_code)]
22    device: Arc<Device>,
23    /// Queue for GPU command submission (reserved for GPU profiling)
24    #[allow(dead_code)]
25    queue: Arc<Queue>,
26    metrics: Arc<RwLock<ProfilingMetrics>>,
27    config: ProfilingConfig,
28    /// Query sets for GPU timestamp queries (reserved for GPU profiling)
29    #[allow(dead_code)]
30    query_sets: Arc<RwLock<Vec<wgpu::QuerySet>>>,
31    timestamp_period: f32,
32}
33
34impl GpuProfiler {
35    /// Create a new GPU profiler
36    pub fn new(device: Arc<Device>, queue: Arc<Queue>, config: ProfilingConfig) -> Result<Self> {
37        // Get timestamp period for accurate timing
38        let timestamp_period = queue.get_timestamp_period();
39
40        Ok(Self {
41            device,
42            queue,
43            metrics: Arc::new(RwLock::new(ProfilingMetrics::default())),
44            config,
45            query_sets: Arc::new(RwLock::new(Vec::new())),
46            timestamp_period,
47        })
48    }
49
50    /// Start profiling a kernel execution
51    pub fn begin_profile(&self, label: &str) -> ProfileSession {
52        let start = Instant::now();
53        ProfileSession {
54            label: label.to_string(),
55            start,
56            profiler: self.clone(),
57            gpu_start_query: None,
58            gpu_end_query: None,
59        }
60    }
61
62    /// Record kernel execution metrics
63    pub fn record_kernel_execution(
64        &self,
65        label: &str,
66        duration: Duration,
67        memory_bytes: u64,
68        compute_units: u32,
69    ) {
70        let mut metrics = self.metrics.write();
71        metrics.record_kernel(label, duration, memory_bytes, compute_units);
72    }
73
74    /// Record memory transfer
75    pub fn record_memory_transfer(&self, bytes: u64, duration: Duration, host_to_device: bool) {
76        let mut metrics = self.metrics.write();
77        metrics.record_transfer(bytes, duration, host_to_device);
78    }
79
80    /// Get current metrics
81    pub fn get_metrics(&self) -> ProfilingMetrics {
82        self.metrics.read().clone()
83    }
84
85    /// Generate profiling report
86    pub fn generate_report(&self) -> ProfilingReport {
87        let metrics = self.metrics.read();
88        ProfilingReport::from_metrics(&metrics)
89    }
90
91    /// Reset all metrics
92    pub fn reset(&self) {
93        let mut metrics = self.metrics.write();
94        *metrics = ProfilingMetrics::default();
95    }
96
97    /// Get timestamp period in nanoseconds
98    pub fn timestamp_period(&self) -> f32 {
99        self.timestamp_period
100    }
101
102    /// Detect performance bottlenecks
103    pub fn detect_bottlenecks(&self) -> Vec<PerformanceBottleneck> {
104        let metrics = self.metrics.read();
105        let mut bottlenecks = Vec::new();
106
107        // Check memory bandwidth
108        if let Some(bandwidth_gbs) = metrics.average_memory_bandwidth_gbs() {
109            if bandwidth_gbs < self.config.min_expected_bandwidth_gbs {
110                bottlenecks.push(PerformanceBottleneck {
111                    kind: BottleneckKind::MemoryBandwidth,
112                    severity: BottleneckSeverity::High,
113                    description: format!(
114                        "Memory bandwidth {:.2} GB/s is below expected {:.2} GB/s",
115                        bandwidth_gbs, self.config.min_expected_bandwidth_gbs
116                    ),
117                    suggestion: "Consider batching transfers or using compression".to_string(),
118                });
119            }
120        }
121
122        // Check kernel efficiency
123        for (label, stats) in &metrics.kernel_stats {
124            if let Some(avg_duration) = stats.average_duration() {
125                if avg_duration > self.config.max_kernel_duration {
126                    bottlenecks.push(PerformanceBottleneck {
127                        kind: BottleneckKind::KernelExecution,
128                        severity: BottleneckSeverity::Medium,
129                        description: format!(
130                            "Kernel '{}' average duration {:?} exceeds threshold {:?}",
131                            label, avg_duration, self.config.max_kernel_duration
132                        ),
133                        suggestion: "Consider optimizing shader or reducing workload".to_string(),
134                    });
135                }
136            }
137        }
138
139        // Check transfer overhead
140        let total_time = metrics.total_duration();
141        let transfer_time = metrics.total_transfer_duration();
142        if total_time > Duration::ZERO {
143            let transfer_ratio = transfer_time.as_secs_f64() / total_time.as_secs_f64();
144            if transfer_ratio > self.config.max_transfer_ratio {
145                bottlenecks.push(PerformanceBottleneck {
146                    kind: BottleneckKind::TransferOverhead,
147                    severity: BottleneckSeverity::High,
148                    description: format!(
149                        "Memory transfer overhead {:.1}% exceeds threshold {:.1}%",
150                        transfer_ratio * 100.0,
151                        self.config.max_transfer_ratio * 100.0
152                    ),
153                    suggestion: "Reduce data transfers or overlap with computation".to_string(),
154                });
155            }
156        }
157
158        bottlenecks
159    }
160}
161
162/// Profile session for a single operation
163pub struct ProfileSession {
164    label: String,
165    start: Instant,
166    profiler: GpuProfiler,
167    /// GPU start query index (reserved for GPU timestamp queries)
168    #[allow(dead_code)]
169    gpu_start_query: Option<u32>,
170    /// GPU end query index (reserved for GPU timestamp queries)
171    #[allow(dead_code)]
172    gpu_end_query: Option<u32>,
173}
174
175impl ProfileSession {
176    /// End profiling and record metrics
177    pub fn end(self, memory_bytes: u64, compute_units: u32) {
178        let duration = self.start.elapsed();
179        self.profiler
180            .record_kernel_execution(&self.label, duration, memory_bytes, compute_units);
181    }
182
183    /// End with custom duration (for GPU timestamp queries)
184    pub fn end_with_duration(self, duration: Duration, memory_bytes: u64, compute_units: u32) {
185        self.profiler
186            .record_kernel_execution(&self.label, duration, memory_bytes, compute_units);
187    }
188}
189
/// Tunable thresholds and switches for the profiler.
#[derive(Clone, Debug)]
pub struct ProfilingConfig {
    /// Whether detailed profiling is enabled.
    pub detailed: bool,
    /// Lowest average bandwidth, in GB/s, that is not reported as a bottleneck.
    pub min_expected_bandwidth_gbs: f64,
    /// Longest acceptable average kernel duration.
    pub max_kernel_duration: Duration,
    /// Largest acceptable share of total time spent in transfers (0.0 - 1.0).
    pub max_transfer_ratio: f64,
    /// Whether power consumption tracking is enabled (if available).
    pub track_power: bool,
}

impl Default for ProfilingConfig {
    /// Detailed profiling on, power tracking off, with thresholds of
    /// 100 GB/s bandwidth, 100 ms per kernel and a 30% transfer share.
    fn default() -> Self {
        let max_kernel_duration = Duration::from_millis(100);
        Self {
            detailed: true,
            track_power: false,
            min_expected_bandwidth_gbs: 100.0,
            max_transfer_ratio: 0.3,
            max_kernel_duration,
        }
    }
}
216
217/// Aggregated profiling metrics
218#[derive(Debug, Clone, Default)]
219pub struct ProfilingMetrics {
220    /// Per-kernel statistics
221    pub kernel_stats: HashMap<String, KernelStats>,
222    /// Memory transfer statistics
223    pub transfer_stats: TransferStats,
224    /// Overall metrics
225    pub overall: OverallMetrics,
226}
227
228impl ProfilingMetrics {
229    /// Record a kernel execution
230    fn record_kernel(
231        &mut self,
232        label: &str,
233        duration: Duration,
234        memory_bytes: u64,
235        compute_units: u32,
236    ) {
237        let stats = self.kernel_stats.entry(label.to_string()).or_default();
238        stats.record(duration, memory_bytes, compute_units);
239        self.overall.total_kernel_time += duration;
240        self.overall.total_kernels += 1;
241    }
242
243    /// Record a memory transfer
244    fn record_transfer(&mut self, bytes: u64, duration: Duration, host_to_device: bool) {
245        if host_to_device {
246            self.transfer_stats.host_to_device.record(bytes, duration);
247        } else {
248            self.transfer_stats.device_to_host.record(bytes, duration);
249        }
250        self.overall.total_transfer_time += duration;
251        self.overall.total_transfers += 1;
252        self.overall.total_bytes_transferred += bytes;
253    }
254
255    /// Calculate average memory bandwidth in GB/s
256    fn average_memory_bandwidth_gbs(&self) -> Option<f64> {
257        let total_bytes = self.overall.total_bytes_transferred;
258        let total_time = self.overall.total_transfer_time;
259
260        if total_time > Duration::ZERO && total_bytes > 0 {
261            let bytes_per_sec = total_bytes as f64 / total_time.as_secs_f64();
262            Some(bytes_per_sec / 1_000_000_000.0)
263        } else {
264            None
265        }
266    }
267
268    /// Get total duration
269    fn total_duration(&self) -> Duration {
270        self.overall.total_kernel_time + self.overall.total_transfer_time
271    }
272
273    /// Get total transfer duration
274    fn total_transfer_duration(&self) -> Duration {
275        self.overall.total_transfer_time
276    }
277}
278
/// Statistics accumulated for one kernel label.
#[derive(Debug, Clone, Default)]
pub struct KernelStats {
    /// Number of executions
    pub executions: u64,
    /// Total execution time
    pub total_duration: Duration,
    /// Minimum execution time (`None` until the first execution is recorded)
    pub min_duration: Option<Duration>,
    /// Maximum execution time (`None` until the first execution is recorded)
    pub max_duration: Option<Duration>,
    /// Total memory accessed, in bytes
    pub total_memory_bytes: u64,
    /// Total compute units used
    pub total_compute_units: u64,
}

impl KernelStats {
    /// Fold one execution into the running totals and the min/max durations.
    fn record(&mut self, duration: Duration, memory_bytes: u64, compute_units: u32) {
        self.executions += 1;
        self.total_duration += duration;
        self.total_memory_bytes += memory_bytes;
        self.total_compute_units += u64::from(compute_units);

        // The first sample seeds both extremes.
        self.min_duration = Some(self.min_duration.map_or(duration, |min| min.min(duration)));
        self.max_duration = Some(self.max_duration.map_or(duration, |max| max.max(duration)));
    }

    /// Calculate the average duration per execution.
    ///
    /// Returns `None` when nothing has been recorded. The result is rounded
    /// down to whole nanoseconds.
    pub fn average_duration(&self) -> Option<Duration> {
        const NANOS_PER_SEC: u128 = 1_000_000_000;

        if self.executions == 0 {
            return None;
        }

        // Divide in u128 nanoseconds. `Duration / u32` would need a lossy
        // `u64 -> u32` cast of the count, which skews the average past
        // `u32::MAX` executions and divides by zero at multiples of 2^32.
        let avg_nanos = self.total_duration.as_nanos() / u128::from(self.executions);

        // An average never exceeds the total, so the seconds part fits in a
        // u64; the remainder is below one second and fits in a u32.
        Some(Duration::new(
            (avg_nanos / NANOS_PER_SEC) as u64,
            (avg_nanos % NANOS_PER_SEC) as u32,
        ))
    }

    /// Calculate bandwidth in GB/s (1 GB = 10^9 bytes).
    ///
    /// Returns `None` when no time or no bytes have been recorded.
    pub fn bandwidth_gbs(&self) -> Option<f64> {
        if self.total_duration > Duration::ZERO && self.total_memory_bytes > 0 {
            let bytes_per_sec = self.total_memory_bytes as f64 / self.total_duration.as_secs_f64();
            Some(bytes_per_sec / 1_000_000_000.0)
        } else {
            None
        }
    }
}
334
335/// Memory transfer statistics
336#[derive(Debug, Clone, Default)]
337pub struct TransferStats {
338    /// Host to device transfers
339    pub host_to_device: DirectionalTransferStats,
340    /// Device to host transfers
341    pub device_to_host: DirectionalTransferStats,
342}
343
/// Running statistics for transfers in a single direction.
#[derive(Clone, Debug, Default)]
pub struct DirectionalTransferStats {
    /// How many transfers have been recorded.
    pub count: u64,
    /// Sum of the bytes moved by all recorded transfers.
    pub total_bytes: u64,
    /// Sum of the durations of all recorded transfers.
    pub total_duration: Duration,
    /// Shortest recorded transfer; `None` until the first one is recorded.
    pub min_duration: Option<Duration>,
    /// Longest recorded transfer; `None` until the first one is recorded.
    pub max_duration: Option<Duration>,
}

impl DirectionalTransferStats {
    /// Fold one transfer into the totals and the min/max durations.
    fn record(&mut self, bytes: u64, duration: Duration) {
        self.count += 1;
        self.total_bytes += bytes;
        self.total_duration += duration;

        // The first sample seeds both extremes.
        let shortest = match self.min_duration {
            Some(seen) => seen.min(duration),
            None => duration,
        };
        let longest = match self.max_duration {
            Some(seen) => seen.max(duration),
            None => duration,
        };
        self.min_duration = Some(shortest);
        self.max_duration = Some(longest);
    }

    /// Average bandwidth in GB/s (1 GB = 10^9 bytes) over all recorded transfers.
    ///
    /// Returns `None` when no time or no bytes have been recorded.
    pub fn bandwidth_gbs(&self) -> Option<f64> {
        if self.total_duration.is_zero() || self.total_bytes == 0 {
            return None;
        }

        let seconds = self.total_duration.as_secs_f64();
        let bytes_per_sec = self.total_bytes as f64 / seconds;
        Some(bytes_per_sec / 1_000_000_000.0)
    }
}
387
/// Running totals across every recorded kernel and transfer.
#[derive(Clone, Debug, Default)]
pub struct OverallMetrics {
    /// Sum of all recorded kernel durations.
    pub total_kernel_time: Duration,
    /// Sum of all recorded transfer durations.
    pub total_transfer_time: Duration,
    /// Count of recorded kernel executions.
    pub total_kernels: u64,
    /// Count of recorded transfers.
    pub total_transfers: u64,
    /// Sum of the bytes moved by all recorded transfers.
    pub total_bytes_transferred: u64,
}
402
403/// Profiling report
404#[derive(Debug, Clone)]
405pub struct ProfilingReport {
406    /// Summary statistics
407    pub summary: ReportSummary,
408    /// Per-kernel details
409    pub kernel_details: Vec<KernelDetail>,
410    /// Transfer details
411    pub transfer_details: TransferDetail,
412    /// Detected bottlenecks
413    pub bottlenecks: Vec<PerformanceBottleneck>,
414}
415
416impl ProfilingReport {
417    fn from_metrics(metrics: &ProfilingMetrics) -> Self {
418        let mut kernel_details = Vec::new();
419        for (label, stats) in &metrics.kernel_stats {
420            kernel_details.push(KernelDetail {
421                name: label.clone(),
422                executions: stats.executions,
423                total_time: stats.total_duration,
424                avg_time: stats.average_duration().unwrap_or_default(),
425                min_time: stats.min_duration.unwrap_or_default(),
426                max_time: stats.max_duration.unwrap_or_default(),
427                bandwidth_gbs: stats.bandwidth_gbs(),
428            });
429        }
430
431        // Sort by total time descending
432        kernel_details.sort_by_key(|x| std::cmp::Reverse(x.total_time));
433
434        Self {
435            summary: ReportSummary {
436                total_duration: metrics.total_duration(),
437                kernel_time: metrics.overall.total_kernel_time,
438                transfer_time: metrics.overall.total_transfer_time,
439                total_kernels: metrics.overall.total_kernels,
440                total_transfers: metrics.overall.total_transfers,
441                average_bandwidth_gbs: metrics.average_memory_bandwidth_gbs(),
442            },
443            kernel_details,
444            transfer_details: TransferDetail {
445                host_to_device_count: metrics.transfer_stats.host_to_device.count,
446                host_to_device_bytes: metrics.transfer_stats.host_to_device.total_bytes,
447                host_to_device_bandwidth: metrics.transfer_stats.host_to_device.bandwidth_gbs(),
448                device_to_host_count: metrics.transfer_stats.device_to_host.count,
449                device_to_host_bytes: metrics.transfer_stats.device_to_host.total_bytes,
450                device_to_host_bandwidth: metrics.transfer_stats.device_to_host.bandwidth_gbs(),
451            },
452            bottlenecks: Vec::new(),
453        }
454    }
455
456    /// Print report to stdout
457    pub fn print(&self) {
458        println!("=== GPU Profiling Report ===");
459        println!("\nSummary:");
460        println!("  Total Duration: {:?}", self.summary.total_duration);
461        println!(
462            "  Kernel Time: {:?} ({:.1}%)",
463            self.summary.kernel_time,
464            100.0 * self.summary.kernel_time.as_secs_f64()
465                / self.summary.total_duration.as_secs_f64()
466        );
467        println!(
468            "  Transfer Time: {:?} ({:.1}%)",
469            self.summary.transfer_time,
470            100.0 * self.summary.transfer_time.as_secs_f64()
471                / self.summary.total_duration.as_secs_f64()
472        );
473        println!("  Total Kernels: {}", self.summary.total_kernels);
474        println!("  Total Transfers: {}", self.summary.total_transfers);
475        if let Some(bw) = self.summary.average_bandwidth_gbs {
476            println!("  Average Bandwidth: {:.2} GB/s", bw);
477        }
478
479        println!("\nTop Kernels by Time:");
480        for detail in self.kernel_details.iter().take(10) {
481            println!(
482                "  {} ({} execs): {:?} total, {:?} avg",
483                detail.name, detail.executions, detail.total_time, detail.avg_time
484            );
485            if let Some(bw) = detail.bandwidth_gbs {
486                println!("    Bandwidth: {:.2} GB/s", bw);
487            }
488        }
489
490        println!("\nMemory Transfers:");
491        println!(
492            "  Host->Device: {} transfers, {} bytes ({:.2} GB/s)",
493            self.transfer_details.host_to_device_count,
494            self.transfer_details.host_to_device_bytes,
495            self.transfer_details
496                .host_to_device_bandwidth
497                .unwrap_or(0.0)
498        );
499        println!(
500            "  Device->Host: {} transfers, {} bytes ({:.2} GB/s)",
501            self.transfer_details.device_to_host_count,
502            self.transfer_details.device_to_host_bytes,
503            self.transfer_details
504                .device_to_host_bandwidth
505                .unwrap_or(0.0)
506        );
507
508        if !self.bottlenecks.is_empty() {
509            println!("\nPerformance Bottlenecks:");
510            for bottleneck in &self.bottlenecks {
511                println!(
512                    "  [{:?}] {:?}: {}",
513                    bottleneck.severity, bottleneck.kind, bottleneck.description
514                );
515                println!("    Suggestion: {}", bottleneck.suggestion);
516            }
517        }
518    }
519}
520
/// Headline totals of a profiling report.
#[derive(Clone, Debug)]
pub struct ReportSummary {
    /// Kernel time plus transfer time.
    pub total_duration: Duration,
    /// Time spent executing kernels.
    pub kernel_time: Duration,
    /// Time spent in memory transfers.
    pub transfer_time: Duration,
    /// How many kernel executions were recorded.
    pub total_kernels: u64,
    /// How many transfers were recorded.
    pub total_transfers: u64,
    /// Average transfer bandwidth in GB/s, when one could be computed.
    pub average_bandwidth_gbs: Option<f64>,
}
537
/// Report entry describing a single kernel label.
#[derive(Clone, Debug)]
pub struct KernelDetail {
    /// Label the kernel was recorded under.
    pub name: String,
    /// How many times the kernel ran.
    pub executions: u64,
    /// Time summed over all executions.
    pub total_time: Duration,
    /// Average time per execution.
    pub avg_time: Duration,
    /// Shortest single execution.
    pub min_time: Duration,
    /// Longest single execution.
    pub max_time: Duration,
    /// Bandwidth in GB/s, when one could be computed.
    pub bandwidth_gbs: Option<f64>,
}
556
/// Report entry describing both transfer directions.
#[derive(Clone, Debug)]
pub struct TransferDetail {
    /// Number of host-to-device transfers.
    pub host_to_device_count: u64,
    /// Bytes moved from host to device.
    pub host_to_device_bytes: u64,
    /// Host-to-device bandwidth, when one could be computed.
    pub host_to_device_bandwidth: Option<f64>,
    /// Number of device-to-host transfers.
    pub device_to_host_count: u64,
    /// Bytes moved from device to host.
    pub device_to_host_bytes: u64,
    /// Device-to-host bandwidth, when one could be computed.
    pub device_to_host_bandwidth: Option<f64>,
}
573
574/// Performance bottleneck
575#[derive(Debug, Clone)]
576pub struct PerformanceBottleneck {
577    /// Bottleneck kind
578    pub kind: BottleneckKind,
579    /// Severity
580    pub severity: BottleneckSeverity,
581    /// Description
582    pub description: String,
583    /// Suggestion for improvement
584    pub suggestion: String,
585}
586
/// Category of a detected bottleneck.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BottleneckKind {
    /// Average memory bandwidth is lower than expected.
    MemoryBandwidth,
    /// A kernel's average execution time is too long.
    KernelExecution,
    /// Too large a share of total time is spent in transfers.
    TransferOverhead,
    /// Time is lost to synchronization.
    Synchronization,
}
599
/// Severity of a detected bottleneck.
///
/// The derived ordering follows declaration order, so
/// `Low < Medium < High < Critical`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum BottleneckSeverity {
    /// Least severe level.
    Low,
    /// Moderate level.
    Medium,
    /// Serious level.
    High,
    /// Most severe level.
    Critical,
}
612
#[cfg(test)]
mod tests {
    use super::*;

    /// Two recorded executions update every counter, both extremes and the average.
    #[test]
    fn test_kernel_stats() {
        let mut stats = KernelStats::default();
        stats.record(Duration::from_millis(10), 1024, 8);
        stats.record(Duration::from_millis(20), 2048, 16);

        assert_eq!(stats.executions, 2);
        assert_eq!(stats.total_duration, Duration::from_millis(30));
        assert_eq!(stats.total_memory_bytes, 3072);
        assert_eq!(stats.total_compute_units, 24);
        assert_eq!(stats.min_duration, Some(Duration::from_millis(10)));
        assert_eq!(stats.max_duration, Some(Duration::from_millis(20)));
        assert_eq!(stats.average_duration(), Some(Duration::from_millis(15)));
        assert!(stats.bandwidth_gbs().is_some());
    }

    /// Freshly created stats report no average, no bandwidth and no extremes.
    #[test]
    fn test_kernel_stats_empty() {
        let stats = KernelStats::default();

        assert_eq!(stats.average_duration(), None);
        assert_eq!(stats.bandwidth_gbs(), None);
        assert_eq!(stats.min_duration, None);
        assert_eq!(stats.max_duration, None);
    }

    /// Two recorded transfers update the totals, the extremes and the bandwidth.
    #[test]
    fn test_transfer_stats() {
        let mut stats = DirectionalTransferStats::default();
        stats.record(1024, Duration::from_micros(10));
        stats.record(2048, Duration::from_micros(20));

        assert_eq!(stats.count, 2);
        assert_eq!(stats.total_bytes, 3072);
        assert_eq!(stats.total_duration, Duration::from_micros(30));
        assert_eq!(stats.min_duration, Some(Duration::from_micros(10)));
        assert_eq!(stats.max_duration, Some(Duration::from_micros(20)));

        // 3072 bytes in 30 microseconds is 102.4 MB/s, i.e. 0.1024 GB/s.
        let bandwidth = stats
            .bandwidth_gbs()
            .expect("bandwidth is defined once bytes and time are recorded");
        assert!((bandwidth - 0.1024).abs() < 1e-9);
    }

    /// Freshly created transfer stats report no bandwidth.
    #[test]
    fn test_transfer_stats_empty() {
        let stats = DirectionalTransferStats::default();

        assert_eq!(stats.count, 0);
        assert_eq!(stats.bandwidth_gbs(), None);
    }
}
oxigdal_gpu_advanced/profiling.rs

oxigdal_gpu_advanced/
profiling.rs