amari_gpu/
timeline.rs

//! GPU Timeline Analysis and Performance Profiling Infrastructure
//!
//! This module provides advanced GPU profiling capabilities with timeline analysis,
//! bottleneck detection, and multi-GPU performance optimization insights.
5
6use crate::{DeviceId, GpuDevice, UnifiedGpuResult};
7use std::collections::{HashMap, VecDeque};
8use std::sync::{Arc, Mutex};
9use std::time::{Duration, Instant};
10
11/// GPU timeline event representing a specific operation
12#[derive(Debug, Clone)]
13pub struct TimelineEvent {
14    pub event_id: String,
15    pub device_id: DeviceId,
16    pub operation_type: String,
17    pub start_time: Instant,
18    pub end_time: Option<Instant>,
19    pub gpu_timestamp_start: Option<u64>,
20    pub gpu_timestamp_end: Option<u64>,
21    pub memory_usage_mb: f32,
22    pub workgroup_config: (u32, u32, u32),
23    pub buffer_sizes: Vec<u64>,
24    pub metadata: HashMap<String, String>,
25}
26
27impl TimelineEvent {
28    /// Create a new timeline event
29    pub fn new(
30        event_id: String,
31        device_id: DeviceId,
32        operation_type: String,
33        memory_usage_mb: f32,
34        workgroup_config: (u32, u32, u32),
35        buffer_sizes: Vec<u64>,
36    ) -> Self {
37        Self {
38            event_id,
39            device_id,
40            operation_type,
41            start_time: Instant::now(),
42            end_time: None,
43            gpu_timestamp_start: None,
44            gpu_timestamp_end: None,
45            memory_usage_mb,
46            workgroup_config,
47            buffer_sizes,
48            metadata: HashMap::new(),
49        }
50    }
51
52    /// Mark the event as completed
53    pub fn complete(&mut self) {
54        self.end_time = Some(Instant::now());
55    }
56
57    /// Get the CPU duration of the event
58    pub fn cpu_duration(&self) -> Option<Duration> {
59        self.end_time.map(|end| end.duration_since(self.start_time))
60    }
61
62    /// Get the GPU duration of the event (if available)
63    pub fn gpu_duration_ns(&self) -> Option<u64> {
64        match (self.gpu_timestamp_start, self.gpu_timestamp_end) {
65            (Some(start), Some(end)) => Some(end - start),
66            _ => None,
67        }
68    }
69
70    /// Calculate memory bandwidth utilization
71    pub fn memory_bandwidth_gb_s(&self) -> f32 {
72        if let Some(duration) = self.cpu_duration() {
73            let total_bytes: u64 = self.buffer_sizes.iter().sum::<u64>() * 2; // Read + Write
74            let duration_s = duration.as_secs_f32();
75            if duration_s > 0.0 {
76                (total_bytes as f32) / duration_s / 1e9
77            } else {
78                0.0
79            }
80        } else {
81            0.0
82        }
83    }
84
85    /// Add metadata to the event
86    pub fn add_metadata(&mut self, key: String, value: String) {
87        self.metadata.insert(key, value);
88    }
89}
90
91/// Timeline analyzer for GPU performance analysis
92pub struct GpuTimelineAnalyzer {
93    events: Arc<Mutex<VecDeque<TimelineEvent>>>,
94    max_events: usize,
95    devices: Arc<Mutex<HashMap<DeviceId, Arc<GpuDevice>>>>,
96}
97
98impl GpuTimelineAnalyzer {
99    /// Create a new timeline analyzer
100    pub fn new(max_events: usize) -> Self {
101        Self {
102            events: Arc::new(Mutex::new(VecDeque::with_capacity(max_events))),
103            max_events,
104            devices: Arc::new(Mutex::new(HashMap::new())),
105        }
106    }
107
108    /// Add a device to track
109    pub fn add_device(&self, device: Arc<GpuDevice>) {
110        if let Ok(mut devices) = self.devices.lock() {
111            devices.insert(device.id, device);
112        }
113    }
114
115    /// Record a timeline event
116    pub fn record_event(&self, event: TimelineEvent) {
117        if let Ok(mut events) = self.events.lock() {
118            events.push_back(event);
119
120            // Keep only the most recent events
121            while events.len() > self.max_events {
122                events.pop_front();
123            }
124        }
125    }
126
127    /// Get all events in the specified time range
128    pub fn get_events_in_range(&self, start: Instant, end: Instant) -> Vec<TimelineEvent> {
129        if let Ok(events) = self.events.lock() {
130            events
131                .iter()
132                .filter(|event| event.start_time >= start && event.start_time <= end)
133                .cloned()
134                .collect()
135        } else {
136            Vec::new()
137        }
138    }
139
140    /// Get events for a specific device
141    pub fn get_device_events(
142        &self,
143        device_id: DeviceId,
144        limit: Option<usize>,
145    ) -> Vec<TimelineEvent> {
146        if let Ok(events) = self.events.lock() {
147            let mut device_events: Vec<_> = events
148                .iter()
149                .filter(|event| event.device_id == device_id)
150                .cloned()
151                .collect();
152
153            if let Some(limit) = limit {
154                device_events.truncate(limit);
155            }
156
157            device_events
158        } else {
159            Vec::new()
160        }
161    }
162
163    /// Analyze GPU utilization over time
164    pub fn analyze_gpu_utilization(&self, window_duration: Duration) -> UtilizationAnalysis {
165        let now = Instant::now();
166        let window_start = now - window_duration;
167
168        let events = self.get_events_in_range(window_start, now);
169        let mut device_utilization = HashMap::new();
170
171        // Group events by device
172        for event in events {
173            let device_events = device_utilization
174                .entry(event.device_id)
175                .or_insert_with(Vec::new);
176            device_events.push(event);
177        }
178
179        let mut device_stats = HashMap::new();
180
181        for (device_id, events) in device_utilization {
182            let total_duration: Duration =
183                events.iter().filter_map(|event| event.cpu_duration()).sum();
184
185            let utilization_percent =
186                (total_duration.as_secs_f32() / window_duration.as_secs_f32()) * 100.0;
187            let utilization_percent = utilization_percent.min(100.0); // Cap at 100%
188
189            let avg_memory_bandwidth = if !events.is_empty() {
190                events
191                    .iter()
192                    .map(|e| e.memory_bandwidth_gb_s())
193                    .sum::<f32>()
194                    / events.len() as f32
195            } else {
196                0.0
197            };
198
199            device_stats.insert(
200                device_id,
201                DeviceUtilizationStats {
202                    utilization_percent,
203                    operation_count: events.len(),
204                    avg_memory_bandwidth_gb_s: avg_memory_bandwidth,
205                    total_duration,
206                },
207            );
208        }
209
210        UtilizationAnalysis {
211            analysis_window: window_duration,
212            device_stats,
213            timestamp: now,
214        }
215    }
216
217    /// Detect performance bottlenecks
218    pub fn detect_bottlenecks(&self, analysis_window: Duration) -> BottleneckAnalysis {
219        let utilization = self.analyze_gpu_utilization(analysis_window);
220        let events = self.get_events_in_range(Instant::now() - analysis_window, Instant::now());
221
222        let mut bottlenecks = Vec::new();
223
224        // Analyze GPU utilization bottlenecks
225        for (device_id, stats) in &utilization.device_stats {
226            if stats.utilization_percent < 50.0 {
227                bottlenecks.push(PerformanceBottleneck::LowGpuUtilization {
228                    device_id: *device_id,
229                    utilization_percent: stats.utilization_percent,
230                    recommendation: "Consider increasing batch size or workload complexity"
231                        .to_string(),
232                });
233            }
234
235            if stats.avg_memory_bandwidth_gb_s < 100.0 {
236                // Assuming 100 GB/s baseline
237                bottlenecks.push(PerformanceBottleneck::MemoryBandwidthUnderutilized {
238                    device_id: *device_id,
239                    bandwidth_gb_s: stats.avg_memory_bandwidth_gb_s,
240                    recommendation: "Optimize memory access patterns or increase data parallelism"
241                        .to_string(),
242                });
243            }
244        }
245
246        // Analyze synchronization bottlenecks
247        let sync_analysis = self.analyze_synchronization_overhead(&events);
248        if sync_analysis.avg_sync_overhead_percent > 20.0 {
249            bottlenecks.push(PerformanceBottleneck::SynchronizationOverhead {
250                overhead_percent: sync_analysis.avg_sync_overhead_percent,
251                recommendation: "Reduce synchronization frequency or use asynchronous operations"
252                    .to_string(),
253            });
254        }
255
256        // Analyze workgroup efficiency
257        let workgroup_analysis = self.analyze_workgroup_efficiency(&events);
258        for (device_id, efficiency) in workgroup_analysis {
259            if efficiency < 70.0 {
260                bottlenecks.push(PerformanceBottleneck::InefficientWorkgroups {
261                    device_id,
262                    efficiency_percent: efficiency,
263                    recommendation: "Optimize workgroup size or shared memory usage".to_string(),
264                });
265            }
266        }
267
268        let recommendations = self.generate_optimization_recommendations(&bottlenecks);
269
270        BottleneckAnalysis {
271            analysis_window,
272            bottlenecks,
273            recommendations,
274            timestamp: Instant::now(),
275        }
276    }
277
278    /// Analyze synchronization overhead
279    fn analyze_synchronization_overhead(
280        &self,
281        events: &[TimelineEvent],
282    ) -> SynchronizationAnalysis {
283        let mut total_operation_time = Duration::ZERO;
284        let mut total_sync_time = Duration::ZERO;
285
286        // Group events by operation type to identify synchronization patterns
287        let mut operation_groups = HashMap::new();
288        for event in events {
289            let group = operation_groups
290                .entry(&event.operation_type)
291                .or_insert_with(Vec::new);
292            group.push(event);
293        }
294
295        // Estimate synchronization overhead by looking at gaps between operations
296        for (_op_type, group_events) in operation_groups {
297            for window in group_events.windows(2) {
298                if let [event1, event2] = window {
299                    if let (Some(end1), start2) = (event1.end_time, event2.start_time) {
300                        if event1.device_id != event2.device_id {
301                            // Cross-device gap indicates potential synchronization
302                            let gap = start2.duration_since(end1);
303                            total_sync_time += gap;
304                        }
305                        if let Some(duration1) = event1.cpu_duration() {
306                            total_operation_time += duration1;
307                        }
308                    }
309                }
310            }
311        }
312
313        let sync_overhead_percent = if total_operation_time.as_nanos() > 0 {
314            (total_sync_time.as_nanos() as f32 / total_operation_time.as_nanos() as f32) * 100.0
315        } else {
316            0.0
317        };
318
319        SynchronizationAnalysis {
320            total_sync_time,
321            total_operation_time,
322            avg_sync_overhead_percent: sync_overhead_percent,
323            cross_device_operations: events.len(),
324        }
325    }
326
327    /// Analyze workgroup efficiency
328    fn analyze_workgroup_efficiency(&self, events: &[TimelineEvent]) -> HashMap<DeviceId, f32> {
329        let mut device_efficiency = HashMap::new();
330
331        for event in events {
332            if let Some(_duration) = event.cpu_duration() {
333                let (x, y, z) = event.workgroup_config;
334                let total_threads = x * y * z;
335
336                // Estimate efficiency based on workgroup utilization
337                // This is a simplified heuristic - in practice, you'd use GPU profiling data
338                let theoretical_max_threads = 1024; // Common maximum
339                let utilization = (total_threads as f32 / theoretical_max_threads as f32).min(1.0);
340
341                // Factor in memory bandwidth and duration
342                let memory_efficiency = (event.memory_bandwidth_gb_s() / 500.0).min(1.0); // Normalize to 500 GB/s
343
344                let efficiency = (utilization * 0.6 + memory_efficiency * 0.4) * 100.0;
345
346                let current_efficiency = device_efficiency.entry(event.device_id).or_insert(0.0);
347                *current_efficiency = (*current_efficiency + efficiency) / 2.0; // Running average
348            }
349        }
350
351        device_efficiency
352    }
353
354    /// Generate optimization recommendations
355    fn generate_optimization_recommendations(
356        &self,
357        bottlenecks: &[PerformanceBottleneck],
358    ) -> Vec<OptimizationRecommendation> {
359        let mut recommendations = Vec::new();
360
361        // Count bottleneck types
362        let mut low_utilization_count = 0;
363        let mut memory_issues = 0;
364        let mut sync_issues = 0;
365        let mut workgroup_issues = 0;
366
367        for bottleneck in bottlenecks {
368            match bottleneck {
369                PerformanceBottleneck::LowGpuUtilization { .. } => low_utilization_count += 1,
370                PerformanceBottleneck::MemoryBandwidthUnderutilized { .. } => memory_issues += 1,
371                PerformanceBottleneck::SynchronizationOverhead { .. } => sync_issues += 1,
372                PerformanceBottleneck::InefficientWorkgroups { .. } => workgroup_issues += 1,
373            }
374        }
375
376        // Generate high-level recommendations
377        if low_utilization_count > 0 {
378            recommendations.push(OptimizationRecommendation {
379                priority: RecommendationPriority::High,
380                category: "GPU Utilization".to_string(),
381                description: "Multiple devices showing low utilization".to_string(),
382                action: "Consider increasing batch sizes or enabling more parallel operations"
383                    .to_string(),
384                estimated_improvement: format!("{}% performance gain", low_utilization_count * 15),
385            });
386        }
387
388        if memory_issues > 0 {
389            recommendations.push(OptimizationRecommendation {
390                priority: RecommendationPriority::Medium,
391                category: "Memory Optimization".to_string(),
392                description: "Memory bandwidth underutilized".to_string(),
393                action: "Optimize data layouts and reduce memory transfer overhead".to_string(),
394                estimated_improvement: "10-25% performance gain".to_string(),
395            });
396        }
397
398        if sync_issues > 0 {
399            recommendations.push(OptimizationRecommendation {
400                priority: RecommendationPriority::High,
401                category: "Synchronization".to_string(),
402                description: "High synchronization overhead detected".to_string(),
403                action: "Implement asynchronous operations and reduce cross-device dependencies"
404                    .to_string(),
405                estimated_improvement: "20-40% performance gain".to_string(),
406            });
407        }
408
409        if workgroup_issues > 0 {
410            recommendations.push(OptimizationRecommendation {
411                priority: RecommendationPriority::Low,
412                category: "Workgroup Configuration".to_string(),
413                description: "Suboptimal workgroup configurations".to_string(),
414                action: "Tune workgroup sizes and shared memory usage".to_string(),
415                estimated_improvement: "5-15% performance gain".to_string(),
416            });
417        }
418
419        recommendations
420    }
421}
422
/// Per-device utilization figures for one analysis window.
#[derive(Clone, Debug)]
pub struct DeviceUtilizationStats {
    /// Utilization of the device over the window, as a percentage.
    pub utilization_percent: f32,
    /// Number of events attributed to the device.
    pub operation_count: usize,
    /// Average memory bandwidth across those events, in GB/s.
    pub avg_memory_bandwidth_gb_s: f32,
    /// Accumulated duration of those events.
    pub total_duration: Duration,
}
431
432/// GPU utilization analysis result
433#[derive(Debug, Clone)]
434pub struct UtilizationAnalysis {
435    pub analysis_window: Duration,
436    pub device_stats: HashMap<DeviceId, DeviceUtilizationStats>,
437    pub timestamp: Instant,
438}
439
/// Result of a synchronization overhead analysis.
#[derive(Clone, Debug)]
pub struct SynchronizationAnalysis {
    /// Time attributed to synchronization.
    pub total_sync_time: Duration,
    /// Operation time the synchronization time is compared against.
    pub total_operation_time: Duration,
    /// Synchronization time relative to operation time, as a percentage.
    pub avg_sync_overhead_percent: f32,
    /// Cross-device operation count reported by the analysis.
    pub cross_device_operations: usize,
}
448
449/// Performance bottleneck types
450#[derive(Debug, Clone)]
451pub enum PerformanceBottleneck {
452    LowGpuUtilization {
453        device_id: DeviceId,
454        utilization_percent: f32,
455        recommendation: String,
456    },
457    MemoryBandwidthUnderutilized {
458        device_id: DeviceId,
459        bandwidth_gb_s: f32,
460        recommendation: String,
461    },
462    SynchronizationOverhead {
463        overhead_percent: f32,
464        recommendation: String,
465    },
466    InefficientWorkgroups {
467        device_id: DeviceId,
468        efficiency_percent: f32,
469        recommendation: String,
470    },
471}
472
473/// Bottleneck analysis result
474#[derive(Debug, Clone)]
475pub struct BottleneckAnalysis {
476    pub analysis_window: Duration,
477    pub bottlenecks: Vec<PerformanceBottleneck>,
478    pub recommendations: Vec<OptimizationRecommendation>,
479    pub timestamp: Instant,
480}
481
/// Optimization recommendation priority.
///
/// Variants are declared from least to most urgent, so the derived ordering satisfies
/// `Low < Medium < High < Critical` and recommendations can be sorted or compared by
/// priority.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum RecommendationPriority {
    /// Worth doing eventually.
    Low,
    /// Noticeable benefit.
    Medium,
    /// Should be addressed soon.
    High,
    /// Most urgent level.
    Critical,
}
490
491/// Optimization recommendation
492#[derive(Debug, Clone)]
493pub struct OptimizationRecommendation {
494    pub priority: RecommendationPriority,
495    pub category: String,
496    pub description: String,
497    pub action: String,
498    pub estimated_improvement: String,
499}
500
501/// Multi-GPU performance monitor
502pub struct MultiGpuPerformanceMonitor {
503    timeline_analyzer: GpuTimelineAnalyzer,
504    monitoring_enabled: bool,
505    analysis_interval: Duration,
506    last_analysis: Instant,
507}
508
509impl MultiGpuPerformanceMonitor {
510    /// Create a new performance monitor
511    pub fn new(max_events: usize, analysis_interval: Duration) -> Self {
512        Self {
513            timeline_analyzer: GpuTimelineAnalyzer::new(max_events),
514            monitoring_enabled: true,
515            analysis_interval,
516            last_analysis: Instant::now(),
517        }
518    }
519
520    /// Add a device to monitor
521    pub fn add_device(&self, device: Arc<GpuDevice>) {
522        self.timeline_analyzer.add_device(device);
523    }
524
525    /// Start monitoring an operation
526    pub fn start_operation(
527        &self,
528        operation_id: String,
529        device_id: DeviceId,
530        operation_type: String,
531        memory_usage_mb: f32,
532        workgroup_config: (u32, u32, u32),
533        buffer_sizes: Vec<u64>,
534    ) -> OperationHandle<'_> {
535        let event = TimelineEvent::new(
536            operation_id.clone(),
537            device_id,
538            operation_type,
539            memory_usage_mb,
540            workgroup_config,
541            buffer_sizes,
542        );
543
544        OperationHandle {
545            event,
546            monitor: self,
547        }
548    }
549
550    /// Complete an operation
551    fn complete_operation(&self, mut event: TimelineEvent) {
552        if self.monitoring_enabled {
553            event.complete();
554            self.timeline_analyzer.record_event(event);
555        }
556    }
557
558    /// Get performance analysis
559    pub fn get_performance_analysis(
560        &self,
561        window_duration: Duration,
562    ) -> UnifiedGpuResult<PerformanceAnalysisReport> {
563        let utilization = self
564            .timeline_analyzer
565            .analyze_gpu_utilization(window_duration);
566        let bottlenecks = self.timeline_analyzer.detect_bottlenecks(window_duration);
567
568        Ok(PerformanceAnalysisReport {
569            utilization_analysis: utilization,
570            bottleneck_analysis: bottlenecks,
571            timestamp: Instant::now(),
572        })
573    }
574
575    /// Enable or disable monitoring
576    pub fn set_monitoring_enabled(&mut self, enabled: bool) {
577        self.monitoring_enabled = enabled;
578    }
579
580    /// Check if automatic analysis should be performed
581    pub fn should_perform_analysis(&self) -> bool {
582        self.monitoring_enabled && self.last_analysis.elapsed() >= self.analysis_interval
583    }
584}
585
586/// Handle for tracking an operation
587pub struct OperationHandle<'a> {
588    event: TimelineEvent,
589    monitor: &'a MultiGpuPerformanceMonitor,
590}
591
592impl<'a> OperationHandle<'a> {
593    /// Add metadata to the operation
594    pub fn add_metadata(&mut self, key: String, value: String) {
595        self.event.add_metadata(key, value);
596    }
597
598    /// Set GPU timestamps
599    pub fn set_gpu_timestamps(&mut self, start: u64, end: u64) {
600        self.event.gpu_timestamp_start = Some(start);
601        self.event.gpu_timestamp_end = Some(end);
602    }
603}
604
605impl<'a> Drop for OperationHandle<'a> {
606    fn drop(&mut self) {
607        // Automatically complete the operation when the handle is dropped
608        let event = std::mem::replace(
609            &mut self.event,
610            TimelineEvent::new(
611                "dropped".to_string(),
612                crate::DeviceId(0),
613                "dropped".to_string(),
614                0.0,
615                (1, 1, 1),
616                vec![],
617            ),
618        );
619        self.monitor.complete_operation(event);
620    }
621}
622
623/// Combined performance analysis report
624#[derive(Debug, Clone)]
625pub struct PerformanceAnalysisReport {
626    pub utilization_analysis: UtilizationAnalysis,
627    pub bottleneck_analysis: BottleneckAnalysis,
628    pub timestamp: Instant,
629}
630
631impl PerformanceAnalysisReport {
632    /// Get overall performance score (0-100)
633    pub fn overall_performance_score(&self) -> f32 {
634        let avg_utilization = if !self.utilization_analysis.device_stats.is_empty() {
635            self.utilization_analysis
636                .device_stats
637                .values()
638                .map(|stats| stats.utilization_percent)
639                .sum::<f32>()
640                / self.utilization_analysis.device_stats.len() as f32
641        } else {
642            0.0
643        };
644
645        // Penalize for bottlenecks
646        let bottleneck_penalty = self.bottleneck_analysis.bottlenecks.len() as f32 * 5.0;
647        let score = avg_utilization - bottleneck_penalty;
648        score.clamp(0.0, 100.0)
649    }
650
651    /// Get summary statistics
652    pub fn get_summary(&self) -> PerformanceSummary {
653        let total_devices = self.utilization_analysis.device_stats.len();
654        let high_priority_issues = self
655            .bottleneck_analysis
656            .recommendations
657            .iter()
658            .filter(|rec| {
659                matches!(
660                    rec.priority,
661                    RecommendationPriority::High | RecommendationPriority::Critical
662                )
663            })
664            .count();
665
666        PerformanceSummary {
667            overall_score: self.overall_performance_score(),
668            total_devices,
669            active_bottlenecks: self.bottleneck_analysis.bottlenecks.len(),
670            high_priority_recommendations: high_priority_issues,
671            analysis_window: self.utilization_analysis.analysis_window,
672        }
673    }
674}
675
/// Condensed view of a performance analysis report.
#[derive(Clone, Debug)]
pub struct PerformanceSummary {
    /// Overall performance score.
    pub overall_score: f32,
    /// Number of devices that have utilization statistics.
    pub total_devices: usize,
    /// Number of detected bottlenecks.
    pub active_bottlenecks: usize,
    /// Number of recommendations with high or critical priority.
    pub high_priority_recommendations: usize,
    /// Length of the analyzed window.
    pub analysis_window: Duration,
}
685
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created event carries the supplied values and is still open.
    #[test]
    fn test_timeline_event_creation() {
        let event = TimelineEvent::new(
            "test_event".to_string(),
            crate::DeviceId(0),
            "test_operation".to_string(),
            100.0,
            (64, 1, 1),
            vec![1024, 2048],
        );

        assert_eq!(event.event_id, "test_event");
        assert_eq!(event.device_id, crate::DeviceId(0));
        assert_eq!(event.operation_type, "test_operation");
        assert_eq!(event.memory_usage_mb, 100.0);
        assert!(event.end_time.is_none());
        assert!(event.cpu_duration().is_none());
    }

    /// Recorded events can be retrieved per device.
    #[test]
    fn test_timeline_analyzer() {
        let analyzer = GpuTimelineAnalyzer::new(100);

        let mut event = TimelineEvent::new(
            "test".to_string(),
            crate::DeviceId(0),
            "matrix_multiply".to_string(),
            50.0,
            (16, 16, 1),
            vec![1024],
        );

        // No sleep needed: nothing below depends on the event having a non-zero duration.
        event.complete();
        assert!(event.cpu_duration().is_some());

        analyzer.record_event(event);

        let events = analyzer.get_device_events(crate::DeviceId(0), None);
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].event_id, "test");
        assert!(analyzer
            .get_device_events(crate::DeviceId(1), None)
            .is_empty());
    }

    /// Dropping an operation handle records the operation, and the analysis sees it.
    #[test]
    fn test_performance_monitor() {
        let monitor = MultiGpuPerformanceMonitor::new(100, Duration::from_secs(1));

        let handle = monitor.start_operation(
            "test_op".to_string(),
            crate::DeviceId(0),
            "test".to_string(),
            10.0,
            (64, 1, 1),
            vec![512],
        );

        // Drop explicitly: a named binding would otherwise stay alive until the end of the
        // test, i.e. until after the analysis below has already run.
        drop(handle);

        let analysis = monitor.get_performance_analysis(Duration::from_secs(1));
        assert!(analysis.is_ok());

        if let Ok(report) = analysis {
            let operation_count = report
                .utilization_analysis
                .device_stats
                .get(&crate::DeviceId(0))
                .map(|stats| stats.operation_count);
            assert_eq!(operation_count, Some(1));
        }
    }
}