use crate::{DeviceId, GpuDevice, UnifiedGpuResult};
use std::collections::{HashMap, VecDeque};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

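/// A single profiled GPU operation: CPU-side timing, optional GPU timestamps,
/// and the launch configuration and buffers that produced it.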
#[derive(Debug, Clone)]
pub struct TimelineEvent {
    pub event_id: String,
    pub device_id: DeviceId,
    pub operation_type: String,
    pub start_time: Instant,
    pub end_time: Option<Instant>,
    pub gpu_timestamp_start: Option<u64>,
    pub gpu_timestamp_end: Option<u64>,
    pub memory_usage_mb: f32,
    pub workgroup_config: (u32, u32, u32),
    pub buffer_sizes: Vec<u64>,
    pub metadata: HashMap<String, String>,
}

impl TimelineEvent {
    pub fn new(
        event_id: String,
        device_id: DeviceId,
        operation_type: String,
        memory_usage_mb: f32,
        workgroup_config: (u32, u32, u32),
        buffer_sizes: Vec<u64>,
    ) -> Self {
        Self {
            event_id,
            device_id,
            operation_type,
            start_time: Instant::now(),
            end_time: None,
            gpu_timestamp_start: None,
            gpu_timestamp_end: None,
            memory_usage_mb,
            workgroup_config,
            buffer_sizes,
            metadata: HashMap::new(),
        }
    }

    pub fn complete(&mut self) {
        self.end_time = Some(Instant::now());
    }

    pub fn cpu_duration(&self) -> Option<Duration> {
        self.end_time.map(|end| end.duration_since(self.start_time))
    }

    pub fn gpu_duration_ns(&self) -> Option<u64> {
        match (self.gpu_timestamp_start, self.gpu_timestamp_end) {
            (Some(start), Some(end)) => Some(end - start),
            _ => None,
        }
    }

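    /// Estimated effective bandwidth in GB/s, treating every buffer as both
    /// read and written once (hence the factor of two in the byte count).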
    pub fn memory_bandwidth_gb_s(&self) -> f32 {
        if let Some(duration) = self.cpu_duration() {
            let total_bytes: u64 = self.buffer_sizes.iter().sum::<u64>() * 2;
            let duration_s = duration.as_secs_f32();
            if duration_s > 0.0 {
                (total_bytes as f32) / duration_s / 1e9
            } else {
                0.0
            }
        } else {
            0.0
        }
    }

    pub fn add_metadata(&mut self, key: String, value: String) {
        self.metadata.insert(key, value);
    }
}

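/// Thread-safe, bounded timeline of [`TimelineEvent`]s across devices, with
/// utilization and bottleneck analysis derived from the recorded events.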
pub struct GpuTimelineAnalyzer {
    events: Arc<Mutex<VecDeque<TimelineEvent>>>,
    max_events: usize,
    devices: Arc<Mutex<HashMap<DeviceId, Arc<GpuDevice>>>>,
}

impl GpuTimelineAnalyzer {
    pub fn new(max_events: usize) -> Self {
        Self {
            events: Arc::new(Mutex::new(VecDeque::with_capacity(max_events))),
            max_events,
            devices: Arc::new(Mutex::new(HashMap::new())),
        }
    }

    pub fn add_device(&self, device: Arc<GpuDevice>) {
        if let Ok(mut devices) = self.devices.lock() {
            devices.insert(device.id, device);
        }
    }

    pub fn record_event(&self, event: TimelineEvent) {
        if let Ok(mut events) = self.events.lock() {
            events.push_back(event);

            while events.len() > self.max_events {
                events.pop_front();
            }
        }
    }

    pub fn get_events_in_range(&self, start: Instant, end: Instant) -> Vec<TimelineEvent> {
        if let Ok(events) = self.events.lock() {
            events
                .iter()
                .filter(|event| event.start_time >= start && event.start_time <= end)
                .cloned()
                .collect()
        } else {
            Vec::new()
        }
    }

    pub fn get_device_events(
        &self,
        device_id: DeviceId,
        limit: Option<usize>,
    ) -> Vec<TimelineEvent> {
        if let Ok(events) = self.events.lock() {
            let mut device_events: Vec<_> = events
                .iter()
                .filter(|event| event.device_id == device_id)
                .cloned()
                .collect();

            if let Some(limit) = limit {
                device_events.truncate(limit);
            }

            device_events
        } else {
            Vec::new()
        }
    }

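    /// Per-device utilization over the trailing `window_duration`: the sum of
    /// completed CPU-side event durations divided by the window length (capped
    /// at 100%), alongside the mean estimated memory bandwidth.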
    pub fn analyze_gpu_utilization(&self, window_duration: Duration) -> UtilizationAnalysis {
        let now = Instant::now();
        let window_start = now - window_duration;

        let events = self.get_events_in_range(window_start, now);
        let mut device_utilization = HashMap::new();

        for event in events {
            let device_events = device_utilization
                .entry(event.device_id)
                .or_insert_with(Vec::new);
            device_events.push(event);
        }

        let mut device_stats = HashMap::new();

        for (device_id, events) in device_utilization {
            let total_duration: Duration =
                events.iter().filter_map(|event| event.cpu_duration()).sum();

            let utilization_percent =
                (total_duration.as_secs_f32() / window_duration.as_secs_f32()) * 100.0;
            let utilization_percent = utilization_percent.min(100.0);

            let avg_memory_bandwidth = if !events.is_empty() {
                events
                    .iter()
                    .map(|e| e.memory_bandwidth_gb_s())
                    .sum::<f32>()
                    / events.len() as f32
            } else {
                0.0
            };

            device_stats.insert(
                device_id,
                DeviceUtilizationStats {
                    utilization_percent,
                    operation_count: events.len(),
                    avg_memory_bandwidth_gb_s: avg_memory_bandwidth,
                    total_duration,
                },
            );
        }

        UtilizationAnalysis {
            analysis_window: window_duration,
            device_stats,
            timestamp: now,
        }
    }

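    /// Flags likely bottlenecks within `analysis_window` using fixed heuristic
    /// thresholds: device utilization below 50%, average memory bandwidth below
    /// 100 GB/s, synchronization overhead above 20%, and workgroup efficiency
    /// below 70%.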
    pub fn detect_bottlenecks(&self, analysis_window: Duration) -> BottleneckAnalysis {
        let utilization = self.analyze_gpu_utilization(analysis_window);
        let events = self.get_events_in_range(Instant::now() - analysis_window, Instant::now());

        let mut bottlenecks = Vec::new();

        for (device_id, stats) in &utilization.device_stats {
            if stats.utilization_percent < 50.0 {
                bottlenecks.push(PerformanceBottleneck::LowGpuUtilization {
                    device_id: *device_id,
                    utilization_percent: stats.utilization_percent,
                    recommendation: "Consider increasing batch size or workload complexity"
                        .to_string(),
                });
            }

            if stats.avg_memory_bandwidth_gb_s < 100.0 {
                bottlenecks.push(PerformanceBottleneck::MemoryBandwidthUnderutilized {
                    device_id: *device_id,
                    bandwidth_gb_s: stats.avg_memory_bandwidth_gb_s,
                    recommendation: "Optimize memory access patterns or increase data parallelism"
                        .to_string(),
                });
            }
        }

        let sync_analysis = self.analyze_synchronization_overhead(&events);
        if sync_analysis.avg_sync_overhead_percent > 20.0 {
            bottlenecks.push(PerformanceBottleneck::SynchronizationOverhead {
                overhead_percent: sync_analysis.avg_sync_overhead_percent,
                recommendation: "Reduce synchronization frequency or use asynchronous operations"
                    .to_string(),
            });
        }

        let workgroup_analysis = self.analyze_workgroup_efficiency(&events);
        for (device_id, efficiency) in workgroup_analysis {
            if efficiency < 70.0 {
                bottlenecks.push(PerformanceBottleneck::InefficientWorkgroups {
                    device_id,
                    efficiency_percent: efficiency,
                    recommendation: "Optimize workgroup size or shared memory usage".to_string(),
                });
            }
        }

        let recommendations = self.generate_optimization_recommendations(&bottlenecks);

        BottleneckAnalysis {
            analysis_window,
            bottlenecks,
            recommendations,
            timestamp: Instant::now(),
        }
    }

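    /// Approximates synchronization overhead as the idle gaps between
    /// consecutive same-type events that ran on different devices, expressed
    /// as a percentage of total measured operation time.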
    fn analyze_synchronization_overhead(
        &self,
        events: &[TimelineEvent],
    ) -> SynchronizationAnalysis {
        let mut total_operation_time = Duration::ZERO;
        let mut total_sync_time = Duration::ZERO;
        let mut cross_device_operations = 0;

        // Group events by operation type so gaps are measured between related work.
        let mut operation_groups = HashMap::new();
        for event in events {
            let group = operation_groups
                .entry(&event.operation_type)
                .or_insert_with(Vec::new);
            group.push(event);
        }

        for (_op_type, group_events) in operation_groups {
            for window in group_events.windows(2) {
                if let [event1, event2] = window {
                    if let Some(end1) = event1.end_time {
                        let start2 = event2.start_time;
                        // Only gaps that cross a device boundary count as sync overhead.
                        if event1.device_id != event2.device_id {
                            let gap = start2.duration_since(end1);
                            total_sync_time += gap;
                            cross_device_operations += 1;
                        }
                        if let Some(duration1) = event1.cpu_duration() {
                            total_operation_time += duration1;
                        }
                    }
                }
            }
        }

        let sync_overhead_percent = if total_operation_time.as_nanos() > 0 {
            (total_sync_time.as_nanos() as f32 / total_operation_time.as_nanos() as f32) * 100.0
        } else {
            0.0
        };

        SynchronizationAnalysis {
            total_sync_time,
            total_operation_time,
            avg_sync_overhead_percent: sync_overhead_percent,
            cross_device_operations,
        }
    }

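    /// Heuristic per-device workgroup efficiency: a weighted blend of thread
    /// occupancy (against an assumed 1024-thread workgroup limit) and memory
    /// bandwidth (against an assumed 500 GB/s reference), as a percentage.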
    fn analyze_workgroup_efficiency(&self, events: &[TimelineEvent]) -> HashMap<DeviceId, f32> {
        let mut device_efficiency = HashMap::new();

        for event in events {
            if let Some(_duration) = event.cpu_duration() {
                let (x, y, z) = event.workgroup_config;
                let total_threads = x * y * z;

                // Assume a 1024-thread upper bound per workgroup.
                let theoretical_max_threads = 1024;
                let utilization = (total_threads as f32 / theoretical_max_threads as f32).min(1.0);

                // Normalize against a 500 GB/s reference bandwidth.
                let memory_efficiency = (event.memory_bandwidth_gb_s() / 500.0).min(1.0);
                let efficiency = (utilization * 0.6 + memory_efficiency * 0.4) * 100.0;

                // Running average per device; the first sample seeds the value.
                device_efficiency
                    .entry(event.device_id)
                    .and_modify(|current| *current = (*current + efficiency) / 2.0)
                    .or_insert(efficiency);
            }
        }

        device_efficiency
    }

    fn generate_optimization_recommendations(
        &self,
        bottlenecks: &[PerformanceBottleneck],
    ) -> Vec<OptimizationRecommendation> {
        let mut recommendations = Vec::new();

        let mut low_utilization_count = 0;
        let mut memory_issues = 0;
        let mut sync_issues = 0;
        let mut workgroup_issues = 0;

        for bottleneck in bottlenecks {
            match bottleneck {
                PerformanceBottleneck::LowGpuUtilization { .. } => low_utilization_count += 1,
                PerformanceBottleneck::MemoryBandwidthUnderutilized { .. } => memory_issues += 1,
                PerformanceBottleneck::SynchronizationOverhead { .. } => sync_issues += 1,
                PerformanceBottleneck::InefficientWorkgroups { .. } => workgroup_issues += 1,
            }
        }

        if low_utilization_count > 0 {
            recommendations.push(OptimizationRecommendation {
                priority: RecommendationPriority::High,
                category: "GPU Utilization".to_string(),
                description: "One or more devices showing low utilization".to_string(),
                action: "Consider increasing batch sizes or enabling more parallel operations"
                    .to_string(),
                estimated_improvement: format!("{}% performance gain", low_utilization_count * 15),
            });
        }

        if memory_issues > 0 {
            recommendations.push(OptimizationRecommendation {
                priority: RecommendationPriority::Medium,
                category: "Memory Optimization".to_string(),
                description: "Memory bandwidth underutilized".to_string(),
                action: "Optimize data layouts and reduce memory transfer overhead".to_string(),
                estimated_improvement: "10-25% performance gain".to_string(),
            });
        }

        if sync_issues > 0 {
            recommendations.push(OptimizationRecommendation {
                priority: RecommendationPriority::High,
                category: "Synchronization".to_string(),
                description: "High synchronization overhead detected".to_string(),
                action: "Implement asynchronous operations and reduce cross-device dependencies"
                    .to_string(),
                estimated_improvement: "20-40% performance gain".to_string(),
            });
        }

        if workgroup_issues > 0 {
            recommendations.push(OptimizationRecommendation {
                priority: RecommendationPriority::Low,
                category: "Workgroup Configuration".to_string(),
                description: "Suboptimal workgroup configurations".to_string(),
                action: "Tune workgroup sizes and shared memory usage".to_string(),
                estimated_improvement: "5-15% performance gain".to_string(),
            });
        }

        recommendations
    }
}

#[derive(Debug, Clone)]
pub struct DeviceUtilizationStats {
    pub utilization_percent: f32,
    pub operation_count: usize,
    pub avg_memory_bandwidth_gb_s: f32,
    pub total_duration: Duration,
}

#[derive(Debug, Clone)]
pub struct UtilizationAnalysis {
    pub analysis_window: Duration,
    pub device_stats: HashMap<DeviceId, DeviceUtilizationStats>,
    pub timestamp: Instant,
}

#[derive(Debug, Clone)]
pub struct SynchronizationAnalysis {
    pub total_sync_time: Duration,
    pub total_operation_time: Duration,
    pub avg_sync_overhead_percent: f32,
    pub cross_device_operations: usize,
}

#[derive(Debug, Clone)]
pub enum PerformanceBottleneck {
    LowGpuUtilization {
        device_id: DeviceId,
        utilization_percent: f32,
        recommendation: String,
    },
    MemoryBandwidthUnderutilized {
        device_id: DeviceId,
        bandwidth_gb_s: f32,
        recommendation: String,
    },
    SynchronizationOverhead {
        overhead_percent: f32,
        recommendation: String,
    },
    InefficientWorkgroups {
        device_id: DeviceId,
        efficiency_percent: f32,
        recommendation: String,
    },
}

#[derive(Debug, Clone)]
pub struct BottleneckAnalysis {
    pub analysis_window: Duration,
    pub bottlenecks: Vec<PerformanceBottleneck>,
    pub recommendations: Vec<OptimizationRecommendation>,
    pub timestamp: Instant,
}

#[derive(Debug, Clone)]
pub enum RecommendationPriority {
    Low,
    Medium,
    High,
    Critical,
}

#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    pub priority: RecommendationPriority,
    pub category: String,
    pub description: String,
    pub action: String,
    pub estimated_improvement: String,
}

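/// Front end that owns a [`GpuTimelineAnalyzer`] and hands out RAII
/// [`OperationHandle`]s, so an operation is recorded automatically when the
/// handle goes out of scope.
///
/// A minimal usage sketch; the identifiers, sizes, and intervals below are
/// illustrative values, not constants defined elsewhere in this crate:
///
/// ```ignore
/// let monitor = MultiGpuPerformanceMonitor::new(10_000, Duration::from_secs(5));
/// {
///     let mut op = monitor.start_operation(
///         "matmul_0".to_string(),
///         DeviceId(0),
///         "matrix_multiply".to_string(),
///         128.0,          // approximate memory usage in MB
///         (16, 16, 1),    // workgroup configuration
///         vec![1 << 20],  // buffer sizes in bytes
///     );
///     op.add_metadata("kernel".to_string(), "tiled".to_string());
/// } // dropping `op` completes and records the event
/// let report = monitor.get_performance_analysis(Duration::from_secs(5));
/// ```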
pub struct MultiGpuPerformanceMonitor {
    timeline_analyzer: GpuTimelineAnalyzer,
    monitoring_enabled: bool,
    analysis_interval: Duration,
    last_analysis: Instant,
}

impl MultiGpuPerformanceMonitor {
    pub fn new(max_events: usize, analysis_interval: Duration) -> Self {
        Self {
            timeline_analyzer: GpuTimelineAnalyzer::new(max_events),
            monitoring_enabled: true,
            analysis_interval,
            last_analysis: Instant::now(),
        }
    }

    pub fn add_device(&self, device: Arc<GpuDevice>) {
        self.timeline_analyzer.add_device(device);
    }

    pub fn start_operation(
        &self,
        operation_id: String,
        device_id: DeviceId,
        operation_type: String,
        memory_usage_mb: f32,
        workgroup_config: (u32, u32, u32),
        buffer_sizes: Vec<u64>,
    ) -> OperationHandle<'_> {
        let event = TimelineEvent::new(
            operation_id,
            device_id,
            operation_type,
            memory_usage_mb,
            workgroup_config,
            buffer_sizes,
        );

        OperationHandle {
            event,
            monitor: self,
        }
    }

    fn complete_operation(&self, mut event: TimelineEvent) {
        if self.monitoring_enabled {
            event.complete();
            self.timeline_analyzer.record_event(event);
        }
    }

    pub fn get_performance_analysis(
        &self,
        window_duration: Duration,
    ) -> UnifiedGpuResult<PerformanceAnalysisReport> {
        let utilization = self
            .timeline_analyzer
            .analyze_gpu_utilization(window_duration);
        let bottlenecks = self.timeline_analyzer.detect_bottlenecks(window_duration);

        Ok(PerformanceAnalysisReport {
            utilization_analysis: utilization,
            bottleneck_analysis: bottlenecks,
            timestamp: Instant::now(),
        })
    }

    pub fn set_monitoring_enabled(&mut self, enabled: bool) {
        self.monitoring_enabled = enabled;
    }

    pub fn should_perform_analysis(&self) -> bool {
        self.monitoring_enabled && self.last_analysis.elapsed() >= self.analysis_interval
    }
}

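/// RAII guard for an in-flight operation: dropping the handle completes the
/// event and records it on the owning monitor's timeline.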
pub struct OperationHandle<'a> {
    event: TimelineEvent,
    monitor: &'a MultiGpuPerformanceMonitor,
}

impl<'a> OperationHandle<'a> {
    pub fn add_metadata(&mut self, key: String, value: String) {
        self.event.add_metadata(key, value);
    }

    pub fn set_gpu_timestamps(&mut self, start: u64, end: u64) {
        self.event.gpu_timestamp_start = Some(start);
        self.event.gpu_timestamp_end = Some(end);
    }
}

impl<'a> Drop for OperationHandle<'a> {
    fn drop(&mut self) {
        // Swap in a throwaway placeholder so the real event can be moved out
        // of `self` and handed to the monitor.
        let event = std::mem::replace(
            &mut self.event,
            TimelineEvent::new(
                "dropped".to_string(),
                crate::DeviceId(0),
                "dropped".to_string(),
                0.0,
                (1, 1, 1),
                vec![],
            ),
        );
        self.monitor.complete_operation(event);
    }
}

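/// Combined utilization and bottleneck analysis for one reporting window.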
#[derive(Debug, Clone)]
pub struct PerformanceAnalysisReport {
    pub utilization_analysis: UtilizationAnalysis,
    pub bottleneck_analysis: BottleneckAnalysis,
    pub timestamp: Instant,
}

impl PerformanceAnalysisReport {
    pub fn overall_performance_score(&self) -> f32 {
        let avg_utilization = if !self.utilization_analysis.device_stats.is_empty() {
            self.utilization_analysis
                .device_stats
                .values()
                .map(|stats| stats.utilization_percent)
                .sum::<f32>()
                / self.utilization_analysis.device_stats.len() as f32
        } else {
            0.0
        };

        // Penalize the average utilization by five points per detected bottleneck.
        let bottleneck_penalty = self.bottleneck_analysis.bottlenecks.len() as f32 * 5.0;
        let score = avg_utilization - bottleneck_penalty;
        score.clamp(0.0, 100.0)
    }

    pub fn get_summary(&self) -> PerformanceSummary {
        let total_devices = self.utilization_analysis.device_stats.len();
        let high_priority_issues = self
            .bottleneck_analysis
            .recommendations
            .iter()
            .filter(|rec| {
                matches!(
                    rec.priority,
                    RecommendationPriority::High | RecommendationPriority::Critical
                )
            })
            .count();

        PerformanceSummary {
            overall_score: self.overall_performance_score(),
            total_devices,
            active_bottlenecks: self.bottleneck_analysis.bottlenecks.len(),
            high_priority_recommendations: high_priority_issues,
            analysis_window: self.utilization_analysis.analysis_window,
        }
    }
}

#[derive(Debug, Clone)]
pub struct PerformanceSummary {
    pub overall_score: f32,
    pub total_devices: usize,
    pub active_bottlenecks: usize,
    pub high_priority_recommendations: usize,
    pub analysis_window: Duration,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_timeline_event_creation() {
        let event = TimelineEvent::new(
            "test_event".to_string(),
            crate::DeviceId(0),
            "test_operation".to_string(),
            100.0,
            (64, 1, 1),
            vec![1024, 2048],
        );

        assert_eq!(event.event_id, "test_event");
        assert_eq!(event.device_id, crate::DeviceId(0));
        assert_eq!(event.memory_usage_mb, 100.0);
        assert!(event.end_time.is_none());
    }

    #[test]
    fn test_timeline_analyzer() {
        let analyzer = GpuTimelineAnalyzer::new(100);

        let mut event = TimelineEvent::new(
            "test".to_string(),
            crate::DeviceId(0),
            "matrix_multiply".to_string(),
            50.0,
            (16, 16, 1),
            vec![1024],
        );

        std::thread::sleep(std::time::Duration::from_millis(10));
        event.complete();

        analyzer.record_event(event);

        let events = analyzer.get_device_events(crate::DeviceId(0), None);
        assert_eq!(events.len(), 1);
    }

    #[test]
    fn test_performance_monitor() {
        let monitor = MultiGpuPerformanceMonitor::new(100, Duration::from_secs(1));

        let _handle = monitor.start_operation(
            "test_op".to_string(),
            crate::DeviceId(0),
            "test".to_string(),
            10.0,
            (64, 1, 1),
            vec![512],
        );

        let analysis = monitor.get_performance_analysis(Duration::from_secs(1));
        assert!(analysis.is_ok());
    }
}