Skip to main content

trustformers_debug/
profiler.rs

1//! Performance profiling tools for debugging
2
3use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant, SystemTime};
8use uuid::Uuid;
9
10use crate::DebugConfig;
11
/// Profiling event types
///
/// Each variant captures timing (and, where available, memory) data for one
/// category of profiled work. Events are accumulated by the profiler and
/// grouped by variant name when statistics are computed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProfileEvent {
    /// A generic timed call recorded via `start_timer`/`end_timer`.
    FunctionCall {
        function_name: String,
        duration: Duration,
        /// Net memory change during the call, in bytes (may be negative).
        memory_delta: i64,
    },
    /// One forward (and optionally backward) execution of a model layer.
    LayerExecution {
        layer_name: String,
        layer_type: String,
        forward_time: Duration,
        /// `None` for inference-only runs with no backward pass.
        backward_time: Option<Duration>,
        memory_usage: usize,
        parameter_count: usize,
    },
    /// A single tensor-level operation (matmul, reshape, ...).
    TensorOperation {
        operation: String,
        tensor_shape: Vec<usize>,
        duration: Duration,
        memory_allocated: usize,
    },
    /// A full model inference pass over one batch.
    ModelInference {
        batch_size: usize,
        sequence_length: usize,
        duration: Duration,
        /// Throughput derived from batch_size * sequence_length / duration.
        tokens_per_second: f64,
    },
    /// Gradient computation for one layer during backprop.
    GradientComputation {
        layer_name: String,
        gradient_norm: f64,
        duration: Duration,
    },
}
46
/// Profiling statistics for analysis
///
/// Aggregated per `ProfileEvent` variant by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileStats {
    /// Name of the event variant these statistics summarize.
    pub event_type: String,
    /// Number of events of this type.
    pub count: usize,
    pub total_duration: Duration,
    /// `total_duration / count`.
    pub avg_duration: Duration,
    pub min_duration: Duration,
    pub max_duration: Duration,
    /// Total memory attributed to these events, in bytes (may be negative).
    pub total_memory: i64,
    pub avg_memory: f64,
}
59
/// Memory usage snapshot
///
/// Point-in-time memory readings collected by `Profiler::take_memory_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub heap_allocated: usize,
    pub heap_used: usize,
    pub stack_size: usize,
    /// `None` when no GPU is present or GPU stats are unavailable.
    pub gpu_allocated: Option<usize>,
    pub gpu_used: Option<usize>,
}
70
/// Performance bottleneck detection
///
/// A single detected bottleneck, with a human-readable description and a
/// remediation suggestion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    pub bottleneck_type: BottleneckType,
    /// Where the bottleneck was observed (e.g. a layer or function name).
    pub location: String,
    pub severity: BottleneckSeverity,
    pub description: String,
    pub suggestion: String,
    /// Supporting metrics, keyed by metric name.
    pub metrics: HashMap<String, f64>,
}
81
/// Category of resource or phase limiting performance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CpuBound,
    MemoryBound,
    IoBound,
    GpuBound,
    NetworkBound,
    DataLoading,
    ModelComputation,
    GradientComputation,
}
93
/// Severity ranking for a detected bottleneck, from least to most urgent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
101
/// CPU profiling information
///
/// A call-tree node: `children` are callees, so the structure is recursive.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuProfile {
    pub function_name: String,
    /// Time spent in this function excluding callees.
    pub self_time: Duration,
    /// Time spent in this function including callees.
    pub total_time: Duration,
    pub call_count: usize,
    pub children: Vec<CpuProfile>,
}
111
/// Enhanced GPU kernel profiling
///
/// Launch configuration and measured metrics for a single kernel execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
    pub kernel_name: String,
    /// Launch grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Thread-block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    pub shared_memory_bytes: usize,
    pub registers_per_thread: u32,
    /// Achieved occupancy, presumably in [0, 1] — TODO confirm scale.
    pub occupancy: f64,
    pub execution_time: Duration,
    pub memory_bandwidth_gb_s: f64,
    pub compute_utilization: f64,
    /// Stream the kernel ran on; used as the key in `GpuProfiler::active_streams`.
    pub stream_id: i32,
}
126
/// Memory allocation tracking
///
/// One tracked allocation; `freed`/`free_timestamp` are filled in when the
/// matching deallocation is recorded.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    pub allocation_id: Uuid,
    pub size_bytes: usize,
    pub allocation_type: MemoryAllocationType,
    /// `None` for host allocations with no associated device.
    pub device_id: Option<i32>,
    pub timestamp: SystemTime,
    pub stack_trace: Vec<String>,
    pub freed: bool,
    pub free_timestamp: Option<SystemTime>,
}
139
/// Where/how a tracked allocation lives (host RAM, device memory, etc.).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MemoryAllocationType {
    Host,
    Device,
    Unified,
    Pinned,
    Mapped,
}
148
/// Layer-wise latency analysis
///
/// Detailed per-layer timing breakdown (CPU vs GPU vs copies vs sync) plus
/// size/throughput metadata; consumed by `get_layer_latency_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyProfile {
    pub layer_name: String,
    pub layer_type: String,
    pub input_shapes: Vec<Vec<usize>>,
    pub output_shapes: Vec<Vec<usize>>,
    pub cpu_time: Duration,
    pub gpu_time: Duration,
    /// Time spent in host<->device memory copies.
    pub memory_copy_time: Duration,
    /// Time spent waiting on synchronization.
    pub sync_time: Duration,
    pub parameter_count: usize,
    /// Floating-point operations performed by the layer.
    pub flops: u64,
    pub memory_footprint_bytes: usize,
    /// Presumably in [0, 1] — TODO confirm scale.
    pub cache_hit_rate: f64,
}
165
/// I/O operation profiling
///
/// Result of one completed I/O operation, produced by
/// `IoMonitor::finish_io_operation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoProfile {
    pub operation_type: IoOperationType,
    /// Currently always `None` (not populated by the monitor).
    pub file_path: Option<String>,
    pub bytes_transferred: usize,
    pub duration: Duration,
    /// Measured throughput in megabytes per second; 0.0 for zero-length durations.
    pub bandwidth_mb_s: f64,
    /// Simplified estimate derived from queue depth, not a real measurement.
    pub queue_time: Duration,
    pub device_type: IoDeviceType,
}
177
/// Kind of I/O operation being profiled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IoOperationType {
    FileRead,
    FileWrite,
    NetworkRead,
    NetworkWrite,
    DatabaseQuery,
    CacheLoad,
    CacheStore,
}
188
/// Device class an I/O operation touched.
///
/// `PartialEq`/`Eq`/`Hash` allow use as a `HashMap` key and for filtering
/// bandwidth samples by device.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum IoDeviceType {
    SSD,
    HDD,
    Network,
    Memory,
    Cache,
}
197
/// CPU bottleneck analysis
///
/// Per-thread CPU performance counters and hot-function breakdown.
/// Currently populated with placeholder values (see `analyze_cpu_bottlenecks`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuBottleneckAnalysis {
    pub thread_id: u64,
    pub cpu_usage: f64,
    pub context_switches: u64,
    pub cache_misses: u64,
    pub instructions_per_cycle: f64,
    pub branch_mispredictions: u64,
    pub hot_functions: Vec<HotFunction>,
    /// Aggregate severity score; presumably in [0, 1] — TODO confirm scale.
    pub bottleneck_score: f64,
}
210
/// A function that dominates CPU time in a bottleneck analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HotFunction {
    pub function_name: String,
    /// Share of self time, expressed as a percentage (0-100).
    pub self_time_percentage: f64,
    pub call_count: usize,
    pub avg_time_per_call: Duration,
}
218
/// Memory allocation tracker
///
/// Keeps the set of live allocations plus running byte/count totals.
/// `peak_allocated` is the high-water mark of `total_allocated`.
#[derive(Debug)]
pub struct MemoryTracker {
    // Live (not yet deallocated) allocations, keyed by allocation id.
    allocations: HashMap<Uuid, MemoryAllocation>,
    total_allocated: usize,
    peak_allocated: usize,
    allocation_count: usize,
    deallocation_count: usize,
}
228
229impl Default for MemoryTracker {
230    fn default() -> Self {
231        Self::new()
232    }
233}
234
235impl MemoryTracker {
236    pub fn new() -> Self {
237        Self {
238            allocations: HashMap::new(),
239            total_allocated: 0,
240            peak_allocated: 0,
241            allocation_count: 0,
242            deallocation_count: 0,
243        }
244    }
245
246    pub fn track_allocation(&mut self, allocation: MemoryAllocation) {
247        self.total_allocated += allocation.size_bytes;
248        self.allocation_count += 1;
249
250        if self.total_allocated > self.peak_allocated {
251            self.peak_allocated = self.total_allocated;
252        }
253
254        self.allocations.insert(allocation.allocation_id, allocation);
255    }
256
257    pub fn track_deallocation(&mut self, allocation_id: Uuid) {
258        if let Some(mut allocation) = self.allocations.remove(&allocation_id) {
259            allocation.freed = true;
260            allocation.free_timestamp = Some(SystemTime::now());
261            self.total_allocated = self.total_allocated.saturating_sub(allocation.size_bytes);
262            self.deallocation_count += 1;
263        }
264    }
265
266    pub fn get_memory_stats(&self) -> MemoryStats {
267        MemoryStats {
268            total_allocated: self.total_allocated,
269            peak_allocated: self.peak_allocated,
270            active_allocations: self.allocations.len(),
271            allocation_count: self.allocation_count,
272            deallocation_count: self.deallocation_count,
273            memory_efficiency: if self.allocation_count > 0 {
274                self.deallocation_count as f64 / self.allocation_count as f64
275            } else {
276                1.0
277            },
278        }
279    }
280}
281
/// Snapshot of `MemoryTracker` counters, produced by `get_memory_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryStats {
    /// Currently outstanding bytes.
    pub total_allocated: usize,
    /// High-water mark of `total_allocated`.
    pub peak_allocated: usize,
    /// Number of live (not yet freed) allocations.
    pub active_allocations: usize,
    pub allocation_count: usize,
    pub deallocation_count: usize,
    /// deallocation_count / allocation_count; 1.0 when nothing allocated.
    pub memory_efficiency: f64,
}
291
/// GPU profiler for kernel analysis
///
/// Collects kernel profiles per stream. `active_streams` is keyed by
/// `GpuKernelProfile::stream_id` (see `profile_kernel`).
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
    #[allow(dead_code)]
    device_count: i32,
    // Kernel profiles grouped by stream id.
    active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
    // Per-device memory pool state; currently never populated.
    memory_pools: HashMap<i32, GpuMemoryPool>,
}
301
/// Per-device GPU memory pool state (currently unused — see `GpuProfiler`).
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
    #[allow(dead_code)]
    device_id: i32,
    total_memory: usize,
    free_memory: usize,
    // Presumably higher means more fragmented — TODO confirm scale.
    fragmentation_score: f64,
}
311
312impl GpuProfiler {
313    pub fn new() -> Result<Self> {
314        // In practice, this would initialize CUDA/ROCm profiling
315        Ok(Self {
316            device_count: 1, // Simplified
317            active_streams: HashMap::new(),
318            memory_pools: HashMap::new(),
319        })
320    }
321
322    pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
323        self.active_streams
324            .entry(kernel_profile.stream_id)
325            .or_default()
326            .push(kernel_profile);
327    }
328
329    pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
330        // Simplified GPU utilization calculation
331        if let Some(kernels) = self.active_streams.get(&device_id) {
332            if kernels.is_empty() {
333                0.0
334            } else {
335                kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
336            }
337        } else {
338            0.0
339        }
340    }
341}
342
/// I/O operation monitor
///
/// Tracks in-flight I/O operations and a bounded history of bandwidth
/// samples (capped at 1000; the older half is dropped when exceeded).
#[derive(Debug)]
pub struct IoMonitor {
    // In-flight operations keyed by the id returned from start_io_operation.
    active_operations: HashMap<Uuid, IoOperation>,
    bandwidth_history: Vec<BandwidthSample>,
    // Count of operations currently in flight.
    io_queue_depth: usize,
}
/// An in-flight I/O operation awaiting completion.
#[allow(dead_code)]
#[derive(Debug)]
pub struct IoOperation {
    #[allow(dead_code)]
    operation_id: Uuid,
    // Used to compute elapsed duration at completion.
    start_time: Instant,
    operation_type: IoOperationType,
    bytes_expected: usize,
}
359
/// One measured bandwidth data point, recorded when an I/O operation completes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BandwidthSample {
    pub timestamp: SystemTime,
    /// Throughput in megabytes per second.
    pub bandwidth_mb_s: f64,
    pub device_type: IoDeviceType,
}
366
367impl Default for IoMonitor {
368    fn default() -> Self {
369        Self::new()
370    }
371}
372
373impl IoMonitor {
374    pub fn new() -> Self {
375        Self {
376            active_operations: HashMap::new(),
377            bandwidth_history: Vec::new(),
378            io_queue_depth: 0,
379        }
380    }
381
382    pub fn start_io_operation(
383        &mut self,
384        operation_type: IoOperationType,
385        bytes_expected: usize,
386    ) -> Uuid {
387        let operation_id = Uuid::new_v4();
388        let operation = IoOperation {
389            operation_id,
390            start_time: Instant::now(),
391            operation_type,
392            bytes_expected,
393        };
394
395        self.active_operations.insert(operation_id, operation);
396        self.io_queue_depth += 1;
397        operation_id
398    }
399
400    pub fn finish_io_operation(
401        &mut self,
402        operation_id: Uuid,
403        bytes_transferred: usize,
404    ) -> Option<IoProfile> {
405        if let Some(operation) = self.active_operations.remove(&operation_id) {
406            let duration = operation.start_time.elapsed();
407            let bandwidth_mb_s = if duration.as_secs_f64() > 0.0 {
408                bytes_transferred as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()
409            } else {
410                0.0
411            };
412
413            self.io_queue_depth = self.io_queue_depth.saturating_sub(1);
414
415            let device_type = match operation.operation_type {
416                IoOperationType::FileRead | IoOperationType::FileWrite => IoDeviceType::SSD,
417                IoOperationType::NetworkRead | IoOperationType::NetworkWrite => {
418                    IoDeviceType::Network
419                },
420                IoOperationType::CacheLoad | IoOperationType::CacheStore => IoDeviceType::Cache,
421                _ => IoDeviceType::Memory,
422            };
423
424            // Record bandwidth sample
425            self.bandwidth_history.push(BandwidthSample {
426                timestamp: SystemTime::now(),
427                bandwidth_mb_s,
428                device_type: device_type.clone(),
429            });
430
431            // Keep only recent samples
432            if self.bandwidth_history.len() > 1000 {
433                self.bandwidth_history.drain(0..500);
434            }
435
436            Some(IoProfile {
437                operation_type: operation.operation_type,
438                file_path: None, // Would be filled in practice
439                bytes_transferred,
440                duration,
441                bandwidth_mb_s,
442                queue_time: Duration::from_millis(self.io_queue_depth as u64 * 10), // Simplified
443                device_type,
444            })
445        } else {
446            None
447        }
448    }
449
450    pub fn get_average_bandwidth(&self, device_type: &IoDeviceType) -> f64 {
451        let samples: Vec<f64> = self
452            .bandwidth_history
453            .iter()
454            .filter(|s| s.device_type == *device_type)
455            .map(|s| s.bandwidth_mb_s)
456            .collect();
457
458        if samples.is_empty() {
459            0.0
460        } else {
461            samples.iter().sum::<f64>() / samples.len() as f64
462        }
463    }
464}
465
/// Performance profiler
///
/// Central collector for timing events, memory snapshots, and the enhanced
/// GPU/memory/latency/I-O profiling facilities.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    // Raw event timeline, in insertion order.
    events: Vec<ProfileEvent>,
    // Open timers by name (see start_timer / end_timer).
    active_timers: HashMap<String, Instant>,
    // Bounded history of memory snapshots (older half dropped past 1000).
    memory_snapshots: Vec<MemorySnapshot>,
    // Set by start(); None until the session begins or after clear().
    start_time: Option<Instant>,
    // Aggregated per-layer timing, keyed by layer name.
    layer_profiles: HashMap<String, LayerProfile>,
    // Findings from the most recent analyze_performance() run.
    bottlenecks: Vec<PerformanceBottleneck>,
    // Enhanced profiling features
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    // Shared so other components could hold a handle; locked leniently
    // (poisoned locks are skipped rather than propagated).
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    // None when GPU profiler initialization failed.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
487
/// Accumulated timing/memory samples for a single layer.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    // One entry per recorded forward pass.
    forward_times: Vec<Duration>,
    // May be shorter than forward_times: backward passes are optional.
    backward_times: Vec<Duration>,
    memory_usage: Vec<usize>,
    call_count: usize,
}
497
impl LayerProfile {
    /// Get forward execution times (one entry per recorded forward pass)
    pub fn forward_times(&self) -> &Vec<Duration> {
        &self.forward_times
    }

    /// Get backward execution times (may be shorter than `forward_times`,
    /// since backward passes are optional)
    pub fn backward_times(&self) -> &Vec<Duration> {
        &self.backward_times
    }

    /// Get memory usage samples (bytes, one entry per recorded execution)
    pub fn memory_usage(&self) -> &Vec<usize> {
        &self.memory_usage
    }

    /// Get total number of calls
    pub fn call_count(&self) -> usize {
        self.call_count
    }
}
519
520impl Profiler {
    /// Create a new profiler
    ///
    /// All collections start empty and no session is running until
    /// `start()` is called. The GPU profiler is optional: if its
    /// initialization fails, GPU features are silently disabled.
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            // Enhanced profiling features
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
542
    /// Start profiling session
    ///
    /// Records the session start time (used for total runtime in reports)
    /// and takes an initial memory snapshot. Currently always returns `Ok`.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }
550
    /// Get reference to profiling events (the raw timeline, in insertion order)
    pub fn get_events(&self) -> &Vec<ProfileEvent> {
        &self.events
    }
555
    /// Start timing a function or operation
    ///
    /// Starting a timer with a name that is already active silently
    /// restarts it (the previous start instant is overwritten).
    pub fn start_timer(&mut self, name: &str) {
        self.active_timers.insert(name.to_string(), Instant::now());
    }
560
561    /// End timing and record the event
562    pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
563        if let Some(start_time) = self.active_timers.remove(name) {
564            let duration = start_time.elapsed();
565
566            // Record basic function call event
567            self.events.push(ProfileEvent::FunctionCall {
568                function_name: name.to_string(),
569                duration,
570                memory_delta: 0, // Would need actual memory tracking
571            });
572
573            Some(duration)
574        } else {
575            tracing::warn!("Timer '{}' was not started", name);
576            None
577        }
578    }
579
580    /// Record layer execution timing
581    pub fn record_layer_execution(
582        &mut self,
583        layer_name: &str,
584        layer_type: &str,
585        forward_time: Duration,
586        backward_time: Option<Duration>,
587        memory_usage: usize,
588        parameter_count: usize,
589    ) {
590        // Record event
591        self.events.push(ProfileEvent::LayerExecution {
592            layer_name: layer_name.to_string(),
593            layer_type: layer_type.to_string(),
594            forward_time,
595            backward_time,
596            memory_usage,
597            parameter_count,
598        });
599
600        // Update layer profile
601        let profile =
602            self.layer_profiles
603                .entry(layer_name.to_string())
604                .or_insert_with(|| LayerProfile {
605                    layer_name: layer_name.to_string(),
606                    forward_times: Vec::new(),
607                    backward_times: Vec::new(),
608                    memory_usage: Vec::new(),
609                    call_count: 0,
610                });
611
612        profile.forward_times.push(forward_time);
613        if let Some(backward) = backward_time {
614            profile.backward_times.push(backward);
615        }
616        profile.memory_usage.push(memory_usage);
617        profile.call_count += 1;
618    }
619
    /// Record tensor operation timing
    ///
    /// Appends a `TensorOperation` event; no aggregation happens here.
    pub fn record_tensor_operation(
        &mut self,
        operation: &str,
        tensor_shape: &[usize],
        duration: Duration,
        memory_allocated: usize,
    ) {
        self.events.push(ProfileEvent::TensorOperation {
            operation: operation.to_string(),
            tensor_shape: tensor_shape.to_vec(),
            duration,
            memory_allocated,
        });
    }
635
636    /// Record model inference timing
637    pub fn record_model_inference(
638        &mut self,
639        batch_size: usize,
640        sequence_length: usize,
641        duration: Duration,
642    ) {
643        let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
644
645        self.events.push(ProfileEvent::ModelInference {
646            batch_size,
647            sequence_length,
648            duration,
649            tokens_per_second,
650        });
651    }
652
    /// Record gradient computation timing
    ///
    /// Appends a `GradientComputation` event; no aggregation happens here.
    pub fn record_gradient_computation(
        &mut self,
        layer_name: &str,
        gradient_norm: f64,
        duration: Duration,
    ) {
        self.events.push(ProfileEvent::GradientComputation {
            layer_name: layer_name.to_string(),
            gradient_norm,
            duration,
        });
    }
666
667    /// Take a memory usage snapshot
668    pub fn take_memory_snapshot(&mut self) {
669        // Simplified memory tracking - in practice would use system APIs
670        let snapshot = MemorySnapshot {
671            timestamp: chrono::Utc::now(),
672            heap_allocated: 0, // Would get from system
673            heap_used: 0,
674            stack_size: 0,
675            gpu_allocated: None,
676            gpu_used: None,
677        };
678
679        self.memory_snapshots.push(snapshot);
680
681        // Keep only recent snapshots to prevent memory growth
682        if self.memory_snapshots.len() > 1000 {
683            self.memory_snapshots.drain(0..500);
684        }
685    }
686
    /// Analyze performance and detect bottlenecks
    ///
    /// Clears previous findings and re-runs the layer, memory, and tensor
    /// analyses (helpers defined elsewhere in this file; presumably each
    /// appends to `self.bottlenecks` — confirm against their definitions).
    /// Returns a copy of the combined findings.
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        self.bottlenecks.clear();

        // Analyze layer execution times
        self.analyze_layer_bottlenecks();

        // Analyze memory usage patterns
        self.analyze_memory_bottlenecks();

        // Analyze tensor operation efficiency
        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
702
703    /// Get profiling statistics
704    pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
705        let mut stats = HashMap::new();
706
707        // Group events by type
708        let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
709
710        for event in &self.events {
711            let event_type = match event {
712                ProfileEvent::FunctionCall { .. } => "FunctionCall",
713                ProfileEvent::LayerExecution { .. } => "LayerExecution",
714                ProfileEvent::TensorOperation { .. } => "TensorOperation",
715                ProfileEvent::ModelInference { .. } => "ModelInference",
716                ProfileEvent::GradientComputation { .. } => "GradientComputation",
717            };
718
719            grouped_events.entry(event_type.to_string()).or_default().push(event);
720        }
721
722        // Calculate statistics for each event type
723        for (event_type, events) in grouped_events {
724            let durations: Vec<Duration> = events
725                .iter()
726                .filter_map(|event| match event {
727                    ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
728                    ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
729                    ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
730                    ProfileEvent::ModelInference { duration, .. } => Some(*duration),
731                    ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
732                })
733                .collect();
734
735            if !durations.is_empty() {
736                let total_duration: Duration = durations.iter().sum();
737                let avg_duration = total_duration / durations.len() as u32;
738                let min_duration = durations.iter().min().copied().unwrap_or_default();
739                let max_duration = durations.iter().max().copied().unwrap_or_default();
740
741                stats.insert(
742                    event_type.clone(),
743                    ProfileStats {
744                        event_type,
745                        count: durations.len(),
746                        total_duration,
747                        avg_duration,
748                        min_duration,
749                        max_duration,
750                        total_memory: 0, // Simplified
751                        avg_memory: 0.0,
752                    },
753                );
754            }
755        }
756
757        stats
758    }
759
    /// Get layer-specific performance profiles, keyed by layer name
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }
764
    /// Get memory usage over time (snapshots in chronological order;
    /// history is bounded, so the oldest entries may have been dropped)
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }
769
770    /// Generate performance report
771    pub async fn generate_report(&self) -> Result<ProfilerReport> {
772        let statistics = self.get_statistics();
773        let bottlenecks = self.bottlenecks.clone();
774        let total_events = self.events.len();
775
776        let total_runtime =
777            if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };
778
779        // Calculate slowest layers
780        let slowest_layers = self.get_slowest_layers(5);
781
782        // Memory efficiency analysis
783        let memory_efficiency = self.analyze_memory_efficiency();
784
785        Ok(ProfilerReport {
786            total_events,
787            total_runtime,
788            statistics,
789            bottlenecks,
790            slowest_layers,
791            memory_efficiency,
792            recommendations: self.generate_performance_recommendations(),
793        })
794    }
795
796    /// Clear all profiling data
797    pub fn clear(&mut self) {
798        self.events.clear();
799        self.active_timers.clear();
800        self.memory_snapshots.clear();
801        self.layer_profiles.clear();
802        self.bottlenecks.clear();
803        self.start_time = None;
804        // Clear enhanced profiling data
805        self.gpu_kernel_profiles.clear();
806        self.memory_allocations.clear();
807        self.layer_latency_profiles.clear();
808        self.io_profiles.clear();
809        self.cpu_bottleneck_analysis.clear();
810        if let Ok(mut tracker) = self.memory_tracker.lock() {
811            *tracker = MemoryTracker::new();
812        }
813        self.io_monitor = IoMonitor::new();
814    }
815
816    // Enhanced profiling methods
817
    /// Profile GPU kernel execution
    ///
    /// Forwards a copy to the per-stream GPU profiler (when one was
    /// successfully initialized) and always appends to the flat kernel list.
    pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
        if let Some(ref mut gpu_profiler) = self.gpu_profiler {
            gpu_profiler.profile_kernel(kernel_profile.clone());
        }
        self.gpu_kernel_profiles.push(kernel_profile);
    }
825
826    /// Track memory allocation
827    pub fn track_memory_allocation(
828        &mut self,
829        size_bytes: usize,
830        allocation_type: MemoryAllocationType,
831        device_id: Option<i32>,
832        stack_trace: Vec<String>,
833    ) -> Uuid {
834        let allocation_id = Uuid::new_v4();
835        let allocation = MemoryAllocation {
836            allocation_id,
837            size_bytes,
838            allocation_type,
839            device_id,
840            timestamp: SystemTime::now(),
841            stack_trace,
842            freed: false,
843            free_timestamp: None,
844        };
845
846        if let Ok(mut tracker) = self.memory_tracker.lock() {
847            tracker.track_allocation(allocation.clone());
848        }
849
850        self.memory_allocations.insert(allocation_id, allocation);
851        allocation_id
852    }
853
854    /// Track memory deallocation
855    pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
856        if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
857            allocation.freed = true;
858            allocation.free_timestamp = Some(SystemTime::now());
859        }
860
861        if let Ok(mut tracker) = self.memory_tracker.lock() {
862            tracker.track_deallocation(allocation_id);
863        }
864    }
865
    /// Profile layer latency with detailed breakdown
    ///
    /// Stores (or replaces) the profile keyed by its layer name.
    pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
        self.layer_latency_profiles
            .insert(layer_latency.layer_name.clone(), layer_latency);
    }
871
    /// Start I/O operation profiling
    ///
    /// Delegates to the internal monitor; returns the operation id to pass
    /// to `finish_io_profiling`.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }
880
881    /// Finish I/O operation profiling
882    pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
883        if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
884        {
885            self.io_profiles.push(profile);
886        }
887    }
888
889    /// Analyze CPU bottlenecks
890    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
891        // Simplified CPU bottleneck analysis
892        // In practice, this would use system profiling APIs
893        let analysis = CpuBottleneckAnalysis {
894            thread_id: 0, // Use 0 as placeholder since thread::current().id().as_u64() is unstable
895            cpu_usage: 0.75, // Simplified
896            context_switches: 1000,
897            cache_misses: 500,
898            instructions_per_cycle: 2.5,
899            branch_mispredictions: 100,
900            hot_functions: vec![
901                HotFunction {
902                    function_name: "tensor_multiply".to_string(),
903                    self_time_percentage: 25.0,
904                    call_count: 1000,
905                    avg_time_per_call: Duration::from_micros(250),
906                },
907                HotFunction {
908                    function_name: "gradient_computation".to_string(),
909                    self_time_percentage: 20.0,
910                    call_count: 500,
911                    avg_time_per_call: Duration::from_micros(400),
912                },
913            ],
914            bottleneck_score: 0.6,
915        };
916
917        self.cpu_bottleneck_analysis.push(analysis.clone());
918        vec![analysis]
919    }
920
921    /// Get memory allocation statistics
922    pub fn get_memory_stats(&self) -> Option<MemoryStats> {
923        if let Ok(tracker) = self.memory_tracker.lock() {
924            Some(tracker.get_memory_stats())
925        } else {
926            None
927        }
928    }
929
    /// Get GPU utilization metrics
    ///
    /// Returns `None` when GPU profiling was not initialized; otherwise
    /// delegates to `GpuProfiler::get_gpu_utilization`.
    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
        self.gpu_profiler
            .as_ref()
            .map(|profiler| profiler.get_gpu_utilization(device_id))
    }
936
937    /// Get I/O bandwidth statistics
938    pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
939        let mut stats = HashMap::new();
940
941        stats.insert(
942            IoDeviceType::SSD,
943            self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
944        );
945        stats.insert(
946            IoDeviceType::HDD,
947            self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
948        );
949        stats.insert(
950            IoDeviceType::Network,
951            self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
952        );
953        stats.insert(
954            IoDeviceType::Memory,
955            self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
956        );
957        stats.insert(
958            IoDeviceType::Cache,
959            self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
960        );
961
962        stats
963    }
964
965    /// Get layer latency analysis
966    pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
967        self.layer_latency_profiles
968            .values()
969            .map(|profile| LayerLatencyAnalysis {
970                layer_name: profile.layer_name.clone(),
971                layer_type: profile.layer_type.clone(),
972                total_time: profile.cpu_time
973                    + profile.gpu_time
974                    + profile.memory_copy_time
975                    + profile.sync_time,
976                cpu_percentage: profile.cpu_time.as_secs_f64()
977                    / (profile.cpu_time
978                        + profile.gpu_time
979                        + profile.memory_copy_time
980                        + profile.sync_time)
981                        .as_secs_f64()
982                    * 100.0,
983                gpu_percentage: profile.gpu_time.as_secs_f64()
984                    / (profile.cpu_time
985                        + profile.gpu_time
986                        + profile.memory_copy_time
987                        + profile.sync_time)
988                        .as_secs_f64()
989                    * 100.0,
990                memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
991                    / (profile.cpu_time
992                        + profile.gpu_time
993                        + profile.memory_copy_time
994                        + profile.sync_time)
995                        .as_secs_f64()
996                    * 100.0,
997                flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
998                    profile.flops as f64 / profile.gpu_time.as_secs_f64()
999                } else {
1000                    0.0
1001                },
1002                memory_bandwidth_utilization: profile.cache_hit_rate,
1003                bottleneck_type: self.identify_layer_bottleneck(profile),
1004            })
1005            .collect()
1006    }
1007
1008    /// Get comprehensive performance analysis
1009    pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
1010        let memory_stats = self.get_memory_stats();
1011        let io_bandwidth_stats = self.get_io_bandwidth_stats();
1012        let layer_analysis = self.get_layer_latency_analysis();
1013
1014        let gpu_utilization =
1015            self.gpu_profiler.as_ref().map(|profiler| profiler.get_gpu_utilization(0));
1016
1017        PerformanceAnalysis {
1018            memory_stats,
1019            io_bandwidth_stats,
1020            layer_analysis,
1021            gpu_utilization,
1022            cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
1023            total_gpu_kernels: self.gpu_kernel_profiles.len(),
1024            total_io_operations: self.io_profiles.len(),
1025            performance_score: self.calculate_overall_performance_score(),
1026            recommendations: self.generate_enhanced_recommendations(),
1027        }
1028    }
1029
1030    fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
1031        let total_time =
1032            profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;
1033
1034        if profile.memory_copy_time > total_time / 2 {
1035            "Memory Bandwidth".to_string()
1036        } else if profile.sync_time > total_time / 3 {
1037            "GPU Synchronization".to_string()
1038        } else if profile.gpu_time > profile.cpu_time * 10 {
1039            "GPU Compute".to_string()
1040        } else {
1041            "CPU Compute".to_string()
1042        }
1043    }
1044
1045    fn calculate_overall_performance_score(&self) -> f64 {
1046        let mut score: f64 = 100.0;
1047
1048        // Deduct for bottlenecks
1049        for bottleneck in &self.bottlenecks {
1050            match bottleneck.severity {
1051                BottleneckSeverity::Critical => score -= 20.0,
1052                BottleneckSeverity::High => score -= 10.0,
1053                BottleneckSeverity::Medium => score -= 5.0,
1054                BottleneckSeverity::Low => score -= 2.0,
1055            }
1056        }
1057
1058        // Deduct for poor GPU utilization
1059        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1060            if gpu_util < 0.5 {
1061                score -= 15.0;
1062            } else if gpu_util < 0.7 {
1063                score -= 8.0;
1064            }
1065        }
1066
1067        // Deduct for memory inefficiency
1068        if let Some(memory_stats) = self.get_memory_stats() {
1069            if memory_stats.memory_efficiency < 0.8 {
1070                score -= 10.0;
1071            }
1072        }
1073
1074        score.max(0.0)
1075    }
1076
1077    fn generate_enhanced_recommendations(&self) -> Vec<String> {
1078        let mut recommendations = Vec::new();
1079
1080        // GPU utilization recommendations
1081        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1082            if gpu_util < 0.5 {
1083                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
1084            }
1085        }
1086
1087        // Memory recommendations
1088        if let Some(memory_stats) = self.get_memory_stats() {
1089            if memory_stats.memory_efficiency < 0.8 {
1090                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
1091            }
1092
1093            if memory_stats.active_allocations > 10000 {
1094                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
1095            }
1096        }
1097
1098        // I/O recommendations
1099        let io_stats = self.get_io_bandwidth_stats();
1100        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
1101            if ssd_bandwidth < 100.0 {
1102                // Less than 100 MB/s
1103                recommendations.push(
1104                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
1105                        .to_string(),
1106                );
1107            }
1108        }
1109
1110        // Layer-specific recommendations
1111        let layer_analysis = self.get_layer_latency_analysis();
1112        for analysis in &layer_analysis {
1113            if analysis.memory_copy_percentage > 50.0 {
1114                recommendations.push(format!(
1115                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
1116                    analysis.layer_name
1117                ));
1118            }
1119
1120            if analysis.cpu_percentage > 80.0 {
1121                recommendations.push(format!(
1122                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
1123                    analysis.layer_name
1124                ));
1125            }
1126        }
1127
1128        if recommendations.is_empty() {
1129            recommendations
1130                .push("Performance appears optimal based on current analysis.".to_string());
1131        }
1132
1133        recommendations
1134    }
1135
1136    // Private analysis methods
1137
1138    fn analyze_layer_bottlenecks(&mut self) {
1139        for (layer_name, profile) in &self.layer_profiles {
1140            if profile.forward_times.is_empty() {
1141                continue;
1142            }
1143
1144            let avg_forward_time =
1145                profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
1146
1147            // Consider a layer slow if it takes more than 100ms on average
1148            if avg_forward_time.as_millis() > 100 {
1149                let mut metrics = HashMap::new();
1150                metrics.insert(
1151                    "avg_forward_time_ms".to_string(),
1152                    avg_forward_time.as_millis() as f64,
1153                );
1154                metrics.insert("call_count".to_string(), profile.call_count as f64);
1155
1156                self.bottlenecks.push(PerformanceBottleneck {
1157                    bottleneck_type: BottleneckType::ModelComputation,
1158                    location: layer_name.clone(),
1159                    severity: if avg_forward_time.as_millis() > 500 {
1160                        BottleneckSeverity::High
1161                    } else {
1162                        BottleneckSeverity::Medium
1163                    },
1164                    description: format!(
1165                        "Layer '{}' has slow forward pass: {:.1}ms average",
1166                        layer_name,
1167                        avg_forward_time.as_millis()
1168                    ),
1169                    suggestion: "Consider optimizing layer implementation or reducing layer size"
1170                        .to_string(),
1171                    metrics,
1172                });
1173            }
1174        }
1175    }
1176
1177    fn analyze_memory_bottlenecks(&mut self) {
1178        if self.memory_snapshots.len() < 2 {
1179            return;
1180        }
1181
1182        // Check for memory growth trend
1183        let recent_snapshots = if self.memory_snapshots.len() > 10 {
1184            &self.memory_snapshots[self.memory_snapshots.len() - 10..]
1185        } else {
1186            &self.memory_snapshots
1187        };
1188
1189        if recent_snapshots.len() >= 5 {
1190            let initial_memory = recent_snapshots[0].heap_allocated;
1191            let final_memory = recent_snapshots
1192                .last()
1193                .expect("recent_snapshots has at least 5 elements")
1194                .heap_allocated;
1195
1196            if final_memory > initial_memory * 2 {
1197                let mut metrics = HashMap::new();
1198                metrics.insert(
1199                    "initial_memory_mb".to_string(),
1200                    initial_memory as f64 / (1024.0 * 1024.0),
1201                );
1202                metrics.insert(
1203                    "final_memory_mb".to_string(),
1204                    final_memory as f64 / (1024.0 * 1024.0),
1205                );
1206                metrics.insert(
1207                    "growth_ratio".to_string(),
1208                    final_memory as f64 / initial_memory as f64,
1209                );
1210
1211                self.bottlenecks.push(PerformanceBottleneck {
1212                    bottleneck_type: BottleneckType::MemoryBound,
1213                    location: "Memory Usage".to_string(),
1214                    severity: BottleneckSeverity::High,
1215                    description: "Significant memory growth detected during profiling".to_string(),
1216                    suggestion: "Check for memory leaks or inefficient memory usage patterns"
1217                        .to_string(),
1218                    metrics,
1219                });
1220            }
1221        }
1222    }
1223
1224    fn analyze_tensor_bottlenecks(&mut self) {
1225        // Group tensor operations by type
1226        let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
1227
1228        for event in &self.events {
1229            if let ProfileEvent::TensorOperation {
1230                operation,
1231                duration,
1232                ..
1233            } = event
1234            {
1235                operation_groups.entry(operation.clone()).or_default().push(*duration);
1236            }
1237        }
1238
1239        // Find slow operations
1240        for (operation, durations) in operation_groups {
1241            if durations.is_empty() {
1242                continue;
1243            }
1244
1245            let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
1246            let total_time = durations.iter().sum::<Duration>();
1247
1248            // Consider operation slow if it takes more than 10ms on average
1249            if avg_duration.as_millis() > 10 {
1250                let mut metrics = HashMap::new();
1251                metrics.insert(
1252                    "avg_duration_ms".to_string(),
1253                    avg_duration.as_millis() as f64,
1254                );
1255                metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
1256                metrics.insert("call_count".to_string(), durations.len() as f64);
1257
1258                self.bottlenecks.push(PerformanceBottleneck {
1259                    bottleneck_type: BottleneckType::CpuBound,
1260                    location: format!("Tensor Operation: {}", operation),
1261                    severity: if avg_duration.as_millis() > 50 {
1262                        BottleneckSeverity::High
1263                    } else {
1264                        BottleneckSeverity::Medium
1265                    },
1266                    description: format!(
1267                        "Tensor operation '{}' is slow: {:.1}ms average",
1268                        operation,
1269                        avg_duration.as_millis()
1270                    ),
1271                    suggestion:
1272                        "Consider optimizing tensor operation or using different data types"
1273                            .to_string(),
1274                    metrics,
1275                });
1276            }
1277        }
1278    }
1279
1280    fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
1281        let mut layer_times: Vec<(String, Duration)> = self
1282            .layer_profiles
1283            .iter()
1284            .map(|(name, profile)| {
1285                let avg_time = if profile.forward_times.is_empty() {
1286                    Duration::ZERO
1287                } else {
1288                    profile.forward_times.iter().sum::<Duration>()
1289                        / profile.forward_times.len() as u32
1290                };
1291                (name.clone(), avg_time)
1292            })
1293            .collect();
1294
1295        layer_times.sort_by_key(|item| std::cmp::Reverse(item.1));
1296        layer_times.truncate(limit);
1297        layer_times
1298    }
1299
1300    fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
1301        if self.memory_snapshots.is_empty() {
1302            return MemoryEfficiencyAnalysis::default();
1303        }
1304
1305        let memory_values: Vec<usize> =
1306            self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
1307
1308        let max_memory = memory_values.iter().max().copied().unwrap_or(0);
1309        let min_memory = memory_values.iter().min().copied().unwrap_or(0);
1310        let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
1311
1312        MemoryEfficiencyAnalysis {
1313            peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
1314            min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
1315            avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
1316            memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
1317            efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
1318        }
1319    }
1320
1321    fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
1322        if values.len() < 2 {
1323            return 0.0;
1324        }
1325
1326        let variance_sum: f64 = values
1327            .iter()
1328            .map(|&x| {
1329                let diff = x as f64 - mean as f64;
1330                diff * diff
1331            })
1332            .sum();
1333
1334        variance_sum / (values.len() - 1) as f64
1335    }
1336
1337    fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
1338        if values.is_empty() {
1339            return 0.0;
1340        }
1341
1342        let max_memory = values.iter().max().copied().unwrap_or(0);
1343        let min_memory = values.iter().min().copied().unwrap_or(0);
1344
1345        if max_memory == 0 {
1346            return 100.0;
1347        }
1348
1349        // Efficiency score: closer to 100% means more stable memory usage
1350        100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
1351    }
1352
1353    fn generate_performance_recommendations(&self) -> Vec<String> {
1354        let mut recommendations = Vec::new();
1355
1356        // Analyze bottlenecks for recommendations
1357        for bottleneck in &self.bottlenecks {
1358            match bottleneck.bottleneck_type {
1359                BottleneckType::ModelComputation => {
1360                    recommendations.push(
1361                        "Consider model architecture optimizations or layer fusion".to_string(),
1362                    );
1363                },
1364                BottleneckType::MemoryBound => {
1365                    recommendations.push(
1366                        "Optimize memory usage with gradient checkpointing or model parallelism"
1367                            .to_string(),
1368                    );
1369                },
1370                BottleneckType::CpuBound => {
1371                    recommendations.push(
1372                        "Consider GPU acceleration or optimized CPU implementations".to_string(),
1373                    );
1374                },
1375                _ => {},
1376            }
1377        }
1378
1379        // General recommendations based on profiling data
1380        if self.events.len() > 10000 {
1381            recommendations.push(
1382                "High number of profiling events - consider reducing profiling overhead"
1383                    .to_string(),
1384            );
1385        }
1386
1387        let stats = self.get_statistics();
1388        if let Some(layer_stats) = stats.get("LayerExecution") {
1389            if layer_stats.avg_duration.as_millis() > 50 {
1390                recommendations.push(
1391                    "Average layer execution time is high - consider layer optimization"
1392                        .to_string(),
1393                );
1394            }
1395        }
1396
1397        if recommendations.is_empty() {
1398            recommendations
1399                .push("Performance appears optimal based on current profiling data".to_string());
1400        }
1401
1402        recommendations
1403    }
1404}
1405
/// Memory efficiency analysis results
///
/// Produced from recorded heap snapshots. Sizes are in megabytes;
/// `efficiency_score` ranges 0-100, where 100 means perfectly stable usage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryEfficiencyAnalysis {
    // Largest heap allocation observed across snapshots (MB).
    pub peak_memory_mb: f64,
    // Smallest heap allocation observed across snapshots (MB).
    pub min_memory_mb: f64,
    // Mean heap allocation across snapshots (MB).
    pub avg_memory_mb: f64,
    // Sample variance of the raw heap-allocation values.
    pub memory_variance: f64,
    // Stability score in [0, 100]; higher means less variation.
    pub efficiency_score: f64,
}
1415
1416impl Default for MemoryEfficiencyAnalysis {
1417    fn default() -> Self {
1418        Self {
1419            peak_memory_mb: 0.0,
1420            min_memory_mb: 0.0,
1421            avg_memory_mb: 0.0,
1422            memory_variance: 0.0,
1423            efficiency_score: 100.0,
1424        }
1425    }
1426}
1427
/// Profiler report
///
/// Snapshot of everything the profiler collected: event counts, per-event-type
/// statistics, detected bottlenecks, and derived analyses.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilerReport {
    // Total number of profiling events recorded.
    pub total_events: usize,
    // Total runtime covered by the profiling session.
    pub total_runtime: Duration,
    // Aggregated statistics keyed by event type name (e.g. "LayerExecution").
    pub statistics: HashMap<String, ProfileStats>,
    // Bottlenecks found by the analysis passes.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    // Layers ranked by average forward-pass time, slowest first.
    pub slowest_layers: Vec<(String, Duration)>,
    // Heap-usage stability analysis across memory snapshots.
    pub memory_efficiency: MemoryEfficiencyAnalysis,
    // Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1439
/// Scoped timer for automatic timing
///
/// RAII-style guard: construction starts a named timer on the profiler and
/// `Drop` ends it, so the measured span is exactly the guard's lifetime.
/// Holds an exclusive borrow of the profiler for that span.
pub struct ScopedTimer<'a> {
    // Profiler that owns the timer; mutably borrowed for the guard's lifetime.
    profiler: &'a mut Profiler,
    // Timer name passed to `start_timer` / `end_timer`.
    name: String,
}
1445
1446impl<'a> ScopedTimer<'a> {
1447    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
1448        profiler.start_timer(&name);
1449        Self { profiler, name }
1450    }
1451}
1452
impl<'a> Drop for ScopedTimer<'a> {
    // Close out the timer that `new` started when the guard leaves scope.
    fn drop(&mut self) {
        self.profiler.end_timer(&self.name);
    }
}
1458
/// Layer latency analysis result
///
/// Percentage fields are shares of `total_time`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyAnalysis {
    // Name of the analyzed layer.
    pub layer_name: String,
    // Layer type label copied from the latency profile.
    pub layer_type: String,
    // Sum of CPU, GPU, memory-copy, and sync time for the layer.
    pub total_time: Duration,
    // Share of total time spent in CPU work (0-100).
    pub cpu_percentage: f64,
    // Share of total time spent in GPU work (0-100).
    pub gpu_percentage: f64,
    // Share of total time spent copying memory (0-100).
    pub memory_copy_percentage: f64,
    // FLOPs divided by GPU time; 0 when no GPU time was recorded.
    pub flops_per_second: f64,
    // NOTE(review): populated from the profile's cache hit rate — confirm
    // this is the intended proxy for bandwidth utilization.
    pub memory_bandwidth_utilization: f64,
    // Human-readable dominant-cost label (e.g. "Memory Bandwidth").
    pub bottleneck_type: String,
}
1472
/// Comprehensive performance analysis
///
/// Aggregate view across memory, I/O, GPU, CPU, and per-layer metrics.
#[derive(Debug, Serialize, Deserialize)]
pub struct PerformanceAnalysis {
    // Memory allocation statistics; `None` if the tracker was unavailable.
    pub memory_stats: Option<MemoryStats>,
    // Average bandwidth per I/O device class.
    pub io_bandwidth_stats: HashMap<IoDeviceType, f64>,
    // Per-layer latency breakdowns.
    pub layer_analysis: Vec<LayerLatencyAnalysis>,
    // Utilization of GPU device 0; `None` when no GPU profiler is attached.
    pub gpu_utilization: Option<f64>,
    // CPU bottleneck analyses collected so far.
    pub cpu_bottlenecks: Vec<CpuBottleneckAnalysis>,
    // Number of GPU kernel profiles recorded.
    pub total_gpu_kernels: usize,
    // Number of I/O operations profiled.
    pub total_io_operations: usize,
    // Overall health score in [0, 100].
    pub performance_score: f64,
    // Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1486
/// Enhanced profiler report
///
/// The basic report plus the advanced GPU, memory, and I/O summaries.
/// Built by `Profiler::generate_enhanced_report`.
#[derive(Debug, Serialize, Deserialize)]
pub struct EnhancedProfilerReport {
    // Standard event/bottleneck report.
    pub basic_report: ProfilerReport,
    // Cross-cutting performance analysis and score.
    pub performance_analysis: PerformanceAnalysis,
    // GPU kernel aggregates.
    pub gpu_kernel_summary: GpuKernelSummary,
    // Memory allocation aggregates.
    pub memory_allocation_summary: MemoryAllocationSummary,
    // I/O operation aggregates.
    pub io_performance_summary: IoPerformanceSummary,
}
1496
/// Aggregate statistics over all recorded GPU kernel profiles.
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
    // Number of kernel profiles recorded.
    pub total_kernels: usize,
    // Sum of all kernel execution times.
    pub total_execution_time: Duration,
    // Mean occupancy across kernels (0.0 when none were recorded).
    pub avg_occupancy: f64,
    // Mean compute utilization across kernels (0.0 when none were recorded).
    pub avg_compute_utilization: f64,
    // Up to five kernel names, slowest by execution time first.
    pub slowest_kernels: Vec<String>,
}
1505
/// Aggregate statistics over all recorded memory allocations.
#[derive(Debug, Serialize, Deserialize)]
pub struct MemoryAllocationSummary {
    // Number of allocation records tracked.
    pub total_allocations: usize,
    // Size of the single largest allocation in bytes (max over records),
    // not the peak concurrent heap usage.
    pub peak_memory_usage: usize,
    // Allocation efficiency from the memory tracker (1.0 when unavailable).
    pub memory_efficiency: f64,
    // Up to five textual descriptions of the largest allocations.
    pub largest_allocations: Vec<String>,
    // Count of allocations never marked as freed.
    pub memory_leaks: usize,
}
1514
/// Aggregate statistics over all recorded I/O operations.
#[derive(Debug, Serialize, Deserialize)]
pub struct IoPerformanceSummary {
    // Number of I/O operations profiled.
    pub total_operations: usize,
    // Sum of bytes moved across all operations.
    pub total_bytes_transferred: usize,
    // Average bandwidth per device class.
    pub avg_bandwidth_by_device: HashMap<IoDeviceType, f64>,
    // Up to five textual descriptions of the slowest operations by duration.
    pub slowest_operations: Vec<String>,
}
1522
impl Profiler {
    /// Generate enhanced profiler report with advanced metrics
    ///
    /// Combines the basic report (events, bottlenecks, recommendations) with
    /// the GPU-kernel, memory-allocation, and I/O summaries produced below.
    ///
    /// # Errors
    /// Propagates any error from the underlying `generate_report()` call.
    pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
        let basic_report = self.generate_report().await?;
        let performance_analysis = self.get_performance_analysis();

        let gpu_kernel_summary = self.generate_gpu_kernel_summary();
        let memory_allocation_summary = self.generate_memory_allocation_summary();
        let io_performance_summary = self.generate_io_performance_summary();

        Ok(EnhancedProfilerReport {
            basic_report,
            performance_analysis,
            gpu_kernel_summary,
            memory_allocation_summary,
            io_performance_summary,
        })
    }

    /// Aggregate GPU kernel profiles: totals, averages, and the five slowest
    /// kernels by execution time.
    fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
        let total_kernels = self.gpu_kernel_profiles.len();
        let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();

        // Guard the averages against division by zero when no kernels exist.
        let avg_occupancy = if total_kernels > 0 {
            self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
        } else {
            0.0
        };

        let avg_compute_utilization = if total_kernels > 0 {
            self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
                / total_kernels as f64
        } else {
            0.0
        };

        // Rank kernels by execution time, descending.
        let mut kernels_by_time: Vec<_> = self
            .gpu_kernel_profiles
            .iter()
            .map(|k| (k.kernel_name.clone(), k.execution_time))
            .collect();
        kernels_by_time.sort_by_key(|item| std::cmp::Reverse(item.1));

        let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();

        GpuKernelSummary {
            total_kernels,
            total_execution_time,
            avg_occupancy,
            avg_compute_utilization,
            slowest_kernels,
        }
    }

    /// Aggregate memory allocation records into a summary.
    ///
    /// NOTE(review): `peak_memory_usage` is the size of the single largest
    /// allocation (max over `size_bytes`), not the peak *concurrent* heap
    /// usage — confirm consumers expect this interpretation.
    fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
        let total_allocations = self.memory_allocations.len();
        let peak_memory_usage =
            self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);

        // Fall back to "perfect" efficiency when the tracker is unavailable.
        let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
            stats.memory_efficiency
        } else {
            1.0
        };

        // Rank allocations by size, descending; keep only the descriptions.
        let mut allocations_by_size: Vec<_> = self
            .memory_allocations
            .values()
            .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
            .collect();
        allocations_by_size.sort_by_key(|item| std::cmp::Reverse(item.1));

        let largest_allocations =
            allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();

        // An allocation never marked freed is counted as a leak.
        let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();

        MemoryAllocationSummary {
            total_allocations,
            peak_memory_usage,
            memory_efficiency,
            largest_allocations,
            memory_leaks,
        }
    }

    /// Aggregate I/O profiles: operation/byte totals, per-device bandwidth
    /// averages, and the five slowest operations by duration.
    fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
        let total_operations = self.io_profiles.len();
        let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();

        let avg_bandwidth_by_device = self.get_io_bandwidth_stats();

        // Rank operations by duration, descending.
        let mut operations_by_duration: Vec<_> = self
            .io_profiles
            .iter()
            .map(|io| {
                (
                    format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
                    io.duration,
                )
            })
            .collect();
        operations_by_duration.sort_by_key(|item| std::cmp::Reverse(item.1));

        let slowest_operations =
            operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();

        IoPerformanceSummary {
            total_operations,
            total_bytes_transferred,
            avg_bandwidth_by_device,
            slowest_operations,
        }
    }
}
1638
/// Macro for convenient timing
///
/// Binds a `ScopedTimer` to a named local (`_timer`) so the timer runs until
/// the end of the enclosing scope; a bare `let _ = …` would drop it at once.
///
/// NOTE(review): the expansion references `ScopedTimer` unqualified, so
/// callers of this `#[macro_export]` macro must have the type in scope —
/// consider a `$crate::…` path; confirm the crate/module layout.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}