Skip to main content

trustformers_debug/
profiler.rs

1//! Performance profiling tools for debugging
2
3use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant, SystemTime};
8use uuid::Uuid;
9
10use crate::DebugConfig;
11
/// Profiling event types
///
/// Each variant captures the timing (and, where recorded, memory) data
/// for one category of profiled work. Events are accumulated in
/// `Profiler::events` and aggregated by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProfileEvent {
    /// A timed call recorded via `Profiler::start_timer`/`end_timer`.
    FunctionCall {
        function_name: String,
        duration: Duration,
        /// Net memory change in bytes. Currently always 0 — see the
        /// comment in `Profiler::end_timer`.
        memory_delta: i64,
    },
    /// One forward (and optionally backward) execution of a model layer.
    LayerExecution {
        layer_name: String,
        layer_type: String,
        forward_time: Duration,
        /// `None` when no backward pass was timed.
        backward_time: Option<Duration>,
        memory_usage: usize,
        parameter_count: usize,
    },
    /// A single tensor-level operation.
    TensorOperation {
        operation: String,
        tensor_shape: Vec<usize>,
        duration: Duration,
        memory_allocated: usize,
    },
    /// One end-to-end model inference.
    ModelInference {
        batch_size: usize,
        sequence_length: usize,
        duration: Duration,
        /// Derived as `batch_size * sequence_length / duration` by
        /// `Profiler::record_model_inference`.
        tokens_per_second: f64,
    },
    /// Gradient computation for a single layer.
    GradientComputation {
        layer_name: String,
        gradient_norm: f64,
        duration: Duration,
    },
}
46
/// Profiling statistics for analysis
///
/// Aggregated duration/memory statistics for one event type, as
/// produced by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileStats {
    /// Event type name (e.g. "FunctionCall", "LayerExecution").
    pub event_type: String,
    /// Number of events of this type.
    pub count: usize,
    pub total_duration: Duration,
    pub avg_duration: Duration,
    pub min_duration: Duration,
    pub max_duration: Duration,
    /// Total memory delta in bytes (currently always 0 — simplified).
    pub total_memory: i64,
    /// Average memory delta in bytes (currently always 0.0 — simplified).
    pub avg_memory: f64,
}

/// Memory usage snapshot
///
/// Point-in-time memory readings. The current implementation
/// (`Profiler::take_memory_snapshot`) fills these with placeholder
/// zeros/`None`; real values would come from system APIs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub heap_allocated: usize,
    pub heap_used: usize,
    pub stack_size: usize,
    /// GPU memory readings; `None` when no GPU is tracked.
    pub gpu_allocated: Option<usize>,
    pub gpu_used: Option<usize>,
}
70
/// Performance bottleneck detection
///
/// A detected bottleneck with a human-readable description, a
/// remediation suggestion, and free-form metrics backing the finding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    pub bottleneck_type: BottleneckType,
    /// Where the bottleneck was observed (e.g. a layer or function name).
    pub location: String,
    pub severity: BottleneckSeverity,
    pub description: String,
    /// Suggested remediation for the caller.
    pub suggestion: String,
    /// Supporting metric values, keyed by metric name.
    pub metrics: HashMap<String, f64>,
}

/// Category of resource or workload limiting performance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CpuBound,
    MemoryBound,
    IoBound,
    GpuBound,
    NetworkBound,
    DataLoading,
    ModelComputation,
    GradientComputation,
}

/// Severity ranking for a detected bottleneck, from least to most severe.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
101
/// CPU profiling information
///
/// A node in a call tree (`children` holds callee profiles).
/// NOTE(review): `self_time` vs `total_time` presumably follow the
/// usual exclusive/inclusive convention — not populated anywhere in
/// this file; confirm at the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuProfile {
    pub function_name: String,
    pub self_time: Duration,
    pub total_time: Duration,
    pub call_count: usize,
    pub children: Vec<CpuProfile>,
}

/// Enhanced GPU kernel profiling
///
/// Launch configuration and measured performance counters for a single
/// kernel execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
    pub kernel_name: String,
    /// Launch grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Thread-block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    pub shared_memory_bytes: usize,
    pub registers_per_thread: u32,
    pub occupancy: f64,
    pub execution_time: Duration,
    pub memory_bandwidth_gb_s: f64,
    /// Averaged by `GpuProfiler::get_gpu_utilization`.
    pub compute_utilization: f64,
    /// Stream this kernel ran on; used as the key in
    /// `GpuProfiler::active_streams`.
    pub stream_id: i32,
}
126
/// Memory allocation tracking
///
/// One tracked allocation; created by
/// `Profiler::track_memory_allocation` and marked freed on
/// deallocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    pub allocation_id: Uuid,
    pub size_bytes: usize,
    pub allocation_type: MemoryAllocationType,
    /// Optional device id (e.g. for device-side allocations).
    pub device_id: Option<i32>,
    pub timestamp: SystemTime,
    pub stack_trace: Vec<String>,
    /// Set to `true` once a matching deallocation is tracked.
    pub freed: bool,
    pub free_timestamp: Option<SystemTime>,
}

/// Kind of memory an allocation belongs to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MemoryAllocationType {
    Host,
    Device,
    Unified,
    Pinned,
    Mapped,
}
148
/// Layer-wise latency analysis
///
/// Detailed per-layer timing breakdown consumed by
/// `Profiler::get_layer_latency_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyProfile {
    pub layer_name: String,
    pub layer_type: String,
    pub input_shapes: Vec<Vec<usize>>,
    pub output_shapes: Vec<Vec<usize>>,
    pub cpu_time: Duration,
    pub gpu_time: Duration,
    pub memory_copy_time: Duration,
    pub sync_time: Duration,
    pub parameter_count: usize,
    /// Floating-point operation count; divided by `gpu_time` to derive
    /// FLOP/s in the latency analysis.
    pub flops: u64,
    pub memory_footprint_bytes: usize,
    /// Cache hit rate; reported as `memory_bandwidth_utilization` in
    /// the latency analysis.
    pub cache_hit_rate: f64,
}

/// I/O operation profiling
///
/// Completed-operation profile produced by
/// `IoMonitor::finish_io_operation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoProfile {
    pub operation_type: IoOperationType,
    /// Source/target path; currently always `None` (not yet wired up).
    pub file_path: Option<String>,
    pub bytes_transferred: usize,
    pub duration: Duration,
    /// Throughput in MB/s (0.0 for zero-length durations).
    pub bandwidth_mb_s: f64,
    /// Estimated time spent queued (simplified model: 10 ms per
    /// still-queued operation).
    pub queue_time: Duration,
    pub device_type: IoDeviceType,
}

/// Kind of I/O operation being profiled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IoOperationType {
    FileRead,
    FileWrite,
    NetworkRead,
    NetworkWrite,
    DatabaseQuery,
    CacheLoad,
    CacheStore,
}

/// Storage/transport backing an I/O operation.
///
/// `Eq + Hash` so it can key the bandwidth-stats map returned by
/// `Profiler::get_io_bandwidth_stats`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum IoDeviceType {
    SSD,
    HDD,
    Network,
    Memory,
    Cache,
}
197
/// CPU bottleneck analysis
///
/// Per-thread CPU performance counters plus the hottest functions.
/// Currently filled with placeholder values by
/// `Profiler::analyze_cpu_bottlenecks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuBottleneckAnalysis {
    pub thread_id: u64,
    pub cpu_usage: f64,
    pub context_switches: u64,
    pub cache_misses: u64,
    pub instructions_per_cycle: f64,
    pub branch_mispredictions: u64,
    pub hot_functions: Vec<HotFunction>,
    pub bottleneck_score: f64,
}

/// A function that dominates CPU time in a profile.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HotFunction {
    pub function_name: String,
    /// Share of self time, in percent.
    pub self_time_percentage: f64,
    pub call_count: usize,
    pub avg_time_per_call: Duration,
}
218
/// Memory allocation tracker
///
/// Book-keeps live allocations and running totals; shared behind an
/// `Arc<Mutex<_>>` by the profiler.
#[derive(Debug)]
pub struct MemoryTracker {
    /// Live (not-yet-freed) allocations, keyed by id.
    allocations: HashMap<Uuid, MemoryAllocation>,
    /// Bytes currently allocated (live).
    total_allocated: usize,
    /// High-water mark of `total_allocated`.
    peak_allocated: usize,
    allocation_count: usize,
    deallocation_count: usize,
}

impl Default for MemoryTracker {
    fn default() -> Self {
        Self::new()
    }
}
234
235impl MemoryTracker {
236    pub fn new() -> Self {
237        Self {
238            allocations: HashMap::new(),
239            total_allocated: 0,
240            peak_allocated: 0,
241            allocation_count: 0,
242            deallocation_count: 0,
243        }
244    }
245
246    pub fn track_allocation(&mut self, allocation: MemoryAllocation) {
247        self.total_allocated += allocation.size_bytes;
248        self.allocation_count += 1;
249
250        if self.total_allocated > self.peak_allocated {
251            self.peak_allocated = self.total_allocated;
252        }
253
254        self.allocations.insert(allocation.allocation_id, allocation);
255    }
256
257    pub fn track_deallocation(&mut self, allocation_id: Uuid) {
258        if let Some(mut allocation) = self.allocations.remove(&allocation_id) {
259            allocation.freed = true;
260            allocation.free_timestamp = Some(SystemTime::now());
261            self.total_allocated = self.total_allocated.saturating_sub(allocation.size_bytes);
262            self.deallocation_count += 1;
263        }
264    }
265
266    pub fn get_memory_stats(&self) -> MemoryStats {
267        MemoryStats {
268            total_allocated: self.total_allocated,
269            peak_allocated: self.peak_allocated,
270            active_allocations: self.allocations.len(),
271            allocation_count: self.allocation_count,
272            deallocation_count: self.deallocation_count,
273            memory_efficiency: if self.allocation_count > 0 {
274                self.deallocation_count as f64 / self.allocation_count as f64
275            } else {
276                1.0
277            },
278        }
279    }
280}
281
/// Aggregate allocation counters reported by
/// `MemoryTracker::get_memory_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryStats {
    pub total_allocated: usize,
    pub peak_allocated: usize,
    pub active_allocations: usize,
    pub allocation_count: usize,
    pub deallocation_count: usize,
    /// Ratio of deallocations to allocations (1.0 when nothing was
    /// ever allocated).
    pub memory_efficiency: f64,
}

/// GPU profiler for kernel analysis
///
/// Collects per-stream kernel profiles. Construction is simplified and
/// does not talk to a real CUDA/ROCm runtime.
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
    #[allow(dead_code)]
    device_count: i32,
    /// Recorded kernel profiles keyed by stream id.
    active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
    memory_pools: HashMap<i32, GpuMemoryPool>,
}

/// Per-device GPU memory-pool snapshot (bookkeeping only; never
/// populated in this file).
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
    #[allow(dead_code)]
    device_id: i32,
    total_memory: usize,
    free_memory: usize,
    fragmentation_score: f64,
}
311
312impl GpuProfiler {
313    pub fn new() -> Result<Self> {
314        // In practice, this would initialize CUDA/ROCm profiling
315        Ok(Self {
316            device_count: 1, // Simplified
317            active_streams: HashMap::new(),
318            memory_pools: HashMap::new(),
319        })
320    }
321
322    pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
323        self.active_streams
324            .entry(kernel_profile.stream_id)
325            .or_default()
326            .push(kernel_profile);
327    }
328
329    pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
330        // Simplified GPU utilization calculation
331        if let Some(kernels) = self.active_streams.get(&device_id) {
332            if kernels.is_empty() {
333                0.0
334            } else {
335                kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
336            }
337        } else {
338            0.0
339        }
340    }
341}
342
/// I/O operation monitor
///
/// Tracks in-flight operations and a rolling history of bandwidth
/// samples (capped at 1000 entries).
#[derive(Debug)]
pub struct IoMonitor {
    /// In-flight operations keyed by id.
    active_operations: HashMap<Uuid, IoOperation>,
    /// Rolling bandwidth samples; oldest half dropped past 1000.
    bandwidth_history: Vec<BandwidthSample>,
    /// Number of currently queued/in-flight operations.
    io_queue_depth: usize,
}
#[allow(dead_code)]
#[derive(Debug)]
pub struct IoOperation {
    #[allow(dead_code)]
    operation_id: Uuid,
    /// Start instant used to derive the operation's duration.
    start_time: Instant,
    operation_type: IoOperationType,
    /// Bytes the caller expects to transfer (informational only).
    bytes_expected: usize,
}

/// One measured bandwidth data point.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BandwidthSample {
    pub timestamp: SystemTime,
    pub bandwidth_mb_s: f64,
    pub device_type: IoDeviceType,
}

impl Default for IoMonitor {
    fn default() -> Self {
        Self::new()
    }
}
372
373impl IoMonitor {
374    pub fn new() -> Self {
375        Self {
376            active_operations: HashMap::new(),
377            bandwidth_history: Vec::new(),
378            io_queue_depth: 0,
379        }
380    }
381
382    pub fn start_io_operation(
383        &mut self,
384        operation_type: IoOperationType,
385        bytes_expected: usize,
386    ) -> Uuid {
387        let operation_id = Uuid::new_v4();
388        let operation = IoOperation {
389            operation_id,
390            start_time: Instant::now(),
391            operation_type,
392            bytes_expected,
393        };
394
395        self.active_operations.insert(operation_id, operation);
396        self.io_queue_depth += 1;
397        operation_id
398    }
399
400    pub fn finish_io_operation(
401        &mut self,
402        operation_id: Uuid,
403        bytes_transferred: usize,
404    ) -> Option<IoProfile> {
405        if let Some(operation) = self.active_operations.remove(&operation_id) {
406            let duration = operation.start_time.elapsed();
407            let bandwidth_mb_s = if duration.as_secs_f64() > 0.0 {
408                bytes_transferred as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()
409            } else {
410                0.0
411            };
412
413            self.io_queue_depth = self.io_queue_depth.saturating_sub(1);
414
415            let device_type = match operation.operation_type {
416                IoOperationType::FileRead | IoOperationType::FileWrite => IoDeviceType::SSD,
417                IoOperationType::NetworkRead | IoOperationType::NetworkWrite => {
418                    IoDeviceType::Network
419                },
420                IoOperationType::CacheLoad | IoOperationType::CacheStore => IoDeviceType::Cache,
421                _ => IoDeviceType::Memory,
422            };
423
424            // Record bandwidth sample
425            self.bandwidth_history.push(BandwidthSample {
426                timestamp: SystemTime::now(),
427                bandwidth_mb_s,
428                device_type: device_type.clone(),
429            });
430
431            // Keep only recent samples
432            if self.bandwidth_history.len() > 1000 {
433                self.bandwidth_history.drain(0..500);
434            }
435
436            Some(IoProfile {
437                operation_type: operation.operation_type,
438                file_path: None, // Would be filled in practice
439                bytes_transferred,
440                duration,
441                bandwidth_mb_s,
442                queue_time: Duration::from_millis(self.io_queue_depth as u64 * 10), // Simplified
443                device_type,
444            })
445        } else {
446            None
447        }
448    }
449
450    pub fn get_average_bandwidth(&self, device_type: &IoDeviceType) -> f64 {
451        let samples: Vec<f64> = self
452            .bandwidth_history
453            .iter()
454            .filter(|s| s.device_type == *device_type)
455            .map(|s| s.bandwidth_mb_s)
456            .collect();
457
458        if samples.is_empty() {
459            0.0
460        } else {
461            samples.iter().sum::<f64>() / samples.len() as f64
462        }
463    }
464}
465
/// Performance profiler
///
/// Central collector: accumulates events, per-layer profiles, memory
/// snapshots, and the enhanced GPU/memory/I-O profiling data.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    /// All recorded profiling events, in arrival order.
    events: Vec<ProfileEvent>,
    /// Timers started via `start_timer`, keyed by name.
    active_timers: HashMap<String, Instant>,
    /// Rolling memory snapshots (capped at 1000).
    memory_snapshots: Vec<MemorySnapshot>,
    /// Set by `start()`; used to compute total runtime in reports.
    start_time: Option<Instant>,
    layer_profiles: HashMap<String, LayerProfile>,
    bottlenecks: Vec<PerformanceBottleneck>,
    // Enhanced profiling features
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    /// Per-allocation records (kept even after they are freed).
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    /// Shared aggregate tracker; accessors degrade gracefully if the
    /// lock is poisoned.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    /// `None` when GPU profiler construction failed.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}

/// Per-layer accumulated timing/memory samples.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    forward_times: Vec<Duration>,
    /// Only gains an entry when a backward time was reported.
    backward_times: Vec<Duration>,
    memory_usage: Vec<usize>,
    /// Total number of recorded executions for this layer.
    call_count: usize,
}
497
impl LayerProfile {
    /// Get forward execution times (one entry per recorded execution)
    pub fn forward_times(&self) -> &Vec<Duration> {
        &self.forward_times
    }

    /// Get backward execution times (may be shorter than
    /// `forward_times`, since backward timing is optional)
    pub fn backward_times(&self) -> &Vec<Duration> {
        &self.backward_times
    }

    /// Get memory usage samples (one entry per recorded execution)
    pub fn memory_usage(&self) -> &Vec<usize> {
        &self.memory_usage
    }

    /// Get total number of calls
    pub fn call_count(&self) -> usize {
        self.call_count
    }
}
519
impl Profiler {
    /// Create a new profiler
    ///
    /// All collections start empty. The GPU profiler is optional and
    /// silently absent if its construction fails
    /// (`GpuProfiler::new().ok()`).
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            // Enhanced profiling features
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
542
    /// Start profiling session
    ///
    /// Records the session start instant (used for total runtime in
    /// reports) and takes an initial memory snapshot.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }

    /// Get reference to profiling events
    pub fn get_events(&self) -> &Vec<ProfileEvent> {
        &self.events
    }

    /// Start timing a function or operation
    ///
    /// Starting a timer whose name is already active overwrites the
    /// previous start instant.
    pub fn start_timer(&mut self, name: &str) {
        self.active_timers.insert(name.to_string(), Instant::now());
    }

    /// End timing and record the event
    ///
    /// Returns the elapsed duration, or `None` (after logging a
    /// warning) when no timer with this name was started.
    pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
        if let Some(start_time) = self.active_timers.remove(name) {
            let duration = start_time.elapsed();

            // Record basic function call event
            self.events.push(ProfileEvent::FunctionCall {
                function_name: name.to_string(),
                duration,
                memory_delta: 0, // Would need actual memory tracking
            });

            Some(duration)
        } else {
            tracing::warn!("Timer '{}' was not started", name);
            None
        }
    }
579
580    /// Record layer execution timing
581    pub fn record_layer_execution(
582        &mut self,
583        layer_name: &str,
584        layer_type: &str,
585        forward_time: Duration,
586        backward_time: Option<Duration>,
587        memory_usage: usize,
588        parameter_count: usize,
589    ) {
590        // Record event
591        self.events.push(ProfileEvent::LayerExecution {
592            layer_name: layer_name.to_string(),
593            layer_type: layer_type.to_string(),
594            forward_time,
595            backward_time,
596            memory_usage,
597            parameter_count,
598        });
599
600        // Update layer profile
601        let profile =
602            self.layer_profiles
603                .entry(layer_name.to_string())
604                .or_insert_with(|| LayerProfile {
605                    layer_name: layer_name.to_string(),
606                    forward_times: Vec::new(),
607                    backward_times: Vec::new(),
608                    memory_usage: Vec::new(),
609                    call_count: 0,
610                });
611
612        profile.forward_times.push(forward_time);
613        if let Some(backward) = backward_time {
614            profile.backward_times.push(backward);
615        }
616        profile.memory_usage.push(memory_usage);
617        profile.call_count += 1;
618    }
619
    /// Record tensor operation timing
    ///
    /// Appends a `TensorOperation` event; the shape slice is copied
    /// into an owned `Vec`.
    pub fn record_tensor_operation(
        &mut self,
        operation: &str,
        tensor_shape: &[usize],
        duration: Duration,
        memory_allocated: usize,
    ) {
        self.events.push(ProfileEvent::TensorOperation {
            operation: operation.to_string(),
            tensor_shape: tensor_shape.to_vec(),
            duration,
            memory_allocated,
        });
    }
635
636    /// Record model inference timing
637    pub fn record_model_inference(
638        &mut self,
639        batch_size: usize,
640        sequence_length: usize,
641        duration: Duration,
642    ) {
643        let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
644
645        self.events.push(ProfileEvent::ModelInference {
646            batch_size,
647            sequence_length,
648            duration,
649            tokens_per_second,
650        });
651    }
652
    /// Record gradient computation timing
    ///
    /// Appends a `GradientComputation` event for the named layer.
    pub fn record_gradient_computation(
        &mut self,
        layer_name: &str,
        gradient_norm: f64,
        duration: Duration,
    ) {
        self.events.push(ProfileEvent::GradientComputation {
            layer_name: layer_name.to_string(),
            gradient_norm,
            duration,
        });
    }
666
    /// Take a memory usage snapshot
    ///
    /// Currently records placeholder values (zeros / `None`); real
    /// readings would come from system APIs. The snapshot list is
    /// capped at 1000 entries by dropping the oldest half.
    pub fn take_memory_snapshot(&mut self) {
        // Simplified memory tracking - in practice would use system APIs
        let snapshot = MemorySnapshot {
            timestamp: chrono::Utc::now(),
            heap_allocated: 0, // Would get from system
            heap_used: 0,
            stack_size: 0,
            gpu_allocated: None,
            gpu_used: None,
        };

        self.memory_snapshots.push(snapshot);

        // Keep only recent snapshots to prevent memory growth
        if self.memory_snapshots.len() > 1000 {
            self.memory_snapshots.drain(0..500);
        }
    }
686
    /// Analyze performance and detect bottlenecks
    ///
    /// Clears previous findings, then runs the layer/memory/tensor
    /// analyses (defined elsewhere in this file), which append to
    /// `self.bottlenecks`. Returns a clone of the accumulated findings.
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        self.bottlenecks.clear();

        // Analyze layer execution times
        self.analyze_layer_bottlenecks();

        // Analyze memory usage patterns
        self.analyze_memory_bottlenecks();

        // Analyze tensor operation efficiency
        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
702
703    /// Get profiling statistics
704    pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
705        let mut stats = HashMap::new();
706
707        // Group events by type
708        let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
709
710        for event in &self.events {
711            let event_type = match event {
712                ProfileEvent::FunctionCall { .. } => "FunctionCall",
713                ProfileEvent::LayerExecution { .. } => "LayerExecution",
714                ProfileEvent::TensorOperation { .. } => "TensorOperation",
715                ProfileEvent::ModelInference { .. } => "ModelInference",
716                ProfileEvent::GradientComputation { .. } => "GradientComputation",
717            };
718
719            grouped_events.entry(event_type.to_string()).or_default().push(event);
720        }
721
722        // Calculate statistics for each event type
723        for (event_type, events) in grouped_events {
724            let durations: Vec<Duration> = events
725                .iter()
726                .filter_map(|event| match event {
727                    ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
728                    ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
729                    ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
730                    ProfileEvent::ModelInference { duration, .. } => Some(*duration),
731                    ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
732                })
733                .collect();
734
735            if !durations.is_empty() {
736                let total_duration: Duration = durations.iter().sum();
737                let avg_duration = total_duration / durations.len() as u32;
738                let min_duration = durations.iter().min().copied().unwrap_or_default();
739                let max_duration = durations.iter().max().copied().unwrap_or_default();
740
741                stats.insert(
742                    event_type.clone(),
743                    ProfileStats {
744                        event_type,
745                        count: durations.len(),
746                        total_duration,
747                        avg_duration,
748                        min_duration,
749                        max_duration,
750                        total_memory: 0, // Simplified
751                        avg_memory: 0.0,
752                    },
753                );
754            }
755        }
756
757        stats
758    }
759
    /// Get layer-specific performance profiles, keyed by layer name
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }

    /// Get memory usage over time (rolling snapshots, oldest first)
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }
769
    /// Generate performance report
    ///
    /// Combines current statistics, previously detected bottlenecks,
    /// and derived analyses (slowest layers, memory efficiency,
    /// recommendations — helpers defined elsewhere in this file).
    /// Runtime is zero if `start()` was never called.
    pub async fn generate_report(&self) -> Result<ProfilerReport> {
        let statistics = self.get_statistics();
        let bottlenecks = self.bottlenecks.clone();
        let total_events = self.events.len();

        let total_runtime =
            if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };

        // Calculate slowest layers
        let slowest_layers = self.get_slowest_layers(5);

        // Memory efficiency analysis
        let memory_efficiency = self.analyze_memory_efficiency();

        Ok(ProfilerReport {
            total_events,
            total_runtime,
            statistics,
            bottlenecks,
            slowest_layers,
            memory_efficiency,
            recommendations: self.generate_performance_recommendations(),
        })
    }
795
    /// Clear all profiling data
    ///
    /// Resets every collection and the session start time; the shared
    /// memory tracker is replaced with a fresh one (skipped if its
    /// mutex is poisoned) and the I/O monitor is recreated.
    pub fn clear(&mut self) {
        self.events.clear();
        self.active_timers.clear();
        self.memory_snapshots.clear();
        self.layer_profiles.clear();
        self.bottlenecks.clear();
        self.start_time = None;
        // Clear enhanced profiling data
        self.gpu_kernel_profiles.clear();
        self.memory_allocations.clear();
        self.layer_latency_profiles.clear();
        self.io_profiles.clear();
        self.cpu_bottleneck_analysis.clear();
        if let Ok(mut tracker) = self.memory_tracker.lock() {
            *tracker = MemoryTracker::new();
        }
        self.io_monitor = IoMonitor::new();
    }
815
    // Enhanced profiling methods

    /// Profile GPU kernel execution
    ///
    /// The profile is stored both in the per-stream GPU profiler (when
    /// available) and in the flat `gpu_kernel_profiles` list, hence
    /// the clone.
    pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
        if let Some(ref mut gpu_profiler) = self.gpu_profiler {
            gpu_profiler.profile_kernel(kernel_profile.clone());
        }
        self.gpu_kernel_profiles.push(kernel_profile);
    }
825
826    /// Track memory allocation
827    pub fn track_memory_allocation(
828        &mut self,
829        size_bytes: usize,
830        allocation_type: MemoryAllocationType,
831        device_id: Option<i32>,
832        stack_trace: Vec<String>,
833    ) -> Uuid {
834        let allocation_id = Uuid::new_v4();
835        let allocation = MemoryAllocation {
836            allocation_id,
837            size_bytes,
838            allocation_type,
839            device_id,
840            timestamp: SystemTime::now(),
841            stack_trace,
842            freed: false,
843            free_timestamp: None,
844        };
845
846        if let Ok(mut tracker) = self.memory_tracker.lock() {
847            tracker.track_allocation(allocation.clone());
848        }
849
850        self.memory_allocations.insert(allocation_id, allocation);
851        allocation_id
852    }
853
    /// Track memory deallocation
    ///
    /// Marks the local record as freed (it remains in
    /// `memory_allocations` for history), while the shared tracker
    /// removes it from its live set and updates the aggregate
    /// counters. Unknown ids are ignored by both paths.
    pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
        if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
            allocation.freed = true;
            allocation.free_timestamp = Some(SystemTime::now());
        }

        if let Ok(mut tracker) = self.memory_tracker.lock() {
            tracker.track_deallocation(allocation_id);
        }
    }
865
    /// Profile layer latency with detailed breakdown
    ///
    /// Keyed by layer name; a later profile for the same layer
    /// replaces the earlier one.
    pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
        self.layer_latency_profiles
            .insert(layer_latency.layer_name.clone(), layer_latency);
    }

    /// Start I/O operation profiling
    ///
    /// Returns the operation id to pass to `finish_io_profiling`.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }

    /// Finish I/O operation profiling
    ///
    /// Stores the resulting profile; unknown ids are silently dropped.
    pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
        if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
        {
            self.io_profiles.push(profile);
        }
    }
888
    /// Analyze CPU bottlenecks
    ///
    /// Returns (and appends to history) a single analysis built from
    /// hard-coded placeholder values; real counters would require
    /// system profiling APIs.
    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
        // Simplified CPU bottleneck analysis
        // In practice, this would use system profiling APIs
        let analysis = CpuBottleneckAnalysis {
            thread_id: 0, // Use 0 as placeholder since thread::current().id().as_u64() is unstable
            cpu_usage: 0.75, // Simplified
            context_switches: 1000,
            cache_misses: 500,
            instructions_per_cycle: 2.5,
            branch_mispredictions: 100,
            hot_functions: vec![
                HotFunction {
                    function_name: "tensor_multiply".to_string(),
                    self_time_percentage: 25.0,
                    call_count: 1000,
                    avg_time_per_call: Duration::from_micros(250),
                },
                HotFunction {
                    function_name: "gradient_computation".to_string(),
                    self_time_percentage: 20.0,
                    call_count: 500,
                    avg_time_per_call: Duration::from_micros(400),
                },
            ],
            bottleneck_score: 0.6,
        };

        self.cpu_bottleneck_analysis.push(analysis.clone());
        vec![analysis]
    }
920
921    /// Get memory allocation statistics
922    pub fn get_memory_stats(&self) -> Option<MemoryStats> {
923        if let Ok(tracker) = self.memory_tracker.lock() {
924            Some(tracker.get_memory_stats())
925        } else {
926            None
927        }
928    }
929
930    /// Get GPU utilization metrics
931    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
932        self.gpu_profiler
933            .as_ref()
934            .map(|profiler| profiler.get_gpu_utilization(device_id))
935    }
936
937    /// Get I/O bandwidth statistics
938    pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
939        let mut stats = HashMap::new();
940
941        stats.insert(
942            IoDeviceType::SSD,
943            self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
944        );
945        stats.insert(
946            IoDeviceType::HDD,
947            self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
948        );
949        stats.insert(
950            IoDeviceType::Network,
951            self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
952        );
953        stats.insert(
954            IoDeviceType::Memory,
955            self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
956        );
957        stats.insert(
958            IoDeviceType::Cache,
959            self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
960        );
961
962        stats
963    }
964
965    /// Get layer latency analysis
966    pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
967        self.layer_latency_profiles
968            .values()
969            .map(|profile| LayerLatencyAnalysis {
970                layer_name: profile.layer_name.clone(),
971                layer_type: profile.layer_type.clone(),
972                total_time: profile.cpu_time
973                    + profile.gpu_time
974                    + profile.memory_copy_time
975                    + profile.sync_time,
976                cpu_percentage: profile.cpu_time.as_secs_f64()
977                    / (profile.cpu_time
978                        + profile.gpu_time
979                        + profile.memory_copy_time
980                        + profile.sync_time)
981                        .as_secs_f64()
982                    * 100.0,
983                gpu_percentage: profile.gpu_time.as_secs_f64()
984                    / (profile.cpu_time
985                        + profile.gpu_time
986                        + profile.memory_copy_time
987                        + profile.sync_time)
988                        .as_secs_f64()
989                    * 100.0,
990                memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
991                    / (profile.cpu_time
992                        + profile.gpu_time
993                        + profile.memory_copy_time
994                        + profile.sync_time)
995                        .as_secs_f64()
996                    * 100.0,
997                flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
998                    profile.flops as f64 / profile.gpu_time.as_secs_f64()
999                } else {
1000                    0.0
1001                },
1002                memory_bandwidth_utilization: profile.cache_hit_rate,
1003                bottleneck_type: self.identify_layer_bottleneck(profile),
1004            })
1005            .collect()
1006    }
1007
1008    /// Get comprehensive performance analysis
1009    pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
1010        let memory_stats = self.get_memory_stats();
1011        let io_bandwidth_stats = self.get_io_bandwidth_stats();
1012        let layer_analysis = self.get_layer_latency_analysis();
1013
1014        let gpu_utilization =
1015            self.gpu_profiler.as_ref().map(|profiler| profiler.get_gpu_utilization(0));
1016
1017        PerformanceAnalysis {
1018            memory_stats,
1019            io_bandwidth_stats,
1020            layer_analysis,
1021            gpu_utilization,
1022            cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
1023            total_gpu_kernels: self.gpu_kernel_profiles.len(),
1024            total_io_operations: self.io_profiles.len(),
1025            performance_score: self.calculate_overall_performance_score(),
1026            recommendations: self.generate_enhanced_recommendations(),
1027        }
1028    }
1029
1030    fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
1031        let total_time =
1032            profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;
1033
1034        if profile.memory_copy_time > total_time / 2 {
1035            "Memory Bandwidth".to_string()
1036        } else if profile.sync_time > total_time / 3 {
1037            "GPU Synchronization".to_string()
1038        } else if profile.gpu_time > profile.cpu_time * 10 {
1039            "GPU Compute".to_string()
1040        } else {
1041            "CPU Compute".to_string()
1042        }
1043    }
1044
1045    fn calculate_overall_performance_score(&self) -> f64 {
1046        let mut score: f64 = 100.0;
1047
1048        // Deduct for bottlenecks
1049        for bottleneck in &self.bottlenecks {
1050            match bottleneck.severity {
1051                BottleneckSeverity::Critical => score -= 20.0,
1052                BottleneckSeverity::High => score -= 10.0,
1053                BottleneckSeverity::Medium => score -= 5.0,
1054                BottleneckSeverity::Low => score -= 2.0,
1055            }
1056        }
1057
1058        // Deduct for poor GPU utilization
1059        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1060            if gpu_util < 0.5 {
1061                score -= 15.0;
1062            } else if gpu_util < 0.7 {
1063                score -= 8.0;
1064            }
1065        }
1066
1067        // Deduct for memory inefficiency
1068        if let Some(memory_stats) = self.get_memory_stats() {
1069            if memory_stats.memory_efficiency < 0.8 {
1070                score -= 10.0;
1071            }
1072        }
1073
1074        score.max(0.0)
1075    }
1076
1077    fn generate_enhanced_recommendations(&self) -> Vec<String> {
1078        let mut recommendations = Vec::new();
1079
1080        // GPU utilization recommendations
1081        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1082            if gpu_util < 0.5 {
1083                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
1084            }
1085        }
1086
1087        // Memory recommendations
1088        if let Some(memory_stats) = self.get_memory_stats() {
1089            if memory_stats.memory_efficiency < 0.8 {
1090                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
1091            }
1092
1093            if memory_stats.active_allocations > 10000 {
1094                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
1095            }
1096        }
1097
1098        // I/O recommendations
1099        let io_stats = self.get_io_bandwidth_stats();
1100        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
1101            if ssd_bandwidth < 100.0 {
1102                // Less than 100 MB/s
1103                recommendations.push(
1104                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
1105                        .to_string(),
1106                );
1107            }
1108        }
1109
1110        // Layer-specific recommendations
1111        let layer_analysis = self.get_layer_latency_analysis();
1112        for analysis in &layer_analysis {
1113            if analysis.memory_copy_percentage > 50.0 {
1114                recommendations.push(format!(
1115                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
1116                    analysis.layer_name
1117                ));
1118            }
1119
1120            if analysis.cpu_percentage > 80.0 {
1121                recommendations.push(format!(
1122                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
1123                    analysis.layer_name
1124                ));
1125            }
1126        }
1127
1128        if recommendations.is_empty() {
1129            recommendations
1130                .push("Performance appears optimal based on current analysis.".to_string());
1131        }
1132
1133        recommendations
1134    }
1135
1136    // Private analysis methods
1137
1138    fn analyze_layer_bottlenecks(&mut self) {
1139        for (layer_name, profile) in &self.layer_profiles {
1140            if profile.forward_times.is_empty() {
1141                continue;
1142            }
1143
1144            let avg_forward_time =
1145                profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
1146
1147            // Consider a layer slow if it takes more than 100ms on average
1148            if avg_forward_time.as_millis() > 100 {
1149                let mut metrics = HashMap::new();
1150                metrics.insert(
1151                    "avg_forward_time_ms".to_string(),
1152                    avg_forward_time.as_millis() as f64,
1153                );
1154                metrics.insert("call_count".to_string(), profile.call_count as f64);
1155
1156                self.bottlenecks.push(PerformanceBottleneck {
1157                    bottleneck_type: BottleneckType::ModelComputation,
1158                    location: layer_name.clone(),
1159                    severity: if avg_forward_time.as_millis() > 500 {
1160                        BottleneckSeverity::High
1161                    } else {
1162                        BottleneckSeverity::Medium
1163                    },
1164                    description: format!(
1165                        "Layer '{}' has slow forward pass: {:.1}ms average",
1166                        layer_name,
1167                        avg_forward_time.as_millis()
1168                    ),
1169                    suggestion: "Consider optimizing layer implementation or reducing layer size"
1170                        .to_string(),
1171                    metrics,
1172                });
1173            }
1174        }
1175    }
1176
1177    fn analyze_memory_bottlenecks(&mut self) {
1178        if self.memory_snapshots.len() < 2 {
1179            return;
1180        }
1181
1182        // Check for memory growth trend
1183        let recent_snapshots = if self.memory_snapshots.len() > 10 {
1184            &self.memory_snapshots[self.memory_snapshots.len() - 10..]
1185        } else {
1186            &self.memory_snapshots
1187        };
1188
1189        if recent_snapshots.len() >= 5 {
1190            let initial_memory = recent_snapshots[0].heap_allocated;
1191            let final_memory = recent_snapshots.last().unwrap().heap_allocated;
1192
1193            if final_memory > initial_memory * 2 {
1194                let mut metrics = HashMap::new();
1195                metrics.insert(
1196                    "initial_memory_mb".to_string(),
1197                    initial_memory as f64 / (1024.0 * 1024.0),
1198                );
1199                metrics.insert(
1200                    "final_memory_mb".to_string(),
1201                    final_memory as f64 / (1024.0 * 1024.0),
1202                );
1203                metrics.insert(
1204                    "growth_ratio".to_string(),
1205                    final_memory as f64 / initial_memory as f64,
1206                );
1207
1208                self.bottlenecks.push(PerformanceBottleneck {
1209                    bottleneck_type: BottleneckType::MemoryBound,
1210                    location: "Memory Usage".to_string(),
1211                    severity: BottleneckSeverity::High,
1212                    description: "Significant memory growth detected during profiling".to_string(),
1213                    suggestion: "Check for memory leaks or inefficient memory usage patterns"
1214                        .to_string(),
1215                    metrics,
1216                });
1217            }
1218        }
1219    }
1220
1221    fn analyze_tensor_bottlenecks(&mut self) {
1222        // Group tensor operations by type
1223        let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
1224
1225        for event in &self.events {
1226            if let ProfileEvent::TensorOperation {
1227                operation,
1228                duration,
1229                ..
1230            } = event
1231            {
1232                operation_groups.entry(operation.clone()).or_default().push(*duration);
1233            }
1234        }
1235
1236        // Find slow operations
1237        for (operation, durations) in operation_groups {
1238            if durations.is_empty() {
1239                continue;
1240            }
1241
1242            let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
1243            let total_time = durations.iter().sum::<Duration>();
1244
1245            // Consider operation slow if it takes more than 10ms on average
1246            if avg_duration.as_millis() > 10 {
1247                let mut metrics = HashMap::new();
1248                metrics.insert(
1249                    "avg_duration_ms".to_string(),
1250                    avg_duration.as_millis() as f64,
1251                );
1252                metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
1253                metrics.insert("call_count".to_string(), durations.len() as f64);
1254
1255                self.bottlenecks.push(PerformanceBottleneck {
1256                    bottleneck_type: BottleneckType::CpuBound,
1257                    location: format!("Tensor Operation: {}", operation),
1258                    severity: if avg_duration.as_millis() > 50 {
1259                        BottleneckSeverity::High
1260                    } else {
1261                        BottleneckSeverity::Medium
1262                    },
1263                    description: format!(
1264                        "Tensor operation '{}' is slow: {:.1}ms average",
1265                        operation,
1266                        avg_duration.as_millis()
1267                    ),
1268                    suggestion:
1269                        "Consider optimizing tensor operation or using different data types"
1270                            .to_string(),
1271                    metrics,
1272                });
1273            }
1274        }
1275    }
1276
1277    fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
1278        let mut layer_times: Vec<(String, Duration)> = self
1279            .layer_profiles
1280            .iter()
1281            .map(|(name, profile)| {
1282                let avg_time = if profile.forward_times.is_empty() {
1283                    Duration::ZERO
1284                } else {
1285                    profile.forward_times.iter().sum::<Duration>()
1286                        / profile.forward_times.len() as u32
1287                };
1288                (name.clone(), avg_time)
1289            })
1290            .collect();
1291
1292        layer_times.sort_by(|a, b| b.1.cmp(&a.1));
1293        layer_times.truncate(limit);
1294        layer_times
1295    }
1296
1297    fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
1298        if self.memory_snapshots.is_empty() {
1299            return MemoryEfficiencyAnalysis::default();
1300        }
1301
1302        let memory_values: Vec<usize> =
1303            self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
1304
1305        let max_memory = memory_values.iter().max().copied().unwrap_or(0);
1306        let min_memory = memory_values.iter().min().copied().unwrap_or(0);
1307        let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
1308
1309        MemoryEfficiencyAnalysis {
1310            peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
1311            min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
1312            avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
1313            memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
1314            efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
1315        }
1316    }
1317
1318    fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
1319        if values.len() < 2 {
1320            return 0.0;
1321        }
1322
1323        let variance_sum: f64 = values
1324            .iter()
1325            .map(|&x| {
1326                let diff = x as f64 - mean as f64;
1327                diff * diff
1328            })
1329            .sum();
1330
1331        variance_sum / (values.len() - 1) as f64
1332    }
1333
1334    fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
1335        if values.is_empty() {
1336            return 0.0;
1337        }
1338
1339        let max_memory = values.iter().max().copied().unwrap_or(0);
1340        let min_memory = values.iter().min().copied().unwrap_or(0);
1341
1342        if max_memory == 0 {
1343            return 100.0;
1344        }
1345
1346        // Efficiency score: closer to 100% means more stable memory usage
1347        100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
1348    }
1349
1350    fn generate_performance_recommendations(&self) -> Vec<String> {
1351        let mut recommendations = Vec::new();
1352
1353        // Analyze bottlenecks for recommendations
1354        for bottleneck in &self.bottlenecks {
1355            match bottleneck.bottleneck_type {
1356                BottleneckType::ModelComputation => {
1357                    recommendations.push(
1358                        "Consider model architecture optimizations or layer fusion".to_string(),
1359                    );
1360                },
1361                BottleneckType::MemoryBound => {
1362                    recommendations.push(
1363                        "Optimize memory usage with gradient checkpointing or model parallelism"
1364                            .to_string(),
1365                    );
1366                },
1367                BottleneckType::CpuBound => {
1368                    recommendations.push(
1369                        "Consider GPU acceleration or optimized CPU implementations".to_string(),
1370                    );
1371                },
1372                _ => {},
1373            }
1374        }
1375
1376        // General recommendations based on profiling data
1377        if self.events.len() > 10000 {
1378            recommendations.push(
1379                "High number of profiling events - consider reducing profiling overhead"
1380                    .to_string(),
1381            );
1382        }
1383
1384        let stats = self.get_statistics();
1385        if let Some(layer_stats) = stats.get("LayerExecution") {
1386            if layer_stats.avg_duration.as_millis() > 50 {
1387                recommendations.push(
1388                    "Average layer execution time is high - consider layer optimization"
1389                        .to_string(),
1390                );
1391            }
1392        }
1393
1394        if recommendations.is_empty() {
1395            recommendations
1396                .push("Performance appears optimal based on current profiling data".to_string());
1397        }
1398
1399        recommendations
1400    }
1401}
1402
/// Memory efficiency analysis results
///
/// All sizes are reported in megabytes; `efficiency_score` is a 0-100
/// stability score (100 = perfectly flat memory usage over the session).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryEfficiencyAnalysis {
    // Highest heap allocation observed across snapshots (MB).
    pub peak_memory_mb: f64,
    // Lowest heap allocation observed across snapshots (MB).
    pub min_memory_mb: f64,
    // Mean heap allocation across snapshots (MB).
    pub avg_memory_mb: f64,
    // Sample variance of the raw snapshot sizes (bytes squared).
    pub memory_variance: f64,
    // 0-100; higher means more stable memory usage.
    pub efficiency_score: f64,
}
1412
1413impl Default for MemoryEfficiencyAnalysis {
1414    fn default() -> Self {
1415        Self {
1416            peak_memory_mb: 0.0,
1417            min_memory_mb: 0.0,
1418            avg_memory_mb: 0.0,
1419            memory_variance: 0.0,
1420            efficiency_score: 100.0,
1421        }
1422    }
1423}
1424
/// Profiler report
///
/// Self-contained snapshot of one profiling session, suitable for
/// serialization and offline inspection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilerReport {
    // Number of profiling events recorded during the session.
    pub total_events: usize,
    // Total runtime covered by the profiling session.
    pub total_runtime: Duration,
    // Per-event-type aggregate statistics, keyed by event type name.
    pub statistics: HashMap<String, ProfileStats>,
    // Bottlenecks detected by the analysis passes.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    // Layers ranked by average forward time, slowest first.
    pub slowest_layers: Vec<(String, Duration)>,
    // Heap-usage stability summary.
    pub memory_efficiency: MemoryEfficiencyAnalysis,
    // Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1436
1437/// Scoped timer for automatic timing
1438pub struct ScopedTimer<'a> {
1439    profiler: &'a mut Profiler,
1440    name: String,
1441}
1442
1443impl<'a> ScopedTimer<'a> {
1444    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
1445        profiler.start_timer(&name);
1446        Self { profiler, name }
1447    }
1448}
1449
1450impl<'a> Drop for ScopedTimer<'a> {
1451    fn drop(&mut self) {
1452        self.profiler.end_timer(&self.name);
1453    }
1454}
1455
/// Layer latency analysis result
///
/// Per-layer latency breakdown produced by
/// `Profiler::get_layer_latency_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyAnalysis {
    pub layer_name: String,
    pub layer_type: String,
    // Sum of CPU, GPU, memory-copy and sync time for the layer.
    pub total_time: Duration,
    // Share of `total_time` spent on CPU work (0-100).
    pub cpu_percentage: f64,
    // Share of `total_time` spent on GPU work (0-100).
    pub gpu_percentage: f64,
    // Share of `total_time` spent copying memory (0-100).
    pub memory_copy_percentage: f64,
    // Recorded FLOPs divided by GPU time; 0.0 when no GPU time.
    pub flops_per_second: f64,
    // Currently populated from the layer's cache hit rate.
    pub memory_bandwidth_utilization: f64,
    // Heuristic label, e.g. "Memory Bandwidth" or "CPU Compute".
    pub bottleneck_type: String,
}
1469
/// Comprehensive performance analysis
///
/// Aggregated view combining memory, I/O, layer and GPU metrics with an
/// overall score and recommendations.
#[derive(Debug, Serialize, Deserialize)]
pub struct PerformanceAnalysis {
    // Memory allocation statistics; `None` if the tracker was unavailable.
    pub memory_stats: Option<MemoryStats>,
    // Average bandwidth per I/O device class.
    pub io_bandwidth_stats: HashMap<IoDeviceType, f64>,
    // Per-layer latency breakdowns.
    pub layer_analysis: Vec<LayerLatencyAnalysis>,
    // Utilization of GPU device 0; `None` without a GPU profiler.
    pub gpu_utilization: Option<f64>,
    // Recorded CPU bottleneck analyses.
    pub cpu_bottlenecks: Vec<CpuBottleneckAnalysis>,
    // Number of profiled GPU kernels.
    pub total_gpu_kernels: usize,
    // Number of profiled I/O operations.
    pub total_io_operations: usize,
    // Overall score in [0, 100]; higher is better.
    pub performance_score: f64,
    // Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1483
/// Enhanced profiler report
///
/// Extends the basic `ProfilerReport` with aggregate analysis and
/// GPU/memory/I/O summaries.
#[derive(Debug, Serialize, Deserialize)]
pub struct EnhancedProfilerReport {
    // The standard report this enhanced report builds on.
    pub basic_report: ProfilerReport,
    // Aggregated cross-subsystem performance analysis.
    pub performance_analysis: PerformanceAnalysis,
    // GPU kernel execution summary.
    pub gpu_kernel_summary: GpuKernelSummary,
    // Memory allocation summary.
    pub memory_allocation_summary: MemoryAllocationSummary,
    // I/O performance summary.
    pub io_performance_summary: IoPerformanceSummary,
}
1493
/// Summary of profiled GPU kernel executions.
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
    // Number of kernel profiles recorded.
    pub total_kernels: usize,
    // Sum of all kernel execution times.
    pub total_execution_time: Duration,
    // Mean occupancy across kernels (0.0 when none recorded).
    pub avg_occupancy: f64,
    // Mean compute utilization across kernels (0.0 when none recorded).
    pub avg_compute_utilization: f64,
    // Names of up to five kernels with the longest execution times.
    pub slowest_kernels: Vec<String>,
}
1502
/// Summary of tracked memory allocations.
#[derive(Debug, Serialize, Deserialize)]
pub struct MemoryAllocationSummary {
    // Total number of tracked allocations.
    pub total_allocations: usize,
    // Size in bytes of the largest single tracked allocation.
    pub peak_memory_usage: usize,
    // Allocation efficiency from the memory tracker (1.0 if unavailable).
    pub memory_efficiency: f64,
    // Descriptions of up to five largest allocations.
    pub largest_allocations: Vec<String>,
    // Count of allocations never marked freed (potential leaks).
    pub memory_leaks: usize,
}
1511
/// Summary of profiled I/O activity.
#[derive(Debug, Serialize, Deserialize)]
pub struct IoPerformanceSummary {
    // Number of I/O operations profiled.
    pub total_operations: usize,
    // Total bytes moved across all operations.
    pub total_bytes_transferred: usize,
    // Average bandwidth per I/O device class.
    pub avg_bandwidth_by_device: HashMap<IoDeviceType, f64>,
    // Descriptions of up to five longest-running operations.
    pub slowest_operations: Vec<String>,
}
1519
1520impl Profiler {
1521    /// Generate enhanced profiler report with advanced metrics
1522    pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
1523        let basic_report = self.generate_report().await?;
1524        let performance_analysis = self.get_performance_analysis();
1525
1526        let gpu_kernel_summary = self.generate_gpu_kernel_summary();
1527        let memory_allocation_summary = self.generate_memory_allocation_summary();
1528        let io_performance_summary = self.generate_io_performance_summary();
1529
1530        Ok(EnhancedProfilerReport {
1531            basic_report,
1532            performance_analysis,
1533            gpu_kernel_summary,
1534            memory_allocation_summary,
1535            io_performance_summary,
1536        })
1537    }
1538
1539    fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
1540        let total_kernels = self.gpu_kernel_profiles.len();
1541        let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();
1542
1543        let avg_occupancy = if total_kernels > 0 {
1544            self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
1545        } else {
1546            0.0
1547        };
1548
1549        let avg_compute_utilization = if total_kernels > 0 {
1550            self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
1551                / total_kernels as f64
1552        } else {
1553            0.0
1554        };
1555
1556        let mut kernels_by_time: Vec<_> = self
1557            .gpu_kernel_profiles
1558            .iter()
1559            .map(|k| (k.kernel_name.clone(), k.execution_time))
1560            .collect();
1561        kernels_by_time.sort_by(|a, b| b.1.cmp(&a.1));
1562
1563        let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();
1564
1565        GpuKernelSummary {
1566            total_kernels,
1567            total_execution_time,
1568            avg_occupancy,
1569            avg_compute_utilization,
1570            slowest_kernels,
1571        }
1572    }
1573
1574    fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
1575        let total_allocations = self.memory_allocations.len();
1576        let peak_memory_usage =
1577            self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);
1578
1579        let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
1580            stats.memory_efficiency
1581        } else {
1582            1.0
1583        };
1584
1585        let mut allocations_by_size: Vec<_> = self
1586            .memory_allocations
1587            .values()
1588            .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
1589            .collect();
1590        allocations_by_size.sort_by(|a, b| b.1.cmp(&a.1));
1591
1592        let largest_allocations =
1593            allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();
1594
1595        let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();
1596
1597        MemoryAllocationSummary {
1598            total_allocations,
1599            peak_memory_usage,
1600            memory_efficiency,
1601            largest_allocations,
1602            memory_leaks,
1603        }
1604    }
1605
1606    fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
1607        let total_operations = self.io_profiles.len();
1608        let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();
1609
1610        let avg_bandwidth_by_device = self.get_io_bandwidth_stats();
1611
1612        let mut operations_by_duration: Vec<_> = self
1613            .io_profiles
1614            .iter()
1615            .map(|io| {
1616                (
1617                    format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
1618                    io.duration,
1619                )
1620            })
1621            .collect();
1622        operations_by_duration.sort_by(|a, b| b.1.cmp(&a.1));
1623
1624        let slowest_operations =
1625            operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();
1626
1627        IoPerformanceSummary {
1628            total_operations,
1629            total_bytes_transferred,
1630            avg_bandwidth_by_device,
1631            slowest_operations,
1632        }
1633    }
1634}
1635
/// Macro for convenient timing
///
/// Expands to a `ScopedTimer` bound to a hidden local, so the named
/// timer runs from this statement until the end of the enclosing scope.
///
/// Usage: `profile_scope!(&mut profiler, "forward_pass");`
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        // The `_timer` binding keeps the guard alive; an unbound temporary
        // would be dropped (and the timer stopped) immediately.
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}