// trustformers_debug/profiler.rs

1//! Performance profiling tools for debugging
2
3use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant, SystemTime};
8use uuid::Uuid;
9
10use crate::DebugConfig;
11
/// Profiling event types
///
/// Each variant is one timed observation recorded by the profiler;
/// events are aggregated later by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProfileEvent {
    /// A timed function/operation, recorded by `end_timer`.
    FunctionCall {
        function_name: String,
        duration: Duration,
        /// Net memory change in bytes (currently always 0 — see `end_timer`).
        memory_delta: i64,
    },
    /// One forward (and optionally backward) pass of a model layer.
    LayerExecution {
        layer_name: String,
        layer_type: String,
        forward_time: Duration,
        /// `None` for inference-only runs.
        backward_time: Option<Duration>,
        memory_usage: usize,
        parameter_count: usize,
    },
    /// A single tensor-level operation.
    TensorOperation {
        operation: String,
        tensor_shape: Vec<usize>,
        duration: Duration,
        memory_allocated: usize,
    },
    /// An end-to-end model inference call.
    ModelInference {
        batch_size: usize,
        sequence_length: usize,
        duration: Duration,
        /// Derived throughput: (batch_size * sequence_length) / duration.
        tokens_per_second: f64,
    },
    /// Gradient computation for one layer.
    GradientComputation {
        layer_name: String,
        gradient_norm: f64,
        duration: Duration,
    },
}
46
/// Profiling statistics for analysis
///
/// Aggregated per event type by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileStats {
    /// Event variant name (e.g. "FunctionCall", "LayerExecution").
    pub event_type: String,
    /// Number of events of this type.
    pub count: usize,
    pub total_duration: Duration,
    pub avg_duration: Duration,
    pub min_duration: Duration,
    pub max_duration: Duration,
    /// Aggregate memory delta in bytes; currently always 0 (not tracked).
    pub total_memory: i64,
    /// Average memory per event; currently always 0.0 (not tracked).
    pub avg_memory: f64,
}
59
/// Memory usage snapshot
///
/// NOTE(review): `Profiler::take_memory_snapshot` currently fills these
/// fields with placeholder zeros / `None`; real values would come from
/// system APIs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    /// When the snapshot was taken (UTC).
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Bytes reserved on the heap.
    pub heap_allocated: usize,
    /// Bytes actually in use on the heap.
    pub heap_used: usize,
    pub stack_size: usize,
    /// GPU memory figures; `None` when no GPU is tracked.
    pub gpu_allocated: Option<usize>,
    pub gpu_used: Option<usize>,
}
70
/// Performance bottleneck detection
///
/// One detected bottleneck, produced by `Profiler::analyze_performance`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    pub bottleneck_type: BottleneckType,
    /// Where the bottleneck was observed (e.g. a layer or function name).
    pub location: String,
    pub severity: BottleneckSeverity,
    /// Human-readable explanation of the finding.
    pub description: String,
    /// Suggested remediation.
    pub suggestion: String,
    /// Supporting metric values keyed by metric name.
    pub metrics: HashMap<String, f64>,
}
81
/// Broad category of a detected bottleneck.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CpuBound,
    MemoryBound,
    IoBound,
    GpuBound,
    NetworkBound,
    DataLoading,
    ModelComputation,
    GradientComputation,
}
93
/// Severity ranking for a detected bottleneck, ordered least to most urgent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
101
/// CPU profiling information
///
/// A node in a CPU-time call tree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuProfile {
    pub function_name: String,
    /// Time in this function itself — presumably excluding `children`;
    /// TODO confirm, no producer is visible in this file.
    pub self_time: Duration,
    /// Time including callees.
    pub total_time: Duration,
    pub call_count: usize,
    /// Profiles of functions called from this one.
    pub children: Vec<CpuProfile>,
}
111
/// Enhanced GPU kernel profiling
///
/// One recorded kernel launch; consumed by `GpuProfiler::profile_kernel`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
    pub kernel_name: String,
    /// Launch grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Thread-block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    pub shared_memory_bytes: usize,
    pub registers_per_thread: u32,
    /// Achieved occupancy — assumed to be in [0, 1]; confirm with producer.
    pub occupancy: f64,
    pub execution_time: Duration,
    pub memory_bandwidth_gb_s: f64,
    /// Averaged by `GpuProfiler::get_gpu_utilization`.
    pub compute_utilization: f64,
    /// Stream the kernel ran on; used as the grouping key in `GpuProfiler`.
    pub stream_id: i32,
}
126
/// Memory allocation tracking
///
/// A single tracked allocation; created by
/// `Profiler::track_memory_allocation` and updated on deallocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    /// Unique handle returned to the caller for later deallocation.
    pub allocation_id: Uuid,
    pub size_bytes: usize,
    pub allocation_type: MemoryAllocationType,
    /// Device ordinal for device-side allocations; `None` for host-only.
    pub device_id: Option<i32>,
    /// When the allocation was made.
    pub timestamp: SystemTime,
    /// Call-site stack captured by the caller (may be empty).
    pub stack_trace: Vec<String>,
    /// Set once the allocation is released.
    pub freed: bool,
    pub free_timestamp: Option<SystemTime>,
}
139
/// Kind of memory an allocation lives in.
///
/// The variants mirror common GPU-runtime allocation categories (host,
/// device, unified, pinned, mapped) — exact semantics depend on the
/// backing runtime; none is visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MemoryAllocationType {
    Host,
    Device,
    Unified,
    Pinned,
    Mapped,
}
148
/// Layer-wise latency analysis
///
/// Detailed timing breakdown for one layer, stored by `layer_name` in the
/// profiler and summarized by `get_layer_latency_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyProfile {
    pub layer_name: String,
    pub layer_type: String,
    /// Shapes of each input tensor.
    pub input_shapes: Vec<Vec<usize>>,
    pub output_shapes: Vec<Vec<usize>>,
    /// The next four components are summed to the layer's total time.
    pub cpu_time: Duration,
    pub gpu_time: Duration,
    pub memory_copy_time: Duration,
    pub sync_time: Duration,
    pub parameter_count: usize,
    /// Floating-point operations; divided by `gpu_time` to get FLOP/s.
    pub flops: u64,
    pub memory_footprint_bytes: usize,
    /// Reported downstream as `memory_bandwidth_utilization`.
    pub cache_hit_rate: f64,
}
165
/// I/O operation profiling
///
/// Result of a finished I/O operation, produced by
/// `IoMonitor::finish_io_operation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoProfile {
    pub operation_type: IoOperationType,
    /// Not populated yet — `IoMonitor` always leaves it `None`.
    pub file_path: Option<String>,
    pub bytes_transferred: usize,
    pub duration: Duration,
    /// Throughput in MB/s; 0.0 when the measured duration was zero.
    pub bandwidth_mb_s: f64,
    /// Estimated from queue depth (simplified model), not measured.
    pub queue_time: Duration,
    pub device_type: IoDeviceType,
}
177
/// Kind of I/O operation being profiled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IoOperationType {
    FileRead,
    FileWrite,
    NetworkRead,
    NetworkWrite,
    DatabaseQuery,
    CacheLoad,
    CacheStore,
}
188
/// Device class an I/O operation hit.
///
/// Derives `Eq + Hash` so it can key the bandwidth-stats map in
/// `Profiler::get_io_bandwidth_stats`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum IoDeviceType {
    SSD,
    HDD,
    Network,
    Memory,
    Cache,
}
197
/// CPU bottleneck analysis
///
/// Per-thread CPU health metrics. Currently produced with hard-coded
/// placeholder values by `Profiler::analyze_cpu_bottlenecks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuBottleneckAnalysis {
    pub thread_id: u64,
    /// CPU usage — sample producer uses a [0, 1] fraction (0.75).
    pub cpu_usage: f64,
    pub context_switches: u64,
    pub cache_misses: u64,
    pub instructions_per_cycle: f64,
    pub branch_mispredictions: u64,
    /// Functions consuming the most self time.
    pub hot_functions: Vec<HotFunction>,
    /// Overall severity score — higher means more CPU-bound.
    pub bottleneck_score: f64,
}
210
/// A function that dominates CPU self time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HotFunction {
    pub function_name: String,
    /// Share of total self time attributed to this function (percent).
    pub self_time_percentage: f64,
    pub call_count: usize,
    pub avg_time_per_call: Duration,
}
218
/// Memory allocation tracker
///
/// Keeps the set of live allocations plus running counters; shared by the
/// profiler behind `Arc<Mutex<_>>`.
#[derive(Debug)]
pub struct MemoryTracker {
    /// Live (not yet freed) allocations by id.
    allocations: HashMap<Uuid, MemoryAllocation>,
    /// Bytes currently allocated.
    total_allocated: usize,
    /// High-water mark of `total_allocated`.
    peak_allocated: usize,
    allocation_count: usize,
    deallocation_count: usize,
}
228
229impl MemoryTracker {
230    pub fn new() -> Self {
231        Self {
232            allocations: HashMap::new(),
233            total_allocated: 0,
234            peak_allocated: 0,
235            allocation_count: 0,
236            deallocation_count: 0,
237        }
238    }
239
240    pub fn track_allocation(&mut self, allocation: MemoryAllocation) {
241        self.total_allocated += allocation.size_bytes;
242        self.allocation_count += 1;
243
244        if self.total_allocated > self.peak_allocated {
245            self.peak_allocated = self.total_allocated;
246        }
247
248        self.allocations.insert(allocation.allocation_id, allocation);
249    }
250
251    pub fn track_deallocation(&mut self, allocation_id: Uuid) {
252        if let Some(mut allocation) = self.allocations.remove(&allocation_id) {
253            allocation.freed = true;
254            allocation.free_timestamp = Some(SystemTime::now());
255            self.total_allocated = self.total_allocated.saturating_sub(allocation.size_bytes);
256            self.deallocation_count += 1;
257        }
258    }
259
260    pub fn get_memory_stats(&self) -> MemoryStats {
261        MemoryStats {
262            total_allocated: self.total_allocated,
263            peak_allocated: self.peak_allocated,
264            active_allocations: self.allocations.len(),
265            allocation_count: self.allocation_count,
266            deallocation_count: self.deallocation_count,
267            memory_efficiency: if self.allocation_count > 0 {
268                self.deallocation_count as f64 / self.allocation_count as f64
269            } else {
270                1.0
271            },
272        }
273    }
274}
275
/// Aggregated counters reported by `MemoryTracker::get_memory_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryStats {
    pub total_allocated: usize,
    pub peak_allocated: usize,
    /// Allocations not yet freed.
    pub active_allocations: usize,
    pub allocation_count: usize,
    pub deallocation_count: usize,
    /// deallocations / allocations; 1.0 when nothing was ever allocated.
    pub memory_efficiency: f64,
}
285
/// GPU profiler for kernel analysis
///
/// Groups recorded kernel profiles by stream id; a stub pending real
/// CUDA/ROCm integration.
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
    #[allow(dead_code)]
    device_count: i32,
    /// Kernel profiles keyed by stream id (see `profile_kernel`).
    active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
    memory_pools: HashMap<i32, GpuMemoryPool>,
}
295
/// Per-device GPU memory pool state (currently an unused placeholder).
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
    #[allow(dead_code)]
    device_id: i32,
    total_memory: usize,
    free_memory: usize,
    /// Presumably higher means more fragmented — no producer visible here.
    fragmentation_score: f64,
}
305
306impl GpuProfiler {
307    pub fn new() -> Result<Self> {
308        // In practice, this would initialize CUDA/ROCm profiling
309        Ok(Self {
310            device_count: 1, // Simplified
311            active_streams: HashMap::new(),
312            memory_pools: HashMap::new(),
313        })
314    }
315
316    pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
317        self.active_streams
318            .entry(kernel_profile.stream_id)
319            .or_insert_with(Vec::new)
320            .push(kernel_profile);
321    }
322
323    pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
324        // Simplified GPU utilization calculation
325        if let Some(kernels) = self.active_streams.get(&device_id) {
326            if kernels.is_empty() {
327                0.0
328            } else {
329                kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
330            }
331        } else {
332            0.0
333        }
334    }
335}
336
/// I/O operation monitor
///
/// Tracks in-flight operations and keeps a rolling bandwidth history.
#[derive(Debug)]
pub struct IoMonitor {
    /// Operations started but not yet finished, by id.
    active_operations: HashMap<Uuid, IoOperation>,
    /// Rolling window of recent bandwidth samples (capped at ~1000 entries).
    bandwidth_history: Vec<BandwidthSample>,
    /// Number of currently in-flight operations.
    io_queue_depth: usize,
}
/// An in-flight I/O operation awaiting completion.
#[allow(dead_code)]
#[derive(Debug)]
pub struct IoOperation {
    #[allow(dead_code)]
    operation_id: Uuid,
    /// Start instant used to compute elapsed time on finish.
    start_time: Instant,
    operation_type: IoOperationType,
    /// Expected transfer size (bytes); the actual size is supplied on finish.
    bytes_expected: usize,
}
353
/// One bandwidth measurement in the rolling history.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BandwidthSample {
    pub timestamp: SystemTime,
    /// Measured throughput in MB/s.
    pub bandwidth_mb_s: f64,
    pub device_type: IoDeviceType,
}
360
361impl IoMonitor {
362    pub fn new() -> Self {
363        Self {
364            active_operations: HashMap::new(),
365            bandwidth_history: Vec::new(),
366            io_queue_depth: 0,
367        }
368    }
369
370    pub fn start_io_operation(
371        &mut self,
372        operation_type: IoOperationType,
373        bytes_expected: usize,
374    ) -> Uuid {
375        let operation_id = Uuid::new_v4();
376        let operation = IoOperation {
377            operation_id,
378            start_time: Instant::now(),
379            operation_type,
380            bytes_expected,
381        };
382
383        self.active_operations.insert(operation_id, operation);
384        self.io_queue_depth += 1;
385        operation_id
386    }
387
388    pub fn finish_io_operation(
389        &mut self,
390        operation_id: Uuid,
391        bytes_transferred: usize,
392    ) -> Option<IoProfile> {
393        if let Some(operation) = self.active_operations.remove(&operation_id) {
394            let duration = operation.start_time.elapsed();
395            let bandwidth_mb_s = if duration.as_secs_f64() > 0.0 {
396                bytes_transferred as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()
397            } else {
398                0.0
399            };
400
401            self.io_queue_depth = self.io_queue_depth.saturating_sub(1);
402
403            let device_type = match operation.operation_type {
404                IoOperationType::FileRead | IoOperationType::FileWrite => IoDeviceType::SSD,
405                IoOperationType::NetworkRead | IoOperationType::NetworkWrite => {
406                    IoDeviceType::Network
407                },
408                IoOperationType::CacheLoad | IoOperationType::CacheStore => IoDeviceType::Cache,
409                _ => IoDeviceType::Memory,
410            };
411
412            // Record bandwidth sample
413            self.bandwidth_history.push(BandwidthSample {
414                timestamp: SystemTime::now(),
415                bandwidth_mb_s,
416                device_type: device_type.clone(),
417            });
418
419            // Keep only recent samples
420            if self.bandwidth_history.len() > 1000 {
421                self.bandwidth_history.drain(0..500);
422            }
423
424            Some(IoProfile {
425                operation_type: operation.operation_type,
426                file_path: None, // Would be filled in practice
427                bytes_transferred,
428                duration,
429                bandwidth_mb_s,
430                queue_time: Duration::from_millis(self.io_queue_depth as u64 * 10), // Simplified
431                device_type,
432            })
433        } else {
434            None
435        }
436    }
437
438    pub fn get_average_bandwidth(&self, device_type: &IoDeviceType) -> f64 {
439        let samples: Vec<f64> = self
440            .bandwidth_history
441            .iter()
442            .filter(|s| s.device_type == *device_type)
443            .map(|s| s.bandwidth_mb_s)
444            .collect();
445
446        if samples.is_empty() {
447            0.0
448        } else {
449            samples.iter().sum::<f64>() / samples.len() as f64
450        }
451    }
452}
453
/// Performance profiler
///
/// Central collector for all profiling data: timed events, per-layer
/// profiles, memory snapshots, and GPU/memory/I-O/CPU analyses.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    /// All recorded events, in insertion order.
    events: Vec<ProfileEvent>,
    /// Timers started via `start_timer`, keyed by name.
    active_timers: HashMap<String, Instant>,
    /// Rolling memory timeline (capped at ~1000 snapshots).
    memory_snapshots: Vec<MemorySnapshot>,
    /// Set by `start()`; `None` until a session begins.
    start_time: Option<Instant>,
    /// Per-layer aggregates keyed by layer name.
    layer_profiles: HashMap<String, LayerProfile>,
    /// Findings from the last `analyze_performance` run.
    bottlenecks: Vec<PerformanceBottleneck>,
    // Enhanced profiling features
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    /// Local copy of every tracked allocation (retained even after free).
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    /// Shared tracker; `Arc<Mutex<_>>` allows handing it to other threads.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    /// `None` when GPU profiling failed to initialize.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
475
/// Aggregated per-layer timing and memory samples, built up by
/// `Profiler::record_layer_execution`.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    /// One entry per forward pass.
    forward_times: Vec<Duration>,
    /// One entry per backward pass (may be shorter than `forward_times`).
    backward_times: Vec<Duration>,
    /// Memory usage sample per call.
    memory_usage: Vec<usize>,
    /// Total recorded calls.
    call_count: usize,
}
485
486impl LayerProfile {
487    /// Get forward execution times
488    pub fn forward_times(&self) -> &Vec<Duration> {
489        &self.forward_times
490    }
491
492    /// Get backward execution times
493    pub fn backward_times(&self) -> &Vec<Duration> {
494        &self.backward_times
495    }
496
497    /// Get memory usage samples
498    pub fn memory_usage(&self) -> &Vec<usize> {
499        &self.memory_usage
500    }
501
502    /// Get total number of calls
503    pub fn call_count(&self) -> usize {
504        self.call_count
505    }
506}
507
508impl Profiler {
    /// Create a new profiler
    ///
    /// All collections start empty; GPU profiling degrades gracefully to
    /// `None` if `GpuProfiler::new()` fails.
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            // Enhanced profiling features
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            // `.ok()` converts an init failure into "no GPU profiling".
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
530
    /// Start profiling session
    ///
    /// Records the session start instant (used for total runtime in
    /// `generate_report`) and takes an initial memory snapshot.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }
538
539    /// Get reference to profiling events
540    pub fn get_events(&self) -> &Vec<ProfileEvent> {
541        &self.events
542    }
543
544    /// Start timing a function or operation
545    pub fn start_timer(&mut self, name: &str) {
546        self.active_timers.insert(name.to_string(), Instant::now());
547    }
548
549    /// End timing and record the event
550    pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
551        if let Some(start_time) = self.active_timers.remove(name) {
552            let duration = start_time.elapsed();
553
554            // Record basic function call event
555            self.events.push(ProfileEvent::FunctionCall {
556                function_name: name.to_string(),
557                duration,
558                memory_delta: 0, // Would need actual memory tracking
559            });
560
561            Some(duration)
562        } else {
563            tracing::warn!("Timer '{}' was not started", name);
564            None
565        }
566    }
567
568    /// Record layer execution timing
569    pub fn record_layer_execution(
570        &mut self,
571        layer_name: &str,
572        layer_type: &str,
573        forward_time: Duration,
574        backward_time: Option<Duration>,
575        memory_usage: usize,
576        parameter_count: usize,
577    ) {
578        // Record event
579        self.events.push(ProfileEvent::LayerExecution {
580            layer_name: layer_name.to_string(),
581            layer_type: layer_type.to_string(),
582            forward_time,
583            backward_time,
584            memory_usage,
585            parameter_count,
586        });
587
588        // Update layer profile
589        let profile =
590            self.layer_profiles
591                .entry(layer_name.to_string())
592                .or_insert_with(|| LayerProfile {
593                    layer_name: layer_name.to_string(),
594                    forward_times: Vec::new(),
595                    backward_times: Vec::new(),
596                    memory_usage: Vec::new(),
597                    call_count: 0,
598                });
599
600        profile.forward_times.push(forward_time);
601        if let Some(backward) = backward_time {
602            profile.backward_times.push(backward);
603        }
604        profile.memory_usage.push(memory_usage);
605        profile.call_count += 1;
606    }
607
608    /// Record tensor operation timing
609    pub fn record_tensor_operation(
610        &mut self,
611        operation: &str,
612        tensor_shape: &[usize],
613        duration: Duration,
614        memory_allocated: usize,
615    ) {
616        self.events.push(ProfileEvent::TensorOperation {
617            operation: operation.to_string(),
618            tensor_shape: tensor_shape.to_vec(),
619            duration,
620            memory_allocated,
621        });
622    }
623
624    /// Record model inference timing
625    pub fn record_model_inference(
626        &mut self,
627        batch_size: usize,
628        sequence_length: usize,
629        duration: Duration,
630    ) {
631        let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
632
633        self.events.push(ProfileEvent::ModelInference {
634            batch_size,
635            sequence_length,
636            duration,
637            tokens_per_second,
638        });
639    }
640
641    /// Record gradient computation timing
642    pub fn record_gradient_computation(
643        &mut self,
644        layer_name: &str,
645        gradient_norm: f64,
646        duration: Duration,
647    ) {
648        self.events.push(ProfileEvent::GradientComputation {
649            layer_name: layer_name.to_string(),
650            gradient_norm,
651            duration,
652        });
653    }
654
655    /// Take a memory usage snapshot
656    pub fn take_memory_snapshot(&mut self) {
657        // Simplified memory tracking - in practice would use system APIs
658        let snapshot = MemorySnapshot {
659            timestamp: chrono::Utc::now(),
660            heap_allocated: 0, // Would get from system
661            heap_used: 0,
662            stack_size: 0,
663            gpu_allocated: None,
664            gpu_used: None,
665        };
666
667        self.memory_snapshots.push(snapshot);
668
669        // Keep only recent snapshots to prevent memory growth
670        if self.memory_snapshots.len() > 1000 {
671            self.memory_snapshots.drain(0..500);
672        }
673    }
674
    /// Analyze performance and detect bottlenecks
    ///
    /// Clears previous findings, then delegates to the layer, memory, and
    /// tensor analyzers (defined elsewhere in this file), each of which
    /// appends to `self.bottlenecks`. Returns a snapshot of the findings.
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        self.bottlenecks.clear();

        // Analyze layer execution times
        self.analyze_layer_bottlenecks();

        // Analyze memory usage patterns
        self.analyze_memory_bottlenecks();

        // Analyze tensor operation efficiency
        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
690
691    /// Get profiling statistics
692    pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
693        let mut stats = HashMap::new();
694
695        // Group events by type
696        let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
697
698        for event in &self.events {
699            let event_type = match event {
700                ProfileEvent::FunctionCall { .. } => "FunctionCall",
701                ProfileEvent::LayerExecution { .. } => "LayerExecution",
702                ProfileEvent::TensorOperation { .. } => "TensorOperation",
703                ProfileEvent::ModelInference { .. } => "ModelInference",
704                ProfileEvent::GradientComputation { .. } => "GradientComputation",
705            };
706
707            grouped_events
708                .entry(event_type.to_string())
709                .or_insert_with(Vec::new)
710                .push(event);
711        }
712
713        // Calculate statistics for each event type
714        for (event_type, events) in grouped_events {
715            let durations: Vec<Duration> = events
716                .iter()
717                .filter_map(|event| match event {
718                    ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
719                    ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
720                    ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
721                    ProfileEvent::ModelInference { duration, .. } => Some(*duration),
722                    ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
723                })
724                .collect();
725
726            if !durations.is_empty() {
727                let total_duration: Duration = durations.iter().sum();
728                let avg_duration = total_duration / durations.len() as u32;
729                let min_duration = durations.iter().min().copied().unwrap_or_default();
730                let max_duration = durations.iter().max().copied().unwrap_or_default();
731
732                stats.insert(
733                    event_type.clone(),
734                    ProfileStats {
735                        event_type,
736                        count: durations.len(),
737                        total_duration,
738                        avg_duration,
739                        min_duration,
740                        max_duration,
741                        total_memory: 0, // Simplified
742                        avg_memory: 0.0,
743                    },
744                );
745            }
746        }
747
748        stats
749    }
750
    /// Get layer-specific performance profiles
    ///
    /// Keyed by layer name; populated by `record_layer_execution`.
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }
755
    /// Get memory usage over time
    ///
    /// Snapshots are in chronological order; `take_memory_snapshot` prunes
    /// the oldest entries once the buffer exceeds 1000.
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }
760
761    /// Generate performance report
762    pub async fn generate_report(&self) -> Result<ProfilerReport> {
763        let statistics = self.get_statistics();
764        let bottlenecks = self.bottlenecks.clone();
765        let total_events = self.events.len();
766
767        let total_runtime =
768            if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };
769
770        // Calculate slowest layers
771        let slowest_layers = self.get_slowest_layers(5);
772
773        // Memory efficiency analysis
774        let memory_efficiency = self.analyze_memory_efficiency();
775
776        Ok(ProfilerReport {
777            total_events,
778            total_runtime,
779            statistics,
780            bottlenecks,
781            slowest_layers,
782            memory_efficiency,
783            recommendations: self.generate_performance_recommendations(),
784        })
785    }
786
787    /// Clear all profiling data
788    pub fn clear(&mut self) {
789        self.events.clear();
790        self.active_timers.clear();
791        self.memory_snapshots.clear();
792        self.layer_profiles.clear();
793        self.bottlenecks.clear();
794        self.start_time = None;
795        // Clear enhanced profiling data
796        self.gpu_kernel_profiles.clear();
797        self.memory_allocations.clear();
798        self.layer_latency_profiles.clear();
799        self.io_profiles.clear();
800        self.cpu_bottleneck_analysis.clear();
801        if let Ok(mut tracker) = self.memory_tracker.lock() {
802            *tracker = MemoryTracker::new();
803        }
804        self.io_monitor = IoMonitor::new();
805    }
806
807    // Enhanced profiling methods
808
809    /// Profile GPU kernel execution
810    pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
811        if let Some(ref mut gpu_profiler) = self.gpu_profiler {
812            gpu_profiler.profile_kernel(kernel_profile.clone());
813        }
814        self.gpu_kernel_profiles.push(kernel_profile);
815    }
816
817    /// Track memory allocation
818    pub fn track_memory_allocation(
819        &mut self,
820        size_bytes: usize,
821        allocation_type: MemoryAllocationType,
822        device_id: Option<i32>,
823        stack_trace: Vec<String>,
824    ) -> Uuid {
825        let allocation_id = Uuid::new_v4();
826        let allocation = MemoryAllocation {
827            allocation_id,
828            size_bytes,
829            allocation_type,
830            device_id,
831            timestamp: SystemTime::now(),
832            stack_trace,
833            freed: false,
834            free_timestamp: None,
835        };
836
837        if let Ok(mut tracker) = self.memory_tracker.lock() {
838            tracker.track_allocation(allocation.clone());
839        }
840
841        self.memory_allocations.insert(allocation_id, allocation);
842        allocation_id
843    }
844
845    /// Track memory deallocation
846    pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
847        if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
848            allocation.freed = true;
849            allocation.free_timestamp = Some(SystemTime::now());
850        }
851
852        if let Ok(mut tracker) = self.memory_tracker.lock() {
853            tracker.track_deallocation(allocation_id);
854        }
855    }
856
857    /// Profile layer latency with detailed breakdown
858    pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
859        self.layer_latency_profiles
860            .insert(layer_latency.layer_name.clone(), layer_latency);
861    }
862
    /// Start I/O operation profiling
    ///
    /// Thin delegate to the internal `IoMonitor`; returns the operation id
    /// to pass to `finish_io_profiling`.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }
871
872    /// Finish I/O operation profiling
873    pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
874        if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
875        {
876            self.io_profiles.push(profile);
877        }
878    }
879
    /// Analyze CPU bottlenecks
    ///
    /// NOTE: currently returns hard-coded placeholder metrics (thread id 0,
    /// fixed usage/counter values, two example hot functions); a real
    /// implementation would use system profiling APIs. The result is also
    /// appended to `self.cpu_bottleneck_analysis` for later reporting.
    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
        // Simplified CPU bottleneck analysis
        // In practice, this would use system profiling APIs
        let analysis = CpuBottleneckAnalysis {
            thread_id: 0, // Use 0 as placeholder since thread::current().id().as_u64() is unstable
            cpu_usage: 0.75, // Simplified
            context_switches: 1000,
            cache_misses: 500,
            instructions_per_cycle: 2.5,
            branch_mispredictions: 100,
            hot_functions: vec![
                HotFunction {
                    function_name: "tensor_multiply".to_string(),
                    self_time_percentage: 25.0,
                    call_count: 1000,
                    avg_time_per_call: Duration::from_micros(250),
                },
                HotFunction {
                    function_name: "gradient_computation".to_string(),
                    self_time_percentage: 20.0,
                    call_count: 500,
                    avg_time_per_call: Duration::from_micros(400),
                },
            ],
            bottleneck_score: 0.6,
        };

        self.cpu_bottleneck_analysis.push(analysis.clone());
        vec![analysis]
    }
911
912    /// Get memory allocation statistics
913    pub fn get_memory_stats(&self) -> Option<MemoryStats> {
914        if let Ok(tracker) = self.memory_tracker.lock() {
915            Some(tracker.get_memory_stats())
916        } else {
917            None
918        }
919    }
920
921    /// Get GPU utilization metrics
922    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
923        self.gpu_profiler
924            .as_ref()
925            .map(|profiler| profiler.get_gpu_utilization(device_id))
926    }
927
928    /// Get I/O bandwidth statistics
929    pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
930        let mut stats = HashMap::new();
931
932        stats.insert(
933            IoDeviceType::SSD,
934            self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
935        );
936        stats.insert(
937            IoDeviceType::HDD,
938            self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
939        );
940        stats.insert(
941            IoDeviceType::Network,
942            self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
943        );
944        stats.insert(
945            IoDeviceType::Memory,
946            self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
947        );
948        stats.insert(
949            IoDeviceType::Cache,
950            self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
951        );
952
953        stats
954    }
955
956    /// Get layer latency analysis
957    pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
958        self.layer_latency_profiles
959            .values()
960            .map(|profile| LayerLatencyAnalysis {
961                layer_name: profile.layer_name.clone(),
962                layer_type: profile.layer_type.clone(),
963                total_time: profile.cpu_time
964                    + profile.gpu_time
965                    + profile.memory_copy_time
966                    + profile.sync_time,
967                cpu_percentage: profile.cpu_time.as_secs_f64()
968                    / (profile.cpu_time
969                        + profile.gpu_time
970                        + profile.memory_copy_time
971                        + profile.sync_time)
972                        .as_secs_f64()
973                    * 100.0,
974                gpu_percentage: profile.gpu_time.as_secs_f64()
975                    / (profile.cpu_time
976                        + profile.gpu_time
977                        + profile.memory_copy_time
978                        + profile.sync_time)
979                        .as_secs_f64()
980                    * 100.0,
981                memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
982                    / (profile.cpu_time
983                        + profile.gpu_time
984                        + profile.memory_copy_time
985                        + profile.sync_time)
986                        .as_secs_f64()
987                    * 100.0,
988                flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
989                    profile.flops as f64 / profile.gpu_time.as_secs_f64()
990                } else {
991                    0.0
992                },
993                memory_bandwidth_utilization: profile.cache_hit_rate,
994                bottleneck_type: self.identify_layer_bottleneck(profile),
995            })
996            .collect()
997    }
998
999    /// Get comprehensive performance analysis
1000    pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
1001        let memory_stats = self.get_memory_stats();
1002        let io_bandwidth_stats = self.get_io_bandwidth_stats();
1003        let layer_analysis = self.get_layer_latency_analysis();
1004
1005        let gpu_utilization = if let Some(profiler) = &self.gpu_profiler {
1006            Some(profiler.get_gpu_utilization(0))
1007        } else {
1008            None
1009        };
1010
1011        PerformanceAnalysis {
1012            memory_stats,
1013            io_bandwidth_stats,
1014            layer_analysis,
1015            gpu_utilization,
1016            cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
1017            total_gpu_kernels: self.gpu_kernel_profiles.len(),
1018            total_io_operations: self.io_profiles.len(),
1019            performance_score: self.calculate_overall_performance_score(),
1020            recommendations: self.generate_enhanced_recommendations(),
1021        }
1022    }
1023
1024    fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
1025        let total_time =
1026            profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;
1027
1028        if profile.memory_copy_time > total_time / 2 {
1029            "Memory Bandwidth".to_string()
1030        } else if profile.sync_time > total_time / 3 {
1031            "GPU Synchronization".to_string()
1032        } else if profile.gpu_time > profile.cpu_time * 10 {
1033            "GPU Compute".to_string()
1034        } else {
1035            "CPU Compute".to_string()
1036        }
1037    }
1038
1039    fn calculate_overall_performance_score(&self) -> f64 {
1040        let mut score: f64 = 100.0;
1041
1042        // Deduct for bottlenecks
1043        for bottleneck in &self.bottlenecks {
1044            match bottleneck.severity {
1045                BottleneckSeverity::Critical => score -= 20.0,
1046                BottleneckSeverity::High => score -= 10.0,
1047                BottleneckSeverity::Medium => score -= 5.0,
1048                BottleneckSeverity::Low => score -= 2.0,
1049            }
1050        }
1051
1052        // Deduct for poor GPU utilization
1053        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1054            if gpu_util < 0.5 {
1055                score -= 15.0;
1056            } else if gpu_util < 0.7 {
1057                score -= 8.0;
1058            }
1059        }
1060
1061        // Deduct for memory inefficiency
1062        if let Some(memory_stats) = self.get_memory_stats() {
1063            if memory_stats.memory_efficiency < 0.8 {
1064                score -= 10.0;
1065            }
1066        }
1067
1068        score.max(0.0)
1069    }
1070
1071    fn generate_enhanced_recommendations(&self) -> Vec<String> {
1072        let mut recommendations = Vec::new();
1073
1074        // GPU utilization recommendations
1075        if let Some(gpu_util) = self.get_gpu_utilization(0) {
1076            if gpu_util < 0.5 {
1077                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
1078            }
1079        }
1080
1081        // Memory recommendations
1082        if let Some(memory_stats) = self.get_memory_stats() {
1083            if memory_stats.memory_efficiency < 0.8 {
1084                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
1085            }
1086
1087            if memory_stats.active_allocations > 10000 {
1088                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
1089            }
1090        }
1091
1092        // I/O recommendations
1093        let io_stats = self.get_io_bandwidth_stats();
1094        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
1095            if ssd_bandwidth < 100.0 {
1096                // Less than 100 MB/s
1097                recommendations.push(
1098                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
1099                        .to_string(),
1100                );
1101            }
1102        }
1103
1104        // Layer-specific recommendations
1105        let layer_analysis = self.get_layer_latency_analysis();
1106        for analysis in &layer_analysis {
1107            if analysis.memory_copy_percentage > 50.0 {
1108                recommendations.push(format!(
1109                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
1110                    analysis.layer_name
1111                ));
1112            }
1113
1114            if analysis.cpu_percentage > 80.0 {
1115                recommendations.push(format!(
1116                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
1117                    analysis.layer_name
1118                ));
1119            }
1120        }
1121
1122        if recommendations.is_empty() {
1123            recommendations
1124                .push("Performance appears optimal based on current analysis.".to_string());
1125        }
1126
1127        recommendations
1128    }
1129
1130    // Private analysis methods
1131
1132    fn analyze_layer_bottlenecks(&mut self) {
1133        for (layer_name, profile) in &self.layer_profiles {
1134            if profile.forward_times.is_empty() {
1135                continue;
1136            }
1137
1138            let avg_forward_time =
1139                profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
1140
1141            // Consider a layer slow if it takes more than 100ms on average
1142            if avg_forward_time.as_millis() > 100 {
1143                let mut metrics = HashMap::new();
1144                metrics.insert(
1145                    "avg_forward_time_ms".to_string(),
1146                    avg_forward_time.as_millis() as f64,
1147                );
1148                metrics.insert("call_count".to_string(), profile.call_count as f64);
1149
1150                self.bottlenecks.push(PerformanceBottleneck {
1151                    bottleneck_type: BottleneckType::ModelComputation,
1152                    location: layer_name.clone(),
1153                    severity: if avg_forward_time.as_millis() > 500 {
1154                        BottleneckSeverity::High
1155                    } else {
1156                        BottleneckSeverity::Medium
1157                    },
1158                    description: format!(
1159                        "Layer '{}' has slow forward pass: {:.1}ms average",
1160                        layer_name,
1161                        avg_forward_time.as_millis()
1162                    ),
1163                    suggestion: "Consider optimizing layer implementation or reducing layer size"
1164                        .to_string(),
1165                    metrics,
1166                });
1167            }
1168        }
1169    }
1170
1171    fn analyze_memory_bottlenecks(&mut self) {
1172        if self.memory_snapshots.len() < 2 {
1173            return;
1174        }
1175
1176        // Check for memory growth trend
1177        let recent_snapshots = if self.memory_snapshots.len() > 10 {
1178            &self.memory_snapshots[self.memory_snapshots.len() - 10..]
1179        } else {
1180            &self.memory_snapshots
1181        };
1182
1183        if recent_snapshots.len() >= 5 {
1184            let initial_memory = recent_snapshots[0].heap_allocated;
1185            let final_memory = recent_snapshots.last().unwrap().heap_allocated;
1186
1187            if final_memory > initial_memory * 2 {
1188                let mut metrics = HashMap::new();
1189                metrics.insert(
1190                    "initial_memory_mb".to_string(),
1191                    initial_memory as f64 / (1024.0 * 1024.0),
1192                );
1193                metrics.insert(
1194                    "final_memory_mb".to_string(),
1195                    final_memory as f64 / (1024.0 * 1024.0),
1196                );
1197                metrics.insert(
1198                    "growth_ratio".to_string(),
1199                    final_memory as f64 / initial_memory as f64,
1200                );
1201
1202                self.bottlenecks.push(PerformanceBottleneck {
1203                    bottleneck_type: BottleneckType::MemoryBound,
1204                    location: "Memory Usage".to_string(),
1205                    severity: BottleneckSeverity::High,
1206                    description: "Significant memory growth detected during profiling".to_string(),
1207                    suggestion: "Check for memory leaks or inefficient memory usage patterns"
1208                        .to_string(),
1209                    metrics,
1210                });
1211            }
1212        }
1213    }
1214
1215    fn analyze_tensor_bottlenecks(&mut self) {
1216        // Group tensor operations by type
1217        let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
1218
1219        for event in &self.events {
1220            if let ProfileEvent::TensorOperation {
1221                operation,
1222                duration,
1223                ..
1224            } = event
1225            {
1226                operation_groups
1227                    .entry(operation.clone())
1228                    .or_insert_with(Vec::new)
1229                    .push(*duration);
1230            }
1231        }
1232
1233        // Find slow operations
1234        for (operation, durations) in operation_groups {
1235            if durations.is_empty() {
1236                continue;
1237            }
1238
1239            let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
1240            let total_time = durations.iter().sum::<Duration>();
1241
1242            // Consider operation slow if it takes more than 10ms on average
1243            if avg_duration.as_millis() > 10 {
1244                let mut metrics = HashMap::new();
1245                metrics.insert(
1246                    "avg_duration_ms".to_string(),
1247                    avg_duration.as_millis() as f64,
1248                );
1249                metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
1250                metrics.insert("call_count".to_string(), durations.len() as f64);
1251
1252                self.bottlenecks.push(PerformanceBottleneck {
1253                    bottleneck_type: BottleneckType::CpuBound,
1254                    location: format!("Tensor Operation: {}", operation),
1255                    severity: if avg_duration.as_millis() > 50 {
1256                        BottleneckSeverity::High
1257                    } else {
1258                        BottleneckSeverity::Medium
1259                    },
1260                    description: format!(
1261                        "Tensor operation '{}' is slow: {:.1}ms average",
1262                        operation,
1263                        avg_duration.as_millis()
1264                    ),
1265                    suggestion:
1266                        "Consider optimizing tensor operation or using different data types"
1267                            .to_string(),
1268                    metrics,
1269                });
1270            }
1271        }
1272    }
1273
1274    fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
1275        let mut layer_times: Vec<(String, Duration)> = self
1276            .layer_profiles
1277            .iter()
1278            .map(|(name, profile)| {
1279                let avg_time = if profile.forward_times.is_empty() {
1280                    Duration::ZERO
1281                } else {
1282                    profile.forward_times.iter().sum::<Duration>()
1283                        / profile.forward_times.len() as u32
1284                };
1285                (name.clone(), avg_time)
1286            })
1287            .collect();
1288
1289        layer_times.sort_by(|a, b| b.1.cmp(&a.1));
1290        layer_times.truncate(limit);
1291        layer_times
1292    }
1293
1294    fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
1295        if self.memory_snapshots.is_empty() {
1296            return MemoryEfficiencyAnalysis::default();
1297        }
1298
1299        let memory_values: Vec<usize> =
1300            self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
1301
1302        let max_memory = memory_values.iter().max().copied().unwrap_or(0);
1303        let min_memory = memory_values.iter().min().copied().unwrap_or(0);
1304        let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
1305
1306        MemoryEfficiencyAnalysis {
1307            peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
1308            min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
1309            avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
1310            memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
1311            efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
1312        }
1313    }
1314
1315    fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
1316        if values.len() < 2 {
1317            return 0.0;
1318        }
1319
1320        let variance_sum: f64 = values
1321            .iter()
1322            .map(|&x| {
1323                let diff = x as f64 - mean as f64;
1324                diff * diff
1325            })
1326            .sum();
1327
1328        variance_sum / (values.len() - 1) as f64
1329    }
1330
1331    fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
1332        if values.is_empty() {
1333            return 0.0;
1334        }
1335
1336        let max_memory = values.iter().max().copied().unwrap_or(0);
1337        let min_memory = values.iter().min().copied().unwrap_or(0);
1338
1339        if max_memory == 0 {
1340            return 100.0;
1341        }
1342
1343        // Efficiency score: closer to 100% means more stable memory usage
1344        100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
1345    }
1346
1347    fn generate_performance_recommendations(&self) -> Vec<String> {
1348        let mut recommendations = Vec::new();
1349
1350        // Analyze bottlenecks for recommendations
1351        for bottleneck in &self.bottlenecks {
1352            match bottleneck.bottleneck_type {
1353                BottleneckType::ModelComputation => {
1354                    recommendations.push(
1355                        "Consider model architecture optimizations or layer fusion".to_string(),
1356                    );
1357                },
1358                BottleneckType::MemoryBound => {
1359                    recommendations.push(
1360                        "Optimize memory usage with gradient checkpointing or model parallelism"
1361                            .to_string(),
1362                    );
1363                },
1364                BottleneckType::CpuBound => {
1365                    recommendations.push(
1366                        "Consider GPU acceleration or optimized CPU implementations".to_string(),
1367                    );
1368                },
1369                _ => {},
1370            }
1371        }
1372
1373        // General recommendations based on profiling data
1374        if self.events.len() > 10000 {
1375            recommendations.push(
1376                "High number of profiling events - consider reducing profiling overhead"
1377                    .to_string(),
1378            );
1379        }
1380
1381        let stats = self.get_statistics();
1382        if let Some(layer_stats) = stats.get("LayerExecution") {
1383            if layer_stats.avg_duration.as_millis() > 50 {
1384                recommendations.push(
1385                    "Average layer execution time is high - consider layer optimization"
1386                        .to_string(),
1387                );
1388            }
1389        }
1390
1391        if recommendations.is_empty() {
1392            recommendations
1393                .push("Performance appears optimal based on current profiling data".to_string());
1394        }
1395
1396        recommendations
1397    }
1398}
1399
/// Memory efficiency analysis results
///
/// Produced by `analyze_memory_efficiency` from the sequence of heap
/// snapshots captured during a profiling run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryEfficiencyAnalysis {
    /// Highest heap allocation observed across snapshots, in MB.
    pub peak_memory_mb: f64,
    /// Lowest heap allocation observed across snapshots, in MB.
    pub min_memory_mb: f64,
    /// Mean heap allocation across snapshots, in MB.
    pub avg_memory_mb: f64,
    /// Sample variance of heap allocation (computed in bytes squared).
    pub memory_variance: f64,
    /// Stability score in [0, 100]; 100 means perfectly flat usage.
    pub efficiency_score: f64,
}
1409
1410impl Default for MemoryEfficiencyAnalysis {
1411    fn default() -> Self {
1412        Self {
1413            peak_memory_mb: 0.0,
1414            min_memory_mb: 0.0,
1415            avg_memory_mb: 0.0,
1416            memory_variance: 0.0,
1417            efficiency_score: 100.0,
1418        }
1419    }
1420}
1421
/// Profiler report
///
/// Aggregated output of a profiling session, combining raw event statistics
/// with derived bottleneck and memory analyses.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilerReport {
    /// Number of profiling events captured.
    pub total_events: usize,
    /// Total runtime covered by the session — presumably wall-clock;
    /// populated by `generate_report` (not shown here), confirm there.
    pub total_runtime: Duration,
    /// Per-event-type aggregate statistics, keyed by event type name.
    pub statistics: HashMap<String, ProfileStats>,
    /// Bottlenecks detected by the analysis passes.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Layers ranked by average forward time, slowest first.
    pub slowest_layers: Vec<(String, Duration)>,
    /// Heap usage stability summary.
    pub memory_efficiency: MemoryEfficiencyAnalysis,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1433
/// Scoped timer for automatic timing
///
/// Starts a named profiler timer on construction and ends it when the
/// guard is dropped, so a scope's duration is recorded even on early
/// return or panic unwind.
pub struct ScopedTimer<'a> {
    // Profiler that owns the timer being measured; held mutably for the
    // guard's lifetime so `end_timer` can be called on drop.
    profiler: &'a mut Profiler,
    // Timer name passed to `start_timer` and later `end_timer`.
    name: String,
}

impl<'a> ScopedTimer<'a> {
    /// Start timer `name` on `profiler`; it ends when the returned guard
    /// goes out of scope.
    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
        profiler.start_timer(&name);
        Self { profiler, name }
    }
}

impl<'a> Drop for ScopedTimer<'a> {
    fn drop(&mut self) {
        // Close out the timer started in `new`.
        self.profiler.end_timer(&self.name);
    }
}
1452
/// Layer latency analysis result
///
/// Produced by `get_layer_latency_analysis`; all percentages are of the
/// layer's combined CPU + GPU + memory-copy + sync time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyAnalysis {
    /// Name of the profiled layer.
    pub layer_name: String,
    /// Layer type label (e.g. as recorded by the profiler).
    pub layer_type: String,
    /// Sum of CPU, GPU, memory-copy and sync time for the layer.
    pub total_time: Duration,
    /// Share of total time spent in CPU work, 0-100.
    pub cpu_percentage: f64,
    /// Share of total time spent in GPU work, 0-100.
    pub gpu_percentage: f64,
    /// Share of total time spent copying memory, 0-100.
    pub memory_copy_percentage: f64,
    /// FLOPS divided by GPU time; 0.0 when no GPU time was recorded.
    pub flops_per_second: f64,
    /// Currently populated from the layer's cache hit rate — confirm
    /// whether that is the intended bandwidth proxy.
    pub memory_bandwidth_utilization: f64,
    /// Coarse label from `identify_layer_bottleneck`, e.g. "GPU Compute".
    pub bottleneck_type: String,
}
1466
/// Comprehensive performance analysis
///
/// Snapshot assembled by `get_performance_analysis`, combining memory, I/O,
/// per-layer and GPU metrics with an overall score and recommendations.
#[derive(Debug, Serialize, Deserialize)]
pub struct PerformanceAnalysis {
    /// Memory tracker statistics; `None` if the tracker was unavailable.
    pub memory_stats: Option<MemoryStats>,
    /// Average bandwidth per I/O device class.
    pub io_bandwidth_stats: HashMap<IoDeviceType, f64>,
    /// Per-layer latency breakdown.
    pub layer_analysis: Vec<LayerLatencyAnalysis>,
    /// Utilization of GPU device 0; `None` without a GPU profiler.
    pub gpu_utilization: Option<f64>,
    /// Accumulated CPU bottleneck analyses.
    pub cpu_bottlenecks: Vec<CpuBottleneckAnalysis>,
    /// Number of GPU kernel profiles recorded.
    pub total_gpu_kernels: usize,
    /// Number of I/O operations recorded.
    pub total_io_operations: usize,
    /// Overall score in [0, 100] from `calculate_overall_performance_score`.
    pub performance_score: f64,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1480
/// Enhanced profiler report
///
/// Extends the basic `ProfilerReport` with GPU kernel, memory allocation
/// and I/O summaries plus the full performance analysis.
#[derive(Debug, Serialize, Deserialize)]
pub struct EnhancedProfilerReport {
    /// The standard report this enhanced report builds on.
    pub basic_report: ProfilerReport,
    /// Combined memory/IO/layer/GPU analysis with score and recommendations.
    pub performance_analysis: PerformanceAnalysis,
    /// Aggregated GPU kernel metrics.
    pub gpu_kernel_summary: GpuKernelSummary,
    /// Aggregated memory allocation metrics.
    pub memory_allocation_summary: MemoryAllocationSummary,
    /// Aggregated I/O metrics.
    pub io_performance_summary: IoPerformanceSummary,
}
1490
/// Aggregated metrics over all recorded GPU kernel profiles.
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
    /// Number of kernel profiles recorded.
    pub total_kernels: usize,
    /// Sum of all kernel execution times.
    pub total_execution_time: Duration,
    /// Mean occupancy across kernels; 0.0 when none were recorded.
    pub avg_occupancy: f64,
    /// Mean compute utilization across kernels; 0.0 when none were recorded.
    pub avg_compute_utilization: f64,
    /// Names of the five slowest kernels by execution time, slowest first.
    pub slowest_kernels: Vec<String>,
}
1499
/// Aggregated metrics over all tracked memory allocations.
#[derive(Debug, Serialize, Deserialize)]
pub struct MemoryAllocationSummary {
    /// Number of tracked allocations.
    pub total_allocations: usize,
    /// As currently computed, the size of the largest single allocation in
    /// bytes — not peak concurrent usage; confirm the intended semantics.
    pub peak_memory_usage: usize,
    /// Allocator efficiency from the memory tracker; defaults to 1.0 when
    /// the tracker is unavailable.
    pub memory_efficiency: f64,
    /// Textual descriptions of the five largest allocations, largest first.
    pub largest_allocations: Vec<String>,
    /// Number of allocations never marked as freed (potential leaks).
    pub memory_leaks: usize,
}
1508
/// Aggregated metrics over all recorded I/O operations.
#[derive(Debug, Serialize, Deserialize)]
pub struct IoPerformanceSummary {
    /// Number of I/O operations recorded.
    pub total_operations: usize,
    /// Total bytes moved across all operations.
    pub total_bytes_transferred: usize,
    /// Average bandwidth per device class.
    pub avg_bandwidth_by_device: HashMap<IoDeviceType, f64>,
    /// Descriptions of the five slowest operations by duration, slowest first.
    pub slowest_operations: Vec<String>,
}
1516
1517impl Profiler {
1518    /// Generate enhanced profiler report with advanced metrics
1519    pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
1520        let basic_report = self.generate_report().await?;
1521        let performance_analysis = self.get_performance_analysis();
1522
1523        let gpu_kernel_summary = self.generate_gpu_kernel_summary();
1524        let memory_allocation_summary = self.generate_memory_allocation_summary();
1525        let io_performance_summary = self.generate_io_performance_summary();
1526
1527        Ok(EnhancedProfilerReport {
1528            basic_report,
1529            performance_analysis,
1530            gpu_kernel_summary,
1531            memory_allocation_summary,
1532            io_performance_summary,
1533        })
1534    }
1535
1536    fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
1537        let total_kernels = self.gpu_kernel_profiles.len();
1538        let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();
1539
1540        let avg_occupancy = if total_kernels > 0 {
1541            self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
1542        } else {
1543            0.0
1544        };
1545
1546        let avg_compute_utilization = if total_kernels > 0 {
1547            self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
1548                / total_kernels as f64
1549        } else {
1550            0.0
1551        };
1552
1553        let mut kernels_by_time: Vec<_> = self
1554            .gpu_kernel_profiles
1555            .iter()
1556            .map(|k| (k.kernel_name.clone(), k.execution_time))
1557            .collect();
1558        kernels_by_time.sort_by(|a, b| b.1.cmp(&a.1));
1559
1560        let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();
1561
1562        GpuKernelSummary {
1563            total_kernels,
1564            total_execution_time,
1565            avg_occupancy,
1566            avg_compute_utilization,
1567            slowest_kernels,
1568        }
1569    }
1570
1571    fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
1572        let total_allocations = self.memory_allocations.len();
1573        let peak_memory_usage =
1574            self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);
1575
1576        let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
1577            stats.memory_efficiency
1578        } else {
1579            1.0
1580        };
1581
1582        let mut allocations_by_size: Vec<_> = self
1583            .memory_allocations
1584            .values()
1585            .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
1586            .collect();
1587        allocations_by_size.sort_by(|a, b| b.1.cmp(&a.1));
1588
1589        let largest_allocations =
1590            allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();
1591
1592        let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();
1593
1594        MemoryAllocationSummary {
1595            total_allocations,
1596            peak_memory_usage,
1597            memory_efficiency,
1598            largest_allocations,
1599            memory_leaks,
1600        }
1601    }
1602
1603    fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
1604        let total_operations = self.io_profiles.len();
1605        let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();
1606
1607        let avg_bandwidth_by_device = self.get_io_bandwidth_stats();
1608
1609        let mut operations_by_duration: Vec<_> = self
1610            .io_profiles
1611            .iter()
1612            .map(|io| {
1613                (
1614                    format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
1615                    io.duration,
1616                )
1617            })
1618            .collect();
1619        operations_by_duration.sort_by(|a, b| b.1.cmp(&a.1));
1620
1621        let slowest_operations =
1622            operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();
1623
1624        IoPerformanceSummary {
1625            total_operations,
1626            total_bytes_transferred,
1627            avg_bandwidth_by_device,
1628            slowest_operations,
1629        }
1630    }
1631}
1632
/// Macro for convenient timing
///
/// Expands to a `ScopedTimer` bound to a hidden local (`_timer`), so the
/// timer named `$name` starts immediately and ends when the enclosing
/// scope exits.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}