// trustformers_debug/profiler/mod.rs

1//! Performance profiling tools for debugging
2
3pub mod events;
4pub mod gpu;
5pub mod io_monitor;
6pub mod memory;
7pub mod report;
8
9// Re-export all public types from sub-modules
10pub use events::{
11    BottleneckSeverity, BottleneckType, CpuBottleneckAnalysis, CpuProfile, HotFunction,
12    MemorySnapshot, PerformanceBottleneck, ProfileEvent, ProfileStats,
13};
14pub use gpu::{GpuKernelProfile, GpuKernelSummary, GpuMemoryPool, GpuProfiler};
15pub use io_monitor::{
16    BandwidthSample, IoDeviceType, IoMonitor, IoOperation, IoOperationType, IoPerformanceSummary,
17    IoProfile, LayerLatencyProfile,
18};
19pub use memory::{
20    MemoryAllocation, MemoryAllocationType, MemoryEfficiencyAnalysis, MemoryStats, MemoryTracker,
21};
22pub use report::{
23    EnhancedProfilerReport, LayerLatencyAnalysis, MemoryAllocationSummary, PerformanceAnalysis,
24    ProfilerReport,
25};
26
27use anyhow::Result;
28use std::collections::HashMap;
29use std::sync::{Arc, Mutex};
30use std::time::{Duration, Instant, SystemTime};
31use uuid::Uuid;
32
33use crate::DebugConfig;
34
35/// Performance profiler
/// Performance profiler
///
/// Central collector for all profiling data: timing events, per-layer
/// aggregates, memory snapshots, plus the enhanced GPU / memory / I/O
/// trackers. The API takes `&mut self`; only the inner `MemoryTracker`
/// is shared behind a mutex.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)] // retained for future config-driven behavior
    config: DebugConfig,
    // Chronological log of every recorded profiling event.
    events: Vec<ProfileEvent>,
    // Timers opened by `start_timer`, keyed by operation name.
    active_timers: HashMap<String, Instant>,
    // Periodic snapshots; bounded by `take_memory_snapshot`.
    memory_snapshots: Vec<MemorySnapshot>,
    // Set by `start`; used to compute total runtime in reports.
    start_time: Option<Instant>,
    // Aggregated per-layer timing/memory data, keyed by layer name.
    layer_profiles: HashMap<String, LayerProfile>,
    // Findings from the most recent `analyze_performance` run.
    bottlenecks: Vec<PerformanceBottleneck>,
    // Enhanced profiling features
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    // Shared allocation tracker; mutex allows handing clones of the Arc out.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    // `None` when GPU profiling could not be initialized (see `new`).
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
56
/// Aggregated timing and memory statistics for a single layer, built up by
/// `Profiler::record_layer_execution`.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)] // kept for debugging / future reporting
    layer_name: String,
    // One entry per recorded forward pass.
    forward_times: Vec<Duration>,
    // One entry per recorded backward pass; the backward time is optional
    // per call, so this may be shorter than `forward_times`.
    backward_times: Vec<Duration>,
    // One memory-usage sample per recorded call (units as supplied by caller).
    memory_usage: Vec<usize>,
    // Total number of recorded executions.
    call_count: usize,
}
66
67impl LayerProfile {
68    /// Get forward execution times
69    pub fn forward_times(&self) -> &Vec<Duration> {
70        &self.forward_times
71    }
72
73    /// Get backward execution times
74    pub fn backward_times(&self) -> &Vec<Duration> {
75        &self.backward_times
76    }
77
78    /// Get memory usage samples
79    pub fn memory_usage(&self) -> &Vec<usize> {
80        &self.memory_usage
81    }
82
83    /// Get total number of calls
84    pub fn call_count(&self) -> usize {
85        self.call_count
86    }
87}
88
89impl Profiler {
    /// Create a new profiler
    ///
    /// All collections start empty; `start_time` is unset until `start` is
    /// called. GPU profiling is best-effort: if `GpuProfiler::new()` fails,
    /// the profiler still works with `gpu_profiler = None`.
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None, // set by `start`
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            // Enhanced profiling features
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            // Initialization errors degrade to `None` rather than failing.
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
111
112    /// Start profiling session
113    pub async fn start(&mut self) -> Result<()> {
114        tracing::info!("Starting performance profiler");
115        self.start_time = Some(Instant::now());
116        self.take_memory_snapshot();
117        Ok(())
118    }
119
120    /// Get reference to profiling events
121    pub fn get_events(&self) -> &Vec<ProfileEvent> {
122        &self.events
123    }
124
125    /// Start timing a function or operation
126    pub fn start_timer(&mut self, name: &str) {
127        self.active_timers.insert(name.to_string(), Instant::now());
128    }
129
130    /// End timing and record the event
131    pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
132        if let Some(start_time) = self.active_timers.remove(name) {
133            let duration = start_time.elapsed();
134
135            // Record basic function call event
136            self.events.push(ProfileEvent::FunctionCall {
137                function_name: name.to_string(),
138                duration,
139                memory_delta: 0, // Would need actual memory tracking
140            });
141
142            Some(duration)
143        } else {
144            tracing::warn!("Timer '{}' was not started", name);
145            None
146        }
147    }
148
149    /// Record layer execution timing
150    pub fn record_layer_execution(
151        &mut self,
152        layer_name: &str,
153        layer_type: &str,
154        forward_time: Duration,
155        backward_time: Option<Duration>,
156        memory_usage: usize,
157        parameter_count: usize,
158    ) {
159        // Record event
160        self.events.push(ProfileEvent::LayerExecution {
161            layer_name: layer_name.to_string(),
162            layer_type: layer_type.to_string(),
163            forward_time,
164            backward_time,
165            memory_usage,
166            parameter_count,
167        });
168
169        // Update layer profile
170        let profile =
171            self.layer_profiles
172                .entry(layer_name.to_string())
173                .or_insert_with(|| LayerProfile {
174                    layer_name: layer_name.to_string(),
175                    forward_times: Vec::new(),
176                    backward_times: Vec::new(),
177                    memory_usage: Vec::new(),
178                    call_count: 0,
179                });
180
181        profile.forward_times.push(forward_time);
182        if let Some(backward) = backward_time {
183            profile.backward_times.push(backward);
184        }
185        profile.memory_usage.push(memory_usage);
186        profile.call_count += 1;
187    }
188
189    /// Record tensor operation timing
190    pub fn record_tensor_operation(
191        &mut self,
192        operation: &str,
193        tensor_shape: &[usize],
194        duration: Duration,
195        memory_allocated: usize,
196    ) {
197        self.events.push(ProfileEvent::TensorOperation {
198            operation: operation.to_string(),
199            tensor_shape: tensor_shape.to_vec(),
200            duration,
201            memory_allocated,
202        });
203    }
204
205    /// Record model inference timing
206    pub fn record_model_inference(
207        &mut self,
208        batch_size: usize,
209        sequence_length: usize,
210        duration: Duration,
211    ) {
212        let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
213
214        self.events.push(ProfileEvent::ModelInference {
215            batch_size,
216            sequence_length,
217            duration,
218            tokens_per_second,
219        });
220    }
221
222    /// Record gradient computation timing
223    pub fn record_gradient_computation(
224        &mut self,
225        layer_name: &str,
226        gradient_norm: f64,
227        duration: Duration,
228    ) {
229        self.events.push(ProfileEvent::GradientComputation {
230            layer_name: layer_name.to_string(),
231            gradient_norm,
232            duration,
233        });
234    }
235
236    /// Take a memory usage snapshot
237    pub fn take_memory_snapshot(&mut self) {
238        // Simplified memory tracking - in practice would use system APIs
239        let snapshot = MemorySnapshot {
240            timestamp: chrono::Utc::now(),
241            heap_allocated: 0, // Would get from system
242            heap_used: 0,
243            stack_size: 0,
244            gpu_allocated: None,
245            gpu_used: None,
246        };
247
248        self.memory_snapshots.push(snapshot);
249
250        // Keep only recent snapshots to prevent memory growth
251        if self.memory_snapshots.len() > 1000 {
252            self.memory_snapshots.drain(0..500);
253        }
254    }
255
256    /// Analyze performance and detect bottlenecks
257    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
258        self.bottlenecks.clear();
259
260        // Analyze layer execution times
261        self.analyze_layer_bottlenecks();
262
263        // Analyze memory usage patterns
264        self.analyze_memory_bottlenecks();
265
266        // Analyze tensor operation efficiency
267        self.analyze_tensor_bottlenecks();
268
269        self.bottlenecks.clone()
270    }
271
272    /// Get profiling statistics
273    pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
274        let mut stats = HashMap::new();
275
276        // Group events by type
277        let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
278
279        for event in &self.events {
280            let event_type = match event {
281                ProfileEvent::FunctionCall { .. } => "FunctionCall",
282                ProfileEvent::LayerExecution { .. } => "LayerExecution",
283                ProfileEvent::TensorOperation { .. } => "TensorOperation",
284                ProfileEvent::ModelInference { .. } => "ModelInference",
285                ProfileEvent::GradientComputation { .. } => "GradientComputation",
286            };
287
288            grouped_events.entry(event_type.to_string()).or_default().push(event);
289        }
290
291        // Calculate statistics for each event type
292        for (event_type, events) in grouped_events {
293            let durations: Vec<Duration> = events
294                .iter()
295                .filter_map(|event| match event {
296                    ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
297                    ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
298                    ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
299                    ProfileEvent::ModelInference { duration, .. } => Some(*duration),
300                    ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
301                })
302                .collect();
303
304            if !durations.is_empty() {
305                let total_duration: Duration = durations.iter().sum();
306                let avg_duration = total_duration / durations.len() as u32;
307                let min_duration = durations.iter().min().copied().unwrap_or_default();
308                let max_duration = durations.iter().max().copied().unwrap_or_default();
309
310                stats.insert(
311                    event_type.clone(),
312                    ProfileStats {
313                        event_type,
314                        count: durations.len(),
315                        total_duration,
316                        avg_duration,
317                        min_duration,
318                        max_duration,
319                        total_memory: 0, // Simplified
320                        avg_memory: 0.0,
321                    },
322                );
323            }
324        }
325
326        stats
327    }
328
329    /// Get layer-specific performance profiles
330    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
331        &self.layer_profiles
332    }
333
334    /// Get memory usage over time
335    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
336        &self.memory_snapshots
337    }
338
339    /// Generate performance report
340    pub async fn generate_report(&self) -> Result<ProfilerReport> {
341        let statistics = self.get_statistics();
342        let bottlenecks = self.bottlenecks.clone();
343        let total_events = self.events.len();
344
345        let total_runtime =
346            if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };
347
348        // Calculate slowest layers
349        let slowest_layers = self.get_slowest_layers(5);
350
351        // Memory efficiency analysis
352        let memory_efficiency = self.analyze_memory_efficiency();
353
354        Ok(ProfilerReport {
355            total_events,
356            total_runtime,
357            statistics,
358            bottlenecks,
359            slowest_layers,
360            memory_efficiency,
361            recommendations: self.generate_performance_recommendations(),
362        })
363    }
364
    /// Clear all profiling data, returning the profiler to a fresh state.
    ///
    /// The shared memory tracker is reset in place (skipped silently if its
    /// mutex is poisoned) and the I/O monitor is replaced wholesale.
    pub fn clear(&mut self) {
        self.events.clear();
        self.active_timers.clear();
        self.memory_snapshots.clear();
        self.layer_profiles.clear();
        self.bottlenecks.clear();
        self.start_time = None;
        // Clear enhanced profiling data
        self.gpu_kernel_profiles.clear();
        self.memory_allocations.clear();
        self.layer_latency_profiles.clear();
        self.io_profiles.clear();
        self.cpu_bottleneck_analysis.clear();
        if let Ok(mut tracker) = self.memory_tracker.lock() {
            *tracker = MemoryTracker::new();
        }
        self.io_monitor = IoMonitor::new();
    }
384
385    // Enhanced profiling methods
386
387    /// Profile GPU kernel execution
388    pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
389        if let Some(ref mut gpu_profiler) = self.gpu_profiler {
390            gpu_profiler.profile_kernel(kernel_profile.clone());
391        }
392        self.gpu_kernel_profiles.push(kernel_profile);
393    }
394
395    /// Track memory allocation
396    pub fn track_memory_allocation(
397        &mut self,
398        size_bytes: usize,
399        allocation_type: MemoryAllocationType,
400        device_id: Option<i32>,
401        stack_trace: Vec<String>,
402    ) -> Uuid {
403        let allocation_id = Uuid::new_v4();
404        let allocation = MemoryAllocation {
405            allocation_id,
406            size_bytes,
407            allocation_type,
408            device_id,
409            timestamp: SystemTime::now(),
410            stack_trace,
411            freed: false,
412            free_timestamp: None,
413        };
414
415        if let Ok(mut tracker) = self.memory_tracker.lock() {
416            tracker.track_allocation(allocation.clone());
417        }
418
419        self.memory_allocations.insert(allocation_id, allocation);
420        allocation_id
421    }
422
423    /// Track memory deallocation
424    pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
425        if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
426            allocation.freed = true;
427            allocation.free_timestamp = Some(SystemTime::now());
428        }
429
430        if let Ok(mut tracker) = self.memory_tracker.lock() {
431            tracker.track_deallocation(allocation_id);
432        }
433    }
434
435    /// Profile layer latency with detailed breakdown
436    pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
437        self.layer_latency_profiles
438            .insert(layer_latency.layer_name.clone(), layer_latency);
439    }
440
441    /// Start I/O operation profiling
442    pub fn start_io_profiling(
443        &mut self,
444        operation_type: IoOperationType,
445        bytes_expected: usize,
446    ) -> Uuid {
447        self.io_monitor.start_io_operation(operation_type, bytes_expected)
448    }
449
450    /// Finish I/O operation profiling
451    pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
452        if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
453        {
454            self.io_profiles.push(profile);
455        }
456    }
457
458    /// Analyze CPU bottlenecks
459    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
460        // Simplified CPU bottleneck analysis
461        // In practice, this would use system profiling APIs
462        let analysis = CpuBottleneckAnalysis {
463            thread_id: 0, // Use 0 as placeholder since thread::current().id().as_u64() is unstable
464            cpu_usage: 0.75, // Simplified
465            context_switches: 1000,
466            cache_misses: 500,
467            instructions_per_cycle: 2.5,
468            branch_mispredictions: 100,
469            hot_functions: vec![
470                HotFunction {
471                    function_name: "tensor_multiply".to_string(),
472                    self_time_percentage: 25.0,
473                    call_count: 1000,
474                    avg_time_per_call: Duration::from_micros(250),
475                },
476                HotFunction {
477                    function_name: "gradient_computation".to_string(),
478                    self_time_percentage: 20.0,
479                    call_count: 500,
480                    avg_time_per_call: Duration::from_micros(400),
481                },
482            ],
483            bottleneck_score: 0.6,
484        };
485
486        self.cpu_bottleneck_analysis.push(analysis.clone());
487        vec![analysis]
488    }
489
490    /// Get memory allocation statistics
491    pub fn get_memory_stats(&self) -> Option<MemoryStats> {
492        if let Ok(tracker) = self.memory_tracker.lock() {
493            Some(tracker.get_memory_stats())
494        } else {
495            None
496        }
497    }
498
499    /// Get GPU utilization metrics
500    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
501        self.gpu_profiler
502            .as_ref()
503            .map(|profiler| profiler.get_gpu_utilization(device_id))
504    }
505
506    /// Get I/O bandwidth statistics
507    pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
508        let mut stats = HashMap::new();
509
510        stats.insert(
511            IoDeviceType::SSD,
512            self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
513        );
514        stats.insert(
515            IoDeviceType::HDD,
516            self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
517        );
518        stats.insert(
519            IoDeviceType::Network,
520            self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
521        );
522        stats.insert(
523            IoDeviceType::Memory,
524            self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
525        );
526        stats.insert(
527            IoDeviceType::Cache,
528            self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
529        );
530
531        stats
532    }
533
534    /// Get layer latency analysis
535    pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
536        self.layer_latency_profiles
537            .values()
538            .map(|profile| LayerLatencyAnalysis {
539                layer_name: profile.layer_name.clone(),
540                layer_type: profile.layer_type.clone(),
541                total_time: profile.cpu_time
542                    + profile.gpu_time
543                    + profile.memory_copy_time
544                    + profile.sync_time,
545                cpu_percentage: profile.cpu_time.as_secs_f64()
546                    / (profile.cpu_time
547                        + profile.gpu_time
548                        + profile.memory_copy_time
549                        + profile.sync_time)
550                        .as_secs_f64()
551                    * 100.0,
552                gpu_percentage: profile.gpu_time.as_secs_f64()
553                    / (profile.cpu_time
554                        + profile.gpu_time
555                        + profile.memory_copy_time
556                        + profile.sync_time)
557                        .as_secs_f64()
558                    * 100.0,
559                memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
560                    / (profile.cpu_time
561                        + profile.gpu_time
562                        + profile.memory_copy_time
563                        + profile.sync_time)
564                        .as_secs_f64()
565                    * 100.0,
566                flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
567                    profile.flops as f64 / profile.gpu_time.as_secs_f64()
568                } else {
569                    0.0
570                },
571                memory_bandwidth_utilization: profile.cache_hit_rate,
572                bottleneck_type: self.identify_layer_bottleneck(profile),
573            })
574            .collect()
575    }
576
577    /// Get comprehensive performance analysis
578    pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
579        let memory_stats = self.get_memory_stats();
580        let io_bandwidth_stats = self.get_io_bandwidth_stats();
581        let layer_analysis = self.get_layer_latency_analysis();
582
583        let gpu_utilization =
584            self.gpu_profiler.as_ref().map(|profiler| profiler.get_gpu_utilization(0));
585
586        PerformanceAnalysis {
587            memory_stats,
588            io_bandwidth_stats,
589            layer_analysis,
590            gpu_utilization,
591            cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
592            total_gpu_kernels: self.gpu_kernel_profiles.len(),
593            total_io_operations: self.io_profiles.len(),
594            performance_score: self.calculate_overall_performance_score(),
595            recommendations: self.generate_enhanced_recommendations(),
596        }
597    }
598
    /// Classify the dominant cost of a layer from its latency breakdown.
    ///
    /// Checks are ordered by priority: memory copies dominating (> 1/2 of
    /// total), then synchronization (> 1/3 of total), then GPU time dwarfing
    /// CPU time (> 10x); everything else is classified as CPU compute.
    fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
        let total_time =
            profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;

        if profile.memory_copy_time > total_time / 2 {
            "Memory Bandwidth".to_string()
        } else if profile.sync_time > total_time / 3 {
            "GPU Synchronization".to_string()
        } else if profile.gpu_time > profile.cpu_time * 10 {
            "GPU Compute".to_string()
        } else {
            "CPU Compute".to_string()
        }
    }
613
614    fn calculate_overall_performance_score(&self) -> f64 {
615        let mut score: f64 = 100.0;
616
617        // Deduct for bottlenecks
618        for bottleneck in &self.bottlenecks {
619            match bottleneck.severity {
620                BottleneckSeverity::Critical => score -= 20.0,
621                BottleneckSeverity::High => score -= 10.0,
622                BottleneckSeverity::Medium => score -= 5.0,
623                BottleneckSeverity::Low => score -= 2.0,
624            }
625        }
626
627        // Deduct for poor GPU utilization
628        if let Some(gpu_util) = self.get_gpu_utilization(0) {
629            if gpu_util < 0.5 {
630                score -= 15.0;
631            } else if gpu_util < 0.7 {
632                score -= 8.0;
633            }
634        }
635
636        // Deduct for memory inefficiency
637        if let Some(memory_stats) = self.get_memory_stats() {
638            if memory_stats.memory_efficiency < 0.8 {
639                score -= 10.0;
640            }
641        }
642
643        score.max(0.0)
644    }
645
646    fn generate_enhanced_recommendations(&self) -> Vec<String> {
647        let mut recommendations = Vec::new();
648
649        // GPU utilization recommendations
650        if let Some(gpu_util) = self.get_gpu_utilization(0) {
651            if gpu_util < 0.5 {
652                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
653            }
654        }
655
656        // Memory recommendations
657        if let Some(memory_stats) = self.get_memory_stats() {
658            if memory_stats.memory_efficiency < 0.8 {
659                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
660            }
661
662            if memory_stats.active_allocations > 10000 {
663                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
664            }
665        }
666
667        // I/O recommendations
668        let io_stats = self.get_io_bandwidth_stats();
669        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
670            if ssd_bandwidth < 100.0 {
671                // Less than 100 MB/s
672                recommendations.push(
673                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
674                        .to_string(),
675                );
676            }
677        }
678
679        // Layer-specific recommendations
680        let layer_analysis = self.get_layer_latency_analysis();
681        for analysis in &layer_analysis {
682            if analysis.memory_copy_percentage > 50.0 {
683                recommendations.push(format!(
684                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
685                    analysis.layer_name
686                ));
687            }
688
689            if analysis.cpu_percentage > 80.0 {
690                recommendations.push(format!(
691                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
692                    analysis.layer_name
693                ));
694            }
695        }
696
697        if recommendations.is_empty() {
698            recommendations
699                .push("Performance appears optimal based on current analysis.".to_string());
700        }
701
702        recommendations
703    }
704
705    // Private analysis methods
706
707    fn analyze_layer_bottlenecks(&mut self) {
708        for (layer_name, profile) in &self.layer_profiles {
709            if profile.forward_times.is_empty() {
710                continue;
711            }
712
713            let avg_forward_time =
714                profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
715
716            // Consider a layer slow if it takes more than 100ms on average
717            if avg_forward_time.as_millis() > 100 {
718                let mut metrics = HashMap::new();
719                metrics.insert(
720                    "avg_forward_time_ms".to_string(),
721                    avg_forward_time.as_millis() as f64,
722                );
723                metrics.insert("call_count".to_string(), profile.call_count as f64);
724
725                self.bottlenecks.push(PerformanceBottleneck {
726                    bottleneck_type: BottleneckType::ModelComputation,
727                    location: layer_name.clone(),
728                    severity: if avg_forward_time.as_millis() > 500 {
729                        BottleneckSeverity::High
730                    } else {
731                        BottleneckSeverity::Medium
732                    },
733                    description: format!(
734                        "Layer '{}' has slow forward pass: {:.1}ms average",
735                        layer_name,
736                        avg_forward_time.as_millis()
737                    ),
738                    suggestion: "Consider optimizing layer implementation or reducing layer size"
739                        .to_string(),
740                    metrics,
741                });
742            }
743        }
744    }
745
746    fn analyze_memory_bottlenecks(&mut self) {
747        if self.memory_snapshots.len() < 2 {
748            return;
749        }
750
751        // Check for memory growth trend
752        let recent_snapshots = if self.memory_snapshots.len() > 10 {
753            &self.memory_snapshots[self.memory_snapshots.len() - 10..]
754        } else {
755            &self.memory_snapshots
756        };
757
758        if recent_snapshots.len() >= 5 {
759            let initial_memory = recent_snapshots[0].heap_allocated;
760            let final_memory = recent_snapshots
761                .last()
762                .expect("recent_snapshots has at least 5 elements")
763                .heap_allocated;
764
765            if final_memory > initial_memory * 2 {
766                let mut metrics = HashMap::new();
767                metrics.insert(
768                    "initial_memory_mb".to_string(),
769                    initial_memory as f64 / (1024.0 * 1024.0),
770                );
771                metrics.insert(
772                    "final_memory_mb".to_string(),
773                    final_memory as f64 / (1024.0 * 1024.0),
774                );
775                metrics.insert(
776                    "growth_ratio".to_string(),
777                    final_memory as f64 / initial_memory as f64,
778                );
779
780                self.bottlenecks.push(PerformanceBottleneck {
781                    bottleneck_type: BottleneckType::MemoryBound,
782                    location: "Memory Usage".to_string(),
783                    severity: BottleneckSeverity::High,
784                    description: "Significant memory growth detected during profiling".to_string(),
785                    suggestion: "Check for memory leaks or inefficient memory usage patterns"
786                        .to_string(),
787                    metrics,
788                });
789            }
790        }
791    }
792
793    fn analyze_tensor_bottlenecks(&mut self) {
794        // Group tensor operations by type
795        let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
796
797        for event in &self.events {
798            if let ProfileEvent::TensorOperation {
799                operation,
800                duration,
801                ..
802            } = event
803            {
804                operation_groups.entry(operation.clone()).or_default().push(*duration);
805            }
806        }
807
808        // Find slow operations
809        for (operation, durations) in operation_groups {
810            if durations.is_empty() {
811                continue;
812            }
813
814            let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
815            let total_time = durations.iter().sum::<Duration>();
816
817            // Consider operation slow if it takes more than 10ms on average
818            if avg_duration.as_millis() > 10 {
819                let mut metrics = HashMap::new();
820                metrics.insert(
821                    "avg_duration_ms".to_string(),
822                    avg_duration.as_millis() as f64,
823                );
824                metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
825                metrics.insert("call_count".to_string(), durations.len() as f64);
826
827                self.bottlenecks.push(PerformanceBottleneck {
828                    bottleneck_type: BottleneckType::CpuBound,
829                    location: format!("Tensor Operation: {}", operation),
830                    severity: if avg_duration.as_millis() > 50 {
831                        BottleneckSeverity::High
832                    } else {
833                        BottleneckSeverity::Medium
834                    },
835                    description: format!(
836                        "Tensor operation '{}' is slow: {:.1}ms average",
837                        operation,
838                        avg_duration.as_millis()
839                    ),
840                    suggestion:
841                        "Consider optimizing tensor operation or using different data types"
842                            .to_string(),
843                    metrics,
844                });
845            }
846        }
847    }
848
849    fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
850        let mut layer_times: Vec<(String, Duration)> = self
851            .layer_profiles
852            .iter()
853            .map(|(name, profile)| {
854                let avg_time = if profile.forward_times.is_empty() {
855                    Duration::ZERO
856                } else {
857                    profile.forward_times.iter().sum::<Duration>()
858                        / profile.forward_times.len() as u32
859                };
860                (name.clone(), avg_time)
861            })
862            .collect();
863
864        layer_times.sort_by_key(|item| std::cmp::Reverse(item.1));
865        layer_times.truncate(limit);
866        layer_times
867    }
868
869    fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
870        if self.memory_snapshots.is_empty() {
871            return MemoryEfficiencyAnalysis::default();
872        }
873
874        let memory_values: Vec<usize> =
875            self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
876
877        let max_memory = memory_values.iter().max().copied().unwrap_or(0);
878        let min_memory = memory_values.iter().min().copied().unwrap_or(0);
879        let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
880
881        MemoryEfficiencyAnalysis {
882            peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
883            min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
884            avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
885            memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
886            efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
887        }
888    }
889
890    fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
891        if values.len() < 2 {
892            return 0.0;
893        }
894
895        let variance_sum: f64 = values
896            .iter()
897            .map(|&x| {
898                let diff = x as f64 - mean as f64;
899                diff * diff
900            })
901            .sum();
902
903        variance_sum / (values.len() - 1) as f64
904    }
905
906    fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
907        if values.is_empty() {
908            return 0.0;
909        }
910
911        let max_memory = values.iter().max().copied().unwrap_or(0);
912        let min_memory = values.iter().min().copied().unwrap_or(0);
913
914        if max_memory == 0 {
915            return 100.0;
916        }
917
918        // Efficiency score: closer to 100% means more stable memory usage
919        100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
920    }
921
922    fn generate_performance_recommendations(&self) -> Vec<String> {
923        let mut recommendations = Vec::new();
924
925        // Analyze bottlenecks for recommendations
926        for bottleneck in &self.bottlenecks {
927            match bottleneck.bottleneck_type {
928                BottleneckType::ModelComputation => {
929                    recommendations.push(
930                        "Consider model architecture optimizations or layer fusion".to_string(),
931                    );
932                },
933                BottleneckType::MemoryBound => {
934                    recommendations.push(
935                        "Optimize memory usage with gradient checkpointing or model parallelism"
936                            .to_string(),
937                    );
938                },
939                BottleneckType::CpuBound => {
940                    recommendations.push(
941                        "Consider GPU acceleration or optimized CPU implementations".to_string(),
942                    );
943                },
944                _ => {},
945            }
946        }
947
948        // General recommendations based on profiling data
949        if self.events.len() > 10000 {
950            recommendations.push(
951                "High number of profiling events - consider reducing profiling overhead"
952                    .to_string(),
953            );
954        }
955
956        let stats = self.get_statistics();
957        if let Some(layer_stats) = stats.get("LayerExecution") {
958            if layer_stats.avg_duration.as_millis() > 50 {
959                recommendations.push(
960                    "Average layer execution time is high - consider layer optimization"
961                        .to_string(),
962                );
963            }
964        }
965
966        if recommendations.is_empty() {
967            recommendations
968                .push("Performance appears optimal based on current profiling data".to_string());
969        }
970
971        recommendations
972    }
973
974    /// Generate enhanced profiler report with advanced metrics
975    pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
976        let basic_report = self.generate_report().await?;
977        let performance_analysis = self.get_performance_analysis();
978
979        let gpu_kernel_summary = self.generate_gpu_kernel_summary();
980        let memory_allocation_summary = self.generate_memory_allocation_summary();
981        let io_performance_summary = self.generate_io_performance_summary();
982
983        Ok(EnhancedProfilerReport {
984            basic_report,
985            performance_analysis,
986            gpu_kernel_summary,
987            memory_allocation_summary,
988            io_performance_summary,
989        })
990    }
991
992    fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
993        let total_kernels = self.gpu_kernel_profiles.len();
994        let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();
995
996        let avg_occupancy = if total_kernels > 0 {
997            self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
998        } else {
999            0.0
1000        };
1001
1002        let avg_compute_utilization = if total_kernels > 0 {
1003            self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
1004                / total_kernels as f64
1005        } else {
1006            0.0
1007        };
1008
1009        let mut kernels_by_time: Vec<_> = self
1010            .gpu_kernel_profiles
1011            .iter()
1012            .map(|k| (k.kernel_name.clone(), k.execution_time))
1013            .collect();
1014        kernels_by_time.sort_by_key(|item| std::cmp::Reverse(item.1));
1015
1016        let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();
1017
1018        GpuKernelSummary {
1019            total_kernels,
1020            total_execution_time,
1021            avg_occupancy,
1022            avg_compute_utilization,
1023            slowest_kernels,
1024        }
1025    }
1026
1027    fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
1028        let total_allocations = self.memory_allocations.len();
1029        let peak_memory_usage =
1030            self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);
1031
1032        let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
1033            stats.memory_efficiency
1034        } else {
1035            1.0
1036        };
1037
1038        let mut allocations_by_size: Vec<_> = self
1039            .memory_allocations
1040            .values()
1041            .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
1042            .collect();
1043        allocations_by_size.sort_by_key(|item| std::cmp::Reverse(item.1));
1044
1045        let largest_allocations =
1046            allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();
1047
1048        let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();
1049
1050        MemoryAllocationSummary {
1051            total_allocations,
1052            peak_memory_usage,
1053            memory_efficiency,
1054            largest_allocations,
1055            memory_leaks,
1056        }
1057    }
1058
1059    fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
1060        let total_operations = self.io_profiles.len();
1061        let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();
1062
1063        let avg_bandwidth_by_device = self.get_io_bandwidth_stats();
1064
1065        let mut operations_by_duration: Vec<_> = self
1066            .io_profiles
1067            .iter()
1068            .map(|io| {
1069                (
1070                    format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
1071                    io.duration,
1072                )
1073            })
1074            .collect();
1075        operations_by_duration.sort_by_key(|item| std::cmp::Reverse(item.1));
1076
1077        let slowest_operations =
1078            operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();
1079
1080        IoPerformanceSummary {
1081            total_operations,
1082            total_bytes_transferred,
1083            avg_bandwidth_by_device,
1084            slowest_operations,
1085        }
1086    }
1087}
1088
/// Scoped timer for automatic timing
///
/// RAII guard: `ScopedTimer::new` starts a named timer on the borrowed
/// `Profiler`, and the `Drop` impl stops it when the guard leaves scope.
pub struct ScopedTimer<'a> {
    // Exclusive borrow of the profiler for the guard's whole lifetime;
    // note this prevents any other profiler use while the timer is live.
    profiler: &'a mut Profiler,
    // Timer name passed to both `start_timer` and `end_timer`.
    name: String,
}
1094
1095impl<'a> ScopedTimer<'a> {
1096    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
1097        profiler.start_timer(&name);
1098        Self { profiler, name }
1099    }
1100}
1101
// Stops the timer when the guard goes out of scope, recording the elapsed
// duration as a profiler event.
impl<'a> Drop for ScopedTimer<'a> {
    fn drop(&mut self) {
        // end_timer returns None (no-op) if the timer was never started.
        self.profiler.end_timer(&self.name);
    }
}
1107
/// Macro for convenient timing
///
/// Expands to a `ScopedTimer` bound to `_timer`, so the timer covers the
/// remainder of the enclosing scope and stops automatically at scope exit.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}
1115
#[cfg(test)]
mod tests {
    use super::*;

    // Shared helper: default debug configuration used by every test.
    fn make_config() -> DebugConfig {
        DebugConfig::default()
    }

    // --- Profiler tests ---

    // A fresh profiler starts with no events, timers, or start time.
    #[test]
    fn test_profiler_new() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        assert!(profiler.events.is_empty());
        assert!(profiler.active_timers.is_empty());
        assert!(profiler.start_time.is_none());
    }

    #[test]
    fn test_profiler_start_end_timer() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.start_timer("test_op");
        let duration = profiler.end_timer("test_op");
        assert!(duration.is_some());
        assert_eq!(profiler.events.len(), 1);
    }

    // Ending a timer that was never started must be a silent no-op.
    #[test]
    fn test_profiler_end_timer_not_started() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        let duration = profiler.end_timer("nonexistent");
        assert!(duration.is_none());
    }

    #[test]
    fn test_profiler_record_layer_execution() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_layer_execution(
            "attention",
            "self_attention",
            Duration::from_millis(50),
            Some(Duration::from_millis(30)),
            1024,
            1000,
        );
        assert_eq!(profiler.events.len(), 1);
        let profiles = profiler.get_layer_profiles();
        assert!(profiles.contains_key("attention"));
        let lp = &profiles["attention"];
        assert_eq!(lp.call_count(), 1);
        assert_eq!(lp.forward_times().len(), 1);
        assert_eq!(lp.backward_times().len(), 1);
    }

    #[test]
    fn test_profiler_record_tensor_operation() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_tensor_operation("matmul", &[64, 128], Duration::from_micros(200), 8192);
        assert_eq!(profiler.events.len(), 1);
    }

    #[test]
    fn test_profiler_record_model_inference() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_model_inference(32, 512, Duration::from_millis(100));
        assert_eq!(profiler.events.len(), 1);
    }

    #[test]
    fn test_profiler_record_gradient_computation() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_gradient_computation("fc1", 0.5, Duration::from_millis(10));
        assert_eq!(profiler.events.len(), 1);
    }

    #[test]
    fn test_profiler_get_statistics_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let stats = profiler.get_statistics();
        assert!(stats.is_empty());
    }

    // Statistics are grouped by event variant name (e.g. "ModelInference").
    #[test]
    fn test_profiler_get_statistics_with_events() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_model_inference(8, 256, Duration::from_millis(50));
        profiler.record_model_inference(8, 256, Duration::from_millis(100));
        let stats = profiler.get_statistics();
        assert!(stats.contains_key("ModelInference"));
        let mi_stats = &stats["ModelInference"];
        assert_eq!(mi_stats.count, 2);
    }

    #[test]
    fn test_profiler_clear() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.start_timer("op1");
        profiler.end_timer("op1");
        profiler.take_memory_snapshot();
        profiler.clear();
        assert!(profiler.events.is_empty());
        assert!(profiler.active_timers.is_empty());
        assert!(profiler.memory_snapshots.is_empty());
        assert!(profiler.start_time.is_none());
    }

    #[test]
    fn test_profiler_take_memory_snapshot() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.take_memory_snapshot();
        assert_eq!(profiler.get_memory_timeline().len(), 1);
    }

    #[test]
    fn test_profiler_memory_snapshot_limit() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        for _ in 0..1100 {
            profiler.take_memory_snapshot();
        }
        // Should trim to ~500 after exceeding 1000
        assert!(profiler.get_memory_timeline().len() <= 601);
    }

    #[test]
    fn test_profiler_analyze_performance_empty() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        let bottlenecks = profiler.analyze_performance();
        assert!(bottlenecks.is_empty());
    }

    // 600ms average layer time must be flagged as a bottleneck.
    #[test]
    fn test_profiler_analyze_performance_slow_layer() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        for _ in 0..5 {
            profiler.record_layer_execution(
                "slow_layer",
                "dense",
                Duration::from_millis(600),
                None,
                4096,
                10000,
            );
        }
        let bottlenecks = profiler.analyze_performance();
        assert!(!bottlenecks.is_empty());
    }

    #[test]
    fn test_profiler_get_slowest_layers() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_layer_execution(
            "fast_layer",
            "relu",
            Duration::from_millis(1),
            None,
            128,
            0,
        );
        profiler.record_layer_execution(
            "slow_layer",
            "dense",
            Duration::from_millis(200),
            None,
            4096,
            10000,
        );
        let slowest = profiler.get_slowest_layers(2);
        assert_eq!(slowest.len(), 2);
        assert_eq!(slowest[0].0, "slow_layer");
    }

    // NOTE(review): this expects score == 100.0 for an empty snapshot set,
    // which relies on MemoryEfficiencyAnalysis::default() — verify that the
    // Default impl sets efficiency_score to 100.0.
    #[test]
    fn test_profiler_memory_efficiency_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let analysis = profiler.analyze_memory_efficiency();
        assert!((analysis.efficiency_score - 100.0).abs() < 1e-9);
    }

    #[test]
    fn test_profiler_calculate_memory_variance() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let values = vec![100, 200, 300];
        let mean = 200;
        let variance = profiler.calculate_memory_variance(&values, mean);
        // variance = ((100-200)^2 + (200-200)^2 + (300-200)^2) / 2 = 10000
        assert!((variance - 10000.0).abs() < 1e-3);
    }

    #[test]
    fn test_profiler_calculate_memory_efficiency_score_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let score = profiler.calculate_memory_efficiency_score(&[]);
        assert!((score - 0.0).abs() < 1e-9);
    }

    // Constant usage scores a perfect 100.
    #[test]
    fn test_profiler_calculate_memory_efficiency_score_stable() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let values = vec![100, 100, 100];
        let score = profiler.calculate_memory_efficiency_score(&values);
        assert!((score - 100.0).abs() < 1e-9);
    }

    #[test]
    fn test_profiler_calculate_memory_efficiency_score_varied() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let values = vec![50, 100];
        let score = profiler.calculate_memory_efficiency_score(&values);
        // 100 * (1.0 - (100-50)/100) = 50.0
        assert!((score - 50.0).abs() < 1e-9);
    }

    #[test]
    fn test_profiler_overall_performance_score_no_bottlenecks() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let score = profiler.calculate_overall_performance_score();
        // Score starts at 100 but GPU utilization of 0.0 deducts 15 points
        assert!(score >= 50.0);
        assert!(score <= 100.0);
    }

    // Dominant memory_copy_time should classify as "Memory Bandwidth".
    #[test]
    fn test_profiler_identify_layer_bottleneck_memory() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let profile = LayerLatencyProfile {
            layer_name: "test".to_string(),
            layer_type: "dense".to_string(),
            input_shapes: vec![vec![32, 128]],
            output_shapes: vec![vec![32, 256]],
            cpu_time: Duration::from_millis(10),
            gpu_time: Duration::from_millis(10),
            memory_copy_time: Duration::from_millis(100),
            sync_time: Duration::from_millis(5),
            parameter_count: 1000,
            flops: 100000,
            memory_footprint_bytes: 4096,
            cache_hit_rate: 0.5,
        };
        let bottleneck = profiler.identify_layer_bottleneck(&profile);
        assert_eq!(bottleneck, "Memory Bandwidth");
    }

    // Dominant sync_time should classify as "GPU Synchronization".
    #[test]
    fn test_profiler_identify_layer_bottleneck_sync() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let profile = LayerLatencyProfile {
            layer_name: "test".to_string(),
            layer_type: "dense".to_string(),
            input_shapes: vec![],
            output_shapes: vec![],
            cpu_time: Duration::from_millis(10),
            gpu_time: Duration::from_millis(10),
            memory_copy_time: Duration::from_millis(5),
            sync_time: Duration::from_millis(50),
            parameter_count: 0,
            flops: 0,
            memory_footprint_bytes: 0,
            cache_hit_rate: 0.0,
        };
        let bottleneck = profiler.identify_layer_bottleneck(&profile);
        assert_eq!(bottleneck, "GPU Synchronization");
    }

    // One zeroed entry is expected per IoDeviceType variant (5 total).
    #[test]
    fn test_profiler_io_bandwidth_stats_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let stats = profiler.get_io_bandwidth_stats();
        assert_eq!(stats.len(), 5);
        for &val in stats.values() {
            assert!((val - 0.0).abs() < 1e-9);
        }
    }

    // Deallocation marks the record as freed but keeps it for reporting.
    #[test]
    fn test_profiler_track_memory_allocation_and_deallocation() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        let alloc_id = profiler.track_memory_allocation(
            4096,
            MemoryAllocationType::Host,
            None,
            vec!["test_frame".to_string()],
        );
        assert!(profiler.memory_allocations.contains_key(&alloc_id));
        profiler.track_memory_deallocation(alloc_id);
        let alloc = profiler.memory_allocations.get(&alloc_id);
        assert!(alloc.is_some());
        assert!(alloc.expect("allocation should exist").freed);
    }

    #[test]
    fn test_profiler_gpu_kernel_summary_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let summary = profiler.generate_gpu_kernel_summary();
        assert_eq!(summary.total_kernels, 0);
        assert!((summary.avg_occupancy - 0.0).abs() < 1e-9);
    }

    // An unfreed allocation counts as both peak usage and a leak.
    #[test]
    fn test_profiler_memory_allocation_summary() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        let _id = profiler.track_memory_allocation(
            1024,
            MemoryAllocationType::Device,
            Some(0),
            Vec::new(),
        );
        let summary = profiler.generate_memory_allocation_summary();
        assert_eq!(summary.total_allocations, 1);
        assert_eq!(summary.peak_memory_usage, 1024);
        assert_eq!(summary.memory_leaks, 1);
    }

    #[test]
    fn test_profiler_io_performance_summary_empty() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let summary = profiler.generate_io_performance_summary();
        assert_eq!(summary.total_operations, 0);
        assert_eq!(summary.total_bytes_transferred, 0);
    }

    #[test]
    fn test_profiler_analyze_cpu_bottlenecks() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        let result = profiler.analyze_cpu_bottlenecks();
        assert!(!result.is_empty());
        assert_eq!(result[0].hot_functions.len(), 2);
    }

    #[test]
    fn test_profiler_performance_analysis() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let analysis = profiler.get_performance_analysis();
        assert!(analysis.performance_score > 0.0);
        assert!(!analysis.recommendations.is_empty());
    }

    // With no data, recommendations fall back to the single "optimal" entry.
    #[test]
    fn test_profiler_generate_performance_recommendations_optimal() {
        let config = make_config();
        let profiler = Profiler::new(&config);
        let recs = profiler.generate_performance_recommendations();
        assert!(!recs.is_empty());
        assert!(recs[0].contains("optimal"));
    }

    #[test]
    fn test_layer_profile_accessors() {
        let config = make_config();
        let mut profiler = Profiler::new(&config);
        profiler.record_layer_execution(
            "layer1",
            "conv",
            Duration::from_millis(10),
            Some(Duration::from_millis(5)),
            512,
            100,
        );
        let profiles = profiler.get_layer_profiles();
        let lp = &profiles["layer1"];
        assert_eq!(lp.forward_times().len(), 1);
        assert_eq!(lp.backward_times().len(), 1);
        assert_eq!(lp.memory_usage(), &vec![512]);
        assert_eq!(lp.call_count(), 1);
    }
}