1pub mod events;
4pub mod gpu;
5pub mod io_monitor;
6pub mod memory;
7pub mod report;
8
9pub use events::{
11 BottleneckSeverity, BottleneckType, CpuBottleneckAnalysis, CpuProfile, HotFunction,
12 MemorySnapshot, PerformanceBottleneck, ProfileEvent, ProfileStats,
13};
14pub use gpu::{GpuKernelProfile, GpuKernelSummary, GpuMemoryPool, GpuProfiler};
15pub use io_monitor::{
16 BandwidthSample, IoDeviceType, IoMonitor, IoOperation, IoOperationType, IoPerformanceSummary,
17 IoProfile, LayerLatencyProfile,
18};
19pub use memory::{
20 MemoryAllocation, MemoryAllocationType, MemoryEfficiencyAnalysis, MemoryStats, MemoryTracker,
21};
22pub use report::{
23 EnhancedProfilerReport, LayerLatencyAnalysis, MemoryAllocationSummary, PerformanceAnalysis,
24 ProfilerReport,
25};
26
27use anyhow::Result;
28use std::collections::HashMap;
29use std::sync::{Arc, Mutex};
30use std::time::{Duration, Instant, SystemTime};
31use uuid::Uuid;
32
33use crate::DebugConfig;
34
/// Central performance profiler: collects timing events, per-layer
/// aggregates, memory snapshots, GPU kernel profiles, I/O profiles and
/// bottleneck analyses for later report generation.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    /// Raw event timeline, in insertion order.
    events: Vec<ProfileEvent>,
    /// Start instants of currently running named timers.
    active_timers: HashMap<String, Instant>,
    /// Rolling memory snapshot timeline (capped, see `take_memory_snapshot`).
    memory_snapshots: Vec<MemorySnapshot>,
    /// Set by `start()`; `None` until profiling has begun.
    start_time: Option<Instant>,
    /// Per-layer timing/memory aggregates keyed by layer name.
    layer_profiles: HashMap<String, LayerProfile>,
    /// Results of the most recent `analyze_performance()` run.
    bottlenecks: Vec<PerformanceBottleneck>,
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    /// Allocations keyed by the id returned from `track_memory_allocation`.
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    /// Latest latency profile per layer name (insert replaces).
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    /// Shared allocation tracker; mirrored alongside `memory_allocations`.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    /// `None` when no GPU profiler could be constructed.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
56
/// Aggregated timing and memory samples for a single named layer.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    /// One forward-pass duration per recorded call.
    forward_times: Vec<Duration>,
    /// Backward-pass durations; recorded only when the caller supplied one,
    /// so this may be shorter than `forward_times`.
    backward_times: Vec<Duration>,
    /// Per-call memory usage samples (units set by the caller — presumably
    /// bytes; TODO confirm).
    memory_usage: Vec<usize>,
    /// Total number of recorded executions for this layer.
    call_count: usize,
}
66
impl LayerProfile {
    /// All recorded forward-pass durations, one entry per call.
    pub fn forward_times(&self) -> &Vec<Duration> {
        &self.forward_times
    }

    /// Recorded backward-pass durations; may have fewer entries than
    /// `forward_times` since backward timing is optional per call.
    pub fn backward_times(&self) -> &Vec<Duration> {
        &self.backward_times
    }

    /// Per-call memory usage samples.
    pub fn memory_usage(&self) -> &Vec<usize> {
        &self.memory_usage
    }

    /// Number of recorded executions for this layer.
    pub fn call_count(&self) -> usize {
        self.call_count
    }
}
88
89impl Profiler {
    /// Creates a profiler with empty state.
    ///
    /// A GPU profiler is attached when one can be constructed
    /// (`GpuProfiler::new().ok()`); otherwise GPU-specific queries return
    /// `None` and GPU profiling degrades gracefully.
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }

    /// Marks the profiling start time and takes an initial memory snapshot.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }

    /// All events recorded so far, in insertion order.
    pub fn get_events(&self) -> &Vec<ProfileEvent> {
        &self.events
    }

    /// Starts (or restarts) a named wall-clock timer. Calling this again
    /// with the same name silently overwrites the previous start instant.
    pub fn start_timer(&mut self, name: &str) {
        self.active_timers.insert(name.to_string(), Instant::now());
    }
129
130 pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
132 if let Some(start_time) = self.active_timers.remove(name) {
133 let duration = start_time.elapsed();
134
135 self.events.push(ProfileEvent::FunctionCall {
137 function_name: name.to_string(),
138 duration,
139 memory_delta: 0, });
141
142 Some(duration)
143 } else {
144 tracing::warn!("Timer '{}' was not started", name);
145 None
146 }
147 }
148
149 pub fn record_layer_execution(
151 &mut self,
152 layer_name: &str,
153 layer_type: &str,
154 forward_time: Duration,
155 backward_time: Option<Duration>,
156 memory_usage: usize,
157 parameter_count: usize,
158 ) {
159 self.events.push(ProfileEvent::LayerExecution {
161 layer_name: layer_name.to_string(),
162 layer_type: layer_type.to_string(),
163 forward_time,
164 backward_time,
165 memory_usage,
166 parameter_count,
167 });
168
169 let profile =
171 self.layer_profiles
172 .entry(layer_name.to_string())
173 .or_insert_with(|| LayerProfile {
174 layer_name: layer_name.to_string(),
175 forward_times: Vec::new(),
176 backward_times: Vec::new(),
177 memory_usage: Vec::new(),
178 call_count: 0,
179 });
180
181 profile.forward_times.push(forward_time);
182 if let Some(backward) = backward_time {
183 profile.backward_times.push(backward);
184 }
185 profile.memory_usage.push(memory_usage);
186 profile.call_count += 1;
187 }
188
    /// Records a single tensor operation (e.g. a matmul) with its shape,
    /// duration, and memory allocated by the operation.
    pub fn record_tensor_operation(
        &mut self,
        operation: &str,
        tensor_shape: &[usize],
        duration: Duration,
        memory_allocated: usize,
    ) {
        self.events.push(ProfileEvent::TensorOperation {
            operation: operation.to_string(),
            tensor_shape: tensor_shape.to_vec(),
            duration,
            memory_allocated,
        });
    }
204
205 pub fn record_model_inference(
207 &mut self,
208 batch_size: usize,
209 sequence_length: usize,
210 duration: Duration,
211 ) {
212 let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
213
214 self.events.push(ProfileEvent::ModelInference {
215 batch_size,
216 sequence_length,
217 duration,
218 tokens_per_second,
219 });
220 }
221
    /// Records the duration and gradient norm of a backward computation for
    /// a single layer.
    pub fn record_gradient_computation(
        &mut self,
        layer_name: &str,
        gradient_norm: f64,
        duration: Duration,
    ) {
        self.events.push(ProfileEvent::GradientComputation {
            layer_name: layer_name.to_string(),
            gradient_norm,
            duration,
        });
    }

    /// Appends a memory snapshot to the timeline.
    ///
    /// NOTE(review): every heap/stack figure is currently hard-coded to
    /// zero — real memory sampling is not implemented yet.
    pub fn take_memory_snapshot(&mut self) {
        let snapshot = MemorySnapshot {
            timestamp: chrono::Utc::now(),
            heap_allocated: 0,
            heap_used: 0,
            stack_size: 0,
            gpu_allocated: None,
            gpu_used: None,
        };

        self.memory_snapshots.push(snapshot);

        // Cap the timeline: once it exceeds 1000 entries, drop the oldest
        // 500 so roughly the most recent half is retained.
        if self.memory_snapshots.len() > 1000 {
            self.memory_snapshots.drain(0..500);
        }
    }
255
    /// Re-runs all bottleneck analyses (layer, memory, tensor) from scratch
    /// and returns the detected bottlenecks.
    ///
    /// Previous results are discarded; the findings are also retained in
    /// `self.bottlenecks` for later report generation.
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        self.bottlenecks.clear();

        self.analyze_layer_bottlenecks();

        self.analyze_memory_bottlenecks();

        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
271
272 pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
274 let mut stats = HashMap::new();
275
276 let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
278
279 for event in &self.events {
280 let event_type = match event {
281 ProfileEvent::FunctionCall { .. } => "FunctionCall",
282 ProfileEvent::LayerExecution { .. } => "LayerExecution",
283 ProfileEvent::TensorOperation { .. } => "TensorOperation",
284 ProfileEvent::ModelInference { .. } => "ModelInference",
285 ProfileEvent::GradientComputation { .. } => "GradientComputation",
286 };
287
288 grouped_events.entry(event_type.to_string()).or_default().push(event);
289 }
290
291 for (event_type, events) in grouped_events {
293 let durations: Vec<Duration> = events
294 .iter()
295 .filter_map(|event| match event {
296 ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
297 ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
298 ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
299 ProfileEvent::ModelInference { duration, .. } => Some(*duration),
300 ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
301 })
302 .collect();
303
304 if !durations.is_empty() {
305 let total_duration: Duration = durations.iter().sum();
306 let avg_duration = total_duration / durations.len() as u32;
307 let min_duration = durations.iter().min().copied().unwrap_or_default();
308 let max_duration = durations.iter().max().copied().unwrap_or_default();
309
310 stats.insert(
311 event_type.clone(),
312 ProfileStats {
313 event_type,
314 count: durations.len(),
315 total_duration,
316 avg_duration,
317 min_duration,
318 max_duration,
319 total_memory: 0, avg_memory: 0.0,
321 },
322 );
323 }
324 }
325
326 stats
327 }
328
    /// Per-layer timing/memory aggregates keyed by layer name.
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }

    /// Chronological memory snapshot timeline.
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }

    /// Builds the basic profiler report from the data collected so far.
    ///
    /// Note: `bottlenecks` reflects the most recent `analyze_performance()`
    /// run; this method does not re-run the analyses itself.
    pub async fn generate_report(&self) -> Result<ProfilerReport> {
        let statistics = self.get_statistics();
        let bottlenecks = self.bottlenecks.clone();
        let total_events = self.events.len();

        // Zero runtime when the profiler was never started.
        let total_runtime =
            if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };

        let slowest_layers = self.get_slowest_layers(5);

        let memory_efficiency = self.analyze_memory_efficiency();

        Ok(ProfilerReport {
            total_events,
            total_runtime,
            statistics,
            bottlenecks,
            slowest_layers,
            memory_efficiency,
            recommendations: self.generate_performance_recommendations(),
        })
    }
364
    /// Resets the profiler to its freshly-constructed state, discarding all
    /// recorded data and replacing the memory tracker and I/O monitor.
    pub fn clear(&mut self) {
        self.events.clear();
        self.active_timers.clear();
        self.memory_snapshots.clear();
        self.layer_profiles.clear();
        self.bottlenecks.clear();
        self.start_time = None;
        self.gpu_kernel_profiles.clear();
        self.memory_allocations.clear();
        self.layer_latency_profiles.clear();
        self.io_profiles.clear();
        self.cpu_bottleneck_analysis.clear();
        // A poisoned mutex is silently skipped; the tracker then keeps its
        // old contents.
        if let Ok(mut tracker) = self.memory_tracker.lock() {
            *tracker = MemoryTracker::new();
        }
        self.io_monitor = IoMonitor::new();
    }
384
    /// Records a GPU kernel profile, forwarding a copy to the GPU profiler
    /// (when one is available) and keeping the original locally.
    pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
        if let Some(ref mut gpu_profiler) = self.gpu_profiler {
            gpu_profiler.profile_kernel(kernel_profile.clone());
        }
        self.gpu_kernel_profiles.push(kernel_profile);
    }

    /// Registers a new memory allocation and returns the id to pass to
    /// `track_memory_deallocation` later.
    ///
    /// The allocation is mirrored into the shared `MemoryTracker`; a
    /// poisoned tracker mutex is silently skipped.
    pub fn track_memory_allocation(
        &mut self,
        size_bytes: usize,
        allocation_type: MemoryAllocationType,
        device_id: Option<i32>,
        stack_trace: Vec<String>,
    ) -> Uuid {
        let allocation_id = Uuid::new_v4();
        let allocation = MemoryAllocation {
            allocation_id,
            size_bytes,
            allocation_type,
            device_id,
            timestamp: SystemTime::now(),
            stack_trace,
            freed: false,
            free_timestamp: None,
        };

        if let Ok(mut tracker) = self.memory_tracker.lock() {
            tracker.track_allocation(allocation.clone());
        }

        self.memory_allocations.insert(allocation_id, allocation);
        allocation_id
    }

    /// Marks an allocation as freed and records the free timestamp.
    /// Unknown ids are ignored; the shared tracker is still notified.
    pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
        if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
            allocation.freed = true;
            allocation.free_timestamp = Some(SystemTime::now());
        }

        if let Ok(mut tracker) = self.memory_tracker.lock() {
            tracker.track_deallocation(allocation_id);
        }
    }

    /// Stores a layer latency profile, replacing any previous profile
    /// recorded under the same layer name.
    pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
        self.layer_latency_profiles
            .insert(layer_latency.layer_name.clone(), layer_latency);
    }

    /// Begins tracking an I/O operation; the returned id must be passed to
    /// `finish_io_profiling` to complete the measurement.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }

    /// Completes a previously started I/O operation and stores the
    /// resulting profile. Ids unknown to the monitor are ignored.
    pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
        if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
        {
            self.io_profiles.push(profile);
        }
    }

    /// Produces a CPU bottleneck analysis and appends it to the history.
    ///
    /// NOTE(review): every figure here (usage, counters, hot functions,
    /// score) is a hard-coded placeholder — no real CPU sampling is
    /// performed yet. Replace with actual measurements before relying on
    /// the output.
    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
        let analysis = CpuBottleneckAnalysis {
            thread_id: 0,
            cpu_usage: 0.75,
            context_switches: 1000,
            cache_misses: 500,
            instructions_per_cycle: 2.5,
            branch_mispredictions: 100,
            hot_functions: vec![
                HotFunction {
                    function_name: "tensor_multiply".to_string(),
                    self_time_percentage: 25.0,
                    call_count: 1000,
                    avg_time_per_call: Duration::from_micros(250),
                },
                HotFunction {
                    function_name: "gradient_computation".to_string(),
                    self_time_percentage: 20.0,
                    call_count: 500,
                    avg_time_per_call: Duration::from_micros(400),
                },
            ],
            bottleneck_score: 0.6,
        };

        self.cpu_bottleneck_analysis.push(analysis.clone());
        vec![analysis]
    }
489
490 pub fn get_memory_stats(&self) -> Option<MemoryStats> {
492 if let Ok(tracker) = self.memory_tracker.lock() {
493 Some(tracker.get_memory_stats())
494 } else {
495 None
496 }
497 }
498
    /// GPU utilization for `device_id`, or `None` when no GPU profiler is
    /// available.
    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
        self.gpu_profiler
            .as_ref()
            .map(|profiler| profiler.get_gpu_utilization(device_id))
    }
505
506 pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
508 let mut stats = HashMap::new();
509
510 stats.insert(
511 IoDeviceType::SSD,
512 self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
513 );
514 stats.insert(
515 IoDeviceType::HDD,
516 self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
517 );
518 stats.insert(
519 IoDeviceType::Network,
520 self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
521 );
522 stats.insert(
523 IoDeviceType::Memory,
524 self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
525 );
526 stats.insert(
527 IoDeviceType::Cache,
528 self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
529 );
530
531 stats
532 }
533
534 pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
536 self.layer_latency_profiles
537 .values()
538 .map(|profile| LayerLatencyAnalysis {
539 layer_name: profile.layer_name.clone(),
540 layer_type: profile.layer_type.clone(),
541 total_time: profile.cpu_time
542 + profile.gpu_time
543 + profile.memory_copy_time
544 + profile.sync_time,
545 cpu_percentage: profile.cpu_time.as_secs_f64()
546 / (profile.cpu_time
547 + profile.gpu_time
548 + profile.memory_copy_time
549 + profile.sync_time)
550 .as_secs_f64()
551 * 100.0,
552 gpu_percentage: profile.gpu_time.as_secs_f64()
553 / (profile.cpu_time
554 + profile.gpu_time
555 + profile.memory_copy_time
556 + profile.sync_time)
557 .as_secs_f64()
558 * 100.0,
559 memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
560 / (profile.cpu_time
561 + profile.gpu_time
562 + profile.memory_copy_time
563 + profile.sync_time)
564 .as_secs_f64()
565 * 100.0,
566 flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
567 profile.flops as f64 / profile.gpu_time.as_secs_f64()
568 } else {
569 0.0
570 },
571 memory_bandwidth_utilization: profile.cache_hit_rate,
572 bottleneck_type: self.identify_layer_bottleneck(profile),
573 })
574 .collect()
575 }
576
    /// Assembles the combined performance analysis from memory, I/O,
    /// layer, GPU and CPU data collected so far.
    pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
        let memory_stats = self.get_memory_stats();
        let io_bandwidth_stats = self.get_io_bandwidth_stats();
        let layer_analysis = self.get_layer_latency_analysis();

        // GPU utilization is sampled for device 0 only.
        let gpu_utilization =
            self.gpu_profiler.as_ref().map(|profiler| profiler.get_gpu_utilization(0));

        PerformanceAnalysis {
            memory_stats,
            io_bandwidth_stats,
            layer_analysis,
            gpu_utilization,
            cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
            total_gpu_kernels: self.gpu_kernel_profiles.len(),
            total_io_operations: self.io_profiles.len(),
            performance_score: self.calculate_overall_performance_score(),
            recommendations: self.generate_enhanced_recommendations(),
        }
    }

    /// Classifies a layer's dominant cost with coarse heuristics, checked
    /// in priority order: memory copies dominating (> 1/2 of total), sync
    /// dominating (> 1/3), GPU time dwarfing CPU time (> 10x), else CPU.
    fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
        let total_time =
            profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;

        if profile.memory_copy_time > total_time / 2 {
            "Memory Bandwidth".to_string()
        } else if profile.sync_time > total_time / 3 {
            "GPU Synchronization".to_string()
        } else if profile.gpu_time > profile.cpu_time * 10 {
            "GPU Compute".to_string()
        } else {
            "CPU Compute".to_string()
        }
    }
613
    /// Derives a 0–100 performance score by subtracting penalties for
    /// detected bottlenecks, low GPU utilization (device 0), and poor
    /// memory efficiency. The result is clamped at 0.
    fn calculate_overall_performance_score(&self) -> f64 {
        let mut score: f64 = 100.0;

        // Each recorded bottleneck deducts according to its severity.
        for bottleneck in &self.bottlenecks {
            match bottleneck.severity {
                BottleneckSeverity::Critical => score -= 20.0,
                BottleneckSeverity::High => score -= 10.0,
                BottleneckSeverity::Medium => score -= 5.0,
                BottleneckSeverity::Low => score -= 2.0,
            }
        }

        // Utilization thresholds assume a 0..1 scale — TODO confirm against
        // GpuProfiler::get_gpu_utilization's actual range.
        if let Some(gpu_util) = self.get_gpu_utilization(0) {
            if gpu_util < 0.5 {
                score -= 15.0;
            } else if gpu_util < 0.7 {
                score -= 8.0;
            }
        }

        if let Some(memory_stats) = self.get_memory_stats() {
            if memory_stats.memory_efficiency < 0.8 {
                score -= 10.0;
            }
        }

        score.max(0.0)
    }
645
    /// Generates actionable recommendation strings from GPU utilization,
    /// memory statistics, I/O bandwidth, and per-layer latency analysis.
    /// Falls back to a single "appears optimal" message when nothing fires.
    fn generate_enhanced_recommendations(&self) -> Vec<String> {
        let mut recommendations = Vec::new();

        if let Some(gpu_util) = self.get_gpu_utilization(0) {
            if gpu_util < 0.5 {
                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
            }
        }

        if let Some(memory_stats) = self.get_memory_stats() {
            if memory_stats.memory_efficiency < 0.8 {
                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
            }

            if memory_stats.active_allocations > 10000 {
                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
            }
        }

        // Threshold of 100 presumably means 100 MB/s — TODO confirm the
        // unit returned by IoMonitor::get_average_bandwidth.
        let io_stats = self.get_io_bandwidth_stats();
        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
            if ssd_bandwidth < 100.0 {
                recommendations.push(
                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
                        .to_string(),
                );
            }
        }

        // Per-layer advice: flag memory-bandwidth-bound and CPU-bound layers.
        let layer_analysis = self.get_layer_latency_analysis();
        for analysis in &layer_analysis {
            if analysis.memory_copy_percentage > 50.0 {
                recommendations.push(format!(
                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
                    analysis.layer_name
                ));
            }

            if analysis.cpu_percentage > 80.0 {
                recommendations.push(format!(
                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
                    analysis.layer_name
                ));
            }
        }

        if recommendations.is_empty() {
            recommendations
                .push("Performance appears optimal based on current analysis.".to_string());
        }

        recommendations
    }
704
    /// Flags layers whose average forward pass exceeds 100ms as
    /// `ModelComputation` bottlenecks (High severity above 500ms,
    /// Medium otherwise).
    fn analyze_layer_bottlenecks(&mut self) {
        for (layer_name, profile) in &self.layer_profiles {
            if profile.forward_times.is_empty() {
                continue;
            }

            let avg_forward_time =
                profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;

            if avg_forward_time.as_millis() > 100 {
                let mut metrics = HashMap::new();
                metrics.insert(
                    "avg_forward_time_ms".to_string(),
                    avg_forward_time.as_millis() as f64,
                );
                metrics.insert("call_count".to_string(), profile.call_count as f64);

                self.bottlenecks.push(PerformanceBottleneck {
                    bottleneck_type: BottleneckType::ModelComputation,
                    location: layer_name.clone(),
                    severity: if avg_forward_time.as_millis() > 500 {
                        BottleneckSeverity::High
                    } else {
                        BottleneckSeverity::Medium
                    },
                    description: format!(
                        "Layer '{}' has slow forward pass: {:.1}ms average",
                        layer_name,
                        avg_forward_time.as_millis()
                    ),
                    suggestion: "Consider optimizing layer implementation or reducing layer size"
                        .to_string(),
                    metrics,
                });
            }
        }
    }
745
746 fn analyze_memory_bottlenecks(&mut self) {
747 if self.memory_snapshots.len() < 2 {
748 return;
749 }
750
751 let recent_snapshots = if self.memory_snapshots.len() > 10 {
753 &self.memory_snapshots[self.memory_snapshots.len() - 10..]
754 } else {
755 &self.memory_snapshots
756 };
757
758 if recent_snapshots.len() >= 5 {
759 let initial_memory = recent_snapshots[0].heap_allocated;
760 let final_memory = recent_snapshots
761 .last()
762 .expect("recent_snapshots has at least 5 elements")
763 .heap_allocated;
764
765 if final_memory > initial_memory * 2 {
766 let mut metrics = HashMap::new();
767 metrics.insert(
768 "initial_memory_mb".to_string(),
769 initial_memory as f64 / (1024.0 * 1024.0),
770 );
771 metrics.insert(
772 "final_memory_mb".to_string(),
773 final_memory as f64 / (1024.0 * 1024.0),
774 );
775 metrics.insert(
776 "growth_ratio".to_string(),
777 final_memory as f64 / initial_memory as f64,
778 );
779
780 self.bottlenecks.push(PerformanceBottleneck {
781 bottleneck_type: BottleneckType::MemoryBound,
782 location: "Memory Usage".to_string(),
783 severity: BottleneckSeverity::High,
784 description: "Significant memory growth detected during profiling".to_string(),
785 suggestion: "Check for memory leaks or inefficient memory usage patterns"
786 .to_string(),
787 metrics,
788 });
789 }
790 }
791 }
792
793 fn analyze_tensor_bottlenecks(&mut self) {
794 let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
796
797 for event in &self.events {
798 if let ProfileEvent::TensorOperation {
799 operation,
800 duration,
801 ..
802 } = event
803 {
804 operation_groups.entry(operation.clone()).or_default().push(*duration);
805 }
806 }
807
808 for (operation, durations) in operation_groups {
810 if durations.is_empty() {
811 continue;
812 }
813
814 let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
815 let total_time = durations.iter().sum::<Duration>();
816
817 if avg_duration.as_millis() > 10 {
819 let mut metrics = HashMap::new();
820 metrics.insert(
821 "avg_duration_ms".to_string(),
822 avg_duration.as_millis() as f64,
823 );
824 metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
825 metrics.insert("call_count".to_string(), durations.len() as f64);
826
827 self.bottlenecks.push(PerformanceBottleneck {
828 bottleneck_type: BottleneckType::CpuBound,
829 location: format!("Tensor Operation: {}", operation),
830 severity: if avg_duration.as_millis() > 50 {
831 BottleneckSeverity::High
832 } else {
833 BottleneckSeverity::Medium
834 },
835 description: format!(
836 "Tensor operation '{}' is slow: {:.1}ms average",
837 operation,
838 avg_duration.as_millis()
839 ),
840 suggestion:
841 "Consider optimizing tensor operation or using different data types"
842 .to_string(),
843 metrics,
844 });
845 }
846 }
847 }
848
849 fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
850 let mut layer_times: Vec<(String, Duration)> = self
851 .layer_profiles
852 .iter()
853 .map(|(name, profile)| {
854 let avg_time = if profile.forward_times.is_empty() {
855 Duration::ZERO
856 } else {
857 profile.forward_times.iter().sum::<Duration>()
858 / profile.forward_times.len() as u32
859 };
860 (name.clone(), avg_time)
861 })
862 .collect();
863
864 layer_times.sort_by_key(|item| std::cmp::Reverse(item.1));
865 layer_times.truncate(limit);
866 layer_times
867 }
868
    /// Computes peak/min/average heap usage, variance, and an efficiency
    /// score from the snapshot timeline. Returns the default analysis when
    /// no snapshots exist.
    fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
        if self.memory_snapshots.is_empty() {
            return MemoryEfficiencyAnalysis::default();
        }

        let memory_values: Vec<usize> =
            self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();

        let max_memory = memory_values.iter().max().copied().unwrap_or(0);
        let min_memory = memory_values.iter().min().copied().unwrap_or(0);
        // Integer division: the average is truncated to whole bytes.
        let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();

        MemoryEfficiencyAnalysis {
            peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
            min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
            avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
            memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
            efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
        }
    }

    /// Sample variance (Bessel-corrected, divides by n - 1) of the memory
    /// values around `mean`. Returns 0.0 for fewer than two samples.
    fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
        if values.len() < 2 {
            return 0.0;
        }

        let variance_sum: f64 = values
            .iter()
            .map(|&x| {
                let diff = x as f64 - mean as f64;
                diff * diff
            })
            .sum();

        variance_sum / (values.len() - 1) as f64
    }
905
906 fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
907 if values.is_empty() {
908 return 0.0;
909 }
910
911 let max_memory = values.iter().max().copied().unwrap_or(0);
912 let min_memory = values.iter().min().copied().unwrap_or(0);
913
914 if max_memory == 0 {
915 return 100.0;
916 }
917
918 100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
920 }
921
    /// Builds recommendation strings for the basic report from the
    /// detected bottlenecks, event volume, and layer statistics.
    /// Falls back to a single "appears optimal" message when nothing fires.
    fn generate_performance_recommendations(&self) -> Vec<String> {
        let mut recommendations = Vec::new();

        // One recommendation per bottleneck; duplicates are possible when
        // several bottlenecks share a type.
        for bottleneck in &self.bottlenecks {
            match bottleneck.bottleneck_type {
                BottleneckType::ModelComputation => {
                    recommendations.push(
                        "Consider model architecture optimizations or layer fusion".to_string(),
                    );
                },
                BottleneckType::MemoryBound => {
                    recommendations.push(
                        "Optimize memory usage with gradient checkpointing or model parallelism"
                            .to_string(),
                    );
                },
                BottleneckType::CpuBound => {
                    recommendations.push(
                        "Consider GPU acceleration or optimized CPU implementations".to_string(),
                    );
                },
                _ => {},
            }
        }

        if self.events.len() > 10000 {
            recommendations.push(
                "High number of profiling events - consider reducing profiling overhead"
                    .to_string(),
            );
        }

        let stats = self.get_statistics();
        if let Some(layer_stats) = stats.get("LayerExecution") {
            if layer_stats.avg_duration.as_millis() > 50 {
                recommendations.push(
                    "Average layer execution time is high - consider layer optimization"
                        .to_string(),
                );
            }
        }

        if recommendations.is_empty() {
            recommendations
                .push("Performance appears optimal based on current profiling data".to_string());
        }

        recommendations
    }
973
    /// Builds the enhanced report: the basic report plus GPU, memory and
    /// I/O summaries and the combined performance analysis.
    pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
        let basic_report = self.generate_report().await?;
        let performance_analysis = self.get_performance_analysis();

        let gpu_kernel_summary = self.generate_gpu_kernel_summary();
        let memory_allocation_summary = self.generate_memory_allocation_summary();
        let io_performance_summary = self.generate_io_performance_summary();

        Ok(EnhancedProfilerReport {
            basic_report,
            performance_analysis,
            gpu_kernel_summary,
            memory_allocation_summary,
            io_performance_summary,
        })
    }

    /// Summarizes recorded GPU kernels: totals, average occupancy and
    /// compute utilization, and the names of the five slowest kernels.
    fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
        let total_kernels = self.gpu_kernel_profiles.len();
        let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();

        let avg_occupancy = if total_kernels > 0 {
            self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
        } else {
            0.0
        };

        let avg_compute_utilization = if total_kernels > 0 {
            self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
                / total_kernels as f64
        } else {
            0.0
        };

        // Rank kernels by execution time, slowest first.
        let mut kernels_by_time: Vec<_> = self
            .gpu_kernel_profiles
            .iter()
            .map(|k| (k.kernel_name.clone(), k.execution_time))
            .collect();
        kernels_by_time.sort_by_key(|item| std::cmp::Reverse(item.1));

        let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();

        GpuKernelSummary {
            total_kernels,
            total_execution_time,
            avg_occupancy,
            avg_compute_utilization,
            slowest_kernels,
        }
    }

    /// Summarizes tracked allocations.
    ///
    /// NOTE(review): `peak_memory_usage` is the size of the single largest
    /// allocation, not the aggregate peak — confirm the field's intent.
    /// "Leaks" here are simply allocations never marked freed.
    fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
        let total_allocations = self.memory_allocations.len();
        let peak_memory_usage =
            self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);

        let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
            stats.memory_efficiency
        } else {
            1.0
        };

        // Rank allocations by size, largest first.
        let mut allocations_by_size: Vec<_> = self
            .memory_allocations
            .values()
            .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
            .collect();
        allocations_by_size.sort_by_key(|item| std::cmp::Reverse(item.1));

        let largest_allocations =
            allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();

        let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();

        MemoryAllocationSummary {
            total_allocations,
            peak_memory_usage,
            memory_efficiency,
            largest_allocations,
            memory_leaks,
        }
    }

    /// Summarizes recorded I/O: totals, per-device average bandwidth, and
    /// descriptions of the five slowest operations.
    fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
        let total_operations = self.io_profiles.len();
        let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();

        let avg_bandwidth_by_device = self.get_io_bandwidth_stats();

        // Rank operations by duration, slowest first.
        let mut operations_by_duration: Vec<_> = self
            .io_profiles
            .iter()
            .map(|io| {
                (
                    format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
                    io.duration,
                )
            })
            .collect();
        operations_by_duration.sort_by_key(|item| std::cmp::Reverse(item.1));

        let slowest_operations =
            operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();

        IoPerformanceSummary {
            total_operations,
            total_bytes_transferred,
            avg_bandwidth_by_device,
            slowest_operations,
        }
    }
1087}
1088
/// RAII guard that times a scope: starts the named timer on construction
/// and ends it (recording a `FunctionCall` event) when dropped.
pub struct ScopedTimer<'a> {
    profiler: &'a mut Profiler,
    name: String,
}

impl<'a> ScopedTimer<'a> {
    /// Starts the named timer immediately; it stops when this guard drops.
    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
        profiler.start_timer(&name);
        Self { profiler, name }
    }
}

impl<'a> Drop for ScopedTimer<'a> {
    fn drop(&mut self) {
        self.profiler.end_timer(&self.name);
    }
}
1107
/// Times the remainder of the enclosing scope under `$name` using a
/// [`ScopedTimer`] bound to a hidden local.
///
/// NOTE(review): `#[macro_export]` macros expand at the caller's location,
/// but `ScopedTimer` is referenced unqualified here — callers in other
/// modules must have `ScopedTimer` in scope, or this should use a
/// `$crate::...` path. Confirm the intended call sites.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}
1115
1116#[cfg(test)]
1117mod tests {
1118 use super::*;
1119
1120 fn make_config() -> DebugConfig {
1121 DebugConfig::default()
1122 }
1123
1124 #[test]
1127 fn test_profiler_new() {
1128 let config = make_config();
1129 let profiler = Profiler::new(&config);
1130 assert!(profiler.events.is_empty());
1131 assert!(profiler.active_timers.is_empty());
1132 assert!(profiler.start_time.is_none());
1133 }
1134
1135 #[test]
1136 fn test_profiler_start_end_timer() {
1137 let config = make_config();
1138 let mut profiler = Profiler::new(&config);
1139 profiler.start_timer("test_op");
1140 let duration = profiler.end_timer("test_op");
1141 assert!(duration.is_some());
1142 assert_eq!(profiler.events.len(), 1);
1143 }
1144
1145 #[test]
1146 fn test_profiler_end_timer_not_started() {
1147 let config = make_config();
1148 let mut profiler = Profiler::new(&config);
1149 let duration = profiler.end_timer("nonexistent");
1150 assert!(duration.is_none());
1151 }
1152
1153 #[test]
1154 fn test_profiler_record_layer_execution() {
1155 let config = make_config();
1156 let mut profiler = Profiler::new(&config);
1157 profiler.record_layer_execution(
1158 "attention",
1159 "self_attention",
1160 Duration::from_millis(50),
1161 Some(Duration::from_millis(30)),
1162 1024,
1163 1000,
1164 );
1165 assert_eq!(profiler.events.len(), 1);
1166 let profiles = profiler.get_layer_profiles();
1167 assert!(profiles.contains_key("attention"));
1168 let lp = &profiles["attention"];
1169 assert_eq!(lp.call_count(), 1);
1170 assert_eq!(lp.forward_times().len(), 1);
1171 assert_eq!(lp.backward_times().len(), 1);
1172 }
1173
1174 #[test]
1175 fn test_profiler_record_tensor_operation() {
1176 let config = make_config();
1177 let mut profiler = Profiler::new(&config);
1178 profiler.record_tensor_operation("matmul", &[64, 128], Duration::from_micros(200), 8192);
1179 assert_eq!(profiler.events.len(), 1);
1180 }
1181
1182 #[test]
1183 fn test_profiler_record_model_inference() {
1184 let config = make_config();
1185 let mut profiler = Profiler::new(&config);
1186 profiler.record_model_inference(32, 512, Duration::from_millis(100));
1187 assert_eq!(profiler.events.len(), 1);
1188 }
1189
1190 #[test]
1191 fn test_profiler_record_gradient_computation() {
1192 let config = make_config();
1193 let mut profiler = Profiler::new(&config);
1194 profiler.record_gradient_computation("fc1", 0.5, Duration::from_millis(10));
1195 assert_eq!(profiler.events.len(), 1);
1196 }
1197
1198 #[test]
1199 fn test_profiler_get_statistics_empty() {
1200 let config = make_config();
1201 let profiler = Profiler::new(&config);
1202 let stats = profiler.get_statistics();
1203 assert!(stats.is_empty());
1204 }
1205
1206 #[test]
1207 fn test_profiler_get_statistics_with_events() {
1208 let config = make_config();
1209 let mut profiler = Profiler::new(&config);
1210 profiler.record_model_inference(8, 256, Duration::from_millis(50));
1211 profiler.record_model_inference(8, 256, Duration::from_millis(100));
1212 let stats = profiler.get_statistics();
1213 assert!(stats.contains_key("ModelInference"));
1214 let mi_stats = &stats["ModelInference"];
1215 assert_eq!(mi_stats.count, 2);
1216 }
1217
1218 #[test]
1219 fn test_profiler_clear() {
1220 let config = make_config();
1221 let mut profiler = Profiler::new(&config);
1222 profiler.start_timer("op1");
1223 profiler.end_timer("op1");
1224 profiler.take_memory_snapshot();
1225 profiler.clear();
1226 assert!(profiler.events.is_empty());
1227 assert!(profiler.active_timers.is_empty());
1228 assert!(profiler.memory_snapshots.is_empty());
1229 assert!(profiler.start_time.is_none());
1230 }
1231
1232 #[test]
1233 fn test_profiler_take_memory_snapshot() {
1234 let config = make_config();
1235 let mut profiler = Profiler::new(&config);
1236 profiler.take_memory_snapshot();
1237 assert_eq!(profiler.get_memory_timeline().len(), 1);
1238 }
1239
1240 #[test]
1241 fn test_profiler_memory_snapshot_limit() {
1242 let config = make_config();
1243 let mut profiler = Profiler::new(&config);
1244 for _ in 0..1100 {
1245 profiler.take_memory_snapshot();
1246 }
1247 assert!(profiler.get_memory_timeline().len() <= 601);
1249 }
1250
1251 #[test]
1252 fn test_profiler_analyze_performance_empty() {
1253 let config = make_config();
1254 let mut profiler = Profiler::new(&config);
1255 let bottlenecks = profiler.analyze_performance();
1256 assert!(bottlenecks.is_empty());
1257 }
1258
1259 #[test]
1260 fn test_profiler_analyze_performance_slow_layer() {
1261 let config = make_config();
1262 let mut profiler = Profiler::new(&config);
1263 for _ in 0..5 {
1264 profiler.record_layer_execution(
1265 "slow_layer",
1266 "dense",
1267 Duration::from_millis(600),
1268 None,
1269 4096,
1270 10000,
1271 );
1272 }
1273 let bottlenecks = profiler.analyze_performance();
1274 assert!(!bottlenecks.is_empty());
1275 }
1276
1277 #[test]
1278 fn test_profiler_get_slowest_layers() {
1279 let config = make_config();
1280 let mut profiler = Profiler::new(&config);
1281 profiler.record_layer_execution(
1282 "fast_layer",
1283 "relu",
1284 Duration::from_millis(1),
1285 None,
1286 128,
1287 0,
1288 );
1289 profiler.record_layer_execution(
1290 "slow_layer",
1291 "dense",
1292 Duration::from_millis(200),
1293 None,
1294 4096,
1295 10000,
1296 );
1297 let slowest = profiler.get_slowest_layers(2);
1298 assert_eq!(slowest.len(), 2);
1299 assert_eq!(slowest[0].0, "slow_layer");
1300 }
1301
1302 #[test]
1303 fn test_profiler_memory_efficiency_empty() {
1304 let config = make_config();
1305 let profiler = Profiler::new(&config);
1306 let analysis = profiler.analyze_memory_efficiency();
1307 assert!((analysis.efficiency_score - 100.0).abs() < 1e-9);
1308 }
1309
1310 #[test]
1311 fn test_profiler_calculate_memory_variance() {
1312 let config = make_config();
1313 let profiler = Profiler::new(&config);
1314 let values = vec![100, 200, 300];
1315 let mean = 200;
1316 let variance = profiler.calculate_memory_variance(&values, mean);
1317 assert!((variance - 10000.0).abs() < 1e-3);
1319 }
1320
1321 #[test]
1322 fn test_profiler_calculate_memory_efficiency_score_empty() {
1323 let config = make_config();
1324 let profiler = Profiler::new(&config);
1325 let score = profiler.calculate_memory_efficiency_score(&[]);
1326 assert!((score - 0.0).abs() < 1e-9);
1327 }
1328
1329 #[test]
1330 fn test_profiler_calculate_memory_efficiency_score_stable() {
1331 let config = make_config();
1332 let profiler = Profiler::new(&config);
1333 let values = vec![100, 100, 100];
1334 let score = profiler.calculate_memory_efficiency_score(&values);
1335 assert!((score - 100.0).abs() < 1e-9);
1336 }
1337
1338 #[test]
1339 fn test_profiler_calculate_memory_efficiency_score_varied() {
1340 let config = make_config();
1341 let profiler = Profiler::new(&config);
1342 let values = vec![50, 100];
1343 let score = profiler.calculate_memory_efficiency_score(&values);
1344 assert!((score - 50.0).abs() < 1e-9);
1346 }
1347
1348 #[test]
1349 fn test_profiler_overall_performance_score_no_bottlenecks() {
1350 let config = make_config();
1351 let profiler = Profiler::new(&config);
1352 let score = profiler.calculate_overall_performance_score();
1353 assert!(score >= 50.0);
1355 assert!(score <= 100.0);
1356 }
1357
1358 #[test]
1359 fn test_profiler_identify_layer_bottleneck_memory() {
1360 let config = make_config();
1361 let profiler = Profiler::new(&config);
1362 let profile = LayerLatencyProfile {
1363 layer_name: "test".to_string(),
1364 layer_type: "dense".to_string(),
1365 input_shapes: vec![vec![32, 128]],
1366 output_shapes: vec![vec![32, 256]],
1367 cpu_time: Duration::from_millis(10),
1368 gpu_time: Duration::from_millis(10),
1369 memory_copy_time: Duration::from_millis(100),
1370 sync_time: Duration::from_millis(5),
1371 parameter_count: 1000,
1372 flops: 100000,
1373 memory_footprint_bytes: 4096,
1374 cache_hit_rate: 0.5,
1375 };
1376 let bottleneck = profiler.identify_layer_bottleneck(&profile);
1377 assert_eq!(bottleneck, "Memory Bandwidth");
1378 }
1379
1380 #[test]
1381 fn test_profiler_identify_layer_bottleneck_sync() {
1382 let config = make_config();
1383 let profiler = Profiler::new(&config);
1384 let profile = LayerLatencyProfile {
1385 layer_name: "test".to_string(),
1386 layer_type: "dense".to_string(),
1387 input_shapes: vec![],
1388 output_shapes: vec![],
1389 cpu_time: Duration::from_millis(10),
1390 gpu_time: Duration::from_millis(10),
1391 memory_copy_time: Duration::from_millis(5),
1392 sync_time: Duration::from_millis(50),
1393 parameter_count: 0,
1394 flops: 0,
1395 memory_footprint_bytes: 0,
1396 cache_hit_rate: 0.0,
1397 };
1398 let bottleneck = profiler.identify_layer_bottleneck(&profile);
1399 assert_eq!(bottleneck, "GPU Synchronization");
1400 }
1401
1402 #[test]
1403 fn test_profiler_io_bandwidth_stats_empty() {
1404 let config = make_config();
1405 let profiler = Profiler::new(&config);
1406 let stats = profiler.get_io_bandwidth_stats();
1407 assert_eq!(stats.len(), 5);
1408 for &val in stats.values() {
1409 assert!((val - 0.0).abs() < 1e-9);
1410 }
1411 }
1412
1413 #[test]
1414 fn test_profiler_track_memory_allocation_and_deallocation() {
1415 let config = make_config();
1416 let mut profiler = Profiler::new(&config);
1417 let alloc_id = profiler.track_memory_allocation(
1418 4096,
1419 MemoryAllocationType::Host,
1420 None,
1421 vec!["test_frame".to_string()],
1422 );
1423 assert!(profiler.memory_allocations.contains_key(&alloc_id));
1424 profiler.track_memory_deallocation(alloc_id);
1425 let alloc = profiler.memory_allocations.get(&alloc_id);
1426 assert!(alloc.is_some());
1427 assert!(alloc.expect("allocation should exist").freed);
1428 }
1429
1430 #[test]
1431 fn test_profiler_gpu_kernel_summary_empty() {
1432 let config = make_config();
1433 let profiler = Profiler::new(&config);
1434 let summary = profiler.generate_gpu_kernel_summary();
1435 assert_eq!(summary.total_kernels, 0);
1436 assert!((summary.avg_occupancy - 0.0).abs() < 1e-9);
1437 }
1438
1439 #[test]
1440 fn test_profiler_memory_allocation_summary() {
1441 let config = make_config();
1442 let mut profiler = Profiler::new(&config);
1443 let _id = profiler.track_memory_allocation(
1444 1024,
1445 MemoryAllocationType::Device,
1446 Some(0),
1447 Vec::new(),
1448 );
1449 let summary = profiler.generate_memory_allocation_summary();
1450 assert_eq!(summary.total_allocations, 1);
1451 assert_eq!(summary.peak_memory_usage, 1024);
1452 assert_eq!(summary.memory_leaks, 1);
1453 }
1454
1455 #[test]
1456 fn test_profiler_io_performance_summary_empty() {
1457 let config = make_config();
1458 let profiler = Profiler::new(&config);
1459 let summary = profiler.generate_io_performance_summary();
1460 assert_eq!(summary.total_operations, 0);
1461 assert_eq!(summary.total_bytes_transferred, 0);
1462 }
1463
1464 #[test]
1465 fn test_profiler_analyze_cpu_bottlenecks() {
1466 let config = make_config();
1467 let mut profiler = Profiler::new(&config);
1468 let result = profiler.analyze_cpu_bottlenecks();
1469 assert!(!result.is_empty());
1470 assert_eq!(result[0].hot_functions.len(), 2);
1471 }
1472
1473 #[test]
1474 fn test_profiler_performance_analysis() {
1475 let config = make_config();
1476 let profiler = Profiler::new(&config);
1477 let analysis = profiler.get_performance_analysis();
1478 assert!(analysis.performance_score > 0.0);
1479 assert!(!analysis.recommendations.is_empty());
1480 }
1481
1482 #[test]
1483 fn test_profiler_generate_performance_recommendations_optimal() {
1484 let config = make_config();
1485 let profiler = Profiler::new(&config);
1486 let recs = profiler.generate_performance_recommendations();
1487 assert!(!recs.is_empty());
1488 assert!(recs[0].contains("optimal"));
1489 }
1490
1491 #[test]
1492 fn test_layer_profile_accessors() {
1493 let config = make_config();
1494 let mut profiler = Profiler::new(&config);
1495 profiler.record_layer_execution(
1496 "layer1",
1497 "conv",
1498 Duration::from_millis(10),
1499 Some(Duration::from_millis(5)),
1500 512,
1501 100,
1502 );
1503 let profiles = profiler.get_layer_profiles();
1504 let lp = &profiles["layer1"];
1505 assert_eq!(lp.forward_times().len(), 1);
1506 assert_eq!(lp.backward_times().len(), 1);
1507 assert_eq!(lp.memory_usage(), &vec![512]);
1508 assert_eq!(lp.call_count(), 1);
1509 }
1510}