1use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant, SystemTime};
8use uuid::Uuid;
9
10use crate::DebugConfig;
11
/// A single recorded profiling event; one variant per measured activity.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProfileEvent {
    /// A timed call bracketed by `start_timer`/`end_timer`.
    FunctionCall {
        function_name: String,
        duration: Duration,
        // Net memory change over the call; `end_timer` currently records 0.
        memory_delta: i64,
    },
    /// One forward (and optional backward) execution of a model layer.
    LayerExecution {
        layer_name: String,
        layer_type: String,
        forward_time: Duration,
        // `None` for inference-only passes with no backward step.
        backward_time: Option<Duration>,
        memory_usage: usize,
        parameter_count: usize,
    },
    /// A single tensor-level operation (e.g. a matmul).
    TensorOperation {
        operation: String,
        tensor_shape: Vec<usize>,
        duration: Duration,
        memory_allocated: usize,
    },
    /// One model inference pass with derived throughput.
    ModelInference {
        batch_size: usize,
        sequence_length: usize,
        duration: Duration,
        // batch_size * sequence_length / duration, computed at record time.
        tokens_per_second: f64,
    },
    /// Gradient computation for one layer.
    GradientComputation {
        layer_name: String,
        gradient_norm: f64,
        duration: Duration,
    },
}
46
/// Aggregated duration/memory statistics for one event type,
/// produced by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileStats {
    /// Event variant name these stats cover (e.g. "FunctionCall").
    pub event_type: String,
    /// Number of events aggregated.
    pub count: usize,
    pub total_duration: Duration,
    pub avg_duration: Duration,
    pub min_duration: Duration,
    pub max_duration: Duration,
    /// Signed because function calls report memory *deltas*.
    pub total_memory: i64,
    pub avg_memory: f64,
}
59
/// Point-in-time view of process memory usage.
///
/// NOTE(review): `take_memory_snapshot` currently fills every size field
/// with placeholder zeros/`None` — real introspection is not wired up yet.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub heap_allocated: usize,
    pub heap_used: usize,
    pub stack_size: usize,
    /// GPU figures are `None` when no GPU is tracked.
    pub gpu_allocated: Option<usize>,
    pub gpu_used: Option<usize>,
}
70
/// A detected performance problem together with a suggested remediation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    pub bottleneck_type: BottleneckType,
    /// Where the problem was observed (layer name, operation, subsystem).
    pub location: String,
    pub severity: BottleneckSeverity,
    pub description: String,
    /// Human-readable hint on how to fix the problem.
    pub suggestion: String,
    /// Supporting numeric evidence, keyed by metric name.
    pub metrics: HashMap<String, f64>,
}
81
/// Broad category of the resource or pipeline stage limiting performance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CpuBound,
    MemoryBound,
    IoBound,
    GpuBound,
    NetworkBound,
    DataLoading,
    ModelComputation,
    GradientComputation,
}
93
/// Impact level of a bottleneck; used to weight the overall
/// performance score (Critical -20, High -10, Medium -5, Low -2).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
101
/// Node in a CPU call-tree profile.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuProfile {
    pub function_name: String,
    /// Time spent in this function, excluding callees.
    pub self_time: Duration,
    /// Time spent in this function including callees.
    pub total_time: Duration,
    pub call_count: usize,
    /// Profiles of the functions this one called.
    pub children: Vec<CpuProfile>,
}
111
/// Execution record for a single GPU kernel launch.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
    pub kernel_name: String,
    /// Launch grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Thread-block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    pub shared_memory_bytes: usize,
    pub registers_per_thread: u32,
    /// Achieved occupancy — assumed to be a 0..=1 fraction; TODO confirm.
    pub occupancy: f64,
    pub execution_time: Duration,
    pub memory_bandwidth_gb_s: f64,
    /// Averaged by `GpuProfiler::get_gpu_utilization`.
    pub compute_utilization: f64,
    /// Stream the kernel ran on; keys `GpuProfiler::active_streams`.
    pub stream_id: i32,
}
126
/// Record of one tracked memory allocation and, once freed, its release.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    pub allocation_id: Uuid,
    pub size_bytes: usize,
    pub allocation_type: MemoryAllocationType,
    /// `None` for host-side allocations with no device association.
    pub device_id: Option<i32>,
    pub timestamp: SystemTime,
    pub stack_trace: Vec<String>,
    /// Set to true by the trackers when the allocation is released.
    pub freed: bool,
    pub free_timestamp: Option<SystemTime>,
}
139
/// Memory space / allocation strategy of a tracked allocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MemoryAllocationType {
    Host,
    Device,
    Unified,
    Pinned,
    Mapped,
}
148
/// Detailed latency/throughput breakdown for a single model layer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyProfile {
    pub layer_name: String,
    pub layer_type: String,
    pub input_shapes: Vec<Vec<usize>>,
    pub output_shapes: Vec<Vec<usize>>,
    // The four time components below are summed to form the layer's
    // total time in `get_layer_latency_analysis`.
    pub cpu_time: Duration,
    pub gpu_time: Duration,
    pub memory_copy_time: Duration,
    pub sync_time: Duration,
    pub parameter_count: usize,
    /// Floating-point operations performed; divided by gpu_time for FLOP/s.
    pub flops: u64,
    pub memory_footprint_bytes: usize,
    /// Reported as `memory_bandwidth_utilization` in the analysis output.
    pub cache_hit_rate: f64,
}
165
/// Completed-I/O measurement produced by `IoMonitor::finish_io_operation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoProfile {
    pub operation_type: IoOperationType,
    /// Currently always `None`; path capture is not implemented.
    pub file_path: Option<String>,
    pub bytes_transferred: usize,
    pub duration: Duration,
    pub bandwidth_mb_s: f64,
    /// Estimated from queue depth (10ms per queued op), not measured.
    pub queue_time: Duration,
    pub device_type: IoDeviceType,
}
177
/// Kind of I/O operation being profiled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IoOperationType {
    FileRead,
    FileWrite,
    NetworkRead,
    NetworkWrite,
    DatabaseQuery,
    CacheLoad,
    CacheStore,
}
188
/// Backing device class for an I/O operation; hashable so it can key
/// the bandwidth-statistics map.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum IoDeviceType {
    SSD,
    HDD,
    Network,
    Memory,
    Cache,
}
197
/// Per-thread CPU bottleneck report.
///
/// NOTE(review): currently populated with hard-coded sample values in
/// `Profiler::analyze_cpu_bottlenecks` — not real measurements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuBottleneckAnalysis {
    pub thread_id: u64,
    pub cpu_usage: f64,
    pub context_switches: u64,
    pub cache_misses: u64,
    pub instructions_per_cycle: f64,
    pub branch_mispredictions: u64,
    /// Functions dominating self-time on this thread.
    pub hot_functions: Vec<HotFunction>,
    pub bottleneck_score: f64,
}
210
/// A function that dominates CPU self-time in a bottleneck analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HotFunction {
    pub function_name: String,
    /// Share of total self-time attributed to this function (percent).
    pub self_time_percentage: f64,
    pub call_count: usize,
    pub avg_time_per_call: Duration,
}
218
/// Running ledger of live allocations with cumulative counters.
#[derive(Debug)]
pub struct MemoryTracker {
    /// Live (not yet freed) allocations keyed by id.
    allocations: HashMap<Uuid, MemoryAllocation>,
    /// Bytes currently allocated (sum of live allocation sizes).
    total_allocated: usize,
    /// High-water mark of `total_allocated`.
    peak_allocated: usize,
    allocation_count: usize,
    deallocation_count: usize,
}
228
impl Default for MemoryTracker {
    /// Equivalent to `MemoryTracker::new()`.
    fn default() -> Self {
        Self::new()
    }
}
234
235impl MemoryTracker {
236 pub fn new() -> Self {
237 Self {
238 allocations: HashMap::new(),
239 total_allocated: 0,
240 peak_allocated: 0,
241 allocation_count: 0,
242 deallocation_count: 0,
243 }
244 }
245
246 pub fn track_allocation(&mut self, allocation: MemoryAllocation) {
247 self.total_allocated += allocation.size_bytes;
248 self.allocation_count += 1;
249
250 if self.total_allocated > self.peak_allocated {
251 self.peak_allocated = self.total_allocated;
252 }
253
254 self.allocations.insert(allocation.allocation_id, allocation);
255 }
256
257 pub fn track_deallocation(&mut self, allocation_id: Uuid) {
258 if let Some(mut allocation) = self.allocations.remove(&allocation_id) {
259 allocation.freed = true;
260 allocation.free_timestamp = Some(SystemTime::now());
261 self.total_allocated = self.total_allocated.saturating_sub(allocation.size_bytes);
262 self.deallocation_count += 1;
263 }
264 }
265
266 pub fn get_memory_stats(&self) -> MemoryStats {
267 MemoryStats {
268 total_allocated: self.total_allocated,
269 peak_allocated: self.peak_allocated,
270 active_allocations: self.allocations.len(),
271 allocation_count: self.allocation_count,
272 deallocation_count: self.deallocation_count,
273 memory_efficiency: if self.allocation_count > 0 {
274 self.deallocation_count as f64 / self.allocation_count as f64
275 } else {
276 1.0
277 },
278 }
279 }
280}
281
/// Summary counters exported by `MemoryTracker::get_memory_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryStats {
    pub total_allocated: usize,
    pub peak_allocated: usize,
    /// Allocations tracked but not yet freed.
    pub active_allocations: usize,
    pub allocation_count: usize,
    pub deallocation_count: usize,
    /// deallocation_count / allocation_count (1.0 when nothing allocated).
    pub memory_efficiency: f64,
}
291
/// Collects per-stream GPU kernel profiles and per-device memory pools.
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
    #[allow(dead_code)]
    // Hard-coded to 1 in `new()`; device enumeration is not implemented.
    device_count: i32,
    /// Kernel records grouped by stream id.
    active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
    memory_pools: HashMap<i32, GpuMemoryPool>,
}
301
/// Per-device GPU memory pool accounting.
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
    #[allow(dead_code)]
    device_id: i32,
    total_memory: usize,
    free_memory: usize,
    /// Presumably a 0..=1 fragmentation metric — TODO confirm semantics.
    fragmentation_score: f64,
}
311
312impl GpuProfiler {
313 pub fn new() -> Result<Self> {
314 Ok(Self {
316 device_count: 1, active_streams: HashMap::new(),
318 memory_pools: HashMap::new(),
319 })
320 }
321
322 pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
323 self.active_streams
324 .entry(kernel_profile.stream_id)
325 .or_default()
326 .push(kernel_profile);
327 }
328
329 pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
330 if let Some(kernels) = self.active_streams.get(&device_id) {
332 if kernels.is_empty() {
333 0.0
334 } else {
335 kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
336 }
337 } else {
338 0.0
339 }
340 }
341}
342
/// Tracks in-flight I/O operations and a rolling bandwidth history.
#[derive(Debug)]
pub struct IoMonitor {
    /// Operations started but not yet finished, keyed by id.
    active_operations: HashMap<Uuid, IoOperation>,
    /// Bounded at ~1000 samples; oldest half dropped when exceeded.
    bandwidth_history: Vec<BandwidthSample>,
    /// Count of currently in-flight operations.
    io_queue_depth: usize,
}
/// An I/O operation that has been started but not yet completed.
#[allow(dead_code)]
#[derive(Debug)]
pub struct IoOperation {
    #[allow(dead_code)]
    operation_id: Uuid,
    start_time: Instant,
    operation_type: IoOperationType,
    /// Expected transfer size; not compared against the actual bytes.
    bytes_expected: usize,
}
359
/// One bandwidth measurement recorded when an I/O operation completes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BandwidthSample {
    pub timestamp: SystemTime,
    pub bandwidth_mb_s: f64,
    pub device_type: IoDeviceType,
}
366
impl Default for IoMonitor {
    /// Equivalent to `IoMonitor::new()`.
    fn default() -> Self {
        Self::new()
    }
}
372
373impl IoMonitor {
374 pub fn new() -> Self {
375 Self {
376 active_operations: HashMap::new(),
377 bandwidth_history: Vec::new(),
378 io_queue_depth: 0,
379 }
380 }
381
382 pub fn start_io_operation(
383 &mut self,
384 operation_type: IoOperationType,
385 bytes_expected: usize,
386 ) -> Uuid {
387 let operation_id = Uuid::new_v4();
388 let operation = IoOperation {
389 operation_id,
390 start_time: Instant::now(),
391 operation_type,
392 bytes_expected,
393 };
394
395 self.active_operations.insert(operation_id, operation);
396 self.io_queue_depth += 1;
397 operation_id
398 }
399
400 pub fn finish_io_operation(
401 &mut self,
402 operation_id: Uuid,
403 bytes_transferred: usize,
404 ) -> Option<IoProfile> {
405 if let Some(operation) = self.active_operations.remove(&operation_id) {
406 let duration = operation.start_time.elapsed();
407 let bandwidth_mb_s = if duration.as_secs_f64() > 0.0 {
408 bytes_transferred as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()
409 } else {
410 0.0
411 };
412
413 self.io_queue_depth = self.io_queue_depth.saturating_sub(1);
414
415 let device_type = match operation.operation_type {
416 IoOperationType::FileRead | IoOperationType::FileWrite => IoDeviceType::SSD,
417 IoOperationType::NetworkRead | IoOperationType::NetworkWrite => {
418 IoDeviceType::Network
419 },
420 IoOperationType::CacheLoad | IoOperationType::CacheStore => IoDeviceType::Cache,
421 _ => IoDeviceType::Memory,
422 };
423
424 self.bandwidth_history.push(BandwidthSample {
426 timestamp: SystemTime::now(),
427 bandwidth_mb_s,
428 device_type: device_type.clone(),
429 });
430
431 if self.bandwidth_history.len() > 1000 {
433 self.bandwidth_history.drain(0..500);
434 }
435
436 Some(IoProfile {
437 operation_type: operation.operation_type,
438 file_path: None, bytes_transferred,
440 duration,
441 bandwidth_mb_s,
442 queue_time: Duration::from_millis(self.io_queue_depth as u64 * 10), device_type,
444 })
445 } else {
446 None
447 }
448 }
449
450 pub fn get_average_bandwidth(&self, device_type: &IoDeviceType) -> f64 {
451 let samples: Vec<f64> = self
452 .bandwidth_history
453 .iter()
454 .filter(|s| s.device_type == *device_type)
455 .map(|s| s.bandwidth_mb_s)
456 .collect();
457
458 if samples.is_empty() {
459 0.0
460 } else {
461 samples.iter().sum::<f64>() / samples.len() as f64
462 }
463 }
464}
465
/// Central performance profiler: collects events, per-layer profiles,
/// memory/GPU/I/O measurements, and derives bottleneck analyses.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    /// Flat event log in insertion order.
    events: Vec<ProfileEvent>,
    /// Timers opened by `start_timer`, keyed by name.
    active_timers: HashMap<String, Instant>,
    memory_snapshots: Vec<MemorySnapshot>,
    /// Set by `start()`; `None` until a session begins.
    start_time: Option<Instant>,
    layer_profiles: HashMap<String, LayerProfile>,
    /// Findings from the most recent `analyze_performance` run.
    bottlenecks: Vec<PerformanceBottleneck>,
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    /// Shared so allocation callbacks elsewhere could update it too.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    /// `None` when GPU profiling could not be initialized.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
487
/// Aggregate timing/memory samples for one layer, accumulated by
/// `Profiler::record_layer_execution`.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    forward_times: Vec<Duration>,
    /// Only populated for calls that included a backward pass.
    backward_times: Vec<Duration>,
    memory_usage: Vec<usize>,
    call_count: usize,
}
497
impl LayerProfile {
    /// Recorded forward-pass durations, one per call.
    pub fn forward_times(&self) -> &Vec<Duration> {
        &self.forward_times
    }

    /// Recorded backward-pass durations (only calls that had one).
    pub fn backward_times(&self) -> &Vec<Duration> {
        &self.backward_times
    }

    /// Memory usage recorded per call.
    pub fn memory_usage(&self) -> &Vec<usize> {
        &self.memory_usage
    }

    /// Number of forward passes recorded for this layer.
    pub fn call_count(&self) -> usize {
        self.call_count
    }
}
519
520impl Profiler {
    /// Creates a profiler from the given debug configuration.
    /// All collections start empty; call `start()` to begin a session.
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            // GPU profiling is optional; a failed init degrades to None.
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
542
    /// Begins a profiling session: records the start instant and takes an
    /// initial (placeholder) memory snapshot.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }
550
    /// All events recorded so far, in insertion order.
    pub fn get_events(&self) -> &Vec<ProfileEvent> {
        &self.events
    }
555
    /// Starts (or restarts, overwriting) a named wall-clock timer.
    pub fn start_timer(&mut self, name: &str) {
        self.active_timers.insert(name.to_string(), Instant::now());
    }
560
561 pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
563 if let Some(start_time) = self.active_timers.remove(name) {
564 let duration = start_time.elapsed();
565
566 self.events.push(ProfileEvent::FunctionCall {
568 function_name: name.to_string(),
569 duration,
570 memory_delta: 0, });
572
573 Some(duration)
574 } else {
575 tracing::warn!("Timer '{}' was not started", name);
576 None
577 }
578 }
579
580 pub fn record_layer_execution(
582 &mut self,
583 layer_name: &str,
584 layer_type: &str,
585 forward_time: Duration,
586 backward_time: Option<Duration>,
587 memory_usage: usize,
588 parameter_count: usize,
589 ) {
590 self.events.push(ProfileEvent::LayerExecution {
592 layer_name: layer_name.to_string(),
593 layer_type: layer_type.to_string(),
594 forward_time,
595 backward_time,
596 memory_usage,
597 parameter_count,
598 });
599
600 let profile =
602 self.layer_profiles
603 .entry(layer_name.to_string())
604 .or_insert_with(|| LayerProfile {
605 layer_name: layer_name.to_string(),
606 forward_times: Vec::new(),
607 backward_times: Vec::new(),
608 memory_usage: Vec::new(),
609 call_count: 0,
610 });
611
612 profile.forward_times.push(forward_time);
613 if let Some(backward) = backward_time {
614 profile.backward_times.push(backward);
615 }
616 profile.memory_usage.push(memory_usage);
617 profile.call_count += 1;
618 }
619
620 pub fn record_tensor_operation(
622 &mut self,
623 operation: &str,
624 tensor_shape: &[usize],
625 duration: Duration,
626 memory_allocated: usize,
627 ) {
628 self.events.push(ProfileEvent::TensorOperation {
629 operation: operation.to_string(),
630 tensor_shape: tensor_shape.to_vec(),
631 duration,
632 memory_allocated,
633 });
634 }
635
636 pub fn record_model_inference(
638 &mut self,
639 batch_size: usize,
640 sequence_length: usize,
641 duration: Duration,
642 ) {
643 let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
644
645 self.events.push(ProfileEvent::ModelInference {
646 batch_size,
647 sequence_length,
648 duration,
649 tokens_per_second,
650 });
651 }
652
653 pub fn record_gradient_computation(
655 &mut self,
656 layer_name: &str,
657 gradient_norm: f64,
658 duration: Duration,
659 ) {
660 self.events.push(ProfileEvent::GradientComputation {
661 layer_name: layer_name.to_string(),
662 gradient_norm,
663 duration,
664 });
665 }
666
667 pub fn take_memory_snapshot(&mut self) {
669 let snapshot = MemorySnapshot {
671 timestamp: chrono::Utc::now(),
672 heap_allocated: 0, heap_used: 0,
674 stack_size: 0,
675 gpu_allocated: None,
676 gpu_used: None,
677 };
678
679 self.memory_snapshots.push(snapshot);
680
681 if self.memory_snapshots.len() > 1000 {
683 self.memory_snapshots.drain(0..500);
684 }
685 }
686
    /// Re-runs all bottleneck analyses from scratch and returns the
    /// combined findings (also retained in `self.bottlenecks`).
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        // Discard findings from any previous run.
        self.bottlenecks.clear();

        self.analyze_layer_bottlenecks();

        self.analyze_memory_bottlenecks();

        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
702
703 pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
705 let mut stats = HashMap::new();
706
707 let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
709
710 for event in &self.events {
711 let event_type = match event {
712 ProfileEvent::FunctionCall { .. } => "FunctionCall",
713 ProfileEvent::LayerExecution { .. } => "LayerExecution",
714 ProfileEvent::TensorOperation { .. } => "TensorOperation",
715 ProfileEvent::ModelInference { .. } => "ModelInference",
716 ProfileEvent::GradientComputation { .. } => "GradientComputation",
717 };
718
719 grouped_events.entry(event_type.to_string()).or_default().push(event);
720 }
721
722 for (event_type, events) in grouped_events {
724 let durations: Vec<Duration> = events
725 .iter()
726 .filter_map(|event| match event {
727 ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
728 ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
729 ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
730 ProfileEvent::ModelInference { duration, .. } => Some(*duration),
731 ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
732 })
733 .collect();
734
735 if !durations.is_empty() {
736 let total_duration: Duration = durations.iter().sum();
737 let avg_duration = total_duration / durations.len() as u32;
738 let min_duration = durations.iter().min().copied().unwrap_or_default();
739 let max_duration = durations.iter().max().copied().unwrap_or_default();
740
741 stats.insert(
742 event_type.clone(),
743 ProfileStats {
744 event_type,
745 count: durations.len(),
746 total_duration,
747 avg_duration,
748 min_duration,
749 max_duration,
750 total_memory: 0, avg_memory: 0.0,
752 },
753 );
754 }
755 }
756
757 stats
758 }
759
    /// Per-layer aggregate profiles, keyed by layer name.
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }
764
    /// Memory snapshots in chronological order.
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }
769
770 pub async fn generate_report(&self) -> Result<ProfilerReport> {
772 let statistics = self.get_statistics();
773 let bottlenecks = self.bottlenecks.clone();
774 let total_events = self.events.len();
775
776 let total_runtime =
777 if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };
778
779 let slowest_layers = self.get_slowest_layers(5);
781
782 let memory_efficiency = self.analyze_memory_efficiency();
784
785 Ok(ProfilerReport {
786 total_events,
787 total_runtime,
788 statistics,
789 bottlenecks,
790 slowest_layers,
791 memory_efficiency,
792 recommendations: self.generate_performance_recommendations(),
793 })
794 }
795
    /// Resets all collected profiling state, including the shared memory
    /// tracker and the I/O monitor; the profiler can be reused afterwards.
    pub fn clear(&mut self) {
        self.events.clear();
        self.active_timers.clear();
        self.memory_snapshots.clear();
        self.layer_profiles.clear();
        self.bottlenecks.clear();
        self.start_time = None;
        self.gpu_kernel_profiles.clear();
        self.memory_allocations.clear();
        self.layer_latency_profiles.clear();
        self.io_profiles.clear();
        self.cpu_bottleneck_analysis.clear();
        // A poisoned tracker mutex is silently skipped (best effort).
        if let Ok(mut tracker) = self.memory_tracker.lock() {
            *tracker = MemoryTracker::new();
        }
        self.io_monitor = IoMonitor::new();
    }
815
816 pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
820 if let Some(ref mut gpu_profiler) = self.gpu_profiler {
821 gpu_profiler.profile_kernel(kernel_profile.clone());
822 }
823 self.gpu_kernel_profiles.push(kernel_profile);
824 }
825
826 pub fn track_memory_allocation(
828 &mut self,
829 size_bytes: usize,
830 allocation_type: MemoryAllocationType,
831 device_id: Option<i32>,
832 stack_trace: Vec<String>,
833 ) -> Uuid {
834 let allocation_id = Uuid::new_v4();
835 let allocation = MemoryAllocation {
836 allocation_id,
837 size_bytes,
838 allocation_type,
839 device_id,
840 timestamp: SystemTime::now(),
841 stack_trace,
842 freed: false,
843 free_timestamp: None,
844 };
845
846 if let Ok(mut tracker) = self.memory_tracker.lock() {
847 tracker.track_allocation(allocation.clone());
848 }
849
850 self.memory_allocations.insert(allocation_id, allocation);
851 allocation_id
852 }
853
854 pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
856 if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
857 allocation.freed = true;
858 allocation.free_timestamp = Some(SystemTime::now());
859 }
860
861 if let Ok(mut tracker) = self.memory_tracker.lock() {
862 tracker.track_deallocation(allocation_id);
863 }
864 }
865
866 pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
868 self.layer_latency_profiles
869 .insert(layer_latency.layer_name.clone(), layer_latency);
870 }
871
    /// Begins tracking an I/O operation; pass the returned id to
    /// `finish_io_profiling` when it completes.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }
880
881 pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
883 if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
884 {
885 self.io_profiles.push(profile);
886 }
887 }
888
    /// Produces a CPU bottleneck analysis and appends it to the history.
    ///
    /// NOTE(review): every figure below is a hard-coded sample value, not
    /// a real measurement — perf-counter integration is still missing.
    pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
        let analysis = CpuBottleneckAnalysis {
            thread_id: 0,
            cpu_usage: 0.75,
            context_switches: 1000,
            cache_misses: 500,
            instructions_per_cycle: 2.5,
            branch_mispredictions: 100,
            hot_functions: vec![
                HotFunction {
                    function_name: "tensor_multiply".to_string(),
                    self_time_percentage: 25.0,
                    call_count: 1000,
                    avg_time_per_call: Duration::from_micros(250),
                },
                HotFunction {
                    function_name: "gradient_computation".to_string(),
                    self_time_percentage: 20.0,
                    call_count: 500,
                    avg_time_per_call: Duration::from_micros(400),
                },
            ],
            bottleneck_score: 0.6,
        };

        self.cpu_bottleneck_analysis.push(analysis.clone());
        vec![analysis]
    }
920
921 pub fn get_memory_stats(&self) -> Option<MemoryStats> {
923 if let Ok(tracker) = self.memory_tracker.lock() {
924 Some(tracker.get_memory_stats())
925 } else {
926 None
927 }
928 }
929
    /// Average GPU compute utilization for the given device, or `None`
    /// when no GPU profiler was initialized.
    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
        self.gpu_profiler
            .as_ref()
            .map(|profiler| profiler.get_gpu_utilization(device_id))
    }
936
937 pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
939 let mut stats = HashMap::new();
940
941 stats.insert(
942 IoDeviceType::SSD,
943 self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
944 );
945 stats.insert(
946 IoDeviceType::HDD,
947 self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
948 );
949 stats.insert(
950 IoDeviceType::Network,
951 self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
952 );
953 stats.insert(
954 IoDeviceType::Memory,
955 self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
956 );
957 stats.insert(
958 IoDeviceType::Cache,
959 self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
960 );
961
962 stats
963 }
964
965 pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
967 self.layer_latency_profiles
968 .values()
969 .map(|profile| LayerLatencyAnalysis {
970 layer_name: profile.layer_name.clone(),
971 layer_type: profile.layer_type.clone(),
972 total_time: profile.cpu_time
973 + profile.gpu_time
974 + profile.memory_copy_time
975 + profile.sync_time,
976 cpu_percentage: profile.cpu_time.as_secs_f64()
977 / (profile.cpu_time
978 + profile.gpu_time
979 + profile.memory_copy_time
980 + profile.sync_time)
981 .as_secs_f64()
982 * 100.0,
983 gpu_percentage: profile.gpu_time.as_secs_f64()
984 / (profile.cpu_time
985 + profile.gpu_time
986 + profile.memory_copy_time
987 + profile.sync_time)
988 .as_secs_f64()
989 * 100.0,
990 memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
991 / (profile.cpu_time
992 + profile.gpu_time
993 + profile.memory_copy_time
994 + profile.sync_time)
995 .as_secs_f64()
996 * 100.0,
997 flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
998 profile.flops as f64 / profile.gpu_time.as_secs_f64()
999 } else {
1000 0.0
1001 },
1002 memory_bandwidth_utilization: profile.cache_hit_rate,
1003 bottleneck_type: self.identify_layer_bottleneck(profile),
1004 })
1005 .collect()
1006 }
1007
1008 pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
1010 let memory_stats = self.get_memory_stats();
1011 let io_bandwidth_stats = self.get_io_bandwidth_stats();
1012 let layer_analysis = self.get_layer_latency_analysis();
1013
1014 let gpu_utilization =
1015 self.gpu_profiler.as_ref().map(|profiler| profiler.get_gpu_utilization(0));
1016
1017 PerformanceAnalysis {
1018 memory_stats,
1019 io_bandwidth_stats,
1020 layer_analysis,
1021 gpu_utilization,
1022 cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
1023 total_gpu_kernels: self.gpu_kernel_profiles.len(),
1024 total_io_operations: self.io_profiles.len(),
1025 performance_score: self.calculate_overall_performance_score(),
1026 recommendations: self.generate_enhanced_recommendations(),
1027 }
1028 }
1029
1030 fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
1031 let total_time =
1032 profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;
1033
1034 if profile.memory_copy_time > total_time / 2 {
1035 "Memory Bandwidth".to_string()
1036 } else if profile.sync_time > total_time / 3 {
1037 "GPU Synchronization".to_string()
1038 } else if profile.gpu_time > profile.cpu_time * 10 {
1039 "GPU Compute".to_string()
1040 } else {
1041 "CPU Compute".to_string()
1042 }
1043 }
1044
1045 fn calculate_overall_performance_score(&self) -> f64 {
1046 let mut score: f64 = 100.0;
1047
1048 for bottleneck in &self.bottlenecks {
1050 match bottleneck.severity {
1051 BottleneckSeverity::Critical => score -= 20.0,
1052 BottleneckSeverity::High => score -= 10.0,
1053 BottleneckSeverity::Medium => score -= 5.0,
1054 BottleneckSeverity::Low => score -= 2.0,
1055 }
1056 }
1057
1058 if let Some(gpu_util) = self.get_gpu_utilization(0) {
1060 if gpu_util < 0.5 {
1061 score -= 15.0;
1062 } else if gpu_util < 0.7 {
1063 score -= 8.0;
1064 }
1065 }
1066
1067 if let Some(memory_stats) = self.get_memory_stats() {
1069 if memory_stats.memory_efficiency < 0.8 {
1070 score -= 10.0;
1071 }
1072 }
1073
1074 score.max(0.0)
1075 }
1076
1077 fn generate_enhanced_recommendations(&self) -> Vec<String> {
1078 let mut recommendations = Vec::new();
1079
1080 if let Some(gpu_util) = self.get_gpu_utilization(0) {
1082 if gpu_util < 0.5 {
1083 recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
1084 }
1085 }
1086
1087 if let Some(memory_stats) = self.get_memory_stats() {
1089 if memory_stats.memory_efficiency < 0.8 {
1090 recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
1091 }
1092
1093 if memory_stats.active_allocations > 10000 {
1094 recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
1095 }
1096 }
1097
1098 let io_stats = self.get_io_bandwidth_stats();
1100 if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
1101 if ssd_bandwidth < 100.0 {
1102 recommendations.push(
1104 "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
1105 .to_string(),
1106 );
1107 }
1108 }
1109
1110 let layer_analysis = self.get_layer_latency_analysis();
1112 for analysis in &layer_analysis {
1113 if analysis.memory_copy_percentage > 50.0 {
1114 recommendations.push(format!(
1115 "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
1116 analysis.layer_name
1117 ));
1118 }
1119
1120 if analysis.cpu_percentage > 80.0 {
1121 recommendations.push(format!(
1122 "Layer '{}' is CPU bound. Consider GPU acceleration.",
1123 analysis.layer_name
1124 ));
1125 }
1126 }
1127
1128 if recommendations.is_empty() {
1129 recommendations
1130 .push("Performance appears optimal based on current analysis.".to_string());
1131 }
1132
1133 recommendations
1134 }
1135
1136 fn analyze_layer_bottlenecks(&mut self) {
1139 for (layer_name, profile) in &self.layer_profiles {
1140 if profile.forward_times.is_empty() {
1141 continue;
1142 }
1143
1144 let avg_forward_time =
1145 profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
1146
1147 if avg_forward_time.as_millis() > 100 {
1149 let mut metrics = HashMap::new();
1150 metrics.insert(
1151 "avg_forward_time_ms".to_string(),
1152 avg_forward_time.as_millis() as f64,
1153 );
1154 metrics.insert("call_count".to_string(), profile.call_count as f64);
1155
1156 self.bottlenecks.push(PerformanceBottleneck {
1157 bottleneck_type: BottleneckType::ModelComputation,
1158 location: layer_name.clone(),
1159 severity: if avg_forward_time.as_millis() > 500 {
1160 BottleneckSeverity::High
1161 } else {
1162 BottleneckSeverity::Medium
1163 },
1164 description: format!(
1165 "Layer '{}' has slow forward pass: {:.1}ms average",
1166 layer_name,
1167 avg_forward_time.as_millis()
1168 ),
1169 suggestion: "Consider optimizing layer implementation or reducing layer size"
1170 .to_string(),
1171 metrics,
1172 });
1173 }
1174 }
1175 }
1176
    /// Flags a memory-growth bottleneck when heap usage more than doubles
    /// across the most recent snapshots.
    ///
    /// NOTE(review): snapshots currently record a placeholder 0 for
    /// `heap_allocated` (see `take_memory_snapshot`), so this check cannot
    /// trigger until real memory introspection is wired up. Also, when the
    /// initial snapshot is 0 the growth ratio divides by zero (yielding
    /// `inf`) — confirm intended handling once real data flows.
    fn analyze_memory_bottlenecks(&mut self) {
        // Need at least two snapshots to observe a trend.
        if self.memory_snapshots.len() < 2 {
            return;
        }

        // Look at up to the last 10 snapshots.
        let recent_snapshots = if self.memory_snapshots.len() > 10 {
            &self.memory_snapshots[self.memory_snapshots.len() - 10..]
        } else {
            &self.memory_snapshots
        };

        if recent_snapshots.len() >= 5 {
            let initial_memory = recent_snapshots[0].heap_allocated;
            let final_memory = recent_snapshots
                .last()
                .expect("recent_snapshots has at least 5 elements")
                .heap_allocated;

            // More than 2x growth across the window is treated as a leak signal.
            if final_memory > initial_memory * 2 {
                let mut metrics = HashMap::new();
                metrics.insert(
                    "initial_memory_mb".to_string(),
                    initial_memory as f64 / (1024.0 * 1024.0),
                );
                metrics.insert(
                    "final_memory_mb".to_string(),
                    final_memory as f64 / (1024.0 * 1024.0),
                );
                metrics.insert(
                    "growth_ratio".to_string(),
                    final_memory as f64 / initial_memory as f64,
                );

                self.bottlenecks.push(PerformanceBottleneck {
                    bottleneck_type: BottleneckType::MemoryBound,
                    location: "Memory Usage".to_string(),
                    severity: BottleneckSeverity::High,
                    description: "Significant memory growth detected during profiling".to_string(),
                    suggestion: "Check for memory leaks or inefficient memory usage patterns"
                        .to_string(),
                    metrics,
                });
            }
        }
    }
1223
1224 fn analyze_tensor_bottlenecks(&mut self) {
1225 let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
1227
1228 for event in &self.events {
1229 if let ProfileEvent::TensorOperation {
1230 operation,
1231 duration,
1232 ..
1233 } = event
1234 {
1235 operation_groups.entry(operation.clone()).or_default().push(*duration);
1236 }
1237 }
1238
1239 for (operation, durations) in operation_groups {
1241 if durations.is_empty() {
1242 continue;
1243 }
1244
1245 let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
1246 let total_time = durations.iter().sum::<Duration>();
1247
1248 if avg_duration.as_millis() > 10 {
1250 let mut metrics = HashMap::new();
1251 metrics.insert(
1252 "avg_duration_ms".to_string(),
1253 avg_duration.as_millis() as f64,
1254 );
1255 metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
1256 metrics.insert("call_count".to_string(), durations.len() as f64);
1257
1258 self.bottlenecks.push(PerformanceBottleneck {
1259 bottleneck_type: BottleneckType::CpuBound,
1260 location: format!("Tensor Operation: {}", operation),
1261 severity: if avg_duration.as_millis() > 50 {
1262 BottleneckSeverity::High
1263 } else {
1264 BottleneckSeverity::Medium
1265 },
1266 description: format!(
1267 "Tensor operation '{}' is slow: {:.1}ms average",
1268 operation,
1269 avg_duration.as_millis()
1270 ),
1271 suggestion:
1272 "Consider optimizing tensor operation or using different data types"
1273 .to_string(),
1274 metrics,
1275 });
1276 }
1277 }
1278 }
1279
1280 fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
1281 let mut layer_times: Vec<(String, Duration)> = self
1282 .layer_profiles
1283 .iter()
1284 .map(|(name, profile)| {
1285 let avg_time = if profile.forward_times.is_empty() {
1286 Duration::ZERO
1287 } else {
1288 profile.forward_times.iter().sum::<Duration>()
1289 / profile.forward_times.len() as u32
1290 };
1291 (name.clone(), avg_time)
1292 })
1293 .collect();
1294
1295 layer_times.sort_by_key(|item| std::cmp::Reverse(item.1));
1296 layer_times.truncate(limit);
1297 layer_times
1298 }
1299
1300 fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
1301 if self.memory_snapshots.is_empty() {
1302 return MemoryEfficiencyAnalysis::default();
1303 }
1304
1305 let memory_values: Vec<usize> =
1306 self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
1307
1308 let max_memory = memory_values.iter().max().copied().unwrap_or(0);
1309 let min_memory = memory_values.iter().min().copied().unwrap_or(0);
1310 let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
1311
1312 MemoryEfficiencyAnalysis {
1313 peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
1314 min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
1315 avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
1316 memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
1317 efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
1318 }
1319 }
1320
1321 fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
1322 if values.len() < 2 {
1323 return 0.0;
1324 }
1325
1326 let variance_sum: f64 = values
1327 .iter()
1328 .map(|&x| {
1329 let diff = x as f64 - mean as f64;
1330 diff * diff
1331 })
1332 .sum();
1333
1334 variance_sum / (values.len() - 1) as f64
1335 }
1336
1337 fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
1338 if values.is_empty() {
1339 return 0.0;
1340 }
1341
1342 let max_memory = values.iter().max().copied().unwrap_or(0);
1343 let min_memory = values.iter().min().copied().unwrap_or(0);
1344
1345 if max_memory == 0 {
1346 return 100.0;
1347 }
1348
1349 100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
1351 }
1352
1353 fn generate_performance_recommendations(&self) -> Vec<String> {
1354 let mut recommendations = Vec::new();
1355
1356 for bottleneck in &self.bottlenecks {
1358 match bottleneck.bottleneck_type {
1359 BottleneckType::ModelComputation => {
1360 recommendations.push(
1361 "Consider model architecture optimizations or layer fusion".to_string(),
1362 );
1363 },
1364 BottleneckType::MemoryBound => {
1365 recommendations.push(
1366 "Optimize memory usage with gradient checkpointing or model parallelism"
1367 .to_string(),
1368 );
1369 },
1370 BottleneckType::CpuBound => {
1371 recommendations.push(
1372 "Consider GPU acceleration or optimized CPU implementations".to_string(),
1373 );
1374 },
1375 _ => {},
1376 }
1377 }
1378
1379 if self.events.len() > 10000 {
1381 recommendations.push(
1382 "High number of profiling events - consider reducing profiling overhead"
1383 .to_string(),
1384 );
1385 }
1386
1387 let stats = self.get_statistics();
1388 if let Some(layer_stats) = stats.get("LayerExecution") {
1389 if layer_stats.avg_duration.as_millis() > 50 {
1390 recommendations.push(
1391 "Average layer execution time is high - consider layer optimization"
1392 .to_string(),
1393 );
1394 }
1395 }
1396
1397 if recommendations.is_empty() {
1398 recommendations
1399 .push("Performance appears optimal based on current profiling data".to_string());
1400 }
1401
1402 recommendations
1403 }
1404}
1405
/// Aggregate view of heap usage over a profiling run, derived from
/// `MemorySnapshot.heap_allocated` samples (sizes reported in megabytes).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryEfficiencyAnalysis {
    /// Highest heap allocation observed across snapshots.
    pub peak_memory_mb: f64,
    /// Lowest heap allocation observed across snapshots.
    pub min_memory_mb: f64,
    /// Mean heap allocation across snapshots.
    pub avg_memory_mb: f64,
    /// Sample variance of per-snapshot heap allocation (bytes squared).
    pub memory_variance: f64,
    /// 0-100 stability score; 100 means usage never varied.
    pub efficiency_score: f64,
}
1415
1416impl Default for MemoryEfficiencyAnalysis {
1417 fn default() -> Self {
1418 Self {
1419 peak_memory_mb: 0.0,
1420 min_memory_mb: 0.0,
1421 avg_memory_mb: 0.0,
1422 memory_variance: 0.0,
1423 efficiency_score: 100.0,
1424 }
1425 }
1426}
1427
/// Basic profiling report: aggregate statistics, detected bottlenecks,
/// layer rankings, memory analysis, and tuning advice.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilerReport {
    /// Number of profiling events recorded.
    pub total_events: usize,
    /// Total runtime covered by the report (presumably wall-clock of the
    /// profiled session — confirm in `generate_report`).
    pub total_runtime: Duration,
    /// Per-event-type aggregate statistics, keyed by event-type name
    /// (e.g. "LayerExecution").
    pub statistics: HashMap<String, ProfileStats>,
    /// Bottlenecks found by the analysis passes.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Layers ranked by average forward time, slowest first.
    pub slowest_layers: Vec<(String, Duration)>,
    /// Heap-usage summary derived from memory snapshots.
    pub memory_efficiency: MemoryEfficiencyAnalysis,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1439
1440pub struct ScopedTimer<'a> {
1442 profiler: &'a mut Profiler,
1443 name: String,
1444}
1445
impl<'a> ScopedTimer<'a> {
    /// Starts the named timer on `profiler` and returns a guard that ends
    /// it when dropped.
    pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
        profiler.start_timer(&name);
        Self { profiler, name }
    }
}
1452
impl<'a> Drop for ScopedTimer<'a> {
    /// Ends the timer started in `ScopedTimer::new` when the guard leaves
    /// scope.
    fn drop(&mut self) {
        self.profiler.end_timer(&self.name);
    }
}
1458
/// Per-layer latency breakdown used by the performance analysis.
///
/// NOTE(review): these fields are populated outside this chunk; units for
/// the percentage/utilization fields (0-1 vs 0-100) should be confirmed at
/// the fill site.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyAnalysis {
    /// Name of the profiled layer.
    pub layer_name: String,
    /// Layer type label.
    pub layer_type: String,
    /// Total time attributed to this layer.
    pub total_time: Duration,
    /// Share of time spent on CPU.
    pub cpu_percentage: f64,
    /// Share of time spent on GPU.
    pub gpu_percentage: f64,
    /// Share of time spent copying memory.
    pub memory_copy_percentage: f64,
    /// Achieved floating-point throughput.
    pub flops_per_second: f64,
    /// Fraction of available memory bandwidth used.
    pub memory_bandwidth_utilization: f64,
    /// Classification of the layer's dominant bottleneck.
    pub bottleneck_type: String,
}
1472
/// System-wide performance analysis combining memory, I/O, layer, GPU, and
/// CPU findings.
#[derive(Debug, Serialize, Deserialize)]
pub struct PerformanceAnalysis {
    /// Allocation statistics, when memory tracking data is available.
    pub memory_stats: Option<MemoryStats>,
    /// Average bandwidth per I/O device type.
    pub io_bandwidth_stats: HashMap<IoDeviceType, f64>,
    /// Per-layer latency breakdowns.
    pub layer_analysis: Vec<LayerLatencyAnalysis>,
    /// Overall GPU utilization, when GPU data was collected.
    pub gpu_utilization: Option<f64>,
    /// Detected CPU-side bottlenecks.
    pub cpu_bottlenecks: Vec<CpuBottleneckAnalysis>,
    /// Total number of GPU kernels recorded.
    pub total_gpu_kernels: usize,
    /// Total number of I/O operations recorded.
    pub total_io_operations: usize,
    /// Aggregate performance score (computed outside this chunk;
    /// presumably 0-100 — confirm at the producer).
    pub performance_score: f64,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1486
/// Extended report: the basic report plus deeper GPU, memory-allocation,
/// and I/O summaries. Built by `Profiler::generate_enhanced_report`.
#[derive(Debug, Serialize, Deserialize)]
pub struct EnhancedProfilerReport {
    /// The standard report from `generate_report`.
    pub basic_report: ProfilerReport,
    /// Combined system-wide performance analysis.
    pub performance_analysis: PerformanceAnalysis,
    /// GPU kernel execution summary.
    pub gpu_kernel_summary: GpuKernelSummary,
    /// Memory allocation summary.
    pub memory_allocation_summary: MemoryAllocationSummary,
    /// I/O throughput summary.
    pub io_performance_summary: IoPerformanceSummary,
}
1496
/// Aggregate statistics over recorded GPU kernel profiles.
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
    /// Number of kernel profiles recorded.
    pub total_kernels: usize,
    /// Sum of all kernel execution times.
    pub total_execution_time: Duration,
    /// Mean occupancy across kernels (0.0 when none were recorded).
    pub avg_occupancy: f64,
    /// Mean compute utilization across kernels (0.0 when none were recorded).
    pub avg_compute_utilization: f64,
    /// Names of up to five longest-running kernels, slowest first.
    pub slowest_kernels: Vec<String>,
}
1505
/// Aggregate statistics over tracked memory allocations.
#[derive(Debug, Serialize, Deserialize)]
pub struct MemoryAllocationSummary {
    /// Number of allocations tracked.
    pub total_allocations: usize,
    /// Size in bytes of the largest single allocation (not peak resident
    /// usage — see `generate_memory_allocation_summary`).
    pub peak_memory_usage: usize,
    /// Efficiency figure from memory stats (1.0 when stats are unavailable).
    pub memory_efficiency: f64,
    /// Descriptions of up to five largest allocations, biggest first.
    pub largest_allocations: Vec<String>,
    /// Count of allocations never marked freed (leak candidates).
    pub memory_leaks: usize,
}
1514
/// Aggregate statistics over recorded I/O operations.
#[derive(Debug, Serialize, Deserialize)]
pub struct IoPerformanceSummary {
    /// Number of I/O operations recorded.
    pub total_operations: usize,
    /// Total bytes moved across all operations.
    pub total_bytes_transferred: usize,
    /// Average bandwidth per device type.
    pub avg_bandwidth_by_device: HashMap<IoDeviceType, f64>,
    /// Descriptions of up to five slowest operations, slowest first.
    pub slowest_operations: Vec<String>,
}
1522
1523impl Profiler {
1524 pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
1526 let basic_report = self.generate_report().await?;
1527 let performance_analysis = self.get_performance_analysis();
1528
1529 let gpu_kernel_summary = self.generate_gpu_kernel_summary();
1530 let memory_allocation_summary = self.generate_memory_allocation_summary();
1531 let io_performance_summary = self.generate_io_performance_summary();
1532
1533 Ok(EnhancedProfilerReport {
1534 basic_report,
1535 performance_analysis,
1536 gpu_kernel_summary,
1537 memory_allocation_summary,
1538 io_performance_summary,
1539 })
1540 }
1541
1542 fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
1543 let total_kernels = self.gpu_kernel_profiles.len();
1544 let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();
1545
1546 let avg_occupancy = if total_kernels > 0 {
1547 self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
1548 } else {
1549 0.0
1550 };
1551
1552 let avg_compute_utilization = if total_kernels > 0 {
1553 self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
1554 / total_kernels as f64
1555 } else {
1556 0.0
1557 };
1558
1559 let mut kernels_by_time: Vec<_> = self
1560 .gpu_kernel_profiles
1561 .iter()
1562 .map(|k| (k.kernel_name.clone(), k.execution_time))
1563 .collect();
1564 kernels_by_time.sort_by_key(|item| std::cmp::Reverse(item.1));
1565
1566 let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();
1567
1568 GpuKernelSummary {
1569 total_kernels,
1570 total_execution_time,
1571 avg_occupancy,
1572 avg_compute_utilization,
1573 slowest_kernels,
1574 }
1575 }
1576
1577 fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
1578 let total_allocations = self.memory_allocations.len();
1579 let peak_memory_usage =
1580 self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);
1581
1582 let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
1583 stats.memory_efficiency
1584 } else {
1585 1.0
1586 };
1587
1588 let mut allocations_by_size: Vec<_> = self
1589 .memory_allocations
1590 .values()
1591 .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
1592 .collect();
1593 allocations_by_size.sort_by_key(|item| std::cmp::Reverse(item.1));
1594
1595 let largest_allocations =
1596 allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();
1597
1598 let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();
1599
1600 MemoryAllocationSummary {
1601 total_allocations,
1602 peak_memory_usage,
1603 memory_efficiency,
1604 largest_allocations,
1605 memory_leaks,
1606 }
1607 }
1608
1609 fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
1610 let total_operations = self.io_profiles.len();
1611 let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();
1612
1613 let avg_bandwidth_by_device = self.get_io_bandwidth_stats();
1614
1615 let mut operations_by_duration: Vec<_> = self
1616 .io_profiles
1617 .iter()
1618 .map(|io| {
1619 (
1620 format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
1621 io.duration,
1622 )
1623 })
1624 .collect();
1625 operations_by_duration.sort_by_key(|item| std::cmp::Reverse(item.1));
1626
1627 let slowest_operations =
1628 operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();
1629
1630 IoPerformanceSummary {
1631 total_operations,
1632 total_bytes_transferred,
1633 avg_bandwidth_by_device,
1634 slowest_operations,
1635 }
1636 }
1637}
1638
/// Profiles the remainder of the current scope under `$name` by binding a
/// `ScopedTimer` to a local; the timer ends when the scope exits.
///
/// NOTE(review): the expansion references `ScopedTimer` unqualified, so
/// despite `#[macro_export]` the macro only works where `ScopedTimer` is
/// already in scope at the call site; a `$crate::...` path would fix this —
/// verify the module path before changing.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}