1use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant, SystemTime};
8use uuid::Uuid;
9
10use crate::DebugConfig;
11
/// A single recorded profiling event, tagged by the kind of work measured.
///
/// Events are accumulated by `Profiler` and aggregated per variant in
/// `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProfileEvent {
    /// A timed call recorded via `Profiler::start_timer`/`end_timer`.
    FunctionCall {
        function_name: String,
        duration: Duration,
        // Net memory change; may be negative. `end_timer` currently records 0.
        memory_delta: i64,
    },
    /// One forward (and optionally backward) pass through a named layer.
    LayerExecution {
        layer_name: String,
        layer_type: String,
        forward_time: Duration,
        // `None` for forward-only (inference) passes.
        backward_time: Option<Duration>,
        memory_usage: usize,
        parameter_count: usize,
    },
    /// A single tensor-level operation and its cost.
    TensorOperation {
        operation: String,
        tensor_shape: Vec<usize>,
        duration: Duration,
        memory_allocated: usize,
    },
    /// A full model inference pass; throughput is derived from
    /// `batch_size * sequence_length / duration` at the recording site.
    ModelInference {
        batch_size: usize,
        sequence_length: usize,
        duration: Duration,
        tokens_per_second: f64,
    },
    /// Gradient computation for one layer.
    GradientComputation {
        layer_name: String,
        gradient_norm: f64,
        duration: Duration,
    },
}
46
/// Aggregate duration/memory statistics for one `ProfileEvent` variant,
/// produced by `Profiler::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileStats {
    /// Variant name, e.g. "FunctionCall".
    pub event_type: String,
    /// Number of events of this type that carried a duration.
    pub count: usize,
    pub total_duration: Duration,
    pub avg_duration: Duration,
    pub min_duration: Duration,
    pub max_duration: Duration,
    /// Total memory attributed to these events (signed: deltas may be negative).
    pub total_memory: i64,
    /// Mean memory per event.
    pub avg_memory: f64,
}
59
/// Point-in-time view of process memory, appended to the timeline by
/// `Profiler::take_memory_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub heap_allocated: usize,
    pub heap_used: usize,
    pub stack_size: usize,
    // GPU figures are `None` when no GPU data is available.
    pub gpu_allocated: Option<usize>,
    pub gpu_used: Option<usize>,
}
70
/// One detected performance problem, with a human-readable description,
/// an actionable suggestion, and the raw metrics that triggered it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    pub bottleneck_type: BottleneckType,
    /// Where the bottleneck was found (layer name, operation name, …).
    pub location: String,
    pub severity: BottleneckSeverity,
    pub description: String,
    pub suggestion: String,
    /// Supporting numbers keyed by metric name (e.g. "avg_forward_time_ms").
    pub metrics: HashMap<String, f64>,
}
81
/// Broad classification of what resource or stage a bottleneck is bound by.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CpuBound,
    MemoryBound,
    IoBound,
    GpuBound,
    NetworkBound,
    DataLoading,
    ModelComputation,
    GradientComputation,
}
93
/// How severely a bottleneck impacts performance; used to weight the
/// overall performance score (see `calculate_overall_performance_score`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
101
/// A node in a CPU call-tree profile: time spent in a function itself
/// (`self_time`) versus including its callees (`total_time`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuProfile {
    pub function_name: String,
    pub self_time: Duration,
    pub total_time: Duration,
    pub call_count: usize,
    /// Profiles of functions called from this one.
    pub children: Vec<CpuProfile>,
}
111
/// Execution profile for a single GPU kernel launch.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuKernelProfile {
    pub kernel_name: String,
    /// Launch grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Thread-block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    pub shared_memory_bytes: usize,
    pub registers_per_thread: u32,
    /// Achieved occupancy; presumably in [0, 1] — TODO confirm at call sites.
    pub occupancy: f64,
    pub execution_time: Duration,
    pub memory_bandwidth_gb_s: f64,
    /// Compute utilization; averaged by `GpuProfiler::get_gpu_utilization`.
    pub compute_utilization: f64,
    /// Stream the kernel ran on; used as the grouping key in `GpuProfiler`.
    pub stream_id: i32,
}
126
/// Record of a single tracked memory allocation and (eventually) its free.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    pub allocation_id: Uuid,
    pub size_bytes: usize,
    pub allocation_type: MemoryAllocationType,
    /// `None` for host allocations with no associated device.
    pub device_id: Option<i32>,
    pub timestamp: SystemTime,
    pub stack_trace: Vec<String>,
    /// Set to true by the trackers when the allocation is released.
    pub freed: bool,
    pub free_timestamp: Option<SystemTime>,
}
139
/// Classifies where/how a tracked buffer was allocated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MemoryAllocationType {
    Host,
    Device,
    Unified,
    Pinned,
    Mapped,
}
148
/// Fine-grained latency breakdown for one layer, split across CPU compute,
/// GPU compute, host/device copies, and synchronization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyProfile {
    /// Unique key: `Profiler::profile_layer_latency` indexes profiles by this name.
    pub layer_name: String,
    pub layer_type: String,
    pub input_shapes: Vec<Vec<usize>>,
    pub output_shapes: Vec<Vec<usize>>,
    pub cpu_time: Duration,
    pub gpu_time: Duration,
    pub memory_copy_time: Duration,
    pub sync_time: Duration,
    pub parameter_count: usize,
    pub flops: u64,
    pub memory_footprint_bytes: usize,
    /// Cache hit rate; also reported as bandwidth utilization downstream.
    pub cache_hit_rate: f64,
}
165
/// Completed I/O operation profile, produced by `IoMonitor::finish_io_operation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoProfile {
    pub operation_type: IoOperationType,
    /// `IoMonitor` leaves this `None`; callers may fill it in.
    pub file_path: Option<String>,
    pub bytes_transferred: usize,
    pub duration: Duration,
    pub bandwidth_mb_s: f64,
    /// Estimated time spent queued (synthetic — see `finish_io_operation`).
    pub queue_time: Duration,
    pub device_type: IoDeviceType,
}
177
/// Kind of I/O operation being profiled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IoOperationType {
    FileRead,
    FileWrite,
    NetworkRead,
    NetworkWrite,
    DatabaseQuery,
    CacheLoad,
    CacheStore,
}
188
/// Device class an I/O operation is attributed to. `Eq + Hash` so it can
/// key the bandwidth-stats map in `Profiler::get_io_bandwidth_stats`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum IoDeviceType {
    SSD,
    HDD,
    Network,
    Memory,
    Cache,
}
197
/// Per-thread CPU bottleneck report with hardware-counter style metrics
/// and the hottest functions by self time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuBottleneckAnalysis {
    pub thread_id: u64,
    /// CPU usage fraction; presumably in [0, 1] — TODO confirm at call sites.
    pub cpu_usage: f64,
    pub context_switches: u64,
    pub cache_misses: u64,
    pub instructions_per_cycle: f64,
    pub branch_mispredictions: u64,
    pub hot_functions: Vec<HotFunction>,
    /// Composite score; higher means more bottlenecked.
    pub bottleneck_score: f64,
}
210
/// A function that dominates CPU self time in a `CpuBottleneckAnalysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HotFunction {
    pub function_name: String,
    /// Percentage of total self time (0–100).
    pub self_time_percentage: f64,
    pub call_count: usize,
    pub avg_time_per_call: Duration,
}
218
/// Running ledger of live allocations plus lifetime counters.
///
/// Shared behind `Arc<Mutex<_>>` by `Profiler`.
#[derive(Debug)]
pub struct MemoryTracker {
    // Live (not yet freed) allocations, keyed by id.
    allocations: HashMap<Uuid, MemoryAllocation>,
    // Bytes currently outstanding.
    total_allocated: usize,
    // High-water mark of `total_allocated`.
    peak_allocated: usize,
    allocation_count: usize,
    deallocation_count: usize,
}
228
229impl MemoryTracker {
230 pub fn new() -> Self {
231 Self {
232 allocations: HashMap::new(),
233 total_allocated: 0,
234 peak_allocated: 0,
235 allocation_count: 0,
236 deallocation_count: 0,
237 }
238 }
239
240 pub fn track_allocation(&mut self, allocation: MemoryAllocation) {
241 self.total_allocated += allocation.size_bytes;
242 self.allocation_count += 1;
243
244 if self.total_allocated > self.peak_allocated {
245 self.peak_allocated = self.total_allocated;
246 }
247
248 self.allocations.insert(allocation.allocation_id, allocation);
249 }
250
251 pub fn track_deallocation(&mut self, allocation_id: Uuid) {
252 if let Some(mut allocation) = self.allocations.remove(&allocation_id) {
253 allocation.freed = true;
254 allocation.free_timestamp = Some(SystemTime::now());
255 self.total_allocated = self.total_allocated.saturating_sub(allocation.size_bytes);
256 self.deallocation_count += 1;
257 }
258 }
259
260 pub fn get_memory_stats(&self) -> MemoryStats {
261 MemoryStats {
262 total_allocated: self.total_allocated,
263 peak_allocated: self.peak_allocated,
264 active_allocations: self.allocations.len(),
265 allocation_count: self.allocation_count,
266 deallocation_count: self.deallocation_count,
267 memory_efficiency: if self.allocation_count > 0 {
268 self.deallocation_count as f64 / self.allocation_count as f64
269 } else {
270 1.0
271 },
272 }
273 }
274}
275
/// Summary view of a `MemoryTracker`'s counters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryStats {
    /// Bytes currently outstanding.
    pub total_allocated: usize,
    /// Highest value `total_allocated` has reached.
    pub peak_allocated: usize,
    /// Number of live (unfreed) allocations.
    pub active_allocations: usize,
    pub allocation_count: usize,
    pub deallocation_count: usize,
    /// Frees divided by allocations (1.0 when no allocations yet).
    pub memory_efficiency: f64,
}
285
/// Collects per-stream GPU kernel profiles and memory-pool state.
#[derive(Debug)]
#[allow(dead_code)]
pub struct GpuProfiler {
    #[allow(dead_code)]
    device_count: i32,
    // Kernel profiles grouped by `GpuKernelProfile::stream_id`.
    active_streams: HashMap<i32, Vec<GpuKernelProfile>>,
    // Memory pool state keyed by device id.
    memory_pools: HashMap<i32, GpuMemoryPool>,
}
295
/// Memory-pool state for one GPU device.
#[allow(dead_code)]
#[derive(Debug)]
pub struct GpuMemoryPool {
    #[allow(dead_code)]
    device_id: i32,
    total_memory: usize,
    free_memory: usize,
    // Higher means more fragmented; exact scale not defined here.
    fragmentation_score: f64,
}
305
306impl GpuProfiler {
307 pub fn new() -> Result<Self> {
308 Ok(Self {
310 device_count: 1, active_streams: HashMap::new(),
312 memory_pools: HashMap::new(),
313 })
314 }
315
316 pub fn profile_kernel(&mut self, kernel_profile: GpuKernelProfile) {
317 self.active_streams
318 .entry(kernel_profile.stream_id)
319 .or_insert_with(Vec::new)
320 .push(kernel_profile);
321 }
322
323 pub fn get_gpu_utilization(&self, device_id: i32) -> f64 {
324 if let Some(kernels) = self.active_streams.get(&device_id) {
326 if kernels.is_empty() {
327 0.0
328 } else {
329 kernels.iter().map(|k| k.compute_utilization).sum::<f64>() / kernels.len() as f64
330 }
331 } else {
332 0.0
333 }
334 }
335}
336
/// Tracks in-flight I/O operations and a rolling history of bandwidth samples.
#[derive(Debug)]
pub struct IoMonitor {
    // In-flight operations keyed by the id handed out at start.
    active_operations: HashMap<Uuid, IoOperation>,
    // Rolling history, trimmed once it exceeds 1000 samples.
    bandwidth_history: Vec<BandwidthSample>,
    // Count of operations currently in flight.
    io_queue_depth: usize,
}
/// An I/O operation that has started but not yet finished.
#[allow(dead_code)]
#[derive(Debug)]
pub struct IoOperation {
    #[allow(dead_code)]
    operation_id: Uuid,
    // Wall-clock start; elapsed time is measured against this at finish.
    start_time: Instant,
    operation_type: IoOperationType,
    // Bytes the caller expects to transfer (actual count arrives at finish).
    bytes_expected: usize,
}
353
/// One measured bandwidth observation, recorded when an I/O operation finishes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BandwidthSample {
    pub timestamp: SystemTime,
    pub bandwidth_mb_s: f64,
    pub device_type: IoDeviceType,
}
360
361impl IoMonitor {
362 pub fn new() -> Self {
363 Self {
364 active_operations: HashMap::new(),
365 bandwidth_history: Vec::new(),
366 io_queue_depth: 0,
367 }
368 }
369
370 pub fn start_io_operation(
371 &mut self,
372 operation_type: IoOperationType,
373 bytes_expected: usize,
374 ) -> Uuid {
375 let operation_id = Uuid::new_v4();
376 let operation = IoOperation {
377 operation_id,
378 start_time: Instant::now(),
379 operation_type,
380 bytes_expected,
381 };
382
383 self.active_operations.insert(operation_id, operation);
384 self.io_queue_depth += 1;
385 operation_id
386 }
387
388 pub fn finish_io_operation(
389 &mut self,
390 operation_id: Uuid,
391 bytes_transferred: usize,
392 ) -> Option<IoProfile> {
393 if let Some(operation) = self.active_operations.remove(&operation_id) {
394 let duration = operation.start_time.elapsed();
395 let bandwidth_mb_s = if duration.as_secs_f64() > 0.0 {
396 bytes_transferred as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()
397 } else {
398 0.0
399 };
400
401 self.io_queue_depth = self.io_queue_depth.saturating_sub(1);
402
403 let device_type = match operation.operation_type {
404 IoOperationType::FileRead | IoOperationType::FileWrite => IoDeviceType::SSD,
405 IoOperationType::NetworkRead | IoOperationType::NetworkWrite => {
406 IoDeviceType::Network
407 },
408 IoOperationType::CacheLoad | IoOperationType::CacheStore => IoDeviceType::Cache,
409 _ => IoDeviceType::Memory,
410 };
411
412 self.bandwidth_history.push(BandwidthSample {
414 timestamp: SystemTime::now(),
415 bandwidth_mb_s,
416 device_type: device_type.clone(),
417 });
418
419 if self.bandwidth_history.len() > 1000 {
421 self.bandwidth_history.drain(0..500);
422 }
423
424 Some(IoProfile {
425 operation_type: operation.operation_type,
426 file_path: None, bytes_transferred,
428 duration,
429 bandwidth_mb_s,
430 queue_time: Duration::from_millis(self.io_queue_depth as u64 * 10), device_type,
432 })
433 } else {
434 None
435 }
436 }
437
438 pub fn get_average_bandwidth(&self, device_type: &IoDeviceType) -> f64 {
439 let samples: Vec<f64> = self
440 .bandwidth_history
441 .iter()
442 .filter(|s| s.device_type == *device_type)
443 .map(|s| s.bandwidth_mb_s)
444 .collect();
445
446 if samples.is_empty() {
447 0.0
448 } else {
449 samples.iter().sum::<f64>() / samples.len() as f64
450 }
451 }
452}
453
/// Central performance profiler: collects events, layer profiles, memory
/// snapshots, GPU kernel profiles, and I/O profiles, and derives bottleneck
/// analyses and reports from them.
#[derive(Debug)]
pub struct Profiler {
    #[allow(dead_code)]
    config: DebugConfig,
    // Chronological list of everything recorded.
    events: Vec<ProfileEvent>,
    // Named timers started via `start_timer`, keyed by name.
    active_timers: HashMap<String, Instant>,
    // Rolling timeline, trimmed once it exceeds 1000 snapshots.
    memory_snapshots: Vec<MemorySnapshot>,
    // Set by `start()`; used for total runtime in reports.
    start_time: Option<Instant>,
    // Per-layer running aggregates, keyed by layer name.
    layer_profiles: HashMap<String, LayerProfile>,
    // Findings from the most recent `analyze_performance` run.
    bottlenecks: Vec<PerformanceBottleneck>,
    gpu_kernel_profiles: Vec<GpuKernelProfile>,
    memory_allocations: HashMap<Uuid, MemoryAllocation>,
    layer_latency_profiles: HashMap<String, LayerLatencyProfile>,
    io_profiles: Vec<IoProfile>,
    cpu_bottleneck_analysis: Vec<CpuBottleneckAnalysis>,
    // Shared so other components can hold a handle to the same tracker.
    memory_tracker: Arc<Mutex<MemoryTracker>>,
    // `None` if GPU profiler construction failed.
    gpu_profiler: Option<GpuProfiler>,
    io_monitor: IoMonitor,
}
475
/// Running per-layer aggregates accumulated by `record_layer_execution`.
#[derive(Debug)]
pub struct LayerProfile {
    #[allow(dead_code)]
    layer_name: String,
    // One entry per recorded forward pass.
    forward_times: Vec<Duration>,
    // May be shorter than `forward_times` (forward-only passes record none).
    backward_times: Vec<Duration>,
    memory_usage: Vec<usize>,
    call_count: usize,
}
485
impl LayerProfile {
    /// Forward-pass durations, one entry per recorded execution.
    pub fn forward_times(&self) -> &Vec<Duration> {
        &self.forward_times
    }

    /// Backward-pass durations; may be shorter than `forward_times` when
    /// some executions were forward-only.
    pub fn backward_times(&self) -> &Vec<Duration> {
        &self.backward_times
    }

    /// Memory usage recorded at each execution.
    pub fn memory_usage(&self) -> &Vec<usize> {
        &self.memory_usage
    }

    /// Total number of recorded executions for this layer.
    pub fn call_count(&self) -> usize {
        self.call_count
    }
}
507
508impl Profiler {
    /// Builds an idle profiler from the given debug configuration.
    ///
    /// The GPU profiler is optional: `GpuProfiler::new().ok()` leaves it
    /// `None` on failure (construction is currently infallible).
    pub fn new(config: &DebugConfig) -> Self {
        Self {
            config: config.clone(),
            events: Vec::new(),
            active_timers: HashMap::new(),
            memory_snapshots: Vec::new(),
            start_time: None,
            layer_profiles: HashMap::new(),
            bottlenecks: Vec::new(),
            gpu_kernel_profiles: Vec::new(),
            memory_allocations: HashMap::new(),
            layer_latency_profiles: HashMap::new(),
            io_profiles: Vec::new(),
            cpu_bottleneck_analysis: Vec::new(),
            memory_tracker: Arc::new(Mutex::new(MemoryTracker::new())),
            gpu_profiler: GpuProfiler::new().ok(),
            io_monitor: IoMonitor::new(),
        }
    }
530
    /// Marks the profiling start time and records an initial memory snapshot.
    pub async fn start(&mut self) -> Result<()> {
        tracing::info!("Starting performance profiler");
        self.start_time = Some(Instant::now());
        self.take_memory_snapshot();
        Ok(())
    }
538
    /// All events recorded so far, in insertion order.
    pub fn get_events(&self) -> &Vec<ProfileEvent> {
        &self.events
    }
543
    /// Starts a named wall-clock timer; restarting an existing name
    /// silently overwrites its previous start time.
    pub fn start_timer(&mut self, name: &str) {
        self.active_timers.insert(name.to_string(), Instant::now());
    }
548
549 pub fn end_timer(&mut self, name: &str) -> Option<Duration> {
551 if let Some(start_time) = self.active_timers.remove(name) {
552 let duration = start_time.elapsed();
553
554 self.events.push(ProfileEvent::FunctionCall {
556 function_name: name.to_string(),
557 duration,
558 memory_delta: 0, });
560
561 Some(duration)
562 } else {
563 tracing::warn!("Timer '{}' was not started", name);
564 None
565 }
566 }
567
568 pub fn record_layer_execution(
570 &mut self,
571 layer_name: &str,
572 layer_type: &str,
573 forward_time: Duration,
574 backward_time: Option<Duration>,
575 memory_usage: usize,
576 parameter_count: usize,
577 ) {
578 self.events.push(ProfileEvent::LayerExecution {
580 layer_name: layer_name.to_string(),
581 layer_type: layer_type.to_string(),
582 forward_time,
583 backward_time,
584 memory_usage,
585 parameter_count,
586 });
587
588 let profile =
590 self.layer_profiles
591 .entry(layer_name.to_string())
592 .or_insert_with(|| LayerProfile {
593 layer_name: layer_name.to_string(),
594 forward_times: Vec::new(),
595 backward_times: Vec::new(),
596 memory_usage: Vec::new(),
597 call_count: 0,
598 });
599
600 profile.forward_times.push(forward_time);
601 if let Some(backward) = backward_time {
602 profile.backward_times.push(backward);
603 }
604 profile.memory_usage.push(memory_usage);
605 profile.call_count += 1;
606 }
607
608 pub fn record_tensor_operation(
610 &mut self,
611 operation: &str,
612 tensor_shape: &[usize],
613 duration: Duration,
614 memory_allocated: usize,
615 ) {
616 self.events.push(ProfileEvent::TensorOperation {
617 operation: operation.to_string(),
618 tensor_shape: tensor_shape.to_vec(),
619 duration,
620 memory_allocated,
621 });
622 }
623
624 pub fn record_model_inference(
626 &mut self,
627 batch_size: usize,
628 sequence_length: usize,
629 duration: Duration,
630 ) {
631 let tokens_per_second = (batch_size * sequence_length) as f64 / duration.as_secs_f64();
632
633 self.events.push(ProfileEvent::ModelInference {
634 batch_size,
635 sequence_length,
636 duration,
637 tokens_per_second,
638 });
639 }
640
641 pub fn record_gradient_computation(
643 &mut self,
644 layer_name: &str,
645 gradient_norm: f64,
646 duration: Duration,
647 ) {
648 self.events.push(ProfileEvent::GradientComputation {
649 layer_name: layer_name.to_string(),
650 gradient_norm,
651 duration,
652 });
653 }
654
655 pub fn take_memory_snapshot(&mut self) {
657 let snapshot = MemorySnapshot {
659 timestamp: chrono::Utc::now(),
660 heap_allocated: 0, heap_used: 0,
662 stack_size: 0,
663 gpu_allocated: None,
664 gpu_used: None,
665 };
666
667 self.memory_snapshots.push(snapshot);
668
669 if self.memory_snapshots.len() > 1000 {
671 self.memory_snapshots.drain(0..500);
672 }
673 }
674
    /// Re-runs all bottleneck analyses from scratch and returns the findings.
    ///
    /// Each analysis below appends into `self.bottlenecks`, which is cleared
    /// first so the result reflects only the current state.
    pub fn analyze_performance(&mut self) -> Vec<PerformanceBottleneck> {
        self.bottlenecks.clear();

        self.analyze_layer_bottlenecks();

        self.analyze_memory_bottlenecks();

        self.analyze_tensor_bottlenecks();

        self.bottlenecks.clone()
    }
690
691 pub fn get_statistics(&self) -> HashMap<String, ProfileStats> {
693 let mut stats = HashMap::new();
694
695 let mut grouped_events: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
697
698 for event in &self.events {
699 let event_type = match event {
700 ProfileEvent::FunctionCall { .. } => "FunctionCall",
701 ProfileEvent::LayerExecution { .. } => "LayerExecution",
702 ProfileEvent::TensorOperation { .. } => "TensorOperation",
703 ProfileEvent::ModelInference { .. } => "ModelInference",
704 ProfileEvent::GradientComputation { .. } => "GradientComputation",
705 };
706
707 grouped_events
708 .entry(event_type.to_string())
709 .or_insert_with(Vec::new)
710 .push(event);
711 }
712
713 for (event_type, events) in grouped_events {
715 let durations: Vec<Duration> = events
716 .iter()
717 .filter_map(|event| match event {
718 ProfileEvent::FunctionCall { duration, .. } => Some(*duration),
719 ProfileEvent::LayerExecution { forward_time, .. } => Some(*forward_time),
720 ProfileEvent::TensorOperation { duration, .. } => Some(*duration),
721 ProfileEvent::ModelInference { duration, .. } => Some(*duration),
722 ProfileEvent::GradientComputation { duration, .. } => Some(*duration),
723 })
724 .collect();
725
726 if !durations.is_empty() {
727 let total_duration: Duration = durations.iter().sum();
728 let avg_duration = total_duration / durations.len() as u32;
729 let min_duration = durations.iter().min().copied().unwrap_or_default();
730 let max_duration = durations.iter().max().copied().unwrap_or_default();
731
732 stats.insert(
733 event_type.clone(),
734 ProfileStats {
735 event_type,
736 count: durations.len(),
737 total_duration,
738 avg_duration,
739 min_duration,
740 max_duration,
741 total_memory: 0, avg_memory: 0.0,
743 },
744 );
745 }
746 }
747
748 stats
749 }
750
    /// Running per-layer aggregates, keyed by layer name.
    pub fn get_layer_profiles(&self) -> &HashMap<String, LayerProfile> {
        &self.layer_profiles
    }
755
    /// Chronological memory snapshots recorded so far (trimmed to the most
    /// recent entries once the cap in `take_memory_snapshot` is exceeded).
    pub fn get_memory_timeline(&self) -> &[MemorySnapshot] {
        &self.memory_snapshots
    }
760
761 pub async fn generate_report(&self) -> Result<ProfilerReport> {
763 let statistics = self.get_statistics();
764 let bottlenecks = self.bottlenecks.clone();
765 let total_events = self.events.len();
766
767 let total_runtime =
768 if let Some(start) = self.start_time { start.elapsed() } else { Duration::ZERO };
769
770 let slowest_layers = self.get_slowest_layers(5);
772
773 let memory_efficiency = self.analyze_memory_efficiency();
775
776 Ok(ProfilerReport {
777 total_events,
778 total_runtime,
779 statistics,
780 bottlenecks,
781 slowest_layers,
782 memory_efficiency,
783 recommendations: self.generate_performance_recommendations(),
784 })
785 }
786
787 pub fn clear(&mut self) {
789 self.events.clear();
790 self.active_timers.clear();
791 self.memory_snapshots.clear();
792 self.layer_profiles.clear();
793 self.bottlenecks.clear();
794 self.start_time = None;
795 self.gpu_kernel_profiles.clear();
797 self.memory_allocations.clear();
798 self.layer_latency_profiles.clear();
799 self.io_profiles.clear();
800 self.cpu_bottleneck_analysis.clear();
801 if let Ok(mut tracker) = self.memory_tracker.lock() {
802 *tracker = MemoryTracker::new();
803 }
804 self.io_monitor = IoMonitor::new();
805 }
806
807 pub fn profile_gpu_kernel(&mut self, kernel_profile: GpuKernelProfile) {
811 if let Some(ref mut gpu_profiler) = self.gpu_profiler {
812 gpu_profiler.profile_kernel(kernel_profile.clone());
813 }
814 self.gpu_kernel_profiles.push(kernel_profile);
815 }
816
817 pub fn track_memory_allocation(
819 &mut self,
820 size_bytes: usize,
821 allocation_type: MemoryAllocationType,
822 device_id: Option<i32>,
823 stack_trace: Vec<String>,
824 ) -> Uuid {
825 let allocation_id = Uuid::new_v4();
826 let allocation = MemoryAllocation {
827 allocation_id,
828 size_bytes,
829 allocation_type,
830 device_id,
831 timestamp: SystemTime::now(),
832 stack_trace,
833 freed: false,
834 free_timestamp: None,
835 };
836
837 if let Ok(mut tracker) = self.memory_tracker.lock() {
838 tracker.track_allocation(allocation.clone());
839 }
840
841 self.memory_allocations.insert(allocation_id, allocation);
842 allocation_id
843 }
844
845 pub fn track_memory_deallocation(&mut self, allocation_id: Uuid) {
847 if let Some(allocation) = self.memory_allocations.get_mut(&allocation_id) {
848 allocation.freed = true;
849 allocation.free_timestamp = Some(SystemTime::now());
850 }
851
852 if let Ok(mut tracker) = self.memory_tracker.lock() {
853 tracker.track_deallocation(allocation_id);
854 }
855 }
856
857 pub fn profile_layer_latency(&mut self, layer_latency: LayerLatencyProfile) {
859 self.layer_latency_profiles
860 .insert(layer_latency.layer_name.clone(), layer_latency);
861 }
862
    /// Begins tracking an I/O operation; returns the id to pass to
    /// `finish_io_profiling`.
    pub fn start_io_profiling(
        &mut self,
        operation_type: IoOperationType,
        bytes_expected: usize,
    ) -> Uuid {
        self.io_monitor.start_io_operation(operation_type, bytes_expected)
    }
871
872 pub fn finish_io_profiling(&mut self, operation_id: Uuid, bytes_transferred: usize) {
874 if let Some(profile) = self.io_monitor.finish_io_operation(operation_id, bytes_transferred)
875 {
876 self.io_profiles.push(profile);
877 }
878 }
879
880 pub fn analyze_cpu_bottlenecks(&mut self) -> Vec<CpuBottleneckAnalysis> {
882 let analysis = CpuBottleneckAnalysis {
885 thread_id: 0, cpu_usage: 0.75, context_switches: 1000,
888 cache_misses: 500,
889 instructions_per_cycle: 2.5,
890 branch_mispredictions: 100,
891 hot_functions: vec![
892 HotFunction {
893 function_name: "tensor_multiply".to_string(),
894 self_time_percentage: 25.0,
895 call_count: 1000,
896 avg_time_per_call: Duration::from_micros(250),
897 },
898 HotFunction {
899 function_name: "gradient_computation".to_string(),
900 self_time_percentage: 20.0,
901 call_count: 500,
902 avg_time_per_call: Duration::from_micros(400),
903 },
904 ],
905 bottleneck_score: 0.6,
906 };
907
908 self.cpu_bottleneck_analysis.push(analysis.clone());
909 vec![analysis]
910 }
911
912 pub fn get_memory_stats(&self) -> Option<MemoryStats> {
914 if let Ok(tracker) = self.memory_tracker.lock() {
915 Some(tracker.get_memory_stats())
916 } else {
917 None
918 }
919 }
920
    /// Average GPU compute utilization for `device_id`, or `None` when no
    /// GPU profiler is available.
    pub fn get_gpu_utilization(&self, device_id: i32) -> Option<f64> {
        self.gpu_profiler
            .as_ref()
            .map(|profiler| profiler.get_gpu_utilization(device_id))
    }
927
928 pub fn get_io_bandwidth_stats(&self) -> HashMap<IoDeviceType, f64> {
930 let mut stats = HashMap::new();
931
932 stats.insert(
933 IoDeviceType::SSD,
934 self.io_monitor.get_average_bandwidth(&IoDeviceType::SSD),
935 );
936 stats.insert(
937 IoDeviceType::HDD,
938 self.io_monitor.get_average_bandwidth(&IoDeviceType::HDD),
939 );
940 stats.insert(
941 IoDeviceType::Network,
942 self.io_monitor.get_average_bandwidth(&IoDeviceType::Network),
943 );
944 stats.insert(
945 IoDeviceType::Memory,
946 self.io_monitor.get_average_bandwidth(&IoDeviceType::Memory),
947 );
948 stats.insert(
949 IoDeviceType::Cache,
950 self.io_monitor.get_average_bandwidth(&IoDeviceType::Cache),
951 );
952
953 stats
954 }
955
956 pub fn get_layer_latency_analysis(&self) -> Vec<LayerLatencyAnalysis> {
958 self.layer_latency_profiles
959 .values()
960 .map(|profile| LayerLatencyAnalysis {
961 layer_name: profile.layer_name.clone(),
962 layer_type: profile.layer_type.clone(),
963 total_time: profile.cpu_time
964 + profile.gpu_time
965 + profile.memory_copy_time
966 + profile.sync_time,
967 cpu_percentage: profile.cpu_time.as_secs_f64()
968 / (profile.cpu_time
969 + profile.gpu_time
970 + profile.memory_copy_time
971 + profile.sync_time)
972 .as_secs_f64()
973 * 100.0,
974 gpu_percentage: profile.gpu_time.as_secs_f64()
975 / (profile.cpu_time
976 + profile.gpu_time
977 + profile.memory_copy_time
978 + profile.sync_time)
979 .as_secs_f64()
980 * 100.0,
981 memory_copy_percentage: profile.memory_copy_time.as_secs_f64()
982 / (profile.cpu_time
983 + profile.gpu_time
984 + profile.memory_copy_time
985 + profile.sync_time)
986 .as_secs_f64()
987 * 100.0,
988 flops_per_second: if profile.gpu_time.as_secs_f64() > 0.0 {
989 profile.flops as f64 / profile.gpu_time.as_secs_f64()
990 } else {
991 0.0
992 },
993 memory_bandwidth_utilization: profile.cache_hit_rate,
994 bottleneck_type: self.identify_layer_bottleneck(profile),
995 })
996 .collect()
997 }
998
999 pub fn get_performance_analysis(&self) -> PerformanceAnalysis {
1001 let memory_stats = self.get_memory_stats();
1002 let io_bandwidth_stats = self.get_io_bandwidth_stats();
1003 let layer_analysis = self.get_layer_latency_analysis();
1004
1005 let gpu_utilization = if let Some(profiler) = &self.gpu_profiler {
1006 Some(profiler.get_gpu_utilization(0))
1007 } else {
1008 None
1009 };
1010
1011 PerformanceAnalysis {
1012 memory_stats,
1013 io_bandwidth_stats,
1014 layer_analysis,
1015 gpu_utilization,
1016 cpu_bottlenecks: self.cpu_bottleneck_analysis.clone(),
1017 total_gpu_kernels: self.gpu_kernel_profiles.len(),
1018 total_io_operations: self.io_profiles.len(),
1019 performance_score: self.calculate_overall_performance_score(),
1020 recommendations: self.generate_enhanced_recommendations(),
1021 }
1022 }
1023
1024 fn identify_layer_bottleneck(&self, profile: &LayerLatencyProfile) -> String {
1025 let total_time =
1026 profile.cpu_time + profile.gpu_time + profile.memory_copy_time + profile.sync_time;
1027
1028 if profile.memory_copy_time > total_time / 2 {
1029 "Memory Bandwidth".to_string()
1030 } else if profile.sync_time > total_time / 3 {
1031 "GPU Synchronization".to_string()
1032 } else if profile.gpu_time > profile.cpu_time * 10 {
1033 "GPU Compute".to_string()
1034 } else {
1035 "CPU Compute".to_string()
1036 }
1037 }
1038
1039 fn calculate_overall_performance_score(&self) -> f64 {
1040 let mut score: f64 = 100.0;
1041
1042 for bottleneck in &self.bottlenecks {
1044 match bottleneck.severity {
1045 BottleneckSeverity::Critical => score -= 20.0,
1046 BottleneckSeverity::High => score -= 10.0,
1047 BottleneckSeverity::Medium => score -= 5.0,
1048 BottleneckSeverity::Low => score -= 2.0,
1049 }
1050 }
1051
1052 if let Some(gpu_util) = self.get_gpu_utilization(0) {
1054 if gpu_util < 0.5 {
1055 score -= 15.0;
1056 } else if gpu_util < 0.7 {
1057 score -= 8.0;
1058 }
1059 }
1060
1061 if let Some(memory_stats) = self.get_memory_stats() {
1063 if memory_stats.memory_efficiency < 0.8 {
1064 score -= 10.0;
1065 }
1066 }
1067
1068 score.max(0.0)
1069 }
1070
    /// Builds human-readable tuning suggestions from GPU, memory, I/O and
    /// per-layer statistics; falls back to a single "all good" message when
    /// nothing crosses a threshold.
    fn generate_enhanced_recommendations(&self) -> Vec<String> {
        let mut recommendations = Vec::new();

        // GPU under-utilization (device 0 only), threshold 50%.
        if let Some(gpu_util) = self.get_gpu_utilization(0) {
            if gpu_util < 0.5 {
                recommendations.push("Low GPU utilization detected. Consider increasing batch size or optimizing GPU kernels.".to_string());
            }
        }

        // Memory efficiency below 0.8 or more than 10k live allocations.
        if let Some(memory_stats) = self.get_memory_stats() {
            if memory_stats.memory_efficiency < 0.8 {
                recommendations.push("Memory allocation efficiency is low. Consider memory pooling or reducing allocations.".to_string());
            }

            if memory_stats.active_allocations > 10000 {
                recommendations.push("High number of active memory allocations. Consider batch allocation strategies.".to_string());
            }
        }

        // SSD throughput below 100 MB/s.
        let io_stats = self.get_io_bandwidth_stats();
        if let Some(&ssd_bandwidth) = io_stats.get(&IoDeviceType::SSD) {
            if ssd_bandwidth < 100.0 {
                recommendations.push(
                    "Low SSD bandwidth utilization. Consider optimizing file I/O patterns."
                        .to_string(),
                );
            }
        }

        // Per-layer: flag copy-dominated (>50%) and CPU-dominated (>80%) layers.
        let layer_analysis = self.get_layer_latency_analysis();
        for analysis in &layer_analysis {
            if analysis.memory_copy_percentage > 50.0 {
                recommendations.push(format!(
                    "Layer '{}' is memory bandwidth bound. Consider data layout optimization.",
                    analysis.layer_name
                ));
            }

            if analysis.cpu_percentage > 80.0 {
                recommendations.push(format!(
                    "Layer '{}' is CPU bound. Consider GPU acceleration.",
                    analysis.layer_name
                ));
            }
        }

        if recommendations.is_empty() {
            recommendations
                .push("Performance appears optimal based on current analysis.".to_string());
        }

        recommendations
    }
1129
1130 fn analyze_layer_bottlenecks(&mut self) {
1133 for (layer_name, profile) in &self.layer_profiles {
1134 if profile.forward_times.is_empty() {
1135 continue;
1136 }
1137
1138 let avg_forward_time =
1139 profile.forward_times.iter().sum::<Duration>() / profile.forward_times.len() as u32;
1140
1141 if avg_forward_time.as_millis() > 100 {
1143 let mut metrics = HashMap::new();
1144 metrics.insert(
1145 "avg_forward_time_ms".to_string(),
1146 avg_forward_time.as_millis() as f64,
1147 );
1148 metrics.insert("call_count".to_string(), profile.call_count as f64);
1149
1150 self.bottlenecks.push(PerformanceBottleneck {
1151 bottleneck_type: BottleneckType::ModelComputation,
1152 location: layer_name.clone(),
1153 severity: if avg_forward_time.as_millis() > 500 {
1154 BottleneckSeverity::High
1155 } else {
1156 BottleneckSeverity::Medium
1157 },
1158 description: format!(
1159 "Layer '{}' has slow forward pass: {:.1}ms average",
1160 layer_name,
1161 avg_forward_time.as_millis()
1162 ),
1163 suggestion: "Consider optimizing layer implementation or reducing layer size"
1164 .to_string(),
1165 metrics,
1166 });
1167 }
1168 }
1169 }
1170
1171 fn analyze_memory_bottlenecks(&mut self) {
1172 if self.memory_snapshots.len() < 2 {
1173 return;
1174 }
1175
1176 let recent_snapshots = if self.memory_snapshots.len() > 10 {
1178 &self.memory_snapshots[self.memory_snapshots.len() - 10..]
1179 } else {
1180 &self.memory_snapshots
1181 };
1182
1183 if recent_snapshots.len() >= 5 {
1184 let initial_memory = recent_snapshots[0].heap_allocated;
1185 let final_memory = recent_snapshots.last().unwrap().heap_allocated;
1186
1187 if final_memory > initial_memory * 2 {
1188 let mut metrics = HashMap::new();
1189 metrics.insert(
1190 "initial_memory_mb".to_string(),
1191 initial_memory as f64 / (1024.0 * 1024.0),
1192 );
1193 metrics.insert(
1194 "final_memory_mb".to_string(),
1195 final_memory as f64 / (1024.0 * 1024.0),
1196 );
1197 metrics.insert(
1198 "growth_ratio".to_string(),
1199 final_memory as f64 / initial_memory as f64,
1200 );
1201
1202 self.bottlenecks.push(PerformanceBottleneck {
1203 bottleneck_type: BottleneckType::MemoryBound,
1204 location: "Memory Usage".to_string(),
1205 severity: BottleneckSeverity::High,
1206 description: "Significant memory growth detected during profiling".to_string(),
1207 suggestion: "Check for memory leaks or inefficient memory usage patterns"
1208 .to_string(),
1209 metrics,
1210 });
1211 }
1212 }
1213 }
1214
1215 fn analyze_tensor_bottlenecks(&mut self) {
1216 let mut operation_groups: HashMap<String, Vec<Duration>> = HashMap::new();
1218
1219 for event in &self.events {
1220 if let ProfileEvent::TensorOperation {
1221 operation,
1222 duration,
1223 ..
1224 } = event
1225 {
1226 operation_groups
1227 .entry(operation.clone())
1228 .or_insert_with(Vec::new)
1229 .push(*duration);
1230 }
1231 }
1232
1233 for (operation, durations) in operation_groups {
1235 if durations.is_empty() {
1236 continue;
1237 }
1238
1239 let avg_duration = durations.iter().sum::<Duration>() / durations.len() as u32;
1240 let total_time = durations.iter().sum::<Duration>();
1241
1242 if avg_duration.as_millis() > 10 {
1244 let mut metrics = HashMap::new();
1245 metrics.insert(
1246 "avg_duration_ms".to_string(),
1247 avg_duration.as_millis() as f64,
1248 );
1249 metrics.insert("total_time_ms".to_string(), total_time.as_millis() as f64);
1250 metrics.insert("call_count".to_string(), durations.len() as f64);
1251
1252 self.bottlenecks.push(PerformanceBottleneck {
1253 bottleneck_type: BottleneckType::CpuBound,
1254 location: format!("Tensor Operation: {}", operation),
1255 severity: if avg_duration.as_millis() > 50 {
1256 BottleneckSeverity::High
1257 } else {
1258 BottleneckSeverity::Medium
1259 },
1260 description: format!(
1261 "Tensor operation '{}' is slow: {:.1}ms average",
1262 operation,
1263 avg_duration.as_millis()
1264 ),
1265 suggestion:
1266 "Consider optimizing tensor operation or using different data types"
1267 .to_string(),
1268 metrics,
1269 });
1270 }
1271 }
1272 }
1273
1274 fn get_slowest_layers(&self, limit: usize) -> Vec<(String, Duration)> {
1275 let mut layer_times: Vec<(String, Duration)> = self
1276 .layer_profiles
1277 .iter()
1278 .map(|(name, profile)| {
1279 let avg_time = if profile.forward_times.is_empty() {
1280 Duration::ZERO
1281 } else {
1282 profile.forward_times.iter().sum::<Duration>()
1283 / profile.forward_times.len() as u32
1284 };
1285 (name.clone(), avg_time)
1286 })
1287 .collect();
1288
1289 layer_times.sort_by(|a, b| b.1.cmp(&a.1));
1290 layer_times.truncate(limit);
1291 layer_times
1292 }
1293
1294 fn analyze_memory_efficiency(&self) -> MemoryEfficiencyAnalysis {
1295 if self.memory_snapshots.is_empty() {
1296 return MemoryEfficiencyAnalysis::default();
1297 }
1298
1299 let memory_values: Vec<usize> =
1300 self.memory_snapshots.iter().map(|snapshot| snapshot.heap_allocated).collect();
1301
1302 let max_memory = memory_values.iter().max().copied().unwrap_or(0);
1303 let min_memory = memory_values.iter().min().copied().unwrap_or(0);
1304 let avg_memory = memory_values.iter().sum::<usize>() / memory_values.len();
1305
1306 MemoryEfficiencyAnalysis {
1307 peak_memory_mb: max_memory as f64 / (1024.0 * 1024.0),
1308 min_memory_mb: min_memory as f64 / (1024.0 * 1024.0),
1309 avg_memory_mb: avg_memory as f64 / (1024.0 * 1024.0),
1310 memory_variance: self.calculate_memory_variance(&memory_values, avg_memory),
1311 efficiency_score: self.calculate_memory_efficiency_score(&memory_values),
1312 }
1313 }
1314
1315 fn calculate_memory_variance(&self, values: &[usize], mean: usize) -> f64 {
1316 if values.len() < 2 {
1317 return 0.0;
1318 }
1319
1320 let variance_sum: f64 = values
1321 .iter()
1322 .map(|&x| {
1323 let diff = x as f64 - mean as f64;
1324 diff * diff
1325 })
1326 .sum();
1327
1328 variance_sum / (values.len() - 1) as f64
1329 }
1330
1331 fn calculate_memory_efficiency_score(&self, values: &[usize]) -> f64 {
1332 if values.is_empty() {
1333 return 0.0;
1334 }
1335
1336 let max_memory = values.iter().max().copied().unwrap_or(0);
1337 let min_memory = values.iter().min().copied().unwrap_or(0);
1338
1339 if max_memory == 0 {
1340 return 100.0;
1341 }
1342
1343 100.0 * (1.0 - (max_memory - min_memory) as f64 / max_memory as f64)
1345 }
1346
1347 fn generate_performance_recommendations(&self) -> Vec<String> {
1348 let mut recommendations = Vec::new();
1349
1350 for bottleneck in &self.bottlenecks {
1352 match bottleneck.bottleneck_type {
1353 BottleneckType::ModelComputation => {
1354 recommendations.push(
1355 "Consider model architecture optimizations or layer fusion".to_string(),
1356 );
1357 },
1358 BottleneckType::MemoryBound => {
1359 recommendations.push(
1360 "Optimize memory usage with gradient checkpointing or model parallelism"
1361 .to_string(),
1362 );
1363 },
1364 BottleneckType::CpuBound => {
1365 recommendations.push(
1366 "Consider GPU acceleration or optimized CPU implementations".to_string(),
1367 );
1368 },
1369 _ => {},
1370 }
1371 }
1372
1373 if self.events.len() > 10000 {
1375 recommendations.push(
1376 "High number of profiling events - consider reducing profiling overhead"
1377 .to_string(),
1378 );
1379 }
1380
1381 let stats = self.get_statistics();
1382 if let Some(layer_stats) = stats.get("LayerExecution") {
1383 if layer_stats.avg_duration.as_millis() > 50 {
1384 recommendations.push(
1385 "Average layer execution time is high - consider layer optimization"
1386 .to_string(),
1387 );
1388 }
1389 }
1390
1391 if recommendations.is_empty() {
1392 recommendations
1393 .push("Performance appears optimal based on current profiling data".to_string());
1394 }
1395
1396 recommendations
1397 }
1398}
1399
/// Aggregate view of heap usage over a profiling session, derived from the
/// recorded `MemorySnapshot`s (see `analyze_memory_efficiency`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryEfficiencyAnalysis {
    /// Highest `heap_allocated` observed across snapshots, in MB.
    pub peak_memory_mb: f64,
    /// Lowest `heap_allocated` observed across snapshots, in MB.
    pub min_memory_mb: f64,
    /// Mean `heap_allocated` across snapshots, in MB.
    pub avg_memory_mb: f64,
    /// Sample variance of heap allocation, in bytes squared.
    pub memory_variance: f64,
    /// Stability score in [0, 100]; 100 means allocation stayed flat.
    pub efficiency_score: f64,
}
1409
1410impl Default for MemoryEfficiencyAnalysis {
1411 fn default() -> Self {
1412 Self {
1413 peak_memory_mb: 0.0,
1414 min_memory_mb: 0.0,
1415 avg_memory_mb: 0.0,
1416 memory_variance: 0.0,
1417 efficiency_score: 100.0,
1418 }
1419 }
1420}
1421
/// Snapshot of profiling results assembled into a serializable report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilerReport {
    /// Number of `ProfileEvent`s recorded during the session.
    pub total_events: usize,
    /// Overall runtime covered by the report.
    /// NOTE(review): populated outside this chunk — confirm whether this is
    /// wall-clock session time or summed event time.
    pub total_runtime: Duration,
    /// Per-event-type aggregates, keyed by event type name
    /// (e.g. "LayerExecution").
    pub statistics: HashMap<String, ProfileStats>,
    /// Bottlenecks found by the analyze_* passes.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Layer names with their average forward times, slowest first
    /// (see `get_slowest_layers`).
    pub slowest_layers: Vec<(String, Duration)>,
    /// Heap-usage aggregates derived from memory snapshots.
    pub memory_efficiency: MemoryEfficiencyAnalysis,
    /// Human-readable tuning suggestions
    /// (see `generate_performance_recommendations`).
    pub recommendations: Vec<String>,
}
1433
/// RAII guard that ends a named profiler timer when it goes out of scope.
/// Construct via [`ScopedTimer::new`] or the `profile_scope!` macro.
pub struct ScopedTimer<'a> {
    // Exclusive borrow of the profiler for the guard's lifetime.
    profiler: &'a mut Profiler,
    // Timer name passed to `start_timer` / `end_timer`.
    name: String,
}
1439
1440impl<'a> ScopedTimer<'a> {
1441 pub fn new(profiler: &'a mut Profiler, name: String) -> Self {
1442 profiler.start_timer(&name);
1443 Self { profiler, name }
1444 }
1445}
1446
impl<'a> Drop for ScopedTimer<'a> {
    /// Stops the timer started in [`ScopedTimer::new`] by forwarding the
    /// stored name to the profiler's `end_timer`.
    fn drop(&mut self) {
        self.profiler.end_timer(&self.name);
    }
}
1452
/// Per-layer latency breakdown included in `PerformanceAnalysis`.
/// NOTE(review): populated outside this chunk — the percentage fields
/// presumably partition `total_time`; confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerLatencyAnalysis {
    pub layer_name: String,
    pub layer_type: String,
    pub total_time: Duration,
    // Share of time attributed to CPU work — assumed 0-100 scale; verify.
    pub cpu_percentage: f64,
    // Share of time attributed to GPU work — assumed 0-100 scale; verify.
    pub gpu_percentage: f64,
    // Share of time attributed to host/device copies — assumed 0-100; verify.
    pub memory_copy_percentage: f64,
    pub flops_per_second: f64,
    pub memory_bandwidth_utilization: f64,
    /// Free-form label describing the dominant bottleneck for this layer.
    pub bottleneck_type: String,
}
1466
/// System-wide performance findings combining memory, I/O, per-layer, GPU,
/// and CPU results. Produced by `Profiler::get_performance_analysis`
/// (defined outside this chunk).
#[derive(Debug, Serialize, Deserialize)]
pub struct PerformanceAnalysis {
    /// Memory statistics, when available (see `get_memory_stats`).
    pub memory_stats: Option<MemoryStats>,
    /// Average bandwidth per I/O device type (see `get_io_bandwidth_stats`).
    pub io_bandwidth_stats: HashMap<IoDeviceType, f64>,
    /// Per-layer latency breakdowns.
    pub layer_analysis: Vec<LayerLatencyAnalysis>,
    /// Overall GPU utilization, when a GPU was profiled.
    pub gpu_utilization: Option<f64>,
    /// CPU bottleneck findings.
    pub cpu_bottlenecks: Vec<CpuBottleneckAnalysis>,
    /// Number of GPU kernels profiled.
    pub total_gpu_kernels: usize,
    /// Number of I/O operations profiled.
    pub total_io_operations: usize,
    // Composite score — scale and formula defined by the producer; verify.
    pub performance_score: f64,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
1480
/// Extended report produced by `Profiler::generate_enhanced_report`: the
/// basic report plus GPU, memory-allocation, and I/O summaries and the
/// overall performance analysis.
#[derive(Debug, Serialize, Deserialize)]
pub struct EnhancedProfilerReport {
    /// Output of `Profiler::generate_report`.
    pub basic_report: ProfilerReport,
    /// Output of `Profiler::get_performance_analysis`.
    pub performance_analysis: PerformanceAnalysis,
    /// Aggregated GPU kernel metrics.
    pub gpu_kernel_summary: GpuKernelSummary,
    /// Aggregated allocation metrics.
    pub memory_allocation_summary: MemoryAllocationSummary,
    /// Aggregated I/O metrics.
    pub io_performance_summary: IoPerformanceSummary,
}
1490
/// Aggregates over all recorded GPU kernel profiles
/// (built by `generate_gpu_kernel_summary`).
#[derive(Debug, Serialize, Deserialize)]
pub struct GpuKernelSummary {
    /// Number of kernel profiles recorded.
    pub total_kernels: usize,
    /// Sum of all kernel execution times.
    pub total_execution_time: Duration,
    /// Mean occupancy across kernels; 0.0 when none were recorded.
    pub avg_occupancy: f64,
    /// Mean compute utilization across kernels; 0.0 when none were recorded.
    pub avg_compute_utilization: f64,
    /// Names of the five longest-running kernels, longest first.
    pub slowest_kernels: Vec<String>,
}
1499
/// Aggregates over the allocation table
/// (built by `generate_memory_allocation_summary`).
#[derive(Debug, Serialize, Deserialize)]
pub struct MemoryAllocationSummary {
    /// Number of allocation records.
    pub total_allocations: usize,
    /// Size in bytes of the single largest allocation.
    /// NOTE(review): despite the name, this is NOT peak concurrent usage —
    /// it is `max(size_bytes)` over individual allocations.
    pub peak_memory_usage: usize,
    /// Efficiency from `get_memory_stats`; defaults to 1.0 when unavailable.
    pub memory_efficiency: f64,
    /// "<n> bytes" descriptions of the five largest allocations, largest first.
    pub largest_allocations: Vec<String>,
    /// Count of allocations never marked freed.
    pub memory_leaks: usize,
}
1508
/// Aggregates over all I/O operation profiles
/// (built by `generate_io_performance_summary`).
#[derive(Debug, Serialize, Deserialize)]
pub struct IoPerformanceSummary {
    /// Number of I/O operations recorded.
    pub total_operations: usize,
    /// Sum of bytes transferred across all operations.
    pub total_bytes_transferred: usize,
    /// Average bandwidth per device type (see `get_io_bandwidth_stats`).
    pub avg_bandwidth_by_device: HashMap<IoDeviceType, f64>,
    /// Descriptions of the five slowest operations, slowest first.
    pub slowest_operations: Vec<String>,
}
1516
1517impl Profiler {
1518 pub async fn generate_enhanced_report(&self) -> Result<EnhancedProfilerReport> {
1520 let basic_report = self.generate_report().await?;
1521 let performance_analysis = self.get_performance_analysis();
1522
1523 let gpu_kernel_summary = self.generate_gpu_kernel_summary();
1524 let memory_allocation_summary = self.generate_memory_allocation_summary();
1525 let io_performance_summary = self.generate_io_performance_summary();
1526
1527 Ok(EnhancedProfilerReport {
1528 basic_report,
1529 performance_analysis,
1530 gpu_kernel_summary,
1531 memory_allocation_summary,
1532 io_performance_summary,
1533 })
1534 }
1535
1536 fn generate_gpu_kernel_summary(&self) -> GpuKernelSummary {
1537 let total_kernels = self.gpu_kernel_profiles.len();
1538 let total_execution_time = self.gpu_kernel_profiles.iter().map(|k| k.execution_time).sum();
1539
1540 let avg_occupancy = if total_kernels > 0 {
1541 self.gpu_kernel_profiles.iter().map(|k| k.occupancy).sum::<f64>() / total_kernels as f64
1542 } else {
1543 0.0
1544 };
1545
1546 let avg_compute_utilization = if total_kernels > 0 {
1547 self.gpu_kernel_profiles.iter().map(|k| k.compute_utilization).sum::<f64>()
1548 / total_kernels as f64
1549 } else {
1550 0.0
1551 };
1552
1553 let mut kernels_by_time: Vec<_> = self
1554 .gpu_kernel_profiles
1555 .iter()
1556 .map(|k| (k.kernel_name.clone(), k.execution_time))
1557 .collect();
1558 kernels_by_time.sort_by(|a, b| b.1.cmp(&a.1));
1559
1560 let slowest_kernels = kernels_by_time.into_iter().take(5).map(|(name, _)| name).collect();
1561
1562 GpuKernelSummary {
1563 total_kernels,
1564 total_execution_time,
1565 avg_occupancy,
1566 avg_compute_utilization,
1567 slowest_kernels,
1568 }
1569 }
1570
1571 fn generate_memory_allocation_summary(&self) -> MemoryAllocationSummary {
1572 let total_allocations = self.memory_allocations.len();
1573 let peak_memory_usage =
1574 self.memory_allocations.values().map(|a| a.size_bytes).max().unwrap_or(0);
1575
1576 let memory_efficiency = if let Some(stats) = self.get_memory_stats() {
1577 stats.memory_efficiency
1578 } else {
1579 1.0
1580 };
1581
1582 let mut allocations_by_size: Vec<_> = self
1583 .memory_allocations
1584 .values()
1585 .map(|a| (format!("{} bytes", a.size_bytes), a.size_bytes))
1586 .collect();
1587 allocations_by_size.sort_by(|a, b| b.1.cmp(&a.1));
1588
1589 let largest_allocations =
1590 allocations_by_size.into_iter().take(5).map(|(desc, _)| desc).collect();
1591
1592 let memory_leaks = self.memory_allocations.values().filter(|a| !a.freed).count();
1593
1594 MemoryAllocationSummary {
1595 total_allocations,
1596 peak_memory_usage,
1597 memory_efficiency,
1598 largest_allocations,
1599 memory_leaks,
1600 }
1601 }
1602
1603 fn generate_io_performance_summary(&self) -> IoPerformanceSummary {
1604 let total_operations = self.io_profiles.len();
1605 let total_bytes_transferred = self.io_profiles.iter().map(|io| io.bytes_transferred).sum();
1606
1607 let avg_bandwidth_by_device = self.get_io_bandwidth_stats();
1608
1609 let mut operations_by_duration: Vec<_> = self
1610 .io_profiles
1611 .iter()
1612 .map(|io| {
1613 (
1614 format!("{:?}: {} bytes", io.operation_type, io.bytes_transferred),
1615 io.duration,
1616 )
1617 })
1618 .collect();
1619 operations_by_duration.sort_by(|a, b| b.1.cmp(&a.1));
1620
1621 let slowest_operations =
1622 operations_by_duration.into_iter().take(5).map(|(desc, _)| desc).collect();
1623
1624 IoPerformanceSummary {
1625 total_operations,
1626 total_bytes_transferred,
1627 avg_bandwidth_by_device,
1628 slowest_operations,
1629 }
1630 }
1631}
1632
/// Profiles the enclosing scope: binds a `ScopedTimer` to a hidden local so
/// the named timer starts at the macro call and stops when the scope ends.
///
/// NOTE(review): the expansion references `ScopedTimer` unqualified, so the
/// caller must have it in scope; a `$crate::...` path would be more hygienic
/// for an exported macro — TODO confirm the crate-relative module path.
#[macro_export]
macro_rules! profile_scope {
    ($profiler:expr, $name:expr) => {
        let _timer = ScopedTimer::new($profiler, $name.to_string());
    };
}