// trustformers_debug/memory_profiler.rs
1//! Advanced memory profiling for TrustformeRS models.
2//!
3//! This module provides comprehensive memory profiling capabilities including:
4//! - Heap allocation tracking
5//! - Memory leak detection
6//! - Peak memory analysis
7//! - Allocation patterns
8//! - GC pressure analysis
9//! - Memory fragmentation monitoring
10//!
11//! # Example
12//!
//! ```no_run
//! # async fn example() -> anyhow::Result<()> {
//! use trustformers_debug::{MemoryProfiler, MemoryProfilingConfig};
//!
//! let config = MemoryProfilingConfig::default();
//! let mut profiler = MemoryProfiler::new(config);
//!
//! profiler.start().await?;
//! // ... run model training/inference ...
//! let report = profiler.stop().await?;
//!
//! println!("Peak memory usage: {} MB", report.peak_memory_mb);
//! println!("Memory leaks detected: {}", report.potential_leaks.len());
//! # Ok(())
//! # }
//! ```
26
27use anyhow::Result;
28use serde::{Deserialize, Serialize};
29use std::collections::{HashMap, VecDeque};
30use std::sync::{Arc, Mutex};
31use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
32use tokio::time::interval;
33use uuid::Uuid;
34
/// Configuration for memory profiling.
///
/// All feature toggles default to enabled; see the `Default` impl for the
/// concrete default thresholds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryProfilingConfig {
    /// Enable heap allocation tracking (periodic background sampling)
    pub enable_heap_tracking: bool,
    /// Enable leak detection
    pub enable_leak_detection: bool,
    /// Enable allocation pattern analysis
    pub enable_pattern_analysis: bool,
    /// Enable memory fragmentation monitoring
    pub enable_fragmentation_monitoring: bool,
    /// Enable GC pressure analysis
    pub enable_gc_pressure_analysis: bool,
    /// Sampling interval for memory measurements (milliseconds)
    pub sampling_interval_ms: u64,
    /// Maximum number of allocation records to keep
    // NOTE(review): not consulted anywhere in the visible profiler code —
    // the allocation map can grow past this limit; confirm intended.
    pub max_allocation_records: usize,
    /// Threshold for considering an allocation "large" (bytes)
    pub large_allocation_threshold: usize,
    /// Window size for detecting allocation patterns (seconds)
    pub pattern_analysis_window_secs: u64,
    /// Threshold for leak detection (allocations alive for this duration)
    pub leak_detection_threshold_secs: u64,
}
59
60impl Default for MemoryProfilingConfig {
61    fn default() -> Self {
62        Self {
63            enable_heap_tracking: true,
64            enable_leak_detection: true,
65            enable_pattern_analysis: true,
66            enable_fragmentation_monitoring: true,
67            enable_gc_pressure_analysis: true,
68            sampling_interval_ms: 100, // 100ms sampling
69            max_allocation_records: 100000,
70            large_allocation_threshold: 1024 * 1024, // 1MB
71            pattern_analysis_window_secs: 60,        // 1 minute window
72            leak_detection_threshold_secs: 300,      // 5 minutes
73        }
74    }
75}
76
/// Allocation record for tracking individual allocations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationRecord {
    /// Unique id returned by `record_allocation`; used to match the
    /// corresponding deallocation later.
    pub id: Uuid,
    /// Allocation size in bytes.
    pub size: usize,
    /// Wall-clock time at which the allocation was recorded.
    pub timestamp: SystemTime,
    /// Captured call stack (currently a placeholder in this module).
    pub stack_trace: Vec<String>,
    /// Category of the allocation (tensor, buffer, weights, ...).
    pub allocation_type: AllocationType,
    /// True once a matching deallocation has been recorded.
    pub freed: bool,
    /// When the deallocation was recorded, if any.
    pub freed_at: Option<SystemTime>,
    pub tags: Vec<String>, // For categorizing allocations
}
89
/// Type of allocation, used to categorize memory usage by purpose.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AllocationType {
    /// Tensor data.
    Tensor,
    /// Generic buffers.
    Buffer,
    /// Model weights.
    Weights,
    /// Gradient storage.
    Gradients,
    /// Forward-pass activations.
    Activations,
    /// Cached values.
    Cache,
    /// Short-lived scratch memory.
    Temporary,
    /// Any other category, identified by a free-form label.
    Other(String),
}
102
/// Memory usage snapshot at a point in time.
///
/// All byte counts are derived from the profiler's own allocation records,
/// not from the system allocator.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySnapshot {
    /// When the snapshot was taken.
    pub timestamp: SystemTime,
    /// Total bytes ever tracked (live + freed records).
    pub total_heap_bytes: usize,
    /// Bytes in live (not yet freed) allocations.
    pub used_heap_bytes: usize,
    /// Bytes in records that have already been freed.
    pub free_heap_bytes: usize,
    /// High-water mark of used bytes.
    // NOTE(review): `get_memory_snapshot` currently sets this equal to
    // `used_heap_bytes` ("simplified") — confirm before relying on it.
    pub peak_heap_bytes: usize,
    /// Number of live allocations.
    pub allocation_count: usize,
    /// Number of freed allocation records.
    pub free_count: usize,
    /// free_heap_bytes / total_heap_bytes (0.0 when nothing is tracked).
    pub fragmentation_ratio: f64,
    /// GC pressure score in 0.0..=1.0.
    pub gc_pressure_score: f64,
    /// Live bytes per allocation category.
    pub allocations_by_type: HashMap<AllocationType, usize>,
    pub allocations_by_size: HashMap<String, usize>, // Size buckets
}
118
/// Memory leak information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryLeak {
    /// Id of the suspect allocation record.
    pub allocation_id: Uuid,
    /// Size of the allocation in bytes.
    pub size: usize,
    /// How long the allocation has been alive, in seconds.
    pub age_seconds: f64,
    /// Category of the leaked allocation.
    pub allocation_type: AllocationType,
    /// Stack trace captured when the allocation was recorded.
    pub stack_trace: Vec<String>,
    /// User-supplied tags on the allocation.
    pub tags: Vec<String>,
    /// Severity classified from size and age.
    pub severity: LeakSeverity,
}
130
131/// Severity of memory leak
132#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
133pub enum LeakSeverity {
134    Low,      // Small allocations, short-lived
135    Medium,   // Moderate size or moderately old
136    High,     // Large allocations or very old
137    Critical, // Very large or extremely old
138}
139
/// Allocation pattern detected by analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationPattern {
    /// Which kind of pattern was detected.
    pub pattern_type: PatternType,
    /// Human-readable summary of the finding.
    pub description: String,
    pub confidence: f64,   // 0.0 to 1.0
    pub impact_score: f64, // 0.0 to 1.0 (higher = more concerning)
    /// Suggested remediation steps.
    pub recommendations: Vec<String>,
    /// A few representative allocation records illustrating the pattern.
    pub examples: Vec<AllocationRecord>,
}
150
/// Type of allocation pattern recognized by the pattern detectors.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum PatternType {
    MemoryLeak,           // Consistent growth without deallocation
    ChurningAllocations,  // Rapid alloc/free cycles
    FragmentationCausing, // Allocations that cause fragmentation
    LargeAllocations,     // Unexpectedly large allocations
    UnbalancedTypes,      // Disproportionate allocation types
    PeakUsageSpikes,      // Sudden memory usage spikes
}
161
/// Memory fragmentation analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FragmentationAnalysis {
    /// Fraction of tracked memory that is free (0.0..=1.0).
    pub fragmentation_ratio: f64,
    /// Size of the largest contiguous free block, in bytes.
    // NOTE(review): `analyze_fragmentation` currently sets this to the total
    // free bytes ("simplified") — confirm before relying on it.
    pub largest_free_block: usize,
    /// Total free memory in bytes.
    pub total_free_memory: usize,
    /// Number of free blocks (here: freed allocation records).
    pub free_block_count: usize,
    /// Mean free-block size in bytes (0.0 when there are no free blocks).
    pub average_free_block_size: f64,
    /// Bucketed severity derived from `fragmentation_ratio`.
    pub fragmentation_severity: FragmentationSeverity,
    /// Suggested remediation steps for the observed severity.
    pub recommendations: Vec<String>,
}
173
/// Fragmentation severity levels, bucketed from the fragmentation ratio
/// (see `analyze_fragmentation` for the exact cut-offs).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FragmentationSeverity {
    Low,    // < 10% fragmentation
    Medium, // 10-30% fragmentation
    High,   // 30-60% fragmentation
    Severe, // > 60% fragmentation
}
182
/// Garbage collection pressure analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GCPressureAnalysis {
    pub pressure_score: f64,    // 0.0 to 1.0
    pub allocation_rate: f64,   // allocations per second
    pub deallocation_rate: f64, // deallocations per second
    pub churn_rate: f64,        // alloc/dealloc cycles per second
    /// Bucketed level derived from `pressure_score`.
    pub pressure_level: GCPressureLevel,
    /// Observed causes of the pressure (e.g. high allocation rate).
    pub contributing_factors: Vec<String>,
    /// Suggested remediation steps.
    pub recommendations: Vec<String>,
}
194
/// GC pressure levels, from least to most severe.
///
/// Buckets of the pressure score as assigned by `analyze_gc_pressure`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GCPressureLevel {
    Low,      // score < 0.25
    Medium,   // 0.25..0.5
    High,     // 0.5..0.75
    Critical, // >= 0.75
}
203
/// Comprehensive memory profiling report.
///
/// Produced by `MemoryProfiler::stop` / `generate_report`; aggregates the
/// session's timeline, leak scan, pattern analysis and per-type statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryProfilingReport {
    /// Id of the profiling session that produced this report.
    pub session_id: Uuid,
    /// Approximate session start (reconstructed as end minus duration).
    pub start_time: SystemTime,
    pub end_time: SystemTime,
    pub duration_secs: f64,
    /// Configuration the session ran with.
    pub config: MemoryProfilingConfig,

    // Summary statistics
    /// Highest peak_heap_bytes seen in the timeline, in MB.
    pub peak_memory_mb: f64,
    /// Mean used_heap_bytes over the timeline, in MB.
    pub average_memory_mb: f64,
    pub total_allocations: usize,
    pub total_deallocations: usize,
    /// total_allocations - total_deallocations (may be negative in theory).
    pub net_allocations: i64,

    // Memory timeline
    pub memory_timeline: Vec<MemorySnapshot>,

    // Leak detection
    pub potential_leaks: Vec<MemoryLeak>,
    /// Number of detected leaks per allocation category.
    pub leak_summary: HashMap<AllocationType, usize>,

    // Pattern analysis
    pub detected_patterns: Vec<AllocationPattern>,

    // Fragmentation analysis
    pub fragmentation_analysis: FragmentationAnalysis,

    // GC pressure analysis
    pub gc_pressure_analysis: GCPressureAnalysis,

    // Allocation statistics
    pub allocations_by_type: HashMap<AllocationType, AllocationTypeStats>,
    /// Count of ALL records (live and freed) per size bucket.
    pub allocations_by_size_bucket: HashMap<String, usize>,

    // Performance metrics
    /// Estimated time spent profiling, in milliseconds.
    pub profiling_overhead_ms: f64,
    /// Estimated sampling accuracy (currently a fixed placeholder).
    pub sampling_accuracy: f64,
}
244
/// Statistics for each allocation type.
///
/// Maintained incrementally by `MemoryProfiler::update_type_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationTypeStats {
    /// Lifetime number of allocations of this type.
    pub total_allocations: usize,
    /// Lifetime number of deallocations of this type.
    pub total_deallocations: usize,
    /// Currently live allocation count (saturates at zero).
    pub current_count: usize,
    /// Lifetime bytes allocated.
    pub total_bytes_allocated: usize,
    /// Lifetime bytes deallocated.
    pub total_bytes_deallocated: usize,
    /// Currently live bytes (saturates at zero).
    pub current_bytes: usize,
    /// High-water mark of `current_count`.
    pub peak_count: usize,
    /// High-water mark of `current_bytes`.
    pub peak_bytes: usize,
    /// total_bytes_allocated / total_allocations (0.0 before first alloc).
    pub average_allocation_size: f64,
    /// Largest single allocation seen, in bytes.
    pub largest_allocation: usize,
}
259
/// Memory profiler implementation.
///
/// All mutable state lives behind `Arc<Mutex<_>>` so it can be shared with
/// the background sampling task spawned by `start_sampling`.
#[derive(Debug)]
pub struct MemoryProfiler {
    /// Profiling configuration.
    config: MemoryProfilingConfig,
    /// Unique id for this profiling session.
    session_id: Uuid,
    /// When `start()` was called (`None` until started).
    start_time: Option<Instant>,
    /// All allocation records, keyed by allocation id.
    allocations: Arc<Mutex<HashMap<Uuid, AllocationRecord>>>,
    /// Periodic memory snapshots collected while running.
    memory_timeline: Arc<Mutex<VecDeque<MemorySnapshot>>>,
    /// Per-allocation-type statistics.
    type_stats: Arc<Mutex<HashMap<AllocationType, AllocationTypeStats>>>,
    /// Set by `start()`, cleared by `stop()`; polled by the sampling task.
    running: Arc<Mutex<bool>>,
    /// Used to estimate profiling overhead in `stop()`.
    profiling_start_time: Option<Instant>,
}
272
273impl MemoryProfiler {
274    /// Create a new memory profiler
275    pub fn new(config: MemoryProfilingConfig) -> Self {
276        Self {
277            config,
278            session_id: Uuid::new_v4(),
279            start_time: None,
280            allocations: Arc::new(Mutex::new(HashMap::new())),
281            memory_timeline: Arc::new(Mutex::new(VecDeque::new())),
282            type_stats: Arc::new(Mutex::new(HashMap::new())),
283            running: Arc::new(Mutex::new(false)),
284            profiling_start_time: None,
285        }
286    }
287
288    /// Start memory profiling
289    pub async fn start(&mut self) -> Result<()> {
290        let mut running = self.running.lock().expect("lock should not be poisoned");
291        if *running {
292            return Err(anyhow::anyhow!("Memory profiler is already running"));
293        }
294
295        *running = true;
296        self.start_time = Some(Instant::now());
297        self.profiling_start_time = Some(Instant::now());
298
299        // Start periodic sampling
300        if self.config.enable_heap_tracking {
301            self.start_sampling().await?;
302        }
303
304        tracing::info!("Memory profiler started for session {}", self.session_id);
305        Ok(())
306    }
307
308    /// Stop memory profiling and generate report
309    pub async fn stop(&mut self) -> Result<MemoryProfilingReport> {
310        {
311            let mut running = self.running.lock().expect("lock should not be poisoned");
312            if !*running {
313                return Err(anyhow::anyhow!("Memory profiler is not running"));
314            }
315            *running = false;
316        }
317        // Guard is dropped here so background sampling task can check the flag and exit
318
319        let end_time = SystemTime::now();
320        let start_time = self
321            .start_time
322            .ok_or_else(|| anyhow::anyhow!("start_time should be set when profiler is running"))?;
323        let duration =
324            end_time.duration_since(UNIX_EPOCH)?.as_secs_f64() - start_time.elapsed().as_secs_f64();
325
326        // Calculate profiling overhead
327        let profiling_overhead = if let Some(prof_start) = self.profiling_start_time {
328            prof_start.elapsed().as_millis() as f64 * 0.01 // Estimated 1% overhead
329        } else {
330            0.0
331        };
332
333        let report = self.generate_report(end_time, duration, profiling_overhead).await?;
334
335        tracing::info!("Memory profiler stopped for session {}", self.session_id);
336        Ok(report)
337    }
338
339    /// Record an allocation
340    pub fn record_allocation(
341        &self,
342        size: usize,
343        allocation_type: AllocationType,
344        tags: Vec<String>,
345    ) -> Result<Uuid> {
346        let running = self.running.lock().expect("lock should not be poisoned");
347        if !*running {
348            return Err(anyhow::anyhow!("Memory profiler is not running"));
349        }
350
351        let allocation_id = Uuid::new_v4();
352        let record = AllocationRecord {
353            id: allocation_id,
354            size,
355            timestamp: SystemTime::now(),
356            stack_trace: self.capture_stack_trace(),
357            allocation_type: allocation_type.clone(),
358            freed: false,
359            freed_at: None,
360            tags,
361        };
362
363        // Store allocation record
364        let mut allocations = self.allocations.lock().expect("lock should not be poisoned");
365        allocations.insert(allocation_id, record);
366
367        // Update type statistics
368        self.update_type_stats(&allocation_type, size, true);
369
370        Ok(allocation_id)
371    }
372
373    /// Record a deallocation
374    pub fn record_deallocation(&self, allocation_id: Uuid) -> Result<()> {
375        let running = self.running.lock().expect("lock should not be poisoned");
376        if !*running {
377            return Ok(()); // Silently ignore if not running
378        }
379
380        let mut allocations = self.allocations.lock().expect("lock should not be poisoned");
381        if let Some(record) = allocations.get_mut(&allocation_id) {
382            record.freed = true;
383            record.freed_at = Some(SystemTime::now());
384
385            // Update type statistics
386            self.update_type_stats(&record.allocation_type, record.size, false);
387        }
388
389        Ok(())
390    }
391
392    /// Tag an existing allocation
393    pub fn tag_allocation(&self, allocation_id: Uuid, tag: String) -> Result<()> {
394        let mut allocations = self.allocations.lock().expect("lock should not be poisoned");
395        if let Some(record) = allocations.get_mut(&allocation_id) {
396            record.tags.push(tag);
397        }
398        Ok(())
399    }
400
401    /// Get current memory usage snapshot
402    pub fn get_memory_snapshot(&self) -> Result<MemorySnapshot> {
403        let allocations = self.allocations.lock().expect("lock should not be poisoned");
404        let _type_stats = self.type_stats.lock().expect("lock should not be poisoned");
405
406        let mut total_heap = 0;
407        let mut used_heap = 0;
408        let mut allocation_count = 0;
409        let mut free_count = 0;
410        let mut allocations_by_type = HashMap::new();
411        let mut allocations_by_size = HashMap::new();
412
413        for record in allocations.values() {
414            total_heap += record.size;
415
416            if !record.freed {
417                used_heap += record.size;
418                allocation_count += 1;
419
420                *allocations_by_type.entry(record.allocation_type.clone()).or_insert(0) +=
421                    record.size;
422
423                let size_bucket = self.get_size_bucket(record.size);
424                *allocations_by_size.entry(size_bucket).or_insert(0) += 1;
425            } else {
426                free_count += 1;
427            }
428        }
429
430        let free_heap = total_heap - used_heap;
431        let fragmentation_ratio =
432            if total_heap > 0 { free_heap as f64 / total_heap as f64 } else { 0.0 };
433
434        let gc_pressure_score = self.calculate_gc_pressure_score();
435
436        Ok(MemorySnapshot {
437            timestamp: SystemTime::now(),
438            total_heap_bytes: total_heap,
439            used_heap_bytes: used_heap,
440            free_heap_bytes: free_heap,
441            peak_heap_bytes: used_heap, // Simplified for now
442            allocation_count,
443            free_count,
444            fragmentation_ratio,
445            gc_pressure_score,
446            allocations_by_type,
447            allocations_by_size,
448        })
449    }
450
451    /// Detect memory leaks
452    pub fn detect_leaks(&self) -> Result<Vec<MemoryLeak>> {
453        let allocations = self.allocations.lock().expect("lock should not be poisoned");
454        let now = SystemTime::now();
455        let threshold = Duration::from_secs(self.config.leak_detection_threshold_secs);
456        let mut leaks = Vec::new();
457
458        for record in allocations.values() {
459            if !record.freed {
460                let age = now.duration_since(record.timestamp)?;
461                if age > threshold {
462                    let age_seconds = age.as_secs_f64();
463                    let severity = self.classify_leak_severity(record.size, age_seconds);
464
465                    leaks.push(MemoryLeak {
466                        allocation_id: record.id,
467                        size: record.size,
468                        age_seconds,
469                        allocation_type: record.allocation_type.clone(),
470                        stack_trace: record.stack_trace.clone(),
471                        tags: record.tags.clone(),
472                        severity,
473                    });
474                }
475            }
476        }
477
478        // Sort by severity and size
479        leaks.sort_by(|a, b| b.severity.cmp(&a.severity).then(b.size.cmp(&a.size)));
480
481        Ok(leaks)
482    }
483
484    /// Analyze allocation patterns
485    pub fn analyze_patterns(&self) -> Result<Vec<AllocationPattern>> {
486        let mut patterns = Vec::new();
487
488        // Detect memory leak patterns
489        if let Ok(leak_pattern) = self.detect_leak_pattern() {
490            patterns.push(leak_pattern);
491        }
492
493        // Detect churning allocation patterns
494        if let Ok(churn_pattern) = self.detect_churn_pattern() {
495            patterns.push(churn_pattern);
496        }
497
498        // Detect large allocation patterns
499        if let Ok(large_alloc_pattern) = self.detect_large_allocation_pattern() {
500            patterns.push(large_alloc_pattern);
501        }
502
503        // Detect fragmentation-causing patterns
504        if let Ok(frag_pattern) = self.detect_fragmentation_pattern() {
505            patterns.push(frag_pattern);
506        }
507
508        Ok(patterns)
509    }
510
511    /// Analyze memory fragmentation
512    pub fn analyze_fragmentation(&self) -> Result<FragmentationAnalysis> {
513        let snapshot = self.get_memory_snapshot()?;
514
515        let fragmentation_ratio = snapshot.fragmentation_ratio;
516        let severity = match fragmentation_ratio {
517            r if r < 0.1 => FragmentationSeverity::Low,
518            r if r < 0.3 => FragmentationSeverity::Medium,
519            r if r < 0.6 => FragmentationSeverity::High,
520            _ => FragmentationSeverity::Severe,
521        };
522
523        let recommendations = match severity {
524            FragmentationSeverity::Low => {
525                vec!["Memory fragmentation is low. Continue current practices.".to_string()]
526            },
527            FragmentationSeverity::Medium => vec![
528                "Consider pooling allocations of similar sizes.".to_string(),
529                "Monitor for increasing fragmentation trends.".to_string(),
530            ],
531            FragmentationSeverity::High => vec![
532                "Implement memory pooling for frequent allocations.".to_string(),
533                "Consider compaction strategies for long-running processes.".to_string(),
534                "Review allocation patterns for optimization opportunities.".to_string(),
535            ],
536            FragmentationSeverity::Severe => vec![
537                "Critical fragmentation detected. Immediate action required.".to_string(),
538                "Implement custom allocators with compaction.".to_string(),
539                "Consider restarting the process to reset memory layout.".to_string(),
540                "Review and optimize allocation strategies.".to_string(),
541            ],
542        };
543
544        Ok(FragmentationAnalysis {
545            fragmentation_ratio,
546            largest_free_block: snapshot.free_heap_bytes, // Simplified
547            total_free_memory: snapshot.free_heap_bytes,
548            free_block_count: snapshot.free_count,
549            average_free_block_size: if snapshot.free_count > 0 {
550                snapshot.free_heap_bytes as f64 / snapshot.free_count as f64
551            } else {
552                0.0
553            },
554            fragmentation_severity: severity,
555            recommendations,
556        })
557    }
558
559    /// Analyze GC pressure
560    pub fn analyze_gc_pressure(&self) -> Result<GCPressureAnalysis> {
561        let timeline = self.memory_timeline.lock().expect("lock should not be poisoned");
562
563        let pressure_score = self.calculate_gc_pressure_score();
564        let (allocation_rate, deallocation_rate) = self.calculate_allocation_rates(&timeline);
565        let churn_rate = allocation_rate.min(deallocation_rate);
566
567        let pressure_level = match pressure_score {
568            p if p < 0.25 => GCPressureLevel::Low,
569            p if p < 0.5 => GCPressureLevel::Medium,
570            p if p < 0.75 => GCPressureLevel::High,
571            _ => GCPressureLevel::Critical,
572        };
573
574        let mut contributing_factors = Vec::new();
575        let mut recommendations = Vec::new();
576
577        if allocation_rate > 1000.0 {
578            contributing_factors.push("High allocation rate".to_string());
579            recommendations.push("Consider object pooling or reuse strategies".to_string());
580        }
581
582        if churn_rate > 500.0 {
583            contributing_factors.push("High allocation churn".to_string());
584            recommendations.push("Reduce temporary object creation".to_string());
585        }
586
587        if pressure_level == GCPressureLevel::Critical {
588            recommendations
589                .push("Consider manual memory management for critical paths".to_string());
590        }
591
592        Ok(GCPressureAnalysis {
593            pressure_score,
594            allocation_rate,
595            deallocation_rate,
596            churn_rate,
597            pressure_level,
598            contributing_factors,
599            recommendations,
600        })
601    }
602
603    // Private helper methods
604
605    async fn start_sampling(&self) -> Result<()> {
606        let interval_duration = Duration::from_millis(self.config.sampling_interval_ms);
607        let mut interval = interval(interval_duration);
608        let _timeline = Arc::clone(&self.memory_timeline);
609        let running = Arc::clone(&self.running);
610
611        tokio::spawn(async move {
612            loop {
613                interval.tick().await;
614
615                let is_running = {
616                    let running_guard = running.lock().expect("lock should not be poisoned");
617                    *running_guard
618                };
619
620                if !is_running {
621                    break;
622                }
623
624                // This would normally sample actual memory usage
625                // For now, we'll use a placeholder implementation
626            }
627        });
628
629        Ok(())
630    }
631
    /// Build the final `MemoryProfilingReport` from everything recorded.
    ///
    /// Raw data is extracted under the three state locks first; the guards
    /// are then dropped so the analysis helpers (which take the same locks)
    /// can run without deadlocking.
    ///
    /// # Errors
    ///
    /// Propagates failures from leak detection, pattern analysis,
    /// fragmentation/GC analysis, or system-clock queries.
    // NOTE(review): declared `async` but contains no `.await` — presumably
    // kept async for API symmetry with start()/stop(); confirm.
    pub async fn generate_report(
        &self,
        end_time: SystemTime,
        duration_secs: f64,
        profiling_overhead_ms: f64,
    ) -> Result<MemoryProfilingReport> {
        // Extract data from locked mutexes first, then drop guards before calling
        // analysis methods that also need to acquire these locks.
        let (
            total_allocations,
            total_deallocations,
            net_allocations,
            peak_memory_mb,
            average_memory_mb,
            allocations_by_size_bucket,
            timeline_snapshot,
            type_stats_snapshot,
        ) = {
            let allocations = self.allocations.lock().expect("lock should not be poisoned");
            let timeline = self.memory_timeline.lock().expect("lock should not be poisoned");
            let type_stats = self.type_stats.lock().expect("lock should not be poisoned");

            let total_allocs = allocations.len();
            let total_deallocs = allocations.values().filter(|r| r.freed).count();
            let net_allocs = total_allocs as i64 - total_deallocs as i64;

            // Calculate summary statistics (bytes -> MB).
            let peak_mem = timeline
                .iter()
                .map(|s| s.peak_heap_bytes as f64 / 1024.0 / 1024.0)
                .fold(0.0, f64::max);

            let avg_mem = if !timeline.is_empty() {
                timeline.iter().map(|s| s.used_heap_bytes as f64 / 1024.0 / 1024.0).sum::<f64>()
                    / timeline.len() as f64
            } else {
                0.0
            };

            // Create size buckets over ALL records (live and freed alike).
            let mut size_buckets = HashMap::new();
            for record in allocations.values() {
                let bucket = self.get_size_bucket(record.size);
                *size_buckets.entry(bucket).or_insert(0) += 1;
            }

            let timeline_snap: Vec<_> = timeline.iter().cloned().collect();
            let type_stats_snap = type_stats.clone();

            (
                total_allocs,
                total_deallocs,
                net_allocs,
                peak_mem,
                avg_mem,
                size_buckets,
                timeline_snap,
                type_stats_snap,
            )
        };
        // Guards are dropped here -- analysis methods can now safely acquire locks

        let potential_leaks = self.detect_leaks()?;
        let detected_patterns = self.analyze_patterns()?;
        let fragmentation_analysis = self.analyze_fragmentation()?;
        let gc_pressure_analysis = self.analyze_gc_pressure()?;

        // Count the detected leaks per allocation category.
        let mut leak_summary = HashMap::new();
        for leak in &potential_leaks {
            *leak_summary.entry(leak.allocation_type.clone()).or_insert(0) += 1;
        }

        Ok(MemoryProfilingReport {
            session_id: self.session_id,
            // Reconstruct the start timestamp as "now minus duration" instead
            // of storing it; accurate only to within the time spent above.
            start_time: UNIX_EPOCH
                + Duration::from_secs_f64(
                    SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64() - duration_secs,
                ),
            end_time,
            duration_secs,
            config: self.config.clone(),
            peak_memory_mb,
            average_memory_mb,
            total_allocations,
            total_deallocations,
            net_allocations,
            memory_timeline: timeline_snapshot,
            potential_leaks,
            leak_summary,
            detected_patterns,
            fragmentation_analysis,
            gc_pressure_analysis,
            allocations_by_type: type_stats_snapshot,
            allocations_by_size_bucket,
            profiling_overhead_ms,
            sampling_accuracy: 0.95, // Placeholder
        })
    }
730
731    fn capture_stack_trace(&self) -> Vec<String> {
732        // Placeholder implementation - in a real implementation,
733        // this would capture the actual call stack
734        vec![
735            "function_a".to_string(),
736            "function_b".to_string(),
737            "main".to_string(),
738        ]
739    }
740
741    fn update_type_stats(
742        &self,
743        allocation_type: &AllocationType,
744        size: usize,
745        is_allocation: bool,
746    ) {
747        let mut type_stats = self.type_stats.lock().expect("lock should not be poisoned");
748        let stats = type_stats.entry(allocation_type.clone()).or_insert(AllocationTypeStats {
749            total_allocations: 0,
750            total_deallocations: 0,
751            current_count: 0,
752            total_bytes_allocated: 0,
753            total_bytes_deallocated: 0,
754            current_bytes: 0,
755            peak_count: 0,
756            peak_bytes: 0,
757            average_allocation_size: 0.0,
758            largest_allocation: 0,
759        });
760
761        if is_allocation {
762            stats.total_allocations += 1;
763            stats.current_count += 1;
764            stats.total_bytes_allocated += size;
765            stats.current_bytes += size;
766            stats.peak_count = stats.peak_count.max(stats.current_count);
767            stats.peak_bytes = stats.peak_bytes.max(stats.current_bytes);
768            stats.largest_allocation = stats.largest_allocation.max(size);
769        } else {
770            stats.total_deallocations += 1;
771            stats.current_count = stats.current_count.saturating_sub(1);
772            stats.total_bytes_deallocated += size;
773            stats.current_bytes = stats.current_bytes.saturating_sub(size);
774        }
775
776        stats.average_allocation_size = if stats.total_allocations > 0 {
777            stats.total_bytes_allocated as f64 / stats.total_allocations as f64
778        } else {
779            0.0
780        };
781    }
782
783    fn get_size_bucket(&self, size: usize) -> String {
784        match size {
785            0..=1024 => "0-1KB".to_string(),
786            1025..=10240 => "1-10KB".to_string(),
787            10241..=102400 => "10-100KB".to_string(),
788            102401..=1048576 => "100KB-1MB".to_string(),
789            1048577..=10485760 => "1-10MB".to_string(),
790            _ => ">10MB".to_string(),
791        }
792    }
793
794    fn classify_leak_severity(&self, size: usize, age_seconds: f64) -> LeakSeverity {
795        let large_size = size > self.config.large_allocation_threshold;
796        let old_age = age_seconds > 1800.0; // 30 minutes
797        let very_old_age = age_seconds > 3600.0; // 1 hour
798
799        match (large_size, old_age, very_old_age) {
800            (true, _, true) => LeakSeverity::Critical,
801            (true, true, _) => LeakSeverity::High,
802            (true, false, _) => LeakSeverity::Medium,
803            (false, true, _) => LeakSeverity::Medium,
804            _ => LeakSeverity::Low,
805        }
806    }
807
    /// Compute an overall GC-pressure score in the range 0.0..=1.0.
    ///
    /// Simplified GC pressure calculation. In a real implementation, this
    /// would consider allocation patterns, heap growth rate, and other
    /// factors; currently it always returns a fixed placeholder value.
    fn calculate_gc_pressure_score(&self) -> f64 {
        0.3 // Placeholder value
    }
814
815    fn calculate_allocation_rates(&self, timeline: &VecDeque<MemorySnapshot>) -> (f64, f64) {
816        if timeline.len() < 2 {
817            return (0.0, 0.0);
818        }
819
820        // Simplified rate calculation
821        let first = &timeline[0];
822        let last = &timeline[timeline.len() - 1];
823
824        let duration = last
825            .timestamp
826            .duration_since(first.timestamp)
827            .unwrap_or(Duration::from_secs(1))
828            .as_secs_f64();
829
830        let allocation_rate =
831            (last.allocation_count as f64 - first.allocation_count as f64) / duration;
832        let deallocation_rate = (last.free_count as f64 - first.free_count as f64) / duration;
833
834        (allocation_rate.max(0.0), deallocation_rate.max(0.0))
835    }
836
837    // Pattern detection methods
838
839    fn detect_leak_pattern(&self) -> Result<AllocationPattern> {
840        let leaks = self.detect_leaks()?;
841        let high_severity_leaks = leaks
842            .iter()
843            .filter(|l| l.severity == LeakSeverity::High || l.severity == LeakSeverity::Critical)
844            .count();
845
846        let confidence = if leaks.len() > 10 { 0.9 } else { 0.5 };
847        let impact_score = (high_severity_leaks as f64 / (leaks.len().max(1)) as f64).min(1.0);
848
849        Ok(AllocationPattern {
850            pattern_type: PatternType::MemoryLeak,
851            description: format!("Detected {} potential memory leaks", leaks.len()),
852            confidence,
853            impact_score,
854            recommendations: vec![
855                "Review long-lived allocations for proper cleanup".to_string(),
856                "Implement RAII patterns for automatic resource management".to_string(),
857            ],
858            examples: leaks
859                .into_iter()
860                .take(3)
861                .map(|leak| {
862                    // Convert leak to allocation record for example
863                    AllocationRecord {
864                        id: leak.allocation_id,
865                        size: leak.size,
866                        timestamp: SystemTime::now(), // Placeholder
867                        stack_trace: leak.stack_trace,
868                        allocation_type: leak.allocation_type,
869                        freed: false,
870                        freed_at: None,
871                        tags: leak.tags,
872                    }
873                })
874                .collect(),
875        })
876    }
877
878    fn detect_churn_pattern(&self) -> Result<AllocationPattern> {
879        // Simplified churn detection
880        let allocations = self.allocations.lock().expect("lock should not be poisoned");
881        let short_lived_count = allocations
882            .values()
883            .filter(|record| {
884                if let (Some(_freed_at), false) = (record.freed_at, record.freed) {
885                    false // Contradiction, skip
886                } else if record.freed {
887                    if let Some(freed_at) = record.freed_at {
888                        freed_at.duration_since(record.timestamp).unwrap_or(Duration::from_secs(0))
889                            < Duration::from_secs(1)
890                    } else {
891                        false
892                    }
893                } else {
894                    false
895                }
896            })
897            .count();
898
899        let total_count = allocations.len();
900        let churn_ratio = if total_count > 0 {
901            short_lived_count as f64 / total_count as f64
902        } else {
903            0.0
904        };
905
906        Ok(AllocationPattern {
907            pattern_type: PatternType::ChurningAllocations,
908            description: format!(
909                "High allocation churn detected: {:.1}% short-lived allocations",
910                churn_ratio * 100.0
911            ),
912            confidence: if churn_ratio > 0.5 { 0.8 } else { 0.4 },
913            impact_score: churn_ratio,
914            recommendations: vec![
915                "Consider object pooling for frequently allocated objects".to_string(),
916                "Reduce temporary object creation in hot paths".to_string(),
917            ],
918            examples: vec![], // Simplified for now
919        })
920    }
921
922    fn detect_large_allocation_pattern(&self) -> Result<AllocationPattern> {
923        let allocations = self.allocations.lock().expect("lock should not be poisoned");
924        let large_allocations: Vec<_> = allocations
925            .values()
926            .filter(|record| record.size > self.config.large_allocation_threshold)
927            .cloned()
928            .collect();
929
930        let impact_score = if !allocations.is_empty() {
931            large_allocations.len() as f64 / allocations.len() as f64
932        } else {
933            0.0
934        };
935
936        Ok(AllocationPattern {
937            pattern_type: PatternType::LargeAllocations,
938            description: format!(
939                "Found {} large allocations (>{}MB)",
940                large_allocations.len(),
941                self.config.large_allocation_threshold / 1024 / 1024
942            ),
943            confidence: if large_allocations.len() > 5 { 0.9 } else { 0.6 },
944            impact_score,
945            recommendations: vec![
946                "Review large allocations for optimization opportunities".to_string(),
947                "Consider streaming or chunked processing for large data".to_string(),
948            ],
949            examples: large_allocations.into_iter().take(3).collect(),
950        })
951    }
952
953    fn detect_fragmentation_pattern(&self) -> Result<AllocationPattern> {
954        let fragmentation = self.analyze_fragmentation()?;
955
956        Ok(AllocationPattern {
957            pattern_type: PatternType::FragmentationCausing,
958            description: format!(
959                "Memory fragmentation at {:.1}%",
960                fragmentation.fragmentation_ratio * 100.0
961            ),
962            confidence: 0.8,
963            impact_score: fragmentation.fragmentation_ratio,
964            recommendations: fragmentation.recommendations,
965            examples: vec![], // Simplified for now
966        })
967    }
968}
969
/// `PartialOrd` delegates to the total ordering defined by `Ord` below —
/// the canonical way to keep the two impls consistent with each other.
impl PartialOrd for LeakSeverity {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Severity has a total order, so comparison never yields `None`.
        Some(self.cmp(other))
    }
}
975
976impl Ord for LeakSeverity {
977    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
978        let self_val = match self {
979            LeakSeverity::Low => 0,
980            LeakSeverity::Medium => 1,
981            LeakSeverity::High => 2,
982            LeakSeverity::Critical => 3,
983        };
984        let other_val = match other {
985            LeakSeverity::Low => 0,
986            LeakSeverity::Medium => 1,
987            LeakSeverity::High => 2,
988            LeakSeverity::Critical => 3,
989        };
990        self_val.cmp(&other_val)
991    }
992}
993
#[cfg(test)]
mod tests {
    // NOTE: the former `use tokio;` was removed — a single-component `use` of
    // an external crate is a no-op on the 2018+ editions
    // (clippy::single_component_path_imports); `tokio::` paths resolve anyway.
    use super::*;

    /// End-to-end smoke test: start the profiler, record two allocations and
    /// one deallocation, then verify the counters in the final report.
    #[tokio::test(flavor = "multi_thread")]
    #[ignore] // FIXME: This test has implementation issues causing slow execution
    async fn test_memory_profiler_basic() -> Result<()> {
        let config = MemoryProfilingConfig {
            sampling_interval_ms: 1000, // Slower sampling for faster tests
            ..Default::default()
        };
        let mut profiler = MemoryProfiler::new(config);

        // Wrap in timeout to prevent hanging
        let test_result = tokio::time::timeout(Duration::from_millis(500), async {
            profiler.start().await?;

            // Record some allocations
            let alloc_id1 = profiler.record_allocation(
                1024,
                AllocationType::Tensor,
                vec!["test".to_string()],
            )?;

            let _alloc_id2 = profiler.record_allocation(
                2048,
                AllocationType::Buffer,
                vec!["test".to_string()],
            )?;

            // Free one allocation
            profiler.record_deallocation(alloc_id1)?;

            // Give background tasks a moment to process
            tokio::time::sleep(Duration::from_millis(1)).await;

            let report = profiler.stop().await?;

            assert_eq!(report.total_allocations, 2);
            assert_eq!(report.total_deallocations, 1);
            assert_eq!(report.net_allocations, 1);

            Ok::<(), anyhow::Error>(())
        })
        .await;

        // Surface a timeout as a test failure instead of hanging the runner.
        match test_result {
            Ok(result) => result,
            Err(_) => Err(anyhow::anyhow!("Test timed out after 500ms")),
        }
    }

    /// A live allocation older than the (shortened) leak threshold must be
    /// reported by `detect_leaks`.
    #[tokio::test]
    async fn test_leak_detection() -> Result<()> {
        let config = MemoryProfilingConfig {
            leak_detection_threshold_secs: 1, // 1 second for testing
            ..Default::default()
        };

        let mut profiler = MemoryProfiler::new(config);
        profiler.start().await?; // Start the profiler

        // Record allocation and wait past the leak threshold
        profiler.record_allocation(1024, AllocationType::Tensor, vec!["leak_test".to_string()])?;

        tokio::time::sleep(Duration::from_secs(2)).await;

        let leaks = profiler.detect_leaks()?;
        assert!(!leaks.is_empty());

        Ok(())
    }

    /// Size-bucket boundaries must map representative sizes to their labels.
    #[test]
    fn test_size_buckets() {
        let config = MemoryProfilingConfig::default();
        let profiler = MemoryProfiler::new(config);

        assert_eq!(profiler.get_size_bucket(512), "0-1KB");
        assert_eq!(profiler.get_size_bucket(5120), "1-10KB");
        assert_eq!(profiler.get_size_bucket(51200), "10-100KB");
        assert_eq!(profiler.get_size_bucket(512000), "100KB-1MB");
        assert_eq!(profiler.get_size_bucket(5120000), "1-10MB");
        assert_eq!(profiler.get_size_bucket(51200000), ">10MB");
    }

    /// Severity classification: size crossed with age thresholds.
    #[test]
    fn test_leak_severity_classification() {
        let config = MemoryProfilingConfig::default();
        let profiler = MemoryProfiler::new(config);

        // Small, new allocation
        assert_eq!(
            profiler.classify_leak_severity(1024, 60.0),
            LeakSeverity::Low
        );

        // Large, old allocation
        assert_eq!(
            profiler.classify_leak_severity(10485760, 3700.0),
            LeakSeverity::Critical
        );

        // Medium size, medium age
        assert_eq!(
            profiler.classify_leak_severity(524288, 1900.0),
            LeakSeverity::Medium
        );
    }
}