trustformers_debug/model_diagnostics/
performance.rs

1//! Performance metrics and analysis functionality.
2//!
3//! This module provides comprehensive performance monitoring and analysis
4//! capabilities including throughput analysis, memory usage tracking,
5//! performance trend analysis, and performance optimization recommendations.
6
7use super::types::{ModelPerformanceMetrics, PerformanceSummary};
8
/// Performance analyzer for tracking and analyzing model performance metrics.
///
/// Keeps a bounded, chronologically ordered history of [`ModelPerformanceMetrics`]
/// and derives summaries, trends, anomaly reports, and optimization
/// recommendations from it.
#[derive(Debug)]
pub struct PerformanceAnalyzer {
    /// Historical performance metrics, oldest first.
    performance_history: Vec<ModelPerformanceMetrics>,
    /// Maximum history length to maintain; when exceeded, the oldest entry is evicted.
    max_history_length: usize,
    /// Performance thresholds for alerts
    thresholds: PerformanceThresholds,
}
19
/// Performance thresholds for triggering alerts.
///
/// Consumed by `PerformanceAnalyzer` both for anomaly detection and for
/// generating optimization recommendations.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Maximum acceptable memory usage in MB
    pub max_memory_mb: f64,
    /// Minimum acceptable throughput in samples/sec
    pub min_throughput: f64,
    /// Maximum acceptable loss increase percentage (recent window vs. previous window)
    pub max_loss_increase_percent: f64,
    /// Maximum acceptable variance in loss (compared against the coefficient of variation)
    pub max_loss_variance: f64,
}
32
33impl Default for PerformanceThresholds {
34    fn default() -> Self {
35        Self {
36            max_memory_mb: 8192.0, // 8GB
37            min_throughput: 100.0,
38            max_loss_increase_percent: 10.0,
39            max_loss_variance: 0.1,
40        }
41    }
42}
43
44impl PerformanceAnalyzer {
45    /// Create a new performance analyzer.
46    pub fn new() -> Self {
47        Self {
48            performance_history: Vec::new(),
49            max_history_length: 1000,
50            thresholds: PerformanceThresholds::default(),
51        }
52    }
53
54    /// Create a new performance analyzer with custom thresholds.
55    pub fn with_thresholds(thresholds: PerformanceThresholds) -> Self {
56        Self {
57            performance_history: Vec::new(),
58            max_history_length: 1000,
59            thresholds,
60        }
61    }
62
63    /// Set maximum history length.
64    pub fn set_max_history_length(&mut self, length: usize) {
65        self.max_history_length = length;
66        if self.performance_history.len() > length {
67            self.performance_history.drain(0..self.performance_history.len() - length);
68        }
69    }
70
71    /// Record a new performance measurement.
72    pub fn record_performance(&mut self, metrics: ModelPerformanceMetrics) {
73        self.performance_history.push(metrics);
74
75        // Maintain maximum history length
76        if self.performance_history.len() > self.max_history_length {
77            self.performance_history.remove(0);
78        }
79    }
80
81    /// Record metrics (alias for record_performance).
82    pub fn record_metrics(&mut self, metrics: ModelPerformanceMetrics) {
83        self.record_performance(metrics);
84    }
85
86    /// Get the complete performance history.
87    pub fn get_performance_history(&self) -> &[ModelPerformanceMetrics] {
88        &self.performance_history
89    }
90
91    /// Generate a performance summary.
92    pub fn generate_performance_summary(&self) -> PerformanceSummary {
93        if self.performance_history.is_empty() {
94            return PerformanceSummary::default();
95        }
96
97        let total_steps = self.performance_history.len();
98        let current_metrics = self.performance_history.last().unwrap();
99
100        let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
101        let throughputs: Vec<f64> =
102            self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
103        let memory_usages: Vec<f64> =
104            self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
105
106        let best_loss = losses.iter().fold(f64::INFINITY, |acc, &x| acc.min(x));
107        let avg_loss = losses.iter().sum::<f64>() / losses.len() as f64;
108        let avg_throughput = throughputs.iter().sum::<f64>() / throughputs.len() as f64;
109        let peak_memory_mb = memory_usages.iter().fold(0.0f64, |acc, &x| acc.max(x));
110        let avg_memory_mb = memory_usages.iter().sum::<f64>() / memory_usages.len() as f64;
111
112        PerformanceSummary {
113            total_steps,
114            current_loss: current_metrics.loss,
115            best_loss,
116            avg_loss,
117            current_throughput: current_metrics.throughput_samples_per_sec,
118            avg_throughput,
119            peak_memory_mb,
120            avg_memory_mb,
121        }
122    }
123
124    /// Analyze performance trends.
125    pub fn analyze_performance_trends(&self) -> PerformanceTrends {
126        if self.performance_history.len() < 10 {
127            return PerformanceTrends::default();
128        }
129
130        let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
131        let throughputs: Vec<f64> =
132            self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
133        let memory_usages: Vec<f64> =
134            self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
135
136        let loss_trend = self.compute_trend(&losses);
137        let throughput_trend = self.compute_trend(&throughputs);
138        let memory_trend = self.compute_trend(&memory_usages);
139
140        let loss_volatility = self.compute_volatility(&losses);
141        let throughput_volatility = self.compute_volatility(&throughputs);
142
143        PerformanceTrends {
144            loss_trend,
145            throughput_trend,
146            memory_trend,
147            loss_volatility,
148            throughput_volatility,
149            trend_confidence: self.compute_trend_confidence(&losses),
150        }
151    }
152
153    /// Check for performance anomalies.
154    pub fn detect_performance_anomalies(&self) -> Vec<PerformanceAnomaly> {
155        let mut anomalies = Vec::new();
156
157        if self.performance_history.len() < 5 {
158            return anomalies;
159        }
160
161        // Check for memory leaks
162        if let Some(anomaly) = self.detect_memory_leak() {
163            anomalies.push(anomaly);
164        }
165
166        // Check for performance degradation
167        if let Some(anomaly) = self.detect_performance_degradation() {
168            anomalies.push(anomaly);
169        }
170
171        // Check for training instability
172        if let Some(anomaly) = self.detect_training_instability() {
173            anomalies.push(anomaly);
174        }
175
176        // Check for throughput drops
177        if let Some(anomaly) = self.detect_throughput_drops() {
178            anomalies.push(anomaly);
179        }
180
181        anomalies
182    }
183
184    /// Generate performance optimization recommendations.
185    pub fn generate_optimization_recommendations(&self) -> Vec<OptimizationRecommendation> {
186        let mut recommendations = Vec::new();
187        let summary = self.generate_performance_summary();
188
189        // Memory optimization recommendations
190        if summary.peak_memory_mb > self.thresholds.max_memory_mb {
191            recommendations.push(OptimizationRecommendation {
192                category: "Memory".to_string(),
193                priority: PerformanceRecommendationPriority::High,
194                description: "High memory usage detected".to_string(),
195                suggestion: "Consider reducing batch size or using gradient checkpointing"
196                    .to_string(),
197                expected_improvement: 0.3,
198            });
199        }
200
201        // Throughput optimization recommendations
202        if summary.avg_throughput < self.thresholds.min_throughput {
203            recommendations.push(OptimizationRecommendation {
204                category: "Throughput".to_string(),
205                priority: PerformanceRecommendationPriority::Medium,
206                description: "Low throughput detected".to_string(),
207                suggestion: "Consider increasing batch size or optimizing data loading".to_string(),
208                expected_improvement: 0.4,
209            });
210        }
211
212        // Loss optimization recommendations
213        let trends = self.analyze_performance_trends();
214        if trends.loss_trend > 0.01 {
215            recommendations.push(OptimizationRecommendation {
216                category: "Training".to_string(),
217                priority: PerformanceRecommendationPriority::High,
218                description: "Loss is increasing".to_string(),
219                suggestion: "Consider reducing learning rate or adding regularization".to_string(),
220                expected_improvement: 0.5,
221            });
222        }
223
224        recommendations
225    }
226
227    /// Compute linear trend for a series of values.
228    fn compute_trend(&self, values: &[f64]) -> f64 {
229        if values.len() < 2 {
230            return 0.0;
231        }
232
233        let n = values.len() as f64;
234        let x_mean = (n - 1.0) / 2.0;
235        let y_mean = values.iter().sum::<f64>() / n;
236
237        let mut numerator = 0.0;
238        let mut denominator = 0.0;
239
240        for (i, &y) in values.iter().enumerate() {
241            let x = i as f64;
242            numerator += (x - x_mean) * (y - y_mean);
243            denominator += (x - x_mean).powi(2);
244        }
245
246        if denominator == 0.0 {
247            0.0
248        } else {
249            numerator / denominator
250        }
251    }
252
253    /// Compute volatility (coefficient of variation) for a series of values.
254    fn compute_volatility(&self, values: &[f64]) -> f64 {
255        if values.len() < 2 {
256            return 0.0;
257        }
258
259        let mean = values.iter().sum::<f64>() / values.len() as f64;
260        let variance =
261            values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
262        let std_dev = variance.sqrt();
263
264        if mean == 0.0 {
265            0.0
266        } else {
267            std_dev / mean.abs()
268        }
269    }
270
271    /// Compute confidence in trend analysis.
272    fn compute_trend_confidence(&self, values: &[f64]) -> f64 {
273        if values.len() < 10 {
274            return 0.0;
275        }
276
277        let trend = self.compute_trend(values);
278        let volatility = self.compute_volatility(values);
279
280        // Higher confidence for stronger trends with lower volatility
281        let trend_strength = trend.abs();
282        let confidence = trend_strength / (1.0 + volatility);
283        confidence.min(1.0)
284    }
285
286    /// Detect memory leak patterns.
287    fn detect_memory_leak(&self) -> Option<PerformanceAnomaly> {
288        if self.performance_history.len() < 10 {
289            return None;
290        }
291
292        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
293        let memory_usages: Vec<f64> = recent_metrics.iter().map(|m| m.memory_usage_mb).collect();
294        let memory_trend = self.compute_trend(&memory_usages);
295
296        // Consider it a memory leak if memory is consistently growing
297        if memory_trend > 10.0 {
298            // More than 10MB increase per step on average
299            Some(PerformanceAnomaly {
300                anomaly_type: AnomalyType::MemoryLeak,
301                severity: AnomalySeverity::High,
302                description: format!("Memory usage increasing at {:.1} MB/step", memory_trend),
303                detected_at_step: self.performance_history.last().unwrap().training_step,
304                confidence: 0.8,
305            })
306        } else {
307            None
308        }
309    }
310
311    /// Detect performance degradation.
312    fn detect_performance_degradation(&self) -> Option<PerformanceAnomaly> {
313        if self.performance_history.len() < 20 {
314            return None;
315        }
316
317        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
318        let previous_metrics = &self.performance_history
319            [self.performance_history.len() - 20..self.performance_history.len() - 10];
320
321        let recent_avg_loss: f64 =
322            recent_metrics.iter().map(|m| m.loss).sum::<f64>() / recent_metrics.len() as f64;
323        let previous_avg_loss: f64 =
324            previous_metrics.iter().map(|m| m.loss).sum::<f64>() / previous_metrics.len() as f64;
325
326        let degradation_percent =
327            ((recent_avg_loss - previous_avg_loss) / previous_avg_loss) * 100.0;
328
329        if degradation_percent > self.thresholds.max_loss_increase_percent {
330            Some(PerformanceAnomaly {
331                anomaly_type: AnomalyType::PerformanceDegradation,
332                severity: AnomalySeverity::High,
333                description: format!("Performance degraded by {:.1}%", degradation_percent),
334                detected_at_step: self.performance_history.last().unwrap().training_step,
335                confidence: 0.9,
336            })
337        } else {
338            None
339        }
340    }
341
342    /// Detect training instability.
343    fn detect_training_instability(&self) -> Option<PerformanceAnomaly> {
344        if self.performance_history.len() < 10 {
345            return None;
346        }
347
348        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
349        let losses: Vec<f64> = recent_metrics.iter().map(|m| m.loss).collect();
350        let volatility = self.compute_volatility(&losses);
351
352        if volatility > self.thresholds.max_loss_variance {
353            Some(PerformanceAnomaly {
354                anomaly_type: AnomalyType::TrainingInstability,
355                severity: AnomalySeverity::Medium,
356                description: format!("High loss volatility: {:.3}", volatility),
357                detected_at_step: self.performance_history.last().unwrap().training_step,
358                confidence: 0.7,
359            })
360        } else {
361            None
362        }
363    }
364
365    /// Detect throughput drops.
366    fn detect_throughput_drops(&self) -> Option<PerformanceAnomaly> {
367        if self.performance_history.len() < 10 {
368            return None;
369        }
370
371        let recent_metrics = &self.performance_history[self.performance_history.len() - 5..];
372        let avg_recent_throughput: f64 =
373            recent_metrics.iter().map(|m| m.throughput_samples_per_sec).sum::<f64>()
374                / recent_metrics.len() as f64;
375
376        if avg_recent_throughput < self.thresholds.min_throughput {
377            Some(PerformanceAnomaly {
378                anomaly_type: AnomalyType::ThroughputDrop,
379                severity: AnomalySeverity::Medium,
380                description: format!("Low throughput: {:.1} samples/sec", avg_recent_throughput),
381                detected_at_step: self.performance_history.last().unwrap().training_step,
382                confidence: 0.8,
383            })
384        } else {
385            None
386        }
387    }
388
389    /// Clear performance history.
390    pub fn clear(&mut self) {
391        self.performance_history.clear();
392    }
393}
394
395impl Default for PerformanceAnalyzer {
396    fn default() -> Self {
397        Self::new()
398    }
399}
400
/// Performance trends analysis results.
///
/// Slopes are per-step linear-regression slopes; volatilities are
/// coefficients of variation. The derived `Default` (all fields zero) is
/// identical to the previous hand-written impl, since `f64::default()`
/// is `0.0`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct PerformanceTrends {
    /// Loss trend (slope)
    pub loss_trend: f64,
    /// Throughput trend (slope)
    pub throughput_trend: f64,
    /// Memory usage trend (slope)
    pub memory_trend: f64,
    /// Loss volatility (coefficient of variation)
    pub loss_volatility: f64,
    /// Throughput volatility (coefficient of variation)
    pub throughput_volatility: f64,
    /// Confidence in trend analysis
    pub trend_confidence: f64,
}
430
/// Performance anomaly detection results.
///
/// Produced by `PerformanceAnalyzer::detect_performance_anomalies` and its
/// individual detectors.
#[derive(Debug, Clone)]
pub struct PerformanceAnomaly {
    /// Type of anomaly detected
    pub anomaly_type: AnomalyType,
    /// Severity of the anomaly
    pub severity: AnomalySeverity,
    /// Human-readable description of the anomaly
    pub description: String,
    /// Training step when anomaly was detected
    pub detected_at_step: usize,
    /// Confidence in the detection (built-in detectors emit 0.7 to 0.9)
    pub confidence: f64,
}
445
/// Types of performance anomalies.
///
/// `Copy`/`PartialEq`/`Eq` are derived so callers can compare detected
/// anomaly types directly instead of relying on `matches!`. All variants
/// are unit-like, so the additional derives are free and backward
/// compatible.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AnomalyType {
    /// Memory leak detected
    MemoryLeak,
    /// Performance degradation detected
    PerformanceDegradation,
    /// Training instability detected
    TrainingInstability,
    /// Throughput drop detected
    ThroughputDrop,
}
458
/// Severity levels for anomalies, ordered from least to most severe.
///
/// Derived `Ord` uses the declaration order, so
/// `Low < Medium < High < Critical`, letting callers sort or filter
/// anomalies by severity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum AnomalySeverity {
    /// Low severity anomaly
    Low,
    /// Medium severity anomaly
    Medium,
    /// High severity anomaly
    High,
    /// Critical severity anomaly
    Critical,
}
471
/// Performance optimization recommendation.
///
/// Produced by `PerformanceAnalyzer::generate_optimization_recommendations`;
/// categories emitted there are "Memory", "Throughput", and "Training".
#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    /// Category of optimization
    pub category: String,
    /// Priority of the recommendation
    pub priority: PerformanceRecommendationPriority,
    /// Description of the detected issue
    pub description: String,
    /// Suggested optimization action
    pub suggestion: String,
    /// Expected improvement as a fraction (0.0 to 1.0)
    pub expected_improvement: f64,
}
486
487/// Priority levels for recommendations.
488#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
489pub enum PerformanceRecommendationPriority {
490    /// Low priority recommendation
491    Low,
492    /// Medium priority recommendation
493    Medium,
494    /// High priority recommendation
495    High,
496    /// Critical priority recommendation
497    Critical,
498}
499
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Build a metrics sample; accuracy, learning rate, batch size, and GPU
    /// utilization are fixed so tests only vary step/loss/memory/throughput.
    fn create_test_metrics(
        step: usize,
        loss: f64,
        memory: f64,
        throughput: f64,
    ) -> ModelPerformanceMetrics {
        ModelPerformanceMetrics {
            training_step: step,
            loss,
            accuracy: Some(0.8),
            learning_rate: 0.001,
            batch_size: 32,
            throughput_samples_per_sec: throughput,
            memory_usage_mb: memory,
            gpu_utilization: Some(0.9),
            timestamp: Utc::now(),
        }
    }

    #[test]
    fn test_performance_analyzer_creation() {
        let analyzer = PerformanceAnalyzer::new();
        assert!(analyzer.performance_history.is_empty());
        assert_eq!(analyzer.max_history_length, 1000);
    }

    #[test]
    fn test_record_performance() {
        let mut analyzer = PerformanceAnalyzer::new();
        analyzer.record_performance(create_test_metrics(1, 0.5, 1000.0, 100.0));
        assert_eq!(analyzer.performance_history.len(), 1);
    }

    #[test]
    fn test_performance_summary() {
        let mut analyzer = PerformanceAnalyzer::new();

        // Five steps with strictly decreasing loss (1, 1/2, ..., 1/5).
        for step in 1..=5 {
            analyzer.record_performance(create_test_metrics(
                step,
                1.0 / step as f64,
                1000.0,
                100.0,
            ));
        }

        let summary = analyzer.generate_performance_summary();
        assert_eq!(summary.total_steps, 5);
        assert!(summary.best_loss < summary.avg_loss);
    }

    #[test]
    fn test_trend_computation() {
        let analyzer = PerformanceAnalyzer::new();
        let increasing = [1.0, 2.0, 3.0, 4.0, 5.0];
        // A strictly increasing series must have a positive slope.
        assert!(analyzer.compute_trend(&increasing) > 0.0);
    }

    #[test]
    fn test_memory_leak_detection() {
        let mut analyzer = PerformanceAnalyzer::new();

        // Memory grows 50 MB per step — well above the 10 MB/step threshold.
        for step in 1..=15 {
            let memory = 1000.0 + step as f64 * 50.0;
            analyzer.record_performance(create_test_metrics(step, 0.5, memory, 100.0));
        }

        let anomalies = analyzer.detect_performance_anomalies();
        assert!(!anomalies.is_empty());
        assert!(matches!(anomalies[0].anomaly_type, AnomalyType::MemoryLeak));
    }
}