// trustformers_debug/model_diagnostics/performance.rs
//! Performance metrics and analysis functionality.
//!
//! This module provides comprehensive performance monitoring and analysis
//! capabilities including throughput analysis, memory usage tracking,
//! performance trend analysis, and performance optimization recommendations.

use super::types::{ModelPerformanceMetrics, PerformanceSummary};

/// Performance analyzer for tracking and analyzing model performance metrics.
///
/// Maintains a bounded, chronologically ordered history of
/// [`ModelPerformanceMetrics`] and derives summaries, trend analyses,
/// anomaly reports, and optimization recommendations from it.
#[derive(Debug)]
pub struct PerformanceAnalyzer {
    /// Historical performance metrics, oldest entry first.
    performance_history: Vec<ModelPerformanceMetrics>,
    /// Maximum history length to maintain; older entries are evicted
    /// once this limit is exceeded.
    max_history_length: usize,
    /// Performance thresholds for triggering alerts and recommendations.
    thresholds: PerformanceThresholds,
}
19
/// Performance thresholds for triggering alerts.
///
/// Consumed by [`PerformanceAnalyzer`] when detecting anomalies and
/// generating optimization recommendations.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Maximum acceptable memory usage in MB.
    pub max_memory_mb: f64,
    /// Minimum acceptable throughput in samples/sec.
    pub min_throughput: f64,
    /// Maximum acceptable loss increase percentage (recent vs. previous window).
    pub max_loss_increase_percent: f64,
    /// Maximum acceptable loss volatility (coefficient of variation, not raw variance).
    pub max_loss_variance: f64,
}
32
impl Default for PerformanceThresholds {
    /// Conservative defaults: 8 GB memory ceiling, 100 samples/sec floor,
    /// 10% tolerated loss increase, 0.1 tolerated loss volatility.
    fn default() -> Self {
        Self {
            max_memory_mb: 8192.0, // 8GB
            min_throughput: 100.0,
            max_loss_increase_percent: 10.0,
            max_loss_variance: 0.1,
        }
    }
}
43
44impl PerformanceAnalyzer {
45    /// Create a new performance analyzer.
46    pub fn new() -> Self {
47        Self {
48            performance_history: Vec::new(),
49            max_history_length: 1000,
50            thresholds: PerformanceThresholds::default(),
51        }
52    }
53
54    /// Create a new performance analyzer with custom thresholds.
55    pub fn with_thresholds(thresholds: PerformanceThresholds) -> Self {
56        Self {
57            performance_history: Vec::new(),
58            max_history_length: 1000,
59            thresholds,
60        }
61    }
62
63    /// Set maximum history length.
64    pub fn set_max_history_length(&mut self, length: usize) {
65        self.max_history_length = length;
66        if self.performance_history.len() > length {
67            self.performance_history.drain(0..self.performance_history.len() - length);
68        }
69    }
70
71    /// Record a new performance measurement.
72    pub fn record_performance(&mut self, metrics: ModelPerformanceMetrics) {
73        self.performance_history.push(metrics);
74
75        // Maintain maximum history length
76        if self.performance_history.len() > self.max_history_length {
77            self.performance_history.remove(0);
78        }
79    }
80
81    /// Record metrics (alias for record_performance).
82    pub fn record_metrics(&mut self, metrics: ModelPerformanceMetrics) {
83        self.record_performance(metrics);
84    }
85
86    /// Get the complete performance history.
87    pub fn get_performance_history(&self) -> &[ModelPerformanceMetrics] {
88        &self.performance_history
89    }
90
91    /// Generate a performance summary.
92    pub fn generate_performance_summary(&self) -> PerformanceSummary {
93        if self.performance_history.is_empty() {
94            return PerformanceSummary::default();
95        }
96
97        let total_steps = self.performance_history.len();
98        let current_metrics = self
99            .performance_history
100            .last()
101            .expect("performance_history is non-empty after is_empty check");
102
103        let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
104        let throughputs: Vec<f64> =
105            self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
106        let memory_usages: Vec<f64> =
107            self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
108
109        let best_loss = losses.iter().fold(f64::INFINITY, |acc, &x| acc.min(x));
110        let avg_loss = losses.iter().sum::<f64>() / losses.len() as f64;
111        let avg_throughput = throughputs.iter().sum::<f64>() / throughputs.len() as f64;
112        let peak_memory_mb = memory_usages.iter().fold(0.0f64, |acc, &x| acc.max(x));
113        let avg_memory_mb = memory_usages.iter().sum::<f64>() / memory_usages.len() as f64;
114
115        PerformanceSummary {
116            total_steps,
117            current_loss: current_metrics.loss,
118            best_loss,
119            avg_loss,
120            current_throughput: current_metrics.throughput_samples_per_sec,
121            avg_throughput,
122            peak_memory_mb,
123            avg_memory_mb,
124        }
125    }
126
127    /// Analyze performance trends.
128    pub fn analyze_performance_trends(&self) -> PerformanceTrends {
129        if self.performance_history.len() < 10 {
130            return PerformanceTrends::default();
131        }
132
133        let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
134        let throughputs: Vec<f64> =
135            self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
136        let memory_usages: Vec<f64> =
137            self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
138
139        let loss_trend = self.compute_trend(&losses);
140        let throughput_trend = self.compute_trend(&throughputs);
141        let memory_trend = self.compute_trend(&memory_usages);
142
143        let loss_volatility = self.compute_volatility(&losses);
144        let throughput_volatility = self.compute_volatility(&throughputs);
145
146        PerformanceTrends {
147            loss_trend,
148            throughput_trend,
149            memory_trend,
150            loss_volatility,
151            throughput_volatility,
152            trend_confidence: self.compute_trend_confidence(&losses),
153        }
154    }
155
156    /// Check for performance anomalies.
157    pub fn detect_performance_anomalies(&self) -> Vec<PerformanceAnomaly> {
158        let mut anomalies = Vec::new();
159
160        if self.performance_history.len() < 5 {
161            return anomalies;
162        }
163
164        // Check for memory leaks
165        if let Some(anomaly) = self.detect_memory_leak() {
166            anomalies.push(anomaly);
167        }
168
169        // Check for performance degradation
170        if let Some(anomaly) = self.detect_performance_degradation() {
171            anomalies.push(anomaly);
172        }
173
174        // Check for training instability
175        if let Some(anomaly) = self.detect_training_instability() {
176            anomalies.push(anomaly);
177        }
178
179        // Check for throughput drops
180        if let Some(anomaly) = self.detect_throughput_drops() {
181            anomalies.push(anomaly);
182        }
183
184        anomalies
185    }
186
187    /// Generate performance optimization recommendations.
188    pub fn generate_optimization_recommendations(&self) -> Vec<OptimizationRecommendation> {
189        let mut recommendations = Vec::new();
190        let summary = self.generate_performance_summary();
191
192        // Memory optimization recommendations
193        if summary.peak_memory_mb > self.thresholds.max_memory_mb {
194            recommendations.push(OptimizationRecommendation {
195                category: "Memory".to_string(),
196                priority: PerformanceRecommendationPriority::High,
197                description: "High memory usage detected".to_string(),
198                suggestion: "Consider reducing batch size or using gradient checkpointing"
199                    .to_string(),
200                expected_improvement: 0.3,
201            });
202        }
203
204        // Throughput optimization recommendations
205        if summary.avg_throughput < self.thresholds.min_throughput {
206            recommendations.push(OptimizationRecommendation {
207                category: "Throughput".to_string(),
208                priority: PerformanceRecommendationPriority::Medium,
209                description: "Low throughput detected".to_string(),
210                suggestion: "Consider increasing batch size or optimizing data loading".to_string(),
211                expected_improvement: 0.4,
212            });
213        }
214
215        // Loss optimization recommendations
216        let trends = self.analyze_performance_trends();
217        if trends.loss_trend > 0.01 {
218            recommendations.push(OptimizationRecommendation {
219                category: "Training".to_string(),
220                priority: PerformanceRecommendationPriority::High,
221                description: "Loss is increasing".to_string(),
222                suggestion: "Consider reducing learning rate or adding regularization".to_string(),
223                expected_improvement: 0.5,
224            });
225        }
226
227        recommendations
228    }
229
230    /// Compute linear trend for a series of values.
231    fn compute_trend(&self, values: &[f64]) -> f64 {
232        if values.len() < 2 {
233            return 0.0;
234        }
235
236        let n = values.len() as f64;
237        let x_mean = (n - 1.0) / 2.0;
238        let y_mean = values.iter().sum::<f64>() / n;
239
240        let mut numerator = 0.0;
241        let mut denominator = 0.0;
242
243        for (i, &y) in values.iter().enumerate() {
244            let x = i as f64;
245            numerator += (x - x_mean) * (y - y_mean);
246            denominator += (x - x_mean).powi(2);
247        }
248
249        if denominator == 0.0 {
250            0.0
251        } else {
252            numerator / denominator
253        }
254    }
255
256    /// Compute volatility (coefficient of variation) for a series of values.
257    fn compute_volatility(&self, values: &[f64]) -> f64 {
258        if values.len() < 2 {
259            return 0.0;
260        }
261
262        let mean = values.iter().sum::<f64>() / values.len() as f64;
263        let variance =
264            values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
265        let std_dev = variance.sqrt();
266
267        if mean == 0.0 {
268            0.0
269        } else {
270            std_dev / mean.abs()
271        }
272    }
273
274    /// Compute confidence in trend analysis.
275    fn compute_trend_confidence(&self, values: &[f64]) -> f64 {
276        if values.len() < 10 {
277            return 0.0;
278        }
279
280        let trend = self.compute_trend(values);
281        let volatility = self.compute_volatility(values);
282
283        // Higher confidence for stronger trends with lower volatility
284        let trend_strength = trend.abs();
285        let confidence = trend_strength / (1.0 + volatility);
286        confidence.min(1.0)
287    }
288
289    /// Detect memory leak patterns.
290    fn detect_memory_leak(&self) -> Option<PerformanceAnomaly> {
291        if self.performance_history.len() < 10 {
292            return None;
293        }
294
295        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
296        let memory_usages: Vec<f64> = recent_metrics.iter().map(|m| m.memory_usage_mb).collect();
297        let memory_trend = self.compute_trend(&memory_usages);
298
299        // Consider it a memory leak if memory is consistently growing
300        if memory_trend > 10.0 {
301            // More than 10MB increase per step on average
302            Some(PerformanceAnomaly {
303                anomaly_type: AnomalyType::MemoryLeak,
304                severity: AnomalySeverity::High,
305                description: format!("Memory usage increasing at {:.1} MB/step", memory_trend),
306                detected_at_step: self
307                    .performance_history
308                    .last()
309                    .expect("performance_history is non-empty after is_empty check")
310                    .training_step,
311                confidence: 0.8,
312            })
313        } else {
314            None
315        }
316    }
317
318    /// Detect performance degradation.
319    fn detect_performance_degradation(&self) -> Option<PerformanceAnomaly> {
320        if self.performance_history.len() < 20 {
321            return None;
322        }
323
324        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
325        let previous_metrics = &self.performance_history
326            [self.performance_history.len() - 20..self.performance_history.len() - 10];
327
328        let recent_avg_loss: f64 =
329            recent_metrics.iter().map(|m| m.loss).sum::<f64>() / recent_metrics.len() as f64;
330        let previous_avg_loss: f64 =
331            previous_metrics.iter().map(|m| m.loss).sum::<f64>() / previous_metrics.len() as f64;
332
333        let degradation_percent =
334            ((recent_avg_loss - previous_avg_loss) / previous_avg_loss) * 100.0;
335
336        if degradation_percent > self.thresholds.max_loss_increase_percent {
337            Some(PerformanceAnomaly {
338                anomaly_type: AnomalyType::PerformanceDegradation,
339                severity: AnomalySeverity::High,
340                description: format!("Performance degraded by {:.1}%", degradation_percent),
341                detected_at_step: self
342                    .performance_history
343                    .last()
344                    .expect("performance_history is non-empty after is_empty check")
345                    .training_step,
346                confidence: 0.9,
347            })
348        } else {
349            None
350        }
351    }
352
353    /// Detect training instability.
354    fn detect_training_instability(&self) -> Option<PerformanceAnomaly> {
355        if self.performance_history.len() < 10 {
356            return None;
357        }
358
359        let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
360        let losses: Vec<f64> = recent_metrics.iter().map(|m| m.loss).collect();
361        let volatility = self.compute_volatility(&losses);
362
363        if volatility > self.thresholds.max_loss_variance {
364            Some(PerformanceAnomaly {
365                anomaly_type: AnomalyType::TrainingInstability,
366                severity: AnomalySeverity::Medium,
367                description: format!("High loss volatility: {:.3}", volatility),
368                detected_at_step: self
369                    .performance_history
370                    .last()
371                    .expect("performance_history is non-empty after is_empty check")
372                    .training_step,
373                confidence: 0.7,
374            })
375        } else {
376            None
377        }
378    }
379
380    /// Detect throughput drops.
381    fn detect_throughput_drops(&self) -> Option<PerformanceAnomaly> {
382        if self.performance_history.len() < 10 {
383            return None;
384        }
385
386        let recent_metrics = &self.performance_history[self.performance_history.len() - 5..];
387        let avg_recent_throughput: f64 =
388            recent_metrics.iter().map(|m| m.throughput_samples_per_sec).sum::<f64>()
389                / recent_metrics.len() as f64;
390
391        if avg_recent_throughput < self.thresholds.min_throughput {
392            Some(PerformanceAnomaly {
393                anomaly_type: AnomalyType::ThroughputDrop,
394                severity: AnomalySeverity::Medium,
395                description: format!("Low throughput: {:.1} samples/sec", avg_recent_throughput),
396                detected_at_step: self
397                    .performance_history
398                    .last()
399                    .expect("performance_history is non-empty after is_empty check")
400                    .training_step,
401                confidence: 0.8,
402            })
403        } else {
404            None
405        }
406    }
407
408    /// Clear performance history.
409    pub fn clear(&mut self) {
410        self.performance_history.clear();
411    }
412}
413
impl Default for PerformanceAnalyzer {
    /// Equivalent to [`PerformanceAnalyzer::new`]: default thresholds and
    /// a history capacity of 1000.
    fn default() -> Self {
        Self::new()
    }
}
419
/// Performance trends analysis results.
///
/// Slopes are least-squares linear-regression slopes per recorded step;
/// volatilities are coefficients of variation (std-dev / |mean|).
#[derive(Debug, Clone)]
pub struct PerformanceTrends {
    /// Loss trend (slope per step; positive means loss is rising).
    pub loss_trend: f64,
    /// Throughput trend (slope per step).
    pub throughput_trend: f64,
    /// Memory usage trend (slope per step, in MB).
    pub memory_trend: f64,
    /// Loss volatility (coefficient of variation).
    pub loss_volatility: f64,
    /// Throughput volatility (coefficient of variation).
    pub throughput_volatility: f64,
    /// Confidence in trend analysis, heuristic value in [0, 1].
    pub trend_confidence: f64,
}
436
impl Default for PerformanceTrends {
    /// All-zero trends, returned when there is too little history to analyze.
    // NOTE(review): every field defaults to 0.0, so `#[derive(Default)]` on
    // the struct would make this impl redundant.
    fn default() -> Self {
        Self {
            loss_trend: 0.0,
            throughput_trend: 0.0,
            memory_trend: 0.0,
            loss_volatility: 0.0,
            throughput_volatility: 0.0,
            trend_confidence: 0.0,
        }
    }
}
449
/// Performance anomaly detection results.
///
/// Produced by [`PerformanceAnalyzer::detect_performance_anomalies`].
#[derive(Debug, Clone)]
pub struct PerformanceAnomaly {
    /// Type of anomaly detected.
    pub anomaly_type: AnomalyType,
    /// Severity of the anomaly.
    pub severity: AnomalySeverity,
    /// Human-readable description of the anomaly.
    pub description: String,
    /// Training step when the anomaly was detected (the most recent
    /// recorded step at detection time).
    pub detected_at_step: usize,
    /// Confidence in the detection, heuristic value in [0, 1].
    pub confidence: f64,
}
464
/// Types of performance anomalies.
#[derive(Debug, Clone)]
pub enum AnomalyType {
    /// Memory usage growing steadily across recent steps.
    MemoryLeak,
    /// Average loss worsening versus the preceding window.
    PerformanceDegradation,
    /// High loss volatility across recent steps.
    TrainingInstability,
    /// Recent throughput below the configured minimum.
    ThroughputDrop,
}
477
/// Severity levels for anomalies.
#[derive(Debug, Clone)]
pub enum AnomalySeverity {
    /// Low severity anomaly.
    Low,
    /// Medium severity anomaly.
    Medium,
    /// High severity anomaly.
    High,
    /// Critical severity anomaly.
    Critical,
}
490
/// Performance optimization recommendation.
///
/// Produced by [`PerformanceAnalyzer::generate_optimization_recommendations`].
#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    /// Category of optimization (e.g. "Memory", "Throughput", "Training").
    pub category: String,
    /// Priority of the recommendation.
    pub priority: PerformanceRecommendationPriority,
    /// Description of the detected issue.
    pub description: String,
    /// Suggested optimization action.
    pub suggestion: String,
    /// Expected improvement as a fraction (0.0 to 1.0).
    pub expected_improvement: f64,
}
505
/// Priority levels for recommendations.
// NOTE(review): this is the only enum in the module with serde derives —
// presumably recommendations are serialized for reporting; confirm whether
// the sibling enums should derive Serialize/Deserialize too.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum PerformanceRecommendationPriority {
    /// Low priority recommendation.
    Low,
    /// Medium priority recommendation.
    Medium,
    /// High priority recommendation.
    High,
    /// Critical priority recommendation.
    Critical,
}
518
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Build a metrics record with fixed accuracy/LR/batch fields so tests
    /// vary only step, loss, memory, and throughput.
    fn create_test_metrics(
        step: usize,
        loss: f64,
        memory: f64,
        throughput: f64,
    ) -> ModelPerformanceMetrics {
        ModelPerformanceMetrics {
            training_step: step,
            loss,
            accuracy: Some(0.8),
            learning_rate: 0.001,
            batch_size: 32,
            throughput_samples_per_sec: throughput,
            memory_usage_mb: memory,
            gpu_utilization: Some(0.9),
            timestamp: Utc::now(),
        }
    }

    #[test]
    fn test_performance_analyzer_creation() {
        let analyzer = PerformanceAnalyzer::new();
        assert_eq!(analyzer.performance_history.len(), 0);
        assert_eq!(analyzer.max_history_length, 1000);
    }

    #[test]
    fn test_record_performance() {
        let mut analyzer = PerformanceAnalyzer::new();
        let metrics = create_test_metrics(1, 0.5, 1000.0, 100.0);

        analyzer.record_performance(metrics);
        assert_eq!(analyzer.performance_history.len(), 1);
    }

    #[test]
    fn test_history_is_trimmed_to_max_length() {
        // Recording past the cap must evict the oldest entries.
        let mut analyzer = PerformanceAnalyzer::new();
        analyzer.set_max_history_length(3);
        for i in 1..=10 {
            analyzer.record_performance(create_test_metrics(i, 0.5, 100.0, 100.0));
        }
        assert_eq!(analyzer.performance_history.len(), 3);
        assert_eq!(analyzer.performance_history[0].training_step, 8);
        assert_eq!(analyzer.performance_history[2].training_step, 10);
    }

    #[test]
    fn test_performance_summary() {
        let mut analyzer = PerformanceAnalyzer::new();

        // Losses 1.0, 0.5, 0.333…, 0.25, 0.2 — strictly decreasing.
        for i in 1..=5 {
            let metrics = create_test_metrics(i, 1.0 / i as f64, 1000.0, 100.0);
            analyzer.record_performance(metrics);
        }

        let summary = analyzer.generate_performance_summary();
        assert_eq!(summary.total_steps, 5);
        assert!(summary.best_loss < summary.avg_loss);
        // Current loss is the last recorded value (1.0 / 5).
        assert_eq!(summary.current_loss, 0.2);
    }

    #[test]
    fn test_trend_computation() {
        let analyzer = PerformanceAnalyzer::new();
        // Perfect line y = x + 1 over indices 0..5 has slope exactly 1.
        let trend = analyzer.compute_trend(&[1.0, 2.0, 3.0, 4.0, 5.0]);
        assert!((trend - 1.0).abs() < 1e-12);
    }

    #[test]
    fn test_memory_leak_detection() {
        let mut analyzer = PerformanceAnalyzer::new();

        // Memory grows 50 MB/step, well above the 10 MB/step leak threshold;
        // loss is flat, so no instability is reported.
        for i in 1..=15 {
            let metrics = create_test_metrics(i, 0.5, 1000.0 + (i as f64 * 50.0), 100.0);
            analyzer.record_performance(metrics);
        }

        let anomalies = analyzer.detect_performance_anomalies();
        assert!(!anomalies.is_empty());
        // Don't depend on report ordering; require that a leak was flagged.
        assert!(anomalies.iter().any(|a| matches!(a.anomaly_type, AnomalyType::MemoryLeak)));
    }
}