// trustformers_debug/model_diagnostics/performance.rs
use super::types::{ModelPerformanceMetrics, PerformanceSummary};
8
/// Rolling analyzer over recorded [`ModelPerformanceMetrics`]: keeps a
/// bounded history of samples and derives summaries, trends, anomalies,
/// and optimization recommendations from it.
#[derive(Debug)]
pub struct PerformanceAnalyzer {
    // Recorded samples, oldest first; bounded by `max_history_length`.
    performance_history: Vec<ModelPerformanceMetrics>,
    // Maximum number of samples retained; oldest entries are evicted first.
    max_history_length: usize,
    // Limits used when flagging anomalies and recommendations.
    thresholds: PerformanceThresholds,
}
19
/// Tunable limits that decide when the analyzer flags a problem.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Peak memory (MB) above which a memory recommendation is emitted.
    pub max_memory_mb: f64,
    /// Minimum acceptable throughput (samples/sec).
    pub min_throughput: f64,
    /// Loss increase (%) between 10-sample windows that counts as degradation.
    pub max_loss_increase_percent: f64,
    /// Loss volatility (std dev / |mean|) above which training is unstable.
    pub max_loss_variance: f64,
}
32
33impl Default for PerformanceThresholds {
34 fn default() -> Self {
35 Self {
36 max_memory_mb: 8192.0, min_throughput: 100.0,
38 max_loss_increase_percent: 10.0,
39 max_loss_variance: 0.1,
40 }
41 }
42}
43
44impl PerformanceAnalyzer {
45 pub fn new() -> Self {
47 Self {
48 performance_history: Vec::new(),
49 max_history_length: 1000,
50 thresholds: PerformanceThresholds::default(),
51 }
52 }
53
54 pub fn with_thresholds(thresholds: PerformanceThresholds) -> Self {
56 Self {
57 performance_history: Vec::new(),
58 max_history_length: 1000,
59 thresholds,
60 }
61 }
62
63 pub fn set_max_history_length(&mut self, length: usize) {
65 self.max_history_length = length;
66 if self.performance_history.len() > length {
67 self.performance_history.drain(0..self.performance_history.len() - length);
68 }
69 }
70
71 pub fn record_performance(&mut self, metrics: ModelPerformanceMetrics) {
73 self.performance_history.push(metrics);
74
75 if self.performance_history.len() > self.max_history_length {
77 self.performance_history.remove(0);
78 }
79 }
80
81 pub fn record_metrics(&mut self, metrics: ModelPerformanceMetrics) {
83 self.record_performance(metrics);
84 }
85
86 pub fn get_performance_history(&self) -> &[ModelPerformanceMetrics] {
88 &self.performance_history
89 }
90
91 pub fn generate_performance_summary(&self) -> PerformanceSummary {
93 if self.performance_history.is_empty() {
94 return PerformanceSummary::default();
95 }
96
97 let total_steps = self.performance_history.len();
98 let current_metrics = self
99 .performance_history
100 .last()
101 .expect("performance_history is non-empty after is_empty check");
102
103 let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
104 let throughputs: Vec<f64> =
105 self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
106 let memory_usages: Vec<f64> =
107 self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
108
109 let best_loss = losses.iter().fold(f64::INFINITY, |acc, &x| acc.min(x));
110 let avg_loss = losses.iter().sum::<f64>() / losses.len() as f64;
111 let avg_throughput = throughputs.iter().sum::<f64>() / throughputs.len() as f64;
112 let peak_memory_mb = memory_usages.iter().fold(0.0f64, |acc, &x| acc.max(x));
113 let avg_memory_mb = memory_usages.iter().sum::<f64>() / memory_usages.len() as f64;
114
115 PerformanceSummary {
116 total_steps,
117 current_loss: current_metrics.loss,
118 best_loss,
119 avg_loss,
120 current_throughput: current_metrics.throughput_samples_per_sec,
121 avg_throughput,
122 peak_memory_mb,
123 avg_memory_mb,
124 }
125 }
126
127 pub fn analyze_performance_trends(&self) -> PerformanceTrends {
129 if self.performance_history.len() < 10 {
130 return PerformanceTrends::default();
131 }
132
133 let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
134 let throughputs: Vec<f64> =
135 self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
136 let memory_usages: Vec<f64> =
137 self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
138
139 let loss_trend = self.compute_trend(&losses);
140 let throughput_trend = self.compute_trend(&throughputs);
141 let memory_trend = self.compute_trend(&memory_usages);
142
143 let loss_volatility = self.compute_volatility(&losses);
144 let throughput_volatility = self.compute_volatility(&throughputs);
145
146 PerformanceTrends {
147 loss_trend,
148 throughput_trend,
149 memory_trend,
150 loss_volatility,
151 throughput_volatility,
152 trend_confidence: self.compute_trend_confidence(&losses),
153 }
154 }
155
156 pub fn detect_performance_anomalies(&self) -> Vec<PerformanceAnomaly> {
158 let mut anomalies = Vec::new();
159
160 if self.performance_history.len() < 5 {
161 return anomalies;
162 }
163
164 if let Some(anomaly) = self.detect_memory_leak() {
166 anomalies.push(anomaly);
167 }
168
169 if let Some(anomaly) = self.detect_performance_degradation() {
171 anomalies.push(anomaly);
172 }
173
174 if let Some(anomaly) = self.detect_training_instability() {
176 anomalies.push(anomaly);
177 }
178
179 if let Some(anomaly) = self.detect_throughput_drops() {
181 anomalies.push(anomaly);
182 }
183
184 anomalies
185 }
186
187 pub fn generate_optimization_recommendations(&self) -> Vec<OptimizationRecommendation> {
189 let mut recommendations = Vec::new();
190 let summary = self.generate_performance_summary();
191
192 if summary.peak_memory_mb > self.thresholds.max_memory_mb {
194 recommendations.push(OptimizationRecommendation {
195 category: "Memory".to_string(),
196 priority: PerformanceRecommendationPriority::High,
197 description: "High memory usage detected".to_string(),
198 suggestion: "Consider reducing batch size or using gradient checkpointing"
199 .to_string(),
200 expected_improvement: 0.3,
201 });
202 }
203
204 if summary.avg_throughput < self.thresholds.min_throughput {
206 recommendations.push(OptimizationRecommendation {
207 category: "Throughput".to_string(),
208 priority: PerformanceRecommendationPriority::Medium,
209 description: "Low throughput detected".to_string(),
210 suggestion: "Consider increasing batch size or optimizing data loading".to_string(),
211 expected_improvement: 0.4,
212 });
213 }
214
215 let trends = self.analyze_performance_trends();
217 if trends.loss_trend > 0.01 {
218 recommendations.push(OptimizationRecommendation {
219 category: "Training".to_string(),
220 priority: PerformanceRecommendationPriority::High,
221 description: "Loss is increasing".to_string(),
222 suggestion: "Consider reducing learning rate or adding regularization".to_string(),
223 expected_improvement: 0.5,
224 });
225 }
226
227 recommendations
228 }
229
230 fn compute_trend(&self, values: &[f64]) -> f64 {
232 if values.len() < 2 {
233 return 0.0;
234 }
235
236 let n = values.len() as f64;
237 let x_mean = (n - 1.0) / 2.0;
238 let y_mean = values.iter().sum::<f64>() / n;
239
240 let mut numerator = 0.0;
241 let mut denominator = 0.0;
242
243 for (i, &y) in values.iter().enumerate() {
244 let x = i as f64;
245 numerator += (x - x_mean) * (y - y_mean);
246 denominator += (x - x_mean).powi(2);
247 }
248
249 if denominator == 0.0 {
250 0.0
251 } else {
252 numerator / denominator
253 }
254 }
255
256 fn compute_volatility(&self, values: &[f64]) -> f64 {
258 if values.len() < 2 {
259 return 0.0;
260 }
261
262 let mean = values.iter().sum::<f64>() / values.len() as f64;
263 let variance =
264 values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
265 let std_dev = variance.sqrt();
266
267 if mean == 0.0 {
268 0.0
269 } else {
270 std_dev / mean.abs()
271 }
272 }
273
274 fn compute_trend_confidence(&self, values: &[f64]) -> f64 {
276 if values.len() < 10 {
277 return 0.0;
278 }
279
280 let trend = self.compute_trend(values);
281 let volatility = self.compute_volatility(values);
282
283 let trend_strength = trend.abs();
285 let confidence = trend_strength / (1.0 + volatility);
286 confidence.min(1.0)
287 }
288
289 fn detect_memory_leak(&self) -> Option<PerformanceAnomaly> {
291 if self.performance_history.len() < 10 {
292 return None;
293 }
294
295 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
296 let memory_usages: Vec<f64> = recent_metrics.iter().map(|m| m.memory_usage_mb).collect();
297 let memory_trend = self.compute_trend(&memory_usages);
298
299 if memory_trend > 10.0 {
301 Some(PerformanceAnomaly {
303 anomaly_type: AnomalyType::MemoryLeak,
304 severity: AnomalySeverity::High,
305 description: format!("Memory usage increasing at {:.1} MB/step", memory_trend),
306 detected_at_step: self
307 .performance_history
308 .last()
309 .expect("performance_history is non-empty after is_empty check")
310 .training_step,
311 confidence: 0.8,
312 })
313 } else {
314 None
315 }
316 }
317
318 fn detect_performance_degradation(&self) -> Option<PerformanceAnomaly> {
320 if self.performance_history.len() < 20 {
321 return None;
322 }
323
324 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
325 let previous_metrics = &self.performance_history
326 [self.performance_history.len() - 20..self.performance_history.len() - 10];
327
328 let recent_avg_loss: f64 =
329 recent_metrics.iter().map(|m| m.loss).sum::<f64>() / recent_metrics.len() as f64;
330 let previous_avg_loss: f64 =
331 previous_metrics.iter().map(|m| m.loss).sum::<f64>() / previous_metrics.len() as f64;
332
333 let degradation_percent =
334 ((recent_avg_loss - previous_avg_loss) / previous_avg_loss) * 100.0;
335
336 if degradation_percent > self.thresholds.max_loss_increase_percent {
337 Some(PerformanceAnomaly {
338 anomaly_type: AnomalyType::PerformanceDegradation,
339 severity: AnomalySeverity::High,
340 description: format!("Performance degraded by {:.1}%", degradation_percent),
341 detected_at_step: self
342 .performance_history
343 .last()
344 .expect("performance_history is non-empty after is_empty check")
345 .training_step,
346 confidence: 0.9,
347 })
348 } else {
349 None
350 }
351 }
352
353 fn detect_training_instability(&self) -> Option<PerformanceAnomaly> {
355 if self.performance_history.len() < 10 {
356 return None;
357 }
358
359 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
360 let losses: Vec<f64> = recent_metrics.iter().map(|m| m.loss).collect();
361 let volatility = self.compute_volatility(&losses);
362
363 if volatility > self.thresholds.max_loss_variance {
364 Some(PerformanceAnomaly {
365 anomaly_type: AnomalyType::TrainingInstability,
366 severity: AnomalySeverity::Medium,
367 description: format!("High loss volatility: {:.3}", volatility),
368 detected_at_step: self
369 .performance_history
370 .last()
371 .expect("performance_history is non-empty after is_empty check")
372 .training_step,
373 confidence: 0.7,
374 })
375 } else {
376 None
377 }
378 }
379
380 fn detect_throughput_drops(&self) -> Option<PerformanceAnomaly> {
382 if self.performance_history.len() < 10 {
383 return None;
384 }
385
386 let recent_metrics = &self.performance_history[self.performance_history.len() - 5..];
387 let avg_recent_throughput: f64 =
388 recent_metrics.iter().map(|m| m.throughput_samples_per_sec).sum::<f64>()
389 / recent_metrics.len() as f64;
390
391 if avg_recent_throughput < self.thresholds.min_throughput {
392 Some(PerformanceAnomaly {
393 anomaly_type: AnomalyType::ThroughputDrop,
394 severity: AnomalySeverity::Medium,
395 description: format!("Low throughput: {:.1} samples/sec", avg_recent_throughput),
396 detected_at_step: self
397 .performance_history
398 .last()
399 .expect("performance_history is non-empty after is_empty check")
400 .training_step,
401 confidence: 0.8,
402 })
403 } else {
404 None
405 }
406 }
407
408 pub fn clear(&mut self) {
410 self.performance_history.clear();
411 }
412}
413
414impl Default for PerformanceAnalyzer {
415 fn default() -> Self {
416 Self::new()
417 }
418}
419
/// Linear-regression slopes and volatility measures computed from the
/// recorded history by `PerformanceAnalyzer::analyze_performance_trends`.
#[derive(Debug, Clone)]
pub struct PerformanceTrends {
    /// Per-step slope of the loss series (positive means loss is rising).
    pub loss_trend: f64,
    /// Per-step slope of the throughput series (samples/sec per step).
    pub throughput_trend: f64,
    /// Per-step slope of the memory-usage series (MB per step).
    pub memory_trend: f64,
    /// Volatility (std dev / |mean|) of the loss series.
    pub loss_volatility: f64,
    /// Volatility (std dev / |mean|) of the throughput series.
    pub throughput_volatility: f64,
    /// Heuristic confidence in the loss trend, in [0, 1].
    pub trend_confidence: f64,
}
436
437impl Default for PerformanceTrends {
438 fn default() -> Self {
439 Self {
440 loss_trend: 0.0,
441 throughput_trend: 0.0,
442 memory_trend: 0.0,
443 loss_volatility: 0.0,
444 throughput_volatility: 0.0,
445 trend_confidence: 0.0,
446 }
447 }
448}
449
/// A detected performance problem, produced by
/// `PerformanceAnalyzer::detect_performance_anomalies`.
#[derive(Debug, Clone)]
pub struct PerformanceAnomaly {
    /// Which detector fired.
    pub anomaly_type: AnomalyType,
    /// How serious the anomaly is judged to be.
    pub severity: AnomalySeverity,
    /// Human-readable explanation with the measured value.
    pub description: String,
    /// Training step of the most recent sample when the anomaly was detected.
    pub detected_at_step: usize,
    /// Detector-assigned confidence, in [0, 1].
    pub confidence: f64,
}
464
/// Category of a detected [`PerformanceAnomaly`].
#[derive(Debug, Clone)]
pub enum AnomalyType {
    /// Sustained growth in memory usage.
    MemoryLeak,
    /// Average loss rising between recent windows.
    PerformanceDegradation,
    /// High loss volatility over recent samples.
    TrainingInstability,
    /// Recent throughput below the configured minimum.
    ThroughputDrop,
}
477
/// Severity level attached to a [`PerformanceAnomaly`].
#[derive(Debug, Clone)]
pub enum AnomalySeverity {
    /// Informational; no action required.
    Low,
    /// Worth investigating.
    Medium,
    /// Likely impacting training; should be addressed.
    High,
    /// Requires immediate attention.
    Critical,
}
490
/// An actionable suggestion produced by
/// `PerformanceAnalyzer::generate_optimization_recommendations`.
#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    /// Area the recommendation applies to (e.g. "Memory", "Throughput").
    pub category: String,
    /// How urgently the recommendation should be acted on.
    pub priority: PerformanceRecommendationPriority,
    /// What was observed.
    pub description: String,
    /// What to try in response.
    pub suggestion: String,
    /// Estimated relative improvement if applied, in [0, 1].
    pub expected_improvement: f64,
}
505
/// Urgency of an [`OptimizationRecommendation`]; serializable for reports.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum PerformanceRecommendationPriority {
    /// Nice to have.
    Low,
    /// Should be considered soon.
    Medium,
    /// Should be acted on promptly.
    High,
    /// Blocking; act immediately.
    Critical,
}
518
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Builds a metrics sample with fixed accuracy/learning-rate/batch-size
    /// fields so tests only vary step, loss, memory, and throughput.
    fn metrics_at(step: usize, loss: f64, memory: f64, throughput: f64) -> ModelPerformanceMetrics {
        ModelPerformanceMetrics {
            training_step: step,
            loss,
            accuracy: Some(0.8),
            learning_rate: 0.001,
            batch_size: 32,
            throughput_samples_per_sec: throughput,
            memory_usage_mb: memory,
            gpu_utilization: Some(0.9),
            timestamp: Utc::now(),
        }
    }

    #[test]
    fn test_performance_analyzer_creation() {
        let analyzer = PerformanceAnalyzer::new();
        assert!(analyzer.performance_history.is_empty());
        assert_eq!(analyzer.max_history_length, 1000);
    }

    #[test]
    fn test_record_performance() {
        let mut analyzer = PerformanceAnalyzer::new();
        analyzer.record_performance(metrics_at(1, 0.5, 1000.0, 100.0));
        assert_eq!(analyzer.performance_history.len(), 1);
    }

    #[test]
    fn test_performance_summary() {
        let mut analyzer = PerformanceAnalyzer::new();

        // Decreasing loss series: 1.0, 0.5, 0.33, 0.25, 0.2.
        for step in 1..=5 {
            analyzer.record_performance(metrics_at(step, 1.0 / step as f64, 1000.0, 100.0));
        }

        let summary = analyzer.generate_performance_summary();
        assert_eq!(summary.total_steps, 5);
        assert!(summary.best_loss < summary.avg_loss);
    }

    #[test]
    fn test_trend_computation() {
        let analyzer = PerformanceAnalyzer::new();
        // A strictly increasing series must have a positive slope.
        assert!(analyzer.compute_trend(&[1.0, 2.0, 3.0, 4.0, 5.0]) > 0.0);
    }

    #[test]
    fn test_memory_leak_detection() {
        let mut analyzer = PerformanceAnalyzer::new();

        // 50 MB/step of growth comfortably exceeds the 10 MB/step threshold.
        for step in 1..=15 {
            analyzer.record_performance(metrics_at(step, 0.5, 1000.0 + step as f64 * 50.0, 100.0));
        }

        let anomalies = analyzer.detect_performance_anomalies();
        assert!(!anomalies.is_empty());
        assert!(matches!(anomalies[0].anomaly_type, AnomalyType::MemoryLeak));
    }
}