// trustformers_debug/model_diagnostics/performance.rs

use super::types::{ModelPerformanceMetrics, PerformanceSummary};
/// Tracks per-step training metrics and derives summaries, trends,
/// anomaly reports, and optimization recommendations from them.
#[derive(Debug)]
pub struct PerformanceAnalyzer {
    // Rolling window of recorded metrics, oldest entry first.
    performance_history: Vec<ModelPerformanceMetrics>,
    // Maximum number of entries retained in `performance_history`.
    max_history_length: usize,
    // Limits used when flagging anomalies and building recommendations.
    thresholds: PerformanceThresholds,
}
19
/// Tunable limits that control anomaly detection and recommendations.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Peak memory budget in megabytes; exceeding it triggers a memory recommendation.
    pub max_memory_mb: f64,
    /// Minimum acceptable throughput in samples per second.
    pub min_throughput: f64,
    /// Maximum tolerated relative loss increase, in percent, between
    /// consecutive 10-step windows.
    pub max_loss_increase_percent: f64,
    /// Maximum tolerated loss volatility (coefficient of variation)
    /// before training is considered unstable.
    pub max_loss_variance: f64,
}
32
33impl Default for PerformanceThresholds {
34 fn default() -> Self {
35 Self {
36 max_memory_mb: 8192.0, min_throughput: 100.0,
38 max_loss_increase_percent: 10.0,
39 max_loss_variance: 0.1,
40 }
41 }
42}
43
44impl PerformanceAnalyzer {
45 pub fn new() -> Self {
47 Self {
48 performance_history: Vec::new(),
49 max_history_length: 1000,
50 thresholds: PerformanceThresholds::default(),
51 }
52 }
53
54 pub fn with_thresholds(thresholds: PerformanceThresholds) -> Self {
56 Self {
57 performance_history: Vec::new(),
58 max_history_length: 1000,
59 thresholds,
60 }
61 }
62
63 pub fn set_max_history_length(&mut self, length: usize) {
65 self.max_history_length = length;
66 if self.performance_history.len() > length {
67 self.performance_history.drain(0..self.performance_history.len() - length);
68 }
69 }
70
71 pub fn record_performance(&mut self, metrics: ModelPerformanceMetrics) {
73 self.performance_history.push(metrics);
74
75 if self.performance_history.len() > self.max_history_length {
77 self.performance_history.remove(0);
78 }
79 }
80
81 pub fn record_metrics(&mut self, metrics: ModelPerformanceMetrics) {
83 self.record_performance(metrics);
84 }
85
86 pub fn get_performance_history(&self) -> &[ModelPerformanceMetrics] {
88 &self.performance_history
89 }
90
91 pub fn generate_performance_summary(&self) -> PerformanceSummary {
93 if self.performance_history.is_empty() {
94 return PerformanceSummary::default();
95 }
96
97 let total_steps = self.performance_history.len();
98 let current_metrics = self.performance_history.last().unwrap();
99
100 let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
101 let throughputs: Vec<f64> =
102 self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
103 let memory_usages: Vec<f64> =
104 self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
105
106 let best_loss = losses.iter().fold(f64::INFINITY, |acc, &x| acc.min(x));
107 let avg_loss = losses.iter().sum::<f64>() / losses.len() as f64;
108 let avg_throughput = throughputs.iter().sum::<f64>() / throughputs.len() as f64;
109 let peak_memory_mb = memory_usages.iter().fold(0.0f64, |acc, &x| acc.max(x));
110 let avg_memory_mb = memory_usages.iter().sum::<f64>() / memory_usages.len() as f64;
111
112 PerformanceSummary {
113 total_steps,
114 current_loss: current_metrics.loss,
115 best_loss,
116 avg_loss,
117 current_throughput: current_metrics.throughput_samples_per_sec,
118 avg_throughput,
119 peak_memory_mb,
120 avg_memory_mb,
121 }
122 }
123
124 pub fn analyze_performance_trends(&self) -> PerformanceTrends {
126 if self.performance_history.len() < 10 {
127 return PerformanceTrends::default();
128 }
129
130 let losses: Vec<f64> = self.performance_history.iter().map(|m| m.loss).collect();
131 let throughputs: Vec<f64> =
132 self.performance_history.iter().map(|m| m.throughput_samples_per_sec).collect();
133 let memory_usages: Vec<f64> =
134 self.performance_history.iter().map(|m| m.memory_usage_mb).collect();
135
136 let loss_trend = self.compute_trend(&losses);
137 let throughput_trend = self.compute_trend(&throughputs);
138 let memory_trend = self.compute_trend(&memory_usages);
139
140 let loss_volatility = self.compute_volatility(&losses);
141 let throughput_volatility = self.compute_volatility(&throughputs);
142
143 PerformanceTrends {
144 loss_trend,
145 throughput_trend,
146 memory_trend,
147 loss_volatility,
148 throughput_volatility,
149 trend_confidence: self.compute_trend_confidence(&losses),
150 }
151 }
152
153 pub fn detect_performance_anomalies(&self) -> Vec<PerformanceAnomaly> {
155 let mut anomalies = Vec::new();
156
157 if self.performance_history.len() < 5 {
158 return anomalies;
159 }
160
161 if let Some(anomaly) = self.detect_memory_leak() {
163 anomalies.push(anomaly);
164 }
165
166 if let Some(anomaly) = self.detect_performance_degradation() {
168 anomalies.push(anomaly);
169 }
170
171 if let Some(anomaly) = self.detect_training_instability() {
173 anomalies.push(anomaly);
174 }
175
176 if let Some(anomaly) = self.detect_throughput_drops() {
178 anomalies.push(anomaly);
179 }
180
181 anomalies
182 }
183
184 pub fn generate_optimization_recommendations(&self) -> Vec<OptimizationRecommendation> {
186 let mut recommendations = Vec::new();
187 let summary = self.generate_performance_summary();
188
189 if summary.peak_memory_mb > self.thresholds.max_memory_mb {
191 recommendations.push(OptimizationRecommendation {
192 category: "Memory".to_string(),
193 priority: PerformanceRecommendationPriority::High,
194 description: "High memory usage detected".to_string(),
195 suggestion: "Consider reducing batch size or using gradient checkpointing"
196 .to_string(),
197 expected_improvement: 0.3,
198 });
199 }
200
201 if summary.avg_throughput < self.thresholds.min_throughput {
203 recommendations.push(OptimizationRecommendation {
204 category: "Throughput".to_string(),
205 priority: PerformanceRecommendationPriority::Medium,
206 description: "Low throughput detected".to_string(),
207 suggestion: "Consider increasing batch size or optimizing data loading".to_string(),
208 expected_improvement: 0.4,
209 });
210 }
211
212 let trends = self.analyze_performance_trends();
214 if trends.loss_trend > 0.01 {
215 recommendations.push(OptimizationRecommendation {
216 category: "Training".to_string(),
217 priority: PerformanceRecommendationPriority::High,
218 description: "Loss is increasing".to_string(),
219 suggestion: "Consider reducing learning rate or adding regularization".to_string(),
220 expected_improvement: 0.5,
221 });
222 }
223
224 recommendations
225 }
226
227 fn compute_trend(&self, values: &[f64]) -> f64 {
229 if values.len() < 2 {
230 return 0.0;
231 }
232
233 let n = values.len() as f64;
234 let x_mean = (n - 1.0) / 2.0;
235 let y_mean = values.iter().sum::<f64>() / n;
236
237 let mut numerator = 0.0;
238 let mut denominator = 0.0;
239
240 for (i, &y) in values.iter().enumerate() {
241 let x = i as f64;
242 numerator += (x - x_mean) * (y - y_mean);
243 denominator += (x - x_mean).powi(2);
244 }
245
246 if denominator == 0.0 {
247 0.0
248 } else {
249 numerator / denominator
250 }
251 }
252
253 fn compute_volatility(&self, values: &[f64]) -> f64 {
255 if values.len() < 2 {
256 return 0.0;
257 }
258
259 let mean = values.iter().sum::<f64>() / values.len() as f64;
260 let variance =
261 values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
262 let std_dev = variance.sqrt();
263
264 if mean == 0.0 {
265 0.0
266 } else {
267 std_dev / mean.abs()
268 }
269 }
270
271 fn compute_trend_confidence(&self, values: &[f64]) -> f64 {
273 if values.len() < 10 {
274 return 0.0;
275 }
276
277 let trend = self.compute_trend(values);
278 let volatility = self.compute_volatility(values);
279
280 let trend_strength = trend.abs();
282 let confidence = trend_strength / (1.0 + volatility);
283 confidence.min(1.0)
284 }
285
286 fn detect_memory_leak(&self) -> Option<PerformanceAnomaly> {
288 if self.performance_history.len() < 10 {
289 return None;
290 }
291
292 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
293 let memory_usages: Vec<f64> = recent_metrics.iter().map(|m| m.memory_usage_mb).collect();
294 let memory_trend = self.compute_trend(&memory_usages);
295
296 if memory_trend > 10.0 {
298 Some(PerformanceAnomaly {
300 anomaly_type: AnomalyType::MemoryLeak,
301 severity: AnomalySeverity::High,
302 description: format!("Memory usage increasing at {:.1} MB/step", memory_trend),
303 detected_at_step: self.performance_history.last().unwrap().training_step,
304 confidence: 0.8,
305 })
306 } else {
307 None
308 }
309 }
310
311 fn detect_performance_degradation(&self) -> Option<PerformanceAnomaly> {
313 if self.performance_history.len() < 20 {
314 return None;
315 }
316
317 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
318 let previous_metrics = &self.performance_history
319 [self.performance_history.len() - 20..self.performance_history.len() - 10];
320
321 let recent_avg_loss: f64 =
322 recent_metrics.iter().map(|m| m.loss).sum::<f64>() / recent_metrics.len() as f64;
323 let previous_avg_loss: f64 =
324 previous_metrics.iter().map(|m| m.loss).sum::<f64>() / previous_metrics.len() as f64;
325
326 let degradation_percent =
327 ((recent_avg_loss - previous_avg_loss) / previous_avg_loss) * 100.0;
328
329 if degradation_percent > self.thresholds.max_loss_increase_percent {
330 Some(PerformanceAnomaly {
331 anomaly_type: AnomalyType::PerformanceDegradation,
332 severity: AnomalySeverity::High,
333 description: format!("Performance degraded by {:.1}%", degradation_percent),
334 detected_at_step: self.performance_history.last().unwrap().training_step,
335 confidence: 0.9,
336 })
337 } else {
338 None
339 }
340 }
341
342 fn detect_training_instability(&self) -> Option<PerformanceAnomaly> {
344 if self.performance_history.len() < 10 {
345 return None;
346 }
347
348 let recent_metrics = &self.performance_history[self.performance_history.len() - 10..];
349 let losses: Vec<f64> = recent_metrics.iter().map(|m| m.loss).collect();
350 let volatility = self.compute_volatility(&losses);
351
352 if volatility > self.thresholds.max_loss_variance {
353 Some(PerformanceAnomaly {
354 anomaly_type: AnomalyType::TrainingInstability,
355 severity: AnomalySeverity::Medium,
356 description: format!("High loss volatility: {:.3}", volatility),
357 detected_at_step: self.performance_history.last().unwrap().training_step,
358 confidence: 0.7,
359 })
360 } else {
361 None
362 }
363 }
364
365 fn detect_throughput_drops(&self) -> Option<PerformanceAnomaly> {
367 if self.performance_history.len() < 10 {
368 return None;
369 }
370
371 let recent_metrics = &self.performance_history[self.performance_history.len() - 5..];
372 let avg_recent_throughput: f64 =
373 recent_metrics.iter().map(|m| m.throughput_samples_per_sec).sum::<f64>()
374 / recent_metrics.len() as f64;
375
376 if avg_recent_throughput < self.thresholds.min_throughput {
377 Some(PerformanceAnomaly {
378 anomaly_type: AnomalyType::ThroughputDrop,
379 severity: AnomalySeverity::Medium,
380 description: format!("Low throughput: {:.1} samples/sec", avg_recent_throughput),
381 detected_at_step: self.performance_history.last().unwrap().training_step,
382 confidence: 0.8,
383 })
384 } else {
385 None
386 }
387 }
388
389 pub fn clear(&mut self) {
391 self.performance_history.clear();
392 }
393}
394
impl Default for PerformanceAnalyzer {
    /// Equivalent to [`PerformanceAnalyzer::new`].
    fn default() -> Self {
        Self::new()
    }
}
400
/// Slopes and volatilities derived from the recorded history.
/// Slopes come from a least-squares fit against the step index.
#[derive(Debug, Clone)]
pub struct PerformanceTrends {
    /// Loss slope per step; positive means the loss is rising.
    pub loss_trend: f64,
    /// Throughput slope per step (samples/sec per step).
    pub throughput_trend: f64,
    /// Memory-usage slope per step (MB per step).
    pub memory_trend: f64,
    /// Coefficient of variation of the loss series.
    pub loss_volatility: f64,
    /// Coefficient of variation of the throughput series.
    pub throughput_volatility: f64,
    /// Heuristic confidence in the loss trend, in [0, 1].
    pub trend_confidence: f64,
}
417
impl Default for PerformanceTrends {
    /// All-zero trends, used when too few samples exist to fit anything.
    // NOTE(review): every field is 0.0, so `#[derive(Default)]` on the struct
    // would be equivalent and shorter.
    fn default() -> Self {
        Self {
            loss_trend: 0.0,
            throughput_trend: 0.0,
            memory_trend: 0.0,
            loss_volatility: 0.0,
            throughput_volatility: 0.0,
            trend_confidence: 0.0,
        }
    }
}
430
/// A single detected performance problem.
#[derive(Debug, Clone)]
pub struct PerformanceAnomaly {
    /// Which detector produced this finding.
    pub anomaly_type: AnomalyType,
    /// How serious the finding is considered.
    pub severity: AnomalySeverity,
    /// Human-readable description including the measured value.
    pub description: String,
    /// Training step of the most recent sample when the anomaly was found.
    pub detected_at_step: usize,
    /// Detector-assigned confidence in [0, 1] (fixed per detector).
    pub confidence: f64,
}
445
/// Kinds of anomalies the detectors can report.
#[derive(Debug, Clone)]
pub enum AnomalyType {
    /// Memory usage rising steadily across recent steps.
    MemoryLeak,
    /// Average loss increased significantly between windows.
    PerformanceDegradation,
    /// Loss fluctuating beyond the volatility threshold.
    TrainingInstability,
    /// Recent throughput fell below the configured floor.
    ThroughputDrop,
}
458
/// Severity scale for anomalies, from least to most serious.
// NOTE(review): `Critical` is declared but never produced by the detectors
// in this file — confirm whether other callers construct it.
#[derive(Debug, Clone)]
pub enum AnomalySeverity {
    Low,
    Medium,
    High,
    Critical,
}
471
/// A suggested action to improve training performance.
#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    /// Broad area the suggestion targets (e.g. "Memory", "Throughput", "Training").
    pub category: String,
    /// How urgently the suggestion should be acted on.
    pub priority: PerformanceRecommendationPriority,
    /// Short statement of the observed problem.
    pub description: String,
    /// Concrete remediation advice.
    pub suggestion: String,
    /// Heuristic estimate of the potential gain, in [0, 1]; not a measurement.
    pub expected_improvement: f64,
}
486
/// Priority scale for recommendations, from least to most urgent.
/// Serde derives allow recommendations to round-trip through reports.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum PerformanceRecommendationPriority {
    Low,
    Medium,
    High,
    Critical,
}
499
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Builds a metrics sample with fixed accuracy/learning-rate/batch/GPU
    /// values so individual tests only vary step, loss, memory, and throughput.
    fn create_test_metrics(
        step: usize,
        loss: f64,
        memory: f64,
        throughput: f64,
    ) -> ModelPerformanceMetrics {
        ModelPerformanceMetrics {
            training_step: step,
            loss,
            accuracy: Some(0.8),
            learning_rate: 0.001,
            batch_size: 32,
            throughput_samples_per_sec: throughput,
            memory_usage_mb: memory,
            gpu_utilization: Some(0.9),
            timestamp: Utc::now(),
        }
    }

    /// `new()` starts empty with the default 1000-entry cap.
    #[test]
    fn test_performance_analyzer_creation() {
        let analyzer = PerformanceAnalyzer::new();
        assert_eq!(analyzer.performance_history.len(), 0);
        assert_eq!(analyzer.max_history_length, 1000);
    }

    /// Recording one sample grows the history by one.
    #[test]
    fn test_record_performance() {
        let mut analyzer = PerformanceAnalyzer::new();
        let metrics = create_test_metrics(1, 0.5, 1000.0, 100.0);

        analyzer.record_performance(metrics);
        assert_eq!(analyzer.performance_history.len(), 1);
    }

    /// With strictly decreasing losses, the best loss must beat the average.
    #[test]
    fn test_performance_summary() {
        let mut analyzer = PerformanceAnalyzer::new();

        for i in 1..=5 {
            // Losses 1.0, 0.5, 0.333, ... — monotonically decreasing.
            let metrics = create_test_metrics(i, 1.0 / i as f64, 1000.0, 100.0);
            analyzer.record_performance(metrics);
        }

        let summary = analyzer.generate_performance_summary();
        assert_eq!(summary.total_steps, 5);
        assert!(summary.best_loss < summary.avg_loss);
    }

    /// A strictly increasing series must yield a positive slope.
    #[test]
    fn test_trend_computation() {
        let analyzer = PerformanceAnalyzer::new();
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let trend = analyzer.compute_trend(&values);
        assert!(trend > 0.0);
    }

    /// Memory growing 50 MB/step (above the 10 MB/step detector threshold)
    /// must be reported as a MemoryLeak anomaly.
    #[test]
    fn test_memory_leak_detection() {
        let mut analyzer = PerformanceAnalyzer::new();

        for i in 1..=15 {
            let metrics = create_test_metrics(i, 0.5, 1000.0 + (i as f64 * 50.0), 100.0);
            analyzer.record_performance(metrics);
        }

        let anomalies = analyzer.detect_performance_anomalies();
        assert!(!anomalies.is_empty());
        assert!(matches!(anomalies[0].anomaly_type, AnomalyType::MemoryLeak));
    }
}