Skip to main content

trustformers_debug/gradient_debugger/
performance_tracking.rs

1//! Performance Tracking and Bottleneck Analysis for Gradient Computation
2//!
3//! This module provides comprehensive performance tracking capabilities for gradient
4//! computation, including bottleneck identification, throughput analysis, and
5//! resource utilization monitoring.
6
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::time::{Duration, Instant};
10
/// Performance tracking for gradient computation
///
/// Aggregates per-layer timing and memory statistics, derives tracker-wide
/// averages and throughput, and flags statistically slow ("bottleneck")
/// layers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GradientPerformanceTracker {
    /// Total observations recorded via `record_layer_performance`.
    pub total_gradient_computations: usize,
    /// Mean of the per-layer *average* computation times (layers weighted equally).
    pub average_computation_time: Duration,
    /// Sum of the per-layer average memory usages, in bytes.
    pub memory_usage_bytes: usize,
    /// Reciprocal of `average_computation_time` (gradients per second).
    pub throughput_gradients_per_second: f64,
    /// Layers whose average time exceeds mean + 1.5 std-dev across layers.
    pub bottleneck_layers: Vec<String>,
    /// Per-layer statistics keyed by layer name.
    pub layer_performance_map: HashMap<String, LayerPerformanceMetrics>,
    /// Latest resource utilization gauges.
    pub resource_utilization: ResourceUtilization,
    /// Snapshots used by trend analysis. NOTE(review): nothing in this module
    /// pushes into this vector — confirm callers populate it.
    pub performance_history: Vec<PerformanceSnapshot>,
}
23
24impl Default for GradientPerformanceTracker {
25    fn default() -> Self {
26        Self {
27            total_gradient_computations: 0,
28            average_computation_time: Duration::from_millis(0),
29            memory_usage_bytes: 0,
30            throughput_gradients_per_second: 0.0,
31            bottleneck_layers: Vec::new(),
32            layer_performance_map: HashMap::new(),
33            resource_utilization: ResourceUtilization::default(),
34            performance_history: Vec::new(),
35        }
36    }
37}
38
39impl GradientPerformanceTracker {
40    pub fn new() -> Self {
41        Self::default()
42    }
43
44    pub fn start_timing(&mut self, layer_name: &str) -> PerformanceTimer {
45        PerformanceTimer::new(layer_name.to_string())
46    }
47
48    pub fn record_layer_performance(
49        &mut self,
50        layer_name: &str,
51        computation_time: Duration,
52        memory_used: usize,
53    ) {
54        let metrics = self
55            .layer_performance_map
56            .entry(layer_name.to_string())
57            .or_insert_with(|| LayerPerformanceMetrics::new(layer_name.to_string()));
58
59        metrics.update(computation_time, memory_used);
60        self.total_gradient_computations += 1;
61
62        // Update overall averages
63        self.update_overall_metrics();
64        self.identify_bottlenecks();
65    }
66
67    fn update_overall_metrics(&mut self) {
68        if self.layer_performance_map.is_empty() {
69            return;
70        }
71
72        let total_time: Duration =
73            self.layer_performance_map.values().map(|m| m.average_computation_time).sum();
74
75        let total_layers = self.layer_performance_map.len();
76        self.average_computation_time = total_time / total_layers as u32;
77
78        self.memory_usage_bytes =
79            self.layer_performance_map.values().map(|m| m.average_memory_usage).sum();
80
81        // Calculate throughput
82        if self.average_computation_time.as_secs_f64() > 0.0 {
83            self.throughput_gradients_per_second =
84                1.0 / self.average_computation_time.as_secs_f64();
85        }
86    }
87
88    fn identify_bottlenecks(&mut self) {
89        self.bottleneck_layers.clear();
90
91        if self.layer_performance_map.len() < 2 {
92            return;
93        }
94
95        // Calculate mean and standard deviation of computation times
96        let times: Vec<f64> = self
97            .layer_performance_map
98            .values()
99            .map(|m| m.average_computation_time.as_secs_f64())
100            .collect();
101
102        let mean = times.iter().sum::<f64>() / times.len() as f64;
103        let variance = times.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / times.len() as f64;
104        let std_dev = variance.sqrt();
105
106        // Identify layers that are significantly slower than average
107        let threshold = mean + 1.5 * std_dev;
108
109        for (layer_name, metrics) in &self.layer_performance_map {
110            if metrics.average_computation_time.as_secs_f64() > threshold {
111                self.bottleneck_layers.push(layer_name.clone());
112            }
113        }
114    }
115
116    pub fn get_performance_trends(&self) -> PerformanceTrends {
117        if self.performance_history.len() < 2 {
118            return PerformanceTrends::default();
119        }
120
121        let recent_snapshots: Vec<&PerformanceSnapshot> =
122            self.performance_history.iter().rev().take(10).collect();
123
124        let older_snapshots: Vec<&PerformanceSnapshot> =
125            self.performance_history.iter().rev().skip(10).take(10).collect();
126
127        if older_snapshots.is_empty() {
128            return PerformanceTrends::default();
129        }
130
131        let recent_avg_throughput = recent_snapshots.iter().map(|s| s.throughput).sum::<f64>()
132            / recent_snapshots.len() as f64;
133
134        let older_avg_throughput = older_snapshots.iter().map(|s| s.throughput).sum::<f64>()
135            / older_snapshots.len() as f64;
136
137        let recent_avg_memory =
138            recent_snapshots.iter().map(|s| s.memory_usage).sum::<usize>() / recent_snapshots.len();
139
140        let older_avg_memory =
141            older_snapshots.iter().map(|s| s.memory_usage).sum::<usize>() / older_snapshots.len();
142
143        PerformanceTrends {
144            throughput_trend: Self::classify_trend(recent_avg_throughput, older_avg_throughput),
145            memory_trend: Self::classify_trend(recent_avg_memory as f64, older_avg_memory as f64),
146            bottleneck_stability: self
147                .analyze_bottleneck_stability(&recent_snapshots, &older_snapshots),
148            overall_performance_direction: self
149                .analyze_overall_direction(&recent_snapshots, &older_snapshots),
150        }
151    }
152
153    fn classify_trend(recent: f64, older: f64) -> TrendDirection {
154        let change_ratio = (recent - older) / older.max(1e-10);
155        let threshold = 0.05; // 5% change threshold
156
157        if change_ratio > threshold {
158            TrendDirection::Improving
159        } else if change_ratio < -threshold {
160            TrendDirection::Degrading
161        } else {
162            TrendDirection::Stable
163        }
164    }
165
166    fn analyze_bottleneck_stability(
167        &self,
168        recent: &[&PerformanceSnapshot],
169        older: &[&PerformanceSnapshot],
170    ) -> BottleneckStability {
171        let recent_bottlenecks: std::collections::HashSet<&String> =
172            recent.iter().flat_map(|s| &s.active_bottlenecks).collect();
173
174        let older_bottlenecks: std::collections::HashSet<&String> =
175            older.iter().flat_map(|s| &s.active_bottlenecks).collect();
176
177        let intersection_size = recent_bottlenecks.intersection(&older_bottlenecks).count();
178        let union_size = recent_bottlenecks.union(&older_bottlenecks).count();
179
180        if union_size == 0 {
181            return BottleneckStability::Stable;
182        }
183
184        let stability_ratio = intersection_size as f64 / union_size as f64;
185
186        if stability_ratio > 0.8 {
187            BottleneckStability::Stable
188        } else if stability_ratio > 0.5 {
189            BottleneckStability::Moderate
190        } else {
191            BottleneckStability::Unstable
192        }
193    }
194
195    fn analyze_overall_direction(
196        &self,
197        recent: &[&PerformanceSnapshot],
198        older: &[&PerformanceSnapshot],
199    ) -> PerformanceDirection {
200        let recent_avg_time =
201            recent.iter().map(|s| s.average_time.as_secs_f64()).sum::<f64>() / recent.len() as f64;
202
203        let older_avg_time =
204            older.iter().map(|s| s.average_time.as_secs_f64()).sum::<f64>() / older.len() as f64;
205
206        if recent_avg_time < older_avg_time * 0.95 {
207            PerformanceDirection::Improving
208        } else if recent_avg_time > older_avg_time * 1.05 {
209            PerformanceDirection::Degrading
210        } else {
211            PerformanceDirection::Stable
212        }
213    }
214
215    pub fn generate_optimization_recommendations(&self) -> Vec<OptimizationRecommendation> {
216        let mut recommendations = Vec::new();
217
218        // Analyze bottlenecks
219        for layer_name in &self.bottleneck_layers {
220            if let Some(metrics) = self.layer_performance_map.get(layer_name) {
221                recommendations.push(OptimizationRecommendation {
222                    layer_name: layer_name.clone(),
223                    issue_type: OptimizationIssue::ComputationalBottleneck,
224                    severity: self.calculate_bottleneck_severity(metrics),
225                    recommendations: vec![
226                        format!("Consider optimizing {} layer computation", layer_name),
227                        "Check for inefficient operations or memory access patterns".to_string(),
228                        "Consider layer-specific optimizations or hardware acceleration"
229                            .to_string(),
230                    ],
231                    expected_improvement: self.estimate_improvement_potential(metrics),
232                });
233            }
234        }
235
236        // Memory usage analysis
237        if self.memory_usage_bytes > 1_000_000_000 {
238            // > 1GB
239            recommendations.push(OptimizationRecommendation {
240                layer_name: "Global".to_string(),
241                issue_type: OptimizationIssue::HighMemoryUsage,
242                severity: OptimizationSeverity::High,
243                recommendations: vec![
244                    "Consider gradient checkpointing to reduce memory usage".to_string(),
245                    "Optimize batch size and sequence length".to_string(),
246                    "Use memory-efficient attention mechanisms".to_string(),
247                ],
248                expected_improvement: 0.3,
249            });
250        }
251
252        // Low throughput analysis
253        if self.throughput_gradients_per_second < 1.0 {
254            recommendations.push(OptimizationRecommendation {
255                layer_name: "Global".to_string(),
256                issue_type: OptimizationIssue::LowThroughput,
257                severity: OptimizationSeverity::Medium,
258                recommendations: vec![
259                    "Consider mixed precision training".to_string(),
260                    "Optimize data loading and preprocessing pipelines".to_string(),
261                    "Use gradient accumulation for larger effective batch sizes".to_string(),
262                ],
263                expected_improvement: 0.4,
264            });
265        }
266
267        recommendations
268    }
269
270    fn calculate_bottleneck_severity(
271        &self,
272        metrics: &LayerPerformanceMetrics,
273    ) -> OptimizationSeverity {
274        let relative_slowness = metrics.average_computation_time.as_secs_f64()
275            / self.average_computation_time.as_secs_f64();
276
277        if relative_slowness > 3.0 {
278            OptimizationSeverity::Critical
279        } else if relative_slowness > 2.0 {
280            OptimizationSeverity::High
281        } else if relative_slowness > 1.5 {
282            OptimizationSeverity::Medium
283        } else {
284            OptimizationSeverity::Low
285        }
286    }
287
288    fn estimate_improvement_potential(&self, metrics: &LayerPerformanceMetrics) -> f64 {
289        let relative_slowness = metrics.average_computation_time.as_secs_f64()
290            / self.average_computation_time.as_secs_f64();
291
292        // Estimate potential improvement based on how much slower this layer is
293        (relative_slowness - 1.0).min(0.8).max(0.1)
294    }
295
296    /// Start monitoring performance
297    pub fn start_monitoring(&mut self) {
298        // Reset performance tracking state
299        self.total_gradient_computations = 0;
300        self.average_computation_time = Duration::from_millis(0);
301        self.memory_usage_bytes = 0;
302        self.throughput_gradients_per_second = 0.0;
303        self.bottleneck_layers.clear();
304        self.layer_performance_map.clear();
305
306        // Initialize resource utilization monitoring
307        self.resource_utilization = ResourceUtilization {
308            cpu_usage_percent: 0.0,
309            memory_usage_percent: 0.0,
310            gpu_usage_percent: 0.0,
311            io_wait_percent: 0.0,
312        };
313    }
314
315    /// Take a performance snapshot
316    pub fn take_performance_snapshot(&self) -> PerformanceSnapshot {
317        PerformanceSnapshot {
318            timestamp: std::time::SystemTime::now(),
319            total_computations: self.total_gradient_computations,
320            average_time: self.average_computation_time,
321            memory_usage: self.memory_usage_bytes,
322            throughput: self.throughput_gradients_per_second,
323            active_bottlenecks: self.bottleneck_layers.clone(),
324            layer_count: self.layer_performance_map.len(),
325        }
326    }
327}
328
/// Layer-specific performance metrics
///
/// Running totals, averages, extrema, and variance for one layer's gradient
/// computations, updated incrementally via [`LayerPerformanceMetrics::update`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerPerformanceMetrics {
    /// Name of the layer these metrics describe.
    pub layer_name: String,
    /// Number of observations recorded.
    pub computation_count: usize,
    /// Sum of all observed computation times.
    pub total_computation_time: Duration,
    /// `total_computation_time / computation_count`.
    pub average_computation_time: Duration,
    /// Sum of observed memory usage, in bytes.
    pub total_memory_usage: usize,
    /// `total_memory_usage / computation_count`, in bytes.
    pub average_memory_usage: usize,
    /// Fastest observation; starts at a huge sentinel before any data arrives.
    pub min_computation_time: Duration,
    /// Slowest observation.
    pub max_computation_time: Duration,
    /// Approximate running variance of computation times, in seconds squared.
    pub performance_variance: f64,
}
342
343impl LayerPerformanceMetrics {
344    pub fn new(layer_name: String) -> Self {
345        Self {
346            layer_name,
347            computation_count: 0,
348            total_computation_time: Duration::from_millis(0),
349            average_computation_time: Duration::from_millis(0),
350            total_memory_usage: 0,
351            average_memory_usage: 0,
352            min_computation_time: Duration::from_secs(u64::MAX),
353            max_computation_time: Duration::from_millis(0),
354            performance_variance: 0.0,
355        }
356    }
357
358    pub fn update(&mut self, computation_time: Duration, memory_used: usize) {
359        self.computation_count += 1;
360        self.total_computation_time += computation_time;
361        self.total_memory_usage += memory_used;
362
363        self.average_computation_time = self.total_computation_time / self.computation_count as u32;
364        self.average_memory_usage = self.total_memory_usage / self.computation_count;
365
366        if computation_time < self.min_computation_time {
367            self.min_computation_time = computation_time;
368        }
369        if computation_time > self.max_computation_time {
370            self.max_computation_time = computation_time;
371        }
372
373        self.update_variance(computation_time);
374    }
375
    /// Updates the running variance of per-call computation times (seconds)
    /// with one new observation.
    ///
    /// NOTE(review): this is an approximate incremental scheme, not exact
    /// Welford — `mean` here is the *post-update* running average (refreshed
    /// by `update` before this is called) rather than the pre-update mean, so
    /// the result drifts slightly from the true sample variance. Confirm that
    /// precision is acceptable before relying on this figure.
    fn update_variance(&mut self, new_time: Duration) {
        // A single observation has no spread.
        if self.computation_count < 2 {
            self.performance_variance = 0.0;
            return;
        }

        let mean = self.average_computation_time.as_secs_f64();
        let new_value = new_time.as_secs_f64();

        // Incremental variance calculation: blend the old variance (weighted
        // by n - 1 prior samples) with the new squared deviation.
        let old_variance = self.performance_variance;
        let delta = new_value - mean;
        self.performance_variance = ((self.computation_count - 1) as f64 * old_variance
            + delta * delta)
            / self.computation_count as f64;
    }
392}
393
/// Performance timer for measuring gradient computation time
///
/// Created via [`GradientPerformanceTracker::start_timing`]; consume it with
/// [`finish`](Self::finish) to obtain the layer name and elapsed wall-clock
/// time. Marked `#[must_use]` because a timer that is dropped without
/// `finish` measures nothing — silently ignoring it is almost certainly a bug.
#[derive(Debug)]
#[must_use = "a PerformanceTimer measures nothing unless `finish` is called"]
pub struct PerformanceTimer {
    /// Layer being timed; returned verbatim by `finish`.
    layer_name: String,
    /// Wall-clock start, captured at construction.
    start_time: Instant,
}

impl PerformanceTimer {
    /// Starts timing immediately upon construction.
    pub fn new(layer_name: String) -> Self {
        Self {
            layer_name,
            start_time: Instant::now(),
        }
    }

    /// Stops the timer, returning the layer name and elapsed wall-clock time.
    pub fn finish(self) -> (String, Duration) {
        (self.layer_name, self.start_time.elapsed())
    }
}
413
/// Resource utilization metrics
///
/// Point-in-time utilization gauges, each expressed as a percentage.
/// NOTE(review): nothing in this module measures these values — they are only
/// reset to zero in `start_monitoring` — confirm what populates them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceUtilization {
    /// CPU busy percentage.
    pub cpu_usage_percent: f64,
    /// GPU busy percentage.
    pub gpu_usage_percent: f64,
    /// Main-memory usage percentage.
    pub memory_usage_percent: f64,
    /// Time spent waiting on I/O, as a percentage.
    pub io_wait_percent: f64,
}
422
// All gauges start at zero.
// NOTE(review): every field is 0.0, so `#[derive(Default)]` on the struct
// would make this hand-written impl redundant.
impl Default for ResourceUtilization {
    fn default() -> Self {
        Self {
            cpu_usage_percent: 0.0,
            gpu_usage_percent: 0.0,
            memory_usage_percent: 0.0,
            io_wait_percent: 0.0,
        }
    }
}
433
/// Performance snapshot at a point in time
///
/// Produced by [`GradientPerformanceTracker::take_performance_snapshot`];
/// consumed in windows of up to ten by the trend-analysis methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSnapshot {
    /// Wall-clock time the snapshot was taken.
    pub timestamp: std::time::SystemTime,
    /// Total gradient computations recorded so far.
    pub total_computations: usize,
    /// Tracker-wide average computation time at snapshot time.
    pub average_time: Duration,
    /// Tracker-wide memory usage in bytes.
    pub memory_usage: usize,
    /// Gradients per second at snapshot time.
    pub throughput: f64,
    /// Names of layers flagged as bottlenecks.
    pub active_bottlenecks: Vec<String>,
    /// Number of layers with recorded metrics.
    pub layer_count: usize,
}
445
/// Performance trends analysis
///
/// Summary of how performance evolved between the ten most recent snapshots
/// and the ten preceding them (see `get_performance_trends`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceTrends {
    /// Direction of throughput change (a rise classifies as `Improving`).
    pub throughput_trend: TrendDirection,
    /// Direction of memory-usage change. NOTE(review): `classify_trend` maps
    /// a *rise* to `Improving`, which for memory may be inverted — confirm.
    pub memory_trend: TrendDirection,
    /// Consistency of the bottleneck set across the two windows.
    pub bottleneck_stability: BottleneckStability,
    /// Net movement of average computation time.
    pub overall_performance_direction: PerformanceDirection,
}
454
// Neutral result with everything reported as stable; returned when there is
// not enough snapshot history to compare windows.
impl Default for PerformanceTrends {
    fn default() -> Self {
        Self {
            throughput_trend: TrendDirection::Stable,
            memory_trend: TrendDirection::Stable,
            bottleneck_stability: BottleneckStability::Stable,
            overall_performance_direction: PerformanceDirection::Stable,
        }
    }
}
465
/// Direction of change for a tracked scalar metric, assigned by
/// `GradientPerformanceTracker::classify_trend` with a 5% threshold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendDirection {
    /// Value rose by more than 5%.
    Improving,
    /// Value changed by no more than 5% in either direction.
    Stable,
    /// Value fell by more than 5%.
    Degrading,
}
472
/// How consistent the set of bottleneck layers is across snapshot windows,
/// graded from the Jaccard similarity of the recent vs. older name sets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckStability {
    /// Similarity above 0.8 (or no bottlenecks in either window).
    Stable,
    /// Similarity above 0.5 but not more than 0.8.
    Moderate,
    /// Similarity of 0.5 or below.
    Unstable,
}
479
/// Overall movement of average computation time between snapshot windows
/// (see `analyze_overall_direction`; lower time is better).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PerformanceDirection {
    /// Recent average time fell more than 5% below the older average.
    Improving,
    /// Change stayed within the ±5% band.
    Stable,
    /// Recent average time rose more than 5% above the older average.
    Degrading,
}
486
/// Optimization recommendation
///
/// One actionable finding produced by
/// [`GradientPerformanceTracker::generate_optimization_recommendations`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationRecommendation {
    /// Affected layer name, or `"Global"` for tracker-wide findings.
    pub layer_name: String,
    /// Category of problem detected.
    pub issue_type: OptimizationIssue,
    /// Urgency grading of the finding.
    pub severity: OptimizationSeverity,
    /// Human-readable suggested remediations.
    pub recommendations: Vec<String>,
    /// Estimated fractional improvement if acted on (as produced here:
    /// 0.1–0.8 for bottlenecks, fixed 0.3 / 0.4 for the global findings).
    pub expected_improvement: f64,
}
496
/// Categories of performance problems a recommendation can target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum OptimizationIssue {
    /// A layer is significantly slower than the others.
    ComputationalBottleneck,
    /// Aggregate memory usage exceeded the 1 GB threshold.
    HighMemoryUsage,
    /// Throughput fell below one gradient per second.
    LowThroughput,
    /// NOTE(review): never emitted by this module's recommendation generator —
    /// confirm external users before removing.
    ResourceUnderutilization,
}
504
/// Urgency grading for an optimization recommendation, ordered from least to
/// most severe (thresholds per `calculate_bottleneck_severity`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum OptimizationSeverity {
    /// Layer at most 1.5x slower than the overall average.
    Low,
    /// Layer 1.5–2x slower than average (also used for the low-throughput finding).
    Medium,
    /// Layer 2–3x slower than average (also used for the high-memory finding).
    High,
    /// Layer more than 3x slower than the overall average.
    Critical,
}