trustformers_debug/gradient_debugger/
debugger.rs

1//! Main Gradient Debugger Implementation
2//!
3//! This module provides the main GradientDebugger that orchestrates all gradient
4//! debugging capabilities including monitoring, anomaly detection, performance tracking,
5//! conflict analysis, visualization, and enhanced analysis.
6
7use super::anomaly_detection::*;
8use super::conflict_analysis::*;
9use super::enhanced_analysis::*;
10use super::monitoring::*;
11use super::performance_tracking::*;
12use super::types::*;
13use super::visualization::*;
14use crate::DebugConfig;
15use anyhow::Result;
16use serde::{Deserialize, Serialize};
17use std::collections::HashMap;
18
/// Flow analysis for gradient flow patterns
///
/// Aggregates the per-layer [`LayerFlowAnalysis`] results produced during
/// report generation, keyed by layer name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlowAnalysis {
    // One analysis entry per monitored layer, keyed by layer name.
    pub layer_analyses: HashMap<String, LayerFlowAnalysis>,
}
24
/// Analysis of gradient flow for a specific layer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerFlowAnalysis {
    // Name of the layer this analysis describes.
    pub layer_name: String,
    // True when the latest norm is effectively zero or recent norms are
    // persistently tiny (see `generate_flow_analysis`).
    pub is_vanishing: bool,
    // True when the latest norm is very large or recent norms spiked.
    pub is_exploding: bool,
    // Most recently recorded gradient norm for the layer (0.0 if none).
    pub gradient_norm: f64,
    // Stability measure in (0, 1]; 1.0 means no variance in recorded norms.
    pub flow_consistency: f64,
}
34
/// Main gradient debugger
///
/// Orchestrates all gradient-debugging subsystems: per-layer histories and
/// adaptive thresholds, real-time monitors, anomaly detection, performance
/// tracking, conflict analysis, flow visualization and enhanced analysis.
#[derive(Debug)]
pub struct GradientDebugger {
    #[allow(dead_code)]
    config: DebugConfig,
    // Static thresholds (vanishing/exploding/dead-neuron/no-gradient)
    // consulted by the alert checks.
    gradient_config: GradientDebugConfig,
    // Per-layer history of recorded gradient statistics, keyed by layer name.
    gradient_histories: HashMap<String, GradientHistory>,
    // Current training step; advanced by `next_step`.
    current_step: usize,
    // Accumulated alerts; pruned to the most recent 100 in `next_step`.
    alerts: Vec<GradientAlert>,
    // Consecutive steps each layer has spent with a near-zero gradient norm.
    layer_no_gradient_count: HashMap<String, usize>,

    // Advanced features
    adaptive_thresholds: HashMap<String, AdaptiveThresholds>,
    real_time_monitors: HashMap<String, RealTimeGradientMonitor>,
    anomaly_detector: GradientAnomalyDetector,
    performance_tracker: GradientPerformanceTracker,
    conflict_analyzer: GradientConflictAnalyzer,
    flow_visualizer: GradientFlowVisualizer,
    enhanced_analyzer: EnhancedGradientAnalyzer,
}
55
56impl GradientDebugger {
57    /// Create a new gradient debugger
58    pub fn new(config: DebugConfig) -> Self {
59        let gradient_config = GradientDebugConfig::default();
60
61        Self {
62            config,
63            gradient_config: gradient_config.clone(),
64            gradient_histories: HashMap::new(),
65            current_step: 0,
66            alerts: Vec::new(),
67            layer_no_gradient_count: HashMap::new(),
68            adaptive_thresholds: HashMap::new(),
69            real_time_monitors: HashMap::new(),
70            anomaly_detector: GradientAnomalyDetector::default(),
71            performance_tracker: GradientPerformanceTracker::default(),
72            conflict_analyzer: GradientConflictAnalyzer::default(),
73            flow_visualizer: GradientFlowVisualizer::default(),
74            enhanced_analyzer: EnhancedGradientAnalyzer::default(),
75        }
76    }
77
78    /// Create with custom gradient configuration
79    pub fn with_gradient_config(config: DebugConfig, gradient_config: GradientDebugConfig) -> Self {
80        Self {
81            config,
82            gradient_config: gradient_config.clone(),
83            gradient_histories: HashMap::new(),
84            current_step: 0,
85            alerts: Vec::new(),
86            layer_no_gradient_count: HashMap::new(),
87            adaptive_thresholds: HashMap::new(),
88            real_time_monitors: HashMap::new(),
89            anomaly_detector: GradientAnomalyDetector::default(),
90            performance_tracker: GradientPerformanceTracker::default(),
91            conflict_analyzer: GradientConflictAnalyzer::default(),
92            flow_visualizer: GradientFlowVisualizer::default(),
93            enhanced_analyzer: EnhancedGradientAnalyzer::default(),
94        }
95    }
96
    /// Record gradient flow for a layer
    ///
    /// Builds a `GradientFlow` sample for the current step from the supplied
    /// summary statistics, appends it to the layer's history, refreshes the
    /// layer's adaptive thresholds and real-time monitor, runs the alert and
    /// anomaly checks, and records a (simplified) performance sample.
    ///
    /// # Errors
    /// Propagates any error from the alert-check stage.
    pub fn record_gradient_flow(
        &mut self,
        layer_name: &str,
        gradient_norm: f64,
        gradient_mean: f64,
        gradient_std: f64,
    ) -> Result<()> {
        let flow = GradientFlow {
            layer_name: layer_name.to_string(),
            step: self.current_step,
            gradient_norm,
            gradient_mean,
            gradient_std,
            // NOTE(review): max/min are approximated as mean ± one std dev,
            // not taken from the actual gradient tensor — confirm acceptable.
            gradient_max: gradient_mean + gradient_std,
            gradient_min: gradient_mean - gradient_std,
            // Heuristic bucketing of the aggregate norm (see helper).
            dead_neurons_ratio: self.estimate_dead_neurons_ratio(gradient_norm),
            active_neurons_ratio: 1.0 - self.estimate_dead_neurons_ratio(gradient_norm),
            timestamp: chrono::Utc::now(),
        };

        // Update gradient history (scoped so the mutable borrow of the map
        // ends before the later `&mut self` calls below).
        {
            let history = self
                .gradient_histories
                .entry(layer_name.to_string())
                .or_insert_with(|| GradientHistory::new(layer_name.to_string(), 1000));
            history.add_gradient_flow(&flow);
        }

        // Update adaptive thresholds, seeding a new layer's thresholds from
        // the static configuration.
        let thresholds =
            self.adaptive_thresholds.entry(layer_name.to_string()).or_insert_with(|| {
                AdaptiveThresholds::new(
                    layer_name.to_string(),
                    self.gradient_config.vanishing_threshold,
                    self.gradient_config.exploding_threshold,
                )
            });
        thresholds.update_thresholds(gradient_norm);

        // Update real-time monitor
        let monitor = self
            .real_time_monitors
            .entry(layer_name.to_string())
            .or_insert_with(|| RealTimeGradientMonitor::new(layer_name.to_string()));
        monitor.update(gradient_norm);

        // Check for alerts (adaptive/static thresholds, dead neurons,
        // oscillation).
        self.check_gradient_alerts(layer_name, &flow)?;

        // Record performance metrics
        // NOTE(review): the timer is started and finished back-to-back, so
        // the recorded computation time is essentially zero, and memory
        // usage is hard-coded to 0 — confirm this placeholder is intended.
        let timer = self.performance_tracker.start_timing(layer_name);
        let (_, computation_time) = timer.finish();
        self.performance_tracker
            .record_layer_performance(layer_name, computation_time, 0); // Memory usage simplified

        // Detect anomalies; each anomaly is surfaced as an oscillation alert
        // with the anomaly severity reported in the `variance` field.
        let anomalies =
            self.anomaly_detector
                .detect_anomalies(layer_name, gradient_norm, self.current_step);
        for anomaly in anomalies {
            self.alerts.push(GradientAlert::GradientOscillation {
                layer_name: anomaly.layer_name,
                variance: anomaly.severity,
            });
        }

        // Establish the anomaly-detection baseline exactly once, at the
        // moment the layer's history reaches 50 samples.
        if let Some(history) = self.gradient_histories.get(layer_name) {
            if history.gradient_norms.len() == 50 {
                let gradient_values: Vec<f64> = history.gradient_norms.iter().cloned().collect();
                self.anomaly_detector.establish_baseline(layer_name, &gradient_values);
            }
        }

        Ok(())
    }
175
176    /// Get current gradient debugging status
177    pub fn get_status(&self) -> GradientDebugStatus {
178        let layer_statuses: HashMap<String, LayerGradientStatus> = self
179            .gradient_histories
180            .iter()
181            .map(|(layer_name, history)| {
182                let status = self.compute_layer_status(layer_name, history);
183                (layer_name.clone(), status)
184            })
185            .collect();
186
187        let overall_health = self.compute_overall_health(&layer_statuses);
188        let recent_alerts: Vec<GradientAlert> =
189            self.alerts.iter().rev().take(10).cloned().collect();
190
191        GradientDebugStatus {
192            current_step: self.current_step,
193            overall_health,
194            layer_statuses,
195            recent_alerts,
196            total_alerts: self.alerts.len(),
197            active_layers: self.gradient_histories.len(),
198        }
199    }
200
    /// Generate flow analysis for report generation
    ///
    /// For every layer with a history, classifies vanishing/exploding
    /// behavior from the latest and recent gradient norms and derives a
    /// consistency score from the variance of all recorded norms.
    fn generate_flow_analysis(&self) -> FlowAnalysis {
        let mut layer_analyses = HashMap::new();

        for (layer_name, history) in &self.gradient_histories {
            // Latest recorded norm; 0.0 for an empty history.
            let latest_gradient = history.gradient_norms.back().cloned().unwrap_or(0.0);

            // Determine if gradients are vanishing or exploding
            // Vanishing: latest norm effectively zero, or the last 5 samples
            // all below 1e-6 (only checked once more than 5 samples exist).
            let is_vanishing = latest_gradient < 1e-8
                || (history.gradient_norms.len() > 5
                    && history.gradient_norms.iter().rev().take(5).all(|&g| g < 1e-6));

            // Exploding: latest norm above 100, or any of the last 3 samples
            // above 50 (only checked once more than 3 samples exist).
            let is_exploding = latest_gradient > 100.0
                || (history.gradient_norms.len() > 3
                    && history.gradient_norms.iter().rev().take(3).any(|&g| g > 50.0));

            // Calculate flow consistency (variance in gradient norms)
            let flow_consistency = if history.gradient_norms.len() > 1 {
                // Population variance over all recorded norms.
                let mean = history.gradient_norms.iter().sum::<f64>()
                    / history.gradient_norms.len() as f64;
                let variance =
                    history.gradient_norms.iter().map(|&x| (x - mean).powi(2)).sum::<f64>()
                        / history.gradient_norms.len() as f64;
                1.0 / (1.0 + variance) // Higher consistency = lower variance
            } else {
                // A single sample (or none) is trivially consistent.
                1.0
            };

            layer_analyses.insert(
                layer_name.clone(),
                LayerFlowAnalysis {
                    layer_name: layer_name.clone(),
                    is_vanishing,
                    is_exploding,
                    gradient_norm: latest_gradient,
                    flow_consistency,
                },
            );
        }

        FlowAnalysis { layer_analyses }
    }
243
    /// Generate comprehensive debugging report
    ///
    /// Gathers the current status plus a full sweep of every analysis
    /// subsystem (conflicts, visualization, enhanced analysis, performance,
    /// anomalies, flow) and the merged recommendation list.
    ///
    /// # Errors
    /// Propagates any error from recommendation generation.
    pub fn generate_comprehensive_report(&self) -> Result<ComprehensiveGradientReport> {
        let status = self.get_status();
        let conflict_analysis = self.conflict_analyzer.analyze_conflicts(&self.gradient_histories);
        let visualization = self
            .flow_visualizer
            .generate_visualization(&self.gradient_histories, self.current_step);
        let enhanced_analysis =
            self.enhanced_analyzer.generate_enhanced_analysis(&self.gradient_histories);
        let performance_snapshot = self.performance_tracker.take_performance_snapshot();
        // `None` asks the detector for a summary over all layers.
        let anomaly_summary = self.anomaly_detector.get_anomaly_summary(None);

        let flow_analysis = self.generate_flow_analysis();

        Ok(ComprehensiveGradientReport {
            timestamp: chrono::Utc::now(),
            status,
            conflict_analysis,
            visualization,
            enhanced_analysis,
            flow_analysis,
            performance_snapshot,
            anomaly_summary,
            recommendations: self.generate_comprehensive_recommendations()?,
        })
    }
270
    /// Analyze gradient conflicts between layers
    ///
    /// Thin delegation to the conflict analyzer over all recorded histories.
    pub fn analyze_gradient_conflicts(&self) -> GradientConflictAnalysis {
        self.conflict_analyzer.analyze_conflicts(&self.gradient_histories)
    }
275
    /// Generate gradient flow visualization
    ///
    /// Thin delegation to the flow visualizer over all recorded histories
    /// at the current step.
    pub fn generate_gradient_flow_visualization(&self) -> GradientFlowVisualization {
        self.flow_visualizer
            .generate_visualization(&self.gradient_histories, self.current_step)
    }
281
    /// Generate enhanced layer analysis
    ///
    /// Thin delegation to the enhanced analyzer over all recorded histories.
    pub fn generate_enhanced_layer_analysis(&self) -> EnhancedLayerGradientAnalysis {
        self.enhanced_analyzer.generate_enhanced_analysis(&self.gradient_histories)
    }
286
287    /// Get performance insights
288    pub fn get_performance_insights(&self) -> PerformanceInsights {
289        let trends = self.performance_tracker.get_performance_trends();
290        let recommendations = self.performance_tracker.generate_optimization_recommendations();
291        let bottlenecks = self.performance_tracker.bottleneck_layers.clone();
292
293        PerformanceInsights {
294            trends,
295            recommendations,
296            bottlenecks,
297            current_throughput: self.performance_tracker.throughput_gradients_per_second,
298            memory_usage: self.performance_tracker.memory_usage_bytes,
299        }
300    }
301
    /// Advance to next step
    ///
    /// Increments the step counter, prunes the alert backlog to the most
    /// recent 100 entries, tracks how many consecutive steps each layer has
    /// gone without meaningful gradient, and raises no-gradient-flow alerts
    /// once a layer crosses the configured threshold.
    pub fn next_step(&mut self) {
        self.current_step += 1;

        // Clear old alerts (keep last 100)
        if self.alerts.len() > 100 {
            self.alerts.drain(0..self.alerts.len() - 100);
        }

        // Update no-gradient counters: a near-zero latest norm increments
        // the layer's counter; anything else resets it to zero.
        for (layer_name, history) in &self.gradient_histories {
            if let Some(latest_norm) = history.gradient_norms.back() {
                if *latest_norm < 1e-8 {
                    *self.layer_no_gradient_count.entry(layer_name.clone()).or_insert(0) += 1;
                } else {
                    self.layer_no_gradient_count.insert(layer_name.clone(), 0);
                }
            }
        }

        // Check for no-gradient alerts
        // NOTE(review): once a layer crosses the threshold this re-raises
        // the alert on every subsequent step until its counter resets —
        // confirm repeated alerts are intended rather than a one-shot.
        for (layer_name, &count) in &self.layer_no_gradient_count {
            if count >= self.gradient_config.no_gradient_steps_threshold {
                self.alerts.push(GradientAlert::NoGradientFlow {
                    layer_name: layer_name.clone(),
                    steps_without_gradient: count,
                });
            }
        }
    }
332
333    /// Reset debugger state
334    pub fn reset(&mut self) {
335        self.gradient_histories.clear();
336        self.current_step = 0;
337        self.alerts.clear();
338        self.layer_no_gradient_count.clear();
339        self.adaptive_thresholds.clear();
340        self.real_time_monitors.clear();
341        self.anomaly_detector = GradientAnomalyDetector::default();
342        self.performance_tracker = GradientPerformanceTracker::default();
343    }
344
345    /// Get alerts for a specific layer
346    pub fn get_layer_alerts(&self, layer_name: &str) -> Vec<&GradientAlert> {
347        self.alerts
348            .iter()
349            .filter(|alert| match alert {
350                GradientAlert::VanishingGradients {
351                    layer_name: name, ..
352                } => name == layer_name,
353                GradientAlert::ExplodingGradients {
354                    layer_name: name, ..
355                } => name == layer_name,
356                GradientAlert::DeadNeurons {
357                    layer_name: name, ..
358                } => name == layer_name,
359                GradientAlert::GradientOscillation {
360                    layer_name: name, ..
361                } => name == layer_name,
362                GradientAlert::NoGradientFlow {
363                    layer_name: name, ..
364                } => name == layer_name,
365            })
366            .collect()
367    }
368
    /// Get gradient history for a layer
    ///
    /// Returns `None` if the layer has never been recorded.
    pub fn get_layer_history(&self, layer_name: &str) -> Option<&GradientHistory> {
        self.gradient_histories.get(layer_name)
    }
373
    /// Get all monitored layers
    ///
    /// Returns the names of every layer with a recorded gradient history;
    /// iteration order is unspecified (HashMap-backed).
    pub fn get_monitored_layers(&self) -> Vec<&String> {
        self.gradient_histories.keys().collect()
    }
378
379    // Private helper methods
380
381    fn estimate_dead_neurons_ratio(&self, gradient_norm: f64) -> f64 {
382        // Simplified estimation - in practice would analyze individual neuron gradients
383        if gradient_norm < 1e-6 {
384            0.9 // Assume 90% dead if very low gradient
385        } else if gradient_norm < 1e-4 {
386            0.3 // Assume 30% dead if low gradient
387        } else {
388            0.05 // Assume 5% dead for normal gradients
389        }
390    }
391
    /// Run all alert checks for a newly recorded gradient sample.
    ///
    /// Threshold checks prefer the layer's adaptive thresholds and fall
    /// back to the static configured thresholds only when none exist yet;
    /// the dead-neuron and oscillation checks always run. Resulting alerts
    /// are appended to `self.alerts`.
    ///
    /// # Errors
    /// Currently always returns `Ok(())`; the `Result` leaves room for
    /// fallible checks later.
    fn check_gradient_alerts(&mut self, layer_name: &str, flow: &GradientFlow) -> Result<()> {
        // Check adaptive thresholds first
        if let Some(thresholds) = self.adaptive_thresholds.get(layer_name) {
            let threshold_alerts = thresholds.check_thresholds(flow.gradient_norm);
            self.alerts.extend(threshold_alerts);
        } else {
            // Fallback to static thresholds
            if flow.gradient_norm < self.gradient_config.vanishing_threshold {
                self.alerts.push(GradientAlert::VanishingGradients {
                    layer_name: layer_name.to_string(),
                    norm: flow.gradient_norm,
                    threshold: self.gradient_config.vanishing_threshold,
                });
            }

            if flow.gradient_norm > self.gradient_config.exploding_threshold {
                self.alerts.push(GradientAlert::ExplodingGradients {
                    layer_name: layer_name.to_string(),
                    norm: flow.gradient_norm,
                    threshold: self.gradient_config.exploding_threshold,
                });
            }
        }

        // Check dead neurons (ratio was estimated heuristically upstream).
        if flow.dead_neurons_ratio > self.gradient_config.dead_neuron_threshold {
            self.alerts.push(GradientAlert::DeadNeurons {
                layer_name: layer_name.to_string(),
                ratio: flow.dead_neurons_ratio,
                threshold: self.gradient_config.dead_neuron_threshold,
            });
        }

        // Check oscillation
        // NOTE(review): the alert's `variance` field is filled with the
        // monitor's stability score — confirm that is the intended payload.
        if let Some(monitor) = self.real_time_monitors.get(layer_name) {
            if monitor.is_oscillating() {
                self.alerts.push(GradientAlert::GradientOscillation {
                    layer_name: layer_name.to_string(),
                    variance: monitor.get_stability_score(),
                });
            }
        }

        Ok(())
    }
437
438    fn compute_layer_status(
439        &self,
440        layer_name: &str,
441        history: &GradientHistory,
442    ) -> LayerGradientStatus {
443        let latest_norm = history.gradient_norms.back().cloned().unwrap_or(0.0);
444        let health = self.classify_layer_health(layer_name, history);
445        let alerts = self.get_layer_alerts(layer_name).len();
446        let trend = history.get_trend_slope().unwrap_or(0.0);
447
448        LayerGradientStatus {
449            layer_name: layer_name.to_string(),
450            health,
451            latest_gradient_norm: latest_norm,
452            gradient_trend: trend,
453            alert_count: alerts,
454            steps_recorded: history.gradient_norms.len(),
455        }
456    }
457
458    fn classify_layer_health(&self, layer_name: &str, history: &GradientHistory) -> LayerHealth {
459        let latest_norm = history.gradient_norms.back().cloned().unwrap_or(0.0);
460        let alert_count = self.get_layer_alerts(layer_name).len();
461
462        if latest_norm < 1e-7 || latest_norm > 100.0 || alert_count > 3 {
463            LayerHealth::Critical
464        } else if latest_norm < 1e-5 || latest_norm > 10.0 || alert_count > 0 {
465            LayerHealth::Warning
466        } else {
467            LayerHealth::Healthy
468        }
469    }
470
471    fn compute_overall_health(
472        &self,
473        layer_statuses: &HashMap<String, LayerGradientStatus>,
474    ) -> LayerHealth {
475        if layer_statuses.is_empty() {
476            return LayerHealth::Healthy;
477        }
478
479        let critical_count =
480            layer_statuses.values().filter(|s| s.health == LayerHealth::Critical).count();
481        let warning_count =
482            layer_statuses.values().filter(|s| s.health == LayerHealth::Warning).count();
483        let total = layer_statuses.len();
484
485        if critical_count > 0 || warning_count as f64 / total as f64 > 0.5 {
486            LayerHealth::Critical
487        } else if warning_count > 0 {
488            LayerHealth::Warning
489        } else {
490            LayerHealth::Healthy
491        }
492    }
493
494    fn generate_comprehensive_recommendations(&self) -> Result<Vec<GradientRecommendation>> {
495        let mut recommendations = Vec::new();
496
497        // Performance recommendations
498        let perf_recs = self.performance_tracker.generate_optimization_recommendations();
499        for rec in perf_recs {
500            recommendations.push(GradientRecommendation {
501                recommendation_type: RecommendationType::Performance,
502                title: rec.layer_name,
503                description: format!("{:?}: {}", rec.issue_type, rec.recommendations.join(", ")),
504                priority: match rec.severity {
505                    OptimizationSeverity::Critical => GradientRecommendationPriority::High,
506                    OptimizationSeverity::High => GradientRecommendationPriority::High,
507                    OptimizationSeverity::Medium => GradientRecommendationPriority::Medium,
508                    OptimizationSeverity::Low => GradientRecommendationPriority::Low,
509                },
510                expected_impact: rec.expected_improvement,
511            });
512        }
513
514        // Conflict recommendations
515        let conflict_analysis = self.conflict_analyzer.analyze_conflicts(&self.gradient_histories);
516        for strategy in conflict_analysis.mitigation_strategies {
517            recommendations.push(GradientRecommendation {
518                recommendation_type: RecommendationType::Conflict,
519                title: strategy.strategy_name,
520                description: strategy.description,
521                priority: match strategy.implementation_complexity {
522                    MitigationComplexity::Simple => GradientRecommendationPriority::High,
523                    MitigationComplexity::Moderate => GradientRecommendationPriority::Medium,
524                    MitigationComplexity::Complex => GradientRecommendationPriority::Medium,
525                    MitigationComplexity::RequiresArchitectureChange => {
526                        GradientRecommendationPriority::Low
527                    },
528                },
529                expected_impact: strategy.effectiveness,
530            });
531        }
532
533        // Anomaly recommendations
534        let anomaly_summary = self.anomaly_detector.get_anomaly_summary(None);
535        for rec_text in anomaly_summary.recommendations {
536            recommendations.push(GradientRecommendation {
537                recommendation_type: RecommendationType::Anomaly,
538                title: "Anomaly Mitigation".to_string(),
539                description: rec_text,
540                priority: if anomaly_summary.average_severity > 0.7 {
541                    GradientRecommendationPriority::High
542                } else {
543                    GradientRecommendationPriority::Medium
544                },
545                expected_impact: 1.0 - anomaly_summary.average_severity,
546            });
547        }
548
549        // Sort by priority and expected impact
550        recommendations.sort_by(|a, b| {
551            let priority_cmp = b.priority.cmp(&a.priority);
552            if priority_cmp == std::cmp::Ordering::Equal {
553                b.expected_impact.partial_cmp(&a.expected_impact).unwrap()
554            } else {
555                priority_cmp
556            }
557        });
558
559        Ok(recommendations)
560    }
561
    /// Generate recommendations based on current analysis
    ///
    /// Public wrapper around `generate_comprehensive_recommendations`.
    pub fn generate_recommendations(&self) -> Result<Vec<GradientRecommendation>> {
        self.generate_comprehensive_recommendations()
    }
566
    /// Start the gradient debugger
    ///
    /// Begins a fresh debugging session: starts performance monitoring,
    /// rewinds the step counter, drops old alerts, and re-seeds adaptive
    /// thresholds from any gradient histories recorded earlier. Histories
    /// themselves are kept (use `reset` to drop them).
    ///
    /// # Errors
    /// Currently always returns `Ok(())`.
    pub async fn start(&mut self) -> Result<()> {
        // Initialize monitoring systems
        self.performance_tracker.start_monitoring();

        // Reset state for a new debugging session
        self.current_step = 0;
        self.alerts.clear();

        // Initialize adaptive thresholds for existing histories (empty
        // histories are skipped — there is nothing to derive them from).
        for (layer_name, history) in &self.gradient_histories {
            if !history.gradient_norms.is_empty() {
                let thresholds = AdaptiveThresholds::from_history(history);
                self.adaptive_thresholds.insert(layer_name.clone(), thresholds);
            }
        }

        Ok(())
    }
586
    /// Generate comprehensive gradient report
    ///
    /// Async counterpart of `generate_comprehensive_report`: builds the
    /// status snapshot inline and drives the visualization / enhanced
    /// analysis subsystems through `create_visualization` and
    /// `analyze_gradients`.
    /// NOTE(review): this duplicates most of `generate_comprehensive_report`
    /// but via different subsystem entry points — confirm both paths are
    /// meant to coexist.
    ///
    /// # Errors
    /// Currently always returns `Ok`; recommendation failures are swallowed
    /// into an empty list via `unwrap_or_default`.
    pub async fn generate_report(&self) -> Result<ComprehensiveGradientReport> {
        let status = GradientDebugStatus {
            current_step: self.current_step,
            overall_health: self.evaluate_overall_health(),
            layer_statuses: self.get_layer_statuses(),
            recent_alerts: self.alerts.iter().rev().take(10).cloned().collect(),
            total_alerts: self.alerts.len(),
            active_layers: self.gradient_histories.len(),
        };

        let conflict_analysis = self.conflict_analyzer.analyze_conflicts(&self.gradient_histories);
        let visualization = self.flow_visualizer.create_visualization(&self.gradient_histories);
        let enhanced_analysis = self.enhanced_analyzer.analyze_gradients(&self.gradient_histories);
        let performance_snapshot = self.performance_tracker.take_performance_snapshot();
        let anomaly_summary = self.anomaly_detector.get_anomaly_summary(None);
        let recommendations = self.generate_recommendations().unwrap_or_default();

        let flow_analysis = self.generate_flow_analysis();

        Ok(ComprehensiveGradientReport {
            timestamp: chrono::Utc::now(),
            status,
            conflict_analysis,
            visualization,
            enhanced_analysis,
            flow_analysis,
            performance_snapshot,
            anomaly_summary,
            recommendations,
        })
    }
619
620    /// Quick analysis for immediate insights
621    pub async fn quick_analysis(&self) -> Result<GradientQuickAnalysis> {
622        let mut problematic_layers = Vec::new();
623        let mut total_gradients = 0f64;
624        let mut active_layers = 0;
625
626        for (layer_name, history) in &self.gradient_histories {
627            if !history.gradient_norms.is_empty() {
628                active_layers += 1;
629                let latest_norm = history.gradient_norms.back().unwrap();
630                total_gradients += latest_norm;
631
632                // Check for basic problems
633                if *latest_norm < 1e-8 {
634                    problematic_layers.push(format!("{}: Vanishing gradients", layer_name));
635                } else if *latest_norm > 100.0 {
636                    problematic_layers.push(format!("{}: Exploding gradients", layer_name));
637                }
638            }
639        }
640
641        let average_gradient =
642            if active_layers > 0 { total_gradients / active_layers as f64 } else { 0.0 };
643
644        let health_score = self.calculate_quick_health_score();
645
646        Ok(GradientQuickAnalysis {
647            overall_health: if health_score > 0.8 {
648                LayerHealth::Healthy
649            } else if health_score > 0.5 {
650                LayerHealth::Warning
651            } else {
652                LayerHealth::Critical
653            },
654            active_layers,
655            problematic_layers,
656            average_gradient_norm: average_gradient,
657            recent_alerts_count: self.alerts.len(),
658            timestamp: chrono::Utc::now(),
659        })
660    }
661
662    /// Evaluate overall gradient health
663    fn evaluate_overall_health(&self) -> LayerHealth {
664        if self.gradient_histories.is_empty() {
665            return LayerHealth::Unknown;
666        }
667
668        let mut healthy_count = 0;
669        let mut warning_count = 0;
670        let mut critical_count = 0;
671
672        for history in self.gradient_histories.values() {
673            if let Some(latest_norm) = history.gradient_norms.back() {
674                if *latest_norm < 1e-8 || *latest_norm > 100.0 {
675                    critical_count += 1;
676                } else if *latest_norm < 1e-6 || *latest_norm > 10.0 {
677                    warning_count += 1;
678                } else {
679                    healthy_count += 1;
680                }
681            }
682        }
683
684        let total = healthy_count + warning_count + critical_count;
685        let critical_ratio = critical_count as f64 / total as f64;
686        let warning_ratio = (warning_count + critical_count) as f64 / total as f64;
687
688        if critical_ratio > 0.3 {
689            LayerHealth::Critical
690        } else if warning_ratio > 0.5 {
691            LayerHealth::Warning
692        } else {
693            LayerHealth::Healthy
694        }
695    }
696
    /// Get status for each layer
    ///
    /// Builds a `LayerGradientStatus` per recorded history, classifying
    /// health directly from the latest norm (independently of the alert
    /// count, unlike `classify_layer_health`). Layers with no samples get
    /// a zeroed status with `LayerHealth::Unknown`.
    fn get_layer_statuses(&self) -> HashMap<String, LayerGradientStatus> {
        let mut statuses = HashMap::new();

        for (layer_name, history) in &self.gradient_histories {
            let status = if let Some(latest_norm) = history.gradient_norms.back() {
                LayerGradientStatus {
                    layer_name: layer_name.clone(),
                    latest_gradient_norm: *latest_norm,
                    gradient_trend: self.calculate_trend_value(history),
                    // Norm-only classification: vanishing or exploding is
                    // critical, merely small/large is a warning.
                    health: if *latest_norm < 1e-8 {
                        LayerHealth::Critical
                    } else if *latest_norm > 100.0 {
                        LayerHealth::Critical
                    } else if *latest_norm < 1e-6 || *latest_norm > 10.0 {
                        LayerHealth::Warning
                    } else {
                        LayerHealth::Healthy
                    },
                    alert_count: self.get_layer_alerts(layer_name).len(),
                    steps_recorded: history.gradient_norms.len(),
                }
            } else {
                // No samples recorded yet for this layer.
                LayerGradientStatus {
                    layer_name: layer_name.clone(),
                    latest_gradient_norm: 0.0,
                    gradient_trend: 0.0,
                    health: LayerHealth::Unknown,
                    alert_count: 0,
                    steps_recorded: 0,
                }
            };

            statuses.insert(layer_name.clone(), status);
        }

        statuses
    }
735
736    /// Calculate gradient trend for a layer
737    #[allow(dead_code)]
738    fn calculate_trend(&self, history: &GradientHistory) -> GradientTrend {
739        if history.gradient_norms.len() < 3 {
740            return GradientTrend::Unknown;
741        }
742
743        let recent: Vec<f64> = history.gradient_norms.iter().rev().take(3).cloned().collect();
744
745        if recent[0] > recent[1] && recent[1] > recent[2] {
746            GradientTrend::Increasing
747        } else if recent[0] < recent[1] && recent[1] < recent[2] {
748            GradientTrend::Decreasing
749        } else {
750            GradientTrend::Stable
751        }
752    }
753
754    /// Calculate gradient trend as numeric value for a layer
755    fn calculate_trend_value(&self, history: &GradientHistory) -> f64 {
756        if history.gradient_norms.len() < 2 {
757            return 0.0;
758        }
759
760        let recent: Vec<f64> = history.gradient_norms.iter().rev().take(10).cloned().collect();
761        if recent.len() < 2 {
762            return 0.0;
763        }
764
765        // Calculate linear trend slope
766        let n = recent.len() as f64;
767        let sum_x = (0..recent.len()).sum::<usize>() as f64;
768        let sum_y = recent.iter().sum::<f64>();
769        let sum_xy = recent.iter().enumerate().map(|(i, &y)| i as f64 * y).sum::<f64>();
770        let sum_x2 = (0..recent.len()).map(|i| (i * i) as f64).sum::<f64>();
771
772        let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x);
773        slope
774    }
775
776    /// Calculate quick health score
777    fn calculate_quick_health_score(&self) -> f64 {
778        if self.gradient_histories.is_empty() {
779            return 0.0;
780        }
781
782        let mut score = 0.0;
783        let mut count = 0;
784
785        for history in self.gradient_histories.values() {
786            if let Some(latest_norm) = history.gradient_norms.back() {
787                // Score based on gradient magnitude (ideal range: 1e-4 to 1.0)
788                let norm_score = if *latest_norm >= 1e-4 && *latest_norm <= 1.0 {
789                    1.0
790                } else if *latest_norm >= 1e-6 && *latest_norm <= 10.0 {
791                    0.7
792                } else if *latest_norm >= 1e-8 && *latest_norm <= 100.0 {
793                    0.3
794                } else {
795                    0.0
796                };
797
798                score += norm_score;
799                count += 1;
800            }
801        }
802
803        if count == 0 {
804            0.0
805        } else {
806            score / count as f64
807        }
808    }
809}
810
/// Current gradient debugging status
///
/// A lightweight, serializable snapshot of the debugger's state at a
/// single training step, suitable for dashboards or logging.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GradientDebugStatus {
    /// Training step at which this snapshot was taken.
    pub current_step: usize,
    /// Aggregate health verdict across all tracked layers.
    pub overall_health: LayerHealth,
    /// Per-layer status, keyed by layer name.
    pub layer_statuses: HashMap<String, LayerGradientStatus>,
    /// Most recent alerts (a subset of all alerts raised so far).
    pub recent_alerts: Vec<GradientAlert>,
    /// Total number of alerts raised since debugging started.
    pub total_alerts: usize,
    /// Number of layers currently being tracked.
    pub active_layers: usize,
}
821
/// Comprehensive gradient debugging report
///
/// Bundles the status snapshot together with the outputs of every
/// analysis subsystem (conflict, visualization, enhanced, flow,
/// performance, anomaly) and the derived recommendations.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ComprehensiveGradientReport {
    /// UTC time at which the report was generated.
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Point-in-time debugger status (per-layer health, alerts, etc.).
    pub status: GradientDebugStatus,
    /// Results of inter-layer gradient conflict analysis.
    pub conflict_analysis: GradientConflictAnalysis,
    /// Gradient flow visualization payload.
    pub visualization: GradientFlowVisualization,
    /// Enhanced per-layer gradient analysis results.
    pub enhanced_analysis: EnhancedLayerGradientAnalysis,
    /// Vanishing/exploding flow analysis per layer.
    pub flow_analysis: FlowAnalysis,
    /// Snapshot of gradient-computation performance metrics.
    pub performance_snapshot: PerformanceSnapshot,
    /// Summary of detected gradient anomalies.
    pub anomaly_summary: AnomalySummary,
    /// Actionable recommendations derived from the analyses above.
    pub recommendations: Vec<GradientRecommendation>,
}
835
836impl ComprehensiveGradientReport {
837    /// Check if there are vanishing gradient issues
838    pub fn has_vanishing_gradients(&self) -> bool {
839        // Check if any layers have very small gradients
840        for (_, layer_status) in &self.status.layer_statuses {
841            if layer_status.latest_gradient_norm < 1e-8 {
842                return true;
843            }
844        }
845
846        // Check anomaly summary for vanishing gradient patterns
847        for anomaly in &self.anomaly_summary.anomalies {
848            if matches!(
849                anomaly.anomaly_type,
850                crate::anomaly_detector::AnomalyType::GradientVanishing
851            ) {
852                return true;
853            }
854        }
855
856        false
857    }
858
859    /// Check if there are exploding gradient issues
860    pub fn has_exploding_gradients(&self) -> bool {
861        // Check if any layers have very large gradients
862        for (_, layer_status) in &self.status.layer_statuses {
863            if layer_status.latest_gradient_norm > 100.0 {
864                return true;
865            }
866        }
867
868        // Check anomaly summary for exploding gradient patterns
869        for anomaly in &self.anomaly_summary.anomalies {
870            if matches!(
871                anomaly.anomaly_type,
872                crate::anomaly_detector::AnomalyType::GradientExplosion
873                    | crate::anomaly_detector::AnomalyType::NumericalInstability
874            ) {
875                return true;
876            }
877        }
878
879        false
880    }
881}
882
/// Performance insights summary
///
/// Aggregated view of gradient-computation performance: trends,
/// optimization recommendations, and identified bottlenecks.
#[derive(Debug, Clone)]
pub struct PerformanceInsights {
    /// Observed performance trends over recent steps.
    pub trends: PerformanceTrends,
    /// Suggested optimizations derived from the trends.
    pub recommendations: Vec<OptimizationRecommendation>,
    /// Human-readable descriptions of detected bottlenecks.
    pub bottlenecks: Vec<String>,
    /// Current gradient-processing throughput.
    /// NOTE(review): units not established here — confirm against
    /// the performance tracker that produces this value.
    pub current_throughput: f64,
    /// Current memory usage in bytes (presumably — verify at the source).
    pub memory_usage: usize,
}
892
/// Gradient debugging recommendation
///
/// A single actionable suggestion produced by the report generator,
/// categorized and prioritized for the user.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GradientRecommendation {
    /// Category the recommendation falls under.
    pub recommendation_type: RecommendationType,
    /// Short human-readable title.
    pub title: String,
    /// Longer explanation of the issue and the suggested action.
    pub description: String,
    /// Relative urgency of acting on this recommendation.
    pub priority: GradientRecommendationPriority,
    /// Estimated benefit of applying the recommendation.
    /// NOTE(review): scale not established here — confirm whether this
    /// is a 0.0–1.0 fraction or another unit.
    pub expected_impact: f64,
}
902
/// Category of a gradient debugging recommendation.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum RecommendationType {
    /// Relates to gradient-computation performance.
    Performance,
    /// Relates to gradient conflicts between layers/tasks.
    Conflict,
    /// Relates to detected gradient anomalies.
    Anomaly,
    /// Relates to model architecture choices.
    Architecture,
    /// Relates to optimizer/training configuration.
    Optimization,
}
911
/// Urgency level of a [`GradientRecommendation`].
///
/// Variants are declared in ascending order so the derived `Ord`
/// ranks `Low < Medium < High`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
pub enum GradientRecommendationPriority {
    Low,
    Medium,
    High,
}
918
/// Quick analysis results for immediate insights
///
/// A cheap-to-compute summary intended for frequent polling during
/// training, without running the full analysis pipeline.
#[derive(Debug, Clone)]
pub struct GradientQuickAnalysis {
    /// Aggregate health verdict across all tracked layers.
    pub overall_health: LayerHealth,
    /// Number of layers currently being tracked.
    pub active_layers: usize,
    /// Names of layers flagged as having gradient problems.
    pub problematic_layers: Vec<String>,
    /// Mean of the latest gradient norms across layers.
    pub average_gradient_norm: f64,
    /// Number of alerts raised in the recent window.
    pub recent_alerts_count: usize,
    /// UTC time at which this analysis was produced.
    pub timestamp: chrono::DateTime<chrono::Utc>,
}
929
/// Status for individual layer gradients
///
/// Per-layer entry within [`GradientDebugStatus::layer_statuses`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LayerGradientStatus {
    /// Name of the layer this status describes.
    pub layer_name: String,
    /// Health classification for the layer.
    pub health: LayerHealth,
    /// Most recently recorded gradient norm (0.0 when no history exists).
    pub latest_gradient_norm: f64,
    /// Numeric trend slope over recent gradient norms
    /// (0.0 when insufficient history).
    pub gradient_trend: f64,
    /// Number of alerts raised for this layer.
    pub alert_count: usize,
    /// Number of gradient-norm samples recorded for this layer.
    pub steps_recorded: usize,
}
940
/// Gradient trend indicators
///
/// Qualitative direction of recent gradient norms, as classified by
/// `GradientDebugger::calculate_trend`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GradientTrend {
    /// Not enough samples to determine a trend.
    Unknown,
    /// Recent norms are strictly rising.
    Increasing,
    /// Recent norms are strictly falling.
    Decreasing,
    /// Recent norms are neither strictly rising nor strictly falling.
    Stable,
}