1use anyhow::Result;
7use chrono::{DateTime, Utc};
8use indexmap::IndexMap;
9use serde::{Deserialize, Serialize};
11use statrs::statistics::Statistics;
12use std::collections::HashMap;
13use uuid::Uuid;
14
/// Feature toggles and thresholds for differential debugging.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingConfig {
    /// Enable side-by-side comparison of registered model snapshots.
    pub enable_model_comparison: bool,
    /// Enable A/B test analysis between two models.
    pub enable_ab_analysis: bool,
    /// Enable version-to-version diff tracking.
    pub enable_version_diff: bool,
    /// Enable detection of metric regressions against a baseline.
    pub enable_regression_detection: bool,
    /// Enable performance-delta computation between snapshots.
    pub enable_performance_delta: bool,
    /// Statistical significance threshold on the p-value scale (e.g. 0.05).
    pub significance_threshold: f64,
    /// Maximum number of snapshots retained for comparison.
    pub max_comparison_models: usize,
    /// Scales the accuracy-drop floor for regressions
    /// (flagged when drop > sensitivity * 0.01).
    pub regression_sensitivity: f64,
    /// Relative change (percent) above which a latency delta is flagged.
    pub performance_delta_threshold: f64,
}
37
impl Default for DifferentialDebuggingConfig {
    /// All analyses enabled; p < 0.05 significance, at most 10 retained
    /// models, 0.8 regression sensitivity, 5% performance-delta threshold.
    fn default() -> Self {
        Self {
            enable_model_comparison: true,
            enable_ab_analysis: true,
            enable_version_diff: true,
            enable_regression_detection: true,
            enable_performance_delta: true,
            significance_threshold: 0.05,
            max_comparison_models: 10,
            regression_sensitivity: 0.8,
            performance_delta_threshold: 5.0,
        }
    }
}
53
/// Point-in-time record of a model, used as the unit of comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
    /// Unique snapshot identifier.
    pub id: Uuid,
    /// Model name; also the registry key in `DifferentialDebugger`.
    pub name: String,
    /// When the snapshot was captured.
    pub timestamp: DateTime<Utc>,
    /// Model version string.
    pub version: String,
    /// Source-control commit hash, when known.
    pub commit_hash: Option<String>,
    /// Evaluation and runtime metrics.
    pub metrics: ModelMetrics,
    /// Structural description of the network.
    pub architecture: ArchitectureInfo,
    /// Hyperparameters used to train this model.
    pub training_config: TrainingConfig,
    /// Aggregate statistics over the model weights.
    pub weights_summary: WeightsSummary,
    /// Free-form key/value annotations.
    pub metadata: HashMap<String, String>,
}
78
/// Evaluation and runtime metrics for a single model snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Accuracy on the training split.
    pub train_accuracy: f64,
    /// Accuracy on the validation split (primary comparison metric).
    pub val_accuracy: f64,
    /// Accuracy on the test split, when evaluated.
    pub test_accuracy: Option<f64>,
    /// Loss on the training split.
    pub train_loss: f64,
    /// Loss on the validation split.
    pub val_loss: f64,
    /// Loss on the test split, when evaluated.
    pub test_loss: Option<f64>,
    /// Inference latency in milliseconds.
    pub inference_latency_ms: f64,
    /// Runtime memory footprint in megabytes.
    pub memory_usage_mb: f64,
    /// Serialized model size in megabytes.
    pub model_size_mb: f64,
    /// Floating-point operations per inference (presumably — confirm unit).
    pub flops: u64,
    /// Total training wall time in seconds.
    pub training_time_s: f64,
    /// Additional user-defined metrics by name.
    pub custom_metrics: HashMap<String, f64>,
}
107
/// Structural description of a model architecture.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
    /// Total number of trainable parameters.
    pub parameter_count: u64,
    /// Number of layers.
    pub layer_count: u32,
    /// Network depth.
    pub depth: u32,
    /// Hidden dimension size.
    pub hidden_size: u32,
    /// Attention head count (transformer-style models only).
    pub num_heads: Option<u32>,
    /// Feed-forward dimension (transformer-style models only).
    pub ff_dim: Option<u32>,
    /// Vocabulary size (language models only).
    pub vocab_size: Option<u32>,
    /// Maximum sequence length (sequence models only).
    pub max_seq_length: Option<u32>,
}
128
/// Hyperparameters used for a training run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
    /// Initial learning rate.
    pub learning_rate: f64,
    /// Mini-batch size.
    pub batch_size: u32,
    /// Number of training epochs.
    pub epochs: u32,
    /// Optimizer name (e.g. "adam") — free-form string.
    pub optimizer: String,
    /// Learning-rate schedule name, when one was used.
    pub lr_schedule: Option<String>,
    /// Regularization strengths by name (e.g. weight decay, dropout).
    pub regularization: HashMap<String, f64>,
}
145
/// Aggregate statistics over a model's weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
    /// Mean weight value.
    pub mean: f64,
    /// Standard deviation of weight values.
    pub std_dev: f64,
    /// Smallest weight value.
    pub min: f64,
    /// Largest weight value.
    pub max: f64,
    /// Percentile values keyed by label (key schema set by the producer —
    /// confirm against the snapshot writer).
    pub percentiles: HashMap<String, f64>,
    /// Number of exactly-zero weights.
    pub zero_count: u64,
    /// Fraction of zero weights (presumably zero_count / total — confirm).
    pub sparsity: f64,
}
164
/// Output of a multi-model comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
    /// Names of the models compared, in request order.
    pub models: Vec<String>,
    /// When the comparison was performed.
    pub timestamp: DateTime<Utc>,
    /// Per-metric cross-model comparisons.
    pub performance_comparison: PerformanceComparison,
    /// Structural differences relative to the first model.
    pub architecture_diff: ArchitectureDiff,
    /// Statistical test results (currently a placeholder, empty maps).
    pub statistical_analysis: StatisticalAnalysis,
    /// Human-readable findings and recommendations.
    pub summary: ComparisonSummary,
}
181
/// Per-metric comparisons across a set of models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    /// Validation accuracy (higher is better).
    pub accuracy_comparison: MetricComparison,
    /// Validation loss (lower is better).
    pub loss_comparison: MetricComparison,
    /// Inference latency in ms (lower is better).
    pub latency_comparison: MetricComparison,
    /// Memory usage in MB (lower is better).
    pub memory_comparison: MetricComparison,
    /// Model size in MB (lower is better).
    pub size_comparison: MetricComparison,
    /// User-defined metric comparisons (currently never populated).
    pub custom_comparisons: HashMap<String, MetricComparison>,
}
198
/// Cross-model comparison for a single metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
    /// Raw metric value per model name.
    pub values: HashMap<String, f64>,
    /// Model with the best value for this metric's direction.
    pub best_model: String,
    /// Model with the worst value for this metric's direction.
    pub worst_model: String,
    /// Relative gap from the best value per model, in percent.
    pub differences: HashMap<String, f64>,
    /// Whether each model's gap exceeds the significance cutoff.
    pub significant_differences: HashMap<String, bool>,
}
213
/// Structural differences between models, relative to a base model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
    /// Signed parameter-count difference per model (model minus base).
    pub parameter_diff: HashMap<String, i64>,
    /// Signed layer-count difference per model (model minus base).
    pub layer_diff: HashMap<String, i32>,
    /// Mean structural similarity in [0.0, 1.0] against the base model.
    pub similarity_score: f64,
    /// Human-readable descriptions of large differences.
    pub notable_differences: Vec<String>,
}
226
/// Statistical test results keyed by metric name.
/// Currently produced empty by `perform_statistical_analysis` (placeholder).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    /// p-value per metric.
    pub p_values: HashMap<String, f64>,
    /// Effect size per metric.
    pub effect_sizes: HashMap<String, f64>,
    /// (lower, upper) confidence interval per metric.
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    /// Whether each metric's difference is significant.
    pub significance_summary: HashMap<String, bool>,
}
239
/// Human-readable summary of a model comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
    /// Overall best model (selected by validation accuracy).
    pub best_model: String,
    /// Ordered model lists per metric name (currently just the winner).
    pub rankings: HashMap<String, Vec<String>>,
    /// Notable findings rendered as sentences.
    pub key_findings: Vec<String>,
    /// Suggested follow-up actions.
    pub recommendations: Vec<String>,
}
252
/// Configuration of a single A/B test between two models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    /// Test name.
    pub name: String,
    /// Name of the "A" arm model.
    pub model_a: String,
    /// Name of the "B" arm model.
    pub model_b: String,
    /// Planned duration in hours (not used in the visible code paths).
    pub duration_hours: Option<u32>,
    /// Planned sample size (actual sizes come from the supplied data).
    pub sample_size: u32,
    /// Names of metrics to track (not used in the visible code paths).
    pub tracked_metrics: Vec<String>,
    /// Minimum |effect size| considered practically significant.
    pub min_effect_size: f64,
    /// Target statistical power (not used in the visible code paths).
    pub power: f64,
}
273
/// Full outcome of one A/B test run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
    /// The configuration the test ran with.
    pub config: ABTestConfig,
    /// When the test started.
    pub start_time: DateTime<Utc>,
    /// When the test finished, if it has.
    pub end_time: Option<DateTime<Utc>>,
    /// Collected metrics for the "A" arm.
    pub model_a_results: ABTestMetrics,
    /// Collected metrics for the "B" arm.
    pub model_b_results: ABTestMetrics,
    /// Statistical test results keyed by metric name.
    pub statistical_tests: HashMap<String, StatisticalTestResult>,
    /// Winner, confidence, and recommendation.
    pub conclusion: ABTestConclusion,
}
292
/// Raw samples and summary statistics for one A/B test arm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
    /// Number of observations in this arm.
    pub sample_size: u32,
    /// Raw observations per metric name ("primary_metric" in current use).
    pub metrics: HashMap<String, Vec<f64>>,
    /// Descriptive statistics per metric name.
    pub summary_stats: HashMap<String, SummaryStats>,
}
303
/// Descriptive statistics for a sample of f64 observations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    /// 25th percentile (nearest-rank approximation).
    pub q25: f64,
    /// 75th percentile (nearest-rank approximation).
    pub q75: f64,
}
315
/// Result of a single two-sample statistical test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    /// Name of the test performed (e.g. "Welch's t-test").
    pub test_type: String,
    /// Test statistic (t-value).
    pub statistic: f64,
    /// p-value (currently a coarse placeholder in the producer).
    pub p_value: f64,
    /// Standardized effect size (mean difference / pooled std).
    pub effect_size: f64,
    /// (lower, upper) bounds around the mean difference.
    pub confidence_interval: (f64, f64),
    /// True when p_value < 0.05.
    pub is_significant: bool,
}
332
/// Decision derived from an A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
    /// Winning model name, when one could be determined.
    pub winner: Option<String>,
    /// Confidence in the decision (0.95 when significant, else 0.5).
    pub confidence: f64,
    /// Whether |effect size| exceeded the configured minimum.
    pub practical_significance: bool,
    /// Deployment recommendation.
    pub recommendation: String,
    /// One-line summary of the test outcome.
    pub summary: String,
}
347
/// Differences between two model versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
    /// Version string of the older snapshot.
    pub from_version: String,
    /// Version string of the newer snapshot.
    pub to_version: String,
    /// When the diff was computed.
    pub timestamp: DateTime<Utc>,
    /// Metric deltas (to minus from).
    pub performance_delta: PerformanceDelta,
    /// Detected structural changes.
    pub architecture_changes: Vec<ArchitectureChange>,
    /// Detected training-hyperparameter changes.
    pub config_changes: Vec<ConfigChange>,
    /// Coarse weight-drift summary.
    pub weight_changes: WeightChangesSummary,
}
366
/// Signed metric deltas between two snapshots (to minus from).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
    /// Validation-accuracy change (positive = improvement).
    pub accuracy_delta: f64,
    /// Validation-loss change (positive = worse).
    pub loss_delta: f64,
    /// Latency change in ms (positive = slower).
    pub latency_delta: f64,
    /// Memory-usage change in MB.
    pub memory_delta: f64,
    /// Model-size change in MB.
    pub size_delta: f64,
    /// Training-time change in seconds.
    pub training_time_delta: f64,
    /// User-defined metric deltas (currently never populated).
    pub custom_deltas: HashMap<String, f64>,
}
385
/// One detected structural change between two versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
    /// Category of the change (e.g. "Parameter Count").
    pub change_type: String,
    /// Human-readable description of the change.
    pub description: String,
    /// Expected consequence of the change.
    pub impact: String,
}
396
/// One detected training-configuration change between two versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
    /// Hyperparameter name (e.g. "learning_rate").
    pub parameter: String,
    /// Previous value, stringified.
    pub old_value: String,
    /// New value, stringified.
    pub new_value: String,
    /// Expected consequence of the change.
    pub impact: String,
}
409
/// Coarse summary of weight drift between two snapshots, derived from the
/// stored `WeightsSummary` values rather than raw weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
    /// Absolute shift in the mean weight value.
    pub avg_magnitude: f64,
    /// Absolute shift in the maximum weight value.
    pub max_change: f64,
    /// Heuristic fraction of weights considered significantly changed.
    pub significant_change_ratio: f64,
    /// Per-layer change magnitudes (currently never populated).
    pub layer_changes: HashMap<String, f64>,
}
422
/// Outcome of one regression-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
    /// When the detection ran.
    pub timestamp: DateTime<Utc>,
    /// Metrics that got worse beyond their thresholds.
    pub regressions: Vec<Regression>,
    /// Metrics that improved.
    pub improvements: Vec<Improvement>,
    /// Aggregate health score and recommendation.
    pub overall_assessment: RegressionAssessment,
}
435
/// A single detected metric regression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
    /// Metric name (e.g. "validation_accuracy").
    pub metric: String,
    /// Value in the current model.
    pub current_value: f64,
    /// Value in the baseline model.
    pub previous_value: f64,
    /// Size of the degradation (absolute for accuracy; ms for latency).
    pub magnitude: f64,
    /// Classified severity bucket.
    pub severity: RegressionSeverity,
    /// Hypothesized causes, for triage.
    pub possible_causes: Vec<String>,
    /// Suggested remediation steps.
    pub suggested_fixes: Vec<String>,
}
454
/// A single detected metric improvement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
    /// Metric name.
    pub metric: String,
    /// Value in the current model.
    pub current_value: f64,
    /// Value in the baseline model.
    pub previous_value: f64,
    /// Size of the improvement.
    pub magnitude: f64,
    /// Hypothesized causes of the gain.
    pub likely_causes: Vec<String>,
}
469
/// Severity buckets for a regression, ordered worst to best.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
    Critical,
    Major,
    Minor,
    Negligible,
}
478
/// Aggregate verdict over all detected regressions and improvements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
    /// 0.0 (critical regression present) to 1.0 (clean); each non-critical
    /// regression subtracts 0.1.
    pub health_score: f64,
    /// Number of critical-severity regressions.
    pub critical_regressions: usize,
    /// Number of detected improvements.
    pub improvements: usize,
    /// Overall recommendation text.
    pub recommendation: String,
}
491
/// Stateful engine that registers model snapshots and runs comparisons,
/// A/B tests, version diffs, and regression detection, keeping a history
/// of every result.
#[derive(Debug)]
pub struct DifferentialDebugger {
    // Active configuration (toggles and thresholds).
    config: DifferentialDebuggingConfig,
    // Snapshots keyed by model name; insertion order is used for eviction.
    model_snapshots: IndexMap<String, ModelSnapshot>,
    // All comparison results, oldest first.
    comparison_history: Vec<ModelComparisonResult>,
    // All completed A/B tests, oldest first.
    ab_tests: Vec<ABTestResult>,
    // All computed version diffs, oldest first.
    version_diffs: Vec<VersionDiff>,
    // All regression-detection runs, oldest first.
    regression_history: Vec<RegressionDetectionResult>,
}
502
503impl DifferentialDebugger {
504 pub fn new(config: DifferentialDebuggingConfig) -> Self {
506 Self {
507 config,
508 model_snapshots: IndexMap::new(),
509 comparison_history: Vec::new(),
510 ab_tests: Vec::new(),
511 version_diffs: Vec::new(),
512 regression_history: Vec::new(),
513 }
514 }
515
516 pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
518 if self.model_snapshots.len() >= self.config.max_comparison_models {
519 self.model_snapshots.shift_remove_index(0);
521 }
522
523 self.model_snapshots.insert(snapshot.name.clone(), snapshot);
524 Ok(())
525 }
526
    /// Compares the named snapshots' performance, architecture, and
    /// statistics, and records the result in `comparison_history`.
    ///
    /// # Errors
    /// Fails when model comparison is disabled, fewer than two names are
    /// given, or any name is not a registered snapshot.
    pub async fn compare_models(
        &mut self,
        model_names: Vec<String>,
    ) -> Result<ModelComparisonResult> {
        if !self.config.enable_model_comparison {
            return Err(anyhow::anyhow!("Model comparison is disabled"));
        }

        if model_names.len() < 2 {
            return Err(anyhow::anyhow!(
                "At least two models are required for comparison"
            ));
        }

        // Resolve every name up front; one missing model aborts the run.
        let models: Vec<&ModelSnapshot> = model_names
            .iter()
            .map(|name| {
                self.model_snapshots
                    .get(name)
                    .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
            })
            .collect::<Result<Vec<_>>>()?;

        let performance_comparison = self.compare_performance(&models)?;
        let architecture_diff = self.analyze_architecture_differences(&models)?;
        let statistical_analysis = self.perform_statistical_analysis(&models)?;
        let summary = self.generate_comparison_summary(
            &models,
            &performance_comparison,
            &statistical_analysis,
        )?;

        let result = ModelComparisonResult {
            models: model_names,
            timestamp: Utc::now(),
            performance_comparison,
            architecture_diff,
            statistical_analysis,
            summary,
        };

        self.comparison_history.push(result.clone());
        Ok(result)
    }
574
    /// Runs an A/B test over two pre-collected samples of a single metric
    /// and records the outcome in `ab_tests`.
    ///
    /// Both samples are stored under the metric key "primary_metric"; the
    /// keys in `config.tracked_metrics` are not consulted here.
    ///
    /// # Errors
    /// Fails when A/B analysis is disabled in the configuration.
    pub async fn run_ab_test(
        &mut self,
        config: ABTestConfig,
        model_a_data: Vec<f64>,
        model_b_data: Vec<f64>,
    ) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }

        let start_time = Utc::now();

        // Descriptive statistics per arm, computed before moving the data.
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);

        let model_a_results = ABTestMetrics {
            sample_size: model_a_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_a_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_a_stats);
                stats
            },
        };

        let model_b_results = ABTestMetrics {
            sample_size: model_b_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_b_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_b_stats);
                stats
            },
        };

        let statistical_tests =
            self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;

        let conclusion = self.generate_ab_conclusion(
            &config,
            &model_a_results,
            &model_b_results,
            &statistical_tests,
        )?;

        // end_time is set immediately: the analysis is synchronous, not a
        // long-running experiment.
        let result = ABTestResult {
            config,
            start_time,
            end_time: Some(Utc::now()),
            model_a_results,
            model_b_results,
            statistical_tests,
            conclusion,
        };

        self.ab_tests.push(result.clone());
        Ok(result)
    }
645
    /// Computes the full diff (performance, architecture, config, weights)
    /// between two registered snapshots and records it in `version_diffs`.
    ///
    /// # Errors
    /// Fails when version-diff tracking is disabled or either model name
    /// is not registered.
    pub async fn track_version_diff(
        &mut self,
        from_model: &str,
        to_model: &str,
    ) -> Result<VersionDiff> {
        if !self.config.enable_version_diff {
            return Err(anyhow::anyhow!("Version diff tracking is disabled"));
        }

        let from_snapshot = self
            .model_snapshots
            .get(from_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
        let to_snapshot = self
            .model_snapshots
            .get(to_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;

        let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
        let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
        let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
        let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;

        let diff = VersionDiff {
            from_version: from_snapshot.version.clone(),
            to_version: to_snapshot.version.clone(),
            timestamp: Utc::now(),
            performance_delta,
            architecture_changes,
            config_changes,
            weight_changes,
        };

        self.version_diffs.push(diff.clone());
        Ok(diff)
    }
683
    /// Compares a current model against a baseline and classifies metric
    /// movements as regressions or improvements, recording the result in
    /// `regression_history`.
    ///
    /// Currently inspects two metrics: validation accuracy (absolute drop
    /// gated by `regression_sensitivity * 0.01`) and inference latency
    /// (relative increase gated by `performance_delta_threshold` percent).
    ///
    /// # Errors
    /// Fails when regression detection is disabled or either model name is
    /// not registered.
    pub async fn detect_regressions(
        &mut self,
        current_model: &str,
        baseline_model: &str,
    ) -> Result<RegressionDetectionResult> {
        if !self.config.enable_regression_detection {
            return Err(anyhow::anyhow!("Regression detection is disabled"));
        }

        let current = self
            .model_snapshots
            .get(current_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
        let baseline = self
            .model_snapshots
            .get(baseline_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;

        let mut regressions = Vec::new();
        let mut improvements = Vec::new();

        // Validation accuracy: a drop beyond the sensitivity-scaled floor
        // (e.g. sensitivity 0.8 -> 0.008 absolute accuracy) is a regression.
        if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
            let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
            if magnitude > self.config.regression_sensitivity * 0.01 {
                regressions.push(Regression {
                    metric: "validation_accuracy".to_string(),
                    current_value: current.metrics.val_accuracy,
                    previous_value: baseline.metrics.val_accuracy,
                    magnitude,
                    severity: self.classify_regression_severity(magnitude, "accuracy"),
                    possible_causes: vec![
                        "Learning rate too high".to_string(),
                        "Insufficient training".to_string(),
                        "Data distribution shift".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Reduce learning rate".to_string(),
                        "Increase training epochs".to_string(),
                        "Check data quality".to_string(),
                    ],
                });
            }
        } else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
            // Any accuracy gain counts as an improvement (no threshold).
            let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
            improvements.push(Improvement {
                metric: "validation_accuracy".to_string(),
                current_value: current.metrics.val_accuracy,
                previous_value: baseline.metrics.val_accuracy,
                magnitude,
                likely_causes: vec![
                    "Better optimization".to_string(),
                    "Improved architecture".to_string(),
                    "Better hyperparameters".to_string(),
                ],
            });
        }

        // Inference latency: flagged when the relative slowdown (percent)
        // exceeds the configured performance-delta threshold. Severity is
        // classified on the relative change; magnitude stays absolute (ms).
        if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
            let magnitude =
                current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
            let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
            if relative_change > self.config.performance_delta_threshold {
                regressions.push(Regression {
                    metric: "inference_latency".to_string(),
                    current_value: current.metrics.inference_latency_ms,
                    previous_value: baseline.metrics.inference_latency_ms,
                    magnitude,
                    severity: self.classify_regression_severity(relative_change, "latency"),
                    possible_causes: vec![
                        "Model complexity increased".to_string(),
                        "Inefficient implementation".to_string(),
                        "Hardware degradation".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Profile and optimize bottlenecks".to_string(),
                        "Consider model compression".to_string(),
                        "Check hardware configuration".to_string(),
                    ],
                });
            }
        }

        let critical_regressions = regressions
            .iter()
            .filter(|r| matches!(r.severity, RegressionSeverity::Critical))
            .count();

        // Health: zero on any critical regression; otherwise each
        // regression costs 0.1, clamped so the score never goes negative.
        let health_score = if critical_regressions > 0 {
            0.0
        } else {
            1.0 - (regressions.len() as f64 * 0.1).min(1.0)
        };

        let recommendation = if critical_regressions > 0 {
            "Critical regressions detected. Immediate action required.".to_string()
        } else if !regressions.is_empty() {
            "Some regressions detected. Review and address if necessary.".to_string()
        } else {
            "No significant regressions detected.".to_string()
        };

        let overall_assessment = RegressionAssessment {
            health_score,
            critical_regressions,
            improvements: improvements.len(),
            recommendation,
        };

        let result = RegressionDetectionResult {
            timestamp: Utc::now(),
            regressions,
            improvements,
            overall_assessment,
        };

        self.regression_history.push(result.clone());
        Ok(result)
    }
805
    /// Assembles a point-in-time report: history counts, the five most
    /// recent comparisons, the three most recent regression runs, and a
    /// per-model summary.
    pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
        Ok(DifferentialDebuggingReport {
            timestamp: Utc::now(),
            config: self.config.clone(),
            total_models: self.model_snapshots.len(),
            comparison_count: self.comparison_history.len(),
            ab_test_count: self.ab_tests.len(),
            version_diff_count: self.version_diffs.len(),
            regression_detection_count: self.regression_history.len(),
            // Histories are appended in order, so reversing yields
            // most-recent-first.
            recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
            recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
            model_summary: self.generate_model_summary(),
        })
    }
821
822 fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
825 let mut accuracy_values = HashMap::new();
826 let mut loss_values = HashMap::new();
827 let mut latency_values = HashMap::new();
828 let mut memory_values = HashMap::new();
829 let mut size_values = HashMap::new();
830
831 for model in models {
832 accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
833 loss_values.insert(model.name.clone(), model.metrics.val_loss);
834 latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
835 memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
836 size_values.insert(model.name.clone(), model.metrics.model_size_mb);
837 }
838
839 Ok(PerformanceComparison {
840 accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
841 loss_comparison: self.create_metric_comparison(loss_values, false)?,
842 latency_comparison: self.create_metric_comparison(latency_values, false)?,
843 memory_comparison: self.create_metric_comparison(memory_values, false)?,
844 size_comparison: self.create_metric_comparison(size_values, false)?,
845 custom_comparisons: HashMap::new(),
846 })
847 }
848
849 fn create_metric_comparison(
850 &self,
851 values: HashMap<String, f64>,
852 higher_is_better: bool,
853 ) -> Result<MetricComparison> {
854 let best_model = if higher_is_better {
855 values
856 .iter()
857 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
858 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
859 .0
860 .clone()
861 } else {
862 values
863 .iter()
864 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
865 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
866 .0
867 .clone()
868 };
869
870 let worst_model = if higher_is_better {
871 values
872 .iter()
873 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
874 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
875 .0
876 .clone()
877 } else {
878 values
879 .iter()
880 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
881 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
882 .0
883 .clone()
884 };
885
886 let best_value = values[&best_model];
887 let mut differences = HashMap::new();
888 let mut significant_differences = HashMap::new();
889
890 for (model, value) in &values {
891 let diff = if higher_is_better {
892 (value - best_value) / best_value * 100.0
893 } else {
894 (best_value - value) / best_value * 100.0
895 };
896 differences.insert(model.clone(), diff);
897 significant_differences.insert(model.clone(), diff.abs() > 1.0); }
899
900 Ok(MetricComparison {
901 values,
902 best_model,
903 worst_model,
904 differences,
905 significant_differences,
906 })
907 }
908
    /// Diffs every model's architecture against the first model in the
    /// list (the base) and averages pairwise similarity scores.
    ///
    /// # Errors
    /// Fails when fewer than two models are supplied.
    fn analyze_architecture_differences(
        &self,
        models: &[&ModelSnapshot],
    ) -> Result<ArchitectureDiff> {
        if models.len() < 2 {
            return Err(anyhow::anyhow!(
                "Need at least 2 models for architecture diff"
            ));
        }

        // The first model is the reference; all diffs are model minus base.
        let base_model = models[0];
        let mut parameter_diff = HashMap::new();
        let mut layer_diff = HashMap::new();
        let mut notable_differences = Vec::new();

        for model in models.iter().skip(1) {
            let param_diff = model.architecture.parameter_count as i64
                - base_model.architecture.parameter_count as i64;
            let layer_diff_val =
                model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;

            parameter_diff.insert(model.name.clone(), param_diff);
            layer_diff.insert(model.name.clone(), layer_diff_val);

            // Only parameter gaps beyond 1M are called out as notable.
            if param_diff.abs() > 1_000_000 {
                notable_differences.push(format!(
                    "Model '{}' has {} parameter difference",
                    model.name, param_diff
                ));
            }

            if layer_diff_val != 0 {
                notable_differences.push(format!(
                    "Model '{}' has {} layer difference",
                    model.name, layer_diff_val
                ));
            }
        }

        // Average base-vs-model similarity; the length-2 guard above
        // guarantees at least one score, so the division is safe.
        let mut similarity_scores = Vec::new();
        for model in models.iter().skip(1) {
            let score = self
                .calculate_architecture_similarity(&base_model.architecture, &model.architecture);
            similarity_scores.push(score);
        }
        let similarity_score =
            similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;

        Ok(ArchitectureDiff {
            parameter_diff,
            layer_diff,
            similarity_score,
            notable_differences,
        })
    }
965
966 fn calculate_architecture_similarity(
967 &self,
968 arch1: &ArchitectureInfo,
969 arch2: &ArchitectureInfo,
970 ) -> f64 {
971 let mut similarity = 0.0;
972 let mut features = 0;
973
974 let param_ratio = (arch1.parameter_count.min(arch2.parameter_count) as f64)
976 / (arch1.parameter_count.max(arch2.parameter_count) as f64);
977 similarity += param_ratio;
978 features += 1;
979
980 let layer_ratio = (arch1.layer_count.min(arch2.layer_count) as f64)
982 / (arch1.layer_count.max(arch2.layer_count) as f64);
983 similarity += layer_ratio;
984 features += 1;
985
986 let hidden_ratio = (arch1.hidden_size.min(arch2.hidden_size) as f64)
988 / (arch1.hidden_size.max(arch2.hidden_size) as f64);
989 similarity += hidden_ratio;
990 features += 1;
991
992 similarity / features as f64
993 }
994
995 fn perform_statistical_analysis(
996 &self,
997 _models: &[&ModelSnapshot],
998 ) -> Result<StatisticalAnalysis> {
999 Ok(StatisticalAnalysis {
1002 p_values: HashMap::new(),
1003 effect_sizes: HashMap::new(),
1004 confidence_intervals: HashMap::new(),
1005 significance_summary: HashMap::new(),
1006 })
1007 }
1008
    /// Builds the human-readable summary: the overall best model (chosen
    /// by validation accuracy), per-metric winner rankings, key findings,
    /// and generic recommendations.
    fn generate_comparison_summary(
        &self,
        _models: &[&ModelSnapshot],
        performance: &PerformanceComparison,
        _statistical: &StatisticalAnalysis,
    ) -> Result<ComparisonSummary> {
        let best_model = performance.accuracy_comparison.best_model.clone();

        // Rankings currently list only each metric's winner.
        let mut rankings = HashMap::new();
        rankings.insert(
            "accuracy".to_string(),
            vec![performance.accuracy_comparison.best_model.clone()],
        );
        rankings.insert(
            "latency".to_string(),
            vec![performance.latency_comparison.best_model.clone()],
        );

        // Indexing is safe: each best_model key originates from its own
        // values map.
        let key_findings = vec![
            format!(
                "Best accuracy: {} ({:.2}%)",
                performance.accuracy_comparison.best_model,
                performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
                    * 100.0
            ),
            format!(
                "Fastest inference: {} ({:.2}ms)",
                performance.latency_comparison.best_model,
                performance.latency_comparison.values[&performance.latency_comparison.best_model]
            ),
        ];

        let recommendations = vec![
            "Consider the trade-offs between accuracy and latency".to_string(),
            "Monitor memory usage for production deployment".to_string(),
        ];

        Ok(ComparisonSummary {
            best_model,
            rankings,
            key_findings,
            recommendations,
        })
    }
1053
1054 fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
1055 let mean = data.iter().sum::<f64>() / data.len() as f64;
1056 let variance = data.variance();
1057 let std_dev = variance.sqrt();
1058
1059 let mut sorted_data = data.to_vec();
1060 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1061
1062 let min = sorted_data[0];
1063 let max = sorted_data[sorted_data.len() - 1];
1064 let median = sorted_data[sorted_data.len() / 2];
1065 let q25 = sorted_data[sorted_data.len() / 4];
1066 let q75 = sorted_data[3 * sorted_data.len() / 4];
1067
1068 SummaryStats {
1069 mean,
1070 std_dev,
1071 min,
1072 max,
1073 median,
1074 q25,
1075 q75,
1076 }
1077 }
1078
    /// Runs a two-sample t-test on the "primary_metric" samples of both
    /// arms; returns an empty map when either arm lacks that key.
    ///
    /// NOTE(review): the p-value is a coarse placeholder (0.01 when
    /// |t| > 2, else 0.1) and the confidence interval uses a fixed
    /// normal-approximation 1.96 * SE margin — replace with a proper
    /// t-distribution lookup before relying on significance decisions.
    fn perform_ab_statistical_tests(
        &self,
        model_a: &ABTestMetrics,
        model_b: &ABTestMetrics,
    ) -> Result<HashMap<String, StatisticalTestResult>> {
        let mut results = HashMap::new();

        if let (Some(a_data), Some(b_data)) = (
            model_a.metrics.get("primary_metric"),
            model_b.metrics.get("primary_metric"),
        ) {
            let a_mean = a_data.mean();
            let b_mean = b_data.mean();
            let a_var = a_data.variance();
            let b_var = b_data.variance();

            // Pooled std as a simple average of the variances; assumes
            // roughly comparable sample sizes.
            let pooled_std = ((a_var + b_var) / 2.0).sqrt();
            let standard_error =
                pooled_std * (1.0 / a_data.len() as f64 + 1.0 / b_data.len() as f64).sqrt();
            let t_statistic = (a_mean - b_mean) / standard_error;

            // Placeholder significance decision; not a true p-value.
            let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };

            // Standardized effect size (Cohen's d style: mean difference
            // over pooled std).
            let effect_size = (a_mean - b_mean) / pooled_std;
            let margin_of_error = 1.96 * standard_error;
            results.insert(
                "primary_metric".to_string(),
                StatisticalTestResult {
                    test_type: "Welch's t-test".to_string(),
                    statistic: t_statistic,
                    p_value,
                    effect_size,
                    confidence_interval: (
                        a_mean - b_mean - margin_of_error,
                        a_mean - b_mean + margin_of_error,
                    ),
                    is_significant: p_value < 0.05,
                },
            );
        }

        Ok(results)
    }
1126
    /// Derives a winner and recommendation from the primary-metric test.
    ///
    /// NOTE(review): a positive effect size picks model A, i.e. the arm
    /// with the higher mean wins — this assumes higher metric values are
    /// better; confirm before using with loss-like metrics.
    fn generate_ab_conclusion(
        &self,
        config: &ABTestConfig,
        _model_a: &ABTestMetrics,
        _model_b: &ABTestMetrics,
        tests: &HashMap<String, StatisticalTestResult>,
    ) -> Result<ABTestConclusion> {
        let primary_test = tests.get("primary_metric");

        let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
            let winner = if test.effect_size > 0.0 {
                Some(config.model_a.clone())
            } else {
                Some(config.model_b.clone())
            };

            // Confidence is a coarse two-level mapping from significance.
            let confidence = if test.is_significant { 0.95 } else { 0.5 };
            let practical_significance = test.effect_size.abs() > config.min_effect_size;

            (winner, confidence, practical_significance)
        } else {
            // No primary-metric test available: undecided.
            (None, 0.5, false)
        };

        let recommendation = if practical_significance && confidence > 0.9 {
            format!(
                "Recommend deploying {}",
                winner.as_ref().expect(
                    "winner should be Some when practical_significance and confidence > 0.9"
                )
            )
        } else {
            "Insufficient evidence for a clear recommendation".to_string()
        };

        let summary = format!(
            "A/B test completed with {} confidence",
            if confidence > 0.9 { "high" } else { "low" }
        );

        Ok(ABTestConclusion {
            winner,
            confidence,
            practical_significance,
            recommendation,
            summary,
        })
    }
1175
1176 fn calculate_performance_delta(
1177 &self,
1178 from: &ModelSnapshot,
1179 to: &ModelSnapshot,
1180 ) -> Result<PerformanceDelta> {
1181 Ok(PerformanceDelta {
1182 accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
1183 loss_delta: to.metrics.val_loss - from.metrics.val_loss,
1184 latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
1185 memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
1186 size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
1187 training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
1188 custom_deltas: HashMap::new(),
1189 })
1190 }
1191
1192 fn detect_architecture_changes(
1193 &self,
1194 from: &ModelSnapshot,
1195 to: &ModelSnapshot,
1196 ) -> Result<Vec<ArchitectureChange>> {
1197 let mut changes = Vec::new();
1198
1199 if from.architecture.parameter_count != to.architecture.parameter_count {
1200 changes.push(ArchitectureChange {
1201 change_type: "Parameter Count".to_string(),
1202 description: format!(
1203 "Changed from {} to {} parameters",
1204 from.architecture.parameter_count, to.architecture.parameter_count
1205 ),
1206 impact: "Affects model capacity and memory usage".to_string(),
1207 });
1208 }
1209
1210 if from.architecture.layer_count != to.architecture.layer_count {
1211 changes.push(ArchitectureChange {
1212 change_type: "Layer Count".to_string(),
1213 description: format!(
1214 "Changed from {} to {} layers",
1215 from.architecture.layer_count, to.architecture.layer_count
1216 ),
1217 impact: "Affects model depth and training dynamics".to_string(),
1218 });
1219 }
1220
1221 Ok(changes)
1222 }
1223
1224 fn detect_config_changes(
1225 &self,
1226 from: &ModelSnapshot,
1227 to: &ModelSnapshot,
1228 ) -> Result<Vec<ConfigChange>> {
1229 let mut changes = Vec::new();
1230
1231 if from.training_config.learning_rate != to.training_config.learning_rate {
1232 changes.push(ConfigChange {
1233 parameter: "learning_rate".to_string(),
1234 old_value: from.training_config.learning_rate.to_string(),
1235 new_value: to.training_config.learning_rate.to_string(),
1236 impact: "Affects training speed and convergence".to_string(),
1237 });
1238 }
1239
1240 if from.training_config.batch_size != to.training_config.batch_size {
1241 changes.push(ConfigChange {
1242 parameter: "batch_size".to_string(),
1243 old_value: from.training_config.batch_size.to_string(),
1244 new_value: to.training_config.batch_size.to_string(),
1245 impact: "Affects gradient noise and memory usage".to_string(),
1246 });
1247 }
1248
1249 Ok(changes)
1250 }
1251
    /// Summarizes weight drift between two snapshots.
    ///
    /// Works only from the stored `WeightsSummary` aggregates (not raw
    /// weights), so the numbers are coarse: the absolute shift of the mean
    /// weight, the shift of the max weight, and a heuristic significance
    /// ratio (0.8 when the mean shifted by more than 0.01, else 0.2 —
    /// placeholder constants, not a measured ratio). Per-layer changes are
    /// not tracked here.
    fn analyze_weight_changes(
        &self,
        from: &ModelSnapshot,
        to: &ModelSnapshot,
    ) -> Result<WeightChangesSummary> {
        let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
        let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
        let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };

        Ok(WeightChangesSummary {
            avg_magnitude,
            max_change,
            significant_change_ratio,
            layer_changes: HashMap::new(),
        })
    }
1269
1270 fn classify_regression_severity(
1271 &self,
1272 magnitude: f64,
1273 metric_type: &str,
1274 ) -> RegressionSeverity {
1275 match metric_type {
1276 "accuracy" => {
1277 if magnitude > 0.1 {
1278 RegressionSeverity::Critical
1279 } else if magnitude > 0.05 {
1280 RegressionSeverity::Major
1281 } else if magnitude > 0.02 {
1282 RegressionSeverity::Minor
1283 } else {
1284 RegressionSeverity::Negligible
1285 }
1286 },
1287 "latency" => {
1288 if magnitude > 50.0 {
1289 RegressionSeverity::Critical
1290 } else if magnitude > 20.0 {
1291 RegressionSeverity::Major
1292 } else if magnitude > 10.0 {
1293 RegressionSeverity::Minor
1294 } else {
1295 RegressionSeverity::Negligible
1296 }
1297 },
1298 _ => RegressionSeverity::Minor,
1299 }
1300 }
1301
1302 fn generate_model_summary(&self) -> HashMap<String, String> {
1303 let mut summary = HashMap::new();
1304
1305 if let Some((best_name, best_model)) = self.model_snapshots.iter().max_by(|a, b| {
1306 a.1.metrics
1307 .val_accuracy
1308 .partial_cmp(&b.1.metrics.val_accuracy)
1309 .unwrap_or(std::cmp::Ordering::Equal)
1310 }) {
1311 summary.insert("best_accuracy_model".to_string(), best_name.clone());
1312 summary.insert(
1313 "best_accuracy_value".to_string(),
1314 format!("{:.4}", best_model.metrics.val_accuracy),
1315 );
1316 }
1317
1318 if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
1319 a.1.metrics
1320 .inference_latency_ms
1321 .partial_cmp(&b.1.metrics.inference_latency_ms)
1322 .unwrap_or(std::cmp::Ordering::Equal)
1323 }) {
1324 summary.insert("fastest_model".to_string(), fastest_name.clone());
1325 summary.insert(
1326 "fastest_latency".to_string(),
1327 format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
1328 );
1329 }
1330
1331 summary.insert(
1332 "total_models".to_string(),
1333 self.model_snapshots.len().to_string(),
1334 );
1335 summary
1336 }
1337}
1338
/// Aggregated report of differential-debugging activity: per-analysis
/// counts plus the most recent results and a model summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
    /// When this report was generated.
    pub timestamp: DateTime<Utc>,
    /// Configuration in effect at report time.
    pub config: DifferentialDebuggingConfig,
    /// Number of model snapshots currently tracked.
    pub total_models: usize,
    /// Count of model comparisons performed.
    pub comparison_count: usize,
    /// Count of A/B tests analyzed.
    pub ab_test_count: usize,
    /// Count of version diffs computed.
    pub version_diff_count: usize,
    /// Count of regression-detection runs.
    pub regression_detection_count: usize,
    /// Most recent model-comparison results.
    pub recent_comparisons: Vec<ModelComparisonResult>,
    /// Most recent regression-detection results.
    pub recent_regressions: Vec<RegressionDetectionResult>,
    /// Key/value summary (best-accuracy model, fastest model, total count —
    /// see `generate_model_summary`).
    pub model_summary: HashMap<String, String>,
}
1353
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed debugger has no snapshots.
    ///
    /// Fix: this test (and the one below) contained no `.await`, so the
    /// `#[tokio::test]` attribute only added the overhead of spinning up an
    /// async runtime — plain `#[test]` is sufficient.
    #[test]
    fn test_differential_debugger_creation() {
        let config = DifferentialDebuggingConfig::default();
        let debugger = DifferentialDebugger::new(config);
        assert_eq!(debugger.model_snapshots.len(), 0);
    }

    /// Adding a snapshot makes it visible in the store.
    #[test]
    fn test_model_snapshot_addition() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot = create_test_snapshot("test_model");
        debugger.add_model_snapshot(snapshot).expect("add operation failed");
        assert_eq!(debugger.model_snapshots.len(), 1);
    }

    /// Two registered models can be compared without error.
    /// This one genuinely awaits `compare_models`, so it keeps the
    /// async tokio harness.
    #[tokio::test]
    async fn test_model_comparison() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot1 = create_test_snapshot("model_a");
        let snapshot2 = create_test_snapshot("model_b");

        debugger.add_model_snapshot(snapshot1).expect("add operation failed");
        debugger.add_model_snapshot(snapshot2).expect("add operation failed");

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
    }

    /// Builds a fully populated snapshot with fixed, deterministic metrics
    /// for use as a test fixture.
    fn create_test_snapshot(name: &str) -> ModelSnapshot {
        ModelSnapshot {
            id: Uuid::new_v4(),
            name: name.to_string(),
            timestamp: Utc::now(),
            version: "1.0.0".to_string(),
            commit_hash: Some("abc123".to_string()),
            metrics: ModelMetrics {
                train_accuracy: 0.95,
                val_accuracy: 0.90,
                test_accuracy: Some(0.88),
                train_loss: 0.05,
                val_loss: 0.10,
                test_loss: Some(0.12),
                inference_latency_ms: 50.0,
                memory_usage_mb: 2048.0,
                model_size_mb: 500.0,
                flops: 1_000_000_000,
                training_time_s: 3600.0,
                custom_metrics: HashMap::new(),
            },
            architecture: ArchitectureInfo {
                parameter_count: 175_000_000,
                layer_count: 24,
                depth: 24,
                hidden_size: 1024,
                num_heads: Some(16),
                ff_dim: Some(4096),
                vocab_size: Some(50257),
                max_seq_length: Some(2048),
            },
            training_config: TrainingConfig {
                learning_rate: 1e-4,
                batch_size: 32,
                epochs: 10,
                optimizer: "AdamW".to_string(),
                lr_schedule: Some("cosine".to_string()),
                regularization: HashMap::new(),
            },
            weights_summary: WeightsSummary {
                mean: 0.0,
                std_dev: 0.1,
                min: -0.5,
                max: 0.5,
                percentiles: HashMap::new(),
                zero_count: 1000,
                sparsity: 0.01,
            },
            metadata: HashMap::new(),
        }
    }
}