trustformers_debug/
differential_debugging.rs

1//! # Differential Debugging System
2//!
3//! Advanced model comparison, A/B analysis, version diff tracking, regression identification,
4//! and performance delta analysis for TrustformeRS models.
5
6use anyhow::Result;
7use chrono::{DateTime, Utc};
8use indexmap::IndexMap;
9// use scirs2_core::ndarray::*; // SciRS2 Integration Policy - was: use ndarray::{Array1, Array2};
10use serde::{Deserialize, Serialize};
11use statrs::statistics::Statistics;
12use std::collections::HashMap;
13use uuid::Uuid;
14
15/// Configuration for differential debugging
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct DifferentialDebuggingConfig {
18    /// Enable model comparison analysis
19    pub enable_model_comparison: bool,
20    /// Enable A/B testing analysis
21    pub enable_ab_analysis: bool,
22    /// Enable version diff tracking
23    pub enable_version_diff: bool,
24    /// Enable regression identification
25    pub enable_regression_detection: bool,
26    /// Enable performance delta analysis
27    pub enable_performance_delta: bool,
28    /// Statistical significance threshold for comparisons
29    pub significance_threshold: f64,
30    /// Maximum number of models to compare simultaneously
31    pub max_comparison_models: usize,
32    /// Regression detection sensitivity (0.0 to 1.0)
33    pub regression_sensitivity: f64,
34    /// Performance delta threshold (percentage)
35    pub performance_delta_threshold: f64,
36}
37
38impl Default for DifferentialDebuggingConfig {
39    fn default() -> Self {
40        Self {
41            enable_model_comparison: true,
42            enable_ab_analysis: true,
43            enable_version_diff: true,
44            enable_regression_detection: true,
45            enable_performance_delta: true,
46            significance_threshold: 0.05,
47            max_comparison_models: 10,
48            regression_sensitivity: 0.8,
49            performance_delta_threshold: 5.0,
50        }
51    }
52}
53
/// Model snapshot for comparison
///
/// A point-in-time record of a model's metrics, architecture, training
/// configuration, and weight statistics. Snapshots are stored and looked up
/// by `name` in [`DifferentialDebugger`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
    /// Unique identifier for the model snapshot
    pub id: Uuid,
    /// Model name or version identifier (used as the storage/lookup key)
    pub name: String,
    /// Timestamp when snapshot was created
    pub timestamp: DateTime<Utc>,
    /// Model version information
    pub version: String,
    /// Git commit hash (if available)
    pub commit_hash: Option<String>,
    /// Model performance metrics
    pub metrics: ModelMetrics,
    /// Model architecture information
    pub architecture: ArchitectureInfo,
    /// Training configuration
    pub training_config: TrainingConfig,
    /// Model weights summary statistics
    pub weights_summary: WeightsSummary,
    /// Additional metadata (free-form key/value pairs)
    pub metadata: HashMap<String, String>,
}
78
/// Performance metrics for a model
///
/// NOTE(review): accuracies appear to be fractions in [0, 1] — regression
/// detection compares accuracy drops against a threshold on the order of
/// 0.01 — confirm against whatever produces these snapshots.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Training accuracy
    pub train_accuracy: f64,
    /// Validation accuracy
    pub val_accuracy: f64,
    /// Test accuracy (if available)
    pub test_accuracy: Option<f64>,
    /// Training loss
    pub train_loss: f64,
    /// Validation loss
    pub val_loss: f64,
    /// Test loss (if available)
    pub test_loss: Option<f64>,
    /// Inference latency (ms)
    pub inference_latency_ms: f64,
    /// Memory usage (MB)
    pub memory_usage_mb: f64,
    /// Model size (MB)
    pub model_size_mb: f64,
    /// FLOPS count
    pub flops: u64,
    /// Training time (seconds)
    pub training_time_s: f64,
    /// Custom metrics keyed by metric name
    pub custom_metrics: HashMap<String, f64>,
}
107
/// Architecture information
///
/// Structural description of a model; optional fields cover
/// transformer-specific dimensions that may not apply to every model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
    /// Number of parameters
    pub parameter_count: u64,
    /// Number of layers
    pub layer_count: u32,
    /// Model depth
    pub depth: u32,
    /// Hidden dimension size
    pub hidden_size: u32,
    /// Number of attention heads
    pub num_heads: Option<u32>,
    /// Feed-forward dimension
    pub ff_dim: Option<u32>,
    /// Vocabulary size
    pub vocab_size: Option<u32>,
    /// Sequence length
    pub max_seq_length: Option<u32>,
}
128
/// Training configuration
///
/// Hyperparameters the snapshot's model was trained with; used by version
/// diffing to surface configuration changes between versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
    /// Learning rate
    pub learning_rate: f64,
    /// Batch size
    pub batch_size: u32,
    /// Number of epochs
    pub epochs: u32,
    /// Optimizer type (e.g. name of the optimizer used)
    pub optimizer: String,
    /// Learning rate schedule
    pub lr_schedule: Option<String>,
    /// Regularization parameters keyed by parameter name
    pub regularization: HashMap<String, f64>,
}
145
/// Summary statistics for model weights
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
    /// Mean weight value
    pub mean: f64,
    /// Standard deviation of weights
    pub std_dev: f64,
    /// Minimum weight value
    pub min: f64,
    /// Maximum weight value
    pub max: f64,
    /// Weight distribution percentiles, keyed by percentile label
    /// (key format is producer-defined — confirm with snapshot source)
    pub percentiles: HashMap<String, f64>,
    /// Number of zero weights
    pub zero_count: u64,
    /// Sparsity ratio (fraction of zero weights, 0.0 to 1.0 — presumably)
    pub sparsity: f64,
}
164
/// Result of model comparison analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
    /// Models being compared (names, in the order requested)
    pub models: Vec<String>,
    /// Comparison timestamp
    pub timestamp: DateTime<Utc>,
    /// Performance comparison
    pub performance_comparison: PerformanceComparison,
    /// Architecture differences
    pub architecture_diff: ArchitectureDiff,
    /// Statistical significance results
    pub statistical_analysis: StatisticalAnalysis,
    /// Overall comparison summary
    pub summary: ComparisonSummary,
}

/// Performance comparison between models
///
/// One [`MetricComparison`] per tracked metric. Accuracy is compared
/// higher-is-better; loss, latency, memory, and size lower-is-better.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    /// Accuracy comparison (validation accuracy)
    pub accuracy_comparison: MetricComparison,
    /// Loss comparison (validation loss)
    pub loss_comparison: MetricComparison,
    /// Latency comparison (inference latency, ms)
    pub latency_comparison: MetricComparison,
    /// Memory usage comparison (MB)
    pub memory_comparison: MetricComparison,
    /// Model size comparison (MB)
    pub size_comparison: MetricComparison,
    /// Custom metric comparisons keyed by metric name
    pub custom_comparisons: HashMap<String, MetricComparison>,
}

/// Comparison result for a specific metric
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
    /// Values for each model, keyed by model name
    pub values: HashMap<String, f64>,
    /// Best performing model for this metric
    pub best_model: String,
    /// Worst performing model for this metric
    pub worst_model: String,
    /// Performance differences relative to the best model, as percentages
    pub differences: HashMap<String, f64>,
    /// Whether each model's difference exceeds the significance cutoff
    /// (absolute difference above 1%)
    pub significant_differences: HashMap<String, bool>,
}

/// Architecture differences between models
///
/// Diffs are relative to the first model in the comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
    /// Parameter count differences (model - base), keyed by model name
    pub parameter_diff: HashMap<String, i64>,
    /// Layer count differences (model - base), keyed by model name
    pub layer_diff: HashMap<String, i32>,
    /// Architecture similarity score (0.0 to 1.0)
    pub similarity_score: f64,
    /// Notable differences, as human-readable descriptions
    pub notable_differences: Vec<String>,
}

/// Statistical analysis of comparisons
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    /// P-values for metric comparisons, keyed by metric name
    pub p_values: HashMap<String, f64>,
    /// Effect sizes (Cohen's d), keyed by metric name
    pub effect_sizes: HashMap<String, f64>,
    /// Confidence intervals (lower, upper), keyed by metric name
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    /// Statistical significance summary, keyed by metric name
    pub significance_summary: HashMap<String, bool>,
}

/// Overall comparison summary
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
    /// Overall best model
    pub best_model: String,
    /// Model rankings keyed by ranking criterion
    pub rankings: HashMap<String, Vec<String>>,
    /// Key findings
    pub key_findings: Vec<String>,
    /// Recommendations
    pub recommendations: Vec<String>,
}
252
/// A/B test configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    /// Test name
    pub name: String,
    /// Model A identifier
    pub model_a: String,
    /// Model B identifier
    pub model_b: String,
    /// Test duration (if applicable)
    pub duration_hours: Option<u32>,
    /// Sample size for each group
    pub sample_size: u32,
    /// Metrics to track
    pub tracked_metrics: Vec<String>,
    /// Minimum detectable effect size
    pub min_effect_size: f64,
    /// Statistical power (e.g. 0.8 — presumably a probability; confirm)
    pub power: f64,
}

/// A/B test result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
    /// Test configuration
    pub config: ABTestConfig,
    /// Test start time
    pub start_time: DateTime<Utc>,
    /// Test end time (`None` while the test is still running)
    pub end_time: Option<DateTime<Utc>>,
    /// Model A results
    pub model_a_results: ABTestMetrics,
    /// Model B results
    pub model_b_results: ABTestMetrics,
    /// Statistical test results, keyed by metric name
    pub statistical_tests: HashMap<String, StatisticalTestResult>,
    /// Test conclusion
    pub conclusion: ABTestConclusion,
}

/// Metrics for A/B test
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
    /// Sample size (number of observations)
    pub sample_size: u32,
    /// Raw metric observations, keyed by metric name
    pub metrics: HashMap<String, Vec<f64>>,
    /// Summary statistics, keyed by metric name
    pub summary_stats: HashMap<String, SummaryStats>,
}

/// Summary statistics for one metric's observations
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
    /// Arithmetic mean
    pub mean: f64,
    /// Standard deviation
    pub std_dev: f64,
    /// Minimum observed value
    pub min: f64,
    /// Maximum observed value
    pub max: f64,
    /// Median (50th percentile)
    pub median: f64,
    /// 25th percentile
    pub q25: f64,
    /// 75th percentile
    pub q75: f64,
}

/// Statistical test result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    /// Test type (t-test, Mann-Whitney U, etc.)
    pub test_type: String,
    /// Test statistic
    pub statistic: f64,
    /// P-value
    pub p_value: f64,
    /// Effect size
    pub effect_size: f64,
    /// Confidence interval for difference (lower, upper)
    pub confidence_interval: (f64, f64),
    /// Is result statistically significant?
    pub is_significant: bool,
}

/// A/B test conclusion
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
    /// Winner (if any) — the winning model's identifier
    pub winner: Option<String>,
    /// Confidence level
    pub confidence: f64,
    /// Practical significance (is the effect large enough to matter?)
    pub practical_significance: bool,
    /// Recommendation
    pub recommendation: String,
    /// Summary
    pub summary: String,
}
347
/// Version diff tracking information
///
/// Records how a model changed between two snapshots (from -> to).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
    /// Previous version
    pub from_version: String,
    /// Current version
    pub to_version: String,
    /// Diff timestamp
    pub timestamp: DateTime<Utc>,
    /// Performance changes
    pub performance_delta: PerformanceDelta,
    /// Architecture changes
    pub architecture_changes: Vec<ArchitectureChange>,
    /// Configuration changes
    pub config_changes: Vec<ConfigChange>,
    /// Weight changes summary
    pub weight_changes: WeightChangesSummary,
}

/// Performance delta between versions
///
/// Sign convention (positive = increase vs. the previous version) is set
/// by the producing diff logic — confirm before interpreting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
    /// Accuracy change
    pub accuracy_delta: f64,
    /// Loss change
    pub loss_delta: f64,
    /// Latency change
    pub latency_delta: f64,
    /// Memory usage change
    pub memory_delta: f64,
    /// Model size change
    pub size_delta: f64,
    /// Training time change
    pub training_time_delta: f64,
    /// Custom metric changes, keyed by metric name
    pub custom_deltas: HashMap<String, f64>,
}

/// Architecture change description
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
    /// Type of change
    pub change_type: String,
    /// Description
    pub description: String,
    /// Impact assessment
    pub impact: String,
}

/// Configuration change description
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
    /// Parameter name
    pub parameter: String,
    /// Old value (stringified)
    pub old_value: String,
    /// New value (stringified)
    pub new_value: String,
    /// Change impact
    pub impact: String,
}

/// Summary of weight changes
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
    /// Average magnitude of weight changes
    pub avg_magnitude: f64,
    /// Maximum weight change
    pub max_change: f64,
    /// Percentage of weights that changed significantly
    pub significant_change_ratio: f64,
    /// Layer-wise change summary, keyed by layer name
    pub layer_changes: HashMap<String, f64>,
}
422
/// Regression detection result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
    /// Analysis timestamp
    pub timestamp: DateTime<Utc>,
    /// Detected regressions
    pub regressions: Vec<Regression>,
    /// Performance improvements
    pub improvements: Vec<Improvement>,
    /// Overall assessment
    pub overall_assessment: RegressionAssessment,
}

/// Detected regression
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
    /// Metric that regressed
    pub metric: String,
    /// Current value
    pub current_value: f64,
    /// Previous (baseline) value
    pub previous_value: f64,
    /// Regression magnitude (absolute change vs. the baseline)
    pub magnitude: f64,
    /// Severity level
    pub severity: RegressionSeverity,
    /// Possible causes
    pub possible_causes: Vec<String>,
    /// Suggested fixes
    pub suggested_fixes: Vec<String>,
}

/// Performance improvement
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
    /// Metric that improved
    pub metric: String,
    /// Current value
    pub current_value: f64,
    /// Previous (baseline) value
    pub previous_value: f64,
    /// Improvement magnitude (absolute change vs. the baseline)
    pub magnitude: f64,
    /// Likely causes
    pub likely_causes: Vec<String>,
}

/// Regression severity levels, from most to least severe
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
    /// Severe regression requiring immediate attention
    Critical,
    /// Substantial regression that should be addressed
    Major,
    /// Small regression worth reviewing
    Minor,
    /// Regression too small to act on
    Negligible,
}

/// Overall regression assessment
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
    /// Overall health score (0.0 to 1.0; 0.0 when any critical regression exists)
    pub health_score: f64,
    /// Number of critical regressions
    pub critical_regressions: usize,
    /// Number of improvements
    pub improvements: usize,
    /// Recommendation
    pub recommendation: String,
}
491
/// Main differential debugging analyzer
///
/// Holds model snapshots keyed by name (insertion order preserved) plus the
/// accumulated history of every analysis run through it.
#[derive(Debug)]
pub struct DifferentialDebugger {
    /// Analyzer configuration (feature toggles and thresholds)
    config: DifferentialDebuggingConfig,
    /// Snapshots keyed by model name; insertion order is used when evicting
    /// the oldest entry at capacity
    model_snapshots: IndexMap<String, ModelSnapshot>,
    /// Results of past model comparisons
    comparison_history: Vec<ModelComparisonResult>,
    /// Results of past A/B tests
    ab_tests: Vec<ABTestResult>,
    /// Recorded version-to-version diffs
    version_diffs: Vec<VersionDiff>,
    /// Results of past regression detection runs
    regression_history: Vec<RegressionDetectionResult>,
}
502
503impl DifferentialDebugger {
504    /// Create a new differential debugger
505    pub fn new(config: DifferentialDebuggingConfig) -> Self {
506        Self {
507            config,
508            model_snapshots: IndexMap::new(),
509            comparison_history: Vec::new(),
510            ab_tests: Vec::new(),
511            version_diffs: Vec::new(),
512            regression_history: Vec::new(),
513        }
514    }
515
516    /// Add a model snapshot for comparison
517    pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
518        if self.model_snapshots.len() >= self.config.max_comparison_models {
519            // Remove oldest snapshot
520            self.model_snapshots.shift_remove_index(0);
521        }
522
523        self.model_snapshots.insert(snapshot.name.clone(), snapshot);
524        Ok(())
525    }
526
    /// Compare two or more models by name.
    ///
    /// Looks up each name among the stored snapshots, runs performance,
    /// architecture, and statistical analyses, generates a summary, and
    /// appends the result to the comparison history.
    ///
    /// # Errors
    /// Returns an error if model comparison is disabled in the config, if
    /// fewer than two names are given, or if any name has no stored snapshot.
    pub async fn compare_models(
        &mut self,
        model_names: Vec<String>,
    ) -> Result<ModelComparisonResult> {
        if !self.config.enable_model_comparison {
            return Err(anyhow::anyhow!("Model comparison is disabled"));
        }

        if model_names.len() < 2 {
            return Err(anyhow::anyhow!(
                "At least two models are required for comparison"
            ));
        }

        // Get model snapshots; fails fast on the first unknown name.
        let models: Vec<&ModelSnapshot> = model_names
            .iter()
            .map(|name| {
                self.model_snapshots
                    .get(name)
                    .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
            })
            .collect::<Result<Vec<_>>>()?;

        // Perform comparison analysis
        let performance_comparison = self.compare_performance(&models)?;
        let architecture_diff = self.analyze_architecture_differences(&models)?;
        let statistical_analysis = self.perform_statistical_analysis(&models)?;
        let summary = self.generate_comparison_summary(
            &models,
            &performance_comparison,
            &statistical_analysis,
        )?;

        let result = ModelComparisonResult {
            models: model_names,
            timestamp: Utc::now(),
            performance_comparison,
            architecture_diff,
            statistical_analysis,
            summary,
        };

        // Keep a copy in the history; the caller gets the original.
        self.comparison_history.push(result.clone());
        Ok(result)
    }
574
    /// Run A/B test analysis on two sets of observations.
    ///
    /// `model_a_data` / `model_b_data` are raw observations of a single
    /// metric (stored under the key `"primary_metric"`). Computes summary
    /// statistics for both groups, runs statistical tests, generates a
    /// conclusion, and appends the result to the A/B test history.
    ///
    /// # Errors
    /// Returns an error if A/B analysis is disabled in the config, or if
    /// the statistical tests / conclusion generation fail.
    pub async fn run_ab_test(
        &mut self,
        config: ABTestConfig,
        model_a_data: Vec<f64>,
        model_b_data: Vec<f64>,
    ) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }

        let start_time = Utc::now();

        // Calculate summary statistics for both models
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);

        // Wrap the raw observations and their summary under a single
        // "primary_metric" key for each group.
        let model_a_results = ABTestMetrics {
            sample_size: model_a_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_a_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_a_stats);
                stats
            },
        };

        let model_b_results = ABTestMetrics {
            sample_size: model_b_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_b_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_b_stats);
                stats
            },
        };

        // Perform statistical tests
        let statistical_tests =
            self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;

        // Generate conclusion
        let conclusion = self.generate_ab_conclusion(
            &config,
            &model_a_results,
            &model_b_results,
            &statistical_tests,
        )?;

        // end_time is set immediately: this analysis runs to completion
        // within this call rather than tracking a live test.
        let result = ABTestResult {
            config,
            start_time,
            end_time: Some(Utc::now()),
            model_a_results,
            model_b_results,
            statistical_tests,
            conclusion,
        };

        self.ab_tests.push(result.clone());
        Ok(result)
    }
645
    /// Track version differences between two stored snapshots.
    ///
    /// Computes the performance delta, architecture changes, configuration
    /// changes, and weight-change summary from `from_model` to `to_model`,
    /// and appends the resulting diff to the version-diff history.
    ///
    /// # Errors
    /// Returns an error if version diff tracking is disabled in the config
    /// or if either model name has no stored snapshot.
    pub async fn track_version_diff(
        &mut self,
        from_model: &str,
        to_model: &str,
    ) -> Result<VersionDiff> {
        if !self.config.enable_version_diff {
            return Err(anyhow::anyhow!("Version diff tracking is disabled"));
        }

        let from_snapshot = self
            .model_snapshots
            .get(from_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
        let to_snapshot = self
            .model_snapshots
            .get(to_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;

        // Each analysis compares the two snapshots independently.
        let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
        let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
        let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
        let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;

        let diff = VersionDiff {
            from_version: from_snapshot.version.clone(),
            to_version: to_snapshot.version.clone(),
            timestamp: Utc::now(),
            performance_delta,
            architecture_changes,
            config_changes,
            weight_changes,
        };

        self.version_diffs.push(diff.clone());
        Ok(diff)
    }
683
    /// Detect performance regressions of `current_model` against `baseline_model`.
    ///
    /// Checks two metrics: validation accuracy (flagged when the absolute
    /// drop exceeds `regression_sensitivity * 0.01`) and inference latency
    /// (flagged when the relative increase exceeds
    /// `performance_delta_threshold` percent). Accuracy gains are recorded
    /// as improvements. The result, including an overall health assessment,
    /// is appended to the regression history.
    ///
    /// # Errors
    /// Returns an error if regression detection is disabled in the config
    /// or if either model name has no stored snapshot.
    pub async fn detect_regressions(
        &mut self,
        current_model: &str,
        baseline_model: &str,
    ) -> Result<RegressionDetectionResult> {
        if !self.config.enable_regression_detection {
            return Err(anyhow::anyhow!("Regression detection is disabled"));
        }

        let current = self
            .model_snapshots
            .get(current_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
        let baseline = self
            .model_snapshots
            .get(baseline_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;

        let mut regressions = Vec::new();
        let mut improvements = Vec::new();

        // Check accuracy regression: an absolute drop in validation accuracy
        // above sensitivity * 0.01 counts as a regression.
        if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
            let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
            if magnitude > self.config.regression_sensitivity * 0.01 {
                regressions.push(Regression {
                    metric: "validation_accuracy".to_string(),
                    current_value: current.metrics.val_accuracy,
                    previous_value: baseline.metrics.val_accuracy,
                    magnitude,
                    severity: self.classify_regression_severity(magnitude, "accuracy"),
                    possible_causes: vec![
                        "Learning rate too high".to_string(),
                        "Insufficient training".to_string(),
                        "Data distribution shift".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Reduce learning rate".to_string(),
                        "Increase training epochs".to_string(),
                        "Check data quality".to_string(),
                    ],
                });
            }
        } else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
            // Any accuracy gain (no threshold) is recorded as an improvement.
            let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
            improvements.push(Improvement {
                metric: "validation_accuracy".to_string(),
                current_value: current.metrics.val_accuracy,
                previous_value: baseline.metrics.val_accuracy,
                magnitude,
                likely_causes: vec![
                    "Better optimization".to_string(),
                    "Improved architecture".to_string(),
                    "Better hyperparameters".to_string(),
                ],
            });
        }

        // Check latency regression: a relative slowdown above the configured
        // percentage threshold counts as a regression.
        if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
            let magnitude =
                current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
            let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
            if relative_change > self.config.performance_delta_threshold {
                regressions.push(Regression {
                    metric: "inference_latency".to_string(),
                    current_value: current.metrics.inference_latency_ms,
                    previous_value: baseline.metrics.inference_latency_ms,
                    magnitude,
                    severity: self.classify_regression_severity(relative_change, "latency"),
                    possible_causes: vec![
                        "Model complexity increased".to_string(),
                        "Inefficient implementation".to_string(),
                        "Hardware degradation".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Profile and optimize bottlenecks".to_string(),
                        "Consider model compression".to_string(),
                        "Check hardware configuration".to_string(),
                    ],
                });
            }
        }

        let critical_regressions = regressions
            .iter()
            .filter(|r| matches!(r.severity, RegressionSeverity::Critical))
            .count();

        // Health score: zeroed by any critical regression, otherwise reduced
        // by 0.1 per regression (floored at 0.0 via the min-clamp).
        let health_score = if critical_regressions > 0 {
            0.0
        } else {
            1.0 - (regressions.len() as f64 * 0.1).min(1.0)
        };

        let recommendation = if critical_regressions > 0 {
            "Critical regressions detected. Immediate action required.".to_string()
        } else if !regressions.is_empty() {
            "Some regressions detected. Review and address if necessary.".to_string()
        } else {
            "No significant regressions detected.".to_string()
        };

        let overall_assessment = RegressionAssessment {
            health_score,
            critical_regressions,
            improvements: improvements.len(),
            recommendation,
        };

        let result = RegressionDetectionResult {
            timestamp: Utc::now(),
            regressions,
            improvements,
            overall_assessment,
        };

        self.regression_history.push(result.clone());
        Ok(result)
    }
805
    /// Generate comprehensive differential debugging report.
    ///
    /// Summarizes the current state: counts of stored snapshots and of each
    /// analysis type run so far, the 5 most recent comparisons, the 3 most
    /// recent regression detections, and a per-model summary.
    pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
        Ok(DifferentialDebuggingReport {
            timestamp: Utc::now(),
            config: self.config.clone(),
            total_models: self.model_snapshots.len(),
            comparison_count: self.comparison_history.len(),
            ab_test_count: self.ab_tests.len(),
            version_diff_count: self.version_diffs.len(),
            regression_detection_count: self.regression_history.len(),
            // Most recent first (histories are append-only, so reverse).
            recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
            recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
            model_summary: self.generate_model_summary(),
        })
    }
821
822    // Helper methods
823
824    fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
825        let mut accuracy_values = HashMap::new();
826        let mut loss_values = HashMap::new();
827        let mut latency_values = HashMap::new();
828        let mut memory_values = HashMap::new();
829        let mut size_values = HashMap::new();
830
831        for model in models {
832            accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
833            loss_values.insert(model.name.clone(), model.metrics.val_loss);
834            latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
835            memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
836            size_values.insert(model.name.clone(), model.metrics.model_size_mb);
837        }
838
839        Ok(PerformanceComparison {
840            accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
841            loss_comparison: self.create_metric_comparison(loss_values, false)?,
842            latency_comparison: self.create_metric_comparison(latency_values, false)?,
843            memory_comparison: self.create_metric_comparison(memory_values, false)?,
844            size_comparison: self.create_metric_comparison(size_values, false)?,
845            custom_comparisons: HashMap::new(),
846        })
847    }
848
849    fn create_metric_comparison(
850        &self,
851        values: HashMap<String, f64>,
852        higher_is_better: bool,
853    ) -> Result<MetricComparison> {
854        let best_model = if higher_is_better {
855            values.iter().max_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
856        } else {
857            values.iter().min_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
858        };
859
860        let worst_model = if higher_is_better {
861            values.iter().min_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
862        } else {
863            values.iter().max_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
864        };
865
866        let best_value = values[&best_model];
867        let mut differences = HashMap::new();
868        let mut significant_differences = HashMap::new();
869
870        for (model, value) in &values {
871            let diff = if higher_is_better {
872                (value - best_value) / best_value * 100.0
873            } else {
874                (best_value - value) / best_value * 100.0
875            };
876            differences.insert(model.clone(), diff);
877            significant_differences.insert(model.clone(), diff.abs() > 1.0); // 1% threshold
878        }
879
880        Ok(MetricComparison {
881            values,
882            best_model,
883            worst_model,
884            differences,
885            significant_differences,
886        })
887    }
888
889    fn analyze_architecture_differences(
890        &self,
891        models: &[&ModelSnapshot],
892    ) -> Result<ArchitectureDiff> {
893        if models.len() < 2 {
894            return Err(anyhow::anyhow!(
895                "Need at least 2 models for architecture diff"
896            ));
897        }
898
899        let base_model = models[0];
900        let mut parameter_diff = HashMap::new();
901        let mut layer_diff = HashMap::new();
902        let mut notable_differences = Vec::new();
903
904        for model in models.iter().skip(1) {
905            let param_diff = model.architecture.parameter_count as i64
906                - base_model.architecture.parameter_count as i64;
907            let layer_diff_val =
908                model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;
909
910            parameter_diff.insert(model.name.clone(), param_diff);
911            layer_diff.insert(model.name.clone(), layer_diff_val);
912
913            if param_diff.abs() > 1_000_000 {
914                notable_differences.push(format!(
915                    "Model '{}' has {} parameter difference",
916                    model.name, param_diff
917                ));
918            }
919
920            if layer_diff_val != 0 {
921                notable_differences.push(format!(
922                    "Model '{}' has {} layer difference",
923                    model.name, layer_diff_val
924                ));
925            }
926        }
927
928        // Calculate similarity score based on architecture features
929        let mut similarity_scores = Vec::new();
930        for model in models.iter().skip(1) {
931            let score = self
932                .calculate_architecture_similarity(&base_model.architecture, &model.architecture);
933            similarity_scores.push(score);
934        }
935        let similarity_score =
936            similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;
937
938        Ok(ArchitectureDiff {
939            parameter_diff,
940            layer_diff,
941            similarity_score,
942            notable_differences,
943        })
944    }
945
946    fn calculate_architecture_similarity(
947        &self,
948        arch1: &ArchitectureInfo,
949        arch2: &ArchitectureInfo,
950    ) -> f64 {
951        let mut similarity = 0.0;
952        let mut features = 0;
953
954        // Parameter count similarity
955        let param_ratio = (arch1.parameter_count.min(arch2.parameter_count) as f64)
956            / (arch1.parameter_count.max(arch2.parameter_count) as f64);
957        similarity += param_ratio;
958        features += 1;
959
960        // Layer count similarity
961        let layer_ratio = (arch1.layer_count.min(arch2.layer_count) as f64)
962            / (arch1.layer_count.max(arch2.layer_count) as f64);
963        similarity += layer_ratio;
964        features += 1;
965
966        // Hidden size similarity (if available)
967        let hidden_ratio = (arch1.hidden_size.min(arch2.hidden_size) as f64)
968            / (arch1.hidden_size.max(arch2.hidden_size) as f64);
969        similarity += hidden_ratio;
970        features += 1;
971
972        similarity / features as f64
973    }
974
975    fn perform_statistical_analysis(
976        &self,
977        _models: &[&ModelSnapshot],
978    ) -> Result<StatisticalAnalysis> {
979        // For now, return placeholder analysis
980        // In a real implementation, this would perform proper statistical tests
981        Ok(StatisticalAnalysis {
982            p_values: HashMap::new(),
983            effect_sizes: HashMap::new(),
984            confidence_intervals: HashMap::new(),
985            significance_summary: HashMap::new(),
986        })
987    }
988
989    fn generate_comparison_summary(
990        &self,
991        _models: &[&ModelSnapshot],
992        performance: &PerformanceComparison,
993        _statistical: &StatisticalAnalysis,
994    ) -> Result<ComparisonSummary> {
995        let best_model = performance.accuracy_comparison.best_model.clone();
996
997        let mut rankings = HashMap::new();
998        rankings.insert(
999            "accuracy".to_string(),
1000            vec![performance.accuracy_comparison.best_model.clone()],
1001        );
1002        rankings.insert(
1003            "latency".to_string(),
1004            vec![performance.latency_comparison.best_model.clone()],
1005        );
1006
1007        let key_findings = vec![
1008            format!(
1009                "Best accuracy: {} ({:.2}%)",
1010                performance.accuracy_comparison.best_model,
1011                performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
1012                    * 100.0
1013            ),
1014            format!(
1015                "Fastest inference: {} ({:.2}ms)",
1016                performance.latency_comparison.best_model,
1017                performance.latency_comparison.values[&performance.latency_comparison.best_model]
1018            ),
1019        ];
1020
1021        let recommendations = vec![
1022            "Consider the trade-offs between accuracy and latency".to_string(),
1023            "Monitor memory usage for production deployment".to_string(),
1024        ];
1025
1026        Ok(ComparisonSummary {
1027            best_model,
1028            rankings,
1029            key_findings,
1030            recommendations,
1031        })
1032    }
1033
1034    fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
1035        let mean = data.iter().sum::<f64>() / data.len() as f64;
1036        let variance = data.variance();
1037        let std_dev = variance.sqrt();
1038
1039        let mut sorted_data = data.to_vec();
1040        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1041
1042        let min = sorted_data[0];
1043        let max = sorted_data[sorted_data.len() - 1];
1044        let median = sorted_data[sorted_data.len() / 2];
1045        let q25 = sorted_data[sorted_data.len() / 4];
1046        let q75 = sorted_data[3 * sorted_data.len() / 4];
1047
1048        SummaryStats {
1049            mean,
1050            std_dev,
1051            min,
1052            max,
1053            median,
1054            q25,
1055            q75,
1056        }
1057    }
1058
1059    fn perform_ab_statistical_tests(
1060        &self,
1061        model_a: &ABTestMetrics,
1062        model_b: &ABTestMetrics,
1063    ) -> Result<HashMap<String, StatisticalTestResult>> {
1064        let mut results = HashMap::new();
1065
1066        // Simple t-test for primary metric
1067        if let (Some(a_data), Some(b_data)) = (
1068            model_a.metrics.get("primary_metric"),
1069            model_b.metrics.get("primary_metric"),
1070        ) {
1071            let a_mean = a_data.mean();
1072            let b_mean = b_data.mean();
1073            let a_var = a_data.variance();
1074            let b_var = b_data.variance();
1075
1076            // Simplified t-test calculation
1077            let pooled_std = ((a_var + b_var) / 2.0).sqrt();
1078            let standard_error =
1079                pooled_std * (1.0 / a_data.len() as f64 + 1.0 / b_data.len() as f64).sqrt();
1080            let t_statistic = (a_mean - b_mean) / standard_error;
1081
1082            // Simplified p-value (would use proper statistical functions in real implementation)
1083            let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };
1084
1085            let effect_size = (a_mean - b_mean) / pooled_std; // Cohen's d
1086            let margin_of_error = 1.96 * standard_error; // 95% CI
1087
1088            results.insert(
1089                "primary_metric".to_string(),
1090                StatisticalTestResult {
1091                    test_type: "Welch's t-test".to_string(),
1092                    statistic: t_statistic,
1093                    p_value,
1094                    effect_size,
1095                    confidence_interval: (
1096                        a_mean - b_mean - margin_of_error,
1097                        a_mean - b_mean + margin_of_error,
1098                    ),
1099                    is_significant: p_value < 0.05,
1100                },
1101            );
1102        }
1103
1104        Ok(results)
1105    }
1106
1107    fn generate_ab_conclusion(
1108        &self,
1109        config: &ABTestConfig,
1110        _model_a: &ABTestMetrics,
1111        _model_b: &ABTestMetrics,
1112        tests: &HashMap<String, StatisticalTestResult>,
1113    ) -> Result<ABTestConclusion> {
1114        let primary_test = tests.get("primary_metric");
1115
1116        let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
1117            let winner = if test.effect_size > 0.0 {
1118                Some(config.model_a.clone())
1119            } else {
1120                Some(config.model_b.clone())
1121            };
1122
1123            let confidence = if test.is_significant { 0.95 } else { 0.5 };
1124            let practical_significance = test.effect_size.abs() > config.min_effect_size;
1125
1126            (winner, confidence, practical_significance)
1127        } else {
1128            (None, 0.5, false)
1129        };
1130
1131        let recommendation = if practical_significance && confidence > 0.9 {
1132            format!("Recommend deploying {}", winner.as_ref().unwrap())
1133        } else {
1134            "Insufficient evidence for a clear recommendation".to_string()
1135        };
1136
1137        let summary = format!(
1138            "A/B test completed with {} confidence",
1139            if confidence > 0.9 { "high" } else { "low" }
1140        );
1141
1142        Ok(ABTestConclusion {
1143            winner,
1144            confidence,
1145            practical_significance,
1146            recommendation,
1147            summary,
1148        })
1149    }
1150
1151    fn calculate_performance_delta(
1152        &self,
1153        from: &ModelSnapshot,
1154        to: &ModelSnapshot,
1155    ) -> Result<PerformanceDelta> {
1156        Ok(PerformanceDelta {
1157            accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
1158            loss_delta: to.metrics.val_loss - from.metrics.val_loss,
1159            latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
1160            memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
1161            size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
1162            training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
1163            custom_deltas: HashMap::new(),
1164        })
1165    }
1166
1167    fn detect_architecture_changes(
1168        &self,
1169        from: &ModelSnapshot,
1170        to: &ModelSnapshot,
1171    ) -> Result<Vec<ArchitectureChange>> {
1172        let mut changes = Vec::new();
1173
1174        if from.architecture.parameter_count != to.architecture.parameter_count {
1175            changes.push(ArchitectureChange {
1176                change_type: "Parameter Count".to_string(),
1177                description: format!(
1178                    "Changed from {} to {} parameters",
1179                    from.architecture.parameter_count, to.architecture.parameter_count
1180                ),
1181                impact: "Affects model capacity and memory usage".to_string(),
1182            });
1183        }
1184
1185        if from.architecture.layer_count != to.architecture.layer_count {
1186            changes.push(ArchitectureChange {
1187                change_type: "Layer Count".to_string(),
1188                description: format!(
1189                    "Changed from {} to {} layers",
1190                    from.architecture.layer_count, to.architecture.layer_count
1191                ),
1192                impact: "Affects model depth and training dynamics".to_string(),
1193            });
1194        }
1195
1196        Ok(changes)
1197    }
1198
1199    fn detect_config_changes(
1200        &self,
1201        from: &ModelSnapshot,
1202        to: &ModelSnapshot,
1203    ) -> Result<Vec<ConfigChange>> {
1204        let mut changes = Vec::new();
1205
1206        if from.training_config.learning_rate != to.training_config.learning_rate {
1207            changes.push(ConfigChange {
1208                parameter: "learning_rate".to_string(),
1209                old_value: from.training_config.learning_rate.to_string(),
1210                new_value: to.training_config.learning_rate.to_string(),
1211                impact: "Affects training speed and convergence".to_string(),
1212            });
1213        }
1214
1215        if from.training_config.batch_size != to.training_config.batch_size {
1216            changes.push(ConfigChange {
1217                parameter: "batch_size".to_string(),
1218                old_value: from.training_config.batch_size.to_string(),
1219                new_value: to.training_config.batch_size.to_string(),
1220                impact: "Affects gradient noise and memory usage".to_string(),
1221            });
1222        }
1223
1224        Ok(changes)
1225    }
1226
1227    fn analyze_weight_changes(
1228        &self,
1229        from: &ModelSnapshot,
1230        to: &ModelSnapshot,
1231    ) -> Result<WeightChangesSummary> {
1232        // Simplified weight change analysis
1233        let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
1234        let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
1235        let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };
1236
1237        Ok(WeightChangesSummary {
1238            avg_magnitude,
1239            max_change,
1240            significant_change_ratio,
1241            layer_changes: HashMap::new(),
1242        })
1243    }
1244
1245    fn classify_regression_severity(
1246        &self,
1247        magnitude: f64,
1248        metric_type: &str,
1249    ) -> RegressionSeverity {
1250        match metric_type {
1251            "accuracy" => {
1252                if magnitude > 0.1 {
1253                    RegressionSeverity::Critical
1254                } else if magnitude > 0.05 {
1255                    RegressionSeverity::Major
1256                } else if magnitude > 0.02 {
1257                    RegressionSeverity::Minor
1258                } else {
1259                    RegressionSeverity::Negligible
1260                }
1261            },
1262            "latency" => {
1263                if magnitude > 50.0 {
1264                    RegressionSeverity::Critical
1265                } else if magnitude > 20.0 {
1266                    RegressionSeverity::Major
1267                } else if magnitude > 10.0 {
1268                    RegressionSeverity::Minor
1269                } else {
1270                    RegressionSeverity::Negligible
1271                }
1272            },
1273            _ => RegressionSeverity::Minor,
1274        }
1275    }
1276
1277    fn generate_model_summary(&self) -> HashMap<String, String> {
1278        let mut summary = HashMap::new();
1279
1280        if let Some((best_name, best_model)) = self
1281            .model_snapshots
1282            .iter()
1283            .max_by(|a, b| a.1.metrics.val_accuracy.partial_cmp(&b.1.metrics.val_accuracy).unwrap())
1284        {
1285            summary.insert("best_accuracy_model".to_string(), best_name.clone());
1286            summary.insert(
1287                "best_accuracy_value".to_string(),
1288                format!("{:.4}", best_model.metrics.val_accuracy),
1289            );
1290        }
1291
1292        if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
1293            a.1.metrics
1294                .inference_latency_ms
1295                .partial_cmp(&b.1.metrics.inference_latency_ms)
1296                .unwrap()
1297        }) {
1298            summary.insert("fastest_model".to_string(), fastest_name.clone());
1299            summary.insert(
1300                "fastest_latency".to_string(),
1301                format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
1302            );
1303        }
1304
1305        summary.insert(
1306            "total_models".to_string(),
1307            self.model_snapshots.len().to_string(),
1308        );
1309        summary
1310    }
1311}
1312
/// Comprehensive differential debugging report
///
/// Aggregated view of all differential-debugging activity: counters, recent
/// results and a per-model summary, suitable for serialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
    /// When this report was generated (UTC).
    pub timestamp: DateTime<Utc>,
    /// Configuration in effect when the report was produced.
    pub config: DifferentialDebuggingConfig,
    /// Number of model snapshots currently tracked.
    pub total_models: usize,
    /// Number of model-comparison results recorded.
    pub comparison_count: usize,
    /// Number of A/B test results recorded.
    pub ab_test_count: usize,
    /// Number of version-diff results recorded.
    pub version_diff_count: usize,
    /// Number of regression-detection results recorded.
    pub regression_detection_count: usize,
    /// Most recent model comparisons (selection policy defined by the generator).
    pub recent_comparisons: Vec<ModelComparisonResult>,
    /// Most recent regression findings (selection policy defined by the generator).
    pub recent_regressions: Vec<RegressionDetectionResult>,
    /// Human-readable key/value summary of the tracked models.
    pub model_summary: HashMap<String, String>,
}
1327
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed debugger starts with no snapshots.
    #[tokio::test]
    async fn test_differential_debugger_creation() {
        let config = DifferentialDebuggingConfig::default();
        let debugger = DifferentialDebugger::new(config);
        assert_eq!(debugger.model_snapshots.len(), 0);
    }

    /// Adding a snapshot registers it in the debugger's snapshot store.
    #[tokio::test]
    async fn test_model_snapshot_addition() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot = create_test_snapshot("test_model");
        debugger.add_model_snapshot(snapshot).unwrap();
        assert_eq!(debugger.model_snapshots.len(), 1);
    }

    /// Comparing two registered models succeeds end-to-end.
    #[tokio::test]
    async fn test_model_comparison() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        // Add two test models
        let snapshot1 = create_test_snapshot("model_a");
        let snapshot2 = create_test_snapshot("model_b");

        debugger.add_model_snapshot(snapshot1).unwrap();
        debugger.add_model_snapshot(snapshot2).unwrap();

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
    }

    /// Builds a fully populated snapshot with fixed, deterministic values so
    /// tests are reproducible; only the model name varies.
    fn create_test_snapshot(name: &str) -> ModelSnapshot {
        ModelSnapshot {
            id: Uuid::new_v4(),
            name: name.to_string(),
            timestamp: Utc::now(),
            version: "1.0.0".to_string(),
            commit_hash: Some("abc123".to_string()),
            metrics: ModelMetrics {
                train_accuracy: 0.95,
                val_accuracy: 0.90,
                test_accuracy: Some(0.88),
                train_loss: 0.05,
                val_loss: 0.10,
                test_loss: Some(0.12),
                inference_latency_ms: 50.0,
                memory_usage_mb: 2048.0,
                model_size_mb: 500.0,
                flops: 1_000_000_000,
                training_time_s: 3600.0,
                custom_metrics: HashMap::new(),
            },
            // GPT-2-large-like dimensions for a realistic transformer shape.
            architecture: ArchitectureInfo {
                parameter_count: 175_000_000,
                layer_count: 24,
                depth: 24,
                hidden_size: 1024,
                num_heads: Some(16),
                ff_dim: Some(4096),
                vocab_size: Some(50257),
                max_seq_length: Some(2048),
            },
            training_config: TrainingConfig {
                learning_rate: 1e-4,
                batch_size: 32,
                epochs: 10,
                optimizer: "AdamW".to_string(),
                lr_schedule: Some("cosine".to_string()),
                regularization: HashMap::new(),
            },
            weights_summary: WeightsSummary {
                mean: 0.0,
                std_dev: 0.1,
                min: -0.5,
                max: 0.5,
                percentiles: HashMap::new(),
                zero_count: 1000,
                sparsity: 0.01,
            },
            metadata: HashMap::new(),
        }
    }
}