1use anyhow::Result;
7use chrono::{DateTime, Utc};
8use indexmap::IndexMap;
9use serde::{Deserialize, Serialize};
11use statrs::statistics::Statistics;
12use std::collections::HashMap;
13use uuid::Uuid;
14
/// Configuration flags and thresholds for the differential debugging engine.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingConfig {
    /// Enable side-by-side comparison of model snapshots.
    pub enable_model_comparison: bool,
    /// Enable A/B test analysis between two models.
    pub enable_ab_analysis: bool,
    /// Enable tracking of diffs between model versions.
    pub enable_version_diff: bool,
    /// Enable regression detection against a baseline model.
    pub enable_regression_detection: bool,
    /// Enable performance-delta computation between snapshots.
    pub enable_performance_delta: bool,
    /// p-value threshold used to call a test result statistically significant.
    pub significance_threshold: f64,
    /// Maximum number of snapshots retained for comparison (oldest evicted first).
    pub max_comparison_models: usize,
    /// Sensitivity factor scaling the accuracy-regression threshold
    /// (used as `regression_sensitivity * 0.01` in `detect_regressions`).
    pub regression_sensitivity: f64,
    /// Relative change, in percent, above which a latency delta is flagged.
    pub performance_delta_threshold: f64,
}

impl Default for DifferentialDebuggingConfig {
    fn default() -> Self {
        Self {
            enable_model_comparison: true,
            enable_ab_analysis: true,
            enable_version_diff: true,
            enable_regression_detection: true,
            enable_performance_delta: true,
            significance_threshold: 0.05, // conventional 5% significance level
            max_comparison_models: 10,
            regression_sensitivity: 0.8,
            performance_delta_threshold: 5.0, // percent
        }
    }
}
53
/// Point-in-time capture of a model's identity, metrics, and configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
    /// Unique snapshot id.
    pub id: Uuid,
    /// Model name; used as the lookup key inside the debugger.
    pub name: String,
    /// When the snapshot was taken.
    pub timestamp: DateTime<Utc>,
    /// Human-assigned model version string.
    pub version: String,
    /// Source-control revision the model was built from, if known.
    pub commit_hash: Option<String>,
    /// Evaluation and runtime metrics.
    pub metrics: ModelMetrics,
    /// Structural description of the network.
    pub architecture: ArchitectureInfo,
    /// Hyperparameters used for training.
    pub training_config: TrainingConfig,
    /// Aggregate statistics over the model weights.
    pub weights_summary: WeightsSummary,
    /// Free-form extra key/value metadata.
    pub metadata: HashMap<String, String>,
}

/// Evaluation and runtime metrics recorded with a snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    pub train_accuracy: f64,
    pub val_accuracy: f64,
    /// Test-set accuracy, if a test split was evaluated.
    pub test_accuracy: Option<f64>,
    pub train_loss: f64,
    pub val_loss: f64,
    pub test_loss: Option<f64>,
    /// Inference latency in milliseconds.
    pub inference_latency_ms: f64,
    pub memory_usage_mb: f64,
    pub model_size_mb: f64,
    /// Estimated floating-point operations per inference.
    pub flops: u64,
    pub training_time_s: f64,
    /// Additional user-defined metric values, keyed by metric name.
    pub custom_metrics: HashMap<String, f64>,
}

/// Structural description of a model architecture.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
    pub parameter_count: u64,
    pub layer_count: u32,
    pub depth: u32,
    pub hidden_size: u32,
    /// Attention heads, for transformer-style models.
    pub num_heads: Option<u32>,
    /// Feed-forward dimension, for transformer-style models.
    pub ff_dim: Option<u32>,
    pub vocab_size: Option<u32>,
    pub max_seq_length: Option<u32>,
}

/// Training hyperparameters captured with a snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
    pub learning_rate: f64,
    pub batch_size: u32,
    pub epochs: u32,
    /// Optimizer name (stringly typed, e.g. "adam").
    pub optimizer: String,
    /// Learning-rate schedule name, if any.
    pub lr_schedule: Option<String>,
    /// Regularization terms keyed by name (e.g. weight decay).
    pub regularization: HashMap<String, f64>,
}

/// Aggregate statistics computed over all model weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    /// Percentile values keyed by label (presumably "p50"-style keys — TODO confirm).
    pub percentiles: HashMap<String, f64>,
    /// Number of exactly-zero weights.
    pub zero_count: u64,
    /// Fraction of weights that are zero.
    pub sparsity: f64,
}
164
/// Output of one multi-model comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
    /// Names of the compared models, in the order they were requested.
    pub models: Vec<String>,
    pub timestamp: DateTime<Utc>,
    pub performance_comparison: PerformanceComparison,
    pub architecture_diff: ArchitectureDiff,
    pub statistical_analysis: StatisticalAnalysis,
    pub summary: ComparisonSummary,
}

/// Per-metric comparisons across the compared models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    pub accuracy_comparison: MetricComparison,
    pub loss_comparison: MetricComparison,
    pub latency_comparison: MetricComparison,
    pub memory_comparison: MetricComparison,
    pub size_comparison: MetricComparison,
    /// Comparisons for user-defined metrics, keyed by metric name.
    pub custom_comparisons: HashMap<String, MetricComparison>,
}

/// One metric's values and ranking across models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
    /// Raw metric value per model name.
    pub values: HashMap<String, f64>,
    pub best_model: String,
    pub worst_model: String,
    /// Percentage gap to the best model, per model name.
    pub differences: HashMap<String, f64>,
    /// Whether each model's gap exceeds the significance heuristic.
    pub significant_differences: HashMap<String, bool>,
}

/// Structural differences relative to the first (base) model of a comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
    /// Signed parameter-count delta vs. the base model, per model name.
    pub parameter_diff: HashMap<String, i64>,
    /// Signed layer-count delta vs. the base model, per model name.
    pub layer_diff: HashMap<String, i32>,
    /// Mean structural similarity vs. the base model, in [0, 1].
    pub similarity_score: f64,
    /// Human-readable highlights of large differences.
    pub notable_differences: Vec<String>,
}

/// Statistical test results for a comparison (currently left unpopulated
/// by `perform_statistical_analysis`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    pub p_values: HashMap<String, f64>,
    pub effect_sizes: HashMap<String, f64>,
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    pub significance_summary: HashMap<String, bool>,
}

/// High-level takeaways from a comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
    /// Best model overall (currently: best validation accuracy).
    pub best_model: String,
    /// Per-criterion model rankings, keyed by criterion name.
    pub rankings: HashMap<String, Vec<String>>,
    pub key_findings: Vec<String>,
    pub recommendations: Vec<String>,
}
252
/// Parameters describing one A/B test between two models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    pub name: String,
    /// Name of the "A" arm model.
    pub model_a: String,
    /// Name of the "B" arm model.
    pub model_b: String,
    /// Planned duration; `None` means open-ended.
    pub duration_hours: Option<u32>,
    /// Planned number of observations per arm.
    pub sample_size: u32,
    /// Metric names intended to be tracked during the test.
    pub tracked_metrics: Vec<String>,
    /// Smallest effect size considered practically meaningful.
    pub min_effect_size: f64,
    /// Desired statistical power of the test (e.g. 0.8).
    pub power: f64,
}

/// Full record of an executed A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
    pub config: ABTestConfig,
    pub start_time: DateTime<Utc>,
    /// `None` while the test is still running.
    pub end_time: Option<DateTime<Utc>>,
    pub model_a_results: ABTestMetrics,
    pub model_b_results: ABTestMetrics,
    /// Hypothesis-test results keyed by metric name.
    pub statistical_tests: HashMap<String, StatisticalTestResult>,
    pub conclusion: ABTestConclusion,
}

/// Observations collected for one arm of an A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
    pub sample_size: u32,
    /// Raw observation series keyed by metric name.
    pub metrics: HashMap<String, Vec<f64>>,
    /// Descriptive statistics keyed by metric name.
    pub summary_stats: HashMap<String, SummaryStats>,
}

/// Descriptive statistics for one observation series.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    /// 25th percentile.
    pub q25: f64,
    /// 75th percentile.
    pub q75: f64,
}

/// Outcome of one statistical hypothesis test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    /// Name of the test performed (e.g. "Welch's t-test").
    pub test_type: String,
    /// The test statistic (e.g. a t value).
    pub statistic: f64,
    pub p_value: f64,
    pub effect_size: f64,
    /// Confidence interval (low, high) for the measured difference.
    pub confidence_interval: (f64, f64),
    pub is_significant: bool,
}

/// Verdict derived from an A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
    /// Winning model name, if one could be determined.
    pub winner: Option<String>,
    /// Confidence in the verdict, in [0, 1].
    pub confidence: f64,
    /// Whether the effect size clears the configured minimum.
    pub practical_significance: bool,
    pub recommendation: String,
    pub summary: String,
}
347
/// Recorded differences between two model versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
    pub from_version: String,
    pub to_version: String,
    /// When the diff was computed.
    pub timestamp: DateTime<Utc>,
    pub performance_delta: PerformanceDelta,
    pub architecture_changes: Vec<ArchitectureChange>,
    pub config_changes: Vec<ConfigChange>,
    pub weight_changes: WeightChangesSummary,
}

/// Signed metric deltas between two snapshots (new minus old).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
    pub accuracy_delta: f64,
    pub loss_delta: f64,
    pub latency_delta: f64,
    pub memory_delta: f64,
    pub size_delta: f64,
    pub training_time_delta: f64,
    /// Deltas for user-defined metrics, keyed by metric name.
    pub custom_deltas: HashMap<String, f64>,
}

/// One structural change between two model versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
    /// Category of the change (e.g. "Parameter Count").
    pub change_type: String,
    pub description: String,
    /// Expected consequence of the change.
    pub impact: String,
}

/// One training-configuration change between two model versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
    /// Name of the changed hyperparameter.
    pub parameter: String,
    pub old_value: String,
    pub new_value: String,
    pub impact: String,
}

/// Coarse summary of how the weights moved between versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
    pub avg_magnitude: f64,
    pub max_change: f64,
    /// Estimated fraction of weights that changed meaningfully.
    pub significant_change_ratio: f64,
    /// Per-layer change magnitude, keyed by layer name (currently never populated).
    pub layer_changes: HashMap<String, f64>,
}
422
/// Output of one regression-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
    pub timestamp: DateTime<Utc>,
    pub regressions: Vec<Regression>,
    pub improvements: Vec<Improvement>,
    pub overall_assessment: RegressionAssessment,
}

/// A metric that got worse relative to the baseline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
    pub metric: String,
    pub current_value: f64,
    pub previous_value: f64,
    /// Size of the drop; units depend on the metric (absolute for accuracy,
    /// milliseconds for latency).
    pub magnitude: f64,
    pub severity: RegressionSeverity,
    /// Generic candidate explanations for the regression.
    pub possible_causes: Vec<String>,
    /// Generic remediation suggestions.
    pub suggested_fixes: Vec<String>,
}

/// A metric that got better relative to the baseline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
    pub metric: String,
    pub current_value: f64,
    pub previous_value: f64,
    pub magnitude: f64,
    pub likely_causes: Vec<String>,
}

/// Severity buckets for a detected regression, worst first.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
    Critical,
    Major,
    Minor,
    Negligible,
}

/// Roll-up verdict of a regression-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
    /// Overall health in [0, 1]; forced to 0 when any critical regression exists.
    pub health_score: f64,
    pub critical_regressions: usize,
    /// Number of improved metrics.
    pub improvements: usize,
    pub recommendation: String,
}
491
/// Engine that stores model snapshots and runs differential analyses:
/// multi-model comparison, A/B testing, version diffs, and regression
/// detection. All results are appended to in-memory history vectors.
#[derive(Debug)]
pub struct DifferentialDebugger {
    config: DifferentialDebuggingConfig,
    /// Snapshots keyed by model name, kept in insertion order (oldest first)
    /// so capacity eviction can drop the oldest entry.
    model_snapshots: IndexMap<String, ModelSnapshot>,
    comparison_history: Vec<ModelComparisonResult>,
    ab_tests: Vec<ABTestResult>,
    version_diffs: Vec<VersionDiff>,
    regression_history: Vec<RegressionDetectionResult>,
}
502
503impl DifferentialDebugger {
504 pub fn new(config: DifferentialDebuggingConfig) -> Self {
506 Self {
507 config,
508 model_snapshots: IndexMap::new(),
509 comparison_history: Vec::new(),
510 ab_tests: Vec::new(),
511 version_diffs: Vec::new(),
512 regression_history: Vec::new(),
513 }
514 }
515
516 pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
518 if self.model_snapshots.len() >= self.config.max_comparison_models {
519 self.model_snapshots.shift_remove_index(0);
521 }
522
523 self.model_snapshots.insert(snapshot.name.clone(), snapshot);
524 Ok(())
525 }
526
    /// Compares two or more registered snapshots across performance,
    /// architecture, and (placeholder) statistics, recording the result in
    /// `comparison_history` and returning it.
    ///
    /// # Errors
    /// Fails if model comparison is disabled in the config, fewer than two
    /// names are given, or any name is not a registered snapshot.
    pub async fn compare_models(
        &mut self,
        model_names: Vec<String>,
    ) -> Result<ModelComparisonResult> {
        if !self.config.enable_model_comparison {
            return Err(anyhow::anyhow!("Model comparison is disabled"));
        }

        if model_names.len() < 2 {
            return Err(anyhow::anyhow!(
                "At least two models are required for comparison"
            ));
        }

        // Resolve every name up front; a single missing model aborts the run
        // before any analysis work happens.
        let models: Vec<&ModelSnapshot> = model_names
            .iter()
            .map(|name| {
                self.model_snapshots
                    .get(name)
                    .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
            })
            .collect::<Result<Vec<_>>>()?;

        let performance_comparison = self.compare_performance(&models)?;
        let architecture_diff = self.analyze_architecture_differences(&models)?;
        // NOTE(review): `perform_statistical_analysis` is currently a stub
        // returning empty maps; the result is embedded as-is.
        let statistical_analysis = self.perform_statistical_analysis(&models)?;
        let summary = self.generate_comparison_summary(
            &models,
            &performance_comparison,
            &statistical_analysis,
        )?;

        let result = ModelComparisonResult {
            models: model_names,
            timestamp: Utc::now(),
            performance_comparison,
            architecture_diff,
            statistical_analysis,
            summary,
        };

        self.comparison_history.push(result.clone());
        Ok(result)
    }
574
    /// Runs an A/B comparison over two pre-collected observation series and
    /// records the outcome in `ab_tests`.
    ///
    /// Both series are stored under the single key "primary_metric";
    /// `config.tracked_metrics` is not consulted here. The test is evaluated
    /// immediately, so `end_time` is set right away.
    ///
    /// # Errors
    /// Fails if A/B analysis is disabled in the configuration.
    pub async fn run_ab_test(
        &mut self,
        config: ABTestConfig,
        model_a_data: Vec<f64>,
        model_b_data: Vec<f64>,
    ) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }

        let start_time = Utc::now();

        // Summary stats must be computed before the data vectors are moved
        // into the metrics maps below.
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);

        let model_a_results = ABTestMetrics {
            sample_size: model_a_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_a_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_a_stats);
                stats
            },
        };

        let model_b_results = ABTestMetrics {
            sample_size: model_b_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_b_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_b_stats);
                stats
            },
        };

        let statistical_tests =
            self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;

        let conclusion = self.generate_ab_conclusion(
            &config,
            &model_a_results,
            &model_b_results,
            &statistical_tests,
        )?;

        let result = ABTestResult {
            config,
            start_time,
            end_time: Some(Utc::now()), // evaluated synchronously
            model_a_results,
            model_b_results,
            statistical_tests,
            conclusion,
        };

        self.ab_tests.push(result.clone());
        Ok(result)
    }
645
    /// Computes a diff (performance, architecture, config, weights) going
    /// from `from_model` to `to_model` and records it in `version_diffs`.
    ///
    /// # Errors
    /// Fails when version diffing is disabled or either model name is not a
    /// registered snapshot.
    pub async fn track_version_diff(
        &mut self,
        from_model: &str,
        to_model: &str,
    ) -> Result<VersionDiff> {
        if !self.config.enable_version_diff {
            return Err(anyhow::anyhow!("Version diff tracking is disabled"));
        }

        let from_snapshot = self
            .model_snapshots
            .get(from_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
        let to_snapshot = self
            .model_snapshots
            .get(to_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;

        // Each analysis reads only the two snapshots; order is irrelevant.
        let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
        let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
        let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
        let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;

        let diff = VersionDiff {
            from_version: from_snapshot.version.clone(),
            to_version: to_snapshot.version.clone(),
            timestamp: Utc::now(),
            performance_delta,
            architecture_changes,
            config_changes,
            weight_changes,
        };

        self.version_diffs.push(diff.clone());
        Ok(diff)
    }
683
    /// Compares `current_model` against `baseline_model`, classifying metric
    /// movements into regressions and improvements, and records the result
    /// in `regression_history`.
    ///
    /// Inspects validation accuracy (absolute drop, gated by
    /// `regression_sensitivity * 0.01`) and inference latency (relative
    /// increase in percent, gated by `performance_delta_threshold`).
    ///
    /// # Errors
    /// Fails when regression detection is disabled or either model name is
    /// not a registered snapshot.
    pub async fn detect_regressions(
        &mut self,
        current_model: &str,
        baseline_model: &str,
    ) -> Result<RegressionDetectionResult> {
        if !self.config.enable_regression_detection {
            return Err(anyhow::anyhow!("Regression detection is disabled"));
        }

        let current = self
            .model_snapshots
            .get(current_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
        let baseline = self
            .model_snapshots
            .get(baseline_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;

        let mut regressions = Vec::new();
        let mut improvements = Vec::new();

        // Accuracy: a drop beyond sensitivity * 1 percentage point counts as
        // a regression; any rise at all is recorded as an improvement.
        if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
            let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
            if magnitude > self.config.regression_sensitivity * 0.01 {
                regressions.push(Regression {
                    metric: "validation_accuracy".to_string(),
                    current_value: current.metrics.val_accuracy,
                    previous_value: baseline.metrics.val_accuracy,
                    magnitude,
                    severity: self.classify_regression_severity(magnitude, "accuracy"),
                    possible_causes: vec![
                        "Learning rate too high".to_string(),
                        "Insufficient training".to_string(),
                        "Data distribution shift".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Reduce learning rate".to_string(),
                        "Increase training epochs".to_string(),
                        "Check data quality".to_string(),
                    ],
                });
            }
        } else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
            let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
            improvements.push(Improvement {
                metric: "validation_accuracy".to_string(),
                current_value: current.metrics.val_accuracy,
                previous_value: baseline.metrics.val_accuracy,
                magnitude,
                likely_causes: vec![
                    "Better optimization".to_string(),
                    "Improved architecture".to_string(),
                    "Better hyperparameters".to_string(),
                ],
            });
        }

        // Latency: only relative slowdowns above the configured percentage
        // threshold are flagged. Severity is classified on the relative
        // change, while `magnitude` stores the absolute ms delta.
        if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
            let magnitude =
                current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
            let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
            if relative_change > self.config.performance_delta_threshold {
                regressions.push(Regression {
                    metric: "inference_latency".to_string(),
                    current_value: current.metrics.inference_latency_ms,
                    previous_value: baseline.metrics.inference_latency_ms,
                    magnitude,
                    severity: self.classify_regression_severity(relative_change, "latency"),
                    possible_causes: vec![
                        "Model complexity increased".to_string(),
                        "Inefficient implementation".to_string(),
                        "Hardware degradation".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Profile and optimize bottlenecks".to_string(),
                        "Consider model compression".to_string(),
                        "Check hardware configuration".to_string(),
                    ],
                });
            }
        }

        let critical_regressions = regressions
            .iter()
            .filter(|r| matches!(r.severity, RegressionSeverity::Critical))
            .count();

        // Health score: 0 on any critical regression, otherwise 1 minus 0.1
        // per regression (penalty capped at 1.0 so the score stays >= 0).
        let health_score = if critical_regressions > 0 {
            0.0
        } else {
            1.0 - (regressions.len() as f64 * 0.1).min(1.0)
        };

        let recommendation = if critical_regressions > 0 {
            "Critical regressions detected. Immediate action required.".to_string()
        } else if !regressions.is_empty() {
            "Some regressions detected. Review and address if necessary.".to_string()
        } else {
            "No significant regressions detected.".to_string()
        };

        let overall_assessment = RegressionAssessment {
            health_score,
            critical_regressions,
            improvements: improvements.len(),
            recommendation,
        };

        let result = RegressionDetectionResult {
            timestamp: Utc::now(),
            regressions,
            improvements,
            overall_assessment,
        };

        self.regression_history.push(result.clone());
        Ok(result)
    }
805
806 pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
808 Ok(DifferentialDebuggingReport {
809 timestamp: Utc::now(),
810 config: self.config.clone(),
811 total_models: self.model_snapshots.len(),
812 comparison_count: self.comparison_history.len(),
813 ab_test_count: self.ab_tests.len(),
814 version_diff_count: self.version_diffs.len(),
815 regression_detection_count: self.regression_history.len(),
816 recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
817 recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
818 model_summary: self.generate_model_summary(),
819 })
820 }
821
822 fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
825 let mut accuracy_values = HashMap::new();
826 let mut loss_values = HashMap::new();
827 let mut latency_values = HashMap::new();
828 let mut memory_values = HashMap::new();
829 let mut size_values = HashMap::new();
830
831 for model in models {
832 accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
833 loss_values.insert(model.name.clone(), model.metrics.val_loss);
834 latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
835 memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
836 size_values.insert(model.name.clone(), model.metrics.model_size_mb);
837 }
838
839 Ok(PerformanceComparison {
840 accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
841 loss_comparison: self.create_metric_comparison(loss_values, false)?,
842 latency_comparison: self.create_metric_comparison(latency_values, false)?,
843 memory_comparison: self.create_metric_comparison(memory_values, false)?,
844 size_comparison: self.create_metric_comparison(size_values, false)?,
845 custom_comparisons: HashMap::new(),
846 })
847 }
848
849 fn create_metric_comparison(
850 &self,
851 values: HashMap<String, f64>,
852 higher_is_better: bool,
853 ) -> Result<MetricComparison> {
854 let best_model = if higher_is_better {
855 values.iter().max_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
856 } else {
857 values.iter().min_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
858 };
859
860 let worst_model = if higher_is_better {
861 values.iter().min_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
862 } else {
863 values.iter().max_by(|a, b| a.1.partial_cmp(b.1).unwrap()).unwrap().0.clone()
864 };
865
866 let best_value = values[&best_model];
867 let mut differences = HashMap::new();
868 let mut significant_differences = HashMap::new();
869
870 for (model, value) in &values {
871 let diff = if higher_is_better {
872 (value - best_value) / best_value * 100.0
873 } else {
874 (best_value - value) / best_value * 100.0
875 };
876 differences.insert(model.clone(), diff);
877 significant_differences.insert(model.clone(), diff.abs() > 1.0); }
879
880 Ok(MetricComparison {
881 values,
882 best_model,
883 worst_model,
884 differences,
885 significant_differences,
886 })
887 }
888
    /// Diffs every model against the first one in `models` (the base model),
    /// collecting signed parameter/layer deltas, human-readable highlights,
    /// and a mean structural similarity score.
    ///
    /// # Errors
    /// Fails when fewer than two models are provided.
    fn analyze_architecture_differences(
        &self,
        models: &[&ModelSnapshot],
    ) -> Result<ArchitectureDiff> {
        if models.len() < 2 {
            return Err(anyhow::anyhow!(
                "Need at least 2 models for architecture diff"
            ));
        }

        let base_model = models[0];
        let mut parameter_diff = HashMap::new();
        let mut layer_diff = HashMap::new();
        let mut notable_differences = Vec::new();

        for model in models.iter().skip(1) {
            // Signed deltas relative to the base model (positive = larger).
            let param_diff = model.architecture.parameter_count as i64
                - base_model.architecture.parameter_count as i64;
            let layer_diff_val =
                model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;

            parameter_diff.insert(model.name.clone(), param_diff);
            layer_diff.insert(model.name.clone(), layer_diff_val);

            // Flag large structural gaps (>1M params, any layer change).
            if param_diff.abs() > 1_000_000 {
                notable_differences.push(format!(
                    "Model '{}' has {} parameter difference",
                    model.name, param_diff
                ));
            }

            if layer_diff_val != 0 {
                notable_differences.push(format!(
                    "Model '{}' has {} layer difference",
                    model.name, layer_diff_val
                ));
            }
        }

        // Average each non-base model's similarity against the base; the
        // len() >= 2 check above guarantees a non-empty score list.
        let mut similarity_scores = Vec::new();
        for model in models.iter().skip(1) {
            let score = self
                .calculate_architecture_similarity(&base_model.architecture, &model.architecture);
            similarity_scores.push(score);
        }
        let similarity_score =
            similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;

        Ok(ArchitectureDiff {
            parameter_diff,
            layer_diff,
            similarity_score,
            notable_differences,
        })
    }
945
946 fn calculate_architecture_similarity(
947 &self,
948 arch1: &ArchitectureInfo,
949 arch2: &ArchitectureInfo,
950 ) -> f64 {
951 let mut similarity = 0.0;
952 let mut features = 0;
953
954 let param_ratio = (arch1.parameter_count.min(arch2.parameter_count) as f64)
956 / (arch1.parameter_count.max(arch2.parameter_count) as f64);
957 similarity += param_ratio;
958 features += 1;
959
960 let layer_ratio = (arch1.layer_count.min(arch2.layer_count) as f64)
962 / (arch1.layer_count.max(arch2.layer_count) as f64);
963 similarity += layer_ratio;
964 features += 1;
965
966 let hidden_ratio = (arch1.hidden_size.min(arch2.hidden_size) as f64)
968 / (arch1.hidden_size.max(arch2.hidden_size) as f64);
969 similarity += hidden_ratio;
970 features += 1;
971
972 similarity / features as f64
973 }
974
    /// Placeholder for per-metric statistical testing across models.
    ///
    /// Currently returns empty maps; `compare_models` embeds the empty
    /// result as-is. TODO(review): implement real tests (the snapshots carry
    /// only point metrics, so per-model sample data would be needed first).
    fn perform_statistical_analysis(
        &self,
        _models: &[&ModelSnapshot],
    ) -> Result<StatisticalAnalysis> {
        Ok(StatisticalAnalysis {
            p_values: HashMap::new(),
            effect_sizes: HashMap::new(),
            confidence_intervals: HashMap::new(),
            significance_summary: HashMap::new(),
        })
    }
988
    /// Condenses a comparison into a best-model pick, per-criterion
    /// rankings, human-readable findings, and generic recommendations.
    ///
    /// "Best" is defined as best validation accuracy. The statistical input
    /// is currently unused because `perform_statistical_analysis` is a stub.
    fn generate_comparison_summary(
        &self,
        _models: &[&ModelSnapshot],
        performance: &PerformanceComparison,
        _statistical: &StatisticalAnalysis,
    ) -> Result<ComparisonSummary> {
        let best_model = performance.accuracy_comparison.best_model.clone();

        // Rankings currently list only the winner per criterion.
        let mut rankings = HashMap::new();
        rankings.insert(
            "accuracy".to_string(),
            vec![performance.accuracy_comparison.best_model.clone()],
        );
        rankings.insert(
            "latency".to_string(),
            vec![performance.latency_comparison.best_model.clone()],
        );

        // Indexing `values` by `best_model` is safe: best_model was selected
        // from the keys of the same map in `create_metric_comparison`.
        let key_findings = vec![
            format!(
                "Best accuracy: {} ({:.2}%)",
                performance.accuracy_comparison.best_model,
                performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
                    * 100.0
            ),
            format!(
                "Fastest inference: {} ({:.2}ms)",
                performance.latency_comparison.best_model,
                performance.latency_comparison.values[&performance.latency_comparison.best_model]
            ),
        ];

        let recommendations = vec![
            "Consider the trade-offs between accuracy and latency".to_string(),
            "Monitor memory usage for production deployment".to_string(),
        ];

        Ok(ComparisonSummary {
            best_model,
            rankings,
            key_findings,
            recommendations,
        })
    }
1033
1034 fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
1035 let mean = data.iter().sum::<f64>() / data.len() as f64;
1036 let variance = data.variance();
1037 let std_dev = variance.sqrt();
1038
1039 let mut sorted_data = data.to_vec();
1040 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1041
1042 let min = sorted_data[0];
1043 let max = sorted_data[sorted_data.len() - 1];
1044 let median = sorted_data[sorted_data.len() / 2];
1045 let q25 = sorted_data[sorted_data.len() / 4];
1046 let q75 = sorted_data[3 * sorted_data.len() / 4];
1047
1048 SummaryStats {
1049 mean,
1050 std_dev,
1051 min,
1052 max,
1053 median,
1054 q25,
1055 q75,
1056 }
1057 }
1058
1059 fn perform_ab_statistical_tests(
1060 &self,
1061 model_a: &ABTestMetrics,
1062 model_b: &ABTestMetrics,
1063 ) -> Result<HashMap<String, StatisticalTestResult>> {
1064 let mut results = HashMap::new();
1065
1066 if let (Some(a_data), Some(b_data)) = (
1068 model_a.metrics.get("primary_metric"),
1069 model_b.metrics.get("primary_metric"),
1070 ) {
1071 let a_mean = a_data.mean();
1072 let b_mean = b_data.mean();
1073 let a_var = a_data.variance();
1074 let b_var = b_data.variance();
1075
1076 let pooled_std = ((a_var + b_var) / 2.0).sqrt();
1078 let standard_error =
1079 pooled_std * (1.0 / a_data.len() as f64 + 1.0 / b_data.len() as f64).sqrt();
1080 let t_statistic = (a_mean - b_mean) / standard_error;
1081
1082 let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };
1084
1085 let effect_size = (a_mean - b_mean) / pooled_std; let margin_of_error = 1.96 * standard_error; results.insert(
1089 "primary_metric".to_string(),
1090 StatisticalTestResult {
1091 test_type: "Welch's t-test".to_string(),
1092 statistic: t_statistic,
1093 p_value,
1094 effect_size,
1095 confidence_interval: (
1096 a_mean - b_mean - margin_of_error,
1097 a_mean - b_mean + margin_of_error,
1098 ),
1099 is_significant: p_value < 0.05,
1100 },
1101 );
1102 }
1103
1104 Ok(results)
1105 }
1106
    /// Derives a verdict from the statistical test results of an A/B run.
    ///
    /// The winner is chosen purely by the sign of the effect size — positive
    /// favors `model_a`, since the effect is computed as mean_a - mean_b.
    /// NOTE(review): this assumes a higher primary metric is better; confirm
    /// before using loss-like metrics as the primary metric.
    fn generate_ab_conclusion(
        &self,
        config: &ABTestConfig,
        _model_a: &ABTestMetrics,
        _model_b: &ABTestMetrics,
        tests: &HashMap<String, StatisticalTestResult>,
    ) -> Result<ABTestConclusion> {
        let primary_test = tests.get("primary_metric");

        let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
            // A winner is named even when the test is not significant; the
            // confidence value (0.95 vs 0.5) carries the significance signal.
            let winner = if test.effect_size > 0.0 {
                Some(config.model_a.clone())
            } else {
                Some(config.model_b.clone())
            };

            let confidence = if test.is_significant { 0.95 } else { 0.5 };
            let practical_significance = test.effect_size.abs() > config.min_effect_size;

            (winner, confidence, practical_significance)
        } else {
            // No primary-metric test available: no winner, neutral confidence.
            (None, 0.5, false)
        };

        // `winner` is always Some when practical_significance is true, so
        // the unwrap below cannot panic.
        let recommendation = if practical_significance && confidence > 0.9 {
            format!("Recommend deploying {}", winner.as_ref().unwrap())
        } else {
            "Insufficient evidence for a clear recommendation".to_string()
        };

        let summary = format!(
            "A/B test completed with {} confidence",
            if confidence > 0.9 { "high" } else { "low" }
        );

        Ok(ABTestConclusion {
            winner,
            confidence,
            practical_significance,
            recommendation,
            summary,
        })
    }
1150
1151 fn calculate_performance_delta(
1152 &self,
1153 from: &ModelSnapshot,
1154 to: &ModelSnapshot,
1155 ) -> Result<PerformanceDelta> {
1156 Ok(PerformanceDelta {
1157 accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
1158 loss_delta: to.metrics.val_loss - from.metrics.val_loss,
1159 latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
1160 memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
1161 size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
1162 training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
1163 custom_deltas: HashMap::new(),
1164 })
1165 }
1166
1167 fn detect_architecture_changes(
1168 &self,
1169 from: &ModelSnapshot,
1170 to: &ModelSnapshot,
1171 ) -> Result<Vec<ArchitectureChange>> {
1172 let mut changes = Vec::new();
1173
1174 if from.architecture.parameter_count != to.architecture.parameter_count {
1175 changes.push(ArchitectureChange {
1176 change_type: "Parameter Count".to_string(),
1177 description: format!(
1178 "Changed from {} to {} parameters",
1179 from.architecture.parameter_count, to.architecture.parameter_count
1180 ),
1181 impact: "Affects model capacity and memory usage".to_string(),
1182 });
1183 }
1184
1185 if from.architecture.layer_count != to.architecture.layer_count {
1186 changes.push(ArchitectureChange {
1187 change_type: "Layer Count".to_string(),
1188 description: format!(
1189 "Changed from {} to {} layers",
1190 from.architecture.layer_count, to.architecture.layer_count
1191 ),
1192 impact: "Affects model depth and training dynamics".to_string(),
1193 });
1194 }
1195
1196 Ok(changes)
1197 }
1198
    /// Lists training-config differences between two snapshots (currently
    /// only learning rate and batch size are inspected).
    fn detect_config_changes(
        &self,
        from: &ModelSnapshot,
        to: &ModelSnapshot,
    ) -> Result<Vec<ConfigChange>> {
        let mut changes = Vec::new();

        // Exact f64 inequality is intentional here: any recorded change in
        // the configured learning rate counts as a config change.
        if from.training_config.learning_rate != to.training_config.learning_rate {
            changes.push(ConfigChange {
                parameter: "learning_rate".to_string(),
                old_value: from.training_config.learning_rate.to_string(),
                new_value: to.training_config.learning_rate.to_string(),
                impact: "Affects training speed and convergence".to_string(),
            });
        }

        if from.training_config.batch_size != to.training_config.batch_size {
            changes.push(ConfigChange {
                parameter: "batch_size".to_string(),
                old_value: from.training_config.batch_size.to_string(),
                new_value: to.training_config.batch_size.to_string(),
                impact: "Affects gradient noise and memory usage".to_string(),
            });
        }

        Ok(changes)
    }
1226
    /// Coarse summary of weight drift between two snapshots, derived solely
    /// from the stored summary statistics (no per-weight data is available).
    fn analyze_weight_changes(
        &self,
        from: &ModelSnapshot,
        to: &ModelSnapshot,
    ) -> Result<WeightChangesSummary> {
        // |Δmean| of the weight distributions — a proxy for average change
        // magnitude, not a true per-weight average.
        let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
        let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
        // Heuristic: a large mean shift is taken to imply that many weights
        // moved. The 0.8 / 0.2 split is arbitrary — TODO(review): calibrate.
        let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };

        Ok(WeightChangesSummary {
            avg_magnitude,
            max_change,
            significant_change_ratio,
            layer_changes: HashMap::new(), // per-layer data not tracked yet
        })
    }
1244
1245 fn classify_regression_severity(
1246 &self,
1247 magnitude: f64,
1248 metric_type: &str,
1249 ) -> RegressionSeverity {
1250 match metric_type {
1251 "accuracy" => {
1252 if magnitude > 0.1 {
1253 RegressionSeverity::Critical
1254 } else if magnitude > 0.05 {
1255 RegressionSeverity::Major
1256 } else if magnitude > 0.02 {
1257 RegressionSeverity::Minor
1258 } else {
1259 RegressionSeverity::Negligible
1260 }
1261 },
1262 "latency" => {
1263 if magnitude > 50.0 {
1264 RegressionSeverity::Critical
1265 } else if magnitude > 20.0 {
1266 RegressionSeverity::Major
1267 } else if magnitude > 10.0 {
1268 RegressionSeverity::Minor
1269 } else {
1270 RegressionSeverity::Negligible
1271 }
1272 },
1273 _ => RegressionSeverity::Minor,
1274 }
1275 }
1276
1277 fn generate_model_summary(&self) -> HashMap<String, String> {
1278 let mut summary = HashMap::new();
1279
1280 if let Some((best_name, best_model)) = self
1281 .model_snapshots
1282 .iter()
1283 .max_by(|a, b| a.1.metrics.val_accuracy.partial_cmp(&b.1.metrics.val_accuracy).unwrap())
1284 {
1285 summary.insert("best_accuracy_model".to_string(), best_name.clone());
1286 summary.insert(
1287 "best_accuracy_value".to_string(),
1288 format!("{:.4}", best_model.metrics.val_accuracy),
1289 );
1290 }
1291
1292 if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
1293 a.1.metrics
1294 .inference_latency_ms
1295 .partial_cmp(&b.1.metrics.inference_latency_ms)
1296 .unwrap()
1297 }) {
1298 summary.insert("fastest_model".to_string(), fastest_name.clone());
1299 summary.insert(
1300 "fastest_latency".to_string(),
1301 format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
1302 );
1303 }
1304
1305 summary.insert(
1306 "total_models".to_string(),
1307 self.model_snapshots.len().to_string(),
1308 );
1309 summary
1310 }
1311}
1312
/// Aggregated report of all differential-debugging activity: counts of each
/// analysis type performed plus the most recent comparison and regression
/// results and a per-model summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
    /// When the report was generated.
    pub timestamp: DateTime<Utc>,
    /// Configuration active at report time.
    pub config: DifferentialDebuggingConfig,
    /// Number of model snapshots currently tracked.
    pub total_models: usize,
    /// Total model-to-model comparisons performed.
    pub comparison_count: usize,
    /// Total A/B test analyses performed.
    pub ab_test_count: usize,
    /// Total version-diff analyses performed.
    pub version_diff_count: usize,
    /// Total regression-detection runs performed.
    pub regression_detection_count: usize,
    /// Most recent model comparison results.
    pub recent_comparisons: Vec<ModelComparisonResult>,
    /// Most recent regression-detection results.
    pub recent_regressions: Vec<RegressionDetectionResult>,
    /// Key/value summary of tracked models (e.g. best accuracy, fastest model).
    pub model_summary: HashMap<String, String>,
}
1327
#[cfg(test)]
mod tests {
    use super::*;

    // This test performs no async work (no `.await`), so a plain `#[test]`
    // avoids spinning up a tokio runtime for nothing.
    #[test]
    fn test_differential_debugger_creation() {
        let config = DifferentialDebuggingConfig::default();
        let debugger = DifferentialDebugger::new(config);
        assert_eq!(debugger.model_snapshots.len(), 0);
    }

    // Also fully synchronous; see note above about `#[test]` vs `#[tokio::test]`.
    #[test]
    fn test_model_snapshot_addition() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot = create_test_snapshot("test_model");
        debugger.add_model_snapshot(snapshot).unwrap();
        assert_eq!(debugger.model_snapshots.len(), 1);
    }

    // `compare_models` is async, so this test genuinely needs a runtime.
    #[tokio::test]
    async fn test_model_comparison() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot1 = create_test_snapshot("model_a");
        let snapshot2 = create_test_snapshot("model_b");

        debugger.add_model_snapshot(snapshot1).unwrap();
        debugger.add_model_snapshot(snapshot2).unwrap();

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
    }

    /// Build a fully populated snapshot with fixed, representative values
    /// (GPT-2-like architecture numbers) for use across tests.
    fn create_test_snapshot(name: &str) -> ModelSnapshot {
        ModelSnapshot {
            id: Uuid::new_v4(),
            name: name.to_string(),
            timestamp: Utc::now(),
            version: "1.0.0".to_string(),
            commit_hash: Some("abc123".to_string()),
            metrics: ModelMetrics {
                train_accuracy: 0.95,
                val_accuracy: 0.90,
                test_accuracy: Some(0.88),
                train_loss: 0.05,
                val_loss: 0.10,
                test_loss: Some(0.12),
                inference_latency_ms: 50.0,
                memory_usage_mb: 2048.0,
                model_size_mb: 500.0,
                flops: 1_000_000_000,
                training_time_s: 3600.0,
                custom_metrics: HashMap::new(),
            },
            architecture: ArchitectureInfo {
                parameter_count: 175_000_000,
                layer_count: 24,
                depth: 24,
                hidden_size: 1024,
                num_heads: Some(16),
                ff_dim: Some(4096),
                vocab_size: Some(50257),
                max_seq_length: Some(2048),
            },
            training_config: TrainingConfig {
                learning_rate: 1e-4,
                batch_size: 32,
                epochs: 10,
                optimizer: "AdamW".to_string(),
                lr_schedule: Some("cosine".to_string()),
                regularization: HashMap::new(),
            },
            weights_summary: WeightsSummary {
                mean: 0.0,
                std_dev: 0.1,
                min: -0.5,
                max: 0.5,
                percentiles: HashMap::new(),
                zero_count: 1000,
                sparsity: 0.01,
            },
            metadata: HashMap::new(),
        }
    }
}