1use anyhow::Result;
7use chrono::{DateTime, Utc};
8use indexmap::IndexMap;
9use serde::{Deserialize, Serialize};
11use statrs::statistics::Statistics;
12use std::collections::HashMap;
13use uuid::Uuid;
14
/// Feature toggles and thresholds for differential debugging.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingConfig {
    /// Enable side-by-side comparison of registered model snapshots.
    pub enable_model_comparison: bool,
    /// Enable A/B test analysis between two models.
    pub enable_ab_analysis: bool,
    /// Enable version-to-version diff tracking.
    pub enable_version_diff: bool,
    /// Enable detection of metric regressions against a baseline.
    pub enable_regression_detection: bool,
    /// Enable performance-delta computation between snapshots.
    pub enable_performance_delta: bool,
    /// Statistical significance threshold on the p-value scale (e.g. 0.05).
    pub significance_threshold: f64,
    /// Maximum number of snapshots retained for comparison.
    pub max_comparison_models: usize,
    /// Scales the accuracy-drop floor for regressions
    /// (flagged when drop > sensitivity * 0.01).
    pub regression_sensitivity: f64,
    /// Relative change (percent) above which a latency delta is flagged.
    pub performance_delta_threshold: f64,
}
37
impl Default for DifferentialDebuggingConfig {
    /// All analyses enabled; p < 0.05 significance, at most 10 retained
    /// models, 0.8 regression sensitivity, 5% performance-delta threshold.
    fn default() -> Self {
        Self {
            enable_model_comparison: true,
            enable_ab_analysis: true,
            enable_version_diff: true,
            enable_regression_detection: true,
            enable_performance_delta: true,
            significance_threshold: 0.05,
            max_comparison_models: 10,
            regression_sensitivity: 0.8,
            performance_delta_threshold: 5.0,
        }
    }
}
53
/// Point-in-time record of a model, used as the unit of comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
    /// Unique snapshot identifier.
    pub id: Uuid,
    /// Model name; also the registry key in `DifferentialDebugger`.
    pub name: String,
    /// When the snapshot was captured.
    pub timestamp: DateTime<Utc>,
    /// Model version string.
    pub version: String,
    /// Source-control commit hash, when known.
    pub commit_hash: Option<String>,
    /// Evaluation and runtime metrics.
    pub metrics: ModelMetrics,
    /// Structural description of the network.
    pub architecture: ArchitectureInfo,
    /// Hyperparameters used to train this model.
    pub training_config: TrainingConfig,
    /// Aggregate statistics over the model weights.
    pub weights_summary: WeightsSummary,
    /// Free-form key/value annotations.
    pub metadata: HashMap<String, String>,
}
78
/// Evaluation and runtime metrics for a single model snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Accuracy on the training split.
    pub train_accuracy: f64,
    /// Accuracy on the validation split (primary comparison metric).
    pub val_accuracy: f64,
    /// Accuracy on the test split, when evaluated.
    pub test_accuracy: Option<f64>,
    /// Loss on the training split.
    pub train_loss: f64,
    /// Loss on the validation split.
    pub val_loss: f64,
    /// Loss on the test split, when evaluated.
    pub test_loss: Option<f64>,
    /// Inference latency in milliseconds.
    pub inference_latency_ms: f64,
    /// Runtime memory footprint in megabytes.
    pub memory_usage_mb: f64,
    /// Serialized model size in megabytes.
    pub model_size_mb: f64,
    /// Floating-point operations per inference (presumably — confirm unit).
    pub flops: u64,
    /// Total training wall time in seconds.
    pub training_time_s: f64,
    /// Additional user-defined metrics by name.
    pub custom_metrics: HashMap<String, f64>,
}
107
/// Structural description of a model architecture.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
    /// Total number of trainable parameters.
    pub parameter_count: u64,
    /// Number of layers.
    pub layer_count: u32,
    /// Network depth.
    pub depth: u32,
    /// Hidden dimension size.
    pub hidden_size: u32,
    /// Attention head count (transformer-style models only).
    pub num_heads: Option<u32>,
    /// Feed-forward dimension (transformer-style models only).
    pub ff_dim: Option<u32>,
    /// Vocabulary size (language models only).
    pub vocab_size: Option<u32>,
    /// Maximum sequence length (sequence models only).
    pub max_seq_length: Option<u32>,
}
128
/// Hyperparameters used for a training run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
    /// Initial learning rate.
    pub learning_rate: f64,
    /// Mini-batch size.
    pub batch_size: u32,
    /// Number of training epochs.
    pub epochs: u32,
    /// Optimizer name (e.g. "adam") — free-form string.
    pub optimizer: String,
    /// Learning-rate schedule name, when one was used.
    pub lr_schedule: Option<String>,
    /// Regularization strengths by name (e.g. weight decay, dropout).
    pub regularization: HashMap<String, f64>,
}
145
/// Aggregate statistics over a model's weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
    /// Mean weight value.
    pub mean: f64,
    /// Standard deviation of weight values.
    pub std_dev: f64,
    /// Smallest weight value.
    pub min: f64,
    /// Largest weight value.
    pub max: f64,
    /// Percentile values keyed by label (key schema set by the producer —
    /// confirm against the snapshot writer).
    pub percentiles: HashMap<String, f64>,
    /// Number of exactly-zero weights.
    pub zero_count: u64,
    /// Fraction of zero weights (presumably zero_count / total — confirm).
    pub sparsity: f64,
}
164
/// Output of a multi-model comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
    /// Names of the models compared, in request order.
    pub models: Vec<String>,
    /// When the comparison was performed.
    pub timestamp: DateTime<Utc>,
    /// Per-metric cross-model comparisons.
    pub performance_comparison: PerformanceComparison,
    /// Structural differences relative to the first model.
    pub architecture_diff: ArchitectureDiff,
    /// Statistical test results (currently a placeholder, empty maps).
    pub statistical_analysis: StatisticalAnalysis,
    /// Human-readable findings and recommendations.
    pub summary: ComparisonSummary,
}
181
/// Per-metric comparisons across a set of models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    /// Validation accuracy (higher is better).
    pub accuracy_comparison: MetricComparison,
    /// Validation loss (lower is better).
    pub loss_comparison: MetricComparison,
    /// Inference latency in ms (lower is better).
    pub latency_comparison: MetricComparison,
    /// Memory usage in MB (lower is better).
    pub memory_comparison: MetricComparison,
    /// Model size in MB (lower is better).
    pub size_comparison: MetricComparison,
    /// User-defined metric comparisons (currently never populated).
    pub custom_comparisons: HashMap<String, MetricComparison>,
}
198
/// Cross-model comparison for a single metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
    /// Raw metric value per model name.
    pub values: HashMap<String, f64>,
    /// Model with the best value for this metric's direction.
    pub best_model: String,
    /// Model with the worst value for this metric's direction.
    pub worst_model: String,
    /// Relative gap from the best value per model, in percent.
    pub differences: HashMap<String, f64>,
    /// Whether each model's gap exceeds the significance cutoff.
    pub significant_differences: HashMap<String, bool>,
}
213
/// Structural differences between models, relative to a base model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
    /// Signed parameter-count difference per model (model minus base).
    pub parameter_diff: HashMap<String, i64>,
    /// Signed layer-count difference per model (model minus base).
    pub layer_diff: HashMap<String, i32>,
    /// Mean structural similarity in [0.0, 1.0] against the base model.
    pub similarity_score: f64,
    /// Human-readable descriptions of large differences.
    pub notable_differences: Vec<String>,
}
226
/// Statistical test results keyed by metric name.
/// Currently produced empty by `perform_statistical_analysis` (placeholder).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    /// p-value per metric.
    pub p_values: HashMap<String, f64>,
    /// Effect size per metric.
    pub effect_sizes: HashMap<String, f64>,
    /// (lower, upper) confidence interval per metric.
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    /// Whether each metric's difference is significant.
    pub significance_summary: HashMap<String, bool>,
}
239
/// Human-readable summary of a model comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
    /// Overall best model (selected by validation accuracy).
    pub best_model: String,
    /// Ordered model lists per metric name (currently just the winner).
    pub rankings: HashMap<String, Vec<String>>,
    /// Notable findings rendered as sentences.
    pub key_findings: Vec<String>,
    /// Suggested follow-up actions.
    pub recommendations: Vec<String>,
}
252
/// Configuration of a single A/B test between two models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    /// Test name.
    pub name: String,
    /// Name of the "A" arm model.
    pub model_a: String,
    /// Name of the "B" arm model.
    pub model_b: String,
    /// Planned duration in hours (not used in the visible code paths).
    pub duration_hours: Option<u32>,
    /// Planned sample size (actual sizes come from the supplied data).
    pub sample_size: u32,
    /// Names of metrics to track (not used in the visible code paths).
    pub tracked_metrics: Vec<String>,
    /// Minimum |effect size| considered practically significant.
    pub min_effect_size: f64,
    /// Target statistical power (not used in the visible code paths).
    pub power: f64,
}
273
/// Full outcome of one A/B test run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
    /// The configuration the test ran with.
    pub config: ABTestConfig,
    /// When the test started.
    pub start_time: DateTime<Utc>,
    /// When the test finished, if it has.
    pub end_time: Option<DateTime<Utc>>,
    /// Collected metrics for the "A" arm.
    pub model_a_results: ABTestMetrics,
    /// Collected metrics for the "B" arm.
    pub model_b_results: ABTestMetrics,
    /// Statistical test results keyed by metric name.
    pub statistical_tests: HashMap<String, StatisticalTestResult>,
    /// Winner, confidence, and recommendation.
    pub conclusion: ABTestConclusion,
}
292
/// Raw samples and summary statistics for one A/B test arm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
    /// Number of observations in this arm.
    pub sample_size: u32,
    /// Raw observations per metric name ("primary_metric" in current use).
    pub metrics: HashMap<String, Vec<f64>>,
    /// Descriptive statistics per metric name.
    pub summary_stats: HashMap<String, SummaryStats>,
}
303
/// Descriptive statistics for a sample of f64 observations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    /// 25th percentile (nearest-rank approximation).
    pub q25: f64,
    /// 75th percentile (nearest-rank approximation).
    pub q75: f64,
}
315
/// Result of a single two-sample statistical test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    /// Name of the test performed (e.g. "Welch's t-test").
    pub test_type: String,
    /// Test statistic (t-value).
    pub statistic: f64,
    /// p-value (currently a coarse placeholder in the producer).
    pub p_value: f64,
    /// Standardized effect size (mean difference / pooled std).
    pub effect_size: f64,
    /// (lower, upper) bounds around the mean difference.
    pub confidence_interval: (f64, f64),
    /// True when p_value < 0.05.
    pub is_significant: bool,
}
332
/// Decision derived from an A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
    /// Winning model name, when one could be determined.
    pub winner: Option<String>,
    /// Confidence in the decision (0.95 when significant, else 0.5).
    pub confidence: f64,
    /// Whether |effect size| exceeded the configured minimum.
    pub practical_significance: bool,
    /// Deployment recommendation.
    pub recommendation: String,
    /// One-line summary of the test outcome.
    pub summary: String,
}
347
/// Differences between two model versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
    /// Version string of the older snapshot.
    pub from_version: String,
    /// Version string of the newer snapshot.
    pub to_version: String,
    /// When the diff was computed.
    pub timestamp: DateTime<Utc>,
    /// Metric deltas (to minus from).
    pub performance_delta: PerformanceDelta,
    /// Detected structural changes.
    pub architecture_changes: Vec<ArchitectureChange>,
    /// Detected training-hyperparameter changes.
    pub config_changes: Vec<ConfigChange>,
    /// Coarse weight-drift summary.
    pub weight_changes: WeightChangesSummary,
}
366
/// Signed metric deltas between two snapshots (to minus from).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
    /// Validation-accuracy change (positive = improvement).
    pub accuracy_delta: f64,
    /// Validation-loss change (positive = worse).
    pub loss_delta: f64,
    /// Latency change in ms (positive = slower).
    pub latency_delta: f64,
    /// Memory-usage change in MB.
    pub memory_delta: f64,
    /// Model-size change in MB.
    pub size_delta: f64,
    /// Training-time change in seconds.
    pub training_time_delta: f64,
    /// User-defined metric deltas (currently never populated).
    pub custom_deltas: HashMap<String, f64>,
}
385
/// One detected structural change between two versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
    /// Category of the change (e.g. "Parameter Count").
    pub change_type: String,
    /// Human-readable description of the change.
    pub description: String,
    /// Expected consequence of the change.
    pub impact: String,
}
396
/// One detected training-configuration change between two versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
    /// Hyperparameter name (e.g. "learning_rate").
    pub parameter: String,
    /// Previous value, stringified.
    pub old_value: String,
    /// New value, stringified.
    pub new_value: String,
    /// Expected consequence of the change.
    pub impact: String,
}
409
/// Coarse summary of weight drift between two snapshots, derived from the
/// stored `WeightsSummary` values rather than raw weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
    /// Absolute shift in the mean weight value.
    pub avg_magnitude: f64,
    /// Absolute shift in the maximum weight value.
    pub max_change: f64,
    /// Heuristic fraction of weights considered significantly changed.
    pub significant_change_ratio: f64,
    /// Per-layer change magnitudes (currently never populated).
    pub layer_changes: HashMap<String, f64>,
}
422
/// Outcome of one regression-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
    /// When the detection ran.
    pub timestamp: DateTime<Utc>,
    /// Metrics that got worse beyond their thresholds.
    pub regressions: Vec<Regression>,
    /// Metrics that improved.
    pub improvements: Vec<Improvement>,
    /// Aggregate health score and recommendation.
    pub overall_assessment: RegressionAssessment,
}
435
/// A single detected metric regression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
    /// Metric name (e.g. "validation_accuracy").
    pub metric: String,
    /// Value in the current model.
    pub current_value: f64,
    /// Value in the baseline model.
    pub previous_value: f64,
    /// Size of the degradation (absolute for accuracy; ms for latency).
    pub magnitude: f64,
    /// Classified severity bucket.
    pub severity: RegressionSeverity,
    /// Hypothesized causes, for triage.
    pub possible_causes: Vec<String>,
    /// Suggested remediation steps.
    pub suggested_fixes: Vec<String>,
}
454
/// A single detected metric improvement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
    /// Metric name.
    pub metric: String,
    /// Value in the current model.
    pub current_value: f64,
    /// Value in the baseline model.
    pub previous_value: f64,
    /// Size of the improvement.
    pub magnitude: f64,
    /// Hypothesized causes of the gain.
    pub likely_causes: Vec<String>,
}
469
/// Severity buckets for a regression, ordered worst to best.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
    Critical,
    Major,
    Minor,
    Negligible,
}
478
/// Aggregate verdict over all detected regressions and improvements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
    /// 0.0 (critical regression present) to 1.0 (clean); each non-critical
    /// regression subtracts 0.1.
    pub health_score: f64,
    /// Number of critical-severity regressions.
    pub critical_regressions: usize,
    /// Number of detected improvements.
    pub improvements: usize,
    /// Overall recommendation text.
    pub recommendation: String,
}
491
/// Stateful engine that registers model snapshots and runs comparisons,
/// A/B tests, version diffs, and regression detection, keeping a history
/// of every result.
#[derive(Debug)]
pub struct DifferentialDebugger {
    // Active configuration (toggles and thresholds).
    config: DifferentialDebuggingConfig,
    // Snapshots keyed by model name; insertion order is used for eviction.
    model_snapshots: IndexMap<String, ModelSnapshot>,
    // All comparison results, oldest first.
    comparison_history: Vec<ModelComparisonResult>,
    // All completed A/B tests, oldest first.
    ab_tests: Vec<ABTestResult>,
    // All computed version diffs, oldest first.
    version_diffs: Vec<VersionDiff>,
    // All regression-detection runs, oldest first.
    regression_history: Vec<RegressionDetectionResult>,
}
502
503impl DifferentialDebugger {
504 pub fn new(config: DifferentialDebuggingConfig) -> Self {
506 Self {
507 config,
508 model_snapshots: IndexMap::new(),
509 comparison_history: Vec::new(),
510 ab_tests: Vec::new(),
511 version_diffs: Vec::new(),
512 regression_history: Vec::new(),
513 }
514 }
515
516 pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
518 if self.model_snapshots.len() >= self.config.max_comparison_models {
519 self.model_snapshots.shift_remove_index(0);
521 }
522
523 self.model_snapshots.insert(snapshot.name.clone(), snapshot);
524 Ok(())
525 }
526
    /// Compares the named snapshots' performance, architecture, and
    /// statistics, and records the result in `comparison_history`.
    ///
    /// # Errors
    /// Fails when model comparison is disabled, fewer than two names are
    /// given, or any name is not a registered snapshot.
    pub async fn compare_models(
        &mut self,
        model_names: Vec<String>,
    ) -> Result<ModelComparisonResult> {
        if !self.config.enable_model_comparison {
            return Err(anyhow::anyhow!("Model comparison is disabled"));
        }

        if model_names.len() < 2 {
            return Err(anyhow::anyhow!(
                "At least two models are required for comparison"
            ));
        }

        // Resolve every name up front; one missing model aborts the run.
        let models: Vec<&ModelSnapshot> = model_names
            .iter()
            .map(|name| {
                self.model_snapshots
                    .get(name)
                    .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
            })
            .collect::<Result<Vec<_>>>()?;

        let performance_comparison = self.compare_performance(&models)?;
        let architecture_diff = self.analyze_architecture_differences(&models)?;
        let statistical_analysis = self.perform_statistical_analysis(&models)?;
        let summary = self.generate_comparison_summary(
            &models,
            &performance_comparison,
            &statistical_analysis,
        )?;

        let result = ModelComparisonResult {
            models: model_names,
            timestamp: Utc::now(),
            performance_comparison,
            architecture_diff,
            statistical_analysis,
            summary,
        };

        self.comparison_history.push(result.clone());
        Ok(result)
    }
574
    /// Runs an A/B test over two pre-collected samples of a single metric
    /// and records the outcome in `ab_tests`.
    ///
    /// Both samples are stored under the metric key "primary_metric"; the
    /// keys in `config.tracked_metrics` are not consulted here.
    ///
    /// # Errors
    /// Fails when A/B analysis is disabled in the configuration.
    pub async fn run_ab_test(
        &mut self,
        config: ABTestConfig,
        model_a_data: Vec<f64>,
        model_b_data: Vec<f64>,
    ) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }

        let start_time = Utc::now();

        // Descriptive statistics per arm, computed before moving the data.
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);

        let model_a_results = ABTestMetrics {
            sample_size: model_a_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_a_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_a_stats);
                stats
            },
        };

        let model_b_results = ABTestMetrics {
            sample_size: model_b_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_b_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_b_stats);
                stats
            },
        };

        let statistical_tests =
            self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;

        let conclusion = self.generate_ab_conclusion(
            &config,
            &model_a_results,
            &model_b_results,
            &statistical_tests,
        )?;

        // end_time is set immediately: the analysis is synchronous, not a
        // long-running experiment.
        let result = ABTestResult {
            config,
            start_time,
            end_time: Some(Utc::now()),
            model_a_results,
            model_b_results,
            statistical_tests,
            conclusion,
        };

        self.ab_tests.push(result.clone());
        Ok(result)
    }
645
    /// Computes the full diff (performance, architecture, config, weights)
    /// between two registered snapshots and records it in `version_diffs`.
    ///
    /// # Errors
    /// Fails when version-diff tracking is disabled or either model name
    /// is not registered.
    pub async fn track_version_diff(
        &mut self,
        from_model: &str,
        to_model: &str,
    ) -> Result<VersionDiff> {
        if !self.config.enable_version_diff {
            return Err(anyhow::anyhow!("Version diff tracking is disabled"));
        }

        let from_snapshot = self
            .model_snapshots
            .get(from_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
        let to_snapshot = self
            .model_snapshots
            .get(to_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;

        let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
        let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
        let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
        let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;

        let diff = VersionDiff {
            from_version: from_snapshot.version.clone(),
            to_version: to_snapshot.version.clone(),
            timestamp: Utc::now(),
            performance_delta,
            architecture_changes,
            config_changes,
            weight_changes,
        };

        self.version_diffs.push(diff.clone());
        Ok(diff)
    }
683
    /// Compares a current model against a baseline and classifies metric
    /// movements as regressions or improvements, recording the result in
    /// `regression_history`.
    ///
    /// Currently inspects two metrics: validation accuracy (absolute drop
    /// gated by `regression_sensitivity * 0.01`) and inference latency
    /// (relative increase gated by `performance_delta_threshold` percent).
    ///
    /// # Errors
    /// Fails when regression detection is disabled or either model name is
    /// not registered.
    pub async fn detect_regressions(
        &mut self,
        current_model: &str,
        baseline_model: &str,
    ) -> Result<RegressionDetectionResult> {
        if !self.config.enable_regression_detection {
            return Err(anyhow::anyhow!("Regression detection is disabled"));
        }

        let current = self
            .model_snapshots
            .get(current_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
        let baseline = self
            .model_snapshots
            .get(baseline_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;

        let mut regressions = Vec::new();
        let mut improvements = Vec::new();

        // Validation accuracy: a drop beyond the sensitivity-scaled floor
        // (e.g. sensitivity 0.8 -> 0.008 absolute accuracy) is a regression.
        if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
            let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
            if magnitude > self.config.regression_sensitivity * 0.01 {
                regressions.push(Regression {
                    metric: "validation_accuracy".to_string(),
                    current_value: current.metrics.val_accuracy,
                    previous_value: baseline.metrics.val_accuracy,
                    magnitude,
                    severity: self.classify_regression_severity(magnitude, "accuracy"),
                    possible_causes: vec![
                        "Learning rate too high".to_string(),
                        "Insufficient training".to_string(),
                        "Data distribution shift".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Reduce learning rate".to_string(),
                        "Increase training epochs".to_string(),
                        "Check data quality".to_string(),
                    ],
                });
            }
        } else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
            // Any accuracy gain counts as an improvement (no threshold).
            let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
            improvements.push(Improvement {
                metric: "validation_accuracy".to_string(),
                current_value: current.metrics.val_accuracy,
                previous_value: baseline.metrics.val_accuracy,
                magnitude,
                likely_causes: vec![
                    "Better optimization".to_string(),
                    "Improved architecture".to_string(),
                    "Better hyperparameters".to_string(),
                ],
            });
        }

        // Inference latency: flagged when the relative slowdown (percent)
        // exceeds the configured performance-delta threshold. Severity is
        // classified on the relative change; magnitude stays absolute (ms).
        if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
            let magnitude =
                current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
            let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
            if relative_change > self.config.performance_delta_threshold {
                regressions.push(Regression {
                    metric: "inference_latency".to_string(),
                    current_value: current.metrics.inference_latency_ms,
                    previous_value: baseline.metrics.inference_latency_ms,
                    magnitude,
                    severity: self.classify_regression_severity(relative_change, "latency"),
                    possible_causes: vec![
                        "Model complexity increased".to_string(),
                        "Inefficient implementation".to_string(),
                        "Hardware degradation".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Profile and optimize bottlenecks".to_string(),
                        "Consider model compression".to_string(),
                        "Check hardware configuration".to_string(),
                    ],
                });
            }
        }

        let critical_regressions = regressions
            .iter()
            .filter(|r| matches!(r.severity, RegressionSeverity::Critical))
            .count();

        // Health: zero on any critical regression; otherwise each
        // regression costs 0.1, clamped so the score never goes negative.
        let health_score = if critical_regressions > 0 {
            0.0
        } else {
            1.0 - (regressions.len() as f64 * 0.1).min(1.0)
        };

        let recommendation = if critical_regressions > 0 {
            "Critical regressions detected. Immediate action required.".to_string()
        } else if !regressions.is_empty() {
            "Some regressions detected. Review and address if necessary.".to_string()
        } else {
            "No significant regressions detected.".to_string()
        };

        let overall_assessment = RegressionAssessment {
            health_score,
            critical_regressions,
            improvements: improvements.len(),
            recommendation,
        };

        let result = RegressionDetectionResult {
            timestamp: Utc::now(),
            regressions,
            improvements,
            overall_assessment,
        };

        self.regression_history.push(result.clone());
        Ok(result)
    }
805
    /// Assembles a point-in-time report: history counts, the five most
    /// recent comparisons, the three most recent regression runs, and a
    /// per-model summary.
    pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
        Ok(DifferentialDebuggingReport {
            timestamp: Utc::now(),
            config: self.config.clone(),
            total_models: self.model_snapshots.len(),
            comparison_count: self.comparison_history.len(),
            ab_test_count: self.ab_tests.len(),
            version_diff_count: self.version_diffs.len(),
            regression_detection_count: self.regression_history.len(),
            // Histories are appended in order, so reversing yields
            // most-recent-first.
            recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
            recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
            model_summary: self.generate_model_summary(),
        })
    }
821
822 fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
825 let mut accuracy_values = HashMap::new();
826 let mut loss_values = HashMap::new();
827 let mut latency_values = HashMap::new();
828 let mut memory_values = HashMap::new();
829 let mut size_values = HashMap::new();
830
831 for model in models {
832 accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
833 loss_values.insert(model.name.clone(), model.metrics.val_loss);
834 latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
835 memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
836 size_values.insert(model.name.clone(), model.metrics.model_size_mb);
837 }
838
839 Ok(PerformanceComparison {
840 accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
841 loss_comparison: self.create_metric_comparison(loss_values, false)?,
842 latency_comparison: self.create_metric_comparison(latency_values, false)?,
843 memory_comparison: self.create_metric_comparison(memory_values, false)?,
844 size_comparison: self.create_metric_comparison(size_values, false)?,
845 custom_comparisons: HashMap::new(),
846 })
847 }
848
849 fn create_metric_comparison(
850 &self,
851 values: HashMap<String, f64>,
852 higher_is_better: bool,
853 ) -> Result<MetricComparison> {
854 let best_model = if higher_is_better {
855 values
856 .iter()
857 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
858 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
859 .0
860 .clone()
861 } else {
862 values
863 .iter()
864 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
865 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
866 .0
867 .clone()
868 };
869
870 let worst_model = if higher_is_better {
871 values
872 .iter()
873 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
874 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
875 .0
876 .clone()
877 } else {
878 values
879 .iter()
880 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
881 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
882 .0
883 .clone()
884 };
885
886 let best_value = values[&best_model];
887 let mut differences = HashMap::new();
888 let mut significant_differences = HashMap::new();
889
890 for (model, value) in &values {
891 let diff = if higher_is_better {
892 (value - best_value) / best_value * 100.0
893 } else {
894 (best_value - value) / best_value * 100.0
895 };
896 differences.insert(model.clone(), diff);
897 significant_differences.insert(model.clone(), diff.abs() > 1.0); }
899
900 Ok(MetricComparison {
901 values,
902 best_model,
903 worst_model,
904 differences,
905 significant_differences,
906 })
907 }
908
    /// Diffs every model's architecture against the first model in the
    /// list (the base) and averages pairwise similarity scores.
    ///
    /// # Errors
    /// Fails when fewer than two models are supplied.
    fn analyze_architecture_differences(
        &self,
        models: &[&ModelSnapshot],
    ) -> Result<ArchitectureDiff> {
        if models.len() < 2 {
            return Err(anyhow::anyhow!(
                "Need at least 2 models for architecture diff"
            ));
        }

        // The first model is the reference; all diffs are model minus base.
        let base_model = models[0];
        let mut parameter_diff = HashMap::new();
        let mut layer_diff = HashMap::new();
        let mut notable_differences = Vec::new();

        for model in models.iter().skip(1) {
            let param_diff = model.architecture.parameter_count as i64
                - base_model.architecture.parameter_count as i64;
            let layer_diff_val =
                model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;

            parameter_diff.insert(model.name.clone(), param_diff);
            layer_diff.insert(model.name.clone(), layer_diff_val);

            // Only parameter gaps beyond 1M are called out as notable.
            if param_diff.abs() > 1_000_000 {
                notable_differences.push(format!(
                    "Model '{}' has {} parameter difference",
                    model.name, param_diff
                ));
            }

            if layer_diff_val != 0 {
                notable_differences.push(format!(
                    "Model '{}' has {} layer difference",
                    model.name, layer_diff_val
                ));
            }
        }

        // Average base-vs-model similarity; the length-2 guard above
        // guarantees at least one score, so the division is safe.
        let mut similarity_scores = Vec::new();
        for model in models.iter().skip(1) {
            let score = self
                .calculate_architecture_similarity(&base_model.architecture, &model.architecture);
            similarity_scores.push(score);
        }
        let similarity_score =
            similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;

        Ok(ArchitectureDiff {
            parameter_diff,
            layer_diff,
            similarity_score,
            notable_differences,
        })
    }
965
966 fn calculate_architecture_similarity(
967 &self,
968 arch1: &ArchitectureInfo,
969 arch2: &ArchitectureInfo,
970 ) -> f64 {
971 let mut similarity = 0.0;
972 let mut features = 0;
973
974 let param_ratio = (arch1.parameter_count.min(arch2.parameter_count) as f64)
976 / (arch1.parameter_count.max(arch2.parameter_count) as f64);
977 similarity += param_ratio;
978 features += 1;
979
980 let layer_ratio = (arch1.layer_count.min(arch2.layer_count) as f64)
982 / (arch1.layer_count.max(arch2.layer_count) as f64);
983 similarity += layer_ratio;
984 features += 1;
985
986 let hidden_ratio = (arch1.hidden_size.min(arch2.hidden_size) as f64)
988 / (arch1.hidden_size.max(arch2.hidden_size) as f64);
989 similarity += hidden_ratio;
990 features += 1;
991
992 similarity / features as f64
993 }
994
995 fn perform_statistical_analysis(
996 &self,
997 _models: &[&ModelSnapshot],
998 ) -> Result<StatisticalAnalysis> {
999 Ok(StatisticalAnalysis {
1002 p_values: HashMap::new(),
1003 effect_sizes: HashMap::new(),
1004 confidence_intervals: HashMap::new(),
1005 significance_summary: HashMap::new(),
1006 })
1007 }
1008
    /// Builds the human-readable summary: the overall best model (chosen
    /// by validation accuracy), per-metric winner rankings, key findings,
    /// and generic recommendations.
    fn generate_comparison_summary(
        &self,
        _models: &[&ModelSnapshot],
        performance: &PerformanceComparison,
        _statistical: &StatisticalAnalysis,
    ) -> Result<ComparisonSummary> {
        let best_model = performance.accuracy_comparison.best_model.clone();

        // Rankings currently list only each metric's winner.
        let mut rankings = HashMap::new();
        rankings.insert(
            "accuracy".to_string(),
            vec![performance.accuracy_comparison.best_model.clone()],
        );
        rankings.insert(
            "latency".to_string(),
            vec![performance.latency_comparison.best_model.clone()],
        );

        // Indexing is safe: each best_model key originates from its own
        // values map.
        let key_findings = vec![
            format!(
                "Best accuracy: {} ({:.2}%)",
                performance.accuracy_comparison.best_model,
                performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
                    * 100.0
            ),
            format!(
                "Fastest inference: {} ({:.2}ms)",
                performance.latency_comparison.best_model,
                performance.latency_comparison.values[&performance.latency_comparison.best_model]
            ),
        ];

        let recommendations = vec![
            "Consider the trade-offs between accuracy and latency".to_string(),
            "Monitor memory usage for production deployment".to_string(),
        ];

        Ok(ComparisonSummary {
            best_model,
            rankings,
            key_findings,
            recommendations,
        })
    }
1053
1054 fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
1055 let mean = data.iter().sum::<f64>() / data.len() as f64;
1056 let variance = data.variance();
1057 let std_dev = variance.sqrt();
1058
1059 let mut sorted_data = data.to_vec();
1060 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1061
1062 let min = sorted_data[0];
1063 let max = sorted_data[sorted_data.len() - 1];
1064 let median = sorted_data[sorted_data.len() / 2];
1065 let q25 = sorted_data[sorted_data.len() / 4];
1066 let q75 = sorted_data[3 * sorted_data.len() / 4];
1067
1068 SummaryStats {
1069 mean,
1070 std_dev,
1071 min,
1072 max,
1073 median,
1074 q25,
1075 q75,
1076 }
1077 }
1078
    /// Runs a two-sample t-test on the "primary_metric" samples of both
    /// arms; returns an empty map when either arm lacks that key.
    ///
    /// NOTE(review): the p-value is a coarse placeholder (0.01 when
    /// |t| > 2, else 0.1) and the confidence interval uses a fixed
    /// normal-approximation 1.96 * SE margin — replace with a proper
    /// t-distribution lookup before relying on significance decisions.
    fn perform_ab_statistical_tests(
        &self,
        model_a: &ABTestMetrics,
        model_b: &ABTestMetrics,
    ) -> Result<HashMap<String, StatisticalTestResult>> {
        let mut results = HashMap::new();

        if let (Some(a_data), Some(b_data)) = (
            model_a.metrics.get("primary_metric"),
            model_b.metrics.get("primary_metric"),
        ) {
            let a_mean = a_data.mean();
            let b_mean = b_data.mean();
            let a_var = a_data.variance();
            let b_var = b_data.variance();

            // Pooled std as a simple average of the variances; assumes
            // roughly comparable sample sizes.
            let pooled_std = ((a_var + b_var) / 2.0).sqrt();
            let standard_error =
                pooled_std * (1.0 / a_data.len() as f64 + 1.0 / b_data.len() as f64).sqrt();
            let t_statistic = (a_mean - b_mean) / standard_error;

            // Placeholder significance decision; not a true p-value.
            let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };

            // Standardized effect size (Cohen's d style: mean difference
            // over pooled std).
            let effect_size = (a_mean - b_mean) / pooled_std;
            let margin_of_error = 1.96 * standard_error;
            results.insert(
                "primary_metric".to_string(),
                StatisticalTestResult {
                    test_type: "Welch's t-test".to_string(),
                    statistic: t_statistic,
                    p_value,
                    effect_size,
                    confidence_interval: (
                        a_mean - b_mean - margin_of_error,
                        a_mean - b_mean + margin_of_error,
                    ),
                    is_significant: p_value < 0.05,
                },
            );
        }

        Ok(results)
    }
1126
    /// Derives a winner and recommendation from the primary-metric test.
    ///
    /// NOTE(review): a positive effect size picks model A, i.e. the arm
    /// with the higher mean wins — this assumes higher metric values are
    /// better; confirm before using with loss-like metrics.
    fn generate_ab_conclusion(
        &self,
        config: &ABTestConfig,
        _model_a: &ABTestMetrics,
        _model_b: &ABTestMetrics,
        tests: &HashMap<String, StatisticalTestResult>,
    ) -> Result<ABTestConclusion> {
        let primary_test = tests.get("primary_metric");

        let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
            let winner = if test.effect_size > 0.0 {
                Some(config.model_a.clone())
            } else {
                Some(config.model_b.clone())
            };

            // Confidence is a coarse two-level mapping from significance.
            let confidence = if test.is_significant { 0.95 } else { 0.5 };
            let practical_significance = test.effect_size.abs() > config.min_effect_size;

            (winner, confidence, practical_significance)
        } else {
            // No primary-metric test available: undecided.
            (None, 0.5, false)
        };

        let recommendation = if practical_significance && confidence > 0.9 {
            format!(
                "Recommend deploying {}",
                winner.as_ref().expect(
                    "winner should be Some when practical_significance and confidence > 0.9"
                )
            )
        } else {
            "Insufficient evidence for a clear recommendation".to_string()
        };

        let summary = format!(
            "A/B test completed with {} confidence",
            if confidence > 0.9 { "high" } else { "low" }
        );

        Ok(ABTestConclusion {
            winner,
            confidence,
            practical_significance,
            recommendation,
            summary,
        })
    }
1175
1176 fn calculate_performance_delta(
1177 &self,
1178 from: &ModelSnapshot,
1179 to: &ModelSnapshot,
1180 ) -> Result<PerformanceDelta> {
1181 Ok(PerformanceDelta {
1182 accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
1183 loss_delta: to.metrics.val_loss - from.metrics.val_loss,
1184 latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
1185 memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
1186 size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
1187 training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
1188 custom_deltas: HashMap::new(),
1189 })
1190 }
1191
1192 fn detect_architecture_changes(
1193 &self,
1194 from: &ModelSnapshot,
1195 to: &ModelSnapshot,
1196 ) -> Result<Vec<ArchitectureChange>> {
1197 let mut changes = Vec::new();
1198
1199 if from.architecture.parameter_count != to.architecture.parameter_count {
1200 changes.push(ArchitectureChange {
1201 change_type: "Parameter Count".to_string(),
1202 description: format!(
1203 "Changed from {} to {} parameters",
1204 from.architecture.parameter_count, to.architecture.parameter_count
1205 ),
1206 impact: "Affects model capacity and memory usage".to_string(),
1207 });
1208 }
1209
1210 if from.architecture.layer_count != to.architecture.layer_count {
1211 changes.push(ArchitectureChange {
1212 change_type: "Layer Count".to_string(),
1213 description: format!(
1214 "Changed from {} to {} layers",
1215 from.architecture.layer_count, to.architecture.layer_count
1216 ),
1217 impact: "Affects model depth and training dynamics".to_string(),
1218 });
1219 }
1220
1221 Ok(changes)
1222 }
1223
1224 fn detect_config_changes(
1225 &self,
1226 from: &ModelSnapshot,
1227 to: &ModelSnapshot,
1228 ) -> Result<Vec<ConfigChange>> {
1229 let mut changes = Vec::new();
1230
1231 if from.training_config.learning_rate != to.training_config.learning_rate {
1232 changes.push(ConfigChange {
1233 parameter: "learning_rate".to_string(),
1234 old_value: from.training_config.learning_rate.to_string(),
1235 new_value: to.training_config.learning_rate.to_string(),
1236 impact: "Affects training speed and convergence".to_string(),
1237 });
1238 }
1239
1240 if from.training_config.batch_size != to.training_config.batch_size {
1241 changes.push(ConfigChange {
1242 parameter: "batch_size".to_string(),
1243 old_value: from.training_config.batch_size.to_string(),
1244 new_value: to.training_config.batch_size.to_string(),
1245 impact: "Affects gradient noise and memory usage".to_string(),
1246 });
1247 }
1248
1249 Ok(changes)
1250 }
1251
    /// Summarizes weight drift between two snapshots.
    ///
    /// Works only from the stored `WeightsSummary` aggregates (not raw
    /// weights), so the numbers are coarse: the absolute shift of the mean
    /// weight, the shift of the max weight, and a heuristic significance
    /// ratio (0.8 when the mean shifted by more than 0.01, else 0.2 —
    /// placeholder constants, not a measured ratio). Per-layer changes are
    /// not tracked here.
    fn analyze_weight_changes(
        &self,
        from: &ModelSnapshot,
        to: &ModelSnapshot,
    ) -> Result<WeightChangesSummary> {
        let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
        let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
        let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };

        Ok(WeightChangesSummary {
            avg_magnitude,
            max_change,
            significant_change_ratio,
            layer_changes: HashMap::new(),
        })
    }
1269
1270 fn classify_regression_severity(
1271 &self,
1272 magnitude: f64,
1273 metric_type: &str,
1274 ) -> RegressionSeverity {
1275 match metric_type {
1276 "accuracy" => {
1277 if magnitude > 0.1 {
1278 RegressionSeverity::Critical
1279 } else if magnitude > 0.05 {
1280 RegressionSeverity::Major
1281 } else if magnitude > 0.02 {
1282 RegressionSeverity::Minor
1283 } else {
1284 RegressionSeverity::Negligible
1285 }
1286 },
1287 "latency" => {
1288 if magnitude > 50.0 {
1289 RegressionSeverity::Critical
1290 } else if magnitude > 20.0 {
1291 RegressionSeverity::Major
1292 } else if magnitude > 10.0 {
1293 RegressionSeverity::Minor
1294 } else {
1295 RegressionSeverity::Negligible
1296 }
1297 },
1298 _ => RegressionSeverity::Minor,
1299 }
1300 }
1301
1302 fn generate_model_summary(&self) -> HashMap<String, String> {
1303 let mut summary = HashMap::new();
1304
1305 if let Some((best_name, best_model)) = self.model_snapshots.iter().max_by(|a, b| {
1306 a.1.metrics
1307 .val_accuracy
1308 .partial_cmp(&b.1.metrics.val_accuracy)
1309 .unwrap_or(std::cmp::Ordering::Equal)
1310 }) {
1311 summary.insert("best_accuracy_model".to_string(), best_name.clone());
1312 summary.insert(
1313 "best_accuracy_value".to_string(),
1314 format!("{:.4}", best_model.metrics.val_accuracy),
1315 );
1316 }
1317
1318 if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
1319 a.1.metrics
1320 .inference_latency_ms
1321 .partial_cmp(&b.1.metrics.inference_latency_ms)
1322 .unwrap_or(std::cmp::Ordering::Equal)
1323 }) {
1324 summary.insert("fastest_model".to_string(), fastest_name.clone());
1325 summary.insert(
1326 "fastest_latency".to_string(),
1327 format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
1328 );
1329 }
1330
1331 summary.insert(
1332 "total_models".to_string(),
1333 self.model_snapshots.len().to_string(),
1334 );
1335 summary
1336 }
1337}
1338
/// Aggregated report of differential-debugging activity: per-analysis
/// counts plus the most recent results and a model summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
    /// When this report was generated.
    pub timestamp: DateTime<Utc>,
    /// Configuration in effect at report time.
    pub config: DifferentialDebuggingConfig,
    /// Number of model snapshots currently tracked.
    pub total_models: usize,
    /// Count of model comparisons performed.
    pub comparison_count: usize,
    /// Count of A/B tests analyzed.
    pub ab_test_count: usize,
    /// Count of version diffs computed.
    pub version_diff_count: usize,
    /// Count of regression-detection runs.
    pub regression_detection_count: usize,
    /// Most recent model-comparison results.
    pub recent_comparisons: Vec<ModelComparisonResult>,
    /// Most recent regression-detection results.
    pub recent_regressions: Vec<RegressionDetectionResult>,
    /// Key/value summary (best-accuracy model, fastest model, total count —
    /// see `generate_model_summary`).
    pub model_summary: HashMap<String, String>,
}
1353
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed debugger has no snapshots.
    ///
    /// Fix: this test (and the one below) contained no `.await`, so the
    /// `#[tokio::test]` attribute only added the overhead of spinning up an
    /// async runtime — plain `#[test]` is sufficient.
    #[test]
    fn test_differential_debugger_creation() {
        let config = DifferentialDebuggingConfig::default();
        let debugger = DifferentialDebugger::new(config);
        assert_eq!(debugger.model_snapshots.len(), 0);
    }

    /// Adding a snapshot makes it visible in the store.
    #[test]
    fn test_model_snapshot_addition() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot = create_test_snapshot("test_model");
        debugger.add_model_snapshot(snapshot).expect("add operation failed");
        assert_eq!(debugger.model_snapshots.len(), 1);
    }

    /// Two registered models can be compared without error.
    /// This one genuinely awaits `compare_models`, so it keeps the
    /// async tokio harness.
    #[tokio::test]
    async fn test_model_comparison() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot1 = create_test_snapshot("model_a");
        let snapshot2 = create_test_snapshot("model_b");

        debugger.add_model_snapshot(snapshot1).expect("add operation failed");
        debugger.add_model_snapshot(snapshot2).expect("add operation failed");

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
    }

    /// Builds a fully populated snapshot with fixed, deterministic metrics
    /// for use as a test fixture.
    fn create_test_snapshot(name: &str) -> ModelSnapshot {
        ModelSnapshot {
            id: Uuid::new_v4(),
            name: name.to_string(),
            timestamp: Utc::now(),
            version: "1.0.0".to_string(),
            commit_hash: Some("abc123".to_string()),
            metrics: ModelMetrics {
                train_accuracy: 0.95,
                val_accuracy: 0.90,
                test_accuracy: Some(0.88),
                train_loss: 0.05,
                val_loss: 0.10,
                test_loss: Some(0.12),
                inference_latency_ms: 50.0,
                memory_usage_mb: 2048.0,
                model_size_mb: 500.0,
                flops: 1_000_000_000,
                training_time_s: 3600.0,
                custom_metrics: HashMap::new(),
            },
            architecture: ArchitectureInfo {
                parameter_count: 175_000_000,
                layer_count: 24,
                depth: 24,
                hidden_size: 1024,
                num_heads: Some(16),
                ff_dim: Some(4096),
                vocab_size: Some(50257),
                max_seq_length: Some(2048),
            },
            training_config: TrainingConfig {
                learning_rate: 1e-4,
                batch_size: 32,
                epochs: 10,
                optimizer: "AdamW".to_string(),
                lr_schedule: Some("cosine".to_string()),
                regularization: HashMap::new(),
            },
            weights_summary: WeightsSummary {
                mean: 0.0,
                std_dev: 0.1,
                min: -0.5,
                max: 0.5,
                percentiles: HashMap::new(),
                zero_count: 1000,
                sparsity: 0.01,
            },
            metadata: HashMap::new(),
        }
    }
}