1use anyhow::Result;
7use chrono::{DateTime, Utc};
8use indexmap::IndexMap;
9use serde::{Deserialize, Serialize};
11use statrs::statistics::Statistics;
12use std::collections::HashMap;
13use uuid::Uuid;
14
/// Configuration for the differential debugging subsystem.
///
/// The boolean flags gate each analysis feature; the numeric fields tune
/// statistical and regression thresholds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingConfig {
    pub enable_model_comparison: bool,
    pub enable_ab_analysis: bool,
    pub enable_version_diff: bool,
    pub enable_regression_detection: bool,
    pub enable_performance_delta: bool,
    /// p-value threshold used when judging statistical significance.
    pub significance_threshold: f64,
    /// Maximum number of snapshots retained; oldest are evicted first.
    pub max_comparison_models: usize,
    /// Scales the accuracy-drop cutoff in `detect_regressions`.
    pub regression_sensitivity: f64,
    /// Relative latency increase (percent) above which a regression is flagged.
    pub performance_delta_threshold: f64,
}
37
38impl Default for DifferentialDebuggingConfig {
39 fn default() -> Self {
40 Self {
41 enable_model_comparison: true,
42 enable_ab_analysis: true,
43 enable_version_diff: true,
44 enable_regression_detection: true,
45 enable_performance_delta: true,
46 significance_threshold: 0.05,
47 max_comparison_models: 10,
48 regression_sensitivity: 0.8,
49 performance_delta_threshold: 5.0,
50 }
51 }
52}
53
/// Point-in-time record of a trained model: identity, metrics,
/// architecture, training configuration, and a weights summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
    pub id: Uuid,
    /// Snapshot key; used as the lookup name in the debugger's store.
    pub name: String,
    pub timestamp: DateTime<Utc>,
    pub version: String,
    /// Source-control revision the model was built from, if known.
    pub commit_hash: Option<String>,
    pub metrics: ModelMetrics,
    pub architecture: ArchitectureInfo,
    pub training_config: TrainingConfig,
    pub weights_summary: WeightsSummary,
    /// Free-form key/value annotations.
    pub metadata: HashMap<String, String>,
}
78
/// Quality and resource metrics for a single model snapshot.
/// Test-set figures are optional since a test split may not exist.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    pub train_accuracy: f64,
    pub val_accuracy: f64,
    pub test_accuracy: Option<f64>,
    pub train_loss: f64,
    pub val_loss: f64,
    pub test_loss: Option<f64>,
    pub inference_latency_ms: f64,
    pub memory_usage_mb: f64,
    pub model_size_mb: f64,
    /// Floating-point operations per inference.
    pub flops: u64,
    pub training_time_s: f64,
    /// Additional named metrics not covered by the fixed fields.
    pub custom_metrics: HashMap<String, f64>,
}
107
/// Structural description of a model. Transformer-specific fields are
/// optional so non-transformer architectures can omit them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
    pub parameter_count: u64,
    pub layer_count: u32,
    pub depth: u32,
    pub hidden_size: u32,
    pub num_heads: Option<u32>,
    pub ff_dim: Option<u32>,
    pub vocab_size: Option<u32>,
    pub max_seq_length: Option<u32>,
}
128
/// Hyperparameters the model was trained with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
    pub learning_rate: f64,
    pub batch_size: u32,
    pub epochs: u32,
    /// Optimizer name (e.g. free-form string such as "adam").
    pub optimizer: String,
    pub lr_schedule: Option<String>,
    /// Named regularization strengths (e.g. weight decay, dropout).
    pub regularization: HashMap<String, f64>,
}
145
/// Aggregate statistics over a model's weight values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    /// Percentile label (e.g. "p50") -> weight value at that percentile.
    pub percentiles: HashMap<String, f64>,
    pub zero_count: u64,
    /// Fraction of weights that are zero.
    pub sparsity: f64,
}
164
/// Output of a multi-model comparison run (`compare_models`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
    /// Names of the models compared, in the order requested.
    pub models: Vec<String>,
    pub timestamp: DateTime<Utc>,
    pub performance_comparison: PerformanceComparison,
    pub architecture_diff: ArchitectureDiff,
    pub statistical_analysis: StatisticalAnalysis,
    pub summary: ComparisonSummary,
}
181
/// Per-metric comparisons across the compared models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    pub accuracy_comparison: MetricComparison,
    pub loss_comparison: MetricComparison,
    pub latency_comparison: MetricComparison,
    pub memory_comparison: MetricComparison,
    pub size_comparison: MetricComparison,
    /// Comparisons for user-defined metrics, keyed by metric name.
    pub custom_comparisons: HashMap<String, MetricComparison>,
}
198
/// Comparison of one metric across models: raw values, best/worst pick,
/// and percentage gaps from the best value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
    /// Model name -> metric value.
    pub values: HashMap<String, f64>,
    pub best_model: String,
    pub worst_model: String,
    /// Model name -> signed percentage difference relative to the best value.
    pub differences: HashMap<String, f64>,
    /// Model name -> whether its gap exceeds the significance cutoff.
    pub significant_differences: HashMap<String, bool>,
}
213
/// Architecture differences of each model relative to a base model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
    /// Model name -> parameter-count delta vs. the base model.
    pub parameter_diff: HashMap<String, i64>,
    /// Model name -> layer-count delta vs. the base model.
    pub layer_diff: HashMap<String, i32>,
    /// Average pairwise similarity in [0, 1].
    pub similarity_score: f64,
    pub notable_differences: Vec<String>,
}
226
/// Statistical test results keyed by metric name.
/// Currently populated with empty maps (placeholder analysis).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    pub p_values: HashMap<String, f64>,
    pub effect_sizes: HashMap<String, f64>,
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    pub significance_summary: HashMap<String, bool>,
}
239
/// Human-oriented digest of a comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
    /// Winner by validation accuracy.
    pub best_model: String,
    /// Metric name -> ranked model names (currently only the best entry).
    pub rankings: HashMap<String, Vec<String>>,
    pub key_findings: Vec<String>,
    pub recommendations: Vec<String>,
}
252
/// Parameters of an A/B test between two models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    pub name: String,
    pub model_a: String,
    pub model_b: String,
    /// Intended test duration; `None` means open-ended.
    pub duration_hours: Option<u32>,
    pub sample_size: u32,
    pub tracked_metrics: Vec<String>,
    /// Minimum effect size considered practically significant.
    pub min_effect_size: f64,
    /// Target statistical power for the test design.
    pub power: f64,
}
273
/// Full record of a completed (or in-flight) A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
    pub config: ABTestConfig,
    pub start_time: DateTime<Utc>,
    /// `None` while the test is still running.
    pub end_time: Option<DateTime<Utc>>,
    pub model_a_results: ABTestMetrics,
    pub model_b_results: ABTestMetrics,
    /// Test results keyed by metric name.
    pub statistical_tests: HashMap<String, StatisticalTestResult>,
    pub conclusion: ABTestConclusion,
}
292
/// Observations collected for one arm of an A/B test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
    pub sample_size: u32,
    /// Metric name -> raw sample series.
    pub metrics: HashMap<String, Vec<f64>>,
    /// Metric name -> descriptive statistics of its series.
    pub summary_stats: HashMap<String, SummaryStats>,
}
303
/// Descriptive statistics of a numeric sample.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
    pub mean: f64,
    pub std_dev: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    /// First quartile (25th percentile).
    pub q25: f64,
    /// Third quartile (75th percentile).
    pub q75: f64,
}
315
/// Outcome of a single two-sample statistical test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    /// Name of the test performed (e.g. "Welch's t-test").
    pub test_type: String,
    pub statistic: f64,
    pub p_value: f64,
    pub effect_size: f64,
    /// (lower, upper) bounds on the mean difference.
    pub confidence_interval: (f64, f64),
    pub is_significant: bool,
}
332
/// Verdict derived from an A/B test's statistical results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
    /// Winning model name, or `None` when no test result was available.
    pub winner: Option<String>,
    pub confidence: f64,
    /// Whether the effect size cleared the configured minimum.
    pub practical_significance: bool,
    pub recommendation: String,
    pub summary: String,
}
347
/// Differences between two model versions (`track_version_diff`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
    pub from_version: String,
    pub to_version: String,
    pub timestamp: DateTime<Utc>,
    pub performance_delta: PerformanceDelta,
    pub architecture_changes: Vec<ArchitectureChange>,
    pub config_changes: Vec<ConfigChange>,
    pub weight_changes: WeightChangesSummary,
}
366
/// Metric deltas computed as `to - from` across a version transition;
/// sign conventions follow the underlying metric (e.g. positive
/// `accuracy_delta` is an improvement, positive `latency_delta` is worse).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
    pub accuracy_delta: f64,
    pub loss_delta: f64,
    pub latency_delta: f64,
    pub memory_delta: f64,
    pub size_delta: f64,
    pub training_time_delta: f64,
    pub custom_deltas: HashMap<String, f64>,
}
385
/// One detected structural change between versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
    /// Category label (e.g. "Parameter Count").
    pub change_type: String,
    pub description: String,
    /// Brief note on the expected consequence of the change.
    pub impact: String,
}
396
/// One detected training-configuration change between versions.
/// Values are stringified for uniform display.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
    pub parameter: String,
    pub old_value: String,
    pub new_value: String,
    pub impact: String,
}
409
/// Coarse summary of weight drift between two snapshots.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
    pub avg_magnitude: f64,
    pub max_change: f64,
    /// Heuristic fraction of weights considered significantly changed.
    pub significant_change_ratio: f64,
    /// Layer name -> change magnitude (currently unpopulated).
    pub layer_changes: HashMap<String, f64>,
}
422
/// Output of a regression-detection run (`detect_regressions`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
    pub timestamp: DateTime<Utc>,
    pub regressions: Vec<Regression>,
    pub improvements: Vec<Improvement>,
    pub overall_assessment: RegressionAssessment,
}
435
/// One detected metric regression, with diagnosis hints.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
    pub metric: String,
    pub current_value: f64,
    pub previous_value: f64,
    /// Absolute size of the degradation (always non-negative).
    pub magnitude: f64,
    pub severity: RegressionSeverity,
    pub possible_causes: Vec<String>,
    pub suggested_fixes: Vec<String>,
}
454
/// One detected metric improvement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
    pub metric: String,
    pub current_value: f64,
    pub previous_value: f64,
    /// Absolute size of the gain (always non-negative).
    pub magnitude: f64,
    pub likely_causes: Vec<String>,
}
469
/// Severity buckets for a regression, ordered most to least severe.
/// Thresholds per metric family live in `classify_regression_severity`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
    Critical,
    Major,
    Minor,
    Negligible,
}
478
/// Roll-up verdict over a regression-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
    /// 0.0 (critical regressions present) to 1.0 (clean).
    pub health_score: f64,
    pub critical_regressions: usize,
    /// Count of detected improvements.
    pub improvements: usize,
    pub recommendation: String,
}
491
/// Stateful engine for model comparison, A/B testing, version diffing,
/// and regression detection. Snapshots are keyed by model name in an
/// `IndexMap` so insertion order is preserved for oldest-first eviction.
#[derive(Debug)]
pub struct DifferentialDebugger {
    config: DifferentialDebuggingConfig,
    model_snapshots: IndexMap<String, ModelSnapshot>,
    comparison_history: Vec<ModelComparisonResult>,
    ab_tests: Vec<ABTestResult>,
    version_diffs: Vec<VersionDiff>,
    regression_history: Vec<RegressionDetectionResult>,
}
502
503impl DifferentialDebugger {
504 pub fn new(config: DifferentialDebuggingConfig) -> Self {
506 Self {
507 config,
508 model_snapshots: IndexMap::new(),
509 comparison_history: Vec::new(),
510 ab_tests: Vec::new(),
511 version_diffs: Vec::new(),
512 regression_history: Vec::new(),
513 }
514 }
515
516 pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
518 if self.model_snapshots.len() >= self.config.max_comparison_models {
519 self.model_snapshots.shift_remove_index(0);
521 }
522
523 self.model_snapshots.insert(snapshot.name.clone(), snapshot);
524 Ok(())
525 }
526
    /// Compares the named snapshots across performance, architecture, and
    /// (placeholder) statistics, recording the result in history.
    ///
    /// # Errors
    /// Fails when comparison is disabled, fewer than two names are given,
    /// or any name has no registered snapshot.
    pub async fn compare_models(
        &mut self,
        model_names: Vec<String>,
    ) -> Result<ModelComparisonResult> {
        if !self.config.enable_model_comparison {
            return Err(anyhow::anyhow!("Model comparison is disabled"));
        }

        if model_names.len() < 2 {
            return Err(anyhow::anyhow!(
                "At least two models are required for comparison"
            ));
        }

        // Resolve every name up front; one missing model aborts the run.
        let models: Vec<&ModelSnapshot> = model_names
            .iter()
            .map(|name| {
                self.model_snapshots
                    .get(name)
                    .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
            })
            .collect::<Result<Vec<_>>>()?;

        let performance_comparison = self.compare_performance(&models)?;
        let architecture_diff = self.analyze_architecture_differences(&models)?;
        let statistical_analysis = self.perform_statistical_analysis(&models)?;
        let summary = self.generate_comparison_summary(
            &models,
            &performance_comparison,
            &statistical_analysis,
        )?;

        let result = ModelComparisonResult {
            models: model_names,
            timestamp: Utc::now(),
            performance_comparison,
            architecture_diff,
            statistical_analysis,
            summary,
        };

        // Retain a copy for later aggregate reporting.
        self.comparison_history.push(result.clone());
        Ok(result)
    }
574
    /// Runs an A/B comparison over two pre-collected sample series.
    ///
    /// Both series are stored under the single key "primary_metric";
    /// `config.tracked_metrics` is not consulted here. The finished result
    /// (with statistical tests and a conclusion) is appended to history.
    ///
    /// NOTE(review): assumes both sample vectors are non-empty — confirm
    /// callers validate before invoking.
    ///
    /// # Errors
    /// Fails when A/B analysis is disabled.
    pub async fn run_ab_test(
        &mut self,
        config: ABTestConfig,
        model_a_data: Vec<f64>,
        model_b_data: Vec<f64>,
    ) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }

        let start_time = Utc::now();

        // Descriptive statistics for each arm.
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);

        let model_a_results = ABTestMetrics {
            sample_size: model_a_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_a_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_a_stats);
                stats
            },
        };

        let model_b_results = ABTestMetrics {
            sample_size: model_b_data.len() as u32,
            metrics: {
                let mut metrics = HashMap::new();
                metrics.insert("primary_metric".to_string(), model_b_data);
                metrics
            },
            summary_stats: {
                let mut stats = HashMap::new();
                stats.insert("primary_metric".to_string(), model_b_stats);
                stats
            },
        };

        let statistical_tests =
            self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;

        let conclusion = self.generate_ab_conclusion(
            &config,
            &model_a_results,
            &model_b_results,
            &statistical_tests,
        )?;

        let result = ABTestResult {
            config,
            start_time,
            // The test completes synchronously, so end_time is set now.
            end_time: Some(Utc::now()),
            model_a_results,
            model_b_results,
            statistical_tests,
            conclusion,
        };

        self.ab_tests.push(result.clone());
        Ok(result)
    }
645
    /// Computes and records the diff between two registered snapshots:
    /// performance deltas, architecture changes, config changes, and a
    /// weight-drift summary.
    ///
    /// # Errors
    /// Fails when version diffing is disabled or either name is unknown.
    pub async fn track_version_diff(
        &mut self,
        from_model: &str,
        to_model: &str,
    ) -> Result<VersionDiff> {
        if !self.config.enable_version_diff {
            return Err(anyhow::anyhow!("Version diff tracking is disabled"));
        }

        let from_snapshot = self
            .model_snapshots
            .get(from_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
        let to_snapshot = self
            .model_snapshots
            .get(to_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;

        let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
        let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
        let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
        let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;

        let diff = VersionDiff {
            from_version: from_snapshot.version.clone(),
            to_version: to_snapshot.version.clone(),
            timestamp: Utc::now(),
            performance_delta,
            architecture_changes,
            config_changes,
            weight_changes,
        };

        self.version_diffs.push(diff.clone());
        Ok(diff)
    }
683
    /// Compares `current_model` against `baseline_model`, classifying
    /// validation-accuracy and inference-latency movements as regressions
    /// or improvements, and records an overall health assessment.
    ///
    /// # Errors
    /// Fails when regression detection is disabled or either name is
    /// not registered.
    pub async fn detect_regressions(
        &mut self,
        current_model: &str,
        baseline_model: &str,
    ) -> Result<RegressionDetectionResult> {
        if !self.config.enable_regression_detection {
            return Err(anyhow::anyhow!("Regression detection is disabled"));
        }

        let current = self
            .model_snapshots
            .get(current_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
        let baseline = self
            .model_snapshots
            .get(baseline_model)
            .ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;

        let mut regressions = Vec::new();
        let mut improvements = Vec::new();

        // Accuracy: a drop bigger than sensitivity * 0.01 (i.e. a scaled
        // one-percentage-point cutoff) counts as a regression; any rise at
        // all is recorded as an improvement.
        if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
            let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
            if magnitude > self.config.regression_sensitivity * 0.01 {
                regressions.push(Regression {
                    metric: "validation_accuracy".to_string(),
                    current_value: current.metrics.val_accuracy,
                    previous_value: baseline.metrics.val_accuracy,
                    magnitude,
                    severity: self.classify_regression_severity(magnitude, "accuracy"),
                    possible_causes: vec![
                        "Learning rate too high".to_string(),
                        "Insufficient training".to_string(),
                        "Data distribution shift".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Reduce learning rate".to_string(),
                        "Increase training epochs".to_string(),
                        "Check data quality".to_string(),
                    ],
                });
            }
        } else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
            let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
            improvements.push(Improvement {
                metric: "validation_accuracy".to_string(),
                current_value: current.metrics.val_accuracy,
                previous_value: baseline.metrics.val_accuracy,
                magnitude,
                likely_causes: vec![
                    "Better optimization".to_string(),
                    "Improved architecture".to_string(),
                    "Better hyperparameters".to_string(),
                ],
            });
        }

        // Latency: regression when the relative slowdown (percent of the
        // baseline) exceeds the configured delta threshold. Note latency
        // improvements are not tracked, only degradations.
        if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
            let magnitude =
                current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
            let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
            if relative_change > self.config.performance_delta_threshold {
                regressions.push(Regression {
                    metric: "inference_latency".to_string(),
                    current_value: current.metrics.inference_latency_ms,
                    previous_value: baseline.metrics.inference_latency_ms,
                    magnitude,
                    severity: self.classify_regression_severity(relative_change, "latency"),
                    possible_causes: vec![
                        "Model complexity increased".to_string(),
                        "Inefficient implementation".to_string(),
                        "Hardware degradation".to_string(),
                    ],
                    suggested_fixes: vec![
                        "Profile and optimize bottlenecks".to_string(),
                        "Consider model compression".to_string(),
                        "Check hardware configuration".to_string(),
                    ],
                });
            }
        }

        let critical_regressions = regressions
            .iter()
            .filter(|r| matches!(r.severity, RegressionSeverity::Critical))
            .count();

        // Health score: 0 when anything critical; otherwise deduct 0.1 per
        // regression, with the deduction clamped so the score stays >= 0.
        let health_score = if critical_regressions > 0 {
            0.0
        } else {
            1.0 - (regressions.len() as f64 * 0.1).min(1.0)
        };

        let recommendation = if critical_regressions > 0 {
            "Critical regressions detected. Immediate action required.".to_string()
        } else if !regressions.is_empty() {
            "Some regressions detected. Review and address if necessary.".to_string()
        } else {
            "No significant regressions detected.".to_string()
        };

        let overall_assessment = RegressionAssessment {
            health_score,
            critical_regressions,
            improvements: improvements.len(),
            recommendation,
        };

        let result = RegressionDetectionResult {
            timestamp: Utc::now(),
            regressions,
            improvements,
            overall_assessment,
        };

        self.regression_history.push(result.clone());
        Ok(result)
    }
805
    /// Builds an aggregate report of all differential-debugging activity:
    /// counts per feature plus the 5 most recent comparisons and 3 most
    /// recent regression runs (newest first).
    pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
        Ok(DifferentialDebuggingReport {
            timestamp: Utc::now(),
            config: self.config.clone(),
            total_models: self.model_snapshots.len(),
            comparison_count: self.comparison_history.len(),
            ab_test_count: self.ab_tests.len(),
            version_diff_count: self.version_diffs.len(),
            regression_detection_count: self.regression_history.len(),
            recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
            recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
            model_summary: self.generate_model_summary(),
        })
    }
821
822 fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
825 let mut accuracy_values = HashMap::new();
826 let mut loss_values = HashMap::new();
827 let mut latency_values = HashMap::new();
828 let mut memory_values = HashMap::new();
829 let mut size_values = HashMap::new();
830
831 for model in models {
832 accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
833 loss_values.insert(model.name.clone(), model.metrics.val_loss);
834 latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
835 memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
836 size_values.insert(model.name.clone(), model.metrics.model_size_mb);
837 }
838
839 Ok(PerformanceComparison {
840 accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
841 loss_comparison: self.create_metric_comparison(loss_values, false)?,
842 latency_comparison: self.create_metric_comparison(latency_values, false)?,
843 memory_comparison: self.create_metric_comparison(memory_values, false)?,
844 size_comparison: self.create_metric_comparison(size_values, false)?,
845 custom_comparisons: HashMap::new(),
846 })
847 }
848
849 fn create_metric_comparison(
850 &self,
851 values: HashMap<String, f64>,
852 higher_is_better: bool,
853 ) -> Result<MetricComparison> {
854 let best_model = if higher_is_better {
855 values
856 .iter()
857 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
858 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
859 .0
860 .clone()
861 } else {
862 values
863 .iter()
864 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
865 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
866 .0
867 .clone()
868 };
869
870 let worst_model = if higher_is_better {
871 values
872 .iter()
873 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
874 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
875 .0
876 .clone()
877 } else {
878 values
879 .iter()
880 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
881 .ok_or_else(|| anyhow::anyhow!("No values to compare"))?
882 .0
883 .clone()
884 };
885
886 let best_value = values[&best_model];
887 let mut differences = HashMap::new();
888 let mut significant_differences = HashMap::new();
889
890 for (model, value) in &values {
891 let diff = if higher_is_better {
892 (value - best_value) / best_value * 100.0
893 } else {
894 (best_value - value) / best_value * 100.0
895 };
896 differences.insert(model.clone(), diff);
897 significant_differences.insert(model.clone(), diff.abs() > 1.0); }
899
900 Ok(MetricComparison {
901 values,
902 best_model,
903 worst_model,
904 differences,
905 significant_differences,
906 })
907 }
908
    /// Diffs every model's architecture against the first model in the
    /// slice (treated as the base), collecting parameter/layer deltas,
    /// notable-difference notes, and an average similarity score.
    ///
    /// # Errors
    /// Fails when fewer than two models are supplied.
    fn analyze_architecture_differences(
        &self,
        models: &[&ModelSnapshot],
    ) -> Result<ArchitectureDiff> {
        if models.len() < 2 {
            return Err(anyhow::anyhow!(
                "Need at least 2 models for architecture diff"
            ));
        }

        let base_model = models[0];
        let mut parameter_diff = HashMap::new();
        let mut layer_diff = HashMap::new();
        let mut notable_differences = Vec::new();

        for model in models.iter().skip(1) {
            let param_diff = model.architecture.parameter_count as i64
                - base_model.architecture.parameter_count as i64;
            let layer_diff_val =
                model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;

            parameter_diff.insert(model.name.clone(), param_diff);
            layer_diff.insert(model.name.clone(), layer_diff_val);

            // A >1M parameter gap is considered worth calling out.
            if param_diff.abs() > 1_000_000 {
                notable_differences.push(format!(
                    "Model '{}' has {} parameter difference",
                    model.name, param_diff
                ));
            }

            if layer_diff_val != 0 {
                notable_differences.push(format!(
                    "Model '{}' has {} layer difference",
                    model.name, layer_diff_val
                ));
            }
        }

        // Average the base-vs-other similarity over all non-base models;
        // the guard above ensures the divisor is at least 1.
        let mut similarity_scores = Vec::new();
        for model in models.iter().skip(1) {
            let score = self
                .calculate_architecture_similarity(&base_model.architecture, &model.architecture);
            similarity_scores.push(score);
        }
        let similarity_score =
            similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;

        Ok(ArchitectureDiff {
            parameter_diff,
            layer_diff,
            similarity_score,
            notable_differences,
        })
    }
965
966 fn calculate_architecture_similarity(
967 &self,
968 arch1: &ArchitectureInfo,
969 arch2: &ArchitectureInfo,
970 ) -> f64 {
971 let mut similarity = 0.0;
972 let mut features = 0;
973
974 let param_ratio = (arch1.parameter_count.min(arch2.parameter_count) as f64)
976 / (arch1.parameter_count.max(arch2.parameter_count) as f64);
977 similarity += param_ratio;
978 features += 1;
979
980 let layer_ratio = (arch1.layer_count.min(arch2.layer_count) as f64)
982 / (arch1.layer_count.max(arch2.layer_count) as f64);
983 similarity += layer_ratio;
984 features += 1;
985
986 let hidden_ratio = (arch1.hidden_size.min(arch2.hidden_size) as f64)
988 / (arch1.hidden_size.max(arch2.hidden_size) as f64);
989 similarity += hidden_ratio;
990 features += 1;
991
992 similarity / features as f64
993 }
994
995 fn perform_statistical_analysis(
996 &self,
997 _models: &[&ModelSnapshot],
998 ) -> Result<StatisticalAnalysis> {
999 Ok(StatisticalAnalysis {
1002 p_values: HashMap::new(),
1003 effect_sizes: HashMap::new(),
1004 confidence_intervals: HashMap::new(),
1005 significance_summary: HashMap::new(),
1006 })
1007 }
1008
    /// Condenses a comparison into a human-readable summary: overall best
    /// model (by validation accuracy), per-metric "rankings" (currently
    /// just the winner per metric), key findings, and generic advice.
    fn generate_comparison_summary(
        &self,
        _models: &[&ModelSnapshot],
        performance: &PerformanceComparison,
        _statistical: &StatisticalAnalysis,
    ) -> Result<ComparisonSummary> {
        let best_model = performance.accuracy_comparison.best_model.clone();

        let mut rankings = HashMap::new();
        rankings.insert(
            "accuracy".to_string(),
            vec![performance.accuracy_comparison.best_model.clone()],
        );
        rankings.insert(
            "latency".to_string(),
            vec![performance.latency_comparison.best_model.clone()],
        );

        // Indexing is safe: best_model always originates from `values`.
        // Accuracy is stored on a 0..1 scale, hence the * 100 for display.
        let key_findings = vec![
            format!(
                "Best accuracy: {} ({:.2}%)",
                performance.accuracy_comparison.best_model,
                performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
                    * 100.0
            ),
            format!(
                "Fastest inference: {} ({:.2}ms)",
                performance.latency_comparison.best_model,
                performance.latency_comparison.values[&performance.latency_comparison.best_model]
            ),
        ];

        let recommendations = vec![
            "Consider the trade-offs between accuracy and latency".to_string(),
            "Monitor memory usage for production deployment".to_string(),
        ];

        Ok(ComparisonSummary {
            best_model,
            rankings,
            key_findings,
            recommendations,
        })
    }
1053
1054 fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
1055 let mean = data.iter().sum::<f64>() / data.len() as f64;
1056 let variance = data.variance();
1057 let std_dev = variance.sqrt();
1058
1059 let mut sorted_data = data.to_vec();
1060 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1061
1062 let min = sorted_data[0];
1063 let max = sorted_data[sorted_data.len() - 1];
1064 let median = sorted_data[sorted_data.len() / 2];
1065 let q25 = sorted_data[sorted_data.len() / 4];
1066 let q75 = sorted_data[3 * sorted_data.len() / 4];
1067
1068 SummaryStats {
1069 mean,
1070 std_dev,
1071 min,
1072 max,
1073 median,
1074 q25,
1075 q75,
1076 }
1077 }
1078
    /// Runs a simplified two-sample comparison on each arm's
    /// "primary_metric" series.
    ///
    /// NOTE(review): the p-value is a coarse placeholder (0.01 if |t| > 2,
    /// else 0.1), not a t-distribution lookup, and the pooled std is an
    /// unweighted average of the two variances rather than df-weighted.
    /// If both samples are constant and equal, the t statistic divides by
    /// zero — confirm callers never pass degenerate data.
    fn perform_ab_statistical_tests(
        &self,
        model_a: &ABTestMetrics,
        model_b: &ABTestMetrics,
    ) -> Result<HashMap<String, StatisticalTestResult>> {
        let mut results = HashMap::new();

        // Only runs when both arms recorded the primary metric; otherwise
        // an empty map is returned.
        if let (Some(a_data), Some(b_data)) = (
            model_a.metrics.get("primary_metric"),
            model_b.metrics.get("primary_metric"),
        ) {
            let a_mean = a_data.mean();
            let b_mean = b_data.mean();
            let a_var = a_data.variance();
            let b_var = b_data.variance();

            let pooled_std = ((a_var + b_var) / 2.0).sqrt();
            let standard_error =
                pooled_std * (1.0 / a_data.len() as f64 + 1.0 / b_data.len() as f64).sqrt();
            // Positive t favors model A (A's mean is higher).
            let t_statistic = (a_mean - b_mean) / standard_error;

            let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };

            // Cohen's-d-style effect size; 1.96 approximates the 95% CI
            // multiplier for a normal distribution.
            let effect_size = (a_mean - b_mean) / pooled_std;
            let margin_of_error = 1.96 * standard_error;
            results.insert(
                "primary_metric".to_string(),
                StatisticalTestResult {
                    test_type: "Welch's t-test".to_string(),
                    statistic: t_statistic,
                    p_value,
                    effect_size,
                    confidence_interval: (
                        a_mean - b_mean - margin_of_error,
                        a_mean - b_mean + margin_of_error,
                    ),
                    is_significant: p_value < 0.05,
                },
            );
        }

        Ok(results)
    }
1126
    /// Derives a winner and recommendation from the primary-metric test.
    ///
    /// NOTE(review): the winner is chosen purely by the sign of the effect
    /// size, which assumes a higher metric is better; a zero effect size
    /// (including ties) awards model B. Confirm this matches the tracked
    /// metric's polarity.
    fn generate_ab_conclusion(
        &self,
        config: &ABTestConfig,
        _model_a: &ABTestMetrics,
        _model_b: &ABTestMetrics,
        tests: &HashMap<String, StatisticalTestResult>,
    ) -> Result<ABTestConclusion> {
        let primary_test = tests.get("primary_metric");

        let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
            let winner = if test.effect_size > 0.0 {
                Some(config.model_a.clone())
            } else {
                Some(config.model_b.clone())
            };

            // Fixed confidence levels keyed off statistical significance.
            let confidence = if test.is_significant { 0.95 } else { 0.5 };
            let practical_significance = test.effect_size.abs() > config.min_effect_size;

            (winner, confidence, practical_significance)
        } else {
            // No test available: undecided with neutral confidence.
            (None, 0.5, false)
        };

        let recommendation = if practical_significance && confidence > 0.9 {
            format!(
                "Recommend deploying {}",
                winner.as_ref().expect(
                    "winner should be Some when practical_significance and confidence > 0.9"
                )
            )
        } else {
            "Insufficient evidence for a clear recommendation".to_string()
        };

        let summary = format!(
            "A/B test completed with {} confidence",
            if confidence > 0.9 { "high" } else { "low" }
        );

        Ok(ABTestConclusion {
            winner,
            confidence,
            practical_significance,
            recommendation,
            summary,
        })
    }
1175
1176 fn calculate_performance_delta(
1177 &self,
1178 from: &ModelSnapshot,
1179 to: &ModelSnapshot,
1180 ) -> Result<PerformanceDelta> {
1181 Ok(PerformanceDelta {
1182 accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
1183 loss_delta: to.metrics.val_loss - from.metrics.val_loss,
1184 latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
1185 memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
1186 size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
1187 training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
1188 custom_deltas: HashMap::new(),
1189 })
1190 }
1191
1192 fn detect_architecture_changes(
1193 &self,
1194 from: &ModelSnapshot,
1195 to: &ModelSnapshot,
1196 ) -> Result<Vec<ArchitectureChange>> {
1197 let mut changes = Vec::new();
1198
1199 if from.architecture.parameter_count != to.architecture.parameter_count {
1200 changes.push(ArchitectureChange {
1201 change_type: "Parameter Count".to_string(),
1202 description: format!(
1203 "Changed from {} to {} parameters",
1204 from.architecture.parameter_count, to.architecture.parameter_count
1205 ),
1206 impact: "Affects model capacity and memory usage".to_string(),
1207 });
1208 }
1209
1210 if from.architecture.layer_count != to.architecture.layer_count {
1211 changes.push(ArchitectureChange {
1212 change_type: "Layer Count".to_string(),
1213 description: format!(
1214 "Changed from {} to {} layers",
1215 from.architecture.layer_count, to.architecture.layer_count
1216 ),
1217 impact: "Affects model depth and training dynamics".to_string(),
1218 });
1219 }
1220
1221 Ok(changes)
1222 }
1223
1224 fn detect_config_changes(
1225 &self,
1226 from: &ModelSnapshot,
1227 to: &ModelSnapshot,
1228 ) -> Result<Vec<ConfigChange>> {
1229 let mut changes = Vec::new();
1230
1231 if from.training_config.learning_rate != to.training_config.learning_rate {
1232 changes.push(ConfigChange {
1233 parameter: "learning_rate".to_string(),
1234 old_value: from.training_config.learning_rate.to_string(),
1235 new_value: to.training_config.learning_rate.to_string(),
1236 impact: "Affects training speed and convergence".to_string(),
1237 });
1238 }
1239
1240 if from.training_config.batch_size != to.training_config.batch_size {
1241 changes.push(ConfigChange {
1242 parameter: "batch_size".to_string(),
1243 old_value: from.training_config.batch_size.to_string(),
1244 new_value: to.training_config.batch_size.to_string(),
1245 impact: "Affects gradient noise and memory usage".to_string(),
1246 });
1247 }
1248
1249 Ok(changes)
1250 }
1251
    /// Summarizes weight drift between two snapshots.
    ///
    /// NOTE(review): these are coarse proxies — `avg_magnitude` is the
    /// shift in the overall weight mean and `max_change` the shift in the
    /// overall max, not element-wise statistics; the significant-change
    /// ratio is a two-level heuristic keyed off the mean shift. Per-layer
    /// changes are left empty.
    fn analyze_weight_changes(
        &self,
        from: &ModelSnapshot,
        to: &ModelSnapshot,
    ) -> Result<WeightChangesSummary> {
        let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
        let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
        let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };

        Ok(WeightChangesSummary {
            avg_magnitude,
            max_change,
            significant_change_ratio,
            layer_changes: HashMap::new(),
        })
    }
1269
1270 fn classify_regression_severity(
1271 &self,
1272 magnitude: f64,
1273 metric_type: &str,
1274 ) -> RegressionSeverity {
1275 match metric_type {
1276 "accuracy" => {
1277 if magnitude > 0.1 {
1278 RegressionSeverity::Critical
1279 } else if magnitude > 0.05 {
1280 RegressionSeverity::Major
1281 } else if magnitude > 0.02 {
1282 RegressionSeverity::Minor
1283 } else {
1284 RegressionSeverity::Negligible
1285 }
1286 },
1287 "latency" => {
1288 if magnitude > 50.0 {
1289 RegressionSeverity::Critical
1290 } else if magnitude > 20.0 {
1291 RegressionSeverity::Major
1292 } else if magnitude > 10.0 {
1293 RegressionSeverity::Minor
1294 } else {
1295 RegressionSeverity::Negligible
1296 }
1297 },
1298 _ => RegressionSeverity::Minor,
1299 }
1300 }
1301
1302 fn generate_model_summary(&self) -> HashMap<String, String> {
1303 let mut summary = HashMap::new();
1304
1305 if let Some((best_name, best_model)) = self.model_snapshots.iter().max_by(|a, b| {
1306 a.1.metrics
1307 .val_accuracy
1308 .partial_cmp(&b.1.metrics.val_accuracy)
1309 .unwrap_or(std::cmp::Ordering::Equal)
1310 }) {
1311 summary.insert("best_accuracy_model".to_string(), best_name.clone());
1312 summary.insert(
1313 "best_accuracy_value".to_string(),
1314 format!("{:.4}", best_model.metrics.val_accuracy),
1315 );
1316 }
1317
1318 if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
1319 a.1.metrics
1320 .inference_latency_ms
1321 .partial_cmp(&b.1.metrics.inference_latency_ms)
1322 .unwrap_or(std::cmp::Ordering::Equal)
1323 }) {
1324 summary.insert("fastest_model".to_string(), fastest_name.clone());
1325 summary.insert(
1326 "fastest_latency".to_string(),
1327 format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
1328 );
1329 }
1330
1331 summary.insert(
1332 "total_models".to_string(),
1333 self.model_snapshots.len().to_string(),
1334 );
1335 summary
1336 }
1337}
1338
/// Aggregated report of all differential-debugging activity recorded so far.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
    /// When this report was generated.
    pub timestamp: DateTime<Utc>,
    /// Configuration that was in effect when the report was produced.
    pub config: DifferentialDebuggingConfig,
    /// Number of model snapshots currently tracked.
    pub total_models: usize,
    /// Total model comparisons performed.
    pub comparison_count: usize,
    /// Total A/B tests run.
    pub ab_test_count: usize,
    /// Total version diffs computed.
    pub version_diff_count: usize,
    /// Total regression-detection runs.
    pub regression_detection_count: usize,
    /// Most recent comparison results, newest-first ordering assumed — TODO confirm against producer.
    pub recent_comparisons: Vec<ModelComparisonResult>,
    /// Most recent regression-detection results.
    pub recent_regressions: Vec<RegressionDetectionResult>,
    /// Free-form key/value summary (e.g. best-accuracy model, fastest model).
    pub model_summary: HashMap<String, String>,
}
1353
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_differential_debugger_creation() {
        let config = DifferentialDebuggingConfig::default();
        let debugger = DifferentialDebugger::new(config);
        assert_eq!(debugger.model_snapshots.len(), 0);
    }

    #[tokio::test]
    async fn test_model_snapshot_addition() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot = create_test_snapshot("test_model");
        debugger.add_model_snapshot(snapshot).expect("add operation failed");
        assert_eq!(debugger.model_snapshots.len(), 1);
    }

    #[tokio::test]
    async fn test_model_comparison() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot1 = create_test_snapshot("model_a");
        let snapshot2 = create_test_snapshot("model_b");

        debugger.add_model_snapshot(snapshot1).expect("add operation failed");
        debugger.add_model_snapshot(snapshot2).expect("add operation failed");

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
    }

    #[test]
    fn test_config_default() {
        let config = DifferentialDebuggingConfig::default();
        assert!(config.enable_model_comparison);
        assert!(config.enable_ab_analysis);
        assert!(config.enable_version_diff);
        assert!(config.enable_regression_detection);
        assert!(config.enable_performance_delta);
        assert!((config.significance_threshold - 0.05).abs() < f64::EPSILON);
        assert_eq!(config.max_comparison_models, 10);
    }

    #[tokio::test]
    async fn test_max_comparison_models_limit() {
        // Struct-update syntax avoids clippy::field_reassign_with_default.
        let config = DifferentialDebuggingConfig {
            max_comparison_models: 2,
            ..Default::default()
        };
        let mut debugger = DifferentialDebugger::new(config);

        debugger
            .add_model_snapshot(create_test_snapshot("model_1"))
            .expect("add should succeed");
        debugger
            .add_model_snapshot(create_test_snapshot("model_2"))
            .expect("add should succeed");
        debugger
            .add_model_snapshot(create_test_snapshot("model_3"))
            .expect("add should succeed");
        // Oldest snapshots are evicted once the cap is exceeded.
        assert_eq!(debugger.model_snapshots.len(), 2);
    }

    #[tokio::test]
    async fn test_compare_models_disabled() {
        let config = DifferentialDebuggingConfig {
            enable_model_comparison: false,
            ..Default::default()
        };
        let mut debugger = DifferentialDebugger::new(config);

        let snapshot1 = create_test_snapshot("a");
        let snapshot2 = create_test_snapshot("b");
        debugger.add_model_snapshot(snapshot1).expect("add should succeed");
        debugger.add_model_snapshot(snapshot2).expect("add should succeed");

        let result = debugger.compare_models(vec!["a".to_string(), "b".to_string()]).await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_compare_models_too_few() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);
        let snapshot1 = create_test_snapshot("only_one");
        debugger.add_model_snapshot(snapshot1).expect("add should succeed");
        let result = debugger.compare_models(vec!["only_one".to_string()]).await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_compare_models_missing_model() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);
        let snapshot1 = create_test_snapshot("existing");
        debugger.add_model_snapshot(snapshot1).expect("add should succeed");
        let result = debugger
            .compare_models(vec!["existing".to_string(), "missing".to_string()])
            .await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_ab_test_analysis() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let ab_config = ABTestConfig {
            name: "test_ab".to_string(),
            model_a: "model_a".to_string(),
            model_b: "model_b".to_string(),
            duration_hours: None,
            sample_size: 100,
            tracked_metrics: vec!["accuracy".to_string()],
            min_effect_size: 0.05,
            power: 0.8,
        };

        // Deterministic pseudo-random samples via an LCG so the test is
        // reproducible without pulling in a rand dependency.
        let mut seed: u64 = 42;
        let model_a_data: Vec<f64> = (0..100)
            .map(|_| {
                seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
                0.8 + (seed as f64 / u64::MAX as f64) * 0.1
            })
            .collect();
        let model_b_data: Vec<f64> = (0..100)
            .map(|_| {
                seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
                0.82 + (seed as f64 / u64::MAX as f64) * 0.1
            })
            .collect();

        let result = debugger.run_ab_test(ab_config, model_a_data, model_b_data).await;
        assert!(result.is_ok());
        let ab_result = result.expect("ab test should succeed");
        assert!(ab_result.conclusion.confidence >= 0.0);
    }

    #[tokio::test]
    async fn test_ab_test_disabled() {
        let config = DifferentialDebuggingConfig {
            enable_ab_analysis: false,
            ..Default::default()
        };
        let mut debugger = DifferentialDebugger::new(config);

        let ab_config = ABTestConfig {
            name: "test".to_string(),
            model_a: "a".to_string(),
            model_b: "b".to_string(),
            duration_hours: None,
            sample_size: 10,
            tracked_metrics: vec![],
            min_effect_size: 0.05,
            power: 0.8,
        };

        let result = debugger.run_ab_test(ab_config, vec![1.0], vec![2.0]).await;
        assert!(result.is_err());
    }

    #[test]
    fn test_model_metrics_creation() {
        let metrics = ModelMetrics {
            train_accuracy: 0.95,
            val_accuracy: 0.90,
            test_accuracy: None,
            train_loss: 0.05,
            val_loss: 0.10,
            test_loss: None,
            inference_latency_ms: 50.0,
            memory_usage_mb: 2048.0,
            model_size_mb: 500.0,
            flops: 1_000_000_000,
            training_time_s: 3600.0,
            custom_metrics: HashMap::new(),
        };
        assert!(metrics.train_accuracy > metrics.val_accuracy);
        assert!(metrics.test_accuracy.is_none());
    }

    #[test]
    fn test_architecture_info() {
        let info = ArchitectureInfo {
            parameter_count: 175_000_000,
            layer_count: 24,
            depth: 24,
            hidden_size: 1024,
            num_heads: Some(16),
            ff_dim: Some(4096),
            vocab_size: Some(50257),
            max_seq_length: Some(2048),
        };
        assert_eq!(info.parameter_count, 175_000_000);
        assert_eq!(info.layer_count, 24);
    }

    #[test]
    fn test_summary_stats() {
        let stats = SummaryStats {
            mean: 0.85,
            std_dev: 0.05,
            min: 0.70,
            max: 0.95,
            median: 0.86,
            q25: 0.82,
            q75: 0.89,
        };
        // Quartiles must be strictly ordered for a valid summary.
        assert!(stats.min < stats.q25);
        assert!(stats.q25 < stats.median);
        assert!(stats.median < stats.q75);
        assert!(stats.q75 < stats.max);
    }

    #[test]
    fn test_performance_delta() {
        let delta = PerformanceDelta {
            accuracy_delta: 0.02,
            loss_delta: -0.01,
            latency_delta: -5.0,
            memory_delta: 100.0,
            size_delta: 50.0,
            training_time_delta: -300.0,
            custom_deltas: HashMap::new(),
        };
        assert!(delta.accuracy_delta > 0.0);
        assert!(delta.loss_delta < 0.0);
    }

    #[test]
    fn test_regression_severity_variants() {
        let severities = [
            RegressionSeverity::Critical,
            RegressionSeverity::Major,
            RegressionSeverity::Minor,
            RegressionSeverity::Negligible,
        ];
        assert_eq!(severities.len(), 4);
    }

    #[test]
    fn test_version_diff_creation() {
        let diff = VersionDiff {
            from_version: "1.0.0".to_string(),
            to_version: "1.1.0".to_string(),
            timestamp: Utc::now(),
            performance_delta: PerformanceDelta {
                accuracy_delta: 0.01,
                loss_delta: -0.005,
                latency_delta: 0.0,
                memory_delta: 0.0,
                size_delta: 10.0,
                training_time_delta: 0.0,
                custom_deltas: HashMap::new(),
            },
            architecture_changes: vec![ArchitectureChange {
                change_type: "layer_added".to_string(),
                description: "Added dropout layer".to_string(),
                impact: "minor".to_string(),
            }],
            config_changes: vec![],
            weight_changes: WeightChangesSummary {
                avg_magnitude: 0.001,
                max_change: 0.05,
                significant_change_ratio: 0.1,
                layer_changes: HashMap::new(),
            },
        };
        assert_eq!(diff.from_version, "1.0.0");
        assert_eq!(diff.architecture_changes.len(), 1);
    }

    #[test]
    fn test_statistical_test_result() {
        let result = StatisticalTestResult {
            test_type: "t-test".to_string(),
            statistic: 2.5,
            p_value: 0.01,
            effect_size: 0.4,
            confidence_interval: (0.01, 0.05),
            is_significant: true,
        };
        assert!(result.is_significant);
        assert!(result.p_value < 0.05);
    }

    #[test]
    fn test_ab_test_conclusion() {
        let conclusion = ABTestConclusion {
            winner: Some("model_b".to_string()),
            confidence: 0.95,
            practical_significance: true,
            recommendation: "Deploy model_b".to_string(),
            summary: "Model B outperforms Model A significantly".to_string(),
        };
        assert!(conclusion.winner.is_some());
        assert!(conclusion.practical_significance);
    }

    #[tokio::test]
    async fn test_compare_two_different_models() {
        let config = DifferentialDebuggingConfig::default();
        let mut debugger = DifferentialDebugger::new(config);

        let mut snap_a = create_test_snapshot("model_a");
        snap_a.metrics.train_accuracy = 0.90;
        snap_a.metrics.val_accuracy = 0.85;

        let mut snap_b = create_test_snapshot("model_b");
        snap_b.metrics.train_accuracy = 0.95;
        snap_b.metrics.val_accuracy = 0.92;

        debugger.add_model_snapshot(snap_a).expect("add should succeed");
        debugger.add_model_snapshot(snap_b).expect("add should succeed");

        let result = debugger
            .compare_models(vec!["model_a".to_string(), "model_b".to_string()])
            .await;
        assert!(result.is_ok());
        let comparison = result.expect("comparison should succeed");
        assert_eq!(comparison.models.len(), 2);
    }

    /// Builds a fully-populated snapshot with fixed metrics for use as a
    /// baseline fixture; only the name varies between calls.
    fn create_test_snapshot(name: &str) -> ModelSnapshot {
        ModelSnapshot {
            id: Uuid::new_v4(),
            name: name.to_string(),
            timestamp: Utc::now(),
            version: "1.0.0".to_string(),
            commit_hash: Some("abc123".to_string()),
            metrics: ModelMetrics {
                train_accuracy: 0.95,
                val_accuracy: 0.90,
                test_accuracy: Some(0.88),
                train_loss: 0.05,
                val_loss: 0.10,
                test_loss: Some(0.12),
                inference_latency_ms: 50.0,
                memory_usage_mb: 2048.0,
                model_size_mb: 500.0,
                flops: 1_000_000_000,
                training_time_s: 3600.0,
                custom_metrics: HashMap::new(),
            },
            architecture: ArchitectureInfo {
                parameter_count: 175_000_000,
                layer_count: 24,
                depth: 24,
                hidden_size: 1024,
                num_heads: Some(16),
                ff_dim: Some(4096),
                vocab_size: Some(50257),
                max_seq_length: Some(2048),
            },
            training_config: TrainingConfig {
                learning_rate: 1e-4,
                batch_size: 32,
                epochs: 10,
                optimizer: "AdamW".to_string(),
                lr_schedule: Some("cosine".to_string()),
                regularization: HashMap::new(),
            },
            weights_summary: WeightsSummary {
                mean: 0.0,
                std_dev: 0.1,
                min: -0.5,
                max: 0.5,
                percentiles: HashMap::new(),
                zero_count: 1000,
                sparsity: 0.01,
            },
            metadata: HashMap::new(),
        }
    }
}