use anyhow::Result;
use chrono::{DateTime, Utc};
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use statrs::statistics::Statistics;
use std::collections::HashMap;
use uuid::Uuid;
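/// Configuration for differential debugging: feature toggles plus the
/// thresholds used by comparisons and regression checks.
///
/// `significance_threshold` is a p-value cutoff, `regression_sensitivity`
/// scales the accuracy-drop threshold used in regression detection, and
/// `performance_delta_threshold` is a relative change in percent.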
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingConfig {
pub enable_model_comparison: bool,
pub enable_ab_analysis: bool,
pub enable_version_diff: bool,
pub enable_regression_detection: bool,
pub enable_performance_delta: bool,
pub significance_threshold: f64,
pub max_comparison_models: usize,
pub regression_sensitivity: f64,
pub performance_delta_threshold: f64,
}
impl Default for DifferentialDebuggingConfig {
fn default() -> Self {
Self {
enable_model_comparison: true,
enable_ab_analysis: true,
enable_version_diff: true,
enable_regression_detection: true,
enable_performance_delta: true,
significance_threshold: 0.05,
max_comparison_models: 10,
regression_sensitivity: 0.8,
performance_delta_threshold: 5.0,
}
}
}
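/// Point-in-time record of a model: its metrics, architecture, training
/// configuration, and a summary of its weights, keyed by `name` in the
/// debugger.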
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSnapshot {
pub id: Uuid,
pub name: String,
pub timestamp: DateTime<Utc>,
pub version: String,
pub commit_hash: Option<String>,
pub metrics: ModelMetrics,
pub architecture: ArchitectureInfo,
pub training_config: TrainingConfig,
pub weights_summary: WeightsSummary,
pub metadata: HashMap<String, String>,
}
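/// Core metrics for a snapshot. Accuracies and losses are unitless
/// fractions; latency is in milliseconds, memory and model size in
/// megabytes, and training time in seconds.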
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
pub train_accuracy: f64,
pub val_accuracy: f64,
pub test_accuracy: Option<f64>,
pub train_loss: f64,
pub val_loss: f64,
pub test_loss: Option<f64>,
pub inference_latency_ms: f64,
pub memory_usage_mb: f64,
pub model_size_mb: f64,
pub flops: u64,
pub training_time_s: f64,
pub custom_metrics: HashMap<String, f64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureInfo {
pub parameter_count: u64,
pub layer_count: u32,
pub depth: u32,
pub hidden_size: u32,
pub num_heads: Option<u32>,
pub ff_dim: Option<u32>,
pub vocab_size: Option<u32>,
pub max_seq_length: Option<u32>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingConfig {
pub learning_rate: f64,
pub batch_size: u32,
pub epochs: u32,
pub optimizer: String,
pub lr_schedule: Option<String>,
pub regularization: HashMap<String, f64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightsSummary {
pub mean: f64,
pub std_dev: f64,
pub min: f64,
pub max: f64,
pub percentiles: HashMap<String, f64>,
pub zero_count: u64,
pub sparsity: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparisonResult {
pub models: Vec<String>,
pub timestamp: DateTime<Utc>,
pub performance_comparison: PerformanceComparison,
pub architecture_diff: ArchitectureDiff,
pub statistical_analysis: StatisticalAnalysis,
pub summary: ComparisonSummary,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
pub accuracy_comparison: MetricComparison,
pub loss_comparison: MetricComparison,
pub latency_comparison: MetricComparison,
pub memory_comparison: MetricComparison,
pub size_comparison: MetricComparison,
pub custom_comparisons: HashMap<String, MetricComparison>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricComparison {
pub values: HashMap<String, f64>,
pub best_model: String,
pub worst_model: String,
pub differences: HashMap<String, f64>,
pub significant_differences: HashMap<String, bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureDiff {
pub parameter_diff: HashMap<String, i64>,
pub layer_diff: HashMap<String, i32>,
pub similarity_score: f64,
pub notable_differences: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
pub p_values: HashMap<String, f64>,
pub effect_sizes: HashMap<String, f64>,
pub confidence_intervals: HashMap<String, (f64, f64)>,
pub significance_summary: HashMap<String, bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
pub best_model: String,
pub rankings: HashMap<String, Vec<String>>,
pub key_findings: Vec<String>,
pub recommendations: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
pub name: String,
pub model_a: String,
pub model_b: String,
pub duration_hours: Option<u32>,
pub sample_size: u32,
pub tracked_metrics: Vec<String>,
pub min_effect_size: f64,
pub power: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResult {
pub config: ABTestConfig,
pub start_time: DateTime<Utc>,
pub end_time: Option<DateTime<Utc>>,
pub model_a_results: ABTestMetrics,
pub model_b_results: ABTestMetrics,
pub statistical_tests: HashMap<String, StatisticalTestResult>,
pub conclusion: ABTestConclusion,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestMetrics {
pub sample_size: u32,
pub metrics: HashMap<String, Vec<f64>>,
pub summary_stats: HashMap<String, SummaryStats>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryStats {
pub mean: f64,
pub std_dev: f64,
pub min: f64,
pub max: f64,
pub median: f64,
pub q25: f64,
pub q75: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
pub test_type: String,
pub statistic: f64,
pub p_value: f64,
pub effect_size: f64,
pub confidence_interval: (f64, f64),
pub is_significant: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConclusion {
pub winner: Option<String>,
pub confidence: f64,
pub practical_significance: bool,
pub recommendation: String,
pub summary: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionDiff {
pub from_version: String,
pub to_version: String,
pub timestamp: DateTime<Utc>,
pub performance_delta: PerformanceDelta,
pub architecture_changes: Vec<ArchitectureChange>,
pub config_changes: Vec<ConfigChange>,
pub weight_changes: WeightChangesSummary,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceDelta {
pub accuracy_delta: f64,
pub loss_delta: f64,
pub latency_delta: f64,
pub memory_delta: f64,
pub size_delta: f64,
pub training_time_delta: f64,
pub custom_deltas: HashMap<String, f64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchitectureChange {
pub change_type: String,
pub description: String,
pub impact: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigChange {
pub parameter: String,
pub old_value: String,
pub new_value: String,
pub impact: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightChangesSummary {
pub avg_magnitude: f64,
pub max_change: f64,
pub significant_change_ratio: f64,
pub layer_changes: HashMap<String, f64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetectionResult {
pub timestamp: DateTime<Utc>,
pub regressions: Vec<Regression>,
pub improvements: Vec<Improvement>,
pub overall_assessment: RegressionAssessment,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Regression {
pub metric: String,
pub current_value: f64,
pub previous_value: f64,
pub magnitude: f64,
pub severity: RegressionSeverity,
pub possible_causes: Vec<String>,
pub suggested_fixes: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Improvement {
pub metric: String,
pub current_value: f64,
pub previous_value: f64,
pub magnitude: f64,
pub likely_causes: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegressionSeverity {
Critical,
Major,
Minor,
Negligible,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionAssessment {
pub health_score: f64,
pub critical_regressions: usize,
pub improvements: usize,
pub recommendation: String,
}
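/// Orchestrates model comparisons, A/B tests, version diffs, and regression
/// detection over a bounded, insertion-ordered set of named snapshots.
///
/// A minimal usage sketch (assuming a hypothetical `build_snapshot` helper
/// that returns a populated [`ModelSnapshot`] for the given name):
///
/// ```ignore
/// let mut debugger = DifferentialDebugger::new(DifferentialDebuggingConfig::default());
/// debugger.add_model_snapshot(build_snapshot("baseline"))?;
/// debugger.add_model_snapshot(build_snapshot("candidate"))?;
/// let result = debugger
///     .compare_models(vec!["baseline".to_string(), "candidate".to_string()])
///     .await?;
/// println!("best model: {}", result.summary.best_model);
/// ```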
#[derive(Debug)]
pub struct DifferentialDebugger {
config: DifferentialDebuggingConfig,
model_snapshots: IndexMap<String, ModelSnapshot>,
comparison_history: Vec<ModelComparisonResult>,
ab_tests: Vec<ABTestResult>,
version_diffs: Vec<VersionDiff>,
regression_history: Vec<RegressionDetectionResult>,
}
impl DifferentialDebugger {
pub fn new(config: DifferentialDebuggingConfig) -> Self {
Self {
config,
model_snapshots: IndexMap::new(),
comparison_history: Vec::new(),
ab_tests: Vec::new(),
version_diffs: Vec::new(),
regression_history: Vec::new(),
}
}
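    /// Registers a snapshot under its name. Once `max_comparison_models`
    /// distinct models are held, the oldest entry is evicted first;
    /// re-adding an existing name replaces that snapshot in place.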
    pub fn add_model_snapshot(&mut self, snapshot: ModelSnapshot) -> Result<()> {
        // Evict only when a genuinely new name would exceed capacity;
        // inserting an existing name just replaces it and needs no eviction.
        if !self.model_snapshots.contains_key(&snapshot.name)
            && self.model_snapshots.len() >= self.config.max_comparison_models
        {
            self.model_snapshots.shift_remove_index(0);
        }
        self.model_snapshots.insert(snapshot.name.clone(), snapshot);
        Ok(())
    }
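    /// Compares the named snapshots on performance, architecture, and
    /// statistics. Errors if comparison is disabled, fewer than two names
    /// are given, or any name is unknown.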
pub async fn compare_models(
&mut self,
model_names: Vec<String>,
) -> Result<ModelComparisonResult> {
if !self.config.enable_model_comparison {
return Err(anyhow::anyhow!("Model comparison is disabled"));
}
if model_names.len() < 2 {
return Err(anyhow::anyhow!(
"At least two models are required for comparison"
));
}
let models: Vec<&ModelSnapshot> = model_names
.iter()
.map(|name| {
self.model_snapshots
.get(name)
.ok_or_else(|| anyhow::anyhow!("Model '{}' not found", name))
})
.collect::<Result<Vec<_>>>()?;
let performance_comparison = self.compare_performance(&models)?;
let architecture_diff = self.analyze_architecture_differences(&models)?;
let statistical_analysis = self.perform_statistical_analysis(&models)?;
let summary = self.generate_comparison_summary(
&models,
&performance_comparison,
&statistical_analysis,
)?;
let result = ModelComparisonResult {
models: model_names,
timestamp: Utc::now(),
performance_comparison,
architecture_diff,
statistical_analysis,
summary,
};
self.comparison_history.push(result.clone());
Ok(result)
}
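    /// Runs an A/B analysis over two observation vectors, which are both
    /// recorded under the single key `"primary_metric"`.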
pub async fn run_ab_test(
&mut self,
config: ABTestConfig,
model_a_data: Vec<f64>,
model_b_data: Vec<f64>,
) -> Result<ABTestResult> {
        if !self.config.enable_ab_analysis {
            return Err(anyhow::anyhow!("A/B analysis is disabled"));
        }
        if model_a_data.is_empty() || model_b_data.is_empty() {
            return Err(anyhow::anyhow!(
                "Both arms of an A/B test need at least one observation"
            ));
        }
        let start_time = Utc::now();
        let model_a_stats = self.calculate_summary_stats(&model_a_data);
        let model_b_stats = self.calculate_summary_stats(&model_b_data);
let model_a_results = ABTestMetrics {
sample_size: model_a_data.len() as u32,
metrics: {
let mut metrics = HashMap::new();
metrics.insert("primary_metric".to_string(), model_a_data);
metrics
},
summary_stats: {
let mut stats = HashMap::new();
stats.insert("primary_metric".to_string(), model_a_stats);
stats
},
};
let model_b_results = ABTestMetrics {
sample_size: model_b_data.len() as u32,
metrics: {
let mut metrics = HashMap::new();
metrics.insert("primary_metric".to_string(), model_b_data);
metrics
},
summary_stats: {
let mut stats = HashMap::new();
stats.insert("primary_metric".to_string(), model_b_stats);
stats
},
};
let statistical_tests =
self.perform_ab_statistical_tests(&model_a_results, &model_b_results)?;
let conclusion = self.generate_ab_conclusion(
&config,
&model_a_results,
&model_b_results,
&statistical_tests,
)?;
let result = ABTestResult {
config,
start_time,
end_time: Some(Utc::now()),
model_a_results,
model_b_results,
statistical_tests,
conclusion,
};
self.ab_tests.push(result.clone());
Ok(result)
}
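    /// Diffs two registered snapshots (performance, architecture, training
    /// config, and weight summaries) and records the result.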
pub async fn track_version_diff(
&mut self,
from_model: &str,
to_model: &str,
) -> Result<VersionDiff> {
if !self.config.enable_version_diff {
return Err(anyhow::anyhow!("Version diff tracking is disabled"));
}
let from_snapshot = self
.model_snapshots
.get(from_model)
.ok_or_else(|| anyhow::anyhow!("Model '{}' not found", from_model))?;
let to_snapshot = self
.model_snapshots
.get(to_model)
.ok_or_else(|| anyhow::anyhow!("Model '{}' not found", to_model))?;
let performance_delta = self.calculate_performance_delta(from_snapshot, to_snapshot)?;
let architecture_changes = self.detect_architecture_changes(from_snapshot, to_snapshot)?;
let config_changes = self.detect_config_changes(from_snapshot, to_snapshot)?;
let weight_changes = self.analyze_weight_changes(from_snapshot, to_snapshot)?;
let diff = VersionDiff {
from_version: from_snapshot.version.clone(),
to_version: to_snapshot.version.clone(),
timestamp: Utc::now(),
performance_delta,
architecture_changes,
config_changes,
weight_changes,
};
self.version_diffs.push(diff.clone());
Ok(diff)
}
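    /// Checks `current_model` against `baseline_model` for validation
    /// accuracy and inference latency regressions, using
    /// `regression_sensitivity` and `performance_delta_threshold` as the
    /// respective thresholds.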
pub async fn detect_regressions(
&mut self,
current_model: &str,
baseline_model: &str,
) -> Result<RegressionDetectionResult> {
if !self.config.enable_regression_detection {
return Err(anyhow::anyhow!("Regression detection is disabled"));
}
let current = self
.model_snapshots
.get(current_model)
.ok_or_else(|| anyhow::anyhow!("Model '{}' not found", current_model))?;
let baseline = self
.model_snapshots
.get(baseline_model)
.ok_or_else(|| anyhow::anyhow!("Model '{}' not found", baseline_model))?;
let mut regressions = Vec::new();
let mut improvements = Vec::new();
if current.metrics.val_accuracy < baseline.metrics.val_accuracy {
let magnitude = baseline.metrics.val_accuracy - current.metrics.val_accuracy;
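            // regression_sensitivity scales a one-percentage-point baseline:
            // with the default of 0.8, accuracy drops above 0.008 are flagged.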
if magnitude > self.config.regression_sensitivity * 0.01 {
regressions.push(Regression {
metric: "validation_accuracy".to_string(),
current_value: current.metrics.val_accuracy,
previous_value: baseline.metrics.val_accuracy,
magnitude,
severity: self.classify_regression_severity(magnitude, "accuracy"),
possible_causes: vec![
"Learning rate too high".to_string(),
"Insufficient training".to_string(),
"Data distribution shift".to_string(),
],
suggested_fixes: vec![
"Reduce learning rate".to_string(),
"Increase training epochs".to_string(),
"Check data quality".to_string(),
],
});
}
} else if current.metrics.val_accuracy > baseline.metrics.val_accuracy {
let magnitude = current.metrics.val_accuracy - baseline.metrics.val_accuracy;
improvements.push(Improvement {
metric: "validation_accuracy".to_string(),
current_value: current.metrics.val_accuracy,
previous_value: baseline.metrics.val_accuracy,
magnitude,
likely_causes: vec![
"Better optimization".to_string(),
"Improved architecture".to_string(),
"Better hyperparameters".to_string(),
],
});
}
if current.metrics.inference_latency_ms > baseline.metrics.inference_latency_ms {
let magnitude =
current.metrics.inference_latency_ms - baseline.metrics.inference_latency_ms;
let relative_change = magnitude / baseline.metrics.inference_latency_ms * 100.0;
if relative_change > self.config.performance_delta_threshold {
regressions.push(Regression {
metric: "inference_latency".to_string(),
current_value: current.metrics.inference_latency_ms,
previous_value: baseline.metrics.inference_latency_ms,
magnitude,
severity: self.classify_regression_severity(relative_change, "latency"),
possible_causes: vec![
"Model complexity increased".to_string(),
"Inefficient implementation".to_string(),
"Hardware degradation".to_string(),
],
suggested_fixes: vec![
"Profile and optimize bottlenecks".to_string(),
"Consider model compression".to_string(),
"Check hardware configuration".to_string(),
],
});
}
}
let critical_regressions = regressions
.iter()
.filter(|r| matches!(r.severity, RegressionSeverity::Critical))
.count();
let health_score = if critical_regressions > 0 {
0.0
} else {
1.0 - (regressions.len() as f64 * 0.1).min(1.0)
};
let recommendation = if critical_regressions > 0 {
"Critical regressions detected. Immediate action required.".to_string()
} else if !regressions.is_empty() {
"Some regressions detected. Review and address if necessary.".to_string()
} else {
"No significant regressions detected.".to_string()
};
let overall_assessment = RegressionAssessment {
health_score,
critical_regressions,
improvements: improvements.len(),
recommendation,
};
let result = RegressionDetectionResult {
timestamp: Utc::now(),
regressions,
improvements,
overall_assessment,
};
self.regression_history.push(result.clone());
Ok(result)
}
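    /// Summarizes debugger state: counts of recorded analyses, the most
    /// recent comparisons and regression checks, and a per-model summary.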
pub async fn generate_report(&self) -> Result<DifferentialDebuggingReport> {
Ok(DifferentialDebuggingReport {
timestamp: Utc::now(),
config: self.config.clone(),
total_models: self.model_snapshots.len(),
comparison_count: self.comparison_history.len(),
ab_test_count: self.ab_tests.len(),
version_diff_count: self.version_diffs.len(),
regression_detection_count: self.regression_history.len(),
recent_comparisons: self.comparison_history.iter().rev().take(5).cloned().collect(),
recent_regressions: self.regression_history.iter().rev().take(3).cloned().collect(),
model_summary: self.generate_model_summary(),
})
}
fn compare_performance(&self, models: &[&ModelSnapshot]) -> Result<PerformanceComparison> {
let mut accuracy_values = HashMap::new();
let mut loss_values = HashMap::new();
let mut latency_values = HashMap::new();
let mut memory_values = HashMap::new();
let mut size_values = HashMap::new();
for model in models {
accuracy_values.insert(model.name.clone(), model.metrics.val_accuracy);
loss_values.insert(model.name.clone(), model.metrics.val_loss);
latency_values.insert(model.name.clone(), model.metrics.inference_latency_ms);
memory_values.insert(model.name.clone(), model.metrics.memory_usage_mb);
size_values.insert(model.name.clone(), model.metrics.model_size_mb);
}
Ok(PerformanceComparison {
accuracy_comparison: self.create_metric_comparison(accuracy_values, true)?,
loss_comparison: self.create_metric_comparison(loss_values, false)?,
latency_comparison: self.create_metric_comparison(latency_values, false)?,
memory_comparison: self.create_metric_comparison(memory_values, false)?,
size_comparison: self.create_metric_comparison(size_values, false)?,
custom_comparisons: HashMap::new(),
})
}
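    /// Builds a per-model comparison for one metric. `higher_is_better`
    /// picks the direction of "best"; differences are percentages relative
    /// to the best model's value.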
fn create_metric_comparison(
&self,
values: HashMap<String, f64>,
higher_is_better: bool,
) -> Result<MetricComparison> {
let best_model = if higher_is_better {
values
.iter()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.ok_or_else(|| anyhow::anyhow!("No values to compare"))?
.0
.clone()
} else {
values
.iter()
.min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.ok_or_else(|| anyhow::anyhow!("No values to compare"))?
.0
.clone()
};
let worst_model = if higher_is_better {
values
.iter()
.min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.ok_or_else(|| anyhow::anyhow!("No values to compare"))?
.0
.clone()
} else {
values
.iter()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.ok_or_else(|| anyhow::anyhow!("No values to compare"))?
.0
.clone()
};
let best_value = values[&best_model];
let mut differences = HashMap::new();
let mut significant_differences = HashMap::new();
        for (model, value) in &values {
            let diff = if higher_is_better {
                (value - best_value) / best_value * 100.0
            } else {
                (best_value - value) / best_value * 100.0
            };
            differences.insert(model.clone(), diff);
            // Heuristic: relative differences above 1% count as significant.
            significant_differences.insert(model.clone(), diff.abs() > 1.0);
        }
Ok(MetricComparison {
values,
best_model,
worst_model,
differences,
significant_differences,
})
}
fn analyze_architecture_differences(
&self,
models: &[&ModelSnapshot],
) -> Result<ArchitectureDiff> {
if models.len() < 2 {
return Err(anyhow::anyhow!(
"Need at least 2 models for architecture diff"
));
}
let base_model = models[0];
let mut parameter_diff = HashMap::new();
let mut layer_diff = HashMap::new();
let mut notable_differences = Vec::new();
for model in models.iter().skip(1) {
let param_diff = model.architecture.parameter_count as i64
- base_model.architecture.parameter_count as i64;
let layer_diff_val =
model.architecture.layer_count as i32 - base_model.architecture.layer_count as i32;
parameter_diff.insert(model.name.clone(), param_diff);
layer_diff.insert(model.name.clone(), layer_diff_val);
if param_diff.abs() > 1_000_000 {
notable_differences.push(format!(
"Model '{}' has {} parameter difference",
model.name, param_diff
));
}
if layer_diff_val != 0 {
notable_differences.push(format!(
"Model '{}' has {} layer difference",
model.name, layer_diff_val
));
}
}
let mut similarity_scores = Vec::new();
for model in models.iter().skip(1) {
let score = self
.calculate_architecture_similarity(&base_model.architecture, &model.architecture);
similarity_scores.push(score);
}
let similarity_score =
similarity_scores.iter().sum::<f64>() / similarity_scores.len() as f64;
Ok(ArchitectureDiff {
parameter_diff,
layer_diff,
similarity_score,
notable_differences,
})
}
    fn calculate_architecture_similarity(
        &self,
        arch1: &ArchitectureInfo,
        arch2: &ArchitectureInfo,
    ) -> f64 {
        // Min/max ratio per feature: identical values score 1.0. Guard
        // zero-valued features, which would otherwise produce NaN.
        fn ratio(a: f64, b: f64) -> f64 {
            let max = a.max(b);
            if max == 0.0 {
                1.0
            } else {
                a.min(b) / max
            }
        }
        let similarity = ratio(arch1.parameter_count as f64, arch2.parameter_count as f64)
            + ratio(arch1.layer_count as f64, arch2.layer_count as f64)
            + ratio(arch1.hidden_size as f64, arch2.hidden_size as f64);
        similarity / 3.0
    }
fn perform_statistical_analysis(
&self,
_models: &[&ModelSnapshot],
) -> Result<StatisticalAnalysis> {
Ok(StatisticalAnalysis {
p_values: HashMap::new(),
effect_sizes: HashMap::new(),
confidence_intervals: HashMap::new(),
significance_summary: HashMap::new(),
})
}
fn generate_comparison_summary(
&self,
_models: &[&ModelSnapshot],
performance: &PerformanceComparison,
_statistical: &StatisticalAnalysis,
) -> Result<ComparisonSummary> {
let best_model = performance.accuracy_comparison.best_model.clone();
let mut rankings = HashMap::new();
rankings.insert(
"accuracy".to_string(),
vec![performance.accuracy_comparison.best_model.clone()],
);
rankings.insert(
"latency".to_string(),
vec![performance.latency_comparison.best_model.clone()],
);
let key_findings = vec![
format!(
"Best accuracy: {} ({:.2}%)",
performance.accuracy_comparison.best_model,
performance.accuracy_comparison.values[&performance.accuracy_comparison.best_model]
* 100.0
),
format!(
"Fastest inference: {} ({:.2}ms)",
performance.latency_comparison.best_model,
performance.latency_comparison.values[&performance.latency_comparison.best_model]
),
];
let recommendations = vec![
"Consider the trade-offs between accuracy and latency".to_string(),
"Monitor memory usage for production deployment".to_string(),
];
Ok(ComparisonSummary {
best_model,
rankings,
key_findings,
recommendations,
})
}
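    /// Computes summary statistics over a sample. Quantiles use simple
    /// nearest-rank indexing into the sorted data rather than interpolation.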
    fn calculate_summary_stats(&self, data: &[f64]) -> SummaryStats {
        // Guard against empty input, which would otherwise divide by zero
        // and panic on the index lookups below.
        if data.is_empty() {
            return SummaryStats {
                mean: 0.0,
                std_dev: 0.0,
                min: 0.0,
                max: 0.0,
                median: 0.0,
                q25: 0.0,
                q75: 0.0,
            };
        }
        let mean = data.iter().sum::<f64>() / data.len() as f64;
        let variance = data.variance();
        let std_dev = variance.sqrt();
        let mut sorted_data = data.to_vec();
        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let min = sorted_data[0];
        let max = sorted_data[sorted_data.len() - 1];
        let median = sorted_data[sorted_data.len() / 2];
        let q25 = sorted_data[sorted_data.len() / 4];
        let q75 = sorted_data[3 * sorted_data.len() / 4];
        SummaryStats {
            mean,
            std_dev,
            min,
            max,
            median,
            q25,
            q75,
        }
    }
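    /// Runs an approximate Welch's t-test on the `"primary_metric"` samples.
    /// The p-value is a coarse threshold heuristic, not an exact
    /// t-distribution tail probability.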
fn perform_ab_statistical_tests(
&self,
model_a: &ABTestMetrics,
model_b: &ABTestMetrics,
) -> Result<HashMap<String, StatisticalTestResult>> {
let mut results = HashMap::new();
if let (Some(a_data), Some(b_data)) = (
model_a.metrics.get("primary_metric"),
model_b.metrics.get("primary_metric"),
) {
let a_mean = a_data.mean();
let b_mean = b_data.mean();
let a_var = a_data.variance();
let b_var = b_data.variance();
            // Cohen's d below uses the pooled standard deviation; the Welch
            // test statistic uses per-sample variances and sample sizes.
            let pooled_std = ((a_var + b_var) / 2.0).sqrt();
            let standard_error =
                (a_var / a_data.len() as f64 + b_var / b_data.len() as f64).sqrt();
            let t_statistic = (a_mean - b_mean) / standard_error;
            // Coarse heuristic in place of a t-distribution CDF:
            // |t| > 2 maps to p = 0.01, anything smaller to p = 0.1.
            let p_value = if t_statistic.abs() > 2.0 { 0.01 } else { 0.1 };
            let effect_size = (a_mean - b_mean) / pooled_std;
            let margin_of_error = 1.96 * standard_error;
results.insert(
"primary_metric".to_string(),
StatisticalTestResult {
test_type: "Welch's t-test".to_string(),
statistic: t_statistic,
p_value,
effect_size,
confidence_interval: (
a_mean - b_mean - margin_of_error,
a_mean - b_mean + margin_of_error,
),
is_significant: p_value < 0.05,
},
);
}
Ok(results)
}
fn generate_ab_conclusion(
&self,
config: &ABTestConfig,
_model_a: &ABTestMetrics,
_model_b: &ABTestMetrics,
tests: &HashMap<String, StatisticalTestResult>,
) -> Result<ABTestConclusion> {
let primary_test = tests.get("primary_metric");
let (winner, confidence, practical_significance) = if let Some(test) = primary_test {
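            // The primary metric is assumed to be higher-is-better, so a
            // positive effect size (mean of A above mean of B) favors model A.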
let winner = if test.effect_size > 0.0 {
Some(config.model_a.clone())
} else {
Some(config.model_b.clone())
};
let confidence = if test.is_significant { 0.95 } else { 0.5 };
let practical_significance = test.effect_size.abs() > config.min_effect_size;
(winner, confidence, practical_significance)
} else {
(None, 0.5, false)
};
let recommendation = if practical_significance && confidence > 0.9 {
format!(
"Recommend deploying {}",
winner.as_ref().expect(
"winner should be Some when practical_significance and confidence > 0.9"
)
)
} else {
"Insufficient evidence for a clear recommendation".to_string()
};
let summary = format!(
"A/B test completed with {} confidence",
if confidence > 0.9 { "high" } else { "low" }
);
Ok(ABTestConclusion {
winner,
confidence,
practical_significance,
recommendation,
summary,
})
}
fn calculate_performance_delta(
&self,
from: &ModelSnapshot,
to: &ModelSnapshot,
) -> Result<PerformanceDelta> {
Ok(PerformanceDelta {
accuracy_delta: to.metrics.val_accuracy - from.metrics.val_accuracy,
loss_delta: to.metrics.val_loss - from.metrics.val_loss,
latency_delta: to.metrics.inference_latency_ms - from.metrics.inference_latency_ms,
memory_delta: to.metrics.memory_usage_mb - from.metrics.memory_usage_mb,
size_delta: to.metrics.model_size_mb - from.metrics.model_size_mb,
training_time_delta: to.metrics.training_time_s - from.metrics.training_time_s,
custom_deltas: HashMap::new(),
})
}
fn detect_architecture_changes(
&self,
from: &ModelSnapshot,
to: &ModelSnapshot,
) -> Result<Vec<ArchitectureChange>> {
let mut changes = Vec::new();
if from.architecture.parameter_count != to.architecture.parameter_count {
changes.push(ArchitectureChange {
change_type: "Parameter Count".to_string(),
description: format!(
"Changed from {} to {} parameters",
from.architecture.parameter_count, to.architecture.parameter_count
),
impact: "Affects model capacity and memory usage".to_string(),
});
}
if from.architecture.layer_count != to.architecture.layer_count {
changes.push(ArchitectureChange {
change_type: "Layer Count".to_string(),
description: format!(
"Changed from {} to {} layers",
from.architecture.layer_count, to.architecture.layer_count
),
impact: "Affects model depth and training dynamics".to_string(),
});
}
Ok(changes)
}
fn detect_config_changes(
&self,
from: &ModelSnapshot,
to: &ModelSnapshot,
) -> Result<Vec<ConfigChange>> {
let mut changes = Vec::new();
if from.training_config.learning_rate != to.training_config.learning_rate {
changes.push(ConfigChange {
parameter: "learning_rate".to_string(),
old_value: from.training_config.learning_rate.to_string(),
new_value: to.training_config.learning_rate.to_string(),
impact: "Affects training speed and convergence".to_string(),
});
}
if from.training_config.batch_size != to.training_config.batch_size {
changes.push(ConfigChange {
parameter: "batch_size".to_string(),
old_value: from.training_config.batch_size.to_string(),
new_value: to.training_config.batch_size.to_string(),
impact: "Affects gradient noise and memory usage".to_string(),
});
}
Ok(changes)
}
fn analyze_weight_changes(
&self,
from: &ModelSnapshot,
to: &ModelSnapshot,
) -> Result<WeightChangesSummary> {
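        // Only summary statistics are stored per snapshot, so these figures
        // are coarse proxies rather than per-weight comparisons.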
let avg_magnitude = (to.weights_summary.mean - from.weights_summary.mean).abs();
let max_change = (to.weights_summary.max - from.weights_summary.max).abs();
let significant_change_ratio = if avg_magnitude > 0.01 { 0.8 } else { 0.2 };
Ok(WeightChangesSummary {
avg_magnitude,
max_change,
significant_change_ratio,
layer_changes: HashMap::new(),
})
}
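    /// Maps a regression magnitude to a severity. Accuracy magnitudes are
    /// absolute fractions (e.g. 0.05 is five percentage points); latency
    /// magnitudes are relative changes in percent.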
fn classify_regression_severity(
&self,
magnitude: f64,
metric_type: &str,
) -> RegressionSeverity {
match metric_type {
"accuracy" => {
if magnitude > 0.1 {
RegressionSeverity::Critical
} else if magnitude > 0.05 {
RegressionSeverity::Major
} else if magnitude > 0.02 {
RegressionSeverity::Minor
} else {
RegressionSeverity::Negligible
}
},
"latency" => {
if magnitude > 50.0 {
RegressionSeverity::Critical
} else if magnitude > 20.0 {
RegressionSeverity::Major
} else if magnitude > 10.0 {
RegressionSeverity::Minor
} else {
RegressionSeverity::Negligible
}
},
_ => RegressionSeverity::Minor,
}
}
fn generate_model_summary(&self) -> HashMap<String, String> {
let mut summary = HashMap::new();
if let Some((best_name, best_model)) = self.model_snapshots.iter().max_by(|a, b| {
a.1.metrics
.val_accuracy
.partial_cmp(&b.1.metrics.val_accuracy)
.unwrap_or(std::cmp::Ordering::Equal)
}) {
summary.insert("best_accuracy_model".to_string(), best_name.clone());
summary.insert(
"best_accuracy_value".to_string(),
format!("{:.4}", best_model.metrics.val_accuracy),
);
}
if let Some((fastest_name, fastest_model)) = self.model_snapshots.iter().min_by(|a, b| {
a.1.metrics
.inference_latency_ms
.partial_cmp(&b.1.metrics.inference_latency_ms)
.unwrap_or(std::cmp::Ordering::Equal)
}) {
summary.insert("fastest_model".to_string(), fastest_name.clone());
summary.insert(
"fastest_latency".to_string(),
format!("{:.2}ms", fastest_model.metrics.inference_latency_ms),
);
}
summary.insert(
"total_models".to_string(),
self.model_snapshots.len().to_string(),
);
summary
}
}
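/// Aggregated view of debugger activity, suitable for serialization into
/// dashboards or logs.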
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DifferentialDebuggingReport {
pub timestamp: DateTime<Utc>,
pub config: DifferentialDebuggingConfig,
pub total_models: usize,
pub comparison_count: usize,
pub ab_test_count: usize,
pub version_diff_count: usize,
pub regression_detection_count: usize,
pub recent_comparisons: Vec<ModelComparisonResult>,
pub recent_regressions: Vec<RegressionDetectionResult>,
pub model_summary: HashMap<String, String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_differential_debugger_creation() {
let config = DifferentialDebuggingConfig::default();
let debugger = DifferentialDebugger::new(config);
assert_eq!(debugger.model_snapshots.len(), 0);
}
#[tokio::test]
async fn test_model_snapshot_addition() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let snapshot = create_test_snapshot("test_model");
debugger.add_model_snapshot(snapshot).expect("add operation failed");
assert_eq!(debugger.model_snapshots.len(), 1);
}
#[tokio::test]
async fn test_model_comparison() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let snapshot1 = create_test_snapshot("model_a");
let snapshot2 = create_test_snapshot("model_b");
debugger.add_model_snapshot(snapshot1).expect("add operation failed");
debugger.add_model_snapshot(snapshot2).expect("add operation failed");
let result = debugger
.compare_models(vec!["model_a".to_string(), "model_b".to_string()])
.await;
assert!(result.is_ok());
}
#[test]
fn test_config_default() {
let config = DifferentialDebuggingConfig::default();
assert!(config.enable_model_comparison);
assert!(config.enable_ab_analysis);
assert!(config.enable_version_diff);
assert!(config.enable_regression_detection);
assert!(config.enable_performance_delta);
assert!((config.significance_threshold - 0.05).abs() < f64::EPSILON);
assert_eq!(config.max_comparison_models, 10);
}
#[tokio::test]
async fn test_max_comparison_models_limit() {
        let config = DifferentialDebuggingConfig {
            max_comparison_models: 2,
            ..DifferentialDebuggingConfig::default()
        };
let mut debugger = DifferentialDebugger::new(config);
debugger
.add_model_snapshot(create_test_snapshot("model_1"))
.expect("add should succeed");
debugger
.add_model_snapshot(create_test_snapshot("model_2"))
.expect("add should succeed");
debugger
.add_model_snapshot(create_test_snapshot("model_3"))
.expect("add should succeed");
assert_eq!(debugger.model_snapshots.len(), 2);
}
#[tokio::test]
async fn test_compare_models_disabled() {
        let config = DifferentialDebuggingConfig {
            enable_model_comparison: false,
            ..DifferentialDebuggingConfig::default()
        };
let mut debugger = DifferentialDebugger::new(config);
let snapshot1 = create_test_snapshot("a");
let snapshot2 = create_test_snapshot("b");
debugger.add_model_snapshot(snapshot1).expect("add should succeed");
debugger.add_model_snapshot(snapshot2).expect("add should succeed");
let result = debugger.compare_models(vec!["a".to_string(), "b".to_string()]).await;
assert!(result.is_err());
}
#[tokio::test]
async fn test_compare_models_too_few() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let snapshot1 = create_test_snapshot("only_one");
debugger.add_model_snapshot(snapshot1).expect("add should succeed");
let result = debugger.compare_models(vec!["only_one".to_string()]).await;
assert!(result.is_err());
}
#[tokio::test]
async fn test_compare_models_missing_model() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let snapshot1 = create_test_snapshot("existing");
debugger.add_model_snapshot(snapshot1).expect("add should succeed");
let result = debugger
.compare_models(vec!["existing".to_string(), "missing".to_string()])
.await;
assert!(result.is_err());
}
#[tokio::test]
async fn test_ab_test_analysis() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let ab_config = ABTestConfig {
name: "test_ab".to_string(),
model_a: "model_a".to_string(),
model_b: "model_b".to_string(),
duration_hours: None,
sample_size: 100,
tracked_metrics: vec!["accuracy".to_string()],
min_effect_size: 0.05,
power: 0.8,
};
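        // Deterministic linear congruential generator so the test data is
        // reproducible; model B's samples are shifted up by 0.02 on average.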
let mut seed: u64 = 42;
let model_a_data: Vec<f64> = (0..100)
.map(|_| {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
0.8 + (seed as f64 / u64::MAX as f64) * 0.1
})
.collect();
let model_b_data: Vec<f64> = (0..100)
.map(|_| {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
0.82 + (seed as f64 / u64::MAX as f64) * 0.1
})
.collect();
let result = debugger.run_ab_test(ab_config, model_a_data, model_b_data).await;
assert!(result.is_ok());
let ab_result = result.expect("ab test should succeed");
assert!(ab_result.conclusion.confidence >= 0.0);
}
#[tokio::test]
async fn test_ab_test_disabled() {
        let config = DifferentialDebuggingConfig {
            enable_ab_analysis: false,
            ..DifferentialDebuggingConfig::default()
        };
let mut debugger = DifferentialDebugger::new(config);
let ab_config = ABTestConfig {
name: "test".to_string(),
model_a: "a".to_string(),
model_b: "b".to_string(),
duration_hours: None,
sample_size: 10,
tracked_metrics: vec![],
min_effect_size: 0.05,
power: 0.8,
};
let result = debugger.run_ab_test(ab_config, vec![1.0], vec![2.0]).await;
assert!(result.is_err());
}
#[test]
fn test_model_metrics_creation() {
let metrics = ModelMetrics {
train_accuracy: 0.95,
val_accuracy: 0.90,
test_accuracy: None,
train_loss: 0.05,
val_loss: 0.10,
test_loss: None,
inference_latency_ms: 50.0,
memory_usage_mb: 2048.0,
model_size_mb: 500.0,
flops: 1_000_000_000,
training_time_s: 3600.0,
custom_metrics: HashMap::new(),
};
assert!(metrics.train_accuracy > metrics.val_accuracy);
assert!(metrics.test_accuracy.is_none());
}
#[test]
fn test_architecture_info() {
let info = ArchitectureInfo {
parameter_count: 175_000_000,
layer_count: 24,
depth: 24,
hidden_size: 1024,
num_heads: Some(16),
ff_dim: Some(4096),
vocab_size: Some(50257),
max_seq_length: Some(2048),
};
assert_eq!(info.parameter_count, 175_000_000);
assert_eq!(info.layer_count, 24);
}
#[test]
fn test_summary_stats() {
let stats = SummaryStats {
mean: 0.85,
std_dev: 0.05,
min: 0.70,
max: 0.95,
median: 0.86,
q25: 0.82,
q75: 0.89,
};
assert!(stats.min < stats.q25);
assert!(stats.q25 < stats.median);
assert!(stats.median < stats.q75);
assert!(stats.q75 < stats.max);
}
#[test]
fn test_performance_delta() {
let delta = PerformanceDelta {
accuracy_delta: 0.02,
loss_delta: -0.01,
latency_delta: -5.0,
memory_delta: 100.0,
size_delta: 50.0,
training_time_delta: -300.0,
custom_deltas: HashMap::new(),
};
assert!(delta.accuracy_delta > 0.0);
assert!(delta.loss_delta < 0.0);
}
#[test]
fn test_regression_severity_variants() {
let severities = [
RegressionSeverity::Critical,
RegressionSeverity::Major,
RegressionSeverity::Minor,
RegressionSeverity::Negligible,
];
assert_eq!(severities.len(), 4);
}
#[test]
fn test_version_diff_creation() {
let diff = VersionDiff {
from_version: "1.0.0".to_string(),
to_version: "1.1.0".to_string(),
timestamp: Utc::now(),
performance_delta: PerformanceDelta {
accuracy_delta: 0.01,
loss_delta: -0.005,
latency_delta: 0.0,
memory_delta: 0.0,
size_delta: 10.0,
training_time_delta: 0.0,
custom_deltas: HashMap::new(),
},
architecture_changes: vec![ArchitectureChange {
change_type: "layer_added".to_string(),
description: "Added dropout layer".to_string(),
impact: "minor".to_string(),
}],
config_changes: vec![],
weight_changes: WeightChangesSummary {
avg_magnitude: 0.001,
max_change: 0.05,
significant_change_ratio: 0.1,
layer_changes: HashMap::new(),
},
};
assert_eq!(diff.from_version, "1.0.0");
assert_eq!(diff.architecture_changes.len(), 1);
}
#[test]
fn test_statistical_test_result() {
let result = StatisticalTestResult {
test_type: "t-test".to_string(),
statistic: 2.5,
p_value: 0.01,
effect_size: 0.4,
confidence_interval: (0.01, 0.05),
is_significant: true,
};
assert!(result.is_significant);
assert!(result.p_value < 0.05);
}
#[test]
fn test_ab_test_conclusion() {
let conclusion = ABTestConclusion {
winner: Some("model_b".to_string()),
confidence: 0.95,
practical_significance: true,
recommendation: "Deploy model_b".to_string(),
summary: "Model B outperforms Model A significantly".to_string(),
};
assert!(conclusion.winner.is_some());
assert!(conclusion.practical_significance);
}
#[tokio::test]
async fn test_compare_two_different_models() {
let config = DifferentialDebuggingConfig::default();
let mut debugger = DifferentialDebugger::new(config);
let mut snap_a = create_test_snapshot("model_a");
snap_a.metrics.train_accuracy = 0.90;
snap_a.metrics.val_accuracy = 0.85;
let mut snap_b = create_test_snapshot("model_b");
snap_b.metrics.train_accuracy = 0.95;
snap_b.metrics.val_accuracy = 0.92;
debugger.add_model_snapshot(snap_a).expect("add should succeed");
debugger.add_model_snapshot(snap_b).expect("add should succeed");
let result = debugger
.compare_models(vec!["model_a".to_string(), "model_b".to_string()])
.await;
assert!(result.is_ok());
let comparison = result.expect("comparison should succeed");
assert_eq!(comparison.models.len(), 2);
}
fn create_test_snapshot(name: &str) -> ModelSnapshot {
ModelSnapshot {
id: Uuid::new_v4(),
name: name.to_string(),
timestamp: Utc::now(),
version: "1.0.0".to_string(),
commit_hash: Some("abc123".to_string()),
metrics: ModelMetrics {
train_accuracy: 0.95,
val_accuracy: 0.90,
test_accuracy: Some(0.88),
train_loss: 0.05,
val_loss: 0.10,
test_loss: Some(0.12),
inference_latency_ms: 50.0,
memory_usage_mb: 2048.0,
model_size_mb: 500.0,
flops: 1_000_000_000,
training_time_s: 3600.0,
custom_metrics: HashMap::new(),
},
architecture: ArchitectureInfo {
parameter_count: 175_000_000,
layer_count: 24,
depth: 24,
hidden_size: 1024,
num_heads: Some(16),
ff_dim: Some(4096),
vocab_size: Some(50257),
max_seq_length: Some(2048),
},
training_config: TrainingConfig {
learning_rate: 1e-4,
batch_size: 32,
epochs: 10,
optimizer: "AdamW".to_string(),
lr_schedule: Some("cosine".to_string()),
regularization: HashMap::new(),
},
weights_summary: WeightsSummary {
mean: 0.0,
std_dev: 0.1,
min: -0.5,
max: 0.5,
percentiles: HashMap::new(),
zero_count: 1000,
sparsity: 0.01,
},
metadata: HashMap::new(),
}
}
}