use crate::error::{OptimError, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
/// In-memory registry of environment snapshots, reproducibility reports, and
/// verification results, together with the configuration it was created with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityManager {
/// Captured snapshots keyed by snapshot id; `capture_environment` inserts
/// each snapshot under its own `id`.
pub environments: HashMap<String, EnvironmentSnapshot>,
/// Reports, in the order `generate_report` appended them.
pub reports: Vec<ReproducibilityReport>,
/// Verification results, in the order `verify_reproducibility` appended them.
pub verifications: Vec<VerificationResult>,
/// Configuration passed to `new`.
/// NOTE(review): none of the methods visible in this file read it yet.
pub config: ReproducibilityConfig,
}
/// Point-in-time record of the environment an experiment ran in.
///
/// Created by `ReproducibilityManager::capture_environment` and later inspected
/// by `evaluate_checklist` to decide which reproducibility checks pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentSnapshot {
/// Snapshot identifier; `capture_environment` uses a freshly generated
/// UUID v4 rendered as a string.
pub id: String,
/// UTC time at which the snapshot was taken.
pub timestamp: DateTime<Utc>,
/// Operating-system level information.
pub system_info: SystemInfo,
/// Software dependencies recorded with the snapshot.
pub dependencies: Vec<Dependency>,
/// Environment variables of the capturing process (name -> value).
/// NOTE(review): the whole process environment is stored and the struct is
/// serializable, so secrets held in variables end up in the snapshot.
pub environment_variables: HashMap<String, String>,
/// Hardware description recorded with the snapshot.
pub hardware_config: HardwareConfig,
/// Random seeds in use. A non-empty list makes `evaluate_checklist` mark the
/// seed as documented; `capture_environment` currently stores the fixed
/// placeholder `[42]`.
pub random_seeds: Vec<u64>,
/// Data checksums; a non-empty map makes `evaluate_checklist` mark the data
/// as versioned. Starts empty. Presumably data-set name -> checksum — TODO confirm.
pub data_checksums: HashMap<String, String>,
/// Configuration hashes; a non-empty map makes `evaluate_checklist` mark the
/// configuration as hashed. Starts empty. Presumably config name -> hash — TODO confirm.
pub config_hashes: HashMap<String, String>,
}
/// Operating-system level facts stored in an [`EnvironmentSnapshot`].
///
/// Several fields are placeholders in the current `capture_system_info`
/// implementation; see the per-field notes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
/// Operating system name; filled from `std::env::consts::OS`.
pub os: String,
/// Operating system version; currently always the placeholder `"Unknown"`.
pub os_version: String,
/// Kernel version, if known; currently always `None`.
pub kernel_version: Option<String>,
/// CPU architecture; filled from `std::env::consts::ARCH`.
pub architecture: String,
/// Host name; `capture_system_info` derives it from the process environment
/// and falls back to `"unknown"`.
pub hostname: String,
/// Time zone; currently hard-coded to `"UTC"` rather than detected.
pub timezone: String,
/// Locale settings; currently always an empty map.
/// Presumably locale category -> value — TODO confirm once populated.
pub locale: HashMap<String, String>,
}
/// One software dependency recorded in an [`EnvironmentSnapshot`].
///
/// `capture_dependencies` currently returns a single hard-coded entry
/// (`scirs2-optim` `0.1.0` from `local`) instead of inspecting the build.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dependency {
/// Package or crate name.
pub name: String,
/// Version string as recorded; no particular format is enforced here.
pub version: String,
/// Where the dependency came from; `"local"` is the only value written in
/// this file.
pub source: String,
/// Optional checksum of the dependency; the algorithm is not specified in
/// this file.
pub checksum: Option<String>,
/// Optional installation path, stored as a plain string.
pub install_path: Option<String>,
}
/// Hardware description stored in an [`EnvironmentSnapshot`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareConfig {
/// Processor description.
pub cpu: CpuSpec,
/// Main-memory description.
pub memory: MemorySpec,
/// GPU description, if any; `capture_hardware_config` currently records `None`.
pub gpu: Option<GpuSpec>,
/// Storage devices; `capture_hardware_config` currently records an empty list.
pub storage: Vec<StorageSpec>,
}
/// Processor description.
///
/// `capture_hardware_config` only measures the parallelism value; every other
/// field is a placeholder there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuSpec {
/// CPU model name; currently the placeholder `"Unknown CPU"`.
pub model: String,
/// Core count. Currently filled from `std::thread::available_parallelism()`
/// (falling back to 1), i.e. the same value as `threads`.
pub cores: usize,
/// Thread count; filled from `std::thread::available_parallelism()`
/// (falling back to 1).
pub threads: usize,
/// Base clock frequency; currently always 0. Unit is not stated in this
/// file — presumably MHz, TODO confirm.
pub base_frequency: u32,
/// Maximum clock frequency; currently always 0. Unit is not stated in this
/// file — presumably MHz, TODO confirm.
pub max_frequency: u32,
/// Cache description; currently an empty map.
/// Presumably cache level -> size — TODO confirm once populated.
pub cache: HashMap<String, String>,
/// CPU feature flags; currently an empty list.
pub flags: Vec<String>,
}
/// Main-memory description.
///
/// NOTE(review): `capture_hardware_config` fills fixed placeholder values
/// (8 GiB total, 6 GiB available, `"Unknown"`, 0) rather than probing the host.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemorySpec {
/// Total memory in bytes.
pub total_bytes: u64,
/// Memory available at capture time, in bytes.
pub available_bytes: u64,
/// Memory technology label; currently the placeholder `"Unknown"`.
pub memory_type: String,
/// Memory speed, in MHz per the field name; currently always 0.
pub speed_mhz: u32,
}
/// GPU description.
///
/// NOTE(review): never constructed in this file (`capture_hardware_config`
/// always records `gpu: None`), so the field notes below are inferred from the
/// field names and should be confirmed against whatever code fills them in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuSpec {
/// GPU model name.
pub model: String,
/// GPU memory size, in bytes per the field name.
pub memory_bytes: u64,
/// Driver version string.
pub driver_version: String,
/// CUDA version, if applicable.
pub cuda_version: Option<String>,
/// Compute capability, if applicable; stored as free-form text.
pub compute_capability: Option<String>,
}
/// Description of one storage device.
///
/// NOTE(review): never constructed in this file (`capture_hardware_config`
/// records an empty storage list), so the field notes below are inferred from
/// the field names and should be confirmed against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageSpec {
/// Device identifier.
pub device: String,
/// Kind of storage, as free-form text.
pub storage_type: String,
/// Total capacity, in bytes per the field name.
pub size_bytes: u64,
/// Free capacity at capture time, in bytes per the field name.
pub available_bytes: u64,
/// File system name.
pub filesystem: String,
}
/// Result of assessing one experiment against one environment snapshot.
///
/// Built and stored by `ReproducibilityManager::generate_report`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityReport {
/// Report identifier; `generate_report` uses a freshly generated UUID v4
/// rendered as a string.
pub id: String,
/// Experiment identifier exactly as passed to `generate_report`.
pub experiment_id: String,
/// Id of the [`EnvironmentSnapshot`] the report was computed from.
pub environment_id: String,
/// Fraction of the eight checklist items that passed, so a value in
/// `0.0..=1.0` in steps of 1/8.
pub reproducibility_score: f64,
/// Per-item pass/fail results behind the score.
pub checklist: ReproducibilityChecklist,
/// Problems found while scoring.
pub issues: Vec<ReproducibilityIssue>,
/// Human-readable advice derived from `issues` plus general guidance.
pub recommendations: Vec<String>,
/// UTC time at which the report was generated.
pub generated_at: DateTime<Utc>,
}
/// Pass/fail flags for the eight reproducibility checks; each `true` flag
/// contributes 1/8 to the reproducibility score.
///
/// The per-field notes describe how `evaluate_checklist` currently sets them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityChecklist {
/// `true` when the snapshot lists at least one random seed.
pub random_seed_documented: bool,
/// `true` when the snapshot lists at least one dependency.
/// NOTE(review): this checks presence only, not that versions are pinned.
pub dependencies_pinned: bool,
/// Currently always `true`.
pub environment_captured: bool,
/// `true` when the snapshot has at least one data checksum.
pub data_versioned: bool,
/// Currently always `false`; nothing in this file detects version control.
pub code_versioned: bool,
/// Currently always `true`.
pub hardware_documented: bool,
/// `true` when the snapshot has at least one configuration hash.
pub configuration_hashed: bool,
/// Currently always `false`; verification results are not consulted.
pub results_verified: bool,
}
/// A single problem that lowers reproducibility, as listed in a
/// [`ReproducibilityReport`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityIssue {
/// Category of the problem.
pub issue_type: IssueType,
/// How serious the problem is.
pub severity: IssueSeverity,
/// Human-readable description of the problem.
pub description: String,
/// Area the problem belongs to (e.g. `"Dependencies"`); used as the prefix
/// of the recommendation built from `suggested_fix`.
pub component: String,
/// Optional remedy; when present `generate_recommendations` emits it as
/// `"{component}: {fix}"`.
pub suggested_fix: Option<String>,
}
/// Category of a [`ReproducibilityIssue`].
///
/// Variants without a matching description string in this file are documented
/// from their names only and are marked as such.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum IssueType {
/// Random seed not documented.
MissingRandomSeed,
/// Dependencies not pinned to specific versions.
UnpinnedDependencies,
/// Presumably: the execution environment was not captured (inferred from the name).
MissingEnvironment,
/// Data not versioned or checksummed.
DataNotVersioned,
/// Code not under version control.
CodeNotVersioned,
/// Presumably: the hardware was not documented (inferred from the name).
HardwareNotDocumented,
/// Configuration not hashed for integrity.
ConfigurationNotHashed,
/// Presumably: the computation is not deterministic (inferred from the name).
NonDeterministic,
/// Presumably: behaviour depends on the platform (inferred from the name).
PlatformSpecific,
/// Presumably: the experiment relies on external dependencies (inferred from the name).
ExternalDependencies,
}
/// Severity of a [`ReproducibilityIssue`].
///
/// `PartialOrd`/`Ord` are derived, so variants compare in declaration order:
/// `Critical < High < Medium < Low < Info`. The most severe level is therefore
/// the *smallest* value, and an ascending sort lists the most severe issues
/// first. Keep the variant order stable: it defines these comparisons.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum IssueSeverity {
/// Most severe level (compares lowest).
Critical,
/// Second most severe level.
High,
/// Middle level.
Medium,
/// Second least severe level.
Low,
/// Least severe level (compares highest).
Info,
}
/// Outcome of comparing a reproduction run against the original experiment.
///
/// Built and stored by `ReproducibilityManager::verify_reproducibility`.
/// NOTE(review): that method currently records fixed placeholder values rather
/// than comparing real experiment data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
/// Verification identifier; `verify_reproducibility` uses a freshly generated
/// UUID v4 rendered as a string.
pub id: String,
/// Id of the original experiment, exactly as passed in.
pub original_experiment_id: String,
/// Id of the reproduction experiment, exactly as passed in.
pub reproduction_experiment_id: String,
/// Overall verdict.
pub status: VerificationStatus,
/// Similarity scores behind the verdict.
pub similarity_metrics: SimilarityMetrics,
/// Individual differences found; `verify_reproducibility` currently leaves
/// this empty.
pub differences: Vec<Difference>,
/// UTC time at which the verification was recorded.
pub verified_at: DateTime<Utc>,
}
/// Overall verdict of a [`VerificationResult`].
///
/// NOTE(review): no thresholds separating the variants are defined in this
/// file, and only `CloseMatch` is ever produced here; the variant notes are
/// inferred from the names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum VerificationStatus {
/// Presumably: the reproduction matches the original exactly.
ExactMatch,
/// Presumably: the reproduction matches within tolerance.
CloseMatch,
/// Presumably: only part of the reproduction matches.
PartialMatch,
/// Presumably: the reproduction does not match.
NoMatch,
/// Presumably: the verification itself could not be completed.
VerificationFailed,
}
/// Similarity scores stored in a [`VerificationResult`].
///
/// The only values written in this file are fixed placeholders between 0.90
/// and 1.0. Presumably each score lies in `0.0..=1.0` with 1.0 meaning
/// identical — TODO confirm; no range is enforced here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarityMetrics {
/// Combined similarity score.
pub overall_similarity: f64,
/// Similarity of the experiment results.
pub result_similarity: f64,
/// Similarity of the performance measurements.
pub performance_similarity: f64,
/// Similarity of the configurations.
pub configuration_similarity: f64,
/// Similarity of the environments.
pub environment_similarity: f64,
}
/// One difference between an original run and its reproduction, as listed in a
/// [`VerificationResult`].
///
/// NOTE(review): never constructed in this file, so the field notes are
/// inferred from the field names; confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Difference {
/// Area the difference belongs to.
pub category: DifferenceCategory,
/// Name of the field that differs.
pub field: String,
/// Value in the original run, rendered as text.
pub original_value: String,
/// Value in the reproduction run, rendered as text.
pub reproduction_value: String,
/// Size of the difference; scale and unit are not defined in this file.
pub magnitude: f64,
/// Whether the difference is considered significant; the criterion is not
/// defined in this file.
pub significant: bool,
}
/// Area a [`Difference`] belongs to.
///
/// NOTE(review): no variant is produced in this file; the notes below restate
/// the variant names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum DifferenceCategory {
/// Difference in experiment results.
Results,
/// Difference in performance measurements.
Performance,
/// Difference in configuration.
Configuration,
/// Difference in the software environment.
Environment,
/// Difference in dependencies.
Dependencies,
/// Difference in hardware.
Hardware,
}
/// Settings for a [`ReproducibilityManager`].
///
/// Default values come from the `Default` impl in this file.
/// NOTE(review): none of the manager methods visible in this file read these
/// settings yet, so the field notes describe intent suggested by the names.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityConfig {
/// Tolerance for numerical comparisons; defaults to `1e-6`.
pub numerical_tolerance: f64,
/// Tolerance for performance comparisons; defaults to `0.1`.
/// Presumably a relative fraction (10%) — TODO confirm.
pub performance_tolerance: f64,
/// Minimum acceptable reproducibility score; defaults to `0.8`.
pub min_reproducibility_score: f64,
/// Whether environments should be captured automatically; defaults to `true`.
pub auto_capture_environment: bool,
/// Whether results should be verified automatically; defaults to `false`.
pub auto_verify_results: bool,
/// Storage settings.
pub storage: ReproducibilityStorage,
}
/// Storage settings inside a [`ReproducibilityConfig`].
///
/// Default values come from `ReproducibilityConfig::default()`.
/// NOTE(review): nothing visible in this file reads these settings or touches
/// the file system, so the field notes describe intent suggested by the names.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityStorage {
/// Directory for stored data; defaults to `./reproducibility`.
pub base_directory: PathBuf,
/// Whether snapshots should be compressed; defaults to `true`.
pub compress_snapshots: bool,
/// Retention period in days; defaults to `365`.
pub retention_days: u32,
/// Storage budget in bytes; defaults to 10 GiB.
pub max_storage_bytes: u64,
}
impl ReproducibilityManager {
pub fn new(config: ReproducibilityConfig) -> Self {
Self {
environments: HashMap::new(),
reports: Vec::new(),
verifications: Vec::new(),
config,
}
}
pub fn capture_environment(&mut self) -> Result<String> {
let snapshot_id = uuid::Uuid::new_v4().to_string();
let snapshot = EnvironmentSnapshot {
id: snapshot_id.clone(),
timestamp: Utc::now(),
system_info: self.capture_system_info()?,
dependencies: self.capture_dependencies()?,
environment_variables: self.capture_environment_variables(),
hardware_config: self.capture_hardware_config()?,
random_seeds: vec![42], data_checksums: HashMap::new(),
config_hashes: HashMap::new(),
};
self.environments.insert(snapshot_id.clone(), snapshot);
Ok(snapshot_id)
}
pub fn generate_report(&mut self, experiment_id: &str, environment_id: &str) -> Result<String> {
let environment = self.environments.get(environment_id).ok_or_else(|| {
OptimError::InvalidConfig("Environment snapshot not found".to_string())
})?;
let checklist = self.evaluate_checklist(environment);
let (score, issues) = self.calculate_reproducibility_score(&checklist, environment);
let recommendations = self.generate_recommendations(&issues);
let report_id = uuid::Uuid::new_v4().to_string();
let report = ReproducibilityReport {
id: report_id.clone(),
experiment_id: experiment_id.to_string(),
environment_id: environment_id.to_string(),
reproducibility_score: score,
checklist,
issues,
recommendations,
generated_at: Utc::now(),
};
self.reports.push(report);
Ok(report_id)
}
pub fn verify_reproducibility(
&mut self,
original_experiment_id: &str,
reproduction_experiment_id: &str,
) -> Result<String> {
let verification_id = uuid::Uuid::new_v4().to_string();
let verification = VerificationResult {
id: verification_id.clone(),
original_experiment_id: original_experiment_id.to_string(),
reproduction_experiment_id: reproduction_experiment_id.to_string(),
status: VerificationStatus::CloseMatch, similarity_metrics: SimilarityMetrics {
overall_similarity: 0.95,
result_similarity: 0.98,
performance_similarity: 0.92,
configuration_similarity: 1.0,
environment_similarity: 0.90,
},
differences: Vec::new(),
verified_at: Utc::now(),
};
self.verifications.push(verification);
Ok(verification_id)
}
fn capture_system_info(&self) -> Result<SystemInfo> {
Ok(SystemInfo {
os: std::env::consts::OS.to_string(),
os_version: "Unknown".to_string(), kernel_version: None,
architecture: std::env::consts::ARCH.to_string(),
hostname: std::env::var("HOSTNAME").unwrap_or_else(|_| "unknown".to_string()),
timezone: "UTC".to_string(), locale: HashMap::new(),
})
}
fn capture_dependencies(&self) -> Result<Vec<Dependency>> {
Ok(vec![Dependency {
name: "scirs2-optim".to_string(),
version: "0.1.0".to_string(),
source: "local".to_string(),
checksum: None,
install_path: None,
}])
}
fn capture_environment_variables(&self) -> HashMap<String, String> {
std::env::vars().collect()
}
fn capture_hardware_config(&self) -> Result<HardwareConfig> {
Ok(HardwareConfig {
cpu: CpuSpec {
model: "Unknown CPU".to_string(),
cores: std::thread::available_parallelism()
.map(|p| p.get())
.unwrap_or(1),
threads: std::thread::available_parallelism()
.map(|p| p.get())
.unwrap_or(1),
base_frequency: 0,
max_frequency: 0,
cache: HashMap::new(),
flags: Vec::new(),
},
memory: MemorySpec {
total_bytes: 8 * 1024 * 1024 * 1024, available_bytes: 6 * 1024 * 1024 * 1024, memory_type: "Unknown".to_string(),
speed_mhz: 0,
},
gpu: None,
storage: Vec::new(),
})
}
fn evaluate_checklist(&self, environment: &EnvironmentSnapshot) -> ReproducibilityChecklist {
ReproducibilityChecklist {
random_seed_documented: !environment.random_seeds.is_empty(),
dependencies_pinned: !environment.dependencies.is_empty(),
environment_captured: true, data_versioned: !environment.data_checksums.is_empty(),
code_versioned: false, hardware_documented: true, configuration_hashed: !environment.config_hashes.is_empty(),
results_verified: false, }
}
fn calculate_reproducibility_score(
&self,
checklist: &ReproducibilityChecklist,
environment: &EnvironmentSnapshot,
) -> (f64, Vec<ReproducibilityIssue>) {
let mut score = 0.0;
let mut issues = Vec::new();
let total_checks = 8.0;
if checklist.random_seed_documented {
score += 1.0;
} else {
issues.push(ReproducibilityIssue {
issue_type: IssueType::MissingRandomSeed,
severity: IssueSeverity::High,
description: "Random seed not documented".to_string(),
component: "Random Number Generation".to_string(),
suggested_fix: Some("Set and document random seeds for all RNGs".to_string()),
});
}
if checklist.dependencies_pinned {
score += 1.0;
} else {
issues.push(ReproducibilityIssue {
issue_type: IssueType::UnpinnedDependencies,
severity: IssueSeverity::Critical,
description: "Dependencies not pinned to specific versions".to_string(),
component: "Dependencies".to_string(),
suggested_fix: Some("Pin all dependencies to exact versions".to_string()),
});
}
if checklist.environment_captured {
score += 1.0;
}
if checklist.data_versioned {
score += 1.0;
} else {
issues.push(ReproducibilityIssue {
issue_type: IssueType::DataNotVersioned,
severity: IssueSeverity::High,
description: "Data not versioned or checksummed".to_string(),
component: "Data Management".to_string(),
suggested_fix: Some("Version control data or provide checksums".to_string()),
});
}
if checklist.code_versioned {
score += 1.0;
} else {
issues.push(ReproducibilityIssue {
issue_type: IssueType::CodeNotVersioned,
severity: IssueSeverity::Critical,
description: "Code not under version control".to_string(),
component: "Source Code".to_string(),
suggested_fix: Some("Use Git or other version control system".to_string()),
});
}
if checklist.hardware_documented {
score += 1.0;
}
if checklist.configuration_hashed {
score += 1.0;
} else {
issues.push(ReproducibilityIssue {
issue_type: IssueType::ConfigurationNotHashed,
severity: IssueSeverity::Medium,
description: "Configuration not hashed for integrity".to_string(),
component: "Configuration".to_string(),
suggested_fix: Some("Generate and store configuration hashes".to_string()),
});
}
if checklist.results_verified {
score += 1.0;
}
(score / total_checks, issues)
}
fn generate_recommendations(&self, issues: &[ReproducibilityIssue]) -> Vec<String> {
let mut recommendations = Vec::new();
for issue in issues {
if let Some(fix) = &issue.suggested_fix {
recommendations.push(format!("{}: {}", issue.component, fix));
}
}
if issues
.iter()
.any(|i| i.issue_type == IssueType::MissingRandomSeed)
{
recommendations.push("Use consistent random seeds across all components".to_string());
}
if issues
.iter()
.any(|i| i.issue_type == IssueType::UnpinnedDependencies)
{
recommendations.push("Create a lockfile with exact dependency versions".to_string());
}
recommendations.push("Document the complete experimental procedure".to_string());
recommendations.push("Provide clear instructions for reproduction".to_string());
recommendations
}
}
impl Default for ReproducibilityConfig {
    /// Returns the stock configuration: numerical tolerance `1e-6`,
    /// performance tolerance `0.1`, minimum score `0.8`, automatic environment
    /// capture on, automatic verification off, and compressed storage under
    /// `./reproducibility` kept for 365 days with a 10 GiB budget.
    fn default() -> Self {
        const GIB: u64 = 1024 * 1024 * 1024;

        let storage = ReproducibilityStorage {
            base_directory: PathBuf::from("./reproducibility"),
            compress_snapshots: true,
            retention_days: 365,
            max_storage_bytes: 10 * GIB,
        };

        Self {
            numerical_tolerance: 1e-6,
            performance_tolerance: 0.1,
            min_reproducibility_score: 0.8,
            auto_capture_environment: true,
            auto_verify_results: false,
            storage,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a manager from the default configuration.
    fn default_manager() -> ReproducibilityManager {
        ReproducibilityManager::new(ReproducibilityConfig::default())
    }

    /// A new manager starts with no snapshots, reports, or verifications.
    #[test]
    fn test_reproducibility_manager_creation() {
        let manager = default_manager();

        assert!(manager.environments.is_empty());
        assert!(manager.reports.is_empty());
        assert!(manager.verifications.is_empty());
    }

    /// Capturing stores the snapshot under the returned id and records the
    /// compile-time OS name.
    #[test]
    fn test_environment_capture() {
        let mut manager = default_manager();

        let snapshot_id = manager.capture_environment().expect("unwrap failed");

        assert!(manager.environments.contains_key(&snapshot_id));
        let snapshot = &manager.environments[&snapshot_id];
        assert_eq!(snapshot.system_info.os, std::env::consts::OS);
    }

    /// Generating a report stores it with the returned id and the given
    /// experiment id.
    #[test]
    fn test_reproducibility_report() {
        let mut manager = default_manager();
        let env_id = manager.capture_environment().expect("unwrap failed");

        let report_id = manager
            .generate_report("test_experiment", &env_id)
            .expect("unwrap failed");

        assert!(!manager.reports.is_empty());
        let report = &manager.reports[0];
        assert_eq!(report.id, report_id);
        assert_eq!(report.experiment_id, "test_experiment");
    }
}