1use crate::error::{OptimError, Result};
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::path::PathBuf;
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct ReproducibilityManager {
15 pub environments: HashMap<String, EnvironmentSnapshot>,
17 pub reports: Vec<ReproducibilityReport>,
19 pub verifications: Vec<VerificationResult>,
21 pub config: ReproducibilityConfig,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct EnvironmentSnapshot {
28 pub id: String,
30 pub timestamp: DateTime<Utc>,
32 pub system_info: SystemInfo,
34 pub dependencies: Vec<Dependency>,
36 pub environment_variables: HashMap<String, String>,
38 pub hardware_config: HardwareConfig,
40 pub random_seeds: Vec<u64>,
42 pub data_checksums: HashMap<String, String>,
44 pub config_hashes: HashMap<String, String>,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct SystemInfo {
51 pub os: String,
53 pub os_version: String,
55 pub kernel_version: Option<String>,
57 pub architecture: String,
59 pub hostname: String,
61 pub timezone: String,
63 pub locale: HashMap<String, String>,
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize)]
69pub struct Dependency {
70 pub name: String,
72 pub version: String,
74 pub source: String,
76 pub checksum: Option<String>,
78 pub install_path: Option<String>,
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct HardwareConfig {
85 pub cpu: CpuSpec,
87 pub memory: MemorySpec,
89 pub gpu: Option<GpuSpec>,
91 pub storage: Vec<StorageSpec>,
93}
94
95#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct CpuSpec {
98 pub model: String,
100 pub cores: usize,
102 pub threads: usize,
104 pub base_frequency: u32,
106 pub max_frequency: u32,
108 pub cache: HashMap<String, String>,
110 pub flags: Vec<String>,
112}
113
114#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct MemorySpec {
117 pub total_bytes: u64,
119 pub available_bytes: u64,
121 pub memory_type: String,
123 pub speed_mhz: u32,
125}
126
127#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct GpuSpec {
130 pub model: String,
132 pub memory_bytes: u64,
134 pub driver_version: String,
136 pub cuda_version: Option<String>,
138 pub compute_capability: Option<String>,
140}
141
142#[derive(Debug, Clone, Serialize, Deserialize)]
144pub struct StorageSpec {
145 pub device: String,
147 pub storage_type: String,
149 pub size_bytes: u64,
151 pub available_bytes: u64,
153 pub filesystem: String,
155}
156
157#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct ReproducibilityReport {
160 pub id: String,
162 pub experiment_id: String,
164 pub environment_id: String,
166 pub reproducibility_score: f64,
168 pub checklist: ReproducibilityChecklist,
170 pub issues: Vec<ReproducibilityIssue>,
172 pub recommendations: Vec<String>,
174 pub generated_at: DateTime<Utc>,
176}
177
178#[derive(Debug, Clone, Serialize, Deserialize)]
180pub struct ReproducibilityChecklist {
181 pub random_seed_documented: bool,
183 pub dependencies_pinned: bool,
185 pub environment_captured: bool,
187 pub data_versioned: bool,
189 pub code_versioned: bool,
191 pub hardware_documented: bool,
193 pub configuration_hashed: bool,
195 pub results_verified: bool,
197}
198
199#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct ReproducibilityIssue {
202 pub issue_type: IssueType,
204 pub severity: IssueSeverity,
206 pub description: String,
208 pub component: String,
210 pub suggested_fix: Option<String>,
212}
213
214#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
216pub enum IssueType {
217 MissingRandomSeed,
219 UnpinnedDependencies,
221 MissingEnvironment,
223 DataNotVersioned,
225 CodeNotVersioned,
227 HardwareNotDocumented,
229 ConfigurationNotHashed,
231 NonDeterministic,
233 PlatformSpecific,
235 ExternalDependencies,
237}
238
239#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
241pub enum IssueSeverity {
242 Critical,
244 High,
246 Medium,
248 Low,
250 Info,
252}
253
254#[derive(Debug, Clone, Serialize, Deserialize)]
256pub struct VerificationResult {
257 pub id: String,
259 pub original_experiment_id: String,
261 pub reproduction_experiment_id: String,
263 pub status: VerificationStatus,
265 pub similarity_metrics: SimilarityMetrics,
267 pub differences: Vec<Difference>,
269 pub verified_at: DateTime<Utc>,
271}
272
273#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
275pub enum VerificationStatus {
276 ExactMatch,
278 CloseMatch,
280 PartialMatch,
282 NoMatch,
284 VerificationFailed,
286}
287
288#[derive(Debug, Clone, Serialize, Deserialize)]
290pub struct SimilarityMetrics {
291 pub overall_similarity: f64,
293 pub result_similarity: f64,
295 pub performance_similarity: f64,
297 pub configuration_similarity: f64,
299 pub environment_similarity: f64,
301}
302
303#[derive(Debug, Clone, Serialize, Deserialize)]
305pub struct Difference {
306 pub category: DifferenceCategory,
308 pub field: String,
310 pub original_value: String,
312 pub reproduction_value: String,
314 pub magnitude: f64,
316 pub significant: bool,
318}
319
320#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
322pub enum DifferenceCategory {
323 Results,
325 Performance,
327 Configuration,
329 Environment,
331 Dependencies,
333 Hardware,
335}
336
337#[derive(Debug, Clone, Serialize, Deserialize)]
339pub struct ReproducibilityConfig {
340 pub numerical_tolerance: f64,
342 pub performance_tolerance: f64,
344 pub min_reproducibility_score: f64,
346 pub auto_capture_environment: bool,
348 pub auto_verify_results: bool,
350 pub storage: ReproducibilityStorage,
352}
353
354#[derive(Debug, Clone, Serialize, Deserialize)]
356pub struct ReproducibilityStorage {
357 pub base_directory: PathBuf,
359 pub compress_snapshots: bool,
361 pub retention_days: u32,
363 pub max_storage_bytes: u64,
365}
366
367impl ReproducibilityManager {
368 pub fn new(config: ReproducibilityConfig) -> Self {
370 Self {
371 environments: HashMap::new(),
372 reports: Vec::new(),
373 verifications: Vec::new(),
374 config,
375 }
376 }
377
378 pub fn capture_environment(&mut self) -> Result<String> {
380 let snapshot_id = uuid::Uuid::new_v4().to_string();
381 let snapshot = EnvironmentSnapshot {
382 id: snapshot_id.clone(),
383 timestamp: Utc::now(),
384 system_info: self.capture_system_info()?,
385 dependencies: self.capture_dependencies()?,
386 environment_variables: self.capture_environment_variables(),
387 hardware_config: self.capture_hardware_config()?,
388 random_seeds: vec![42], data_checksums: HashMap::new(),
390 config_hashes: HashMap::new(),
391 };
392
393 self.environments.insert(snapshot_id.clone(), snapshot);
394 Ok(snapshot_id)
395 }
396
397 pub fn generate_report(&mut self, experiment_id: &str, environment_id: &str) -> Result<String> {
399 let environment = self.environments.get(environment_id).ok_or_else(|| {
400 OptimError::InvalidConfig("Environment snapshot not found".to_string())
401 })?;
402
403 let checklist = self.evaluate_checklist(environment);
404 let (score, issues) = self.calculate_reproducibility_score(&checklist, environment);
405 let recommendations = self.generate_recommendations(&issues);
406
407 let report_id = uuid::Uuid::new_v4().to_string();
408 let report = ReproducibilityReport {
409 id: report_id.clone(),
410 experiment_id: experiment_id.to_string(),
411 environment_id: environment_id.to_string(),
412 reproducibility_score: score,
413 checklist,
414 issues,
415 recommendations,
416 generated_at: Utc::now(),
417 };
418
419 self.reports.push(report);
420 Ok(report_id)
421 }
422
423 pub fn verify_reproducibility(
425 &mut self,
426 original_experiment_id: &str,
427 reproduction_experiment_id: &str,
428 ) -> Result<String> {
429 let verification_id = uuid::Uuid::new_v4().to_string();
433 let verification = VerificationResult {
434 id: verification_id.clone(),
435 original_experiment_id: original_experiment_id.to_string(),
436 reproduction_experiment_id: reproduction_experiment_id.to_string(),
437 status: VerificationStatus::CloseMatch, similarity_metrics: SimilarityMetrics {
439 overall_similarity: 0.95,
440 result_similarity: 0.98,
441 performance_similarity: 0.92,
442 configuration_similarity: 1.0,
443 environment_similarity: 0.90,
444 },
445 differences: Vec::new(),
446 verified_at: Utc::now(),
447 };
448
449 self.verifications.push(verification);
450 Ok(verification_id)
451 }
452
453 fn capture_system_info(&self) -> Result<SystemInfo> {
454 Ok(SystemInfo {
455 os: std::env::consts::OS.to_string(),
456 os_version: "Unknown".to_string(), kernel_version: None,
458 architecture: std::env::consts::ARCH.to_string(),
459 hostname: std::env::var("HOSTNAME").unwrap_or_else(|_| "unknown".to_string()),
460 timezone: "UTC".to_string(), locale: HashMap::new(),
462 })
463 }
464
465 fn capture_dependencies(&self) -> Result<Vec<Dependency>> {
466 Ok(vec![Dependency {
468 name: "scirs2-optim".to_string(),
469 version: "0.1.0".to_string(),
470 source: "local".to_string(),
471 checksum: None,
472 install_path: None,
473 }])
474 }
475
476 fn capture_environment_variables(&self) -> HashMap<String, String> {
477 std::env::vars().collect()
478 }
479
480 fn capture_hardware_config(&self) -> Result<HardwareConfig> {
481 Ok(HardwareConfig {
482 cpu: CpuSpec {
483 model: "Unknown CPU".to_string(),
484 cores: std::thread::available_parallelism()
485 .map(|p| p.get())
486 .unwrap_or(1),
487 threads: std::thread::available_parallelism()
488 .map(|p| p.get())
489 .unwrap_or(1),
490 base_frequency: 0,
491 max_frequency: 0,
492 cache: HashMap::new(),
493 flags: Vec::new(),
494 },
495 memory: MemorySpec {
496 total_bytes: 8 * 1024 * 1024 * 1024, available_bytes: 6 * 1024 * 1024 * 1024, memory_type: "Unknown".to_string(),
499 speed_mhz: 0,
500 },
501 gpu: None,
502 storage: Vec::new(),
503 })
504 }
505
506 fn evaluate_checklist(&self, environment: &EnvironmentSnapshot) -> ReproducibilityChecklist {
507 ReproducibilityChecklist {
508 random_seed_documented: !environment.random_seeds.is_empty(),
509 dependencies_pinned: !environment.dependencies.is_empty(),
510 environment_captured: true, data_versioned: !environment.data_checksums.is_empty(),
512 code_versioned: false, hardware_documented: true, configuration_hashed: !environment.config_hashes.is_empty(),
515 results_verified: false, }
517 }
518
519 fn calculate_reproducibility_score(
520 &self,
521 checklist: &ReproducibilityChecklist,
522 environment: &EnvironmentSnapshot,
523 ) -> (f64, Vec<ReproducibilityIssue>) {
524 let mut score = 0.0;
525 let mut issues = Vec::new();
526 let total_checks = 8.0;
527
528 if checklist.random_seed_documented {
529 score += 1.0;
530 } else {
531 issues.push(ReproducibilityIssue {
532 issue_type: IssueType::MissingRandomSeed,
533 severity: IssueSeverity::High,
534 description: "Random seed not documented".to_string(),
535 component: "Random Number Generation".to_string(),
536 suggested_fix: Some("Set and document random seeds for all RNGs".to_string()),
537 });
538 }
539
540 if checklist.dependencies_pinned {
541 score += 1.0;
542 } else {
543 issues.push(ReproducibilityIssue {
544 issue_type: IssueType::UnpinnedDependencies,
545 severity: IssueSeverity::Critical,
546 description: "Dependencies not pinned to specific versions".to_string(),
547 component: "Dependencies".to_string(),
548 suggested_fix: Some("Pin all dependencies to exact versions".to_string()),
549 });
550 }
551
552 if checklist.environment_captured {
553 score += 1.0;
554 }
555
556 if checklist.data_versioned {
557 score += 1.0;
558 } else {
559 issues.push(ReproducibilityIssue {
560 issue_type: IssueType::DataNotVersioned,
561 severity: IssueSeverity::High,
562 description: "Data not versioned or checksummed".to_string(),
563 component: "Data Management".to_string(),
564 suggested_fix: Some("Version control data or provide checksums".to_string()),
565 });
566 }
567
568 if checklist.code_versioned {
569 score += 1.0;
570 } else {
571 issues.push(ReproducibilityIssue {
572 issue_type: IssueType::CodeNotVersioned,
573 severity: IssueSeverity::Critical,
574 description: "Code not under version control".to_string(),
575 component: "Source Code".to_string(),
576 suggested_fix: Some("Use Git or other version control system".to_string()),
577 });
578 }
579
580 if checklist.hardware_documented {
581 score += 1.0;
582 }
583
584 if checklist.configuration_hashed {
585 score += 1.0;
586 } else {
587 issues.push(ReproducibilityIssue {
588 issue_type: IssueType::ConfigurationNotHashed,
589 severity: IssueSeverity::Medium,
590 description: "Configuration not hashed for integrity".to_string(),
591 component: "Configuration".to_string(),
592 suggested_fix: Some("Generate and store configuration hashes".to_string()),
593 });
594 }
595
596 if checklist.results_verified {
597 score += 1.0;
598 }
599
600 (score / total_checks, issues)
601 }
602
603 fn generate_recommendations(&self, issues: &[ReproducibilityIssue]) -> Vec<String> {
604 let mut recommendations = Vec::new();
605
606 for issue in issues {
607 if let Some(fix) = &issue.suggested_fix {
608 recommendations.push(format!("{}: {}", issue.component, fix));
609 }
610 }
611
612 if issues
613 .iter()
614 .any(|i| i.issue_type == IssueType::MissingRandomSeed)
615 {
616 recommendations.push("Use consistent random seeds across all components".to_string());
617 }
618
619 if issues
620 .iter()
621 .any(|i| i.issue_type == IssueType::UnpinnedDependencies)
622 {
623 recommendations.push("Create a lockfile with exact dependency versions".to_string());
624 }
625
626 recommendations.push("Document the complete experimental procedure".to_string());
627 recommendations.push("Provide clear instructions for reproduction".to_string());
628
629 recommendations
630 }
631}
632
633impl Default for ReproducibilityConfig {
634 fn default() -> Self {
635 Self {
636 numerical_tolerance: 1e-6,
637 performance_tolerance: 0.1, min_reproducibility_score: 0.8, auto_capture_environment: true,
640 auto_verify_results: false,
641 storage: ReproducibilityStorage {
642 base_directory: PathBuf::from("./reproducibility"),
643 compress_snapshots: true,
644 retention_days: 365,
645 max_storage_bytes: 10 * 1024 * 1024 * 1024, },
647 }
648 }
649}
650
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a manager with the default configuration.
    fn default_manager() -> ReproducibilityManager {
        ReproducibilityManager::new(ReproducibilityConfig::default())
    }

    #[test]
    fn test_reproducibility_manager_creation() {
        let manager = default_manager();

        assert!(manager.environments.is_empty());
        assert!(manager.reports.is_empty());
        assert!(manager.verifications.is_empty());
    }

    #[test]
    fn test_environment_capture() {
        let mut manager = default_manager();

        let snapshot_id = manager.capture_environment().unwrap();

        assert!(manager.environments.contains_key(&snapshot_id));
        let snapshot = &manager.environments[&snapshot_id];
        assert_eq!(snapshot.id, snapshot_id);
        assert_eq!(snapshot.system_info.os, std::env::consts::OS);
    }

    #[test]
    fn test_reproducibility_report() {
        let mut manager = default_manager();

        let env_id = manager.capture_environment().unwrap();
        let report_id = manager.generate_report("test_experiment", &env_id).unwrap();

        assert!(!manager.reports.is_empty());
        let report = &manager.reports[0];
        assert_eq!(report.id, report_id);
        assert_eq!(report.experiment_id, "test_experiment");
        assert_eq!(report.environment_id, env_id);
        assert!((0.0..=1.0).contains(&report.reproducibility_score));
    }

    #[test]
    fn test_report_for_unknown_environment_fails() {
        let mut manager = default_manager();

        assert!(manager
            .generate_report("test_experiment", "no-such-snapshot")
            .is_err());
        // A failed lookup must not leave a partial report behind.
        assert!(manager.reports.is_empty());
    }

    #[test]
    fn test_verification_is_recorded() {
        let mut manager = default_manager();

        let verification_id = manager
            .verify_reproducibility("original", "reproduction")
            .unwrap();

        assert_eq!(manager.verifications.len(), 1);
        let verification = &manager.verifications[0];
        assert_eq!(verification.id, verification_id);
        assert_eq!(verification.original_experiment_id, "original");
        assert_eq!(verification.reproduction_experiment_id, "reproduction");
    }
}