Skip to main content

datasynth_runtime/
run_manifest.rs

1//! Run manifest and metadata tracking for reproducibility.
2//!
3//! This module provides structures for capturing complete generation run metadata,
4//! enabling reproducibility and traceability of generated data.
5
6use chrono::{DateTime, Utc};
7use datasynth_config::schema::GeneratorConfig;
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::fs::File;
12use std::io::{self, BufReader, Read as _, Write};
13use std::path::Path;
14use uuid::Uuid;
15
16use super::EnhancedGenerationStatistics;
17
/// Complete manifest of a generation run for reproducibility.
///
/// A manifest is created with [`RunManifest::new`], enriched while the run is in
/// progress (tags, output files, warnings, metadata) and finalised with
/// [`RunManifest::complete`]. [`RunManifest::write_to_file`] persists it as
/// pretty-printed JSON.
///
/// Fields annotated with `#[serde(default)]` may be absent from the input when
/// deserializing; they then take their default value. Fields that additionally
/// carry `skip_serializing_if = "Option::is_none"` are left out of the
/// serialized output while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunManifest {
    /// Manifest format version.
    ///
    /// Falls back to `default_manifest_version()` (currently `"2.0"`) when the
    /// field is missing from the input.
    #[serde(default = "default_manifest_version")]
    pub manifest_version: String,
    /// Unique identifier for this run (a random v4 UUID string when built via `new`).
    pub run_id: String,
    /// Timestamp when generation started (captured in UTC by `new`).
    pub started_at: DateTime<Utc>,
    /// Timestamp when generation completed; `None` until `complete` is called.
    pub completed_at: Option<DateTime<Utc>>,
    /// Hex-encoded SHA-256 hash of the JSON-serialized configuration (for quick comparison).
    pub config_hash: String,
    /// Complete configuration snapshot (a clone of the config passed to `new`).
    pub config_snapshot: GeneratorConfig,
    /// Seed used for random number generation.
    pub seed: u64,
    /// Scenario tags for categorization; `add_tag` keeps this list free of duplicates.
    #[serde(default)]
    pub scenario_tags: Vec<String>,
    /// Generation statistics; `None` until `complete` is called.
    #[serde(default)]
    pub statistics: Option<EnhancedGenerationStatistics>,
    /// Duration in seconds between `started_at` and `completed_at`
    /// (derived from whole milliseconds); `None` until `complete` is called.
    pub duration_seconds: Option<f64>,
    /// Version of the generator (`CARGO_PKG_VERSION` of this crate at build time
    /// when the manifest is created via `new`).
    pub generator_version: String,
    /// Additional free-form key/value metadata.
    #[serde(default)]
    pub metadata: HashMap<String, String>,
    /// Output directory path, stored as its display string.
    pub output_directory: Option<String>,
    /// List of output files generated.
    #[serde(default)]
    pub output_files: Vec<OutputFileInfo>,
    /// Any warnings or notes from the generation.
    #[serde(default)]
    pub warnings: Vec<String>,
    /// Data lineage graph tracking config → generator → output relationships.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lineage: Option<super::lineage::LineageGraph>,
    /// Quality gate evaluation result.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub quality_gate_result: Option<QualityGateResultSummary>,
    /// LLM enrichment phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub llm_enrichment: Option<LlmEnrichmentSummary>,
    /// Diffusion enhancement phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub diffusion_model: Option<DiffusionModelSummary>,
    /// Causal generation phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub causal_generation: Option<CausalGenerationSummary>,
}
73
/// Summary of LLM enrichment phase for the run manifest.
///
/// Stored in `RunManifest::llm_enrichment`; that field is omitted from the
/// serialized manifest while it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmEnrichmentSummary {
    /// Whether LLM enrichment was enabled.
    pub enabled: bool,
    /// Execution time of the phase in milliseconds.
    pub timing_ms: u64,
    /// Number of vendors enriched.
    pub vendors_enriched: usize,
    /// Provider used (e.g., "mock", "openai").
    pub provider: String,
}
86
/// Summary of diffusion enhancement phase for the run manifest.
///
/// Stored in `RunManifest::diffusion_model`; that field is omitted from the
/// serialized manifest while it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffusionModelSummary {
    /// Whether diffusion enhancement was enabled.
    pub enabled: bool,
    /// Execution time of the phase in milliseconds.
    pub timing_ms: u64,
    /// Number of samples generated.
    pub samples_generated: usize,
    /// Number of diffusion steps used.
    pub n_steps: usize,
}
99
/// Summary of causal generation phase for the run manifest.
///
/// Stored in `RunManifest::causal_generation`; that field is omitted from the
/// serialized manifest while it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CausalGenerationSummary {
    /// Whether causal generation was enabled.
    pub enabled: bool,
    /// Execution time of the phase in milliseconds.
    pub timing_ms: u64,
    /// Number of causal samples generated.
    pub samples_generated: usize,
    /// Template used (e.g., "fraud_detection", "revenue_cycle").
    pub template: String,
    /// Whether causal validation passed (`None` if validation was not run).
    pub validation_passed: Option<bool>,
}
114
/// Summary of quality gate evaluation for the run manifest.
///
/// Stored in `RunManifest::quality_gate_result`; that field is omitted from the
/// serialized manifest while it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityGateResultSummary {
    /// Whether all gates passed.
    pub passed: bool,
    /// Name of the quality gate profile used.
    pub profile_name: String,
    /// Number of gates that passed.
    pub gates_passed: usize,
    /// Total number of gates evaluated.
    pub gates_total: usize,
    /// Names of failed gates.
    pub failed_gates: Vec<String>,
}
129
/// Serde fallback for a manifest whose `manifest_version` field is absent.
///
/// Returns the current manifest format version, `"2.0"`.
fn default_manifest_version() -> String {
    String::from("2.0")
}
133
/// Information about an output file.
///
/// The three trailing optional fields default to `None` when missing from the
/// input and are omitted from the serialized output while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputFileInfo {
    /// Relative path from output directory.
    pub path: String,
    /// File format (csv, json, parquet).
    pub format: String,
    /// Record count.
    pub record_count: Option<usize>,
    /// File size in bytes.
    ///
    /// `RunManifest::populate_file_checksums` fills this in from file metadata
    /// when it is still `None`.
    pub size_bytes: Option<u64>,
    /// SHA-256 checksum of the file contents (hex-encoded, as produced by
    /// `compute_file_checksum`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sha256_checksum: Option<String>,
    /// Index of the first record in this file (for partitioned outputs).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub first_record_index: Option<u64>,
    /// Index of the last record in this file (for partitioned outputs).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_record_index: Option<u64>,
}
155
/// Result of verifying a single file's checksum.
///
/// `RunManifest::verify_file_checksums` produces one of these per recorded
/// output file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChecksumVerificationResult {
    /// Relative path of the file (copied from the manifest entry).
    pub path: String,
    /// Verification status.
    pub status: ChecksumStatus,
    /// Expected checksum (from manifest); `None` when the manifest recorded none.
    pub expected: Option<String>,
    /// Actual checksum (computed from file); `None` when the file was not hashed.
    pub actual: Option<String>,
}
168
/// Status of a checksum verification.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChecksumStatus {
    /// Checksum matches.
    Ok,
    /// Checksum does not match.
    Mismatch,
    /// File is missing on disk.
    ///
    /// `RunManifest::verify_file_checksums` also reports this status when the
    /// file exists but its checksum could not be computed.
    Missing,
    /// No checksum recorded in manifest.
    NoChecksum,
}
181
182/// Computes the SHA-256 checksum of a file, streaming in 8KB chunks.
183pub fn compute_file_checksum(path: &Path) -> io::Result<String> {
184    let file = File::open(path)?;
185    let mut reader = BufReader::new(file);
186    let mut hasher = Sha256::new();
187    let mut buffer = [0u8; 8192];
188    loop {
189        let bytes_read = reader.read(&mut buffer)?;
190        if bytes_read == 0 {
191            break;
192        }
193        hasher.update(&buffer[..bytes_read]);
194    }
195    Ok(hex::encode(hasher.finalize()))
196}
197
198impl RunManifest {
199    /// Creates a new run manifest.
200    pub fn new(config: &GeneratorConfig, seed: u64) -> Self {
201        let run_id = Uuid::new_v4().to_string();
202        let config_hash = Self::hash_config(config);
203
204        Self {
205            manifest_version: "2.0".to_string(),
206            run_id,
207            started_at: Utc::now(),
208            completed_at: None,
209            config_hash,
210            config_snapshot: config.clone(),
211            seed,
212            scenario_tags: Vec::new(),
213            statistics: None,
214            duration_seconds: None,
215            generator_version: env!("CARGO_PKG_VERSION").to_string(),
216            metadata: HashMap::new(),
217            output_directory: None,
218            output_files: Vec::new(),
219            warnings: Vec::new(),
220            lineage: None,
221            quality_gate_result: None,
222            llm_enrichment: None,
223            diffusion_model: None,
224            causal_generation: None,
225        }
226    }
227
228    /// Computes SHA-256 hash of the configuration.
229    fn hash_config(config: &GeneratorConfig) -> String {
230        let json = serde_json::to_string(config).unwrap_or_default();
231        let mut hasher = Sha256::new();
232        hasher.update(json.as_bytes());
233        let result = hasher.finalize();
234        hex::encode(result)
235    }
236
237    /// Marks the run as complete.
238    pub fn complete(&mut self, statistics: EnhancedGenerationStatistics) {
239        self.completed_at = Some(Utc::now());
240        self.duration_seconds = Some(
241            (self.completed_at.expect("completed_at just set above") - self.started_at)
242                .num_milliseconds() as f64
243                / 1000.0,
244        );
245        self.statistics = Some(statistics);
246    }
247
248    /// Adds a scenario tag.
249    pub fn add_tag(&mut self, tag: &str) {
250        if !self.scenario_tags.contains(&tag.to_string()) {
251            self.scenario_tags.push(tag.to_string());
252        }
253    }
254
255    /// Adds multiple scenario tags.
256    pub fn add_tags(&mut self, tags: &[String]) {
257        for tag in tags {
258            self.add_tag(tag);
259        }
260    }
261
262    /// Sets the output directory.
263    pub fn set_output_directory(&mut self, path: &Path) {
264        self.output_directory = Some(path.display().to_string());
265    }
266
267    /// Adds an output file record.
268    pub fn add_output_file(&mut self, info: OutputFileInfo) {
269        self.output_files.push(info);
270    }
271
272    /// Adds a warning message.
273    pub fn add_warning(&mut self, warning: &str) {
274        self.warnings.push(warning.to_string());
275    }
276
277    /// Adds metadata.
278    pub fn add_metadata(&mut self, key: &str, value: &str) {
279        self.metadata.insert(key.to_string(), value.to_string());
280    }
281
282    /// Populates SHA-256 checksums for all output files.
283    ///
284    /// Resolves each file path relative to `base_dir` and computes its checksum.
285    /// Also populates `size_bytes` if not already set.
286    pub fn populate_file_checksums(&mut self, base_dir: &Path) {
287        for file_info in &mut self.output_files {
288            let file_path = base_dir.join(&file_info.path);
289            if file_path.exists() {
290                if let Ok(checksum) = compute_file_checksum(&file_path) {
291                    file_info.sha256_checksum = Some(checksum);
292                }
293                if file_info.size_bytes.is_none() {
294                    if let Ok(metadata) = std::fs::metadata(&file_path) {
295                        file_info.size_bytes = Some(metadata.len());
296                    }
297                }
298            }
299        }
300    }
301
302    /// Verifies checksums for all output files against their recorded values.
303    pub fn verify_file_checksums(&self, base_dir: &Path) -> Vec<ChecksumVerificationResult> {
304        self.output_files
305            .iter()
306            .map(|file_info| {
307                let file_path = base_dir.join(&file_info.path);
308
309                let expected = file_info.sha256_checksum.clone();
310                if expected.is_none() {
311                    return ChecksumVerificationResult {
312                        path: file_info.path.clone(),
313                        status: ChecksumStatus::NoChecksum,
314                        expected: None,
315                        actual: None,
316                    };
317                }
318
319                if !file_path.exists() {
320                    return ChecksumVerificationResult {
321                        path: file_info.path.clone(),
322                        status: ChecksumStatus::Missing,
323                        expected,
324                        actual: None,
325                    };
326                }
327
328                match compute_file_checksum(&file_path) {
329                    Ok(actual) => {
330                        let status = if expected.as_deref() == Some(actual.as_str()) {
331                            ChecksumStatus::Ok
332                        } else {
333                            ChecksumStatus::Mismatch
334                        };
335                        ChecksumVerificationResult {
336                            path: file_info.path.clone(),
337                            status,
338                            expected,
339                            actual: Some(actual),
340                        }
341                    }
342                    Err(_) => ChecksumVerificationResult {
343                        path: file_info.path.clone(),
344                        status: ChecksumStatus::Missing,
345                        expected,
346                        actual: None,
347                    },
348                }
349            })
350            .collect()
351    }
352
353    /// Writes the manifest to a JSON file.
354    pub fn write_to_file(&self, path: &Path) -> std::io::Result<()> {
355        let json = serde_json::to_string_pretty(self)?;
356        let mut file = File::create(path)?;
357        file.write_all(json.as_bytes())?;
358        Ok(())
359    }
360
361    /// Returns the run ID.
362    pub fn run_id(&self) -> &str {
363        &self.run_id
364    }
365}
366
367// Note: ScenarioConfig is now defined in datasynth-config/src/schema.rs
368// and exported via datasynth_config::schema::ScenarioConfig
369
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use datasynth_config::schema::*;

    /// Builds a small single-company configuration used by every test below.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: datasynth_core::models::IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                worker_threads: 1,
                memory_limit_mb: 512,
            },
            companies: vec![CompanyConfig {
                code: "TEST".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig::default(),
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
            source_to_pay: Default::default(),
            financial_reporting: Default::default(),
            hr: Default::default(),
            manufacturing: Default::default(),
            sales_quotes: Default::default(),
        }
    }

    #[test]
    fn test_run_manifest_creation() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);

        assert!(!manifest.run_id.is_empty());
        assert_eq!(manifest.seed, 42);
        assert!(!manifest.config_hash.is_empty());
        assert!(manifest.completed_at.is_none());
    }

    #[test]
    fn test_run_manifest_completion() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        // Simulate some work
        std::thread::sleep(std::time::Duration::from_millis(10));

        let stats = EnhancedGenerationStatistics {
            total_entries: 100,
            total_line_items: 500,
            ..Default::default()
        };
        manifest.complete(stats);

        assert!(manifest.completed_at.is_some());
        assert!(manifest.duration_seconds.unwrap() >= 0.01);
        assert_eq!(manifest.statistics.as_ref().unwrap().total_entries, 100);
    }

    #[test]
    fn test_config_hash_consistency() {
        let config = create_test_config();
        let hash1 = RunManifest::hash_config(&config);
        let hash2 = RunManifest::hash_config(&config);

        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_scenario_tags() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_tag("fraud_detection");
        manifest.add_tag("retail");
        manifest.add_tag("fraud_detection"); // Duplicate

        assert_eq!(manifest.scenario_tags.len(), 2);
        assert!(manifest
            .scenario_tags
            .contains(&"fraud_detection".to_string()));
        assert!(manifest.scenario_tags.contains(&"retail".to_string()));
    }

    #[test]
    fn test_output_file_tracking() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_output_file(OutputFileInfo {
            path: "journal_entries.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(1000),
            size_bytes: Some(102400),
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        assert_eq!(manifest.output_files.len(), 1);
        assert_eq!(manifest.output_files[0].record_count, Some(1000));
    }

    #[test]
    fn test_manifest_version() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);
        assert_eq!(manifest.manifest_version, "2.0");
    }

    #[test]
    fn test_backward_compat_deserialize() {
        // Old manifest JSON without manifest_version or checksum fields
        let old_json = r#"{
            "run_id": "test-123",
            "started_at": "2024-01-01T00:00:00Z",
            "completed_at": null,
            "config_hash": "abc123",
            "config_snapshot": null,
            "seed": 42,
            "duration_seconds": null,
            "generator_version": "0.4.0",
            "output_directory": null,
            "output_files": [
                {
                    "path": "data.csv",
                    "format": "csv",
                    "record_count": 100,
                    "size_bytes": 1024
                }
            ]
        }"#;

        // The fixture's `config_snapshot` is null, so it is not deserialized
        // as a whole `RunManifest` here. Parse it generically and then check
        // the pieces that carry the backward-compatibility defaults.
        let value: serde_json::Value =
            serde_json::from_str(old_json).expect("old manifest is valid JSON");

        // A legacy file entry lacks the checksum and record-index fields;
        // they must default to `None` instead of failing deserialization.
        let file: OutputFileInfo = serde_json::from_value(value["output_files"][0].clone())
            .expect("legacy output file entry deserializes");
        assert_eq!(file.path, "data.csv");
        assert_eq!(file.format, "csv");
        assert_eq!(file.record_count, Some(100));
        assert_eq!(file.size_bytes, Some(1024));
        assert!(file.sha256_checksum.is_none());
        assert!(file.first_record_index.is_none());
        assert!(file.last_record_index.is_none());

        // A manifest without `manifest_version` falls back to this value.
        assert_eq!(default_manifest_version(), "2.0");
    }

    #[test]
    fn test_checksum_computation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("test.txt");
        std::fs::write(&file_path, b"hello world").expect("write file");

        let checksum = compute_file_checksum(&file_path).expect("compute checksum");
        // SHA-256 of "hello world"
        assert_eq!(
            checksum,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    #[test]
    fn test_populate_and_verify_checksums() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"id,name\n1,Alice\n2,Bob\n").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(2),
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        assert!(manifest.output_files[0].sha256_checksum.is_some());
        assert!(manifest.output_files[0].size_bytes.is_some());

        // Verify should pass
        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, ChecksumStatus::Ok);
    }

    #[test]
    fn test_verify_detects_mismatch() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"original content").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        // Modify file after checksum
        std::fs::write(&file_path, b"modified content").expect("write file");

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Mismatch);
    }

    #[test]
    fn test_verify_missing_file() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "nonexistent.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: Some("abc123".to_string()),
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Missing);
    }

    #[test]
    fn test_verify_no_checksum() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::NoChecksum);
    }
}
671}