// datasynth_runtime/run_manifest.rs

//! Run manifest and metadata tracking for reproducibility.
//!
//! This module provides structures for capturing complete generation run metadata,
//! enabling reproducibility and traceability of generated data.
5
use chrono::{DateTime, Utc};
use datasynth_config::schema::GeneratorConfig;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufReader, Read as _, Write};
use std::path::Path;
use uuid::Uuid;

use super::EnhancedGenerationStatistics;
17
18/// Complete manifest of a generation run for reproducibility.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct RunManifest {
21    /// Manifest format version.
22    #[serde(default = "default_manifest_version")]
23    pub manifest_version: String,
24    /// Unique identifier for this run.
25    pub run_id: String,
26    /// Timestamp when generation started.
27    pub started_at: DateTime<Utc>,
28    /// Timestamp when generation completed.
29    pub completed_at: Option<DateTime<Utc>>,
30    /// SHA-256 hash of the configuration (for quick comparison).
31    pub config_hash: String,
32    /// Complete configuration snapshot.
33    pub config_snapshot: GeneratorConfig,
34    /// Seed used for random number generation.
35    pub seed: u64,
36    /// Scenario tags for categorization.
37    #[serde(default)]
38    pub scenario_tags: Vec<String>,
39    /// Generation statistics.
40    #[serde(default)]
41    pub statistics: Option<EnhancedGenerationStatistics>,
42    /// Duration in seconds.
43    pub duration_seconds: Option<f64>,
44    /// Version of the generator.
45    pub generator_version: String,
46    /// Additional metadata.
47    #[serde(default)]
48    pub metadata: HashMap<String, String>,
49    /// Output directory path.
50    pub output_directory: Option<String>,
51    /// List of output files generated.
52    #[serde(default)]
53    pub output_files: Vec<OutputFileInfo>,
54    /// Any warnings or notes from the generation.
55    #[serde(default)]
56    pub warnings: Vec<String>,
57    /// Data lineage graph tracking config → generator → output relationships.
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub lineage: Option<super::lineage::LineageGraph>,
60    /// Quality gate evaluation result.
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub quality_gate_result: Option<QualityGateResultSummary>,
63    /// LLM enrichment phase summary.
64    #[serde(default, skip_serializing_if = "Option::is_none")]
65    pub llm_enrichment: Option<LlmEnrichmentSummary>,
66    /// Diffusion enhancement phase summary.
67    #[serde(default, skip_serializing_if = "Option::is_none")]
68    pub diffusion_model: Option<DiffusionModelSummary>,
69    /// Causal generation phase summary.
70    #[serde(default, skip_serializing_if = "Option::is_none")]
71    pub causal_generation: Option<CausalGenerationSummary>,
72}
73
74/// Summary of LLM enrichment phase for the run manifest.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct LlmEnrichmentSummary {
77    /// Whether LLM enrichment was enabled.
78    pub enabled: bool,
79    /// Execution time in milliseconds.
80    pub timing_ms: u64,
81    /// Number of vendors enriched.
82    pub vendors_enriched: usize,
83    /// Provider used (e.g., "mock", "openai").
84    pub provider: String,
85}
86
87/// Summary of diffusion enhancement phase for the run manifest.
88#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct DiffusionModelSummary {
90    /// Whether diffusion enhancement was enabled.
91    pub enabled: bool,
92    /// Execution time in milliseconds.
93    pub timing_ms: u64,
94    /// Number of samples generated.
95    pub samples_generated: usize,
96    /// Number of diffusion steps used.
97    pub n_steps: usize,
98}
99
100/// Summary of causal generation phase for the run manifest.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct CausalGenerationSummary {
103    /// Whether causal generation was enabled.
104    pub enabled: bool,
105    /// Execution time in milliseconds.
106    pub timing_ms: u64,
107    /// Number of causal samples generated.
108    pub samples_generated: usize,
109    /// Template used (e.g., "fraud_detection", "revenue_cycle").
110    pub template: String,
111    /// Whether causal validation passed (None if validation was not run).
112    pub validation_passed: Option<bool>,
113}
114
115/// Summary of quality gate evaluation for the run manifest.
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct QualityGateResultSummary {
118    /// Whether all gates passed.
119    pub passed: bool,
120    /// Profile name used.
121    pub profile_name: String,
122    /// Number of gates that passed.
123    pub gates_passed: usize,
124    /// Total number of gates evaluated.
125    pub gates_total: usize,
126    /// Names of failed gates.
127    pub failed_gates: Vec<String>,
128}
129
/// Returns the manifest format version used when a deserialized manifest
/// does not carry a `manifest_version` field.
fn default_manifest_version() -> String {
    String::from("2.0")
}
133
134/// Information about an output file.
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct OutputFileInfo {
137    /// Relative path from output directory.
138    pub path: String,
139    /// File format (csv, json, parquet).
140    pub format: String,
141    /// Record count.
142    pub record_count: Option<usize>,
143    /// File size in bytes.
144    pub size_bytes: Option<u64>,
145    /// SHA-256 checksum of the file contents.
146    #[serde(default, skip_serializing_if = "Option::is_none")]
147    pub sha256_checksum: Option<String>,
148    /// Index of the first record in this file (for partitioned outputs).
149    #[serde(default, skip_serializing_if = "Option::is_none")]
150    pub first_record_index: Option<u64>,
151    /// Index of the last record in this file (for partitioned outputs).
152    #[serde(default, skip_serializing_if = "Option::is_none")]
153    pub last_record_index: Option<u64>,
154}
155
156/// Result of verifying a single file's checksum.
157#[derive(Debug, Clone, Serialize, Deserialize)]
158pub struct ChecksumVerificationResult {
159    /// Relative path of the file.
160    pub path: String,
161    /// Verification status.
162    pub status: ChecksumStatus,
163    /// Expected checksum (from manifest).
164    pub expected: Option<String>,
165    /// Actual checksum (computed from file).
166    pub actual: Option<String>,
167}
168
169/// Status of a checksum verification.
170#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
171pub enum ChecksumStatus {
172    /// Checksum matches.
173    Ok,
174    /// Checksum does not match.
175    Mismatch,
176    /// File is missing on disk.
177    Missing,
178    /// No checksum recorded in manifest.
179    NoChecksum,
180}
181
182/// Computes the SHA-256 checksum of a file, streaming in 8KB chunks.
183pub fn compute_file_checksum(path: &Path) -> io::Result<String> {
184    let file = File::open(path)?;
185    let mut reader = BufReader::new(file);
186    let mut hasher = Sha256::new();
187    let mut buffer = [0u8; 8192];
188    loop {
189        let bytes_read = reader.read(&mut buffer)?;
190        if bytes_read == 0 {
191            break;
192        }
193        hasher.update(&buffer[..bytes_read]);
194    }
195    Ok(hex::encode(hasher.finalize()))
196}
197
198impl RunManifest {
199    /// Creates a new run manifest.
200    pub fn new(config: &GeneratorConfig, seed: u64) -> Self {
201        let run_id = Uuid::new_v4().to_string();
202        let config_hash = Self::hash_config(config);
203
204        Self {
205            manifest_version: "2.0".to_string(),
206            run_id,
207            started_at: Utc::now(),
208            completed_at: None,
209            config_hash,
210            config_snapshot: config.clone(),
211            seed,
212            scenario_tags: Vec::new(),
213            statistics: None,
214            duration_seconds: None,
215            generator_version: env!("CARGO_PKG_VERSION").to_string(),
216            metadata: HashMap::new(),
217            output_directory: None,
218            output_files: Vec::new(),
219            warnings: Vec::new(),
220            lineage: None,
221            quality_gate_result: None,
222            llm_enrichment: None,
223            diffusion_model: None,
224            causal_generation: None,
225        }
226    }
227
228    /// Computes SHA-256 hash of the configuration.
229    fn hash_config(config: &GeneratorConfig) -> String {
230        let json = serde_json::to_string(config).unwrap_or_default();
231        let mut hasher = Sha256::new();
232        hasher.update(json.as_bytes());
233        let result = hasher.finalize();
234        hex::encode(result)
235    }
236
237    /// Marks the run as complete.
238    pub fn complete(&mut self, statistics: EnhancedGenerationStatistics) {
239        self.completed_at = Some(Utc::now());
240        self.duration_seconds = Some(
241            (self.completed_at.expect("completed_at just set above") - self.started_at)
242                .num_milliseconds() as f64
243                / 1000.0,
244        );
245        self.statistics = Some(statistics);
246    }
247
248    /// Adds a scenario tag.
249    pub fn add_tag(&mut self, tag: &str) {
250        if !self.scenario_tags.contains(&tag.to_string()) {
251            self.scenario_tags.push(tag.to_string());
252        }
253    }
254
255    /// Adds multiple scenario tags.
256    pub fn add_tags(&mut self, tags: &[String]) {
257        for tag in tags {
258            self.add_tag(tag);
259        }
260    }
261
262    /// Sets the output directory.
263    pub fn set_output_directory(&mut self, path: &Path) {
264        self.output_directory = Some(path.display().to_string());
265    }
266
267    /// Adds an output file record.
268    pub fn add_output_file(&mut self, info: OutputFileInfo) {
269        self.output_files.push(info);
270    }
271
272    /// Adds a warning message.
273    pub fn add_warning(&mut self, warning: &str) {
274        self.warnings.push(warning.to_string());
275    }
276
277    /// Adds metadata.
278    pub fn add_metadata(&mut self, key: &str, value: &str) {
279        self.metadata.insert(key.to_string(), value.to_string());
280    }
281
282    /// Populates SHA-256 checksums for all output files.
283    ///
284    /// Resolves each file path relative to `base_dir` and computes its checksum.
285    /// Also populates `size_bytes` if not already set.
286    pub fn populate_file_checksums(&mut self, base_dir: &Path) {
287        for file_info in &mut self.output_files {
288            let file_path = base_dir.join(&file_info.path);
289            if file_path.exists() {
290                if let Ok(checksum) = compute_file_checksum(&file_path) {
291                    file_info.sha256_checksum = Some(checksum);
292                }
293                if file_info.size_bytes.is_none() {
294                    if let Ok(metadata) = std::fs::metadata(&file_path) {
295                        file_info.size_bytes = Some(metadata.len());
296                    }
297                }
298            }
299        }
300    }
301
302    /// Verifies checksums for all output files against their recorded values.
303    pub fn verify_file_checksums(&self, base_dir: &Path) -> Vec<ChecksumVerificationResult> {
304        self.output_files
305            .iter()
306            .map(|file_info| {
307                let file_path = base_dir.join(&file_info.path);
308
309                let expected = file_info.sha256_checksum.clone();
310                if expected.is_none() {
311                    return ChecksumVerificationResult {
312                        path: file_info.path.clone(),
313                        status: ChecksumStatus::NoChecksum,
314                        expected: None,
315                        actual: None,
316                    };
317                }
318
319                if !file_path.exists() {
320                    return ChecksumVerificationResult {
321                        path: file_info.path.clone(),
322                        status: ChecksumStatus::Missing,
323                        expected,
324                        actual: None,
325                    };
326                }
327
328                match compute_file_checksum(&file_path) {
329                    Ok(actual) => {
330                        let status = if expected.as_deref() == Some(actual.as_str()) {
331                            ChecksumStatus::Ok
332                        } else {
333                            ChecksumStatus::Mismatch
334                        };
335                        ChecksumVerificationResult {
336                            path: file_info.path.clone(),
337                            status,
338                            expected,
339                            actual: Some(actual),
340                        }
341                    }
342                    Err(_) => ChecksumVerificationResult {
343                        path: file_info.path.clone(),
344                        status: ChecksumStatus::Missing,
345                        expected,
346                        actual: None,
347                    },
348                }
349            })
350            .collect()
351    }
352
353    /// Writes the manifest to a JSON file.
354    pub fn write_to_file(&self, path: &Path) -> std::io::Result<()> {
355        let json = serde_json::to_string_pretty(self)?;
356        let mut file = File::create(path)?;
357        file.write_all(json.as_bytes())?;
358        Ok(())
359    }
360
361    /// Returns the run ID.
362    pub fn run_id(&self) -> &str {
363        &self.run_id
364    }
365}
366
// Note: ScenarioConfig is now defined in datasynth-config/src/schema.rs
// and exported via datasynth_config::schema::ScenarioConfig
369
#[cfg(test)]
// Tests may unwrap freely: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use datasynth_config::schema::*;

    /// Builds a minimal single-company configuration used by all tests.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: datasynth_core::models::IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                worker_threads: 1,
                memory_limit_mb: 512,
            },
            companies: vec![CompanyConfig {
                code: "TEST".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig::default(),
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
            source_to_pay: Default::default(),
            financial_reporting: Default::default(),
            hr: Default::default(),
            manufacturing: Default::default(),
            sales_quotes: Default::default(),
            tax: Default::default(),
            treasury: Default::default(),
            project_accounting: Default::default(),
            esg: Default::default(),
        }
    }

    #[test]
    fn test_run_manifest_creation() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);

        assert!(!manifest.run_id.is_empty());
        assert_eq!(manifest.seed, 42);
        assert!(!manifest.config_hash.is_empty());
        assert!(manifest.completed_at.is_none());
    }

    #[test]
    fn test_run_manifest_completion() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        // Simulate some work
        std::thread::sleep(std::time::Duration::from_millis(10));

        let stats = EnhancedGenerationStatistics {
            total_entries: 100,
            total_line_items: 500,
            ..Default::default()
        };
        manifest.complete(stats);

        assert!(manifest.completed_at.is_some());
        assert!(manifest.duration_seconds.unwrap() >= 0.01);
        assert_eq!(manifest.statistics.as_ref().unwrap().total_entries, 100);
    }

    #[test]
    fn test_config_hash_consistency() {
        let config = create_test_config();
        let hash1 = RunManifest::hash_config(&config);
        let hash2 = RunManifest::hash_config(&config);

        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_scenario_tags() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_tag("fraud_detection");
        manifest.add_tag("retail");
        manifest.add_tag("fraud_detection"); // Duplicate

        assert_eq!(manifest.scenario_tags.len(), 2);
        assert!(manifest
            .scenario_tags
            .contains(&"fraud_detection".to_string()));
        assert!(manifest.scenario_tags.contains(&"retail".to_string()));
    }

    #[test]
    fn test_output_file_tracking() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_output_file(OutputFileInfo {
            path: "journal_entries.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(1000),
            size_bytes: Some(102400),
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        assert_eq!(manifest.output_files.len(), 1);
        assert_eq!(manifest.output_files[0].record_count, Some(1000));
    }

    #[test]
    fn test_manifest_version() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);
        assert_eq!(manifest.manifest_version, "2.0");
    }

    #[test]
    fn test_backward_compat_deserialize() {
        // Old manifest JSON without manifest_version or checksum fields
        let old_json = r#"{
            "run_id": "test-123",
            "started_at": "2024-01-01T00:00:00Z",
            "completed_at": null,
            "config_hash": "abc123",
            "config_snapshot": null,
            "seed": 42,
            "duration_seconds": null,
            "generator_version": "0.4.0",
            "output_directory": null,
            "output_files": [
                {
                    "path": "data.csv",
                    "format": "csv",
                    "record_count": 100,
                    "size_bytes": 1024
                }
            ]
        }"#;

        // The full manifest cannot be deserialized from this fixture because
        // `config_snapshot` is null, so first check that the document parses.
        let result: Result<serde_json::Value, _> = serde_json::from_str(old_json);
        assert!(result.is_ok());

        // Then check the part that can be exercised: an old file entry without
        // checksum/index fields must deserialize with those fields defaulted.
        let parsed = result.unwrap();
        let files: Vec<OutputFileInfo> = serde_json::from_value(parsed["output_files"].clone())
            .expect("legacy output file entries deserialize");
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, "data.csv");
        assert_eq!(files[0].record_count, Some(100));
        assert_eq!(files[0].size_bytes, Some(1024));
        assert!(files[0].sha256_checksum.is_none());
        assert!(files[0].first_record_index.is_none());
        assert!(files[0].last_record_index.is_none());
    }

    #[test]
    fn test_checksum_computation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("test.txt");
        std::fs::write(&file_path, b"hello world").expect("write file");

        let checksum = compute_file_checksum(&file_path).expect("compute checksum");
        // SHA-256 of "hello world"
        assert_eq!(
            checksum,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    #[test]
    fn test_populate_and_verify_checksums() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"id,name\n1,Alice\n2,Bob\n").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(2),
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        assert!(manifest.output_files[0].sha256_checksum.is_some());
        assert!(manifest.output_files[0].size_bytes.is_some());

        // Verify should pass
        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, ChecksumStatus::Ok);
    }

    #[test]
    fn test_verify_detects_mismatch() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"original content").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        // Modify file after checksum
        std::fs::write(&file_path, b"modified content").expect("write file");

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Mismatch);
    }

    #[test]
    fn test_verify_missing_file() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "nonexistent.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: Some("abc123".to_string()),
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Missing);
    }

    #[test]
    fn test_verify_no_checksum() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::NoChecksum);
    }
}