Skip to main content

datasynth_runtime/
run_manifest.rs

1//! Run manifest and metadata tracking for reproducibility.
2//!
3//! This module provides structures for capturing complete generation run metadata,
4//! enabling reproducibility and traceability of generated data.
5
6use chrono::{DateTime, Utc};
7use datasynth_config::schema::GeneratorConfig;
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::fs::File;
12use std::io::{self, BufReader, Read as _, Write};
13use std::path::Path;
14use uuid::Uuid;
15
16use super::EnhancedGenerationStatistics;
17
/// Complete manifest of a generation run for reproducibility.
///
/// The manifest is (de)serialized with serde. Fields marked `#[serde(default)]`
/// fall back to their default value when they are absent from the input, and
/// the optional lineage / quality-gate / phase summaries at the end of the
/// struct are left out of the serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunManifest {
    /// Manifest format version.
    ///
    /// Filled in by `default_manifest_version` when the field is missing on input.
    #[serde(default = "default_manifest_version")]
    pub manifest_version: String,
    /// Unique identifier for this run (a v4 UUID string when built via `RunManifest::new`).
    pub run_id: String,
    /// Timestamp when generation started.
    pub started_at: DateTime<Utc>,
    /// Timestamp when generation completed; `None` on a freshly created manifest
    /// and set by `RunManifest::complete`.
    pub completed_at: Option<DateTime<Utc>>,
    /// SHA-256 hash of the configuration (for quick comparison), hex-encoded.
    pub config_hash: String,
    /// Complete configuration snapshot.
    pub config_snapshot: GeneratorConfig,
    /// Seed used for random number generation.
    pub seed: u64,
    /// Scenario tags for categorization. `RunManifest::add_tag` keeps them unique.
    #[serde(default)]
    pub scenario_tags: Vec<String>,
    /// Generation statistics, recorded by `RunManifest::complete`.
    #[serde(default)]
    pub statistics: Option<EnhancedGenerationStatistics>,
    /// Duration in seconds between `started_at` and `completed_at`
    /// (derived from the millisecond difference in `RunManifest::complete`).
    pub duration_seconds: Option<f64>,
    /// Version of the generator (the crate's `CARGO_PKG_VERSION` when built via `RunManifest::new`).
    pub generator_version: String,
    /// Additional free-form key/value metadata.
    #[serde(default)]
    pub metadata: HashMap<String, String>,
    /// Output directory path, stored as a display string.
    pub output_directory: Option<String>,
    /// List of output files generated.
    #[serde(default)]
    pub output_files: Vec<OutputFileInfo>,
    /// Any warnings or notes from the generation.
    #[serde(default)]
    pub warnings: Vec<String>,
    /// Data lineage graph tracking config → generator → output relationships.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lineage: Option<super::lineage::LineageGraph>,
    /// Quality gate evaluation result.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub quality_gate_result: Option<QualityGateResultSummary>,
    /// LLM enrichment phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub llm_enrichment: Option<LlmEnrichmentSummary>,
    /// Diffusion enhancement phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub diffusion_model: Option<DiffusionModelSummary>,
    /// Causal generation phase summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub causal_generation: Option<CausalGenerationSummary>,
}
73
/// Summary of LLM enrichment phase for the run manifest.
///
/// Stored in `RunManifest::llm_enrichment`; all fields are required when
/// deserializing (none carry a serde default).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmEnrichmentSummary {
    /// Whether LLM enrichment was enabled.
    pub enabled: bool,
    /// Execution time in milliseconds.
    pub timing_ms: u64,
    /// Number of vendors enriched.
    pub vendors_enriched: usize,
    /// Provider used (e.g., "mock", "openai").
    pub provider: String,
}
86
/// Summary of diffusion enhancement phase for the run manifest.
///
/// Stored in `RunManifest::diffusion_model`; all fields are required when
/// deserializing (none carry a serde default).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffusionModelSummary {
    /// Whether diffusion enhancement was enabled.
    pub enabled: bool,
    /// Execution time in milliseconds.
    pub timing_ms: u64,
    /// Number of samples generated.
    pub samples_generated: usize,
    /// Number of diffusion steps used.
    pub n_steps: usize,
}
99
/// Summary of causal generation phase for the run manifest.
///
/// Stored in `RunManifest::causal_generation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CausalGenerationSummary {
    /// Whether causal generation was enabled.
    pub enabled: bool,
    /// Execution time in milliseconds.
    pub timing_ms: u64,
    /// Number of causal samples generated.
    pub samples_generated: usize,
    /// Template used (e.g., "fraud_detection", "revenue_cycle").
    pub template: String,
    /// Whether causal validation passed (None if validation was not run).
    pub validation_passed: Option<bool>,
}
114
/// Summary of quality gate evaluation for the run manifest.
///
/// Stored in `RunManifest::quality_gate_result`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityGateResultSummary {
    /// Whether all gates passed.
    pub passed: bool,
    /// Profile name used.
    pub profile_name: String,
    /// Number of gates that passed.
    pub gates_passed: usize,
    /// Total number of gates evaluated.
    pub gates_total: usize,
    /// Names of failed gates.
    pub failed_gates: Vec<String>,
}
129
/// Serde default for `RunManifest::manifest_version`: the version string
/// assumed for manifests that do not carry the field.
fn default_manifest_version() -> String {
    String::from("2.0")
}
133
/// Information about an output file.
///
/// The checksum and record-index fields default to `None` when absent from the
/// input and are omitted from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputFileInfo {
    /// Relative path from output directory.
    pub path: String,
    /// File format (csv, json, parquet).
    pub format: String,
    /// Record count.
    pub record_count: Option<usize>,
    /// File size in bytes. `RunManifest::populate_file_checksums` fills this in
    /// from file metadata when it is still `None`.
    pub size_bytes: Option<u64>,
    /// SHA-256 checksum of the file contents (hex string as produced by
    /// `compute_file_checksum`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sha256_checksum: Option<String>,
    /// Index of the first record in this file (for partitioned outputs).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub first_record_index: Option<u64>,
    /// Index of the last record in this file (for partitioned outputs).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_record_index: Option<u64>,
}
155
/// Result of verifying a single file's checksum.
///
/// Produced per output file by `RunManifest::verify_file_checksums`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChecksumVerificationResult {
    /// Relative path of the file.
    pub path: String,
    /// Verification status.
    pub status: ChecksumStatus,
    /// Expected checksum (from manifest); `None` when the manifest recorded none.
    pub expected: Option<String>,
    /// Actual checksum (computed from file); `None` when the file was not hashed
    /// (no recorded checksum, file missing, or file unreadable).
    pub actual: Option<String>,
}
168
/// Status of a checksum verification.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChecksumStatus {
    /// Checksum matches.
    Ok,
    /// Checksum does not match.
    Mismatch,
    /// File is missing on disk.
    ///
    /// `RunManifest::verify_file_checksums` also reports this status when the
    /// file exists but reading it fails.
    Missing,
    /// No checksum recorded in manifest.
    NoChecksum,
}
181
182/// Computes the SHA-256 checksum of a file, streaming in 8KB chunks.
183pub fn compute_file_checksum(path: &Path) -> io::Result<String> {
184    let file = File::open(path)?;
185    let mut reader = BufReader::new(file);
186    let mut hasher = Sha256::new();
187    let mut buffer = [0u8; 8192];
188    loop {
189        let bytes_read = reader.read(&mut buffer)?;
190        if bytes_read == 0 {
191            break;
192        }
193        hasher.update(&buffer[..bytes_read]);
194    }
195    Ok(hex::encode(hasher.finalize()))
196}
197
198impl RunManifest {
199    /// Creates a new run manifest.
200    pub fn new(config: &GeneratorConfig, seed: u64) -> Self {
201        let run_id = Uuid::new_v4().to_string();
202        let config_hash = Self::hash_config(config);
203
204        Self {
205            manifest_version: "2.0".to_string(),
206            run_id,
207            started_at: Utc::now(),
208            completed_at: None,
209            config_hash,
210            config_snapshot: config.clone(),
211            seed,
212            scenario_tags: Vec::new(),
213            statistics: None,
214            duration_seconds: None,
215            generator_version: env!("CARGO_PKG_VERSION").to_string(),
216            metadata: HashMap::new(),
217            output_directory: None,
218            output_files: Vec::new(),
219            warnings: Vec::new(),
220            lineage: None,
221            quality_gate_result: None,
222            llm_enrichment: None,
223            diffusion_model: None,
224            causal_generation: None,
225        }
226    }
227
228    /// Computes SHA-256 hash of the configuration.
229    fn hash_config(config: &GeneratorConfig) -> String {
230        let json = match serde_json::to_string(config) {
231            Ok(j) => j,
232            Err(e) => {
233                tracing::warn!("Failed to serialize config for hashing: {}", e);
234                String::new()
235            }
236        };
237        let mut hasher = Sha256::new();
238        hasher.update(json.as_bytes());
239        let result = hasher.finalize();
240        hex::encode(result)
241    }
242
243    /// Marks the run as complete.
244    pub fn complete(&mut self, statistics: EnhancedGenerationStatistics) {
245        let now = Utc::now();
246        self.completed_at = Some(now);
247        self.duration_seconds = Some((now - self.started_at).num_milliseconds() as f64 / 1000.0);
248        self.statistics = Some(statistics);
249    }
250
251    /// Adds a scenario tag.
252    pub fn add_tag(&mut self, tag: &str) {
253        if !self.scenario_tags.contains(&tag.to_string()) {
254            self.scenario_tags.push(tag.to_string());
255        }
256    }
257
258    /// Adds multiple scenario tags.
259    pub fn add_tags(&mut self, tags: &[String]) {
260        for tag in tags {
261            self.add_tag(tag);
262        }
263    }
264
265    /// Sets the output directory.
266    pub fn set_output_directory(&mut self, path: &Path) {
267        self.output_directory = Some(path.display().to_string());
268    }
269
270    /// Adds an output file record.
271    pub fn add_output_file(&mut self, info: OutputFileInfo) {
272        self.output_files.push(info);
273    }
274
275    /// Adds a warning message.
276    pub fn add_warning(&mut self, warning: &str) {
277        self.warnings.push(warning.to_string());
278    }
279
280    /// Adds metadata.
281    pub fn add_metadata(&mut self, key: &str, value: &str) {
282        self.metadata.insert(key.to_string(), value.to_string());
283    }
284
285    /// Populates SHA-256 checksums for all output files.
286    ///
287    /// Resolves each file path relative to `base_dir` and computes its checksum.
288    /// Also populates `size_bytes` if not already set.
289    pub fn populate_file_checksums(&mut self, base_dir: &Path) {
290        for file_info in &mut self.output_files {
291            let file_path = base_dir.join(&file_info.path);
292            if file_path.exists() {
293                if let Ok(checksum) = compute_file_checksum(&file_path) {
294                    file_info.sha256_checksum = Some(checksum);
295                }
296                if file_info.size_bytes.is_none() {
297                    if let Ok(metadata) = std::fs::metadata(&file_path) {
298                        file_info.size_bytes = Some(metadata.len());
299                    }
300                }
301            }
302        }
303    }
304
305    /// Verifies checksums for all output files against their recorded values.
306    pub fn verify_file_checksums(&self, base_dir: &Path) -> Vec<ChecksumVerificationResult> {
307        self.output_files
308            .iter()
309            .map(|file_info| {
310                let file_path = base_dir.join(&file_info.path);
311
312                let expected = file_info.sha256_checksum.clone();
313                if expected.is_none() {
314                    return ChecksumVerificationResult {
315                        path: file_info.path.clone(),
316                        status: ChecksumStatus::NoChecksum,
317                        expected: None,
318                        actual: None,
319                    };
320                }
321
322                if !file_path.exists() {
323                    return ChecksumVerificationResult {
324                        path: file_info.path.clone(),
325                        status: ChecksumStatus::Missing,
326                        expected,
327                        actual: None,
328                    };
329                }
330
331                match compute_file_checksum(&file_path) {
332                    Ok(actual) => {
333                        let status = if expected.as_deref() == Some(actual.as_str()) {
334                            ChecksumStatus::Ok
335                        } else {
336                            ChecksumStatus::Mismatch
337                        };
338                        ChecksumVerificationResult {
339                            path: file_info.path.clone(),
340                            status,
341                            expected,
342                            actual: Some(actual),
343                        }
344                    }
345                    Err(_) => ChecksumVerificationResult {
346                        path: file_info.path.clone(),
347                        status: ChecksumStatus::Missing,
348                        expected,
349                        actual: None,
350                    },
351                }
352            })
353            .collect()
354    }
355
356    /// Writes the manifest to a JSON file.
357    pub fn write_to_file(&self, path: &Path) -> std::io::Result<()> {
358        let json = serde_json::to_string_pretty(self)?;
359        let mut file = File::create(path)?;
360        file.write_all(json.as_bytes())?;
361        Ok(())
362    }
363
364    /// Returns the run ID.
365    pub fn run_id(&self) -> &str {
366        &self.run_id
367    }
368}
369
370// Note: ScenarioConfig is now defined in datasynth-config/src/schema.rs
371// and exported via datasynth_config::schema::ScenarioConfig
372
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use datasynth_config::schema::*;

    /// Builds a minimal single-company configuration used by every test below.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: datasynth_core::models::IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                worker_threads: 1,
                memory_limit_mb: 512,
                fiscal_year_months: None,
            },
            companies: vec![CompanyConfig {
                code: "TEST".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig::default(),
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
            source_to_pay: Default::default(),
            financial_reporting: Default::default(),
            hr: Default::default(),
            manufacturing: Default::default(),
            sales_quotes: Default::default(),
            tax: Default::default(),
            treasury: Default::default(),
            project_accounting: Default::default(),
            esg: Default::default(),
            country_packs: None,
            scenarios: Default::default(),
            session: Default::default(),
        }
    }

    #[test]
    fn test_run_manifest_creation() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);

        assert!(!manifest.run_id.is_empty());
        assert_eq!(manifest.seed, 42);
        assert!(!manifest.config_hash.is_empty());
        assert!(manifest.completed_at.is_none());
    }

    #[test]
    fn test_run_manifest_completion() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        // Simulate some work
        std::thread::sleep(std::time::Duration::from_millis(10));

        let stats = EnhancedGenerationStatistics {
            total_entries: 100,
            total_line_items: 500,
            ..Default::default()
        };
        manifest.complete(stats);

        assert!(manifest.completed_at.is_some());
        assert!(manifest.duration_seconds.unwrap() >= 0.01);
        assert_eq!(manifest.statistics.as_ref().unwrap().total_entries, 100);
    }

    #[test]
    fn test_config_hash_consistency() {
        let config = create_test_config();
        let hash1 = RunManifest::hash_config(&config);
        let hash2 = RunManifest::hash_config(&config);

        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_scenario_tags() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_tag("fraud_detection");
        manifest.add_tag("retail");
        manifest.add_tag("fraud_detection"); // Duplicate

        assert_eq!(manifest.scenario_tags.len(), 2);
        assert!(manifest
            .scenario_tags
            .contains(&"fraud_detection".to_string()));
        assert!(manifest.scenario_tags.contains(&"retail".to_string()));
    }

    #[test]
    fn test_output_file_tracking() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_output_file(OutputFileInfo {
            path: "journal_entries.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(1000),
            size_bytes: Some(102400),
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        assert_eq!(manifest.output_files.len(), 1);
        assert_eq!(manifest.output_files[0].record_count, Some(1000));
    }

    #[test]
    fn test_manifest_version() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);
        assert_eq!(manifest.manifest_version, "2.0");
    }

    #[test]
    fn test_backward_compat_deserialize() {
        // Old manifest JSON without manifest_version or checksum fields
        let old_json = r#"{
            "run_id": "test-123",
            "started_at": "2024-01-01T00:00:00Z",
            "completed_at": null,
            "config_hash": "abc123",
            "config_snapshot": null,
            "seed": 42,
            "duration_seconds": null,
            "generator_version": "0.4.0",
            "output_directory": null,
            "output_files": [
                {
                    "path": "data.csv",
                    "format": "csv",
                    "record_count": 100,
                    "size_bytes": 1024
                }
            ]
        }"#;

        // The fixture cannot be turned into a full `RunManifest` because its
        // `config_snapshot` is null, so parse it generically first...
        let parsed: serde_json::Value =
            serde_json::from_str(old_json).expect("legacy manifest JSON parses");

        // ...and then check the part that is fully representable here: legacy
        // output-file entries must deserialize, with the newer optional fields
        // falling back to `None`.
        let files: Vec<OutputFileInfo> = serde_json::from_value(parsed["output_files"].clone())
            .expect("legacy output file entries deserialize");
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, "data.csv");
        assert_eq!(files[0].format, "csv");
        assert_eq!(files[0].record_count, Some(100));
        assert_eq!(files[0].size_bytes, Some(1024));
        assert!(files[0].sha256_checksum.is_none());
        assert!(files[0].first_record_index.is_none());
        assert!(files[0].last_record_index.is_none());
    }

    #[test]
    fn test_checksum_computation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("test.txt");
        std::fs::write(&file_path, b"hello world").expect("write file");

        let checksum = compute_file_checksum(&file_path).expect("compute checksum");
        // SHA-256 of "hello world"
        assert_eq!(
            checksum,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    #[test]
    fn test_populate_and_verify_checksums() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"id,name\n1,Alice\n2,Bob\n").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(2),
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        assert!(manifest.output_files[0].sha256_checksum.is_some());
        assert!(manifest.output_files[0].size_bytes.is_some());

        // Verify should pass
        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, ChecksumStatus::Ok);
    }

    #[test]
    fn test_verify_detects_mismatch() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"original content").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        // Modify file after checksum
        std::fs::write(&file_path, b"modified content").expect("write file");

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Mismatch);
    }

    #[test]
    fn test_verify_missing_file() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "nonexistent.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: Some("abc123".to_string()),
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Missing);
    }

    #[test]
    fn test_verify_no_checksum() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::NoChecksum);
    }
}