Skip to main content

datasynth_runtime/
run_manifest.rs

1//! Run manifest and metadata tracking for reproducibility.
2//!
3//! This module provides structures for capturing complete generation run metadata,
4//! enabling reproducibility and traceability of generated data.
5
6use chrono::{DateTime, Utc};
7use datasynth_config::schema::GeneratorConfig;
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::fs::File;
12use std::io::{self, BufReader, Read as _, Write};
13use std::path::Path;
14use uuid::Uuid;
15
16use super::EnhancedGenerationStatistics;
17
18/// Complete manifest of a generation run for reproducibility.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct RunManifest {
21    /// Manifest format version.
22    #[serde(default = "default_manifest_version")]
23    pub manifest_version: String,
24    /// Unique identifier for this run.
25    pub run_id: String,
26    /// Timestamp when generation started.
27    pub started_at: DateTime<Utc>,
28    /// Timestamp when generation completed.
29    pub completed_at: Option<DateTime<Utc>>,
30    /// SHA-256 hash of the configuration (for quick comparison).
31    pub config_hash: String,
32    /// Complete configuration snapshot.
33    pub config_snapshot: GeneratorConfig,
34    /// Seed used for random number generation.
35    pub seed: u64,
36    /// Scenario tags for categorization.
37    #[serde(default)]
38    pub scenario_tags: Vec<String>,
39    /// Generation statistics.
40    #[serde(default)]
41    pub statistics: Option<EnhancedGenerationStatistics>,
42    /// Duration in seconds.
43    pub duration_seconds: Option<f64>,
44    /// Version of the generator.
45    pub generator_version: String,
46    /// Additional metadata.
47    #[serde(default)]
48    pub metadata: HashMap<String, String>,
49    /// Output directory path.
50    pub output_directory: Option<String>,
51    /// List of output files generated.
52    #[serde(default)]
53    pub output_files: Vec<OutputFileInfo>,
54    /// Any warnings or notes from the generation.
55    #[serde(default)]
56    pub warnings: Vec<String>,
57    /// Data lineage graph tracking config → generator → output relationships.
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub lineage: Option<super::lineage::LineageGraph>,
60    /// Quality gate evaluation result.
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub quality_gate_result: Option<QualityGateResultSummary>,
63    /// LLM enrichment phase summary.
64    #[serde(default, skip_serializing_if = "Option::is_none")]
65    pub llm_enrichment: Option<LlmEnrichmentSummary>,
66    /// Diffusion enhancement phase summary.
67    #[serde(default, skip_serializing_if = "Option::is_none")]
68    pub diffusion_model: Option<DiffusionModelSummary>,
69    /// Causal generation phase summary.
70    #[serde(default, skip_serializing_if = "Option::is_none")]
71    pub causal_generation: Option<CausalGenerationSummary>,
72}
73
74/// Summary of LLM enrichment phase for the run manifest.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct LlmEnrichmentSummary {
77    /// Whether LLM enrichment was enabled.
78    pub enabled: bool,
79    /// Execution time in milliseconds.
80    pub timing_ms: u64,
81    /// Number of vendors enriched.
82    pub vendors_enriched: usize,
83    /// Provider used (e.g., "mock", "openai").
84    pub provider: String,
85}
86
87/// Summary of diffusion enhancement phase for the run manifest.
88#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct DiffusionModelSummary {
90    /// Whether diffusion enhancement was enabled.
91    pub enabled: bool,
92    /// Execution time in milliseconds.
93    pub timing_ms: u64,
94    /// Number of samples generated.
95    pub samples_generated: usize,
96    /// Number of diffusion steps used.
97    pub n_steps: usize,
98}
99
100/// Summary of causal generation phase for the run manifest.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct CausalGenerationSummary {
103    /// Whether causal generation was enabled.
104    pub enabled: bool,
105    /// Execution time in milliseconds.
106    pub timing_ms: u64,
107    /// Number of causal samples generated.
108    pub samples_generated: usize,
109    /// Template used (e.g., "fraud_detection", "revenue_cycle").
110    pub template: String,
111    /// Whether causal validation passed (None if validation was not run).
112    pub validation_passed: Option<bool>,
113}
114
115/// Summary of quality gate evaluation for the run manifest.
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct QualityGateResultSummary {
118    /// Whether all gates passed.
119    pub passed: bool,
120    /// Profile name used.
121    pub profile_name: String,
122    /// Number of gates that passed.
123    pub gates_passed: usize,
124    /// Total number of gates evaluated.
125    pub gates_total: usize,
126    /// Names of failed gates.
127    pub failed_gates: Vec<String>,
128}
129
/// Manifest format version assumed when a serialized manifest does not carry one.
fn default_manifest_version() -> String {
    String::from("2.0")
}
133
134/// Information about an output file.
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct OutputFileInfo {
137    /// Relative path from output directory.
138    pub path: String,
139    /// File format (csv, json, parquet).
140    pub format: String,
141    /// Record count.
142    pub record_count: Option<usize>,
143    /// File size in bytes.
144    pub size_bytes: Option<u64>,
145    /// SHA-256 checksum of the file contents.
146    #[serde(default, skip_serializing_if = "Option::is_none")]
147    pub sha256_checksum: Option<String>,
148    /// Index of the first record in this file (for partitioned outputs).
149    #[serde(default, skip_serializing_if = "Option::is_none")]
150    pub first_record_index: Option<u64>,
151    /// Index of the last record in this file (for partitioned outputs).
152    #[serde(default, skip_serializing_if = "Option::is_none")]
153    pub last_record_index: Option<u64>,
154}
155
156/// Result of verifying a single file's checksum.
157#[derive(Debug, Clone, Serialize, Deserialize)]
158pub struct ChecksumVerificationResult {
159    /// Relative path of the file.
160    pub path: String,
161    /// Verification status.
162    pub status: ChecksumStatus,
163    /// Expected checksum (from manifest).
164    pub expected: Option<String>,
165    /// Actual checksum (computed from file).
166    pub actual: Option<String>,
167}
168
169/// Status of a checksum verification.
170#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
171pub enum ChecksumStatus {
172    /// Checksum matches.
173    Ok,
174    /// Checksum does not match.
175    Mismatch,
176    /// File is missing on disk.
177    Missing,
178    /// No checksum recorded in manifest.
179    NoChecksum,
180}
181
182/// Computes the SHA-256 checksum of a file, streaming in 8KB chunks.
183pub fn compute_file_checksum(path: &Path) -> io::Result<String> {
184    let file = File::open(path)?;
185    let mut reader = BufReader::new(file);
186    let mut hasher = Sha256::new();
187    let mut buffer = [0u8; 8192];
188    loop {
189        let bytes_read = reader.read(&mut buffer)?;
190        if bytes_read == 0 {
191            break;
192        }
193        hasher.update(&buffer[..bytes_read]);
194    }
195    Ok(hex::encode(hasher.finalize()))
196}
197
198impl RunManifest {
199    /// Creates a new run manifest.
200    pub fn new(config: &GeneratorConfig, seed: u64) -> Self {
201        let run_id = Uuid::new_v4().to_string();
202        let config_hash = Self::hash_config(config);
203
204        Self {
205            manifest_version: "2.0".to_string(),
206            run_id,
207            started_at: Utc::now(),
208            completed_at: None,
209            config_hash,
210            config_snapshot: config.clone(),
211            seed,
212            scenario_tags: Vec::new(),
213            statistics: None,
214            duration_seconds: None,
215            generator_version: env!("CARGO_PKG_VERSION").to_string(),
216            metadata: HashMap::new(),
217            output_directory: None,
218            output_files: Vec::new(),
219            warnings: Vec::new(),
220            lineage: None,
221            quality_gate_result: None,
222            llm_enrichment: None,
223            diffusion_model: None,
224            causal_generation: None,
225        }
226    }
227
228    /// Computes SHA-256 hash of the configuration.
229    fn hash_config(config: &GeneratorConfig) -> String {
230        let json = match serde_json::to_string(config) {
231            Ok(j) => j,
232            Err(e) => {
233                tracing::warn!("Failed to serialize config for hashing: {}", e);
234                String::new()
235            }
236        };
237        let mut hasher = Sha256::new();
238        hasher.update(json.as_bytes());
239        let result = hasher.finalize();
240        hex::encode(result)
241    }
242
243    /// Marks the run as complete.
244    pub fn complete(&mut self, statistics: EnhancedGenerationStatistics) {
245        let now = Utc::now();
246        self.completed_at = Some(now);
247        self.duration_seconds = Some((now - self.started_at).num_milliseconds() as f64 / 1000.0);
248        self.statistics = Some(statistics);
249    }
250
251    /// Adds a scenario tag.
252    pub fn add_tag(&mut self, tag: &str) {
253        if !self.scenario_tags.contains(&tag.to_string()) {
254            self.scenario_tags.push(tag.to_string());
255        }
256    }
257
258    /// Adds multiple scenario tags.
259    pub fn add_tags(&mut self, tags: &[String]) {
260        for tag in tags {
261            self.add_tag(tag);
262        }
263    }
264
265    /// Sets the output directory.
266    pub fn set_output_directory(&mut self, path: &Path) {
267        self.output_directory = Some(path.display().to_string());
268    }
269
270    /// Adds an output file record.
271    pub fn add_output_file(&mut self, info: OutputFileInfo) {
272        self.output_files.push(info);
273    }
274
275    /// Adds a warning message.
276    pub fn add_warning(&mut self, warning: &str) {
277        self.warnings.push(warning.to_string());
278    }
279
280    /// Adds metadata.
281    pub fn add_metadata(&mut self, key: &str, value: &str) {
282        self.metadata.insert(key.to_string(), value.to_string());
283    }
284
285    /// Populates SHA-256 checksums for all output files.
286    ///
287    /// Resolves each file path relative to `base_dir` and computes its checksum.
288    /// Also populates `size_bytes` if not already set.
289    pub fn populate_file_checksums(&mut self, base_dir: &Path) {
290        for file_info in &mut self.output_files {
291            let file_path = base_dir.join(&file_info.path);
292            if file_path.exists() {
293                if let Ok(checksum) = compute_file_checksum(&file_path) {
294                    file_info.sha256_checksum = Some(checksum);
295                }
296                if file_info.size_bytes.is_none() {
297                    if let Ok(metadata) = std::fs::metadata(&file_path) {
298                        file_info.size_bytes = Some(metadata.len());
299                    }
300                }
301            }
302        }
303    }
304
305    /// Verifies checksums for all output files against their recorded values.
306    pub fn verify_file_checksums(&self, base_dir: &Path) -> Vec<ChecksumVerificationResult> {
307        self.output_files
308            .iter()
309            .map(|file_info| {
310                let file_path = base_dir.join(&file_info.path);
311
312                let expected = file_info.sha256_checksum.clone();
313                if expected.is_none() {
314                    return ChecksumVerificationResult {
315                        path: file_info.path.clone(),
316                        status: ChecksumStatus::NoChecksum,
317                        expected: None,
318                        actual: None,
319                    };
320                }
321
322                if !file_path.exists() {
323                    return ChecksumVerificationResult {
324                        path: file_info.path.clone(),
325                        status: ChecksumStatus::Missing,
326                        expected,
327                        actual: None,
328                    };
329                }
330
331                match compute_file_checksum(&file_path) {
332                    Ok(actual) => {
333                        let status = if expected.as_deref() == Some(actual.as_str()) {
334                            ChecksumStatus::Ok
335                        } else {
336                            ChecksumStatus::Mismatch
337                        };
338                        ChecksumVerificationResult {
339                            path: file_info.path.clone(),
340                            status,
341                            expected,
342                            actual: Some(actual),
343                        }
344                    }
345                    Err(_) => ChecksumVerificationResult {
346                        path: file_info.path.clone(),
347                        status: ChecksumStatus::Missing,
348                        expected,
349                        actual: None,
350                    },
351                }
352            })
353            .collect()
354    }
355
356    /// Writes the manifest to a JSON file.
357    pub fn write_to_file(&self, path: &Path) -> std::io::Result<()> {
358        let json = serde_json::to_string_pretty(self)?;
359        let mut file = File::create(path)?;
360        file.write_all(json.as_bytes())?;
361        Ok(())
362    }
363
364    /// Returns the run ID.
365    pub fn run_id(&self) -> &str {
366        &self.run_id
367    }
368}
369
370// Note: ScenarioConfig is now defined in datasynth-config/src/schema.rs
371// and exported via datasynth_config::schema::ScenarioConfig
372
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    //! Unit tests for manifest construction, tagging, output-file tracking,
    //! backward-compatible deserialization and checksum handling.

    use super::*;
    use datasynth_config::schema::*;

    /// Builds a minimal single-company configuration with every section at
    /// its default, used as the input for all manifest tests.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: datasynth_core::models::IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                worker_threads: 1,
                memory_limit_mb: 512,
            },
            companies: vec![CompanyConfig {
                code: "TEST".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig::default(),
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
            source_to_pay: Default::default(),
            financial_reporting: Default::default(),
            hr: Default::default(),
            manufacturing: Default::default(),
            sales_quotes: Default::default(),
            tax: Default::default(),
            treasury: Default::default(),
            project_accounting: Default::default(),
            esg: Default::default(),
            country_packs: None,
        }
    }

    /// A new manifest has an ID, the given seed, a config hash and no
    /// completion time.
    #[test]
    fn test_run_manifest_creation() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);

        assert!(!manifest.run_id.is_empty());
        assert_eq!(manifest.seed, 42);
        assert!(!manifest.config_hash.is_empty());
        assert!(manifest.completed_at.is_none());
    }

    /// Completing a manifest records completion time, duration and statistics.
    #[test]
    fn test_run_manifest_completion() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        // Simulate some work
        std::thread::sleep(std::time::Duration::from_millis(10));

        let stats = EnhancedGenerationStatistics {
            total_entries: 100,
            total_line_items: 500,
            ..Default::default()
        };
        manifest.complete(stats);

        assert!(manifest.completed_at.is_some());
        assert!(manifest.duration_seconds.unwrap() >= 0.01);
        assert_eq!(manifest.statistics.as_ref().unwrap().total_entries, 100);
    }

    /// Hashing the same configuration twice yields the same hash.
    #[test]
    fn test_config_hash_consistency() {
        let config = create_test_config();
        let hash1 = RunManifest::hash_config(&config);
        let hash2 = RunManifest::hash_config(&config);

        assert_eq!(hash1, hash2);
    }

    /// Tags are stored once each, even when added repeatedly.
    #[test]
    fn test_scenario_tags() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_tag("fraud_detection");
        manifest.add_tag("retail");
        manifest.add_tag("fraud_detection"); // Duplicate

        assert_eq!(manifest.scenario_tags.len(), 2);
        assert!(manifest
            .scenario_tags
            .contains(&"fraud_detection".to_string()));
        assert!(manifest.scenario_tags.contains(&"retail".to_string()));
    }

    /// Output file records are appended to the manifest unchanged.
    #[test]
    fn test_output_file_tracking() {
        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);

        manifest.add_output_file(OutputFileInfo {
            path: "journal_entries.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(1000),
            size_bytes: Some(102400),
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        assert_eq!(manifest.output_files.len(), 1);
        assert_eq!(manifest.output_files[0].record_count, Some(1000));
    }

    /// New manifests are stamped with format version "2.0".
    #[test]
    fn test_manifest_version() {
        let config = create_test_config();
        let manifest = RunManifest::new(&config, 42);
        assert_eq!(manifest.manifest_version, "2.0");
    }

    /// Old-format JSON (no `manifest_version`, no checksum fields) still
    /// yields usable output-file records with the newer fields defaulted.
    #[test]
    fn test_backward_compat_deserialize() {
        // Old manifest JSON without manifest_version or checksum fields
        let old_json = r#"{
            "run_id": "test-123",
            "started_at": "2024-01-01T00:00:00Z",
            "completed_at": null,
            "config_hash": "abc123",
            "config_snapshot": null,
            "seed": 42,
            "duration_seconds": null,
            "generator_version": "0.4.0",
            "output_directory": null,
            "output_files": [
                {
                    "path": "data.csv",
                    "format": "csv",
                    "record_count": 100,
                    "size_bytes": 1024
                }
            ]
        }"#;

        // `config_snapshot` is null here, so the document as a whole cannot be
        // turned into a `RunManifest`. Parse it generically and deserialize the
        // parts that can be checked, so the serde defaults are really exercised.
        let value: serde_json::Value = serde_json::from_str(old_json).unwrap();

        let files: Vec<OutputFileInfo> =
            serde_json::from_value(value["output_files"].clone()).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, "data.csv");
        assert_eq!(files[0].format, "csv");
        assert_eq!(files[0].record_count, Some(100));
        assert_eq!(files[0].size_bytes, Some(1024));
        assert!(files[0].sha256_checksum.is_none());
        assert!(files[0].first_record_index.is_none());
        assert!(files[0].last_record_index.is_none());

        // A manifest lacking `manifest_version` is given this default.
        assert_eq!(default_manifest_version(), "2.0");
    }

    /// The checksum of a known input matches its published SHA-256 digest.
    #[test]
    fn test_checksum_computation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("test.txt");
        std::fs::write(&file_path, b"hello world").expect("write file");

        let checksum = compute_file_checksum(&file_path).expect("compute checksum");
        // SHA-256 of "hello world"
        assert_eq!(
            checksum,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    /// Populating checksums fills in checksum and size; verifying an
    /// unchanged file then reports `Ok`.
    #[test]
    fn test_populate_and_verify_checksums() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"id,name\n1,Alice\n2,Bob\n").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: Some(2),
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        assert!(manifest.output_files[0].sha256_checksum.is_some());
        assert!(manifest.output_files[0].size_bytes.is_some());

        // Verify should pass
        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, ChecksumStatus::Ok);
    }

    /// A file modified after its checksum was recorded reports `Mismatch`.
    #[test]
    fn test_verify_detects_mismatch() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let file_path = dir.path().join("data.csv");
        std::fs::write(&file_path, b"original content").expect("write file");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        manifest.populate_file_checksums(dir.path());

        // Modify file after checksum
        std::fs::write(&file_path, b"modified content").expect("write file");

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Mismatch);
    }

    /// A recorded checksum for a file that is not on disk reports `Missing`.
    #[test]
    fn test_verify_missing_file() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "nonexistent.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: Some("abc123".to_string()),
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::Missing);
    }

    /// A file without a recorded checksum reports `NoChecksum`.
    #[test]
    fn test_verify_no_checksum() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = create_test_config();
        let mut manifest = RunManifest::new(&config, 42);
        manifest.add_output_file(OutputFileInfo {
            path: "data.csv".to_string(),
            format: "csv".to_string(),
            record_count: None,
            size_bytes: None,
            sha256_checksum: None,
            first_record_index: None,
            last_record_index: None,
        });

        let results = manifest.verify_file_checksums(dir.path());
        assert_eq!(results[0].status, ChecksumStatus::NoChecksum);
    }
}