1use chrono::{DateTime, Utc};
7use datasynth_config::schema::GeneratorConfig;
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::fs::File;
12use std::io::{self, BufReader, Read as _, Write};
13use std::path::Path;
14use uuid::Uuid;
15
16use super::EnhancedGenerationStatistics;
17
/// Manifest describing one generation run: identity, timing, the
/// configuration used, produced output files and optional feature summaries.
///
/// The struct round-trips through serde. Fields carrying `#[serde(default)]`
/// may be missing from the input; the trailing optional summaries are left out
/// of the serialized form entirely while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunManifest {
    /// Manifest format version. Falls back to `default_manifest_version()`
    /// when absent from deserialized input.
    #[serde(default = "default_manifest_version")]
    pub manifest_version: String,
    /// Run identifier; `RunManifest::new` assigns a freshly generated v4 UUID.
    pub run_id: String,
    /// UTC timestamp captured when the manifest was created by `new`.
    pub started_at: DateTime<Utc>,
    /// UTC timestamp set by `complete`; `None` until then.
    pub completed_at: Option<DateTime<Utc>>,
    /// Hex-encoded SHA-256 of the JSON-serialized configuration.
    pub config_hash: String,
    /// Copy of the configuration passed to `new`.
    pub config_snapshot: GeneratorConfig,
    /// Seed value passed to `new`.
    pub seed: u64,
    /// Scenario tags; `add_tag` keeps the entries unique.
    #[serde(default)]
    pub scenario_tags: Vec<String>,
    /// Statistics handed to `complete`; `None` until then.
    #[serde(default)]
    pub statistics: Option<EnhancedGenerationStatistics>,
    /// Time between `started_at` and completion in seconds, derived from a
    /// whole-millisecond difference; set by `complete`.
    pub duration_seconds: Option<f64>,
    /// Crate version (`CARGO_PKG_VERSION`) baked in at compile time by `new`.
    pub generator_version: String,
    /// Free-form key/value pairs added through `add_metadata`.
    #[serde(default)]
    pub metadata: HashMap<String, String>,
    /// Output directory rendered as a string by `set_output_directory`.
    pub output_directory: Option<String>,
    /// Output files registered through `add_output_file`.
    #[serde(default)]
    pub output_files: Vec<OutputFileInfo>,
    /// Warning messages appended through `add_warning`.
    #[serde(default)]
    pub warnings: Vec<String>,
    /// Optional lineage graph; not populated anywhere in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lineage: Option<super::lineage::LineageGraph>,
    /// Optional quality gate summary; not populated anywhere in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub quality_gate_result: Option<QualityGateResultSummary>,
    /// Optional LLM enrichment summary; not populated anywhere in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub llm_enrichment: Option<LlmEnrichmentSummary>,
    /// Optional diffusion model summary; not populated anywhere in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub diffusion_model: Option<DiffusionModelSummary>,
    /// Optional causal generation summary; not populated anywhere in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub causal_generation: Option<CausalGenerationSummary>,
}
73
/// Summary of the LLM enrichment feature, stored in
/// `RunManifest::llm_enrichment`.
///
/// NOTE(review): the values are produced outside this file; the field notes
/// below are read off the names and types and should be confirmed with the
/// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmEnrichmentSummary {
    /// Enabled flag for the feature.
    pub enabled: bool,
    /// Timing value; the `_ms` suffix suggests milliseconds.
    pub timing_ms: u64,
    /// Count of vendors enriched, going by the name.
    pub vendors_enriched: usize,
    /// Provider label; the set of valid values is not visible here.
    pub provider: String,
}
86
/// Summary of the diffusion model feature, stored in
/// `RunManifest::diffusion_model`.
///
/// NOTE(review): the values are produced outside this file; the field notes
/// below are read off the names and types and should be confirmed with the
/// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffusionModelSummary {
    /// Enabled flag for the feature.
    pub enabled: bool,
    /// Timing value; the `_ms` suffix suggests milliseconds.
    pub timing_ms: u64,
    /// Count of generated samples, going by the name.
    pub samples_generated: usize,
    /// Step count; presumably the number of diffusion steps — confirm.
    pub n_steps: usize,
}
99
/// Summary of the causal generation feature, stored in
/// `RunManifest::causal_generation`.
///
/// NOTE(review): the values are produced outside this file; the field notes
/// below are read off the names and types and should be confirmed with the
/// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CausalGenerationSummary {
    /// Enabled flag for the feature.
    pub enabled: bool,
    /// Timing value; the `_ms` suffix suggests milliseconds.
    pub timing_ms: u64,
    /// Count of generated samples, going by the name.
    pub samples_generated: usize,
    /// Template label; the set of valid values is not visible here.
    pub template: String,
    /// Validation outcome; `None` presumably means validation was not run — confirm.
    pub validation_passed: Option<bool>,
}
114
/// Summary of a quality gate evaluation, stored in
/// `RunManifest::quality_gate_result`.
///
/// NOTE(review): the values are produced outside this file; the field notes
/// below are read off the names and types and should be confirmed with the
/// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityGateResultSummary {
    /// Overall pass/fail flag.
    pub passed: bool,
    /// Name of the quality gate profile, going by the field name.
    pub profile_name: String,
    /// Count of gates that passed, going by the name.
    pub gates_passed: usize,
    /// Count of gates evaluated in total, going by the name.
    pub gates_total: usize,
    /// Identifiers of the failed gates; their format is not visible here.
    pub failed_gates: Vec<String>,
}
129
/// Version string used for `manifest_version` when the field is missing from
/// deserialized input (wired in through the serde `default` attribute on
/// `RunManifest`).
fn default_manifest_version() -> String {
    String::from("2.0")
}
133
/// Description of one output file tracked in `RunManifest::output_files`.
///
/// The three trailing optional fields default to `None` when absent from the
/// input and are omitted from the serialized form while `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputFileInfo {
    /// File path; the checksum helpers resolve it by joining it onto the base
    /// directory they are given.
    pub path: String,
    /// Format label (the tests in this file use "csv").
    pub format: String,
    /// Number of records in the file, if known.
    pub record_count: Option<usize>,
    /// File size in bytes; `populate_file_checksums` fills it from filesystem
    /// metadata when it is still unset.
    pub size_bytes: Option<u64>,
    /// Hex-encoded SHA-256 of the file contents, as produced by
    /// `compute_file_checksum`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sha256_checksum: Option<String>,
    /// Index of the first record in this file.
    /// NOTE(review): index base and scope are not visible here — confirm with the writer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub first_record_index: Option<u64>,
    /// Index of the last record in this file.
    /// NOTE(review): inclusive/exclusive semantics are not visible here — confirm with the writer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_record_index: Option<u64>,
}
155
/// Outcome of verifying a single output file, as returned by
/// `RunManifest::verify_file_checksums`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChecksumVerificationResult {
    /// Path copied from the corresponding `OutputFileInfo`.
    pub path: String,
    /// Verification verdict.
    pub status: ChecksumStatus,
    /// Checksum recorded in the manifest; `None` when none was recorded.
    pub expected: Option<String>,
    /// Checksum computed from the file on disk; `None` when the file was not
    /// hashed (no recorded checksum, file missing, or read failure).
    pub actual: Option<String>,
}
168
/// Verdict for one file in `RunManifest::verify_file_checksums`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChecksumStatus {
    /// The computed checksum equals the recorded one.
    Ok,
    /// The file was hashed but the result differs from the recorded checksum.
    Mismatch,
    /// The file does not exist, or hashing it failed with an I/O error.
    Missing,
    /// The manifest holds no checksum for this file, so nothing was compared.
    NoChecksum,
}
181
182pub fn compute_file_checksum(path: &Path) -> io::Result<String> {
184 let file = File::open(path)?;
185 let mut reader = BufReader::new(file);
186 let mut hasher = Sha256::new();
187 let mut buffer = [0u8; 8192];
188 loop {
189 let bytes_read = reader.read(&mut buffer)?;
190 if bytes_read == 0 {
191 break;
192 }
193 hasher.update(&buffer[..bytes_read]);
194 }
195 Ok(hex::encode(hasher.finalize()))
196}
197
198impl RunManifest {
199 pub fn new(config: &GeneratorConfig, seed: u64) -> Self {
201 let run_id = Uuid::new_v4().to_string();
202 let config_hash = Self::hash_config(config);
203
204 Self {
205 manifest_version: "2.0".to_string(),
206 run_id,
207 started_at: Utc::now(),
208 completed_at: None,
209 config_hash,
210 config_snapshot: config.clone(),
211 seed,
212 scenario_tags: Vec::new(),
213 statistics: None,
214 duration_seconds: None,
215 generator_version: env!("CARGO_PKG_VERSION").to_string(),
216 metadata: HashMap::new(),
217 output_directory: None,
218 output_files: Vec::new(),
219 warnings: Vec::new(),
220 lineage: None,
221 quality_gate_result: None,
222 llm_enrichment: None,
223 diffusion_model: None,
224 causal_generation: None,
225 }
226 }
227
228 fn hash_config(config: &GeneratorConfig) -> String {
230 let json = match serde_json::to_string(config) {
231 Ok(j) => j,
232 Err(e) => {
233 tracing::warn!("Failed to serialize config for hashing: {}", e);
234 String::new()
235 }
236 };
237 let mut hasher = Sha256::new();
238 hasher.update(json.as_bytes());
239 let result = hasher.finalize();
240 hex::encode(result)
241 }
242
243 pub fn complete(&mut self, statistics: EnhancedGenerationStatistics) {
245 let now = Utc::now();
246 self.completed_at = Some(now);
247 self.duration_seconds = Some((now - self.started_at).num_milliseconds() as f64 / 1000.0);
248 self.statistics = Some(statistics);
249 }
250
251 pub fn add_tag(&mut self, tag: &str) {
253 if !self.scenario_tags.contains(&tag.to_string()) {
254 self.scenario_tags.push(tag.to_string());
255 }
256 }
257
258 pub fn add_tags(&mut self, tags: &[String]) {
260 for tag in tags {
261 self.add_tag(tag);
262 }
263 }
264
265 pub fn set_output_directory(&mut self, path: &Path) {
267 self.output_directory = Some(path.display().to_string());
268 }
269
    /// Appends `info` to `output_files`. No de-duplication is performed.
    pub fn add_output_file(&mut self, info: OutputFileInfo) {
        self.output_files.push(info);
    }
274
275 pub fn add_warning(&mut self, warning: &str) {
277 self.warnings.push(warning.to_string());
278 }
279
280 pub fn add_metadata(&mut self, key: &str, value: &str) {
282 self.metadata.insert(key.to_string(), value.to_string());
283 }
284
285 pub fn populate_file_checksums(&mut self, base_dir: &Path) {
290 for file_info in &mut self.output_files {
291 let file_path = base_dir.join(&file_info.path);
292 if file_path.exists() {
293 if let Ok(checksum) = compute_file_checksum(&file_path) {
294 file_info.sha256_checksum = Some(checksum);
295 }
296 if file_info.size_bytes.is_none() {
297 if let Ok(metadata) = std::fs::metadata(&file_path) {
298 file_info.size_bytes = Some(metadata.len());
299 }
300 }
301 }
302 }
303 }
304
305 pub fn verify_file_checksums(&self, base_dir: &Path) -> Vec<ChecksumVerificationResult> {
307 self.output_files
308 .iter()
309 .map(|file_info| {
310 let file_path = base_dir.join(&file_info.path);
311
312 let expected = file_info.sha256_checksum.clone();
313 if expected.is_none() {
314 return ChecksumVerificationResult {
315 path: file_info.path.clone(),
316 status: ChecksumStatus::NoChecksum,
317 expected: None,
318 actual: None,
319 };
320 }
321
322 if !file_path.exists() {
323 return ChecksumVerificationResult {
324 path: file_info.path.clone(),
325 status: ChecksumStatus::Missing,
326 expected,
327 actual: None,
328 };
329 }
330
331 match compute_file_checksum(&file_path) {
332 Ok(actual) => {
333 let status = if expected.as_deref() == Some(actual.as_str()) {
334 ChecksumStatus::Ok
335 } else {
336 ChecksumStatus::Mismatch
337 };
338 ChecksumVerificationResult {
339 path: file_info.path.clone(),
340 status,
341 expected,
342 actual: Some(actual),
343 }
344 }
345 Err(_) => ChecksumVerificationResult {
346 path: file_info.path.clone(),
347 status: ChecksumStatus::Missing,
348 expected,
349 actual: None,
350 },
351 }
352 })
353 .collect()
354 }
355
356 pub fn write_to_file(&self, path: &Path) -> std::io::Result<()> {
358 let json = serde_json::to_string_pretty(self)?;
359 let mut file = File::create(path)?;
360 file.write_all(json.as_bytes())?;
361 Ok(())
362 }
363
364 pub fn run_id(&self) -> &str {
366 &self.run_id
367 }
368}
369
370#[cfg(test)]
374#[allow(clippy::unwrap_used)]
375mod tests {
376 use super::*;
377 use datasynth_config::schema::*;
378
    /// Builds a small `GeneratorConfig` for the tests: a single company
    /// ("TEST"), a one-month period starting 2024-01-01, seed 42, and default
    /// values for every other section.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: datasynth_core::models::IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                presentation_currency: None,
                worker_threads: 1,
                memory_limit_mb: 512,
                fiscal_year_months: None,
            },
            companies: vec![CompanyConfig {
                code: "TEST".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                functional_currency: None,
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig::default(),
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
            source_to_pay: Default::default(),
            financial_reporting: Default::default(),
            hr: Default::default(),
            manufacturing: Default::default(),
            sales_quotes: Default::default(),
            tax: Default::default(),
            treasury: Default::default(),
            project_accounting: Default::default(),
            esg: Default::default(),
            country_packs: None,
            scenarios: Default::default(),
            session: Default::default(),
            compliance_regulations: Default::default(),
        }
    }
464
465 #[test]
466 fn test_run_manifest_creation() {
467 let config = create_test_config();
468 let manifest = RunManifest::new(&config, 42);
469
470 assert!(!manifest.run_id.is_empty());
471 assert_eq!(manifest.seed, 42);
472 assert!(!manifest.config_hash.is_empty());
473 assert!(manifest.completed_at.is_none());
474 }
475
476 #[test]
477 fn test_run_manifest_completion() {
478 let config = create_test_config();
479 let mut manifest = RunManifest::new(&config, 42);
480
481 std::thread::sleep(std::time::Duration::from_millis(10));
483
484 let stats = EnhancedGenerationStatistics {
485 total_entries: 100,
486 total_line_items: 500,
487 ..Default::default()
488 };
489 manifest.complete(stats);
490
491 assert!(manifest.completed_at.is_some());
492 assert!(manifest.duration_seconds.unwrap() >= 0.01);
493 assert_eq!(manifest.statistics.as_ref().unwrap().total_entries, 100);
494 }
495
496 #[test]
497 fn test_config_hash_consistency() {
498 let config = create_test_config();
499 let hash1 = RunManifest::hash_config(&config);
500 let hash2 = RunManifest::hash_config(&config);
501
502 assert_eq!(hash1, hash2);
503 }
504
505 #[test]
506 fn test_scenario_tags() {
507 let config = create_test_config();
508 let mut manifest = RunManifest::new(&config, 42);
509
510 manifest.add_tag("fraud_detection");
511 manifest.add_tag("retail");
512 manifest.add_tag("fraud_detection"); assert_eq!(manifest.scenario_tags.len(), 2);
515 assert!(manifest
516 .scenario_tags
517 .contains(&"fraud_detection".to_string()));
518 assert!(manifest.scenario_tags.contains(&"retail".to_string()));
519 }
520
521 #[test]
522 fn test_output_file_tracking() {
523 let config = create_test_config();
524 let mut manifest = RunManifest::new(&config, 42);
525
526 manifest.add_output_file(OutputFileInfo {
527 path: "journal_entries.csv".to_string(),
528 format: "csv".to_string(),
529 record_count: Some(1000),
530 size_bytes: Some(102400),
531 sha256_checksum: None,
532 first_record_index: None,
533 last_record_index: None,
534 });
535
536 assert_eq!(manifest.output_files.len(), 1);
537 assert_eq!(manifest.output_files[0].record_count, Some(1000));
538 }
539
540 #[test]
541 fn test_manifest_version() {
542 let config = create_test_config();
543 let manifest = RunManifest::new(&config, 42);
544 assert_eq!(manifest.manifest_version, "2.0");
545 }
546
547 #[test]
548 fn test_backward_compat_deserialize() {
549 let old_json = r#"{
551 "run_id": "test-123",
552 "started_at": "2024-01-01T00:00:00Z",
553 "completed_at": null,
554 "config_hash": "abc123",
555 "config_snapshot": null,
556 "seed": 42,
557 "duration_seconds": null,
558 "generator_version": "0.4.0",
559 "output_directory": null,
560 "output_files": [
561 {
562 "path": "data.csv",
563 "format": "csv",
564 "record_count": 100,
565 "size_bytes": 1024
566 }
567 ]
568 }"#;
569
570 let result: Result<serde_json::Value, _> = serde_json::from_str(old_json);
573 assert!(result.is_ok());
574 }
575
576 #[test]
577 fn test_checksum_computation() {
578 let dir = tempfile::tempdir().expect("create temp dir");
579 let file_path = dir.path().join("test.txt");
580 std::fs::write(&file_path, b"hello world").expect("write file");
581
582 let checksum = compute_file_checksum(&file_path).expect("compute checksum");
583 assert_eq!(
585 checksum,
586 "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
587 );
588 }
589
590 #[test]
591 fn test_populate_and_verify_checksums() {
592 let dir = tempfile::tempdir().expect("create temp dir");
593 let file_path = dir.path().join("data.csv");
594 std::fs::write(&file_path, b"id,name\n1,Alice\n2,Bob\n").expect("write file");
595
596 let config = create_test_config();
597 let mut manifest = RunManifest::new(&config, 42);
598 manifest.add_output_file(OutputFileInfo {
599 path: "data.csv".to_string(),
600 format: "csv".to_string(),
601 record_count: Some(2),
602 size_bytes: None,
603 sha256_checksum: None,
604 first_record_index: None,
605 last_record_index: None,
606 });
607
608 manifest.populate_file_checksums(dir.path());
609
610 assert!(manifest.output_files[0].sha256_checksum.is_some());
611 assert!(manifest.output_files[0].size_bytes.is_some());
612
613 let results = manifest.verify_file_checksums(dir.path());
615 assert_eq!(results.len(), 1);
616 assert_eq!(results[0].status, ChecksumStatus::Ok);
617 }
618
619 #[test]
620 fn test_verify_detects_mismatch() {
621 let dir = tempfile::tempdir().expect("create temp dir");
622 let file_path = dir.path().join("data.csv");
623 std::fs::write(&file_path, b"original content").expect("write file");
624
625 let config = create_test_config();
626 let mut manifest = RunManifest::new(&config, 42);
627 manifest.add_output_file(OutputFileInfo {
628 path: "data.csv".to_string(),
629 format: "csv".to_string(),
630 record_count: None,
631 size_bytes: None,
632 sha256_checksum: None,
633 first_record_index: None,
634 last_record_index: None,
635 });
636
637 manifest.populate_file_checksums(dir.path());
638
639 std::fs::write(&file_path, b"modified content").expect("write file");
641
642 let results = manifest.verify_file_checksums(dir.path());
643 assert_eq!(results[0].status, ChecksumStatus::Mismatch);
644 }
645
646 #[test]
647 fn test_verify_missing_file() {
648 let dir = tempfile::tempdir().expect("create temp dir");
649
650 let config = create_test_config();
651 let mut manifest = RunManifest::new(&config, 42);
652 manifest.add_output_file(OutputFileInfo {
653 path: "nonexistent.csv".to_string(),
654 format: "csv".to_string(),
655 record_count: None,
656 size_bytes: None,
657 sha256_checksum: Some("abc123".to_string()),
658 first_record_index: None,
659 last_record_index: None,
660 });
661
662 let results = manifest.verify_file_checksums(dir.path());
663 assert_eq!(results[0].status, ChecksumStatus::Missing);
664 }
665
666 #[test]
667 fn test_verify_no_checksum() {
668 let dir = tempfile::tempdir().expect("create temp dir");
669
670 let config = create_test_config();
671 let mut manifest = RunManifest::new(&config, 42);
672 manifest.add_output_file(OutputFileInfo {
673 path: "data.csv".to_string(),
674 format: "csv".to_string(),
675 record_count: None,
676 size_bytes: None,
677 sha256_checksum: None,
678 first_record_index: None,
679 last_record_index: None,
680 });
681
682 let results = manifest.verify_file_checksums(dir.path());
683 assert_eq!(results[0].status, ChecksumStatus::NoChecksum);
684 }
685}