// File: datasynth_runtime/enhanced_orchestrator.rs

//! Enhanced generation orchestrator with full feature integration.
//!
//! This orchestrator coordinates all generation phases:
//! 1. Chart of Accounts generation
//! 2. Master data generation (vendors, customers, materials, assets, employees)
//! 3. Document flow generation (P2P, O2C) + subledger linking + OCPM events
//! 4. Journal entry generation
//! 5. Anomaly injection
//! 6. Balance validation
//! 7. Data quality injection
//! 8. Audit data generation (engagements, workpapers, evidence, risks, findings, judgments)
//! 9. Banking KYC/AML data generation (customers, accounts, transactions, typologies)
//! 10. Graph export (accounting network for ML training and network reconstruction)
//! 11. LLM enrichment (AI-augmented vendor names, descriptions)
//! 12. Diffusion enhancement (statistical diffusion-based sample generation)
//! 13. Causal overlay (structural causal model generation and validation)
17
18use std::collections::HashMap;
19use std::path::PathBuf;
20use std::sync::Arc;
21
22use chrono::{Datelike, NaiveDate};
23use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
24use serde::{Deserialize, Serialize};
25use tracing::{debug, info, warn};
26
27use datasynth_banking::{
28    models::{BankAccount, BankTransaction, BankingCustomer},
29    BankingOrchestratorBuilder,
30};
31use datasynth_config::schema::GeneratorConfig;
32use datasynth_core::error::{SynthError, SynthResult};
33use datasynth_core::models::audit::{
34    AuditEngagement, AuditEvidence, AuditFinding, ProfessionalJudgment, RiskAssessment, Workpaper,
35};
36use datasynth_core::models::subledger::ap::APInvoice;
37use datasynth_core::models::subledger::ar::ARInvoice;
38use datasynth_core::models::*;
39use datasynth_core::{DegradationActions, DegradationLevel, ResourceGuard, ResourceGuardBuilder};
40use datasynth_fingerprint::{
41    io::FingerprintReader,
42    models::Fingerprint,
43    synthesis::{ConfigSynthesizer, CopulaGeneratorSpec, SynthesisOptions},
44};
45use datasynth_generators::{
46    // Anomaly injection
47    AnomalyInjector,
48    AnomalyInjectorConfig,
49    AssetGenerator,
50    // Audit generators
51    AuditEngagementGenerator,
52    BalanceTrackerConfig,
53    // Core generators
54    ChartOfAccountsGenerator,
55    CustomerGenerator,
56    DataQualityConfig,
57    // Data quality
58    DataQualityInjector,
59    DataQualityStats,
60    // Document flow JE generator
61    DocumentFlowJeConfig,
62    DocumentFlowJeGenerator,
63    // Subledger linker
64    DocumentFlowLinker,
65    EmployeeGenerator,
66    EvidenceGenerator,
67    FindingGenerator,
68    JournalEntryGenerator,
69    JudgmentGenerator,
70    LatePaymentDistribution,
71    MaterialGenerator,
72    O2CDocumentChain,
73    O2CGenerator,
74    O2CGeneratorConfig,
75    O2CPaymentBehavior,
76    P2PDocumentChain,
77    // Document flow generators
78    P2PGenerator,
79    P2PGeneratorConfig,
80    P2PPaymentBehavior,
81    RiskAssessmentGenerator,
82    // Balance validation
83    RunningBalanceTracker,
84    ValidationError,
85    // Master data generators
86    VendorGenerator,
87    WorkpaperGenerator,
88};
89use datasynth_graph::{
90    PyGExportConfig, PyGExporter, TransactionGraphBuilder, TransactionGraphConfig,
91};
92use datasynth_ocpm::{
93    EventLogMetadata, O2cDocuments, OcpmEventGenerator, OcpmEventLog, OcpmGeneratorConfig,
94    P2pDocuments,
95};
96
97use datasynth_config::schema::{O2CFlowConfig, P2PFlowConfig};
98use datasynth_core::causal::{CausalGraph, CausalValidator, StructuralCausalModel};
99use datasynth_core::diffusion::{DiffusionBackend, DiffusionConfig, StatisticalDiffusionBackend};
100use datasynth_core::llm::MockLlmProvider;
101use datasynth_core::models::documents::PaymentMethod;
102use datasynth_generators::llm_enrichment::VendorLlmEnricher;
103
// ============================================================================
// Configuration Conversion Functions
// ============================================================================
107
108/// Convert P2P flow config from schema to generator config.
109fn convert_p2p_config(schema_config: &P2PFlowConfig) -> P2PGeneratorConfig {
110    let payment_behavior = &schema_config.payment_behavior;
111    let late_dist = &payment_behavior.late_payment_days_distribution;
112
113    P2PGeneratorConfig {
114        three_way_match_rate: schema_config.three_way_match_rate,
115        partial_delivery_rate: schema_config.partial_delivery_rate,
116        over_delivery_rate: 0.02, // Not in schema, use default
117        price_variance_rate: schema_config.price_variance_rate,
118        max_price_variance_percent: schema_config.max_price_variance_percent,
119        avg_days_po_to_gr: schema_config.average_po_to_gr_days,
120        avg_days_gr_to_invoice: schema_config.average_gr_to_invoice_days,
121        avg_days_invoice_to_payment: schema_config.average_invoice_to_payment_days,
122        payment_method_distribution: vec![
123            (PaymentMethod::BankTransfer, 0.60),
124            (PaymentMethod::Check, 0.25),
125            (PaymentMethod::Wire, 0.10),
126            (PaymentMethod::CreditCard, 0.05),
127        ],
128        early_payment_discount_rate: 0.30, // Not in schema, use default
129        payment_behavior: P2PPaymentBehavior {
130            late_payment_rate: payment_behavior.late_payment_rate,
131            late_payment_distribution: LatePaymentDistribution {
132                slightly_late_1_to_7: late_dist.slightly_late_1_to_7,
133                late_8_to_14: late_dist.late_8_to_14,
134                very_late_15_to_30: late_dist.very_late_15_to_30,
135                severely_late_31_to_60: late_dist.severely_late_31_to_60,
136                extremely_late_over_60: late_dist.extremely_late_over_60,
137            },
138            partial_payment_rate: payment_behavior.partial_payment_rate,
139            payment_correction_rate: payment_behavior.payment_correction_rate,
140        },
141    }
142}
143
144/// Convert O2C flow config from schema to generator config.
145fn convert_o2c_config(schema_config: &O2CFlowConfig) -> O2CGeneratorConfig {
146    let payment_behavior = &schema_config.payment_behavior;
147
148    O2CGeneratorConfig {
149        credit_check_failure_rate: schema_config.credit_check_failure_rate,
150        partial_shipment_rate: schema_config.partial_shipment_rate,
151        avg_days_so_to_delivery: schema_config.average_so_to_delivery_days,
152        avg_days_delivery_to_invoice: schema_config.average_delivery_to_invoice_days,
153        avg_days_invoice_to_payment: schema_config.average_invoice_to_receipt_days,
154        late_payment_rate: 0.15, // Managed through dunning now
155        bad_debt_rate: schema_config.bad_debt_rate,
156        returns_rate: schema_config.return_rate,
157        cash_discount_take_rate: schema_config.cash_discount.taken_rate,
158        payment_method_distribution: vec![
159            (PaymentMethod::BankTransfer, 0.50),
160            (PaymentMethod::Check, 0.30),
161            (PaymentMethod::Wire, 0.15),
162            (PaymentMethod::CreditCard, 0.05),
163        ],
164        payment_behavior: O2CPaymentBehavior {
165            partial_payment_rate: payment_behavior.partial_payments.rate,
166            short_payment_rate: payment_behavior.short_payments.rate,
167            max_short_percent: payment_behavior.short_payments.max_short_percent,
168            on_account_rate: payment_behavior.on_account_payments.rate,
169            payment_correction_rate: payment_behavior.payment_corrections.rate,
170            avg_days_until_remainder: payment_behavior.partial_payments.avg_days_until_remainder,
171        },
172    }
173}
174
/// Configuration for which generation phases to run.
///
/// Boolean fields switch individual phases on or off; the `usize` fields set
/// how many entities each enabled phase should produce. See the [`Default`]
/// implementation for the out-of-the-box values.
#[derive(Debug, Clone)]
pub struct PhaseConfig {
    /// Generate master data (vendors, customers, materials, assets, employees).
    pub generate_master_data: bool,
    /// Generate document flows (P2P, O2C).
    pub generate_document_flows: bool,
    /// Generate OCPM events from document flows.
    pub generate_ocpm_events: bool,
    /// Generate journal entries.
    pub generate_journal_entries: bool,
    /// Inject anomalies.
    pub inject_anomalies: bool,
    /// Inject data quality variations (typos, missing values, format variations).
    pub inject_data_quality: bool,
    /// Validate balance sheet equation after generation.
    pub validate_balances: bool,
    /// Show progress bars.
    pub show_progress: bool,
    /// Number of vendors to generate per company.
    pub vendors_per_company: usize,
    /// Number of customers to generate per company.
    pub customers_per_company: usize,
    /// Number of materials to generate per company.
    pub materials_per_company: usize,
    /// Number of assets to generate per company.
    pub assets_per_company: usize,
    /// Number of employees to generate per company.
    pub employees_per_company: usize,
    /// Number of P2P chains to generate.
    pub p2p_chains: usize,
    /// Number of O2C chains to generate.
    pub o2c_chains: usize,
    /// Generate audit data (engagements, workpapers, evidence, risks, findings, judgments).
    pub generate_audit: bool,
    /// Number of audit engagements to generate.
    pub audit_engagements: usize,
    /// Number of workpapers per engagement.
    pub workpapers_per_engagement: usize,
    /// Number of evidence items per workpaper.
    pub evidence_per_workpaper: usize,
    /// Number of risk assessments per engagement.
    pub risks_per_engagement: usize,
    /// Number of findings per engagement.
    pub findings_per_engagement: usize,
    /// Number of professional judgments per engagement.
    pub judgments_per_engagement: usize,
    /// Generate banking KYC/AML data (customers, accounts, transactions, typologies).
    pub generate_banking: bool,
    /// Generate graph exports (accounting network for ML training).
    pub generate_graph_export: bool,
}

impl Default for PhaseConfig {
    /// Core phases (master data, document flows, journal entries, balance
    /// validation) and progress bars are enabled; every optional phase
    /// (OCPM, anomalies, data quality, audit, banking, graph export) is off.
    fn default() -> Self {
        Self {
            generate_master_data: true,
            generate_document_flows: true,
            generate_ocpm_events: false, // Off by default
            generate_journal_entries: true,
            inject_anomalies: false,
            inject_data_quality: false, // Off by default (to preserve clean test data)
            validate_balances: true,
            show_progress: true,
            vendors_per_company: 50,
            customers_per_company: 100,
            materials_per_company: 200,
            assets_per_company: 50,
            employees_per_company: 100,
            p2p_chains: 100,
            o2c_chains: 100,
            generate_audit: false, // Off by default
            audit_engagements: 5,
            workpapers_per_engagement: 20,
            evidence_per_workpaper: 5,
            risks_per_engagement: 15,
            findings_per_engagement: 8,
            judgments_per_engagement: 10,
            generate_banking: false,      // Off by default
            generate_graph_export: false, // Off by default
        }
    }
}
258
259/// Master data snapshot containing all generated entities.
260#[derive(Debug, Clone, Default)]
261pub struct MasterDataSnapshot {
262    /// Generated vendors.
263    pub vendors: Vec<Vendor>,
264    /// Generated customers.
265    pub customers: Vec<Customer>,
266    /// Generated materials.
267    pub materials: Vec<Material>,
268    /// Generated fixed assets.
269    pub assets: Vec<FixedAsset>,
270    /// Generated employees.
271    pub employees: Vec<Employee>,
272}
273
/// Info about a completed hypergraph export.
#[derive(Debug, Clone)]
pub struct HypergraphExportInfo {
    /// Number of nodes exported.
    pub node_count: usize,
    /// Number of pairwise edges exported.
    pub edge_count: usize,
    /// Number of hyperedges exported.
    pub hyperedge_count: usize,
    /// Output directory path.
    pub output_path: PathBuf,
}
286
287/// Document flow snapshot containing all generated document chains.
288#[derive(Debug, Clone, Default)]
289pub struct DocumentFlowSnapshot {
290    /// P2P document chains.
291    pub p2p_chains: Vec<P2PDocumentChain>,
292    /// O2C document chains.
293    pub o2c_chains: Vec<O2CDocumentChain>,
294    /// All purchase orders (flattened).
295    pub purchase_orders: Vec<documents::PurchaseOrder>,
296    /// All goods receipts (flattened).
297    pub goods_receipts: Vec<documents::GoodsReceipt>,
298    /// All vendor invoices (flattened).
299    pub vendor_invoices: Vec<documents::VendorInvoice>,
300    /// All sales orders (flattened).
301    pub sales_orders: Vec<documents::SalesOrder>,
302    /// All deliveries (flattened).
303    pub deliveries: Vec<documents::Delivery>,
304    /// All customer invoices (flattened).
305    pub customer_invoices: Vec<documents::CustomerInvoice>,
306    /// All payments (flattened).
307    pub payments: Vec<documents::Payment>,
308}
309
310/// Subledger snapshot containing generated subledger records.
311#[derive(Debug, Clone, Default)]
312pub struct SubledgerSnapshot {
313    /// AP invoices linked from document flow vendor invoices.
314    pub ap_invoices: Vec<APInvoice>,
315    /// AR invoices linked from document flow customer invoices.
316    pub ar_invoices: Vec<ARInvoice>,
317}
318
319/// OCPM snapshot containing generated OCPM event log data.
320#[derive(Debug, Clone, Default)]
321pub struct OcpmSnapshot {
322    /// OCPM event log (if generated)
323    pub event_log: Option<OcpmEventLog>,
324    /// Number of events generated
325    pub event_count: usize,
326    /// Number of objects generated
327    pub object_count: usize,
328    /// Number of cases generated
329    pub case_count: usize,
330}
331
332/// Audit data snapshot containing all generated audit-related entities.
333#[derive(Debug, Clone, Default)]
334pub struct AuditSnapshot {
335    /// Audit engagements per ISA 210/220.
336    pub engagements: Vec<AuditEngagement>,
337    /// Workpapers per ISA 230.
338    pub workpapers: Vec<Workpaper>,
339    /// Audit evidence per ISA 500.
340    pub evidence: Vec<AuditEvidence>,
341    /// Risk assessments per ISA 315/330.
342    pub risk_assessments: Vec<RiskAssessment>,
343    /// Audit findings per ISA 265.
344    pub findings: Vec<AuditFinding>,
345    /// Professional judgments per ISA 200.
346    pub judgments: Vec<ProfessionalJudgment>,
347}
348
349/// Banking KYC/AML data snapshot containing all generated banking entities.
350#[derive(Debug, Clone, Default)]
351pub struct BankingSnapshot {
352    /// Banking customers (retail, business, trust).
353    pub customers: Vec<BankingCustomer>,
354    /// Bank accounts.
355    pub accounts: Vec<BankAccount>,
356    /// Bank transactions with AML labels.
357    pub transactions: Vec<BankTransaction>,
358    /// Number of suspicious transactions.
359    pub suspicious_count: usize,
360    /// Number of AML scenarios generated.
361    pub scenario_count: usize,
362}
363
/// Graph export snapshot containing exported graph metadata.
///
/// The derived [`Default`] describes "no export performed": `exported` is
/// `false`, `graph_count` is zero and `exports` is empty.
#[derive(Debug, Clone, Default)]
pub struct GraphExportSnapshot {
    /// Whether graph export was performed.
    pub exported: bool,
    /// Number of graphs exported.
    pub graph_count: usize,
    /// Exported graph metadata (by format name).
    pub exports: HashMap<String, GraphExportInfo>,
}

/// Information about an exported graph.
#[derive(Debug, Clone)]
pub struct GraphExportInfo {
    /// Graph name.
    pub name: String,
    /// Export format (pytorch_geometric, neo4j, dgl).
    pub format: String,
    /// Output directory path.
    pub output_path: PathBuf,
    /// Number of nodes.
    pub node_count: usize,
    /// Number of edges.
    pub edge_count: usize,
}
389
390/// Anomaly labels generated during injection.
391#[derive(Debug, Clone, Default)]
392pub struct AnomalyLabels {
393    /// All anomaly labels.
394    pub labels: Vec<LabeledAnomaly>,
395    /// Summary statistics.
396    pub summary: Option<AnomalySummary>,
397    /// Count by anomaly type.
398    pub by_type: HashMap<String, usize>,
399}
400
401/// Balance validation results from running balance tracker.
402#[derive(Debug, Clone, Default)]
403pub struct BalanceValidationResult {
404    /// Whether validation was performed.
405    pub validated: bool,
406    /// Whether balance sheet equation is satisfied.
407    pub is_balanced: bool,
408    /// Number of entries processed.
409    pub entries_processed: u64,
410    /// Total debits across all entries.
411    pub total_debits: rust_decimal::Decimal,
412    /// Total credits across all entries.
413    pub total_credits: rust_decimal::Decimal,
414    /// Number of accounts tracked.
415    pub accounts_tracked: usize,
416    /// Number of companies tracked.
417    pub companies_tracked: usize,
418    /// Validation errors encountered.
419    pub validation_errors: Vec<ValidationError>,
420    /// Whether any unbalanced entries were found.
421    pub has_unbalanced_entries: bool,
422}
423
424/// Complete result of enhanced generation run.
425#[derive(Debug)]
426pub struct EnhancedGenerationResult {
427    /// Generated chart of accounts.
428    pub chart_of_accounts: ChartOfAccounts,
429    /// Master data snapshot.
430    pub master_data: MasterDataSnapshot,
431    /// Document flow snapshot.
432    pub document_flows: DocumentFlowSnapshot,
433    /// Subledger snapshot (linked from document flows).
434    pub subledger: SubledgerSnapshot,
435    /// OCPM event log snapshot (if OCPM generation enabled).
436    pub ocpm: OcpmSnapshot,
437    /// Audit data snapshot (if audit generation enabled).
438    pub audit: AuditSnapshot,
439    /// Banking KYC/AML data snapshot (if banking generation enabled).
440    pub banking: BankingSnapshot,
441    /// Graph export snapshot (if graph export enabled).
442    pub graph_export: GraphExportSnapshot,
443    /// Generated journal entries.
444    pub journal_entries: Vec<JournalEntry>,
445    /// Anomaly labels (if injection enabled).
446    pub anomaly_labels: AnomalyLabels,
447    /// Balance validation results (if validation enabled).
448    pub balance_validation: BalanceValidationResult,
449    /// Data quality statistics (if injection enabled).
450    pub data_quality_stats: DataQualityStats,
451    /// Generation statistics.
452    pub statistics: EnhancedGenerationStatistics,
453    /// Data lineage graph (if tracking enabled).
454    pub lineage: Option<super::lineage::LineageGraph>,
455    /// Quality gate evaluation result.
456    pub gate_result: Option<datasynth_eval::gates::GateResult>,
457}
458
459/// Enhanced statistics about a generation run.
460#[derive(Debug, Clone, Default, Serialize, Deserialize)]
461pub struct EnhancedGenerationStatistics {
462    /// Total journal entries generated.
463    pub total_entries: u64,
464    /// Total line items generated.
465    pub total_line_items: u64,
466    /// Number of accounts in CoA.
467    pub accounts_count: usize,
468    /// Number of companies.
469    pub companies_count: usize,
470    /// Period in months.
471    pub period_months: u32,
472    /// Master data counts.
473    pub vendor_count: usize,
474    pub customer_count: usize,
475    pub material_count: usize,
476    pub asset_count: usize,
477    pub employee_count: usize,
478    /// Document flow counts.
479    pub p2p_chain_count: usize,
480    pub o2c_chain_count: usize,
481    /// Subledger counts.
482    pub ap_invoice_count: usize,
483    pub ar_invoice_count: usize,
484    /// OCPM counts.
485    pub ocpm_event_count: usize,
486    pub ocpm_object_count: usize,
487    pub ocpm_case_count: usize,
488    /// Audit counts.
489    pub audit_engagement_count: usize,
490    pub audit_workpaper_count: usize,
491    pub audit_evidence_count: usize,
492    pub audit_risk_count: usize,
493    pub audit_finding_count: usize,
494    pub audit_judgment_count: usize,
495    /// Anomaly counts.
496    pub anomalies_injected: usize,
497    /// Data quality issue counts.
498    pub data_quality_issues: usize,
499    /// Banking counts.
500    pub banking_customer_count: usize,
501    pub banking_account_count: usize,
502    pub banking_transaction_count: usize,
503    pub banking_suspicious_count: usize,
504    /// Graph export counts.
505    pub graph_export_count: usize,
506    pub graph_node_count: usize,
507    pub graph_edge_count: usize,
508    /// LLM enrichment timing (milliseconds).
509    #[serde(default)]
510    pub llm_enrichment_ms: u64,
511    /// Number of vendor names enriched by LLM.
512    #[serde(default)]
513    pub llm_vendors_enriched: usize,
514    /// Diffusion enhancement timing (milliseconds).
515    #[serde(default)]
516    pub diffusion_enhancement_ms: u64,
517    /// Number of diffusion samples generated.
518    #[serde(default)]
519    pub diffusion_samples_generated: usize,
520    /// Causal generation timing (milliseconds).
521    #[serde(default)]
522    pub causal_generation_ms: u64,
523    /// Number of causal samples generated.
524    #[serde(default)]
525    pub causal_samples_generated: usize,
526    /// Whether causal validation passed.
527    #[serde(default)]
528    pub causal_validation_passed: Option<bool>,
529}
530
531/// Enhanced orchestrator with full feature integration.
532pub struct EnhancedOrchestrator {
533    config: GeneratorConfig,
534    phase_config: PhaseConfig,
535    coa: Option<Arc<ChartOfAccounts>>,
536    master_data: MasterDataSnapshot,
537    seed: u64,
538    multi_progress: Option<MultiProgress>,
539    /// Resource guard for memory, disk, and CPU monitoring
540    resource_guard: ResourceGuard,
541    /// Output path for disk space monitoring
542    output_path: Option<PathBuf>,
543    /// Copula generators for preserving correlations (from fingerprint)
544    copula_generators: Vec<CopulaGeneratorSpec>,
545}
546
547impl EnhancedOrchestrator {
548    /// Create a new enhanced orchestrator.
549    pub fn new(config: GeneratorConfig, phase_config: PhaseConfig) -> SynthResult<Self> {
550        datasynth_config::validate_config(&config)?;
551
552        let seed = config.global.seed.unwrap_or_else(rand::random);
553
554        // Build resource guard from config
555        let resource_guard = Self::build_resource_guard(&config, None);
556
557        Ok(Self {
558            config,
559            phase_config,
560            coa: None,
561            master_data: MasterDataSnapshot::default(),
562            seed,
563            multi_progress: None,
564            resource_guard,
565            output_path: None,
566            copula_generators: Vec::new(),
567        })
568    }
569
570    /// Create with default phase config.
571    pub fn with_defaults(config: GeneratorConfig) -> SynthResult<Self> {
572        Self::new(config, PhaseConfig::default())
573    }
574
575    /// Enable/disable progress bars.
576    pub fn with_progress(mut self, show: bool) -> Self {
577        self.phase_config.show_progress = show;
578        if show {
579            self.multi_progress = Some(MultiProgress::new());
580        }
581        self
582    }
583
584    /// Set the output path for disk space monitoring.
585    pub fn with_output_path<P: Into<PathBuf>>(mut self, path: P) -> Self {
586        let path = path.into();
587        self.output_path = Some(path.clone());
588        // Rebuild resource guard with the output path
589        self.resource_guard = Self::build_resource_guard(&self.config, Some(path));
590        self
591    }
592
593    /// Check if copula generators are available.
594    ///
595    /// Returns true if the orchestrator has copula generators for preserving
596    /// correlations (typically from fingerprint-based generation).
597    pub fn has_copulas(&self) -> bool {
598        !self.copula_generators.is_empty()
599    }
600
601    /// Get the copula generators.
602    ///
603    /// Returns a reference to the copula generators for use during generation.
604    /// These can be used to generate correlated samples that preserve the
605    /// statistical relationships from the source data.
606    pub fn copulas(&self) -> &[CopulaGeneratorSpec] {
607        &self.copula_generators
608    }
609
610    /// Get a mutable reference to the copula generators.
611    ///
612    /// Allows generators to sample from copulas during data generation.
613    pub fn copulas_mut(&mut self) -> &mut [CopulaGeneratorSpec] {
614        &mut self.copula_generators
615    }
616
617    /// Sample correlated values from a named copula.
618    ///
619    /// Returns None if the copula doesn't exist.
620    pub fn sample_from_copula(&mut self, copula_name: &str) -> Option<Vec<f64>> {
621        self.copula_generators
622            .iter_mut()
623            .find(|c| c.name == copula_name)
624            .map(|c| c.generator.sample())
625    }
626
627    /// Create an orchestrator from a fingerprint file.
628    ///
629    /// This reads the fingerprint, synthesizes a GeneratorConfig from it,
630    /// and creates an orchestrator configured to generate data matching
631    /// the statistical properties of the original data.
632    ///
633    /// # Arguments
634    /// * `fingerprint_path` - Path to the .dsf fingerprint file
635    /// * `phase_config` - Phase configuration for generation
636    /// * `scale` - Scale factor for row counts (1.0 = same as original)
637    ///
638    /// # Example
639    /// ```no_run
640    /// use datasynth_runtime::{EnhancedOrchestrator, PhaseConfig};
641    /// use std::path::Path;
642    ///
643    /// let orchestrator = EnhancedOrchestrator::from_fingerprint(
644    ///     Path::new("fingerprint.dsf"),
645    ///     PhaseConfig::default(),
646    ///     1.0,
647    /// ).unwrap();
648    /// ```
649    pub fn from_fingerprint(
650        fingerprint_path: &std::path::Path,
651        phase_config: PhaseConfig,
652        scale: f64,
653    ) -> SynthResult<Self> {
654        info!("Loading fingerprint from: {}", fingerprint_path.display());
655
656        // Read the fingerprint
657        let reader = FingerprintReader::new();
658        let fingerprint = reader
659            .read_from_file(fingerprint_path)
660            .map_err(|e| SynthError::config(format!("Failed to read fingerprint: {}", e)))?;
661
662        Self::from_fingerprint_data(fingerprint, phase_config, scale)
663    }
664
665    /// Create an orchestrator from a loaded fingerprint.
666    ///
667    /// # Arguments
668    /// * `fingerprint` - The loaded fingerprint
669    /// * `phase_config` - Phase configuration for generation
670    /// * `scale` - Scale factor for row counts (1.0 = same as original)
671    pub fn from_fingerprint_data(
672        fingerprint: Fingerprint,
673        phase_config: PhaseConfig,
674        scale: f64,
675    ) -> SynthResult<Self> {
676        info!(
677            "Synthesizing config from fingerprint (version: {}, tables: {})",
678            fingerprint.manifest.version,
679            fingerprint.schema.tables.len()
680        );
681
682        // Generate a seed for the synthesis
683        let seed: u64 = rand::random();
684
685        // Use ConfigSynthesizer with scale option to convert fingerprint to GeneratorConfig
686        let options = SynthesisOptions {
687            scale,
688            seed: Some(seed),
689            preserve_correlations: true,
690            inject_anomalies: true,
691        };
692        let synthesizer = ConfigSynthesizer::with_options(options);
693
694        // Synthesize full result including copula generators
695        let synthesis_result = synthesizer
696            .synthesize_full(&fingerprint, seed)
697            .map_err(|e| {
698                SynthError::config(format!(
699                    "Failed to synthesize config from fingerprint: {}",
700                    e
701                ))
702            })?;
703
704        // Start with a base config from the fingerprint's industry if available
705        let mut config = if let Some(ref industry) = fingerprint.manifest.source.industry {
706            Self::base_config_for_industry(industry)
707        } else {
708            Self::base_config_for_industry("manufacturing")
709        };
710
711        // Apply the synthesized patches
712        config = Self::apply_config_patch(config, &synthesis_result.config_patch);
713
714        // Log synthesis results
715        info!(
716            "Config synthesized: {} tables, scale={:.2}, copula generators: {}",
717            fingerprint.schema.tables.len(),
718            scale,
719            synthesis_result.copula_generators.len()
720        );
721
722        if !synthesis_result.copula_generators.is_empty() {
723            for spec in &synthesis_result.copula_generators {
724                info!(
725                    "  Copula '{}' for table '{}': {} columns",
726                    spec.name,
727                    spec.table,
728                    spec.columns.len()
729                );
730            }
731        }
732
733        // Create the orchestrator with the synthesized config
734        let mut orchestrator = Self::new(config, phase_config)?;
735
736        // Store copula generators for use during generation
737        orchestrator.copula_generators = synthesis_result.copula_generators;
738
739        Ok(orchestrator)
740    }
741
742    /// Create a base config for a given industry.
743    fn base_config_for_industry(industry: &str) -> GeneratorConfig {
744        use datasynth_config::presets::create_preset;
745        use datasynth_config::TransactionVolume;
746        use datasynth_core::models::{CoAComplexity, IndustrySector};
747
748        let sector = match industry.to_lowercase().as_str() {
749            "manufacturing" => IndustrySector::Manufacturing,
750            "retail" => IndustrySector::Retail,
751            "financial" | "financial_services" => IndustrySector::FinancialServices,
752            "healthcare" => IndustrySector::Healthcare,
753            "technology" | "tech" => IndustrySector::Technology,
754            _ => IndustrySector::Manufacturing,
755        };
756
757        // Create a preset with reasonable defaults
758        create_preset(
759            sector,
760            1,  // company count
761            12, // period months
762            CoAComplexity::Medium,
763            TransactionVolume::TenK,
764        )
765    }
766
767    /// Apply a config patch to a GeneratorConfig.
768    fn apply_config_patch(
769        mut config: GeneratorConfig,
770        patch: &datasynth_fingerprint::synthesis::ConfigPatch,
771    ) -> GeneratorConfig {
772        use datasynth_fingerprint::synthesis::ConfigValue;
773
774        for (key, value) in patch.values() {
775            match (key.as_str(), value) {
776                // Transaction count is handled via TransactionVolume enum on companies
777                // Log it but cannot directly set it (would need to modify company volumes)
778                ("transactions.count", ConfigValue::Integer(n)) => {
779                    info!(
780                        "Fingerprint suggests {} transactions (apply via company volumes)",
781                        n
782                    );
783                }
784                ("global.period_months", ConfigValue::Integer(n)) => {
785                    config.global.period_months = *n as u32;
786                }
787                ("global.start_date", ConfigValue::String(s)) => {
788                    config.global.start_date = s.clone();
789                }
790                ("global.seed", ConfigValue::Integer(n)) => {
791                    config.global.seed = Some(*n as u64);
792                }
793                ("fraud.enabled", ConfigValue::Bool(b)) => {
794                    config.fraud.enabled = *b;
795                }
796                ("fraud.fraud_rate", ConfigValue::Float(f)) => {
797                    config.fraud.fraud_rate = *f;
798                }
799                ("data_quality.enabled", ConfigValue::Bool(b)) => {
800                    config.data_quality.enabled = *b;
801                }
802                // Handle anomaly injection paths (mapped to fraud config)
803                ("anomaly_injection.enabled", ConfigValue::Bool(b)) => {
804                    config.fraud.enabled = *b;
805                }
806                ("anomaly_injection.overall_rate", ConfigValue::Float(f)) => {
807                    config.fraud.fraud_rate = *f;
808                }
809                _ => {
810                    debug!("Ignoring unknown config patch key: {}", key);
811                }
812            }
813        }
814
815        config
816    }
817
818    /// Build a resource guard from the configuration.
819    fn build_resource_guard(
820        config: &GeneratorConfig,
821        output_path: Option<PathBuf>,
822    ) -> ResourceGuard {
823        let mut builder = ResourceGuardBuilder::new();
824
825        // Configure memory limit if set
826        if config.global.memory_limit_mb > 0 {
827            builder = builder.memory_limit(config.global.memory_limit_mb);
828        }
829
830        // Configure disk monitoring for output path
831        if let Some(path) = output_path {
832            builder = builder.output_path(path).min_free_disk(100); // Require at least 100 MB free
833        }
834
835        // Use conservative degradation settings for production safety
836        builder = builder.conservative();
837
838        builder.build()
839    }
840
841    /// Check resources (memory, disk, CPU) and return degradation level.
842    ///
843    /// Returns an error if hard limits are exceeded.
844    /// Returns Ok(DegradationLevel) indicating current resource state.
845    fn check_resources(&self) -> SynthResult<DegradationLevel> {
846        self.resource_guard.check()
847    }
848
849    /// Check resources with logging.
850    fn check_resources_with_log(&self, phase: &str) -> SynthResult<DegradationLevel> {
851        let level = self.resource_guard.check()?;
852
853        if level != DegradationLevel::Normal {
854            warn!(
855                "Resource degradation at {}: level={}, memory={}MB, disk={}MB",
856                phase,
857                level,
858                self.resource_guard.current_memory_mb(),
859                self.resource_guard.available_disk_mb()
860            );
861        }
862
863        Ok(level)
864    }
865
866    /// Get current degradation actions based on resource state.
867    fn get_degradation_actions(&self) -> DegradationActions {
868        self.resource_guard.get_actions()
869    }
870
871    /// Legacy method for backwards compatibility - now uses ResourceGuard.
872    fn check_memory_limit(&self) -> SynthResult<()> {
873        self.check_resources()?;
874        Ok(())
875    }
876
877    /// Run the complete generation workflow.
878    pub fn generate(&mut self) -> SynthResult<EnhancedGenerationResult> {
879        info!("Starting enhanced generation workflow");
880        info!(
881            "Config: industry={:?}, period_months={}, companies={}",
882            self.config.global.industry,
883            self.config.global.period_months,
884            self.config.companies.len()
885        );
886
887        // Initial resource check before starting
888        let initial_level = self.check_resources_with_log("initial")?;
889        if initial_level == DegradationLevel::Emergency {
890            return Err(SynthError::resource(
891                "Insufficient resources to start generation",
892            ));
893        }
894
895        let mut stats = EnhancedGenerationStatistics {
896            companies_count: self.config.companies.len(),
897            period_months: self.config.global.period_months,
898            ..Default::default()
899        };
900
901        // Phase 1: Chart of Accounts
902        let coa = self.phase_chart_of_accounts(&mut stats)?;
903
904        // Phase 2: Master Data
905        self.phase_master_data(&mut stats)?;
906
907        // Phase 3: Document Flows + Subledger Linking
908        let (document_flows, subledger) = self.phase_document_flows(&mut stats)?;
909
910        // Phase 3c: OCPM Events
911        let ocpm = self.phase_ocpm_events(&document_flows, &mut stats)?;
912
913        // Phase 4: Journal Entries
914        let mut entries = self.phase_journal_entries(&coa, &document_flows, &mut stats)?;
915
916        // Get current degradation actions for optional phases
917        let actions = self.get_degradation_actions();
918
919        // Phase 5: Anomaly Injection
920        let anomaly_labels = self.phase_anomaly_injection(&mut entries, &actions, &mut stats)?;
921
922        // Phase 6: Balance Validation
923        let balance_validation = self.phase_balance_validation(&entries)?;
924
925        // Phase 7: Data Quality Injection
926        let data_quality_stats =
927            self.phase_data_quality_injection(&mut entries, &actions, &mut stats)?;
928
929        // Phase 8: Audit Data
930        let audit = self.phase_audit_data(&entries, &mut stats)?;
931
932        // Phase 9: Banking KYC/AML Data
933        let banking = self.phase_banking_data(&mut stats)?;
934
935        // Phase 10: Graph Export
936        let graph_export = self.phase_graph_export(&entries, &coa, &mut stats)?;
937
938        // Phase 10b: Hypergraph Export
939        self.phase_hypergraph_export(&coa, &entries, &document_flows, &mut stats)?;
940
941        // Phase 11: LLM Enrichment
942        self.phase_llm_enrichment(&mut stats);
943
944        // Phase 12: Diffusion Enhancement
945        self.phase_diffusion_enhancement(&mut stats);
946
947        // Phase 13: Causal Overlay
948        self.phase_causal_overlay(&mut stats);
949
950        // Log final resource statistics
951        let resource_stats = self.resource_guard.stats();
952        info!(
953            "Generation workflow complete. Resource stats: memory_peak={}MB, disk_written={}bytes, degradation_level={}",
954            resource_stats.memory.peak_resident_bytes / (1024 * 1024),
955            resource_stats.disk.estimated_bytes_written,
956            resource_stats.degradation_level
957        );
958
959        // Build data lineage graph
960        let lineage = self.build_lineage_graph();
961
962        Ok(EnhancedGenerationResult {
963            chart_of_accounts: (*coa).clone(),
964            master_data: self.master_data.clone(),
965            document_flows,
966            subledger,
967            ocpm,
968            audit,
969            banking,
970            graph_export,
971            journal_entries: entries,
972            anomaly_labels,
973            balance_validation,
974            data_quality_stats,
975            statistics: stats,
976            lineage: Some(lineage),
977            gate_result: None,
978        })
979    }
980
981    // ========================================================================
982    // Generation Phase Methods
983    // ========================================================================
984
985    /// Phase 1: Generate Chart of Accounts and update statistics.
986    fn phase_chart_of_accounts(
987        &mut self,
988        stats: &mut EnhancedGenerationStatistics,
989    ) -> SynthResult<Arc<ChartOfAccounts>> {
990        info!("Phase 1: Generating Chart of Accounts");
991        let coa = self.generate_coa()?;
992        stats.accounts_count = coa.account_count();
993        info!(
994            "Chart of Accounts generated: {} accounts",
995            stats.accounts_count
996        );
997        self.check_resources_with_log("post-coa")?;
998        Ok(coa)
999    }
1000
1001    /// Phase 2: Generate master data (vendors, customers, materials, assets, employees).
1002    fn phase_master_data(&mut self, stats: &mut EnhancedGenerationStatistics) -> SynthResult<()> {
1003        if self.phase_config.generate_master_data {
1004            info!("Phase 2: Generating Master Data");
1005            self.generate_master_data()?;
1006            stats.vendor_count = self.master_data.vendors.len();
1007            stats.customer_count = self.master_data.customers.len();
1008            stats.material_count = self.master_data.materials.len();
1009            stats.asset_count = self.master_data.assets.len();
1010            stats.employee_count = self.master_data.employees.len();
1011            info!(
1012                "Master data generated: {} vendors, {} customers, {} materials, {} assets, {} employees",
1013                stats.vendor_count, stats.customer_count, stats.material_count,
1014                stats.asset_count, stats.employee_count
1015            );
1016            self.check_resources_with_log("post-master-data")?;
1017        } else {
1018            debug!("Phase 2: Skipped (master data generation disabled)");
1019        }
1020        Ok(())
1021    }
1022
1023    /// Phase 3: Generate document flows (P2P and O2C) and link to subledgers.
1024    fn phase_document_flows(
1025        &mut self,
1026        stats: &mut EnhancedGenerationStatistics,
1027    ) -> SynthResult<(DocumentFlowSnapshot, SubledgerSnapshot)> {
1028        let mut document_flows = DocumentFlowSnapshot::default();
1029        let mut subledger = SubledgerSnapshot::default();
1030
1031        if self.phase_config.generate_document_flows && !self.master_data.vendors.is_empty() {
1032            info!("Phase 3: Generating Document Flows");
1033            self.generate_document_flows(&mut document_flows)?;
1034            stats.p2p_chain_count = document_flows.p2p_chains.len();
1035            stats.o2c_chain_count = document_flows.o2c_chains.len();
1036            info!(
1037                "Document flows generated: {} P2P chains, {} O2C chains",
1038                stats.p2p_chain_count, stats.o2c_chain_count
1039            );
1040
1041            // Phase 3b: Link document flows to subledgers (for data coherence)
1042            debug!("Phase 3b: Linking document flows to subledgers");
1043            subledger = self.link_document_flows_to_subledgers(&document_flows)?;
1044            stats.ap_invoice_count = subledger.ap_invoices.len();
1045            stats.ar_invoice_count = subledger.ar_invoices.len();
1046            debug!(
1047                "Subledgers linked: {} AP invoices, {} AR invoices",
1048                stats.ap_invoice_count, stats.ar_invoice_count
1049            );
1050
1051            self.check_resources_with_log("post-document-flows")?;
1052        } else {
1053            debug!("Phase 3: Skipped (document flow generation disabled or no master data)");
1054        }
1055
1056        Ok((document_flows, subledger))
1057    }
1058
1059    /// Phase 3c: Generate OCPM events from document flows.
1060    fn phase_ocpm_events(
1061        &mut self,
1062        document_flows: &DocumentFlowSnapshot,
1063        stats: &mut EnhancedGenerationStatistics,
1064    ) -> SynthResult<OcpmSnapshot> {
1065        if self.phase_config.generate_ocpm_events && !document_flows.p2p_chains.is_empty() {
1066            info!("Phase 3c: Generating OCPM Events");
1067            let ocpm_snapshot = self.generate_ocpm_events(document_flows)?;
1068            stats.ocpm_event_count = ocpm_snapshot.event_count;
1069            stats.ocpm_object_count = ocpm_snapshot.object_count;
1070            stats.ocpm_case_count = ocpm_snapshot.case_count;
1071            info!(
1072                "OCPM events generated: {} events, {} objects, {} cases",
1073                stats.ocpm_event_count, stats.ocpm_object_count, stats.ocpm_case_count
1074            );
1075            self.check_resources_with_log("post-ocpm")?;
1076            Ok(ocpm_snapshot)
1077        } else {
1078            debug!("Phase 3c: Skipped (OCPM generation disabled or no document flows)");
1079            Ok(OcpmSnapshot::default())
1080        }
1081    }
1082
1083    /// Phase 4: Generate journal entries from document flows and standalone generation.
1084    fn phase_journal_entries(
1085        &mut self,
1086        coa: &Arc<ChartOfAccounts>,
1087        document_flows: &DocumentFlowSnapshot,
1088        stats: &mut EnhancedGenerationStatistics,
1089    ) -> SynthResult<Vec<JournalEntry>> {
1090        let mut entries = Vec::new();
1091
1092        // Phase 4a: Generate JEs from document flows (for data coherence)
1093        if self.phase_config.generate_document_flows && !document_flows.p2p_chains.is_empty() {
1094            debug!("Phase 4a: Generating JEs from document flows");
1095            let flow_entries = self.generate_jes_from_document_flows(document_flows)?;
1096            debug!("Generated {} JEs from document flows", flow_entries.len());
1097            entries.extend(flow_entries);
1098        }
1099
1100        // Phase 4b: Generate standalone journal entries
1101        if self.phase_config.generate_journal_entries {
1102            info!("Phase 4: Generating Journal Entries");
1103            let je_entries = self.generate_journal_entries(coa)?;
1104            info!("Generated {} standalone journal entries", je_entries.len());
1105            entries.extend(je_entries);
1106        } else {
1107            debug!("Phase 4: Skipped (journal entry generation disabled)");
1108        }
1109
1110        if !entries.is_empty() {
1111            stats.total_entries = entries.len() as u64;
1112            stats.total_line_items = entries.iter().map(|e| e.line_count() as u64).sum();
1113            info!(
1114                "Total entries: {}, total line items: {}",
1115                stats.total_entries, stats.total_line_items
1116            );
1117            self.check_resources_with_log("post-journal-entries")?;
1118        }
1119
1120        Ok(entries)
1121    }
1122
1123    /// Phase 5: Inject anomalies into journal entries.
1124    fn phase_anomaly_injection(
1125        &mut self,
1126        entries: &mut [JournalEntry],
1127        actions: &DegradationActions,
1128        stats: &mut EnhancedGenerationStatistics,
1129    ) -> SynthResult<AnomalyLabels> {
1130        if self.phase_config.inject_anomalies
1131            && !entries.is_empty()
1132            && !actions.skip_anomaly_injection
1133        {
1134            info!("Phase 5: Injecting Anomalies");
1135            let result = self.inject_anomalies(entries)?;
1136            stats.anomalies_injected = result.labels.len();
1137            info!("Injected {} anomalies", stats.anomalies_injected);
1138            self.check_resources_with_log("post-anomaly-injection")?;
1139            Ok(result)
1140        } else if actions.skip_anomaly_injection {
1141            warn!("Phase 5: Skipped due to resource degradation");
1142            Ok(AnomalyLabels::default())
1143        } else {
1144            debug!("Phase 5: Skipped (anomaly injection disabled or no entries)");
1145            Ok(AnomalyLabels::default())
1146        }
1147    }
1148
1149    /// Phase 6: Validate balance sheet equation on journal entries.
1150    fn phase_balance_validation(
1151        &mut self,
1152        entries: &[JournalEntry],
1153    ) -> SynthResult<BalanceValidationResult> {
1154        if self.phase_config.validate_balances && !entries.is_empty() {
1155            debug!("Phase 6: Validating Balances");
1156            let balance_validation = self.validate_journal_entries(entries)?;
1157            if balance_validation.is_balanced {
1158                debug!("Balance validation passed");
1159            } else {
1160                warn!(
1161                    "Balance validation found {} errors",
1162                    balance_validation.validation_errors.len()
1163                );
1164            }
1165            Ok(balance_validation)
1166        } else {
1167            Ok(BalanceValidationResult::default())
1168        }
1169    }
1170
1171    /// Phase 7: Inject data quality variations (typos, missing values, format issues).
1172    fn phase_data_quality_injection(
1173        &mut self,
1174        entries: &mut [JournalEntry],
1175        actions: &DegradationActions,
1176        stats: &mut EnhancedGenerationStatistics,
1177    ) -> SynthResult<DataQualityStats> {
1178        if self.phase_config.inject_data_quality
1179            && !entries.is_empty()
1180            && !actions.skip_data_quality
1181        {
1182            info!("Phase 7: Injecting Data Quality Variations");
1183            let dq_stats = self.inject_data_quality(entries)?;
1184            stats.data_quality_issues = dq_stats.records_with_issues;
1185            info!("Injected {} data quality issues", stats.data_quality_issues);
1186            self.check_resources_with_log("post-data-quality")?;
1187            Ok(dq_stats)
1188        } else if actions.skip_data_quality {
1189            warn!("Phase 7: Skipped due to resource degradation");
1190            Ok(DataQualityStats::default())
1191        } else {
1192            debug!("Phase 7: Skipped (data quality injection disabled or no entries)");
1193            Ok(DataQualityStats::default())
1194        }
1195    }
1196
1197    /// Phase 8: Generate audit data (engagements, workpapers, evidence, risks, findings).
1198    fn phase_audit_data(
1199        &mut self,
1200        entries: &[JournalEntry],
1201        stats: &mut EnhancedGenerationStatistics,
1202    ) -> SynthResult<AuditSnapshot> {
1203        if self.phase_config.generate_audit {
1204            info!("Phase 8: Generating Audit Data");
1205            let audit_snapshot = self.generate_audit_data(entries)?;
1206            stats.audit_engagement_count = audit_snapshot.engagements.len();
1207            stats.audit_workpaper_count = audit_snapshot.workpapers.len();
1208            stats.audit_evidence_count = audit_snapshot.evidence.len();
1209            stats.audit_risk_count = audit_snapshot.risk_assessments.len();
1210            stats.audit_finding_count = audit_snapshot.findings.len();
1211            stats.audit_judgment_count = audit_snapshot.judgments.len();
1212            info!(
1213                "Audit data generated: {} engagements, {} workpapers, {} evidence, {} risks, {} findings, {} judgments",
1214                stats.audit_engagement_count, stats.audit_workpaper_count,
1215                stats.audit_evidence_count, stats.audit_risk_count,
1216                stats.audit_finding_count, stats.audit_judgment_count
1217            );
1218            self.check_resources_with_log("post-audit")?;
1219            Ok(audit_snapshot)
1220        } else {
1221            debug!("Phase 8: Skipped (audit generation disabled)");
1222            Ok(AuditSnapshot::default())
1223        }
1224    }
1225
1226    /// Phase 9: Generate banking KYC/AML data.
1227    fn phase_banking_data(
1228        &mut self,
1229        stats: &mut EnhancedGenerationStatistics,
1230    ) -> SynthResult<BankingSnapshot> {
1231        if self.phase_config.generate_banking && self.config.banking.enabled {
1232            info!("Phase 9: Generating Banking KYC/AML Data");
1233            let banking_snapshot = self.generate_banking_data()?;
1234            stats.banking_customer_count = banking_snapshot.customers.len();
1235            stats.banking_account_count = banking_snapshot.accounts.len();
1236            stats.banking_transaction_count = banking_snapshot.transactions.len();
1237            stats.banking_suspicious_count = banking_snapshot.suspicious_count;
1238            info!(
1239                "Banking data generated: {} customers, {} accounts, {} transactions ({} suspicious)",
1240                stats.banking_customer_count, stats.banking_account_count,
1241                stats.banking_transaction_count, stats.banking_suspicious_count
1242            );
1243            self.check_resources_with_log("post-banking")?;
1244            Ok(banking_snapshot)
1245        } else {
1246            debug!("Phase 9: Skipped (banking generation disabled)");
1247            Ok(BankingSnapshot::default())
1248        }
1249    }
1250
1251    /// Phase 10: Export accounting network graphs for ML training.
1252    fn phase_graph_export(
1253        &mut self,
1254        entries: &[JournalEntry],
1255        coa: &Arc<ChartOfAccounts>,
1256        stats: &mut EnhancedGenerationStatistics,
1257    ) -> SynthResult<GraphExportSnapshot> {
1258        if (self.phase_config.generate_graph_export || self.config.graph_export.enabled)
1259            && !entries.is_empty()
1260        {
1261            info!("Phase 10: Exporting Accounting Network Graphs");
1262            match self.export_graphs(entries, coa, stats) {
1263                Ok(snapshot) => {
1264                    info!(
1265                        "Graph export complete: {} graphs ({} nodes, {} edges)",
1266                        snapshot.graph_count, stats.graph_node_count, stats.graph_edge_count
1267                    );
1268                    Ok(snapshot)
1269                }
1270                Err(e) => {
1271                    warn!("Phase 10: Graph export failed: {}", e);
1272                    Ok(GraphExportSnapshot::default())
1273                }
1274            }
1275        } else {
1276            debug!("Phase 10: Skipped (graph export disabled or no entries)");
1277            Ok(GraphExportSnapshot::default())
1278        }
1279    }
1280
1281    /// Phase 10b: Export multi-layer hypergraph for RustGraph integration.
1282    fn phase_hypergraph_export(
1283        &self,
1284        coa: &Arc<ChartOfAccounts>,
1285        entries: &[JournalEntry],
1286        document_flows: &DocumentFlowSnapshot,
1287        stats: &mut EnhancedGenerationStatistics,
1288    ) -> SynthResult<()> {
1289        if self.config.graph_export.hypergraph.enabled && !entries.is_empty() {
1290            info!("Phase 10b: Exporting Multi-Layer Hypergraph");
1291            match self.export_hypergraph(coa, entries, document_flows, stats) {
1292                Ok(info) => {
1293                    info!(
1294                        "Hypergraph export complete: {} nodes, {} edges, {} hyperedges",
1295                        info.node_count, info.edge_count, info.hyperedge_count
1296                    );
1297                }
1298                Err(e) => {
1299                    warn!("Phase 10b: Hypergraph export failed: {}", e);
1300                }
1301            }
1302        } else {
1303            debug!("Phase 10b: Skipped (hypergraph export disabled or no entries)");
1304        }
1305        Ok(())
1306    }
1307
1308    /// Phase 11: LLM Enrichment.
1309    ///
1310    /// Uses an LLM provider (mock by default) to enrich vendor names with
1311    /// realistic, context-aware names. This phase is non-blocking: failures
1312    /// log a warning but do not stop the generation pipeline.
1313    fn phase_llm_enrichment(&mut self, stats: &mut EnhancedGenerationStatistics) {
1314        if !self.config.llm.enabled {
1315            debug!("Phase 11: Skipped (LLM enrichment disabled)");
1316            return;
1317        }
1318
1319        info!("Phase 11: Starting LLM Enrichment");
1320        let start = std::time::Instant::now();
1321
1322        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1323            let provider = Arc::new(MockLlmProvider::new(self.seed));
1324            let enricher = VendorLlmEnricher::new(provider);
1325
1326            let industry = format!("{:?}", self.config.global.industry);
1327            let max_enrichments = self
1328                .config
1329                .llm
1330                .max_vendor_enrichments
1331                .min(self.master_data.vendors.len());
1332
1333            let mut enriched_count = 0usize;
1334            for vendor in self.master_data.vendors.iter_mut().take(max_enrichments) {
1335                match enricher.enrich_vendor_name(&industry, "general", &vendor.country) {
1336                    Ok(name) => {
1337                        vendor.name = name;
1338                        enriched_count += 1;
1339                    }
1340                    Err(e) => {
1341                        warn!(
1342                            "LLM vendor enrichment failed for {}: {}",
1343                            vendor.vendor_id, e
1344                        );
1345                    }
1346                }
1347            }
1348
1349            enriched_count
1350        }));
1351
1352        match result {
1353            Ok(enriched_count) => {
1354                stats.llm_vendors_enriched = enriched_count;
1355                let elapsed = start.elapsed();
1356                stats.llm_enrichment_ms = elapsed.as_millis() as u64;
1357                info!(
1358                    "Phase 11 complete: {} vendors enriched in {}ms",
1359                    enriched_count, stats.llm_enrichment_ms
1360                );
1361            }
1362            Err(_) => {
1363                let elapsed = start.elapsed();
1364                stats.llm_enrichment_ms = elapsed.as_millis() as u64;
1365                warn!("Phase 11: LLM enrichment failed (panic caught), continuing");
1366            }
1367        }
1368    }
1369
1370    /// Phase 12: Diffusion Enhancement.
1371    ///
1372    /// Generates a sample set using the statistical diffusion backend to
1373    /// demonstrate distribution-matching data generation. This phase is
1374    /// non-blocking: failures log a warning but do not stop the pipeline.
1375    fn phase_diffusion_enhancement(&self, stats: &mut EnhancedGenerationStatistics) {
1376        if !self.config.diffusion.enabled {
1377            debug!("Phase 12: Skipped (diffusion enhancement disabled)");
1378            return;
1379        }
1380
1381        info!("Phase 12: Starting Diffusion Enhancement");
1382        let start = std::time::Instant::now();
1383
1384        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1385            // Target distribution: transaction amounts (log-normal-like)
1386            let means = vec![5000.0, 3.0, 2.0]; // amount, line_items, approval_level
1387            let stds = vec![2000.0, 1.5, 1.0];
1388
1389            let diffusion_config = DiffusionConfig {
1390                n_steps: self.config.diffusion.n_steps,
1391                seed: self.seed,
1392                ..Default::default()
1393            };
1394
1395            let backend = StatisticalDiffusionBackend::new(means, stds, diffusion_config);
1396
1397            let n_samples = self.config.diffusion.sample_size;
1398            let n_features = 3; // amount, line_items, approval_level
1399            let samples = backend.generate(n_samples, n_features, self.seed);
1400
1401            samples.len()
1402        }));
1403
1404        match result {
1405            Ok(sample_count) => {
1406                stats.diffusion_samples_generated = sample_count;
1407                let elapsed = start.elapsed();
1408                stats.diffusion_enhancement_ms = elapsed.as_millis() as u64;
1409                info!(
1410                    "Phase 12 complete: {} diffusion samples generated in {}ms",
1411                    sample_count, stats.diffusion_enhancement_ms
1412                );
1413            }
1414            Err(_) => {
1415                let elapsed = start.elapsed();
1416                stats.diffusion_enhancement_ms = elapsed.as_millis() as u64;
1417                warn!("Phase 12: Diffusion enhancement failed (panic caught), continuing");
1418            }
1419        }
1420    }
1421
1422    /// Phase 13: Causal Overlay.
1423    ///
1424    /// Builds a structural causal model from a built-in template (e.g.,
1425    /// fraud_detection) and generates causal samples. Optionally validates
1426    /// that the output respects the causal structure. This phase is
1427    /// non-blocking: failures log a warning but do not stop the pipeline.
1428    fn phase_causal_overlay(&self, stats: &mut EnhancedGenerationStatistics) {
1429        if !self.config.causal.enabled {
1430            debug!("Phase 13: Skipped (causal generation disabled)");
1431            return;
1432        }
1433
1434        info!("Phase 13: Starting Causal Overlay");
1435        let start = std::time::Instant::now();
1436
1437        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1438            // Select template based on config
1439            let graph = match self.config.causal.template.as_str() {
1440                "revenue_cycle" => CausalGraph::revenue_cycle_template(),
1441                _ => CausalGraph::fraud_detection_template(),
1442            };
1443
1444            let scm = StructuralCausalModel::new(graph.clone())
1445                .map_err(|e| SynthError::generation(format!("Failed to build SCM: {}", e)))?;
1446
1447            let n_samples = self.config.causal.sample_size;
1448            let samples = scm
1449                .generate(n_samples, self.seed)
1450                .map_err(|e| SynthError::generation(format!("SCM generation failed: {}", e)))?;
1451
1452            // Optionally validate causal structure
1453            let validation_passed = if self.config.causal.validate {
1454                let report = CausalValidator::validate_causal_structure(&samples, &graph);
1455                if report.valid {
1456                    info!(
1457                        "Causal validation passed: all {} checks OK",
1458                        report.checks.len()
1459                    );
1460                } else {
1461                    warn!(
1462                        "Causal validation: {} violations detected: {:?}",
1463                        report.violations.len(),
1464                        report.violations
1465                    );
1466                }
1467                Some(report.valid)
1468            } else {
1469                None
1470            };
1471
1472            Ok::<(usize, Option<bool>), SynthError>((samples.len(), validation_passed))
1473        }));
1474
1475        match result {
1476            Ok(Ok((sample_count, validation_passed))) => {
1477                stats.causal_samples_generated = sample_count;
1478                stats.causal_validation_passed = validation_passed;
1479                let elapsed = start.elapsed();
1480                stats.causal_generation_ms = elapsed.as_millis() as u64;
1481                info!(
1482                    "Phase 13 complete: {} causal samples generated in {}ms (validation: {:?})",
1483                    sample_count, stats.causal_generation_ms, validation_passed,
1484                );
1485            }
1486            Ok(Err(e)) => {
1487                let elapsed = start.elapsed();
1488                stats.causal_generation_ms = elapsed.as_millis() as u64;
1489                warn!("Phase 13: Causal generation failed: {}", e);
1490            }
1491            Err(_) => {
1492                let elapsed = start.elapsed();
1493                stats.causal_generation_ms = elapsed.as_millis() as u64;
1494                warn!("Phase 13: Causal generation failed (panic caught), continuing");
1495            }
1496        }
1497    }
1498
1499    /// Generate the chart of accounts.
1500    fn generate_coa(&mut self) -> SynthResult<Arc<ChartOfAccounts>> {
1501        let pb = self.create_progress_bar(1, "Generating Chart of Accounts");
1502
1503        let mut gen = ChartOfAccountsGenerator::new(
1504            self.config.chart_of_accounts.complexity,
1505            self.config.global.industry,
1506            self.seed,
1507        );
1508
1509        let coa = Arc::new(gen.generate());
1510        self.coa = Some(Arc::clone(&coa));
1511
1512        if let Some(pb) = pb {
1513            pb.finish_with_message("Chart of Accounts complete");
1514        }
1515
1516        Ok(coa)
1517    }
1518
1519    /// Generate master data entities.
1520    fn generate_master_data(&mut self) -> SynthResult<()> {
1521        let start_date = NaiveDate::parse_from_str(&self.config.global.start_date, "%Y-%m-%d")
1522            .map_err(|e| SynthError::config(format!("Invalid start_date: {}", e)))?;
1523        let end_date = start_date + chrono::Months::new(self.config.global.period_months);
1524
1525        let total = self.config.companies.len() as u64 * 5; // 5 entity types
1526        let pb = self.create_progress_bar(total, "Generating Master Data");
1527
1528        for (i, company) in self.config.companies.iter().enumerate() {
1529            let company_seed = self.seed.wrapping_add(i as u64 * 1000);
1530
1531            // Generate vendors
1532            let mut vendor_gen = VendorGenerator::new(company_seed);
1533            let vendor_pool = vendor_gen.generate_vendor_pool(
1534                self.phase_config.vendors_per_company,
1535                &company.code,
1536                start_date,
1537            );
1538            self.master_data.vendors.extend(vendor_pool.vendors);
1539            if let Some(pb) = &pb {
1540                pb.inc(1);
1541            }
1542
1543            // Generate customers
1544            let mut customer_gen = CustomerGenerator::new(company_seed + 100);
1545            let customer_pool = customer_gen.generate_customer_pool(
1546                self.phase_config.customers_per_company,
1547                &company.code,
1548                start_date,
1549            );
1550            self.master_data.customers.extend(customer_pool.customers);
1551            if let Some(pb) = &pb {
1552                pb.inc(1);
1553            }
1554
1555            // Generate materials
1556            let mut material_gen = MaterialGenerator::new(company_seed + 200);
1557            let material_pool = material_gen.generate_material_pool(
1558                self.phase_config.materials_per_company,
1559                &company.code,
1560                start_date,
1561            );
1562            self.master_data.materials.extend(material_pool.materials);
1563            if let Some(pb) = &pb {
1564                pb.inc(1);
1565            }
1566
1567            // Generate fixed assets
1568            let mut asset_gen = AssetGenerator::new(company_seed + 300);
1569            let asset_pool = asset_gen.generate_asset_pool(
1570                self.phase_config.assets_per_company,
1571                &company.code,
1572                (start_date, end_date),
1573            );
1574            self.master_data.assets.extend(asset_pool.assets);
1575            if let Some(pb) = &pb {
1576                pb.inc(1);
1577            }
1578
1579            // Generate employees
1580            let mut employee_gen = EmployeeGenerator::new(company_seed + 400);
1581            let employee_pool =
1582                employee_gen.generate_company_pool(&company.code, (start_date, end_date));
1583            self.master_data.employees.extend(employee_pool.employees);
1584            if let Some(pb) = &pb {
1585                pb.inc(1);
1586            }
1587        }
1588
1589        if let Some(pb) = pb {
1590            pb.finish_with_message("Master data generation complete");
1591        }
1592
1593        Ok(())
1594    }
1595
1596    /// Generate document flows (P2P and O2C).
1597    fn generate_document_flows(&mut self, flows: &mut DocumentFlowSnapshot) -> SynthResult<()> {
1598        let start_date = NaiveDate::parse_from_str(&self.config.global.start_date, "%Y-%m-%d")
1599            .map_err(|e| SynthError::config(format!("Invalid start_date: {}", e)))?;
1600
1601        // Generate P2P chains
1602        let p2p_count = self
1603            .phase_config
1604            .p2p_chains
1605            .min(self.master_data.vendors.len() * 2);
1606        let pb = self.create_progress_bar(p2p_count as u64, "Generating P2P Document Flows");
1607
1608        // Convert P2P config from schema to generator config
1609        let p2p_config = convert_p2p_config(&self.config.document_flows.p2p);
1610        let mut p2p_gen = P2PGenerator::with_config(self.seed + 1000, p2p_config);
1611
1612        for i in 0..p2p_count {
1613            let vendor = &self.master_data.vendors[i % self.master_data.vendors.len()];
1614            let materials: Vec<&Material> = self
1615                .master_data
1616                .materials
1617                .iter()
1618                .skip(i % self.master_data.materials.len().max(1))
1619                .take(2.min(self.master_data.materials.len()))
1620                .collect();
1621
1622            if materials.is_empty() {
1623                continue;
1624            }
1625
1626            let company = &self.config.companies[i % self.config.companies.len()];
1627            let po_date = start_date + chrono::Duration::days((i * 3) as i64 % 365);
1628            let fiscal_period = po_date.month() as u8;
1629            let created_by = self
1630                .master_data
1631                .employees
1632                .first()
1633                .map(|e| e.user_id.as_str())
1634                .unwrap_or("SYSTEM");
1635
1636            let chain = p2p_gen.generate_chain(
1637                &company.code,
1638                vendor,
1639                &materials,
1640                po_date,
1641                start_date.year() as u16,
1642                fiscal_period,
1643                created_by,
1644            );
1645
1646            // Flatten documents
1647            flows.purchase_orders.push(chain.purchase_order.clone());
1648            flows.goods_receipts.extend(chain.goods_receipts.clone());
1649            if let Some(vi) = &chain.vendor_invoice {
1650                flows.vendor_invoices.push(vi.clone());
1651            }
1652            if let Some(payment) = &chain.payment {
1653                flows.payments.push(payment.clone());
1654            }
1655            flows.p2p_chains.push(chain);
1656
1657            if let Some(pb) = &pb {
1658                pb.inc(1);
1659            }
1660        }
1661
1662        if let Some(pb) = pb {
1663            pb.finish_with_message("P2P document flows complete");
1664        }
1665
1666        // Generate O2C chains
1667        let o2c_count = self
1668            .phase_config
1669            .o2c_chains
1670            .min(self.master_data.customers.len() * 2);
1671        let pb = self.create_progress_bar(o2c_count as u64, "Generating O2C Document Flows");
1672
1673        // Convert O2C config from schema to generator config
1674        let o2c_config = convert_o2c_config(&self.config.document_flows.o2c);
1675        let mut o2c_gen = O2CGenerator::with_config(self.seed + 2000, o2c_config);
1676
1677        for i in 0..o2c_count {
1678            let customer = &self.master_data.customers[i % self.master_data.customers.len()];
1679            let materials: Vec<&Material> = self
1680                .master_data
1681                .materials
1682                .iter()
1683                .skip(i % self.master_data.materials.len().max(1))
1684                .take(2.min(self.master_data.materials.len()))
1685                .collect();
1686
1687            if materials.is_empty() {
1688                continue;
1689            }
1690
1691            let company = &self.config.companies[i % self.config.companies.len()];
1692            let so_date = start_date + chrono::Duration::days((i * 2) as i64 % 365);
1693            let fiscal_period = so_date.month() as u8;
1694            let created_by = self
1695                .master_data
1696                .employees
1697                .first()
1698                .map(|e| e.user_id.as_str())
1699                .unwrap_or("SYSTEM");
1700
1701            let chain = o2c_gen.generate_chain(
1702                &company.code,
1703                customer,
1704                &materials,
1705                so_date,
1706                start_date.year() as u16,
1707                fiscal_period,
1708                created_by,
1709            );
1710
1711            // Flatten documents
1712            flows.sales_orders.push(chain.sales_order.clone());
1713            flows.deliveries.extend(chain.deliveries.clone());
1714            if let Some(ci) = &chain.customer_invoice {
1715                flows.customer_invoices.push(ci.clone());
1716            }
1717            if let Some(receipt) = &chain.customer_receipt {
1718                flows.payments.push(receipt.clone());
1719            }
1720            flows.o2c_chains.push(chain);
1721
1722            if let Some(pb) = &pb {
1723                pb.inc(1);
1724            }
1725        }
1726
1727        if let Some(pb) = pb {
1728            pb.finish_with_message("O2C document flows complete");
1729        }
1730
1731        Ok(())
1732    }
1733
1734    /// Generate journal entries.
1735    fn generate_journal_entries(
1736        &mut self,
1737        coa: &Arc<ChartOfAccounts>,
1738    ) -> SynthResult<Vec<JournalEntry>> {
1739        let total = self.calculate_total_transactions();
1740        let pb = self.create_progress_bar(total, "Generating Journal Entries");
1741
1742        let start_date = NaiveDate::parse_from_str(&self.config.global.start_date, "%Y-%m-%d")
1743            .map_err(|e| SynthError::config(format!("Invalid start_date: {}", e)))?;
1744        let end_date = start_date + chrono::Months::new(self.config.global.period_months);
1745
1746        let company_codes: Vec<String> = self
1747            .config
1748            .companies
1749            .iter()
1750            .map(|c| c.code.clone())
1751            .collect();
1752
1753        let generator = JournalEntryGenerator::new_with_params(
1754            self.config.transactions.clone(),
1755            Arc::clone(coa),
1756            company_codes,
1757            start_date,
1758            end_date,
1759            self.seed,
1760        );
1761
1762        // Connect generated master data to ensure JEs reference real entities
1763        // Enable persona-based error injection for realistic human behavior
1764        // Pass fraud configuration for fraud injection
1765        let mut generator = generator
1766            .with_master_data(
1767                &self.master_data.vendors,
1768                &self.master_data.customers,
1769                &self.master_data.materials,
1770            )
1771            .with_persona_errors(true)
1772            .with_fraud_config(self.config.fraud.clone());
1773
1774        // Apply temporal drift if configured
1775        if self.config.temporal.enabled {
1776            let drift_config = self.config.temporal.to_core_config();
1777            generator = generator.with_drift_config(drift_config, self.seed + 100);
1778        }
1779
1780        let mut entries = Vec::with_capacity(total as usize);
1781
1782        // Check memory limit at start
1783        self.check_memory_limit()?;
1784
1785        // Check every 1000 entries to avoid overhead
1786        const MEMORY_CHECK_INTERVAL: u64 = 1000;
1787
1788        for i in 0..total {
1789            let entry = generator.generate();
1790            entries.push(entry);
1791            if let Some(pb) = &pb {
1792                pb.inc(1);
1793            }
1794
1795            // Periodic memory limit check
1796            if (i + 1) % MEMORY_CHECK_INTERVAL == 0 {
1797                self.check_memory_limit()?;
1798            }
1799        }
1800
1801        if let Some(pb) = pb {
1802            pb.finish_with_message("Journal entries complete");
1803        }
1804
1805        Ok(entries)
1806    }
1807
1808    /// Generate journal entries from document flows.
1809    ///
1810    /// This creates proper GL entries for each document in the P2P and O2C flows,
1811    /// ensuring that document activity is reflected in the general ledger.
1812    fn generate_jes_from_document_flows(
1813        &mut self,
1814        flows: &DocumentFlowSnapshot,
1815    ) -> SynthResult<Vec<JournalEntry>> {
1816        let total_chains = flows.p2p_chains.len() + flows.o2c_chains.len();
1817        let pb = self.create_progress_bar(total_chains as u64, "Generating Document Flow JEs");
1818
1819        let mut generator = DocumentFlowJeGenerator::with_config_and_seed(
1820            DocumentFlowJeConfig::default(),
1821            self.seed,
1822        );
1823        let mut entries = Vec::new();
1824
1825        // Generate JEs from P2P chains
1826        for chain in &flows.p2p_chains {
1827            let chain_entries = generator.generate_from_p2p_chain(chain);
1828            entries.extend(chain_entries);
1829            if let Some(pb) = &pb {
1830                pb.inc(1);
1831            }
1832        }
1833
1834        // Generate JEs from O2C chains
1835        for chain in &flows.o2c_chains {
1836            let chain_entries = generator.generate_from_o2c_chain(chain);
1837            entries.extend(chain_entries);
1838            if let Some(pb) = &pb {
1839                pb.inc(1);
1840            }
1841        }
1842
1843        if let Some(pb) = pb {
1844            pb.finish_with_message(format!(
1845                "Generated {} JEs from document flows",
1846                entries.len()
1847            ));
1848        }
1849
1850        Ok(entries)
1851    }
1852
1853    /// Link document flows to subledger records.
1854    ///
1855    /// Creates AP invoices from vendor invoices and AR invoices from customer invoices,
1856    /// ensuring subledger data is coherent with document flow data.
1857    fn link_document_flows_to_subledgers(
1858        &mut self,
1859        flows: &DocumentFlowSnapshot,
1860    ) -> SynthResult<SubledgerSnapshot> {
1861        let total = flows.vendor_invoices.len() + flows.customer_invoices.len();
1862        let pb = self.create_progress_bar(total as u64, "Linking Subledgers");
1863
1864        let mut linker = DocumentFlowLinker::new();
1865
1866        // Convert vendor invoices to AP invoices
1867        let ap_invoices = linker.batch_create_ap_invoices(&flows.vendor_invoices);
1868        if let Some(pb) = &pb {
1869            pb.inc(flows.vendor_invoices.len() as u64);
1870        }
1871
1872        // Convert customer invoices to AR invoices
1873        let ar_invoices = linker.batch_create_ar_invoices(&flows.customer_invoices);
1874        if let Some(pb) = &pb {
1875            pb.inc(flows.customer_invoices.len() as u64);
1876        }
1877
1878        if let Some(pb) = pb {
1879            pb.finish_with_message(format!(
1880                "Linked {} AP and {} AR invoices",
1881                ap_invoices.len(),
1882                ar_invoices.len()
1883            ));
1884        }
1885
1886        Ok(SubledgerSnapshot {
1887            ap_invoices,
1888            ar_invoices,
1889        })
1890    }
1891
1892    /// Generate OCPM events from document flows.
1893    ///
1894    /// Creates OCEL 2.0 compliant event logs from P2P and O2C document flows,
1895    /// capturing the object-centric process perspective.
1896    fn generate_ocpm_events(&mut self, flows: &DocumentFlowSnapshot) -> SynthResult<OcpmSnapshot> {
1897        let total_chains = flows.p2p_chains.len() + flows.o2c_chains.len();
1898        let pb = self.create_progress_bar(total_chains as u64, "Generating OCPM Events");
1899
1900        // Create OCPM event log with standard types
1901        let metadata = EventLogMetadata::new("SyntheticData OCPM Log");
1902        let mut event_log = OcpmEventLog::with_metadata(metadata).with_standard_types();
1903
1904        // Configure the OCPM generator
1905        let ocpm_config = OcpmGeneratorConfig {
1906            generate_p2p: true,
1907            generate_o2c: true,
1908            happy_path_rate: 0.75,
1909            exception_path_rate: 0.20,
1910            error_path_rate: 0.05,
1911            add_duration_variability: true,
1912            duration_std_dev_factor: 0.3,
1913        };
1914        let mut ocpm_gen = OcpmEventGenerator::with_config(self.seed + 3000, ocpm_config);
1915
1916        // Get available users for resource assignment
1917        let available_users: Vec<String> = self
1918            .master_data
1919            .employees
1920            .iter()
1921            .take(20)
1922            .map(|e| e.user_id.clone())
1923            .collect();
1924
1925        // Generate events from P2P chains
1926        for chain in &flows.p2p_chains {
1927            let po = &chain.purchase_order;
1928            let documents = P2pDocuments::new(
1929                &po.header.document_id,
1930                &po.vendor_id,
1931                &po.header.company_code,
1932                po.total_net_amount,
1933                &po.header.currency,
1934            )
1935            .with_goods_receipt(
1936                chain
1937                    .goods_receipts
1938                    .first()
1939                    .map(|gr| gr.header.document_id.as_str())
1940                    .unwrap_or(""),
1941            )
1942            .with_invoice(
1943                chain
1944                    .vendor_invoice
1945                    .as_ref()
1946                    .map(|vi| vi.header.document_id.as_str())
1947                    .unwrap_or(""),
1948            )
1949            .with_payment(
1950                chain
1951                    .payment
1952                    .as_ref()
1953                    .map(|p| p.header.document_id.as_str())
1954                    .unwrap_or(""),
1955            );
1956
1957            let start_time =
1958                chrono::DateTime::from_naive_utc_and_offset(po.header.entry_timestamp, chrono::Utc);
1959            let result = ocpm_gen.generate_p2p_case(&documents, start_time, &available_users);
1960
1961            // Add events and objects to the event log
1962            for event in result.events {
1963                event_log.add_event(event);
1964            }
1965            for object in result.objects {
1966                event_log.add_object(object);
1967            }
1968            for relationship in result.relationships {
1969                event_log.add_relationship(relationship);
1970            }
1971            event_log.add_case(result.case_trace);
1972
1973            if let Some(pb) = &pb {
1974                pb.inc(1);
1975            }
1976        }
1977
1978        // Generate events from O2C chains
1979        for chain in &flows.o2c_chains {
1980            let so = &chain.sales_order;
1981            let documents = O2cDocuments::new(
1982                &so.header.document_id,
1983                &so.customer_id,
1984                &so.header.company_code,
1985                so.total_net_amount,
1986                &so.header.currency,
1987            )
1988            .with_delivery(
1989                chain
1990                    .deliveries
1991                    .first()
1992                    .map(|d| d.header.document_id.as_str())
1993                    .unwrap_or(""),
1994            )
1995            .with_invoice(
1996                chain
1997                    .customer_invoice
1998                    .as_ref()
1999                    .map(|ci| ci.header.document_id.as_str())
2000                    .unwrap_or(""),
2001            )
2002            .with_receipt(
2003                chain
2004                    .customer_receipt
2005                    .as_ref()
2006                    .map(|r| r.header.document_id.as_str())
2007                    .unwrap_or(""),
2008            );
2009
2010            let start_time =
2011                chrono::DateTime::from_naive_utc_and_offset(so.header.entry_timestamp, chrono::Utc);
2012            let result = ocpm_gen.generate_o2c_case(&documents, start_time, &available_users);
2013
2014            // Add events and objects to the event log
2015            for event in result.events {
2016                event_log.add_event(event);
2017            }
2018            for object in result.objects {
2019                event_log.add_object(object);
2020            }
2021            for relationship in result.relationships {
2022                event_log.add_relationship(relationship);
2023            }
2024            event_log.add_case(result.case_trace);
2025
2026            if let Some(pb) = &pb {
2027                pb.inc(1);
2028            }
2029        }
2030
2031        // Compute process variants
2032        event_log.compute_variants();
2033
2034        let summary = event_log.summary();
2035
2036        if let Some(pb) = pb {
2037            pb.finish_with_message(format!(
2038                "Generated {} OCPM events, {} objects",
2039                summary.event_count, summary.object_count
2040            ));
2041        }
2042
2043        Ok(OcpmSnapshot {
2044            event_count: summary.event_count,
2045            object_count: summary.object_count,
2046            case_count: summary.case_count,
2047            event_log: Some(event_log),
2048        })
2049    }
2050
2051    /// Inject anomalies into journal entries.
2052    fn inject_anomalies(&mut self, entries: &mut [JournalEntry]) -> SynthResult<AnomalyLabels> {
2053        let pb = self.create_progress_bar(entries.len() as u64, "Injecting Anomalies");
2054
2055        let anomaly_config = AnomalyInjectorConfig {
2056            rates: AnomalyRateConfig {
2057                total_rate: 0.02,
2058                ..Default::default()
2059            },
2060            seed: self.seed + 5000,
2061            ..Default::default()
2062        };
2063
2064        let mut injector = AnomalyInjector::new(anomaly_config);
2065        let result = injector.process_entries(entries);
2066
2067        if let Some(pb) = &pb {
2068            pb.inc(entries.len() as u64);
2069            pb.finish_with_message("Anomaly injection complete");
2070        }
2071
2072        let mut by_type = HashMap::new();
2073        for label in &result.labels {
2074            *by_type
2075                .entry(format!("{:?}", label.anomaly_type))
2076                .or_insert(0) += 1;
2077        }
2078
2079        Ok(AnomalyLabels {
2080            labels: result.labels,
2081            summary: Some(result.summary),
2082            by_type,
2083        })
2084    }
2085
2086    /// Validate journal entries using running balance tracker.
2087    ///
2088    /// Applies all entries to the balance tracker and validates:
2089    /// - Each entry is internally balanced (debits = credits)
2090    /// - Balance sheet equation holds (Assets = Liabilities + Equity + Net Income)
2091    ///
2092    /// Note: Entries with human errors (marked with [HUMAN_ERROR:*] tags) are
2093    /// excluded from balance validation as they may be intentionally unbalanced.
2094    fn validate_journal_entries(
2095        &mut self,
2096        entries: &[JournalEntry],
2097    ) -> SynthResult<BalanceValidationResult> {
2098        // Filter out entries with human errors as they may be intentionally unbalanced
2099        let clean_entries: Vec<&JournalEntry> = entries
2100            .iter()
2101            .filter(|e| {
2102                e.header
2103                    .header_text
2104                    .as_ref()
2105                    .map(|t| !t.contains("[HUMAN_ERROR:"))
2106                    .unwrap_or(true)
2107            })
2108            .collect();
2109
2110        let pb = self.create_progress_bar(clean_entries.len() as u64, "Validating Balances");
2111
2112        // Configure tracker to not fail on errors (collect them instead)
2113        let config = BalanceTrackerConfig {
2114            validate_on_each_entry: false,   // We'll validate at the end
2115            track_history: false,            // Skip history for performance
2116            fail_on_validation_error: false, // Collect errors, don't fail
2117            ..Default::default()
2118        };
2119
2120        let mut tracker = RunningBalanceTracker::new(config);
2121
2122        // Apply clean entries (without human errors)
2123        let clean_refs: Vec<JournalEntry> = clean_entries.into_iter().cloned().collect();
2124        let errors = tracker.apply_entries(&clean_refs);
2125
2126        if let Some(pb) = &pb {
2127            pb.inc(entries.len() as u64);
2128        }
2129
2130        // Check if any entries were unbalanced
2131        // Note: When fail_on_validation_error is false, errors are stored in tracker
2132        let has_unbalanced = tracker
2133            .get_validation_errors()
2134            .iter()
2135            .any(|e| e.error_type == datasynth_generators::ValidationErrorType::UnbalancedEntry);
2136
2137        // Validate balance sheet for each company
2138        // Include both returned errors and collected validation errors
2139        let mut all_errors = errors;
2140        all_errors.extend(tracker.get_validation_errors().iter().cloned());
2141        let company_codes: Vec<String> = self
2142            .config
2143            .companies
2144            .iter()
2145            .map(|c| c.code.clone())
2146            .collect();
2147
2148        let end_date = NaiveDate::parse_from_str(&self.config.global.start_date, "%Y-%m-%d")
2149            .map(|d| d + chrono::Months::new(self.config.global.period_months))
2150            .unwrap_or_else(|_| chrono::Local::now().date_naive());
2151
2152        for company_code in &company_codes {
2153            if let Err(e) = tracker.validate_balance_sheet(company_code, end_date, None) {
2154                all_errors.push(e);
2155            }
2156        }
2157
2158        // Get statistics after all mutable operations are done
2159        let stats = tracker.get_statistics();
2160
2161        // Determine if balanced overall
2162        let is_balanced = all_errors.is_empty();
2163
2164        if let Some(pb) = pb {
2165            let msg = if is_balanced {
2166                "Balance validation passed"
2167            } else {
2168                "Balance validation completed with errors"
2169            };
2170            pb.finish_with_message(msg);
2171        }
2172
2173        Ok(BalanceValidationResult {
2174            validated: true,
2175            is_balanced,
2176            entries_processed: stats.entries_processed,
2177            total_debits: stats.total_debits,
2178            total_credits: stats.total_credits,
2179            accounts_tracked: stats.accounts_tracked,
2180            companies_tracked: stats.companies_tracked,
2181            validation_errors: all_errors,
2182            has_unbalanced_entries: has_unbalanced,
2183        })
2184    }
2185
2186    /// Inject data quality variations into journal entries.
2187    ///
2188    /// Applies typos, missing values, and format variations to make
2189    /// the synthetic data more realistic for testing data cleaning pipelines.
2190    fn inject_data_quality(
2191        &mut self,
2192        entries: &mut [JournalEntry],
2193    ) -> SynthResult<DataQualityStats> {
2194        let pb = self.create_progress_bar(entries.len() as u64, "Injecting Data Quality Issues");
2195
2196        // Use minimal configuration by default for realistic but not overwhelming issues
2197        let config = DataQualityConfig::minimal();
2198        let mut injector = DataQualityInjector::new(config);
2199
2200        // Build context for missing value decisions
2201        let context = HashMap::new();
2202
2203        for entry in entries.iter_mut() {
2204            // Process header_text field (common target for typos)
2205            if let Some(text) = &entry.header.header_text {
2206                let processed = injector.process_text_field(
2207                    "header_text",
2208                    text,
2209                    &entry.header.document_id.to_string(),
2210                    &context,
2211                );
2212                match processed {
2213                    Some(new_text) if new_text != *text => {
2214                        entry.header.header_text = Some(new_text);
2215                    }
2216                    None => {
2217                        entry.header.header_text = None; // Missing value
2218                    }
2219                    _ => {}
2220                }
2221            }
2222
2223            // Process reference field
2224            if let Some(ref_text) = &entry.header.reference {
2225                let processed = injector.process_text_field(
2226                    "reference",
2227                    ref_text,
2228                    &entry.header.document_id.to_string(),
2229                    &context,
2230                );
2231                match processed {
2232                    Some(new_text) if new_text != *ref_text => {
2233                        entry.header.reference = Some(new_text);
2234                    }
2235                    None => {
2236                        entry.header.reference = None;
2237                    }
2238                    _ => {}
2239                }
2240            }
2241
2242            // Process user_persona field (potential for typos in user IDs)
2243            let user_persona = entry.header.user_persona.clone();
2244            if let Some(processed) = injector.process_text_field(
2245                "user_persona",
2246                &user_persona,
2247                &entry.header.document_id.to_string(),
2248                &context,
2249            ) {
2250                if processed != user_persona {
2251                    entry.header.user_persona = processed;
2252                }
2253            }
2254
2255            // Process line items
2256            for line in &mut entry.lines {
2257                // Process line description if present
2258                if let Some(ref text) = line.line_text {
2259                    let processed = injector.process_text_field(
2260                        "line_text",
2261                        text,
2262                        &entry.header.document_id.to_string(),
2263                        &context,
2264                    );
2265                    match processed {
2266                        Some(new_text) if new_text != *text => {
2267                            line.line_text = Some(new_text);
2268                        }
2269                        None => {
2270                            line.line_text = None;
2271                        }
2272                        _ => {}
2273                    }
2274                }
2275
2276                // Process cost_center if present
2277                if let Some(cc) = &line.cost_center {
2278                    let processed = injector.process_text_field(
2279                        "cost_center",
2280                        cc,
2281                        &entry.header.document_id.to_string(),
2282                        &context,
2283                    );
2284                    match processed {
2285                        Some(new_cc) if new_cc != *cc => {
2286                            line.cost_center = Some(new_cc);
2287                        }
2288                        None => {
2289                            line.cost_center = None;
2290                        }
2291                        _ => {}
2292                    }
2293                }
2294            }
2295
2296            if let Some(pb) = &pb {
2297                pb.inc(1);
2298            }
2299        }
2300
2301        if let Some(pb) = pb {
2302            pb.finish_with_message("Data quality injection complete");
2303        }
2304
2305        Ok(injector.stats().clone())
2306    }
2307
2308    /// Generate audit data (engagements, workpapers, evidence, risks, findings, judgments).
2309    ///
2310    /// Creates complete audit documentation for each company in the configuration,
2311    /// following ISA standards:
2312    /// - ISA 210/220: Engagement acceptance and terms
2313    /// - ISA 230: Audit documentation (workpapers)
2314    /// - ISA 265: Control deficiencies (findings)
2315    /// - ISA 315/330: Risk assessment and response
2316    /// - ISA 500: Audit evidence
2317    /// - ISA 200: Professional judgment
2318    fn generate_audit_data(&mut self, entries: &[JournalEntry]) -> SynthResult<AuditSnapshot> {
2319        let start_date = NaiveDate::parse_from_str(&self.config.global.start_date, "%Y-%m-%d")
2320            .map_err(|e| SynthError::config(format!("Invalid start_date: {}", e)))?;
2321        let fiscal_year = start_date.year() as u16;
2322        let period_end = start_date + chrono::Months::new(self.config.global.period_months);
2323
2324        // Calculate rough total revenue from entries for materiality
2325        let total_revenue: rust_decimal::Decimal = entries
2326            .iter()
2327            .flat_map(|e| e.lines.iter())
2328            .filter(|l| l.credit_amount > rust_decimal::Decimal::ZERO)
2329            .map(|l| l.credit_amount)
2330            .sum();
2331
2332        let total_items = (self.phase_config.audit_engagements * 50) as u64; // Approximate items
2333        let pb = self.create_progress_bar(total_items, "Generating Audit Data");
2334
2335        let mut snapshot = AuditSnapshot::default();
2336
2337        // Initialize generators
2338        let mut engagement_gen = AuditEngagementGenerator::new(self.seed + 7000);
2339        let mut workpaper_gen = WorkpaperGenerator::new(self.seed + 7100);
2340        let mut evidence_gen = EvidenceGenerator::new(self.seed + 7200);
2341        let mut risk_gen = RiskAssessmentGenerator::new(self.seed + 7300);
2342        let mut finding_gen = FindingGenerator::new(self.seed + 7400);
2343        let mut judgment_gen = JudgmentGenerator::new(self.seed + 7500);
2344
2345        // Get list of accounts from CoA for risk assessment
2346        let accounts: Vec<String> = self
2347            .coa
2348            .as_ref()
2349            .map(|coa| {
2350                coa.get_postable_accounts()
2351                    .iter()
2352                    .map(|acc| acc.account_code().to_string())
2353                    .collect()
2354            })
2355            .unwrap_or_default();
2356
2357        // Generate engagements for each company
2358        for (i, company) in self.config.companies.iter().enumerate() {
2359            // Calculate company-specific revenue (proportional to volume weight)
2360            let company_revenue = total_revenue
2361                * rust_decimal::Decimal::try_from(company.volume_weight).unwrap_or_default();
2362
2363            // Generate engagements for this company
2364            let engagements_for_company =
2365                self.phase_config.audit_engagements / self.config.companies.len().max(1);
2366            let extra = if i < self.phase_config.audit_engagements % self.config.companies.len() {
2367                1
2368            } else {
2369                0
2370            };
2371
2372            for _eng_idx in 0..(engagements_for_company + extra) {
2373                // Generate the engagement
2374                let engagement = engagement_gen.generate_engagement(
2375                    &company.code,
2376                    &company.name,
2377                    fiscal_year,
2378                    period_end,
2379                    company_revenue,
2380                    None, // Use default engagement type
2381                );
2382
2383                if let Some(pb) = &pb {
2384                    pb.inc(1);
2385                }
2386
2387                // Get team members from the engagement
2388                let team_members: Vec<String> = engagement.team_member_ids.clone();
2389
2390                // Generate workpapers for the engagement
2391                let workpapers =
2392                    workpaper_gen.generate_complete_workpaper_set(&engagement, &team_members);
2393
2394                for wp in &workpapers {
2395                    if let Some(pb) = &pb {
2396                        pb.inc(1);
2397                    }
2398
2399                    // Generate evidence for each workpaper
2400                    let evidence = evidence_gen.generate_evidence_for_workpaper(
2401                        wp,
2402                        &team_members,
2403                        wp.preparer_date,
2404                    );
2405
2406                    for _ in &evidence {
2407                        if let Some(pb) = &pb {
2408                            pb.inc(1);
2409                        }
2410                    }
2411
2412                    snapshot.evidence.extend(evidence);
2413                }
2414
2415                // Generate risk assessments for the engagement
2416                let risks =
2417                    risk_gen.generate_risks_for_engagement(&engagement, &team_members, &accounts);
2418
2419                for _ in &risks {
2420                    if let Some(pb) = &pb {
2421                        pb.inc(1);
2422                    }
2423                }
2424                snapshot.risk_assessments.extend(risks);
2425
2426                // Generate findings for the engagement
2427                let findings = finding_gen.generate_findings_for_engagement(
2428                    &engagement,
2429                    &workpapers,
2430                    &team_members,
2431                );
2432
2433                for _ in &findings {
2434                    if let Some(pb) = &pb {
2435                        pb.inc(1);
2436                    }
2437                }
2438                snapshot.findings.extend(findings);
2439
2440                // Generate professional judgments for the engagement
2441                let judgments =
2442                    judgment_gen.generate_judgments_for_engagement(&engagement, &team_members);
2443
2444                for _ in &judgments {
2445                    if let Some(pb) = &pb {
2446                        pb.inc(1);
2447                    }
2448                }
2449                snapshot.judgments.extend(judgments);
2450
2451                // Add workpapers after findings since findings need them
2452                snapshot.workpapers.extend(workpapers);
2453                snapshot.engagements.push(engagement);
2454            }
2455        }
2456
2457        if let Some(pb) = pb {
2458            pb.finish_with_message(format!(
2459                "Audit data: {} engagements, {} workpapers, {} evidence",
2460                snapshot.engagements.len(),
2461                snapshot.workpapers.len(),
2462                snapshot.evidence.len()
2463            ));
2464        }
2465
2466        Ok(snapshot)
2467    }
2468
2469    /// Export journal entries as graph data for ML training and network reconstruction.
2470    ///
2471    /// Builds a transaction graph where:
2472    /// - Nodes are GL accounts
2473    /// - Edges are money flows from credit to debit accounts
2474    /// - Edge attributes include amount, date, business process, anomaly flags
2475    fn export_graphs(
2476        &mut self,
2477        entries: &[JournalEntry],
2478        _coa: &Arc<ChartOfAccounts>,
2479        stats: &mut EnhancedGenerationStatistics,
2480    ) -> SynthResult<GraphExportSnapshot> {
2481        let pb = self.create_progress_bar(100, "Exporting Graphs");
2482
2483        let mut snapshot = GraphExportSnapshot::default();
2484
2485        // Get output directory
2486        let output_dir = self
2487            .output_path
2488            .clone()
2489            .unwrap_or_else(|| PathBuf::from(&self.config.output.output_directory));
2490        let graph_dir = output_dir.join(&self.config.graph_export.output_subdirectory);
2491
2492        // Process each graph type configuration
2493        for graph_type in &self.config.graph_export.graph_types {
2494            if let Some(pb) = &pb {
2495                pb.inc(10);
2496            }
2497
2498            // Build transaction graph
2499            let graph_config = TransactionGraphConfig {
2500                include_vendors: false,
2501                include_customers: false,
2502                create_debit_credit_edges: true,
2503                include_document_nodes: graph_type.include_document_nodes,
2504                min_edge_weight: graph_type.min_edge_weight,
2505                aggregate_parallel_edges: graph_type.aggregate_edges,
2506            };
2507
2508            let mut builder = TransactionGraphBuilder::new(graph_config);
2509            builder.add_journal_entries(entries);
2510            let graph = builder.build();
2511
2512            // Update stats
2513            stats.graph_node_count += graph.node_count();
2514            stats.graph_edge_count += graph.edge_count();
2515
2516            if let Some(pb) = &pb {
2517                pb.inc(40);
2518            }
2519
2520            // Export to each configured format
2521            for format in &self.config.graph_export.formats {
2522                let format_dir = graph_dir.join(&graph_type.name).join(format_name(*format));
2523
2524                // Create output directory
2525                if let Err(e) = std::fs::create_dir_all(&format_dir) {
2526                    warn!("Failed to create graph output directory: {}", e);
2527                    continue;
2528                }
2529
2530                match format {
2531                    datasynth_config::schema::GraphExportFormat::PytorchGeometric => {
2532                        let pyg_config = PyGExportConfig {
2533                            common: datasynth_graph::CommonExportConfig {
2534                                export_node_features: true,
2535                                export_edge_features: true,
2536                                export_node_labels: true,
2537                                export_edge_labels: true,
2538                                export_masks: true,
2539                                train_ratio: self.config.graph_export.train_ratio,
2540                                val_ratio: self.config.graph_export.validation_ratio,
2541                                seed: self.config.graph_export.split_seed.unwrap_or(self.seed),
2542                            },
2543                            one_hot_categoricals: false,
2544                        };
2545
2546                        let exporter = PyGExporter::new(pyg_config);
2547                        match exporter.export(&graph, &format_dir) {
2548                            Ok(metadata) => {
2549                                snapshot.exports.insert(
2550                                    format!("{}_{}", graph_type.name, "pytorch_geometric"),
2551                                    GraphExportInfo {
2552                                        name: graph_type.name.clone(),
2553                                        format: "pytorch_geometric".to_string(),
2554                                        output_path: format_dir.clone(),
2555                                        node_count: metadata.num_nodes,
2556                                        edge_count: metadata.num_edges,
2557                                    },
2558                                );
2559                                snapshot.graph_count += 1;
2560                            }
2561                            Err(e) => {
2562                                warn!("Failed to export PyTorch Geometric graph: {}", e);
2563                            }
2564                        }
2565                    }
2566                    datasynth_config::schema::GraphExportFormat::Neo4j => {
2567                        // Neo4j export will be added in a future update
2568                        debug!("Neo4j export not yet implemented for accounting networks");
2569                    }
2570                    datasynth_config::schema::GraphExportFormat::Dgl => {
2571                        // DGL export will be added in a future update
2572                        debug!("DGL export not yet implemented for accounting networks");
2573                    }
2574                    datasynth_config::schema::GraphExportFormat::RustGraph => {
2575                        use datasynth_graph::{
2576                            RustGraphExportConfig, RustGraphExporter, RustGraphOutputFormat,
2577                        };
2578
2579                        let rustgraph_config = RustGraphExportConfig {
2580                            include_features: true,
2581                            include_temporal: true,
2582                            include_labels: true,
2583                            source_name: "datasynth".to_string(),
2584                            batch_id: None,
2585                            output_format: RustGraphOutputFormat::JsonLines,
2586                            export_node_properties: true,
2587                            export_edge_properties: true,
2588                            pretty_print: false,
2589                        };
2590
2591                        let exporter = RustGraphExporter::new(rustgraph_config);
2592                        match exporter.export(&graph, &format_dir) {
2593                            Ok(metadata) => {
2594                                snapshot.exports.insert(
2595                                    format!("{}_{}", graph_type.name, "rustgraph"),
2596                                    GraphExportInfo {
2597                                        name: graph_type.name.clone(),
2598                                        format: "rustgraph".to_string(),
2599                                        output_path: format_dir.clone(),
2600                                        node_count: metadata.num_nodes,
2601                                        edge_count: metadata.num_edges,
2602                                    },
2603                                );
2604                                snapshot.graph_count += 1;
2605                            }
2606                            Err(e) => {
2607                                warn!("Failed to export RustGraph: {}", e);
2608                            }
2609                        }
2610                    }
2611                    datasynth_config::schema::GraphExportFormat::RustGraphHypergraph => {
2612                        // Hypergraph export is handled separately in Phase 10b
2613                        debug!("RustGraphHypergraph format is handled in Phase 10b (hypergraph export)");
2614                    }
2615                }
2616            }
2617
2618            if let Some(pb) = &pb {
2619                pb.inc(40);
2620            }
2621        }
2622
2623        stats.graph_export_count = snapshot.graph_count;
2624        snapshot.exported = snapshot.graph_count > 0;
2625
2626        if let Some(pb) = pb {
2627            pb.finish_with_message(format!(
2628                "Graphs exported: {} graphs ({} nodes, {} edges)",
2629                snapshot.graph_count, stats.graph_node_count, stats.graph_edge_count
2630            ));
2631        }
2632
2633        Ok(snapshot)
2634    }
2635
2636    /// Export a multi-layer hypergraph for RustGraph integration.
2637    ///
2638    /// Builds a 3-layer hypergraph:
2639    /// - Layer 1: Governance & Controls (COSO, internal controls, master data)
2640    /// - Layer 2: Process Events (P2P/O2C document flows)
2641    /// - Layer 3: Accounting Network (GL accounts, journal entries as hyperedges)
2642    fn export_hypergraph(
2643        &self,
2644        coa: &Arc<ChartOfAccounts>,
2645        entries: &[JournalEntry],
2646        document_flows: &DocumentFlowSnapshot,
2647        stats: &mut EnhancedGenerationStatistics,
2648    ) -> SynthResult<HypergraphExportInfo> {
2649        use datasynth_graph::builders::hypergraph::{HypergraphBuilder, HypergraphConfig};
2650        use datasynth_graph::exporters::hypergraph::{HypergraphExportConfig, HypergraphExporter};
2651        use datasynth_graph::exporters::unified::{RustGraphUnifiedExporter, UnifiedExportConfig};
2652        use datasynth_graph::models::hypergraph::AggregationStrategy;
2653
2654        let hg_settings = &self.config.graph_export.hypergraph;
2655
2656        // Parse aggregation strategy from config string
2657        let aggregation_strategy = match hg_settings.aggregation_strategy.as_str() {
2658            "truncate" => AggregationStrategy::Truncate,
2659            "pool_by_counterparty" => AggregationStrategy::PoolByCounterparty,
2660            "pool_by_time_period" => AggregationStrategy::PoolByTimePeriod,
2661            "importance_sample" => AggregationStrategy::ImportanceSample,
2662            _ => AggregationStrategy::PoolByCounterparty,
2663        };
2664
2665        let builder_config = HypergraphConfig {
2666            max_nodes: hg_settings.max_nodes,
2667            aggregation_strategy,
2668            include_coso: hg_settings.governance_layer.include_coso,
2669            include_controls: hg_settings.governance_layer.include_controls,
2670            include_sox: hg_settings.governance_layer.include_sox,
2671            include_vendors: hg_settings.governance_layer.include_vendors,
2672            include_customers: hg_settings.governance_layer.include_customers,
2673            include_employees: hg_settings.governance_layer.include_employees,
2674            include_p2p: hg_settings.process_layer.include_p2p,
2675            include_o2c: hg_settings.process_layer.include_o2c,
2676            events_as_hyperedges: hg_settings.process_layer.events_as_hyperedges,
2677            docs_per_counterparty_threshold: hg_settings
2678                .process_layer
2679                .docs_per_counterparty_threshold,
2680            include_accounts: hg_settings.accounting_layer.include_accounts,
2681            je_as_hyperedges: hg_settings.accounting_layer.je_as_hyperedges,
2682            include_cross_layer_edges: hg_settings.cross_layer.enabled,
2683        };
2684
2685        let mut builder = HypergraphBuilder::new(builder_config);
2686
2687        // Layer 1: Governance & Controls
2688        builder.add_coso_framework();
2689
2690        // Add controls if available (generated during JE generation)
2691        // Controls are generated per-company; we use the standard set
2692        if hg_settings.governance_layer.include_controls && self.config.internal_controls.enabled {
2693            let controls = InternalControl::standard_controls();
2694            builder.add_controls(&controls);
2695        }
2696
2697        // Add master data
2698        builder.add_vendors(&self.master_data.vendors);
2699        builder.add_customers(&self.master_data.customers);
2700        builder.add_employees(&self.master_data.employees);
2701
2702        // Layer 2: Process Events (P2P/O2C documents)
2703        builder.add_p2p_documents(
2704            &document_flows.purchase_orders,
2705            &document_flows.goods_receipts,
2706            &document_flows.vendor_invoices,
2707            &document_flows.payments,
2708        );
2709        builder.add_o2c_documents(
2710            &document_flows.sales_orders,
2711            &document_flows.deliveries,
2712            &document_flows.customer_invoices,
2713        );
2714
2715        // Layer 3: Accounting Network
2716        builder.add_accounts(coa);
2717        builder.add_journal_entries_as_hyperedges(entries);
2718
2719        // Build the hypergraph
2720        let hypergraph = builder.build();
2721
2722        // Export
2723        let output_dir = self
2724            .output_path
2725            .clone()
2726            .unwrap_or_else(|| PathBuf::from(&self.config.output.output_directory));
2727        let hg_dir = output_dir
2728            .join(&self.config.graph_export.output_subdirectory)
2729            .join(&hg_settings.output_subdirectory);
2730
2731        // Branch on output format
2732        let (num_nodes, num_edges, num_hyperedges) = match hg_settings.output_format.as_str() {
2733            "unified" => {
2734                let exporter = RustGraphUnifiedExporter::new(UnifiedExportConfig::default());
2735                let metadata = exporter.export(&hypergraph, &hg_dir).map_err(|e| {
2736                    SynthError::generation(format!("Unified hypergraph export failed: {}", e))
2737                })?;
2738                (
2739                    metadata.num_nodes,
2740                    metadata.num_edges,
2741                    metadata.num_hyperedges,
2742                )
2743            }
2744            _ => {
2745                // "native" or any unrecognized format → use existing exporter
2746                let exporter = HypergraphExporter::new(HypergraphExportConfig::default());
2747                let metadata = exporter.export(&hypergraph, &hg_dir).map_err(|e| {
2748                    SynthError::generation(format!("Hypergraph export failed: {}", e))
2749                })?;
2750                (
2751                    metadata.num_nodes,
2752                    metadata.num_edges,
2753                    metadata.num_hyperedges,
2754                )
2755            }
2756        };
2757
2758        // Stream to RustGraph ingest endpoint if configured
2759        #[cfg(feature = "streaming")]
2760        if let Some(ref target_url) = hg_settings.stream_target {
2761            use crate::stream_client::{StreamClient, StreamConfig};
2762            use std::io::Write as _;
2763
2764            let api_key = std::env::var("RUSTGRAPH_API_KEY").ok();
2765            let stream_config = StreamConfig {
2766                target_url: target_url.clone(),
2767                batch_size: hg_settings.stream_batch_size,
2768                api_key,
2769                ..StreamConfig::default()
2770            };
2771
2772            match StreamClient::new(stream_config) {
2773                Ok(mut client) => {
2774                    let exporter = RustGraphUnifiedExporter::new(UnifiedExportConfig::default());
2775                    match exporter.export_to_writer(&hypergraph, &mut client) {
2776                        Ok(_) => {
2777                            if let Err(e) = client.flush() {
2778                                warn!("Failed to flush stream client: {}", e);
2779                            } else {
2780                                info!("Streamed {} records to {}", client.total_sent(), target_url);
2781                            }
2782                        }
2783                        Err(e) => {
2784                            warn!("Streaming export failed: {}", e);
2785                        }
2786                    }
2787                }
2788                Err(e) => {
2789                    warn!("Failed to create stream client: {}", e);
2790                }
2791            }
2792        }
2793
2794        // Update stats
2795        stats.graph_node_count += num_nodes;
2796        stats.graph_edge_count += num_edges;
2797        stats.graph_export_count += 1;
2798
2799        Ok(HypergraphExportInfo {
2800            node_count: num_nodes,
2801            edge_count: num_edges,
2802            hyperedge_count: num_hyperedges,
2803            output_path: hg_dir,
2804        })
2805    }
2806
2807    /// Generate banking KYC/AML data.
2808    ///
2809    /// Creates banking customers, accounts, and transactions with AML typology injection.
2810    /// Uses the BankingOrchestrator from synth-banking crate.
2811    fn generate_banking_data(&mut self) -> SynthResult<BankingSnapshot> {
2812        let pb = self.create_progress_bar(100, "Generating Banking Data");
2813
2814        // Build the banking orchestrator from config
2815        let orchestrator = BankingOrchestratorBuilder::new()
2816            .config(self.config.banking.clone())
2817            .seed(self.seed + 9000)
2818            .build();
2819
2820        if let Some(pb) = &pb {
2821            pb.inc(10);
2822        }
2823
2824        // Generate the banking data
2825        let result = orchestrator.generate();
2826
2827        if let Some(pb) = &pb {
2828            pb.inc(90);
2829            pb.finish_with_message(format!(
2830                "Banking: {} customers, {} transactions",
2831                result.customers.len(),
2832                result.transactions.len()
2833            ));
2834        }
2835
2836        Ok(BankingSnapshot {
2837            customers: result.customers,
2838            accounts: result.accounts,
2839            transactions: result.transactions,
2840            suspicious_count: result.stats.suspicious_count,
2841            scenario_count: result.scenarios.len(),
2842        })
2843    }
2844
2845    /// Calculate total transactions to generate.
2846    fn calculate_total_transactions(&self) -> u64 {
2847        let months = self.config.global.period_months as f64;
2848        self.config
2849            .companies
2850            .iter()
2851            .map(|c| {
2852                let annual = c.annual_transaction_volume.count() as f64;
2853                let weighted = annual * c.volume_weight;
2854                (weighted * months / 12.0) as u64
2855            })
2856            .sum()
2857    }
2858
2859    /// Create a progress bar if progress display is enabled.
2860    fn create_progress_bar(&self, total: u64, message: &str) -> Option<ProgressBar> {
2861        if !self.phase_config.show_progress {
2862            return None;
2863        }
2864
2865        let pb = if let Some(mp) = &self.multi_progress {
2866            mp.add(ProgressBar::new(total))
2867        } else {
2868            ProgressBar::new(total)
2869        };
2870
2871        pb.set_style(
2872            ProgressStyle::default_bar()
2873                .template(&format!(
2874                    "{{spinner:.green}} {} [{{elapsed_precise}}] [{{bar:40.cyan/blue}}] {{pos}}/{{len}} ({{per_sec}})",
2875                    message
2876                ))
2877                .expect("Progress bar template should be valid - uses only standard indicatif placeholders")
2878                .progress_chars("#>-"),
2879        );
2880
2881        Some(pb)
2882    }
2883
2884    /// Get the generated chart of accounts.
2885    pub fn get_coa(&self) -> Option<Arc<ChartOfAccounts>> {
2886        self.coa.clone()
2887    }
2888
2889    /// Get the generated master data.
2890    pub fn get_master_data(&self) -> &MasterDataSnapshot {
2891        &self.master_data
2892    }
2893
2894    /// Build a lineage graph describing config → phase → output relationships.
2895    fn build_lineage_graph(&self) -> super::lineage::LineageGraph {
2896        use super::lineage::LineageGraphBuilder;
2897
2898        let mut builder = LineageGraphBuilder::new();
2899
2900        // Config sections
2901        builder.add_config_section("config:global", "Global Config");
2902        builder.add_config_section("config:chart_of_accounts", "Chart of Accounts Config");
2903        builder.add_config_section("config:transactions", "Transaction Config");
2904
2905        // Generator phases
2906        builder.add_generator_phase("phase:coa", "Chart of Accounts Generation");
2907        builder.add_generator_phase("phase:je", "Journal Entry Generation");
2908
2909        // Config → phase edges
2910        builder.configured_by("phase:coa", "config:chart_of_accounts");
2911        builder.configured_by("phase:je", "config:transactions");
2912
2913        // Output files
2914        builder.add_output_file("output:je", "Journal Entries", "sample_entries.json");
2915        builder.produced_by("output:je", "phase:je");
2916
2917        // Optional phases based on config
2918        if self.phase_config.generate_master_data {
2919            builder.add_config_section("config:master_data", "Master Data Config");
2920            builder.add_generator_phase("phase:master_data", "Master Data Generation");
2921            builder.configured_by("phase:master_data", "config:master_data");
2922            builder.input_to("phase:master_data", "phase:je");
2923        }
2924
2925        if self.phase_config.generate_document_flows {
2926            builder.add_config_section("config:document_flows", "Document Flow Config");
2927            builder.add_generator_phase("phase:p2p", "P2P Document Flow");
2928            builder.add_generator_phase("phase:o2c", "O2C Document Flow");
2929            builder.configured_by("phase:p2p", "config:document_flows");
2930            builder.configured_by("phase:o2c", "config:document_flows");
2931
2932            builder.add_output_file("output:po", "Purchase Orders", "purchase_orders.csv");
2933            builder.add_output_file("output:gr", "Goods Receipts", "goods_receipts.csv");
2934            builder.add_output_file("output:vi", "Vendor Invoices", "vendor_invoices.csv");
2935            builder.add_output_file("output:so", "Sales Orders", "sales_orders.csv");
2936            builder.add_output_file("output:ci", "Customer Invoices", "customer_invoices.csv");
2937
2938            builder.produced_by("output:po", "phase:p2p");
2939            builder.produced_by("output:gr", "phase:p2p");
2940            builder.produced_by("output:vi", "phase:p2p");
2941            builder.produced_by("output:so", "phase:o2c");
2942            builder.produced_by("output:ci", "phase:o2c");
2943        }
2944
2945        if self.phase_config.inject_anomalies {
2946            builder.add_config_section("config:fraud", "Fraud/Anomaly Config");
2947            builder.add_generator_phase("phase:anomaly", "Anomaly Injection");
2948            builder.configured_by("phase:anomaly", "config:fraud");
2949            builder.add_output_file(
2950                "output:labels",
2951                "Anomaly Labels",
2952                "labels/anomaly_labels.csv",
2953            );
2954            builder.produced_by("output:labels", "phase:anomaly");
2955        }
2956
2957        if self.phase_config.generate_audit {
2958            builder.add_config_section("config:audit", "Audit Config");
2959            builder.add_generator_phase("phase:audit", "Audit Data Generation");
2960            builder.configured_by("phase:audit", "config:audit");
2961        }
2962
2963        if self.phase_config.generate_banking {
2964            builder.add_config_section("config:banking", "Banking Config");
2965            builder.add_generator_phase("phase:banking", "Banking KYC/AML Generation");
2966            builder.configured_by("phase:banking", "config:banking");
2967        }
2968
2969        if self.config.llm.enabled {
2970            builder.add_config_section("config:llm", "LLM Enrichment Config");
2971            builder.add_generator_phase("phase:llm_enrichment", "LLM Enrichment");
2972            builder.configured_by("phase:llm_enrichment", "config:llm");
2973        }
2974
2975        if self.config.diffusion.enabled {
2976            builder.add_config_section("config:diffusion", "Diffusion Enhancement Config");
2977            builder.add_generator_phase("phase:diffusion", "Diffusion Enhancement");
2978            builder.configured_by("phase:diffusion", "config:diffusion");
2979        }
2980
2981        if self.config.causal.enabled {
2982            builder.add_config_section("config:causal", "Causal Generation Config");
2983            builder.add_generator_phase("phase:causal", "Causal Overlay");
2984            builder.configured_by("phase:causal", "config:causal");
2985        }
2986
2987        builder.build()
2988    }
2989}
2990
2991/// Get the directory name for a graph export format.
2992fn format_name(format: datasynth_config::schema::GraphExportFormat) -> &'static str {
2993    match format {
2994        datasynth_config::schema::GraphExportFormat::PytorchGeometric => "pytorch_geometric",
2995        datasynth_config::schema::GraphExportFormat::Neo4j => "neo4j",
2996        datasynth_config::schema::GraphExportFormat::Dgl => "dgl",
2997        datasynth_config::schema::GraphExportFormat::RustGraph => "rustgraph",
2998        datasynth_config::schema::GraphExportFormat::RustGraphHypergraph => "rustgraph_hypergraph",
2999    }
3000}
3001
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use datasynth_config::schema::*;

    /// Builds the baseline configuration shared by the tests in this module.
    ///
    /// It describes a single USD manufacturing company (`"1000"`) over a
    /// one-month period starting 2024-01-01, with a fixed seed of 42 and a
    /// small chart of accounts. Every other section is left at its default.
    fn create_test_config() -> GeneratorConfig {
        GeneratorConfig {
            global: GlobalConfig {
                industry: IndustrySector::Manufacturing,
                start_date: "2024-01-01".to_string(),
                period_months: 1,
                seed: Some(42),
                parallel: false,
                group_currency: "USD".to_string(),
                worker_threads: 0,
                memory_limit_mb: 0,
            },
            companies: vec![CompanyConfig {
                code: "1000".to_string(),
                name: "Test Company".to_string(),
                currency: "USD".to_string(),
                country: "US".to_string(),
                annual_transaction_volume: TransactionVolume::TenK,
                volume_weight: 1.0,
                fiscal_year_variant: "K4".to_string(),
            }],
            chart_of_accounts: ChartOfAccountsConfig {
                complexity: CoAComplexity::Small,
                industry_specific: true,
                custom_accounts: None,
                min_hierarchy_depth: 2,
                max_hierarchy_depth: 4,
            },
            transactions: TransactionConfig::default(),
            output: OutputConfig::default(),
            fraud: FraudConfig::default(),
            internal_controls: InternalControlsConfig::default(),
            business_processes: BusinessProcessConfig::default(),
            user_personas: UserPersonaConfig::default(),
            templates: TemplateConfig::default(),
            approval: ApprovalConfig::default(),
            departments: DepartmentConfig::default(),
            master_data: MasterDataConfig::default(),
            document_flows: DocumentFlowConfig::default(),
            intercompany: IntercompanyConfig::default(),
            balance: BalanceConfig::default(),
            ocpm: OcpmConfig::default(),
            audit: AuditGenerationConfig::default(),
            banking: datasynth_banking::BankingConfig::default(),
            data_quality: DataQualitySchemaConfig::default(),
            scenario: ScenarioConfig::default(),
            temporal: TemporalDriftConfig::default(),
            graph_export: GraphExportConfig::default(),
            streaming: StreamingSchemaConfig::default(),
            rate_limit: RateLimitSchemaConfig::default(),
            temporal_attributes: TemporalAttributeSchemaConfig::default(),
            relationships: RelationshipSchemaConfig::default(),
            accounting_standards: AccountingStandardsConfig::default(),
            audit_standards: AuditStandardsConfig::default(),
            distributions: Default::default(),
            temporal_patterns: Default::default(),
            vendor_network: VendorNetworkSchemaConfig::default(),
            customer_segmentation: CustomerSegmentationSchemaConfig::default(),
            relationship_strength: RelationshipStrengthSchemaConfig::default(),
            cross_process_links: CrossProcessLinksSchemaConfig::default(),
            organizational_events: OrganizationalEventsSchemaConfig::default(),
            behavioral_drift: BehavioralDriftSchemaConfig::default(),
            market_drift: MarketDriftSchemaConfig::default(),
            drift_labeling: DriftLabelingSchemaConfig::default(),
            anomaly_injection: Default::default(),
            industry_specific: Default::default(),
            fingerprint_privacy: Default::default(),
            quality_gates: Default::default(),
            compliance: Default::default(),
            webhooks: Default::default(),
            llm: Default::default(),
            diffusion: Default::default(),
            causal: Default::default(),
        }
    }

    /// Phase selection that runs journal entry generation only.
    ///
    /// Master data, document flows, anomaly injection and progress output are
    /// switched off; every other field keeps its `PhaseConfig::default()`
    /// value. Tests override individual fields with struct-update syntax.
    fn journal_only_phases() -> PhaseConfig {
        PhaseConfig {
            generate_master_data: false,
            generate_document_flows: false,
            generate_journal_entries: true,
            inject_anomalies: false,
            show_progress: false,
            ..Default::default()
        }
    }

    /// Phase selection that runs master data generation only.
    ///
    /// Document flows, journal entries, anomaly injection and progress output
    /// are switched off; every other field (including the per-company counts)
    /// keeps its `PhaseConfig::default()` value unless a test overrides it.
    fn master_data_only_phases() -> PhaseConfig {
        PhaseConfig {
            generate_master_data: true,
            generate_document_flows: false,
            generate_journal_entries: false,
            inject_anomalies: false,
            show_progress: false,
            ..Default::default()
        }
    }

    /// Phase selection that runs master data and document flow generation.
    ///
    /// Journal entries, anomaly injection, data quality injection, balance
    /// validation, OCPM events and progress output are switched off; every
    /// other field keeps its `PhaseConfig::default()` value unless a test
    /// overrides it.
    fn document_flow_phases() -> PhaseConfig {
        PhaseConfig {
            generate_master_data: true,
            generate_document_flows: true,
            generate_journal_entries: false,
            inject_anomalies: false,
            inject_data_quality: false,
            validate_balances: false,
            generate_ocpm_events: false,
            show_progress: false,
            ..Default::default()
        }
    }

    #[test]
    fn test_enhanced_orchestrator_creation() {
        let orchestrator = EnhancedOrchestrator::with_defaults(create_test_config());
        assert!(orchestrator.is_ok());
    }

    #[test]
    fn test_minimal_generation() {
        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), journal_only_phases()).unwrap();

        // `unwrap` fails the test on error and reports the error itself.
        let result = orchestrator.generate().unwrap();
        assert!(!result.journal_entries.is_empty());
    }

    #[test]
    fn test_master_data_generation() {
        let phase_config = PhaseConfig {
            vendors_per_company: 5,
            customers_per_company: 5,
            materials_per_company: 10,
            assets_per_company: 5,
            employees_per_company: 10,
            ..master_data_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        assert!(!result.master_data.vendors.is_empty());
        assert!(!result.master_data.customers.is_empty());
        assert!(!result.master_data.materials.is_empty());
    }

    #[test]
    fn test_document_flow_generation() {
        let phase_config = PhaseConfig {
            vendors_per_company: 5,
            customers_per_company: 5,
            materials_per_company: 10,
            assets_per_company: 5,
            employees_per_company: 10,
            p2p_chains: 5,
            o2c_chains: 5,
            ..document_flow_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Should have generated P2P and O2C chains
        assert!(!result.document_flows.p2p_chains.is_empty());
        assert!(!result.document_flows.o2c_chains.is_empty());

        // Flattened documents should be populated
        assert!(!result.document_flows.purchase_orders.is_empty());
        assert!(!result.document_flows.sales_orders.is_empty());
    }

    #[test]
    fn test_anomaly_injection() {
        let phase_config = PhaseConfig {
            inject_anomalies: true,
            ..journal_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Should have journal entries
        assert!(!result.journal_entries.is_empty());

        // With ~833 entries and 2% rate, expect some anomalies
        // Note: This is probabilistic, so we just verify the structure exists
        assert!(result.anomaly_labels.summary.is_some());
    }

    #[test]
    fn test_full_generation_pipeline() {
        let phase_config = PhaseConfig {
            generate_journal_entries: true,
            validate_balances: true,
            vendors_per_company: 3,
            customers_per_company: 3,
            materials_per_company: 5,
            assets_per_company: 3,
            employees_per_company: 5,
            p2p_chains: 3,
            o2c_chains: 3,
            ..document_flow_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // All phases should have results
        assert!(!result.master_data.vendors.is_empty());
        assert!(!result.master_data.customers.is_empty());
        assert!(!result.document_flows.p2p_chains.is_empty());
        assert!(!result.document_flows.o2c_chains.is_empty());
        assert!(!result.journal_entries.is_empty());
        assert!(result.statistics.accounts_count > 0);

        // Subledger linking should have run
        assert!(!result.subledger.ap_invoices.is_empty());
        assert!(!result.subledger.ar_invoices.is_empty());

        // Balance validation should have run
        assert!(result.balance_validation.validated);
        assert!(result.balance_validation.entries_processed > 0);
    }

    #[test]
    fn test_subledger_linking() {
        let phase_config = PhaseConfig {
            vendors_per_company: 5,
            customers_per_company: 5,
            materials_per_company: 10,
            assets_per_company: 3,
            employees_per_company: 5,
            p2p_chains: 5,
            o2c_chains: 5,
            ..document_flow_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Should have document flows
        assert!(!result.document_flows.vendor_invoices.is_empty());
        assert!(!result.document_flows.customer_invoices.is_empty());

        // Subledger should be linked from document flows
        assert!(!result.subledger.ap_invoices.is_empty());
        assert!(!result.subledger.ar_invoices.is_empty());

        // AP invoices count should match vendor invoices count
        assert_eq!(
            result.subledger.ap_invoices.len(),
            result.document_flows.vendor_invoices.len()
        );

        // AR invoices count should match customer invoices count
        assert_eq!(
            result.subledger.ar_invoices.len(),
            result.document_flows.customer_invoices.len()
        );

        // Statistics should reflect subledger counts
        assert_eq!(
            result.statistics.ap_invoice_count,
            result.subledger.ap_invoices.len()
        );
        assert_eq!(
            result.statistics.ar_invoice_count,
            result.subledger.ar_invoices.len()
        );
    }

    #[test]
    fn test_balance_validation() {
        let phase_config = PhaseConfig {
            validate_balances: true,
            ..journal_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Balance validation should run
        assert!(result.balance_validation.validated);
        assert!(result.balance_validation.entries_processed > 0);

        // Generated JEs should be balanced (no unbalanced entries)
        assert!(!result.balance_validation.has_unbalanced_entries);

        // Total debits should equal total credits
        assert_eq!(
            result.balance_validation.total_debits,
            result.balance_validation.total_credits
        );
    }

    #[test]
    fn test_statistics_accuracy() {
        let phase_config = PhaseConfig {
            generate_journal_entries: true,
            vendors_per_company: 10,
            customers_per_company: 20,
            materials_per_company: 15,
            assets_per_company: 5,
            employees_per_company: 8,
            ..master_data_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Statistics should match actual data
        assert_eq!(
            result.statistics.vendor_count,
            result.master_data.vendors.len()
        );
        assert_eq!(
            result.statistics.customer_count,
            result.master_data.customers.len()
        );
        assert_eq!(
            result.statistics.material_count,
            result.master_data.materials.len()
        );
        assert_eq!(
            result.statistics.total_entries as usize,
            result.journal_entries.len()
        );
    }

    #[test]
    fn test_phase_config_defaults() {
        let config = PhaseConfig::default();
        assert!(config.generate_master_data);
        assert!(config.generate_document_flows);
        assert!(config.generate_journal_entries);
        assert!(!config.inject_anomalies);
        assert!(config.validate_balances);
        assert!(config.show_progress);
        assert!(config.vendors_per_company > 0);
        assert!(config.customers_per_company > 0);
    }

    #[test]
    fn test_get_coa_before_generation() {
        let orchestrator = EnhancedOrchestrator::with_defaults(create_test_config()).unwrap();

        // Before generation, CoA should be None
        assert!(orchestrator.get_coa().is_none());
    }

    #[test]
    fn test_get_coa_after_generation() {
        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), journal_only_phases()).unwrap();
        let _ = orchestrator.generate().unwrap();

        // After generation, CoA should be available
        assert!(orchestrator.get_coa().is_some());
    }

    #[test]
    fn test_get_master_data() {
        let phase_config = PhaseConfig {
            vendors_per_company: 5,
            customers_per_company: 5,
            materials_per_company: 5,
            assets_per_company: 5,
            employees_per_company: 5,
            ..master_data_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let _ = orchestrator.generate().unwrap();

        let master_data = orchestrator.get_master_data();
        assert!(!master_data.vendors.is_empty());
    }

    #[test]
    fn test_with_progress_builder() {
        let orchestrator = EnhancedOrchestrator::with_defaults(create_test_config())
            .unwrap()
            .with_progress(false);

        // The builder should have switched progress output off
        assert!(!orchestrator.phase_config.show_progress);
    }

    #[test]
    fn test_multi_company_generation() {
        let mut config = create_test_config();
        config.companies.push(CompanyConfig {
            code: "2000".to_string(),
            name: "Subsidiary".to_string(),
            currency: "EUR".to_string(),
            country: "DE".to_string(),
            annual_transaction_volume: TransactionVolume::TenK,
            volume_weight: 0.5,
            fiscal_year_variant: "K4".to_string(),
        });

        let phase_config = PhaseConfig {
            generate_journal_entries: true,
            vendors_per_company: 5,
            customers_per_company: 5,
            materials_per_company: 5,
            assets_per_company: 5,
            employees_per_company: 5,
            ..master_data_only_phases()
        };

        let mut orchestrator = EnhancedOrchestrator::new(config, phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Should have master data for both companies
        assert!(result.statistics.vendor_count >= 10); // 5 per company
        assert!(result.statistics.customer_count >= 10);
        // `assert_eq!` so a mismatch reports the actual company count
        assert_eq!(result.statistics.companies_count, 2);
    }

    #[test]
    fn test_empty_master_data_skips_document_flows() {
        // Spelled out in full: this combination is the point of the test.
        let phase_config = PhaseConfig {
            generate_master_data: false,   // Skip master data
            generate_document_flows: true, // Try to generate flows
            generate_journal_entries: false,
            inject_anomalies: false,
            show_progress: false,
            ..Default::default()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Without master data, document flows should be empty
        assert!(result.document_flows.p2p_chains.is_empty());
        assert!(result.document_flows.o2c_chains.is_empty());
    }

    #[test]
    fn test_journal_entry_line_item_count() {
        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), journal_only_phases()).unwrap();
        let result = orchestrator.generate().unwrap();

        // Total line items should match sum of all entry line counts
        let calculated_line_items: u64 = result
            .journal_entries
            .iter()
            .map(|e| e.line_count() as u64)
            .sum();
        assert_eq!(result.statistics.total_line_items, calculated_line_items);
    }

    #[test]
    fn test_audit_generation() {
        let phase_config = PhaseConfig {
            generate_audit: true,
            audit_engagements: 2,
            workpapers_per_engagement: 5,
            evidence_per_workpaper: 2,
            risks_per_engagement: 3,
            findings_per_engagement: 2,
            judgments_per_engagement: 2,
            ..journal_only_phases()
        };

        let mut orchestrator =
            EnhancedOrchestrator::new(create_test_config(), phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // Should have generated audit data
        assert_eq!(result.audit.engagements.len(), 2);
        assert!(!result.audit.workpapers.is_empty());
        assert!(!result.audit.evidence.is_empty());
        assert!(!result.audit.risk_assessments.is_empty());
        assert!(!result.audit.findings.is_empty());
        assert!(!result.audit.judgments.is_empty());

        // Statistics should match
        assert_eq!(
            result.statistics.audit_engagement_count,
            result.audit.engagements.len()
        );
        assert_eq!(
            result.statistics.audit_workpaper_count,
            result.audit.workpapers.len()
        );
        assert_eq!(
            result.statistics.audit_evidence_count,
            result.audit.evidence.len()
        );
        assert_eq!(
            result.statistics.audit_risk_count,
            result.audit.risk_assessments.len()
        );
        assert_eq!(
            result.statistics.audit_finding_count,
            result.audit.findings.len()
        );
        assert_eq!(
            result.statistics.audit_judgment_count,
            result.audit.judgments.len()
        );
    }

    #[test]
    fn test_new_phases_disabled_by_default() {
        let config = create_test_config();
        // Verify new config fields default to disabled
        assert!(!config.llm.enabled);
        assert!(!config.diffusion.enabled);
        assert!(!config.causal.enabled);

        let mut orchestrator = EnhancedOrchestrator::new(config, journal_only_phases()).unwrap();
        let result = orchestrator.generate().unwrap();

        // All new phase statistics should be zero when disabled
        assert_eq!(result.statistics.llm_enrichment_ms, 0);
        assert_eq!(result.statistics.llm_vendors_enriched, 0);
        assert_eq!(result.statistics.diffusion_enhancement_ms, 0);
        assert_eq!(result.statistics.diffusion_samples_generated, 0);
        assert_eq!(result.statistics.causal_generation_ms, 0);
        assert_eq!(result.statistics.causal_samples_generated, 0);
        assert!(result.statistics.causal_validation_passed.is_none());
    }

    #[test]
    fn test_llm_enrichment_enabled() {
        let mut config = create_test_config();
        config.llm.enabled = true;
        config.llm.max_vendor_enrichments = 3;

        let phase_config = PhaseConfig {
            vendors_per_company: 5,
            customers_per_company: 3,
            materials_per_company: 3,
            assets_per_company: 3,
            employees_per_company: 3,
            ..master_data_only_phases()
        };

        let mut orchestrator = EnhancedOrchestrator::new(config, phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // LLM enrichment should have run
        assert!(result.statistics.llm_vendors_enriched > 0);
        assert!(result.statistics.llm_vendors_enriched <= 3);
    }

    #[test]
    fn test_diffusion_enhancement_enabled() {
        let mut config = create_test_config();
        config.diffusion.enabled = true;
        config.diffusion.n_steps = 50;
        config.diffusion.sample_size = 20;

        let mut orchestrator = EnhancedOrchestrator::new(config, journal_only_phases()).unwrap();
        let result = orchestrator.generate().unwrap();

        // Diffusion phase should have generated samples
        assert_eq!(result.statistics.diffusion_samples_generated, 20);
    }

    #[test]
    fn test_causal_overlay_enabled() {
        let mut config = create_test_config();
        config.causal.enabled = true;
        config.causal.template = "fraud_detection".to_string();
        config.causal.sample_size = 100;
        config.causal.validate = true;

        let mut orchestrator = EnhancedOrchestrator::new(config, journal_only_phases()).unwrap();
        let result = orchestrator.generate().unwrap();

        // Causal phase should have generated samples
        assert_eq!(result.statistics.causal_samples_generated, 100);
        // Validation should have run
        assert!(result.statistics.causal_validation_passed.is_some());
    }

    #[test]
    fn test_causal_overlay_revenue_cycle_template() {
        let mut config = create_test_config();
        config.causal.enabled = true;
        config.causal.template = "revenue_cycle".to_string();
        config.causal.sample_size = 50;
        config.causal.validate = false;

        let mut orchestrator = EnhancedOrchestrator::new(config, journal_only_phases()).unwrap();
        let result = orchestrator.generate().unwrap();

        // Causal phase should have generated samples
        assert_eq!(result.statistics.causal_samples_generated, 50);
        // Validation was disabled
        assert!(result.statistics.causal_validation_passed.is_none());
    }

    #[test]
    fn test_all_new_phases_enabled_together() {
        let mut config = create_test_config();
        config.llm.enabled = true;
        config.llm.max_vendor_enrichments = 2;
        config.diffusion.enabled = true;
        config.diffusion.n_steps = 20;
        config.diffusion.sample_size = 10;
        config.causal.enabled = true;
        config.causal.sample_size = 50;
        config.causal.validate = true;

        let phase_config = PhaseConfig {
            generate_journal_entries: true,
            vendors_per_company: 5,
            customers_per_company: 3,
            materials_per_company: 3,
            assets_per_company: 3,
            employees_per_company: 3,
            ..master_data_only_phases()
        };

        let mut orchestrator = EnhancedOrchestrator::new(config, phase_config).unwrap();
        let result = orchestrator.generate().unwrap();

        // All three phases should have run
        assert!(result.statistics.llm_vendors_enriched > 0);
        assert_eq!(result.statistics.diffusion_samples_generated, 10);
        assert_eq!(result.statistics.causal_samples_generated, 50);
        assert!(result.statistics.causal_validation_passed.is_some());
    }

    #[test]
    fn test_statistics_serialization_with_new_fields() {
        let stats = EnhancedGenerationStatistics {
            total_entries: 100,
            total_line_items: 500,
            llm_enrichment_ms: 42,
            llm_vendors_enriched: 10,
            diffusion_enhancement_ms: 100,
            diffusion_samples_generated: 50,
            causal_generation_ms: 200,
            causal_samples_generated: 100,
            causal_validation_passed: Some(true),
            ..Default::default()
        };

        // Round-trip through JSON and check the new fields survive.
        let json = serde_json::to_string(&stats).unwrap();
        let deserialized: EnhancedGenerationStatistics = serde_json::from_str(&json).unwrap();

        assert_eq!(deserialized.llm_enrichment_ms, 42);
        assert_eq!(deserialized.llm_vendors_enriched, 10);
        assert_eq!(deserialized.diffusion_enhancement_ms, 100);
        assert_eq!(deserialized.diffusion_samples_generated, 50);
        assert_eq!(deserialized.causal_generation_ms, 200);
        assert_eq!(deserialized.causal_samples_generated, 100);
        assert_eq!(deserialized.causal_validation_passed, Some(true));
    }

    #[test]
    fn test_statistics_backward_compat_deserialization() {
        // Old JSON without the new fields should still deserialize
        let old_json = r#"{
            "total_entries": 100,
            "total_line_items": 500,
            "accounts_count": 50,
            "companies_count": 1,
            "period_months": 12,
            "vendor_count": 10,
            "customer_count": 20,
            "material_count": 15,
            "asset_count": 5,
            "employee_count": 8,
            "p2p_chain_count": 5,
            "o2c_chain_count": 5,
            "ap_invoice_count": 5,
            "ar_invoice_count": 5,
            "ocpm_event_count": 0,
            "ocpm_object_count": 0,
            "ocpm_case_count": 0,
            "audit_engagement_count": 0,
            "audit_workpaper_count": 0,
            "audit_evidence_count": 0,
            "audit_risk_count": 0,
            "audit_finding_count": 0,
            "audit_judgment_count": 0,
            "anomalies_injected": 0,
            "data_quality_issues": 0,
            "banking_customer_count": 0,
            "banking_account_count": 0,
            "banking_transaction_count": 0,
            "banking_suspicious_count": 0,
            "graph_export_count": 0,
            "graph_node_count": 0,
            "graph_edge_count": 0
        }"#;

        let stats: EnhancedGenerationStatistics = serde_json::from_str(old_json).unwrap();

        // New fields should default to 0 / None
        assert_eq!(stats.llm_enrichment_ms, 0);
        assert_eq!(stats.llm_vendors_enriched, 0);
        assert_eq!(stats.diffusion_enhancement_ms, 0);
        assert_eq!(stats.diffusion_samples_generated, 0);
        assert_eq!(stats.causal_generation_ms, 0);
        assert_eq!(stats.causal_samples_generated, 0);
        assert!(stats.causal_validation_passed.is_none());
    }
}