Skip to main content

datasynth_generators/
je_generator.rs

1//! Journal Entry generator with statistical distributions.
2
3use chrono::{Datelike, NaiveDate, Timelike};
4use datasynth_core::utils::seeded_rng;
5use rand::prelude::*;
6use rand_chacha::ChaCha8Rng;
7use rust_decimal::prelude::*;
8use rust_decimal::Decimal;
9use std::sync::{Arc, LazyLock};
10
11use tracing::debug;
12
13use datasynth_config::schema::{
14    AdvancedDistributionConfig, FraudConfig, GeneratorConfig, MixtureDistributionType,
15    TemplateConfig, TemporalPatternsConfig, TransactionConfig,
16};
17use datasynth_core::distributions::{
18    AdvancedAmountSampler, BusinessDayCalculator, CrossDayConfig, DriftAdjustments, DriftConfig,
19    DriftController, EventType, IndustryAmountProfile, IndustryType, LagDistribution,
20    PeriodEndConfig, PeriodEndDynamics, PeriodEndModel, ProcessingLagCalculator,
21    ProcessingLagConfig, *,
22};
23use datasynth_core::models::*;
24use datasynth_core::templates::{
25    descriptions::DescriptionContext, DescriptionGenerator, ReferenceGenerator, ReferenceType,
26};
27use datasynth_core::traits::Generator;
28use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
29use datasynth_core::CountryPack;
30
31use crate::company_selector::WeightedCompanySelector;
32use crate::user_generator::{UserGenerator, UserGeneratorConfig};
33
34use datasynth_core::distributions::text_taxonomy::{PiiPlaceholderKind, PlaceholderResolver};
35
36/// T2-D Lever 1: the default generic SAP source-mix, used when industry priors
37/// are not loaded but `transactions.synthetic_source_codes` is on (the default).
38/// Built once. See [`SourceMixPrior::sap_default`] and experiments/ml/FINDINGS.md §6.
39static DEFAULT_SOURCE_MIX: LazyLock<
40    datasynth_core::distributions::behavioral_priors::SourceMixPrior,
41> = LazyLock::new(datasynth_core::distributions::behavioral_priors::SourceMixPrior::sap_default);
42
/// SOTA-5: fraction of JEs emitted as reversals/corrections when
/// `transactions.reversal_rate` is unset.
///
/// Chosen to match the corpus reversal proxy (~0.10): at 0.04 the measured
/// proxy was only ~0.034 (the proxy detects ~85% of reversals), so 0.10 lands
/// the proxy near the corpus.
const DEFAULT_REVERSAL_RATE: f64 = 0.10;

/// SOTA-6: fraction of JEs emitted as allocation/assessment batches when
/// `transactions.allocation_batch_rate` is unset.
///
/// Deliberately small: each batch carries ~30-80 lines, so the resulting
/// line-share (~8%) and lines-per-JE tail match the corpus's large-batch
/// postings (FINDINGS §8: AB docs ~52 lines drive the lpje std).
/// `0.0` disables.
const DEFAULT_ALLOCATION_RATE: f64 = 0.008;

/// SOTA-4: foreign document currencies paired with their company-currency
/// rate, expressed as company units per 1 unit of the document currency.
///
/// The values are synthetic but plausible.
// NOTE(review): the magnitudes look like rates against a USD company
// currency — confirm before using with non-USD company codes.
const FOREIGN_CCYS: &[(&str, f64)] = &[
    ("EUR", 1.09),
    ("GBP", 1.27),
    ("CHF", 1.12),
    ("CAD", 0.74),
    ("JPY", 0.0068),
    ("AUD", 0.66),
    ("CNY", 0.14),
];

/// SOTA-6: inclusive lower bound on the number of target (cost-center) lines
/// an allocation batch explodes into. Together with
/// `ALLOCATION_MAX_TARGETS` the range is centred near the corpus AB mean (~52).
const ALLOCATION_MIN_TARGETS: u32 = 30;
/// SOTA-6: inclusive upper bound on the number of target (cost-center) lines
/// an allocation batch explodes into.
const ALLOCATION_MAX_TARGETS: u32 = 80;
70
/// SOTA-2: Zipf exponent for the hot-account power-law.
///
/// At s=2.0 the top-10% of accounts in a pool carry ~92-96% of that pool's
/// lines across realistic pool sizes (N≈60-150) — matching the corpus
/// account-activity Pareto (~0.95).
const ZIPF_ALPHA: f64 = 2.0;

/// Largest pool size covered by the precomputed harmonic table. Larger pools
/// (none realistic for a single account-type) fall back to the uniform draw.
const ZIPF_CAP: usize = 16_384;

/// SOTA-2: cumulative partial sums `CUM[k] = Σ_{i=1..k} i^-ZIPF_ALPHA`, with
/// `CUM[0] = 0`, so the table holds `ZIPF_CAP + 1` entries.
///
/// Computed once on first access. Lets
/// [`JournalEntryGenerator::power_law_index`] normalise (O(1) lookup of
/// `CUM[n]`) and inverse-CDF sample (binary search) without an O(n) sum.
static ZIPF_CUM: LazyLock<Vec<f64>> = LazyLock::new(|| {
    let mut table: Vec<f64> = Vec::with_capacity(ZIPF_CAP + 1);
    // Index 0 is the empty sum.
    table.push(0.0);
    // Terms are accumulated in ascending rank order; the summation order is
    // part of the table's exact floating-point contents.
    let mut running = 0.0_f64;
    table.extend((1..=ZIPF_CAP).map(|rank| {
        running += 1.0 / (rank as f64).powf(ZIPF_ALPHA);
        running
    }));
    table
});
91
/// SP6 — Resolves PII placeholders to concrete values drawn from the run's
/// synthetic master data.
///
/// Each placeholder kind has its own pool. A pool that is empty falls back to
/// an obviously-synthetic constant, so output never carries an empty span or a
/// literal `{…}` token.
#[derive(Debug, Default)]
pub struct MasterDataResolver {
    /// Candidates for `{company}` — vendor/customer names.
    pub companies: Vec<String>,
    /// Candidates for `{person}` — user display names.
    pub persons: Vec<String>,
    /// Candidates for `{street}` — addresses (empty pool for now; there is no
    /// address master entity).
    pub streets: Vec<String>,
    /// Candidates for `{patient}` — a synthetic-person pool (no master entity
    /// exists for patients).
    pub patients: Vec<String>,
}
105
106impl PlaceholderResolver for MasterDataResolver {
107    fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String {
108        use rand::RngExt;
109        let (pool, fallback): (&Vec<String>, &str) = match kind {
110            PiiPlaceholderKind::Company => (&self.companies, "Synthetic Company AG"),
111            PiiPlaceholderKind::Person => (&self.persons, "Synthetic Person"),
112            PiiPlaceholderKind::Street => (&self.streets, "Synthetic Street 1"),
113            PiiPlaceholderKind::Patient => (&self.patients, "Synthetic Patient"),
114        };
115        if pool.is_empty() {
116            return fallback.to_string();
117        }
118        let idx = rng.random_range(0..pool.len());
119        pool[idx].clone()
120    }
121}
122
/// Build the static pool of obviously-synthetic person names used to fill
/// `{patient}` placeholders (no master entity exists for patients).
///
/// `_locale` is only a hint and is currently ignored: for SP6 a single
/// neutral set is sufficient, so every locale gets the same eight names.
///
/// **Shape invariant:** every entry must avoid the `<initial>. <surname>` and
/// `<surname> <initial>.` shapes, because the SP6 `residual_pii_scan` flags
/// those as `initial_surname` / `surname_initial` PII patterns. The smoke
/// test asserts the canonical `*{patient} G:…` template fills to a scan-clean
/// string; an entry like `"B. Muster"` would regress that. Prefer two-word
/// `<First> <Last>` shapes with no periods (covered by
/// `synthetic_patient_pool_entries_pass_residual_scan`).
fn synthetic_patient_pool(_locale: &str) -> Vec<String> {
    const NAMES: [&str; 8] = [
        "Alex Beispiel",
        "Bea Muster",
        "Cleo Synthetic",
        "Demo Example",
        "Erik Probe",
        "Fred Testperson",
        "Gerda Platzhalter",
        "Hans Demo",
    ];
    NAMES.iter().copied().map(String::from).collect()
}
149
150/// Generator for realistic journal entries.
151pub struct JournalEntryGenerator {
152    rng: ChaCha8Rng,
153    /// T2-D: independent RNG stream for the default source-mix draw, so
154    /// populating `sap_source_code` on the no-priors path never perturbs the
155    /// main `rng` — all other fields stay byte-identical to the legacy output.
156    source_mix_rng: ChaCha8Rng,
157    /// SOTA-1: per-(company, doc-type) library of reusable JE account archetypes
158    /// `(debit_accounts, credit_accounts)` for the recurring-templates process.
159    /// Capped per key; reused on the no-priors path so standard postings recur.
160    recurring_archetypes:
161        std::collections::HashMap<(String, String), Vec<(Vec<String>, Vec<String>)>>,
162    /// SOTA-1: independent RNG for the template-reuse roll + archetype pick, so
163    /// templating never perturbs the main `rng` (amounts/dates/counts unchanged).
164    template_rng: ChaCha8Rng,
165    /// SOTA-5: ring buffer of recent (complete) JEs a later reversal can offset.
166    /// Storing the whole JE lets the reversal inherit its source code, line text,
167    /// audit flags, etc. (only dr/cr + the header markers are changed).
168    reversal_buffer: Vec<JournalEntry>,
169    /// SOTA-5: independent RNG for reversal rolls, so reversals intersperse
170    /// without perturbing the main `rng` (normal JEs stay byte-identical).
171    reversal_rng: ChaCha8Rng,
172    /// SOTA-2: independent RNG for the hot-account power-law override, so the
173    /// account-activity Pareto (a few accounts carry most lines, as in the
174    /// corpus) is concentrated without perturbing the main `rng` — the uniform
175    /// `.choose` draw is still consumed, only its *result* is replaced.
176    account_rng: ChaCha8Rng,
177    /// SOTA-6: independent RNG for the allocation/assessment-batch process, so
178    /// the large 1-to-many postings (the corpus's lines-per-JE tail) intersperse
179    /// without perturbing the main `rng` (normal JEs stay byte-identical).
180    allocation_rng: ChaCha8Rng,
181    /// SOTA-4: independent RNG for the foreign-currency post-process, so the
182    /// document-currency tagging never perturbs the main `rng` (company-currency
183    /// JEs stay byte-identical).
184    fx_rng: ChaCha8Rng,
185    /// SOTA-8: independent RNG for the source-conditional Dirichlet account-pair
186    /// sampler. Built lazily (one `SourcePool` per observed source); when the
187    /// feature is off the sampler stays None and the main RNG / `account_rng`
188    /// stream is byte-identical.
189    cond_pair_rng: ChaCha8Rng,
190    /// SOTA-8: per-source Dirichlet PMFs over per-source account pools.
191    /// Lazy-built on first JE whose source isn't yet pooled.
192    cond_pair_sampler: Option<
193        datasynth_core::distributions::source_conditional_pair::SourceConditionalPairSampler,
194    >,
195    /// SOTA-8: SAP source code of the JE currently being constructed, so the
196    /// `select_*_account` helpers can consult the per-source pool.
197    current_je_source: Option<String>,
198    seed: u64,
199    config: TransactionConfig,
200    coa: Arc<ChartOfAccounts>,
201    companies: Vec<String>,
202    company_selector: WeightedCompanySelector,
203    line_sampler: LineItemSampler,
204    amount_sampler: AmountSampler,
205    temporal_sampler: TemporalSampler,
206    start_date: NaiveDate,
207    end_date: NaiveDate,
208    count: u64,
209    uuid_factory: DeterministicUuidFactory,
210    // Enhanced features
211    user_pool: Option<UserPool>,
212    description_generator: DescriptionGenerator,
213    reference_generator: ReferenceGenerator,
214    template_config: TemplateConfig,
215    vendor_pool: VendorPool,
216    customer_pool: CustomerPool,
217    // Material pool for realistic material references
218    material_pool: Option<MaterialPool>,
219    // Cost-center IDs sourced from the generated cost-centers master so
220    // `JE.cost_center` joins back to `cost_centers.id`.  Populated via
221    // [`with_cost_center_pool`] from the orchestrator after master-data
222    // generation; falls back to the hardcoded `COST_CENTER_POOL` const
223    // when empty (configs that skip master-data generation).
224    cost_center_pool: Vec<String>,
225    // Profit-center IDs sourced from the generated profit-centers master
226    // so `JE.profit_center` joins back to `profit_centers.id`.  Same
227    // population semantics as `cost_center_pool`.
228    profit_center_pool: Vec<String>,
229    // Flag indicating whether we're using real master data vs defaults
230    using_real_master_data: bool,
231    // Fraud generation
232    fraud_config: FraudConfig,
233    // Persona-based error injection
234    persona_errors_enabled: bool,
235    // Approval threshold enforcement
236    approval_enabled: bool,
237    approval_threshold: rust_decimal::Decimal,
238    // SOD violation rate for approval tracking (0.0 to 1.0)
239    sod_violation_rate: f64,
240    // Batching behavior - humans often process similar items together
241    batch_state: Option<BatchState>,
242    // Temporal drift controller for simulating distribution changes over time
243    drift_controller: Option<DriftController>,
244    // Temporal patterns components
245    business_day_calculator: Option<BusinessDayCalculator>,
246    processing_lag_calculator: Option<ProcessingLagCalculator>,
247    temporal_patterns_config: Option<TemporalPatternsConfig>,
248    // Business-process weights for the O2C/P2P/R2R/H2R/A2R volume mix. Must
249    // sum to 1.0 (validated by config schema). Default matches the legacy
250    // hard-coded 0.35/0.30/0.20/0.10/0.05 distribution.
251    business_process_weights: [(BusinessProcess, f64); 5],
252    // v3.4.0 advanced distributions (mixture models + industry profiles).
253    // None preserves v3.3.2 byte-for-byte behavior; populated only when the
254    // caller opts in via [`set_advanced_distributions`].
255    advanced_amount_sampler: Option<AdvancedAmountSampler>,
256    // v3.5.3+ conditional amount override. Populated when
257    // `config.distributions.conditional` contains an entry where
258    // `output_field == "amount"` and `input_field ∈ {"month",
259    // "quarter", "constant"}`. Applied *after* the fraud-pattern /
260    // advanced-sampler / legacy-sampler cascade on non-fraud entries
261    // so it can steer amounts by calendar context without disturbing
262    // fraud semantics.
263    conditional_amount_override: Option<datasynth_core::distributions::ConditionalSampler>,
264    // v3.5.4+ Gaussian copula for amount↔line_count correlation. When
265    // populated, each non-fraud JE draws a (u, v) pair; u nudges amount
266    // via a `(0.75 + 0.5*u)` multiplier and v biases line_count toward
267    // the upper/lower end of its range. Produces observable Spearman
268    // correlation without rewiring existing samplers for inverse-CDF.
269    correlation_copula: Option<datasynth_core::distributions::BivariateCopulaSampler>,
270    /// SP3 — opt-in industry priors. When `Some`, je_generator routes
271    /// timing/lines-per-JE/fanout/active-window through prior-driven samplers.
272    /// When `None`, behavior is identical to v5.11.
273    pub loaded_priors: Option<crate::priors_loader::LoadedPriors>,
274    /// SP3 T11 — accumulated IET days per document-type code.  Only used when
275    /// `loaded_priors.is_some()`.  Tracks the running day offset so
276    /// consecutive calls for the same source produce IET-spaced posting dates.
277    iet_day_accum: std::collections::HashMap<String, f64>,
278    /// SP3.12 — last TP value drawn per SAP source code.  Used by the TP motif
279    /// sampler to bias the next TP draw toward cluster-mates of the previous TP
280    /// on the same source, building triangle structure in the TP co-occurrence graph.
281    last_tp_by_source: std::collections::HashMap<String, String>,
282    /// SP3.4 — when Some, observes each emitted line and applies calibration
283    /// steps to the generator's tunable parameters.
284    pub velocity_calibrator: Option<crate::velocity_calibrator::VelocityCalibrator>,
285    /// SP6 — PII placeholder resolver populated from the run's synthetic master
286    /// data (vendors, customers, users). Rebuilt once via
287    /// [`refresh_md_resolver`] before JE generation begins.
288    md_resolver: MasterDataResolver,
289}
290
291const DEFAULT_BUSINESS_PROCESS_WEIGHTS: [(BusinessProcess, f64); 5] = [
292    (BusinessProcess::O2C, 0.35),
293    (BusinessProcess::P2P, 0.30),
294    (BusinessProcess::R2R, 0.20),
295    (BusinessProcess::H2R, 0.10),
296    (BusinessProcess::A2R, 0.05),
297];
298
299/// Map the schema-level [`datasynth_config::schema::IndustryProfileType`]
300/// onto the distributions-layer [`IndustryType`], then return that industry's
301/// pre-configured `sales_amounts` mixture. Used as a fallback when the
302/// caller enables `distributions.amounts` but supplies no components.
303/// Per-entry context channels for conditional-distribution overrides.
304///
305/// v4.1.0+ supported `input_field` values:
306///
307///   - `"month"` — posting-date month (1..=12)
308///   - `"quarter"` — posting-date quarter (1..=4)
309///   - `"year"` — posting-date year (e.g. 2026.0)
310///   - `"day_of_week"` — 1 (Mon) .. 7 (Sun)
311///   - `"day_of_month"` — 1..=31
312///   - `"day_of_year"` — 1..=366
313///   - `"week_of_year"` — 1..=53
314///   - `"is_period_end"` — 1.0 when posting_date is the last business
315///     day of the month, else 0.0
316///   - `"is_quarter_end"` — 1.0 when posting_date is in a quarter-end
317///     month AND is the last business day, else 0.0
318///   - `"is_year_end"` — 1.0 when posting_date is in December AND is
319///     the last business day, else 0.0
320///   - `"constant"` / empty — always 0.0 (treats as unconditional)
321///
322/// Unsupported values cause the conditional rule to be silently ignored
323/// to keep runtime robust against user typos.
324impl JournalEntryGenerator {
325    fn supported_conditional_input(field: &str) -> bool {
326        matches!(
327            field,
328            "month"
329                | "quarter"
330                | "year"
331                | "day_of_week"
332                | "day_of_month"
333                | "day_of_year"
334                | "week_of_year"
335                | "is_period_end"
336                | "is_quarter_end"
337                | "is_year_end"
338                | "constant"
339                | ""
340        )
341    }
342
343    fn conditional_input_value(&self, posting_date: chrono::NaiveDate) -> f64 {
344        let input_field = match self
345            .conditional_amount_override
346            .as_ref()
347            .map(|s| s.config().input_field.as_str())
348        {
349            Some(f) => f,
350            None => return 0.0,
351        };
352
353        let is_last_business_day = |d: chrono::NaiveDate| -> bool {
354            // Last day-of-month → is_period_end. Handles Feb/leap-year
355            // via chrono's num_days_from_ce roundabout; simpler path:
356            // if adding 1 day moves to a different month, this is EOM.
357            let next = d.succ_opt();
358            match next {
359                Some(n) => n.month() != d.month(),
360                None => true,
361            }
362        };
363
364        match input_field {
365            "month" => posting_date.month() as f64,
366            "quarter" => ((posting_date.month() - 1) / 3 + 1) as f64,
367            "year" => posting_date.year() as f64,
368            "day_of_week" => posting_date.weekday().number_from_monday() as f64,
369            "day_of_month" => posting_date.day() as f64,
370            "day_of_year" => posting_date.ordinal() as f64,
371            "week_of_year" => posting_date.iso_week().week() as f64,
372            "is_period_end" => f64::from(u8::from(is_last_business_day(posting_date))),
373            "is_quarter_end" => {
374                let m = posting_date.month();
375                let is_q_month = matches!(m, 3 | 6 | 9 | 12);
376                f64::from(u8::from(is_q_month && is_last_business_day(posting_date)))
377            }
378            "is_year_end" => f64::from(u8::from(
379                posting_date.month() == 12 && is_last_business_day(posting_date),
380            )),
381            _ => 0.0,
382        }
383    }
384}
385
386fn industry_profile_to_log_normal(
387    p: datasynth_config::schema::IndustryProfileType,
388) -> datasynth_core::distributions::LogNormalMixtureConfig {
389    use datasynth_config::schema::IndustryProfileType as P;
390    let industry = match p {
391        P::Retail => IndustryType::Retail,
392        P::Manufacturing => IndustryType::Manufacturing,
393        P::FinancialServices => IndustryType::FinancialServices,
394        P::Healthcare => IndustryType::Healthcare,
395        P::Technology => IndustryType::Technology,
396    };
397    IndustryAmountProfile::for_industry(industry).sales_amounts
398}
399
400/// State for tracking batch processing behavior.
401///
402/// When humans process transactions, they often batch similar items together
403/// (e.g., processing all invoices from one vendor, entering similar expenses).
404#[derive(Clone)]
405struct BatchState {
406    /// The base entry template to vary
407    base_account_number: String,
408    base_amount: rust_decimal::Decimal,
409    base_business_process: Option<BusinessProcess>,
410    base_posting_date: NaiveDate,
411    /// Remaining entries in this batch
412    remaining: u8,
413}
414
415impl JournalEntryGenerator {
416    /// Create a new journal entry generator.
417    pub fn new_with_params(
418        config: TransactionConfig,
419        coa: Arc<ChartOfAccounts>,
420        companies: Vec<String>,
421        start_date: NaiveDate,
422        end_date: NaiveDate,
423        seed: u64,
424    ) -> Self {
425        Self::new_with_full_config(
426            config,
427            coa,
428            companies,
429            start_date,
430            end_date,
431            seed,
432            TemplateConfig::default(),
433            None,
434        )
435    }
436
437    /// Create a new journal entry generator with full configuration.
438    #[allow(clippy::too_many_arguments)]
439    pub fn new_with_full_config(
440        config: TransactionConfig,
441        coa: Arc<ChartOfAccounts>,
442        companies: Vec<String>,
443        start_date: NaiveDate,
444        end_date: NaiveDate,
445        seed: u64,
446        template_config: TemplateConfig,
447        user_pool: Option<UserPool>,
448    ) -> Self {
449        // Initialize user pool if not provided
450        let user_pool = user_pool.or_else(|| {
451            if template_config.names.generate_realistic_names {
452                let user_gen_config = UserGeneratorConfig {
453                    culture_distribution: vec![
454                        (
455                            datasynth_core::templates::NameCulture::WesternUs,
456                            template_config.names.culture_distribution.western_us,
457                        ),
458                        (
459                            datasynth_core::templates::NameCulture::Hispanic,
460                            template_config.names.culture_distribution.hispanic,
461                        ),
462                        (
463                            datasynth_core::templates::NameCulture::German,
464                            template_config.names.culture_distribution.german,
465                        ),
466                        (
467                            datasynth_core::templates::NameCulture::French,
468                            template_config.names.culture_distribution.french,
469                        ),
470                        (
471                            datasynth_core::templates::NameCulture::Chinese,
472                            template_config.names.culture_distribution.chinese,
473                        ),
474                        (
475                            datasynth_core::templates::NameCulture::Japanese,
476                            template_config.names.culture_distribution.japanese,
477                        ),
478                        (
479                            datasynth_core::templates::NameCulture::Indian,
480                            template_config.names.culture_distribution.indian,
481                        ),
482                    ],
483                    email_domain: template_config.names.email_domain.clone(),
484                    generate_realistic_names: true,
485                };
486                let mut user_gen = UserGenerator::with_config(seed + 100, user_gen_config);
487                Some(user_gen.generate_standard(&companies))
488            } else {
489                None
490            }
491        });
492
493        // Initialize reference generator
494        let mut ref_gen = ReferenceGenerator::new(
495            start_date.year(),
496            companies
497                .first()
498                .map(std::string::String::as_str)
499                .unwrap_or("1000"),
500        );
501        ref_gen.set_prefix(
502            ReferenceType::Invoice,
503            &template_config.references.invoice_prefix,
504        );
505        ref_gen.set_prefix(
506            ReferenceType::PurchaseOrder,
507            &template_config.references.po_prefix,
508        );
509        ref_gen.set_prefix(
510            ReferenceType::SalesOrder,
511            &template_config.references.so_prefix,
512        );
513
514        // Create weighted company selector (uniform weights for this constructor)
515        let company_selector = WeightedCompanySelector::uniform(companies.clone());
516
517        Self {
518            rng: seeded_rng(seed, 0),
519            source_mix_rng: seeded_rng(seed, 50_063),
520            recurring_archetypes: std::collections::HashMap::new(),
521            template_rng: seeded_rng(seed, 70_081),
522            reversal_buffer: Vec::new(),
523            reversal_rng: seeded_rng(seed, 90_017),
524            account_rng: seeded_rng(seed, 60_071),
525            allocation_rng: seeded_rng(seed, 80_023),
526            fx_rng: seeded_rng(seed, 70_093),
527            cond_pair_rng: seeded_rng(seed, 110_071),
528            cond_pair_sampler: None,
529            current_je_source: None,
530            seed,
531            config: config.clone(),
532            coa,
533            companies,
534            company_selector,
535            line_sampler: LineItemSampler::with_config(
536                seed + 1,
537                config.line_item_distribution.clone(),
538                config.even_odd_distribution.clone(),
539                config.debit_credit_distribution.clone(),
540            ),
541            amount_sampler: AmountSampler::with_config(seed + 2, config.amounts.clone()),
542            temporal_sampler: TemporalSampler::with_config(
543                seed + 3,
544                config.seasonality.clone(),
545                WorkingHoursConfig::default(),
546                Vec::new(),
547            ),
548            start_date,
549            end_date,
550            count: 0,
551            uuid_factory: DeterministicUuidFactory::new(seed, GeneratorType::JournalEntry),
552            user_pool,
553            description_generator: DescriptionGenerator::new(),
554            reference_generator: ref_gen,
555            template_config,
556            vendor_pool: VendorPool::standard(),
557            customer_pool: CustomerPool::standard(),
558            material_pool: None,
559            cost_center_pool: Vec::new(),
560            profit_center_pool: Vec::new(),
561            using_real_master_data: false,
562            fraud_config: FraudConfig::default(),
563            persona_errors_enabled: true, // Enable by default for realism
564            approval_enabled: true,       // Enable by default for realism
565            approval_threshold: rust_decimal::Decimal::new(10000, 0), // $10,000 default threshold
566            sod_violation_rate: 0.10,     // 10% default SOD violation rate
567            batch_state: None,
568            drift_controller: None,
569            // Always provide a basic BusinessDayCalculator so that weekend/holiday
570            // filtering is active even when temporal_patterns is not explicitly enabled.
571            business_day_calculator: Some(BusinessDayCalculator::new(HolidayCalendar::new(
572                Region::US,
573                start_date.year(),
574            ))),
575            processing_lag_calculator: None,
576            temporal_patterns_config: None,
577            business_process_weights: DEFAULT_BUSINESS_PROCESS_WEIGHTS,
578            advanced_amount_sampler: None,
579            conditional_amount_override: None,
580            correlation_copula: None,
581            loaded_priors: None,
582            iet_day_accum: std::collections::HashMap::new(),
583            last_tp_by_source: std::collections::HashMap::new(),
584            velocity_calibrator: None,
585            md_resolver: MasterDataResolver::default(),
586        }
587    }
588
589    /// Wire v3.4.0 advanced distributions. When the caller's config has
590    /// `distributions.enabled = true` AND `distributions.amounts.enabled =
591    /// true`, the journal-entry generator routes non-fraud amount sampling
592    /// through an [`AdvancedAmountSampler`] (log-normal or Gaussian mixture).
593    ///
594    /// When `distributions.industry_profile` is `Some`, the caller's
595    /// explicitly configured components override nothing — if the component
596    /// list is empty, the industry profile's `sales_amounts` mixture is used
597    /// instead. Explicit components always win.
598    ///
599    /// Returning `Ok(())` with no side effect is intentional for the
600    /// following no-op cases, so callers can unconditionally invoke this:
601    ///   - `config.enabled = false`
602    ///   - `config.amounts.enabled = false`
603    ///   - empty component list with no industry profile
604    ///
605    /// Errors propagate from mixture validation (e.g. weights not summing
606    /// to 1.0, non-positive sigma).
607    pub fn set_advanced_distributions(
608        &mut self,
609        config: &AdvancedDistributionConfig,
610        seed: u64,
611    ) -> Result<(), String> {
612        if !config.enabled {
613            return Ok(());
614        }
615
616        // v3.5.3+: build a conditional-amount override when the config
617        // declares a rule with `output_field == "amount"` and a supported
618        // input field. The override is applied *after* the standard
619        // cascade so it doesn't disturb fraud-path sampling. Unsupported
620        // input fields are ignored with a trace log.
621        self.conditional_amount_override = config
622            .conditional
623            .iter()
624            .find(|c| {
625                c.output_field == "amount" && Self::supported_conditional_input(&c.input_field)
626            })
627            .and_then(|c| {
628                datasynth_core::distributions::ConditionalSampler::new(
629                    seed.wrapping_add(17),
630                    c.to_core_config(),
631                )
632                .ok()
633            });
634
635        // v4.1.0+: all 5 copula types wired (Gaussian / Clayton /
636        // Gumbel / Frank / Student-t). The `BivariateCopulaSampler`
637        // already implements each; v3.5.4 had a filter limiting to
638        // Gaussian only — lifted here now that the smoke test matrix
639        // covers all types.
640        self.correlation_copula = config
641            .correlations
642            .to_core_config_for_pair("amount", "line_count")
643            .and_then(|copula_cfg| {
644                datasynth_core::distributions::BivariateCopulaSampler::new(
645                    seed.wrapping_add(31),
646                    copula_cfg,
647                )
648                .ok()
649            });
650
651        // v3.4.4+: Pareto takes precedence over mixture models when set.
652        // This supports heavy-tailed amount distributions (capex, strategic
653        // contracts, fraud) that log-normal/Gaussian mixtures can't model
654        // as sharply.
655        if let Some(pareto) = &config.pareto {
656            if pareto.enabled {
657                let core_cfg = pareto.to_core_config();
658                self.advanced_amount_sampler =
659                    Some(AdvancedAmountSampler::new_pareto(seed, core_cfg)?);
660                return Ok(());
661            }
662        }
663
664        if !config.amounts.enabled {
665            return Ok(());
666        }
667
668        match config.amounts.distribution_type {
669            MixtureDistributionType::LogNormal => {
670                let lognormal_cfg = config.amounts.to_log_normal_config().or_else(|| {
671                    config
672                        .industry_profile
673                        .as_ref()
674                        .map(|p| industry_profile_to_log_normal(p.profile_type()))
675                });
676                if let Some(cfg) = lognormal_cfg {
677                    self.advanced_amount_sampler =
678                        Some(AdvancedAmountSampler::new_log_normal(seed, cfg)?);
679                }
680            }
681            MixtureDistributionType::Gaussian => {
682                if let Some(cfg) = config.amounts.to_gaussian_config() {
683                    self.advanced_amount_sampler =
684                        Some(AdvancedAmountSampler::new_gaussian(seed, cfg)?);
685                }
686            }
687        }
688
689        Ok(())
690    }
691
692    /// Override the business-process volume mix. Weights map directly to the
693    /// `business_processes.*_weight` YAML config; they do not have to sum to
694    /// exactly 1.0 (they're normalized via `weighted_select`).
695    pub fn set_business_process_weights(
696        &mut self,
697        o2c: f64,
698        p2p: f64,
699        r2r: f64,
700        h2r: f64,
701        a2r: f64,
702    ) {
703        self.business_process_weights = [
704            (BusinessProcess::O2C, o2c),
705            (BusinessProcess::P2P, p2p),
706            (BusinessProcess::R2R, r2r),
707            (BusinessProcess::H2R, h2r),
708            (BusinessProcess::A2R, a2r),
709        ];
710    }
711
712    /// Create from a full GeneratorConfig.
713    ///
714    /// This constructor uses the volume_weight from company configs
715    /// for weighted company selection, and fraud config from GeneratorConfig.
716    pub fn from_generator_config(
717        full_config: &GeneratorConfig,
718        coa: Arc<ChartOfAccounts>,
719        start_date: NaiveDate,
720        end_date: NaiveDate,
721        seed: u64,
722    ) -> Self {
723        let companies: Vec<String> = full_config
724            .companies
725            .iter()
726            .map(|c| c.code.clone())
727            .collect();
728
729        // Create weighted selector using volume_weight from company configs
730        let company_selector = WeightedCompanySelector::from_configs(&full_config.companies);
731
732        let mut generator = Self::new_with_full_config(
733            full_config.transactions.clone(),
734            coa,
735            companies,
736            start_date,
737            end_date,
738            seed,
739            full_config.templates.clone(),
740            None,
741        );
742
743        // Override the uniform selector with weighted selector
744        generator.company_selector = company_selector;
745
746        // Set fraud config
747        generator.fraud_config = full_config.fraud.clone();
748
749        // Configure temporal patterns if enabled
750        let temporal_config = &full_config.temporal_patterns;
751        if temporal_config.enabled {
752            generator = generator.with_temporal_patterns(temporal_config.clone(), seed);
753        }
754
755        generator
756    }
757
758    /// Configure temporal patterns including business day calculations and processing lags.
759    ///
760    /// This enables realistic temporal behavior including:
761    /// - Business day awareness (no postings on weekends/holidays)
762    /// - Processing lag modeling (event-to-posting delays)
763    /// - Period-end dynamics (volume spikes at month/quarter/year end)
764    pub fn with_temporal_patterns(mut self, config: TemporalPatternsConfig, seed: u64) -> Self {
765        // Create business day calculator if enabled
766        if config.business_days.enabled {
767            let region = config
768                .calendars
769                .regions
770                .first()
771                .map(|r| Self::parse_region(r))
772                .unwrap_or(Region::US);
773
774            let calendar = HolidayCalendar::new(region, self.start_date.year());
775            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
776        }
777
778        // Create processing lag calculator if enabled
779        if config.processing_lags.enabled {
780            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
781            self.processing_lag_calculator =
782                Some(ProcessingLagCalculator::with_config(seed, lag_config));
783        }
784
785        // Create period-end dynamics if configured
786        let model = config.period_end.model.as_deref().unwrap_or("flat");
787        if model != "flat"
788            || config
789                .period_end
790                .month_end
791                .as_ref()
792                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
793        {
794            let dynamics = Self::convert_period_end_config(&config.period_end);
795            self.temporal_sampler.set_period_end_dynamics(dynamics);
796        }
797
798        self.temporal_patterns_config = Some(config);
799        self
800    }
801
802    /// Configure temporal patterns using a [`CountryPack`] for the holiday calendar.
803    ///
804    /// This is an alternative to `with_temporal_patterns` that derives the
805    /// holiday calendar from a country-pack definition rather than the built-in
806    /// region-based calendars.  All other temporal behaviour (business-day
807    /// adjustment, processing lags, period-end dynamics) is configured
808    /// identically.
809    pub fn with_country_pack_temporal(
810        mut self,
811        config: TemporalPatternsConfig,
812        seed: u64,
813        pack: &CountryPack,
814    ) -> Self {
815        // Create business day calculator using the country pack calendar
816        if config.business_days.enabled {
817            let calendar = HolidayCalendar::from_country_pack(pack, self.start_date.year());
818            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
819        }
820
821        // Create processing lag calculator if enabled
822        if config.processing_lags.enabled {
823            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
824            self.processing_lag_calculator =
825                Some(ProcessingLagCalculator::with_config(seed, lag_config));
826        }
827
828        // Create period-end dynamics if configured
829        let model = config.period_end.model.as_deref().unwrap_or("flat");
830        if model != "flat"
831            || config
832                .period_end
833                .month_end
834                .as_ref()
835                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
836        {
837            let dynamics = Self::convert_period_end_config(&config.period_end);
838            self.temporal_sampler.set_period_end_dynamics(dynamics);
839        }
840
841        self.temporal_patterns_config = Some(config);
842        self
843    }
844
845    /// Convert schema processing lag config to core config.
846    fn convert_processing_lag_config(
847        schema: &datasynth_config::schema::ProcessingLagSchemaConfig,
848    ) -> ProcessingLagConfig {
849        let mut config = ProcessingLagConfig {
850            enabled: schema.enabled,
851            ..Default::default()
852        };
853
854        // Helper to convert lag schema to distribution
855        let convert_lag = |lag: &datasynth_config::schema::LagDistributionSchemaConfig| {
856            let mut dist = LagDistribution::log_normal(lag.mu, lag.sigma);
857            if let Some(min) = lag.min_hours {
858                dist.min_lag_hours = min;
859            }
860            if let Some(max) = lag.max_hours {
861                dist.max_lag_hours = max;
862            }
863            dist
864        };
865
866        // Apply event-specific lags
867        if let Some(ref lag) = schema.sales_order_lag {
868            config
869                .event_lags
870                .insert(EventType::SalesOrder, convert_lag(lag));
871        }
872        if let Some(ref lag) = schema.purchase_order_lag {
873            config
874                .event_lags
875                .insert(EventType::PurchaseOrder, convert_lag(lag));
876        }
877        if let Some(ref lag) = schema.goods_receipt_lag {
878            config
879                .event_lags
880                .insert(EventType::GoodsReceipt, convert_lag(lag));
881        }
882        if let Some(ref lag) = schema.invoice_receipt_lag {
883            config
884                .event_lags
885                .insert(EventType::InvoiceReceipt, convert_lag(lag));
886        }
887        if let Some(ref lag) = schema.invoice_issue_lag {
888            config
889                .event_lags
890                .insert(EventType::InvoiceIssue, convert_lag(lag));
891        }
892        if let Some(ref lag) = schema.payment_lag {
893            config
894                .event_lags
895                .insert(EventType::Payment, convert_lag(lag));
896        }
897        if let Some(ref lag) = schema.journal_entry_lag {
898            config
899                .event_lags
900                .insert(EventType::JournalEntry, convert_lag(lag));
901        }
902
903        // Apply cross-day posting config
904        if let Some(ref cross_day) = schema.cross_day_posting {
905            config.cross_day = CrossDayConfig {
906                enabled: cross_day.enabled,
907                probability_by_hour: cross_day.probability_by_hour.clone(),
908                ..Default::default()
909            };
910        }
911
912        config
913    }
914
915    /// Convert schema period-end config to core PeriodEndDynamics.
916    fn convert_period_end_config(
917        schema: &datasynth_config::schema::PeriodEndSchemaConfig,
918    ) -> PeriodEndDynamics {
919        let model_type = schema.model.as_deref().unwrap_or("exponential");
920
921        // Helper to convert period config
922        let convert_period =
923            |period: Option<&datasynth_config::schema::PeriodEndModelSchemaConfig>,
924             default_peak: f64|
925             -> PeriodEndConfig {
926                if let Some(p) = period {
927                    let model = match model_type {
928                        "flat" => PeriodEndModel::FlatMultiplier {
929                            multiplier: p.peak_multiplier.unwrap_or(default_peak),
930                        },
931                        "extended_crunch" => PeriodEndModel::ExtendedCrunch {
932                            start_day: p.start_day.unwrap_or(-10),
933                            sustained_high_days: p.sustained_high_days.unwrap_or(3),
934                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
935                            ramp_up_days: 3, // Default ramp-up period
936                        },
937                        _ => PeriodEndModel::ExponentialAcceleration {
938                            start_day: p.start_day.unwrap_or(-10),
939                            base_multiplier: p.base_multiplier.unwrap_or(1.0),
940                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
941                            decay_rate: p.decay_rate.unwrap_or(0.3),
942                        },
943                    };
944                    PeriodEndConfig {
945                        enabled: true,
946                        model,
947                        additional_multiplier: p.additional_multiplier.unwrap_or(1.0),
948                    }
949                } else {
950                    PeriodEndConfig {
951                        enabled: true,
952                        model: PeriodEndModel::ExponentialAcceleration {
953                            start_day: -10,
954                            base_multiplier: 1.0,
955                            peak_multiplier: default_peak,
956                            decay_rate: 0.3,
957                        },
958                        additional_multiplier: 1.0,
959                    }
960                }
961            };
962
963        PeriodEndDynamics::new(
964            convert_period(schema.month_end.as_ref(), 2.0),
965            convert_period(schema.quarter_end.as_ref(), 3.5),
966            convert_period(schema.year_end.as_ref(), 5.0),
967        )
968    }
969
970    /// Parse a region string into a Region enum.
971    fn parse_region(region_str: &str) -> Region {
972        match region_str.to_uppercase().as_str() {
973            "US" => Region::US,
974            "DE" => Region::DE,
975            "GB" => Region::GB,
976            "CN" => Region::CN,
977            "JP" => Region::JP,
978            "IN" => Region::IN,
979            "BR" => Region::BR,
980            "MX" => Region::MX,
981            "AU" => Region::AU,
982            "SG" => Region::SG,
983            "KR" => Region::KR,
984            "FR" => Region::FR,
985            "IT" => Region::IT,
986            "ES" => Region::ES,
987            "CA" => Region::CA,
988            _ => Region::US,
989        }
990    }
991
992    /// Set a custom company selector.
993    pub fn set_company_selector(&mut self, selector: WeightedCompanySelector) {
994        self.company_selector = selector;
995    }
996
997    /// Get the current company selector.
998    pub fn company_selector(&self) -> &WeightedCompanySelector {
999        &self.company_selector
1000    }
1001
1002    /// Set fraud configuration.
1003    pub fn set_fraud_config(&mut self, config: FraudConfig) {
1004        self.fraud_config = config;
1005    }
1006
1007    /// Set vendors from generated master data.
1008    ///
1009    /// This replaces the default vendor pool with actual generated vendors,
1010    /// ensuring JEs reference real master data entities.
1011    pub fn with_vendors(mut self, vendors: &[Vendor]) -> Self {
1012        if !vendors.is_empty() {
1013            self.vendor_pool = VendorPool::from_vendors(vendors.to_vec());
1014            self.using_real_master_data = true;
1015        }
1016        self
1017    }
1018
1019    /// Set customers from generated master data.
1020    ///
1021    /// This replaces the default customer pool with actual generated customers,
1022    /// ensuring JEs reference real master data entities.
1023    pub fn with_customers(mut self, customers: &[Customer]) -> Self {
1024        if !customers.is_empty() {
1025            self.customer_pool = CustomerPool::from_customers(customers.to_vec());
1026            self.using_real_master_data = true;
1027        }
1028        self
1029    }
1030
1031    /// Set materials from generated master data.
1032    ///
1033    /// This provides material references for JEs that involve inventory movements.
1034    pub fn with_materials(mut self, materials: &[Material]) -> Self {
1035        if !materials.is_empty() {
1036            self.material_pool = Some(MaterialPool::from_materials(materials.to_vec()));
1037            self.using_real_master_data = true;
1038        }
1039        self
1040    }
1041
1042    /// Set all master data at once for convenience.
1043    ///
1044    /// This is the recommended way to configure the JE generator with
1045    /// generated master data to ensure data coherence.
1046    pub fn with_master_data(
1047        self,
1048        vendors: &[Vendor],
1049        customers: &[Customer],
1050        materials: &[Material],
1051    ) -> Self {
1052        self.with_vendors(vendors)
1053            .with_customers(customers)
1054            .with_materials(materials)
1055    }
1056
1057    /// SP6 — Build a [`MasterDataResolver`] from the run's master data and
1058    /// store it in `self.md_resolver`. Call once before JE generation begins
1059    /// (the entry method `generate` calls this lazily on the first entry when
1060    /// the resolver pools are empty). Pools are cheap `Vec<String>` snapshots
1061    /// of names already held in the generator's vendor/customer/user pools.
1062    fn refresh_md_resolver(&mut self) {
1063        let companies: Vec<String> = self
1064            .vendor_pool
1065            .vendors
1066            .iter()
1067            .map(|v| v.name.clone())
1068            .chain(self.customer_pool.customers.iter().map(|c| c.name.clone()))
1069            .collect();
1070
1071        let persons: Vec<String> = self
1072            .user_pool
1073            .as_ref()
1074            .map(|p| p.users.iter().map(|u| u.display_name.clone()).collect())
1075            .unwrap_or_default();
1076
1077        let streets: Vec<String> = Vec::new(); // No address master entity in this generator.
1078        let patients = synthetic_patient_pool("de_CH");
1079
1080        self.md_resolver = MasterDataResolver {
1081            companies,
1082            persons,
1083            streets,
1084            patients,
1085        };
1086    }
1087
1088    /// Set the cost-center pool used by line-item enrichment.
1089    ///
1090    /// The orchestrator wires this from the generated cost-centers
1091    /// master so `JE.cost_center` joins back to `cost_centers.id`.
1092    /// When the pool is non-empty `enrich_line_items` picks
1093    /// deterministically from it; the hardcoded fallback
1094    /// `COST_CENTER_POOL` const is only used when the pool is empty
1095    /// (configs that don't generate cost-center master data).
1096    pub fn with_cost_center_pool(mut self, ids: Vec<String>) -> Self {
1097        self.cost_center_pool = ids;
1098        self
1099    }
1100
1101    /// Set the profit-center pool used by line-item enrichment.
1102    ///
1103    /// Same semantics as `with_cost_center_pool` but for the
1104    /// profit-centers master.  Without this, the legacy
1105    /// `PC-{company_code}-{P2P|O2C|R2R|H2R}` derivation is used —
1106    /// which is consistent within a generation run but does not
1107    /// match the format the master data generator emits.
1108    pub fn with_profit_center_pool(mut self, ids: Vec<String>) -> Self {
1109        self.profit_center_pool = ids;
1110        self
1111    }
1112
1113    /// Replace the auto-generated user pool with an externally-built one.
1114    ///
1115    /// The orchestrator builds a [`UserPool`] from the generated
1116    /// employee master ([`UserPool::from_employees`]) and passes it
1117    /// here, so `JE.created_by` joins back to `employees.user_id`.
1118    /// Without this call, `with_country_pack_names` generates its
1119    /// own user pool whose ids are disjoint from the employee
1120    /// master.
1121    pub fn with_user_pool(mut self, pool: UserPool) -> Self {
1122        self.user_pool = Some(pool);
1123        self
1124    }
1125
1126    /// Replace the user pool with one generated from a [`CountryPack`].
1127    ///
1128    /// This is an alternative to the default name-culture distribution that
1129    /// derives name pools and weights from the country-pack's `names` section.
1130    /// The existing user pool (if any) is discarded and regenerated using
1131    /// `MultiCultureNameGenerator::from_country_pack`.
1132    pub fn with_country_pack_names(mut self, pack: &CountryPack) -> Self {
1133        let name_gen =
1134            datasynth_core::templates::MultiCultureNameGenerator::from_country_pack(pack);
1135        let config = UserGeneratorConfig {
1136            // The culture distribution is embedded in the name generator
1137            // itself, so we use an empty list here.
1138            culture_distribution: Vec::new(),
1139            email_domain: name_gen.email_domain().to_string(),
1140            generate_realistic_names: true,
1141        };
1142        let mut user_gen = UserGenerator::with_name_generator(self.seed + 100, config, name_gen);
1143        self.user_pool = Some(user_gen.generate_standard(&self.companies));
1144        self
1145    }
1146
1147    /// Check if the generator is using real master data.
1148    pub fn is_using_real_master_data(&self) -> bool {
1149        self.using_real_master_data
1150    }
1151
1152    /// Determine if this transaction should be fraudulent.
1153    /// Pick a realistic ERP `source_system` provenance code.
1154    ///
1155    /// Returns a string like `"SAP-FI/AP"`, `"manual/adjustment"`,
1156    /// `"Interface/EDI"`. Uses the business process to bias toward
1157    /// process-appropriate sub-modules (e.g. P2P → SAP-MM/IV, O2C →
1158    /// SAP-SD/IV, H2R → SAP-HR/PR). The legacy 7-code shape
1159    /// (`SAP-FI`, `SAP-MM`, etc.) is preserved as a prefix so existing
1160    /// `starts_with` filters keep working.
1161    ///
1162    /// **Manual contract**: when `is_manual` is true the returned value
1163    /// always starts with `"manual"` or `"spreadsheet"`. This is asserted
1164    /// in `test_isa240_audit_flags_populated`.
1165    fn pick_source_system(rng: &mut ChaCha8Rng, is_manual: bool, bp: BusinessProcess) -> String {
1166        if is_manual {
1167            // 8 manual provenance codes — all share a `manual/` or
1168            // `spreadsheet/` prefix.
1169            const MANUAL: &[&str] = &[
1170                "manual/standard",
1171                "manual/adjustment",
1172                "manual/reclassification",
1173                "manual/accrual",
1174                "manual/reversal",
1175                "manual/correction",
1176                "spreadsheet/upload",
1177                "spreadsheet/journal",
1178            ];
1179            let idx = (rng.random::<u32>() as usize) % MANUAL.len();
1180            return MANUAL[idx].to_string();
1181        }
1182
1183        // Process-aware automated provenance. Each process has a small
1184        // primary set; we also mix in cross-process codes ~20% of the
1185        // time so the taxonomy stays diverse without losing coherence.
1186        let primary: &[&str] = match bp {
1187            BusinessProcess::P2P => &[
1188                "SAP-MM/PO",
1189                "SAP-MM/IV",
1190                "SAP-MM/IM",
1191                "SAP-FI/AP",
1192                "Interface/EDI",
1193            ],
1194            BusinessProcess::O2C => &[
1195                "SAP-SD/ORD",
1196                "SAP-SD/DEL",
1197                "SAP-SD/IV",
1198                "SAP-FI/AR",
1199                "Interface/Lockbox",
1200            ],
1201            BusinessProcess::H2R => &["SAP-HR/PR", "SAP-HR/TIME", "Interface/PayRun"],
1202            BusinessProcess::A2R => &["SAP-FI/AA", "SAP-FI/GL"],
1203            BusinessProcess::Treasury => &["Treasury/CM", "Treasury/HD", "Interface/Bank"],
1204            BusinessProcess::Tax => &["Tax/RPT", "SAP-FI/GL"],
1205            BusinessProcess::Mfg => &["SAP-MM/IM", "SAP-FI/GL"],
1206            // R2R, S2C, Bank, Audit, Intercompany, ProjectAccounting, Esg
1207            // → fall through to a generic mix.
1208            _ => &[
1209                "SAP-FI/GL",
1210                "SAP-FI/AP",
1211                "SAP-FI/AR",
1212                "SAP-FI/AA",
1213                "External/SubL",
1214            ],
1215        };
1216
1217        // 80% process-appropriate, 20% cross-process (pulled from a
1218        // generic pool) so the categorical distribution has long tails.
1219        const CROSS: &[&str] = &[
1220            "SAP-FI/GL",
1221            "SAP-FI/AP",
1222            "SAP-FI/AR",
1223            "Interface/EDI",
1224            "Interface/Bank",
1225            "External/SubL",
1226        ];
1227        let pool = if rng.random::<f64>() < 0.80 {
1228            primary
1229        } else {
1230            CROSS
1231        };
1232        let idx = (rng.random::<u32>() as usize) % pool.len();
1233        pool[idx].to_string()
1234    }
1235
1236    /// T2-D Lever 1: choose the `sap_source_code` emitted in the CSV `source`
1237    /// column. Priority: loaded industry priors' `source_mix` (SP3.6) → the
1238    /// default generic SAP doc-type mix when `transactions.synthetic_source_codes`
1239    /// is on (the default) → `None` (legacy: `source` falls back to the coarse
1240    /// `TransactionSource` enum). Closes the source-mix breadth gap by default
1241    /// (entropy ~0.75 → ~2.7; experiments/ml/FINDINGS.md §6).
1242    fn sample_sap_source_code(&mut self) -> Option<String> {
1243        if let Some(p) = self.loaded_priors.as_ref() {
1244            return Some(p.source_mix.sample(&mut self.rng));
1245        }
1246        if self.config.synthetic_source_codes.unwrap_or(true) {
1247            // Independent stream: never perturb the main RNG, so all other
1248            // fields stay byte-identical to the legacy (enum-source) output.
1249            return Some(DEFAULT_SOURCE_MIX.sample(&mut self.source_mix_rng));
1250        }
1251        None
1252    }
1253
1254    /// SOTA-1: on the no-priors path, reuse a cached `(debit, credit)` account
1255    /// archetype matching the line counts for this `(company, doc_type)` with
1256    /// high probability, so standard postings recur (and a hot subset of
1257    /// accounts dominates) instead of every JE drawing fresh uniform accounts.
1258    /// Returns the accounts to use, or `None` to select fresh (then cached).
1259    /// Rolls `template_rng` first so the main RNG (amounts/dates/counts) is
1260    /// never perturbed — only account *choice* changes on reuse.
1261    fn pick_recurring_archetype(
1262        &mut self,
1263        company: &str,
1264        doc_type: &str,
1265        debit_count: usize,
1266        credit_count: usize,
1267    ) -> Option<(Vec<String>, Vec<String>)> {
1268        if !self.config.recurring_templates.unwrap_or(true) {
1269            return None;
1270        }
1271        // Priors carry their own GL-account structure; templating is the no-priors
1272        // default-path realism boost (FINDINGS sec.8) UNLESS the user has explicitly
1273        // set archetype_reuse_probability — in that case SOTA-1 composes with the
1274        // priors path (SOTA-9 #137: lift corpus recurring share toward ~0.97).
1275        let p_reuse_opt = self.config.archetype_reuse_probability;
1276        if p_reuse_opt.is_none() && self.loaded_priors.is_some() {
1277            return None;
1278        }
1279        let p_reuse = p_reuse_opt.unwrap_or(0.90);
1280        if self.template_rng.random::<f64>() >= p_reuse {
1281            return None;
1282        }
1283        let lib = self
1284            .recurring_archetypes
1285            .get(&(company.to_string(), doc_type.to_string()))?;
1286        let matching: Vec<&(Vec<String>, Vec<String>)> = lib
1287            .iter()
1288            .filter(|(d, c)| d.len() == debit_count && c.len() == credit_count)
1289            .collect();
1290        if matching.is_empty() {
1291            return None;
1292        }
1293        // Power-law (Zipf) over the cached archetypes rather than a uniform pick:
1294        // the earlier-cached "standard" posting of each (company, doc-type, shape)
1295        // dominates, so a hot subset of archetypes carries most JEs. Uniform reuse
1296        // kept the per-JE recurring share high but left the archetype head too
1297        // flat (top-50 coverage 0.49 vs corpus 0.65); concentrating the head lifts
1298        // top-50 coverage toward the corpus. Same mechanism as the SOTA-2 account
1299        // Pareto, drawn on the `template_rng` stream.
1300        let idx = Self::power_law_index(matching.len(), &mut self.template_rng).unwrap_or(0);
1301        Some(matching[idx].clone())
1302    }
1303
1304    /// SOTA-1: record a freshly-selected archetype for future reuse, capped per
1305    /// `(company, doc_type)` so the standard-posting library stays small.
1306    fn cache_recurring_archetype(
1307        &mut self,
1308        company: &str,
1309        doc_type: &str,
1310        debit: Vec<String>,
1311        credit: Vec<String>,
1312    ) {
1313        if self.loaded_priors.is_some() || !self.config.recurring_templates.unwrap_or(true) {
1314            return;
1315        }
1316        if debit.is_empty() && credit.is_empty() {
1317            return;
1318        }
1319        const CAP: usize = 24; // distinct archetypes per (company, doc-type) — fewer ⇒ top-50 archetypes cover more JEs (toward corpus top-50 ~0.65)
1320        let lib = self
1321            .recurring_archetypes
1322            .entry((company.to_string(), doc_type.to_string()))
1323            .or_default();
1324        if lib.len() < CAP {
1325            lib.push((debit, credit));
1326        }
1327    }
1328
1329    /// SOTA-5: with probability `transactions.reversal_rate` (default ~10%),
1330    /// build a reversal/correction of a recent JE (swap dr/cr, reference the
1331    /// original) instead of a fresh JE. Uses `reversal_rng` and an id derived
1332    /// from the original, so the main RNG + uuid factory are unperturbed (normal
1333    /// JEs stay byte-identical; reversals are interspersed). Balanced because the
1334    /// original was balanced and we swap each line's debit/credit.
1335    fn maybe_generate_reversal(&mut self) -> Option<JournalEntry> {
1336        let rate = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE);
1337        if rate <= 0.0 || self.reversal_buffer.is_empty() {
1338            return None;
1339        }
1340        if self.reversal_rng.random::<f64>() >= rate {
1341            return None;
1342        }
1343        let pick = (self.reversal_rng.random::<u32>() as usize) % self.reversal_buffer.len();
1344        // Consume the entry so the same original is never reversed twice — that
1345        // would mint the same derived id (`orig ^ salt`) and produce duplicate
1346        // document IDs (regression caught by `test_document_reference_integrity`).
1347        let mut entry = self.reversal_buffer.remove(pick);
1348        let orig_id = entry.header.document_id;
1349        // Reversal posts a few business days after the original.
1350        let offset = 1 + (self.reversal_rng.random::<u32>() % 7) as i64;
1351        let mut rev_date = entry.header.posting_date + chrono::Duration::days(offset);
1352        if let Some(ref calc) = self.business_day_calculator {
1353            if !calc.is_business_day(rev_date) {
1354                rev_date = calc.next_business_day(rev_date, false);
1355            }
1356        }
1357        if rev_date > self.end_date {
1358            rev_date = entry.header.posting_date;
1359        }
1360        // Deterministic id derived from the original (no uuid-factory advance).
1361        let rev_id =
1362            uuid::Uuid::from_u128(orig_id.as_u128() ^ 0x5245_5645_5253_414c_5245_5645_5253_414c);
1363        // Inherit everything from the original (source code, line text, audit
1364        // flags, ...); change only the markers + each line's debit/credit.
1365        entry.header.document_id = rev_id;
1366        entry.header.posting_date = rev_date;
1367        entry.header.document_date = rev_date;
1368        entry.header.fiscal_year = rev_date.year() as u16;
1369        entry.header.fiscal_period = rev_date.month() as u8;
1370        entry.header.header_text = Some(format!("Reversal of {orig_id}"));
1371        entry.header.reference = Some(format!("REV-{orig_id}"));
1372        entry.header.batch_id = None;
1373        for line in entry.lines.iter_mut() {
1374            std::mem::swap(&mut line.debit_amount, &mut line.credit_amount);
1375            line.document_id = rev_id;
1376        }
1377        Some(entry)
1378    }
1379
1380    /// SOTA-5/6: remember a (complete) JE so a later reversal (SOTA-5) or
1381    /// allocation batch (SOTA-6) can reuse it. Populated when either process is
1382    /// enabled, so disabling reversals doesn't starve the allocation batches.
1383    fn record_for_reversal(&mut self, entry: &JournalEntry) {
1384        let reversal_on = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE) > 0.0;
1385        let allocation_on = self
1386            .config
1387            .allocation_batch_rate
1388            .unwrap_or(DEFAULT_ALLOCATION_RATE)
1389            > 0.0;
1390        if (!reversal_on && !allocation_on) || entry.lines.is_empty() {
1391            return;
1392        }
1393        const CAP: usize = 64;
1394        if self.reversal_buffer.len() >= CAP {
1395            self.reversal_buffer.remove(0);
1396        }
1397        self.reversal_buffer.push(entry.clone());
1398    }
1399
1400    /// SOTA-4: with probability `transactions.foreign_currency_rate`, post this JE
1401    /// in a foreign document currency (SAP-style). `debit_amount`/`credit_amount`/
1402    /// `local_amount` stay the company-ledger amount (DMBTR — the trial balance is
1403    /// unaffected); `header.currency`/`header.exchange_rate` + each line's
1404    /// `transaction_amount` (WRBTR) carry the foreign value. Balance holds in both
1405    /// currencies (every line shares one rate). Drawn on `fx_rng` so the main
1406    /// `rng` (and all company-currency JEs) stay byte-identical.
1407    fn maybe_apply_foreign_currency(&mut self, entry: &mut JournalEntry) {
1408        let prob = self.config.foreign_currency_rate.unwrap_or(0.0);
1409        if prob <= 0.0 || self.fx_rng.random::<f64>() >= prob {
1410            return;
1411        }
1412        let (code, rate) = FOREIGN_CCYS[self.fx_rng.random_range(0..FOREIGN_CCYS.len())];
1413        let rate_dec = match Decimal::from_f64_retain(rate) {
1414            Some(r) if r > Decimal::ZERO => r,
1415            _ => return,
1416        };
1417        entry.header.currency = code.to_string();
1418        entry.header.exchange_rate = rate_dec;
1419        for line in entry.lines.iter_mut() {
1420            let ledger = line.debit_amount + line.credit_amount; // one side is zero
1421            line.transaction_amount = Some((ledger / rate_dec).round_dp(2));
1422        }
1423    }
1424
1425    /// SOTA-6: split `total` into `n` positive cent-precise parts summing
1426    /// **exactly** to `total` (so the JE stays balanced), with random weights so
1427    /// the allocation isn't perfectly even. Each part is ≥ 1 cent. Returns a
1428    /// single `[total]` when the amount is too small to split into `n` parts.
1429    fn split_amount(total: Decimal, n: usize, rng: &mut ChaCha8Rng) -> Vec<Decimal> {
1430        let n = n.max(1);
1431        let total_cents = (total.round_dp(2) * Decimal::from(100))
1432            .to_i64()
1433            .unwrap_or(0);
1434        if n == 1 || total_cents < n as i64 {
1435            return vec![total];
1436        }
1437        let weights: Vec<f64> = (0..n).map(|_| 0.5 + rng.random::<f64>()).collect();
1438        let sumw: f64 = weights.iter().sum::<f64>().max(f64::EPSILON);
1439        let spare = total_cents - n as i64; // ≥ 0; each part keeps a 1-cent floor
1440        let mut cents: Vec<i64> = weights
1441            .iter()
1442            .map(|w| 1 + (spare as f64 * w / sumw).floor() as i64)
1443            .collect();
1444        // dump the (small, < n) flooring leftover onto the largest part
1445        let assigned: i64 = cents.iter().sum();
1446        let leftover = total_cents - assigned;
1447        if let Some(maxp) = cents.iter_mut().max_by_key(|c| **c) {
1448            *maxp += leftover;
1449        }
1450        cents.into_iter().map(|c| Decimal::new(c, 2)).collect()
1451    }
1452
/// SOTA-3: roll a dimension value (the cost center, or the profit center as a
/// fallback) up to a business-unit code.
///
/// The mapping is a pure function of `dim`: its bytes are hashed with 32-bit
/// FNV-1a and the hash is bucketed into `BU01`..`BU11`. The same dimension
/// therefore always lands in the same business unit, which keeps BU-level
/// analytics internally consistent instead of being a random per-line label.
fn business_unit_for_dimension(dim: &str) -> String {
    const FNV_OFFSET_BASIS: u32 = 0x811c_9dc5;
    const FNV_PRIME: u32 = 0x0100_0193;
    const BUCKETS: u32 = 11;

    let hash = dim.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
    });
    let bucket = hash % BUCKETS + 1;
    format!("BU{bucket:02}")
}
1467
1468    /// SOTA-6: with probability `transactions.allocation_batch_rate` (default
1469    /// ~0.8%), emit an allocation/assessment batch instead of a fresh JE — the
1470    /// large 1-to-many posting that drives the corpus lines-per-JE tail (AB docs
1471    /// ~52 lines). Reuses a buffered JE for a valid header (no main-RNG / uuid
1472    /// advance), then explodes its largest debit line into ~30-80 cost-center-
1473    /// spread sub-lines summing to the same amount, so balance is preserved and
1474    /// the cost-center dimension breadth rises. Tagged source `AB`.
1475    fn maybe_generate_allocation_batch(&mut self) -> Option<JournalEntry> {
1476        let rate = self
1477            .config
1478            .allocation_batch_rate
1479            .unwrap_or(DEFAULT_ALLOCATION_RATE);
1480        if rate <= 0.0 || self.reversal_buffer.is_empty() {
1481            return None;
1482        }
1483        if self.allocation_rng.random::<f64>() >= rate {
1484            return None;
1485        }
1486        let pick = (self.allocation_rng.random::<u32>() as usize) % self.reversal_buffer.len();
1487        // Consume the entry (same reason as the reversal path: a reused base
1488        // would mint a duplicate derived id `base ^ salt`).
1489        let mut entry = self.reversal_buffer.remove(pick);
1490        // Explode the largest debit line across cost centers.
1491        let idx = entry
1492            .lines
1493            .iter()
1494            .enumerate()
1495            .filter(|(_, l)| l.debit_amount > Decimal::ZERO)
1496            .max_by(|a, b| a.1.debit_amount.cmp(&b.1.debit_amount))
1497            .map(|(i, _)| i)?;
1498        let template = entry.lines[idx].clone();
1499        let n = self
1500            .allocation_rng
1501            .random_range(ALLOCATION_MIN_TARGETS..=ALLOCATION_MAX_TARGETS) as usize;
1502        let parts = Self::split_amount(template.debit_amount, n, &mut self.allocation_rng);
1503        if parts.len() < ALLOCATION_MIN_TARGETS as usize {
1504            // amount too small to make a meaningful batch — leave it a normal JE
1505            return None;
1506        }
1507        // Valid cost-center candidates for this company (joins back to master).
1508        let company_code = entry.header.company_code.clone();
1509        let cc_pool: Vec<String> = if self.cost_center_pool.is_empty() {
1510            Self::COST_CENTER_POOL
1511                .iter()
1512                .map(|s| s.to_string())
1513                .collect()
1514        } else {
1515            let needle = format!("-{company_code}-");
1516            let filtered: Vec<String> = self
1517                .cost_center_pool
1518                .iter()
1519                .filter(|id| id.contains(&needle))
1520                .cloned()
1521                .collect();
1522            if filtered.is_empty() {
1523                self.cost_center_pool.clone()
1524            } else {
1525                filtered
1526            }
1527        };
1528        let mut new_lines: Vec<JournalEntryLine> =
1529            Vec::with_capacity(entry.lines.len() + parts.len() - 1);
1530        for (j, line) in entry.lines.iter().enumerate() {
1531            if j == idx {
1532                let bu_on = self.config.business_unit_dimension.unwrap_or(true);
1533                for (k, part) in parts.iter().enumerate() {
1534                    let mut nl = template.clone();
1535                    nl.debit_amount = *part;
1536                    nl.credit_amount = Decimal::ZERO;
1537                    nl.cost_center = Some(cc_pool[k % cc_pool.len()].clone());
1538                    // SOTA-3: keep business_unit coherent with the *new* CC
1539                    // (the clone carried the template's stale BU).
1540                    if bu_on {
1541                        nl.business_unit = nl
1542                            .cost_center
1543                            .as_deref()
1544                            .map(Self::business_unit_for_dimension);
1545                    }
1546                    new_lines.push(nl);
1547                }
1548            } else {
1549                new_lines.push(line.clone());
1550            }
1551        }
1552        // Derived id (distinct from the reversal salt); retag as an allocation.
1553        let base_id = entry.header.document_id;
1554        let alloc_id =
1555            uuid::Uuid::from_u128(base_id.as_u128() ^ 0xA110_CA70_A110_CA70_A110_CA70_A110_CA70);
1556        entry.header.document_id = alloc_id;
1557        entry.header.sap_source_code = Some("AB".to_string());
1558        entry.header.header_text = Some("Allocation/assessment cycle".to_string());
1559        entry.header.reference = Some(format!("ALLOC-{base_id}"));
1560        entry.header.batch_id = None;
1561        for (i, line) in new_lines.iter_mut().enumerate() {
1562            line.line_number = (i + 1) as u32;
1563            line.document_id = alloc_id;
1564        }
1565        entry.lines = new_lines.into();
1566        Some(entry)
1567    }
1568
1569    fn determine_fraud(&mut self) -> Option<FraudType> {
1570        if !self.fraud_config.enabled {
1571            return None;
1572        }
1573
1574        // Roll for fraud based on fraud rate
1575        if self.rng.random::<f64>() >= self.fraud_config.fraud_rate {
1576            return None;
1577        }
1578
1579        // Select fraud type based on distribution
1580        Some(self.select_fraud_type())
1581    }
1582
1583    /// Select a fraud type based on the configured distribution.
1584    fn select_fraud_type(&mut self) -> FraudType {
1585        let dist = &self.fraud_config.fraud_type_distribution;
1586        let roll: f64 = self.rng.random();
1587
1588        let mut cumulative = 0.0;
1589
1590        cumulative += dist.suspense_account_abuse;
1591        if roll < cumulative {
1592            return FraudType::SuspenseAccountAbuse;
1593        }
1594
1595        cumulative += dist.fictitious_transaction;
1596        if roll < cumulative {
1597            return FraudType::FictitiousTransaction;
1598        }
1599
1600        cumulative += dist.revenue_manipulation;
1601        if roll < cumulative {
1602            return FraudType::RevenueManipulation;
1603        }
1604
1605        cumulative += dist.expense_capitalization;
1606        if roll < cumulative {
1607            return FraudType::ExpenseCapitalization;
1608        }
1609
1610        cumulative += dist.split_transaction;
1611        if roll < cumulative {
1612            return FraudType::SplitTransaction;
1613        }
1614
1615        cumulative += dist.timing_anomaly;
1616        if roll < cumulative {
1617            return FraudType::TimingAnomaly;
1618        }
1619
1620        cumulative += dist.unauthorized_access;
1621        if roll < cumulative {
1622            return FraudType::UnauthorizedAccess;
1623        }
1624
1625        cumulative += dist.duplicate_payment;
1626        if roll < cumulative {
1627            return FraudType::DuplicatePayment;
1628        }
1629
1630        cumulative += dist.kickback_scheme;
1631        if roll < cumulative {
1632            return FraudType::KickbackScheme;
1633        }
1634
1635        cumulative += dist.round_tripping;
1636        if roll < cumulative {
1637            return FraudType::RoundTripping;
1638        }
1639
1640        cumulative += dist.unauthorized_discount;
1641        if roll < cumulative {
1642            return FraudType::UnauthorizedDiscount;
1643        }
1644
1645        // Fallback when distribution is sub-1.0 (validator allows tolerance)
1646        FraudType::DuplicatePayment
1647    }
1648
1649    /// Map a fraud type to an amount pattern for suspicious amounts.
1650    fn fraud_type_to_amount_pattern(&self, fraud_type: FraudType) -> FraudAmountPattern {
1651        match fraud_type {
1652            FraudType::SplitTransaction | FraudType::JustBelowThreshold => {
1653                FraudAmountPattern::ThresholdAdjacent
1654            }
1655            FraudType::FictitiousTransaction
1656            | FraudType::FictitiousEntry
1657            | FraudType::SuspenseAccountAbuse
1658            | FraudType::RoundDollarManipulation => FraudAmountPattern::ObviousRoundNumbers,
1659            FraudType::RevenueManipulation
1660            | FraudType::ExpenseCapitalization
1661            | FraudType::ImproperCapitalization
1662            | FraudType::ReserveManipulation
1663            | FraudType::UnauthorizedAccess
1664            | FraudType::PrematureRevenue
1665            | FraudType::UnderstatedLiabilities
1666            | FraudType::OverstatedAssets
1667            | FraudType::ChannelStuffing => FraudAmountPattern::StatisticallyImprobable,
1668            FraudType::DuplicatePayment
1669            | FraudType::TimingAnomaly
1670            | FraudType::SelfApproval
1671            | FraudType::ExceededApprovalLimit
1672            | FraudType::SegregationOfDutiesViolation
1673            | FraudType::UnauthorizedApproval
1674            | FraudType::CollusiveApproval
1675            | FraudType::FictitiousVendor
1676            | FraudType::ShellCompanyPayment
1677            | FraudType::Kickback
1678            | FraudType::KickbackScheme
1679            | FraudType::UnauthorizedDiscount
1680            | FraudType::RoundTripping
1681            | FraudType::InvoiceManipulation
1682            | FraudType::AssetMisappropriation
1683            | FraudType::InventoryTheft
1684            | FraudType::GhostEmployee => FraudAmountPattern::Normal,
1685            // Accounting Standards Fraud Types (ASC 606/IFRS 15 - Revenue)
1686            FraudType::ImproperRevenueRecognition
1687            | FraudType::ImproperPoAllocation
1688            | FraudType::VariableConsiderationManipulation
1689            | FraudType::ContractModificationMisstatement => {
1690                FraudAmountPattern::StatisticallyImprobable
1691            }
1692            // Accounting Standards Fraud Types (ASC 842/IFRS 16 - Leases)
1693            FraudType::LeaseClassificationManipulation
1694            | FraudType::OffBalanceSheetLease
1695            | FraudType::LeaseLiabilityUnderstatement
1696            | FraudType::RouAssetMisstatement => FraudAmountPattern::StatisticallyImprobable,
1697            // Accounting Standards Fraud Types (ASC 820/IFRS 13 - Fair Value)
1698            FraudType::FairValueHierarchyManipulation
1699            | FraudType::Level3InputManipulation
1700            | FraudType::ValuationTechniqueManipulation => {
1701                FraudAmountPattern::StatisticallyImprobable
1702            }
1703            // Accounting Standards Fraud Types (ASC 360/IAS 36 - Impairment)
1704            FraudType::DelayedImpairment
1705            | FraudType::ImpairmentTestAvoidance
1706            | FraudType::CashFlowProjectionManipulation
1707            | FraudType::ImproperImpairmentReversal => FraudAmountPattern::StatisticallyImprobable,
1708            // Sourcing/Procurement Fraud
1709            FraudType::BidRigging
1710            | FraudType::PhantomVendorContract
1711            | FraudType::ConflictOfInterestSourcing => FraudAmountPattern::Normal,
1712            FraudType::SplitContractThreshold => FraudAmountPattern::ThresholdAdjacent,
1713            // HR/Payroll Fraud
1714            FraudType::GhostEmployeePayroll
1715            | FraudType::PayrollInflation
1716            | FraudType::DuplicateExpenseReport
1717            | FraudType::FictitiousExpense => FraudAmountPattern::Normal,
1718            FraudType::SplitExpenseToAvoidApproval => FraudAmountPattern::ThresholdAdjacent,
1719            // O2C Fraud
1720            FraudType::RevenueTimingManipulation => FraudAmountPattern::StatisticallyImprobable,
1721            FraudType::QuotePriceOverride => FraudAmountPattern::Normal,
1722        }
1723    }
1724
1725    /// Return the next document id from `self.uuid_factory`.
    ///
    /// Thin wrapper over `uuid_factory.next()`. It takes `&self`, so ids can be
    /// minted without a mutable borrow of the generator.
    /// NOTE(review): presumably the factory advances an internal counter through
    /// interior state — confirm against `DeterministicUuidFactory`.
1726    #[inline]
1727    fn generate_deterministic_uuid(&self) -> uuid::Uuid {
1728        self.uuid_factory.next()
1729    }
1730
1731    /// Built-in fallback cost-center codes.
    ///
    /// Used for expense-line enrichment and for allocation-batch sub-lines
    /// whenever `self.cost_center_pool` is empty, i.e. no master-data pool was
    /// supplied. These codes do not follow the `CC-{company}-...` master-id
    /// convention.
1732    const COST_CENTER_POOL: &'static [&'static str] =
1733        &["CC1000", "CC2000", "CC3000", "CC4000", "CC5000"];
1734
1735    /// Enrich journal entry line items with account descriptions, cost centers,
1736    /// profit centers, value dates, line text, and assignment fields.
1737    ///
1738    /// This populates the sparse optional fields that `JournalEntryLine::debit()`
1739    /// and `::credit()` leave as `None`.
1740    ///
1741    /// SP3 T13: changed to `&mut self` so `loaded_priors` fanout samplers
1742    /// can be driven for CostCenter and ProfitCenter when priors are loaded.
    ///
    /// Every step is guarded by an `is_none()` check, so a value already present
    /// on a line is never overwritten. Per line, in order:
    ///
    /// 1. `account_description` from the chart of accounts;
    /// 2. `cost_center` — prior-driven when priors are loaded, otherwise only for
    ///    GL accounts starting with `5` or `6`, picked from the master pool
    ///    (narrowed to the entry's company when possible) or `COST_CENTER_POOL`;
    /// 3. `profit_center` — prior-driven, else master pool, else
    ///    `PC-{company_code}` plus a business-process suffix;
    /// 3b. `business_unit` rolled up from the cost center (or profit center);
    /// 4. `trading_partner` inherited from the header;
    /// 5. `line_text` inherited from the header text;
    /// 6. `value_date` = posting date for accounts starting `1100` / `2000`;
    /// 7. `assignment` parsed from the tail of the header text for those accounts.
    ///
    /// `self.rng` is only handed to the prior samplers; the non-prior fallbacks
    /// use no randomness and index pools by a byte-sum of the document id plus
    /// the line index.
1743    fn enrich_line_items(&mut self, entry: &mut JournalEntry) {
1744        let posting_date = entry.header.posting_date;
1745        let company_code = &entry.header.company_code;
1746        let header_text = entry.header.header_text.clone();
1747        let business_process = entry.header.business_process;
1748        // SP3 T13 — document-type code used as the entity_id for fanout
1749        // samplers.  Derived from the header field set during generate().
1750        let doc_type_key = entry.header.document_type.clone();
1751
1752        // SP3.7 — capture the SAP source code as an owned Option<String> so it
1753        // can be passed to `sample_attribute_for_source` as a `&str` inside the
1754        // line loop without keeping a borrow on `entry`.
1755        let header_sap_code: Option<String> = entry.header.sap_source_code.clone();
1756
1757        // SP3.3 — resolve cross-entity motif neighbors once before the line
1758        // loop.  Owned Vec avoids holding a shared borrow on `self.loaded_priors`
1759        // across the subsequent `&mut` fanout-sampler calls.
1760        let (cc_pc_neighbor_vec, cc_pc_share_prob): (Vec<String>, f64) =
1761            if let Some(priors) = &self.loaded_priors {
1762                if let Some(motifs) = &priors.cross_entity_motifs {
1763                    (
1764                        motifs.neighbors(&doc_type_key).to_vec(),
1765                        motifs.should_share(&doc_type_key),
1766                    )
1767                } else {
1768                    (Vec::new(), 0.0)
1769                }
1770            } else {
1771                (Vec::new(), 0.0)
1772            };
1773
1774        // Derive a deterministic index from the document_id for cost center selection
        // (a wrapping sum of the id's bytes; `cc_seed + line index` is later
        // reduced modulo the pool length, for profit centers as well).
1775        let doc_id_bytes = entry.header.document_id.as_bytes();
1776        let mut cc_seed: usize = 0;
1777        for &b in doc_id_bytes {
1778            cc_seed = cc_seed.wrapping_add(b as usize);
1779        }
1780
1781        for (i, line) in entry.lines.iter_mut().enumerate() {
1782            // 1. account_description: look up from CoA
1783            if line.account_description.is_none() {
1784                line.account_description = self
1785                    .coa
1786                    .get_account(&line.gl_account)
1787                    .map(|a| a.short_description.clone());
1788            }
1789
1790            // 2. cost_center: assign to expense accounts (5xxx/6xxx)
1791            //
1792            // SP3 T13: when priors are loaded, the CostCenter fanout
1793            // sampler overrides the pool/legacy path.  This block runs
1794            // before the existing logic; if the sampler fires, `line.cost_center`
1795            // is set and the legacy block below is skipped via the
1796            // `line.cost_center.is_none()` guard.
1797            //
1798            // When the orchestrator has provided a master-data-sourced
1799            // pool (`with_cost_center_pool`), pick from it so the value
1800            // joins back to `cost_centers.id`.  Otherwise fall back to
1801            // the legacy hardcoded `COST_CENTER_POOL` const.
1802            //
1803            // Selection within the pool is filtered to entries that
1804            // mention the entry's `company_code` (master IDs follow
1805            // the `CC-{company}-...` convention) so cross-company
1806            // contamination is avoided; if no pool entry matches the
1807            // company we fall through to the full pool.
            //
            // Note: the prior-driven branch is not restricted to 5xxx/6xxx
            // accounts; only the pool/legacy fallback applies that filter.
1808            if line.cost_center.is_none() {
1809                // SP3 T13 — prior-driven CostCenter fanout.
1810                // SP3.3: prefer neighbor-used buckets when motifs are available.
1811                // SP3.7: try per-source conditional cost_center first; fall back
1812                //        to the fanout sampler when the conditional is absent.
1813                let priors_opt = &mut self.loaded_priors;
1814                let rng_ref = &mut self.rng;
1815                if let Some(priors) = priors_opt {
1816                    let sp37_cc = header_sap_code.as_deref().and_then(|code| {
1817                        priors.sample_attribute_for_source(code, "cost_center", rng_ref)
1818                    });
1819                    if sp37_cc.is_some() {
1820                        line.cost_center = sp37_cc;
1821                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("CostCenter") {
1822                        line.cost_center = Some(sampler.pick_for_with_neighbors(
1823                            &doc_type_key,
1824                            &cc_pc_neighbor_vec,
1825                            cc_pc_share_prob,
1826                            rng_ref,
1827                        ));
1828                    }
1829                }
1830            }
1831            if line.cost_center.is_none() {
1832                let first_char = line.gl_account.chars().next().unwrap_or('0');
1833                if first_char == '5' || first_char == '6' {
1834                    if !self.cost_center_pool.is_empty() {
1835                        let needle = format!("-{company_code}-");
1836                        let candidates: Vec<&String> = self
1837                            .cost_center_pool
1838                            .iter()
1839                            .filter(|id| id.contains(&needle))
1840                            .collect();
1841                        let pool: Vec<&String> = if candidates.is_empty() {
1842                            self.cost_center_pool.iter().collect()
1843                        } else {
1844                            candidates
1845                        };
1846                        let idx = cc_seed.wrapping_add(i) % pool.len();
1847                        line.cost_center = Some(pool[idx].clone());
1848                    } else {
1849                        let idx = cc_seed.wrapping_add(i) % Self::COST_CENTER_POOL.len();
1850                        line.cost_center = Some(Self::COST_CENTER_POOL[idx].to_string());
1851                    }
1852                }
1853            }
1854
1855            // 3. profit_center: assign from master pool when available
1856            // (`with_profit_center_pool`); otherwise derive from
1857            // company code + business process (legacy behaviour, which
1858            // does not match the master-data PC ID format).
1859            //
1860            // SP3 T13: prior-driven ProfitCenter fanout override fires first
1861            // (same pattern as CostCenter above).
1862            if line.profit_center.is_none() {
1863                // SP3 T13 — prior-driven ProfitCenter fanout.
1864                // SP3.3: prefer neighbor-used buckets when motifs are available.
1865                // SP3.7: try per-source conditional profit_center first; fall back
1866                //        to the fanout sampler when the conditional is absent.
1867                let priors_opt = &mut self.loaded_priors;
1868                let rng_ref = &mut self.rng;
1869                if let Some(priors) = priors_opt {
1870                    let sp37_pc = header_sap_code.as_deref().and_then(|code| {
1871                        priors.sample_attribute_for_source(code, "profit_center", rng_ref)
1872                    });
1873                    if sp37_pc.is_some() {
1874                        line.profit_center = sp37_pc;
1875                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("ProfitCenter") {
1876                        line.profit_center = Some(sampler.pick_for_with_neighbors(
1877                            &doc_type_key,
1878                            &cc_pc_neighbor_vec,
1879                            cc_pc_share_prob,
1880                            rng_ref,
1881                        ));
1882                    }
1883                }
1884            }
1885            if line.profit_center.is_none() {
1886                if !self.profit_center_pool.is_empty() {
1887                    let needle = format!("-{company_code}-");
1888                    let candidates: Vec<&String> = self
1889                        .profit_center_pool
1890                        .iter()
1891                        .filter(|id| id.contains(&needle))
1892                        .collect();
1893                    let pool: Vec<&String> = if candidates.is_empty() {
1894                        self.profit_center_pool.iter().collect()
1895                    } else {
1896                        candidates
1897                    };
1898                    let idx = cc_seed.wrapping_add(i) % pool.len();
1899                    line.profit_center = Some(pool[idx].clone());
1900                } else {
1901                    let suffix = match business_process {
1902                        Some(BusinessProcess::P2P) => "-P2P",
1903                        Some(BusinessProcess::O2C) => "-O2C",
1904                        Some(BusinessProcess::R2R) => "-R2R",
1905                        Some(BusinessProcess::H2R) => "-H2R",
1906                        _ => "",
1907                    };
1908                    line.profit_center = Some(format!("PC-{company_code}{suffix}"));
1909                }
1910            }
1911
1912            // 3b. business_unit (SOTA-3): a coherent roll-up of the cost center,
1913            // or the profit center as fallback — the same dimension value always
1914            // maps to the same BU, so BU-level analytics are consistent. Runs
1915            // after both CC (step 2) and PC (step 3) are assigned; using CC-or-PC
1916            // lifts fill toward the corpus (~82%) vs only CC-bearing lines (~24%).
1917            // Flag-gated by `transactions.business_unit_dimension` (default-on).
1918            if line.business_unit.is_none() && self.config.business_unit_dimension.unwrap_or(true) {
1919                if let Some(dim) = line
1920                    .cost_center
1921                    .as_deref()
1922                    .or(line.profit_center.as_deref())
1923                {
1924                    line.business_unit = Some(Self::business_unit_for_dimension(dim));
1925                }
1926            }
1927
1928            // 4. trading_partner: SP3.9 — inherit JE-level trading_partner from
1929            // the header. The header was populated once per JE in generate();
1930            // all lines share the same value to match corpus SAP semantics.
1931            // The is_none() guard preserves TP values already set by the P2P/O2C
1932            // document chain manager (also JE-level, different code path).
1933            if line.trading_partner.is_none() {
1934                line.trading_partner = entry.header.trading_partner.clone();
1935            }
1936
1937            // 5. line_text: fall back to header_text if not already set
1938            if line.line_text.is_none() {
1939                line.line_text = header_text.clone();
1940            }
1941
1942            // 6. value_date: set to posting_date for AR/AP accounts
1943            if line.value_date.is_none()
1944                && (line.gl_account.starts_with("1100") || line.gl_account.starts_with("2000"))
1945            {
1946                line.value_date = Some(posting_date);
1947            }
1948
1949            // 7. assignment: set to vendor/customer reference for AP/AR lines
            // `rsplit(" - ").next()` yields the text after the last " - ", or the
            // whole header text when the separator is absent; the assignment is
            // only set when that tail carries a recognised vendor/customer prefix.
1950            if line.assignment.is_none() {
1951                if line.gl_account.starts_with("2000") {
1952                    // AP line - use vendor reference from header
1953                    if let Some(ref ht) = header_text {
1954                        // Try to extract vendor ID from header text patterns like "... - V-001"
1955                        if let Some(vendor_part) = ht.rsplit(" - ").next() {
1956                            if vendor_part.starts_with("V-")
1957                                || vendor_part.starts_with("VENDOR")
1958                                || vendor_part.starts_with("Vendor")
1959                            {
1960                                line.assignment = Some(vendor_part.to_string());
1961                            }
1962                        }
1963                    }
1964                } else if line.gl_account.starts_with("1100") {
1965                    // AR line - use customer reference from header
1966                    if let Some(ref ht) = header_text {
1967                        if let Some(customer_part) = ht.rsplit(" - ").next() {
1968                            if customer_part.starts_with("C-")
1969                                || customer_part.starts_with("CUST")
1970                                || customer_part.starts_with("Customer")
1971                            {
1972                                line.assignment = Some(customer_part.to_string());
1973                            }
1974                        }
1975                    }
1976                }
1977            }
1978        }
1979    }
1980
1981    /// Generate a single journal entry.
1982    pub fn generate(&mut self) -> JournalEntry {
1983        debug!(
1984            count = self.count,
1985            companies = self.companies.len(),
1986            start_date = %self.start_date,
1987            end_date = %self.end_date,
1988            "Generating journal entry"
1989        );
1990
1991        // Check if we're in a batch - if so, generate a batched entry
1992        if let Some(ref state) = self.batch_state {
1993            if state.remaining > 0 {
1994                return self.generate_batched_entry();
1995            }
1996        }
1997
1998        // SOTA-5: with a small probability, emit a reversal/correction of a
1999        // recent JE instead of a fresh one (a process auditors look for).
2000        if let Some(rev) = self.maybe_generate_reversal() {
2001            return rev;
2002        }
2003
2004        // SOTA-6: with a small probability, emit a large allocation/assessment
2005        // batch (the corpus lines-per-JE tail) instead of a fresh JE.
2006        if let Some(alloc) = self.maybe_generate_allocation_batch() {
2007            return alloc;
2008        }
2009
2010        // SP6 — Lazy-init the MD resolver on the first call. Rebuilding once
2011        // per run is sufficient; pools are stable after master-data generation.
2012        if self.md_resolver.companies.is_empty()
2013            && self.md_resolver.persons.is_empty()
2014            && self.md_resolver.patients.is_empty()
2015        {
2016            self.refresh_md_resolver();
2017        }
2018
2019        self.count += 1;
2020
2021        // Generate deterministic document ID
2022        let document_id = self.generate_deterministic_uuid();
2023
2024        // SP3.5c — Lazy temporal-sampler date draw.
2025        //
2026        // When priors are loaded the IET path (SP3 T11) will immediately replace
2027        // this value, so drawing from the temporal sampler here wastes one RNG
2028        // advance on the sampler's internal stream AND makes the temporal-sampler
2029        // variance contribute to the merged date sequence even though the IET
2030        // sampler is meant to dominate.
2031        //
2032        // Fix: only draw from the temporal sampler now when no priors are loaded.
2033        // The IET block sets `posting_date` unconditionally when priors are Some;
2034        // the active-window fallback (SP3 T14) has its own sample_date call and is
2035        // unaffected by this change.
2036        //
2037        // Priors-absent path: byte-identical to v5.13 — the draw and business-day
2038        // snap are performed exactly as before.
2039        let mut posting_date = if self.loaded_priors.is_none() {
2040            let mut d = self
2041                .temporal_sampler
2042                .sample_date(self.start_date, self.end_date);
2043            // Adjust posting date to be a business day if business day calculator is configured
2044            if let Some(ref calc) = self.business_day_calculator {
2045                if !calc.is_business_day(d) {
2046                    d = calc.next_business_day(d, false);
2047                    if d > self.end_date {
2048                        d = calc.prev_business_day(self.end_date, true);
2049                    }
2050                }
2051            }
2052            d
2053        } else {
2054            // Priors-loaded path: IET block (below) will set the real date.
2055            // Use start_date as a zero-cost placeholder — it is always overwritten.
2056            self.start_date
2057        };
2058
2059        // Select company using weighted selector
2060        let company_code = self.company_selector.select(&mut self.rng).to_string();
2061
2062        // v4.1.0+: draw a single (u, v) pair from the copula — cached for
2063        // both the amount adjustment (u) and the line-count shift (v).
2064        // None when no copula is configured.
2065        let copula_uv: Option<(f64, f64)> =
2066            self.correlation_copula.as_mut().map(|cop| cop.sample());
2067
2068        // Sample line item specification. When a copula is configured,
2069        // v drives line-count via a quantile-preserving map: integer
2070        // count `2 + floor(v * 10)` gives range [2, 11] evenly spaced
2071        // in v, so rank(v) == rank(line_count).
2072        //
2073        // v4.1.6+: upgraded from the v3.5.4 nudge (shift around
2074        // independently-drawn count) to true rank-preserving quantile
2075        // inversion, so empirical Kendall-τ now matches copula theory.
2076        let mut line_spec = self.line_sampler.sample();
2077        if let Some((_u, v)) = copula_uv {
2078            let new_total = 2 + ((v * 10.0).floor() as usize).min(9);
2079            let old_debit = line_spec.debit_count.max(1);
2080            let old_credit = line_spec.credit_count.max(1);
2081            let new_debit = (new_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2082                .round() as usize;
2083            let new_debit = new_debit.clamp(1, new_total - 1);
2084            let new_credit = new_total - new_debit;
2085            line_spec.total_count = new_total;
2086            line_spec.debit_count = new_debit;
2087            line_spec.credit_count = new_credit;
2088        }
2089
2090        // SOTA-10 (#138): optional hard cap on total lines per JE — tames the
2091        // monster outliers (synth max 2133 vs corpus 924). Scales debit + credit
2092        // proportionally so balance is preserved.
2093        if let Some(cap) = self.config.lines_per_je_cap {
2094            let cap = cap.max(2);
2095            let total = line_spec.debit_count + line_spec.credit_count;
2096            if total > cap {
2097                let new_debit =
2098                    ((line_spec.debit_count as f64 / total as f64) * cap as f64).round() as usize;
2099                let new_debit = new_debit.clamp(1, cap - 1);
2100                let new_credit = cap - new_debit;
2101                line_spec.total_count = cap;
2102                line_spec.debit_count = new_debit;
2103                line_spec.credit_count = new_credit;
2104            }
2105        }
2106
2107        // Determine source type using full 4-way distribution
2108        let source = self.select_source();
2109        let is_automated = matches!(
2110            source,
2111            TransactionSource::Automated | TransactionSource::Recurring
2112        );
2113
2114        // SP3.6 — when priors are loaded, sample a canonical SAP source code
2115        // from the bundle's source-mix distribution.  This is independent of
2116        // the `TransactionSource` enum (which controls manual/automated semantics)
2117        // and is written to `header.sap_source_code`, then emitted in the CSV
2118        // `source` column in place of the generic label.
2119        let sap_source_code: Option<String> = self.sample_sap_source_code();
2120        // SOTA-8: stash the current JE's SAP source so select_*_account can consult
2121        // the per-source Dirichlet pool. Cleared at the end of this generate() call.
2122        self.current_je_source = sap_source_code.clone();
2123
2124        // Select business process
2125        let business_process = self.select_business_process();
2126
2127        // SP3 T11 — IET-driven posting-date override.
2128        //
2129        // When priors are loaded, replace the uniform temporal-sampler date
2130        // with one derived from the per-Source inter-event-time prior.  We
2131        // accumulate IET samples (in fractional days) per document-type code
2132        // and map the accumulated offset onto [start_date, end_date].
2133        //
2134        // The None path is untouched: `posting_date` from the temporal sampler
2135        // above is used as-is.
2136        {
2137            // Split-borrow: three distinct struct fields accessed simultaneously.
2138            let priors_opt = &mut self.loaded_priors;
2139            let rng_ref = &mut self.rng;
2140            let iet_accum_ref = &mut self.iet_day_accum;
2141            if let Some(priors) = priors_opt {
2142                let doc_type = Self::document_type_for_process(business_process).to_string();
2143                let period_days = (self.end_date - self.start_date).num_days().max(1) as f64;
2144                let iet = priors
2145                    .iet_sampler
2146                    .sample_next(&doc_type, rng_ref)
2147                    .max(0.001);
2148                let accum = iet_accum_ref.entry(doc_type).or_insert(0.0);
2149                *accum += iet;
2150                // Wrap within period so we never exceed the generation window.
2151                if *accum >= period_days {
2152                    *accum %= period_days;
2153                }
2154                let day_offset =
2155                    (*accum as i64).clamp(0, (self.end_date - self.start_date).num_days());
2156                posting_date = self.start_date + chrono::Duration::days(day_offset);
2157                // Re-apply business-day snap so the IET date still lands on a
2158                // working day (matches the business_day_calculator logic above).
2159                if let Some(ref calc) = self.business_day_calculator {
2160                    if !calc.is_business_day(posting_date) {
2161                        posting_date = calc.next_business_day(posting_date, false);
2162                        if posting_date > self.end_date {
2163                            posting_date = calc.prev_business_day(self.end_date, true);
2164                        }
2165                    }
2166                }
2167            } // end if let Some(priors)
2168        } // end split-borrow scope
2169
2170        // SP3 T14 — active-window gating.
2171        //
2172        // After the IET-driven date is computed, check whether this Source is
2173        // still in its active window for the resulting day.  If the prior says
2174        // the Source has "gone quiet" (e.g. a vendor that stopped trading), we
2175        // fall back to the temporal-sampler date so the JE still emits but is
2176        // no longer anchored to the IET timeline for this source.
2177        //
2178        // In a day-loop architecture this would be a `continue`; here, the
2179        // equivalent is to replace `posting_date` with a fresh temporal-
2180        // sampler draw so downstream logic sees a plausible date.
2181        //
2182        // The None path is untouched.
2183        if let Some(ref priors) = self.loaded_priors {
2184            let doc_type = Self::document_type_for_process(business_process);
2185            let day_in_period = (posting_date - self.start_date).num_days();
2186            let active = match &priors.multi_segment_window {
2187                Some(msw) => msw.is_active(doc_type, day_in_period),
2188                None => priors.active_window.is_active(doc_type, day_in_period),
2189            };
2190            if !active {
2191                // Source is outside its active window: fall back to a fresh
2192                // temporal-sampler draw.  (SP3.5c: the up-front temporal draw
2193                // is skipped when priors are loaded, so we always re-sample
2194                // here in the fallback path rather than reusing a cached value.)
2195                posting_date = self
2196                    .temporal_sampler
2197                    .sample_date(self.start_date, self.end_date);
2198                if let Some(ref calc) = self.business_day_calculator {
2199                    if !calc.is_business_day(posting_date) {
2200                        posting_date = calc.next_business_day(posting_date, false);
2201                        if posting_date > self.end_date {
2202                            posting_date = calc.prev_business_day(self.end_date, true);
2203                        }
2204                    }
2205                }
2206            }
2207        }
2208
2209        // SP3 T12 — lines-per-JE override from prior histogram.
2210        //
2211        // When priors are loaded, replace `line_spec` totals with a sample
2212        // drawn from the Source-conditional histogram (falling back to the
2213        // overall histogram when the document-type is unknown).  `.max(2)`
2214        // guarantees every JE has at least one debit + one credit line.
2215        // The None path leaves `line_spec` from the copula / line-sampler
2216        // cascade above completely unchanged.
2217        if let Some(ref priors) = self.loaded_priors {
2218            let doc_type = Self::document_type_for_process(business_process);
2219            let hist = priors
2220                .lines_per_je
2221                .by_source
2222                .get(doc_type)
2223                .unwrap_or(&priors.lines_per_je.overall);
2224            let n_total = (hist.sample_bucket(&mut self.rng) as usize).max(2);
2225            let old_debit = line_spec.debit_count.max(1);
2226            let old_credit = line_spec.credit_count.max(1);
2227            let new_debit = (n_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2228                .round() as usize;
2229            let new_debit = new_debit.clamp(1, n_total - 1);
2230            line_spec.total_count = n_total;
2231            line_spec.debit_count = new_debit;
2232            line_spec.credit_count = n_total - new_debit;
2233        }
2234
2235        // Determine if this is a fraudulent transaction
2236        let fraud_type = self.determine_fraud();
2237        let is_fraud = fraud_type.is_some();
2238
2239        // Sample time based on source
2240        let time = self.temporal_sampler.sample_time(!is_automated);
2241        let created_at = posting_date.and_time(time).and_utc();
2242
2243        // Select user from pool or generate generic
2244        let (created_by, user_persona) = self.select_user(is_automated);
2245
2246        // Create header with deterministic UUID
2247        let mut header =
2248            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
2249        header.created_at = created_at;
2250        header.source = source;
2251        header.sap_source_code = sap_source_code;
2252
2253        // SP3.9 — JE-level trading partner. Draw once per JE; all lines
2254        // inherit. Corpus SAP semantics: one TP per document.
2255        // SP3.12 — TP motif sampler: bias toward cluster-mates of the
2256        // previously-drawn TP on the same source to build triangle structure.
2257        // Split-borrow: sap_source_code was moved into header above, so clone
2258        // the code out before the mutable borrow on self.loaded_priors.
2259        // (sap_source_code is cloned again below for the SP4.5 user-persona lookup)
2260        {
2261            let code_opt = header.sap_source_code.clone();
2262            if let Some(ref code) = code_opt {
2263                let rng_ref = &mut self.rng;
2264                // SP3.12: resolve TP motif neighbors from the last TP on this source.
2265                // We read last_tp_by_source (shared ref) before the mutable borrow
2266                // on loaded_priors.  The update happens after the block.
2267                let tp_neighbors: Vec<String> = if let Some(ref priors) = self.loaded_priors {
2268                    if let Some(ref motifs) = priors.tp_motif_sampler {
2269                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
2270                            motifs.neighbors(last_tp).to_vec()
2271                        } else {
2272                            Vec::new()
2273                        }
2274                    } else {
2275                        Vec::new()
2276                    }
2277                } else {
2278                    Vec::new()
2279                };
2280                let tp_share_prob: f64 = if let Some(ref priors) = self.loaded_priors {
2281                    if let Some(ref motifs) = priors.tp_motif_sampler {
2282                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
2283                            motifs.should_share(last_tp)
2284                        } else {
2285                            0.0
2286                        }
2287                    } else {
2288                        0.0
2289                    }
2290                } else {
2291                    0.0
2292                };
2293
2294                if let Some(ref mut priors) = self.loaded_priors {
2295                    // SP3.12: if the motif roll fires AND the distribution
2296                    // supports one of the neighbor TP values, draw from that
2297                    // restricted set.  Otherwise fall through to the marginal.
2298                    let tp = if !tp_neighbors.is_empty()
2299                        && tp_share_prob > 0.0
2300                        && rng_ref.random_range(0.0..1.0) < tp_share_prob
2301                    {
2302                        // Find a neighbor that the per-source TP distribution
2303                        // actually knows about.  Sample from the per-source marginal
2304                        // restricted to the neighbor subset (original weights kept).
2305                        use datasynth_core::distributions::behavioral_priors::CategoricalDistribution;
2306                        let filtered: std::collections::BTreeMap<String, f64> = priors
2307                            .per_source_attribute
2308                            .as_ref()
2309                            .and_then(|psa| psa.conditional(code, "trading_partner"))
2310                            .map(|dist| {
2311                                dist.probabilities
2312                                    .iter()
2313                                    .filter(|(v, _)| tp_neighbors.contains(v))
2314                                    .map(|(v, p)| (v.clone(), *p))
2315                                    .collect()
2316                            })
2317                            .unwrap_or_default();
2318                        if filtered.is_empty() {
2319                            priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2320                        } else {
2321                            let neighbour_dist = CategoricalDistribution {
2322                                probabilities: filtered,
2323                                n: 0, // unused in sample()
2324                            };
2325                            neighbour_dist.sample(rng_ref).or_else(|| {
2326                                priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2327                            })
2328                        }
2329                    } else {
2330                        priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2331                    };
2332                    header.trading_partner = tp;
2333                }
2334                // SP3.12: record the drawn TP so the next JE on this source
2335                // can use it as the motif anchor.
2336                if let Some(ref tp) = header.trading_partner {
2337                    self.last_tp_by_source.insert(code.clone(), tp.clone());
2338                }
2339            }
2340        }
2341
2342        // SP4.5 — user-persona prior: when a corpus prior with user data is
2343        // loaded, override `created_by` with a user characteristic of the drawn
2344        // source, and bias `created_at` hour-of-day from the user's density.
2345        // Falls back transparently to `created_by` / `created_at` already set above.
2346        let (created_by, created_at) = {
2347            let sap_code_for_user = header.sap_source_code.clone();
2348            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
2349                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
2350                    let new_created_at = if let Some((hour, _)) =
2351                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
2352                    {
2353                        let base = header.created_at;
2354                        base.date_naive()
2355                            .and_hms_opt(hour, 0, 0)
2356                            .map(|naive| naive.and_utc())
2357                            .unwrap_or(base)
2358                    } else {
2359                        header.created_at
2360                    };
2361                    (uid, new_created_at)
2362                } else {
2363                    (created_by, header.created_at)
2364                }
2365            } else {
2366                (created_by, header.created_at)
2367            }
2368        };
2369
2370        header.created_by = created_by;
2371        header.created_at = created_at;
2372        header.user_persona = user_persona;
2373        header.business_process = Some(business_process);
2374        header.document_type = Self::document_type_for_process(business_process).to_string();
2375        header.is_fraud = is_fraud;
2376        header.fraud_type = fraud_type;
2377
2378        // --- ISA 240 audit flags ---
2379        let is_manual = matches!(source, TransactionSource::Manual);
2380        header.is_manual = is_manual;
2381
2382        // Determine source_system based on manual vs automated.
2383        //
2384        // Real ERPs typically expose 20+ distinct provenance codes per
2385        // company (one per module + sub-module + interface). The taxonomy
2386        // below is a strict superset of the legacy {manual, spreadsheet,
2387        // SAP-FI, SAP-MM, SAP-SD, interface, SAP-HR} codes so downstream
2388        // consumers that filter by prefix (e.g. `starts_with("SAP-")`)
2389        // continue to work.
2390        //
2391        // Contract preserved by the generator-level audit assertion in
2392        // `test_isa240_audit_flags_populated`:
2393        //   - manual entries → starts_with("manual") || starts_with("spreadsheet")
2394        //   - automated entries → does NOT start with "manual"/"spreadsheet"
2395        header.source_system = Self::pick_source_system(&mut self.rng, is_manual, business_process);
2396
2397        // is_post_close: entry is in the last month of the configured period
2398        // and the posting date falls after the 25th (simulating close cutoff)
2399        let is_post_close = posting_date.month() == self.end_date.month()
2400            && posting_date.year() == self.end_date.year()
2401            && posting_date.day() > 25;
2402        header.is_post_close = is_post_close;
2403
2404        // created_date: for manual entries, same day as posting; for automated,
2405        // 0-3 days before posting_date
2406        let created_date = if is_manual {
2407            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second())
2408        } else {
2409            let lag_days = self.rng.random_range(0i64..=3);
2410            let created_naive_date = posting_date
2411                .checked_sub_signed(chrono::Duration::days(lag_days))
2412                .unwrap_or(posting_date);
2413            created_naive_date.and_hms_opt(
2414                self.rng.random_range(8u32..=17),
2415                self.rng.random_range(0u32..=59),
2416                self.rng.random_range(0u32..=59),
2417            )
2418        };
2419        header.created_date = created_date;
2420
2421        // Generate description context
2422        let mut context =
2423            DescriptionContext::with_period(posting_date.month(), posting_date.year());
2424
2425        // Add vendor/customer context based on business process
2426        match business_process {
2427            BusinessProcess::P2P => {
2428                if let Some(vendor) = self.vendor_pool.random_vendor(&mut self.rng) {
2429                    context.vendor_name = Some(vendor.name.clone());
2430                }
2431            }
2432            BusinessProcess::O2C => {
2433                if let Some(customer) = self.customer_pool.random_customer(&mut self.rng) {
2434                    context.customer_name = Some(customer.name.clone());
2435                }
2436            }
2437            _ => {}
2438        }
2439
2440        // Generate header text if enabled.
2441        // SP6 — Try text-taxonomy prior (sample_header_template) first,
2442        // then the built-in DescriptionGenerator.
2443        if self.template_config.descriptions.generate_header_text {
2444            let priors_header = if let Some(src) = header.sap_source_code.as_deref() {
2445                if let Some(p) = self.loaded_priors.as_ref() {
2446                    // SP6: text-taxonomy header pool
2447                    p.sample_header_template(src, &mut self.md_resolver, &mut self.rng)
2448                } else {
2449                    None
2450                }
2451            } else {
2452                None
2453            };
2454            header.header_text = Some(priors_header.unwrap_or_else(|| {
2455                self.description_generator.generate_header_text(
2456                    business_process,
2457                    &context,
2458                    &mut self.rng,
2459                )
2460            }));
2461        }
2462
2463        // Generate reference if enabled.
2464        // SP4.7 — when priors are loaded and the bundle carries a reference-format
2465        // template for the current SAP source code, sample from that distribution
2466        // instead of the fixed `ReferenceGenerator` template.  The priors path is
2467        // preferred because it produces corpus format patterns; the existing
2468        // generator is the fallback for sources not covered by the bundle.
2469        if self.template_config.references.generate_references {
2470            let priors_ref = header.sap_source_code.as_deref().and_then(|src| {
2471                self.loaded_priors
2472                    .as_ref()
2473                    .and_then(|p| p.sample_reference(src, &mut self.rng))
2474            });
2475            header.reference = Some(priors_ref.unwrap_or_else(|| {
2476                self.reference_generator
2477                    .generate_for_process_year(business_process, posting_date.year())
2478            }));
2479        }
2480
2481        // Derive typed source document from reference prefix
2482        header.source_document = header
2483            .reference
2484            .as_deref()
2485            .and_then(DocumentRef::parse)
2486            .or_else(|| {
2487                if header.source == TransactionSource::Manual {
2488                    Some(DocumentRef::Manual)
2489                } else {
2490                    None
2491                }
2492            });
2493
2494        // Generate line items
2495        let mut entry = JournalEntry::new(header);
2496
2497        // Generate amount - use fraud pattern if this is a fraudulent transaction.
2498        // Non-fraud path prefers the v3.4.0 advanced sampler when configured; fraud
2499        // patterns always use the legacy sampler because they target specific
2500        // thresholds (round numbers, just-under-approval amounts) that are
2501        // orthogonal to mixture models.
2502        let base_amount = if let Some(ft) = fraud_type {
2503            let pattern = self.fraud_type_to_amount_pattern(ft);
2504            self.amount_sampler.sample_fraud(pattern)
2505        } else if let Some(ref mut adv) = self.advanced_amount_sampler {
2506            adv.sample_decimal()
2507        } else {
2508            self.amount_sampler.sample()
2509        };
2510        // v3.5.3+: if a conditional-amount override is configured and
2511        // the JE is non-fraud, re-sample the amount from the conditional
2512        // distribution using the computed context. Fraud entries bypass
2513        // this path to preserve fraud-pattern semantics (as with the
2514        // advanced sampler cascade above).
2515        let base_amount = if fraud_type.is_none() {
2516            // Compute input context BEFORE taking &mut on the sampler
2517            // to avoid borrow-checker conflict with the immutable
2518            // `conditional_input_value` call.
2519            let input = self.conditional_input_value(posting_date);
2520            if let Some(ref mut cond) = self.conditional_amount_override {
2521                cond.sample_decimal(input)
2522            } else {
2523                base_amount
2524            }
2525        } else {
2526            base_amount
2527        };
2528
2529        // SP4.3 — when priors are loaded, try to replace the base_amount with
2530        // a draw from the per-source log-normal conditional.  This step only
2531        // fires for non-fraud JEs (fraud entries must preserve fraud-pattern
2532        // semantics).  We use the source-marginal (gl_prefix = "") as the
2533        // initial lookup; per-class refinement requires knowing the GL account
2534        // which is sampled after the amount in some paths, so we defer that
2535        // to a follow-up sprint.  Balance preservation is maintained because
2536        // the splitter below uses `total_amount` unchanged.
2537        //
2538        // W7.M — autocorr mitigation: ~25 % (was ~30 %) of priors-enabled draws bypass the
2539        // per-source conditional and draw from the global marginal sampler.
2540        // This loosens the per-source amount-sequence correlation that SP4.3's
2541        // conditional was over-tightening (v5.23 baseline: Source P1 Autocorr
2542        // +750 %, TP P1 Autocorr +101 %).  Proven pattern from SP3.12 W2
2543        // TP-clustering mitigation.
2544        //
2545        // Split-borrow: `loaded_priors` and `rng` are distinct struct fields so
2546        // the compiler allows simultaneous mutable borrows.
2547        // SP5.3 — intermediate tune from 0.20 to 0.25 between v5.24 (0.30 →
2548        // autocorr 1.53, over-corrected) and v5.25 (0.20 → autocorr 3.74,
2549        // under-corrected). Targets the trade-off sweet spot.
2550        const PRIORS_AMOUNT_BYPASS_SHARE: f64 = 0.25;
2551        let base_amount = if fraud_type.is_none() {
2552            if let Some(src) = entry.header.sap_source_code.as_deref() {
2553                let src_owned = src.to_string();
2554                // Gate: skip the conditional ~25 % of the time to loosen
2555                // per-source amount sequence correlation without overshooting.
2556                let use_conditional = self.loaded_priors.is_some()
2557                    && self.rng.random_range(0.0..1.0) >= PRIORS_AMOUNT_BYPASS_SHARE;
2558                if use_conditional {
2559                    let priors_ref = &mut self.loaded_priors;
2560                    let rng_ref = &mut self.rng;
2561                    if let Some(priors) = priors_ref {
2562                        priors
2563                            .sample_amount_for_source(&src_owned, "", rng_ref)
2564                            .and_then(|v| {
2565                                if v.is_finite() && v > 0.0 {
2566                                    Decimal::from_f64_retain(v)
2567                                } else {
2568                                    None
2569                                }
2570                            })
2571                            .unwrap_or(base_amount)
2572                    } else {
2573                        base_amount
2574                    }
2575                } else {
2576                    base_amount
2577                }
2578            } else {
2579                base_amount
2580            }
2581        } else {
2582            base_amount
2583        };
2584
2585        // v4.1.6+: if a copula is configured AND an advanced amount
2586        // sampler with a ppf is available, use true rank-preserving
2587        // inverse-CDF sampling — amount is drawn DIRECTLY from the
2588        // sampler's quantile at `u`, replacing (not nudging) the
2589        // independently-drawn base_amount. This makes empirical
2590        // Kendall-τ match the copula's theoretical τ.
2591        //
2592        // Fallback for copula-without-advanced-sampler: keep the
2593        // v4.1.0 log-scale multiplier nudge (observable correlation,
2594        // diluted magnitude).
2595        let base_amount = if fraud_type.is_none() {
2596            if let Some((u, _v)) = copula_uv {
2597                if let Some(ref adv) = self.advanced_amount_sampler {
2598                    adv.ppf_decimal(u)
2599                } else {
2600                    let log_mult = 4.0 * (u - 0.5);
2601                    let adjusted = base_amount.to_f64().unwrap_or(1.0) * log_mult.exp();
2602                    Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2603                }
2604            } else {
2605                base_amount
2606            }
2607        } else {
2608            base_amount
2609        };
2610
2611        // Apply temporal drift if configured
2612        let drift_adjusted_amount = {
2613            let drift = self.get_drift_adjustments(posting_date);
2614            if drift.amount_mean_multiplier != 1.0 {
2615                // Apply drift multiplier (includes seasonal factor if enabled)
2616                let multiplier = drift.amount_mean_multiplier * drift.seasonal_factor;
2617                let adjusted = base_amount.to_f64().unwrap_or(1.0) * multiplier;
2618                Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2619            } else {
2620                base_amount
2621            }
2622        };
2623
2624        // Apply human variation to amounts for non-automated transactions
2625        let total_amount = if is_automated {
2626            drift_adjusted_amount // Automated systems use exact amounts
2627        } else {
2628            self.apply_human_variation(drift_adjusted_amount)
2629        };
2630
2631        // SP3 T13 — derive the document-type key once for use in all
2632        // fanout-sampler lookups below.  Computed unconditionally so it is
2633        // available for both debit and credit loops without re-deriving.
2634        let doc_type_for_fanout = Self::document_type_for_process(business_process).to_string();
2635
2636        // SP3.3 — resolve cross-entity motif neighbors for this fanout entity.
2637        // We capture an owned Vec<String> here so that the shared borrow on
2638        // `self.loaded_priors` is released before the subsequent `&mut` borrow
2639        // on `fanout_samplers`.
2640        let (gl_neighbor_vec, gl_share_prob): (Vec<String>, f64) =
2641            if let Some(priors) = &self.loaded_priors {
2642                if let Some(motifs) = &priors.cross_entity_motifs {
2643                    (
2644                        motifs.neighbors(&doc_type_for_fanout).to_vec(),
2645                        motifs.should_share(&doc_type_for_fanout),
2646                    )
2647                } else {
2648                    (Vec::new(), 0.0)
2649                }
2650            } else {
2651                (Vec::new(), 0.0)
2652            };
2653
2654        // SOTA-1: recurring/standard-journal templates. On the no-priors path,
2655        // reuse a cached account archetype for this (company, doc-type, counts)
2656        // with high probability so standard postings recur (and a hot account
2657        // subset dominates). Reuse overrides only the line account (set after
2658        // text/RNG below), so amounts/counts/dates stay byte-identical; fresh
2659        // archetypes are captured + cached after the lines are built.
2660        let reuse_archetype = self.pick_recurring_archetype(
2661            &entry.header.company_code,
2662            &doc_type_for_fanout,
2663            line_spec.debit_count,
2664            line_spec.credit_count,
2665        );
2666        let mut fresh_debit_accts: Vec<String> = Vec::new();
2667        let mut fresh_credit_accts: Vec<String> = Vec::new();
2668        // SOTA-8: hoisted so both the debit and credit loops + their SOTA-1 archetype
2669        // override blocks share the same flag.
2670        let sota8_active = self.config.source_conditional_account_pair.enabled;
2671
2672        // Generate debit lines
2673        let debit_amounts = self
2674            .amount_sampler
2675            .sample_summing_to(line_spec.debit_count, total_amount);
2676        for (i, amount) in debit_amounts.into_iter().enumerate() {
2677            // SP3 T13 — GL Account fanout: when priors are loaded, pick the
2678            // account from the BipartiteFanoutSampler keyed "GLAccount" for
2679            // this Source.  Split-borrows let us hold &mut loaded_priors and
2680            // &mut rng at the same time (distinct struct fields).
2681            // SP3 T13 — GL Account fanout for debit lines.
2682            // Pre-compute the fallback before the split-borrow scope so that
2683            // `select_debit_account` (which takes `&mut self`) does not conflict
2684            // with the concurrent borrow of `loaded_priors` and `rng`.
2685            let debit_fallback = self.select_debit_account().account_number.clone();
2686            // SOTA-8: when enabled, the per-source Dirichlet pool (which `select_debit_account`
2687            // has already consulted via try_cond_pick_account_number) takes precedence over the
2688            // SP3/SP4 priors-driven path so the user's explicit source-conditional knob actually
2689            // governs the source-conditional account distribution. `sota8_active` is hoisted
2690            // above this scope so the credit loop can see it too.
2691            let account_number = if sota8_active {
2692                debit_fallback
2693            } else {
2694                let priors_opt = &mut self.loaded_priors;
2695                let rng_ref = &mut self.rng;
2696                if let Some(priors) = priors_opt {
2697                    // SP4.6 — role-aware GL account selection: try (source, "DR")
2698                    // conditional first, then fall back to SP3.7 source-marginal,
2699                    // then to the fanout sampler, then to the default debit account.
2700                    let sp46_gl = entry
2701                        .header
2702                        .sap_source_code
2703                        .as_deref()
2704                        .and_then(|code| priors.sample_gl_for_source_role(code, "DR", rng_ref));
2705                    if let Some(gl) = sp46_gl {
2706                        gl
2707                    } else {
2708                        // SP3.7 — try per-source marginal GL account.
2709                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2710                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2711                        });
2712                        if let Some(gl) = sp37_gl {
2713                            gl
2714                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2715                            // SP3.3: prefer neighbor-used buckets when motifs are available.
2716                            sampler.pick_for_with_neighbors(
2717                                &doc_type_for_fanout,
2718                                &gl_neighbor_vec,
2719                                gl_share_prob,
2720                                rng_ref,
2721                            )
2722                        } else {
2723                            debit_fallback
2724                        }
2725                    }
2726                } else {
2727                    debit_fallback
2728                }
2729            };
2730            let mut line = JournalEntryLine::debit(
2731                entry.header.document_id,
2732                (i + 1) as u32,
2733                account_number.clone(),
2734                amount,
2735            );
2736
2737            // Generate line text if enabled.
2738            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2739            if self.template_config.descriptions.generate_line_text {
2740                let src = entry.header.sap_source_code.as_deref();
2741                let priors_line = if let Some(s) = src {
2742                    if let Some(p) = self.loaded_priors.as_ref() {
2743                        let account_class = p
2744                            .coa_semantic
2745                            .as_ref()
2746                            .and_then(|c| c.accounts.get(&account_number))
2747                            .and_then(|a| a.account_class.as_deref())
2748                            .unwrap_or(
2749                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2750                            );
2751                        // SP6 text_taxonomy cascade
2752                        p.sample_line_template(
2753                            s,
2754                            account_class,
2755                            &mut self.md_resolver,
2756                            &mut self.rng,
2757                        )
2758                    } else {
2759                        None
2760                    }
2761                } else {
2762                    None
2763                };
2764                line.line_text = Some(priors_line.unwrap_or_else(|| {
2765                    self.description_generator.generate_line_text(
2766                        &account_number,
2767                        &context,
2768                        &mut self.rng,
2769                    )
2770                }));
2771            }
2772
2773            // SOTA-1: override the line's account with the reused archetype's
2774            // (RNG + text above are unchanged -> amounts/counts/dates stay
2775            // byte-identical); else capture the fresh account for caching.
2776            // SOTA-1 and SOTA-8 compose: SOTA-8 picks the FIRST archetype's accounts
2777            // from its per-source pool, then SOTA-1 caches + reuses them. Disabling
2778            // SOTA-1 under SOTA-8 actually *worsens* edge concentration — empirically
2779            // measured in Round 0 v4: edges/je 0.35 -> 0.82 when SOTA-1 was bypassed.
2780            if let Some((ref d, _)) = reuse_archetype {
2781                if let Some(a) = d.get(i) {
2782                    line.gl_account = a.clone();
2783                }
2784            } else if self.loaded_priors.is_none() {
2785                fresh_debit_accts.push(line.gl_account.clone());
2786            }
2787            entry.add_line(line);
2788        }
2789
2790        // Generate credit lines - use the SAME amounts to ensure balance
2791        let credit_amounts = self
2792            .amount_sampler
2793            .sample_summing_to(line_spec.credit_count, total_amount);
2794        for (i, amount) in credit_amounts.into_iter().enumerate() {
2795            // SP3 T13 — GL Account fanout for credit lines.
2796            let credit_fallback = self.select_credit_account().account_number.clone();
2797            // SOTA-8 precedence (mirror of the debit-side block above).
2798            let account_number = if sota8_active {
2799                credit_fallback
2800            } else {
2801                let priors_opt = &mut self.loaded_priors;
2802                let rng_ref = &mut self.rng;
2803                if let Some(priors) = priors_opt {
2804                    let sp46_gl = entry
2805                        .header
2806                        .sap_source_code
2807                        .as_deref()
2808                        .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
2809                    if let Some(gl) = sp46_gl {
2810                        gl
2811                    } else {
2812                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2813                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2814                        });
2815                        if let Some(gl) = sp37_gl {
2816                            gl
2817                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2818                            sampler.pick_for_with_neighbors(
2819                                &doc_type_for_fanout,
2820                                &gl_neighbor_vec,
2821                                gl_share_prob,
2822                                rng_ref,
2823                            )
2824                        } else {
2825                            credit_fallback
2826                        }
2827                    }
2828                } else {
2829                    credit_fallback
2830                }
2831            };
2832            let mut line = JournalEntryLine::credit(
2833                entry.header.document_id,
2834                (line_spec.debit_count + i + 1) as u32,
2835                account_number.clone(),
2836                amount,
2837            );
2838
2839            // Generate line text if enabled.
2840            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2841            if self.template_config.descriptions.generate_line_text {
2842                let src = entry.header.sap_source_code.as_deref();
2843                let priors_line = if let Some(s) = src {
2844                    if let Some(p) = self.loaded_priors.as_ref() {
2845                        let account_class = p
2846                            .coa_semantic
2847                            .as_ref()
2848                            .and_then(|c| c.accounts.get(&account_number))
2849                            .and_then(|a| a.account_class.as_deref())
2850                            .unwrap_or(
2851                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2852                            );
2853                        // SP6 text_taxonomy cascade
2854                        p.sample_line_template(
2855                            s,
2856                            account_class,
2857                            &mut self.md_resolver,
2858                            &mut self.rng,
2859                        )
2860                    } else {
2861                        None
2862                    }
2863                } else {
2864                    None
2865                };
2866                line.line_text = Some(priors_line.unwrap_or_else(|| {
2867                    self.description_generator.generate_line_text(
2868                        &account_number,
2869                        &context,
2870                        &mut self.rng,
2871                    )
2872                }));
2873            }
2874
2875            // SOTA-1: override the credit line's account with the reused
2876            // archetype's; else capture the fresh account for caching.
2877            // (Same compose-with-SOTA-8 rationale as the debit block.)
2878            if let Some((_, ref c)) = reuse_archetype {
2879                if let Some(a) = c.get(i) {
2880                    line.gl_account = a.clone();
2881                }
2882            } else if self.loaded_priors.is_none() {
2883                fresh_credit_accts.push(line.gl_account.clone());
2884            }
2885            entry.add_line(line);
2886        }
2887
2888        // SOTA-1: cache the freshly-selected archetype for future reuse so
2889        // standard postings recur (skipped when this JE reused one).
2890        if reuse_archetype.is_none() {
2891            self.cache_recurring_archetype(
2892                &entry.header.company_code,
2893                &doc_type_for_fanout,
2894                std::mem::take(&mut fresh_debit_accts),
2895                std::mem::take(&mut fresh_credit_accts),
2896            );
2897        }
2898
2899        // Enrich line items with account descriptions, cost centers, etc.
2900        self.enrich_line_items(&mut entry);
2901
2902        // Apply persona-based errors if enabled and it's a human user
2903        if self.persona_errors_enabled && !is_automated {
2904            self.maybe_inject_persona_error(&mut entry);
2905        }
2906
2907        // Apply approval workflow if enabled and amount exceeds threshold
2908        if self.approval_enabled {
2909            self.maybe_apply_approval_workflow(&mut entry, posting_date);
2910        }
2911
2912        // Populate approved_by / approval_date from the approval workflow
2913        self.populate_approval_fields(&mut entry, posting_date);
2914
2915        // Maybe start a batch of similar entries for realism
2916        self.maybe_start_batch(&entry);
2917
2918        // SP3.4 + SP3.5b — observe each line through the velocity calibrator and
2919        // apply each returned CalibrationStep to the relevant tunable parameter.
2920        if self.velocity_calibrator.is_some() {
2921            let mut pending: Vec<crate::velocity_calibrator::CalibrationStep> = Vec::new();
2922            for line in &entry.lines {
2923                if let Some(step) = self
2924                    .velocity_calibrator
2925                    .as_mut()
2926                    .and_then(|cal| cal.observe_line(line))
2927                {
2928                    pending.push(step);
2929                }
2930            }
2931            for step in pending {
2932                self.apply_calibration_step(&step);
2933            }
2934        }
2935
2936        // SOTA-4: with a small probability, post this JE in a foreign document
2937        // currency (company-ledger amounts unchanged; adds transaction_amount).
2938        self.maybe_apply_foreign_currency(&mut entry);
2939
2940        // SOTA-5: remember this JE so a later reversal can offset it.
2941        self.record_for_reversal(&entry);
2942
2943        entry
2944    }
2945
2946    /// SP3.5b — Apply a CalibrationStep from the velocity calibrator to the
2947    /// affected tunable parameter on this generator.
2948    ///
2949    /// Only `amounts.lognormal_sigma` (R6) and `amounts.round_dollar_share`
2950    /// (R9) are plumbed in v5.14. R7/R8/R10 parameters (off_hours_share,
2951    /// post_close_share, backdating_share) are observed by the calibrator
2952    /// but not yet consumed on the generator side — see v5.15 for plumbing.
2953    fn apply_calibration_step(&mut self, step: &crate::velocity_calibrator::CalibrationStep) {
2954        match step.parameter.as_str() {
2955            "amounts.lognormal_sigma" => {
2956                self.amount_sampler.set_lognormal_sigma(step.new_value);
2957            }
2958            "amounts.round_dollar_share" => {
2959                self.amount_sampler
2960                    .set_round_number_probability(step.new_value);
2961            }
2962            _ => {
2963                // Unknown / not-yet-plumbed parameter — calibrator records it
2964                // in `adjustments` for inspection; no mutation here.
2965            }
2966        }
2967    }
2968
2969    /// Enable or disable persona-based error injection.
2970    ///
2971    /// When enabled, entries created by human personas have a chance
2972    /// to contain realistic human errors based on their experience level.
2973    pub fn with_persona_errors(mut self, enabled: bool) -> Self {
2974        self.persona_errors_enabled = enabled;
2975        self
2976    }
2977
2978    /// Set fraud configuration for fraud injection.
2979    ///
2980    /// When fraud is enabled in the config, transactions have a chance
2981    /// to be marked as fraudulent based on the configured fraud rate.
2982    pub fn with_fraud_config(mut self, config: FraudConfig) -> Self {
2983        self.fraud_config = config;
2984        self
2985    }
2986
2987    /// Check if persona errors are enabled.
2988    pub fn persona_errors_enabled(&self) -> bool {
2989        self.persona_errors_enabled
2990    }
2991
2992    /// Enable or disable batch processing behavior.
2993    ///
2994    /// When enabled (default), the generator will occasionally produce batches
2995    /// of similar entries, simulating how humans batch similar work together.
2996    pub fn with_batching(mut self, enabled: bool) -> Self {
2997        if !enabled {
2998            self.batch_state = None;
2999        }
3000        self
3001    }
3002
3003    /// Check if batch processing is enabled.
3004    pub fn batching_enabled(&self) -> bool {
3005        // Batching is implicitly enabled when not explicitly disabled
3006        true
3007    }
3008
3009    /// Maybe start a batch based on the current entry.
3010    ///
3011    /// Humans often batch similar work: processing invoices from one vendor,
3012    /// entering expense reports for a trip, reconciling similar items.
3013    fn maybe_start_batch(&mut self, entry: &JournalEntry) {
3014        // Only start batch for non-automated, non-fraud entries
3015        if entry.header.source == TransactionSource::Automated || entry.header.is_fraud {
3016            return;
3017        }
3018
3019        // 15% chance to start a batch (most work is not batched)
3020        if self.rng.random::<f64>() > 0.15 {
3021            return;
3022        }
3023
3024        // Extract key attributes for batching
3025        let base_account = entry
3026            .lines
3027            .first()
3028            .map(|l| l.gl_account.clone())
3029            .unwrap_or_default();
3030
3031        let base_amount = entry.total_debit();
3032
3033        self.batch_state = Some(BatchState {
3034            base_account_number: base_account,
3035            base_amount,
3036            base_business_process: entry.header.business_process,
3037            base_posting_date: entry.header.posting_date,
3038            remaining: self.rng.random_range(2..7), // 2-6 more similar entries
3039        });
3040    }
3041
3042    /// Generate an entry that's part of the current batch.
3043    ///
3044    /// Batched entries have:
3045    /// - Same or very similar business process
3046    /// - Same posting date (batched work done together)
3047    /// - Similar amounts (within ±15%)
3048    /// - Same debit account (processing similar items)
3049    fn generate_batched_entry(&mut self) -> JournalEntry {
3050        use rust_decimal::Decimal;
3051
3052        // Decrement batch counter
3053        if let Some(ref mut state) = self.batch_state {
3054            state.remaining = state.remaining.saturating_sub(1);
3055        }
3056
3057        let Some(batch) = self.batch_state.clone() else {
3058            // This is a programming error - batch_state should be set before calling this method.
3059            // Clear state and fall back to generating a standard entry instead of panicking.
3060            tracing::warn!(
3061                "generate_batched_entry called without batch_state; generating standard entry"
3062            );
3063            self.batch_state = None;
3064            return self.generate();
3065        };
3066
3067        // Use the batch's posting date (work done on same day)
3068        let posting_date = batch.base_posting_date;
3069
3070        self.count += 1;
3071        let document_id = self.generate_deterministic_uuid();
3072
3073        // Select same company (batched work is usually same company)
3074        let company_code = self.company_selector.select(&mut self.rng).to_string();
3075
3076        // Use simplified line spec for batched entries (usually 2-line)
3077        let _line_spec = LineItemSpec {
3078            total_count: 2,
3079            debit_count: 1,
3080            credit_count: 1,
3081            split_type: DebitCreditSplit::Equal,
3082        };
3083
3084        // Batched entries are always manual
3085        let source = TransactionSource::Manual;
3086
3087        // SP3.6 — sample SAP source code for the batch entry when priors loaded.
3088        let sap_source_code: Option<String> = self.sample_sap_source_code();
3089        // SOTA-8: stash the batch JE's source for the per-source pool consult.
3090        self.current_je_source = sap_source_code.clone();
3091
3092        // Use the batch's business process
3093        let business_process = batch.base_business_process.unwrap_or(BusinessProcess::R2R);
3094
3095        // Sample time
3096        let time = self.temporal_sampler.sample_time(true);
3097        let created_at = posting_date.and_time(time).and_utc();
3098
3099        // Same user for batched work
3100        let (created_by, user_persona) = self.select_user(false);
3101
3102        // Create header
3103        let mut header =
3104            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
3105        header.created_at = created_at;
3106        header.source = source;
3107        header.sap_source_code = sap_source_code;
3108
3109        // SP3.9 — JE-level trading partner for batched entries (same pattern as
3110        // the primary generate() path).
3111        // SP3.12 — TP motif biasing also applies to batched entries.
3112        {
3113            let code_opt = header.sap_source_code.clone();
3114            if let Some(ref code) = code_opt {
3115                let rng_ref = &mut self.rng;
3116                let tp_neighbors: Vec<String> = if let Some(ref priors) = self.loaded_priors {
3117                    if let Some(ref motifs) = priors.tp_motif_sampler {
3118                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3119                            motifs.neighbors(last_tp).to_vec()
3120                        } else {
3121                            Vec::new()
3122                        }
3123                    } else {
3124                        Vec::new()
3125                    }
3126                } else {
3127                    Vec::new()
3128                };
3129                let tp_share_prob: f64 = if let Some(ref priors) = self.loaded_priors {
3130                    if let Some(ref motifs) = priors.tp_motif_sampler {
3131                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3132                            motifs.should_share(last_tp)
3133                        } else {
3134                            0.0
3135                        }
3136                    } else {
3137                        0.0
3138                    }
3139                } else {
3140                    0.0
3141                };
3142                if let Some(ref mut priors) = self.loaded_priors {
3143                    use datasynth_core::distributions::behavioral_priors::CategoricalDistribution;
3144                    let tp = if !tp_neighbors.is_empty()
3145                        && tp_share_prob > 0.0
3146                        && rng_ref.random_range(0.0..1.0) < tp_share_prob
3147                    {
3148                        let filtered: std::collections::BTreeMap<String, f64> = priors
3149                            .per_source_attribute
3150                            .as_ref()
3151                            .and_then(|psa| psa.conditional(code, "trading_partner"))
3152                            .map(|dist| {
3153                                dist.probabilities
3154                                    .iter()
3155                                    .filter(|(v, _)| tp_neighbors.contains(v))
3156                                    .map(|(v, p)| (v.clone(), *p))
3157                                    .collect()
3158                            })
3159                            .unwrap_or_default();
3160                        if filtered.is_empty() {
3161                            priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3162                        } else {
3163                            let neighbour_dist = CategoricalDistribution {
3164                                probabilities: filtered,
3165                                n: 0,
3166                            };
3167                            neighbour_dist.sample(rng_ref).or_else(|| {
3168                                priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3169                            })
3170                        }
3171                    } else {
3172                        priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3173                    };
3174                    header.trading_partner = tp;
3175                }
3176                if let Some(ref tp) = header.trading_partner {
3177                    self.last_tp_by_source.insert(code.clone(), tp.clone());
3178                }
3179            }
3180        }
3181
3182        // SP4.5 — user-persona prior for batched entries (same pattern as primary path).
3183        let (created_by, created_at) = {
3184            let sap_code_for_user = header.sap_source_code.clone();
3185            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
3186                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
3187                    let new_created_at = if let Some((hour, _)) =
3188                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
3189                    {
3190                        let base = header.created_at;
3191                        base.date_naive()
3192                            .and_hms_opt(hour, 0, 0)
3193                            .map(|naive| naive.and_utc())
3194                            .unwrap_or(base)
3195                    } else {
3196                        header.created_at
3197                    };
3198                    (uid, new_created_at)
3199                } else {
3200                    (created_by, header.created_at)
3201                }
3202            } else {
3203                (created_by, header.created_at)
3204            }
3205        };
3206
3207        header.created_by = created_by;
3208        header.created_at = created_at;
3209        header.user_persona = user_persona;
3210        header.business_process = Some(business_process);
3211        header.document_type = Self::document_type_for_process(business_process).to_string();
3212
3213        // Batched manual entries have Manual source document
3214        header.source_document = Some(DocumentRef::Manual);
3215
3216        // ISA 240 audit flags for batched entries (always manual)
3217        header.is_manual = true;
3218        header.source_system = if self.rng.random::<f64>() < 0.70 {
3219            "manual".to_string()
3220        } else {
3221            "spreadsheet".to_string()
3222        };
3223        header.is_post_close = posting_date.month() == self.end_date.month()
3224            && posting_date.year() == self.end_date.year()
3225            && posting_date.day() > 25;
3226        header.created_date =
3227            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second());
3228
3229        // Generate similar amount (within ±15% of base)
3230        let variation = self.rng.random_range(-0.15..0.15);
3231        let varied_amount =
3232            batch.base_amount * (Decimal::ONE + Decimal::try_from(variation).unwrap_or_default());
3233        let total_amount = varied_amount.round_dp(2).max(Decimal::from(1));
3234
3235        // Create the entry
3236        let mut entry = JournalEntry::new(header);
3237
3238        // Use same debit account as batch base
3239        let debit_line = JournalEntryLine::debit(
3240            entry.header.document_id,
3241            1,
3242            batch.base_account_number.clone(),
3243            total_amount,
3244        );
3245        entry.add_line(debit_line);
3246
3247        // SP3.12 W3 — Select a credit account for the batched entry.
3248        // When priors are loaded and this entry has a SAP source code, use the
3249        // per-source GL-account conditional (same as the primary generate() path).
3250        // This prevents batched entries from adding legacy-CoA accounts to the
3251        // Source-Source projection graph, which was inflating graph density and
3252        // driving the P3 ClusteringGap metric above 30× DR.
3253        let credit_fallback = self.select_credit_account().account_number.clone();
3254        let credit_account = {
3255            let priors_opt = &mut self.loaded_priors;
3256            let rng_ref = &mut self.rng;
3257            if let Some(priors) = priors_opt {
3258                // SP4.6 — role-aware GL for the batched-entry credit line.
3259                // Try (source, "CR") first, then source-marginal, then fallback.
3260                let sp46_gl = entry
3261                    .header
3262                    .sap_source_code
3263                    .as_deref()
3264                    .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
3265                if let Some(gl) = sp46_gl {
3266                    gl
3267                } else {
3268                    let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
3269                        priors.sample_attribute_for_source(code, "gl_account", rng_ref)
3270                    });
3271                    sp37_gl.unwrap_or(credit_fallback)
3272                }
3273            } else {
3274                credit_fallback
3275            }
3276        };
3277        let credit_line =
3278            JournalEntryLine::credit(entry.header.document_id, 2, credit_account, total_amount);
3279        entry.add_line(credit_line);
3280
3281        // Enrich line items with account descriptions, cost centers, etc.
3282        self.enrich_line_items(&mut entry);
3283
3284        // Apply persona-based errors if enabled
3285        if self.persona_errors_enabled {
3286            self.maybe_inject_persona_error(&mut entry);
3287        }
3288
3289        // Apply approval workflow if enabled
3290        if self.approval_enabled {
3291            self.maybe_apply_approval_workflow(&mut entry, posting_date);
3292        }
3293
3294        // Populate approved_by / approval_date from the approval workflow
3295        self.populate_approval_fields(&mut entry, posting_date);
3296
3297        // Clear batch state if no more entries remaining
3298        if batch.remaining <= 1 {
3299            self.batch_state = None;
3300        }
3301
3302        entry
3303    }
3304
3305    /// Maybe inject a persona-appropriate error based on the persona's error rate.
3306    fn maybe_inject_persona_error(&mut self, entry: &mut JournalEntry) {
3307        // Parse persona from the entry header
3308        let persona_str = &entry.header.user_persona;
3309        let persona = match persona_str.to_lowercase().as_str() {
3310            s if s.contains("junior") => UserPersona::JuniorAccountant,
3311            s if s.contains("senior") => UserPersona::SeniorAccountant,
3312            s if s.contains("controller") => UserPersona::Controller,
3313            s if s.contains("manager") => UserPersona::Manager,
3314            s if s.contains("executive") => UserPersona::Executive,
3315            _ => return, // Don't inject errors for unknown personas
3316        };
3317
3318        // Get base error rate from persona
3319        let base_error_rate = persona.error_rate();
3320
3321        // Apply stress factors based on posting date
3322        let adjusted_rate = self.apply_stress_factors(base_error_rate, entry.header.posting_date);
3323
3324        // Check if error should occur based on adjusted rate
3325        if self.rng.random::<f64>() >= adjusted_rate {
3326            return; // No error this time
3327        }
3328
3329        // Select and inject persona-appropriate error
3330        self.inject_human_error(entry, persona);
3331    }
3332
3333    /// Apply contextual stress factors to the base error rate.
3334    ///
3335    /// Stress factors increase error likelihood during:
3336    /// - Month-end (day >= 28): 1.5x more errors due to deadline pressure
3337    /// - Quarter-end (Mar, Jun, Sep, Dec): additional 25% boost
3338    /// - Year-end (December 28-31): 2.0x more errors due to audit pressure
3339    /// - Monday morning (catch-up work): 20% more errors
3340    /// - Friday afternoon (rushing to leave): 30% more errors
3341    fn apply_stress_factors(&self, base_rate: f64, posting_date: chrono::NaiveDate) -> f64 {
3342        use chrono::Datelike;
3343
3344        let mut rate = base_rate;
3345        let day = posting_date.day();
3346        let month = posting_date.month();
3347
3348        // Year-end stress (December 28-31): double the error rate
3349        if month == 12 && day >= 28 {
3350            rate *= 2.0;
3351            return rate.min(0.5); // Cap at 50% to keep it realistic
3352        }
3353
3354        // Quarter-end stress (last days of Mar, Jun, Sep, Dec)
3355        if matches!(month, 3 | 6 | 9 | 12) && day >= 28 {
3356            rate *= 1.75; // 75% more errors at quarter end
3357            return rate.min(0.4);
3358        }
3359
3360        // Month-end stress (last 3 days of month)
3361        if day >= 28 {
3362            rate *= 1.5; // 50% more errors at month end
3363        }
3364
3365        // Day-of-week stress effects
3366        let weekday = posting_date.weekday();
3367        match weekday {
3368            chrono::Weekday::Mon => {
3369                // Monday: catching up, often rushed
3370                rate *= 1.2;
3371            }
3372            chrono::Weekday::Fri => {
3373                // Friday: rushing to finish before weekend
3374                rate *= 1.3;
3375            }
3376            _ => {}
3377        }
3378
3379        // Cap at 40% to keep it realistic
3380        rate.min(0.4)
3381    }
3382
3383    /// Apply human-like variation to an amount.
3384    ///
3385    /// Humans don't enter perfectly calculated amounts - they:
3386    /// - Round amounts differently
3387    /// - Estimate instead of calculating exactly
3388    /// - Make small input variations
3389    ///
3390    /// This applies small variations (typically ±2%) to make amounts more realistic.
3391    fn apply_human_variation(&mut self, amount: rust_decimal::Decimal) -> rust_decimal::Decimal {
3392        use rust_decimal::Decimal;
3393
3394        // Automated transactions or very small amounts don't get variation
3395        if amount < Decimal::from(10) {
3396            return amount;
3397        }
3398
3399        // 70% chance of human variation being applied
3400        if self.rng.random::<f64>() > 0.70 {
3401            return amount;
3402        }
3403
3404        // Decide which type of human variation to apply
3405        let variation_type: u8 = self.rng.random_range(0..4);
3406
3407        match variation_type {
3408            0 => {
3409                // ±2% variation (common for estimated amounts)
3410                let variation_pct = self.rng.random_range(-0.02..0.02);
3411                let variation = amount * Decimal::try_from(variation_pct).unwrap_or_default();
3412                (amount + variation).round_dp(2)
3413            }
3414            1 => {
3415                // Round to nearest $10
3416                let ten = Decimal::from(10);
3417                (amount / ten).round() * ten
3418            }
3419            2 => {
3420                // Round to nearest $100 (for larger amounts)
3421                if amount >= Decimal::from(500) {
3422                    let hundred = Decimal::from(100);
3423                    (amount / hundred).round() * hundred
3424                } else {
3425                    amount
3426                }
3427            }
3428            3 => {
3429                // Slight under/over payment (±$0.01 to ±$1.00)
3430                let cents = Decimal::new(self.rng.random_range(-100..100), 2);
3431                (amount + cents).max(Decimal::ZERO).round_dp(2)
3432            }
3433            _ => amount,
3434        }
3435    }
3436
3437    /// Rebalance an entry after a one-sided amount modification.
3438    ///
3439    /// When an error modifies one line's amount, this finds a line on the opposite
3440    /// side (credit if modified was debit, or vice versa) and adjusts it by the
3441    /// same impact to maintain balance.
3442    fn rebalance_entry(entry: &mut JournalEntry, modified_was_debit: bool, impact: Decimal) {
3443        // Find a line on the opposite side to adjust
3444        let balancing_idx = entry.lines.iter().position(|l| {
3445            if modified_was_debit {
3446                l.credit_amount > Decimal::ZERO
3447            } else {
3448                l.debit_amount > Decimal::ZERO
3449            }
3450        });
3451
3452        if let Some(idx) = balancing_idx {
3453            if modified_was_debit {
3454                entry.lines[idx].credit_amount += impact;
3455            } else {
3456                entry.lines[idx].debit_amount += impact;
3457            }
3458        }
3459    }
3460
3461    /// Inject a human-like error based on the persona.
3462    ///
3463    /// All error types maintain balance - amount modifications are applied to both sides.
3464    /// Entries are marked with [HUMAN_ERROR:*] tags in header_text for ML detection.
3465    fn inject_human_error(&mut self, entry: &mut JournalEntry, persona: UserPersona) {
3466        use rust_decimal::Decimal;
3467
3468        // Different personas make different types of errors
3469        let error_type: u8 = match persona {
3470            UserPersona::JuniorAccountant => {
3471                // Junior accountants make more varied errors
3472                self.rng.random_range(0..5)
3473            }
3474            UserPersona::SeniorAccountant => {
3475                // Senior accountants mainly make transposition errors
3476                self.rng.random_range(0..3)
3477            }
3478            UserPersona::Controller | UserPersona::Manager => {
3479                // Controllers/managers mainly make rounding or cutoff errors
3480                self.rng.random_range(3..5)
3481            }
3482            _ => return,
3483        };
3484
3485        match error_type {
3486            0 => {
3487                // Transposed digits in an amount
3488                if let Some(line) = entry.lines.get_mut(0) {
3489                    let is_debit = line.debit_amount > Decimal::ZERO;
3490                    let original_amount = if is_debit {
3491                        line.debit_amount
3492                    } else {
3493                        line.credit_amount
3494                    };
3495
3496                    // Simple digit swap in the string representation
3497                    let s = original_amount.to_string();
3498                    if s.len() >= 2 {
3499                        let chars: Vec<char> = s.chars().collect();
3500                        let pos = self.rng.random_range(0..chars.len().saturating_sub(1));
3501                        if chars[pos].is_ascii_digit()
3502                            && chars.get(pos + 1).is_some_and(char::is_ascii_digit)
3503                        {
3504                            let mut new_chars = chars;
3505                            new_chars.swap(pos, pos + 1);
3506                            if let Ok(new_amount) =
3507                                new_chars.into_iter().collect::<String>().parse::<Decimal>()
3508                            {
3509                                let impact = new_amount - original_amount;
3510
3511                                // Apply to the modified line
3512                                if is_debit {
3513                                    entry.lines[0].debit_amount = new_amount;
3514                                } else {
3515                                    entry.lines[0].credit_amount = new_amount;
3516                                }
3517
3518                                // Rebalance the entry
3519                                Self::rebalance_entry(entry, is_debit, impact);
3520
3521                                entry.header.header_text = Some(
3522                                    entry.header.header_text.clone().unwrap_or_default()
3523                                        + " [HUMAN_ERROR:TRANSPOSITION]",
3524                                );
3525                            }
3526                        }
3527                    }
3528                }
3529            }
3530            1 => {
3531                // Wrong decimal place (off by factor of 10)
3532                if let Some(line) = entry.lines.get_mut(0) {
3533                    let is_debit = line.debit_amount > Decimal::ZERO;
3534                    let original_amount = if is_debit {
3535                        line.debit_amount
3536                    } else {
3537                        line.credit_amount
3538                    };
3539
3540                    let new_amount = original_amount * Decimal::new(10, 0);
3541                    let impact = new_amount - original_amount;
3542
3543                    // Apply to the modified line
3544                    if is_debit {
3545                        entry.lines[0].debit_amount = new_amount;
3546                    } else {
3547                        entry.lines[0].credit_amount = new_amount;
3548                    }
3549
3550                    // Rebalance the entry
3551                    Self::rebalance_entry(entry, is_debit, impact);
3552
3553                    entry.header.header_text = Some(
3554                        entry.header.header_text.clone().unwrap_or_default()
3555                            + " [HUMAN_ERROR:DECIMAL_SHIFT]",
3556                    );
3557                }
3558            }
3559            2 => {
3560                // Typo in description (doesn't affect balance)
3561                if let Some(ref mut text) = entry.header.header_text {
3562                    let typos = ["teh", "adn", "wiht", "taht", "recieve"];
3563                    let correct = ["the", "and", "with", "that", "receive"];
3564                    let idx = self.rng.random_range(0..typos.len());
3565                    if text.to_lowercase().contains(correct[idx]) {
3566                        *text = text.replace(correct[idx], typos[idx]);
3567                        *text = format!("{text} [HUMAN_ERROR:TYPO]");
3568                    }
3569                }
3570            }
3571            3 => {
3572                // Rounding to round number
3573                if let Some(line) = entry.lines.get_mut(0) {
3574                    let is_debit = line.debit_amount > Decimal::ZERO;
3575                    let original_amount = if is_debit {
3576                        line.debit_amount
3577                    } else {
3578                        line.credit_amount
3579                    };
3580
3581                    let new_amount =
3582                        (original_amount / Decimal::new(100, 0)).round() * Decimal::new(100, 0);
3583                    let impact = new_amount - original_amount;
3584
3585                    // Apply to the modified line
3586                    if is_debit {
3587                        entry.lines[0].debit_amount = new_amount;
3588                    } else {
3589                        entry.lines[0].credit_amount = new_amount;
3590                    }
3591
3592                    // Rebalance the entry
3593                    Self::rebalance_entry(entry, is_debit, impact);
3594
3595                    entry.header.header_text = Some(
3596                        entry.header.header_text.clone().unwrap_or_default()
3597                            + " [HUMAN_ERROR:ROUNDED]",
3598                    );
3599                }
3600            }
3601            // Late posting marker (document date much earlier than posting
3602            // date). Doesn't create an imbalance.
3603            4 if entry.header.document_date == entry.header.posting_date => {
3604                let days_late = self.rng.random_range(5..15);
3605                entry.header.document_date =
3606                    entry.header.posting_date - chrono::Duration::days(days_late);
3607                entry.header.header_text = Some(
3608                    entry.header.header_text.clone().unwrap_or_default()
3609                        + " [HUMAN_ERROR:LATE_POSTING]",
3610                );
3611            }
3612            _ => {}
3613        }
3614    }
3615
3616    /// Apply approval workflow for high-value transactions.
3617    ///
3618    /// If the entry amount exceeds the approval threshold, simulate an
3619    /// approval workflow with appropriate approvers based on amount.
3620    fn maybe_apply_approval_workflow(
3621        &mut self,
3622        entry: &mut JournalEntry,
3623        _posting_date: NaiveDate,
3624    ) {
3625        use rust_decimal::Decimal;
3626
3627        let amount = entry.total_debit();
3628
3629        // Skip if amount is below threshold
3630        if amount <= self.approval_threshold {
3631            // Auto-approved below threshold
3632            let workflow = ApprovalWorkflow::auto_approved(
3633                entry.header.created_by.clone(),
3634                entry.header.user_persona.clone(),
3635                amount,
3636                entry.header.created_at,
3637            );
3638            entry.header.approval_workflow = Some(workflow);
3639            return;
3640        }
3641
3642        // Mark as SOX relevant for high-value transactions
3643        entry.header.sox_relevant = true;
3644
3645        // Determine required approval levels based on amount
3646        let required_levels = if amount > Decimal::new(100000, 0) {
3647            3 // Executive approval required
3648        } else if amount > Decimal::new(50000, 0) {
3649            2 // Senior management approval
3650        } else {
3651            1 // Manager approval
3652        };
3653
3654        // Create the approval workflow
3655        let mut workflow = ApprovalWorkflow::new(
3656            entry.header.created_by.clone(),
3657            entry.header.user_persona.clone(),
3658            amount,
3659        );
3660        workflow.required_levels = required_levels;
3661
3662        // Simulate submission
3663        let submit_time = entry.header.created_at;
3664        let submit_action = ApprovalAction::new(
3665            entry.header.created_by.clone(),
3666            entry.header.user_persona.clone(),
3667            self.parse_persona(&entry.header.user_persona),
3668            ApprovalActionType::Submit,
3669            0,
3670        )
3671        .with_timestamp(submit_time);
3672
3673        workflow.actions.push(submit_action);
3674        workflow.status = ApprovalStatus::Pending;
3675        workflow.submitted_at = Some(submit_time);
3676
3677        // Simulate approvals with realistic delays
3678        let mut current_time = submit_time;
3679        for level in 1..=required_levels {
3680            // Add delay for approval (1-3 business hours per level)
3681            let delay_hours = self.rng.random_range(1..4);
3682            current_time += chrono::Duration::hours(delay_hours);
3683
3684            // Skip weekends
3685            while current_time.weekday() == chrono::Weekday::Sat
3686                || current_time.weekday() == chrono::Weekday::Sun
3687            {
3688                current_time += chrono::Duration::days(1);
3689            }
3690
3691            // Generate approver based on level
3692            let (approver_id, approver_role) = self.select_approver(level);
3693
3694            let approve_action = ApprovalAction::new(
3695                approver_id.clone(),
3696                approver_role.to_string(),
3697                approver_role,
3698                ApprovalActionType::Approve,
3699                level,
3700            )
3701            .with_timestamp(current_time);
3702
3703            workflow.actions.push(approve_action);
3704            workflow.current_level = level;
3705        }
3706
3707        // Mark as approved
3708        workflow.status = ApprovalStatus::Approved;
3709        workflow.approved_at = Some(current_time);
3710
3711        entry.header.approval_workflow = Some(workflow);
3712    }
3713
3714    /// Select an approver based on the required level.
3715    fn select_approver(&mut self, level: u8) -> (String, UserPersona) {
3716        let persona = match level {
3717            1 => UserPersona::Manager,
3718            2 => UserPersona::Controller,
3719            _ => UserPersona::Executive,
3720        };
3721
3722        // Try to get from user pool first
3723        if let Some(ref pool) = self.user_pool {
3724            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
3725                return (user.user_id.clone(), persona);
3726            }
3727        }
3728
3729        // Fallback to generated approver
3730        let approver_id = match persona {
3731            UserPersona::Manager => format!("MGR{:04}", self.rng.random_range(1..100)),
3732            UserPersona::Controller => format!("CTRL{:04}", self.rng.random_range(1..20)),
3733            UserPersona::Executive => format!("EXEC{:04}", self.rng.random_range(1..10)),
3734            _ => format!("USR{:04}", self.rng.random_range(1..1000)),
3735        };
3736
3737        (approver_id, persona)
3738    }
3739
3740    /// Parse user persona from string.
3741    fn parse_persona(&self, persona_str: &str) -> UserPersona {
3742        match persona_str.to_lowercase().as_str() {
3743            s if s.contains("junior") => UserPersona::JuniorAccountant,
3744            s if s.contains("senior") => UserPersona::SeniorAccountant,
3745            s if s.contains("controller") => UserPersona::Controller,
3746            s if s.contains("manager") => UserPersona::Manager,
3747            s if s.contains("executive") => UserPersona::Executive,
3748            s if s.contains("automated") || s.contains("system") => UserPersona::AutomatedSystem,
3749            _ => UserPersona::JuniorAccountant, // Default
3750        }
3751    }
3752
3753    /// Enable or disable approval workflow.
3754    pub fn with_approval(mut self, enabled: bool) -> Self {
3755        self.approval_enabled = enabled;
3756        self
3757    }
3758
3759    /// Set the approval threshold amount.
3760    pub fn with_approval_threshold(mut self, threshold: rust_decimal::Decimal) -> Self {
3761        self.approval_threshold = threshold;
3762        self
3763    }
3764
3765    /// Set the SOD violation rate for approval tracking.
3766    ///
3767    /// When a transaction is approved, there is a `rate` probability (0.0 to 1.0)
3768    /// that the approver is the same as the creator, which constitutes a SOD violation.
3769    /// Default is 0.10 (10%).
3770    pub fn with_sod_violation_rate(mut self, rate: f64) -> Self {
3771        self.sod_violation_rate = rate;
3772        self
3773    }
3774
3775    /// Populate `approved_by` and `approval_date` from the approval workflow,
3776    /// and flag SOD violations when the approver matches the creator.
3777    fn populate_approval_fields(&mut self, entry: &mut JournalEntry, posting_date: NaiveDate) {
3778        if let Some(ref workflow) = entry.header.approval_workflow {
3779            // Extract the last approver from the workflow actions
3780            let last_approver = workflow
3781                .actions
3782                .iter()
3783                .rev()
3784                .find(|a| matches!(a.action, ApprovalActionType::Approve));
3785
3786            if let Some(approver_action) = last_approver {
3787                entry.header.approved_by = Some(approver_action.actor_id.clone());
3788                entry.header.approval_date = Some(approver_action.action_timestamp.date_naive());
3789            } else {
3790                // No explicit approver (auto-approved); use the preparer
3791                entry.header.approved_by = Some(workflow.preparer_id.clone());
3792                entry.header.approval_date = Some(posting_date);
3793            }
3794
3795            // Inject SOD violation: with configured probability, set approver = creator
3796            if self.rng.random::<f64>() < self.sod_violation_rate {
3797                let creator = entry.header.created_by.clone();
3798                entry.header.approved_by = Some(creator);
3799                entry.header.sod_violation = true;
3800                entry.header.sod_conflict_type = Some(SodConflictType::PreparerApprover);
3801            }
3802        }
3803    }
3804
3805    /// Set the temporal drift controller for simulating distribution changes over time.
3806    ///
3807    /// When drift is enabled, amounts and other distributions will shift based on
3808    /// the period (month) to simulate realistic temporal evolution like inflation
3809    /// or increasing fraud rates.
3810    pub fn with_drift_controller(mut self, controller: DriftController) -> Self {
3811        self.drift_controller = Some(controller);
3812        self
3813    }
3814
3815    /// Set drift configuration directly.
3816    ///
3817    /// Creates a drift controller from the config. Total periods is calculated
3818    /// from the date range.
3819    pub fn with_drift_config(mut self, config: DriftConfig, seed: u64) -> Self {
3820        if config.enabled {
3821            let total_periods = self.calculate_total_periods();
3822            self.drift_controller = Some(DriftController::new(config, seed, total_periods));
3823        }
3824        self
3825    }
3826
3827    /// Calculate total periods (months) in the date range.
3828    fn calculate_total_periods(&self) -> u32 {
3829        let start_year = self.start_date.year();
3830        let start_month = self.start_date.month();
3831        let end_year = self.end_date.year();
3832        let end_month = self.end_date.month();
3833
3834        ((end_year - start_year) * 12 + (end_month as i32 - start_month as i32) + 1).max(1) as u32
3835    }
3836
3837    /// Calculate the period number (0-indexed) for a given date.
3838    fn date_to_period(&self, date: NaiveDate) -> u32 {
3839        let start_year = self.start_date.year();
3840        let start_month = self.start_date.month() as i32;
3841        let date_year = date.year();
3842        let date_month = date.month() as i32;
3843
3844        ((date_year - start_year) * 12 + (date_month - start_month)).max(0) as u32
3845    }
3846
3847    /// Get drift adjustments for a given date.
3848    fn get_drift_adjustments(&self, date: NaiveDate) -> DriftAdjustments {
3849        if let Some(ref controller) = self.drift_controller {
3850            let period = self.date_to_period(date);
3851            controller.compute_adjustments(period)
3852        } else {
3853            DriftAdjustments::none()
3854        }
3855    }
3856
3857    /// Select a user from the pool or generate a generic user ID.
3858    #[inline]
3859    fn select_user(&mut self, is_automated: bool) -> (String, String) {
3860        if let Some(ref pool) = self.user_pool {
3861            let persona = if is_automated {
3862                UserPersona::AutomatedSystem
3863            } else {
3864                // Random distribution among human personas
3865                let roll: f64 = self.rng.random();
3866                if roll < 0.4 {
3867                    UserPersona::JuniorAccountant
3868                } else if roll < 0.7 {
3869                    UserPersona::SeniorAccountant
3870                } else if roll < 0.85 {
3871                    UserPersona::Controller
3872                } else {
3873                    UserPersona::Manager
3874                }
3875            };
3876
3877            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
3878                return (user.user_id.clone(), user.persona.to_string());
3879            }
3880        }
3881
3882        // Fallback to generic format
3883        if is_automated {
3884            (
3885                format!("BATCH{:04}", self.rng.random_range(1..=20)),
3886                "automated_system".to_string(),
3887            )
3888        } else {
3889            (
3890                format!("USER{:04}", self.rng.random_range(1..=40)),
3891                "senior_accountant".to_string(),
3892            )
3893        }
3894    }
3895
3896    /// Select transaction source based on configuration weights.
3897    #[inline]
3898    fn select_source(&mut self) -> TransactionSource {
3899        let roll: f64 = self.rng.random();
3900        let dist = &self.config.source_distribution;
3901
3902        if roll < dist.manual {
3903            TransactionSource::Manual
3904        } else if roll < dist.manual + dist.automated {
3905            TransactionSource::Automated
3906        } else if roll < dist.manual + dist.automated + dist.recurring {
3907            TransactionSource::Recurring
3908        } else {
3909            TransactionSource::Adjustment
3910        }
3911    }
3912
3913    /// Select a business process based on configuration weights.
3914    #[inline]
3915    /// Map a business process to a SAP-style document type code.
3916    ///
3917    /// - P2P → "KR" (vendor invoice)
3918    /// - O2C → "DR" (customer invoice)
3919    /// - R2R → "SA" (general journal)
3920    /// - H2R → "HR" (HR posting)
3921    /// - A2R → "AA" (asset posting)
3922    /// - others → "SA"
3923    fn document_type_for_process(process: BusinessProcess) -> &'static str {
3924        match process {
3925            BusinessProcess::P2P => "KR",
3926            BusinessProcess::O2C => "DR",
3927            BusinessProcess::R2R => "SA",
3928            BusinessProcess::H2R => "HR",
3929            BusinessProcess::A2R => "AA",
3930            _ => "SA",
3931        }
3932    }
3933
3934    fn select_business_process(&mut self) -> BusinessProcess {
3935        *datasynth_core::utils::weighted_select(&mut self.rng, &self.business_process_weights)
3936    }
3937
3938    /// SOTA-2: draw a rank index in `[0, n)` with `P(rank=i) ∝ 1/(i+1)^ZIPF_ALPHA`
3939    /// from a dedicated stream, so a few low-rank accounts carry most lines (the
3940    /// corpus account-activity Pareto). Returns `None` for an empty/oversized pool
3941    /// so the caller keeps the uniform draw.
3942    #[inline]
3943    fn power_law_index(n: usize, rng: &mut ChaCha8Rng) -> Option<usize> {
3944        if n == 0 || n > ZIPF_CAP {
3945            return None;
3946        }
3947        let total = ZIPF_CUM[n];
3948        let r = rng.random::<f64>() * total;
3949        // smallest k in 1..=n with CUM[k] >= r → 0-based rank k-1
3950        let k = ZIPF_CUM[..=n]
3951            .binary_search_by(|v| v.partial_cmp(&r).unwrap_or(std::cmp::Ordering::Less))
3952            .unwrap_or_else(|e| e);
3953        Some(k.saturating_sub(1).min(n - 1))
3954    }
3955
3956    /// SOTA-2: replace a uniform `Vec<&GLAccount>` pick with a hot-account
3957    /// power-law pick when concentration is on (default). The uniform `.choose`
3958    /// draw on the main `rng` is still consumed by the caller first, so
3959    /// amounts/line-counts/dates stay byte-identical to the legacy stream — only
3960    /// the *selected account* changes. Associated (not `&mut self`) so it borrows
3961    /// only `account_rng`, leaving `coa` free for `all`/`uniform`.
3962    #[inline]
3963    fn concentrate<'a>(
3964        enabled: bool,
3965        rng: &mut ChaCha8Rng,
3966        all: &[&'a GLAccount],
3967        uniform: Option<&'a GLAccount>,
3968    ) -> Option<&'a GLAccount> {
3969        if enabled {
3970            Self::power_law_index(all.len(), rng)
3971                .map(|i| all[i])
3972                .or(uniform)
3973        } else {
3974            uniform
3975        }
3976    }
3977
3978    /// SOTA-8: ensure a `SourcePool` exists for `source` in the sampler (lazy build).
3979    /// One pool per source, persisted across JEs (sampler grows monotonically).
3980    fn ensure_cond_pair_pool(&mut self, source: &str) {
3981        let cfg = &self.config.source_conditional_account_pair;
3982        if !cfg.enabled {
3983            return;
3984        }
3985        if self.cond_pair_sampler.is_none() {
3986            self.cond_pair_sampler = Some(Default::default());
3987        }
3988        let sampler = self
3989            .cond_pair_sampler
3990            .as_mut()
3991            .expect("just-initialised above");
3992        if sampler.pool(source).is_some() {
3993            return;
3994        }
3995        let all_accounts: Vec<String> = self
3996            .coa
3997            .accounts
3998            .iter()
3999            .map(|a| a.account_number.clone())
4000            .collect();
4001        if all_accounts.is_empty() {
4002            return;
4003        }
4004        // Uniform weights here — the existing account-Pareto (account_concentration)
4005        // still applies at the outer fallback level if the per-source pool isn't used.
4006        let weights: Vec<f64> = vec![1.0; all_accounts.len()];
4007        sampler.ensure_pool(
4008            source,
4009            &all_accounts,
4010            &weights,
4011            cfg.accts_per_source_target,
4012            cfg.concentration,
4013            &mut self.cond_pair_rng,
4014        );
4015    }
4016
4017    /// SOTA-8: if the feature is enabled and the current JE has a source with a
4018    /// pool, pick an *account number* from the per-source PMF. Returns an owned
4019    /// `String` so the caller can release the mutable self-borrow before looking
4020    /// up the `GLAccount` in `self.coa`.
4021    #[inline]
4022    fn try_cond_pick_account_number(&mut self) -> Option<String> {
4023        let cfg = &self.config.source_conditional_account_pair;
4024        if !cfg.enabled {
4025            return None;
4026        }
4027        let src = self.current_je_source.clone()?;
4028        self.ensure_cond_pair_pool(&src);
4029        let sampler = self.cond_pair_sampler.as_ref()?;
4030        let pool = sampler.pool(&src)?;
4031        Some(pool.sample_one(&mut self.cond_pair_rng).to_string())
4032    }
4033
4034    #[inline]
4035    fn select_debit_account(&mut self) -> &GLAccount {
4036        // SOTA-8 source-conditional pick when feature is enabled.
4037        if let Some(acct_num) = self.try_cond_pick_account_number() {
4038            if let Some(a) = self
4039                .coa
4040                .accounts
4041                .iter()
4042                .find(|a| a.account_number == acct_num)
4043            {
4044                return a;
4045            }
4046            // Sampler chose an account not in CoA (defensive fall-through).
4047        }
4048        let accounts = self.coa.get_accounts_by_type(AccountType::Asset);
4049        let expense_accounts = self.coa.get_accounts_by_type(AccountType::Expense);
4050
4051        // 60% asset, 40% expense for debits
4052        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4053            accounts
4054        } else {
4055            expense_accounts
4056        };
4057
4058        let uniform = all.choose(&mut self.rng).copied();
4059        let enabled = self.config.account_concentration.unwrap_or(true);
4060        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4061            tracing::warn!(
4062                "Account selection returned empty list, falling back to first COA account"
4063            );
4064            &self.coa.accounts[0]
4065        })
4066    }
4067
4068    #[inline]
4069    fn select_credit_account(&mut self) -> &GLAccount {
4070        // SOTA-8 source-conditional pick when feature is enabled.
4071        if let Some(acct_num) = self.try_cond_pick_account_number() {
4072            if let Some(a) = self
4073                .coa
4074                .accounts
4075                .iter()
4076                .find(|a| a.account_number == acct_num)
4077            {
4078                return a;
4079            }
4080        }
4081        let liability_accounts = self.coa.get_accounts_by_type(AccountType::Liability);
4082        let revenue_accounts = self.coa.get_accounts_by_type(AccountType::Revenue);
4083
4084        // 60% liability, 40% revenue for credits
4085        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4086            liability_accounts
4087        } else {
4088            revenue_accounts
4089        };
4090
4091        let uniform = all.choose(&mut self.rng).copied();
4092        let enabled = self.config.account_concentration.unwrap_or(true);
4093        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4094            tracing::warn!(
4095                "Account selection returned empty list, falling back to first COA account"
4096            );
4097            &self.coa.accounts[0]
4098        })
4099    }
4100}
4101
4102impl Generator for JournalEntryGenerator {
4103    type Item = JournalEntry;
4104    type Config = (
4105        TransactionConfig,
4106        Arc<ChartOfAccounts>,
4107        Vec<String>,
4108        NaiveDate,
4109        NaiveDate,
4110    );
4111
4112    fn new(config: Self::Config, seed: u64) -> Self {
4113        Self::new_with_params(config.0, config.1, config.2, config.3, config.4, seed)
4114    }
4115
4116    fn generate_one(&mut self) -> Self::Item {
4117        self.generate()
4118    }
4119
4120    fn reset(&mut self) {
4121        self.rng = seeded_rng(self.seed, 0);
4122        self.source_mix_rng = seeded_rng(self.seed, 50_063);
4123        self.template_rng = seeded_rng(self.seed, 70_081);
4124        self.recurring_archetypes.clear();
4125        self.reversal_rng = seeded_rng(self.seed, 90_017);
4126        self.reversal_buffer.clear();
4127        self.account_rng = seeded_rng(self.seed, 60_071);
4128        self.allocation_rng = seeded_rng(self.seed, 80_023);
4129        self.fx_rng = seeded_rng(self.seed, 70_093);
4130        self.line_sampler.reset(self.seed + 1);
4131        self.amount_sampler.reset(self.seed + 2);
4132        self.temporal_sampler.reset(self.seed + 3);
4133        if let Some(ref mut adv) = self.advanced_amount_sampler {
4134            adv.reset(self.seed + 2);
4135        }
4136        self.count = 0;
4137        self.uuid_factory.reset();
4138
4139        // Reset reference generator by recreating it
4140        let mut ref_gen = ReferenceGenerator::new(
4141            self.start_date.year(),
4142            self.companies
4143                .first()
4144                .map(std::string::String::as_str)
4145                .unwrap_or("1000"),
4146        );
4147        ref_gen.set_prefix(
4148            ReferenceType::Invoice,
4149            &self.template_config.references.invoice_prefix,
4150        );
4151        ref_gen.set_prefix(
4152            ReferenceType::PurchaseOrder,
4153            &self.template_config.references.po_prefix,
4154        );
4155        ref_gen.set_prefix(
4156            ReferenceType::SalesOrder,
4157            &self.template_config.references.so_prefix,
4158        );
4159        self.reference_generator = ref_gen;
4160    }
4161
4162    fn count(&self) -> u64 {
4163        self.count
4164    }
4165
4166    fn seed(&self) -> u64 {
4167        self.seed
4168    }
4169}
4170
4171use datasynth_core::traits::ParallelGenerator;
4172
4173impl ParallelGenerator for JournalEntryGenerator {
4174    /// Split this generator into `parts` independent sub-generators.
4175    ///
4176    /// Each sub-generator gets a deterministic seed derived from the parent seed
4177    /// and its partition index, plus a partitioned UUID factory to avoid contention.
4178    /// The results are deterministic for a given partition count.
4179    fn split(self, parts: usize) -> Vec<Self> {
4180        let parts = parts.max(1);
4181        (0..parts)
4182            .map(|i| {
4183                // Derive a unique seed per partition using a golden-ratio constant
4184                let sub_seed = self
4185                    .seed
4186                    .wrapping_add((i as u64).wrapping_mul(0x9E3779B97F4A7C15));
4187
4188                let mut gen = JournalEntryGenerator::new_with_full_config(
4189                    self.config.clone(),
4190                    Arc::clone(&self.coa),
4191                    self.companies.clone(),
4192                    self.start_date,
4193                    self.end_date,
4194                    sub_seed,
4195                    self.template_config.clone(),
4196                    self.user_pool.clone(),
4197                );
4198
4199                // Copy over configuration state
4200                gen.company_selector = self.company_selector.clone();
4201                gen.vendor_pool = self.vendor_pool.clone();
4202                gen.customer_pool = self.customer_pool.clone();
4203                gen.material_pool = self.material_pool.clone();
4204                // v5.9.0: master-data pools so sub-generators emit
4205                // CC/PC values that join back to the corresponding
4206                // masters (without these clones, parallel workers
4207                // fell back to the hardcoded `COST_CENTER_POOL` const
4208                // and the legacy `PC-{COMP}-{P2P|O2C|...}` derivation).
4209                gen.cost_center_pool = self.cost_center_pool.clone();
4210                gen.profit_center_pool = self.profit_center_pool.clone();
4211                gen.using_real_master_data = self.using_real_master_data;
4212                gen.fraud_config = self.fraud_config.clone();
4213                gen.persona_errors_enabled = self.persona_errors_enabled;
4214                gen.approval_enabled = self.approval_enabled;
4215                gen.approval_threshold = self.approval_threshold;
4216                gen.sod_violation_rate = self.sod_violation_rate;
4217                // v3.4.0+: advanced amount sampler (mixture / Pareto /
4218                // Gaussian). Clone and reset the internal RNG with the
4219                // partition's sub_seed so each worker explores a unique
4220                // subsequence without repeating the parent stream.
4221                if let Some(mut adv) = self.advanced_amount_sampler.clone() {
4222                    adv.reset(sub_seed.wrapping_add(2));
4223                    gen.advanced_amount_sampler = Some(adv);
4224                }
4225                // v3.5.3+: conditional amount override — clone + reset
4226                // so each partition gets a fresh deterministic stream.
4227                if let Some(mut cond) = self.conditional_amount_override.clone() {
4228                    cond.reset(sub_seed.wrapping_add(17));
4229                    gen.conditional_amount_override = Some(cond);
4230                }
4231                // v3.5.4+: copula sampler — clone + reset per partition.
4232                if let Some(mut cop) = self.correlation_copula.clone() {
4233                    cop.reset(sub_seed.wrapping_add(31));
4234                    gen.correlation_copula = Some(cop);
4235                }
4236
4237                // Use partitioned UUID factory to eliminate atomic contention
4238                gen.uuid_factory = DeterministicUuidFactory::for_partition(
4239                    sub_seed,
4240                    GeneratorType::JournalEntry,
4241                    i as u8,
4242                );
4243
4244                // Copy temporal patterns if configured
4245                if let Some(ref config) = self.temporal_patterns_config {
4246                    gen.temporal_patterns_config = Some(config.clone());
4247                    // Rebuild business day calculator from the stored config
4248                    if config.business_days.enabled {
4249                        if let Some(ref bdc) = self.business_day_calculator {
4250                            gen.business_day_calculator = Some(bdc.clone());
4251                        }
4252                    }
4253                    // Rebuild processing lag calculator with partition seed
4254                    if config.processing_lags.enabled {
4255                        let lag_config =
4256                            Self::convert_processing_lag_config(&config.processing_lags);
4257                        gen.processing_lag_calculator =
4258                            Some(ProcessingLagCalculator::with_config(sub_seed, lag_config));
4259                    }
4260                }
4261
4262                // Copy drift controller if present
4263                if let Some(ref dc) = self.drift_controller {
4264                    gen.drift_controller = Some(dc.clone());
4265                }
4266
4267                // SP3: share Arc-wrapped priors with all sub-generators.
4268                // Clone is O(1) — increments the reference count only.
4269                gen.loaded_priors = self.loaded_priors.clone();
4270
4271                // SP3.4: each partition starts with a fresh calibrator so
4272                // observations are partition-local (avoids cross-partition
4273                // state contamination).  Target rates and window size are
4274                // cloned from the parent; accumulated state is not.
4275                if let Some(ref cal) = self.velocity_calibrator {
4276                    let mut fresh = crate::velocity_calibrator::VelocityCalibrator::new(
4277                        cal.target_trigger_rates.clone(),
4278                        cal.n_lines_between_calibrations,
4279                    );
4280                    fresh.current_values = cal.current_values.clone();
4281                    gen.velocity_calibrator = Some(fresh);
4282                }
4283
4284                gen
4285            })
4286            .collect()
4287    }
4288}
4289
4290#[cfg(test)]
4291mod tests {
4292    use super::*;
4293    use crate::ChartOfAccountsGenerator;
4294
4295    #[test]
4296    fn test_generate_balanced_entries() {
4297        let mut coa_gen =
4298            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4299        let coa = Arc::new(coa_gen.generate());
4300
4301        let mut je_gen = JournalEntryGenerator::new_with_params(
4302            TransactionConfig::default(),
4303            coa,
4304            vec!["1000".to_string()],
4305            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4306            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4307            42,
4308        );
4309
4310        let mut balanced_count = 0;
4311        for _ in 0..100 {
4312            let entry = je_gen.generate();
4313
4314            // Skip entries with human errors as they may be intentionally unbalanced
4315            let has_human_error = entry
4316                .header
4317                .header_text
4318                .as_ref()
4319                .map(|t| t.contains("[HUMAN_ERROR:"))
4320                .unwrap_or(false);
4321
4322            if !has_human_error {
4323                assert!(
4324                    entry.is_balanced(),
4325                    "Entry {:?} is not balanced",
4326                    entry.header.document_id
4327                );
4328                balanced_count += 1;
4329            }
4330            assert!(entry.line_count() >= 2, "Entry has fewer than 2 lines");
4331        }
4332
4333        // Ensure most entries are balanced (human errors are rare)
4334        assert!(
4335            balanced_count >= 80,
4336            "Expected at least 80 balanced entries, got {}",
4337            balanced_count
4338        );
4339    }
4340
4341    #[test]
4342    fn test_deterministic_generation() {
4343        let mut coa_gen =
4344            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4345        let coa = Arc::new(coa_gen.generate());
4346
4347        let mut gen1 = JournalEntryGenerator::new_with_params(
4348            TransactionConfig::default(),
4349            Arc::clone(&coa),
4350            vec!["1000".to_string()],
4351            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4352            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4353            42,
4354        );
4355
4356        let mut gen2 = JournalEntryGenerator::new_with_params(
4357            TransactionConfig::default(),
4358            coa,
4359            vec!["1000".to_string()],
4360            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4361            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4362            42,
4363        );
4364
4365        for _ in 0..50 {
4366            let e1 = gen1.generate();
4367            let e2 = gen2.generate();
4368            assert_eq!(e1.header.document_id, e2.header.document_id);
4369            assert_eq!(e1.total_debit(), e2.total_debit());
4370        }
4371    }
4372
4373    #[test]
4374    fn test_templates_generate_descriptions() {
4375        let mut coa_gen =
4376            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4377        let coa = Arc::new(coa_gen.generate());
4378
4379        // Enable all template features
4380        let template_config = TemplateConfig {
4381            names: datasynth_config::schema::NameTemplateConfig {
4382                generate_realistic_names: true,
4383                email_domain: "test.com".to_string(),
4384                culture_distribution: datasynth_config::schema::CultureDistribution::default(),
4385            },
4386            descriptions: datasynth_config::schema::DescriptionTemplateConfig {
4387                generate_header_text: true,
4388                generate_line_text: true,
4389            },
4390            references: datasynth_config::schema::ReferenceTemplateConfig {
4391                generate_references: true,
4392                invoice_prefix: "TEST-INV".to_string(),
4393                po_prefix: "TEST-PO".to_string(),
4394                so_prefix: "TEST-SO".to_string(),
4395            },
4396            path: None,
4397            merge_strategy: datasynth_config::TemplateMergeStrategy::default(),
4398        };
4399
4400        let mut je_gen = JournalEntryGenerator::new_with_full_config(
4401            TransactionConfig::default(),
4402            coa,
4403            vec!["1000".to_string()],
4404            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4405            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4406            42,
4407            template_config,
4408            None,
4409        )
4410        .with_persona_errors(false); // Disable for template testing
4411
4412        for _ in 0..10 {
4413            let entry = je_gen.generate();
4414
4415            // Verify header text is populated
4416            assert!(
4417                entry.header.header_text.is_some(),
4418                "Header text should be populated"
4419            );
4420
4421            // Verify reference is populated
4422            assert!(
4423                entry.header.reference.is_some(),
4424                "Reference should be populated"
4425            );
4426
4427            // Verify business process is set
4428            assert!(
4429                entry.header.business_process.is_some(),
4430                "Business process should be set"
4431            );
4432
4433            // Verify line text is populated
4434            for line in &entry.lines {
4435                assert!(line.line_text.is_some(), "Line text should be populated");
4436            }
4437
4438            // Entry should still be balanced
4439            assert!(entry.is_balanced());
4440        }
4441    }
4442
4443    #[test]
4444    fn test_user_pool_integration() {
4445        let mut coa_gen =
4446            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4447        let coa = Arc::new(coa_gen.generate());
4448
4449        let companies = vec!["1000".to_string()];
4450
4451        // Generate user pool
4452        let mut user_gen = crate::UserGenerator::new(42);
4453        let user_pool = user_gen.generate_standard(&companies);
4454
4455        let mut je_gen = JournalEntryGenerator::new_with_full_config(
4456            TransactionConfig::default(),
4457            coa,
4458            companies,
4459            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4460            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4461            42,
4462            TemplateConfig::default(),
4463            Some(user_pool),
4464        );
4465
4466        // Generate entries and verify user IDs are from pool
4467        for _ in 0..20 {
4468            let entry = je_gen.generate();
4469
4470            // User ID should not be generic BATCH/USER format when pool is used
4471            // (though it may still fall back if random selection misses)
4472            assert!(!entry.header.created_by.is_empty());
4473        }
4474    }
4475
4476    #[test]
4477    fn test_master_data_connection() {
4478        let mut coa_gen =
4479            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4480        let coa = Arc::new(coa_gen.generate());
4481
4482        // Create test vendors
4483        let vendors = vec![
4484            Vendor::new("V-TEST-001", "Test Vendor Alpha", VendorType::Supplier),
4485            Vendor::new("V-TEST-002", "Test Vendor Beta", VendorType::Technology),
4486        ];
4487
4488        // Create test customers
4489        let customers = vec![
4490            Customer::new("C-TEST-001", "Test Customer One", CustomerType::Corporate),
4491            Customer::new(
4492                "C-TEST-002",
4493                "Test Customer Two",
4494                CustomerType::SmallBusiness,
4495            ),
4496        ];
4497
4498        // Create test materials
4499        let materials = vec![Material::new(
4500            "MAT-TEST-001",
4501            "Test Material A",
4502            MaterialType::RawMaterial,
4503        )];
4504
4505        // Create generator with master data
4506        let generator = JournalEntryGenerator::new_with_params(
4507            TransactionConfig::default(),
4508            coa,
4509            vec!["1000".to_string()],
4510            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4511            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4512            42,
4513        );
4514
4515        // Without master data
4516        assert!(!generator.is_using_real_master_data());
4517
4518        // Connect master data
4519        let generator_with_data = generator
4520            .with_vendors(&vendors)
4521            .with_customers(&customers)
4522            .with_materials(&materials);
4523
4524        // Should now be using real master data
4525        assert!(generator_with_data.is_using_real_master_data());
4526    }
4527
4528    #[test]
4529    fn test_with_master_data_convenience_method() {
4530        let mut coa_gen =
4531            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4532        let coa = Arc::new(coa_gen.generate());
4533
4534        let vendors = vec![Vendor::new("V-001", "Vendor One", VendorType::Supplier)];
4535        let customers = vec![Customer::new(
4536            "C-001",
4537            "Customer One",
4538            CustomerType::Corporate,
4539        )];
4540        let materials = vec![Material::new(
4541            "MAT-001",
4542            "Material One",
4543            MaterialType::RawMaterial,
4544        )];
4545
4546        let generator = JournalEntryGenerator::new_with_params(
4547            TransactionConfig::default(),
4548            coa,
4549            vec!["1000".to_string()],
4550            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4551            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4552            42,
4553        )
4554        .with_master_data(&vendors, &customers, &materials);
4555
4556        assert!(generator.is_using_real_master_data());
4557    }
4558
4559    #[test]
4560    fn test_stress_factors_increase_error_rate() {
4561        let mut coa_gen =
4562            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4563        let coa = Arc::new(coa_gen.generate());
4564
4565        let generator = JournalEntryGenerator::new_with_params(
4566            TransactionConfig::default(),
4567            coa,
4568            vec!["1000".to_string()],
4569            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4570            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4571            42,
4572        );
4573
4574        let base_rate = 0.1;
4575
4576        // Regular day - no stress factors
4577        let regular_day = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(); // Mid-June Wednesday
4578        let regular_rate = generator.apply_stress_factors(base_rate, regular_day);
4579        assert!(
4580            (regular_rate - base_rate).abs() < 0.01,
4581            "Regular day should have minimal stress factor adjustment"
4582        );
4583
4584        // Month end - 50% more errors
4585        let month_end = NaiveDate::from_ymd_opt(2024, 6, 29).unwrap(); // June 29 (Saturday)
4586        let month_end_rate = generator.apply_stress_factors(base_rate, month_end);
4587        assert!(
4588            month_end_rate > regular_rate,
4589            "Month end should have higher error rate than regular day"
4590        );
4591
4592        // Year end - double the error rate
4593        let year_end = NaiveDate::from_ymd_opt(2024, 12, 30).unwrap(); // December 30
4594        let year_end_rate = generator.apply_stress_factors(base_rate, year_end);
4595        assert!(
4596            year_end_rate > month_end_rate,
4597            "Year end should have highest error rate"
4598        );
4599
4600        // Friday stress
4601        let friday = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap(); // Friday
4602        let friday_rate = generator.apply_stress_factors(base_rate, friday);
4603        assert!(
4604            friday_rate > regular_rate,
4605            "Friday should have higher error rate than mid-week"
4606        );
4607
4608        // Monday stress
4609        let monday = NaiveDate::from_ymd_opt(2024, 6, 17).unwrap(); // Monday
4610        let monday_rate = generator.apply_stress_factors(base_rate, monday);
4611        assert!(
4612            monday_rate > regular_rate,
4613            "Monday should have higher error rate than mid-week"
4614        );
4615    }
4616
4617    #[test]
4618    fn test_batching_produces_similar_entries() {
4619        let mut coa_gen =
4620            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4621        let coa = Arc::new(coa_gen.generate());
4622
4623        // Use seed 123 which is more likely to trigger batching
4624        let mut je_gen = JournalEntryGenerator::new_with_params(
4625            TransactionConfig::default(),
4626            coa,
4627            vec!["1000".to_string()],
4628            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4629            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4630            123,
4631        )
4632        .with_persona_errors(false); // Disable to ensure balanced entries
4633
4634        // Generate many entries - at 15% batch rate, should see some batches
4635        let entries: Vec<JournalEntry> = (0..200).map(|_| je_gen.generate()).collect();
4636
4637        // Check that all entries are balanced (batched or not)
4638        for entry in &entries {
4639            assert!(
4640                entry.is_balanced(),
4641                "All entries including batched should be balanced"
4642            );
4643        }
4644
4645        // Count entries with same-day posting dates (batch indicator)
4646        let mut date_counts: std::collections::HashMap<NaiveDate, usize> =
4647            std::collections::HashMap::new();
4648        for entry in &entries {
4649            *date_counts.entry(entry.header.posting_date).or_insert(0) += 1;
4650        }
4651
4652        // With batching, some dates should have multiple entries
4653        let dates_with_multiple = date_counts.values().filter(|&&c| c > 1).count();
4654        assert!(
4655            dates_with_multiple > 0,
4656            "With batching, should see some dates with multiple entries"
4657        );
4658    }
4659
4660    #[test]
4661    fn test_temporal_patterns_business_days() {
4662        use datasynth_config::schema::{
4663            BusinessDaySchemaConfig, CalendarSchemaConfig, TemporalPatternsConfig,
4664        };
4665
4666        let mut coa_gen =
4667            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4668        let coa = Arc::new(coa_gen.generate());
4669
4670        // Create temporal patterns config with business days enabled
4671        let temporal_config = TemporalPatternsConfig {
4672            enabled: true,
4673            business_days: BusinessDaySchemaConfig {
4674                enabled: true,
4675                ..Default::default()
4676            },
4677            calendars: CalendarSchemaConfig {
4678                regions: vec!["US".to_string()],
4679                custom_holidays: vec![],
4680            },
4681            ..Default::default()
4682        };
4683
4684        let mut je_gen = JournalEntryGenerator::new_with_params(
4685            TransactionConfig::default(),
4686            coa,
4687            vec!["1000".to_string()],
4688            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4689            NaiveDate::from_ymd_opt(2024, 3, 31).unwrap(), // Q1 2024
4690            42,
4691        )
4692        .with_temporal_patterns(temporal_config, 42)
4693        .with_persona_errors(false);
4694
4695        // Generate entries and verify none fall on weekends
4696        let entries: Vec<JournalEntry> = (0..100).map(|_| je_gen.generate()).collect();
4697
4698        for entry in &entries {
4699            let weekday = entry.header.posting_date.weekday();
4700            assert!(
4701                weekday != chrono::Weekday::Sat && weekday != chrono::Weekday::Sun,
4702                "Posting date {:?} should not be a weekend",
4703                entry.header.posting_date
4704            );
4705        }
4706    }
4707
4708    #[test]
4709    fn test_default_generation_filters_weekends() {
4710        // Verify that weekend entries are <5% even when temporal_patterns is NOT enabled.
4711        // This tests the fix where new_with_full_config always creates a default
4712        // BusinessDayCalculator with US holidays as a fallback.
4713        let mut coa_gen =
4714            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4715        let coa = Arc::new(coa_gen.generate());
4716
4717        let mut je_gen = JournalEntryGenerator::new_with_params(
4718            TransactionConfig::default(),
4719            coa,
4720            vec!["1000".to_string()],
4721            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4722            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4723            42,
4724        )
4725        .with_persona_errors(false);
4726
4727        let total = 500;
4728        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4729
4730        let weekend_count = entries
4731            .iter()
4732            .filter(|e| {
4733                let wd = e.header.posting_date.weekday();
4734                wd == chrono::Weekday::Sat || wd == chrono::Weekday::Sun
4735            })
4736            .count();
4737
4738        let weekend_pct = weekend_count as f64 / total as f64;
4739        assert!(
4740            weekend_pct < 0.05,
4741            "Expected weekend entries <5% of total without temporal_patterns enabled, \
4742             but got {:.1}% ({}/{})",
4743            weekend_pct * 100.0,
4744            weekend_count,
4745            total
4746        );
4747    }
4748
4749    #[test]
4750    fn test_document_type_derived_from_business_process() {
4751        let mut coa_gen =
4752            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4753        let coa = Arc::new(coa_gen.generate());
4754
4755        let mut je_gen = JournalEntryGenerator::new_with_params(
4756            TransactionConfig::default(),
4757            coa,
4758            vec!["1000".to_string()],
4759            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4760            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4761            99,
4762        )
4763        .with_persona_errors(false)
4764        .with_batching(false);
4765
4766        let total = 200;
4767        let mut doc_types = std::collections::HashSet::new();
4768        let mut sa_count = 0_usize;
4769
4770        for _ in 0..total {
4771            let entry = je_gen.generate();
4772            let dt = &entry.header.document_type;
4773            doc_types.insert(dt.clone());
4774            if dt == "SA" {
4775                sa_count += 1;
4776            }
4777        }
4778
4779        // Should have more than 3 distinct document types
4780        assert!(
4781            doc_types.len() > 3,
4782            "Expected >3 distinct document types, got {} ({:?})",
4783            doc_types.len(),
4784            doc_types,
4785        );
4786
4787        // "SA" should be less than 50% (R2R is 20% of the weight)
4788        let sa_pct = sa_count as f64 / total as f64;
4789        assert!(
4790            sa_pct < 0.50,
4791            "Expected SA <50%, got {:.1}% ({}/{})",
4792            sa_pct * 100.0,
4793            sa_count,
4794            total,
4795        );
4796    }
4797
4798    #[test]
4799    fn test_enrich_line_items_account_description() {
4800        let mut coa_gen =
4801            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4802        let coa = Arc::new(coa_gen.generate());
4803
4804        let mut je_gen = JournalEntryGenerator::new_with_params(
4805            TransactionConfig::default(),
4806            coa,
4807            vec!["1000".to_string()],
4808            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4809            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4810            42,
4811        )
4812        .with_persona_errors(false);
4813
4814        let total = 200;
4815        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4816
4817        // Count lines with account_description populated
4818        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
4819        let lines_with_desc: usize = entries
4820            .iter()
4821            .flat_map(|e| &e.lines)
4822            .filter(|l| l.account_description.is_some())
4823            .count();
4824
4825        let desc_pct = lines_with_desc as f64 / total_lines as f64;
4826        assert!(
4827            desc_pct > 0.95,
4828            "Expected >95% of lines to have account_description, got {:.1}% ({}/{})",
4829            desc_pct * 100.0,
4830            lines_with_desc,
4831            total_lines,
4832        );
4833    }
4834
4835    #[test]
4836    fn test_enrich_line_items_cost_center_for_expense_accounts() {
4837        let mut coa_gen =
4838            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4839        let coa = Arc::new(coa_gen.generate());
4840
4841        let mut je_gen = JournalEntryGenerator::new_with_params(
4842            TransactionConfig::default(),
4843            coa,
4844            vec!["1000".to_string()],
4845            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4846            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4847            42,
4848        )
4849        .with_persona_errors(false);
4850
4851        let total = 300;
4852        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4853
4854        // Count expense account lines (5xxx/6xxx) with cost_center populated
4855        let expense_lines: Vec<&JournalEntryLine> = entries
4856            .iter()
4857            .flat_map(|e| &e.lines)
4858            .filter(|l| {
4859                let first = l.gl_account.chars().next().unwrap_or('0');
4860                first == '5' || first == '6'
4861            })
4862            .collect();
4863
4864        if !expense_lines.is_empty() {
4865            let with_cc = expense_lines
4866                .iter()
4867                .filter(|l| l.cost_center.is_some())
4868                .count();
4869            let cc_pct = with_cc as f64 / expense_lines.len() as f64;
4870            assert!(
4871                cc_pct > 0.80,
4872                "Expected >80% of expense lines to have cost_center, got {:.1}% ({}/{})",
4873                cc_pct * 100.0,
4874                with_cc,
4875                expense_lines.len(),
4876            );
4877        }
4878    }
4879
4880    #[test]
4881    fn test_enrich_line_items_profit_center_and_line_text() {
4882        let mut coa_gen =
4883            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4884        let coa = Arc::new(coa_gen.generate());
4885
4886        let mut je_gen = JournalEntryGenerator::new_with_params(
4887            TransactionConfig::default(),
4888            coa,
4889            vec!["1000".to_string()],
4890            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4891            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4892            42,
4893        )
4894        .with_persona_errors(false);
4895
4896        let total = 100;
4897        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4898
4899        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
4900
4901        // All lines should have profit_center
4902        let with_pc = entries
4903            .iter()
4904            .flat_map(|e| &e.lines)
4905            .filter(|l| l.profit_center.is_some())
4906            .count();
4907        let pc_pct = with_pc as f64 / total_lines as f64;
4908        assert!(
4909            pc_pct > 0.95,
4910            "Expected >95% of lines to have profit_center, got {:.1}% ({}/{})",
4911            pc_pct * 100.0,
4912            with_pc,
4913            total_lines,
4914        );
4915
4916        // All lines should have line_text (either from template or header fallback)
4917        let with_text = entries
4918            .iter()
4919            .flat_map(|e| &e.lines)
4920            .filter(|l| l.line_text.is_some())
4921            .count();
4922        let text_pct = with_text as f64 / total_lines as f64;
4923        assert!(
4924            text_pct > 0.95,
4925            "Expected >95% of lines to have line_text, got {:.1}% ({}/{})",
4926            text_pct * 100.0,
4927            with_text,
4928            total_lines,
4929        );
4930    }
4931
4932    // --- ISA 240 audit flag tests ---
4933
4934    #[test]
4935    fn test_je_has_audit_flags() {
4936        let mut coa_gen =
4937            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4938        let coa = Arc::new(coa_gen.generate());
4939
4940        let mut je_gen = JournalEntryGenerator::new_with_params(
4941            TransactionConfig::default(),
4942            coa,
4943            vec!["1000".to_string()],
4944            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4945            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4946            42,
4947        )
4948        .with_persona_errors(false);
4949
4950        for _ in 0..100 {
4951            let entry = je_gen.generate();
4952
4953            // source_system should always be non-empty
4954            assert!(
4955                !entry.header.source_system.is_empty(),
4956                "source_system should be populated, got empty string"
4957            );
4958
4959            // created_by should always be non-empty (already tested elsewhere, but confirm)
4960            assert!(
4961                !entry.header.created_by.is_empty(),
4962                "created_by should be populated"
4963            );
4964
4965            // created_date should always be populated
4966            assert!(
4967                entry.header.created_date.is_some(),
4968                "created_date should be populated"
4969            );
4970        }
4971    }
4972
4973    #[test]
4974    fn test_manual_entry_rate() {
4975        let mut coa_gen =
4976            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4977        let coa = Arc::new(coa_gen.generate());
4978
4979        let mut je_gen = JournalEntryGenerator::new_with_params(
4980            TransactionConfig::default(),
4981            coa,
4982            vec!["1000".to_string()],
4983            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4984            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4985            42,
4986        )
4987        .with_persona_errors(false)
4988        .with_batching(false);
4989
4990        let total = 1000;
4991        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4992
4993        let manual_count = entries.iter().filter(|e| e.header.is_manual).count();
4994        let manual_rate = manual_count as f64 / total as f64;
4995
4996        // Default source_distribution.manual is typically around 0.05-0.15
4997        // Allow a wide tolerance for statistical variation
4998        assert!(
4999            manual_rate > 0.01 && manual_rate < 0.50,
5000            "Manual entry rate should be reasonable (1%-50%), got {:.1}% ({}/{})",
5001            manual_rate * 100.0,
5002            manual_count,
5003            total,
5004        );
5005
5006        // is_manual should match TransactionSource::Manual
5007        for entry in &entries {
5008            let source_is_manual = entry.header.source == TransactionSource::Manual;
5009            assert_eq!(
5010                entry.header.is_manual, source_is_manual,
5011                "is_manual should match source == Manual"
5012            );
5013        }
5014    }
5015
5016    #[test]
5017    fn test_manual_source_consistency() {
5018        let mut coa_gen =
5019            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5020        let coa = Arc::new(coa_gen.generate());
5021
5022        let mut je_gen = JournalEntryGenerator::new_with_params(
5023            TransactionConfig::default(),
5024            coa,
5025            vec!["1000".to_string()],
5026            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5027            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5028            42,
5029        )
5030        .with_persona_errors(false)
5031        .with_batching(false);
5032
5033        for _ in 0..500 {
5034            let entry = je_gen.generate();
5035
5036            if entry.header.is_manual {
5037                // Manual entries must have a source_system in the
5038                // `manual/...` or `spreadsheet/...` family (the bare
5039                // legacy `manual` and `spreadsheet` values are also
5040                // accepted to keep older fixtures working).
5041                let s = entry.header.source_system.as_str();
5042                assert!(
5043                    s == "manual"
5044                        || s == "spreadsheet"
5045                        || s.starts_with("manual/")
5046                        || s.starts_with("spreadsheet/"),
5047                    "Manual entry should have source_system in `manual` / `spreadsheet` family, got '{s}'",
5048                );
5049            } else {
5050                // Non-manual entries must NOT be in the manual/spreadsheet family.
5051                let s = entry.header.source_system.as_str();
5052                assert!(
5053                    !(s == "manual"
5054                        || s == "spreadsheet"
5055                        || s.starts_with("manual/")
5056                        || s.starts_with("spreadsheet/")),
5057                    "Non-manual entry should not be in `manual` / `spreadsheet` family, got '{s}'",
5058                );
5059            }
5060        }
5061    }
5062
5063    #[test]
5064    fn test_default_source_codes_breadth() {
5065        // T2-D Lever 1: with no industry priors and the default config, the
5066        // `source` column carries a broad generic SAP doc-type mix
5067        // (sap_source_code populated) instead of collapsing to the
5068        // TransactionSource enum. See experiments/ml/FINDINGS.md §6.
5069        let mut coa_gen =
5070            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 7);
5071        let coa = Arc::new(coa_gen.generate());
5072        let mut je_gen = JournalEntryGenerator::new_with_params(
5073            TransactionConfig::default(),
5074            coa,
5075            vec!["1000".to_string()],
5076            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5077            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5078            7,
5079        )
5080        .with_persona_errors(false)
5081        .with_batching(false);
5082
5083        let mut codes = std::collections::HashSet::new();
5084        for _ in 0..500 {
5085            let e = je_gen.generate();
5086            let code = e
5087                .header
5088                .sap_source_code
5089                .expect("default config should populate sap_source_code");
5090            codes.insert(code);
5091        }
5092        assert!(
5093            codes.len() >= 10,
5094            "default source-mix should be broad (>=10 distinct codes), got {}",
5095            codes.len()
5096        );
5097    }
5098
5099    #[test]
5100    fn test_source_codes_opt_out() {
5101        // synthetic_source_codes = Some(false) restores the legacy behaviour:
5102        // sap_source_code stays None and `source` falls back to the enum.
5103        let mut coa_gen =
5104            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 9);
5105        let coa = Arc::new(coa_gen.generate());
5106        let cfg = TransactionConfig {
5107            synthetic_source_codes: Some(false),
5108            ..TransactionConfig::default()
5109        };
5110        let mut je_gen = JournalEntryGenerator::new_with_params(
5111            cfg,
5112            coa,
5113            vec!["1000".to_string()],
5114            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5115            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5116            9,
5117        )
5118        .with_persona_errors(false)
5119        .with_batching(false);
5120        for _ in 0..50 {
5121            let e = je_gen.generate();
5122            assert!(
5123                e.header.sap_source_code.is_none(),
5124                "opt-out should leave sap_source_code None (legacy enum source)"
5125            );
5126        }
5127    }
5128
5129    #[test]
5130    fn test_recurring_templates_reuse_archetypes() {
5131        // SOTA-1: with templating on (default), generated JEs reuse account
5132        // archetypes (far fewer distinct than the legacy uniform-per-line
5133        // selection), and balance is preserved either way.
5134        fn run(recurring: Option<bool>) -> (usize, usize, bool) {
5135            let mut coa_gen = ChartOfAccountsGenerator::new(
5136                CoAComplexity::Medium,
5137                IndustrySector::Manufacturing,
5138                11,
5139            );
5140            let coa = Arc::new(coa_gen.generate());
5141            let cfg = TransactionConfig {
5142                recurring_templates: recurring,
5143                ..TransactionConfig::default()
5144            };
5145            let mut g = JournalEntryGenerator::new_with_params(
5146                cfg,
5147                coa,
5148                vec!["1000".to_string()],
5149                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5150                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5151                11,
5152            )
5153            .with_persona_errors(false)
5154            .with_batching(false);
5155            let n = 800;
5156            let mut arche = std::collections::HashSet::new();
5157            let mut balanced = true;
5158            for _ in 0..n {
5159                let e = g.generate();
5160                if !e.is_balanced() {
5161                    balanced = false;
5162                }
5163                let mut sig: Vec<(String, bool)> = e
5164                    .lines
5165                    .iter()
5166                    .map(|l| (l.gl_account.clone(), l.debit_amount > Decimal::ZERO))
5167                    .collect();
5168                sig.sort();
5169                arche.insert(sig);
5170            }
5171            (n, arche.len(), balanced)
5172        }
5173        let (n, distinct_on, bal_on) = run(Some(true));
5174        let (_, distinct_off, bal_off) = run(Some(false));
5175        assert!(bal_on && bal_off, "balance preserved in both modes");
5176        assert!(
5177            distinct_on < distinct_off,
5178            "templating should reduce distinct archetypes ({distinct_on} on vs {distinct_off} off)"
5179        );
5180        assert!(
5181            distinct_on * 2 < n,
5182            "templating should reuse heavily: {distinct_on} distinct archetypes over {n} JEs"
5183        );
5184    }
5185
5186    #[test]
5187    fn test_reversal_process_emits_balanced_reversals() {
5188        // SOTA-5: with reversal_rate > 0, some JEs are balanced reversals of
5189        // earlier ones (header_text "Reversal of ..."); rate 0.0 emits none.
5190        fn run(rate: Option<f64>) -> (usize, bool) {
5191            let mut coa_gen = ChartOfAccountsGenerator::new(
5192                CoAComplexity::Small,
5193                IndustrySector::Manufacturing,
5194                13,
5195            );
5196            let coa = Arc::new(coa_gen.generate());
5197            let cfg = TransactionConfig {
5198                reversal_rate: rate,
5199                ..TransactionConfig::default()
5200            };
5201            let mut g = JournalEntryGenerator::new_with_params(
5202                cfg,
5203                coa,
5204                vec!["1000".to_string()],
5205                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5206                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5207                13,
5208            )
5209            .with_persona_errors(false)
5210            .with_batching(false);
5211            let mut reversals = 0;
5212            let mut balanced = true;
5213            for _ in 0..1000 {
5214                let e = g.generate();
5215                if !e.is_balanced() {
5216                    balanced = false;
5217                }
5218                if e.header
5219                    .header_text
5220                    .as_deref()
5221                    .is_some_and(|t| t.starts_with("Reversal of"))
5222                {
5223                    reversals += 1;
5224                }
5225            }
5226            (reversals, balanced)
5227        }
5228        let (rev_on, bal_on) = run(Some(0.05));
5229        let (rev_off, bal_off) = run(Some(0.0));
5230        assert!(bal_on && bal_off, "all entries balanced incl. reversals");
5231        assert_eq!(rev_off, 0, "rate 0.0 emits no reversals, got {rev_off}");
5232        assert!(rev_on > 0, "rate 0.05 should emit reversals, got {rev_on}");
5233    }
5234
5235    #[test]
5236    fn test_account_concentration_creates_pareto() {
5237        // SOTA-2: with concentration on (default), a hot subset of accounts
5238        // carries most lines (the corpus account-activity Pareto, top-10% ≈ 95%)
5239        // vs the legacy near-uniform pool draw. Templating + reversals are held
5240        // off so the only difference between the two runs is the power-law pick.
5241        fn run(concentration: Option<bool>) -> (f64, bool) {
5242            let mut coa_gen = ChartOfAccountsGenerator::new(
5243                CoAComplexity::Medium,
5244                IndustrySector::Manufacturing,
5245                17,
5246            );
5247            let coa = Arc::new(coa_gen.generate());
5248            let cfg = TransactionConfig {
5249                account_concentration: concentration,
5250                recurring_templates: Some(false),
5251                reversal_rate: Some(0.0),
5252                ..TransactionConfig::default()
5253            };
5254            let mut g = JournalEntryGenerator::new_with_params(
5255                cfg,
5256                coa,
5257                vec!["1000".to_string()],
5258                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5259                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5260                17,
5261            )
5262            .with_persona_errors(false)
5263            .with_batching(false);
5264            let mut counts: std::collections::HashMap<String, usize> =
5265                std::collections::HashMap::new();
5266            let mut total_lines = 0usize;
5267            let mut balanced = true;
5268            for _ in 0..1000 {
5269                let e = g.generate();
5270                if !e.is_balanced() {
5271                    balanced = false;
5272                }
5273                for l in &e.lines {
5274                    *counts.entry(l.gl_account.clone()).or_default() += 1;
5275                    total_lines += 1;
5276                }
5277            }
5278            // share of lines carried by the top-10% most-active accounts (the
5279            // corpus_structure "acct top10%" metric, over active accounts).
5280            let mut v: Vec<usize> = counts.values().copied().collect();
5281            v.sort_unstable_by(|a, b| b.cmp(a));
5282            let top_k = ((v.len() as f64 * 0.10).ceil() as usize).max(1);
5283            let top_share = v.iter().take(top_k).sum::<usize>() as f64 / total_lines as f64;
5284            (top_share, balanced)
5285        }
5286        let (share_on, bal_on) = run(Some(true));
5287        let (share_off, bal_off) = run(Some(false));
5288        assert!(bal_on && bal_off, "balance preserved in both modes");
5289        assert!(
5290            share_on > share_off + 0.20,
5291            "concentration should raise the top-10% line share ({share_on:.3} on vs {share_off:.3} off)"
5292        );
5293        assert!(
5294            share_on > 0.50,
5295            "hot accounts should dominate: top-10% line share {share_on:.3}"
5296        );
5297    }
5298
5299    #[test]
5300    fn test_allocation_batch_emits_large_balanced_postings() {
5301        // SOTA-6: with allocation_batch_rate > 0, some JEs are large 1-to-many
5302        // allocation batches (source "AB", many cost-center-spread lines, still
5303        // balanced); rate 0.0 emits none. Reversals are disabled to isolate the
5304        // allocation process (which shares the recent-JE buffer).
5305        fn run(rate: Option<f64>) -> (usize, bool, usize) {
5306            let mut coa_gen = ChartOfAccountsGenerator::new(
5307                CoAComplexity::Small,
5308                IndustrySector::Manufacturing,
5309                23,
5310            );
5311            let coa = Arc::new(coa_gen.generate());
5312            let cfg = TransactionConfig {
5313                allocation_batch_rate: rate,
5314                reversal_rate: Some(0.0),
5315                ..TransactionConfig::default()
5316            };
5317            let mut g = JournalEntryGenerator::new_with_params(
5318                cfg,
5319                coa,
5320                vec!["1000".to_string()],
5321                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5322                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5323                23,
5324            )
5325            .with_persona_errors(false)
5326            .with_batching(false);
5327            let mut batches = 0usize;
5328            let mut balanced = true;
5329            let mut max_distinct_cc = 0usize;
5330            for _ in 0..2000 {
5331                let e = g.generate();
5332                if !e.is_balanced() {
5333                    balanced = false;
5334                }
5335                if e.header.sap_source_code.as_deref() == Some("AB") {
5336                    batches += 1;
5337                    assert!(
5338                        e.lines.len() >= ALLOCATION_MIN_TARGETS as usize,
5339                        "allocation batch should be large, got {} lines",
5340                        e.lines.len()
5341                    );
5342                    let ccs: std::collections::HashSet<String> = e
5343                        .lines
5344                        .iter()
5345                        .filter_map(|l| l.cost_center.clone())
5346                        .collect();
5347                    max_distinct_cc = max_distinct_cc.max(ccs.len());
5348                }
5349            }
5350            (batches, balanced, max_distinct_cc)
5351        }
5352        let (on, bal_on, cc) = run(Some(0.10));
5353        let (off, bal_off, _) = run(Some(0.0));
5354        assert!(
5355            bal_on && bal_off,
5356            "all entries balanced incl. allocation batches"
5357        );
5358        assert_eq!(off, 0, "rate 0.0 emits no allocation batches, got {off}");
5359        assert!(on > 0, "rate 0.10 should emit allocation batches, got {on}");
5360        assert!(
5361            cc > 1,
5362            "allocation should spread across multiple cost centers, got {cc}"
5363        );
5364    }
5365
5366    #[test]
5367    fn test_derived_id_processes_keep_document_ids_unique() {
5368        // SOTA-5/6 regression: reversals and allocation batches mint derived ids
5369        // (`base ^ salt`). Reusing the same base would duplicate an id — the
5370        // failure `test_document_reference_integrity` caught. With both processes
5371        // at high rates, every emitted document id must still be unique.
5372        let mut coa_gen =
5373            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 31);
5374        let coa = Arc::new(coa_gen.generate());
5375        let cfg = TransactionConfig {
5376            reversal_rate: Some(0.15),
5377            allocation_batch_rate: Some(0.10),
5378            ..TransactionConfig::default()
5379        };
5380        let mut g = JournalEntryGenerator::new_with_params(
5381            cfg,
5382            coa,
5383            vec!["1000".to_string()],
5384            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5385            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5386            31,
5387        )
5388        .with_persona_errors(false)
5389        .with_batching(false);
5390        let mut ids = std::collections::HashSet::new();
5391        let n = 3000;
5392        for _ in 0..n {
5393            let e = g.generate();
5394            assert!(
5395                ids.insert(e.header.document_id),
5396                "duplicate document id {} (derived-id collision)",
5397                e.header.document_id
5398            );
5399        }
5400        assert_eq!(ids.len(), n, "all {n} document ids unique");
5401    }
5402
5403    #[test]
5404    fn test_business_unit_rolls_up_from_cost_center() {
5405        // SOTA-3: with the dimension on (default), a line that has a cost center
5406        // (or, as fallback, a profit center) also carries a business_unit that is
5407        // a deterministic roll-up of that dimension (same value → same BU, in
5408        // BU01..BU11); with it off, BU is empty.
5409        fn run(enabled: Option<bool>) -> (usize, usize, bool, bool) {
5410            let mut coa_gen = ChartOfAccountsGenerator::new(
5411                CoAComplexity::Medium,
5412                IndustrySector::Manufacturing,
5413                19,
5414            );
5415            let coa = Arc::new(coa_gen.generate());
5416            let cfg = TransactionConfig {
5417                business_unit_dimension: enabled,
5418                ..TransactionConfig::default()
5419            };
5420            let mut g = JournalEntryGenerator::new_with_params(
5421                cfg,
5422                coa,
5423                vec!["1000".to_string()],
5424                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5425                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5426                19,
5427            )
5428            .with_persona_errors(false)
5429            .with_batching(false);
5430            let mut dim_lines = 0usize;
5431            let mut bu_lines = 0usize;
5432            let mut consistent = true; // BU present ⇒ matches the roll-up of its CC/PC
5433            let mut well_formed = true; // BU in BU01..BU11
5434            let mut dim_to_bu: std::collections::HashMap<String, String> =
5435                std::collections::HashMap::new();
5436            for _ in 0..600 {
5437                let e = g.generate();
5438                for l in &e.lines {
5439                    // BU rolls up from the cost center, or profit center as fallback.
5440                    let dim = l.cost_center.as_deref().or(l.profit_center.as_deref());
5441                    if dim.is_some() {
5442                        dim_lines += 1;
5443                    }
5444                    if let Some(bu) = &l.business_unit {
5445                        bu_lines += 1;
5446                        let d = dim.unwrap_or_default().to_string();
5447                        if bu != &JournalEntryGenerator::business_unit_for_dimension(&d) {
5448                            consistent = false;
5449                        }
5450                        // stable mapping across the run
5451                        if dim_to_bu
5452                            .insert(d, bu.clone())
5453                            .is_some_and(|prev| &prev != bu)
5454                        {
5455                            consistent = false;
5456                        }
5457                        let n_ok = bu.strip_prefix("BU").and_then(|d| d.parse::<u32>().ok());
5458                        if !matches!(n_ok, Some(1..=11)) {
5459                            well_formed = false;
5460                        }
5461                    }
5462                }
5463            }
5464            (dim_lines, bu_lines, consistent, well_formed)
5465        }
5466        let (dim_on, bu_on, consistent, well_formed) = run(Some(true));
5467        let (_, bu_off, _, _) = run(Some(false));
5468        assert!(
5469            dim_on > 0 && bu_on > 0,
5470            "BU should be populated where CC/PC is"
5471        );
5472        assert_eq!(
5473            dim_on, bu_on,
5474            "every CC/PC-bearing line gets a BU ({dim_on} dim vs {bu_on} BU)"
5475        );
5476        assert!(
5477            consistent,
5478            "BU must be the deterministic roll-up of its CC/PC"
5479        );
5480        assert!(well_formed, "BU codes must be BU01..BU11");
5481        assert_eq!(bu_off, 0, "dimension off ⇒ no business_unit, got {bu_off}");
5482    }
5483
5484    #[test]
5485    fn test_foreign_currency_sap_style() {
5486        // SOTA-4: with foreign_currency_rate > 0, some JEs post in a foreign
5487        // document currency. The ledger amounts (debit/credit) stay company
5488        // currency and the JE still balances; the foreign value lands in
5489        // transaction_amount and balances in the transaction currency too. rate
5490        // 0.0 → all company-currency. Reversals/allocations off to isolate.
5491        fn run(rate: Option<f64>) -> (usize, bool, bool) {
5492            let mut coa_gen = ChartOfAccountsGenerator::new(
5493                CoAComplexity::Small,
5494                IndustrySector::Manufacturing,
5495                29,
5496            );
5497            let coa = Arc::new(coa_gen.generate());
5498            let cfg = TransactionConfig {
5499                foreign_currency_rate: rate,
5500                reversal_rate: Some(0.0),
5501                allocation_batch_rate: Some(0.0),
5502                ..TransactionConfig::default()
5503            };
5504            let mut g = JournalEntryGenerator::new_with_params(
5505                cfg,
5506                coa,
5507                vec!["1000".to_string()],
5508                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5509                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5510                29,
5511            )
5512            .with_persona_errors(false)
5513            .with_batching(false);
5514            let mut foreign = 0usize;
5515            let mut ledger_ok = true; // debit == credit (company ledger)
5516            let mut txn_ok = true; // foreign lines carry transaction_amount + balance in txn ccy
5517            for _ in 0..1500 {
5518                let e = g.generate();
5519                if !e.is_balanced() {
5520                    ledger_ok = false;
5521                }
5522                if e.header.currency != "USD" {
5523                    foreign += 1;
5524                    if !e.lines.iter().all(|l| l.transaction_amount.is_some()) {
5525                        txn_ok = false;
5526                    }
5527                    let td: Decimal = e
5528                        .lines
5529                        .iter()
5530                        .filter(|l| l.debit_amount > Decimal::ZERO)
5531                        .filter_map(|l| l.transaction_amount)
5532                        .sum();
5533                    let tc: Decimal = e
5534                        .lines
5535                        .iter()
5536                        .filter(|l| l.credit_amount > Decimal::ZERO)
5537                        .filter_map(|l| l.transaction_amount)
5538                        .sum();
5539                    // tolerate per-line cent rounding (≤ n_lines half-cents)
5540                    let tol = Decimal::new(e.lines.len() as i64, 2);
5541                    if (td - tc).abs() > tol {
5542                        txn_ok = false;
5543                    }
5544                }
5545            }
5546            (foreign, ledger_ok, txn_ok)
5547        }
5548        let (fon, lbal_on, tbal_on) = run(Some(0.20));
5549        let (foff, lbal_off, _) = run(Some(0.0));
5550        assert!(
5551            lbal_on && lbal_off,
5552            "ledger balance (debit==credit) preserved in both modes"
5553        );
5554        assert!(
5555            fon > 0,
5556            "rate 0.20 should produce foreign-currency JEs, got {fon}"
5557        );
5558        assert_eq!(foff, 0, "rate 0.0 ⇒ no foreign JEs, got {foff}");
5559        assert!(
5560            tbal_on,
5561            "foreign JEs carry transaction_amount + balance in the transaction currency"
5562        );
5563    }
5564
5565    #[test]
5566    fn test_created_date_before_posting() {
5567        let mut coa_gen =
5568            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5569        let coa = Arc::new(coa_gen.generate());
5570
5571        let mut je_gen = JournalEntryGenerator::new_with_params(
5572            TransactionConfig::default(),
5573            coa,
5574            vec!["1000".to_string()],
5575            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5576            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5577            42,
5578        )
5579        .with_persona_errors(false);
5580
5581        for _ in 0..500 {
5582            let entry = je_gen.generate();
5583
5584            if let Some(created_date) = entry.header.created_date {
5585                let created_naive_date = created_date.date();
5586                assert!(
5587                    created_naive_date <= entry.header.posting_date,
5588                    "created_date ({}) should be <= posting_date ({})",
5589                    created_naive_date,
5590                    entry.header.posting_date,
5591                );
5592            }
5593        }
5594    }
5595
5596    /// SP3.5b — verify that `apply_calibration_step` mutates the generator's
5597    /// amount_sampler when a `"amounts.lognormal_sigma"` step is applied, and
5598    /// that `"amounts.round_dollar_share"` likewise updates the probability.
5599    #[test]
5600    fn apply_calibration_step_updates_lognormal_sigma() {
5601        let mut coa_gen =
5602            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5603        let coa = Arc::new(coa_gen.generate());
5604
5605        let mut gen = JournalEntryGenerator::new_with_params(
5606            TransactionConfig::default(),
5607            coa,
5608            vec!["1000".to_string()],
5609            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5610            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5611            42,
5612        );
5613
5614        let baseline_sigma = gen.amount_sampler.lognormal_sigma();
5615
5616        let step_sigma = crate::velocity_calibrator::CalibrationStep {
5617            rule_id: "R6".to_string(),
5618            parameter: "amounts.lognormal_sigma".to_string(),
5619            delta: 0.01,
5620            new_value: baseline_sigma + 0.01,
5621        };
5622        gen.apply_calibration_step(&step_sigma);
5623        assert!(
5624            (gen.amount_sampler.lognormal_sigma() - (baseline_sigma + 0.01)).abs() < 1e-9,
5625            "lognormal_sigma should be updated to {}",
5626            baseline_sigma + 0.01
5627        );
5628
5629        let baseline_round = gen.amount_sampler.round_number_probability();
5630        let step_round = crate::velocity_calibrator::CalibrationStep {
5631            rule_id: "R9".to_string(),
5632            parameter: "amounts.round_dollar_share".to_string(),
5633            delta: -0.005,
5634            new_value: (baseline_round - 0.005).max(0.0),
5635        };
5636        gen.apply_calibration_step(&step_round);
5637        let expected = (baseline_round - 0.005).max(0.0).clamp(0.0, 1.0);
5638        assert!(
5639            (gen.amount_sampler.round_number_probability() - expected).abs() < 1e-9,
5640            "round_number_probability should be updated to {}",
5641            expected
5642        );
5643    }
5644
5645    #[test]
5646    fn master_data_resolver_fills_every_pii_kind() {
5647        use datasynth_core::distributions::text_taxonomy::{
5648            PiiPlaceholderKind, PlaceholderResolver,
5649        };
5650        let mut r = MasterDataResolver {
5651            companies: vec!["Acme AG".to_string()],
5652            persons: vec!["Hans Muster".to_string()],
5653            streets: vec!["Hauptstrasse 1".to_string()],
5654            patients: vec!["Patient X".to_string()],
5655        };
5656        let mut rng = rand::rng();
5657        assert_eq!(r.resolve(PiiPlaceholderKind::Company, &mut rng), "Acme AG");
5658        assert_eq!(
5659            r.resolve(PiiPlaceholderKind::Person, &mut rng),
5660            "Hans Muster"
5661        );
5662        assert_eq!(
5663            r.resolve(PiiPlaceholderKind::Street, &mut rng),
5664            "Hauptstrasse 1"
5665        );
5666        assert_eq!(
5667            r.resolve(PiiPlaceholderKind::Patient, &mut rng),
5668            "Patient X"
5669        );
5670    }
5671
5672    #[test]
5673    fn master_data_resolver_empty_pool_falls_back() {
5674        use datasynth_core::distributions::text_taxonomy::{
5675            PiiPlaceholderKind, PlaceholderResolver,
5676        };
5677        let mut r = MasterDataResolver::default();
5678        let mut rng = rand::rng();
5679        let v = r.resolve(PiiPlaceholderKind::Company, &mut rng);
5680        assert!(!v.is_empty());
5681    }
5682
5683    /// Pin the shape invariant on `synthetic_patient_pool`: each entry, once
5684    /// filled into the canonical `*{patient} G:{date}…` template the corpus
5685    /// DZ/RG/RS classes use, must not introduce a *structural* residual-PII
5686    /// shape. Regression guard for the JE_79-class smoke failure: the old pool
5687    /// (`"B. Muster"`, `"A. Beispiel"`, …) shaped each fill as
5688    /// `<initial>. <surname>` which `RE_INITIAL_SURNAME` flags.
5689    ///
5690    /// NB: the `given_name` pattern is deliberately EXCLUDED here. These are
5691    /// synthetic *fill* values that are name-shaped by design (they fill
5692    /// `{patient}`); `given_name` is a template-scan signal for un-tokenized
5693    /// corpus names, not a check on legitimate synthetic output.
5694    #[test]
5695    fn synthetic_patient_pool_entries_pass_residual_scan() {
5696        use datasynth_core::distributions::text_taxonomy::PlaceholderGrammar;
5697        for name in synthetic_patient_pool("de_CH") {
5698            let filled = format!("*{name} G:2024-01-15 E:2024-01-20 A:2024-02-01");
5699            let structural: Vec<_> = PlaceholderGrammar::residual_pii_scan(&filled)
5700                .into_iter()
5701                .filter(|h| h.pattern != "given_name")
5702                .collect();
5703            assert!(
5704                structural.is_empty(),
5705                "synthetic patient name {name:?} fills to PII-shaped {filled:?}: {structural:?}"
5706            );
5707        }
5708    }
5709
5710    #[test]
5711    fn master_data_resolver_fallbacks_are_non_empty_and_placeholder_free() {
5712        use datasynth_core::distributions::text_taxonomy::{
5713            PiiPlaceholderKind, PlaceholderResolver,
5714        };
5715        // Verify fallback constants for every kind are non-empty and contain
5716        // no `{…}` literal placeholders (the resolver must never leak the
5717        // unfilled placeholder token into emitted text).
5718        let mut r = MasterDataResolver::default();
5719        let mut rng = rand::rng();
5720        for kind in [
5721            PiiPlaceholderKind::Company,
5722            PiiPlaceholderKind::Person,
5723            PiiPlaceholderKind::Street,
5724            PiiPlaceholderKind::Patient,
5725        ] {
5726            let v = r.resolve(kind, &mut rng);
5727            assert!(!v.is_empty(), "fallback for {kind:?} must be non-empty");
5728            assert!(
5729                !v.contains('{'),
5730                "fallback for {kind:?} must not contain a placeholder token"
5731            );
5732        }
5733    }
5734}