Skip to main content

datasynth_generators/
je_generator.rs

1//! Journal Entry generator with statistical distributions.
2
3use chrono::{Datelike, NaiveDate, Timelike};
4use datasynth_core::utils::seeded_rng;
5use rand::prelude::*;
6use rand_chacha::ChaCha8Rng;
7use rust_decimal::prelude::*;
8use rust_decimal::Decimal;
9use std::sync::{Arc, LazyLock};
10
11use tracing::debug;
12
13use datasynth_config::schema::{
14    AdvancedDistributionConfig, FraudConfig, GeneratorConfig, MixtureDistributionType,
15    TemplateConfig, TemporalPatternsConfig, TransactionConfig,
16};
17use datasynth_core::distributions::{
18    AdvancedAmountSampler, BusinessDayCalculator, CrossDayConfig, DriftAdjustments, DriftConfig,
19    DriftController, EventType, IndustryAmountProfile, IndustryType, LagDistribution,
20    PeriodEndConfig, PeriodEndDynamics, PeriodEndModel, ProcessingLagCalculator,
21    ProcessingLagConfig, *,
22};
23use datasynth_core::models::*;
24use datasynth_core::templates::{
25    descriptions::DescriptionContext, DescriptionGenerator, ReferenceGenerator, ReferenceType,
26};
27use datasynth_core::traits::Generator;
28use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
29use datasynth_core::CountryPack;
30
31use crate::company_selector::WeightedCompanySelector;
32use crate::user_generator::{UserGenerator, UserGeneratorConfig};
33
34use datasynth_core::distributions::text_taxonomy::{PiiPlaceholderKind, PlaceholderResolver};
35
36/// T2-D Lever 1: the default generic SAP source-mix, used when industry priors
37/// are not loaded but `transactions.synthetic_source_codes` is on (the default).
38/// Built once. See [`SourceMixPrior::sap_default`] and experiments/ml/FINDINGS.md §6.
39static DEFAULT_SOURCE_MIX: LazyLock<
40    datasynth_core::distributions::behavioral_priors::SourceMixPrior,
41> = LazyLock::new(datasynth_core::distributions::behavioral_priors::SourceMixPrior::sap_default);
42
/// SOTA-5: share of JEs emitted as reversals/corrections when
/// `transactions.reversal_rate` is not configured.
///
/// Chosen so the corpus reversal proxy (~0.10) is matched: the proxy only
/// detects ~85% of reversals, and a rate of 0.04 measured just ~0.034.
const DEFAULT_REVERSAL_RATE: f64 = 0.10;
48
/// SOTA-6: share of JEs emitted as allocation/assessment batches when
/// `transactions.allocation_batch_rate` is not configured.
///
/// Deliberately small: every batch carries ~30-80 lines, so the resulting
/// line-share (~8%) and the lines-per-JE tail line up with the corpus's
/// large-batch postings (FINDINGS §8: AB docs of ~52 lines drive the lpje
/// std). A value of `0.0` disables the process.
const DEFAULT_ALLOCATION_RATE: f64 = 0.008;

/// SOTA-4: foreign document currencies, each paired with its company-currency
/// rate (company-currency units per 1 unit of the document currency).
///
/// The rates are synthetic but plausible; they are not market data.
const FOREIGN_CCYS: &[(&str, f64)] = &[
    ("EUR", 1.09),
    ("GBP", 1.27),
    ("CHF", 1.12),
    ("CAD", 0.74),
    ("JPY", 0.0068),
    ("AUD", 0.66),
    ("CNY", 0.14),
];

/// SOTA-6: smallest number of target (cost-center) lines an allocation batch
/// explodes into (inclusive). Together with [`ALLOCATION_MAX_TARGETS`] the
/// range sits near the corpus AB mean (~52).
const ALLOCATION_MIN_TARGETS: u32 = 30;
/// SOTA-6: largest number of target (cost-center) lines an allocation batch
/// explodes into (inclusive).
const ALLOCATION_MAX_TARGETS: u32 = 80;
70
/// SOTA-2: exponent `s` of the Zipf power-law used for hot accounts.
///
/// With `s = 2.0` the top-10% of accounts in a pool carry ~92-96% of that
/// pool's lines for realistic pool sizes (N≈60-150), matching the corpus
/// account-activity Pareto (~0.95).
const ZIPF_ALPHA: f64 = 2.0;

/// Largest pool size covered by the precomputed [`ZIPF_CUM`] table. Bigger
/// pools (none realistic for a single account-type) fall back to the uniform
/// draw.
const ZIPF_CAP: usize = 16_384;

/// SOTA-2: table of cumulative partial sums, built once on first access:
/// `ZIPF_CUM[k] = Σ_{i=1..k} i^-ZIPF_ALPHA`, with `ZIPF_CUM[0] = 0`.
///
/// The table has `ZIPF_CAP + 1` entries. It lets
/// [`JournalEntryGenerator::power_law_index`] normalise with an O(1) lookup
/// of `ZIPF_CUM[n]` and inverse-CDF sample via binary search, avoiding an
/// O(n) sum per draw.
static ZIPF_CUM: LazyLock<Vec<f64>> = LazyLock::new(|| {
    let mut table = Vec::with_capacity(ZIPF_CAP + 1);
    table.push(0.0_f64);
    // Accumulate rank by rank, in ascending order, so the floating-point
    // rounding of every entry is reproducible.
    table.extend((1..=ZIPF_CAP).scan(0.0_f64, |running, rank| {
        *running += 1.0 / (rank as f64).powf(ZIPF_ALPHA);
        Some(*running)
    }));
    table
});
91
/// SP6 — value pools used to turn PII placeholders into concrete strings
/// taken from the run's synthetic master data.
///
/// One pool exists per placeholder kind. When a pool is empty, the
/// [`PlaceholderResolver`] implementation returns an obviously-synthetic
/// constant instead, so output never carries an empty span or a literal
/// `{…}` token.
#[derive(Debug, Default)]
pub struct MasterDataResolver {
    /// Values for `{company}` (vendor/customer names).
    pub companies: Vec<String>,
    /// Values for `{person}` (user display names).
    pub persons: Vec<String>,
    /// Values for `{street}` (addresses). Empty for now: there is no address
    /// master entity.
    pub streets: Vec<String>,
    /// Values for `{patient}`. No master entity exists for patients, so this
    /// is filled from a synthetic-person pool.
    pub patients: Vec<String>,
}
105
106impl PlaceholderResolver for MasterDataResolver {
107    fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String {
108        use rand::RngExt;
109        let (pool, fallback): (&Vec<String>, &str) = match kind {
110            PiiPlaceholderKind::Company => (&self.companies, "Synthetic Company AG"),
111            PiiPlaceholderKind::Person => (&self.persons, "Synthetic Person"),
112            PiiPlaceholderKind::Street => (&self.streets, "Synthetic Street 1"),
113            PiiPlaceholderKind::Patient => (&self.patients, "Synthetic Patient"),
114        };
115        if pool.is_empty() {
116            return fallback.to_string();
117        }
118        let idx = rng.random_range(0..pool.len());
119        pool[idx].clone()
120    }
121}
122
/// Returns a small, fixed pool of obviously-synthetic person names used to
/// fill `{patient}` placeholders (no master entity exists for patients).
///
/// `_locale` is only a hint and is currently ignored: for SP6 a single
/// neutral set is sufficient, so every locale gets the same names.
///
/// **Shape invariant:** no entry may have the `<initial>. <surname>` or
/// `<surname> <initial>.` shape, because the SP6 `residual_pii_scan` flags
/// those as `initial_surname` / `surname_initial` PII patterns. The smoke
/// test asserts that the canonical `*{patient} G:…` template fills to a
/// scan-clean string; an entry such as `"B. Muster"` would regress that.
/// Prefer two-word `<First> <Last>` names without periods (covered by
/// `synthetic_patient_pool_entries_pass_residual_scan`).
fn synthetic_patient_pool(_locale: &str) -> Vec<String> {
    const NAMES: [&str; 8] = [
        "Alex Beispiel",
        "Bea Muster",
        "Cleo Synthetic",
        "Demo Example",
        "Erik Probe",
        "Fred Testperson",
        "Gerda Platzhalter",
        "Hans Demo",
    ];

    NAMES.iter().copied().map(String::from).collect()
}
149
150/// Generator for realistic journal entries.
151pub struct JournalEntryGenerator {
152    rng: ChaCha8Rng,
153    /// T2-D: independent RNG stream for the default source-mix draw, so
154    /// populating `sap_source_code` on the no-priors path never perturbs the
155    /// main `rng` — all other fields stay byte-identical to the legacy output.
156    source_mix_rng: ChaCha8Rng,
157    /// SOTA-1: per-(company, doc-type) library of reusable JE account archetypes
158    /// `(debit_accounts, credit_accounts)` for the recurring-templates process.
159    /// Capped per key; reused on the no-priors path so standard postings recur.
160    recurring_archetypes:
161        std::collections::HashMap<(String, String), Vec<(Vec<String>, Vec<String>)>>,
162    /// SOTA-1: independent RNG for the template-reuse roll + archetype pick, so
163    /// templating never perturbs the main `rng` (amounts/dates/counts unchanged).
164    template_rng: ChaCha8Rng,
165    /// SOTA-5: ring buffer of recent (complete) JEs a later reversal can offset.
166    /// Storing the whole JE lets the reversal inherit its source code, line text,
167    /// audit flags, etc. (only dr/cr + the header markers are changed).
168    reversal_buffer: Vec<JournalEntry>,
169    /// SOTA-5: independent RNG for reversal rolls, so reversals intersperse
170    /// without perturbing the main `rng` (normal JEs stay byte-identical).
171    reversal_rng: ChaCha8Rng,
172    /// SOTA-2: independent RNG for the hot-account power-law override, so the
173    /// account-activity Pareto (a few accounts carry most lines, as in the
174    /// corpus) is concentrated without perturbing the main `rng` — the uniform
175    /// `.choose` draw is still consumed, only its *result* is replaced.
176    account_rng: ChaCha8Rng,
177    /// SOTA-6: independent RNG for the allocation/assessment-batch process, so
178    /// the large 1-to-many postings (the corpus's lines-per-JE tail) intersperse
179    /// without perturbing the main `rng` (normal JEs stay byte-identical).
180    allocation_rng: ChaCha8Rng,
181    /// SOTA-4: independent RNG for the foreign-currency post-process, so the
182    /// document-currency tagging never perturbs the main `rng` (company-currency
183    /// JEs stay byte-identical).
184    fx_rng: ChaCha8Rng,
185    /// SOTA-8: independent RNG for the source-conditional Dirichlet account-pair
186    /// sampler. Built lazily (one `SourcePool` per observed source); when the
187    /// feature is off the sampler stays None and the main RNG / `account_rng`
188    /// stream is byte-identical.
189    cond_pair_rng: ChaCha8Rng,
190    /// SOTA-8: per-source Dirichlet PMFs over per-source account pools.
191    /// Lazy-built on first JE whose source isn't yet pooled.
192    cond_pair_sampler: Option<
193        datasynth_core::distributions::source_conditional_pair::SourceConditionalPairSampler,
194    >,
195    /// SOTA-8: SAP source code of the JE currently being constructed, so the
196    /// `select_*_account` helpers can consult the per-source pool.
197    current_je_source: Option<String>,
198    seed: u64,
199    config: TransactionConfig,
200    coa: Arc<ChartOfAccounts>,
201    companies: Vec<String>,
202    company_selector: WeightedCompanySelector,
203    line_sampler: LineItemSampler,
204    amount_sampler: AmountSampler,
205    temporal_sampler: TemporalSampler,
206    start_date: NaiveDate,
207    end_date: NaiveDate,
208    count: u64,
209    uuid_factory: DeterministicUuidFactory,
210    // Enhanced features
211    user_pool: Option<UserPool>,
212    description_generator: DescriptionGenerator,
213    reference_generator: ReferenceGenerator,
214    template_config: TemplateConfig,
215    vendor_pool: VendorPool,
216    customer_pool: CustomerPool,
217    // Material pool for realistic material references
218    material_pool: Option<MaterialPool>,
219    // Cost-center IDs sourced from the generated cost-centers master so
220    // `JE.cost_center` joins back to `cost_centers.id`.  Populated via
221    // [`with_cost_center_pool`] from the orchestrator after master-data
222    // generation; falls back to the hardcoded `COST_CENTER_POOL` const
223    // when empty (configs that skip master-data generation).
224    cost_center_pool: Vec<String>,
225    // Profit-center IDs sourced from the generated profit-centers master
226    // so `JE.profit_center` joins back to `profit_centers.id`.  Same
227    // population semantics as `cost_center_pool`.
228    profit_center_pool: Vec<String>,
229    // Flag indicating whether we're using real master data vs defaults
230    using_real_master_data: bool,
231    // Fraud generation
232    fraud_config: FraudConfig,
233    // Persona-based error injection
234    persona_errors_enabled: bool,
235    // Approval threshold enforcement
236    approval_enabled: bool,
237    approval_threshold: rust_decimal::Decimal,
238    // SOD violation rate for approval tracking (0.0 to 1.0)
239    sod_violation_rate: f64,
240    // Batching behavior - humans often process similar items together
241    batch_state: Option<BatchState>,
242    // Temporal drift controller for simulating distribution changes over time
243    drift_controller: Option<DriftController>,
244    // Temporal patterns components
245    business_day_calculator: Option<BusinessDayCalculator>,
246    processing_lag_calculator: Option<ProcessingLagCalculator>,
247    temporal_patterns_config: Option<TemporalPatternsConfig>,
248    // Business-process weights for the O2C/P2P/R2R/H2R/A2R volume mix. Must
249    // sum to 1.0 (validated by config schema). Default matches the legacy
250    // hard-coded 0.35/0.30/0.20/0.10/0.05 distribution.
251    business_process_weights: [(BusinessProcess, f64); 5],
252    // v3.4.0 advanced distributions (mixture models + industry profiles).
253    // None preserves v3.3.2 byte-for-byte behavior; populated only when the
254    // caller opts in via [`set_advanced_distributions`].
255    advanced_amount_sampler: Option<AdvancedAmountSampler>,
256    // v3.5.3+ conditional amount override. Populated when
257    // `config.distributions.conditional` contains an entry where
258    // `output_field == "amount"` and `input_field ∈ {"month",
259    // "quarter", "constant"}`. Applied *after* the fraud-pattern /
260    // advanced-sampler / legacy-sampler cascade on non-fraud entries
261    // so it can steer amounts by calendar context without disturbing
262    // fraud semantics.
263    conditional_amount_override: Option<datasynth_core::distributions::ConditionalSampler>,
264    // v3.5.4+ Gaussian copula for amount↔line_count correlation. When
265    // populated, each non-fraud JE draws a (u, v) pair; u nudges amount
266    // via a `(0.75 + 0.5*u)` multiplier and v biases line_count toward
267    // the upper/lower end of its range. Produces observable Spearman
268    // correlation without rewiring existing samplers for inverse-CDF.
269    correlation_copula: Option<datasynth_core::distributions::BivariateCopulaSampler>,
270    /// SP3 — opt-in industry priors. When `Some`, je_generator routes
271    /// timing/lines-per-JE/fanout/active-window through prior-driven samplers.
272    /// When `None`, behavior is identical to v5.11.
273    pub loaded_priors: Option<crate::priors_loader::LoadedPriors>,
274    /// SP3 T11 — accumulated IET days per document-type code.  Only used when
275    /// `loaded_priors.is_some()`.  Tracks the running day offset so
276    /// consecutive calls for the same source produce IET-spaced posting dates.
277    iet_day_accum: std::collections::HashMap<String, f64>,
278    /// v5.30 B1 Phase 2 — per-source burst-clustering state.  When a sampled IET
279    /// falls below `BURST_THRESHOLD_DAYS` and a probability gate fires, the
280    /// next 2-4 events for that source are deterministically clustered with
281    /// short IETs (0.25-1.5 days), giving the within-source IET sequence the
282    /// positive lag-1 autocorrelation the Sajja P1 metric measures.  Bypasses
283    /// the `|ρ| < 0.1` coupling gate in `ConditionalIETSampler` that the SP3
284    /// priors' weak day-resolution autocorrelation can't clear.
285    iet_burst_remaining: std::collections::HashMap<String, u8>,
286    /// SP3.12 — last TP value drawn per SAP source code.  Used by the TP motif
287    /// sampler to bias the next TP draw toward cluster-mates of the previous TP
288    /// on the same source, building triangle structure in the TP co-occurrence graph.
289    last_tp_by_source: std::collections::HashMap<String, String>,
290    /// SP3.4 — when Some, observes each emitted line and applies calibration
291    /// steps to the generator's tunable parameters.
292    pub velocity_calibrator: Option<crate::velocity_calibrator::VelocityCalibrator>,
293    /// SP6 — PII placeholder resolver populated from the run's synthetic master
294    /// data (vendors, customers, users). Rebuilt once via
295    /// [`refresh_md_resolver`] before JE generation begins.
296    md_resolver: MasterDataResolver,
297}
298
299const DEFAULT_BUSINESS_PROCESS_WEIGHTS: [(BusinessProcess, f64); 5] = [
300    (BusinessProcess::O2C, 0.35),
301    (BusinessProcess::P2P, 0.30),
302    (BusinessProcess::R2R, 0.20),
303    (BusinessProcess::H2R, 0.10),
304    (BusinessProcess::A2R, 0.05),
305];
306
307/// Map the schema-level [`datasynth_config::schema::IndustryProfileType`]
308/// onto the distributions-layer [`IndustryType`], then return that industry's
309/// pre-configured `sales_amounts` mixture. Used as a fallback when the
310/// caller enables `distributions.amounts` but supplies no components.
311/// Per-entry context channels for conditional-distribution overrides.
312///
313/// v4.1.0+ supported `input_field` values:
314///
315///   - `"month"` — posting-date month (1..=12)
316///   - `"quarter"` — posting-date quarter (1..=4)
317///   - `"year"` — posting-date year (e.g. 2026.0)
318///   - `"day_of_week"` — 1 (Mon) .. 7 (Sun)
319///   - `"day_of_month"` — 1..=31
320///   - `"day_of_year"` — 1..=366
321///   - `"week_of_year"` — 1..=53
322///   - `"is_period_end"` — 1.0 when posting_date is the last business
323///     day of the month, else 0.0
324///   - `"is_quarter_end"` — 1.0 when posting_date is in a quarter-end
325///     month AND is the last business day, else 0.0
326///   - `"is_year_end"` — 1.0 when posting_date is in December AND is
327///     the last business day, else 0.0
328///   - `"constant"` / empty — always 0.0 (treats as unconditional)
329///
330/// Unsupported values cause the conditional rule to be silently ignored
331/// to keep runtime robust against user typos.
332impl JournalEntryGenerator {
333    fn supported_conditional_input(field: &str) -> bool {
334        matches!(
335            field,
336            "month"
337                | "quarter"
338                | "year"
339                | "day_of_week"
340                | "day_of_month"
341                | "day_of_year"
342                | "week_of_year"
343                | "is_period_end"
344                | "is_quarter_end"
345                | "is_year_end"
346                | "constant"
347                | ""
348        )
349    }
350
351    fn conditional_input_value(&self, posting_date: chrono::NaiveDate) -> f64 {
352        let input_field = match self
353            .conditional_amount_override
354            .as_ref()
355            .map(|s| s.config().input_field.as_str())
356        {
357            Some(f) => f,
358            None => return 0.0,
359        };
360
361        let is_last_business_day = |d: chrono::NaiveDate| -> bool {
362            // Last day-of-month → is_period_end. Handles Feb/leap-year
363            // via chrono's num_days_from_ce roundabout; simpler path:
364            // if adding 1 day moves to a different month, this is EOM.
365            let next = d.succ_opt();
366            match next {
367                Some(n) => n.month() != d.month(),
368                None => true,
369            }
370        };
371
372        match input_field {
373            "month" => posting_date.month() as f64,
374            "quarter" => ((posting_date.month() - 1) / 3 + 1) as f64,
375            "year" => posting_date.year() as f64,
376            "day_of_week" => posting_date.weekday().number_from_monday() as f64,
377            "day_of_month" => posting_date.day() as f64,
378            "day_of_year" => posting_date.ordinal() as f64,
379            "week_of_year" => posting_date.iso_week().week() as f64,
380            "is_period_end" => f64::from(u8::from(is_last_business_day(posting_date))),
381            "is_quarter_end" => {
382                let m = posting_date.month();
383                let is_q_month = matches!(m, 3 | 6 | 9 | 12);
384                f64::from(u8::from(is_q_month && is_last_business_day(posting_date)))
385            }
386            "is_year_end" => f64::from(u8::from(
387                posting_date.month() == 12 && is_last_business_day(posting_date),
388            )),
389            _ => 0.0,
390        }
391    }
392}
393
394fn industry_profile_to_log_normal(
395    p: datasynth_config::schema::IndustryProfileType,
396) -> datasynth_core::distributions::LogNormalMixtureConfig {
397    use datasynth_config::schema::IndustryProfileType as P;
398    let industry = match p {
399        P::Retail => IndustryType::Retail,
400        P::Manufacturing => IndustryType::Manufacturing,
401        P::FinancialServices => IndustryType::FinancialServices,
402        P::Healthcare => IndustryType::Healthcare,
403        P::Technology => IndustryType::Technology,
404    };
405    IndustryAmountProfile::for_industry(industry).sales_amounts
406}
407
408/// State for tracking batch processing behavior.
409///
410/// When humans process transactions, they often batch similar items together
411/// (e.g., processing all invoices from one vendor, entering similar expenses).
412#[derive(Clone)]
413struct BatchState {
414    /// The base entry template to vary
415    base_account_number: String,
416    base_amount: rust_decimal::Decimal,
417    base_business_process: Option<BusinessProcess>,
418    base_posting_date: NaiveDate,
419    /// Remaining entries in this batch
420    remaining: u8,
421}
422
423impl JournalEntryGenerator {
424    /// Create a new journal entry generator.
425    pub fn new_with_params(
426        config: TransactionConfig,
427        coa: Arc<ChartOfAccounts>,
428        companies: Vec<String>,
429        start_date: NaiveDate,
430        end_date: NaiveDate,
431        seed: u64,
432    ) -> Self {
433        Self::new_with_full_config(
434            config,
435            coa,
436            companies,
437            start_date,
438            end_date,
439            seed,
440            TemplateConfig::default(),
441            None,
442        )
443    }
444
445    /// Create a new journal entry generator with full configuration.
446    #[allow(clippy::too_many_arguments)]
447    pub fn new_with_full_config(
448        config: TransactionConfig,
449        coa: Arc<ChartOfAccounts>,
450        companies: Vec<String>,
451        start_date: NaiveDate,
452        end_date: NaiveDate,
453        seed: u64,
454        template_config: TemplateConfig,
455        user_pool: Option<UserPool>,
456    ) -> Self {
457        // Initialize user pool if not provided
458        let user_pool = user_pool.or_else(|| {
459            if template_config.names.generate_realistic_names {
460                let user_gen_config = UserGeneratorConfig {
461                    culture_distribution: vec![
462                        (
463                            datasynth_core::templates::NameCulture::WesternUs,
464                            template_config.names.culture_distribution.western_us,
465                        ),
466                        (
467                            datasynth_core::templates::NameCulture::Hispanic,
468                            template_config.names.culture_distribution.hispanic,
469                        ),
470                        (
471                            datasynth_core::templates::NameCulture::German,
472                            template_config.names.culture_distribution.german,
473                        ),
474                        (
475                            datasynth_core::templates::NameCulture::French,
476                            template_config.names.culture_distribution.french,
477                        ),
478                        (
479                            datasynth_core::templates::NameCulture::Chinese,
480                            template_config.names.culture_distribution.chinese,
481                        ),
482                        (
483                            datasynth_core::templates::NameCulture::Japanese,
484                            template_config.names.culture_distribution.japanese,
485                        ),
486                        (
487                            datasynth_core::templates::NameCulture::Indian,
488                            template_config.names.culture_distribution.indian,
489                        ),
490                    ],
491                    email_domain: template_config.names.email_domain.clone(),
492                    generate_realistic_names: true,
493                };
494                let mut user_gen = UserGenerator::with_config(seed + 100, user_gen_config);
495                Some(user_gen.generate_standard(&companies))
496            } else {
497                None
498            }
499        });
500
501        // Initialize reference generator
502        let mut ref_gen = ReferenceGenerator::new(
503            start_date.year(),
504            companies
505                .first()
506                .map(std::string::String::as_str)
507                .unwrap_or("1000"),
508        );
509        ref_gen.set_prefix(
510            ReferenceType::Invoice,
511            &template_config.references.invoice_prefix,
512        );
513        ref_gen.set_prefix(
514            ReferenceType::PurchaseOrder,
515            &template_config.references.po_prefix,
516        );
517        ref_gen.set_prefix(
518            ReferenceType::SalesOrder,
519            &template_config.references.so_prefix,
520        );
521
522        // Create weighted company selector (uniform weights for this constructor)
523        let company_selector = WeightedCompanySelector::uniform(companies.clone());
524
525        Self {
526            rng: seeded_rng(seed, 0),
527            source_mix_rng: seeded_rng(seed, 50_063),
528            recurring_archetypes: std::collections::HashMap::new(),
529            template_rng: seeded_rng(seed, 70_081),
530            reversal_buffer: Vec::new(),
531            reversal_rng: seeded_rng(seed, 90_017),
532            account_rng: seeded_rng(seed, 60_071),
533            allocation_rng: seeded_rng(seed, 80_023),
534            fx_rng: seeded_rng(seed, 70_093),
535            cond_pair_rng: seeded_rng(seed, 110_071),
536            cond_pair_sampler: None,
537            current_je_source: None,
538            seed,
539            config: config.clone(),
540            coa,
541            companies,
542            company_selector,
543            line_sampler: LineItemSampler::with_config(
544                seed + 1,
545                config.line_item_distribution.clone(),
546                config.even_odd_distribution.clone(),
547                config.debit_credit_distribution.clone(),
548            ),
549            amount_sampler: AmountSampler::with_config(seed + 2, config.amounts.clone()),
550            temporal_sampler: TemporalSampler::with_config(
551                seed + 3,
552                config.seasonality.clone(),
553                WorkingHoursConfig::default(),
554                Vec::new(),
555            ),
556            start_date,
557            end_date,
558            count: 0,
559            uuid_factory: DeterministicUuidFactory::new(seed, GeneratorType::JournalEntry),
560            user_pool,
561            description_generator: DescriptionGenerator::new(),
562            reference_generator: ref_gen,
563            template_config,
564            vendor_pool: VendorPool::standard(),
565            customer_pool: CustomerPool::standard(),
566            material_pool: None,
567            cost_center_pool: Vec::new(),
568            profit_center_pool: Vec::new(),
569            using_real_master_data: false,
570            fraud_config: FraudConfig::default(),
571            persona_errors_enabled: true, // Enable by default for realism
572            approval_enabled: true,       // Enable by default for realism
573            approval_threshold: rust_decimal::Decimal::new(10000, 0), // $10,000 default threshold
574            sod_violation_rate: 0.10,     // 10% default SOD violation rate
575            batch_state: None,
576            drift_controller: None,
577            // Always provide a basic BusinessDayCalculator so that weekend/holiday
578            // filtering is active even when temporal_patterns is not explicitly enabled.
579            business_day_calculator: Some(BusinessDayCalculator::new(HolidayCalendar::new(
580                Region::US,
581                start_date.year(),
582            ))),
583            processing_lag_calculator: None,
584            temporal_patterns_config: None,
585            business_process_weights: DEFAULT_BUSINESS_PROCESS_WEIGHTS,
586            advanced_amount_sampler: None,
587            conditional_amount_override: None,
588            correlation_copula: None,
589            loaded_priors: None,
590            iet_day_accum: std::collections::HashMap::new(),
591            iet_burst_remaining: std::collections::HashMap::new(),
592            last_tp_by_source: std::collections::HashMap::new(),
593            velocity_calibrator: None,
594            md_resolver: MasterDataResolver::default(),
595        }
596    }
597
598    /// Wire v3.4.0 advanced distributions. When the caller's config has
599    /// `distributions.enabled = true` AND `distributions.amounts.enabled =
600    /// true`, the journal-entry generator routes non-fraud amount sampling
601    /// through an [`AdvancedAmountSampler`] (log-normal or Gaussian mixture).
602    ///
603    /// When `distributions.industry_profile` is `Some`, the caller's
604    /// explicitly configured components override nothing — if the component
605    /// list is empty, the industry profile's `sales_amounts` mixture is used
606    /// instead. Explicit components always win.
607    ///
608    /// Returning `Ok(())` with no side effect is intentional for the
609    /// following no-op cases, so callers can unconditionally invoke this:
610    ///   - `config.enabled = false`
611    ///   - `config.amounts.enabled = false`
612    ///   - empty component list with no industry profile
613    ///
614    /// Errors propagate from mixture validation (e.g. weights not summing
615    /// to 1.0, non-positive sigma).
616    pub fn set_advanced_distributions(
617        &mut self,
618        config: &AdvancedDistributionConfig,
619        seed: u64,
620    ) -> Result<(), String> {
621        if !config.enabled {
622            return Ok(());
623        }
624
625        // v3.5.3+: build a conditional-amount override when the config
626        // declares a rule with `output_field == "amount"` and a supported
627        // input field. The override is applied *after* the standard
628        // cascade so it doesn't disturb fraud-path sampling. Unsupported
629        // input fields are ignored with a trace log.
630        self.conditional_amount_override = config
631            .conditional
632            .iter()
633            .find(|c| {
634                c.output_field == "amount" && Self::supported_conditional_input(&c.input_field)
635            })
636            .and_then(|c| {
637                datasynth_core::distributions::ConditionalSampler::new(
638                    seed.wrapping_add(17),
639                    c.to_core_config(),
640                )
641                .ok()
642            });
643
644        // v4.1.0+: all 5 copula types wired (Gaussian / Clayton /
645        // Gumbel / Frank / Student-t). The `BivariateCopulaSampler`
646        // already implements each; v3.5.4 had a filter limiting to
647        // Gaussian only — lifted here now that the smoke test matrix
648        // covers all types.
649        self.correlation_copula = config
650            .correlations
651            .to_core_config_for_pair("amount", "line_count")
652            .and_then(|copula_cfg| {
653                datasynth_core::distributions::BivariateCopulaSampler::new(
654                    seed.wrapping_add(31),
655                    copula_cfg,
656                )
657                .ok()
658            });
659
660        // v3.4.4+: Pareto takes precedence over mixture models when set.
661        // This supports heavy-tailed amount distributions (capex, strategic
662        // contracts, fraud) that log-normal/Gaussian mixtures can't model
663        // as sharply.
664        if let Some(pareto) = &config.pareto {
665            if pareto.enabled {
666                let core_cfg = pareto.to_core_config();
667                self.advanced_amount_sampler =
668                    Some(AdvancedAmountSampler::new_pareto(seed, core_cfg)?);
669                return Ok(());
670            }
671        }
672
673        if !config.amounts.enabled {
674            return Ok(());
675        }
676
677        match config.amounts.distribution_type {
678            MixtureDistributionType::LogNormal => {
679                let lognormal_cfg = config.amounts.to_log_normal_config().or_else(|| {
680                    config
681                        .industry_profile
682                        .as_ref()
683                        .map(|p| industry_profile_to_log_normal(p.profile_type()))
684                });
685                if let Some(cfg) = lognormal_cfg {
686                    self.advanced_amount_sampler =
687                        Some(AdvancedAmountSampler::new_log_normal(seed, cfg)?);
688                }
689            }
690            MixtureDistributionType::Gaussian => {
691                if let Some(cfg) = config.amounts.to_gaussian_config() {
692                    self.advanced_amount_sampler =
693                        Some(AdvancedAmountSampler::new_gaussian(seed, cfg)?);
694                }
695            }
696        }
697
698        Ok(())
699    }
700
701    /// Override the business-process volume mix. Weights map directly to the
702    /// `business_processes.*_weight` YAML config; they do not have to sum to
703    /// exactly 1.0 (they're normalized via `weighted_select`).
704    pub fn set_business_process_weights(
705        &mut self,
706        o2c: f64,
707        p2p: f64,
708        r2r: f64,
709        h2r: f64,
710        a2r: f64,
711    ) {
712        self.business_process_weights = [
713            (BusinessProcess::O2C, o2c),
714            (BusinessProcess::P2P, p2p),
715            (BusinessProcess::R2R, r2r),
716            (BusinessProcess::H2R, h2r),
717            (BusinessProcess::A2R, a2r),
718        ];
719    }
720
721    /// Create from a full GeneratorConfig.
722    ///
723    /// This constructor uses the volume_weight from company configs
724    /// for weighted company selection, and fraud config from GeneratorConfig.
725    pub fn from_generator_config(
726        full_config: &GeneratorConfig,
727        coa: Arc<ChartOfAccounts>,
728        start_date: NaiveDate,
729        end_date: NaiveDate,
730        seed: u64,
731    ) -> Self {
732        let companies: Vec<String> = full_config
733            .companies
734            .iter()
735            .map(|c| c.code.clone())
736            .collect();
737
738        // Create weighted selector using volume_weight from company configs
739        let company_selector = WeightedCompanySelector::from_configs(&full_config.companies);
740
741        let mut generator = Self::new_with_full_config(
742            full_config.transactions.clone(),
743            coa,
744            companies,
745            start_date,
746            end_date,
747            seed,
748            full_config.templates.clone(),
749            None,
750        );
751
752        // Override the uniform selector with weighted selector
753        generator.company_selector = company_selector;
754
755        // Set fraud config
756        generator.fraud_config = full_config.fraud.clone();
757
758        // Configure temporal patterns if enabled
759        let temporal_config = &full_config.temporal_patterns;
760        if temporal_config.enabled {
761            generator = generator.with_temporal_patterns(temporal_config.clone(), seed);
762        }
763
764        generator
765    }
766
767    /// Configure temporal patterns including business day calculations and processing lags.
768    ///
769    /// This enables realistic temporal behavior including:
770    /// - Business day awareness (no postings on weekends/holidays)
771    /// - Processing lag modeling (event-to-posting delays)
772    /// - Period-end dynamics (volume spikes at month/quarter/year end)
773    pub fn with_temporal_patterns(mut self, config: TemporalPatternsConfig, seed: u64) -> Self {
774        // Create business day calculator if enabled
775        if config.business_days.enabled {
776            let region = config
777                .calendars
778                .regions
779                .first()
780                .map(|r| Self::parse_region(r))
781                .unwrap_or(Region::US);
782
783            let calendar = HolidayCalendar::new(region, self.start_date.year());
784            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
785        }
786
787        // Create processing lag calculator if enabled
788        if config.processing_lags.enabled {
789            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
790            self.processing_lag_calculator =
791                Some(ProcessingLagCalculator::with_config(seed, lag_config));
792        }
793
794        // Create period-end dynamics if configured
795        let model = config.period_end.model.as_deref().unwrap_or("flat");
796        if model != "flat"
797            || config
798                .period_end
799                .month_end
800                .as_ref()
801                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
802        {
803            let dynamics = Self::convert_period_end_config(&config.period_end);
804            self.temporal_sampler.set_period_end_dynamics(dynamics);
805        }
806
807        self.temporal_patterns_config = Some(config);
808        self
809    }
810
811    /// Configure temporal patterns using a [`CountryPack`] for the holiday calendar.
812    ///
813    /// This is an alternative to `with_temporal_patterns` that derives the
814    /// holiday calendar from a country-pack definition rather than the built-in
815    /// region-based calendars.  All other temporal behaviour (business-day
816    /// adjustment, processing lags, period-end dynamics) is configured
817    /// identically.
818    pub fn with_country_pack_temporal(
819        mut self,
820        config: TemporalPatternsConfig,
821        seed: u64,
822        pack: &CountryPack,
823    ) -> Self {
824        // Create business day calculator using the country pack calendar
825        if config.business_days.enabled {
826            let calendar = HolidayCalendar::from_country_pack(pack, self.start_date.year());
827            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
828        }
829
830        // Create processing lag calculator if enabled
831        if config.processing_lags.enabled {
832            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
833            self.processing_lag_calculator =
834                Some(ProcessingLagCalculator::with_config(seed, lag_config));
835        }
836
837        // Create period-end dynamics if configured
838        let model = config.period_end.model.as_deref().unwrap_or("flat");
839        if model != "flat"
840            || config
841                .period_end
842                .month_end
843                .as_ref()
844                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
845        {
846            let dynamics = Self::convert_period_end_config(&config.period_end);
847            self.temporal_sampler.set_period_end_dynamics(dynamics);
848        }
849
850        self.temporal_patterns_config = Some(config);
851        self
852    }
853
854    /// Convert schema processing lag config to core config.
855    fn convert_processing_lag_config(
856        schema: &datasynth_config::schema::ProcessingLagSchemaConfig,
857    ) -> ProcessingLagConfig {
858        let mut config = ProcessingLagConfig {
859            enabled: schema.enabled,
860            ..Default::default()
861        };
862
863        // Helper to convert lag schema to distribution
864        let convert_lag = |lag: &datasynth_config::schema::LagDistributionSchemaConfig| {
865            let mut dist = LagDistribution::log_normal(lag.mu, lag.sigma);
866            if let Some(min) = lag.min_hours {
867                dist.min_lag_hours = min;
868            }
869            if let Some(max) = lag.max_hours {
870                dist.max_lag_hours = max;
871            }
872            dist
873        };
874
875        // Apply event-specific lags
876        if let Some(ref lag) = schema.sales_order_lag {
877            config
878                .event_lags
879                .insert(EventType::SalesOrder, convert_lag(lag));
880        }
881        if let Some(ref lag) = schema.purchase_order_lag {
882            config
883                .event_lags
884                .insert(EventType::PurchaseOrder, convert_lag(lag));
885        }
886        if let Some(ref lag) = schema.goods_receipt_lag {
887            config
888                .event_lags
889                .insert(EventType::GoodsReceipt, convert_lag(lag));
890        }
891        if let Some(ref lag) = schema.invoice_receipt_lag {
892            config
893                .event_lags
894                .insert(EventType::InvoiceReceipt, convert_lag(lag));
895        }
896        if let Some(ref lag) = schema.invoice_issue_lag {
897            config
898                .event_lags
899                .insert(EventType::InvoiceIssue, convert_lag(lag));
900        }
901        if let Some(ref lag) = schema.payment_lag {
902            config
903                .event_lags
904                .insert(EventType::Payment, convert_lag(lag));
905        }
906        if let Some(ref lag) = schema.journal_entry_lag {
907            config
908                .event_lags
909                .insert(EventType::JournalEntry, convert_lag(lag));
910        }
911
912        // Apply cross-day posting config
913        if let Some(ref cross_day) = schema.cross_day_posting {
914            config.cross_day = CrossDayConfig {
915                enabled: cross_day.enabled,
916                probability_by_hour: cross_day.probability_by_hour.clone(),
917                ..Default::default()
918            };
919        }
920
921        config
922    }
923
924    /// Convert schema period-end config to core PeriodEndDynamics.
925    fn convert_period_end_config(
926        schema: &datasynth_config::schema::PeriodEndSchemaConfig,
927    ) -> PeriodEndDynamics {
928        let model_type = schema.model.as_deref().unwrap_or("exponential");
929
930        // Helper to convert period config
931        let convert_period =
932            |period: Option<&datasynth_config::schema::PeriodEndModelSchemaConfig>,
933             default_peak: f64|
934             -> PeriodEndConfig {
935                if let Some(p) = period {
936                    let model = match model_type {
937                        "flat" => PeriodEndModel::FlatMultiplier {
938                            multiplier: p.peak_multiplier.unwrap_or(default_peak),
939                        },
940                        "extended_crunch" => PeriodEndModel::ExtendedCrunch {
941                            start_day: p.start_day.unwrap_or(-10),
942                            sustained_high_days: p.sustained_high_days.unwrap_or(3),
943                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
944                            ramp_up_days: 3, // Default ramp-up period
945                        },
946                        _ => PeriodEndModel::ExponentialAcceleration {
947                            start_day: p.start_day.unwrap_or(-10),
948                            base_multiplier: p.base_multiplier.unwrap_or(1.0),
949                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
950                            decay_rate: p.decay_rate.unwrap_or(0.3),
951                        },
952                    };
953                    PeriodEndConfig {
954                        enabled: true,
955                        model,
956                        additional_multiplier: p.additional_multiplier.unwrap_or(1.0),
957                    }
958                } else {
959                    PeriodEndConfig {
960                        enabled: true,
961                        model: PeriodEndModel::ExponentialAcceleration {
962                            start_day: -10,
963                            base_multiplier: 1.0,
964                            peak_multiplier: default_peak,
965                            decay_rate: 0.3,
966                        },
967                        additional_multiplier: 1.0,
968                    }
969                }
970            };
971
972        PeriodEndDynamics::new(
973            convert_period(schema.month_end.as_ref(), 2.0),
974            convert_period(schema.quarter_end.as_ref(), 3.5),
975            convert_period(schema.year_end.as_ref(), 5.0),
976        )
977    }
978
979    /// Parse a region string into a Region enum.
980    fn parse_region(region_str: &str) -> Region {
981        match region_str.to_uppercase().as_str() {
982            "US" => Region::US,
983            "DE" => Region::DE,
984            "GB" => Region::GB,
985            "CN" => Region::CN,
986            "JP" => Region::JP,
987            "IN" => Region::IN,
988            "BR" => Region::BR,
989            "MX" => Region::MX,
990            "AU" => Region::AU,
991            "SG" => Region::SG,
992            "KR" => Region::KR,
993            "FR" => Region::FR,
994            "IT" => Region::IT,
995            "ES" => Region::ES,
996            "CA" => Region::CA,
997            _ => Region::US,
998        }
999    }
1000
1001    /// Set a custom company selector.
1002    pub fn set_company_selector(&mut self, selector: WeightedCompanySelector) {
1003        self.company_selector = selector;
1004    }
1005
1006    /// Get the current company selector.
1007    pub fn company_selector(&self) -> &WeightedCompanySelector {
1008        &self.company_selector
1009    }
1010
1011    /// Set fraud configuration.
1012    pub fn set_fraud_config(&mut self, config: FraudConfig) {
1013        self.fraud_config = config;
1014    }
1015
1016    /// Set vendors from generated master data.
1017    ///
1018    /// This replaces the default vendor pool with actual generated vendors,
1019    /// ensuring JEs reference real master data entities.
1020    pub fn with_vendors(mut self, vendors: &[Vendor]) -> Self {
1021        if !vendors.is_empty() {
1022            self.vendor_pool = VendorPool::from_vendors(vendors.to_vec());
1023            self.using_real_master_data = true;
1024        }
1025        self
1026    }
1027
1028    /// Set customers from generated master data.
1029    ///
1030    /// This replaces the default customer pool with actual generated customers,
1031    /// ensuring JEs reference real master data entities.
1032    pub fn with_customers(mut self, customers: &[Customer]) -> Self {
1033        if !customers.is_empty() {
1034            self.customer_pool = CustomerPool::from_customers(customers.to_vec());
1035            self.using_real_master_data = true;
1036        }
1037        self
1038    }
1039
1040    /// Set materials from generated master data.
1041    ///
1042    /// This provides material references for JEs that involve inventory movements.
1043    pub fn with_materials(mut self, materials: &[Material]) -> Self {
1044        if !materials.is_empty() {
1045            self.material_pool = Some(MaterialPool::from_materials(materials.to_vec()));
1046            self.using_real_master_data = true;
1047        }
1048        self
1049    }
1050
1051    /// Set all master data at once for convenience.
1052    ///
1053    /// This is the recommended way to configure the JE generator with
1054    /// generated master data to ensure data coherence.
1055    pub fn with_master_data(
1056        self,
1057        vendors: &[Vendor],
1058        customers: &[Customer],
1059        materials: &[Material],
1060    ) -> Self {
1061        self.with_vendors(vendors)
1062            .with_customers(customers)
1063            .with_materials(materials)
1064    }
1065
1066    /// SP6 — Build a [`MasterDataResolver`] from the run's master data and
1067    /// store it in `self.md_resolver`. Call once before JE generation begins
1068    /// (the entry method `generate` calls this lazily on the first entry when
1069    /// the resolver pools are empty). Pools are cheap `Vec<String>` snapshots
1070    /// of names already held in the generator's vendor/customer/user pools.
1071    fn refresh_md_resolver(&mut self) {
1072        let companies: Vec<String> = self
1073            .vendor_pool
1074            .vendors
1075            .iter()
1076            .map(|v| v.name.clone())
1077            .chain(self.customer_pool.customers.iter().map(|c| c.name.clone()))
1078            .collect();
1079
1080        let persons: Vec<String> = self
1081            .user_pool
1082            .as_ref()
1083            .map(|p| p.users.iter().map(|u| u.display_name.clone()).collect())
1084            .unwrap_or_default();
1085
1086        let streets: Vec<String> = Vec::new(); // No address master entity in this generator.
1087        let patients = synthetic_patient_pool("de_CH");
1088
1089        self.md_resolver = MasterDataResolver {
1090            companies,
1091            persons,
1092            streets,
1093            patients,
1094        };
1095    }
1096
1097    /// Set the cost-center pool used by line-item enrichment.
1098    ///
1099    /// The orchestrator wires this from the generated cost-centers
1100    /// master so `JE.cost_center` joins back to `cost_centers.id`.
1101    /// When the pool is non-empty `enrich_line_items` picks
1102    /// deterministically from it; the hardcoded fallback
1103    /// `COST_CENTER_POOL` const is only used when the pool is empty
1104    /// (configs that don't generate cost-center master data).
1105    pub fn with_cost_center_pool(mut self, ids: Vec<String>) -> Self {
1106        self.cost_center_pool = ids;
1107        self
1108    }
1109
1110    /// Set the profit-center pool used by line-item enrichment.
1111    ///
1112    /// Same semantics as `with_cost_center_pool` but for the
1113    /// profit-centers master.  Without this, the legacy
1114    /// `PC-{company_code}-{P2P|O2C|R2R|H2R}` derivation is used —
1115    /// which is consistent within a generation run but does not
1116    /// match the format the master data generator emits.
1117    pub fn with_profit_center_pool(mut self, ids: Vec<String>) -> Self {
1118        self.profit_center_pool = ids;
1119        self
1120    }
1121
1122    /// Replace the auto-generated user pool with an externally-built one.
1123    ///
1124    /// The orchestrator builds a [`UserPool`] from the generated
1125    /// employee master ([`UserPool::from_employees`]) and passes it
1126    /// here, so `JE.created_by` joins back to `employees.user_id`.
1127    /// Without this call, `with_country_pack_names` generates its
1128    /// own user pool whose ids are disjoint from the employee
1129    /// master.
1130    pub fn with_user_pool(mut self, pool: UserPool) -> Self {
1131        self.user_pool = Some(pool);
1132        self
1133    }
1134
1135    /// Replace the user pool with one generated from a [`CountryPack`].
1136    ///
1137    /// This is an alternative to the default name-culture distribution that
1138    /// derives name pools and weights from the country-pack's `names` section.
1139    /// The existing user pool (if any) is discarded and regenerated using
1140    /// `MultiCultureNameGenerator::from_country_pack`.
1141    pub fn with_country_pack_names(mut self, pack: &CountryPack) -> Self {
1142        let name_gen =
1143            datasynth_core::templates::MultiCultureNameGenerator::from_country_pack(pack);
1144        let config = UserGeneratorConfig {
1145            // The culture distribution is embedded in the name generator
1146            // itself, so we use an empty list here.
1147            culture_distribution: Vec::new(),
1148            email_domain: name_gen.email_domain().to_string(),
1149            generate_realistic_names: true,
1150        };
1151        let mut user_gen = UserGenerator::with_name_generator(self.seed + 100, config, name_gen);
1152        self.user_pool = Some(user_gen.generate_standard(&self.companies));
1153        self
1154    }
1155
1156    /// Check if the generator is using real master data.
1157    pub fn is_using_real_master_data(&self) -> bool {
1158        self.using_real_master_data
1159    }
1160
1161    /// Determine if this transaction should be fraudulent.
1162    /// Pick a realistic ERP `source_system` provenance code.
1163    ///
1164    /// Returns a string like `"SAP-FI/AP"`, `"manual/adjustment"`,
1165    /// `"Interface/EDI"`. Uses the business process to bias toward
1166    /// process-appropriate sub-modules (e.g. P2P → SAP-MM/IV, O2C →
1167    /// SAP-SD/IV, H2R → SAP-HR/PR). The legacy 7-code shape
1168    /// (`SAP-FI`, `SAP-MM`, etc.) is preserved as a prefix so existing
1169    /// `starts_with` filters keep working.
1170    ///
1171    /// **Manual contract**: when `is_manual` is true the returned value
1172    /// always starts with `"manual"` or `"spreadsheet"`. This is asserted
1173    /// in `test_isa240_audit_flags_populated`.
1174    fn pick_source_system(rng: &mut ChaCha8Rng, is_manual: bool, bp: BusinessProcess) -> String {
1175        if is_manual {
1176            // 8 manual provenance codes — all share a `manual/` or
1177            // `spreadsheet/` prefix.
1178            const MANUAL: &[&str] = &[
1179                "manual/standard",
1180                "manual/adjustment",
1181                "manual/reclassification",
1182                "manual/accrual",
1183                "manual/reversal",
1184                "manual/correction",
1185                "spreadsheet/upload",
1186                "spreadsheet/journal",
1187            ];
1188            let idx = (rng.random::<u32>() as usize) % MANUAL.len();
1189            return MANUAL[idx].to_string();
1190        }
1191
1192        // Process-aware automated provenance. Each process has a small
1193        // primary set; we also mix in cross-process codes ~20% of the
1194        // time so the taxonomy stays diverse without losing coherence.
1195        let primary: &[&str] = match bp {
1196            BusinessProcess::P2P => &[
1197                "SAP-MM/PO",
1198                "SAP-MM/IV",
1199                "SAP-MM/IM",
1200                "SAP-FI/AP",
1201                "Interface/EDI",
1202            ],
1203            BusinessProcess::O2C => &[
1204                "SAP-SD/ORD",
1205                "SAP-SD/DEL",
1206                "SAP-SD/IV",
1207                "SAP-FI/AR",
1208                "Interface/Lockbox",
1209            ],
1210            BusinessProcess::H2R => &["SAP-HR/PR", "SAP-HR/TIME", "Interface/PayRun"],
1211            BusinessProcess::A2R => &["SAP-FI/AA", "SAP-FI/GL"],
1212            BusinessProcess::Treasury => &["Treasury/CM", "Treasury/HD", "Interface/Bank"],
1213            BusinessProcess::Tax => &["Tax/RPT", "SAP-FI/GL"],
1214            BusinessProcess::Mfg => &["SAP-MM/IM", "SAP-FI/GL"],
1215            // R2R, S2C, Bank, Audit, Intercompany, ProjectAccounting, Esg
1216            // → fall through to a generic mix.
1217            _ => &[
1218                "SAP-FI/GL",
1219                "SAP-FI/AP",
1220                "SAP-FI/AR",
1221                "SAP-FI/AA",
1222                "External/SubL",
1223            ],
1224        };
1225
1226        // 80% process-appropriate, 20% cross-process (pulled from a
1227        // generic pool) so the categorical distribution has long tails.
1228        const CROSS: &[&str] = &[
1229            "SAP-FI/GL",
1230            "SAP-FI/AP",
1231            "SAP-FI/AR",
1232            "Interface/EDI",
1233            "Interface/Bank",
1234            "External/SubL",
1235        ];
1236        let pool = if rng.random::<f64>() < 0.80 {
1237            primary
1238        } else {
1239            CROSS
1240        };
1241        let idx = (rng.random::<u32>() as usize) % pool.len();
1242        pool[idx].to_string()
1243    }
1244
1245    /// T2-D Lever 1: choose the `sap_source_code` emitted in the CSV `source`
1246    /// column. Priority: loaded industry priors' `source_mix` (SP3.6) → the
1247    /// default generic SAP doc-type mix when `transactions.synthetic_source_codes`
1248    /// is on (the default) → `None` (legacy: `source` falls back to the coarse
1249    /// `TransactionSource` enum). Closes the source-mix breadth gap by default
1250    /// (entropy ~0.75 → ~2.7; experiments/ml/FINDINGS.md §6).
1251    fn sample_sap_source_code(&mut self) -> Option<String> {
1252        if let Some(p) = self.loaded_priors.as_ref() {
1253            return Some(p.source_mix.sample(&mut self.rng));
1254        }
1255        if self.config.synthetic_source_codes.unwrap_or(true) {
1256            // Independent stream: never perturb the main RNG, so all other
1257            // fields stay byte-identical to the legacy (enum-source) output.
1258            return Some(DEFAULT_SOURCE_MIX.sample(&mut self.source_mix_rng));
1259        }
1260        None
1261    }
1262
1263    /// SOTA-1: on the no-priors path, reuse a cached `(debit, credit)` account
1264    /// archetype matching the line counts for this `(company, doc_type)` with
1265    /// high probability, so standard postings recur (and a hot subset of
1266    /// accounts dominates) instead of every JE drawing fresh uniform accounts.
1267    /// Returns the accounts to use, or `None` to select fresh (then cached).
1268    /// Rolls `template_rng` first so the main RNG (amounts/dates/counts) is
1269    /// never perturbed — only account *choice* changes on reuse.
1270    fn pick_recurring_archetype(
1271        &mut self,
1272        company: &str,
1273        doc_type: &str,
1274        debit_count: usize,
1275        credit_count: usize,
1276    ) -> Option<(Vec<String>, Vec<String>)> {
1277        if !self.config.recurring_templates.unwrap_or(true) {
1278            return None;
1279        }
1280        // Priors carry their own GL-account structure; templating is the no-priors
1281        // default-path realism boost (FINDINGS sec.8) UNLESS the user has explicitly
1282        // set archetype_reuse_probability — in that case SOTA-1 composes with the
1283        // priors path (SOTA-9 #137: lift corpus recurring share toward ~0.97).
1284        let p_reuse_opt = self.config.archetype_reuse_probability;
1285        if p_reuse_opt.is_none() && self.loaded_priors.is_some() {
1286            return None;
1287        }
1288        let p_reuse = p_reuse_opt.unwrap_or(0.90);
1289        if self.template_rng.random::<f64>() >= p_reuse {
1290            return None;
1291        }
1292        let lib = self
1293            .recurring_archetypes
1294            .get(&(company.to_string(), doc_type.to_string()))?;
1295        let matching: Vec<&(Vec<String>, Vec<String>)> = lib
1296            .iter()
1297            .filter(|(d, c)| d.len() == debit_count && c.len() == credit_count)
1298            .collect();
1299        if matching.is_empty() {
1300            return None;
1301        }
1302        // Power-law (Zipf) over the cached archetypes rather than a uniform pick:
1303        // the earlier-cached "standard" posting of each (company, doc-type, shape)
1304        // dominates, so a hot subset of archetypes carries most JEs. Uniform reuse
1305        // kept the per-JE recurring share high but left the archetype head too
1306        // flat (top-50 coverage 0.49 vs corpus 0.65); concentrating the head lifts
1307        // top-50 coverage toward the corpus. Same mechanism as the SOTA-2 account
1308        // Pareto, drawn on the `template_rng` stream.
1309        let idx = Self::power_law_index(matching.len(), &mut self.template_rng).unwrap_or(0);
1310        Some(matching[idx].clone())
1311    }
1312
1313    /// SOTA-1: record a freshly-selected archetype for future reuse, capped per
1314    /// `(company, doc_type)` so the standard-posting library stays small.
1315    fn cache_recurring_archetype(
1316        &mut self,
1317        company: &str,
1318        doc_type: &str,
1319        debit: Vec<String>,
1320        credit: Vec<String>,
1321    ) {
1322        if self.loaded_priors.is_some() || !self.config.recurring_templates.unwrap_or(true) {
1323            return;
1324        }
1325        if debit.is_empty() && credit.is_empty() {
1326            return;
1327        }
1328        const CAP: usize = 24; // distinct archetypes per (company, doc-type) — fewer ⇒ top-50 archetypes cover more JEs (toward corpus top-50 ~0.65)
1329        let lib = self
1330            .recurring_archetypes
1331            .entry((company.to_string(), doc_type.to_string()))
1332            .or_default();
1333        if lib.len() < CAP {
1334            lib.push((debit, credit));
1335        }
1336    }
1337
1338    /// SOTA-5: with probability `transactions.reversal_rate` (default ~10%),
1339    /// build a reversal/correction of a recent JE (swap dr/cr, reference the
1340    /// original) instead of a fresh JE. Uses `reversal_rng` and an id derived
1341    /// from the original, so the main RNG + uuid factory are unperturbed (normal
1342    /// JEs stay byte-identical; reversals are interspersed). Balanced because the
1343    /// original was balanced and we swap each line's debit/credit.
1344    fn maybe_generate_reversal(&mut self) -> Option<JournalEntry> {
1345        let rate = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE);
1346        if rate <= 0.0 || self.reversal_buffer.is_empty() {
1347            return None;
1348        }
1349        if self.reversal_rng.random::<f64>() >= rate {
1350            return None;
1351        }
1352        let pick = (self.reversal_rng.random::<u32>() as usize) % self.reversal_buffer.len();
1353        // Consume the entry so the same original is never reversed twice — that
1354        // would mint the same derived id (`orig ^ salt`) and produce duplicate
1355        // document IDs (regression caught by `test_document_reference_integrity`).
1356        let mut entry = self.reversal_buffer.remove(pick);
1357        let orig_id = entry.header.document_id;
1358        // Reversal posts a few business days after the original.
1359        let offset = 1 + (self.reversal_rng.random::<u32>() % 7) as i64;
1360        let mut rev_date = entry.header.posting_date + chrono::Duration::days(offset);
1361        if let Some(ref calc) = self.business_day_calculator {
1362            if !calc.is_business_day(rev_date) {
1363                rev_date = calc.next_business_day(rev_date, false);
1364            }
1365        }
1366        if rev_date > self.end_date {
1367            rev_date = entry.header.posting_date;
1368        }
1369        // Deterministic id derived from the original (no uuid-factory advance).
1370        let rev_id =
1371            uuid::Uuid::from_u128(orig_id.as_u128() ^ 0x5245_5645_5253_414c_5245_5645_5253_414c);
1372        // Inherit everything from the original (source code, line text, audit
1373        // flags, ...); change only the markers + each line's debit/credit.
1374        entry.header.document_id = rev_id;
1375        entry.header.posting_date = rev_date;
1376        entry.header.document_date = rev_date;
1377        entry.header.fiscal_year = rev_date.year() as u16;
1378        entry.header.fiscal_period = rev_date.month() as u8;
1379        entry.header.header_text = Some(format!("Reversal of {orig_id}"));
1380        entry.header.reference = Some(format!("REV-{orig_id}"));
1381        entry.header.batch_id = None;
1382        for line in entry.lines.iter_mut() {
1383            std::mem::swap(&mut line.debit_amount, &mut line.credit_amount);
1384            line.document_id = rev_id;
1385        }
1386        Some(entry)
1387    }
1388
1389    /// SOTA-5/6: remember a (complete) JE so a later reversal (SOTA-5) or
1390    /// allocation batch (SOTA-6) can reuse it. Populated when either process is
1391    /// enabled, so disabling reversals doesn't starve the allocation batches.
1392    fn record_for_reversal(&mut self, entry: &JournalEntry) {
1393        let reversal_on = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE) > 0.0;
1394        let allocation_on = self
1395            .config
1396            .allocation_batch_rate
1397            .unwrap_or(DEFAULT_ALLOCATION_RATE)
1398            > 0.0;
1399        if (!reversal_on && !allocation_on) || entry.lines.is_empty() {
1400            return;
1401        }
1402        const CAP: usize = 64;
1403        if self.reversal_buffer.len() >= CAP {
1404            self.reversal_buffer.remove(0);
1405        }
1406        self.reversal_buffer.push(entry.clone());
1407    }
1408
1409    /// SOTA-4: with probability `transactions.foreign_currency_rate`, post this JE
1410    /// in a foreign document currency (SAP-style). `debit_amount`/`credit_amount`/
1411    /// `local_amount` stay the company-ledger amount (DMBTR — the trial balance is
1412    /// unaffected); `header.currency`/`header.exchange_rate` + each line's
1413    /// `transaction_amount` (WRBTR) carry the foreign value. Balance holds in both
1414    /// currencies (every line shares one rate). Drawn on `fx_rng` so the main
1415    /// `rng` (and all company-currency JEs) stay byte-identical.
1416    fn maybe_apply_foreign_currency(&mut self, entry: &mut JournalEntry) {
1417        let prob = self.config.foreign_currency_rate.unwrap_or(0.0);
1418        if prob <= 0.0 || self.fx_rng.random::<f64>() >= prob {
1419            return;
1420        }
1421        let (code, rate) = FOREIGN_CCYS[self.fx_rng.random_range(0..FOREIGN_CCYS.len())];
1422        let rate_dec = match Decimal::from_f64_retain(rate) {
1423            Some(r) if r > Decimal::ZERO => r,
1424            _ => return,
1425        };
1426        entry.header.currency = code.to_string();
1427        entry.header.exchange_rate = rate_dec;
1428        for line in entry.lines.iter_mut() {
1429            let ledger = line.debit_amount + line.credit_amount; // one side is zero
1430            line.transaction_amount = Some((ledger / rate_dec).round_dp(2));
1431        }
1432    }
1433
1434    /// SOTA-6: split `total` into `n` positive cent-precise parts summing
1435    /// **exactly** to `total` (so the JE stays balanced), with random weights so
1436    /// the allocation isn't perfectly even. Each part is ≥ 1 cent. Returns a
1437    /// single `[total]` when the amount is too small to split into `n` parts.
1438    fn split_amount(total: Decimal, n: usize, rng: &mut ChaCha8Rng) -> Vec<Decimal> {
1439        let n = n.max(1);
1440        let total_cents = (total.round_dp(2) * Decimal::from(100))
1441            .to_i64()
1442            .unwrap_or(0);
1443        if n == 1 || total_cents < n as i64 {
1444            return vec![total];
1445        }
1446        let weights: Vec<f64> = (0..n).map(|_| 0.5 + rng.random::<f64>()).collect();
1447        let sumw: f64 = weights.iter().sum::<f64>().max(f64::EPSILON);
1448        let spare = total_cents - n as i64; // ≥ 0; each part keeps a 1-cent floor
1449        let mut cents: Vec<i64> = weights
1450            .iter()
1451            .map(|w| 1 + (spare as f64 * w / sumw).floor() as i64)
1452            .collect();
1453        // dump the (small, < n) flooring leftover onto the largest part
1454        let assigned: i64 = cents.iter().sum();
1455        let leftover = total_cents - assigned;
1456        if let Some(maxp) = cents.iter_mut().max_by_key(|c| **c) {
1457            *maxp += leftover;
1458        }
1459        cents.into_iter().map(|c| Decimal::new(c, 2)).collect()
1460    }
1461
1462    /// SOTA-3: deterministic dimension → business-unit roll-up (the dimension is
1463    /// the cost center, or the profit center as fallback). The same dimension
1464    /// value always maps to the same BU code (`BU01`..`BU11`, matching the
1465    /// corpus's ~11 BU codes), so business-unit analytics are internally
1466    /// consistent — not a random per-line label. FNV-1a hash, bucketed.
1467    fn business_unit_for_dimension(dim: &str) -> String {
1468        const N_BU: u32 = 11;
1469        let mut h: u32 = 0x811c_9dc5;
1470        for b in dim.bytes() {
1471            h ^= b as u32;
1472            h = h.wrapping_mul(0x0100_0193);
1473        }
1474        format!("BU{:02}", (h % N_BU) + 1)
1475    }
1476
1477    /// SOTA-6: with probability `transactions.allocation_batch_rate` (default
1478    /// ~0.8%), emit an allocation/assessment batch instead of a fresh JE — the
1479    /// large 1-to-many posting that drives the corpus lines-per-JE tail (AB docs
1480    /// ~52 lines). Reuses a buffered JE for a valid header (no main-RNG / uuid
1481    /// advance), then explodes its largest debit line into ~30-80 cost-center-
1482    /// spread sub-lines summing to the same amount, so balance is preserved and
1483    /// the cost-center dimension breadth rises. Tagged source `AB`.
1484    fn maybe_generate_allocation_batch(&mut self) -> Option<JournalEntry> {
1485        let rate = self
1486            .config
1487            .allocation_batch_rate
1488            .unwrap_or(DEFAULT_ALLOCATION_RATE);
1489        if rate <= 0.0 || self.reversal_buffer.is_empty() {
1490            return None;
1491        }
1492        if self.allocation_rng.random::<f64>() >= rate {
1493            return None;
1494        }
1495        let pick = (self.allocation_rng.random::<u32>() as usize) % self.reversal_buffer.len();
1496        // Consume the entry (same reason as the reversal path: a reused base
1497        // would mint a duplicate derived id `base ^ salt`).
1498        let mut entry = self.reversal_buffer.remove(pick);
1499        // Explode the largest debit line across cost centers.
1500        let idx = entry
1501            .lines
1502            .iter()
1503            .enumerate()
1504            .filter(|(_, l)| l.debit_amount > Decimal::ZERO)
1505            .max_by(|a, b| a.1.debit_amount.cmp(&b.1.debit_amount))
1506            .map(|(i, _)| i)?;
1507        let template = entry.lines[idx].clone();
1508        let n = self
1509            .allocation_rng
1510            .random_range(ALLOCATION_MIN_TARGETS..=ALLOCATION_MAX_TARGETS) as usize;
1511        let parts = Self::split_amount(template.debit_amount, n, &mut self.allocation_rng);
1512        if parts.len() < ALLOCATION_MIN_TARGETS as usize {
1513            // amount too small to make a meaningful batch — leave it a normal JE
1514            return None;
1515        }
1516        // Valid cost-center candidates for this company (joins back to master).
1517        let company_code = entry.header.company_code.clone();
1518        let cc_pool: Vec<String> = if self.cost_center_pool.is_empty() {
1519            Self::COST_CENTER_POOL
1520                .iter()
1521                .map(|s| s.to_string())
1522                .collect()
1523        } else {
1524            let needle = format!("-{company_code}-");
1525            let filtered: Vec<String> = self
1526                .cost_center_pool
1527                .iter()
1528                .filter(|id| id.contains(&needle))
1529                .cloned()
1530                .collect();
1531            if filtered.is_empty() {
1532                self.cost_center_pool.clone()
1533            } else {
1534                filtered
1535            }
1536        };
1537        let mut new_lines: Vec<JournalEntryLine> =
1538            Vec::with_capacity(entry.lines.len() + parts.len() - 1);
1539        for (j, line) in entry.lines.iter().enumerate() {
1540            if j == idx {
1541                let bu_on = self.config.business_unit_dimension.unwrap_or(true);
1542                for (k, part) in parts.iter().enumerate() {
1543                    let mut nl = template.clone();
1544                    nl.debit_amount = *part;
1545                    nl.credit_amount = Decimal::ZERO;
1546                    nl.cost_center = Some(cc_pool[k % cc_pool.len()].clone());
1547                    // SOTA-3: keep business_unit coherent with the *new* CC
1548                    // (the clone carried the template's stale BU).
1549                    if bu_on {
1550                        nl.business_unit = nl
1551                            .cost_center
1552                            .as_deref()
1553                            .map(Self::business_unit_for_dimension);
1554                    }
1555                    new_lines.push(nl);
1556                }
1557            } else {
1558                new_lines.push(line.clone());
1559            }
1560        }
1561        // Derived id (distinct from the reversal salt); retag as an allocation.
1562        let base_id = entry.header.document_id;
1563        let alloc_id =
1564            uuid::Uuid::from_u128(base_id.as_u128() ^ 0xA110_CA70_A110_CA70_A110_CA70_A110_CA70);
1565        entry.header.document_id = alloc_id;
1566        entry.header.sap_source_code = Some("AB".to_string());
1567        entry.header.header_text = Some("Allocation/assessment cycle".to_string());
1568        entry.header.reference = Some(format!("ALLOC-{base_id}"));
1569        entry.header.batch_id = None;
1570        for (i, line) in new_lines.iter_mut().enumerate() {
1571            line.line_number = (i + 1) as u32;
1572            line.document_id = alloc_id;
1573        }
1574        entry.lines = new_lines.into();
1575        Some(entry)
1576    }
1577
1578    fn determine_fraud(&mut self, business_process: BusinessProcess) -> Option<FraudType> {
1579        if !self.fraud_config.enabled {
1580            return None;
1581        }
1582
1583        // v5.30 B3 (#153) — per-process fraud rate override. When
1584        // `fraud.per_process_rates` carries an entry for this JE's business
1585        // process, use that rate instead of the global `fraud_rate`. Unmapped
1586        // processes fall back to the global rate (preserving v5.29 default
1587        // behavior for configs that don't opt in to per-process rates).
1588        //
1589        // The slug uses the YAML wire form (matches `#[serde(rename_all =
1590        // "UPPERCASE")]` plus the per-variant renames on `BusinessProcess`).
1591        let process_slug = match business_process {
1592            BusinessProcess::P2P => "P2P",
1593            BusinessProcess::O2C => "O2C",
1594            BusinessProcess::R2R => "R2R",
1595            BusinessProcess::H2R => "H2R",
1596            BusinessProcess::A2R => "A2R",
1597            BusinessProcess::S2C => "S2C",
1598            BusinessProcess::Mfg => "MFG",
1599            BusinessProcess::Bank => "BANK",
1600            BusinessProcess::Audit => "AUDIT",
1601            BusinessProcess::Treasury => "TREASURY",
1602            BusinessProcess::Tax => "TAX",
1603            BusinessProcess::Intercompany => "INTERCOMPANY",
1604            BusinessProcess::ProjectAccounting => "PROJECT",
1605            BusinessProcess::Esg => "ESG",
1606        };
1607        let effective_rate = self
1608            .fraud_config
1609            .per_process_rates
1610            .get(process_slug)
1611            .copied()
1612            .unwrap_or(self.fraud_config.fraud_rate);
1613
1614        // Roll for fraud based on the (per-process or global) rate
1615        if self.rng.random::<f64>() >= effective_rate {
1616            return None;
1617        }
1618
1619        // Select fraud type based on distribution
1620        Some(self.select_fraud_type())
1621    }
1622
1623    /// Select a fraud type based on the configured distribution.
1624    fn select_fraud_type(&mut self) -> FraudType {
1625        let dist = &self.fraud_config.fraud_type_distribution;
1626        let roll: f64 = self.rng.random();
1627
1628        let mut cumulative = 0.0;
1629
1630        cumulative += dist.suspense_account_abuse;
1631        if roll < cumulative {
1632            return FraudType::SuspenseAccountAbuse;
1633        }
1634
1635        cumulative += dist.fictitious_transaction;
1636        if roll < cumulative {
1637            return FraudType::FictitiousTransaction;
1638        }
1639
1640        cumulative += dist.revenue_manipulation;
1641        if roll < cumulative {
1642            return FraudType::RevenueManipulation;
1643        }
1644
1645        cumulative += dist.expense_capitalization;
1646        if roll < cumulative {
1647            return FraudType::ExpenseCapitalization;
1648        }
1649
1650        cumulative += dist.split_transaction;
1651        if roll < cumulative {
1652            return FraudType::SplitTransaction;
1653        }
1654
1655        cumulative += dist.timing_anomaly;
1656        if roll < cumulative {
1657            return FraudType::TimingAnomaly;
1658        }
1659
1660        cumulative += dist.unauthorized_access;
1661        if roll < cumulative {
1662            return FraudType::UnauthorizedAccess;
1663        }
1664
1665        cumulative += dist.duplicate_payment;
1666        if roll < cumulative {
1667            return FraudType::DuplicatePayment;
1668        }
1669
1670        cumulative += dist.kickback_scheme;
1671        if roll < cumulative {
1672            return FraudType::KickbackScheme;
1673        }
1674
1675        cumulative += dist.round_tripping;
1676        if roll < cumulative {
1677            return FraudType::RoundTripping;
1678        }
1679
1680        cumulative += dist.unauthorized_discount;
1681        if roll < cumulative {
1682            return FraudType::UnauthorizedDiscount;
1683        }
1684
1685        // Fallback when distribution is sub-1.0 (validator allows tolerance)
1686        FraudType::DuplicatePayment
1687    }
1688
1689    /// Map a fraud type to an amount pattern for suspicious amounts.
1690    fn fraud_type_to_amount_pattern(&self, fraud_type: FraudType) -> FraudAmountPattern {
1691        match fraud_type {
1692            FraudType::SplitTransaction | FraudType::JustBelowThreshold => {
1693                FraudAmountPattern::ThresholdAdjacent
1694            }
1695            FraudType::FictitiousTransaction
1696            | FraudType::FictitiousEntry
1697            | FraudType::SuspenseAccountAbuse
1698            | FraudType::RoundDollarManipulation => FraudAmountPattern::ObviousRoundNumbers,
1699            FraudType::RevenueManipulation
1700            | FraudType::ExpenseCapitalization
1701            | FraudType::ImproperCapitalization
1702            | FraudType::ReserveManipulation
1703            | FraudType::UnauthorizedAccess
1704            | FraudType::PrematureRevenue
1705            | FraudType::UnderstatedLiabilities
1706            | FraudType::OverstatedAssets
1707            | FraudType::ChannelStuffing => FraudAmountPattern::StatisticallyImprobable,
1708            FraudType::DuplicatePayment
1709            | FraudType::TimingAnomaly
1710            | FraudType::SelfApproval
1711            | FraudType::ExceededApprovalLimit
1712            | FraudType::SegregationOfDutiesViolation
1713            | FraudType::UnauthorizedApproval
1714            | FraudType::CollusiveApproval
1715            | FraudType::FictitiousVendor
1716            | FraudType::ShellCompanyPayment
1717            | FraudType::Kickback
1718            | FraudType::KickbackScheme
1719            | FraudType::UnauthorizedDiscount
1720            | FraudType::RoundTripping
1721            | FraudType::InvoiceManipulation
1722            | FraudType::AssetMisappropriation
1723            | FraudType::InventoryTheft
1724            | FraudType::GhostEmployee => FraudAmountPattern::Normal,
1725            // Accounting Standards Fraud Types (ASC 606/IFRS 15 - Revenue)
1726            FraudType::ImproperRevenueRecognition
1727            | FraudType::ImproperPoAllocation
1728            | FraudType::VariableConsiderationManipulation
1729            | FraudType::ContractModificationMisstatement => {
1730                FraudAmountPattern::StatisticallyImprobable
1731            }
1732            // Accounting Standards Fraud Types (ASC 842/IFRS 16 - Leases)
1733            FraudType::LeaseClassificationManipulation
1734            | FraudType::OffBalanceSheetLease
1735            | FraudType::LeaseLiabilityUnderstatement
1736            | FraudType::RouAssetMisstatement => FraudAmountPattern::StatisticallyImprobable,
1737            // Accounting Standards Fraud Types (ASC 820/IFRS 13 - Fair Value)
1738            FraudType::FairValueHierarchyManipulation
1739            | FraudType::Level3InputManipulation
1740            | FraudType::ValuationTechniqueManipulation => {
1741                FraudAmountPattern::StatisticallyImprobable
1742            }
1743            // Accounting Standards Fraud Types (ASC 360/IAS 36 - Impairment)
1744            FraudType::DelayedImpairment
1745            | FraudType::ImpairmentTestAvoidance
1746            | FraudType::CashFlowProjectionManipulation
1747            | FraudType::ImproperImpairmentReversal => FraudAmountPattern::StatisticallyImprobable,
1748            // Sourcing/Procurement Fraud
1749            FraudType::BidRigging
1750            | FraudType::PhantomVendorContract
1751            | FraudType::ConflictOfInterestSourcing => FraudAmountPattern::Normal,
1752            FraudType::SplitContractThreshold => FraudAmountPattern::ThresholdAdjacent,
1753            // HR/Payroll Fraud
1754            FraudType::GhostEmployeePayroll
1755            | FraudType::PayrollInflation
1756            | FraudType::DuplicateExpenseReport
1757            | FraudType::FictitiousExpense => FraudAmountPattern::Normal,
1758            FraudType::SplitExpenseToAvoidApproval => FraudAmountPattern::ThresholdAdjacent,
1759            // O2C Fraud
1760            FraudType::RevenueTimingManipulation => FraudAmountPattern::StatisticallyImprobable,
1761            FraudType::QuotePriceOverride => FraudAmountPattern::Normal,
1762        }
1763    }
1764
1765    /// Generate a deterministic UUID using the factory.
1766    #[inline]
1767    fn generate_deterministic_uuid(&self) -> uuid::Uuid {
1768        self.uuid_factory.next()
1769    }
1770
1771    /// Cost center pool used for expense account enrichment.
1772    const COST_CENTER_POOL: &'static [&'static str] =
1773        &["CC1000", "CC2000", "CC3000", "CC4000", "CC5000"];
1774
1775    /// Enrich journal entry line items with account descriptions, cost centers,
1776    /// profit centers, value dates, line text, and assignment fields.
1777    ///
1778    /// This populates the sparse optional fields that `JournalEntryLine::debit()`
1779    /// and `::credit()` leave as `None`.
1780    ///
1781    /// SP3 T13: changed to `&mut self` so `loaded_priors` fanout samplers
1782    /// can be driven for CostCenter and ProfitCenter when priors are loaded.
1783    fn enrich_line_items(&mut self, entry: &mut JournalEntry) {
1784        let posting_date = entry.header.posting_date;
1785        let company_code = &entry.header.company_code;
1786        let header_text = entry.header.header_text.clone();
1787        let business_process = entry.header.business_process;
1788        // SP3 T13 — document-type code used as the entity_id for fanout
1789        // samplers.  Derived from the header field set during generate().
1790        let doc_type_key = entry.header.document_type.clone();
1791
1792        // SP3.7 — capture the SAP source code as an owned Option<String> so it
1793        // can be passed to `sample_attribute_for_source` as a `&str` inside the
1794        // line loop without keeping a borrow on `entry`.
1795        let header_sap_code: Option<String> = entry.header.sap_source_code.clone();
1796
1797        // SP3.3 — resolve cross-entity motif neighbors once before the line
1798        // loop.  Owned Vec avoids holding a shared borrow on `self.loaded_priors`
1799        // across the subsequent `&mut` fanout-sampler calls.
1800        let (cc_pc_neighbor_vec, cc_pc_share_prob): (Vec<String>, f64) =
1801            if let Some(priors) = &self.loaded_priors {
1802                if let Some(motifs) = &priors.cross_entity_motifs {
1803                    (
1804                        motifs.neighbors(&doc_type_key).to_vec(),
1805                        motifs.should_share(&doc_type_key),
1806                    )
1807                } else {
1808                    (Vec::new(), 0.0)
1809                }
1810            } else {
1811                (Vec::new(), 0.0)
1812            };
1813
1814        // Derive a deterministic index from the document_id for cost center selection
1815        let doc_id_bytes = entry.header.document_id.as_bytes();
1816        let mut cc_seed: usize = 0;
1817        for &b in doc_id_bytes {
1818            cc_seed = cc_seed.wrapping_add(b as usize);
1819        }
1820
1821        for (i, line) in entry.lines.iter_mut().enumerate() {
1822            // 1. account_description: look up from CoA
1823            if line.account_description.is_none() {
1824                line.account_description = self
1825                    .coa
1826                    .get_account(&line.gl_account)
1827                    .map(|a| a.short_description.clone());
1828            }
1829
1830            // 2. cost_center: assign to expense accounts (5xxx/6xxx)
1831            //
1832            // SP3 T13: when priors are loaded, the CostCenter fanout
1833            // sampler overrides the pool/legacy path.  This block runs
1834            // before the existing logic; if the sampler fires, `line.cost_center`
1835            // is set and the legacy block below is skipped via the
1836            // `line.cost_center.is_none()` guard.
1837            //
1838            // When the orchestrator has provided a master-data-sourced
1839            // pool (`with_cost_center_pool`), pick from it so the value
1840            // joins back to `cost_centers.id`.  Otherwise fall back to
1841            // the legacy hardcoded `COST_CENTER_POOL` const.
1842            //
1843            // Selection within the pool is filtered to entries that
1844            // mention the entry's `company_code` (master IDs follow
1845            // the `CC-{company}-...` convention) so cross-company
1846            // contamination is avoided; if no pool entry matches the
1847            // company we fall through to the full pool.
1848            if line.cost_center.is_none() {
1849                // SP3 T13 — prior-driven CostCenter fanout.
1850                // SP3.3: prefer neighbor-used buckets when motifs are available.
1851                // SP3.7: try per-source conditional cost_center first; fall back
1852                //        to the fanout sampler when the conditional is absent.
1853                let priors_opt = &mut self.loaded_priors;
1854                let rng_ref = &mut self.rng;
1855                if let Some(priors) = priors_opt {
1856                    let sp37_cc = header_sap_code.as_deref().and_then(|code| {
1857                        priors.sample_attribute_for_source(code, "cost_center", rng_ref)
1858                    });
1859                    if sp37_cc.is_some() {
1860                        line.cost_center = sp37_cc;
1861                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("CostCenter") {
1862                        line.cost_center = Some(sampler.pick_for_with_neighbors(
1863                            &doc_type_key,
1864                            &cc_pc_neighbor_vec,
1865                            cc_pc_share_prob,
1866                            rng_ref,
1867                        ));
1868                    }
1869                }
1870            }
1871            if line.cost_center.is_none() {
1872                let first_char = line.gl_account.chars().next().unwrap_or('0');
1873                if first_char == '5' || first_char == '6' {
1874                    if !self.cost_center_pool.is_empty() {
1875                        let needle = format!("-{company_code}-");
1876                        let candidates: Vec<&String> = self
1877                            .cost_center_pool
1878                            .iter()
1879                            .filter(|id| id.contains(&needle))
1880                            .collect();
1881                        let pool: Vec<&String> = if candidates.is_empty() {
1882                            self.cost_center_pool.iter().collect()
1883                        } else {
1884                            candidates
1885                        };
1886                        let idx = cc_seed.wrapping_add(i) % pool.len();
1887                        line.cost_center = Some(pool[idx].clone());
1888                    } else {
1889                        let idx = cc_seed.wrapping_add(i) % Self::COST_CENTER_POOL.len();
1890                        line.cost_center = Some(Self::COST_CENTER_POOL[idx].to_string());
1891                    }
1892                }
1893            }
1894
1895            // 3. profit_center: assign from master pool when available
1896            // (`with_profit_center_pool`); otherwise derive from
1897            // company code + business process (legacy behaviour, which
1898            // does not match the master-data PC ID format).
1899            //
1900            // SP3 T13: prior-driven ProfitCenter fanout override fires first
1901            // (same pattern as CostCenter above).
1902            if line.profit_center.is_none() {
1903                // SP3 T13 — prior-driven ProfitCenter fanout.
1904                // SP3.3: prefer neighbor-used buckets when motifs are available.
1905                // SP3.7: try per-source conditional profit_center first; fall back
1906                //        to the fanout sampler when the conditional is absent.
1907                let priors_opt = &mut self.loaded_priors;
1908                let rng_ref = &mut self.rng;
1909                if let Some(priors) = priors_opt {
1910                    let sp37_pc = header_sap_code.as_deref().and_then(|code| {
1911                        priors.sample_attribute_for_source(code, "profit_center", rng_ref)
1912                    });
1913                    if sp37_pc.is_some() {
1914                        line.profit_center = sp37_pc;
1915                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("ProfitCenter") {
1916                        line.profit_center = Some(sampler.pick_for_with_neighbors(
1917                            &doc_type_key,
1918                            &cc_pc_neighbor_vec,
1919                            cc_pc_share_prob,
1920                            rng_ref,
1921                        ));
1922                    }
1923                }
1924            }
1925            if line.profit_center.is_none() {
1926                if !self.profit_center_pool.is_empty() {
1927                    let needle = format!("-{company_code}-");
1928                    let candidates: Vec<&String> = self
1929                        .profit_center_pool
1930                        .iter()
1931                        .filter(|id| id.contains(&needle))
1932                        .collect();
1933                    let pool: Vec<&String> = if candidates.is_empty() {
1934                        self.profit_center_pool.iter().collect()
1935                    } else {
1936                        candidates
1937                    };
1938                    let idx = cc_seed.wrapping_add(i) % pool.len();
1939                    line.profit_center = Some(pool[idx].clone());
1940                } else {
1941                    let suffix = match business_process {
1942                        Some(BusinessProcess::P2P) => "-P2P",
1943                        Some(BusinessProcess::O2C) => "-O2C",
1944                        Some(BusinessProcess::R2R) => "-R2R",
1945                        Some(BusinessProcess::H2R) => "-H2R",
1946                        _ => "",
1947                    };
1948                    line.profit_center = Some(format!("PC-{company_code}{suffix}"));
1949                }
1950            }
1951
1952            // 3b. business_unit (SOTA-3): a coherent roll-up of the cost center,
1953            // or the profit center as fallback — the same dimension value always
1954            // maps to the same BU, so BU-level analytics are consistent. Runs
1955            // after both CC (step 2) and PC (step 3) are assigned; using CC-or-PC
1956            // lifts fill toward the corpus (~82%) vs only CC-bearing lines (~24%).
1957            // Flag-gated by `transactions.business_unit_dimension` (default-on).
1958            if line.business_unit.is_none() && self.config.business_unit_dimension.unwrap_or(true) {
1959                if let Some(dim) = line
1960                    .cost_center
1961                    .as_deref()
1962                    .or(line.profit_center.as_deref())
1963                {
1964                    line.business_unit = Some(Self::business_unit_for_dimension(dim));
1965                }
1966            }
1967
1968            // 4. trading_partner: SP3.9 — inherit JE-level trading_partner from
1969            // the header. The header was populated once per JE in generate();
1970            // all lines share the same value to match corpus SAP semantics.
1971            // The is_none() guard preserves TP values already set by the P2P/O2C
1972            // document chain manager (also JE-level, different code path).
1973            if line.trading_partner.is_none() {
1974                line.trading_partner = entry.header.trading_partner.clone();
1975            }
1976
1977            // 5. line_text: fall back to header_text if not already set
1978            if line.line_text.is_none() {
1979                line.line_text = header_text.clone();
1980            }
1981
1982            // 6. value_date: set to posting_date for AR/AP accounts
1983            if line.value_date.is_none()
1984                && (line.gl_account.starts_with("1100") || line.gl_account.starts_with("2000"))
1985            {
1986                line.value_date = Some(posting_date);
1987            }
1988
1989            // 7. assignment: set to vendor/customer reference for AP/AR lines
1990            if line.assignment.is_none() {
1991                if line.gl_account.starts_with("2000") {
1992                    // AP line - use vendor reference from header
1993                    if let Some(ref ht) = header_text {
1994                        // Try to extract vendor ID from header text patterns like "... - V-001"
1995                        if let Some(vendor_part) = ht.rsplit(" - ").next() {
1996                            if vendor_part.starts_with("V-")
1997                                || vendor_part.starts_with("VENDOR")
1998                                || vendor_part.starts_with("Vendor")
1999                            {
2000                                line.assignment = Some(vendor_part.to_string());
2001                            }
2002                        }
2003                    }
2004                } else if line.gl_account.starts_with("1100") {
2005                    // AR line - use customer reference from header
2006                    if let Some(ref ht) = header_text {
2007                        if let Some(customer_part) = ht.rsplit(" - ").next() {
2008                            if customer_part.starts_with("C-")
2009                                || customer_part.starts_with("CUST")
2010                                || customer_part.starts_with("Customer")
2011                            {
2012                                line.assignment = Some(customer_part.to_string());
2013                            }
2014                        }
2015                    }
2016                }
2017            }
2018        }
2019    }
2020
2021    /// Generate a single journal entry.
2022    pub fn generate(&mut self) -> JournalEntry {
2023        debug!(
2024            count = self.count,
2025            companies = self.companies.len(),
2026            start_date = %self.start_date,
2027            end_date = %self.end_date,
2028            "Generating journal entry"
2029        );
2030
2031        // Check if we're in a batch - if so, generate a batched entry
2032        if let Some(ref state) = self.batch_state {
2033            if state.remaining > 0 {
2034                return self.generate_batched_entry();
2035            }
2036        }
2037
2038        // SOTA-5: with a small probability, emit a reversal/correction of a
2039        // recent JE instead of a fresh one (a process auditors look for).
2040        if let Some(rev) = self.maybe_generate_reversal() {
2041            return rev;
2042        }
2043
2044        // SOTA-6: with a small probability, emit a large allocation/assessment
2045        // batch (the corpus lines-per-JE tail) instead of a fresh JE.
2046        if let Some(alloc) = self.maybe_generate_allocation_batch() {
2047            return alloc;
2048        }
2049
2050        // SP6 — Lazy-init the MD resolver on the first call. Rebuilding once
2051        // per run is sufficient; pools are stable after master-data generation.
2052        if self.md_resolver.companies.is_empty()
2053            && self.md_resolver.persons.is_empty()
2054            && self.md_resolver.patients.is_empty()
2055        {
2056            self.refresh_md_resolver();
2057        }
2058
2059        self.count += 1;
2060
2061        // Generate deterministic document ID
2062        let document_id = self.generate_deterministic_uuid();
2063
2064        // SP3.5c — Lazy temporal-sampler date draw.
2065        //
2066        // When priors are loaded the IET path (SP3 T11) will immediately replace
2067        // this value, so drawing from the temporal sampler here wastes one RNG
2068        // advance on the sampler's internal stream AND makes the temporal-sampler
2069        // variance contribute to the merged date sequence even though the IET
2070        // sampler is meant to dominate.
2071        //
2072        // Fix: only draw from the temporal sampler now when no priors are loaded.
2073        // The IET block sets `posting_date` unconditionally when priors are Some;
2074        // the active-window fallback (SP3 T14) has its own sample_date call and is
2075        // unaffected by this change.
2076        //
2077        // Priors-absent path: byte-identical to v5.13 — the draw and business-day
2078        // snap are performed exactly as before.
2079        let mut posting_date = if self.loaded_priors.is_none() {
2080            let mut d = self
2081                .temporal_sampler
2082                .sample_date(self.start_date, self.end_date);
2083            // Adjust posting date to be a business day if business day calculator is configured
2084            if let Some(ref calc) = self.business_day_calculator {
2085                if !calc.is_business_day(d) {
2086                    d = calc.next_business_day(d, false);
2087                    if d > self.end_date {
2088                        d = calc.prev_business_day(self.end_date, true);
2089                    }
2090                }
2091            }
2092            d
2093        } else {
2094            // Priors-loaded path: IET block (below) will set the real date.
2095            // Use start_date as a zero-cost placeholder — it is always overwritten.
2096            self.start_date
2097        };
2098
2099        // Select company using weighted selector
2100        let company_code = self.company_selector.select(&mut self.rng).to_string();
2101
2102        // v4.1.0+: draw a single (u, v) pair from the copula — cached for
2103        // both the amount adjustment (u) and the line-count shift (v).
2104        // None when no copula is configured.
2105        let copula_uv: Option<(f64, f64)> =
2106            self.correlation_copula.as_mut().map(|cop| cop.sample());
2107
2108        // Sample line item specification. When a copula is configured,
2109        // v drives line-count via a quantile-preserving map: integer
2110        // count `2 + floor(v * 10)` gives range [2, 11] evenly spaced
2111        // in v, so rank(v) == rank(line_count).
2112        //
2113        // v4.1.6+: upgraded from the v3.5.4 nudge (shift around
2114        // independently-drawn count) to true rank-preserving quantile
2115        // inversion, so empirical Kendall-τ now matches copula theory.
2116        let mut line_spec = self.line_sampler.sample();
2117        if let Some((_u, v)) = copula_uv {
2118            let new_total = 2 + ((v * 10.0).floor() as usize).min(9);
2119            let old_debit = line_spec.debit_count.max(1);
2120            let old_credit = line_spec.credit_count.max(1);
2121            let new_debit = (new_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2122                .round() as usize;
2123            let new_debit = new_debit.clamp(1, new_total - 1);
2124            let new_credit = new_total - new_debit;
2125            line_spec.total_count = new_total;
2126            line_spec.debit_count = new_debit;
2127            line_spec.credit_count = new_credit;
2128        }
2129
2130        // SOTA-10 (#138): optional hard cap on total lines per JE — tames the
2131        // monster outliers (synth max 2133 vs corpus 924). Scales debit + credit
2132        // proportionally so balance is preserved.
2133        if let Some(cap) = self.config.lines_per_je_cap {
2134            let cap = cap.max(2);
2135            let total = line_spec.debit_count + line_spec.credit_count;
2136            if total > cap {
2137                let new_debit =
2138                    ((line_spec.debit_count as f64 / total as f64) * cap as f64).round() as usize;
2139                let new_debit = new_debit.clamp(1, cap - 1);
2140                let new_credit = cap - new_debit;
2141                line_spec.total_count = cap;
2142                line_spec.debit_count = new_debit;
2143                line_spec.credit_count = new_credit;
2144            }
2145        }
2146
2147        // Determine source type using full 4-way distribution
2148        let source = self.select_source();
2149        let is_automated = matches!(
2150            source,
2151            TransactionSource::Automated | TransactionSource::Recurring
2152        );
2153
2154        // SP3.6 — when priors are loaded, sample a canonical SAP source code
2155        // from the bundle's source-mix distribution.  This is independent of
2156        // the `TransactionSource` enum (which controls manual/automated semantics)
2157        // and is written to `header.sap_source_code`, then emitted in the CSV
2158        // `source` column in place of the generic label.
2159        let sap_source_code: Option<String> = self.sample_sap_source_code();
2160        // SOTA-8: stash the current JE's SAP source so select_*_account can consult
2161        // the per-source Dirichlet pool. Cleared at the end of this generate() call.
2162        self.current_je_source = sap_source_code.clone();
2163
2164        // Select business process
2165        let business_process = self.select_business_process();
2166
2167        // SP3 T11 — IET-driven posting-date override.
2168        //
2169        // When priors are loaded, replace the uniform temporal-sampler date
2170        // with one derived from the per-Source inter-event-time prior.  We
2171        // accumulate IET samples (in fractional days) per source code and
2172        // map the accumulated offset onto [start_date, end_date].
2173        //
2174        // v5.30 B1 (#152): route through `sap_source_code` (the actual emitted
2175        // source) rather than `doc_type` (only 5 values: KR/DR/SA/HR/AA from
2176        // document_type_for_process). Before B1, `sample_next(&doc_type, …)`
2177        // hit the IET sampler with only 5 distinct keys for all 526 emitted
2178        // sources, leaving the per-source lag-1 autocorr machinery in
2179        // ConditionalIETSampler **unwired** for 521 of the sources. The
2180        // Sajja P1 autocorr DR of 105.9× (worst sub-metric on the A1 eval)
2181        // is the direct downstream consequence. Switching to source-keyed
2182        // sampling actually exercises the per-source priors.
2183        //
2184        // The None path is untouched: `posting_date` from the temporal sampler
2185        // above is used as-is.
2186        {
2187            // Split-borrow: four distinct struct fields accessed simultaneously.
2188            let priors_opt = &mut self.loaded_priors;
2189            let rng_ref = &mut self.rng;
2190            let iet_accum_ref = &mut self.iet_day_accum;
2191            let burst_ref = &mut self.iet_burst_remaining;
2192            if let Some(priors) = priors_opt {
2193                // Prefer the per-row SAP source code (populated when priors
2194                // load via SP3.6's source-mix sampler). Fall back to doc_type
2195                // for the rare branch where source-code sampling returned None.
2196                let iet_key = sap_source_code
2197                    .as_deref()
2198                    .unwrap_or_else(|| Self::document_type_for_process(business_process))
2199                    .to_string();
2200                let period_days = (self.end_date - self.start_date).num_days().max(1) as f64;
2201
2202                // v5.30 B1 Phase 2 — burst clustering.
2203                //
2204                // The lag-1 Gaussian-copula path in ConditionalIETSampler
2205                // (conditional_iet.rs:176-203) silently falls back to
2206                // independent sampling whenever the per-source |ρ| < 0.1.
2207                // The bundled SP3 priors' per-source lag1_autocorr values are
2208                // mostly below that threshold (corpus has only weak
2209                // day-resolution autocorrelation), so the coupling never
2210                // fires and the within-source IET autocorr matches the
2211                // noise floor — the Sajja P1 autocorr 105.9× DR before A3,
2212                // 62.84× after A3, with B1 Phase 1 (source-keying) producing
2213                // no measurable lift.
2214                //
2215                // This block bypasses the |ρ| < 0.1 gate by emitting
2216                // **forced** short-IET bursts for each source.  When
2217                // a sampled IET is short (< BURST_THRESHOLD_DAYS) and a
2218                // probability gate fires (BURST_PROB), the next N events
2219                // for that source (N drawn from BURST_LEN_MIN..=BURST_LEN_MAX)
2220                // emit IETs in [0.25, 1.5] days regardless of what the sampler returns.
2221                //
2222                // Effect on within-source IET autocorrelation: events 1..k
2223                // of a burst have tightly clustered IETs drawn from
2224                // [0.25, 1.5] days (mean 0.875 if the draw is uniform) →
2225                // lag-1 autocorr lifts directly. Inter-burst IETs are still
2226                // sampled normally so the macro distribution stays close to the prior.
2227                const BURST_THRESHOLD_DAYS: f64 = 2.0;
2228                const BURST_PROB: f64 = 0.30;
2229                const BURST_LEN_MIN: u8 = 2;
2230                const BURST_LEN_MAX: u8 = 4;
2231
2232                let sampled_iet = priors.iet_sampler.sample_next(&iet_key, rng_ref).max(0.001);
2233
2234                // Check if we're inside an active burst for this source.
2235                let remaining = burst_ref.get(&iet_key).copied().unwrap_or(0);
2236                let iet = if remaining > 0 {
2237                    // Active burst: emit a short IET regardless of sampler.
2238                    burst_ref.insert(iet_key.clone(), remaining - 1);
2239                    rng_ref.random_range(0.25..=1.5)
2240                } else if sampled_iet < BURST_THRESHOLD_DAYS
2241                    && rng_ref.random_range(0.0..1.0) < BURST_PROB
2242                {
2243                    // Start a new burst: this event uses the sampled IET,
2244                    // and the next `len` events for this source (`len` drawn
2245                    // from BURST_LEN_MIN..=BURST_LEN_MAX) will emit short IETs.
2246                    let len = rng_ref.random_range(BURST_LEN_MIN..=BURST_LEN_MAX);
2247                    burst_ref.insert(iet_key.clone(), len);
2248                    sampled_iet
2249                } else {
2250                    sampled_iet
2251                };
2252
2253                let accum = iet_accum_ref.entry(iet_key).or_insert(0.0);
2254                *accum += iet;
2255                // Wrap within period so we never exceed the generation window.
2256                if *accum >= period_days {
2257                    *accum %= period_days;
2258                }
2259                let day_offset =
2260                    (*accum as i64).clamp(0, (self.end_date - self.start_date).num_days());
2261                posting_date = self.start_date + chrono::Duration::days(day_offset);
2262                // Re-apply business-day snap so the IET date still lands on a
2263                // working day (matches the business_day_calculator logic above).
2264                if let Some(ref calc) = self.business_day_calculator {
2265                    if !calc.is_business_day(posting_date) {
2266                        posting_date = calc.next_business_day(posting_date, false);
2267                        if posting_date > self.end_date {
2268                            posting_date = calc.prev_business_day(self.end_date, true);
2269                        }
2270                    }
2271                }
2272            } // end if let Some(priors)
2273        } // end split-borrow scope
2274
2275        // SP3 T14 — active-window gating.
2276        //
2277        // After the IET-driven date is computed, check whether this Source is
2278        // still in its active window for the resulting day.  If the prior says
2279        // the Source has "gone quiet" (e.g. a vendor that stopped trading), we
2280        // fall back to the temporal-sampler date so the JE still emits but is
2281        // no longer anchored to the IET timeline for this source.
2282        //
2283        // In a day-loop architecture this would be a `continue`; here, the
2284        // equivalent is to revert `posting_date` to the original temporal-
2285        // sampler sample so downstream logic sees a plausible date.
2286        //
2287        // The None path is untouched.
2288        if let Some(ref priors) = self.loaded_priors {
2289            let doc_type = Self::document_type_for_process(business_process);
2290            let day_in_period = (posting_date - self.start_date).num_days();
2291            let active = match &priors.multi_segment_window {
2292                Some(msw) => msw.is_active(doc_type, day_in_period),
2293                None => priors.active_window.is_active(doc_type, day_in_period),
2294            };
2295            if !active {
2296                // Source is outside its active window: fall back to a fresh
2297                // temporal-sampler draw.  (SP3.5c: the up-front temporal draw
2298                // is skipped when priors are loaded, so we always re-sample
2299                // here in the fallback path rather than reusing a cached value.)
2300                posting_date = self
2301                    .temporal_sampler
2302                    .sample_date(self.start_date, self.end_date);
2303                if let Some(ref calc) = self.business_day_calculator {
2304                    if !calc.is_business_day(posting_date) {
2305                        posting_date = calc.next_business_day(posting_date, false);
2306                        if posting_date > self.end_date {
2307                            posting_date = calc.prev_business_day(self.end_date, true);
2308                        }
2309                    }
2310                }
2311            }
2312        }
2313
2314        // SP3 T12 — lines-per-JE override from prior histogram.
2315        //
2316        // When priors are loaded, replace `line_spec` totals with a sample
2317        // drawn from the Source-conditional histogram (falling back to the
2318        // overall histogram when the document-type is unknown).  `.max(2)`
2319        // guarantees every JE has at least one debit + one credit line.
2320        // The None path leaves `line_spec` from the copula / line-sampler
2321        // cascade above completely unchanged.
2322        if let Some(ref priors) = self.loaded_priors {
2323            let doc_type = Self::document_type_for_process(business_process);
2324            let hist = priors
2325                .lines_per_je
2326                .by_source
2327                .get(doc_type)
2328                .unwrap_or(&priors.lines_per_je.overall);
2329            let n_total = (hist.sample_bucket(&mut self.rng) as usize).max(2);
2330            let old_debit = line_spec.debit_count.max(1);
2331            let old_credit = line_spec.credit_count.max(1);
2332            let new_debit = (n_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2333                .round() as usize;
2334            let new_debit = new_debit.clamp(1, n_total - 1);
2335            line_spec.total_count = n_total;
2336            line_spec.debit_count = new_debit;
2337            line_spec.credit_count = n_total - new_debit;
2338        }
2339
2340        // Determine if this is a fraudulent transaction (v5.30 B3 — per-process
2341        // rates pass `business_process` through to honor fraud.per_process_rates
2342        // overrides when configured)
2343        let fraud_type = self.determine_fraud(business_process);
2344        let is_fraud = fraud_type.is_some();
2345
2346        // Sample time based on source
2347        let time = self.temporal_sampler.sample_time(!is_automated);
2348        let created_at = posting_date.and_time(time).and_utc();
2349
2350        // Select user from pool or generate generic
2351        let (created_by, user_persona) = self.select_user(is_automated);
2352
2353        // Create header with deterministic UUID
2354        let mut header =
2355            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
2356        header.created_at = created_at;
2357        header.source = source;
2358        header.sap_source_code = sap_source_code;
2359
2360        // SP3.9 — JE-level trading partner. Draw once per JE; all lines
2361        // inherit it. Corpus SAP semantics are one TP per document.
2362        // SP3.12 — TP motif sampler: bias toward cluster-mates of the
2363        // previously-drawn TP on the same source to build triangle structure.
2364        // Split-borrow: sap_source_code was moved into header above, so clone
2365        // the code out before the mutable borrow on self.loaded_priors.
2366        // (sap_source_code is cloned again below for the SP4.5 user-persona lookup)
2367        {
2368            let code_opt = header.sap_source_code.clone();
2369            if let Some(ref code) = code_opt {
2370                let rng_ref = &mut self.rng;
2371                // SP3.12: resolve TP motif neighbors from the last TP on this source.
2372                // We read last_tp_by_source (shared ref) before the mutable borrow
2373                // on loaded_priors.  The update happens after the block.
2374                let tp_neighbors: Vec<String> = if let Some(ref priors) = self.loaded_priors {
2375                    if let Some(ref motifs) = priors.tp_motif_sampler {
2376                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
2377                            motifs.neighbors(last_tp).to_vec()
2378                        } else {
2379                            Vec::new()
2380                        }
2381                    } else {
2382                        Vec::new()
2383                    }
2384                } else {
2385                    Vec::new()
2386                };
2387                let tp_share_prob: f64 = if let Some(ref priors) = self.loaded_priors {
2388                    if let Some(ref motifs) = priors.tp_motif_sampler {
2389                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
2390                            motifs.should_share(last_tp)
2391                        } else {
2392                            0.0
2393                        }
2394                    } else {
2395                        0.0
2396                    }
2397                } else {
2398                    0.0
2399                };
2400
2401                if let Some(ref mut priors) = self.loaded_priors {
2402                    // SP3.12: if the motif roll fires AND the distribution
2403                    // supports one of the neighbor TP values, draw from that
2404                    // restricted set.  Otherwise fall through to the marginal.
2405                    let tp = if !tp_neighbors.is_empty()
2406                        && tp_share_prob > 0.0
2407                        && rng_ref.random_range(0.0..1.0) < tp_share_prob
2408                    {
2409                        // Keep only the neighbors that the per-source TP
2410                        // distribution actually knows about, then sample among
2411                        // them using their weights from that per-source marginal.
2412                        use datasynth_core::distributions::behavioral_priors::CategoricalDistribution;
2413                        let filtered: std::collections::BTreeMap<String, f64> = priors
2414                            .per_source_attribute
2415                            .as_ref()
2416                            .and_then(|psa| psa.conditional(code, "trading_partner"))
2417                            .map(|dist| {
2418                                dist.probabilities
2419                                    .iter()
2420                                    .filter(|(v, _)| tp_neighbors.contains(v))
2421                                    .map(|(v, p)| (v.clone(), *p))
2422                                    .collect()
2423                            })
2424                            .unwrap_or_default();
2425                        if filtered.is_empty() {
2426                            priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2427                        } else {
2428                            let neighbour_dist = CategoricalDistribution {
2429                                probabilities: filtered,
2430                                n: 0, // unused in sample()
2431                            };
2432                            neighbour_dist.sample(rng_ref).or_else(|| {
2433                                priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2434                            })
2435                        }
2436                    } else {
2437                        priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
2438                    };
2439                    header.trading_partner = tp;
2440                }
2441                // SP3.12: record the drawn TP so the next JE on this source
2442                // can use it as the motif anchor.
2443                if let Some(ref tp) = header.trading_partner {
2444                    self.last_tp_by_source.insert(code.clone(), tp.clone());
2445                }
2446            }
2447        }
2448
2449        // SP4.5 — user-persona prior: when a corpus prior with user data is
2450        // loaded, override `created_by` with a user characteristic of the drawn
2451        // source, and bias `created_at` hour-of-day from the user's density.
2452        // Falls back transparently to `created_by` / `created_at` already set above.
2453        let (created_by, created_at) = {
2454            let sap_code_for_user = header.sap_source_code.clone();
2455            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
2456                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
2457                    let new_created_at = if let Some((hour, _)) =
2458                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
2459                    {
2460                        let base = header.created_at;
2461                        base.date_naive()
2462                            .and_hms_opt(hour, 0, 0)
2463                            .map(|naive| naive.and_utc())
2464                            .unwrap_or(base)
2465                    } else {
2466                        header.created_at
2467                    };
2468                    (uid, new_created_at)
2469                } else {
2470                    (created_by, header.created_at)
2471                }
2472            } else {
2473                (created_by, header.created_at)
2474            }
2475        };
2476
2477        header.created_by = created_by;
2478        header.created_at = created_at;
2479        header.user_persona = user_persona;
2480        header.business_process = Some(business_process);
2481        header.document_type = Self::document_type_for_process(business_process).to_string();
2482        header.is_fraud = is_fraud;
2483        header.fraud_type = fraud_type;
2484
2485        // --- ISA 240 audit flags ---
2486        let is_manual = matches!(source, TransactionSource::Manual);
2487        header.is_manual = is_manual;
2488
2489        // Determine source_system based on manual vs automated.
2490        //
2491        // Real ERPs typically expose 20+ distinct provenance codes per
2492        // company (one per module + sub-module + interface). The taxonomy
2493        // below is a strict superset of the legacy {manual, spreadsheet,
2494        // SAP-FI, SAP-MM, SAP-SD, interface, SAP-HR} codes so downstream
2495        // consumers that filter by prefix (e.g. `starts_with("SAP-")`)
2496        // continue to work.
2497        //
2498        // Contract preserved by the generator-level audit assertion in
2499        // `test_isa240_audit_flags_populated`:
2500        //   - manual entries → starts_with("manual") || starts_with("spreadsheet")
2501        //   - automated entries → does NOT start with "manual"/"spreadsheet"
2502        header.source_system = Self::pick_source_system(&mut self.rng, is_manual, business_process);
2503
2504        // is_post_close: entry is in the last month of the configured period
2505        // and the posting date falls after the 25th (simulating close cutoff)
2506        let is_post_close = posting_date.month() == self.end_date.month()
2507            && posting_date.year() == self.end_date.year()
2508            && posting_date.day() > 25;
2509        header.is_post_close = is_post_close;
2510
2511        // created_date: for manual entries, same day as posting; for automated,
2512        // 0-3 days before posting_date
2513        let created_date = if is_manual {
2514            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second())
2515        } else {
2516            let lag_days = self.rng.random_range(0i64..=3);
2517            let created_naive_date = posting_date
2518                .checked_sub_signed(chrono::Duration::days(lag_days))
2519                .unwrap_or(posting_date);
2520            created_naive_date.and_hms_opt(
2521                self.rng.random_range(8u32..=17),
2522                self.rng.random_range(0u32..=59),
2523                self.rng.random_range(0u32..=59),
2524            )
2525        };
2526        header.created_date = created_date;
2527
2528        // Generate description context
2529        let mut context =
2530            DescriptionContext::with_period(posting_date.month(), posting_date.year());
2531
2532        // Add vendor/customer context based on business process
2533        match business_process {
2534            BusinessProcess::P2P => {
2535                if let Some(vendor) = self.vendor_pool.random_vendor(&mut self.rng) {
2536                    context.vendor_name = Some(vendor.name.clone());
2537                }
2538            }
2539            BusinessProcess::O2C => {
2540                if let Some(customer) = self.customer_pool.random_customer(&mut self.rng) {
2541                    context.customer_name = Some(customer.name.clone());
2542                }
2543            }
2544            _ => {}
2545        }
2546
2547        // Generate header text if enabled.
2548        // SP6 — Try text-taxonomy prior (sample_header_template) first,
2549        // then the built-in DescriptionGenerator.
2550        if self.template_config.descriptions.generate_header_text {
2551            let priors_header = if let Some(src) = header.sap_source_code.as_deref() {
2552                if let Some(p) = self.loaded_priors.as_ref() {
2553                    // SP6: text-taxonomy header pool
2554                    p.sample_header_template(src, &mut self.md_resolver, &mut self.rng)
2555                } else {
2556                    None
2557                }
2558            } else {
2559                None
2560            };
2561            header.header_text = Some(priors_header.unwrap_or_else(|| {
2562                self.description_generator.generate_header_text(
2563                    business_process,
2564                    &context,
2565                    &mut self.rng,
2566                )
2567            }));
2568        }
2569
2570        // Generate reference if enabled.
2571        // SP4.7 — when priors are loaded and the bundle carries a reference-format
2572        // template for the current SAP source code, sample from that distribution
2573        // instead of the fixed `ReferenceGenerator` template.  The priors path is
2574        // preferred because it produces corpus format patterns; the existing
2575        // generator is the fallback for sources not covered by the bundle.
2576        if self.template_config.references.generate_references {
2577            let priors_ref = header.sap_source_code.as_deref().and_then(|src| {
2578                self.loaded_priors
2579                    .as_ref()
2580                    .and_then(|p| p.sample_reference(src, &mut self.rng))
2581            });
2582            header.reference = Some(priors_ref.unwrap_or_else(|| {
2583                self.reference_generator
2584                    .generate_for_process_year(business_process, posting_date.year())
2585            }));
2586        }
2587
2588        // Derive typed source document from reference prefix
2589        header.source_document = header
2590            .reference
2591            .as_deref()
2592            .and_then(DocumentRef::parse)
2593            .or_else(|| {
2594                if header.source == TransactionSource::Manual {
2595                    Some(DocumentRef::Manual)
2596                } else {
2597                    None
2598                }
2599            });
2600
2601        // Generate line items
2602        let mut entry = JournalEntry::new(header);
2603
2604        // Generate amount - use fraud pattern if this is a fraudulent transaction.
2605        // Non-fraud path prefers the v3.4.0 advanced sampler when configured; fraud
2606        // patterns always use the legacy sampler because they target specific
2607        // thresholds (round numbers, just-under-approval amounts) that are
2608        // orthogonal to mixture models.
2609        let base_amount = if let Some(ft) = fraud_type {
2610            let pattern = self.fraud_type_to_amount_pattern(ft);
2611            self.amount_sampler.sample_fraud(pattern)
2612        } else if let Some(ref mut adv) = self.advanced_amount_sampler {
2613            adv.sample_decimal()
2614        } else {
2615            self.amount_sampler.sample()
2616        };
2617        // v3.5.3+: if a conditional-amount override is configured and
2618        // the JE is non-fraud, re-sample the amount from the conditional
2619        // distribution using the computed context. Fraud entries bypass
2620        // this path to preserve fraud-pattern semantics (as with the
2621        // advanced sampler cascade above).
2622        let base_amount = if fraud_type.is_none() {
2623            // Compute input context BEFORE taking &mut on the sampler
2624            // to avoid borrow-checker conflict with the immutable
2625            // `conditional_input_value` call.
2626            let input = self.conditional_input_value(posting_date);
2627            if let Some(ref mut cond) = self.conditional_amount_override {
2628                cond.sample_decimal(input)
2629            } else {
2630                base_amount
2631            }
2632        } else {
2633            base_amount
2634        };
2635
2636        // SP4.3 — when priors are loaded, try to replace the base_amount with
2637        // a draw from the per-source log-normal conditional.  This step only
2638        // fires for non-fraud JEs (fraud entries must preserve fraud-pattern
2639        // semantics).  We use the source-marginal (gl_prefix = "") as the
2640        // initial lookup; per-class refinement requires knowing the GL account
2641        // which is sampled after the amount in some paths, so we defer that
2642        // to a follow-up sprint.  Balance preservation is maintained because
2643        // the splitter below uses `total_amount` unchanged.
2644        //
2645        // W7.M — autocorr mitigation: ~30 % of priors-enabled draws bypass the
2646        // per-source conditional and draw from the global marginal sampler.
2647        // This loosens the per-source amount-sequence correlation that SP4.3's
2648        // conditional was over-tightening (v5.23 baseline: Source P1 Autocorr
2649        // +750 %, TP P1 Autocorr +101 %).  Proven pattern from SP3.12 W2
2650        // TP-clustering mitigation.
2651        //
2652        // Split-borrow: `loaded_priors` and `rng` are distinct struct fields so
2653        // the compiler allows simultaneous mutable borrows.
2654        // SP5.3 — intermediate tune from 0.20 to 0.25 between v5.24 (0.30 →
2655        // autocorr 1.53, over-corrected) and v5.25 (0.20 → autocorr 3.74,
2656        // under-corrected). Targets the trade-off sweet spot.
2657        const PRIORS_AMOUNT_BYPASS_SHARE: f64 = 0.25;
2658        let base_amount = if fraud_type.is_none() {
2659            if let Some(src) = entry.header.sap_source_code.as_deref() {
2660                let src_owned = src.to_string();
2661                // Gate: skip the conditional ~25 % of the time to loosen
2662                // per-source amount sequence correlation without overshooting.
2663                let use_conditional = self.loaded_priors.is_some()
2664                    && self.rng.random_range(0.0..1.0) >= PRIORS_AMOUNT_BYPASS_SHARE;
2665                if use_conditional {
2666                    let priors_ref = &mut self.loaded_priors;
2667                    let rng_ref = &mut self.rng;
2668                    if let Some(priors) = priors_ref {
2669                        priors
2670                            .sample_amount_for_source(&src_owned, "", rng_ref)
2671                            .and_then(|v| {
2672                                if v.is_finite() && v > 0.0 {
2673                                    Decimal::from_f64_retain(v)
2674                                } else {
2675                                    None
2676                                }
2677                            })
2678                            .unwrap_or(base_amount)
2679                    } else {
2680                        base_amount
2681                    }
2682                } else {
2683                    base_amount
2684                }
2685            } else {
2686                base_amount
2687            }
2688        } else {
2689            base_amount
2690        };
2691
2692        // v4.1.6+: if a copula is configured AND an advanced amount
2693        // sampler with a ppf is available, use true rank-preserving
2694        // inverse-CDF sampling — amount is drawn DIRECTLY from the
2695        // sampler's quantile at `u`, replacing (not nudging) the
2696        // independently-drawn base_amount. This makes empirical
2697        // Kendall-τ match the copula's theoretical τ.
2698        //
2699        // Fallback for copula-without-advanced-sampler: keep the
2700        // v4.1.0 log-scale multiplier nudge (observable correlation,
2701        // diluted magnitude).
2702        let base_amount = if fraud_type.is_none() {
2703            if let Some((u, _v)) = copula_uv {
2704                if let Some(ref adv) = self.advanced_amount_sampler {
2705                    adv.ppf_decimal(u)
2706                } else {
2707                    let log_mult = 4.0 * (u - 0.5);
2708                    let adjusted = base_amount.to_f64().unwrap_or(1.0) * log_mult.exp();
2709                    Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2710                }
2711            } else {
2712                base_amount
2713            }
2714        } else {
2715            base_amount
2716        };
2717
2718        // Apply temporal drift if configured
2719        let drift_adjusted_amount = {
2720            let drift = self.get_drift_adjustments(posting_date);
2721            if drift.amount_mean_multiplier != 1.0 {
2722                // Apply drift multiplier (includes seasonal factor if enabled)
2723                let multiplier = drift.amount_mean_multiplier * drift.seasonal_factor;
2724                let adjusted = base_amount.to_f64().unwrap_or(1.0) * multiplier;
2725                Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2726            } else {
2727                base_amount
2728            }
2729        };
2730
2731        // Apply human variation to amounts for non-automated transactions
2732        let total_amount = if is_automated {
2733            drift_adjusted_amount // Automated systems use exact amounts
2734        } else {
2735            self.apply_human_variation(drift_adjusted_amount)
2736        };
2737
2738        // SP3 T13 — derive the document-type key once for use in all
2739        // fanout-sampler lookups below.  Computed unconditionally so it is
2740        // available for both debit and credit loops without re-deriving.
2741        let doc_type_for_fanout = Self::document_type_for_process(business_process).to_string();
2742
2743        // SP3.3 — resolve cross-entity motif neighbors for this fanout entity.
2744        // We capture an owned Vec<String> here so that the shared borrow on
2745        // `self.loaded_priors` is released before the subsequent `&mut` borrow
2746        // on `fanout_samplers`.
2747        let (gl_neighbor_vec, gl_share_prob): (Vec<String>, f64) =
2748            if let Some(priors) = &self.loaded_priors {
2749                if let Some(motifs) = &priors.cross_entity_motifs {
2750                    (
2751                        motifs.neighbors(&doc_type_for_fanout).to_vec(),
2752                        motifs.should_share(&doc_type_for_fanout),
2753                    )
2754                } else {
2755                    (Vec::new(), 0.0)
2756                }
2757            } else {
2758                (Vec::new(), 0.0)
2759            };
2760
2761        // SOTA-1: recurring/standard-journal templates. On the no-priors path,
2762        // reuse a cached account archetype for this (company, doc-type, counts)
2763        // with high probability so standard postings recur (and a hot account
2764        // subset dominates). Reuse overrides only the line account (set after
2765        // text/RNG below), so amounts/counts/dates stay byte-identical; fresh
2766        // archetypes are captured + cached after the lines are built.
2767        let reuse_archetype = self.pick_recurring_archetype(
2768            &entry.header.company_code,
2769            &doc_type_for_fanout,
2770            line_spec.debit_count,
2771            line_spec.credit_count,
2772        );
2773        let mut fresh_debit_accts: Vec<String> = Vec::new();
2774        let mut fresh_credit_accts: Vec<String> = Vec::new();
2775        // SOTA-8: hoisted so both the debit and credit loops + their SOTA-1 archetype
2776        // override blocks share the same flag.
2777        let sota8_active = self.config.source_conditional_account_pair.enabled;
2778
2779        // Generate debit lines
2780        let debit_amounts = self
2781            .amount_sampler
2782            .sample_summing_to(line_spec.debit_count, total_amount);
2783        for (i, amount) in debit_amounts.into_iter().enumerate() {
2784            // SP3 T13 — GL Account fanout: when priors are loaded, pick the
2785            // account from the BipartiteFanoutSampler keyed "GLAccount" for
2786            // this Source.  Split-borrows let us hold &mut loaded_priors and
2787            // &mut rng at the same time (distinct struct fields).
2788            // SP3 T13 — GL Account fanout for debit lines.
2789            // Pre-compute the fallback before the split-borrow scope so that
2790            // `select_debit_account` (which takes `&mut self`) does not conflict
2791            // with the concurrent borrow of `loaded_priors` and `rng`.
2792            let debit_fallback = self.select_debit_account().account_number.clone();
2793            // SOTA-8: when enabled, the per-source Dirichlet pool (which `select_debit_account`
2794            // has already consulted via try_cond_pick_account_number) takes precedence over the
2795            // SP3/SP4 priors-driven path so the user's explicit source-conditional knob actually
2796            // governs the source-conditional account distribution. `sota8_active` is hoisted
2797            // above this scope so the credit loop can see it too.
2798            let account_number = if sota8_active {
2799                debit_fallback
2800            } else {
2801                let priors_opt = &mut self.loaded_priors;
2802                let rng_ref = &mut self.rng;
2803                if let Some(priors) = priors_opt {
2804                    // SP4.6 — role-aware GL account selection: try (source, "DR")
2805                    // conditional first, then fall back to SP3.7 source-marginal,
2806                    // then to the fanout sampler, then to the default debit account.
2807                    let sp46_gl = entry
2808                        .header
2809                        .sap_source_code
2810                        .as_deref()
2811                        .and_then(|code| priors.sample_gl_for_source_role(code, "DR", rng_ref));
2812                    if let Some(gl) = sp46_gl {
2813                        gl
2814                    } else {
2815                        // SP3.7 — try per-source marginal GL account.
2816                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2817                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2818                        });
2819                        if let Some(gl) = sp37_gl {
2820                            gl
2821                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2822                            // SP3.3: prefer neighbor-used buckets when motifs are available.
2823                            sampler.pick_for_with_neighbors(
2824                                &doc_type_for_fanout,
2825                                &gl_neighbor_vec,
2826                                gl_share_prob,
2827                                rng_ref,
2828                            )
2829                        } else {
2830                            debit_fallback
2831                        }
2832                    }
2833                } else {
2834                    debit_fallback
2835                }
2836            };
2837            let mut line = JournalEntryLine::debit(
2838                entry.header.document_id,
2839                (i + 1) as u32,
2840                account_number.clone(),
2841                amount,
2842            );
2843
2844            // Generate line text if enabled.
2845            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2846            if self.template_config.descriptions.generate_line_text {
2847                let src = entry.header.sap_source_code.as_deref();
2848                let priors_line = if let Some(s) = src {
2849                    if let Some(p) = self.loaded_priors.as_ref() {
2850                        let account_class = p
2851                            .coa_semantic
2852                            .as_ref()
2853                            .and_then(|c| c.accounts.get(&account_number))
2854                            .and_then(|a| a.account_class.as_deref())
2855                            .unwrap_or(
2856                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2857                            );
2858                        // SP6 text_taxonomy cascade
2859                        p.sample_line_template(
2860                            s,
2861                            account_class,
2862                            &mut self.md_resolver,
2863                            &mut self.rng,
2864                        )
2865                    } else {
2866                        None
2867                    }
2868                } else {
2869                    None
2870                };
2871                line.line_text = Some(priors_line.unwrap_or_else(|| {
2872                    self.description_generator.generate_line_text(
2873                        &account_number,
2874                        &context,
2875                        &mut self.rng,
2876                    )
2877                }));
2878            }
2879
2880            // SOTA-1: override the line's account with the reused archetype's
2881            // (RNG + text above are unchanged -> amounts/counts/dates stay
2882            // byte-identical); else capture the fresh account for caching.
2883            // SOTA-1 and SOTA-8 compose: SOTA-8 picks the FIRST archetype's accounts
2884            // from its per-source pool, then SOTA-1 caches + reuses them. Disabling
2885            // SOTA-1 under SOTA-8 actually *worsens* edge concentration — empirically
2886            // measured in Round 0 v4: edges/je 0.35 -> 0.82 when SOTA-1 was bypassed.
2887            if let Some((ref d, _)) = reuse_archetype {
2888                if let Some(a) = d.get(i) {
2889                    line.gl_account = a.clone();
2890                }
2891            } else if self.loaded_priors.is_none() {
2892                fresh_debit_accts.push(line.gl_account.clone());
2893            }
2894            entry.add_line(line);
2895        }
2896
2897        // Generate credit lines - use the SAME amounts to ensure balance
2898        let credit_amounts = self
2899            .amount_sampler
2900            .sample_summing_to(line_spec.credit_count, total_amount);
2901        for (i, amount) in credit_amounts.into_iter().enumerate() {
2902            // SP3 T13 — GL Account fanout for credit lines.
2903            let credit_fallback = self.select_credit_account().account_number.clone();
2904            // SOTA-8 precedence (mirror of the debit-side block above).
2905            let account_number = if sota8_active {
2906                credit_fallback
2907            } else {
2908                let priors_opt = &mut self.loaded_priors;
2909                let rng_ref = &mut self.rng;
2910                if let Some(priors) = priors_opt {
2911                    let sp46_gl = entry
2912                        .header
2913                        .sap_source_code
2914                        .as_deref()
2915                        .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
2916                    if let Some(gl) = sp46_gl {
2917                        gl
2918                    } else {
2919                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2920                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2921                        });
2922                        if let Some(gl) = sp37_gl {
2923                            gl
2924                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2925                            sampler.pick_for_with_neighbors(
2926                                &doc_type_for_fanout,
2927                                &gl_neighbor_vec,
2928                                gl_share_prob,
2929                                rng_ref,
2930                            )
2931                        } else {
2932                            credit_fallback
2933                        }
2934                    }
2935                } else {
2936                    credit_fallback
2937                }
2938            };
2939            let mut line = JournalEntryLine::credit(
2940                entry.header.document_id,
2941                (line_spec.debit_count + i + 1) as u32,
2942                account_number.clone(),
2943                amount,
2944            );
2945
2946            // Generate line text if enabled.
2947            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2948            if self.template_config.descriptions.generate_line_text {
2949                let src = entry.header.sap_source_code.as_deref();
2950                let priors_line = if let Some(s) = src {
2951                    if let Some(p) = self.loaded_priors.as_ref() {
2952                        let account_class = p
2953                            .coa_semantic
2954                            .as_ref()
2955                            .and_then(|c| c.accounts.get(&account_number))
2956                            .and_then(|a| a.account_class.as_deref())
2957                            .unwrap_or(
2958                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2959                            );
2960                        // SP6 text_taxonomy cascade
2961                        p.sample_line_template(
2962                            s,
2963                            account_class,
2964                            &mut self.md_resolver,
2965                            &mut self.rng,
2966                        )
2967                    } else {
2968                        None
2969                    }
2970                } else {
2971                    None
2972                };
2973                line.line_text = Some(priors_line.unwrap_or_else(|| {
2974                    self.description_generator.generate_line_text(
2975                        &account_number,
2976                        &context,
2977                        &mut self.rng,
2978                    )
2979                }));
2980            }
2981
2982            // SOTA-1: override the credit line's account with the reused
2983            // archetype's; else capture the fresh account for caching.
2984            // (Same compose-with-SOTA-8 rationale as the debit block.)
2985            if let Some((_, ref c)) = reuse_archetype {
2986                if let Some(a) = c.get(i) {
2987                    line.gl_account = a.clone();
2988                }
2989            } else if self.loaded_priors.is_none() {
2990                fresh_credit_accts.push(line.gl_account.clone());
2991            }
2992            entry.add_line(line);
2993        }
2994
2995        // SOTA-1: cache the freshly-selected archetype for future reuse so
2996        // standard postings recur (skipped when this JE reused one).
2997        if reuse_archetype.is_none() {
2998            self.cache_recurring_archetype(
2999                &entry.header.company_code,
3000                &doc_type_for_fanout,
3001                std::mem::take(&mut fresh_debit_accts),
3002                std::mem::take(&mut fresh_credit_accts),
3003            );
3004        }
3005
3006        // Enrich line items with account descriptions, cost centers, etc.
3007        self.enrich_line_items(&mut entry);
3008
3009        // Apply persona-based errors if enabled and it's a human user
3010        if self.persona_errors_enabled && !is_automated {
3011            self.maybe_inject_persona_error(&mut entry);
3012        }
3013
3014        // Apply approval workflow if enabled and amount exceeds threshold
3015        if self.approval_enabled {
3016            self.maybe_apply_approval_workflow(&mut entry, posting_date);
3017        }
3018
3019        // Populate approved_by / approval_date from the approval workflow
3020        self.populate_approval_fields(&mut entry, posting_date);
3021
3022        // Maybe start a batch of similar entries for realism
3023        self.maybe_start_batch(&entry);
3024
3025        // SP3.4 + SP3.5b — observe each line through the velocity calibrator and
3026        // apply each returned CalibrationStep to the relevant tunable parameter.
3027        if self.velocity_calibrator.is_some() {
3028            let mut pending: Vec<crate::velocity_calibrator::CalibrationStep> = Vec::new();
3029            for line in &entry.lines {
3030                if let Some(step) = self
3031                    .velocity_calibrator
3032                    .as_mut()
3033                    .and_then(|cal| cal.observe_line(line))
3034                {
3035                    pending.push(step);
3036                }
3037            }
3038            for step in pending {
3039                self.apply_calibration_step(&step);
3040            }
3041        }
3042
3043        // SOTA-4: with a small probability, post this JE in a foreign document
3044        // currency (company-ledger amounts unchanged; adds transaction_amount).
3045        self.maybe_apply_foreign_currency(&mut entry);
3046
3047        // SOTA-5: remember this JE so a later reversal can offset it.
3048        self.record_for_reversal(&entry);
3049
3050        entry
3051    }
3052
3053    /// SP3.5b — Apply a CalibrationStep from the velocity calibrator to the
3054    /// affected tunable parameter on this generator.
3055    ///
3056    /// Only `amounts.lognormal_sigma` (R6) and `amounts.round_dollar_share`
3057    /// (R9) are plumbed in v5.14. R7/R8/R10 parameters (off_hours_share,
3058    /// post_close_share, backdating_share) are observed by the calibrator
3059    /// but not yet consumed on the generator side — see v5.15 for plumbing.
3060    fn apply_calibration_step(&mut self, step: &crate::velocity_calibrator::CalibrationStep) {
3061        match step.parameter.as_str() {
3062            "amounts.lognormal_sigma" => {
3063                self.amount_sampler.set_lognormal_sigma(step.new_value);
3064            }
3065            "amounts.round_dollar_share" => {
3066                self.amount_sampler
3067                    .set_round_number_probability(step.new_value);
3068            }
3069            _ => {
3070                // Unknown / not-yet-plumbed parameter — calibrator records it
3071                // in `adjustments` for inspection; no mutation here.
3072            }
3073        }
3074    }
3075
3076    /// Enable or disable persona-based error injection.
3077    ///
3078    /// When enabled, entries created by human personas have a chance
3079    /// to contain realistic human errors based on their experience level.
3080    pub fn with_persona_errors(mut self, enabled: bool) -> Self {
3081        self.persona_errors_enabled = enabled;
3082        self
3083    }
3084
3085    /// Set fraud configuration for fraud injection.
3086    ///
3087    /// When fraud is enabled in the config, transactions have a chance
3088    /// to be marked as fraudulent based on the configured fraud rate.
3089    pub fn with_fraud_config(mut self, config: FraudConfig) -> Self {
3090        self.fraud_config = config;
3091        self
3092    }
3093
3094    /// Check if persona errors are enabled.
3095    pub fn persona_errors_enabled(&self) -> bool {
3096        self.persona_errors_enabled
3097    }
3098
3099    /// Enable or disable batch processing behavior.
3100    ///
3101    /// When enabled (default), the generator will occasionally produce batches
3102    /// of similar entries, simulating how humans batch similar work together.
3103    pub fn with_batching(mut self, enabled: bool) -> Self {
3104        if !enabled {
3105            self.batch_state = None;
3106        }
3107        self
3108    }
3109
3110    /// Check if batch processing is enabled.
3111    pub fn batching_enabled(&self) -> bool {
3112        // Batching is implicitly enabled when not explicitly disabled
3113        true
3114    }
3115
3116    /// Maybe start a batch based on the current entry.
3117    ///
3118    /// Humans often batch similar work: processing invoices from one vendor,
3119    /// entering expense reports for a trip, reconciling similar items.
3120    fn maybe_start_batch(&mut self, entry: &JournalEntry) {
3121        // Only start batch for non-automated, non-fraud entries
3122        if entry.header.source == TransactionSource::Automated || entry.header.is_fraud {
3123            return;
3124        }
3125
3126        // 15% chance to start a batch (most work is not batched)
3127        if self.rng.random::<f64>() > 0.15 {
3128            return;
3129        }
3130
3131        // Extract key attributes for batching
3132        let base_account = entry
3133            .lines
3134            .first()
3135            .map(|l| l.gl_account.clone())
3136            .unwrap_or_default();
3137
3138        let base_amount = entry.total_debit();
3139
3140        self.batch_state = Some(BatchState {
3141            base_account_number: base_account,
3142            base_amount,
3143            base_business_process: entry.header.business_process,
3144            base_posting_date: entry.header.posting_date,
3145            remaining: self.rng.random_range(2..7), // 2-6 more similar entries
3146        });
3147    }
3148
3149    /// Generate an entry that's part of the current batch.
3150    ///
3151    /// Batched entries have:
3152    /// - Same or very similar business process
3153    /// - Same posting date (batched work done together)
3154    /// - Similar amounts (within ±15%)
3155    /// - Same debit account (processing similar items)
3156    fn generate_batched_entry(&mut self) -> JournalEntry {
3157        use rust_decimal::Decimal;
3158
3159        // Decrement batch counter
3160        if let Some(ref mut state) = self.batch_state {
3161            state.remaining = state.remaining.saturating_sub(1);
3162        }
3163
3164        let Some(batch) = self.batch_state.clone() else {
3165            // This is a programming error - batch_state should be set before calling this method.
3166            // Clear state and fall back to generating a standard entry instead of panicking.
3167            tracing::warn!(
3168                "generate_batched_entry called without batch_state; generating standard entry"
3169            );
3170            self.batch_state = None;
3171            return self.generate();
3172        };
3173
3174        // Use the batch's posting date (work done on same day)
3175        let posting_date = batch.base_posting_date;
3176
3177        self.count += 1;
3178        let document_id = self.generate_deterministic_uuid();
3179
3180        // Select same company (batched work is usually same company)
3181        let company_code = self.company_selector.select(&mut self.rng).to_string();
3182
3183        // Use simplified line spec for batched entries (usually 2-line)
3184        let _line_spec = LineItemSpec {
3185            total_count: 2,
3186            debit_count: 1,
3187            credit_count: 1,
3188            split_type: DebitCreditSplit::Equal,
3189        };
3190
3191        // Batched entries are always manual
3192        let source = TransactionSource::Manual;
3193
3194        // SP3.6 — sample SAP source code for the batch entry when priors loaded.
3195        let sap_source_code: Option<String> = self.sample_sap_source_code();
3196        // SOTA-8: stash the batch JE's source for the per-source pool consult.
3197        self.current_je_source = sap_source_code.clone();
3198
3199        // Use the batch's business process
3200        let business_process = batch.base_business_process.unwrap_or(BusinessProcess::R2R);
3201
3202        // Sample time
3203        let time = self.temporal_sampler.sample_time(true);
3204        let created_at = posting_date.and_time(time).and_utc();
3205
3206        // Same user for batched work
3207        let (created_by, user_persona) = self.select_user(false);
3208
3209        // Create header
3210        let mut header =
3211            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
3212        header.created_at = created_at;
3213        header.source = source;
3214        header.sap_source_code = sap_source_code;
3215
3216        // SP3.9 — JE-level trading partner for batched entries (same pattern as
3217        // the primary generate() path).
3218        // SP3.12 — TP motif biasing also applies to batched entries.
3219        {
3220            let code_opt = header.sap_source_code.clone();
3221            if let Some(ref code) = code_opt {
3222                let rng_ref = &mut self.rng;
3223                let tp_neighbors: Vec<String> = if let Some(ref priors) = self.loaded_priors {
3224                    if let Some(ref motifs) = priors.tp_motif_sampler {
3225                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3226                            motifs.neighbors(last_tp).to_vec()
3227                        } else {
3228                            Vec::new()
3229                        }
3230                    } else {
3231                        Vec::new()
3232                    }
3233                } else {
3234                    Vec::new()
3235                };
3236                let tp_share_prob: f64 = if let Some(ref priors) = self.loaded_priors {
3237                    if let Some(ref motifs) = priors.tp_motif_sampler {
3238                        if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3239                            motifs.should_share(last_tp)
3240                        } else {
3241                            0.0
3242                        }
3243                    } else {
3244                        0.0
3245                    }
3246                } else {
3247                    0.0
3248                };
3249                if let Some(ref mut priors) = self.loaded_priors {
3250                    use datasynth_core::distributions::behavioral_priors::CategoricalDistribution;
3251                    let tp = if !tp_neighbors.is_empty()
3252                        && tp_share_prob > 0.0
3253                        && rng_ref.random_range(0.0..1.0) < tp_share_prob
3254                    {
3255                        let filtered: std::collections::BTreeMap<String, f64> = priors
3256                            .per_source_attribute
3257                            .as_ref()
3258                            .and_then(|psa| psa.conditional(code, "trading_partner"))
3259                            .map(|dist| {
3260                                dist.probabilities
3261                                    .iter()
3262                                    .filter(|(v, _)| tp_neighbors.contains(v))
3263                                    .map(|(v, p)| (v.clone(), *p))
3264                                    .collect()
3265                            })
3266                            .unwrap_or_default();
3267                        if filtered.is_empty() {
3268                            priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3269                        } else {
3270                            let neighbour_dist = CategoricalDistribution {
3271                                probabilities: filtered,
3272                                n: 0,
3273                            };
3274                            neighbour_dist.sample(rng_ref).or_else(|| {
3275                                priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3276                            })
3277                        }
3278                    } else {
3279                        priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3280                    };
3281                    header.trading_partner = tp;
3282                }
3283                if let Some(ref tp) = header.trading_partner {
3284                    self.last_tp_by_source.insert(code.clone(), tp.clone());
3285                }
3286            }
3287        }
3288
3289        // SP4.5 — user-persona prior for batched entries (same pattern as primary path).
3290        let (created_by, created_at) = {
3291            let sap_code_for_user = header.sap_source_code.clone();
3292            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
3293                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
3294                    let new_created_at = if let Some((hour, _)) =
3295                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
3296                    {
3297                        let base = header.created_at;
3298                        base.date_naive()
3299                            .and_hms_opt(hour, 0, 0)
3300                            .map(|naive| naive.and_utc())
3301                            .unwrap_or(base)
3302                    } else {
3303                        header.created_at
3304                    };
3305                    (uid, new_created_at)
3306                } else {
3307                    (created_by, header.created_at)
3308                }
3309            } else {
3310                (created_by, header.created_at)
3311            }
3312        };
3313
3314        header.created_by = created_by;
3315        header.created_at = created_at;
3316        header.user_persona = user_persona;
3317        header.business_process = Some(business_process);
3318        header.document_type = Self::document_type_for_process(business_process).to_string();
3319
3320        // Batched manual entries have Manual source document
3321        header.source_document = Some(DocumentRef::Manual);
3322
3323        // ISA 240 audit flags for batched entries (always manual)
3324        header.is_manual = true;
3325        header.source_system = if self.rng.random::<f64>() < 0.70 {
3326            "manual".to_string()
3327        } else {
3328            "spreadsheet".to_string()
3329        };
3330        header.is_post_close = posting_date.month() == self.end_date.month()
3331            && posting_date.year() == self.end_date.year()
3332            && posting_date.day() > 25;
3333        header.created_date =
3334            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second());
3335
3336        // Generate similar amount (within ±15% of base)
3337        let variation = self.rng.random_range(-0.15..0.15);
3338        let varied_amount =
3339            batch.base_amount * (Decimal::ONE + Decimal::try_from(variation).unwrap_or_default());
3340        let total_amount = varied_amount.round_dp(2).max(Decimal::from(1));
3341
3342        // Create the entry
3343        let mut entry = JournalEntry::new(header);
3344
3345        // Use same debit account as batch base
3346        let debit_line = JournalEntryLine::debit(
3347            entry.header.document_id,
3348            1,
3349            batch.base_account_number.clone(),
3350            total_amount,
3351        );
3352        entry.add_line(debit_line);
3353
3354        // SP3.12 W3 — Select a credit account for the batched entry.
3355        // When priors are loaded and this entry has a SAP source code, use the
3356        // per-source GL-account conditional (same as the primary generate() path).
3357        // This prevents batched entries from adding legacy-CoA accounts to the
3358        // Source-Source projection graph, which was inflating graph density and
3359        // driving the P3 ClusteringGap metric above 30× DR.
3360        let credit_fallback = self.select_credit_account().account_number.clone();
3361        let credit_account = {
3362            let priors_opt = &mut self.loaded_priors;
3363            let rng_ref = &mut self.rng;
3364            if let Some(priors) = priors_opt {
3365                // SP4.6 — role-aware GL for the batched-entry credit line.
3366                // Try (source, "CR") first, then source-marginal, then fallback.
3367                let sp46_gl = entry
3368                    .header
3369                    .sap_source_code
3370                    .as_deref()
3371                    .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
3372                if let Some(gl) = sp46_gl {
3373                    gl
3374                } else {
3375                    let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
3376                        priors.sample_attribute_for_source(code, "gl_account", rng_ref)
3377                    });
3378                    sp37_gl.unwrap_or(credit_fallback)
3379                }
3380            } else {
3381                credit_fallback
3382            }
3383        };
3384        let credit_line =
3385            JournalEntryLine::credit(entry.header.document_id, 2, credit_account, total_amount);
3386        entry.add_line(credit_line);
3387
3388        // Enrich line items with account descriptions, cost centers, etc.
3389        self.enrich_line_items(&mut entry);
3390
3391        // Apply persona-based errors if enabled
3392        if self.persona_errors_enabled {
3393            self.maybe_inject_persona_error(&mut entry);
3394        }
3395
3396        // Apply approval workflow if enabled
3397        if self.approval_enabled {
3398            self.maybe_apply_approval_workflow(&mut entry, posting_date);
3399        }
3400
3401        // Populate approved_by / approval_date from the approval workflow
3402        self.populate_approval_fields(&mut entry, posting_date);
3403
3404        // Clear batch state if no more entries remaining
3405        if batch.remaining <= 1 {
3406            self.batch_state = None;
3407        }
3408
3409        entry
3410    }
3411
3412    /// Maybe inject a persona-appropriate error based on the persona's error rate.
3413    fn maybe_inject_persona_error(&mut self, entry: &mut JournalEntry) {
3414        // Parse persona from the entry header
3415        let persona_str = &entry.header.user_persona;
3416        let persona = match persona_str.to_lowercase().as_str() {
3417            s if s.contains("junior") => UserPersona::JuniorAccountant,
3418            s if s.contains("senior") => UserPersona::SeniorAccountant,
3419            s if s.contains("controller") => UserPersona::Controller,
3420            s if s.contains("manager") => UserPersona::Manager,
3421            s if s.contains("executive") => UserPersona::Executive,
3422            _ => return, // Don't inject errors for unknown personas
3423        };
3424
3425        // Get base error rate from persona
3426        let base_error_rate = persona.error_rate();
3427
3428        // Apply stress factors based on posting date
3429        let adjusted_rate = self.apply_stress_factors(base_error_rate, entry.header.posting_date);
3430
3431        // Check if error should occur based on adjusted rate
3432        if self.rng.random::<f64>() >= adjusted_rate {
3433            return; // No error this time
3434        }
3435
3436        // Select and inject persona-appropriate error
3437        self.inject_human_error(entry, persona);
3438    }
3439
3440    /// Apply contextual stress factors to the base error rate.
3441    ///
3442    /// Stress factors increase error likelihood during:
3443    /// - Month-end (day >= 28): 1.5x more errors due to deadline pressure
3444    /// - Quarter-end (Mar, Jun, Sep, Dec): additional 25% boost
3445    /// - Year-end (December 28-31): 2.0x more errors due to audit pressure
3446    /// - Monday morning (catch-up work): 20% more errors
3447    /// - Friday afternoon (rushing to leave): 30% more errors
3448    fn apply_stress_factors(&self, base_rate: f64, posting_date: chrono::NaiveDate) -> f64 {
3449        use chrono::Datelike;
3450
3451        let mut rate = base_rate;
3452        let day = posting_date.day();
3453        let month = posting_date.month();
3454
3455        // Year-end stress (December 28-31): double the error rate
3456        if month == 12 && day >= 28 {
3457            rate *= 2.0;
3458            return rate.min(0.5); // Cap at 50% to keep it realistic
3459        }
3460
3461        // Quarter-end stress (last days of Mar, Jun, Sep, Dec)
3462        if matches!(month, 3 | 6 | 9 | 12) && day >= 28 {
3463            rate *= 1.75; // 75% more errors at quarter end
3464            return rate.min(0.4);
3465        }
3466
3467        // Month-end stress (last 3 days of month)
3468        if day >= 28 {
3469            rate *= 1.5; // 50% more errors at month end
3470        }
3471
3472        // Day-of-week stress effects
3473        let weekday = posting_date.weekday();
3474        match weekday {
3475            chrono::Weekday::Mon => {
3476                // Monday: catching up, often rushed
3477                rate *= 1.2;
3478            }
3479            chrono::Weekday::Fri => {
3480                // Friday: rushing to finish before weekend
3481                rate *= 1.3;
3482            }
3483            _ => {}
3484        }
3485
3486        // Cap at 40% to keep it realistic
3487        rate.min(0.4)
3488    }
3489
3490    /// Apply human-like variation to an amount.
3491    ///
3492    /// Humans don't enter perfectly calculated amounts - they:
3493    /// - Round amounts differently
3494    /// - Estimate instead of calculating exactly
3495    /// - Make small input variations
3496    ///
3497    /// This applies small variations (typically ±2%) to make amounts more realistic.
3498    fn apply_human_variation(&mut self, amount: rust_decimal::Decimal) -> rust_decimal::Decimal {
3499        use rust_decimal::Decimal;
3500
3501        // Automated transactions or very small amounts don't get variation
3502        if amount < Decimal::from(10) {
3503            return amount;
3504        }
3505
3506        // 70% chance of human variation being applied
3507        if self.rng.random::<f64>() > 0.70 {
3508            return amount;
3509        }
3510
3511        // Decide which type of human variation to apply
3512        let variation_type: u8 = self.rng.random_range(0..4);
3513
3514        match variation_type {
3515            0 => {
3516                // ±2% variation (common for estimated amounts)
3517                let variation_pct = self.rng.random_range(-0.02..0.02);
3518                let variation = amount * Decimal::try_from(variation_pct).unwrap_or_default();
3519                (amount + variation).round_dp(2)
3520            }
3521            1 => {
3522                // Round to nearest $10
3523                let ten = Decimal::from(10);
3524                (amount / ten).round() * ten
3525            }
3526            2 => {
3527                // Round to nearest $100 (for larger amounts)
3528                if amount >= Decimal::from(500) {
3529                    let hundred = Decimal::from(100);
3530                    (amount / hundred).round() * hundred
3531                } else {
3532                    amount
3533                }
3534            }
3535            3 => {
3536                // Slight under/over payment (±$0.01 to ±$1.00)
3537                let cents = Decimal::new(self.rng.random_range(-100..100), 2);
3538                (amount + cents).max(Decimal::ZERO).round_dp(2)
3539            }
3540            _ => amount,
3541        }
3542    }
3543
3544    /// Rebalance an entry after a one-sided amount modification.
3545    ///
3546    /// When an error modifies one line's amount, this finds a line on the opposite
3547    /// side (credit if modified was debit, or vice versa) and adjusts it by the
3548    /// same impact to maintain balance.
3549    fn rebalance_entry(entry: &mut JournalEntry, modified_was_debit: bool, impact: Decimal) {
3550        // Find a line on the opposite side to adjust
3551        let balancing_idx = entry.lines.iter().position(|l| {
3552            if modified_was_debit {
3553                l.credit_amount > Decimal::ZERO
3554            } else {
3555                l.debit_amount > Decimal::ZERO
3556            }
3557        });
3558
3559        if let Some(idx) = balancing_idx {
3560            if modified_was_debit {
3561                entry.lines[idx].credit_amount += impact;
3562            } else {
3563                entry.lines[idx].debit_amount += impact;
3564            }
3565        }
3566    }
3567
3568    /// Inject a human-like error based on the persona.
3569    ///
3570    /// All error types maintain balance - amount modifications are applied to both sides.
3571    /// Entries are marked with [HUMAN_ERROR:*] tags in header_text for ML detection.
3572    fn inject_human_error(&mut self, entry: &mut JournalEntry, persona: UserPersona) {
3573        use rust_decimal::Decimal;
3574
3575        // Different personas make different types of errors
3576        let error_type: u8 = match persona {
3577            UserPersona::JuniorAccountant => {
3578                // Junior accountants make more varied errors
3579                self.rng.random_range(0..5)
3580            }
3581            UserPersona::SeniorAccountant => {
3582                // Senior accountants mainly make transposition errors
3583                self.rng.random_range(0..3)
3584            }
3585            UserPersona::Controller | UserPersona::Manager => {
3586                // Controllers/managers mainly make rounding or cutoff errors
3587                self.rng.random_range(3..5)
3588            }
3589            _ => return,
3590        };
3591
3592        match error_type {
3593            0 => {
3594                // Transposed digits in an amount
3595                if let Some(line) = entry.lines.get_mut(0) {
3596                    let is_debit = line.debit_amount > Decimal::ZERO;
3597                    let original_amount = if is_debit {
3598                        line.debit_amount
3599                    } else {
3600                        line.credit_amount
3601                    };
3602
3603                    // Simple digit swap in the string representation
3604                    let s = original_amount.to_string();
3605                    if s.len() >= 2 {
3606                        let chars: Vec<char> = s.chars().collect();
3607                        let pos = self.rng.random_range(0..chars.len().saturating_sub(1));
3608                        if chars[pos].is_ascii_digit()
3609                            && chars.get(pos + 1).is_some_and(char::is_ascii_digit)
3610                        {
3611                            let mut new_chars = chars;
3612                            new_chars.swap(pos, pos + 1);
3613                            if let Ok(new_amount) =
3614                                new_chars.into_iter().collect::<String>().parse::<Decimal>()
3615                            {
3616                                let impact = new_amount - original_amount;
3617
3618                                // Apply to the modified line
3619                                if is_debit {
3620                                    entry.lines[0].debit_amount = new_amount;
3621                                } else {
3622                                    entry.lines[0].credit_amount = new_amount;
3623                                }
3624
3625                                // Rebalance the entry
3626                                Self::rebalance_entry(entry, is_debit, impact);
3627
3628                                entry.header.header_text = Some(
3629                                    entry.header.header_text.clone().unwrap_or_default()
3630                                        + " [HUMAN_ERROR:TRANSPOSITION]",
3631                                );
3632                            }
3633                        }
3634                    }
3635                }
3636            }
3637            1 => {
3638                // Wrong decimal place (off by factor of 10)
3639                if let Some(line) = entry.lines.get_mut(0) {
3640                    let is_debit = line.debit_amount > Decimal::ZERO;
3641                    let original_amount = if is_debit {
3642                        line.debit_amount
3643                    } else {
3644                        line.credit_amount
3645                    };
3646
3647                    let new_amount = original_amount * Decimal::new(10, 0);
3648                    let impact = new_amount - original_amount;
3649
3650                    // Apply to the modified line
3651                    if is_debit {
3652                        entry.lines[0].debit_amount = new_amount;
3653                    } else {
3654                        entry.lines[0].credit_amount = new_amount;
3655                    }
3656
3657                    // Rebalance the entry
3658                    Self::rebalance_entry(entry, is_debit, impact);
3659
3660                    entry.header.header_text = Some(
3661                        entry.header.header_text.clone().unwrap_or_default()
3662                            + " [HUMAN_ERROR:DECIMAL_SHIFT]",
3663                    );
3664                }
3665            }
3666            2 => {
3667                // Typo in description (doesn't affect balance)
3668                if let Some(ref mut text) = entry.header.header_text {
3669                    let typos = ["teh", "adn", "wiht", "taht", "recieve"];
3670                    let correct = ["the", "and", "with", "that", "receive"];
3671                    let idx = self.rng.random_range(0..typos.len());
3672                    if text.to_lowercase().contains(correct[idx]) {
3673                        *text = text.replace(correct[idx], typos[idx]);
3674                        *text = format!("{text} [HUMAN_ERROR:TYPO]");
3675                    }
3676                }
3677            }
3678            3 => {
3679                // Rounding to round number
3680                if let Some(line) = entry.lines.get_mut(0) {
3681                    let is_debit = line.debit_amount > Decimal::ZERO;
3682                    let original_amount = if is_debit {
3683                        line.debit_amount
3684                    } else {
3685                        line.credit_amount
3686                    };
3687
3688                    let new_amount =
3689                        (original_amount / Decimal::new(100, 0)).round() * Decimal::new(100, 0);
3690                    let impact = new_amount - original_amount;
3691
3692                    // Apply to the modified line
3693                    if is_debit {
3694                        entry.lines[0].debit_amount = new_amount;
3695                    } else {
3696                        entry.lines[0].credit_amount = new_amount;
3697                    }
3698
3699                    // Rebalance the entry
3700                    Self::rebalance_entry(entry, is_debit, impact);
3701
3702                    entry.header.header_text = Some(
3703                        entry.header.header_text.clone().unwrap_or_default()
3704                            + " [HUMAN_ERROR:ROUNDED]",
3705                    );
3706                }
3707            }
3708            // Late posting marker (document date much earlier than posting
3709            // date). Doesn't create an imbalance.
3710            4 if entry.header.document_date == entry.header.posting_date => {
3711                let days_late = self.rng.random_range(5..15);
3712                entry.header.document_date =
3713                    entry.header.posting_date - chrono::Duration::days(days_late);
3714                entry.header.header_text = Some(
3715                    entry.header.header_text.clone().unwrap_or_default()
3716                        + " [HUMAN_ERROR:LATE_POSTING]",
3717                );
3718            }
3719            _ => {}
3720        }
3721    }
3722
3723    /// Apply approval workflow for high-value transactions.
3724    ///
3725    /// If the entry amount exceeds the approval threshold, simulate an
3726    /// approval workflow with appropriate approvers based on amount.
3727    fn maybe_apply_approval_workflow(
3728        &mut self,
3729        entry: &mut JournalEntry,
3730        _posting_date: NaiveDate,
3731    ) {
3732        use rust_decimal::Decimal;
3733
3734        let amount = entry.total_debit();
3735
3736        // Skip if amount is below threshold
3737        if amount <= self.approval_threshold {
3738            // Auto-approved below threshold
3739            let workflow = ApprovalWorkflow::auto_approved(
3740                entry.header.created_by.clone(),
3741                entry.header.user_persona.clone(),
3742                amount,
3743                entry.header.created_at,
3744            );
3745            entry.header.approval_workflow = Some(workflow);
3746            return;
3747        }
3748
3749        // Mark as SOX relevant for high-value transactions
3750        entry.header.sox_relevant = true;
3751
3752        // Determine required approval levels based on amount
3753        let required_levels = if amount > Decimal::new(100000, 0) {
3754            3 // Executive approval required
3755        } else if amount > Decimal::new(50000, 0) {
3756            2 // Senior management approval
3757        } else {
3758            1 // Manager approval
3759        };
3760
3761        // Create the approval workflow
3762        let mut workflow = ApprovalWorkflow::new(
3763            entry.header.created_by.clone(),
3764            entry.header.user_persona.clone(),
3765            amount,
3766        );
3767        workflow.required_levels = required_levels;
3768
3769        // Simulate submission
3770        let submit_time = entry.header.created_at;
3771        let submit_action = ApprovalAction::new(
3772            entry.header.created_by.clone(),
3773            entry.header.user_persona.clone(),
3774            self.parse_persona(&entry.header.user_persona),
3775            ApprovalActionType::Submit,
3776            0,
3777        )
3778        .with_timestamp(submit_time);
3779
3780        workflow.actions.push(submit_action);
3781        workflow.status = ApprovalStatus::Pending;
3782        workflow.submitted_at = Some(submit_time);
3783
3784        // Simulate approvals with realistic delays
3785        let mut current_time = submit_time;
3786        for level in 1..=required_levels {
3787            // Add delay for approval (1-3 business hours per level)
3788            let delay_hours = self.rng.random_range(1..4);
3789            current_time += chrono::Duration::hours(delay_hours);
3790
3791            // Skip weekends
3792            while current_time.weekday() == chrono::Weekday::Sat
3793                || current_time.weekday() == chrono::Weekday::Sun
3794            {
3795                current_time += chrono::Duration::days(1);
3796            }
3797
3798            // Generate approver based on level
3799            let (approver_id, approver_role) = self.select_approver(level);
3800
3801            let approve_action = ApprovalAction::new(
3802                approver_id.clone(),
3803                approver_role.to_string(),
3804                approver_role,
3805                ApprovalActionType::Approve,
3806                level,
3807            )
3808            .with_timestamp(current_time);
3809
3810            workflow.actions.push(approve_action);
3811            workflow.current_level = level;
3812        }
3813
3814        // Mark as approved
3815        workflow.status = ApprovalStatus::Approved;
3816        workflow.approved_at = Some(current_time);
3817
3818        entry.header.approval_workflow = Some(workflow);
3819    }
3820
3821    /// Select an approver based on the required level.
3822    fn select_approver(&mut self, level: u8) -> (String, UserPersona) {
3823        let persona = match level {
3824            1 => UserPersona::Manager,
3825            2 => UserPersona::Controller,
3826            _ => UserPersona::Executive,
3827        };
3828
3829        // Try to get from user pool first
3830        if let Some(ref pool) = self.user_pool {
3831            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
3832                return (user.user_id.clone(), persona);
3833            }
3834        }
3835
3836        // Fallback to generated approver
3837        let approver_id = match persona {
3838            UserPersona::Manager => format!("MGR{:04}", self.rng.random_range(1..100)),
3839            UserPersona::Controller => format!("CTRL{:04}", self.rng.random_range(1..20)),
3840            UserPersona::Executive => format!("EXEC{:04}", self.rng.random_range(1..10)),
3841            _ => format!("USR{:04}", self.rng.random_range(1..1000)),
3842        };
3843
3844        (approver_id, persona)
3845    }
3846
3847    /// Parse user persona from string.
3848    fn parse_persona(&self, persona_str: &str) -> UserPersona {
3849        match persona_str.to_lowercase().as_str() {
3850            s if s.contains("junior") => UserPersona::JuniorAccountant,
3851            s if s.contains("senior") => UserPersona::SeniorAccountant,
3852            s if s.contains("controller") => UserPersona::Controller,
3853            s if s.contains("manager") => UserPersona::Manager,
3854            s if s.contains("executive") => UserPersona::Executive,
3855            s if s.contains("automated") || s.contains("system") => UserPersona::AutomatedSystem,
3856            _ => UserPersona::JuniorAccountant, // Default
3857        }
3858    }
3859
3860    /// Enable or disable approval workflow.
3861    pub fn with_approval(mut self, enabled: bool) -> Self {
3862        self.approval_enabled = enabled;
3863        self
3864    }
3865
3866    /// Set the approval threshold amount.
3867    pub fn with_approval_threshold(mut self, threshold: rust_decimal::Decimal) -> Self {
3868        self.approval_threshold = threshold;
3869        self
3870    }
3871
3872    /// Set the SOD violation rate for approval tracking.
3873    ///
3874    /// When a transaction is approved, there is a `rate` probability (0.0 to 1.0)
3875    /// that the approver is the same as the creator, which constitutes a SOD violation.
3876    /// Default is 0.10 (10%).
3877    pub fn with_sod_violation_rate(mut self, rate: f64) -> Self {
3878        self.sod_violation_rate = rate;
3879        self
3880    }
3881
3882    /// Populate `approved_by` and `approval_date` from the approval workflow,
3883    /// and flag SOD violations when the approver matches the creator.
3884    fn populate_approval_fields(&mut self, entry: &mut JournalEntry, posting_date: NaiveDate) {
3885        if let Some(ref workflow) = entry.header.approval_workflow {
3886            // Extract the last approver from the workflow actions
3887            let last_approver = workflow
3888                .actions
3889                .iter()
3890                .rev()
3891                .find(|a| matches!(a.action, ApprovalActionType::Approve));
3892
3893            if let Some(approver_action) = last_approver {
3894                entry.header.approved_by = Some(approver_action.actor_id.clone());
3895                entry.header.approval_date = Some(approver_action.action_timestamp.date_naive());
3896            } else {
3897                // No explicit approver (auto-approved); use the preparer
3898                entry.header.approved_by = Some(workflow.preparer_id.clone());
3899                entry.header.approval_date = Some(posting_date);
3900            }
3901
3902            // Inject SOD violation: with configured probability, set approver = creator
3903            if self.rng.random::<f64>() < self.sod_violation_rate {
3904                let creator = entry.header.created_by.clone();
3905                entry.header.approved_by = Some(creator);
3906                entry.header.sod_violation = true;
3907                entry.header.sod_conflict_type = Some(SodConflictType::PreparerApprover);
3908            }
3909        }
3910    }
3911
3912    /// Set the temporal drift controller for simulating distribution changes over time.
3913    ///
3914    /// When drift is enabled, amounts and other distributions will shift based on
3915    /// the period (month) to simulate realistic temporal evolution like inflation
3916    /// or increasing fraud rates.
3917    pub fn with_drift_controller(mut self, controller: DriftController) -> Self {
3918        self.drift_controller = Some(controller);
3919        self
3920    }
3921
3922    /// Set drift configuration directly.
3923    ///
3924    /// Creates a drift controller from the config. Total periods is calculated
3925    /// from the date range.
3926    pub fn with_drift_config(mut self, config: DriftConfig, seed: u64) -> Self {
3927        if config.enabled {
3928            let total_periods = self.calculate_total_periods();
3929            self.drift_controller = Some(DriftController::new(config, seed, total_periods));
3930        }
3931        self
3932    }
3933
3934    /// Calculate total periods (months) in the date range.
3935    fn calculate_total_periods(&self) -> u32 {
3936        let start_year = self.start_date.year();
3937        let start_month = self.start_date.month();
3938        let end_year = self.end_date.year();
3939        let end_month = self.end_date.month();
3940
3941        ((end_year - start_year) * 12 + (end_month as i32 - start_month as i32) + 1).max(1) as u32
3942    }
3943
3944    /// Calculate the period number (0-indexed) for a given date.
3945    fn date_to_period(&self, date: NaiveDate) -> u32 {
3946        let start_year = self.start_date.year();
3947        let start_month = self.start_date.month() as i32;
3948        let date_year = date.year();
3949        let date_month = date.month() as i32;
3950
3951        ((date_year - start_year) * 12 + (date_month - start_month)).max(0) as u32
3952    }
3953
3954    /// Get drift adjustments for a given date.
3955    fn get_drift_adjustments(&self, date: NaiveDate) -> DriftAdjustments {
3956        if let Some(ref controller) = self.drift_controller {
3957            let period = self.date_to_period(date);
3958            controller.compute_adjustments(period)
3959        } else {
3960            DriftAdjustments::none()
3961        }
3962    }
3963
3964    /// Select a user from the pool or generate a generic user ID.
3965    #[inline]
3966    fn select_user(&mut self, is_automated: bool) -> (String, String) {
3967        if let Some(ref pool) = self.user_pool {
3968            let persona = if is_automated {
3969                UserPersona::AutomatedSystem
3970            } else {
3971                // Random distribution among human personas
3972                let roll: f64 = self.rng.random();
3973                if roll < 0.4 {
3974                    UserPersona::JuniorAccountant
3975                } else if roll < 0.7 {
3976                    UserPersona::SeniorAccountant
3977                } else if roll < 0.85 {
3978                    UserPersona::Controller
3979                } else {
3980                    UserPersona::Manager
3981                }
3982            };
3983
3984            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
3985                return (user.user_id.clone(), user.persona.to_string());
3986            }
3987        }
3988
3989        // Fallback to generic format
3990        if is_automated {
3991            (
3992                format!("BATCH{:04}", self.rng.random_range(1..=20)),
3993                "automated_system".to_string(),
3994            )
3995        } else {
3996            (
3997                format!("USER{:04}", self.rng.random_range(1..=40)),
3998                "senior_accountant".to_string(),
3999            )
4000        }
4001    }
4002
4003    /// Select transaction source based on configuration weights.
4004    #[inline]
4005    fn select_source(&mut self) -> TransactionSource {
4006        let roll: f64 = self.rng.random();
4007        let dist = &self.config.source_distribution;
4008
4009        if roll < dist.manual {
4010            TransactionSource::Manual
4011        } else if roll < dist.manual + dist.automated {
4012            TransactionSource::Automated
4013        } else if roll < dist.manual + dist.automated + dist.recurring {
4014            TransactionSource::Recurring
4015        } else {
4016            TransactionSource::Adjustment
4017        }
4018    }
4019
4020    /// Select a business process based on configuration weights.
4021    #[inline]
4022    /// Map a business process to a SAP-style document type code.
4023    ///
4024    /// - P2P → "KR" (vendor invoice)
4025    /// - O2C → "DR" (customer invoice)
4026    /// - R2R → "SA" (general journal)
4027    /// - H2R → "HR" (HR posting)
4028    /// - A2R → "AA" (asset posting)
4029    /// - others → "SA"
4030    fn document_type_for_process(process: BusinessProcess) -> &'static str {
4031        match process {
4032            BusinessProcess::P2P => "KR",
4033            BusinessProcess::O2C => "DR",
4034            BusinessProcess::R2R => "SA",
4035            BusinessProcess::H2R => "HR",
4036            BusinessProcess::A2R => "AA",
4037            _ => "SA",
4038        }
4039    }
4040
4041    fn select_business_process(&mut self) -> BusinessProcess {
4042        *datasynth_core::utils::weighted_select(&mut self.rng, &self.business_process_weights)
4043    }
4044
4045    /// SOTA-2: draw a rank index in `[0, n)` with `P(rank=i) ∝ 1/(i+1)^ZIPF_ALPHA`
4046    /// from a dedicated stream, so a few low-rank accounts carry most lines (the
4047    /// corpus account-activity Pareto). Returns `None` for an empty/oversized pool
4048    /// so the caller keeps the uniform draw.
4049    #[inline]
4050    fn power_law_index(n: usize, rng: &mut ChaCha8Rng) -> Option<usize> {
4051        if n == 0 || n > ZIPF_CAP {
4052            return None;
4053        }
4054        let total = ZIPF_CUM[n];
4055        let r = rng.random::<f64>() * total;
4056        // smallest k in 1..=n with CUM[k] >= r → 0-based rank k-1
4057        let k = ZIPF_CUM[..=n]
4058            .binary_search_by(|v| v.partial_cmp(&r).unwrap_or(std::cmp::Ordering::Less))
4059            .unwrap_or_else(|e| e);
4060        Some(k.saturating_sub(1).min(n - 1))
4061    }
4062
4063    /// SOTA-2: replace a uniform `Vec<&GLAccount>` pick with a hot-account
4064    /// power-law pick when concentration is on (default). The uniform `.choose`
4065    /// draw on the main `rng` is still consumed by the caller first, so
4066    /// amounts/line-counts/dates stay byte-identical to the legacy stream — only
4067    /// the *selected account* changes. Associated (not `&mut self`) so it borrows
4068    /// only `account_rng`, leaving `coa` free for `all`/`uniform`.
4069    #[inline]
4070    fn concentrate<'a>(
4071        enabled: bool,
4072        rng: &mut ChaCha8Rng,
4073        all: &[&'a GLAccount],
4074        uniform: Option<&'a GLAccount>,
4075    ) -> Option<&'a GLAccount> {
4076        if enabled {
4077            Self::power_law_index(all.len(), rng)
4078                .map(|i| all[i])
4079                .or(uniform)
4080        } else {
4081            uniform
4082        }
4083    }
4084
4085    /// SOTA-8: ensure a `SourcePool` exists for `source` in the sampler (lazy build).
4086    /// One pool per source, persisted across JEs (sampler grows monotonically).
4087    fn ensure_cond_pair_pool(&mut self, source: &str) {
4088        let cfg = &self.config.source_conditional_account_pair;
4089        if !cfg.enabled {
4090            return;
4091        }
4092        if self.cond_pair_sampler.is_none() {
4093            self.cond_pair_sampler = Some(Default::default());
4094        }
4095        let sampler = self
4096            .cond_pair_sampler
4097            .as_mut()
4098            .expect("just-initialised above");
4099        if sampler.pool(source).is_some() {
4100            return;
4101        }
4102        let all_accounts: Vec<String> = self
4103            .coa
4104            .accounts
4105            .iter()
4106            .map(|a| a.account_number.clone())
4107            .collect();
4108        if all_accounts.is_empty() {
4109            return;
4110        }
4111        // Uniform weights here — the existing account-Pareto (account_concentration)
4112        // still applies at the outer fallback level if the per-source pool isn't used.
4113        let weights: Vec<f64> = vec![1.0; all_accounts.len()];
4114        sampler.ensure_pool(
4115            source,
4116            &all_accounts,
4117            &weights,
4118            cfg.accts_per_source_target,
4119            cfg.concentration,
4120            &mut self.cond_pair_rng,
4121        );
4122    }
4123
4124    /// SOTA-8: if the feature is enabled and the current JE has a source with a
4125    /// pool, pick an *account number* from the per-source PMF. Returns an owned
4126    /// `String` so the caller can release the mutable self-borrow before looking
4127    /// up the `GLAccount` in `self.coa`.
4128    #[inline]
4129    fn try_cond_pick_account_number(&mut self) -> Option<String> {
4130        let cfg = &self.config.source_conditional_account_pair;
4131        if !cfg.enabled {
4132            return None;
4133        }
4134        let src = self.current_je_source.clone()?;
4135        self.ensure_cond_pair_pool(&src);
4136        let sampler = self.cond_pair_sampler.as_ref()?;
4137        let pool = sampler.pool(&src)?;
4138        Some(pool.sample_one(&mut self.cond_pair_rng).to_string())
4139    }
4140
4141    #[inline]
4142    fn select_debit_account(&mut self) -> &GLAccount {
4143        // SOTA-8 source-conditional pick when feature is enabled.
4144        if let Some(acct_num) = self.try_cond_pick_account_number() {
4145            if let Some(a) = self
4146                .coa
4147                .accounts
4148                .iter()
4149                .find(|a| a.account_number == acct_num)
4150            {
4151                return a;
4152            }
4153            // Sampler chose an account not in CoA (defensive fall-through).
4154        }
4155        let accounts = self.coa.get_accounts_by_type(AccountType::Asset);
4156        let expense_accounts = self.coa.get_accounts_by_type(AccountType::Expense);
4157
4158        // 60% asset, 40% expense for debits
4159        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4160            accounts
4161        } else {
4162            expense_accounts
4163        };
4164
4165        let uniform = all.choose(&mut self.rng).copied();
4166        let enabled = self.config.account_concentration.unwrap_or(true);
4167        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4168            tracing::warn!(
4169                "Account selection returned empty list, falling back to first COA account"
4170            );
4171            &self.coa.accounts[0]
4172        })
4173    }
4174
4175    #[inline]
4176    fn select_credit_account(&mut self) -> &GLAccount {
4177        // SOTA-8 source-conditional pick when feature is enabled.
4178        if let Some(acct_num) = self.try_cond_pick_account_number() {
4179            if let Some(a) = self
4180                .coa
4181                .accounts
4182                .iter()
4183                .find(|a| a.account_number == acct_num)
4184            {
4185                return a;
4186            }
4187        }
4188        let liability_accounts = self.coa.get_accounts_by_type(AccountType::Liability);
4189        let revenue_accounts = self.coa.get_accounts_by_type(AccountType::Revenue);
4190
4191        // 60% liability, 40% revenue for credits
4192        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4193            liability_accounts
4194        } else {
4195            revenue_accounts
4196        };
4197
4198        let uniform = all.choose(&mut self.rng).copied();
4199        let enabled = self.config.account_concentration.unwrap_or(true);
4200        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4201            tracing::warn!(
4202                "Account selection returned empty list, falling back to first COA account"
4203            );
4204            &self.coa.accounts[0]
4205        })
4206    }
4207}
4208
4209impl Generator for JournalEntryGenerator {
4210    type Item = JournalEntry;
4211    type Config = (
4212        TransactionConfig,
4213        Arc<ChartOfAccounts>,
4214        Vec<String>,
4215        NaiveDate,
4216        NaiveDate,
4217    );
4218
4219    fn new(config: Self::Config, seed: u64) -> Self {
4220        Self::new_with_params(config.0, config.1, config.2, config.3, config.4, seed)
4221    }
4222
4223    fn generate_one(&mut self) -> Self::Item {
4224        self.generate()
4225    }
4226
4227    fn reset(&mut self) {
4228        self.rng = seeded_rng(self.seed, 0);
4229        self.source_mix_rng = seeded_rng(self.seed, 50_063);
4230        self.template_rng = seeded_rng(self.seed, 70_081);
4231        self.recurring_archetypes.clear();
4232        self.reversal_rng = seeded_rng(self.seed, 90_017);
4233        self.reversal_buffer.clear();
4234        self.account_rng = seeded_rng(self.seed, 60_071);
4235        self.allocation_rng = seeded_rng(self.seed, 80_023);
4236        self.fx_rng = seeded_rng(self.seed, 70_093);
4237        self.line_sampler.reset(self.seed + 1);
4238        self.amount_sampler.reset(self.seed + 2);
4239        self.temporal_sampler.reset(self.seed + 3);
4240        if let Some(ref mut adv) = self.advanced_amount_sampler {
4241            adv.reset(self.seed + 2);
4242        }
4243        self.count = 0;
4244        self.uuid_factory.reset();
4245
4246        // Reset reference generator by recreating it
4247        let mut ref_gen = ReferenceGenerator::new(
4248            self.start_date.year(),
4249            self.companies
4250                .first()
4251                .map(std::string::String::as_str)
4252                .unwrap_or("1000"),
4253        );
4254        ref_gen.set_prefix(
4255            ReferenceType::Invoice,
4256            &self.template_config.references.invoice_prefix,
4257        );
4258        ref_gen.set_prefix(
4259            ReferenceType::PurchaseOrder,
4260            &self.template_config.references.po_prefix,
4261        );
4262        ref_gen.set_prefix(
4263            ReferenceType::SalesOrder,
4264            &self.template_config.references.so_prefix,
4265        );
4266        self.reference_generator = ref_gen;
4267    }
4268
4269    fn count(&self) -> u64 {
4270        self.count
4271    }
4272
4273    fn seed(&self) -> u64 {
4274        self.seed
4275    }
4276}
4277
4278use datasynth_core::traits::ParallelGenerator;
4279
4280impl ParallelGenerator for JournalEntryGenerator {
4281    /// Split this generator into `parts` independent sub-generators.
4282    ///
4283    /// Each sub-generator gets a deterministic seed derived from the parent seed
4284    /// and its partition index, plus a partitioned UUID factory to avoid contention.
4285    /// The results are deterministic for a given partition count.
4286    fn split(self, parts: usize) -> Vec<Self> {
4287        let parts = parts.max(1);
4288        (0..parts)
4289            .map(|i| {
4290                // Derive a unique seed per partition using a golden-ratio constant
4291                let sub_seed = self
4292                    .seed
4293                    .wrapping_add((i as u64).wrapping_mul(0x9E3779B97F4A7C15));
4294
4295                let mut gen = JournalEntryGenerator::new_with_full_config(
4296                    self.config.clone(),
4297                    Arc::clone(&self.coa),
4298                    self.companies.clone(),
4299                    self.start_date,
4300                    self.end_date,
4301                    sub_seed,
4302                    self.template_config.clone(),
4303                    self.user_pool.clone(),
4304                );
4305
4306                // Copy over configuration state
4307                gen.company_selector = self.company_selector.clone();
4308                gen.vendor_pool = self.vendor_pool.clone();
4309                gen.customer_pool = self.customer_pool.clone();
4310                gen.material_pool = self.material_pool.clone();
4311                // v5.9.0: master-data pools so sub-generators emit
4312                // CC/PC values that join back to the corresponding
4313                // masters (without these clones, parallel workers
4314                // fell back to the hardcoded `COST_CENTER_POOL` const
4315                // and the legacy `PC-{COMP}-{P2P|O2C|...}` derivation).
4316                gen.cost_center_pool = self.cost_center_pool.clone();
4317                gen.profit_center_pool = self.profit_center_pool.clone();
4318                gen.using_real_master_data = self.using_real_master_data;
4319                gen.fraud_config = self.fraud_config.clone();
4320                gen.persona_errors_enabled = self.persona_errors_enabled;
4321                gen.approval_enabled = self.approval_enabled;
4322                gen.approval_threshold = self.approval_threshold;
4323                gen.sod_violation_rate = self.sod_violation_rate;
4324                // v3.4.0+: advanced amount sampler (mixture / Pareto /
4325                // Gaussian). Clone and reset the internal RNG with the
4326                // partition's sub_seed so each worker explores a unique
4327                // subsequence without repeating the parent stream.
4328                if let Some(mut adv) = self.advanced_amount_sampler.clone() {
4329                    adv.reset(sub_seed.wrapping_add(2));
4330                    gen.advanced_amount_sampler = Some(adv);
4331                }
4332                // v3.5.3+: conditional amount override — clone + reset
4333                // so each partition gets a fresh deterministic stream.
4334                if let Some(mut cond) = self.conditional_amount_override.clone() {
4335                    cond.reset(sub_seed.wrapping_add(17));
4336                    gen.conditional_amount_override = Some(cond);
4337                }
4338                // v3.5.4+: copula sampler — clone + reset per partition.
4339                if let Some(mut cop) = self.correlation_copula.clone() {
4340                    cop.reset(sub_seed.wrapping_add(31));
4341                    gen.correlation_copula = Some(cop);
4342                }
4343
4344                // Use partitioned UUID factory to eliminate atomic contention
4345                gen.uuid_factory = DeterministicUuidFactory::for_partition(
4346                    sub_seed,
4347                    GeneratorType::JournalEntry,
4348                    i as u8,
4349                );
4350
4351                // Copy temporal patterns if configured
4352                if let Some(ref config) = self.temporal_patterns_config {
4353                    gen.temporal_patterns_config = Some(config.clone());
4354                    // Rebuild business day calculator from the stored config
4355                    if config.business_days.enabled {
4356                        if let Some(ref bdc) = self.business_day_calculator {
4357                            gen.business_day_calculator = Some(bdc.clone());
4358                        }
4359                    }
4360                    // Rebuild processing lag calculator with partition seed
4361                    if config.processing_lags.enabled {
4362                        let lag_config =
4363                            Self::convert_processing_lag_config(&config.processing_lags);
4364                        gen.processing_lag_calculator =
4365                            Some(ProcessingLagCalculator::with_config(sub_seed, lag_config));
4366                    }
4367                }
4368
4369                // Copy drift controller if present
4370                if let Some(ref dc) = self.drift_controller {
4371                    gen.drift_controller = Some(dc.clone());
4372                }
4373
4374                // SP3: share Arc-wrapped priors with all sub-generators.
4375                // Clone is O(1) — increments the reference count only.
4376                gen.loaded_priors = self.loaded_priors.clone();
4377
4378                // SP3.4: each partition starts with a fresh calibrator so
4379                // observations are partition-local (avoids cross-partition
4380                // state contamination).  Target rates and window size are
4381                // cloned from the parent; accumulated state is not.
4382                if let Some(ref cal) = self.velocity_calibrator {
4383                    let mut fresh = crate::velocity_calibrator::VelocityCalibrator::new(
4384                        cal.target_trigger_rates.clone(),
4385                        cal.n_lines_between_calibrations,
4386                    );
4387                    fresh.current_values = cal.current_values.clone();
4388                    gen.velocity_calibrator = Some(fresh);
4389                }
4390
4391                gen
4392            })
4393            .collect()
4394    }
4395}
4396
4397#[cfg(test)]
4398mod tests {
4399    use super::*;
4400    use crate::ChartOfAccountsGenerator;
4401
/// Entries generated with default settings have at least two lines and are
/// balanced, except for the few carrying a `[HUMAN_ERROR:` marker.
#[test]
fn test_generate_balanced_entries() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    let first_day = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
    let last_day = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();
    let mut je_gen = JournalEntryGenerator::new_with_params(
        TransactionConfig::default(),
        chart,
        vec!["1000".to_string()],
        first_day,
        last_day,
        42,
    );

    let mut balanced_count = 0;
    for _ in 0..100 {
        let entry = je_gen.generate();

        // Entries flagged with a human error may be intentionally unbalanced,
        // so the balance check is skipped for them.
        let flagged_human_error = entry
            .header
            .header_text
            .as_ref()
            .is_some_and(|text| text.contains("[HUMAN_ERROR:"));

        if !flagged_human_error {
            assert!(
                entry.is_balanced(),
                "Entry {:?} is not balanced",
                entry.header.document_id
            );
            balanced_count += 1;
        }
        assert!(entry.line_count() >= 2, "Entry has fewer than 2 lines");
    }

    // Human errors are rare, so the large majority must have been checked.
    assert!(
        balanced_count >= 80,
        "Expected at least 80 balanced entries, got {}",
        balanced_count
    );
}
4447
/// Two generators built from identical inputs and the same seed emit the
/// same document IDs and debit totals, entry for entry.
#[test]
fn test_deterministic_generation() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    // Identical construction for both generators; only the Arc handle differs.
    let build = |chart: Arc<ChartOfAccounts>| {
        JournalEntryGenerator::new_with_params(
            TransactionConfig::default(),
            chart,
            vec!["1000".to_string()],
            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
            42,
        )
    };

    let mut first = build(Arc::clone(&chart));
    let mut second = build(chart);

    for _ in 0..50 {
        let left = first.generate();
        let right = second.generate();
        assert_eq!(left.header.document_id, right.header.document_id);
        assert_eq!(left.total_debit(), right.total_debit());
    }
}
4479
/// With every template feature switched on, each entry carries header text,
/// a reference, a business process and per-line text, and stays balanced.
#[test]
fn test_templates_generate_descriptions() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    // All template features enabled, built piece by piece.
    let names = datasynth_config::schema::NameTemplateConfig {
        generate_realistic_names: true,
        email_domain: "test.com".to_string(),
        culture_distribution: datasynth_config::schema::CultureDistribution::default(),
    };
    let descriptions = datasynth_config::schema::DescriptionTemplateConfig {
        generate_header_text: true,
        generate_line_text: true,
    };
    let references = datasynth_config::schema::ReferenceTemplateConfig {
        generate_references: true,
        invoice_prefix: "TEST-INV".to_string(),
        po_prefix: "TEST-PO".to_string(),
        so_prefix: "TEST-SO".to_string(),
    };
    let template_config = TemplateConfig {
        names,
        descriptions,
        references,
        path: None,
        merge_strategy: datasynth_config::TemplateMergeStrategy::default(),
    };

    // Persona errors are disabled so they cannot interfere with the checks.
    let mut je_gen = JournalEntryGenerator::new_with_full_config(
        TransactionConfig::default(),
        chart,
        vec!["1000".to_string()],
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        42,
        template_config,
        None,
    )
    .with_persona_errors(false);

    for _ in 0..10 {
        let entry = je_gen.generate();
        let header = &entry.header;

        assert!(
            header.header_text.is_some(),
            "Header text should be populated"
        );
        assert!(header.reference.is_some(), "Reference should be populated");
        assert!(
            header.business_process.is_some(),
            "Business process should be set"
        );

        for line in entry.lines.iter() {
            assert!(line.line_text.is_some(), "Line text should be populated");
        }

        // Template features must not disturb the balance of the entry.
        assert!(entry.is_balanced());
    }
}
4549
/// A generator wired to a generated user pool always fills in `created_by`.
#[test]
fn test_user_pool_integration() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    let company_codes = vec!["1000".to_string()];

    // Standard user pool for the single company.
    let mut user_builder = crate::UserGenerator::new(42);
    let pool = user_builder.generate_standard(&company_codes);

    let mut je_gen = JournalEntryGenerator::new_with_full_config(
        TransactionConfig::default(),
        chart,
        company_codes,
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        42,
        TemplateConfig::default(),
        Some(pool),
    );

    for _ in 0..20 {
        let entry = je_gen.generate();

        // Only non-emptiness is asserted: the generator may still fall back
        // to the generic BATCH/USER format if the pool lookup misses.
        assert!(!entry.header.created_by.is_empty());
    }
}
4582
/// `is_using_real_master_data` is false on a fresh generator and true once
/// vendors, customers and materials have been attached.
#[test]
fn test_master_data_connection() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    // Master-data fixtures.
    let vendors = vec![
        Vendor::new("V-TEST-001", "Test Vendor Alpha", VendorType::Supplier),
        Vendor::new("V-TEST-002", "Test Vendor Beta", VendorType::Technology),
    ];
    let customers = vec![
        Customer::new("C-TEST-001", "Test Customer One", CustomerType::Corporate),
        Customer::new(
            "C-TEST-002",
            "Test Customer Two",
            CustomerType::SmallBusiness,
        ),
    ];
    let materials = vec![Material::new(
        "MAT-TEST-001",
        "Test Material A",
        MaterialType::RawMaterial,
    )];

    let bare = JournalEntryGenerator::new_with_params(
        TransactionConfig::default(),
        chart,
        vec!["1000".to_string()],
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        42,
    );

    // Nothing attached yet.
    assert!(!bare.is_using_real_master_data());

    // Attach each master-data set through its own builder method.
    let connected = bare
        .with_vendors(&vendors)
        .with_customers(&customers)
        .with_materials(&materials);

    assert!(connected.is_using_real_master_data());
}
4634
/// `with_master_data` attaches all three master-data sets in one call and
/// flips `is_using_real_master_data` to true.
#[test]
fn test_with_master_data_convenience_method() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    let vendors = vec![Vendor::new("V-001", "Vendor One", VendorType::Supplier)];
    let customers = vec![Customer::new(
        "C-001",
        "Customer One",
        CustomerType::Corporate,
    )];
    let materials = vec![Material::new(
        "MAT-001",
        "Material One",
        MaterialType::RawMaterial,
    )];

    let bare = JournalEntryGenerator::new_with_params(
        TransactionConfig::default(),
        chart,
        vec!["1000".to_string()],
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        42,
    );
    let connected = bare.with_master_data(&vendors, &customers, &materials);

    assert!(connected.is_using_real_master_data());
}
4665
/// `apply_stress_factors` leaves an ordinary mid-week day close to the base
/// rate and raises the rate for month end, year end, Fridays and Mondays.
#[test]
fn test_stress_factors_increase_error_rate() {
    let mut coa_gen =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let coa = Arc::new(coa_gen.generate());

    let generator = JournalEntryGenerator::new_with_params(
        TransactionConfig::default(),
        coa,
        vec!["1000".to_string()],
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        42,
    );

    let base_rate = 0.1;

    // Regular day - no stress factors expected.
    // 2024-06-12 is a genuine mid-week day (Wednesday); the previously used
    // 2024-06-15 is a Saturday, which made the "mid-week" comparisons below
    // compare against a weekend baseline.
    let regular_day = NaiveDate::from_ymd_opt(2024, 6, 12).unwrap(); // Mid-June Wednesday
    let regular_rate = generator.apply_stress_factors(base_rate, regular_day);
    assert!(
        (regular_rate - base_rate).abs() < 0.01,
        "Regular day should have minimal stress factor adjustment"
    );

    // Month end should raise the rate above the regular day.
    let month_end = NaiveDate::from_ymd_opt(2024, 6, 29).unwrap(); // June 29 (Saturday)
    let month_end_rate = generator.apply_stress_factors(base_rate, month_end);
    assert!(
        month_end_rate > regular_rate,
        "Month end should have higher error rate than regular day"
    );

    // Year end should raise the rate above an ordinary month end.
    let year_end = NaiveDate::from_ymd_opt(2024, 12, 30).unwrap(); // December 30 (Monday)
    let year_end_rate = generator.apply_stress_factors(base_rate, year_end);
    assert!(
        year_end_rate > month_end_rate,
        "Year end should have highest error rate"
    );

    // Friday stress
    let friday = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap(); // Friday
    let friday_rate = generator.apply_stress_factors(base_rate, friday);
    assert!(
        friday_rate > regular_rate,
        "Friday should have higher error rate than mid-week"
    );

    // Monday stress
    let monday = NaiveDate::from_ymd_opt(2024, 6, 17).unwrap(); // Monday
    let monday_rate = generator.apply_stress_factors(base_rate, monday);
    assert!(
        monday_rate > regular_rate,
        "Monday should have higher error rate than mid-week"
    );
}
4723
/// Over 200 generated entries (persona errors off), every entry is balanced
/// and at least one posting date is shared by more than one entry.
#[test]
fn test_batching_produces_similar_entries() {
    let mut chart_builder =
        ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
    let chart = Arc::new(chart_builder.generate());

    // Seed 123 was picked as one that is more likely to trigger batching;
    // persona errors are disabled so that entries are expected to balance.
    let mut je_gen = JournalEntryGenerator::new_with_params(
        TransactionConfig::default(),
        chart,
        vec!["1000".to_string()],
        NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
        NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
        123,
    )
    .with_persona_errors(false);

    // Enough entries that some batches should show up.
    let mut entries: Vec<JournalEntry> = Vec::with_capacity(200);
    for _ in 0..200 {
        entries.push(je_gen.generate());
    }

    // Batched or not, every entry has to balance.
    for entry in entries.iter() {
        assert!(
            entry.is_balanced(),
            "All entries including batched should be balanced"
        );
    }

    // Tally entries per posting date; shared dates are the batch indicator.
    let per_date = entries.iter().fold(
        std::collections::HashMap::<NaiveDate, usize>::new(),
        |mut tally, entry| {
            *tally.entry(entry.header.posting_date).or_insert(0) += 1;
            tally
        },
    );

    let dates_with_multiple = per_date.values().filter(|&&hits| hits > 1).count();
    assert!(
        dates_with_multiple > 0,
        "With batching, should see some dates with multiple entries"
    );
}
4766
4767    #[test]
4768    fn test_temporal_patterns_business_days() {
4769        use datasynth_config::schema::{
4770            BusinessDaySchemaConfig, CalendarSchemaConfig, TemporalPatternsConfig,
4771        };
4772
4773        let mut coa_gen =
4774            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4775        let coa = Arc::new(coa_gen.generate());
4776
4777        // Create temporal patterns config with business days enabled
4778        let temporal_config = TemporalPatternsConfig {
4779            enabled: true,
4780            business_days: BusinessDaySchemaConfig {
4781                enabled: true,
4782                ..Default::default()
4783            },
4784            calendars: CalendarSchemaConfig {
4785                regions: vec!["US".to_string()],
4786                custom_holidays: vec![],
4787            },
4788            ..Default::default()
4789        };
4790
4791        let mut je_gen = JournalEntryGenerator::new_with_params(
4792            TransactionConfig::default(),
4793            coa,
4794            vec!["1000".to_string()],
4795            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4796            NaiveDate::from_ymd_opt(2024, 3, 31).unwrap(), // Q1 2024
4797            42,
4798        )
4799        .with_temporal_patterns(temporal_config, 42)
4800        .with_persona_errors(false);
4801
4802        // Generate entries and verify none fall on weekends
4803        let entries: Vec<JournalEntry> = (0..100).map(|_| je_gen.generate()).collect();
4804
4805        for entry in &entries {
4806            let weekday = entry.header.posting_date.weekday();
4807            assert!(
4808                weekday != chrono::Weekday::Sat && weekday != chrono::Weekday::Sun,
4809                "Posting date {:?} should not be a weekend",
4810                entry.header.posting_date
4811            );
4812        }
4813    }
4814
4815    #[test]
4816    fn test_default_generation_filters_weekends() {
4817        // Verify that weekend entries are <5% even when temporal_patterns is NOT enabled.
4818        // This tests the fix where new_with_full_config always creates a default
4819        // BusinessDayCalculator with US holidays as a fallback.
4820        let mut coa_gen =
4821            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4822        let coa = Arc::new(coa_gen.generate());
4823
4824        let mut je_gen = JournalEntryGenerator::new_with_params(
4825            TransactionConfig::default(),
4826            coa,
4827            vec!["1000".to_string()],
4828            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4829            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4830            42,
4831        )
4832        .with_persona_errors(false);
4833
4834        let total = 500;
4835        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4836
4837        let weekend_count = entries
4838            .iter()
4839            .filter(|e| {
4840                let wd = e.header.posting_date.weekday();
4841                wd == chrono::Weekday::Sat || wd == chrono::Weekday::Sun
4842            })
4843            .count();
4844
4845        let weekend_pct = weekend_count as f64 / total as f64;
4846        assert!(
4847            weekend_pct < 0.05,
4848            "Expected weekend entries <5% of total without temporal_patterns enabled, \
4849             but got {:.1}% ({}/{})",
4850            weekend_pct * 100.0,
4851            weekend_count,
4852            total
4853        );
4854    }
4855
4856    #[test]
4857    fn test_document_type_derived_from_business_process() {
4858        let mut coa_gen =
4859            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4860        let coa = Arc::new(coa_gen.generate());
4861
4862        let mut je_gen = JournalEntryGenerator::new_with_params(
4863            TransactionConfig::default(),
4864            coa,
4865            vec!["1000".to_string()],
4866            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4867            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4868            99,
4869        )
4870        .with_persona_errors(false)
4871        .with_batching(false);
4872
4873        let total = 200;
4874        let mut doc_types = std::collections::HashSet::new();
4875        let mut sa_count = 0_usize;
4876
4877        for _ in 0..total {
4878            let entry = je_gen.generate();
4879            let dt = &entry.header.document_type;
4880            doc_types.insert(dt.clone());
4881            if dt == "SA" {
4882                sa_count += 1;
4883            }
4884        }
4885
4886        // Should have more than 3 distinct document types
4887        assert!(
4888            doc_types.len() > 3,
4889            "Expected >3 distinct document types, got {} ({:?})",
4890            doc_types.len(),
4891            doc_types,
4892        );
4893
4894        // "SA" should be less than 50% (R2R is 20% of the weight)
4895        let sa_pct = sa_count as f64 / total as f64;
4896        assert!(
4897            sa_pct < 0.50,
4898            "Expected SA <50%, got {:.1}% ({}/{})",
4899            sa_pct * 100.0,
4900            sa_count,
4901            total,
4902        );
4903    }
4904
4905    #[test]
4906    fn test_enrich_line_items_account_description() {
4907        let mut coa_gen =
4908            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4909        let coa = Arc::new(coa_gen.generate());
4910
4911        let mut je_gen = JournalEntryGenerator::new_with_params(
4912            TransactionConfig::default(),
4913            coa,
4914            vec!["1000".to_string()],
4915            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4916            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4917            42,
4918        )
4919        .with_persona_errors(false);
4920
4921        let total = 200;
4922        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4923
4924        // Count lines with account_description populated
4925        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
4926        let lines_with_desc: usize = entries
4927            .iter()
4928            .flat_map(|e| &e.lines)
4929            .filter(|l| l.account_description.is_some())
4930            .count();
4931
4932        let desc_pct = lines_with_desc as f64 / total_lines as f64;
4933        assert!(
4934            desc_pct > 0.95,
4935            "Expected >95% of lines to have account_description, got {:.1}% ({}/{})",
4936            desc_pct * 100.0,
4937            lines_with_desc,
4938            total_lines,
4939        );
4940    }
4941
4942    #[test]
4943    fn test_enrich_line_items_cost_center_for_expense_accounts() {
4944        let mut coa_gen =
4945            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4946        let coa = Arc::new(coa_gen.generate());
4947
4948        let mut je_gen = JournalEntryGenerator::new_with_params(
4949            TransactionConfig::default(),
4950            coa,
4951            vec!["1000".to_string()],
4952            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4953            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4954            42,
4955        )
4956        .with_persona_errors(false);
4957
4958        let total = 300;
4959        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4960
4961        // Count expense account lines (5xxx/6xxx) with cost_center populated
4962        let expense_lines: Vec<&JournalEntryLine> = entries
4963            .iter()
4964            .flat_map(|e| &e.lines)
4965            .filter(|l| {
4966                let first = l.gl_account.chars().next().unwrap_or('0');
4967                first == '5' || first == '6'
4968            })
4969            .collect();
4970
4971        if !expense_lines.is_empty() {
4972            let with_cc = expense_lines
4973                .iter()
4974                .filter(|l| l.cost_center.is_some())
4975                .count();
4976            let cc_pct = with_cc as f64 / expense_lines.len() as f64;
4977            assert!(
4978                cc_pct > 0.80,
4979                "Expected >80% of expense lines to have cost_center, got {:.1}% ({}/{})",
4980                cc_pct * 100.0,
4981                with_cc,
4982                expense_lines.len(),
4983            );
4984        }
4985    }
4986
4987    #[test]
4988    fn test_enrich_line_items_profit_center_and_line_text() {
4989        let mut coa_gen =
4990            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4991        let coa = Arc::new(coa_gen.generate());
4992
4993        let mut je_gen = JournalEntryGenerator::new_with_params(
4994            TransactionConfig::default(),
4995            coa,
4996            vec!["1000".to_string()],
4997            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4998            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4999            42,
5000        )
5001        .with_persona_errors(false);
5002
5003        let total = 100;
5004        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
5005
5006        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
5007
5008        // All lines should have profit_center
5009        let with_pc = entries
5010            .iter()
5011            .flat_map(|e| &e.lines)
5012            .filter(|l| l.profit_center.is_some())
5013            .count();
5014        let pc_pct = with_pc as f64 / total_lines as f64;
5015        assert!(
5016            pc_pct > 0.95,
5017            "Expected >95% of lines to have profit_center, got {:.1}% ({}/{})",
5018            pc_pct * 100.0,
5019            with_pc,
5020            total_lines,
5021        );
5022
5023        // All lines should have line_text (either from template or header fallback)
5024        let with_text = entries
5025            .iter()
5026            .flat_map(|e| &e.lines)
5027            .filter(|l| l.line_text.is_some())
5028            .count();
5029        let text_pct = with_text as f64 / total_lines as f64;
5030        assert!(
5031            text_pct > 0.95,
5032            "Expected >95% of lines to have line_text, got {:.1}% ({}/{})",
5033            text_pct * 100.0,
5034            with_text,
5035            total_lines,
5036        );
5037    }
5038
5039    // --- ISA 240 audit flag tests ---
5040
5041    #[test]
5042    fn test_je_has_audit_flags() {
5043        let mut coa_gen =
5044            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5045        let coa = Arc::new(coa_gen.generate());
5046
5047        let mut je_gen = JournalEntryGenerator::new_with_params(
5048            TransactionConfig::default(),
5049            coa,
5050            vec!["1000".to_string()],
5051            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5052            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5053            42,
5054        )
5055        .with_persona_errors(false);
5056
5057        for _ in 0..100 {
5058            let entry = je_gen.generate();
5059
5060            // source_system should always be non-empty
5061            assert!(
5062                !entry.header.source_system.is_empty(),
5063                "source_system should be populated, got empty string"
5064            );
5065
5066            // created_by should always be non-empty (already tested elsewhere, but confirm)
5067            assert!(
5068                !entry.header.created_by.is_empty(),
5069                "created_by should be populated"
5070            );
5071
5072            // created_date should always be populated
5073            assert!(
5074                entry.header.created_date.is_some(),
5075                "created_date should be populated"
5076            );
5077        }
5078    }
5079
5080    #[test]
5081    fn test_manual_entry_rate() {
5082        let mut coa_gen =
5083            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5084        let coa = Arc::new(coa_gen.generate());
5085
5086        let mut je_gen = JournalEntryGenerator::new_with_params(
5087            TransactionConfig::default(),
5088            coa,
5089            vec!["1000".to_string()],
5090            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5091            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5092            42,
5093        )
5094        .with_persona_errors(false)
5095        .with_batching(false);
5096
5097        let total = 1000;
5098        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
5099
5100        let manual_count = entries.iter().filter(|e| e.header.is_manual).count();
5101        let manual_rate = manual_count as f64 / total as f64;
5102
5103        // Default source_distribution.manual is typically around 0.05-0.15
5104        // Allow a wide tolerance for statistical variation
5105        assert!(
5106            manual_rate > 0.01 && manual_rate < 0.50,
5107            "Manual entry rate should be reasonable (1%-50%), got {:.1}% ({}/{})",
5108            manual_rate * 100.0,
5109            manual_count,
5110            total,
5111        );
5112
5113        // is_manual should match TransactionSource::Manual
5114        for entry in &entries {
5115            let source_is_manual = entry.header.source == TransactionSource::Manual;
5116            assert_eq!(
5117                entry.header.is_manual, source_is_manual,
5118                "is_manual should match source == Manual"
5119            );
5120        }
5121    }
5122
5123    #[test]
5124    fn test_manual_source_consistency() {
5125        let mut coa_gen =
5126            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5127        let coa = Arc::new(coa_gen.generate());
5128
5129        let mut je_gen = JournalEntryGenerator::new_with_params(
5130            TransactionConfig::default(),
5131            coa,
5132            vec!["1000".to_string()],
5133            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5134            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5135            42,
5136        )
5137        .with_persona_errors(false)
5138        .with_batching(false);
5139
5140        for _ in 0..500 {
5141            let entry = je_gen.generate();
5142
5143            if entry.header.is_manual {
5144                // Manual entries must have a source_system in the
5145                // `manual/...` or `spreadsheet/...` family (the bare
5146                // legacy `manual` and `spreadsheet` values are also
5147                // accepted to keep older fixtures working).
5148                let s = entry.header.source_system.as_str();
5149                assert!(
5150                    s == "manual"
5151                        || s == "spreadsheet"
5152                        || s.starts_with("manual/")
5153                        || s.starts_with("spreadsheet/"),
5154                    "Manual entry should have source_system in `manual` / `spreadsheet` family, got '{s}'",
5155                );
5156            } else {
5157                // Non-manual entries must NOT be in the manual/spreadsheet family.
5158                let s = entry.header.source_system.as_str();
5159                assert!(
5160                    !(s == "manual"
5161                        || s == "spreadsheet"
5162                        || s.starts_with("manual/")
5163                        || s.starts_with("spreadsheet/")),
5164                    "Non-manual entry should not be in `manual` / `spreadsheet` family, got '{s}'",
5165                );
5166            }
5167        }
5168    }
5169
5170    #[test]
5171    fn test_default_source_codes_breadth() {
5172        // T2-D Lever 1: with no industry priors and the default config, the
5173        // `source` column carries a broad generic SAP doc-type mix
5174        // (sap_source_code populated) instead of collapsing to the
5175        // TransactionSource enum. See experiments/ml/FINDINGS.md §6.
5176        let mut coa_gen =
5177            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 7);
5178        let coa = Arc::new(coa_gen.generate());
5179        let mut je_gen = JournalEntryGenerator::new_with_params(
5180            TransactionConfig::default(),
5181            coa,
5182            vec!["1000".to_string()],
5183            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5184            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5185            7,
5186        )
5187        .with_persona_errors(false)
5188        .with_batching(false);
5189
5190        let mut codes = std::collections::HashSet::new();
5191        for _ in 0..500 {
5192            let e = je_gen.generate();
5193            let code = e
5194                .header
5195                .sap_source_code
5196                .expect("default config should populate sap_source_code");
5197            codes.insert(code);
5198        }
5199        assert!(
5200            codes.len() >= 10,
5201            "default source-mix should be broad (>=10 distinct codes), got {}",
5202            codes.len()
5203        );
5204    }
5205
5206    #[test]
5207    fn test_source_codes_opt_out() {
5208        // synthetic_source_codes = Some(false) restores the legacy behaviour:
5209        // sap_source_code stays None and `source` falls back to the enum.
5210        let mut coa_gen =
5211            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 9);
5212        let coa = Arc::new(coa_gen.generate());
5213        let cfg = TransactionConfig {
5214            synthetic_source_codes: Some(false),
5215            ..TransactionConfig::default()
5216        };
5217        let mut je_gen = JournalEntryGenerator::new_with_params(
5218            cfg,
5219            coa,
5220            vec!["1000".to_string()],
5221            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5222            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5223            9,
5224        )
5225        .with_persona_errors(false)
5226        .with_batching(false);
5227        for _ in 0..50 {
5228            let e = je_gen.generate();
5229            assert!(
5230                e.header.sap_source_code.is_none(),
5231                "opt-out should leave sap_source_code None (legacy enum source)"
5232            );
5233        }
5234    }
5235
5236    #[test]
5237    fn test_recurring_templates_reuse_archetypes() {
5238        // SOTA-1: with templating on (default), generated JEs reuse account
5239        // archetypes (far fewer distinct than the legacy uniform-per-line
5240        // selection), and balance is preserved either way.
5241        fn run(recurring: Option<bool>) -> (usize, usize, bool) {
5242            let mut coa_gen = ChartOfAccountsGenerator::new(
5243                CoAComplexity::Medium,
5244                IndustrySector::Manufacturing,
5245                11,
5246            );
5247            let coa = Arc::new(coa_gen.generate());
5248            let cfg = TransactionConfig {
5249                recurring_templates: recurring,
5250                ..TransactionConfig::default()
5251            };
5252            let mut g = JournalEntryGenerator::new_with_params(
5253                cfg,
5254                coa,
5255                vec!["1000".to_string()],
5256                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5257                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5258                11,
5259            )
5260            .with_persona_errors(false)
5261            .with_batching(false);
5262            let n = 800;
5263            let mut arche = std::collections::HashSet::new();
5264            let mut balanced = true;
5265            for _ in 0..n {
5266                let e = g.generate();
5267                if !e.is_balanced() {
5268                    balanced = false;
5269                }
5270                let mut sig: Vec<(String, bool)> = e
5271                    .lines
5272                    .iter()
5273                    .map(|l| (l.gl_account.clone(), l.debit_amount > Decimal::ZERO))
5274                    .collect();
5275                sig.sort();
5276                arche.insert(sig);
5277            }
5278            (n, arche.len(), balanced)
5279        }
5280        let (n, distinct_on, bal_on) = run(Some(true));
5281        let (_, distinct_off, bal_off) = run(Some(false));
5282        assert!(bal_on && bal_off, "balance preserved in both modes");
5283        assert!(
5284            distinct_on < distinct_off,
5285            "templating should reduce distinct archetypes ({distinct_on} on vs {distinct_off} off)"
5286        );
5287        assert!(
5288            distinct_on * 2 < n,
5289            "templating should reuse heavily: {distinct_on} distinct archetypes over {n} JEs"
5290        );
5291    }
5292
5293    #[test]
5294    fn test_reversal_process_emits_balanced_reversals() {
5295        // SOTA-5: with reversal_rate > 0, some JEs are balanced reversals of
5296        // earlier ones (header_text "Reversal of ..."); rate 0.0 emits none.
5297        fn run(rate: Option<f64>) -> (usize, bool) {
5298            let mut coa_gen = ChartOfAccountsGenerator::new(
5299                CoAComplexity::Small,
5300                IndustrySector::Manufacturing,
5301                13,
5302            );
5303            let coa = Arc::new(coa_gen.generate());
5304            let cfg = TransactionConfig {
5305                reversal_rate: rate,
5306                ..TransactionConfig::default()
5307            };
5308            let mut g = JournalEntryGenerator::new_with_params(
5309                cfg,
5310                coa,
5311                vec!["1000".to_string()],
5312                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5313                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5314                13,
5315            )
5316            .with_persona_errors(false)
5317            .with_batching(false);
5318            let mut reversals = 0;
5319            let mut balanced = true;
5320            for _ in 0..1000 {
5321                let e = g.generate();
5322                if !e.is_balanced() {
5323                    balanced = false;
5324                }
5325                if e.header
5326                    .header_text
5327                    .as_deref()
5328                    .is_some_and(|t| t.starts_with("Reversal of"))
5329                {
5330                    reversals += 1;
5331                }
5332            }
5333            (reversals, balanced)
5334        }
5335        let (rev_on, bal_on) = run(Some(0.05));
5336        let (rev_off, bal_off) = run(Some(0.0));
5337        assert!(bal_on && bal_off, "all entries balanced incl. reversals");
5338        assert_eq!(rev_off, 0, "rate 0.0 emits no reversals, got {rev_off}");
5339        assert!(rev_on > 0, "rate 0.05 should emit reversals, got {rev_on}");
5340    }
5341
5342    #[test]
5343    fn test_account_concentration_creates_pareto() {
5344        // SOTA-2: with concentration on (default), a hot subset of accounts
5345        // carries most lines (the corpus account-activity Pareto, top-10% ≈ 95%)
5346        // vs the legacy near-uniform pool draw. Templating + reversals are held
5347        // off so the only difference between the two runs is the power-law pick.
5348        fn run(concentration: Option<bool>) -> (f64, bool) {
5349            let mut coa_gen = ChartOfAccountsGenerator::new(
5350                CoAComplexity::Medium,
5351                IndustrySector::Manufacturing,
5352                17,
5353            );
5354            let coa = Arc::new(coa_gen.generate());
5355            let cfg = TransactionConfig {
5356                account_concentration: concentration,
5357                recurring_templates: Some(false),
5358                reversal_rate: Some(0.0),
5359                ..TransactionConfig::default()
5360            };
5361            let mut g = JournalEntryGenerator::new_with_params(
5362                cfg,
5363                coa,
5364                vec!["1000".to_string()],
5365                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5366                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5367                17,
5368            )
5369            .with_persona_errors(false)
5370            .with_batching(false);
5371            let mut counts: std::collections::HashMap<String, usize> =
5372                std::collections::HashMap::new();
5373            let mut total_lines = 0usize;
5374            let mut balanced = true;
5375            for _ in 0..1000 {
5376                let e = g.generate();
5377                if !e.is_balanced() {
5378                    balanced = false;
5379                }
5380                for l in &e.lines {
5381                    *counts.entry(l.gl_account.clone()).or_default() += 1;
5382                    total_lines += 1;
5383                }
5384            }
5385            // share of lines carried by the top-10% most-active accounts (the
5386            // corpus_structure "acct top10%" metric, over active accounts).
5387            let mut v: Vec<usize> = counts.values().copied().collect();
5388            v.sort_unstable_by(|a, b| b.cmp(a));
5389            let top_k = ((v.len() as f64 * 0.10).ceil() as usize).max(1);
5390            let top_share = v.iter().take(top_k).sum::<usize>() as f64 / total_lines as f64;
5391            (top_share, balanced)
5392        }
5393        let (share_on, bal_on) = run(Some(true));
5394        let (share_off, bal_off) = run(Some(false));
5395        assert!(bal_on && bal_off, "balance preserved in both modes");
5396        assert!(
5397            share_on > share_off + 0.20,
5398            "concentration should raise the top-10% line share ({share_on:.3} on vs {share_off:.3} off)"
5399        );
5400        assert!(
5401            share_on > 0.50,
5402            "hot accounts should dominate: top-10% line share {share_on:.3}"
5403        );
5404    }
5405
5406    #[test]
5407    fn test_allocation_batch_emits_large_balanced_postings() {
5408        // SOTA-6: with allocation_batch_rate > 0, some JEs are large 1-to-many
5409        // allocation batches (source "AB", many cost-center-spread lines, still
5410        // balanced); rate 0.0 emits none. Reversals are disabled to isolate the
5411        // allocation process (which shares the recent-JE buffer).
5412        fn run(rate: Option<f64>) -> (usize, bool, usize) {
5413            let mut coa_gen = ChartOfAccountsGenerator::new(
5414                CoAComplexity::Small,
5415                IndustrySector::Manufacturing,
5416                23,
5417            );
5418            let coa = Arc::new(coa_gen.generate());
5419            let cfg = TransactionConfig {
5420                allocation_batch_rate: rate,
5421                reversal_rate: Some(0.0),
5422                ..TransactionConfig::default()
5423            };
5424            let mut g = JournalEntryGenerator::new_with_params(
5425                cfg,
5426                coa,
5427                vec!["1000".to_string()],
5428                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5429                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5430                23,
5431            )
5432            .with_persona_errors(false)
5433            .with_batching(false);
5434            let mut batches = 0usize;
5435            let mut balanced = true;
5436            let mut max_distinct_cc = 0usize;
5437            for _ in 0..2000 {
5438                let e = g.generate();
5439                if !e.is_balanced() {
5440                    balanced = false;
5441                }
5442                if e.header.sap_source_code.as_deref() == Some("AB") {
5443                    batches += 1;
5444                    assert!(
5445                        e.lines.len() >= ALLOCATION_MIN_TARGETS as usize,
5446                        "allocation batch should be large, got {} lines",
5447                        e.lines.len()
5448                    );
5449                    let ccs: std::collections::HashSet<String> = e
5450                        .lines
5451                        .iter()
5452                        .filter_map(|l| l.cost_center.clone())
5453                        .collect();
5454                    max_distinct_cc = max_distinct_cc.max(ccs.len());
5455                }
5456            }
5457            (batches, balanced, max_distinct_cc)
5458        }
5459        let (on, bal_on, cc) = run(Some(0.10));
5460        let (off, bal_off, _) = run(Some(0.0));
5461        assert!(
5462            bal_on && bal_off,
5463            "all entries balanced incl. allocation batches"
5464        );
5465        assert_eq!(off, 0, "rate 0.0 emits no allocation batches, got {off}");
5466        assert!(on > 0, "rate 0.10 should emit allocation batches, got {on}");
5467        assert!(
5468            cc > 1,
5469            "allocation should spread across multiple cost centers, got {cc}"
5470        );
5471    }
5472
5473    #[test]
5474    fn test_derived_id_processes_keep_document_ids_unique() {
5475        // SOTA-5/6 regression: reversals and allocation batches mint derived ids
5476        // (`base ^ salt`). Reusing the same base would duplicate an id — the
5477        // failure `test_document_reference_integrity` caught. With both processes
5478        // at high rates, every emitted document id must still be unique.
5479        let mut coa_gen =
5480            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 31);
5481        let coa = Arc::new(coa_gen.generate());
5482        let cfg = TransactionConfig {
5483            reversal_rate: Some(0.15),
5484            allocation_batch_rate: Some(0.10),
5485            ..TransactionConfig::default()
5486        };
5487        let mut g = JournalEntryGenerator::new_with_params(
5488            cfg,
5489            coa,
5490            vec!["1000".to_string()],
5491            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5492            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5493            31,
5494        )
5495        .with_persona_errors(false)
5496        .with_batching(false);
5497        let mut ids = std::collections::HashSet::new();
5498        let n = 3000;
5499        for _ in 0..n {
5500            let e = g.generate();
5501            assert!(
5502                ids.insert(e.header.document_id),
5503                "duplicate document id {} (derived-id collision)",
5504                e.header.document_id
5505            );
5506        }
5507        assert_eq!(ids.len(), n, "all {n} document ids unique");
5508    }
5509
5510    #[test]
5511    fn test_business_unit_rolls_up_from_cost_center() {
5512        // SOTA-3: with the dimension on (default), a line that has a cost center
5513        // (or, as fallback, a profit center) also carries a business_unit that is
5514        // a deterministic roll-up of that dimension (same value → same BU, in
5515        // BU01..BU11); with it off, BU is empty.
5516        fn run(enabled: Option<bool>) -> (usize, usize, bool, bool) {
5517            let mut coa_gen = ChartOfAccountsGenerator::new(
5518                CoAComplexity::Medium,
5519                IndustrySector::Manufacturing,
5520                19,
5521            );
5522            let coa = Arc::new(coa_gen.generate());
5523            let cfg = TransactionConfig {
5524                business_unit_dimension: enabled,
5525                ..TransactionConfig::default()
5526            };
5527            let mut g = JournalEntryGenerator::new_with_params(
5528                cfg,
5529                coa,
5530                vec!["1000".to_string()],
5531                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5532                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5533                19,
5534            )
5535            .with_persona_errors(false)
5536            .with_batching(false);
5537            let mut dim_lines = 0usize;
5538            let mut bu_lines = 0usize;
5539            let mut consistent = true; // BU present ⇒ matches the roll-up of its CC/PC
5540            let mut well_formed = true; // BU in BU01..BU11
5541            let mut dim_to_bu: std::collections::HashMap<String, String> =
5542                std::collections::HashMap::new();
5543            for _ in 0..600 {
5544                let e = g.generate();
5545                for l in &e.lines {
5546                    // BU rolls up from the cost center, or profit center as fallback.
5547                    let dim = l.cost_center.as_deref().or(l.profit_center.as_deref());
5548                    if dim.is_some() {
5549                        dim_lines += 1;
5550                    }
5551                    if let Some(bu) = &l.business_unit {
5552                        bu_lines += 1;
5553                        let d = dim.unwrap_or_default().to_string();
5554                        if bu != &JournalEntryGenerator::business_unit_for_dimension(&d) {
5555                            consistent = false;
5556                        }
5557                        // stable mapping across the run
5558                        if dim_to_bu
5559                            .insert(d, bu.clone())
5560                            .is_some_and(|prev| &prev != bu)
5561                        {
5562                            consistent = false;
5563                        }
5564                        let n_ok = bu.strip_prefix("BU").and_then(|d| d.parse::<u32>().ok());
5565                        if !matches!(n_ok, Some(1..=11)) {
5566                            well_formed = false;
5567                        }
5568                    }
5569                }
5570            }
5571            (dim_lines, bu_lines, consistent, well_formed)
5572        }
5573        let (dim_on, bu_on, consistent, well_formed) = run(Some(true));
5574        let (_, bu_off, _, _) = run(Some(false));
5575        assert!(
5576            dim_on > 0 && bu_on > 0,
5577            "BU should be populated where CC/PC is"
5578        );
5579        assert_eq!(
5580            dim_on, bu_on,
5581            "every CC/PC-bearing line gets a BU ({dim_on} dim vs {bu_on} BU)"
5582        );
5583        assert!(
5584            consistent,
5585            "BU must be the deterministic roll-up of its CC/PC"
5586        );
5587        assert!(well_formed, "BU codes must be BU01..BU11");
5588        assert_eq!(bu_off, 0, "dimension off ⇒ no business_unit, got {bu_off}");
5589    }
5590
5591    #[test]
5592    fn test_foreign_currency_sap_style() {
5593        // SOTA-4: with foreign_currency_rate > 0, some JEs post in a foreign
5594        // document currency. The ledger amounts (debit/credit) stay company
5595        // currency and the JE still balances; the foreign value lands in
5596        // transaction_amount and balances in the transaction currency too. rate
5597        // 0.0 → all company-currency. Reversals/allocations off to isolate.
5598        fn run(rate: Option<f64>) -> (usize, bool, bool) {
5599            let mut coa_gen = ChartOfAccountsGenerator::new(
5600                CoAComplexity::Small,
5601                IndustrySector::Manufacturing,
5602                29,
5603            );
5604            let coa = Arc::new(coa_gen.generate());
5605            let cfg = TransactionConfig {
5606                foreign_currency_rate: rate,
5607                reversal_rate: Some(0.0),
5608                allocation_batch_rate: Some(0.0),
5609                ..TransactionConfig::default()
5610            };
5611            let mut g = JournalEntryGenerator::new_with_params(
5612                cfg,
5613                coa,
5614                vec!["1000".to_string()],
5615                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5616                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5617                29,
5618            )
5619            .with_persona_errors(false)
5620            .with_batching(false);
5621            let mut foreign = 0usize;
5622            let mut ledger_ok = true; // debit == credit (company ledger)
5623            let mut txn_ok = true; // foreign lines carry transaction_amount + balance in txn ccy
5624            for _ in 0..1500 {
5625                let e = g.generate();
5626                if !e.is_balanced() {
5627                    ledger_ok = false;
5628                }
5629                if e.header.currency != "USD" {
5630                    foreign += 1;
5631                    if !e.lines.iter().all(|l| l.transaction_amount.is_some()) {
5632                        txn_ok = false;
5633                    }
5634                    let td: Decimal = e
5635                        .lines
5636                        .iter()
5637                        .filter(|l| l.debit_amount > Decimal::ZERO)
5638                        .filter_map(|l| l.transaction_amount)
5639                        .sum();
5640                    let tc: Decimal = e
5641                        .lines
5642                        .iter()
5643                        .filter(|l| l.credit_amount > Decimal::ZERO)
5644                        .filter_map(|l| l.transaction_amount)
5645                        .sum();
5646                    // tolerate per-line cent rounding (≤ n_lines half-cents)
5647                    let tol = Decimal::new(e.lines.len() as i64, 2);
5648                    if (td - tc).abs() > tol {
5649                        txn_ok = false;
5650                    }
5651                }
5652            }
5653            (foreign, ledger_ok, txn_ok)
5654        }
5655        let (fon, lbal_on, tbal_on) = run(Some(0.20));
5656        let (foff, lbal_off, _) = run(Some(0.0));
5657        assert!(
5658            lbal_on && lbal_off,
5659            "ledger balance (debit==credit) preserved in both modes"
5660        );
5661        assert!(
5662            fon > 0,
5663            "rate 0.20 should produce foreign-currency JEs, got {fon}"
5664        );
5665        assert_eq!(foff, 0, "rate 0.0 ⇒ no foreign JEs, got {foff}");
5666        assert!(
5667            tbal_on,
5668            "foreign JEs carry transaction_amount + balance in the transaction currency"
5669        );
5670    }
5671
5672    #[test]
5673    fn test_created_date_before_posting() {
5674        let mut coa_gen =
5675            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5676        let coa = Arc::new(coa_gen.generate());
5677
5678        let mut je_gen = JournalEntryGenerator::new_with_params(
5679            TransactionConfig::default(),
5680            coa,
5681            vec!["1000".to_string()],
5682            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5683            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5684            42,
5685        )
5686        .with_persona_errors(false);
5687
5688        for _ in 0..500 {
5689            let entry = je_gen.generate();
5690
5691            if let Some(created_date) = entry.header.created_date {
5692                let created_naive_date = created_date.date();
5693                assert!(
5694                    created_naive_date <= entry.header.posting_date,
5695                    "created_date ({}) should be <= posting_date ({})",
5696                    created_naive_date,
5697                    entry.header.posting_date,
5698                );
5699            }
5700        }
5701    }
5702
5703    /// SP3.5b — verify that `apply_calibration_step` mutates the generator's
5704    /// amount_sampler when a `"amounts.lognormal_sigma"` step is applied, and
5705    /// that `"amounts.round_dollar_share"` likewise updates the probability.
5706    #[test]
5707    fn apply_calibration_step_updates_lognormal_sigma() {
5708        let mut coa_gen =
5709            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5710        let coa = Arc::new(coa_gen.generate());
5711
5712        let mut gen = JournalEntryGenerator::new_with_params(
5713            TransactionConfig::default(),
5714            coa,
5715            vec!["1000".to_string()],
5716            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5717            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5718            42,
5719        );
5720
5721        let baseline_sigma = gen.amount_sampler.lognormal_sigma();
5722
5723        let step_sigma = crate::velocity_calibrator::CalibrationStep {
5724            rule_id: "R6".to_string(),
5725            parameter: "amounts.lognormal_sigma".to_string(),
5726            delta: 0.01,
5727            new_value: baseline_sigma + 0.01,
5728        };
5729        gen.apply_calibration_step(&step_sigma);
5730        assert!(
5731            (gen.amount_sampler.lognormal_sigma() - (baseline_sigma + 0.01)).abs() < 1e-9,
5732            "lognormal_sigma should be updated to {}",
5733            baseline_sigma + 0.01
5734        );
5735
5736        let baseline_round = gen.amount_sampler.round_number_probability();
5737        let step_round = crate::velocity_calibrator::CalibrationStep {
5738            rule_id: "R9".to_string(),
5739            parameter: "amounts.round_dollar_share".to_string(),
5740            delta: -0.005,
5741            new_value: (baseline_round - 0.005).max(0.0),
5742        };
5743        gen.apply_calibration_step(&step_round);
5744        let expected = (baseline_round - 0.005).max(0.0).clamp(0.0, 1.0);
5745        assert!(
5746            (gen.amount_sampler.round_number_probability() - expected).abs() < 1e-9,
5747            "round_number_probability should be updated to {}",
5748            expected
5749        );
5750    }
5751
5752    #[test]
5753    fn master_data_resolver_fills_every_pii_kind() {
5754        use datasynth_core::distributions::text_taxonomy::{
5755            PiiPlaceholderKind, PlaceholderResolver,
5756        };
5757        let mut r = MasterDataResolver {
5758            companies: vec!["Acme AG".to_string()],
5759            persons: vec!["Hans Muster".to_string()],
5760            streets: vec!["Hauptstrasse 1".to_string()],
5761            patients: vec!["Patient X".to_string()],
5762        };
5763        let mut rng = rand::rng();
5764        assert_eq!(r.resolve(PiiPlaceholderKind::Company, &mut rng), "Acme AG");
5765        assert_eq!(
5766            r.resolve(PiiPlaceholderKind::Person, &mut rng),
5767            "Hans Muster"
5768        );
5769        assert_eq!(
5770            r.resolve(PiiPlaceholderKind::Street, &mut rng),
5771            "Hauptstrasse 1"
5772        );
5773        assert_eq!(
5774            r.resolve(PiiPlaceholderKind::Patient, &mut rng),
5775            "Patient X"
5776        );
5777    }
5778
5779    #[test]
5780    fn master_data_resolver_empty_pool_falls_back() {
5781        use datasynth_core::distributions::text_taxonomy::{
5782            PiiPlaceholderKind, PlaceholderResolver,
5783        };
5784        let mut r = MasterDataResolver::default();
5785        let mut rng = rand::rng();
5786        let v = r.resolve(PiiPlaceholderKind::Company, &mut rng);
5787        assert!(!v.is_empty());
5788    }
5789
5790    /// Pin the shape invariant on `synthetic_patient_pool`: each entry, once
5791    /// filled into the canonical `*{patient} G:{date}…` template the corpus
5792    /// DZ/RG/RS classes use, must not introduce a *structural* residual-PII
5793    /// shape. Regression guard for the JE_79-class smoke failure: the old pool
5794    /// (`"B. Muster"`, `"A. Beispiel"`, …) shaped each fill as
5795    /// `<initial>. <surname>` which `RE_INITIAL_SURNAME` flags.
5796    ///
5797    /// NB: the `given_name` pattern is deliberately EXCLUDED here. These are
5798    /// synthetic *fill* values that are name-shaped by design (they fill
5799    /// `{patient}`); `given_name` is a template-scan signal for un-tokenized
5800    /// corpus names, not a check on legitimate synthetic output.
5801    #[test]
5802    fn synthetic_patient_pool_entries_pass_residual_scan() {
5803        use datasynth_core::distributions::text_taxonomy::PlaceholderGrammar;
5804        for name in synthetic_patient_pool("de_CH") {
5805            let filled = format!("*{name} G:2024-01-15 E:2024-01-20 A:2024-02-01");
5806            let structural: Vec<_> = PlaceholderGrammar::residual_pii_scan(&filled)
5807                .into_iter()
5808                .filter(|h| h.pattern != "given_name")
5809                .collect();
5810            assert!(
5811                structural.is_empty(),
5812                "synthetic patient name {name:?} fills to PII-shaped {filled:?}: {structural:?}"
5813            );
5814        }
5815    }
5816
5817    #[test]
5818    fn master_data_resolver_fallbacks_are_non_empty_and_placeholder_free() {
5819        use datasynth_core::distributions::text_taxonomy::{
5820            PiiPlaceholderKind, PlaceholderResolver,
5821        };
5822        // Verify fallback constants for every kind are non-empty and contain
5823        // no `{…}` literal placeholders (the resolver must never leak the
5824        // unfilled placeholder token into emitted text).
5825        let mut r = MasterDataResolver::default();
5826        let mut rng = rand::rng();
5827        for kind in [
5828            PiiPlaceholderKind::Company,
5829            PiiPlaceholderKind::Person,
5830            PiiPlaceholderKind::Street,
5831            PiiPlaceholderKind::Patient,
5832        ] {
5833            let v = r.resolve(kind, &mut rng);
5834            assert!(!v.is_empty(), "fallback for {kind:?} must be non-empty");
5835            assert!(
5836                !v.contains('{'),
5837                "fallback for {kind:?} must not contain a placeholder token"
5838            );
5839        }
5840    }
5841}