Skip to main content

datasynth_generators/
je_generator.rs

1//! Journal Entry generator with statistical distributions.
2
3use chrono::{Datelike, NaiveDate, Timelike};
4use datasynth_core::utils::seeded_rng;
5use rand::prelude::*;
6use rand_chacha::ChaCha8Rng;
7use rust_decimal::prelude::*;
8use rust_decimal::Decimal;
9use std::sync::{Arc, LazyLock};
10
11use tracing::debug;
12
13use datasynth_config::schema::{
14    AdvancedDistributionConfig, FraudConfig, GeneratorConfig, MixtureDistributionType,
15    TemplateConfig, TemporalPatternsConfig, TransactionConfig,
16};
17use datasynth_core::distributions::{
18    AdvancedAmountSampler, BusinessDayCalculator, CrossDayConfig, DriftAdjustments, DriftConfig,
19    DriftController, EventType, IndustryAmountProfile, IndustryType, LagDistribution,
20    PeriodEndConfig, PeriodEndDynamics, PeriodEndModel, ProcessingLagCalculator,
21    ProcessingLagConfig, *,
22};
23use datasynth_core::models::*;
24use datasynth_core::templates::{
25    descriptions::DescriptionContext, DescriptionGenerator, ReferenceGenerator, ReferenceType,
26};
27use datasynth_core::traits::Generator;
28use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
29use datasynth_core::CountryPack;
30
31use crate::company_selector::WeightedCompanySelector;
32use crate::user_generator::{UserGenerator, UserGeneratorConfig};
33
34use datasynth_core::distributions::text_taxonomy::{PiiPlaceholderKind, PlaceholderResolver};
35
/// T2-D Lever 1: the default generic SAP source-mix, used when industry priors
/// are not loaded but `transactions.synthetic_source_codes` is on (the default).
/// Built once, lazily on first access (it lives behind a [`LazyLock`]).
/// See [`SourceMixPrior::sap_default`] and experiments/ml/FINDINGS.md §6.
static DEFAULT_SOURCE_MIX: LazyLock<
    datasynth_core::distributions::behavioral_priors::SourceMixPrior,
> = LazyLock::new(datasynth_core::distributions::behavioral_priors::SourceMixPrior::sap_default);

/// SOTA-5: default fraction of JEs that are reversals/corrections when
/// `transactions.reversal_rate` is unset. Set to match the corpus reversal
/// proxy (~0.10) — at 0.04 the measured proxy was only ~0.034 (the proxy
/// detects ~85% of reversals), so 0.10 lands the proxy near the corpus.
const DEFAULT_REVERSAL_RATE: f64 = 0.10;

/// SOTA-6: default fraction of JEs that are allocation/assessment batches when
/// `transactions.allocation_batch_rate` is unset. Small (each batch carries
/// ~30-80 lines), so the resulting line-share (~8%) and lines-per-JE tail match
/// the corpus's large-batch postings (FINDINGS §8: AB docs ~52 lines drive the
/// lpje std). `0.0` disables.
const DEFAULT_ALLOCATION_RATE: f64 = 0.008;

/// SOTA-4: foreign document currencies + their company-currency rate (company
/// units per 1 unit of the document currency). Synthetic, plausible values.
///
/// Each entry is `(ISO currency code, rate)`; e.g. `("EUR", 1.09)` reads as
/// "1 EUR is worth 1.09 company-currency units".
const FOREIGN_CCYS: &[(&str, f64)] = &[
    ("EUR", 1.09),
    ("GBP", 1.27),
    ("CHF", 1.12),
    ("CAD", 0.74),
    ("JPY", 0.0068),
    ("AUD", 0.66),
    ("CNY", 0.14),
];

/// SOTA-6: inclusive lower bound for the number of target (cost-center) lines
/// an allocation batch explodes into. Together with
/// [`ALLOCATION_MAX_TARGETS`] the range is centred near the corpus AB mean
/// (~52).
const ALLOCATION_MIN_TARGETS: u32 = 30;
/// SOTA-6: inclusive upper bound for the number of target (cost-center) lines
/// an allocation batch explodes into (see [`ALLOCATION_MIN_TARGETS`]).
const ALLOCATION_MAX_TARGETS: u32 = 80;
70
/// SOTA-2: exponent `s` of the Zipf power-law used for hot-account selection.
/// With s=2.0 the top-10% of accounts in a pool carry ~92-96% of that pool's
/// lines across realistic pool sizes (N≈60-150), which matches the corpus
/// account-activity Pareto (~0.95).
const ZIPF_ALPHA: f64 = 2.0;
/// Upper limit on the pool size covered by the precomputed harmonic table.
/// Pools larger than this (none realistic for a single account-type) fall back
/// to the uniform draw.
const ZIPF_CAP: usize = 16_384;
/// SOTA-2: table of cumulative partial sums, built once on first access:
/// `CUM[0] = 0` and `CUM[k] = Σ_{i=1..k} i^-ZIPF_ALPHA` for `k` in
/// `1..=ZIPF_CAP` (so the table holds `ZIPF_CAP + 1` entries).
///
/// [`JournalEntryGenerator::power_law_index`] uses it to normalise (O(1)
/// lookup of `CUM[n]`) and to inverse-CDF sample (binary search) without
/// recomputing an O(n) sum.
static ZIPF_CUM: LazyLock<Vec<f64>> = LazyLock::new(|| {
    let mut table = Vec::with_capacity(ZIPF_CAP + 1);
    // Sentinel so that `table[k]` is the sum of the first `k` terms.
    table.push(0.0);
    // Running sum in rank order 1, 2, 3, … — the same summation order as a
    // plain accumulating loop, so the floating-point values are identical.
    table.extend((1..=ZIPF_CAP).scan(0.0_f64, |running, rank| {
        *running += 1.0 / (rank as f64).powf(ZIPF_ALPHA);
        Some(*running)
    }));
    table
});
91
/// SP6 — Resolves PII placeholders to concrete values drawn from the run's
/// synthetic master data. `{company}` <- vendor/customer names, `{person}` <-
/// user display names, `{street}` <- addresses (empty pool for now — no
/// address master entity), `{patient}` <- a synthetic-person pool (no master
/// entity exists for patients). Empty pools fall back to obviously-synthetic
/// constants so output never carries an empty span or a literal `{…}` token.
///
/// `Default` yields four empty pools, i.e. every placeholder resolves to its
/// fallback constant until the pools are filled.
#[derive(Debug, Default)]
pub struct MasterDataResolver {
    /// Candidate values for `{company}` (vendor/customer names).
    pub companies: Vec<String>,
    /// Candidate values for `{person}` (user display names).
    pub persons: Vec<String>,
    /// Candidate values for `{street}`; empty for now because there is no
    /// address master entity.
    pub streets: Vec<String>,
    /// Candidate values for `{patient}` (synthetic-person pool).
    pub patients: Vec<String>,
}
105
106impl PlaceholderResolver for MasterDataResolver {
107    fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String {
108        use rand::RngExt;
109        let (pool, fallback): (&Vec<String>, &str) = match kind {
110            PiiPlaceholderKind::Company => (&self.companies, "Synthetic Company AG"),
111            PiiPlaceholderKind::Person => (&self.persons, "Synthetic Person"),
112            PiiPlaceholderKind::Street => (&self.streets, "Synthetic Street 1"),
113            PiiPlaceholderKind::Patient => (&self.patients, "Synthetic Patient"),
114        };
115        if pool.is_empty() {
116            return fallback.to_string();
117        }
118        let idx = rng.random_range(0..pool.len());
119        pool[idx].clone()
120    }
121}
122
/// Build the fixed pool of obviously-synthetic person names used to fill
/// `{patient}` placeholders (there is no patient master entity to draw from).
///
/// `_locale` is accepted as a hint but currently ignored: for SP6 a single
/// neutral name set is sufficient.
///
/// **Shape invariant:** no entry may have the `<initial>. <surname>` or
/// `<surname> <initial>.` shape, because the SP6 `residual_pii_scan` reports
/// those as `initial_surname` / `surname_initial` PII patterns. The smoke test
/// asserts that the canonical `*{patient} G:…` template fills to a scan-clean
/// string, so an entry such as `"B. Muster"` would regress it. Stick to
/// two-word `<First> <Last>` names without periods (covered by
/// `synthetic_patient_pool_entries_pass_residual_scan`).
fn synthetic_patient_pool(_locale: &str) -> Vec<String> {
    const NAMES: [&str; 8] = [
        "Alex Beispiel",
        "Bea Muster",
        "Cleo Synthetic",
        "Demo Example",
        "Erik Probe",
        "Fred Testperson",
        "Gerda Platzhalter",
        "Hans Demo",
    ];

    NAMES.iter().copied().map(String::from).collect()
}
149
/// Generator for realistic journal entries.
///
/// Besides the main `rng`, the generator owns several independent RNG streams
/// (one per optional process — see the individual fields) so that each process
/// can draw random numbers without perturbing the main stream.
pub struct JournalEntryGenerator {
    /// Main RNG stream (`seeded_rng(seed, 0)` in the constructor).
    rng: ChaCha8Rng,
    /// T2-D: independent RNG stream for the default source-mix draw, so
    /// populating `sap_source_code` on the no-priors path never perturbs the
    /// main `rng` — all other fields stay byte-identical to the legacy output.
    source_mix_rng: ChaCha8Rng,
    /// SOTA-1: per-(company, doc-type) library of reusable JE account archetypes
    /// `(debit_accounts, credit_accounts)` for the recurring-templates process.
    /// Capped per key; reused on the no-priors path so standard postings recur.
    recurring_archetypes:
        std::collections::HashMap<(String, String), Vec<(Vec<String>, Vec<String>)>>,
    /// SOTA-1: independent RNG for the template-reuse roll + archetype pick, so
    /// templating never perturbs the main `rng` (amounts/dates/counts unchanged).
    template_rng: ChaCha8Rng,
    /// SOTA-5: ring buffer of recent (complete) JEs a later reversal can offset.
    /// Storing the whole JE lets the reversal inherit its source code, line text,
    /// audit flags, etc. (only dr/cr + the header markers are changed).
    reversal_buffer: Vec<JournalEntry>,
    /// SOTA-5: independent RNG for reversal rolls, so reversals intersperse
    /// without perturbing the main `rng` (normal JEs stay byte-identical).
    reversal_rng: ChaCha8Rng,
    /// SOTA-2: independent RNG for the hot-account power-law override, so the
    /// account-activity Pareto (a few accounts carry most lines, as in the
    /// corpus) is concentrated without perturbing the main `rng` — the uniform
    /// `.choose` draw is still consumed, only its *result* is replaced.
    account_rng: ChaCha8Rng,
    /// SOTA-6: independent RNG for the allocation/assessment-batch process, so
    /// the large 1-to-many postings (the corpus's lines-per-JE tail) intersperse
    /// without perturbing the main `rng` (normal JEs stay byte-identical).
    allocation_rng: ChaCha8Rng,
    /// SOTA-4: independent RNG for the foreign-currency post-process, so the
    /// document-currency tagging never perturbs the main `rng` (company-currency
    /// JEs stay byte-identical).
    fx_rng: ChaCha8Rng,
    /// SOTA-8: independent RNG for the source-conditional Dirichlet account-pair
    /// sampler. Built lazily (one `SourcePool` per observed source); when the
    /// feature is off the sampler stays None and the main RNG / `account_rng`
    /// stream is byte-identical.
    cond_pair_rng: ChaCha8Rng,
    /// SOTA-8: per-source Dirichlet PMFs over per-source account pools.
    /// Lazy-built on first JE whose source isn't yet pooled.
    cond_pair_sampler: Option<
        datasynth_core::distributions::source_conditional_pair::SourceConditionalPairSampler,
    >,
    /// SOTA-8: SAP source code of the JE currently being constructed, so the
    /// `select_*_account` helpers can consult the per-source pool.
    current_je_source: Option<String>,
    /// Base seed given to the constructor; every RNG stream above is created
    /// from it via `seeded_rng(seed, ..)` with a distinct second argument.
    seed: u64,
    /// Transaction configuration; the line-item, amount and temporal samplers
    /// below are built from its sub-configs in the constructor.
    config: TransactionConfig,
    /// Shared chart of accounts.
    coa: Arc<ChartOfAccounts>,
    /// Companies passed to the constructor; `company_selector` starts out
    /// uniform over them.
    companies: Vec<String>,
    // P2 (multi-currency): company_code -> functional (local) currency ISO code.
    // The header document currency defaults to this entity functional currency so
    // the flat export reflects per-entity currency (e.g. EUR for a EUR entity)
    // instead of always the group currency. Empty -> keep the USD default. The
    // SOTA-4 foreign-document override still applies on top.
    company_currencies: std::collections::HashMap<String, String>,
    company_selector: WeightedCompanySelector,
    line_sampler: LineItemSampler,
    amount_sampler: AmountSampler,
    temporal_sampler: TemporalSampler,
    start_date: NaiveDate,
    end_date: NaiveDate,
    // Starts at 0 in the constructor.
    // NOTE(review): presumably the number of entries generated so far — confirm.
    count: u64,
    uuid_factory: DeterministicUuidFactory,
    // Enhanced features
    user_pool: Option<UserPool>,
    description_generator: DescriptionGenerator,
    reference_generator: ReferenceGenerator,
    template_config: TemplateConfig,
    vendor_pool: VendorPool,
    customer_pool: CustomerPool,
    // Material pool for realistic material references
    material_pool: Option<MaterialPool>,
    // Cost-center IDs sourced from the generated cost-centers master so
    // `JE.cost_center` joins back to `cost_centers.id`.  Populated via
    // [`with_cost_center_pool`] from the orchestrator after master-data
    // generation; falls back to the hardcoded `COST_CENTER_POOL` const
    // when empty (configs that skip master-data generation).
    cost_center_pool: Vec<String>,
    // Profit-center IDs sourced from the generated profit-centers master
    // so `JE.profit_center` joins back to `profit_centers.id`.  Same
    // population semantics as `cost_center_pool`.
    profit_center_pool: Vec<String>,
    // Flag indicating whether we're using real master data vs defaults
    using_real_master_data: bool,
    // Fraud generation
    fraud_config: FraudConfig,
    // Persona-based error injection
    persona_errors_enabled: bool,
    // Approval threshold enforcement
    approval_enabled: bool,
    approval_threshold: rust_decimal::Decimal,
    // SOD violation rate for approval tracking (0.0 to 1.0)
    sod_violation_rate: f64,
    // Batching behavior - humans often process similar items together
    batch_state: Option<BatchState>,
    // When false, `maybe_start_batch` never arms a batch (persistent kill-switch
    // for `with_batching(false)`; previously the builder only nulled
    // `batch_state`, which `maybe_start_batch` immediately re-armed).
    batching_enabled: bool,
    // Temporal drift controller for simulating distribution changes over time
    drift_controller: Option<DriftController>,
    // Temporal patterns components
    business_day_calculator: Option<BusinessDayCalculator>,
    processing_lag_calculator: Option<ProcessingLagCalculator>,
    temporal_patterns_config: Option<TemporalPatternsConfig>,
    // Business-process weights for the O2C/P2P/R2R/H2R/A2R volume mix. Must
    // sum to 1.0 (validated by config schema). Default matches the legacy
    // hard-coded 0.35/0.30/0.20/0.10/0.05 distribution.
    business_process_weights: [(BusinessProcess, f64); 5],
    // v3.4.0 advanced distributions (mixture models + industry profiles).
    // None preserves v3.3.2 byte-for-byte behavior; populated only when the
    // caller opts in via [`set_advanced_distributions`].
    advanced_amount_sampler: Option<AdvancedAmountSampler>,
    // v3.5.3+ conditional amount override. Populated by
    // [`set_advanced_distributions`] from the first
    // `config.distributions.conditional` entry where
    // `output_field == "amount"` and `input_field` is accepted by
    // `supported_conditional_input` (initially just "month", "quarter" and
    // "constant"; the accepted set was widened in v4.1.0). Applied *after*
    // the fraud-pattern / advanced-sampler / legacy-sampler cascade on
    // non-fraud entries so it can steer amounts by calendar context
    // without disturbing fraud semantics.
    conditional_amount_override: Option<datasynth_core::distributions::ConditionalSampler>,
    // v3.5.4+ Gaussian copula for amount↔line_count correlation. When
    // populated, each non-fraud JE draws a (u, v) pair; u nudges amount
    // via a `(0.75 + 0.5*u)` multiplier and v biases line_count toward
    // the upper/lower end of its range. Produces observable Spearman
    // correlation without rewiring existing samplers for inverse-CDF.
    correlation_copula: Option<datasynth_core::distributions::BivariateCopulaSampler>,
    /// SP3 — opt-in industry priors. When `Some`, je_generator routes
    /// timing/lines-per-JE/fanout/active-window through prior-driven samplers.
    /// When `None`, behavior is identical to v5.11.
    pub loaded_priors: Option<crate::priors_loader::LoadedPriors>,
    /// SP3 T11 — accumulated IET days per document-type code.  Only used when
    /// `loaded_priors.is_some()`.  Tracks the running day offset so
    /// consecutive calls for the same source produce IET-spaced posting dates.
    iet_day_accum: std::collections::HashMap<String, f64>,
    /// v5.30 B1 Phase 2 — per-source burst-clustering state.  When a sampled IET
    /// falls below `BURST_THRESHOLD_DAYS` and a probability gate fires, the
    /// next 2-4 events for that source are deterministically clustered with
    /// short IETs (0.25-1.5 days), giving the within-source IET sequence the
    /// positive lag-1 autocorrelation the Sajja P1 metric measures.  Bypasses
    /// the `|ρ| < 0.1` coupling gate in `ConditionalIETSampler` that the SP3
    /// priors' weak day-resolution autocorrelation can't clear.
    iet_burst_remaining: std::collections::HashMap<String, u8>,
    /// SP3.12 — last TP value drawn per SAP source code.  Used by the TP motif
    /// sampler to bias the next TP draw toward cluster-mates of the previous TP
    /// on the same source, building triangle structure in the TP co-occurrence graph.
    last_tp_by_source: std::collections::HashMap<String, String>,
    /// SP3.4 — when Some, observes each emitted line and applies calibration
    /// steps to the generator's tunable parameters.
    pub velocity_calibrator: Option<crate::velocity_calibrator::VelocityCalibrator>,
    /// SP6 — PII placeholder resolver populated from the run's synthetic master
    /// data (vendors, customers, users). Rebuilt once via
    /// [`refresh_md_resolver`] before JE generation begins.
    md_resolver: MasterDataResolver,
}
308
/// Default business-process volume mix installed by the constructor:
/// 35% O2C, 30% P2P, 20% R2R, 10% H2R and 5% A2R (the weights sum to 1.0).
const DEFAULT_BUSINESS_PROCESS_WEIGHTS: [(BusinessProcess, f64); 5] = [
    (BusinessProcess::O2C, 0.35),
    (BusinessProcess::P2P, 0.30),
    (BusinessProcess::R2R, 0.20),
    (BusinessProcess::H2R, 0.10),
    (BusinessProcess::A2R, 0.05),
];
316
317/// Map the schema-level [`datasynth_config::schema::IndustryProfileType`]
318/// onto the distributions-layer [`IndustryType`], then return that industry's
319/// pre-configured `sales_amounts` mixture. Used as a fallback when the
320/// caller enables `distributions.amounts` but supplies no components.
321/// Per-entry context channels for conditional-distribution overrides.
322///
323/// v4.1.0+ supported `input_field` values:
324///
325///   - `"month"` — posting-date month (1..=12)
326///   - `"quarter"` — posting-date quarter (1..=4)
327///   - `"year"` — posting-date year (e.g. 2026.0)
328///   - `"day_of_week"` — 1 (Mon) .. 7 (Sun)
329///   - `"day_of_month"` — 1..=31
330///   - `"day_of_year"` — 1..=366
331///   - `"week_of_year"` — 1..=53
332///   - `"is_period_end"` — 1.0 when posting_date is the last business
333///     day of the month, else 0.0
334///   - `"is_quarter_end"` — 1.0 when posting_date is in a quarter-end
335///     month AND is the last business day, else 0.0
336///   - `"is_year_end"` — 1.0 when posting_date is in December AND is
337///     the last business day, else 0.0
338///   - `"constant"` / empty — always 0.0 (treats as unconditional)
339///
340/// Unsupported values cause the conditional rule to be silently ignored
341/// to keep runtime robust against user typos.
342impl JournalEntryGenerator {
343    fn supported_conditional_input(field: &str) -> bool {
344        matches!(
345            field,
346            "month"
347                | "quarter"
348                | "year"
349                | "day_of_week"
350                | "day_of_month"
351                | "day_of_year"
352                | "week_of_year"
353                | "is_period_end"
354                | "is_quarter_end"
355                | "is_year_end"
356                | "constant"
357                | ""
358        )
359    }
360
361    fn conditional_input_value(&self, posting_date: chrono::NaiveDate) -> f64 {
362        let input_field = match self
363            .conditional_amount_override
364            .as_ref()
365            .map(|s| s.config().input_field.as_str())
366        {
367            Some(f) => f,
368            None => return 0.0,
369        };
370
371        let is_last_business_day = |d: chrono::NaiveDate| -> bool {
372            // Last day-of-month → is_period_end. Handles Feb/leap-year
373            // via chrono's num_days_from_ce roundabout; simpler path:
374            // if adding 1 day moves to a different month, this is EOM.
375            let next = d.succ_opt();
376            match next {
377                Some(n) => n.month() != d.month(),
378                None => true,
379            }
380        };
381
382        match input_field {
383            "month" => posting_date.month() as f64,
384            "quarter" => ((posting_date.month() - 1) / 3 + 1) as f64,
385            "year" => posting_date.year() as f64,
386            "day_of_week" => posting_date.weekday().number_from_monday() as f64,
387            "day_of_month" => posting_date.day() as f64,
388            "day_of_year" => posting_date.ordinal() as f64,
389            "week_of_year" => posting_date.iso_week().week() as f64,
390            "is_period_end" => f64::from(u8::from(is_last_business_day(posting_date))),
391            "is_quarter_end" => {
392                let m = posting_date.month();
393                let is_q_month = matches!(m, 3 | 6 | 9 | 12);
394                f64::from(u8::from(is_q_month && is_last_business_day(posting_date)))
395            }
396            "is_year_end" => f64::from(u8::from(
397                posting_date.month() == 12 && is_last_business_day(posting_date),
398            )),
399            _ => 0.0,
400        }
401    }
402}
403
404fn industry_profile_to_log_normal(
405    p: datasynth_config::schema::IndustryProfileType,
406) -> datasynth_core::distributions::LogNormalMixtureConfig {
407    use datasynth_config::schema::IndustryProfileType as P;
408    let industry = match p {
409        P::Retail => IndustryType::Retail,
410        P::Manufacturing => IndustryType::Manufacturing,
411        P::FinancialServices => IndustryType::FinancialServices,
412        P::Healthcare => IndustryType::Healthcare,
413        P::Technology => IndustryType::Technology,
414    };
415    IndustryAmountProfile::for_industry(industry).sales_amounts
416}
417
/// State for tracking batch processing behavior.
///
/// When humans process transactions, they often batch similar items together
/// (e.g., processing all invoices from one vendor, entering similar expenses).
///
/// The `base_*` fields together describe the base entry template to vary.
/// NOTE(review): how each `base_*` value is varied per entry is decided by the
/// batching logic outside this definition — confirm there before relying on it.
#[derive(Clone)]
struct BatchState {
    /// The base entry template to vary
    base_account_number: String,
    // Amount of the base entry.
    base_amount: rust_decimal::Decimal,
    // Business process of the base entry, if it has one.
    base_business_process: Option<BusinessProcess>,
    // Posting date of the base entry.
    base_posting_date: NaiveDate,
    /// Remaining entries in this batch
    remaining: u8,
}
432
433impl JournalEntryGenerator {
434    /// Create a new journal entry generator.
435    pub fn new_with_params(
436        config: TransactionConfig,
437        coa: Arc<ChartOfAccounts>,
438        companies: Vec<String>,
439        start_date: NaiveDate,
440        end_date: NaiveDate,
441        seed: u64,
442    ) -> Self {
443        Self::new_with_full_config(
444            config,
445            coa,
446            companies,
447            start_date,
448            end_date,
449            seed,
450            TemplateConfig::default(),
451            None,
452        )
453    }
454
455    /// Create a new journal entry generator with full configuration.
456    #[allow(clippy::too_many_arguments)]
457    pub fn new_with_full_config(
458        config: TransactionConfig,
459        coa: Arc<ChartOfAccounts>,
460        companies: Vec<String>,
461        start_date: NaiveDate,
462        end_date: NaiveDate,
463        seed: u64,
464        template_config: TemplateConfig,
465        user_pool: Option<UserPool>,
466    ) -> Self {
467        // Initialize user pool if not provided
468        let user_pool = user_pool.or_else(|| {
469            if template_config.names.generate_realistic_names {
470                let user_gen_config = UserGeneratorConfig {
471                    culture_distribution: vec![
472                        (
473                            datasynth_core::templates::NameCulture::WesternUs,
474                            template_config.names.culture_distribution.western_us,
475                        ),
476                        (
477                            datasynth_core::templates::NameCulture::Hispanic,
478                            template_config.names.culture_distribution.hispanic,
479                        ),
480                        (
481                            datasynth_core::templates::NameCulture::German,
482                            template_config.names.culture_distribution.german,
483                        ),
484                        (
485                            datasynth_core::templates::NameCulture::French,
486                            template_config.names.culture_distribution.french,
487                        ),
488                        (
489                            datasynth_core::templates::NameCulture::Chinese,
490                            template_config.names.culture_distribution.chinese,
491                        ),
492                        (
493                            datasynth_core::templates::NameCulture::Japanese,
494                            template_config.names.culture_distribution.japanese,
495                        ),
496                        (
497                            datasynth_core::templates::NameCulture::Indian,
498                            template_config.names.culture_distribution.indian,
499                        ),
500                    ],
501                    email_domain: template_config.names.email_domain.clone(),
502                    generate_realistic_names: true,
503                };
504                let mut user_gen = UserGenerator::with_config(seed + 100, user_gen_config);
505                Some(user_gen.generate_standard(&companies))
506            } else {
507                None
508            }
509        });
510
511        // Initialize reference generator
512        let mut ref_gen = ReferenceGenerator::new(
513            start_date.year(),
514            companies
515                .first()
516                .map(std::string::String::as_str)
517                .unwrap_or("1000"),
518        );
519        ref_gen.set_prefix(
520            ReferenceType::Invoice,
521            &template_config.references.invoice_prefix,
522        );
523        ref_gen.set_prefix(
524            ReferenceType::PurchaseOrder,
525            &template_config.references.po_prefix,
526        );
527        ref_gen.set_prefix(
528            ReferenceType::SalesOrder,
529            &template_config.references.so_prefix,
530        );
531
532        // Create weighted company selector (uniform weights for this constructor)
533        let company_selector = WeightedCompanySelector::uniform(companies.clone());
534
535        Self {
536            rng: seeded_rng(seed, 0),
537            source_mix_rng: seeded_rng(seed, 50_063),
538            recurring_archetypes: std::collections::HashMap::new(),
539            template_rng: seeded_rng(seed, 70_081),
540            reversal_buffer: Vec::new(),
541            reversal_rng: seeded_rng(seed, 90_017),
542            account_rng: seeded_rng(seed, 60_071),
543            allocation_rng: seeded_rng(seed, 80_023),
544            fx_rng: seeded_rng(seed, 70_093),
545            cond_pair_rng: seeded_rng(seed, 110_071),
546            cond_pair_sampler: None,
547            current_je_source: None,
548            seed,
549            config: config.clone(),
550            coa,
551            companies,
552            company_currencies: std::collections::HashMap::new(),
553            company_selector,
554            line_sampler: LineItemSampler::with_config(
555                seed + 1,
556                config.line_item_distribution.clone(),
557                config.even_odd_distribution.clone(),
558                config.debit_credit_distribution.clone(),
559            ),
560            amount_sampler: AmountSampler::with_config(seed + 2, config.amounts.clone()),
561            temporal_sampler: TemporalSampler::with_config(
562                seed + 3,
563                config.seasonality.clone(),
564                WorkingHoursConfig::default(),
565                Vec::new(),
566            ),
567            start_date,
568            end_date,
569            count: 0,
570            uuid_factory: DeterministicUuidFactory::new(seed, GeneratorType::JournalEntry),
571            user_pool,
572            description_generator: DescriptionGenerator::new(),
573            reference_generator: ref_gen,
574            template_config,
575            vendor_pool: VendorPool::standard(),
576            customer_pool: CustomerPool::standard(),
577            material_pool: None,
578            cost_center_pool: Vec::new(),
579            profit_center_pool: Vec::new(),
580            using_real_master_data: false,
581            fraud_config: FraudConfig::default(),
582            persona_errors_enabled: true, // Enable by default for realism
583            approval_enabled: true,       // Enable by default for realism
584            approval_threshold: rust_decimal::Decimal::new(10000, 0), // $10,000 default threshold
585            sod_violation_rate: 0.10,     // 10% default SOD violation rate
586            batch_state: None,
587            batching_enabled: true,
588            drift_controller: None,
589            // Always provide a basic BusinessDayCalculator so that weekend/holiday
590            // filtering is active even when temporal_patterns is not explicitly enabled.
591            business_day_calculator: Some(BusinessDayCalculator::new(HolidayCalendar::new(
592                Region::US,
593                start_date.year(),
594            ))),
595            processing_lag_calculator: None,
596            temporal_patterns_config: None,
597            business_process_weights: DEFAULT_BUSINESS_PROCESS_WEIGHTS,
598            advanced_amount_sampler: None,
599            conditional_amount_override: None,
600            correlation_copula: None,
601            loaded_priors: None,
602            iet_day_accum: std::collections::HashMap::new(),
603            iet_burst_remaining: std::collections::HashMap::new(),
604            last_tp_by_source: std::collections::HashMap::new(),
605            velocity_calibrator: None,
606            md_resolver: MasterDataResolver::default(),
607        }
608    }
609
610    /// Wire v3.4.0 advanced distributions. When the caller's config has
611    /// `distributions.enabled = true` AND `distributions.amounts.enabled =
612    /// true`, the journal-entry generator routes non-fraud amount sampling
613    /// through an [`AdvancedAmountSampler`] (log-normal or Gaussian mixture).
614    ///
615    /// When `distributions.industry_profile` is `Some`, the caller's
616    /// explicitly configured components override nothing — if the component
617    /// list is empty, the industry profile's `sales_amounts` mixture is used
618    /// instead. Explicit components always win.
619    ///
620    /// Returning `Ok(())` with no side effect is intentional for the
621    /// following no-op cases, so callers can unconditionally invoke this:
622    ///   - `config.enabled = false`
623    ///   - `config.amounts.enabled = false`
624    ///   - empty component list with no industry profile
625    ///
626    /// Errors propagate from mixture validation (e.g. weights not summing
627    /// to 1.0, non-positive sigma).
628    pub fn set_advanced_distributions(
629        &mut self,
630        config: &AdvancedDistributionConfig,
631        seed: u64,
632    ) -> Result<(), String> {
633        if !config.enabled {
634            return Ok(());
635        }
636
637        // v3.5.3+: build a conditional-amount override when the config
638        // declares a rule with `output_field == "amount"` and a supported
639        // input field. The override is applied *after* the standard
640        // cascade so it doesn't disturb fraud-path sampling. Unsupported
641        // input fields are ignored with a trace log.
642        self.conditional_amount_override = config
643            .conditional
644            .iter()
645            .find(|c| {
646                c.output_field == "amount" && Self::supported_conditional_input(&c.input_field)
647            })
648            .and_then(|c| {
649                datasynth_core::distributions::ConditionalSampler::new(
650                    seed.wrapping_add(17),
651                    c.to_core_config(),
652                )
653                .ok()
654            });
655
656        // v4.1.0+: all 5 copula types wired (Gaussian / Clayton /
657        // Gumbel / Frank / Student-t). The `BivariateCopulaSampler`
658        // already implements each; v3.5.4 had a filter limiting to
659        // Gaussian only — lifted here now that the smoke test matrix
660        // covers all types.
661        self.correlation_copula = config
662            .correlations
663            .to_core_config_for_pair("amount", "line_count")
664            .and_then(|copula_cfg| {
665                datasynth_core::distributions::BivariateCopulaSampler::new(
666                    seed.wrapping_add(31),
667                    copula_cfg,
668                )
669                .ok()
670            });
671
672        // v3.4.4+: Pareto takes precedence over mixture models when set.
673        // This supports heavy-tailed amount distributions (capex, strategic
674        // contracts, fraud) that log-normal/Gaussian mixtures can't model
675        // as sharply.
676        if let Some(pareto) = &config.pareto {
677            if pareto.enabled {
678                let core_cfg = pareto.to_core_config();
679                self.advanced_amount_sampler =
680                    Some(AdvancedAmountSampler::new_pareto(seed, core_cfg)?);
681                return Ok(());
682            }
683        }
684
685        if !config.amounts.enabled {
686            return Ok(());
687        }
688
689        match config.amounts.distribution_type {
690            MixtureDistributionType::LogNormal => {
691                let lognormal_cfg = config.amounts.to_log_normal_config().or_else(|| {
692                    config
693                        .industry_profile
694                        .as_ref()
695                        .map(|p| industry_profile_to_log_normal(p.profile_type()))
696                });
697                if let Some(cfg) = lognormal_cfg {
698                    self.advanced_amount_sampler =
699                        Some(AdvancedAmountSampler::new_log_normal(seed, cfg)?);
700                }
701            }
702            MixtureDistributionType::Gaussian => {
703                if let Some(cfg) = config.amounts.to_gaussian_config() {
704                    self.advanced_amount_sampler =
705                        Some(AdvancedAmountSampler::new_gaussian(seed, cfg)?);
706                }
707            }
708        }
709
710        Ok(())
711    }
712
713    /// Override the business-process volume mix. Weights map directly to the
714    /// `business_processes.*_weight` YAML config; they do not have to sum to
715    /// exactly 1.0 (they're normalized via `weighted_select`).
716    pub fn set_business_process_weights(
717        &mut self,
718        o2c: f64,
719        p2p: f64,
720        r2r: f64,
721        h2r: f64,
722        a2r: f64,
723    ) {
724        self.business_process_weights = [
725            (BusinessProcess::O2C, o2c),
726            (BusinessProcess::P2P, p2p),
727            (BusinessProcess::R2R, r2r),
728            (BusinessProcess::H2R, h2r),
729            (BusinessProcess::A2R, a2r),
730        ];
731    }
732
733    /// Create from a full GeneratorConfig.
734    ///
735    /// This constructor uses the volume_weight from company configs
736    /// for weighted company selection, and fraud config from GeneratorConfig.
737    pub fn from_generator_config(
738        full_config: &GeneratorConfig,
739        coa: Arc<ChartOfAccounts>,
740        start_date: NaiveDate,
741        end_date: NaiveDate,
742        seed: u64,
743    ) -> Self {
744        let companies: Vec<String> = full_config
745            .companies
746            .iter()
747            .map(|c| c.code.clone())
748            .collect();
749
750        // Create weighted selector using volume_weight from company configs
751        let company_selector = WeightedCompanySelector::from_configs(&full_config.companies);
752
753        let mut generator = Self::new_with_full_config(
754            full_config.transactions.clone(),
755            coa,
756            companies,
757            start_date,
758            end_date,
759            seed,
760            full_config.templates.clone(),
761            None,
762        );
763
764        // Override the uniform selector with weighted selector
765        generator.company_selector = company_selector;
766
767        // Set fraud config
768        generator.fraud_config = full_config.fraud.clone();
769
770        // Configure temporal patterns if enabled
771        let temporal_config = &full_config.temporal_patterns;
772        if temporal_config.enabled {
773            generator = generator.with_temporal_patterns(temporal_config.clone(), seed);
774        }
775
776        generator
777    }
778
779    /// Configure temporal patterns including business day calculations and processing lags.
780    ///
781    /// This enables realistic temporal behavior including:
782    /// - Business day awareness (no postings on weekends/holidays)
783    /// - Processing lag modeling (event-to-posting delays)
784    /// - Period-end dynamics (volume spikes at month/quarter/year end)
785    pub fn with_temporal_patterns(mut self, config: TemporalPatternsConfig, seed: u64) -> Self {
786        // Create business day calculator if enabled
787        if config.business_days.enabled {
788            let region = config
789                .calendars
790                .regions
791                .first()
792                .map(|r| Self::parse_region(r))
793                .unwrap_or(Region::US);
794
795            let calendar = HolidayCalendar::new(region, self.start_date.year());
796            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
797        }
798
799        // Create processing lag calculator if enabled
800        if config.processing_lags.enabled {
801            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
802            self.processing_lag_calculator =
803                Some(ProcessingLagCalculator::with_config(seed, lag_config));
804        }
805
806        // Create period-end dynamics if configured
807        let model = config.period_end.model.as_deref().unwrap_or("flat");
808        if model != "flat"
809            || config
810                .period_end
811                .month_end
812                .as_ref()
813                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
814        {
815            let dynamics = Self::convert_period_end_config(&config.period_end);
816            self.temporal_sampler.set_period_end_dynamics(dynamics);
817        }
818
819        self.temporal_patterns_config = Some(config);
820        self
821    }
822
823    /// Configure temporal patterns using a [`CountryPack`] for the holiday calendar.
824    ///
825    /// This is an alternative to `with_temporal_patterns` that derives the
826    /// holiday calendar from a country-pack definition rather than the built-in
827    /// region-based calendars.  All other temporal behaviour (business-day
828    /// adjustment, processing lags, period-end dynamics) is configured
829    /// identically.
830    pub fn with_country_pack_temporal(
831        mut self,
832        config: TemporalPatternsConfig,
833        seed: u64,
834        pack: &CountryPack,
835    ) -> Self {
836        // Create business day calculator using the country pack calendar
837        if config.business_days.enabled {
838            let calendar = HolidayCalendar::from_country_pack(pack, self.start_date.year());
839            self.business_day_calculator = Some(BusinessDayCalculator::new(calendar));
840        }
841
842        // Create processing lag calculator if enabled
843        if config.processing_lags.enabled {
844            let lag_config = Self::convert_processing_lag_config(&config.processing_lags);
845            self.processing_lag_calculator =
846                Some(ProcessingLagCalculator::with_config(seed, lag_config));
847        }
848
849        // Create period-end dynamics if configured
850        let model = config.period_end.model.as_deref().unwrap_or("flat");
851        if model != "flat"
852            || config
853                .period_end
854                .month_end
855                .as_ref()
856                .is_some_and(|m| m.peak_multiplier.unwrap_or(1.0) != 1.0)
857        {
858            let dynamics = Self::convert_period_end_config(&config.period_end);
859            self.temporal_sampler.set_period_end_dynamics(dynamics);
860        }
861
862        self.temporal_patterns_config = Some(config);
863        self
864    }
865
866    /// Convert schema processing lag config to core config.
867    fn convert_processing_lag_config(
868        schema: &datasynth_config::schema::ProcessingLagSchemaConfig,
869    ) -> ProcessingLagConfig {
870        let mut config = ProcessingLagConfig {
871            enabled: schema.enabled,
872            ..Default::default()
873        };
874
875        // Helper to convert lag schema to distribution
876        let convert_lag = |lag: &datasynth_config::schema::LagDistributionSchemaConfig| {
877            let mut dist = LagDistribution::log_normal(lag.mu, lag.sigma);
878            if let Some(min) = lag.min_hours {
879                dist.min_lag_hours = min;
880            }
881            if let Some(max) = lag.max_hours {
882                dist.max_lag_hours = max;
883            }
884            dist
885        };
886
887        // Apply event-specific lags
888        if let Some(ref lag) = schema.sales_order_lag {
889            config
890                .event_lags
891                .insert(EventType::SalesOrder, convert_lag(lag));
892        }
893        if let Some(ref lag) = schema.purchase_order_lag {
894            config
895                .event_lags
896                .insert(EventType::PurchaseOrder, convert_lag(lag));
897        }
898        if let Some(ref lag) = schema.goods_receipt_lag {
899            config
900                .event_lags
901                .insert(EventType::GoodsReceipt, convert_lag(lag));
902        }
903        if let Some(ref lag) = schema.invoice_receipt_lag {
904            config
905                .event_lags
906                .insert(EventType::InvoiceReceipt, convert_lag(lag));
907        }
908        if let Some(ref lag) = schema.invoice_issue_lag {
909            config
910                .event_lags
911                .insert(EventType::InvoiceIssue, convert_lag(lag));
912        }
913        if let Some(ref lag) = schema.payment_lag {
914            config
915                .event_lags
916                .insert(EventType::Payment, convert_lag(lag));
917        }
918        if let Some(ref lag) = schema.journal_entry_lag {
919            config
920                .event_lags
921                .insert(EventType::JournalEntry, convert_lag(lag));
922        }
923
924        // Apply cross-day posting config
925        if let Some(ref cross_day) = schema.cross_day_posting {
926            config.cross_day = CrossDayConfig {
927                enabled: cross_day.enabled,
928                probability_by_hour: cross_day.probability_by_hour.clone(),
929                ..Default::default()
930            };
931        }
932
933        config
934    }
935
936    /// Convert schema period-end config to core PeriodEndDynamics.
937    fn convert_period_end_config(
938        schema: &datasynth_config::schema::PeriodEndSchemaConfig,
939    ) -> PeriodEndDynamics {
940        let model_type = schema.model.as_deref().unwrap_or("exponential");
941
942        // Helper to convert period config
943        let convert_period =
944            |period: Option<&datasynth_config::schema::PeriodEndModelSchemaConfig>,
945             default_peak: f64|
946             -> PeriodEndConfig {
947                if let Some(p) = period {
948                    let model = match model_type {
949                        "flat" => PeriodEndModel::FlatMultiplier {
950                            multiplier: p.peak_multiplier.unwrap_or(default_peak),
951                        },
952                        "extended_crunch" => PeriodEndModel::ExtendedCrunch {
953                            start_day: p.start_day.unwrap_or(-10),
954                            sustained_high_days: p.sustained_high_days.unwrap_or(3),
955                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
956                            ramp_up_days: 3, // Default ramp-up period
957                        },
958                        _ => PeriodEndModel::ExponentialAcceleration {
959                            start_day: p.start_day.unwrap_or(-10),
960                            base_multiplier: p.base_multiplier.unwrap_or(1.0),
961                            peak_multiplier: p.peak_multiplier.unwrap_or(default_peak),
962                            decay_rate: p.decay_rate.unwrap_or(0.3),
963                        },
964                    };
965                    PeriodEndConfig {
966                        enabled: true,
967                        model,
968                        additional_multiplier: p.additional_multiplier.unwrap_or(1.0),
969                    }
970                } else {
971                    PeriodEndConfig {
972                        enabled: true,
973                        model: PeriodEndModel::ExponentialAcceleration {
974                            start_day: -10,
975                            base_multiplier: 1.0,
976                            peak_multiplier: default_peak,
977                            decay_rate: 0.3,
978                        },
979                        additional_multiplier: 1.0,
980                    }
981                }
982            };
983
984        PeriodEndDynamics::new(
985            convert_period(schema.month_end.as_ref(), 2.0),
986            convert_period(schema.quarter_end.as_ref(), 3.5),
987            convert_period(schema.year_end.as_ref(), 5.0),
988        )
989    }
990
991    /// Parse a region string into a Region enum.
992    fn parse_region(region_str: &str) -> Region {
993        match region_str.to_uppercase().as_str() {
994            "US" => Region::US,
995            "DE" => Region::DE,
996            "GB" => Region::GB,
997            "CN" => Region::CN,
998            "JP" => Region::JP,
999            "IN" => Region::IN,
1000            "BR" => Region::BR,
1001            "MX" => Region::MX,
1002            "AU" => Region::AU,
1003            "SG" => Region::SG,
1004            "KR" => Region::KR,
1005            "FR" => Region::FR,
1006            "IT" => Region::IT,
1007            "ES" => Region::ES,
1008            "CA" => Region::CA,
1009            _ => Region::US,
1010        }
1011    }
1012
1013    /// Set a custom company selector.
1014    pub fn set_company_selector(&mut self, selector: WeightedCompanySelector) {
1015        self.company_selector = selector;
1016    }
1017
1018    /// Get the current company selector.
1019    pub fn company_selector(&self) -> &WeightedCompanySelector {
1020        &self.company_selector
1021    }
1022
1023    /// Set fraud configuration.
1024    pub fn set_fraud_config(&mut self, config: FraudConfig) {
1025        self.fraud_config = config;
1026    }
1027
1028    /// Set vendors from generated master data.
1029    ///
1030    /// This replaces the default vendor pool with actual generated vendors,
1031    /// ensuring JEs reference real master data entities.
1032    pub fn with_vendors(mut self, vendors: &[Vendor]) -> Self {
1033        if !vendors.is_empty() {
1034            self.vendor_pool = VendorPool::from_vendors(vendors.to_vec());
1035            self.using_real_master_data = true;
1036        }
1037        self
1038    }
1039
1040    /// Set customers from generated master data.
1041    ///
1042    /// This replaces the default customer pool with actual generated customers,
1043    /// ensuring JEs reference real master data entities.
1044    pub fn with_customers(mut self, customers: &[Customer]) -> Self {
1045        if !customers.is_empty() {
1046            self.customer_pool = CustomerPool::from_customers(customers.to_vec());
1047            self.using_real_master_data = true;
1048        }
1049        self
1050    }
1051
1052    /// Set materials from generated master data.
1053    ///
1054    /// This provides material references for JEs that involve inventory movements.
1055    pub fn with_materials(mut self, materials: &[Material]) -> Self {
1056        if !materials.is_empty() {
1057            self.material_pool = Some(MaterialPool::from_materials(materials.to_vec()));
1058            self.using_real_master_data = true;
1059        }
1060        self
1061    }
1062
1063    /// Set all master data at once for convenience.
1064    ///
1065    /// This is the recommended way to configure the JE generator with
1066    /// generated master data to ensure data coherence.
1067    pub fn with_master_data(
1068        self,
1069        vendors: &[Vendor],
1070        customers: &[Customer],
1071        materials: &[Material],
1072    ) -> Self {
1073        self.with_vendors(vendors)
1074            .with_customers(customers)
1075            .with_materials(materials)
1076    }
1077
1078    /// SP6 — Build a [`MasterDataResolver`] from the run's master data and
1079    /// store it in `self.md_resolver`. Call once before JE generation begins
1080    /// (the entry method `generate` calls this lazily on the first entry when
1081    /// the resolver pools are empty). Pools are cheap `Vec<String>` snapshots
1082    /// of names already held in the generator's vendor/customer/user pools.
1083    fn refresh_md_resolver(&mut self) {
1084        let companies: Vec<String> = self
1085            .vendor_pool
1086            .vendors
1087            .iter()
1088            .map(|v| v.name.clone())
1089            .chain(self.customer_pool.customers.iter().map(|c| c.name.clone()))
1090            .collect();
1091
1092        let persons: Vec<String> = self
1093            .user_pool
1094            .as_ref()
1095            .map(|p| p.users.iter().map(|u| u.display_name.clone()).collect())
1096            .unwrap_or_default();
1097
1098        let streets: Vec<String> = Vec::new(); // No address master entity in this generator.
1099        let patients = synthetic_patient_pool("de_CH");
1100
1101        self.md_resolver = MasterDataResolver {
1102            companies,
1103            persons,
1104            streets,
1105            patients,
1106        };
1107    }
1108
1109    /// Set the cost-center pool used by line-item enrichment.
1110    ///
1111    /// The orchestrator wires this from the generated cost-centers
1112    /// master so `JE.cost_center` joins back to `cost_centers.id`.
1113    /// When the pool is non-empty `enrich_line_items` picks
1114    /// deterministically from it; the hardcoded fallback
1115    /// `COST_CENTER_POOL` const is only used when the pool is empty
1116    /// (configs that don't generate cost-center master data).
1117    pub fn with_cost_center_pool(mut self, ids: Vec<String>) -> Self {
1118        self.cost_center_pool = ids;
1119        self
1120    }
1121
1122    /// Set the profit-center pool used by line-item enrichment.
1123    ///
1124    /// Same semantics as `with_cost_center_pool` but for the
1125    /// profit-centers master.  Without this, the legacy
1126    /// `PC-{company_code}-{P2P|O2C|R2R|H2R}` derivation is used —
1127    /// which is consistent within a generation run but does not
1128    /// match the format the master data generator emits.
1129    pub fn with_profit_center_pool(mut self, ids: Vec<String>) -> Self {
1130        self.profit_center_pool = ids;
1131        self
1132    }
1133
1134    /// Replace the auto-generated user pool with an externally-built one.
1135    ///
1136    /// The orchestrator builds a [`UserPool`] from the generated
1137    /// employee master ([`UserPool::from_employees`]) and passes it
1138    /// here, so `JE.created_by` joins back to `employees.user_id`.
1139    /// Without this call, `with_country_pack_names` generates its
1140    /// own user pool whose ids are disjoint from the employee
1141    /// master.
1142    pub fn with_user_pool(mut self, pool: UserPool) -> Self {
1143        self.user_pool = Some(pool);
1144        self
1145    }
1146
1147    /// Replace the user pool with one generated from a [`CountryPack`].
1148    ///
1149    /// This is an alternative to the default name-culture distribution that
1150    /// derives name pools and weights from the country-pack's `names` section.
1151    /// The existing user pool (if any) is discarded and regenerated using
1152    /// `MultiCultureNameGenerator::from_country_pack`.
1153    pub fn with_country_pack_names(mut self, pack: &CountryPack) -> Self {
1154        let name_gen =
1155            datasynth_core::templates::MultiCultureNameGenerator::from_country_pack(pack);
1156        let config = UserGeneratorConfig {
1157            // The culture distribution is embedded in the name generator
1158            // itself, so we use an empty list here.
1159            culture_distribution: Vec::new(),
1160            email_domain: name_gen.email_domain().to_string(),
1161            generate_realistic_names: true,
1162        };
1163        let mut user_gen = UserGenerator::with_name_generator(self.seed + 100, config, name_gen);
1164        self.user_pool = Some(user_gen.generate_standard(&self.companies));
1165        self
1166    }
1167
1168    /// Check if the generator is using real master data.
1169    pub fn is_using_real_master_data(&self) -> bool {
1170        self.using_real_master_data
1171    }
1172
1173    /// Determine if this transaction should be fraudulent.
1174    /// Pick a realistic ERP `source_system` provenance code.
1175    ///
1176    /// Returns a string like `"SAP-FI/AP"`, `"manual/adjustment"`,
1177    /// `"Interface/EDI"`. Uses the business process to bias toward
1178    /// process-appropriate sub-modules (e.g. P2P → SAP-MM/IV, O2C →
1179    /// SAP-SD/IV, H2R → SAP-HR/PR). The legacy 7-code shape
1180    /// (`SAP-FI`, `SAP-MM`, etc.) is preserved as a prefix so existing
1181    /// `starts_with` filters keep working.
1182    ///
1183    /// **Manual contract**: when `is_manual` is true the returned value
1184    /// always starts with `"manual"` or `"spreadsheet"`. This is asserted
1185    /// in `test_isa240_audit_flags_populated`.
1186    fn pick_source_system(rng: &mut ChaCha8Rng, is_manual: bool, bp: BusinessProcess) -> String {
1187        if is_manual {
1188            // 8 manual provenance codes — all share a `manual/` or
1189            // `spreadsheet/` prefix.
1190            const MANUAL: &[&str] = &[
1191                "manual/standard",
1192                "manual/adjustment",
1193                "manual/reclassification",
1194                "manual/accrual",
1195                "manual/reversal",
1196                "manual/correction",
1197                "spreadsheet/upload",
1198                "spreadsheet/journal",
1199            ];
1200            let idx = (rng.random::<u32>() as usize) % MANUAL.len();
1201            return MANUAL[idx].to_string();
1202        }
1203
1204        // Process-aware automated provenance. Each process has a small
1205        // primary set; we also mix in cross-process codes ~20% of the
1206        // time so the taxonomy stays diverse without losing coherence.
1207        let primary: &[&str] = match bp {
1208            BusinessProcess::P2P => &[
1209                "SAP-MM/PO",
1210                "SAP-MM/IV",
1211                "SAP-MM/IM",
1212                "SAP-FI/AP",
1213                "Interface/EDI",
1214            ],
1215            BusinessProcess::O2C => &[
1216                "SAP-SD/ORD",
1217                "SAP-SD/DEL",
1218                "SAP-SD/IV",
1219                "SAP-FI/AR",
1220                "Interface/Lockbox",
1221            ],
1222            BusinessProcess::H2R => &["SAP-HR/PR", "SAP-HR/TIME", "Interface/PayRun"],
1223            BusinessProcess::A2R => &["SAP-FI/AA", "SAP-FI/GL"],
1224            BusinessProcess::Treasury => &["Treasury/CM", "Treasury/HD", "Interface/Bank"],
1225            BusinessProcess::Tax => &["Tax/RPT", "SAP-FI/GL"],
1226            BusinessProcess::Mfg => &["SAP-MM/IM", "SAP-FI/GL"],
1227            // R2R, S2C, Bank, Audit, Intercompany, ProjectAccounting, Esg
1228            // → fall through to a generic mix.
1229            _ => &[
1230                "SAP-FI/GL",
1231                "SAP-FI/AP",
1232                "SAP-FI/AR",
1233                "SAP-FI/AA",
1234                "External/SubL",
1235            ],
1236        };
1237
1238        // 80% process-appropriate, 20% cross-process (pulled from a
1239        // generic pool) so the categorical distribution has long tails.
1240        const CROSS: &[&str] = &[
1241            "SAP-FI/GL",
1242            "SAP-FI/AP",
1243            "SAP-FI/AR",
1244            "Interface/EDI",
1245            "Interface/Bank",
1246            "External/SubL",
1247        ];
1248        let pool = if rng.random::<f64>() < 0.80 {
1249            primary
1250        } else {
1251            CROSS
1252        };
1253        let idx = (rng.random::<u32>() as usize) % pool.len();
1254        pool[idx].to_string()
1255    }
1256
1257    /// T2-D Lever 1: choose the `sap_source_code` emitted in the CSV `source`
1258    /// column. Priority: loaded industry priors' `source_mix` (SP3.6) → the
1259    /// default generic SAP doc-type mix when `transactions.synthetic_source_codes`
1260    /// is on (the default) → `None` (legacy: `source` falls back to the coarse
1261    /// `TransactionSource` enum). Closes the source-mix breadth gap by default
1262    /// (entropy ~0.75 → ~2.7; experiments/ml/FINDINGS.md §6).
1263    fn sample_sap_source_code(&mut self) -> Option<String> {
1264        if let Some(p) = self.loaded_priors.as_ref() {
1265            return Some(p.source_mix.sample(&mut self.rng));
1266        }
1267        if self.config.synthetic_source_codes.unwrap_or(true) {
1268            // Independent stream: never perturb the main RNG, so all other
1269            // fields stay byte-identical to the legacy (enum-source) output.
1270            return Some(DEFAULT_SOURCE_MIX.sample(&mut self.source_mix_rng));
1271        }
1272        None
1273    }
1274
1275    /// SOTA-1: on the no-priors path, reuse a cached `(debit, credit)` account
1276    /// archetype matching the line counts for this `(company, doc_type)` with
1277    /// high probability, so standard postings recur (and a hot subset of
1278    /// accounts dominates) instead of every JE drawing fresh uniform accounts.
1279    /// Returns the accounts to use, or `None` to select fresh (then cached).
1280    /// Rolls `template_rng` first so the main RNG (amounts/dates/counts) is
1281    /// never perturbed — only account *choice* changes on reuse.
1282    fn pick_recurring_archetype(
1283        &mut self,
1284        company: &str,
1285        doc_type: &str,
1286        debit_count: usize,
1287        credit_count: usize,
1288    ) -> Option<(Vec<String>, Vec<String>)> {
1289        if !self.config.recurring_templates.unwrap_or(true) {
1290            return None;
1291        }
1292        // Priors carry their own GL-account structure; templating is the no-priors
1293        // default-path realism boost (FINDINGS sec.8) UNLESS the user has explicitly
1294        // set archetype_reuse_probability — in that case SOTA-1 composes with the
1295        // priors path (SOTA-9 #137: lift corpus recurring share toward ~0.97).
1296        let p_reuse_opt = self.config.archetype_reuse_probability;
1297        if p_reuse_opt.is_none() && self.loaded_priors.is_some() {
1298            return None;
1299        }
1300        let p_reuse = p_reuse_opt.unwrap_or(0.90);
1301        if self.template_rng.random::<f64>() >= p_reuse {
1302            return None;
1303        }
1304        let lib = self
1305            .recurring_archetypes
1306            .get(&(company.to_string(), doc_type.to_string()))?;
1307        let matching: Vec<&(Vec<String>, Vec<String>)> = lib
1308            .iter()
1309            .filter(|(d, c)| d.len() == debit_count && c.len() == credit_count)
1310            .collect();
1311        if matching.is_empty() {
1312            return None;
1313        }
1314        // Power-law (Zipf) over the cached archetypes rather than a uniform pick:
1315        // the earlier-cached "standard" posting of each (company, doc-type, shape)
1316        // dominates, so a hot subset of archetypes carries most JEs. Uniform reuse
1317        // kept the per-JE recurring share high but left the archetype head too
1318        // flat (top-50 coverage 0.49 vs corpus 0.65); concentrating the head lifts
1319        // top-50 coverage toward the corpus. Same mechanism as the SOTA-2 account
1320        // Pareto, drawn on the `template_rng` stream.
1321        let idx = Self::power_law_index(matching.len(), &mut self.template_rng).unwrap_or(0);
1322        Some(matching[idx].clone())
1323    }
1324
1325    /// SOTA-1: record a freshly-selected archetype for future reuse, capped per
1326    /// `(company, doc_type)` so the standard-posting library stays small.
1327    fn cache_recurring_archetype(
1328        &mut self,
1329        company: &str,
1330        doc_type: &str,
1331        debit: Vec<String>,
1332        credit: Vec<String>,
1333    ) {
1334        if self.loaded_priors.is_some() || !self.config.recurring_templates.unwrap_or(true) {
1335            return;
1336        }
1337        if debit.is_empty() && credit.is_empty() {
1338            return;
1339        }
1340        const CAP: usize = 24; // distinct archetypes per (company, doc-type) — fewer ⇒ top-50 archetypes cover more JEs (toward corpus top-50 ~0.65)
1341        let lib = self
1342            .recurring_archetypes
1343            .entry((company.to_string(), doc_type.to_string()))
1344            .or_default();
1345        if lib.len() < CAP {
1346            lib.push((debit, credit));
1347        }
1348    }
1349
1350    /// SOTA-5: with probability `transactions.reversal_rate` (default ~10%),
1351    /// build a reversal/correction of a recent JE (swap dr/cr, reference the
1352    /// original) instead of a fresh JE. Uses `reversal_rng` and an id derived
1353    /// from the original, so the main RNG + uuid factory are unperturbed (normal
1354    /// JEs stay byte-identical; reversals are interspersed). Balanced because the
1355    /// original was balanced and we swap each line's debit/credit.
1356    fn maybe_generate_reversal(&mut self) -> Option<JournalEntry> {
1357        let rate = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE);
1358        if rate <= 0.0 || self.reversal_buffer.is_empty() {
1359            return None;
1360        }
1361        if self.reversal_rng.random::<f64>() >= rate {
1362            return None;
1363        }
1364        let pick = (self.reversal_rng.random::<u32>() as usize) % self.reversal_buffer.len();
1365        // Consume the entry so the same original is never reversed twice — that
1366        // would mint the same derived id (`orig ^ salt`) and produce duplicate
1367        // document IDs (regression caught by `test_document_reference_integrity`).
1368        let mut entry = self.reversal_buffer.remove(pick);
1369        let orig_id = entry.header.document_id;
1370        // Reversal posts a few business days after the original.
1371        let offset = 1 + (self.reversal_rng.random::<u32>() % 7) as i64;
1372        let mut rev_date = entry.header.posting_date + chrono::Duration::days(offset);
1373        if let Some(ref calc) = self.business_day_calculator {
1374            if !calc.is_business_day(rev_date) {
1375                rev_date = calc.next_business_day(rev_date, false);
1376            }
1377        }
1378        if rev_date > self.end_date {
1379            rev_date = entry.header.posting_date;
1380        }
1381        // Deterministic id derived from the original (no uuid-factory advance).
1382        let rev_id =
1383            uuid::Uuid::from_u128(orig_id.as_u128() ^ 0x5245_5645_5253_414c_5245_5645_5253_414c);
1384        // Inherit everything from the original (source code, line text, audit
1385        // flags, ...); change only the markers + each line's debit/credit.
1386        entry.header.document_id = rev_id;
1387        entry.header.posting_date = rev_date;
1388        entry.header.document_date = rev_date;
1389        entry.header.fiscal_year = rev_date.year() as u16;
1390        entry.header.fiscal_period = rev_date.month() as u8;
1391        entry.header.header_text = Some(format!("Reversal of {orig_id}"));
1392        entry.header.reference = Some(format!("REV-{orig_id}"));
1393        entry.header.batch_id = None;
1394        for line in entry.lines.iter_mut() {
1395            std::mem::swap(&mut line.debit_amount, &mut line.credit_amount);
1396            line.document_id = rev_id;
1397        }
1398        Some(entry)
1399    }
1400
1401    /// SOTA-5/6: remember a (complete) JE so a later reversal (SOTA-5) or
1402    /// allocation batch (SOTA-6) can reuse it. Populated when either process is
1403    /// enabled, so disabling reversals doesn't starve the allocation batches.
1404    fn record_for_reversal(&mut self, entry: &JournalEntry) {
1405        let reversal_on = self.config.reversal_rate.unwrap_or(DEFAULT_REVERSAL_RATE) > 0.0;
1406        let allocation_on = self
1407            .config
1408            .allocation_batch_rate
1409            .unwrap_or(DEFAULT_ALLOCATION_RATE)
1410            > 0.0;
1411        if (!reversal_on && !allocation_on) || entry.lines.is_empty() {
1412            return;
1413        }
1414        const CAP: usize = 64;
1415        if self.reversal_buffer.len() >= CAP {
1416            self.reversal_buffer.remove(0);
1417        }
1418        self.reversal_buffer.push(entry.clone());
1419    }
1420
1421    /// SOTA-4: with probability `transactions.foreign_currency_rate`, post this JE
1422    /// in a foreign document currency (SAP-style). `debit_amount`/`credit_amount`/
1423    /// `local_amount` stay the company-ledger amount (DMBTR — the trial balance is
1424    /// unaffected); `header.currency`/`header.exchange_rate` + each line's
1425    /// `transaction_amount` (WRBTR) carry the foreign value. Balance holds in both
1426    /// currencies (every line shares one rate). Drawn on `fx_rng` so the main
1427    /// `rng` (and all company-currency JEs) stay byte-identical.
1428    fn maybe_apply_foreign_currency(&mut self, entry: &mut JournalEntry) {
1429        let prob = self.config.foreign_currency_rate.unwrap_or(0.0);
1430        if prob <= 0.0 || self.fx_rng.random::<f64>() >= prob {
1431            return;
1432        }
1433        let (code, rate) = FOREIGN_CCYS[self.fx_rng.random_range(0..FOREIGN_CCYS.len())];
1434        let rate_dec = match Decimal::from_f64_retain(rate) {
1435            Some(r) if r > Decimal::ZERO => r,
1436            _ => return,
1437        };
1438        entry.header.currency = code.to_string();
1439        entry.header.exchange_rate = rate_dec;
1440        for line in entry.lines.iter_mut() {
1441            let ledger = line.debit_amount + line.credit_amount; // one side is zero
1442            line.transaction_amount = Some((ledger / rate_dec).round_dp(2));
1443        }
1444    }
1445
1446    /// SOTA-6: split `total` into `n` positive cent-precise parts summing
1447    /// **exactly** to `total` (so the JE stays balanced), with random weights so
1448    /// the allocation isn't perfectly even. Each part is ≥ 1 cent. Returns a
1449    /// single `[total]` when the amount is too small to split into `n` parts.
1450    fn split_amount(total: Decimal, n: usize, rng: &mut ChaCha8Rng) -> Vec<Decimal> {
1451        let n = n.max(1);
1452        let total_cents = (total.round_dp(2) * Decimal::from(100))
1453            .to_i64()
1454            .unwrap_or(0);
1455        if n == 1 || total_cents < n as i64 {
1456            return vec![total];
1457        }
1458        let weights: Vec<f64> = (0..n).map(|_| 0.5 + rng.random::<f64>()).collect();
1459        let sumw: f64 = weights.iter().sum::<f64>().max(f64::EPSILON);
1460        let spare = total_cents - n as i64; // ≥ 0; each part keeps a 1-cent floor
1461        let mut cents: Vec<i64> = weights
1462            .iter()
1463            .map(|w| 1 + (spare as f64 * w / sumw).floor() as i64)
1464            .collect();
1465        // dump the (small, < n) flooring leftover onto the largest part
1466        let assigned: i64 = cents.iter().sum();
1467        let leftover = total_cents - assigned;
1468        if let Some(maxp) = cents.iter_mut().max_by_key(|c| **c) {
1469            *maxp += leftover;
1470        }
1471        cents.into_iter().map(|c| Decimal::new(c, 2)).collect()
1472    }
1473
1474    /// SOTA-3: deterministic dimension → business-unit roll-up (the dimension is
1475    /// the cost center, or the profit center as fallback). The same dimension
1476    /// value always maps to the same BU code (`BU01`..`BU11`, matching the
1477    /// corpus's ~11 BU codes), so business-unit analytics are internally
1478    /// consistent — not a random per-line label. FNV-1a hash, bucketed.
1479    fn business_unit_for_dimension(dim: &str) -> String {
1480        const N_BU: u32 = 11;
1481        let mut h: u32 = 0x811c_9dc5;
1482        for b in dim.bytes() {
1483            h ^= b as u32;
1484            h = h.wrapping_mul(0x0100_0193);
1485        }
1486        format!("BU{:02}", (h % N_BU) + 1)
1487    }
1488
    /// SOTA-6: with probability `transactions.allocation_batch_rate` (default
    /// ~0.8%), emit an allocation/assessment batch instead of a fresh JE — the
    /// large 1-to-many posting that drives the corpus lines-per-JE tail (AB docs
    /// ~52 lines). Reuses a buffered JE for a valid header (no main-RNG / uuid
    /// advance), then explodes its largest debit line into ~30-80 cost-center-
    /// spread sub-lines summing to the same amount, so balance is preserved and
    /// the cost-center dimension breadth rises. Tagged source `AB`.
    ///
    /// All randomness is drawn from `allocation_rng`.
    ///
    /// Returns `None` when the rate is not positive, the shared buffer is empty,
    /// the roll misses, the picked entry has no positive debit line, or the
    /// amount is too small to split into at least `ALLOCATION_MIN_TARGETS`
    /// parts. In the last two cases the picked entry has already been removed
    /// from the buffer and is not put back.
    fn maybe_generate_allocation_batch(&mut self) -> Option<JournalEntry> {
        let rate = self
            .config
            .allocation_batch_rate
            .unwrap_or(DEFAULT_ALLOCATION_RATE);
        if rate <= 0.0 || self.reversal_buffer.is_empty() {
            return None;
        }
        if self.allocation_rng.random::<f64>() >= rate {
            return None;
        }
        let pick = (self.allocation_rng.random::<u32>() as usize) % self.reversal_buffer.len();
        // Consume the entry (same reason as the reversal path: a reused base
        // would mint a duplicate derived id `base ^ salt`).
        let mut entry = self.reversal_buffer.remove(pick);
        // Explode the largest debit line across cost centers.
        // The `?` exits with `None` when no line has a positive debit amount.
        let idx = entry
            .lines
            .iter()
            .enumerate()
            .filter(|(_, l)| l.debit_amount > Decimal::ZERO)
            .max_by(|a, b| a.1.debit_amount.cmp(&b.1.debit_amount))
            .map(|(i, _)| i)?;
        let template = entry.lines[idx].clone();
        // Number of target sub-lines, drawn inclusively between the two bounds.
        let n = self
            .allocation_rng
            .random_range(ALLOCATION_MIN_TARGETS..=ALLOCATION_MAX_TARGETS) as usize;
        let parts = Self::split_amount(template.debit_amount, n, &mut self.allocation_rng);
        if parts.len() < ALLOCATION_MIN_TARGETS as usize {
            // amount too small to make a meaningful batch — leave it a normal JE
            return None;
        }
        // Valid cost-center candidates for this company (joins back to master).
        // Preference order: pool entries containing `-{company_code}-`, then the
        // whole provided pool, then the hardcoded `COST_CENTER_POOL` when no pool
        // was provided. The result is therefore never empty as long as
        // `COST_CENTER_POOL` is non-empty.
        let company_code = entry.header.company_code.clone();
        let cc_pool: Vec<String> = if self.cost_center_pool.is_empty() {
            Self::COST_CENTER_POOL
                .iter()
                .map(|s| s.to_string())
                .collect()
        } else {
            let needle = format!("-{company_code}-");
            let filtered: Vec<String> = self
                .cost_center_pool
                .iter()
                .filter(|id| id.contains(&needle))
                .cloned()
                .collect();
            if filtered.is_empty() {
                self.cost_center_pool.clone()
            } else {
                filtered
            }
        };
        // Rebuild the line list: the exploded line is replaced in place by its
        // sub-lines, every other line is copied through unchanged.
        let mut new_lines: Vec<JournalEntryLine> =
            Vec::with_capacity(entry.lines.len() + parts.len() - 1);
        for (j, line) in entry.lines.iter().enumerate() {
            if j == idx {
                let bu_on = self.config.business_unit_dimension.unwrap_or(true);
                for (k, part) in parts.iter().enumerate() {
                    // NOTE(review): the clone inherits every other field of the
                    // template line unchanged (only debit/credit, cost center
                    // and business unit are rewritten here) — confirm no other
                    // amount-bearing field (e.g. `transaction_amount`) needs to
                    // be re-derived per part.
                    let mut nl = template.clone();
                    nl.debit_amount = *part;
                    nl.credit_amount = Decimal::ZERO;
                    // Cost centers are assigned round-robin over the pool.
                    nl.cost_center = Some(cc_pool[k % cc_pool.len()].clone());
                    // SOTA-3: keep business_unit coherent with the *new* CC
                    // (the clone carried the template's stale BU).
                    if bu_on {
                        nl.business_unit = nl
                            .cost_center
                            .as_deref()
                            .map(Self::business_unit_for_dimension);
                    }
                    new_lines.push(nl);
                }
            } else {
                new_lines.push(line.clone());
            }
        }
        // Derived id (distinct from the reversal salt); retag as an allocation.
        let base_id = entry.header.document_id;
        let alloc_id =
            uuid::Uuid::from_u128(base_id.as_u128() ^ 0xA110_CA70_A110_CA70_A110_CA70_A110_CA70);
        entry.header.document_id = alloc_id;
        entry.header.sap_source_code = Some("AB".to_string());
        entry.header.header_text = Some("Allocation/assessment cycle".to_string());
        entry.header.reference = Some(format!("ALLOC-{base_id}"));
        entry.header.batch_id = None;
        // Renumber lines from 1 and point every line at the new document id.
        for (i, line) in new_lines.iter_mut().enumerate() {
            line.line_number = (i + 1) as u32;
            line.document_id = alloc_id;
        }
        entry.lines = new_lines.into();
        Some(entry)
    }
1589
1590    fn determine_fraud(&mut self, business_process: BusinessProcess) -> Option<FraudType> {
1591        if !self.fraud_config.enabled {
1592            return None;
1593        }
1594
1595        // v5.30 B3 (#153) — per-process fraud rate override. When
1596        // `fraud.per_process_rates` carries an entry for this JE's business
1597        // process, use that rate instead of the global `fraud_rate`. Unmapped
1598        // processes fall back to the global rate (preserving v5.29 default
1599        // behavior for configs that don't opt in to per-process rates).
1600        //
1601        // The slug uses the YAML wire form (matches `#[serde(rename_all =
1602        // "UPPERCASE")]` plus the per-variant renames on `BusinessProcess`).
1603        let process_slug = match business_process {
1604            BusinessProcess::P2P => "P2P",
1605            BusinessProcess::O2C => "O2C",
1606            BusinessProcess::R2R => "R2R",
1607            BusinessProcess::H2R => "H2R",
1608            BusinessProcess::A2R => "A2R",
1609            BusinessProcess::S2C => "S2C",
1610            BusinessProcess::Mfg => "MFG",
1611            BusinessProcess::Bank => "BANK",
1612            BusinessProcess::Audit => "AUDIT",
1613            BusinessProcess::Treasury => "TREASURY",
1614            BusinessProcess::Tax => "TAX",
1615            BusinessProcess::Intercompany => "INTERCOMPANY",
1616            BusinessProcess::ProjectAccounting => "PROJECT",
1617            BusinessProcess::Esg => "ESG",
1618        };
1619        let effective_rate = self
1620            .fraud_config
1621            .per_process_rates
1622            .get(process_slug)
1623            .copied()
1624            .unwrap_or(self.fraud_config.fraud_rate);
1625
1626        // Roll for fraud based on the (per-process or global) rate
1627        if self.rng.random::<f64>() >= effective_rate {
1628            return None;
1629        }
1630
1631        // Select fraud type based on distribution
1632        Some(self.select_fraud_type())
1633    }
1634
1635    /// Select a fraud type based on the configured distribution.
1636    fn select_fraud_type(&mut self) -> FraudType {
1637        let dist = &self.fraud_config.fraud_type_distribution;
1638        let roll: f64 = self.rng.random();
1639
1640        let mut cumulative = 0.0;
1641
1642        cumulative += dist.suspense_account_abuse;
1643        if roll < cumulative {
1644            return FraudType::SuspenseAccountAbuse;
1645        }
1646
1647        cumulative += dist.fictitious_transaction;
1648        if roll < cumulative {
1649            return FraudType::FictitiousTransaction;
1650        }
1651
1652        cumulative += dist.revenue_manipulation;
1653        if roll < cumulative {
1654            return FraudType::RevenueManipulation;
1655        }
1656
1657        cumulative += dist.expense_capitalization;
1658        if roll < cumulative {
1659            return FraudType::ExpenseCapitalization;
1660        }
1661
1662        cumulative += dist.split_transaction;
1663        if roll < cumulative {
1664            return FraudType::SplitTransaction;
1665        }
1666
1667        cumulative += dist.timing_anomaly;
1668        if roll < cumulative {
1669            return FraudType::TimingAnomaly;
1670        }
1671
1672        cumulative += dist.unauthorized_access;
1673        if roll < cumulative {
1674            return FraudType::UnauthorizedAccess;
1675        }
1676
1677        cumulative += dist.duplicate_payment;
1678        if roll < cumulative {
1679            return FraudType::DuplicatePayment;
1680        }
1681
1682        cumulative += dist.kickback_scheme;
1683        if roll < cumulative {
1684            return FraudType::KickbackScheme;
1685        }
1686
1687        cumulative += dist.round_tripping;
1688        if roll < cumulative {
1689            return FraudType::RoundTripping;
1690        }
1691
1692        cumulative += dist.unauthorized_discount;
1693        if roll < cumulative {
1694            return FraudType::UnauthorizedDiscount;
1695        }
1696
1697        // Fallback when distribution is sub-1.0 (validator allows tolerance)
1698        FraudType::DuplicatePayment
1699    }
1700
1701    /// Map a fraud type to an amount pattern for suspicious amounts.
1702    fn fraud_type_to_amount_pattern(&self, fraud_type: FraudType) -> FraudAmountPattern {
1703        match fraud_type {
1704            FraudType::SplitTransaction | FraudType::JustBelowThreshold => {
1705                FraudAmountPattern::ThresholdAdjacent
1706            }
1707            FraudType::FictitiousTransaction
1708            | FraudType::FictitiousEntry
1709            | FraudType::SuspenseAccountAbuse
1710            | FraudType::RoundDollarManipulation => FraudAmountPattern::ObviousRoundNumbers,
1711            FraudType::RevenueManipulation
1712            | FraudType::ExpenseCapitalization
1713            | FraudType::ImproperCapitalization
1714            | FraudType::ReserveManipulation
1715            | FraudType::UnauthorizedAccess
1716            | FraudType::PrematureRevenue
1717            | FraudType::UnderstatedLiabilities
1718            | FraudType::OverstatedAssets
1719            | FraudType::ChannelStuffing => FraudAmountPattern::StatisticallyImprobable,
1720            FraudType::DuplicatePayment
1721            | FraudType::TimingAnomaly
1722            | FraudType::SelfApproval
1723            | FraudType::ExceededApprovalLimit
1724            | FraudType::SegregationOfDutiesViolation
1725            | FraudType::UnauthorizedApproval
1726            | FraudType::CollusiveApproval
1727            | FraudType::FictitiousVendor
1728            | FraudType::ShellCompanyPayment
1729            | FraudType::Kickback
1730            | FraudType::KickbackScheme
1731            | FraudType::UnauthorizedDiscount
1732            | FraudType::RoundTripping
1733            | FraudType::InvoiceManipulation
1734            | FraudType::AssetMisappropriation
1735            | FraudType::InventoryTheft
1736            | FraudType::GhostEmployee => FraudAmountPattern::Normal,
1737            // Accounting Standards Fraud Types (ASC 606/IFRS 15 - Revenue)
1738            FraudType::ImproperRevenueRecognition
1739            | FraudType::ImproperPoAllocation
1740            | FraudType::VariableConsiderationManipulation
1741            | FraudType::ContractModificationMisstatement => {
1742                FraudAmountPattern::StatisticallyImprobable
1743            }
1744            // Accounting Standards Fraud Types (ASC 842/IFRS 16 - Leases)
1745            FraudType::LeaseClassificationManipulation
1746            | FraudType::OffBalanceSheetLease
1747            | FraudType::LeaseLiabilityUnderstatement
1748            | FraudType::RouAssetMisstatement => FraudAmountPattern::StatisticallyImprobable,
1749            // Accounting Standards Fraud Types (ASC 820/IFRS 13 - Fair Value)
1750            FraudType::FairValueHierarchyManipulation
1751            | FraudType::Level3InputManipulation
1752            | FraudType::ValuationTechniqueManipulation => {
1753                FraudAmountPattern::StatisticallyImprobable
1754            }
1755            // Accounting Standards Fraud Types (ASC 360/IAS 36 - Impairment)
1756            FraudType::DelayedImpairment
1757            | FraudType::ImpairmentTestAvoidance
1758            | FraudType::CashFlowProjectionManipulation
1759            | FraudType::ImproperImpairmentReversal => FraudAmountPattern::StatisticallyImprobable,
1760            // Sourcing/Procurement Fraud
1761            FraudType::BidRigging
1762            | FraudType::PhantomVendorContract
1763            | FraudType::ConflictOfInterestSourcing => FraudAmountPattern::Normal,
1764            FraudType::SplitContractThreshold => FraudAmountPattern::ThresholdAdjacent,
1765            // HR/Payroll Fraud
1766            FraudType::GhostEmployeePayroll
1767            | FraudType::PayrollInflation
1768            | FraudType::DuplicateExpenseReport
1769            | FraudType::FictitiousExpense => FraudAmountPattern::Normal,
1770            FraudType::SplitExpenseToAvoidApproval => FraudAmountPattern::ThresholdAdjacent,
1771            // O2C Fraud
1772            FraudType::RevenueTimingManipulation => FraudAmountPattern::StatisticallyImprobable,
1773            FraudType::QuotePriceOverride => FraudAmountPattern::Normal,
1774        }
1775    }
1776
    /// Generate a deterministic UUID using the factory.
    ///
    /// Thin wrapper that returns whatever `self.uuid_factory.next()` yields.
    /// Every call advances the factory, so paths that must not disturb the id
    /// sequence (e.g. ids derived by XOR-ing an existing document id) should
    /// not go through this method.
    #[inline]
    fn generate_deterministic_uuid(&self) -> uuid::Uuid {
        self.uuid_factory.next()
    }
1782
    /// Cost center pool used for expense account enrichment.
    ///
    /// Hardcoded fallback codes, used only when no master-data-sourced pool
    /// (`self.cost_center_pool`) has been provided. These values are plain
    /// `CCnnnn` codes and do not embed a company code.
    const COST_CENTER_POOL: &'static [&'static str] =
        &["CC1000", "CC2000", "CC3000", "CC4000", "CC5000"];
1786
    /// Enrich journal entry line items with account descriptions, cost centers,
    /// profit centers, value dates, line text, and assignment fields.
    ///
    /// This populates the sparse optional fields that `JournalEntryLine::debit()`
    /// and `::credit()` leave as `None`.
    ///
    /// Every step is guarded by an `is_none()` check, so values already present
    /// on a line are never overwritten. Header fields are only read.
    ///
    /// SP3 T13: changed to `&mut self` so `loaded_priors` fanout samplers
    /// can be driven for CostCenter and ProfitCenter when priors are loaded.
    /// When priors are loaded, sampling for cost/profit centers draws from the
    /// main `rng`.
    fn enrich_line_items(&mut self, entry: &mut JournalEntry) {
        let posting_date = entry.header.posting_date;
        let company_code = &entry.header.company_code;
        let header_text = entry.header.header_text.clone();
        let business_process = entry.header.business_process;
        // SP3 T13 — document-type code used as the entity_id for fanout
        // samplers.  Derived from the header field set during generate().
        let doc_type_key = entry.header.document_type.clone();

        // SP3.7 — capture the SAP source code as an owned Option<String> so it
        // can be passed to `sample_attribute_for_source` as a `&str` inside the
        // line loop without keeping a borrow on `entry`.
        let header_sap_code: Option<String> = entry.header.sap_source_code.clone();

        // SP3.3 — resolve cross-entity motif neighbors once before the line
        // loop.  Owned Vec avoids holding a shared borrow on `self.loaded_priors`
        // across the subsequent `&mut` fanout-sampler calls.
        // Without priors or motifs this degrades to (no neighbors, share
        // probability 0.0).
        let (cc_pc_neighbor_vec, cc_pc_share_prob): (Vec<String>, f64) =
            if let Some(priors) = &self.loaded_priors {
                if let Some(motifs) = &priors.cross_entity_motifs {
                    (
                        motifs.neighbors(&doc_type_key).to_vec(),
                        motifs.should_share(&doc_type_key),
                    )
                } else {
                    (Vec::new(), 0.0)
                }
            } else {
                (Vec::new(), 0.0)
            };

        // Derive a deterministic index from the document_id for cost center selection
        // (a wrapping byte sum; it is also reused for the profit-center pool pick).
        let doc_id_bytes = entry.header.document_id.as_bytes();
        let mut cc_seed: usize = 0;
        for &b in doc_id_bytes {
            cc_seed = cc_seed.wrapping_add(b as usize);
        }

        for (i, line) in entry.lines.iter_mut().enumerate() {
            // 1. account_description: look up from CoA
            if line.account_description.is_none() {
                line.account_description = self
                    .coa
                    .get_account(&line.gl_account)
                    .map(|a| a.short_description.clone());
            }

            // 2. cost_center: assign to expense accounts (5xxx/6xxx)
            //
            // SP3 T13: when priors are loaded, the CostCenter fanout
            // sampler overrides the pool/legacy path.  This block runs
            // before the existing logic; if the sampler fires, `line.cost_center`
            // is set and the legacy block below is skipped via the
            // `line.cost_center.is_none()` guard.
            //
            // When the orchestrator has provided a master-data-sourced
            // pool (`with_cost_center_pool`), pick from it so the value
            // joins back to `cost_centers.id`.  Otherwise fall back to
            // the legacy hardcoded `COST_CENTER_POOL` const.
            //
            // Selection within the pool is filtered to entries that
            // mention the entry's `company_code` (master IDs follow
            // the `CC-{company}-...` convention) so cross-company
            // contamination is avoided; if no pool entry matches the
            // company we fall through to the full pool.
            //
            // Note: the prior-driven path below is not restricted to 5xxx/6xxx
            // accounts; only the legacy/pool path applies that account filter.
            if line.cost_center.is_none() {
                // SP3 T13 — prior-driven CostCenter fanout.
                // SP3.3: prefer neighbor-used buckets when motifs are available.
                // SP3.7: try per-source conditional cost_center first; fall back
                //        to the fanout sampler when the conditional is absent.
                let priors_opt = &mut self.loaded_priors;
                let rng_ref = &mut self.rng;
                if let Some(priors) = priors_opt {
                    let sp37_cc = header_sap_code.as_deref().and_then(|code| {
                        priors.sample_attribute_for_source(code, "cost_center", rng_ref)
                    });
                    if sp37_cc.is_some() {
                        line.cost_center = sp37_cc;
                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("CostCenter") {
                        line.cost_center = Some(sampler.pick_for_with_neighbors(
                            &doc_type_key,
                            &cc_pc_neighbor_vec,
                            cc_pc_share_prob,
                            rng_ref,
                        ));
                    }
                }
            }
            if line.cost_center.is_none() {
                let first_char = line.gl_account.chars().next().unwrap_or('0');
                if first_char == '5' || first_char == '6' {
                    if !self.cost_center_pool.is_empty() {
                        // NOTE(review): `needle` and the filtered candidate list
                        // depend only on the header and the pool, yet are rebuilt
                        // for every qualifying line — candidate for hoisting.
                        let needle = format!("-{company_code}-");
                        let candidates: Vec<&String> = self
                            .cost_center_pool
                            .iter()
                            .filter(|id| id.contains(&needle))
                            .collect();
                        let pool: Vec<&String> = if candidates.is_empty() {
                            self.cost_center_pool.iter().collect()
                        } else {
                            candidates
                        };
                        // Deterministic pick: document-id seed offset by line index.
                        let idx = cc_seed.wrapping_add(i) % pool.len();
                        line.cost_center = Some(pool[idx].clone());
                    } else {
                        let idx = cc_seed.wrapping_add(i) % Self::COST_CENTER_POOL.len();
                        line.cost_center = Some(Self::COST_CENTER_POOL[idx].to_string());
                    }
                }
            }

            // 3. profit_center: assign from master pool when available
            // (`with_profit_center_pool`); otherwise derive from
            // company code + business process (legacy behaviour, which
            // does not match the master-data PC ID format).
            //
            // SP3 T13: prior-driven ProfitCenter fanout override fires first
            // (same pattern as CostCenter above).
            if line.profit_center.is_none() {
                // SP3 T13 — prior-driven ProfitCenter fanout.
                // SP3.3: prefer neighbor-used buckets when motifs are available.
                // SP3.7: try per-source conditional profit_center first; fall back
                //        to the fanout sampler when the conditional is absent.
                let priors_opt = &mut self.loaded_priors;
                let rng_ref = &mut self.rng;
                if let Some(priors) = priors_opt {
                    let sp37_pc = header_sap_code.as_deref().and_then(|code| {
                        priors.sample_attribute_for_source(code, "profit_center", rng_ref)
                    });
                    if sp37_pc.is_some() {
                        line.profit_center = sp37_pc;
                    } else if let Some(sampler) = priors.fanout_samplers.get_mut("ProfitCenter") {
                        line.profit_center = Some(sampler.pick_for_with_neighbors(
                            &doc_type_key,
                            &cc_pc_neighbor_vec,
                            cc_pc_share_prob,
                            rng_ref,
                        ));
                    }
                }
            }
            // Unlike cost centers, the fallback below applies to every line
            // regardless of account range.
            if line.profit_center.is_none() {
                if !self.profit_center_pool.is_empty() {
                    let needle = format!("-{company_code}-");
                    let candidates: Vec<&String> = self
                        .profit_center_pool
                        .iter()
                        .filter(|id| id.contains(&needle))
                        .collect();
                    let pool: Vec<&String> = if candidates.is_empty() {
                        self.profit_center_pool.iter().collect()
                    } else {
                        candidates
                    };
                    let idx = cc_seed.wrapping_add(i) % pool.len();
                    line.profit_center = Some(pool[idx].clone());
                } else {
                    // Legacy format: `PC-{company}` plus a process suffix for
                    // the four processes listed; any other (or no) process gets
                    // no suffix.
                    let suffix = match business_process {
                        Some(BusinessProcess::P2P) => "-P2P",
                        Some(BusinessProcess::O2C) => "-O2C",
                        Some(BusinessProcess::R2R) => "-R2R",
                        Some(BusinessProcess::H2R) => "-H2R",
                        _ => "",
                    };
                    line.profit_center = Some(format!("PC-{company_code}{suffix}"));
                }
            }

            // 3b. business_unit (SOTA-3): a coherent roll-up of the cost center,
            // or the profit center as fallback — the same dimension value always
            // maps to the same BU, so BU-level analytics are consistent. Runs
            // after both CC (step 2) and PC (step 3) are assigned; using CC-or-PC
            // lifts fill toward the corpus (~82%) vs only CC-bearing lines (~24%).
            // Flag-gated by `transactions.business_unit_dimension` (default-on).
            if line.business_unit.is_none() && self.config.business_unit_dimension.unwrap_or(true) {
                if let Some(dim) = line
                    .cost_center
                    .as_deref()
                    .or(line.profit_center.as_deref())
                {
                    line.business_unit = Some(Self::business_unit_for_dimension(dim));
                }
            }

            // 4. trading_partner: SP3.9 — inherit JE-level trading_partner from
            // the header. The header was populated once per JE in generate();
            // all lines share the same value to match corpus SAP semantics.
            // The is_none() guard preserves TP values already set by the P2P/O2C
            // document chain manager (also JE-level, different code path).
            if line.trading_partner.is_none() {
                line.trading_partner = entry.header.trading_partner.clone();
            }

            // 5. line_text: fall back to header_text if not already set
            if line.line_text.is_none() {
                line.line_text = header_text.clone();
            }

            // 6. value_date: set to posting_date for AR/AP accounts
            // (GL accounts starting with "1100" or "2000").
            if line.value_date.is_none()
                && (line.gl_account.starts_with("1100") || line.gl_account.starts_with("2000"))
            {
                line.value_date = Some(posting_date);
            }

            // 7. assignment: set to vendor/customer reference for AP/AR lines
            // The reference is the text after the last " - " in the header
            // text (or the whole text when there is no separator), and is used
            // only when it carries one of the expected prefixes.
            if line.assignment.is_none() {
                if line.gl_account.starts_with("2000") {
                    // AP line - use vendor reference from header
                    if let Some(ref ht) = header_text {
                        // Try to extract vendor ID from header text patterns like "... - V-001"
                        if let Some(vendor_part) = ht.rsplit(" - ").next() {
                            if vendor_part.starts_with("V-")
                                || vendor_part.starts_with("VENDOR")
                                || vendor_part.starts_with("Vendor")
                            {
                                line.assignment = Some(vendor_part.to_string());
                            }
                        }
                    }
                } else if line.gl_account.starts_with("1100") {
                    // AR line - use customer reference from header
                    if let Some(ref ht) = header_text {
                        if let Some(customer_part) = ht.rsplit(" - ").next() {
                            if customer_part.starts_with("C-")
                                || customer_part.starts_with("CUST")
                                || customer_part.starts_with("Customer")
                            {
                                line.assignment = Some(customer_part.to_string());
                            }
                        }
                    }
                }
            }
        }
    }
2032
2033    /// Generate a single journal entry.
2034    pub fn generate(&mut self) -> JournalEntry {
2035        debug!(
2036            count = self.count,
2037            companies = self.companies.len(),
2038            start_date = %self.start_date,
2039            end_date = %self.end_date,
2040            "Generating journal entry"
2041        );
2042
2043        // Check if we're in a batch - if so, generate a batched entry
2044        if let Some(ref state) = self.batch_state {
2045            if state.remaining > 0 {
2046                return self.generate_batched_entry();
2047            }
2048        }
2049
2050        // SOTA-5: with a small probability, emit a reversal/correction of a
2051        // recent JE instead of a fresh one (a process auditors look for).
2052        if let Some(rev) = self.maybe_generate_reversal() {
2053            return rev;
2054        }
2055
2056        // SOTA-6: with a small probability, emit a large allocation/assessment
2057        // batch (the corpus lines-per-JE tail) instead of a fresh JE.
2058        if let Some(alloc) = self.maybe_generate_allocation_batch() {
2059            return alloc;
2060        }
2061
2062        // SP6 — Lazy-init the MD resolver on the first call. Rebuilding once
2063        // per run is sufficient; pools are stable after master-data generation.
2064        if self.md_resolver.companies.is_empty()
2065            && self.md_resolver.persons.is_empty()
2066            && self.md_resolver.patients.is_empty()
2067        {
2068            self.refresh_md_resolver();
2069        }
2070
2071        self.count += 1;
2072
2073        // Generate deterministic document ID
2074        let document_id = self.generate_deterministic_uuid();
2075
2076        // SP3.5c — Lazy temporal-sampler date draw.
2077        //
2078        // When priors are loaded the IET path (SP3 T11) will immediately replace
2079        // this value, so drawing from the temporal sampler here wastes one RNG
2080        // advance on the sampler's internal stream AND makes the temporal-sampler
2081        // variance contribute to the merged date sequence even though the IET
2082        // sampler is meant to dominate.
2083        //
2084        // Fix: only draw from the temporal sampler now when no priors are loaded.
2085        // The IET block sets `posting_date` unconditionally when priors are Some;
2086        // the active-window fallback (SP3 T14) has its own sample_date call and is
2087        // unaffected by this change.
2088        //
2089        // Priors-absent path: byte-identical to v5.13 — the draw and business-day
2090        // snap are performed exactly as before.
2091        let mut posting_date = if self.loaded_priors.is_none() {
2092            let mut d = self
2093                .temporal_sampler
2094                .sample_date(self.start_date, self.end_date);
2095            // Adjust posting date to be a business day if business day calculator is configured
2096            if let Some(ref calc) = self.business_day_calculator {
2097                if !calc.is_business_day(d) {
2098                    // P3a: distribute weekend dates instead of always snapping to
2099                    // the *next* business day, which piled Sat+Sun onto Monday
2100                    // (a ~26%/16% Mon/Fri skew vs the corpus's flat Mon–Fri).
2101                    // Saturday -> preceding Friday; Sunday/holidays -> next
2102                    // business day. Deterministic (no RNG draw), so other values
2103                    // are unaffected.
2104                    use chrono::Datelike;
2105                    d = if d.weekday() == chrono::Weekday::Sat {
2106                        calc.prev_business_day(d, false)
2107                    } else {
2108                        calc.next_business_day(d, false)
2109                    };
2110                    if d > self.end_date {
2111                        d = calc.prev_business_day(self.end_date, true);
2112                    } else if d < self.start_date {
2113                        d = calc.next_business_day(self.start_date, true);
2114                    }
2115                }
2116            }
2117            d
2118        } else {
2119            // Priors-loaded path: IET block (below) will set the real date.
2120            // Use start_date as a zero-cost placeholder — it is always overwritten.
2121            self.start_date
2122        };
2123
2124        // Select company using weighted selector
2125        let company_code = self.company_selector.select(&mut self.rng).to_string();
2126
2127        // v4.1.0+: draw a single (u, v) pair from the copula — cached for
2128        // both the amount adjustment (u) and the line-count shift (v).
2129        // None when no copula is configured.
2130        let copula_uv: Option<(f64, f64)> =
2131            self.correlation_copula.as_mut().map(|cop| cop.sample());
2132
2133        // Sample line item specification. When a copula is configured,
2134        // v drives line-count via a quantile-preserving map: integer
2135        // count `2 + floor(v * 10)` gives range [2, 11] evenly spaced
2136        // in v, so rank(v) == rank(line_count).
2137        //
2138        // v4.1.6+: upgraded from the v3.5.4 nudge (shift around
2139        // independently-drawn count) to true rank-preserving quantile
2140        // inversion, so empirical Kendall-τ now matches copula theory.
2141        let mut line_spec = self.line_sampler.sample();
2142        if let Some((_u, v)) = copula_uv {
2143            let new_total = 2 + ((v * 10.0).floor() as usize).min(9);
2144            let old_debit = line_spec.debit_count.max(1);
2145            let old_credit = line_spec.credit_count.max(1);
2146            let new_debit = (new_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2147                .round() as usize;
2148            let new_debit = new_debit.clamp(1, new_total - 1);
2149            let new_credit = new_total - new_debit;
2150            line_spec.total_count = new_total;
2151            line_spec.debit_count = new_debit;
2152            line_spec.credit_count = new_credit;
2153        }
2154
2155        // SOTA-10 (#138): optional hard cap on total lines per JE — tames the
2156        // monster outliers (synth max 2133 vs corpus 924). Scales debit + credit
2157        // proportionally so balance is preserved.
2158        if let Some(cap) = self.config.lines_per_je_cap {
2159            let cap = cap.max(2);
2160            let total = line_spec.debit_count + line_spec.credit_count;
2161            if total > cap {
2162                let new_debit =
2163                    ((line_spec.debit_count as f64 / total as f64) * cap as f64).round() as usize;
2164                let new_debit = new_debit.clamp(1, cap - 1);
2165                let new_credit = cap - new_debit;
2166                line_spec.total_count = cap;
2167                line_spec.debit_count = new_debit;
2168                line_spec.credit_count = new_credit;
2169            }
2170        }
2171
2172        // Determine source type using full 4-way distribution
2173        let source = self.select_source();
2174        let is_automated = matches!(
2175            source,
2176            TransactionSource::Automated | TransactionSource::Recurring
2177        );
2178
2179        // SP3.6 — when priors are loaded, sample a canonical SAP source code
2180        // from the bundle's source-mix distribution.  This is independent of
2181        // the `TransactionSource` enum (which controls manual/automated semantics)
2182        // and is written to `header.sap_source_code`, then emitted in the CSV
2183        // `source` column in place of the generic label.
2184        let sap_source_code: Option<String> = self.sample_sap_source_code();
2185        // SOTA-8: stash the current JE's SAP source so select_*_account can consult
2186        // the per-source Dirichlet pool. Cleared at the end of this generate() call.
2187        self.current_je_source = sap_source_code.clone();
2188
2189        // Select business process
2190        let business_process = self.select_business_process();
2191
2192        // SP3 T11 — IET-driven posting-date override.
2193        //
2194        // When priors are loaded, replace the uniform temporal-sampler date
2195        // with one derived from the per-Source inter-event-time prior.  We
2196        // accumulate IET samples (in fractional days) per source code and
2197        // map the accumulated offset onto [start_date, end_date].
2198        //
2199        // v5.30 B1 (#152): route through `sap_source_code` (the actual emitted
2200        // source) rather than `doc_type` (only 5 values: KR/DR/SA/HR/AA from
2201        // document_type_for_process). Before B1, `sample_next(&doc_type, …)`
2202        // hit the IET sampler with only 5 distinct keys for all 526 emitted
2203        // sources, leaving the per-source lag-1 autocorr machinery in
2204        // ConditionalIETSampler **unwired** for 521 of the sources. The
2205        // Sajja P1 autocorr DR of 105.9× (worst sub-metric on the A1 eval)
2206        // is the direct downstream consequence. Switching to source-keyed
2207        // sampling actually exercises the per-source priors.
2208        //
2209        // The None path is untouched: `posting_date` from the temporal sampler
2210        // above is used as-is.
2211        {
2212            // Split-borrow: four distinct struct fields accessed simultaneously.
2213            let priors_opt = &mut self.loaded_priors;
2214            let rng_ref = &mut self.rng;
2215            let iet_accum_ref = &mut self.iet_day_accum;
2216            let burst_ref = &mut self.iet_burst_remaining;
2217            if let Some(priors) = priors_opt {
2218                // Prefer the per-row SAP source code (populated when priors
2219                // load via SP3.6's source-mix sampler). Fall back to doc_type
2220                // for the rare branch where source-code sampling returned None.
2221                let iet_key = sap_source_code
2222                    .as_deref()
2223                    .unwrap_or_else(|| Self::document_type_for_process(business_process))
2224                    .to_string();
2225                let period_days = (self.end_date - self.start_date).num_days().max(1) as f64;
2226
2227                // v5.30 B1 Phase 2 — burst clustering.
2228                //
2229                // The lag-1 Gaussian-copula path in ConditionalIETSampler
2230                // (conditional_iet.rs:176-203) silently falls back to
2231                // independent sampling whenever the per-source |ρ| < 0.1.
2232                // The bundled SP3 priors' per-source lag1_autocorr values are
2233                // mostly below that threshold (corpus has only weak
2234                // day-resolution autocorrelation), so the coupling never
2235                // fires and the within-source IET autocorr matches the
2236                // noise floor — the Sajja P1 autocorr 105.9× DR before A3,
2237                // 62.84× after A3, with B1 Phase 1 (source-keying) producing
2238                // no measurable lift.
2239                //
2240                // This block bypasses the |ρ| < 0.1 gate by forcing short-IET
2241                // bursts for each source.  When a sampled IET is short
2242                // (< BURST_THRESHOLD_DAYS) and a probability gate fires
2243                // (BURST_PROB), the next BURST_LEN_MIN..=BURST_LEN_MAX events
2244                // for that source emit IETs drawn from [0.25, 1.5] days
2245                // regardless of what the sampler returns.
2246                //
2247                // Effect on within-source IET autocorrelation: events 1..k
2248                // of a burst have tightly-clustered IETs around 0.85 days
2249                // mean → lag-1 autocorr lifts directly. Inter-burst IETs
2250                // are still sampled normally so the macro distribution
2251                // stays close to the prior.
2252                const BURST_THRESHOLD_DAYS: f64 = 2.0;
2253                const BURST_PROB: f64 = 0.30;
2254                const BURST_LEN_MIN: u8 = 2;
2255                const BURST_LEN_MAX: u8 = 4;
2256
2257                let sampled_iet = priors.iet_sampler.sample_next(&iet_key, rng_ref).max(0.001);
2258
2259                // Check if we're inside an active burst for this source.
2260                let remaining = burst_ref.get(&iet_key).copied().unwrap_or(0);
2261                let iet = if remaining > 0 {
2262                    // Active burst: emit a short IET regardless of sampler.
2263                    burst_ref.insert(iet_key.clone(), remaining - 1);
2264                    rng_ref.random_range(0.25..=1.5)
2265                } else if sampled_iet < BURST_THRESHOLD_DAYS
2266                    && rng_ref.random_range(0.0..1.0) < BURST_PROB
2267                {
2268                    // Start a new burst: this event uses the sampled IET,
2269                    // and the next BURST_LEN events for this source will
2270                    // emit short IETs.
2271                    let len = rng_ref.random_range(BURST_LEN_MIN..=BURST_LEN_MAX);
2272                    burst_ref.insert(iet_key.clone(), len);
2273                    sampled_iet
2274                } else {
2275                    sampled_iet
2276                };
2277
2278                let accum = iet_accum_ref.entry(iet_key).or_insert(0.0);
2279                *accum += iet;
2280                // Wrap within period so we never exceed the generation window.
2281                if *accum >= period_days {
2282                    *accum %= period_days;
2283                }
2284                let day_offset =
2285                    (*accum as i64).clamp(0, (self.end_date - self.start_date).num_days());
2286                posting_date = self.start_date + chrono::Duration::days(day_offset);
2287                // Re-apply business-day snap so the IET date lands on a working
2288                // day. NOTE: always rolls forward (no Sat -> Fri rule as above).
2289                if let Some(ref calc) = self.business_day_calculator {
2290                    if !calc.is_business_day(posting_date) {
2291                        posting_date = calc.next_business_day(posting_date, false);
2292                        if posting_date > self.end_date {
2293                            posting_date = calc.prev_business_day(self.end_date, true);
2294                        }
2295                    }
2296                }
2297            } // end if let Some(priors)
2298        } // end split-borrow scope
2299
2300        // SP3 T14 — active-window gating.
2301        //
2302        // After the IET-driven date is computed, check whether this Source is
2303        // still in its active window for the resulting day.  If the prior says
2304        // the Source has "gone quiet" (e.g. a vendor that stopped trading), we
2305        // fall back to the temporal-sampler date so the JE still emits but is
2306        // no longer anchored to the IET timeline for this source.
2307        //
2308        // In a day-loop architecture this would be a `continue`; here, the
2309        // equivalent is to replace `posting_date` with a fresh temporal-
2310        // sampler draw so downstream logic sees a plausible date.
2311        //
2312        // The None path is untouched.
2313        if let Some(ref priors) = self.loaded_priors {
2314            let doc_type = Self::document_type_for_process(business_process);
2315            let day_in_period = (posting_date - self.start_date).num_days();
2316            let active = match &priors.multi_segment_window {
2317                Some(msw) => msw.is_active(doc_type, day_in_period),
2318                None => priors.active_window.is_active(doc_type, day_in_period),
2319            };
2320            if !active {
2321                // Source is outside its active window: fall back to a fresh
2322                // temporal-sampler draw.  (SP3.5c: the up-front temporal draw
2323                // is skipped when priors are loaded, so we always re-sample
2324                // here in the fallback path rather than reusing a cached value.)
2325                posting_date = self
2326                    .temporal_sampler
2327                    .sample_date(self.start_date, self.end_date);
2328                if let Some(ref calc) = self.business_day_calculator {
2329                    if !calc.is_business_day(posting_date) {
2330                        posting_date = calc.next_business_day(posting_date, false);
2331                        if posting_date > self.end_date {
2332                            posting_date = calc.prev_business_day(self.end_date, true);
2333                        }
2334                    }
2335                }
2336            }
2337        }
2338
2339        // SP3 T12 — lines-per-JE override from prior histogram.
2340        //
2341        // When priors are loaded, replace `line_spec` totals with a sample
2342        // drawn from the Source-conditional histogram (falling back to the
2343        // overall histogram when the document-type is unknown).  `.max(2)`
2344        // guarantees every JE has at least one debit + one credit line.
2345        // The None path leaves `line_spec` from the copula / line-sampler
2346        // cascade above completely unchanged.
2347        if let Some(ref priors) = self.loaded_priors {
2348            let doc_type = Self::document_type_for_process(business_process);
2349            let hist = priors
2350                .lines_per_je
2351                .by_source
2352                .get(doc_type)
2353                .unwrap_or(&priors.lines_per_je.overall);
2354            let n_total = (hist.sample_bucket(&mut self.rng) as usize).max(2);
2355            let old_debit = line_spec.debit_count.max(1);
2356            let old_credit = line_spec.credit_count.max(1);
2357            let new_debit = (n_total as f64 * old_debit as f64 / (old_debit + old_credit) as f64)
2358                .round() as usize;
2359            let new_debit = new_debit.clamp(1, n_total - 1);
2360            line_spec.total_count = n_total;
2361            line_spec.debit_count = new_debit;
2362            line_spec.credit_count = n_total - new_debit;
2363        }
2364
2365        // Determine if this is a fraudulent transaction (v5.30 B3 — per-process
2366        // rates pass `business_process` through to honor fraud.per_process_rates
2367        // overrides when configured)
2368        let fraud_type = self.determine_fraud(business_process);
2369        let is_fraud = fraud_type.is_some();
2370
2371        // Sample time based on source
2372        let time = self.temporal_sampler.sample_time(!is_automated);
2373        let created_at = posting_date.and_time(time).and_utc();
2374
2375        // Select user from pool or generate generic
2376        let (created_by, user_persona) = self.select_user(is_automated);
2377
2378        // Create header with deterministic UUID
2379        let mut header =
2380            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
2381        header.created_at = created_at;
2382        header.source = source;
2383        header.sap_source_code = sap_source_code;
2384
2385        // SP3.9/SP3.12 — JE-level trading partner (one TP per document, motif-
2386        // biased toward cluster-mates of the previous TP on the same source).
2387        // Shared with the batched path via the helper (#43).
2388        self.apply_trading_partner_motif(&mut header);
2389
2390        // SP4.5 — user-persona prior: when a corpus prior with user data is
2391        // loaded, override `created_by` with a user characteristic of the drawn
2392        // source, and bias `created_at` hour-of-day from the user's density.
2393        // Falls back transparently to `created_by` / `created_at` already set above.
2394        let (created_by, created_at) = {
2395            let sap_code_for_user = header.sap_source_code.clone();
2396            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
2397                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
2398                    let new_created_at = if let Some((hour, _)) =
2399                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
2400                    {
2401                        let base = header.created_at;
2402                        base.date_naive()
2403                            .and_hms_opt(hour, 0, 0)
2404                            .map(|naive| naive.and_utc())
2405                            .unwrap_or(base)
2406                    } else {
2407                        header.created_at
2408                    };
2409                    (uid, new_created_at)
2410                } else {
2411                    (created_by, header.created_at)
2412                }
2413            } else {
2414                (created_by, header.created_at)
2415            }
2416        };
2417
2418        header.created_by = created_by;
2419        header.created_at = created_at;
2420        header.user_persona = user_persona;
2421        header.business_process = Some(business_process);
2422        header.document_type = Self::document_type_for_process(business_process).to_string();
2423        header.is_fraud = is_fraud;
2424        header.fraud_type = fraud_type;
2425
2426        // --- ISA 240 audit flags ---
2427        let is_manual = matches!(source, TransactionSource::Manual);
2428        header.is_manual = is_manual;
2429
2430        // Determine source_system based on manual vs automated.
2431        //
2432        // Real ERPs typically expose 20+ distinct provenance codes per
2433        // company (one per module + sub-module + interface). The taxonomy
2434        // below is a strict superset of the legacy {manual, spreadsheet,
2435        // SAP-FI, SAP-MM, SAP-SD, interface, SAP-HR} codes so downstream
2436        // consumers that filter by prefix (e.g. `starts_with("SAP-")`)
2437        // continue to work.
2438        //
2439        // Contract preserved by the generator-level audit assertion in
2440        // `test_isa240_audit_flags_populated`:
2441        //   - manual entries → starts_with("manual") || starts_with("spreadsheet")
2442        //   - automated entries → does NOT start with "manual"/"spreadsheet"
2443        header.source_system = Self::pick_source_system(&mut self.rng, is_manual, business_process);
2444
2445        // is_post_close: entry is in the last month of the configured period
2446        // and the posting date falls after the 25th (simulating close cutoff)
2447        let is_post_close = posting_date.month() == self.end_date.month()
2448            && posting_date.year() == self.end_date.year()
2449            && posting_date.day() > 25;
2450        header.is_post_close = is_post_close;
2451
2452        // created_date: for manual entries, same day as posting; for automated,
2453        // 0-3 days before posting_date
2454        let created_date = if is_manual {
2455            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second())
2456        } else {
2457            let lag_days = self.rng.random_range(0i64..=3);
2458            let created_naive_date = posting_date
2459                .checked_sub_signed(chrono::Duration::days(lag_days))
2460                .unwrap_or(posting_date);
2461            created_naive_date.and_hms_opt(
2462                self.rng.random_range(8u32..=17),
2463                self.rng.random_range(0u32..=59),
2464                self.rng.random_range(0u32..=59),
2465            )
2466        };
2467        header.created_date = created_date;
2468
2469        // Generate description context (period + vendor/customer by process).
2470        // Shared with the batched path via the helper (#43).
2471        let context = self.build_description_context(business_process, posting_date);
2472
2473        // Header text + reference + derived source_document, per template config
2474        // and loaded priors. Shared with the batched path via the helper (#43).
2475        self.apply_header_text_and_reference(&mut header, business_process, &context, posting_date);
2476
2477        // Generate line items
2478        let mut entry = JournalEntry::new(header);
2479
2480        // Generate amount - use fraud pattern if this is a fraudulent transaction.
2481        // Non-fraud path prefers the v3.4.0 advanced sampler when configured; fraud
2482        // patterns always use the legacy sampler because they target specific
2483        // thresholds (round numbers, just-under-approval amounts) that are
2484        // orthogonal to mixture models.
2485        let base_amount = if let Some(ft) = fraud_type {
2486            let pattern = self.fraud_type_to_amount_pattern(ft);
2487            self.amount_sampler.sample_fraud(pattern)
2488        } else if let Some(ref mut adv) = self.advanced_amount_sampler {
2489            adv.sample_decimal()
2490        } else {
2491            self.amount_sampler.sample()
2492        };
2493        // v3.5.3+: if a conditional-amount override is configured and
2494        // the JE is non-fraud, re-sample the amount from the conditional
2495        // distribution using the computed context. Fraud entries bypass
2496        // this path to preserve fraud-pattern semantics (as with the
2497        // advanced sampler cascade above).
2498        let base_amount = if fraud_type.is_none() {
2499            // Compute input context BEFORE taking &mut on the sampler
2500            // to avoid borrow-checker conflict with the immutable
2501            // `conditional_input_value` call.
2502            let input = self.conditional_input_value(posting_date);
2503            if let Some(ref mut cond) = self.conditional_amount_override {
2504                cond.sample_decimal(input)
2505            } else {
2506                base_amount
2507            }
2508        } else {
2509            base_amount
2510        };
2511
2512        // SP4.3 — when priors are loaded, try to replace the base_amount with
2513        // a draw from the per-source log-normal conditional.  This step only
2514        // fires for non-fraud JEs (fraud entries must preserve fraud-pattern
2515        // semantics).  We use the source-marginal (gl_prefix = "") as the
2516        // initial lookup; per-class refinement requires knowing the GL account
2517        // which is sampled after the amount in some paths, so we defer that
2518        // to a follow-up sprint.  Balance preservation is maintained because
2519        // the splitter below uses `total_amount` unchanged.
2520        //
2521        // W7.M — autocorr mitigation: ~30 % of priors-enabled draws bypass the
2522        // per-source conditional and draw from the global marginal sampler.
2523        // This loosens the per-source amount-sequence correlation that SP4.3's
2524        // conditional was over-tightening (v5.23 baseline: Source P1 Autocorr
2525        // +750 %, TP P1 Autocorr +101 %).  Proven pattern from SP3.12 W2
2526        // TP-clustering mitigation.
2527        //
2528        // Split-borrow: `loaded_priors` and `rng` are distinct struct fields so
2529        // the compiler allows simultaneous mutable borrows.
2530        // SP5.3 — intermediate tune from 0.20 to 0.25 between v5.24 (0.30 →
2531        // autocorr 1.53, over-corrected) and v5.25 (0.20 → autocorr 3.74,
2532        // under-corrected). Targets the trade-off sweet spot.
2533        const PRIORS_AMOUNT_BYPASS_SHARE: f64 = 0.25;
2534        let base_amount = if fraud_type.is_none() {
2535            if let Some(src) = entry.header.sap_source_code.as_deref() {
2536                let src_owned = src.to_string();
2537                // Gate: skip the conditional ~25 % of the time to loosen
2538                // per-source amount sequence correlation without overshooting.
2539                let use_conditional = self.loaded_priors.is_some()
2540                    && self.rng.random_range(0.0..1.0) >= PRIORS_AMOUNT_BYPASS_SHARE;
2541                if use_conditional {
2542                    let priors_ref = &mut self.loaded_priors;
2543                    let rng_ref = &mut self.rng;
2544                    if let Some(priors) = priors_ref {
2545                        priors
2546                            .sample_amount_for_source(&src_owned, "", rng_ref)
2547                            .and_then(|v| {
2548                                if v.is_finite() && v > 0.0 {
2549                                    Decimal::from_f64_retain(v)
2550                                } else {
2551                                    None
2552                                }
2553                            })
2554                            .unwrap_or(base_amount)
2555                    } else {
2556                        base_amount
2557                    }
2558                } else {
2559                    base_amount
2560                }
2561            } else {
2562                base_amount
2563            }
2564        } else {
2565            base_amount
2566        };
2567
2568        // v4.1.6+: if a copula is configured AND an advanced amount
2569        // sampler with a ppf is available, use true rank-preserving
2570        // inverse-CDF sampling — amount is drawn DIRECTLY from the
2571        // sampler's quantile at `u`, replacing (not nudging) the
2572        // independently-drawn base_amount. This makes empirical
2573        // Kendall-τ match the copula's theoretical τ.
2574        //
2575        // Fallback for copula-without-advanced-sampler: keep the
2576        // v4.1.0 log-scale multiplier nudge (observable correlation,
2577        // diluted magnitude).
2578        let base_amount = if fraud_type.is_none() {
2579            if let Some((u, _v)) = copula_uv {
2580                if let Some(ref adv) = self.advanced_amount_sampler {
2581                    adv.ppf_decimal(u)
2582                } else {
2583                    let log_mult = 4.0 * (u - 0.5);
2584                    let adjusted = base_amount.to_f64().unwrap_or(1.0) * log_mult.exp();
2585                    Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2586                }
2587            } else {
2588                base_amount
2589            }
2590        } else {
2591            base_amount
2592        };
2593
2594        // Apply temporal drift if configured
2595        let drift_adjusted_amount = {
2596            let drift = self.get_drift_adjustments(posting_date);
2597            if drift.amount_mean_multiplier != 1.0 {
2598                // Apply drift multiplier (includes seasonal factor if enabled)
2599                let multiplier = drift.amount_mean_multiplier * drift.seasonal_factor;
2600                let adjusted = base_amount.to_f64().unwrap_or(1.0) * multiplier;
2601                Decimal::from_f64_retain(adjusted).unwrap_or(base_amount)
2602            } else {
2603                base_amount
2604            }
2605        };
2606
2607        // Apply human variation to amounts for non-automated transactions
2608        let total_amount = if is_automated {
2609            drift_adjusted_amount // Automated systems use exact amounts
2610        } else {
2611            self.apply_human_variation(drift_adjusted_amount)
2612        };
2613
2614        // P0a (corpus-realism): couple the JE total to its line count. The total
2615        // is split across the per-side line count, so without coupling a
2616        // multi-line JE splits a 2-line-sized total into tiny per-line amounts
2617        // (per-line ∝ 1/N), collapsing the line-population median to sub-$10
2618        // (corpus is ~$10K). A mild power law on the per-side line count keeps
2619        // per-line amounts in the corpus band — 2-line JEs are unchanged
2620        // (exponent base 1), and the per-line median decays only gently with N
2621        // (overall/2-line median ratio ~0.8, matching the corpus). Applied on
2622        // the default no-copula path for non-fraud entries only; an explicit
2623        // correlation copula governs the coupling when configured, and fraud
2624        // amounts keep their own signatures. See
2625        // docs/analysis/gl-corpus-realism-roadmap.md (P0a).
2626        let total_amount = if copula_uv.is_none()
2627            && fraud_type.is_none()
2628            && !self.config.disable_line_count_amount_coupling
2629        {
2630            const LINE_COUNT_AMOUNT_EXPONENT: f64 = 0.85;
2631            let per_side = (line_spec.total_count as f64 / 2.0).max(1.0);
2632            if per_side > 1.0 {
2633                // Clamp the scaled total to the configured max so a multi-line
2634                // JE cannot push an individual split line above `max_amount`.
2635                let scaled = (total_amount.to_f64().unwrap_or(0.0)
2636                    * per_side.powf(LINE_COUNT_AMOUNT_EXPONENT))
2637                .min(self.config.amounts.max_amount);
2638                Decimal::from_f64_retain(scaled)
2639                    .map(|d| d.round_dp(2))
2640                    .unwrap_or(total_amount)
2641            } else {
2642                total_amount
2643            }
2644        } else {
2645            total_amount
2646        };
2647
2648        // Enforce the configured amount bounds on the final per-entry total,
2649        // regardless of which transforms ran (drift, seasonal multiplier, or the
2650        // P0a line-count coupling). With the corpus-aligned mu the base can sit
2651        // near `max_amount`, so an upstream multiplier (e.g. a month-end seasonal
2652        // factor) could otherwise push a line above the configured maximum.
2653        let total_amount = {
2654            let v = total_amount.to_f64().unwrap_or(0.0).clamp(
2655                self.config.amounts.min_amount,
2656                self.config.amounts.max_amount,
2657            );
2658            Decimal::from_f64_retain(v)
2659                .map(|d| d.round_dp(2))
2660                .unwrap_or(total_amount)
2661        };
2662
2663        // SP3 T13 — derive the document-type key once for use in all
2664        // fanout-sampler lookups below.  Computed unconditionally so it is
2665        // available for both debit and credit loops without re-deriving.
2666        let doc_type_for_fanout = Self::document_type_for_process(business_process).to_string();
2667
2668        // SP3.3 — resolve cross-entity motif neighbors for this fanout entity.
2669        // We capture an owned Vec<String> here so that the shared borrow on
2670        // `self.loaded_priors` is released before the subsequent `&mut` borrow
2671        // on `fanout_samplers`.
2672        let (gl_neighbor_vec, gl_share_prob): (Vec<String>, f64) =
2673            if let Some(priors) = &self.loaded_priors {
2674                if let Some(motifs) = &priors.cross_entity_motifs {
2675                    (
2676                        motifs.neighbors(&doc_type_for_fanout).to_vec(),
2677                        motifs.should_share(&doc_type_for_fanout),
2678                    )
2679                } else {
2680                    (Vec::new(), 0.0)
2681                }
2682            } else {
2683                (Vec::new(), 0.0)
2684            };
2685
2686        // SOTA-1: recurring/standard-journal templates. On the no-priors path,
2687        // reuse a cached account archetype for this (company, doc-type, counts)
2688        // with high probability so standard postings recur (and a hot account
2689        // subset dominates). Reuse overrides only the line account (set after
2690        // text/RNG below), so amounts/counts/dates stay byte-identical; fresh
2691        // archetypes are captured + cached after the lines are built.
2692        let reuse_archetype = self.pick_recurring_archetype(
2693            &entry.header.company_code,
2694            &doc_type_for_fanout,
2695            line_spec.debit_count,
2696            line_spec.credit_count,
2697        );
2698        let mut fresh_debit_accts: Vec<String> = Vec::new();
2699        let mut fresh_credit_accts: Vec<String> = Vec::new();
2700        // SOTA-8: hoisted so both the debit and credit loops + their SOTA-1 archetype
2701        // override blocks share the same flag.
2702        let sota8_active = self.config.source_conditional_account_pair.enabled;
2703
2704        // Generate debit lines
2705        let debit_amounts = self
2706            .amount_sampler
2707            .sample_summing_to(line_spec.debit_count, total_amount);
2708        for (i, amount) in debit_amounts.into_iter().enumerate() {
2709            // SP3 T13 — GL Account fanout: when priors are loaded, pick the
2710            // account from the BipartiteFanoutSampler keyed "GLAccount" for
2711            // this Source.  Split-borrows let us hold &mut loaded_priors and
2712            // &mut rng at the same time (distinct struct fields).
2713            // SP3 T13 — GL Account fanout for debit lines.
2714            // Pre-compute the fallback before the split-borrow scope so that
2715            // `select_debit_account` (which takes `&mut self`) does not conflict
2716            // with the concurrent borrow of `loaded_priors` and `rng`.
2717            let debit_fallback = self.select_debit_account().account_number.clone();
2718            // SOTA-8: when enabled, the per-source Dirichlet pool (which `select_debit_account`
2719            // has already consulted via try_cond_pick_account_number) takes precedence over the
2720            // SP3/SP4 priors-driven path so the user's explicit source-conditional knob actually
2721            // governs the source-conditional account distribution. `sota8_active` is hoisted
2722            // above this scope so the credit loop can see it too.
2723            let account_number = if sota8_active {
2724                debit_fallback
2725            } else {
2726                let priors_opt = &mut self.loaded_priors;
2727                let rng_ref = &mut self.rng;
2728                if let Some(priors) = priors_opt {
2729                    // SP4.6 — role-aware GL account selection: try (source, "DR")
2730                    // conditional first, then fall back to SP3.7 source-marginal,
2731                    // then to the fanout sampler, then to the default debit account.
2732                    let sp46_gl = entry
2733                        .header
2734                        .sap_source_code
2735                        .as_deref()
2736                        .and_then(|code| priors.sample_gl_for_source_role(code, "DR", rng_ref));
2737                    if let Some(gl) = sp46_gl {
2738                        gl
2739                    } else {
2740                        // SP3.7 — try per-source marginal GL account.
2741                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2742                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2743                        });
2744                        if let Some(gl) = sp37_gl {
2745                            gl
2746                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2747                            // SP3.3: prefer neighbor-used buckets when motifs are available.
2748                            sampler.pick_for_with_neighbors(
2749                                &doc_type_for_fanout,
2750                                &gl_neighbor_vec,
2751                                gl_share_prob,
2752                                rng_ref,
2753                            )
2754                        } else {
2755                            debit_fallback
2756                        }
2757                    }
2758                } else {
2759                    debit_fallback
2760                }
2761            };
2762            let mut line = JournalEntryLine::debit(
2763                entry.header.document_id,
2764                (i + 1) as u32,
2765                account_number.clone(),
2766                amount,
2767            );
2768
2769            // Generate line text if enabled.
2770            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2771            if self.template_config.descriptions.generate_line_text {
2772                let src = entry.header.sap_source_code.as_deref();
2773                let priors_line = if let Some(s) = src {
2774                    if let Some(p) = self.loaded_priors.as_ref() {
2775                        let account_class = p
2776                            .coa_semantic
2777                            .as_ref()
2778                            .and_then(|c| c.accounts.get(&account_number))
2779                            .and_then(|a| a.account_class.as_deref())
2780                            .unwrap_or(
2781                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2782                            );
2783                        // SP6 text_taxonomy cascade
2784                        p.sample_line_template(
2785                            s,
2786                            account_class,
2787                            &mut self.md_resolver,
2788                            &mut self.rng,
2789                        )
2790                    } else {
2791                        None
2792                    }
2793                } else {
2794                    None
2795                };
2796                line.line_text = Some(priors_line.unwrap_or_else(|| {
2797                    self.description_generator.generate_line_text(
2798                        &account_number,
2799                        &context,
2800                        &mut self.rng,
2801                    )
2802                }));
2803            }
2804
2805            // SOTA-1: override the line's account with the reused archetype's
2806            // (RNG + text above are unchanged -> amounts/counts/dates stay
2807            // byte-identical); else capture the fresh account for caching.
2808            // SOTA-1 and SOTA-8 compose: SOTA-8 picks the FIRST archetype's accounts
2809            // from its per-source pool, then SOTA-1 caches + reuses them. Disabling
2810            // SOTA-1 under SOTA-8 actually *worsens* edge concentration — empirically
2811            // measured in Round 0 v4: edges/je 0.35 -> 0.82 when SOTA-1 was bypassed.
2812            if let Some((ref d, _)) = reuse_archetype {
2813                if let Some(a) = d.get(i) {
2814                    line.gl_account = a.clone();
2815                }
2816            } else if self.loaded_priors.is_none() {
2817                fresh_debit_accts.push(line.gl_account.clone());
2818            }
2819            entry.add_line(line);
2820        }
2821
2822        // Generate credit lines - use the SAME amounts to ensure balance
2823        let credit_amounts = self
2824            .amount_sampler
2825            .sample_summing_to(line_spec.credit_count, total_amount);
2826        for (i, amount) in credit_amounts.into_iter().enumerate() {
2827            // SP3 T13 — GL Account fanout for credit lines.
2828            let credit_fallback = self.select_credit_account().account_number.clone();
2829            // SOTA-8 precedence (mirror of the debit-side block above).
2830            let account_number = if sota8_active {
2831                credit_fallback
2832            } else {
2833                let priors_opt = &mut self.loaded_priors;
2834                let rng_ref = &mut self.rng;
2835                if let Some(priors) = priors_opt {
2836                    let sp46_gl = entry
2837                        .header
2838                        .sap_source_code
2839                        .as_deref()
2840                        .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
2841                    if let Some(gl) = sp46_gl {
2842                        gl
2843                    } else {
2844                        let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
2845                            priors.sample_attribute_for_source(code, "gl_account", rng_ref)
2846                        });
2847                        if let Some(gl) = sp37_gl {
2848                            gl
2849                        } else if let Some(sampler) = priors.fanout_samplers.get_mut("GLAccount") {
2850                            sampler.pick_for_with_neighbors(
2851                                &doc_type_for_fanout,
2852                                &gl_neighbor_vec,
2853                                gl_share_prob,
2854                                rng_ref,
2855                            )
2856                        } else {
2857                            credit_fallback
2858                        }
2859                    }
2860                } else {
2861                    credit_fallback
2862                }
2863            };
2864            let mut line = JournalEntryLine::credit(
2865                entry.header.document_id,
2866                (line_spec.debit_count + i + 1) as u32,
2867                account_number.clone(),
2868                amount,
2869            );
2870
2871            // Generate line text if enabled.
2872            // SP6 — Try text-taxonomy (account-class cascade), then DescriptionGenerator.
2873            if self.template_config.descriptions.generate_line_text {
2874                let src = entry.header.sap_source_code.as_deref();
2875                let priors_line = if let Some(s) = src {
2876                    if let Some(p) = self.loaded_priors.as_ref() {
2877                        let account_class = p
2878                            .coa_semantic
2879                            .as_ref()
2880                            .and_then(|c| c.accounts.get(&account_number))
2881                            .and_then(|a| a.account_class.as_deref())
2882                            .unwrap_or(
2883                                datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior::UNKNOWN_CLASS,
2884                            );
2885                        // SP6 text_taxonomy cascade
2886                        p.sample_line_template(
2887                            s,
2888                            account_class,
2889                            &mut self.md_resolver,
2890                            &mut self.rng,
2891                        )
2892                    } else {
2893                        None
2894                    }
2895                } else {
2896                    None
2897                };
2898                line.line_text = Some(priors_line.unwrap_or_else(|| {
2899                    self.description_generator.generate_line_text(
2900                        &account_number,
2901                        &context,
2902                        &mut self.rng,
2903                    )
2904                }));
2905            }
2906
2907            // SOTA-1: override the credit line's account with the reused
2908            // archetype's; else capture the fresh account for caching.
2909            // (Same compose-with-SOTA-8 rationale as the debit block.)
2910            if let Some((_, ref c)) = reuse_archetype {
2911                if let Some(a) = c.get(i) {
2912                    line.gl_account = a.clone();
2913                }
2914            } else if self.loaded_priors.is_none() {
2915                fresh_credit_accts.push(line.gl_account.clone());
2916            }
2917            entry.add_line(line);
2918        }
2919
2920        // SOTA-1: cache the freshly-selected archetype for future reuse so
2921        // standard postings recur (skipped when this JE reused one).
2922        if reuse_archetype.is_none() {
2923            self.cache_recurring_archetype(
2924                &entry.header.company_code,
2925                &doc_type_for_fanout,
2926                std::mem::take(&mut fresh_debit_accts),
2927                std::mem::take(&mut fresh_credit_accts),
2928            );
2929        }
2930
2931        // Enrich line items with account descriptions, cost centers, etc.
2932        self.enrich_line_items(&mut entry);
2933
2934        // Apply persona-based errors if enabled and it's a human user
2935        if self.persona_errors_enabled && !is_automated {
2936            self.maybe_inject_persona_error(&mut entry);
2937        }
2938
2939        // Apply approval workflow if enabled and amount exceeds threshold
2940        if self.approval_enabled {
2941            self.maybe_apply_approval_workflow(&mut entry, posting_date);
2942        }
2943
2944        // Populate approved_by / approval_date from the approval workflow
2945        self.populate_approval_fields(&mut entry, posting_date);
2946
2947        // Maybe start a batch of similar entries for realism
2948        self.maybe_start_batch(&entry);
2949
2950        // SP3.4 + SP3.5b — observe each line through the velocity calibrator and
2951        // apply each returned CalibrationStep to the relevant tunable parameter.
2952        if self.velocity_calibrator.is_some() {
2953            let mut pending: Vec<crate::velocity_calibrator::CalibrationStep> = Vec::new();
2954            for line in &entry.lines {
2955                if let Some(step) = self
2956                    .velocity_calibrator
2957                    .as_mut()
2958                    .and_then(|cal| cal.observe_line(line))
2959                {
2960                    pending.push(step);
2961                }
2962            }
2963            for step in pending {
2964                self.apply_calibration_step(&step);
2965            }
2966        }
2967
2968        // P2: default the document currency to the entity's functional currency
2969        // (base), before the SOTA-4 foreign-document override applies on top.
2970        self.apply_company_currency(&mut entry.header);
2971
2972        // SOTA-4: with a small probability, post this JE in a foreign document
2973        // currency (company-ledger amounts unchanged; adds transaction_amount).
2974        self.maybe_apply_foreign_currency(&mut entry);
2975
2976        // SOTA-5: remember this JE so a later reversal can offset it.
2977        self.record_for_reversal(&entry);
2978
2979        entry
2980    }
2981
2982    /// SP3.5b — Apply a CalibrationStep from the velocity calibrator to the
2983    /// affected tunable parameter on this generator.
2984    ///
2985    /// Only `amounts.lognormal_sigma` (R6) and `amounts.round_dollar_share`
2986    /// (R9) are plumbed in v5.14. R7/R8/R10 parameters (off_hours_share,
2987    /// post_close_share, backdating_share) are observed by the calibrator
2988    /// but not yet consumed on the generator side — see v5.15 for plumbing.
2989    fn apply_calibration_step(&mut self, step: &crate::velocity_calibrator::CalibrationStep) {
2990        match step.parameter.as_str() {
2991            "amounts.lognormal_sigma" => {
2992                self.amount_sampler.set_lognormal_sigma(step.new_value);
2993            }
2994            "amounts.round_dollar_share" => {
2995                self.amount_sampler
2996                    .set_round_number_probability(step.new_value);
2997            }
2998            _ => {
2999                // Unknown / not-yet-plumbed parameter — calibrator records it
3000                // in `adjustments` for inspection; no mutation here.
3001            }
3002        }
3003    }
3004
3005    /// Enable or disable persona-based error injection.
3006    ///
3007    /// When enabled, entries created by human personas have a chance
3008    /// to contain realistic human errors based on their experience level.
3009    pub fn with_persona_errors(mut self, enabled: bool) -> Self {
3010        self.persona_errors_enabled = enabled;
3011        self
3012    }
3013
3014    /// Set fraud configuration for fraud injection.
3015    ///
3016    /// When fraud is enabled in the config, transactions have a chance
3017    /// to be marked as fraudulent based on the configured fraud rate.
3018    pub fn with_fraud_config(mut self, config: FraudConfig) -> Self {
3019        self.fraud_config = config;
3020        self
3021    }
3022
3023    /// Check if persona errors are enabled.
3024    pub fn persona_errors_enabled(&self) -> bool {
3025        self.persona_errors_enabled
3026    }
3027
3028    /// Enable or disable batch processing behavior.
3029    ///
3030    /// When enabled (default), the generator will occasionally produce batches
3031    /// of similar entries, simulating how humans batch similar work together.
3032    pub fn with_batching(mut self, enabled: bool) -> Self {
3033        self.batching_enabled = enabled;
3034        if !enabled {
3035            self.batch_state = None;
3036        }
3037        self
3038    }
3039
3040    /// Check if batch processing is enabled.
3041    pub fn batching_enabled(&self) -> bool {
3042        self.batching_enabled
3043    }
3044
3045    /// P2 (multi-currency): supply a `company_code -> functional-currency` map so
3046    /// each JE's document currency defaults to its entity's functional currency
3047    /// (e.g. EUR for a EUR entity) instead of the group/USD default. The SOTA-4
3048    /// foreign-document override still applies on top.
3049    pub fn with_company_currencies(
3050        mut self,
3051        currencies: std::collections::HashMap<String, String>,
3052    ) -> Self {
3053        self.company_currencies = currencies;
3054        self
3055    }
3056
3057    /// Set the header document currency to the entity's functional currency when a
3058    /// mapping is configured. No-op when the map is empty (keeps the USD default).
3059    fn apply_company_currency(&self, header: &mut JournalEntryHeader) {
3060        if let Some(ccy) = self.company_currencies.get(&header.company_code) {
3061            header.currency = ccy.clone();
3062        }
3063    }
3064
3065    // ---- Shared header-finalization helpers (#43) ------------------------
3066    // `generate()` and `generate_batched_entry()` previously re-implemented the
3067    // trading-partner, description-context, and header-text/reference logic
3068    // independently — when the main path gained header_text/reference, the
3069    // batched copy was never updated, so batched JEs had null header_text /
3070    // reference. These helpers are the single source of truth for both paths,
3071    // so the batched path can no longer drift. Extracted verbatim from the
3072    // primary path, so its RNG-draw order is unchanged (byte-identical).
3073
3074    /// SP3.9/SP3.12 — assign the JE-level trading partner (one TP per document),
3075    /// biased toward TP-motif cluster-mates of the previous TP on the same source.
3076    fn apply_trading_partner_motif(&mut self, header: &mut JournalEntryHeader) {
3077        let code_opt = header.sap_source_code.clone();
3078        if let Some(ref code) = code_opt {
3079            let rng_ref = &mut self.rng;
3080            let tp_neighbors: Vec<String> = if let Some(ref priors) = self.loaded_priors {
3081                if let Some(ref motifs) = priors.tp_motif_sampler {
3082                    if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3083                        motifs.neighbors(last_tp).to_vec()
3084                    } else {
3085                        Vec::new()
3086                    }
3087                } else {
3088                    Vec::new()
3089                }
3090            } else {
3091                Vec::new()
3092            };
3093            let tp_share_prob: f64 = if let Some(ref priors) = self.loaded_priors {
3094                if let Some(ref motifs) = priors.tp_motif_sampler {
3095                    if let Some(last_tp) = self.last_tp_by_source.get(code.as_str()) {
3096                        motifs.should_share(last_tp)
3097                    } else {
3098                        0.0
3099                    }
3100                } else {
3101                    0.0
3102                }
3103            } else {
3104                0.0
3105            };
3106            if let Some(ref mut priors) = self.loaded_priors {
3107                use datasynth_core::distributions::behavioral_priors::CategoricalDistribution;
3108                let tp = if !tp_neighbors.is_empty()
3109                    && tp_share_prob > 0.0
3110                    && rng_ref.random_range(0.0..1.0) < tp_share_prob
3111                {
3112                    let filtered: std::collections::BTreeMap<String, f64> = priors
3113                        .per_source_attribute
3114                        .as_ref()
3115                        .and_then(|psa| psa.conditional(code, "trading_partner"))
3116                        .map(|dist| {
3117                            dist.probabilities
3118                                .iter()
3119                                .filter(|(v, _)| tp_neighbors.contains(v))
3120                                .map(|(v, p)| (v.clone(), *p))
3121                                .collect()
3122                        })
3123                        .unwrap_or_default();
3124                    if filtered.is_empty() {
3125                        priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3126                    } else {
3127                        let neighbour_dist = CategoricalDistribution {
3128                            probabilities: filtered,
3129                            n: 0,
3130                        };
3131                        neighbour_dist.sample(rng_ref).or_else(|| {
3132                            priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3133                        })
3134                    }
3135                } else {
3136                    priors.sample_attribute_for_source(code, "trading_partner", rng_ref)
3137                };
3138                header.trading_partner = tp;
3139            }
3140            if let Some(ref tp) = header.trading_partner {
3141                self.last_tp_by_source.insert(code.clone(), tp.clone());
3142            }
3143        }
3144    }
3145
3146    /// Build the per-JE description context (period + vendor/customer name by
3147    /// business process).
3148    fn build_description_context(
3149        &mut self,
3150        business_process: BusinessProcess,
3151        posting_date: chrono::NaiveDate,
3152    ) -> DescriptionContext {
3153        let mut context =
3154            DescriptionContext::with_period(posting_date.month(), posting_date.year());
3155        match business_process {
3156            BusinessProcess::P2P => {
3157                if let Some(vendor) = self.vendor_pool.random_vendor(&mut self.rng) {
3158                    context.vendor_name = Some(vendor.name.clone());
3159                }
3160            }
3161            BusinessProcess::O2C => {
3162                if let Some(customer) = self.customer_pool.random_customer(&mut self.rng) {
3163                    context.customer_name = Some(customer.name.clone());
3164                }
3165            }
3166            _ => {}
3167        }
3168        context
3169    }
3170
3171    /// Populate `header_text`, `reference`, and the derived `source_document`
3172    /// per the template config + loaded priors (SP6 text-taxonomy / SP4.7
3173    /// reference-format priors, with the built-in generators as fallback).
3174    fn apply_header_text_and_reference(
3175        &mut self,
3176        header: &mut JournalEntryHeader,
3177        business_process: BusinessProcess,
3178        context: &DescriptionContext,
3179        posting_date: chrono::NaiveDate,
3180    ) {
3181        if self.template_config.descriptions.generate_header_text {
3182            let priors_header = if let Some(src) = header.sap_source_code.as_deref() {
3183                if let Some(p) = self.loaded_priors.as_ref() {
3184                    p.sample_header_template(src, &mut self.md_resolver, &mut self.rng)
3185                } else {
3186                    None
3187                }
3188            } else {
3189                None
3190            };
3191            header.header_text = Some(priors_header.unwrap_or_else(|| {
3192                self.description_generator.generate_header_text(
3193                    business_process,
3194                    context,
3195                    &mut self.rng,
3196                )
3197            }));
3198        }
3199        if self.template_config.references.generate_references {
3200            let priors_ref = header.sap_source_code.as_deref().and_then(|src| {
3201                self.loaded_priors
3202                    .as_ref()
3203                    .and_then(|p| p.sample_reference(src, &mut self.rng))
3204            });
3205            header.reference = Some(priors_ref.unwrap_or_else(|| {
3206                self.reference_generator
3207                    .generate_for_process_year(business_process, posting_date.year())
3208            }));
3209        }
3210        header.source_document = header
3211            .reference
3212            .as_deref()
3213            .and_then(DocumentRef::parse)
3214            .or_else(|| {
3215                if header.source == TransactionSource::Manual {
3216                    Some(DocumentRef::Manual)
3217                } else {
3218                    None
3219                }
3220            });
3221    }
3222
3223    /// Maybe start a batch based on the current entry.
3224    ///
3225    /// Humans often batch similar work: processing invoices from one vendor,
3226    /// entering expense reports for a trip, reconciling similar items.
3227    fn maybe_start_batch(&mut self, entry: &JournalEntry) {
3228        // Respect the persistent kill-switch (`with_batching(false)`).
3229        if !self.batching_enabled {
3230            return;
3231        }
3232        // Only start batch for non-automated, non-fraud entries
3233        if entry.header.source == TransactionSource::Automated || entry.header.is_fraud {
3234            return;
3235        }
3236
3237        // 15% chance to start a batch (most work is not batched)
3238        if self.rng.random::<f64>() > 0.15 {
3239            return;
3240        }
3241
3242        // Extract key attributes for batching
3243        let base_account = entry
3244            .lines
3245            .first()
3246            .map(|l| l.gl_account.clone())
3247            .unwrap_or_default();
3248
3249        let base_amount = entry.total_debit();
3250
3251        self.batch_state = Some(BatchState {
3252            base_account_number: base_account,
3253            base_amount,
3254            base_business_process: entry.header.business_process,
3255            base_posting_date: entry.header.posting_date,
3256            remaining: self.rng.random_range(2..7), // 2-6 more similar entries
3257        });
3258    }
3259
3260    /// Generate an entry that's part of the current batch.
3261    ///
3262    /// Batched entries have:
3263    /// - Same or very similar business process
3264    /// - Same posting date (batched work done together)
3265    /// - Similar amounts (within ±15%)
3266    /// - Same debit account (processing similar items)
3267    fn generate_batched_entry(&mut self) -> JournalEntry {
3268        use rust_decimal::Decimal;
3269
3270        // Decrement batch counter
3271        if let Some(ref mut state) = self.batch_state {
3272            state.remaining = state.remaining.saturating_sub(1);
3273        }
3274
3275        let Some(batch) = self.batch_state.clone() else {
3276            // This is a programming error - batch_state should be set before calling this method.
3277            // Clear state and fall back to generating a standard entry instead of panicking.
3278            tracing::warn!(
3279                "generate_batched_entry called without batch_state; generating standard entry"
3280            );
3281            self.batch_state = None;
3282            return self.generate();
3283        };
3284
3285        // Use the batch's posting date (work done on same day)
3286        let posting_date = batch.base_posting_date;
3287
3288        self.count += 1;
3289        let document_id = self.generate_deterministic_uuid();
3290
3291        // Select same company (batched work is usually same company)
3292        let company_code = self.company_selector.select(&mut self.rng).to_string();
3293
3294        // Use simplified line spec for batched entries (usually 2-line)
3295        let _line_spec = LineItemSpec {
3296            total_count: 2,
3297            debit_count: 1,
3298            credit_count: 1,
3299            split_type: DebitCreditSplit::Equal,
3300        };
3301
3302        // Batched entries are always manual
3303        let source = TransactionSource::Manual;
3304
3305        // SP3.6 — sample SAP source code for the batch entry when priors loaded.
3306        let sap_source_code: Option<String> = self.sample_sap_source_code();
3307        // SOTA-8: stash the batch JE's source for the per-source pool consult.
3308        self.current_je_source = sap_source_code.clone();
3309
3310        // Use the batch's business process
3311        let business_process = batch.base_business_process.unwrap_or(BusinessProcess::R2R);
3312
3313        // Sample time
3314        let time = self.temporal_sampler.sample_time(true);
3315        let created_at = posting_date.and_time(time).and_utc();
3316
3317        // Same user for batched work
3318        let (created_by, user_persona) = self.select_user(false);
3319
3320        // Create header
3321        let mut header =
3322            JournalEntryHeader::with_deterministic_id(company_code, posting_date, document_id);
3323        header.created_at = created_at;
3324        header.source = source;
3325        header.sap_source_code = sap_source_code;
3326
3327        // SP3.9/SP3.12 — JE-level trading partner (shared helper; #43 — same code
3328        // as the primary path, so the two can no longer drift).
3329        self.apply_trading_partner_motif(&mut header);
3330
3331        // SP4.5 — user-persona prior for batched entries (same pattern as primary path).
3332        let (created_by, created_at) = {
3333            let sap_code_for_user = header.sap_source_code.clone();
3334            if let (Some(ref code), Some(ref priors)) = (sap_code_for_user, &self.loaded_priors) {
3335                if let Some(uid) = priors.sample_user_for_source(code, &mut self.rng) {
3336                    let new_created_at = if let Some((hour, _)) =
3337                        priors.sample_timestamp_for_user(&uid, &mut self.rng)
3338                    {
3339                        let base = header.created_at;
3340                        base.date_naive()
3341                            .and_hms_opt(hour, 0, 0)
3342                            .map(|naive| naive.and_utc())
3343                            .unwrap_or(base)
3344                    } else {
3345                        header.created_at
3346                    };
3347                    (uid, new_created_at)
3348                } else {
3349                    (created_by, header.created_at)
3350                }
3351            } else {
3352                (created_by, header.created_at)
3353            }
3354        };
3355
3356        header.created_by = created_by;
3357        header.created_at = created_at;
3358        header.user_persona = user_persona;
3359        header.business_process = Some(business_process);
3360        header.document_type = Self::document_type_for_process(business_process).to_string();
3361
3362        // Batched manual entries have Manual source document
3363        header.source_document = Some(DocumentRef::Manual);
3364
3365        // ISA 240 audit flags for batched entries (always manual)
3366        header.is_manual = true;
3367        header.source_system = if self.rng.random::<f64>() < 0.70 {
3368            "manual".to_string()
3369        } else {
3370            "spreadsheet".to_string()
3371        };
3372        header.is_post_close = posting_date.month() == self.end_date.month()
3373            && posting_date.year() == self.end_date.year()
3374            && posting_date.day() > 25;
3375        header.created_date =
3376            posting_date.and_hms_opt(time.hour().min(23), time.minute(), time.second());
3377
3378        // Generate similar amount (within ±15% of base)
3379        let variation = self.rng.random_range(-0.15..0.15);
3380        let varied_amount =
3381            batch.base_amount * (Decimal::ONE + Decimal::try_from(variation).unwrap_or_default());
3382        let total_amount = varied_amount.round_dp(2).max(Decimal::from(1));
3383
3384        // #43 — populate header_text / reference (+ derived source_document) the
3385        // same way the primary path does, so batched JEs are no longer null.
3386        // enrich_line_items (below) then fills line_text from header_text.
3387        let context = self.build_description_context(business_process, posting_date);
3388        self.apply_header_text_and_reference(&mut header, business_process, &context, posting_date);
3389
3390        // Create the entry
3391        let mut entry = JournalEntry::new(header);
3392
3393        // Use same debit account as batch base
3394        let debit_line = JournalEntryLine::debit(
3395            entry.header.document_id,
3396            1,
3397            batch.base_account_number.clone(),
3398            total_amount,
3399        );
3400        entry.add_line(debit_line);
3401
3402        // SP3.12 W3 — Select a credit account for the batched entry.
3403        // When priors are loaded and this entry has a SAP source code, use the
3404        // per-source GL-account conditional (same as the primary generate() path).
3405        // This prevents batched entries from adding legacy-CoA accounts to the
3406        // Source-Source projection graph, which was inflating graph density and
3407        // driving the P3 ClusteringGap metric above 30× DR.
3408        let credit_fallback = self.select_credit_account().account_number.clone();
3409        let credit_account = {
3410            let priors_opt = &mut self.loaded_priors;
3411            let rng_ref = &mut self.rng;
3412            if let Some(priors) = priors_opt {
3413                // SP4.6 — role-aware GL for the batched-entry credit line.
3414                // Try (source, "CR") first, then source-marginal, then fallback.
3415                let sp46_gl = entry
3416                    .header
3417                    .sap_source_code
3418                    .as_deref()
3419                    .and_then(|code| priors.sample_gl_for_source_role(code, "CR", rng_ref));
3420                if let Some(gl) = sp46_gl {
3421                    gl
3422                } else {
3423                    let sp37_gl = entry.header.sap_source_code.as_deref().and_then(|code| {
3424                        priors.sample_attribute_for_source(code, "gl_account", rng_ref)
3425                    });
3426                    sp37_gl.unwrap_or(credit_fallback)
3427                }
3428            } else {
3429                credit_fallback
3430            }
3431        };
3432        let credit_line =
3433            JournalEntryLine::credit(entry.header.document_id, 2, credit_account, total_amount);
3434        entry.add_line(credit_line);
3435
3436        // Enrich line items with account descriptions, cost centers, etc.
3437        self.enrich_line_items(&mut entry);
3438
3439        // Apply persona-based errors if enabled
3440        if self.persona_errors_enabled {
3441            self.maybe_inject_persona_error(&mut entry);
3442        }
3443
3444        // Apply approval workflow if enabled
3445        if self.approval_enabled {
3446            self.maybe_apply_approval_workflow(&mut entry, posting_date);
3447        }
3448
3449        // Populate approved_by / approval_date from the approval workflow
3450        self.populate_approval_fields(&mut entry, posting_date);
3451
3452        // Clear batch state if no more entries remaining
3453        if batch.remaining <= 1 {
3454            self.batch_state = None;
3455        }
3456
3457        // P2: entity functional currency for batched entries too.
3458        self.apply_company_currency(&mut entry.header);
3459
3460        entry
3461    }
3462
3463    /// Maybe inject a persona-appropriate error based on the persona's error rate.
3464    fn maybe_inject_persona_error(&mut self, entry: &mut JournalEntry) {
3465        // Parse persona from the entry header
3466        let persona_str = &entry.header.user_persona;
3467        let persona = match persona_str.to_lowercase().as_str() {
3468            s if s.contains("junior") => UserPersona::JuniorAccountant,
3469            s if s.contains("senior") => UserPersona::SeniorAccountant,
3470            s if s.contains("controller") => UserPersona::Controller,
3471            s if s.contains("manager") => UserPersona::Manager,
3472            s if s.contains("executive") => UserPersona::Executive,
3473            _ => return, // Don't inject errors for unknown personas
3474        };
3475
3476        // Get base error rate from persona
3477        let base_error_rate = persona.error_rate();
3478
3479        // Apply stress factors based on posting date
3480        let adjusted_rate = self.apply_stress_factors(base_error_rate, entry.header.posting_date);
3481
3482        // Check if error should occur based on adjusted rate
3483        if self.rng.random::<f64>() >= adjusted_rate {
3484            return; // No error this time
3485        }
3486
3487        // Select and inject persona-appropriate error
3488        self.inject_human_error(entry, persona);
3489    }
3490
3491    /// Apply contextual stress factors to the base error rate.
3492    ///
3493    /// Stress factors increase error likelihood during:
3494    /// - Month-end (day >= 28): 1.5x more errors due to deadline pressure
3495    /// - Quarter-end (Mar, Jun, Sep, Dec): additional 25% boost
3496    /// - Year-end (December 28-31): 2.0x more errors due to audit pressure
3497    /// - Monday morning (catch-up work): 20% more errors
3498    /// - Friday afternoon (rushing to leave): 30% more errors
3499    fn apply_stress_factors(&self, base_rate: f64, posting_date: chrono::NaiveDate) -> f64 {
3500        use chrono::Datelike;
3501
3502        let mut rate = base_rate;
3503        let day = posting_date.day();
3504        let month = posting_date.month();
3505
3506        // Year-end stress (December 28-31): double the error rate
3507        if month == 12 && day >= 28 {
3508            rate *= 2.0;
3509            return rate.min(0.5); // Cap at 50% to keep it realistic
3510        }
3511
3512        // Quarter-end stress (last days of Mar, Jun, Sep, Dec)
3513        if matches!(month, 3 | 6 | 9 | 12) && day >= 28 {
3514            rate *= 1.75; // 75% more errors at quarter end
3515            return rate.min(0.4);
3516        }
3517
3518        // Month-end stress (last 3 days of month)
3519        if day >= 28 {
3520            rate *= 1.5; // 50% more errors at month end
3521        }
3522
3523        // Day-of-week stress effects
3524        let weekday = posting_date.weekday();
3525        match weekday {
3526            chrono::Weekday::Mon => {
3527                // Monday: catching up, often rushed
3528                rate *= 1.2;
3529            }
3530            chrono::Weekday::Fri => {
3531                // Friday: rushing to finish before weekend
3532                rate *= 1.3;
3533            }
3534            _ => {}
3535        }
3536
3537        // Cap at 40% to keep it realistic
3538        rate.min(0.4)
3539    }
3540
3541    /// Apply human-like variation to an amount.
3542    ///
3543    /// Humans don't enter perfectly calculated amounts - they:
3544    /// - Round amounts differently
3545    /// - Estimate instead of calculating exactly
3546    /// - Make small input variations
3547    ///
3548    /// This applies small variations (typically ±2%) to make amounts more realistic.
3549    fn apply_human_variation(&mut self, amount: rust_decimal::Decimal) -> rust_decimal::Decimal {
3550        use rust_decimal::Decimal;
3551
3552        // Automated transactions or very small amounts don't get variation
3553        if amount < Decimal::from(10) {
3554            return amount;
3555        }
3556
3557        // 70% chance of human variation being applied
3558        if self.rng.random::<f64>() > 0.70 {
3559            return amount;
3560        }
3561
3562        // Decide which type of human variation to apply
3563        let variation_type: u8 = self.rng.random_range(0..4);
3564
3565        match variation_type {
3566            0 => {
3567                // ±2% variation (common for estimated amounts)
3568                let variation_pct = self.rng.random_range(-0.02..0.02);
3569                let variation = amount * Decimal::try_from(variation_pct).unwrap_or_default();
3570                (amount + variation).round_dp(2)
3571            }
3572            1 => {
3573                // Round to nearest $10
3574                let ten = Decimal::from(10);
3575                (amount / ten).round() * ten
3576            }
3577            2 => {
3578                // Round to nearest $100 (for larger amounts)
3579                if amount >= Decimal::from(500) {
3580                    let hundred = Decimal::from(100);
3581                    (amount / hundred).round() * hundred
3582                } else {
3583                    amount
3584                }
3585            }
3586            3 => {
3587                // Slight under/over payment (±$0.01 to ±$1.00)
3588                let cents = Decimal::new(self.rng.random_range(-100..100), 2);
3589                (amount + cents).max(Decimal::ZERO).round_dp(2)
3590            }
3591            _ => amount,
3592        }
3593    }
3594
3595    /// Rebalance an entry after a one-sided amount modification.
3596    ///
3597    /// When an error modifies one line's amount, this finds a line on the opposite
3598    /// side (credit if modified was debit, or vice versa) and adjusts it by the
3599    /// same impact to maintain balance.
3600    fn rebalance_entry(entry: &mut JournalEntry, modified_was_debit: bool, impact: Decimal) {
3601        // Find a line on the opposite side to adjust
3602        let balancing_idx = entry.lines.iter().position(|l| {
3603            if modified_was_debit {
3604                l.credit_amount > Decimal::ZERO
3605            } else {
3606                l.debit_amount > Decimal::ZERO
3607            }
3608        });
3609
3610        if let Some(idx) = balancing_idx {
3611            if modified_was_debit {
3612                entry.lines[idx].credit_amount += impact;
3613            } else {
3614                entry.lines[idx].debit_amount += impact;
3615            }
3616        }
3617    }
3618
3619    /// Inject a human-like error based on the persona.
3620    ///
3621    /// All error types maintain balance - amount modifications are applied to both sides.
3622    /// Entries are marked with [HUMAN_ERROR:*] tags in header_text for ML detection.
3623    fn inject_human_error(&mut self, entry: &mut JournalEntry, persona: UserPersona) {
3624        use rust_decimal::Decimal;
3625
3626        // Different personas make different types of errors
3627        let error_type: u8 = match persona {
3628            UserPersona::JuniorAccountant => {
3629                // Junior accountants make more varied errors
3630                self.rng.random_range(0..5)
3631            }
3632            UserPersona::SeniorAccountant => {
3633                // Senior accountants mainly make transposition errors
3634                self.rng.random_range(0..3)
3635            }
3636            UserPersona::Controller | UserPersona::Manager => {
3637                // Controllers/managers mainly make rounding or cutoff errors
3638                self.rng.random_range(3..5)
3639            }
3640            _ => return,
3641        };
3642
3643        match error_type {
3644            0 => {
3645                // Transposed digits in an amount
3646                if let Some(line) = entry.lines.get_mut(0) {
3647                    let is_debit = line.debit_amount > Decimal::ZERO;
3648                    let original_amount = if is_debit {
3649                        line.debit_amount
3650                    } else {
3651                        line.credit_amount
3652                    };
3653
3654                    // Simple digit swap in the string representation
3655                    let s = original_amount.to_string();
3656                    if s.len() >= 2 {
3657                        let chars: Vec<char> = s.chars().collect();
3658                        let pos = self.rng.random_range(0..chars.len().saturating_sub(1));
3659                        if chars[pos].is_ascii_digit()
3660                            && chars.get(pos + 1).is_some_and(char::is_ascii_digit)
3661                        {
3662                            let mut new_chars = chars;
3663                            new_chars.swap(pos, pos + 1);
3664                            if let Ok(new_amount) =
3665                                new_chars.into_iter().collect::<String>().parse::<Decimal>()
3666                            {
3667                                let impact = new_amount - original_amount;
3668
3669                                // Apply to the modified line
3670                                if is_debit {
3671                                    entry.lines[0].debit_amount = new_amount;
3672                                } else {
3673                                    entry.lines[0].credit_amount = new_amount;
3674                                }
3675
3676                                // Rebalance the entry
3677                                Self::rebalance_entry(entry, is_debit, impact);
3678
3679                                entry.header.header_text = Some(
3680                                    entry.header.header_text.clone().unwrap_or_default()
3681                                        + " [HUMAN_ERROR:TRANSPOSITION]",
3682                                );
3683                            }
3684                        }
3685                    }
3686                }
3687            }
3688            1 => {
3689                // Wrong decimal place (off by factor of 10)
3690                if let Some(line) = entry.lines.get_mut(0) {
3691                    let is_debit = line.debit_amount > Decimal::ZERO;
3692                    let original_amount = if is_debit {
3693                        line.debit_amount
3694                    } else {
3695                        line.credit_amount
3696                    };
3697
3698                    let new_amount = original_amount * Decimal::new(10, 0);
3699                    let impact = new_amount - original_amount;
3700
3701                    // Apply to the modified line
3702                    if is_debit {
3703                        entry.lines[0].debit_amount = new_amount;
3704                    } else {
3705                        entry.lines[0].credit_amount = new_amount;
3706                    }
3707
3708                    // Rebalance the entry
3709                    Self::rebalance_entry(entry, is_debit, impact);
3710
3711                    entry.header.header_text = Some(
3712                        entry.header.header_text.clone().unwrap_or_default()
3713                            + " [HUMAN_ERROR:DECIMAL_SHIFT]",
3714                    );
3715                }
3716            }
3717            2 => {
3718                // Typo in description (doesn't affect balance)
3719                if let Some(ref mut text) = entry.header.header_text {
3720                    let typos = ["teh", "adn", "wiht", "taht", "recieve"];
3721                    let correct = ["the", "and", "with", "that", "receive"];
3722                    let idx = self.rng.random_range(0..typos.len());
3723                    if text.to_lowercase().contains(correct[idx]) {
3724                        *text = text.replace(correct[idx], typos[idx]);
3725                        *text = format!("{text} [HUMAN_ERROR:TYPO]");
3726                    }
3727                }
3728            }
3729            3 => {
3730                // Rounding to round number
3731                if let Some(line) = entry.lines.get_mut(0) {
3732                    let is_debit = line.debit_amount > Decimal::ZERO;
3733                    let original_amount = if is_debit {
3734                        line.debit_amount
3735                    } else {
3736                        line.credit_amount
3737                    };
3738
3739                    let new_amount =
3740                        (original_amount / Decimal::new(100, 0)).round() * Decimal::new(100, 0);
3741                    let impact = new_amount - original_amount;
3742
3743                    // Apply to the modified line
3744                    if is_debit {
3745                        entry.lines[0].debit_amount = new_amount;
3746                    } else {
3747                        entry.lines[0].credit_amount = new_amount;
3748                    }
3749
3750                    // Rebalance the entry
3751                    Self::rebalance_entry(entry, is_debit, impact);
3752
3753                    entry.header.header_text = Some(
3754                        entry.header.header_text.clone().unwrap_or_default()
3755                            + " [HUMAN_ERROR:ROUNDED]",
3756                    );
3757                }
3758            }
3759            // Late posting marker (document date much earlier than posting
3760            // date). Doesn't create an imbalance.
3761            4 if entry.header.document_date == entry.header.posting_date => {
3762                let days_late = self.rng.random_range(5..15);
3763                entry.header.document_date =
3764                    entry.header.posting_date - chrono::Duration::days(days_late);
3765                entry.header.header_text = Some(
3766                    entry.header.header_text.clone().unwrap_or_default()
3767                        + " [HUMAN_ERROR:LATE_POSTING]",
3768                );
3769            }
3770            _ => {}
3771        }
3772    }
3773
3774    /// Apply approval workflow for high-value transactions.
3775    ///
3776    /// If the entry amount exceeds the approval threshold, simulate an
3777    /// approval workflow with appropriate approvers based on amount.
3778    fn maybe_apply_approval_workflow(
3779        &mut self,
3780        entry: &mut JournalEntry,
3781        _posting_date: NaiveDate,
3782    ) {
3783        use rust_decimal::Decimal;
3784
3785        let amount = entry.total_debit();
3786
3787        // Skip if amount is below threshold
3788        if amount <= self.approval_threshold {
3789            // Auto-approved below threshold
3790            let workflow = ApprovalWorkflow::auto_approved(
3791                entry.header.created_by.clone(),
3792                entry.header.user_persona.clone(),
3793                amount,
3794                entry.header.created_at,
3795            );
3796            entry.header.approval_workflow = Some(workflow);
3797            return;
3798        }
3799
3800        // Mark as SOX relevant for high-value transactions
3801        entry.header.sox_relevant = true;
3802
3803        // Determine required approval levels based on amount
3804        let required_levels = if amount > Decimal::new(100000, 0) {
3805            3 // Executive approval required
3806        } else if amount > Decimal::new(50000, 0) {
3807            2 // Senior management approval
3808        } else {
3809            1 // Manager approval
3810        };
3811
3812        // Create the approval workflow
3813        let mut workflow = ApprovalWorkflow::new(
3814            entry.header.created_by.clone(),
3815            entry.header.user_persona.clone(),
3816            amount,
3817        );
3818        workflow.required_levels = required_levels;
3819
3820        // Simulate submission
3821        let submit_time = entry.header.created_at;
3822        let submit_action = ApprovalAction::new(
3823            entry.header.created_by.clone(),
3824            entry.header.user_persona.clone(),
3825            self.parse_persona(&entry.header.user_persona),
3826            ApprovalActionType::Submit,
3827            0,
3828        )
3829        .with_timestamp(submit_time);
3830
3831        workflow.actions.push(submit_action);
3832        workflow.status = ApprovalStatus::Pending;
3833        workflow.submitted_at = Some(submit_time);
3834
3835        // Simulate approvals with realistic delays
3836        let mut current_time = submit_time;
3837        for level in 1..=required_levels {
3838            // Add delay for approval (1-3 business hours per level)
3839            let delay_hours = self.rng.random_range(1..4);
3840            current_time += chrono::Duration::hours(delay_hours);
3841
3842            // Skip weekends
3843            while current_time.weekday() == chrono::Weekday::Sat
3844                || current_time.weekday() == chrono::Weekday::Sun
3845            {
3846                current_time += chrono::Duration::days(1);
3847            }
3848
3849            // Generate approver based on level
3850            let (approver_id, approver_role) = self.select_approver(level);
3851
3852            let approve_action = ApprovalAction::new(
3853                approver_id.clone(),
3854                approver_role.to_string(),
3855                approver_role,
3856                ApprovalActionType::Approve,
3857                level,
3858            )
3859            .with_timestamp(current_time);
3860
3861            workflow.actions.push(approve_action);
3862            workflow.current_level = level;
3863        }
3864
3865        // Mark as approved
3866        workflow.status = ApprovalStatus::Approved;
3867        workflow.approved_at = Some(current_time);
3868
3869        entry.header.approval_workflow = Some(workflow);
3870    }
3871
3872    /// Select an approver based on the required level.
3873    fn select_approver(&mut self, level: u8) -> (String, UserPersona) {
3874        let persona = match level {
3875            1 => UserPersona::Manager,
3876            2 => UserPersona::Controller,
3877            _ => UserPersona::Executive,
3878        };
3879
3880        // Try to get from user pool first
3881        if let Some(ref pool) = self.user_pool {
3882            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
3883                return (user.user_id.clone(), persona);
3884            }
3885        }
3886
3887        // Fallback to generated approver
3888        let approver_id = match persona {
3889            UserPersona::Manager => format!("MGR{:04}", self.rng.random_range(1..100)),
3890            UserPersona::Controller => format!("CTRL{:04}", self.rng.random_range(1..20)),
3891            UserPersona::Executive => format!("EXEC{:04}", self.rng.random_range(1..10)),
3892            _ => format!("USR{:04}", self.rng.random_range(1..1000)),
3893        };
3894
3895        (approver_id, persona)
3896    }
3897
3898    /// Parse user persona from string.
3899    fn parse_persona(&self, persona_str: &str) -> UserPersona {
3900        match persona_str.to_lowercase().as_str() {
3901            s if s.contains("junior") => UserPersona::JuniorAccountant,
3902            s if s.contains("senior") => UserPersona::SeniorAccountant,
3903            s if s.contains("controller") => UserPersona::Controller,
3904            s if s.contains("manager") => UserPersona::Manager,
3905            s if s.contains("executive") => UserPersona::Executive,
3906            s if s.contains("automated") || s.contains("system") => UserPersona::AutomatedSystem,
3907            _ => UserPersona::JuniorAccountant, // Default
3908        }
3909    }
3910
3911    /// Enable or disable approval workflow.
3912    pub fn with_approval(mut self, enabled: bool) -> Self {
3913        self.approval_enabled = enabled;
3914        self
3915    }
3916
3917    /// Set the approval threshold amount.
3918    pub fn with_approval_threshold(mut self, threshold: rust_decimal::Decimal) -> Self {
3919        self.approval_threshold = threshold;
3920        self
3921    }
3922
3923    /// Set the SOD violation rate for approval tracking.
3924    ///
3925    /// When a transaction is approved, there is a `rate` probability (0.0 to 1.0)
3926    /// that the approver is the same as the creator, which constitutes a SOD violation.
3927    /// Default is 0.10 (10%).
3928    pub fn with_sod_violation_rate(mut self, rate: f64) -> Self {
3929        self.sod_violation_rate = rate;
3930        self
3931    }
3932
3933    /// Populate `approved_by` and `approval_date` from the approval workflow,
3934    /// and flag SOD violations when the approver matches the creator.
3935    fn populate_approval_fields(&mut self, entry: &mut JournalEntry, posting_date: NaiveDate) {
3936        if let Some(ref workflow) = entry.header.approval_workflow {
3937            // Extract the last approver from the workflow actions
3938            let last_approver = workflow
3939                .actions
3940                .iter()
3941                .rev()
3942                .find(|a| matches!(a.action, ApprovalActionType::Approve));
3943
3944            if let Some(approver_action) = last_approver {
3945                entry.header.approved_by = Some(approver_action.actor_id.clone());
3946                entry.header.approval_date = Some(approver_action.action_timestamp.date_naive());
3947            } else {
3948                // No explicit approver (auto-approved); use the preparer
3949                entry.header.approved_by = Some(workflow.preparer_id.clone());
3950                entry.header.approval_date = Some(posting_date);
3951            }
3952
3953            // Inject SOD violation: with configured probability, set approver = creator
3954            if self.rng.random::<f64>() < self.sod_violation_rate {
3955                let creator = entry.header.created_by.clone();
3956                entry.header.approved_by = Some(creator);
3957                entry.header.sod_violation = true;
3958                entry.header.sod_conflict_type = Some(SodConflictType::PreparerApprover);
3959            }
3960        }
3961    }
3962
3963    /// Set the temporal drift controller for simulating distribution changes over time.
3964    ///
3965    /// When drift is enabled, amounts and other distributions will shift based on
3966    /// the period (month) to simulate realistic temporal evolution like inflation
3967    /// or increasing fraud rates.
3968    pub fn with_drift_controller(mut self, controller: DriftController) -> Self {
3969        self.drift_controller = Some(controller);
3970        self
3971    }
3972
3973    /// Set drift configuration directly.
3974    ///
3975    /// Creates a drift controller from the config. Total periods is calculated
3976    /// from the date range.
3977    pub fn with_drift_config(mut self, config: DriftConfig, seed: u64) -> Self {
3978        if config.enabled {
3979            let total_periods = self.calculate_total_periods();
3980            self.drift_controller = Some(DriftController::new(config, seed, total_periods));
3981        }
3982        self
3983    }
3984
3985    /// Calculate total periods (months) in the date range.
3986    fn calculate_total_periods(&self) -> u32 {
3987        let start_year = self.start_date.year();
3988        let start_month = self.start_date.month();
3989        let end_year = self.end_date.year();
3990        let end_month = self.end_date.month();
3991
3992        ((end_year - start_year) * 12 + (end_month as i32 - start_month as i32) + 1).max(1) as u32
3993    }
3994
3995    /// Calculate the period number (0-indexed) for a given date.
3996    fn date_to_period(&self, date: NaiveDate) -> u32 {
3997        let start_year = self.start_date.year();
3998        let start_month = self.start_date.month() as i32;
3999        let date_year = date.year();
4000        let date_month = date.month() as i32;
4001
4002        ((date_year - start_year) * 12 + (date_month - start_month)).max(0) as u32
4003    }
4004
4005    /// Get drift adjustments for a given date.
4006    fn get_drift_adjustments(&self, date: NaiveDate) -> DriftAdjustments {
4007        if let Some(ref controller) = self.drift_controller {
4008            let period = self.date_to_period(date);
4009            controller.compute_adjustments(period)
4010        } else {
4011            DriftAdjustments::none()
4012        }
4013    }
4014
4015    /// Select a user from the pool or generate a generic user ID.
4016    #[inline]
4017    fn select_user(&mut self, is_automated: bool) -> (String, String) {
4018        if let Some(ref pool) = self.user_pool {
4019            let persona = if is_automated {
4020                UserPersona::AutomatedSystem
4021            } else {
4022                // Random distribution among human personas
4023                let roll: f64 = self.rng.random();
4024                if roll < 0.4 {
4025                    UserPersona::JuniorAccountant
4026                } else if roll < 0.7 {
4027                    UserPersona::SeniorAccountant
4028                } else if roll < 0.85 {
4029                    UserPersona::Controller
4030                } else {
4031                    UserPersona::Manager
4032                }
4033            };
4034
4035            if let Some(user) = pool.get_random_user(persona, &mut self.rng) {
4036                return (user.user_id.clone(), user.persona.to_string());
4037            }
4038        }
4039
4040        // Fallback to generic format
4041        if is_automated {
4042            (
4043                format!("BATCH{:04}", self.rng.random_range(1..=20)),
4044                "automated_system".to_string(),
4045            )
4046        } else {
4047            (
4048                format!("USER{:04}", self.rng.random_range(1..=40)),
4049                "senior_accountant".to_string(),
4050            )
4051        }
4052    }
4053
4054    /// Select transaction source based on configuration weights.
4055    #[inline]
4056    fn select_source(&mut self) -> TransactionSource {
4057        let roll: f64 = self.rng.random();
4058        let dist = &self.config.source_distribution;
4059
4060        if roll < dist.manual {
4061            TransactionSource::Manual
4062        } else if roll < dist.manual + dist.automated {
4063            TransactionSource::Automated
4064        } else if roll < dist.manual + dist.automated + dist.recurring {
4065            TransactionSource::Recurring
4066        } else {
4067            TransactionSource::Adjustment
4068        }
4069    }
4070
4071    /// Select a business process based on configuration weights.
4072    #[inline]
4073    /// Map a business process to a SAP-style document type code.
4074    ///
4075    /// - P2P → "KR" (vendor invoice)
4076    /// - O2C → "DR" (customer invoice)
4077    /// - R2R → "SA" (general journal)
4078    /// - H2R → "HR" (HR posting)
4079    /// - A2R → "AA" (asset posting)
4080    /// - others → "SA"
4081    fn document_type_for_process(process: BusinessProcess) -> &'static str {
4082        match process {
4083            BusinessProcess::P2P => "KR",
4084            BusinessProcess::O2C => "DR",
4085            BusinessProcess::R2R => "SA",
4086            BusinessProcess::H2R => "HR",
4087            BusinessProcess::A2R => "AA",
4088            _ => "SA",
4089        }
4090    }
4091
4092    fn select_business_process(&mut self) -> BusinessProcess {
4093        *datasynth_core::utils::weighted_select(&mut self.rng, &self.business_process_weights)
4094    }
4095
4096    /// SOTA-2: draw a rank index in `[0, n)` with `P(rank=i) ∝ 1/(i+1)^ZIPF_ALPHA`
4097    /// from a dedicated stream, so a few low-rank accounts carry most lines (the
4098    /// corpus account-activity Pareto). Returns `None` for an empty/oversized pool
4099    /// so the caller keeps the uniform draw.
4100    #[inline]
4101    fn power_law_index(n: usize, rng: &mut ChaCha8Rng) -> Option<usize> {
4102        if n == 0 || n > ZIPF_CAP {
4103            return None;
4104        }
4105        let total = ZIPF_CUM[n];
4106        let r = rng.random::<f64>() * total;
4107        // smallest k in 1..=n with CUM[k] >= r → 0-based rank k-1
4108        let k = ZIPF_CUM[..=n]
4109            .binary_search_by(|v| v.partial_cmp(&r).unwrap_or(std::cmp::Ordering::Less))
4110            .unwrap_or_else(|e| e);
4111        Some(k.saturating_sub(1).min(n - 1))
4112    }
4113
4114    /// SOTA-2: replace a uniform `Vec<&GLAccount>` pick with a hot-account
4115    /// power-law pick when concentration is on (default). The uniform `.choose`
4116    /// draw on the main `rng` is still consumed by the caller first, so
4117    /// amounts/line-counts/dates stay byte-identical to the legacy stream — only
4118    /// the *selected account* changes. Associated (not `&mut self`) so it borrows
4119    /// only `account_rng`, leaving `coa` free for `all`/`uniform`.
4120    #[inline]
4121    fn concentrate<'a>(
4122        enabled: bool,
4123        rng: &mut ChaCha8Rng,
4124        all: &[&'a GLAccount],
4125        uniform: Option<&'a GLAccount>,
4126    ) -> Option<&'a GLAccount> {
4127        if enabled {
4128            Self::power_law_index(all.len(), rng)
4129                .map(|i| all[i])
4130                .or(uniform)
4131        } else {
4132            uniform
4133        }
4134    }
4135
4136    /// SOTA-8: ensure a `SourcePool` exists for `source` in the sampler (lazy build).
4137    /// One pool per source, persisted across JEs (sampler grows monotonically).
4138    fn ensure_cond_pair_pool(&mut self, source: &str) {
4139        let cfg = &self.config.source_conditional_account_pair;
4140        if !cfg.enabled {
4141            return;
4142        }
4143        if self.cond_pair_sampler.is_none() {
4144            self.cond_pair_sampler = Some(Default::default());
4145        }
4146        let sampler = self
4147            .cond_pair_sampler
4148            .as_mut()
4149            .expect("just-initialised above");
4150        if sampler.pool(source).is_some() {
4151            return;
4152        }
4153        let all_accounts: Vec<String> = self
4154            .coa
4155            .accounts
4156            .iter()
4157            .map(|a| a.account_number.clone())
4158            .collect();
4159        if all_accounts.is_empty() {
4160            return;
4161        }
4162        // Uniform weights here — the existing account-Pareto (account_concentration)
4163        // still applies at the outer fallback level if the per-source pool isn't used.
4164        let weights: Vec<f64> = vec![1.0; all_accounts.len()];
4165        sampler.ensure_pool(
4166            source,
4167            &all_accounts,
4168            &weights,
4169            cfg.accts_per_source_target,
4170            cfg.concentration,
4171            &mut self.cond_pair_rng,
4172        );
4173    }
4174
4175    /// SOTA-8: if the feature is enabled and the current JE has a source with a
4176    /// pool, pick an *account number* from the per-source PMF. Returns an owned
4177    /// `String` so the caller can release the mutable self-borrow before looking
4178    /// up the `GLAccount` in `self.coa`.
4179    #[inline]
4180    fn try_cond_pick_account_number(&mut self) -> Option<String> {
4181        let cfg = &self.config.source_conditional_account_pair;
4182        if !cfg.enabled {
4183            return None;
4184        }
4185        let src = self.current_je_source.clone()?;
4186        self.ensure_cond_pair_pool(&src);
4187        let sampler = self.cond_pair_sampler.as_ref()?;
4188        let pool = sampler.pool(&src)?;
4189        Some(pool.sample_one(&mut self.cond_pair_rng).to_string())
4190    }
4191
4192    #[inline]
4193    fn select_debit_account(&mut self) -> &GLAccount {
4194        // SOTA-8 source-conditional pick when feature is enabled.
4195        if let Some(acct_num) = self.try_cond_pick_account_number() {
4196            if let Some(a) = self
4197                .coa
4198                .accounts
4199                .iter()
4200                .find(|a| a.account_number == acct_num)
4201            {
4202                return a;
4203            }
4204            // Sampler chose an account not in CoA (defensive fall-through).
4205        }
4206        let accounts = self.coa.get_accounts_by_type(AccountType::Asset);
4207        let expense_accounts = self.coa.get_accounts_by_type(AccountType::Expense);
4208
4209        // 60% asset, 40% expense for debits
4210        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4211            accounts
4212        } else {
4213            expense_accounts
4214        };
4215
4216        let uniform = all.choose(&mut self.rng).copied();
4217        let enabled = self.config.account_concentration.unwrap_or(true);
4218        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4219            tracing::warn!(
4220                "Account selection returned empty list, falling back to first COA account"
4221            );
4222            &self.coa.accounts[0]
4223        })
4224    }
4225
4226    #[inline]
4227    fn select_credit_account(&mut self) -> &GLAccount {
4228        // SOTA-8 source-conditional pick when feature is enabled.
4229        if let Some(acct_num) = self.try_cond_pick_account_number() {
4230            if let Some(a) = self
4231                .coa
4232                .accounts
4233                .iter()
4234                .find(|a| a.account_number == acct_num)
4235            {
4236                return a;
4237            }
4238        }
4239        let liability_accounts = self.coa.get_accounts_by_type(AccountType::Liability);
4240        let revenue_accounts = self.coa.get_accounts_by_type(AccountType::Revenue);
4241
4242        // 60% liability, 40% revenue for credits
4243        let all: Vec<_> = if self.rng.random::<f64>() < 0.6 {
4244            liability_accounts
4245        } else {
4246            revenue_accounts
4247        };
4248
4249        let uniform = all.choose(&mut self.rng).copied();
4250        let enabled = self.config.account_concentration.unwrap_or(true);
4251        Self::concentrate(enabled, &mut self.account_rng, &all, uniform).unwrap_or_else(|| {
4252            tracing::warn!(
4253                "Account selection returned empty list, falling back to first COA account"
4254            );
4255            &self.coa.accounts[0]
4256        })
4257    }
4258}
4259
4260impl Generator for JournalEntryGenerator {
4261    type Item = JournalEntry;
4262    type Config = (
4263        TransactionConfig,
4264        Arc<ChartOfAccounts>,
4265        Vec<String>,
4266        NaiveDate,
4267        NaiveDate,
4268    );
4269
4270    fn new(config: Self::Config, seed: u64) -> Self {
4271        Self::new_with_params(config.0, config.1, config.2, config.3, config.4, seed)
4272    }
4273
4274    fn generate_one(&mut self) -> Self::Item {
4275        self.generate()
4276    }
4277
4278    fn reset(&mut self) {
4279        self.rng = seeded_rng(self.seed, 0);
4280        self.source_mix_rng = seeded_rng(self.seed, 50_063);
4281        self.template_rng = seeded_rng(self.seed, 70_081);
4282        self.recurring_archetypes.clear();
4283        self.reversal_rng = seeded_rng(self.seed, 90_017);
4284        self.reversal_buffer.clear();
4285        self.account_rng = seeded_rng(self.seed, 60_071);
4286        self.allocation_rng = seeded_rng(self.seed, 80_023);
4287        self.fx_rng = seeded_rng(self.seed, 70_093);
4288        self.line_sampler.reset(self.seed + 1);
4289        self.amount_sampler.reset(self.seed + 2);
4290        self.temporal_sampler.reset(self.seed + 3);
4291        if let Some(ref mut adv) = self.advanced_amount_sampler {
4292            adv.reset(self.seed + 2);
4293        }
4294        self.count = 0;
4295        self.uuid_factory.reset();
4296
4297        // Reset reference generator by recreating it
4298        let mut ref_gen = ReferenceGenerator::new(
4299            self.start_date.year(),
4300            self.companies
4301                .first()
4302                .map(std::string::String::as_str)
4303                .unwrap_or("1000"),
4304        );
4305        ref_gen.set_prefix(
4306            ReferenceType::Invoice,
4307            &self.template_config.references.invoice_prefix,
4308        );
4309        ref_gen.set_prefix(
4310            ReferenceType::PurchaseOrder,
4311            &self.template_config.references.po_prefix,
4312        );
4313        ref_gen.set_prefix(
4314            ReferenceType::SalesOrder,
4315            &self.template_config.references.so_prefix,
4316        );
4317        self.reference_generator = ref_gen;
4318    }
4319
4320    fn count(&self) -> u64 {
4321        self.count
4322    }
4323
4324    fn seed(&self) -> u64 {
4325        self.seed
4326    }
4327}
4328
4329use datasynth_core::traits::ParallelGenerator;
4330
4331impl ParallelGenerator for JournalEntryGenerator {
4332    /// Split this generator into `parts` independent sub-generators.
4333    ///
4334    /// Each sub-generator gets a deterministic seed derived from the parent seed
4335    /// and its partition index, plus a partitioned UUID factory to avoid contention.
4336    /// The results are deterministic for a given partition count.
4337    fn split(self, parts: usize) -> Vec<Self> {
4338        let parts = parts.max(1);
4339        (0..parts)
4340            .map(|i| {
4341                // Derive a unique seed per partition using a golden-ratio constant
4342                let sub_seed = self
4343                    .seed
4344                    .wrapping_add((i as u64).wrapping_mul(0x9E3779B97F4A7C15));
4345
4346                let mut gen = JournalEntryGenerator::new_with_full_config(
4347                    self.config.clone(),
4348                    Arc::clone(&self.coa),
4349                    self.companies.clone(),
4350                    self.start_date,
4351                    self.end_date,
4352                    sub_seed,
4353                    self.template_config.clone(),
4354                    self.user_pool.clone(),
4355                );
4356
4357                // Copy over configuration state
4358                gen.company_selector = self.company_selector.clone();
4359                gen.vendor_pool = self.vendor_pool.clone();
4360                gen.customer_pool = self.customer_pool.clone();
4361                gen.material_pool = self.material_pool.clone();
4362                // v5.9.0: master-data pools so sub-generators emit
4363                // CC/PC values that join back to the corresponding
4364                // masters (without these clones, parallel workers
4365                // fell back to the hardcoded `COST_CENTER_POOL` const
4366                // and the legacy `PC-{COMP}-{P2P|O2C|...}` derivation).
4367                gen.cost_center_pool = self.cost_center_pool.clone();
4368                gen.profit_center_pool = self.profit_center_pool.clone();
4369                gen.using_real_master_data = self.using_real_master_data;
4370                gen.fraud_config = self.fraud_config.clone();
4371                gen.persona_errors_enabled = self.persona_errors_enabled;
4372                // P2: per-entity functional currency must survive the parallel
4373                // split (else workers fall back to the USD default).
4374                gen.company_currencies = self.company_currencies.clone();
4375                // Preserve the batching kill-switch across the split too.
4376                gen.batching_enabled = self.batching_enabled;
4377                gen.approval_enabled = self.approval_enabled;
4378                gen.approval_threshold = self.approval_threshold;
4379                gen.sod_violation_rate = self.sod_violation_rate;
4380                // v3.4.0+: advanced amount sampler (mixture / Pareto /
4381                // Gaussian). Clone and reset the internal RNG with the
4382                // partition's sub_seed so each worker explores a unique
4383                // subsequence without repeating the parent stream.
4384                if let Some(mut adv) = self.advanced_amount_sampler.clone() {
4385                    adv.reset(sub_seed.wrapping_add(2));
4386                    gen.advanced_amount_sampler = Some(adv);
4387                }
4388                // v3.5.3+: conditional amount override — clone + reset
4389                // so each partition gets a fresh deterministic stream.
4390                if let Some(mut cond) = self.conditional_amount_override.clone() {
4391                    cond.reset(sub_seed.wrapping_add(17));
4392                    gen.conditional_amount_override = Some(cond);
4393                }
4394                // v3.5.4+: copula sampler — clone + reset per partition.
4395                if let Some(mut cop) = self.correlation_copula.clone() {
4396                    cop.reset(sub_seed.wrapping_add(31));
4397                    gen.correlation_copula = Some(cop);
4398                }
4399
4400                // Use partitioned UUID factory to eliminate atomic contention
4401                gen.uuid_factory = DeterministicUuidFactory::for_partition(
4402                    sub_seed,
4403                    GeneratorType::JournalEntry,
4404                    i as u8,
4405                );
4406
4407                // Copy temporal patterns if configured
4408                if let Some(ref config) = self.temporal_patterns_config {
4409                    gen.temporal_patterns_config = Some(config.clone());
4410                    // Rebuild business day calculator from the stored config
4411                    if config.business_days.enabled {
4412                        if let Some(ref bdc) = self.business_day_calculator {
4413                            gen.business_day_calculator = Some(bdc.clone());
4414                        }
4415                    }
4416                    // Rebuild processing lag calculator with partition seed
4417                    if config.processing_lags.enabled {
4418                        let lag_config =
4419                            Self::convert_processing_lag_config(&config.processing_lags);
4420                        gen.processing_lag_calculator =
4421                            Some(ProcessingLagCalculator::with_config(sub_seed, lag_config));
4422                    }
4423                }
4424
4425                // Copy drift controller if present
4426                if let Some(ref dc) = self.drift_controller {
4427                    gen.drift_controller = Some(dc.clone());
4428                }
4429
4430                // SP3: share Arc-wrapped priors with all sub-generators.
4431                // Clone is O(1) — increments the reference count only.
4432                gen.loaded_priors = self.loaded_priors.clone();
4433
4434                // SP3.4: each partition starts with a fresh calibrator so
4435                // observations are partition-local (avoids cross-partition
4436                // state contamination).  Target rates and window size are
4437                // cloned from the parent; accumulated state is not.
4438                if let Some(ref cal) = self.velocity_calibrator {
4439                    let mut fresh = crate::velocity_calibrator::VelocityCalibrator::new(
4440                        cal.target_trigger_rates.clone(),
4441                        cal.n_lines_between_calibrations,
4442                    );
4443                    fresh.current_values = cal.current_values.clone();
4444                    gen.velocity_calibrator = Some(fresh);
4445                }
4446
4447                gen
4448            })
4449            .collect()
4450    }
4451}
4452
4453#[cfg(test)]
4454mod tests {
4455    use super::*;
4456    use crate::ChartOfAccountsGenerator;
4457
4458    #[test]
4459    fn test_generate_balanced_entries() {
4460        let mut coa_gen =
4461            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4462        let coa = Arc::new(coa_gen.generate());
4463
4464        let mut je_gen = JournalEntryGenerator::new_with_params(
4465            TransactionConfig::default(),
4466            coa,
4467            vec!["1000".to_string()],
4468            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4469            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4470            42,
4471        );
4472
4473        let mut balanced_count = 0;
4474        for _ in 0..100 {
4475            let entry = je_gen.generate();
4476
4477            // Skip entries with human errors as they may be intentionally unbalanced
4478            let has_human_error = entry
4479                .header
4480                .header_text
4481                .as_ref()
4482                .map(|t| t.contains("[HUMAN_ERROR:"))
4483                .unwrap_or(false);
4484
4485            if !has_human_error {
4486                assert!(
4487                    entry.is_balanced(),
4488                    "Entry {:?} is not balanced",
4489                    entry.header.document_id
4490                );
4491                balanced_count += 1;
4492            }
4493            assert!(entry.line_count() >= 2, "Entry has fewer than 2 lines");
4494        }
4495
4496        // Ensure most entries are balanced (human errors are rare)
4497        assert!(
4498            balanced_count >= 80,
4499            "Expected at least 80 balanced entries, got {}",
4500            balanced_count
4501        );
4502    }
4503
4504    #[test]
4505    fn test_deterministic_generation() {
4506        let mut coa_gen =
4507            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4508        let coa = Arc::new(coa_gen.generate());
4509
4510        let mut gen1 = JournalEntryGenerator::new_with_params(
4511            TransactionConfig::default(),
4512            Arc::clone(&coa),
4513            vec!["1000".to_string()],
4514            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4515            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4516            42,
4517        );
4518
4519        let mut gen2 = JournalEntryGenerator::new_with_params(
4520            TransactionConfig::default(),
4521            coa,
4522            vec!["1000".to_string()],
4523            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4524            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4525            42,
4526        );
4527
4528        for _ in 0..50 {
4529            let e1 = gen1.generate();
4530            let e2 = gen2.generate();
4531            assert_eq!(e1.header.document_id, e2.header.document_id);
4532            assert_eq!(e1.total_debit(), e2.total_debit());
4533        }
4534    }
4535
4536    #[test]
4537    fn test_templates_generate_descriptions() {
4538        let mut coa_gen =
4539            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4540        let coa = Arc::new(coa_gen.generate());
4541
4542        // Enable all template features
4543        let template_config = TemplateConfig {
4544            names: datasynth_config::schema::NameTemplateConfig {
4545                generate_realistic_names: true,
4546                email_domain: "test.com".to_string(),
4547                culture_distribution: datasynth_config::schema::CultureDistribution::default(),
4548            },
4549            descriptions: datasynth_config::schema::DescriptionTemplateConfig {
4550                generate_header_text: true,
4551                generate_line_text: true,
4552            },
4553            references: datasynth_config::schema::ReferenceTemplateConfig {
4554                generate_references: true,
4555                invoice_prefix: "TEST-INV".to_string(),
4556                po_prefix: "TEST-PO".to_string(),
4557                so_prefix: "TEST-SO".to_string(),
4558            },
4559            path: None,
4560            merge_strategy: datasynth_config::TemplateMergeStrategy::default(),
4561        };
4562
4563        let mut je_gen = JournalEntryGenerator::new_with_full_config(
4564            TransactionConfig::default(),
4565            coa,
4566            vec!["1000".to_string()],
4567            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4568            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4569            42,
4570            template_config,
4571            None,
4572        )
4573        .with_persona_errors(false); // Disable for template testing
4574                                     // #43 fixed: batched entries now populate header_text/reference via the
4575                                     // shared apply_header_text_and_reference helper, so batching stays ON
4576                                     // here — this test exercises both the primary and batched paths.
4577
4578        for _ in 0..10 {
4579            let entry = je_gen.generate();
4580
4581            // Verify header text is populated
4582            assert!(
4583                entry.header.header_text.is_some(),
4584                "Header text should be populated"
4585            );
4586
4587            // Verify reference is populated
4588            assert!(
4589                entry.header.reference.is_some(),
4590                "Reference should be populated"
4591            );
4592
4593            // Verify business process is set
4594            assert!(
4595                entry.header.business_process.is_some(),
4596                "Business process should be set"
4597            );
4598
4599            // Verify line text is populated
4600            for line in &entry.lines {
4601                assert!(line.line_text.is_some(), "Line text should be populated");
4602            }
4603
4604            // Entry should still be balanced
4605            assert!(entry.is_balanced());
4606        }
4607    }
4608
4609    #[test]
4610    fn test_user_pool_integration() {
4611        let mut coa_gen =
4612            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4613        let coa = Arc::new(coa_gen.generate());
4614
4615        let companies = vec!["1000".to_string()];
4616
4617        // Generate user pool
4618        let mut user_gen = crate::UserGenerator::new(42);
4619        let user_pool = user_gen.generate_standard(&companies);
4620
4621        let mut je_gen = JournalEntryGenerator::new_with_full_config(
4622            TransactionConfig::default(),
4623            coa,
4624            companies,
4625            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4626            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4627            42,
4628            TemplateConfig::default(),
4629            Some(user_pool),
4630        );
4631
4632        // Generate entries and verify user IDs are from pool
4633        for _ in 0..20 {
4634            let entry = je_gen.generate();
4635
4636            // User ID should not be generic BATCH/USER format when pool is used
4637            // (though it may still fall back if random selection misses)
4638            assert!(!entry.header.created_by.is_empty());
4639        }
4640    }
4641
4642    #[test]
4643    fn test_master_data_connection() {
4644        let mut coa_gen =
4645            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4646        let coa = Arc::new(coa_gen.generate());
4647
4648        // Create test vendors
4649        let vendors = vec![
4650            Vendor::new("V-TEST-001", "Test Vendor Alpha", VendorType::Supplier),
4651            Vendor::new("V-TEST-002", "Test Vendor Beta", VendorType::Technology),
4652        ];
4653
4654        // Create test customers
4655        let customers = vec![
4656            Customer::new("C-TEST-001", "Test Customer One", CustomerType::Corporate),
4657            Customer::new(
4658                "C-TEST-002",
4659                "Test Customer Two",
4660                CustomerType::SmallBusiness,
4661            ),
4662        ];
4663
4664        // Create test materials
4665        let materials = vec![Material::new(
4666            "MAT-TEST-001",
4667            "Test Material A",
4668            MaterialType::RawMaterial,
4669        )];
4670
4671        // Create generator with master data
4672        let generator = JournalEntryGenerator::new_with_params(
4673            TransactionConfig::default(),
4674            coa,
4675            vec!["1000".to_string()],
4676            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4677            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4678            42,
4679        );
4680
4681        // Without master data
4682        assert!(!generator.is_using_real_master_data());
4683
4684        // Connect master data
4685        let generator_with_data = generator
4686            .with_vendors(&vendors)
4687            .with_customers(&customers)
4688            .with_materials(&materials);
4689
4690        // Should now be using real master data
4691        assert!(generator_with_data.is_using_real_master_data());
4692    }
4693
4694    #[test]
4695    fn test_with_master_data_convenience_method() {
4696        let mut coa_gen =
4697            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4698        let coa = Arc::new(coa_gen.generate());
4699
4700        let vendors = vec![Vendor::new("V-001", "Vendor One", VendorType::Supplier)];
4701        let customers = vec![Customer::new(
4702            "C-001",
4703            "Customer One",
4704            CustomerType::Corporate,
4705        )];
4706        let materials = vec![Material::new(
4707            "MAT-001",
4708            "Material One",
4709            MaterialType::RawMaterial,
4710        )];
4711
4712        let generator = JournalEntryGenerator::new_with_params(
4713            TransactionConfig::default(),
4714            coa,
4715            vec!["1000".to_string()],
4716            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4717            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4718            42,
4719        )
4720        .with_master_data(&vendors, &customers, &materials);
4721
4722        assert!(generator.is_using_real_master_data());
4723    }
4724
4725    #[test]
4726    fn test_stress_factors_increase_error_rate() {
4727        let mut coa_gen =
4728            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4729        let coa = Arc::new(coa_gen.generate());
4730
4731        let generator = JournalEntryGenerator::new_with_params(
4732            TransactionConfig::default(),
4733            coa,
4734            vec!["1000".to_string()],
4735            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4736            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4737            42,
4738        );
4739
4740        let base_rate = 0.1;
4741
4742        // Regular day - no stress factors
4743        let regular_day = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(); // Mid-June Wednesday
4744        let regular_rate = generator.apply_stress_factors(base_rate, regular_day);
4745        assert!(
4746            (regular_rate - base_rate).abs() < 0.01,
4747            "Regular day should have minimal stress factor adjustment"
4748        );
4749
4750        // Month end - 50% more errors
4751        let month_end = NaiveDate::from_ymd_opt(2024, 6, 29).unwrap(); // June 29 (Saturday)
4752        let month_end_rate = generator.apply_stress_factors(base_rate, month_end);
4753        assert!(
4754            month_end_rate > regular_rate,
4755            "Month end should have higher error rate than regular day"
4756        );
4757
4758        // Year end - double the error rate
4759        let year_end = NaiveDate::from_ymd_opt(2024, 12, 30).unwrap(); // December 30
4760        let year_end_rate = generator.apply_stress_factors(base_rate, year_end);
4761        assert!(
4762            year_end_rate > month_end_rate,
4763            "Year end should have highest error rate"
4764        );
4765
4766        // Friday stress
4767        let friday = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap(); // Friday
4768        let friday_rate = generator.apply_stress_factors(base_rate, friday);
4769        assert!(
4770            friday_rate > regular_rate,
4771            "Friday should have higher error rate than mid-week"
4772        );
4773
4774        // Monday stress
4775        let monday = NaiveDate::from_ymd_opt(2024, 6, 17).unwrap(); // Monday
4776        let monday_rate = generator.apply_stress_factors(base_rate, monday);
4777        assert!(
4778            monday_rate > regular_rate,
4779            "Monday should have higher error rate than mid-week"
4780        );
4781    }
4782
4783    #[test]
4784    fn test_batching_produces_similar_entries() {
4785        let mut coa_gen =
4786            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4787        let coa = Arc::new(coa_gen.generate());
4788
4789        // Use seed 123 which is more likely to trigger batching
4790        let mut je_gen = JournalEntryGenerator::new_with_params(
4791            TransactionConfig::default(),
4792            coa,
4793            vec!["1000".to_string()],
4794            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4795            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4796            123,
4797        )
4798        .with_persona_errors(false); // Disable to ensure balanced entries
4799
4800        // Generate many entries - at 15% batch rate, should see some batches
4801        let entries: Vec<JournalEntry> = (0..200).map(|_| je_gen.generate()).collect();
4802
4803        // Check that all entries are balanced (batched or not)
4804        for entry in &entries {
4805            assert!(
4806                entry.is_balanced(),
4807                "All entries including batched should be balanced"
4808            );
4809        }
4810
4811        // Count entries with same-day posting dates (batch indicator)
4812        let mut date_counts: std::collections::HashMap<NaiveDate, usize> =
4813            std::collections::HashMap::new();
4814        for entry in &entries {
4815            *date_counts.entry(entry.header.posting_date).or_insert(0) += 1;
4816        }
4817
4818        // With batching, some dates should have multiple entries
4819        let dates_with_multiple = date_counts.values().filter(|&&c| c > 1).count();
4820        assert!(
4821            dates_with_multiple > 0,
4822            "With batching, should see some dates with multiple entries"
4823        );
4824    }
4825
4826    #[test]
4827    fn test_temporal_patterns_business_days() {
4828        use datasynth_config::schema::{
4829            BusinessDaySchemaConfig, CalendarSchemaConfig, TemporalPatternsConfig,
4830        };
4831
4832        let mut coa_gen =
4833            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4834        let coa = Arc::new(coa_gen.generate());
4835
4836        // Create temporal patterns config with business days enabled
4837        let temporal_config = TemporalPatternsConfig {
4838            enabled: true,
4839            business_days: BusinessDaySchemaConfig {
4840                enabled: true,
4841                ..Default::default()
4842            },
4843            calendars: CalendarSchemaConfig {
4844                regions: vec!["US".to_string()],
4845                custom_holidays: vec![],
4846            },
4847            ..Default::default()
4848        };
4849
4850        let mut je_gen = JournalEntryGenerator::new_with_params(
4851            TransactionConfig::default(),
4852            coa,
4853            vec!["1000".to_string()],
4854            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4855            NaiveDate::from_ymd_opt(2024, 3, 31).unwrap(), // Q1 2024
4856            42,
4857        )
4858        .with_temporal_patterns(temporal_config, 42)
4859        .with_persona_errors(false);
4860
4861        // Generate entries and verify none fall on weekends
4862        let entries: Vec<JournalEntry> = (0..100).map(|_| je_gen.generate()).collect();
4863
4864        for entry in &entries {
4865            let weekday = entry.header.posting_date.weekday();
4866            assert!(
4867                weekday != chrono::Weekday::Sat && weekday != chrono::Weekday::Sun,
4868                "Posting date {:?} should not be a weekend",
4869                entry.header.posting_date
4870            );
4871        }
4872    }
4873
4874    #[test]
4875    fn test_default_generation_filters_weekends() {
4876        // Verify that weekend entries are <5% even when temporal_patterns is NOT enabled.
4877        // This tests the fix where new_with_full_config always creates a default
4878        // BusinessDayCalculator with US holidays as a fallback.
4879        let mut coa_gen =
4880            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4881        let coa = Arc::new(coa_gen.generate());
4882
4883        let mut je_gen = JournalEntryGenerator::new_with_params(
4884            TransactionConfig::default(),
4885            coa,
4886            vec!["1000".to_string()],
4887            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4888            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4889            42,
4890        )
4891        .with_persona_errors(false);
4892
4893        let total = 500;
4894        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4895
4896        let weekend_count = entries
4897            .iter()
4898            .filter(|e| {
4899                let wd = e.header.posting_date.weekday();
4900                wd == chrono::Weekday::Sat || wd == chrono::Weekday::Sun
4901            })
4902            .count();
4903
4904        let weekend_pct = weekend_count as f64 / total as f64;
4905        assert!(
4906            weekend_pct < 0.05,
4907            "Expected weekend entries <5% of total without temporal_patterns enabled, \
4908             but got {:.1}% ({}/{})",
4909            weekend_pct * 100.0,
4910            weekend_count,
4911            total
4912        );
4913    }
4914
4915    #[test]
4916    fn test_document_type_derived_from_business_process() {
4917        let mut coa_gen =
4918            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4919        let coa = Arc::new(coa_gen.generate());
4920
4921        let mut je_gen = JournalEntryGenerator::new_with_params(
4922            TransactionConfig::default(),
4923            coa,
4924            vec!["1000".to_string()],
4925            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4926            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4927            99,
4928        )
4929        .with_persona_errors(false)
4930        .with_batching(false);
4931
4932        let total = 200;
4933        let mut doc_types = std::collections::HashSet::new();
4934        let mut sa_count = 0_usize;
4935
4936        for _ in 0..total {
4937            let entry = je_gen.generate();
4938            let dt = &entry.header.document_type;
4939            doc_types.insert(dt.clone());
4940            if dt == "SA" {
4941                sa_count += 1;
4942            }
4943        }
4944
4945        // Should have more than 3 distinct document types
4946        assert!(
4947            doc_types.len() > 3,
4948            "Expected >3 distinct document types, got {} ({:?})",
4949            doc_types.len(),
4950            doc_types,
4951        );
4952
4953        // "SA" should be less than 50% (R2R is 20% of the weight)
4954        let sa_pct = sa_count as f64 / total as f64;
4955        assert!(
4956            sa_pct < 0.50,
4957            "Expected SA <50%, got {:.1}% ({}/{})",
4958            sa_pct * 100.0,
4959            sa_count,
4960            total,
4961        );
4962    }
4963
4964    #[test]
4965    fn test_enrich_line_items_account_description() {
4966        let mut coa_gen =
4967            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
4968        let coa = Arc::new(coa_gen.generate());
4969
4970        let mut je_gen = JournalEntryGenerator::new_with_params(
4971            TransactionConfig::default(),
4972            coa,
4973            vec!["1000".to_string()],
4974            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
4975            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
4976            42,
4977        )
4978        .with_persona_errors(false);
4979
4980        let total = 200;
4981        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
4982
4983        // Count lines with account_description populated
4984        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
4985        let lines_with_desc: usize = entries
4986            .iter()
4987            .flat_map(|e| &e.lines)
4988            .filter(|l| l.account_description.is_some())
4989            .count();
4990
4991        let desc_pct = lines_with_desc as f64 / total_lines as f64;
4992        assert!(
4993            desc_pct > 0.95,
4994            "Expected >95% of lines to have account_description, got {:.1}% ({}/{})",
4995            desc_pct * 100.0,
4996            lines_with_desc,
4997            total_lines,
4998        );
4999    }
5000
5001    #[test]
5002    fn test_enrich_line_items_cost_center_for_expense_accounts() {
5003        let mut coa_gen =
5004            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5005        let coa = Arc::new(coa_gen.generate());
5006
5007        let mut je_gen = JournalEntryGenerator::new_with_params(
5008            TransactionConfig::default(),
5009            coa,
5010            vec!["1000".to_string()],
5011            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5012            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5013            42,
5014        )
5015        .with_persona_errors(false);
5016
5017        let total = 300;
5018        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
5019
5020        // Count expense account lines (5xxx/6xxx) with cost_center populated
5021        let expense_lines: Vec<&JournalEntryLine> = entries
5022            .iter()
5023            .flat_map(|e| &e.lines)
5024            .filter(|l| {
5025                let first = l.gl_account.chars().next().unwrap_or('0');
5026                first == '5' || first == '6'
5027            })
5028            .collect();
5029
5030        if !expense_lines.is_empty() {
5031            let with_cc = expense_lines
5032                .iter()
5033                .filter(|l| l.cost_center.is_some())
5034                .count();
5035            let cc_pct = with_cc as f64 / expense_lines.len() as f64;
5036            assert!(
5037                cc_pct > 0.80,
5038                "Expected >80% of expense lines to have cost_center, got {:.1}% ({}/{})",
5039                cc_pct * 100.0,
5040                with_cc,
5041                expense_lines.len(),
5042            );
5043        }
5044    }
5045
5046    #[test]
5047    fn test_enrich_line_items_profit_center_and_line_text() {
5048        let mut coa_gen =
5049            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5050        let coa = Arc::new(coa_gen.generate());
5051
5052        let mut je_gen = JournalEntryGenerator::new_with_params(
5053            TransactionConfig::default(),
5054            coa,
5055            vec!["1000".to_string()],
5056            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5057            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5058            42,
5059        )
5060        .with_persona_errors(false);
5061        // #43 fixed: batched entries now populate header_text (-> line_text via
5062        // enrich_line_items' fallback), so batching stays ON here.
5063
5064        let total = 100;
5065        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
5066
5067        let total_lines: usize = entries.iter().map(|e| e.lines.len()).sum();
5068
5069        // All lines should have profit_center
5070        let with_pc = entries
5071            .iter()
5072            .flat_map(|e| &e.lines)
5073            .filter(|l| l.profit_center.is_some())
5074            .count();
5075        let pc_pct = with_pc as f64 / total_lines as f64;
5076        assert!(
5077            pc_pct > 0.95,
5078            "Expected >95% of lines to have profit_center, got {:.1}% ({}/{})",
5079            pc_pct * 100.0,
5080            with_pc,
5081            total_lines,
5082        );
5083
5084        // All lines should have line_text (either from template or header fallback)
5085        let with_text = entries
5086            .iter()
5087            .flat_map(|e| &e.lines)
5088            .filter(|l| l.line_text.is_some())
5089            .count();
5090        let text_pct = with_text as f64 / total_lines as f64;
5091        assert!(
5092            text_pct > 0.95,
5093            "Expected >95% of lines to have line_text, got {:.1}% ({}/{})",
5094            text_pct * 100.0,
5095            with_text,
5096            total_lines,
5097        );
5098    }
5099
5100    // --- ISA 240 audit flag tests ---
5101
5102    #[test]
5103    fn test_je_has_audit_flags() {
5104        let mut coa_gen =
5105            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5106        let coa = Arc::new(coa_gen.generate());
5107
5108        let mut je_gen = JournalEntryGenerator::new_with_params(
5109            TransactionConfig::default(),
5110            coa,
5111            vec!["1000".to_string()],
5112            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5113            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5114            42,
5115        )
5116        .with_persona_errors(false);
5117
5118        for _ in 0..100 {
5119            let entry = je_gen.generate();
5120
5121            // source_system should always be non-empty
5122            assert!(
5123                !entry.header.source_system.is_empty(),
5124                "source_system should be populated, got empty string"
5125            );
5126
5127            // created_by should always be non-empty (already tested elsewhere, but confirm)
5128            assert!(
5129                !entry.header.created_by.is_empty(),
5130                "created_by should be populated"
5131            );
5132
5133            // created_date should always be populated
5134            assert!(
5135                entry.header.created_date.is_some(),
5136                "created_date should be populated"
5137            );
5138        }
5139    }
5140
5141    #[test]
5142    fn test_manual_entry_rate() {
5143        let mut coa_gen =
5144            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5145        let coa = Arc::new(coa_gen.generate());
5146
5147        let mut je_gen = JournalEntryGenerator::new_with_params(
5148            TransactionConfig::default(),
5149            coa,
5150            vec!["1000".to_string()],
5151            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5152            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5153            42,
5154        )
5155        .with_persona_errors(false)
5156        .with_batching(false);
5157
5158        let total = 1000;
5159        let entries: Vec<JournalEntry> = (0..total).map(|_| je_gen.generate()).collect();
5160
5161        let manual_count = entries.iter().filter(|e| e.header.is_manual).count();
5162        let manual_rate = manual_count as f64 / total as f64;
5163
5164        // Default source_distribution.manual is typically around 0.05-0.15
5165        // Allow a wide tolerance for statistical variation
5166        assert!(
5167            manual_rate > 0.01 && manual_rate < 0.50,
5168            "Manual entry rate should be reasonable (1%-50%), got {:.1}% ({}/{})",
5169            manual_rate * 100.0,
5170            manual_count,
5171            total,
5172        );
5173
5174        // is_manual should match TransactionSource::Manual
5175        for entry in &entries {
5176            let source_is_manual = entry.header.source == TransactionSource::Manual;
5177            assert_eq!(
5178                entry.header.is_manual, source_is_manual,
5179                "is_manual should match source == Manual"
5180            );
5181        }
5182    }
5183
5184    #[test]
5185    fn test_manual_source_consistency() {
5186        let mut coa_gen =
5187            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5188        let coa = Arc::new(coa_gen.generate());
5189
5190        let mut je_gen = JournalEntryGenerator::new_with_params(
5191            TransactionConfig::default(),
5192            coa,
5193            vec!["1000".to_string()],
5194            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5195            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5196            42,
5197        )
5198        .with_persona_errors(false)
5199        .with_batching(false);
5200
5201        for _ in 0..500 {
5202            let entry = je_gen.generate();
5203
5204            if entry.header.is_manual {
5205                // Manual entries must have a source_system in the
5206                // `manual/...` or `spreadsheet/...` family (the bare
5207                // legacy `manual` and `spreadsheet` values are also
5208                // accepted to keep older fixtures working).
5209                let s = entry.header.source_system.as_str();
5210                assert!(
5211                    s == "manual"
5212                        || s == "spreadsheet"
5213                        || s.starts_with("manual/")
5214                        || s.starts_with("spreadsheet/"),
5215                    "Manual entry should have source_system in `manual` / `spreadsheet` family, got '{s}'",
5216                );
5217            } else {
5218                // Non-manual entries must NOT be in the manual/spreadsheet family.
5219                let s = entry.header.source_system.as_str();
5220                assert!(
5221                    !(s == "manual"
5222                        || s == "spreadsheet"
5223                        || s.starts_with("manual/")
5224                        || s.starts_with("spreadsheet/")),
5225                    "Non-manual entry should not be in `manual` / `spreadsheet` family, got '{s}'",
5226                );
5227            }
5228        }
5229    }
5230
5231    #[test]
5232    fn test_default_source_codes_breadth() {
5233        // T2-D Lever 1: with no industry priors and the default config, the
5234        // `source` column carries a broad generic SAP doc-type mix
5235        // (sap_source_code populated) instead of collapsing to the
5236        // TransactionSource enum. See experiments/ml/FINDINGS.md §6.
5237        let mut coa_gen =
5238            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 7);
5239        let coa = Arc::new(coa_gen.generate());
5240        let mut je_gen = JournalEntryGenerator::new_with_params(
5241            TransactionConfig::default(),
5242            coa,
5243            vec!["1000".to_string()],
5244            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5245            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5246            7,
5247        )
5248        .with_persona_errors(false)
5249        .with_batching(false);
5250
5251        let mut codes = std::collections::HashSet::new();
5252        for _ in 0..500 {
5253            let e = je_gen.generate();
5254            let code = e
5255                .header
5256                .sap_source_code
5257                .expect("default config should populate sap_source_code");
5258            codes.insert(code);
5259        }
5260        assert!(
5261            codes.len() >= 10,
5262            "default source-mix should be broad (>=10 distinct codes), got {}",
5263            codes.len()
5264        );
5265    }
5266
5267    #[test]
5268    fn test_source_codes_opt_out() {
5269        // synthetic_source_codes = Some(false) restores the legacy behaviour:
5270        // sap_source_code stays None and `source` falls back to the enum.
5271        let mut coa_gen =
5272            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 9);
5273        let coa = Arc::new(coa_gen.generate());
5274        let cfg = TransactionConfig {
5275            synthetic_source_codes: Some(false),
5276            ..TransactionConfig::default()
5277        };
5278        let mut je_gen = JournalEntryGenerator::new_with_params(
5279            cfg,
5280            coa,
5281            vec!["1000".to_string()],
5282            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5283            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5284            9,
5285        )
5286        .with_persona_errors(false)
5287        .with_batching(false);
5288        for _ in 0..50 {
5289            let e = je_gen.generate();
5290            assert!(
5291                e.header.sap_source_code.is_none(),
5292                "opt-out should leave sap_source_code None (legacy enum source)"
5293            );
5294        }
5295    }
5296
5297    #[test]
5298    fn test_recurring_templates_reuse_archetypes() {
5299        // SOTA-1: with templating on (default), generated JEs reuse account
5300        // archetypes (far fewer distinct than the legacy uniform-per-line
5301        // selection), and balance is preserved either way.
5302        fn run(recurring: Option<bool>) -> (usize, usize, bool) {
5303            let mut coa_gen = ChartOfAccountsGenerator::new(
5304                CoAComplexity::Medium,
5305                IndustrySector::Manufacturing,
5306                11,
5307            );
5308            let coa = Arc::new(coa_gen.generate());
5309            let cfg = TransactionConfig {
5310                recurring_templates: recurring,
5311                ..TransactionConfig::default()
5312            };
5313            let mut g = JournalEntryGenerator::new_with_params(
5314                cfg,
5315                coa,
5316                vec!["1000".to_string()],
5317                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5318                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5319                11,
5320            )
5321            .with_persona_errors(false)
5322            .with_batching(false);
5323            let n = 800;
5324            let mut arche = std::collections::HashSet::new();
5325            let mut balanced = true;
5326            for _ in 0..n {
5327                let e = g.generate();
5328                if !e.is_balanced() {
5329                    balanced = false;
5330                }
5331                let mut sig: Vec<(String, bool)> = e
5332                    .lines
5333                    .iter()
5334                    .map(|l| (l.gl_account.clone(), l.debit_amount > Decimal::ZERO))
5335                    .collect();
5336                sig.sort();
5337                arche.insert(sig);
5338            }
5339            (n, arche.len(), balanced)
5340        }
5341        let (n, distinct_on, bal_on) = run(Some(true));
5342        let (_, distinct_off, bal_off) = run(Some(false));
5343        assert!(bal_on && bal_off, "balance preserved in both modes");
5344        assert!(
5345            distinct_on < distinct_off,
5346            "templating should reduce distinct archetypes ({distinct_on} on vs {distinct_off} off)"
5347        );
5348        assert!(
5349            distinct_on * 2 < n,
5350            "templating should reuse heavily: {distinct_on} distinct archetypes over {n} JEs"
5351        );
5352    }
5353
5354    #[test]
5355    fn test_reversal_process_emits_balanced_reversals() {
5356        // SOTA-5: with reversal_rate > 0, some JEs are balanced reversals of
5357        // earlier ones (header_text "Reversal of ..."); rate 0.0 emits none.
5358        fn run(rate: Option<f64>) -> (usize, bool) {
5359            let mut coa_gen = ChartOfAccountsGenerator::new(
5360                CoAComplexity::Small,
5361                IndustrySector::Manufacturing,
5362                13,
5363            );
5364            let coa = Arc::new(coa_gen.generate());
5365            let cfg = TransactionConfig {
5366                reversal_rate: rate,
5367                ..TransactionConfig::default()
5368            };
5369            let mut g = JournalEntryGenerator::new_with_params(
5370                cfg,
5371                coa,
5372                vec!["1000".to_string()],
5373                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5374                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5375                13,
5376            )
5377            .with_persona_errors(false)
5378            .with_batching(false);
5379            let mut reversals = 0;
5380            let mut balanced = true;
5381            for _ in 0..1000 {
5382                let e = g.generate();
5383                if !e.is_balanced() {
5384                    balanced = false;
5385                }
5386                if e.header
5387                    .header_text
5388                    .as_deref()
5389                    .is_some_and(|t| t.starts_with("Reversal of"))
5390                {
5391                    reversals += 1;
5392                }
5393            }
5394            (reversals, balanced)
5395        }
5396        let (rev_on, bal_on) = run(Some(0.05));
5397        let (rev_off, bal_off) = run(Some(0.0));
5398        assert!(bal_on && bal_off, "all entries balanced incl. reversals");
5399        assert_eq!(rev_off, 0, "rate 0.0 emits no reversals, got {rev_off}");
5400        assert!(rev_on > 0, "rate 0.05 should emit reversals, got {rev_on}");
5401    }
5402
5403    #[test]
5404    fn test_account_concentration_creates_pareto() {
5405        // SOTA-2: with concentration on (default), a hot subset of accounts
5406        // carries most lines (the corpus account-activity Pareto, top-10% ≈ 95%)
5407        // vs the legacy near-uniform pool draw. Templating + reversals are held
5408        // off so the only difference between the two runs is the power-law pick.
5409        fn run(concentration: Option<bool>) -> (f64, bool) {
5410            let mut coa_gen = ChartOfAccountsGenerator::new(
5411                CoAComplexity::Medium,
5412                IndustrySector::Manufacturing,
5413                17,
5414            );
5415            let coa = Arc::new(coa_gen.generate());
5416            let cfg = TransactionConfig {
5417                account_concentration: concentration,
5418                recurring_templates: Some(false),
5419                reversal_rate: Some(0.0),
5420                ..TransactionConfig::default()
5421            };
5422            let mut g = JournalEntryGenerator::new_with_params(
5423                cfg,
5424                coa,
5425                vec!["1000".to_string()],
5426                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5427                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5428                17,
5429            )
5430            .with_persona_errors(false)
5431            .with_batching(false);
5432            let mut counts: std::collections::HashMap<String, usize> =
5433                std::collections::HashMap::new();
5434            let mut total_lines = 0usize;
5435            let mut balanced = true;
5436            for _ in 0..1000 {
5437                let e = g.generate();
5438                if !e.is_balanced() {
5439                    balanced = false;
5440                }
5441                for l in &e.lines {
5442                    *counts.entry(l.gl_account.clone()).or_default() += 1;
5443                    total_lines += 1;
5444                }
5445            }
5446            // share of lines carried by the top-10% most-active accounts (the
5447            // corpus_structure "acct top10%" metric, over active accounts).
5448            let mut v: Vec<usize> = counts.values().copied().collect();
5449            v.sort_unstable_by(|a, b| b.cmp(a));
5450            let top_k = ((v.len() as f64 * 0.10).ceil() as usize).max(1);
5451            let top_share = v.iter().take(top_k).sum::<usize>() as f64 / total_lines as f64;
5452            (top_share, balanced)
5453        }
5454        let (share_on, bal_on) = run(Some(true));
5455        let (share_off, bal_off) = run(Some(false));
5456        assert!(bal_on && bal_off, "balance preserved in both modes");
5457        assert!(
5458            share_on > share_off + 0.20,
5459            "concentration should raise the top-10% line share ({share_on:.3} on vs {share_off:.3} off)"
5460        );
5461        assert!(
5462            share_on > 0.50,
5463            "hot accounts should dominate: top-10% line share {share_on:.3}"
5464        );
5465    }
5466
5467    #[test]
5468    fn test_allocation_batch_emits_large_balanced_postings() {
5469        // SOTA-6: with allocation_batch_rate > 0, some JEs are large 1-to-many
5470        // allocation batches (source "AB", many cost-center-spread lines, still
5471        // balanced); rate 0.0 emits none. Reversals are disabled to isolate the
5472        // allocation process (which shares the recent-JE buffer).
5473        fn run(rate: Option<f64>) -> (usize, bool, usize) {
5474            let mut coa_gen = ChartOfAccountsGenerator::new(
5475                CoAComplexity::Small,
5476                IndustrySector::Manufacturing,
5477                23,
5478            );
5479            let coa = Arc::new(coa_gen.generate());
5480            let cfg = TransactionConfig {
5481                allocation_batch_rate: rate,
5482                reversal_rate: Some(0.0),
5483                ..TransactionConfig::default()
5484            };
5485            let mut g = JournalEntryGenerator::new_with_params(
5486                cfg,
5487                coa,
5488                vec!["1000".to_string()],
5489                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5490                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5491                23,
5492            )
5493            .with_persona_errors(false)
5494            .with_batching(false);
5495            let mut batches = 0usize;
5496            let mut balanced = true;
5497            let mut max_distinct_cc = 0usize;
5498            for _ in 0..2000 {
5499                let e = g.generate();
5500                if !e.is_balanced() {
5501                    balanced = false;
5502                }
5503                if e.header.sap_source_code.as_deref() == Some("AB") {
5504                    batches += 1;
5505                    assert!(
5506                        e.lines.len() >= ALLOCATION_MIN_TARGETS as usize,
5507                        "allocation batch should be large, got {} lines",
5508                        e.lines.len()
5509                    );
5510                    let ccs: std::collections::HashSet<String> = e
5511                        .lines
5512                        .iter()
5513                        .filter_map(|l| l.cost_center.clone())
5514                        .collect();
5515                    max_distinct_cc = max_distinct_cc.max(ccs.len());
5516                }
5517            }
5518            (batches, balanced, max_distinct_cc)
5519        }
5520        let (on, bal_on, cc) = run(Some(0.10));
5521        let (off, bal_off, _) = run(Some(0.0));
5522        assert!(
5523            bal_on && bal_off,
5524            "all entries balanced incl. allocation batches"
5525        );
5526        assert_eq!(off, 0, "rate 0.0 emits no allocation batches, got {off}");
5527        assert!(on > 0, "rate 0.10 should emit allocation batches, got {on}");
5528        assert!(
5529            cc > 1,
5530            "allocation should spread across multiple cost centers, got {cc}"
5531        );
5532    }
5533
5534    #[test]
5535    fn test_derived_id_processes_keep_document_ids_unique() {
5536        // SOTA-5/6 regression: reversals and allocation batches mint derived ids
5537        // (`base ^ salt`). Reusing the same base would duplicate an id — the
5538        // failure `test_document_reference_integrity` caught. With both processes
5539        // at high rates, every emitted document id must still be unique.
5540        let mut coa_gen =
5541            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 31);
5542        let coa = Arc::new(coa_gen.generate());
5543        let cfg = TransactionConfig {
5544            reversal_rate: Some(0.15),
5545            allocation_batch_rate: Some(0.10),
5546            ..TransactionConfig::default()
5547        };
5548        let mut g = JournalEntryGenerator::new_with_params(
5549            cfg,
5550            coa,
5551            vec!["1000".to_string()],
5552            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5553            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5554            31,
5555        )
5556        .with_persona_errors(false)
5557        .with_batching(false);
5558        let mut ids = std::collections::HashSet::new();
5559        let n = 3000;
5560        for _ in 0..n {
5561            let e = g.generate();
5562            assert!(
5563                ids.insert(e.header.document_id),
5564                "duplicate document id {} (derived-id collision)",
5565                e.header.document_id
5566            );
5567        }
5568        assert_eq!(ids.len(), n, "all {n} document ids unique");
5569    }
5570
5571    #[test]
5572    fn test_business_unit_rolls_up_from_cost_center() {
5573        // SOTA-3: with the dimension on (default), a line that has a cost center
5574        // (or, as fallback, a profit center) also carries a business_unit that is
5575        // a deterministic roll-up of that dimension (same value → same BU, in
5576        // BU01..BU11); with it off, BU is empty.
5577        fn run(enabled: Option<bool>) -> (usize, usize, bool, bool) {
5578            let mut coa_gen = ChartOfAccountsGenerator::new(
5579                CoAComplexity::Medium,
5580                IndustrySector::Manufacturing,
5581                19,
5582            );
5583            let coa = Arc::new(coa_gen.generate());
5584            let cfg = TransactionConfig {
5585                business_unit_dimension: enabled,
5586                ..TransactionConfig::default()
5587            };
5588            let mut g = JournalEntryGenerator::new_with_params(
5589                cfg,
5590                coa,
5591                vec!["1000".to_string()],
5592                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5593                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5594                19,
5595            )
5596            .with_persona_errors(false)
5597            .with_batching(false);
5598            let mut dim_lines = 0usize;
5599            let mut bu_lines = 0usize;
5600            let mut consistent = true; // BU present ⇒ matches the roll-up of its CC/PC
5601            let mut well_formed = true; // BU in BU01..BU11
5602            let mut dim_to_bu: std::collections::HashMap<String, String> =
5603                std::collections::HashMap::new();
5604            for _ in 0..600 {
5605                let e = g.generate();
5606                for l in &e.lines {
5607                    // BU rolls up from the cost center, or profit center as fallback.
5608                    let dim = l.cost_center.as_deref().or(l.profit_center.as_deref());
5609                    if dim.is_some() {
5610                        dim_lines += 1;
5611                    }
5612                    if let Some(bu) = &l.business_unit {
5613                        bu_lines += 1;
5614                        let d = dim.unwrap_or_default().to_string();
5615                        if bu != &JournalEntryGenerator::business_unit_for_dimension(&d) {
5616                            consistent = false;
5617                        }
5618                        // stable mapping across the run
5619                        if dim_to_bu
5620                            .insert(d, bu.clone())
5621                            .is_some_and(|prev| &prev != bu)
5622                        {
5623                            consistent = false;
5624                        }
5625                        let n_ok = bu.strip_prefix("BU").and_then(|d| d.parse::<u32>().ok());
5626                        if !matches!(n_ok, Some(1..=11)) {
5627                            well_formed = false;
5628                        }
5629                    }
5630                }
5631            }
5632            (dim_lines, bu_lines, consistent, well_formed)
5633        }
5634        let (dim_on, bu_on, consistent, well_formed) = run(Some(true));
5635        let (_, bu_off, _, _) = run(Some(false));
5636        assert!(
5637            dim_on > 0 && bu_on > 0,
5638            "BU should be populated where CC/PC is"
5639        );
5640        assert_eq!(
5641            dim_on, bu_on,
5642            "every CC/PC-bearing line gets a BU ({dim_on} dim vs {bu_on} BU)"
5643        );
5644        assert!(
5645            consistent,
5646            "BU must be the deterministic roll-up of its CC/PC"
5647        );
5648        assert!(well_formed, "BU codes must be BU01..BU11");
5649        assert_eq!(bu_off, 0, "dimension off ⇒ no business_unit, got {bu_off}");
5650    }
5651
5652    #[test]
5653    fn test_foreign_currency_sap_style() {
5654        // SOTA-4: with foreign_currency_rate > 0, some JEs post in a foreign
5655        // document currency. The ledger amounts (debit/credit) stay company
5656        // currency and the JE still balances; the foreign value lands in
5657        // transaction_amount and balances in the transaction currency too. rate
5658        // 0.0 → all company-currency. Reversals/allocations off to isolate.
5659        fn run(rate: Option<f64>) -> (usize, bool, bool) {
5660            let mut coa_gen = ChartOfAccountsGenerator::new(
5661                CoAComplexity::Small,
5662                IndustrySector::Manufacturing,
5663                29,
5664            );
5665            let coa = Arc::new(coa_gen.generate());
5666            let cfg = TransactionConfig {
5667                foreign_currency_rate: rate,
5668                reversal_rate: Some(0.0),
5669                allocation_batch_rate: Some(0.0),
5670                ..TransactionConfig::default()
5671            };
5672            let mut g = JournalEntryGenerator::new_with_params(
5673                cfg,
5674                coa,
5675                vec!["1000".to_string()],
5676                NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5677                NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5678                29,
5679            )
5680            .with_persona_errors(false)
5681            .with_batching(false);
5682            let mut foreign = 0usize;
5683            let mut ledger_ok = true; // debit == credit (company ledger)
5684            let mut txn_ok = true; // foreign lines carry transaction_amount + balance in txn ccy
5685            for _ in 0..1500 {
5686                let e = g.generate();
5687                if !e.is_balanced() {
5688                    ledger_ok = false;
5689                }
5690                if e.header.currency != "USD" {
5691                    foreign += 1;
5692                    if !e.lines.iter().all(|l| l.transaction_amount.is_some()) {
5693                        txn_ok = false;
5694                    }
5695                    let td: Decimal = e
5696                        .lines
5697                        .iter()
5698                        .filter(|l| l.debit_amount > Decimal::ZERO)
5699                        .filter_map(|l| l.transaction_amount)
5700                        .sum();
5701                    let tc: Decimal = e
5702                        .lines
5703                        .iter()
5704                        .filter(|l| l.credit_amount > Decimal::ZERO)
5705                        .filter_map(|l| l.transaction_amount)
5706                        .sum();
5707                    // tolerate per-line cent rounding (≤ n_lines half-cents)
5708                    let tol = Decimal::new(e.lines.len() as i64, 2);
5709                    if (td - tc).abs() > tol {
5710                        txn_ok = false;
5711                    }
5712                }
5713            }
5714            (foreign, ledger_ok, txn_ok)
5715        }
5716        let (fon, lbal_on, tbal_on) = run(Some(0.20));
5717        let (foff, lbal_off, _) = run(Some(0.0));
5718        assert!(
5719            lbal_on && lbal_off,
5720            "ledger balance (debit==credit) preserved in both modes"
5721        );
5722        assert!(
5723            fon > 0,
5724            "rate 0.20 should produce foreign-currency JEs, got {fon}"
5725        );
5726        assert_eq!(foff, 0, "rate 0.0 ⇒ no foreign JEs, got {foff}");
5727        assert!(
5728            tbal_on,
5729            "foreign JEs carry transaction_amount + balance in the transaction currency"
5730        );
5731    }
5732
5733    #[test]
5734    fn test_created_date_before_posting() {
5735        let mut coa_gen =
5736            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5737        let coa = Arc::new(coa_gen.generate());
5738
5739        let mut je_gen = JournalEntryGenerator::new_with_params(
5740            TransactionConfig::default(),
5741            coa,
5742            vec!["1000".to_string()],
5743            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5744            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5745            42,
5746        )
5747        .with_persona_errors(false);
5748
5749        for _ in 0..500 {
5750            let entry = je_gen.generate();
5751
5752            if let Some(created_date) = entry.header.created_date {
5753                let created_naive_date = created_date.date();
5754                assert!(
5755                    created_naive_date <= entry.header.posting_date,
5756                    "created_date ({}) should be <= posting_date ({})",
5757                    created_naive_date,
5758                    entry.header.posting_date,
5759                );
5760            }
5761        }
5762    }
5763
5764    /// SP3.5b — verify that `apply_calibration_step` mutates the generator's
5765    /// amount_sampler when a `"amounts.lognormal_sigma"` step is applied, and
5766    /// that `"amounts.round_dollar_share"` likewise updates the probability.
5767    #[test]
5768    fn apply_calibration_step_updates_lognormal_sigma() {
5769        let mut coa_gen =
5770            ChartOfAccountsGenerator::new(CoAComplexity::Small, IndustrySector::Manufacturing, 42);
5771        let coa = Arc::new(coa_gen.generate());
5772
5773        let mut gen = JournalEntryGenerator::new_with_params(
5774            TransactionConfig::default(),
5775            coa,
5776            vec!["1000".to_string()],
5777            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
5778            NaiveDate::from_ymd_opt(2024, 12, 31).unwrap(),
5779            42,
5780        );
5781
5782        let baseline_sigma = gen.amount_sampler.lognormal_sigma();
5783
5784        let step_sigma = crate::velocity_calibrator::CalibrationStep {
5785            rule_id: "R6".to_string(),
5786            parameter: "amounts.lognormal_sigma".to_string(),
5787            delta: 0.01,
5788            new_value: baseline_sigma + 0.01,
5789        };
5790        gen.apply_calibration_step(&step_sigma);
5791        assert!(
5792            (gen.amount_sampler.lognormal_sigma() - (baseline_sigma + 0.01)).abs() < 1e-9,
5793            "lognormal_sigma should be updated to {}",
5794            baseline_sigma + 0.01
5795        );
5796
5797        let baseline_round = gen.amount_sampler.round_number_probability();
5798        let step_round = crate::velocity_calibrator::CalibrationStep {
5799            rule_id: "R9".to_string(),
5800            parameter: "amounts.round_dollar_share".to_string(),
5801            delta: -0.005,
5802            new_value: (baseline_round - 0.005).max(0.0),
5803        };
5804        gen.apply_calibration_step(&step_round);
5805        let expected = (baseline_round - 0.005).max(0.0).clamp(0.0, 1.0);
5806        assert!(
5807            (gen.amount_sampler.round_number_probability() - expected).abs() < 1e-9,
5808            "round_number_probability should be updated to {}",
5809            expected
5810        );
5811    }
5812
5813    #[test]
5814    fn master_data_resolver_fills_every_pii_kind() {
5815        use datasynth_core::distributions::text_taxonomy::{
5816            PiiPlaceholderKind, PlaceholderResolver,
5817        };
5818        let mut r = MasterDataResolver {
5819            companies: vec!["Acme AG".to_string()],
5820            persons: vec!["Hans Muster".to_string()],
5821            streets: vec!["Hauptstrasse 1".to_string()],
5822            patients: vec!["Patient X".to_string()],
5823        };
5824        let mut rng = rand::rng();
5825        assert_eq!(r.resolve(PiiPlaceholderKind::Company, &mut rng), "Acme AG");
5826        assert_eq!(
5827            r.resolve(PiiPlaceholderKind::Person, &mut rng),
5828            "Hans Muster"
5829        );
5830        assert_eq!(
5831            r.resolve(PiiPlaceholderKind::Street, &mut rng),
5832            "Hauptstrasse 1"
5833        );
5834        assert_eq!(
5835            r.resolve(PiiPlaceholderKind::Patient, &mut rng),
5836            "Patient X"
5837        );
5838    }
5839
5840    #[test]
5841    fn master_data_resolver_empty_pool_falls_back() {
5842        use datasynth_core::distributions::text_taxonomy::{
5843            PiiPlaceholderKind, PlaceholderResolver,
5844        };
5845        let mut r = MasterDataResolver::default();
5846        let mut rng = rand::rng();
5847        let v = r.resolve(PiiPlaceholderKind::Company, &mut rng);
5848        assert!(!v.is_empty());
5849    }
5850
5851    /// Pin the shape invariant on `synthetic_patient_pool`: each entry, once
5852    /// filled into the canonical `*{patient} G:{date}…` template the corpus
5853    /// DZ/RG/RS classes use, must not introduce a *structural* residual-PII
5854    /// shape. Regression guard for the JE_79-class smoke failure: the old pool
5855    /// (`"B. Muster"`, `"A. Beispiel"`, …) shaped each fill as
5856    /// `<initial>. <surname>` which `RE_INITIAL_SURNAME` flags.
5857    ///
5858    /// NB: the `given_name` pattern is deliberately EXCLUDED here. These are
5859    /// synthetic *fill* values that are name-shaped by design (they fill
5860    /// `{patient}`); `given_name` is a template-scan signal for un-tokenized
5861    /// corpus names, not a check on legitimate synthetic output.
5862    #[test]
5863    fn synthetic_patient_pool_entries_pass_residual_scan() {
5864        use datasynth_core::distributions::text_taxonomy::PlaceholderGrammar;
5865        for name in synthetic_patient_pool("de_CH") {
5866            let filled = format!("*{name} G:2024-01-15 E:2024-01-20 A:2024-02-01");
5867            let structural: Vec<_> = PlaceholderGrammar::residual_pii_scan(&filled)
5868                .into_iter()
5869                .filter(|h| h.pattern != "given_name")
5870                .collect();
5871            assert!(
5872                structural.is_empty(),
5873                "synthetic patient name {name:?} fills to PII-shaped {filled:?}: {structural:?}"
5874            );
5875        }
5876    }
5877
5878    #[test]
5879    fn master_data_resolver_fallbacks_are_non_empty_and_placeholder_free() {
5880        use datasynth_core::distributions::text_taxonomy::{
5881            PiiPlaceholderKind, PlaceholderResolver,
5882        };
5883        // Verify fallback constants for every kind are non-empty and contain
5884        // no `{…}` literal placeholders (the resolver must never leak the
5885        // unfilled placeholder token into emitted text).
5886        let mut r = MasterDataResolver::default();
5887        let mut rng = rand::rng();
5888        for kind in [
5889            PiiPlaceholderKind::Company,
5890            PiiPlaceholderKind::Person,
5891            PiiPlaceholderKind::Street,
5892            PiiPlaceholderKind::Patient,
5893        ] {
5894            let v = r.resolve(kind, &mut rng);
5895            assert!(!v.is_empty(), "fallback for {kind:?} must be non-empty");
5896            assert!(
5897                !v.contains('{'),
5898                "fallback for {kind:?} must not contain a placeholder token"
5899            );
5900        }
5901    }
5902}