datasynth_core/distributions/
behavioral_priors.rs

1//! Behavioral priors mined from corpus GL data (SP2).
2//!
3//! These types are defined in `datasynth-core` (not `datasynth-fingerprint`) so
4//! that `datasynth-generators` can consume them without creating a package cycle.
5//! `datasynth-fingerprint` re-exports them from here.
6//!
7//! Spec: `docs/superpowers/specs/2026-05-12-sp2-real-world-prior-extraction-design.md`
8
9use std::collections::BTreeMap;
10
11use rand::RngExt;
12use serde::{Deserialize, Serialize};
13
14use super::text_taxonomy::TextTaxonomyPrior;
15
16// ---------------------------------------------------------------------------
17// EmpiricalCdf
18// ---------------------------------------------------------------------------
19
/// Empirical CDF representation used by IET and lag priors.
///
/// Stored as two parallel vectors: `values[i]` is a knot and
/// `probabilities[i]` is the cumulative probability attached to it.
/// [`EmpiricalCdf::from_sorted_values`] always produces vectors of equal
/// length, but both fields are public (and deserialisable), so a hand-built
/// instance is not guaranteed to uphold that.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct EmpiricalCdf {
    /// Column / source name (informational).
    pub column: String,
    /// Sorted quantile values (after privacy processing).
    pub values: Vec<f64>,
    /// Cumulative probabilities matching `values` (monotone in [0, 1]).
    pub probabilities: Vec<f64>,
}
30
31impl EmpiricalCdf {
32    /// Build an empirical CDF from pre-sorted values.
33    pub fn from_sorted_values(column: impl Into<String>, values: Vec<f64>) -> Self {
34        let n = values.len();
35        let probabilities: Vec<f64> = (1..=n).map(|i| i as f64 / n as f64).collect();
36        Self {
37            column: column.into(),
38            values,
39            probabilities,
40        }
41    }
42
43    /// Evaluate CDF at a value (linear interpolation between knots).
44    pub fn cdf(&self, x: f64) -> f64 {
45        match self.values.binary_search_by(|v| v.total_cmp(&x)) {
46            Ok(i) => self.probabilities[i],
47            Err(i) => {
48                if i == 0 {
49                    0.0
50                } else if i >= self.values.len() {
51                    1.0
52                } else {
53                    let (x0, x1) = (self.values[i - 1], self.values[i]);
54                    let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
55                    p0 + (p1 - p0) * (x - x0) / (x1 - x0)
56                }
57            }
58        }
59    }
60
61    /// Evaluate inverse CDF (quantile function) at a probability.
62    pub fn quantile(&self, p: f64) -> f64 {
63        if p <= 0.0 {
64            return *self.values.first().unwrap_or(&0.0);
65        }
66        if p >= 1.0 {
67            return *self.values.last().unwrap_or(&0.0);
68        }
69        match self.probabilities.binary_search_by(|v| v.total_cmp(&p)) {
70            Ok(i) => self.values[i],
71            Err(i) => {
72                if i == 0 {
73                    self.values[0]
74                } else if i >= self.probabilities.len() {
75                    *self.values.last().unwrap_or(&0.0)
76                } else {
77                    let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
78                    let (x0, x1) = (self.values[i - 1], self.values[i]);
79                    x0 + (x1 - x0) * (p - p0) / (p1 - p0)
80                }
81            }
82        }
83    }
84}
85
86// ---------------------------------------------------------------------------
87// PerSourceAmountPrior (SP4.3)
88// ---------------------------------------------------------------------------
89
/// SP4.3 — Per-(source, gl_prefix) amount distribution parameters.
///
/// Used by the generator to draw amounts conditional on the source code
/// (and optionally the first-4-digit GL account prefix) drawn at line-construction
/// time.  Log-normal mu/sigma per (source, gl_prefix) bucket, with fallback to
/// source-marginal when the specific pair isn't represented.
///
/// Storage as nested map `source → gl_prefix → params` for forward compatibility.
/// The `by_source` marginals serve as the primary fallback when a specific
/// `(source, gl_prefix)` pair is absent or unrecognised.
///
/// This type only stores the two maps; it defines no lookup method of its
/// own, so the fallback order described above is applied by the caller.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAmountPrior {
    /// Keyed by `(source, gl_prefix)`.
    /// Outer key: source code (e.g. "KR").
    /// Inner key: first-4-chars of GL account number (e.g. "0041").
    pub by_source_and_class: BTreeMap<String, BTreeMap<String, LognormalAmount>>,
    /// Source-only marginal fallback when a `(source, gl_prefix)` pair isn't present.
    /// Key: source code.
    pub by_source: BTreeMap<String, LognormalAmount>,
}
109
/// Log-normal amount parameters derived from corpus GL data.
///
/// All values refer to `ln(|amount|)`.  Callers take the absolute value of the
/// raw corpus amount before fitting so that both debit and credit lines are
/// captured in the same distribution; sign/direction is assigned by the JE
/// balancer after sampling.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalAmount {
    /// Mean of `ln(|amount|)`.
    pub mu: f64,
    /// Standard deviation of `ln(|amount|)`.
    /// [`LognormalAmount::sample`] floors this at `1e-6` before building the
    /// distribution, so a stored value of `0.0` is still usable.
    pub sigma: f64,
    /// Number of observations underpinning these parameters.
    /// Parameters are only emitted when `n >= min_observations` (default 10).
    pub n: usize,
    /// Median absolute value of the raw amounts (for sanity / audit).
    /// Not read by [`LognormalAmount::sample`].
    pub median_abs: f64,
}
128
129impl LognormalAmount {
130    /// Draw a positive amount magnitude from `LogNormal(mu, sigma)`.
131    ///
132    /// Returns an `f64` > 0.  The caller decides sign (debit vs credit) and
133    /// rounds to [`rust_decimal::Decimal`].
134    ///
135    /// Falls back to `LogNormal(0.0, 1.0)` when `sigma` is non-positive or
136    /// the parameters are otherwise invalid — avoids panicking in production.
137    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> f64 {
138        use rand_distr::{Distribution, LogNormal};
139        // Clamp sigma to a tiny positive value to avoid `LogNormal::new` error.
140        let sigma = self.sigma.max(1e-6);
141        let dist = LogNormal::new(self.mu, sigma)
142            .unwrap_or_else(|_| LogNormal::new(0.0, 1.0).expect("fallback lognormal"));
143        dist.sample(rng)
144    }
145}
146
147// ---------------------------------------------------------------------------
148// BehavioralPriors and sub-types
149// ---------------------------------------------------------------------------
150
/// Root container for the SP2 behavioral priors.
///
/// The first block of fields is always present in the serialised form.  Every
/// `Option` field below carries `#[serde(default, skip_serializing_if = ...)]`:
/// it is omitted from the output when `None` and deserialises as `None` when
/// the key is absent, which is what lets older bundles keep loading.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BehavioralPriors {
    /// Schema version stored in the bundle.
    /// NOTE(review): presumably written from [`BehavioralPriors::SCHEMA_VERSION`] — confirm.
    pub schema_version: u32,
    /// Version string recorded with the bundle.
    /// NOTE(review): presumably the version of the tool that produced it — confirm.
    pub generator_version: String,
    /// Industry label of the bundle.
    pub industry: String,
    /// Count of client inputs.
    /// NOTE(review): presumably the number of client datasets aggregated — confirm.
    pub n_client_inputs: usize,
    /// Count of rows aggregated.
    /// NOTE(review): presumably GL rows across all client inputs — confirm.
    pub n_rows_aggregated: usize,
    /// Source-code mix; see [`SourceMixPrior`].
    pub source_mix: SourceMixPrior,
    /// Per-source inter-event-time summaries; see [`PerSourceIetPrior`].
    pub per_source_iet: PerSourceIetPrior,
    /// Lines-per-JE histograms; see [`LinesPerJePrior`].
    pub lines_per_je: LinesPerJePrior,
    /// Active-lifetime histograms; see [`ActiveLifetimePrior`].
    /// Superseded by `active_segments` for window placement when that is `Some`.
    pub active_lifetime: ActiveLifetimePrior,
    /// Fan-out histograms per attribute; see [`FanoutPrior`].
    pub fanout: FanoutPrior,
    /// Per-source posting-lag summaries; see [`PostingLagPrior`].  Optional.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub posting_lag: Option<PostingLagPrior>,
    /// SP3.2 — per-Source multi-segment active patterns. Optional, additive.
    /// When `Some`, supersedes `active_lifetime` for window placement.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub active_segments: Option<ActiveSegmentsPrior>,
    /// SP3.3 — entity-cluster prior driving cross-entity motif preservation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub entity_clusters: Option<EntityClustersPrior>,
    /// SP3.7 — per-source conditional attribute priors. Optional, additive.
    /// When `Some`, the generator samples GL account / cost center / profit
    /// center conditioned on the just-drawn source code.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_source_attribute: Option<PerSourceAttributePrior>,
    /// SP3.12 — TP-entity cluster prior. Clusters TradingPartner values by
    /// shared GL/CC/PC attribute sets. The generator uses this to bias TP
    /// selection toward cluster-mates of recently-used TPs (same source),
    /// building triangle structure in the TP-GL co-occurrence graph.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tp_entity_clusters: Option<EntityClustersPrior>,
    /// SP4.2 — CoA semantic content extracted from corpus COA_XXX.parquet
    /// files. When `Some`, the CoA generator enriches account descriptions,
    /// account_class, and account_sub_class with corpus values instead of
    /// generic constants.  Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coa_semantic: Option<CoaSemanticPrior>,
    /// SP4.7 — Per-source reference-string format templates extracted from the
    /// corpus.  When `Some`, the JE generator samples a reference string
    /// from the template distribution rather than using the fixed
    /// `format!("...")` fallback.  Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reference_formats: Option<ReferenceFormatPrior>,
    /// SP6 — corpus text taxonomy. Replaces the old SP4.4 text_templates field. When `Some`,
    /// the generator samples line text keyed on `(source, account-class)`,
    /// header text on `source`, and CoA descriptions per account — all PII-safe
    /// templates filled at generation time. `None` for pre-SP6 bundles.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub text_taxonomy: Option<TextTaxonomyPrior>,
    /// SP4.5 — Per-user behavioral patterns (source mix, hourly density,
    /// weekday density, volume share) mined from corpus GL data.
    /// When `Some` and non-empty, the JE generator biases `created_by` and
    /// `created_at` toward the characteristic patterns of each user.
    ///
    /// Note: the corpus GL files examined for this project carried no
    /// user column; the prior is included as an additive field so future data
    /// deliveries can populate it without any schema changes.  Old bundles
    /// load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_personas: Option<UserPersonaPrior>,
    /// SP4.3 — Per-(source, gl_prefix) amount distribution parameters.
    ///
    /// When `Some`, the JE generator samples the JE total-amount from the
    /// log-normal conditional matching the current source code (and optionally
    /// GL-account prefix), rather than the global `AmountSampler` marginal.
    /// Fraud entries bypass this path to preserve fraud-pattern semantics.
    /// Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_amount_conditionals: Option<PerSourceAmountPrior>,
    /// SP4.6 — Per-(source, line_role) GL account conditional.
    ///
    /// line_role is "DR" (debit-dominant lines) or "CR" (credit-dominant lines).
    /// When `Some`, the generator draws GL accounts respecting the line role so
    /// that expense/asset classes appear on DR sides and liability/revenue classes
    /// appear on CR sides, matching corpus SAP doc-type shapes.
    ///
    /// Sources with no strong role bias (e.g. SA) emit whatever the corpus shows.
    /// Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_role_gl_conditionals: Option<PerSourceRolePrior>,
    /// SP4.1 — Trial-balance anchor prior.  Per-account opening/closing balances
    /// extracted from real `TB_XXX.parquet` files.  When `Some`, the generator's
    /// balance tracker uses these targets to nudge synthetic balances toward a
    /// corpus-shaped distribution via periodic drift-correction entries.
    ///
    /// Old bundles load fine (deserialises as `None`); the balance tracker
    /// continues with its existing free-drift behaviour when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tb_anchor: Option<TbAnchorPrior>,
}
243
impl BehavioralPriors {
    /// Schema version constant for [`BehavioralPriors`] bundles (currently `1`).
    ///
    /// NOTE(review): presumably the value written to, and checked against,
    /// the `schema_version` field — confirm at the load/save call sites.
    pub const SCHEMA_VERSION: u32 = 1;
}
247
/// Weighted mix of source (document-type) codes.
///
/// [`SourceMixPrior::sample`] draws a key of `probabilities` with chance
/// proportional to its weight.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceMixPrior {
    /// Source code → weight.  Weights are relative: `sample` divides by their
    /// sum, so they need not add up to exactly 1.0.
    pub probabilities: BTreeMap<String, f64>,
    /// Not read by `sample`; `sap_default` sets it to `0.0`.
    /// NOTE(review): presumably the mass of sources folded into an "other"
    /// bucket during extraction — confirm against the extractor.
    pub other_fraction: f64,
    /// Not read by `sample`; `sap_default` sets it to `0.0`.
    /// NOTE(review): presumably the minimum share a source needed to be kept
    /// as its own key — confirm against the extractor.
    pub min_threshold: f64,
}
254
255impl SourceMixPrior {
256    /// Draw a source code from the weighted distribution.
257    ///
258    /// Returns a randomly selected key from `probabilities`, weighted by the
259    /// associated probability mass.  Falls back to the first key (or `"SA"`) when
260    /// the map is empty or all weights sum to zero.
261    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> String {
262        if self.probabilities.is_empty() {
263            return "SA".to_string();
264        }
265        let r: f64 = rng.random_range(0.0..1.0);
266        let total: f64 = self.probabilities.values().sum();
267        if total <= 0.0 {
268            return self
269                .probabilities
270                .keys()
271                .next()
272                .cloned()
273                .unwrap_or_else(|| "SA".to_string());
274        }
275        let mut cum = 0.0;
276        for (code, &weight) in &self.probabilities {
277            cum += weight / total;
278            if r <= cum {
279                return code.clone();
280            }
281        }
282        // Floating-point rounding: return the last key.
283        self.probabilities
284            .keys()
285            .next_back()
286            .cloned()
287            .unwrap_or_else(|| "SA".to_string())
288    }
289
290    /// A privacy-safe **generic** SAP document-type source mix for the default
291    /// (no industry-priors) generation path. Standard SAP FI/MM/SD document
292    /// types with hand-chosen, plausible weights — **not** corpus-derived
293    /// probabilities — so the emitted `source` column has realistic breadth
294    /// (~25 codes, entropy ~2.7) instead of collapsing to the coarse
295    /// `TransactionSource` enum (entropy ~0.75). See experiments/ml/FINDINGS.md
296    /// §6 for the source-mix gap that motivates this. Weights are relative;
297    /// `sample` normalises by their sum.
298    pub fn sap_default() -> Self {
299        let head = [
300            ("RV", 0.16),
301            ("KR", 0.12),
302            ("DR", 0.10),
303            ("SA", 0.09),
304            ("DZ", 0.08),
305            ("KZ", 0.07),
306            ("WE", 0.06),
307            ("RE", 0.05),
308            ("DG", 0.04),
309            ("KG", 0.035),
310            ("WA", 0.03),
311            // "AB" (accounting/allocation doc) is intentionally omitted: it is
312            // reserved for the SOTA-6 allocation-batch process so synthetic "AB"
313            // JEs carry the corpus's large lines-per-JE (~52), not a small mix.
314            ("WL", 0.025),
315            ("ZP", 0.02),
316            ("SK", 0.018),
317            ("AF", 0.015),
318            ("AA", 0.012),
319            ("ML", 0.010),
320            ("PR", 0.008),
321            ("RN", 0.007),
322            ("WI", 0.006),
323            ("AN", 0.005),
324            ("UE", 0.004),
325            ("ZV", 0.003),
326            ("EU", 0.002),
327        ];
328        let mut probabilities: BTreeMap<String, f64> =
329            head.into_iter().map(|(k, v)| (k.to_string(), v)).collect();
330
331        // Long tail (Lever 2): a power-law of synthetic SAP custom doc-type
332        // codes (Z-prefixed, the SAP custom-type convention). Synthetic only,
333        // never corpus-derived. The rare tail codes draw few events, which also
334        // lifts the per-source inter-event-time variance (FINDINGS sec.6: IET
335        // variance is coupled to source breadth). Weight is proportional to
336        // 1/rank^1.1, scaled to a fraction of the head's summed mass.
337        //
338        // v5.30 A3 (#150): TAIL_MASS reduced 0.30 → 0.15 to compress the
339        // synthetic-source vocabulary toward the reference shard. v5.29
340        // emitted 526 distinct sources vs reference ~287; this halves the
341        // Z-tail mass, expected to drop synth source-cardinality to ~390-440
342        // and tighten the Sajja P1 IET-distribution gap by ~10-15%. The
343        // trade-off is some IET-variance lift (the SP3 design intent of the
344        // Z-tail); documented in FINDINGS §6 update.
345        const TAIL_N: usize = 500;
346        const TAIL_MASS: f64 = 0.15;
347        let zipf: f64 = (1..=TAIL_N).map(|r| 1.0 / (r as f64).powf(1.1)).sum();
348        for r in 1..=TAIL_N {
349            let w = TAIL_MASS * (1.0 / (r as f64).powf(1.1)) / zipf;
350            probabilities.insert(format!("Z{r:03}"), w);
351        }
352
353        Self {
354            probabilities,
355            other_fraction: 0.0,
356            min_threshold: 0.0,
357        }
358    }
359}
360
/// Inter-event-time (IET) summaries, one per source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceIetPrior {
    /// Source code → IET summary for that source.
    pub by_source: BTreeMap<String, IetSummary>,
}
365
/// Inter-event-time summary for a single source.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct IetSummary {
    /// Observation count.
    /// NOTE(review): presumably the number of inter-event gaps behind the
    /// CDF — confirm against the extractor.
    pub n: usize,
    /// Empirical CDF of the inter-event times; the field name indicates the
    /// unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    /// Optional log-normal fit of the same data; `None` when no fit is stored.
    pub lognormal_fit: Option<LognormalParams>,
    /// Lag-1 autocorrelation.
    /// NOTE(review): going by the name, of consecutive inter-event times — confirm.
    pub lag1_autocorr: f64,
}
373
/// Log-normal parameter pair, used as the type of `IetSummary::lognormal_fit`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalParams {
    /// Location parameter.
    /// NOTE(review): presumably the mean of the log of the fitted quantity — confirm.
    pub mu: f64,
    /// Scale parameter.
    /// NOTE(review): presumably the standard deviation of the log — confirm.
    pub sigma: f64,
}
379
/// Lines-per-journal-entry histograms: one overall and one per source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LinesPerJePrior {
    /// Histogram across all sources.
    pub overall: LineCountHistogram,
    /// Source code → histogram for that source.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    /// NOTE(review): presumably the minimum number of JEs a source needed
    /// before it received its own `by_source` entry — confirm against the
    /// extractor.
    pub min_jes_per_source: usize,
}
386
/// Active-lifetime histograms: one per source code and one overall.
///
/// `BehavioralPriors::active_segments`, when present, supersedes this prior
/// for window placement.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveLifetimePrior {
    /// Source code → histogram for that source.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    /// Histogram across all sources.
    pub overall: LineCountHistogram,
}
392
/// Fan-out histograms keyed by attribute.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct FanoutPrior {
    /// Attribute name → fan-out histogram.
    /// NOTE(review): what exactly is counted per attribute (e.g. distinct
    /// values per entity) is defined by the extractor — confirm there.
    pub by_attribute: BTreeMap<String, LineCountHistogram>,
}
397
/// Posting-lag summaries, one per source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PostingLagPrior {
    /// Source code → lag summary for that source.
    pub by_source: BTreeMap<String, LagSummary>,
}
402
/// Posting-lag summary for a single source.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LagSummary {
    /// Empirical CDF of the lag; the field name indicates the unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    /// Mean lag.  NOTE(review): presumably in days, like the CDF — confirm.
    pub mean: f64,
    /// Standard deviation of the lag.  NOTE(review): presumably in days — confirm.
    pub stddev: f64,
    /// Number of observations behind this summary.
    pub n: usize,
}
410
411// ---------------------------------------------------------------------------
412// ActiveSegmentsPrior (SP3.2)
413// ---------------------------------------------------------------------------
414
/// SP3.2 — Per-Source distribution of active-segment count + length + gap.
///
/// When present on `BehavioralPriors`, this supersedes `active_lifetime` for
/// window placement.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveSegmentsPrior {
    /// Source code → segment summary for that source.
    pub by_source: BTreeMap<String, SourceSegmentSummary>,
}
420
/// Active-segment histograms for a single source: how many segments, how
/// long each one is, and how long the gaps between them are.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceSegmentSummary {
    /// Histogram of the number of active segments.
    /// NOTE(review): presumably bucketed on `SEGMENT_COUNT_BUCKETS` — confirm.
    pub segment_count_histogram: LineCountHistogram,
    /// Histogram of segment lengths.  NOTE(review): unit not shown here — confirm.
    pub segment_length_histogram: LineCountHistogram,
    /// Histogram of gap lengths between segments.
    /// NOTE(review): presumably in days, bucketed on `SEGMENT_GAP_BUCKETS` — confirm.
    pub gap_length_histogram: LineCountHistogram,
}
427
/// Bucket grid for "how many active segments per Source per period".
///
/// Nine ascending grid points from 1 to 24.
/// NOTE(review): whether an entry is the lower or the upper edge of its
/// bucket is decided by the histogram code that consumes this grid — confirm
/// there.
pub const SEGMENT_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 6, 8, 12, 16, 24];
430
/// Bucket grid for "gap between active segments, in days".
///
/// Eight ascending grid points from 1 to 90 days.
/// NOTE(review): whether an entry is the lower or the upper edge of its
/// bucket is decided by the histogram code that consumes this grid — confirm
/// there.
pub const SEGMENT_GAP_BUCKETS: &[u32] = &[1, 2, 3, 7, 14, 30, 60, 90];
433
434// ---------------------------------------------------------------------------
435// EntityClustersPrior (SP3.3)
436// ---------------------------------------------------------------------------
437
/// SP3.3 — Clusters of Sources that share attribute pools heavily.
///
/// The same type is reused for `BehavioralPriors::tp_entity_clusters`
/// (SP3.12), where the clustered entities are TradingPartner values rather
/// than Sources.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityClustersPrior {
    /// The clusters found; each lists its member entities.
    pub clusters: Vec<EntityCluster>,
    /// Fraction of Sources that landed in *some* cluster (vs. isolates).
    pub clustering_rate: f64,
}
445
446// ---------------------------------------------------------------------------
447// PerSourceAttributePrior + CategoricalDistribution (SP3.7)
448// ---------------------------------------------------------------------------
449
/// SP3.7 — Per-source conditional distribution over downstream attributes
/// (GL account, cost center, profit center, ...).  When generation knows
/// the source code, the conditional CDF tells it which attribute values
/// are characteristic of that source.
///
/// Use [`PerSourceAttributePrior::conditional`] to look up one
/// `(source, attribute)` pair.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAttributePrior {
    /// Outer key: source code (e.g. "KR", "RV", "DZ").
    /// Inner key: attribute name ("gl_account", "cost_center", "profit_center").
    /// Value: categorical distribution over the observed attribute values
    ///        conditioned on `(source, attribute)`.
    pub by_source: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
    /// Configured minimum number of observations per (source, attribute) pair
    /// before a conditional is retained. Smaller groups are dropped to avoid
    /// over-fitting low-volume sources.
    /// Stored for reference only: `conditional` does not consult it.
    pub min_observations: usize,
}
466
467impl PerSourceAttributePrior {
468    /// Look up the conditional distribution for `(source, attribute)`.
469    /// Returns `None` if the source or attribute isn't represented in the
470    /// prior (e.g. the source had too few observations during extraction).
471    pub fn conditional(&self, source: &str, attribute: &str) -> Option<&CategoricalDistribution> {
472        self.by_source.get(source)?.get(attribute)
473    }
474}
475
476// ---------------------------------------------------------------------------
477// PerSourceRolePrior (SP4.6)
478// ---------------------------------------------------------------------------
479
/// SP4.6 — Per-(source, line_role) GL account conditional.
///
/// Captures the observation that each SAP document type has a canonical
/// line-role structure:
/// - KR (vendor invoice): DR → expense (5/6xxx),  CR → AP (2xxx)
/// - RV (customer invoice): DR → AR (1xxx),       CR → revenue (4xxx)
/// - DZ (customer payment): DR → Bank (1xxx),     CR → AR (1xxx)
/// - KZ (vendor payment):  DR → AP (2xxx),        CR → Bank (1xxx)
/// - WE (goods receipt):   DR → Inventory (1xxx), CR → GR/IR (2xxx)
/// - SA (manual journal): weak role bias — distribution reflects corpus mix.
///
/// `role` is `"DR"` for debit-dominant lines, `"CR"` for credit-dominant lines.
///
/// Use [`PerSourceRolePrior::conditional`] to look up one `(source, role)`
/// pair.  The role is a plain string key, so lookups are exact-match.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceRolePrior {
    /// `(source → role → distribution over GL account values)`.
    /// Inner role key is `"DR"` or `"CR"`.
    pub by_source_and_role: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
}
498
499impl PerSourceRolePrior {
500    /// Look up the conditional distribution for `(source, role)`.
501    ///
502    /// Returns `None` when the source or role isn't represented (e.g. too few
503    /// observations during extraction, or an old bundle without SP4.6 data).
504    /// Callers fall back to `per_source_attribute` marginal or to a default GL.
505    pub fn conditional(&self, source: &str, role: &str) -> Option<&CategoricalDistribution> {
506        self.by_source_and_role.get(source)?.get(role)
507    }
508}
509
510// ---------------------------------------------------------------------------
511// TbAnchorPrior (SP4.1)
512// ---------------------------------------------------------------------------
513
/// SP4.1 — Trial-balance anchor prior.
///
/// Extracted from real `TB_XXX.parquet` files.  Each entry represents the
/// industry-median opening and closing balance for a GL account number.
/// The generator's balance tracker uses these values in target-aware mode
/// to emit drift-correction entries that keep the synthetic balance sheet
/// shaped like a corpus balance sheet.
///
/// **Schema note**: real TB files carry `Functional Beginning Balance` and
/// `Functional Ending Balance` columns but no explicit period-debit/credit
/// columns — those are derived as the difference between balances.
///
/// [`TbAnchorPrior::has_data`] reports whether any account carries a
/// non-zero balance; it looks only at `per_account`, not at the totals.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbAnchorPrior {
    /// Per-account TB target.  Keyed by GL account number (normalised to
    /// the same format as the rest of the bundle, e.g. leading-zero-padded
    /// 10-digit strings for the current health corpus).
    pub per_account: BTreeMap<String, TbTarget>,
    /// Industry-median total assets (sum of asset-class closing balances).
    /// Used for aggregate drift-tolerance checks.
    pub total_assets: f64,
    /// Industry-median total liabilities (sum of liability-class closing balances).
    pub total_liabilities: f64,
    /// Industry-median total equity (sum of equity-class closing balances).
    pub total_equity: f64,
    /// Number of clients that contributed to this anchor.
    pub n_clients: usize,
}
541
542impl TbAnchorPrior {
543    /// Returns `true` when the prior carries at least one account with a
544    /// non-zero closing balance — i.e. it was built from real TB data.
545    pub fn has_data(&self) -> bool {
546        self.per_account
547            .values()
548            .any(|t| t.closing_balance.abs() > 1e-9 || t.opening_balance.abs() > 1e-9)
549    }
550}
551
/// Target balance for a single GL account, representing the industry-median
/// values extracted from real `TB_XXX.parquet` files.
///
/// This is a plain data record: none of the relationships described on the
/// fields (e.g. `period_net_activity` = closing − opening) are enforced by
/// the type.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbTarget {
    /// Industry-median opening balance for this account.
    pub opening_balance: f64,
    /// Industry-median closing balance for this account.
    pub closing_balance: f64,
    /// Derived period net activity (closing − opening), positive = net debit.
    /// Pre-computed at extraction time for convenience.
    pub period_net_activity: f64,
    /// Standard deviation of opening balance across contributing clients
    /// (in their respective functional currencies, normalised by client-total-assets).
    /// Smaller stdev = tighter target.  Zero when only one client contributed.
    pub opening_stdev: f64,
    /// Standard deviation of closing balance across contributing clients.
    pub closing_stdev: f64,
    /// Number of clients in which this account appeared.
    pub n_clients: usize,
}
572
/// A categorical distribution stored as a sparse `value → probability` map.
/// Used by SP3.7 for per-source attribute conditionals and by SP4.6
/// (`PerSourceRolePrior`) for per-(source, role) GL account conditionals.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CategoricalDistribution {
    /// Value (e.g. an account number) → probability mass.  Probabilities
    /// sum to 1.0 (within float tolerance).
    /// `sample` divides by the actual sum, so a map that is slightly off
    /// 1.0 is still sampled proportionally.
    pub probabilities: BTreeMap<String, f64>,
    /// Total observations underpinning this distribution.
    pub n: usize,
}
583
584impl CategoricalDistribution {
585    /// Construct from raw counts.  Normalises to probabilities.
586    pub fn from_counts(counts: BTreeMap<String, usize>) -> Self {
587        let n: usize = counts.values().sum();
588        if n == 0 {
589            return Self::default();
590        }
591        let probabilities = counts
592            .into_iter()
593            .map(|(k, v)| (k, v as f64 / n as f64))
594            .collect();
595        Self { probabilities, n }
596    }
597
598    /// Draw a value from the distribution.  Returns `None` when the
599    /// distribution is empty (caller falls back to the marginal or to a
600    /// default attribute value).
601    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> Option<String> {
602        if self.probabilities.is_empty() {
603            return None;
604        }
605        let total: f64 = self.probabilities.values().sum();
606        if total <= 0.0 {
607            return None;
608        }
609        let r: f64 = rng.random_range(0.0..1.0);
610        let mut cum = 0.0;
611        for (value, &p) in &self.probabilities {
612            cum += p / total;
613            if r <= cum {
614                return Some(value.clone());
615            }
616        }
617        self.probabilities.keys().next_back().cloned()
618    }
619}
620
/// One cluster inside an [`EntityClustersPrior`].
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityCluster {
    /// Identifiers of the entities in this cluster (Source codes for SP3.3,
    /// TradingPartner values for the SP3.12 TP-entity prior).
    pub members: Vec<String>,
    /// Average Jaccard similarity of the cluster.
    /// NOTE(review): presumably the mean pairwise Jaccard index of the
    /// members' attribute sets — confirm against the clustering code.
    pub avg_jaccard: f64,
}
626
627// ---------------------------------------------------------------------------
628// CoaSemanticPrior (SP4.2)
629// ---------------------------------------------------------------------------
630
/// SP4.2 — Per-account semantic content extracted from real CoA files.
/// Keyed by canonical GL account number (matching values in the per-source
/// attribute conditional). When `Some` on the parent `BehavioralPriors`, the
/// CoA generator emits real account names + descriptions + ISO 21378 hierarchy
/// from this prior instead of generic constants.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CoaSemanticPrior {
    /// GL account number → semantic metadata for that account.
    pub accounts: std::collections::BTreeMap<String, AccountSemantic>,
}
640
/// Semantic metadata for a single GL account, sourced from a real CoA file.
///
/// Only `description` is mandatory.  Every `Option` field is omitted from the
/// serialised form when `None` and deserialises as `None` when its key is
/// absent.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct AccountSemantic {
    /// Human-readable account description (e.g. "Kasse", "Bankkonto CHF 1").
    pub description: String,
    /// Account type as reported in the source file (e.g. "Assets", "Liabilities").
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_type: Option<String>,
    /// ISO 21378 Level-2 class code or label (e.g. "C _ Cash", "01_Cash and cash equivalents").
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class: Option<String>,
    /// Human-readable name for `account_class`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class_name: Option<String>,
    /// ISO 21378 Level-3 sub-class code or label.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class: Option<String>,
    /// Human-readable name for `account_sub_class`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class_name: Option<String>,
    /// Parent account number in hierarchy (if available).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub parent_account: Option<String>,
}
665
666// ---------------------------------------------------------------------------
667// ReferenceFormatPrior (SP4.7)
668// ---------------------------------------------------------------------------
669
/// SP4.7 — Per-source reference-string format templates extracted from real
/// corpus GL data.  Templates use placeholders for digit and alphabetic runs;
/// fixed punctuation / hyphens are preserved verbatim.
///
/// Example templates:
/// - `"{4 digits}-{4 digits}-{10 digits}"` (JE_3 client: all sources)
/// - `"RE-{4 digits}-{6 digits}"` (hypothetical KR/vendor-invoice client)
///
/// Only templates observed ≥ `min_occurrences` times (per client) and the top-10
/// by frequency per source are retained — preventing PII leakage.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ReferenceFormatPrior {
    /// Source code → templates retained for that source.
    /// NOTE(review): the order of the `Vec` is whatever the extractor
    /// emitted (presumably by descending frequency) — confirm before relying
    /// on it.
    pub by_source: BTreeMap<String, Vec<ReferenceTemplate>>,
}
684
685/// A single reference-format template observed in corpus data.
686#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
687pub struct ReferenceTemplate {
688    /// Template string with `{N digits}` / `{N alpha}` placeholders.
689    /// Fixed characters (hyphens, slashes, dots) are preserved verbatim.
690    /// Example: `"{4 digits}-{4 digits}-{10 digits}"`
691    pub template: String,
692    /// Fraction of references for this source that match this template
693    /// (after frequency filtering and renormalisation).
694    pub probability: f64,
695    /// A concrete example value from the corpus (for debugging / audit).
696    pub example: String,
697}
698
699// ---------------------------------------------------------------------------
700// UserPersonaPrior (SP4.5)
701// ---------------------------------------------------------------------------
702
703/// SP4.5 — Per-user behavioral patterns extracted from a corpus.
704///
705/// Each user has a characteristic source mix (AP clerks post mostly KR/KZ,
706/// AR clerks RV/DZ, GL/closing staff SA), an hourly density, and a weekday
707/// distribution.
708///
709/// **Corpus note**: The corpus GL files in this project carry no user column
710/// (all 45 JE parquet files were examined; none had a Created-By field).
711/// Extraction is therefore stubbed — `extract_user_personas` in
712/// `user_extractor.rs` returns an empty `UserPersonaPrior`.  When a future data
713/// delivery includes per-user columns, the extractor can be filled in without
714/// any schema changes here.  Generators gate on `UserPersonaPrior::has_data()`
715/// and fall back to the internal user pool when the prior is empty.
716#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
717pub struct UserPersonaPrior {
718    /// User-ID → behavior.  Empty when no user-column data was available.
719    pub users: BTreeMap<String, UserBehavior>,
720    /// Distribution of distinct active-user count per generation run.
721    /// Populated only when extraction had real data.
722    pub user_count_distribution: LineCountHistogram,
723}
724
725impl UserPersonaPrior {
726    /// Returns `true` when the prior carries at least one user with a non-empty
727    /// source mix and positive volume share (i.e. built from real user-column data).
728    pub fn has_data(&self) -> bool {
729        self.users
730            .values()
731            .any(|u| !u.source_mix.is_empty() && u.volume_share > 0.0)
732    }
733
734    /// Sample a user ID likely to post the given `source` code.
735    ///
736    /// Weights each user by `source_mix[source] × volume_share`.  Returns
737    /// `None` when the prior is empty or no user has a non-zero weight for
738    /// `source` (caller falls back to the generic user pool).
739    pub fn sample_user_for_source<R: rand::Rng>(
740        &self,
741        source: &str,
742        rng: &mut R,
743    ) -> Option<String> {
744        use rand::RngExt;
745        if self.users.is_empty() {
746            return None;
747        }
748        let weights: Vec<(&String, f64)> = self
749            .users
750            .iter()
751            .filter_map(|(uid, beh)| {
752                let mix = beh.source_mix.get(source).copied().unwrap_or(0.0);
753                let w = mix * beh.volume_share;
754                if w > 0.0 {
755                    Some((uid, w))
756                } else {
757                    None
758                }
759            })
760            .collect();
761        if weights.is_empty() {
762            return None;
763        }
764        let total: f64 = weights.iter().map(|(_, w)| *w).sum();
765        if total <= 0.0 {
766            return None;
767        }
768        let r: f64 = rng.random_range(0.0..total);
769        let mut cum = 0.0;
770        for (uid, w) in &weights {
771            cum += w;
772            if r <= cum {
773                return Some((*uid).clone());
774            }
775        }
776        weights.last().map(|(uid, _)| (*uid).clone())
777    }
778
779    /// Given a `user_id`, sample an `(hour, weekday)` pair from the user's
780    /// density arrays.  `hour` ∈ 0..24, `weekday` ∈ 0..7 (Monday = 0).
781    ///
782    /// Returns `None` when the user is unknown or all densities are zero.
783    pub fn sample_timestamp_for_user<R: rand::Rng>(
784        &self,
785        user_id: &str,
786        rng: &mut R,
787    ) -> Option<(u32, u32)> {
788        use rand::RngExt;
789        let beh = self.users.get(user_id)?;
790
791        let hour_total: f64 = beh.hourly_density.iter().sum();
792        if hour_total <= 0.0 {
793            return None;
794        }
795        let r: f64 = rng.random_range(0.0..hour_total);
796        let mut cum = 0.0;
797        let mut hour = 0u32;
798        for (h, &p) in beh.hourly_density.iter().enumerate() {
799            cum += p;
800            if r <= cum {
801                hour = h as u32;
802                break;
803            }
804        }
805
806        let weekday_total: f64 = beh.weekday_density.iter().sum();
807        if weekday_total <= 0.0 {
808            return None;
809        }
810        let r: f64 = rng.random_range(0.0..weekday_total);
811        let mut cum = 0.0;
812        let mut weekday = 0u32;
813        for (d, &p) in beh.weekday_density.iter().enumerate() {
814            cum += p;
815            if r <= cum {
816                weekday = d as u32;
817                break;
818            }
819        }
820
821        Some((hour, weekday))
822    }
823}
824
825/// Behavioral fingerprint for a single corpus user.
826#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
827pub struct UserBehavior {
828    /// Source code → probability mass.  Sums to 1.0 (within float tolerance).
829    /// Example: AP clerk → `{"KR": 0.6, "KZ": 0.3, "RE": 0.1}`.
830    pub source_mix: BTreeMap<String, f64>,
831    /// Hour-of-day (0 = midnight … 23 = 23:xx) → probability mass.
832    /// Sums to 1.0.  All zeros when `created_at` was unavailable.
833    pub hourly_density: [f64; 24],
834    /// Day-of-week (0 = Monday … 6 = Sunday) → probability mass.  Sums to 1.0.
835    pub weekday_density: [f64; 7],
836    /// Fraction of total rows in the industry pool attributed to this user.
837    pub volume_share: f64,
838}
839
840impl Default for UserBehavior {
841    fn default() -> Self {
842        Self {
843            source_mix: BTreeMap::new(),
844            hourly_density: [0.0; 24],
845            weekday_density: [0.0; 7],
846            volume_share: 0.0,
847        }
848    }
849}
850
851// ---------------------------------------------------------------------------
852// LineCountHistogram
853// ---------------------------------------------------------------------------
854
855/// Bucket histogram for counts such as lines-per-JE, fan-out, and active lifetime.
856#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
857pub struct LineCountHistogram {
858    pub buckets: Vec<u32>,
859    pub probabilities: Vec<f64>,
860    pub n: usize,
861}
862
863impl LineCountHistogram {
864    /// Build a histogram on the given inclusive-lower-bound bucket grid.
865    ///
866    /// The bucket grid must be sorted ascending. Values >= `buckets.last()`
867    /// fall into the last bucket. Values < `buckets[0]` are dropped (count
868    /// returned for diagnostic use).
869    pub fn build(values: &[u32], buckets: &[u32]) -> (Self, usize) {
870        assert!(!buckets.is_empty(), "buckets must not be empty");
871        let n_buckets = buckets.len();
872        let mut counts = vec![0u64; n_buckets];
873        let mut dropped = 0usize;
874        for &v in values {
875            if v < buckets[0] {
876                dropped += 1;
877                continue;
878            }
879            let bucket_idx = bucket_index(buckets, v);
880            counts[bucket_idx] += 1;
881        }
882        let total: u64 = counts.iter().sum();
883        let probabilities = if total == 0 {
884            vec![0.0; n_buckets]
885        } else {
886            counts.iter().map(|&c| c as f64 / total as f64).collect()
887        };
888        (
889            Self {
890                buckets: buckets.to_vec(),
891                probabilities,
892                n: values.len(),
893            },
894            dropped,
895        )
896    }
897
898    /// Sum bucket counts of `self` and `other` (same bucket grid) and
899    /// renormalise. Returns `None` if grids mismatch.
900    pub fn pool(&self, other: &Self) -> Option<Self> {
901        if self.buckets != other.buckets {
902            return None;
903        }
904        let total_n = self.n + other.n;
905        if total_n == 0 {
906            return Some(Self {
907                buckets: self.buckets.clone(),
908                probabilities: vec![0.0; self.buckets.len()],
909                n: 0,
910            });
911        }
912        let probabilities: Vec<f64> = self
913            .probabilities
914            .iter()
915            .zip(other.probabilities.iter())
916            .map(|(&pa, &pb)| (pa * self.n as f64 + pb * other.n as f64) / total_n as f64)
917            .collect();
918        Some(Self {
919            buckets: self.buckets.clone(),
920            probabilities,
921            n: total_n,
922        })
923    }
924
925    /// Median bucket — the smallest `buckets[i]` whose cumulative probability >= 0.5.
926    pub fn median_bucket(&self) -> u32 {
927        let mut cum = 0.0;
928        for (i, &p) in self.probabilities.iter().enumerate() {
929            cum += p;
930            if cum >= 0.5 {
931                return self.buckets[i];
932            }
933        }
934        *self.buckets.last().unwrap_or(&0)
935    }
936
937    /// Sample a count from the histogram. Picks a bucket weighted by
938    /// probability mass, then samples uniformly within the bucket (between
939    /// `buckets[i]` inclusive and `buckets[i+1]` exclusive — for the last
940    /// bucket, returns `buckets[i]`).
941    pub fn sample_bucket<R: rand::Rng>(&self, rng: &mut R) -> u32 {
942        if self.buckets.is_empty() {
943            return 0;
944        }
945        let r: f64 = rng.random_range(0.0..1.0);
946        let mut cum = 0.0;
947        let mut chosen_idx = self.buckets.len() - 1;
948        for (i, &p) in self.probabilities.iter().enumerate() {
949            cum += p;
950            if r <= cum {
951                chosen_idx = i;
952                break;
953            }
954        }
955        let lo = self.buckets[chosen_idx];
956        let hi = self.buckets.get(chosen_idx + 1).copied().unwrap_or(lo);
957        if hi <= lo {
958            lo
959        } else {
960            rng.random_range(lo..hi)
961        }
962    }
963}
964
/// Locate the bucket for `v` on an ascending grid of inclusive lower bounds.
///
/// An exact hit on a bound selects that bucket; otherwise the bucket just
/// below the insertion point is used, saturating at index 0.
fn bucket_index(buckets: &[u32], v: u32) -> usize {
    buckets
        .binary_search(&v)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1))
}
971
/// Canonical bucket grid for line counts (lines-per-JE, fan-out).
/// Entries are ascending bucket lower bounds.
pub const LINE_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 5, 6, 8, 10, 16, 32, 64, 128, 256, 1024];

/// Canonical bucket grid for active-lifetime days.
/// Entries are ascending bucket lower bounds, starting at day 0.
pub const ACTIVE_LIFETIME_DAY_BUCKETS: &[u32] = &[0, 1, 7, 30, 90, 180, 365, 730, 1825];

/// Canonical bucket grid for fan-out values.
/// Entries are ascending bucket lower bounds.
pub const FANOUT_BUCKETS: &[u32] = &[1, 2, 3, 5, 8, 16, 32, 64, 128, 256, 1024];
980
981#[cfg(test)]
982mod tests {
983    use super::*;
984    use rand::SeedableRng;
985    use rand_chacha::ChaCha8Rng;
986
987    #[test]
988    fn sap_default_has_broad_long_tail() {
989        // Lever 2: the default source-mix is the standard head plus a synthetic
990        // power-law long tail, giving corpus-like breadth + entropy.
991        let m = SourceMixPrior::sap_default();
992        let p = &m.probabilities;
993        assert!(
994            p.len() >= 300,
995            "Lever-2 default should carry a long tail, got {} codes",
996            p.len()
997        );
998        let total: f64 = p.values().sum();
999        let ent: f64 = -p
1000            .values()
1001            .map(|&w| {
1002                let q = w / total;
1003                if q > 0.0 {
1004                    q * q.ln()
1005                } else {
1006                    0.0
1007                }
1008            })
1009            .sum::<f64>();
1010        assert!(
1011            ent > 3.0,
1012            "Lever-2 default entropy should exceed 3.0, got {ent:.3}"
1013        );
1014        assert!(p.contains_key("RV"), "standard head code present");
1015        assert!(p.contains_key("Z001"), "synthetic tail code present");
1016        let mut rng = ChaCha8Rng::seed_from_u64(1);
1017        for _ in 0..50 {
1018            assert!(p.contains_key(&m.sample(&mut rng)));
1019        }
1020    }
1021
1022    #[test]
1023    fn line_count_histogram_build_basic() {
1024        let values = vec![1, 1, 2, 3, 5, 5, 5, 32, 200];
1025        let (hist, dropped) = LineCountHistogram::build(&values, LINE_COUNT_BUCKETS);
1026        assert_eq!(dropped, 0);
1027        assert_eq!(hist.n, 9);
1028        assert!((hist.probabilities.iter().sum::<f64>() - 1.0).abs() < 1e-9);
1029    }
1030
1031    #[test]
1032    fn line_count_histogram_drops_below_min() {
1033        let values = vec![0, 0, 1, 2];
1034        let (hist, dropped) = LineCountHistogram::build(&values, &[1, 2, 4]);
1035        assert_eq!(dropped, 2);
1036        assert_eq!(hist.n, 4);
1037        assert!((hist.probabilities[0] - 0.5).abs() < 1e-9);
1038    }
1039
1040    #[test]
1041    fn sample_bucket_respects_probabilities() {
1042        let h = LineCountHistogram {
1043            buckets: vec![1, 2, 4, 8],
1044            probabilities: vec![0.0, 0.0, 1.0, 0.0],
1045            n: 100,
1046        };
1047        let mut rng = ChaCha8Rng::seed_from_u64(42);
1048        for _ in 0..50 {
1049            let s = h.sample_bucket(&mut rng);
1050            assert!((4..8).contains(&s), "expected sample in [4,8), got {s}");
1051        }
1052    }
1053
1054    #[test]
1055    fn empirical_cdf_from_sorted_values() {
1056        let cdf = EmpiricalCdf::from_sorted_values("test", vec![1.0, 2.0, 3.0]);
1057        assert_eq!(cdf.values.len(), 3);
1058        assert!((cdf.probabilities[2] - 1.0).abs() < 1e-9);
1059    }
1060
1061    #[test]
1062    fn active_segments_prior_default_round_trips() {
1063        let p = ActiveSegmentsPrior::default();
1064        let json = serde_json::to_string(&p).expect("serialize");
1065        let back: ActiveSegmentsPrior = serde_json::from_str(&json).expect("deserialize");
1066        assert!(back.by_source.is_empty());
1067    }
1068
1069    #[test]
1070    fn behavioral_priors_active_segments_optional_round_trip() {
1071        // Construct a minimal BehavioralPriors with active_segments=Some(...)
1072        // and round-trip.
1073        let bp = BehavioralPriors {
1074            schema_version: BehavioralPriors::SCHEMA_VERSION,
1075            generator_version: "test".to_string(),
1076            industry: "test".to_string(),
1077            n_client_inputs: 0,
1078            n_rows_aggregated: 0,
1079            source_mix: SourceMixPrior::default(),
1080            per_source_iet: PerSourceIetPrior::default(),
1081            lines_per_je: LinesPerJePrior::default(),
1082            active_lifetime: ActiveLifetimePrior::default(),
1083            fanout: FanoutPrior::default(),
1084            posting_lag: None,
1085            active_segments: Some(ActiveSegmentsPrior::default()),
1086            entity_clusters: None,
1087            per_source_attribute: None,
1088            tp_entity_clusters: None,
1089            coa_semantic: None,
1090            reference_formats: None,
1091            text_taxonomy: None,
1092            user_personas: None,
1093            source_amount_conditionals: None,
1094            source_role_gl_conditionals: None,
1095            tb_anchor: None,
1096        };
1097        let json = serde_json::to_string(&bp).expect("serialize");
1098        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1099        assert!(back.active_segments.is_some());
1100    }
1101
1102    #[test]
1103    fn behavioral_priors_legacy_round_trips_without_active_segments() {
1104        // Old JSON (no active_segments field) should deserialise as None.
1105        let legacy = r#"{
1106            "schema_version": 1,
1107            "generator_version": "5.12.0",
1108            "industry": "health",
1109            "n_client_inputs": 1,
1110            "n_rows_aggregated": 100,
1111            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
1112            "per_source_iet": {"by_source": {}},
1113            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
1114            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
1115            "fanout": {"by_attribute": {}}
1116        }"#;
1117        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
1118        assert!(bp.active_segments.is_none());
1119        assert!(bp.posting_lag.is_none());
1120    }
1121
1122    #[test]
1123    fn entity_clusters_prior_default_round_trips() {
1124        let p = EntityClustersPrior::default();
1125        let json = serde_json::to_string(&p).expect("serialize");
1126        let back: EntityClustersPrior = serde_json::from_str(&json).expect("deserialize");
1127        assert!(back.clusters.is_empty());
1128        assert!((back.clustering_rate).abs() < 1e-9);
1129    }
1130
1131    #[test]
1132    fn categorical_distribution_samples_with_correct_weights() {
1133        use rand::SeedableRng;
1134        use rand_chacha::ChaCha8Rng;
1135
1136        let mut counts = BTreeMap::new();
1137        counts.insert("A".to_string(), 700);
1138        counts.insert("B".to_string(), 200);
1139        counts.insert("C".to_string(), 100);
1140        let dist = CategoricalDistribution::from_counts(counts);
1141
1142        assert_eq!(dist.n, 1000);
1143        assert!((dist.probabilities["A"] - 0.7).abs() < 1e-9);
1144
1145        let mut rng = ChaCha8Rng::seed_from_u64(42);
1146        let mut buckets = BTreeMap::new();
1147        for _ in 0..10_000 {
1148            let v = dist.sample(&mut rng).expect("non-empty");
1149            *buckets.entry(v).or_insert(0) += 1;
1150        }
1151        // ~70% should be A, allow 2 std deviations of binomial.
1152        let a_count = buckets.get("A").copied().unwrap_or(0);
1153        assert!(
1154            (a_count as i64 - 7000).abs() < 200,
1155            "got {} A samples",
1156            a_count
1157        );
1158    }
1159
1160    #[test]
1161    fn per_source_attribute_prior_conditional_lookup() {
1162        let mut inner = BTreeMap::new();
1163        let mut prob_map = BTreeMap::new();
1164        prob_map.insert("200001".to_string(), 0.9);
1165        prob_map.insert("200002".to_string(), 0.1);
1166        inner.insert(
1167            "gl_account".to_string(),
1168            CategoricalDistribution {
1169                probabilities: prob_map,
1170                n: 100,
1171            },
1172        );
1173        let mut by_source = BTreeMap::new();
1174        by_source.insert("KR".to_string(), inner);
1175        let prior = PerSourceAttributePrior {
1176            by_source,
1177            min_observations: 10,
1178        };
1179        assert!(prior.conditional("KR", "gl_account").is_some());
1180        assert!(prior.conditional("KR", "cost_center").is_none());
1181        assert!(prior.conditional("RV", "gl_account").is_none());
1182    }
1183
1184    #[test]
1185    fn behavioral_priors_per_source_attribute_optional_round_trip() {
1186        let bp = BehavioralPriors {
1187            schema_version: BehavioralPriors::SCHEMA_VERSION,
1188            generator_version: "test".to_string(),
1189            industry: "test".to_string(),
1190            n_client_inputs: 0,
1191            n_rows_aggregated: 0,
1192            source_mix: SourceMixPrior::default(),
1193            per_source_iet: PerSourceIetPrior::default(),
1194            lines_per_je: LinesPerJePrior::default(),
1195            active_lifetime: ActiveLifetimePrior::default(),
1196            fanout: FanoutPrior::default(),
1197            posting_lag: None,
1198            active_segments: None,
1199            entity_clusters: None,
1200            per_source_attribute: Some(PerSourceAttributePrior::default()),
1201            tp_entity_clusters: None,
1202            coa_semantic: None,
1203            reference_formats: None,
1204            text_taxonomy: None,
1205            user_personas: None,
1206            source_amount_conditionals: None,
1207            source_role_gl_conditionals: None,
1208            tb_anchor: None,
1209        };
1210        let json = serde_json::to_string(&bp).expect("serialize");
1211        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1212        assert!(back.per_source_attribute.is_some());
1213    }
1214
1215    #[test]
1216    fn entity_clusters_prior_with_members_round_trips() {
1217        let p = EntityClustersPrior {
1218            clusters: vec![EntityCluster {
1219                members: vec!["A".into(), "B".into(), "C".into()],
1220                avg_jaccard: 0.42,
1221            }],
1222            clustering_rate: 0.75,
1223        };
1224        let json = serde_json::to_string(&p).expect("serialize");
1225        let back: EntityClustersPrior = serde_json::from_str(&json).expect("deserialize");
1226        assert_eq!(back.clusters.len(), 1);
1227        assert_eq!(back.clusters[0].members.len(), 3);
1228        assert!((back.clusters[0].avg_jaccard - 0.42).abs() < 1e-9);
1229        assert!((back.clustering_rate - 0.75).abs() < 1e-9);
1230    }
1231
1232    // ---- SP4.3 tests -------------------------------------------------------
1233
1234    /// `LognormalAmount::sample` returns positive values whose geometric mean
1235    /// is close to `exp(mu)` (within 20% for 1000 samples).
1236    #[test]
1237    fn lognormal_amount_sample_positive_values() {
1238        use rand::SeedableRng;
1239        use rand_chacha::ChaCha8Rng;
1240
1241        let params = LognormalAmount {
1242            mu: 4.5, // exp(4.5) ≈ 90
1243            sigma: 0.8,
1244            n: 1000,
1245            median_abs: 90.0,
1246        };
1247        let mut rng = ChaCha8Rng::seed_from_u64(42);
1248        let samples: Vec<f64> = (0..1000).map(|_| params.sample(&mut rng)).collect();
1249
1250        // All samples must be strictly positive.
1251        assert!(samples.iter().all(|&v| v > 0.0), "all samples must be > 0");
1252
1253        // Geometric mean (mean of log-samples) should be close to mu.
1254        let log_mean: f64 = samples.iter().map(|v| v.ln()).sum::<f64>() / 1000.0;
1255        assert!(
1256            (log_mean - 4.5).abs() < 0.15,
1257            "log-mean {log_mean:.3} should be near mu=4.5"
1258        );
1259    }
1260
1261    /// `LognormalAmount::sample` tolerates degenerate sigma (near-zero → clamped).
1262    #[test]
1263    fn lognormal_amount_sample_degenerate_sigma() {
1264        use rand::SeedableRng;
1265        use rand_chacha::ChaCha8Rng;
1266
1267        let params = LognormalAmount {
1268            mu: 3.0,
1269            sigma: 0.0, // degenerate — should not panic
1270            n: 5,
1271            median_abs: 20.0,
1272        };
1273        let mut rng = ChaCha8Rng::seed_from_u64(7);
1274        for _ in 0..10 {
1275            let v = params.sample(&mut rng);
1276            assert!(v > 0.0, "must be positive even with sigma=0");
1277        }
1278    }
1279
1280    /// `PerSourceAmountPrior` round-trips through JSON.
1281    #[test]
1282    fn per_source_amount_prior_round_trip() {
1283        let mut by_source = BTreeMap::new();
1284        by_source.insert(
1285            "KR".to_string(),
1286            LognormalAmount {
1287                mu: 4.5,
1288                sigma: 2.158,
1289                n: 278939,
1290                median_abs: 100.0,
1291            },
1292        );
1293        let mut by_source_and_class = BTreeMap::new();
1294        let mut inner = BTreeMap::new();
1295        inner.insert(
1296            "0041".to_string(),
1297            LognormalAmount {
1298                mu: 5.394,
1299                sigma: 1.602,
1300                n: 61726,
1301                median_abs: 209.98,
1302            },
1303        );
1304        by_source_and_class.insert("KR".to_string(), inner);
1305
1306        let prior = PerSourceAmountPrior {
1307            by_source_and_class,
1308            by_source,
1309        };
1310        let json = serde_json::to_string(&prior).expect("serialize");
1311        let back: PerSourceAmountPrior = serde_json::from_str(&json).expect("deserialize");
1312        assert_eq!(back.by_source.len(), 1);
1313        assert_eq!(back.by_source_and_class.len(), 1);
1314        assert_eq!(back.by_source["KR"].n, 278939);
1315        assert_eq!(back.by_source_and_class["KR"]["0041"].n, 61726);
1316    }
1317
1318    /// `BehavioralPriors` with `source_amount_conditionals: Some(...)` round-trips.
1319    #[test]
1320    fn behavioral_priors_source_amount_conditionals_optional_round_trip() {
1321        let prior = PerSourceAmountPrior {
1322            by_source_and_class: BTreeMap::new(),
1323            by_source: BTreeMap::new(),
1324        };
1325        let bp = BehavioralPriors {
1326            schema_version: BehavioralPriors::SCHEMA_VERSION,
1327            generator_version: "test".to_string(),
1328            industry: "test".to_string(),
1329            n_client_inputs: 0,
1330            n_rows_aggregated: 0,
1331            source_mix: SourceMixPrior::default(),
1332            per_source_iet: PerSourceIetPrior::default(),
1333            lines_per_je: LinesPerJePrior::default(),
1334            active_lifetime: ActiveLifetimePrior::default(),
1335            fanout: FanoutPrior::default(),
1336            posting_lag: None,
1337            active_segments: None,
1338            entity_clusters: None,
1339            per_source_attribute: None,
1340            tp_entity_clusters: None,
1341            coa_semantic: None,
1342            reference_formats: None,
1343            text_taxonomy: None,
1344            user_personas: None,
1345            source_amount_conditionals: Some(prior),
1346            source_role_gl_conditionals: None,
1347            tb_anchor: None,
1348        };
1349        let json = serde_json::to_string(&bp).expect("serialize");
1350        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1351        assert!(back.source_amount_conditionals.is_some());
1352    }
1353
1354    /// Legacy `BehavioralPriors` JSON without `source_amount_conditionals`
1355    /// deserialises with `None` for that field (backwards compat).
1356    #[test]
1357    fn behavioral_priors_legacy_missing_source_amount_conditionals() {
1358        let legacy = r#"{
1359            "schema_version": 1,
1360            "generator_version": "5.21.0",
1361            "industry": "health",
1362            "n_client_inputs": 1,
1363            "n_rows_aggregated": 100,
1364            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
1365            "per_source_iet": {"by_source": {}},
1366            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
1367            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
1368            "fanout": {"by_attribute": {}}
1369        }"#;
1370        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
1371        assert!(
1372            bp.source_amount_conditionals.is_none(),
1373            "missing field should deserialise as None"
1374        );
1375    }
1376
1377    // ---- SP4.6 tests -------------------------------------------------------
1378
1379    /// `PerSourceRolePrior::conditional` returns the right distribution
1380    /// and `sample` returns only values from the matching role set.
1381    #[test]
1382    fn sp4_6_role_conditional_keeps_dr_in_expense_class() {
1383        use rand::SeedableRng;
1384        use rand_chacha::ChaCha8Rng;
1385
1386        // Build a PerSourceRolePrior for KR:
1387        //   DR: expense accounts (6000, 6100)
1388        //   CR: AP account (2000)
1389        let mut dr_counts = BTreeMap::new();
1390        dr_counts.insert("6000".to_string(), 100usize);
1391        dr_counts.insert("6100".to_string(), 50usize);
1392        let mut cr_counts = BTreeMap::new();
1393        cr_counts.insert("2000".to_string(), 150usize);
1394
1395        let mut role_map = BTreeMap::new();
1396        role_map.insert(
1397            "DR".to_string(),
1398            CategoricalDistribution::from_counts(dr_counts),
1399        );
1400        role_map.insert(
1401            "CR".to_string(),
1402            CategoricalDistribution::from_counts(cr_counts),
1403        );
1404
1405        let mut by_source_and_role = BTreeMap::new();
1406        by_source_and_role.insert("KR".to_string(), role_map);
1407
1408        let prior = PerSourceRolePrior { by_source_and_role };
1409
1410        // DR draws should be in {6000, 6100}
1411        let mut rng = ChaCha8Rng::seed_from_u64(42);
1412        for _ in 0..100 {
1413            let v = prior
1414                .conditional("KR", "DR")
1415                .unwrap()
1416                .sample(&mut rng)
1417                .unwrap();
1418            assert!(
1419                v == "6000" || v == "6100",
1420                "DR draw must be expense account, got {v}"
1421            );
1422        }
1423
1424        // CR draws should be {2000}
1425        for _ in 0..50 {
1426            let v = prior
1427                .conditional("KR", "CR")
1428                .unwrap()
1429                .sample(&mut rng)
1430                .unwrap();
1431            assert_eq!(v, "2000", "CR draw must be AP account");
1432        }
1433    }
1434
1435    /// When a `(source, role)` pair is absent, `conditional` returns `None`.
1436    #[test]
1437    fn sp4_6_role_conditional_falls_back_when_pair_missing() {
1438        let prior = PerSourceRolePrior::default();
1439        assert!(
1440            prior.conditional("KR", "DR").is_none(),
1441            "empty prior must return None"
1442        );
1443        assert!(
1444            prior.conditional("KR", "CR").is_none(),
1445            "empty prior must return None for CR too"
1446        );
1447    }
1448
1449    /// `PerSourceRolePrior` round-trips through JSON without loss.
1450    #[test]
1451    fn sp4_6_per_source_role_prior_json_round_trip() {
1452        let mut dr_counts = BTreeMap::new();
1453        dr_counts.insert("6000".to_string(), 200usize);
1454        let mut role_map = BTreeMap::new();
1455        role_map.insert(
1456            "DR".to_string(),
1457            CategoricalDistribution::from_counts(dr_counts),
1458        );
1459        let mut by_source_and_role = BTreeMap::new();
1460        by_source_and_role.insert("KR".to_string(), role_map);
1461        let prior = PerSourceRolePrior { by_source_and_role };
1462
1463        let json = serde_json::to_string(&prior).expect("serialize");
1464        let back: PerSourceRolePrior = serde_json::from_str(&json).expect("deserialize");
1465        assert!(back.conditional("KR", "DR").is_some());
1466        assert!(back.conditional("KR", "CR").is_none());
1467    }
1468
1469    /// `BehavioralPriors` with `source_role_gl_conditionals: Some(...)` round-trips.
1470    #[test]
1471    fn behavioral_priors_source_role_gl_conditionals_optional_round_trip() {
1472        let prior = PerSourceRolePrior::default();
1473        let bp = BehavioralPriors {
1474            schema_version: BehavioralPriors::SCHEMA_VERSION,
1475            generator_version: "test".to_string(),
1476            industry: "test".to_string(),
1477            n_client_inputs: 0,
1478            n_rows_aggregated: 0,
1479            source_mix: SourceMixPrior::default(),
1480            per_source_iet: PerSourceIetPrior::default(),
1481            lines_per_je: LinesPerJePrior::default(),
1482            active_lifetime: ActiveLifetimePrior::default(),
1483            fanout: FanoutPrior::default(),
1484            posting_lag: None,
1485            active_segments: None,
1486            entity_clusters: None,
1487            per_source_attribute: None,
1488            tp_entity_clusters: None,
1489            coa_semantic: None,
1490            reference_formats: None,
1491            text_taxonomy: None,
1492            user_personas: None,
1493            source_amount_conditionals: None,
1494            source_role_gl_conditionals: Some(prior),
1495            tb_anchor: None,
1496        };
1497        let json = serde_json::to_string(&bp).expect("serialize");
1498        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1499        assert!(back.source_role_gl_conditionals.is_some());
1500    }
1501
1502    /// Legacy `BehavioralPriors` JSON without `source_role_gl_conditionals`
1503    /// deserialises with `None` for that field (backwards compat).
1504    #[test]
1505    fn behavioral_priors_legacy_missing_source_role_gl_conditionals() {
1506        let legacy = r#"{
1507            "schema_version": 1,
1508            "generator_version": "5.21.0",
1509            "industry": "health",
1510            "n_client_inputs": 1,
1511            "n_rows_aggregated": 100,
1512            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
1513            "per_source_iet": {"by_source": {}},
1514            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
1515            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
1516            "fanout": {"by_attribute": {}}
1517        }"#;
1518        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
1519        assert!(
1520            bp.source_role_gl_conditionals.is_none(),
1521            "missing field should deserialise as None"
1522        );
1523    }
1524
1525    // ---- SP4.1 tests -------------------------------------------------------
1526
1527    /// `TbAnchorPrior` round-trips through JSON without loss.
1528    #[test]
1529    fn tb_anchor_prior_json_round_trip() {
1530        let mut per_account = BTreeMap::new();
1531        per_account.insert(
1532            "1000".to_string(),
1533            TbTarget {
1534                opening_balance: 100_000.0,
1535                closing_balance: 120_000.0,
1536                period_net_activity: 20_000.0,
1537                opening_stdev: 5_000.0,
1538                closing_stdev: 6_000.0,
1539                n_clients: 3,
1540            },
1541        );
1542        per_account.insert(
1543            "2000".to_string(),
1544            TbTarget {
1545                opening_balance: -50_000.0,
1546                closing_balance: -60_000.0,
1547                period_net_activity: -10_000.0,
1548                opening_stdev: 2_000.0,
1549                closing_stdev: 3_000.0,
1550                n_clients: 3,
1551            },
1552        );
1553        let anchor = TbAnchorPrior {
1554            per_account,
1555            total_assets: 300_000.0,
1556            total_liabilities: 120_000.0,
1557            total_equity: 180_000.0,
1558            n_clients: 3,
1559        };
1560        let json = serde_json::to_string(&anchor).expect("serialize");
1561        let back: TbAnchorPrior = serde_json::from_str(&json).expect("deserialize");
1562        assert_eq!(back.per_account.len(), 2);
1563        assert!((back.per_account["1000"].closing_balance - 120_000.0).abs() < 1e-6);
1564        assert!((back.total_assets - 300_000.0).abs() < 1e-6);
1565        assert_eq!(back.n_clients, 3);
1566    }
1567
1568    /// `TbAnchorPrior::has_data()` returns `true` for non-zero balances.
1569    #[test]
1570    fn tb_anchor_prior_has_data() {
1571        let mut prior = TbAnchorPrior::default();
1572        assert!(!prior.has_data(), "empty prior must report no data");
1573
1574        prior.per_account.insert(
1575            "1000".to_string(),
1576            TbTarget {
1577                closing_balance: 1.0,
1578                ..Default::default()
1579            },
1580        );
1581        assert!(
1582            prior.has_data(),
1583            "non-zero closing balance must report has_data"
1584        );
1585    }
1586
1587    /// `BehavioralPriors` with `tb_anchor: Some(...)` round-trips through JSON.
1588    #[test]
1589    fn behavioral_priors_tb_anchor_optional_round_trip() {
1590        let mut per_account = BTreeMap::new();
1591        per_account.insert(
1592            "1000".to_string(),
1593            TbTarget {
1594                opening_balance: 50_000.0,
1595                closing_balance: 55_000.0,
1596                period_net_activity: 5_000.0,
1597                opening_stdev: 1_000.0,
1598                closing_stdev: 1_200.0,
1599                n_clients: 2,
1600            },
1601        );
1602        let tb_anchor = Some(TbAnchorPrior {
1603            per_account,
1604            total_assets: 55_000.0,
1605            total_liabilities: 0.0,
1606            total_equity: 55_000.0,
1607            n_clients: 2,
1608        });
1609        let bp = BehavioralPriors {
1610            schema_version: BehavioralPriors::SCHEMA_VERSION,
1611            generator_version: "test".to_string(),
1612            industry: "test".to_string(),
1613            n_client_inputs: 0,
1614            n_rows_aggregated: 0,
1615            source_mix: SourceMixPrior::default(),
1616            per_source_iet: PerSourceIetPrior::default(),
1617            lines_per_je: LinesPerJePrior::default(),
1618            active_lifetime: ActiveLifetimePrior::default(),
1619            fanout: FanoutPrior::default(),
1620            posting_lag: None,
1621            active_segments: None,
1622            entity_clusters: None,
1623            per_source_attribute: None,
1624            tp_entity_clusters: None,
1625            coa_semantic: None,
1626            reference_formats: None,
1627            text_taxonomy: None,
1628            user_personas: None,
1629            source_amount_conditionals: None,
1630            source_role_gl_conditionals: None,
1631            tb_anchor,
1632        };
1633        let json = serde_json::to_string(&bp).expect("serialize");
1634        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1635        let anchor = back.tb_anchor.expect("tb_anchor must be Some");
1636        assert_eq!(anchor.per_account.len(), 1);
1637        assert!((anchor.per_account["1000"].closing_balance - 55_000.0).abs() < 1e-6);
1638    }
1639
1640    /// Legacy JSON without `tb_anchor` deserialises as `None` (backwards compat).
1641    #[test]
1642    fn behavioral_priors_legacy_missing_tb_anchor() {
1643        let legacy = r#"{
1644            "schema_version": 1,
1645            "generator_version": "5.22.0",
1646            "industry": "health",
1647            "n_client_inputs": 1,
1648            "n_rows_aggregated": 100,
1649            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
1650            "per_source_iet": {"by_source": {}},
1651            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
1652            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
1653            "fanout": {"by_attribute": {}}
1654        }"#;
1655        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
1656        assert!(
1657            bp.tb_anchor.is_none(),
1658            "missing tb_anchor field should deserialise as None"
1659        );
1660    }
1661}
datasynth_core/distributions/behavioral_priors.rs

datasynth_core/distributions/
behavioral_priors.rs