datasynth-core 5.36.0

//! Behavioral priors mined from corpus GL data (SP2).
//!
//! These types are defined in `datasynth-core` (not `datasynth-fingerprint`) so
//! that `datasynth-generators` can consume them without creating a package cycle.
//! `datasynth-fingerprint` re-exports them from here.
//!
//! Spec: `docs/superpowers/specs/2026-05-12-sp2-real-world-prior-extraction-design.md`

use std::collections::BTreeMap;

use rand::RngExt;
use serde::{Deserialize, Serialize};

use super::text_taxonomy::TextTaxonomyPrior;

// ---------------------------------------------------------------------------
// EmpiricalCdf
// ---------------------------------------------------------------------------

/// Empirical CDF representation used by IET and lag priors.
///
/// `values` and `probabilities` are parallel vectors: knot `i` pairs
/// `values[i]` with its cumulative probability `probabilities[i]`. The
/// `cdf` / `quantile` lookups binary-search these vectors and index one with
/// positions found in the other, so both are expected to be ascending and of
/// equal length. Nothing in the type enforces this (the fields are public and
/// deserialisable).
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct EmpiricalCdf {
    /// Column / source name (informational).
    pub column: String,
    /// Sorted quantile values (after privacy processing).
    pub values: Vec<f64>,
    /// Cumulative probabilities matching `values` (monotone in [0, 1]).
    pub probabilities: Vec<f64>,
}

impl EmpiricalCdf {
    /// Build an empirical CDF from pre-sorted values.
    ///
    /// Knot `i` (0-based) receives cumulative probability `(i + 1) / n`, so the
    /// last knot always sits at 1.0. An empty `values` yields an empty CDF.
    /// The input is neither checked nor re-sorted: the caller must pass
    /// ascending values.
    pub fn from_sorted_values(column: impl Into<String>, values: Vec<f64>) -> Self {
        let n = values.len();
        let probabilities: Vec<f64> = (1..=n).map(|i| i as f64 / n as f64).collect();
        Self {
            column: column.into(),
            values,
            probabilities,
        }
    }

    /// Evaluate CDF at a value (linear interpolation between knots).
    ///
    /// Returns 0.0 below the first knot (and for an empty CDF), 1.0 above the
    /// last knot, the stored probability on an exact knot hit, and a linear
    /// blend of the two neighbouring knots otherwise.
    ///
    /// # Panics
    ///
    /// Panics if `probabilities` is shorter than `values` and the lookup lands
    /// on a knot without a matching probability.
    pub fn cdf(&self, x: f64) -> f64 {
        match self.values.binary_search_by(|v| v.total_cmp(&x)) {
            Ok(i) => self.probabilities[i],
            // `x` sorts before every knot (also the empty-CDF case).
            Err(0) => 0.0,
            // `x` sorts after every knot.
            Err(i) if i >= self.values.len() => 1.0,
            Err(i) => {
                // values[i - 1] < x < values[i], so the denominator is non-zero.
                let (x0, x1) = (self.values[i - 1], self.values[i]);
                let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
                p0 + (p1 - p0) * (x - x0) / (x1 - x0)
            }
        }
    }

    /// Evaluate inverse CDF (quantile function) at a probability.
    ///
    /// `p <= 0` returns the first value and `p >= 1` the last; probabilities
    /// below the first knot also return the first value. In between, the result
    /// is linearly interpolated between the two neighbouring knots.
    ///
    /// An empty CDF (no `values`) returns 0.0 for every `p`. Previously only
    /// `p <= 0` and `p >= 1` did so, while any `p` strictly inside (0, 1)
    /// indexed `values[0]` and panicked.
    ///
    /// # Panics
    ///
    /// Panics if `values` is non-empty but shorter than `probabilities` and the
    /// lookup lands on a knot without a matching value.
    pub fn quantile(&self, p: f64) -> f64 {
        // Resolve both ends once; bailing out here covers the empty CDF for
        // every `p`, not just the clamped extremes.
        let (lowest, highest) = match (self.values.first(), self.values.last()) {
            (Some(&lo), Some(&hi)) => (lo, hi),
            _ => return 0.0,
        };
        if p <= 0.0 {
            return lowest;
        }
        if p >= 1.0 {
            return highest;
        }
        match self.probabilities.binary_search_by(|v| v.total_cmp(&p)) {
            Ok(i) => self.values[i],
            // `p` sorts before the first probability knot.
            Err(0) => lowest,
            // `p` sorts after the last probability knot.
            Err(i) if i >= self.probabilities.len() => highest,
            Err(i) => {
                // probabilities[i - 1] < p < probabilities[i], so the
                // denominator is non-zero.
                let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
                let (x0, x1) = (self.values[i - 1], self.values[i]);
                x0 + (x1 - x0) * (p - p0) / (p1 - p0)
            }
        }
    }
}

// ---------------------------------------------------------------------------
// PerSourceAmountPrior (SP4.3)
// ---------------------------------------------------------------------------

/// SP4.3 — Per-(source, gl_prefix) amount distribution parameters.
///
/// Used by the generator to draw amounts conditional on the source code
/// (and optionally the first-4-character GL account prefix) drawn at
/// line-construction time.  Log-normal mu/sigma per (source, gl_prefix) bucket,
/// with fallback to source-marginal when the specific pair isn't represented.
///
/// Storage as nested map `source → gl_prefix → params` for forward compatibility.
/// The `by_source` marginals serve as the primary fallback when a specific
/// `(source, gl_prefix)` pair is absent or unrecognised.
///
/// The three sketch fields are additive: each is skipped on serialisation when
/// empty / `None` and defaults to empty / `None` on deserialisation.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAmountPrior {
    /// Log-normal parameters keyed by `(source, gl_prefix)`.
    /// Outer key: source code (e.g. "KR").
    /// Inner key: first-4-chars of GL account number (e.g. "0041").
    pub by_source_and_class: BTreeMap<String, BTreeMap<String, LognormalAmount>>,
    /// Source-only marginal fallback when an `(source, gl_prefix)` pair isn't present.
    pub by_source: BTreeMap<String, LognormalAmount>,
    /// SP4.9 — per-source empirical amount quantile sketches.
    ///
    /// Preferred over the `by_source` log-normal marginal at sampling time when
    /// present: corpus per-source amount tails are atom-laden and heavy in ways
    /// a two-parameter log-normal misfits by large factors, while a fixed
    /// quantile grid pins them. Empty for pre-SP4.9 bundles (serde default).
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub quantile_sketch_by_source: BTreeMap<String, AmountQuantileSketch>,
    /// Per-source empirical JE-TOTAL (Σ debit-leg amount per journal entry) quantile
    /// sketches. When present, the generator draws a JE's economic size directly from this
    /// — bypassing the per-line draw × line-count (P0a) coupling — so the synthetic JE-total
    /// distribution matches the corpus's, whose large amounts concentrate in a rare few
    /// mega-JEs rather than spreading one large line per JE (the coupling's tail artefact).
    /// Empty for bundles without it (serde default; opt-in, byte-identical when absent).
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub je_total_sketch_by_source: BTreeMap<String, AmountQuantileSketch>,
    /// Pooled GLOBAL JE-total quantile sketch — every source's per-JE debit-leg
    /// totals concatenated into one sketch.
    ///
    /// Serves as the JE-total fallback for *thin* sources: those below the
    /// per-source extraction gate have no entry in `je_total_sketch_by_source`,
    /// and a line-level marginal is the wrong quantity for a JE total. Without
    /// this, thin sources fall through to the per-line assembly path, whose
    /// line-count coupling over-populates the JE-total tail (twin p99/p50 ≈ 1.4×
    /// vs corpus ≈ 1.1×). Drawing a thin source's JE total from the pooled
    /// global JE-total distribution keeps it on the right quantity.
    /// `None` for bundles without it (serde default; opt-in, byte-identical when absent).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub je_total_sketch_global: Option<AmountQuantileSketch>,
}

/// SP4.9 — Empirical amount quantile sketch for one source.
///
/// `values[i]` is the absolute-amount quantile at `probabilities[i]` (both
/// ascending). Sampling inverse-transforms a uniform draw with log-linear
/// (geometric) interpolation between knots — the right interpolation for
/// amounts spanning orders of magnitude — and a capped log-linear
/// extrapolation above the top knot so the extreme tail is not truncated
/// at the last grid point.
///
/// Privacy posture matches the rest of the bundle's amount statistics
/// (aggregate, min-observation-gated): extractors only emit knots at
/// probabilities `p <= 1 - 5/n`, so every knot is an interpolated order
/// statistic with at least 5 observations strictly above it.
///
/// The field invariants below are not enforced by the type (the fields are
/// public and deserialisable); `is_usable` re-checks the knot count, the
/// length match and the positivity of `values` before any sampling.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct AmountQuantileSketch {
    /// Grid probabilities, strictly ascending, all in (0, 1).
    pub probabilities: Vec<f64>,
    /// Absolute-amount quantiles matching `probabilities` (non-decreasing, > 0).
    pub values: Vec<f64>,
    /// Number of observations underpinning the sketch.
    pub n: usize,
}

impl AmountQuantileSketch {
    /// Cap on the extrapolation above the top knot, as a multiple of the
    /// top-knot value. Keeps a steep last segment from producing absurd draws.
    pub const MAX_TAIL_EXTRAPOLATION: f64 = 10.0;

    /// A sketch is usable when it has at least two knots, matching lengths,
    /// and strictly positive finite values (required for log-space work).
    ///
    /// The probability grid itself is not inspected here.
    pub fn is_usable(&self) -> bool {
        self.probabilities.len() >= 2
            && self.probabilities.len() == self.values.len()
            && self.values.iter().all(|v| v.is_finite() && *v > 0.0)
    }

    /// Draw a positive amount magnitude via inverse-transform sampling.
    ///
    /// Returns `None` when the sketch is unusable — callers fall back to the
    /// log-normal marginal. No random number is consumed in that case.
    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> Option<f64> {
        if !self.is_usable() {
            return None;
        }
        let u: f64 = rng.random_range(0.0..1.0);
        self.quantile(u)
    }

    /// Inverse CDF at `u` — log-linear (geometric) interpolation between knots,
    /// capped log-linear extrapolation above the top knot, floor value below the
    /// bottom knot. `u` is clamped into [0, 1].
    ///
    /// Exposed so a correlation copula can compose rank-preservingly with the
    /// empirical marginal: feeding the copula's correlated uniform through this
    /// function keeps cross-field correlation AND the sketch marginal exact —
    /// the same principle as the advanced sampler's `ppf` path.
    ///
    /// Returns `None` when the sketch is unusable or when `u` is NaN. A NaN
    /// survives `clamp` and fails both range guards below, so it used to reach
    /// the knot lookup with an out-of-range index and panic.
    pub fn quantile(&self, u: f64) -> Option<f64> {
        if u.is_nan() || !self.is_usable() {
            return None;
        }
        let u = u.clamp(0.0, 1.0);
        let ps = &self.probabilities;
        let vs = &self.values;
        let k = ps.len();
        if u <= ps[0] {
            return Some(vs[0]);
        }
        if u > ps[k - 1] {
            // Capped log-linear extrapolation from the last two knots.
            let dp = (ps[k - 1] - ps[k - 2]).max(1e-12);
            let (l0, l1) = (vs[k - 2].ln(), vs[k - 1].ln());
            let ext = l1 + (l1 - l0) / dp * (u - ps[k - 1]);
            let capped = ext.min(l1 + Self::MAX_TAIL_EXTRAPOLATION.ln());
            return Some(capped.exp());
        }
        let i = match ps.binary_search_by(|p| p.total_cmp(&u)) {
            Ok(i) => return Some(vs[i]),
            // On an ascending grid ps[i-1] < u < ps[i] and 1 <= i <= k - 1
            // already, so the clamp is a no-op there. It only keeps a malformed
            // (non-ascending or NaN-bearing) grid from indexing out of bounds.
            Err(i) => i.clamp(1, k - 1),
        };
        // The 1e-12 floor tolerates coincident probability knots.
        let t = (u - ps[i - 1]) / (ps[i] - ps[i - 1]).max(1e-12);
        let (l0, l1) = (vs[i - 1].ln(), vs[i].ln());
        Some((l0 + (l1 - l0) * t).exp())
    }
}

/// Log-normal amount parameters derived from corpus GL data.
///
/// All values refer to `ln(|amount|)`.  Callers take the absolute value of the
/// raw corpus amount before fitting so that both debit and credit lines are
/// captured in the same distribution; sign/direction is assigned by the JE
/// balancer after sampling.
///
/// Only `mu` and `sigma` are read by `sample`; `n` and `median_abs` are
/// carried as metadata.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalAmount {
    /// Mean of `ln(|amount|)`.
    pub mu: f64,
    /// Standard deviation of `ln(|amount|)`.
    pub sigma: f64,
    /// Number of observations underpinning these parameters.
    /// Parameters are only emitted when `n >= min_observations` (default 10).
    pub n: usize,
    /// Median absolute value of the raw amounts (for sanity / audit).
    pub median_abs: f64,
}

impl LognormalAmount {
    /// Sample an amount magnitude from `LogNormal(mu, sigma)`.
    ///
    /// The sign (debit vs credit) and the rounding to
    /// [`rust_decimal::Decimal`] are left to the caller.
    ///
    /// `sigma` is floored at `1e-6` before the distribution is built, so a
    /// zero, negative or NaN `sigma` produces a near-degenerate distribution
    /// around `exp(mu)` rather than an error. Should construction still fail,
    /// the draw comes from `LogNormal(0.0, 1.0)` instead of panicking.
    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> f64 {
        use rand_distr::{Distribution, LogNormal};

        // `f64::max` ignores a NaN operand, so a NaN sigma also lands on the floor.
        let floored_sigma = self.sigma.max(1e-6);
        let dist = match LogNormal::new(self.mu, floored_sigma) {
            Ok(fitted) => fitted,
            Err(_) => LogNormal::new(0.0, 1.0).expect("fallback lognormal"),
        };
        dist.sample(rng)
    }
}

// ---------------------------------------------------------------------------
// BehavioralPriors and sub-types
// ---------------------------------------------------------------------------

/// Root container for the SP2 behavioral priors.
///
/// The leading fields are always serialised. Every `Option` field is an
/// additive extension: it is skipped on serialisation when `None` and defaults
/// to `None` on deserialisation, so bundles written before the field existed
/// still load.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BehavioralPriors {
    // NOTE(review): presumably set from `BehavioralPriors::SCHEMA_VERSION` by
    // the extractor — confirm at the construction site.
    pub schema_version: u32,
    // NOTE(review): presumably the version of the tool that wrote the bundle — confirm.
    pub generator_version: String,
    // NOTE(review): presumably the industry label the bundle was mined for — confirm.
    pub industry: String,
    // NOTE(review): presumably the number of client inputs aggregated into
    // this bundle — confirm against the extractor.
    pub n_client_inputs: usize,
    // NOTE(review): presumably the total number of GL rows aggregated — confirm.
    pub n_rows_aggregated: usize,
    /// Source-code mix expressed as per-LINE shares (see `source_mix_je` for
    /// the per-JE counterpart).
    pub source_mix: SourceMixPrior,
    /// Per-JE source-code shares (the fraction of JOURNAL ENTRIES per source,
    /// vs `source_mix`'s per-LINE shares). The generator's per-JE source draw
    /// uses this when present so the JE-count composition matches the corpus;
    /// line shares then emerge from the per-source lines-per-JE histogram
    /// instead of being double-counted. `None` for older bundles (serde
    /// default) — the per-JE draw falls back to the line-share mix unchanged.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_mix_je: Option<SourceMixPrior>,
    /// Per-source inter-event-time (IET) summaries.
    pub per_source_iet: PerSourceIetPrior,
    /// Lines-per-JE histograms (overall and per source).
    pub lines_per_je: LinesPerJePrior,
    /// Active-lifetime histograms (overall and per source). Superseded by
    /// `active_segments` for window placement when that field is `Some`.
    pub active_lifetime: ActiveLifetimePrior,
    /// Fan-out histograms keyed by attribute name.
    pub fanout: FanoutPrior,
    /// Per-source posting-lag summaries. Optional; absent from bundles that
    /// do not carry it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub posting_lag: Option<PostingLagPrior>,
    /// SP3.2 — per-Source multi-segment active patterns. Optional, additive.
    /// When `Some`, supersedes `active_lifetime` for window placement.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub active_segments: Option<ActiveSegmentsPrior>,
    /// SP3.3 — entity-cluster prior driving cross-entity motif preservation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub entity_clusters: Option<EntityClustersPrior>,
    /// SP3.7 — per-source conditional attribute priors. Optional, additive.
    /// When `Some`, the generator samples GL account / cost center / profit
    /// center conditioned on the just-drawn source code.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_source_attribute: Option<PerSourceAttributePrior>,
    /// SP3.12 — TP-entity cluster prior. Clusters TradingPartner values by
    /// shared GL/CC/PC attribute sets. The generator uses this to bias TP
    /// selection toward cluster-mates of recently-used TPs (same source),
    /// building triangle structure in the TP-GL co-occurrence graph.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tp_entity_clusters: Option<EntityClustersPrior>,
    /// SP4.2 — CoA semantic content extracted from corpus COA_XXX.parquet
    /// files. When `Some`, the CoA generator enriches account descriptions,
    /// account_class, and account_sub_class with corpus values instead of
    /// generic constants.  Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coa_semantic: Option<CoaSemanticPrior>,
    /// SP4.7 — Per-source reference-string format templates extracted from the
    /// corpus.  When `Some`, the JE generator samples a reference string
    /// from the template distribution rather than using the fixed
    /// `format!("...")` fallback.  Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reference_formats: Option<ReferenceFormatPrior>,
    /// SP6 — corpus text taxonomy. Replaces the old SP4.4 text_templates field. When `Some`,
    /// the generator samples line text keyed on `(source, account-class)`,
    /// header text on `source`, and CoA descriptions per account — all PII-safe
    /// templates filled at generation time. `None` for pre-SP6 bundles.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub text_taxonomy: Option<TextTaxonomyPrior>,
    /// SP4.5 — Per-user behavioral patterns (source mix, hourly density,
    /// weekday density, volume share) mined from corpus GL data.
    /// When `Some` and non-empty, the JE generator biases `created_by` and
    /// `created_at` toward the characteristic patterns of each user.
    ///
    /// Note: the corpus GL files examined for this project carried no
    /// user column; the prior is included as an additive field so future data
    /// deliveries can populate it without any schema changes.  Old bundles
    /// load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_personas: Option<UserPersonaPrior>,
    /// SP4.3 — Per-(source, gl_prefix) amount distribution parameters.
    ///
    /// When `Some`, the JE generator samples the JE total-amount from the
    /// log-normal conditional matching the current source code (and optionally
    /// GL-account prefix), rather than the global `AmountSampler` marginal.
    /// Fraud entries bypass this path to preserve fraud-pattern semantics.
    /// Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_amount_conditionals: Option<PerSourceAmountPrior>,
    /// SP4.6 — Per-(source, line_role) GL account conditional.
    ///
    /// line_role is "DR" (debit-dominant lines) or "CR" (credit-dominant lines).
    /// When `Some`, the generator draws GL accounts respecting the line role so
    /// that expense/asset classes appear on DR sides and liability/revenue classes
    /// appear on CR sides, matching corpus SAP doc-type shapes.
    ///
    /// Sources with no strong role bias (e.g. SA) emit whatever the corpus shows.
    /// Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_role_gl_conditionals: Option<PerSourceRolePrior>,
    /// SP4.8 — Per-source joint (debit-class, credit-class) flow-pair prior.
    ///
    /// `source_role_gl_conditionals` captures per-(source, role) account
    /// MARGINALS; this prior captures the within-JE JOINT pairing of debit and
    /// credit account classes (the account-flow edge a JE draws), conditioned
    /// on source. Keys of the inner categorical are `"<dr>|<cr>"` leading-digit
    /// class pairs — the same canonicalisation the relational-fidelity ρ
    /// measure uses. When `Some`, the JE generator draws ONE pair per entry and
    /// restricts the per-(source, role) GL conditionals to the drawn classes,
    /// instead of sampling DR and CR accounts independently.
    /// Old bundles load fine (deserialises as `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_flow_pairs: Option<PerSourceFlowPairPrior>,
    /// SP4.1 — Trial-balance anchor prior.  Per-account opening/closing balances
    /// extracted from real `TB_XXX.parquet` files.  When `Some`, the generator's
    /// balance tracker uses these targets to nudge synthetic balances toward a
    /// corpus-shaped distribution via periodic drift-correction entries.
    ///
    /// Old bundles load fine (deserialises as `None`); the balance tracker
    /// continues with its existing free-drift behaviour when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tb_anchor: Option<TbAnchorPrior>,
    /// Phase-2 R6 — Per-source manual-posting share mined from the corpus
    /// system/manual indicator column.  When `Some`, the JE generator's
    /// manual/automated split can honor the per-source share instead of the
    /// engine's intrinsic manual fraction.  Populated only from parquet
    /// inputs that carry the indicator column; `None` for older bundles and
    /// CSV inputs (serde default — old bundles load fine and re-serialise
    /// byte-identically).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub manual_share: Option<ManualSharePrior>,
    /// Phase-2 R9 — preparer/approver prior (approval share + self-approval
    /// rate). `None` for older bundles (serde default): the generator emits no
    /// approver and output is byte-identical.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub approver: Option<ApproverPrior>,
    /// N9 — per-calendar-month relative JE volume. When `Some` and
    /// `has_data()`, the generator reshapes the per-JE month marginal toward
    /// the corpus's (nearly flat) seasonality instead of the front-loaded
    /// IET-walk default. `None` for older bundles (serde default): byte-identical.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub month_volume: Option<MonthVolumePrior>,
}

impl BehavioralPriors {
    /// Schema version constant for the behavioral-priors bundle.
    // NOTE(review): presumably the value writers store in `schema_version`
    // and readers compare against — confirm at the call sites.
    pub const SCHEMA_VERSION: u32 = 1;
}

/// Categorical distribution over source codes.
///
/// `sample` draws a key of `probabilities` in proportion to its weight and
/// normalises by the weights' sum, so the weights need not add up to 1.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceMixPrior {
    /// Source code → relative weight.
    pub probabilities: BTreeMap<String, f64>,
    // NOTE(review): presumably the mass of sources folded into an "other"
    // bucket; not read anywhere in this file — confirm against the extractor.
    pub other_fraction: f64,
    // NOTE(review): presumably the minimum share a source needs to get its own
    // entry; not read anywhere in this file — confirm against the extractor.
    pub min_threshold: f64,
}

impl SourceMixPrior {
    /// Pick a source code at random, weighted by `probabilities`.
    ///
    /// Weights are normalised by their sum before the cumulative walk, which
    /// visits keys in the map's sorted order. Fallbacks:
    ///
    /// * empty map → `"SA"` (no random number is drawn);
    /// * weights summing to zero or less → the first key;
    /// * walk exhausted without a hit (floating-point rounding) → the last key.
    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> String {
        const FALLBACK_CODE: &str = "SA";

        if self.probabilities.is_empty() {
            return FALLBACK_CODE.to_string();
        }

        // The uniform draw happens before the mass check, so every non-empty
        // map consumes exactly one draw from the RNG stream.
        let draw: f64 = rng.random_range(0.0..1.0);
        let mass: f64 = self.probabilities.values().sum();
        if mass <= 0.0 {
            return match self.probabilities.keys().next() {
                Some(first) => first.clone(),
                None => FALLBACK_CODE.to_string(),
            };
        }

        // Walk the normalised cumulative weights until they reach the draw.
        let mut running = 0.0;
        let hit = self.probabilities.iter().find(|&(_, &weight)| {
            running += weight / mass;
            draw <= running
        });

        match hit {
            Some((code, _)) => code.clone(),
            // Floating-point rounding: return the last key.
            None => match self.probabilities.keys().next_back() {
                Some(last) => last.clone(),
                None => FALLBACK_CODE.to_string(),
            },
        }
    }

    /// A privacy-safe **generic** SAP document-type source mix for the default
    /// (no industry-priors) generation path.
    ///
    /// The head is a set of standard SAP FI/MM/SD document types with
    /// hand-chosen, plausible weights — **not** corpus-derived probabilities —
    /// so the emitted `source` column has realistic breadth instead of
    /// collapsing to the coarse `TransactionSource` enum. See
    /// experiments/ml/FINDINGS.md §6 for the source-mix gap that motivates
    /// this. A synthetic `Z001`..`Z500` long tail is added on top. Weights are
    /// relative; `sample` normalises by their sum.
    pub fn sap_default() -> Self {
        // "AB" (accounting/allocation doc) is intentionally omitted: it is
        // reserved for the SOTA-6 allocation-batch process so synthetic "AB"
        // JEs carry the corpus's large lines-per-JE (~52), not a small mix.
        const HEAD: &[(&str, f64)] = &[
            ("RV", 0.16),
            ("KR", 0.12),
            ("DR", 0.10),
            ("SA", 0.09),
            ("DZ", 0.08),
            ("KZ", 0.07),
            ("WE", 0.06),
            ("RE", 0.05),
            ("DG", 0.04),
            ("KG", 0.035),
            ("WA", 0.03),
            ("WL", 0.025),
            ("ZP", 0.02),
            ("SK", 0.018),
            ("AF", 0.015),
            ("AA", 0.012),
            ("ML", 0.010),
            ("PR", 0.008),
            ("RN", 0.007),
            ("WI", 0.006),
            ("AN", 0.005),
            ("UE", 0.004),
            ("ZV", 0.003),
            ("EU", 0.002),
        ];

        // Long tail (Lever 2): a power-law of synthetic SAP custom doc-type
        // codes (Z-prefixed, the SAP custom-type convention). Synthetic only,
        // never corpus-derived. The rare tail codes draw few events, which also
        // lifts the per-source inter-event-time variance (FINDINGS sec.6: IET
        // variance is coupled to source breadth). Each tail weight is
        // proportional to 1/rank^1.1 and the tail as a whole carries TAIL_MASS.
        //
        // v5.30 A3 (#150): TAIL_MASS reduced 0.30 → 0.15 to compress the
        // synthetic-source vocabulary toward the reference shard. v5.29
        // emitted 526 distinct sources vs reference ~287; this halves the
        // Z-tail mass, expected to drop synth source-cardinality to ~390-440
        // and tighten the Sajja P1 IET-distribution gap by ~10-15%. The
        // trade-off is some IET-variance lift (the SP3 design intent of the
        // Z-tail); documented in FINDINGS §6 update.
        const TAIL_N: usize = 500;
        const TAIL_MASS: f64 = 0.15;
        const TAIL_EXPONENT: f64 = 1.1;

        let mut probabilities: BTreeMap<String, f64> = HEAD
            .iter()
            .map(|&(code, weight)| (code.to_string(), weight))
            .collect();

        // Un-normalised power-law terms, one per rank, computed once and
        // reused for both the normaliser and the individual weights.
        let raw_terms: Vec<f64> = (1..=TAIL_N)
            .map(|rank| 1.0 / (rank as f64).powf(TAIL_EXPONENT))
            .collect();
        let zipf: f64 = raw_terms.iter().sum();
        for (idx, term) in raw_terms.iter().enumerate() {
            let rank = idx + 1;
            probabilities.insert(format!("Z{rank:03}"), TAIL_MASS * term / zipf);
        }

        Self {
            probabilities,
            other_fraction: 0.0,
            min_threshold: 0.0,
        }
    }
}

/// Per-source inter-event-time (IET) prior.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceIetPrior {
    /// IET summary per source code.
    pub by_source: BTreeMap<String, IetSummary>,
}

/// Inter-event-time summary for one source.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct IetSummary {
    // NOTE(review): presumably the number of observations behind this
    // summary — confirm against the extractor.
    pub n: usize,
    /// Empirical CDF of the inter-event times; the field name indicates the
    /// unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    /// Optional parametric log-normal fit; `None` when no fit is available.
    pub lognormal_fit: Option<LognormalParams>,
    // NOTE(review): presumably the lag-1 autocorrelation of successive
    // inter-event times — confirm against the extractor.
    pub lag1_autocorr: f64,
}

/// Bare log-normal parameter pair (used by `IetSummary::lognormal_fit`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalParams {
    // NOTE(review): presumably the mean of the underlying normal (log-space) — confirm.
    pub mu: f64,
    // NOTE(review): presumably the standard deviation of the underlying
    // normal (log-space) — confirm.
    pub sigma: f64,
}

/// Lines-per-journal-entry prior: one pooled histogram plus per-source ones.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LinesPerJePrior {
    /// Histogram pooled across all sources.
    pub overall: LineCountHistogram,
    /// Histogram per source code.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    // NOTE(review): presumably the minimum number of JEs a source needs before
    // it gets its own `by_source` entry — confirm against the extractor.
    pub min_jes_per_source: usize,
}

/// Active-lifetime prior: per-source histograms plus a pooled one.
// NOTE(review): the unit of the histogram buckets is not visible here — confirm.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveLifetimePrior {
    /// Histogram per source code.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    /// Histogram pooled across all sources.
    pub overall: LineCountHistogram,
}

/// Fan-out prior: one histogram per attribute.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct FanoutPrior {
    /// Histogram keyed by attribute name.
    // NOTE(review): which attributes are keyed and what the histogram counts
    // is not visible here — confirm against the extractor.
    pub by_attribute: BTreeMap<String, LineCountHistogram>,
}

/// Per-source posting-lag prior.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PostingLagPrior {
    /// Lag summary per source code.
    pub by_source: BTreeMap<String, LagSummary>,
}

/// Posting-lag summary for one source.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LagSummary {
    /// Empirical CDF of the lag; the field name indicates the unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    // NOTE(review): presumably the mean lag, in the same unit as the CDF — confirm.
    pub mean: f64,
    // NOTE(review): presumably the standard deviation of the lag — confirm.
    pub stddev: f64,
    // NOTE(review): presumably the number of observations behind this summary — confirm.
    pub n: usize,
}

// ---------------------------------------------------------------------------
// ManualSharePrior (Phase-2 R6)
// ---------------------------------------------------------------------------

/// Phase-2 R6 — Per-source share of manually-posted JE lines.
///
/// `by_source` maps a source code to the fraction of its rows whose
/// system/manual indicator marks a manual posting (0.0–1.0).  Sources below
/// the extractor's per-source observation gate contribute to `overall` only.
/// A raw cell counts as manual when it case-insensitively contains
/// `"manual"` — the same convention the audit-triage loader uses.
///
/// `share_for_source` resolves a source to its `by_source` entry and falls
/// back to `overall` otherwise.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ManualSharePrior {
    /// Manual share across all rows carrying a non-empty indicator.
    pub overall: f64,
    /// Per-source manual share for sources that met the observation gate.
    pub by_source: BTreeMap<String, f64>,
    /// Rows with a non-empty indicator (diagnostics + aggregation weight).
    /// `has_data` reports `true` exactly when this is non-zero.
    pub n_observations: usize,
}

impl ManualSharePrior {
    /// `true` when at least one indicator-bearing row was observed.
    pub fn has_data(&self) -> bool {
        self.n_observations != 0
    }
}

// ---------------------------------------------------------------------------
// ApproverPrior (Phase-2 R9)
// ---------------------------------------------------------------------------

/// Preparer/approver behaviour extracted from corpus enterer + approver columns
/// (GH #217 §D.2). Drives the generator's approver emission so SoD /
/// approval-chain detector arms have real signal on twins.
///
/// `share_for_source` resolves a source to its `by_source` entry and falls
/// back to `approval_share` otherwise.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ApproverPrior {
    /// Share of enterer-bearing rows that carry a non-empty approver.
    pub approval_share: f64,
    /// Among approved rows, the share where approver == enterer (the
    /// self-approval / SoD-violation rate).
    pub self_approval_rate: f64,
    /// Per-source approval share for sources that met the observation gate.
    pub by_source: BTreeMap<String, f64>,
    /// Enterer-bearing rows observed (diagnostics + aggregation weight).
    /// `has_data` reports `true` exactly when this is non-zero.
    pub n_observations: usize,
}

impl ApproverPrior {
    /// `true` when at least one enterer-bearing row was observed.
    pub fn has_data(&self) -> bool {
        self.n_observations != 0
    }

    /// Approval share recorded for `source`; sources without their own entry
    /// get the overall `approval_share`.
    pub fn share_for_source(&self, source: &str) -> f64 {
        match self.by_source.get(source) {
            Some(&share) => share,
            None => self.approval_share,
        }
    }
}

impl ManualSharePrior {
    /// Manual share recorded for `source`; sources without their own entry
    /// get the `overall` share.
    pub fn share_for_source(&self, source: &str) -> f64 {
        self.by_source
            .get(source)
            .map_or(self.overall, |share| *share)
    }
}

// ---------------------------------------------------------------------------
// MonthVolumePrior (N9 — month-volume seasonality)
// ---------------------------------------------------------------------------

/// Relative volume of amount-bearing JEs per calendar month, extracted from a
/// corpus GL (N9). Corpus books spread JE volume nearly flat across the year
/// (max/min ≈ 1.5), whereas the synthetic IET-walk date assignment front-loads
/// it (most volume in the first months). When present, the generator reshapes
/// the per-JE month marginal toward `shares` while preserving the within-source
/// IET/burst fine structure (order-preserving remap).
///
/// `Default` is implemented by hand and yields all-zero shares with `n = 0`,
/// for which `has_data()` is false.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct MonthVolumePrior {
    /// Relative share of amount-bearing JEs per calendar month, index 0 = Jan
    /// .. 11 = Dec. Sums to 1.0 when `has_data()` (renormalised on build).
    pub shares: [f64; 12],
    /// Amount-bearing JEs observed (diagnostics + aggregation weight).
    pub n: usize,
}

impl Default for MonthVolumePrior {
    fn default() -> Self {
        Self {
            shares: [0.0; 12],
            n: 0,
        }
    }
}

impl MonthVolumePrior {
    /// Whether the prior is usable: it must carry at least one observation
    /// and its month shares must sum to a positive value.
    pub fn has_data(&self) -> bool {
        if self.n == 0 {
            return false;
        }
        let mass: f64 = self.shares.iter().sum();
        mass > 0.0
    }

    /// Construct a prior from raw per-month counts (index 0 = Jan), dividing
    /// each count by the total to obtain shares. `n` is stored as the
    /// observation count. When the counts do not sum to a positive value, all
    /// shares are zero and `has_data()` is false.
    pub fn from_counts(counts: [f64; 12], n: usize) -> Self {
        let total: f64 = counts.iter().sum();
        if total > 0.0 {
            Self {
                shares: counts.map(|count| count / total),
                n,
            }
        } else {
            Self {
                shares: [0.0; 12],
                n,
            }
        }
    }

    /// Inverse CDF of `shares`: map a uniform quantile `u` in [0, 1) to a month
    /// index (0 = Jan .. 11 = Dec). `u` is clamped into [0, 1] first.
    ///
    /// The mapping is order-preserving — a larger `u` never lands in an
    /// earlier month — so applying it to the IET walk's monotone within-source
    /// position keeps event ordering (and thus the burst/autocorr structure)
    /// while reshaping the month marginal to `shares`.
    pub fn month_for_quantile(&self, u: f64) -> u32 {
        let u = u.clamp(0.0, 1.0);

        // First month whose cumulative share strictly exceeds `u`.
        let mut running = 0.0;
        let hit = self.shares.iter().position(|share| {
            running += share;
            u < running
        });

        match hit {
            Some(month) => month as u32,
            // No month exceeded `u` (u ~ 1.0 or rounding): take the last month
            // with a positive share, or December when every share is zero.
            None => match self.shares.iter().rposition(|s| *s > 0.0) {
                Some(month) => month as u32,
                None => 11,
            },
        }
    }
}

// ---------------------------------------------------------------------------
// ActiveSegmentsPrior (SP3.2)
// ---------------------------------------------------------------------------

/// SP3.2 — Per-Source distribution of active-segment count + length + gap.
///
/// The derived `Default` is an empty `by_source` map.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveSegmentsPrior {
    /// Source → segment summary for that source.
    /// NOTE(review): keys are presumably the same source codes used by the
    /// other per-source priors in this file — confirm against the extractor.
    pub by_source: BTreeMap<String, SourceSegmentSummary>,
}

/// Active-segment summary for a single source: three bucketed histograms
/// covering segment count, segment length and the gap between segments.
///
/// NOTE(review): the bucket grid behind each histogram is chosen by the
/// extractor, not here — presumably `SEGMENT_COUNT_BUCKETS` and
/// `SEGMENT_GAP_BUCKETS` for the first and last; confirm.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceSegmentSummary {
    /// Histogram of the number of active segments.
    pub segment_count_histogram: LineCountHistogram,
    /// Histogram of active-segment lengths (unit not visible here — TODO confirm).
    pub segment_length_histogram: LineCountHistogram,
    /// Histogram of gap lengths between active segments (presumably days,
    /// matching `SEGMENT_GAP_BUCKETS` — confirm).
    pub gap_length_histogram: LineCountHistogram,
}

/// Bucket grid for "how many active segments per Source per period".
/// Sorted ascending; presumably used as inclusive lower bounds with
/// `LineCountHistogram::build` (confirm at the call site).
pub const SEGMENT_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 6, 8, 12, 16, 24];

/// Bucket grid for "gap between active segments, in days".
/// Sorted ascending; presumably used as inclusive lower bounds with
/// `LineCountHistogram::build` (confirm at the call site).
pub const SEGMENT_GAP_BUCKETS: &[u32] = &[1, 2, 3, 7, 14, 30, 60, 90];

// ---------------------------------------------------------------------------
// EntityClustersPrior (SP3.3)
// ---------------------------------------------------------------------------

/// SP3.3 — Clusters of Sources that share attribute pools heavily.
///
/// The derived `Default` has no clusters and a `clustering_rate` of `0.0`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityClustersPrior {
    /// The extracted clusters (see `EntityCluster` for the per-cluster data).
    pub clusters: Vec<EntityCluster>,
    /// Fraction of Sources that landed in *some* cluster (vs. isolates).
    pub clustering_rate: f64,
}

// ---------------------------------------------------------------------------
// PerSourceAttributePrior + CategoricalDistribution (SP3.7)
// ---------------------------------------------------------------------------

/// SP3.7 — Per-source conditional distribution over downstream attributes
/// (GL account, cost center, profit center, ...).  When generation knows
/// the source code, the conditional CDF tells it which attribute values
/// are characteristic of that source.
///
/// Look up a single `(source, attribute)` conditional with
/// `PerSourceAttributePrior::conditional`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAttributePrior {
    /// Outer key: source code (e.g. "KR", "RV", "DZ").
    /// Inner key: attribute name ("gl_account", "cost_center", "profit_center").
    /// Value: categorical distribution over the observed attribute values
    ///        conditioned on `(source, attribute)`.
    pub by_source: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
    /// Configured minimum number of observations per (source, attribute) pair
    /// before a conditional is retained. Smaller groups are dropped to avoid
    /// over-fitting low-volume sources.
    /// Stored for reference only: nothing in this type enforces it, so the
    /// filtering presumably happens at extraction time (confirm).
    pub min_observations: usize,
}

impl PerSourceAttributePrior {
    /// Fetch the distribution stored for the `(source, attribute)` pair.
    ///
    /// Yields `None` when either the source or the attribute is missing from
    /// the prior (e.g. the source had too few observations during extraction).
    pub fn conditional(&self, source: &str, attribute: &str) -> Option<&CategoricalDistribution> {
        self.by_source
            .get(source)
            .and_then(|per_attribute| per_attribute.get(attribute))
    }
}

// ---------------------------------------------------------------------------
// PerSourceRolePrior (SP4.6)
// ---------------------------------------------------------------------------

/// SP4.6 — Per-(source, line_role) GL account conditional.
///
/// Captures the observation that each SAP document type has a canonical
/// line-role structure:
/// - KR (vendor invoice): DR → expense (5/6xxx),  CR → AP (2xxx)
/// - RV (customer invoice): DR → AR (1xxx),       CR → revenue (4xxx)
/// - DZ (customer payment): DR → Bank (1xxx),     CR → AR (1xxx)
/// - KZ (vendor payment):  DR → AP (2xxx),        CR → Bank (1xxx)
/// - WE (goods receipt):   DR → Inventory (1xxx), CR → GR/IR (2xxx)
/// - SA (manual journal): weak role bias — distribution reflects corpus mix.
///
/// `role` is `"DR"` for debit-dominant lines, `"CR"` for credit-dominant lines.
///
/// Look up a single `(source, role)` conditional with
/// `PerSourceRolePrior::conditional`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceRolePrior {
    /// `(source → role → distribution over GL account values)`.
    /// Inner role key is `"DR"` or `"CR"`.
    pub by_source_and_role: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
}

impl PerSourceRolePrior {
    /// Fetch the GL-account distribution stored for the `(source, role)` pair.
    ///
    /// Yields `None` when the source or the role is not represented (e.g. too
    /// few observations during extraction, or an old bundle without SP4.6
    /// data). Callers fall back to the `per_source_attribute` marginal or to a
    /// default GL.
    pub fn conditional(&self, source: &str, role: &str) -> Option<&CategoricalDistribution> {
        let per_role = self.by_source_and_role.get(source)?;
        per_role.get(role)
    }
}

// ---------------------------------------------------------------------------
// PerSourceFlowPairPrior (SP4.8)
// ---------------------------------------------------------------------------

/// SP4.8 — Per-source joint (debit-class, credit-class) flow-pair prior.
///
/// Where `PerSourceRolePrior` stores the per-(source, role) account MARGINALS,
/// this prior stores the within-JE JOINT distribution of (debit account class,
/// credit account class) pairs per source — the account-flow edges a JE draws.
/// Classes are leading-digit canonicalised (digits-only, first `granularity`
/// characters) so a synthetic chart and a client chart share a vocabulary.
///
/// Pair keys are encoded by `PerSourceFlowPairPrior::pair_key` and decoded by
/// `PerSourceFlowPairPrior::split_pair_key`; classes are derived with
/// `PerSourceFlowPairPrior::account_class`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceFlowPairPrior {
    /// `source → distribution over "<dr>|<cr>" class-pair keys`.
    pub by_source: BTreeMap<String, CategoricalDistribution>,
    /// Number of leading digits per class key (1 in shipped bundles).
    /// The derived `Default` leaves this at 0; `account_class` treats any
    /// granularity below 1 as 1.
    pub granularity: usize,
}

impl PerSourceFlowPairPrior {
    /// Build the categorical key `"<dr>|<cr>"` for a (debit class, credit
    /// class) pair.
    pub fn pair_key(dr: &str, cr: &str) -> String {
        [dr, cr].join("|")
    }

    /// Split a categorical key at its first `'|'` into (debit class, credit
    /// class). Yields `None` when there is no separator or either side is
    /// empty.
    pub fn split_pair_key(key: &str) -> Option<(String, String)> {
        match key.split_once('|') {
            Some((debit, credit)) if !debit.is_empty() && !credit.is_empty() => {
                Some((debit.to_owned(), credit.to_owned()))
            }
            _ => None,
        }
    }

    /// Class of an account number: its first `granularity` ASCII digits, with
    /// every non-digit character ignored. A granularity below 1 is treated as
    /// 1. Yields `None` when the account carries no ASCII digits (mirrors the
    /// relational-fidelity measure).
    pub fn account_class(account: &str, granularity: usize) -> Option<String> {
        let width = granularity.max(1);
        let class: String = account
            .chars()
            .filter(char::is_ascii_digit)
            .take(width)
            .collect();
        (!class.is_empty()).then_some(class)
    }

    /// Pair distribution recorded for `source`; `None` means the source was
    /// unseen or too sparse.
    pub fn pairs(&self, source: &str) -> Option<&CategoricalDistribution> {
        self.by_source.get(source)
    }
}

// ---------------------------------------------------------------------------
// TbAnchorPrior (SP4.1)
// ---------------------------------------------------------------------------

/// SP4.1 — Trial-balance anchor prior.
///
/// Extracted from real `TB_XXX.parquet` files.  Each entry represents the
/// industry-median opening and closing balance for a GL account number.
/// The generator's balance tracker uses these values in target-aware mode
/// to emit drift-correction entries that keep the synthetic balance sheet
/// shaped like a corpus balance sheet.
///
/// **Schema note**: real TB files carry `Functional Beginning Balance` and
/// `Functional Ending Balance` columns but no explicit period-debit/credit
/// columns — those are derived as the difference between balances.
///
/// The derived `Default` is an empty anchor (no accounts, zero totals);
/// `TbAnchorPrior::has_data` tells an empty anchor from a populated one.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbAnchorPrior {
    /// Per-account TB target.  Keyed by GL account number (normalised to
    /// the same format as the rest of the bundle, e.g. leading-zero-padded
    /// 10-digit strings for the current health corpus).
    pub per_account: BTreeMap<String, TbTarget>,
    /// Industry-median total assets (sum of asset-class closing balances).
    /// Used for aggregate drift-tolerance checks.
    pub total_assets: f64,
    /// Industry-median total liabilities (sum of liability-class closing balances).
    pub total_liabilities: f64,
    /// Industry-median total equity (sum of equity-class closing balances).
    pub total_equity: f64,
    /// Number of clients that contributed to this anchor.
    pub n_clients: usize,
}

impl TbAnchorPrior {
    /// Returns `true` when at least one account target carries a closing or
    /// opening balance whose magnitude exceeds `1e-9` — i.e. the prior was
    /// built from real TB data rather than left at its default.
    pub fn has_data(&self) -> bool {
        const EPS: f64 = 1e-9;
        for target in self.per_account.values() {
            if target.closing_balance.abs() > EPS || target.opening_balance.abs() > EPS {
                return true;
            }
        }
        false
    }
}

/// Target balance for a single GL account, representing the industry-median
/// values extracted from real `TB_XXX.parquet` files.
///
/// The derived `Default` sets every field to zero.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbTarget {
    /// Industry-median opening balance for this account.
    pub opening_balance: f64,
    /// Industry-median closing balance for this account.
    pub closing_balance: f64,
    /// Derived period net activity (closing − opening), positive = net debit.
    /// Pre-computed at extraction time for convenience.
    /// Stored independently: nothing in this type keeps it in sync with the
    /// two balance fields.
    pub period_net_activity: f64,
    /// Standard deviation of opening balance across contributing clients
    /// (in their respective functional currencies, normalised by client-total-assets).
    /// Smaller stdev = tighter target.  Zero when only one client contributed.
    pub opening_stdev: f64,
    /// Standard deviation of closing balance across contributing clients.
    /// NOTE(review): presumably normalised like `opening_stdev` — confirm.
    pub closing_stdev: f64,
    /// Number of clients in which this account appeared.
    pub n_clients: usize,
}

/// A categorical distribution stored as a sparse `value → probability` map.
/// Used by SP3.7 for per-source attribute conditionals, and also by the
/// SP4.6 per-(source, role) and SP4.8 flow-pair priors in this file.
///
/// The derived `Default` is the empty distribution (no values, `n == 0`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CategoricalDistribution {
    /// Value (e.g. an account number) → probability mass.  Probabilities
    /// sum to 1.0 (within float tolerance).
    /// Stored in a `BTreeMap`, so iteration follows sorted key order.
    pub probabilities: BTreeMap<String, f64>,
    /// Total observations underpinning this distribution.
    pub n: usize,
}

impl CategoricalDistribution {
    /// Construct from raw counts, normalising each count by the grand total.
    ///
    /// Returns the empty default (no entries, `n == 0`) when the counts sum
    /// to zero, so callers can detect "no data" through an empty map.
    pub fn from_counts(counts: BTreeMap<String, usize>) -> Self {
        let n: usize = counts.values().sum();
        if n == 0 {
            return Self::default();
        }
        let total = n as f64;
        let probabilities = counts
            .into_iter()
            .map(|(value, count)| (value, count as f64 / total))
            .collect();
        Self { probabilities, n }
    }

    /// Draw a value from the distribution.
    ///
    /// Only entries with a strictly positive mass can be drawn; masses are
    /// rescaled by their sum, so the stored probabilities need not add up to
    /// exactly 1.0. Entries are walked in sorted key order and exactly one
    /// uniform draw is taken from `rng`.
    ///
    /// Returns `None` when the distribution is empty, carries no positive
    /// mass, or its total mass is not finite (caller falls back to the
    /// marginal or to a default attribute value).
    pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> Option<String> {
        // Zero, negative and NaN masses are excluded from the total: they must
        // never be selectable and would corrupt the cumulative walk below.
        let total: f64 = self.probabilities.values().filter(|p| **p > 0.0).sum();
        if !total.is_finite() || total <= 0.0 {
            return None;
        }
        let r: f64 = rng.random_range(0.0..1.0);
        let mut cum = 0.0;
        let mut last_positive: Option<&String> = None;
        for (value, &p) in &self.probabilities {
            if p.is_nan() || p <= 0.0 {
                continue;
            }
            cum += p / total;
            // Strict comparison: `r` lies in [0, 1), so `r < cum` gives each
            // entry exactly its own share and an `r` of exactly 0.0 cannot
            // land on an entry that has no mass.
            if r < cum {
                return Some(value.clone());
            }
            last_positive = Some(value);
        }
        // Rounding left `cum` marginally below `r`: fall back to the last
        // entry that actually carries mass rather than the last key.
        last_positive.cloned()
    }
}

/// One cluster inside an `EntityClustersPrior`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityCluster {
    /// Identifiers of the cluster members.
    /// NOTE(review): presumably Source codes, per the `EntityClustersPrior`
    /// description — confirm against the extractor.
    pub members: Vec<String>,
    /// NOTE(review): presumably the average pairwise Jaccard similarity
    /// between members' attribute pools — confirm how it is computed.
    pub avg_jaccard: f64,
}

// ---------------------------------------------------------------------------
// CoaSemanticPrior (SP4.2)
// ---------------------------------------------------------------------------

/// SP4.2 — Per-account semantic content extracted from real CoA files.
/// Keyed by canonical GL account number (matching values in the per-source
/// attribute conditional). When `Some` on the parent `BehavioralPriors`, the
/// CoA generator emits real account names + descriptions + ISO 21378 hierarchy
/// from this prior instead of generic constants.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CoaSemanticPrior {
    /// GL account number → semantic metadata for that account.
    pub accounts: std::collections::BTreeMap<String, AccountSemantic>,
}

/// Semantic metadata for a single GL account, sourced from a real CoA file.
///
/// Only `description` is always serialised. Every `Option` field is omitted
/// from the serialised form when `None` and defaults to `None` when the key
/// is absent on input.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct AccountSemantic {
    /// Human-readable account description (e.g. "Kasse", "Bankkonto CHF 1").
    pub description: String,
    /// Account type as reported in the source file (e.g. "Assets", "Liabilities").
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_type: Option<String>,
    /// ISO 21378 Level-2 class code or label (e.g. "C _ Cash", "01_Cash and cash equivalents").
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class: Option<String>,
    /// Human-readable name for `account_class`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class_name: Option<String>,
    /// ISO 21378 Level-3 sub-class code or label.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class: Option<String>,
    /// Human-readable name for `account_sub_class`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class_name: Option<String>,
    /// Parent account number in hierarchy (if available).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub parent_account: Option<String>,
}

// ---------------------------------------------------------------------------
// ReferenceFormatPrior (SP4.7)
// ---------------------------------------------------------------------------

/// SP4.7 — Per-source reference-string format templates extracted from real
/// corpus GL data.  Templates use placeholders for digit and alphabetic runs;
/// fixed punctuation / hyphens are preserved verbatim.
///
/// Example templates:
/// - `"{4 digits}-{4 digits}-{10 digits}"` (JE_3 client: all sources)
/// - `"RE-{4 digits}-{6 digits}"` (hypothetical KR/vendor-invoice client)
///
/// Only templates observed ≥ `min_occurrences` times (per client) and the top-10
/// by frequency per source are retained — preventing PII leakage.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ReferenceFormatPrior {
    /// Source → retained templates for that source.
    /// NOTE(review): the ordering of each list (e.g. by descending
    /// probability) is not established here — confirm before relying on it.
    pub by_source: BTreeMap<String, Vec<ReferenceTemplate>>,
}

/// A single reference-format template observed in corpus data.
///
/// The derived `Default` has an empty template, zero probability and an
/// empty example.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ReferenceTemplate {
    /// Template string with `{N digits}` / `{N alpha}` placeholders.
    /// Fixed characters (hyphens, slashes, dots) are preserved verbatim.
    /// Example: `"{4 digits}-{4 digits}-{10 digits}"`
    pub template: String,
    /// Fraction of references for this source that match this template
    /// (after frequency filtering and renormalisation).
    pub probability: f64,
    /// A concrete example value from the corpus (for debugging / audit).
    /// NOTE(review): this is a raw corpus value serialised with the bundle —
    /// confirm it is acceptable under the PII constraints the templates are
    /// filtered for.
    pub example: String,
}

// ---------------------------------------------------------------------------
// UserPersonaPrior (SP4.5)
// ---------------------------------------------------------------------------

/// SP4.5 — Per-user behavioral patterns extracted from a corpus.
///
/// Each user has a characteristic source mix (AP clerks post mostly KR/KZ,
/// AR clerks RV/DZ, GL/closing staff SA), an hourly density, and a weekday
/// distribution.
///
/// **Corpus note**: The corpus GL files in this project carry no user column
/// (all 45 JE parquet files were examined; none had a Created-By field).
/// Extraction is therefore stubbed — `extract_user_personas` in
/// `user_extractor.rs` returns an empty `UserPersonaPrior`.  When a future data
/// delivery includes per-user columns, the extractor can be filled in without
/// any schema changes here.  Generators gate on `UserPersonaPrior::has_data()`
/// and fall back to the internal user pool when the prior is empty.
///
/// Sampling entry points: `UserPersonaPrior::sample_user_for_source` and
/// `UserPersonaPrior::sample_timestamp_for_user`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct UserPersonaPrior {
    /// User-ID → behavior.  Empty when no user-column data was available.
    pub users: BTreeMap<String, UserBehavior>,
    /// Distribution of distinct active-user count per generation run.
    /// Populated only when extraction had real data.
    pub user_count_distribution: LineCountHistogram,
}

impl UserPersonaPrior {
    /// Returns `true` when the prior carries at least one user with a non-empty
    /// source mix and positive volume share (i.e. built from real user-column data).
    pub fn has_data(&self) -> bool {
        self.users
            .values()
            .any(|u| !u.source_mix.is_empty() && u.volume_share > 0.0)
    }

    /// Sample a user ID likely to post the given `source` code.
    ///
    /// Weights each user by `source_mix[source] × volume_share`; users are
    /// considered in sorted user-ID order and exactly one uniform draw is
    /// taken from `rng` when a candidate exists.
    ///
    /// Returns `None` when the prior is empty, when no user has a positive
    /// weight for `source`, or when the weights do not sum to a finite value
    /// (caller falls back to the generic user pool).
    pub fn sample_user_for_source<R: rand::Rng>(
        &self,
        source: &str,
        rng: &mut R,
    ) -> Option<String> {
        let (user_ids, weights): (Vec<&String>, Vec<f64>) = self
            .users
            .iter()
            .filter_map(|(uid, beh)| {
                let mix = beh.source_mix.get(source).copied().unwrap_or(0.0);
                let w = mix * beh.volume_share;
                (w > 0.0).then_some((uid, w))
            })
            .unzip();
        let idx = Self::sample_weighted_index(&weights, rng)?;
        user_ids.get(idx).map(|uid| (*uid).clone())
    }

    /// Given a `user_id`, sample an `(hour, weekday)` pair from the user's
    /// density arrays.  `hour` ∈ 0..24, `weekday` ∈ 0..7 (Monday = 0).
    ///
    /// The densities need not be normalised: each draw is scaled by the sum
    /// of the positive entries. The hour is drawn before the weekday.
    ///
    /// Returns `None` when the user is unknown, or when either the hourly or
    /// the weekday density carries no positive (finite) mass.
    pub fn sample_timestamp_for_user<R: rand::Rng>(
        &self,
        user_id: &str,
        rng: &mut R,
    ) -> Option<(u32, u32)> {
        let beh = self.users.get(user_id)?;
        let hour = Self::sample_weighted_index(&beh.hourly_density, rng)?;
        let weekday = Self::sample_weighted_index(&beh.weekday_density, rng)?;
        // Indices are bounded by the array lengths (24 and 7), so the casts
        // cannot truncate.
        Some((hour as u32, weekday as u32))
    }

    /// Inverse-CDF draw of an index from unnormalised `weights`.
    ///
    /// Only strictly positive entries are selectable. Takes exactly one
    /// uniform draw from `rng`, and none at all when it returns `None`
    /// (no positive mass, or a non-finite total that `random_range` would
    /// reject).
    fn sample_weighted_index<R: rand::Rng>(weights: &[f64], rng: &mut R) -> Option<usize> {
        use rand::RngExt;
        let total: f64 = weights.iter().filter(|w| **w > 0.0).sum();
        if !total.is_finite() || total <= 0.0 {
            return None;
        }
        let r: f64 = rng.random_range(0.0..total);
        let mut cum = 0.0;
        let mut last_positive = None;
        for (i, &w) in weights.iter().enumerate() {
            if w.is_nan() || w <= 0.0 {
                continue;
            }
            cum += w;
            // Strict comparison: `r` lies in [0, total), so an `r` of exactly
            // 0.0 must land on the first entry that carries mass.
            if r < cum {
                return Some(i);
            }
            last_positive = Some(i);
        }
        // Unreachable for well-formed input; kept so rounding can only ever
        // select an entry that carries mass.
        last_positive
    }
}

/// Behavioral fingerprint for a single corpus user.
///
/// `Default` is implemented by hand further down: an empty source mix,
/// all-zero densities and a zero volume share.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct UserBehavior {
    /// Source code → probability mass.  Sums to 1.0 (within float tolerance).
    /// Example: AP clerk → `{"KR": 0.6, "KZ": 0.3, "RE": 0.1}`.
    pub source_mix: BTreeMap<String, f64>,
    /// Hour-of-day (0 = midnight … 23 = 23:xx) → probability mass.
    /// Sums to 1.0.  All zeros when `created_at` was unavailable.
    pub hourly_density: [f64; 24],
    /// Day-of-week (0 = Monday … 6 = Sunday) → probability mass.  Sums to 1.0.
    pub weekday_density: [f64; 7],
    /// Fraction of total rows in the industry pool attributed to this user.
    pub volume_share: f64,
}

impl Default for UserBehavior {
    /// A user with no recorded behaviour: empty source mix, all-zero hourly
    /// and weekday densities, and a zero volume share.
    fn default() -> Self {
        let hourly_density = [0.0; 24];
        let weekday_density = [0.0; 7];
        Self {
            source_mix: BTreeMap::default(),
            hourly_density,
            weekday_density,
            volume_share: 0.0,
        }
    }
}

// ---------------------------------------------------------------------------
// LineCountHistogram
// ---------------------------------------------------------------------------

/// Bucket histogram for counts such as lines-per-JE, fan-out, and active lifetime.
///
/// The derived `Default` is an empty histogram (no buckets, `n == 0`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LineCountHistogram {
    /// Bucket grid: inclusive lower bounds, sorted ascending (as required by
    /// `LineCountHistogram::build`).
    pub buckets: Vec<u32>,
    /// Probability mass per bucket, parallel to `buckets`. `build` produces
    /// all zeros when no input value fell on the grid.
    pub probabilities: Vec<f64>,
    /// Number of input values the histogram was built from. `build` sets this
    /// to the full input length, including values dropped for lying below the
    /// first bucket.
    pub n: usize,
}

impl LineCountHistogram {
    /// Build a histogram on the given inclusive-lower-bound bucket grid.
    ///
    /// The bucket grid must be sorted ascending. Values >= `buckets.last()`
    /// fall into the last bucket. Values < `buckets[0]` are dropped (count
    /// returned for diagnostic use).
    ///
    /// The returned histogram's `n` is `values.len()`, i.e. it still counts
    /// the dropped values. When no value lands on the grid every probability
    /// is zero.
    ///
    /// # Panics
    ///
    /// Panics when `buckets` is empty.
    pub fn build(values: &[u32], buckets: &[u32]) -> (Self, usize) {
        assert!(!buckets.is_empty(), "buckets must not be empty");
        let lowest = buckets[0];
        let mut counts = vec![0u64; buckets.len()];
        let mut dropped = 0usize;
        for &v in values {
            if v < lowest {
                dropped += 1;
                continue;
            }
            counts[bucket_index(buckets, v)] += 1;
        }
        let total: u64 = counts.iter().sum();
        let probabilities = if total == 0 {
            vec![0.0; buckets.len()]
        } else {
            counts.iter().map(|&c| c as f64 / total as f64).collect()
        };
        (
            Self {
                buckets: buckets.to_vec(),
                probabilities,
                n: values.len(),
            },
            dropped,
        )
    }

    /// Sum bucket counts of `self` and `other` (same bucket grid) and
    /// renormalise. Returns `None` if grids mismatch.
    ///
    /// Bucket counts are reconstructed as `probability × n`. A histogram whose
    /// probabilities carry no positive mass (e.g. every input value was
    /// dropped by `build`) contributes no counts, so it is given zero weight
    /// instead of diluting the pooled probabilities below a sum of 1. The
    /// pooled `n` is always `self.n + other.n`.
    ///
    /// Because `n` also counts dropped values, the weights are approximate
    /// for histograms built from inputs with partially dropped values.
    pub fn pool(&self, other: &Self) -> Option<Self> {
        if self.buckets != other.buckets {
            return None;
        }
        // Observation count that actually backs a histogram's bucket mass.
        let effective_n = |h: &Self| -> usize {
            if h.probabilities.iter().any(|&p| p > 0.0) {
                h.n
            } else {
                0
            }
        };
        let (n_a, n_b) = (effective_n(self), effective_n(other));
        let n = self.n + other.n;
        let weight_total = n_a + n_b;
        if weight_total == 0 {
            return Some(Self {
                buckets: self.buckets.clone(),
                probabilities: vec![0.0; self.buckets.len()],
                n,
            });
        }
        let (w_a, w_b, w_total) = (n_a as f64, n_b as f64, weight_total as f64);
        let probabilities: Vec<f64> = self
            .probabilities
            .iter()
            .zip(other.probabilities.iter())
            .map(|(&pa, &pb)| (pa * w_a + pb * w_b) / w_total)
            .collect();
        Some(Self {
            buckets: self.buckets.clone(),
            probabilities,
            n,
        })
    }

    /// Median bucket — the smallest `buckets[i]` whose cumulative probability >= 0.5.
    ///
    /// Falls back to the last bucket when the cumulative probability never
    /// reaches 0.5, and to 0 when the histogram has no buckets. Probabilities
    /// beyond the end of `buckets` are ignored rather than indexed.
    pub fn median_bucket(&self) -> u32 {
        let mut cum = 0.0;
        for (&bucket, &p) in self.buckets.iter().zip(&self.probabilities) {
            cum += p;
            if cum >= 0.5 {
                return bucket;
            }
        }
        self.buckets.last().copied().unwrap_or(0)
    }

    /// Sample a count from the histogram. Picks a bucket weighted by
    /// probability mass, then samples uniformly within the bucket (between
    /// `buckets[i]` inclusive and `buckets[i+1]` exclusive — for the last
    /// bucket, returns `buckets[i]`).
    ///
    /// Only buckets with strictly positive mass can be picked. When rounding
    /// leaves the cumulative mass below the draw, the last bucket that carries
    /// mass is used; when no bucket carries mass at all, the last bucket is
    /// used. Returns 0 for a histogram without buckets (no draw is taken).
    pub fn sample_bucket<R: rand::Rng>(&self, rng: &mut R) -> u32 {
        if self.buckets.is_empty() {
            return 0;
        }
        let r: f64 = rng.random_range(0.0..1.0);
        let mut cum = 0.0;
        // Default for a histogram without any positive mass.
        let mut chosen_idx = self.buckets.len() - 1;
        // `take` keeps the index inside `buckets` even if `probabilities`
        // is longer (e.g. a malformed deserialised bundle).
        for (i, &p) in self
            .probabilities
            .iter()
            .take(self.buckets.len())
            .enumerate()
        {
            if p.is_nan() || p <= 0.0 {
                continue;
            }
            cum += p;
            chosen_idx = i;
            // Strict comparison: `r` lies in [0, 1), so an `r` of exactly 0.0
            // cannot select a leading bucket that has no mass.
            if r < cum {
                break;
            }
        }
        let lo = self.buckets[chosen_idx];
        let hi = self.buckets.get(chosen_idx + 1).copied().unwrap_or(lo);
        if hi <= lo {
            lo
        } else {
            rng.random_range(lo..hi)
        }
    }
}

/// Index of the bucket that `v` belongs to on an ascending,
/// inclusive-lower-bound grid.
///
/// An exact hit returns that bucket's index; otherwise the bucket just below
/// the insertion point is returned, saturating at index 0 for values below
/// the first bucket (and for an empty grid).
fn bucket_index(buckets: &[u32], v: u32) -> usize {
    buckets
        .binary_search(&v)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1))
}

/// Canonical bucket grid for line counts (lines-per-JE, fan-out).
/// Sorted ascending; with `LineCountHistogram::build` each entry is an
/// inclusive lower bound and values below 1 are dropped.
pub const LINE_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 5, 6, 8, 10, 16, 32, 64, 128, 256, 1024];

/// Canonical bucket grid for active-lifetime days.
/// Sorted ascending and starting at 0, so `LineCountHistogram::build` drops
/// no value on this grid.
pub const ACTIVE_LIFETIME_DAY_BUCKETS: &[u32] = &[0, 1, 7, 30, 90, 180, 365, 730, 1825];

/// Canonical bucket grid for fan-out values.
/// Sorted ascending; with `LineCountHistogram::build` each entry is an
/// inclusive lower bound and values below 1 are dropped.
pub const FANOUT_BUCKETS: &[u32] = &[1, 2, 3, 5, 8, 16, 32, 64, 128, 256, 1024];

#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    #[test]
    fn sap_default_has_broad_long_tail() {
        // Lever 2: the default source-mix is the standard head plus a synthetic
        // power-law long tail, giving corpus-like breadth + entropy.
        let m = SourceMixPrior::sap_default();
        let p = &m.probabilities;
        assert!(
            p.len() >= 300,
            "Lever-2 default should carry a long tail, got {} codes",
            p.len()
        );
        let total: f64 = p.values().sum();
        let ent: f64 = -p
            .values()
            .map(|&w| {
                let q = w / total;
                if q > 0.0 {
                    q * q.ln()
                } else {
                    0.0
                }
            })
            .sum::<f64>();
        assert!(
            ent > 3.0,
            "Lever-2 default entropy should exceed 3.0, got {ent:.3}"
        );
        assert!(p.contains_key("RV"), "standard head code present");
        assert!(p.contains_key("Z001"), "synthetic tail code present");
        let mut rng = ChaCha8Rng::seed_from_u64(1);
        for _ in 0..50 {
            assert!(p.contains_key(&m.sample(&mut rng)));
        }
    }

    #[test]
    fn line_count_histogram_build_basic() {
        let values = vec![1, 1, 2, 3, 5, 5, 5, 32, 200];
        let (hist, dropped) = LineCountHistogram::build(&values, LINE_COUNT_BUCKETS);
        assert_eq!(dropped, 0);
        assert_eq!(hist.n, 9);
        assert!((hist.probabilities.iter().sum::<f64>() - 1.0).abs() < 1e-9);
    }

    #[test]
    fn line_count_histogram_drops_below_min() {
        let values = vec![0, 0, 1, 2];
        let (hist, dropped) = LineCountHistogram::build(&values, &[1, 2, 4]);
        assert_eq!(dropped, 2);
        assert_eq!(hist.n, 4);
        assert!((hist.probabilities[0] - 0.5).abs() < 1e-9);
    }

    #[test]
    fn sample_bucket_respects_probabilities() {
        let h = LineCountHistogram {
            buckets: vec![1, 2, 4, 8],
            probabilities: vec![0.0, 0.0, 1.0, 0.0],
            n: 100,
        };
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        for _ in 0..50 {
            let s = h.sample_bucket(&mut rng);
            assert!((4..8).contains(&s), "expected sample in [4,8), got {s}");
        }
    }

    #[test]
    fn empirical_cdf_from_sorted_values() {
        let cdf = EmpiricalCdf::from_sorted_values("test", vec![1.0, 2.0, 3.0]);
        assert_eq!(cdf.values.len(), 3);
        assert!((cdf.probabilities[2] - 1.0).abs() < 1e-9);
    }

    #[test]
    fn active_segments_prior_default_round_trips() {
        let p = ActiveSegmentsPrior::default();
        let json = serde_json::to_string(&p).expect("serialize");
        let back: ActiveSegmentsPrior = serde_json::from_str(&json).expect("deserialize");
        assert!(back.by_source.is_empty());
    }

    #[test]
    fn behavioral_priors_active_segments_optional_round_trip() {
        // Construct a minimal BehavioralPriors with active_segments=Some(...)
        // and round-trip.
        let bp = BehavioralPriors {
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: "test".to_string(),
            industry: "test".to_string(),
            n_client_inputs: 0,
            n_rows_aggregated: 0,
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),
            posting_lag: None,
            month_volume: None,
            active_segments: Some(ActiveSegmentsPrior::default()),
            entity_clusters: None,
            per_source_attribute: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_amount_conditionals: None,
            source_role_gl_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            tb_anchor: None,
            manual_share: None,
        };
        let json = serde_json::to_string(&bp).expect("serialize");
        let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
        assert!(back.active_segments.is_some());
    }

    #[test]
    fn behavioral_priors_legacy_round_trips_without_active_segments() {
        // Old JSON (no active_segments field) should deserialise as None.
        let legacy = r#"{
            "schema_version": 1,
            "generator_version": "5.12.0",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;
        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
        assert!(bp.active_segments.is_none());
        assert!(bp.posting_lag.is_none());
    }

    /// JE-share mix is optional on the bundle:
    ///
    /// * a legacy bundle with no `source_mix_je` key parses with the field set
    ///   to `None` (the generator then keeps drawing per-JE sources from the
    ///   line-share mix),
    /// * a populated JE-share mix survives a JSON round trip, and
    /// * a `None` mix emits no `source_mix_je` key when serialised.
    #[test]
    fn behavioral_priors_source_mix_je_optional_round_trip() {
        let legacy_json = r#"{
            "schema_version": 1,
            "generator_version": "5.35.2",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;
        let legacy_bundle =
            serde_json::from_str::<BehavioralPriors>(legacy_json).expect("legacy parse");
        assert!(
            legacy_bundle.source_mix_je.is_none(),
            "legacy bundles must default source_mix_je to None"
        );

        // Populated case: attach per-JE shares to a copy of the legacy bundle
        // and push it through serialise → deserialise.
        let je_mix = SourceMixPrior {
            probabilities: BTreeMap::from([("CO".to_string(), 0.28), ("RR".to_string(), 0.72)]),
            other_fraction: 0.0,
            min_threshold: 0.005,
        };
        let populated = BehavioralPriors {
            source_mix_je: Some(je_mix),
            ..legacy_bundle.clone()
        };
        let encoded = serde_json::to_string(&populated).expect("serialize");
        let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
        let decoded_mix = decoded.source_mix_je.expect("populated mix survives");
        assert!((decoded_mix.probabilities["CO"] - 0.28).abs() < 1e-12);

        // Unpopulated case: the key must not appear in the output at all.
        let encoded_without_mix = serde_json::to_string(&legacy_bundle).expect("serialize none");
        assert!(
            !encoded_without_mix.contains("source_mix_je"),
            "None must serialise to no key (old-bundle byte shape preserved)"
        );
    }

    /// A default `EntityClustersPrior` comes back from a JSON round trip with
    /// no clusters and a clustering rate of (numerically) zero.
    #[test]
    fn entity_clusters_prior_default_round_trips() {
        let encoded = serde_json::to_string(&EntityClustersPrior::default()).expect("serialize");
        let decoded = serde_json::from_str::<EntityClustersPrior>(&encoded).expect("deserialize");

        assert!(decoded.clusters.is_empty());
        assert!(decoded.clustering_rate.abs() < 1e-9);
    }

    /// `CategoricalDistribution::from_counts` is expected to record the total
    /// count in `n` and per-category shares in `probabilities`, and `sample`
    /// is expected to draw categories in proportion to those shares.
    ///
    /// The RNG is seeded, so the sampled frequencies are reproducible.
    #[test]
    fn categorical_distribution_samples_with_correct_weights() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        /// Number of samples drawn from the distribution.
        const DRAWS: usize = 10_000;
        /// Expected number of "A" draws: `DRAWS * 0.7`.
        const EXPECTED_A: i64 = 7000;
        /// Accepted absolute deviation from `EXPECTED_A`.
        const TOLERANCE: i64 = 200;

        let mut counts = BTreeMap::new();
        counts.insert("A".to_string(), 700);
        counts.insert("B".to_string(), 200);
        counts.insert("C".to_string(), 100);
        let dist = CategoricalDistribution::from_counts(counts);

        assert_eq!(dist.n, 1000);
        assert!((dist.probabilities["A"] - 0.7).abs() < 1e-9);

        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut buckets = BTreeMap::new();
        for _ in 0..DRAWS {
            let v = dist.sample(&mut rng).expect("non-empty");
            *buckets.entry(v).or_insert(0) += 1;
        }

        // ~70% of the draws should be "A". One binomial standard deviation is
        // sqrt(10_000 * 0.7 * 0.3) ≈ 45.8, so the ±200 window is ≈ 4.4σ: wide
        // enough to be robust to the choice of seed, narrow enough to catch a
        // mis-weighted sampler.
        let a_count = buckets.get("A").copied().unwrap_or(0);
        assert!(
            (a_count as i64 - EXPECTED_A).abs() < TOLERANCE,
            "got {} A samples",
            a_count
        );
    }

    /// `PerSourceAttributePrior::conditional(source, attribute)` yields a
    /// distribution only when both the source and the attribute are present.
    #[test]
    fn per_source_attribute_prior_conditional_lookup() {
        let gl_accounts = CategoricalDistribution {
            probabilities: BTreeMap::from([
                ("200001".to_string(), 0.9),
                ("200002".to_string(), 0.1),
            ]),
            n: 100,
        };
        let kr_attributes = BTreeMap::from([("gl_account".to_string(), gl_accounts)]);
        let prior = PerSourceAttributePrior {
            by_source: BTreeMap::from([("KR".to_string(), kr_attributes)]),
            min_observations: 10,
        };

        // Known source + known attribute.
        assert!(prior.conditional("KR", "gl_account").is_some());
        // Known source, unknown attribute.
        assert!(prior.conditional("KR", "cost_center").is_none());
        // Unknown source.
        assert!(prior.conditional("RV", "gl_account").is_none());
    }

    /// A bundle carrying `per_source_attribute: Some(..)` still has that field
    /// populated after a JSON serialise → deserialise cycle.
    #[test]
    fn behavioral_priors_per_source_attribute_optional_round_trip() {
        let original = BehavioralPriors {
            // Field under test.
            per_source_attribute: Some(PerSourceAttributePrior::default()),
            // Bundle metadata.
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: "test".to_string(),
            industry: "test".to_string(),
            n_client_inputs: 0,
            n_rows_aggregated: 0,
            // Non-optional priors, left at their defaults.
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),
            // Every other optional prior stays unset.
            posting_lag: None,
            month_volume: None,
            active_segments: None,
            entity_clusters: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_amount_conditionals: None,
            source_role_gl_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            tb_anchor: None,
            manual_share: None,
        };
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
        assert!(decoded.per_source_attribute.is_some());
    }

    /// An `EntityClustersPrior` holding one three-member cluster keeps its
    /// member count, `avg_jaccard` and `clustering_rate` through JSON.
    #[test]
    fn entity_clusters_prior_with_members_round_trips() {
        let cluster = EntityCluster {
            members: ["A", "B", "C"].iter().map(|&name| name.into()).collect(),
            avg_jaccard: 0.42,
        };
        let original = EntityClustersPrior {
            clusters: vec![cluster],
            clustering_rate: 0.75,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<EntityClustersPrior>(&encoded).expect("deserialize");

        assert_eq!(decoded.clusters.len(), 1);
        let only = &decoded.clusters[0];
        assert_eq!(only.members.len(), 3);
        assert!((only.avg_jaccard - 0.42).abs() < 1e-9);
        assert!((decoded.clustering_rate - 0.75).abs() < 1e-9);
    }

    // ---- SP4.3 tests -------------------------------------------------------

    /// `LognormalAmount::sample` returns strictly positive values whose mean
    /// log-value lies within 0.15 of `mu` over 1000 seeded draws — i.e. the
    /// geometric mean of the draws is within roughly −14% / +16% of `exp(mu)`.
    #[test]
    fn lognormal_amount_sample_positive_values() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        /// Number of draws taken from the prior.
        const DRAWS: usize = 1000;

        let params = LognormalAmount {
            mu: 4.5, // exp(4.5) ≈ 90
            sigma: 0.8,
            n: 1000,
            median_abs: 90.0,
        };
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let samples: Vec<f64> = (0..DRAWS).map(|_| params.sample(&mut rng)).collect();

        // All samples must be strictly positive.
        assert!(samples.iter().all(|&v| v > 0.0), "all samples must be > 0");

        // The mean of ln(sample) is the log of the geometric mean and should
        // sit near mu. Assuming the draws really are lognormal(mu, sigma), its
        // standard error is sigma / sqrt(DRAWS) ≈ 0.025, so the 0.15 window is
        // about six standard errors wide.
        let log_mean: f64 = samples.iter().map(|v| v.ln()).sum::<f64>() / samples.len() as f64;
        assert!(
            (log_mean - 4.5).abs() < 0.15,
            "log-mean {log_mean:.3} should be near mu=4.5"
        );
    }

    /// `LognormalAmount::sample` must cope with a degenerate `sigma` of zero:
    /// it may not panic, and every draw must still be strictly positive.
    #[test]
    fn lognormal_amount_sample_degenerate_sigma() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let zero_spread = LognormalAmount {
            mu: 3.0,
            // Degenerate spread — sampling must not panic.
            sigma: 0.0,
            n: 5,
            median_abs: 20.0,
        };
        let mut rng = ChaCha8Rng::seed_from_u64(7);
        for _ in 0..10 {
            assert!(
                zero_spread.sample(&mut rng) > 0.0,
                "must be positive even with sigma=0"
            );
        }
    }

    /// A `PerSourceAmountPrior` with populated `by_source` and
    /// `by_source_and_class` maps survives a JSON round trip, while its empty
    /// `quantile_sketch_by_source` map is omitted from the serialised form.
    #[test]
    fn per_source_amount_prior_round_trip() {
        let kr_overall = LognormalAmount {
            mu: 4.5,
            sigma: 2.158,
            n: 278939,
            median_abs: 100.0,
        };
        let kr_class_0041 = LognormalAmount {
            mu: 5.394,
            sigma: 1.602,
            n: 61726,
            median_abs: 209.98,
        };
        let original = PerSourceAmountPrior {
            by_source_and_class: BTreeMap::from([(
                "KR".to_string(),
                BTreeMap::from([("0041".to_string(), kr_class_0041)]),
            )]),
            by_source: BTreeMap::from([("KR".to_string(), kr_overall)]),
            quantile_sketch_by_source: BTreeMap::new(),
            je_total_sketch_by_source: BTreeMap::new(),
            je_total_sketch_global: None,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<PerSourceAmountPrior>(&encoded).expect("deserialize");

        assert_eq!(decoded.by_source.len(), 1);
        assert_eq!(decoded.by_source_and_class.len(), 1);
        assert_eq!(decoded.by_source["KR"].n, 278939);
        assert_eq!(decoded.by_source_and_class["KR"]["0041"].n, 61726);

        // SP4.9: the empty sketch map leaves no key behind when serialised and
        // comes back empty when parsed.
        assert!(!encoded.contains("quantile_sketch_by_source"));
        assert!(decoded.quantile_sketch_by_source.is_empty());
    }

    /// SP4.9 — sampling from an `AmountQuantileSketch`:
    ///
    /// * the empirical median of the draws tracks the p50 knot and the
    ///   empirical p99 tracks the p99 knot,
    /// * draws may exceed the top knot, but never by more than
    ///   `MAX_TAIL_EXTRAPOLATION` times its value, and
    /// * a single-knot sketch cannot be sampled (`None`).
    #[test]
    fn amount_quantile_sketch_sample_matches_knots() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let sketch = AmountQuantileSketch {
            probabilities: vec![0.01, 0.10, 0.50, 0.90, 0.99, 0.999],
            values: vec![1.0, 10.0, 100.0, 10_000.0, 1_000_000.0, 5_000_000.0],
            n: 100_000,
        };
        assert!(sketch.is_usable());

        let mut rng = ChaCha8Rng::seed_from_u64(7);
        let mut draws: Vec<f64> = Vec::with_capacity(200_000);
        for _ in 0..200_000 {
            draws.push(sketch.sample(&mut rng).expect("usable sketch"));
        }
        draws.sort_by(f64::total_cmp);

        // Empirical quantile: truncating index into the sorted draws.
        let top_index = draws.len() - 1;
        let q = |p: f64| draws[(top_index as f64 * p) as usize];

        let med = q(0.50);
        assert!(
            (med / 100.0 - 1.0).abs() < 0.10,
            "median should track the p50 knot, got {med}"
        );
        let p99 = q(0.99);
        assert!(
            (p99 / 1_000_000.0 - 1.0).abs() < 0.25,
            "p99 should track the p99 knot, got {p99}"
        );

        // The largest draw lies above the 0.999 knot yet within the cap.
        let max = draws[top_index];
        assert!(
            max <= 5_000_000.0 * AmountQuantileSketch::MAX_TAIL_EXTRAPOLATION * 1.0001,
            "tail extrapolation must be capped, got {max}"
        );
        assert!(max > 5_000_000.0, "some draws should exceed the top knot");

        // A sketch with one knot is unusable for sampling.
        let single_knot = AmountQuantileSketch {
            probabilities: vec![0.5],
            values: vec![100.0],
            n: 10,
        };
        assert!(single_knot.sample(&mut rng).is_none());
    }

    /// `quantile(u)` acts as the deterministic inverse CDF used by the copula
    /// composition. Checked here: it is exact at knots, interpolates
    /// log-linearly (geometrically) between knots, floors at the first knot,
    /// clamps out-of-range `u`, and returns `None` for an unusable sketch.
    #[test]
    fn amount_quantile_sketch_quantile_is_inverse_cdf() {
        let sketch = AmountQuantileSketch {
            probabilities: vec![0.10, 0.50, 0.90],
            values: vec![10.0, 100.0, 1000.0],
            n: 10_000,
        };

        // Exact hits on knots.
        for (u, knot_value) in [(0.50, 100.0), (0.10, 10.0)] {
            assert_eq!(sketch.quantile(u), Some(knot_value));
        }

        // Halfway along the (0.5, 0.9) segment in probability → the geometric
        // mean of the segment's end values.
        let mid = sketch.quantile(0.70).expect("usable");
        assert!(
            (mid - (100.0f64 * 1000.0).sqrt()).abs() < 1e-9,
            "expected geometric midpoint ~316.23, got {mid}"
        );

        // Below the bottom knot the result floors at the first value; a
        // negative `u` is clamped and floors the same way.
        for u in [0.01, -1.0] {
            assert_eq!(sketch.quantile(u), Some(10.0));
        }

        let top = sketch.quantile(2.0).expect("clamped to 1.0");
        assert!(
            top > 1000.0 && top <= 1000.0 * AmountQuantileSketch::MAX_TAIL_EXTRAPOLATION * 1.0001,
            "u clamped to 1.0 extrapolates above the top knot but stays capped, got {top}"
        );

        // One knot is not enough to invert.
        let single_knot = AmountQuantileSketch {
            probabilities: vec![0.5],
            values: vec![100.0],
            n: 10,
        };
        assert!(single_knot.quantile(0.5).is_none());
    }

    /// SP4.9 — `PerSourceAmountPrior` JSON written before the quantile-sketch
    /// field existed still parses, with `quantile_sketch_by_source` empty.
    #[test]
    fn per_source_amount_prior_legacy_json_defaults_sketch_empty() {
        let parsed = serde_json::from_str::<PerSourceAmountPrior>(
            r#"{"by_source_and_class": {}, "by_source": {}}"#,
        )
        .expect("legacy parse");
        assert!(parsed.quantile_sketch_by_source.is_empty());
    }

    /// JE-total sketch — `PerSourceAmountPrior` JSON lacking the per-source
    /// je-total field still parses, with `je_total_sketch_by_source` empty.
    #[test]
    fn per_source_amount_prior_legacy_json_defaults_je_total_sketch_empty() {
        let parsed = serde_json::from_str::<PerSourceAmountPrior>(
            r#"{"by_source_and_class": {}, "by_source": {}}"#,
        )
        .expect("legacy parse");
        assert!(parsed.je_total_sketch_by_source.is_empty());
    }

    /// JE-total sketch — a populated `je_total_sketch_by_source` entry keeps
    /// its knot values and `n` through JSON, and a default prior serialises
    /// without the key at all (opt-in; nothing added to old bundles).
    #[test]
    fn per_source_je_total_sketch_round_trips() {
        let sa_sketch = AmountQuantileSketch {
            probabilities: vec![0.5, 0.99],
            values: vec![135.0, 161_674.0],
            n: 250_000,
        };
        let original = PerSourceAmountPrior {
            je_total_sketch_by_source: BTreeMap::from([("SA".to_string(), sa_sketch)]),
            je_total_sketch_global: None,
            quantile_sketch_by_source: BTreeMap::new(),
            by_source: BTreeMap::new(),
            by_source_and_class: BTreeMap::new(),
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<PerSourceAmountPrior>(&encoded).expect("deserialize");
        let decoded_sa = &decoded.je_total_sketch_by_source["SA"];
        assert_eq!(decoded_sa.values, vec![135.0, 161_674.0]);
        assert_eq!(decoded_sa.n, 250_000);

        // A default prior has an empty map, which must not be written out.
        let default_encoded =
            serde_json::to_string(&PerSourceAmountPrior::default()).expect("serialize");
        assert!(!default_encoded.contains("je_total_sketch_by_source"));
    }

    /// Global JE-total sketch — `PerSourceAmountPrior` JSON lacking the global
    /// field still parses, with `je_total_sketch_global` set to `None`.
    #[test]
    fn per_source_amount_prior_legacy_json_defaults_je_total_global_none() {
        let parsed = serde_json::from_str::<PerSourceAmountPrior>(
            r#"{"by_source_and_class": {}, "by_source": {}}"#,
        )
        .expect("legacy parse");
        assert!(parsed.je_total_sketch_global.is_none());
    }

    /// Global JE-total sketch — a populated `je_total_sketch_global` keeps its
    /// knot values and `n` through JSON, and a default prior serialises
    /// without the key at all (opt-in; nothing added to old bundles).
    #[test]
    fn per_source_je_total_global_sketch_round_trips() {
        let global_sketch = AmountQuantileSketch {
            probabilities: vec![0.5, 0.99],
            values: vec![142.0, 173_402.0],
            n: 1_250_000,
        };
        let original = PerSourceAmountPrior {
            je_total_sketch_global: Some(global_sketch),
            je_total_sketch_by_source: BTreeMap::new(),
            quantile_sketch_by_source: BTreeMap::new(),
            by_source: BTreeMap::new(),
            by_source_and_class: BTreeMap::new(),
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<PerSourceAmountPrior>(&encoded).expect("deserialize");
        let decoded_global = decoded.je_total_sketch_global.expect("global present");
        assert_eq!(decoded_global.values, vec![142.0, 173_402.0]);
        assert_eq!(decoded_global.n, 1_250_000);

        // A default prior has no global sketch, which must not be written out.
        let default_encoded =
            serde_json::to_string(&PerSourceAmountPrior::default()).expect("serialize");
        assert!(!default_encoded.contains("je_total_sketch_global"));
    }

    /// A bundle carrying `source_amount_conditionals: Some(..)` still has that
    /// field populated after a JSON serialise → deserialise cycle.
    #[test]
    fn behavioral_priors_source_amount_conditionals_optional_round_trip() {
        let original = BehavioralPriors {
            // Field under test: an amount prior with every collection empty.
            source_amount_conditionals: Some(PerSourceAmountPrior {
                by_source_and_class: BTreeMap::new(),
                by_source: BTreeMap::new(),
                quantile_sketch_by_source: BTreeMap::new(),
                je_total_sketch_by_source: BTreeMap::new(),
                je_total_sketch_global: None,
            }),
            // Bundle metadata.
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: "test".to_string(),
            industry: "test".to_string(),
            n_client_inputs: 0,
            n_rows_aggregated: 0,
            // Non-optional priors, left at their defaults.
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),
            // Every other optional prior stays unset.
            posting_lag: None,
            month_volume: None,
            active_segments: None,
            entity_clusters: None,
            per_source_attribute: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_role_gl_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            tb_anchor: None,
            manual_share: None,
        };
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
        assert!(decoded.source_amount_conditionals.is_some());
    }

    /// Backwards compatibility: bundle JSON that has no
    /// `source_amount_conditionals` key parses, leaving the field `None`.
    #[test]
    fn behavioral_priors_legacy_missing_source_amount_conditionals() {
        let legacy_json = r#"{
            "schema_version": 1,
            "generator_version": "5.21.0",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;
        let parsed = serde_json::from_str::<BehavioralPriors>(legacy_json).expect("legacy parse");
        assert!(
            parsed.source_amount_conditionals.is_none(),
            "missing field should deserialise as None"
        );
    }

    // ---- SP4.6 tests -------------------------------------------------------

    /// `PerSourceRolePrior::conditional` hands back the distribution stored for
    /// a `(source, role)` pair, and sampling from it only ever produces values
    /// that were registered under that role.
    #[test]
    fn sp4_6_role_conditional_keeps_dr_in_expense_class() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        // Source KR:
        //   DR side → expense accounts 6000 and 6100
        //   CR side → AP account 2000
        let debit_side = CategoricalDistribution::from_counts(BTreeMap::from([
            ("6000".to_string(), 100usize),
            ("6100".to_string(), 50usize),
        ]));
        let credit_side =
            CategoricalDistribution::from_counts(BTreeMap::from([("2000".to_string(), 150usize)]));
        let kr_roles = BTreeMap::from([
            ("DR".to_string(), debit_side),
            ("CR".to_string(), credit_side),
        ]);
        let prior = PerSourceRolePrior {
            by_source_and_role: BTreeMap::from([("KR".to_string(), kr_roles)]),
        };

        let mut rng = ChaCha8Rng::seed_from_u64(42);

        // Every DR draw has to come from {6000, 6100}.
        let dr_dist = prior.conditional("KR", "DR").unwrap();
        for _ in 0..100 {
            let v = dr_dist.sample(&mut rng).unwrap();
            assert!(
                v == "6000" || v == "6100",
                "DR draw must be expense account, got {v}"
            );
        }

        // Every CR draw has to be 2000, the only registered value.
        let cr_dist = prior.conditional("KR", "CR").unwrap();
        for _ in 0..50 {
            let v = cr_dist.sample(&mut rng).unwrap();
            assert_eq!(v, "2000", "CR draw must be AP account");
        }
    }

    /// `conditional` yields `None` for any `(source, role)` pair that the
    /// prior does not contain — here checked against an empty default prior.
    #[test]
    fn sp4_6_role_conditional_falls_back_when_pair_missing() {
        let empty = PerSourceRolePrior::default();
        let expectations = [
            ("DR", "empty prior must return None"),
            ("CR", "empty prior must return None for CR too"),
        ];
        for (role, why) in expectations {
            assert!(empty.conditional("KR", role).is_none(), "{why}");
        }
    }

    /// After a JSON round trip a `PerSourceRolePrior` still resolves the
    /// `(KR, DR)` pair it was built with, and still has no `(KR, CR)` pair.
    #[test]
    fn sp4_6_per_source_role_prior_json_round_trip() {
        let debit_side =
            CategoricalDistribution::from_counts(BTreeMap::from([("6000".to_string(), 200usize)]));
        let kr_roles = BTreeMap::from([("DR".to_string(), debit_side)]);
        let original = PerSourceRolePrior {
            by_source_and_role: BTreeMap::from([("KR".to_string(), kr_roles)]),
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<PerSourceRolePrior>(&encoded).expect("deserialize");

        assert!(decoded.conditional("KR", "DR").is_some());
        assert!(decoded.conditional("KR", "CR").is_none());
    }

    /// A bundle carrying `source_role_gl_conditionals: Some(..)` still has that
    /// field populated after a JSON serialise → deserialise cycle.
    #[test]
    fn behavioral_priors_source_role_gl_conditionals_optional_round_trip() {
        let original = BehavioralPriors {
            // Field under test.
            source_role_gl_conditionals: Some(PerSourceRolePrior::default()),
            // Bundle metadata.
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: "test".to_string(),
            industry: "test".to_string(),
            n_client_inputs: 0,
            n_rows_aggregated: 0,
            // Non-optional priors, left at their defaults.
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),
            // Every other optional prior stays unset.
            posting_lag: None,
            month_volume: None,
            active_segments: None,
            entity_clusters: None,
            per_source_attribute: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_amount_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            tb_anchor: None,
            manual_share: None,
        };
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
        assert!(decoded.source_role_gl_conditionals.is_some());
    }

    // ---- SP4.8 tests -------------------------------------------------------

    /// Pair-key helpers on `PerSourceFlowPairPrior`: `pair_key` and
    /// `split_pair_key` invert each other, malformed keys split to `None`, and
    /// `account_class` canonicalises accounts the same way the
    /// relational-fidelity measure does.
    #[test]
    fn flow_pair_prior_key_helpers() {
        let joined = PerSourceFlowPairPrior::pair_key("5", "2");
        assert_eq!(joined, "5|2");

        let split = PerSourceFlowPairPrior::split_pair_key(&joined);
        assert_eq!(split, Some(("5".to_string(), "2".to_string())));

        // Keys with an empty right-hand side or without a delimiter are rejected.
        for malformed in ["5|", "nodelim"] {
            assert_eq!(PerSourceFlowPairPrior::split_pair_key(malformed), None);
        }

        // Digits-only canonicalisation mirrors the relational-fidelity measure:
        // non-digit characters are ignored, so "Z398-54441000" reduces to the
        // digits "39854441000" and, at granularity 1, to class "3". An account
        // with no digits has no class.
        let class_cases = [
            ("Z398-54441000", Some("3")),
            ("13210000", Some("1")),
            ("ABC", None),
        ];
        for (account, expected_class) in class_cases {
            assert_eq!(
                PerSourceFlowPairPrior::account_class(account, 1),
                expected_class.map(str::to_string)
            );
        }
    }

    /// SP4.8 — `source_flow_pairs` is optional on the bundle:
    ///
    /// * a populated `PerSourceFlowPairPrior` round-trips through JSON on its
    ///   own,
    /// * legacy `BehavioralPriors` JSON without the field deserialises with
    ///   `source_flow_pairs` set to `None`, and
    /// * a `BehavioralPriors` with `source_flow_pairs: Some(..)` round-trips
    ///   with the per-source pair distribution intact.
    #[test]
    fn behavioral_priors_source_flow_pairs_optional_round_trip() {
        let mut prior = PerSourceFlowPairPrior {
            granularity: 1,
            ..Default::default()
        };
        let mut counts = BTreeMap::new();
        counts.insert("5|2".to_string(), 7usize);
        counts.insert("1|4".to_string(), 3usize);
        prior.by_source.insert(
            "KR".to_string(),
            CategoricalDistribution::from_counts(counts),
        );

        // The prior on its own.
        let json = serde_json::to_string(&prior).expect("serialize");
        let back: PerSourceFlowPairPrior = serde_json::from_str(&json).expect("deserialize");
        assert!(back.pairs("KR").is_some());
        assert!(back.pairs("RV").is_none());

        // Legacy bundle: the key is absent, so the field defaults to `None`.
        let legacy = r#"{
            "schema_version": 1,
            "generator_version": "5.21.0",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;
        let mut bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
        assert!(bp.source_flow_pairs.is_none());

        // Bundle level: attach the prior to the parsed bundle and make sure it
        // is still there, with its per-source data, after a round trip.
        bp.source_flow_pairs = Some(prior);
        let bundle_json = serde_json::to_string(&bp).expect("serialize");
        let bundle_back: BehavioralPriors =
            serde_json::from_str(&bundle_json).expect("deserialize");
        let flow_pairs = bundle_back
            .source_flow_pairs
            .expect("populated flow pairs survive");
        assert!(flow_pairs.pairs("KR").is_some());
        assert!(flow_pairs.pairs("RV").is_none());
    }

    /// Backwards compatibility: bundle JSON that has no
    /// `source_role_gl_conditionals` key parses, leaving the field `None`.
    #[test]
    fn behavioral_priors_legacy_missing_source_role_gl_conditionals() {
        let legacy_json = r#"{
            "schema_version": 1,
            "generator_version": "5.21.0",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;
        let parsed = serde_json::from_str::<BehavioralPriors>(legacy_json).expect("legacy parse");
        assert!(
            parsed.source_role_gl_conditionals.is_none(),
            "missing field should deserialise as None"
        );
    }

    // ---- SP4.1 tests -------------------------------------------------------

    /// A `TbAnchorPrior` with two per-account targets keeps its account count,
    /// a sampled closing balance, `total_assets` and `n_clients` through JSON.
    #[test]
    fn tb_anchor_prior_json_round_trip() {
        let account_1000 = TbTarget {
            opening_balance: 100_000.0,
            closing_balance: 120_000.0,
            period_net_activity: 20_000.0,
            opening_stdev: 5_000.0,
            closing_stdev: 6_000.0,
            n_clients: 3,
        };
        let account_2000 = TbTarget {
            opening_balance: -50_000.0,
            closing_balance: -60_000.0,
            period_net_activity: -10_000.0,
            opening_stdev: 2_000.0,
            closing_stdev: 3_000.0,
            n_clients: 3,
        };
        let original = TbAnchorPrior {
            per_account: BTreeMap::from([
                ("1000".to_string(), account_1000),
                ("2000".to_string(), account_2000),
            ]),
            total_assets: 300_000.0,
            total_liabilities: 120_000.0,
            total_equity: 180_000.0,
            n_clients: 3,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<TbAnchorPrior>(&encoded).expect("deserialize");

        assert_eq!(decoded.per_account.len(), 2);
        assert!((decoded.per_account["1000"].closing_balance - 120_000.0).abs() < 1e-6);
        assert!((decoded.total_assets - 300_000.0).abs() < 1e-6);
        assert_eq!(decoded.n_clients, 3);
    }

    /// `TbAnchorPrior::has_data()` is `false` for a default (empty) prior and
    /// becomes `true` once an account with a non-zero closing balance is added.
    #[test]
    fn tb_anchor_prior_has_data() {
        let nonzero_closing = TbTarget {
            closing_balance: 1.0,
            ..TbTarget::default()
        };

        let mut anchor = TbAnchorPrior::default();
        assert!(!anchor.has_data(), "empty prior must report no data");

        anchor
            .per_account
            .insert("1000".to_string(), nonzero_closing);
        assert!(
            anchor.has_data(),
            "non-zero closing balance must report has_data"
        );
    }

    /// A bundle carrying `tb_anchor: Some(..)` comes back from a JSON round
    /// trip with the anchor present and its single account target intact.
    #[test]
    fn behavioral_priors_tb_anchor_optional_round_trip() {
        let account_1000 = TbTarget {
            opening_balance: 50_000.0,
            closing_balance: 55_000.0,
            period_net_activity: 5_000.0,
            opening_stdev: 1_000.0,
            closing_stdev: 1_200.0,
            n_clients: 2,
        };
        let anchor = TbAnchorPrior {
            per_account: BTreeMap::from([("1000".to_string(), account_1000)]),
            total_assets: 55_000.0,
            total_liabilities: 0.0,
            total_equity: 55_000.0,
            n_clients: 2,
        };
        let original = BehavioralPriors {
            // Field under test.
            tb_anchor: Some(anchor),
            // Bundle metadata.
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: "test".to_string(),
            industry: "test".to_string(),
            n_client_inputs: 0,
            n_rows_aggregated: 0,
            // Non-optional priors, left at their defaults.
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),
            // Every other optional prior stays unset.
            posting_lag: None,
            month_volume: None,
            active_segments: None,
            entity_clusters: None,
            per_source_attribute: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_amount_conditionals: None,
            source_role_gl_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            manual_share: None,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
        let decoded_anchor = decoded.tb_anchor.expect("tb_anchor must be Some");
        assert_eq!(decoded_anchor.per_account.len(), 1);
        assert!((decoded_anchor.per_account["1000"].closing_balance - 55_000.0).abs() < 1e-6);
    }

    /// Backwards compatibility: a JSON bundle that carries neither a
    /// `tb_anchor` nor a `manual_share` key must still parse, with both
    /// fields coming back as `None`.
    #[test]
    fn behavioral_priors_legacy_missing_tb_anchor() {
        // Bundle containing only the core fields; none of the optional keys
        // are present.
        let pre_anchor_bundle: &str = r#"{
            "schema_version": 1,
            "generator_version": "5.22.0",
            "industry": "health",
            "n_client_inputs": 1,
            "n_rows_aggregated": 100,
            "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
            "per_source_iet": {"by_source": {}},
            "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
            "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
            "fanout": {"by_attribute": {}}
        }"#;

        let parsed =
            serde_json::from_str::<BehavioralPriors>(pre_anchor_bundle).expect("legacy parse");

        // Only the two optional fields under test are inspected.
        let BehavioralPriors {
            tb_anchor,
            manual_share,
            ..
        } = parsed;
        assert!(
            tb_anchor.is_none(),
            "missing tb_anchor field should deserialise as None"
        );
        assert!(
            manual_share.is_none(),
            "missing manual_share field should deserialise as None"
        );
    }

    /// Phase-2 R6 — when `manual_share` is `None`, the serialized JSON must
    /// not mention the `manual_share` key at all, so that bundles written
    /// without it re-serialise unchanged.
    #[test]
    fn manual_share_none_is_absent_from_serialized_output() {
        let priors = BehavioralPriors {
            // Field under test.
            manual_share: None,

            // Bundle metadata.
            schema_version: BehavioralPriors::SCHEMA_VERSION,
            generator_version: String::from("test"),
            industry: String::from("test"),
            n_client_inputs: 0,
            n_rows_aggregated: 0,

            // Mandatory priors, left at their defaults.
            source_mix: SourceMixPrior::default(),
            per_source_iet: PerSourceIetPrior::default(),
            lines_per_je: LinesPerJePrior::default(),
            active_lifetime: ActiveLifetimePrior::default(),
            fanout: FanoutPrior::default(),

            // Every other optional prior is unset as well.
            posting_lag: None,
            month_volume: None,
            active_segments: None,
            entity_clusters: None,
            per_source_attribute: None,
            tp_entity_clusters: None,
            coa_semantic: None,
            reference_formats: None,
            text_taxonomy: None,
            user_personas: None,
            source_amount_conditionals: None,
            source_role_gl_conditionals: None,
            source_flow_pairs: None,
            source_mix_je: None,
            approver: None,
            tb_anchor: None,
        };

        let serialized = serde_json::to_string(&priors).expect("serialize");
        let key_leaked = serialized.contains("manual_share");
        assert!(
            !key_leaked,
            "None manual_share must not appear in serialized output"
        );
    }

    /// Phase-2 R6 — `ManualSharePrior::share_for_source` returns the stored
    /// share for known sources and `overall` for unseen ones; the prior also
    /// survives a JSON round-trip, and a default prior reports no data.
    #[test]
    fn manual_share_prior_round_trip_and_fallback() {
        let prior = ManualSharePrior {
            overall: 0.75,
            by_source: BTreeMap::from([("SA".to_string(), 0.9), ("RE".to_string(), 0.1)]),
            n_observations: 5_000,
        };
        assert!(prior.has_data());

        // Sources present in `by_source` resolve to their own share.
        for (source, expected) in [("SA", 0.9), ("RE", 0.1)] {
            assert!((prior.share_for_source(source) - expected).abs() < 1e-12);
        }
        // A source that was never inserted resolves to `overall`.
        let unseen_share = prior.share_for_source("ZZ");
        assert!(
            (unseen_share - 0.75).abs() < 1e-12,
            "unseen source must fall back to overall"
        );

        // Serialize, deserialize, and compare against the original value.
        let encoded = serde_json::to_string(&prior).expect("serialize");
        let decoded: ManualSharePrior = serde_json::from_str(&encoded).expect("deserialize");
        assert_eq!(decoded, prior);

        let empty = ManualSharePrior::default();
        assert!(!empty.has_data());
    }

    /// `MonthVolumePrior::from_counts` converts raw per-month counts into
    /// shares that sum to 1.0 while keeping `n`; the result round-trips
    /// through JSON, and all-zero or default priors report no data.
    #[test]
    fn month_volume_prior_from_counts_normalises_and_round_trips() {
        // Only the first four months carry volume (total 100); the remaining
        // eight stay at zero.
        let mut raw_counts = [0.0; 12];
        raw_counts[..4].copy_from_slice(&[10.0, 20.0, 30.0, 40.0]);

        let prior = MonthVolumePrior::from_counts(raw_counts, 100);
        assert!(prior.has_data());

        let total: f64 = prior.shares.iter().sum();
        assert!((total - 1.0).abs() < 1e-12);
        assert!((prior.shares[0] - 0.10).abs() < 1e-12);
        assert!((prior.shares[3] - 0.40).abs() < 1e-12);
        assert_eq!(prior.n, 100);

        // JSON round-trip must reproduce the prior exactly.
        let encoded = serde_json::to_string(&prior).expect("serialize");
        let decoded: MonthVolumePrior = serde_json::from_str(&encoded).expect("deserialize");
        assert_eq!(decoded, prior);

        // Neither an all-zero input nor the default value counts as data.
        let all_zero = MonthVolumePrior::from_counts([0.0; 12], 0);
        assert!(!all_zero.has_data());
        let defaulted = MonthVolumePrior::default();
        assert!(!defaulted.has_data());
    }

    /// `MonthVolumePrior::month_for_quantile` must follow the cumulative
    /// shares and never step backwards as the quantile `u` increases.
    #[test]
    fn month_volume_month_for_quantile_is_monotone_and_matches_cdf() {
        // Volume split evenly between index 0 (Jan) and index 1 (Feb), so
        // u < 0.5 maps to 0 and u >= 0.5 maps to 1.
        let counts = [50.0, 50.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
        let mv = MonthVolumePrior::from_counts(counts, 100);

        for (u, expected_month) in [(0.0, 0), (0.49, 0), (0.50, 1), (0.99, 1)] {
            assert_eq!(mv.month_for_quantile(u), expected_month);
        }

        // Sweep u over 0.00, 0.01, ..., 1.00 and require a non-decreasing
        // month index.
        let mut floor = 0u32;
        for u in (0..=100_i32).map(|step| f64::from(step) / 100.0) {
            let month = mv.month_for_quantile(u);
            assert!(month >= floor, "month_for_quantile must be monotone in u");
            floor = month;
        }

        // At u == 1.0 the result is index 1, the last month with volume.
        assert_eq!(mv.month_for_quantile(1.0), 1);
    }
}