// datasynth-fingerprint 5.36.0

//! Behavioral-prior extraction from corpus GL data.

use std::collections::{BTreeMap, HashSet};
use std::path::Path;

use chrono::NaiveDate;
use datasynth_eval::behavioral_fidelity::loader::{load_csv_records, load_parquet_records};

use super::manual_extractor::extract_manual_share_from_parquet;
use datasynth_eval::behavioral_fidelity::math::pearson_lag1_correlation;
use datasynth_eval::behavioral_fidelity::Record;

use crate::error::FingerprintError;
use crate::models::behavioral::BehavioralPriors;

use crate::models::behavioral::{
    ActiveLifetimePrior, ActiveSegmentsPrior, AmountQuantileSketch, CategoricalDistribution,
    EntityCluster, EntityClustersPrior, FanoutPrior, IetSummary, LagSummary, LineCountHistogram,
    LinesPerJePrior, LognormalAmount, LognormalParams, MonthVolumePrior, PerSourceAmountPrior,
    PerSourceAttributePrior, PerSourceFlowPairPrior, PerSourceIetPrior, PerSourceRolePrior,
    PostingLagPrior, SourceMixPrior, SourceSegmentSummary, ACTIVE_LIFETIME_DAY_BUCKETS,
    FANOUT_BUCKETS, LINE_COUNT_BUCKETS, SEGMENT_COUNT_BUCKETS, SEGMENT_GAP_BUCKETS,
};
use crate::models::EmpiricalCdf;

use super::reference_extractor::extract_reference_formats;
use super::user_extractor::extract_user_personas;

/// SP4.5 — Default minimum row count for a user to be included in the
/// `UserPersonaPrior`.  Users appearing fewer than this many times are dropped
/// to avoid over-fitting one-off entries and to preserve privacy.
pub const DEFAULT_MIN_USER_RECORDS: usize = 100;

/// Default minimum occurrence count for a reference template to be retained
/// in the per-source reference-format prior.
pub const DEFAULT_MIN_REFERENCE_OCCURRENCES: usize = 10;

/// Default minimum row-share for a Source code to appear individually in the mix.
///
/// Expressed as a fraction of the counted lines (`0.005` = 0.5 %).  This is the
/// value [`SourceMixGates::default`] assigns to `min_share`.
pub const DEFAULT_MIN_SOURCE_THRESHOLD: f64 = 0.005;

/// SP3.8b — Default minimum observation count per source.  Codes appearing
/// fewer than this many times in a single client's data are dropped from
/// the source_mix distribution to keep per-code event density high in
/// downstream generation.
///
/// This is the value [`SourceMixGates::default`] assigns to `min_observations`.
pub const DEFAULT_MIN_SOURCE_OBSERVATIONS: usize = 1000;

/// Thresholds that decide which Source codes keep an entry of their own in
/// the `source_mix` prior.
///
/// [`Default`] yields the historical constants
/// ([`DEFAULT_MIN_SOURCE_THRESHOLD`], [`DEFAULT_MIN_SOURCE_OBSERVATIONS`]), so
/// extraction output only changes when a caller overrides a gate.  Loosening
/// the gates keeps rare-but-real codes (the long tail of low-volume sources
/// in a corpus GL) at the price of thinner per-code statistics; the
/// per-source priors apply their own independent minimums regardless.
#[derive(Debug, Clone, Copy)]
pub struct SourceMixGates {
    /// Smallest row-share a code may have and still be listed individually;
    /// codes below it are folded into `other_fraction`.
    pub min_share: f64,
    /// SP3.8b — smallest raw observation count a code may have and still be
    /// listed individually.
    pub min_observations: usize,
}

impl Default for SourceMixGates {
    /// Returns the gates built from the module's default constants.
    fn default() -> Self {
        SourceMixGates {
            min_observations: DEFAULT_MIN_SOURCE_OBSERVATIONS,
            min_share: DEFAULT_MIN_SOURCE_THRESHOLD,
        }
    }
}

/// Build a `SourceMixPrior` from a slice of records.
///
/// Only **amount-bearing** lines (`functional_amount != 0`) are counted: the
/// generator can only emit lines that carry an amount, so a mix fit on all
/// lines over-represents sources whose documents are padded with zero-amount
/// rows (clearing/statistical lines). Corpus GLs routinely carry 20-30 % such
/// rows, concentrated in a few sources — counting them skews the synthetic
/// amount-weighted source composition by 2-3× on the affected codes. When the
/// export carries no usable amounts at all (degenerate/blanked amount column)
/// the count falls back to all lines so the mix is still emitted.
///
/// Two independent filters are applied:
/// 1. `min_observations` — SP3.8b — minimum raw observation count.  Codes
///    that appear fewer than this many times are dropped regardless of their
///    fractional share.
/// 2. `min_threshold` — minimum row-share (probability) of the counted lines.
///
/// The retained probabilities are renormalised to sum to 1.0.
/// `other_fraction` is the share of counted lines that belong to codes removed
/// by either filter.  It is derived from the integer line counts, so it is
/// exactly `0.0` when nothing was dropped and can never be negative (a
/// floating-point `1.0 - sum(shares)` can be off by an ulp in either
/// direction).
///
/// Returns an empty prior (`other_fraction == 0.0`) for an empty `records`.
pub fn extract_source_mix(
    records: &[Record],
    min_threshold: f64,
    min_observations: usize,
) -> SourceMixPrior {
    if records.is_empty() {
        return SourceMixPrior {
            probabilities: BTreeMap::new(),
            other_fraction: 0.0,
            min_threshold,
        };
    }

    // Tally by borrowed key: one `String` allocation per retained code
    // instead of one per record.
    let mut counts: BTreeMap<&str, usize> = BTreeMap::new();
    let mut counted: usize = 0;
    for r in records {
        // NaN.abs() > 0.0 is false, so unparseable amounts are excluded too.
        if r.functional_amount.abs() > 0.0 {
            *counts.entry(r.source.as_str()).or_insert(0) += 1;
            counted += 1;
        }
    }
    if counted == 0 {
        // Degenerate amount column — fall back to the all-lines count.
        for r in records {
            *counts.entry(r.source.as_str()).or_insert(0) += 1;
        }
        counted = records.len();
    }

    let total = counted as f64;
    let mut probabilities: BTreeMap<String, f64> = BTreeMap::new();
    let mut retained_count: usize = 0;
    for (src, c) in counts {
        // SP3.8b — drop the long tail of low-volume sources first; the
        // probability-threshold gate applies independently afterwards.
        if c < min_observations {
            continue;
        }
        let frac = c as f64 / total;
        if frac >= min_threshold {
            probabilities.insert(src.to_string(), frac);
            retained_count += c;
        }
    }

    // Everything that was counted but not retained is "other".  Integer
    // arithmetic keeps this exact; `retained_count <= counted` by construction.
    let other_fraction = (counted - retained_count) as f64 / total;

    // Renormalise the retained shares so they sum to 1.0.
    let retained_sum: f64 = probabilities.values().sum();
    if retained_sum > 0.0 {
        for v in probabilities.values_mut() {
            *v /= retained_sum;
        }
    }

    SourceMixPrior {
        probabilities,
        other_fraction,
        min_threshold,
    }
}

/// Source mix measured per JOURNAL ENTRY rather than per line, restricted to
/// the vocabulary the line mix kept (`retained` = `source_mix.probabilities`).
///
/// The generator draws one source per JE. Drawing from line shares would make
/// the JE-count composition proportional to line counts — line-heavy sources
/// get over-represented and 2-line bulk sources under-represented by their
/// lines-per-JE ratio. This mix puts the draw in the right units.
///
/// A JE is identified by `(effective-date year, je_number)` (SAP reuses JE
/// numbers across fiscal years), matching the JE-total sketch. It is counted
/// once it has at least one finite positive (debit) leg with a non-empty
/// source, and is attributed to the source of the first such leg in record
/// order. Shares are renormalised over `retained`; the JE mass of codes
/// outside `retained` is reported as `other_fraction`.
///
/// Returns `None` when `retained` is empty, when no JE qualifies, or when no
/// qualifying JE belongs to a retained code — callers then leave
/// `source_mix_je` unset and the generator falls back to the line-share mix.
pub fn extract_source_mix_je(
    records: &[Record],
    retained: &BTreeMap<String, f64>,
) -> Option<SourceMixPrior> {
    use chrono::Datelike;

    if retained.is_empty() {
        return None;
    }

    // (year, je_number) → source of the JE's first debit leg.
    let mut je_source: BTreeMap<(i32, &str), &str> = BTreeMap::new();
    for rec in records {
        let amount = rec.functional_amount;
        let is_debit_leg = amount > 0.0 && amount.is_finite();
        if !is_debit_leg || rec.source.is_empty() {
            continue;
        }
        let je_key = (rec.effective_date.year(), rec.je_number.as_str());
        je_source.entry(je_key).or_insert(rec.source.as_str());
    }
    if je_source.is_empty() {
        return None;
    }
    let je_total = je_source.len() as f64;

    // Tally only the JEs whose source survived in the line mix.
    let mut kept: BTreeMap<&str, u64> = BTreeMap::new();
    for src in je_source.values().copied() {
        if retained.contains_key(src) {
            *kept.entry(src).or_insert(0) += 1;
        }
    }
    let retained_total: u64 = kept.values().sum();
    if retained_total == 0 {
        return None;
    }

    let mut probabilities: BTreeMap<String, f64> = BTreeMap::new();
    for (src, n) in &kept {
        probabilities.insert(src.to_string(), *n as f64 / retained_total as f64);
    }

    Some(SourceMixPrior {
        probabilities,
        other_fraction: 1.0 - retained_total as f64 / je_total,
        min_threshold: 0.0,
    })
}

/// Minimum sample count for a Source to receive its own IET summary.
///
/// When supplied as `min_samples` to `extract_per_source_iet`, the quantity
/// compared against it is the number of inter-event gaps (one fewer than the
/// source's record count), not the number of records.
pub const DEFAULT_MIN_IET_SAMPLES: usize = 100;

/// Extract per-Source inter-event-time (IET) distributions, in days.
///
/// For each source the `entry_date`s of all its records are sorted and the
/// day differences between consecutive dates form the IET sample. Sources
/// with fewer than two records, or with fewer than `min_samples` gaps, get no
/// summary. Each summary carries the gap count, an empirical CDF named
/// `iet_<source>`, an optional log-normal fit and the lag-1 autocorrelation
/// (`0.0` when `pearson_lag1_correlation` yields no value).
pub fn extract_per_source_iet(records: &[Record], min_samples: usize) -> PerSourceIetPrior {
    let mut dates_by_source: BTreeMap<String, Vec<NaiveDate>> = BTreeMap::new();
    for rec in records {
        dates_by_source
            .entry(rec.source.clone())
            .or_default()
            .push(rec.entry_date);
    }

    let by_source: BTreeMap<String, IetSummary> = dates_by_source
        .into_iter()
        .filter_map(|(source, mut dates)| {
            // A single date has no gap to measure.
            if dates.len() < 2 {
                return None;
            }
            dates.sort();
            let gaps: Vec<f64> = dates
                .windows(2)
                .map(|pair| (pair[1] - pair[0]).num_days() as f64)
                .collect();
            if gaps.len() < min_samples {
                return None;
            }
            let summary = IetSummary {
                n: gaps.len(),
                empirical_cdf_days: build_empirical_cdf(&format!("iet_{source}"), &gaps),
                lognormal_fit: fit_lognormal(&gaps),
                lag1_autocorr: pearson_lag1_correlation(&gaps).unwrap_or(0.0),
            };
            Some((source, summary))
        })
        .collect();

    PerSourceIetPrior { by_source }
}

/// Build an [`EmpiricalCdf`] labelled `column` from `samples`.
///
/// Non-finite samples (NaN, ±∞) are discarded; the remaining values are
/// sorted ascending before being handed to `EmpiricalCdf::from_sorted_values`.
fn build_empirical_cdf(column: &str, samples: &[f64]) -> EmpiricalCdf {
    let mut finite: Vec<f64> = Vec::new();
    for &value in samples {
        if value.is_finite() {
            finite.push(value);
        }
    }
    finite.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));
    EmpiricalCdf::from_sorted_values(column.to_owned(), finite)
}

/// Default minimum sample count for a Source to receive its own LagSummary.
///
/// When supplied as `min_samples` to `extract_posting_lag`, a sample is one
/// lag observation for the source; sources with fewer observations get no
/// summary.
pub const DEFAULT_MIN_LAG_SAMPLES: usize = 100;

/// Per-Source posting lag in days = EffectiveDate - EntryDate. Can be negative (backdating).
///
/// Measured on the amount-bearing per-JE population: one observation per
/// `je_number` (first amount-bearing record wins — the dates are header-level
/// in practice) and zero-amount records never count. This is the population
/// the generator emits (it draws ONE lag per JE) and the same denominator as
/// `source_mix_je`/`manual_share`; line-level counting over-weights high-line
/// system batches and understates JE-level backdating. Records with an empty
/// `je_number` fall back to per-line counting.
pub fn extract_posting_lag(records: &[Record], min_samples: usize) -> Option<PostingLagPrior> {
    if records.is_empty() {
        return None;
    }
    let mut by_source: BTreeMap<String, Vec<f64>> = BTreeMap::new();
    let mut seen_jes: HashSet<&str> = HashSet::new();
    for r in records {
        if r.functional_amount == 0.0 {
            continue;
        }
        if !r.je_number.is_empty() && !seen_jes.insert(r.je_number.as_str()) {
            continue;
        }
        let lag = (r.effective_date - r.entry_date).num_days() as f64;
        by_source.entry(r.source.clone()).or_default().push(lag);
    }
    let mut summaries: BTreeMap<String, LagSummary> = BTreeMap::new();
    for (source, samples) in by_source {
        if samples.len() < min_samples {
            continue;
        }
        let n = samples.len();
        let mean = samples.iter().sum::<f64>() / n as f64;
        let var = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n as f64;
        let cdf = build_empirical_cdf(&format!("lag_{source}"), &samples);
        summaries.insert(
            source,
            LagSummary {
                empirical_cdf_days: cdf,
                mean,
                stddev: var.sqrt(),
                n,
            },
        );
    }
    if summaries.is_empty() {
        None
    } else {
        Some(PostingLagPrior {
            by_source: summaries,
        })
    }
}

/// N9 — minimum amount-bearing JEs (with a usable effective date) before a
/// month-volume prior is emitted.
///
/// When supplied as `min_jes` to `extract_month_volume`, books with fewer
/// counted JEs yield `None` instead of a prior.
pub const DEFAULT_MIN_MONTH_VOLUME_JES: usize = 60;

/// N9 — per-calendar-month relative JE volume, measured on the amount-bearing
/// per-JE population — the same population the generator emits.
///
/// Each JE contributes one observation to the calendar month of its
/// effective date (first amount-bearing record wins). Records whose amount is
/// zero or NaN (unparseable) never count. A JE is identified by
/// `(effective-date year, je_number)`, the same key `extract_source_mix_je`
/// and the JE-total sketch use: SAP reuses JE numbers across fiscal years, so
/// de-duplicating on `je_number` alone would drop later-year JEs that recycle
/// a number and understate those months. Records with an empty `je_number`
/// are counted per line.
///
/// Returns `None` when fewer than `min_jes` amount-bearing JEs are observed,
/// so thin books don't emit a noisy seasonality, or when the resulting prior
/// reports no data.
pub fn extract_month_volume(records: &[Record], min_jes: usize) -> Option<MonthVolumePrior> {
    use chrono::Datelike;

    let mut counts = [0.0f64; 12];
    let mut seen_jes: HashSet<(i32, &str)> = HashSet::new();
    let mut n = 0usize;
    for r in records {
        let amount = r.functional_amount;
        // Zero carries no amount; NaN is an unparseable amount.
        if amount == 0.0 || amount.is_nan() {
            continue;
        }
        let date = r.effective_date;
        // `insert` returns false when the JE was already observed.
        if !r.je_number.is_empty() && !seen_jes.insert((date.year(), r.je_number.as_str())) {
            continue;
        }
        let m = date.month(); // 1..=12
        if (1..=12).contains(&m) {
            counts[(m - 1) as usize] += 1.0;
            n += 1;
        }
    }
    if n < min_jes {
        return None;
    }
    let prior = MonthVolumePrior::from_counts(counts, n);
    prior.has_data().then_some(prior)
}

/// Fit log-normal parameters to the positive, finite entries of `samples`.
///
/// Each retained sample `x` contributes `ln(x + 1)`; `mu` is the mean and
/// `sigma` the population standard deviation (divisor `n`) of those logs.
/// Returns `None` when fewer than three samples are retained.
fn fit_lognormal(samples: &[f64]) -> Option<LognormalParams> {
    let mut logs: Vec<f64> = Vec::new();
    for &x in samples {
        if x.is_finite() && x > 0.0 {
            logs.push((x + 1.0).ln());
        }
    }
    if logs.len() < 3 {
        return None;
    }

    let count = logs.len() as f64;
    let mu = logs.iter().sum::<f64>() / count;
    let squared_deviations: f64 = logs.iter().map(|v| (v - mu).powi(2)).sum::<f64>();
    let variance = squared_deviations / count.max(1.0);

    Some(LognormalParams {
        mu,
        sigma: variance.sqrt(),
    })
}

/// Per-Source active lifetime in days = max(EntryDate) - min(EntryDate).
///
/// The result holds one single-observation histogram per source plus an
/// `overall` histogram over all sources' lifetimes, both bucketed with
/// `ACTIVE_LIFETIME_DAY_BUCKETS`.
pub fn extract_active_lifetime(records: &[Record]) -> ActiveLifetimePrior {
    // source → (earliest entry date, latest entry date).
    let mut span_by_source: BTreeMap<String, (NaiveDate, NaiveDate)> = BTreeMap::new();
    for rec in records {
        let date = rec.entry_date;
        let span = span_by_source
            .entry(rec.source.clone())
            .or_insert((date, date));
        span.0 = span.0.min(date);
        span.1 = span.1.max(date);
    }

    // Lifetime per source, computed once and reused for both histograms.
    let lifetimes: Vec<(&String, u32)> = span_by_source
        .iter()
        .map(|(source, (first, last))| {
            let days = last.signed_duration_since(*first).num_days().max(0) as u32;
            (source, days)
        })
        .collect();

    let all_days: Vec<u32> = lifetimes.iter().map(|(_, days)| *days).collect();
    let (overall, _) = LineCountHistogram::build(&all_days, ACTIVE_LIFETIME_DAY_BUCKETS);

    let by_source: BTreeMap<String, LineCountHistogram> = lifetimes
        .into_iter()
        .map(|(source, days)| {
            let (hist, _) = LineCountHistogram::build(&[days], ACTIVE_LIFETIME_DAY_BUCKETS);
            (source.clone(), hist)
        })
        .collect();

    ActiveLifetimePrior { by_source, overall }
}

type AttributeProjector = fn(&Record) -> Option<String>;

/// Build the bipartite fan-out prior across {GLAccount, CostCenter, ProfitCenter, TradingPartner}.
///
/// For every attribute, each observed value is mapped to the set of distinct
/// sources that post to it; the set sizes are then histogrammed with
/// `FANOUT_BUCKETS`. Records whose optional attribute is `None` do not
/// contribute to that attribute.
pub fn extract_fanout(records: &[Record]) -> FanoutPrior {
    fn gl_account(rec: &Record) -> Option<String> {
        Some(rec.gl_account.clone())
    }
    fn cost_center(rec: &Record) -> Option<String> {
        rec.cost_center.clone()
    }
    fn profit_center(rec: &Record) -> Option<String> {
        rec.profit_center.clone()
    }
    fn trading_partner(rec: &Record) -> Option<String> {
        rec.trading_partner.clone()
    }

    let projectors: [(&str, AttributeProjector); 4] = [
        ("GLAccount", gl_account),
        ("CostCenter", cost_center),
        ("ProfitCenter", profit_center),
        ("TradingPartner", trading_partner),
    ];

    let mut by_attribute: BTreeMap<String, LineCountHistogram> = BTreeMap::new();
    for (label, project) in projectors {
        // attribute value → distinct sources touching it.
        let mut sources_by_value: BTreeMap<String, HashSet<String>> = BTreeMap::new();
        for rec in records {
            let Some(value) = project(rec) else {
                continue;
            };
            sources_by_value
                .entry(value)
                .or_default()
                .insert(rec.source.clone());
        }
        let fanouts: Vec<u32> = sources_by_value
            .values()
            .map(|sources| sources.len() as u32)
            .collect();
        let (hist, _) = LineCountHistogram::build(&fanouts, FANOUT_BUCKETS);
        by_attribute.insert(label.to_string(), hist);
    }

    FanoutPrior { by_attribute }
}

/// Default minimum observations for a (source, attribute) pair to be retained
/// in the per-source attribute prior.
///
/// When supplied as `min_observations` to `extract_per_source_attribute`, the
/// quantity compared against it is the pair's total row count summed over all
/// observed values, not the count of any single value.
pub const DEFAULT_MIN_ATTRIBUTE_OBSERVATIONS: usize = 10;

/// SP3.7 — Extract per-source conditional distributions for downstream
/// attributes (GL account, cost center, profit center, and — SP3.8a —
/// trading partner).
///
/// For every `(source, attribute)` pair a categorical distribution over the
/// observed attribute values is built. Records with an empty source are
/// ignored, and an attribute that is absent or empty on a record contributes
/// nothing for that record. Pairs whose total row count is below
/// `min_observations` are dropped; a source with no surviving pair is dropped
/// entirely. The gate value is echoed back in the returned prior.
pub fn extract_per_source_attribute(
    records: &[Record],
    min_observations: usize,
) -> PerSourceAttributePrior {
    // source → attribute name → attribute value → row count.
    let mut tallies: BTreeMap<String, BTreeMap<String, BTreeMap<String, usize>>> = BTreeMap::new();

    for rec in records {
        if rec.source.is_empty() {
            continue;
        }
        let per_attribute = tallies.entry(rec.source.clone()).or_default();

        let observed: [(&str, Option<&str>); 4] = [
            ("gl_account", Some(rec.gl_account.as_str())),
            ("cost_center", rec.cost_center.as_deref()),
            ("profit_center", rec.profit_center.as_deref()),
            ("trading_partner", rec.trading_partner.as_deref()),
        ];
        for (attribute, value) in observed {
            let Some(value) = value.filter(|v| !v.is_empty()) else {
                continue;
            };
            *per_attribute
                .entry(attribute.to_string())
                .or_default()
                .entry(value.to_string())
                .or_default() += 1;
        }
    }

    // Keep only the (source, attribute) pairs that clear the gate.
    let by_source = tallies
        .into_iter()
        .filter_map(|(source, per_attribute)| {
            let mut kept: BTreeMap<String, CategoricalDistribution> = BTreeMap::new();
            for (attribute, value_counts) in per_attribute {
                let observations: usize = value_counts.values().sum();
                if observations >= min_observations {
                    kept.insert(attribute, CategoricalDistribution::from_counts(value_counts));
                }
            }
            (!kept.is_empty()).then_some((source, kept))
        })
        .collect();

    PerSourceAttributePrior {
        by_source,
        min_observations,
    }
}

/// SP4.6 — Default minimum observation count per `(source, role, gl_account)` triple
/// before the value is retained in the per-source-role GL conditional.  Privacy gate.
///
/// When supplied as `min_observations` to `extract_source_role_gl`, the same
/// value is applied twice: to each individual GL account count, and to the
/// sum of the surviving counts of the `(source, role)` pair.
pub const DEFAULT_MIN_SOURCE_ROLE_OBSERVATIONS: usize = 10;

/// SP4.6 — Extract per-(source, line_role) GL account categorical distributions.
///
/// The role of a line follows the sign of `functional_amount`:
/// - amount > 0  → "DR"
/// - amount < 0  → "CR"
/// - zero (or NaN, which compares as neither) → line skipped
///
/// Lines with an empty source or GL account are ignored. The output is
/// double-gated to avoid PII leakage from low-volume entries: GL accounts
/// seen fewer than `min_observations` times under a `(source, role)` are
/// removed first, and the pair itself is kept only if the surviving counts
/// still total at least `min_observations`. Sources with no surviving role
/// are omitted.
pub fn extract_source_role_gl(records: &[Record], min_observations: usize) -> PerSourceRolePrior {
    // source → role ("DR"|"CR") → gl_account → line count.
    let mut tallies: BTreeMap<String, BTreeMap<String, BTreeMap<String, usize>>> = BTreeMap::new();

    for rec in records {
        if rec.source.is_empty() || rec.gl_account.is_empty() {
            continue;
        }
        let role = match rec.functional_amount.partial_cmp(&0.0) {
            Some(std::cmp::Ordering::Greater) => "DR",
            Some(std::cmp::Ordering::Less) => "CR",
            // Zero-amount (and incomparable) lines contribute no signal.
            _ => continue,
        };
        let per_gl = tallies
            .entry(rec.source.clone())
            .or_default()
            .entry(role.to_string())
            .or_default();
        *per_gl.entry(rec.gl_account.clone()).or_default() += 1;
    }

    let by_source_and_role: BTreeMap<String, BTreeMap<String, CategoricalDistribution>> = tallies
        .into_iter()
        .filter_map(|(source, per_role)| {
            let roles: BTreeMap<String, CategoricalDistribution> = per_role
                .into_iter()
                .filter_map(|(role, mut gl_counts)| {
                    // Gate 1: individual GL accounts.
                    gl_counts.retain(|_, n| *n >= min_observations);
                    // Gate 2: what is left of the (source, role) pair.
                    let surviving: usize = gl_counts.values().sum();
                    if surviving < min_observations {
                        return None;
                    }
                    Some((role, CategoricalDistribution::from_counts(gl_counts)))
                })
                .collect();
            (!roles.is_empty()).then_some((source, roles))
        })
        .collect();

    PerSourceRolePrior { by_source_and_role }
}

/// SP4.8 — Default minimum observation count per `(source, class_pair)` before
/// the pair is retained in the per-source flow-pair prior.  Privacy gate.
///
/// When supplied as `min_observations` to `extract_source_flow_pairs`, the
/// same value also gates the total of a source's surviving pairs.
pub const DEFAULT_MIN_FLOW_PAIR_OBSERVATIONS: usize = 10;

/// SP4.8 — Leading-digit class granularity used for the flow-pair prior.
/// Matches the default granularity of the relational-fidelity ρ measure.
///
/// When supplied as `granularity` to `extract_source_flow_pairs`, it is
/// forwarded to `PerSourceFlowPairPrior::account_class` and recorded in the
/// returned prior.
pub const DEFAULT_FLOW_PAIR_GRANULARITY: usize = 1;

/// SP4.8 — Extract the per-source joint (debit-class, credit-class) flow-pair
/// distribution.
///
/// Mirrors the relational-fidelity ρ measure. Lines are grouped by
/// `je_number`; within a JE the distinct debit classes
/// (`functional_amount > 0`) are crossed with the distinct credit classes
/// (`functional_amount < 0`), and every resulting pair counts once per JE
/// under the JE's source — the source of the first line of that JE that
/// yields an account class. Classes come from
/// `PerSourceFlowPairPrior::account_class` (digits-only leading-`granularity`
/// strings); lines with an empty JE number, source or GL account, or without
/// a derivable class, are ignored.
///
/// Double privacy gate (mirrors `extract_source_role_gl`): individual
/// `(source, pair)` values below `min_observations` are dropped, and sources
/// whose retained total is below `min_observations` are dropped entirely.
pub fn extract_source_flow_pairs(
    records: &[Record],
    granularity: usize,
    min_observations: usize,
) -> PerSourceFlowPairPrior {
    use std::collections::BTreeSet;

    /// Distinct account classes seen on each side of one JE.
    #[derive(Default)]
    struct Legs {
        debit: BTreeSet<String>,
        credit: BTreeSet<String>,
    }

    // je_number → (source of the JE, classes per side).
    let mut per_je: BTreeMap<&str, (&str, Legs)> = BTreeMap::new();
    for rec in records {
        if rec.je_number.is_empty() || rec.source.is_empty() || rec.gl_account.is_empty() {
            continue;
        }
        let class = match PerSourceFlowPairPrior::account_class(&rec.gl_account, granularity) {
            Some(class) => class,
            None => continue,
        };
        let (_, legs) = per_je
            .entry(rec.je_number.as_str())
            .or_insert_with(|| (rec.source.as_str(), Legs::default()));
        if rec.functional_amount > 0.0 {
            legs.debit.insert(class);
        } else if rec.functional_amount < 0.0 {
            legs.credit.insert(class);
        }
    }

    // source → pair key → number of JEs exhibiting the pair (like the ρ measure).
    let mut pair_counts: BTreeMap<String, BTreeMap<String, usize>> = BTreeMap::new();
    for (source, legs) in per_je.into_values() {
        // A one-sided JE produces no pair and must not register its source.
        if legs.debit.is_empty() || legs.credit.is_empty() {
            continue;
        }
        let per_source = pair_counts.entry(source.to_string()).or_default();
        for debit_class in &legs.debit {
            for credit_class in &legs.credit {
                *per_source
                    .entry(PerSourceFlowPairPrior::pair_key(debit_class, credit_class))
                    .or_default() += 1;
            }
        }
    }

    let mut by_source: BTreeMap<String, CategoricalDistribution> = BTreeMap::new();
    for (source, mut pairs) in pair_counts {
        // Gate 1: individual pairs.
        pairs.retain(|_, n| *n >= min_observations);
        // Gate 2: what is left of the source.
        let surviving: usize = pairs.values().sum();
        if surviving >= min_observations {
            by_source.insert(source, CategoricalDistribution::from_counts(pairs));
        }
    }

    PerSourceFlowPairPrior {
        by_source,
        granularity,
    }
}

/// Default minimum observation count for a `(source, gl_prefix)` amount pair
/// to be retained in `PerSourceAmountPrior::by_source_and_class`.  Pairs with
/// fewer observations drop to the per-source marginal.  Privacy gate.
///
/// When supplied as `min_observations` to
/// `extract_source_amount_conditionals`, the same value also gates the
/// per-source marginal fit itself.
pub const DEFAULT_MIN_AMOUNT_OBSERVATIONS: usize = 10;

/// SP4.3 — Extract per-(source, gl_prefix) log-normal amount parameters from a
/// slice of records.
///
/// The "gl_prefix" key is the first 4 characters of the GL account number,
/// providing enough granularity to separate major balance-sheet categories
/// (e.g. "0041" vs "0022" vs "0047") without over-fitting.
///
/// Only absolute values > 0 are included in the fit; zero-amount lines are
/// dropped.  Groups below `min_observations` are silently excluded — their
/// source falls back to the source-marginal during sampling.
pub fn extract_source_amount_conditionals(
    records: &[Record],
    min_observations: usize,
) -> PerSourceAmountPrior {
    use std::collections::BTreeMap;

    use chrono::Datelike;

    // Accumulate raw absolute-amount values per (source, gl_prefix) and per source,
    // plus the per-JE debit-leg total for the JE-total sketch. JE numbers are
    // year-scoped (SAP reuses JENumber across fiscal years), so the JE key is
    // (effective-date year, je_number); the JE's source is its first debit leg's
    // (lines of one JE share a source in practice).
    let mut by_pair: BTreeMap<(String, String), Vec<f64>> = BTreeMap::new();
    let mut by_src: BTreeMap<String, Vec<f64>> = BTreeMap::new();
    let mut je_acc: BTreeMap<(i32, &str), (&str, f64)> = BTreeMap::new();

    for r in records {
        let abs_amt = r.functional_amount.abs();
        if abs_amt <= 0.0 || !abs_amt.is_finite() {
            continue;
        }
        if r.source.is_empty() {
            continue;
        }
        let gl_prefix: String = if r.gl_account.len() >= 4 {
            r.gl_account[..4].to_string()
        } else {
            r.gl_account.clone()
        };
        by_pair
            .entry((r.source.clone(), gl_prefix))
            .or_default()
            .push(abs_amt);
        by_src.entry(r.source.clone()).or_default().push(abs_amt);
        if r.functional_amount > 0.0 {
            let key = (r.effective_date.year(), r.je_number.as_str());
            let entry = je_acc.entry(key).or_insert((r.source.as_str(), 0.0));
            entry.1 += r.functional_amount;
        }
    }

    // Fit log-normal to each group that meets the threshold.
    let mut by_source_and_class: BTreeMap<String, BTreeMap<String, LognormalAmount>> =
        BTreeMap::new();
    for ((source, gl_prefix), values) in by_pair {
        if values.len() < min_observations {
            continue;
        }
        if let Some(params) = fit_lognormal_amount(&values) {
            by_source_and_class
                .entry(source)
                .or_default()
                .insert(gl_prefix, params);
        }
    }

    // Source-marginal fallback.
    let mut by_source: BTreeMap<String, LognormalAmount> = BTreeMap::new();
    for (source, values) in &by_src {
        if values.len() < min_observations {
            continue;
        }
        if let Some(params) = fit_lognormal_amount(values) {
            by_source.insert(source.clone(), params);
        }
    }

    // SP4.9 — empirical quantile sketch per source. Only high-volume sources
    // get a sketch (the log-normal marginal stays the fallback for the rest);
    // the per-knot privacy gate inside the builder drops knots above 1 - 5/n.
    let mut quantile_sketch_by_source: BTreeMap<String, AmountQuantileSketch> = BTreeMap::new();
    for (source, values) in by_src {
        if values.len() < DEFAULT_MIN_SKETCH_OBSERVATIONS {
            continue;
        }
        if let Some(sketch) = build_amount_quantile_sketch(values) {
            quantile_sketch_by_source.insert(source, sketch);
        }
    }

    // JE-total sketch per source — same gate and privacy posture as the line
    // sketch, but each observation is one JE's debit-leg total. This is what the
    // generator draws a JE's economic size from (the line sketch over-populates
    // the JE-total tail when totals are assembled from per-line draws).
    let mut je_totals_by_src: BTreeMap<&str, Vec<f64>> = BTreeMap::new();
    let mut je_totals_global: Vec<f64> = Vec::new();
    for (source, total) in je_acc.into_values() {
        if total > 0.0 && total.is_finite() {
            je_totals_by_src.entry(source).or_default().push(total);
            je_totals_global.push(total);
        }
    }
    let mut je_total_sketch_by_source: BTreeMap<String, AmountQuantileSketch> = BTreeMap::new();
    for (source, totals) in je_totals_by_src {
        if totals.len() < DEFAULT_MIN_SKETCH_OBSERVATIONS {
            continue;
        }
        if let Some(sketch) = build_amount_quantile_sketch(totals) {
            je_total_sketch_by_source.insert(source.to_string(), sketch);
        }
    }

    // Pooled GLOBAL JE-total sketch — every source's JE totals concatenated into
    // one sketch. Same gate and privacy posture as the per-source sketches; it is
    // the thin-source fallback, so sources below the per-source gate still draw a
    // JE total from the right quantity (a pooled JE-total distribution) instead of
    // the per-line assembly path whose line-count coupling inflates the tail.
    let je_total_sketch_global = if je_totals_global.len() >= DEFAULT_MIN_SKETCH_OBSERVATIONS {
        build_amount_quantile_sketch(je_totals_global)
    } else {
        None
    };

    PerSourceAmountPrior {
        by_source_and_class,
        by_source,
        quantile_sketch_by_source,
        je_total_sketch_by_source,
        je_total_sketch_global,
    }
}

/// SP4.9 — minimum amount-bearing observations for a source to receive an
/// empirical quantile sketch. Below this the tail knots would degenerate into
/// near-max order statistics; such sources keep the log-normal marginal.
///
/// `extract_source_amount_conditionals` applies the same gate to the
/// per-source JE-total sketches and to the pooled global JE-total sketch.
pub const DEFAULT_MIN_SKETCH_OBSERVATIONS: usize = 1000;

/// SP4.9 — fixed quantile grid for amount sketches. Dense in the tail because
/// that is where the per-source log-normal misfit concentrates.
///
/// Probabilities are listed in strictly increasing order, from 0.01 to 0.999.
pub const AMOUNT_SKETCH_GRID: &[f64] = &[
    0.01, 0.02, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98, 0.99,
    0.995, 0.999,
];

/// Build an [`AmountQuantileSketch`] from positive absolute amounts.
///
/// Non-positive and non-finite inputs are discarded first. Knots are
/// linearly interpolated order statistics at the probabilities of
/// [`AMOUNT_SKETCH_GRID`] that satisfy `p <= 1 - 5/n`, where `n` is the number
/// of usable values (privacy gate — no knot is the maximum or near-maximum of
/// a small group).
///
/// Returns `None` when fewer than 2 values are usable, when fewer than 4 grid
/// probabilities survive the gate, or when the finished sketch reports itself
/// as not usable.
fn build_amount_quantile_sketch(mut values: Vec<f64>) -> Option<AmountQuantileSketch> {
    values.retain(|v| v.is_finite() && *v > 0.0);
    let n = values.len();
    if n < 2 {
        return None;
    }
    values.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));

    // Privacy gate: only probabilities at or below 1 - 5/n may become knots.
    let highest_allowed = 1.0 - 5.0 / n as f64;
    let mut probabilities: Vec<f64> = Vec::new();
    for &p in AMOUNT_SKETCH_GRID {
        if p <= highest_allowed {
            probabilities.push(p);
        }
    }
    if probabilities.len() < 4 {
        return None;
    }

    // Linear interpolation between the two order statistics bracketing
    // position (n - 1) * p.
    let last = n - 1;
    let mut knots: Vec<f64> = Vec::with_capacity(probabilities.len());
    for &p in &probabilities {
        let position = last as f64 * p;
        let below = position.floor() as usize;
        let above = if below + 1 > last { last } else { below + 1 };
        let weight = position - below as f64;
        knots.push(values[below] + weight * (values[above] - values[below]));
    }

    let sketch = AmountQuantileSketch {
        probabilities,
        values: knots,
        n,
    };
    sketch.is_usable().then_some(sketch)
}

/// Fit log-normal parameters to a slice of absolute amounts.
///
/// Only strictly positive, finite values take part. `mu` is the mean of their
/// natural logs and `sigma` the population standard deviation of those logs
/// (divisor `n`, not `n - 1`). `median_abs` is the median of the retained
/// values themselves, and `n` is how many were retained.
///
/// Returns `None` when fewer than 2 positive finite values are present.
fn fit_lognormal_amount(values: &[f64]) -> Option<LognormalAmount> {
    let mut retained: Vec<f64> = values
        .iter()
        .copied()
        .filter(|v| *v > 0.0 && v.is_finite())
        .collect();
    let count = retained.len();
    if count < 2 {
        return None;
    }

    // Moments are taken in input order, before the sort needed for the median.
    let logs: Vec<f64> = retained.iter().map(|v| v.ln()).collect();
    let n = count as f64;
    let mu = logs.iter().sum::<f64>() / n;
    let squared_deviations: f64 = logs.iter().map(|x| (x - mu).powi(2)).sum();
    let sigma = (squared_deviations / n).sqrt();

    retained.sort_by(f64::total_cmp);
    let mid = count / 2;
    let median_abs = if count % 2 == 1 {
        retained[mid]
    } else {
        (retained[mid - 1] + retained[mid]) / 2.0
    };

    Some(LognormalAmount {
        mu,
        sigma,
        n: count,
        median_abs,
    })
}

/// Default minimum JE count for a Source to receive its own histogram.
///
/// Passed as `min_jes_per_source` to `extract_lines_per_je` by
/// `extract_behavioral_priors_with_gates`.
pub const DEFAULT_MIN_JES_PER_SOURCE: usize = 500;

/// Build [`LinesPerJePrior`]: a histogram of lines-per-JE over all journal
/// entries, plus one histogram per Source.
///
/// Lines are grouped by `je_number` alone. A JE is attributed to the Source of
/// its first line in `records`. Sources with fewer than `min_jes_per_source`
/// JEs get no histogram of their own.
///
/// NOTE(review): the tests for `extract_source_mix_je` describe JE numbers as
/// year-scoped; here the same number in two years counts as one JE — confirm
/// that this is intended.
pub fn extract_lines_per_je(records: &[Record], min_jes_per_source: usize) -> LinesPerJePrior {
    // je_number → (line count, Source of the first line seen for that JE).
    let mut per_je: BTreeMap<String, (u32, String)> = BTreeMap::new();
    for record in records {
        let slot = per_je
            .entry(record.je_number.clone())
            .or_insert_with(|| (0, record.source.clone()));
        slot.0 += 1;
    }

    let all_counts: Vec<u32> = per_je.values().map(|(lines, _)| *lines).collect();
    let (overall, _) = LineCountHistogram::build(&all_counts, LINE_COUNT_BUCKETS);

    // Regroup the per-JE counts under the owning Source.
    let mut counts_by_source: BTreeMap<String, Vec<u32>> = BTreeMap::new();
    for (lines, source) in per_je.values() {
        counts_by_source
            .entry(source.clone())
            .or_default()
            .push(*lines);
    }

    let by_source: BTreeMap<String, LineCountHistogram> = counts_by_source
        .into_iter()
        .filter(|(_, counts)| counts.len() >= min_jes_per_source)
        .map(|(source, counts)| {
            let (histogram, _) = LineCountHistogram::build(&counts, LINE_COUNT_BUCKETS);
            (source, histogram)
        })
        .collect();

    LinesPerJePrior {
        overall,
        by_source,
        min_jes_per_source,
    }
}

/// Result alias for the behavioral-prior extractors; the error type is
/// always [`FingerprintError`].
pub type BehavioralResult<T> = Result<T, FingerprintError>;

/// Build the `BehavioralPriors` for one client/data file from its records,
/// using the default source-mix gates.
///
/// Thin wrapper: forwards to [`extract_behavioral_priors_with_gates`] with
/// `SourceMixGates::default()` and returns its result unchanged.
pub fn extract_behavioral_priors(
    records: &[Record],
    industry: &str,
) -> BehavioralResult<BehavioralPriors> {
    extract_behavioral_priors_with_gates(records, industry, SourceMixGates::default())
}

/// `extract_behavioral_priors` with caller-supplied source-mix gates.
pub fn extract_behavioral_priors_with_gates(
    records: &[Record],
    industry: &str,
    source_mix_gates: SourceMixGates,
) -> BehavioralResult<BehavioralPriors> {
    let source_mix = extract_source_mix(
        records,
        source_mix_gates.min_share,
        source_mix_gates.min_observations,
    );
    // Per-JE shares over the same retained vocabulary — the generator's per-JE
    // source draw needs JE-count units, not line units.
    let source_mix_je = extract_source_mix_je(records, &source_mix.probabilities);
    Ok(BehavioralPriors {
        schema_version: BehavioralPriors::SCHEMA_VERSION,
        generator_version: env!("CARGO_PKG_VERSION").to_string(),
        industry: industry.to_string(),
        n_client_inputs: 1,
        n_rows_aggregated: records.len(),
        source_mix,
        per_source_iet: extract_per_source_iet(records, DEFAULT_MIN_IET_SAMPLES),
        lines_per_je: extract_lines_per_je(records, DEFAULT_MIN_JES_PER_SOURCE),
        active_lifetime: extract_active_lifetime(records),
        fanout: extract_fanout(records),
        posting_lag: extract_posting_lag(records, DEFAULT_MIN_LAG_SAMPLES),
        month_volume: extract_month_volume(records, DEFAULT_MIN_MONTH_VOLUME_JES),
        active_segments: Some(extract_active_segments(records)),
        entity_clusters: Some(extract_entity_clusters(records)),
        per_source_attribute: Some(extract_per_source_attribute(
            records,
            DEFAULT_MIN_ATTRIBUTE_OBSERVATIONS,
        )),
        tp_entity_clusters: Some(extract_tp_entity_clusters(records)),
        reference_formats: {
            let rf = extract_reference_formats(records, DEFAULT_MIN_REFERENCE_OCCURRENCES);
            if rf.by_source.is_empty() {
                None
            } else {
                Some(rf)
            }
        },
        coa_semantic: None,
        // SP4.5 — user_personas: the corpus GL files carry no user column, so
        // `extract_user_personas` returns an empty stub.  We always set `Some(stub)`
        // rather than `None` so that `LoadedPriors::user_personas` is present and the
        // `has_data()` guard on the generator side can be tested without real data.
        user_personas: {
            let up = extract_user_personas(records, DEFAULT_MIN_USER_RECORDS);
            Some(up)
        },
        // SP4.3 — per-(source, gl_prefix) amount conditionals.
        source_amount_conditionals: {
            let sac = extract_source_amount_conditionals(records, DEFAULT_MIN_AMOUNT_OBSERVATIONS);
            // Emit None when no (source, gl_prefix) pairs met the threshold —
            // avoids emitting an empty struct in bundles generated from tiny test data.
            if sac.by_source.is_empty() && sac.by_source_and_class.is_empty() {
                None
            } else {
                Some(sac)
            }
        },
        // SP4.6 — per-(source, line_role) GL account conditionals.
        source_role_gl_conditionals: {
            let srg = extract_source_role_gl(records, DEFAULT_MIN_SOURCE_ROLE_OBSERVATIONS);
            if srg.by_source_and_role.is_empty() {
                None
            } else {
                Some(srg)
            }
        },
        // SP4.8 — per-source joint (debit-class, credit-class) flow pairs.
        source_flow_pairs: {
            let sfp = extract_source_flow_pairs(
                records,
                DEFAULT_FLOW_PAIR_GRANULARITY,
                DEFAULT_MIN_FLOW_PAIR_OBSERVATIONS,
            );
            if sfp.by_source.is_empty() {
                None
            } else {
                Some(sfp)
            }
        },
        // SP6 — text_taxonomy: populated via extract_text_taxonomy (separate path).
        // The `extract_behavioral_priors` path (Record slice, JE-only) leaves this as None.
        text_taxonomy: None,
        // SP4.1 — TB anchoring is populated separately via `extract_tb_anchor_from_parquet`
        approver: None,
        // (tb_extractor module) for callers that have access to real TB_XXX.parquet files.
        // The `extract_behavioral_priors` path (Record slice, JE-only) leaves this as None.
        source_mix_je,
        tb_anchor: None,
        // Phase-2 R6 — populated by `extract_behavioral_priors_from_path*` for
        // parquet inputs that carry a system/manual indicator column.  The
        // Record-slice path has no access to that column.
        manual_share: None,
    })
}

/// Convenience: load a parquet or CSV file and extract its behavioral priors
/// with the default source-mix gates.
///
/// Thin wrapper: forwards to
/// [`extract_behavioral_priors_from_path_with_gates`] with
/// `SourceMixGates::default()` and returns its result unchanged.
pub fn extract_behavioral_priors_from_path(
    path: &Path,
    industry: &str,
) -> BehavioralResult<BehavioralPriors> {
    extract_behavioral_priors_from_path_with_gates(path, industry, SourceMixGates::default())
}

/// `extract_behavioral_priors_from_path` with caller-supplied source-mix gates.
///
/// The loader is chosen from the file extension, matched case-sensitively:
/// `parquet` or `csv`. For parquet inputs the manual-share and approver priors
/// are additionally read straight from the file, because those columns are
/// not part of `Record`.
///
/// # Errors
/// Returns an extraction error when the extension is neither `parquet` nor
/// `csv`, or when loading the records fails. Failures of the two parquet side
/// channels are logged as warnings and leave the corresponding prior `None`.
pub fn extract_behavioral_priors_from_path_with_gates(
    path: &Path,
    industry: &str,
    source_mix_gates: SourceMixGates,
) -> BehavioralResult<BehavioralPriors> {
    let extension = path.extension().and_then(|ext| ext.to_str());
    let is_parquet = extension == Some("parquet");
    let records = if is_parquet {
        load_parquet_records(path)
            .map_err(|e| io_error_to_fp(format!("parquet load failed: {e}")))?
    } else if extension == Some("csv") {
        load_csv_records(path).map_err(|e| io_error_to_fp(format!("csv load failed: {e}")))?
    } else {
        return Err(io_error_to_fp(format!(
            "unsupported extension at {}",
            path.display()
        )));
    };

    let mut priors = extract_behavioral_priors_with_gates(&records, industry, source_mix_gates)?;
    if is_parquet {
        // Phase-2 R6 — the system/manual indicator is not part of `Record`, so
        // it is read directly from the parquet. Soft-fail: a malformed column
        // must not sink the whole extraction.
        priors.manual_share = extract_manual_share_from_parquet(
            path,
            super::manual_extractor::DEFAULT_MIN_MANUAL_OBSERVATIONS,
        )
        .unwrap_or_else(|e| {
            tracing::warn!("manual-share extraction failed (skipping): {e}");
            None
        });
        // Phase-2 R9 — enterer/approver columns are likewise a side channel,
        // with the same soft-fail policy.
        priors.approver = super::approver_extractor::extract_approver_prior_from_parquet(
            path,
            super::approver_extractor::DEFAULT_MIN_APPROVER_OBSERVATIONS,
        )
        .unwrap_or_else(|e| {
            tracing::warn!("approver extraction failed (skipping): {e}");
            None
        });
    }
    Ok(priors)
}

/// Wrap a load/IO failure message in a `FingerprintError::ExtractionError`
/// attributed to the `behavioral_priors` extractor.
fn io_error_to_fp(msg: String) -> FingerprintError {
    let extractor = String::from("behavioral_priors");
    FingerprintError::ExtractionError {
        extractor,
        message: msg,
    }
}

/// Gap threshold, in days, that `extract_active_segments` passes to
/// `split_into_segments`: a gap strictly greater than this between two
/// consecutive active dates starts a new segment.
pub const SEGMENT_GAP_THRESHOLD_DAYS: i64 = 7;

/// Partition a sorted, de-duplicated date list into contiguous activity
/// segments.
///
/// A new segment starts whenever the distance to the previous date is strictly
/// greater than `gap_threshold` days. Returns the segments as inclusive
/// `(start, end)` pairs, together with the length in days of every gap that
/// caused a split (always one fewer than the number of segments). Empty input
/// yields two empty vectors.
fn split_into_segments(
    dates: &[NaiveDate],
    gap_threshold: i64,
) -> (Vec<(NaiveDate, NaiveDate)>, Vec<u32>) {
    let Some((&first, rest)) = dates.split_first() else {
        return (Vec::new(), Vec::new());
    };

    let mut segments = Vec::new();
    let mut gaps = Vec::new();
    // The segment currently being extended, as (start, end).
    let mut current = (first, first);
    for &date in rest {
        let days_since_previous = (date - current.1).num_days();
        if days_since_previous > gap_threshold {
            segments.push(current);
            gaps.push(days_since_previous as u32);
            current = (date, date);
        } else {
            current.1 = date;
        }
    }
    segments.push(current);
    (segments, gaps)
}

/// Per-Source multi-segment active-pattern extractor.
///
/// For each Source, the distinct `entry_date`s are split into segments
/// separated by gaps longer than [`SEGMENT_GAP_THRESHOLD_DAYS`]. Three
/// histograms are then built per Source: the segment count, the segment
/// lengths (end minus start in days, so a one-day segment has length 0), and
/// the gap lengths. Sources with fewer than two distinct dates are omitted.
pub fn extract_active_segments(records: &[Record]) -> ActiveSegmentsPrior {
    let mut dates_by_source: BTreeMap<String, Vec<NaiveDate>> = BTreeMap::new();
    for record in records {
        dates_by_source
            .entry(record.source.clone())
            .or_default()
            .push(record.entry_date);
    }

    let by_source: BTreeMap<String, SourceSegmentSummary> = dates_by_source
        .into_iter()
        .filter_map(|(source, mut dates)| {
            dates.sort();
            dates.dedup();
            if dates.len() < 2 {
                return None;
            }
            let (segments, gaps) = split_into_segments(&dates, SEGMENT_GAP_THRESHOLD_DAYS);
            let segment_count = segments.len() as u32;
            let segment_lengths: Vec<u32> = segments
                .iter()
                .map(|&(start, end)| (end - start).num_days().max(0) as u32)
                .collect();

            let (segment_count_histogram, _) =
                LineCountHistogram::build(&[segment_count], SEGMENT_COUNT_BUCKETS);
            let (segment_length_histogram, _) =
                LineCountHistogram::build(&segment_lengths, ACTIVE_LIFETIME_DAY_BUCKETS);
            let (gap_length_histogram, _) = LineCountHistogram::build(&gaps, SEGMENT_GAP_BUCKETS);

            let summary = SourceSegmentSummary {
                segment_count_histogram,
                segment_length_histogram,
                gap_length_histogram,
            };
            Some((source, summary))
        })
        .collect();

    ActiveSegmentsPrior { by_source }
}

/// Maximum number of Sources to cluster — bound on O(N²) Jaccard work.
///
/// The cap is applied to raw Source codes ranked by row count, before the
/// codes are normalised.
const MAX_SOURCES_FOR_CLUSTERING: usize = 50;

/// Jaccard similarity threshold for a Source-pair to be considered "clustered".
///
/// The comparison is inclusive (`>=`). The same threshold is used for
/// trading-partner clustering.
const JACCARD_THRESHOLD: f64 = 0.3;

/// SP3.5a — Canonical SAP-style Source codes the synthetic generator emits.
/// Used by `normalise_source_code` to map heterogeneous corpus codes to
/// the synthetic vocabulary so the motif sampler can actually fire at lookup.
///
/// Matching against this list is exact (case-sensitive) on the trimmed code.
const CANONICAL_SAP_CODES: &[&str] = &[
    "KR", "RV", "DZ", "WE", "RE", "SA", "IM", "KZ", "AB", "AF", "DR", "KK", "K9", "KX", "PK", "RB",
    "RY", "SL", "ZP",
];

/// Translate a corpus Source code into the synthetic generator's SAP
/// vocabulary.
///
/// Surrounding whitespace is ignored. A code already listed in
/// [`CANONICAL_SAP_CODES`] is returned as-is. A few numeric codes observed in
/// the corpus are mapped on a best-effort basis (`0`/`00` → `SA`, `1`/`01` →
/// `RV`, `2`/`02` → `KR`). Empty and unknown codes return `None`, so they are
/// excluded from clustering rather than misclassified.
fn normalise_source_code(raw: &str) -> Option<String> {
    let canonical = match raw.trim() {
        "" => return None,
        known if CANONICAL_SAP_CODES.contains(&known) => known,
        // Best-effort numeric fallbacks observed in the corpus.
        "0" | "00" => "SA",
        "1" | "01" => "RV",
        "2" | "02" => "KR",
        _ => return None,
    };
    Some(canonical.to_string())
}

/// Rank keys by how often they occur and keep the `limit` most frequent.
///
/// Ties are broken by ascending key, so the selection is deterministic.
fn most_frequent_keys<'a>(
    keys: impl Iterator<Item = &'a String>,
    limit: usize,
) -> HashSet<&'a String> {
    let mut counts: BTreeMap<&'a String, usize> = BTreeMap::new();
    for key in keys {
        *counts.entry(key).or_insert(0) += 1;
    }
    let mut ranked: Vec<(&'a String, usize)> = counts.into_iter().collect();
    // Stable sort over key-ordered input: equal counts stay in key order.
    ranked.sort_by_key(|&(_, count)| std::cmp::Reverse(count));
    ranked
        .into_iter()
        .take(limit)
        .map(|(key, _)| key)
        .collect()
}

/// Cluster the keys of `attr_sets` by attribute overlap.
///
/// Two keys are linked when the Jaccard similarity of their attribute sets is
/// at least [`JACCARD_THRESHOLD`]. Every connected component with two or more
/// members becomes an [`EntityCluster`], whose `avg_jaccard` is the mean
/// similarity over the linked pairs inside it. `clustering_rate` is the share
/// of keys that ended up in some cluster (0 when there are no keys).
fn jaccard_connected_components(
    attr_sets: &BTreeMap<String, HashSet<String>>,
) -> EntityClustersPrior {
    // Nodes are addressed by their position in key order. Index order and key
    // order therefore coincide.
    let keys: Vec<&String> = attr_sets.keys().collect();
    let sets: Vec<&HashSet<String>> = attr_sets.values().collect();
    let node_count = keys.len();

    // Pairwise Jaccard → thresholded adjacency and edge weights.
    let mut neighbours: Vec<Vec<usize>> = vec![Vec::new(); node_count];
    let mut edge_weights: BTreeMap<(usize, usize), f64> = BTreeMap::new();
    for i in 0..node_count {
        for j in (i + 1)..node_count {
            let (a, b) = (sets[i], sets[j]);
            if a.is_empty() || b.is_empty() {
                continue;
            }
            let shared = a.intersection(b).count();
            // |A ∪ B| = |A| + |B| − |A ∩ B|; non-zero because both are non-empty.
            let combined = a.len() + b.len() - shared;
            let jaccard = shared as f64 / combined as f64;
            if jaccard >= JACCARD_THRESHOLD {
                neighbours[i].push(j);
                neighbours[j].push(i);
                edge_weights.insert((i, j), jaccard);
            }
        }
    }

    // Connected components via an explicit-stack depth-first search.
    let mut visited = vec![false; node_count];
    let mut clusters: Vec<EntityCluster> = Vec::new();
    for start in 0..node_count {
        if visited[start] {
            continue;
        }
        let mut component: Vec<usize> = Vec::new();
        let mut stack = vec![start];
        while let Some(node) = stack.pop() {
            if visited[node] {
                continue;
            }
            visited[node] = true;
            component.push(node);
            stack.extend(
                neighbours[node]
                    .iter()
                    .copied()
                    .filter(|&next| !visited[next]),
            );
        }
        if component.len() < 2 {
            continue;
        }

        // Average similarity over the pairs that are actually linked.
        let mut weight_sum = 0.0;
        let mut edge_count = 0.0;
        for (pos, &a) in component.iter().enumerate() {
            for &b in &component[pos + 1..] {
                if let Some(&weight) = edge_weights.get(&(a.min(b), a.max(b))) {
                    weight_sum += weight;
                    edge_count += 1.0;
                }
            }
        }
        let avg_jaccard = if edge_count > 0.0 {
            weight_sum / edge_count
        } else {
            0.0
        };
        clusters.push(EntityCluster {
            members: component.iter().map(|&idx| keys[idx].clone()).collect(),
            avg_jaccard,
        });
    }

    let total_in_clusters: usize = clusters.iter().map(|c| c.members.len()).sum();
    let clustering_rate = total_in_clusters as f64 / node_count.max(1) as f64;

    EntityClustersPrior {
        clusters,
        clustering_rate,
    }
}

/// SP3.3 — Discover clusters of Sources that share attribute pools (GL Account,
/// Cost Center, Profit Center, Trading Partner). Uses Jaccard-threshold
/// connected components on the top-K Sources by row count.
///
/// SP3.5a — Source codes are passed through `normalise_source_code` before
/// clustering, so cluster members use the synthetic generator's SAP
/// vocabulary. Codes it rejects are dropped, and raw codes that map to the
/// same canonical code have their attribute sets merged.
///
/// NOTE(review): the top-K cap is applied to raw codes, before normalisation.
/// Unknown codes can therefore take up slots that lower-ranked canonical codes
/// would otherwise get — confirm this is intended.
pub fn extract_entity_clusters(records: &[Record]) -> EntityClustersPrior {
    // 1. Cap to the top K Sources by row count.
    let top_sources = most_frequent_keys(
        records.iter().map(|r| &r.source),
        MAX_SOURCES_FOR_CLUSTERING,
    );

    // 2. Build the per-Source attribute set across {GL, CC, PC, TP}.
    let mut raw_sets: BTreeMap<&String, HashSet<String>> = BTreeMap::new();
    for r in records {
        if !top_sources.contains(&r.source) {
            continue;
        }
        let set = raw_sets.entry(&r.source).or_default();
        set.insert(format!("GL:{}", r.gl_account));
        if let Some(cc) = &r.cost_center {
            set.insert(format!("CC:{cc}"));
        }
        if let Some(pc) = &r.profit_center {
            set.insert(format!("PC:{pc}"));
        }
        if let Some(tp) = &r.trading_partner {
            set.insert(format!("TP:{tp}"));
        }
    }

    // 3. Normalise the codes, merging sets that land on the same canonical code.
    let mut attr_sets: BTreeMap<String, HashSet<String>> = BTreeMap::new();
    for (raw, set) in raw_sets {
        if let Some(canonical) = normalise_source_code(raw) {
            attr_sets.entry(canonical).or_default().extend(set);
        }
    }

    // 4. Pairwise Jaccard, threshold, connected components.
    jaccard_connected_components(&attr_sets)
}

/// Maximum number of TradingPartner values to cluster — bounds the O(N²) Jaccard work.
const MAX_TP_FOR_CLUSTERING: usize = 200;

/// SP3.12 — Discover clusters of TradingPartner values that share attribute pools
/// (GL Account, Cost Center, Profit Center, Source). Uses the same Jaccard-threshold
/// connected-components algorithm as `extract_entity_clusters` but keyed on TP.
/// These clusters drive the `TpMotifSampler` in the generator to emit TP values that
/// tend to share GL accounts, building triangle structure in the TP co-occurrence graph.
///
/// Rows with a missing or empty trading partner are ignored. Unlike Source
/// clustering, no code normalisation is applied.
pub fn extract_tp_entity_clusters(records: &[Record]) -> EntityClustersPrior {
    // 1. Cap to the top K trading partners by row count.
    let top_tps = most_frequent_keys(
        records
            .iter()
            .filter_map(|r| r.trading_partner.as_ref())
            .filter(|tp| !tp.is_empty()),
        MAX_TP_FOR_CLUSTERING,
    );

    // 2. Build the per-TP attribute set across {GL, CC, PC, Source}.
    let mut attr_sets: BTreeMap<String, HashSet<String>> = BTreeMap::new();
    for r in records {
        let Some(tp) = r
            .trading_partner
            .as_ref()
            .filter(|tp| !tp.is_empty() && top_tps.contains(*tp))
        else {
            continue;
        };
        let set = attr_sets.entry(tp.clone()).or_default();
        set.insert(format!("GL:{}", r.gl_account));
        if let Some(cc) = &r.cost_center {
            set.insert(format!("CC:{cc}"));
        }
        if let Some(pc) = &r.profit_center {
            set.insert(format!("PC:{pc}"));
        }
        // Include source code so TPs appearing on the same sources cluster.
        set.insert(format!("SRC:{}", r.source));
    }

    // 3. Pairwise Jaccard, threshold, connected components.
    jaccard_connected_components(&attr_sets)
}

#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{Duration, NaiveDate};
    use rand::{RngExt, SeedableRng};

    /// Minimal `Record` fixture for Source `src`.
    ///
    /// Everything else is fixed: JE `J1`, line `001`, GL account `1`, amount
    /// `1.0`, effective and entry date 2022-01-01, no cost center, profit
    /// center, trading partner or creation timestamp, and empty texts.
    pub(crate) fn rec(src: &str) -> Record {
        let d = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        Record {
            source: src.into(),
            gl_account: "1".into(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: "J1".into(),
            je_line_number: "001".into(),
            effective_date: d,
            entry_date: d,
            created_at: None,
            functional_amount: 1.0,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    /// `rec` with an explicit functional amount (zero-amount-line tests).
    pub(crate) fn rec_amt(src: &str, amount: f64) -> Record {
        let mut r = rec(src);
        r.functional_amount = amount;
        r
    }

    #[test]
    fn source_mix_ignores_zero_amount_lines() {
        // 60 amount-bearing A rows, 30 amount-bearing B rows, and 200
        // zero-amount B rows (clearing/statistical lines). The mix must
        // reflect the amount-bearing composition only: A 60/90, B 30/90.
        let recs: Vec<Record> = (0..60)
            .map(|_| rec_amt("A", 10.0))
            .chain((0..30).map(|_| rec_amt("B", -5.0)))
            .chain((0..200).map(|_| rec_amt("B", 0.0)))
            .collect();
        let mix = extract_source_mix(&recs, DEFAULT_MIN_SOURCE_THRESHOLD, 0);
        let expected_a = 60.0 / 90.0;
        let expected_b = 30.0 / 90.0;
        assert!((mix.probabilities["A"] - expected_a).abs() < 1e-9);
        assert!((mix.probabilities["B"] - expected_b).abs() < 1e-9);
    }

    #[test]
    fn source_mix_falls_back_to_all_lines_when_amounts_degenerate() {
        // With every amount blanked/zeroed the mix must not come out empty:
        // the extractor falls back to counting all lines (A 70 / B 30).
        let recs: Vec<Record> = (0..70)
            .map(|_| rec_amt("A", 0.0))
            .chain((0..30).map(|_| rec_amt("B", 0.0)))
            .collect();
        let mix = extract_source_mix(&recs, DEFAULT_MIN_SOURCE_THRESHOLD, 0);
        assert!((mix.probabilities["A"] - 0.7).abs() < 1e-9);
        assert!((mix.probabilities["B"] - 0.3).abs() < 1e-9);
    }

    #[test]
    fn source_mix_shares_match() {
        // 60 / 30 / 10 rows for A / B / C.
        let recs: Vec<Record> = (0..60)
            .map(|_| rec("A"))
            .chain((0..30).map(|_| rec("B")))
            .chain((0..10).map(|_| rec("C")))
            .collect();
        // min_observations=0 so the obs-count filter does not fire here.
        let mix = extract_source_mix(&recs, DEFAULT_MIN_SOURCE_THRESHOLD, 0);
        // After renormalisation each source keeps its relative share.
        assert!((mix.probabilities["A"] - 0.6).abs() < 1e-9);
        assert!((mix.probabilities["B"] - 0.3).abs() < 1e-9);
        assert!((mix.probabilities["C"] - 0.1).abs() < 1e-9);
        // Nothing was folded away.
        assert!(mix.other_fraction.abs() < 1e-9);
    }

    #[test]
    fn source_mix_long_tail_rolls_into_other() {
        // One dominant code plus five singletons X1..X5.
        let mut recs: Vec<Record> = (0..995).map(|_| rec("A")).collect();
        recs.extend((1..=5).map(|i| rec(&format!("X{i}"))));
        // min_observations=0 so only the probability threshold applies.
        let mix = extract_source_mix(&recs, 0.005, 0);
        // A has 99.5% share — after renormalisation over retained codes it is 1.0.
        // The X* codes are folded into other_fraction via the probability threshold.
        assert!(mix.probabilities.contains_key("A"));
        assert!(!mix.probabilities.contains_key("X1"));
        assert!(mix.other_fraction > 0.0);
    }

    #[test]
    fn source_mix_empty_input_returns_empty() {
        // No records: no retained codes and nothing folded into "other".
        let no_records: &[Record] = &[];
        let mix = extract_source_mix(no_records, DEFAULT_MIN_SOURCE_THRESHOLD, 0);
        assert!(mix.probabilities.is_empty());
        assert!(mix.other_fraction.abs() < 1e-9);
    }

    #[test]
    fn source_mix_gates_default_matches_constants() {
        // The `Default` impl must stay in sync with the published constants.
        let SourceMixGates {
            min_share,
            min_observations,
        } = SourceMixGates::default();
        assert!((min_share - DEFAULT_MIN_SOURCE_THRESHOLD).abs() < 1e-12);
        assert_eq!(min_observations, DEFAULT_MIN_SOURCE_OBSERVATIONS);
    }

    #[test]
    fn lowered_gates_retain_rare_but_real_sources() {
        // A long-tailed vocabulary: B sits below both default gates (0.4%
        // share, <1000 obs) but is a real recurring code. The default gates
        // fold it into other_fraction; lowered gates keep it individually.
        let recs: Vec<Record> = (0..2490)
            .map(|_| rec("A"))
            .chain((0..10).map(|_| rec("B")))
            .collect();

        let with_defaults = extract_behavioral_priors(&recs, "technology")
            .expect("default extraction")
            .source_mix;
        assert!(!with_defaults.probabilities.contains_key("B"));

        let lowered = SourceMixGates {
            min_share: 0.001,
            min_observations: 10,
        };
        let mix = extract_behavioral_priors_with_gates(&recs, "technology", lowered)
            .expect("gated extraction")
            .source_mix;
        assert!(mix.probabilities.contains_key("B"), "B should survive");
        assert!((mix.probabilities["B"] - 0.004).abs() < 1e-9);
        // Retained shares still form a probability distribution.
        let share_sum: f64 = mix.probabilities.values().sum();
        assert!((share_sum - 1.0).abs() < 1e-9);
    }

    /// The JE-share mix counts JOURNAL ENTRIES, not lines: a 2-line source
    /// with 6x the JE count of an 8-line source dominates the JE mix even
    /// though their LINE shares are only 60/40.
    #[test]
    fn extract_source_mix_je_uses_je_counts_not_line_counts() {
        let mut records: Vec<Record> = Vec::new();
        // CO: 60 JEs x 2 lines (+X/−X) → 120 amount-bearing lines.
        for i in 0..60 {
            let je = format!("CO-{i}");
            records.extend([
                make_je_rec("CO", &je, 2024, 100.0),
                make_je_rec("CO", &je, 2024, -100.0),
            ]);
        }
        // WL: 10 JEs x 8 lines (4x +25 / 4x −25) → 80 amount-bearing lines.
        for i in 0..10 {
            let je = format!("WL-{i}");
            for _ in 0..4 {
                records.extend([
                    make_je_rec("WL", &je, 2024, 25.0),
                    make_je_rec("WL", &je, 2024, -25.0),
                ]);
            }
        }

        // Line units: 120 vs 80.
        let line_mix = extract_source_mix(&records, 0.001, 10);
        assert!((line_mix.probabilities["CO"] - 0.60).abs() < 1e-9);
        assert!((line_mix.probabilities["WL"] - 0.40).abs() < 1e-9);

        // JE units: 60 vs 10.
        let je_mix = extract_source_mix_je(&records, &line_mix.probabilities)
            .expect("JE mix present when the line mix has a vocabulary");
        assert!(
            (je_mix.probabilities["CO"] - 60.0 / 70.0).abs() < 1e-9,
            "CO JE share must be 6/7, got {}",
            je_mix.probabilities["CO"]
        );
        assert!((je_mix.probabilities["WL"] - 10.0 / 70.0).abs() < 1e-9);
        let share_sum: f64 = je_mix.probabilities.values().sum();
        assert!((share_sum - 1.0).abs() < 1e-9);
    }

    /// The JE-share mix scopes JE numbers by effective-date year: a number that
    /// shows up in two different years is counted as TWO journal entries.
    #[test]
    fn extract_source_mix_je_year_scopes_je_numbers() {
        let mut records: Vec<Record> = Vec::new();
        // CO reuses JE number "J1" in both 2024 and 2025.
        for year in [2024, 2025] {
            records.extend([
                make_je_rec("CO", "J1", year, 100.0),
                make_je_rec("CO", "J1", year, -100.0),
            ]);
        }
        // WL: two distinct JEs, both in 2024.
        for n in 0..2 {
            let je = format!("W-{n}");
            records.extend([
                make_je_rec("WL", &je, 2024, 50.0),
                make_je_rec("WL", &je, 2024, -50.0),
            ]);
        }

        let line_mix = extract_source_mix(&records, 0.001, 1);
        let je_mix =
            extract_source_mix_je(&records, &line_mix.probabilities).expect("JE mix present");

        // Scoped by year, CO owns 2 of the 4 JEs (2024-J1 and 2025-J1) → 0.5.
        // Merging J1 across years would leave CO with 1 of 3 ≈ 0.333.
        let co_share = je_mix.probabilities["CO"];
        assert!(
            (co_share - 0.5).abs() < 1e-9,
            "CO JE share must be 0.5 (year-scoped), got {}",
            co_share
        );
    }

    /// The JE mix is limited to the vocabulary it is handed (normally the codes
    /// the LINE mix retained). JEs of any other code are kept out of
    /// `probabilities` and their share is reported through `other_fraction`,
    /// so both mixes describe the same source universe.
    #[test]
    fn extract_source_mix_je_inherits_line_mix_vocabulary() {
        let mut records: Vec<Record> = Vec::new();
        // 40 two-line CO JEs.
        for n in 0..40 {
            let je = format!("CO-{n}");
            records.extend([
                make_je_rec("CO", &je, 2024, 100.0),
                make_je_rec("CO", &je, 2024, -100.0),
            ]);
        }
        // 10 single-line ZX JEs.
        records.extend((0..10).map(|n| make_je_rec("ZX", &format!("ZX-{n}"), 2024, 9.0)));

        // Vocabulary excludes ZX (as if the line gates dropped it).
        let retained = BTreeMap::from([("CO".to_string(), 1.0)]);

        let je_mix = extract_source_mix_je(&records, &retained).expect("JE mix present");
        assert!(!je_mix.probabilities.contains_key("ZX"));
        let co_share = je_mix.probabilities["CO"];
        assert!((co_share - 1.0).abs() < 1e-9);

        // 10 of the 50 JEs fall outside the vocabulary.
        let excluded = je_mix.other_fraction;
        assert!(
            (excluded - 0.2).abs() < 1e-9,
            "excluded JE mass should be reported, got {}",
            excluded
        );
    }

    /// End-to-end wiring check: the gated extraction fills `source_mix_je`,
    /// and its keys are exactly the keys of the line-level `source_mix`.
    #[test]
    fn extract_behavioral_priors_populates_source_mix_je() {
        let mut records: Vec<Record> = Vec::new();
        // CO: 30 two-line JEs.
        for n in 0..30 {
            let je = format!("CO-{n}");
            records.extend([
                make_je_rec("CO", &je, 2024, 100.0),
                make_je_rec("CO", &je, 2024, -100.0),
            ]);
        }
        // WL: 10 four-line JEs (10 + 11 + 12 − 33).
        for n in 0..10 {
            let je = format!("WL-{n}");
            for amount in [10.0, 11.0, 12.0, -33.0] {
                records.push(make_je_rec("WL", &je, 2024, amount));
            }
        }

        // Low gates: the default 1000-line gate would empty the vocabulary at
        // this test scale (and source_mix_je correctly stays None then).
        let low_gates = SourceMixGates {
            min_observations: 10,
            min_share: 0.001,
        };
        let bp = extract_behavioral_priors_with_gates(&records, "test", low_gates).expect("ok");
        let je_mix = bp.source_mix_je.expect("source_mix_je populated");
        let line_keys: Vec<&String> = bp.source_mix.probabilities.keys().collect();
        let je_keys: Vec<&String> = je_mix.probabilities.keys().collect();
        assert_eq!(je_keys, line_keys, "vocabularies must match");

        let co_share = je_mix.probabilities["CO"];
        assert!(
            (co_share - 0.75).abs() < 1e-9,
            "CO is 30 of 40 JEs, got {}",
            co_share
        );

        // Default gates at this scale: if the vocabulary comes out empty, the
        // JE mix must be absent (the generator then keeps its legacy
        // line-share draw). Nothing is asserted when the vocabulary is not empty.
        let bp_default = extract_behavioral_priors(&records, "test").expect("ok");
        let vocabulary_empty = bp_default.source_mix.probabilities.is_empty();
        assert!(!vocabulary_empty || bp_default.source_mix_je.is_none());
    }

    /// With the share threshold disabled (0.0), only the observation-count
    /// gate applies: of KR (1500 rows), RV (100) and DZ (5), only KR clears a
    /// `min_observations` of 1000, and it is renormalised to probability 1.0.
    #[test]
    fn extract_source_mix_drops_low_volume_codes() {
        let records: Vec<_> = [("KR", 1500), ("RV", 100), ("DZ", 5)]
            .iter()
            .flat_map(|&(code, rows)| (0..rows).map(move |_| rec(code)))
            .collect();

        // prob-threshold 0.0 so only the obs-count gate applies.
        let mix = extract_source_mix(&records, 0.0, 1000);
        let kept = &mix.probabilities;

        assert!(kept.contains_key("KR"), "KR should survive");
        assert!(!kept.contains_key("RV"), "RV (100 obs) should be dropped");
        assert!(!kept.contains_key("DZ"), "DZ (5 obs) should be dropped");

        // KR is the sole survivor, so its renormalised probability is 1.0.
        let kr_share = kept["KR"];
        assert!(
            (kr_share - 1.0).abs() < 1e-9,
            "KR probability should be 1.0 after renormalisation"
        );
    }

    /// Source A has 120 consecutive daily entries and B has 50. With the gate
    /// argument set to 100 only A is reported, with 119 inter-event gaps and a
    /// lognormal fit.
    #[test]
    fn per_source_iet_basic() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let mut recs: Vec<Record> = Vec::new();
        for (src, days) in [("A", 120i64), ("B", 50)] {
            for offset in 0..days {
                let mut r = rec(src);
                r.entry_date = base + chrono::Duration::days(offset);
                recs.push(r);
            }
        }

        let prior = extract_per_source_iet(&recs, 100);
        assert!(prior.by_source.contains_key("A"));
        assert!(!prior.by_source.contains_key("B"));

        let a_summary = &prior.by_source["A"];
        // 120 dated entries → 119 gaps between neighbours.
        assert_eq!(a_summary.n, 119);
        assert!(a_summary.lognormal_fit.is_some());
    }

    /// 200 entries spaced exactly three days apart: the reported lag-1
    /// autocorrelation for the source must be 0.0.
    #[test]
    fn per_source_iet_constant_gap_zero_autocorr() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let recs: Vec<Record> = (0..200i64)
            .map(|step| {
                let mut r = rec("A");
                r.entry_date = base + chrono::Duration::days(3 * step);
                r
            })
            .collect();

        let prior = extract_per_source_iet(&recs, 100);
        // Constant IET → zero variance → pearson returns None → falls back to 0.0.
        let autocorr = prior.by_source["A"].lag1_autocorr;
        assert!(autocorr.abs() < 1e-9);
    }

    /// Three JEs with 3, 2 and 1 lines respectively: the overall line-count
    /// histogram must put one third of its mass in each of buckets 1, 2 and 3.
    #[test]
    fn lines_per_je_overall_known() {
        let mut recs: Vec<Record> = Vec::new();
        for (je, line_count) in [("JE-A", 3), ("JE-B", 2), ("JE-C", 1)] {
            for _ in 0..line_count {
                let mut r = rec("S");
                r.je_number = je.into();
                recs.push(r);
            }
        }

        let prior = extract_lines_per_je(&recs, DEFAULT_MIN_JES_PER_SOURCE);
        for bucket in [1, 2, 3] {
            let idx = LINE_COUNT_BUCKETS
                .iter()
                .position(|&b| b == bucket)
                .unwrap();
            assert!((prior.overall.probabilities[idx] - 1.0 / 3.0).abs() < 1e-9);
        }
        assert_eq!(prior.overall.n, 3);
    }

    /// Two sources with five entries each: A spans 24 days (stride 6) and B
    /// spans 200 days (stride 50). The overall histogram is expected to split
    /// evenly between the buckets labelled 7 and 180.
    #[test]
    fn active_lifetime_basic() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let mut recs: Vec<Record> = Vec::new();
        for (src, stride) in [("A", 6i64), ("B", 50)] {
            for k in 0..5i64 {
                let mut r = rec(src);
                r.entry_date = base + chrono::Duration::days(k * stride);
                recs.push(r);
            }
        }

        let prior = extract_active_lifetime(&recs);
        // A: 4*6 = 24 days → bucket 7; B: 4*50 = 200 days → bucket 180.
        for bucket in [7, 180] {
            let idx = ACTIVE_LIFETIME_DAY_BUCKETS
                .iter()
                .position(|&b| b == bucket)
                .unwrap();
            assert!((prior.overall.probabilities[idx] - 0.5).abs() < 1e-9);
        }
    }

    /// GL account X is used by three sources (A, B, C) and Y by one (A), so
    /// the "GLAccount" fan-out histogram splits evenly between buckets 1 and 3.
    #[test]
    fn fanout_basic() {
        let recs: Vec<Record> = [("A", "X"), ("B", "X"), ("C", "X"), ("A", "Y")]
            .iter()
            .map(|&(src, gl)| {
                let mut r = rec(src);
                r.gl_account = gl.into();
                r
            })
            .collect();

        let prior = extract_fanout(&recs);
        let gl_hist = &prior.by_attribute["GLAccount"];
        for bucket in [1, 3] {
            let idx = FANOUT_BUCKETS.iter().position(|&b| b == bucket).unwrap();
            assert!((gl_hist.probabilities[idx] - 0.5).abs() < 1e-9);
        }
    }

    /// 120 single-line JEs per source with a fixed lag (effective date minus
    /// entry date): +5 days for A and −2 days for B. The per-source mean must
    /// equal that lag, and A's standard deviation must be zero.
    #[test]
    fn posting_lag_known() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let mut recs: Vec<Record> = Vec::new();
        for (src, lag_days) in [("A", 5i64), ("B", -2)] {
            for day in 0..120i64 {
                let mut r = rec(src);
                r.je_number = format!("{src}{day}");
                r.entry_date = base + chrono::Duration::days(day);
                r.effective_date = r.entry_date + chrono::Duration::days(lag_days);
                recs.push(r);
            }
        }

        let prior = extract_posting_lag(&recs, 100).expect("non-empty");
        let (a, b) = (&prior.by_source["A"], &prior.by_source["B"]);
        assert!((a.mean - 5.0).abs() < 1e-9);
        assert!((b.mean - (-2.0)).abs() < 1e-9);
        assert!((a.stddev).abs() < 1e-9);
    }

    /// The lag prior is measured per amount-bearing JE, not per line — the
    /// population the generator emits (it draws ONE lag per JE) and the same
    /// denominator as `source_mix_je`/`manual_share`. Counting lines would
    /// over-weight high-line system batches: on a corpus book the JE-level
    /// backdating share is ~0.29 while the line-level share the twin
    /// previously inherited is ~0.21.
    #[test]
    fn posting_lag_counts_amount_bearing_jes_once() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        // Stamps a record with a JE number, entry date `base`, and an effective
        // date `lag_days` away from it.
        let je_line = |mut r: Record, je: &str, lag_days: i64| {
            r.je_number = je.into();
            r.entry_date = base;
            r.effective_date = base + chrono::Duration::days(lag_days);
            r
        };

        // JE j1: three lines, lag +5 — must count ONCE.
        let mut recs: Vec<Record> = (0..3).map(|_| je_line(rec("A"), "j1", 5)).collect();
        // JE j2: one line, lag -10.
        recs.push(je_line(rec("A"), "j2", -10));
        // JE j3: zero-amount only — excluded entirely.
        recs.push(je_line(rec_amt("A", 0.0), "j3", -50));

        let prior = extract_posting_lag(&recs, 1).expect("non-empty");
        let summary = &prior.by_source["A"];
        assert_eq!(summary.n, 2, "two amount-bearing JEs, not four lines");
        assert!(
            (summary.mean - (-2.5)).abs() < 1e-9,
            "JE-level mean (5 + -10)/2 = -2.5, got {}",
            summary.mean
        );
    }

    /// Month volume counts each distinct amount-bearing JE once: three January
    /// JEs (one of them two lines long) and one February JE give shares of
    /// 3/4 and 1/4. A zero-amount JE is ignored, and a gate above the JE count
    /// yields `None`.
    #[test]
    fn extract_month_volume_counts_amount_bearing_jes_per_month() {
        let jan = NaiveDate::from_ymd_opt(2022, 1, 15).unwrap();
        let feb = NaiveDate::from_ymd_opt(2022, 2, 15).unwrap();

        // 3 JEs in Jan ("a2" has two lines → counts once), 1 in Feb.
        let mut recs: Vec<Record> = [
            ("a1", jan),
            ("a2", jan),
            ("a2", jan),
            ("a3", jan),
            ("b1", feb),
        ]
        .iter()
        .map(|&(je, date)| {
            let mut r = rec("A");
            r.je_number = je.into();
            r.effective_date = date;
            r
        })
        .collect();

        // A zero-amount JE in Feb — excluded.
        let mut zero_je = rec_amt("A", 0.0);
        zero_je.je_number = "z1".into();
        zero_je.effective_date = feb;
        recs.push(zero_je);

        let volume = extract_month_volume(&recs, 1).expect("non-empty");
        assert_eq!(volume.n, 4, "3 distinct amount-bearing Jan JEs + 1 Feb JE");
        assert!((volume.shares[0] - 0.75).abs() < 1e-9, "Jan share 3/4");
        assert!((volume.shares[1] - 0.25).abs() < 1e-9, "Feb share 1/4");

        // Thin-book gate: below min_jes → None.
        assert!(extract_month_volume(&recs, 100).is_none());
    }

    /// Dates at day offsets 0–2, 14–17 and 49 from 2022-01-01, split with a
    /// second argument of 7, must yield three segments separated by gaps of 12
    /// and 32 days.
    #[test]
    fn split_into_segments_known() {
        // Day offsets from 2022-01-01 (avoids hand-written calendar dates):
        //   0,1,2       → Jan 1 – Jan 3
        //   14,15,16,17 → Jan 15 – Jan 18
        //   49          → Feb 19
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let offsets = [0i64, 1, 2, 14, 15, 16, 17, 49];
        let mut dates: Vec<NaiveDate> = Vec::with_capacity(offsets.len());
        for off in offsets {
            dates.push(base + chrono::Duration::days(off));
        }

        let (segs, gaps) = split_into_segments(&dates, 7);
        assert_eq!(segs.len(), 3); // [Jan1-Jan3], [Jan15-Jan18], [Feb19]
        assert_eq!(gaps.len(), 2); // one gap between each pair of neighbouring segments
        assert_eq!(gaps[0], 12); // Jan 3 → Jan 15
        assert_eq!(gaps[1], 32); // Jan 18 → Feb 19
    }

    /// A single source active on day offsets 0–2, 14–17 and 49: its
    /// segment-count histogram must put all mass in bucket 3.
    #[test]
    fn extract_active_segments_basic() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let recs: Vec<Record> = [0i64, 1, 2, 14, 15, 16, 17, 49]
            .iter()
            .map(|&day_off| {
                let mut r = rec("A");
                r.entry_date = base + chrono::Duration::days(day_off);
                r
            })
            .collect();

        let prior = extract_active_segments(&recs);
        assert!(prior.by_source.contains_key("A"));

        // Expect 3 segments; bucket 3 should have full mass.
        let a_summary = &prior.by_source["A"];
        let bucket_3 = SEGMENT_COUNT_BUCKETS.iter().position(|&b| b == 3).unwrap();
        let mass = a_summary.segment_count_histogram.probabilities[bucket_3];
        assert!((mass - 1.0).abs() < 1e-9);
    }

    /// Four canonical SAP source codes: KR, RV and DZ overlap heavily on GL
    /// accounts while WE uses an account of its own. Some cluster must contain
    /// KR, RV and DZ together, and WE must not appear in any cluster.
    #[test]
    fn extract_entity_clusters_finds_shared_attrs() {
        // (source, GL accounts it posts to), one record per pair.
        let usage = [
            ("KR", &["1", "2", "3"][..]),
            ("RV", &["1", "2", "3"][..]),
            ("DZ", &["1", "2", "4"][..]),
            ("WE", &["99"][..]),
        ];
        let mut recs: Vec<Record> = Vec::new();
        for (src, accounts) in usage {
            for gl in accounts {
                let mut r = rec(src);
                r.gl_account = (*gl).into();
                recs.push(r);
            }
        }

        let prior = extract_entity_clusters(&recs);

        // Expect at least one cluster containing {KR, RV, DZ}.
        let any_cluster_has_kr_rv_dz = prior.clusters.iter().any(|c| {
            let members: HashSet<&str> = c.members.iter().map(String::as_str).collect();
            ["KR", "RV", "DZ"].iter().all(|code| members.contains(code))
        });
        assert!(
            any_cluster_has_kr_rv_dz,
            "expected a cluster containing KR, RV, DZ"
        );

        // WE shares nothing with the others and must stay unclustered.
        let any_cluster_has_we = prior
            .clusters
            .iter()
            .flat_map(|c| c.members.iter())
            .any(|m| m == "WE");
        assert!(!any_cluster_has_we, "WE should be an isolate (no cluster)");
    }

    /// Raw numeric source codes (such as "0") must not surface as cluster
    /// members. If any cluster forms at all, at least one member across the
    /// clusters must be a canonical SAP-style code.
    #[test]
    fn extract_entity_clusters_normalises_source_codes() {
        const RAW_NUMERIC_CODES: [&str; 6] = ["0", "1", "2", "00", "01", "02"];
        const CANONICAL_CODES: [&str; 8] = ["KR", "RV", "DZ", "SA", "WE", "RE", "IM", "KZ"];

        let base = chrono::NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        // Mix raw "0" (→ SA) and "KR" sources, all touching the same GL accounts.
        let mut recs: Vec<Record> = Vec::new();
        for source in ["0", "0", "0", "KR", "KR", "KR"] {
            recs.extend(["1", "2", "3"].iter().map(|&gl| {
                let mut r = rec(source);
                r.entry_date = base;
                r.gl_account = gl.into();
                r
            }));
        }

        let prior = extract_entity_clusters(&recs);

        // No cluster member may be a raw numeric code.
        for member in prior.clusters.iter().flat_map(|c| c.members.iter()) {
            assert!(
                !RAW_NUMERIC_CODES.contains(&member.as_str()),
                "raw numeric code {member:?} should have been normalised"
            );
        }

        // Only meaningful when a cluster formed; passes trivially otherwise.
        let any_sap = prior.clusters.iter().any(|c| {
            c.members
                .iter()
                .any(|m| CANONICAL_CODES.contains(&m.as_str()))
        });
        assert!(
            prior.clusters.is_empty() || any_sap,
            "expected at least one SAP-style canonical code in clusters"
        );
    }

    /// Table of known `normalise_source_code` mappings: SAP codes pass through,
    /// numeric fallbacks map to SAP codes, surrounding whitespace is ignored,
    /// and unknown or empty input yields `None`.
    #[test]
    fn normalise_source_code_known_mappings() {
        let cases: [(&str, Option<&str>); 9] = [
            // Direct SAP codes pass through.
            ("KR", Some("KR")),
            ("RV", Some("RV")),
            // Numeric fallbacks.
            ("0", Some("SA")),
            ("00", Some("SA")),
            ("1", Some("RV")),
            ("2", Some("KR")),
            // Trimmed.
            ("  KR  ", Some("KR")),
            // Unknown returns None.
            ("XYZ", None),
            ("", None),
        ];
        for (raw, expected) in cases {
            assert_eq!(normalise_source_code(raw), expected.map(String::from));
        }
    }

    /// Test fixture: a `Record` for JE "J1", line "001", with effective and
    /// entry date both 2022-01-01, a functional amount of 1.0, no trading
    /// partner, no creation timestamp and empty texts. Source, GL account and
    /// the optional cost / profit centre are supplied by the caller.
    fn make_record(
        src: &str,
        gl: &str,
        cost_center: Option<&str>,
        profit_center: Option<&str>,
    ) -> Record {
        let posting_date = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        Record {
            je_number: "J1".into(),
            je_line_number: "001".into(),
            source: src.into(),
            gl_account: gl.into(),
            cost_center: cost_center.map(String::from),
            profit_center: profit_center.map(String::from),
            trading_partner: None,
            functional_amount: 1.0,
            effective_date: posting_date,
            entry_date: posting_date,
            created_at: None,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    /// With a minimum of 10 observations, a source with 15 rows (KR) is kept
    /// and its single GL account carries probability 1.0, while a source with
    /// 3 rows (RV) is dropped.
    #[test]
    fn extract_per_source_attribute_filters_low_observations() {
        // 15 KR rows all posting to "200001", 3 RV rows to "400001".
        let kr_rows = (0..15).map(|_| make_record("KR", "200001", Some("CC1"), Some("PC1")));
        let rv_rows = (0..3).map(|_| make_record("RV", "400001", Some("CC2"), Some("PC2")));
        let records: Vec<Record> = kr_rows.chain(rv_rows).collect();

        let prior = extract_per_source_attribute(&records, 10);

        // KR present (15 >= 10).
        assert!(prior.by_source.contains_key("KR"), "KR should be retained");
        let kr_gl = prior
            .conditional("KR", "gl_account")
            .expect("KR/gl_account");
        assert!(
            kr_gl.probabilities.contains_key("200001"),
            "200001 should appear"
        );
        assert_eq!(kr_gl.n, 15);
        let share = kr_gl.probabilities["200001"];
        assert!((share - 1.0).abs() < 1e-9);

        // RV dropped (3 < 10).
        assert!(
            !prior.by_source.contains_key("RV"),
            "RV should be filtered out"
        );
    }

    /// Rows whose source is the empty string contribute nothing: 20 such rows
    /// leave `by_source` empty even though they exceed the minimum of 5.
    #[test]
    fn extract_per_source_attribute_skips_empty_source() {
        let records: Vec<Record> =
            std::iter::repeat_with(|| make_record("", "100001", None, None))
                .take(20)
                .collect();

        let prior = extract_per_source_attribute(&records, 5);
        assert!(
            prior.by_source.is_empty(),
            "empty source rows must be skipped"
        );
    }

    /// KR posts 8 rows to "200001" and 12 to "200002": the GL-account
    /// conditional must report n = 20 with shares 0.4 / 0.6 summing to 1.0.
    #[test]
    fn extract_per_source_attribute_multiple_values_normalise() {
        let records: Vec<Record> = [("200001", 8), ("200002", 12)]
            .iter()
            .flat_map(|&(gl, rows)| (0..rows).map(move |_| make_record("KR", gl, None, None)))
            .collect();

        let prior = extract_per_source_attribute(&records, 10);
        let kr_gl = prior
            .conditional("KR", "gl_account")
            .expect("KR/gl_account");
        assert_eq!(kr_gl.n, 20);

        let shares = &kr_gl.probabilities;
        assert!((shares["200001"] - 0.4).abs() < 1e-9);
        assert!((shares["200002"] - 0.6).abs() < 1e-9);
        let mass: f64 = shares.values().sum();
        assert!((mass - 1.0).abs() < 1e-9, "probabilities must sum to 1.0");
    }

    /// Smoke test for the default-gated `extract_behavioral_priors`: two
    /// sources with 1200 lines each must populate the metadata fields and every
    /// prior checked below.
    #[test]
    fn extract_behavioral_priors_smoke() {
        // SP3.8b set DEFAULT_MIN_SOURCE_OBSERVATIONS = 1000 to drop the
        // long tail of low-volume sources from source_mix, so each source
        // needs at least that many rows for source_mix to be populated.
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let mut recs: Vec<Record> = Vec::new();
        // (source, lines per JE, effective-date lag in days, distinct GL accounts)
        for (src, lines_per_je, lag_days, n_accounts) in
            [("A", 3i64, 1i64, 5i64), ("B", 2, -1, 7)]
        {
            for i in 0..1200i64 {
                let mut r = rec(src);
                r.je_number = format!("JE-{src}-{:04}", i / lines_per_je);
                r.entry_date = base + chrono::Duration::days(i);
                r.effective_date = r.entry_date + chrono::Duration::days(lag_days);
                r.gl_account = format!("ACC-{}", i % n_accounts);
                recs.push(r);
            }
        }

        let bp = extract_behavioral_priors(&recs, "test_industry").expect("ok");

        // Metadata.
        assert_eq!(bp.schema_version, BehavioralPriors::SCHEMA_VERSION);
        assert_eq!(bp.industry, "test_industry");
        assert_eq!(bp.n_client_inputs, 1);
        assert_eq!(bp.n_rows_aggregated, 2400);

        // Core priors.
        assert!(!bp.source_mix.probabilities.is_empty());
        for src in ["A", "B"] {
            assert!(bp.per_source_iet.by_source.contains_key(src));
        }
        assert!(bp.lines_per_je.overall.n > 0);
        assert!(bp.active_lifetime.overall.n > 0);
        assert_eq!(bp.fanout.by_attribute.len(), 4);
        assert!(bp.posting_lag.is_some());

        // SP3.7 — per_source_attribute should be populated (1200 rows per source ≥ threshold).
        assert!(
            bp.per_source_attribute.is_some(),
            "per_source_attribute should be extracted"
        );
        let psa = bp.per_source_attribute.as_ref().unwrap();
        // Lenient check: at least one of the two sources must be retained
        // (A posts to 5 distinct GL accounts, B to 7).
        assert!(psa.by_source.contains_key("A") || psa.by_source.contains_key("B"));
    }

    /// SP3.8a — `trading_partner` is extracted as a fourth attribute next to
    /// gl_account, cost_center and profit_center: KR's conditional must hold
    /// V100 / V200 at 0.75 / 0.25, and the under-observed RV must be absent.
    #[test]
    fn extract_per_source_attribute_includes_trading_partner() {
        // Fixture row with the trading partner filled in.
        let tp_rec = |src: &str, gl: &str, cc: &str, pc: &str, tp: &str| {
            let mut r = make_record(src, gl, Some(cc), Some(pc));
            r.trading_partner = Some(tp.to_string());
            r
        };

        let mut records: Vec<Record> = Vec::new();
        // 15 KR records with trading partner V100.
        records.extend((0..15).map(|_| tp_rec("KR", "200001", "CC1", "PC1", "V100")));
        // 5 more KR records with a different TP value (same source, KR total 20 ≥ 10).
        records.extend((0..5).map(|_| tp_rec("KR", "200001", "CC1", "PC1", "V200")));
        // 3 RV records — below min_observations, should be dropped.
        records.extend((0..3).map(|_| tp_rec("RV", "400001", "CC2", "PC2", "V300")));

        let prior = extract_per_source_attribute(&records, 10);

        // KR/trading_partner must be present (20 observations ≥ 10).
        let kr_tp = prior
            .conditional("KR", "trading_partner")
            .expect("KR/trading_partner conditional must be present");
        let shares = &kr_tp.probabilities;
        assert!(
            shares.contains_key("V100"),
            "V100 should appear in KR trading_partner conditional"
        );
        assert!(
            shares.contains_key("V200"),
            "V200 should appear in KR trading_partner conditional"
        );
        assert_eq!(kr_tp.n, 20, "total observations should be 20");
        assert!(
            (shares["V100"] - 0.75).abs() < 1e-9,
            "V100 share should be 0.75"
        );
        assert!(
            (shares["V200"] - 0.25).abs() < 1e-9,
            "V200 share should be 0.25"
        );
        let mass: f64 = shares.values().sum();
        assert!((mass - 1.0).abs() < 1e-9, "probabilities must sum to 1.0");

        // RV was dropped (3 < 10).
        assert!(
            !prior.by_source.contains_key("RV"),
            "RV should be filtered out"
        );
    }

    /// SP3.12 W2 — trading-partner entity clustering.
    /// T1 and T2 post to exactly the same GL accounts while T3 uses an account
    /// of its own: a cluster containing both T1 and T2 is expected, T3 must
    /// stay out of every cluster, and the clustering rate must be positive.
    #[test]
    fn extract_tp_entity_clusters_finds_shared_attrs() {
        let day = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        // One KR line for the given trading partner and GL account.
        let tp_line = |partner: &str, account: &str| Record {
            trading_partner: Some(partner.into()),
            gl_account: account.into(),
            source: "KR".into(),
            cost_center: None,
            profit_center: None,
            je_number: "J1".into(),
            je_line_number: "001".into(),
            functional_amount: 1.0,
            effective_date: day,
            entry_date: day,
            created_at: None,
            header_text: String::new(),
            line_text: String::new(),
        };

        let mut recs = Vec::new();
        // T1 and T2 share GLs 10, 20, 30 (full overlap).
        for gl in ["10", "20", "30"] {
            recs.extend([tp_line("T1", gl), tp_line("T2", gl)]);
        }
        // T3 uses a completely different GL — should be isolated.
        recs.push(tp_line("T3", "99"));

        let prior = extract_tp_entity_clusters(&recs);

        let cluster_has_t1_t2 = prior.clusters.iter().any(|c| {
            let members: HashSet<&str> = c.members.iter().map(String::as_str).collect();
            members.contains("T1") && members.contains("T2")
        });
        assert!(cluster_has_t1_t2, "expected a cluster containing T1 and T2");

        let cluster_has_t3 = prior
            .clusters
            .iter()
            .flat_map(|c| c.members.iter())
            .any(|m| m == "T3");
        assert!(!cluster_has_t3, "T3 should be an isolate (no cluster)");
        assert!(prior.clustering_rate > 0.0, "clustering_rate must be > 0");
    }

    // ---- SP4.3 amount-conditional extractor tests --------------------------

    /// Test fixture: a `Record` for JE "J1", line "001", dated 2022-01-01 with
    /// the given source, GL account and functional amount. Cost centre, profit
    /// centre, trading partner and creation timestamp are unset; texts are empty.
    fn make_amount_rec(src: &str, gl: &str, amount: f64) -> Record {
        let posting_date = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        Record {
            functional_amount: amount,
            source: src.into(),
            gl_account: gl.into(),
            je_number: "J1".into(),
            je_line_number: "001".into(),
            effective_date: posting_date,
            entry_date: posting_date,
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            created_at: None,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    /// With `min_observations` = 10, the KR marginal and the KR/0041 pair
    /// (15 lines) are kept while the RV marginal (3 lines) is dropped.
    #[test]
    fn extract_source_amount_conditionals_filters_low_count_pairs() {
        // KR/0041: 15 records — should be retained (≥10).
        let kr_rows = (0..15).map(|i| make_amount_rec("KR", "0041", 100.0 + i as f64));
        // RV/0013: 3 records — should be dropped (<10).
        let rv_rows = (0..3).map(|i| make_amount_rec("RV", "0013", 500.0 + i as f64));
        let records: Vec<Record> = kr_rows.chain(rv_rows).collect();

        let prior = extract_source_amount_conditionals(&records, 10);

        // KR marginal retained (15 ≥ 10).
        assert!(
            prior.by_source.contains_key("KR"),
            "KR marginal should be retained"
        );
        // KR/0041 retained (15 ≥ 10).
        let kr_pair_kept = matches!(
            prior.by_source_and_class.get("KR"),
            Some(classes) if classes.contains_key("0041")
        );
        assert!(kr_pair_kept, "KR/0041 pair should be retained");
        // RV dropped (3 < 10).
        assert!(
            !prior.by_source.contains_key("RV"),
            "RV marginal should be dropped (only 3 observations)"
        );
    }

    /// Amounts spread linearly within roughly ±25% of e^4.5 must yield a
    /// fitted `mu` within 0.1 of 4.5, `n` equal to the number of lines, and a
    /// positive `median_abs`. Sigma is not checked here.
    #[test]
    fn extract_source_amount_conditionals_lognormal_params_sensible() {
        // 50 fixed amounts centred on e^4.5 (≈ 90), not random draws.
        let centre = 4.5_f64.exp();
        let records: Vec<Record> = (0..50)
            .map(|i| {
                // Relative offset from −0.25 to +0.24 in 0.01 steps, so the
                // sample has non-zero variance.
                let rel = 0.01 * ((i as f64) - 25.0);
                make_amount_rec("KR", "0041", centre * (1.0 + rel))
            })
            .collect();

        let prior = extract_source_amount_conditionals(&records, 10);
        let fit = prior
            .by_source_and_class
            .get("KR")
            .and_then(|classes| classes.get("0041"))
            .expect("KR/0041 params should be present");

        // mu should be close to ln(centre) = 4.5.
        assert!(
            (fit.mu - 4.5).abs() < 0.1,
            "mu {:.3} should be close to 4.5",
            fit.mu
        );
        assert_eq!(fit.n, 50);
        assert!(fit.median_abs > 0.0, "median_abs must be positive");
    }

    /// Zero-amount lines are left out of the pair's observation count, while
    /// negative-amount lines are still counted (presumably by absolute value,
    /// which this test does not check).
    #[test]
    fn extract_source_amount_conditionals_skips_zeros() {
        // 20 positive lines, 5 zero lines (to be ignored), 5 negative lines.
        let records: Vec<Record> = [(20, 100.0), (5, 0.0), (5, -50.0)]
            .iter()
            .flat_map(|&(rows, amount)| {
                (0..rows).map(move |_| make_amount_rec("KR", "0041", amount))
            })
            .collect();

        let prior = extract_source_amount_conditionals(&records, 10);
        let fit = prior
            .by_source_and_class
            .get("KR")
            .and_then(|classes| classes.get("0041"))
            .expect("KR/0041 should be present");

        // 20 positive + 5 negative = 25 non-zero records; the 5 zeros are not counted.
        assert_eq!(fit.n, 25, "zero-amount records must be excluded from n");
    }

    /// SP4.9 — a source with 2000 amount-bearing lines gets an empirical
    /// quantile sketch whose top knot honours the `1 - 5/n` privacy cap, while
    /// a 500-line source (below `DEFAULT_MIN_SKETCH_OBSERVATIONS`) gets no
    /// sketch but keeps its log-normal marginal.
    #[test]
    fn extract_source_amount_conditionals_builds_privacy_gated_sketch() {
        // KR carries amounts 1..=2000; RV carries amounts 1..=500.
        let kr_lines = (1..=2000).map(|amount| make_amount_rec("KR", "0041", amount as f64));
        let rv_lines = (1..=500).map(|amount| make_amount_rec("RV", "0040", amount as f64));
        let records: Vec<Record> = kr_lines.chain(rv_lines).collect();

        let prior = extract_source_amount_conditionals(&records, 10);

        let kr = prior
            .quantile_sketch_by_source
            .get("KR")
            .expect("KR sketch present at n=2000");
        assert_eq!(kr.n, 2000);
        assert!(kr.is_usable());

        // Cap is 1 - 5/2000 = 0.9975, so 0.995 survives and 0.999 does not.
        let top = kr.probabilities.last().copied().expect("knots");
        let top_gap = (top - 0.995).abs();
        assert!(
            top_gap < 1e-12,
            "top knot must be 0.995 under the 1-5/n cap, got {top}"
        );

        // The median of the uniform amounts 1..=2000 is 1000.5.
        let p50_idx = kr
            .probabilities
            .iter()
            .position(|&p| (p - 0.50).abs() < 1e-12)
            .expect("p50 knot");
        let p50_value = kr.values[p50_idx];
        assert!(
            (p50_value - 1000.5).abs() < 2.0,
            "p50 knot should be ~1000.5, got {}",
            p50_value
        );

        // Knot values never decrease from one knot to the next.
        assert!(kr.values.windows(2).all(|w| w[1] >= w[0]));

        // RV: no sketch, but the marginal is still there.
        assert!(!prior.quantile_sketch_by_source.contains_key("RV"));
        assert!(prior.by_source.contains_key("RV"));
    }

    /// Fixture for the JE-total sketch tests: starts from
    /// `make_amount_rec(src, "0041", amount)`, then overrides the JE number
    /// with `je` and sets both the effective and entry date to 15 March of
    /// `year`.
    fn make_je_rec(src: &str, je: &str, year: i32, amount: f64) -> Record {
        let mut rec = make_amount_rec(src, "0041", amount);
        let posting_date = NaiveDate::from_ymd_opt(year, 3, 15).expect("date");
        rec.effective_date = posting_date;
        rec.entry_date = posting_date;
        rec.je_number = je.to_owned();
        rec
    }

    /// JE-total sketch — a source with at least
    /// `DEFAULT_MIN_SKETCH_OBSERVATIONS` JEs gets a per-source sketch over JE
    /// totals (sum of the debit legs of each JE); a smaller source does not.
    /// Unlike the line sketch, a two-line JE (+X / −X) contributes a single
    /// observation of X here rather than two lines of |X|.
    #[test]
    fn extract_source_amount_conditionals_builds_je_total_sketch() {
        // SA: 1200 balanced two-line JEs with totals 1..=1200.
        let sa_lines = (0..1200).flat_map(|i| {
            let je = format!("JE-{i}");
            let x = (i + 1) as f64;
            [
                make_je_rec("SA", &je, 2024, x),
                make_je_rec("SA", &je, 2024, -x),
            ]
        });
        // DR: only 300 JEs, which is below the 1000-JE gate.
        let dr_lines = (0..300).flat_map(|i| {
            let je = format!("DR-{i}");
            [
                make_je_rec("DR", &je, 2024, 50_000.0),
                make_je_rec("DR", &je, 2024, -50_000.0),
            ]
        });
        let records: Vec<Record> = sa_lines.chain(dr_lines).collect();

        let prior = extract_source_amount_conditionals(&records, 10);

        let sa = prior
            .je_total_sketch_by_source
            .get("SA")
            .expect("SA JE-total sketch present at 1200 JEs");
        assert_eq!(sa.n, 1200, "one observation per JE, not per line");
        assert!(sa.is_usable());

        // The median of the uniform totals 1..=1200 is 600.5.
        let p50_idx = sa
            .probabilities
            .iter()
            .position(|&p| (p - 0.50).abs() < 1e-12)
            .expect("p50 knot");
        let p50_value = sa.values[p50_idx];
        assert!(
            (p50_value - 600.5).abs() < 2.0,
            "JE-total p50 knot should be ~600.5, got {}",
            p50_value
        );

        // Knot values never decrease.
        assert!(sa.values.windows(2).all(|w| w[1] >= w[0]));

        // Privacy cap: 1 - 5/1200 ≈ 0.9958, so the highest knot is 0.995.
        let top = sa.probabilities.last().copied().expect("knots");
        let top_gap = (top - 0.995).abs();
        assert!(
            top_gap < 1e-12,
            "top knot must respect the 1-5/n cap, got {top}"
        );

        // DR never reaches the JE-count gate, so it has no JE-total sketch.
        assert!(!prior.je_total_sketch_by_source.contains_key("DR"));
    }

    /// Global JE-total sketch — JE totals from every source are pooled into
    /// one sketch that serves as the thin-source fallback. Two sources of 600
    /// JEs each get no per-source sketch, yet the pooled 1200 JEs produce a
    /// global sketch whose median lies between the two sources' totals.
    #[test]
    fn extract_source_amount_conditionals_builds_global_je_total_sketch() {
        // (source, JE total): 600 balanced two-line JEs per source, so each
        // source alone stays below the 1000-JE gate.
        let mut records: Vec<Record> = Vec::with_capacity(2 * 2 * 600);
        for (source, total) in [("AA", 100.0), ("BB", 1000.0)] {
            for i in 0..600 {
                let je = format!("{source}-{i}");
                records.push(make_je_rec(source, &je, 2024, total));
                records.push(make_je_rec(source, &je, 2024, -total));
            }
        }

        let prior = extract_source_amount_conditionals(&records, 10);

        // Per-source: neither thin source has a sketch.
        assert!(!prior.je_total_sketch_by_source.contains_key("AA"));
        assert!(!prior.je_total_sketch_by_source.contains_key("BB"));

        // Global: all 1200 JE totals are pooled.
        let g = prior
            .je_total_sketch_global
            .as_ref()
            .expect("global JE-total sketch present once pooled JEs clear the gate");
        assert_eq!(g.n, 1200, "one observation per JE across both sources");
        assert!(g.is_usable());

        // With 600 totals of 100 and 600 totals of 1000, a median strictly
        // between the two levels shows that both sources were mixed in.
        let p50_idx = g
            .probabilities
            .iter()
            .position(|&p| (p - 0.50).abs() < 1e-12)
            .expect("p50 knot");
        let p50 = g.values[p50_idx];
        assert!(
            100.0 < p50 && p50 < 1000.0,
            "pooled p50 must sit between the two sources' totals, got {p50}"
        );
    }

    /// Global JE-total sketch — with only 500 JEs in total the pooled count is
    /// itself below the gate, so no global sketch is produced.
    #[test]
    fn global_je_total_sketch_absent_below_gate() {
        // 500 balanced two-line AA JEs, each totalling 100.
        let records: Vec<Record> = (0..500)
            .flat_map(|i| {
                let je = format!("AA-{i}");
                [
                    make_je_rec("AA", &je, 2024, 100.0),
                    make_je_rec("AA", &je, 2024, -100.0),
                ]
            })
            .collect();

        let prior = extract_source_amount_conditionals(&records, 10);
        assert!(prior.je_total_sketch_global.is_none());
    }

    /// JE-total grouping is scoped by effective-date year: a `je_number` that
    /// appears in two different years counts as two separate JEs and is never
    /// summed into one total.
    #[test]
    fn je_total_sketch_year_scopes_je_numbers() {
        // JE numbers JE-0..JE-999 are reused in 2024 and 2025, each JE
        // totalling 100. Year-scoped grouping gives 2000 JEs of 100; merging
        // across years would give 1000 JEs of 200.
        let mut records: Vec<Record> = Vec::with_capacity(2 * 2 * 1000);
        for year in [2024, 2025] {
            records.extend((0..1000).flat_map(|i| {
                let je = format!("JE-{i}");
                [
                    make_je_rec("SA", &je, year, 100.0),
                    make_je_rec("SA", &je, year, -100.0),
                ]
            }));
        }

        let prior = extract_source_amount_conditionals(&records, 10);
        let sa = prior
            .je_total_sketch_by_source
            .get("SA")
            .expect("SA JE-total sketch present");
        assert_eq!(sa.n, 2000, "year-scoped: 1000 JEs per year, not merged");

        // Only the first few knots are shown if the check fails.
        let preview_len = sa.values.len().min(3);
        let all_totals_are_100 = sa.values.iter().all(|&v| (v - 100.0).abs() < 1e-9);
        assert!(
            all_totals_are_100,
            "every JE total is 100 (a cross-year merge would double it), got {:?}",
            &sa.values[..preview_len]
        );
    }

    /// A JE's total is the sum of its positive (debit) legs only — not the sum
    /// of absolute amounts, which would double-count — and a JE without any
    /// positive leg contributes no observation.
    #[test]
    fn je_total_sums_debit_legs_only() {
        // 1000 three-line JEs (+60, +40, −100): debit-leg sum 100, abs-sum 200.
        let balanced = (0..1000).flat_map(|i| {
            let je = format!("JE-{i}");
            [60.0, 40.0, -100.0].map(|amount| make_je_rec("SA", &je, 2024, amount))
        });
        // 50 single-line credit-only JEs that must be skipped entirely.
        let credit_only = (0..50).map(|i| make_je_rec("SA", &format!("CR-{i}"), 2024, -77.0));
        let records: Vec<Record> = balanced.chain(credit_only).collect();

        let prior = extract_source_amount_conditionals(&records, 10);
        let sa = prior
            .je_total_sketch_by_source
            .get("SA")
            .expect("SA JE-total sketch present");
        assert_eq!(sa.n, 1000, "credit-only JEs must not contribute");

        // Only the first few knots are shown if the check fails.
        let preview_len = sa.values.len().min(3);
        let all_totals_are_100 = sa.values.iter().all(|&v| (v - 100.0).abs() < 1e-9);
        assert!(
            all_totals_are_100,
            "JE total must be the debit-leg sum (100), not the abs-sum (200); got {:?}",
            &sa.values[..preview_len]
        );
    }

    /// `extract_behavioral_priors` fills in `source_amount_conditionals`, with
    /// a `KR` marginal, when given 50 positive-amount `KR` lines.
    #[test]
    fn extract_behavioral_priors_populates_source_amount_conditionals() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        // One single-line JE per day for 50 consecutive days, amounts 100..=149.
        let recs: Vec<Record> = (0..50_i64)
            .map(|i| {
                let mut rec = make_amount_rec("KR", "0041", 100.0 + i as f64);
                let posted = base + chrono::Duration::days(i);
                rec.entry_date = posted;
                rec.effective_date = posted;
                rec.je_number = format!("JE-{i}");
                rec
            })
            .collect();

        let bp = extract_behavioral_priors(&recs, "test").expect("ok");
        let sac = bp
            .source_amount_conditionals
            .as_ref()
            .expect("source_amount_conditionals should be populated");
        assert!(
            sac.by_source.contains_key("KR"),
            "KR marginal should be present"
        );
    }

    // ---- SP4.6 extractor tests ---------------------------------------------

    /// Builds `n` single-line records for `source` on account `gl`, all dated
    /// 2022-01-01, each with amount `role_sign * 100.0` and a distinct JE
    /// number `J000000`, `J000001`, ….
    fn make_role_records(source: &str, role_sign: f64, gl: &str, n: usize) -> Vec<Record> {
        let posting_date = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        let mut out = Vec::with_capacity(n);
        for seq in 0..n {
            out.push(Record {
                source: source.to_string(),
                gl_account: gl.to_string(),
                cost_center: None,
                profit_center: None,
                trading_partner: None,
                je_number: format!("J{seq:06}"),
                je_line_number: "001".to_string(),
                effective_date: posting_date,
                entry_date: posting_date,
                created_at: None,
                functional_amount: role_sign * 100.0,
                header_text: String::new(),
                line_text: String::new(),
            });
        }
        out
    }

    /// Positive-amount KR lines on account 6000 and negative-amount KR lines
    /// on account 2000 must produce role conditionals for both (KR, DR) and
    /// (KR, CR), each containing its own account.
    #[test]
    fn sp4_6_extract_source_role_gl_produces_role_conditionals() {
        let debit_lines = make_role_records("KR", 1.0, "6000", 15);
        let credit_lines = make_role_records("KR", -1.0, "2000", 15);
        let records = debit_lines
            .into_iter()
            .chain(credit_lines)
            .collect::<Vec<_>>();

        let prior = extract_source_role_gl(&records, 10);

        // Both roles must exist before their contents are inspected.
        let debit = prior.conditional("KR", "DR");
        let credit = prior.conditional("KR", "CR");
        assert!(debit.is_some(), "KR DR should be present");
        assert!(credit.is_some(), "KR CR should be present");

        let dr_dist = debit.unwrap();
        assert!(
            dr_dist.probabilities.contains_key("6000"),
            "DR should have 6000"
        );
        let cr_dist = credit.unwrap();
        assert!(
            cr_dist.probabilities.contains_key("2000"),
            "CR should have 2000"
        );
    }

    /// A (source, role) group with fewer observations than the
    /// `min_observations` argument is dropped from the prior.
    #[test]
    fn sp4_6_extract_source_role_gl_filters_low_counts() {
        // 15 KR debit lines on 6000 clear the threshold of 10; 3 KR credit
        // lines on 2000 do not.
        let records = make_role_records("KR", 1.0, "6000", 15)
            .into_iter()
            .chain(make_role_records("KR", -1.0, "2000", 3))
            .collect::<Vec<_>>();

        let prior = extract_source_role_gl(&records, 10);

        let debit = prior.conditional("KR", "DR");
        assert!(debit.is_some(), "KR DR should pass threshold");

        // The only CR account has 3 observations, so the account and the whole
        // (KR, CR) group are both below the threshold.
        let credit = prior.conditional("KR", "CR");
        assert!(credit.is_none(), "KR CR with only 3 obs should be dropped");
    }

    /// Input consisting solely of zero-amount lines yields an empty
    /// source/role prior.
    #[test]
    fn sp4_6_extract_source_role_gl_skips_zero_amounts() {
        let posting_date = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        let mut records: Vec<Record> = Vec::with_capacity(20);
        for seq in 0..20 {
            records.push(Record {
                source: "SA".to_string(),
                gl_account: "4000".to_string(),
                cost_center: None,
                profit_center: None,
                trading_partner: None,
                je_number: format!("J{seq:06}"),
                je_line_number: "001".to_string(),
                effective_date: posting_date,
                entry_date: posting_date,
                created_at: None,
                // Every line is zero, so none of them should be counted.
                functional_amount: 0.0,
                header_text: String::new(),
                line_text: String::new(),
            });
        }

        let prior = extract_source_role_gl(&records, 10);
        assert!(
            prior.by_source_and_role.is_empty(),
            "zero-amount records should yield an empty prior"
        );
    }

    /// `extract_behavioral_priors` fills in both `source_role_gl_conditionals`
    /// and `source_flow_pairs` when given 1200 lines that alternate between
    /// positive and negative amounts across three sources.
    #[test]
    fn sp4_6_extract_behavioral_priors_populates_source_role_gl_conditionals() {
        const SOURCES: [&str; 3] = ["KR", "RV", "SA"];
        const DEBIT_GLS: [&str; 3] = ["6000", "6100", "6200"];
        const CREDIT_GLS: [&str; 3] = ["2000", "2100", "1000"];

        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(42);
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();

        let recs: Vec<Record> = (0..1200usize)
            .map(|i| {
                // Even lines are +100 on a debit account, odd lines −100 on a
                // credit account.
                let (functional_amount, gl) = match i % 2 {
                    0 => (100.0, DEBIT_GLS[i % DEBIT_GLS.len()]),
                    _ => (-100.0, CREDIT_GLS[i % CREDIT_GLS.len()]),
                };
                // The seeded generator is consumed in a fixed order: the
                // effective-date offset first, then the entry-date offset.
                let effective_date = base + Duration::days(rng.random_range(0..365));
                let entry_date = base + Duration::days(rng.random_range(0..365));
                Record {
                    source: SOURCES[i % SOURCES.len()].to_string(),
                    gl_account: gl.to_string(),
                    cost_center: None,
                    profit_center: None,
                    trading_partner: None,
                    // Three consecutive lines share one JE number.
                    je_number: format!("J{:06}", i / 3),
                    je_line_number: format!("{:03}", (i % 3) + 1),
                    effective_date,
                    entry_date,
                    created_at: None,
                    functional_amount,
                    header_text: String::new(),
                    line_text: String::new(),
                }
            })
            .collect();

        let bp = extract_behavioral_priors(&recs, "test").expect("ok");
        assert!(
            bp.source_role_gl_conditionals.is_some(),
            "source_role_gl_conditionals should be populated with sufficient data"
        );
        assert!(
            bp.source_flow_pairs.is_some(),
            "source_flow_pairs should be populated with sufficient data"
        );
    }

    // ---- SP4.8 extractor tests ---------------------------------------------

    /// Builds `n` two-line JEs for `source`, all dated 2022-01-01: line `001`
    /// posts +100 to `gl_dr` and line `002` posts −100 to `gl_cr`. JE numbers
    /// take the form `{source}-J000000`, `{source}-J000001`, ….
    fn make_flow_records(source: &str, gl_dr: &str, gl_cr: &str, n: usize) -> Vec<Record> {
        let posting_date = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        // One leg of JE number `seq`.
        let leg = |seq: usize, sign: f64, gl: &str, line: &str| Record {
            source: source.to_string(),
            gl_account: gl.to_string(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: format!("{source}-J{seq:06}"),
            je_line_number: line.to_string(),
            effective_date: posting_date,
            entry_date: posting_date,
            created_at: None,
            functional_amount: sign * 100.0,
            header_text: String::new(),
            line_text: String::new(),
        };

        let mut recs = Vec::with_capacity(n * 2);
        for seq in 0..n {
            recs.push(leg(seq, 1.0, gl_dr, "001"));
            recs.push(leg(seq, -1.0, gl_cr, "002"));
        }
        recs
    }

    /// Twenty KR JEs that each post DR 5010 / CR 2100 must collapse into a
    /// single `"5|2"` pair holding all of the probability mass.
    #[test]
    fn sp4_8_extract_source_flow_pairs_produces_joint_pairs() {
        let je_lines = make_flow_records("KR", "5010", "2100", 20);
        let prior = extract_source_flow_pairs(&je_lines, 1, 10);

        let dist = prior.pairs("KR").expect("KR pairs present");
        assert_eq!(dist.probabilities.len(), 1);
        let pair_mass = dist.probabilities["5|2"];
        assert!((pair_mass - 1.0).abs() < 1e-9);
        assert_eq!(dist.n, 20);
        assert_eq!(prior.granularity, 1);
    }

    /// Privacy gate: a pair seen fewer than `min_observations` times is
    /// dropped, and a source left with too few retained observations
    /// disappears from the prior altogether.
    #[test]
    fn sp4_8_extract_source_flow_pairs_filters_low_counts() {
        // (source, DR account, CR account, JE count)
        let fixtures = [
            ("KR", "5010", "2100", 20), // clears the gate
            ("KR", "1200", "4000", 3),  // pair below the gate
            ("DZ", "1000", "1100", 4),  // whole source below the gate
        ];
        let records = fixtures
            .iter()
            .flat_map(|&(source, gl_dr, gl_cr, n)| make_flow_records(source, gl_dr, gl_cr, n))
            .collect::<Vec<_>>();

        let prior = extract_source_flow_pairs(&records, 1, 10);

        let dist = prior.pairs("KR").expect("KR pairs present");
        assert_eq!(
            dist.probabilities.len(),
            1,
            "low-count 1|4 pair must be gated out"
        );
        assert!(dist.probabilities.contains_key("5|2"));

        let sparse_source = prior.pairs("DZ");
        assert!(sparse_source.is_none(), "sparse source must be dropped");
    }

    /// When every JE's only credit account is the non-numeric `SUSPENSE`, no
    /// debit/credit pair can be formed and the source gets no pair
    /// distribution.
    #[test]
    fn sp4_8_extract_source_flow_pairs_skips_non_numeric_accounts() {
        let je_lines = make_flow_records("KR", "5010", "SUSPENSE", 20);
        let prior = extract_source_flow_pairs(&je_lines, 1, 10);

        let kr_pairs = prior.pairs("KR");
        assert!(
            kr_pairs.is_none(),
            "non-numeric credit side yields no cross pairs"
        );
    }

    /// A JE with two debit classes and one credit class contributes one pair
    /// per debit/credit combination — the full cross product, mirroring the
    /// relational-fidelity pair measure.
    #[test]
    fn sp4_8_extract_source_flow_pairs_cross_product() {
        let base = NaiveDate::from_ymd_opt(2022, 1, 1).expect("date");
        // (sign, account, line number): DR 5xxx + DR 6xxx against CR 2xxx, so
        // every JE yields the pairs 5|2 and 6|2.
        let legs = [
            (1.0, "5010", "001"),
            (1.0, "6010", "002"),
            (-1.0, "2100", "003"),
        ];
        let records: Vec<Record> = (0..15)
            .flat_map(|i| {
                legs.iter().map(move |&(sign, gl, line)| Record {
                    source: "SA".to_string(),
                    gl_account: gl.to_string(),
                    cost_center: None,
                    profit_center: None,
                    trading_partner: None,
                    je_number: format!("SA-J{i:06}"),
                    je_line_number: line.to_string(),
                    effective_date: base,
                    entry_date: base,
                    created_at: None,
                    functional_amount: sign * 100.0,
                    header_text: String::new(),
                    line_text: String::new(),
                })
            })
            .collect();

        let prior = extract_source_flow_pairs(&records, 1, 10);
        let dist = prior.pairs("SA").expect("SA pairs present");
        assert_eq!(dist.probabilities.len(), 2);
        // The two pairs occur equally often, so each holds half the mass.
        let mass_5_2 = dist.probabilities["5|2"];
        assert!((mass_5_2 - 0.5).abs() < 1e-9);
        let mass_6_2 = dist.probabilities["6|2"];
        assert!((mass_6_2 - 0.5).abs() < 1e-9);
    }
}