Skip to main content

datasynth_generators/concentration/
trading_partner_pool.rs

1//! `TradingPartnerPoolPass` — closes the SOTA-11.1 / #142 coverage gap by
2//! rewriting JE-line `trading_partner` strings to a target pool size.
3//!
4//! Why post-process: the SOTA-11 round's `vendor.count` knob only affected
5//! `je_generator`'s direct path; document-flow / allocation / period-close
6//! generators use their own hard-coded V-000001..V-000040 pool. This pass
7//! sees every JE regardless of which generator emitted it.
8//!
9//! Safety: `trading_partner` is an informational field — no downstream
10//! invariant (balance, document-chain refs, subledger reconciliation) reads
11//! it. Rewriting in post-process is correct by construction.
12//!
13//! Determinism: FNV-1a 64-bit hash of the original TP string indexes a pool
14//! of size `target_size`. Same input → same output across runs.
15
16use std::collections::BTreeMap;
17
18use datasynth_config::schema::TradingPartnerPoolPassConfig;
19use datasynth_core::models::JournalEntry;
20use rand_chacha::ChaCha8Rng;
21
22use super::{ConcentrationPass, ConcentrationStats};
23
24const PASS_NAME: &str = "trading_partner_pool";
25
/// Post-processing pass that rewrites journal-entry line `trading_partner`
/// values onto a bounded pool of canonical `TP-NNNNNN` identifiers.
///
/// Construct it with [`TradingPartnerPoolPass::new`], which guarantees the
/// pool size is at least 1.
#[derive(Debug, Clone)]
pub struct TradingPartnerPoolPass {
    /// Number of distinct canonical identifiers the pass may emit; used as
    /// the modulus when mapping a hash to a pool slot, so it must be >= 1.
    target_size: u64,
}
29
30impl TradingPartnerPoolPass {
31    pub fn new(cfg: TradingPartnerPoolPassConfig) -> Self {
32        Self {
33            // Clamp to >= 1 so the modulus is well-defined.
34            target_size: cfg.target_size.max(1) as u64,
35        }
36    }
37
38    /// FNV-1a 64-bit hash → pool index. Same TP string always maps to the
39    /// same canonical pool TP across runs.
40    #[inline]
41    fn pool_index(&self, tp: &str) -> u64 {
42        const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
43        const FNV_PRIME: u64 = 0x0000_0100_0000_01B3;
44        let mut h = FNV_OFFSET;
45        for b in tp.as_bytes() {
46            h ^= *b as u64;
47            h = h.wrapping_mul(FNV_PRIME);
48        }
49        h % self.target_size
50    }
51
52    #[inline]
53    fn canonical_tp(&self, original: &str) -> String {
54        format!("TP-{:06}", self.pool_index(original))
55    }
56}
57
58impl ConcentrationPass for TradingPartnerPoolPass {
59    fn name(&self) -> &'static str {
60        PASS_NAME
61    }
62
63    fn apply(&self, entries: &mut [JournalEntry], _rng: &mut ChaCha8Rng) -> ConcentrationStats {
64        let mut lines_modified: u64 = 0;
65        let mut entries_modified: usize = 0;
66        for je in entries.iter_mut() {
67            let mut je_touched = false;
68            for line in je.lines.iter_mut() {
69                if let Some(tp) = line.trading_partner.as_ref() {
70                    let new_tp = self.canonical_tp(tp);
71                    if &new_tp != tp {
72                        line.trading_partner = Some(new_tp);
73                        lines_modified += 1;
74                        je_touched = true;
75                    }
76                }
77            }
78            if je_touched {
79                entries_modified += 1;
80            }
81        }
82        let mut extra = BTreeMap::new();
83        extra.insert("lines_modified", lines_modified);
84        extra.insert("target_pool_size", self.target_size);
85        ConcentrationStats {
86            pass: PASS_NAME,
87            entries_examined: entries.len(),
88            entries_modified,
89            extra,
90        }
91    }
92}
93
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{JournalEntry, JournalEntryLine};
    use rand::SeedableRng;
    use std::collections::HashSet;

    /// Builds a journal entry with exactly one line whose `trading_partner`
    /// is `tp` (or `None`).
    fn make_je(idx: usize, tp: Option<&str>) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let mut entry = JournalEntry::new_simple(
            format!("JE{idx}"),
            "C1".to_string(),
            date,
            format!("test {idx}"),
        );
        entry.lines.push(JournalEntryLine {
            gl_account: "6000".to_string(),
            trading_partner: tp.map(String::from),
            ..JournalEntryLine::default()
        });
        entry
    }

    /// 200 distinct inputs with a target of 25 must collapse to at most 25
    /// distinct outputs, while still using at least about half of the pool.
    #[test]
    fn converges_to_target_pool_size() {
        let mut entries: Vec<JournalEntry> = Vec::with_capacity(200);
        for i in 0..200 {
            let original = format!("V-{:06}", i);
            entries.push(make_je(i, Some(&original)));
        }

        let pass = TradingPartnerPoolPass::new(TradingPartnerPoolPassConfig { target_size: 25 });
        let mut rng = ChaCha8Rng::seed_from_u64(7);
        let stats = pass.apply(&mut entries, &mut rng);

        let mut distinct: HashSet<&String> = HashSet::new();
        for je in &entries {
            if let Some(tp) = je.lines[0].trading_partner.as_ref() {
                distinct.insert(tp);
            }
        }
        let pool_used = distinct.len();

        assert!(pool_used <= 25, "pool exceeded target: {}", pool_used);
        // Lower bound on hash spread: 200 inputs into 25 bins should fill
        // at least half of the pool.
        assert!(
            pool_used >= 12,
            "pool under-filled below half: {}",
            pool_used
        );
        assert_eq!(stats.entries_examined, 200);
        assert_eq!(stats.extra["target_pool_size"], 25);
    }

    /// Two passes built from the same config must produce identical output
    /// on identical input.
    #[test]
    fn deterministic_under_same_seed() {
        fn make_batch() -> Vec<JournalEntry> {
            let mut batch = Vec::with_capacity(50);
            for i in 0..50 {
                batch.push(make_je(i, Some(&format!("V-orig-{i:04}"))));
            }
            batch
        }

        let cfg = TradingPartnerPoolPassConfig { target_size: 8 };
        let pass_a = TradingPartnerPoolPass::new(cfg.clone());
        let pass_b = TradingPartnerPoolPass::new(cfg);

        let mut rng_a = ChaCha8Rng::seed_from_u64(123);
        let mut rng_b = ChaCha8Rng::seed_from_u64(123);
        let mut batch_a = make_batch();
        let mut batch_b = make_batch();
        pass_a.apply(&mut batch_a, &mut rng_a);
        pass_b.apply(&mut batch_b, &mut rng_b);

        for (left, right) in batch_a.iter().zip(batch_b.iter()) {
            assert_eq!(left.lines[0].trading_partner, right.lines[0].trading_partner);
        }
    }

    /// Lines with no trading partner are neither rewritten nor counted.
    #[test]
    fn preserves_lines_without_trading_partner() {
        let mut entries: Vec<JournalEntry> = Vec::with_capacity(10);
        for i in 0..10 {
            entries.push(make_je(i, None));
        }

        let pass = TradingPartnerPoolPass::new(TradingPartnerPoolPassConfig { target_size: 5 });
        let mut rng = ChaCha8Rng::seed_from_u64(0);
        let stats = pass.apply(&mut entries, &mut rng);

        assert_eq!(stats.entries_modified, 0);
        assert_eq!(stats.extra["lines_modified"], 0);
        for je in entries.iter() {
            assert!(je.lines[0].trading_partner.is_none());
        }
    }

    /// A configured size of 0 behaves like a pool of exactly one identifier.
    #[test]
    fn zero_target_size_is_clamped_to_one() {
        let pass = TradingPartnerPoolPass::new(TradingPartnerPoolPassConfig { target_size: 0 });
        let mut rng = ChaCha8Rng::seed_from_u64(0);
        let mut entries = vec![make_je(0, Some("V-000001"))];
        let _ = pass.apply(&mut entries, &mut rng);

        // With a pool of size 1, every value maps to slot 0.
        let rewritten = entries[0].lines[0].trading_partner.as_deref();
        assert_eq!(rewritten, Some("TP-000000"));
    }
}