Skip to main content

datasynth_generators/concentration/
source_blanking.rs

//! `SourceBlankingPass` — Phase 1.5 of the central concentration abstraction
//! (#143). Closes SOTA-7 (#132) by blanking `sap_source_code` (setting it to
//! `Some("")`) on a configurable fraction of JEs to match the corpus's ~21%
//! blank-source rate.
//!
//! Synthetic currently emits an SAP source code on ~100% of JEs (default-on
//! after SOTA-3 / T2(D) Lever 1b). The corpus shows ~21% blank-source /
//! no-doc-type postings — see `experiments/ml/FINDINGS.md §9`. Closing the gap
//! requires a post-process drop, not a generator change: the source codes
//! themselves are realistic per-process; what's missing is the *absence
//! pattern* the corpus exhibits.
//!
//! Why post-process: the same multi-generator coverage problem that motivated
//! the central abstraction — every generator path emits a source code, and a
//! per-generator opt-out would require ~7 wiring points. One trait + one pass
//! covers them all.
//!
//! Safety: `sap_source_code` is an observable attribute; no downstream
//! invariant (balance, document-chain refs, subledger reconciliation,
//! `is_balanced()`) reads it. Other passes that DO read it
//! (`SourceConditionalRarityPass`, `AccountPairSubstitutionPass`) handle
//! `None` correctly — they just skip those JEs. Pipeline ordering matters:
//! `SourceBlankingPass` must run AFTER those passes, otherwise their PMF
//! coverage drops by the blanking rate. Pipeline registration order (in
//! `mod.rs::from_config`) enforces this.
25
26use std::collections::BTreeMap;
27
28use datasynth_config::schema::SourceBlankingPassConfig;
29use datasynth_core::models::JournalEntry;
30use rand::prelude::*;
31use rand_chacha::ChaCha8Rng;
32
33use super::{ConcentrationPass, ConcentrationStats};
34
/// Identifier of this pass, returned by `name()` and stamped into the `pass`
/// field of every `ConcentrationStats` the pass produces.
const PASS_NAME: &str = "source_blanking";
36
/// Post-processing pass that blanks the SAP source code on a random subset of
/// journal entries.
///
/// The only state is the per-entry blanking probability; all randomness is
/// supplied by the caller's RNG at `apply` time.
#[derive(Debug, Clone)]
pub struct SourceBlankingPass {
    /// Probability with which each entry that still carries a non-empty
    /// source code is blanked.
    rate: f64,
}
40
41impl SourceBlankingPass {
42    pub fn new(cfg: SourceBlankingPassConfig) -> Self {
43        Self {
44            // Clamp to [0.0, 1.0] so a misconfigured value can't break the pass.
45            rate: cfg.rate.clamp(0.0, 1.0),
46        }
47    }
48}
49
50impl ConcentrationPass for SourceBlankingPass {
51    fn name(&self) -> &'static str {
52        PASS_NAME
53    }
54
55    fn apply(&self, entries: &mut [JournalEntry], rng: &mut ChaCha8Rng) -> ConcentrationStats {
56        if self.rate == 0.0 {
57            // Fast path — pass is opted out; emit stats but make no changes.
58            return ConcentrationStats {
59                pass: PASS_NAME,
60                entries_examined: entries.len(),
61                entries_modified: 0,
62                extra: BTreeMap::new(),
63            };
64        }
65
66        let mut blanked: usize = 0;
67        let mut already_blank: u64 = 0;
68        for je in entries.iter_mut() {
69            // "Already blank" = sap_source_code missing or empty string.
70            // Both states are treated the same downstream by the CSV writer
71            // (it back-fills the TransactionSource Display label only when
72            // sap_source_code is None, NOT when it's an empty string — so we
73            // express "intentionally blanked" as `Some("")` to override the
74            // writer's fallback and match the corpus's literal-empty Source
75            // column semantic).
76            match je.header.sap_source_code.as_deref() {
77                None | Some("") => {
78                    already_blank += 1;
79                    continue;
80                }
81                _ => {}
82            }
83            let draw: f64 = rng.random();
84            if draw < self.rate {
85                // See above: `Some("")` (not `None`) so the output writer
86                // honors the blank intent. None still falls back to the
87                // TransactionSource Display label for the priors-disabled
88                // legacy path; that fallback is preserved.
89                je.header.sap_source_code = Some(String::new());
90                blanked += 1;
91            }
92        }
93
94        let mut extra = BTreeMap::new();
95        extra.insert("blanked", blanked as u64);
96        extra.insert("already_blank", already_blank);
97        // Effective rate in basis points (1bp = 0.01%) — cheap integer encoding so
98        // observability doesn't need a float field.
99        let total = entries.len() as u64;
100        let target_bp = (self.rate * 10_000.0) as u64;
101        extra.insert("target_rate_bp", target_bp);
102        if let Some(eff_bp) = (blanked as u64 * 10_000).checked_div(total) {
103            extra.insert("effective_rate_bp", eff_bp);
104        }
105
106        ConcentrationStats {
107            pass: PASS_NAME,
108            entries_examined: entries.len(),
109            entries_modified: blanked,
110            extra,
111        }
112    }
113}
114
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{JournalEntry, JournalEntryLine};
    use rand::SeedableRng;

    /// Builds a one-line journal entry whose `sap_source_code` is `source`.
    fn make_je(idx: usize, source: Option<&str>) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let mut entry = JournalEntry::new_simple(
            format!("JE{idx}"),
            "C1".to_string(),
            date,
            format!("test {idx}"),
        );
        entry.header.sap_source_code = source.map(String::from);
        entry.lines.push(JournalEntryLine {
            gl_account: "6000".to_string(),
            ..JournalEntryLine::default()
        });
        entry
    }

    /// `n` entries that all carry the source code "BKPF".
    fn uniform_batch(n: usize) -> Vec<JournalEntry> {
        (0..n).map(|i| make_je(i, Some("BKPF"))).collect()
    }

    /// Runs a freshly built pass with the given rate over `entries`, using a
    /// ChaCha8 RNG seeded with `seed`.
    fn run_pass(rate: f64, seed: u64, entries: &mut [JournalEntry]) -> ConcentrationStats {
        let pass = SourceBlankingPass::new(SourceBlankingPassConfig { rate });
        let mut rng = ChaCha8Rng::seed_from_u64(seed);
        pass.apply(entries, &mut rng)
    }

    #[test]
    fn rate_zero_leaves_all_sources_intact() {
        let mut batch = uniform_batch(100);
        let stats = run_pass(0.0, 0, &mut batch);

        assert_eq!(stats.entries_modified, 0);
        assert!(batch
            .iter()
            .all(|je| je.header.sap_source_code.as_deref() == Some("BKPF")));
    }

    #[test]
    fn rate_one_blanks_every_source() {
        let mut batch = uniform_batch(100);
        let stats = run_pass(1.0, 1, &mut batch);

        assert_eq!(stats.entries_modified, 100);
        // A blanked row holds the "intentionally blank" marker `Some("")`;
        // rows that previously had a code must never end up as `None`.
        assert!(batch
            .iter()
            .all(|je| je.header.sap_source_code.as_deref() == Some("")));
    }

    #[test]
    fn rate_021_lands_in_corpus_band() {
        // 2000 draws at p=0.21 give a mean of ~420 with a binomial standard
        // deviation of √(npq) ≈ 18, so a ±100 window is roughly 5σ wide —
        // tight enough to be meaningful, loose enough to never flake.
        let mut batch = uniform_batch(2000);
        let stats = run_pass(0.21, 7, &mut batch);

        let blanked = stats.entries_modified;
        let in_band = (320..=520).contains(&blanked);
        assert!(in_band, "rate=0.21 blanked={} (expected ~420)", blanked);
        assert_eq!(stats.extra["target_rate_bp"], 2100);
    }

    #[test]
    fn already_blank_jes_pass_through_uncounted() {
        // Odd-indexed entries start with no source code; they must be skipped
        // before any RNG roll and reported as `already_blank`.
        let mut batch: Vec<JournalEntry> = (0..100)
            .map(|i| {
                let source = if i % 2 == 0 { Some("BKPF") } else { None };
                make_je(i, source)
            })
            .collect();
        let stats = run_pass(1.0, 2, &mut batch);

        // Only the 50 entries that originally had a code can be modified.
        assert_eq!(stats.entries_modified, 50);
        assert_eq!(stats.extra["already_blank"], 50);
    }

    #[test]
    fn deterministic_under_same_seed() {
        let mut first = uniform_batch(100);
        let mut second = uniform_batch(100);

        // Two independently built passes, identical config and seed.
        run_pass(0.3, 42, &mut first);
        run_pass(0.3, 42, &mut second);

        for (lhs, rhs) in first.iter().zip(second.iter()) {
            assert_eq!(lhs.header.sap_source_code, rhs.header.sap_source_code);
        }
    }
}