Skip to main content

datasynth_generators/concentration/
source_blanking.rs

1//! `SourceBlankingPass` — Phase 1.5 of the central concentration abstraction
2//! (#143). Closes SOTA-7 (#132) by nulling `sap_source_code` on a configurable
3//! fraction of JEs to match the corpus's ~21% blank-source rate.
4//!
5//! Synthetic currently emits an SAP source code on ~100% of JEs (default-on
6//! after SOTA-3 / T2(D) Lever 1b). The corpus shows ~21% blank-source / no-
7//! doc-type postings — see `experiments/ml/FINDINGS.md §9`. Closing the gap
8//! requires a post-process drop, not a generator change: the source codes
9//! themselves are realistic per-process; what's missing is the *absence
10//! pattern* the corpus exhibits.
11//!
12//! Why post-process: the same multi-generator coverage problem that motivated
13//! the central abstraction — every generator path emits a source code, and a
14//! per-generator opt-out would require ~7 wiring points. One trait + one pass
15//! covers them all.
16//!
17//! Safety: `sap_source_code` is an observable attribute; no downstream
18//! invariant (balance, document-chain refs, subledger reconciliation,
19//! `is_balanced()`) reads it. Other passes that DO read it
20//! (`SourceConditionalRarityPass`, `AccountPairSubstitutionPass`) handle
21//! `None` correctly — they just skip those JEs. Pipeline ordering matters:
22//! `SourceBlankingPass` must run AFTER those passes, otherwise their PMF
23//! coverage drops by the blanking rate. Pipeline registration order (in
24//! `mod.rs::from_config`) enforces this.
25
26use std::collections::BTreeMap;
27
28use datasynth_config::schema::SourceBlankingPassConfig;
29use datasynth_core::models::JournalEntry;
30use rand::prelude::*;
31use rand_chacha::ChaCha8Rng;
32
33use super::{ConcentrationPass, ConcentrationStats};
34
/// Identifier of this pass: returned by `ConcentrationPass::name` and written
/// into the `pass` field of every `ConcentrationStats` the pass produces.
const PASS_NAME: &str = "source_blanking";

/// Concentration pass that sets `sap_source_code` to `None` on a random
/// subset of journal entries.
///
/// Build it with [`SourceBlankingPass::new`], which clamps the configured
/// rate to `[0.0, 1.0]`.
#[derive(Debug, Clone, PartialEq)]
pub struct SourceBlankingPass {
    /// Per-entry probability that an entry which still carries a source code
    /// gets blanked.
    rate: f64,
}
40
41impl SourceBlankingPass {
42    pub fn new(cfg: SourceBlankingPassConfig) -> Self {
43        Self {
44            // Clamp to [0.0, 1.0] so a misconfigured value can't break the pass.
45            rate: cfg.rate.clamp(0.0, 1.0),
46        }
47    }
48}
49
50impl ConcentrationPass for SourceBlankingPass {
51    fn name(&self) -> &'static str {
52        PASS_NAME
53    }
54
55    fn apply(&self, entries: &mut [JournalEntry], rng: &mut ChaCha8Rng) -> ConcentrationStats {
56        if self.rate == 0.0 {
57            // Fast path — pass is opted out; emit stats but make no changes.
58            return ConcentrationStats {
59                pass: PASS_NAME,
60                entries_examined: entries.len(),
61                entries_modified: 0,
62                extra: BTreeMap::new(),
63            };
64        }
65
66        let mut blanked: usize = 0;
67        let mut already_blank: u64 = 0;
68        for je in entries.iter_mut() {
69            if je.header.sap_source_code.is_none() {
70                already_blank += 1;
71                continue;
72            }
73            let draw: f64 = rng.random();
74            if draw < self.rate {
75                je.header.sap_source_code = None;
76                blanked += 1;
77            }
78        }
79
80        let mut extra = BTreeMap::new();
81        extra.insert("blanked", blanked as u64);
82        extra.insert("already_blank", already_blank);
83        // Effective rate in basis points (1bp = 0.01%) — cheap integer encoding so
84        // observability doesn't need a float field.
85        let total = entries.len() as u64;
86        let target_bp = (self.rate * 10_000.0) as u64;
87        extra.insert("target_rate_bp", target_bp);
88        if let Some(eff_bp) = (blanked as u64 * 10_000).checked_div(total) {
89            extra.insert("effective_rate_bp", eff_bp);
90        }
91
92        ConcentrationStats {
93            pass: PASS_NAME,
94            entries_examined: entries.len(),
95            entries_modified: blanked,
96            extra,
97        }
98    }
99}
100
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{JournalEntry, JournalEntryLine};
    use rand::SeedableRng;

    /// Builds a single-line journal entry `JE{idx}` whose header carries the
    /// given source code (or none).
    fn make_je(idx: usize, source: Option<&str>) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let mut entry = JournalEntry::new_simple(
            format!("JE{idx}"),
            String::from("C1"),
            date,
            format!("test {idx}"),
        );
        entry.header.sap_source_code = source.map(str::to_owned);
        entry.lines.push(JournalEntryLine {
            gl_account: String::from("6000"),
            ..JournalEntryLine::default()
        });
        entry
    }

    /// `count` entries that all carry the source code `"BKPF"`.
    fn sourced_batch(count: usize) -> Vec<JournalEntry> {
        (0..count).map(|i| make_je(i, Some("BKPF"))).collect()
    }

    #[test]
    fn rate_zero_leaves_all_sources_intact() {
        let mut batch = sourced_batch(100);
        let mut rng = ChaCha8Rng::seed_from_u64(0);
        let blanker = SourceBlankingPass::new(SourceBlankingPassConfig { rate: 0.0 });

        let stats = blanker.apply(&mut batch, &mut rng);

        assert_eq!(stats.entries_modified, 0);
        assert!(batch
            .iter()
            .all(|je| je.header.sap_source_code.as_deref() == Some("BKPF")));
    }

    #[test]
    fn rate_one_blanks_every_source() {
        let mut batch = sourced_batch(100);
        let mut rng = ChaCha8Rng::seed_from_u64(1);
        let blanker = SourceBlankingPass::new(SourceBlankingPassConfig { rate: 1.0 });

        let stats = blanker.apply(&mut batch, &mut rng);

        assert_eq!(stats.entries_modified, 100);
        assert!(batch.iter().all(|je| je.header.sap_source_code.is_none()));
    }

    #[test]
    fn rate_021_lands_in_corpus_band() {
        // With n=2000 and rate=0.21 the expected count is ~420 and the
        // binomial std is √(npq) ≈ 18, so a ±100 window is roughly 5σ: wide
        // enough to absorb sampling noise, tight enough to catch a wrong rate.
        let mut batch = sourced_batch(2000);
        let mut rng = ChaCha8Rng::seed_from_u64(7);
        let blanker = SourceBlankingPass::new(SourceBlankingPassConfig { rate: 0.21 });

        let stats = blanker.apply(&mut batch, &mut rng);

        let blanked = stats.entries_modified;
        assert!(
            (320..=520).contains(&blanked),
            "rate=0.21 blanked={} (expected ~420)",
            blanked
        );
        assert_eq!(stats.extra["target_rate_bp"], 2100);
    }

    #[test]
    fn already_blank_jes_pass_through_uncounted() {
        // Odd-indexed entries start out as None; they must be skipped before
        // the RNG roll and must not count as modified.
        let mut batch: Vec<JournalEntry> = (0..100)
            .map(|i| {
                let source = if i % 2 == 0 { Some("BKPF") } else { None };
                make_je(i, source)
            })
            .collect();
        let mut rng = ChaCha8Rng::seed_from_u64(2);
        let blanker = SourceBlankingPass::new(SourceBlankingPassConfig { rate: 1.0 });

        let stats = blanker.apply(&mut batch, &mut rng);

        assert_eq!(stats.entries_modified, 50); // only the 50 originally-set
        assert_eq!(stats.extra["already_blank"], 50);
    }

    #[test]
    fn deterministic_under_same_seed() {
        // Runs a fresh pass over a fresh batch with a fixed seed and hands
        // back the mutated batch.
        let run = |cfg: SourceBlankingPassConfig| -> Vec<JournalEntry> {
            let mut batch = sourced_batch(100);
            let mut rng = ChaCha8Rng::seed_from_u64(42);
            SourceBlankingPass::new(cfg).apply(&mut batch, &mut rng);
            batch
        };

        let cfg = SourceBlankingPassConfig { rate: 0.3 };
        let first = run(cfg.clone());
        let second = run(cfg);

        for (lhs, rhs) in first.iter().zip(second.iter()) {
            assert_eq!(lhs.header.sap_source_code, rhs.header.sap_source_code);
        }
    }
}