datasynth_generators/concentration/
source_blanking.rs1use std::collections::BTreeMap;
27
28use datasynth_config::schema::SourceBlankingPassConfig;
29use datasynth_core::models::JournalEntry;
30use rand::prelude::*;
31use rand_chacha::ChaCha8Rng;
32
33use super::{ConcentrationPass, ConcentrationStats};
34
/// Identifier reported by this pass, both from `name()` and in the `pass`
/// field of the statistics it returns.
const PASS_NAME: &str = "source_blanking";
36
/// Concentration pass that blanks the SAP source code on a random fraction
/// of journal entries.
pub struct SourceBlankingPass {
    /// Per-entry probability of blanking a populated source code.
    /// `new` clamps the configured value into `[0.0, 1.0]`.
    rate: f64,
}
40
41impl SourceBlankingPass {
42 pub fn new(cfg: SourceBlankingPassConfig) -> Self {
43 Self {
44 rate: cfg.rate.clamp(0.0, 1.0),
46 }
47 }
48}
49
50impl ConcentrationPass for SourceBlankingPass {
51 fn name(&self) -> &'static str {
52 PASS_NAME
53 }
54
55 fn apply(&self, entries: &mut [JournalEntry], rng: &mut ChaCha8Rng) -> ConcentrationStats {
56 if self.rate == 0.0 {
57 return ConcentrationStats {
59 pass: PASS_NAME,
60 entries_examined: entries.len(),
61 entries_modified: 0,
62 extra: BTreeMap::new(),
63 };
64 }
65
66 let mut blanked: usize = 0;
67 let mut already_blank: u64 = 0;
68 for je in entries.iter_mut() {
69 match je.header.sap_source_code.as_deref() {
77 None | Some("") => {
78 already_blank += 1;
79 continue;
80 }
81 _ => {}
82 }
83 let draw: f64 = rng.random();
84 if draw < self.rate {
85 je.header.sap_source_code = Some(String::new());
90 blanked += 1;
91 }
92 }
93
94 let mut extra = BTreeMap::new();
95 extra.insert("blanked", blanked as u64);
96 extra.insert("already_blank", already_blank);
97 let total = entries.len() as u64;
100 let target_bp = (self.rate * 10_000.0) as u64;
101 extra.insert("target_rate_bp", target_bp);
102 if let Some(eff_bp) = (blanked as u64 * 10_000).checked_div(total) {
103 extra.insert("effective_rate_bp", eff_bp);
104 }
105
106 ConcentrationStats {
107 pass: PASS_NAME,
108 entries_examined: entries.len(),
109 entries_modified: blanked,
110 extra,
111 }
112 }
113}
114
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{JournalEntry, JournalEntryLine};
    use rand::SeedableRng;

    /// Builds a one-line journal entry whose header carries `source` as its
    /// SAP source code.
    fn make_je(idx: usize, source: Option<&str>) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let mut je = JournalEntry::new_simple(
            format!("JE{idx}"),
            "C1".to_string(),
            date,
            format!("test {idx}"),
        );
        je.header.sap_source_code = source.map(str::to_owned);
        je.lines.push(JournalEntryLine {
            gl_account: "6000".to_string(),
            ..Default::default()
        });
        je
    }

    /// Builds `n` entries that all carry the source code `"BKPF"`.
    fn populated_batch(n: usize) -> Vec<JournalEntry> {
        (0..n).map(|i| make_je(i, Some("BKPF"))).collect()
    }

    /// Runs a freshly built pass with `rate` over `entries`, using an RNG
    /// seeded with `seed`.
    fn run_pass(rate: f64, seed: u64, entries: &mut [JournalEntry]) -> ConcentrationStats {
        let pass = SourceBlankingPass::new(SourceBlankingPassConfig { rate });
        let mut rng = ChaCha8Rng::seed_from_u64(seed);
        pass.apply(entries, &mut rng)
    }

    #[test]
    fn rate_zero_leaves_all_sources_intact() {
        let mut entries = populated_batch(100);
        let stats = run_pass(0.0, 0, &mut entries);
        assert_eq!(stats.entries_modified, 0);
        for je in &entries {
            assert_eq!(je.header.sap_source_code.as_deref(), Some("BKPF"));
        }
    }

    #[test]
    fn rate_one_blanks_every_source() {
        let mut entries = populated_batch(100);
        let stats = run_pass(1.0, 1, &mut entries);
        assert_eq!(stats.entries_modified, 100);
        for je in &entries {
            // Blanked entries hold an empty string, not `None`.
            assert_eq!(je.header.sap_source_code.as_deref(), Some(""));
        }
    }

    #[test]
    fn rate_021_lands_in_corpus_band() {
        let mut entries = populated_batch(2000);
        let stats = run_pass(0.21, 7, &mut entries);
        let blanked = stats.entries_modified;
        // 2000 entries at 21% gives ~420 expected; accept a band around it.
        assert!(
            (320..=520).contains(&blanked),
            "rate=0.21 blanked={} (expected ~420)",
            blanked
        );
        assert_eq!(stats.extra["target_rate_bp"], 2100);
    }

    #[test]
    fn already_blank_jes_pass_through_uncounted() {
        // Even indices carry a source code, odd indices have none.
        let mut entries: Vec<JournalEntry> = (0..100)
            .map(|i| make_je(i, (i % 2 == 0).then_some("BKPF")))
            .collect();
        let stats = run_pass(1.0, 2, &mut entries);
        assert_eq!(stats.entries_modified, 50);
        assert_eq!(stats.extra["already_blank"], 50);
    }

    #[test]
    fn deterministic_under_same_seed() {
        // Two independent runs over identical input with the same seed.
        let run = |seed: u64| -> Vec<JournalEntry> {
            let mut batch = populated_batch(100);
            run_pass(0.3, seed, &mut batch);
            batch
        };
        let batch_a = run(42);
        let batch_b = run(42);

        for (a, b) in batch_a.iter().zip(batch_b.iter()) {
            assert_eq!(a.header.sap_source_code, b.header.sap_source_code);
        }
    }
}