datasynth_generators/anomaly/
source_conditional_rarity.rs1use std::collections::HashMap;
20
21use datasynth_core::models::JournalEntry;
22
/// Tuning knobs for [`tag_source_conditional_rarity`].
#[derive(Debug, Clone)]
pub struct SourceConditionalRarityConfig {
    /// Fraction of the input entries that may be considered for tagging.
    ///
    /// The candidate budget is `round(entries.len() * rate)`; a value `<= 0.0`
    /// turns the pass into a no-op.
    pub rate: f64,
    /// Minimum summed surprise (`-ln p` accumulated over an entry's lines) an
    /// entry must reach before it is tagged.
    pub min_surprise: f64,
    /// Minimum number of lines a source code must have across all entries
    /// before entries from that source are scored at all.
    pub min_per_source_lines: u32,
}

impl Default for SourceConditionalRarityConfig {
    /// Defaults: consider 1% of entries, require a surprise of at least 5.0,
    /// and skip sources with fewer than 5 lines.
    fn default() -> Self {
        SourceConditionalRarityConfig {
            min_per_source_lines: 5,
            min_surprise: 5.0,
            rate: 0.01,
        }
    }
}
47
/// Label stored in `header.anomaly_type` for every entry tagged by
/// `tag_source_conditional_rarity`.
const SOURCE_COND_RARITY_LABEL: &str = "SourceConditionalRarity";
49
50pub fn tag_source_conditional_rarity(
55 entries: &mut [JournalEntry],
56 cfg: &SourceConditionalRarityConfig,
57) -> usize {
58 if entries.is_empty() || cfg.rate <= 0.0 {
59 return 0;
60 }
61
62 let mut acct_count: HashMap<String, HashMap<String, u32>> = HashMap::new();
64 let mut total_per_source: HashMap<String, u32> = HashMap::new();
65 for je in entries.iter() {
66 let src = match je.header.sap_source_code.as_deref() {
67 Some(s) if !s.is_empty() => s.to_string(),
68 _ => continue,
69 };
70 let inner = acct_count.entry(src.clone()).or_default();
71 for line in &je.lines {
72 *inner.entry(line.gl_account.clone()).or_insert(0) += 1;
73 *total_per_source.entry(src.clone()).or_insert(0) += 1;
74 }
75 }
76
77 let mut scores: Vec<(usize, f64)> = Vec::with_capacity(entries.len());
79 for (idx, je) in entries.iter().enumerate() {
80 let src = match je.header.sap_source_code.as_deref() {
81 Some(s) if !s.is_empty() => s,
82 _ => continue,
83 };
84 let total = *total_per_source.get(src).unwrap_or(&0);
85 if total < cfg.min_per_source_lines {
86 continue;
87 }
88 let total_f = total as f64;
89 let pmf = match acct_count.get(src) {
90 Some(m) => m,
91 None => continue,
92 };
93 let mut surprise = 0.0_f64;
94 for line in &je.lines {
95 let count = *pmf.get(&line.gl_account).unwrap_or(&0);
96 let p = (count as f64 + 0.5) / (total_f + 0.5);
98 surprise += -(p.ln());
99 }
100 scores.push((idx, surprise));
101 }
102
103 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
105 let n_top = ((entries.len() as f64) * cfg.rate).round() as usize;
106 let n_top = n_top.min(scores.len()).max(1);
107 let mut tagged = 0usize;
108 for (idx, surprise) in scores.into_iter().take(n_top) {
109 if surprise < cfg.min_surprise {
110 break; }
112 let je = &mut entries[idx];
113 je.header.is_anomaly = true;
114 je.header.anomaly_type = Some(SOURCE_COND_RARITY_LABEL.to_string());
115 tagged += 1;
116 }
117 tagged
118}
119
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::JournalEntryLine;
    use rust_decimal_macros::dec;

    /// Builds an entry under `source` with one debit line and one credit line
    /// of 100 each, posted to the given accounts.
    fn make_je(id: &str, source: &str, debit_acct: &str, credit_acct: &str) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2026, 1, 15).unwrap();
        let mut entry = JournalEntry::new_simple(
            id.to_string(),
            "C1".to_string(),
            date,
            format!("test {id}"),
        );
        entry.header.sap_source_code = Some(source.to_string());

        let debit = JournalEntryLine {
            line_number: 1,
            gl_account: debit_acct.to_string(),
            debit_amount: dec!(100),
            ..Default::default()
        };
        let credit = JournalEntryLine {
            line_number: 2,
            gl_account: credit_acct.to_string(),
            credit_amount: dec!(100),
            ..Default::default()
        };
        entry.add_line(debit);
        entry.add_line(credit);
        entry
    }

    /// `n` identical entries under source "SA" posting 1000 against 2000.
    fn common_entries(n: usize) -> Vec<JournalEntry> {
        let mut batch = Vec::with_capacity(n);
        for i in 0..n {
            batch.push(make_je(&format!("je{i}"), "SA", "1000", "2000"));
        }
        batch
    }

    /// One entry with otherwise-unseen accounts among 99 identical ones must
    /// be tagged, and at most one common entry may be tagged with it.
    #[test]
    fn tags_the_rare_pair_under_a_common_source() {
        let mut entries = common_entries(99);
        entries.push(make_je("rare", "SA", "9999", "8888"));

        let cfg = SourceConditionalRarityConfig {
            rate: 0.02,
            min_surprise: 0.5,
            ..Default::default()
        };
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert!(tagged >= 1, "expected ≥ 1 tag, got {tagged}");

        let (rare, common) = entries.split_last().expect("entries is non-empty");
        assert!(rare.header.is_anomaly, "rare JE header not flagged");
        assert_eq!(
            rare.header.anomaly_type.as_deref(),
            Some("SourceConditionalRarity")
        );

        let common_tagged = common.iter().filter(|e| e.header.is_anomaly).count();
        assert!(
            common_tagged <= 1,
            "common JEs over-tagged: {common_tagged}"
        );
    }

    /// A source with only two lines is below the default per-source minimum,
    /// so nothing is tagged even with a permissive rate and floor.
    #[test]
    fn skips_sources_with_too_few_lines() {
        let cfg = SourceConditionalRarityConfig {
            rate: 1.0,
            min_surprise: 0.0,
            ..Default::default()
        };
        let mut entries = vec![make_je("solo", "ZZ", "1000", "2000")];
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert_eq!(tagged, 0, "should skip when per-source data is too sparse");
    }

    /// A surprise floor no entry can reach suppresses all tagging.
    #[test]
    fn respects_min_surprise_floor() {
        let cfg = SourceConditionalRarityConfig {
            rate: 0.10,
            min_surprise: 100.0,
            ..Default::default()
        };
        let mut entries = common_entries(50);
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert_eq!(
            tagged, 0,
            "unreachable min_surprise should suppress tagging"
        );
    }

    /// An empty slice is accepted and yields zero tags.
    #[test]
    fn no_op_on_empty_input() {
        let cfg = SourceConditionalRarityConfig::default();
        let mut entries: Vec<JournalEntry> = Vec::new();
        assert_eq!(tag_source_conditional_rarity(&mut entries, &cfg), 0);
    }
}