Skip to main content

datasynth_generators/anomaly/
source_conditional_rarity.rs

1//! SOTA-12 (#140, FINDINGS §13): source-conditional-rarity anomaly tagging.
2//!
3//! The corpus audit packet showed `source_cond_edge_surprise_max` is the dominant
4//! explainer for **100% of top-50 JEs** at production scale. None of the existing
5//! synthetic anomaly types target this pattern (`UnusualAccountPair` and
6//! `NewCounterparty` operate on global rarity, not source-conditional). This module
7//! adds the missing class as a **post-process** over generated JEs — sidestepping the
8//! SOTA-8 / SOTA-11 coverage problem because every JE is in the input regardless of
9//! which generator produced it.
10//!
11//! Algorithm (per call):
12//!   1. Build per-source empirical PMF `P(account | source)` from the JEs that carry
13//!      an SAP source code.
14//!   2. Score each JE by Σ over its lines of `-log P(account | source)` — the
15//!      source-conditional surprise.
16//!   3. Sort descending. The top `rate × n_jes` JEs whose surprise also clears
17//!      `min_surprise` are tagged as `RelationalAnomalyType::SourceConditionalRarity`.
18
19use std::collections::HashMap;
20
21use datasynth_core::models::JournalEntry;
22
/// Tuning knobs for the source-conditional-rarity tagging pass.
#[derive(Clone, Debug)]
pub struct SourceConditionalRarityConfig {
    /// Share of the input JEs that may be tagged, expressed as a fraction of
    /// the batch size. Defaults to 0.01, chosen to match the audit-packet
    /// hot-list size.
    pub rate: f64,
    /// Floor on a JE's score, i.e. the sum over its lines of
    /// `-log P(account | source)`. A JE scoring below the floor is never
    /// tagged, even when the `rate` budget still has room, which keeps
    /// mildly-surprising JEs out. 0.0 disables the floor.
    pub min_surprise: f64,
    /// Smallest number of lines a source must contribute before its PMF is
    /// considered reliable; JEs from sparser sources are skipped. Defaults
    /// to 5.
    pub min_per_source_lines: u32,
}
37
38impl Default for SourceConditionalRarityConfig {
39    fn default() -> Self {
40        Self {
41            rate: 0.01,
42            min_surprise: 5.0,
43            min_per_source_lines: 5,
44        }
45    }
46}
47
/// Label written into a tagged JE's `header.anomaly_type` by
/// [`tag_source_conditional_rarity`].
const SOURCE_COND_RARITY_LABEL: &str = "SourceConditionalRarity";
49
50/// Tag the top `rate × n_jes` source-conditionally-rare JEs in `entries`, mutating
51/// each tagged JE's **header** (`is_anomaly = true`, `anomaly_type = "Source-
52/// ConditionalRarity"`) — same pattern as the existing relational anomaly strategies.
53/// Returns the count of JEs actually tagged.
54pub fn tag_source_conditional_rarity(
55    entries: &mut [JournalEntry],
56    cfg: &SourceConditionalRarityConfig,
57) -> usize {
58    if entries.is_empty() || cfg.rate <= 0.0 {
59        return 0;
60    }
61
62    // 1. Per-source PMF: source -> (account -> count); source -> total_lines.
63    let mut acct_count: HashMap<String, HashMap<String, u32>> = HashMap::new();
64    let mut total_per_source: HashMap<String, u32> = HashMap::new();
65    for je in entries.iter() {
66        let src = match je.header.sap_source_code.as_deref() {
67            Some(s) if !s.is_empty() => s.to_string(),
68            _ => continue,
69        };
70        let inner = acct_count.entry(src.clone()).or_default();
71        for line in &je.lines {
72            *inner.entry(line.gl_account.clone()).or_insert(0) += 1;
73            *total_per_source.entry(src.clone()).or_insert(0) += 1;
74        }
75    }
76
77    // 2. Score each JE by Σ -log P(account | source).
78    let mut scores: Vec<(usize, f64)> = Vec::with_capacity(entries.len());
79    for (idx, je) in entries.iter().enumerate() {
80        let src = match je.header.sap_source_code.as_deref() {
81            Some(s) if !s.is_empty() => s,
82            _ => continue,
83        };
84        let total = *total_per_source.get(src).unwrap_or(&0);
85        if total < cfg.min_per_source_lines {
86            continue;
87        }
88        let total_f = total as f64;
89        let pmf = match acct_count.get(src) {
90            Some(m) => m,
91            None => continue,
92        };
93        let mut surprise = 0.0_f64;
94        for line in &je.lines {
95            let count = *pmf.get(&line.gl_account).unwrap_or(&0);
96            // additive smoothing so unseen accounts get a finite (large) surprise
97            let p = (count as f64 + 0.5) / (total_f + 0.5);
98            surprise += -(p.ln());
99        }
100        scores.push((idx, surprise));
101    }
102
103    // 3. Sort descending; pick the top by rate AND clear the floor.
104    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
105    let n_top = ((entries.len() as f64) * cfg.rate).round() as usize;
106    let n_top = n_top.min(scores.len()).max(1);
107    let mut tagged = 0usize;
108    for (idx, surprise) in scores.into_iter().take(n_top) {
109        if surprise < cfg.min_surprise {
110            break; // sorted descending — any subsequent JE is also below threshold
111        }
112        let je = &mut entries[idx];
113        je.header.is_anomaly = true;
114        je.header.anomaly_type = Some(SOURCE_COND_RARITY_LABEL.to_string());
115        tagged += 1;
116    }
117    tagged
118}
119
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::JournalEntryLine;
    use rust_decimal_macros::dec;

    /// Builds a two-line JE under `source`: line 1 debits `debit_acct` for 100,
    /// line 2 credits `credit_acct` for 100.
    fn make_je(id: &str, source: &str, debit_acct: &str, credit_acct: &str) -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2026, 1, 15).unwrap();
        let mut entry = JournalEntry::new_simple(
            id.to_string(),
            "C1".to_string(),
            date,
            format!("test {id}"),
        );
        entry.header.sap_source_code = Some(source.to_string());

        let debit = JournalEntryLine {
            line_number: 1,
            gl_account: debit_acct.to_string(),
            debit_amount: dec!(100),
            ..Default::default()
        };
        entry.add_line(debit);

        let credit = JournalEntryLine {
            line_number: 2,
            gl_account: credit_acct.to_string(),
            credit_amount: dec!(100),
            ..Default::default()
        };
        entry.add_line(credit);

        entry
    }

    #[test]
    fn tags_the_rare_pair_under_a_common_source() {
        // Source "SA" carries 99 JEs on the usual (1000, 2000) pair and a single
        // JE on (9999, 8888), which must surface as the anomaly.
        let mut entries: Vec<JournalEntry> = Vec::with_capacity(100);
        for i in 0..99 {
            entries.push(make_je(&format!("je{i}"), "SA", "1000", "2000"));
        }
        entries.push(make_je("rare", "SA", "9999", "8888"));

        let cfg = SourceConditionalRarityConfig {
            rate: 0.02,
            min_surprise: 0.5,
            ..Default::default()
        };
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert!(tagged >= 1, "expected ≥ 1 tag, got {tagged}");

        let (common, tail) = entries.split_at(99);

        // The outlier sits at index 99 and has to carry the label.
        let rare = &tail[0];
        assert!(rare.header.is_anomaly, "rare JE header not flagged");
        assert_eq!(
            rare.header.anomaly_type.as_deref(),
            Some("SourceConditionalRarity")
        );

        // The rate budget leaves room for at most one common JE to be swept in.
        let common_tagged = common.iter().filter(|e| e.header.is_anomaly).count();
        assert!(
            common_tagged <= 1,
            "common JEs over-tagged: {common_tagged}"
        );
    }

    #[test]
    fn skips_sources_with_too_few_lines() {
        // One JE means two lines for source "ZZ", below the default floor of 5.
        let mut entries = vec![make_je("solo", "ZZ", "1000", "2000")];
        let cfg = SourceConditionalRarityConfig {
            rate: 1.0,
            min_surprise: 0.0,
            ..Default::default()
        };
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert_eq!(tagged, 0, "should skip when per-source data is too sparse");
    }

    #[test]
    fn respects_min_surprise_floor() {
        // Fifty identical JEs: none of them can reach a surprise of 100.
        let mut entries = Vec::with_capacity(50);
        for i in 0..50 {
            entries.push(make_je(&format!("je{i}"), "SA", "1000", "2000"));
        }
        let cfg = SourceConditionalRarityConfig {
            rate: 0.10,
            min_surprise: 100.0,
            ..Default::default()
        };
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert_eq!(
            tagged, 0,
            "unreachable min_surprise should suppress tagging"
        );
    }

    #[test]
    fn no_op_on_empty_input() {
        let mut entries: Vec<JournalEntry> = Vec::new();
        let cfg = SourceConditionalRarityConfig::default();
        let tagged = tag_source_conditional_rarity(&mut entries, &cfg);
        assert_eq!(tagged, 0);
    }
}