// datasynth_eval/coherence/fraud_packs.rs

use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

4/// Input data describing configured vs actual fraud generation.
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct FraudPackData {
7    pub configured_fraud_rate: f64,
8    pub actual_fraud_count: usize,
9    pub total_records: usize,
10    pub configured_scheme_types: Vec<String>,
11    pub actual_scheme_types: Vec<String>,
12    pub scheme_type_counts: HashMap<String, usize>,
13}
14
15/// Thresholds for fraud pack effectiveness.
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct FraudPackThresholds {
18    /// Minimum acceptable rate accuracy (1.0 - |configured - actual| / configured).
19    /// Default: 0.70.
20    pub min_rate_accuracy: f64,
21    /// Minimum fraction of configured scheme types that appear in output.
22    /// Default: 0.80.
23    pub min_scheme_coverage: f64,
24    /// Minimum Shannon entropy of scheme distribution (higher = more uniform).
25    /// Default: 0.5.
26    pub min_distribution_entropy: f64,
27}
28
29impl Default for FraudPackThresholds {
30    fn default() -> Self {
31        Self {
32            min_rate_accuracy: 0.70,
33            min_scheme_coverage: 0.80,
34            min_distribution_entropy: 0.5,
35        }
36    }
37}
38
39/// Result of fraud pack effectiveness analysis.
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FraudPackAnalysis {
42    pub configured_rate: f64,
43    pub actual_rate: f64,
44    /// Rate accuracy: 1.0 - |configured - actual| / configured.
45    pub rate_accuracy: f64,
46    /// Fraction of configured scheme types that appear in output.
47    pub scheme_coverage: f64,
48    /// Shannon entropy of scheme distribution (normalized by log2(n_types)).
49    pub scheme_distribution_entropy: f64,
50    pub passes: bool,
51    pub issues: Vec<String>,
52}
53
54pub struct FraudPackAnalyzer {
55    thresholds: FraudPackThresholds,
56}
57
58impl FraudPackAnalyzer {
59    pub fn new(thresholds: FraudPackThresholds) -> Self {
60        Self { thresholds }
61    }
62
63    pub fn with_defaults() -> Self {
64        Self::new(FraudPackThresholds::default())
65    }
66
67    pub fn analyze(&self, data: &FraudPackData) -> FraudPackAnalysis {
68        let mut issues = Vec::new();
69
70        // Actual fraud rate
71        let actual_rate = if data.total_records > 0 {
72            data.actual_fraud_count as f64 / data.total_records as f64
73        } else {
74            0.0
75        };
76
77        // Rate accuracy
78        let rate_accuracy = if data.configured_fraud_rate > 0.0 {
79            1.0 - ((data.configured_fraud_rate - actual_rate).abs() / data.configured_fraud_rate)
80        } else if actual_rate == 0.0 {
81            1.0 // Both zero = perfect accuracy
82        } else {
83            0.0 // Configured 0 but got fraud = bad
84        };
85
86        // Scheme coverage: fraction of configured types that appear in actual
87        let scheme_coverage = if data.configured_scheme_types.is_empty() {
88            1.0
89        } else {
90            let covered = data
91                .configured_scheme_types
92                .iter()
93                .filter(|t| data.actual_scheme_types.contains(t))
94                .count();
95            covered as f64 / data.configured_scheme_types.len() as f64
96        };
97
98        // Shannon entropy of scheme distribution (normalized)
99        let scheme_distribution_entropy = {
100            let total: usize = data.scheme_type_counts.values().sum();
101            if total == 0 || data.scheme_type_counts.len() <= 1 {
102                0.0
103            } else {
104                let mut entropy = 0.0f64;
105                for &count in data.scheme_type_counts.values() {
106                    if count > 0 {
107                        let p = count as f64 / total as f64;
108                        entropy -= p * p.log2();
109                    }
110                }
111                // Normalize by max possible entropy (uniform distribution)
112                let max_entropy = (data.scheme_type_counts.len() as f64).log2();
113                if max_entropy > 0.0 {
114                    entropy / max_entropy
115                } else {
116                    0.0
117                }
118            }
119        };
120
121        // Check thresholds
122        if rate_accuracy < self.thresholds.min_rate_accuracy {
123            issues.push(format!(
124                "Rate accuracy {:.3} < threshold {:.3} (configured={:.4}, actual={:.4})",
125                rate_accuracy,
126                self.thresholds.min_rate_accuracy,
127                data.configured_fraud_rate,
128                actual_rate
129            ));
130        }
131        if scheme_coverage < self.thresholds.min_scheme_coverage {
132            issues.push(format!(
133                "Scheme coverage {:.2} < threshold {:.2}",
134                scheme_coverage, self.thresholds.min_scheme_coverage
135            ));
136        }
137        if scheme_distribution_entropy < self.thresholds.min_distribution_entropy {
138            issues.push(format!(
139                "Distribution entropy {:.3} < threshold {:.3}",
140                scheme_distribution_entropy, self.thresholds.min_distribution_entropy
141            ));
142        }
143
144        let passes = rate_accuracy >= self.thresholds.min_rate_accuracy
145            && scheme_coverage >= self.thresholds.min_scheme_coverage
146            && scheme_distribution_entropy >= self.thresholds.min_distribution_entropy;
147
148        FraudPackAnalysis {
149            configured_rate: data.configured_fraud_rate,
150            actual_rate,
151            rate_accuracy,
152            scheme_coverage,
153            scheme_distribution_entropy,
154            passes,
155            issues,
156        }
157    }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `FraudPackData` fixture from borrowed literals so each test
    /// only spells out the values it cares about.
    fn pack(
        configured_fraud_rate: f64,
        actual_fraud_count: usize,
        total_records: usize,
        configured: &[&str],
        actual: &[&str],
        counts: &[(&str, usize)],
    ) -> FraudPackData {
        FraudPackData {
            configured_fraud_rate,
            actual_fraud_count,
            total_records,
            configured_scheme_types: configured.iter().map(|&s| s.to_owned()).collect(),
            actual_scheme_types: actual.iter().map(|&s| s.to_owned()).collect(),
            scheme_type_counts: counts
                .iter()
                .map(|&(name, count)| (name.to_owned(), count))
                .collect(),
        }
    }

    #[test]
    fn test_perfect_fraud_pack() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(
            0.05,
            50,
            1000,
            &["DuplicatePayment", "SplitTransaction"],
            &["DuplicatePayment", "SplitTransaction"],
            &[("DuplicatePayment", 25), ("SplitTransaction", 25)],
        );
        let result = analyzer.analyze(&data);
        assert!(result.passes, "issues: {:?}", result.issues);
        assert_eq!(result.rate_accuracy, 1.0);
        assert_eq!(result.scheme_coverage, 1.0);
        assert!(result.scheme_distribution_entropy > 0.9); // Uniform = max entropy
    }

    #[test]
    fn test_rate_deviation_detected() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(
            0.10,
            20,
            1000,
            &["DuplicatePayment"],
            &["DuplicatePayment"],
            &[("DuplicatePayment", 20)],
        );
        let result = analyzer.analyze(&data);
        // actual=0.02, configured=0.10, accuracy=1-0.08/0.10=0.2
        assert!(!result.passes);
        assert!(result.rate_accuracy < 0.7);
    }

    #[test]
    fn test_missing_scheme_types() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(
            0.05,
            50,
            1000,
            &[
                "DuplicatePayment",
                "SplitTransaction",
                "GhostEmployee",
                "RoundTripping",
                "FictitiousTransaction",
            ],
            &["DuplicatePayment"],
            &[("DuplicatePayment", 50)],
        );
        let result = analyzer.analyze(&data);
        assert!(!result.passes);
        assert_eq!(result.scheme_coverage, 0.2); // Only 1 of 5
    }

    #[test]
    fn test_zero_records_handles_gracefully() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(0.05, 0, 0, &["DuplicatePayment"], &[], &[]);
        let result = analyzer.analyze(&data);
        // Should not panic
        assert!(!result.passes);
    }

    #[test]
    fn test_zero_configured_rate_without_fraud_is_accurate() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(0.0, 0, 100, &[], &[], &[]);
        let result = analyzer.analyze(&data);
        assert_eq!(result.actual_rate, 0.0);
        assert_eq!(result.rate_accuracy, 1.0);
    }

    #[test]
    fn test_zero_configured_rate_with_fraud_is_inaccurate() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(0.0, 5, 100, &[], &["A"], &[("A", 5)]);
        let result = analyzer.analyze(&data);
        assert_eq!(result.rate_accuracy, 0.0);
        assert!(!result.passes);
    }

    #[test]
    fn test_uniform_distribution_high_entropy() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(
            0.05,
            100,
            2000,
            &["A", "B", "C", "D"],
            &["A", "B", "C", "D"],
            &[("A", 25), ("B", 25), ("C", 25), ("D", 25)],
        );
        let result = analyzer.analyze(&data);
        assert!(result.scheme_distribution_entropy > 0.99);
        assert!(result.passes, "issues: {:?}", result.issues);
    }

    #[test]
    fn test_skewed_distribution_low_entropy() {
        let analyzer = FraudPackAnalyzer::with_defaults();
        let data = pack(
            0.05,
            100,
            2000,
            &["A", "B", "C"],
            &["A", "B", "C"],
            &[("A", 98), ("B", 1), ("C", 1)],
        );
        let result = analyzer.analyze(&data);
        assert!(result.scheme_distribution_entropy < 0.5);
    }
}