1use rust_decimal::Decimal;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16
17use super::{
18 BaselineModelType, BaselineResult, BenchmarkBuilder, BenchmarkSuite, BenchmarkTaskType,
19 MetricType,
20};
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct AcfeCalibration {
25 pub median_loss: Decimal,
27 pub median_duration_months: u32,
29 pub category_distribution: AcfeCategoryDistribution,
31 pub detection_method_distribution: HashMap<String, f64>,
33 pub perpetrator_department_distribution: HashMap<String, f64>,
35}
36
37impl Default for AcfeCalibration {
38 fn default() -> Self {
39 Self {
40 median_loss: Decimal::new(117_000, 0),
41 median_duration_months: 12,
42 category_distribution: AcfeCategoryDistribution::default(),
43 detection_method_distribution: default_detection_methods(),
44 perpetrator_department_distribution: default_perpetrator_departments(),
45 }
46 }
47}
48
49#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct AcfeCategoryDistribution {
52 pub asset_misappropriation: f64,
54 pub corruption: f64,
56 pub financial_statement_fraud: f64,
58 pub asset_misappropriation_median: Decimal,
60 pub corruption_median: Decimal,
62 pub financial_statement_fraud_median: Decimal,
64}
65
66impl Default for AcfeCategoryDistribution {
67 fn default() -> Self {
68 Self {
69 asset_misappropriation: 0.86,
70 corruption: 0.33,
71 financial_statement_fraud: 0.10,
72 asset_misappropriation_median: Decimal::new(100_000, 0),
73 corruption_median: Decimal::new(150_000, 0),
74 financial_statement_fraud_median: Decimal::new(954_000, 0),
75 }
76 }
77}
78
/// Returns the built-in share of cases for each detection method, keyed by
/// method name. The ten shares add up to 1.0.
fn default_detection_methods() -> HashMap<String, f64> {
    const SHARES: [(&str, f64); 10] = [
        ("tip", 0.42),
        ("internal_audit", 0.16),
        ("management_review", 0.12),
        ("by_accident", 0.06),
        ("external_audit", 0.04),
        ("account_reconciliation", 0.05),
        ("document_examination", 0.04),
        ("surveillance", 0.02),
        ("it_controls", 0.02),
        ("other", 0.07),
    ];

    SHARES
        .iter()
        .map(|&(method, share)| (method.to_string(), share))
        .collect()
}
93
/// Returns the built-in share of cases for each perpetrator department, keyed
/// by department name. The nine shares add up to 1.0.
fn default_perpetrator_departments() -> HashMap<String, f64> {
    const SHARES: [(&str, f64); 9] = [
        ("accounting", 0.21),
        ("operations", 0.17),
        ("executive_management", 0.12),
        ("sales", 0.11),
        ("customer_service", 0.09),
        ("purchasing", 0.07),
        ("finance", 0.05),
        ("warehousing", 0.05),
        ("other", 0.13),
    ];

    SHARES
        .iter()
        .map(|&(department, share)| (department.to_string(), share))
        .collect()
}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct AcfeAlignment {
111 pub category_distribution_mad: f64,
113 pub median_loss_ratio: f64,
115 pub duration_distribution_ks: f64,
117 pub detection_method_chi_sq: f64,
119 pub perpetrator_department_chi_sq: f64,
121 pub overall_alignment: f64,
123 pub issues: Vec<String>,
125}
126
127impl Default for AcfeAlignment {
128 fn default() -> Self {
129 Self {
130 category_distribution_mad: 0.0,
131 median_loss_ratio: 1.0,
132 duration_distribution_ks: 0.0,
133 detection_method_chi_sq: 0.0,
134 perpetrator_department_chi_sq: 0.0,
135 overall_alignment: 1.0,
136 issues: Vec::new(),
137 }
138 }
139}
140
141impl AcfeAlignment {
142 pub fn calculate(
144 observed_category_dist: &HashMap<String, f64>,
145 observed_median_loss: Decimal,
146 observed_median_duration: u32,
147 _observed_detection_methods: &HashMap<String, f64>,
148 _observed_perpetrator_depts: &HashMap<String, f64>,
149 ) -> Self {
150 let calibration = AcfeCalibration::default();
151 let mut alignment = Self::default();
152 let mut score_components: Vec<f64> = Vec::new();
153
154 let expected_cat = HashMap::from([
156 (
157 "asset_misappropriation".to_string(),
158 calibration.category_distribution.asset_misappropriation,
159 ),
160 (
161 "corruption".to_string(),
162 calibration.category_distribution.corruption,
163 ),
164 (
165 "financial_statement_fraud".to_string(),
166 calibration.category_distribution.financial_statement_fraud,
167 ),
168 ]);
169
170 let mut total_deviation = 0.0;
171 let mut count = 0;
172 for (cat, expected) in &expected_cat {
173 let observed = observed_category_dist.get(cat).copied().unwrap_or(0.0);
174 total_deviation += (observed - expected).abs();
175 count += 1;
176 }
177 alignment.category_distribution_mad = if count > 0 {
178 total_deviation / count as f64
179 } else {
180 0.0
181 };
182
183 let category_score = (1.0 - alignment.category_distribution_mad * 5.0).max(0.0);
185 score_components.push(category_score);
186 if alignment.category_distribution_mad > 0.10 {
187 alignment.issues.push(format!(
188 "Category distribution deviates from ACFE by MAD={:.2}",
189 alignment.category_distribution_mad
190 ));
191 }
192
193 let expected_loss_f64 = calibration
195 .median_loss
196 .to_string()
197 .parse::<f64>()
198 .unwrap_or(117000.0);
199 let observed_loss_f64 = observed_median_loss
200 .to_string()
201 .parse::<f64>()
202 .unwrap_or(0.0);
203 alignment.median_loss_ratio = if expected_loss_f64 > 0.0 {
204 observed_loss_f64 / expected_loss_f64
205 } else {
206 0.0
207 };
208
209 let loss_score = if alignment.median_loss_ratio >= 0.5 && alignment.median_loss_ratio <= 2.0
211 {
212 1.0 - ((alignment.median_loss_ratio - 1.0).abs() * 0.5)
213 } else {
214 0.2
215 };
216 score_components.push(loss_score);
217 if alignment.median_loss_ratio < 0.5 || alignment.median_loss_ratio > 2.0 {
218 alignment.issues.push(format!(
219 "Median loss ratio {:.2}x differs significantly from ACFE benchmark",
220 alignment.median_loss_ratio
221 ));
222 }
223
224 let expected_duration = calibration.median_duration_months as f64;
226 let observed_duration = observed_median_duration as f64;
227 let duration_ratio = if expected_duration > 0.0 {
228 observed_duration / expected_duration
229 } else {
230 0.0
231 };
232
233 let duration_score = if (0.5..=2.0).contains(&duration_ratio) {
234 1.0 - ((duration_ratio - 1.0).abs() * 0.5)
235 } else {
236 0.2
237 };
238 score_components.push(duration_score);
239 if !(0.5..=2.0).contains(&duration_ratio) {
240 alignment.issues.push(format!(
241 "Median duration {}mo differs from ACFE benchmark of {}mo",
242 observed_median_duration, calibration.median_duration_months
243 ));
244 }
245
246 alignment.overall_alignment = if !score_components.is_empty() {
248 score_components.iter().sum::<f64>() / score_components.len() as f64
249 } else {
250 1.0
251 };
252
253 alignment
254 }
255}
256
257pub fn acfe_calibrated_1k() -> BenchmarkSuite {
259 let mut class_dist = HashMap::new();
260 class_dist.insert("normal".to_string(), 900);
262 class_dist.insert("asset_misappropriation".to_string(), 43); class_dist.insert("corruption".to_string(), 17); class_dist.insert("financial_statement_fraud".to_string(), 5); class_dist.insert("mixed_scheme".to_string(), 35); BenchmarkBuilder::new("acfe-calibrated-1k", "ACFE-Calibrated-1K")
268 .description("1K transactions calibrated to ACFE Report to the Nations statistics. Tests fraud category distribution, loss amounts, and duration patterns.")
269 .task_type(BenchmarkTaskType::FraudClassification)
270 .dataset_size(1000, 100)
271 .class_distribution(class_dist)
272 .split_ratios(0.7, 0.15, 0.15, true)
273 .primary_metric(MetricType::MacroF1)
274 .metrics(vec![
275 MetricType::AucRoc,
276 MetricType::AucPr,
277 MetricType::MacroF1,
278 MetricType::WeightedF1,
279 MetricType::Recall,
280 MetricType::Precision,
281 ])
282 .seed(2024)
283 .time_span_days(365)
284 .num_companies(2)
285 .add_baseline(BaselineResult {
286 model_name: "Random".to_string(),
287 model_type: BaselineModelType::Random,
288 metrics: [
289 ("auc_roc".to_string(), 0.50),
290 ("macro_f1".to_string(), 0.10),
291 ].into_iter().collect(),
292 training_time_seconds: Some(0.0),
293 inference_time_ms: Some(0.01),
294 notes: Some("Random baseline".to_string()),
295 })
296 .add_baseline(BaselineResult {
297 model_name: "IsolationForest".to_string(),
298 model_type: BaselineModelType::IsolationForest,
299 metrics: [
300 ("auc_roc".to_string(), 0.75),
301 ("macro_f1".to_string(), 0.35),
302 ].into_iter().collect(),
303 training_time_seconds: Some(0.5),
304 inference_time_ms: Some(0.1),
305 notes: Some("Unsupervised, tuned for ACFE patterns".to_string()),
306 })
307 .add_baseline(BaselineResult {
308 model_name: "XGBoost-ACFE".to_string(),
309 model_type: BaselineModelType::XgBoost,
310 metrics: [
311 ("auc_roc".to_string(), 0.88),
312 ("macro_f1".to_string(), 0.62),
313 ].into_iter().collect(),
314 training_time_seconds: Some(3.0),
315 inference_time_ms: Some(0.05),
316 notes: Some("Supervised with ACFE-informed features".to_string()),
317 })
318 .metadata("calibration_source", "ACFE Report to the Nations 2024")
319 .metadata("median_loss", "117000")
320 .metadata("median_duration_months", "12")
321 .metadata("domain", "fraud_detection")
322 .metadata("difficulty", "medium")
323 .build()
324}
325
326pub fn acfe_collusion_5k() -> BenchmarkSuite {
328 let mut class_dist = HashMap::new();
329 class_dist.insert("normal".to_string(), 4500);
330 class_dist.insert("solo_fraud".to_string(), 300);
331 class_dist.insert("two_person_collusion".to_string(), 120);
332 class_dist.insert("ring_collusion".to_string(), 50);
333 class_dist.insert("external_collusion".to_string(), 30);
334
335 BenchmarkBuilder::new("acfe-collusion-5k", "ACFE-Collusion-5K")
336 .description("5K transactions for collusion detection. ACFE reports collusion cases have 2x median loss. Tests detection of coordinated fraud networks.")
337 .task_type(BenchmarkTaskType::FraudClassification)
338 .dataset_size(5000, 500)
339 .class_distribution(class_dist)
340 .split_ratios(0.7, 0.15, 0.15, true)
341 .primary_metric(MetricType::AucPr)
342 .metrics(vec![
343 MetricType::AucPr,
344 MetricType::AucRoc,
345 MetricType::MacroF1,
346 MetricType::PrecisionAtK(50),
347 MetricType::Recall,
348 ])
349 .seed(12345)
350 .time_span_days(730) .num_companies(3)
352 .add_baseline(BaselineResult {
353 model_name: "NodeFeatures".to_string(),
354 model_type: BaselineModelType::XgBoost,
355 metrics: [
356 ("auc_pr".to_string(), 0.35),
357 ("auc_roc".to_string(), 0.72),
358 ].into_iter().collect(),
359 training_time_seconds: Some(2.0),
360 inference_time_ms: Some(0.05),
361 notes: Some("Without relationship features".to_string()),
362 })
363 .add_baseline(BaselineResult {
364 model_name: "NetworkFeatures".to_string(),
365 model_type: BaselineModelType::XgBoost,
366 metrics: [
367 ("auc_pr".to_string(), 0.52),
368 ("auc_roc".to_string(), 0.84),
369 ].into_iter().collect(),
370 training_time_seconds: Some(5.0),
371 inference_time_ms: Some(0.1),
372 notes: Some("With entity relationship features".to_string()),
373 })
374 .add_baseline(BaselineResult {
375 model_name: "GNN-Collusion".to_string(),
376 model_type: BaselineModelType::Gnn,
377 metrics: [
378 ("auc_pr".to_string(), 0.68),
379 ("auc_roc".to_string(), 0.91),
380 ].into_iter().collect(),
381 training_time_seconds: Some(60.0),
382 inference_time_ms: Some(5.0),
383 notes: Some("Graph neural network for network patterns".to_string()),
384 })
385 .metadata("collusion_multiplier", "2.0")
386 .metadata("domain", "fraud_detection")
387 .metadata("difficulty", "hard")
388 .build()
389}
390
391pub fn acfe_management_override_2k() -> BenchmarkSuite {
393 let mut class_dist = HashMap::new();
394 class_dist.insert("normal".to_string(), 1800);
395 class_dist.insert("journal_entry_override".to_string(), 80);
396 class_dist.insert("revenue_manipulation".to_string(), 50);
397 class_dist.insert("reserve_manipulation".to_string(), 40);
398 class_dist.insert("expense_capitalization".to_string(), 30);
399
400 BenchmarkBuilder::new("acfe-management-override-2k", "ACFE-ManagementOverride-2K")
401 .description("2K transactions testing management override detection. ACFE reports executive fraud has 6x higher median loss. Tests detection of sophisticated C-suite fraud patterns.")
402 .task_type(BenchmarkTaskType::FraudClassification)
403 .dataset_size(2000, 200)
404 .class_distribution(class_dist)
405 .split_ratios(0.7, 0.15, 0.15, true)
406 .primary_metric(MetricType::AucPr)
407 .metrics(vec![
408 MetricType::AucPr,
409 MetricType::AucRoc,
410 MetricType::MacroF1,
411 MetricType::Recall,
412 MetricType::PrecisionAtK(20),
413 ])
414 .seed(99999)
415 .time_span_days(1095) .num_companies(1)
417 .add_baseline(BaselineResult {
418 model_name: "RuleBased".to_string(),
419 model_type: BaselineModelType::RuleBased,
420 metrics: [
421 ("auc_pr".to_string(), 0.25),
422 ("auc_roc".to_string(), 0.65),
423 ].into_iter().collect(),
424 training_time_seconds: Some(0.0),
425 inference_time_ms: Some(0.5),
426 notes: Some("Traditional audit analytics rules".to_string()),
427 })
428 .add_baseline(BaselineResult {
429 model_name: "Autoencoder".to_string(),
430 model_type: BaselineModelType::Autoencoder,
431 metrics: [
432 ("auc_pr".to_string(), 0.42),
433 ("auc_roc".to_string(), 0.78),
434 ].into_iter().collect(),
435 training_time_seconds: Some(30.0),
436 inference_time_ms: Some(1.0),
437 notes: Some("Reconstruction-based anomaly detection".to_string()),
438 })
439 .add_baseline(BaselineResult {
440 model_name: "LightGBM-Override".to_string(),
441 model_type: BaselineModelType::LightGbm,
442 metrics: [
443 ("auc_pr".to_string(), 0.58),
444 ("auc_roc".to_string(), 0.86),
445 ].into_iter().collect(),
446 training_time_seconds: Some(5.0),
447 inference_time_ms: Some(0.05),
448 notes: Some("With temporal and approval chain features".to_string()),
449 })
450 .metadata("executive_loss_multiplier", "6.0")
451 .metadata("domain", "fraud_detection")
452 .metadata("difficulty", "expert")
453 .build()
454}
455
456pub fn all_acfe_benchmarks() -> Vec<BenchmarkSuite> {
458 vec![
459 acfe_calibrated_1k(),
460 acfe_collusion_5k(),
461 acfe_management_override_2k(),
462 ]
463}
464
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned share map from `(name, share)` pairs.
    fn shares(pairs: &[(&str, f64)]) -> HashMap<String, f64> {
        pairs
            .iter()
            .map(|&(name, share)| (name.to_string(), share))
            .collect()
    }

    /// The default calibration exposes the built-in benchmark values.
    #[test]
    fn test_acfe_calibration_defaults() {
        let defaults = AcfeCalibration::default();

        assert_eq!(defaults.median_loss, Decimal::new(117_000, 0));
        assert_eq!(defaults.median_duration_months, 12);
        assert_eq!(defaults.category_distribution.asset_misappropriation, 0.86);
    }

    /// Observations close to the benchmark yield a high overall score.
    #[test]
    fn test_acfe_alignment_calculate() {
        let observed_cat = shares(&[
            ("asset_misappropriation", 0.85),
            ("corruption", 0.30),
            ("financial_statement_fraud", 0.08),
        ]);
        let no_data: HashMap<String, f64> = HashMap::new();

        let result = AcfeAlignment::calculate(
            &observed_cat,
            Decimal::new(120_000, 0),
            10,
            &no_data,
            &no_data,
        );

        assert!(result.overall_alignment > 0.7);
        assert!(result.median_loss_ratio > 0.9);
        assert!(result.median_loss_ratio < 1.1);
    }

    /// Observations far from the benchmark record issues and score low.
    #[test]
    fn test_acfe_alignment_poor() {
        let observed_cat = shares(&[
            ("asset_misappropriation", 0.50),
            ("corruption", 0.50),
            ("financial_statement_fraud", 0.50),
        ]);
        let no_data: HashMap<String, f64> = HashMap::new();

        let result = AcfeAlignment::calculate(
            &observed_cat,
            Decimal::new(500_000, 0),
            36,
            &no_data,
            &no_data,
        );

        assert!(!result.issues.is_empty());
        assert!(result.overall_alignment < 0.7);
    }

    #[test]
    fn test_acfe_calibrated_1k() {
        let suite = acfe_calibrated_1k();

        assert_eq!(suite.id, "acfe-calibrated-1k");
        assert_eq!(suite.dataset.total_records, 1000);
        assert!(suite.metadata.contains_key("calibration_source"));
        assert!(!suite.baselines.is_empty());
    }

    #[test]
    fn test_acfe_collusion_5k() {
        let suite = acfe_collusion_5k();

        assert_eq!(suite.id, "acfe-collusion-5k");
        assert_eq!(suite.dataset.total_records, 5000);

        let classes = &suite.dataset.class_distribution;
        assert!(classes.contains_key("ring_collusion"));
    }

    #[test]
    fn test_acfe_management_override_2k() {
        let suite = acfe_management_override_2k();

        assert_eq!(suite.id, "acfe-management-override-2k");

        let classes = &suite.dataset.class_distribution;
        assert!(classes.contains_key("journal_entry_override"));
    }

    /// Every suite is returned and tagged with the fraud-detection domain.
    #[test]
    fn test_all_acfe_benchmarks() {
        let suites = all_acfe_benchmarks();
        assert_eq!(suites.len(), 3);

        let expected_domain = "fraud_detection".to_string();
        for suite in &suites {
            assert_eq!(suite.metadata.get("domain"), Some(&expected_domain));
        }
    }
}