1use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12
13use super::{
14 BaselineModelType, BaselineResult, BenchmarkBuilder, BenchmarkSuite, BenchmarkTaskType,
15 MetricType,
16};
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct IndustryBenchmarkAnalysis {
21 pub industry: String,
23 pub industry_anomaly_count: usize,
25 pub industry_anomaly_rate: f64,
27 pub transaction_type_distribution: HashMap<String, usize>,
29 pub terminology_coverage: f64,
31 pub regulatory_alignment: f64,
33 pub issues: Vec<String>,
35}
36
37impl Default for IndustryBenchmarkAnalysis {
38 fn default() -> Self {
39 Self {
40 industry: String::new(),
41 industry_anomaly_count: 0,
42 industry_anomaly_rate: 0.0,
43 transaction_type_distribution: HashMap::new(),
44 terminology_coverage: 1.0,
45 regulatory_alignment: 1.0,
46 issues: Vec::new(),
47 }
48 }
49}
50
51pub fn manufacturing_fraud_5k() -> BenchmarkSuite {
57 let mut class_dist = HashMap::new();
58 class_dist.insert("normal".to_string(), 4500);
59 class_dist.insert("yield_manipulation".to_string(), 150);
60 class_dist.insert("labor_misallocation".to_string(), 120);
61 class_dist.insert("phantom_production".to_string(), 50);
62 class_dist.insert("standard_cost_manipulation".to_string(), 80);
63 class_dist.insert("inventory_fraud".to_string(), 60);
64 class_dist.insert("scrap_fraud".to_string(), 40);
65
66 BenchmarkBuilder::new("manufacturing-fraud-5k", "Manufacturing-Fraud-5K")
67 .description("5K manufacturing transactions with industry-specific fraud patterns. Tests detection of yield manipulation, labor fraud, and inventory schemes.")
68 .task_type(BenchmarkTaskType::FraudClassification)
69 .dataset_size(5000, 500)
70 .class_distribution(class_dist)
71 .split_ratios(0.7, 0.15, 0.15, true)
72 .primary_metric(MetricType::MacroF1)
73 .metrics(vec![
74 MetricType::AucRoc,
75 MetricType::AucPr,
76 MetricType::MacroF1,
77 MetricType::Recall,
78 MetricType::Precision,
79 ])
80 .seed(11111)
81 .time_span_days(365)
82 .num_companies(2)
83 .add_baseline(BaselineResult {
84 model_name: "RuleBased-MFG".to_string(),
85 model_type: BaselineModelType::RuleBased,
86 metrics: [
87 ("auc_roc".to_string(), 0.68),
88 ("macro_f1".to_string(), 0.35),
89 ].into_iter().collect(),
90 training_time_seconds: Some(0.0),
91 inference_time_ms: Some(0.5),
92 notes: Some("Manufacturing variance analysis rules".to_string()),
93 })
94 .add_baseline(BaselineResult {
95 model_name: "XGBoost-MFG".to_string(),
96 model_type: BaselineModelType::XgBoost,
97 metrics: [
98 ("auc_roc".to_string(), 0.85),
99 ("macro_f1".to_string(), 0.58),
100 ].into_iter().collect(),
101 training_time_seconds: Some(4.0),
102 inference_time_ms: Some(0.05),
103 notes: Some("With BOM/routing features".to_string()),
104 })
105 .metadata("industry", "manufacturing")
106 .metadata("transaction_types", "production_order,material_issue,labor_posting,variance")
107 .metadata("difficulty", "medium")
108 .build()
109}
110
111pub fn retail_fraud_10k() -> BenchmarkSuite {
117 let mut class_dist = HashMap::new();
118 class_dist.insert("normal".to_string(), 9000);
119 class_dist.insert("sweethearting".to_string(), 300);
120 class_dist.insert("skimming".to_string(), 150);
121 class_dist.insert("refund_fraud".to_string(), 250);
122 class_dist.insert("void_abuse".to_string(), 120);
123 class_dist.insert("gift_card_fraud".to_string(), 80);
124 class_dist.insert("employee_discount_abuse".to_string(), 60);
125 class_dist.insert("vendor_kickback".to_string(), 40);
126
127 BenchmarkBuilder::new("retail-fraud-10k", "Retail-Fraud-10K")
128 .description("10K retail POS transactions with industry-specific fraud patterns. Tests detection of sweethearting, skimming, and refund schemes.")
129 .task_type(BenchmarkTaskType::FraudClassification)
130 .dataset_size(10000, 1000)
131 .class_distribution(class_dist)
132 .split_ratios(0.7, 0.15, 0.15, true)
133 .primary_metric(MetricType::AucPr)
134 .metrics(vec![
135 MetricType::AucPr,
136 MetricType::AucRoc,
137 MetricType::MacroF1,
138 MetricType::PrecisionAtK(100),
139 MetricType::Recall,
140 ])
141 .seed(22222)
142 .time_span_days(90)
143 .num_companies(1)
144 .add_baseline(BaselineResult {
145 model_name: "RuleBased-Retail".to_string(),
146 model_type: BaselineModelType::RuleBased,
147 metrics: [
148 ("auc_pr".to_string(), 0.42),
149 ("auc_roc".to_string(), 0.72),
150 ].into_iter().collect(),
151 training_time_seconds: Some(0.0),
152 inference_time_ms: Some(0.2),
153 notes: Some("POS exception analysis rules".to_string()),
154 })
155 .add_baseline(BaselineResult {
156 model_name: "RandomForest-Retail".to_string(),
157 model_type: BaselineModelType::RandomForest,
158 metrics: [
159 ("auc_pr".to_string(), 0.58),
160 ("auc_roc".to_string(), 0.84),
161 ].into_iter().collect(),
162 training_time_seconds: Some(3.0),
163 inference_time_ms: Some(0.1),
164 notes: Some("With cashier behavior features".to_string()),
165 })
166 .add_baseline(BaselineResult {
167 model_name: "LightGBM-Retail".to_string(),
168 model_type: BaselineModelType::LightGbm,
169 metrics: [
170 ("auc_pr".to_string(), 0.68),
171 ("auc_roc".to_string(), 0.90),
172 ].into_iter().collect(),
173 training_time_seconds: Some(2.0),
174 inference_time_ms: Some(0.05),
175 notes: Some("Optimized with temporal features".to_string()),
176 })
177 .metadata("industry", "retail")
178 .metadata("transaction_types", "pos_sale,return,void,discount,gift_card")
179 .metadata("difficulty", "medium")
180 .build()
181}
182
183pub fn healthcare_fraud_5k() -> BenchmarkSuite {
189 let mut class_dist = HashMap::new();
190 class_dist.insert("normal".to_string(), 4500);
191 class_dist.insert("upcoding".to_string(), 150);
192 class_dist.insert("unbundling".to_string(), 100);
193 class_dist.insert("phantom_billing".to_string(), 50);
194 class_dist.insert("duplicate_billing".to_string(), 80);
195 class_dist.insert("kickback".to_string(), 40);
196 class_dist.insert("medical_necessity_abuse".to_string(), 60);
197 class_dist.insert("dme_fraud".to_string(), 20);
198
199 BenchmarkBuilder::new("healthcare-fraud-5k", "Healthcare-Fraud-5K")
200 .description("5K healthcare revenue cycle transactions with industry-specific fraud patterns. Tests detection of upcoding, unbundling, and kickbacks under HIPAA/Stark/FCA compliance.")
201 .task_type(BenchmarkTaskType::FraudClassification)
202 .dataset_size(5000, 500)
203 .class_distribution(class_dist)
204 .split_ratios(0.7, 0.15, 0.15, true)
205 .primary_metric(MetricType::AucPr)
206 .metrics(vec![
207 MetricType::AucPr,
208 MetricType::AucRoc,
209 MetricType::MacroF1,
210 MetricType::Recall,
211 MetricType::PrecisionAtK(50),
212 ])
213 .seed(33333)
214 .time_span_days(365)
215 .num_companies(1)
216 .add_baseline(BaselineResult {
217 model_name: "NCCI-Edits".to_string(),
218 model_type: BaselineModelType::RuleBased,
219 metrics: [
220 ("auc_pr".to_string(), 0.35),
221 ("auc_roc".to_string(), 0.65),
222 ].into_iter().collect(),
223 training_time_seconds: Some(0.0),
224 inference_time_ms: Some(1.0),
225 notes: Some("CMS NCCI edit-based detection".to_string()),
226 })
227 .add_baseline(BaselineResult {
228 model_name: "ClaimAnalytics".to_string(),
229 model_type: BaselineModelType::RandomForest,
230 metrics: [
231 ("auc_pr".to_string(), 0.52),
232 ("auc_roc".to_string(), 0.80),
233 ].into_iter().collect(),
234 training_time_seconds: Some(5.0),
235 inference_time_ms: Some(0.2),
236 notes: Some("With ICD-10/CPT coding features".to_string()),
237 })
238 .add_baseline(BaselineResult {
239 model_name: "DeepClaim".to_string(),
240 model_type: BaselineModelType::NeuralNetwork,
241 metrics: [
242 ("auc_pr".to_string(), 0.65),
243 ("auc_roc".to_string(), 0.88),
244 ].into_iter().collect(),
245 training_time_seconds: Some(30.0),
246 inference_time_ms: Some(2.0),
247 notes: Some("Embedding-based claim analysis".to_string()),
248 })
249 .metadata("industry", "healthcare")
250 .metadata("regulatory_framework", "hipaa,stark,anti_kickback,fca")
251 .metadata("coding_systems", "icd10,cpt,drg,hcpcs")
252 .metadata("difficulty", "hard")
253 .build()
254}
255
256pub fn technology_fraud_3k() -> BenchmarkSuite {
262 let mut class_dist = HashMap::new();
263 class_dist.insert("normal".to_string(), 2700);
264 class_dist.insert("premature_revenue".to_string(), 100);
265 class_dist.insert("side_letter_abuse".to_string(), 60);
266 class_dist.insert("channel_stuffing".to_string(), 50);
267 class_dist.insert("improper_capitalization".to_string(), 50);
268 class_dist.insert("useful_life_manipulation".to_string(), 40);
269
270 BenchmarkBuilder::new("technology-fraud-3k", "Technology-Fraud-3K")
271 .description("3K technology sector transactions with SaaS/license revenue and R&D fraud patterns. Tests detection of ASC 606 violations and improper capitalization.")
272 .task_type(BenchmarkTaskType::FraudClassification)
273 .dataset_size(3000, 300)
274 .class_distribution(class_dist)
275 .split_ratios(0.7, 0.15, 0.15, true)
276 .primary_metric(MetricType::AucPr)
277 .metrics(vec![
278 MetricType::AucPr,
279 MetricType::AucRoc,
280 MetricType::MacroF1,
281 MetricType::Recall,
282 ])
283 .seed(44444)
284 .time_span_days(730) .num_companies(2)
286 .add_baseline(BaselineResult {
287 model_name: "RevenueRules".to_string(),
288 model_type: BaselineModelType::RuleBased,
289 metrics: [
290 ("auc_pr".to_string(), 0.30),
291 ("auc_roc".to_string(), 0.62),
292 ].into_iter().collect(),
293 training_time_seconds: Some(0.0),
294 inference_time_ms: Some(0.5),
295 notes: Some("ASC 606 compliance rules".to_string()),
296 })
297 .add_baseline(BaselineResult {
298 model_name: "ContractML".to_string(),
299 model_type: BaselineModelType::XgBoost,
300 metrics: [
301 ("auc_pr".to_string(), 0.48),
302 ("auc_roc".to_string(), 0.78),
303 ].into_iter().collect(),
304 training_time_seconds: Some(3.0),
305 inference_time_ms: Some(0.1),
306 notes: Some("With contract/performance obligation features".to_string()),
307 })
308 .add_baseline(BaselineResult {
309 model_name: "TemporalLGBM".to_string(),
310 model_type: BaselineModelType::LightGbm,
311 metrics: [
312 ("auc_pr".to_string(), 0.58),
313 ("auc_roc".to_string(), 0.85),
314 ].into_iter().collect(),
315 training_time_seconds: Some(4.0),
316 inference_time_ms: Some(0.08),
317 notes: Some("With temporal revenue patterns".to_string()),
318 })
319 .metadata("industry", "technology")
320 .metadata("revenue_standards", "asc_606,asc_985")
321 .metadata("difficulty", "hard")
322 .build()
323}
324
325pub fn financial_services_fraud_5k() -> BenchmarkSuite {
331 let mut class_dist = HashMap::new();
332 class_dist.insert("normal".to_string(), 4500);
333 class_dist.insert("loan_fraud".to_string(), 150);
334 class_dist.insert("trading_fraud".to_string(), 100);
335 class_dist.insert("account_manipulation".to_string(), 80);
336 class_dist.insert("insurance_fraud".to_string(), 100);
337 class_dist.insert("fee_fraud".to_string(), 70);
338
339 BenchmarkBuilder::new("financial-services-fraud-5k", "FinancialServices-Fraud-5K")
340 .description("5K financial services transactions with banking, insurance, and investment fraud patterns. Tests detection under regulatory frameworks (Basel, Solvency, SEC).")
341 .task_type(BenchmarkTaskType::FraudClassification)
342 .dataset_size(5000, 500)
343 .class_distribution(class_dist)
344 .split_ratios(0.7, 0.15, 0.15, true)
345 .primary_metric(MetricType::AucPr)
346 .metrics(vec![
347 MetricType::AucPr,
348 MetricType::AucRoc,
349 MetricType::MacroF1,
350 MetricType::Recall,
351 MetricType::PrecisionAtK(50),
352 ])
353 .seed(55555)
354 .time_span_days(365)
355 .num_companies(1)
356 .add_baseline(BaselineResult {
357 model_name: "ComplianceRules".to_string(),
358 model_type: BaselineModelType::RuleBased,
359 metrics: [
360 ("auc_pr".to_string(), 0.38),
361 ("auc_roc".to_string(), 0.70),
362 ].into_iter().collect(),
363 training_time_seconds: Some(0.0),
364 inference_time_ms: Some(0.5),
365 notes: Some("Regulatory compliance rules".to_string()),
366 })
367 .add_baseline(BaselineResult {
368 model_name: "FraudNet".to_string(),
369 model_type: BaselineModelType::RandomForest,
370 metrics: [
371 ("auc_pr".to_string(), 0.55),
372 ("auc_roc".to_string(), 0.83),
373 ].into_iter().collect(),
374 training_time_seconds: Some(5.0),
375 inference_time_ms: Some(0.15),
376 notes: Some("With account behavior features".to_string()),
377 })
378 .add_baseline(BaselineResult {
379 model_name: "DeepFraud".to_string(),
380 model_type: BaselineModelType::NeuralNetwork,
381 metrics: [
382 ("auc_pr".to_string(), 0.68),
383 ("auc_roc".to_string(), 0.91),
384 ].into_iter().collect(),
385 training_time_seconds: Some(45.0),
386 inference_time_ms: Some(3.0),
387 notes: Some("LSTM-based sequence model".to_string()),
388 })
389 .metadata("industry", "financial_services")
390 .metadata("regulatory_framework", "basel,sec,finra")
391 .metadata("difficulty", "hard")
392 .build()
393}
394
395pub fn all_industry_benchmarks() -> Vec<BenchmarkSuite> {
397 vec![
398 manufacturing_fraud_5k(),
399 retail_fraud_10k(),
400 healthcare_fraud_5k(),
401 technology_fraud_3k(),
402 financial_services_fraud_5k(),
403 ]
404}
405
406pub fn get_industry_benchmark(industry: &str) -> Option<BenchmarkSuite> {
408 match industry.to_lowercase().as_str() {
409 "manufacturing" => Some(manufacturing_fraud_5k()),
410 "retail" => Some(retail_fraud_10k()),
411 "healthcare" => Some(healthcare_fraud_5k()),
412 "technology" => Some(technology_fraud_3k()),
413 "financial_services" | "financialservices" => Some(financial_services_fraud_5k()),
414 _ => None,
415 }
416}
417
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each label is a key of the suite's class distribution.
    fn assert_has_classes(suite: &BenchmarkSuite, labels: &[&str]) {
        for label in labels {
            assert!(
                suite.dataset.class_distribution.contains_key(*label),
                "class distribution is missing `{}`",
                label
            );
        }
    }

    // Manufacturing suite: id, record count, a representative class, and the industry tag.
    #[test]
    fn test_manufacturing_fraud_5k() {
        let suite = manufacturing_fraud_5k();

        assert_eq!(suite.id, "manufacturing-fraud-5k");
        assert_eq!(suite.dataset.total_records, 5000);
        assert_has_classes(&suite, &["yield_manipulation"]);

        let industry = suite.metadata.get("industry").map(String::as_str);
        assert_eq!(industry, Some("manufacturing"));
    }

    // Retail suite: id, record count, and two representative classes.
    #[test]
    fn test_retail_fraud_10k() {
        let suite = retail_fraud_10k();

        assert_eq!(suite.id, "retail-fraud-10k");
        assert_eq!(suite.dataset.total_records, 10000);
        assert_has_classes(&suite, &["sweethearting", "skimming"]);
    }

    // Healthcare suite: id, two representative classes, and the regulatory metadata key.
    #[test]
    fn test_healthcare_fraud_5k() {
        let suite = healthcare_fraud_5k();

        assert_eq!(suite.id, "healthcare-fraud-5k");
        assert_has_classes(&suite, &["upcoding", "unbundling"]);
        assert!(suite.metadata.contains_key("regulatory_framework"));
    }

    // Technology suite: id and two representative classes.
    #[test]
    fn test_technology_fraud_3k() {
        let suite = technology_fraud_3k();

        assert_eq!(suite.id, "technology-fraud-3k");
        assert_has_classes(&suite, &["premature_revenue", "channel_stuffing"]);
    }

    // Financial services suite: id and two representative classes.
    #[test]
    fn test_financial_services_fraud_5k() {
        let suite = financial_services_fraud_5k();

        assert_eq!(suite.id, "financial-services-fraud-5k");
        assert_has_classes(&suite, &["loan_fraud", "trading_fraud"]);
    }

    // The aggregate list holds all five suites and each one carries an industry tag.
    #[test]
    fn test_all_industry_benchmarks() {
        let suites = all_industry_benchmarks();
        assert_eq!(suites.len(), 5);

        assert!(suites
            .iter()
            .all(|suite| suite.metadata.contains_key("industry")));
    }

    // Every known industry name resolves; an unknown one does not.
    #[test]
    fn test_get_industry_benchmark() {
        let known = [
            "manufacturing",
            "retail",
            "healthcare",
            "technology",
            "financial_services",
        ];
        for name in known {
            assert!(
                get_industry_benchmark(name).is_some(),
                "expected a benchmark for `{}`",
                name
            );
        }

        assert!(get_industry_benchmark("unknown").is_none());
    }

    // The default analysis is empty and starts with full terminology coverage.
    #[test]
    fn test_industry_benchmark_analysis_default() {
        let fresh = IndustryBenchmarkAnalysis::default();

        assert!(fresh.industry.is_empty());
        assert_eq!(fresh.terminology_coverage, 1.0);
        assert!(fresh.issues.is_empty());
    }
}