pub mod acfe;
pub mod industry;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
pub use acfe::{
acfe_calibrated_1k, acfe_collusion_5k, acfe_management_override_2k, all_acfe_benchmarks,
AcfeAlignment, AcfeCalibration, AcfeCategoryDistribution,
};
pub use industry::{
all_industry_benchmarks, financial_services_fraud_5k, get_industry_benchmark,
healthcare_fraud_5k, manufacturing_fraud_5k, retail_fraud_10k, technology_fraud_3k,
IndustryBenchmarkAnalysis,
};
/// A self-contained benchmark definition: identity, task type, dataset
/// specification, evaluation protocol, and reference baseline results.
///
/// Construct via [`BenchmarkBuilder`] or one of this module's factory
/// functions (e.g. [`anomaly_bench_1k`], [`fraud_detect_10k`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSuite {
    /// Stable identifier used by [`get_benchmark`] for lookup
    /// (e.g. "anomaly-bench-1k").
    pub id: String,
    /// Human-readable display name (e.g. "AnomalyBench-1K").
    pub name: String,
    /// Free-form description of the benchmark's contents and goals.
    pub description: String,
    /// Version string; the builder defaults this to "1.0.0".
    pub version: String,
    /// The kind of ML task this benchmark evaluates.
    pub task_type: BenchmarkTaskType,
    /// Size, class balance, features, and split configuration of the data.
    pub dataset: DatasetSpec,
    /// Metrics and settings used to score submissions.
    pub evaluation: EvaluationSpec,
    /// Reference model results to compare new submissions against.
    pub baselines: Vec<BaselineResult>,
    /// Arbitrary key/value annotations (e.g. "domain", "difficulty").
    pub metadata: HashMap<String, String>,
}
/// The kind of machine-learning task a [`BenchmarkSuite`] evaluates.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum BenchmarkTaskType {
    /// Binary anomaly detection over transactions.
    AnomalyDetection,
    /// Multi-class classification of fraud types.
    FraudClassification,
    /// Detection of data-quality issues (missing values, typos, etc.).
    DataQualityDetection,
    /// Deciding whether two records refer to the same real-world entity.
    EntityMatching,
    /// Anomaly detection where a record may carry multiple labels.
    MultiLabelAnomalyDetection,
    /// Regression on transaction amounts.
    AmountPrediction,
    /// Anomaly detection over time-ordered data.
    TimeSeriesAnomalyDetection,
    /// Fraud detection using entity-graph structure.
    GraphFraudDetection,
}
/// Describes the dataset behind a benchmark: size, class balance, feature
/// makeup, and train/validation/test split configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSpec {
    /// Total number of records in the dataset.
    pub total_records: usize,
    /// Number of positive (anomalous/fraudulent) records.
    pub positive_count: usize,
    /// Number of negative records; [`BenchmarkBuilder::dataset_size`]
    /// keeps this equal to `total_records - positive_count`.
    pub negative_count: usize,
    /// Per-class record counts keyed by class label.
    pub class_distribution: HashMap<String, usize>,
    /// Counts and descriptions of the dataset's features.
    pub features: FeatureSet,
    /// Train/validation/test split proportions.
    pub split_ratios: SplitRatios,
    /// RNG seed for reproducible dataset generation.
    pub seed: u64,
    /// Time span the records cover, in days.
    pub time_span_days: u32,
    /// Number of distinct companies represented in the data.
    pub num_companies: usize,
}
/// Summary of a dataset's feature columns, bucketed by type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureSet {
    /// Number of numerical features.
    pub numerical_count: usize,
    /// Number of categorical features.
    pub categorical_count: usize,
    /// Number of temporal (date/time) features.
    pub temporal_count: usize,
    /// Number of free-text features.
    pub text_count: usize,
    /// Optional human-readable description per feature name.
    pub feature_descriptions: HashMap<String, String>,
}
/// Train/validation/test split proportions for a benchmark dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SplitRatios {
    /// Fraction of records assigned to the training split.
    pub train: f64,
    /// Fraction assigned to the validation split.
    pub validation: f64,
    /// Fraction assigned to the test split.
    pub test: f64,
    /// When true, records are split by time rather than at random
    /// (so later records land in validation/test).
    pub temporal_split: bool,
}
impl Default for SplitRatios {
fn default() -> Self {
Self {
train: 0.7,
validation: 0.15,
test: 0.15,
temporal_split: false,
}
}
}
/// How a benchmark is scored: which metrics to compute and under which
/// settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationSpec {
    /// The headline metric used to rank submissions.
    pub primary_metric: MetricType,
    /// Full set of metrics reported for each submission.
    pub metrics: Vec<MetricType>,
    /// Decision threshold for turning scores into class labels, if any.
    pub classification_threshold: Option<f64>,
    /// Number of cross-validation folds, when cross-validation is used.
    pub cv_folds: Option<usize>,
    /// Optional asymmetric misclassification costs.
    pub cost_matrix: Option<CostMatrix>,
}
/// Evaluation metrics a benchmark may report.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum MetricType {
    /// Area under the ROC curve.
    AucRoc,
    /// Area under the precision-recall curve.
    AucPr,
    /// Precision among the top-K ranked predictions.
    PrecisionAtK(usize),
    /// Recall among the top-K ranked predictions.
    RecallAtK(usize),
    /// Harmonic mean of precision and recall.
    F1Score,
    Precision,
    Recall,
    Accuracy,
    /// Matthews correlation coefficient.
    Mcc,
    /// Mean average precision.
    Map,
    /// Normalized discounted cumulative gain.
    Ndcg,
    /// Mean squared error (regression).
    Mse,
    /// Mean absolute error (regression).
    Mae,
    /// Coefficient of determination (regression).
    R2,
    /// Logarithmic loss (cross-entropy).
    LogLoss,
    /// Cohen's kappa inter-rater agreement.
    CohenKappa,
    /// F1 averaged equally across classes.
    MacroF1,
    /// F1 weighted by class support.
    WeightedF1,
}
/// Asymmetric misclassification costs for cost-sensitive evaluation.
///
/// The `Default` impl weights a false negative ten times a false positive,
/// with correct decisions costing nothing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMatrix {
    /// Cost of flagging a normal record as positive.
    pub false_positive_cost: f64,
    /// Cost of missing a true positive.
    pub false_negative_cost: f64,
    /// Cost incurred for a correct positive prediction.
    pub true_positive_cost: f64,
    /// Cost incurred for a correct negative prediction.
    pub true_negative_cost: f64,
}
impl Default for CostMatrix {
fn default() -> Self {
Self {
false_positive_cost: 1.0,
false_negative_cost: 10.0, true_positive_cost: 0.0,
true_negative_cost: 0.0,
}
}
}
/// A reference result from a baseline model, shipped with a benchmark so
/// new submissions have something to beat.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineResult {
    /// Display name of the baseline (e.g. "XGBoost", "FuzzyMatch").
    pub model_name: String,
    /// Category of model that produced the result.
    pub model_type: BaselineModelType,
    /// Metric scores keyed by metric name (e.g. "auc_roc" -> 0.92).
    pub metrics: HashMap<String, f64>,
    /// Wall-clock training time in seconds, when measured.
    pub training_time_seconds: Option<f64>,
    /// Per-record inference latency in milliseconds, when measured.
    pub inference_time_ms: Option<f64>,
    /// Free-form notes about the configuration or caveats.
    pub notes: Option<String>,
}
/// Category of model used to produce a [`BaselineResult`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum BaselineModelType {
    /// Uniformly random scoring (floor reference).
    Random,
    /// Always predicts the most frequent class.
    MajorityClass,
    /// Hand-written rules (regexes, thresholds, exact matching).
    RuleBased,
    IsolationForest,
    OneClassSvm,
    /// Local Outlier Factor.
    Lof,
    LogisticRegression,
    RandomForest,
    XgBoost,
    LightGbm,
    NeuralNetwork,
    /// Graph neural network (e.g. GraphSAGE, GAT).
    Gnn,
    Autoencoder,
    /// Any model type not covered above, identified by name.
    Custom(String),
}
/// A single row on a benchmark leaderboard.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LeaderboardEntry {
    /// 1-based position on the leaderboard.
    pub rank: usize,
    /// Name of the submission.
    pub submission_name: String,
    /// Who submitted it.
    pub submitter: String,
    /// Short description of the submitted model.
    pub model_description: String,
    /// Score on the benchmark's primary metric; used for ranking.
    pub primary_score: f64,
    /// All reported scores keyed by metric name.
    pub all_scores: HashMap<String, f64>,
    // NOTE(review): stringly-typed date; format not enforced here —
    // presumably ISO 8601, confirm with whatever writes it.
    pub submission_date: String,
    /// True when this entry is one of the shipped baselines.
    pub is_baseline: bool,
}
/// Fluent builder for [`BenchmarkSuite`] values; see
/// [`BenchmarkBuilder::new`] for the defaults it starts from.
pub struct BenchmarkBuilder {
    // The suite being assembled; handed out by `build`.
    suite: BenchmarkSuite,
}
impl BenchmarkBuilder {
pub fn new(id: &str, name: &str) -> Self {
Self {
suite: BenchmarkSuite {
id: id.to_string(),
name: name.to_string(),
description: String::new(),
version: "1.0.0".to_string(),
task_type: BenchmarkTaskType::AnomalyDetection,
dataset: DatasetSpec {
total_records: 1000,
positive_count: 50,
negative_count: 950,
class_distribution: HashMap::new(),
features: FeatureSet {
numerical_count: 10,
categorical_count: 5,
temporal_count: 3,
text_count: 1,
feature_descriptions: HashMap::new(),
},
split_ratios: SplitRatios::default(),
seed: 42,
time_span_days: 365,
num_companies: 1,
},
evaluation: EvaluationSpec {
primary_metric: MetricType::AucRoc,
metrics: vec![MetricType::AucRoc, MetricType::AucPr, MetricType::F1Score],
classification_threshold: Some(0.5),
cv_folds: None,
cost_matrix: None,
},
baselines: Vec::new(),
metadata: HashMap::new(),
},
}
}
pub fn description(mut self, desc: &str) -> Self {
self.suite.description = desc.to_string();
self
}
pub fn task_type(mut self, task_type: BenchmarkTaskType) -> Self {
self.suite.task_type = task_type;
self
}
pub fn dataset_size(mut self, total: usize, positive: usize) -> Self {
self.suite.dataset.total_records = total;
self.suite.dataset.positive_count = positive;
self.suite.dataset.negative_count = total.saturating_sub(positive);
self
}
pub fn class_distribution(mut self, distribution: HashMap<String, usize>) -> Self {
self.suite.dataset.class_distribution = distribution;
self
}
pub fn split_ratios(mut self, train: f64, val: f64, test: f64, temporal: bool) -> Self {
self.suite.dataset.split_ratios = SplitRatios {
train,
validation: val,
test,
temporal_split: temporal,
};
self
}
pub fn primary_metric(mut self, metric: MetricType) -> Self {
self.suite.evaluation.primary_metric = metric;
self
}
pub fn metrics(mut self, metrics: Vec<MetricType>) -> Self {
self.suite.evaluation.metrics = metrics;
self
}
pub fn add_baseline(mut self, baseline: BaselineResult) -> Self {
self.suite.baselines.push(baseline);
self
}
pub fn seed(mut self, seed: u64) -> Self {
self.suite.dataset.seed = seed;
self
}
pub fn time_span_days(mut self, days: u32) -> Self {
self.suite.dataset.time_span_days = days;
self
}
pub fn num_companies(mut self, n: usize) -> Self {
self.suite.dataset.num_companies = n;
self
}
pub fn metadata(mut self, key: &str, value: &str) -> Self {
self.suite
.metadata
.insert(key.to_string(), value.to_string());
self
}
pub fn build(self) -> BenchmarkSuite {
self.suite
}
}
/// AnomalyBench-1K: 1,000 journal entries with a 5% anomaly rate, ranked
/// by AUC-PR, with random, IsolationForest, and XGBoost reference
/// baselines. Tagged "accounting" / "easy".
pub fn anomaly_bench_1k() -> BenchmarkSuite {
    let class_dist = HashMap::from([
        ("normal".to_string(), 950),
        ("anomaly".to_string(), 50),
    ]);
    BenchmarkBuilder::new("anomaly-bench-1k", "AnomalyBench-1K")
        .description("1000 journal entry transactions with 5% anomaly rate. Balanced mix of fraud, error, and statistical anomalies.")
        .task_type(BenchmarkTaskType::AnomalyDetection)
        .dataset_size(1000, 50)
        .class_distribution(class_dist)
        .split_ratios(0.7, 0.15, 0.15, true)
        .primary_metric(MetricType::AucPr)
        .metrics(vec![
            MetricType::AucRoc,
            MetricType::AucPr,
            MetricType::F1Score,
            MetricType::PrecisionAtK(10),
            MetricType::PrecisionAtK(50),
            MetricType::Recall,
        ])
        .seed(42)
        .time_span_days(90)
        .num_companies(1)
        .add_baseline(BaselineResult {
            model_name: "Random".to_string(),
            model_type: BaselineModelType::Random,
            metrics: HashMap::from([
                ("auc_roc".to_string(), 0.50),
                ("auc_pr".to_string(), 0.05),
                ("f1".to_string(), 0.09),
            ]),
            training_time_seconds: Some(0.0),
            inference_time_ms: Some(0.01),
            notes: Some("Random baseline for reference".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "IsolationForest".to_string(),
            model_type: BaselineModelType::IsolationForest,
            metrics: HashMap::from([
                ("auc_roc".to_string(), 0.78),
                ("auc_pr".to_string(), 0.42),
                ("f1".to_string(), 0.45),
            ]),
            training_time_seconds: Some(0.5),
            inference_time_ms: Some(0.1),
            notes: Some("Unsupervised baseline".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "XGBoost".to_string(),
            model_type: BaselineModelType::XgBoost,
            metrics: HashMap::from([
                ("auc_roc".to_string(), 0.92),
                ("auc_pr".to_string(), 0.68),
                ("f1".to_string(), 0.72),
            ]),
            training_time_seconds: Some(2.0),
            inference_time_ms: Some(0.05),
            notes: Some("Supervised baseline with full labels".to_string()),
        })
        .metadata("domain", "accounting")
        .metadata("difficulty", "easy")
        .build()
}
/// FraudDetect-10K: 10,000 journal entries labelled with six fraud types
/// plus "normal" (300 fraudulent total), ranked by macro-F1, with
/// majority-class, RandomForest, and LightGBM reference baselines.
/// Tagged "accounting" / "medium".
pub fn fraud_detect_10k() -> BenchmarkSuite {
    let class_dist = HashMap::from([
        ("normal".to_string(), 9700),
        ("fictitious_transaction".to_string(), 80),
        ("duplicate_payment".to_string(), 60),
        ("round_tripping".to_string(), 40),
        ("threshold_manipulation".to_string(), 50),
        ("self_approval".to_string(), 30),
        ("other_fraud".to_string(), 40),
    ]);
    BenchmarkBuilder::new("fraud-detect-10k", "FraudDetect-10K")
        .description("10K journal entries with multi-class fraud labels. Includes 6 fraud types with realistic class imbalance.")
        .task_type(BenchmarkTaskType::FraudClassification)
        .dataset_size(10000, 300)
        .class_distribution(class_dist)
        .split_ratios(0.7, 0.15, 0.15, true)
        .primary_metric(MetricType::MacroF1)
        .metrics(vec![
            MetricType::AucRoc,
            MetricType::MacroF1,
            MetricType::WeightedF1,
            MetricType::Recall,
            MetricType::Precision,
            MetricType::CohenKappa,
        ])
        .seed(12345)
        .time_span_days(365)
        .num_companies(3)
        .add_baseline(BaselineResult {
            model_name: "MajorityClass".to_string(),
            model_type: BaselineModelType::MajorityClass,
            metrics: HashMap::from([
                ("macro_f1".to_string(), 0.07),
                ("weighted_f1".to_string(), 0.94),
            ]),
            training_time_seconds: Some(0.0),
            inference_time_ms: Some(0.01),
            notes: Some("Predicts normal for all transactions".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "RandomForest".to_string(),
            model_type: BaselineModelType::RandomForest,
            metrics: HashMap::from([
                ("macro_f1".to_string(), 0.58),
                ("weighted_f1".to_string(), 0.96),
                ("auc_roc".to_string(), 0.89),
            ]),
            training_time_seconds: Some(5.0),
            inference_time_ms: Some(0.2),
            notes: Some("Balanced class weights".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "LightGBM".to_string(),
            model_type: BaselineModelType::LightGbm,
            metrics: HashMap::from([
                ("macro_f1".to_string(), 0.65),
                ("weighted_f1".to_string(), 0.97),
                ("auc_roc".to_string(), 0.93),
            ]),
            training_time_seconds: Some(3.0),
            inference_time_ms: Some(0.05),
            notes: Some("Optimized hyperparameters".to_string()),
        })
        .metadata("domain", "accounting")
        .metadata("difficulty", "medium")
        .build()
}
/// DataQuality-100K: 100,000 records, 10% carrying one of six
/// data-quality defects, ranked by F1, with rule-based, logistic
/// regression, and XGBoost reference baselines.
/// Tagged "data_quality" / "medium".
pub fn data_quality_100k() -> BenchmarkSuite {
    let class_dist = HashMap::from([
        ("clean".to_string(), 90000),
        ("missing_value".to_string(), 3000),
        ("typo".to_string(), 2000),
        ("format_error".to_string(), 2000),
        ("duplicate".to_string(), 1500),
        ("encoding_issue".to_string(), 1000),
        ("truncation".to_string(), 500),
    ]);
    BenchmarkBuilder::new("data-quality-100k", "DataQuality-100K")
        .description("100K records with various data quality issues. Tests detection of missing values, typos, format errors, duplicates, and encoding issues.")
        .task_type(BenchmarkTaskType::DataQualityDetection)
        .dataset_size(100000, 10000)
        .class_distribution(class_dist)
        .split_ratios(0.8, 0.1, 0.1, false)
        .primary_metric(MetricType::F1Score)
        .metrics(vec![
            MetricType::F1Score,
            MetricType::Precision,
            MetricType::Recall,
            MetricType::AucRoc,
            MetricType::MacroF1,
        ])
        .seed(99999)
        .time_span_days(730)
        .num_companies(5)
        .add_baseline(BaselineResult {
            model_name: "RuleBased".to_string(),
            model_type: BaselineModelType::RuleBased,
            metrics: HashMap::from([
                ("f1".to_string(), 0.72),
                ("precision".to_string(), 0.85),
                ("recall".to_string(), 0.62),
            ]),
            training_time_seconds: Some(0.0),
            inference_time_ms: Some(0.5),
            notes: Some("Regex patterns and null checks".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "LogisticRegression".to_string(),
            model_type: BaselineModelType::LogisticRegression,
            metrics: HashMap::from([
                ("f1".to_string(), 0.78),
                ("precision".to_string(), 0.80),
                ("recall".to_string(), 0.76),
            ]),
            training_time_seconds: Some(2.0),
            inference_time_ms: Some(0.02),
            notes: Some("Character n-gram features".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "XGBoost".to_string(),
            model_type: BaselineModelType::XgBoost,
            metrics: HashMap::from([
                ("f1".to_string(), 0.88),
                ("precision".to_string(), 0.90),
                ("recall".to_string(), 0.86),
            ]),
            training_time_seconds: Some(15.0),
            inference_time_ms: Some(0.08),
            notes: Some("Mixed feature types".to_string()),
        })
        .metadata("domain", "data_quality")
        .metadata("difficulty", "medium")
        .build()
}
/// EntityMatch-5K: 5,000 record pairs (2,000 matches / 3,000 non-matches)
/// for vendor/customer entity matching, ranked by F1, with exact-match,
/// fuzzy-match, and Magellan-style reference baselines.
/// Tagged "master_data" / "hard".
pub fn entity_match_5k() -> BenchmarkSuite {
    let class_dist = HashMap::from([
        ("match".to_string(), 2000),
        ("non_match".to_string(), 3000),
    ]);
    BenchmarkBuilder::new("entity-match-5k", "EntityMatch-5K")
        .description("5K vendor/customer record pairs for entity matching. Includes name variations, typos, and abbreviations.")
        .task_type(BenchmarkTaskType::EntityMatching)
        .dataset_size(5000, 2000)
        .class_distribution(class_dist)
        .split_ratios(0.7, 0.15, 0.15, false)
        .primary_metric(MetricType::F1Score)
        .metrics(vec![
            MetricType::F1Score,
            MetricType::Precision,
            MetricType::Recall,
            MetricType::AucRoc,
        ])
        .seed(54321)
        .time_span_days(365)
        .num_companies(2)
        .add_baseline(BaselineResult {
            model_name: "ExactMatch".to_string(),
            model_type: BaselineModelType::RuleBased,
            metrics: HashMap::from([
                ("f1".to_string(), 0.35),
                ("precision".to_string(), 1.0),
                ("recall".to_string(), 0.21),
            ]),
            training_time_seconds: Some(0.0),
            inference_time_ms: Some(0.1),
            notes: Some("Exact string matching only".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "FuzzyMatch".to_string(),
            model_type: BaselineModelType::RuleBased,
            metrics: HashMap::from([
                ("f1".to_string(), 0.68),
                ("precision".to_string(), 0.72),
                ("recall".to_string(), 0.65),
            ]),
            training_time_seconds: Some(0.0),
            inference_time_ms: Some(2.0),
            notes: Some("Levenshtein distance threshold".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "Magellan".to_string(),
            model_type: BaselineModelType::RandomForest,
            metrics: HashMap::from([
                ("f1".to_string(), 0.89),
                ("precision".to_string(), 0.91),
                ("recall".to_string(), 0.87),
            ]),
            training_time_seconds: Some(10.0),
            inference_time_ms: Some(5.0),
            notes: Some("Feature-based entity matcher".to_string()),
        })
        .metadata("domain", "master_data")
        .metadata("difficulty", "hard")
        .build()
}
/// GraphFraud-10K: 10,000 transactions with entity-graph structure and
/// 500 fraud-network positives, ranked by AUC-PR, with node-feature
/// XGBoost, GraphSAGE, and GAT reference baselines.
/// Tagged "graph_analytics" / "hard".
pub fn graph_fraud_10k() -> BenchmarkSuite {
    let class_dist = HashMap::from([
        ("normal".to_string(), 9500),
        ("fraud_network".to_string(), 500),
    ]);
    BenchmarkBuilder::new("graph-fraud-10k", "GraphFraud-10K")
        .description("10K transactions with entity graph structure. Fraud detection using network features and GNN models.")
        .task_type(BenchmarkTaskType::GraphFraudDetection)
        .dataset_size(10000, 500)
        .class_distribution(class_dist)
        .split_ratios(0.7, 0.15, 0.15, true)
        .primary_metric(MetricType::AucPr)
        .metrics(vec![
            MetricType::AucPr,
            MetricType::AucRoc,
            MetricType::F1Score,
            MetricType::PrecisionAtK(100),
        ])
        .seed(77777)
        .time_span_days(365)
        .num_companies(4)
        .add_baseline(BaselineResult {
            model_name: "NodeFeatures".to_string(),
            model_type: BaselineModelType::XgBoost,
            metrics: HashMap::from([
                ("auc_pr".to_string(), 0.45),
                ("auc_roc".to_string(), 0.78),
            ]),
            training_time_seconds: Some(5.0),
            inference_time_ms: Some(0.1),
            notes: Some("XGBoost on node features only".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "GraphSAGE".to_string(),
            model_type: BaselineModelType::Gnn,
            metrics: HashMap::from([
                ("auc_pr".to_string(), 0.62),
                ("auc_roc".to_string(), 0.88),
            ]),
            training_time_seconds: Some(60.0),
            inference_time_ms: Some(5.0),
            notes: Some("2-layer GraphSAGE".to_string()),
        })
        .add_baseline(BaselineResult {
            model_name: "GAT".to_string(),
            model_type: BaselineModelType::Gnn,
            metrics: HashMap::from([
                ("auc_pr".to_string(), 0.68),
                ("auc_roc".to_string(), 0.91),
            ]),
            training_time_seconds: Some(90.0),
            inference_time_ms: Some(8.0),
            notes: Some("Graph Attention Network".to_string()),
        })
        .metadata("domain", "graph_analytics")
        .metadata("difficulty", "hard")
        .build()
}
/// Returns every benchmark suite defined in this module tree: the five
/// core suites first, then the ACFE-aligned and industry-specific
/// collections, in that order.
pub fn all_benchmarks() -> Vec<BenchmarkSuite> {
    let core = [
        anomaly_bench_1k(),
        fraud_detect_10k(),
        data_quality_100k(),
        entity_match_5k(),
        graph_fraud_10k(),
    ];
    core.into_iter()
        .chain(all_acfe_benchmarks())
        .chain(all_industry_benchmarks())
        .collect()
}
/// Looks up a benchmark suite by its stable `id`, returning `None` when
/// no suite matches. Builds the full list on every call, so this is a
/// convenience lookup rather than a hot-path index.
pub fn get_benchmark(id: &str) -> Option<BenchmarkSuite> {
    for suite in all_benchmarks() {
        if suite.id == id {
            return Some(suite);
        }
    }
    None
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    // AnomalyBench-1K exposes its declared id, size, positive count, and
    // its three reference baselines.
    #[test]
    fn test_anomaly_bench_1k() {
        let bench = anomaly_bench_1k();
        assert_eq!(bench.id, "anomaly-bench-1k");
        assert_eq!(bench.dataset.total_records, 1000);
        assert_eq!(bench.dataset.positive_count, 50);
        assert_eq!(bench.baselines.len(), 3);
    }

    // FraudDetect-10K is registered as a multi-class fraud task.
    #[test]
    fn test_fraud_detect_10k() {
        let bench = fraud_detect_10k();
        assert_eq!(bench.id, "fraud-detect-10k");
        assert_eq!(bench.dataset.total_records, 10000);
        assert_eq!(bench.task_type, BenchmarkTaskType::FraudClassification);
    }

    // DataQuality-100K distinguishes more than five defect classes
    // (clean + six issue types).
    #[test]
    fn test_data_quality_100k() {
        let bench = data_quality_100k();
        assert_eq!(bench.id, "data-quality-100k");
        assert_eq!(bench.dataset.total_records, 100000);
        assert!(bench.dataset.class_distribution.len() > 5);
    }

    // Full registry: 5 core suites + the ACFE and industry collections
    // (3 + 5 per this module's re-exports) = 13, each with at least one
    // baseline.
    #[test]
    fn test_all_benchmarks() {
        let benchmarks = all_benchmarks();
        assert_eq!(benchmarks.len(), 13);
        for bench in &benchmarks {
            assert!(
                !bench.baselines.is_empty(),
                "Benchmark {} has no baselines",
                bench.id
            );
        }
    }

    // Lookup by id finds registered suites and rejects unknown ids.
    #[test]
    fn test_get_benchmark() {
        assert!(get_benchmark("fraud-detect-10k").is_some());
        assert!(get_benchmark("nonexistent").is_none());
    }

    // The builder applies overrides and leaves untouched defaults intact.
    #[test]
    fn test_builder() {
        let bench = BenchmarkBuilder::new("custom", "Custom Benchmark")
            .description("A custom test benchmark")
            .task_type(BenchmarkTaskType::AnomalyDetection)
            .dataset_size(500, 25)
            .seed(123)
            .build();
        assert_eq!(bench.id, "custom");
        assert_eq!(bench.dataset.total_records, 500);
        assert_eq!(bench.dataset.seed, 123);
    }
}