use std::sync::Arc;
use serde::{Deserialize, Serialize};
use datasynth_core::error::SynthError;
use datasynth_core::llm::{LlmProvider, LlmRequest};
/// Company profile that scheme design is tailored to.
///
/// Every populated field is rendered into the LLM design prompt. `company_size` and
/// `industry` are also interpolated into the narratives of the built-in fallback templates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyContext {
/// Industry label, emitted as the prompt's `Industry:` line (this file's tests use
/// `"manufacturing"`). Free-form: nothing here restricts the accepted values.
pub industry: String,
/// Size label, emitted as the prompt's `Size:` line (this file's tests use `"medium"`).
pub company_size: String,
/// Country, emitted as the prompt's `Country:` line (this file's tests use `"US"`).
pub country: String,
/// Approximate headcount; when present the prompt gets an `Employees: ~N` line.
pub employee_count: Option<u32>,
/// Free-form revenue description (this file's tests use `"$50M-100M"`); when present it is
/// copied verbatim into the prompt's `Revenue:` line.
pub annual_revenue: Option<String>,
}
/// Description of the control environment the designed schemes are meant to exploit.
///
/// The lists are passed to the LLM prompt comma-joined, and they also decide which of the
/// built-in fallback templates apply.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ControlContext {
/// Maturity label, emitted as the prompt's `Control maturity:` line (this file's tests use
/// `"ad_hoc"`, `"repeatable"` and `"managed"`). Free-form: no validation happens here.
pub maturity_level: String,
/// Names or ids of weak controls. The fallback procurement template is selected when any
/// entry contains `"three_way"`, `"C003"` or `"procurement"`.
pub weak_controls: Vec<String>,
/// Segregation-of-duties gaps; a non-empty list selects the fallback SoD template.
pub sod_gaps: Vec<String>,
/// When `true`, the prompt notes that an external audit is currently active.
pub audit_active: bool,
/// IT general control gaps (emitted as `IT gaps:`); a non-empty list makes the fallback
/// IT-bypass template eligible if the scheme limit has not been reached yet.
pub itgc_gaps: Vec<String>,
}
/// One designed fraud scheme, either deserialized from the LLM's JSON reply or produced by a
/// built-in fallback template.
///
/// The field names are the JSON keys requested from the model in the system prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DesignedScheme {
/// Short identifier (the fallback templates use snake_case names such as
/// `procurement_kickback`).
pub name: String,
/// Prose description of the scheme; the system prompt asks the model for 2-3 sentences.
pub narrative: String,
/// Control weaknesses the scheme exploits.
pub exploited_weaknesses: Vec<String>,
/// Observable indicators an analyst should look for.
pub detection_signals: Vec<String>,
/// Stages of the scheme, in the order given by the producer.
pub stages: Vec<DesignedStage>,
/// Difficulty label. The system prompt asks for one of `trivial`, `easy`, `moderate`,
/// `hard` or `expert`, but the field is a plain string and is not validated here.
pub difficulty: String,
/// `(min_total, max_total)` impact; represented in JSON as a two-element array.
/// NOTE(review): the unit/currency is not established anywhere in this file.
pub impact_range: (f64, f64),
/// Realism confidence; the system prompt asks for a value in 0.0-1.0 (not enforced when
/// parsing). Parsed LLM schemes are ordered by this score, highest first.
pub realism_score: f64,
}
/// A single stage within a [`DesignedScheme`].
///
/// The `*_min`/`*_max` pairs are ranges supplied by the scheme producer; nothing in this file
/// checks that each minimum is at most its maximum, apart from a unit test over the built-in
/// fallback templates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DesignedStage {
/// Stage identifier (e.g. `testing`, `escalation`, `acceleration` in the fallback templates).
pub name: String,
/// What happens during this stage.
pub description: String,
/// Length of the stage in months.
pub duration_months: u32,
/// Lower bound of the stage's amount range.
/// NOTE(review): presumably per transaction rather than per stage - confirm with consumers.
pub amount_min: f64,
/// Upper bound of the stage's amount range (same caveat as `amount_min`).
pub amount_max: f64,
/// Lower bound of the stage's transaction count range.
pub transaction_count_min: u32,
/// Upper bound of the stage's transaction count range.
pub transaction_count_max: u32,
/// Concealment technique identifiers; the system prompt lists the values the model is
/// asked to choose from (e.g. `document_manipulation`, `transaction_splitting`).
pub concealment: Vec<String>,
}
/// Outcome of one [`AnomalyDesigner::design`] call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DesignResult {
/// The designed schemes, at most as many as were requested.
pub schemes: Vec<DesignedScheme>,
/// One-line summary of the company and control context the schemes were designed for.
pub context_summary: String,
}
/// Designs fraud schemes for a company/control context by prompting an LLM provider, falling
/// back to built-in templates when the provider fails or its reply cannot be parsed.
pub struct AnomalyDesigner {
/// Provider used for the completion request issued by `design`.
provider: Arc<dyn LlmProvider>,
}
impl AnomalyDesigner {
/// Creates a designer that sends its completion requests to `provider`.
pub fn new(provider: Arc<dyn LlmProvider>) -> Self {
Self { provider }
}
/// Designs up to `max_schemes` fraud schemes for the given company and control environment.
///
/// The provider is sent the design prompt (system prompt attached, temperature 0.4, at most
/// 4096 tokens) and is expected to answer with a JSON array of schemes. If the call fails,
/// or the reply yields no parseable scheme, the built-in fallback templates are used instead.
///
/// A request for zero schemes is answered immediately with an empty list and never reaches
/// the provider.
///
/// # Errors
///
/// As written this never returns `Err`: provider failures are logged and absorbed by the
/// fallback path. The `Result` is kept so the signature stays stable for callers.
pub fn design(
    &self,
    company: &CompanyContext,
    controls: &ControlContext,
    max_schemes: usize,
) -> Result<DesignResult, SynthError> {
    let context_summary = self.build_context_summary(company, controls);

    // Both the parsed and the fallback path truncate to `max_schemes`, so a zero-scheme
    // request can only ever produce an empty list. Return it now instead of spending a
    // provider round-trip on a "Design 0 scheme(s)" prompt.
    if max_schemes == 0 {
        return Ok(DesignResult {
            schemes: Vec::new(),
            context_summary,
        });
    }

    let prompt = self.build_design_prompt(company, controls, max_schemes);
    let request = LlmRequest::new(prompt)
        .with_system(Self::system_prompt().to_string())
        .with_temperature(0.4)
        .with_max_tokens(4096);

    let schemes = match self.provider.complete(&request) {
        Ok(response) => {
            let parsed = self.parse_schemes(&response.content, max_schemes);
            if parsed.is_empty() {
                // An unusable reply is treated exactly like a failed call.
                tracing::debug!("LLM returned no parseable schemes, using fallback");
                Self::fallback_schemes(company, controls, max_schemes)
            } else {
                parsed
            }
        }
        Err(e) => {
            tracing::warn!("LLM scheme design failed: {e}, using fallback templates");
            Self::fallback_schemes(company, controls, max_schemes)
        }
    };

    Ok(DesignResult {
        schemes,
        context_summary,
    })
}
fn build_context_summary(&self, company: &CompanyContext, controls: &ControlContext) -> String {
format!(
"{} {} company in {} | Controls: {} maturity, {} weak controls, {} SoD gaps",
company.company_size,
company.industry,
company.country,
controls.maturity_level,
controls.weak_controls.len(),
controls.sod_gaps.len(),
)
}
/// Builds the user prompt sent to the provider.
///
/// The prompt opens with the request line and the company profile, then lists the control
/// environment. Optional company fields and empty control lists are left out entirely, and
/// the audit note is added only when an audit is active.
fn build_design_prompt(
    &self,
    company: &CompanyContext,
    controls: &ControlContext,
    max_schemes: usize,
) -> String {
    // Collect the prompt as individual lines and stitch them together once at the end.
    let mut sections: Vec<String> = vec![
        format!("Design {max_schemes} realistic fraud scheme(s) for this company:\n\n"),
        format!("Industry: {}\n", company.industry),
        format!("Size: {}\n", company.company_size),
        format!("Country: {}\n", company.country),
    ];

    if let Some(employees) = company.employee_count {
        sections.push(format!("Employees: ~{employees}\n"));
    }
    if let Some(revenue) = company.annual_revenue.as_deref() {
        sections.push(format!("Revenue: {revenue}\n"));
    }

    // The leading newline separates the control section from the company profile.
    sections.push(format!("\nControl maturity: {}\n", controls.maturity_level));

    let comma_joined = |items: &[String]| items.join(", ");
    if !controls.weak_controls.is_empty() {
        sections.push(format!(
            "Weak controls: {}\n",
            comma_joined(&controls.weak_controls)
        ));
    }
    if !controls.sod_gaps.is_empty() {
        sections.push(format!("SoD gaps: {}\n", comma_joined(&controls.sod_gaps)));
    }
    if !controls.itgc_gaps.is_empty() {
        sections.push(format!("IT gaps: {}\n", comma_joined(&controls.itgc_gaps)));
    }

    if controls.audit_active {
        sections.push("Note: external audit is currently active\n".to_string());
    }

    sections.concat()
}
/// Parses the provider's reply into at most `max_schemes` schemes, best first.
///
/// The JSON array is located with `datasynth_core::llm::extract_json_array` and deserialized
/// as a list of [`DesignedScheme`]. The schemes are ranked by `realism_score` in descending
/// order and only then cut down to `max_schemes`, so when the model returns more schemes
/// than requested the highest-scored ones are kept rather than whichever came first.
///
/// Returns an empty vector (and logs at debug level) when no array is found or the array
/// does not deserialize; the caller treats that as "use the fallback templates".
fn parse_schemes(&self, content: &str, max_schemes: usize) -> Vec<DesignedScheme> {
    let json_str = datasynth_core::llm::extract_json_array(content);
    match json_str {
        Some(json) => match serde_json::from_str::<Vec<DesignedScheme>>(json) {
            Ok(mut schemes) => {
                // Rank before truncating: cutting first would discard surplus schemes by
                // position in the reply, possibly dropping a higher-scored one. The sort is
                // stable, so equally scored schemes keep the model's original order.
                schemes.sort_by(|a, b| {
                    b.realism_score
                        .partial_cmp(&a.realism_score)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });
                schemes.truncate(max_schemes);
                schemes
            }
            Err(e) => {
                tracing::debug!("Failed to parse LLM schemes: {e}");
                Vec::new()
            }
        },
        None => {
            tracing::debug!("No JSON array found in LLM scheme response");
            Vec::new()
        }
    }
}
fn fallback_schemes(
company: &CompanyContext,
controls: &ControlContext,
max_schemes: usize,
) -> Vec<DesignedScheme> {
let mut schemes = Vec::new();
if controls
.weak_controls
.iter()
.any(|c| c.contains("three_way") || c.contains("C003") || c.contains("procurement"))
{
schemes.push(DesignedScheme {
name: "procurement_kickback".to_string(),
narrative: format!(
"A procurement manager at this {} {} company exploits weak three-way match \
controls to process inflated invoices from a colluding vendor. The kickback \
is structured as consulting fees to a separate entity.",
company.company_size, company.industry
),
exploited_weaknesses: vec!["three_way_match".to_string()],
detection_signals: vec![
"Vendor invoice amounts consistently 10-20% above market".to_string(),
"Single-source procurement without competitive bids".to_string(),
"Correlated timing between vendor payments and personal deposits".to_string(),
],
stages: vec![
DesignedStage {
name: "testing".to_string(),
description: "Small inflated invoices to test detection".to_string(),
duration_months: 2,
amount_min: 500.0,
amount_max: 2000.0,
transaction_count_min: 2,
transaction_count_max: 5,
concealment: vec!["document_manipulation".to_string()],
},
DesignedStage {
name: "escalation".to_string(),
description: "Larger invoices with fabricated services".to_string(),
duration_months: 6,
amount_min: 2000.0,
amount_max: 15000.0,
transaction_count_min: 3,
transaction_count_max: 8,
concealment: vec![
"document_manipulation".to_string(),
"approval_circumvention".to_string(),
],
},
DesignedStage {
name: "acceleration".to_string(),
description: "Maximum extraction before potential detection".to_string(),
duration_months: 3,
amount_min: 10000.0,
amount_max: 50000.0,
transaction_count_min: 5,
transaction_count_max: 12,
concealment: vec![
"transaction_splitting".to_string(),
"timing_exploitation".to_string(),
],
},
],
difficulty: "hard".to_string(),
impact_range: (25000.0, 250000.0),
realism_score: 0.85,
});
}
if !controls.sod_gaps.is_empty() {
schemes.push(DesignedScheme {
name: "sod_exploitation".to_string(),
narrative: format!(
"An employee with conflicting duties ({}) creates and approves their own \
expense reimbursements and vendor payments, gradually increasing amounts.",
controls.sod_gaps.first().cloned().unwrap_or_default()
),
exploited_weaknesses: controls.sod_gaps.clone(),
detection_signals: vec![
"Same user creates and approves transactions".to_string(),
"Expense amounts cluster just below approval thresholds".to_string(),
"Weekend/after-hours transaction approvals".to_string(),
],
stages: vec![
DesignedStage {
name: "setup".to_string(),
description: "Establish pattern of legitimate self-approvals".to_string(),
duration_months: 1,
amount_min: 100.0,
amount_max: 500.0,
transaction_count_min: 3,
transaction_count_max: 6,
concealment: vec!["approval_circumvention".to_string()],
},
DesignedStage {
name: "exploitation".to_string(),
description: "Fraudulent reimbursements mixed with legitimate ones"
.to_string(),
duration_months: 8,
amount_min: 500.0,
amount_max: 9500.0,
transaction_count_min: 2,
transaction_count_max: 6,
concealment: vec![
"approval_circumvention".to_string(),
"transaction_splitting".to_string(),
],
},
],
difficulty: "moderate".to_string(),
impact_range: (5000.0, 75000.0),
realism_score: 0.80,
});
}
if schemes.len() < max_schemes {
schemes.push(DesignedScheme {
name: "revenue_timing_manipulation".to_string(),
narrative: format!(
"Management at this {} company accelerates revenue recognition \
near quarter-end to meet analyst expectations, booking sales before \
delivery criteria are met.",
company.industry
),
exploited_weaknesses: vec!["period_close_pressure".to_string()],
detection_signals: vec![
"Spike in revenue entries in last 3 days of quarter".to_string(),
"Reversals in first week of new quarter".to_string(),
"Revenue entries without matching delivery documentation".to_string(),
],
stages: vec![
DesignedStage {
name: "channel_stuffing".to_string(),
description: "Push sales to distributors with side agreements".to_string(),
duration_months: 1,
amount_min: 50000.0,
amount_max: 500000.0,
transaction_count_min: 5,
transaction_count_max: 20,
concealment: vec![
"timing_exploitation".to_string(),
"document_manipulation".to_string(),
],
},
DesignedStage {
name: "reversal_concealment".to_string(),
description: "Offset reversals through credit notes and returns"
.to_string(),
duration_months: 2,
amount_min: 10000.0,
amount_max: 200000.0,
transaction_count_min: 3,
transaction_count_max: 10,
concealment: vec![
"account_misclassification".to_string(),
"timing_exploitation".to_string(),
],
},
],
difficulty: "hard".to_string(),
impact_range: (100000.0, 2000000.0),
realism_score: 0.75,
});
}
if !controls.itgc_gaps.is_empty() && schemes.len() < max_schemes {
schemes.push(DesignedScheme {
name: "it_control_bypass".to_string(),
narrative: format!(
"An IT administrator exploits weak change management controls ({}) \
to modify transaction records directly in the database, creating \
ghost vendor payments that bypass application-level controls.",
controls.itgc_gaps.first().cloned().unwrap_or_default()
),
exploited_weaknesses: controls.itgc_gaps.clone(),
detection_signals: vec![
"Database changes without corresponding application audit trail".to_string(),
"Vendor master data changes outside business hours".to_string(),
"Payments to newly created vendors with no purchase history".to_string(),
],
stages: vec![DesignedStage {
name: "direct_manipulation".to_string(),
description: "Direct database inserts for fictitious vendor payments"
.to_string(),
duration_months: 4,
amount_min: 5000.0,
amount_max: 25000.0,
transaction_count_min: 1,
transaction_count_max: 3,
concealment: vec![
"it_control_bypass".to_string(),
"audit_trail_manipulation".to_string(),
],
}],
difficulty: "expert".to_string(),
impact_range: (20000.0, 100000.0),
realism_score: 0.70,
});
}
schemes.truncate(max_schemes);
schemes
}
/// Returns the fixed system prompt sent with every design request.
///
/// It sets the model's role, states the design criteria, describes the JSON schema of a
/// scheme (the keys mirror the fields of `DesignedScheme` / `DesignedStage`), lists the
/// accepted concealment techniques and asks for a bare JSON array as the reply.
fn system_prompt() -> &'static str {
    // A trailing `\` joins the next source line and drops its leading indentation, so this
    // is one continuous literal; only the explicit `\n` escapes produce line breaks.
    "You are a fraud risk expert designing realistic fraud scenarios for synthetic data \
     generation. Given a company profile and control environment, design fraud schemes that:\n\
     1. Exploit the SPECIFIC control weaknesses described\n\
     2. Follow realistic multi-stage progressions (testing → escalation → acceleration)\n\
     3. Include concealment techniques appropriate to the control maturity level\n\
     4. Scale amounts realistically for the company size and industry\n\n\
     Return a JSON array of scheme objects. Each scheme has:\n\
     - name: short identifier\n\
     - narrative: 2-3 sentence description of the scheme\n\
     - exploited_weaknesses: [list of control weaknesses exploited]\n\
     - detection_signals: [observable indicators an analyst should look for]\n\
     - stages: [{name, description, duration_months, amount_min, amount_max, \
     transaction_count_min, transaction_count_max, concealment: [techniques]}]\n\
     - difficulty: \"trivial\" | \"easy\" | \"moderate\" | \"hard\" | \"expert\"\n\
     - impact_range: [min_total, max_total]\n\
     - realism_score: 0.0-1.0 confidence in realism\n\n\
     Valid concealment techniques: document_manipulation, approval_circumvention, \
     timing_exploitation, transaction_splitting, account_misclassification, \
     collusion, it_control_bypass, audit_trail_manipulation\n\n\
     Return ONLY the JSON array.\n"
}
}
/// Test-only shorthand that forwards to `datasynth_core::llm::extract_json_array`, so the
/// unit tests below can call it unqualified through `use super::*`.
#[cfg(test)]
fn extract_json_array(content: &str) -> Option<&str> {
datasynth_core::llm::extract_json_array(content)
}
/// An append-only collection of previously designed scheme sets, each tagged with the
/// industry and maturity level it was designed for. It can be persisted to and restored from
/// a JSON file with `save` / `load`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SchemeLibrary {
/// Stored entries in insertion order; lookups return the first match.
entries: Vec<SchemeLibraryEntry>,
}
/// One stored design result together with the lookup keys it was filed under.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemeLibraryEntry {
/// Context summary copied from the originating `DesignResult`.
pub context_summary: String,
/// Industry key supplied by the caller of `SchemeLibrary::add`; matched exactly by `find`.
pub industry: String,
/// Maturity-level key supplied by the caller of `SchemeLibrary::add`; matched exactly by
/// `find`.
pub maturity_level: String,
/// The schemes copied from the originating `DesignResult`.
pub schemes: Vec<DesignedScheme>,
}
impl SchemeLibrary {
pub fn new() -> Self {
Self {
entries: Vec::new(),
}
}
pub fn add(&mut self, result: &DesignResult, industry: &str, maturity_level: &str) {
self.entries.push(SchemeLibraryEntry {
context_summary: result.context_summary.clone(),
industry: industry.to_string(),
maturity_level: maturity_level.to_string(),
schemes: result.schemes.clone(),
});
}
pub fn find(&self, industry: &str, maturity_level: &str) -> Option<&[DesignedScheme]> {
self.entries
.iter()
.find(|e| e.industry == industry && e.maturity_level == maturity_level)
.map(|e| e.schemes.as_slice())
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
pub fn save(&self, path: &std::path::Path) -> Result<(), SynthError> {
let json = serde_json::to_string_pretty(self).map_err(|e| {
SynthError::generation(format!("Failed to serialize scheme library: {e}"))
})?;
std::fs::write(path, json)
.map_err(|e| SynthError::generation(format!("Failed to write scheme library: {e}")))?;
Ok(())
}
pub fn load(path: &std::path::Path) -> Result<Self, SynthError> {
let json = std::fs::read_to_string(path)
.map_err(|e| SynthError::generation(format!("Failed to read scheme library: {e}")))?;
let lib: Self = serde_json::from_str(&json)
.map_err(|e| SynthError::generation(format!("Failed to parse scheme library: {e}")))?;
Ok(lib)
}
}
// Unit tests for the designer, its fallback templates and the scheme library.
// NOTE(review): several tests assert on fallback-template names (e.g. "procurement") after a
// full `design` call; that presumably relies on `MockLlmProvider` returning content that does
// not parse as a scheme array - confirm against the mock's implementation.
#[cfg(test)]
// Tests unwrap freely: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
use datasynth_core::llm::MockLlmProvider;
/// Mid-sized US manufacturing company with all optional fields populated.
fn sample_company() -> CompanyContext {
CompanyContext {
industry: "manufacturing".to_string(),
company_size: "medium".to_string(),
country: "US".to_string(),
employee_count: Some(500),
annual_revenue: Some("$50M-100M".to_string()),
}
}
/// Control environment with a weak three-way match and one SoD gap, but no ITGC gaps.
fn weak_controls() -> ControlContext {
ControlContext {
maturity_level: "repeatable".to_string(),
weak_controls: vec!["three_way_match".to_string(), "C003".to_string()],
sod_gaps: vec!["AP clerk approves payments".to_string()],
audit_active: false,
itgc_gaps: vec![],
}
}
/// Control environment with no listed weaknesses and an active audit.
fn strong_controls() -> ControlContext {
ControlContext {
maturity_level: "managed".to_string(),
weak_controls: vec![],
sod_gaps: vec![],
audit_active: true,
itgc_gaps: vec![],
}
}
// Weak three-way match plus an SoD gap should yield both matching schemes.
#[test]
fn test_design_with_weak_controls() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let result = designer
.design(&sample_company(), &weak_controls(), 3)
.unwrap();
assert!(!result.schemes.is_empty());
assert!(!result.context_summary.is_empty());
let has_procurement = result
.schemes
.iter()
.any(|s| s.name.contains("procurement"));
assert!(
has_procurement,
"Should design procurement fraud for weak 3-way match"
);
let has_sod = result.schemes.iter().any(|s| s.name.contains("sod"));
assert!(has_sod, "Should design SoD exploitation scheme");
}
// With no listed weaknesses something is still designed, but not the procurement scheme.
#[test]
fn test_design_with_strong_controls() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let result = designer
.design(&sample_company(), &strong_controls(), 2)
.unwrap();
assert!(!result.schemes.is_empty());
let has_procurement = result
.schemes
.iter()
.any(|s| s.name.contains("procurement"));
assert!(!has_procurement);
}
// ITGC gaps alone, with room for three schemes, should surface the IT-bypass scheme.
#[test]
fn test_design_with_itgc_gaps() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let controls = ControlContext {
maturity_level: "ad_hoc".to_string(),
weak_controls: vec![],
sod_gaps: vec![],
audit_active: false,
itgc_gaps: vec!["weak_change_management".to_string()],
};
let result = designer.design(&sample_company(), &controls, 3).unwrap();
let has_it = result.schemes.iter().any(|s| s.name.contains("it_control"));
assert!(has_it, "Should design IT control bypass scheme");
}
// Sanity-checks the built-in templates directly, without going through a provider:
// non-empty text, scores in (0, 1], and every min/max pair correctly ordered.
#[test]
fn test_fallback_scheme_stages_are_valid() {
let schemes = AnomalyDesigner::fallback_schemes(&sample_company(), &weak_controls(), 5);
for scheme in &schemes {
assert!(!scheme.name.is_empty());
assert!(!scheme.narrative.is_empty());
assert!(!scheme.stages.is_empty());
assert!(scheme.realism_score > 0.0 && scheme.realism_score <= 1.0);
assert!(scheme.impact_range.0 < scheme.impact_range.1);
for stage in &scheme.stages {
assert!(!stage.name.is_empty());
assert!(stage.duration_months > 0);
assert!(stage.amount_min <= stage.amount_max);
assert!(stage.transaction_count_min <= stage.transaction_count_max);
}
}
}
// In-memory add/find: exact-key lookup hits, a different key pair misses.
#[test]
fn test_scheme_library_roundtrip() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let result = designer
.design(&sample_company(), &weak_controls(), 2)
.unwrap();
let mut lib = SchemeLibrary::new();
assert!(lib.is_empty());
lib.add(&result, "manufacturing", "repeatable");
assert_eq!(lib.len(), 1);
let found = lib.find("manufacturing", "repeatable");
assert!(found.is_some());
assert!(!found.unwrap().is_empty());
assert!(lib.find("retail", "managed").is_none());
}
// Persists a one-entry library to a temporary directory and reads it back.
#[test]
fn test_scheme_library_save_load() {
let mut lib = SchemeLibrary::new();
lib.add(
&DesignResult {
schemes: vec![DesignedScheme {
name: "test_scheme".to_string(),
narrative: "Test".to_string(),
exploited_weaknesses: vec![],
detection_signals: vec![],
stages: vec![],
difficulty: "moderate".to_string(),
impact_range: (1000.0, 5000.0),
realism_score: 0.8,
}],
context_summary: "test".to_string(),
},
"retail",
"defined",
);
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("library.json");
lib.save(&path).unwrap();
let loaded = SchemeLibrary::load(&path).unwrap();
assert_eq!(loaded.len(), 1);
assert!(loaded.find("retail", "defined").is_some());
}
// Feeds a well-formed JSON array straight into the parser; the provider is not consulted.
#[test]
fn test_parse_valid_llm_schemes() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let json = r#"[{
"name": "llm_scheme",
"narrative": "A scheme designed by LLM",
"exploited_weaknesses": ["weak_auth"],
"detection_signals": ["unusual timing"],
"stages": [{
"name": "phase1",
"description": "Initial phase",
"duration_months": 3,
"amount_min": 1000.0,
"amount_max": 5000.0,
"transaction_count_min": 2,
"transaction_count_max": 8,
"concealment": ["timing_exploitation"]
}],
"difficulty": "hard",
"impact_range": [5000.0, 50000.0],
"realism_score": 0.9
}]"#;
let schemes = designer.parse_schemes(json, 5);
assert_eq!(schemes.len(), 1);
assert_eq!(schemes[0].name, "llm_scheme");
assert!((schemes[0].realism_score - 0.9).abs() < 1e-10);
assert_eq!(schemes[0].stages.len(), 1);
assert_eq!(schemes[0].stages[0].duration_months, 3);
}
// Every weakness category is present, yet the result must respect the cap of two.
#[test]
fn test_max_schemes_limit() {
let provider = Arc::new(MockLlmProvider::new(42));
let designer = AnomalyDesigner::new(provider);
let controls = ControlContext {
maturity_level: "ad_hoc".to_string(),
weak_controls: vec!["three_way_match".to_string()],
sod_gaps: vec!["AP approves".to_string()],
audit_active: false,
itgc_gaps: vec!["change_mgmt".to_string()],
};
let result = designer.design(&sample_company(), &controls, 2).unwrap();
assert!(result.schemes.len() <= 2);
}
// An array embedded in surrounding prose is found; text without brackets yields `None`.
#[test]
fn test_extract_json_array() {
let input = "Here are schemes: [{\"name\": \"a\"}] end";
assert!(extract_json_array(input).is_some());
assert!(extract_json_array("no array here").is_none());
}
}