use chrono::NaiveDate;
use datasynth_eval::{
AmountDistributionAnalyzer, BenfordAnalyzer, LineItemAnalyzer, LineItemEntry, TemporalAnalyzer,
TemporalEntry,
};
use rust_decimal::Decimal;
use serde::Deserialize;
use std::collections::HashMap;
use std::error::Error;
use std::fs;
/// One journal entry as deserialized from the input JSON array: a document
/// header plus the debit/credit lines posted under it.
#[derive(Debug, Deserialize)]
struct JournalEntry {
/// Document-level metadata (id, dates, source, fraud flag).
header: Header,
/// Individual posting lines; `main` sums their debits and credits to check
/// that the entry balances.
lines: Vec<Line>,
}
/// Header of a journal entry.
///
/// Fields marked `#[allow(dead_code)]` are not read by the visible code in
/// this file; they are declared so that deserialization requires them to be
/// present in the input, and the attribute only silences the unused-field lint.
#[derive(Debug, Deserialize)]
struct Header {
/// Identifier printed when an entry is reported as imbalanced.
document_id: String,
// Deserialized but not read by the report.
#[allow(dead_code)]
company_code: String,
// Deserialized but not read by the report.
#[allow(dead_code)]
fiscal_year: u16,
// Deserialized but not read by the report.
#[allow(dead_code)]
fiscal_period: u8,
/// Posting date as text; `main` parses it with the `%Y-%m-%d` format and
/// feeds it to the temporal analysis.
posting_date: String,
// Deserialized but not read by the report.
#[allow(dead_code)]
document_date: String,
/// Origin label of the entry; tallied for the "Source Distribution" table.
source: String,
/// Optional business-process label; tallied for the
/// "Business Process Distribution" table when present.
business_process: Option<String>,
/// Whether the entry is flagged as fraud; counted for the composition section.
is_fraud: bool,
// Deserialized but not read by the report.
#[allow(dead_code)]
fraud_type: Option<String>,
}
/// A single posting line of a journal entry.
///
/// Amounts arrive as strings and are parsed to `Decimal` in `main`; only
/// strictly positive values are counted there.
#[derive(Debug, Deserialize)]
struct Line {
// Deserialized but not read by the report.
#[allow(dead_code)]
line_number: u32,
// Deserialized but not read by the report.
#[allow(dead_code)]
gl_account: String,
/// Debit amount as a decimal string.
debit_amount: String,
/// Credit amount as a decimal string.
credit_amount: String,
}
fn main() -> Result<(), Box<dyn Error>> {
let args: Vec<String> = std::env::args().collect();
let file_path = args
.get(1)
.map(|s| s.as_str())
.unwrap_or("/tmp/synth-eval-output/sample_entries.json");
println!("╔══════════════════════════════════════════════════════════════════╗");
println!("║ SYNTHETIC DATA COMPREHENSIVE EVALUATION REPORT ║");
println!("╚══════════════════════════════════════════════════════════════════╝");
println!();
println!("Loading data from: {}", file_path);
let content = fs::read_to_string(file_path)?;
let entries: Vec<JournalEntry> = serde_json::from_str(&content)?;
println!("Loaded {} journal entries", entries.len());
println!();
let mut amounts: Vec<Decimal> = Vec::new();
let mut line_item_entries: Vec<LineItemEntry> = Vec::new();
let mut temporal_entries: Vec<TemporalEntry> = Vec::new();
let mut balance_issues: Vec<String> = Vec::new();
let mut fraud_count = 0;
let mut source_counts: HashMap<String, usize> = HashMap::new();
let mut process_counts: HashMap<String, usize> = HashMap::new();
for entry in &entries {
if let Ok(date) = NaiveDate::parse_from_str(&entry.header.posting_date, "%Y-%m-%d") {
temporal_entries.push(TemporalEntry { posting_date: date });
}
if entry.header.is_fraud {
fraud_count += 1;
}
*source_counts
.entry(entry.header.source.clone())
.or_insert(0) += 1;
if let Some(ref bp) = entry.header.business_process {
*process_counts.entry(bp.clone()).or_insert(0) += 1;
}
let mut total_debit = Decimal::ZERO;
let mut total_credit = Decimal::ZERO;
let mut debit_count = 0usize;
let mut credit_count = 0usize;
for line in &entry.lines {
if let Ok(debit) = line.debit_amount.parse::<Decimal>() {
if debit > Decimal::ZERO {
amounts.push(debit);
total_debit += debit;
debit_count += 1;
}
}
if let Ok(credit) = line.credit_amount.parse::<Decimal>() {
if credit > Decimal::ZERO {
amounts.push(credit);
total_credit += credit;
credit_count += 1;
}
}
}
line_item_entries.push(LineItemEntry {
line_count: entry.lines.len(),
debit_count,
credit_count,
});
let imbalance = (total_debit - total_credit).abs();
if imbalance > Decimal::new(1, 2) {
balance_issues.push(format!(
"Entry {}: Debit={}, Credit={}, Imbalance={}",
entry.header.document_id, total_debit, total_credit, imbalance
));
}
}
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 1. BALANCE COHERENCE (Debits = Credits) │");
println!("└────────────────────────────────────────────────────────────────┘");
let balance_rate = 1.0 - (balance_issues.len() as f64 / entries.len() as f64);
let balance_status = if balance_issues.is_empty() {
"✓ PASS"
} else {
"✗ FAIL"
};
println!(" Balance rate: {:.2}%", balance_rate * 100.0);
println!(" Imbalanced entries: {}", balance_issues.len());
println!(" Status: {}", balance_status);
if !balance_issues.is_empty() && balance_issues.len() <= 5 {
println!(" Sample issues:");
for issue in balance_issues.iter().take(5) {
println!(" - {}", issue);
}
}
println!();
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 2. BENFORD'S LAW ANALYSIS │");
println!("└────────────────────────────────────────────────────────────────┘");
let benford_analyzer = BenfordAnalyzer::new(0.05);
let benford_result = benford_analyzer.analyze(&amounts)?;
let benford_status = if benford_result.passes {
"✓ PASS"
} else {
"✗ FAIL"
};
println!(" Sample size: {}", benford_result.sample_size);
println!(" Chi-squared: {:.4}", benford_result.chi_squared);
println!(" P-value: {:.6}", benford_result.p_value);
println!(" MAD (Mean Absolute Deviation): {:.6}", benford_result.mad);
println!(" Conformity level: {:?}", benford_result.conformity);
println!(
" Anti-Benford score: {:.4}",
benford_result.anti_benford_score
);
println!(" Status: {}", benford_status);
println!();
println!(" First-Digit Distribution:");
println!(" Digit Expected Observed Deviation");
println!(" ───── ───────── ───────── ─────────");
let expected = [
0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046,
];
for (i, (obs, exp)) in benford_result
.observed_frequencies
.iter()
.zip(expected.iter())
.enumerate()
{
let dev = obs - exp;
let indicator = if dev.abs() > 0.02 { "⚠" } else { "✓" };
println!(
" {} {:.3} {:.3} {:+.3} {}",
i + 1,
exp,
obs,
dev,
indicator
);
}
println!();
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 3. AMOUNT DISTRIBUTION ANALYSIS │");
println!("└────────────────────────────────────────────────────────────────┘");
let amount_analyzer = AmountDistributionAnalyzer::new();
let amount_result = amount_analyzer.analyze(&amounts)?;
println!(" Sample size: {}", amount_result.sample_size);
println!(" Mean: ${}", amount_result.mean);
println!(" Median: ${}", amount_result.median);
println!(" Std Dev: ${}", amount_result.std_dev);
println!(" Min: ${}", amount_result.min);
println!(" Max: ${}", amount_result.max);
println!(" Skewness: {:.4}", amount_result.skewness);
println!(" Kurtosis: {:.4}", amount_result.kurtosis);
println!();
println!(
" Round number ratio: {:.2}%",
amount_result.round_number_ratio * 100.0
);
println!(
" Nice number ratio: {:.2}%",
amount_result.nice_number_ratio * 100.0
);
if let Some(p) = amount_result.lognormal_ks_pvalue {
let status = if p > 0.05 { "✓ PASS" } else { "⚠ MARGINAL" };
println!(" Log-normal KS p-value: {:.6} {}", p, status);
}
if let (Some(mu), Some(sigma)) = (amount_result.fitted_mu, amount_result.fitted_sigma) {
println!(" Fitted log-normal: μ={:.4}, σ={:.4}", mu, sigma);
}
println!();
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 4. LINE ITEM DISTRIBUTION ANALYSIS │");
println!("└────────────────────────────────────────────────────────────────┘");
let line_analyzer = LineItemAnalyzer::new(0.05);
let line_result = line_analyzer.analyze(&line_item_entries)?;
println!(" Sample size: {}", line_result.sample_size);
println!(" Average line items: {:.2}", line_result.avg_line_count);
println!(" Min line count: {}", line_result.min_line_count);
println!(" Max line count: {}", line_result.max_line_count);
println!(
" Even ratio: {:.2}% (expected: 88%)",
line_result.even_ratio * 100.0
);
println!(
" Even ratio deviation: {:.4}",
line_result.even_ratio_deviation
);
println!(
" Equal split ratio: {:.2}% (expected: 82%)",
line_result.equal_split_ratio * 100.0
);
println!(" Chi-squared: {:.4}", line_result.chi_squared);
println!(" P-value: {:.6}", line_result.p_value);
let line_status = if line_result.passes {
"✓ PASS"
} else {
"✗ FAIL"
};
println!(" Status: {}", line_status);
println!();
println!(" Line Count Distribution:");
println!(" Count Observed Expected");
println!(" ───── ──────── ────────");
let expected_dist = [
(2, 0.6068),
(3, 0.0577),
(4, 0.1663),
(5, 0.0306),
(6, 0.0332),
];
let total = line_result.sample_size as f64;
for (count, expected_pct) in &expected_dist {
let observed = line_result.line_count_distribution.get(count).unwrap_or(&0);
let observed_pct = *observed as f64 / total;
let indicator = if (observed_pct - expected_pct).abs() > 0.05 {
"⚠"
} else {
"✓"
};
println!(
" {:5} {:.4} {:.4} {}",
count, observed_pct, expected_pct, indicator
);
}
println!();
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 5. TEMPORAL PATTERN ANALYSIS │");
println!("└────────────────────────────────────────────────────────────────┘");
let temporal_analyzer = TemporalAnalyzer::new();
let temporal_result = temporal_analyzer.analyze(&temporal_entries)?;
println!(" Sample size: {}", temporal_result.sample_size);
println!(
" Date range: {} to {}",
temporal_result.start_date, temporal_result.end_date
);
println!(" Days spanned: {}", temporal_result.days_spanned);
println!(
" Weekend activity ratio: {:.2}% (expected: <10%)",
temporal_result.weekend_ratio * 100.0
);
println!(
" Month-end spike: {:.2}x (expected: ~2.5x)",
temporal_result.month_end_spike
);
println!(
" Quarter-end spike: {:.2}x (expected: ~4.0x)",
temporal_result.quarter_end_spike
);
println!(
" Year-end spike: {:.2}x (expected: ~6.0x)",
temporal_result.year_end_spike
);
println!(
" Pattern correlation: {:.4}",
temporal_result.pattern_correlation
);
println!(
" Day-of-week correlation: {:.4}",
temporal_result.day_of_week_correlation
);
let temporal_status = if temporal_result.passes {
"✓ PASS"
} else {
"✗ FAIL"
};
println!(" Status: {}", temporal_status);
println!();
println!(" Day of Week Distribution:");
println!(" Day Volume");
println!(" ────────── ──────");
let days = [
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
];
for day in &days {
let vol = temporal_result
.day_of_week_distribution
.get(*day)
.unwrap_or(&0.0);
println!(" {:10} {:.4}", day, vol);
}
println!();
println!("┌────────────────────────────────────────────────────────────────┐");
println!("│ 6. DATA COMPOSITION │");
println!("└────────────────────────────────────────────────────────────────┘");
let total = entries.len();
println!(
" Fraud entries: {} ({:.2}%)",
fraud_count,
(fraud_count as f64 / total as f64) * 100.0
);
println!();
println!(" Source Distribution:");
for (source, count) in &source_counts {
println!(
" {:15}: {:6} ({:.1}%)",
source,
count,
(*count as f64 / total as f64) * 100.0
);
}
println!();
println!(" Business Process Distribution:");
for (process, count) in &process_counts {
println!(
" {:15}: {:6} ({:.1}%)",
process,
count,
(*count as f64 / total as f64) * 100.0
);
}
println!();
println!("╔══════════════════════════════════════════════════════════════════╗");
println!("║ EVALUATION SUMMARY ║");
println!("╚══════════════════════════════════════════════════════════════════╝");
println!();
let mut issues: Vec<(String, &str)> = Vec::new();
let mut score: f64 = 100.0;
if !balance_issues.is_empty() {
issues.push((
"Balance coherence: Some entries are not balanced".to_string(),
"CRITICAL",
));
score -= 20.0;
}
if !benford_result.passes {
issues.push((
format!(
"Benford's Law: p-value {:.4} < 0.05",
benford_result.p_value
),
"HIGH",
));
score -= 15.0;
}
if benford_result.mad > 0.015 {
issues.push((
format!(
"Benford MAD: {:.4} > 0.015 (acceptable)",
benford_result.mad
),
"MEDIUM",
));
score -= 10.0;
}
if !line_result.passes {
issues.push((
"Line item distribution doesn't match expected pattern".to_string(),
"MEDIUM",
));
score -= 10.0;
}
let even_diff = (line_result.even_ratio - 0.88).abs();
if even_diff > 0.10 {
issues.push((
format!(
"Even/odd ratio: {:.1}% vs expected 88%",
line_result.even_ratio * 100.0
),
"MEDIUM",
));
score -= 5.0;
}
if temporal_result.weekend_ratio > 0.15 {
issues.push((
format!(
"Weekend activity: {:.1}% (expected < 10%)",
temporal_result.weekend_ratio * 100.0
),
"MEDIUM",
));
score -= 5.0;
}
if temporal_result.month_end_spike < 1.5 {
issues.push((
format!(
"Month-end spike: {:.1}x (expected ~2.5x)",
temporal_result.month_end_spike
),
"LOW",
));
score -= 5.0;
}
if !temporal_result.passes {
issues.push((
"Temporal patterns don't match expectations".to_string(),
"MEDIUM",
));
score -= 5.0;
}
if issues.is_empty() {
println!(" ✓ All checks passed!");
println!(" Score: {:.0}/100", score);
} else {
println!(" Issues found: {}", issues.len());
println!();
for (issue, severity) in &issues {
let icon = match *severity {
"CRITICAL" => "🔴",
"HIGH" => "🟠",
"MEDIUM" => "🟡",
_ => "🟢",
};
println!(" {} [{}] {}", icon, severity, issue);
}
println!();
println!(" Overall Score: {:.0}/100", score.max(0.0));
}
println!();
println!("╔══════════════════════════════════════════════════════════════════╗");
println!("║ ENHANCEMENT RECOMMENDATIONS ║");
println!("╚══════════════════════════════════════════════════════════════════╝");
println!();
let mut recommendations: Vec<(&str, &str, &str)> = Vec::new();
if !balance_issues.is_empty() {
recommendations.push((
"CRITICAL",
"Balance Validation",
"Fix balance coherence - all journal entries MUST have debits = credits",
));
}
if !benford_result.passes || benford_result.mad > 0.012 {
recommendations.push((
"HIGH",
"Benford's Law Compliance",
"Tune amount distribution parameters to better match Benford's Law.\n\
Consider adjusting lognormal_mu and lognormal_sigma parameters.",
));
}
if !line_result.passes {
recommendations.push((
"MEDIUM",
"Line Item Distribution",
"Adjust line item distribution parameters to better match empirical data.\n\
Expected: 60% 2-line, 17% 4-line entries. Review line_item_distribution config.",
));
}
if (line_result.even_ratio - 0.88).abs() > 0.05 {
recommendations.push((
"MEDIUM",
"Even/Odd Line Ratio",
"Adjust even/odd line item ratio. Research suggests 88% even, 12% odd.\n\
Review even_odd_distribution settings.",
));
}
if temporal_result.weekend_ratio > 0.10 {
recommendations.push((
"MEDIUM",
"Weekend Activity",
"Reduce weekend transaction volume. Most business transactions occur on weekdays.\n\
Adjust weekend_activity parameter to ~5%.",
));
}
if temporal_result.month_end_spike < 2.0 {
recommendations.push((
"LOW",
"Month-End Patterns",
"Strengthen month-end spike patterns for more realistic temporal distribution.\n\
Set month_end_multiplier to 2.5-3.0.",
));
}
if benford_result.anti_benford_score > 0.3 {
recommendations.push((
"HIGH",
"Anti-Benford Patterns",
"High anti-Benford score indicates potential fraud-like patterns.\n\
If unintentional, review amount generation for anomalies.",
));
}
if amount_result.skewness > 5.0 {
recommendations.push((
"LOW",
"Amount Distribution Skewness",
"Very high skewness in amounts. Consider adjusting max_amount\n\
or using a different distribution shape.",
));
}
if recommendations.is_empty() {
println!(" ✓ No major enhancements needed - generation quality is excellent!");
} else {
for (priority, area, recommendation) in &recommendations {
let icon = match *priority {
"CRITICAL" => "🔴",
"HIGH" => "🟠",
"MEDIUM" => "🟡",
_ => "🟢",
};
println!(" {} [{}] {}", icon, priority, area);
for line in recommendation.lines() {
println!(" {}", line);
}
println!();
}
}
println!("══════════════════════════════════════════════════════════════════════");
println!(" END OF EVALUATION REPORT");
println!("══════════════════════════════════════════════════════════════════════");
Ok(())
}