Skip to main content

datasynth_eval/
lib.rs

1#![deny(clippy::unwrap_used)]
2// Allow some clippy lints that are common in test/evaluation code
3#![allow(clippy::field_reassign_with_default)]
4#![allow(clippy::too_many_arguments)]
5#![allow(clippy::upper_case_acronyms)] // MCAR, MAR, MNAR, ISO are standard abbreviations
6
7//! Synthetic Data Evaluation Framework
8//!
9//! This crate provides comprehensive evaluation capabilities for validating
10//! the quality and correctness of generated synthetic financial data.
11//!
12//! # Features
13//!
14//! - **Statistical Quality**: Benford's Law, amount distributions, line item patterns
15//! - **Semantic Coherence**: Balance sheet validation, subledger reconciliation
16//! - **Data Quality**: Uniqueness, completeness, format consistency
17//! - **ML-Readiness**: Feature distributions, label quality, graph structure
18//! - **Reporting**: HTML and JSON reports with pass/fail thresholds
19//!
20//! # Example
21//!
//! ```ignore
//! use datasynth_eval::{Evaluator, EvaluationConfig};
//!
//! let config = EvaluationConfig::default();
//! let evaluator = Evaluator::new(config);
//!
//! // Evaluate raw journal-entry amounts (runs the Benford analysis)
//! let result = evaluator.run_evaluation_with_amounts(&je_amounts);
//!
//! // Inspect the outcome; reports can be produced with the re-exported
//! // `HtmlReportGenerator` / `JsonReportGenerator` types.
//! if !result.passes {
//!     for failure in &result.failures {
//!         eprintln!("{failure}");
//!     }
//! }
//! ```

34
35pub mod benchmarks;
36pub mod config;
37pub mod enhancement;
38pub mod error;
39pub mod gates;
40pub mod privacy;
41
42pub mod coherence;
43pub mod ml;
44pub mod quality;
45pub mod report;
46pub mod statistical;
47pub mod tuning;
48
49pub mod banking;
50pub mod causal;
51pub mod diff_engine;
52pub mod enrichment;
53pub mod process_mining;
54pub mod scenario_diff;
55
56// Re-exports
57pub use config::{EvaluationConfig, EvaluationThresholds, PrivacyEvaluationConfig};
58pub use error::{EvalError, EvalResult};
59
60pub use statistical::{
61    AmountDistributionAnalysis, AmountDistributionAnalyzer, AnomalyRealismEvaluation,
62    AnomalyRealismEvaluator, BenfordAnalysis, BenfordAnalyzer, BenfordConformity,
63    CorrelationAnalysis, CorrelationAnalyzer, CorrelationCheckResult, DetectionDifficulty,
64    DriftDetectionAnalysis, DriftDetectionAnalyzer, DriftDetectionEntry, DriftDetectionMetrics,
65    DriftEventCategory, ExpectedCorrelation, LabeledDriftEvent, LabeledEventAnalysis,
66    LineItemAnalysis, LineItemAnalyzer, LineItemEntry, SecondDigitAnalysis, StatisticalEvaluation,
67    TemporalAnalysis, TemporalAnalyzer, TemporalEntry,
68};
69
70pub use coherence::{
71    AccountType,
72    ApprovalLevelData,
73    AuditEvaluation,
74    AuditEvaluator,
75    AuditFindingData,
76    AuditRiskData,
77    AuditTrailEvaluation,
78    AuditTrailGap,
79    BalanceSheetEvaluation,
80    BalanceSheetEvaluator,
81    BalanceSnapshot,
82    BankReconciliationEvaluation,
83    BankReconciliationEvaluator,
84    BidEvaluationData,
85    BudgetVarianceData,
86    CashPositionData,
87    CoherenceEvaluation,
88    ConcentrationMetrics,
89    CountryPackData,
90    CountryPackEvaluation,
91    CountryPackEvaluator,
92    CountryPackThresholds,
93    CovenantData,
94    CrossProcessEvaluation,
95    CrossProcessEvaluator,
96    CycleCountData,
97    DocumentChainEvaluation,
98    DocumentChainEvaluator,
99    DocumentReferenceData,
100    EarnedValueData,
101    EntityReferenceData,
102    EsgEvaluation,
103    EsgEvaluator,
104    EsgThresholds,
105    ExpenseReportData,
106    FairValueEvaluation,
107    // Task 4.1: Financial Ratio Evaluator
108    FinancialRatios,
109    FinancialReportingEvaluation,
110    FinancialReportingEvaluator,
111    FinancialStatementData,
112    FrameworkViolation,
113    GovernanceData,
114    HedgeEffectivenessData,
115    HolidayData,
116    HrPayrollEvaluation,
117    HrPayrollEvaluator,
118    ICMatchingData,
119    ICMatchingEvaluation,
120    ICMatchingEvaluator,
121    ImpairmentEvaluation,
122    IsaComplianceEvaluation,
123    // Task 4.2: JE Risk Scoring Evaluator
124    JeRiskScoringResult,
125    KpiData,
126    LeaseAccountingEvaluation,
127    LeaseAccountingEvaluator,
128    LeaseEvaluation,
129    ManufacturingEvaluation,
130    ManufacturingEvaluator,
131    MaterialityData,
132    NettingData,
133    NetworkEdge,
134    NetworkEvaluation,
135    NetworkEvaluator,
136    NetworkNode,
137    NetworkThresholds,
138    O2CChainData,
139    P2PChainData,
140    PayrollHoursData,
141    PayrollLineItemData,
142    PayrollRunData,
143    PcaobComplianceEvaluation,
144    PerformanceObligation,
145    ProductionOrderData,
146    ProjectAccountingEvaluation,
147    ProjectAccountingEvaluator,
148    ProjectAccountingThresholds,
149    ProjectRevenueData,
150    QualityInspectionData,
151    QuoteLineData,
152    RatioAnalysisResult,
153    RatioCheck,
154    ReconciliationData,
155    ReferentialData,
156    ReferentialIntegrityEvaluation,
157    ReferentialIntegrityEvaluator,
158    RetainageData,
159    RevenueContract,
160    RevenueRecognitionEvaluation,
161    RevenueRecognitionEvaluator,
162    RiskAttributeStats,
163    RiskDistribution,
164    RoutingOperationData,
165    SafetyMetricData,
166    SalesQuoteData,
167    SalesQuoteEvaluation,
168    SalesQuoteEvaluator,
169    SalesQuoteThresholds,
170    ScorecardCoverageData,
171    SourcingEvaluation,
172    SourcingEvaluator,
173    SourcingProjectData,
174    SoxComplianceEvaluation,
175    SpendAnalysisData,
176    StandardsComplianceEvaluation,
177    StandardsThresholds,
178    StrengthStats,
179    SubledgerEvaluator,
180    SubledgerReconciliationEvaluation,
181    SupplierEsgData,
182    TaxEvaluation,
183    TaxEvaluator,
184    TaxLineData,
185    TaxRateData,
186    TaxReturnData,
187    TaxThresholds,
188    TimeEntryData,
189    TreasuryEvaluation,
190    TreasuryEvaluator,
191    TreasuryThresholds,
192    UnmatchedICItem,
193    VariableConsideration,
194    ViolationSeverity,
195    WaterUsageData,
196    WithholdingData,
197    WorkpaperData,
198};
199
200pub use quality::{
201    CompletenessAnalysis, CompletenessAnalyzer, ConsistencyAnalysis, ConsistencyAnalyzer,
202    ConsistencyRule, DuplicateInfo, FieldCompleteness, FieldDefinition, FieldValue, FormatAnalysis,
203    FormatAnalyzer, FormatVariation, QualityEvaluation, UniqueRecord, UniquenessAnalysis,
204    UniquenessAnalyzer,
205};
206
207pub use ml::{
208    AnomalyScoringAnalysis, AnomalyScoringAnalyzer, CrossModalAnalysis, CrossModalAnalyzer,
209    DomainGapAnalysis, DomainGapAnalyzer, EmbeddingReadinessAnalysis, EmbeddingReadinessAnalyzer,
210    FeatureAnalysis, FeatureAnalyzer, FeatureQualityAnalysis, FeatureQualityAnalyzer, FeatureStats,
211    GnnReadinessAnalysis, GnnReadinessAnalyzer, GraphAnalysis, GraphAnalyzer, GraphMetrics,
212    LabelAnalysis, LabelAnalyzer, LabelDistribution, MLReadinessEvaluation,
213    SchemeDetectabilityAnalysis, SchemeDetectabilityAnalyzer, SplitAnalysis, SplitAnalyzer,
214    SplitMetrics, TemporalFidelityAnalysis, TemporalFidelityAnalyzer,
215};
216
217pub use report::{
218    BaselineComparison, ComparisonResult, EvaluationReport, HtmlReportGenerator,
219    JsonReportGenerator, MetricChange, ReportMetadata, ThresholdChecker, ThresholdResult,
220};
221
222pub use tuning::{
223    ConfigSuggestion, ConfigSuggestionGenerator, TuningAnalyzer, TuningCategory, TuningOpportunity,
224};
225
226pub use enhancement::{
227    AutoTuneResult, AutoTuner, ConfigPatch, EnhancementReport, Recommendation,
228    RecommendationCategory, RecommendationEngine, RecommendationPriority, RootCause,
229    SuggestedAction,
230};
231
232pub use privacy::{
233    LinkageAttack, LinkageConfig, LinkageResults, MembershipInferenceAttack, MiaConfig, MiaResults,
234    NistAlignmentReport, NistCriterion, PrivacyEvaluation, SynQPMatrix, SynQPQuadrant,
235};
236
237pub use benchmarks::{
238    // ACFE-calibrated benchmarks
239    acfe_calibrated_1k,
240    acfe_collusion_5k,
241    acfe_management_override_2k,
242    all_acfe_benchmarks,
243    all_benchmarks,
244    // Industry-specific benchmarks
245    all_industry_benchmarks,
246    anomaly_bench_1k,
247    data_quality_100k,
248    entity_match_5k,
249    financial_services_fraud_5k,
250    fraud_detect_10k,
251    get_benchmark,
252    get_industry_benchmark,
253    graph_fraud_10k,
254    healthcare_fraud_5k,
255    manufacturing_fraud_5k,
256    retail_fraud_10k,
257    technology_fraud_3k,
258    AcfeAlignment,
259    AcfeCalibration,
260    AcfeCategoryDistribution,
261    BaselineModelType,
262    BaselineResult,
263    BenchmarkBuilder,
264    BenchmarkSuite,
265    BenchmarkTaskType,
266    CostMatrix,
267    DatasetSpec,
268    EvaluationSpec,
269    FeatureSet,
270    IndustryBenchmarkAnalysis,
271    LeaderboardEntry,
272    MetricType,
273    SplitRatios,
274};
275
276pub use banking::{
277    AmlDetectabilityAnalysis, AmlDetectabilityAnalyzer, AmlTransactionData, BankingEvaluation,
278    KycCompletenessAnalysis, KycCompletenessAnalyzer, KycProfileData, TypologyData,
279};
280
281pub use process_mining::{
282    EventSequenceAnalysis, EventSequenceAnalyzer, ProcessEventData, ProcessMiningEvaluation,
283    VariantAnalysis, VariantAnalyzer, VariantData,
284};
285
286pub use causal::{CausalModelEvaluation, CausalModelEvaluator};
287
288pub use enrichment::{EnrichmentQualityEvaluation, EnrichmentQualityEvaluator};
289
290use serde::{Deserialize, Serialize};
291
/// Comprehensive evaluation result combining all evaluation modules.
///
/// The four core sections (`statistical`, `coherence`, `quality`,
/// `ml_readiness`) are always present. The remaining sections are optional:
/// because of their serde attributes they fall back to `None` when missing
/// from deserialized input and are left out of serialized output when `None`.
///
/// `passes` and `failures` are recomputed by
/// [`ComprehensiveEvaluation::check_all_thresholds`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComprehensiveEvaluation {
    /// Statistical quality evaluation.
    pub statistical: StatisticalEvaluation,
    /// Semantic coherence evaluation.
    pub coherence: CoherenceEvaluation,
    /// Data quality evaluation.
    pub quality: QualityEvaluation,
    /// ML-readiness evaluation.
    pub ml_readiness: MLReadinessEvaluation,
    /// Privacy evaluation (optional — only populated when privacy testing is enabled).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub privacy: Option<PrivacyEvaluation>,
    /// Banking/KYC/AML evaluation (optional).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub banking: Option<BankingEvaluation>,
    /// OCEL 2.0 process mining evaluation (optional).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub process_mining: Option<ProcessMiningEvaluation>,
    /// Causal model evaluation (optional).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub causal: Option<CausalModelEvaluation>,
    /// LLM enrichment quality evaluation (optional).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub enrichment_quality: Option<EnrichmentQualityEvaluation>,
    /// Overall pass/fail status; `check_all_thresholds` sets it to `true`
    /// exactly when no failure messages were collected.
    pub passes: bool,
    /// Summary of all failures, gathered from the individual sections.
    pub failures: Vec<String>,
    /// Tuning opportunities identified.
    pub tuning_opportunities: Vec<TuningOpportunity>,
    /// Configuration suggestions.
    pub config_suggestions: Vec<ConfigSuggestion>,
}
327
328impl ComprehensiveEvaluation {
329    /// Create a new empty evaluation.
330    pub fn new() -> Self {
331        Self {
332            statistical: StatisticalEvaluation::default(),
333            coherence: CoherenceEvaluation::default(),
334            quality: QualityEvaluation::default(),
335            ml_readiness: MLReadinessEvaluation::default(),
336            privacy: None,
337            banking: None,
338            process_mining: None,
339            causal: None,
340            enrichment_quality: None,
341            passes: true,
342            failures: Vec::new(),
343            tuning_opportunities: Vec::new(),
344            config_suggestions: Vec::new(),
345        }
346    }
347
348    /// Check all evaluations against thresholds and update overall status.
349    pub fn check_all_thresholds(&mut self, thresholds: &EvaluationThresholds) {
350        self.failures.clear();
351
352        // Check statistical thresholds
353        self.statistical.check_thresholds(thresholds);
354        self.failures.extend(self.statistical.failures.clone());
355
356        // Check coherence thresholds
357        self.coherence.check_thresholds(thresholds);
358        self.failures.extend(self.coherence.failures.clone());
359
360        // Check quality thresholds
361        self.quality.check_thresholds(thresholds);
362        self.failures.extend(self.quality.failures.clone());
363
364        // Check ML thresholds
365        self.ml_readiness.check_thresholds(thresholds);
366        self.failures.extend(self.ml_readiness.failures.clone());
367
368        // Check privacy evaluation (if present)
369        if let Some(ref mut privacy) = self.privacy {
370            privacy.update_status();
371            self.failures.extend(privacy.failures.clone());
372        }
373
374        // Check banking evaluation
375        if let Some(ref mut banking) = self.banking {
376            banking.check_thresholds();
377            self.failures.extend(banking.issues.clone());
378        }
379
380        // Check process mining evaluation
381        if let Some(ref mut pm) = self.process_mining {
382            pm.check_thresholds();
383            self.failures.extend(pm.issues.clone());
384        }
385
386        // Check causal model evaluation
387        if let Some(ref causal) = self.causal {
388            if !causal.passes {
389                self.failures.extend(causal.issues.clone());
390            }
391        }
392
393        // Check enrichment quality evaluation
394        if let Some(ref enrichment) = self.enrichment_quality {
395            if !enrichment.passes {
396                self.failures.extend(enrichment.issues.clone());
397            }
398        }
399
400        self.passes = self.failures.is_empty();
401    }
402}
403
404impl Default for ComprehensiveEvaluation {
405    fn default() -> Self {
406        Self::new()
407    }
408}
409
/// Main evaluator that coordinates all evaluation modules.
///
/// The struct stores nothing but the [`EvaluationConfig`]; the data to be
/// evaluated is handed to the individual `run_*` methods.
pub struct Evaluator {
    /// Evaluation configuration, exposed read-only through [`Evaluator::config`].
    config: EvaluationConfig,
}
415
416impl Evaluator {
417    /// Create a new evaluator with the given configuration.
418    pub fn new(config: EvaluationConfig) -> Self {
419        Self { config }
420    }
421
422    /// Create an evaluator with default configuration.
423    pub fn with_defaults() -> Self {
424        Self::new(EvaluationConfig::default())
425    }
426
427    /// Get the configuration.
428    pub fn config(&self) -> &EvaluationConfig {
429        &self.config
430    }
431
432    /// Run a comprehensive evaluation and return results.
433    ///
434    /// # Architectural note
435    ///
436    /// This zero-argument variant returns a default (passing) evaluation because the
437    /// `Evaluator` struct holds only configuration — it has no access to the generated
438    /// journal entry or balance data that the sub-module evaluators require.
439    ///
440    /// To evaluate actual generation output, use [`run_evaluation_with_amounts`] which
441    /// accepts raw JE amounts and runs the Benford analysis.  Full wiring of all
442    /// sub-modules (BalanceSheetEvaluator, DocumentChainEvaluator, etc.) requires
443    /// passing the complete `EnhancedGenerationResult` from the runtime crate, which
444    /// would create a circular dependency.  The recommended integration point is the
445    /// orchestrator layer (datasynth-runtime) which already calls the gate engine with
446    /// a populated `ComprehensiveEvaluation`.
447    pub fn run_evaluation(&self) -> ComprehensiveEvaluation {
448        let mut evaluation = ComprehensiveEvaluation::new();
449        evaluation.check_all_thresholds(&self.config.thresholds);
450        evaluation
451    }
452
453    /// Run a Benford-augmented evaluation given raw JE amounts.
454    ///
455    /// This method calls the [`BenfordAnalyzer`] sub-module and populates the
456    /// `statistical.benford` field of the returned [`ComprehensiveEvaluation`].
457    /// All other sub-module fields remain at their default (passing) values.
458    pub fn run_evaluation_with_amounts(
459        &self,
460        je_amounts: &[rust_decimal::Decimal],
461    ) -> ComprehensiveEvaluation {
462        let mut evaluation = ComprehensiveEvaluation::new();
463
464        if !je_amounts.is_empty() {
465            let analyzer = BenfordAnalyzer::new(self.config.thresholds.benford_p_value_min);
466            match analyzer.analyze(je_amounts) {
467                Ok(benford) => {
468                    evaluation.statistical.benford = Some(benford);
469                }
470                Err(e) => {
471                    evaluation
472                        .failures
473                        .push(format!("Benford analysis failed: {e}"));
474                    evaluation.passes = false;
475                }
476            }
477        }
478
479        evaluation.check_all_thresholds(&self.config.thresholds);
480        evaluation
481    }
482}
483
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::{ComprehensiveEvaluation, Evaluator};

    /// A freshly constructed evaluation passes and carries no failure messages.
    #[test]
    fn test_comprehensive_evaluation_new() {
        let fresh = ComprehensiveEvaluation::new();
        assert!(fresh.passes, "a new evaluation should start out passing");
        assert!(
            fresh.failures.is_empty(),
            "a new evaluation should have no recorded failures"
        );
    }

    /// The default evaluator exposes the default Benford p-value threshold.
    #[test]
    fn test_evaluator_creation() {
        let default_evaluator = Evaluator::with_defaults();
        let p_value_min = default_evaluator.config().thresholds.benford_p_value_min;
        assert_eq!(p_value_min, 0.05);
    }
}