Skip to main content

datasynth_eval/
config.rs

1//! Configuration for the evaluation framework.
2
3use rust_decimal::Decimal;
4use serde::{Deserialize, Serialize};
5
6/// Main configuration for running an evaluation.
7#[derive(Debug, Clone, Serialize, Deserialize, Default)]
8pub struct EvaluationConfig {
9    /// Statistical evaluation settings.
10    pub statistical: StatisticalConfig,
11    /// Coherence evaluation settings.
12    pub coherence: CoherenceConfig,
13    /// Data quality evaluation settings.
14    pub quality: QualityConfig,
15    /// ML-readiness evaluation settings.
16    pub ml: MlConfig,
17    /// Report generation settings.
18    pub report: ReportConfig,
19    /// Pass/fail thresholds.
20    pub thresholds: EvaluationThresholds,
21}
22
23/// Statistical evaluation configuration.
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct StatisticalConfig {
26    /// Enable Benford's Law analysis.
27    pub benford_enabled: bool,
28    /// Enable amount distribution analysis.
29    pub amount_distribution_enabled: bool,
30    /// Enable line item distribution analysis.
31    pub line_item_enabled: bool,
32    /// Enable temporal pattern analysis.
33    pub temporal_enabled: bool,
34    /// Significance level for statistical tests (default: 0.05).
35    pub significance_level: f64,
36    /// Minimum sample size for statistical tests.
37    pub min_sample_size: usize,
38}
39
40impl Default for StatisticalConfig {
41    fn default() -> Self {
42        Self {
43            benford_enabled: true,
44            amount_distribution_enabled: true,
45            line_item_enabled: true,
46            temporal_enabled: true,
47            significance_level: 0.05,
48            min_sample_size: 100,
49        }
50    }
51}
52
53/// Coherence evaluation configuration.
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct CoherenceConfig {
56    /// Enable balance sheet validation.
57    pub balance_enabled: bool,
58    /// Enable subledger reconciliation.
59    pub subledger_enabled: bool,
60    /// Enable document chain validation.
61    pub document_chain_enabled: bool,
62    /// Enable intercompany matching validation.
63    pub intercompany_enabled: bool,
64    /// Enable referential integrity validation.
65    pub referential_enabled: bool,
66    /// Tolerance for balance differences.
67    pub balance_tolerance: Decimal,
68}
69
70impl Default for CoherenceConfig {
71    fn default() -> Self {
72        Self {
73            balance_enabled: true,
74            subledger_enabled: true,
75            document_chain_enabled: true,
76            intercompany_enabled: true,
77            referential_enabled: true,
78            balance_tolerance: Decimal::new(1, 2), // 0.01
79        }
80    }
81}
82
83/// Data quality evaluation configuration.
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct QualityConfig {
86    /// Enable uniqueness validation.
87    pub uniqueness_enabled: bool,
88    /// Enable completeness validation.
89    pub completeness_enabled: bool,
90    /// Enable format consistency validation.
91    pub format_enabled: bool,
92    /// Enable cross-field consistency validation.
93    pub consistency_enabled: bool,
94    /// Similarity threshold for near-duplicate detection (0.0-1.0).
95    pub near_duplicate_threshold: f64,
96}
97
98impl Default for QualityConfig {
99    fn default() -> Self {
100        Self {
101            uniqueness_enabled: true,
102            completeness_enabled: true,
103            format_enabled: true,
104            consistency_enabled: true,
105            near_duplicate_threshold: 0.95,
106        }
107    }
108}
109
110/// ML-readiness evaluation configuration.
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct MlConfig {
113    /// Enable feature distribution analysis.
114    pub features_enabled: bool,
115    /// Enable label quality analysis.
116    pub labels_enabled: bool,
117    /// Enable train/test split validation.
118    pub splits_enabled: bool,
119    /// Enable graph structure analysis.
120    pub graph_enabled: bool,
121}
122
123impl Default for MlConfig {
124    fn default() -> Self {
125        Self {
126            features_enabled: true,
127            labels_enabled: true,
128            splits_enabled: true,
129            graph_enabled: true,
130        }
131    }
132}
133
134/// Report generation configuration.
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct ReportConfig {
137    /// Generate HTML report.
138    pub html_enabled: bool,
139    /// Generate JSON report.
140    pub json_enabled: bool,
141    /// Include charts in HTML report.
142    pub charts_enabled: bool,
143    /// Path to baseline report for comparison.
144    pub baseline_path: Option<String>,
145}
146
147impl Default for ReportConfig {
148    fn default() -> Self {
149        Self {
150            html_enabled: true,
151            json_enabled: true,
152            charts_enabled: true,
153            baseline_path: None,
154        }
155    }
156}
157
158/// Pass/fail thresholds for evaluation metrics.
159#[derive(Debug, Clone, Serialize, Deserialize)]
160pub struct EvaluationThresholds {
161    // Statistical thresholds
162    /// Minimum p-value for Benford's Law chi-squared test.
163    pub benford_p_value_min: f64,
164    /// Maximum Mean Absolute Deviation for Benford's Law.
165    pub benford_mad_max: f64,
166    /// Minimum p-value for amount distribution KS test.
167    pub amount_ks_p_value_min: f64,
168    /// Minimum correlation for temporal patterns.
169    pub temporal_correlation_min: f64,
170
171    // Coherence thresholds
172    /// Maximum balance sheet imbalance.
173    pub balance_tolerance: Decimal,
174    /// Minimum subledger reconciliation rate.
175    pub subledger_reconciliation_rate_min: f64,
176    /// Minimum document chain completion rate.
177    pub document_chain_completion_min: f64,
178    /// Minimum intercompany match rate.
179    pub ic_match_rate_min: f64,
180    /// Minimum referential integrity rate.
181    pub referential_integrity_min: f64,
182
183    // Quality thresholds
184    /// Maximum duplicate rate.
185    pub duplicate_rate_max: f64,
186    /// Minimum completeness rate.
187    pub completeness_rate_min: f64,
188    /// Minimum format consistency rate.
189    pub format_consistency_min: f64,
190
191    // ML thresholds
192    /// Minimum anomaly rate.
193    pub anomaly_rate_min: f64,
194    /// Maximum anomaly rate.
195    pub anomaly_rate_max: f64,
196    /// Minimum label coverage.
197    pub label_coverage_min: f64,
198    /// Minimum train ratio.
199    pub train_ratio_min: f64,
200    /// Minimum graph connectivity.
201    pub graph_connectivity_min: f64,
202}
203
204impl Default for EvaluationThresholds {
205    fn default() -> Self {
206        Self {
207            // Statistical
208            benford_p_value_min: 0.05,
209            benford_mad_max: 0.015,
210            amount_ks_p_value_min: 0.05,
211            temporal_correlation_min: 0.80,
212
213            // Coherence
214            balance_tolerance: Decimal::new(1, 2), // 0.01
215            subledger_reconciliation_rate_min: 0.99,
216            document_chain_completion_min: 0.90,
217            ic_match_rate_min: 0.95,
218            referential_integrity_min: 0.99,
219
220            // Quality
221            duplicate_rate_max: 0.01,
222            completeness_rate_min: 0.95,
223            format_consistency_min: 0.99,
224
225            // ML
226            anomaly_rate_min: 0.01,
227            anomaly_rate_max: 0.20,
228            label_coverage_min: 0.99,
229            train_ratio_min: 0.60,
230            graph_connectivity_min: 0.95,
231        }
232    }
233}
234
235impl EvaluationThresholds {
236    /// Create strict thresholds for rigorous validation.
237    pub fn strict() -> Self {
238        Self {
239            benford_p_value_min: 0.10,
240            benford_mad_max: 0.010,
241            amount_ks_p_value_min: 0.10,
242            temporal_correlation_min: 0.90,
243            balance_tolerance: Decimal::new(1, 4), // 0.0001
244            subledger_reconciliation_rate_min: 0.999,
245            document_chain_completion_min: 0.95,
246            ic_match_rate_min: 0.99,
247            referential_integrity_min: 0.999,
248            duplicate_rate_max: 0.001,
249            completeness_rate_min: 0.99,
250            format_consistency_min: 0.999,
251            anomaly_rate_min: 0.01,
252            anomaly_rate_max: 0.10,
253            label_coverage_min: 0.999,
254            train_ratio_min: 0.70,
255            graph_connectivity_min: 0.99,
256        }
257    }
258
259    /// Create lenient thresholds for exploratory validation.
260    pub fn lenient() -> Self {
261        Self {
262            benford_p_value_min: 0.01,
263            benford_mad_max: 0.025,
264            amount_ks_p_value_min: 0.01,
265            temporal_correlation_min: 0.60,
266            balance_tolerance: Decimal::new(1, 1), // 0.1
267            subledger_reconciliation_rate_min: 0.90,
268            document_chain_completion_min: 0.80,
269            ic_match_rate_min: 0.85,
270            referential_integrity_min: 0.95,
271            duplicate_rate_max: 0.05,
272            completeness_rate_min: 0.90,
273            format_consistency_min: 0.95,
274            anomaly_rate_min: 0.005,
275            anomaly_rate_max: 0.30,
276            label_coverage_min: 0.95,
277            train_ratio_min: 0.50,
278            graph_connectivity_min: 0.90,
279        }
280    }
281}