Skip to main content

datasynth_eval/
config.rs

1//! Configuration for the evaluation framework.
2
3use rust_decimal::Decimal;
4use serde::{Deserialize, Serialize};
5
6/// Main configuration for running an evaluation.
7#[derive(Debug, Clone, Serialize, Deserialize, Default)]
8pub struct EvaluationConfig {
9    /// Statistical evaluation settings.
10    pub statistical: StatisticalConfig,
11    /// Coherence evaluation settings.
12    pub coherence: CoherenceConfig,
13    /// Data quality evaluation settings.
14    pub quality: QualityConfig,
15    /// ML-readiness evaluation settings.
16    pub ml: MlConfig,
17    /// Report generation settings.
18    pub report: ReportConfig,
19    /// Pass/fail thresholds.
20    pub thresholds: EvaluationThresholds,
21}
22
23/// Statistical evaluation configuration.
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct StatisticalConfig {
26    /// Enable Benford's Law analysis.
27    pub benford_enabled: bool,
28    /// Enable amount distribution analysis.
29    pub amount_distribution_enabled: bool,
30    /// Enable line item distribution analysis.
31    pub line_item_enabled: bool,
32    /// Enable temporal pattern analysis.
33    pub temporal_enabled: bool,
34    /// Enable drift detection analysis.
35    pub drift_detection_enabled: bool,
36    /// Significance level for statistical tests (default: 0.05).
37    pub significance_level: f64,
38    /// Minimum sample size for statistical tests.
39    pub min_sample_size: usize,
40    /// Window size for drift detection rolling statistics.
41    pub drift_window_size: usize,
42}
43
44impl Default for StatisticalConfig {
45    fn default() -> Self {
46        Self {
47            benford_enabled: true,
48            amount_distribution_enabled: true,
49            line_item_enabled: true,
50            temporal_enabled: true,
51            drift_detection_enabled: true,
52            significance_level: 0.05,
53            min_sample_size: 100,
54            drift_window_size: 10,
55        }
56    }
57}
58
59/// Coherence evaluation configuration.
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct CoherenceConfig {
62    /// Enable balance sheet validation.
63    pub balance_enabled: bool,
64    /// Enable subledger reconciliation.
65    pub subledger_enabled: bool,
66    /// Enable document chain validation.
67    pub document_chain_enabled: bool,
68    /// Enable intercompany matching validation.
69    pub intercompany_enabled: bool,
70    /// Enable referential integrity validation.
71    pub referential_enabled: bool,
72    /// Tolerance for balance differences.
73    pub balance_tolerance: Decimal,
74}
75
76impl Default for CoherenceConfig {
77    fn default() -> Self {
78        Self {
79            balance_enabled: true,
80            subledger_enabled: true,
81            document_chain_enabled: true,
82            intercompany_enabled: true,
83            referential_enabled: true,
84            balance_tolerance: Decimal::new(1, 2), // 0.01
85        }
86    }
87}
88
89/// Data quality evaluation configuration.
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct QualityConfig {
92    /// Enable uniqueness validation.
93    pub uniqueness_enabled: bool,
94    /// Enable completeness validation.
95    pub completeness_enabled: bool,
96    /// Enable format consistency validation.
97    pub format_enabled: bool,
98    /// Enable cross-field consistency validation.
99    pub consistency_enabled: bool,
100    /// Similarity threshold for near-duplicate detection (0.0-1.0).
101    pub near_duplicate_threshold: f64,
102}
103
104impl Default for QualityConfig {
105    fn default() -> Self {
106        Self {
107            uniqueness_enabled: true,
108            completeness_enabled: true,
109            format_enabled: true,
110            consistency_enabled: true,
111            near_duplicate_threshold: 0.95,
112        }
113    }
114}
115
116/// ML-readiness evaluation configuration.
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct MlConfig {
119    /// Enable feature distribution analysis.
120    pub features_enabled: bool,
121    /// Enable label quality analysis.
122    pub labels_enabled: bool,
123    /// Enable train/test split validation.
124    pub splits_enabled: bool,
125    /// Enable graph structure analysis.
126    pub graph_enabled: bool,
127}
128
129impl Default for MlConfig {
130    fn default() -> Self {
131        Self {
132            features_enabled: true,
133            labels_enabled: true,
134            splits_enabled: true,
135            graph_enabled: true,
136        }
137    }
138}
139
140/// Report generation configuration.
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct ReportConfig {
143    /// Generate HTML report.
144    pub html_enabled: bool,
145    /// Generate JSON report.
146    pub json_enabled: bool,
147    /// Include charts in HTML report.
148    pub charts_enabled: bool,
149    /// Path to baseline report for comparison.
150    pub baseline_path: Option<String>,
151}
152
153impl Default for ReportConfig {
154    fn default() -> Self {
155        Self {
156            html_enabled: true,
157            json_enabled: true,
158            charts_enabled: true,
159            baseline_path: None,
160        }
161    }
162}
163
164/// Pass/fail thresholds for evaluation metrics.
165#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct EvaluationThresholds {
167    // Statistical thresholds
168    /// Minimum p-value for Benford's Law chi-squared test.
169    pub benford_p_value_min: f64,
170    /// Maximum Mean Absolute Deviation for Benford's Law.
171    pub benford_mad_max: f64,
172    /// Minimum p-value for amount distribution KS test.
173    pub amount_ks_p_value_min: f64,
174    /// Minimum correlation for temporal patterns.
175    pub temporal_correlation_min: f64,
176
177    // Drift detection thresholds
178    /// Minimum drift magnitude to consider significant.
179    pub drift_magnitude_min: f64,
180    /// Maximum Hellinger distance threshold.
181    pub drift_hellinger_max: f64,
182    /// Maximum Population Stability Index (PSI) threshold.
183    pub drift_psi_max: f64,
184    /// Minimum F1 score for drift detection quality.
185    pub drift_f1_score_min: f64,
186
187    // Coherence thresholds
188    /// Maximum balance sheet imbalance.
189    pub balance_tolerance: Decimal,
190    /// Minimum subledger reconciliation rate.
191    pub subledger_reconciliation_rate_min: f64,
192    /// Minimum document chain completion rate.
193    pub document_chain_completion_min: f64,
194    /// Minimum intercompany match rate.
195    pub ic_match_rate_min: f64,
196    /// Minimum referential integrity rate.
197    pub referential_integrity_min: f64,
198
199    // Quality thresholds
200    /// Maximum duplicate rate.
201    pub duplicate_rate_max: f64,
202    /// Minimum completeness rate.
203    pub completeness_rate_min: f64,
204    /// Minimum format consistency rate.
205    pub format_consistency_min: f64,
206
207    // ML thresholds
208    /// Minimum anomaly rate.
209    pub anomaly_rate_min: f64,
210    /// Maximum anomaly rate.
211    pub anomaly_rate_max: f64,
212    /// Minimum label coverage.
213    pub label_coverage_min: f64,
214    /// Minimum train ratio.
215    pub train_ratio_min: f64,
216    /// Minimum graph connectivity.
217    pub graph_connectivity_min: f64,
218}
219
220impl Default for EvaluationThresholds {
221    fn default() -> Self {
222        Self {
223            // Statistical
224            benford_p_value_min: 0.05,
225            benford_mad_max: 0.015,
226            amount_ks_p_value_min: 0.05,
227            temporal_correlation_min: 0.80,
228
229            // Drift detection
230            drift_magnitude_min: 0.05,
231            drift_hellinger_max: 0.30,
232            drift_psi_max: 0.25,
233            drift_f1_score_min: 0.50,
234
235            // Coherence
236            balance_tolerance: Decimal::new(1, 2), // 0.01
237            subledger_reconciliation_rate_min: 0.99,
238            document_chain_completion_min: 0.90,
239            ic_match_rate_min: 0.95,
240            referential_integrity_min: 0.99,
241
242            // Quality
243            duplicate_rate_max: 0.01,
244            completeness_rate_min: 0.95,
245            format_consistency_min: 0.99,
246
247            // ML
248            anomaly_rate_min: 0.01,
249            anomaly_rate_max: 0.20,
250            label_coverage_min: 0.99,
251            train_ratio_min: 0.60,
252            graph_connectivity_min: 0.95,
253        }
254    }
255}
256
257impl EvaluationThresholds {
258    /// Create strict thresholds for rigorous validation.
259    pub fn strict() -> Self {
260        Self {
261            benford_p_value_min: 0.10,
262            benford_mad_max: 0.010,
263            amount_ks_p_value_min: 0.10,
264            temporal_correlation_min: 0.90,
265            drift_magnitude_min: 0.03,
266            drift_hellinger_max: 0.20,
267            drift_psi_max: 0.15,
268            drift_f1_score_min: 0.70,
269            balance_tolerance: Decimal::new(1, 4), // 0.0001
270            subledger_reconciliation_rate_min: 0.999,
271            document_chain_completion_min: 0.95,
272            ic_match_rate_min: 0.99,
273            referential_integrity_min: 0.999,
274            duplicate_rate_max: 0.001,
275            completeness_rate_min: 0.99,
276            format_consistency_min: 0.999,
277            anomaly_rate_min: 0.01,
278            anomaly_rate_max: 0.10,
279            label_coverage_min: 0.999,
280            train_ratio_min: 0.70,
281            graph_connectivity_min: 0.99,
282        }
283    }
284
285    /// Create lenient thresholds for exploratory validation.
286    pub fn lenient() -> Self {
287        Self {
288            benford_p_value_min: 0.01,
289            benford_mad_max: 0.025,
290            amount_ks_p_value_min: 0.01,
291            temporal_correlation_min: 0.60,
292            drift_magnitude_min: 0.10,
293            drift_hellinger_max: 0.50,
294            drift_psi_max: 0.40,
295            drift_f1_score_min: 0.30,
296            balance_tolerance: Decimal::new(1, 1), // 0.1
297            subledger_reconciliation_rate_min: 0.90,
298            document_chain_completion_min: 0.80,
299            ic_match_rate_min: 0.85,
300            referential_integrity_min: 0.95,
301            duplicate_rate_max: 0.05,
302            completeness_rate_min: 0.90,
303            format_consistency_min: 0.95,
304            anomaly_rate_min: 0.005,
305            anomaly_rate_max: 0.30,
306            label_coverage_min: 0.95,
307            train_ratio_min: 0.50,
308            graph_connectivity_min: 0.90,
309        }
310    }
311}