Skip to main content

datasynth_eval/
config.rs

1//! Configuration for the evaluation framework.
2
3use rust_decimal::Decimal;
4use serde::{Deserialize, Serialize};
5
6/// Main configuration for running an evaluation.
7#[derive(Debug, Clone, Serialize, Deserialize, Default)]
8pub struct EvaluationConfig {
9    /// Statistical evaluation settings.
10    pub statistical: StatisticalConfig,
11    /// Coherence evaluation settings.
12    pub coherence: CoherenceConfig,
13    /// Data quality evaluation settings.
14    pub quality: QualityConfig,
15    /// ML-readiness evaluation settings.
16    pub ml: MlConfig,
17    /// Privacy evaluation settings.
18    #[serde(default)]
19    pub privacy: PrivacyEvaluationConfig,
20    /// Report generation settings.
21    pub report: ReportConfig,
22    /// Pass/fail thresholds.
23    pub thresholds: EvaluationThresholds,
24    /// Quality gate configuration.
25    #[serde(default)]
26    pub quality_gates: QualityGateConfig,
27}
28
29/// Configuration for quality gates.
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct QualityGateConfig {
32    /// Whether quality gate evaluation is enabled.
33    #[serde(default)]
34    pub enabled: bool,
35    /// Profile name: "strict", "default", "lenient", or "custom".
36    #[serde(default = "default_gate_profile")]
37    pub profile: String,
38    /// Custom gate definitions (used when profile = "custom").
39    #[serde(default)]
40    pub custom_gates: Vec<CustomGateConfig>,
41    /// Whether to fail the generation run on gate violations.
42    #[serde(default)]
43    pub fail_on_violation: bool,
44}
45
/// Profile name used when a quality gate configuration does not specify one.
fn default_gate_profile() -> String {
    String::from("default")
}
49
50impl Default for QualityGateConfig {
51    fn default() -> Self {
52        Self {
53            enabled: false,
54            profile: default_gate_profile(),
55            custom_gates: Vec::new(),
56            fail_on_violation: false,
57        }
58    }
59}
60
61/// Configuration for a custom quality gate.
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct CustomGateConfig {
64    /// Gate name.
65    pub name: String,
66    /// Metric to check (e.g., "benford_mad", "completion_rate", "duplicate_rate").
67    pub metric: String,
68    /// Threshold value.
69    pub threshold: f64,
70    /// Upper threshold for "between" comparison.
71    #[serde(default)]
72    pub upper_threshold: Option<f64>,
73    /// Comparison: "gte", "lte", "eq", "between".
74    #[serde(default = "default_comparison")]
75    pub comparison: String,
76}
77
/// Comparison operator used when a custom gate does not specify one.
fn default_comparison() -> String {
    String::from("gte")
}
81
82/// Privacy evaluation configuration.
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct PrivacyEvaluationConfig {
85    /// Enable membership inference attack testing.
86    pub mia_enabled: bool,
87    /// Enable linkage attack assessment.
88    pub linkage_enabled: bool,
89    /// Enable NIST SP 800-226 alignment report.
90    pub nist_alignment_enabled: bool,
91    /// Enable SynQP quality-privacy matrix.
92    pub synqp_enabled: bool,
93    /// Maximum AUC-ROC threshold for MIA (default: 0.6).
94    pub mia_auc_threshold: f64,
95    /// Maximum re-identification rate for linkage (default: 0.05).
96    pub max_reidentification_rate: f64,
97    /// Minimum k-anonymity for linkage (default: 5).
98    pub min_k_anonymity: usize,
99}
100
101impl Default for PrivacyEvaluationConfig {
102    fn default() -> Self {
103        Self {
104            mia_enabled: false,
105            linkage_enabled: false,
106            nist_alignment_enabled: false,
107            synqp_enabled: false,
108            mia_auc_threshold: 0.6,
109            max_reidentification_rate: 0.05,
110            min_k_anonymity: 5,
111        }
112    }
113}
114
115/// Statistical evaluation configuration.
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct StatisticalConfig {
118    /// Enable Benford's Law analysis.
119    pub benford_enabled: bool,
120    /// Enable amount distribution analysis.
121    pub amount_distribution_enabled: bool,
122    /// Enable line item distribution analysis.
123    pub line_item_enabled: bool,
124    /// Enable temporal pattern analysis.
125    pub temporal_enabled: bool,
126    /// Enable drift detection analysis.
127    pub drift_detection_enabled: bool,
128    /// Significance level for statistical tests (default: 0.05).
129    pub significance_level: f64,
130    /// Minimum sample size for statistical tests.
131    pub min_sample_size: usize,
132    /// Window size for drift detection rolling statistics.
133    pub drift_window_size: usize,
134}
135
136impl Default for StatisticalConfig {
137    fn default() -> Self {
138        Self {
139            benford_enabled: true,
140            amount_distribution_enabled: true,
141            line_item_enabled: true,
142            temporal_enabled: true,
143            drift_detection_enabled: true,
144            significance_level: 0.05,
145            min_sample_size: 100,
146            drift_window_size: 10,
147        }
148    }
149}
150
151/// Coherence evaluation configuration.
152#[derive(Debug, Clone, Serialize, Deserialize)]
153pub struct CoherenceConfig {
154    /// Enable balance sheet validation.
155    pub balance_enabled: bool,
156    /// Enable subledger reconciliation.
157    pub subledger_enabled: bool,
158    /// Enable document chain validation.
159    pub document_chain_enabled: bool,
160    /// Enable intercompany matching validation.
161    pub intercompany_enabled: bool,
162    /// Enable referential integrity validation.
163    pub referential_enabled: bool,
164    /// Tolerance for balance differences.
165    pub balance_tolerance: Decimal,
166}
167
168impl Default for CoherenceConfig {
169    fn default() -> Self {
170        Self {
171            balance_enabled: true,
172            subledger_enabled: true,
173            document_chain_enabled: true,
174            intercompany_enabled: true,
175            referential_enabled: true,
176            balance_tolerance: Decimal::new(1, 2), // 0.01
177        }
178    }
179}
180
181/// Data quality evaluation configuration.
182#[derive(Debug, Clone, Serialize, Deserialize)]
183pub struct QualityConfig {
184    /// Enable uniqueness validation.
185    pub uniqueness_enabled: bool,
186    /// Enable completeness validation.
187    pub completeness_enabled: bool,
188    /// Enable format consistency validation.
189    pub format_enabled: bool,
190    /// Enable cross-field consistency validation.
191    pub consistency_enabled: bool,
192    /// Similarity threshold for near-duplicate detection (0.0-1.0).
193    pub near_duplicate_threshold: f64,
194}
195
196impl Default for QualityConfig {
197    fn default() -> Self {
198        Self {
199            uniqueness_enabled: true,
200            completeness_enabled: true,
201            format_enabled: true,
202            consistency_enabled: true,
203            near_duplicate_threshold: 0.95,
204        }
205    }
206}
207
208/// ML-readiness evaluation configuration.
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct MlConfig {
211    /// Enable feature distribution analysis.
212    pub features_enabled: bool,
213    /// Enable label quality analysis.
214    pub labels_enabled: bool,
215    /// Enable train/test split validation.
216    pub splits_enabled: bool,
217    /// Enable graph structure analysis.
218    pub graph_enabled: bool,
219}
220
221impl Default for MlConfig {
222    fn default() -> Self {
223        Self {
224            features_enabled: true,
225            labels_enabled: true,
226            splits_enabled: true,
227            graph_enabled: true,
228        }
229    }
230}
231
232/// Report generation configuration.
233#[derive(Debug, Clone, Serialize, Deserialize)]
234pub struct ReportConfig {
235    /// Generate HTML report.
236    pub html_enabled: bool,
237    /// Generate JSON report.
238    pub json_enabled: bool,
239    /// Include charts in HTML report.
240    pub charts_enabled: bool,
241    /// Path to baseline report for comparison.
242    pub baseline_path: Option<String>,
243}
244
245impl Default for ReportConfig {
246    fn default() -> Self {
247        Self {
248            html_enabled: true,
249            json_enabled: true,
250            charts_enabled: true,
251            baseline_path: None,
252        }
253    }
254}
255
256/// Pass/fail thresholds for evaluation metrics.
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct EvaluationThresholds {
259    // Statistical thresholds
260    /// Minimum p-value for Benford's Law chi-squared test.
261    pub benford_p_value_min: f64,
262    /// Maximum Mean Absolute Deviation for Benford's Law.
263    pub benford_mad_max: f64,
264    /// Minimum p-value for amount distribution KS test.
265    pub amount_ks_p_value_min: f64,
266    /// Minimum correlation for temporal patterns.
267    pub temporal_correlation_min: f64,
268
269    // Drift detection thresholds
270    /// Minimum drift magnitude to consider significant.
271    pub drift_magnitude_min: f64,
272    /// Maximum Hellinger distance threshold.
273    pub drift_hellinger_max: f64,
274    /// Maximum Population Stability Index (PSI) threshold.
275    pub drift_psi_max: f64,
276    /// Minimum F1 score for drift detection quality.
277    pub drift_f1_score_min: f64,
278
279    // Coherence thresholds
280    /// Maximum balance sheet imbalance.
281    pub balance_tolerance: Decimal,
282    /// Minimum subledger reconciliation rate.
283    pub subledger_reconciliation_rate_min: f64,
284    /// Minimum document chain completion rate.
285    pub document_chain_completion_min: f64,
286    /// Minimum intercompany match rate.
287    pub ic_match_rate_min: f64,
288    /// Minimum referential integrity rate.
289    pub referential_integrity_min: f64,
290
291    // Quality thresholds
292    /// Maximum duplicate rate.
293    pub duplicate_rate_max: f64,
294    /// Minimum completeness rate.
295    pub completeness_rate_min: f64,
296    /// Minimum format consistency rate.
297    pub format_consistency_min: f64,
298
299    // ML thresholds
300    /// Minimum anomaly rate.
301    pub anomaly_rate_min: f64,
302    /// Maximum anomaly rate.
303    pub anomaly_rate_max: f64,
304    /// Minimum label coverage.
305    pub label_coverage_min: f64,
306    /// Minimum train ratio.
307    pub train_ratio_min: f64,
308    /// Minimum graph connectivity.
309    pub graph_connectivity_min: f64,
310}
311
312impl Default for EvaluationThresholds {
313    fn default() -> Self {
314        Self {
315            // Statistical
316            benford_p_value_min: 0.05,
317            benford_mad_max: 0.015,
318            amount_ks_p_value_min: 0.05,
319            temporal_correlation_min: 0.80,
320
321            // Drift detection
322            drift_magnitude_min: 0.05,
323            drift_hellinger_max: 0.30,
324            drift_psi_max: 0.25,
325            drift_f1_score_min: 0.50,
326
327            // Coherence
328            balance_tolerance: Decimal::new(1, 2), // 0.01
329            subledger_reconciliation_rate_min: 0.99,
330            document_chain_completion_min: 0.90,
331            ic_match_rate_min: 0.95,
332            referential_integrity_min: 0.99,
333
334            // Quality
335            duplicate_rate_max: 0.01,
336            completeness_rate_min: 0.95,
337            format_consistency_min: 0.99,
338
339            // ML
340            anomaly_rate_min: 0.01,
341            anomaly_rate_max: 0.20,
342            label_coverage_min: 0.99,
343            train_ratio_min: 0.60,
344            graph_connectivity_min: 0.95,
345        }
346    }
347}
348
349impl EvaluationThresholds {
350    /// Create strict thresholds for rigorous validation.
351    pub fn strict() -> Self {
352        Self {
353            benford_p_value_min: 0.10,
354            benford_mad_max: 0.010,
355            amount_ks_p_value_min: 0.10,
356            temporal_correlation_min: 0.90,
357            drift_magnitude_min: 0.03,
358            drift_hellinger_max: 0.20,
359            drift_psi_max: 0.15,
360            drift_f1_score_min: 0.70,
361            balance_tolerance: Decimal::new(1, 4), // 0.0001
362            subledger_reconciliation_rate_min: 0.999,
363            document_chain_completion_min: 0.95,
364            ic_match_rate_min: 0.99,
365            referential_integrity_min: 0.999,
366            duplicate_rate_max: 0.001,
367            completeness_rate_min: 0.99,
368            format_consistency_min: 0.999,
369            anomaly_rate_min: 0.01,
370            anomaly_rate_max: 0.10,
371            label_coverage_min: 0.999,
372            train_ratio_min: 0.70,
373            graph_connectivity_min: 0.99,
374        }
375    }
376
377    /// Create lenient thresholds for exploratory validation.
378    pub fn lenient() -> Self {
379        Self {
380            benford_p_value_min: 0.01,
381            benford_mad_max: 0.025,
382            amount_ks_p_value_min: 0.01,
383            temporal_correlation_min: 0.60,
384            drift_magnitude_min: 0.10,
385            drift_hellinger_max: 0.50,
386            drift_psi_max: 0.40,
387            drift_f1_score_min: 0.30,
388            balance_tolerance: Decimal::new(1, 1), // 0.1
389            subledger_reconciliation_rate_min: 0.90,
390            document_chain_completion_min: 0.80,
391            ic_match_rate_min: 0.85,
392            referential_integrity_min: 0.95,
393            duplicate_rate_max: 0.05,
394            completeness_rate_min: 0.90,
395            format_consistency_min: 0.95,
396            anomaly_rate_min: 0.005,
397            anomaly_rate_max: 0.30,
398            label_coverage_min: 0.95,
399            train_ratio_min: 0.50,
400            graph_connectivity_min: 0.90,
401        }
402    }
403}