datasynth_eval/statistical/
mod.rs1mod amount_distribution;
18mod anderson_darling;
19mod anomaly_realism;
20mod benford;
21mod chi_squared;
22mod correlation;
23mod drift_detection;
24mod line_item;
25mod relational_fidelity;
26mod temporal;
27
28pub use amount_distribution::{AmountDistributionAnalysis, AmountDistributionAnalyzer};
29pub use anderson_darling::{
30 AndersonDarlingAnalysis, AndersonDarlingAnalyzer, CriticalValues, FittedParameters,
31 TargetDistribution,
32};
33pub use benford::{BenfordAnalysis, BenfordAnalyzer, BenfordConformity, SecondDigitAnalysis};
34pub use chi_squared::{
35 BinFrequency, BinningStrategy, ChiSquaredAnalysis, ChiSquaredAnalyzer, ExpectedDistribution,
36};
37pub use correlation::{
38 pearson_correlation, spearman_correlation, CorrelationAnalysis, CorrelationAnalyzer,
39 CorrelationCheckResult, ExpectedCorrelation,
40};
41pub use drift_detection::{
42 DetectionDifficulty, DriftDetectionAnalysis, DriftDetectionAnalyzer, DriftDetectionEntry,
43 DriftDetectionMetrics, DriftEventCategory, LabeledDriftEvent, LabeledEventAnalysis,
44};
45pub use line_item::{LineItemAnalysis, LineItemAnalyzer, LineItemEntry};
46pub use relational_fidelity::{
47 flow_edges_from_entries, FlowEdge, RelationalFidelityAnalyzer, RelationalFidelityReport,
48 RelationalFidelityThresholds,
49};
50pub use temporal::{TemporalAnalysis, TemporalAnalyzer, TemporalEntry};
51
52pub use anomaly_realism::{
53 AnomalyData, AnomalyRealismEvaluation, AnomalyRealismEvaluator, AnomalyRealismThresholds,
54};
55
56use serde::{Deserialize, Serialize};
57
58#[derive(Debug, Clone, Serialize, Deserialize)]
60pub struct StatisticalEvaluation {
61 pub benford: Option<BenfordAnalysis>,
63 pub amount_distribution: Option<AmountDistributionAnalysis>,
65 pub line_item: Option<LineItemAnalysis>,
67 pub temporal: Option<TemporalAnalysis>,
69 pub correlation: Option<CorrelationAnalysis>,
71 pub anderson_darling: Option<AndersonDarlingAnalysis>,
73 pub chi_squared: Option<ChiSquaredAnalysis>,
75 pub drift_detection: Option<DriftDetectionAnalysis>,
77 pub drift_events: Option<LabeledEventAnalysis>,
79 #[serde(default, skip_serializing_if = "Option::is_none")]
81 pub anomaly_realism: Option<AnomalyRealismEvaluation>,
82 pub passes: bool,
84 pub failures: Vec<String>,
86 pub issues: Vec<String>,
88 pub overall_score: f64,
90}
91
92impl StatisticalEvaluation {
93 pub fn new() -> Self {
95 Self {
96 benford: None,
97 amount_distribution: None,
98 line_item: None,
99 temporal: None,
100 correlation: None,
101 anderson_darling: None,
102 chi_squared: None,
103 drift_detection: None,
104 drift_events: None,
105 anomaly_realism: None,
106 passes: true,
107 failures: Vec::new(),
108 issues: Vec::new(),
109 overall_score: 1.0,
110 }
111 }
112
113 pub fn check_thresholds(&mut self, thresholds: &crate::config::EvaluationThresholds) {
115 self.failures.clear();
116 self.issues.clear();
117 let mut scores = Vec::new();
118
119 if let Some(ref benford) = self.benford {
120 if benford.p_value < thresholds.benford_p_value_min {
121 self.failures.push(format!(
122 "Benford p-value {} < {} (threshold)",
123 benford.p_value, thresholds.benford_p_value_min
124 ));
125 }
126 if benford.mad > thresholds.benford_mad_max {
127 self.failures.push(format!(
128 "Benford MAD {} > {} (threshold)",
129 benford.mad, thresholds.benford_mad_max
130 ));
131 }
132 let p_score = (benford.p_value / 0.5).min(1.0);
134 let mad_score = 1.0 - (benford.mad / 0.05).min(1.0);
135 scores.push((p_score + mad_score) / 2.0);
136 }
137
138 if let Some(ref amount) = self.amount_distribution {
139 if let Some(p_value) = amount.lognormal_ks_pvalue {
140 if p_value < thresholds.amount_ks_p_value_min {
141 self.failures.push(format!(
142 "Amount KS p-value {} < {} (threshold)",
143 p_value, thresholds.amount_ks_p_value_min
144 ));
145 }
146 scores.push((p_value / 0.5).min(1.0));
147 }
148 }
149
150 if let Some(ref temporal) = self.temporal {
151 if temporal.pattern_correlation < thresholds.temporal_correlation_min {
152 self.failures.push(format!(
153 "Temporal correlation {} < {} (threshold)",
154 temporal.pattern_correlation, thresholds.temporal_correlation_min
155 ));
156 }
157 scores.push(temporal.pattern_correlation);
158 }
159
160 if let Some(ref correlation) = self.correlation {
162 if !correlation.passes {
163 for issue in &correlation.issues {
164 self.failures.push(format!("Correlation: {issue}"));
165 }
166 }
167 let total_checks = correlation.checks_passed + correlation.checks_failed;
169 if total_checks > 0 {
170 scores.push(correlation.checks_passed as f64 / total_checks as f64);
171 }
172 }
173
174 if let Some(ref ad) = self.anderson_darling {
176 if !ad.passes {
177 for issue in &ad.issues {
178 self.failures.push(format!("Anderson-Darling: {issue}"));
179 }
180 }
181 scores.push((ad.p_value / 0.5).min(1.0));
183 }
184
185 if let Some(ref chi_sq) = self.chi_squared {
187 if !chi_sq.passes {
188 for issue in &chi_sq.issues {
189 self.failures.push(format!("Chi-squared: {issue}"));
190 }
191 }
192 scores.push((chi_sq.p_value / 0.5).min(1.0));
194 }
195
196 if let Some(ref drift) = self.drift_detection {
198 if !drift.passes {
199 for issue in &drift.issues {
200 self.failures.push(format!("Drift detection: {issue}"));
201 }
202 }
203 if drift.drift_magnitude >= thresholds.drift_magnitude_min {
205 scores.push(drift.detection_metrics.f1_score);
206 }
207 if let Some(hellinger) = drift.hellinger_distance {
209 if hellinger > thresholds.drift_hellinger_max {
210 self.failures.push(format!(
211 "Drift Hellinger distance {} > {} (threshold)",
212 hellinger, thresholds.drift_hellinger_max
213 ));
214 }
215 }
216 if let Some(psi) = drift.psi {
218 if psi > thresholds.drift_psi_max {
219 self.failures.push(format!(
220 "Drift PSI {} > {} (threshold)",
221 psi, thresholds.drift_psi_max
222 ));
223 }
224 }
225 }
226
227 if let Some(ref events) = self.drift_events {
229 if !events.passes {
230 for issue in &events.issues {
231 self.failures.push(format!("Drift events: {issue}"));
232 }
233 }
234 if events.total_events > 0 {
236 let difficulty_score = 1.0 - events.avg_difficulty;
237 scores.push(difficulty_score);
238 }
239 }
240
241 if let Some(ref anomaly_realism) = self.anomaly_realism {
243 if !anomaly_realism.passes {
244 for issue in &anomaly_realism.issues {
245 self.failures.push(format!("Anomaly realism: {issue}"));
246 }
247 }
248 scores.push(anomaly_realism.statistical_detectability);
250 }
251
252 self.issues = self.failures.clone();
254 self.passes = self.failures.is_empty();
255
256 self.overall_score = if scores.is_empty() {
258 1.0
259 } else {
260 scores.iter().sum::<f64>() / scores.len() as f64
261 };
262 }
263}
264
265impl Default for StatisticalEvaluation {
266 fn default() -> Self {
267 Self::new()
268 }
269}