datasynth_eval/statistical/
mod.rs1mod amount_distribution;
18mod anderson_darling;
19mod anomaly_realism;
20mod benford;
21mod chi_squared;
22mod correlation;
23mod drift_detection;
24mod line_item;
25mod temporal;
26
27pub use amount_distribution::{AmountDistributionAnalysis, AmountDistributionAnalyzer};
28pub use anderson_darling::{
29 AndersonDarlingAnalysis, AndersonDarlingAnalyzer, CriticalValues, FittedParameters,
30 TargetDistribution,
31};
32pub use benford::{BenfordAnalysis, BenfordAnalyzer, BenfordConformity};
33pub use chi_squared::{
34 BinFrequency, BinningStrategy, ChiSquaredAnalysis, ChiSquaredAnalyzer, ExpectedDistribution,
35};
36pub use correlation::{
37 pearson_correlation, spearman_correlation, CorrelationAnalysis, CorrelationAnalyzer,
38 CorrelationCheckResult, ExpectedCorrelation,
39};
40pub use drift_detection::{
41 DetectionDifficulty, DriftDetectionAnalysis, DriftDetectionAnalyzer, DriftDetectionEntry,
42 DriftDetectionMetrics, DriftEventCategory, LabeledDriftEvent, LabeledEventAnalysis,
43};
44pub use line_item::{LineItemAnalysis, LineItemAnalyzer, LineItemEntry};
45pub use temporal::{TemporalAnalysis, TemporalAnalyzer, TemporalEntry};
46
47pub use anomaly_realism::{
48 AnomalyData, AnomalyRealismEvaluation, AnomalyRealismEvaluator, AnomalyRealismThresholds,
49};
50
51use serde::{Deserialize, Serialize};
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct StatisticalEvaluation {
56 pub benford: Option<BenfordAnalysis>,
58 pub amount_distribution: Option<AmountDistributionAnalysis>,
60 pub line_item: Option<LineItemAnalysis>,
62 pub temporal: Option<TemporalAnalysis>,
64 pub correlation: Option<CorrelationAnalysis>,
66 pub anderson_darling: Option<AndersonDarlingAnalysis>,
68 pub chi_squared: Option<ChiSquaredAnalysis>,
70 pub drift_detection: Option<DriftDetectionAnalysis>,
72 pub drift_events: Option<LabeledEventAnalysis>,
74 #[serde(default, skip_serializing_if = "Option::is_none")]
76 pub anomaly_realism: Option<AnomalyRealismEvaluation>,
77 pub passes: bool,
79 pub failures: Vec<String>,
81 pub issues: Vec<String>,
83 pub overall_score: f64,
85}
86
87impl StatisticalEvaluation {
88 pub fn new() -> Self {
90 Self {
91 benford: None,
92 amount_distribution: None,
93 line_item: None,
94 temporal: None,
95 correlation: None,
96 anderson_darling: None,
97 chi_squared: None,
98 drift_detection: None,
99 drift_events: None,
100 anomaly_realism: None,
101 passes: true,
102 failures: Vec::new(),
103 issues: Vec::new(),
104 overall_score: 1.0,
105 }
106 }
107
108 pub fn check_thresholds(&mut self, thresholds: &crate::config::EvaluationThresholds) {
110 self.failures.clear();
111 self.issues.clear();
112 let mut scores = Vec::new();
113
114 if let Some(ref benford) = self.benford {
115 if benford.p_value < thresholds.benford_p_value_min {
116 self.failures.push(format!(
117 "Benford p-value {} < {} (threshold)",
118 benford.p_value, thresholds.benford_p_value_min
119 ));
120 }
121 if benford.mad > thresholds.benford_mad_max {
122 self.failures.push(format!(
123 "Benford MAD {} > {} (threshold)",
124 benford.mad, thresholds.benford_mad_max
125 ));
126 }
127 let p_score = (benford.p_value / 0.5).min(1.0);
129 let mad_score = 1.0 - (benford.mad / 0.05).min(1.0);
130 scores.push((p_score + mad_score) / 2.0);
131 }
132
133 if let Some(ref amount) = self.amount_distribution {
134 if let Some(p_value) = amount.lognormal_ks_pvalue {
135 if p_value < thresholds.amount_ks_p_value_min {
136 self.failures.push(format!(
137 "Amount KS p-value {} < {} (threshold)",
138 p_value, thresholds.amount_ks_p_value_min
139 ));
140 }
141 scores.push((p_value / 0.5).min(1.0));
142 }
143 }
144
145 if let Some(ref temporal) = self.temporal {
146 if temporal.pattern_correlation < thresholds.temporal_correlation_min {
147 self.failures.push(format!(
148 "Temporal correlation {} < {} (threshold)",
149 temporal.pattern_correlation, thresholds.temporal_correlation_min
150 ));
151 }
152 scores.push(temporal.pattern_correlation);
153 }
154
155 if let Some(ref correlation) = self.correlation {
157 if !correlation.passes {
158 for issue in &correlation.issues {
159 self.failures.push(format!("Correlation: {}", issue));
160 }
161 }
162 let total_checks = correlation.checks_passed + correlation.checks_failed;
164 if total_checks > 0 {
165 scores.push(correlation.checks_passed as f64 / total_checks as f64);
166 }
167 }
168
169 if let Some(ref ad) = self.anderson_darling {
171 if !ad.passes {
172 for issue in &ad.issues {
173 self.failures.push(format!("Anderson-Darling: {}", issue));
174 }
175 }
176 scores.push((ad.p_value / 0.5).min(1.0));
178 }
179
180 if let Some(ref chi_sq) = self.chi_squared {
182 if !chi_sq.passes {
183 for issue in &chi_sq.issues {
184 self.failures.push(format!("Chi-squared: {}", issue));
185 }
186 }
187 scores.push((chi_sq.p_value / 0.5).min(1.0));
189 }
190
191 if let Some(ref drift) = self.drift_detection {
193 if !drift.passes {
194 for issue in &drift.issues {
195 self.failures.push(format!("Drift detection: {}", issue));
196 }
197 }
198 if drift.drift_magnitude >= thresholds.drift_magnitude_min {
200 scores.push(drift.detection_metrics.f1_score);
201 }
202 if let Some(hellinger) = drift.hellinger_distance {
204 if hellinger > thresholds.drift_hellinger_max {
205 self.failures.push(format!(
206 "Drift Hellinger distance {} > {} (threshold)",
207 hellinger, thresholds.drift_hellinger_max
208 ));
209 }
210 }
211 if let Some(psi) = drift.psi {
213 if psi > thresholds.drift_psi_max {
214 self.failures.push(format!(
215 "Drift PSI {} > {} (threshold)",
216 psi, thresholds.drift_psi_max
217 ));
218 }
219 }
220 }
221
222 if let Some(ref events) = self.drift_events {
224 if !events.passes {
225 for issue in &events.issues {
226 self.failures.push(format!("Drift events: {}", issue));
227 }
228 }
229 if events.total_events > 0 {
231 let difficulty_score = 1.0 - events.avg_difficulty;
232 scores.push(difficulty_score);
233 }
234 }
235
236 if let Some(ref anomaly_realism) = self.anomaly_realism {
238 if !anomaly_realism.passes {
239 for issue in &anomaly_realism.issues {
240 self.failures.push(format!("Anomaly realism: {}", issue));
241 }
242 }
243 scores.push(anomaly_realism.statistical_detectability);
245 }
246
247 self.issues = self.failures.clone();
249 self.passes = self.failures.is_empty();
250
251 self.overall_score = if scores.is_empty() {
253 1.0
254 } else {
255 scores.iter().sum::<f64>() / scores.len() as f64
256 };
257 }
258}
259
260impl Default for StatisticalEvaluation {
261 fn default() -> Self {
262 Self::new()
263 }
264}