datasynth_eval/statistical/
mod.rs1mod amount_distribution;
18mod anderson_darling;
19mod benford;
20mod chi_squared;
21mod correlation;
22mod drift_detection;
23mod line_item;
24mod temporal;
25
26pub use amount_distribution::{AmountDistributionAnalysis, AmountDistributionAnalyzer};
27pub use anderson_darling::{
28 AndersonDarlingAnalysis, AndersonDarlingAnalyzer, CriticalValues, FittedParameters,
29 TargetDistribution,
30};
31pub use benford::{BenfordAnalysis, BenfordAnalyzer, BenfordConformity};
32pub use chi_squared::{
33 BinFrequency, BinningStrategy, ChiSquaredAnalysis, ChiSquaredAnalyzer, ExpectedDistribution,
34};
35pub use correlation::{
36 pearson_correlation, spearman_correlation, CorrelationAnalysis, CorrelationAnalyzer,
37 CorrelationCheckResult, ExpectedCorrelation,
38};
39pub use drift_detection::{
40 DetectionDifficulty, DriftDetectionAnalysis, DriftDetectionAnalyzer, DriftDetectionEntry,
41 DriftDetectionMetrics, DriftEventCategory, LabeledDriftEvent, LabeledEventAnalysis,
42};
43pub use line_item::{LineItemAnalysis, LineItemAnalyzer, LineItemEntry};
44pub use temporal::{TemporalAnalysis, TemporalAnalyzer, TemporalEntry};
45
46use serde::{Deserialize, Serialize};
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct StatisticalEvaluation {
51 pub benford: Option<BenfordAnalysis>,
53 pub amount_distribution: Option<AmountDistributionAnalysis>,
55 pub line_item: Option<LineItemAnalysis>,
57 pub temporal: Option<TemporalAnalysis>,
59 pub correlation: Option<CorrelationAnalysis>,
61 pub anderson_darling: Option<AndersonDarlingAnalysis>,
63 pub chi_squared: Option<ChiSquaredAnalysis>,
65 pub drift_detection: Option<DriftDetectionAnalysis>,
67 pub drift_events: Option<LabeledEventAnalysis>,
69 pub passes: bool,
71 pub failures: Vec<String>,
73 pub issues: Vec<String>,
75 pub overall_score: f64,
77}
78
79impl StatisticalEvaluation {
80 pub fn new() -> Self {
82 Self {
83 benford: None,
84 amount_distribution: None,
85 line_item: None,
86 temporal: None,
87 correlation: None,
88 anderson_darling: None,
89 chi_squared: None,
90 drift_detection: None,
91 drift_events: None,
92 passes: true,
93 failures: Vec::new(),
94 issues: Vec::new(),
95 overall_score: 1.0,
96 }
97 }
98
99 pub fn check_thresholds(&mut self, thresholds: &crate::config::EvaluationThresholds) {
101 self.failures.clear();
102 self.issues.clear();
103 let mut scores = Vec::new();
104
105 if let Some(ref benford) = self.benford {
106 if benford.p_value < thresholds.benford_p_value_min {
107 self.failures.push(format!(
108 "Benford p-value {} < {} (threshold)",
109 benford.p_value, thresholds.benford_p_value_min
110 ));
111 }
112 if benford.mad > thresholds.benford_mad_max {
113 self.failures.push(format!(
114 "Benford MAD {} > {} (threshold)",
115 benford.mad, thresholds.benford_mad_max
116 ));
117 }
118 let p_score = (benford.p_value / 0.5).min(1.0);
120 let mad_score = 1.0 - (benford.mad / 0.05).min(1.0);
121 scores.push((p_score + mad_score) / 2.0);
122 }
123
124 if let Some(ref amount) = self.amount_distribution {
125 if let Some(p_value) = amount.lognormal_ks_pvalue {
126 if p_value < thresholds.amount_ks_p_value_min {
127 self.failures.push(format!(
128 "Amount KS p-value {} < {} (threshold)",
129 p_value, thresholds.amount_ks_p_value_min
130 ));
131 }
132 scores.push((p_value / 0.5).min(1.0));
133 }
134 }
135
136 if let Some(ref temporal) = self.temporal {
137 if temporal.pattern_correlation < thresholds.temporal_correlation_min {
138 self.failures.push(format!(
139 "Temporal correlation {} < {} (threshold)",
140 temporal.pattern_correlation, thresholds.temporal_correlation_min
141 ));
142 }
143 scores.push(temporal.pattern_correlation);
144 }
145
146 if let Some(ref correlation) = self.correlation {
148 if !correlation.passes {
149 for issue in &correlation.issues {
150 self.failures.push(format!("Correlation: {}", issue));
151 }
152 }
153 let total_checks = correlation.checks_passed + correlation.checks_failed;
155 if total_checks > 0 {
156 scores.push(correlation.checks_passed as f64 / total_checks as f64);
157 }
158 }
159
160 if let Some(ref ad) = self.anderson_darling {
162 if !ad.passes {
163 for issue in &ad.issues {
164 self.failures.push(format!("Anderson-Darling: {}", issue));
165 }
166 }
167 scores.push((ad.p_value / 0.5).min(1.0));
169 }
170
171 if let Some(ref chi_sq) = self.chi_squared {
173 if !chi_sq.passes {
174 for issue in &chi_sq.issues {
175 self.failures.push(format!("Chi-squared: {}", issue));
176 }
177 }
178 scores.push((chi_sq.p_value / 0.5).min(1.0));
180 }
181
182 if let Some(ref drift) = self.drift_detection {
184 if !drift.passes {
185 for issue in &drift.issues {
186 self.failures.push(format!("Drift detection: {}", issue));
187 }
188 }
189 if drift.drift_magnitude >= thresholds.drift_magnitude_min {
191 scores.push(drift.detection_metrics.f1_score);
192 }
193 if let Some(hellinger) = drift.hellinger_distance {
195 if hellinger > thresholds.drift_hellinger_max {
196 self.failures.push(format!(
197 "Drift Hellinger distance {} > {} (threshold)",
198 hellinger, thresholds.drift_hellinger_max
199 ));
200 }
201 }
202 if let Some(psi) = drift.psi {
204 if psi > thresholds.drift_psi_max {
205 self.failures.push(format!(
206 "Drift PSI {} > {} (threshold)",
207 psi, thresholds.drift_psi_max
208 ));
209 }
210 }
211 }
212
213 if let Some(ref events) = self.drift_events {
215 if !events.passes {
216 for issue in &events.issues {
217 self.failures.push(format!("Drift events: {}", issue));
218 }
219 }
220 if events.total_events > 0 {
222 let difficulty_score = 1.0 - events.avg_difficulty;
223 scores.push(difficulty_score);
224 }
225 }
226
227 self.issues = self.failures.clone();
229 self.passes = self.failures.is_empty();
230
231 self.overall_score = if scores.is_empty() {
233 1.0
234 } else {
235 scores.iter().sum::<f64>() / scores.len() as f64
236 };
237 }
238}
239
240impl Default for StatisticalEvaluation {
241 fn default() -> Self {
242 Self::new()
243 }
244}