datasynth_eval/statistical/
anomaly_realism.rs1use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9
/// A single injected anomaly plus the population statistics it is measured against.
///
/// The evaluator in this file reads every field except `anomaly_type`.
#[derive(Debug, Clone)]
pub struct AnomalyData {
    /// Identifier of this anomaly; other records may name it in `parent_anomaly_id`.
    pub anomaly_id: String,
    /// Type label of the anomaly. Not consulted by the evaluator in this file.
    pub anomaly_type: String,
    /// Observed value; its absolute distance from `population_mean` drives the z-score.
    pub value: f64,
    /// Mean of the reference population.
    pub population_mean: f64,
    /// Standard deviation of the reference population. Records where this is not
    /// greater than `f64::EPSILON` are left out of the z-score statistics.
    pub population_std: f64,
    /// Identifier of the parent anomaly when this record is a cascaded anomaly.
    pub parent_anomaly_id: Option<String>,
    /// Identifier of the scheme this anomaly is grouped under, if any.
    pub scheme_id: Option<String>,
    /// Participants of this anomaly; compared across anomalies sharing a `scheme_id`.
    pub participants: Vec<String>,
}
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct AnomalyRealismThresholds {
34 pub min_avg_z_score: f64,
36 pub min_cascade_coherence: f64,
38 pub min_scheme_consistency: f64,
40}
41
42impl Default for AnomalyRealismThresholds {
43 fn default() -> Self {
44 Self {
45 min_avg_z_score: 2.0,
46 min_cascade_coherence: 0.90,
47 min_scheme_consistency: 0.85,
48 }
49 }
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct AnomalyRealismEvaluation {
55 pub statistical_detectability: f64,
57 pub avg_anomaly_z_score: f64,
59 pub cascade_coherence: f64,
61 pub scheme_participant_consistency: f64,
63 pub total_anomalies: usize,
65 pub cascaded_count: usize,
67 pub scheme_count: usize,
69 pub passes: bool,
71 pub issues: Vec<String>,
73}
74
75pub struct AnomalyRealismEvaluator {
77 thresholds: AnomalyRealismThresholds,
78}
79
80impl AnomalyRealismEvaluator {
81 pub fn new() -> Self {
83 Self {
84 thresholds: AnomalyRealismThresholds::default(),
85 }
86 }
87
88 pub fn with_thresholds(thresholds: AnomalyRealismThresholds) -> Self {
90 Self { thresholds }
91 }
92
93 pub fn evaluate(&self, anomalies: &[AnomalyData]) -> EvalResult<AnomalyRealismEvaluation> {
95 let mut issues = Vec::new();
96
97 if anomalies.is_empty() {
98 return Ok(AnomalyRealismEvaluation {
99 statistical_detectability: 1.0,
100 avg_anomaly_z_score: 0.0,
101 cascade_coherence: 1.0,
102 scheme_participant_consistency: 1.0,
103 total_anomalies: 0,
104 cascaded_count: 0,
105 scheme_count: 0,
106 passes: true,
107 issues: Vec::new(),
108 });
109 }
110
111 let z_scores: Vec<f64> = anomalies
113 .iter()
114 .filter(|a| a.population_std > f64::EPSILON)
115 .map(|a| (a.value - a.population_mean).abs() / a.population_std)
116 .collect();
117
118 let detectable = z_scores.iter().filter(|&&z| z > 2.0).count();
119 let statistical_detectability = if z_scores.is_empty() {
120 1.0
121 } else {
122 detectable as f64 / z_scores.len() as f64
123 };
124
125 let avg_anomaly_z_score = if z_scores.is_empty() {
126 0.0
127 } else {
128 z_scores.iter().sum::<f64>() / z_scores.len() as f64
129 };
130
131 let all_ids: HashSet<&str> = anomalies.iter().map(|a| a.anomaly_id.as_str()).collect();
133 let cascaded: Vec<&AnomalyData> = anomalies
134 .iter()
135 .filter(|a| a.parent_anomaly_id.is_some())
136 .collect();
137 let cascaded_count = cascaded.len();
138
139 let cascade_valid = cascaded
140 .iter()
141 .filter(|a| {
142 a.parent_anomaly_id
143 .as_ref()
144 .map(|pid| all_ids.contains(pid.as_str()))
145 .unwrap_or(false)
146 })
147 .count();
148 let cascade_coherence = if cascaded_count == 0 {
149 1.0
150 } else {
151 cascade_valid as f64 / cascaded_count as f64
152 };
153
154 let mut schemes: HashMap<&str, Vec<HashSet<&str>>> = HashMap::new();
156 for a in anomalies {
157 if let Some(ref sid) = a.scheme_id {
158 let participants: HashSet<&str> =
159 a.participants.iter().map(|p| p.as_str()).collect();
160 schemes.entry(sid.as_str()).or_default().push(participants);
161 }
162 }
163 let scheme_count = schemes.len();
164
165 let consistent_schemes = schemes
166 .values()
167 .filter(|participant_sets| {
168 if participant_sets.len() < 2 {
169 return true;
170 }
171 let first = &participant_sets[0];
173 participant_sets[1..]
174 .iter()
175 .all(|ps| !first.is_disjoint(ps))
176 })
177 .count();
178 let scheme_participant_consistency = if scheme_count == 0 {
179 1.0
180 } else {
181 consistent_schemes as f64 / scheme_count as f64
182 };
183
184 if avg_anomaly_z_score < self.thresholds.min_avg_z_score && !z_scores.is_empty() {
186 issues.push(format!(
187 "Avg anomaly z-score {:.2} < {:.2}",
188 avg_anomaly_z_score, self.thresholds.min_avg_z_score
189 ));
190 }
191 if cascade_coherence < self.thresholds.min_cascade_coherence {
192 issues.push(format!(
193 "Cascade coherence {:.3} < {:.3}",
194 cascade_coherence, self.thresholds.min_cascade_coherence
195 ));
196 }
197 if scheme_participant_consistency < self.thresholds.min_scheme_consistency {
198 issues.push(format!(
199 "Scheme participant consistency {:.3} < {:.3}",
200 scheme_participant_consistency, self.thresholds.min_scheme_consistency
201 ));
202 }
203
204 let passes = issues.is_empty();
205
206 Ok(AnomalyRealismEvaluation {
207 statistical_detectability,
208 avg_anomaly_z_score,
209 cascade_coherence,
210 scheme_participant_consistency,
211 total_anomalies: anomalies.len(),
212 cascaded_count,
213 scheme_count,
214 passes,
215 issues,
216 })
217 }
218}
219
220impl Default for AnomalyRealismEvaluator {
221 fn default() -> Self {
222 Self::new()
223 }
224}
225
#[cfg(test)]
// Tests unwrap freely: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a scheme-less record measured against a population with
    /// mean 10 000 and standard deviation 5 000.
    fn record(id: &str, kind: &str, value: f64, parent: Option<&str>) -> AnomalyData {
        AnomalyData {
            anomaly_id: id.to_string(),
            anomaly_type: kind.to_string(),
            value,
            population_mean: 10_000.0,
            population_std: 5_000.0,
            parent_anomaly_id: parent.map(String::from),
            scheme_id: None,
            participants: vec![],
        }
    }

    #[test]
    fn test_detectable_anomalies() {
        let anomalies = [
            record("A001", "unusual_amount", 100_000.0, None),
            record("A002", "unusual_amount", 50_000.0, None),
        ];

        let outcome = AnomalyRealismEvaluator::new()
            .evaluate(&anomalies)
            .unwrap();
        assert!(outcome.passes);
        assert!(outcome.avg_anomaly_z_score > 2.0);
    }

    #[test]
    fn test_undetectable_anomalies() {
        // The value sits only 0.02 standard deviations above the mean.
        let anomalies = [record("A001", "subtle", 10_100.0, None)];

        let outcome = AnomalyRealismEvaluator::new()
            .evaluate(&anomalies)
            .unwrap();
        assert!(!outcome.passes);
    }

    #[test]
    fn test_cascade_coherence() {
        // The cascaded record points at a parent that is part of the same batch.
        let anomalies = [
            record("A001", "root", 50_000.0, None),
            record("A002", "cascade", 50_000.0, Some("A001")),
        ];

        let outcome = AnomalyRealismEvaluator::new()
            .evaluate(&anomalies)
            .unwrap();
        assert_eq!(outcome.cascade_coherence, 1.0);
    }

    #[test]
    fn test_empty() {
        let outcome = AnomalyRealismEvaluator::new().evaluate(&[]).unwrap();
        assert!(outcome.passes);
    }
}