Skip to main content

datasynth_eval/statistical/
anomaly_realism.rs

1//! Anomaly injection realism evaluator.
2//!
3//! Validates that injected anomalies produce statistically detectable patterns,
4//! cascade coherence is maintained, and multi-stage schemes share participants.
5
6use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9
/// One injected anomaly together with the statistics needed to judge its realism.
#[derive(Debug, Clone)]
pub struct AnomalyData {
    /// Unique identifier of this anomaly.
    pub anomaly_id: String,
    /// Category label describing what kind of anomaly this is.
    pub anomaly_type: String,
    /// Observed value carried by the anomalous record.
    pub value: f64,
    /// Mean of the normal (non-anomalous) population.
    pub population_mean: f64,
    /// Standard deviation of the normal (non-anomalous) population.
    pub population_std: f64,
    /// ID of the anomaly this one cascades from; `None` when it is not cascaded.
    pub parent_anomaly_id: Option<String>,
    /// ID of the multi-stage scheme this anomaly belongs to; `None` when it has no scheme.
    pub scheme_id: Option<String>,
    /// Participants involved in this anomaly.
    pub participants: Vec<String>,
}
30
31/// Thresholds for anomaly realism.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct AnomalyRealismThresholds {
34    /// Minimum average z-score for anomalies to be detectable.
35    pub min_avg_z_score: f64,
36    /// Minimum cascade coherence rate.
37    pub min_cascade_coherence: f64,
38    /// Minimum scheme participant consistency.
39    pub min_scheme_consistency: f64,
40}
41
42impl Default for AnomalyRealismThresholds {
43    fn default() -> Self {
44        Self {
45            min_avg_z_score: 2.0,
46            min_cascade_coherence: 0.90,
47            min_scheme_consistency: 0.85,
48        }
49    }
50}
51
52/// Results of anomaly realism evaluation.
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct AnomalyRealismEvaluation {
55    /// Statistical detectability: fraction of anomalies with z-score > 2.
56    pub statistical_detectability: f64,
57    /// Average z-score across all anomalies.
58    pub avg_anomaly_z_score: f64,
59    /// Cascade coherence: fraction of cascaded anomalies referencing valid parents.
60    pub cascade_coherence: f64,
61    /// Scheme participant consistency: fraction of schemes where participants overlap.
62    pub scheme_participant_consistency: f64,
63    /// Total anomalies evaluated.
64    pub total_anomalies: usize,
65    /// Cascaded anomalies count.
66    pub cascaded_count: usize,
67    /// Unique schemes count.
68    pub scheme_count: usize,
69    /// Overall pass/fail.
70    pub passes: bool,
71    /// Issues found.
72    pub issues: Vec<String>,
73}
74
75/// Evaluator for anomaly injection realism.
76pub struct AnomalyRealismEvaluator {
77    thresholds: AnomalyRealismThresholds,
78}
79
80impl AnomalyRealismEvaluator {
81    /// Create a new evaluator with default thresholds.
82    pub fn new() -> Self {
83        Self {
84            thresholds: AnomalyRealismThresholds::default(),
85        }
86    }
87
88    /// Create with custom thresholds.
89    pub fn with_thresholds(thresholds: AnomalyRealismThresholds) -> Self {
90        Self { thresholds }
91    }
92
93    /// Evaluate anomaly data.
94    pub fn evaluate(&self, anomalies: &[AnomalyData]) -> EvalResult<AnomalyRealismEvaluation> {
95        let mut issues = Vec::new();
96
97        if anomalies.is_empty() {
98            return Ok(AnomalyRealismEvaluation {
99                statistical_detectability: 1.0,
100                avg_anomaly_z_score: 0.0,
101                cascade_coherence: 1.0,
102                scheme_participant_consistency: 1.0,
103                total_anomalies: 0,
104                cascaded_count: 0,
105                scheme_count: 0,
106                passes: true,
107                issues: Vec::new(),
108            });
109        }
110
111        // 1. Statistical detectability via z-scores
112        let z_scores: Vec<f64> = anomalies
113            .iter()
114            .filter(|a| a.population_std > f64::EPSILON)
115            .map(|a| (a.value - a.population_mean).abs() / a.population_std)
116            .collect();
117
118        let detectable = z_scores.iter().filter(|&&z| z > 2.0).count();
119        let statistical_detectability = if z_scores.is_empty() {
120            1.0
121        } else {
122            detectable as f64 / z_scores.len() as f64
123        };
124
125        let avg_anomaly_z_score = if z_scores.is_empty() {
126            0.0
127        } else {
128            z_scores.iter().sum::<f64>() / z_scores.len() as f64
129        };
130
131        // 2. Cascade coherence: cascaded anomalies should reference valid parent IDs
132        let all_ids: HashSet<&str> = anomalies.iter().map(|a| a.anomaly_id.as_str()).collect();
133        let cascaded: Vec<&AnomalyData> = anomalies
134            .iter()
135            .filter(|a| a.parent_anomaly_id.is_some())
136            .collect();
137        let cascaded_count = cascaded.len();
138
139        let cascade_valid = cascaded
140            .iter()
141            .filter(|a| {
142                a.parent_anomaly_id
143                    .as_ref()
144                    .map(|pid| all_ids.contains(pid.as_str()))
145                    .unwrap_or(false)
146            })
147            .count();
148        let cascade_coherence = if cascaded_count == 0 {
149            1.0
150        } else {
151            cascade_valid as f64 / cascaded_count as f64
152        };
153
154        // 3. Scheme participant consistency
155        let mut schemes: HashMap<&str, Vec<HashSet<&str>>> = HashMap::new();
156        for a in anomalies {
157            if let Some(ref sid) = a.scheme_id {
158                let participants: HashSet<&str> =
159                    a.participants.iter().map(|p| p.as_str()).collect();
160                schemes.entry(sid.as_str()).or_default().push(participants);
161            }
162        }
163        let scheme_count = schemes.len();
164
165        let consistent_schemes = schemes
166            .values()
167            .filter(|participant_sets| {
168                if participant_sets.len() < 2 {
169                    return true;
170                }
171                // Check that there's overlap between all participant sets
172                let first = &participant_sets[0];
173                participant_sets[1..]
174                    .iter()
175                    .all(|ps| !first.is_disjoint(ps))
176            })
177            .count();
178        let scheme_participant_consistency = if scheme_count == 0 {
179            1.0
180        } else {
181            consistent_schemes as f64 / scheme_count as f64
182        };
183
184        // Check thresholds
185        if avg_anomaly_z_score < self.thresholds.min_avg_z_score && !z_scores.is_empty() {
186            issues.push(format!(
187                "Avg anomaly z-score {:.2} < {:.2}",
188                avg_anomaly_z_score, self.thresholds.min_avg_z_score
189            ));
190        }
191        if cascade_coherence < self.thresholds.min_cascade_coherence {
192            issues.push(format!(
193                "Cascade coherence {:.3} < {:.3}",
194                cascade_coherence, self.thresholds.min_cascade_coherence
195            ));
196        }
197        if scheme_participant_consistency < self.thresholds.min_scheme_consistency {
198            issues.push(format!(
199                "Scheme participant consistency {:.3} < {:.3}",
200                scheme_participant_consistency, self.thresholds.min_scheme_consistency
201            ));
202        }
203
204        let passes = issues.is_empty();
205
206        Ok(AnomalyRealismEvaluation {
207            statistical_detectability,
208            avg_anomaly_z_score,
209            cascade_coherence,
210            scheme_participant_consistency,
211            total_anomalies: anomalies.len(),
212            cascaded_count,
213            scheme_count,
214            passes,
215            issues,
216        })
217    }
218}
219
220impl Default for AnomalyRealismEvaluator {
221    fn default() -> Self {
222        Self::new()
223    }
224}
225
#[cfg(test)]
// Tests unwrap results directly: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a standalone anomaly measured against a population with mean
    /// 10,000 and standard deviation 5,000. Tests override the optional fields
    /// (parent, scheme, participants) they care about.
    fn anomaly(id: &str, anomaly_type: &str, value: f64) -> AnomalyData {
        AnomalyData {
            anomaly_id: id.to_string(),
            anomaly_type: anomaly_type.to_string(),
            value,
            population_mean: 10_000.0,
            population_std: 5_000.0,
            parent_anomaly_id: None,
            scheme_id: None,
            participants: vec![],
        }
    }

    /// Same as [`anomaly`], but attached to `scheme_id` with the given participants.
    fn scheme_member(id: &str, scheme_id: &str, participants: &[&str]) -> AnomalyData {
        AnomalyData {
            scheme_id: Some(scheme_id.to_string()),
            participants: participants.iter().map(|p| p.to_string()).collect(),
            ..anomaly(id, "scheme", 50_000.0)
        }
    }

    #[test]
    fn test_detectable_anomalies() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            anomaly("A001", "unusual_amount", 100_000.0),
            anomaly("A002", "unusual_amount", 50_000.0),
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert!(result.passes);
        assert!(result.avg_anomaly_z_score > 2.0);
    }

    #[test]
    fn test_undetectable_anomalies() {
        let evaluator = AnomalyRealismEvaluator::new();
        // Only slightly above the mean, so the z-score stays far below 2.0.
        let anomalies = vec![anomaly("A001", "subtle", 10_100.0)];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert!(!result.passes);
    }

    #[test]
    fn test_cascade_coherence() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            anomaly("A001", "root", 50_000.0),
            AnomalyData {
                // Valid parent: refers to the root anomaly above.
                parent_anomaly_id: Some("A001".to_string()),
                ..anomaly("A002", "cascade", 50_000.0)
            },
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert_eq!(result.cascade_coherence, 1.0);
    }

    #[test]
    fn test_orphaned_cascade_fails() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            anomaly("A001", "root", 50_000.0),
            AnomalyData {
                // No anomaly with this ID exists in the set.
                parent_anomaly_id: Some("A999".to_string()),
                ..anomaly("A002", "cascade", 50_000.0)
            },
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert_eq!(result.cascaded_count, 1);
        assert_eq!(result.cascade_coherence, 0.0);
        assert!(!result.passes);
    }

    #[test]
    fn test_scheme_participant_consistency() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            // S1: both stages involve "bob", so the scheme is consistent.
            scheme_member("A001", "S1", &["alice", "bob"]),
            scheme_member("A002", "S1", &["bob", "carol"]),
            // S2: the stages share nobody, so the scheme is inconsistent.
            scheme_member("A003", "S2", &["dave"]),
            scheme_member("A004", "S2", &["erin"]),
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert_eq!(result.scheme_count, 2);
        assert_eq!(result.scheme_participant_consistency, 0.5);
        assert!(!result.passes);
    }

    #[test]
    fn test_empty() {
        let evaluator = AnomalyRealismEvaluator::new();
        let result = evaluator.evaluate(&[]).unwrap();
        assert!(result.passes);
    }
}