Skip to main content

datasynth_eval/statistical/
anomaly_realism.rs

1//! Anomaly injection realism evaluator.
2//!
3//! Validates that injected anomalies produce statistically detectable patterns,
4//! cascade coherence is maintained, and multi-stage schemes share participants.
5
6use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9
/// One injected anomaly together with the population statistics it is judged against.
///
/// `PartialEq` is derived so records can be compared directly (for example in
/// tests). Because the struct holds `f64` fields, a record containing `NaN`
/// never compares equal to anything, including itself.
#[derive(Debug, Clone, PartialEq)]
pub struct AnomalyData {
    /// Identifier of this anomaly; cascaded anomalies refer to it via
    /// `parent_anomaly_id`.
    pub anomaly_id: String,
    /// Free-form type/category label of the anomaly.
    pub anomaly_type: String,
    /// The anomalous observation itself.
    pub value: f64,
    /// Mean of the normal (non-anomalous) population.
    pub population_mean: f64,
    /// Standard deviation of the normal population. Anomalies whose standard
    /// deviation is not greater than `f64::EPSILON` are left out of the
    /// z-score statistics by the evaluator.
    pub population_std: f64,
    /// For a cascaded anomaly, the ID of the anomaly that triggered it;
    /// `None` for an anomaly that is not part of a cascade.
    pub parent_anomaly_id: Option<String>,
    /// Identifier of the multi-stage scheme this anomaly belongs to, if any.
    pub scheme_id: Option<String>,
    /// Participants involved in this anomaly.
    pub participants: Vec<String>,
}
30
31/// Thresholds for anomaly realism.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct AnomalyRealismThresholds {
34    /// Minimum average z-score for anomalies to be detectable.
35    pub min_avg_z_score: f64,
36    /// Minimum cascade coherence rate.
37    pub min_cascade_coherence: f64,
38    /// Minimum scheme participant consistency.
39    pub min_scheme_consistency: f64,
40}
41
42impl Default for AnomalyRealismThresholds {
43    fn default() -> Self {
44        Self {
45            min_avg_z_score: 2.0,
46            min_cascade_coherence: 0.90,
47            min_scheme_consistency: 0.85,
48        }
49    }
50}
51
52/// Results of anomaly realism evaluation.
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct AnomalyRealismEvaluation {
55    /// Statistical detectability: fraction of anomalies with z-score > 2.
56    pub statistical_detectability: f64,
57    /// Average z-score across all anomalies.
58    pub avg_anomaly_z_score: f64,
59    /// Cascade coherence: fraction of cascaded anomalies referencing valid parents.
60    pub cascade_coherence: f64,
61    /// Scheme participant consistency: fraction of schemes where participants overlap.
62    pub scheme_participant_consistency: f64,
63    /// Total anomalies evaluated.
64    pub total_anomalies: usize,
65    /// Cascaded anomalies count.
66    pub cascaded_count: usize,
67    /// Unique schemes count.
68    pub scheme_count: usize,
69    /// Overall pass/fail.
70    pub passes: bool,
71    /// Issues found.
72    pub issues: Vec<String>,
73}
74
75/// Evaluator for anomaly injection realism.
76pub struct AnomalyRealismEvaluator {
77    thresholds: AnomalyRealismThresholds,
78}
79
80impl AnomalyRealismEvaluator {
81    /// Create a new evaluator with default thresholds.
82    pub fn new() -> Self {
83        Self {
84            thresholds: AnomalyRealismThresholds::default(),
85        }
86    }
87
88    /// Create with custom thresholds.
89    pub fn with_thresholds(thresholds: AnomalyRealismThresholds) -> Self {
90        Self { thresholds }
91    }
92
93    /// Evaluate anomaly data.
94    pub fn evaluate(&self, anomalies: &[AnomalyData]) -> EvalResult<AnomalyRealismEvaluation> {
95        let mut issues = Vec::new();
96
97        if anomalies.is_empty() {
98            return Ok(AnomalyRealismEvaluation {
99                statistical_detectability: 1.0,
100                avg_anomaly_z_score: 0.0,
101                cascade_coherence: 1.0,
102                scheme_participant_consistency: 1.0,
103                total_anomalies: 0,
104                cascaded_count: 0,
105                scheme_count: 0,
106                passes: true,
107                issues: Vec::new(),
108            });
109        }
110
111        // 1. Statistical detectability via z-scores
112        let z_scores: Vec<f64> = anomalies
113            .iter()
114            .filter(|a| a.population_std > f64::EPSILON)
115            .map(|a| (a.value - a.population_mean).abs() / a.population_std)
116            .collect();
117
118        let detectable = z_scores.iter().filter(|&&z| z > 2.0).count();
119        let statistical_detectability = if z_scores.is_empty() {
120            1.0
121        } else {
122            detectable as f64 / z_scores.len() as f64
123        };
124
125        let avg_anomaly_z_score = if z_scores.is_empty() {
126            0.0
127        } else {
128            z_scores.iter().sum::<f64>() / z_scores.len() as f64
129        };
130
131        // 2. Cascade coherence: cascaded anomalies should reference valid parent IDs
132        let all_ids: HashSet<&str> = anomalies.iter().map(|a| a.anomaly_id.as_str()).collect();
133        let cascaded: Vec<&AnomalyData> = anomalies
134            .iter()
135            .filter(|a| a.parent_anomaly_id.is_some())
136            .collect();
137        let cascaded_count = cascaded.len();
138
139        let cascade_valid = cascaded
140            .iter()
141            .filter(|a| {
142                a.parent_anomaly_id
143                    .as_ref()
144                    .map(|pid| all_ids.contains(pid.as_str()))
145                    .unwrap_or(false)
146            })
147            .count();
148        let cascade_coherence = if cascaded_count == 0 {
149            1.0
150        } else {
151            cascade_valid as f64 / cascaded_count as f64
152        };
153
154        // 3. Scheme participant consistency
155        let mut schemes: HashMap<&str, Vec<HashSet<&str>>> = HashMap::new();
156        for a in anomalies {
157            if let Some(ref sid) = a.scheme_id {
158                let participants: HashSet<&str> = a
159                    .participants
160                    .iter()
161                    .map(std::string::String::as_str)
162                    .collect();
163                schemes.entry(sid.as_str()).or_default().push(participants);
164            }
165        }
166        let scheme_count = schemes.len();
167
168        let consistent_schemes = schemes
169            .values()
170            .filter(|participant_sets| {
171                if participant_sets.len() < 2 {
172                    return true;
173                }
174                // Check that there's overlap between all participant sets
175                let first = &participant_sets[0];
176                participant_sets[1..]
177                    .iter()
178                    .all(|ps| !first.is_disjoint(ps))
179            })
180            .count();
181        let scheme_participant_consistency = if scheme_count == 0 {
182            1.0
183        } else {
184            consistent_schemes as f64 / scheme_count as f64
185        };
186
187        // Check thresholds
188        if avg_anomaly_z_score < self.thresholds.min_avg_z_score && !z_scores.is_empty() {
189            issues.push(format!(
190                "Avg anomaly z-score {:.2} < {:.2}",
191                avg_anomaly_z_score, self.thresholds.min_avg_z_score
192            ));
193        }
194        if cascade_coherence < self.thresholds.min_cascade_coherence {
195            issues.push(format!(
196                "Cascade coherence {:.3} < {:.3}",
197                cascade_coherence, self.thresholds.min_cascade_coherence
198            ));
199        }
200        if scheme_participant_consistency < self.thresholds.min_scheme_consistency {
201            issues.push(format!(
202                "Scheme participant consistency {:.3} < {:.3}",
203                scheme_participant_consistency, self.thresholds.min_scheme_consistency
204            ));
205        }
206
207        let passes = issues.is_empty();
208
209        Ok(AnomalyRealismEvaluation {
210            statistical_detectability,
211            avg_anomaly_z_score,
212            cascade_coherence,
213            scheme_participant_consistency,
214            total_anomalies: anomalies.len(),
215            cascaded_count,
216            scheme_count,
217            passes,
218            issues,
219        })
220    }
221}
222
223impl Default for AnomalyRealismEvaluator {
224    fn default() -> Self {
225        Self::new()
226    }
227}
228
#[cfg(test)]
#[allow(clippy::unwrap_used)] // A failed unwrap should simply fail the test.
mod tests {
    use super::*;

    /// Build a stand-alone anomaly measured against a population with mean
    /// 10 000 and standard deviation 5 000. It has no parent, no scheme and no
    /// participants; tests override the fields they care about.
    fn anomaly(id: &str, value: f64) -> AnomalyData {
        AnomalyData {
            anomaly_id: id.to_string(),
            anomaly_type: "unusual_amount".to_string(),
            value,
            population_mean: 10_000.0,
            population_std: 5_000.0,
            parent_anomaly_id: None,
            scheme_id: None,
            participants: vec![],
        }
    }

    /// Build an anomaly that belongs to `scheme` with the given participants.
    fn scheme_anomaly(id: &str, scheme: &str, participants: &[&str]) -> AnomalyData {
        AnomalyData {
            scheme_id: Some(scheme.to_string()),
            participants: participants.iter().map(|p| (*p).to_string()).collect(),
            ..anomaly(id, 50_000.0)
        }
    }

    #[test]
    fn test_detectable_anomalies() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![anomaly("A001", 100_000.0), anomaly("A002", 50_000.0)];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert!(result.passes);
        assert!(result.avg_anomaly_z_score > 2.0);
    }

    #[test]
    fn test_undetectable_anomalies() {
        let evaluator = AnomalyRealismEvaluator::new();
        // Only slightly above the mean: z-score is 0.02.
        let anomalies = vec![anomaly("A001", 10_100.0)];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert!(!result.passes); // z-score < 2.0
    }

    #[test]
    fn test_cascade_coherence() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            anomaly("A001", 50_000.0),
            AnomalyData {
                parent_anomaly_id: Some("A001".to_string()), // Valid parent
                ..anomaly("A002", 50_000.0)
            },
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert_eq!(result.cascade_coherence, 1.0);
    }

    #[test]
    fn test_orphaned_cascade_fails() {
        let evaluator = AnomalyRealismEvaluator::new();
        let anomalies = vec![
            anomaly("A001", 50_000.0),
            AnomalyData {
                parent_anomaly_id: Some("MISSING".to_string()), // No such anomaly
                ..anomaly("A002", 50_000.0)
            },
        ];

        let result = evaluator.evaluate(&anomalies).unwrap();
        assert_eq!(result.cascaded_count, 1);
        assert_eq!(result.cascade_coherence, 0.0);
        assert!(!result.passes);
    }

    #[test]
    fn test_scheme_participant_consistency() {
        let evaluator = AnomalyRealismEvaluator::new();

        // Both stages involve "bob": the scheme is consistent.
        let overlapping = vec![
            scheme_anomaly("A001", "S1", &["alice", "bob"]),
            scheme_anomaly("A002", "S1", &["bob", "carol"]),
        ];
        let result = evaluator.evaluate(&overlapping).unwrap();
        assert_eq!(result.scheme_count, 1);
        assert_eq!(result.scheme_participant_consistency, 1.0);
        assert!(result.passes);

        // No shared participant between the stages: the scheme is inconsistent.
        let disjoint = vec![
            scheme_anomaly("A001", "S1", &["alice", "bob"]),
            scheme_anomaly("A002", "S1", &["carol"]),
        ];
        let result = evaluator.evaluate(&disjoint).unwrap();
        assert_eq!(result.scheme_count, 1);
        assert_eq!(result.scheme_participant_consistency, 0.0);
        assert!(!result.passes);
    }

    #[test]
    fn test_empty() {
        let evaluator = AnomalyRealismEvaluator::new();
        let result = evaluator.evaluate(&[]).unwrap();
        assert!(result.passes);
    }
}