Skip to main content

xls_rs/
anomaly.rs

//! Anomaly detection operations
//!
//! Provides statistical anomaly detection for a single column using the
//! Z-score, IQR (interquartile range), and percentile-range methods.
5
6use anyhow::Result;
7use rayon::prelude::*;
8use serde::{Deserialize, Serialize};
9
/// Anomaly detection method
///
/// Selects the statistical rule that `AnomalyDetector` applies to the numeric
/// values of a column, together with the tuning parameter(s) for that rule.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum AnomalyMethod {
    /// Flags values whose absolute Z-score (distance from the mean divided by
    /// the population standard deviation) is strictly greater than `threshold`.
    ZScore { threshold: f64 },
    /// Flags values below `Q1 - multiplier * IQR` or above
    /// `Q3 + multiplier * IQR`, where `IQR = Q3 - Q1`.
    IQR { multiplier: f64 },
    /// Flags values below the `lower` percentile or above the `upper`
    /// percentile. Both are expressed on a 0-100 scale (they are divided by
    /// 100 before being turned into an index).
    Percentile { lower: f64, upper: f64 },
}
17
/// Anomaly detection result
///
/// Summary returned by `AnomalyDetector::detect` for one column.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnomalyResult {
    /// Every value that the selected method flagged.
    pub anomalies: Vec<Anomaly>,
    /// Number of entries in `anomalies`.
    pub total_anomalies: usize,
    /// Flagged values as a percentage (0-100) of the column's values that
    /// parsed as numbers; `0.0` when there was nothing to analyse.
    pub anomaly_percentage: f64,
}
25
/// Individual anomaly
///
/// Describes one flagged value and why it was flagged.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Anomaly {
    /// Row position reported by the detector. Intended to be the zero-based
    /// index into the input rows, with the header at index 0.
    /// NOTE(review): confirm against the detector when the column contains
    /// non-numeric cells.
    pub row: usize,
    /// Column label of the form `col_<index>`, built from the column index
    /// passed to the detector.
    pub column: String,
    /// The flagged numeric value rendered as text.
    pub value: String,
    /// Method-dependent severity: the absolute Z-score for `ZScore`, the
    /// distance beyond the violated bound scaled by the IQR for `IQR`, and a
    /// constant `1.0` for `Percentile`.
    pub score: f64,
    /// Human-readable explanation of why the value was flagged.
    pub reason: String,
}
35
/// Anomaly detector
///
/// Holds the [`AnomalyMethod`] chosen at construction and applies it to a
/// column of tabular string data.
pub struct AnomalyDetector {
    /// Detection rule (and its parameters) used by every `detect` call.
    method: AnomalyMethod,
}
40
41impl AnomalyDetector {
42    pub fn new(method: AnomalyMethod) -> Self {
43        Self { method }
44    }
45
46    /// Detect anomalies in a column
47    pub fn detect(&self, data: &[Vec<String>], column: usize) -> Result<AnomalyResult> {
48        if data.is_empty() || column >= data[0].len() {
49            return Ok(AnomalyResult {
50                anomalies: Vec::new(),
51                total_anomalies: 0,
52                anomaly_percentage: 0.0,
53            });
54        }
55
56        // Extract numeric values
57        let values: Vec<f64> = data
58            .iter()
59            .skip(1) // Skip header
60            .filter_map(|row| row.get(column))
61            .filter_map(|v| v.parse::<f64>().ok())
62            .collect();
63
64        if values.is_empty() {
65            return Ok(AnomalyResult {
66                anomalies: Vec::new(),
67                total_anomalies: 0,
68                anomaly_percentage: 0.0,
69            });
70        }
71
72        let anomalies = match self.method {
73            AnomalyMethod::ZScore { threshold } => {
74                self.detect_zscore(&values, column, threshold)?
75            }
76            AnomalyMethod::IQR { multiplier } => self.detect_iqr(&values, column, multiplier)?,
77            AnomalyMethod::Percentile { lower, upper } => {
78                self.detect_percentile(&values, column, lower, upper)?
79            }
80        };
81
82        let total_anomalies = anomalies.len();
83        let anomaly_percentage = (total_anomalies as f64 / values.len() as f64) * 100.0;
84
85        Ok(AnomalyResult {
86            anomalies,
87            total_anomalies,
88            anomaly_percentage,
89        })
90    }
91
92    fn detect_zscore(&self, values: &[f64], column: usize, threshold: f64) -> Result<Vec<Anomaly>> {
93        let mean = values.par_iter().sum::<f64>() / values.len() as f64;
94        let variance = values.par_iter().map(|v| (v - mean).powi(2)).sum::<f64>() / values.len() as f64;
95        let std_dev = variance.sqrt();
96
97        if std_dev == 0.0 {
98            return Ok(Vec::new());
99        }
100
101        let anomalies: Vec<Anomaly> = values
102            .par_iter()
103            .enumerate()
104            .filter_map(|(idx, value)| {
105                let z_score = (value - mean).abs() / std_dev;
106                if z_score > threshold {
107                    Some(Anomaly {
108                        row: idx + 1, // +1 for header
109                        column: format!("col_{column}"),
110                        value: value.to_string(),
111                        score: z_score,
112                        reason: format!("Z-score {z_score:.2} exceeds threshold {threshold:.2}"),
113                    })
114                } else {
115                    None
116                }
117            })
118            .collect();
119
120        Ok(anomalies)
121    }
122
123    fn detect_iqr(&self, values: &[f64], column: usize, multiplier: f64) -> Result<Vec<Anomaly>> {
124        let mut sorted = values.to_vec();
125        sorted.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
126
127        let q1_idx = sorted.len() / 4;
128        let q3_idx = (sorted.len() * 3) / 4;
129
130        let q1 = sorted[q1_idx];
131        let q3 = sorted[q3_idx];
132        let iqr = q3 - q1;
133
134        let lower_bound = q1 - multiplier * iqr;
135        let upper_bound = q3 + multiplier * iqr;
136
137        let anomalies: Vec<Anomaly> = values
138            .par_iter()
139            .enumerate()
140            .filter_map(|(idx, value)| {
141                if *value < lower_bound || *value > upper_bound {
142                    let reason = if *value < lower_bound {
143                        format!("Value {value:.2} below lower bound {lower_bound:.2}")
144                    } else {
145                        format!("Value {value:.2} above upper bound {upper_bound:.2}")
146                    };
147
148                    Some(Anomaly {
149                        row: idx + 1,
150                        column: format!("col_{column}"),
151                        value: value.to_string(),
152                        score: if *value < lower_bound {
153                            (lower_bound - value) / iqr
154                        } else {
155                            (value - upper_bound) / iqr
156                        },
157                        reason,
158                    })
159                } else {
160                    None
161                }
162            })
163            .collect();
164
165        Ok(anomalies)
166    }
167
168    fn detect_percentile(
169        &self,
170        values: &[f64],
171        column: usize,
172        lower: f64,
173        upper: f64,
174    ) -> Result<Vec<Anomaly>> {
175        let mut sorted = values.to_vec();
176        sorted.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
177
178        let lower_idx = (sorted.len() as f64 * lower / 100.0) as usize;
179        let upper_idx = (sorted.len() as f64 * upper / 100.0) as usize;
180
181        let lower_bound = sorted[lower_idx];
182        let upper_bound = sorted[upper_idx];
183
184        let anomalies: Vec<Anomaly> = values
185            .par_iter()
186            .enumerate()
187            .filter_map(|(idx, value)| {
188                if *value < lower_bound || *value > upper_bound {
189                    Some(Anomaly {
190                        row: idx + 1,
191                        column: format!("col_{column}"),
192                        value: value.to_string(),
193                        score: 1.0,
194                        reason: format!("Value outside {lower:.1}%-{upper:.1}% percentile range"),
195                    })
196                } else {
197                    None
198                }
199            })
200            .collect();
201
202        Ok(anomalies)
203    }
204}