use anyhow::Result;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

10#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
12pub enum AnomalyMethod {
13 ZScore { threshold: f64 },
14 IQR { multiplier: f64 },
15 Percentile { lower: f64, upper: f64 },
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct AnomalyResult {
21 pub anomalies: Vec<Anomaly>,
22 pub total_anomalies: usize,
23 pub anomaly_percentage: f64,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct Anomaly {
29 pub row: usize,
30 pub column: String,
31 pub value: String,
32 pub score: f64,
33 pub reason: String,
34}
35
36pub struct AnomalyDetector {
38 method: AnomalyMethod,
39}
40
41impl AnomalyDetector {
42 pub fn new(method: AnomalyMethod) -> Self {
43 Self { method }
44 }
45
46 pub fn detect(&self, data: &[Vec<String>], column: usize) -> Result<AnomalyResult> {
48 if data.is_empty() || column >= data[0].len() {
49 return Ok(AnomalyResult {
50 anomalies: Vec::new(),
51 total_anomalies: 0,
52 anomaly_percentage: 0.0,
53 });
54 }
55
56 let values: Vec<f64> = data
58 .iter()
59 .skip(1) .filter_map(|row| row.get(column))
61 .filter_map(|v| v.parse::<f64>().ok())
62 .collect();
63
64 if values.is_empty() {
65 return Ok(AnomalyResult {
66 anomalies: Vec::new(),
67 total_anomalies: 0,
68 anomaly_percentage: 0.0,
69 });
70 }
71
72 let anomalies = match self.method {
73 AnomalyMethod::ZScore { threshold } => {
74 self.detect_zscore(&values, column, threshold)?
75 }
76 AnomalyMethod::IQR { multiplier } => self.detect_iqr(&values, column, multiplier)?,
77 AnomalyMethod::Percentile { lower, upper } => {
78 self.detect_percentile(&values, column, lower, upper)?
79 }
80 };
81
82 let total_anomalies = anomalies.len();
83 let anomaly_percentage = (total_anomalies as f64 / values.len() as f64) * 100.0;
84
85 Ok(AnomalyResult {
86 anomalies,
87 total_anomalies,
88 anomaly_percentage,
89 })
90 }
91
92 fn detect_zscore(&self, values: &[f64], column: usize, threshold: f64) -> Result<Vec<Anomaly>> {
93 let mean = values.par_iter().sum::<f64>() / values.len() as f64;
94 let variance = values.par_iter().map(|v| (v - mean).powi(2)).sum::<f64>() / values.len() as f64;
95 let std_dev = variance.sqrt();
96
97 if std_dev == 0.0 {
98 return Ok(Vec::new());
99 }
100
101 let anomalies: Vec<Anomaly> = values
102 .par_iter()
103 .enumerate()
104 .filter_map(|(idx, value)| {
105 let z_score = (value - mean).abs() / std_dev;
106 if z_score > threshold {
107 Some(Anomaly {
108 row: idx + 1, column: format!("col_{column}"),
110 value: value.to_string(),
111 score: z_score,
112 reason: format!("Z-score {z_score:.2} exceeds threshold {threshold:.2}"),
113 })
114 } else {
115 None
116 }
117 })
118 .collect();
119
120 Ok(anomalies)
121 }
122
123 fn detect_iqr(&self, values: &[f64], column: usize, multiplier: f64) -> Result<Vec<Anomaly>> {
124 let mut sorted = values.to_vec();
125 sorted.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
126
127 let q1_idx = sorted.len() / 4;
128 let q3_idx = (sorted.len() * 3) / 4;
129
130 let q1 = sorted[q1_idx];
131 let q3 = sorted[q3_idx];
132 let iqr = q3 - q1;
133
134 let lower_bound = q1 - multiplier * iqr;
135 let upper_bound = q3 + multiplier * iqr;
136
137 let anomalies: Vec<Anomaly> = values
138 .par_iter()
139 .enumerate()
140 .filter_map(|(idx, value)| {
141 if *value < lower_bound || *value > upper_bound {
142 let reason = if *value < lower_bound {
143 format!("Value {value:.2} below lower bound {lower_bound:.2}")
144 } else {
145 format!("Value {value:.2} above upper bound {upper_bound:.2}")
146 };
147
148 Some(Anomaly {
149 row: idx + 1,
150 column: format!("col_{column}"),
151 value: value.to_string(),
152 score: if *value < lower_bound {
153 (lower_bound - value) / iqr
154 } else {
155 (value - upper_bound) / iqr
156 },
157 reason,
158 })
159 } else {
160 None
161 }
162 })
163 .collect();
164
165 Ok(anomalies)
166 }
167
168 fn detect_percentile(
169 &self,
170 values: &[f64],
171 column: usize,
172 lower: f64,
173 upper: f64,
174 ) -> Result<Vec<Anomaly>> {
175 let mut sorted = values.to_vec();
176 sorted.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
177
178 let lower_idx = (sorted.len() as f64 * lower / 100.0) as usize;
179 let upper_idx = (sorted.len() as f64 * upper / 100.0) as usize;
180
181 let lower_bound = sorted[lower_idx];
182 let upper_bound = sorted[upper_idx];
183
184 let anomalies: Vec<Anomaly> = values
185 .par_iter()
186 .enumerate()
187 .filter_map(|(idx, value)| {
188 if *value < lower_bound || *value > upper_bound {
189 Some(Anomaly {
190 row: idx + 1,
191 column: format!("col_{column}"),
192 value: value.to_string(),
193 score: 1.0,
194 reason: format!("Value outside {lower:.1}%-{upper:.1}% percentile range"),
195 })
196 } else {
197 None
198 }
199 })
200 .collect();
201
202 Ok(anomalies)
203 }
204}