1use crate::percentiles::compute_percentile;
10
/// Strategy that `detect_outliers` uses to decide which samples are outliers.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OutlierMethod {
    /// Fences derived from the interquartile range.
    Iqr {
        /// Fence multiplier expressed in half-units: `detect_outliers` scales
        /// it by 0.5 before use, so `k: 3` places the fences 1.5 × IQR beyond
        /// the quartiles.
        k: u32,
    },
    /// Cut-off on the z-score of each sample.
    ZScore {
        /// Cut-off expressed in half-units: `detect_outliers` scales it by
        /// 0.5 before use, so `threshold: 6` means 3.0 standard deviations.
        threshold: u32,
    },
    /// Detection disabled; every sample is kept.
    None,
}
27
28impl Default for OutlierMethod {
29 fn default() -> Self {
30 OutlierMethod::Iqr { k: 3 }
32 }
33}
34
35#[derive(Debug, Clone)]
37pub struct OutlierAnalysis {
38 pub all_samples: Vec<f64>,
40 pub cleaned_samples: Vec<f64>,
42 pub outlier_indices: Vec<usize>,
44 pub low_outlier_count: usize,
46 pub high_outlier_count: usize,
48 pub lower_bound: f64,
50 pub upper_bound: f64,
52 pub method: OutlierMethod,
54}
55
56impl OutlierAnalysis {
57 pub fn outlier_percentage(&self) -> f64 {
59 if self.all_samples.is_empty() {
60 return 0.0;
61 }
62 (self.outlier_indices.len() as f64 / self.all_samples.len() as f64) * 100.0
63 }
64
65 pub fn is_noisy(&self, threshold_pct: f64) -> bool {
67 self.outlier_percentage() > threshold_pct
68 }
69}
70
71pub fn detect_outliers(samples: &[f64], method: OutlierMethod) -> OutlierAnalysis {
83 if samples.is_empty() {
84 return OutlierAnalysis {
85 all_samples: Vec::new(),
86 cleaned_samples: Vec::new(),
87 outlier_indices: Vec::new(),
88 low_outlier_count: 0,
89 high_outlier_count: 0,
90 lower_bound: 0.0,
91 upper_bound: 0.0,
92 method,
93 };
94 }
95
96 match method {
97 OutlierMethod::None => OutlierAnalysis {
98 all_samples: samples.to_vec(),
99 cleaned_samples: samples.to_vec(),
100 outlier_indices: Vec::new(),
101 low_outlier_count: 0,
102 high_outlier_count: 0,
103 lower_bound: f64::NEG_INFINITY,
104 upper_bound: f64::INFINITY,
105 method,
106 },
107 OutlierMethod::Iqr { k } => detect_iqr_outliers(samples, k as f64 * 0.5),
108 OutlierMethod::ZScore { threshold } => {
109 detect_zscore_outliers(samples, threshold as f64 * 0.5)
110 }
111 }
112}
113
114fn detect_iqr_outliers(samples: &[f64], k: f64) -> OutlierAnalysis {
116 let q1 = compute_percentile(samples, 25.0);
117 let q3 = compute_percentile(samples, 75.0);
118 let iqr = q3 - q1;
119
120 let lower_bound = q1 - k * iqr;
121 let upper_bound = q3 + k * iqr;
122
123 let mut outlier_indices = Vec::new();
124 let mut low_count = 0;
125 let mut high_count = 0;
126 let mut cleaned = Vec::with_capacity(samples.len());
127
128 for (i, &sample) in samples.iter().enumerate() {
129 if sample < lower_bound {
130 outlier_indices.push(i);
131 low_count += 1;
132 } else if sample > upper_bound {
133 outlier_indices.push(i);
134 high_count += 1;
135 } else {
136 cleaned.push(sample);
137 }
138 }
139
140 OutlierAnalysis {
141 all_samples: samples.to_vec(),
142 cleaned_samples: cleaned,
143 outlier_indices,
144 low_outlier_count: low_count,
145 high_outlier_count: high_count,
146 lower_bound,
147 upper_bound,
148 method: OutlierMethod::Iqr {
149 k: (k * 2.0) as u32,
150 },
151 }
152}
153
154fn detect_zscore_outliers(samples: &[f64], threshold: f64) -> OutlierAnalysis {
156 let n = samples.len() as f64;
157 let mean: f64 = samples.iter().sum::<f64>() / n;
158 let variance: f64 = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
159 let std_dev = variance.sqrt();
160
161 if std_dev == 0.0 {
162 return OutlierAnalysis {
164 all_samples: samples.to_vec(),
165 cleaned_samples: samples.to_vec(),
166 outlier_indices: Vec::new(),
167 low_outlier_count: 0,
168 high_outlier_count: 0,
169 lower_bound: mean,
170 upper_bound: mean,
171 method: OutlierMethod::ZScore {
172 threshold: (threshold * 2.0) as u32,
173 },
174 };
175 }
176
177 let lower_bound = mean - threshold * std_dev;
178 let upper_bound = mean + threshold * std_dev;
179
180 let mut outlier_indices = Vec::new();
181 let mut low_count = 0;
182 let mut high_count = 0;
183 let mut cleaned = Vec::with_capacity(samples.len());
184
185 for (i, &sample) in samples.iter().enumerate() {
186 let z_score = (sample - mean) / std_dev;
187 if z_score < -threshold {
188 outlier_indices.push(i);
189 low_count += 1;
190 } else if z_score > threshold {
191 outlier_indices.push(i);
192 high_count += 1;
193 } else {
194 cleaned.push(sample);
195 }
196 }
197
198 OutlierAnalysis {
199 all_samples: samples.to_vec(),
200 cleaned_samples: cleaned,
201 outlier_indices,
202 low_outlier_count: low_count,
203 high_outlier_count: high_count,
204 lower_bound,
205 upper_bound,
206 method: OutlierMethod::ZScore {
207 threshold: (threshold * 2.0) as u32,
208 },
209 }
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// Evenly spaced samples produce no outliers under the default method.
    #[test]
    fn test_no_outliers() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = detect_outliers(&samples, OutlierMethod::default());

        assert!(result.outlier_indices.is_empty());
        assert_eq!(result.cleaned_samples.len(), 5);
    }

    /// A single extreme value is flagged as a high outlier and removed from
    /// the cleaned set while the full sample list is preserved.
    #[test]
    fn test_with_outliers() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let result = detect_outliers(&samples, OutlierMethod::default());

        assert!(!result.outlier_indices.is_empty());
        assert_eq!(result.high_outlier_count, 1);
        assert_eq!(result.all_samples.len(), 6);
        assert_eq!(result.cleaned_samples.len(), 5);
    }

    /// One outlier out of six samples is roughly 16.7 %.
    #[test]
    fn test_outlier_percentage() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let result = detect_outliers(&samples, OutlierMethod::default());

        assert!(result.outlier_percentage() > 15.0);
        assert!(result.outlier_percentage() < 20.0);
    }

    /// `OutlierMethod::None` keeps every sample, however extreme.
    #[test]
    fn test_no_detection() {
        let samples = vec![1.0, 2.0, 100.0];
        let result = detect_outliers(&samples, OutlierMethod::None);

        assert!(result.outlier_indices.is_empty());
        assert_eq!(result.cleaned_samples.len(), 3);
    }

    /// Empty input yields an empty analysis.
    #[test]
    fn test_empty_samples() {
        let samples: Vec<f64> = Vec::new();
        let result = detect_outliers(&samples, OutlierMethod::default());

        assert!(result.outlier_indices.is_empty());
        assert!(result.all_samples.is_empty());
    }

    /// With a cut-off of 2.0 standard deviations (`threshold: 4`), the value
    /// 100.0 has a z-score of about 2.23 and is the only sample flagged.
    #[test]
    fn test_zscore_flags_extreme_value() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let method = OutlierMethod::ZScore { threshold: 4 };
        let result = detect_outliers(&samples, method);

        assert_eq!(result.outlier_indices, vec![5]);
        assert_eq!(result.low_outlier_count, 0);
        assert_eq!(result.high_outlier_count, 1);
        assert_eq!(result.cleaned_samples, vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        assert_eq!(result.method, method);
        assert!(result.lower_bound < result.upper_bound);
    }

    /// Identical samples have zero spread: nothing is flagged and both
    /// bounds collapse onto the mean.
    #[test]
    fn test_zscore_constant_samples() {
        let samples = vec![5.0, 5.0, 5.0];
        let result = detect_outliers(&samples, OutlierMethod::ZScore { threshold: 4 });

        assert!(result.outlier_indices.is_empty());
        assert_eq!(result.cleaned_samples.len(), 3);
        assert_eq!(result.lower_bound, 5.0);
        assert_eq!(result.upper_bound, 5.0);
    }

    /// `is_noisy` compares the outlier percentage (about 16.7 % here)
    /// strictly against the supplied threshold.
    #[test]
    fn test_is_noisy() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let result = detect_outliers(&samples, OutlierMethod::ZScore { threshold: 4 });

        assert!(result.is_noisy(10.0));
        assert!(!result.is_noisy(20.0));
    }
}