1use statrs::statistics::OrderStatistics;
12use thiserror::Error;
13
14#[derive(Error, Debug)]
15pub enum OutlierError {
16 #[error("The data set contains one or more NANs")]
17 ContainsNans,
18 #[error("K value cannot be negative")]
19 NegativeKValue,
20}
21
/// Holds a data set together with the settings used to look for outliers in it.
///
/// `Debug`, `Clone` and `PartialEq` are derived so the identifier can be
/// logged, duplicated before a consuming call, and compared in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct OutlierIdentifier {
    /// The values to examine.
    data_set: Vec<f64>,
    /// Multiplier applied to the interquartile range when computing the fences.
    k_value: f64,
    /// Caller's statement that `data_set` is already sorted; when `true` the
    /// sorting step is skipped.
    data_is_sorted: bool,
}
27
28impl OutlierIdentifier {
29 pub fn new(data_set: Vec<f64>, data_is_sorted: bool) -> OutlierIdentifier {
32 OutlierIdentifier {
33 data_set,
34 data_is_sorted,
35 k_value: 1.5,
36 }
37 }
38
39 pub fn with_k_value(self, k_value: f64) -> OutlierIdentifier {
44 OutlierIdentifier {
45 data_set: self.data_set,
46 data_is_sorted: self.data_is_sorted,
47 k_value,
48 }
49 }
50
51 #[allow(clippy::type_complexity)]
58 pub fn get_outliers(mut self) -> Result<(Vec<f64>, Vec<f64>, Vec<f64>), OutlierError> {
59 let (lower_fence, upper_fence) = self.get_fences()?;
60
61 let mut lower_outliers: Vec<f64> = Vec::new();
62 let mut upper_outliers: Vec<f64> = Vec::new();
63 let mut non_outliers: Vec<f64> = Vec::new();
64
65 for data in self.data_set {
66 if data < lower_fence {
67 lower_outliers.push(data);
68 } else if data > upper_fence {
69 upper_outliers.push(data);
70 } else {
71 non_outliers.push(data);
72 }
73 }
74
75 Ok((lower_outliers, non_outliers, upper_outliers))
76 }
77
78 pub fn has_outliers(mut self) -> Result<bool, OutlierError> {
83 let (lower_fence, upper_fence) = self.get_fences()?;
84
85 for data in self.data_set {
86 if data < lower_fence || data > upper_fence {
87 return Ok(true);
88 }
89 }
90
91 Ok(false)
92 }
93
94 fn get_fences(&mut self) -> Result<(f64, f64), OutlierError> {
95 if self.k_value < 0.0 {
96 return Err(OutlierError::NegativeKValue);
97 }
98
99 let data_set_has_nans = self.data_set.iter().any(|x| x.is_nan());
102
103 if data_set_has_nans {
104 return Err(OutlierError::ContainsNans);
105 }
106
107 if !self.data_is_sorted {
108 self.data_set.sort_by(|a, b| a.partial_cmp(b).unwrap());
109 }
110
111 let q1_value = self.data_set.lower_quartile();
112 let q3_value = self.data_set.upper_quartile();
113 let interquartile_range = q3_value - q1_value;
114
115 let intermediate_value = self.k_value * interquartile_range;
116 let lower_fence = q1_value - intermediate_value;
117 let upper_fence = q3_value + intermediate_value;
118
119 Ok((lower_fence, upper_fence))
120 }
121}
122
#[test]
fn get_outliers_needs_sorted_nan_set() {
    // Unsorted flag and nothing but NaNs: the call must fail with ContainsNans.
    let outcome = OutlierIdentifier::new(vec![f64::NAN, f64::NAN], false).get_outliers();

    assert!(matches!(outcome, Err(OutlierError::ContainsNans)));
}
131
#[test]
fn get_outliers_is_sorted_nan_set() {
    // Flagged as sorted, with NaNs mixed in: the call must fail with ContainsNans.
    // NOTE(review): the finite values are not in ascending order even though the
    // sorted flag is set; the assertion only concerns the NaN rejection.
    let samples = vec![3.0, 2.9, 2.8, 33.3, f64::NAN, f64::NAN];
    let outcome = OutlierIdentifier::new(samples, true).get_outliers();

    assert!(matches!(outcome, Err(OutlierError::ContainsNans)));
}
140
#[test]
fn get_outliers_empty_data_set() {
    // An empty input must produce three empty groups.
    let (lower, inliers, upper) = OutlierIdentifier::new(Vec::<f64>::new(), true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert!(inliers.is_empty());
    assert!(upper.is_empty());
}
151
#[test]
fn get_outliers_set_of_one() {
    // A single value is never an outlier.
    let (lower, inliers, upper) = OutlierIdentifier::new(vec![30.0], true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![30.0]);
    assert!(upper.is_empty());
}
162
#[test]
fn get_outliers_set_of_two() {
    // Two values: both are expected to stay in the non-outlier group.
    let (lower, inliers, upper) = OutlierIdentifier::new(vec![30.0, 90.0], true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![30.0, 90.0]);
    assert!(upper.is_empty());
}
173
#[test]
fn get_outliers_none() {
    // Sorted input in which no value is expected to fall outside the fences.
    let samples = vec![1.0, 2.0, 4.0, 10.0];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![1.0, 2.0, 4.0, 10.0]);
    assert!(upper.is_empty());
}
184
#[test]
fn get_outliers_1() {
    // Sorted input whose largest value (79.0) is expected to be an upper outlier.
    let samples = vec![
        0.0, 3.0, 3.0, 3.0, 11.0, 12.0, 13.0, 15.0, 19.0, 20.0, 29.0, 40.0, 79.0,
    ];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(
        inliers,
        vec![0.0, 3.0, 3.0, 3.0, 11.0, 12.0, 13.0, 15.0, 19.0, 20.0, 29.0, 40.0]
    );
    assert_eq!(upper, vec![79.0]);
}
201
#[test]
fn get_outliers_negative_1() {
    // Unsorted input with negative values; the groups are expected in ascending order.
    let samples = vec![
        29.5, -3.79, 15.0, 11.47, 3.6, 3.6, 19.0, 79.37, 40.7, -23.3, 12.0, 20.113, 13.39,
    ];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, false)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(
        inliers,
        vec![-23.3, -3.79, 3.6, 3.6, 11.47, 12.0, 13.39, 15.0, 19.0, 20.113, 29.5, 40.7]
    );
    assert_eq!(upper, vec![79.37]);
}
218
#[test]
fn get_outliers_negative_2() {
    // Unsorted input whose only negative value is expected to be a lower outlier.
    let samples = vec![-62.3, 67.9, 71.02, 43.3, 51.7, 65.43, 67.23];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, false)
        .get_outliers()
        .unwrap();

    assert_eq!(lower, vec![-62.3]);
    assert_eq!(inliers, vec![43.3, 51.7, 65.43, 67.23, 67.9, 71.02]);
    assert!(upper.is_empty());
}
232
#[test]
fn negative_k_value_error() {
    // A k value below zero must be rejected with NegativeKValue.
    let outcome = OutlierIdentifier::new(vec![30.0], true)
        .with_k_value(-3.0)
        .get_outliers();

    assert!(matches!(outcome, Err(OutlierError::NegativeKValue)));
}
241
#[test]
fn has_outliers_false() {
    // Sorted input in which no outlier is expected.
    let samples = vec![1.0, 2.0, 4.0, 10.0];
    let found = OutlierIdentifier::new(samples, true)
        .has_outliers()
        .unwrap();

    assert!(!found);
}
249
#[test]
fn has_outliers_true() {
    // The data below is NOT in ascending order, so it must be flagged as
    // unsorted; this also exercises the sorting path of `has_outliers`.
    let data = vec![-62.3, 67.9, 71.02, 43.3, 51.7, 65.43, 67.23];
    let has_outliers = OutlierIdentifier::new(data, false)
        .has_outliers()
        .unwrap();

    assert!(has_outliers);
}