outliers/
lib.rs

1//! ```
2//! let data = [10.0, 12.0, 11.0, 15.0, 11.0, 14.0, 13.0, 17.0, 12.0, 22.0, 14.0, 11.0].to_vec();
3//! let outlier_identifier = outliers::OutlierIdentifier::new(data, false);
4//! let results_tuple = outlier_identifier.get_outliers().unwrap();
5//!
6//! assert_eq!(results_tuple.0, [].to_vec()); // Lower outliers
7//! assert_eq!(results_tuple.1, [10.0, 11.0, 11.0, 11.0, 12.0, 12.0, 13.0, 14.0, 14.0, 15.0, 17.0].to_vec()); // Non-outliers
8//! assert_eq!(results_tuple.2, [22.0].to_vec()); // Upper outliers
9//! ```
10
11use statrs::statistics::OrderStatistics;
12use thiserror::Error;
13
14#[derive(Error, Debug)]
15pub enum OutlierError {
16    #[error("The data set contains one or more NANs")]
17    ContainsNans,
18    #[error("K value cannot be negative")]
19    NegativeKValue,
20}
21
/// Holds a data set together with the settings used to split it into lower outliers,
/// non-outliers and upper outliers.
#[derive(Debug, Clone, PartialEq)]
pub struct OutlierIdentifier {
    /// The values to be examined.
    data_set: Vec<f64>,
    /// Multiplier applied to the interquartile range when computing the fences.
    k_value: f64,
    /// When `true`, the sorting step that precedes the quartile computation is skipped.
    data_is_sorted: bool,
}
27
28impl OutlierIdentifier {
29    /// Creates a new `OutlierIdentifier`.  The default `k_value` is `1.5`, a value in outlier
30    /// identification made popular by the mathematician John Tukey.
31    pub fn new(data_set: Vec<f64>, data_is_sorted: bool) -> OutlierIdentifier {
32        OutlierIdentifier {
33            data_set,
34            data_is_sorted,
35            k_value: 1.5,
36        }
37    }
38
39    /// Allows for altering the `k_value`.  A larger `k_value` will result in fewer numbers being
40    /// identified as outliers, while a smaller `k_value` will result in more numbers being
41    /// identified as outliers.  The `k_value` must be non-negative, or `get_outliers()` will return
42    /// an `Err`.
43    pub fn with_k_value(self, k_value: f64) -> OutlierIdentifier {
44        OutlierIdentifier {
45            data_set: self.data_set,
46            data_is_sorted: self.data_is_sorted,
47            k_value,
48        }
49    }
50
51    /// Performs the outlier identification.  In the case that is does not return an `Err`, it
52    /// returns a tuple of `Vec<f64>`s.  The first vector contains any lower outliers and the third
53    /// vector contains any upper outliers.  Additionally, the second vector returned contains all
54    /// the non-outliers, so that the data set passed in is returned, in its entirety, as
55    /// partitioned subsets.  `get_outliers()` will return an `Err` if the `data_set` contains one
56    /// or more `NAN`s or if the `k_value` is a negative number.
57    #[allow(clippy::type_complexity)]
58    pub fn get_outliers(mut self) -> Result<(Vec<f64>, Vec<f64>, Vec<f64>), OutlierError> {
59        let (lower_fence, upper_fence) = self.get_fences()?;
60
61        let mut lower_outliers: Vec<f64> = Vec::new();
62        let mut upper_outliers: Vec<f64> = Vec::new();
63        let mut non_outliers: Vec<f64> = Vec::new();
64
65        for data in self.data_set {
66            if data < lower_fence {
67                lower_outliers.push(data);
68            } else if data > upper_fence {
69                upper_outliers.push(data);
70            } else {
71                non_outliers.push(data);
72            }
73        }
74
75        Ok((lower_outliers, non_outliers, upper_outliers))
76    }
77
78    /// Indicates whether the data set has outliers.  This method is useful when one only needs to
79    /// know if a data set has outliers and isn't concerned with the details of the outliers.  This
80    /// method short circuits; if any outliers exist, the moment the first one is found, the method
81    /// immediately returns with `true`, else, it returns `false`.
82    pub fn has_outliers(mut self) -> Result<bool, OutlierError> {
83        let (lower_fence, upper_fence) = self.get_fences()?;
84
85        for data in self.data_set {
86            if data < lower_fence || data > upper_fence {
87                return Ok(true);
88            }
89        }
90
91        Ok(false)
92    }
93
94    fn get_fences(&mut self) -> Result<(f64, f64), OutlierError> {
95        if self.k_value < 0.0 {
96            return Err(OutlierError::NegativeKValue);
97        }
98
99        // This should catch cases where the next `unwrap()` would panic, see:
100        // https://doc.rust-lang.org/std/vec/struct.Vec.html#method.sort_by
101        let data_set_has_nans = self.data_set.iter().any(|x| x.is_nan());
102
103        if data_set_has_nans {
104            return Err(OutlierError::ContainsNans);
105        }
106
107        if !self.data_is_sorted {
108            self.data_set.sort_by(|a, b| a.partial_cmp(b).unwrap());
109        }
110
111        let q1_value = self.data_set.lower_quartile();
112        let q3_value = self.data_set.upper_quartile();
113        let interquartile_range = q3_value - q1_value;
114
115        let intermediate_value = self.k_value * interquartile_range;
116        let lower_fence = q1_value - intermediate_value;
117        let upper_fence = q3_value + intermediate_value;
118
119        Ok((lower_fence, upper_fence))
120    }
121}
122
/// A data set made only of NANs is rejected when the sorting path is enabled.
#[test]
fn get_outliers_needs_sorted_nan_set() {
    let samples: Vec<f64> = vec![f64::NAN, f64::NAN];
    let outcome = OutlierIdentifier::new(samples, false).get_outliers();

    assert!(matches!(outcome, Err(OutlierError::ContainsNans)));
}
131
/// NANs are rejected even when the caller flags the data as already sorted.
#[test]
fn get_outliers_is_sorted_nan_set() {
    let samples: Vec<f64> = vec![3.0, 2.9, 2.8, 33.3, f64::NAN, f64::NAN];
    let outcome = OutlierIdentifier::new(samples, true).get_outliers();

    assert!(matches!(outcome, Err(OutlierError::ContainsNans)));
}
140
/// An empty data set yields three empty partitions rather than an error.
#[test]
fn get_outliers_empty_data_set() {
    let samples: Vec<f64> = Vec::new();
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert!(inliers.is_empty());
    assert!(upper.is_empty());
}
151
/// A single value is never an outlier.
#[test]
fn get_outliers_set_of_one() {
    let samples: Vec<f64> = vec![30.0];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![30.0]);
    assert!(upper.is_empty());
}
162
/// With two values, both are reported as non-outliers.
#[test]
fn get_outliers_set_of_two() {
    let samples: Vec<f64> = vec![30.0, 90.0];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![30.0, 90.0]);
    assert!(upper.is_empty());
}
173
/// A small sorted data set in which every value lies within the fences.
#[test]
fn get_outliers_none() {
    let samples: Vec<f64> = vec![1.0, 2.0, 4.0, 10.0];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(inliers, vec![1.0, 2.0, 4.0, 10.0]);
    assert!(upper.is_empty());
}
184
/// Pre-sorted data with a single upper outlier (`79.0`).
#[test]
fn get_outliers_1() {
    let samples = vec![
        0.0, 3.0, 3.0, 3.0, 11.0, 12.0, 13.0, 15.0, 19.0, 20.0, 29.0, 40.0, 79.0,
    ];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, true)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(
        inliers,
        vec![0.0, 3.0, 3.0, 3.0, 11.0, 12.0, 13.0, 15.0, 19.0, 20.0, 29.0, 40.0]
    );
    assert_eq!(upper, vec![79.0]);
}
201
/// Unsorted data containing negative values; the result comes back sorted with one upper
/// outlier (`79.37`).
#[test]
fn get_outliers_negative_1() {
    let samples = vec![
        29.5, -3.79, 15.0, 11.47, 3.6, 3.6, 19.0, 79.37, 40.7, -23.3, 12.0, 20.113, 13.39,
    ];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, false)
        .get_outliers()
        .unwrap();

    assert!(lower.is_empty());
    assert_eq!(
        inliers,
        vec![-23.3, -3.79, 3.6, 3.6, 11.47, 12.0, 13.39, 15.0, 19.0, 20.113, 29.5, 40.7]
    );
    assert_eq!(upper, vec![79.37]);
}
218
/// Unsorted data with a single negative lower outlier (`-62.3`).
#[test]
fn get_outliers_negative_2() {
    let samples = vec![-62.3, 67.9, 71.02, 43.3, 51.7, 65.43, 67.23];
    let (lower, inliers, upper) = OutlierIdentifier::new(samples, false)
        .get_outliers()
        .unwrap();

    assert_eq!(lower, vec![-62.3]);
    assert_eq!(inliers, vec![43.3, 51.7, 65.43, 67.23, 67.9, 71.02]);
    assert!(upper.is_empty());
}
232
/// A negative `k_value` is reported as `OutlierError::NegativeKValue`.
#[test]
fn negative_k_value_error() {
    let identifier = OutlierIdentifier::new(vec![30.0], true).with_k_value(-3.0);
    let outcome = identifier.get_outliers();

    assert!(matches!(outcome, Err(OutlierError::NegativeKValue)));
}
241
/// `has_outliers()` yields `false` when every value lies within the fences.
#[test]
fn has_outliers_false() {
    let samples: Vec<f64> = vec![1.0, 2.0, 4.0, 10.0];
    let identifier = OutlierIdentifier::new(samples, true);
    let found = identifier.has_outliers().unwrap();

    assert!(!found);
}
249
/// `has_outliers()` yields `true` when at least one value lies outside the fences.
#[test]
fn has_outliers_true() {
    let data = [-62.3, 67.9, 71.02, 43.3, 51.7, 65.43, 67.23].to_vec();
    // The data above is not in ascending order, so it must not be flagged as sorted.
    let has_outliers = OutlierIdentifier::new(data, false)
        .has_outliers()
        .unwrap();

    assert!(has_outliers);
}