augurs_outlier/
lib.rs

1#![doc = include_str!("../README.md")]
2
3use std::collections::BTreeSet;
4
5mod dbscan;
6mod error;
7mod mad;
8mod sensitivity;
9#[cfg(test)]
10mod testing;
11
12pub use dbscan::{Data as DbscanData, DbscanDetector};
13pub use error::Error;
14pub use mad::{MADDetector, PreprocessedData as MADData};
15use sensitivity::Sensitivity;
16
17/// A band indicating the min and max value considered outlying
18/// at each timestamp.
19#[derive(Debug, Clone)]
20#[cfg_attr(feature = "serde", derive(serde::Serialize))]
21#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
22pub struct Band {
23    /// The minimum value considered outlying at each timestamp.
24    pub min: Vec<f64>,
25    /// The maximum value considered outlying at each timestamp.
26    pub max: Vec<f64>,
27}
28
29impl Band {
30    fn new(n_timestamps: usize) -> Self {
31        Self {
32            min: vec![f64::NAN; n_timestamps],
33            max: vec![f64::NAN; n_timestamps],
34        }
35    }
36}
37
38/// The result of applying an outlier detection algorithm to a time series.
39#[derive(Debug, Clone)]
40#[cfg_attr(feature = "serde", derive(serde::Serialize))]
41#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
42pub struct OutlierOutput {
43    /// The indexes of the series considered outliers.
44    ///
45    /// This is a `BTreeSet` to ensure that the order of the series is preserved.
46    pub outlying_series: BTreeSet<usize>,
47
48    /// The results of the detection for each series.
49    pub series_results: Vec<Series>,
50
51    /// The band indicating the min and max value considered outlying
52    /// at each timestamp.
53    ///
54    /// This may be `None` if no cluster was found (for example if
55    /// there were fewer than 3 series in the input data in the case of
56    /// DBSCAN).
57    pub cluster_band: Option<Band>,
58}
59
60impl OutlierOutput {
61    /// Create a new `OutlierResult` from the given series results.
62    pub fn new(series_results: Vec<Series>, cluster_band: Option<Band>) -> Self {
63        Self {
64            outlying_series: series_results
65                .iter()
66                .enumerate()
67                .filter_map(|(i, s)| s.is_outlier.then_some(i))
68                .collect(),
69            series_results,
70            cluster_band,
71        }
72    }
73
74    /// Determine whether the series at the given index is an outlier.
75    pub fn is_outlier(&self, i: usize) -> bool {
76        self.outlying_series.contains(&i)
77    }
78}
79
80/// A potentially outlying series.
81#[derive(Debug, Clone)]
82#[cfg_attr(feature = "serde", derive(serde::Serialize))]
83#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
84pub struct Series {
85    /// Whether the series is an outlier for at least one of the samples.
86    pub is_outlier: bool,
87    /// The intervals of the samples that are considered outliers.
88    pub outlier_intervals: OutlierIntervals,
89    /// The outlier scores of the series for each sample.
90    ///
91    /// The higher the score, the more likely the series is an outlier.
92    /// Note that some implementations may not provide continuous scores
93    /// but rather a binary classification. In this case, the scores will
94    /// be 0.0 for non-outliers and 1.0 for outliers.
95    pub scores: Vec<f64>,
96}
97
98impl Series {
99    /// Create a new non-outlying `Series` with an empty set of scores.
100    pub fn empty() -> Self {
101        Self {
102            is_outlier: false,
103            scores: Vec::new(),
104            outlier_intervals: OutlierIntervals::empty(),
105        }
106    }
107
108    /// Create a new non-outlying `Series` with an empty set of scores
109    /// and the given capacity.
110    pub fn with_capacity(n: usize) -> Self {
111        Self {
112            is_outlier: false,
113            scores: Vec::with_capacity(n),
114            outlier_intervals: OutlierIntervals::empty(),
115        }
116    }
117
118    /// Create a `Vec<Series>` with length `n_series` where each inner `Series`
119    /// has its scores preallocated to length `n_timestamps`, all initialized to 0.0.
120    pub fn preallocated(n_series: usize, n_timestamps: usize) -> Vec<Self> {
121        std::iter::repeat_with(|| {
122            let mut s = Series::with_capacity(n_timestamps);
123            s.scores.resize(n_timestamps, 0.0);
124            s
125        })
126        .take(n_series)
127        .collect()
128    }
129}
130
131/// A list of outlier intervals for a single series.
132#[derive(Debug, Clone)]
133#[cfg_attr(feature = "serde", derive(serde::Serialize))]
134#[cfg_attr(feature = "serde", serde(rename_all = "camelCase", transparent))]
135pub struct OutlierIntervals {
136    /// The list of outlier intervals.
137    pub intervals: Vec<OutlierInterval>,
138
139    /// Are we expecting a start or end timestamp to be pushed next?
140    #[cfg_attr(feature = "serde", serde(skip))]
141    expecting_end: bool,
142}
143
144impl OutlierIntervals {
145    fn empty() -> Self {
146        Self {
147            intervals: Vec::new(),
148            expecting_end: false,
149        }
150    }
151
152    fn add_start(&mut self, ts: usize) {
153        debug_assert!(
154            !self.expecting_end,
155            "Expected end of outlier interval, got start"
156        );
157
158        self.intervals.push(OutlierInterval {
159            start: ts,
160            end: None,
161        });
162        self.expecting_end = true;
163    }
164
165    fn add_end(&mut self, ts: usize) {
166        debug_assert!(
167            self.expecting_end,
168            "Expected start of outlier interval, got end"
169        );
170
171        match self.intervals.last_mut() {
172            Some(x @ OutlierInterval { end: None, .. }) => {
173                x.end = Some(ts);
174            }
175            _ => unreachable!("tried to add end to an open-ended interval"),
176        };
177        self.expecting_end = false;
178    }
179}
180
181/// A single outlier interval.
182///
183/// An outlier interval is a contiguous range of indices in a time series
184/// where an outlier is detected.
185#[derive(Debug, Clone, PartialEq, Eq)]
186#[cfg_attr(feature = "serde", derive(serde::Serialize))]
187#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
188pub struct OutlierInterval {
189    /// The start index of the interval.
190    pub start: usize,
191    /// The end index of the interval, if it exists.
192    ///
193    /// If the interval is open-ended, this will be `None`.
194    pub end: Option<usize>,
195}
196
197/// An outlier detection algorithm.
198pub trait OutlierDetector {
199    /// The preprocessed data used by the outlier detection algorithm.
200    ///
201    /// This type is used to store the preprocessed data that is
202    /// calculated from the input data. The preprocessed data is
203    /// then used by the `detect` method to determine whether each
204    /// series is an outlier.
205    ///
206    /// An example of preprocessed data might be the transposed
207    /// input data, where elements of the inner vectors correspond
208    /// to the same timestamp in different series.
209    type PreprocessedData;
210
211    /// Preprocess the given slice of series.
212    ///
213    /// The input is a slice of aligned time series. Each series is
214    /// a slice of `f64` which represents the values of the series
215    /// over time. The length of the inner slice is the same for all series.
216    ///
217    /// The output is a preprocessed version of the input data. The exact
218    /// format of the preprocessed data is up to the implementation.
219    /// The preprocessed data will be passed to the `detect` method.
220    ///
221    /// This method is separate from `detect` to allow for more efficient
222    /// recalculations of the preprocessed data when some input parameters
223    /// change. For example, if the input data is the same but the sensitivity
224    /// changes, the outlier detection calculation can be rerun without
225    /// reprocessing the input data.
226    fn preprocess(&self, y: &[&[f64]]) -> Result<Self::PreprocessedData, Error>;
227
228    /// Detect outliers in the given slice of series.
229    ///
230    /// The output is a vector of `Series` where each `Series` corresponds
231    /// to the corresponding series in the input. The implementation will
232    /// decide whether each series is an outlier, i.e. whether it behaves
233    /// differently to the other input series.
234    fn detect(&self, y: &Self::PreprocessedData) -> Result<OutlierOutput, Error>;
235}
236
237#[cfg(test)]
238mod test {
239    use super::*;
240
241    // Keeping this here as an example of how to implement an outlier detector.
242    #[allow(dead_code)]
243    struct DummyDetector;
244
245    impl OutlierDetector for DummyDetector {
246        type PreprocessedData = Vec<Vec<f64>>;
247
248        fn preprocess(&self, y: &[&[f64]]) -> Result<Self::PreprocessedData, Error> {
249            Ok(y.iter().map(|x| x.to_vec()).collect())
250        }
251
252        fn detect(&self, y: &Self::PreprocessedData) -> Result<OutlierOutput, Error> {
253            let serieses = y
254                .iter()
255                .map(|series| {
256                    let mut intervals = OutlierIntervals::empty();
257                    intervals.add_start(1);
258                    Series {
259                        is_outlier: series.iter().any(|&x| x > 10.0),
260                        scores: series.to_vec(),
261                        outlier_intervals: intervals,
262                    }
263                })
264                .collect();
265            let band = Band {
266                min: vec![-1.0; y[0].len()],
267                max: vec![1.0; y[0].len()],
268            };
269            Ok(OutlierOutput::new(serieses, Some(band)))
270        }
271    }
272
273    #[cfg(feature = "serde")]
274    #[test]
275    fn serialize() {
276        let mut outlier_intervals = OutlierIntervals::empty();
277        outlier_intervals.add_start(1);
278        let series = Series {
279            is_outlier: true,
280            scores: vec![1.0, 2.0, 3.0],
281            outlier_intervals,
282        };
283        let output = OutlierOutput {
284            outlying_series: BTreeSet::from([0, 1]),
285            series_results: vec![series],
286            cluster_band: None,
287        };
288        let serialized = serde_json::to_string(&output).unwrap();
289        assert_eq!(
290            serialized,
291            r#"{"outlyingSeries":[0,1],"seriesResults":[{"isOutlier":true,"outlierIntervals":[{"start":1,"end":null}],"scores":[1.0,2.0,3.0]}],"clusterBand":null}"#
292        );
293    }
294}