1use itertools::Itertools;
5use roots::{find_root_brent, SimpleConvergency};
6use rv::{
7 dist::Beta,
8 traits::{Cdf, ContinuousDistr},
9};
10
11use crate::{
12 error::{DetectionError, PreprocessingError},
13 Band, Error, OutlierDetector, OutlierOutput, Sensitivity, Series,
14};
15
16const MAD_K: f64 = 1.4826;
19
20#[derive(Debug, Clone, Copy)]
21enum ThresholdOrSensitivity {
22 Sensitivity(Sensitivity),
27 Threshold(f64),
29}
30
31impl ThresholdOrSensitivity {
32 fn resolve_threshold(&self) -> f64 {
33 match self {
34 Self::Sensitivity(Sensitivity(sensitivity)) => {
35 const MAX_T: f64 = 7.941444487; const MIN_T: f64 = 0.841621234; MAX_T - ((MAX_T - MIN_T) * sensitivity.sqrt())
43 }
44 Self::Threshold(threshold) => *threshold,
45 }
46 }
47}
48
49#[derive(Debug, Clone)]
51pub struct Medians {
52 lower: f64,
53 global: f64,
54 upper: f64,
55}
56
57#[derive(Debug, Clone)]
59pub struct MADDetector {
60 threshold_or_sensitivity: ThresholdOrSensitivity,
62
63 medians: Option<Medians>,
70}
71
72impl MADDetector {
73 pub fn with_threshold(threshold: f64) -> Self {
75 Self {
76 threshold_or_sensitivity: ThresholdOrSensitivity::Threshold(threshold),
77 medians: None,
78 }
79 }
80
81 pub fn with_sensitivity(sensitivity: f64) -> Result<Self, Error> {
86 Ok(Self {
87 threshold_or_sensitivity: ThresholdOrSensitivity::Sensitivity(sensitivity.try_into()?),
88 medians: None,
89 })
90 }
91
92 pub fn set_threshold(&mut self, threshold: f64) {
103 self.threshold_or_sensitivity = ThresholdOrSensitivity::Threshold(threshold);
104 }
105
106 pub fn set_sensitivity(&mut self, sensitivity: f64) -> Result<(), Error> {
117 self.threshold_or_sensitivity =
118 ThresholdOrSensitivity::Sensitivity(sensitivity.try_into()?);
119 Ok(())
120 }
121
122 pub fn set_medians(&mut self, medians: Medians) {
126 self.medians = Some(medians);
127 }
128
129 pub fn calculate_double_medians(data: &[&[f64]]) -> Result<Medians, PreprocessingError> {
139 let flattened = data
140 .iter()
141 .flat_map(|x| x.iter())
142 .copied()
143 .collect::<Vec<_>>();
144 let global = thd_nanmedian(&flattened, true)?;
145 let (mut lower_deviations, mut upper_deviations) = (Vec::new(), Vec::new());
146 for row in data {
147 for value in *row {
148 if !value.is_finite() {
149 continue;
150 }
151 let deviation = value - global;
152 match deviation {
153 0.0 => {
155 upper_deviations.push(0.0);
156 lower_deviations.push(0.0);
157 }
158 _ if deviation > 0.0 => upper_deviations.push(deviation),
159 _ => lower_deviations.push(-deviation),
160 }
161 }
162 }
163
164 let lower = thd_median(&lower_deviations, false, true);
165 let upper = thd_median(&upper_deviations, false, true);
166 if let (Ok(lower), Ok(upper)) = (lower, upper) {
167 if lower == 0.0 || upper == 0.0 {
168 Err(PreprocessingError::from(MADError::DivideByZero))
169 } else {
170 Ok(Medians {
171 lower,
172 global,
173 upper,
174 })
175 }
176 } else {
177 Err(PreprocessingError::from(MADError::DivideByZero))
178 }
179 }
180
181 fn calculate_mad(
182 &self,
183 data: &[&[f64]],
184 Medians {
185 global,
186 lower,
187 upper,
188 }: &Medians,
189 ) -> Vec<Vec<f64>> {
190 data.iter()
191 .map(|row| {
192 row.iter()
193 .map(|&value| {
194 if !value.is_finite() {
195 return f64::NAN;
196 }
197 let deviation = value - global;
198 let mut score = if deviation == 0.0 {
199 0.0
200 } else if deviation < 0.0 {
201 -deviation / (MAD_K * lower)
202 } else {
203 deviation / (MAD_K * upper)
204 };
205 if score.is_infinite() {
206 score = f64::NAN;
207 }
208 score
209 })
210 .collect::<Vec<_>>()
211 })
212 .collect::<Vec<_>>()
213 }
214
215 fn preprocess_impl(&self, y: &[&[f64]]) -> Result<PreprocessedData, PreprocessingError> {
216 let medians = self
217 .medians
218 .clone()
219 .map(Ok)
220 .unwrap_or_else(|| Self::calculate_double_medians(y))
221 .map_err(|x| PreprocessingError::from(Box::new(x) as Box<dyn std::error::Error>))?;
222 let mad_scores = self.calculate_mad(y, &medians);
223 Ok(PreprocessedData {
224 medians,
225 mad_scores,
226 })
227 }
228
229 fn detect_impl(
230 &self,
231 y: &<Self as OutlierDetector>::PreprocessedData,
232 ) -> Result<OutlierOutput, DetectionError> {
233 let PreprocessedData {
234 mad_scores,
235 medians,
236 } = y;
237 let threshold = self.threshold_or_sensitivity.resolve_threshold();
238 let upper_limit = medians.global + MAD_K * medians.upper * threshold;
239 let lower_limit = medians.global - MAD_K * medians.lower * threshold;
240 let n_series = mad_scores.len();
241 let n_timestamps = mad_scores
242 .first()
243 .map(Vec::len)
244 .ok_or(MADError::EmptyInput)?;
245
246 let normal_band = Some(Band {
248 min: vec![lower_limit; n_timestamps],
249 max: vec![upper_limit; n_timestamps],
250 });
251
252 let mut serieses = Series::preallocated(n_series, n_timestamps);
254 for (series, scores) in serieses.iter_mut().zip(mad_scores.iter()) {
255 series.scores.clone_from(scores);
256 let mut current = false;
258 for (i, score) in scores.iter().copied().enumerate() {
259 if score > threshold {
260 series.is_outlier = true;
261 if !current {
262 series.outlier_intervals.add_start(i);
263 }
264 current = true;
265 } else if current {
266 series.outlier_intervals.add_end(i);
267 current = false;
268 }
269 }
270 }
271
272 Ok(OutlierOutput::new(serieses, normal_band))
273 }
274}
275
276#[derive(Debug, Clone)]
281pub struct PreprocessedData {
282 medians: Medians,
283 mad_scores: Vec<Vec<f64>>,
284}
285
286impl OutlierDetector for MADDetector {
287 type PreprocessedData = PreprocessedData;
288 fn preprocess(&self, y: &[&[f64]]) -> Result<Self::PreprocessedData, Error> {
289 Ok(self.preprocess_impl(y)?)
290 }
291
292 fn detect(&self, y: &Self::PreprocessedData) -> Result<OutlierOutput, Error> {
293 Ok(self.detect_impl(y)?)
294 }
295}
296
297#[derive(Debug, thiserror::Error)]
298enum MADError {
299 #[error("no convergence: {0}")]
300 NoConvergence(roots::SearchError),
301 #[error("invalid parameters: {0}")]
302 InvalidParameters(rv::dist::BetaError),
303 #[error("empty input")]
304 EmptyInput,
305 #[error("division by zero")]
306 DivideByZero,
307}
308
309impl From<MADError> for PreprocessingError {
310 fn from(e: MADError) -> Self {
311 PreprocessingError::from(Box::new(e) as Box<dyn std::error::Error>)
312 }
313}
314
315impl From<MADError> for DetectionError {
316 fn from(e: MADError) -> Self {
317 DetectionError::from(Box::new(e) as Box<dyn std::error::Error>)
318 }
319}
320
321fn beta_hdi(alpha: f64, beta: f64, width: f64) -> Result<(f64, f64), MADError> {
322 const EPS: f64 = 1e-9;
323 if alpha < 1.0 + EPS && beta < 1.0 + EPS {
324 return Ok((0.0, 0.0));
326 } else if alpha < 1.0 + EPS && beta > 1.0 {
327 return Ok((0.0, width));
329 } else if alpha > 1.0 && beta < 1.0 + EPS {
330 return Ok((width, 0.0));
332 }
333 if width > 1.0 - EPS {
334 return Ok((0.0, 1.0));
335 }
336
337 let mode = (alpha - 1.0) / (alpha + beta - 2.0);
339 let dist = Beta::new(alpha, beta).map_err(MADError::InvalidParameters)?;
340
341 let lower = (mode - width).max(0.0);
342 let upper = mode.min(1.0 - width);
343 let mut convergency = SimpleConvergency {
344 eps: f64::EPSILON,
345 max_iter: 30,
346 };
347 let left = find_root_brent(
348 lower,
349 upper,
350 |x| dist.pdf(&x) - dist.pdf(&(x + width)),
351 &mut convergency,
352 )
353 .map_err(MADError::NoConvergence)?;
354 let right = left + width;
355 Ok((left, right))
356}
357
358fn thd_quantile(x: &[f64], q: f64, ignore_nan: bool, sort: bool) -> Result<f64, MADError> {
359 let mut x = if ignore_nan {
360 x.iter()
361 .copied()
362 .filter(|&v| !v.is_nan())
363 .collect::<Vec<_>>()
364 } else {
365 x.to_vec()
366 };
367 if sort {
368 x.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater));
369 }
370
371 let width = 1.0 / (x.len() as f64).sqrt();
372 let n = x.len();
373 if n == 0 {
374 return Err(MADError::EmptyInput);
375 } else if n == 1 {
376 return Ok(x[0]);
377 }
378
379 let alpha = (n as f64 + 1.0) * q;
380 let beta = (n as f64 + 1.0) * (1.0 - q);
381 let dist = Beta::new(alpha, beta).map_err(MADError::InvalidParameters)?;
382 let (hdi_left, hdi_right) = beta_hdi(alpha, beta, width)?;
383 let hdi_cdf_left = dist.cdf(&hdi_left);
384 let hdi_cdf_right = dist.cdf(&hdi_right);
385
386 let left_index = (hdi_left * n as f64).floor() as usize;
387 let right_index = (hdi_right * n as f64).ceil() as usize;
388 let weights = (left_index..(right_index + 1))
389 .map(|i| {
390 let numerator = dist.cdf(&(i as f64 / n as f64)) - hdi_cdf_left;
391 let denominator = hdi_cdf_right - hdi_cdf_left;
392 (numerator / denominator).clamp(0.0, 1.0)
393 })
394 .tuple_windows()
395 .map(|(a, b)| b - a);
396 Ok(x[left_index..right_index]
397 .iter()
398 .zip(weights)
399 .map(|(&x_i, w)| x_i * w)
400 .sum())
401}
402
403fn thd_median(x: &[f64], ignore_nan: bool, sort: bool) -> Result<f64, MADError> {
404 thd_quantile(x, 0.5, ignore_nan, sort)
405}
406
407fn thd_nanmedian(x: &[f64], sort: bool) -> Result<f64, MADError> {
408 thd_quantile(x, 0.5, true, sort)
409}
410
411#[cfg(test)]
412mod test {
413 use itertools::Itertools;
414 use rv::prelude::*;
415
416 use crate::{testing::flatten_intervals, MADDetector, OutlierDetector};
417
418 use super::Medians;
419
420 #[test]
421 fn beta_hdi() {
422 assert_eq!(
423 super::beta_hdi(5.5, 5.5, 0.31622776601683794).unwrap(),
424 (0.341886116991581, 0.658113883008419)
425 );
426 }
427
428 struct THDTestCase {
429 data: &'static [f64],
430 expected: f64,
431 }
432 const THD_TEST_CASES: &[THDTestCase] = &[
433 THDTestCase {
434 data: &[
435 -0.565, -0.106, -0.095, 0.363, 0.404, 0.633, 1.371, 1.512, 2.018, 100_000.0,
436 ],
437 expected: 0.6268069427582939,
438 },
439 THDTestCase {
440 data: &[-6.0, -5.0, -4.0, -16.0, -5.0, 15.0, -7.0, -8.0, -16.0],
441 expected: -6.0,
442 },
443 ];
444 const Q: f64 = 0.5;
445
446 #[test]
447 fn thd_quantile() {
448 for tc in THD_TEST_CASES {
449 assert_eq!(
450 super::thd_quantile(tc.data, Q, false, true).unwrap(),
451 tc.expected
452 );
453 }
454 }
455
456 #[test]
457 fn thd_median() {
458 for tc in THD_TEST_CASES {
459 assert_eq!(
460 super::thd_median(tc.data, false, true).unwrap(),
461 tc.expected
462 );
463 assert_eq!(
464 super::thd_quantile(tc.data, 0.5, false, true).unwrap(),
465 tc.expected
466 );
467 }
468 assert!(
469 super::thd_median(&[f64::NAN, f64::NAN, f64::NAN, 1.0, 1.0], false, true)
470 .unwrap()
471 .is_nan()
472 );
473 assert!(super::thd_median(
474 &[f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::NAN],
475 false,
476 true
477 )
478 .unwrap()
479 .is_nan());
480 }
481
482 #[test]
483 fn thd_nanmedian() {
484 for tc in THD_TEST_CASES {
485 assert_eq!(super::thd_nanmedian(tc.data, true).unwrap(), tc.expected);
486 assert_eq!(
487 super::thd_quantile(tc.data, 0.5, true, true).unwrap(),
488 tc.expected
489 );
490 }
491 assert_eq!(
492 super::thd_nanmedian(&[f64::NAN, f64::NAN, f64::NAN, 1.0, 1.0], true).unwrap(),
493 1.0
494 );
495 assert!(matches!(
496 super::thd_nanmedian(&[f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::NAN], true),
497 Err(super::MADError::EmptyInput),
498 ));
499 }
500
501 #[derive(Debug, Clone)]
502 struct Expected {
503 outliers: &'static [f64],
504 intervals: &'static [usize],
505 }
506
507 #[derive(Debug, Clone)]
508 struct MADTestCase<'a> {
509 name: &'static str,
510 data: &'a [f64],
511 expected: Result<Expected, &'static str>,
512 precalculated_medians: Option<Medians>,
513 }
514
515 const BASE_SAMPLE: &[f64] = &[
516 -2002., -2001., -2000., 9., 47., 50., 71., 78., 79., 97., 98., 117., 123., 136., 138.,
517 143., 145., 167., 185., 202., 216., 217., 229., 235., 242., 257., 297., 300., 315., 344.,
518 347., 347., 360., 362., 368., 387., 400., 428., 455., 468., 484., 493., 523., 557., 574.,
519 586., 605., 617., 618., 634., 641., 646., 649., 674., 678., 689., 699., 703., 709., 714.,
520 740., 795., 798., 839., 880., 938., 941., 983., 1014., 1021., 1022., 1165., 1183., 1195.,
521 1250., 1254., 1288., 1292., 1326., 1362., 1363., 1421., 1549., 1585., 1605., 1629., 1694.,
522 1695., 1719., 1799., 1827., 1828., 1862., 1991., 2140., 2186., 2255., 2266., 2295., 2321.,
523 2419., 2919., 3612., 6000., 6001., 6002.,
524 ];
525
526 fn gen_multi_modal_data() -> Vec<f64> {
527 let mut rng = rand::thread_rng();
528 let lower = rv::dist::Uniform::new_unchecked(0., 100.).sample(100, &mut rng);
529 let upper = rv::dist::Uniform::new_unchecked(1000., 1100.).sample(100, &mut rng);
530 lower.into_iter().interleave(upper).collect()
531 }
532
533 const MAD_TEST_CASES: &[MADTestCase<'_>] = &[
534 MADTestCase {
535 name: "normal",
536 data: BASE_SAMPLE,
537 expected: Ok(Expected {
538 outliers: &[-2002., -2001., -2000., 6000., 6001., 6002.],
539 intervals: &[0, 3, 103],
540 }),
541 precalculated_medians: None,
542 },
543 MADTestCase {
544 name: "precalculated medians",
545 data: BASE_SAMPLE,
546 expected: Ok(Expected {
547 outliers: &[-2002., -2001., -2000.],
548 intervals: &[0, 3, 103],
549 }),
550 precalculated_medians: Some(Medians {
551 lower: 378., global: 663., upper: 6000., }),
555 },
556 MADTestCase {
557 name: "all same so no outliers [throws]",
558 data: &[2., 2., 2., 2., 2., 2., 2., 2., 2.],
559 expected: Err("division by zero"),
560 precalculated_medians: None,
561 },
562 MADTestCase {
563 data: &[-6., -5., -4., -16., -5., 15., -7., -8., -16.],
564 name: "mixed positive and negative",
565 expected: Ok(Expected {
566 outliers: &[15.],
567 intervals: &[5, 6],
568 }),
569 precalculated_medians: None,
570 },
571 MADTestCase {
572 name: "zero majority [throws]",
573 data: &[0., 0., 0., 0., 0., 0., 11., 12., 0., 11.],
574 expected: Err("division by zero"),
575 precalculated_medians: None,
576 },
577 MADTestCase {
578 name: "only zero [throws]",
579 data: &[0., 0., 0., 0., 0., 0., 0., 0., 0.],
580 expected: Err("division by zero"),
581 precalculated_medians: None,
582 },
583 MADTestCase {
584 name: "the -2 likely outlying here",
585 data: &[-2., f64::NAN, 21., 22., 23., f64::NAN, f64::NAN, 21., 24.],
586 expected: Ok(Expected {
587 outliers: &[-2.],
588 intervals: &[0, 1],
589 }),
590 precalculated_medians: None,
591 },
592 MADTestCase {
593 name: "mostly 3s mixed with nans, no outliers [throws]",
594 data: &[3., f64::NAN, 3., 3., 3., f64::NAN, f64::NAN, 3., 4.],
595 expected: Err("division by zero"),
596 precalculated_medians: None,
597 },
598 MADTestCase {
599 name: "just checking floats are ok",
600 data: &[
601 31.6, 33.12, 33.84, 38.234, 12.83, 15.23, 33.23, 32.85, 24.72,
602 ],
603 expected: Ok(Expected {
604 outliers: &[38.234],
605 intervals: &[3, 4],
606 }),
607 precalculated_medians: None,
608 },
609 MADTestCase {
610 name: "all nans returns an error",
611 data: &[
612 f64::NAN,
613 f64::NAN,
614 f64::NAN,
615 f64::NAN,
616 f64::NAN,
617 f64::NAN,
618 f64::NAN,
619 f64::NAN,
620 f64::NAN,
621 ],
622 expected: Err("empty input"),
625 precalculated_medians: None,
626 },
627 MADTestCase {
628 name: "single very large outlier",
629 data: &[
630 -0.565, -0.106, -0.095, 0.363, 0.404, 0.633, 1.371, 1.512, 2.018, 100_000.,
631 ],
632 expected: Ok(Expected {
633 outliers: &[100_000.],
634 intervals: &[9],
635 }),
636 precalculated_medians: None,
637 },
638 MADTestCase {
639 name: "zero global median with outliers",
640 data: &[
641 -1000., -2., -1., -0.1, 0., 0., 0., 0., 0., 0.1, 1., 2., 1000.,
642 ],
643 expected: Ok(Expected {
644 outliers: &[-1000., -2., -1., 1., 2., 1000.],
645 intervals: &[0, 3, 10],
646 }),
647 precalculated_medians: None,
648 },
649 ];
650
651 fn test_calculate_mad(tc: &MADTestCase<'_>) {
652 let mad = MADDetector::with_sensitivity(0.5).unwrap();
653 let result = tc
654 .precalculated_medians
655 .clone()
656 .map(Ok)
657 .unwrap_or_else(|| MADDetector::calculate_double_medians(&[tc.data]))
658 .map(|medians| mad.calculate_mad(&[tc.data], &medians));
659 match &tc.expected {
660 Ok(Expected { outliers, .. }) => {
661 assert!(
662 result.is_ok(),
663 "case {} failed, got {}",
664 tc.name,
665 result.unwrap_err()
666 );
667 let scores = result.unwrap();
668 let got_outliers = tc
669 .data
670 .iter()
671 .enumerate()
672 .filter_map(|(i, x)| if scores[0][i] > 3.0 { Some(x) } else { None })
673 .copied()
674 .collect_vec();
675 assert_eq!(outliers, &got_outliers, "case {} failed", tc.name);
676 }
677 Err(exp) => {
678 assert!(result.is_err(), "case {} failed", tc.name);
679 assert_eq!(
680 &result.unwrap_err().to_string(),
681 exp,
682 "case {} failed",
683 tc.name
684 );
685 }
686 }
687 }
688
689 #[test]
690 fn calculate_mad() {
691 for case in MAD_TEST_CASES {
692 test_calculate_mad(case);
693 }
694 }
695
696 #[test]
697 fn calculate_mad_missing_one() {
698 let mut data = BASE_SAMPLE.to_vec();
699 data[0] = f64::NAN;
700 let tc = MADTestCase {
701 name: "missing one",
702 data: &data,
703 expected: Ok(Expected {
704 outliers: &[-2001., -2000., 6000., 6001., 6002.],
705 intervals: &[],
706 }),
707 precalculated_medians: None,
708 };
709 test_calculate_mad(&tc)
710 }
711
712 #[test]
713 fn calculate_mad_multimodal() {
714 let tc = MADTestCase {
715 name: "multimodal data",
716 data: &gen_multi_modal_data(),
717 expected: Ok(Expected {
718 outliers: &[],
719 intervals: &[],
720 }),
721 precalculated_medians: None,
722 };
723 test_calculate_mad(&tc)
724 }
725
726 #[test]
727 fn run() {
728 for tc in MAD_TEST_CASES {
729 let sensitivity = 0.5;
730 let mad = MADDetector::with_sensitivity(sensitivity).unwrap();
731 let result = mad
732 .preprocess(&[tc.data])
733 .and_then(|preprocessed| mad.detect(&preprocessed));
734 match &tc.expected {
735 Ok(Expected { intervals, .. }) => {
736 assert!(
737 result.is_ok(),
738 "case {} failed, got {}",
739 tc.name,
740 result.unwrap_err()
741 );
742 let output = result.unwrap();
743 assert_eq!(output.series_results.len(), 1, "case {} failed", tc.name);
744 let got_intervals =
745 flatten_intervals(&output.series_results[0].outlier_intervals.intervals);
746 assert_eq!(intervals, &got_intervals, "case {} failed", tc.name);
747 }
748 Err(exp) => {
749 assert!(result.is_err(), "case {} failed", tc.name);
750 assert!(
751 &result.unwrap_err().to_string().contains(exp),
752 "case {} failed",
753 tc.name
754 );
755 }
756 }
757 }
758 }
759}