dsfb_database/baselines/adwin.rs
1//! ADWIN (ADaptive WINdowing) change-point detector.
2//!
3//! Reference: Bifet & Gavaldà, *Learning from time-changing data with
4//! adaptive windowing*, SDM 2007, Theorems 1–3.
5//!
6//! Core idea: maintain a window W of recent observations. Whenever any
7//! split `W = W₀ · W₁` shows |mean(W₀) − mean(W₁)| larger than a
8//! Hoeffding-style bound ε_cut depending on the split sizes and a
9//! confidence δ, drop W₀ and emit a change-point at the cut.
10//!
11//! Simplifications vs. the paper that do not affect correctness for our
12//! purposes:
13//! * We use the linear-scan split test (the paper's §4 exponential-
14//! histogram data structure is an optimisation; our series are short,
15//! so the simpler code is faster to audit).
16//! * We assume values are real and bounded within a single series (the
17//! paper's Hoeffding bound uses the sample range; we track
18//! `observed_min` / `observed_max` per series).
19//!
20//! Determinism: the detector is a pure function of the input series.
21
22use super::ChangePointDetector;
23
24pub struct Adwin {
25 /// Confidence parameter δ. Smaller δ → larger ε_cut → fewer false
26 /// positives. The SDM paper uses δ = 0.002; we match that so the
27 /// numbers in our bake-off can be cross-checked against
28 /// implementations calibrated on the published SEA / HYPERPLANE
29 /// benchmarks.
30 pub delta: f64,
31 /// Minimum number of samples on either side of a candidate split —
32 /// below this we don't test. Matches the paper's "buckets of at least
33 /// 2 observations" guidance; 5 is the conservative default used by
34 /// MOA's reference Java implementation.
35 pub min_side: usize,
36}
37
38impl Default for Adwin {
39 fn default() -> Self {
40 Self {
41 delta: 0.002,
42 min_side: 5,
43 }
44 }
45}
46
47impl ChangePointDetector for Adwin {
48 fn name(&self) -> &'static str {
49 "adwin"
50 }
51
52 fn detect(&self, series: &[(f64, f64)]) -> Vec<f64> {
53 let mut cps = Vec::new();
54 if series.len() < 2 * self.min_side {
55 return cps;
56 }
57 // Running window = indices [start, end). After emitting a change
58 // at cut-index `k`, we drop [start, k) and continue with [k, end).
59 let mut start: usize = 0;
60 let mut end: usize = 0;
61 while end < series.len() {
62 end += 1;
63 let n = end - start;
64 if n < 2 * self.min_side {
65 continue;
66 }
67 // Scan all valid splits; take the first one that exceeds
68 // ε_cut, as the paper does. A more elaborate version takes
69 // argmax |difference|; using "first" keeps the output
70 // deterministic without an ordering tiebreak rule.
71 let (mut best_cut, mut best_diff) = (None, 0.0f64);
72 for k in (start + self.min_side)..=(end - self.min_side) {
73 let n0 = (k - start) as f64;
74 let n1 = (end - k) as f64;
75 let mean0 = series[start..k].iter().map(|(_, v)| v).sum::<f64>() / n0;
76 let mean1 = series[k..end].iter().map(|(_, v)| v).sum::<f64>() / n1;
77 let diff = (mean0 - mean1).abs();
78 if diff > best_diff {
79 best_diff = diff;
80 best_cut = Some(k);
81 }
82 }
83 let Some(k) = best_cut else { continue };
84 let n0 = (k - start) as f64;
85 let n1 = (end - k) as f64;
86 let eps_cut = self.epsilon_cut(series, start, end, n0, n1);
87 if best_diff > eps_cut {
88 cps.push(series[k].0);
89 start = k;
90 }
91 }
92 cps
93 }
94}
95
96impl Adwin {
97 /// ε_cut per Theorem 3.1 of Bifet & Gavaldà 2007, specialised to
98 /// bounded-range data. `R` is the observed range over the current
99 /// window; `m` is the harmonic mean of the two side sizes.
100 fn epsilon_cut(
101 &self,
102 series: &[(f64, f64)],
103 start: usize,
104 end: usize,
105 n0: f64,
106 n1: f64,
107 ) -> f64 {
108 let (mut lo, mut hi) = (f64::INFINITY, f64::NEG_INFINITY);
109 for &(_, v) in &series[start..end] {
110 if v < lo {
111 lo = v;
112 }
113 if v > hi {
114 hi = v;
115 }
116 }
117 let r = (hi - lo).max(f64::EPSILON);
118 let m = 2.0 * n0 * n1 / (n0 + n1);
119 let n = (end - start) as f64;
120 let d_prime = self.delta / n.max(1.0);
121 // ε_cut = sqrt( (R² / (2m)) · ln(2/δ′) )
122 ((r * r) / (2.0 * m) * (2.0 / d_prime).ln()).sqrt()
123 }
124}
125
126#[cfg(test)]
127mod tests {
128 use super::*;
129
130 #[test]
131 fn flags_large_mean_shift() {
132 // 50 samples at 0.0, then 50 at 2.0 — an obvious change.
133 let mut series = Vec::new();
134 for i in 0..50 {
135 series.push((i as f64, 0.0));
136 }
137 for i in 50..100 {
138 series.push((i as f64, 2.0));
139 }
140 let cps = Adwin::default().detect(&series);
141 assert!(!cps.is_empty(), "ADWIN should flag an obvious shift");
142 }
143
144 #[test]
145 fn stays_silent_on_flat_series() {
146 // All zeros — no change to find.
147 let series: Vec<(f64, f64)> = (0..200).map(|i| (i as f64, 0.0)).collect();
148 let cps = Adwin::default().detect(&series);
149 assert!(
150 cps.is_empty(),
151 "ADWIN should not invent changes on flat data"
152 );
153 }
154}