dsfb_database/baselines/
adwin.rs

1//! ADWIN (ADaptive WINdowing) change-point detector.
2//!
3//! Reference: Bifet & Gavaldà, *Learning from time-changing data with
4//! adaptive windowing*, SDM 2007, Theorems 1–3.
5//!
6//! Core idea: maintain a window W of recent observations. Whenever any
7//! split `W = W₀ · W₁` shows |mean(W₀) − mean(W₁)| larger than a
8//! Hoeffding-style bound ε_cut depending on the split sizes and a
9//! confidence δ, drop W₀ and emit a change-point at the cut.
10//!
11//! Simplifications vs. the paper that do not affect correctness for our
12//! purposes:
13//! * We use the linear-scan split test (the paper's §4 exponential-
14//!   histogram data structure is an optimisation; our series are short,
15//!   so the simpler code is faster to audit).
16//! * We assume values are real and bounded within a single series (the
17//!   paper's Hoeffding bound uses the sample range; we track
18//!   `observed_min` / `observed_max` per series).
19//!
20//! Determinism: the detector is a pure function of the input series.
21
22use super::ChangePointDetector;
23
24pub struct Adwin {
25    /// Confidence parameter δ. Smaller δ → larger ε_cut → fewer false
26    /// positives. The SDM paper uses δ = 0.002; we match that so the
27    /// numbers in our bake-off can be cross-checked against
28    /// implementations calibrated on the published SEA / HYPERPLANE
29    /// benchmarks.
30    pub delta: f64,
31    /// Minimum number of samples on either side of a candidate split —
32    /// below this we don't test. Matches the paper's "buckets of at least
33    /// 2 observations" guidance; 5 is the conservative default used by
34    /// MOA's reference Java implementation.
35    pub min_side: usize,
36}
37
38impl Default for Adwin {
39    fn default() -> Self {
40        Self {
41            delta: 0.002,
42            min_side: 5,
43        }
44    }
45}
46
47impl ChangePointDetector for Adwin {
48    fn name(&self) -> &'static str {
49        "adwin"
50    }
51
52    fn detect(&self, series: &[(f64, f64)]) -> Vec<f64> {
53        let mut cps = Vec::new();
54        if series.len() < 2 * self.min_side {
55            return cps;
56        }
57        // Running window = indices [start, end). After emitting a change
58        // at cut-index `k`, we drop [start, k) and continue with [k, end).
59        let mut start: usize = 0;
60        let mut end: usize = 0;
61        while end < series.len() {
62            end += 1;
63            let n = end - start;
64            if n < 2 * self.min_side {
65                continue;
66            }
67            // Scan all valid splits; take the first one that exceeds
68            // ε_cut, as the paper does. A more elaborate version takes
69            // argmax |difference|; using "first" keeps the output
70            // deterministic without an ordering tiebreak rule.
71            let (mut best_cut, mut best_diff) = (None, 0.0f64);
72            for k in (start + self.min_side)..=(end - self.min_side) {
73                let n0 = (k - start) as f64;
74                let n1 = (end - k) as f64;
75                let mean0 = series[start..k].iter().map(|(_, v)| v).sum::<f64>() / n0;
76                let mean1 = series[k..end].iter().map(|(_, v)| v).sum::<f64>() / n1;
77                let diff = (mean0 - mean1).abs();
78                if diff > best_diff {
79                    best_diff = diff;
80                    best_cut = Some(k);
81                }
82            }
83            let Some(k) = best_cut else { continue };
84            let n0 = (k - start) as f64;
85            let n1 = (end - k) as f64;
86            let eps_cut = self.epsilon_cut(series, start, end, n0, n1);
87            if best_diff > eps_cut {
88                cps.push(series[k].0);
89                start = k;
90            }
91        }
92        cps
93    }
94}
95
96impl Adwin {
97    /// ε_cut per Theorem 3.1 of Bifet & Gavaldà 2007, specialised to
98    /// bounded-range data. `R` is the observed range over the current
99    /// window; `m` is the harmonic mean of the two side sizes.
100    fn epsilon_cut(
101        &self,
102        series: &[(f64, f64)],
103        start: usize,
104        end: usize,
105        n0: f64,
106        n1: f64,
107    ) -> f64 {
108        let (mut lo, mut hi) = (f64::INFINITY, f64::NEG_INFINITY);
109        for &(_, v) in &series[start..end] {
110            if v < lo {
111                lo = v;
112            }
113            if v > hi {
114                hi = v;
115            }
116        }
117        let r = (hi - lo).max(f64::EPSILON);
118        let m = 2.0 * n0 * n1 / (n0 + n1);
119        let n = (end - start) as f64;
120        let d_prime = self.delta / n.max(1.0);
121        // ε_cut = sqrt( (R² / (2m)) · ln(2/δ′) )
122        ((r * r) / (2.0 * m) * (2.0 / d_prime).ln()).sqrt()
123    }
124}
125
126#[cfg(test)]
127mod tests {
128    use super::*;
129
130    #[test]
131    fn flags_large_mean_shift() {
132        // 50 samples at 0.0, then 50 at 2.0 — an obvious change.
133        let mut series = Vec::new();
134        for i in 0..50 {
135            series.push((i as f64, 0.0));
136        }
137        for i in 50..100 {
138            series.push((i as f64, 2.0));
139        }
140        let cps = Adwin::default().detect(&series);
141        assert!(!cps.is_empty(), "ADWIN should flag an obvious shift");
142    }
143
144    #[test]
145    fn stays_silent_on_flat_series() {
146        // All zeros — no change to find.
147        let series: Vec<(f64, f64)> = (0..200).map(|i| (i as f64, 0.0)).collect();
148        let cps = Adwin::default().detect(&series);
149        assert!(
150            cps.is_empty(),
151            "ADWIN should not invent changes on flat data"
152        );
153    }
154}
dsfb_database/baselines/adwin.rs

dsfb_database/baselines/
adwin.rs