Skip to main content

anomalyx_detect/
point.rs

1//! Statistical point-anomaly detector.
2//!
3//! Flags individual cells that sit far from their column's center using the
4//! **Iglewicz–Hoaglin modified z-score**, `M_i = 0.6745·(x_i − median)/MAD`.
5//! MAD is used instead of the mean/σ z-score because it is robust: a few wild
6//! values don't inflate the spread and mask each other. When MAD is zero (a
7//! near-constant column with a handful of different values) the detector falls
8//! back to the classic mean/σ z-score; if σ is also zero the column is truly
9//! constant and nothing is flagged.
10//!
11//! Everything is shift- and scale-invariant and order-independent, which is
12//! exactly what the property tests pin down.
13
14use crate::config::DetectConfig;
15use crate::{calibrate, fdr, robustz, Detector, Report, ScanContext};
16use ax_core::finding::Handle;
17use ax_core::{AnomalyClass, Column, Finding, Value};
18
19#[derive(Debug, Default, Clone)]
20pub struct PointDetector;
21
22impl Detector for PointDetector {
23    fn id(&self) -> &'static str {
24        "point.modz"
25    }
26
27    fn class(&self) -> AnomalyClass {
28        AnomalyClass::Point
29    }
30
31    fn detect(&self, ctx: &ScanContext, cfg: &DetectConfig, out: &mut Report) {
32        let mut eligible = 0usize; // numeric columns with enough finite values
33        let mut scanned = 0usize; // of those, the ones a role didn't skip
34        for col in &ctx.current.columns {
35            if !col.ty.is_numeric() {
36                continue;
37            }
38            let xs = col.numeric();
39            if xs.len() < cfg.point_min_n {
40                continue;
41            }
42            eligible += 1;
43            // Skip columns whose role makes a magnitude outlier meaningless: an
44            // identifier (arbitrary label) or a monotonic sequence (a ramp's
45            // "outlier" is just its endpoint). A constant column is left to
46            // `scan_column` (it self-no-ops). Roles still ship in the envelope;
47            // `column_roles = false` disables this skipping entirely.
48            if cfg.column_roles && col.role().skips_value_detection() {
49                continue;
50            }
51            scanned += 1;
52            self.scan_column(col, &xs, cfg, out);
53        }
54        // Honest absence only when there was nothing to measure in the first
55        // place — not when columns existed but were all role-skipped (point ran;
56        // it simply had no measurement column to flag).
57        if eligible == 0 {
58            out.mark_absent(
59                self.id(),
60                format!(
61                    "no numeric column with at least {} finite values",
62                    cfg.point_min_n
63                ),
64            );
65        } else if scanned == 0 {
66            out.mark_absent(
67                self.id(),
68                "every numeric column was an identifier, category, or sequence \
69                 (no measurement column to assess; see `roles`)"
70                    .to_string(),
71            );
72        }
73    }
74}
75
76impl PointDetector {
77    fn scan_column(&self, col: &Column, xs: &[f64], cfg: &DetectConfig, out: &mut Report) {
78        // Robust center/scale; a constant column has none and flags nothing.
79        let Some((center, scale, k)) = robustz::center_scale(xs) else {
80            return;
81        };
82
83        match cfg.point_fdr_q {
84            Some(q) => self.scan_column_fdr(col, center, scale, q, out),
85            None => self.scan_column_threshold(col, center, scale, k, cfg, out),
86        }
87    }
88
89    /// Fixed-cutoff mode: flag every cell whose modified z-score exceeds
90    /// `point_threshold`. No multiplicity control.
91    fn scan_column_threshold(
92        &self,
93        col: &Column,
94        center: f64,
95        scale: f64,
96        k: f64,
97        cfg: &DetectConfig,
98        out: &mut Report,
99    ) {
100        // Iterate the original cells so row indices in handles are correct.
101        for (row, cell) in col.cells.iter().enumerate() {
102            let Some(x) = numeric_cell(cell) else {
103                continue;
104            };
105            let modz = robustz::score(x, center, scale, k);
106            if modz <= cfg.point_threshold {
107                continue;
108            }
109            let reason = format!(
110                "{} = {:.6}: modified z-score {:.3} exceeds {:.3} (center={:.6}, scale={:.6})",
111                col.name, x, modz, cfg.point_threshold, center, scale
112            );
113            self.emit(
114                col,
115                row,
116                modz,
117                calibrate::from_exceedance(modz, cfg.point_threshold),
118                reason,
119                out,
120            );
121        }
122    }
123
124    /// FDR mode: convert each cell's modified z-score to a two-sided p-value and
125    /// flag only the cells that survive Benjamini–Hochberg control at level `q`
126    /// *within this column*. A column that is really just noise rejects nothing,
127    /// so it stops contributing chance flags; the fixed threshold is bypassed.
128    fn scan_column_fdr(&self, col: &Column, center: f64, scale: f64, q: f64, out: &mut Report) {
129        // First pass: (row, value, |z|, p) for every finite cell, in cell order.
130        // `z = (x − center)/scale` is the consistent-σ standardized deviation
131        // (≈ N(0, 1) under the null in both the MAD and σ branches) — unlike
132        // `robustz::score`, which additionally folds in the display constant
133        // `MODZ_K`, so it is not on a unit-variance scale and would mis-state the
134        // p-value. We use `z` for the p-value (the FDR decision) and as the
135        // reported score.
136        let mut cand: Vec<(usize, f64, f64, f64)> = Vec::new();
137        for (row, cell) in col.cells.iter().enumerate() {
138            let Some(x) = numeric_cell(cell) else {
139                continue;
140            };
141            let z = ((x - center) / scale).abs();
142            cand.push((row, x, z, fdr::two_sided_p(z)));
143        }
144
145        let pvals: Vec<f64> = cand.iter().map(|c| c.3).collect();
146        let Some(cutoff) = fdr::benjamini_hochberg(&pvals, q) else {
147            return; // nothing significant in this column
148        };
149
150        // Second pass: emit the cells BH rejects (p ≤ cutoff), in row order.
151        for (row, x, z, p) in cand {
152            if p > cutoff {
153                continue;
154            }
155            let reason = format!(
156                "{} = {:.6}: standardized deviation z={:.3}, p={:.3e} ≤ BH cutoff \
157                 {:.3e} at FDR q={:.4} (center={:.6}, scale={:.6})",
158                col.name, x, z, p, cutoff, q, center, scale
159            );
160            self.emit(
161                col,
162                row,
163                z,
164                calibrate::from_undercut(p, cutoff),
165                reason,
166                out,
167            );
168        }
169    }
170
171    /// Pushes a point finding for cell `row` with the given `score` (the
172    /// magnitude statistic) and calibrated `confidence` in `[0, 1]`.
173    fn emit(
174        &self,
175        col: &Column,
176        row: usize,
177        score: f64,
178        confidence: f64,
179        reason: String,
180        out: &mut Report,
181    ) {
182        out.push(
183            Finding::new(
184                self.id(),
185                AnomalyClass::Point,
186                Handle::Cell {
187                    column: col.name.clone(),
188                    row,
189                },
190                confidence,
191                score,
192                reason,
193            )
194            .with_col_type(col.ty),
195        );
196    }
197}
198
199/// Finite numeric projection of a single cell (mirrors [`Value::as_f64`] but
200/// drops non-finite values so they never become findings).
201fn numeric_cell(v: &Value) -> Option<f64> {
202    v.as_f64().filter(|x| x.is_finite())
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208    use proptest::prelude::*;
209
210    fn col(name: &str, xs: &[f64]) -> Column {
211        Column::new(name, xs.iter().map(|&x| Value::Float(x)).collect())
212    }
213
214    fn run(xs: &[f64]) -> Report {
215        let rs = ax_core::RecordSet::new("-", "test", vec![col("x", xs)]);
216        let mut out = Report::new();
217        PointDetector.detect(
218            &ScanContext::single(&rs),
219            &DetectConfig::default(),
220            &mut out,
221        );
222        out
223    }
224
225    /// The set of cell values flagged, for invariance comparisons.
226    fn flagged_values(xs: &[f64]) -> Vec<u64> {
227        let report = run(xs);
228        let mut v: Vec<u64> = report
229            .findings
230            .iter()
231            .map(|f| match &f.handle {
232                Handle::Cell { row, .. } => xs[*row].to_bits(),
233                _ => unreachable!("point detector emits cell handles"),
234            })
235            .collect();
236        v.sort_unstable();
237        v
238    }
239
240    #[test]
241    fn obvious_outlier_is_flagged() {
242        let mut xs = vec![10.0; 30];
243        xs.push(1000.0);
244        let report = run(&xs);
245        assert_eq!(report.findings.len(), 1);
246        assert!(matches!(
247            report.findings[0].handle,
248            Handle::Cell { row: 30, .. }
249        ));
250        assert!(report.findings[0].confidence > 0.5);
251    }
252
253    fn run_cfg(xs: &[f64], cfg: &DetectConfig) -> Report {
254        let rs = ax_core::RecordSet::new("-", "test", vec![col("x", xs)]);
255        let mut out = Report::new();
256        PointDetector.detect(&ScanContext::single(&rs), cfg, &mut out);
257        out
258    }
259
260    fn fdr_cfg(q: f64) -> DetectConfig {
261        DetectConfig {
262            point_fdr_q: Some(q),
263            ..DetectConfig::default()
264        }
265    }
266
267    #[test]
268    fn fdr_flags_the_clear_outlier() {
269        // A blatant outlier has p ≈ 0, which Benjamini–Hochberg rejects in any
270        // column — FDR mode still catches what matters.
271        let mut xs = vec![10.0; 30];
272        xs.push(1000.0);
273        let r = run_cfg(&xs, &fdr_cfg(0.05));
274        assert_eq!(r.findings.len(), 1);
275        assert!(matches!(r.findings[0].handle, Handle::Cell { row: 30, .. }));
276        // The reason records the FDR decision, not a fixed threshold.
277        assert!(r.findings[0].reason.contains("FDR q="));
278    }
279
280    #[test]
281    fn fdr_adapts_to_the_number_of_tests() {
282        // The SAME outlier — standardized deviation z ≈ 4.0 (two-sided p ≈
283        // 6.3e-5) over a symmetric [-1, 0, 1] base (median 0, consistent scale
284        // 1.4826) — is significant in a small column but not in a large one,
285        // because BH's per-rank bar (k/m)·q shrinks with the number of cells
286        // tested. That multiplicity awareness is exactly what a fixed cutoff
287        // lacks. The base cells (z ≈ 0.45) are never flagged in either column.
288        let outlier = 4.0 * 1.4826; // (x − 0)/1.4826 = z ≈ 4.0
289        let make = |n: usize| {
290            let mut xs = Vec::new();
291            for _ in 0..n {
292                xs.extend_from_slice(&[-1.0, 0.0, 1.0]);
293            }
294            xs.push(outlier);
295            xs
296        };
297        let small = run_cfg(&make(7), &fdr_cfg(0.05)); // m = 22:  (1/22)·.05 ≈ 2.3e-3 ≥ p
298        let large = run_cfg(&make(700), &fdr_cfg(0.05)); // m = 2101: (1/2101)·.05 ≈ 2.4e-5 < p
299        assert_eq!(small.findings.len(), 1, "rare in a small column ⇒ flagged");
300        assert_eq!(
301            large.findings.len(),
302            0,
303            "the same cell among 2101 tests ⇒ not significant after correction"
304        );
305    }
306
307    #[test]
308    fn fdr_uses_deviation_from_center_not_sum() {
309        // Median 100, a tight base around it, and one real outlier at 200. The
310        // standardized deviation is (x − center)/scale: only the 200 is extreme.
311        // Were it (x + center) instead, every base cell would look ~130 σ out
312        // (98 + 100 ≈ 198) and get flagged — so this pins the subtraction sign.
313        let mut xs: Vec<f64> = Vec::new();
314        for _ in 0..20 {
315            xs.extend_from_slice(&[98.0, 99.0, 100.0, 101.0, 102.0]);
316        }
317        xs.push(200.0);
318        let r = run_cfg(&xs, &fdr_cfg(0.05));
319        assert_eq!(r.findings.len(), 1, "only the 200 outlier is significant");
320        assert!(matches!(
321            r.findings[0].handle,
322            Handle::Cell { row: 100, .. }
323        ));
324    }
325
326    fn cfg_no_roles() -> DetectConfig {
327        DetectConfig {
328            column_roles: false,
329            ..DetectConfig::default()
330        }
331    }
332
333    #[test]
334    fn identifier_named_column_is_skipped_by_role() {
335        // An id-named numeric column with a blatant "outlier": skipped when roles
336        // are on (a big PID is not an anomaly), scanned when roles are off.
337        let mut xs = vec![100.0; 30];
338        xs.push(999_999.0);
339        let id_col = col("_PID", &xs);
340        let rs = ax_core::RecordSet::new("-", "t", vec![id_col]);
341
342        let mut on = Report::new();
343        PointDetector.detect(&ScanContext::single(&rs), &DetectConfig::default(), &mut on);
344        assert!(
345            on.findings.is_empty(),
346            "identifier column must be role-skipped"
347        );
348        // It WAS the only numeric column and it was skipped → honest absence.
349        assert_eq!(on.absent.len(), 1);
350
351        let mut off = Report::new();
352        PointDetector.detect(&ScanContext::single(&rs), &cfg_no_roles(), &mut off);
353        assert_eq!(
354            off.findings.len(),
355            1,
356            "--no-column-roles scans it as before"
357        );
358    }
359
360    #[test]
361    fn measurement_column_alongside_identifier_still_scanned() {
362        // A measurement column is assessed even when an identifier column sits
363        // next to it; only the identifier is skipped.
364        let mut m = vec![10.0; 30];
365        m.push(1000.0);
366        let rs = ax_core::RecordSet::new("-", "t", vec![col("fare", &m), col("user_id", &m)]);
367        let mut out = Report::new();
368        PointDetector.detect(
369            &ScanContext::single(&rs),
370            &DetectConfig::default(),
371            &mut out,
372        );
373        assert_eq!(
374            out.findings.len(),
375            1,
376            "only the measurement column's outlier"
377        );
378        match &out.findings[0].handle {
379            Handle::Cell { column, .. } => assert_eq!(column, "fare"),
380            _ => unreachable!(),
381        }
382        assert!(out.absent.is_empty(), "a measurement column WAS assessed");
383    }
384
385    #[test]
386    fn fdr_off_matches_the_threshold_path_exactly() {
387        // With point_fdr_q = None the FDR machinery is inert: identical findings.
388        let xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1000.0];
389        let off = run_cfg(&xs, &DetectConfig::default());
390        assert_eq!(off.findings.len(), 1);
391        assert!(off.findings[0].reason.contains("exceeds"));
392    }
393
394    #[test]
395    fn constant_column_has_no_findings() {
396        let report = run(&[7.0; 20]);
397        assert!(report.is_clean());
398        // It ran (numeric, enough values), so it is NOT marked absent.
399        assert!(report.absent.is_empty());
400    }
401
402    #[test]
403    fn non_numeric_corpus_marks_absent() {
404        let rs = ax_core::RecordSet::new(
405            "-",
406            "test",
407            vec![Column::new(
408                "name",
409                (0..20).map(|i| Value::Str(format!("u{i}"))).collect(),
410            )],
411        );
412        let mut out = Report::new();
413        PointDetector.detect(
414            &ScanContext::single(&rs),
415            &DetectConfig::default(),
416            &mut out,
417        );
418        assert!(out.is_clean());
419        assert_eq!(out.absent.len(), 1);
420        assert_eq!(out.absent[0].detector, "point.modz");
421    }
422
423    #[test]
424    fn too_few_values_marks_absent() {
425        let report = run(&[1.0, 2.0, 100.0]); // below default min_n = 8
426        assert!(report.is_clean());
427        assert_eq!(report.absent.len(), 1);
428    }
429
430    #[test]
431    fn exactly_min_n_values_is_assessed() {
432        // A column with exactly point_min_n (=8) finite values must be scanned,
433        // not skipped. Catches `len < min_n` → `len <= min_n`.
434        let report = run(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]);
435        assert_eq!(
436            report.findings.len(),
437            1,
438            "the 100.0 outlier must be flagged"
439        );
440        assert!(report.absent.is_empty(), "8 values is enough to assess");
441    }
442
443    #[test]
444    fn robust_path_catches_what_sigma_path_misses() {
445        // With MAD, the lone 1000 is wildly anomalous. If the detector were
446        // forced down the mean/σ fallback, σ is so inflated by that same point
447        // that its z-score (~2.7) falls under threshold and nothing is flagged.
448        // So this asserts the robust (MAD) branch is actually taken.
449        let report = run(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1000.0]);
450        assert_eq!(report.findings.len(), 1);
451        assert!(matches!(
452            report.findings[0].handle,
453            Handle::Cell { row: 8, .. }
454        ));
455        assert!(
456            report.findings[0].score > 100.0,
457            "MAD-scaled score is large"
458        );
459    }
460
461    proptest! {
462        // A clean Gaussian-ish base with one injected spike: the flagged set is
463        // invariant to shifting all values by a constant.
464        #[test]
465        fn shift_invariant(shift in -1e6f64..1e6, base in 1.0f64..5.0) {
466            let mut xs: Vec<f64> = (0..40).map(|i| base + (i % 5) as f64 * 0.01).collect();
467            xs.push(base + 500.0); // outlier
468            let original = flagged_values(&xs);
469            let shifted: Vec<f64> = xs.iter().map(|x| x + shift).collect();
470            // compare by index, not bits (values differ): recompute against shifted
471            let report = run(&shifted);
472            let mut rows: Vec<usize> = report.findings.iter().map(|f| match &f.handle {
473                Handle::Cell { row, .. } => *row,
474                _ => unreachable!(),
475            }).collect();
476            rows.sort_unstable();
477            let mut orig_rows: Vec<usize> = run(&xs).findings.iter().map(|f| match &f.handle {
478                Handle::Cell { row, .. } => *row,
479                _ => unreachable!(),
480            }).collect();
481            orig_rows.sort_unstable();
482            prop_assert_eq!(rows, orig_rows);
483            let _ = original;
484        }
485
486        // Scaling by a positive constant does not change which rows are flagged
487        // (modified z-score is scale-invariant).
488        #[test]
489        fn scale_invariant(scale in 0.001f64..1000.0) {
490            let mut xs: Vec<f64> = (0..40).map(|i| 100.0 + (i % 7) as f64).collect();
491            xs.push(100_000.0); // outlier
492            let base_rows = flagged_rows(&xs);
493            let scaled: Vec<f64> = xs.iter().map(|x| x * scale).collect();
494            prop_assert_eq!(flagged_rows(&scaled), base_rows);
495        }
496
497        // Running the same input twice yields byte-identical findings.
498        #[test]
499        fn deterministic(seed in 0u64..1000) {
500            let xs: Vec<f64> = (0..50).map(|i| ((i as u64).wrapping_mul(seed) % 97) as f64).collect();
501            let a = run(&xs);
502            let b = run(&xs);
503            prop_assert_eq!(
504                serde_json::to_string(&a.findings).unwrap(),
505                serde_json::to_string(&b.findings).unwrap()
506            );
507        }
508
509        // Row order does not change the multiset of flagged values.
510        #[test]
511        fn permutation_invariant_values(rot in 1usize..39) {
512            let mut xs: Vec<f64> = (0..40).map(|i| 50.0 + (i % 3) as f64).collect();
513            xs.push(9999.0);
514            let base = flagged_values(&xs);
515            xs.rotate_left(rot);
516            prop_assert_eq!(flagged_values(&xs), base);
517        }
518    }
519
520    fn flagged_rows(xs: &[f64]) -> Vec<usize> {
521        let mut rows: Vec<usize> = run(xs)
522            .findings
523            .iter()
524            .map(|f| match &f.handle {
525                Handle::Cell { row, .. } => *row,
526                _ => unreachable!(),
527            })
528            .collect();
529        rows.sort_unstable();
530        rows
531    }
532}