1use crate::config::DetectConfig;
15use crate::{calibrate, fdr, robustz, Detector, Report, ScanContext};
16use ax_core::finding::Handle;
17use ax_core::{AnomalyClass, Column, Finding, Value};
18
19#[derive(Debug, Default, Clone)]
20pub struct PointDetector;
21
22impl Detector for PointDetector {
23 fn id(&self) -> &'static str {
24 "point.modz"
25 }
26
27 fn class(&self) -> AnomalyClass {
28 AnomalyClass::Point
29 }
30
31 fn detect(&self, ctx: &ScanContext, cfg: &DetectConfig, out: &mut Report) {
32 let mut eligible = 0usize; let mut scanned = 0usize; for col in &ctx.current.columns {
35 if !col.ty.is_numeric() {
36 continue;
37 }
38 let xs = col.numeric();
39 if xs.len() < cfg.point_min_n {
40 continue;
41 }
42 eligible += 1;
43 if cfg.column_roles && col.role().skips_value_detection() {
49 continue;
50 }
51 scanned += 1;
52 self.scan_column(col, &xs, cfg, out);
53 }
54 if eligible == 0 {
58 out.mark_absent(
59 self.id(),
60 format!(
61 "no numeric column with at least {} finite values",
62 cfg.point_min_n
63 ),
64 );
65 } else if scanned == 0 {
66 out.mark_absent(
67 self.id(),
68 "every numeric column was an identifier, category, or sequence \
69 (no measurement column to assess; see `roles`)"
70 .to_string(),
71 );
72 }
73 }
74}
75
76impl PointDetector {
77 fn scan_column(&self, col: &Column, xs: &[f64], cfg: &DetectConfig, out: &mut Report) {
78 let Some((center, scale, k)) = robustz::center_scale(xs) else {
80 return;
81 };
82
83 match cfg.point_fdr_q {
84 Some(q) => self.scan_column_fdr(col, center, scale, q, out),
85 None => self.scan_column_threshold(col, center, scale, k, cfg, out),
86 }
87 }
88
89 fn scan_column_threshold(
92 &self,
93 col: &Column,
94 center: f64,
95 scale: f64,
96 k: f64,
97 cfg: &DetectConfig,
98 out: &mut Report,
99 ) {
100 for (row, cell) in col.cells.iter().enumerate() {
102 let Some(x) = numeric_cell(cell) else {
103 continue;
104 };
105 let modz = robustz::score(x, center, scale, k);
106 if modz <= cfg.point_threshold {
107 continue;
108 }
109 let reason = format!(
110 "{} = {:.6}: modified z-score {:.3} exceeds {:.3} (center={:.6}, scale={:.6})",
111 col.name, x, modz, cfg.point_threshold, center, scale
112 );
113 self.emit(
114 col,
115 row,
116 modz,
117 calibrate::from_exceedance(modz, cfg.point_threshold),
118 reason,
119 out,
120 );
121 }
122 }
123
124 fn scan_column_fdr(&self, col: &Column, center: f64, scale: f64, q: f64, out: &mut Report) {
129 let mut cand: Vec<(usize, f64, f64, f64)> = Vec::new();
137 for (row, cell) in col.cells.iter().enumerate() {
138 let Some(x) = numeric_cell(cell) else {
139 continue;
140 };
141 let z = ((x - center) / scale).abs();
142 cand.push((row, x, z, fdr::two_sided_p(z)));
143 }
144
145 let pvals: Vec<f64> = cand.iter().map(|c| c.3).collect();
146 let Some(cutoff) = fdr::benjamini_hochberg(&pvals, q) else {
147 return; };
149
150 for (row, x, z, p) in cand {
152 if p > cutoff {
153 continue;
154 }
155 let reason = format!(
156 "{} = {:.6}: standardized deviation z={:.3}, p={:.3e} ≤ BH cutoff \
157 {:.3e} at FDR q={:.4} (center={:.6}, scale={:.6})",
158 col.name, x, z, p, cutoff, q, center, scale
159 );
160 self.emit(
161 col,
162 row,
163 z,
164 calibrate::from_undercut(p, cutoff),
165 reason,
166 out,
167 );
168 }
169 }
170
171 fn emit(
174 &self,
175 col: &Column,
176 row: usize,
177 score: f64,
178 confidence: f64,
179 reason: String,
180 out: &mut Report,
181 ) {
182 out.push(
183 Finding::new(
184 self.id(),
185 AnomalyClass::Point,
186 Handle::Cell {
187 column: col.name.clone(),
188 row,
189 },
190 confidence,
191 score,
192 reason,
193 )
194 .with_col_type(col.ty),
195 );
196 }
197}
198
199fn numeric_cell(v: &Value) -> Option<f64> {
202 v.as_f64().filter(|x| x.is_finite())
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208 use proptest::prelude::*;
209
210 fn col(name: &str, xs: &[f64]) -> Column {
211 Column::new(name, xs.iter().map(|&x| Value::Float(x)).collect())
212 }
213
214 fn run(xs: &[f64]) -> Report {
215 let rs = ax_core::RecordSet::new("-", "test", vec![col("x", xs)]);
216 let mut out = Report::new();
217 PointDetector.detect(
218 &ScanContext::single(&rs),
219 &DetectConfig::default(),
220 &mut out,
221 );
222 out
223 }
224
225 fn flagged_values(xs: &[f64]) -> Vec<u64> {
227 let report = run(xs);
228 let mut v: Vec<u64> = report
229 .findings
230 .iter()
231 .map(|f| match &f.handle {
232 Handle::Cell { row, .. } => xs[*row].to_bits(),
233 _ => unreachable!("point detector emits cell handles"),
234 })
235 .collect();
236 v.sort_unstable();
237 v
238 }
239
240 #[test]
241 fn obvious_outlier_is_flagged() {
242 let mut xs = vec![10.0; 30];
243 xs.push(1000.0);
244 let report = run(&xs);
245 assert_eq!(report.findings.len(), 1);
246 assert!(matches!(
247 report.findings[0].handle,
248 Handle::Cell { row: 30, .. }
249 ));
250 assert!(report.findings[0].confidence > 0.5);
251 }
252
253 fn run_cfg(xs: &[f64], cfg: &DetectConfig) -> Report {
254 let rs = ax_core::RecordSet::new("-", "test", vec![col("x", xs)]);
255 let mut out = Report::new();
256 PointDetector.detect(&ScanContext::single(&rs), cfg, &mut out);
257 out
258 }
259
260 fn fdr_cfg(q: f64) -> DetectConfig {
261 DetectConfig {
262 point_fdr_q: Some(q),
263 ..DetectConfig::default()
264 }
265 }
266
267 #[test]
268 fn fdr_flags_the_clear_outlier() {
269 let mut xs = vec![10.0; 30];
272 xs.push(1000.0);
273 let r = run_cfg(&xs, &fdr_cfg(0.05));
274 assert_eq!(r.findings.len(), 1);
275 assert!(matches!(r.findings[0].handle, Handle::Cell { row: 30, .. }));
276 assert!(r.findings[0].reason.contains("FDR q="));
278 }
279
280 #[test]
281 fn fdr_adapts_to_the_number_of_tests() {
282 let outlier = 4.0 * 1.4826; let make = |n: usize| {
290 let mut xs = Vec::new();
291 for _ in 0..n {
292 xs.extend_from_slice(&[-1.0, 0.0, 1.0]);
293 }
294 xs.push(outlier);
295 xs
296 };
297 let small = run_cfg(&make(7), &fdr_cfg(0.05)); let large = run_cfg(&make(700), &fdr_cfg(0.05)); assert_eq!(small.findings.len(), 1, "rare in a small column ⇒ flagged");
300 assert_eq!(
301 large.findings.len(),
302 0,
303 "the same cell among 2101 tests ⇒ not significant after correction"
304 );
305 }
306
307 #[test]
308 fn fdr_uses_deviation_from_center_not_sum() {
309 let mut xs: Vec<f64> = Vec::new();
314 for _ in 0..20 {
315 xs.extend_from_slice(&[98.0, 99.0, 100.0, 101.0, 102.0]);
316 }
317 xs.push(200.0);
318 let r = run_cfg(&xs, &fdr_cfg(0.05));
319 assert_eq!(r.findings.len(), 1, "only the 200 outlier is significant");
320 assert!(matches!(
321 r.findings[0].handle,
322 Handle::Cell { row: 100, .. }
323 ));
324 }
325
326 fn cfg_no_roles() -> DetectConfig {
327 DetectConfig {
328 column_roles: false,
329 ..DetectConfig::default()
330 }
331 }
332
333 #[test]
334 fn identifier_named_column_is_skipped_by_role() {
335 let mut xs = vec![100.0; 30];
338 xs.push(999_999.0);
339 let id_col = col("_PID", &xs);
340 let rs = ax_core::RecordSet::new("-", "t", vec![id_col]);
341
342 let mut on = Report::new();
343 PointDetector.detect(&ScanContext::single(&rs), &DetectConfig::default(), &mut on);
344 assert!(
345 on.findings.is_empty(),
346 "identifier column must be role-skipped"
347 );
348 assert_eq!(on.absent.len(), 1);
350
351 let mut off = Report::new();
352 PointDetector.detect(&ScanContext::single(&rs), &cfg_no_roles(), &mut off);
353 assert_eq!(
354 off.findings.len(),
355 1,
356 "--no-column-roles scans it as before"
357 );
358 }
359
360 #[test]
361 fn measurement_column_alongside_identifier_still_scanned() {
362 let mut m = vec![10.0; 30];
365 m.push(1000.0);
366 let rs = ax_core::RecordSet::new("-", "t", vec![col("fare", &m), col("user_id", &m)]);
367 let mut out = Report::new();
368 PointDetector.detect(
369 &ScanContext::single(&rs),
370 &DetectConfig::default(),
371 &mut out,
372 );
373 assert_eq!(
374 out.findings.len(),
375 1,
376 "only the measurement column's outlier"
377 );
378 match &out.findings[0].handle {
379 Handle::Cell { column, .. } => assert_eq!(column, "fare"),
380 _ => unreachable!(),
381 }
382 assert!(out.absent.is_empty(), "a measurement column WAS assessed");
383 }
384
385 #[test]
386 fn fdr_off_matches_the_threshold_path_exactly() {
387 let xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1000.0];
389 let off = run_cfg(&xs, &DetectConfig::default());
390 assert_eq!(off.findings.len(), 1);
391 assert!(off.findings[0].reason.contains("exceeds"));
392 }
393
394 #[test]
395 fn constant_column_has_no_findings() {
396 let report = run(&[7.0; 20]);
397 assert!(report.is_clean());
398 assert!(report.absent.is_empty());
400 }
401
402 #[test]
403 fn non_numeric_corpus_marks_absent() {
404 let rs = ax_core::RecordSet::new(
405 "-",
406 "test",
407 vec![Column::new(
408 "name",
409 (0..20).map(|i| Value::Str(format!("u{i}"))).collect(),
410 )],
411 );
412 let mut out = Report::new();
413 PointDetector.detect(
414 &ScanContext::single(&rs),
415 &DetectConfig::default(),
416 &mut out,
417 );
418 assert!(out.is_clean());
419 assert_eq!(out.absent.len(), 1);
420 assert_eq!(out.absent[0].detector, "point.modz");
421 }
422
423 #[test]
424 fn too_few_values_marks_absent() {
425 let report = run(&[1.0, 2.0, 100.0]); assert!(report.is_clean());
427 assert_eq!(report.absent.len(), 1);
428 }
429
430 #[test]
431 fn exactly_min_n_values_is_assessed() {
432 let report = run(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]);
435 assert_eq!(
436 report.findings.len(),
437 1,
438 "the 100.0 outlier must be flagged"
439 );
440 assert!(report.absent.is_empty(), "8 values is enough to assess");
441 }
442
443 #[test]
444 fn robust_path_catches_what_sigma_path_misses() {
445 let report = run(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1000.0]);
450 assert_eq!(report.findings.len(), 1);
451 assert!(matches!(
452 report.findings[0].handle,
453 Handle::Cell { row: 8, .. }
454 ));
455 assert!(
456 report.findings[0].score > 100.0,
457 "MAD-scaled score is large"
458 );
459 }
460
461 proptest! {
462 #[test]
465 fn shift_invariant(shift in -1e6f64..1e6, base in 1.0f64..5.0) {
466 let mut xs: Vec<f64> = (0..40).map(|i| base + (i % 5) as f64 * 0.01).collect();
467 xs.push(base + 500.0); let original = flagged_values(&xs);
469 let shifted: Vec<f64> = xs.iter().map(|x| x + shift).collect();
470 let report = run(&shifted);
472 let mut rows: Vec<usize> = report.findings.iter().map(|f| match &f.handle {
473 Handle::Cell { row, .. } => *row,
474 _ => unreachable!(),
475 }).collect();
476 rows.sort_unstable();
477 let mut orig_rows: Vec<usize> = run(&xs).findings.iter().map(|f| match &f.handle {
478 Handle::Cell { row, .. } => *row,
479 _ => unreachable!(),
480 }).collect();
481 orig_rows.sort_unstable();
482 prop_assert_eq!(rows, orig_rows);
483 let _ = original;
484 }
485
486 #[test]
489 fn scale_invariant(scale in 0.001f64..1000.0) {
490 let mut xs: Vec<f64> = (0..40).map(|i| 100.0 + (i % 7) as f64).collect();
491 xs.push(100_000.0); let base_rows = flagged_rows(&xs);
493 let scaled: Vec<f64> = xs.iter().map(|x| x * scale).collect();
494 prop_assert_eq!(flagged_rows(&scaled), base_rows);
495 }
496
497 #[test]
499 fn deterministic(seed in 0u64..1000) {
500 let xs: Vec<f64> = (0..50).map(|i| ((i as u64).wrapping_mul(seed) % 97) as f64).collect();
501 let a = run(&xs);
502 let b = run(&xs);
503 prop_assert_eq!(
504 serde_json::to_string(&a.findings).unwrap(),
505 serde_json::to_string(&b.findings).unwrap()
506 );
507 }
508
509 #[test]
511 fn permutation_invariant_values(rot in 1usize..39) {
512 let mut xs: Vec<f64> = (0..40).map(|i| 50.0 + (i % 3) as f64).collect();
513 xs.push(9999.0);
514 let base = flagged_values(&xs);
515 xs.rotate_left(rot);
516 prop_assert_eq!(flagged_values(&xs), base);
517 }
518 }
519
520 fn flagged_rows(xs: &[f64]) -> Vec<usize> {
521 let mut rows: Vec<usize> = run(xs)
522 .findings
523 .iter()
524 .map(|f| match &f.handle {
525 Handle::Cell { row, .. } => *row,
526 _ => unreachable!(),
527 })
528 .collect();
529 rows.sort_unstable();
530 rows
531 }
532}