anomstream_core/forensic.rs
1//! Imputation-like forensic baseline — answer "what would this
2//! dim have looked like if the point were normal?" by aggregating
3//! the per-dim distribution of the forest's currently-held sample
4//! points.
5//!
6//! Inspired by AWS's `ImputeVisitor` but repurposed: instead of
7//! imputing a `NaN` feature, this helper tells an SOC analyst how
8//! far an observed point sits from the forest's current idea of
9//! "normal" on every dimension — the *expected value under
10//! normality* plus a z-score-style delta.
11//!
12//! # Semantics
13//!
14//! - `expected[d]` — mean of dim `d` across every point currently
15//! held in any tree's reservoir (the forest's live baseline).
16//! - `stddev[d]` — population standard deviation of the same set.
17//! - `observed[d]` — the caller's raw query value.
18//! - `delta[d] = observed[d] − expected[d]`.
19//! - `zscore[d] = delta[d] / stddev[d]` (clamped to `0` when the
20//! baseline stddev is zero on a dim — constant baseline means
21//! no meaningful z-score).
22//! - `live_points` — number of unique points contributing to the
23//! baseline.
24//!
25//! The baseline is computed in raw-point space: `feature_scales`
26//! is applied to the stored points for averaging then inverted so
27//! `expected` / `stddev` / `delta` live in the caller's original
28//! coordinate system. SOC dashboards don't need to know about the
29//! internal scaling.
30
/// Per-dimension forensic report comparing one observed point with
/// the distribution of the points the forest currently holds.
///
/// All array fields are indexed by dimension and expressed in the
/// caller's original (raw) coordinate system.
///
/// Serialisable under the `serde` feature through the crate's
/// `fixed_array_f64` adapter — callers that persist alert records
/// for NIS2 / SOC2 audit trails can embed this struct directly.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ForensicBaseline<const D: usize> {
    /// Raw query point the baseline was computed against.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub observed: [f64; D],
    /// Per-dim mean of the live reservoir points (in raw space).
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub expected: [f64; D],
    /// Per-dim population stddev of the live reservoir points.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub stddev: [f64; D],
    /// `observed − expected` per dim.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub delta: [f64; D],
    /// Per-dim z-score: `delta / stddev`, `0` when stddev is zero.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub zscore: [f64; D],
    /// Number of unique live points contributing to the baseline.
    pub live_points: usize,
}

impl<const D: usize> ForensicBaseline<D> {
    /// Index of the dimension with the largest `|zscore|` — the dim
    /// most out-of-family relative to the live baseline.
    ///
    /// Returns `None` when:
    ///
    /// - the baseline is empty (`live_points == 0`),
    /// - there are no dimensions (`D == 0`), or
    /// - no dimension has a non-zero, non-NaN z-score.
    ///
    /// Ties resolve to the lowest dimension index. NaN z-scores are
    /// never selected, whichever dimension they occur on, so a NaN
    /// on dim `0` cannot hide a genuine outlier on a later dim.
    #[must_use]
    pub fn argmax_abs_zscore(&self) -> Option<usize> {
        if self.live_points == 0 {
            return None;
        }

        // `(dimension, |zscore|)` of the strongest candidate seen so far.
        let mut best: Option<(usize, f64)> = None;
        for (dim, z) in self.zscore.iter().enumerate() {
            let magnitude = z.abs();
            // Before any candidate exists the bar is 0.0, so exact
            // zeros are rejected. Every comparison against NaN is
            // false, so NaN magnitudes are rejected at any index
            // instead of poisoning the running maximum.
            let bar = best.map_or(0.0, |(_, current)| current);
            if magnitude > bar {
                best = Some((dim, magnitude));
            }
        }
        best.map(|(dim, _)| dim)
    }
}
81
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    /// Builds a baseline with all-zero `observed` / `expected`, the
    /// given `stddev` and `zscore`, and `delta` mirroring `zscore`.
    fn fixture<const D: usize>(
        stddev: [f64; D],
        zscore: [f64; D],
        live_points: usize,
    ) -> ForensicBaseline<D> {
        ForensicBaseline {
            observed: [0.0; D],
            expected: [0.0; D],
            stddev,
            delta: zscore,
            zscore,
            live_points,
        }
    }

    /// The dim holding the largest absolute z-score wins, sign ignored.
    #[test]
    fn argmax_abs_zscore_picks_biggest() {
        let baseline = fixture([1.0; 4], [0.1, -2.0, 0.5, 1.0], 16);
        assert_eq!(baseline.argmax_abs_zscore(), Some(1));
    }

    /// A baseline with zero live points reports no dimension.
    #[test]
    fn argmax_abs_zscore_empty_when_no_live_points() {
        let baseline = fixture([0.0; 2], [0.0; 2], 0);
        assert_eq!(baseline.argmax_abs_zscore(), None);
    }

    /// All-zero z-scores report no dimension even with live points.
    #[test]
    fn argmax_abs_zscore_empty_when_all_zero() {
        let baseline = fixture([1.0; 2], [0.0; 2], 4);
        assert_eq!(baseline.argmax_abs_zscore(), None);
    }
}
125}