Skip to main content

anomstream_core/
forensic.rs

1//! Imputation-like forensic baseline — answer "what would this
2//! dim have looked like if the point were normal?" by aggregating
3//! the per-dim distribution of the forest's currently-held sample
4//! points.
5//!
6//! Inspired by AWS's `ImputeVisitor` but repurposed: instead of
7//! imputing a `NaN` feature, this helper tells an SOC analyst how
8//! far an observed point sits from the forest's current idea of
9//! "normal" on every dimension — the *expected value under
10//! normality* plus a z-score-style delta.
11//!
12//! # Semantics
13//!
14//! - `expected[d]` — mean of dim `d` across every point currently
15//!   held in any tree's reservoir (the forest's live baseline).
16//! - `stddev[d]` — population standard deviation of the same set.
17//! - `observed[d]` — the caller's raw query value.
18//! - `delta[d] = observed[d] − expected[d]`.
19//! - `zscore[d] = delta[d] / stddev[d]` (clamped to `0` when the
20//!   baseline stddev is zero on a dim — constant baseline means
21//!   no meaningful z-score).
22//! - `live_points` — number of unique points contributing to the
23//!   baseline.
24//!
25//! The baseline is computed in raw-point space: `feature_scales`
26//! is applied to the stored points for averaging then inverted so
27//! `expected` / `stddev` / `delta` live in the caller's original
28//! coordinate system. SOC dashboards don't need to know about the
29//! internal scaling.
30
/// Side-by-side comparison of one observed point with the forest's
/// live sample distribution, reported independently for each of the
/// `D` dimensions.
///
/// All array fields are indexed by dimension. With the `serde`
/// feature enabled the arrays are (de)serialised through the crate's
/// `fixed_array_f64` adapter, so the struct can be embedded as-is in
/// persisted alert records (e.g. NIS2 / SOC2 audit trails).
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ForensicBaseline<const D: usize> {
    /// The caller's raw query point that the baseline is compared to.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub observed: [f64; D],
    /// Mean of each dim over the live reservoir points, in raw space.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub expected: [f64; D],
    /// Population standard deviation of each dim over the same points.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub stddev: [f64; D],
    /// Difference `observed − expected` for each dim.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub delta: [f64; D],
    /// Z-score for each dim, `delta / stddev`; `0` where stddev is zero.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_util::fixed_array_f64"))]
    pub zscore: [f64; D],
    /// How many unique live points went into the baseline.
    pub live_points: usize,
}
58
59impl<const D: usize> ForensicBaseline<D> {
60    /// Index of the dimension with the largest `|zscore|` — the dim
61    /// most out-of-family relative to the live baseline. Returns
62    /// `None` on an empty forest (no live points) or when every
63    /// z-score is exactly zero.
64    #[must_use]
65    pub fn argmax_abs_zscore(&self) -> Option<usize> {
66        if D == 0 || self.live_points == 0 {
67            return None;
68        }
69        let mut best: usize = 0;
70        let mut best_val = self.zscore[0].abs();
71        for d in 1..D {
72            let v = self.zscore[d].abs();
73            if v > best_val {
74                best = d;
75                best_val = v;
76            }
77        }
78        if best_val == 0.0 { None } else { Some(best) }
79    }
80}
81
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    /// Builds a baseline centred on the origin: `observed` and
    /// `expected` are all zero, every dim shares the same `stddev`,
    /// and `delta` mirrors `zscore`.
    fn origin_baseline<const D: usize>(
        stddev: f64,
        zscore: [f64; D],
        live_points: usize,
    ) -> ForensicBaseline<D> {
        ForensicBaseline {
            observed: [0.0; D],
            expected: [0.0; D],
            stddev: [stddev; D],
            delta: zscore,
            zscore,
            live_points,
        }
    }

    /// The dim with the largest absolute z-score wins, sign ignored.
    #[test]
    fn argmax_abs_zscore_picks_biggest() {
        let baseline = origin_baseline(1.0, [0.1, -2.0, 0.5, 1.0], 16);
        assert_eq!(baseline.argmax_abs_zscore(), Some(1));
    }

    /// An empty forest has no meaningful answer.
    #[test]
    fn argmax_abs_zscore_empty_when_no_live_points() {
        let baseline = origin_baseline(0.0, [0.0, 0.0], 0);
        assert_eq!(baseline.argmax_abs_zscore(), None);
    }

    /// All-zero z-scores mean no dim stands out.
    #[test]
    fn argmax_abs_zscore_empty_when_all_zero() {
        let baseline = origin_baseline(1.0, [0.0, 0.0], 4);
        assert_eq!(baseline.argmax_abs_zscore(), None);
    }
}