Skip to main content

anomstream_core/
early_term.rs

1//! Early-termination scoring — stop traversing trees once the
2//! running per-tree mean has converged tightly enough to be
3//! actionable.
4//!
5//! The classic [`crate::RandomCutForest::score`] always walks every
6//! tree, averaging the result. On most traffic — where the point
7//! sits cleanly inside or outside the baseline — the first ~20 of
8//! 100 trees already agree so closely that the remaining 80
9//! traversals only refine the last digit of the score. Stopping
10//! early cuts inline detection latency by 30-50 % on "obvious"
11//! cases without losing alerting signal.
12//!
13//! [`crate::RandomCutForest::score_early_term`] walks trees sequentially,
14//! maintains a Welford (1962) running mean / variance, and breaks
15//! when the standard error of the mean falls below
16//! [`EarlyTermConfig::confidence_threshold`] times the absolute
17//! mean. The returned [`EarlyTermScore`] reports how many trees
18//! were actually evaluated so callers can meter latency savings.
19//!
20//! The parallel [`crate::RandomCutForest::score`] path is unchanged — use
21//! it when you do not care about tail latency and want the full
22//! ensemble answer.
23
24use alloc::format;
25
26use crate::error::{RcfError, RcfResult};
27
/// Stock value for [`EarlyTermConfig::min_trees`]: how many trees are
/// scored before the early-termination check is consulted at all.
/// Chosen so the running stderr estimate is stable enough to trust.
pub const DEFAULT_MIN_TREES: usize = 16;

/// Stock value for [`EarlyTermConfig::confidence_threshold`]: once the
/// running `stderr / |mean|` ratio is under `0.05` (5 %) the estimate
/// is considered narrow enough to stop.
pub const DEFAULT_CONFIDENCE_THRESHOLD: f64 = 0.05;
36
/// Tuning knobs for early-termination scoring.
///
/// Both fields are public, so a hand-built value is only checked once
/// [`EarlyTermConfig::validate`] is called on it.
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct EarlyTermConfig {
    /// How many trees must be scored before the convergence test is
    /// attempted; with too few samples the stderr estimate is
    /// degenerate. Must be non-zero to pass validation.
    pub min_trees: usize,
    /// Relative standard-error cut-off: traversal stops as soon as
    /// `stderr / max(|mean|, ε)` falls below this value. Must lie in
    /// `(0, 1]` to pass validation.
    pub confidence_threshold: f64,
}
49
50impl Default for EarlyTermConfig {
51    fn default() -> Self {
52        Self {
53            min_trees: DEFAULT_MIN_TREES,
54            confidence_threshold: DEFAULT_CONFIDENCE_THRESHOLD,
55        }
56    }
57}
58
59impl EarlyTermConfig {
60    /// Validate every field.
61    ///
62    /// # Errors
63    ///
64    /// Returns [`RcfError::InvalidConfig`] when `min_trees == 0`,
65    /// when `confidence_threshold` is non-finite, or when it is
66    /// outside `(0, 1]`.
67    pub fn validate(&self) -> RcfResult<()> {
68        if self.min_trees == 0 {
69            return Err(RcfError::InvalidConfig(
70                "EarlyTermConfig::min_trees must be > 0".into(),
71            ));
72        }
73        if !self.confidence_threshold.is_finite()
74            || self.confidence_threshold <= 0.0
75            || self.confidence_threshold > 1.0
76        {
77            return Err(RcfError::InvalidConfig(
78                format!(
79                    "EarlyTermConfig::confidence_threshold must be in (0, 1], got {}",
80                    self.confidence_threshold
81                )
82                .into(),
83            ));
84        }
85        Ok(())
86    }
87}
88
89/// Outcome of a [`crate::RandomCutForest::score_early_term`] call.
90#[derive(Debug, Clone, Copy, PartialEq)]
91#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
92pub struct EarlyTermScore {
93    /// Final scalar anomaly score — running mean at break time,
94    /// identical in shape to the full-ensemble score.
95    pub score: crate::domain::AnomalyScore,
96    /// Number of trees that actually contributed before the
97    /// detector broke out of the loop.
98    pub trees_evaluated: usize,
99    /// Total trees available in the forest — use with
100    /// `trees_evaluated` to compute the latency savings.
101    pub trees_available: usize,
102    /// Standard error of the per-tree score mean at break time
103    /// (`sqrt(var / n)`). Useful for caller-side confidence
104    /// diagnostics.
105    pub stderr: f64,
106    /// `true` when the loop exited before every tree was walked,
107    /// `false` when the full ensemble was traversed (low
108    /// confidence, too few leaves in the forest, etc.).
109    pub early_stopped: bool,
110}