Skip to main content

anomstream_core/
lib.rs

1//! `anomstream-core` — core detectors + streaming primitives + cross-cut contracts.
2//!
3//! This crate is the math-first floor of the
4//! [`anomstream`](https://crates.io/crates/anomstream) workspace.
5//! It implements the Random Cut Forest (RCF) algorithm from Guha et al.
6//! (ICML 2016) conformant with the
7//! [AWS SageMaker RCF specification][aws-rcf]: reservoir sampling without
8//! replacement (Park et al. 2004), random cuts weighted by per-dimension
9//! range, anomaly score averaged across trees, and hyperparameter bounds
10//! matching the AWS reference (`feature_dim`, `num_trees`,
11//! `num_samples_per_tree`).
12//!
13//! Beyond the core forest, the crate ships a set of **companion
14//! primitives** — per-feature drift detectors, normalisers, streaming
15//! stats, frequency sketches — reused across detection pipelines so
16//! callers can compose `RandomCutForest` + `PerFeatureEwma` +
17//! `PerFeatureCusum` + `FeatureDriftDetector` + `Normalizer` + …
18//! without reimplementing the underlying math.
19//!
20//! # Workspace charter
21//!
22//! `anomstream-core` = *detectors + streaming primitives + cross-cut
23//! contracts*. Three categories of cross-cutting contracts live here
24//! (not in a sibling crate) because every downstream layer depends
25//! on them:
26//!
27//! - [`metrics::MetricsSink`] — telemetry trait consumed by every
28//!   detector, the hot-path sampler, and the triage pipeline.
29//! - [`severity::Severity`] + [`severity::SeverityBands`] —
30//!   classification vocabulary used by both the bare forest
31//!   ([`domain::AnomalyScore::severity`]) and the triage layer's
32//!   `AlertRecord` / `AlertClusterer`.
33//! - [`forest::ForestSnapshot`] — read-only health view that lets
34//!   downstream triage consume forest state without reaching into
35//!   reservoir internals.
36//!
37//! The scope is kept deliberately tight: streaming multivariate
38//! anomaly primitives only; no protocol parsers, no ONNX runtimes,
39//! no IP-centric trackers, no SOC-specific triage opinion. Those
40//! belong in [`anomstream-triage`](https://crates.io/crates/anomstream-triage)
41//! and [`anomstream-hotpath`](https://crates.io/crates/anomstream-hotpath).
42//!
43//! # Architecture
44//!
45//! ```text
46//! +-----------------+        +------------------+
47//! |  ForestBuilder  |        |  RcfConfig       |
48//! +--------+--------+        +---------+--------+
49//!          |                           |
50//!          v                           v
51//!   +------+---------------------------+------+
52//!   |          RandomCutForest                |   <- aggregate root
53//!   |                                         |
54//!   |   PointStore (ring buffer, repository)  |
55//!   |                                         |
56//!   |   trees: Vec<(RandomCutTree, Sampler)>  |
57//!   +-+---------------------+-----------------+
58//!     |                     |
59//!     v                     v
60//! +---+---------+   +-------+--------+
61//! | RandomCutTr |   | ReservoirSampl |
62//! +-------------+   +----------------+
63//!     |
64//!     v  via Visitor trait
65//! +---+----------------+    +-----------------------+
66//! | ScalarScoreVisitor |    | AttributionVisitor    |
67//! +--------------------+    +-----------------------+
68//! ```
69//!
70//! Modules are organised in a layered fashion: `domain` (value objects),
71//! `tree` (storage + cut tree), `sampler` (reservoir), `visitor` (scoring
72//! strategies), `forest` (aggregate root), `thresholded` (adaptive
73//! threshold layer on top of the forest), `pool` (bounded per-tenant
74//! detector pool with LRU eviction), `bootstrap` (cold-start replay
75//! helpers for restart resumption from an upstream TSDB),
76//! `group_score` (named-group decomposition of the per-dim
77//! attribution vector), `attribution_stability` (inter-tree
78//! dispersion + confidence for attribution), and `meta_drift`
79//! (two-sided CUSUM change-point detector over the score stream).
80//! The `persistence` module is gated behind the `serde` feature;
81//! `pool` is gated behind `std`.
82//!
83//! # Companion primitives
84//!
85//! Reusable streaming primitives that compose with the forest:
86//!
87//! | Module | Purpose |
88//! |---|---|
89//! | [`online_stats`] | Welford streaming mean + variance |
90//! | [`count_min_sketch`] | Probabilistic frequency sketch (`std`-gated) |
91//! | [`normalize`] | `MinMax` / `ZScore` / `None` per-feature transforms |
92//! | [`per_feature_ewma`] | Parallel univariate EWMA z-score detector |
93//! | [`per_feature_cusum`] | Parallel two-sided CUSUM change-point detector |
94//! | [`severity`] | Ordinal severity bands + classification |
95//!
96//! The companion layer is policy-free — detectors return raw
97//! statistics (z-scores, CUSUM magnitudes, min-max transforms);
98//! callers map them to alert severity via [`SeverityBands`] or a
99//! custom rule. `per_feature_cusum` intentionally co-exists with
100//! [`meta_drift::MetaDriftDetector`] (scalar CUSUM on the score
101//! stream): use the per-feature variant for attribution, the
102//! meta variant for score-level regime change.
103//!
104//! # Example
105//!
106//! ```ignore
107//! use anomstream_core::ForestBuilder;
108//!
109//! let mut forest = ForestBuilder::<4>::new()
110//!     .num_trees(100)
111//!     .sample_size(256)
112//!     .seed(42)
113//!     .build()?;
114//!
115//! for point in stream_of_points {
116//!     forest.update(point.clone())?;
117//!     let score = forest.score(&point)?;
118//!     if f64::from(score) > 1.5 {
119//!         eprintln!("anomaly: {score}");
120//!     }
121//! }
122//! # Ok::<(), anomstream_core::RcfError>(())
123//! ```
124//!
125//! # Conformance
126//!
127//! `anomstream-core` enforces the AWS `SageMaker` hyperparameter bounds at build time:
128//!
129//! | Parameter | Range | Default |
130//! |---|---|---|
131//! | `feature_dim` | `[1, 10000]` | required |
132//! | `num_trees` | `[50, 1000]` | `100` |
133//! | `num_samples_per_tree` | `[1, 2048]` | `256` |
134//! | `time_decay` | `[0, 1]` | `0.1 / sample_size` (tracks AWS Java `CompactSampler`; pass `0.0` to disable recency bias) |
135//!
136//! See the crate's `README.md` for the full conformance matrix and the
137//! comparison against `krcf` and the AWS Java port.
138//!
139//! # References
140//!
141//! 1. Sudipto Guha, Nina Mishra, Gourav Roy, Okke Schrijvers. "Robust Random
142//!    Cut Forest Based Anomaly Detection on Streams." *International
143//!    Conference on Machine Learning*, pp. 2712–2721. 2016.
144//! 2. Byung-Hoon Park, George Ostrouchov, Nagiza F. Samatova, Al Geist.
145//!    "Reservoir-based random sampling with replacement from data stream."
146//!    *SIAM International Conference on Data Mining*, pp. 492–496. 2004.
147//! 3. [AWS `SageMaker` RCF reference][aws-rcf].
148//!
149//! [aws-rcf]: https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html
150
151#![cfg_attr(not(feature = "std"), no_std)]
152#![forbid(unsafe_code)]
153#![warn(missing_docs)]
154#![warn(clippy::missing_docs_in_private_items)]
155// The crate-level `deny` on the Clippy restriction lints `unwrap_used` /
156// `panic` keeps the public production paths free of panicking unwraps.
157// Unit tests inside `#[cfg(test)]` modules use `.unwrap()` and explicit
158// `panic!` as fail-fast idioms, so test builds opt out of both lints here.
159#![cfg_attr(test, allow(clippy::unwrap_used, clippy::panic))]
160
161extern crate alloc;
162
163#[cfg(feature = "std")]
164pub mod adwin;
165pub mod attribution_stability;
166#[cfg(feature = "std")]
167pub mod bloom;
168pub mod bootstrap;
169pub mod config;
170#[cfg(feature = "std")]
171pub mod count_min_sketch;
172pub mod domain;
173#[cfg(feature = "std")]
174pub mod drift_aware;
175#[cfg(feature = "std")]
176pub mod dynamic_forest;
177pub mod early_term;
178#[cfg(feature = "std")]
179pub mod ensemble;
180pub mod error;
181pub mod feature_drift;
182pub mod forensic;
183pub mod forest;
184pub mod group_score;
185pub mod histogram;
186#[cfg(feature = "std")]
187pub mod hyperloglog;
188#[cfg(feature = "std")]
189pub mod matrix_profile;
190pub mod meta_drift;
191pub mod metrics;
192pub mod normalize;
193pub mod online_stats;
194pub mod per_feature_cusum;
195pub mod per_feature_ewma;
196#[cfg(feature = "serde")]
197pub mod persistence;
198#[cfg(feature = "std")]
199pub mod pool;
200pub mod sampler;
201pub mod score_ci;
202#[cfg(feature = "serde")]
203pub mod serde_util;
204pub mod severity;
205#[cfg(feature = "std")]
206pub mod shingled;
207#[cfg(feature = "std")]
208pub mod space_saving;
209pub mod tdigest;
210pub mod thresholded;
211pub mod tree;
212#[cfg(feature = "std")]
213pub mod tsb_ad_m;
214#[cfg(feature = "std")]
215pub mod univariate_spot;
216pub mod visitor;
217#[cfg(feature = "std")]
218pub mod vus_pr;
219
220#[cfg(feature = "std")]
221pub use adwin::{
222    AdwinDetector, DEFAULT_DELTA as ADWIN_DEFAULT_DELTA, DEFAULT_WINDOW_CAP as ADWIN_DEFAULT_WINDOW,
223};
224pub use attribution_stability::AttributionStability;
225#[cfg(feature = "std")]
226pub use bloom::{
227    BloomFilter, DEFAULT_FALSE_POSITIVE_RATE as BLOOM_DEFAULT_FPR, MAX_HASHES as BLOOM_MAX_HASHES,
228};
229pub use bootstrap::BootstrapReport;
230pub use config::{ForestBuilder, RcfConfig};
231#[cfg(feature = "std")]
232pub use count_min_sketch::CountMinSketch;
233pub use domain::{AnomalyScore, BoundingBox, Cut, DiVector, Point};
234#[cfg(feature = "std")]
235pub use drift_aware::{DriftAwareForest, DriftRecoveryConfig};
236#[cfg(feature = "std")]
237pub use dynamic_forest::DynamicForest;
238pub use early_term::{EarlyTermConfig, EarlyTermScore};
239#[cfg(feature = "std")]
240pub use ensemble::{chi_squared_survival_even, fisher_combine};
241pub use error::{RcfError, RcfResult};
242pub use feature_drift::{DriftLevel, FeatureDriftDetector};
243pub use forensic::ForensicBaseline;
244pub use forest::{ForestSnapshot, PointStore, RandomCutForest};
245pub use group_score::{FeatureGroup, FeatureGroups, FeatureGroupsBuilder, GroupScores};
246pub use histogram::{HistogramConfig, ScoreHistogram};
247#[cfg(feature = "std")]
248pub use hyperloglog::{
249    DEFAULT_PRECISION as HLL_DEFAULT_PRECISION, HyperLogLog, MAX_PRECISION as HLL_MAX_PRECISION,
250    MIN_PRECISION as HLL_MIN_PRECISION,
251};
252#[cfg(feature = "std")]
253pub use matrix_profile::{MIN_WINDOW as MATRIX_PROFILE_MIN_WINDOW, MatrixProfile};
254pub use meta_drift::{CusumConfig, DriftKind, DriftVerdict, MetaDriftDetector};
255pub use metrics::{MetricsSink, NoopSink};
256pub use normalize::{NormParams, NormStrategy, Normalizer};
257pub use online_stats::OnlineStats;
258pub use per_feature_cusum::{
259    DriftDirection, PerFeatureCusum, PerFeatureCusumAccumulator, PerFeatureCusumAlert,
260    PerFeatureCusumConfig, PerFeatureCusumResult,
261};
262pub use per_feature_ewma::{
263    EwmaAccumulator, PerFeatureEwma, PerFeatureEwmaConfig, PerFeatureEwmaResult,
264};
265#[cfg(feature = "std")]
266pub use pool::{ReadinessSummary, TenantForestPool};
267pub use sampler::{ReservoirSampler, SamplerOp};
268pub use score_ci::{DEFAULT_Z_FACTOR as DEFAULT_CI_Z_FACTOR, ScoreWithConfidence};
269pub use severity::{Severity, SeverityBands};
270#[cfg(feature = "std")]
271pub use shingled::{ShingledForest, ShingledForestBuilder};
272#[cfg(feature = "std")]
273pub use space_saving::{
274    DEFAULT_CAPACITY as SPACE_SAVING_DEFAULT_CAPACITY, HeavyHitter, HeavyHitterEntry, SpaceSaving,
275};
276pub use tdigest::{Centroid, DEFAULT_COMPRESSION as TDIGEST_DEFAULT_COMPRESSION, TDigest};
277pub use thresholded::{
278    AnomalyGrade, EmaStats, ThresholdMode, ThresholdedConfig, ThresholdedForest,
279    ThresholdedForestBuilder,
280};
281pub use tree::{
282    InternalData, LeafData, NodeRef, NodeStore, NodeView, NodeViewMut, PointAccessor, RandomCutTree,
283};
284#[cfg(feature = "std")]
285pub use tsb_ad_m::TsbAdMDataset;
286#[cfg(feature = "std")]
287pub use univariate_spot::{
288    DEFAULT_ALERT_P as SPOT_DEFAULT_ALERT_P, DEFAULT_QUANTILE as SPOT_DEFAULT_QUANTILE, PotDetector,
289};
290pub use visitor::{AttributionVisitor, ScalarScoreVisitor, ScoreAttributionVisitor, Visitor};
291#[cfg(feature = "std")]
292pub use vus_pr::{
293    DEFAULT_MAX_BUFFER as VUS_PR_DEFAULT_MAX_BUFFER, range_auc_pr, vus_pr, vus_pr_with_buffer,
294};