Skip to main content

anomstream_core/
histogram.rs

1//! Fixed-bin histogram for score / grade / CUSUM streams.
2//!
3//! [`ScoreHistogram`] bins a stream of `f64` observations into
4//! equal-width buckets on a caller-specified `[min, max)` range.
5//! Under- and overflow counters capture values outside the range
6//! so the total bin count plus `underflow + overflow` always sums
7//! to `total()`.
8//!
9//! Used for operator-facing dashboards: export the bin counts to
10//! Prometheus / Grafana, plot the distribution shape, and diagnose
11//! whether the detector's threshold is well-calibrated without
12//! re-deriving per-minute aggregates in the monitoring pipeline.
13//!
14//! The histogram itself is standalone — callers feed it the scores
15//! or grades they already collect through
16//! [`crate::ThresholdedForest::process`],
17//! [`crate::MetaDriftDetector::observe`], etc. Composition over
18//! entanglement: no dependency on the forest machinery.
19
20use alloc::format;
21use alloc::vec;
22use alloc::vec::Vec;
23
24use crate::error::{RcfError, RcfResult};
25
26/// Default number of equal-width bins used when the caller does not
27/// override [`HistogramConfig::bin_count`].
28pub const DEFAULT_BIN_COUNT: usize = 32;
29
30/// Validated configuration of a [`ScoreHistogram`].
31#[derive(Debug, Clone, Copy, PartialEq)]
32#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
33#[cfg_attr(feature = "serde", serde(try_from = "HistogramConfigShadow"))]
34pub struct HistogramConfig {
35    /// Number of equal-width bins covering `[min, max)`.
36    pub bin_count: usize,
37    /// Inclusive lower bound of the binned range.
38    pub min: f64,
39    /// Exclusive upper bound of the binned range.
40    pub max: f64,
41}
42
43/// Over-the-wire [`HistogramConfig`] layout. Deserialization lands
44/// here first so [`TryFrom`] can re-run [`HistogramConfig::validate`]
45/// before a live config is handed out — `bin_count > 0`, finite
46/// `min`/`max`, `min < max`.
47#[cfg(feature = "serde")]
48#[derive(serde::Serialize, serde::Deserialize)]
49#[allow(clippy::missing_docs_in_private_items)]
50struct HistogramConfigShadow {
51    bin_count: usize,
52    min: f64,
53    max: f64,
54}
55
56#[cfg(feature = "serde")]
57impl TryFrom<HistogramConfigShadow> for HistogramConfig {
58    type Error = RcfError;
59
60    fn try_from(raw: HistogramConfigShadow) -> Result<Self, Self::Error> {
61        let c = Self {
62            bin_count: raw.bin_count,
63            min: raw.min,
64            max: raw.max,
65        };
66        c.validate()?;
67        Ok(c)
68    }
69}
70
71impl HistogramConfig {
72    /// Build a config with the declared bounds and
73    /// [`DEFAULT_BIN_COUNT`] equal-width buckets.
74    ///
75    /// # Errors
76    ///
77    /// Same as [`Self::validate`].
78    pub fn with_range(min: f64, max: f64) -> RcfResult<Self> {
79        let c = Self {
80            bin_count: DEFAULT_BIN_COUNT,
81            min,
82            max,
83        };
84        c.validate()?;
85        Ok(c)
86    }
87
88    /// Validate every field.
89    ///
90    /// # Errors
91    ///
92    /// Returns [`RcfError::InvalidConfig`] when `bin_count == 0`,
93    /// when `min`/`max` are non-finite, or when `min >= max`.
94    pub fn validate(&self) -> RcfResult<()> {
95        if self.bin_count == 0 {
96            return Err(RcfError::InvalidConfig(
97                "HistogramConfig::bin_count must be > 0".into(),
98            ));
99        }
100        if !self.min.is_finite() || !self.max.is_finite() {
101            return Err(RcfError::InvalidConfig(
102                format!(
103                    "HistogramConfig bounds must be finite, got min={} max={}",
104                    self.min, self.max
105                )
106                .into(),
107            ));
108        }
109        if self.min >= self.max {
110            return Err(RcfError::InvalidConfig(
111                format!(
112                    "HistogramConfig::min ({}) must be strictly less than max ({})",
113                    self.min, self.max
114                )
115                .into(),
116            ));
117        }
118        Ok(())
119    }
120
121    /// Width of each equal-width bin.
122    #[must_use]
123    pub fn bin_width(&self) -> f64 {
124        #[allow(clippy::cast_precision_loss)]
125        {
126            (self.max - self.min) / self.bin_count as f64
127        }
128    }
129}
130
131/// Fixed-bin histogram over an `f64` stream.
132#[derive(Debug, Clone, PartialEq)]
133#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
134#[cfg_attr(feature = "serde", serde(try_from = "ScoreHistogramShadow"))]
135pub struct ScoreHistogram {
136    /// Validated bin layout.
137    config: HistogramConfig,
138    /// Per-bin observation counts; `bins[i]` covers
139    /// `[min + i · bin_width, min + (i+1) · bin_width)`.
140    bins: Vec<u64>,
141    /// Observations strictly below `config.min`.
142    underflow: u64,
143    /// Observations at or above `config.max`.
144    overflow: u64,
145    /// Non-finite observations (`NaN`, `±∞`) — counted separately so
146    /// `total()` can still reason about the sum of bin counts.
147    non_finite: u64,
148}
149
150/// Over-the-wire [`ScoreHistogram`] layout. Deserialization lands
151/// here first so [`TryFrom`] can re-run [`HistogramConfig::validate`]
152/// and confirm `bins.len() == config.bin_count` before a live
153/// histogram is handed out.
154#[cfg(feature = "serde")]
155#[derive(serde::Serialize, serde::Deserialize)]
156#[allow(clippy::missing_docs_in_private_items)]
157struct ScoreHistogramShadow {
158    config: HistogramConfig,
159    bins: Vec<u64>,
160    underflow: u64,
161    overflow: u64,
162    non_finite: u64,
163}
164
165#[cfg(feature = "serde")]
166impl TryFrom<ScoreHistogramShadow> for ScoreHistogram {
167    type Error = RcfError;
168
169    fn try_from(raw: ScoreHistogramShadow) -> Result<Self, Self::Error> {
170        raw.config.validate()?;
171        if raw.bins.len() != raw.config.bin_count {
172            return Err(RcfError::InvalidConfig(
173                format!(
174                    "ScoreHistogram: bins length {} != config.bin_count {}",
175                    raw.bins.len(),
176                    raw.config.bin_count
177                )
178                .into(),
179            ));
180        }
181        Ok(Self {
182            config: raw.config,
183            bins: raw.bins,
184            underflow: raw.underflow,
185            overflow: raw.overflow,
186            non_finite: raw.non_finite,
187        })
188    }
189}
190
191impl ScoreHistogram {
192    /// Build a fresh histogram with the supplied config.
193    ///
194    /// # Errors
195    ///
196    /// Forwards [`HistogramConfig::validate`] errors.
197    pub fn new(config: HistogramConfig) -> RcfResult<Self> {
198        config.validate()?;
199        Ok(Self {
200            bins: vec![0; config.bin_count],
201            config,
202            underflow: 0,
203            overflow: 0,
204            non_finite: 0,
205        })
206    }
207
208    /// Short-hand for `new(HistogramConfig::with_range(min, max)?)`.
209    ///
210    /// # Errors
211    ///
212    /// Same as [`Self::new`].
213    pub fn with_range(min: f64, max: f64) -> RcfResult<Self> {
214        Self::new(HistogramConfig::with_range(min, max)?)
215    }
216
217    /// Fold `value` into the histogram.
218    pub fn record(&mut self, value: f64) {
219        if !value.is_finite() {
220            self.non_finite = self.non_finite.saturating_add(1);
221            return;
222        }
223        if value < self.config.min {
224            self.underflow = self.underflow.saturating_add(1);
225            return;
226        }
227        if value >= self.config.max {
228            self.overflow = self.overflow.saturating_add(1);
229            return;
230        }
231        let width = self.config.bin_width();
232        #[allow(
233            clippy::cast_possible_truncation,
234            clippy::cast_sign_loss,
235            clippy::cast_precision_loss
236        )]
237        let mut idx = ((value - self.config.min) / width) as usize;
238        if idx >= self.bins.len() {
239            // Defensive: floating drift can nudge a value at the
240            // upper edge past the last bin. Cap to the last bin
241            // rather than bounce it to overflow.
242            idx = self.bins.len() - 1;
243        }
244        self.bins[idx] = self.bins[idx].saturating_add(1);
245    }
246
247    /// Read-only view of the per-bin counts, in ascending-value
248    /// order.
249    #[must_use]
250    pub fn bins(&self) -> &[u64] {
251        &self.bins
252    }
253
254    /// Inclusive/exclusive edges of every bin — `[(min_0, max_0),
255    /// (min_1, max_1), …]`. Useful for Prometheus-style export
256    /// where each bucket is named by its upper bound.
257    #[must_use]
258    pub fn bin_edges(&self) -> Vec<(f64, f64)> {
259        let width = self.config.bin_width();
260        let mut out = Vec::with_capacity(self.bins.len());
261        for i in 0..self.bins.len() {
262            #[allow(clippy::cast_precision_loss)]
263            let lo = self.config.min + width * i as f64;
264            let hi = lo + width;
265            out.push((lo, hi));
266        }
267        out
268    }
269
270    /// Configured bin layout.
271    #[must_use]
272    pub fn config(&self) -> &HistogramConfig {
273        &self.config
274    }
275
276    /// Observations below [`HistogramConfig::min`].
277    #[must_use]
278    pub fn underflow(&self) -> u64 {
279        self.underflow
280    }
281
282    /// Observations at or above [`HistogramConfig::max`].
283    #[must_use]
284    pub fn overflow(&self) -> u64 {
285        self.overflow
286    }
287
288    /// Non-finite observations (`NaN`, `±∞`).
289    #[must_use]
290    pub fn non_finite(&self) -> u64 {
291        self.non_finite
292    }
293
294    /// Total number of `record` calls — sum of every bin,
295    /// underflow, overflow, and non-finite.
296    #[must_use]
297    pub fn total(&self) -> u64 {
298        let sum: u64 = self.bins.iter().copied().sum();
299        sum.saturating_add(self.underflow)
300            .saturating_add(self.overflow)
301            .saturating_add(self.non_finite)
302    }
303
304    /// Drop every count, keeping the config.
305    pub fn reset(&mut self) {
306        for b in &mut self.bins {
307            *b = 0;
308        }
309        self.underflow = 0;
310        self.overflow = 0;
311        self.non_finite = 0;
312    }
313
314    /// Merge `other` into `self`. Both histograms must share the
315    /// exact same config.
316    ///
317    /// # Errors
318    ///
319    /// Returns [`RcfError::InvalidConfig`] when the configs differ.
320    pub fn merge(&mut self, other: &Self) -> RcfResult<()> {
321        if self.config != other.config {
322            return Err(RcfError::InvalidConfig(
323                "ScoreHistogram::merge requires identical configs".into(),
324            ));
325        }
326        for (a, b) in self.bins.iter_mut().zip(other.bins.iter()) {
327            *a = a.saturating_add(*b);
328        }
329        self.underflow = self.underflow.saturating_add(other.underflow);
330        self.overflow = self.overflow.saturating_add(other.overflow);
331        self.non_finite = self.non_finite.saturating_add(other.non_finite);
332        Ok(())
333    }
334
335    /// Linear-interpolated percentile of the recorded values.
336    /// Returns `None` when `p` is outside `[0, 1]` or the histogram
337    /// has seen no in-range observations. Under-/overflow and
338    /// non-finite counts are excluded from the percentile.
339    #[must_use]
340    pub fn percentile(&self, p: f64) -> Option<f64> {
341        if !p.is_finite() || !(0.0..=1.0).contains(&p) {
342            return None;
343        }
344        let total: u64 = self.bins.iter().copied().sum();
345        if total == 0 {
346            return None;
347        }
348        #[allow(clippy::cast_precision_loss)]
349        let target = p * total as f64;
350        let width = self.config.bin_width();
351        let mut cum: u64 = 0;
352        for (i, count) in self.bins.iter().enumerate() {
353            let prev = cum;
354            cum = cum.saturating_add(*count);
355            #[allow(clippy::cast_precision_loss)]
356            let prev_f = prev as f64;
357            #[allow(clippy::cast_precision_loss)]
358            let cum_f = cum as f64;
359            if target <= cum_f && *count > 0 {
360                let in_bin =
361                    (target - prev_f) / f64::from(u32::try_from(*count).unwrap_or(u32::MAX));
362                #[allow(clippy::cast_precision_loss)]
363                let lo = self.config.min + width * i as f64;
364                return Some(lo + width * in_bin.clamp(0.0, 1.0));
365            }
366        }
367        None
368    }
369}
370
371#[cfg(test)]
372#[allow(clippy::float_cmp)] // Tests assert closed-form histogram behaviour.
373mod tests {
374    use super::*;
375
376    fn hist() -> ScoreHistogram {
377        ScoreHistogram::new(HistogramConfig {
378            bin_count: 10,
379            min: 0.0,
380            max: 10.0,
381        })
382        .unwrap()
383    }
384
385    #[test]
386    fn new_rejects_zero_bin_count() {
387        assert!(
388            ScoreHistogram::new(HistogramConfig {
389                bin_count: 0,
390                min: 0.0,
391                max: 1.0,
392            })
393            .is_err()
394        );
395    }
396
397    #[test]
398    fn new_rejects_inverted_range() {
399        assert!(
400            ScoreHistogram::new(HistogramConfig {
401                bin_count: 4,
402                min: 1.0,
403                max: 0.5,
404            })
405            .is_err()
406        );
407    }
408
409    #[test]
410    fn new_rejects_non_finite_bounds() {
411        assert!(
412            ScoreHistogram::new(HistogramConfig {
413                bin_count: 4,
414                min: f64::NAN,
415                max: 1.0,
416            })
417            .is_err()
418        );
419    }
420
421    #[test]
422    fn record_routes_value_to_correct_bin() {
423        let mut h = hist();
424        h.record(0.5); // bin 0
425        h.record(1.5); // bin 1
426        h.record(9.9); // bin 9
427        assert_eq!(h.bins()[0], 1);
428        assert_eq!(h.bins()[1], 1);
429        assert_eq!(h.bins()[9], 1);
430        assert_eq!(h.total(), 3);
431    }
432
433    #[test]
434    fn record_under_and_overflow() {
435        let mut h = hist();
436        h.record(-1.0);
437        h.record(10.0); // exclusive max
438        h.record(100.0);
439        assert_eq!(h.underflow(), 1);
440        assert_eq!(h.overflow(), 2);
441        assert_eq!(h.total(), 3);
442    }
443
444    #[test]
445    fn record_non_finite_tallied_separately() {
446        let mut h = hist();
447        h.record(f64::NAN);
448        h.record(f64::INFINITY);
449        h.record(f64::NEG_INFINITY);
450        assert_eq!(h.non_finite(), 3);
451        assert_eq!(h.total(), 3);
452        assert!(h.bins().iter().all(|&c| c == 0));
453    }
454
455    #[test]
456    fn upper_edge_goes_to_last_bin_not_overflow() {
457        // `max` is exclusive by contract — a value landing exactly
458        // on `max - ε` should fall in the last bin via floating
459        // drift fallback.
460        let mut h = hist();
461        h.record(9.999_999_999);
462        assert_eq!(h.bins()[9], 1);
463        assert_eq!(h.overflow(), 0);
464    }
465
466    #[test]
467    fn bin_edges_cover_whole_range() {
468        let h = hist();
469        let edges = h.bin_edges();
470        assert_eq!(edges.len(), 10);
471        assert_eq!(edges[0], (0.0, 1.0));
472        assert_eq!(edges[9], (9.0, 10.0));
473    }
474
475    #[test]
476    fn reset_clears_counts_but_keeps_config() {
477        let mut h = hist();
478        for _ in 0..5 {
479            h.record(3.0);
480        }
481        h.record(-1.0);
482        h.reset();
483        assert_eq!(h.total(), 0);
484        assert_eq!(h.underflow(), 0);
485        assert_eq!(h.config().bin_count, 10);
486    }
487
488    #[test]
489    fn merge_sums_componentwise() {
490        let mut a = hist();
491        a.record(1.0);
492        a.record(5.0);
493        let mut b = hist();
494        b.record(5.0);
495        b.record(20.0);
496        a.merge(&b).unwrap();
497        assert_eq!(a.bins()[1], 1);
498        assert_eq!(a.bins()[5], 2);
499        assert_eq!(a.overflow(), 1);
500    }
501
502    #[test]
503    fn merge_rejects_mismatched_config() {
504        let mut a = hist();
505        let b = ScoreHistogram::with_range(0.0, 100.0).unwrap();
506        assert!(a.merge(&b).is_err());
507    }
508
509    #[test]
510    fn percentile_handles_empty() {
511        let h = hist();
512        assert!(h.percentile(0.5).is_none());
513    }
514
515    #[test]
516    fn percentile_interpolates_within_bin() {
517        let mut h = hist();
518        for _ in 0..100 {
519            h.record(5.0);
520        }
521        let p50 = h.percentile(0.5).unwrap();
522        assert!((5.0..6.0).contains(&p50));
523    }
524
525    #[test]
526    fn with_range_uses_default_bin_count() {
527        let h = ScoreHistogram::with_range(0.0, 1.0).unwrap();
528        assert_eq!(h.bins().len(), DEFAULT_BIN_COUNT);
529    }
530
531    #[cfg(all(feature = "serde", feature = "postcard"))]
532    #[test]
533    fn deserialize_rejects_nan_bounds() {
534        let bad = HistogramConfigShadow {
535            bin_count: 10,
536            min: f64::NAN,
537            max: 10.0,
538        };
539        let bytes = postcard::to_allocvec(&bad).unwrap();
540        let back: Result<HistogramConfig, _> = postcard::from_bytes(&bytes);
541        assert!(back.is_err());
542    }
543
544    #[cfg(all(feature = "serde", feature = "postcard"))]
545    #[test]
546    fn deserialize_rejects_inverted_bounds() {
547        let bad = HistogramConfigShadow {
548            bin_count: 10,
549            min: 5.0,
550            max: 1.0,
551        };
552        let bytes = postcard::to_allocvec(&bad).unwrap();
553        let back: Result<HistogramConfig, _> = postcard::from_bytes(&bytes);
554        assert!(back.is_err());
555    }
556
557    #[cfg(all(feature = "serde", feature = "postcard"))]
558    #[test]
559    fn deserialize_rejects_bin_length_mismatch() {
560        let bad = ScoreHistogramShadow {
561            config: HistogramConfig {
562                bin_count: 10,
563                min: 0.0,
564                max: 1.0,
565            },
566            bins: alloc::vec![0_u64; 3],
567            underflow: 0,
568            overflow: 0,
569            non_finite: 0,
570        };
571        let bytes = postcard::to_allocvec(&bad).unwrap();
572        let back: Result<ScoreHistogram, _> = postcard::from_bytes(&bytes);
573        assert!(back.is_err());
574    }
575}