Skip to main content

irithyll_core/histogram/
mod.rs

1//! Histogram-based feature binning for streaming tree construction.
2//!
3//! Streaming trees need to evaluate potential splits without storing raw data.
4//! Histograms discretize continuous features into bins and accumulate gradient/hessian
5//! statistics per bin, enabling efficient split evaluation.
6
7use alloc::boxed::Box;
8use alloc::vec::Vec;
9
10pub mod bins;
11pub mod categorical;
12#[cfg(feature = "kmeans-binning")]
13pub mod kmeans;
14pub mod quantile;
15#[cfg(feature = "simd")]
16pub mod simd;
17pub mod uniform;
18
19/// Bin edge boundaries computed by a binning strategy.
20#[derive(Debug, Clone)]
21pub struct BinEdges {
22    /// Sorted thresholds defining bin boundaries. `n_bins - 1` values.
23    pub edges: Vec<f64>,
24}
25
26impl BinEdges {
27    /// Find which bin a value falls into. Returns bin index in `[0, n_bins)`.
28    #[inline]
29    pub fn find_bin(&self, value: f64) -> usize {
30        match self
31            .edges
32            .binary_search_by(|e| e.partial_cmp(&value).unwrap())
33        {
34            Ok(i) => i + 1,
35            Err(i) => i,
36        }
37    }
38
39    /// Number of bins (edges.len() + 1).
40    #[inline]
41    pub fn n_bins(&self) -> usize {
42        self.edges.len() + 1
43    }
44}
45
46/// A strategy for computing histogram bin edges from a stream of values.
47pub trait BinningStrategy: Send + Sync + 'static {
48    /// Observe a single value from the stream.
49    fn observe(&mut self, value: f64);
50
51    /// Compute bin edges from observed values.
52    fn compute_edges(&self, n_bins: usize) -> BinEdges;
53
54    /// Reset observed state.
55    fn reset(&mut self);
56
57    /// Create a fresh instance with the same configuration.
58    fn clone_fresh(&self) -> Box<dyn BinningStrategy>;
59}
60
61// ---------------------------------------------------------------------------
62// BinnerKind -- enum dispatch replacing Box<dyn BinningStrategy>
63// ---------------------------------------------------------------------------
64
65/// Concrete binning strategy enum, eliminating `Box<dyn BinningStrategy>`
66/// heap allocations per feature per leaf.
67///
68/// This is used internally by `HoeffdingTree`
69/// leaf states. Each variant delegates to the corresponding strategy impl.
70#[derive(Debug, Clone)]
71pub enum BinnerKind {
72    /// Equal-width binning (default).
73    Uniform(uniform::UniformBinning),
74
75    /// Categorical binning -- one bin per observed distinct value.
76    Categorical(categorical::CategoricalBinning),
77
78    /// K-means binning (feature-gated).
79    #[cfg(feature = "kmeans-binning")]
80    KMeans(kmeans::KMeansBinning),
81}
82
83impl BinnerKind {
84    /// Create a new uniform binner (the default strategy).
85    #[inline]
86    pub fn uniform() -> Self {
87        BinnerKind::Uniform(uniform::UniformBinning::new())
88    }
89
90    /// Create a new categorical binner.
91    #[inline]
92    pub fn categorical() -> Self {
93        BinnerKind::Categorical(categorical::CategoricalBinning::new())
94    }
95
96    /// Create a new k-means binner.
97    #[cfg(feature = "kmeans-binning")]
98    #[inline]
99    pub fn kmeans() -> Self {
100        BinnerKind::KMeans(kmeans::KMeansBinning::new())
101    }
102
103    /// Observe a single value.
104    #[inline]
105    pub fn observe(&mut self, value: f64) {
106        match self {
107            BinnerKind::Uniform(b) => b.observe(value),
108            BinnerKind::Categorical(b) => b.observe(value),
109            #[cfg(feature = "kmeans-binning")]
110            BinnerKind::KMeans(b) => b.observe(value),
111        }
112    }
113
114    /// Compute bin edges from observed values.
115    #[inline]
116    pub fn compute_edges(&self, n_bins: usize) -> BinEdges {
117        match self {
118            BinnerKind::Uniform(b) => b.compute_edges(n_bins),
119            BinnerKind::Categorical(b) => b.compute_edges(n_bins),
120            #[cfg(feature = "kmeans-binning")]
121            BinnerKind::KMeans(b) => b.compute_edges(n_bins),
122        }
123    }
124
125    /// Reset observed state.
126    #[inline]
127    pub fn reset(&mut self) {
128        match self {
129            BinnerKind::Uniform(b) => b.reset(),
130            BinnerKind::Categorical(b) => b.reset(),
131            #[cfg(feature = "kmeans-binning")]
132            BinnerKind::KMeans(b) => b.reset(),
133        }
134    }
135
136    /// Whether this binner is for a categorical feature.
137    #[inline]
138    pub fn is_categorical(&self) -> bool {
139        matches!(self, BinnerKind::Categorical(_))
140    }
141
142    /// Create a fresh instance with the same variant but no data.
143    pub fn clone_fresh(&self) -> Self {
144        match self {
145            BinnerKind::Uniform(_) => BinnerKind::Uniform(uniform::UniformBinning::new()),
146            BinnerKind::Categorical(_) => {
147                BinnerKind::Categorical(categorical::CategoricalBinning::new())
148            }
149            #[cfg(feature = "kmeans-binning")]
150            BinnerKind::KMeans(_) => BinnerKind::KMeans(kmeans::KMeansBinning::new()),
151        }
152    }
153}