Skip to main content

irithyll_core/histogram/
mod.rs

1//! Histogram-based feature binning for streaming tree construction.
2//!
3//! Streaming trees need to evaluate potential splits without storing raw data.
4//! Histograms discretize continuous features into bins and accumulate gradient/hessian
5//! statistics per bin, enabling efficient split evaluation.
6
7use alloc::boxed::Box;
8use alloc::vec::Vec;
9
10pub mod bins;
11pub mod categorical;
12#[cfg(feature = "kmeans-binning")]
13#[cfg_attr(docsrs, doc(cfg(feature = "kmeans-binning")))]
14pub mod kmeans;
15pub mod quantile;
16#[cfg(feature = "simd")]
17#[cfg_attr(docsrs, doc(cfg(feature = "simd")))]
18pub mod simd;
19pub mod uniform;
20
21/// Bin edge boundaries computed by a binning strategy.
22#[derive(Debug, Clone)]
23pub struct BinEdges {
24    /// Sorted thresholds defining bin boundaries. `n_bins - 1` values.
25    pub edges: Vec<f64>,
26}
27
28impl BinEdges {
29    /// Find which bin a value falls into. Returns bin index in `[0, n_bins)`.
30    #[inline]
31    pub fn find_bin(&self, value: f64) -> usize {
32        match self
33            .edges
34            .binary_search_by(|e| e.partial_cmp(&value).unwrap())
35        {
36            Ok(i) => i + 1,
37            Err(i) => i,
38        }
39    }
40
41    /// Number of bins (edges.len() + 1).
42    #[inline]
43    pub fn n_bins(&self) -> usize {
44        self.edges.len() + 1
45    }
46}
47
48/// A strategy for computing histogram bin edges from a stream of values.
49pub trait BinningStrategy: Send + Sync + 'static {
50    /// Observe a single value from the stream.
51    fn observe(&mut self, value: f64);
52
53    /// Compute bin edges from observed values.
54    fn compute_edges(&self, n_bins: usize) -> BinEdges;
55
56    /// Reset observed state.
57    fn reset(&mut self);
58
59    /// Create a fresh instance with the same configuration.
60    fn clone_fresh(&self) -> Box<dyn BinningStrategy>;
61}
62
63// ---------------------------------------------------------------------------
64// BinnerKind -- enum dispatch replacing Box<dyn BinningStrategy>
65// ---------------------------------------------------------------------------
66
67/// Concrete binning strategy enum, eliminating `Box<dyn BinningStrategy>`
68/// heap allocations per feature per leaf.
69///
70/// This is used internally by `HoeffdingTree`
71/// leaf states. Each variant delegates to the corresponding strategy impl.
72#[derive(Debug, Clone)]
73#[non_exhaustive]
74pub enum BinnerKind {
75    /// Equal-width binning (default).
76    Uniform(uniform::UniformBinning),
77
78    /// Categorical binning -- one bin per observed distinct value.
79    Categorical(categorical::CategoricalBinning),
80
81    /// K-means binning (feature-gated).
82    #[cfg(feature = "kmeans-binning")]
83    #[cfg_attr(docsrs, doc(cfg(feature = "kmeans-binning")))]
84    KMeans(kmeans::KMeansBinning),
85}
86
87impl BinnerKind {
88    /// Create a new uniform binner (the default strategy).
89    #[inline]
90    pub fn uniform() -> Self {
91        BinnerKind::Uniform(uniform::UniformBinning::new())
92    }
93
94    /// Create a new categorical binner.
95    #[inline]
96    pub fn categorical() -> Self {
97        BinnerKind::Categorical(categorical::CategoricalBinning::new())
98    }
99
100    /// Create a new k-means binner.
101    #[cfg(feature = "kmeans-binning")]
102    #[cfg_attr(docsrs, doc(cfg(feature = "kmeans-binning")))]
103    #[inline]
104    pub fn kmeans() -> Self {
105        BinnerKind::KMeans(kmeans::KMeansBinning::new())
106    }
107
108    /// Observe a single value.
109    #[inline]
110    pub fn observe(&mut self, value: f64) {
111        match self {
112            BinnerKind::Uniform(b) => b.observe(value),
113            BinnerKind::Categorical(b) => b.observe(value),
114            #[cfg(feature = "kmeans-binning")]
115            BinnerKind::KMeans(b) => b.observe(value),
116        }
117    }
118
119    /// Compute bin edges from observed values.
120    #[inline]
121    pub fn compute_edges(&self, n_bins: usize) -> BinEdges {
122        match self {
123            BinnerKind::Uniform(b) => b.compute_edges(n_bins),
124            BinnerKind::Categorical(b) => b.compute_edges(n_bins),
125            #[cfg(feature = "kmeans-binning")]
126            BinnerKind::KMeans(b) => b.compute_edges(n_bins),
127        }
128    }
129
130    /// Reset observed state.
131    #[inline]
132    pub fn reset(&mut self) {
133        match self {
134            BinnerKind::Uniform(b) => b.reset(),
135            BinnerKind::Categorical(b) => b.reset(),
136            #[cfg(feature = "kmeans-binning")]
137            BinnerKind::KMeans(b) => b.reset(),
138        }
139    }
140
141    /// Whether this binner is for a categorical feature.
142    #[inline]
143    pub fn is_categorical(&self) -> bool {
144        matches!(self, BinnerKind::Categorical(_))
145    }
146
147    /// Create a fresh instance with the same variant but no data.
148    pub fn clone_fresh(&self) -> Self {
149        match self {
150            BinnerKind::Uniform(_) => BinnerKind::Uniform(uniform::UniformBinning::new()),
151            BinnerKind::Categorical(_) => {
152                BinnerKind::Categorical(categorical::CategoricalBinning::new())
153            }
154            #[cfg(feature = "kmeans-binning")]
155            BinnerKind::KMeans(_) => BinnerKind::KMeans(kmeans::KMeansBinning::new()),
156        }
157    }
158}