Skip to main content

irithyll_core/histogram/
categorical.rs

1//! Categorical feature binning -- one bin per observed category value.
2//!
3//! Unlike continuous binning strategies that discretize a range, categorical
4//! binning assigns each distinct category value its own bin. Category values
5//! are encoded as integers (cast from `f64`) and bins are ordered by value.
6//!
7//! The bin edges are placed at midpoints between consecutive category values,
8//! which allows the existing `BinEdges::find_bin()` to route category values
9//! to the correct bin via binary search.
10//!
11//! # Limits
12//!
13//! Supports up to 64 distinct category values per feature (matching the `u64`
14//! bitmask used for categorical split routing in the tree arena).
15
16use alloc::boxed::Box;
17use alloc::vec::Vec;
18
19use super::{BinEdges, BinningStrategy};
20
21/// Maximum number of distinct categories supported per feature.
22///
23/// Limited by the `u64` bitmask used for split routing.
24pub const MAX_CATEGORIES: usize = 64;
25
26/// Categorical binning: one bin per observed distinct value.
27///
28/// Tracks observed category values (as sorted integers) and creates bin
29/// edges at midpoints between consecutive values.
30#[derive(Debug, Clone)]
31pub struct CategoricalBinning {
32    /// Sorted list of distinct observed category values.
33    categories: Vec<i64>,
34}
35
36impl CategoricalBinning {
37    /// Create a new empty categorical binner.
38    pub fn new() -> Self {
39        Self {
40            categories: Vec::new(),
41        }
42    }
43
44    /// Number of distinct categories observed so far.
45    #[inline]
46    pub fn n_categories(&self) -> usize {
47        self.categories.len()
48    }
49
50    /// The sorted list of observed category values.
51    pub fn categories(&self) -> &[i64] {
52        &self.categories
53    }
54
55    /// Find the bin index for a category value.
56    ///
57    /// Returns the index of `value` in the sorted categories list, or `None`
58    /// if the value hasn't been observed.
59    pub fn category_index(&self, value: f64) -> Option<usize> {
60        let v = value as i64;
61        self.categories.binary_search(&v).ok()
62    }
63}
64
65impl Default for CategoricalBinning {
66    fn default() -> Self {
67        Self::new()
68    }
69}
70
71impl BinningStrategy for CategoricalBinning {
72    fn observe(&mut self, value: f64) {
73        let v = value as i64;
74        // Insert in sorted order if not already present, up to MAX_CATEGORIES
75        match self.categories.binary_search(&v) {
76            Ok(_) => {} // already present
77            Err(pos) => {
78                if self.categories.len() < MAX_CATEGORIES {
79                    self.categories.insert(pos, v);
80                }
81                // Silently ignore categories beyond MAX_CATEGORIES
82            }
83        }
84    }
85
86    fn compute_edges(&self, _n_bins: usize) -> BinEdges {
87        // For categorical features, n_bins is ignored -- we use one bin per category.
88        // Bin edges are placed at midpoints between consecutive category values.
89        if self.categories.len() <= 1 {
90            return BinEdges { edges: Vec::new() };
91        }
92
93        let edges: Vec<f64> = self
94            .categories
95            .windows(2)
96            .map(|w| (w[0] as f64 + w[1] as f64) / 2.0)
97            .collect();
98
99        BinEdges { edges }
100    }
101
102    fn reset(&mut self) {
103        self.categories.clear();
104    }
105
106    fn clone_fresh(&self) -> Box<dyn BinningStrategy> {
107        Box::new(CategoricalBinning::new())
108    }
109}
110
111#[cfg(test)]
112mod tests {
113    use super::*;
114
115    #[test]
116    fn empty_binner() {
117        let b = CategoricalBinning::new();
118        assert_eq!(b.n_categories(), 0);
119        let edges = b.compute_edges(10);
120        assert!(edges.edges.is_empty());
121        assert_eq!(edges.n_bins(), 1);
122    }
123
124    #[test]
125    fn single_category() {
126        let mut b = CategoricalBinning::new();
127        b.observe(3.0);
128        b.observe(3.0); // duplicate
129        assert_eq!(b.n_categories(), 1);
130        let edges = b.compute_edges(10);
131        assert!(edges.edges.is_empty());
132        assert_eq!(edges.n_bins(), 1);
133    }
134
135    #[test]
136    fn two_categories() {
137        let mut b = CategoricalBinning::new();
138        b.observe(1.0);
139        b.observe(5.0);
140        assert_eq!(b.n_categories(), 2);
141
142        let edges = b.compute_edges(10);
143        assert_eq!(edges.edges.len(), 1);
144        // Midpoint between 1 and 5 = 3.0
145        assert!((edges.edges[0] - 3.0).abs() < 1e-10);
146        assert_eq!(edges.n_bins(), 2);
147    }
148
149    #[test]
150    fn multiple_categories_sorted() {
151        let mut b = CategoricalBinning::new();
152        // Insert out of order
153        b.observe(5.0);
154        b.observe(1.0);
155        b.observe(3.0);
156        b.observe(7.0);
157
158        assert_eq!(b.categories(), &[1, 3, 5, 7]);
159
160        let edges = b.compute_edges(10);
161        assert_eq!(edges.edges.len(), 3);
162        assert!((edges.edges[0] - 2.0).abs() < 1e-10); // midpoint(1,3)
163        assert!((edges.edges[1] - 4.0).abs() < 1e-10); // midpoint(3,5)
164        assert!((edges.edges[2] - 6.0).abs() < 1e-10); // midpoint(5,7)
165    }
166
167    #[test]
168    fn find_bin_routes_correctly() {
169        let mut b = CategoricalBinning::new();
170        for i in 0..5 {
171            b.observe(i as f64 * 2.0); // 0, 2, 4, 6, 8
172        }
173
174        let edges = b.compute_edges(10);
175        // Edges: 1.0, 3.0, 5.0, 7.0 → bins: [<=1], (1,3], (3,5], (5,7], (>7)
176        assert_eq!(edges.find_bin(0.0), 0); // category 0
177        assert_eq!(edges.find_bin(2.0), 1); // category 2
178        assert_eq!(edges.find_bin(4.0), 2); // category 4
179        assert_eq!(edges.find_bin(6.0), 3); // category 6
180        assert_eq!(edges.find_bin(8.0), 4); // category 8
181    }
182
183    #[test]
184    fn category_index_lookup() {
185        let mut b = CategoricalBinning::new();
186        b.observe(10.0);
187        b.observe(20.0);
188        b.observe(30.0);
189
190        assert_eq!(b.category_index(10.0), Some(0));
191        assert_eq!(b.category_index(20.0), Some(1));
192        assert_eq!(b.category_index(30.0), Some(2));
193        assert_eq!(b.category_index(15.0), None);
194    }
195
196    #[test]
197    fn max_categories_enforced() {
198        let mut b = CategoricalBinning::new();
199        for i in 0..100 {
200            b.observe(i as f64);
201        }
202        assert_eq!(b.n_categories(), MAX_CATEGORIES);
203    }
204
205    #[test]
206    fn reset_clears() {
207        let mut b = CategoricalBinning::new();
208        b.observe(1.0);
209        b.observe(2.0);
210        b.observe(3.0);
211        b.reset();
212        assert_eq!(b.n_categories(), 0);
213    }
214
215    #[test]
216    fn n_bins_ignored() {
217        let mut b = CategoricalBinning::new();
218        b.observe(1.0);
219        b.observe(2.0);
220        b.observe(3.0);
221
222        // n_bins param is ignored for categorical
223        let edges_2 = b.compute_edges(2);
224        let edges_100 = b.compute_edges(100);
225        assert_eq!(edges_2.edges, edges_100.edges);
226    }
227}