treeboost 0.1.0

High-performance Gradient Boosted Decision Tree engine for large-scale tabular data
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//! Histogram data structure
//!
//! Provides SIMD-optimized histogram operations for gradient/hessian accumulation.

use crate::dataset::BinEntry;
use rkyv::{Archive, Deserialize, Serialize};

/// Number of bins per histogram (u8 range)
///
/// Bin indices throughout this module are `u8`, so every possible index
/// (0..=255) addresses a valid slot of an array of this length.
pub const NUM_BINS: usize = 256;

/// Histogram for a single feature
///
/// Fixed-size array of 256 bin entries for gradient/hessian accumulation.
/// Each entry carries the per-bin gradient sum, hessian sum and sample count.
///
/// Besides `Debug` and `Clone`, the type derives rkyv's `Archive`,
/// `Serialize` and `Deserialize`.
#[derive(Debug, Clone, Archive, Serialize, Deserialize)]
pub struct Histogram {
    /// Bin entries indexed by bin value (0-255)
    bins: [BinEntry; NUM_BINS],
}

impl Default for Histogram {
    fn default() -> Self {
        Self::new()
    }
}

impl Histogram {
    /// Create a new empty histogram
    ///
    /// Every bin starts out as `BinEntry::default()`.
    pub fn new() -> Self {
        Self {
            bins: [BinEntry::default(); NUM_BINS],
        }
    }

    /// Create histogram from raw arrays (used by SIMD kernels)
    ///
    /// Bin `i` of the result is built from `grads[i]`, `hess[i]` and `counts[i]`.
    ///
    /// # Arguments
    /// * `grads` - Sum of gradients per bin [256]
    /// * `hess` - Sum of hessians per bin [256]
    /// * `counts` - Count per bin [256]
    pub fn from_raw_arrays(grads: &[f32; 256], hess: &[f32; 256], counts: &[u32; 256]) -> Self {
        let mut bins = [BinEntry::default(); NUM_BINS];
        for (i, bin) in bins.iter_mut().enumerate() {
            *bin = BinEntry {
                sum_gradients: grads[i],
                sum_hessians: hess[i],
                count: counts[i],
            };
        }
        Self { bins }
    }

    /// Clear all bins
    ///
    /// Resets every bin to `BinEntry::default()`.
    pub fn clear(&mut self) {
        for bin in &mut self.bins {
            *bin = BinEntry::default();
        }
    }

    /// Get a bin entry
    ///
    /// Cannot panic: a `u8` index is always within the `NUM_BINS`-long array.
    #[inline]
    pub fn get(&self, bin: u8) -> &BinEntry {
        &self.bins[usize::from(bin)]
    }

    /// Get a mutable bin entry
    ///
    /// Cannot panic: a `u8` index is always within the `NUM_BINS`-long array.
    #[inline]
    pub fn get_mut(&mut self, bin: u8) -> &mut BinEntry {
        &mut self.bins[usize::from(bin)]
    }

    /// Accumulate gradient/hessian into a bin
    ///
    /// Forwards one sample to `BinEntry::accumulate` on the selected bin.
    #[inline]
    pub fn accumulate(&mut self, bin: u8, gradient: f32, hessian: f32) {
        // Plain indexing is enough here: `bin` is a `u8`, so the index can never
        // reach `NUM_BINS` (256) and no `unsafe` is needed to stay in bounds.
        self.bins[usize::from(bin)].accumulate(gradient, hessian);
    }

    /// Batch accumulate multiple samples into the histogram
    ///
    /// Sample `i` contributes `gradients[i]` / `hessians[i]` to bin `bins[i]`.
    /// Samples are applied strictly in input order, so the result is identical
    /// to calling [`Histogram::accumulate`] once per sample. The input is walked
    /// in fixed-size groups of 8 samples, followed by a tail of fewer than 8.
    ///
    /// # Panics
    /// Panics if `gradients` or `hessians` does not have the same length as
    /// `bins`. The check is active in release builds as well.
    #[inline]
    pub fn accumulate_batch(&mut self, bins: &[u8], gradients: &[f32], hessians: &[f32]) {
        // Hard asserts (not debug-only): a length mismatch must be rejected in
        // every build profile instead of turning into an out-of-bounds read.
        assert_eq!(
            bins.len(),
            gradients.len(),
            "bins and gradients must have the same length"
        );
        assert_eq!(
            bins.len(),
            hessians.len(),
            "bins and hessians must have the same length"
        );

        const LANES: usize = 8;

        let bin_chunks = bins.chunks_exact(LANES);
        let grad_chunks = gradients.chunks_exact(LANES);
        let hess_chunks = hessians.chunks_exact(LANES);

        // The tails are known up front; grab them before the chunk iterators
        // are consumed by the main loop.
        let bin_tail = bin_chunks.remainder();
        let grad_tail = grad_chunks.remainder();
        let hess_tail = hess_chunks.remainder();

        // Main loop: every chunk is exactly LANES samples long, which gives the
        // optimizer a fixed trip count for the inner loop.
        for ((bin_chunk, grad_chunk), hess_chunk) in bin_chunks.zip(grad_chunks).zip(hess_chunks) {
            for ((&bin, &grad), &hess) in bin_chunk.iter().zip(grad_chunk).zip(hess_chunk) {
                self.bins[usize::from(bin)].accumulate(grad, hess);
            }
        }

        // Handle remainder (fewer than LANES samples)
        for ((&bin, &grad), &hess) in bin_tail.iter().zip(grad_tail).zip(hess_tail) {
            self.bins[usize::from(bin)].accumulate(grad, hess);
        }
    }

    /// Merge another histogram into this one
    ///
    /// Every bin of `other` is merged into the bin with the same index.
    /// The implementation is selected at compile time: builds with AVX2 enabled
    /// go through `merge_simd`, all others through `merge_scalar`. Both paths
    /// currently execute the same scalar loop.
    #[inline]
    pub fn merge(&mut self, other: &Histogram) {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
        {
            self.merge_simd(other);
        }
        #[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
        {
            self.merge_scalar(other);
        }
    }

    /// Scalar merge implementation
    ///
    /// Calls `BinEntry::merge` pairwise over the two bin arrays.
    #[inline]
    fn merge_scalar(&mut self, other: &Histogram) {
        for (self_bin, other_bin) in self.bins.iter_mut().zip(other.bins.iter()) {
            self_bin.merge(other_bin);
        }
    }

    /// Merge entry point for AVX2 builds
    ///
    /// BinEntry layout: [sum_gradients: f32, sum_hessians: f32, count: u32]
    ///
    /// No hand-written intrinsics are used: the fields are interleaved and mix
    /// float and integer addition, so this delegates to the scalar loop and
    /// leaves vectorization to the compiler.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    #[inline]
    fn merge_simd(&mut self, other: &Histogram) {
        self.merge_scalar(other);
    }

    /// Subtract another histogram from this one
    ///
    /// Used to compute sibling histogram: sibling = parent - child
    ///
    /// Dispatches at compile time exactly like [`Histogram::merge`]; both paths
    /// currently execute the same scalar loop.
    #[inline]
    pub fn subtract(&mut self, other: &Histogram) {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
        {
            self.subtract_simd(other);
        }
        #[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
        {
            self.subtract_scalar(other);
        }
    }

    /// Scalar subtract implementation
    ///
    /// Calls `BinEntry::subtract` pairwise over the two bin arrays.
    #[inline]
    fn subtract_scalar(&mut self, other: &Histogram) {
        for (self_bin, other_bin) in self.bins.iter_mut().zip(other.bins.iter()) {
            self_bin.subtract(other_bin);
        }
    }

    /// Subtract entry point for AVX2 builds
    ///
    /// Delegates to the scalar loop, for the same mixed-type reason as the
    /// merge path.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    #[inline]
    fn subtract_simd(&mut self, other: &Histogram) {
        self.subtract_scalar(other);
    }

    /// Compute histogram by subtracting child from parent
    ///
    /// Neither input is modified; the result starts as a clone of `parent`.
    ///
    /// Returns: parent - child (the sibling histogram)
    pub fn from_subtraction(parent: &Histogram, child: &Histogram) -> Self {
        let mut result = parent.clone();
        result.subtract(child);
        result
    }

    /// Get total gradient sum across all bins
    pub fn total_gradient(&self) -> f32 {
        self.bins.iter().map(|b| b.sum_gradients).sum()
    }

    /// Get total hessian sum across all bins
    pub fn total_hessian(&self) -> f32 {
        self.bins.iter().map(|b| b.sum_hessians).sum()
    }

    /// Get total count across all bins
    ///
    /// The sum is carried out in `u32`.
    pub fn total_count(&self) -> u32 {
        self.bins.iter().map(|b| b.count).sum()
    }

    /// Get totals (gradient, hessian, count) across all bins
    ///
    /// Single pass over the bins, accumulating all three sums at once.
    pub fn totals(&self) -> (f32, f32, u32) {
        let mut g = 0.0f32;
        let mut h = 0.0f32;
        let mut n = 0u32;
        for bin in &self.bins {
            g += bin.sum_gradients;
            h += bin.sum_hessians;
            n += bin.count;
        }
        (g, h, n)
    }

    /// Get sum of gradients per bin as array
    ///
    /// Returns a fresh copy; element `i` is the gradient sum of bin `i`.
    pub fn sum_gradients(&self) -> [f32; NUM_BINS] {
        let mut result = [0.0f32; NUM_BINS];
        for (slot, bin) in result.iter_mut().zip(self.bins.iter()) {
            *slot = bin.sum_gradients;
        }
        result
    }

    /// Get sum of hessians per bin as array
    ///
    /// Returns a fresh copy; element `i` is the hessian sum of bin `i`.
    pub fn sum_hessians(&self) -> [f32; NUM_BINS] {
        let mut result = [0.0f32; NUM_BINS];
        for (slot, bin) in result.iter_mut().zip(self.bins.iter()) {
            *slot = bin.sum_hessians;
        }
        result
    }

    /// Get counts per bin as array
    ///
    /// Returns a fresh copy; element `i` is the sample count of bin `i`.
    pub fn counts(&self) -> [u32; NUM_BINS] {
        let mut result = [0u32; NUM_BINS];
        for (slot, bin) in result.iter_mut().zip(self.bins.iter()) {
            *slot = bin.count;
        }
        result
    }

    /// Iterate over bins
    ///
    /// Yields `(bin_index, entry)` pairs in ascending bin order.
    pub fn iter(&self) -> impl Iterator<Item = (u8, &BinEntry)> {
        // The cast is lossless: there are exactly NUM_BINS (256) entries, so the
        // enumeration index never exceeds 255.
        self.bins.iter().enumerate().map(|(i, b)| (i as u8, b))
    }

    /// Get the raw bins array
    pub fn bins(&self) -> &[BinEntry; NUM_BINS] {
        &self.bins
    }

    /// Get the raw bins array mutably
    #[inline]
    pub fn bins_mut(&mut self) -> &mut [BinEntry; NUM_BINS] {
        &mut self.bins
    }
}

/// Collection of histograms for all features at a node
///
/// Histograms are stored in a `Vec` and addressed by feature index.
#[derive(Debug, Clone)]
pub struct NodeHistograms {
    /// One histogram per feature
    pub(crate) histograms: Vec<Histogram>,
}

impl NodeHistograms {
    /// Build a collection holding `num_features` empty histograms.
    pub fn new(num_features: usize) -> Self {
        let histograms = vec![Histogram::new(); num_features];
        Self { histograms }
    }

    /// Borrow the histogram belonging to `feature_idx`.
    ///
    /// Panics if `feature_idx` is not a valid feature index (plain indexing).
    #[inline]
    pub fn get(&self, feature_idx: usize) -> &Histogram {
        let Self { histograms } = self;
        &histograms[feature_idx]
    }

    /// Mutably borrow the histogram belonging to `feature_idx`.
    ///
    /// Panics if `feature_idx` is not a valid feature index (plain indexing).
    #[inline]
    pub fn get_mut(&mut self, feature_idx: usize) -> &mut Histogram {
        let Self { histograms } = self;
        &mut histograms[feature_idx]
    }

    /// How many per-feature histograms this collection holds.
    pub fn num_features(&self) -> usize {
        let Self { histograms } = self;
        histograms.len()
    }

    /// Reset every histogram in the collection via `Histogram::clear`.
    pub fn clear(&mut self) {
        self.histograms.iter_mut().for_each(Histogram::clear);
    }

    /// Merge `other` into `self`, feature by feature.
    ///
    /// Histograms are paired by position; if the two collections differ in
    /// length, the unmatched trailing histograms are left untouched.
    pub fn merge(&mut self, other: &NodeHistograms) {
        let pairs = self.histograms.iter_mut().zip(&other.histograms);
        pairs.for_each(|(dst, src)| dst.merge(src));
    }

    /// Subtract `other` from `self`, feature by feature.
    ///
    /// Histograms are paired by position; if the two collections differ in
    /// length, the unmatched trailing histograms are left untouched.
    pub fn subtract(&mut self, other: &NodeHistograms) {
        let pairs = self.histograms.iter_mut().zip(&other.histograms);
        pairs.for_each(|(dst, src)| dst.subtract(src));
    }

    /// Build the sibling collection as `parent - child`, feature by feature.
    ///
    /// The result has as many histograms as the shorter of the two inputs.
    pub fn from_subtraction(parent: &NodeHistograms, child: &NodeHistograms) -> Self {
        let paired = parent.histograms.len().min(child.histograms.len());
        let mut histograms = Vec::with_capacity(paired);
        for (parent_hist, child_hist) in parent.histograms.iter().zip(&child.histograms) {
            histograms.push(Histogram::from_subtraction(parent_hist, child_hist));
        }
        Self { histograms }
    }

    /// Walk the collection as `(feature_index, histogram)` pairs, in index order.
    pub fn iter(&self) -> impl Iterator<Item = (usize, &Histogram)> {
        let Self { histograms } = self;
        (0..histograms.len()).zip(histograms.iter())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Repeated accumulation into one bin adds up sums and counts, and the
    /// highest bin index (255) is addressable.
    #[test]
    fn test_histogram_accumulate() {
        let samples = [(0u8, 1.0f32, 2.0f32), (0, 0.5, 1.0), (255, 3.0, 4.0)];

        let mut histogram = Histogram::new();
        for &(bin, gradient, hessian) in &samples {
            histogram.accumulate(bin, gradient, hessian);
        }

        let first = histogram.get(0);
        assert_eq!(first.sum_gradients, 1.5);
        assert_eq!(first.sum_hessians, 3.0);
        assert_eq!(first.count, 2);

        let last = histogram.get(255);
        assert_eq!(last.count, 1);
    }

    /// sibling = parent - child, checked bin by bin.
    #[test]
    fn test_histogram_subtraction_trick() {
        // The parent covers all samples.
        let mut whole = Histogram::new();
        whole.accumulate(0, 10.0, 20.0);
        whole.accumulate(1, 5.0, 10.0);

        // The child covers a subset of them.
        let mut part = Histogram::new();
        part.accumulate(0, 3.0, 6.0);
        part.accumulate(1, 2.0, 4.0);

        let rest = Histogram::from_subtraction(&whole, &part);

        let bin0 = rest.get(0);
        assert_eq!(bin0.sum_gradients, 7.0);
        assert_eq!(bin0.sum_hessians, 14.0);

        let bin1 = rest.get(1);
        assert_eq!(bin1.sum_gradients, 3.0);
    }

    /// Each feature owns an independent histogram inside `NodeHistograms`.
    #[test]
    fn test_node_histograms() {
        let mut per_feature = NodeHistograms::new(3);

        let samples = [(0usize, 5u8, 1.0f32, 2.0f32), (1, 10, 3.0, 4.0), (2, 15, 5.0, 6.0)];
        for &(feature, bin, gradient, hessian) in &samples {
            per_feature.get_mut(feature).accumulate(bin, gradient, hessian);
        }

        assert_eq!(per_feature.num_features(), 3);
        assert_eq!(per_feature.get(0).get(5).sum_gradients, 1.0);
        assert_eq!(per_feature.get(1).get(10).sum_gradients, 3.0);
    }
}