xgboost-rs 0.1.31

Bindings to dmlc/xgboost.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
//! `BoosterParameters` for controlling tree boosters.
//!
//!
use std::default::Default;

use derive_builder::Builder;

use super::Interval;

/// The tree construction algorithm used in `XGBoost` (see description in the
/// [reference paper](http://arxiv.org/abs/1603.02754)).
///
/// Distributed and external memory version only support approximate algorithm.
#[derive(Clone, Default)]
pub enum TreeMethod {
    /// Use heuristic to choose faster one.
    ///
    /// * For small to medium dataset, exact greedy will be used.
    /// * For very large-dataset, approximate algorithm will be chosen.
    /// * Because old behavior is always use exact greedy in single machine, user will get a message when
    ///   approximate algorithm is chosen to notify this choice.
    #[default]
    Auto,

    /// Exact greedy algorithm.
    Exact,

    /// Approximate greedy algorithm using sketching and histogram.
    Approx,

    /// Fast histogram optimized approximate greedy algorithm. It uses some performance improvements
    /// such as bins caching.
    Hist,

    /// GPU implementation of exact algorithm.
    GpuExact,

    /// GPU implementation of hist algorithm.
    GpuHist,
}

impl ToString for TreeMethod {
    fn to_string(&self) -> String {
        match *self {
            TreeMethod::Auto => "auto".to_owned(),
            TreeMethod::Exact => "exact".to_owned(),
            TreeMethod::Approx => "approx".to_owned(),
            TreeMethod::Hist => "hist".to_owned(),
            TreeMethod::GpuExact => "gpu_exact".to_owned(),
            TreeMethod::GpuHist => "gpu_hist".to_owned(),
        }
    }
}

impl From<String> for TreeMethod {
    fn from(s: String) -> Self {
        use std::borrow::Borrow;
        Self::from(s.borrow())
    }
}

impl<'a> From<&'a str> for TreeMethod {
    fn from(s: &'a str) -> Self {
        match s {
            "auto" => TreeMethod::Auto,
            "exact" => TreeMethod::Exact,
            "approx" => TreeMethod::Approx,
            "hist" => TreeMethod::Hist,
            "gpu_exact" => TreeMethod::GpuExact,
            "gpu_hist" => TreeMethod::GpuHist,
            _ => panic!("no known tree_method for {s}"),
        }
    }
}

/// Provides a modular way to construct and to modify the trees. This is an advanced parameter that is usually set
/// automatically, depending on some other parameters. However, it could be also set explicitly by a user.
#[derive(Clone)]
pub enum TreeUpdater {
    /// Non-distributed column-based construction of trees.
    GrowColMaker,

    /// Distributed tree construction with column-based data splitting mode.
    DistCol,

    /// Distributed tree construction with row-based data splitting based on global proposal of histogram counting.
    GrowHistMaker,

    /// Based on local histogram counting.
    GrowLocalHistMaker,

    /// Uses the approximate sketching algorithm.
    GrowSkMaker,

    /// Synchronizes trees in all distributed nodes.
    Sync,

    /// Refreshes tree’s statistics and/or leaf values based on the current data.
    /// Note that no random subsampling of data rows is performed.
    Refresh,

    /// Prunes the splits where loss < min_split_loss (or gamma).
    Prune,
}

impl ToString for TreeUpdater {
    fn to_string(&self) -> String {
        match *self {
            TreeUpdater::GrowColMaker => "grow_colmaker".to_owned(),
            TreeUpdater::DistCol => "distcol".to_owned(),
            TreeUpdater::GrowHistMaker => "grow_histmaker".to_owned(),
            TreeUpdater::GrowLocalHistMaker => "grow_local_histmaker".to_owned(),
            TreeUpdater::GrowSkMaker => "grow_skmaker".to_owned(),
            TreeUpdater::Sync => "sync".to_owned(),
            TreeUpdater::Refresh => "refresh".to_owned(),
            TreeUpdater::Prune => "prune".to_owned(),
        }
    }
}

/// A type of boosting process to run.
#[derive(Clone, Default)]
pub enum ProcessType {
    /// The normal boosting process which creates new trees.
    #[default]
    Default,

    /// Starts from an existing model and only updates its trees. In each boosting iteration,
    /// a tree from the initial model is taken, a specified sequence of updater plugins is run for that tree,
    /// and a modified tree is added to the new model. The new model would have either the same or smaller number of
    /// trees, depending on the number of boosting iteratons performed.
    /// Currently, the following built-in updater plugins could be meaningfully used with this process type:
    /// 'refresh', 'prune'. With 'update', one cannot use updater plugins that create new trees.
    Update,
}

impl ToString for ProcessType {
    fn to_string(&self) -> String {
        match *self {
            ProcessType::Default => "default".to_owned(),
            ProcessType::Update => "update".to_owned(),
        }
    }
}

/// Controls the way new nodes are added to the tree.
#[derive(Clone, Default)]
pub enum GrowPolicy {
    /// Split at nodes closest to the root.
    #[default]
    Depthwise,

    /// Split at noeds with highest loss change.
    LossGuide,
}

impl ToString for GrowPolicy {
    fn to_string(&self) -> String {
        match *self {
            GrowPolicy::Depthwise => "depthwise".to_owned(),
            GrowPolicy::LossGuide => "lossguide".to_owned(),
        }
    }
}

/// The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
#[derive(Clone, Default)]
pub enum Predictor {
    /// Multicore CPU prediction algorithm.
    #[default]
    Cpu,

    /// Prediction using GPU. Default for ‘gpu_exact’ and ‘gpu_hist’ tree method.
    Gpu,
}

impl ToString for Predictor {
    fn to_string(&self) -> String {
        match *self {
            Predictor::Cpu => "cpu_predictor".to_owned(),
            Predictor::Gpu => "gpu_predictor".to_owned(),
        }
    }
}

/// `BoosterParameters` for Tree Booster. Create using
/// [`TreeBoosterParametersBuilder`](struct.TreeBoosterParametersBuilder.html).
#[derive(Builder, Clone)]
#[builder(build_fn(validate = "Self::validate"))]
#[builder(default)]
pub struct TreeBoosterParameters {
    /// Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly
    /// get the weights of new features, and eta actually shrinks the feature weights to make the boosting process
    /// more conservative.
    ///
    /// * range: [0.0, 1.0]
    /// * default: 0.3
    eta: f32,

    /// Minimum loss reduction required to make a further partition on a leaf node of the tree.
    /// The larger, the more conservative the algorithm will be.
    ///
    /// * range: [0,∞]
    /// * default: 0
    gamma: f32,

    /// Maximum depth of a tree, increase this value will make the model more complex / likely to be overfitting.
    /// 0 indicates no limit, limit is required for depth-wise grow policy.
    ///
    /// * range: [0,∞]
    /// * default: 6
    max_depth: u32,

    /// Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf
    /// node with the sum of instance weight less than min_child_weight, then the building process will give up
    /// further partitioning.
    /// In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node.
    /// The larger, the more conservative the algorithm will be.
    ///
    /// * range: [0,∞]
    /// * default: 1
    min_child_weight: f32,

    /// Maximum delta step we allow each tree’s weight estimation to be.
    /// If the value is set to 0, it means there is no constraint. If it is set to a positive value,
    /// it can help making the update step more conservative. Usually this parameter is not needed,
    /// but it might help in logistic regression when class is extremely imbalanced.
    /// Set it to value of 1-10 might help control the update.
    ///
    /// * range: [0,∞]
    /// * default: 0
    max_delta_step: f32,

    /// Subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half
    /// of the data instances to grow trees and this will prevent overfitting.
    ///
    /// * range: (0, 1]
    /// * default: 1.0
    subsample: f32,

    /// Subsample ratio of columns when constructing each tree.
    ///
    /// * range: (0.0, 1.0]
    /// * default: 1.0
    colsample_bytree: f32,

    /// Subsample ratio of columns for each split, in each level.
    ///
    /// * range: (0.0, 1.0]
    /// * default: 1.0
    colsample_bylevel: f32,

    /// Subsample ratio of columns for each node.
    ///
    /// * range: (0.0, 1.0]
    /// * default: 1.0
    colsample_bynode: f32,

    /// L2 regularization term on weights, increase this value will make model more conservative.
    ///
    /// * default: 1
    lambda: f32,

    /// L1 regularization term on weights, increase this value will make model more conservative.
    ///
    /// * default: 0
    alpha: f32,

    /// The tree construction algorithm used in XGBoost.
    #[builder(default = "TreeMethod::default()")]
    tree_method: TreeMethod,

    /// This is only used for approximate greedy algorithm.
    /// This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select number of bins,
    /// this comes with theoretical guarantee with sketch accuracy.
    /// Usually user does not have to tune this. but consider setting to a lower number for more accurate enumeration.
    ///
    /// * range: (0.0, 1.0)
    /// * default: 0.03
    sketch_eps: f32,

    /// Control the balance of positive and negative weights, useful for unbalanced classes.
    /// A typical value to consider: sum(negative cases) / sum(positive cases).
    ///
    /// default: 1.0
    scale_pos_weight: f32,

    /// Sequence of tree updaters to run, providing a modular way to construct and to modify the trees.
    ///
    /// * default: vec![]
    updater: Vec<TreeUpdater>,

    /// This is a parameter of the ‘refresh’ updater plugin. When this flag is true, tree leafs as well as tree nodes'
    /// stats are updated. When it is false, only node stats are updated.
    ///
    /// * default: true
    refresh_leaf: bool,

    /// A type of boosting process to run.
    ///
    /// * default: ProcessType::Default
    process_type: ProcessType,

    /// Controls a way new nodes are added to the tree.  Currently supported only if tree_method is set to 'hist'.
    grow_policy: GrowPolicy,

    /// Maximum number of nodes to be added. Only relevant for the `GrowPolicy::LossGuide` grow
    /// policy.
    ///
    /// * default: 0
    max_leaves: u32,

    /// This is only used if 'hist' is specified as tree_method.
    /// Maximum number of discrete bins to bucket continuous features.
    /// Increasing this number improves the optimality of splits at the cost of higher computation time.
    ///
    /// * default: 256
    max_bin: u32,

    /// Number of trees to train in parallel for boosted random forest.
    ///
    /// * default: 1
    num_parallel_tree: u32,

    /// The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
    ///
    /// * default: [`Predictor::Cpu`](enum.Predictor.html#variant.Cpu)
    predictor: Predictor,
}

impl Default for TreeBoosterParameters {
    fn default() -> Self {
        TreeBoosterParameters {
            eta: 0.3,
            gamma: 0.0,
            max_depth: 6,
            min_child_weight: 1.0,
            max_delta_step: 0.0,
            subsample: 1.0,
            colsample_bytree: 1.0,
            colsample_bylevel: 1.0,
            colsample_bynode: 1.0,
            lambda: 1.0,
            alpha: 0.0,
            tree_method: TreeMethod::default(),
            sketch_eps: 0.03,
            scale_pos_weight: 1.0,
            updater: Vec::new(),
            refresh_leaf: true,
            process_type: ProcessType::default(),
            grow_policy: GrowPolicy::default(),
            max_leaves: 0,
            max_bin: 256,
            num_parallel_tree: 1,
            predictor: Predictor::default(),
        }
    }
}

impl TreeBoosterParameters {
    pub(crate) fn as_string_pairs(&self) -> Vec<(String, String)> {
        let mut v = vec![
            ("booster".to_owned(), "gbtree".to_owned()),
            ("eta".to_owned(), self.eta.to_string()),
            ("gamma".to_owned(), self.gamma.to_string()),
            ("max_depth".to_owned(), self.max_depth.to_string()),
            (
                "min_child_weight".to_owned(),
                self.min_child_weight.to_string(),
            ),
            ("max_delta_step".to_owned(), self.max_delta_step.to_string()),
            ("subsample".to_owned(), self.subsample.to_string()),
            (
                "colsample_bytree".to_owned(),
                self.colsample_bytree.to_string(),
            ),
            (
                "colsample_bylevel".to_owned(),
                self.colsample_bylevel.to_string(),
            ),
            (
                "colsample_bynode".to_owned(),
                self.colsample_bynode.to_string(),
            ),
            ("lambda".to_owned(), self.lambda.to_string()),
            ("alpha".to_owned(), self.alpha.to_string()),
            ("tree_method".to_owned(), self.tree_method.to_string()),
            ("sketch_eps".to_owned(), self.sketch_eps.to_string()),
            (
                "scale_pos_weight".to_owned(),
                self.scale_pos_weight.to_string(),
            ),
            (
                "refresh_leaf".to_owned(),
                (u8::from(self.refresh_leaf)).to_string(),
            ),
            ("process_type".to_owned(), self.process_type.to_string()),
            ("grow_policy".to_owned(), self.grow_policy.to_string()),
            ("max_leaves".to_owned(), self.max_leaves.to_string()),
            ("max_bin".to_owned(), self.max_bin.to_string()),
            (
                "num_parallel_tree".to_owned(),
                self.num_parallel_tree.to_string(),
            ),
            ("predictor".to_owned(), self.predictor.to_string()),
        ];

        // Don't pass anything to XGBoost if the user didn't specify anything.
        // This allows XGBoost to figure it out on it's own, and suppresses the
        // warning message during training.
        // See: https://github.com/davechallis/rust-xgboost/issues/7
        if !self.updater.is_empty() {
            v.push((
                "updater".to_owned(),
                self.updater
                    .iter()
                    .map(std::string::ToString::to_string)
                    .collect::<Vec<String>>()
                    .join(","),
            ));
        }

        v
    }
}

impl TreeBoosterParametersBuilder {
    fn validate(&self) -> Result<(), String> {
        Interval::new_closed_closed(0.0, 1.0).validate(&self.eta, "eta")?;
        Interval::new_open_closed(0.0, 1.0).validate(&self.subsample, "subsample")?;
        Interval::new_open_closed(0.0, 1.0).validate(&self.colsample_bytree, "colsample_bytree")?;
        Interval::new_open_closed(0.0, 1.0)
            .validate(&self.colsample_bylevel, "colsample_bylevel")?;
        Interval::new_open_closed(0.0, 1.0).validate(&self.colsample_bynode, "colsample_bynode")?;
        Interval::new_open_open(0.0, 1.0).validate(&self.sketch_eps, "sketch_eps")?;
        Ok(())
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn tree_params() {
        let error_margin = f32::EPSILON;
        let p = TreeBoosterParameters::default();
        assert!((p.eta - 0.3).abs() < error_margin);
        let p = TreeBoosterParametersBuilder::default().build().unwrap();
        assert!((p.eta - 0.3).abs() < error_margin);
    }
}