use ferrolearn_core::error::FerroError;
use ferrolearn_core::introspection::{HasClasses, HasFeatureImportances};
use ferrolearn_core::pipeline::{FittedPipelineEstimator, PipelineEstimator};
use ferrolearn_core::traits::{Fit, Predict};
use ndarray::{Array1, Array2};
use num_traits::{Float, FromPrimitive, ToPrimitive};
use serde::{Deserialize, Serialize};
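/// Loss functions supported by [`HistGradientBoostingRegressor`].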
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HistRegressionLoss {
LeastSquares,
LeastAbsoluteDeviation,
}
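/// Loss functions supported by [`HistGradientBoostingClassifier`].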
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HistClassificationLoss {
LogLoss,
}
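/// A node of a fitted histogram tree. Nodes are stored in a flat `Vec` and
/// children are addressed by index, which keeps trees compact and
/// straightforward to serialize.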
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HistNode<F> {
Split {
feature: usize,
threshold_bin: u16,
nan_goes_left: bool,
left: usize,
right: usize,
gain: F,
n_samples: usize,
},
Leaf {
value: F,
n_samples: usize,
},
}
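/// Sentinel bin index reserved for NaN (missing) values.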
const NAN_BIN: u16 = u16::MAX;
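/// Per-feature binning metadata: the inclusive upper edge of each bin, the
/// bin count, and whether the feature contained NaN during fitting.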
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FeatureBinInfo<F> {
edges: Vec<F>,
n_bins: u16,
has_nan: bool,
}
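/// Computes per-feature bin edges by placing up to `max_bins` quantile cuts
/// over the sorted unique non-NaN values of each column.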
fn compute_bin_edges<F: Float>(x: &Array2<F>, max_bins: u16) -> Vec<FeatureBinInfo<F>> {
let n_features = x.ncols();
let n_samples = x.nrows();
let mut infos = Vec::with_capacity(n_features);
for j in 0..n_features {
let col = x.column(j);
let mut vals: Vec<F> = col.iter().copied().filter(|v| !v.is_nan()).collect();
vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let has_nan = vals.len() < n_samples;
if vals.is_empty() {
infos.push(FeatureBinInfo {
edges: Vec::new(),
n_bins: 0,
has_nan: true,
});
continue;
}
let mut unique: Vec<F> = Vec::new();
for &v in &vals {
if unique.is_empty() || (v - *unique.last().unwrap()).abs() > F::epsilon() {
unique.push(v);
}
}
let n_unique = unique.len();
let actual_bins = (max_bins as usize).min(n_unique);
if actual_bins <= 1 {
infos.push(FeatureBinInfo {
edges: vec![unique[0]],
n_bins: 1,
has_nan,
});
continue;
}
let mut edges = Vec::with_capacity(actual_bins);
for b in 1..actual_bins {
let frac = b as f64 / actual_bins as f64;
let idx_f = frac * (n_unique as f64 - 1.0);
let lo = idx_f.floor() as usize;
let hi = (lo + 1).min(n_unique - 1);
let t = F::from(idx_f - lo as f64).unwrap();
let edge = unique[lo] * (F::one() - t) + unique[hi] * t;
if edges.is_empty() || (edge - *edges.last().unwrap()).abs() > F::epsilon() {
edges.push(edge);
}
}
let last = *unique.last().unwrap();
if edges.is_empty() || (last - *edges.last().unwrap()).abs() > F::epsilon() {
edges.push(last);
}
let n_bins = edges.len() as u16;
infos.push(FeatureBinInfo {
edges,
n_bins,
has_nan,
});
}
infos
}
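/// Maps one raw feature value to its bin index by binary-searching the bin
/// edges; NaN (and features with no bins at all) map to [`NAN_BIN`].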
#[inline]
fn map_to_bin<F: Float>(value: F, info: &FeatureBinInfo<F>) -> u16 {
if value.is_nan() {
return NAN_BIN;
}
if info.n_bins == 0 {
return NAN_BIN;
}
let edges = &info.edges;
let mut lo: usize = 0;
let mut hi: usize = edges.len();
while lo < hi {
let mid = lo + (hi - lo) / 2;
if edges[mid] < value {
lo = mid + 1;
} else {
hi = mid;
}
}
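// `value` exceeds every edge: clamp to the last (highest) bin.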
if lo >= edges.len() {
(info.n_bins - 1).min(edges.len() as u16 - 1)
} else {
lo as u16
}
}
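/// Bins every value of `x`, producing one `Vec<u16>` of bin indices per
/// sample row.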
fn bin_data<F: Float>(x: &Array2<F>, bin_infos: &[FeatureBinInfo<F>]) -> Vec<Vec<u16>> {
let (n_samples, n_features) = x.dim();
let mut bins = vec![vec![0u16; n_features]; n_samples];
for j in 0..n_features {
for i in 0..n_samples {
bins[i][j] = map_to_bin(x[[i, j]], &bin_infos[j]);
}
}
bins
}
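/// Gradient/hessian/count statistics accumulated for one histogram bin.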
#[derive(Debug, Clone, Copy, Default)]
struct BinEntry<F> {
grad_sum: F,
hess_sum: F,
count: usize,
}
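/// Histogram for a single feature: one entry per bin plus a dedicated entry
/// for NaN samples.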
struct FeatureHistogram<F> {
bins: Vec<BinEntry<F>>,
nan_entry: BinEntry<F>,
}
impl<F: Float> FeatureHistogram<F> {
fn new(n_bins: u16) -> Self {
Self {
bins: vec![
BinEntry {
grad_sum: F::zero(),
hess_sum: F::zero(),
count: 0,
};
n_bins as usize
],
nan_entry: BinEntry {
grad_sum: F::zero(),
hess_sum: F::zero(),
count: 0,
},
}
}
}
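/// Accumulates the gradients and hessians of `sample_indices` into one
/// histogram per feature in a single pass over the binned data.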
fn build_histograms<F: Float>(
bin_data: &[Vec<u16>],
gradients: &[F],
hessians: &[F],
sample_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
) -> Vec<FeatureHistogram<F>> {
let n_features = bin_infos.len();
let mut histograms: Vec<FeatureHistogram<F>> = bin_infos
.iter()
.map(|info| FeatureHistogram::new(info.n_bins))
.collect();
for &i in sample_indices {
let g = gradients[i];
let h = hessians[i];
for j in 0..n_features {
let b = bin_data[i][j];
if b == NAN_BIN {
histograms[j].nan_entry.grad_sum = histograms[j].nan_entry.grad_sum + g;
histograms[j].nan_entry.hess_sum = histograms[j].nan_entry.hess_sum + h;
histograms[j].nan_entry.count += 1;
} else {
let entry = &mut histograms[j].bins[b as usize];
entry.grad_sum = entry.grad_sum + g;
entry.hess_sum = entry.hess_sum + h;
entry.count += 1;
}
}
}
histograms
}
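/// The histogram-subtraction trick: a child's histogram equals its parent's
/// minus its sibling's, so only the smaller child needs a fresh pass over
/// the data.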
fn subtract_histograms<F: Float>(
parent: &[FeatureHistogram<F>],
sibling: &[FeatureHistogram<F>],
bin_infos: &[FeatureBinInfo<F>],
) -> Vec<FeatureHistogram<F>> {
let n_features = bin_infos.len();
let mut result: Vec<FeatureHistogram<F>> = bin_infos
.iter()
.map(|info| FeatureHistogram::new(info.n_bins))
.collect();
for j in 0..n_features {
let n_bins = bin_infos[j].n_bins as usize;
for b in 0..n_bins {
result[j].bins[b].grad_sum = parent[j].bins[b].grad_sum - sibling[j].bins[b].grad_sum;
result[j].bins[b].hess_sum = parent[j].bins[b].hess_sum - sibling[j].bins[b].hess_sum;
result[j].bins[b].count = parent[j].bins[b]
.count
.saturating_sub(sibling[j].bins[b].count);
}
result[j].nan_entry.grad_sum = parent[j].nan_entry.grad_sum - sibling[j].nan_entry.grad_sum;
result[j].nan_entry.hess_sum = parent[j].nan_entry.hess_sum - sibling[j].nan_entry.hess_sum;
result[j].nan_entry.count = parent[j]
.nan_entry
.count
.saturating_sub(sibling[j].nan_entry.count);
}
result
}
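/// The best split found for a node: the feature, the bin threshold, the
/// gain, and which side receives NaN samples.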
struct SplitCandidate<F> {
feature: usize,
threshold_bin: u16,
gain: F,
nan_goes_left: bool,
}
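/// Scans each feature histogram left to right, scoring every candidate
/// threshold with the second-order gain
/// `G_L^2/(H_L + l2) + G_R^2/(H_R + l2) - G^2/(H + l2)` and trying the NaN
/// bucket on both sides of the split.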
fn find_best_split_from_histograms<F: Float>(
histograms: &[FeatureHistogram<F>],
bin_infos: &[FeatureBinInfo<F>],
total_grad: F,
total_hess: F,
total_count: usize,
l2_regularization: F,
min_samples_leaf: usize,
) -> Option<SplitCandidate<F>> {
let n_features = bin_infos.len();
let parent_gain = total_grad * total_grad / (total_hess + l2_regularization);
let mut best: Option<SplitCandidate<F>> = None;
for j in 0..n_features {
let n_bins = bin_infos[j].n_bins as usize;
if n_bins <= 1 {
continue;
}
let nan = &histograms[j].nan_entry;
let mut left_grad = F::zero();
let mut left_hess = F::zero();
let mut left_count: usize = 0;
for b in 0..(n_bins - 1) {
let entry = &histograms[j].bins[b];
left_grad = left_grad + entry.grad_sum;
left_hess = left_hess + entry.hess_sum;
left_count += entry.count;
let right_grad_no_nan = total_grad - left_grad - nan.grad_sum;
let right_hess_no_nan = total_hess - left_hess - nan.hess_sum;
let right_count_no_nan = total_count
.saturating_sub(left_count)
.saturating_sub(nan.count);
{
let lg = left_grad + nan.grad_sum;
let lh = left_hess + nan.hess_sum;
let lc = left_count + nan.count;
let rg = right_grad_no_nan;
let rh = right_hess_no_nan;
let rc = right_count_no_nan;
if lc >= min_samples_leaf && rc >= min_samples_leaf {
let gain = lg * lg / (lh + l2_regularization)
+ rg * rg / (rh + l2_regularization)
- parent_gain;
if gain > F::zero() {
let better = match &best {
None => true,
Some(curr) => gain > curr.gain,
};
if better {
best = Some(SplitCandidate {
feature: j,
threshold_bin: b as u16,
gain,
nan_goes_left: true,
});
}
}
}
}
{
let lg = left_grad;
let lh = left_hess;
let lc = left_count;
let rg = right_grad_no_nan + nan.grad_sum;
let rh = right_hess_no_nan + nan.hess_sum;
let rc = right_count_no_nan + nan.count;
if lc >= min_samples_leaf && rc >= min_samples_leaf {
let gain = lg * lg / (lh + l2_regularization)
+ rg * rg / (rh + l2_regularization)
- parent_gain;
if gain > F::zero() {
let better = match &best {
None => true,
Some(curr) => gain > curr.gain,
};
if better {
best = Some(SplitCandidate {
feature: j,
threshold_bin: b as u16,
gain,
nan_goes_left: false,
});
}
}
}
}
}
}
best
}
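/// Newton-step leaf weight `-G / (H + l2)`, guarded against a vanishing
/// hessian sum.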
#[inline]
fn compute_leaf_value<F: Float>(grad_sum: F, hess_sum: F, l2_reg: F) -> F {
if hess_sum.abs() < F::epsilon() {
F::zero()
} else {
-grad_sum / (hess_sum + l2_reg)
}
}
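/// Tree-growing hyperparameters shared by both growth strategies.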
struct HistTreeParams<F> {
max_depth: Option<usize>,
min_samples_leaf: usize,
max_leaf_nodes: Option<usize>,
l2_regularization: F,
}
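/// Grows one tree: best-first (leaf-wise) when `max_leaf_nodes` is set,
/// otherwise depth-first recursion.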
fn build_hist_tree<F: Float>(
binned: &[Vec<u16>],
gradients: &[F],
hessians: &[F],
sample_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
params: &HistTreeParams<F>,
) -> Vec<HistNode<F>> {
if sample_indices.is_empty() {
return vec![HistNode::Leaf {
value: F::zero(),
n_samples: 0,
}];
}
if params.max_leaf_nodes.is_some() {
build_hist_tree_best_first(
binned,
gradients,
hessians,
sample_indices,
bin_infos,
params,
)
} else {
let mut nodes = Vec::new();
let hist = build_histograms(binned, gradients, hessians, sample_indices, bin_infos);
build_hist_tree_recursive(
binned,
gradients,
hessians,
sample_indices,
bin_infos,
params,
&hist,
0,
&mut nodes,
);
nodes
}
}
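/// Depth-first growth; returns the index of the subtree root in `nodes`.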
#[allow(clippy::too_many_arguments)]
fn build_hist_tree_recursive<F: Float>(
binned: &[Vec<u16>],
gradients: &[F],
hessians: &[F],
sample_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
params: &HistTreeParams<F>,
histograms: &[FeatureHistogram<F>],
depth: usize,
nodes: &mut Vec<HistNode<F>>,
) -> usize {
let n = sample_indices.len();
let grad_sum: F = sample_indices
.iter()
.map(|&i| gradients[i])
.fold(F::zero(), |a, b| a + b);
let hess_sum: F = sample_indices
.iter()
.map(|&i| hessians[i])
.fold(F::zero(), |a, b| a + b);
let at_max_depth = params.max_depth.is_some_and(|d| depth >= d);
let too_few = n < 2 * params.min_samples_leaf;
if at_max_depth || too_few {
let idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(grad_sum, hess_sum, params.l2_regularization),
n_samples: n,
});
return idx;
}
let split = find_best_split_from_histograms(
histograms,
bin_infos,
grad_sum,
hess_sum,
n,
params.l2_regularization,
params.min_samples_leaf,
);
let split = if let Some(s) = split {
s
} else {
let idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(grad_sum, hess_sum, params.l2_regularization),
n_samples: n,
});
return idx;
};
let (left_indices, right_indices): (Vec<usize>, Vec<usize>) =
sample_indices.iter().partition(|&&i| {
let b = binned[i][split.feature];
if b == NAN_BIN {
split.nan_goes_left
} else {
b <= split.threshold_bin
}
});
if left_indices.is_empty() || right_indices.is_empty() {
let idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(grad_sum, hess_sum, params.l2_regularization),
n_samples: n,
});
return idx;
}
let (small_indices, _large_indices, small_is_left) =
if left_indices.len() <= right_indices.len() {
(&left_indices, &right_indices, true)
} else {
(&right_indices, &left_indices, false)
};
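// Build the smaller child's histogram directly and derive the larger
// child's by subtracting it from the parent histogram.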
let small_hist = build_histograms(binned, gradients, hessians, small_indices, bin_infos);
let large_hist = subtract_histograms(histograms, &small_hist, bin_infos);
let (left_hist, right_hist) = if small_is_left {
(small_hist, large_hist)
} else {
(large_hist, small_hist)
};
let node_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: F::zero(),
n_samples: 0,
});
let left_idx = build_hist_tree_recursive(
binned,
gradients,
hessians,
&left_indices,
bin_infos,
params,
&left_hist,
depth + 1,
nodes,
);
let right_idx = build_hist_tree_recursive(
binned,
gradients,
hessians,
&right_indices,
bin_infos,
params,
&right_hist,
depth + 1,
nodes,
);
nodes[node_idx] = HistNode::Split {
feature: split.feature,
threshold_bin: split.threshold_bin,
nan_goes_left: split.nan_goes_left,
left: left_idx,
right: right_idx,
gain: split.gain,
n_samples: n,
};
node_idx
}
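/// A queued best-first split. `gain` is stored as `f64` so pending tasks
/// can be ordered without extra trait bounds on `F`.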
struct SplitTask {
sample_indices: Vec<usize>,
node_idx: usize,
depth: usize,
gain: f64,
feature: usize,
threshold_bin: u16,
nan_goes_left: bool,
}
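/// Best-first (leaf-wise) growth: repeatedly expands whichever pending leaf
/// has the highest gain until `max_leaf_nodes` leaves exist or no valid
/// split remains.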
fn build_hist_tree_best_first<F: Float>(
binned: &[Vec<u16>],
gradients: &[F],
hessians: &[F],
sample_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
params: &HistTreeParams<F>,
) -> Vec<HistNode<F>> {
let max_leaves = params.max_leaf_nodes.unwrap_or(usize::MAX);
let mut nodes: Vec<HistNode<F>> = Vec::new();
let n = sample_indices.len();
let grad_sum: F = sample_indices
.iter()
.map(|&i| gradients[i])
.fold(F::zero(), |a, b| a + b);
let hess_sum: F = sample_indices
.iter()
.map(|&i| hessians[i])
.fold(F::zero(), |a, b| a + b);
let root_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(grad_sum, hess_sum, params.l2_regularization),
n_samples: n,
});
let root_hist = build_histograms(binned, gradients, hessians, sample_indices, bin_infos);
let root_split = find_best_split_from_histograms(
&root_hist,
bin_infos,
grad_sum,
hess_sum,
n,
params.l2_regularization,
params.min_samples_leaf,
);
let mut pending: Vec<(SplitTask, Vec<FeatureHistogram<F>>)> = Vec::new();
let mut n_leaves: usize = 1;
if let Some(split) = root_split {
let at_max_depth = params.max_depth.is_some_and(|d| d == 0);
if !at_max_depth {
pending.push((
SplitTask {
sample_indices: sample_indices.to_vec(),
node_idx: root_idx,
depth: 0,
gain: split.gain.to_f64().unwrap_or(0.0),
feature: split.feature,
threshold_bin: split.threshold_bin,
nan_goes_left: split.nan_goes_left,
},
root_hist,
));
}
}
while !pending.is_empty() && n_leaves < max_leaves {
let best_idx = pending
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| {
a.0.gain
.partial_cmp(&b.0.gain)
.unwrap_or(std::cmp::Ordering::Equal)
})
.map(|(i, _)| i)
.unwrap();
let (task, parent_hist) = pending.swap_remove(best_idx);
let (left_indices, right_indices): (Vec<usize>, Vec<usize>) =
task.sample_indices.iter().partition(|&&i| {
let b = binned[i][task.feature];
if b == NAN_BIN {
task.nan_goes_left
} else {
b <= task.threshold_bin
}
});
if left_indices.is_empty() || right_indices.is_empty() {
continue;
}
let (small_indices, _large_indices, small_is_left) =
if left_indices.len() <= right_indices.len() {
(&left_indices, &right_indices, true)
} else {
(&right_indices, &left_indices, false)
};
let small_hist = build_histograms(binned, gradients, hessians, small_indices, bin_infos);
let large_hist = subtract_histograms(&parent_hist, &small_hist, bin_infos);
let (left_hist, right_hist) = if small_is_left {
(small_hist, large_hist)
} else {
(large_hist, small_hist)
};
let left_grad: F = left_indices
.iter()
.map(|&i| gradients[i])
.fold(F::zero(), |a, b| a + b);
let left_hess: F = left_indices
.iter()
.map(|&i| hessians[i])
.fold(F::zero(), |a, b| a + b);
let right_grad: F = right_indices
.iter()
.map(|&i| gradients[i])
.fold(F::zero(), |a, b| a + b);
let right_hess: F = right_indices
.iter()
.map(|&i| hessians[i])
.fold(F::zero(), |a, b| a + b);
let left_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(left_grad, left_hess, params.l2_regularization),
n_samples: left_indices.len(),
});
let right_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: compute_leaf_value(right_grad, right_hess, params.l2_regularization),
n_samples: right_indices.len(),
});
nodes[task.node_idx] = HistNode::Split {
feature: task.feature,
threshold_bin: task.threshold_bin,
nan_goes_left: task.nan_goes_left,
left: left_idx,
right: right_idx,
gain: F::from(task.gain).unwrap(),
n_samples: task.sample_indices.len(),
};
n_leaves += 1;
let child_depth = task.depth + 1;
let at_max_depth = params.max_depth.is_some_and(|d| child_depth >= d);
if !at_max_depth && n_leaves < max_leaves {
if left_indices.len() >= 2 * params.min_samples_leaf {
let left_split = find_best_split_from_histograms(
&left_hist,
bin_infos,
left_grad,
left_hess,
left_indices.len(),
params.l2_regularization,
params.min_samples_leaf,
);
if let Some(s) = left_split {
pending.push((
SplitTask {
sample_indices: left_indices,
node_idx: left_idx,
depth: child_depth,
gain: s.gain.to_f64().unwrap_or(0.0),
feature: s.feature,
threshold_bin: s.threshold_bin,
nan_goes_left: s.nan_goes_left,
},
left_hist,
));
}
}
if right_indices.len() >= 2 * params.min_samples_leaf {
let right_split = find_best_split_from_histograms(
&right_hist,
bin_infos,
right_grad,
right_hess,
right_indices.len(),
params.l2_regularization,
params.min_samples_leaf,
);
if let Some(s) = right_split {
pending.push((
SplitTask {
sample_indices: right_indices,
node_idx: right_idx,
depth: child_depth,
gain: s.gain.to_f64().unwrap_or(0.0),
feature: s.feature,
threshold_bin: s.threshold_bin,
nan_goes_left: s.nan_goes_left,
},
right_hist,
));
}
}
}
}
nodes
}
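/// Routes one binned sample from the root to a leaf and returns the leaf's
/// node index.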
#[inline]
fn traverse_hist_tree<F: Float>(nodes: &[HistNode<F>], sample_bins: &[u16]) -> usize {
let mut idx = 0;
loop {
match &nodes[idx] {
HistNode::Split {
feature,
threshold_bin,
nan_goes_left,
left,
right,
..
} => {
let b = sample_bins[*feature];
if b == NAN_BIN {
idx = if *nan_goes_left { *left } else { *right };
} else if b <= *threshold_bin {
idx = *left;
} else {
idx = *right;
}
}
HistNode::Leaf { .. } => return idx,
}
}
}
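/// Sums split gains per feature; callers normalize over all trees.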
fn compute_hist_feature_importances<F: Float>(
nodes: &[HistNode<F>],
n_features: usize,
) -> Array1<F> {
let mut importances = Array1::zeros(n_features);
for node in nodes {
if let HistNode::Split { feature, gain, .. } = node {
importances[*feature] = importances[*feature] + *gain;
}
}
importances
}
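/// Logistic sigmoid.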
fn sigmoid<F: Float>(x: F) -> F {
F::one() / (F::one() + (-x).exp())
}
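/// Row-wise softmax over the per-class score arrays, using the usual
/// max-shift for numerical stability.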
fn softmax_matrix<F: Float>(
f_vals: &[Array1<F>],
n_samples: usize,
n_classes: usize,
) -> Vec<Vec<F>> {
let mut probs: Vec<Vec<F>> = vec![vec![F::zero(); n_samples]; n_classes];
for i in 0..n_samples {
let max_val = (0..n_classes)
.map(|k| f_vals[k][i])
.fold(F::neg_infinity(), |a, b| if b > a { b } else { a });
let mut sum = F::zero();
let mut exps = vec![F::zero(); n_classes];
for k in 0..n_classes {
exps[k] = (f_vals[k][i] - max_val).exp();
sum = sum + exps[k];
}
let eps = F::from(1e-15).unwrap();
if sum < eps {
sum = eps;
}
for k in 0..n_classes {
probs[k][i] = exps[k] / sum;
}
}
probs
}
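/// Histogram-based gradient boosting regressor.
///
/// Feature values are discretized into at most `max_bins` bins up front, so
/// split finding scans fixed-size histograms instead of sorted feature
/// values. `random_state` is stored for API parity; the current
/// implementation is deterministic and does not consume it.
///
/// # Example
///
/// A minimal sketch, marked `ignore` because the enclosing crate path is
/// not fixed here (the `Fit`/`Predict` traits come from
/// `ferrolearn_core::traits`):
///
/// ```ignore
/// use ferrolearn_core::traits::{Fit, Predict};
/// use ndarray::{array, Array2};
///
/// let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
/// let y = array![1.0, 2.0, 3.0, 4.0];
/// let model = HistGradientBoostingRegressor::<f64>::new()
///     .with_n_estimators(50)
///     .with_learning_rate(0.1);
/// let fitted = model.fit(&x, &y).unwrap();
/// let preds = fitted.predict(&x).unwrap();
/// assert_eq!(preds.len(), 4);
/// ```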
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistGradientBoostingRegressor<F> {
pub n_estimators: usize,
pub learning_rate: f64,
pub max_depth: Option<usize>,
pub min_samples_leaf: usize,
pub max_bins: u16,
pub l2_regularization: f64,
pub max_leaf_nodes: Option<usize>,
pub loss: HistRegressionLoss,
pub random_state: Option<u64>,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float> HistGradientBoostingRegressor<F> {
#[must_use]
pub fn new() -> Self {
Self {
n_estimators: 100,
learning_rate: 0.1,
max_depth: None,
min_samples_leaf: 20,
max_bins: 255,
l2_regularization: 0.0,
max_leaf_nodes: Some(31),
loss: HistRegressionLoss::LeastSquares,
random_state: None,
_marker: std::marker::PhantomData,
}
}
#[must_use]
pub fn with_n_estimators(mut self, n: usize) -> Self {
self.n_estimators = n;
self
}
#[must_use]
pub fn with_learning_rate(mut self, lr: f64) -> Self {
self.learning_rate = lr;
self
}
#[must_use]
pub fn with_max_depth(mut self, d: Option<usize>) -> Self {
self.max_depth = d;
self
}
#[must_use]
pub fn with_min_samples_leaf(mut self, n: usize) -> Self {
self.min_samples_leaf = n;
self
}
#[must_use]
pub fn with_max_bins(mut self, bins: u16) -> Self {
self.max_bins = bins;
self
}
#[must_use]
pub fn with_l2_regularization(mut self, reg: f64) -> Self {
self.l2_regularization = reg;
self
}
#[must_use]
pub fn with_max_leaf_nodes(mut self, n: Option<usize>) -> Self {
self.max_leaf_nodes = n;
self
}
#[must_use]
pub fn with_loss(mut self, loss: HistRegressionLoss) -> Self {
self.loss = loss;
self
}
#[must_use]
pub fn with_random_state(mut self, seed: u64) -> Self {
self.random_state = Some(seed);
self
}
}
impl<F: Float> Default for HistGradientBoostingRegressor<F> {
fn default() -> Self {
Self::new()
}
}
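/// A fitted [`HistGradientBoostingRegressor`]: the bin layout, the initial
/// prediction, and one flat node array per boosting round.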
#[derive(Debug, Clone)]
pub struct FittedHistGradientBoostingRegressor<F> {
bin_infos: Vec<FeatureBinInfo<F>>,
init: F,
learning_rate: F,
trees: Vec<Vec<HistNode<F>>>,
n_features: usize,
feature_importances: Array1<F>,
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, Array1<F>>
for HistGradientBoostingRegressor<F>
{
type Fitted = FittedHistGradientBoostingRegressor<F>;
type Error = FerroError;
fn fit(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<FittedHistGradientBoostingRegressor<F>, FerroError> {
let (n_samples, n_features) = x.dim();
if n_samples != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![n_samples],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
if n_samples == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "HistGradientBoostingRegressor requires at least one sample".into(),
});
}
if self.n_estimators == 0 {
return Err(FerroError::InvalidParameter {
name: "n_estimators".into(),
reason: "must be at least 1".into(),
});
}
if self.learning_rate <= 0.0 {
return Err(FerroError::InvalidParameter {
name: "learning_rate".into(),
reason: "must be positive".into(),
});
}
if self.max_bins < 2 {
return Err(FerroError::InvalidParameter {
name: "max_bins".into(),
reason: "must be at least 2".into(),
});
}
let lr = F::from(self.learning_rate).unwrap();
let l2_reg = F::from(self.l2_regularization).unwrap();
let bin_infos = compute_bin_edges(x, self.max_bins);
let binned = bin_data(x, &bin_infos);
let init = match self.loss {
HistRegressionLoss::LeastSquares => {
let sum: F = y.iter().copied().fold(F::zero(), |a, b| a + b);
sum / F::from(n_samples).unwrap()
}
HistRegressionLoss::LeastAbsoluteDeviation => median_f(y),
};
let mut f_vals = Array1::from_elem(n_samples, init);
let all_indices: Vec<usize> = (0..n_samples).collect();
let tree_params = HistTreeParams {
max_depth: self.max_depth,
min_samples_leaf: self.min_samples_leaf,
max_leaf_nodes: self.max_leaf_nodes,
l2_regularization: l2_reg,
};
let mut trees = Vec::with_capacity(self.n_estimators);
for _ in 0..self.n_estimators {
let (gradients, hessians) = match self.loss {
HistRegressionLoss::LeastSquares => {
let grads: Vec<F> = (0..n_samples).map(|i| -(y[i] - f_vals[i])).collect();
let hess: Vec<F> = vec![F::one(); n_samples];
(grads, hess)
}
HistRegressionLoss::LeastAbsoluteDeviation => {
let grads: Vec<F> = (0..n_samples)
.map(|i| {
let diff = y[i] - f_vals[i];
if diff > F::zero() {
-F::one()
} else if diff < F::zero() {
F::one()
} else {
F::zero()
}
})
.collect();
let hess: Vec<F> = vec![F::one(); n_samples];
(grads, hess)
}
};
let tree = build_hist_tree(
&binned,
&gradients,
&hessians,
&all_indices,
&bin_infos,
&tree_params,
);
for i in 0..n_samples {
let leaf_idx = traverse_hist_tree(&tree, &binned[i]);
if let HistNode::Leaf { value, .. } = tree[leaf_idx] {
f_vals[i] = f_vals[i] + lr * value;
}
}
trees.push(tree);
}
let mut total_importances = Array1::<F>::zeros(n_features);
for tree_nodes in &trees {
total_importances =
total_importances + compute_hist_feature_importances(tree_nodes, n_features);
}
let imp_sum: F = total_importances
.iter()
.copied()
.fold(F::zero(), |a, b| a + b);
if imp_sum > F::zero() {
total_importances.mapv_inplace(|v| v / imp_sum);
}
Ok(FittedHistGradientBoostingRegressor {
bin_infos,
init,
learning_rate: lr,
trees,
n_features,
feature_importances: total_importances,
})
}
}
impl<F: Float + Send + Sync + 'static> Predict<Array2<F>>
for FittedHistGradientBoostingRegressor<F>
{
type Output = Array1<F>;
type Error = FerroError;
fn predict(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
if x.ncols() != self.n_features {
return Err(FerroError::ShapeMismatch {
expected: vec![self.n_features],
actual: vec![x.ncols()],
context: "number of features must match fitted model".into(),
});
}
let n_samples = x.nrows();
let binned = bin_data(x, &self.bin_infos);
let mut predictions = Array1::from_elem(n_samples, self.init);
for i in 0..n_samples {
for tree_nodes in &self.trees {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
predictions[i] = predictions[i] + self.learning_rate * value;
}
}
}
Ok(predictions)
}
}
impl<F: Float + Send + Sync + 'static> HasFeatureImportances<F>
for FittedHistGradientBoostingRegressor<F>
{
fn feature_importances(&self) -> &Array1<F> {
&self.feature_importances
}
}
impl<F: Float + Send + Sync + 'static> PipelineEstimator<F> for HistGradientBoostingRegressor<F> {
fn fit_pipeline(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineEstimator<F>>, FerroError> {
let fitted = self.fit(x, y)?;
Ok(Box::new(fitted))
}
}
impl<F: Float + Send + Sync + 'static> FittedPipelineEstimator<F>
for FittedHistGradientBoostingRegressor<F>
{
fn predict_pipeline(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
self.predict(x)
}
}
impl<F: Float + Send + Sync + 'static> FittedHistGradientBoostingRegressor<F> {
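/// Returns the coefficient of determination R^2 of the predictions on `(x, y)`.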
pub fn score(&self, x: &Array2<F>, y: &Array1<F>) -> Result<F, FerroError> {
if x.nrows() != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows()],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
let preds = self.predict(x)?;
Ok(crate::r2_score(&preds, y))
}
}
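/// Histogram-based gradient boosting classifier trained with log loss:
/// binary logistic for two classes, one tree per class with softmax
/// otherwise. `random_state` is stored for API parity; the current
/// implementation is deterministic and does not consume it.
///
/// # Example
///
/// A minimal sketch, `ignore`d for the same crate-path reason as the
/// regressor example:
///
/// ```ignore
/// use ferrolearn_core::traits::{Fit, Predict};
/// use ndarray::{array, Array2};
///
/// let x = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
/// let y = array![0, 0, 0, 1, 1, 1];
/// let fitted = HistGradientBoostingClassifier::<f64>::new()
///     .with_n_estimators(20)
///     .fit(&x, &y)
///     .unwrap();
/// let proba = fitted.predict_proba(&x).unwrap();
/// assert_eq!(proba.dim(), (6, 2));
/// ```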
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistGradientBoostingClassifier<F> {
pub n_estimators: usize,
pub learning_rate: f64,
pub max_depth: Option<usize>,
pub min_samples_leaf: usize,
pub max_bins: u16,
pub l2_regularization: f64,
pub max_leaf_nodes: Option<usize>,
pub loss: HistClassificationLoss,
pub random_state: Option<u64>,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float> HistGradientBoostingClassifier<F> {
#[must_use]
pub fn new() -> Self {
Self {
n_estimators: 100,
learning_rate: 0.1,
max_depth: None,
min_samples_leaf: 20,
max_bins: 255,
l2_regularization: 0.0,
max_leaf_nodes: Some(31),
loss: HistClassificationLoss::LogLoss,
random_state: None,
_marker: std::marker::PhantomData,
}
}
#[must_use]
pub fn with_n_estimators(mut self, n: usize) -> Self {
self.n_estimators = n;
self
}
#[must_use]
pub fn with_learning_rate(mut self, lr: f64) -> Self {
self.learning_rate = lr;
self
}
#[must_use]
pub fn with_max_depth(mut self, d: Option<usize>) -> Self {
self.max_depth = d;
self
}
#[must_use]
pub fn with_min_samples_leaf(mut self, n: usize) -> Self {
self.min_samples_leaf = n;
self
}
#[must_use]
pub fn with_max_bins(mut self, bins: u16) -> Self {
self.max_bins = bins;
self
}
#[must_use]
pub fn with_l2_regularization(mut self, reg: f64) -> Self {
self.l2_regularization = reg;
self
}
#[must_use]
pub fn with_max_leaf_nodes(mut self, n: Option<usize>) -> Self {
self.max_leaf_nodes = n;
self
}
#[must_use]
pub fn with_random_state(mut self, seed: u64) -> Self {
self.random_state = Some(seed);
self
}
}
impl<F: Float> Default for HistGradientBoostingClassifier<F> {
fn default() -> Self {
Self::new()
}
}
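/// A fitted [`HistGradientBoostingClassifier`]: one tree sequence in the
/// binary case, or one sequence per class for multiclass problems.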
#[derive(Debug, Clone)]
pub struct FittedHistGradientBoostingClassifier<F> {
bin_infos: Vec<FeatureBinInfo<F>>,
classes: Vec<usize>,
init: Vec<F>,
learning_rate: F,
trees: Vec<Vec<Vec<HistNode<F>>>>,
n_features: usize,
feature_importances: Array1<F>,
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, Array1<usize>>
for HistGradientBoostingClassifier<F>
{
type Fitted = FittedHistGradientBoostingClassifier<F>;
type Error = FerroError;
fn fit(
&self,
x: &Array2<F>,
y: &Array1<usize>,
) -> Result<FittedHistGradientBoostingClassifier<F>, FerroError> {
let (n_samples, n_features) = x.dim();
if n_samples != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![n_samples],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
if n_samples == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "HistGradientBoostingClassifier requires at least one sample".into(),
});
}
if self.n_estimators == 0 {
return Err(FerroError::InvalidParameter {
name: "n_estimators".into(),
reason: "must be at least 1".into(),
});
}
if self.learning_rate <= 0.0 {
return Err(FerroError::InvalidParameter {
name: "learning_rate".into(),
reason: "must be positive".into(),
});
}
if self.max_bins < 2 {
return Err(FerroError::InvalidParameter {
name: "max_bins".into(),
reason: "must be at least 2".into(),
});
}
let mut classes: Vec<usize> = y.iter().copied().collect();
classes.sort_unstable();
classes.dedup();
let n_classes = classes.len();
if n_classes < 2 {
return Err(FerroError::InvalidParameter {
name: "y".into(),
reason: "need at least 2 distinct classes".into(),
});
}
let y_mapped: Vec<usize> = y
.iter()
.map(|&c| classes.iter().position(|&cl| cl == c).unwrap())
.collect();
let lr = F::from(self.learning_rate).unwrap();
let l2_reg = F::from(self.l2_regularization).unwrap();
let bin_infos = compute_bin_edges(x, self.max_bins);
let binned = bin_data(x, &bin_infos);
let tree_params = HistTreeParams {
max_depth: self.max_depth,
min_samples_leaf: self.min_samples_leaf,
max_leaf_nodes: self.max_leaf_nodes,
l2_regularization: l2_reg,
};
let all_indices: Vec<usize> = (0..n_samples).collect();
if n_classes == 2 {
self.fit_binary(
&binned,
&y_mapped,
n_samples,
n_features,
&classes,
lr,
&tree_params,
&all_indices,
&bin_infos,
)
} else {
self.fit_multiclass(
&binned,
&y_mapped,
n_samples,
n_features,
n_classes,
&classes,
lr,
&tree_params,
&all_indices,
&bin_infos,
)
}
}
}
impl<F: Float + Send + Sync + 'static> HistGradientBoostingClassifier<F> {
#[allow(clippy::too_many_arguments)]
fn fit_binary(
&self,
binned: &[Vec<u16>],
y_mapped: &[usize],
n_samples: usize,
n_features: usize,
classes: &[usize],
lr: F,
tree_params: &HistTreeParams<F>,
all_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
) -> Result<FittedHistGradientBoostingClassifier<F>, FerroError> {
let pos_count = y_mapped.iter().filter(|&&c| c == 1).count();
let p = F::from(pos_count).unwrap() / F::from(n_samples).unwrap();
let eps = F::from(1e-15).unwrap();
let p_clipped = p.max(eps).min(F::one() - eps);
let init_val = (p_clipped / (F::one() - p_clipped)).ln();
let mut f_vals = Array1::from_elem(n_samples, init_val);
let mut trees_seq: Vec<Vec<HistNode<F>>> = Vec::with_capacity(self.n_estimators);
for _ in 0..self.n_estimators {
let probs: Vec<F> = f_vals.iter().map(|&fv| sigmoid(fv)).collect();
let gradients: Vec<F> = (0..n_samples)
.map(|i| {
let yi = F::from(y_mapped[i]).unwrap();
probs[i] - yi
})
.collect();
let hessians: Vec<F> = (0..n_samples)
.map(|i| {
let pi = probs[i].max(eps).min(F::one() - eps);
pi * (F::one() - pi)
})
.collect();
let tree = build_hist_tree(
binned,
&gradients,
&hessians,
all_indices,
bin_infos,
tree_params,
);
for i in 0..n_samples {
let leaf_idx = traverse_hist_tree(&tree, &binned[i]);
if let HistNode::Leaf { value, .. } = tree[leaf_idx] {
f_vals[i] = f_vals[i] + lr * value;
}
}
trees_seq.push(tree);
}
let mut total_importances = Array1::<F>::zeros(n_features);
for tree_nodes in &trees_seq {
total_importances =
total_importances + compute_hist_feature_importances(tree_nodes, n_features);
}
let imp_sum: F = total_importances
.iter()
.copied()
.fold(F::zero(), |a, b| a + b);
if imp_sum > F::zero() {
total_importances.mapv_inplace(|v| v / imp_sum);
}
Ok(FittedHistGradientBoostingClassifier {
bin_infos: bin_infos.to_vec(),
classes: classes.to_vec(),
init: vec![init_val],
learning_rate: lr,
trees: vec![trees_seq],
n_features,
feature_importances: total_importances,
})
}
#[allow(clippy::too_many_arguments)]
fn fit_multiclass(
&self,
binned: &[Vec<u16>],
y_mapped: &[usize],
n_samples: usize,
n_features: usize,
n_classes: usize,
classes: &[usize],
lr: F,
tree_params: &HistTreeParams<F>,
all_indices: &[usize],
bin_infos: &[FeatureBinInfo<F>],
) -> Result<FittedHistGradientBoostingClassifier<F>, FerroError> {
let mut class_counts = vec![0usize; n_classes];
for &c in y_mapped {
class_counts[c] += 1;
}
let n_f = F::from(n_samples).unwrap();
let eps = F::from(1e-15).unwrap();
let init_vals: Vec<F> = class_counts
.iter()
.map(|&cnt| {
let p = (F::from(cnt).unwrap() / n_f).max(eps);
p.ln()
})
.collect();
let mut f_vals: Vec<Array1<F>> = init_vals
.iter()
.map(|&init| Array1::from_elem(n_samples, init))
.collect();
let mut trees_per_class: Vec<Vec<Vec<HistNode<F>>>> = (0..n_classes)
.map(|_| Vec::with_capacity(self.n_estimators))
.collect();
for _ in 0..self.n_estimators {
let probs = softmax_matrix(&f_vals, n_samples, n_classes);
for k in 0..n_classes {
let gradients: Vec<F> = (0..n_samples)
.map(|i| {
let yi_k = if y_mapped[i] == k {
F::one()
} else {
F::zero()
};
probs[k][i] - yi_k
})
.collect();
let hessians: Vec<F> = (0..n_samples)
.map(|i| {
let pk = probs[k][i].max(eps).min(F::one() - eps);
pk * (F::one() - pk)
})
.collect();
let tree = build_hist_tree(
binned,
&gradients,
&hessians,
all_indices,
bin_infos,
tree_params,
);
for (i, fv) in f_vals[k].iter_mut().enumerate() {
let leaf_idx = traverse_hist_tree(&tree, &binned[i]);
if let HistNode::Leaf { value, .. } = tree[leaf_idx] {
*fv = *fv + lr * value;
}
}
trees_per_class[k].push(tree);
}
}
let mut total_importances = Array1::<F>::zeros(n_features);
for class_trees in &trees_per_class {
for tree_nodes in class_trees {
total_importances =
total_importances + compute_hist_feature_importances(tree_nodes, n_features);
}
}
let imp_sum: F = total_importances
.iter()
.copied()
.fold(F::zero(), |a, b| a + b);
if imp_sum > F::zero() {
total_importances.mapv_inplace(|v| v / imp_sum);
}
Ok(FittedHistGradientBoostingClassifier {
bin_infos: bin_infos.to_vec(),
classes: classes.to_vec(),
init: init_vals,
learning_rate: lr,
trees: trees_per_class,
n_features,
feature_importances: total_importances,
})
}
}
impl<F: Float + Send + Sync + 'static> Predict<Array2<F>>
for FittedHistGradientBoostingClassifier<F>
{
type Output = Array1<usize>;
type Error = FerroError;
fn predict(&self, x: &Array2<F>) -> Result<Array1<usize>, FerroError> {
if x.ncols() != self.n_features {
return Err(FerroError::ShapeMismatch {
expected: vec![self.n_features],
actual: vec![x.ncols()],
context: "number of features must match fitted model".into(),
});
}
let n_samples = x.nrows();
let n_classes = self.classes.len();
let binned = bin_data(x, &self.bin_infos);
if n_classes == 2 {
let init = self.init[0];
let mut predictions = Array1::zeros(n_samples);
for i in 0..n_samples {
let mut f_val = init;
for tree_nodes in &self.trees[0] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
let prob = sigmoid(f_val);
let class_idx = if prob >= F::from(0.5).unwrap() { 1 } else { 0 };
predictions[i] = self.classes[class_idx];
}
Ok(predictions)
} else {
let mut predictions = Array1::zeros(n_samples);
for i in 0..n_samples {
let mut scores = Vec::with_capacity(n_classes);
for k in 0..n_classes {
let mut f_val = self.init[k];
for tree_nodes in &self.trees[k] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
scores.push(f_val);
}
let best_k = scores
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.map_or(0, |(k, _)| k);
predictions[i] = self.classes[best_k];
}
Ok(predictions)
}
}
}
impl<F: Float + Send + Sync + 'static> HasFeatureImportances<F>
for FittedHistGradientBoostingClassifier<F>
{
fn feature_importances(&self) -> &Array1<F> {
&self.feature_importances
}
}
impl<F: Float + Send + Sync + 'static> HasClasses for FittedHistGradientBoostingClassifier<F> {
fn classes(&self) -> &[usize] {
&self.classes
}
fn n_classes(&self) -> usize {
self.classes.len()
}
}
impl<F: Float + Send + Sync + 'static> FittedHistGradientBoostingClassifier<F> {
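/// Returns the mean accuracy of the predictions on `(x, y)`.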
pub fn score(&self, x: &Array2<F>, y: &Array1<usize>) -> Result<F, FerroError> {
if x.nrows() != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows()],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
let preds = self.predict(x)?;
Ok(crate::mean_accuracy(&preds, y))
}
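/// Returns class-membership probabilities with one row per sample and one
/// column per class (sigmoid for binary problems, softmax otherwise).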
pub fn predict_proba(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
if x.ncols() != self.n_features {
return Err(FerroError::ShapeMismatch {
expected: vec![self.n_features],
actual: vec![x.ncols()],
context: "number of features must match fitted model".into(),
});
}
let n_samples = x.nrows();
let n_classes = self.classes.len();
let binned = bin_data(x, &self.bin_infos);
let mut proba = Array2::<F>::zeros((n_samples, n_classes));
if n_classes == 2 {
let init = self.init[0];
for i in 0..n_samples {
let mut f_val = init;
for tree_nodes in &self.trees[0] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
let p1 = sigmoid(f_val);
proba[[i, 0]] = F::one() - p1;
proba[[i, 1]] = p1;
}
} else {
for i in 0..n_samples {
let mut scores = vec![F::zero(); n_classes];
for k in 0..n_classes {
let mut f_val = self.init[k];
for tree_nodes in &self.trees[k] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
scores[k] = f_val;
}
let max_s = scores
.iter()
.copied()
.fold(F::neg_infinity(), |a, b| if b > a { b } else { a });
let mut sum_exp = F::zero();
for k in 0..n_classes {
let e = (scores[k] - max_s).exp();
proba[[i, k]] = e;
sum_exp = sum_exp + e;
}
if sum_exp > F::zero() {
for k in 0..n_classes {
proba[[i, k]] = proba[[i, k]] / sum_exp;
}
}
}
}
Ok(proba)
}
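/// Returns the natural logarithm of [`Self::predict_proba`].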
pub fn predict_log_proba(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
let proba = self.predict_proba(x)?;
Ok(crate::log_proba(&proba))
}
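/// Returns raw decision scores: a single column of logits for binary
/// problems, one column per class otherwise.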
pub fn decision_function(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
if x.ncols() != self.n_features {
return Err(FerroError::ShapeMismatch {
expected: vec![self.n_features],
actual: vec![x.ncols()],
context: "number of features must match fitted model".into(),
});
}
let n_samples = x.nrows();
let n_classes = self.classes.len();
let binned = bin_data(x, &self.bin_infos);
if n_classes == 2 {
let init = self.init[0];
let mut out = Array2::<F>::zeros((n_samples, 1));
for i in 0..n_samples {
let mut f_val = init;
for tree_nodes in &self.trees[0] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
out[[i, 0]] = f_val;
}
Ok(out)
} else {
let mut out = Array2::<F>::zeros((n_samples, n_classes));
for i in 0..n_samples {
for k in 0..n_classes {
let mut f_val = self.init[k];
for tree_nodes in &self.trees[k] {
let leaf_idx = traverse_hist_tree(tree_nodes, &binned[i]);
if let HistNode::Leaf { value, .. } = tree_nodes[leaf_idx] {
f_val = f_val + self.learning_rate * value;
}
}
out[[i, k]] = f_val;
}
}
Ok(out)
}
}
}
impl<F: Float + ToPrimitive + FromPrimitive + Send + Sync + 'static> PipelineEstimator<F>
for HistGradientBoostingClassifier<F>
{
fn fit_pipeline(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineEstimator<F>>, FerroError> {
let y_usize: Array1<usize> = y.mapv(|v| v.to_usize().unwrap_or(0));
let fitted = self.fit(x, &y_usize)?;
Ok(Box::new(FittedHgbcPipelineAdapter(fitted)))
}
}
struct FittedHgbcPipelineAdapter<F: Float + Send + Sync + 'static>(
FittedHistGradientBoostingClassifier<F>,
);
impl<F: Float + ToPrimitive + FromPrimitive + Send + Sync + 'static> FittedPipelineEstimator<F>
for FittedHgbcPipelineAdapter<F>
{
fn predict_pipeline(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
let preds = self.0.predict(x)?;
Ok(preds.mapv(|v| F::from_usize(v).unwrap_or_else(F::nan)))
}
}
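/// Median of `arr` (mean of the two middle values for even lengths);
/// returns zero for an empty array.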
fn median_f<F: Float>(arr: &Array1<F>) -> F {
let mut sorted: Vec<F> = arr.iter().copied().collect();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = sorted.len();
if n == 0 {
return F::zero();
}
if n % 2 == 1 {
sorted[n / 2]
} else {
(sorted[n / 2 - 1] + sorted[n / 2]) / F::from(2.0).unwrap()
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;
use ndarray::array;
#[test]
fn test_bin_edges_simple() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let infos = compute_bin_edges(&x, 4);
assert_eq!(infos.len(), 1);
assert!(infos[0].n_bins <= 4);
assert!(infos[0].n_bins >= 2);
assert!(!infos[0].has_nan);
}
#[test]
fn test_bin_edges_with_nan() {
let x = Array2::from_shape_vec((5, 1), vec![1.0, f64::NAN, 3.0, f64::NAN, 5.0]).unwrap();
let infos = compute_bin_edges(&x, 4);
assert_eq!(infos.len(), 1);
assert!(infos[0].has_nan);
assert!(infos[0].n_bins >= 1);
}
#[test]
fn test_bin_edges_all_nan() {
let x = Array2::from_shape_vec((3, 1), vec![f64::NAN, f64::NAN, f64::NAN]).unwrap();
let infos = compute_bin_edges(&x, 4);
assert_eq!(infos[0].n_bins, 0);
assert!(infos[0].has_nan);
}
#[test]
fn test_map_to_bin_basic() {
let info = FeatureBinInfo {
edges: vec![2.0, 4.0, 6.0, 8.0],
n_bins: 4,
has_nan: false,
};
assert_eq!(map_to_bin(1.0, &info), 0);
assert_eq!(map_to_bin(2.0, &info), 0);
assert_eq!(map_to_bin(3.0, &info), 1);
assert_eq!(map_to_bin(5.0, &info), 2);
assert_eq!(map_to_bin(7.0, &info), 3);
assert_eq!(map_to_bin(9.0, &info), 3);
assert_eq!(map_to_bin(f64::NAN, &info), NAN_BIN);
}
#[test]
fn test_bin_data_roundtrip() {
let x = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
.unwrap();
let infos = compute_bin_edges(&x, 255);
let binned = bin_data(&x, &infos);
assert_eq!(binned.len(), 4);
assert_eq!(binned[0].len(), 2);
for window in binned.windows(2) {
let (prev, curr) = (&window[0], &window[1]);
for (&p, &c) in prev.iter().zip(curr.iter()) {
assert!(c >= p);
}
}
}
#[test]
fn test_subtraction_trick() {
let x = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let bin_infos = compute_bin_edges(&x, 255);
let binned = bin_data(&x, &bin_infos);
let gradients = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6];
let hessians = vec![1.0; 6];
let all_indices: Vec<usize> = (0..6).collect();
let left_indices: Vec<usize> = vec![0, 1, 2];
let right_indices: Vec<usize> = vec![3, 4, 5];
let parent_hist =
build_histograms(&binned, &gradients, &hessians, &all_indices, &bin_infos);
let left_hist = build_histograms(&binned, &gradients, &hessians, &left_indices, &bin_infos);
let right_from_sub = subtract_histograms(&parent_hist, &left_hist, &bin_infos);
let right_direct =
build_histograms(&binned, &gradients, &hessians, &right_indices, &bin_infos);
for j in 0..bin_infos.len() {
let n_bins = bin_infos[j].n_bins as usize;
for b in 0..n_bins {
assert_relative_eq!(
right_from_sub[j].bins[b].grad_sum,
right_direct[j].bins[b].grad_sum,
epsilon = 1e-10
);
assert_relative_eq!(
right_from_sub[j].bins[b].hess_sum,
right_direct[j].bins[b].hess_sum,
epsilon = 1e-10
);
assert_eq!(
right_from_sub[j].bins[b].count,
right_direct[j].bins[b].count
);
}
}
}
#[test]
fn test_hgbr_simple_least_squares() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(50)
.with_learning_rate(0.1)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for i in 0..4 {
assert!(preds[i] < 3.0, "Expected ~1.0, got {}", preds[i]);
}
for i in 4..8 {
assert!(preds[i] > 3.0, "Expected ~5.0, got {}", preds[i]);
}
}
#[test]
fn test_hgbr_lad_loss() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(50)
.with_loss(HistRegressionLoss::LeastAbsoluteDeviation)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for i in 0..4 {
assert!(preds[i] < 3.5, "LAD expected <3.5, got {}", preds[i]);
}
for i in 4..8 {
assert!(preds[i] > 2.5, "LAD expected >2.5, got {}", preds[i]);
}
}
#[test]
fn test_hgbr_reproducibility() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(123);
let fitted1 = model.fit(&x, &y).unwrap();
let fitted2 = model.fit(&x, &y).unwrap();
let preds1 = fitted1.predict(&x).unwrap();
let preds2 = fitted2.predict(&x).unwrap();
for (p1, p2) in preds1.iter().zip(preds2.iter()) {
assert_relative_eq!(*p1, *p2, epsilon = 1e-10);
}
}
#[test]
fn test_hgbr_feature_importances() {
let x = Array2::from_shape_vec(
(10, 3),
vec![
1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0, 5.0, 0.0, 0.0, 6.0,
0.0, 0.0, 7.0, 0.0, 0.0, 8.0, 0.0, 0.0, 9.0, 0.0, 0.0, 10.0, 0.0, 0.0,
],
)
.unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let importances = fitted.feature_importances();
assert_eq!(importances.len(), 3);
assert!(
importances[0] > importances[1],
"Expected imp[0]={} > imp[1]={}",
importances[0],
importances[1]
);
}
#[test]
fn test_hgbr_shape_mismatch_fit() {
let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = array![1.0, 2.0];
let model = HistGradientBoostingRegressor::<f64>::new().with_n_estimators(5);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbr_shape_mismatch_predict() {
let x =
Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 2.0, 3.0, 4.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(5)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(0);
let fitted = model.fit(&x, &y).unwrap();
let x_bad = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
assert!(fitted.predict(&x_bad).is_err());
}
#[test]
fn test_hgbr_empty_data() {
let x = Array2::<f64>::zeros((0, 2));
let y = Array1::<f64>::zeros(0);
let model = HistGradientBoostingRegressor::<f64>::new().with_n_estimators(5);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbr_zero_estimators() {
let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let y = array![1.0, 2.0, 3.0, 4.0];
let model = HistGradientBoostingRegressor::<f64>::new().with_n_estimators(0);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbr_invalid_learning_rate() {
let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let y = array![1.0, 2.0, 3.0, 4.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(5)
.with_learning_rate(0.0);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbr_invalid_max_bins() {
let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let y = array![1.0, 2.0, 3.0, 4.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(5)
.with_max_bins(1);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbr_default_trait() {
let model = HistGradientBoostingRegressor::<f64>::default();
assert_eq!(model.n_estimators, 100);
assert!((model.learning_rate - 0.1).abs() < 1e-10);
assert_eq!(model.max_bins, 255);
}
#[test]
fn test_hgbr_pipeline_integration() {
let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let y = array![1.0, 2.0, 3.0, 4.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(10)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit_pipeline(&x, &y).unwrap();
let preds = fitted.predict_pipeline(&x).unwrap();
assert_eq!(preds.len(), 4);
}
#[test]
fn test_hgbr_f32_support() {
let x = Array2::from_shape_vec((4, 1), vec![1.0f32, 2.0, 3.0, 4.0]).unwrap();
let y = Array1::from_vec(vec![1.0f32, 2.0, 3.0, 4.0]);
let model = HistGradientBoostingRegressor::<f32>::new()
.with_n_estimators(10)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 4);
}
#[test]
fn test_hgbr_nan_handling() {
let x = Array2::from_shape_vec(
(8, 1),
vec![1.0, f64::NAN, 3.0, 4.0, 5.0, f64::NAN, 7.0, 8.0],
)
.unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for p in &preds {
assert!(p.is_finite(), "Expected finite prediction, got {p}");
}
}
#[test]
fn test_hgbr_convergence() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let mse = |preds: &Array1<f64>, y: &Array1<f64>| -> f64 {
preds
.iter()
.zip(y.iter())
.map(|(p, t)| (p - t).powi(2))
.sum::<f64>()
/ y.len() as f64
};
let model_few = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(5)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted_few = model_few.fit(&x, &y).unwrap();
let preds_few = fitted_few.predict(&x).unwrap();
let mse_few = mse(&preds_few, &y);
let model_many = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(50)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted_many = model_many.fit(&x, &y).unwrap();
let preds_many = fitted_many.predict(&x).unwrap();
let mse_many = mse(&preds_many, &y);
assert!(
mse_many < mse_few,
"Expected MSE to decrease with more estimators: {mse_many} (50) vs {mse_few} (5)"
);
}
#[test]
fn test_hgbr_max_leaf_nodes() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(Some(4))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
}
#[test]
fn test_hgbr_l2_regularization() {
let x =
Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0];
let model_noreg = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_l2_regularization(0.0)
.with_random_state(42);
let fitted_noreg = model_noreg.fit(&x, &y).unwrap();
let preds_noreg = fitted_noreg.predict(&x).unwrap();
let model_highreg = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_l2_regularization(100.0)
.with_random_state(42);
let fitted_highreg = model_highreg.fit(&x, &y).unwrap();
let preds_highreg = fitted_highreg.predict(&x).unwrap();
let var = |preds: &Array1<f64>| -> f64 {
let mean = preds.mean().unwrap();
preds.iter().map(|p| (p - mean).powi(2)).sum::<f64>() / preds.len() as f64
};
assert!(
var(&preds_highreg) < var(&preds_noreg),
"High regularization should reduce prediction variance"
);
}
#[test]
fn test_hgbc_binary_simple() {
let x = Array2::from_shape_vec(
(8, 2),
vec![
1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 9.0,
],
)
.unwrap();
let y = array![0, 0, 0, 0, 1, 1, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(50)
.with_learning_rate(0.1)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for i in 0..4 {
assert_eq!(preds[i], 0, "Expected 0 at index {}, got {}", i, preds[i]);
}
for i in 4..8 {
assert_eq!(preds[i], 1, "Expected 1 at index {}, got {}", i, preds[i]);
}
}
#[test]
fn test_hgbc_multiclass() {
let x = Array2::from_shape_vec((9, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
.unwrap();
let y = array![0, 0, 0, 1, 1, 1, 2, 2, 2];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(50)
.with_learning_rate(0.1)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 9);
let correct = preds.iter().zip(y.iter()).filter(|(p, t)| p == t).count();
assert!(
correct >= 6,
"Expected at least 6/9 correct, got {correct}/9"
);
}
#[test]
fn test_hgbc_has_classes() {
let x = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = array![0, 1, 2, 0, 1, 2];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(5)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(0);
let fitted = model.fit(&x, &y).unwrap();
assert_eq!(fitted.classes(), &[0, 1, 2]);
assert_eq!(fitted.n_classes(), 3);
}
#[test]
fn test_hgbc_reproducibility() {
let x = Array2::from_shape_vec(
(8, 2),
vec![
1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 9.0,
],
)
.unwrap();
let y = array![0, 0, 0, 0, 1, 1, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(10)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted1 = model.fit(&x, &y).unwrap();
let fitted2 = model.fit(&x, &y).unwrap();
let preds1 = fitted1.predict(&x).unwrap();
let preds2 = fitted2.predict(&x).unwrap();
assert_eq!(preds1, preds2);
}
#[test]
fn test_hgbc_feature_importances() {
let x = Array2::from_shape_vec(
(10, 3),
vec![
1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0, 5.0, 0.0, 0.0, 6.0,
0.0, 0.0, 7.0, 0.0, 0.0, 8.0, 0.0, 0.0, 9.0, 0.0, 0.0, 10.0, 0.0, 0.0,
],
)
.unwrap();
let y = array![0, 0, 0, 0, 0, 1, 1, 1, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let importances = fitted.feature_importances();
assert_eq!(importances.len(), 3);
assert!(importances[0] > importances[1]);
}
#[test]
fn test_hgbc_shape_mismatch_fit() {
let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = array![0, 1];
let model = HistGradientBoostingClassifier::<f64>::new().with_n_estimators(5);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbc_shape_mismatch_predict() {
let x =
Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
let y = array![0, 0, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(5)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(0);
let fitted = model.fit(&x, &y).unwrap();
let x_bad = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
assert!(fitted.predict(&x_bad).is_err());
}
#[test]
fn test_hgbc_empty_data() {
let x = Array2::<f64>::zeros((0, 2));
let y = Array1::<usize>::zeros(0);
let model = HistGradientBoostingClassifier::<f64>::new().with_n_estimators(5);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbc_single_class() {
let x = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
let y = array![0, 0, 0];
let model = HistGradientBoostingClassifier::<f64>::new().with_n_estimators(5);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbc_zero_estimators() {
let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let y = array![0, 0, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new().with_n_estimators(0);
assert!(model.fit(&x, &y).is_err());
}
#[test]
fn test_hgbc_pipeline_integration() {
let x = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = Array1::from_vec(vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]);
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(10)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit_pipeline(&x, &y).unwrap();
let preds = fitted.predict_pipeline(&x).unwrap();
assert_eq!(preds.len(), 6);
}
#[test]
fn test_hgbc_f32_support() {
let x = Array2::from_shape_vec((6, 1), vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = array![0, 0, 0, 1, 1, 1];
let model = HistGradientBoostingClassifier::<f32>::new()
.with_n_estimators(10)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 6);
}
#[test]
fn test_hgbc_default_trait() {
let model = HistGradientBoostingClassifier::<f64>::default();
assert_eq!(model.n_estimators, 100);
assert!((model.learning_rate - 0.1).abs() < 1e-10);
assert_eq!(model.max_bins, 255);
}
#[test]
fn test_hgbc_non_contiguous_labels() {
let x = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let y = array![10, 10, 10, 20, 20, 20];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 6);
for &p in &preds {
assert!(p == 10 || p == 20);
}
}
#[test]
fn test_hgbc_nan_handling() {
let x = Array2::from_shape_vec(
(8, 2),
vec![
1.0,
f64::NAN,
2.0,
3.0,
f64::NAN,
3.0,
4.0,
4.0,
5.0,
6.0,
6.0,
f64::NAN,
7.0,
8.0,
f64::NAN,
9.0,
],
)
.unwrap();
let y = array![0, 0, 0, 0, 1, 1, 1, 1];
let model = HistGradientBoostingClassifier::<f64>::new()
.with_n_estimators(20)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for &p in &preds {
assert!(p == 0 || p == 1);
}
}
#[test]
fn test_hist_vs_standard_gbm_similar_accuracy() {
use crate::GradientBoostingRegressor;
let x = Array2::from_shape_vec(
(10, 1),
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
)
.unwrap();
let y = array![1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0];
let mse = |preds: &Array1<f64>, y: &Array1<f64>| -> f64 {
preds
.iter()
.zip(y.iter())
.map(|(p, t)| (p - t).powi(2))
.sum::<f64>()
/ y.len() as f64
};
let std_model = GradientBoostingRegressor::<f64>::new()
.with_n_estimators(50)
.with_learning_rate(0.1)
.with_max_depth(Some(3))
.with_random_state(42);
let std_fitted = std_model.fit(&x, &y).unwrap();
let std_preds = std_fitted.predict(&x).unwrap();
let std_mse = mse(&std_preds, &y);
let hist_model = HistGradientBoostingRegressor::<f64>::new()
.with_n_estimators(50)
.with_learning_rate(0.1)
.with_min_samples_leaf(1)
.with_max_leaf_nodes(None)
.with_max_depth(Some(3))
.with_random_state(42);
let hist_fitted = hist_model.fit(&x, &y).unwrap();
let hist_preds = hist_fitted.predict(&x).unwrap();
let hist_mse = mse(&hist_preds, &y);
assert!(std_mse < 1.0, "Standard GBM MSE too high: {std_mse}");
assert!(hist_mse < 1.0, "Hist GBM MSE too high: {hist_mse}");
}
}