use crate::dataset::Dataset;
use crate::error::{Result, ScryLearnError};
use crate::tree::binning::FeatureBinner;
use rayon::prelude::*;
const NUM_BINS: usize = 256;
#[derive(Clone, Copy, Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
struct HistBin {
grad_sum: f64,
hess_sum: f64,
count: u32,
}
type FeatureHistogram = [HistBin; NUM_BINS];
fn build_histograms(
binned: &[Vec<u8>], gradients: &[f64],
hessians: &[f64],
sample_indices: &[usize],
n_features: usize,
) -> Vec<FeatureHistogram> {
#[cfg(feature = "scry-gpu")]
{
if let Ok(gpu) = crate::accel::ScryGpuBackend::new() {
use crate::accel::ComputeBackend;
let accel_hists = gpu.build_histograms(
binned,
gradients,
hessians,
sample_indices,
n_features,
NUM_BINS,
);
return accel_hists
.into_iter()
.map(|feat_bins| {
let mut hist: FeatureHistogram = [HistBin::default(); NUM_BINS];
for (b, &(g, h, c)) in feat_bins.iter().enumerate() {
if b < NUM_BINS {
hist[b].grad_sum = g;
hist[b].hess_sum = h;
hist[b].count = c as u32;
}
}
hist
})
.collect();
}
}
(0..n_features)
.into_par_iter()
.map(|f| {
let col = &binned[f];
let mut hist: FeatureHistogram = [HistBin::default(); NUM_BINS];
for &idx in sample_indices {
let bin = col[idx] as usize;
hist[bin].grad_sum += gradients[idx];
hist[bin].hess_sum += hessians[idx];
hist[bin].count += 1;
}
hist
})
.collect()
}
fn subtract_histograms(
parent: &[FeatureHistogram],
left: &[FeatureHistogram],
) -> Vec<FeatureHistogram> {
parent
.iter()
.zip(left.iter())
.map(|(p, l)| {
let mut right = [HistBin::default(); NUM_BINS];
for b in 0..NUM_BINS {
right[b].grad_sum = p[b].grad_sum - l[b].grad_sum;
right[b].hess_sum = p[b].hess_sum - l[b].hess_sum;
right[b].count = p[b].count.saturating_sub(l[b].count);
}
right
})
.collect()
}
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
enum HistNode {
Leaf { value: f64 },
Split {
feature: usize,
bin_threshold: u8,
left: usize, right: usize,
gain: f64,
},
}
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum HistNodeView {
Leaf {
value: f64,
},
Split {
feature: usize,
threshold: f64,
left: usize,
right: usize,
},
}
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
struct HistTree {
nodes: Vec<HistNode>,
}
impl HistTree {
fn predict_one(&self, sample_binned: &[u8]) -> f64 {
let mut node_idx = 0;
loop {
match &self.nodes[node_idx] {
HistNode::Leaf { value } => return *value,
HistNode::Split {
feature,
bin_threshold,
left,
right,
..
} => {
if sample_binned[*feature] <= *bin_threshold {
node_idx = *left;
} else {
node_idx = *right;
}
}
}
}
}
fn predict_one_raw(&self, sample: &[f64], binner: &FeatureBinner) -> f64 {
let mut node_idx = 0;
loop {
match &self.nodes[node_idx] {
HistNode::Leaf { value } => return *value,
HistNode::Split {
feature,
bin_threshold,
left,
right,
..
} => {
let val = sample[*feature];
let bin = if val.is_nan() {
0u8
} else {
let edges = &binner.bin_edges()[*feature];
let pos = match edges.binary_search_by(|edge| {
edge.partial_cmp(&val).unwrap_or(std::cmp::Ordering::Equal)
}) {
Ok(p) => p + 1,
Err(p) => p,
};
(pos + 1).min(255) as u8
};
if bin <= *bin_threshold {
node_idx = *left;
} else {
node_idx = *right;
}
}
}
}
}
fn feature_importances(&self, n_features: usize) -> Vec<f64> {
let mut imp = vec![0.0; n_features];
for node in &self.nodes {
if let HistNode::Split { feature, gain, .. } = node {
if *feature < n_features {
imp[*feature] += gain;
}
}
}
imp
}
fn to_node_views(&self, binner: &FeatureBinner) -> Vec<HistNodeView> {
let edges = binner.bin_edges();
self.nodes
.iter()
.map(|node| match node {
HistNode::Leaf { value } => HistNodeView::Leaf { value: *value },
HistNode::Split {
feature,
bin_threshold,
left,
right,
..
} => {
let threshold = if *bin_threshold == 0 || *feature >= edges.len() {
f64::NEG_INFINITY
} else {
let feat_edges = &edges[*feature];
let idx = (*bin_threshold as usize).saturating_sub(1);
if idx < feat_edges.len() {
feat_edges[idx]
} else if !feat_edges.is_empty() {
feat_edges[feat_edges.len() - 1]
} else {
0.0
}
};
HistNodeView::Split {
feature: *feature,
threshold,
left: *left,
right: *right,
}
}
})
.collect()
}
}
struct LeafCandidate {
node_idx: usize,
sample_indices: Vec<usize>,
histograms: Vec<FeatureHistogram>,
grad_sum: f64,
hess_sum: f64,
depth: usize,
}
struct SplitResult {
feature: usize,
bin_threshold: u8,
gain: f64,
left_indices: Vec<usize>,
right_indices: Vec<usize>,
left_value: f64,
right_value: f64,
left_grad_sum: f64,
left_hess_sum: f64,
right_grad_sum: f64,
right_hess_sum: f64,
}
#[inline]
fn leaf_value(grad_sum: f64, hess_sum: f64, l2_reg: f64) -> f64 {
let denom = hess_sum + l2_reg;
if denom.abs() < 1e-10 {
0.0
} else {
-grad_sum / denom
}
}
#[inline]
fn split_gain(
grad_left: f64,
hess_left: f64,
grad_right: f64,
hess_right: f64,
l2_reg: f64,
) -> f64 {
let left_term = grad_left * grad_left / (hess_left + l2_reg);
let right_term = grad_right * grad_right / (hess_right + l2_reg);
let parent_grad = grad_left + grad_right;
let parent_hess = hess_left + hess_right;
let parent_term = parent_grad * parent_grad / (parent_hess + l2_reg);
0.5 * (left_term + right_term - parent_term)
}
#[allow(clippy::too_many_arguments)]
fn find_best_split(
histograms: &[FeatureHistogram],
binned: &[Vec<u8>],
sample_indices: &[usize],
grad_sum: f64,
hess_sum: f64,
min_samples_leaf: usize,
l2_reg: f64,
n_features: usize,
) -> Option<SplitResult> {
let mut best_gain = 0.0; let mut best_feature = 0;
let mut best_threshold: u8 = 0;
let mut best_left_grad = 0.0;
let mut best_left_hess = 0.0;
for (f, hist) in histograms.iter().enumerate().take(n_features) {
let mut running_grad = 0.0;
let mut running_hess = 0.0;
let mut running_count: u32 = 0;
let total_count = sample_indices.len() as u32;
for bin in 0..255u8 {
let b = bin as usize;
running_grad += hist[b].grad_sum;
running_hess += hist[b].hess_sum;
running_count += hist[b].count;
let right_count = total_count.saturating_sub(running_count);
if (running_count as usize) < min_samples_leaf
|| (right_count as usize) < min_samples_leaf
{
continue;
}
let right_grad = grad_sum - running_grad;
let right_hess = hess_sum - running_hess;
let gain = split_gain(running_grad, running_hess, right_grad, right_hess, l2_reg);
if gain > best_gain {
best_gain = gain;
best_feature = f;
best_threshold = bin;
best_left_grad = running_grad;
best_left_hess = running_hess;
}
}
}
if best_gain <= 0.0 {
return None;
}
let col = &binned[best_feature];
let mut left_indices = Vec::new();
let mut right_indices = Vec::new();
for &idx in sample_indices {
if col[idx] <= best_threshold {
left_indices.push(idx);
} else {
right_indices.push(idx);
}
}
let best_right_grad = grad_sum - best_left_grad;
let best_right_hess = hess_sum - best_left_hess;
Some(SplitResult {
feature: best_feature,
bin_threshold: best_threshold,
gain: best_gain,
left_indices,
right_indices,
left_value: leaf_value(best_left_grad, best_left_hess, l2_reg),
right_value: leaf_value(best_right_grad, best_right_hess, l2_reg),
left_grad_sum: best_left_grad,
left_hess_sum: best_left_hess,
right_grad_sum: best_right_grad,
right_hess_sum: best_right_hess,
})
}
#[allow(clippy::too_many_arguments)]
fn build_tree_leaf_wise(
binned: &[Vec<u8>],
gradients: &[f64],
hessians: &[f64],
sample_indices: &[usize],
max_leaf_nodes: usize,
min_samples_leaf: usize,
max_depth: usize,
l2_reg: f64,
n_features: usize,
) -> HistTree {
let mut nodes: Vec<HistNode> = Vec::new();
let total_grad: f64 = sample_indices.iter().map(|&i| gradients[i]).sum();
let total_hess: f64 = sample_indices.iter().map(|&i| hessians[i]).sum();
let root_value = leaf_value(total_grad, total_hess, l2_reg);
nodes.push(HistNode::Leaf { value: root_value });
let root_histograms = build_histograms(binned, gradients, hessians, sample_indices, n_features);
let mut candidates: Vec<LeafCandidate> = Vec::new();
candidates.push(LeafCandidate {
node_idx: 0,
sample_indices: sample_indices.to_vec(),
histograms: root_histograms,
grad_sum: total_grad,
hess_sum: total_hess,
depth: 0,
});
let mut n_leaves = 1usize;
while n_leaves < max_leaf_nodes && !candidates.is_empty() {
let mut best_cand_idx = 0;
let mut best_gain = f64::NEG_INFINITY;
for (c_idx, cand) in candidates.iter().enumerate() {
if cand.depth >= max_depth {
continue;
}
if cand.sample_indices.len() < 2 * min_samples_leaf {
continue;
}
if let Some(split) = find_best_split(
&cand.histograms,
binned,
&cand.sample_indices,
cand.grad_sum,
cand.hess_sum,
min_samples_leaf,
l2_reg,
n_features,
) {
if split.gain > best_gain {
best_gain = split.gain;
best_cand_idx = c_idx;
}
}
}
if best_gain <= 0.0 {
break;
}
let cand = candidates.remove(best_cand_idx);
let split = find_best_split(
&cand.histograms,
binned,
&cand.sample_indices,
cand.grad_sum,
cand.hess_sum,
min_samples_leaf,
l2_reg,
n_features,
);
let Some(split) = split else {
continue;
};
let left_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: split.left_value,
});
let right_idx = nodes.len();
nodes.push(HistNode::Leaf {
value: split.right_value,
});
nodes[cand.node_idx] = HistNode::Split {
feature: split.feature,
bin_threshold: split.bin_threshold,
left: left_idx,
right: right_idx,
gain: split.gain,
};
n_leaves += 1;
let (small_indices, _large_indices, small_is_left) =
if split.left_indices.len() <= split.right_indices.len() {
(&split.left_indices, &split.right_indices, true)
} else {
(&split.right_indices, &split.left_indices, false)
};
let small_histograms =
build_histograms(binned, gradients, hessians, small_indices, n_features);
let large_histograms = subtract_histograms(&cand.histograms, &small_histograms);
let (left_hist, right_hist) = if small_is_left {
(small_histograms, large_histograms)
} else {
(large_histograms, small_histograms)
};
let new_depth = cand.depth + 1;
if split.left_indices.len() >= 2 * min_samples_leaf && new_depth < max_depth {
candidates.push(LeafCandidate {
node_idx: left_idx,
sample_indices: split.left_indices,
histograms: left_hist,
grad_sum: split.left_grad_sum,
hess_sum: split.left_hess_sum,
depth: new_depth,
});
}
if split.right_indices.len() >= 2 * min_samples_leaf && new_depth < max_depth {
candidates.push(LeafCandidate {
node_idx: right_idx,
sample_indices: split.right_indices,
histograms: right_hist,
grad_sum: split.right_grad_sum,
hess_sum: split.right_hess_sum,
depth: new_depth,
});
}
}
HistTree { nodes }
}
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct HistGradientBoostingRegressor {
n_estimators: usize,
learning_rate: f64,
max_leaf_nodes: usize,
min_samples_leaf: usize,
max_depth: usize,
max_bins: usize,
l2_regularization: f64,
seed: u64,
trees: Vec<HistTree>,
binner: FeatureBinner,
init_prediction: f64,
n_features: usize,
fitted: bool,
#[cfg_attr(feature = "serde", serde(default))]
_schema_version: u32,
}
impl HistGradientBoostingRegressor {
pub fn new() -> Self {
Self {
n_estimators: 100,
learning_rate: 0.1,
max_leaf_nodes: 31,
min_samples_leaf: 20,
max_depth: 8,
max_bins: NUM_BINS,
l2_regularization: 0.0,
seed: 42,
trees: Vec::new(),
binner: FeatureBinner::new(),
init_prediction: 0.0,
n_features: 0,
fitted: false,
_schema_version: crate::version::SCHEMA_VERSION,
}
}
pub fn n_estimators(mut self, n: usize) -> Self {
self.n_estimators = n;
self
}
pub fn learning_rate(mut self, lr: f64) -> Self {
self.learning_rate = lr;
self
}
pub fn max_leaf_nodes(mut self, n: usize) -> Self {
self.max_leaf_nodes = n;
self
}
pub fn min_samples_leaf(mut self, n: usize) -> Self {
self.min_samples_leaf = n;
self
}
pub fn max_depth(mut self, d: usize) -> Self {
self.max_depth = d;
self
}
pub fn max_bins(mut self, bins: usize) -> Self {
self.max_bins = bins.clamp(2, NUM_BINS);
self
}
pub fn l2_regularization(mut self, l2: f64) -> Self {
self.l2_regularization = l2;
self
}
pub fn seed(mut self, s: u64) -> Self {
self.seed = s;
self
}
pub fn fit(&mut self, data: &Dataset) -> Result<()> {
data.validate_no_inf()?;
let n = data.n_samples();
if n == 0 {
return Err(ScryLearnError::EmptyDataset);
}
if self.learning_rate <= 0.0 || self.learning_rate > 1.0 {
return Err(ScryLearnError::InvalidParameter(
"learning_rate must be in (0, 1]".into(),
));
}
self.n_features = data.n_features();
self.binner = FeatureBinner::new().max_bins(self.max_bins);
let binned = self.binner.fit_transform(data)?;
let mean: f64 = data.target.iter().sum::<f64>() / n as f64;
self.init_prediction = mean;
let mut predictions = vec![mean; n];
let all_indices: Vec<usize> = (0..n).collect();
self.trees = Vec::with_capacity(self.n_estimators);
let effective_min_leaf = self.min_samples_leaf.min(n / 4).max(1);
for _ in 0..self.n_estimators {
let gradients: Vec<f64> = (0..n).map(|i| -(data.target[i] - predictions[i])).collect();
let hessians = vec![1.0; n];
let tree = build_tree_leaf_wise(
&binned,
&gradients,
&hessians,
&all_indices,
self.max_leaf_nodes,
effective_min_leaf,
self.max_depth,
self.l2_regularization,
self.n_features,
);
for &i in &all_indices {
let sample: Vec<u8> = binned.iter().map(|col| col[i]).collect();
predictions[i] += self.learning_rate * tree.predict_one(&sample);
}
self.trees.push(tree);
}
self.fitted = true;
Ok(())
}
pub fn predict(&self, features: &[Vec<f64>]) -> Result<Vec<f64>> {
crate::version::check_schema_version(self._schema_version)?;
if !self.fitted {
return Err(ScryLearnError::NotFitted);
}
let n = features.len();
let mut preds = vec![self.init_prediction; n];
for tree in &self.trees {
for (i, sample) in features.iter().enumerate() {
preds[i] += self.learning_rate * tree.predict_one_raw(sample, &self.binner);
}
}
Ok(preds)
}
pub fn feature_importances(&self) -> Result<Vec<f64>> {
if !self.fitted {
return Err(ScryLearnError::NotFitted);
}
let m = self.n_features;
let mut imp = vec![0.0; m];
for tree in &self.trees {
let ti = tree.feature_importances(m);
for (i, &v) in ti.iter().enumerate() {
imp[i] += v;
}
}
let total: f64 = imp.iter().sum();
if total > 0.0 {
for v in &mut imp {
*v /= total;
}
}
Ok(imp)
}
pub fn n_trees(&self) -> usize {
self.trees.len()
}
pub fn n_features(&self) -> usize {
self.n_features
}
pub fn learning_rate_val(&self) -> f64 {
self.learning_rate
}
pub fn init_prediction_val(&self) -> f64 {
self.init_prediction
}
pub fn tree_node_views(&self) -> Vec<Vec<HistNodeView>> {
self.trees
.iter()
.map(|tree| tree.to_node_views(&self.binner))
.collect()
}
}
impl Default for HistGradientBoostingRegressor {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct HistGradientBoostingClassifier {
n_estimators: usize,
learning_rate: f64,
max_leaf_nodes: usize,
min_samples_leaf: usize,
max_depth: usize,
max_bins: usize,
l2_regularization: f64,
seed: u64,
trees: Vec<Vec<HistTree>>,
binner: FeatureBinner,
init_predictions: Vec<f64>,
n_classes: usize,
n_features: usize,
fitted: bool,
#[cfg_attr(feature = "serde", serde(default))]
_schema_version: u32,
}
impl HistGradientBoostingClassifier {
pub fn new() -> Self {
Self {
n_estimators: 100,
learning_rate: 0.1,
max_leaf_nodes: 31,
min_samples_leaf: 20,
max_depth: 8,
max_bins: NUM_BINS,
l2_regularization: 0.0,
seed: 42,
trees: Vec::new(),
binner: FeatureBinner::new(),
init_predictions: Vec::new(),
n_classes: 0,
n_features: 0,
fitted: false,
_schema_version: crate::version::SCHEMA_VERSION,
}
}
pub fn n_estimators(mut self, n: usize) -> Self {
self.n_estimators = n;
self
}
pub fn learning_rate(mut self, lr: f64) -> Self {
self.learning_rate = lr;
self
}
pub fn max_leaf_nodes(mut self, n: usize) -> Self {
self.max_leaf_nodes = n;
self
}
pub fn min_samples_leaf(mut self, n: usize) -> Self {
self.min_samples_leaf = n;
self
}
pub fn max_depth(mut self, d: usize) -> Self {
self.max_depth = d;
self
}
pub fn max_bins(mut self, bins: usize) -> Self {
self.max_bins = bins.clamp(2, NUM_BINS);
self
}
pub fn l2_regularization(mut self, l2: f64) -> Self {
self.l2_regularization = l2;
self
}
pub fn seed(mut self, s: u64) -> Self {
self.seed = s;
self
}
pub fn fit(&mut self, data: &Dataset) -> Result<()> {
data.validate_no_inf()?;
let n = data.n_samples();
if n == 0 {
return Err(ScryLearnError::EmptyDataset);
}
if self.learning_rate <= 0.0 || self.learning_rate > 1.0 {
return Err(ScryLearnError::InvalidParameter(
"learning_rate must be in (0, 1]".into(),
));
}
self.n_features = data.n_features();
self.n_classes = data.n_classes();
let k = self.n_classes;
if k < 2 {
return Err(ScryLearnError::InvalidParameter(
"need at least 2 classes for classification".into(),
));
}
self.binner = FeatureBinner::new().max_bins(self.max_bins);
let binned = self.binner.fit_transform(data)?;
let all_indices: Vec<usize> = (0..n).collect();
let effective_min_leaf = self.min_samples_leaf.min(n / 4).max(1);
if k == 2 {
self.fit_binary(data, n, &binned, &all_indices, effective_min_leaf)
} else {
self.fit_multiclass(data, n, k, &binned, &all_indices, effective_min_leaf)
}
}
#[allow(clippy::unnecessary_wraps)]
fn fit_binary(
&mut self,
data: &Dataset,
n: usize,
binned: &[Vec<u8>],
all_indices: &[usize],
min_leaf: usize,
) -> Result<()> {
let pos_count = data.target.iter().filter(|&&y| y > 0.5).count();
let p = (pos_count as f64 / n as f64).clamp(1e-7, 1.0 - 1e-7);
let f0 = (p / (1.0 - p)).ln();
self.init_predictions = vec![f0];
let mut f_vals = vec![f0; n];
let mut trees_seq = Vec::with_capacity(self.n_estimators);
for _ in 0..self.n_estimators {
let probs: Vec<f64> = f_vals.iter().map(|&f| sigmoid(f)).collect();
let gradients: Vec<f64> = (0..n).map(|i| probs[i] - data.target[i]).collect();
let hessians: Vec<f64> = probs.iter().map(|&p| (p * (1.0 - p)).max(1e-10)).collect();
let tree = build_tree_leaf_wise(
binned,
&gradients,
&hessians,
all_indices,
self.max_leaf_nodes,
min_leaf,
self.max_depth,
self.l2_regularization,
self.n_features,
);
for &i in all_indices {
let sample: Vec<u8> = binned.iter().map(|col| col[i]).collect();
f_vals[i] += self.learning_rate * tree.predict_one(&sample);
}
trees_seq.push(tree);
}
self.trees = vec![trees_seq];
self.fitted = true;
Ok(())
}
#[allow(clippy::unnecessary_wraps)]
fn fit_multiclass(
&mut self,
data: &Dataset,
n: usize,
k: usize,
binned: &[Vec<u8>],
all_indices: &[usize],
min_leaf: usize,
) -> Result<()> {
let y_onehot: Vec<Vec<f64>> = (0..k)
.map(|cls| {
data.target
.iter()
.map(|&y| if (y as usize) == cls { 1.0 } else { 0.0 })
.collect()
})
.collect();
let class_counts: Vec<usize> = (0..k)
.map(|cls| data.target.iter().filter(|&&y| (y as usize) == cls).count())
.collect();
let init_preds: Vec<f64> = class_counts
.iter()
.map(|&c| (c as f64 / n as f64).clamp(1e-7, 1.0 - 1e-7).ln())
.collect();
self.init_predictions.clone_from(&init_preds);
let mut f_vals: Vec<Vec<f64>> = (0..k).map(|c| vec![init_preds[c]; n]).collect();
let mut trees_all: Vec<Vec<HistTree>> = (0..k)
.map(|_| Vec::with_capacity(self.n_estimators))
.collect();
for _ in 0..self.n_estimators {
let probs = softmax_matrix(&f_vals, n, k);
for cls in 0..k {
let gradients: Vec<f64> =
(0..n).map(|i| probs[cls][i] - y_onehot[cls][i]).collect();
let hessians: Vec<f64> = (0..n)
.map(|i| (probs[cls][i] * (1.0 - probs[cls][i])).max(1e-10))
.collect();
let tree = build_tree_leaf_wise(
binned,
&gradients,
&hessians,
all_indices,
self.max_leaf_nodes,
min_leaf,
self.max_depth,
self.l2_regularization,
self.n_features,
);
for &i in all_indices {
let sample: Vec<u8> = binned.iter().map(|col| col[i]).collect();
f_vals[cls][i] += self.learning_rate * tree.predict_one(&sample);
}
trees_all[cls].push(tree);
}
}
self.trees = trees_all;
self.fitted = true;
Ok(())
}
pub fn predict(&self, features: &[Vec<f64>]) -> Result<Vec<f64>> {
crate::version::check_schema_version(self._schema_version)?;
if !self.fitted {
return Err(ScryLearnError::NotFitted);
}
let proba = self.predict_proba(features)?;
Ok(proba
.iter()
.map(|row| {
row.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0.0, |(idx, _)| idx as f64)
})
.collect())
}
pub fn predict_proba(&self, features: &[Vec<f64>]) -> Result<Vec<Vec<f64>>> {
if !self.fitted {
return Err(ScryLearnError::NotFitted);
}
let n = features.len();
let k = self.n_classes;
if k == 2 {
let mut f_vals = vec![self.init_predictions[0]; n];
for tree in &self.trees[0] {
for (i, sample) in features.iter().enumerate() {
f_vals[i] += self.learning_rate * tree.predict_one_raw(sample, &self.binner);
}
}
Ok(f_vals
.iter()
.map(|&f| {
let p = sigmoid(f);
vec![1.0 - p, p]
})
.collect())
} else {
let mut f_vals: Vec<Vec<f64>> =
(0..k).map(|c| vec![self.init_predictions[c]; n]).collect();
for (cls_vals, cls_trees) in f_vals.iter_mut().zip(self.trees.iter()).take(k) {
for tree in cls_trees {
for (i, sample) in features.iter().enumerate() {
cls_vals[i] +=
self.learning_rate * tree.predict_one_raw(sample, &self.binner);
}
}
}
let probs = softmax_matrix(&f_vals, n, k);
Ok((0..n)
.map(|i| (0..k).map(|c| probs[c][i]).collect())
.collect())
}
}
pub fn feature_importances(&self) -> Result<Vec<f64>> {
if !self.fitted {
return Err(ScryLearnError::NotFitted);
}
let m = self.n_features;
let mut imp = vec![0.0; m];
for tree_seq in &self.trees {
for tree in tree_seq {
let ti = tree.feature_importances(m);
for (i, &v) in ti.iter().enumerate() {
imp[i] += v;
}
}
}
let total: f64 = imp.iter().sum();
if total > 0.0 {
for v in &mut imp {
*v /= total;
}
}
Ok(imp)
}
pub fn n_trees(&self) -> usize {
self.trees.iter().map(Vec::len).sum()
}
pub fn n_classes(&self) -> usize {
self.n_classes
}
pub fn n_features(&self) -> usize {
self.n_features
}
pub fn learning_rate_val(&self) -> f64 {
self.learning_rate
}
pub fn init_predictions_val(&self) -> &[f64] {
&self.init_predictions
}
pub fn class_tree_node_views(&self) -> Vec<Vec<Vec<HistNodeView>>> {
self.trees
.iter()
.map(|class_trees| {
class_trees
.iter()
.map(|tree| tree.to_node_views(&self.binner))
.collect()
})
.collect()
}
}
impl Default for HistGradientBoostingClassifier {
fn default() -> Self {
Self::new()
}
}
#[inline]
fn sigmoid(x: f64) -> f64 {
1.0 / (1.0 + (-x).exp())
}
fn softmax_matrix(f_vals: &[Vec<f64>], n: usize, k: usize) -> Vec<Vec<f64>> {
let mut result: Vec<Vec<f64>> = vec![vec![0.0; n]; k];
for i in 0..n {
let max_f = (0..k)
.map(|c| f_vals[c][i])
.fold(f64::NEG_INFINITY, f64::max);
let exp_sum: f64 = (0..k).map(|c| (f_vals[c][i] - max_f).exp()).sum();
for c in 0..k {
result[c][i] = (f_vals[c][i] - max_f).exp() / exp_sum;
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::metrics::{accuracy, r2_score};
fn simple_regression_data() -> Dataset {
let x: Vec<f64> = (0..100).map(|i| i as f64 * 0.1).collect();
let y: Vec<f64> = x.iter().map(|&v| 2.0 * v + 1.0).collect();
Dataset::new(vec![x], y, vec!["x".into()], "y")
}
fn simple_classification_data() -> Dataset {
let n = 200;
let mut f1 = Vec::with_capacity(n);
let mut f2 = Vec::with_capacity(n);
let mut target = Vec::with_capacity(n);
let mut rng = crate::rng::FastRng::new(42);
for _ in 0..n / 2 {
f1.push(rng.f64() * 2.0);
f2.push(rng.f64() * 2.0);
target.push(0.0);
}
for _ in 0..n / 2 {
f1.push(5.0 + rng.f64() * 2.0);
f2.push(5.0 + rng.f64() * 2.0);
target.push(1.0);
}
Dataset::new(
vec![f1, f2],
target,
vec!["f1".into(), "f2".into()],
"class",
)
}
#[test]
fn test_hist_gbr_fit_predict() {
let data = simple_regression_data();
let mut model = HistGradientBoostingRegressor::new()
.n_estimators(50)
.learning_rate(0.1)
.max_leaf_nodes(15)
.min_samples_leaf(5);
model.fit(&data).unwrap();
let test_x = vec![vec![3.0], vec![5.0], vec![7.0]];
let preds = model.predict(&test_x).unwrap();
assert_eq!(preds.len(), 3);
assert!((preds[0] - 7.0).abs() < 1.5, "got {}", preds[0]);
assert!((preds[1] - 11.0).abs() < 1.5, "got {}", preds[1]);
}
#[test]
fn test_hist_gbr_r2() {
let data = simple_regression_data();
let mut model = HistGradientBoostingRegressor::new()
.n_estimators(100)
.learning_rate(0.1)
.max_leaf_nodes(31)
.min_samples_leaf(3);
model.fit(&data).unwrap();
let features = data.feature_matrix();
let preds = model.predict(&features).unwrap();
let r2 = r2_score(&data.target, &preds);
assert!(r2 > 0.95, "R² should be > 0.95, got {r2:.4}");
}
#[test]
fn test_hist_gbc_binary() {
let data = simple_classification_data();
let mut model = HistGradientBoostingClassifier::new()
.n_estimators(50)
.learning_rate(0.1)
.max_leaf_nodes(15)
.min_samples_leaf(5);
model.fit(&data).unwrap();
let features = data.feature_matrix();
let preds = model.predict(&features).unwrap();
let acc = accuracy(&data.target, &preds);
assert!(
acc > 0.90,
"accuracy should be > 90%, got {:.1}%",
acc * 100.0
);
}
#[test]
fn test_hist_gbc_multiclass() {
let n_per_class = 50;
let mut rng = crate::rng::FastRng::new(42);
let mut f1 = Vec::new();
let mut f2 = Vec::new();
let mut target = Vec::new();
for cls in 0..3 {
let offset = cls as f64 * 5.0;
for _ in 0..n_per_class {
f1.push(offset + rng.f64() * 2.0);
f2.push(offset + rng.f64() * 2.0);
target.push(cls as f64);
}
}
let data = Dataset::new(
vec![f1, f2],
target,
vec!["f1".into(), "f2".into()],
"class",
);
let mut model = HistGradientBoostingClassifier::new()
.n_estimators(50)
.learning_rate(0.1)
.max_leaf_nodes(15)
.min_samples_leaf(3);
model.fit(&data).unwrap();
let features = data.feature_matrix();
let preds = model.predict(&features).unwrap();
let acc = accuracy(&data.target, &preds);
assert!(
acc > 0.90,
"multiclass accuracy > 90%, got {:.1}%",
acc * 100.0
);
}
#[test]
fn test_hist_gbc_predict_proba() {
let data = simple_classification_data();
let mut model = HistGradientBoostingClassifier::new()
.n_estimators(30)
.learning_rate(0.1)
.min_samples_leaf(5);
model.fit(&data).unwrap();
let features = data.feature_matrix();
let proba = model.predict_proba(&features).unwrap();
for row in &proba {
let sum: f64 = row.iter().sum();
assert!((sum - 1.0).abs() < 1e-6, "probabilities should sum to 1.0");
for &p in row {
assert!((0.0..=1.0).contains(&p), "probability out of range: {p}");
}
}
}
#[test]
fn test_hist_gbr_not_fitted() {
let model = HistGradientBoostingRegressor::new();
let result = model.predict(&[vec![1.0]]);
assert!(result.is_err());
}
#[test]
fn test_hist_gbc_not_fitted() {
let model = HistGradientBoostingClassifier::new();
let result = model.predict(&[vec![1.0]]);
assert!(result.is_err());
}
#[test]
fn test_hist_gbr_feature_importances() {
let data = simple_regression_data();
let mut model = HistGradientBoostingRegressor::new()
.n_estimators(50)
.min_samples_leaf(3);
model.fit(&data).unwrap();
let imp = model.feature_importances().unwrap();
assert_eq!(imp.len(), 1);
let sum: f64 = imp.iter().sum();
assert!((sum - 1.0).abs() < 1e-6 || sum == 0.0);
}
#[test]
fn test_hist_gbr_with_nan() {
let x: Vec<f64> = (0..100)
.map(|i| {
if i % 10 == 0 {
f64::NAN
} else {
i as f64 * 0.1
}
})
.collect();
let y: Vec<f64> = (0..100).map(|i| i as f64 * 0.2 + 1.0).collect();
let data = Dataset::new(vec![x], y, vec!["x".into()], "y");
let mut model = HistGradientBoostingRegressor::new()
.n_estimators(50)
.min_samples_leaf(3);
model.fit(&data).unwrap();
let preds = model.predict(&[vec![f64::NAN], vec![5.0]]).unwrap();
assert_eq!(preds.len(), 2);
assert!(
!preds[0].is_nan(),
"NaN input should produce a finite prediction"
);
assert!(!preds[1].is_nan());
}
}