use anofox_ml_core::{Fit, Predict, Result, RustMlError};
use ndarray::{Array1, Array2};
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
/// Default value of the `max_bins` hyper-parameter; `find_best_split` also uses it as the
/// fixed number of histogram slots per feature.
const DEFAULT_MAX_BINS: usize = 255;
/// Sentinel bin index stored for NaN (missing) feature values.
const MISSING_BIN: u8 = 255;
/// Binning metadata for a single feature, produced by `compute_bins` and consumed by `bin_row`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
struct FeatureBins {
// Numeric feature: ascending thresholds; a value's bin is the number of edges strictly below it.
// Categorical feature: the retained category values; a value's bin is its position in this list.
edges: Vec<f64>,
// Selects exact-match (categorical) binning instead of threshold (numeric) binning.
is_categorical: bool,
}
/// Discretises every column of `x` into `u8` bin indices.
///
/// Returns the binned matrix (same shape as `x`) together with one [`FeatureBins`] per column,
/// which is what `bin_row` needs to bin unseen rows the same way.
///
/// * NaN values always map to `MISSING_BIN`.
/// * Columns listed in `categorical_features` keep their first `max_bins - 1` distinct values
///   (ascending) as categories; a value's bin is the position of its category. Values that match
///   no retained category go to the catch-all bin `MISSING_BIN - 1`, exactly as `bin_row` does,
///   so training rows and prediction rows are routed identically.
/// * Every other column is numeric: distinct values are greedily grouped so that each group
///   holds at least `min_data_in_bin` samples (at most `max_bins - 1` groups), and the edges are
///   the midpoints between neighbouring groups.
///
/// Bin indices are clamped to `MISSING_BIN - 1` *before* being narrowed to `u8`, so a large
/// `max_bins` can no longer wrap around. `max_bins == 0` is treated like `max_bins == 1`
/// instead of underflowing.
fn compute_bins(
    x: &Array2<f64>,
    max_bins: usize,
    categorical_features: &[usize],
    min_data_in_bin: usize,
) -> (Array2<u8>, Vec<FeatureBins>) {
    let n = x.nrows();
    let p = x.ncols();
    // Number of value bins available once one slot is reserved; never underflows.
    let value_bin_budget = max_bins.saturating_sub(1);
    // Highest index a non-missing value may occupy.
    let last_value_bin = usize::from(MISSING_BIN - 1);

    let mut binned = Array2::zeros((n, p));
    let mut all_bins = Vec::with_capacity(p);
    let cat_set: std::collections::HashSet<usize> = categorical_features.iter().copied().collect();

    for j in 0..p {
        let is_categorical = cat_set.contains(&j);

        // Sorted non-missing values of this column.
        let mut present: Vec<f64> = (0..n)
            .map(|i| x[[i, j]])
            .filter(|v| !v.is_nan())
            .collect();
        present.sort_by(|a, b| a.partial_cmp(b).expect("NaN values were filtered out"));

        let mut edges: Vec<f64> = Vec::new();
        if is_categorical {
            present.dedup();
            edges.extend(present.iter().copied().take(value_bin_budget));
        } else {
            // Run-length encode the sorted values into (value, occurrences).
            let mut distinct: Vec<(f64, usize)> = Vec::new();
            for &v in &present {
                match distinct.last_mut() {
                    Some((value, count)) if *value == v => *count += 1,
                    _ => distinct.push((v, 1)),
                }
            }

            if distinct.len() > 1 {
                let min_per = min_data_in_bin.max(1);
                // Exclusive end index (into `distinct`) of every group.
                let mut group_ends: Vec<usize> = Vec::new();
                let mut cur_count = 0usize;
                for (idx, &(_, count)) in distinct.iter().enumerate() {
                    cur_count += count;
                    if cur_count >= min_per {
                        group_ends.push(idx + 1);
                        cur_count = 0;
                        if group_ends.len() >= value_bin_budget {
                            break;
                        }
                    }
                }
                // Whatever is left over either forms a final group or is folded into the last one.
                if cur_count > 0 || group_ends.is_empty() {
                    group_ends.push(distinct.len());
                } else if let Some(last) = group_ends.last_mut() {
                    *last = distinct.len();
                }
                // One edge between each pair of neighbouring groups.
                for &end in &group_ends[..group_ends.len() - 1] {
                    let last_in_group = distinct[end - 1].0;
                    let first_in_next = distinct[end].0;
                    edges.push(0.5 * (last_in_group + first_in_next));
                }
            }
        }

        for i in 0..n {
            let v = x[[i, j]];
            let bin = if v.is_nan() {
                MISSING_BIN
            } else if is_categorical {
                edges
                    .iter()
                    .position(|&e| (e - v).abs() < 1e-12)
                    .map_or(last_value_bin, |b| b.min(last_value_bin)) as u8
            } else {
                // Clamp in `usize` first: casting first would wrap for indices above 255.
                edges.partition_point(|&e| e < v).min(last_value_bin) as u8
            };
            binned[[i, j]] = bin;
        }

        all_bins.push(FeatureBins {
            edges,
            is_categorical,
        });
    }
    (binned, all_bins)
}
/// Maps one raw feature row to bin indices using previously fitted bin definitions.
///
/// * NaN maps to `MISSING_BIN`.
/// * Categorical features map to the position of the matching category (tolerance `1e-12`);
///   values that match no category go to the catch-all bin `MISSING_BIN - 1`.
/// * Numeric features map to the number of edges strictly below the value.
///
/// Indices are clamped to `MISSING_BIN - 1` before being narrowed to `u8`; the previous
/// cast-then-clamp order wrapped around for features with more than 255 edges.
/// If `row` and `all_bins` differ in length, the shorter one decides the output length.
fn bin_row(row: &[f64], all_bins: &[FeatureBins]) -> Vec<u8> {
    let last_value_bin = usize::from(MISSING_BIN - 1);
    row.iter()
        .zip(all_bins)
        .map(|(&value, feature)| {
            if value.is_nan() {
                return MISSING_BIN;
            }
            let index = if feature.is_categorical {
                feature
                    .edges
                    .iter()
                    .position(|&e| (e - value).abs() < 1e-12)
                    .unwrap_or(last_value_bin)
            } else {
                feature.edges.partition_point(|&e| e < value)
            };
            index.min(last_value_bin) as u8
        })
        .collect()
}
/// Per-bin gradient statistics for one feature, with a separate bucket for missing values.
#[derive(Clone)]
struct Histogram {
    /// Sum of gradients per bin.
    grad_sum: Vec<f64>,
    /// Sum of hessians per bin.
    hess_sum: Vec<f64>,
    /// Number of samples per bin.
    count: Vec<u32>,
    /// Gradient sum of the samples whose bin is `MISSING_BIN`.
    missing_grad: f64,
    /// Hessian sum of the samples whose bin is `MISSING_BIN`.
    missing_hess: f64,
    /// Number of samples whose bin is `MISSING_BIN`.
    missing_count: u32,
}

impl Histogram {
    /// Creates an all-zero histogram with `n_bins` regular bins.
    fn new(n_bins: usize) -> Self {
        let zeros = vec![0.0_f64; n_bins];
        Self {
            grad_sum: zeros.clone(),
            hess_sum: zeros,
            count: vec![0_u32; n_bins],
            missing_grad: 0.0,
            missing_hess: 0.0,
            missing_count: 0,
        }
    }

    /// Zeroes every accumulator while keeping the allocated bins.
    fn reset(&mut self) {
        for g in self.grad_sum.iter_mut() {
            *g = 0.0;
        }
        for h in self.hess_sum.iter_mut() {
            *h = 0.0;
        }
        for c in self.count.iter_mut() {
            *c = 0;
        }
        self.missing_grad = 0.0;
        self.missing_hess = 0.0;
        self.missing_count = 0;
    }

    /// Adds one sample's gradient and hessian to the bucket selected by `bin`.
    ///
    /// Panics if `bin` is a regular bin outside the range given to [`Histogram::new`].
    fn accumulate(&mut self, bin: u8, grad: f64, hess: f64) {
        if bin == MISSING_BIN {
            self.missing_grad += grad;
            self.missing_hess += hess;
            self.missing_count += 1;
            return;
        }
        let slot = usize::from(bin);
        self.grad_sum[slot] += grad;
        self.hess_sum[slot] += hess;
        self.count[slot] += 1;
    }
}
/// Description of the best split found for a node by `find_best_split`.
#[derive(Clone)]
// Not every field is read in the code visible here (e.g. `left_count`, `right_count`),
// which is presumably why dead-code warnings are silenced.
#[allow(dead_code)]
struct BestSplit {
// Column index of the feature that is split on.
feature: usize,
// Last bin (in scan order) that is routed to the left child.
bin_threshold: u8,
// Gain of the split: 0.5 * (left gain + right gain - parent gain).
gain: f64,
// Regularised leaf output of the left child.
left_value: f64,
// Regularised leaf output of the right child.
right_value: f64,
// Number of samples routed left (includes missing-value samples when `missing_left`).
left_count: usize,
// Number of samples routed right.
right_count: usize,
// Whether samples with a missing value go to the left child.
missing_left: bool,
// For categorical features: the bin scan order the threshold refers to; `None` for numeric.
categorical_order: Option<Vec<u8>>,
}
/// Optimal leaf output for the given gradient/hessian sums under L1 (`l1`) and L2 (`l2`)
/// regularisation.
///
/// The gradient sum is soft-thresholded by `l1`; when its magnitude does not exceed `l1`
/// the leaf output is `0.0`. Otherwise the result is `-sign(g) * (|g| - l1) / (h + l2)`.
#[inline]
fn leaf_value(grad_sum: f64, hess_sum: f64, l1: f64, l2: f64) -> f64 {
    let magnitude = grad_sum.abs();
    if magnitude <= l1 {
        return 0.0;
    }
    let shrunk = grad_sum.signum() * (magnitude - l1);
    let denominator = hess_sum + l2;
    -shrunk / denominator
}
/// Gain contribution of a leaf with the given gradient/hessian sums under L1 (`l1`) and
/// L2 (`l2`) regularisation: `(|g| - l1)^2 / (h + l2)`, or `0.0` when `|g| <= l1`.
#[inline]
fn leaf_gain(grad_sum: f64, hess_sum: f64, l1: f64, l2: f64) -> f64 {
    let magnitude = grad_sum.abs();
    if magnitude <= l1 {
        return 0.0;
    }
    let excess = magnitude - l1;
    let denominator = hess_sum + l2;
    excess * excess / denominator
}
/// Searches the candidate features for the split of the node `indices` with the highest gain.
///
/// For every feature in `feature_indices` a gradient histogram over the node's rows is built,
/// then the bins are scanned as cumulative "left" prefixes, once with missing values sent left
/// and once with them sent right. A split is accepted only if both children satisfy
/// `min_child_samples` and `min_child_weight`, the monotone constraint of the feature (numeric
/// features only) is respected, and its gain is strictly greater than both `min_split_gain`
/// and the best gain so far (which starts at `0.0`).
///
/// Returns `None` when no split qualifies.
#[allow(clippy::too_many_arguments)]
fn find_best_split(
binned_x: &Array2<u8>,
gradients: &[f64],
hessians: &[f64],
indices: &[usize],
feature_indices: &[usize],
all_bins: &[FeatureBins],
min_child_samples: usize,
min_child_weight: f64,
min_split_gain: f64,
l1: f64,
l2: f64,
monotone_constraints: &[i8],
) -> Option<BestSplit> {
// The histogram always has DEFAULT_MAX_BINS slots, independent of the configured `max_bins`.
let n_bins = DEFAULT_MAX_BINS;
let mut best: Option<BestSplit> = None;
let mut hist = Histogram::new(n_bins);
// Node-level totals; the right child's statistics are derived as total minus left.
let total_grad: f64 = indices.iter().map(|&i| gradients[i]).sum();
let total_hess: f64 = indices.iter().map(|&i| hessians[i]).sum();
let total_count = indices.len();
let parent_gain = leaf_gain(total_grad, total_hess, l1, l2);
for &feat in feature_indices {
// One histogram buffer is reused for every feature.
hist.reset();
for &i in indices {
hist.accumulate(binned_x[[i, feat]], gradients[i], hessians[i]);
}
// Categorical bins are scanned in ascending grad/hess ratio (bins with a hessian sum of at
// most 1e-10 count as ratio 0); numeric bins are scanned in their natural order.
let scan_order: Vec<u8> = if all_bins[feat].is_categorical {
let mut order: Vec<u8> = (0..n_bins as u8).collect();
order.sort_by(|&a, &b| {
let ra = if hist.hess_sum[a as usize] > 1e-10 {
hist.grad_sum[a as usize] / hist.hess_sum[a as usize]
} else {
0.0
};
let rb = if hist.hess_sum[b as usize] > 1e-10 {
hist.grad_sum[b as usize] / hist.hess_sum[b as usize]
} else {
0.0
};
ra.partial_cmp(&rb).unwrap_or(Ordering::Equal)
});
order
} else {
(0..n_bins as u8).collect()
};
// Try both default directions for missing values.
for &missing_left in &[true, false] {
let mut left_grad = if missing_left { hist.missing_grad } else { 0.0 };
let mut left_hess = if missing_left { hist.missing_hess } else { 0.0 };
let mut left_count: usize = if missing_left {
hist.missing_count as usize
} else {
0
};
for (idx, &bin) in scan_order.iter().enumerate() {
// The last bin in scan order is never used as a threshold.
if idx >= scan_order.len() - 1 {
break;
}
let b = bin as usize;
left_grad += hist.grad_sum[b];
left_hess += hist.hess_sum[b];
left_count += hist.count[b] as usize;
if left_count < min_child_samples || left_hess < min_child_weight {
continue;
}
let right_count = total_count - left_count;
let right_grad = total_grad - left_grad;
let right_hess = total_hess - left_hess;
if right_count < min_child_samples || right_hess < min_child_weight {
continue;
}
let left_g = leaf_gain(left_grad, left_hess, l1, l2);
let right_g = leaf_gain(right_grad, right_hess, l1, l2);
let gain = 0.5 * (left_g + right_g - parent_gain);
let left_val = leaf_value(left_grad, left_hess, l1, l2);
let right_val = leaf_value(right_grad, right_hess, l1, l2);
// A missing constraint entry means "unconstrained".
let mono = monotone_constraints.get(feat).copied().unwrap_or(0);
if !all_bins[feat].is_categorical && mono != 0 {
match mono {
// +1: the left leaf must not exceed the right leaf.
1 if left_val > right_val => continue,
// -1: the left leaf must not be below the right leaf.
-1 if left_val < right_val => continue,
_ => {}
}
}
// Strict comparisons: ties keep the earlier candidate, and a gain of exactly 0 never wins.
if gain > min_split_gain && gain > best.as_ref().map_or(0.0, |b| b.gain) {
best = Some(BestSplit {
feature: feat,
bin_threshold: bin,
gain,
left_value: left_val,
right_value: right_val,
left_count,
right_count,
missing_left,
categorical_order: if all_bins[feat].is_categorical {
Some(scan_order.clone())
} else {
None
},
});
}
}
}
}
best
}
/// A node of a fitted tree that operates on binned feature values.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
enum TreeNode {
// Terminal node carrying the (unscaled) prediction contribution.
Leaf {
value: f64,
},
// Decision node.
Internal {
// Index into the binned row of the feature that is tested.
feature: usize,
// Numeric split: bins `<= bin_threshold` go left.
// Categorical split: bins ranked at or before this bin in `categorical_order` go left.
bin_threshold: u8,
// Direction taken by rows whose bin is `MISSING_BIN`.
missing_left: bool,
// Bin ranking used by categorical splits; `None` for numeric splits.
categorical_order: Option<Vec<u8>>,
left: Box<TreeNode>,
right: Box<TreeNode>,
},
}
impl TreeNode {
fn predict_binned(&self, bins: &[u8]) -> f64 {
match self {
TreeNode::Leaf { value } => *value,
TreeNode::Internal {
feature,
bin_threshold,
missing_left,
categorical_order,
left,
right,
} => {
let bin = bins[*feature];
let go_left = if bin == MISSING_BIN {
*missing_left
} else if let Some(order) = categorical_order {
let pos = order.iter().position(|&b| b == bin).unwrap_or(order.len());
let thresh_pos = order.iter().position(|&b| b == *bin_threshold).unwrap_or(0);
pos <= thresh_pos
} else {
bin <= *bin_threshold
};
if go_left {
left.predict_binned(bins)
} else {
right.predict_binned(bins)
}
}
}
}
}
/// A leaf that can still be split, queued in a max-heap keyed by the gain of its best split.
struct LeafCandidate {
    /// Index of the leaf inside the tree builder's node arena.
    node_id: usize,
    /// Gain of `split`; the only field that takes part in comparisons.
    gain: f64,
    /// Best split found for this leaf.
    split: BestSplit,
    /// Training rows that currently fall into this leaf.
    indices: Vec<usize>,
    /// Depth of the leaf (the root has depth 0).
    depth: usize,
}

// Equality is defined through `Ord::cmp` so that `a == b` holds exactly when
// `a.cmp(&b) == Ordering::Equal`, as the `Ord` contract requires. Previously `==` compared the
// raw `f64` gains, which disagreed with `cmp` for NaN (and made `Eq` non-reflexive).
impl PartialEq for LeafCandidate {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for LeafCandidate {}

impl PartialOrd for LeafCandidate {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for LeafCandidate {
    /// Orders candidates by gain; gains that cannot be compared (NaN) are treated as equal.
    fn cmp(&self, other: &Self) -> Ordering {
        self.gain
            .partial_cmp(&other.gain)
            .unwrap_or(Ordering::Equal)
    }
}
/// Arena representation of a tree node used while the tree is being grown; children are
/// referenced by index into the node vector and converted to `TreeNode` by `flatten_tree`.
enum BuildNode {
// Node that has not been split (or cannot be split).
Leaf {
value: f64,
},
// Node that has been split; fields mirror `TreeNode::Internal`.
Internal {
feature: usize,
bin_threshold: u8,
missing_left: bool,
categorical_order: Option<Vec<u8>>,
// Arena index of the left child.
left: usize,
// Arena index of the right child.
right: usize,
},
}
#[allow(clippy::too_many_arguments)]
fn build_lgbm_tree(
binned_x: &Array2<u8>,
gradients: &[f64],
hessians: &[f64],
root_indices: Vec<usize>,
feature_indices: &[usize],
all_bins: &[FeatureBins],
params: &LgbmParams,
feature_gain_out: &mut [f64],
feature_split_count_out: &mut [u32],
) -> TreeNode {
let mut nodes: Vec<BuildNode> = Vec::new();
let root_grad: f64 = root_indices.iter().map(|&i| gradients[i]).sum();
let root_hess: f64 = root_indices.iter().map(|&i| hessians[i]).sum();
let root_value = leaf_value(root_grad, root_hess, params.reg_alpha, params.reg_lambda);
nodes.push(BuildNode::Leaf { value: root_value });
let root_split = find_best_split(
binned_x,
gradients,
hessians,
&root_indices,
feature_indices,
all_bins,
params.min_child_samples,
params.min_child_weight,
params.min_split_gain,
params.reg_alpha,
params.reg_lambda,
¶ms.monotone_constraints,
);
let mut heap: BinaryHeap<LeafCandidate> = BinaryHeap::new();
if let Some(split) = root_split {
heap.push(LeafCandidate {
node_id: 0,
gain: split.gain,
split,
indices: root_indices,
depth: 0,
});
}
let mut n_leaves = 1;
while n_leaves < params.num_leaves {
let candidate = match heap.pop() {
Some(c) => c,
None => break,
};
if let Some(max_d) = params.max_depth {
if candidate.depth >= max_d {
continue;
}
}
let split = candidate.split;
let indices = candidate.indices;
let (left_idx, right_idx): (Vec<usize>, Vec<usize>) = indices.iter().partition(|&&i| {
let bin = binned_x[[i, split.feature]];
if bin == MISSING_BIN {
split.missing_left
} else if let Some(ref order) = split.categorical_order {
let pos = order.iter().position(|&b| b == bin).unwrap_or(order.len());
let thresh_pos = order
.iter()
.position(|&b| b == split.bin_threshold)
.unwrap_or(0);
pos <= thresh_pos
} else {
bin <= split.bin_threshold
}
});
let left_id = nodes.len();
nodes.push(BuildNode::Leaf {
value: split.left_value,
});
let right_id = nodes.len();
nodes.push(BuildNode::Leaf {
value: split.right_value,
});
nodes[candidate.node_id] = BuildNode::Internal {
feature: split.feature,
bin_threshold: split.bin_threshold,
missing_left: split.missing_left,
categorical_order: split.categorical_order.clone(),
left: left_id,
right: right_id,
};
feature_gain_out[split.feature] += split.gain;
feature_split_count_out[split.feature] += 1;
n_leaves += 1;
for (child_id, child_indices, depth) in [
(left_id, left_idx, candidate.depth + 1),
(right_id, right_idx, candidate.depth + 1),
] {
if child_indices.len() < 2 * params.min_child_samples {
continue;
}
if let Some(max_d) = params.max_depth {
if depth >= max_d {
continue;
}
}
let child_split = find_best_split(
binned_x,
gradients,
hessians,
&child_indices,
feature_indices,
all_bins,
params.min_child_samples,
params.min_child_weight,
params.min_split_gain,
params.reg_alpha,
params.reg_lambda,
¶ms.monotone_constraints,
);
if let Some(s) = child_split {
heap.push(LeafCandidate {
node_id: child_id,
gain: s.gain,
split: s,
indices: child_indices,
depth,
});
}
}
}
flatten_tree(&nodes, 0)
}
/// Converts the arena node `id` (and, recursively, everything below it) into an owned
/// `TreeNode` tree.
fn flatten_tree(nodes: &[BuildNode], id: usize) -> TreeNode {
    let (feature, bin_threshold, missing_left, categorical_order, left_id, right_id) =
        match &nodes[id] {
            BuildNode::Leaf { value } => return TreeNode::Leaf { value: *value },
            BuildNode::Internal {
                feature,
                bin_threshold,
                missing_left,
                categorical_order,
                left,
                right,
            } => (
                *feature,
                *bin_threshold,
                *missing_left,
                categorical_order.clone(),
                *left,
                *right,
            ),
        };

    let left = Box::new(flatten_tree(nodes, left_id));
    let right = Box::new(flatten_tree(nodes, right_id));
    TreeNode::Internal {
        feature,
        bin_threshold,
        missing_left,
        categorical_order,
        left,
        right,
    }
}
/// Gradient-based one-side sampling.
///
/// Ranks `indices` by descending absolute gradient, keeps the top `top_rate` fraction with
/// weight `1.0`, and draws an `other_rate` fraction (of the total) at random from the remainder
/// with weight `(1 - top_rate) / other_rate`. Returns the selected indices and, in the same
/// order, their weights.
fn goss_sample(
    gradients: &[f64],
    indices: &[usize],
    top_rate: f64,
    other_rate: f64,
    rng: &mut StdRng,
) -> (Vec<usize>, Vec<f64>) {
    let n = indices.len();
    let top_n = (n as f64 * top_rate) as usize;
    let other_n = (n as f64 * other_rate) as usize;

    // Largest absolute gradient first; incomparable (NaN) pairs keep their relative order.
    let mut ranked = indices.to_vec();
    ranked.sort_by(|&a, &b| {
        match gradients[b].abs().partial_cmp(&gradients[a].abs()) {
            Some(order) => order,
            None => Ordering::Equal,
        }
    });

    let kept = top_n.min(n);
    let mut selected: Vec<usize> = ranked[..kept].to_vec();
    let mut weights: Vec<f64> = vec![1.0; kept];

    // Nothing to draw from the small-gradient remainder.
    if other_n == 0 || ranked.len() <= top_n {
        return (selected, weights);
    }

    let mut pool: Vec<usize> = ranked[top_n..].to_vec();
    pool.shuffle(rng);
    let take = other_n.min(pool.len());
    let amp = if other_rate > 0.0 {
        (1.0 - top_rate) / other_rate
    } else {
        0.0
    };
    selected.extend_from_slice(&pool[..take]);
    weights.resize(kept + take, amp);
    (selected, weights)
}
/// Row-sampling strategy used for each boosting iteration.
#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum BoostingType {
/// Plain gradient boosting; rows are sub-sampled uniformly according to `subsample`.
Gbdt,
/// Gradient-based one-side sampling; rows are selected by `goss_sample`.
Goss,
}
/// The default boosting type is [`BoostingType::Gbdt`].
impl Default for BoostingType {
fn default() -> Self {
BoostingType::Gbdt
}
}
/// Loss function optimised by the booster.
#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LgbmObjective {
/// Squared error: gradient is `prediction - target`.
Regression,
/// Absolute error: gradient is the sign of `prediction - target`.
RegressionL1,
/// Huber loss: squared error within `huber_delta`, linear beyond it.
Huber,
/// Classification objective; `compute_regression_gradients` does not handle it.
Binary,
/// Classification objective; `compute_regression_gradients` does not handle it.
Multiclass,
}
/// Tree-growing hyper-parameters handed to `build_lgbm_tree`.
#[derive(Clone)]
struct LgbmParams {
// Maximum number of leaves per tree.
num_leaves: usize,
// Maximum depth of a leaf that may still be split; `None` means unlimited.
max_depth: Option<usize>,
// Minimum number of samples in each child of a split.
min_child_samples: usize,
// Minimum hessian sum in each child of a split.
min_child_weight: f64,
// A split must have a gain strictly greater than this.
min_split_gain: f64,
// L1 regularisation on leaf values.
reg_alpha: f64,
// L2 regularisation on leaf values.
reg_lambda: f64,
// Per-feature constraint: 1 = increasing, -1 = decreasing, 0 or absent = none.
monotone_constraints: Vec<i8>,
}
/// Hyper-parameters of the histogram-based gradient boosting regressor.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LgbmRegressor {
/// Maximum number of boosting iterations (trees).
pub n_estimators: usize,
/// Maximum number of leaves per tree.
pub num_leaves: usize,
/// Maximum tree depth; `None` means unlimited.
pub max_depth: Option<usize>,
/// Shrinkage applied to every tree's output.
pub learning_rate: f64,
/// Minimum number of samples in each child of a split.
pub min_child_samples: usize,
/// Minimum hessian sum in each child of a split.
pub min_child_weight: f64,
/// A split must have a gain strictly greater than this.
pub min_split_gain: f64,
/// L1 regularisation on leaf values.
pub reg_alpha: f64,
/// L2 regularisation on leaf values.
pub reg_lambda: f64,
/// Fraction of rows sampled per tree (used with `BoostingType::Gbdt`).
pub subsample: f64,
/// Fraction of features sampled per tree.
pub colsample_bytree: f64,
/// Bin budget passed to `compute_bins`.
pub max_bins: usize,
/// Minimum number of samples per numeric bin.
pub min_data_in_bin: usize,
/// Row-sampling strategy.
pub boosting_type: BoostingType,
/// GOSS: fraction of rows with the largest gradients that is always kept.
pub goss_top_rate: f64,
/// GOSS: fraction of rows randomly drawn from the remaining rows.
pub goss_other_rate: f64,
/// Loss function.
pub objective: LgbmObjective,
/// Threshold between the quadratic and linear part of the Huber loss.
pub huber_delta: f64,
/// Stop after this many consecutive iterations without improvement on the eval set.
pub early_stopping_rounds: Option<usize>,
/// Column indices that are binned as categorical features.
pub categorical_features: Vec<usize>,
/// Per-feature monotone constraints (1, -1 or 0); empty means unconstrained.
pub monotone_constraints: Vec<i8>,
/// Seed of the random number generator used for row/feature sampling.
pub seed: u64,
}
impl LgbmRegressor {
    /// Creates a regressor with the default hyper-parameters
    /// (100 trees, 31 leaves, learning rate 0.1, squared-error objective, seed 0).
    pub fn new() -> Self {
        Self {
            n_estimators: 100,
            num_leaves: 31,
            max_depth: None,
            learning_rate: 0.1,
            min_child_samples: 20,
            min_child_weight: 1e-3,
            min_split_gain: 0.0,
            reg_alpha: 0.0,
            reg_lambda: 0.0,
            subsample: 1.0,
            colsample_bytree: 1.0,
            max_bins: DEFAULT_MAX_BINS,
            min_data_in_bin: 3,
            boosting_type: BoostingType::Gbdt,
            goss_top_rate: 0.2,
            goss_other_rate: 0.1,
            objective: LgbmObjective::Regression,
            huber_delta: 1.0,
            early_stopping_rounds: None,
            categorical_features: Vec::new(),
            monotone_constraints: Vec::new(),
            seed: 0,
        }
    }

    /// Sets the maximum number of boosting iterations.
    pub fn with_n_estimators(self, n: usize) -> Self {
        Self {
            n_estimators: n,
            ..self
        }
    }

    /// Sets the maximum number of leaves per tree.
    pub fn with_num_leaves(self, n: usize) -> Self {
        Self {
            num_leaves: n,
            ..self
        }
    }

    /// Sets the maximum tree depth (`None` for unlimited).
    pub fn with_max_depth(self, d: Option<usize>) -> Self {
        Self {
            max_depth: d,
            ..self
        }
    }

    /// Sets the shrinkage applied to every tree.
    pub fn with_learning_rate(self, lr: f64) -> Self {
        Self {
            learning_rate: lr,
            ..self
        }
    }

    /// Sets the minimum number of samples per child.
    pub fn with_min_child_samples(self, n: usize) -> Self {
        Self {
            min_child_samples: n,
            ..self
        }
    }

    /// Sets the minimum hessian sum per child.
    pub fn with_min_child_weight(self, w: f64) -> Self {
        Self {
            min_child_weight: w,
            ..self
        }
    }

    /// Sets the gain a split has to exceed.
    pub fn with_min_split_gain(self, g: f64) -> Self {
        Self {
            min_split_gain: g,
            ..self
        }
    }

    /// Sets the L1 regularisation strength.
    pub fn with_reg_alpha(self, a: f64) -> Self {
        Self {
            reg_alpha: a,
            ..self
        }
    }

    /// Sets the L2 regularisation strength.
    pub fn with_reg_lambda(self, l: f64) -> Self {
        Self {
            reg_lambda: l,
            ..self
        }
    }

    /// Sets the fraction of rows sampled per tree.
    pub fn with_subsample(self, s: f64) -> Self {
        Self {
            subsample: s,
            ..self
        }
    }

    /// Sets the fraction of features sampled per tree.
    pub fn with_colsample_bytree(self, c: f64) -> Self {
        Self {
            colsample_bytree: c,
            ..self
        }
    }

    /// Sets the minimum number of samples per numeric bin.
    pub fn with_min_data_in_bin(self, n: usize) -> Self {
        Self {
            min_data_in_bin: n,
            ..self
        }
    }

    /// Sets the row-sampling strategy.
    pub fn with_boosting_type(self, b: BoostingType) -> Self {
        Self {
            boosting_type: b,
            ..self
        }
    }

    /// Sets the GOSS rates: `top` for the large-gradient rows, `other` for the random remainder.
    pub fn with_goss_rates(self, top: f64, other: f64) -> Self {
        Self {
            goss_top_rate: top,
            goss_other_rate: other,
            ..self
        }
    }

    /// Sets the loss function.
    pub fn with_objective(self, o: LgbmObjective) -> Self {
        Self {
            objective: o,
            ..self
        }
    }

    /// Sets the Huber threshold.
    pub fn with_huber_delta(self, d: f64) -> Self {
        Self {
            huber_delta: d,
            ..self
        }
    }

    /// Sets the early-stopping patience (`None` disables early stopping).
    pub fn with_early_stopping(self, r: Option<usize>) -> Self {
        Self {
            early_stopping_rounds: r,
            ..self
        }
    }

    /// Sets the column indices that are treated as categorical.
    pub fn with_categorical_features(self, feats: Vec<usize>) -> Self {
        Self {
            categorical_features: feats,
            ..self
        }
    }

    /// Sets the per-feature monotone constraints.
    pub fn with_monotone_constraints(self, constraints: Vec<i8>) -> Self {
        Self {
            monotone_constraints: constraints,
            ..self
        }
    }

    /// Sets the random seed.
    pub fn with_seed(self, s: u64) -> Self {
        Self { seed: s, ..self }
    }

    /// Extracts the subset of hyper-parameters the tree builder needs.
    fn params(&self) -> LgbmParams {
        let Self {
            num_leaves,
            max_depth,
            min_child_samples,
            min_child_weight,
            min_split_gain,
            reg_alpha,
            reg_lambda,
            monotone_constraints,
            ..
        } = self;
        LgbmParams {
            num_leaves: *num_leaves,
            max_depth: *max_depth,
            min_child_samples: *min_child_samples,
            min_child_weight: *min_child_weight,
            min_split_gain: *min_split_gain,
            reg_alpha: *reg_alpha,
            reg_lambda: *reg_lambda,
            monotone_constraints: monotone_constraints.clone(),
        }
    }
}
/// The default regressor is the one returned by `LgbmRegressor::new`.
impl Default for LgbmRegressor {
fn default() -> Self {
Self::new()
}
}
/// Optional inputs for `fit_with_eval`. The default has every option unset and `verbose == 0`.
#[derive(Debug, Clone, Default)]
pub struct LgbmFitOptions<'a> {
    /// Per-sample weights for the training rows.
    pub sample_weight: Option<&'a Array1<f64>>,
    /// Per-sample initial raw scores for the training rows.
    pub init_score: Option<&'a Array1<f64>>,
    /// Validation features and targets, evaluated after every iteration.
    pub eval_set: Option<(&'a Array2<f64>, &'a Array1<f64>)>,
    /// Per-sample weights for the validation rows.
    pub eval_sample_weight: Option<&'a Array1<f64>>,
    /// Log the validation loss every `verbose` iterations; `0` disables logging.
    pub verbose: usize,
}
/// A trained boosting regressor.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FittedLgbmRegressor {
// Trees in the order they were fitted.
trees: Vec<TreeNode>,
// Bin definitions used to bin rows at prediction time.
bins: Vec<FeatureBins>,
// Constant every prediction starts from.
baseline: f64,
// Shrinkage applied to every tree's output.
learning_rate: f64,
// Number of feature columns seen during fitting.
n_features: usize,
// Total split gain accumulated per feature.
feature_gain: Vec<f64>,
// Number of splits per feature.
feature_split_count: Vec<u32>,
// 1-based iteration with the best validation loss (or the last iteration without eval set).
best_iteration: usize,
}
impl FittedLgbmRegressor {
    /// Number of trees stored in the model.
    pub fn n_estimators(&self) -> usize {
        self.trees.len()
    }

    /// Iteration recorded as best during fitting.
    pub fn best_iteration(&self) -> usize {
        self.best_iteration
    }

    /// Gain-based feature importances, normalised to sum to one.
    ///
    /// Returns a zero vector of length `n_features` when the total gain is not positive.
    pub fn feature_importances(&self) -> Array1<f64> {
        let total_gain: f64 = self.feature_gain.iter().sum();
        if total_gain > 0.0 {
            let shares: Vec<f64> = self
                .feature_gain
                .iter()
                .map(|gain| gain / total_gain)
                .collect();
            Array1::from_vec(shares)
        } else {
            Array1::zeros(self.n_features)
        }
    }

    /// Number of splits performed on each feature.
    pub fn feature_split_counts(&self) -> Array1<u32> {
        let counts = self.feature_split_count.clone();
        Array1::from_vec(counts)
    }
}
/// Computes first- and second-order derivatives of the regression loss for every sample.
///
/// With residual `r = preds[i] - y[i]`:
/// * `Regression`: gradient `r`;
/// * `RegressionL1`: gradient `sign(r)`, which is `0` for an exactly fitted sample
///   (`f64::signum` would return `1.0` for `0.0` and bias fitted samples downwards);
/// * `Huber`: gradient `r` when `|r| <= huber_delta`, otherwise `huber_delta * sign(r)`.
///
/// The hessian is `1.0` for every sample and objective.
///
/// # Panics
///
/// Panics if `preds` is shorter than `y`, or if `objective` is a classification objective.
fn compute_regression_gradients(
    y: &[f64],
    preds: &[f64],
    objective: LgbmObjective,
    huber_delta: f64,
) -> (Vec<f64>, Vec<f64>) {
    let n = y.len();
    let preds = &preds[..n];
    let residuals = preds.iter().zip(y).map(|(&pred, &target)| pred - target);

    let grad: Vec<f64> = match objective {
        LgbmObjective::Regression => residuals.collect(),
        LgbmObjective::RegressionL1 => residuals
            // The subgradient at zero is chosen as 0; NaN still propagates through `signum`.
            .map(|r| if r == 0.0 { 0.0 } else { r.signum() })
            .collect(),
        LgbmObjective::Huber => residuals
            .map(|r| {
                if r.abs() <= huber_delta {
                    r
                } else {
                    huber_delta * r.signum()
                }
            })
            .collect(),
        LgbmObjective::Binary | LgbmObjective::Multiclass => {
            unreachable!("regressor only supports regression objectives")
        }
    };
    (grad, vec![1.0; n])
}
/// Draws a sorted random subset of the feature indices `0..n_features`.
///
/// With `frac >= 1.0` every index is returned and `rng` is left untouched; otherwise
/// `ceil(n_features * frac)` indices (at least one, if any exist) are kept.
fn sample_features(n_features: usize, frac: f64, rng: &mut StdRng) -> Vec<usize> {
    let mut pool: Vec<usize> = (0..n_features).collect();
    if frac >= 1.0 {
        return pool;
    }
    let wanted = (n_features as f64 * frac).ceil() as usize;
    let keep = std::cmp::max(wanted, 1);
    pool.shuffle(rng);
    pool.truncate(keep);
    pool.sort_unstable();
    pool
}
/// Draws a sorted random subset of the row indices `0..n`.
///
/// With `frac >= 1.0` every index is returned and `rng` is left untouched; otherwise
/// `ceil(n * frac)` indices (at least one, if any exist) are kept.
fn sample_rows(n: usize, frac: f64, rng: &mut StdRng) -> Vec<usize> {
    let everything = || (0..n).collect::<Vec<usize>>();
    if frac >= 1.0 {
        return everything();
    }
    let target = ((n as f64 * frac).ceil() as usize).max(1);
    let mut chosen = everything();
    chosen.shuffle(rng);
    chosen.truncate(target);
    chosen.sort_unstable();
    chosen
}
impl LgbmRegressor {
pub fn fit_with_eval(
&self,
x: &Array2<f64>,
y: &Array1<f64>,
opts: &LgbmFitOptions<'_>,
) -> Result<FittedLgbmRegressor> {
if x.nrows() != y.len() {
return Err(RustMlError::ShapeMismatch(format!(
"X has {} rows but y has {} elements",
x.nrows(),
y.len()
)));
}
if x.is_empty() {
return Err(RustMlError::EmptyInput("training data is empty".into()));
}
if let Some(sw) = opts.sample_weight {
if sw.len() != y.len() {
return Err(RustMlError::ShapeMismatch(format!(
"sample_weight has {} entries but y has {}",
sw.len(),
y.len()
)));
}
}
if let Some(init) = opts.init_score {
if init.len() != y.len() {
return Err(RustMlError::ShapeMismatch(format!(
"init_score has {} entries but y has {}",
init.len(),
y.len()
)));
}
}
if !self.monotone_constraints.is_empty() && self.monotone_constraints.len() != x.ncols() {
return Err(RustMlError::InvalidParameter(format!(
"monotone_constraints has {} entries but X has {} features",
self.monotone_constraints.len(),
x.ncols()
)));
}
let n = x.nrows();
let p = x.ncols();
let (binned_x, bins) = compute_bins(
x,
self.max_bins,
&self.categorical_features,
self.min_data_in_bin,
);
let mean_y: f64 = y.iter().sum::<f64>() / n as f64;
let (baseline, mut preds): (f64, Vec<f64>) = if let Some(init) = opts.init_score {
let m = init.iter().sum::<f64>() / n as f64;
(m, init.to_vec())
} else {
(mean_y, vec![mean_y; n])
};
let mut trees: Vec<TreeNode> = Vec::with_capacity(self.n_estimators);
let mut feature_gain = vec![0.0f64; p];
let mut feature_split_count = vec![0u32; p];
let mut rng = StdRng::seed_from_u64(self.seed);
let params = self.params();
let eval_binned = opts.eval_set.as_ref().map(|(xe, _)| {
let (b, _) = compute_bins(
xe,
self.max_bins,
&self.categorical_features,
self.min_data_in_bin,
);
b
});
let mut eval_preds: Option<Vec<f64>> = opts
.eval_set
.as_ref()
.map(|(xe, _)| vec![baseline; xe.nrows()]);
let mut best_eval_loss = f64::INFINITY;
let mut best_iteration = 0usize;
let mut stale_rounds = 0usize;
for iter in 0..self.n_estimators {
let (mut gradients, mut hessians) = compute_regression_gradients(
y.as_slice().unwrap(),
&preds,
self.objective,
self.huber_delta,
);
if let Some(sw) = opts.sample_weight {
for i in 0..n {
gradients[i] *= sw[i];
hessians[i] *= sw[i];
}
}
let row_indices: Vec<usize> = match self.boosting_type {
BoostingType::Gbdt => sample_rows(n, self.subsample, &mut rng),
BoostingType::Goss => {
let all: Vec<usize> = (0..n).collect();
let (selected, amp_weights) = goss_sample(
&gradients,
&all,
self.goss_top_rate,
self.goss_other_rate,
&mut rng,
);
for (k, &i) in selected.iter().enumerate() {
gradients[i] *= amp_weights[k];
hessians[i] *= amp_weights[k];
}
selected
}
};
let feature_indices = sample_features(p, self.colsample_bytree, &mut rng);
let tree = build_lgbm_tree(
&binned_x,
&gradients,
&hessians,
row_indices,
&feature_indices,
&bins,
¶ms,
&mut feature_gain,
&mut feature_split_count,
);
for i in 0..n {
let row_bins: Vec<u8> = (0..p).map(|j| binned_x[[i, j]]).collect();
preds[i] += self.learning_rate * tree.predict_binned(&row_bins);
}
if let (Some((xe, ye)), Some(ref eb), Some(ref mut ep)) =
(opts.eval_set, eval_binned.as_ref(), eval_preds.as_mut())
{
for i in 0..xe.nrows() {
let row_bins: Vec<u8> = (0..p).map(|j| eb[[i, j]]).collect();
ep[i] += self.learning_rate * tree.predict_binned(&row_bins);
}
let loss = eval_loss(
ye.as_slice().unwrap(),
ep,
opts.eval_sample_weight,
self.objective,
self.huber_delta,
);
if loss < best_eval_loss - 1e-12 {
best_eval_loss = loss;
best_iteration = iter + 1;
stale_rounds = 0;
} else {
stale_rounds += 1;
}
if opts.verbose > 0 && (iter + 1) % opts.verbose == 0 {
eprintln!(
"[lgbm] iter {}: eval_loss={:.6} best={:.6}",
iter + 1,
loss,
best_eval_loss
);
}
if let Some(rounds) = self.early_stopping_rounds {
if stale_rounds >= rounds {
trees.push(tree);
break;
}
}
} else {
best_iteration = iter + 1;
}
trees.push(tree);
}
Ok(FittedLgbmRegressor {
trees,
bins,
baseline,
learning_rate: self.learning_rate,
n_features: p,
feature_gain,
feature_split_count,
best_iteration,
})
}
}
/// Weighted mean loss of `preds` against `y` for the given objective.
///
/// Per-sample loss with residual `r = pred - target`: `|r|` for `RegressionL1`, the Huber loss
/// for `Huber`, and `0.5 * r^2` for every other objective. Without `weights` every sample
/// weighs `1.0`. If the weights do not sum to a positive number, the unnormalised weighted sum
/// is returned.
fn eval_loss(
    y: &[f64],
    preds: &[f64],
    weights: Option<&Array1<f64>>,
    objective: LgbmObjective,
    huber_delta: f64,
) -> f64 {
    let mut weighted_loss = 0.0;
    let mut weight_total = 0.0;
    for (i, &target) in y.iter().enumerate() {
        let weight = match weights {
            Some(ws) => ws[i],
            None => 1.0,
        };
        let residual = preds[i] - target;
        let magnitude = residual.abs();
        let sample_loss = match objective {
            LgbmObjective::RegressionL1 => magnitude,
            // Linear tail of the Huber loss; the quadratic core falls through to the arm below.
            LgbmObjective::Huber if !(magnitude <= huber_delta) => {
                huber_delta * (magnitude - 0.5 * huber_delta)
            }
            _ => 0.5 * residual * residual,
        };
        weighted_loss += weight * sample_loss;
        weight_total += weight;
    }
    if weight_total > 0.0 {
        weighted_loss / weight_total
    } else {
        weighted_loss
    }
}
/// Plain fitting: delegates to `fit_with_eval` with default options
/// (no sample weights, no initial scores, no validation set).
impl Fit<f64> for LgbmRegressor {
type Fitted = FittedLgbmRegressor;
fn fit(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted> {
self.fit_with_eval(x, y, &LgbmFitOptions::default())
}
}
impl Predict<f64> for FittedLgbmRegressor {
    /// Predicts one value per row of `x`: the baseline plus the learning-rate-scaled output of
    /// every stored tree.
    ///
    /// Returns `ShapeMismatch` when `x` does not have the number of features seen at fit time.
    fn predict(&self, x: &Array2<f64>) -> Result<Array1<f64>> {
        let expected = self.n_features;
        if x.ncols() != expected {
            return Err(RustMlError::ShapeMismatch(format!(
                "expected {} features, got {}",
                self.n_features,
                x.ncols()
            )));
        }

        let n_rows = x.nrows();
        let mut preds = Array1::from_elem(n_rows, self.baseline);
        for i in 0..n_rows {
            let raw_row: Vec<f64> = (0..expected).map(|j| x[[i, j]]).collect();
            let binned = bin_row(&raw_row, &self.bins);
            let start = preds[i];
            preds[i] = self.trees.iter().fold(start, |acc, tree| {
                acc + self.learning_rate * tree.predict_binned(&binned)
            });
        }
        Ok(preds)
    }
}
/// How per-class weights are derived for the classifier.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum LgbmClassWeight {
/// Weight each class by `n_samples / (n_classes * class_count)`.
Balanced,
/// Explicit `(class label, weight)` pairs; labels that are not listed get weight `1.0`.
Manual(Vec<(f64, f64)>),
}
/// Hyper-parameters of the histogram-based gradient boosting classifier.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LgbmClassifier {
/// Maximum number of boosting iterations.
pub n_estimators: usize,
/// Maximum number of leaves per tree.
pub num_leaves: usize,
/// Maximum tree depth; `None` means unlimited.
pub max_depth: Option<usize>,
/// Shrinkage applied to every tree's output.
pub learning_rate: f64,
/// Minimum number of samples in each child of a split.
pub min_child_samples: usize,
/// Minimum hessian sum in each child of a split.
pub min_child_weight: f64,
/// A split must have a gain strictly greater than this.
pub min_split_gain: f64,
/// L1 regularisation on leaf values.
pub reg_alpha: f64,
/// L2 regularisation on leaf values.
pub reg_lambda: f64,
/// Fraction of rows sampled per tree (used with `BoostingType::Gbdt`).
pub subsample: f64,
/// Fraction of features sampled per tree.
pub colsample_bytree: f64,
/// Bin budget passed to `compute_bins`.
pub max_bins: usize,
/// Minimum number of samples per numeric bin.
pub min_data_in_bin: usize,
/// Row-sampling strategy.
pub boosting_type: BoostingType,
/// GOSS: fraction of rows with the largest gradients that is always kept.
pub goss_top_rate: f64,
/// GOSS: fraction of rows randomly drawn from the remaining rows.
pub goss_other_rate: f64,
/// Column indices that are binned as categorical features.
pub categorical_features: Vec<usize>,
/// Per-feature monotone constraints (1, -1 or 0); empty means unconstrained.
pub monotone_constraints: Vec<i8>,
/// Optional class re-weighting scheme.
pub class_weight: Option<LgbmClassWeight>,
/// Early-stopping patience; its use is not visible in this part of the file.
pub early_stopping_rounds: Option<usize>,
/// Seed of the random number generator used for row/feature sampling.
pub seed: u64,
}
impl LgbmClassifier {
    /// Creates a classifier with the default hyper-parameters
    /// (100 trees, 31 leaves, learning rate 0.1, no class weighting, seed 0).
    pub fn new() -> Self {
        Self {
            n_estimators: 100,
            num_leaves: 31,
            max_depth: None,
            learning_rate: 0.1,
            min_child_samples: 20,
            min_child_weight: 1e-3,
            min_split_gain: 0.0,
            reg_alpha: 0.0,
            reg_lambda: 0.0,
            subsample: 1.0,
            colsample_bytree: 1.0,
            max_bins: DEFAULT_MAX_BINS,
            min_data_in_bin: 3,
            boosting_type: BoostingType::Gbdt,
            goss_top_rate: 0.2,
            goss_other_rate: 0.1,
            categorical_features: Vec::new(),
            monotone_constraints: Vec::new(),
            class_weight: None,
            early_stopping_rounds: None,
            seed: 0,
        }
    }

    /// Sets the maximum number of boosting iterations.
    pub fn with_n_estimators(self, n: usize) -> Self {
        Self {
            n_estimators: n,
            ..self
        }
    }

    /// Sets the maximum number of leaves per tree.
    pub fn with_num_leaves(self, n: usize) -> Self {
        Self {
            num_leaves: n,
            ..self
        }
    }

    /// Sets the maximum tree depth (`None` for unlimited).
    pub fn with_max_depth(self, d: Option<usize>) -> Self {
        Self {
            max_depth: d,
            ..self
        }
    }

    /// Sets the shrinkage applied to every tree.
    pub fn with_learning_rate(self, lr: f64) -> Self {
        Self {
            learning_rate: lr,
            ..self
        }
    }

    /// Sets the minimum number of samples per child.
    pub fn with_min_child_samples(self, n: usize) -> Self {
        Self {
            min_child_samples: n,
            ..self
        }
    }

    /// Sets the minimum hessian sum per child.
    pub fn with_min_child_weight(self, w: f64) -> Self {
        Self {
            min_child_weight: w,
            ..self
        }
    }

    /// Sets the L1 regularisation strength.
    pub fn with_reg_alpha(self, a: f64) -> Self {
        Self {
            reg_alpha: a,
            ..self
        }
    }

    /// Sets the L2 regularisation strength.
    pub fn with_reg_lambda(self, l: f64) -> Self {
        Self {
            reg_lambda: l,
            ..self
        }
    }

    /// Sets the fraction of rows sampled per tree.
    pub fn with_subsample(self, s: f64) -> Self {
        Self {
            subsample: s,
            ..self
        }
    }

    /// Sets the fraction of features sampled per tree.
    pub fn with_colsample_bytree(self, c: f64) -> Self {
        Self {
            colsample_bytree: c,
            ..self
        }
    }

    /// Sets the minimum number of samples per numeric bin.
    pub fn with_min_data_in_bin(self, n: usize) -> Self {
        Self {
            min_data_in_bin: n,
            ..self
        }
    }

    /// Sets the row-sampling strategy.
    pub fn with_boosting_type(self, b: BoostingType) -> Self {
        Self {
            boosting_type: b,
            ..self
        }
    }

    /// Sets the column indices that are treated as categorical.
    pub fn with_categorical_features(self, feats: Vec<usize>) -> Self {
        Self {
            categorical_features: feats,
            ..self
        }
    }

    /// Sets the per-feature monotone constraints.
    pub fn with_monotone_constraints(self, c: Vec<i8>) -> Self {
        Self {
            monotone_constraints: c,
            ..self
        }
    }

    /// Sets the class re-weighting scheme (`None` weighs every sample `1.0`).
    pub fn with_class_weight(self, w: Option<LgbmClassWeight>) -> Self {
        Self {
            class_weight: w,
            ..self
        }
    }

    /// Sets the early-stopping patience.
    pub fn with_early_stopping(self, r: Option<usize>) -> Self {
        Self {
            early_stopping_rounds: r,
            ..self
        }
    }

    /// Sets the random seed.
    pub fn with_seed(self, s: u64) -> Self {
        Self { seed: s, ..self }
    }

    /// Extracts the subset of hyper-parameters the tree builder needs.
    fn params(&self) -> LgbmParams {
        let Self {
            num_leaves,
            max_depth,
            min_child_samples,
            min_child_weight,
            min_split_gain,
            reg_alpha,
            reg_lambda,
            monotone_constraints,
            ..
        } = self;
        LgbmParams {
            num_leaves: *num_leaves,
            max_depth: *max_depth,
            min_child_samples: *min_child_samples,
            min_child_weight: *min_child_weight,
            min_split_gain: *min_split_gain,
            reg_alpha: *reg_alpha,
            reg_lambda: *reg_lambda,
            monotone_constraints: monotone_constraints.clone(),
        }
    }

    /// Returns one weight per sample of `y`, derived from the configured class weighting.
    ///
    /// * no class weighting: every sample weighs `1.0`;
    /// * `Balanced`: `n / (n_classes * class_count)` for the sample's class;
    /// * `Manual`: the weight listed for the sample's label.
    ///
    /// Labels are looked up by their exact bit pattern; labels without an entry weigh `1.0`.
    fn compute_class_weights(&self, y: &Array1<f64>, classes: &[f64]) -> Vec<f64> {
        use std::collections::HashMap;

        let n = y.len();
        let weight_by_label: HashMap<u64, f64> = match &self.class_weight {
            None => return vec![1.0; n],
            Some(LgbmClassWeight::Manual(pairs)) => pairs
                .iter()
                .map(|&(label, weight)| (label.to_bits(), weight))
                .collect(),
            Some(LgbmClassWeight::Balanced) => {
                let n_classes = classes.len() as f64;
                classes
                    .iter()
                    .map(|&cls| {
                        let count = y.iter().filter(|&&v| v == cls).count() as f64;
                        let weight = if count > 0.0 {
                            n as f64 / (n_classes * count)
                        } else {
                            1.0
                        };
                        (cls.to_bits(), weight)
                    })
                    .collect()
            }
        };

        y.iter()
            .map(|label| {
                weight_by_label
                    .get(&label.to_bits())
                    .copied()
                    .unwrap_or(1.0)
            })
            .collect()
    }
}
/// The default classifier is the one returned by `LgbmClassifier::new`.
impl Default for LgbmClassifier {
fn default() -> Self {
Self::new()
}
}
/// A trained boosting classifier.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FittedLgbmClassifier {
// One tree sequence per raw score: `predict_proba` reads only `tree_sets[0]` when there are
// two classes and one set per class otherwise.
tree_sets: Vec<Vec<TreeNode>>,
// Bin definitions used to bin rows at prediction time.
bins: Vec<FeatureBins>,
// Initial raw score for each entry of `tree_sets`.
baselines: Vec<f64>,
// Class labels; column `c` of `predict_proba` corresponds to `classes[c]`.
classes: Vec<f64>,
// Shrinkage applied to every tree's output.
learning_rate: f64,
// Number of feature columns expected by `predict_proba`.
n_features: usize,
// Total split gain accumulated per feature.
feature_gain: Vec<f64>,
// Number of splits per feature.
feature_split_count: Vec<u32>,
}
impl FittedLgbmClassifier {
pub fn classes(&self) -> &[f64] {
&self.classes
}
pub fn n_estimators(&self) -> usize {
self.tree_sets.first().map_or(0, |ts| ts.len())
}
pub fn feature_importances(&self) -> Array1<f64> {
let sum: f64 = self.feature_gain.iter().sum();
if sum > 0.0 {
Array1::from_vec(self.feature_gain.iter().map(|g| g / sum).collect())
} else {
Array1::zeros(self.n_features)
}
}
pub fn predict_proba(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
if x.ncols() != self.n_features {
return Err(RustMlError::ShapeMismatch(format!(
"expected {} features, got {}",
self.n_features,
x.ncols()
)));
}
let n = x.nrows();
let n_classes = self.classes.len();
if n_classes == 2 {
let mut proba = Array2::zeros((n, 2));
for i in 0..n {
let row: Vec<f64> = (0..self.n_features).map(|j| x[[i, j]]).collect();
let bins = bin_row(&row, &self.bins);
let mut score = self.baselines[0];
for tree in &self.tree_sets[0] {
score += self.learning_rate * tree.predict_binned(&bins);
}
let p1 = 1.0 / (1.0 + (-score).exp());
proba[[i, 0]] = 1.0 - p1;
proba[[i, 1]] = p1;
}
Ok(proba)
} else {
let mut proba = Array2::zeros((n, n_classes));
for i in 0..n {
let row: Vec<f64> = (0..self.n_features).map(|j| x[[i, j]]).collect();
let bins = bin_row(&row, &self.bins);
let mut scores = vec![0.0; n_classes];
for (c, ts) in self.tree_sets.iter().enumerate() {
scores[c] = self.baselines[c];
for tree in ts {
scores[c] += self.learning_rate * tree.predict_binned(&bins);
}
}
let max_s = scores.iter().copied().fold(f64::NEG_INFINITY, f64::max);
let exp_sum: f64 = scores.iter().map(|&s| (s - max_s).exp()).sum();
for c in 0..n_classes {
proba[[i, c]] = (scores[c] - max_s).exp() / exp_sum;
}
}
Ok(proba)
}
}
}
impl LgbmClassifier {
    /// Fits the boosted-tree classifier on `x` / `y` with the extra options in `opts`.
    ///
    /// The sorted distinct values of `y` become the class labels. With exactly two classes a
    /// single tree sequence is trained on the logistic loss (the larger label is the positive
    /// class); otherwise one tree sequence per class is trained on the softmax loss.
    ///
    /// `opts.sample_weight`, when present, is multiplied row-wise with the weights returned by
    /// `compute_class_weights`. `opts.init_score`, when present, replaces the prior log-odds as
    /// the starting raw score of the two-class path only; the stored baseline is still the
    /// prior log-odds, and the multiclass path always starts from the log class priors.
    ///
    /// # Errors
    ///
    /// * `ShapeMismatch` if `x` and `y` disagree on the number of rows, or if
    ///   `opts.sample_weight` / `opts.init_score` do not have one entry per row.
    /// * `EmptyInput` if `x` has no elements.
    /// * `InvalidParameter` if `monotone_constraints` is non-empty but not one entry per
    ///   feature, if `y` contains NaN, or if `y` holds fewer than two distinct labels.
    pub fn fit_with_eval(
        &self,
        x: &Array2<f64>,
        y: &Array1<f64>,
        opts: &LgbmFitOptions<'_>,
    ) -> Result<FittedLgbmClassifier> {
        if x.nrows() != y.len() {
            return Err(RustMlError::ShapeMismatch(format!(
                "X has {} rows but y has {} elements",
                x.nrows(),
                y.len()
            )));
        }
        if x.is_empty() {
            return Err(RustMlError::EmptyInput("training data is empty".into()));
        }
        if !self.monotone_constraints.is_empty() && self.monotone_constraints.len() != x.ncols() {
            return Err(RustMlError::InvalidParameter(format!(
                "monotone_constraints has {} entries but X has {} features",
                self.monotone_constraints.len(),
                x.ncols()
            )));
        }
        let n = x.nrows();
        let p = x.ncols();

        // Per-row options are indexed by row below; reject wrong lengths up front instead of
        // panicking (too short) or silently ignoring the tail (too long).
        if let Some(sw) = opts.sample_weight {
            if sw.len() != n {
                return Err(RustMlError::ShapeMismatch(format!(
                    "sample_weight has {} elements but X has {} rows",
                    sw.len(),
                    n
                )));
            }
        }
        if let Some(init) = opts.init_score {
            if init.len() != n {
                return Err(RustMlError::ShapeMismatch(format!(
                    "init_score has {} elements but X has {} rows",
                    init.len(),
                    n
                )));
            }
        }
        // NaN labels cannot be ordered, so they can never form a class.
        if y.iter().any(|label| label.is_nan()) {
            return Err(RustMlError::InvalidParameter(
                "y contains NaN class labels".into(),
            ));
        }

        let (binned_x, bins) = compute_bins(
            x,
            self.max_bins,
            &self.categorical_features,
            self.min_data_in_bin,
        );
        // The binned rows never change during training: materialise them once rather than
        // once per row per tree.
        let binned_rows: Vec<Vec<u8>> = (0..n)
            .map(|i| (0..p).map(|j| binned_x[[i, j]]).collect())
            .collect();

        let mut classes: Vec<f64> = y.iter().copied().collect();
        classes.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("labels were checked to be NaN-free")
        });
        classes.dedup();
        let n_classes = classes.len();
        if n_classes < 2 {
            return Err(RustMlError::InvalidParameter(
                "need at least 2 classes".into(),
            ));
        }

        let class_weights = self.compute_class_weights(y, &classes);
        let effective_weights: Vec<f64> = (0..n)
            .map(|i| class_weights[i] * opts.sample_weight.map_or(1.0, |sw| sw[i]))
            .collect();
        let mut rng = StdRng::seed_from_u64(self.seed);
        let params = self.params();
        let mut feature_gain = vec![0.0f64; p];
        let mut feature_split_count = vec![0u32; p];

        if n_classes == 2 {
            // Two classes: one tree sequence on the logistic loss.
            let pos_class = classes[1];
            let labels: Vec<f64> = y
                .iter()
                .map(|&v| if v == pos_class { 1.0 } else { 0.0 })
                .collect();
            let pos_frac: f64 = labels.iter().sum::<f64>() / n as f64;
            // Prior log-odds; the epsilon keeps the denominator away from zero.
            let baseline = (pos_frac / (1.0 - pos_frac + 1e-15)).ln();
            let mut raw: Vec<f64> = if let Some(init) = opts.init_score {
                init.to_vec()
            } else {
                vec![baseline; n]
            };
            let mut trees = Vec::with_capacity(self.n_estimators);
            for _ in 0..self.n_estimators {
                // Weighted first/second derivatives of the logistic loss, sharing one sigmoid
                // evaluation per row.
                let mut gradients: Vec<f64> = Vec::with_capacity(n);
                let mut hessians: Vec<f64> = Vec::with_capacity(n);
                for ((&score, &label), &weight) in
                    raw.iter().zip(&labels).zip(&effective_weights)
                {
                    let pr = 1.0 / (1.0 + (-score).exp());
                    gradients.push((pr - label) * weight);
                    hessians.push((pr * (1.0 - pr)).max(1e-12) * weight);
                }
                let row_indices: Vec<usize> = match self.boosting_type {
                    BoostingType::Gbdt => sample_rows(n, self.subsample, &mut rng),
                    BoostingType::Goss => {
                        let all: Vec<usize> = (0..n).collect();
                        let (selected, amp) = goss_sample(
                            &gradients,
                            &all,
                            self.goss_top_rate,
                            self.goss_other_rate,
                            &mut rng,
                        );
                        // Scale each selected row by the factor returned alongside it.
                        for (k, &i) in selected.iter().enumerate() {
                            gradients[i] *= amp[k];
                            hessians[i] *= amp[k];
                        }
                        selected
                    }
                };
                let feature_indices = sample_features(p, self.colsample_bytree, &mut rng);
                let tree = build_lgbm_tree(
                    &binned_x,
                    &gradients,
                    &hessians,
                    row_indices,
                    &feature_indices,
                    &bins,
                    &params,
                    &mut feature_gain,
                    &mut feature_split_count,
                );
                // Every row's score is updated, not only the sampled ones.
                for (score, row_bins) in raw.iter_mut().zip(&binned_rows) {
                    *score += self.learning_rate * tree.predict_binned(row_bins);
                }
                trees.push(tree);
            }
            Ok(FittedLgbmClassifier {
                tree_sets: vec![trees],
                bins,
                baselines: vec![baseline],
                classes,
                learning_rate: self.learning_rate,
                n_features: p,
                feature_gain,
                feature_split_count,
            })
        } else {
            // More than two classes: one tree sequence per class on the softmax loss.
            let mut tree_sets: Vec<Vec<TreeNode>> =
                vec![Vec::with_capacity(self.n_estimators); n_classes];
            let mut baselines = Vec::with_capacity(n_classes);
            let mut raw_scores: Vec<Vec<f64>> = Vec::with_capacity(n_classes);
            for &cls in &classes {
                let count = y.iter().filter(|&&v| v == cls).count() as f64;
                let prior = (count / n as f64).max(1e-15);
                let bl = prior.ln();
                baselines.push(bl);
                raw_scores.push(vec![bl; n]);
            }
            for _ in 0..self.n_estimators {
                // Softmax over the current raw scores, shifted by the row maximum so the
                // exponentials cannot overflow.
                let mut probas = vec![vec![0.0; n_classes]; n];
                for i in 0..n {
                    let max_s = raw_scores
                        .iter()
                        .map(|s| s[i])
                        .fold(f64::NEG_INFINITY, f64::max);
                    let exp_sum: f64 = raw_scores.iter().map(|s| (s[i] - max_s).exp()).sum();
                    for c in 0..n_classes {
                        probas[i][c] = (raw_scores[c][i] - max_s).exp() / exp_sum;
                    }
                }
                for (c, &cls) in classes.iter().enumerate() {
                    let mut gradients: Vec<f64> = Vec::with_capacity(n);
                    let mut hessians: Vec<f64> = Vec::with_capacity(n);
                    for i in 0..n {
                        let lbl = if y[i] == cls { 1.0 } else { 0.0 };
                        let pr = probas[i][c];
                        gradients.push((pr - lbl) * effective_weights[i]);
                        hessians.push((pr * (1.0 - pr)).max(1e-12) * effective_weights[i]);
                    }
                    let row_indices: Vec<usize> = match self.boosting_type {
                        BoostingType::Gbdt => sample_rows(n, self.subsample, &mut rng),
                        BoostingType::Goss => {
                            let all: Vec<usize> = (0..n).collect();
                            let (selected, amp) = goss_sample(
                                &gradients,
                                &all,
                                self.goss_top_rate,
                                self.goss_other_rate,
                                &mut rng,
                            );
                            for (k, &i) in selected.iter().enumerate() {
                                gradients[i] *= amp[k];
                                hessians[i] *= amp[k];
                            }
                            selected
                        }
                    };
                    let feature_indices = sample_features(p, self.colsample_bytree, &mut rng);
                    let tree = build_lgbm_tree(
                        &binned_x,
                        &gradients,
                        &hessians,
                        row_indices,
                        &feature_indices,
                        &bins,
                        &params,
                        &mut feature_gain,
                        &mut feature_split_count,
                    );
                    for (score, row_bins) in raw_scores[c].iter_mut().zip(&binned_rows) {
                        *score += self.learning_rate * tree.predict_binned(row_bins);
                    }
                    tree_sets[c].push(tree);
                }
            }
            Ok(FittedLgbmClassifier {
                tree_sets,
                bins,
                baselines,
                classes,
                learning_rate: self.learning_rate,
                n_features: p,
                feature_gain,
                feature_split_count,
            })
        }
    }
}
impl Fit<f64> for LgbmClassifier {
    type Fitted = FittedLgbmClassifier;

    /// Trains on `x` / `y` by delegating to `fit_with_eval` with `LgbmFitOptions::default()`.
    fn fit(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted> {
        let default_opts = LgbmFitOptions::default();
        self.fit_with_eval(x, y, &default_opts)
    }
}
impl Predict<f64> for FittedLgbmClassifier {
    /// Returns, for every row of `x`, the class label with the highest predicted probability.
    ///
    /// A later class replaces the current winner only when its probability is strictly
    /// greater, so ties go to the class that comes first in `self.classes`.
    fn predict(&self, x: &Array2<f64>) -> Result<Array1<f64>> {
        let proba = self.predict_proba(x)?;
        let n_classes = self.classes.len();
        let labels: Vec<f64> = (0..x.nrows())
            .map(|row| {
                let (winner, _) = (1..n_classes).fold(
                    (0usize, proba[[row, 0]]),
                    |(best_c, best_p), c| {
                        let candidate = proba[[row, c]];
                        if candidate > best_p {
                            (c, candidate)
                        } else {
                            (best_c, best_p)
                        }
                    },
                );
                self.classes[winner]
            })
            .collect();
        Ok(Array1::from_vec(labels))
    }
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use ndarray::array;
/// Training predictions on the linear target `y = 2x` must land within 3.0 of the truth.
#[test]
fn test_lgbm_regressor_basic() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let regressor = LgbmRegressor::new()
        .with_n_estimators(50)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_learning_rate(0.1);
    let fitted = regressor.fit(&features, &targets).unwrap();
    let predictions = fitted.predict(&features).unwrap();
    for (&predicted, &expected) in predictions.iter().zip(targets.iter()) {
        assert_abs_diff_eq!(predicted, expected, epsilon = 3.0);
    }
}
/// NaN entries in the training features must still produce finite predictions.
#[test]
fn test_lgbm_regressor_nan_handling() {
    let missing = f64::NAN;
    let features = array![
        [1.0], [2.0], [missing], [4.0], [5.0],
        [6.0], [7.0], [missing], [9.0], [10.0]
    ];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(
        predictions.iter().all(|value| value.is_finite()),
        "prediction should be finite"
    );
}
/// Two well-separated clusters: at least 9 of the 10 training rows must be classified right.
#[test]
fn test_lgbm_classifier_binary() {
    let features = array![
        [1.0, 0.0], [2.0, 0.0], [3.0, 0.0], [4.0, 0.0], [5.0, 0.0],
        [10.0, 1.0], [11.0, 1.0], [12.0, 1.0], [13.0, 1.0], [14.0, 1.0]
    ];
    let labels = array![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    let hits: usize = predictions
        .iter()
        .zip(labels.iter())
        .filter(|(guess, truth)| guess == truth)
        .count();
    assert!(
        hits >= 9,
        "should classify most correctly, got {}/10",
        hits
    );
}
/// Three classes: labels are discovered in sorted order and every probability row sums to 1.
#[test]
fn test_lgbm_classifier_multiclass() {
    let features = array![
        [0.0, 0.0], [0.5, 0.5], [0.0, 0.5],
        [5.0, 0.0], [5.5, 0.5], [5.0, 0.5],
        [0.0, 10.0], [0.5, 10.5], [0.0, 10.5]
    ];
    let labels = array![0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    assert_eq!(model.classes(), &[0.0, 1.0, 2.0]);
    let probabilities = model.predict_proba(&features).unwrap();
    assert_eq!(probabilities.ncols(), 3);
    for row in 0..features.nrows() {
        let total: f64 = (0..3).map(|class| probabilities[[row, class]]).sum();
        assert_abs_diff_eq!(total, 1.0, epsilon = 1e-10);
    }
}
/// GOSS boosting must return one finite prediction per training row.
#[test]
fn test_lgbm_regressor_goss() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_boosting_type(BoostingType::Goss)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert_eq!(predictions.len(), 10);
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// A strong `reg_lambda` (100.0) must still yield finite predictions.
#[test]
fn test_lgbm_regressor_l1_l2() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_reg_lambda(100.0)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Row and column subsampling at 0.5 must still fit and predict all 10 rows.
#[test]
fn test_lgbm_regressor_subsample() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_subsample(0.5)
        .with_colsample_bytree(0.5)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert_eq!(predictions.len(), 10);
}
/// With 16 leaves the training squared error must not exceed the error with 2 leaves.
#[test]
fn test_lgbm_regressor_num_leaves_controls_complexity() {
    let x = Array2::from_shape_vec((20, 1), (0..20).map(f64::from).collect()).unwrap();
    let y = Array1::from_vec((0..20).map(|i| 2.0 * f64::from(i)).collect());

    let shallow = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(2)
        .with_min_child_samples(1)
        .fit(&x, &y)
        .unwrap();
    let deep = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(16)
        .with_min_child_samples(1)
        .fit(&x, &y)
        .unwrap();

    let shallow_preds = shallow.predict(&x).unwrap();
    let deep_preds = deep.predict(&x).unwrap();

    let mut err_small: f64 = 0.0;
    for (&pred, &truth) in shallow_preds.iter().zip(y.iter()) {
        err_small += (pred - truth).powi(2);
    }
    let mut err_large: f64 = 0.0;
    for (&pred, &truth) in deep_preds.iter().zip(y.iter()) {
        err_large += (pred - truth).powi(2);
    }

    assert!(
        err_large <= err_small + 1e-6,
        "larger num_leaves should fit better: small={}, large={}",
        err_small,
        err_large
    );
}
/// The informative first column must outrank the constant second column in importance.
#[test]
fn test_lgbm_feature_importances() {
    let features = array![
        [1.0, 100.0], [2.0, 100.0], [3.0, 100.0],
        [10.0, 100.0], [11.0, 100.0], [12.0, 100.0]
    ];
    let labels = array![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    let importances = model.feature_importances();
    assert!(importances[0] > importances[1]);
}
/// Fitting with 3 feature rows but only 2 targets must return an error.
#[test]
fn test_lgbm_shape_mismatch() {
    let features = array![[1.0], [2.0], [3.0]];
    let targets = array![1.0, 2.0];
    let outcome = LgbmRegressor::new().fit(&features, &targets);
    assert!(outcome.is_err());
}
/// Fitting on a zero-row matrix must return an error.
#[test]
fn test_lgbm_empty_input() {
    let features: Array2<f64> = Array2::zeros((0, 2));
    let targets: Array1<f64> = Array1::zeros(0);
    let outcome = LgbmRegressor::new().fit(&features, &targets);
    assert!(outcome.is_err());
}
/// A label vector holding a single distinct class must be rejected by the classifier.
#[test]
fn test_lgbm_classifier_single_class() {
    let features = array![[1.0], [2.0], [3.0]];
    let labels = array![0.0, 0.0, 0.0];
    let outcome = LgbmClassifier::new().fit(&features, &labels);
    assert!(outcome.is_err());
}
/// Rows 5..10 carry weight 10.0 (vs 0.01): their predictions must be finite and within 5.0.
#[test]
fn test_lgbm_regressor_sample_weight() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let weights = array![0.01, 0.01, 0.01, 0.01, 0.01, 10.0, 10.0, 10.0, 10.0, 10.0];
    let fit_options = LgbmFitOptions {
        sample_weight: Some(&weights),
        ..Default::default()
    };
    let model = LgbmRegressor::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit_with_eval(&features, &targets, &fit_options)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for row in 5..10 {
        let residual = (predictions[row] - targets[row]).abs();
        assert!(residual < 5.0);
        assert!(predictions[row].is_finite());
    }
}
/// Starting from an `init_score` equal to the targets must still give finite predictions.
#[test]
fn test_lgbm_regressor_init_score() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let starting_scores = targets.clone();
    let fit_options = LgbmFitOptions {
        init_score: Some(&starting_scores),
        ..Default::default()
    };
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_learning_rate(0.01)
        .fit_with_eval(&features, &targets, &fit_options)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// With an eval set and early stopping, `best_iteration` stays within the 100-tree budget.
#[test]
fn test_lgbm_regressor_early_stopping() {
    let train_x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let train_y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let holdout_x = array![[11.0], [12.0], [13.0]];
    let holdout_y = array![22.0, 24.0, 26.0];
    let fit_options = LgbmFitOptions {
        eval_set: Some((&holdout_x, &holdout_y)),
        ..Default::default()
    };
    let model = LgbmRegressor::new()
        .with_n_estimators(100)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_early_stopping(Some(5))
        .fit_with_eval(&train_x, &train_y, &fit_options)
        .unwrap();
    assert!(model.best_iteration() <= 100);
    let predictions = model.predict(&train_x).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// With balanced class weights on an 8:2 split, the minority class must be predicted at least once.
#[test]
fn test_lgbm_classifier_class_weight_balanced() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let labels = array![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_class_weight(Some(LgbmClassWeight::Balanced))
        .fit(&features, &labels)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    let minority_hits: usize = predictions.iter().filter(|label| **label == 1.0).count();
    assert!(
        minority_hits >= 1,
        "class balancing should predict minority class"
    );
}
/// Fitting with monotone constraints `[1, -1]` must succeed and give finite predictions.
#[test]
fn test_lgbm_regressor_monotone_constraint_increasing() {
    let features = array![
        [1.0, 10.0], [2.0, 9.0], [3.0, 8.0], [4.0, 7.0], [5.0, 6.0],
        [6.0, 5.0], [7.0, 4.0], [8.0, 3.0], [9.0, 2.0], [10.0, 1.0],
    ];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_monotone_constraints(vec![1, -1])
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Three monotone constraints for a two-feature matrix must make `fit` fail.
#[test]
fn test_lgbm_monotone_constraints_wrong_length() {
    let features = array![[1.0, 2.0], [3.0, 4.0]];
    let targets = array![1.0, 2.0];
    let regressor = LgbmRegressor::new().with_monotone_constraints(vec![1, 0, -1]);
    let outcome = regressor.fit(&features, &targets);
    assert!(outcome.is_err());
}
/// The varying feature must outrank the constant one, and importances sum to 1 (or to 0).
#[test]
fn test_lgbm_feature_importance_reflects_gain() {
    let features = array![
        [1.0, 50.0], [2.0, 50.0], [3.0, 50.0], [4.0, 50.0],
        [10.0, 50.0], [11.0, 50.0], [12.0, 50.0], [13.0, 50.0],
    ];
    let targets = array![1.0, 2.0, 3.0, 4.0, 10.0, 11.0, 12.0, 13.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let importances = model.feature_importances();
    assert!(
        importances[0] > importances[1],
        "feature 0 should have higher importance: {:?}",
        importances
    );
    let total: f64 = importances.iter().sum();
    let is_normalised = (total - 1.0).abs() < 1e-10;
    assert!(is_normalised || total == 0.0);
}
/// GBDT and GOSS must each classify at least 9 of 10 separable training rows correctly.
#[test]
fn test_lgbm_goss_unbiased_vs_gbdt() {
    let features = array![
        [1.0, 0.0], [2.0, 0.0], [3.0, 0.0], [4.0, 0.0], [5.0, 0.0],
        [10.0, 1.0], [11.0, 1.0], [12.0, 1.0], [13.0, 1.0], [14.0, 1.0]
    ];
    let labels = array![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0];
    // Counts predictions that equal the true label.
    let count_correct = |predicted: Array1<f64>| -> usize {
        predicted
            .iter()
            .zip(labels.iter())
            .filter(|(guess, truth)| guess == truth)
            .count()
    };

    let gbdt_model = LgbmClassifier::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_boosting_type(BoostingType::Gbdt)
        .fit(&features, &labels)
        .unwrap();
    let goss_model = LgbmClassifier::new()
        .with_n_estimators(30)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_boosting_type(BoostingType::Goss)
        .fit(&features, &labels)
        .unwrap();

    let gbdt_correct = count_correct(gbdt_model.predict(&features).unwrap());
    let goss_correct = count_correct(goss_model.predict(&features).unwrap());
    assert!(gbdt_correct >= 9);
    assert!(goss_correct >= 9);
}
/// A constant target of 7.0 must be reproduced to within 0.1 on every row.
#[test]
fn test_lgbm_constant_y() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![7.0, 7.0, 7.0, 7.0, 7.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for &value in predictions.iter() {
        assert_abs_diff_eq!(value, 7.0, epsilon = 0.1);
    }
}
/// With `num_leaves = 1` every prediction must be finite and within 1.0 of the target mean 6.0.
#[test]
fn test_lgbm_num_leaves_one_is_baseline_only() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(30)
        .with_num_leaves(1)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for &value in predictions.iter() {
        assert!(value.is_finite());
        assert_abs_diff_eq!(value, 6.0, epsilon = 1.0);
    }
}
/// With a learning rate of 0.0 every prediction must equal the mean of the targets.
#[test]
fn test_lgbm_zero_learning_rate() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_learning_rate(0.0)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    let target_mean: f64 = targets.iter().sum::<f64>() / targets.len() as f64;
    for &value in predictions.iter() {
        assert_abs_diff_eq!(value, target_mean, epsilon = 1e-10);
    }
}
/// A single-tree model must report one estimator and give finite predictions.
#[test]
fn test_lgbm_single_estimator() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(1)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    assert_eq!(model.n_estimators(), 1);
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// With `reg_lambda = 1e6` every prediction must stay within 0.5 of the target mean 3.0.
#[test]
fn test_lgbm_extreme_l2_regularization() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_reg_lambda(1e6)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for &value in predictions.iter() {
        assert_abs_diff_eq!(value, 3.0, epsilon = 0.5);
    }
}
/// With `reg_alpha = 1e6` every prediction must stay within 0.5 of the target mean 3.0.
#[test]
fn test_lgbm_extreme_l1_regularization() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_reg_alpha(1e6)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for &value in predictions.iter() {
        assert_abs_diff_eq!(value, 3.0, epsilon = 0.5);
    }
}
/// Predicting a row whose features are all NaN must give a finite value.
#[test]
fn test_lgbm_all_nan_row_predict() {
    let train_x = array![[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [5.0, 5.0]];
    let train_y = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&train_x, &train_y)
        .unwrap();
    let query = array![[f64::NAN, f64::NAN]];
    let predictions = model.predict(&query).unwrap();
    assert!(predictions[0].is_finite());
}
/// A NaN seen only at prediction time (never in training) must still give finite output.
#[test]
fn test_lgbm_nan_in_predict_only() {
    let train_x = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let train_y = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&train_x, &train_y)
        .unwrap();
    let query = array![[2.5], [f64::NAN], [4.0]];
    let predictions = model.predict(&query).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// An infinite feature value in training must not lead to NaN predictions.
#[test]
fn test_lgbm_infinity_in_features() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [f64::INFINITY]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 10.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(!predictions.iter().any(|value| value.is_nan()));
}
/// Duplicated training rows must still yield predictions drawn from the label set {0, 1}.
#[test]
fn test_lgbm_duplicate_rows() {
    let features = array![
        [1.0, 2.0], [1.0, 2.0], [1.0, 2.0],
        [3.0, 4.0], [3.0, 4.0], [3.0, 4.0]
    ];
    let labels = array![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions
        .iter()
        .all(|&label| label == 0.0 || label == 1.0));
}
/// With a 19:1 class split and balanced weights, class 1.0 must be predicted at least once.
#[test]
fn test_lgbm_highly_imbalanced_with_balanced_weight() {
    let features = Array2::from_shape_vec((20, 1), (0..20).map(f64::from).collect()).unwrap();
    // Only the last row belongs to the positive class.
    let mut labels = Array1::<f64>::zeros(20);
    labels[19] = 1.0;
    let model = LgbmClassifier::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_class_weight(Some(LgbmClassWeight::Balanced))
        .fit(&features, &labels)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    let minority_hits: usize = predictions.iter().filter(|label| **label == 1.0).count();
    assert!(minority_hits >= 1);
}
/// Arbitrary label values (-1.0 and 5.5) must be kept as the classes and as the predictions.
#[test]
fn test_lgbm_negative_class_labels() {
    let features = array![[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]];
    let labels = array![-1.0, -1.0, -1.0, 5.5, 5.5, 5.5];
    let model = LgbmClassifier::new()
        .with_n_estimators(15)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    assert_eq!(model.classes(), &[-1.0, 5.5]);
    let predictions = model.predict(&features).unwrap();
    assert!(predictions
        .iter()
        .all(|&label| label == -1.0 || label == 5.5));
}
/// Zero weights on half of the rows must not break fitting; predictions must stay finite.
#[test]
fn test_lgbm_zero_sample_weights() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    let weights = array![1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
    let fit_options = LgbmFitOptions {
        sample_weight: Some(&weights),
        ..Default::default()
    };
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit_with_eval(&features, &targets, &fit_options)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// GOSS rates of (0.6, 0.6) must still fit and give finite predictions.
#[test]
fn test_lgbm_goss_extreme_rates() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let targets = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_boosting_type(BoostingType::Goss)
        .with_goss_rates(0.6, 0.6)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Depth-1 trees (`num_leaves = 2`, `max_depth = Some(1)`) must give finite predictions.
#[test]
fn test_lgbm_max_depth_one_stumps() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(2)
        .with_max_depth(Some(1))
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// A constant column must end up with an importance of exactly 0.0.
#[test]
fn test_lgbm_feature_never_used_zero_importance() {
    let features = array![[1.0, 42.0], [2.0, 42.0], [3.0, 42.0], [4.0, 42.0], [5.0, 42.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let importances = model.feature_importances();
    assert_eq!(importances[1], 0.0);
}
/// With `min_split_gain = 1e12` every prediction must stay within 0.5 of the target mean 3.0.
#[test]
fn test_lgbm_min_split_gain_too_high() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_min_split_gain(1e12)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for &value in predictions.iter() {
        assert_abs_diff_eq!(value, 3.0, epsilon = 0.5);
    }
}
/// Predicting with 3 columns on a model trained with 2 must return an error.
#[test]
fn test_lgbm_predict_wrong_n_features() {
    let train_x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
    let train_y = array![1.0, 2.0, 3.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(5)
        .with_min_child_samples(1)
        .fit(&train_x, &train_y)
        .unwrap();
    let too_wide = array![[1.0, 2.0, 3.0]];
    let outcome = model.predict(&too_wide);
    assert!(outcome.is_err());
}
/// A monotone constraint on a constant column must not break fitting or prediction.
#[test]
fn test_lgbm_monotone_on_constant_feature() {
    let features = array![[1.0, 50.0], [2.0, 50.0], [3.0, 50.0], [4.0, 50.0], [5.0, 50.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_monotone_constraints(vec![1, -1])
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Five classes: five probability columns are returned and every row sums to 1.
#[test]
fn test_lgbm_many_classes() {
    let features = array![
        [0.0], [0.1], [0.2],
        [1.0], [1.1], [1.2],
        [2.0], [2.1], [2.2],
        [3.0], [3.1], [3.2],
        [4.0], [4.1], [4.2]
    ];
    let labels = array![
        0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0
    ];
    let model = LgbmClassifier::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &labels)
        .unwrap();
    assert_eq!(model.classes().len(), 5);
    let probabilities = model.predict_proba(&features).unwrap();
    assert_eq!(probabilities.ncols(), 5);
    for row in 0..features.nrows() {
        let total: f64 = (0..5).map(|class| probabilities[[row, class]]).sum();
        assert_abs_diff_eq!(total, 1.0, epsilon = 1e-10);
    }
}
/// A feature column made up entirely of NaN must not prevent finite predictions.
#[test]
fn test_lgbm_all_nan_column() {
    let missing = f64::NAN;
    let features = array![
        [1.0, missing],
        [2.0, missing],
        [3.0, missing],
        [4.0, missing],
        [5.0, missing]
    ];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Asking for 32 leaves on 3 rows must still fit and give finite predictions.
#[test]
fn test_lgbm_num_leaves_exceeds_n() {
    let features = array![[1.0], [2.0], [3.0]];
    let targets = array![1.0, 2.0, 3.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(5)
        .with_num_leaves(32)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    assert!(predictions.iter().all(|value| value.is_finite()));
}
/// Predicting on a zero-row matrix must succeed and return an empty array.
#[test]
fn test_lgbm_predict_empty_matrix() {
    let train_x = array![[1.0], [2.0], [3.0]];
    let train_y = array![1.0, 2.0, 3.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(5)
        .with_num_leaves(2)
        .with_min_child_samples(1)
        .fit(&train_x, &train_y)
        .unwrap();
    let no_rows: Array2<f64> = Array2::zeros((0, 1));
    let predictions = model.predict(&no_rows).unwrap();
    assert_eq!(predictions.len(), 0);
}
/// Column 1 is declared categorical and determines the class: at least 7 of 9 rows must be right.
#[test]
fn test_lgbm_classifier_categorical_multiclass() {
    let features = array![
        [0.0, 1.0], [1.0, 1.0], [2.0, 1.0],
        [0.0, 2.0], [1.0, 2.0], [2.0, 2.0],
        [0.0, 3.0], [1.0, 3.0], [2.0, 3.0]
    ];
    let labels = array![0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0];
    let model = LgbmClassifier::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_categorical_features(vec![1])
        .fit(&features, &labels)
        .unwrap();
    assert_eq!(model.classes().len(), 3);
    let predictions = model.predict(&features).unwrap();
    let hits: usize = predictions
        .iter()
        .zip(labels.iter())
        .filter(|(guess, truth)| guess == truth)
        .count();
    assert!(hits >= 7, "got {}/9", hits);
}
/// A `sample_weight` with 2 entries for 3 rows must make the regressor's fit fail.
#[test]
fn test_lgbm_sample_weight_wrong_length_errors() {
    let features = array![[1.0], [2.0], [3.0]];
    let targets = array![1.0, 2.0, 3.0];
    let short_weights = array![1.0, 1.0];
    let fit_options = LgbmFitOptions {
        sample_weight: Some(&short_weights),
        ..Default::default()
    };
    let outcome = LgbmRegressor::new().fit_with_eval(&features, &targets, &fit_options);
    assert!(outcome.is_err());
}
/// An `init_score` with 2 entries for 3 rows must make the regressor's fit fail.
#[test]
fn test_lgbm_init_score_wrong_length_errors() {
    let features = array![[1.0], [2.0], [3.0]];
    let targets = array![1.0, 2.0, 3.0];
    let short_init = array![0.0, 0.0];
    let fit_options = LgbmFitOptions {
        init_score: Some(&short_init),
        ..Default::default()
    };
    let outcome = LgbmRegressor::new().fit_with_eval(&features, &targets, &fit_options);
    assert!(outcome.is_err());
}
/// Fits 100 trees on 200 rows x 5 features of pseudo-random data with a noisy linear target.
///
/// Checks that the training R² exceeds 0.5, that the feature importances sum to 1 and that
/// feature 0 (largest coefficient in the target) has non-zero importance.
#[test]
fn test_lgbm_stress_larger_dataset() {
    let n = 200;
    let p = 5;
    let mut x_data = Vec::with_capacity(n * p);
    let mut y_data = Vec::with_capacity(n);
    // Small LCG so the data is deterministic without depending on the `rand` crate.
    // The shift by 33 leaves 31 bits, so after dividing by `u32::MAX` and subtracting 0.5
    // each draw lies in [-0.5, 0).
    let mut rng_state = 42u64;
    let mut next = || {
        rng_state = rng_state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        ((rng_state >> 33) as f64) / (u32::MAX as f64) - 0.5
    };
    for _ in 0..n {
        let f0 = next();
        let f1 = next();
        let f2 = next();
        let f3 = next();
        let f4 = next();
        x_data.extend([f0, f1, f2, f3, f4]);
        // Features 3 and 4 do not enter the target; the last draw is the noise term.
        y_data.push(2.0 * f0 - 1.5 * f1 + 0.3 * f2 + 0.1 * next());
    }
    let x = Array2::from_shape_vec((n, p), x_data).unwrap();
    let y = Array1::from_vec(y_data);
    let fitted = LgbmRegressor::new()
        .with_n_estimators(100)
        .with_num_leaves(15)
        .with_learning_rate(0.05)
        .with_min_child_samples(5)
        .fit(&x, &y)
        .unwrap();
    let preds = fitted.predict(&x).unwrap();
    let y_mean: f64 = y.iter().sum::<f64>() / n as f64;
    let ss_res: f64 = preds
        .iter()
        .zip(y.iter())
        .map(|(&p, &t)| (p - t).powi(2))
        .sum();
    let ss_tot: f64 = y.iter().map(|&t| (t - y_mean).powi(2)).sum();
    let r2 = 1.0 - ss_res / ss_tot;
    assert!(
        r2 > 0.5,
        "R² should be > 0.5 on noisy linear data, got {:.4}",
        r2
    );
    let imp = fitted.feature_importances();
    let sum: f64 = imp.iter().sum();
    assert!((sum - 1.0).abs() < 1e-9);
    assert!(imp[0] > 0.0);
}
/// Training 500 trees at a learning rate of 0.01 must keep every prediction finite and
/// report all 500 estimators.
#[test]
fn test_lgbm_stress_many_estimators() {
    let x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0], [9.0], [10.0]];
    let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
    let fitted = LgbmRegressor::new()
        .with_n_estimators(500)
        .with_num_leaves(4)
        .with_learning_rate(0.01)
        .with_min_child_samples(1)
        .fit(&x, &y)
        .unwrap();
    let preds = fitted.predict(&x).unwrap();
    // `is_finite` is false for NaN as well as for the infinities, so no separate NaN check
    // is needed.
    assert!(preds.iter().all(|p| p.is_finite()));
    assert_eq!(fitted.n_estimators(), 500);
}
/// Fitting the same seeded, subsampled configuration twice must give matching predictions.
#[test]
fn test_lgbm_reproducible_with_seed() {
    let features = array![
        [1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0],
        [9.0, 10.0], [11.0, 12.0], [13.0, 14.0], [15.0, 16.0]
    ];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let regressor = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_subsample(0.5)
        .with_colsample_bytree(0.5)
        .with_seed(42);
    let first_fit = regressor.fit(&features, &targets).unwrap();
    let second_fit = regressor.fit(&features, &targets).unwrap();
    let first_preds = first_fit.predict(&features).unwrap();
    let second_preds = second_fit.predict(&features).unwrap();
    for (&lhs, &rhs) in first_preds.iter().zip(second_preds.iter()) {
        assert_abs_diff_eq!(lhs, rhs, epsilon = 1e-12);
    }
}
/// With row subsampling, seeds 1 and 2 must lead to at least one differing prediction.
#[test]
fn test_lgbm_different_seeds_differ() {
    let features = array![
        [1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0],
        [9.0, 10.0], [11.0, 12.0], [13.0, 14.0], [15.0, 16.0]
    ];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let seeded_one = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_subsample(0.5)
        .with_seed(1);
    let seeded_two = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_subsample(0.5)
        .with_seed(2);
    let fit_one = seeded_one.fit(&features, &targets).unwrap();
    let fit_two = seeded_two.fit(&features, &targets).unwrap();
    let preds_one = fit_one.predict(&features).unwrap();
    let preds_two = fit_two.predict(&features).unwrap();
    let any_diff = preds_one
        .iter()
        .zip(preds_two.iter())
        .any(|(lhs, rhs)| (lhs - rhs).abs() > 1e-10);
    assert!(
        any_diff,
        "different seeds should give different predictions"
    );
}
/// A cloned fitted model must predict the same values as the original.
#[test]
fn test_lgbm_clone_preserves_predictions() {
    let features = array![[1.0], [2.0], [3.0], [4.0], [5.0]];
    let targets = array![1.0, 2.0, 3.0, 4.0, 5.0];
    let original = LgbmRegressor::new()
        .with_n_estimators(10)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .fit(&features, &targets)
        .unwrap();
    let duplicate = original.clone();
    let original_preds = original.predict(&features).unwrap();
    let duplicate_preds = duplicate.predict(&features).unwrap();
    for (&lhs, &rhs) in original_preds.iter().zip(duplicate_preds.iter()) {
        assert_abs_diff_eq!(lhs, rhs, epsilon = 1e-12);
    }
}
/// With early stopping after 3 rounds, `best_iteration` must lie in `1..=500`.
#[test]
fn test_lgbm_early_stopping_respects_rounds() {
    let train_x = Array2::from_shape_vec((20, 1), (0..20).map(f64::from).collect()).unwrap();
    let train_y = Array1::from_vec((0..20).map(|i| 2.0 * f64::from(i)).collect());
    let holdout_x = array![[5.0], [10.0], [15.0]];
    let holdout_y = array![10.0, 20.0, 30.0];
    let fit_options = LgbmFitOptions {
        eval_set: Some((&holdout_x, &holdout_y)),
        ..Default::default()
    };
    let model = LgbmRegressor::new()
        .with_n_estimators(500)
        .with_num_leaves(4)
        .with_min_child_samples(1)
        .with_early_stopping(Some(3))
        .fit_with_eval(&train_x, &train_y, &fit_options)
        .unwrap();
    let best = model.best_iteration();
    assert!(best > 0);
    assert!(best <= 500);
}
/// With column 1 declared categorical, training predictions must be within 5.0 of the targets.
#[test]
fn test_lgbm_regressor_categorical_features() {
    let features = array![
        [1.0, 0.0], [2.0, 0.0], [3.0, 1.0], [4.0, 1.0],
        [5.0, 2.0], [6.0, 2.0], [7.0, 0.0], [8.0, 1.0]
    ];
    let targets = array![1.0, 2.0, 10.0, 11.0, 20.0, 21.0, 3.0, 12.0];
    let model = LgbmRegressor::new()
        .with_n_estimators(20)
        .with_num_leaves(8)
        .with_min_child_samples(1)
        .with_categorical_features(vec![1])
        .fit(&features, &targets)
        .unwrap();
    let predictions = model.predict(&features).unwrap();
    for (&predicted, &expected) in predictions.iter().zip(targets.iter()) {
        assert_abs_diff_eq!(predicted, expected, epsilon = 5.0);
    }
}
}