use ghostflow_core::Tensor;
use rand::prelude::*;
use rayon::prelude::*;
use std::collections::HashMap;
const MAX_BINS: usize = 255;
pub struct LightGBMClassifier {
pub n_estimators: usize,
pub learning_rate: f32,
pub num_leaves: usize,
pub max_depth: i32,
pub min_data_in_leaf: usize,
pub min_sum_hessian_in_leaf: f32,
pub feature_fraction: f32,
pub bagging_fraction: f32,
pub bagging_freq: usize,
pub lambda_l1: f32,
pub lambda_l2: f32,
pub max_bin: usize,
trees: Vec<LGBMTree>,
feature_bins: Vec<Vec<f32>>,
base_score: f32,
}
#[derive(Clone)]
struct LGBMTree {
nodes: Vec<LGBMNode>,
}
#[derive(Clone)]
struct LGBMNode {
is_leaf: bool,
feature: usize,
threshold_bin: usize,
left: usize,
right: usize,
value: f32,
split_gain: f32,
}
impl LightGBMClassifier {
pub fn new(n_estimators: usize) -> Self {
Self {
n_estimators,
learning_rate: 0.1,
num_leaves: 31,
max_depth: -1, min_data_in_leaf: 20,
min_sum_hessian_in_leaf: 1e-3,
feature_fraction: 1.0,
bagging_fraction: 1.0,
bagging_freq: 0,
lambda_l1: 0.0,
lambda_l2: 0.0,
max_bin: MAX_BINS,
trees: Vec::new(),
feature_bins: Vec::new(),
base_score: 0.5,
}
}
pub fn learning_rate(mut self, lr: f32) -> Self {
self.learning_rate = lr;
self
}
pub fn num_leaves(mut self, leaves: usize) -> Self {
self.num_leaves = leaves;
self
}
pub fn max_depth(mut self, depth: i32) -> Self {
self.max_depth = depth;
self
}
pub fn feature_fraction(mut self, fraction: f32) -> Self {
self.feature_fraction = fraction;
self
}
fn build_histograms(&mut self, x: &Tensor) {
let n_samples = x.dims()[0];
let n_features = x.dims()[1];
let x_data = x.data_f32();
self.feature_bins = (0..n_features)
.into_par_iter()
.map(|feature| {
let mut values: Vec<f32> = (0..n_samples)
.map(|i| x_data[i * n_features + feature])
.collect();
values.sort_by(|a, b| a.partial_cmp(b).unwrap());
values.dedup();
if values.len() <= self.max_bin {
values
} else {
let step = values.len() / self.max_bin;
(0..self.max_bin)
.map(|i| values[i * step])
.collect()
}
})
.collect();
}
fn value_to_bin(&self, feature: usize, value: f32) -> usize {
let bins = &self.feature_bins[feature];
match bins.binary_search_by(|&bin| bin.partial_cmp(&value).unwrap()) {
Ok(idx) => idx,
Err(idx) => idx.saturating_sub(1),
}
}
pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
let n_samples = x.dims()[0];
let n_features = x.dims()[1];
let x_data = x.data_f32();
let y_data = y.data_f32();
self.build_histograms(x);
let mut predictions = vec![self.base_score; n_samples];
for iter in 0..self.n_estimators {
let mut gradients = Vec::with_capacity(n_samples);
let mut hessians = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let pred = 1.0 / (1.0 + (-predictions[i]).exp());
let grad = pred - y_data[i];
let hess = pred * (1.0 - pred);
gradients.push(grad);
hessians.push(hess.max(1e-6));
}
let mut rng = thread_rng();
let sample_indices: Vec<usize> = if self.bagging_freq > 0 && iter % self.bagging_freq == 0 {
let n_subsample = (n_samples as f32 * self.bagging_fraction) as usize;
(0..n_samples).choose_multiple(&mut rng, n_subsample)
} else {
(0..n_samples).collect()
};
let feature_indices: Vec<usize> = if self.feature_fraction < 1.0 {
let n_features_sample = (n_features as f32 * self.feature_fraction) as usize;
(0..n_features).choose_multiple(&mut rng, n_features_sample)
} else {
(0..n_features).collect()
};
let tree = self.build_tree_leafwise(
&x_data,
&gradients,
&hessians,
&sample_indices,
&feature_indices,
n_features,
);
for i in 0..n_samples {
let leaf_value = self.predict_tree(&tree, &x_data[i * n_features..(i + 1) * n_features]);
predictions[i] += self.learning_rate * leaf_value;
}
self.trees.push(tree);
}
}
fn build_tree_leafwise(
&self,
x_data: &[f32],
gradients: &[f32],
hessians: &[f32],
sample_indices: &[usize],
feature_indices: &[usize],
n_features: usize,
) -> LGBMTree {
let mut nodes = Vec::new();
let mut leaf_queue: Vec<(Vec<usize>, usize, usize)> = Vec::new();
let root_idx = nodes.len();
let sum_grad: f32 = sample_indices.iter().map(|&i| gradients[i]).sum();
let sum_hess: f32 = sample_indices.iter().map(|&i| hessians[i]).sum();
let root_value = self.calculate_leaf_value(sum_grad, sum_hess);
nodes.push(LGBMNode {
is_leaf: true,
feature: 0,
threshold_bin: 0,
left: 0,
right: 0,
value: root_value,
split_gain: 0.0,
});
leaf_queue.push((sample_indices.to_vec(), root_idx, 0));
while nodes.len() < self.num_leaves && !leaf_queue.is_empty() {
let (best_idx, best_split) = self.find_best_leaf_to_split(
x_data,
gradients,
hessians,
&leaf_queue,
feature_indices,
n_features,
);
if best_split.is_none() {
break;
}
let (_samples, node_idx, depth) = leaf_queue.remove(best_idx);
let (feature, threshold_bin, gain, left_samples, right_samples) = best_split.unwrap();
if self.max_depth > 0 && depth >= self.max_depth as usize {
continue;
}
nodes[node_idx].is_leaf = false;
nodes[node_idx].feature = feature;
nodes[node_idx].threshold_bin = threshold_bin;
nodes[node_idx].split_gain = gain;
let left_idx = nodes.len();
let left_sum_grad: f32 = left_samples.iter().map(|&i| gradients[i]).sum();
let left_sum_hess: f32 = left_samples.iter().map(|&i| hessians[i]).sum();
let left_value = self.calculate_leaf_value(left_sum_grad, left_sum_hess);
nodes.push(LGBMNode {
is_leaf: true,
feature: 0,
threshold_bin: 0,
left: 0,
right: 0,
value: left_value,
split_gain: 0.0,
});
let right_idx = nodes.len();
let right_sum_grad: f32 = right_samples.iter().map(|&i| gradients[i]).sum();
let right_sum_hess: f32 = right_samples.iter().map(|&i| hessians[i]).sum();
let right_value = self.calculate_leaf_value(right_sum_grad, right_sum_hess);
nodes.push(LGBMNode {
is_leaf: true,
feature: 0,
threshold_bin: 0,
left: 0,
right: 0,
value: right_value,
split_gain: 0.0,
});
nodes[node_idx].left = left_idx;
nodes[node_idx].right = right_idx;
if left_samples.len() >= self.min_data_in_leaf {
leaf_queue.push((left_samples, left_idx, depth + 1));
}
if right_samples.len() >= self.min_data_in_leaf {
leaf_queue.push((right_samples, right_idx, depth + 1));
}
}
LGBMTree { nodes }
}
fn find_best_leaf_to_split(
&self,
x_data: &[f32],
gradients: &[f32],
hessians: &[f32],
leaf_queue: &[(Vec<usize>, usize, usize)],
feature_indices: &[usize],
n_features: usize,
) -> (usize, Option<(usize, usize, f32, Vec<usize>, Vec<usize>)>) {
let mut best_leaf_idx = 0;
let mut best_split: Option<(usize, usize, f32, Vec<usize>, Vec<usize>)> = None;
let mut best_gain = 0.0;
for (idx, (samples, _, _)) in leaf_queue.iter().enumerate() {
if samples.len() < self.min_data_in_leaf * 2 {
continue;
}
let split = self.find_best_split_histogram(
x_data,
gradients,
hessians,
samples,
feature_indices,
n_features,
);
if let Some((_, _, gain, _, _)) = &split {
if *gain > best_gain {
best_gain = *gain;
best_split = split;
best_leaf_idx = idx;
}
}
}
(best_leaf_idx, best_split)
}
fn find_best_split_histogram(
&self,
x_data: &[f32],
gradients: &[f32],
hessians: &[f32],
sample_indices: &[usize],
feature_indices: &[usize],
n_features: usize,
) -> Option<(usize, usize, f32, Vec<usize>, Vec<usize>)> {
let sum_grad: f32 = sample_indices.iter().map(|&i| gradients[i]).sum();
let sum_hess: f32 = sample_indices.iter().map(|&i| hessians[i]).sum();
let mut best_gain = 0.0;
let mut best_feature = 0;
let mut best_bin = 0;
let mut best_left = Vec::new();
let mut best_right = Vec::new();
for &feature in feature_indices {
let n_bins = self.feature_bins[feature].len();
let mut hist_grad = vec![0.0; n_bins];
let mut hist_hess = vec![0.0; n_bins];
let mut bin_samples: Vec<Vec<usize>> = vec![Vec::new(); n_bins];
for &idx in sample_indices {
let value = x_data[idx * n_features + feature];
let bin = self.value_to_bin(feature, value);
hist_grad[bin] += gradients[idx];
hist_hess[bin] += hessians[idx];
bin_samples[bin].push(idx);
}
let mut left_grad = 0.0;
let mut left_hess = 0.0;
let mut left_samples = Vec::new();
for bin in 0..n_bins - 1 {
left_grad += hist_grad[bin];
left_hess += hist_hess[bin];
left_samples.extend(&bin_samples[bin]);
let right_grad = sum_grad - left_grad;
let right_hess = sum_hess - left_hess;
if left_hess < self.min_sum_hessian_in_leaf || right_hess < self.min_sum_hessian_in_leaf {
continue;
}
let gain = self.calculate_split_gain(left_grad, left_hess, right_grad, right_hess, sum_grad, sum_hess);
if gain > best_gain {
best_gain = gain;
best_feature = feature;
best_bin = bin;
best_left = left_samples.clone();
best_right = sample_indices.iter()
.filter(|&&idx| !best_left.contains(&idx))
.copied()
.collect();
}
}
}
if best_gain > 0.0 {
Some((best_feature, best_bin, best_gain, best_left, best_right))
} else {
None
}
}
fn calculate_leaf_value(&self, sum_grad: f32, sum_hess: f32) -> f32 {
-sum_grad / (sum_hess + self.lambda_l2)
}
fn calculate_split_gain(
&self,
left_grad: f32,
left_hess: f32,
right_grad: f32,
right_hess: f32,
sum_grad: f32,
sum_hess: f32,
) -> f32 {
let left_weight = -left_grad / (left_hess + self.lambda_l2);
let right_weight = -right_grad / (right_hess + self.lambda_l2);
let parent_weight = -sum_grad / (sum_hess + self.lambda_l2);
let gain = 0.5 * (
left_grad * left_weight +
right_grad * right_weight -
sum_grad * parent_weight
);
let l1_penalty = self.lambda_l1 * (left_weight.abs() + right_weight.abs() - parent_weight.abs());
gain - l1_penalty
}
fn predict_tree(&self, tree: &LGBMTree, sample: &[f32]) -> f32 {
let mut node_idx = 0;
loop {
let node = &tree.nodes[node_idx];
if node.is_leaf {
return node.value;
}
let bin = self.value_to_bin(node.feature, sample[node.feature]);
if bin <= node.threshold_bin {
node_idx = node.left;
} else {
node_idx = node.right;
}
}
}
pub fn predict(&self, x: &Tensor) -> Tensor {
let n_samples = x.dims()[0];
let n_features = x.dims()[1];
let x_data = x.data_f32();
let predictions: Vec<f32> = (0..n_samples)
.into_par_iter()
.map(|i| {
let sample = &x_data[i * n_features..(i + 1) * n_features];
let mut pred = self.base_score;
for tree in &self.trees {
pred += self.learning_rate * self.predict_tree(tree, sample);
}
let prob = 1.0 / (1.0 + (-pred).exp());
if prob >= 0.5 { 1.0 } else { 0.0 }
})
.collect();
Tensor::from_slice(&predictions, &[n_samples]).unwrap()
}
pub fn predict_proba(&self, x: &Tensor) -> Tensor {
let n_samples = x.dims()[0];
let n_features = x.dims()[1];
let x_data = x.data_f32();
let probabilities: Vec<f32> = (0..n_samples)
.into_par_iter()
.map(|i| {
let sample = &x_data[i * n_features..(i + 1) * n_features];
let mut pred = self.base_score;
for tree in &self.trees {
pred += self.learning_rate * self.predict_tree(tree, sample);
}
1.0 / (1.0 + (-pred).exp())
})
.collect();
Tensor::from_slice(&probabilities, &[n_samples]).unwrap()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lightgbm_classifier() {
let x = Tensor::from_slice(
&[0.0f32, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0],
&[4, 2],
).unwrap();
let y = Tensor::from_slice(&[0.0f32, 1.0, 1.0, 0.0], &[4]).unwrap();
let mut lgbm = LightGBMClassifier::new(10)
.learning_rate(0.1)
.num_leaves(7);
lgbm.fit(&x, &y);
let predictions = lgbm.predict(&x);
assert_eq!(predictions.dims()[0], 4); }
}