use nanogbm::metric::{BinaryLogloss, Metric};
use nanogbm::{Config, DatasetBuilder, GbdtTrainer, Model};
use rand::SeedableRng;
use rand::prelude::*;
use rand_chacha::ChaCha8Rng;
/// Generates a reproducible synthetic binary-classification problem.
///
/// A `ChaCha8Rng` seeded with `seed` first draws `d` linear weights in
/// `[-1, 1)` and a bias in `[-0.5, 0.5)`. Each sample then gets `d` features
/// drawn uniformly from `[-3, 3)`; the linear score plus uniform noise in
/// `[-0.4, 0.4)` is passed through a logistic function, and the label is a
/// Bernoulli draw (`1.0` / `0.0`) with that probability.
///
/// Returns `(train_x, train_y, valid_x, valid_y)`. Feature buffers are flat
/// and row-major (`rows * d` values); the training split is drawn before the
/// validation split from the same RNG stream.
fn make_classification(
    train_n: usize,
    valid_n: usize,
    d: usize,
    seed: u64,
) -> (Vec<f64>, Vec<f32>, Vec<f64>, Vec<f32>) {
    let mut rng = ChaCha8Rng::seed_from_u64(seed);

    // Ground-truth linear model shared by both splits.
    let mut weights: Vec<f64> = Vec::with_capacity(d);
    for _ in 0..d {
        weights.push(rng.gen_range(-1.0..1.0));
    }
    let bias: f64 = rng.gen_range(-0.5..0.5);

    let mut draw_split = |rows: usize| -> (Vec<f64>, Vec<f32>) {
        let mut xs: Vec<f64> = Vec::with_capacity(rows * d);
        let mut ys: Vec<f32> = Vec::with_capacity(rows);
        for _ in 0..rows {
            // Accumulate the logit feature by feature so the RNG draw order
            // (features, then noise, then label coin) stays fixed.
            let mut logit = bias;
            for &w in &weights {
                let value: f64 = rng.gen_range(-3.0..3.0);
                xs.push(value);
                logit += w * value;
            }
            let noise: f64 = rng.gen_range(-0.4..0.4);
            logit += noise;

            let prob = 1.0 / (1.0 + (-logit).exp());
            let coin: f64 = rng.r#gen();
            ys.push(if coin < prob { 1.0 } else { 0.0 });
        }
        (xs, ys)
    };

    let train = draw_split(train_n);
    let valid = draw_split(valid_n);
    (train.0, train.1, valid.0, valid.1)
}
/// Trains on synthetic data with a validation set and checks that the final
/// validation log-loss beats the constant `init_score` baseline by at least
/// 10% and is below 0.6.
#[test]
fn trains_and_reduces_logloss_on_synthetic() {
    let (n_train, n_valid, d) = (4000, 1000, 8);
    let (train_x, train_y, valid_x, valid_y) = make_classification(n_train, n_valid, d, 42);

    let mut cfg = Config::default();
    // Boosting schedule.
    cfg.num_iterations = 200;
    cfg.learning_rate = 0.1;
    cfg.early_stopping_round = 20;
    // Tree shape and binning.
    cfg.num_leaves = 31;
    cfg.min_data_in_leaf = 20;
    cfg.max_bin = 64;
    // Sampling and regularisation.
    cfg.feature_fraction = 1.0;
    cfg.bagging_fraction = 1.0;
    cfg.lambda_l2 = 1.0;
    cfg.seed = 0;

    let train_ds = DatasetBuilder::from_rows(&train_x, n_train, d, &train_y, &cfg).unwrap();
    let valid_ds = DatasetBuilder::from_rows(&valid_x, n_valid, d, &valid_y, &cfg).unwrap();

    let model = GbdtTrainer::new(&cfg)
        .fit(&train_ds, Some(&valid_ds))
        .unwrap();
    assert!(model.n_trees() > 0);

    // Baseline: every validation row scored with the model's initial score.
    let baseline_scores = vec![model.init_score(); n_valid];
    let trained_scores = model.predict_raw_scores(&valid_x, n_valid);

    let logloss = BinaryLogloss;
    let init_loss = logloss.evaluate(&baseline_scores, &valid_y);
    let final_loss = logloss.evaluate(&trained_scores, &valid_y);
    println!("init_loss={init_loss:.5} final_loss={final_loss:.5}");

    assert!(
        final_loss < init_loss * 0.9,
        "model failed to improve appreciably: {init_loss} -> {final_loss}"
    );
    assert!(final_loss < 0.6, "final loss too high: {final_loss}");
}
/// Saves a trained model to a temp file, loads it back, and checks that
/// `predict_proba` on the training rows is unchanged (within 1e-12).
#[test]
fn saved_model_round_trip_matches_predictions() {
    let n = 500;
    let d = 4;
    let (x, y, _, _) = make_classification(n, 1, d, 7);
    let mut cfg = Config::default();
    cfg.num_iterations = 10;
    cfg.num_leaves = 15;
    cfg.min_data_in_leaf = 10;
    cfg.max_bin = 32;
    let ds = DatasetBuilder::from_rows(&x, n, d, &y, &cfg).unwrap();
    let model = GbdtTrainer::new(&cfg).fit(&ds, None).unwrap();

    // Include the process id so concurrent runs of the test suite on the same
    // machine do not overwrite each other's model file.
    let tmp = std::env::temp_dir().join(format!("nanogbm_model_{}.bin", std::process::id()));
    model.save(&tmp).unwrap();
    let load_result = Model::load(&tmp);
    // Best-effort cleanup before unwrapping, so the file is removed even when
    // loading fails; a failed removal must not fail the test.
    let _ = std::fs::remove_file(&tmp);
    let loaded = load_result.unwrap();

    let p1 = model.predict_proba(&x, n);
    let p2 = loaded.predict_proba(&x, n);

    // `zip` stops at the shorter side, so check lengths explicitly; otherwise
    // an empty or truncated result would make the loop below pass vacuously.
    assert!(!p1.is_empty(), "predict_proba returned no predictions");
    assert_eq!(
        p1.len(),
        p2.len(),
        "original and reloaded models returned different numbers of predictions"
    );
    for (i, (a, b)) in p1.iter().zip(p2.iter()).enumerate() {
        assert!(
            (a - b).abs() < 1e-12,
            "prediction {i} changed after save/load: {a} vs {b}"
        );
    }
}
/// Checks that `predict_raw_scores` on the raw feature rows and
/// `predict_raw_scores_on_dataset` on the dataset built from those same rows
/// agree to within 1e-9.
#[test]
fn predict_bin_and_raw_paths_agree() {
    let n = 400;
    let d = 3;
    let (x, y, _, _) = make_classification(n, 1, d, 9);
    let mut cfg = Config::default();
    cfg.num_iterations = 8;
    cfg.num_leaves = 11;
    cfg.min_data_in_leaf = 5;
    cfg.max_bin = 32;
    let ds = DatasetBuilder::from_rows(&x, n, d, &y, &cfg).unwrap();
    let model = GbdtTrainer::new(&cfg).fit(&ds, None).unwrap();
    let raw = model.predict_raw_scores(&x, n);
    let bin_raw = model.predict_raw_scores_on_dataset(&ds);

    // `zip` truncates to the shorter input, so a missing or short output would
    // otherwise leave `max_abs` at 0.0 and pass.
    assert!(!raw.is_empty(), "raw-feature path returned no scores");
    assert_eq!(
        raw.len(),
        bin_raw.len(),
        "raw and dataset prediction paths returned different numbers of scores"
    );
    // `f64::max` ignores NaN operands, so a NaN difference would vanish from
    // the fold below; reject non-finite scores up front.
    assert!(
        raw.iter().chain(bin_raw.iter()).all(|s| s.is_finite()),
        "predictions contain non-finite scores"
    );

    let max_abs = raw
        .iter()
        .zip(bin_raw.iter())
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f64, f64::max);
    assert!(
        max_abs < 1e-9,
        "raw vs bin predictions diverged by {max_abs}"
    );
}
/// Checks that `predict_proba` and `predict_proba_binned` return the same
/// probabilities (within 1e-9) on held-out rows that were not used to build
/// the training dataset.
#[test]
fn predict_proba_binned_matches_raw_proba() {
    let n_train = 800;
    let n_eval = 1500;
    let d = 6;
    let (tx, ty, ex, _ey) = make_classification(n_train, n_eval, d, 17);
    let mut cfg = Config::default();
    cfg.num_iterations = 15;
    cfg.num_leaves = 21;
    cfg.min_data_in_leaf = 5;
    cfg.max_bin = 64;
    cfg.lambda_l2 = 0.5;
    let train_ds = DatasetBuilder::from_rows(&tx, n_train, d, &ty, &cfg).unwrap();
    let model = GbdtTrainer::new(&cfg).fit(&train_ds, None).unwrap();
    let raw_proba = model.predict_proba(&ex, n_eval);
    let binned_proba = model.predict_proba_binned(&ex, n_eval);

    // `zip` truncates to the shorter input, so a missing or short output would
    // otherwise leave `max_abs` at 0.0 and pass.
    assert!(!raw_proba.is_empty(), "predict_proba returned no predictions");
    assert_eq!(
        raw_proba.len(),
        binned_proba.len(),
        "raw and binned paths returned different numbers of predictions"
    );
    // `f64::max` ignores NaN operands, so a NaN difference would vanish from
    // the fold below; reject non-finite probabilities up front.
    assert!(
        raw_proba
            .iter()
            .chain(binned_proba.iter())
            .all(|p| p.is_finite()),
        "predictions contain non-finite probabilities"
    );

    let max_abs = raw_proba
        .iter()
        .zip(binned_proba.iter())
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f64, f64::max);
    assert!(
        max_abs < 1e-9,
        "raw vs binned proba diverged by {max_abs}"
    );
}
/// Saves a trained model to a temp file, loads it back, and checks that
/// `predict_proba_binned` on held-out rows is unchanged (within 1e-12).
#[test]
fn binned_predict_survives_bincode_round_trip() {
    let n = 600;
    let n_eval = n / 2;
    let d = 4;
    let (x, y, ex, _) = make_classification(n, n_eval, d, 31);
    let mut cfg = Config::default();
    cfg.num_iterations = 12;
    cfg.num_leaves = 15;
    cfg.min_data_in_leaf = 10;
    cfg.max_bin = 48;
    let ds = DatasetBuilder::from_rows(&x, n, d, &y, &cfg).unwrap();
    let model = GbdtTrainer::new(&cfg).fit(&ds, None).unwrap();

    // Include the process id so concurrent runs of the test suite on the same
    // machine do not overwrite each other's model file.
    let tmp = std::env::temp_dir().join(format!(
        "nanogbm_model_binned_{}.bin",
        std::process::id()
    ));
    model.save(&tmp).unwrap();
    let load_result = Model::load(&tmp);
    // Best-effort cleanup before unwrapping, so the file is removed even when
    // loading fails; a failed removal must not fail the test.
    let _ = std::fs::remove_file(&tmp);
    let loaded = load_result.unwrap();

    let before = model.predict_proba_binned(&ex, n_eval);
    let after = loaded.predict_proba_binned(&ex, n_eval);

    // `zip` stops at the shorter side, so check lengths explicitly; otherwise
    // an empty or truncated result would make the loop below pass vacuously.
    assert!(
        !before.is_empty(),
        "predict_proba_binned returned no predictions"
    );
    assert_eq!(
        before.len(),
        after.len(),
        "original and reloaded models returned different numbers of predictions"
    );
    for (i, (a, b)) in before.iter().zip(after.iter()).enumerate() {
        assert!(
            (a - b).abs() < 1e-12,
            "binned prediction {i} changed after save/load: {a} vs {b}"
        );
    }
}