use polars::prelude::*;
use treeboost::booster::{GBDTConfig, GBDTModel};
use treeboost::dataset::{DataPipeline, PipelineConfig};
/// End-to-end smoke test: run the full data pipeline (binning + categorical
/// encoding) on a small housing-style dataset, train a GBDT model, and verify
/// both training-set fit and inference on fresh rows with an unseen category.
#[test]
fn test_data_pipeline_end_to_end() {
    // 20 rows, 3 numeric features plus one categorical column; the final
    // "typo_rare" value exercises rare-category handling during training.
    let df = df! {
        "price" => &[100.0, 150.0, 200.0, 180.0, 220.0, 90.0, 300.0, 250.0, 175.0, 400.0,
                     120.0, 160.0, 210.0, 190.0, 230.0, 95.0, 310.0, 260.0, 185.0, 420.0],
        "sqft" => &[1000.0, 1200.0, 1500.0, 1400.0, 1600.0, 900.0, 2000.0, 1800.0, 1350.0, 2500.0,
                    1050.0, 1250.0, 1550.0, 1450.0, 1650.0, 950.0, 2050.0, 1850.0, 1400.0, 2600.0],
        "bedrooms" => &[2.0, 3.0, 3.0, 3.0, 4.0, 2.0, 4.0, 4.0, 3.0, 5.0,
                        2.0, 3.0, 3.0, 3.0, 4.0, 2.0, 4.0, 4.0, 3.0, 5.0],
        "neighborhood" => &["downtown", "suburbs", "downtown", "suburbs", "downtown",
                            "rural", "downtown", "suburbs", "downtown", "suburbs",
                            "downtown", "suburbs", "downtown", "suburbs", "downtown",
                            "rural", "downtown", "suburbs", "downtown", "typo_rare"],
        "target" => &[250.0, 280.0, 350.0, 320.0, 400.0, 180.0, 500.0, 450.0, 300.0, 600.0,
                      260.0, 290.0, 360.0, 330.0, 410.0, 185.0, 510.0, 460.0, 310.0, 550.0]
    }
    .unwrap();

    let pipeline = DataPipeline::new(
        PipelineConfig::new()
            .with_num_bins(16)
            .with_cms_params(0.01, 0.99, 2)
            .with_smoothing(5.0),
    );

    let (dataset, state, _filtered_df) = pipeline
        .process_for_training(df.clone(), "target", Some(&["neighborhood"]))
        .expect("Pipeline should succeed");

    // All 20 rows survive; 4 features = 3 numeric + 1 encoded categorical.
    assert_eq!(dataset.num_rows(), 20);
    assert_eq!(dataset.num_features(), 4);
    assert_eq!(state.categorical_encodings.len(), 1);
    assert_eq!(state.categorical_encodings[0].name, "neighborhood");

    let config = GBDTConfig::new()
        .with_num_rounds(20)
        .with_max_depth(4)
        .with_learning_rate(0.1);
    let model = GBDTModel::train_binned(&dataset, config).expect("Training should succeed");

    let predictions = model.predict(&dataset);
    assert_eq!(predictions.len(), 20);

    // A model that actually fit the data should have low training-set MSE.
    let targets = dataset.targets();
    let mse: f32 = predictions
        .iter()
        .zip(targets.iter())
        .map(|(p, t): (&f32, &f32)| (p - t).powi(2))
        .sum::<f32>()
        / predictions.len() as f32;
    assert!(mse < 10000.0, "MSE {} is too high for training data", mse);

    // Inference rows include a neighborhood never seen during training; the
    // pipeline must still produce a usable encoding for it.
    let test_df = df! {
        "price" => &[175.0, 275.0],
        "sqft" => &[1300.0, 1900.0],
        "bedrooms" => &[3.0, 4.0],
        "neighborhood" => &["downtown", "unseen_area"]
    }
    .unwrap();

    let (_test_preprocessed_df, test_dataset) = pipeline
        .process_for_inference(test_df, &state)
        .expect("Inference pipeline should succeed");
    assert_eq!(test_dataset.num_rows(), 2);

    let test_predictions = model.predict(&test_dataset);
    assert_eq!(test_predictions.len(), 2);
    // Sanity-range check; include the offending value in the failure message
    // so a bad prediction is diagnosable from the test output alone.
    for pred in &test_predictions {
        assert!(*pred > 0.0, "Prediction {} should be positive", pred);
        assert!(*pred < 1000.0, "Prediction {} should be reasonable", pred);
    }
}
/// Verify that the pipeline's count-min-sketch frequency filter drops rare
/// categories: with a min-count threshold of 10, only the two categories that
/// appear >= 10 times out of 100 rows should survive in the mapping.
#[test]
fn test_pipeline_rare_category_filtering() {
    // Category frequencies: frequent=50, also_frequent=30, then four rare
    // buckets (5, 5, 3, 3) and two typo buckets (2, 2) — all below threshold.
    let df = df! {
        "feature" => &[1.0; 100],
        "category" => {
            let mut cats = vec!["frequent".to_string(); 50];
            cats.extend(vec!["also_frequent".to_string(); 30]);
            cats.extend(vec!["rare1".to_string(); 5]);
            cats.extend(vec!["rare2".to_string(); 5]);
            cats.extend(vec!["very_rare1".to_string(); 3]);
            cats.extend(vec!["very_rare2".to_string(); 3]);
            cats.extend(vec!["typo1".to_string(); 2]);
            cats.extend(vec!["typo2".to_string(); 2]);
            cats
        },
        "target" => &(0..100).map(|i| i as f64).collect::<Vec<_>>()
    }
    .unwrap();

    let pipeline = DataPipeline::new(
        PipelineConfig::new()
            .with_num_bins(8)
            .with_cms_params(0.01, 0.99, 10)
            .with_smoothing(1.0),
    );

    let (_dataset, state, _filtered_df) = pipeline
        .process_for_training(df, "target", Some(&["category"]))
        .expect("Pipeline should succeed");

    // Only the two categories above the min-count of 10 should be mapped.
    let cat_state = &state.categorical_encodings[0];
    assert_eq!(
        cat_state.category_mapping.category_to_idx.len(),
        2,
        "Expected 2 frequent categories, got {}",
        cat_state.category_mapping.category_to_idx.len()
    );

    let kept_cats: Vec<&str> = cat_state
        .category_mapping
        .category_to_idx
        .iter()
        .map(|(name, _)| name.as_str())
        .collect();
    assert!(kept_cats.contains(&"frequent"), "Should keep 'frequent'");
    assert!(
        kept_cats.contains(&"also_frequent"),
        "Should keep 'also_frequent'"
    );
}
/// Verify that target encoding preserves the ordering of per-category target
/// means: categories with lower average targets must receive lower encodings.
/// Smoothing is disabled (0.0) so encodings reflect raw category means.
#[test]
fn test_pipeline_target_encoding_prevents_leakage() {
    // Per-category target means: A = 20, B = 200, C = 2.5 → expected
    // encoding order is C < A < B.
    let df = df! {
        "category" => &["A", "A", "A", "B", "B", "B", "C", "C", "C", "C"],
        "target" => &[10.0, 20.0, 30.0, 100.0, 200.0, 300.0, 1.0, 2.0, 3.0, 4.0]
    }
    .unwrap();

    let pipeline = DataPipeline::new(
        PipelineConfig::new()
            .with_num_bins(8)
            .with_cms_params(0.01, 0.99, 1)
            .with_smoothing(0.0),
    );

    let (_dataset, state, _filtered_df) = pipeline
        .process_for_training(df, "target", Some(&["category"]))
        .expect("Pipeline should succeed");

    let cat_state = &state.categorical_encodings[0];
    let enc_a = cat_state.encoding_map.encode("A");
    let enc_b = cat_state.encoding_map.encode("B");
    let enc_c = cat_state.encoding_map.encode("C");

    assert!(
        enc_a < enc_b,
        "A (low target) should have lower encoding than B (high target)"
    );
    assert!(
        enc_c < enc_a,
        "C (very low target) should have lowest encoding"
    );
}
/// Verify that predicting from the binned training dataset and predicting from
/// raw (unbinned) feature values agree to within binning tolerance. Binning to
/// 16 buckets quantizes the inputs, so exact equality is not expected — only
/// that the two prediction paths stay within 50.0 of each other per row.
#[test]
fn test_raw_prediction_equivalence() {
    // 50 rows: f0 counts up 1..=50, f1 counts down 50..=1, f2 cycles through
    // 5, 10, ..., 50; target is a clean linear function of the row index.
    let df = df! {
        "f0" => &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
                  11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0,
                  21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
                  31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0,
                  41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0],
        "f1" => &[50.0, 49.0, 48.0, 47.0, 46.0, 45.0, 44.0, 43.0, 42.0, 41.0,
                  40.0, 39.0, 38.0, 37.0, 36.0, 35.0, 34.0, 33.0, 32.0, 31.0,
                  30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0,
                  20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0,
                  10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        "f2" => &[5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0,
                  5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0,
                  5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0,
                  5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0,
                  5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
        "target" => &[10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0,
                      110.0, 120.0, 130.0, 140.0, 150.0, 160.0, 170.0, 180.0, 190.0, 200.0,
                      210.0, 220.0, 230.0, 240.0, 250.0, 260.0, 270.0, 280.0, 290.0, 300.0,
                      310.0, 320.0, 330.0, 340.0, 350.0, 360.0, 370.0, 380.0, 390.0, 400.0,
                      410.0, 420.0, 430.0, 440.0, 450.0, 460.0, 470.0, 480.0, 490.0, 500.0]
    }
    .unwrap();

    let pipeline = DataPipeline::new(PipelineConfig::new().with_num_bins(16));
    let (dataset, _state, _filtered_df) = pipeline
        .process_for_training(df.clone(), "target", None)
        .expect("Pipeline should succeed");

    let config = GBDTConfig::new()
        .with_num_rounds(20)
        .with_max_depth(4)
        .with_learning_rate(0.1);
    let model = GBDTModel::train_binned(&dataset, config).expect("Training should succeed");

    let binned_predictions = model.predict(&dataset);

    // Rebuild the same 50 rows as a flat row-major [f0, f1, f2] buffer,
    // mirroring the column formulas above: f0 = i+1, f1 = 50-i,
    // f2 = (i % 10) * 5 + 5.
    let num_rows = 50;
    let num_features = 3;
    let mut raw_features = Vec::with_capacity(num_rows * num_features);
    for i in 0..num_rows {
        raw_features.push((i + 1) as f64);
        raw_features.push((50 - i) as f64);
        raw_features.push(((i % 10) * 5 + 5) as f64);
    }
    let raw_predictions = model.predict_raw(&raw_features);
    assert_eq!(binned_predictions.len(), raw_predictions.len());

    // Largest per-row discrepancy between the two prediction paths.
    let max_diff: f32 = binned_predictions
        .iter()
        .zip(raw_predictions.iter())
        .map(|(b, r)| (b - r).abs())
        .fold(0.0f32, f32::max);
    assert!(
        max_diff < 50.0,
        "Max difference between binned and raw predictions: {} (expected < 50.0)",
        max_diff
    );
}