fn main() {
println!("{}", "=".repeat(70));
println!("TreeBoost: Target Encoding Example");
println!("{}", "=".repeat(70));
println!();
println!("1. Categorical Feature Handling Demonstration");
println!();
println!(" Count-Min Sketch:");
println!(" - Efficiently estimates frequencies of categorical values");
println!(" - Used to identify and filter rare categories");
println!(" - Memory usage: O(1/eps * ln(1/delta))");
println!(" - Example parameters:");
println!(" * eps: 0.001 (1% error tolerance)");
println!(" * confidence: 0.99 (99% confidence)");
println!(" * min_count: 5 (categories with <5 samples -> 'unknown')");
println!();
let categories = vec![
"category_1",
"category_2",
"category_3",
"category_1",
"category_4",
"category_1",
"category_5",
"category_2",
"category_1",
"category_6",
"category_1",
"category_7",
"category_2",
"category_8",
"category_1",
];
let mut counts: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
for cat in &categories {
*counts.entry(cat).or_insert(0) += 1;
}
println!(" Category Frequencies:");
let mut sorted_counts: Vec<_> = counts.iter().collect();
sorted_counts.sort_by_key(|&(_, &count)| std::cmp::Reverse(count));
for (cat, count) in &sorted_counts {
let freq = **count as f32 / categories.len() as f32;
println!(
" {:15}: {:2} occurrences ({:.1}%)",
cat,
count,
freq * 100.0
);
}
println!();
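// The exact counts above could instead be approximated with a Count-Min Sketch.
// What follows is a minimal illustrative sketch for this example only, not
// TreeBoost's internal implementation; the width/depth parameterisation
// (width = ceil(e/eps), depth = ceil(ln(1/(1 - confidence)))) is a common
// textbook choice assumed here.
struct CountMinSketch {
    width: usize,
    depth: usize,
    table: Vec<u32>, // `depth` rows of `width` counters, stored row-major
}

impl CountMinSketch {
    fn new(eps: f64, confidence: f64) -> Self {
        let width = (std::f64::consts::E / eps).ceil() as usize;
        let depth = (1.0 / (1.0 - confidence)).ln().ceil() as usize;
        CountMinSketch { width, depth, table: vec![0; width * depth] }
    }

    // Hash `item` into a column for the given row by seeding the hasher with
    // the row index (a simple stand-in for `depth` independent hash functions).
    fn column(&self, item: &str, row: usize) -> usize {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        row.hash(&mut hasher);
        item.hash(&mut hasher);
        (hasher.finish() as usize) % self.width
    }

    fn add(&mut self, item: &str) {
        for row in 0..self.depth {
            let col = self.column(item, row);
            self.table[row * self.width + col] += 1;
        }
    }

    // Estimate = minimum counter across rows: collisions only ever inflate
    // counters, so the smallest value is the tightest upper bound.
    fn estimate(&self, item: &str) -> u32 {
        (0..self.depth)
            .map(|row| self.table[row * self.width + self.column(item, row)])
            .min()
            .unwrap_or(0)
    }
}

let mut sketch = CountMinSketch::new(0.001, 0.99);
for cat in &categories {
    sketch.add(cat);
}
println!(" Count-Min Sketch estimates (illustrative sketch, not library internals):");
for (cat, exact) in &sorted_counts {
    println!(" {:15}: exact={:2}, estimated={:2}", cat, exact, sketch.estimate(cat));
}
println!();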
println!("2. Ordered Target Encoding");
println!();
println!(" Approach:");
println!(" - Process training data sequentially");
println!(" - For each category, encode with mean of target values seen so far");
println!(" - Prevents target leakage (uses only past information)");
println!(" - Applies smoothing to reduce variance for rare categories");
println!();
let training_data = vec![
("cat_a", 1.2),
("cat_b", 2.5),
("cat_a", 1.8),
("cat_c", 0.9),
("cat_b", 2.1),
("cat_a", 1.5),
];
println!(" Training Sequence:");
let mut encodings: std::collections::HashMap<&str, Vec<f32>> = std::collections::HashMap::new();
for (i, (cat, target)) in training_data.iter().enumerate() {
    // Encode using only the targets seen for this category in earlier rows,
    // so the current row's own target never leaks into its encoding.
    let encoding = match encodings.get(cat) {
        Some(prior) => format!("{:.4}", prior.iter().sum::<f32>() / prior.len() as f32),
        None => "n/a (first occurrence)".to_string(),
    };
    println!(
        " Row {}: category='{}', target={:.1}, encoding (mean of prior rows)={}",
        i + 1,
        cat,
        target,
        encoding
    );
    encodings.entry(cat).or_insert_with(Vec::new).push(*target);
}
println!();
println!(" Final Encodings (without smoothing):");
for cat in &["cat_a", "cat_b", "cat_c"] {
if let Some(targets) = encodings.get(cat) {
let mean = targets.iter().sum::<f32>() / targets.len() as f32;
println!(" {}: {:.4}", cat, mean);
}
}
println!();
println!("3. Smoothing Effect");
println!();
println!(" Smoothing reduces variance for rare categories:");
println!(
" Smoothed encoding = (count * mean + smoothing * global_mean) / (count + smoothing)"
);
println!();
let global_mean =
training_data.iter().map(|(_, t)| t).sum::<f32>() / training_data.len() as f32;
let smoothing = 10.0;
println!(" Parameters:");
println!(" Global mean (all data): {:.4}", global_mean);
println!(" Smoothing parameter: {:.1}", smoothing);
println!();
println!(" Smoothed Encodings:");
for cat in &["cat_a", "cat_b", "cat_c"] {
if let Some(targets) = encodings.get(cat) {
let count = targets.len() as f32;
let mean = targets.iter().sum::<f32>() / count;
let smoothed = (count * mean + smoothing * global_mean) / (count + smoothing);
println!(
" {:5}: raw={:.4}, count={:.0}, smoothed={:.4}",
cat, mean, count, smoothed
);
}
}
println!();
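// Combining sections 2 and 3: a compact illustrative helper that produces
// ordered (leakage-free) target encodings smoothed toward the running global
// mean. This is a sketch of the idea, not TreeBoost's actual routine.
fn ordered_target_encode(rows: &[(&str, f32)], smoothing: f32) -> Vec<f32> {
    // category -> (sum of targets seen so far, count seen so far)
    let mut stats: std::collections::HashMap<&str, (f32, f32)> =
        std::collections::HashMap::new();
    let mut global_sum = 0.0_f32;
    let mut global_count = 0.0_f32;
    let mut encoded = Vec::with_capacity(rows.len());
    for &(cat, target) in rows {
        // Encode using only rows seen *before* this one (no target leakage).
        // The very first row has no history at all, so its prior is 0.0 here;
        // a real implementation would typically use a dataset-level prior.
        let (sum, count) = stats.get(cat).copied().unwrap_or((0.0, 0.0));
        let prior = if global_count > 0.0 { global_sum / global_count } else { 0.0 };
        encoded.push((sum + smoothing * prior) / (count + smoothing));
        // Only now fold the current row into the running statistics.
        let entry = stats.entry(cat).or_insert((0.0, 0.0));
        entry.0 += target;
        entry.1 += 1.0;
        global_sum += target;
        global_count += 1.0;
    }
    encoded
}

println!(" Ordered + smoothed encodings for the training sequence above");
println!(" (illustrative helper, smoothing={:.1}):", smoothing);
for ((cat, target), enc) in training_data
    .iter()
    .zip(ordered_target_encode(&training_data, smoothing))
{
    println!(" {:5}: target={:.1}, encoding={:.4}", cat, target, enc);
}
println!();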
println!("4. Integration with TreeBoost");
println!();
println!(" In TreeBoost, categorical handling is automatic:");
println!();
println!(" Rust Example:");
println!(" ```rust");
println!(" let config = GBDTConfig::new()");
println!(" .with_use_target_encoding(true) // Enable encoding");
println!(" .with_cms_params(eps: 0.001, confidence: 0.99)");
println!(" .with_categorical_features(vec![0, 2, 5]); // Which columns");
println!();
println!(" let model = GBDTModel::train(&features, num_features, &targets, config, None)?;");
println!(" ```");
println!();
println!(" Python Example:");
println!(" ```python");
println!(" config = GBDTConfig()");
println!(" config.use_target_encoding = True");
println!(" config.cms_params = {{\"eps\": 0.001, \"confidence\": 0.99}}");
println!(" config.categorical_features = [0, 2, 5]");
println!();
println!(" model = GBDTModel.train(X, y, config)");
println!(" ```");
println!();
println!("5. Performance Characteristics");
println!();
println!(" Time Complexity:");
println!(" - Count-Min Sketch: O(1) per update, O(k ln(1/delta)) for query");
println!(" - Ordered Target Encoding: O(n) for n training samples");
println!();
println!(" Space Complexity:");
println!(" - Count-Min Sketch: O(1/eps * ln(1/delta))");
println!(" - Encoding map: O(k) where k = number of unique categories");
println!();
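// Concrete sizing for the parameters used in this example (assumed textbook
// parameterisation: width = ceil(e/eps), depth = ceil(ln(1/(1 - confidence)))).
// The counter table size is independent of the number of distinct categories.
let (eps, confidence) = (0.001_f64, 0.99_f64);
let width = (std::f64::consts::E / eps).ceil() as usize;
let depth = (1.0 / (1.0 - confidence)).ln().ceil() as usize;
println!(
    " Example sketch size for eps={}, confidence={}: {} x {} u32 counters (~{} KB)",
    eps,
    confidence,
    width,
    depth,
    width * depth * std::mem::size_of::<u32>() / 1024
);
println!();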
println!(" Handles:");
println!(" - Arbitrary cardinality (100s to millions of categories)");
println!(" - New categories at inference time (-> 'unknown' fallback)");
println!(" - Imbalanced categories (rare categories smoothed)");
println!();
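// Illustrative inference-time lookup with an assumed fallback: a category not
// seen during training maps here to the global training mean (the 'unknown'
// bucket); the exact fallback value is a design choice, not library behaviour.
let final_encoding: std::collections::HashMap<&str, f32> = encodings
    .iter()
    .map(|(cat, targets)| (*cat, targets.iter().sum::<f32>() / targets.len() as f32))
    .collect();
for new_cat in &["cat_a", "cat_never_seen"] {
    let enc = final_encoding.get(new_cat).copied().unwrap_or(global_mean);
    println!(" Inference lookup '{}' -> {:.4}", new_cat, enc);
}
println!();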
println!("6. Advantages over One-Hot Encoding");
println!();
println!(" One-Hot Encoding Issues:");
println!(" - Explodes feature dimensionality (k categories -> k columns)");
println!(" - Memory intensive for high-cardinality features");
println!(" - Sparse representation (mostly zeros)");
println!();
println!(" Target Encoding Benefits:");
println!(" - Single column per categorical feature");
println!(" - Dense representation (no sparsity)");
println!(" - Incorporates target information (supervised)");
println!(" - Memory efficient even for millions of categories");
println!();
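// Back-of-the-envelope comparison with assumed example sizes: one-hot encoding
// a single feature with k categories adds k columns, while target encoding
// always adds exactly one dense column per categorical feature.
let (num_rows, num_categories) = (1_000_000_u64, 100_000_u64);
let one_hot_cells = num_rows * num_categories; // dense one-hot matrix cells
let target_encoded_cells = num_rows; // a single dense f32 column
println!(
    " Example: {} rows x {} categories -> one-hot: {} cells, target-encoded: {} cells",
    num_rows, num_categories, one_hot_cells, target_encoded_cells
);
println!();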
println!("{}", "=".repeat(70));
println!("Example demonstration completed!");
println!();
println!("In practice, use DatasetLoader or DataPipeline for automatic");
println!("categorical encoding in your TreeBoost pipeline.");
println!("{}", "=".repeat(70));
}