#![allow(clippy::result_large_err)]
use pandrs::dataframe::DataFrame;
use pandrs::error::Result;
use pandrs::ml::models::ensemble::{
RandomForestClassifier, RandomForestConfig, RandomForestConfigBuilder, RandomForestRegressor,
};
use pandrs::ml::models::tree::{DecisionTreeClassifier, DecisionTreeConfig};
use pandrs::ml::models::{train_test_split, SupervisedModel};
use pandrs::series::Series;
use std::collections::HashMap;
fn main() -> Result<()> {
println!("=== Random Forest Examples ===\n");
classification_example()?;
regression_example()?;
forest_vs_tree_comparison()?;
feature_importance_example()?;
Ok(())
}
/// Credit-scoring demo: fits a `RandomForestClassifier` on a 30-row,
/// hard-coded data set and reports sample predictions, accuracy and
/// feature importances.
///
/// The `loan_denied` column is the target (`0.0` = approved, `1.0` = denied);
/// the remaining four columns are the features.
///
/// # Errors
///
/// Propagates any error from building the data frame, splitting it,
/// fitting the forest, predicting, or reading the label column.
fn classification_example() -> Result<()> {
    println!("--- Classification Example: Credit Scoring ---");

    // Column name -> values, inserted into the frame in this order.
    let columns: Vec<(&str, Vec<f64>)> = vec![
        (
            "credit_score",
            vec![
                720.0, 680.0, 750.0, 620.0, 580.0, 700.0, 760.0, 640.0, 590.0, 710.0, 670.0, 730.0,
                600.0, 690.0, 740.0, 610.0, 720.0, 650.0, 770.0, 630.0, 595.0, 705.0, 725.0, 615.0,
                580.0, 715.0, 685.0, 755.0, 605.0, 695.0,
            ],
        ),
        (
            "income",
            vec![
                60.0, 45.0, 75.0, 35.0, 30.0, 55.0, 80.0, 40.0, 28.0, 58.0, 48.0, 70.0, 32.0, 50.0,
                72.0, 33.0, 62.0, 42.0, 85.0, 38.0, 29.0, 57.0, 65.0, 36.0, 27.0, 59.0, 47.0, 78.0,
                31.0, 52.0,
            ],
        ),
        (
            "debt_to_income",
            vec![
                0.25, 0.35, 0.20, 0.45, 0.50, 0.30, 0.18, 0.40, 0.48, 0.28, 0.33, 0.22, 0.47, 0.32,
                0.19, 0.46, 0.26, 0.38, 0.17, 0.42, 0.49, 0.29, 0.24, 0.44, 0.51, 0.27, 0.34, 0.21,
                0.45, 0.31,
            ],
        ),
        (
            "employment_years",
            vec![
                5.0, 3.0, 8.0, 2.0, 1.0, 4.0, 10.0, 2.5, 1.5, 6.0, 3.5, 7.0, 2.0, 4.5, 9.0, 1.8,
                5.5, 3.2, 12.0, 2.8, 1.2, 5.8, 6.5, 2.2, 0.8, 6.2, 3.8, 8.5, 1.9, 4.8,
            ],
        ),
        (
            "loan_denied",
            vec![
                0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0,
                0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0,
            ],
        ),
    ];

    let mut df = DataFrame::new();
    for (name, values) in columns {
        df.add_column(
            name.to_string(),
            Series::new(values, Some(name.to_string()))?,
        )?;
    }
    println!("Dataset shape: {} rows, {} columns", df.nrows(), df.ncols());

    let (train_df, test_df) = train_test_split(&df, 0.3, false, Some(42))?;
    println!("Training set: {} rows", train_df.nrows());
    println!("Test set: {} rows", test_df.nrows());

    // NOTE(review): `oob_score(true)` is requested but this example never
    // reads an out-of-bag score back — confirm whether that is intentional.
    let config = RandomForestConfigBuilder::new()
        .n_estimators(100)
        .max_depth(5)
        .max_features(2)
        .bootstrap(true)
        .oob_score(true)
        .random_seed(42)
        .build();
    let mut classifier = RandomForestClassifier::new(config);

    println!("\nTraining Random Forest with 100 trees...");
    classifier.fit(&train_df, "loan_denied")?;
    println!("Forest trained successfully!");

    let predictions = classifier.predict(&test_df)?;
    println!("\nPredictions on test set:");
    let test_labels = test_df.get_column::<f64>("loan_denied")?;

    // Show at most the first five prediction/label pairs.
    for (i, (pred, actual)) in predictions
        .iter()
        .zip(test_labels.values())
        .take(5)
        .enumerate()
    {
        let pred_label = if *pred < 0.5 { "Approved" } else { "Denied" };
        let actual_label = if *actual < 0.5 { "Approved" } else { "Denied" };
        println!(
            " Loan {}: Predicted = {}, Actual = {}",
            i + 1,
            pred_label,
            actual_label
        );
    }

    // Count the pairs that were actually compared so the denominator always
    // matches the numerator, and so an empty test set does not yield NaN.
    let mut correct = 0usize;
    let mut scored = 0usize;
    for (pred, actual) in predictions.iter().zip(test_labels.values()) {
        scored += 1;
        if (pred.round() - actual).abs() < 0.5 {
            correct += 1;
        }
    }
    if scored == 0 {
        println!("\nAccuracy: n/a (empty test set)");
    } else {
        let accuracy = correct as f64 / scored as f64;
        println!("\nAccuracy: {:.2}%", accuracy * 100.0);
    }

    if let Some(importances) = classifier.feature_importances() {
        println!("\nFeature Importances (averaged across all trees):");
        let mut importance_vec: Vec<(&String, &f64)> = importances.iter().collect();
        // Descending by importance. `total_cmp` cannot panic on NaN, and the
        // name tie-break keeps the printed order deterministic.
        importance_vec.sort_by(|a, b| b.1.total_cmp(a.1).then_with(|| a.0.cmp(b.0)));
        for (feature, importance) in importance_vec {
            // Bar length is the importance scaled by 50, truncated to whole cells.
            let bar_length = (importance * 50.0) as usize;
            let bar = "█".repeat(bar_length);
            println!(" {:20} {:.4} {}", feature, importance, bar);
        }
    }

    println!();
    Ok(())
}
/// Real-estate demo: fits a `RandomForestRegressor` on a 30-row, hard-coded
/// housing data set and reports sample predictions plus RMSE, MAE and R².
///
/// The `price` column is the target; the other five columns are features.
/// Output labels present prices in thousands of dollars.
///
/// R² is computed as `1 - SS_res / SS_tot` over the test rows, where
/// `SS_res` is the sum of squared prediction errors and `SS_tot` is the sum
/// of squared deviations of the actual values from their mean.
///
/// # Errors
///
/// Propagates any error from building the data frame, splitting it,
/// fitting the forest, predicting, or reading the label column.
fn regression_example() -> Result<()> {
    println!("--- Regression Example: Real Estate Price Prediction ---");

    // Column name -> values, inserted into the frame in this order.
    let columns: Vec<(&str, Vec<f64>)> = vec![
        (
            "square_feet",
            vec![
                1200.0, 1500.0, 1800.0, 2200.0, 1350.0, 1600.0, 2500.0, 1950.0, 1450.0, 1750.0,
                2100.0, 2400.0, 1300.0, 1650.0, 2300.0, 2600.0, 1550.0, 1900.0, 2050.0, 2450.0,
                1400.0, 1700.0, 2150.0, 2350.0, 1250.0, 1850.0, 2000.0, 2500.0, 1500.0, 1800.0,
            ],
        ),
        (
            "bedrooms",
            vec![
                2.0, 3.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, 5.0,
                3.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 3.0, 4.0, 3.0, 3.0,
            ],
        ),
        (
            "bathrooms",
            vec![
                1.0, 2.0, 2.0, 2.5, 1.0, 2.0, 3.0, 2.0, 1.5, 2.0, 2.5, 3.0, 1.0, 2.0, 2.5, 3.5,
                2.0, 2.0, 2.5, 3.0, 1.5, 2.0, 2.5, 2.5, 1.0, 2.0, 2.0, 3.0, 2.0, 2.0,
            ],
        ),
        (
            "lot_size",
            vec![
                5000.0, 6000.0, 7500.0, 9000.0, 5500.0, 6500.0, 10000.0, 8000.0, 5200.0, 7000.0,
                8500.0, 9500.0, 5100.0, 6800.0, 9200.0, 11000.0, 6200.0, 7800.0, 8200.0, 9800.0,
                5400.0, 7200.0, 8700.0, 9300.0, 4900.0, 7600.0, 8100.0, 10500.0, 6300.0, 7500.0,
            ],
        ),
        (
            "age",
            vec![
                15.0, 10.0, 5.0, 2.0, 20.0, 8.0, 1.0, 6.0, 18.0, 7.0, 3.0, 2.0, 22.0, 9.0, 4.0,
                1.0, 12.0, 5.0, 3.0, 2.0, 16.0, 6.0, 4.0, 3.0, 19.0, 5.0, 4.0, 1.0, 11.0, 7.0,
            ],
        ),
        (
            "price",
            vec![
                220.0, 280.0, 340.0, 420.0, 240.0, 300.0, 520.0, 380.0, 250.0, 330.0, 400.0, 480.0,
                230.0, 310.0, 450.0, 580.0, 290.0, 360.0, 390.0, 490.0, 245.0, 325.0, 410.0, 460.0,
                215.0, 350.0, 385.0, 510.0, 285.0, 335.0,
            ],
        ),
    ];

    let mut df = DataFrame::new();
    for (name, values) in columns {
        df.add_column(
            name.to_string(),
            Series::new(values, Some(name.to_string()))?,
        )?;
    }
    println!("Dataset shape: {} rows, {} columns", df.nrows(), df.ncols());

    let (train_df, test_df) = train_test_split(&df, 0.25, false, Some(42))?;

    let config = RandomForestConfigBuilder::new()
        .n_estimators(150)
        .max_depth(6)
        .min_samples_split(3)
        .min_samples_leaf(2)
        .bootstrap(true)
        .random_seed(42)
        .build();
    let mut regressor = RandomForestRegressor::new(config);

    println!("\nTraining Random Forest with 150 trees...");
    regressor.fit(&train_df, "price")?;
    println!("Forest trained successfully!");

    let predictions = regressor.predict(&test_df)?;
    println!("\nPredictions on test set (price in $1000s):");
    let test_labels = test_df.get_column::<f64>("price")?;

    // Show at most the first five prediction/label pairs.
    for (i, (pred, actual)) in predictions
        .iter()
        .zip(test_labels.values())
        .take(5)
        .enumerate()
    {
        let error_pct = ((pred - actual).abs() / actual) * 100.0;
        println!(
            " House {}: Predicted = ${:.1}k, Actual = ${:.1}k, Error = {:.1}%",
            i + 1,
            pred,
            actual,
            error_pct
        );
    }

    // First pass: accumulate residual sums and the sum of the actual values
    // over exactly the pairs that are compared.
    let mut sq_err_sum = 0.0_f64;
    let mut abs_err_sum = 0.0_f64;
    let mut actual_sum = 0.0_f64;
    let mut count = 0usize;
    for (pred, actual) in predictions.iter().zip(test_labels.values()) {
        let error = pred - actual;
        sq_err_sum += error * error;
        abs_err_sum += error.abs();
        actual_sum += *actual;
        count += 1;
    }

    if count == 0 {
        // Nothing to average; avoid printing NaN metrics.
        println!("\nPerformance Metrics: n/a (empty test set)");
    } else {
        let n = count as f64;
        let mse = sq_err_sum / n;
        let mae = abs_err_sum / n;
        let rmse = mse.sqrt();

        // Second pass: total sum of squares around the mean of the actuals.
        let mean_actual = actual_sum / n;
        let mut ss_tot = 0.0_f64;
        for (_, actual) in predictions.iter().zip(test_labels.values()) {
            let deviation = *actual - mean_actual;
            ss_tot += deviation * deviation;
        }

        println!("\nPerformance Metrics:");
        println!(" RMSE: ${:.2}k", rmse);
        println!(" MAE: ${:.2}k", mae);
        if ss_tot > 0.0 {
            println!(" R² Score: {:.4}", 1.0 - sq_err_sum / ss_tot);
        } else {
            // R² is undefined when every actual value is identical.
            println!(" R² Score: n/a (test targets have zero variance)");
        }
    }

    println!();
    Ok(())
}
/// Fits a single decision tree and a 50-tree random forest on the same
/// 25-row data set and prints the test-set RMSE of each.
///
/// The target `y` is roughly `2 * x1`, and `x2` is `x1 + 1`.
///
/// NOTE(review): the single tree is a `DecisionTreeClassifier` while the
/// forest is a `RandomForestRegressor`, yet `y` is a continuous target, so
/// the two RMSE figures may not be a like-for-like comparison — confirm
/// whether a regressor tree was intended.
///
/// # Errors
///
/// Propagates any error from building the data frame, splitting it,
/// fitting either model, predicting, or reading the label column.
fn forest_vs_tree_comparison() -> Result<()> {
    println!("--- Random Forest vs Single Decision Tree ---");

    // Column name -> values, inserted into the frame in this order.
    let columns: Vec<(&str, Vec<f64>)> = vec![
        (
            "x1",
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
                16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
            ],
        ),
        (
            "x2",
            vec![
                2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0,
            ],
        ),
        (
            "y",
            vec![
                2.1, 4.3, 6.2, 8.4, 10.1, 12.3, 14.2, 16.4, 18.1, 20.3, 22.2, 24.4, 26.1, 28.3,
                30.2, 32.4, 34.1, 36.3, 38.2, 40.4, 42.1, 44.3, 46.2, 48.4, 50.1,
            ],
        ),
    ];

    let mut df = DataFrame::new();
    for (name, values) in columns {
        df.add_column(
            name.to_string(),
            Series::new(values, Some(name.to_string()))?,
        )?;
    }

    let (train_df, test_df) = train_test_split(&df, 0.3, false, Some(42))?;

    // --- Single tree -------------------------------------------------------
    println!("Training single Decision Tree...");
    let mut tree = DecisionTreeClassifier::new(DecisionTreeConfig::default());
    tree.fit(&train_df, "y")?;
    let tree_pred = tree.predict(&test_df)?;

    let test_labels = test_df.get_column::<f64>("y")?;
    let tree_sq_sum = tree_pred
        .iter()
        .zip(test_labels.values())
        .fold(0.0, |acc, (pred, actual)| acc + (pred - actual).powi(2));
    let tree_mse: f64 = tree_sq_sum / tree_pred.len() as f64;
    println!(" Single Tree RMSE: {:.4}", tree_mse.sqrt());

    // --- Random forest -----------------------------------------------------
    println!("\nTraining Random Forest (50 trees)...");
    let mut forest = RandomForestRegressor::new(
        RandomForestConfigBuilder::new()
            .n_estimators(50)
            .bootstrap(true)
            .random_seed(42)
            .build(),
    );
    forest.fit(&train_df, "y")?;
    let forest_pred = forest.predict(&test_df)?;

    let forest_sq_sum = forest_pred
        .iter()
        .zip(test_labels.values())
        .fold(0.0, |acc, (pred, actual)| acc + (pred - actual).powi(2));
    let forest_mse: f64 = forest_sq_sum / forest_pred.len() as f64;
    println!(" Random Forest RMSE: {:.4}", forest_mse.sqrt());

    println!("\nRandom Forest typically provides:");
    for benefit in [
        " - Better generalization (reduced overfitting)",
        " - More stable predictions",
        " - Robust feature importance estimates",
        " - Higher accuracy on unseen data",
    ]
    .iter()
    {
        println!("{}", benefit);
    }

    println!();
    Ok(())
}
/// Fits a 100-tree `RandomForestRegressor` on a 20-row data set and prints
/// each feature's importance as a text bar chart, most important first.
///
/// In the hard-coded data, `target` is roughly `2 * critical_feature`,
/// `moderate_feature` is `21 - critical_feature`, `weak_feature` hovers
/// around 5, and the two `noise` columns have no visible pattern.
///
/// The model is fitted on the full frame; there is no train/test split here.
///
/// # Errors
///
/// Propagates any error from building the data frame or fitting the forest.
fn feature_importance_example() -> Result<()> {
    println!("--- Feature Importance with Random Forest ---");

    // Column name -> values, inserted into the frame in this order.
    let columns: Vec<(&str, Vec<f64>)> = vec![
        (
            "critical_feature",
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
                16.0, 17.0, 18.0, 19.0, 20.0,
            ],
        ),
        (
            "moderate_feature",
            vec![
                20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0,
                6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
            ],
        ),
        (
            "weak_feature",
            vec![
                5.0, 5.1, 4.9, 5.2, 4.8, 5.3, 4.7, 5.0, 5.1, 4.9, 5.2, 4.8, 5.0, 5.1, 4.9, 5.2,
                4.8, 5.0, 5.1, 4.9,
            ],
        ),
        (
            "noise1",
            vec![
                1.2, 7.8, 3.4, 9.1, 2.5, 8.3, 4.7, 6.2, 5.1, 3.9, 7.5, 2.3, 9.6, 4.4, 6.8, 1.7,
                8.9, 3.2, 5.5, 7.1,
            ],
        ),
        (
            "noise2",
            vec![
                9.3, 2.7, 6.1, 4.5, 8.2, 1.9, 7.4, 3.8, 5.6, 2.1, 8.7, 4.3, 6.9, 1.5, 7.8, 3.4,
                9.2, 2.6, 5.1, 8.4,
            ],
        ),
        (
            "target",
            vec![
                2.0, 4.1, 6.0, 8.2, 10.1, 12.0, 14.2, 16.1, 18.0, 20.2, 22.1, 24.0, 26.2, 28.1,
                30.0, 32.2, 34.1, 36.0, 38.2, 40.1,
            ],
        ),
    ];

    let mut df = DataFrame::new();
    for (name, values) in columns {
        df.add_column(
            name.to_string(),
            Series::new(values, Some(name.to_string()))?,
        )?;
    }

    let config = RandomForestConfigBuilder::new()
        .n_estimators(100)
        .max_depth(5)
        .bootstrap(true)
        .random_seed(42)
        .build();
    let mut regressor = RandomForestRegressor::new(config);
    regressor.fit(&df, "target")?;

    println!("Feature Importances from Random Forest (100 trees):");
    println!("(Higher values indicate features used more frequently for splitting)\n");

    if let Some(importances) = regressor.feature_importances() {
        let mut importance_vec: Vec<(&String, &f64)> = importances.iter().collect();
        // Descending by importance. `total_cmp` cannot panic on NaN, and the
        // name tie-break keeps the printed order deterministic.
        importance_vec.sort_by(|a, b| b.1.total_cmp(a.1).then_with(|| a.0.cmp(b.0)));
        for (feature, importance) in importance_vec {
            // Bar length is the importance scaled by 60, truncated to whole cells.
            let bar_length = (importance * 60.0) as usize;
            let bar = "█".repeat(bar_length);
            println!(" {:20} {:.4} {}", feature, importance, bar);
        }
    }

    println!("\nRandom Forest aggregates feature importance across all trees,");
    println!("providing more robust estimates than a single tree.");
    println!();
    Ok(())
}