#[cfg(feature = "optimized")]
use pandrs::column::ColumnTrait;
#[cfg(feature = "optimized")]
use pandrs::ml::pipeline_compat::Pipeline;
#[cfg(feature = "optimized")]
use pandrs::ml::preprocessing::{
Binner, ImputeStrategy, Imputer, MinMaxScaler, PolynomialFeatures, StandardScaler,
};
#[cfg(feature = "optimized")]
use pandrs::optimized::OptimizedDataFrame;
#[cfg(feature = "optimized")]
use pandrs::{DataFrame, PandRSError, Series};
#[cfg(feature = "optimized")]
use rand::Rng;
#[cfg(not(feature = "optimized"))]
fn main() {
    // Fallback entry point: the example body requires the optimized column
    // storage, so without the feature flag we only print build instructions.
    println!("This example requires the 'optimized' feature flag to be enabled.");
    println!("Please recompile with:");
    println!(
        " cargo run --example optimized_ml_feature_engineering_example --features \"optimized\""
    );
}
#[cfg(feature = "optimized")]
#[allow(clippy::result_large_err)]
fn main() -> Result<(), PandRSError> {
    // Walks through a feature-engineering workflow on the optimized DataFrame:
    // polynomial features, binning, missing-value simulation + imputation,
    // variance-based selection, and a two-stage scaling pipeline.
    //
    // NOTE: several transformer objects below are configured but never applied
    // (underscore-prefixed placeholders); the printed column counts therefore
    // come from clones of the input frame, not transformed data.
    println!("=== Example of Optimized Machine Learning: Feature Engineering ===\n");

    let source_df = create_sample_data()?;
    println!(
        "Original DataFrame: {:?} rows x {:?} columns",
        source_df.row_count(),
        source_df.column_names().len()
    );

    let opt_df = convert_to_optimized_df(&source_df)?;

    // Degree-2 polynomial expansion over value1/value2 (placeholder: not applied).
    let _poly_features = PolynomialFeatures::new(2)
        .with_columns(vec!["value1".to_string(), "value2".to_string()])
        .include_bias(false);
    let poly_df = opt_df.clone();
    println!(
        "\nDataFrame with Polynomial Features: {:?} columns",
        poly_df.column_names().len()
    );

    // Four uniform-width bins for value1 (placeholder: not applied).
    let _binner = Binner::new(4)
        .with_strategy("uniform")
        .with_columns(vec!["value1".to_string()]);
    let binned_df = opt_df.clone();
    println!(
        "\nDataFrame after Binning: {:?} columns",
        binned_df.column_names().len()
    );

    let na_df = source_df.clone();
    let mut rng = rand::rng();
    let _n_rows = na_df.row_count();
    let na_opt_df = opt_df.clone();

    // Simulate ~20% missing entries in "value1". The mask is only built here —
    // it is never written back into the frame, so the frame stays unchanged.
    if let Ok(column_view) = na_opt_df.column("value1") {
        if let Some(numeric) = column_view.as_float64() {
            let _mask: Vec<Option<f64>> = (0..numeric.len())
                .map(|row| match numeric.get(row) {
                    // Drop a present value with probability 0.2, keep it otherwise.
                    Ok(Some(v)) => {
                        if rng.random::<f64>() < 0.2 {
                            None
                        } else {
                            Some(v)
                        }
                    }
                    // Errors and already-missing cells both become None.
                    _ => None,
                })
                .collect();
        }
    }
    println!(
        "\nDataFrame with Missing Values: {:?} rows",
        na_opt_df.row_count()
    );
    println!(
        "\nDataFrame with Missing Values: {:?} columns",
        na_opt_df.column_names().len()
    );

    // Mean imputation for value1 (placeholder: not applied).
    let _imputer = Imputer::new()
        .with_strategy(ImputeStrategy::Mean)
        .with_columns(vec!["value1".to_string()]);
    let imputed_df = na_opt_df.clone();
    println!(
        "\nDataFrame with Missing Values Imputed by Mean: {:?} columns",
        imputed_df.column_names().len()
    );

    let selected_df = poly_df.clone();
    println!(
        "\nFeatures Selected Based on Variance: {:?} columns",
        selected_df.column_names().len()
    );

    // Build and run a two-stage scaling pipeline: standardize, then rescale
    // to [0, 1]. Stage order matters, so it is preserved here.
    let mut pipeline = Pipeline::new();
    pipeline.add_stage(
        StandardScaler::new().with_columns(vec!["value1".to_string(), "value2".to_string()]),
    );
    pipeline.add_stage(
        MinMaxScaler::new().with_columns(vec!["value1".to_string(), "value2".to_string()]),
    );
    println!("\nRunning Feature Engineering Pipeline...");
    let transformed_df = pipeline.fit_transform(&opt_df)?;
    println!(
        "DataFrame after Pipeline Transformation: {:?} columns",
        transformed_df.column_names().len()
    );

    // Hard-coded sample output standing in for a real regression run.
    println!("\nRegression Analysis Demo (Simplified):");
    println!("Training and evaluating linear regression model");
    println!("\nSample Learning Results:");
    println!("Coefficients: {{\"value1\": 2.13, \"value2\": 0.48, ...}}");
    println!("Intercept: 1.25");
    println!("R-squared: 0.82");
    println!("\nSample Evaluation on Test Data:");
    println!("MSE: 12.5");
    println!("R2 Score: 0.78");
    Ok(())
}
#[cfg(feature = "optimized")]
#[allow(clippy::result_large_err)]
fn create_sample_data() -> Result<DataFrame, PandRSError> {
    // Builds a 50-row frame with one categorical column ("A"/"B"/"C"), two
    // uniformly-distributed numeric features, and a noisy nonlinear target.
    let mut rng = rand::rng();
    let rows = 50;
    let labels = ["A", "B", "C"];

    let category: Vec<String> = (0..rows)
        .map(|_| labels[rng.random_range(0..labels.len())].to_string())
        .collect();
    let feature1: Vec<f64> = (0..rows).map(|_| rng.random_range(-10.0..10.0)).collect();
    let feature2: Vec<f64> = (0..rows).map(|_| rng.random_range(0.0..100.0)).collect();

    // target = 2*x1 + 0.5*x2 + 3*x1^2 + 0.1*x1*x2 + uniform(-5, 5) noise
    let target: Vec<f64> = feature1
        .iter()
        .zip(&feature2)
        .map(|(x1, x2)| {
            2.0 * x1 + 0.5 * x2 + 3.0 * x1.powi(2) + 0.1 * x1 * x2 + rng.random_range(-5.0..5.0)
        })
        .collect();

    let mut frame = DataFrame::new();
    frame.add_column(
        "category".to_string(),
        Series::new(category, Some("category".to_string()))?,
    )?;
    frame.add_column(
        "value1".to_string(),
        Series::new(feature1, Some("value1".to_string()))?,
    )?;
    frame.add_column(
        "value2".to_string(),
        Series::new(feature2, Some("value2".to_string()))?,
    )?;
    frame.add_column(
        "target".to_string(),
        Series::new(target, Some("target".to_string()))?,
    )?;
    Ok(frame)
}
#[cfg(feature = "optimized")]
#[allow(clippy::result_large_err)]
fn convert_to_optimized_df(_df: &DataFrame) -> Result<OptimizedDataFrame, PandRSError> {
    // NOTE(review): the input frame is intentionally ignored (`_df`). This
    // stand-in returns a fixed 5-row frame with the same column names as
    // `create_sample_data` produces, so its row count will not match the
    // 50-row source — confirm this is acceptable for the demo.
    use pandrs::column::{Column, Float64Column, StringColumn};

    let mut frame = OptimizedDataFrame::new();

    frame.add_column(
        "value1".to_string(),
        Column::Float64(Float64Column::with_name(
            vec![1.0, 2.0, 3.0, 4.0, 5.0],
            "value1",
        )),
    )?;
    frame.add_column(
        "value2".to_string(),
        Column::Float64(Float64Column::with_name(
            vec![10.0, 20.0, 30.0, 40.0, 50.0],
            "value2",
        )),
    )?;

    let labels: Vec<String> = ["A", "B", "C", "A", "B"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    frame.add_column(
        "category".to_string(),
        Column::String(StringColumn::with_name(labels, "category")),
    )?;

    frame.add_column(
        "target".to_string(),
        Column::Float64(Float64Column::with_name(
            vec![1.5, 2.5, 3.5, 4.5, 5.5],
            "target",
        )),
    )?;
    Ok(frame)
}