use crate::core::error::{Error, Result};
use crate::dataframe::DataFrame;
use crate::ml::feature_engineering::{AutoFeatureEngineer, FeatureSelectionMethod, ScalingMethod};
use crate::ml::model_selection::{
CrossValidationStrategy, GridSearchCV, ParameterDistribution, RandomizedSearchCV, Scorer,
};
use crate::ml::models::{train_test_split, ModelMetrics};
use crate::ml::sklearn_compat::{Pipeline, PipelineStep, SklearnPredictor};
use crate::utils::rand_compat::{thread_rng, GenRangeCompat};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
/// The kind of supervised learning problem an AutoML run optimizes for.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TaskType {
    /// Continuous-valued target.
    Regression,
    /// Two-class classification target.
    BinaryClassification,
    /// Classification with more than two classes.
    MultiClassification,
    /// Time-ordered target (scored with negative MSE in `fit`).
    TimeSeries,
    /// Detect the concrete task type from the target column at fit time.
    Auto,
}
/// Configuration for an AutoML run.
#[derive(Debug, Clone)]
pub struct AutoMLConfig {
    /// Problem type; `TaskType::Auto` triggers detection from the target.
    pub task_type: TaskType,
    /// Overall time budget in seconds, if any.
    pub time_limit: Option<f64>,
    /// Maximum number of models to train, if any.
    pub max_models: Option<usize>,
    /// Cross-validation scheme used during model search.
    pub cv_strategy: CrossValidationStrategy,
    /// Scoring metric for ranking models.
    /// NOTE(review): `fit` currently derives the scorer from the task type
    /// and never reads this field — confirm whether that is intended.
    pub scoring: Scorer,
    /// Run automated feature engineering before the model search.
    pub feature_engineering: bool,
    /// Run automated feature selection.
    pub feature_selection: bool,
    /// Allow ensemble model families in the search.
    pub ensemble_methods: bool,
    /// Seed for reproducible splits and search, if any.
    pub random_state: Option<u64>,
    /// Verbosity level (0 = silent; larger values print more progress).
    pub verbose: usize,
    /// Prefer simpler, more interpretable models when true.
    pub optimize_for_interpretability: bool,
    /// Memory budget (presumably gigabytes — confirm; default is 8.0).
    pub memory_limit: Option<f64>,
    /// When set, only these model names are tried.
    pub model_whitelist: Option<Vec<String>>,
    /// When set, these model names are skipped.
    pub model_blacklist: Option<Vec<String>>,
}
impl Default for AutoMLConfig {
fn default() -> Self {
Self {
task_type: TaskType::Auto,
time_limit: Some(3600.0), max_models: Some(50),
cv_strategy: CrossValidationStrategy::KFold {
n_splits: 5,
shuffle: true,
random_state: None,
},
scoring: Scorer::R2,
feature_engineering: true,
feature_selection: true,
ensemble_methods: true,
random_state: None,
verbose: 1,
optimize_for_interpretability: false,
memory_limit: Some(8.0),
model_whitelist: None,
model_blacklist: None,
}
}
}
/// Candidate models grouped by family; each entry pairs a model name with
/// its hyperparameter search distributions.
#[derive(Debug, Clone)]
pub struct ModelSearchSpace {
    /// Linear family (e.g. LinearRegression, Ridge, Lasso).
    pub linear_models: Vec<(String, HashMap<String, ParameterDistribution>)>,
    /// Single-decision-tree family.
    pub tree_models: Vec<(String, HashMap<String, ParameterDistribution>)>,
    /// Ensemble family (random forests, gradient boosting).
    pub ensemble_models: Vec<(String, HashMap<String, ParameterDistribution>)>,
    /// Neural-network family (empty in the built-in default spaces).
    pub neural_models: Vec<(String, HashMap<String, ParameterDistribution>)>,
}
impl ModelSearchSpace {
pub fn default_regression() -> Self {
let mut linear_models = Vec::new();
let mut tree_models = Vec::new();
let mut ensemble_models = Vec::new();
let neural_models = Vec::new();
let mut linear_regression_params = HashMap::new();
linear_regression_params.insert(
"fit_intercept".to_string(),
ParameterDistribution::Choice(vec!["true".to_string(), "false".to_string()]),
);
linear_regression_params.insert(
"normalize".to_string(),
ParameterDistribution::Choice(vec!["true".to_string(), "false".to_string()]),
);
linear_models.push(("LinearRegression".to_string(), linear_regression_params));
let mut ridge_params = HashMap::new();
ridge_params.insert(
"alpha".to_string(),
ParameterDistribution::LogUniform {
low: 1e-4,
high: 1e2,
},
);
ridge_params.insert(
"fit_intercept".to_string(),
ParameterDistribution::Choice(vec!["true".to_string(), "false".to_string()]),
);
linear_models.push(("Ridge".to_string(), ridge_params));
let mut lasso_params = HashMap::new();
lasso_params.insert(
"alpha".to_string(),
ParameterDistribution::LogUniform {
low: 1e-4,
high: 1e2,
},
);
lasso_params.insert(
"fit_intercept".to_string(),
ParameterDistribution::Choice(vec!["true".to_string(), "false".to_string()]),
);
linear_models.push(("Lasso".to_string(), lasso_params));
let mut decision_tree_params = HashMap::new();
decision_tree_params.insert(
"max_depth".to_string(),
ParameterDistribution::Choice(vec![
"3".to_string(),
"5".to_string(),
"10".to_string(),
"None".to_string(),
]),
);
decision_tree_params.insert(
"min_samples_split".to_string(),
ParameterDistribution::UniformInt { low: 2, high: 20 },
);
decision_tree_params.insert(
"min_samples_leaf".to_string(),
ParameterDistribution::UniformInt { low: 1, high: 10 },
);
tree_models.push(("DecisionTree".to_string(), decision_tree_params));
let mut random_forest_params = HashMap::new();
random_forest_params.insert(
"n_estimators".to_string(),
ParameterDistribution::Choice(vec![
"50".to_string(),
"100".to_string(),
"200".to_string(),
]),
);
random_forest_params.insert(
"max_depth".to_string(),
ParameterDistribution::Choice(vec![
"5".to_string(),
"10".to_string(),
"20".to_string(),
"None".to_string(),
]),
);
random_forest_params.insert(
"min_samples_split".to_string(),
ParameterDistribution::UniformInt { low: 2, high: 20 },
);
ensemble_models.push(("RandomForest".to_string(), random_forest_params));
let mut gradient_boosting_params = HashMap::new();
gradient_boosting_params.insert(
"n_estimators".to_string(),
ParameterDistribution::Choice(vec![
"50".to_string(),
"100".to_string(),
"200".to_string(),
]),
);
gradient_boosting_params.insert(
"learning_rate".to_string(),
ParameterDistribution::LogUniform {
low: 0.01,
high: 0.3,
},
);
gradient_boosting_params.insert(
"max_depth".to_string(),
ParameterDistribution::UniformInt { low: 3, high: 10 },
);
ensemble_models.push(("GradientBoosting".to_string(), gradient_boosting_params));
Self {
linear_models,
tree_models,
ensemble_models,
neural_models,
}
}
pub fn default_classification() -> Self {
let mut linear_models = Vec::new();
let mut tree_models = Vec::new();
let mut ensemble_models = Vec::new();
let neural_models = Vec::new();
let mut logistic_regression_params = HashMap::new();
logistic_regression_params.insert(
"C".to_string(),
ParameterDistribution::LogUniform {
low: 1e-4,
high: 1e2,
},
);
logistic_regression_params.insert(
"fit_intercept".to_string(),
ParameterDistribution::Choice(vec!["true".to_string(), "false".to_string()]),
);
linear_models.push(("LogisticRegression".to_string(), logistic_regression_params));
let mut decision_tree_params = HashMap::new();
decision_tree_params.insert(
"max_depth".to_string(),
ParameterDistribution::Choice(vec![
"3".to_string(),
"5".to_string(),
"10".to_string(),
"None".to_string(),
]),
);
decision_tree_params.insert(
"min_samples_split".to_string(),
ParameterDistribution::UniformInt { low: 2, high: 20 },
);
tree_models.push(("DecisionTreeClassifier".to_string(), decision_tree_params));
let mut random_forest_params = HashMap::new();
random_forest_params.insert(
"n_estimators".to_string(),
ParameterDistribution::Choice(vec![
"50".to_string(),
"100".to_string(),
"200".to_string(),
]),
);
random_forest_params.insert(
"max_depth".to_string(),
ParameterDistribution::Choice(vec![
"5".to_string(),
"10".to_string(),
"None".to_string(),
]),
);
ensemble_models.push(("RandomForestClassifier".to_string(), random_forest_params));
Self {
linear_models,
tree_models,
ensemble_models,
neural_models,
}
}
}
/// Outcome of a completed AutoML run.
#[derive(Debug, Clone)]
pub struct AutoMLResult {
    /// Name of the best-performing model.
    pub best_pipeline: String,
    /// Cross-validation score of the best model.
    pub best_score: f64,
    /// Hyperparameters of the best model (stringified values).
    pub best_params: HashMap<String, String>,
    /// Every successfully trained model, sorted best-first.
    pub leaderboard: Vec<ModelResult>,
    /// Feature importances of the best model, when available.
    pub feature_importances: Option<HashMap<String, f64>>,
    /// Total wall-clock optimization time in seconds.
    pub training_time: f64,
    /// CV scores (currently a single element: the best score).
    pub cv_results: Vec<f64>,
    /// Score on the held-out split, when computed.
    pub holdout_score: Option<f64>,
}
/// Search result for a single candidate model.
#[derive(Debug, Clone)]
pub struct ModelResult {
    /// Model family name (e.g. "RandomForest").
    pub model_name: String,
    /// Best cross-validation score found for this model.
    pub cv_score: f64,
    /// Std-dev of CV scores (always 0.0 today; `fit_single_model` does not
    /// receive per-fold scores).
    pub cv_std: f64,
    /// Wall-clock time spent searching this model, in seconds.
    pub training_time: f64,
    /// Best hyperparameters found (stringified values).
    pub parameters: HashMap<String, String>,
    /// Feature importances, when the model exposes them.
    pub feature_importance: Option<HashMap<String, f64>>,
    /// Heuristic complexity score (higher means more complex).
    pub complexity_score: f64,
}
/// Automated machine learning driver: task detection, feature engineering,
/// model search, and reporting.
#[derive(Debug)]
pub struct AutoML {
    /// Run configuration.
    pub config: AutoMLConfig,
    /// Candidate models and their hyperparameter distributions.
    pub search_space: ModelSearchSpace,
    // Fitted feature-engineering pipeline; set by `fit` when enabled.
    feature_engineer: Option<AutoFeatureEngineer>,
    // Populated by `fit`; `None` until then.
    results: Option<AutoMLResult>,
}
impl AutoML {
pub fn new() -> Self {
let config = AutoMLConfig::default();
let search_space = match config.task_type {
TaskType::Regression => ModelSearchSpace::default_regression(),
TaskType::BinaryClassification | TaskType::MultiClassification => {
ModelSearchSpace::default_classification()
}
_ => ModelSearchSpace::default_regression(),
};
Self {
config,
search_space,
feature_engineer: None,
results: None,
}
}
/// Create an AutoML instance from an explicit configuration, choosing a
/// default search space that matches the configured task type.
pub fn with_config(config: AutoMLConfig) -> Self {
    // Classification tasks get the classification space; everything else
    // (regression, time series, auto) starts from the regression space.
    let search_space = if matches!(
        config.task_type,
        TaskType::BinaryClassification | TaskType::MultiClassification
    ) {
        ModelSearchSpace::default_classification()
    } else {
        ModelSearchSpace::default_regression()
    };
    Self {
        config,
        search_space,
        feature_engineer: None,
        results: None,
    }
}
pub fn with_search_space(mut self, search_space: ModelSearchSpace) -> Self {
self.search_space = search_space;
self
}
/// Infer the task type from the first column of the target frame.
///
/// Heuristic: if every value is integral and there are between 1 and 20
/// distinct values, the target is treated as classification (binary for
/// exactly 2 distinct values); otherwise as regression.
///
/// # Errors
/// Returns `InvalidValue` when `y` has no columns; propagates column
/// access / conversion errors.
pub fn detect_task_type(&self, y: &DataFrame) -> Result<TaskType> {
    let target_col_name = y
        .column_names()
        .into_iter()
        .next()
        .ok_or_else(|| Error::InvalidValue("No target column found".into()))?;
    let target_col = y.get_column::<f64>(&target_col_name)?;
    let values = target_col.as_f64()?;
    // Single pass (the original iterated twice): count integral entries
    // and collect their distinct integer values simultaneously.
    let mut unique_values = std::collections::HashSet::new();
    let mut integer_count = 0usize;
    for &v in values.iter() {
        if v.fract() == 0.0 {
            integer_count += 1;
            unique_values.insert(v as i64);
        }
    }
    if integer_count == values.len() && !unique_values.is_empty() && unique_values.len() <= 20 {
        if unique_values.len() == 2 {
            Ok(TaskType::BinaryClassification)
        } else {
            Ok(TaskType::MultiClassification)
        }
    } else {
        Ok(TaskType::Regression)
    }
}
/// Run the full AutoML loop: optional task-type detection, train/holdout
/// split, optional feature engineering, model search, and holdout scoring.
/// The outcome is stored internally; read it with `get_results` or
/// `generate_report`.
///
/// # Errors
/// Propagates errors from task detection, splitting, feature engineering,
/// and model search; returns `InvalidOperation` when no model trains
/// successfully.
pub fn fit(&mut self, x: &DataFrame, y: &DataFrame) -> Result<()> {
    let start_time = Instant::now();
    if self.config.verbose > 0 {
        println!("🚀 Starting AutoML optimization...");
        println!(
            "Dataset shape: {} rows × {} features",
            x.nrows(),
            x.column_names().len()
        );
    }
    // Resolve `Auto` into a concrete task type by inspecting the target.
    let task_type = if matches!(self.config.task_type, TaskType::Auto) {
        let detected = self.detect_task_type(y)?;
        if self.config.verbose > 0 {
            println!("Auto-detected task type: {:?}", detected);
        }
        detected
    } else {
        self.config.task_type.clone()
    };
    // NOTE(review): the scorer is derived from the task type here and
    // `self.config.scoring` is ignored — confirm that is intended.
    let scoring = match task_type {
        TaskType::Regression => Scorer::R2,
        TaskType::BinaryClassification | TaskType::MultiClassification => Scorer::Accuracy,
        TaskType::TimeSeries => Scorer::NegMeanSquaredError,
        TaskType::Auto => Scorer::R2,
    };
    // 80/20 split; see `create_train_holdout_split` for x/y row alignment.
    let (train_x, holdout_x, train_y, holdout_y) = self.create_train_holdout_split(x, y)?;
    let mut processed_x = train_x.clone();
    if self.config.feature_engineering {
        if self.config.verbose > 0 {
            println!("🔧 Performing automated feature engineering...");
        }
        // Degree-2 polynomials, up to 5 interactions, k-best selection
        // (F-test for regression, chi² otherwise), then standard scaling.
        // NOTE(review): `max_models` is reused here as the feature-count
        // cap for k-best selection — possibly unintended; confirm.
        let mut feature_engineer = AutoFeatureEngineer::new()
            .with_polynomial(2)
            .with_interactions(5)
            .with_selection(
                FeatureSelectionMethod::KBest(match task_type {
                    TaskType::Regression => {
                        crate::ml::model_selection::ScoreFunction::FRegression
                    }
                    _ => crate::ml::model_selection::ScoreFunction::Chi2,
                }),
                self.config.max_models.map(|m| m.min(50)),
            )
            .with_scaling(ScalingMethod::StandardScaler);
        feature_engineer.fit(&processed_x, Some(&train_y))?;
        processed_x = feature_engineer.transform(&processed_x)?;
        // Keep the fitted transformer so `predict` can reuse it.
        self.feature_engineer = Some(feature_engineer);
        if self.config.verbose > 0 {
            println!("Generated {} features", processed_x.column_names().len());
        }
    }
    if self.config.verbose > 0 {
        println!("🎯 Starting model search and hyperparameter optimization...");
    }
    let model_results = self.search_models(&processed_x, &train_y, &scoring)?;
    // Best model = highest CV score (NaN compares as equal and is ignored).
    let best_result = model_results
        .iter()
        .max_by(|a, b| {
            a.cv_score
                .partial_cmp(&b.cv_score)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .ok_or_else(|| Error::InvalidOperation("No models were successfully trained".into()))?;
    let holdout_score = self.evaluate_on_holdout(&holdout_x, &holdout_y, best_result)?;
    let training_time = start_time.elapsed().as_secs_f64();
    let results = AutoMLResult {
        best_pipeline: best_result.model_name.clone(),
        best_score: best_result.cv_score,
        best_params: best_result.parameters.clone(),
        leaderboard: model_results.clone(),
        feature_importances: best_result.feature_importance.clone(),
        training_time,
        cv_results: vec![best_result.cv_score],
        holdout_score: Some(holdout_score),
    };
    self.results = Some(results);
    if self.config.verbose > 0 {
        println!("✅ AutoML optimization completed in {:.2}s", training_time);
        println!(
            "Best model: {} (CV score: {:.4})",
            best_result.model_name, best_result.cv_score
        );
        println!("Holdout score: {:.4}", holdout_score);
    }
    Ok(())
}
/// Split `x` and `y` into 80% train / 20% holdout.
///
/// Bug fix: the previous version passed `self.config.random_state`
/// (possibly `None`) to two independent shuffled `train_test_split` calls;
/// with no seed the two shuffles need not agree, silently misaligning
/// features and labels. Both calls now share one concrete seed so the
/// row orders match (falls back to 42 when no seed is configured).
fn create_train_holdout_split(
    &self,
    x: &DataFrame,
    y: &DataFrame,
) -> Result<(DataFrame, DataFrame, DataFrame, DataFrame)> {
    let seed = Some(self.config.random_state.unwrap_or(42));
    let (train_x, holdout_x) = train_test_split(x, 0.2, true, seed)?;
    let (train_y, holdout_y) = train_test_split(y, 0.2, true, seed)?;
    Ok((train_x, holdout_x, train_y, holdout_y))
}
/// Train candidate models (subject to allow/deny lists and the model
/// budget) and return their results sorted best-first by CV score.
fn search_models(
    &self,
    x: &DataFrame,
    y: &DataFrame,
    scoring: &Scorer,
) -> Result<Vec<ModelResult>> {
    let budget = self.config.max_models.unwrap_or(50);
    let mut ranked: Vec<ModelResult> = Vec::new();
    for (model_name, param_space) in self.get_all_models() {
        // Stop once `budget` models have been trained successfully.
        if ranked.len() >= budget {
            break;
        }
        // Honor explicit allow/deny lists from the configuration.
        let allowed = self
            .config
            .model_whitelist
            .as_ref()
            .map_or(true, |w| w.contains(&model_name))
            && self
                .config
                .model_blacklist
                .as_ref()
                .map_or(true, |b| !b.contains(&model_name));
        if !allowed {
            continue;
        }
        if self.config.verbose > 1 {
            println!("Trying model: {}", model_name);
        }
        // A failing model is logged (at high verbosity) and skipped rather
        // than aborting the whole search.
        match self.fit_single_model(&model_name, &param_space, x, y, scoring) {
            Ok(result) => ranked.push(result),
            Err(e) => {
                if self.config.verbose > 1 {
                    println!("Model {} failed: {}", model_name, e);
                }
            }
        }
    }
    // Best-first ordering; NaN scores compare as equal.
    ranked.sort_by(|a, b| {
        b.cv_score
            .partial_cmp(&a.cv_score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    Ok(ranked)
}
/// Flatten every model family in the search space into one candidate list
/// (linear, tree, ensemble, then neural — order determines search order).
fn get_all_models(&self) -> Vec<(String, HashMap<String, ParameterDistribution>)> {
    let space = &self.search_space;
    space
        .linear_models
        .iter()
        .chain(space.tree_models.iter())
        .chain(space.ensemble_models.iter())
        .chain(space.neural_models.iter())
        .cloned()
        .collect()
}
/// Run a 20-iteration randomized hyperparameter search for one model
/// under the configured CV strategy and report the best candidate.
///
/// # Errors
/// Propagates estimator-creation and search errors; returns
/// `InvalidOperation` when the search produced no results.
fn fit_single_model(
    &self,
    model_name: &str,
    param_space: &HashMap<String, ParameterDistribution>,
    x: &DataFrame,
    y: &DataFrame,
    scoring: &Scorer,
) -> Result<ModelResult> {
    let timer = Instant::now();
    let estimator = self.create_estimator(model_name)?;
    let seed = self.config.random_state.unwrap_or(42);
    // 20 random parameter samples per model.
    let mut search = RandomizedSearchCV::new(estimator, param_space.clone(), 20)
        .with_cv(self.config.cv_strategy.clone())
        .with_scoring(scoring.clone())
        .with_random_state(seed);
    search.fit(x, y)?;
    let elapsed = timer.elapsed().as_secs_f64();
    let results = search
        .get_results()
        .ok_or_else(|| Error::InvalidOperation("No search results available".into()))?;
    Ok(ModelResult {
        model_name: model_name.to_string(),
        cv_score: results.best_score_,
        // Per-fold scores are not exposed by the search results.
        cv_std: 0.0,
        training_time: elapsed,
        parameters: results.best_params_.clone(),
        feature_importance: None,
        complexity_score: self.calculate_complexity_score(model_name, &results.best_params_),
    })
}
/// Instantiate a concrete estimator for `model_name`.
///
/// Currently a stub: model construction is not wired up, so this always
/// returns `Error::NotImplemented`. `search_models` treats that as a
/// per-model failure and skips the model, which is why `fit` ends with
/// "No models were successfully trained" today.
fn create_estimator(
    &self,
    model_name: &str,
) -> Result<Box<dyn SklearnPredictor + Send + Sync>> {
    Err(Error::NotImplemented(format!(
        "Model creation for {} not implemented",
        model_name
    )))
}
/// Heuristic model-complexity score: a base value per model family, scaled
/// up by ensemble size and tree depth. Higher means more complex.
fn calculate_complexity_score(
    &self,
    model_name: &str,
    params: &HashMap<String, String>,
) -> f64 {
    // Base complexity by model family; unknown families count as complex.
    let mut score = match model_name {
        "LinearRegression" | "LogisticRegression" => 1.0,
        "Ridge" | "Lasso" => 1.2,
        "DecisionTree" | "DecisionTreeClassifier" => 2.0,
        "RandomForest" | "RandomForestClassifier" => 3.0,
        "GradientBoosting" => 3.5,
        _ => 4.0,
    };
    // More estimators => more complex (log-scaled relative to 50).
    if let Some(n) = params
        .get("n_estimators")
        .and_then(|v| v.parse::<f64>().ok())
    {
        score *= (n / 50.0).ln().max(1.0) + 1.0;
    }
    // Deeper trees => more complex; unbounded depth doubles the score.
    match params.get("max_depth") {
        Some(depth) if depth == "None" => score *= 2.0,
        Some(depth) => {
            if let Ok(d) = depth.parse::<f64>() {
                score *= (d / 10.0).max(1.0);
            }
        }
        None => {}
    }
    score
}
/// Score the best model on the holdout split.
///
/// Placeholder implementation: the model is not actually applied to the
/// holdout data yet — the returned value is simply the CV score
/// discounted by 5%, and `holdout_x`/`holdout_y` are unused.
fn evaluate_on_holdout(
    &self,
    holdout_x: &DataFrame,
    holdout_y: &DataFrame,
    best_result: &ModelResult,
) -> Result<f64> {
    Ok(best_result.cv_score * 0.95)
}
/// Predict targets for `x` with the fitted pipeline.
///
/// Applies the fitted feature-engineering transform when present.
/// Placeholder: the best model is not invoked yet, so this returns a
/// zero vector with one entry per (transformed) input row.
///
/// # Errors
/// Returns `InvalidOperation` if called before `fit`.
pub fn predict(&self, x: &DataFrame) -> Result<Vec<f64>> {
    let _results = self.results.as_ref().ok_or_else(|| {
        Error::InvalidOperation("AutoML must be fitted before predict".into())
    })?;
    let mut processed_x = x.clone();
    if let Some(feature_engineer) = &self.feature_engineer {
        processed_x = feature_engineer.transform(&processed_x)?;
    }
    Ok(vec![0.0; processed_x.nrows()])
}
/// The results of the last `fit`, or `None` if not fitted yet.
pub fn get_results(&self) -> Option<&AutoMLResult> {
    self.results.as_ref()
}
/// Render a markdown report of the last run: summary, leaderboard (top
/// 10), best-model details, and feature-engineering statistics.
///
/// # Errors
/// Returns `InvalidOperation` if called before `fit`.
pub fn generate_report(&self) -> Result<String> {
    let results = self.results.as_ref().ok_or_else(|| {
        Error::InvalidOperation("AutoML must be fitted before generating report".into())
    })?;
    let mut report = String::new();
    report.push_str("# AutoML Report\n\n");
    // --- Summary ---
    report.push_str("## Summary\n");
    report.push_str(&format!("- **Best Model**: {}\n", results.best_pipeline));
    report.push_str(&format!("- **Best Score**: {:.4}\n", results.best_score));
    if let Some(holdout_score) = results.holdout_score {
        report.push_str(&format!("- **Holdout Score**: {:.4}\n", holdout_score));
    }
    report.push_str(&format!(
        "- **Training Time**: {:.2}s\n",
        results.training_time
    ));
    report.push_str(&format!(
        "- **Models Tried**: {}\n",
        results.leaderboard.len()
    ));
    // --- Leaderboard: top 10 models as a markdown table ---
    report.push_str("\n## Model Leaderboard\n\n");
    report.push_str("| Rank | Model | CV Score | Std | Time (s) | Complexity |\n");
    report.push_str("|------|-------|----------|-----|----------|------------|\n");
    for (i, model) in results.leaderboard.iter().take(10).enumerate() {
        report.push_str(&format!(
            "| {} | {} | {:.4} | {:.4} | {:.2} | {:.2} |\n",
            i + 1,
            model.model_name,
            model.cv_score,
            model.cv_std,
            model.training_time,
            model.complexity_score
        ));
    }
    // --- Best-model details (leaderboard is sorted best-first) ---
    if let Some(best_model) = results.leaderboard.first() {
        report.push_str("\n## Best Model Details\n\n");
        report.push_str(&format!("**Model**: {}\n", best_model.model_name));
        report.push_str(&format!(
            "**CV Score**: {:.4} ± {:.4}\n",
            best_model.cv_score, best_model.cv_std
        ));
        report.push_str("\n**Parameters**:\n");
        for (param, value) in &best_model.parameters {
            report.push_str(&format!("- {}: {}\n", param, value));
        }
        // Top 10 importances, largest first (NaN compares as equal).
        if let Some(importances) = &best_model.feature_importance {
            report.push_str("\n**Top 10 Feature Importances**:\n");
            let mut importance_vec: Vec<_> = importances.iter().collect();
            importance_vec
                .sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
            for (feature, importance) in importance_vec.iter().take(10) {
                report.push_str(&format!("- {}: {:.4}\n", feature, importance));
            }
        }
    }
    // --- Feature-engineering section, only when a transformer was fit ---
    if let Some(feature_engineer) = &self.feature_engineer {
        report.push_str("\n## Feature Engineering\n\n");
        if let Some(feature_names) = feature_engineer.get_feature_names() {
            report.push_str(&format!(
                "- **Total Features Generated**: {}\n",
                feature_names.len()
            ));
        }
        if let Some(selected_features) = feature_engineer.get_selected_features() {
            report.push_str(&format!(
                "- **Features Selected**: {}\n",
                selected_features.len()
            ));
        }
    }
    report.push_str("\n---\n");
    report.push_str("*Report generated by PandRS AutoML*\n");
    Ok(report)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::series::Series;

    /// Task-type detection: float targets => regression; small integer
    /// value sets => binary/multi classification.
    #[test]
    fn test_task_type_detection() {
        let automl = AutoML::new();
        // Non-integral targets should be detected as regression.
        let mut y_reg = DataFrame::new();
        y_reg
            .add_column(
                "target".to_string(),
                Series::new(vec![1.5, 2.3, 3.7, 4.1, 5.9], Some("target".to_string()))
                    .expect("operation should succeed"),
            )
            .expect("operation should succeed");
        let task_type = automl
            .detect_task_type(&y_reg)
            .expect("operation should succeed");
        assert!(matches!(task_type, TaskType::Regression));
        // Exactly two distinct integer values => binary classification.
        let mut y_binary = DataFrame::new();
        y_binary
            .add_column(
                "target".to_string(),
                Series::new(vec![0.0, 1.0, 1.0, 0.0, 1.0], Some("target".to_string()))
                    .expect("operation should succeed"),
            )
            .expect("operation should succeed");
        let task_type = automl
            .detect_task_type(&y_binary)
            .expect("operation should succeed");
        assert!(matches!(task_type, TaskType::BinaryClassification));
        // Three distinct integer values => multi-class classification.
        let mut y_multi = DataFrame::new();
        y_multi
            .add_column(
                "target".to_string(),
                Series::new(vec![0.0, 1.0, 2.0, 1.0, 2.0], Some("target".to_string()))
                    .expect("operation should succeed"),
            )
            .expect("operation should succeed");
        let task_type = automl
            .detect_task_type(&y_multi)
            .expect("operation should succeed");
        assert!(matches!(task_type, TaskType::MultiClassification));
    }

    /// The default regression space is non-empty and includes
    /// LinearRegression among the linear candidates.
    #[test]
    fn test_model_search_space() {
        let search_space = ModelSearchSpace::default_regression();
        assert!(!search_space.linear_models.is_empty());
        assert!(!search_space.tree_models.is_empty());
        assert!(!search_space.ensemble_models.is_empty());
        let has_linear_regression = search_space
            .linear_models
            .iter()
            .any(|(name, _)| name == "LinearRegression");
        assert!(has_linear_regression);
    }

    /// Defaults: auto task type, one-hour budget, 50-model cap, and
    /// automation flags enabled.
    #[test]
    fn test_automl_config() {
        let config = AutoMLConfig::default();
        assert!(matches!(config.task_type, TaskType::Auto));
        assert_eq!(config.time_limit, Some(3600.0));
        assert_eq!(config.max_models, Some(50));
        assert!(config.feature_engineering);
        assert!(config.feature_selection);
    }

    /// Complexity scores order linear < forest, and more estimators
    /// increase a forest's score.
    #[test]
    fn test_complexity_scoring() {
        let automl = AutoML::new();
        let linear_complexity =
            automl.calculate_complexity_score("LinearRegression", &HashMap::new());
        let rf_complexity = automl.calculate_complexity_score("RandomForest", &HashMap::new());
        assert!(linear_complexity < rf_complexity);
        let mut params = HashMap::new();
        params.insert("n_estimators".to_string(), "200".to_string());
        let rf_complex_complexity = automl.calculate_complexity_score("RandomForest", &params);
        assert!(rf_complex_complexity > rf_complexity);
    }
}