use anyhow::{anyhow, Result};
use scirs2_core::ndarray_ext::{Array1, Array2};
use scirs2_core::random::{Random, RngExt};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::{Mutex, RwLock};
use tracing::info;
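/// The learning task the AutoML search is configured for; it determines
/// which candidate algorithms are tried and which metrics are computed.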
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskType {
Classification,
Regression,
TimeSeries,
AnomalyDetection,
Clustering,
}
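/// Candidate learning algorithms the search can draw from. Which subset is
/// eligible for a given task is decided by [`Algorithm::for_task`].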
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Algorithm {
LinearRegression,
LogisticRegression,
DecisionTree,
RandomForest,
GradientBoosting,
NeuralNetwork,
KNN,
SVM,
NaiveBayes,
OnlineSGD,
ARIMA,
IsolationForest,
KMeans,
}
impl Algorithm {
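/// Returns the candidate algorithms considered for `task`. A minimal usage
/// sketch (marked `ignore` since the crate path is not assumed here):
///
/// ```ignore
/// let candidates = Algorithm::for_task(TaskType::Regression);
/// assert!(candidates.contains(&Algorithm::LinearRegression));
/// ```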
pub fn for_task(task: TaskType) -> Vec<Algorithm> {
match task {
TaskType::Classification => vec![
Algorithm::LogisticRegression,
Algorithm::DecisionTree,
Algorithm::RandomForest,
Algorithm::GradientBoosting,
Algorithm::NeuralNetwork,
Algorithm::KNN,
Algorithm::NaiveBayes,
],
TaskType::Regression => vec![
Algorithm::LinearRegression,
Algorithm::DecisionTree,
Algorithm::RandomForest,
Algorithm::GradientBoosting,
Algorithm::NeuralNetwork,
Algorithm::KNN,
Algorithm::SVM,
],
TaskType::TimeSeries => vec![
Algorithm::ARIMA,
Algorithm::LinearRegression,
Algorithm::NeuralNetwork,
Algorithm::GradientBoosting,
],
TaskType::AnomalyDetection => vec![
Algorithm::IsolationForest,
Algorithm::OnlineSGD,
Algorithm::NeuralNetwork,
],
TaskType::Clustering => vec![Algorithm::KMeans],
}
}
}
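/// A single flat hyperparameter bundle shared by all algorithms. Fields that
/// a given algorithm has no use for (e.g. `n_neighbors` for linear models)
/// are sampled anyway and carried along unused.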
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HyperParameters {
pub learning_rate: f64,
pub n_estimators: usize,
pub max_depth: Option<usize>,
pub regularization: f64,
pub n_neighbors: usize,
pub batch_size: usize,
pub random_seed: u64,
}
impl Default for HyperParameters {
fn default() -> Self {
Self {
learning_rate: 0.01,
n_estimators: 100,
max_depth: Some(5),
regularization: 0.1,
n_neighbors: 5,
batch_size: 32,
random_seed: 42,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelPerformance {
pub algorithm: Algorithm,
pub hyperparameters: HyperParameters,
pub accuracy: Option<f64>,
pub precision: Option<f64>,
pub recall: Option<f64>,
pub f1_score: Option<f64>,
pub mse: Option<f64>,
pub r_squared: Option<f64>,
pub training_time_secs: f64,
pub inference_time_ms: f64,
pub complexity_score: f64,
pub cv_score: f64,
}
impl ModelPerformance {
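/// Combines cross-validation quality with cost penalties:
/// `score = cv_score * exp(-training_time_secs / 60) * exp(-complexity_score / 100)`.
/// For example, `cv_score = 0.85` with 10 s of training and complexity 20
/// yields roughly `0.85 * 0.846 * 0.819 ≈ 0.59`.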
pub fn overall_score(&self) -> f64 {
let perf_score = self.cv_score;
let time_penalty = (-self.training_time_secs / 60.0).exp();
let complexity_penalty = (-self.complexity_score / 100.0).exp();
perf_score * time_penalty * complexity_penalty
}
}
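/// Search-budget and strategy knobs for a run of [`AutoML::fit`]. The
/// defaults allow 50 trials within a 600 s budget, 5-fold cross-validation,
/// and an ensemble of up to 5 models.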
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoMLConfig {
pub task_type: TaskType,
pub max_training_time_secs: u64,
pub n_trials: usize,
pub cv_folds: usize,
pub enable_ensemble: bool,
pub enable_meta_learning: bool,
pub early_stopping_patience: usize,
pub optimization_metric: String,
pub auto_feature_engineering: bool,
pub max_ensemble_size: usize,
}
impl Default for AutoMLConfig {
fn default() -> Self {
Self {
task_type: TaskType::Classification,
max_training_time_secs: 600,
n_trials: 50,
cv_folds: 5,
enable_ensemble: true,
enable_meta_learning: false,
early_stopping_patience: 10,
optimization_metric: "cv_score".to_string(),
auto_feature_engineering: true,
max_ensemble_size: 5,
}
}
}
#[derive(Debug, Clone)]
pub struct TrainedModel {
pub algorithm: Algorithm,
pub hyperparameters: HyperParameters,
pub parameters: ModelParameters,
pub performance: ModelPerformance,
}
#[derive(Debug, Clone)]
pub struct ModelParameters {
pub weights: Vec<f64>,
pub bias: f64,
pub extra: HashMap<String, Vec<f64>>,
}
impl Default for ModelParameters {
fn default() -> Self {
Self {
weights: Vec::new(),
bias: 0.0,
extra: HashMap::new(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoMLStats {
pub total_trials: u64,
pub best_score: f64,
pub total_training_time_secs: f64,
pub ensemble_size: usize,
pub best_algorithm: Option<Algorithm>,
pub predictions_count: u64,
pub avg_prediction_time_ms: f64,
}
impl Default for AutoMLStats {
fn default() -> Self {
Self {
total_trials: 0,
best_score: 0.0,
total_training_time_secs: 0.0,
ensemble_size: 0,
best_algorithm: None,
predictions_count: 0,
avg_prediction_time_ms: 0.0,
}
}
}
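/// Random-search AutoML driver: samples (algorithm, hyperparameter) pairs,
/// scores them by cross-validation, and keeps the best model plus an optional
/// score-weighted ensemble. A minimal usage sketch, assuming a Tokio runtime
/// and pre-built `features`/`labels` arrays (marked `ignore`; not verified
/// against this crate's public path):
///
/// ```ignore
/// let config = AutoMLConfig { task_type: TaskType::Regression, ..Default::default() };
/// let mut automl = AutoML::new(config)?;
/// automl.fit(&features, &labels).await?; // features: Array2<f64>, labels: Array1<f64>
/// let y = automl.predict(&row).await?;   // row: Array1<f64>
/// ```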
pub struct AutoML {
config: AutoMLConfig,
best_model: Arc<RwLock<Option<TrainedModel>>>,
ensemble: Arc<RwLock<Vec<TrainedModel>>>,
trial_history: Arc<RwLock<Vec<ModelPerformance>>>,
stats: Arc<RwLock<AutoMLStats>>,
#[allow(clippy::arc_with_non_send_sync)]
rng: Arc<Mutex<Random>>,
}
impl AutoML {
#[allow(clippy::arc_with_non_send_sync)]
pub fn new(config: AutoMLConfig) -> Result<Self> {
Ok(Self {
config,
best_model: Arc::new(RwLock::new(None)),
ensemble: Arc::new(RwLock::new(Vec::new())),
trial_history: Arc::new(RwLock::new(Vec::new())),
stats: Arc::new(RwLock::new(AutoMLStats::default())),
rng: Arc::new(Mutex::new(Random::default())),
})
}
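/// Runs the search loop: each trial picks a random candidate algorithm for
/// the configured task, samples hyperparameters, and cross-validates. The
/// loop stops when `n_trials` is reached, the `max_training_time_secs`
/// budget is exhausted, or `early_stopping_patience` consecutive trials pass
/// without improving the best overall score.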
pub async fn fit(&mut self, features: &Array2<f64>, labels: &Array1<f64>) -> Result<()> {
info!(
"Starting AutoML training with task {:?}, {} samples, {} features",
self.config.task_type,
features.shape()[0],
features.shape()[1]
);
let start_time = std::time::Instant::now();
let candidate_algorithms = Algorithm::for_task(self.config.task_type);
let mut best_overall_score = f64::NEG_INFINITY;
let mut trials_without_improvement = 0;
for trial in 0..self.config.n_trials {
if start_time.elapsed().as_secs() >= self.config.max_training_time_secs {
info!("Time budget exhausted, stopping AutoML");
break;
}
let algorithm = {
let mut rng = self.rng.lock().await;
let idx = rng.random_range(0..candidate_algorithms.len());
candidate_algorithms[idx]
};
let hyperparams = self.generate_hyperparameters(algorithm).await?;
let performance = self
.train_and_evaluate(algorithm, &hyperparams, features, labels)
.await?;
self.trial_history.write().await.push(performance.clone());
let overall_score = performance.overall_score();
info!(
"Trial {}: {:?} - CV score: {:.4}, Overall score: {:.4}",
trial, algorithm, performance.cv_score, overall_score
);
if overall_score > best_overall_score {
best_overall_score = overall_score;
trials_without_improvement = 0;
let model = TrainedModel {
algorithm,
hyperparameters: hyperparams.clone(),
parameters: self
.train_final_model(algorithm, &hyperparams, features, labels)
.await?,
performance: performance.clone(),
};
*self.best_model.write().await = Some(model.clone());
if self.config.enable_ensemble {
self.update_ensemble(model).await?;
}
let mut stats = self.stats.write().await;
stats.best_score = best_overall_score;
stats.best_algorithm = Some(algorithm);
} else {
trials_without_improvement += 1;
}
if trials_without_improvement >= self.config.early_stopping_patience {
info!(
"Early stopping triggered after {} trials without improvement",
trials_without_improvement
);
break;
}
let mut stats = self.stats.write().await;
stats.total_trials = trial as u64 + 1;
}
let mut stats = self.stats.write().await;
stats.total_training_time_secs = start_time.elapsed().as_secs_f64();
stats.ensemble_size = self.ensemble.read().await.len();
info!(
"AutoML training complete: {} trials, best score: {:.4}, algorithm: {:?}",
stats.total_trials, stats.best_score, stats.best_algorithm
);
Ok(())
}
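/// Samples one hyperparameter bundle uniformly from fixed ranges (e.g.
/// learning rate in `0.0001..0.1`, 10..500 estimators). Note that the
/// meta-learning initialization is fetched but not yet folded into the
/// sampled values; `_base` is deliberately unused for now.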
async fn generate_hyperparameters(&self, algorithm: Algorithm) -> Result<HyperParameters> {
let mut rng = self.rng.lock().await;
let _base = if self.config.enable_meta_learning {
self.get_meta_learning_initialization(algorithm).await
} else {
HyperParameters::default()
};
Ok(HyperParameters {
learning_rate: rng.random_range(0.0001..0.1),
n_estimators: rng.random_range(10..500),
max_depth: Some(rng.random_range(3..20)),
regularization: rng.random_range(0.0..1.0),
n_neighbors: rng.random_range(3..20),
batch_size: rng.random_range(16..256),
random_seed: rng.random::<u64>(),
})
}
async fn get_meta_learning_initialization(&self, _algorithm: Algorithm) -> HyperParameters {
HyperParameters::default()
}
async fn train_and_evaluate(
&self,
algorithm: Algorithm,
hyperparams: &HyperParameters,
features: &Array2<f64>,
labels: &Array1<f64>,
) -> Result<ModelPerformance> {
let start_time = std::time::Instant::now();
let cv_scores = self
.cross_validate(algorithm, hyperparams, features, labels)
.await?;
let cv_score = cv_scores.iter().sum::<f64>() / cv_scores.len() as f64;
let (accuracy, precision, recall, f1, mse, r_squared) = self
.compute_metrics(algorithm, hyperparams, features, labels)
.await?;
let training_time = start_time.elapsed().as_secs_f64();
let complexity_score = match algorithm {
Algorithm::LinearRegression | Algorithm::LogisticRegression => 10.0,
Algorithm::DecisionTree => 30.0,
Algorithm::RandomForest | Algorithm::GradientBoosting => 60.0,
Algorithm::NeuralNetwork => 80.0,
_ => 40.0,
};
Ok(ModelPerformance {
algorithm,
hyperparameters: hyperparams.clone(),
accuracy,
precision,
recall,
f1_score: f1,
mse,
r_squared,
training_time_secs: training_time,
inference_time_ms: 1.0, // fixed placeholder; per-prediction latency is not measured here
complexity_score,
cv_score,
})
}
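/// K-fold split over row indices: fold `k` validates on
/// `[k * fold_size, (k + 1) * fold_size)` with
/// `fold_size = n_samples / cv_folds`. Any remainder rows beyond
/// `cv_folds * fold_size` never appear in a validation slice, and the
/// per-fold evaluation below is currently a stub.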
async fn cross_validate(
&self,
algorithm: Algorithm,
hyperparams: &HyperParameters,
features: &Array2<f64>,
labels: &Array1<f64>,
) -> Result<Vec<f64>> {
let n_samples = features.shape()[0];
let fold_size = n_samples / self.config.cv_folds;
let mut scores = Vec::new();
for fold in 0..self.config.cv_folds {
let val_start = fold * fold_size;
let val_end = ((fold + 1) * fold_size).min(n_samples);
let score = self
.evaluate_fold(algorithm, hyperparams, features, labels, val_start, val_end)
.await?;
scores.push(score);
}
Ok(scores)
}
async fn evaluate_fold(
&self,
_algorithm: Algorithm,
_hyperparams: &HyperParameters,
_features: &Array2<f64>,
_labels: &Array1<f64>,
_val_start: usize,
_val_end: usize,
) -> Result<f64> {
let mut rng = self.rng.lock().await;
// Stub: no real fold training happens, so return a simulated score in [0.7, 1.0).
Ok(0.7 + rng.random::<f64>() * 0.3)
}
async fn compute_metrics(
&self,
_algorithm: Algorithm,
_hyperparams: &HyperParameters,
_features: &Array2<f64>,
_labels: &Array1<f64>,
) -> Result<(
Option<f64>,
Option<f64>,
Option<f64>,
Option<f64>,
Option<f64>,
Option<f64>,
)> {
let mut rng = self.rng.lock().await;
match self.config.task_type {
TaskType::Classification => {
let accuracy = Some(0.7 + rng.random::<f64>() * 0.3);
let precision = Some(0.7 + rng.random::<f64>() * 0.3);
let recall = Some(0.7 + rng.random::<f64>() * 0.3);
let f1 = Some(0.7 + rng.random::<f64>() * 0.3);
Ok((accuracy, precision, recall, f1, None, None))
}
TaskType::Regression | TaskType::TimeSeries => {
let mse = Some(0.1 + rng.random::<f64>() * 0.9);
let r_squared = Some(0.5 + rng.random::<f64>() * 0.5);
Ok((None, None, None, None, mse, r_squared))
}
_ => Ok((None, None, None, None, None, None)),
}
}
async fn train_final_model(
&self,
_algorithm: Algorithm,
_hyperparams: &HyperParameters,
features: &Array2<f64>,
_labels: &Array1<f64>,
) -> Result<ModelParameters> {
let n_features = features.shape()[1];
let mut rng = self.rng.lock().await;
let weights: Vec<f64> = (0..n_features).map(|_| rng.random::<f64>() - 0.5).collect();
let bias = rng.random::<f64>() - 0.5;
Ok(ModelParameters {
weights,
bias,
extra: HashMap::new(),
})
}
async fn update_ensemble(&self, model: TrainedModel) -> Result<()> {
let mut ensemble = self.ensemble.write().await;
ensemble.push(model);
if ensemble.len() > self.config.max_ensemble_size {
ensemble.sort_by(|a, b| {
b.performance
.overall_score()
.partial_cmp(&a.performance.overall_score())
.unwrap_or(std::cmp::Ordering::Equal)
});
ensemble.truncate(self.config.max_ensemble_size);
}
Ok(())
}
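/// Predicts with the ensemble when enabled, otherwise with the single best
/// model, and maintains a running mean of prediction latency:
/// `avg' = (avg * (n - 1) + elapsed_ms) / n` after incrementing `n`.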
pub async fn predict(&self, features: &Array1<f64>) -> Result<f64> {
let start_time = std::time::Instant::now();
let prediction = if self.config.enable_ensemble {
self.ensemble_predict(features).await?
} else {
self.single_model_predict(features).await?
};
let mut stats = self.stats.write().await;
stats.predictions_count += 1;
let elapsed_ms = start_time.elapsed().as_secs_f64() * 1000.0;
stats.avg_prediction_time_ms =
(stats.avg_prediction_time_ms * (stats.predictions_count - 1) as f64 + elapsed_ms)
/ stats.predictions_count as f64;
Ok(prediction)
}
async fn single_model_predict(&self, features: &Array1<f64>) -> Result<f64> {
let model = self.best_model.read().await;
match &*model {
Some(m) => {
let mut pred = m.parameters.bias;
for (i, &weight) in m.parameters.weights.iter().enumerate() {
if i < features.len() {
pred += weight * features[i];
}
}
if matches!(self.config.task_type, TaskType::Classification) {
pred = 1.0 / (1.0 + (-pred).exp()); // sigmoid squashes the linear output into (0, 1)
}
Ok(pred)
}
None => Err(anyhow!("No trained model available")),
}
}
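/// Score-weighted ensemble average: with per-model predictions `p_i` and
/// weights `w_i = overall_score()`, returns `sum(w_i * p_i) / sum(w_i)`.
/// Falls back to the single best model when the ensemble is empty.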
async fn ensemble_predict(&self, features: &Array1<f64>) -> Result<f64> {
let ensemble = self.ensemble.read().await;
if ensemble.is_empty() {
return self.single_model_predict(features).await;
}
let mut predictions = Vec::new();
let mut weights = Vec::new();
for model in ensemble.iter() {
let mut pred = model.parameters.bias;
for (i, &weight) in model.parameters.weights.iter().enumerate() {
if i < features.len() {
pred += weight * features[i];
}
}
if matches!(self.config.task_type, TaskType::Classification) {
pred = 1.0 / (1.0 + (-pred).exp());
}
predictions.push(pred);
weights.push(model.performance.overall_score());
}
let total_weight: f64 = weights.iter().sum();
let weighted_pred = predictions
.iter()
.zip(&weights)
.map(|(p, w)| p * w)
.sum::<f64>()
/ total_weight;
Ok(weighted_pred)
}
pub async fn get_stats(&self) -> AutoMLStats {
self.stats.read().await.clone()
}
pub async fn get_trial_history(&self) -> Vec<ModelPerformance> {
self.trial_history.read().await.clone()
}
pub async fn get_best_model_info(
&self,
) -> Option<(Algorithm, HyperParameters, ModelPerformance)> {
let model = self.best_model.read().await;
model.as_ref().map(|m| {
(
m.algorithm,
m.hyperparameters.clone(),
m.performance.clone(),
)
})
}
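/// Serializes the best model to pretty-printed JSON with top-level keys
/// `algorithm`, `hyperparameters`, `parameters` (weights and bias), and
/// `performance`; errors if no model has been trained yet.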
pub async fn export_model(&self) -> Result<String> {
let model = self.best_model.read().await;
match &*model {
Some(m) => {
let export = serde_json::json!({
"algorithm": format!("{:?}", m.algorithm),
"hyperparameters": m.hyperparameters,
"parameters": {
"weights": m.parameters.weights,
"bias": m.parameters.bias,
},
"performance": m.performance,
});
Ok(serde_json::to_string_pretty(&export)?)
}
None => Err(anyhow!("No model to export")),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_algorithm_for_task() {
let classifiers = Algorithm::for_task(TaskType::Classification);
assert!(!classifiers.is_empty());
assert!(classifiers.contains(&Algorithm::LogisticRegression));
let regressors = Algorithm::for_task(TaskType::Regression);
assert!(regressors.contains(&Algorithm::LinearRegression));
let ts_algorithms = Algorithm::for_task(TaskType::TimeSeries);
assert!(ts_algorithms.contains(&Algorithm::ARIMA));
}
#[test]
fn test_hyperparameters_default() {
let params = HyperParameters::default();
assert_eq!(params.learning_rate, 0.01);
assert_eq!(params.n_estimators, 100);
assert_eq!(params.max_depth, Some(5));
}
#[test]
fn test_model_performance_overall_score() {
let perf = ModelPerformance {
algorithm: Algorithm::LinearRegression,
hyperparameters: HyperParameters::default(),
accuracy: None,
precision: None,
recall: None,
f1_score: None,
mse: Some(0.5),
r_squared: Some(0.9),
training_time_secs: 10.0,
inference_time_ms: 1.0,
complexity_score: 20.0,
cv_score: 0.85,
};
let score = perf.overall_score();
assert!(score > 0.0);
assert!(score <= 1.0);
}
#[tokio::test]
async fn test_automl_creation() {
let config = AutoMLConfig::default();
let automl = AutoML::new(config);
assert!(automl.is_ok());
}
#[tokio::test]
async fn test_automl_generate_hyperparameters() {
let config = AutoMLConfig::default();
let automl = AutoML::new(config).unwrap();
let params = automl
.generate_hyperparameters(Algorithm::LinearRegression)
.await;
assert!(params.is_ok());
let p = params.unwrap();
assert!(p.learning_rate > 0.0);
assert!(p.n_estimators > 0);
}
#[tokio::test]
async fn test_automl_fit_small_dataset() {
let config = AutoMLConfig {
task_type: TaskType::Regression,
max_training_time_secs: 5,
n_trials: 3,
cv_folds: 2,
enable_ensemble: false,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features = Array2::from_shape_vec(
(10, 2),
vec![
1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 9.0,
9.0, 10.0, 10.0, 11.0,
],
)
.unwrap();
let labels = Array1::from_vec(vec![3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0]);
let result = automl.fit(&features, &labels).await;
assert!(result.is_ok());
let stats = automl.get_stats().await;
assert!(stats.total_trials > 0);
assert!(stats.total_trials <= 3);
}
#[tokio::test]
async fn test_automl_prediction() {
let config = AutoMLConfig {
task_type: TaskType::Regression,
max_training_time_secs: 5,
n_trials: 2,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features = Array2::from_shape_vec(
(10, 2),
vec![
1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 9.0,
9.0, 10.0, 10.0, 11.0,
],
)
.unwrap();
let labels = Array1::from_vec(vec![3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0]);
automl.fit(&features, &labels).await.unwrap();
let test_features = Array1::from_vec(vec![5.5, 6.5]);
let prediction = automl.predict(&test_features).await;
assert!(prediction.is_ok());
}
#[tokio::test]
async fn test_ensemble_prediction() {
let config = AutoMLConfig {
task_type: TaskType::Classification,
enable_ensemble: true,
max_ensemble_size: 3,
n_trials: 5,
max_training_time_secs: 10,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features =
Array2::from_shape_vec((20, 2), (0..40).map(|x| x as f64).collect()).unwrap();
let labels = Array1::from_vec((0..20).map(|x| (x % 2) as f64).collect());
automl.fit(&features, &labels).await.unwrap();
let test_features = Array1::from_vec(vec![5.0, 10.0]);
let prediction = automl.predict(&test_features).await;
assert!(prediction.is_ok());
let pred = prediction.unwrap();
assert!((0.0..=1.0).contains(&pred));
}
#[tokio::test]
async fn test_get_best_model_info() {
let config = AutoMLConfig {
n_trials: 2,
max_training_time_secs: 5,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features =
Array2::from_shape_vec((10, 2), (0..20).map(|x| x as f64).collect()).unwrap();
let labels = Array1::from_vec((0..10).map(|x| x as f64).collect());
automl.fit(&features, &labels).await.unwrap();
let best_info = automl.get_best_model_info().await;
assert!(best_info.is_some());
let (_algorithm, _hyperparams, performance) = best_info.unwrap();
assert!(performance.cv_score >= 0.0);
}
#[tokio::test]
async fn test_export_model() {
let config = AutoMLConfig {
n_trials: 1,
max_training_time_secs: 5,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features =
Array2::from_shape_vec((10, 2), (0..20).map(|x| x as f64).collect()).unwrap();
let labels = Array1::from_vec((0..10).map(|x| x as f64).collect());
automl.fit(&features, &labels).await.unwrap();
let export = automl.export_model().await;
assert!(export.is_ok());
let json_str = export.unwrap();
assert!(json_str.contains("algorithm"));
assert!(json_str.contains("hyperparameters"));
}
#[tokio::test]
async fn test_trial_history() {
let config = AutoMLConfig {
n_trials: 3,
max_training_time_secs: 5,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features =
Array2::from_shape_vec((10, 2), (0..20).map(|x| x as f64).collect()).unwrap();
let labels = Array1::from_vec((0..10).map(|x| x as f64).collect());
automl.fit(&features, &labels).await.unwrap();
let history = automl.get_trial_history().await;
assert!(!history.is_empty());
assert!(history.len() <= 3);
}
#[tokio::test]
async fn test_early_stopping() {
let config = AutoMLConfig {
n_trials: 100,
max_training_time_secs: 60,
early_stopping_patience: 3,
..Default::default()
};
let mut automl = AutoML::new(config).unwrap();
let features =
Array2::from_shape_vec((10, 2), (0..20).map(|x| x as f64).collect()).unwrap();
let labels = Array1::from_vec((0..10).map(|x| x as f64).collect());
automl.fit(&features, &labels).await.unwrap();
let stats = automl.get_stats().await;
assert!(stats.total_trials < 100);
}
}