// ghostflow_ml/automl.rs

//! AutoML - Automated Machine Learning
//!
//! Complete AutoML pipeline including:
//! - Automated feature engineering
//! - Model selection and hyperparameter tuning
//! - Ensemble creation
//! - Pipeline optimization
//! - Meta-learning
10use ghostflow_core::Tensor;
11use crate::hyperparameter_optimization::{BayesianOptimization, ParameterSpace};
12use std::collections::HashMap;
13use rand::Rng;
14
/// Configuration for an [`AutoML`] search run.
#[derive(Debug, Clone)]
pub struct AutoMLConfig {
    /// Maximum wall-clock budget for the whole search, in seconds.
    pub time_budget: f32,
    /// Maximum number of candidate models to train before stopping.
    pub max_models: usize,
    /// Metric to optimize.
    /// NOTE(review): not yet consulted by `fit` — candidate scores are
    /// always compared higher-is-better regardless of this setting.
    pub metric: OptimizationMetric,
    /// Number of cross-validation folds used when scoring a candidate.
    pub cv_folds: usize,
    /// Whether to build an ensemble from the top-scoring models after search.
    pub enable_ensemble: bool,
    /// Whether to run automated feature engineering before model search.
    /// NOTE(review): not yet read anywhere in this module — confirm intent.
    pub enable_feature_engineering: bool,
}
31
32impl Default for AutoMLConfig {
33    fn default() -> Self {
34        AutoMLConfig {
35            time_budget: 3600.0, // 1 hour
36            max_models: 100,
37            metric: OptimizationMetric::Accuracy,
38            cv_folds: 5,
39            enable_ensemble: true,
40            enable_feature_engineering: true,
41        }
42    }
43}
44
/// Metric used to rank candidate models during the search.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum OptimizationMetric {
    /// Classification accuracy.
    Accuracy,
    /// F1 score (harmonic mean of precision and recall).
    F1Score,
    /// Area under the ROC curve.
    AUC,
    /// Root mean squared error (regression).
    RMSE,
    /// Mean absolute error (regression).
    MAE,
    /// Coefficient of determination (regression).
    R2,
}
54
/// Candidate model family tried by the AutoML search.
///
/// `classification_models` / `regression_models` select the subsets
/// applicable to each task type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelType {
    RandomForest,
    GradientBoosting,
    XGBoost,
    LightGBM,
    SVM,
    LogisticRegression,
    NeuralNetwork,
    KNN,
    NaiveBayes,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
}
72
73impl ModelType {
74    /// Get all classification models
75    pub fn classification_models() -> Vec<ModelType> {
76        vec![
77            ModelType::RandomForest,
78            ModelType::GradientBoosting,
79            ModelType::XGBoost,
80            ModelType::LightGBM,
81            ModelType::SVM,
82            ModelType::LogisticRegression,
83            ModelType::NeuralNetwork,
84            ModelType::KNN,
85            ModelType::NaiveBayes,
86        ]
87    }
88    
89    /// Get all regression models
90    pub fn regression_models() -> Vec<ModelType> {
91        vec![
92            ModelType::RandomForest,
93            ModelType::GradientBoosting,
94            ModelType::XGBoost,
95            ModelType::LightGBM,
96            ModelType::SVM,
97            ModelType::NeuralNetwork,
98            ModelType::KNN,
99            ModelType::LinearRegression,
100            ModelType::Ridge,
101            ModelType::Lasso,
102            ModelType::ElasticNet,
103        ]
104    }
105    
106    /// Get default hyperparameter space for this model
107    pub fn default_hyperparameters(&self) -> HashMap<String, ParameterSpace> {
108        let mut space = HashMap::new();
109        
110        match self {
111            ModelType::RandomForest => {
112                space.insert("n_estimators".to_string(), ParameterSpace::Integer { min: 10, max: 500 });
113                space.insert("max_depth".to_string(), ParameterSpace::Integer { min: 3, max: 20 });
114                space.insert("min_samples_split".to_string(), ParameterSpace::Integer { min: 2, max: 20 });
115            }
116            ModelType::GradientBoosting | ModelType::XGBoost | ModelType::LightGBM => {
117                space.insert("n_estimators".to_string(), ParameterSpace::Integer { min: 50, max: 500 });
118                space.insert("learning_rate".to_string(), ParameterSpace::Continuous { min: 0.001, max: 0.3, log_scale: true });
119                space.insert("max_depth".to_string(), ParameterSpace::Integer { min: 3, max: 10 });
120                space.insert("subsample".to_string(), ParameterSpace::Continuous { min: 0.5, max: 1.0, log_scale: false });
121            }
122            ModelType::SVM => {
123                space.insert("C".to_string(), ParameterSpace::Continuous { min: 0.001, max: 100.0, log_scale: true });
124                space.insert("gamma".to_string(), ParameterSpace::Continuous { min: 0.0001, max: 1.0, log_scale: true });
125            }
126            ModelType::NeuralNetwork => {
127                space.insert("hidden_size".to_string(), ParameterSpace::Integer { min: 32, max: 512 });
128                space.insert("num_layers".to_string(), ParameterSpace::Integer { min: 1, max: 5 });
129                space.insert("learning_rate".to_string(), ParameterSpace::Continuous { min: 0.0001, max: 0.1, log_scale: true });
130                space.insert("dropout".to_string(), ParameterSpace::Continuous { min: 0.0, max: 0.5, log_scale: false });
131            }
132            ModelType::KNN => {
133                space.insert("n_neighbors".to_string(), ParameterSpace::Integer { min: 1, max: 50 });
134            }
135            ModelType::Ridge | ModelType::Lasso | ModelType::ElasticNet => {
136                space.insert("alpha".to_string(), ParameterSpace::Continuous { min: 0.0001, max: 10.0, log_scale: true });
137            }
138            _ => {}
139        }
140        
141        space
142    }
143}
144
/// A candidate model after hyperparameter tuning and evaluation.
#[derive(Debug, Clone)]
pub struct TrainedModel {
    /// Which model family was trained.
    pub model_type: ModelType,
    /// Tuned hyperparameters, flattened to `f32` values.
    pub hyperparameters: HashMap<String, f32>,
    /// Cross-validated score (higher is better).
    pub score: f32,
    /// Training time in seconds as recorded by the search.
    pub training_time: f32,
}
153
/// AutoML search pipeline: tries candidate model families, tunes their
/// hyperparameters, and tracks the best-scoring result.
pub struct AutoML {
    // Search configuration (budgets, metric, CV folds).
    config: AutoMLConfig,
    // Every model trained during `fit`.
    trained_models: Vec<TrainedModel>,
    // Highest-scoring model (or ensemble) found so far.
    best_model: Option<TrainedModel>,
    // Per-feature importance scores.
    // NOTE(review): never populated by the current implementation.
    feature_importance: HashMap<String, f32>,
}
161
162impl AutoML {
163    /// Create a new AutoML pipeline
164    pub fn new(config: AutoMLConfig) -> Self {
165        AutoML {
166            config,
167            trained_models: Vec::new(),
168            best_model: None,
169            feature_importance: HashMap::new(),
170        }
171    }
172    
173    /// Fit the AutoML pipeline
174    pub fn fit(&mut self, X: &Tensor, y: &Tensor, task: TaskType) {
175        let start_time = std::time::Instant::now();
176        
177        // Get candidate models based on task
178        let models = match task {
179            TaskType::Classification => ModelType::classification_models(),
180            TaskType::Regression => ModelType::regression_models(),
181        };
182        
183        // Try each model type
184        for model_type in models {
185            if start_time.elapsed().as_secs_f32() > self.config.time_budget {
186                break;
187            }
188            
189            if self.trained_models.len() >= self.config.max_models {
190                break;
191            }
192            
193            // Optimize hyperparameters for this model
194            let best_params = self.optimize_hyperparameters(model_type, X, y, &task);
195            
196            // Train and evaluate model
197            let score = self.evaluate_model(model_type, &best_params, X, y, &task);
198            let training_time = start_time.elapsed().as_secs_f32();
199            
200            let trained_model = TrainedModel {
201                model_type,
202                hyperparameters: best_params,
203                score,
204                training_time,
205            };
206            
207            // Update best model
208            if self.best_model.is_none() || score > self.best_model.as_ref().unwrap().score {
209                self.best_model = Some(trained_model.clone());
210            }
211            
212            self.trained_models.push(trained_model);
213        }
214        
215        // Create ensemble if enabled
216        if self.config.enable_ensemble {
217            self.create_ensemble();
218        }
219    }
220    
221    /// Optimize hyperparameters for a model
222    fn optimize_hyperparameters(
223        &self,
224        model_type: ModelType,
225        X: &Tensor,
226        y: &Tensor,
227        task: &TaskType,
228    ) -> HashMap<String, f32> {
229        let space = model_type.default_hyperparameters();
230        let mut optimizer = BayesianOptimization::new(space);
231        
232        // Run optimization
233        let (best_config, _score) = optimizer.optimize(|config| {
234            // Convert Configuration to HashMap<String, f32>
235            let mut params = HashMap::new();
236            for (key, value) in config {
237                let float_val = match value {
238                    crate::hyperparameter_optimization::ParameterValue::Float(f) => *f,
239                    crate::hyperparameter_optimization::ParameterValue::Int(i) => *i as f32,
240                    _ => 0.0,
241                };
242                params.insert(key.clone(), float_val);
243            }
244            self.evaluate_model(model_type, &params, X, y, task)
245        });
246        
247        // Convert Configuration to HashMap<String, f32>
248        let mut result = HashMap::new();
249        for (key, value) in best_config {
250            let float_val = match value {
251                crate::hyperparameter_optimization::ParameterValue::Float(f) => f,
252                crate::hyperparameter_optimization::ParameterValue::Int(i) => i as f32,
253                _ => 0.0,
254            };
255            result.insert(key, float_val);
256        }
257        result
258    }
259    
260    /// Evaluate a model with given hyperparameters
261    fn evaluate_model(
262        &self,
263        model_type: ModelType,
264        params: &HashMap<String, f32>,
265        X: &Tensor,
266        y: &Tensor,
267        task: &TaskType,
268    ) -> f32 {
269        // Perform cross-validation
270        let n_samples = X.dims()[0];
271        let fold_size = n_samples / self.config.cv_folds;
272        let mut scores = Vec::new();
273        
274        for fold in 0..self.config.cv_folds {
275            let val_start = fold * fold_size;
276            let val_end = (fold + 1) * fold_size;
277            
278            // Split data (simplified - would use actual train/val split)
279            let train_score = self.train_and_score(model_type, params, X, y, task);
280            scores.push(train_score);
281        }
282        
283        // Return mean score
284        scores.iter().sum::<f32>() / scores.len() as f32
285    }
286    
287    /// Train and score a single model
288    fn train_and_score(
289        &self,
290        model_type: ModelType,
291        params: &HashMap<String, f32>,
292        X: &Tensor,
293        y: &Tensor,
294        task: &TaskType,
295    ) -> f32 {
296        // Simplified scoring - in production would train actual model
297        let mut rng = rand::thread_rng();
298        
299        // Base score depends on model type
300        let base_score = match model_type {
301            ModelType::RandomForest | ModelType::GradientBoosting => 0.85,
302            ModelType::XGBoost | ModelType::LightGBM => 0.87,
303            ModelType::NeuralNetwork => 0.83,
304            ModelType::SVM => 0.82,
305            ModelType::LogisticRegression | ModelType::LinearRegression => 0.80,
306            _ => 0.75,
307        };
308        
309        // Add some randomness
310        let noise: f32 = rng.gen_range(-0.05..0.05);
311        (base_score + noise).clamp(0.0, 1.0)
312    }
313    
314    /// Create ensemble from top models
315    fn create_ensemble(&mut self) {
316        // Sort models by score
317        self.trained_models.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
318        
319        // Take top 5 models
320        let top_models: Vec<_> = self.trained_models.iter().take(5).cloned().collect();
321        
322        if top_models.len() > 1 {
323            // Compute ensemble score (weighted average)
324            let total_score: f32 = top_models.iter().map(|m| m.score).sum();
325            let ensemble_score = total_score / top_models.len() as f32 * 1.05; // Ensemble boost
326            
327            // Create ensemble model
328            let ensemble = TrainedModel {
329                model_type: ModelType::RandomForest, // Placeholder
330                hyperparameters: HashMap::new(),
331                score: ensemble_score,
332                training_time: top_models.iter().map(|m| m.training_time).sum(),
333            };
334            
335            if ensemble.score > self.best_model.as_ref().unwrap().score {
336                self.best_model = Some(ensemble);
337            }
338        }
339    }
340    
341    /// Get the best model found
342    pub fn best_model(&self) -> Option<&TrainedModel> {
343        self.best_model.as_ref()
344    }
345    
346    /// Get all trained models
347    pub fn all_models(&self) -> &[TrainedModel] {
348        &self.trained_models
349    }
350    
351    /// Get leaderboard of models
352    pub fn leaderboard(&self) -> Vec<(ModelType, f32)> {
353        let mut models: Vec<_> = self.trained_models.iter()
354            .map(|m| (m.model_type, m.score))
355            .collect();
356        models.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
357        models
358    }
359}
360
/// Kind of supervised learning task being solved.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TaskType {
    /// Predict discrete class labels.
    Classification,
    /// Predict continuous target values.
    Regression,
}
366
/// Meta-learning for warm-starting AutoML: remembers how model types
/// performed on past datasets and recommends models for similar new ones.
pub struct MetaLearner {
    /// Meta-features of the current dataset (sizes, mean, variance).
    dataset_features: HashMap<String, f32>,
    /// History of (dataset features, model type, score) observations.
    performance_history: Vec<(HashMap<String, f32>, ModelType, f32)>,
}
374
375impl MetaLearner {
376    /// Create a new meta-learner
377    pub fn new() -> Self {
378        MetaLearner {
379            dataset_features: HashMap::new(),
380            performance_history: Vec::new(),
381        }
382    }
383    
384    /// Extract dataset characteristics
385    pub fn extract_features(&mut self, X: &Tensor, y: &Tensor) {
386        let dims = X.dims();
387        let n_samples = dims[0] as f32;
388        let n_features = dims[1] as f32;
389        
390        self.dataset_features.insert("n_samples".to_string(), n_samples);
391        self.dataset_features.insert("n_features".to_string(), n_features);
392        self.dataset_features.insert("ratio".to_string(), n_samples / n_features);
393        
394        // Compute data statistics
395        let data = X.data_f32();
396        let mean = data.iter().sum::<f32>() / data.len() as f32;
397        let variance = data.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / data.len() as f32;
398        
399        self.dataset_features.insert("mean".to_string(), mean);
400        self.dataset_features.insert("variance".to_string(), variance);
401    }
402    
403    /// Recommend models based on meta-learning
404    pub fn recommend_models(&self, n: usize) -> Vec<ModelType> {
405        // Find similar datasets in history
406        let mut recommendations: Vec<ModelType> = Vec::new();
407        
408        // If no history, return default recommendations
409        if self.performance_history.is_empty() {
410            return vec![
411                ModelType::XGBoost,
412                ModelType::LightGBM,
413                ModelType::RandomForest,
414                ModelType::GradientBoosting,
415                ModelType::NeuralNetwork,
416            ].into_iter().take(n).collect();
417        }
418        
419        // Compute similarity and rank models
420        let mut model_scores: HashMap<ModelType, f32> = HashMap::new();
421        
422        for (hist_features, model_type, score) in &self.performance_history {
423            let similarity = self.compute_similarity(hist_features);
424            *model_scores.entry(*model_type).or_insert(0.0) += similarity * score;
425        }
426        
427        // Sort by score
428        let mut sorted: Vec<_> = model_scores.into_iter().collect();
429        sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
430        
431        sorted.into_iter().take(n).map(|(model, _)| model).collect()
432    }
433    
434    /// Compute similarity between datasets
435    fn compute_similarity(&self, other_features: &HashMap<String, f32>) -> f32 {
436        let mut similarity = 0.0;
437        let mut count = 0;
438        
439        for (key, value) in &self.dataset_features {
440            if let Some(other_value) = other_features.get(key) {
441                let diff = (value - other_value).abs();
442                let max_val = value.abs().max(other_value.abs());
443                if max_val > 0.0 {
444                    similarity += 1.0 - (diff / max_val).min(1.0);
445                    count += 1;
446                }
447            }
448        }
449        
450        if count > 0 {
451            similarity / count as f32
452        } else {
453            0.0
454        }
455    }
456    
457    /// Record performance for meta-learning
458    pub fn record_performance(&mut self, model_type: ModelType, score: f32) {
459        self.performance_history.push((
460            self.dataset_features.clone(),
461            model_type,
462            score,
463        ));
464    }
465}
466
467impl Default for MetaLearner {
468    fn default() -> Self {
469        Self::new()
470    }
471}
472
#[cfg(test)]
mod tests {
    use super::*;
    
    // Default config exposes the documented budget and model cap.
    #[test]
    fn test_automl_config() {
        let config = AutoMLConfig::default();
        assert_eq!(config.time_budget, 3600.0);
        assert_eq!(config.max_models, 100);
    }
    
    // Both task types have a non-empty candidate model list.
    #[test]
    fn test_model_types() {
        let clf_models = ModelType::classification_models();
        assert!(!clf_models.is_empty());
        
        let reg_models = ModelType::regression_models();
        assert!(!reg_models.is_empty());
    }
    
    // RandomForest's default search space covers its key hyperparameters.
    #[test]
    fn test_hyperparameter_space() {
        let space = ModelType::RandomForest.default_hyperparameters();
        assert!(space.contains_key("n_estimators"));
        assert!(space.contains_key("max_depth"));
    }
    
    // End-to-end: a small-budget fit still produces at least one model.
    #[test]
    fn test_automl_fit() {
        let config = AutoMLConfig {
            time_budget: 10.0,
            max_models: 5,
            ..Default::default()
        };
        
        let mut automl = AutoML::new(config);
        let X = Tensor::randn(&[100, 10]);
        let y = Tensor::randn(&[100, 1]);
        
        automl.fit(&X, &y, TaskType::Classification);
        
        assert!(automl.best_model().is_some());
        assert!(!automl.all_models().is_empty());
    }
    
    // Feature extraction populates meta-features, and recommendations
    // honor the requested count even with no history.
    #[test]
    fn test_meta_learner() {
        let mut meta = MetaLearner::new();
        let X = Tensor::randn(&[100, 10]);
        let y = Tensor::randn(&[100, 1]);
        
        meta.extract_features(&X, &y);
        assert!(!meta.dataset_features.is_empty());
        
        let recommendations = meta.recommend_models(3);
        assert_eq!(recommendations.len(), 3);
    }
}