sklears_feature_selection/automl/
mod.rs

1//! AutoML Feature Selection Module
2//!
3//! Comprehensive automated feature selection framework with multiple specialized modules.
4//! All implementations follow the SciRS2 policy using scirs2-core for numerical computations.
5
6// Module declarations
7pub mod advanced_optimizer;
8pub mod automl_core;
9pub mod benchmark_framework;
10pub mod data_analyzer;
11pub mod hyperparameter_optimizer;
12pub mod method_selector;
13pub mod pipeline_optimizer;
14pub mod preprocessing_integration;
15
16// Re-export core types and functionality
17pub use automl_core::{
18    AutoMLError, AutoMLMethod, AutoMLResults, AutoMLSummary, AutomatedFeatureSelectionPipeline,
19    ComputationalBudget, CorrelationStructure, DataCharacteristics, TargetType,
20};
21
22pub use data_analyzer::DataAnalyzer;
23
24pub use method_selector::MethodSelector;
25
26pub use hyperparameter_optimizer::{
27    HyperparameterOptimizer, MethodConfig, OptimizedMethod, TrainedMethod,
28};
29
30pub use pipeline_optimizer::{
31    MethodInfo, MethodPerformance, OptimalPipeline, PipelineConfig, PipelineConfigResult,
32    PipelineOptimizer, TrainedOptimalPipeline, ValidationStrategy,
33};
34
35pub use preprocessing_integration::{
36    DimensionalityReduction, FeatureEngineering, MissingValueStrategy, OutlierHandling,
37    PreprocessingIntegration, ScalerType,
38};
39
40pub use advanced_optimizer::{
41    AdvancedHyperparameterOptimizer, EarlyStoppingConfig, OptimizationStrategy,
42};
43
44pub use benchmark_framework::{
45    AutoMLBenchmark, BenchmarkDataset, BenchmarkMetric, BenchmarkResults, DatasetType,
46    DetailedBenchmarkResults, DifficultyLevel, ErrorAnalysis, ImprovementRatios, MethodComparison,
47    OptimizationDetails, PerformanceMetrics,
48};
49
50// Factory pattern for easy construction
51use scirs2_core::ndarray::{ArrayView1, ArrayView2};
52use sklears_core::error::Result as SklResult;
53
54type Result<T> = SklResult<T>;
55
56/// Comprehensive AutoML Factory for creating and managing all components
57#[derive(Debug, Clone)]
58pub struct AutoMLFactory {
59    config: AutoMLFactoryConfig,
60}
61
/// Configuration for the AutoML factory.
///
/// All fields are plain values, so configurations can be cloned and compared
/// for equality (useful when asserting what a builder chain produced).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AutoMLFactoryConfig {
    /// Whether advanced hyperparameter optimization is enabled. Defaults to `false`.
    pub enable_advanced_optimization: bool,
    /// Whether preprocessing integration is enabled. Defaults to `true`.
    pub enable_preprocessing: bool,
    /// Whether benchmarking is enabled. Defaults to `false`.
    pub enable_benchmarking: bool,
    /// Number of parallel workers. Defaults to `1`.
    pub parallel_workers: usize,
    /// Time budget for optimization, in seconds. Defaults to `300`.
    pub time_budget_seconds: u64,
}

impl Default for AutoMLFactoryConfig {
    /// Returns the baseline configuration: preprocessing on, advanced
    /// optimization and benchmarking off, one worker, a 300-second budget.
    fn default() -> Self {
        Self {
            enable_advanced_optimization: false,
            enable_preprocessing: true,
            enable_benchmarking: false,
            parallel_workers: 1,
            time_budget_seconds: 300,
        }
    }
}
83
84impl AutoMLFactory {
85    /// Create a new AutoML factory with default configuration
86    pub fn new() -> Self {
87        Self {
88            config: AutoMLFactoryConfig::default(),
89        }
90    }
91
92    /// Create factory with custom configuration
93    pub fn with_config(config: AutoMLFactoryConfig) -> Self {
94        Self { config }
95    }
96
97    /// Enable advanced hyperparameter optimization
98    pub fn with_advanced_optimization(mut self) -> Self {
99        self.config.enable_advanced_optimization = true;
100        self
101    }
102
103    /// Enable preprocessing integration
104    pub fn with_preprocessing(mut self) -> Self {
105        self.config.enable_preprocessing = true;
106        self
107    }
108
109    /// Enable benchmarking capabilities
110    pub fn with_benchmarking(mut self) -> Self {
111        self.config.enable_benchmarking = true;
112        self
113    }
114
115    /// Set time budget for optimization
116    pub fn with_time_budget(mut self, seconds: u64) -> Self {
117        self.config.time_budget_seconds = seconds;
118        self
119    }
120
121    /// Set number of parallel workers
122    pub fn with_parallel_workers(mut self, workers: usize) -> Self {
123        self.config.parallel_workers = workers;
124        self
125    }
126
127    /// Create a basic AutoML pipeline
128    pub fn create_basic_pipeline(&self) -> AutomatedFeatureSelectionPipeline {
129        let mut pipeline = AutomatedFeatureSelectionPipeline::new();
130
131        if self.config.enable_preprocessing {
132            pipeline = pipeline.with_preprocessing();
133        }
134
135        if self.config.enable_advanced_optimization {
136            let advanced_optimizer = AdvancedHyperparameterOptimizer::new()
137                .with_time_budget(std::time::Duration::from_secs(
138                    self.config.time_budget_seconds,
139                ))
140                .with_parallel_workers(self.config.parallel_workers);
141            pipeline = pipeline.with_advanced_optimizer(advanced_optimizer);
142        }
143
144        pipeline
145    }
146
147    /// Create an advanced AutoML pipeline with full configuration
148    pub fn create_advanced_pipeline(&self) -> AutomatedFeatureSelectionPipeline {
149        let preprocessing = PreprocessingIntegration::new()
150            .with_scaler(ScalerType::StandardScaler)
151            .with_missing_value_strategy(MissingValueStrategy::KNN { k: 5 })
152            .with_outlier_handling(OutlierHandling::IQR { multiplier: 1.5 })
153            .with_feature_engineering(FeatureEngineering::Polynomial { degree: 2 });
154
155        let advanced_optimizer = AdvancedHyperparameterOptimizer::new()
156            .with_strategy(OptimizationStrategy::BayesianOptimization)
157            .with_time_budget(std::time::Duration::from_secs(
158                self.config.time_budget_seconds,
159            ))
160            .with_parallel_workers(self.config.parallel_workers)
161            .with_early_stopping(EarlyStoppingConfig {
162                patience: 10,
163                min_improvement: 0.001,
164                restore_best: true,
165            });
166
167        AutomatedFeatureSelectionPipeline::new()
168            .with_custom_preprocessing(preprocessing)
169            .with_advanced_optimizer(advanced_optimizer)
170    }
171
172    /// Create a speed-optimized pipeline for large datasets
173    pub fn create_speed_optimized_pipeline(&self) -> AutomatedFeatureSelectionPipeline {
174        let preprocessing = PreprocessingIntegration::new()
175            .with_scaler(ScalerType::MinMaxScaler)
176            .with_missing_value_strategy(MissingValueStrategy::Mean);
177
178        AutomatedFeatureSelectionPipeline::new().with_custom_preprocessing(preprocessing)
179    }
180
181    /// Create a comprehensive benchmark suite
182    pub fn create_benchmark_suite(&self) -> Result<AutoMLBenchmark> {
183        if !self.config.enable_benchmarking {
184            return Err(AutoMLError::InvalidConfiguration.into());
185        }
186
187        let mut benchmark = AutoMLBenchmark::new()
188            .with_methods(vec![
189                AutoMLMethod::UnivariateFiltering,
190                AutoMLMethod::CorrelationBased,
191                AutoMLMethod::TreeBased,
192                AutoMLMethod::LassoBased,
193                AutoMLMethod::WrapperBased,
194                AutoMLMethod::EnsembleBased,
195                AutoMLMethod::Hybrid,
196                AutoMLMethod::NeuralArchitectureSearch,
197                AutoMLMethod::TransferLearning,
198                AutoMLMethod::MetaLearningEnsemble,
199            ])
200            .with_metrics(vec![
201                BenchmarkMetric::Accuracy,
202                BenchmarkMetric::F1Score,
203                BenchmarkMetric::FeatureReduction,
204                BenchmarkMetric::ComputationalTime,
205                BenchmarkMetric::FeatureStability,
206            ]);
207
208        // Generate synthetic datasets for comprehensive evaluation
209        benchmark.generate_synthetic_datasets(10)?;
210
211        Ok(benchmark)
212    }
213
214    /// Run quick AutoML feature selection
215    pub fn quick_feature_selection(
216        &self,
217        X: ArrayView2<f64>,
218        y: ArrayView1<f64>,
219        target_features: Option<usize>,
220    ) -> Result<AutoMLResults> {
221        let pipeline = self.create_basic_pipeline();
222        pipeline.auto_select_features(X, y, target_features)
223    }
224
225    /// Run comprehensive AutoML feature selection with all optimizations
226    pub fn comprehensive_feature_selection(
227        &self,
228        X: ArrayView2<f64>,
229        y: ArrayView1<f64>,
230        target_features: Option<usize>,
231    ) -> Result<AutoMLResults> {
232        let pipeline = self.create_advanced_pipeline();
233        pipeline.auto_select_features(X, y, target_features)
234    }
235
236    /// Analyze data characteristics
237    pub fn analyze_data_characteristics(
238        &self,
239        X: ArrayView2<f64>,
240        y: ArrayView1<f64>,
241    ) -> Result<DataCharacteristics> {
242        let analyzer = DataAnalyzer::new();
243        analyzer.analyze_data(X, y)
244    }
245
246    /// Get method recommendations based on data characteristics
247    pub fn recommend_methods(
248        &self,
249        characteristics: &DataCharacteristics,
250    ) -> Result<Vec<AutoMLMethod>> {
251        let selector = MethodSelector::new();
252        selector.select_methods(characteristics)
253    }
254
255    /// Create custom preprocessing configuration based on data
256    pub fn auto_configure_preprocessing(
257        &self,
258        characteristics: &DataCharacteristics,
259    ) -> PreprocessingIntegration {
260        PreprocessingIntegration::auto_configure(characteristics)
261    }
262
263    /// Run benchmarking evaluation
264    pub fn run_benchmark_evaluation(&self) -> Result<BenchmarkResults> {
265        let benchmark = self.create_benchmark_suite()?;
266        benchmark.run_benchmark()
267    }
268
269    /// Generate a comprehensive report of AutoML capabilities
270    pub fn generate_capability_report(&self) -> String {
271        let mut report = String::new();
272
273        report.push_str(
274            "╔══════════════════════════════════════════════════════════════════════════════╗\n",
275        );
276        report.push_str(
277            "║                          AutoML Factory Capabilities                         ║\n",
278        );
279        report.push_str(
280            "╚══════════════════════════════════════════════════════════════════════════════╝\n\n",
281        );
282
283        // Configuration summary
284        report.push_str("=== Configuration ===\n");
285        report.push_str(&format!(
286            "Advanced Optimization: {}\n",
287            self.config.enable_advanced_optimization
288        ));
289        report.push_str(&format!(
290            "Preprocessing: {}\n",
291            self.config.enable_preprocessing
292        ));
293        report.push_str(&format!(
294            "Benchmarking: {}\n",
295            self.config.enable_benchmarking
296        ));
297        report.push_str(&format!(
298            "Parallel Workers: {}\n",
299            self.config.parallel_workers
300        ));
301        report.push_str(&format!(
302            "Time Budget: {} seconds\n",
303            self.config.time_budget_seconds
304        ));
305
306        // Available methods
307        report.push_str("\n=== Available Methods ===\n");
308        let methods = vec![
309            "• Univariate Filtering - Fast statistical feature selection",
310            "• Correlation-Based - Remove redundant features",
311            "• Tree-Based - Feature importance from tree models",
312            "• Lasso-Based - L1 regularization feature selection",
313            "• Wrapper-Based - Model-based selection with CV",
314            "• Ensemble-Based - Combine multiple selection methods",
315            "• Hybrid - Multi-stage selection pipeline",
316            "• Neural Architecture Search - Deep learning optimization",
317            "• Transfer Learning - Leverage pre-trained models",
318            "• Meta-Learning Ensemble - Adaptive method combination",
319        ];
320        for method in methods {
321            report.push_str(&format!("{}\n", method));
322        }
323
324        // Available optimizations
325        if self.config.enable_advanced_optimization {
326            report.push_str("\n=== Optimization Strategies ===\n");
327            let strategies = vec![
328                "• Bayesian Optimization - Gaussian process guided search",
329                "• Genetic Algorithm - Evolutionary optimization",
330                "• Random Search - Efficient random exploration",
331                "• Grid Search - Exhaustive parameter exploration",
332                "• Particle Swarm Optimization - Swarm intelligence",
333                "• Simulated Annealing - Temperature-based optimization",
334                "• HyperBand - Multi-fidelity optimization",
335            ];
336            for strategy in strategies {
337                report.push_str(&format!("{}\n", strategy));
338            }
339        }
340
341        // Preprocessing capabilities
342        if self.config.enable_preprocessing {
343            report.push_str("\n=== Preprocessing Features ===\n");
344            let preprocessing = vec![
345                "• Scaling: Standard, MinMax, Robust, Quantile",
346                "• Missing Values: Mean, Median, KNN, Interpolation",
347                "• Outlier Handling: IQR, Z-Score, Isolation Forest",
348                "• Feature Engineering: Polynomial, Interaction terms",
349                "• Dimensionality Reduction: PCA, ICA, SVD",
350            ];
351            for feature in preprocessing {
352                report.push_str(&format!("{}\n", feature));
353            }
354        }
355
356        // Benchmarking capabilities
357        if self.config.enable_benchmarking {
358            report.push_str("\n=== Benchmarking Features ===\n");
359            let benchmarking = vec![
360                "• Synthetic Dataset Generation",
361                "• Multi-metric Evaluation (Accuracy, F1, Time, Stability)",
362                "• Statistical Significance Testing",
363                "• Performance Comparison and Ranking",
364                "• Error Analysis and Diagnostics",
365                "• Improvement Ratio Calculations",
366            ];
367            for feature in benchmarking {
368                report.push_str(&format!("{}\n", feature));
369            }
370        }
371
372        report.push_str("\n💡 Use AutoMLFactory::quick_feature_selection() for fast results\n");
373        report
374            .push_str("💡 Use AutoMLFactory::comprehensive_feature_selection() for best quality\n");
375
376        report
377    }
378}
379
380impl Default for AutoMLFactory {
381    fn default() -> Self {
382        Self::new()
383    }
384}
385
386// Convenience functions for quick access
387/// Quick feature selection with default settings
388pub fn quick_automl(
389    X: ArrayView2<f64>,
390    y: ArrayView1<f64>,
391    target_features: Option<usize>,
392) -> Result<AutoMLResults> {
393    let factory = AutoMLFactory::new();
394    factory.quick_feature_selection(X, y, target_features)
395}
396
397/// Comprehensive feature selection with all optimizations
398pub fn comprehensive_automl(
399    X: ArrayView2<f64>,
400    y: ArrayView1<f64>,
401    target_features: Option<usize>,
402) -> Result<AutoMLResults> {
403    let factory = AutoMLFactory::new()
404        .with_advanced_optimization()
405        .with_preprocessing()
406        .with_time_budget(600); // 10 minutes for comprehensive analysis
407    factory.comprehensive_feature_selection(X, y, target_features)
408}
409
410/// Analyze dataset and get method recommendations
411pub fn analyze_and_recommend(
412    X: ArrayView2<f64>,
413    y: ArrayView1<f64>,
414) -> Result<(DataCharacteristics, Vec<AutoMLMethod>)> {
415    let factory = AutoMLFactory::new();
416    let characteristics = factory.analyze_data_characteristics(X, y)?;
417    let methods = factory.recommend_methods(&characteristics)?;
418    Ok((characteristics, methods))
419}