1use scirs2_core::ndarray::{ArrayView1, ArrayView2};
7
8use super::automl_core::{AutoMLMethod, DataCharacteristics, TargetType};
9use sklears_core::error::Result as SklResult;
10
/// Module-local alias so every fallible API in this file shares the
/// crate-wide `sklears_core` error type.
type Result<T> = SklResult<T>;
12
/// Summary statistics measured directly from the training data, used to
/// refine baseline hyperparameter configurations.
#[derive(Debug, Clone)]
struct DatasetMetrics {
    /// Mean absolute value over all feature-matrix entries (0.0 if empty).
    avg_feature_magnitude: f64,
    /// Sample variance of the target (0.0 with fewer than two samples).
    target_variance: f64,
    /// Fraction of targets at or above the target mean, in [0, 1];
    /// 0.5 for an empty target (neutral balance).
    class_balance: f64,
    /// Number of rows in the feature matrix.
    sample_count: usize,
    /// Number of columns in the feature matrix.
    feature_count: usize,
}
21
22impl DatasetMetrics {
23 fn from_data(X: &ArrayView2<f64>, y: &ArrayView1<f64>) -> Self {
24 let (sample_count, feature_count) = X.dim();
25 let total_entries = sample_count * feature_count;
26 let avg_feature_magnitude = if total_entries > 0 {
27 X.iter().map(|value| value.abs()).sum::<f64>() / total_entries as f64
28 } else {
29 0.0
30 };
31
32 let target_len = y.len();
33 let (target_variance, class_balance) = if target_len > 0 {
34 let target_mean = y.iter().copied().sum::<f64>() / target_len as f64;
35 let variance = if target_len > 1 {
36 y.iter()
37 .map(|value| (value - target_mean).powi(2))
38 .sum::<f64>()
39 / (target_len - 1) as f64
40 } else {
41 0.0
42 };
43
44 let positives = y.iter().filter(|value| **value >= target_mean).count();
45 let balance = (positives as f64 / target_len as f64).clamp(0.0, 1.0);
46 (variance, balance)
47 } else {
48 (0.0, 0.5)
49 };
50
51 Self {
52 avg_feature_magnitude,
53 target_variance,
54 class_balance,
55 sample_count,
56 feature_count,
57 }
58 }
59}
60
/// Heuristic hyperparameter tuner for the AutoML feature-selection methods.
#[derive(Debug, Clone)]
pub struct HyperparameterOptimizer {
    /// Iteration budget for optimization (defaults to 20).
    /// NOTE(review): not consulted by any method in this file — confirm
    /// whether callers elsewhere rely on it.
    pub max_iterations: usize,
}
66
// Heuristic per-method tuning: a baseline config is chosen from the
// pre-computed `DataCharacteristics`, then refined with statistics measured
// directly from the data.
impl HyperparameterOptimizer {
    /// Create an optimizer with the default budget of 20 iterations.
    pub fn new() -> Self {
        Self { max_iterations: 20 }
    }

    /// Build a tuned configuration for `method`.
    ///
    /// Three steps: (1) pick a baseline config from `characteristics`,
    /// (2) refine it in place using `DatasetMetrics` measured from `X`/`y`,
    /// (3) attach a relative computational-cost estimate.
    ///
    /// # Errors
    /// Propagates errors from the per-method `optimize_*` helpers (all of
    /// which are currently infallible).
    pub fn optimize_method(
        &self,
        method: &AutoMLMethod,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        characteristics: &DataCharacteristics,
    ) -> Result<OptimizedMethod> {
        let metrics = DatasetMetrics::from_data(&X, &y);

        // Baseline configuration chosen from dataset characteristics only.
        let mut config = match method {
            AutoMLMethod::UnivariateFiltering => self.optimize_univariate(characteristics)?,
            AutoMLMethod::CorrelationBased => self.optimize_correlation(characteristics)?,
            AutoMLMethod::TreeBased => self.optimize_tree(characteristics)?,
            AutoMLMethod::LassoBased => self.optimize_lasso(characteristics)?,
            AutoMLMethod::WrapperBased => self.optimize_wrapper(characteristics)?,
            AutoMLMethod::EnsembleBased => self.optimize_ensemble(characteristics)?,
            AutoMLMethod::Hybrid => self.optimize_hybrid(characteristics)?,
            AutoMLMethod::NeuralArchitectureSearch => self.optimize_nas(characteristics)?,
            AutoMLMethod::TransferLearning => self.optimize_transfer_learning(characteristics)?,
            AutoMLMethod::MetaLearningEnsemble => self.optimize_meta_learning(characteristics)?,
        };

        // Second pass: nudge the baseline using measured data statistics.
        self.adjust_config_for_data(&mut config, characteristics, &metrics);

        let estimated_cost = self.estimate_computational_cost(method, characteristics, &metrics);

        Ok(OptimizedMethod {
            method_type: method.clone(),
            config,
            estimated_cost,
        })
    }

    /// Univariate filtering: wide datasets (>1000 features) keep a smaller
    /// fraction (n/10, capped at 100); otherwise half the features, capped
    /// at 50.
    fn optimize_univariate(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let k = if characteristics.n_features > 1000 {
            (characteristics.n_features / 10).min(100)
        } else {
            (characteristics.n_features / 2).min(50)
        };

        Ok(MethodConfig::Univariate { k })
    }

    /// Correlation filtering: use a stricter 0.8 threshold when features are
    /// already highly inter-correlated on average, else 0.7.
    fn optimize_correlation(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let threshold = if characteristics.correlation_structure.average_correlation > 0.5 {
            0.8
        } else {
            0.7
        };

        Ok(MethodConfig::Correlation { threshold })
    }

    /// Tree ensemble sizing: more estimators for large sample counts, deeper
    /// trees for wide feature spaces.
    fn optimize_tree(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let n_estimators = if characteristics.n_samples > 10000 {
            100
        } else {
            50
        };
        let max_depth = if characteristics.n_features > 100 {
            10
        } else {
            6
        };

        Ok(MethodConfig::Tree {
            n_estimators,
            max_depth,
        })
    }

    /// Lasso: stronger regularization (0.1) when features outnumber samples,
    /// otherwise a light 0.01 penalty.
    fn optimize_lasso(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let alpha = if characteristics.feature_to_sample_ratio > 1.0 {
            0.1
        } else {
            0.01
        };

        Ok(MethodConfig::Lasso { alpha })
    }

    /// Wrapper search baseline: 5-fold CV with accuracy scoring; both are
    /// revised later by `adjust_config_for_data`.
    fn optimize_wrapper(&self, _characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        Ok(MethodConfig::Wrapper {
            cv_folds: 5,
            scoring: "accuracy".to_string(),
        })
    }

    /// Ensemble baseline: three base methods combined by voting.
    fn optimize_ensemble(&self, _characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        Ok(MethodConfig::Ensemble {
            n_methods: 3,
            aggregation: "voting".to_string(),
        })
    }

    /// Hybrid pipeline: a cheap first stage (univariate for very wide data,
    /// correlation otherwise) feeding a third of the features into a lasso
    /// second stage.
    fn optimize_hybrid(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let stage1_method = if characteristics.n_features > 1000 {
            "univariate"
        } else {
            "correlation"
        };

        Ok(MethodConfig::Hybrid {
            stage1_method: stage1_method.to_string(),
            stage2_method: "lasso".to_string(),
            stage1_features: characteristics.n_features / 3,
        })
    }

    /// Neural architecture search: epoch budget scales with feature count;
    /// population size depends on whether the computational budget allows
    /// complex methods.
    fn optimize_nas(&self, characteristics: &DataCharacteristics) -> Result<MethodConfig> {
        let max_epochs = if characteristics.n_features > 1000 {
            100
        } else {
            50
        };

        let population_size = if characteristics.computational_budget.allow_complex_methods {
            20
        } else {
            10
        };

        Ok(MethodConfig::NeuralArchitectureSearch {
            max_epochs,
            population_size,
            mutation_rate: 0.1,
            early_stopping_patience: 10,
        })
    }

    /// Transfer learning: pick the source domain label from the target type
    /// and give larger datasets a longer fine-tuning schedule.
    fn optimize_transfer_learning(
        &self,
        characteristics: &DataCharacteristics,
    ) -> Result<MethodConfig> {
        let source_domain = match characteristics.target_type {
            TargetType::BinaryClassification => "binary_classification",
            TargetType::MultiClassification => "multi_classification",
            TargetType::Regression => "regression",
            _ => "general",
        }
        .to_string();

        let fine_tuning_epochs = if characteristics.n_samples > 1000 {
            30
        } else {
            10
        };

        Ok(MethodConfig::TransferLearning {
            source_domain,
            adaptation_method: "fine_tuning".to_string(),
            fine_tuning_epochs,
            transfer_ratio: 0.7,
        })
    }

    /// Meta-learning ensemble: a fixed trio of base methods, with a larger
    /// ensemble when the budget allows complex methods.
    fn optimize_meta_learning(
        &self,
        characteristics: &DataCharacteristics,
    ) -> Result<MethodConfig> {
        let base_methods = vec![
            "univariate".to_string(),
            "correlation".to_string(),
            "lasso".to_string(),
        ];

        let ensemble_size = if characteristics.computational_budget.allow_complex_methods {
            5
        } else {
            3
        };

        Ok(MethodConfig::MetaLearningEnsemble {
            base_methods,
            meta_learner: "gradient_boosting".to_string(),
            adaptation_strategy: "online_learning".to_string(),
            ensemble_size,
        })
    }

    /// Refine `config` in place using statistics measured from the data.
    ///
    /// Adjustments are deliberately small and clamped so each value stays in
    /// the same general range as its baseline.
    fn adjust_config_for_data(
        &self,
        config: &mut MethodConfig,
        characteristics: &DataCharacteristics,
        metrics: &DatasetMetrics,
    ) {
        match config {
            MethodConfig::Univariate { k } => {
                let feature_cap = std::cmp::max(metrics.feature_count, 1);
                if metrics.target_variance < 1e-3 {
                    // Near-constant target: keep only a conservative slice.
                    let conservative_cap = std::cmp::max(feature_cap / 5, 1);
                    *k = (*k).min(conservative_cap);
                } else if metrics.target_variance > 1.0 {
                    // High-variance target: grant up to 5% extra features per
                    // unit of variance (variance capped at 4.0).
                    let bonus =
                        ((metrics.target_variance.min(4.0)) * feature_cap as f64 * 0.05) as usize;
                    *k = (*k + bonus).min(feature_cap);
                } else {
                    *k = (*k).min(feature_cap);
                }
                // Always select at least one feature.
                *k = (*k).max(1);
            }
            MethodConfig::Correlation { threshold } => {
                // Shift the threshold by at most 0.15 based on how far the
                // feature magnitude is from 1.0; direction depends on target
                // variance. Result always stays within [0.3, 0.95].
                let fluctuation = (metrics.avg_feature_magnitude - 1.0).abs().min(0.15);
                if metrics.target_variance < 0.3 {
                    *threshold = (*threshold - fluctuation).clamp(0.3, 0.95);
                } else {
                    *threshold = (*threshold + fluctuation).clamp(0.3, 0.95);
                }
            }
            MethodConfig::Tree {
                n_estimators,
                max_depth,
            } => {
                // Larger samples justify more trees; depth tracks target
                // variance, capped at 20.
                if metrics.sample_count > 5_000 {
                    *n_estimators = (*n_estimators).max(100);
                }
                if metrics.target_variance > 1.2 {
                    *max_depth = (*max_depth + 2).min(20);
                } else if metrics.target_variance < 0.2 {
                    *max_depth = (*max_depth).max(4);
                }
            }
            MethodConfig::Lasso { alpha } => {
                // Scale the penalty with the (bounded) feature magnitude and
                // relax it slightly for high-variance regression targets.
                let scale_adjustment = metrics.avg_feature_magnitude.clamp(0.5, 2.0);
                *alpha = (*alpha * scale_adjustment).max(1e-4);
                if matches!(characteristics.target_type, TargetType::Regression)
                    && metrics.target_variance > 2.0
                {
                    *alpha *= 0.9;
                }
            }
            MethodConfig::Wrapper { scoring, cv_folds } => {
                // Choose the scoring metric from the task type and the
                // observed class balance; size CV folds by sample count.
                let imbalance = (metrics.class_balance - 0.5).abs();
                if matches!(characteristics.target_type, TargetType::Regression) {
                    *scoring = "r2".to_string();
                } else if imbalance > 0.2 {
                    *scoring = "roc_auc".to_string();
                } else {
                    *scoring = "accuracy".to_string();
                }

                *cv_folds = if metrics.sample_count < 200 {
                    3
                } else if metrics.sample_count > 5_000 {
                    7
                } else {
                    5
                };
            }
            MethodConfig::Ensemble { n_methods, .. } => {
                // Wider data and imbalanced targets get more base methods.
                if metrics.feature_count > 500 {
                    *n_methods = (*n_methods).max(4);
                }
                if metrics.class_balance < 0.35 || metrics.class_balance > 0.65 {
                    *n_methods = (*n_methods).max(5);
                }
            }
            MethodConfig::Hybrid {
                stage1_features, ..
            } => {
                // Pass fewer features to stage 2 for flat targets, more for
                // high-variance ones; never exceed the available features.
                let feature_cap = std::cmp::max(metrics.feature_count, 1);
                let mut desired = feature_cap / 3;
                if metrics.target_variance < 0.2 {
                    desired = std::cmp::max(feature_cap / 5, 1);
                } else if metrics.target_variance > 1.0 {
                    desired = std::cmp::max(feature_cap / 2, 1);
                }
                *stage1_features = desired.min(feature_cap);
            }
            MethodConfig::NeuralArchitectureSearch {
                max_epochs,
                population_size,
                early_stopping_patience,
                ..
            } => {
                if metrics.sample_count > 2_000 {
                    *population_size = (*population_size).max(25);
                }
                if metrics.target_variance < 0.4 {
                    // Low-variance targets get a larger epoch budget and
                    // more patience before early stopping.
                    *max_epochs = (*max_epochs).max(80);
                    *early_stopping_patience = (*early_stopping_patience).max(15);
                } else {
                    *max_epochs = (*max_epochs).min(150);
                }
            }
            MethodConfig::TransferLearning {
                transfer_ratio,
                fine_tuning_epochs,
                ..
            } => {
                // Transfer ratio: 0.6 for regression, 0.8 for high-variance
                // classification targets, 0.7 otherwise.
                if matches!(characteristics.target_type, TargetType::Regression) {
                    *transfer_ratio = 0.6;
                } else if metrics.target_variance > 1.0 {
                    *transfer_ratio = 0.8;
                } else {
                    *transfer_ratio = 0.7;
                }

                if metrics.sample_count > 2_500 {
                    *fine_tuning_epochs = (*fine_tuning_epochs).max(25);
                }
            }
            MethodConfig::MetaLearningEnsemble { ensemble_size, .. } => {
                // Grow the ensemble for very wide data; shrink it when
                // samples are scarce.
                if metrics.feature_count > 1_000 {
                    *ensemble_size = (*ensemble_size).max(6);
                }
                if metrics.sample_count < 500 {
                    *ensemble_size = (*ensemble_size).min(4);
                }
            }
        }
    }

    /// Estimate a relative, unitless computational cost for running `method`
    /// on this dataset.
    ///
    /// Cost = (samples * features / 1e6) * per-method multiplier, scaled by
    /// small penalties for unusual feature magnitudes and class imbalance,
    /// with a 15% discount when the target is effectively constant.
    fn estimate_computational_cost(
        &self,
        method: &AutoMLMethod,
        characteristics: &DataCharacteristics,
        metrics: &DatasetMetrics,
    ) -> f64 {
        let base_cost =
            characteristics.n_samples as f64 * characteristics.n_features as f64 / 1_000_000.0;

        // Up to +15% for feature scales far from unit magnitude.
        let scale_penalty = 1.0 + (metrics.avg_feature_magnitude - 1.0).abs().min(3.0) * 0.05;
        let variance_discount = if metrics.target_variance < 1e-6 {
            0.85
        } else {
            1.0
        };
        // Up to +25% for a maximally imbalanced target.
        let imbalance_penalty = 1.0 + (metrics.class_balance - 0.5).abs() * 0.5;

        // Rough relative expense of each method family.
        let method_multiplier = match method {
            AutoMLMethod::UnivariateFiltering => 0.1,
            AutoMLMethod::CorrelationBased => 0.5,
            AutoMLMethod::TreeBased => 2.0,
            AutoMLMethod::LassoBased => 1.5,
            AutoMLMethod::WrapperBased => 10.0,
            AutoMLMethod::EnsembleBased => 5.0,
            AutoMLMethod::Hybrid => 3.0,
            AutoMLMethod::NeuralArchitectureSearch => 15.0,
            AutoMLMethod::TransferLearning => 8.0,
            AutoMLMethod::MetaLearningEnsemble => 12.0,
        };

        base_cost * method_multiplier * scale_penalty * variance_discount * imbalance_penalty
    }
}
418
419impl Default for HyperparameterOptimizer {
420 fn default() -> Self {
421 Self::new()
422 }
423}
424
/// A feature-selection method paired with its tuned configuration and a
/// relative cost estimate.
#[derive(Debug, Clone)]
pub struct OptimizedMethod {
    /// The AutoML method this configuration applies to.
    pub method_type: AutoMLMethod,
    /// Tuned hyperparameters for `method_type`.
    pub config: MethodConfig,
    /// Relative, unitless computational cost estimate.
    pub estimated_cost: f64,
}
432
/// Tuned hyperparameter bundle; one variant per `AutoMLMethod`.
#[derive(Debug, Clone)]
pub enum MethodConfig {
    /// Univariate statistical filtering.
    Univariate {
        /// Number of top-ranked features to keep.
        k: usize,
    },
    /// Correlation-based filtering.
    Correlation {
        /// Correlation threshold (kept within [0.3, 0.95] by tuning).
        threshold: f64,
    },
    /// Tree-based importance selection.
    Tree {
        /// Number of trees in the ensemble.
        n_estimators: usize,
        /// Maximum tree depth (tuning caps this at 20).
        max_depth: usize,
    },
    /// Lasso (L1-regularized) selection.
    Lasso {
        /// Regularization strength (floored at 1e-4 by tuning).
        alpha: f64,
    },
    /// Wrapper-based search with cross-validation.
    Wrapper {
        /// Number of CV folds (3, 5, or 7 after tuning).
        cv_folds: usize,
        /// Scoring metric name: "accuracy", "roc_auc", or "r2".
        scoring: String,
    },
    /// Ensemble of several selection methods.
    Ensemble {
        /// How many base methods to combine.
        n_methods: usize,
        /// Aggregation strategy name, e.g. "voting".
        aggregation: String,
    },
    /// Two-stage hybrid pipeline.
    Hybrid {
        /// Method name for the cheap first stage (e.g. "univariate").
        stage1_method: String,
        /// Method name for the refining second stage (e.g. "lasso").
        stage2_method: String,
        /// Number of features passed from stage 1 to stage 2.
        stage1_features: usize,
    },
    /// Evolutionary neural architecture search.
    NeuralArchitectureSearch {
        /// Training epoch budget.
        max_epochs: usize,
        /// Candidate population size.
        population_size: usize,
        /// Mutation probability (0.1 baseline).
        mutation_rate: f64,
        /// Epochs without improvement before stopping.
        early_stopping_patience: usize,
    },
    /// Transfer learning from a source domain.
    TransferLearning {
        /// Source domain label derived from the target type.
        source_domain: String,
        /// Adaptation strategy name, e.g. "fine_tuning".
        adaptation_method: String,
        /// Number of fine-tuning epochs on the target data.
        fine_tuning_epochs: usize,
        /// Fraction of transferred knowledge (0.6–0.8 after tuning).
        transfer_ratio: f64,
    },
    /// Meta-learning ensemble over several base methods.
    MetaLearningEnsemble {
        /// Names of the base selection methods.
        base_methods: Vec<String>,
        /// Meta-learner name, e.g. "gradient_boosting".
        meta_learner: String,
        /// Adaptation strategy name, e.g. "online_learning".
        adaptation_strategy: String,
        /// Number of ensemble members.
        ensemble_size: usize,
    },
}
485
486impl OptimizedMethod {
487 pub fn fit(self, X: ArrayView2<f64>, y: ArrayView1<f64>) -> Result<TrainedMethod> {
489 let mut selected_features: Vec<usize> = match &self.method_type {
491 AutoMLMethod::UnivariateFiltering => {
492 if let MethodConfig::Univariate { k } = &self.config {
493 (0..*k.min(&X.ncols())).collect()
494 } else {
495 (0..X.ncols().min(10)).collect()
496 }
497 }
498 AutoMLMethod::CorrelationBased => {
499 (0..X.ncols().min(20)).collect()
501 }
502 AutoMLMethod::TreeBased => {
503 (0..X.ncols().min(30)).collect()
505 }
506 AutoMLMethod::LassoBased => {
507 (0..X.ncols().min(15)).collect()
509 }
510 AutoMLMethod::WrapperBased => {
511 (0..X.ncols().min(25)).collect()
513 }
514 AutoMLMethod::EnsembleBased => {
515 (0..X.ncols().min(35)).collect()
517 }
518 AutoMLMethod::Hybrid => {
519 (0..X.ncols().min(20)).collect()
521 }
522 AutoMLMethod::NeuralArchitectureSearch => {
523 (0..X.ncols().min(40)).collect()
525 }
526 AutoMLMethod::TransferLearning => {
527 (0..X.ncols().min(30)).collect()
529 }
530 AutoMLMethod::MetaLearningEnsemble => {
531 (0..X.ncols().min(50)).collect()
533 }
534 };
535
536 let metrics = DatasetMetrics::from_data(&X, &y);
537
538 if metrics.target_variance < 1e-6 && selected_features.len() > 10 {
539 selected_features.truncate(10);
540 }
541
542 let denom = std::cmp::max(selected_features.len(), 1) as f64;
543 let importance_scale = 1.0 + metrics.target_variance.sqrt().min(2.0);
544 let balance_adjustment = 1.0 + (0.5 - metrics.class_balance).abs() * 0.5;
545 let magnitude_adjustment = metrics.avg_feature_magnitude.max(0.1);
546
547 let feature_importances: Vec<f64> = selected_features
548 .iter()
549 .enumerate()
550 .map(|(index, _)| {
551 let rank = (denom - index as f64) / denom;
552 (rank * importance_scale * balance_adjustment * magnitude_adjustment).max(0.05)
553 })
554 .collect();
555
556 Ok(TrainedMethod {
557 method_type: self.method_type,
558 config: self.config,
559 selected_features: selected_features.clone(),
560 feature_importances,
561 })
562 }
563}
564
/// Output of fitting an `OptimizedMethod`: which features were selected and
/// how important each one is estimated to be.
#[derive(Debug, Clone)]
pub struct TrainedMethod {
    /// The method that produced this selection.
    pub method_type: AutoMLMethod,
    /// Configuration used during fitting.
    pub config: MethodConfig,
    /// Selected column indices (parallel to `feature_importances`).
    pub selected_features: Vec<usize>,
    /// Heuristic importance score per selected feature (each >= 0.05).
    pub feature_importances: Vec<f64>,
}
573
574impl TrainedMethod {
575 pub fn transform_indices(&self) -> Result<Vec<usize>> {
576 Ok(self.selected_features.clone())
577 }
578}