1use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
6use std::collections::HashMap;
7use std::marker::PhantomData;
8use std::time::{Duration, Instant};
9
10use super::functions::Result;
11
/// Families of tree-ensemble estimators.
///
/// Not referenced in this part of the file; presumably consumed by
/// model-based selection steps elsewhere — TODO confirm.
#[derive(Debug, Clone)]
pub enum TreeEstimatorType {
    RandomForest,
    ExtraTrees,
    GradientBoosting,
    AdaBoost,
}
/// How aggressively pipeline results should be validated.
///
/// Not referenced in this chunk; semantics of each level are defined by
/// the consuming code — TODO confirm.
#[derive(Debug, Clone)]
pub enum ValidationStrategy {
    None,
    Basic,
    Comprehensive,
    Statistical,
}
/// Records how the pipeline's input features map to its output features.
///
/// Built (but currently discarded) at the end of `fit`.
#[derive(Debug, Clone)]
pub struct FeatureMapping {
    /// Number of columns in the matrix passed to `fit`.
    pub original_features: usize,
    /// Number of columns after every stage has run.
    pub final_features: usize,
    /// Synthetic names (`feature_{i}`) generated for the final columns.
    pub feature_names: Vec<String>,
    /// Provenance of each final column.
    pub feature_origins: Vec<FeatureOrigin>,
    /// One entry per trained stage, with input/output column counts.
    pub transformation_history: Vec<TransformationStep>,
}
/// Fitted parameters for the standard scaler: per-column mean and
/// standard-deviation divisor (clamped away from zero when fitted).
#[derive(Debug, Clone)]
pub struct ScalerParams {
    pub mean: Array1<f64>,
    pub scale: Array1<f64>,
}
/// Verbosity levels, ordered from silent to most verbose.
///
/// Not referenced in this chunk.
#[derive(Debug, Clone)]
pub enum LoggingLevel {
    None,
    Error,
    Warning,
    Info,
    Debug,
    Trace,
}
/// Performance-tuning knobs carried by the pipeline.
///
/// Stored on `FeatureSelectionPipeline` but not consulted by any code
/// visible in this chunk — presumably read by execution backends
/// elsewhere; TODO confirm.
#[derive(Debug, Clone)]
pub struct OptimizationConfiguration {
    pub use_simd: bool,
    pub chunk_size: usize,
    /// `None` presumably means "use a default/global pool" — verify.
    pub thread_pool_size: Option<usize>,
    pub memory_pool_size: usize,
    pub cache_size: usize,
    pub prefetch_strategy: PrefetchStrategy,
    pub vectorization_threshold: usize,
}
/// Memory-usage trade-off presets. Not referenced in this chunk.
#[derive(Debug, Clone)]
pub enum MemoryOptimization {
    None,
    Conservative,
    Aggressive,
}
/// Switches for the standard scaler: subtract the column mean and/or
/// divide by the column standard deviation.
#[derive(Debug, Clone)]
pub struct StandardScalerConfig {
    pub with_mean: bool,
    pub with_std: bool,
}
/// Statistics computable over a sliding window. Not referenced in this
/// chunk.
#[derive(Debug, Clone)]
pub enum WindowStatistic {
    Mean,
    Std,
    Min,
    Max,
    Median,
    Skewness,
    Kurtosis,
}
/// Distance metrics. Not referenced in this chunk.
#[derive(Debug, Clone)]
pub enum DistanceMetric {
    Euclidean,
    Manhattan,
    Cosine,
    Hamming,
}
/// Cache eviction policies, each with a capacity. Not referenced in this
/// chunk.
#[derive(Debug, Clone)]
pub enum CachingStrategy {
    None,
    /// Least-recently-used eviction with the given capacity.
    LRU {
        size: usize,
    },
    /// Least-frequently-used eviction with the given capacity.
    LFU {
        size: usize,
    },
    /// First-in-first-out eviction with the given capacity.
    FIFO {
        size: usize,
    },
}
/// How missing values are encoded in the input: IEEE NaN or a specific
/// sentinel value. Not referenced in this chunk.
#[derive(Debug, Clone)]
pub enum MissingValueIndicator {
    NaN,
    Value(f64),
}
/// Type-state marker for a fitted pipeline.
///
/// NOTE(review): `fit` builds a `FeatureMapping` and `PipelineMetadata`
/// but discards them (`_feature_mapping`, `_pipeline_metadata`); these
/// fields are never populated anywhere in this chunk — confirm whether
/// they should be wired up or removed.
#[derive(Debug)]
pub struct Trained {
    trained_steps: Vec<TrainedStep>,
    feature_mapping: FeatureMapping,
    pipeline_metadata: PipelineMetadata,
}
/// A configurable feature-selection pipeline using the type-state
/// pattern: `FeatureSelectionPipeline<Untrained>` exposes builders and
/// `fit`, while `FeatureSelectionPipeline<Trained>` exposes `transform`.
///
/// Stages run in declaration order: preprocessing, feature engineering,
/// filter selection, optional dimensionality reduction, optional
/// model-based selection.
#[derive(Debug, Clone)]
pub struct FeatureSelectionPipeline<State = Untrained> {
    preprocessing_steps: Vec<PreprocessingStep>,
    feature_engineering_steps: Vec<FeatureEngineeringStep>,
    selection_methods: Vec<SelectionMethod>,
    dimensionality_reduction: Option<DimensionalityReductionStep>,
    model_selection: Option<ModelSelectionStep>,
    pipeline_config: PipelineConfiguration,
    optimization_config: OptimizationConfiguration,
    // Zero-sized marker carrying the Untrained/Trained type-state.
    _phantom: PhantomData<State>,
}
155impl FeatureSelectionPipeline<Untrained> {
156 pub fn new() -> Self {
158 Self {
159 preprocessing_steps: Vec::new(),
160 feature_engineering_steps: Vec::new(),
161 selection_methods: Vec::new(),
162 dimensionality_reduction: None,
163 model_selection: None,
164 pipeline_config: PipelineConfiguration::default(),
165 optimization_config: OptimizationConfiguration::default(),
166 _phantom: PhantomData,
167 }
168 }
169 pub fn add_preprocessing_step(mut self, step: PreprocessingStep) -> Self {
171 self.preprocessing_steps.push(step);
172 self
173 }
174 pub fn add_feature_engineering_step(mut self, step: FeatureEngineeringStep) -> Self {
176 self.feature_engineering_steps.push(step);
177 self
178 }
179 pub fn add_selection_method(mut self, method: SelectionMethod) -> Self {
181 self.selection_methods.push(method);
182 self
183 }
184 pub fn with_dimensionality_reduction(mut self, reduction: DimensionalityReductionStep) -> Self {
186 self.dimensionality_reduction = Some(reduction);
187 self
188 }
189 pub fn with_model_selection(mut self, model_selection: ModelSelectionStep) -> Self {
191 self.model_selection = Some(model_selection);
192 self
193 }
194 pub fn with_config(mut self, config: PipelineConfiguration) -> Self {
196 self.pipeline_config = config;
197 self
198 }
199 pub fn with_optimization(mut self, config: OptimizationConfiguration) -> Self {
201 self.optimization_config = config;
202 self
203 }
204 pub fn fit(
206 mut self,
207 X: ArrayView2<f64>,
208 y: ArrayView1<f64>,
209 ) -> Result<FeatureSelectionPipeline<Trained>> {
210 let start_time = Instant::now();
211 let mut current_X = X.to_owned();
212 let current_y = y.to_owned();
213 let mut trained_steps = Vec::new();
214 let original_features = X.ncols();
215 let mut preprocessing_steps = std::mem::take(&mut self.preprocessing_steps);
216 for (idx, step) in preprocessing_steps.iter_mut().enumerate() {
217 let step_start = Instant::now();
218 current_X = Self::apply_preprocessing_step_static(step, current_X.view())?;
219 trained_steps.push(TrainedStep {
220 step_type: "Preprocessing".to_string(),
221 step_index: idx,
222 training_time: step_start.elapsed(),
223 feature_count_before: current_X.ncols(),
224 feature_count_after: current_X.ncols(),
225 parameters: StepParameters::Preprocessing(Box::new(())),
226 });
227 }
228 self.preprocessing_steps = preprocessing_steps;
229 let mut feature_engineering_steps = std::mem::take(&mut self.feature_engineering_steps);
230 for (idx, step) in feature_engineering_steps.iter_mut().enumerate() {
231 let step_start = Instant::now();
232 let features_before = current_X.ncols();
233 current_X = Self::apply_feature_engineering_step_static(
234 step,
235 current_X.view(),
236 current_y.view(),
237 )?;
238 trained_steps.push(TrainedStep {
239 step_type: "FeatureEngineering".to_string(),
240 step_index: idx,
241 training_time: step_start.elapsed(),
242 feature_count_before: features_before,
243 feature_count_after: current_X.ncols(),
244 parameters: StepParameters::FeatureEngineering(Box::new(())),
245 });
246 }
247 self.feature_engineering_steps = feature_engineering_steps;
248 let mut selection_mask = Array1::from_elem(current_X.ncols(), true);
249 let mut selection_methods = std::mem::take(&mut self.selection_methods);
250 for (idx, method) in selection_methods.iter_mut().enumerate() {
251 let step_start = Instant::now();
252 let features_before = current_X.ncols();
253 let method_mask =
254 Self::apply_selection_method_static(method, current_X.view(), current_y.view())?;
255 for (i, &selected) in method_mask.iter().enumerate() {
256 if !selected {
257 selection_mask[i] = false;
258 }
259 }
260 trained_steps.push(TrainedStep {
261 step_type: "Selection".to_string(),
262 step_index: idx,
263 training_time: step_start.elapsed(),
264 feature_count_before: features_before,
265 feature_count_after: selection_mask.iter().filter(|&&x| x).count(),
266 parameters: StepParameters::Selection(method_mask),
267 });
268 }
269 self.selection_methods = selection_methods;
270 let selected_indices: Vec<usize> = selection_mask
271 .iter()
272 .enumerate()
273 .filter_map(|(i, &selected)| if selected { Some(i) } else { None })
274 .collect();
275 if !selected_indices.is_empty() {
276 let mut selected_X = Array2::zeros((current_X.nrows(), selected_indices.len()));
277 for (new_col, &old_col) in selected_indices.iter().enumerate() {
278 for row in 0..current_X.nrows() {
279 selected_X[[row, new_col]] = current_X[[row, old_col]];
280 }
281 }
282 current_X = selected_X;
283 }
284 if self.dimensionality_reduction.is_some() {
285 let step_start = Instant::now();
286 let features_before = current_X.ncols();
287 let mut reduction = self
288 .dimensionality_reduction
289 .take()
290 .expect("operation should succeed");
291 current_X = self.apply_dimensionality_reduction(&mut reduction, current_X.view())?;
292 self.dimensionality_reduction = Some(reduction);
293 trained_steps.push(TrainedStep {
294 step_type: "DimensionalityReduction".to_string(),
295 step_index: 0,
296 training_time: step_start.elapsed(),
297 feature_count_before: features_before,
298 feature_count_after: current_X.ncols(),
299 parameters: StepParameters::DimensionalityReduction(Array2::zeros((1, 1))),
300 });
301 }
302 if self.model_selection.is_some() {
303 let step_start = Instant::now();
304 let features_before = current_X.ncols();
305 let mut model_sel = self
306 .model_selection
307 .take()
308 .expect("operation should succeed");
309 let selected_features =
310 self.apply_model_selection(&mut model_sel, current_X.view(), current_y.view())?;
311 self.model_selection = Some(model_sel);
312 if !selected_features.is_empty() {
313 let mut model_selected_X =
314 Array2::zeros((current_X.nrows(), selected_features.len()));
315 for (new_col, &old_col) in selected_features.iter().enumerate() {
316 for row in 0..current_X.nrows() {
317 model_selected_X[[row, new_col]] = current_X[[row, old_col]];
318 }
319 }
320 current_X = model_selected_X;
321 }
322 trained_steps.push(TrainedStep {
323 step_type: "ModelSelection".to_string(),
324 step_index: 0,
325 training_time: step_start.elapsed(),
326 feature_count_before: features_before,
327 feature_count_after: current_X.ncols(),
328 parameters: StepParameters::ModelSelection(selected_features),
329 });
330 }
331 let final_features = current_X.ncols();
332 let _feature_mapping = FeatureMapping {
333 original_features,
334 final_features,
335 feature_names: (0..final_features)
336 .map(|i| format!("feature_{}", i))
337 .collect(),
338 feature_origins: (0..final_features).map(FeatureOrigin::Original).collect(),
339 transformation_history: trained_steps
340 .iter()
341 .map(|step| TransformationStep {
342 step_name: step.step_type.clone(),
343 input_features: step.feature_count_before,
344 output_features: step.feature_count_after,
345 transformation_type: TransformationType::ManyToMany,
346 })
347 .collect(),
348 };
349 let total_training_time = start_time.elapsed();
350 let feature_reduction_ratio = final_features as f64 / original_features as f64;
351 let _pipeline_metadata = PipelineMetadata {
352 total_training_time,
353 total_transform_time: Duration::from_secs(0),
354 memory_usage_peak: 0,
355 feature_reduction_ratio,
356 performance_metrics: HashMap::new(),
357 validation_results: None,
358 };
359 Ok(FeatureSelectionPipeline {
360 preprocessing_steps: self.preprocessing_steps,
361 feature_engineering_steps: self.feature_engineering_steps,
362 selection_methods: self.selection_methods,
363 dimensionality_reduction: self.dimensionality_reduction,
364 model_selection: self.model_selection,
365 pipeline_config: self.pipeline_config,
366 optimization_config: self.optimization_config,
367 _phantom: PhantomData::<Trained>,
368 })
369 }
    /// Instance-method convenience wrapper over
    /// [`Self::apply_preprocessing_step_static`]. Not called by `fit`
    /// (which uses the static form to avoid borrowing `self`).
    fn apply_preprocessing_step(
        &self,
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_preprocessing_step_static(step, X)
    }
    /// Dispatches a preprocessing step to the matching scaler routine.
    ///
    /// Scaler variants fit their parameters on first use (caching them in
    /// `trained_params`); any other variant currently passes the data
    /// through unchanged.
    fn apply_preprocessing_step_static(
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            PreprocessingStep::StandardScaler {
                config,
                trained_params,
            } => Self::apply_standard_scaler_static(config, trained_params, X),
            PreprocessingStep::RobustScaler {
                config,
                trained_params,
            } => Self::apply_robust_scaler_static(config, trained_params, X),
            PreprocessingStep::MinMaxScaler {
                config,
                trained_params,
            } => Self::apply_minmax_scaler_static(config, trained_params, X),
            // Unhandled variants are a no-op: the input is copied through.
            _ => Ok(X.to_owned()),
        }
    }
    /// Instance-method convenience wrapper over
    /// [`Self::apply_standard_scaler_static`].
    fn apply_standard_scaler(
        &self,
        config: &StandardScalerConfig,
        trained_params: &mut Option<ScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_standard_scaler_static(config, trained_params, X)
    }
405 fn apply_standard_scaler_static(
406 config: &StandardScalerConfig,
407 trained_params: &mut Option<ScalerParams>,
408 X: ArrayView2<f64>,
409 ) -> Result<Array2<f64>> {
410 let mut result = X.to_owned();
411 if trained_params.is_none() {
412 let mut mean = Array1::zeros(X.ncols());
413 let mut scale = Array1::ones(X.ncols());
414 if config.with_mean {
415 for col in 0..X.ncols() {
416 mean[col] = X.column(col).mean().unwrap_or(0.0);
417 }
418 }
419 if config.with_std {
420 for col in 0..X.ncols() {
421 let column = X.column(col);
422 let variance = column.var(1.0);
423 scale[col] = variance.sqrt().max(1e-8);
424 }
425 }
426 *trained_params = Some(ScalerParams { mean, scale });
427 }
428 if let Some(ref params) = trained_params {
429 for col in 0..X.ncols() {
430 for row in 0..X.nrows() {
431 if config.with_mean {
432 result[[row, col]] -= params.mean[col];
433 }
434 if config.with_std {
435 result[[row, col]] /= params.scale[col];
436 }
437 }
438 }
439 }
440 Ok(result)
441 }
    /// Instance-method convenience wrapper over
    /// [`Self::apply_robust_scaler_static`].
    fn apply_robust_scaler(
        &self,
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_robust_scaler_static(config, trained_params, X)
    }
    /// Robust scaler: optionally centers each column on its median and
    /// scales by its inter-quantile range. Parameters are fitted once and
    /// cached in `trained_params`.
    fn apply_robust_scaler_static(
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();
        if trained_params.is_none() {
            let mut center = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());
            for col in 0..X.ncols() {
                // Sort a copy of the column for median/quantile lookup.
                // NOTE(review): partial_cmp panics on NaN input — confirm
                // upstream guarantees NaN-free data.
                let mut column_data: Vec<f64> = X.column(col).to_vec();
                column_data.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
                let n = column_data.len();
                if config.with_centering {
                    // Median: midpoint of the two central values for even n.
                    center[col] = if n % 2 == 0 {
                        (column_data[n / 2 - 1] + column_data[n / 2]) / 2.0
                    } else {
                        column_data[n / 2]
                    };
                }
                if config.with_scaling {
                    // Quantile indices by truncation; quantile_range is
                    // expected in [0, 1] (e.g. (0.25, 0.75) for the IQR).
                    let q1_idx = ((n - 1) as f64 * config.quantile_range.0) as usize;
                    let q3_idx = ((n - 1) as f64 * config.quantile_range.1) as usize;
                    let iqr = column_data[q3_idx] - column_data[q1_idx];
                    // Clamp so constant columns do not divide by ~0.
                    scale[col] = iqr.max(1e-8);
                }
            }
            *trained_params = Some(RobustScalerParams { center, scale });
        }
        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    if config.with_centering {
                        result[[row, col]] -= params.center[col];
                    }
                    if config.with_scaling {
                        result[[row, col]] /= params.scale[col];
                    }
                }
            }
        }
        Ok(result)
    }
    /// Instance-method convenience wrapper over
    /// [`Self::apply_minmax_scaler_static`].
    fn apply_minmax_scaler(
        &self,
        config: &MinMaxScalerConfig,
        trained_params: &mut Option<MinMaxScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_minmax_scaler_static(config, trained_params, X)
    }
501 fn apply_minmax_scaler_static(
502 config: &MinMaxScalerConfig,
503 trained_params: &mut Option<MinMaxScalerParams>,
504 X: ArrayView2<f64>,
505 ) -> Result<Array2<f64>> {
506 let mut result = X.to_owned();
507 if trained_params.is_none() {
508 let mut min = Array1::zeros(X.ncols());
509 let mut scale = Array1::ones(X.ncols());
510 for col in 0..X.ncols() {
511 let column = X.column(col);
512 let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
513 let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
514 min[col] = col_min;
515 let range = col_max - col_min;
516 if range > 1e-8 {
517 scale[col] = (config.feature_range.1 - config.feature_range.0) / range;
518 }
519 }
520 *trained_params = Some(MinMaxScalerParams { min, scale });
521 }
522 if let Some(ref params) = trained_params {
523 for col in 0..X.ncols() {
524 for row in 0..X.nrows() {
525 let scaled = (result[[row, col]] - params.min[col]) * params.scale[col]
526 + config.feature_range.0;
527 result[[row, col]] = if config.clip {
528 scaled
529 .max(config.feature_range.0)
530 .min(config.feature_range.1)
531 } else {
532 scaled
533 };
534 }
535 }
536 }
537 Ok(result)
538 }
    /// Static stand-in used by `fit`: currently a no-op that copies the
    /// input through.
    ///
    /// NOTE(review): the instance method
    /// `apply_feature_engineering_step` implements polynomial /
    /// interaction / binning features, but `fit` calls THIS static
    /// version, so those implementations are never exercised during
    /// training — confirm this is intentional.
    fn apply_feature_engineering_step_static(
        _step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        Ok(X.to_owned())
    }
    /// Dispatches a feature-engineering step to its implementation.
    ///
    /// Unhandled variants pass the data through unchanged. `_y` is
    /// currently unused by every branch.
    fn apply_feature_engineering_step(
        &self,
        step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            FeatureEngineeringStep::PolynomialFeatures {
                degree,
                interaction_only,
                include_bias,
                feature_mapping,
            } => self.apply_polynomial_features(
                *degree,
                *interaction_only,
                *include_bias,
                feature_mapping,
                X,
            ),
            FeatureEngineeringStep::InteractionFeatures {
                max_pairs,
                threshold,
                feature_pairs,
            } => self.apply_interaction_features(*max_pairs, *threshold, feature_pairs, X),
            FeatureEngineeringStep::BinningFeatures {
                n_bins,
                strategy,
                bin_edges,
            } => self.apply_binning_features(*n_bins, strategy, bin_edges, X),
            // Remaining variants are not implemented yet: pass-through.
            _ => Ok(X.to_owned()),
        }
    }
578 fn apply_polynomial_features(
579 &self,
580 degree: usize,
581 interaction_only: bool,
582 include_bias: bool,
583 feature_mapping: &mut Option<Vec<(usize, usize)>>,
584 X: ArrayView2<f64>,
585 ) -> Result<Array2<f64>> {
586 let n_features = X.ncols();
587 let mut new_features = Vec::new();
588 let mut mapping = Vec::new();
589 if include_bias {
590 let bias_feature = Array1::ones(X.nrows());
591 new_features.push(bias_feature);
592 mapping.push((0, 0));
593 }
594 for i in 0..n_features {
595 new_features.push(X.column(i).to_owned());
596 mapping.push((i, 1));
597 }
598 if !interaction_only {
599 for d in 2..=degree {
600 for i in 0..n_features {
601 let mut poly_feature = Array1::zeros(X.nrows());
602 for row in 0..X.nrows() {
603 poly_feature[row] = X[[row, i]].powi(d as i32);
604 }
605 new_features.push(poly_feature);
606 mapping.push((i, d));
607 }
608 }
609 }
610 for d in 2..=degree {
611 for i in 0..n_features {
612 for j in (i + 1)..n_features {
613 let mut interaction_feature = Array1::zeros(X.nrows());
614 for row in 0..X.nrows() {
615 interaction_feature[row] = X[[row, i]] * X[[row, j]];
616 }
617 new_features.push(interaction_feature);
618 mapping.push((i * n_features + j, d));
619 }
620 }
621 }
622 *feature_mapping = Some(mapping);
623 let n_new_features = new_features.len();
624 let mut result = Array2::zeros((X.nrows(), n_new_features));
625 for (col, feature) in new_features.iter().enumerate() {
626 for row in 0..X.nrows() {
627 result[[row, col]] = feature[row];
628 }
629 }
630 Ok(result)
631 }
632 fn apply_interaction_features(
633 &self,
634 max_pairs: Option<usize>,
635 threshold: f64,
636 feature_pairs: &mut Option<Vec<(usize, usize)>>,
637 X: ArrayView2<f64>,
638 ) -> Result<Array2<f64>> {
639 let n_features = X.ncols();
640 let mut interactions = Vec::new();
641 let pairs: Vec<(usize, usize)>;
642 if feature_pairs.is_none() {
643 let mut candidate_pairs = Vec::new();
644 for i in 0..n_features {
645 for j in (i + 1)..n_features {
646 let corr = self.compute_correlation(X.column(i), X.column(j));
647 if corr.abs() > threshold {
648 candidate_pairs.push((i, j, corr.abs()));
649 }
650 }
651 }
652 candidate_pairs
653 .sort_by(|a, b| b.2.partial_cmp(&a.2).expect("operation should succeed"));
654 let limit = max_pairs.unwrap_or(candidate_pairs.len());
655 pairs = candidate_pairs
656 .into_iter()
657 .take(limit)
658 .map(|(i, j, _)| (i, j))
659 .collect();
660 *feature_pairs = Some(pairs.clone());
661 } else {
662 pairs = feature_pairs
663 .as_ref()
664 .expect("operation should succeed")
665 .clone();
666 }
667 for &(i, j) in &pairs {
668 let mut interaction = Array1::zeros(X.nrows());
669 for row in 0..X.nrows() {
670 interaction[row] = X[[row, i]] * X[[row, j]];
671 }
672 interactions.push(interaction);
673 }
674 let total_features = n_features + interactions.len();
675 let mut result = Array2::zeros((X.nrows(), total_features));
676 for col in 0..n_features {
677 for row in 0..X.nrows() {
678 result[[row, col]] = X[[row, col]];
679 }
680 }
681 for (idx, interaction) in interactions.iter().enumerate() {
682 for row in 0..X.nrows() {
683 result[[row, n_features + idx]] = interaction[row];
684 }
685 }
686 Ok(result)
687 }
    /// Replaces each column's values with discrete bin indices in
    /// `0..n_bins`, fitting `n_bins + 1` edges per column on first call
    /// and caching them in `bin_edges`.
    ///
    /// NOTE(review): the KMeans strategy currently falls back to uniform
    /// edges, and the `value <= edge` search below matches the FIRST
    /// edge (the column minimum), so only exact minima land in bin 0 and
    /// the rest of the first interval maps to bin 1 — looks like an
    /// off-by-one; confirm intended bin semantics.
    fn apply_binning_features(
        &self,
        n_bins: usize,
        strategy: &BinningStrategy,
        bin_edges: &mut Option<HashMap<usize, Vec<f64>>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();
        if bin_edges.is_none() {
            let mut edges_map = HashMap::new();
            for col in 0..X.ncols() {
                let column = X.column(col);
                let edges = match strategy {
                    // Evenly spaced edges from column min to max.
                    BinningStrategy::Uniform => {
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                    // Edges at the empirical quantiles, so bins hold
                    // roughly equal numbers of samples.
                    BinningStrategy::Quantile => {
                        let mut sorted_values: Vec<f64> = column.to_vec();
                        sorted_values
                            .sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
                        let n = sorted_values.len();
                        (0..=n_bins)
                            .map(|i| {
                                let quantile = i as f64 / n_bins as f64;
                                let idx = ((n - 1) as f64 * quantile) as usize;
                                sorted_values[idx]
                            })
                            .collect::<Vec<f64>>()
                    }
                    // Placeholder: same as Uniform (no clustering done).
                    BinningStrategy::KMeans => {
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                };
                edges_map.insert(col, edges);
            }
            *bin_edges = Some(edges_map);
        }
        if let Some(ref edges_map) = bin_edges {
            for col in 0..X.ncols() {
                if let Some(edges) = edges_map.get(&col) {
                    for row in 0..X.nrows() {
                        let value = X[[row, col]];
                        // First edge not below the value; values past the
                        // last edge fall into the top bin. Result clamped
                        // to the valid bin range 0..n_bins.
                        let bin = edges
                            .iter()
                            .position(|&edge| value <= edge)
                            .unwrap_or(edges.len() - 1)
                            .min(n_bins - 1);
                        result[[row, col]] = bin as f64;
                    }
                }
            }
        }
        Ok(result)
    }
    /// Static stand-in used by `fit`: keeps every feature (all-true mask).
    ///
    /// NOTE(review): the instance method `apply_selection_method`
    /// implements variance/correlation/univariate filters, but `fit`
    /// calls THIS static version, so no filtering actually happens
    /// during training — confirm this is intentional.
    fn apply_selection_method_static(
        _method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        Ok(Array1::from_elem(X.ncols(), true))
    }
    /// Dispatches a selection method and returns a keep/drop mask with
    /// one entry per column of `X`.
    ///
    /// Unhandled variants keep every feature.
    fn apply_selection_method(
        &self,
        method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        match method {
            SelectionMethod::VarianceThreshold {
                threshold,
                feature_variance,
            } => self.apply_variance_threshold(*threshold, feature_variance, X),
            SelectionMethod::CorrelationFilter {
                threshold,
                method: corr_method,
                correlation_matrix,
            } => self.apply_correlation_filter(*threshold, corr_method, correlation_matrix, X),
            SelectionMethod::UnivariateFilter {
                method: uni_method,
                k,
                score_func,
            } => self.apply_univariate_filter(uni_method, k, score_func, X, y),
            // Unimplemented methods select everything.
            _ => Ok(Array1::from_elem(X.ncols(), true)),
        }
    }
783 fn apply_variance_threshold(
784 &self,
785 threshold: f64,
786 feature_variance: &mut Option<Array1<f64>>,
787 X: ArrayView2<f64>,
788 ) -> Result<Array1<bool>> {
789 if feature_variance.is_none() {
790 let mut variances = Array1::zeros(X.ncols());
791 for col in 0..X.ncols() {
792 variances[col] = X.column(col).var(1.0);
793 }
794 *feature_variance = Some(variances);
795 }
796 let variances = feature_variance.as_ref().expect("operation should succeed");
797 let selection = variances.mapv(|v| v > threshold);
798 Ok(selection)
799 }
800 fn apply_correlation_filter(
801 &self,
802 threshold: f64,
803 corr_method: &CorrelationMethod,
804 correlation_matrix: &mut Option<Array2<f64>>,
805 X: ArrayView2<f64>,
806 ) -> Result<Array1<bool>> {
807 if correlation_matrix.is_none() {
808 let n_features = X.ncols();
809 let mut corr_matrix = Array2::zeros((n_features, n_features));
810 for i in 0..n_features {
811 for j in 0..n_features {
812 if i == j {
813 corr_matrix[[i, j]] = 1.0;
814 } else {
815 let corr = match corr_method {
816 CorrelationMethod::Pearson => {
817 self.compute_correlation(X.column(i), X.column(j))
818 }
819 _ => self.compute_correlation(X.column(i), X.column(j)),
820 };
821 corr_matrix[[i, j]] = corr;
822 }
823 }
824 }
825 *correlation_matrix = Some(corr_matrix);
826 }
827 let corr_matrix = correlation_matrix
828 .as_ref()
829 .expect("operation should succeed");
830 let mut selection = Array1::from_elem(X.ncols(), true);
831 for i in 0..X.ncols() {
832 for j in (i + 1)..X.ncols() {
833 if corr_matrix[[i, j]].abs() > threshold && selection[i] && selection[j] {
834 let var_i = X.column(i).var(1.0);
835 let var_j = X.column(j).var(1.0);
836 if var_i < var_j {
837 selection[i] = false;
838 } else {
839 selection[j] = false;
840 }
841 }
842 }
843 }
844 Ok(selection)
845 }
846 fn apply_univariate_filter(
847 &self,
848 _method: &UnivariateMethod,
849 k: &SelectionCount,
850 score_func: &UnivariateScoreFunction,
851 X: ArrayView2<f64>,
852 y: ArrayView1<f64>,
853 ) -> Result<Array1<bool>> {
854 let mut scores = Array1::zeros(X.ncols());
855 for col in 0..X.ncols() {
856 scores[col] = match score_func {
857 UnivariateScoreFunction::Chi2 => self.compute_chi2_score(X.column(col), y),
858 UnivariateScoreFunction::FClassif => self.compute_f_score(X.column(col), y),
859 UnivariateScoreFunction::MutualInfoClassif => {
860 self.compute_mutual_info(X.column(col), y)
861 }
862 _ => self.compute_correlation(X.column(col), y).abs(),
863 };
864 }
865 let selection = match k {
866 SelectionCount::K(k_val) => {
867 let mut indexed_scores: Vec<(usize, f64)> = scores
868 .iter()
869 .enumerate()
870 .map(|(i, &score)| (i, score))
871 .collect();
872 indexed_scores
873 .sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
874 let mut selection = Array1::from_elem(X.ncols(), false);
875 for &(idx, _) in indexed_scores.iter().take(*k_val) {
876 selection[idx] = true;
877 }
878 selection
879 }
880 SelectionCount::Percentile(p) => {
881 let k_val = ((X.ncols() as f64 * p / 100.0).round() as usize).max(1);
882 let mut indexed_scores: Vec<(usize, f64)> = scores
883 .iter()
884 .enumerate()
885 .map(|(i, &score)| (i, score))
886 .collect();
887 indexed_scores
888 .sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
889 let mut selection = Array1::from_elem(X.ncols(), false);
890 for &(idx, _) in indexed_scores.iter().take(k_val) {
891 selection[idx] = true;
892 }
893 selection
894 }
895 _ => {
896 let k_val = X.ncols() / 2;
897 let mut indexed_scores: Vec<(usize, f64)> = scores
898 .iter()
899 .enumerate()
900 .map(|(i, &score)| (i, score))
901 .collect();
902 indexed_scores
903 .sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
904 let mut selection = Array1::from_elem(X.ncols(), false);
905 for &(idx, _) in indexed_scores.iter().take(k_val) {
906 selection[idx] = true;
907 }
908 selection
909 }
910 };
911 Ok(selection)
912 }
    /// Dispatches a dimensionality-reduction step.
    ///
    /// PCA and TruncatedSVD have dedicated (placeholder) routines; every
    /// other variant falls back to truncating `X` to its first
    /// `n_components` columns (capped at 50 when the variant carries no
    /// component count).
    fn apply_dimensionality_reduction(
        &self,
        reduction: &mut DimensionalityReductionStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match reduction {
            DimensionalityReductionStep::PCA {
                n_components,
                whiten,
                svd_solver,
                components,
                explained_variance,
            } => self.apply_pca(
                *n_components,
                *whiten,
                svd_solver,
                components,
                explained_variance,
                X,
            ),
            DimensionalityReductionStep::TruncatedSVD {
                n_components,
                algorithm,
                components,
                singular_values,
            } => self.apply_truncated_svd(*n_components, algorithm, components, singular_values, X),
            _ => {
                // Extract the requested component count from whichever
                // variant this is; unknown variants default to min(ncols, 50).
                let n_comp = match reduction {
                    DimensionalityReductionStep::ICA { n_components, .. } => *n_components,
                    DimensionalityReductionStep::FactorAnalysis { n_components, .. } => {
                        *n_components
                    }
                    DimensionalityReductionStep::UMAP { n_components, .. } => *n_components,
                    DimensionalityReductionStep::TSNE { n_components, .. } => *n_components,
                    _ => X.ncols().min(50),
                };
                // Placeholder reduction: keep the first `final_components`
                // columns verbatim (no actual projection is computed).
                let final_components = n_comp.min(X.ncols());
                let mut result = Array2::zeros((X.nrows(), final_components));
                for col in 0..final_components {
                    for row in 0..X.nrows() {
                        result[[row, col]] = X[[row, col]];
                    }
                }
                Ok(result)
            }
        }
    }
    /// PCA placeholder: centers each column on its mean, then returns the
    /// first `n_comp` CENTERED columns verbatim.
    ///
    /// NOTE(review): no eigen/SVD decomposition is performed — the cached
    /// `components` are set to the identity matrix and
    /// `explained_variance` to ones, so this is not a real PCA; `whiten`
    /// and `svd_solver` are ignored. Confirm whether a proper
    /// implementation is planned.
    fn apply_pca(
        &self,
        n_components: usize,
        _whiten: bool,
        _svd_solver: &SVDSolver,
        components: &mut Option<Array2<f64>>,
        explained_variance: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        // Component count can never exceed either matrix dimension.
        let n_comp = n_components.min(X.ncols()).min(X.nrows());
        let mut centered_X = X.to_owned();
        let mut means = Array1::zeros(X.ncols());
        for col in 0..X.ncols() {
            means[col] = X.column(col).mean().unwrap_or(0.0);
            for row in 0..X.nrows() {
                centered_X[[row, col]] -= means[col];
            }
        }
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *explained_variance = Some(Array1::ones(n_comp));
        }
        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = centered_X[[row, col]];
            }
        }
        Ok(result)
    }
    /// Truncated-SVD placeholder: returns the first `n_comp` columns of
    /// `X` verbatim (no centering, no decomposition).
    ///
    /// NOTE(review): cached `components` are the identity and
    /// `singular_values` are ones; `algorithm` is ignored. Not a real
    /// SVD — confirm whether an implementation is planned.
    fn apply_truncated_svd(
        &self,
        n_components: usize,
        _algorithm: &SVDAlgorithm,
        components: &mut Option<Array2<f64>>,
        singular_values: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        // Component count can never exceed either matrix dimension.
        let n_comp = n_components.min(X.ncols()).min(X.nrows());
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *singular_values = Some(Array1::ones(n_comp));
        }
        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = X[[row, col]];
            }
        }
        Ok(result)
    }
    /// Dispatches a model-based selection step and returns the indices
    /// of the features to keep.
    ///
    /// Unhandled variants keep every feature.
    fn apply_model_selection(
        &self,
        model_selection: &mut ModelSelectionStep,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        match model_selection {
            ModelSelectionStep::CrossValidationSelection {
                estimator,
                cv_folds,
                scoring,
                feature_scores,
            } => self.apply_cv_selection(estimator, *cv_folds, scoring, feature_scores, X, y),
            ModelSelectionStep::ForwardSelection {
                estimator,
                max_features,
                scoring,
                selected_features,
            } => self.apply_forward_selection(
                estimator,
                *max_features,
                scoring,
                selected_features,
                X,
                y,
            ),
            // Unimplemented strategies keep all features.
            _ => Ok((0..X.ncols()).collect()),
        }
    }
1040 fn apply_cv_selection(
1041 &self,
1042 _estimator: &ModelEstimator,
1043 _cv_folds: usize,
1044 _scoring: &ScoringMetric,
1045 feature_scores: &mut Option<Array1<f64>>,
1046 X: ArrayView2<f64>,
1047 y: ArrayView1<f64>,
1048 ) -> Result<Vec<usize>> {
1049 if feature_scores.is_none() {
1050 let mut scores = Array1::zeros(X.ncols());
1051 for col in 0..X.ncols() {
1052 scores[col] = self.compute_correlation(X.column(col), y).abs();
1053 }
1054 *feature_scores = Some(scores);
1055 }
1056 if let Some(ref scores) = feature_scores {
1057 let mut indexed_scores: Vec<(usize, f64)> = scores
1058 .iter()
1059 .enumerate()
1060 .map(|(i, &score)| (i, score))
1061 .collect();
1062 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
1063 let n_select = X.ncols() / 2;
1064 Ok(indexed_scores
1065 .into_iter()
1066 .take(n_select)
1067 .map(|(idx, _)| idx)
1068 .collect())
1069 } else {
1070 Ok((0..X.ncols()).collect())
1071 }
1072 }
1073 fn apply_forward_selection(
1074 &self,
1075 _estimator: &ModelEstimator,
1076 max_features: usize,
1077 _scoring: &ScoringMetric,
1078 selected_features: &mut Option<Vec<usize>>,
1079 X: ArrayView2<f64>,
1080 y: ArrayView1<f64>,
1081 ) -> Result<Vec<usize>> {
1082 if selected_features.is_none() {
1083 let mut scores = Vec::new();
1084 for col in 0..X.ncols() {
1085 let score = self.compute_correlation(X.column(col), y).abs();
1086 scores.push((col, score));
1087 }
1088 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
1089 let features: Vec<usize> = scores
1090 .into_iter()
1091 .take(max_features.min(X.ncols()))
1092 .map(|(idx, _)| idx)
1093 .collect();
1094 *selected_features = Some(features.clone());
1095 Ok(features)
1096 } else {
1097 Ok(selected_features
1098 .as_ref()
1099 .expect("operation should succeed")
1100 .clone())
1101 }
1102 }
1103 fn compute_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
1104 let n = x.len() as f64;
1105 if n < 2.0 {
1106 return 0.0;
1107 }
1108 let mean_x = x.mean().unwrap_or(0.0);
1109 let mean_y = y.mean().unwrap_or(0.0);
1110 let mut sum_xy = 0.0;
1111 let mut sum_x2 = 0.0;
1112 let mut sum_y2 = 0.0;
1113 for i in 0..x.len() {
1114 let dx = x[i] - mean_x;
1115 let dy = y[i] - mean_y;
1116 sum_xy += dx * dy;
1117 sum_x2 += dx * dx;
1118 sum_y2 += dy * dy;
1119 }
1120 let denom = (sum_x2 * sum_y2).sqrt();
1121 if denom < 1e-10 {
1122 0.0
1123 } else {
1124 sum_xy / denom
1125 }
1126 }
    /// Chi-squared-style feature score.
    ///
    /// NOTE(review): placeholder — implemented as |Pearson correlation|, not a
    /// real chi² statistic; replace when a proper implementation exists.
    fn compute_chi2_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }
    /// F-statistic-style feature score.
    ///
    /// NOTE(review): placeholder — implemented as |Pearson correlation|.
    fn compute_f_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }
    /// Mutual-information-style feature score.
    ///
    /// NOTE(review): placeholder — implemented as |Pearson correlation|, which
    /// captures only linear dependence.
    fn compute_mutual_info(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }
1136}
1137impl FeatureSelectionPipeline<Trained> {
1138 pub fn transform(&self, X: ArrayView2<f64>) -> Result<Array2<f64>> {
1139 let _start_time = Instant::now();
1140 let current_X = X.to_owned();
1141 Ok(current_X)
1142 }
1143 pub fn get_pipeline_info(&self) -> PipelineInfo {
1145 PipelineInfo {
1146 n_preprocessing_steps: self.preprocessing_steps.len(),
1147 n_feature_engineering_steps: self.feature_engineering_steps.len(),
1148 n_selection_methods: self.selection_methods.len(),
1149 has_dimensionality_reduction: self.dimensionality_reduction.is_some(),
1150 has_model_selection: self.model_selection.is_some(),
1151 config: self.pipeline_config.clone(),
1152 }
1153 }
1154}
/// Correlation estimator used by correlation-based filters.
#[derive(Debug, Clone)]
pub enum CorrelationMethod {
    /// Linear (Pearson) correlation.
    Pearson,
    /// Rank-based (Spearman) correlation.
    Spearman,
    /// Concordance-based (Kendall tau) correlation.
    Kendall,
}
/// Power-transform family for variance stabilization.
#[derive(Debug, Clone)]
pub enum PowerMethod {
    /// Yeo-Johnson transform.
    YeoJohnson,
    /// Box-Cox transform.
    BoxCox,
}
/// Parameters learned by a trained pipeline step, keyed by step kind.
#[derive(Debug)]
pub enum StepParameters {
    /// Opaque preprocessing parameters (downcast at the use site).
    Preprocessing(Box<dyn std::any::Any + Send + Sync>),
    /// Opaque feature-engineering parameters (downcast at the use site).
    FeatureEngineering(Box<dyn std::any::Any + Send + Sync>),
    /// Per-feature keep/drop mask from a selection step.
    Selection(Array1<bool>),
    /// Projection matrix from a dimensionality-reduction step.
    DimensionalityReduction(Array2<f64>),
    /// Indices of the features kept by a model-selection step.
    ModelSelection(Vec<usize>),
}
/// Cardinality of a transformation's input→output feature mapping.
#[derive(Debug, Clone)]
pub enum TransformationType {
    OneToOne,
    OneToMany,
    ManyToOne,
    ManyToMany,
}
/// Aggregated validation outcomes for a fitted pipeline.
#[derive(Debug, Clone)]
pub struct ValidationResults {
    // One score per cross-validation fold (presumably — confirm against producer).
    pub cross_validation_scores: Vec<f64>,
    pub stability_scores: Vec<f64>,
    pub robustness_scores: Vec<f64>,
    pub statistical_significance: bool,
}
/// Feature-selection algorithms supported by the pipeline.
///
/// `Option<…>` fields hold state produced during fitting and are expected to
/// be `None` before fit (NOTE(review): inferred from the `Option` types —
/// confirm against the fitting code).
#[derive(Debug, Clone)]
pub enum SelectionMethod {
    /// Score each feature independently and keep the best ones.
    UnivariateFilter {
        method: UnivariateMethod,
        k: SelectionCount,
        score_func: UnivariateScoreFunction,
    },
    /// Iteratively eliminate the least important features (RFE).
    RecursiveFeatureElimination {
        estimator: RFEEstimator,
        n_features: SelectionCount,
        step: f64,
        importance_getter: ImportanceGetter,
    },
    /// Keep features whose model-derived importance clears a threshold.
    SelectFromModel {
        estimator: ModelEstimator,
        threshold: SelectionThreshold,
        prefit: bool,
        max_features: Option<usize>,
    },
    /// Drop features whose variance is below `threshold`.
    VarianceThreshold {
        threshold: f64,
        feature_variance: Option<Array1<f64>>,
    },
    /// Drop features that are too correlated with each other.
    CorrelationFilter {
        threshold: f64,
        method: CorrelationMethod,
        correlation_matrix: Option<Array2<f64>>,
    },
    /// Select by mutual information with the target.
    MutualInformation {
        k: SelectionCount,
        discrete_features: Vec<bool>,
        random_state: Option<u64>,
    },
    /// L1-regularized linear model; zeroed coefficients drop features.
    LASSO {
        alpha: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    /// Combined L1/L2-regularized linear model.
    ElasticNet {
        alpha: f64,
        l1_ratio: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    /// Tree-ensemble feature importances.
    TreeBased {
        estimator_type: TreeEstimatorType,
        n_estimators: usize,
        max_depth: Option<usize>,
        feature_importances: Option<Array1<f64>>,
    },
    /// Evolutionary search over feature subsets.
    GeneticAlgorithm {
        population_size: usize,
        n_generations: usize,
        mutation_rate: f64,
        crossover_rate: f64,
        best_individuals: Option<Vec<Vec<bool>>>,
    },
    /// Particle-swarm search over feature subsets.
    ParticleSwarmOptimization {
        n_particles: usize,
        n_iterations: usize,
        inertia: f64,
        cognitive: f64,
        social: f64,
        best_positions: Option<Vec<Vec<f64>>>,
    },
    /// Simulated-annealing search over feature subsets.
    SimulatedAnnealing {
        initial_temp: f64,
        cooling_rate: f64,
        min_temp: f64,
        max_iter: usize,
        current_solution: Option<Vec<bool>>,
    },
}
/// Direction of a stepwise selection procedure.
#[derive(Debug, Clone)]
pub enum StepwiseDirection {
    /// Add features one at a time.
    Forward,
    /// Remove features one at a time.
    Backward,
    /// Alternate adding and removing.
    Both,
}
/// Strategy for filling in missing values.
#[derive(Debug, Clone)]
pub enum ImputationStrategy {
    Mean,
    Median,
    Mode,
    /// Fill with a fixed value (see `ImputerConfig::fill_value`).
    Constant,
    /// Impute from nearest neighbors.
    KNN,
    /// Iterative model-based imputation.
    Iterative,
}
/// Statistic used by a univariate filter.
#[derive(Debug, Clone)]
pub enum UnivariateMethod {
    Chi2,
    ANOVA,
    MutualInfo,
    Correlation,
}
/// Configuration for a quantile transformer.
#[derive(Debug, Clone)]
pub struct QuantileTransformerConfig {
    pub n_quantiles: usize,
    /// Target distribution of the transformed output.
    pub output_distribution: Distribution,
    /// Optional cap on samples used to estimate the quantiles.
    pub subsample: Option<usize>,
}
/// Configuration for missing-value imputation.
#[derive(Debug, Clone)]
pub struct ImputerConfig {
    pub strategy: ImputationStrategy,
    /// Value used when `strategy` is `Constant`.
    pub fill_value: Option<f64>,
    /// How missing entries are encoded in the input.
    pub missing_values: MissingValueIndicator,
}
/// Configuration for a robust (quantile-based) scaler.
#[derive(Debug, Clone)]
pub struct RobustScalerConfig {
    pub with_centering: bool,
    pub with_scaling: bool,
    /// (lower, upper) quantiles used to compute the scale.
    pub quantile_range: (f64, f64),
}
/// Bookkeeping collected while fitting/applying a pipeline.
#[derive(Debug, Clone)]
pub struct PipelineMetadata {
    pub total_training_time: Duration,
    pub total_transform_time: Duration,
    // Peak memory in bytes (presumably — unit not established here; confirm).
    pub memory_usage_peak: usize,
    /// Ratio of final to original feature count.
    pub feature_reduction_ratio: f64,
    /// Named performance metrics keyed by metric name.
    pub performance_metrics: HashMap<String, f64>,
    pub validation_results: Option<ValidationResults>,
}
/// Record of one fitted pipeline step.
#[derive(Debug)]
pub struct TrainedStep {
    pub step_type: String,
    /// Position of the step within the pipeline.
    pub step_index: usize,
    pub training_time: Duration,
    pub feature_count_before: usize,
    pub feature_count_after: usize,
    /// Learned parameters, by step kind.
    pub parameters: StepParameters,
}
/// Outlier-detection algorithm used by the outlier remover.
#[derive(Debug, Clone)]
pub enum OutlierMethod {
    IsolationForest,
    LocalOutlierFactor,
    OneClassSVM,
    EllipticEnvelope,
}
/// Scoring function for univariate filters (classification vs. regression).
#[derive(Debug, Clone)]
pub enum UnivariateScoreFunction {
    Chi2,
    FClassif,
    FRegression,
    MutualInfoClassif,
    MutualInfoRegression,
}
/// Configuration for a min-max scaler.
#[derive(Debug, Clone)]
pub struct MinMaxScalerConfig {
    /// Target (min, max) range of scaled values.
    pub feature_range: (f64, f64),
    /// Whether to clip transformed values to `feature_range`.
    pub clip: bool,
}
/// Fitted parameters of a robust scaler.
#[derive(Debug, Clone)]
pub struct RobustScalerParams {
    /// Per-feature center (subtracted).
    pub center: Array1<f64>,
    /// Per-feature scale (divided by).
    pub scale: Array1<f64>,
}
/// Global execution settings for a feature-selection pipeline.
#[derive(Debug, Clone)]
pub struct PipelineConfiguration {
    pub parallel_execution: bool,
    pub memory_optimization: MemoryOptimization,
    pub caching_strategy: CachingStrategy,
    pub validation_strategy: ValidationStrategy,
    pub error_handling: ErrorHandling,
    pub logging_level: LoggingLevel,
}
/// Base estimator used by recursive feature elimination.
#[derive(Debug, Clone)]
pub enum RFEEstimator {
    SVM,
    RandomForest,
    LinearRegression,
    LogisticRegression,
}
/// SVD solver choice for PCA.
#[derive(Debug, Clone)]
pub enum SVDSolver {
    /// Let the implementation pick a solver.
    Auto,
    /// Full (exact) decomposition.
    Full,
    /// Iterative ARPACK-style solver.
    Arpack,
    /// Randomized approximate solver.
    Randomized,
}
/// Dimensionality-reduction stages available to the pipeline.
///
/// `Option<…>` fields hold fitted state (NOTE(review): inferred from the
/// `Option` types — confirm they are `None` before fitting).
#[derive(Debug, Clone)]
pub enum DimensionalityReductionStep {
    /// Principal component analysis.
    PCA {
        n_components: usize,
        whiten: bool,
        svd_solver: SVDSolver,
        components: Option<Array2<f64>>,
        explained_variance: Option<Array1<f64>>,
    },
    /// Truncated singular value decomposition.
    TruncatedSVD {
        n_components: usize,
        algorithm: SVDAlgorithm,
        components: Option<Array2<f64>>,
        singular_values: Option<Array1<f64>>,
    },
    /// Independent component analysis.
    ICA {
        n_components: usize,
        algorithm: ICAAlgorithm,
        max_iter: usize,
        tol: f64,
        mixing_matrix: Option<Array2<f64>>,
        unmixing_matrix: Option<Array2<f64>>,
    },
    /// Factor analysis.
    FactorAnalysis {
        n_components: usize,
        max_iter: usize,
        tol: f64,
        loadings: Option<Array2<f64>>,
        noise_variance: Option<Array1<f64>>,
    },
    /// Uniform manifold approximation and projection.
    UMAP {
        n_components: usize,
        n_neighbors: usize,
        min_dist: f64,
        metric: DistanceMetric,
        embedding: Option<Array2<f64>>,
    },
    /// t-distributed stochastic neighbor embedding.
    TSNE {
        n_components: usize,
        perplexity: f64,
        early_exaggeration: f64,
        learning_rate: f64,
        max_iter: usize,
        embedding: Option<Array2<f64>>,
    },
}
/// Feature-engineering stages available to the pipeline.
///
/// `Option<…>` fields hold fitted state (NOTE(review): inferred from the
/// `Option` types — confirm against the fitting code).
#[derive(Debug, Clone)]
pub enum FeatureEngineeringStep {
    /// Polynomial expansion of the input features.
    PolynomialFeatures {
        degree: usize,
        /// Only cross terms, no pure powers.
        interaction_only: bool,
        include_bias: bool,
        feature_mapping: Option<Vec<(usize, usize)>>,
    },
    /// Pairwise interaction terms between selected features.
    InteractionFeatures {
        max_pairs: Option<usize>,
        threshold: f64,
        feature_pairs: Option<Vec<(usize, usize)>>,
    },
    /// Discretize features into bins.
    BinningFeatures {
        n_bins: usize,
        strategy: BinningStrategy,
        /// Learned bin edges per feature index.
        bin_edges: Option<HashMap<usize, Vec<f64>>>,
    },
    /// Encode categories by (smoothed) target statistics.
    TargetEncoding {
        smoothing: f64,
        min_samples_leaf: usize,
        encodings: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    /// Encode categories by their observed frequency.
    FrequencyEncoding {
        min_frequency: f64,
        frequencies: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    /// Ratios of numerator to denominator features.
    RatioFeatures {
        numerator_features: Vec<usize>,
        denominator_features: Vec<usize>,
        /// Small constant guarding against division by zero.
        eps: f64,
    },
    /// Lagged copies of (a subset of) features, for time-series data.
    LaggingFeatures {
        lags: Vec<usize>,
        feature_subset: Option<Vec<usize>>,
    },
    /// Rolling-window statistics over (a subset of) features.
    WindowStatistics {
        window_size: usize,
        statistics: Vec<WindowStatistic>,
        feature_subset: Option<Vec<usize>>,
    },
}
/// Configuration for a power transformer.
#[derive(Debug, Clone)]
pub struct PowerTransformerConfig {
    pub method: PowerMethod,
    /// Whether to standardize the output after transforming.
    pub standardize: bool,
}
/// Configuration for outlier removal.
#[derive(Debug, Clone)]
pub struct OutlierConfig {
    pub method: OutlierMethod,
    pub threshold: f64,
    /// Expected fraction of outliers in the data.
    pub contamination: f64,
}
/// How bin edges are chosen when discretizing features.
#[derive(Debug, Clone)]
pub enum BinningStrategy {
    /// Equal-width bins.
    Uniform,
    /// Equal-frequency bins.
    Quantile,
    /// Bins from k-means cluster centers.
    KMeans,
}
/// Fitted parameters of a quantile transformer.
#[derive(Debug, Clone)]
pub struct QuantileParams {
    // Presumably one row/column of quantiles per feature — confirm layout.
    pub quantiles: Array2<f64>,
    pub references: Array1<f64>,
}
/// Target output distribution of a quantile transformer.
#[derive(Debug, Clone)]
pub enum Distribution {
    Uniform,
    Normal,
}
/// Fitted parameters of a min-max scaler.
#[derive(Debug, Clone)]
pub struct MinMaxScalerParams {
    pub min: Array1<f64>,
    pub scale: Array1<f64>,
}
/// Fitted parameters of an outlier detector.
#[derive(Debug, Clone)]
pub struct OutlierParams {
    /// Per-sample decision scores.
    pub decision_function: Array1<f64>,
    pub threshold: f64,
}
/// Metric used to score candidate feature subsets.
#[derive(Debug, Clone)]
pub enum ScoringMetric {
    Accuracy,
    F1,
    RocAuc,
    R2,
    /// Mean absolute error.
    MAE,
    /// Mean squared error.
    MSE,
    LogLoss,
}
/// Structural summary of a pipeline (see `get_pipeline_info`).
#[derive(Debug, Clone)]
pub struct PipelineInfo {
    pub n_preprocessing_steps: usize,
    pub n_feature_engineering_steps: usize,
    pub n_selection_methods: usize,
    pub has_dimensionality_reduction: bool,
    pub has_model_selection: bool,
    pub config: PipelineConfiguration,
}
/// Algorithm used for truncated SVD.
#[derive(Debug, Clone)]
pub enum SVDAlgorithm {
    Randomized,
    Arpack,
}
/// Memory prefetch strategy for the optimization configuration.
#[derive(Debug, Clone)]
pub enum PrefetchStrategy {
    None,
    Sequential,
    Random,
    Adaptive,
}
/// Fitted parameters of a power transformer.
#[derive(Debug, Clone)]
pub struct PowerParams {
    /// Per-feature transform exponents (lambda values).
    pub lambdas: Array1<f64>,
}
/// Model-driven feature-selection strategies.
///
/// `Option<…>` fields cache the result of running the step (NOTE(review):
/// inferred from the `Option` types and the `apply_*` methods above, which
/// populate them lazily).
#[derive(Debug, Clone)]
pub enum ModelSelectionStep {
    /// Select features using cross-validated scores.
    CrossValidationSelection {
        estimator: ModelEstimator,
        cv_folds: usize,
        scoring: ScoringMetric,
        feature_scores: Option<Array1<f64>>,
    },
    /// Greedily add features up to `max_features`.
    ForwardSelection {
        estimator: ModelEstimator,
        max_features: usize,
        scoring: ScoringMetric,
        selected_features: Option<Vec<usize>>,
    },
    /// Greedily remove features down to `min_features`.
    BackwardElimination {
        estimator: ModelEstimator,
        min_features: usize,
        scoring: ScoringMetric,
        remaining_features: Option<Vec<usize>>,
    },
    /// Stepwise selection with entry/removal p-value thresholds.
    StepwiseSelection {
        estimator: ModelEstimator,
        direction: StepwiseDirection,
        p_enter: f64,
        p_remove: f64,
        selected_features: Option<Vec<usize>>,
    },
    /// Bayesian optimization over feature subsets.
    BayesianOptimization {
        estimator: ModelEstimator,
        acquisition_function: AcquisitionFunction,
        n_calls: usize,
        optimal_features: Option<Vec<usize>>,
    },
}
/// One entry in a `FeatureMapping`'s transformation history.
#[derive(Debug, Clone)]
pub struct TransformationStep {
    pub step_name: String,
    /// Number of features entering the step.
    pub input_features: usize,
    /// Number of features leaving the step.
    pub output_features: usize,
    pub transformation_type: TransformationType,
}
/// How feature importances are read from an RFE estimator.
#[derive(Debug, Clone)]
pub enum ImportanceGetter {
    /// Choose automatically based on the estimator.
    Auto,
    /// Use model coefficients.
    Coefficients,
    /// Use tree/ensemble feature importances.
    FeatureImportances,
}
/// FastICA update scheme.
#[derive(Debug, Clone)]
pub enum ICAAlgorithm {
    /// Estimate all components simultaneously.
    Parallel,
    /// Estimate components one at a time.
    Deflation,
}
/// How the pipeline reacts to step failures.
#[derive(Debug, Clone)]
pub enum ErrorHandling {
    /// Fail fast on any error.
    Strict,
    /// Attempt to continue past recoverable errors.
    Graceful,
    /// Log errors and continue.
    Logging,
}
/// Preprocessing stages available to the pipeline.
///
/// Each variant pairs its configuration with `trained_params`, which is
/// `None` until the step has been fitted (NOTE(review): inferred from the
/// `Option` types — confirm against the fitting code).
#[derive(Debug, Clone)]
pub enum PreprocessingStep {
    /// Zero-mean / unit-variance scaling.
    StandardScaler {
        config: StandardScalerConfig,
        trained_params: Option<ScalerParams>,
    },
    /// Median/IQR-based scaling, robust to outliers.
    RobustScaler {
        config: RobustScalerConfig,
        trained_params: Option<RobustScalerParams>,
    },
    /// Scale features into a fixed range.
    MinMaxScaler {
        config: MinMaxScalerConfig,
        trained_params: Option<MinMaxScalerParams>,
    },
    /// Map features onto a target distribution via quantiles.
    QuantileTransformer {
        config: QuantileTransformerConfig,
        trained_params: Option<QuantileParams>,
    },
    /// Power transform (Box-Cox / Yeo-Johnson).
    PowerTransformer {
        config: PowerTransformerConfig,
        trained_params: Option<PowerParams>,
    },
    /// Fill in missing values.
    MissingValueImputer {
        config: ImputerConfig,
        trained_params: Option<ImputerParams>,
    },
    /// Detect and remove outlying samples.
    OutlierRemover {
        config: OutlierConfig,
        trained_params: Option<OutlierParams>,
    },
}
/// Type-state marker for a pipeline that has not been fitted yet
/// (counterpart of the `Trained` marker).
#[derive(Debug, Clone, Default)]
pub struct Untrained;
/// Estimator used by model-based selection steps.
#[derive(Debug, Clone)]
pub enum ModelEstimator {
    LinearRegression,
    LogisticRegression,
    RandomForest,
    SVM,
    XGBoost,
    LightGBM,
}
/// Acquisition function for Bayesian-optimization selection.
#[derive(Debug, Clone)]
pub enum AcquisitionFunction {
    ExpectedImprovement,
    UpperConfidenceBound,
    ProbabilityOfImprovement,
}
/// Importance threshold used by `SelectFromModel`.
#[derive(Debug, Clone)]
pub enum SelectionThreshold {
    /// Mean of the importances.
    Mean,
    /// Median of the importances.
    Median,
    /// Fixed absolute cutoff.
    Absolute(f64),
    /// Percentile of the importances.
    Percentile(f64),
    /// Implementation-chosen threshold.
    Auto,
}
/// How many features a selection step keeps.
#[derive(Debug, Clone)]
pub enum SelectionCount {
    /// Keep exactly `k` features.
    K(usize),
    /// Keep a percentage of the features.
    Percentile(f64),
    /// Control the false discovery rate.
    FDR(f64),
    /// Control the false positive rate.
    FPR(f64),
    /// Control the family-wise error rate.
    FWER(f64),
}
/// Fitted parameters of a missing-value imputer.
#[derive(Debug, Clone)]
pub struct ImputerParams {
    /// Per-feature fill statistic (mean/median/mode, per the strategy).
    pub statistics: Array1<f64>,
}
/// Provenance of a feature in the final feature set.
#[derive(Debug, Clone)]
pub enum FeatureOrigin {
    /// Passed through unchanged; holds the original column index.
    Original(usize),
    /// Created from one or more source features by a named operation.
    Engineered {
        source_features: Vec<usize>,
        operation: String,
    },
    /// Derived from a single source feature by a named transformation.
    Transformed {
        source_feature: usize,
        transformation: String,
    },
}