use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use sklears_core::error::{Result as SklResult, SklearsError};
use sklears_core::traits::{Estimator, Fit, Transform};
use std::collections::HashMap;
use std::marker::PhantomData;
use std::time::{Duration, Instant};

type Result<T> = SklResult<T>;

/// Composable feature-selection pipeline that uses a type-state parameter
/// (`Untrained`/`Trained`) to separate configuration from fitted use.
#[derive(Debug, Clone)]
pub struct FeatureSelectionPipeline<State = Untrained> {
    preprocessing_steps: Vec<PreprocessingStep>,
    feature_engineering_steps: Vec<FeatureEngineeringStep>,
    selection_methods: Vec<SelectionMethod>,
    dimensionality_reduction: Option<DimensionalityReductionStep>,
    model_selection: Option<ModelSelectionStep>,
    pipeline_config: PipelineConfiguration,
    optimization_config: OptimizationConfiguration,
    _phantom: PhantomData<State>,
}

/// Type-state marker for a pipeline that has not been fitted yet.
#[derive(Debug, Clone, Default)]
pub struct Untrained;

/// Type-state marker holding artifacts produced during fitting.
#[derive(Debug)]
pub struct Trained {
    trained_steps: Vec<TrainedStep>,
    feature_mapping: FeatureMapping,
    pipeline_metadata: PipelineMetadata,
}

#[derive(Debug, Clone)]
pub enum PreprocessingStep {
    StandardScaler {
        config: StandardScalerConfig,
        trained_params: Option<ScalerParams>,
    },
    RobustScaler {
        config: RobustScalerConfig,
        trained_params: Option<RobustScalerParams>,
    },
    MinMaxScaler {
        config: MinMaxScalerConfig,
        trained_params: Option<MinMaxScalerParams>,
    },
    QuantileTransformer {
        config: QuantileTransformerConfig,
        trained_params: Option<QuantileParams>,
    },
    PowerTransformer {
        config: PowerTransformerConfig,
        trained_params: Option<PowerParams>,
    },
    MissingValueImputer {
        config: ImputerConfig,
        trained_params: Option<ImputerParams>,
    },
    OutlierRemover {
        config: OutlierConfig,
        trained_params: Option<OutlierParams>,
    },
}

#[derive(Debug, Clone)]
pub enum FeatureEngineeringStep {
    PolynomialFeatures {
        degree: usize,
        interaction_only: bool,
        include_bias: bool,
        feature_mapping: Option<Vec<(usize, usize)>>,
    },
    InteractionFeatures {
        max_pairs: Option<usize>,
        threshold: f64,
        feature_pairs: Option<Vec<(usize, usize)>>,
    },
    BinningFeatures {
        n_bins: usize,
        strategy: BinningStrategy,
        bin_edges: Option<HashMap<usize, Vec<f64>>>,
    },
    TargetEncoding {
        smoothing: f64,
        min_samples_leaf: usize,
        encodings: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    FrequencyEncoding {
        min_frequency: f64,
        frequencies: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    RatioFeatures {
        numerator_features: Vec<usize>,
        denominator_features: Vec<usize>,
        eps: f64,
    },
    LaggingFeatures {
        lags: Vec<usize>,
        feature_subset: Option<Vec<usize>>,
    },
    WindowStatistics {
        window_size: usize,
        statistics: Vec<WindowStatistic>,
        feature_subset: Option<Vec<usize>>,
    },
}

#[derive(Debug, Clone)]
pub enum SelectionMethod {
    UnivariateFilter {
        method: UnivariateMethod,
        k: SelectionCount,
        score_func: UnivariateScoreFunction,
    },
    RecursiveFeatureElimination {
        estimator: RFEEstimator,
        n_features: SelectionCount,
        step: f64,
        importance_getter: ImportanceGetter,
    },
    SelectFromModel {
        estimator: ModelEstimator,
        threshold: SelectionThreshold,
        prefit: bool,
        max_features: Option<usize>,
    },
    VarianceThreshold {
        threshold: f64,
        feature_variance: Option<Array1<f64>>,
    },
    CorrelationFilter {
        threshold: f64,
        method: CorrelationMethod,
        correlation_matrix: Option<Array2<f64>>,
    },
    MutualInformation {
        k: SelectionCount,
        discrete_features: Vec<bool>,
        random_state: Option<u64>,
    },
    LASSO {
        alpha: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    ElasticNet {
        alpha: f64,
        l1_ratio: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    TreeBased {
        estimator_type: TreeEstimatorType,
        n_estimators: usize,
        max_depth: Option<usize>,
        feature_importances: Option<Array1<f64>>,
    },
    GeneticAlgorithm {
        population_size: usize,
        n_generations: usize,
        mutation_rate: f64,
        crossover_rate: f64,
        best_individuals: Option<Vec<Vec<bool>>>,
    },
    ParticleSwarmOptimization {
        n_particles: usize,
        n_iterations: usize,
        inertia: f64,
        cognitive: f64,
        social: f64,
        best_positions: Option<Vec<Vec<f64>>>,
    },
    SimulatedAnnealing {
        initial_temp: f64,
        cooling_rate: f64,
        min_temp: f64,
        max_iter: usize,
        current_solution: Option<Vec<bool>>,
    },
}

#[derive(Debug, Clone)]
pub enum DimensionalityReductionStep {
    PCA {
        n_components: usize,
        whiten: bool,
        svd_solver: SVDSolver,
        components: Option<Array2<f64>>,
        explained_variance: Option<Array1<f64>>,
    },
    TruncatedSVD {
        n_components: usize,
        algorithm: SVDAlgorithm,
        components: Option<Array2<f64>>,
        singular_values: Option<Array1<f64>>,
    },
    ICA {
        n_components: usize,
        algorithm: ICAAlgorithm,
        max_iter: usize,
        tol: f64,
        mixing_matrix: Option<Array2<f64>>,
        unmixing_matrix: Option<Array2<f64>>,
    },
    FactorAnalysis {
        n_components: usize,
        max_iter: usize,
        tol: f64,
        loadings: Option<Array2<f64>>,
        noise_variance: Option<Array1<f64>>,
    },
    UMAP {
        n_components: usize,
        n_neighbors: usize,
        min_dist: f64,
        metric: DistanceMetric,
        embedding: Option<Array2<f64>>,
    },
    TSNE {
        n_components: usize,
        perplexity: f64,
        early_exaggeration: f64,
        learning_rate: f64,
        max_iter: usize,
        embedding: Option<Array2<f64>>,
    },
}

#[derive(Debug, Clone)]
pub enum ModelSelectionStep {
    CrossValidationSelection {
        estimator: ModelEstimator,
        cv_folds: usize,
        scoring: ScoringMetric,
        feature_scores: Option<Array1<f64>>,
    },
    ForwardSelection {
        estimator: ModelEstimator,
        max_features: usize,
        scoring: ScoringMetric,
        selected_features: Option<Vec<usize>>,
    },
    BackwardElimination {
        estimator: ModelEstimator,
        min_features: usize,
        scoring: ScoringMetric,
        remaining_features: Option<Vec<usize>>,
    },
    StepwiseSelection {
        estimator: ModelEstimator,
        direction: StepwiseDirection,
        p_enter: f64,
        p_remove: f64,
        selected_features: Option<Vec<usize>>,
    },
    BayesianOptimization {
        estimator: ModelEstimator,
        acquisition_function: AcquisitionFunction,
        n_calls: usize,
        optimal_features: Option<Vec<usize>>,
    },
}

#[derive(Debug, Clone)]
pub enum SelectionCount {
    K(usize),
    Percentile(f64),
    FDR(f64),
    FPR(f64),
    FWER(f64),
}

#[derive(Debug, Clone)]
pub enum SelectionThreshold {
    Mean,
    Median,
    Absolute(f64),
    Percentile(f64),
    Auto,
}

#[derive(Debug, Clone)]
pub struct PipelineConfiguration {
    pub parallel_execution: bool,
    pub memory_optimization: MemoryOptimization,
    pub caching_strategy: CachingStrategy,
    pub validation_strategy: ValidationStrategy,
    pub error_handling: ErrorHandling,
    pub logging_level: LoggingLevel,
}

#[derive(Debug, Clone)]
pub struct OptimizationConfiguration {
    pub use_simd: bool,
    pub chunk_size: usize,
    pub thread_pool_size: Option<usize>,
    pub memory_pool_size: usize,
    pub cache_size: usize,
    pub prefetch_strategy: PrefetchStrategy,
    pub vectorization_threshold: usize,
}

#[derive(Debug, Clone)]
pub enum MemoryOptimization {
    None,
    Conservative,
    Aggressive,
}

#[derive(Debug, Clone)]
pub enum CachingStrategy {
    None,
    LRU { size: usize },
    LFU { size: usize },
    FIFO { size: usize },
}

#[derive(Debug, Clone)]
pub enum ValidationStrategy {
    None,
    Basic,
    Comprehensive,
    Statistical,
}

#[derive(Debug, Clone)]
pub enum ErrorHandling {
    Strict,
    Graceful,
    Logging,
}

#[derive(Debug, Clone)]
pub enum LoggingLevel {
    None,
    Error,
    Warning,
    Info,
    Debug,
    Trace,
}

#[derive(Debug, Clone)]
pub enum PrefetchStrategy {
    None,
    Sequential,
    Random,
    Adaptive,
}

#[derive(Debug, Clone)]
pub struct StandardScalerConfig {
    pub with_mean: bool,
    pub with_std: bool,
}

#[derive(Debug, Clone)]
pub struct ScalerParams {
    pub mean: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct RobustScalerConfig {
    pub with_centering: bool,
    pub with_scaling: bool,
    pub quantile_range: (f64, f64),
}

#[derive(Debug, Clone)]
pub struct RobustScalerParams {
    pub center: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct MinMaxScalerConfig {
    pub feature_range: (f64, f64),
    pub clip: bool,
}

#[derive(Debug, Clone)]
pub struct MinMaxScalerParams {
    pub min: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct QuantileTransformerConfig {
    pub n_quantiles: usize,
    pub output_distribution: Distribution,
    pub subsample: Option<usize>,
}

#[derive(Debug, Clone)]
pub struct QuantileParams {
    pub quantiles: Array2<f64>,
    pub references: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct PowerTransformerConfig {
    pub method: PowerMethod,
    pub standardize: bool,
}

#[derive(Debug, Clone)]
pub struct PowerParams {
    pub lambdas: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct ImputerConfig {
    pub strategy: ImputationStrategy,
    pub fill_value: Option<f64>,
    pub missing_values: MissingValueIndicator,
}

#[derive(Debug, Clone)]
pub struct ImputerParams {
    pub statistics: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct OutlierConfig {
    pub method: OutlierMethod,
    pub threshold: f64,
    pub contamination: f64,
}

#[derive(Debug, Clone)]
pub struct OutlierParams {
    pub decision_function: Array1<f64>,
    pub threshold: f64,
}

#[derive(Debug, Clone)]
pub enum BinningStrategy {
    Uniform,
    Quantile,
    KMeans,
}

#[derive(Debug, Clone)]
pub enum WindowStatistic {
    Mean,
    Std,
    Min,
    Max,
    Median,
    Skewness,
    Kurtosis,
}

#[derive(Debug, Clone)]
pub enum UnivariateMethod {
    Chi2,
    ANOVA,
    MutualInfo,
    Correlation,
}

#[derive(Debug, Clone)]
pub enum UnivariateScoreFunction {
    Chi2,
    FClassif,
    FRegression,
    MutualInfoClassif,
    MutualInfoRegression,
}

#[derive(Debug, Clone)]
pub enum RFEEstimator {
    SVM,
    RandomForest,
    LinearRegression,
    LogisticRegression,
}

#[derive(Debug, Clone)]
pub enum ImportanceGetter {
    Auto,
    Coefficients,
    FeatureImportances,
}

#[derive(Debug, Clone)]
pub enum ModelEstimator {
    LinearRegression,
    LogisticRegression,
    RandomForest,
    SVM,
    XGBoost,
    LightGBM,
}

#[derive(Debug, Clone)]
pub enum CorrelationMethod {
    Pearson,
    Spearman,
    Kendall,
}

#[derive(Debug, Clone)]
pub enum TreeEstimatorType {
    RandomForest,
    ExtraTrees,
    GradientBoosting,
    AdaBoost,
}

#[derive(Debug, Clone)]
pub enum SVDSolver {
    Auto,
    Full,
    Arpack,
    Randomized,
}

#[derive(Debug, Clone)]
pub enum SVDAlgorithm {
    Randomized,
    Arpack,
}

#[derive(Debug, Clone)]
pub enum ICAAlgorithm {
    Parallel,
    Deflation,
}

#[derive(Debug, Clone)]
pub enum DistanceMetric {
    Euclidean,
    Manhattan,
    Cosine,
    Hamming,
}

#[derive(Debug, Clone)]
pub enum ScoringMetric {
    Accuracy,
    F1,
    RocAuc,
    R2,
    MAE,
    MSE,
    LogLoss,
}

#[derive(Debug, Clone)]
pub enum StepwiseDirection {
    Forward,
    Backward,
    Both,
}

#[derive(Debug, Clone)]
pub enum AcquisitionFunction {
    ExpectedImprovement,
    UpperConfidenceBound,
    ProbabilityOfImprovement,
}

#[derive(Debug, Clone)]
pub enum Distribution {
    Uniform,
    Normal,
}

#[derive(Debug, Clone)]
pub enum PowerMethod {
    YeoJohnson,
    BoxCox,
}

#[derive(Debug, Clone)]
pub enum ImputationStrategy {
    Mean,
    Median,
    Mode,
    Constant,
    KNN,
    Iterative,
}

#[derive(Debug, Clone)]
pub enum MissingValueIndicator {
    NaN,
    Value(f64),
}

#[derive(Debug, Clone)]
pub enum OutlierMethod {
    IsolationForest,
    LocalOutlierFactor,
    OneClassSVM,
    EllipticEnvelope,
}

#[derive(Debug)]
pub struct TrainedStep {
    pub step_type: String,
    pub step_index: usize,
    pub training_time: Duration,
    pub feature_count_before: usize,
    pub feature_count_after: usize,
    pub parameters: StepParameters,
}

pub enum StepParameters {
    Preprocessing(Box<dyn std::any::Any + Send + Sync>),
    FeatureEngineering(Box<dyn std::any::Any + Send + Sync>),
    Selection(Array1<bool>),
    DimensionalityReduction(Array2<f64>),
    ModelSelection(Vec<usize>),
}

// `Box<dyn Any>` does not implement `Debug`, so `#[derive(Debug)]` cannot be
// used on this enum; this manual impl elides the opaque payloads instead.
impl std::fmt::Debug for StepParameters {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Preprocessing(_) => f.write_str("Preprocessing(..)"),
            Self::FeatureEngineering(_) => f.write_str("FeatureEngineering(..)"),
            Self::Selection(mask) => f.debug_tuple("Selection").field(mask).finish(),
            Self::DimensionalityReduction(m) => {
                f.debug_tuple("DimensionalityReduction").field(m).finish()
            }
            Self::ModelSelection(idx) => f.debug_tuple("ModelSelection").field(idx).finish(),
        }
    }
}

#[derive(Debug, Clone)]
pub struct FeatureMapping {
    pub original_features: usize,
    pub final_features: usize,
    pub feature_names: Vec<String>,
    pub feature_origins: Vec<FeatureOrigin>,
    pub transformation_history: Vec<TransformationStep>,
}

#[derive(Debug, Clone)]
pub enum FeatureOrigin {
    Original(usize),
    Engineered {
        source_features: Vec<usize>,
        operation: String,
    },
    Transformed {
        source_feature: usize,
        transformation: String,
    },
}

#[derive(Debug, Clone)]
pub struct TransformationStep {
    pub step_name: String,
    pub input_features: usize,
    pub output_features: usize,
    pub transformation_type: TransformationType,
}

#[derive(Debug, Clone)]
pub enum TransformationType {
    OneToOne,
    OneToMany,
    ManyToOne,
    ManyToMany,
}

#[derive(Debug, Clone)]
pub struct PipelineMetadata {
    pub total_training_time: Duration,
    pub total_transform_time: Duration,
    pub memory_usage_peak: usize,
    pub feature_reduction_ratio: f64,
    pub performance_metrics: HashMap<String, f64>,
    pub validation_results: Option<ValidationResults>,
}

#[derive(Debug, Clone)]
pub struct ValidationResults {
    pub cross_validation_scores: Vec<f64>,
    pub stability_scores: Vec<f64>,
    pub robustness_scores: Vec<f64>,
    pub statistical_significance: bool,
}

impl Default for FeatureSelectionPipeline<Untrained> {
    fn default() -> Self {
        Self::new()
    }
}

impl FeatureSelectionPipeline<Untrained> {
    pub fn new() -> Self {
        Self {
            preprocessing_steps: Vec::new(),
            feature_engineering_steps: Vec::new(),
            selection_methods: Vec::new(),
            dimensionality_reduction: None,
            model_selection: None,
            pipeline_config: PipelineConfiguration::default(),
            optimization_config: OptimizationConfiguration::default(),
            _phantom: PhantomData,
        }
    }

    pub fn add_preprocessing_step(mut self, step: PreprocessingStep) -> Self {
        self.preprocessing_steps.push(step);
        self
    }

    pub fn add_feature_engineering_step(mut self, step: FeatureEngineeringStep) -> Self {
        self.feature_engineering_steps.push(step);
        self
    }

    pub fn add_selection_method(mut self, method: SelectionMethod) -> Self {
        self.selection_methods.push(method);
        self
    }

    pub fn with_dimensionality_reduction(mut self, reduction: DimensionalityReductionStep) -> Self {
        self.dimensionality_reduction = Some(reduction);
        self
    }

    pub fn with_model_selection(mut self, model_selection: ModelSelectionStep) -> Self {
        self.model_selection = Some(model_selection);
        self
    }

    pub fn with_config(mut self, config: PipelineConfiguration) -> Self {
        self.pipeline_config = config;
        self
    }

    pub fn with_optimization(mut self, config: OptimizationConfiguration) -> Self {
        self.optimization_config = config;
        self
    }

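    // A hedged, illustrative sketch (not part of the original API surface):
    // it shows how the builder methods above compose into a configured
    // pipeline. The concrete step choices are arbitrary examples.
    #[allow(dead_code)]
    fn example_configuration() -> Self {
        Self::new()
            .add_preprocessing_step(PreprocessingStep::StandardScaler {
                config: StandardScalerConfig {
                    with_mean: true,
                    with_std: true,
                },
                trained_params: None,
            })
            .add_selection_method(SelectionMethod::VarianceThreshold {
                threshold: 1e-3,
                feature_variance: None,
            })
            .with_config(PipelineConfiguration::default())
    }
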
    pub fn fit(
        mut self,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<FeatureSelectionPipeline<Trained>> {
        let start_time = Instant::now();
        let mut current_X = X.to_owned();
        let current_y = y.to_owned();
        let mut trained_steps = Vec::new();
        let original_features = X.ncols();

        let mut preprocessing_steps = std::mem::take(&mut self.preprocessing_steps);
        for (idx, step) in preprocessing_steps.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            current_X = Self::apply_preprocessing_step_static(step, current_X.view())?;

            trained_steps.push(TrainedStep {
                step_type: "Preprocessing".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::Preprocessing(Box::new(())),
            });
        }
        self.preprocessing_steps = preprocessing_steps;

        let mut feature_engineering_steps = std::mem::take(&mut self.feature_engineering_steps);
        for (idx, step) in feature_engineering_steps.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            current_X = Self::apply_feature_engineering_step_static(
                step,
                current_X.view(),
                current_y.view(),
            )?;

            trained_steps.push(TrainedStep {
                step_type: "FeatureEngineering".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::FeatureEngineering(Box::new(())),
            });
        }
        self.feature_engineering_steps = feature_engineering_steps;

        // Selection methods vote on a shared mask: a feature survives only if
        // every method keeps it.
        let mut selection_mask = Array1::from_elem(current_X.ncols(), true);
        let mut selection_methods = std::mem::take(&mut self.selection_methods);
        for (idx, method) in selection_methods.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let method_mask =
                Self::apply_selection_method_static(method, current_X.view(), current_y.view())?;

            for (i, &selected) in method_mask.iter().enumerate() {
                if !selected {
                    selection_mask[i] = false;
                }
            }

            trained_steps.push(TrainedStep {
                step_type: "Selection".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: selection_mask.iter().filter(|&&x| x).count(),
                parameters: StepParameters::Selection(method_mask),
            });
        }
        self.selection_methods = selection_methods;

        let selected_indices: Vec<usize> = selection_mask
            .iter()
            .enumerate()
            .filter_map(|(i, &selected)| if selected { Some(i) } else { None })
            .collect();

        if !selected_indices.is_empty() {
            let mut selected_X = Array2::zeros((current_X.nrows(), selected_indices.len()));
            for (new_col, &old_col) in selected_indices.iter().enumerate() {
                for row in 0..current_X.nrows() {
                    selected_X[[row, new_col]] = current_X[[row, old_col]];
                }
            }
            current_X = selected_X;
        }

        if self.dimensionality_reduction.is_some() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let mut reduction = self.dimensionality_reduction.take().unwrap();
            current_X = self.apply_dimensionality_reduction(&mut reduction, current_X.view())?;
            self.dimensionality_reduction = Some(reduction);

            trained_steps.push(TrainedStep {
                step_type: "DimensionalityReduction".to_string(),
                step_index: 0,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::DimensionalityReduction(Array2::zeros((1, 1))),
            });
        }

        if self.model_selection.is_some() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let mut model_sel = self.model_selection.take().unwrap();
            let selected_features =
                self.apply_model_selection(&mut model_sel, current_X.view(), current_y.view())?;
            self.model_selection = Some(model_sel);

            if !selected_features.is_empty() {
                let mut model_selected_X =
                    Array2::zeros((current_X.nrows(), selected_features.len()));
                for (new_col, &old_col) in selected_features.iter().enumerate() {
                    for row in 0..current_X.nrows() {
                        model_selected_X[[row, new_col]] = current_X[[row, old_col]];
                    }
                }
                current_X = model_selected_X;
            }

            trained_steps.push(TrainedStep {
                step_type: "ModelSelection".to_string(),
                step_index: 0,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::ModelSelection(selected_features),
            });
        }

        // The bookkeeping below is computed for diagnostics but not yet stored
        // in the returned pipeline (hence the leading underscores).
        let final_features = current_X.ncols();
        let _feature_mapping = FeatureMapping {
            original_features,
            final_features,
            feature_names: (0..final_features)
                .map(|i| format!("feature_{}", i))
                .collect(),
            feature_origins: (0..final_features).map(FeatureOrigin::Original).collect(),
            transformation_history: trained_steps
                .iter()
                .map(|step| TransformationStep {
                    step_name: step.step_type.clone(),
                    input_features: step.feature_count_before,
                    output_features: step.feature_count_after,
                    transformation_type: TransformationType::ManyToMany,
                })
                .collect(),
        };

        let total_training_time = start_time.elapsed();
        let feature_reduction_ratio = final_features as f64 / original_features as f64;

        let _pipeline_metadata = PipelineMetadata {
            total_training_time,
            total_transform_time: Duration::from_secs(0),
            memory_usage_peak: 0,
            feature_reduction_ratio,
            performance_metrics: HashMap::new(),
            validation_results: None,
        };

        Ok(FeatureSelectionPipeline {
            preprocessing_steps: self.preprocessing_steps,
            feature_engineering_steps: self.feature_engineering_steps,
            selection_methods: self.selection_methods,
            dimensionality_reduction: self.dimensionality_reduction,
            model_selection: self.model_selection,
            pipeline_config: self.pipeline_config,
            optimization_config: self.optimization_config,
            _phantom: PhantomData::<Trained>,
        })
    }

    fn apply_preprocessing_step(
        &self,
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_preprocessing_step_static(step, X)
    }

    fn apply_preprocessing_step_static(
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            PreprocessingStep::StandardScaler {
                config,
                trained_params,
            } => Self::apply_standard_scaler_static(config, trained_params, X),
            PreprocessingStep::RobustScaler {
                config,
                trained_params,
            } => Self::apply_robust_scaler_static(config, trained_params, X),
            PreprocessingStep::MinMaxScaler {
                config,
                trained_params,
            } => Self::apply_minmax_scaler_static(config, trained_params, X),
            // Remaining preprocessing steps are not yet implemented and pass
            // the data through unchanged.
            _ => Ok(X.to_owned()),
        }
    }

    fn apply_standard_scaler(
        &self,
        config: &StandardScalerConfig,
        trained_params: &mut Option<ScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_standard_scaler_static(config, trained_params, X)
    }

    fn apply_standard_scaler_static(
        config: &StandardScalerConfig,
        trained_params: &mut Option<ScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        // First call: estimate per-column mean and standard deviation.
        if trained_params.is_none() {
            let mut mean = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            if config.with_mean {
                for col in 0..X.ncols() {
                    mean[col] = X.column(col).mean().unwrap_or(0.0);
                }
            }

            if config.with_std {
                for col in 0..X.ncols() {
                    let column = X.column(col);
                    let variance = column.var(1.0);
                    scale[col] = variance.sqrt().max(1e-8);
                }
            }

            *trained_params = Some(ScalerParams { mean, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    if config.with_mean {
                        result[[row, col]] -= params.mean[col];
                    }
                    if config.with_std {
                        result[[row, col]] /= params.scale[col];
                    }
                }
            }
        }

        Ok(result)
    }

    fn apply_robust_scaler(
        &self,
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_robust_scaler_static(config, trained_params, X)
    }

    fn apply_robust_scaler_static(
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if trained_params.is_none() {
            let mut center = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            for col in 0..X.ncols() {
                let mut column_data: Vec<f64> = X.column(col).to_vec();
                // `total_cmp` avoids the panic that `partial_cmp(..).unwrap()`
                // would hit on NaN input.
                column_data.sort_by(|a, b| a.total_cmp(b));

                let n = column_data.len();
                if config.with_centering {
                    center[col] = if n % 2 == 0 {
                        (column_data[n / 2 - 1] + column_data[n / 2]) / 2.0
                    } else {
                        column_data[n / 2]
                    };
                }

                if config.with_scaling {
                    let q1_idx = ((n - 1) as f64 * config.quantile_range.0) as usize;
                    let q3_idx = ((n - 1) as f64 * config.quantile_range.1) as usize;
                    let iqr = column_data[q3_idx] - column_data[q1_idx];
                    scale[col] = iqr.max(1e-8);
                }
            }

            *trained_params = Some(RobustScalerParams { center, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    if config.with_centering {
                        result[[row, col]] -= params.center[col];
                    }
                    if config.with_scaling {
                        result[[row, col]] /= params.scale[col];
                    }
                }
            }
        }

        Ok(result)
    }

    fn apply_minmax_scaler(
        &self,
        config: &MinMaxScalerConfig,
        trained_params: &mut Option<MinMaxScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_minmax_scaler_static(config, trained_params, X)
    }

    fn apply_minmax_scaler_static(
        config: &MinMaxScalerConfig,
        trained_params: &mut Option<MinMaxScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if trained_params.is_none() {
            let mut min = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            for col in 0..X.ncols() {
                let column = X.column(col);
                let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

                min[col] = col_min;
                let range = col_max - col_min;
                if range > 1e-8 {
                    scale[col] = (config.feature_range.1 - config.feature_range.0) / range;
                }
            }

            *trained_params = Some(MinMaxScalerParams { min, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    let scaled = (result[[row, col]] - params.min[col]) * params.scale[col]
                        + config.feature_range.0;
                    result[[row, col]] = if config.clip {
                        scaled
                            .max(config.feature_range.0)
                            .min(config.feature_range.1)
                    } else {
                        scaled
                    };
                }
            }
        }

        Ok(result)
    }

    // NOTE: `fit` calls this static placeholder, which passes the data
    // through unchanged; the instance method below implements the actual
    // feature engineering but is not yet wired into the static dispatch.
    fn apply_feature_engineering_step_static(
        _step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        Ok(X.to_owned())
    }

    fn apply_feature_engineering_step(
        &self,
        step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            FeatureEngineeringStep::PolynomialFeatures {
                degree,
                interaction_only,
                include_bias,
                feature_mapping,
            } => self.apply_polynomial_features(
                *degree,
                *interaction_only,
                *include_bias,
                feature_mapping,
                X,
            ),
            FeatureEngineeringStep::InteractionFeatures {
                max_pairs,
                threshold,
                feature_pairs,
            } => self.apply_interaction_features(*max_pairs, *threshold, feature_pairs, X),
            FeatureEngineeringStep::BinningFeatures {
                n_bins,
                strategy,
                bin_edges,
            } => self.apply_binning_features(*n_bins, strategy, bin_edges, X),
            // Remaining steps are not yet implemented.
            _ => Ok(X.to_owned()),
        }
    }

    fn apply_polynomial_features(
        &self,
        degree: usize,
        interaction_only: bool,
        include_bias: bool,
        feature_mapping: &mut Option<Vec<(usize, usize)>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_features = X.ncols();
        let mut new_features = Vec::new();
        let mut mapping = Vec::new();

        if include_bias {
            let bias_feature = Array1::ones(X.nrows());
            new_features.push(bias_feature);
            mapping.push((0, 0));
        }

        // Degree-1 terms: the original features.
        for i in 0..n_features {
            new_features.push(X.column(i).to_owned());
            mapping.push((i, 1));
        }

        // Pure powers x_i^d for d >= 2.
        if !interaction_only {
            for d in 2..=degree {
                for i in 0..n_features {
                    let mut poly_feature = Array1::zeros(X.nrows());
                    for row in 0..X.nrows() {
                        poly_feature[row] = X[[row, i]].powi(d as i32);
                    }
                    new_features.push(poly_feature);
                    mapping.push((i, d));
                }
            }
        }

        // Pairwise interaction terms x_i * x_j, generated once (the previous
        // loop repeated the same products for every degree).
        if degree >= 2 {
            for i in 0..n_features {
                for j in (i + 1)..n_features {
                    let mut interaction_feature = Array1::zeros(X.nrows());
                    for row in 0..X.nrows() {
                        interaction_feature[row] = X[[row, i]] * X[[row, j]];
                    }
                    new_features.push(interaction_feature);
                    mapping.push((i * n_features + j, 2));
                }
            }
        }

        *feature_mapping = Some(mapping);

        let n_new_features = new_features.len();
        let mut result = Array2::zeros((X.nrows(), n_new_features));
        for (col, feature) in new_features.iter().enumerate() {
            for row in 0..X.nrows() {
                result[[row, col]] = feature[row];
            }
        }

        Ok(result)
    }

    fn apply_interaction_features(
        &self,
        max_pairs: Option<usize>,
        threshold: f64,
        feature_pairs: &mut Option<Vec<(usize, usize)>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_features = X.ncols();
        let mut interactions = Vec::new();
        let pairs: Vec<(usize, usize)>;

        if feature_pairs.is_none() {
            // Rank candidate pairs by absolute correlation and keep the top ones.
            let mut candidate_pairs = Vec::new();
            for i in 0..n_features {
                for j in (i + 1)..n_features {
                    let corr = self.compute_correlation(X.column(i), X.column(j));
                    if corr.abs() > threshold {
                        candidate_pairs.push((i, j, corr.abs()));
                    }
                }
            }

            candidate_pairs.sort_by(|a, b| b.2.total_cmp(&a.2));
            let limit = max_pairs.unwrap_or(candidate_pairs.len());
            pairs = candidate_pairs
                .into_iter()
                .take(limit)
                .map(|(i, j, _)| (i, j))
                .collect();
            *feature_pairs = Some(pairs.clone());
        } else {
            pairs = feature_pairs.as_ref().unwrap().clone();
        }

        for &(i, j) in &pairs {
            let mut interaction = Array1::zeros(X.nrows());
            for row in 0..X.nrows() {
                interaction[row] = X[[row, i]] * X[[row, j]];
            }
            interactions.push(interaction);
        }

        // Output = original features followed by the interaction columns.
        let total_features = n_features + interactions.len();
        let mut result = Array2::zeros((X.nrows(), total_features));

        for col in 0..n_features {
            for row in 0..X.nrows() {
                result[[row, col]] = X[[row, col]];
            }
        }

        for (idx, interaction) in interactions.iter().enumerate() {
            for row in 0..X.nrows() {
                result[[row, n_features + idx]] = interaction[row];
            }
        }

        Ok(result)
    }

    fn apply_binning_features(
        &self,
        n_bins: usize,
        strategy: &BinningStrategy,
        bin_edges: &mut Option<HashMap<usize, Vec<f64>>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if bin_edges.is_none() {
            let mut edges_map = HashMap::new();

            for col in 0..X.ncols() {
                let column = X.column(col);
                let edges = match strategy {
                    BinningStrategy::Uniform => {
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                    BinningStrategy::Quantile => {
                        let mut sorted_values: Vec<f64> = column.to_vec();
                        sorted_values.sort_by(|a, b| a.total_cmp(b));
                        let n = sorted_values.len();
                        (0..=n_bins)
                            .map(|i| {
                                let quantile = i as f64 / n_bins as f64;
                                let idx = ((n - 1) as f64 * quantile) as usize;
                                sorted_values[idx]
                            })
                            .collect::<Vec<f64>>()
                    }
                    BinningStrategy::KMeans => {
                        // K-means binning is not implemented yet; fall back to
                        // uniform edges.
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                };
                edges_map.insert(col, edges);
            }

            *bin_edges = Some(edges_map);
        }

        if let Some(ref edges_map) = bin_edges {
            for col in 0..X.ncols() {
                if let Some(edges) = edges_map.get(&col) {
                    for row in 0..X.nrows() {
                        let value = X[[row, col]];
                        // A value in (edges[k], edges[k + 1]] belongs to bin k,
                        // so the index of the first matching edge is shifted
                        // down by one and clamped to the valid bin range.
                        let bin = edges
                            .iter()
                            .position(|&edge| value <= edge)
                            .map(|p| p.saturating_sub(1))
                            .unwrap_or(n_bins - 1)
                            .min(n_bins - 1);
                        result[[row, col]] = bin as f64;
                    }
                }
            }
        }

        Ok(result)
    }

    // NOTE: like the feature-engineering dispatch, `fit` calls this static
    // placeholder, which keeps every feature; the instance method below holds
    // the actual selection logic.
    fn apply_selection_method_static(
        _method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        Ok(Array1::from_elem(X.ncols(), true))
    }

    fn apply_selection_method(
        &self,
        method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        match method {
            SelectionMethod::VarianceThreshold {
                threshold,
                feature_variance,
            } => self.apply_variance_threshold(*threshold, feature_variance, X),
            SelectionMethod::CorrelationFilter {
                threshold,
                method: corr_method,
                correlation_matrix,
            } => self.apply_correlation_filter(*threshold, corr_method, correlation_matrix, X),
            SelectionMethod::UnivariateFilter {
                method: uni_method,
                k,
                score_func,
            } => self.apply_univariate_filter(uni_method, k, score_func, X, y),
            // Unimplemented methods keep all features.
            _ => Ok(Array1::from_elem(X.ncols(), true)),
        }
    }

    fn apply_variance_threshold(
        &self,
        threshold: f64,
        feature_variance: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array1<bool>> {
        if feature_variance.is_none() {
            let mut variances = Array1::zeros(X.ncols());
            for col in 0..X.ncols() {
                variances[col] = X.column(col).var(1.0);
            }
            *feature_variance = Some(variances);
        }

        let variances = feature_variance.as_ref().unwrap();
        let selection = variances.mapv(|v| v > threshold);
        Ok(selection)
    }

    fn apply_correlation_filter(
        &self,
        threshold: f64,
        corr_method: &CorrelationMethod,
        correlation_matrix: &mut Option<Array2<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array1<bool>> {
        if correlation_matrix.is_none() {
            let n_features = X.ncols();
            let mut corr_matrix = Array2::zeros((n_features, n_features));

            for i in 0..n_features {
                for j in 0..n_features {
                    if i == j {
                        corr_matrix[[i, j]] = 1.0;
                    } else {
                        let corr = match corr_method {
                            CorrelationMethod::Pearson => {
                                self.compute_correlation(X.column(i), X.column(j))
                            }
                            // Spearman and Kendall are not implemented yet;
                            // fall back to Pearson.
                            _ => self.compute_correlation(X.column(i), X.column(j)),
                        };
                        corr_matrix[[i, j]] = corr;
                    }
                }
            }
            *correlation_matrix = Some(corr_matrix);
        }

        let corr_matrix = correlation_matrix.as_ref().unwrap();
        let mut selection = Array1::from_elem(X.ncols(), true);

        // For each highly correlated pair that is still selected, drop the
        // feature with the lower variance.
        for i in 0..X.ncols() {
            for j in (i + 1)..X.ncols() {
                if corr_matrix[[i, j]].abs() > threshold && selection[i] && selection[j] {
                    let var_i = X.column(i).var(1.0);
                    let var_j = X.column(j).var(1.0);
                    if var_i < var_j {
                        selection[i] = false;
                    } else {
                        selection[j] = false;
                    }
                }
            }
        }

        Ok(selection)
    }

    fn apply_univariate_filter(
        &self,
        _method: &UnivariateMethod,
        k: &SelectionCount,
        score_func: &UnivariateScoreFunction,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        let mut scores = Array1::zeros(X.ncols());

        for col in 0..X.ncols() {
            scores[col] = match score_func {
                UnivariateScoreFunction::Chi2 => self.compute_chi2_score(X.column(col), y),
                UnivariateScoreFunction::FClassif => self.compute_f_score(X.column(col), y),
                UnivariateScoreFunction::MutualInfoClassif => {
                    self.compute_mutual_info(X.column(col), y)
                }
                // Regression variants fall back to absolute correlation.
                _ => self.compute_correlation(X.column(col), y).abs(),
            };
        }

        // Keep the `k_val` highest-scoring features.
        let top_k = |k_val: usize| {
            let mut indexed_scores: Vec<(usize, f64)> = scores
                .iter()
                .enumerate()
                .map(|(i, &score)| (i, score))
                .collect();
            indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

            let mut selection = Array1::from_elem(X.ncols(), false);
            for &(idx, _) in indexed_scores.iter().take(k_val) {
                selection[idx] = true;
            }
            selection
        };

        let selection = match k {
            SelectionCount::K(k_val) => top_k(*k_val),
            SelectionCount::Percentile(p) => {
                let k_val = ((X.ncols() as f64 * p / 100.0).round() as usize).max(1);
                top_k(k_val)
            }
            // FDR/FPR/FWER criteria are not implemented yet; default to the
            // top half of the features.
            _ => top_k(X.ncols() / 2),
        };

        Ok(selection)
    }

    fn apply_dimensionality_reduction(
        &self,
        reduction: &mut DimensionalityReductionStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match reduction {
            DimensionalityReductionStep::PCA {
                n_components,
                whiten,
                svd_solver,
                components,
                explained_variance,
            } => self.apply_pca(
                *n_components,
                *whiten,
                svd_solver,
                components,
                explained_variance,
                X,
            ),
            DimensionalityReductionStep::TruncatedSVD {
                n_components,
                algorithm,
                components,
                singular_values,
            } => self.apply_truncated_svd(*n_components, algorithm, components, singular_values, X),
            // ICA, FactorAnalysis, UMAP, and t-SNE are not implemented yet;
            // fall back to truncating to the requested number of components.
            _ => {
                let n_comp = match reduction {
                    DimensionalityReductionStep::ICA { n_components, .. } => *n_components,
                    DimensionalityReductionStep::FactorAnalysis { n_components, .. } => {
                        *n_components
                    }
                    DimensionalityReductionStep::UMAP { n_components, .. } => *n_components,
                    DimensionalityReductionStep::TSNE { n_components, .. } => *n_components,
                    _ => X.ncols().min(50),
                };
                let final_components = n_comp.min(X.ncols());
                let mut result = Array2::zeros((X.nrows(), final_components));
                for col in 0..final_components {
                    for row in 0..X.nrows() {
                        result[[row, col]] = X[[row, col]];
                    }
                }
                Ok(result)
            }
        }
    }

    fn apply_pca(
        &self,
        n_components: usize,
        _whiten: bool,
        _svd_solver: &SVDSolver,
        components: &mut Option<Array2<f64>>,
        explained_variance: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_comp = n_components.min(X.ncols()).min(X.nrows());

        // Center each column.
        let mut centered_X = X.to_owned();
        let mut means = Array1::zeros(X.ncols());
        for col in 0..X.ncols() {
            means[col] = X.column(col).mean().unwrap_or(0.0);
            for row in 0..X.nrows() {
                centered_X[[row, col]] -= means[col];
            }
        }

        // Placeholder: no eigendecomposition is performed. Identity
        // "components" are stored and the projection simply keeps the first
        // `n_comp` centered columns.
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *explained_variance = Some(Array1::ones(n_comp));
        }

        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = centered_X[[row, col]];
            }
        }

        Ok(result)
    }

    fn apply_truncated_svd(
        &self,
        n_components: usize,
        _algorithm: &SVDAlgorithm,
        components: &mut Option<Array2<f64>>,
        singular_values: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_comp = n_components.min(X.ncols()).min(X.nrows());

        // Placeholder: no SVD is computed; the first `n_comp` columns are kept.
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *singular_values = Some(Array1::ones(n_comp));
        }

        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = X[[row, col]];
            }
        }

        Ok(result)
    }

    fn apply_model_selection(
        &self,
        model_selection: &mut ModelSelectionStep,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        match model_selection {
            ModelSelectionStep::CrossValidationSelection {
                estimator,
                cv_folds,
                scoring,
                feature_scores,
            } => self.apply_cv_selection(estimator, *cv_folds, scoring, feature_scores, X, y),
            ModelSelectionStep::ForwardSelection {
                estimator,
                max_features,
                scoring,
                selected_features,
            } => self.apply_forward_selection(
                estimator,
                *max_features,
                scoring,
                selected_features,
                X,
                y,
            ),
            // Unimplemented strategies keep every feature.
            _ => Ok((0..X.ncols()).collect()),
        }
    }

    fn apply_cv_selection(
        &self,
        _estimator: &ModelEstimator,
        _cv_folds: usize,
        _scoring: &ScoringMetric,
        feature_scores: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        // Placeholder scoring: absolute correlation with the target stands in
        // for cross-validated model performance.
        if feature_scores.is_none() {
            let mut scores = Array1::zeros(X.ncols());
            for col in 0..X.ncols() {
                scores[col] = self.compute_correlation(X.column(col), y).abs();
            }
            *feature_scores = Some(scores);
        }

        if let Some(ref scores) = feature_scores {
            let mut indexed_scores: Vec<(usize, f64)> = scores
                .iter()
                .enumerate()
                .map(|(i, &score)| (i, score))
                .collect();
            indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

            // Keep the top half of the features.
            let n_select = X.ncols() / 2;
            Ok(indexed_scores
                .into_iter()
                .take(n_select)
                .map(|(idx, _)| idx)
                .collect())
        } else {
            Ok((0..X.ncols()).collect())
        }
    }

    fn apply_forward_selection(
        &self,
        _estimator: &ModelEstimator,
        max_features: usize,
        _scoring: &ScoringMetric,
        selected_features: &mut Option<Vec<usize>>,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        if selected_features.is_none() {
            // Placeholder: rank features once by absolute correlation with the
            // target rather than re-fitting a model at each step.
            let mut scores = Vec::new();
            for col in 0..X.ncols() {
                let score = self.compute_correlation(X.column(col), y).abs();
                scores.push((col, score));
            }

            scores.sort_by(|a, b| b.1.total_cmp(&a.1));
            let features: Vec<usize> = scores
                .into_iter()
                .take(max_features.min(X.ncols()))
                .map(|(idx, _)| idx)
                .collect();

            *selected_features = Some(features.clone());
            Ok(features)
        } else {
            Ok(selected_features.as_ref().unwrap().clone())
        }
    }

    /// Pearson correlation coefficient between two equal-length vectors.
    fn compute_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        let n = x.len() as f64;
        if n < 2.0 {
            return 0.0;
        }

        let mean_x = x.mean().unwrap_or(0.0);
        let mean_y = y.mean().unwrap_or(0.0);

        let mut sum_xy = 0.0;
        let mut sum_x2 = 0.0;
        let mut sum_y2 = 0.0;

        for i in 0..x.len() {
            let dx = x[i] - mean_x;
            let dy = y[i] - mean_y;
            sum_xy += dx * dy;
            sum_x2 += dx * dx;
            sum_y2 += dy * dy;
        }

        let denom = (sum_x2 * sum_y2).sqrt();
        if denom < 1e-10 {
            0.0
        } else {
            sum_xy / denom
        }
    }

    // The three scores below are placeholder implementations that proxy the
    // named statistic with absolute Pearson correlation.
    fn compute_chi2_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }

    fn compute_f_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }

    fn compute_mutual_info(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }
}

impl FeatureSelectionPipeline<Trained> {
    pub fn transform(&self, X: ArrayView2<f64>) -> Result<Array2<f64>> {
        // Placeholder: the trained step parameters are not yet replayed here,
        // so the input is currently passed through unchanged.
        let _start_time = Instant::now();
        let current_X = X.to_owned();

        Ok(current_X)
    }

    pub fn get_pipeline_info(&self) -> PipelineInfo {
        PipelineInfo {
            n_preprocessing_steps: self.preprocessing_steps.len(),
            n_feature_engineering_steps: self.feature_engineering_steps.len(),
            n_selection_methods: self.selection_methods.len(),
            has_dimensionality_reduction: self.dimensionality_reduction.is_some(),
            has_model_selection: self.model_selection.is_some(),
            config: self.pipeline_config.clone(),
        }
    }
}

#[derive(Debug, Clone)]
pub struct PipelineInfo {
    pub n_preprocessing_steps: usize,
    pub n_feature_engineering_steps: usize,
    pub n_selection_methods: usize,
    pub has_dimensionality_reduction: bool,
    pub has_model_selection: bool,
    pub config: PipelineConfiguration,
}

impl Default for PipelineConfiguration {
    fn default() -> Self {
        Self {
            parallel_execution: true,
            memory_optimization: MemoryOptimization::Conservative,
            caching_strategy: CachingStrategy::LRU { size: 1000 },
            validation_strategy: ValidationStrategy::Basic,
            error_handling: ErrorHandling::Graceful,
            logging_level: LoggingLevel::Info,
        }
    }
}

impl Default for OptimizationConfiguration {
    fn default() -> Self {
        Self {
            use_simd: true,
            chunk_size: 1000,
            thread_pool_size: None,
            memory_pool_size: 1024 * 1024,
            cache_size: 100,
            prefetch_strategy: PrefetchStrategy::Sequential,
            vectorization_threshold: 1000,
        }
    }
}

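// Hedged usage sketch: overriding a couple of optimization defaults with
// struct-update syntax. The field values here are illustrative only.
#[allow(dead_code)]
fn example_optimization_overrides() -> OptimizationConfiguration {
    OptimizationConfiguration {
        use_simd: false,
        chunk_size: 4096,
        ..OptimizationConfiguration::default()
    }
}
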
impl Estimator for FeatureSelectionPipeline<Untrained> {
    type Config = ();
    type Error = SklearsError;
    type Float = f64;

    fn config(&self) -> &Self::Config {
        &()
    }
}

impl Fit<ArrayView2<'_, f64>, ArrayView1<'_, f64>> for FeatureSelectionPipeline<Untrained> {
    type Fitted = FeatureSelectionPipeline<Trained>;

    fn fit(self, X: &ArrayView2<'_, f64>, y: &ArrayView1<'_, f64>) -> Result<Self::Fitted> {
        self.fit(*X, *y)
    }
}

impl Transform<ArrayView2<'_, f64>, Array2<f64>> for FeatureSelectionPipeline<Trained> {
    fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
        self.transform(*X)
    }
}
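
// Minimal smoke tests sketching how the pipeline is exercised end to end.
// They are a hedged addition: they rely only on items defined in this module
// plus the `Array1`/`Array2` constructors imported above, and the expected
// values follow directly from the z-score logic in
// `apply_standard_scaler_static` and the pass-through `transform`.
#[cfg(test)]
mod pipeline_smoke_tests {
    use super::*;

    #[test]
    fn standard_scaler_centers_the_data() {
        let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let config = StandardScalerConfig {
            with_mean: true,
            with_std: true,
        };
        let mut params = None;
        let scaled = FeatureSelectionPipeline::<Untrained>::apply_standard_scaler_static(
            &config,
            &mut params,
            x.view(),
        )
        .unwrap();

        // After centering, the column mean should be ~0 and the fitted
        // parameters should be cached for reuse.
        let mean = scaled.column(0).mean().unwrap_or(f64::NAN);
        assert!(mean.abs() < 1e-9);
        assert!(params.is_some());
    }

    #[test]
    fn fit_then_transform_preserves_row_count() {
        let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0]);

        let trained = FeatureSelectionPipeline::new()
            .add_selection_method(SelectionMethod::VarianceThreshold {
                threshold: 0.0,
                feature_variance: None,
            })
            .fit(x.view(), y.view())
            .unwrap();

        let out = trained.transform(x.view()).unwrap();
        assert_eq!(out.nrows(), 3);
    }

    #[test]
    fn correlation_of_identical_vectors_is_one() {
        let pipeline = FeatureSelectionPipeline::new();
        let v = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let corr = pipeline.compute_correlation(v.view(), v.view());
        assert!((corr - 1.0).abs() < 1e-12);
    }
}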