use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use sklears_core::error::{Result as SklResult, SklearsError};
use sklears_core::traits::{Estimator, Fit, Transform};
use std::collections::HashMap;
use std::marker::PhantomData;
use std::time::{Duration, Instant};

type Result<T> = SklResult<T>;

/// Composable feature-selection pipeline that uses a type-state parameter
/// (`Untrained`/`Trained`) to separate configuration from fitted use.
#[derive(Debug, Clone)]
pub struct FeatureSelectionPipeline<State = Untrained> {
    preprocessing_steps: Vec<PreprocessingStep>,
    feature_engineering_steps: Vec<FeatureEngineeringStep>,
    selection_methods: Vec<SelectionMethod>,
    dimensionality_reduction: Option<DimensionalityReductionStep>,
    model_selection: Option<ModelSelectionStep>,
    pipeline_config: PipelineConfiguration,
    optimization_config: OptimizationConfiguration,
    _phantom: PhantomData<State>,
}

/// Type-state marker for a pipeline that has not been fitted yet.
#[derive(Debug, Clone, Default)]
pub struct Untrained;

/// Type-state marker holding artifacts produced during fitting.
#[derive(Debug)]
pub struct Trained {
    trained_steps: Vec<TrainedStep>,
    feature_mapping: FeatureMapping,
    pipeline_metadata: PipelineMetadata,
}

#[derive(Debug, Clone)]
pub enum PreprocessingStep {
    StandardScaler {
        config: StandardScalerConfig,
        trained_params: Option<ScalerParams>,
    },
    RobustScaler {
        config: RobustScalerConfig,
        trained_params: Option<RobustScalerParams>,
    },
    MinMaxScaler {
        config: MinMaxScalerConfig,
        trained_params: Option<MinMaxScalerParams>,
    },
    QuantileTransformer {
        config: QuantileTransformerConfig,
        trained_params: Option<QuantileParams>,
    },
    PowerTransformer {
        config: PowerTransformerConfig,
        trained_params: Option<PowerParams>,
    },
    MissingValueImputer {
        config: ImputerConfig,
        trained_params: Option<ImputerParams>,
    },
    OutlierRemover {
        config: OutlierConfig,
        trained_params: Option<OutlierParams>,
    },
}

#[derive(Debug, Clone)]
pub enum FeatureEngineeringStep {
    PolynomialFeatures {
        degree: usize,
        interaction_only: bool,
        include_bias: bool,
        feature_mapping: Option<Vec<(usize, usize)>>,
    },
    InteractionFeatures {
        max_pairs: Option<usize>,
        threshold: f64,
        feature_pairs: Option<Vec<(usize, usize)>>,
    },
    BinningFeatures {
        n_bins: usize,
        strategy: BinningStrategy,
        bin_edges: Option<HashMap<usize, Vec<f64>>>,
    },
    TargetEncoding {
        smoothing: f64,
        min_samples_leaf: usize,
        encodings: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    FrequencyEncoding {
        min_frequency: f64,
        frequencies: Option<HashMap<usize, HashMap<String, f64>>>,
    },
    RatioFeatures {
        numerator_features: Vec<usize>,
        denominator_features: Vec<usize>,
        eps: f64,
    },
    LaggingFeatures {
        lags: Vec<usize>,
        feature_subset: Option<Vec<usize>>,
    },
    WindowStatistics {
        window_size: usize,
        statistics: Vec<WindowStatistic>,
        feature_subset: Option<Vec<usize>>,
    },
}

#[derive(Debug, Clone)]
pub enum SelectionMethod {
    UnivariateFilter {
        method: UnivariateMethod,
        k: SelectionCount,
        score_func: UnivariateScoreFunction,
    },
    RecursiveFeatureElimination {
        estimator: RFEEstimator,
        n_features: SelectionCount,
        step: f64,
        importance_getter: ImportanceGetter,
    },
    SelectFromModel {
        estimator: ModelEstimator,
        threshold: SelectionThreshold,
        prefit: bool,
        max_features: Option<usize>,
    },
    VarianceThreshold {
        threshold: f64,
        feature_variance: Option<Array1<f64>>,
    },
    CorrelationFilter {
        threshold: f64,
        method: CorrelationMethod,
        correlation_matrix: Option<Array2<f64>>,
    },
    MutualInformation {
        k: SelectionCount,
        discrete_features: Vec<bool>,
        random_state: Option<u64>,
    },
    LASSO {
        alpha: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    ElasticNet {
        alpha: f64,
        l1_ratio: f64,
        max_iter: usize,
        tol: f64,
        coefficients: Option<Array1<f64>>,
    },
    TreeBased {
        estimator_type: TreeEstimatorType,
        n_estimators: usize,
        max_depth: Option<usize>,
        feature_importances: Option<Array1<f64>>,
    },
    GeneticAlgorithm {
        population_size: usize,
        n_generations: usize,
        mutation_rate: f64,
        crossover_rate: f64,
        best_individuals: Option<Vec<Vec<bool>>>,
    },
    ParticleSwarmOptimization {
        n_particles: usize,
        n_iterations: usize,
        inertia: f64,
        cognitive: f64,
        social: f64,
        best_positions: Option<Vec<Vec<f64>>>,
    },
    SimulatedAnnealing {
        initial_temp: f64,
        cooling_rate: f64,
        min_temp: f64,
        max_iter: usize,
        current_solution: Option<Vec<bool>>,
    },
}

#[derive(Debug, Clone)]
pub enum DimensionalityReductionStep {
    PCA {
        n_components: usize,
        whiten: bool,
        svd_solver: SVDSolver,
        components: Option<Array2<f64>>,
        explained_variance: Option<Array1<f64>>,
    },
    TruncatedSVD {
        n_components: usize,
        algorithm: SVDAlgorithm,
        components: Option<Array2<f64>>,
        singular_values: Option<Array1<f64>>,
    },
    ICA {
        n_components: usize,
        algorithm: ICAAlgorithm,
        max_iter: usize,
        tol: f64,
        mixing_matrix: Option<Array2<f64>>,
        unmixing_matrix: Option<Array2<f64>>,
    },
    FactorAnalysis {
        n_components: usize,
        max_iter: usize,
        tol: f64,
        loadings: Option<Array2<f64>>,
        noise_variance: Option<Array1<f64>>,
    },
    UMAP {
        n_components: usize,
        n_neighbors: usize,
        min_dist: f64,
        metric: DistanceMetric,
        embedding: Option<Array2<f64>>,
    },
    TSNE {
        n_components: usize,
        perplexity: f64,
        early_exaggeration: f64,
        learning_rate: f64,
        max_iter: usize,
        embedding: Option<Array2<f64>>,
    },
}

#[derive(Debug, Clone)]
pub enum ModelSelectionStep {
    CrossValidationSelection {
        estimator: ModelEstimator,
        cv_folds: usize,
        scoring: ScoringMetric,
        feature_scores: Option<Array1<f64>>,
    },
    ForwardSelection {
        estimator: ModelEstimator,
        max_features: usize,
        scoring: ScoringMetric,
        selected_features: Option<Vec<usize>>,
    },
    BackwardElimination {
        estimator: ModelEstimator,
        min_features: usize,
        scoring: ScoringMetric,
        remaining_features: Option<Vec<usize>>,
    },
    StepwiseSelection {
        estimator: ModelEstimator,
        direction: StepwiseDirection,
        p_enter: f64,
        p_remove: f64,
        selected_features: Option<Vec<usize>>,
    },
    BayesianOptimization {
        estimator: ModelEstimator,
        acquisition_function: AcquisitionFunction,
        n_calls: usize,
        optimal_features: Option<Vec<usize>>,
    },
}

#[derive(Debug, Clone)]
pub enum SelectionCount {
    K(usize),
    Percentile(f64),
    FDR(f64),
    FPR(f64),
    FWER(f64),
}

#[derive(Debug, Clone)]
pub enum SelectionThreshold {
    Mean,
    Median,
    Absolute(f64),
    Percentile(f64),
    Auto,
}

#[derive(Debug, Clone)]
pub struct PipelineConfiguration {
    pub parallel_execution: bool,
    pub memory_optimization: MemoryOptimization,
    pub caching_strategy: CachingStrategy,
    pub validation_strategy: ValidationStrategy,
    pub error_handling: ErrorHandling,
    pub logging_level: LoggingLevel,
}

#[derive(Debug, Clone)]
pub struct OptimizationConfiguration {
    pub use_simd: bool,
    pub chunk_size: usize,
    pub thread_pool_size: Option<usize>,
    pub memory_pool_size: usize,
    pub cache_size: usize,
    pub prefetch_strategy: PrefetchStrategy,
    pub vectorization_threshold: usize,
}

#[derive(Debug, Clone)]
pub enum MemoryOptimization {
    None,
    Conservative,
    Aggressive,
}

#[derive(Debug, Clone)]
pub enum CachingStrategy {
    None,
    LRU { size: usize },
    LFU { size: usize },
    FIFO { size: usize },
}

#[derive(Debug, Clone)]
pub enum ValidationStrategy {
    None,
    Basic,
    Comprehensive,
    Statistical,
}

#[derive(Debug, Clone)]
pub enum ErrorHandling {
    Strict,
    Graceful,
    Logging,
}

#[derive(Debug, Clone)]
pub enum LoggingLevel {
    None,
    Error,
    Warning,
    Info,
    Debug,
    Trace,
}

#[derive(Debug, Clone)]
pub enum PrefetchStrategy {
    None,
    Sequential,
    Random,
    Adaptive,
}

#[derive(Debug, Clone)]
pub struct StandardScalerConfig {
    pub with_mean: bool,
    pub with_std: bool,
}

#[derive(Debug, Clone)]
pub struct ScalerParams {
    pub mean: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct RobustScalerConfig {
    pub with_centering: bool,
    pub with_scaling: bool,
    pub quantile_range: (f64, f64),
}

#[derive(Debug, Clone)]
pub struct RobustScalerParams {
    pub center: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct MinMaxScalerConfig {
    pub feature_range: (f64, f64),
    pub clip: bool,
}

#[derive(Debug, Clone)]
pub struct MinMaxScalerParams {
    pub min: Array1<f64>,
    pub scale: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct QuantileTransformerConfig {
    pub n_quantiles: usize,
    pub output_distribution: Distribution,
    pub subsample: Option<usize>,
}

#[derive(Debug, Clone)]
pub struct QuantileParams {
    pub quantiles: Array2<f64>,
    pub references: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct PowerTransformerConfig {
    pub method: PowerMethod,
    pub standardize: bool,
}

#[derive(Debug, Clone)]
pub struct PowerParams {
    pub lambdas: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct ImputerConfig {
    pub strategy: ImputationStrategy,
    pub fill_value: Option<f64>,
    pub missing_values: MissingValueIndicator,
}

#[derive(Debug, Clone)]
pub struct ImputerParams {
    pub statistics: Array1<f64>,
}

#[derive(Debug, Clone)]
pub struct OutlierConfig {
    pub method: OutlierMethod,
    pub threshold: f64,
    pub contamination: f64,
}

#[derive(Debug, Clone)]
pub struct OutlierParams {
    pub decision_function: Array1<f64>,
    pub threshold: f64,
}

#[derive(Debug, Clone)]
pub enum BinningStrategy {
    Uniform,
    Quantile,
    KMeans,
}

#[derive(Debug, Clone)]
pub enum WindowStatistic {
    Mean,
    Std,
    Min,
    Max,
    Median,
    Skewness,
    Kurtosis,
}

#[derive(Debug, Clone)]
pub enum UnivariateMethod {
    Chi2,
    ANOVA,
    MutualInfo,
    Correlation,
}

#[derive(Debug, Clone)]
pub enum UnivariateScoreFunction {
    Chi2,
    FClassif,
    FRegression,
    MutualInfoClassif,
    MutualInfoRegression,
}

#[derive(Debug, Clone)]
pub enum RFEEstimator {
    SVM,
    RandomForest,
    LinearRegression,
    LogisticRegression,
}

#[derive(Debug, Clone)]
pub enum ImportanceGetter {
    Auto,
    Coefficients,
    FeatureImportances,
}

#[derive(Debug, Clone)]
pub enum ModelEstimator {
    LinearRegression,
    LogisticRegression,
    RandomForest,
    SVM,
    XGBoost,
    LightGBM,
}

#[derive(Debug, Clone)]
pub enum CorrelationMethod {
    Pearson,
    Spearman,
    Kendall,
}

#[derive(Debug, Clone)]
pub enum TreeEstimatorType {
    RandomForest,
    ExtraTrees,
    GradientBoosting,
    AdaBoost,
}

#[derive(Debug, Clone)]
pub enum SVDSolver {
    Auto,
    Full,
    Arpack,
    Randomized,
}

#[derive(Debug, Clone)]
pub enum SVDAlgorithm {
    Randomized,
    Arpack,
}

#[derive(Debug, Clone)]
pub enum ICAAlgorithm {
    Parallel,
    Deflation,
}

#[derive(Debug, Clone)]
pub enum DistanceMetric {
    Euclidean,
    Manhattan,
    Cosine,
    Hamming,
}

#[derive(Debug, Clone)]
pub enum ScoringMetric {
    Accuracy,
    F1,
    RocAuc,
    R2,
    MAE,
    MSE,
    LogLoss,
}

#[derive(Debug, Clone)]
pub enum StepwiseDirection {
    Forward,
    Backward,
    Both,
}

#[derive(Debug, Clone)]
pub enum AcquisitionFunction {
    ExpectedImprovement,
    UpperConfidenceBound,
    ProbabilityOfImprovement,
}

#[derive(Debug, Clone)]
pub enum Distribution {
    Uniform,
    Normal,
}

#[derive(Debug, Clone)]
pub enum PowerMethod {
    YeoJohnson,
    BoxCox,
}

#[derive(Debug, Clone)]
pub enum ImputationStrategy {
    Mean,
    Median,
    Mode,
    Constant,
    KNN,
    Iterative,
}

#[derive(Debug, Clone)]
pub enum MissingValueIndicator {
    NaN,
    Value(f64),
}

#[derive(Debug, Clone)]
pub enum OutlierMethod {
    IsolationForest,
    LocalOutlierFactor,
    OneClassSVM,
    EllipticEnvelope,
}

#[derive(Debug)]
pub struct TrainedStep {
    pub step_type: String,
    pub step_index: usize,
    pub training_time: Duration,
    pub feature_count_before: usize,
    pub feature_count_after: usize,
    pub parameters: StepParameters,
}

pub enum StepParameters {
    Preprocessing(Box<dyn std::any::Any + Send + Sync>),
    FeatureEngineering(Box<dyn std::any::Any + Send + Sync>),
    Selection(Array1<bool>),
    DimensionalityReduction(Array2<f64>),
    ModelSelection(Vec<usize>),
}

// `Box<dyn Any>` does not implement `Debug`, so `#[derive(Debug)]` cannot be
// used on this enum; this manual impl elides the opaque payloads instead.
impl std::fmt::Debug for StepParameters {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Preprocessing(_) => f.write_str("Preprocessing(..)"),
            Self::FeatureEngineering(_) => f.write_str("FeatureEngineering(..)"),
            Self::Selection(mask) => f.debug_tuple("Selection").field(mask).finish(),
            Self::DimensionalityReduction(m) => {
                f.debug_tuple("DimensionalityReduction").field(m).finish()
            }
            Self::ModelSelection(idx) => f.debug_tuple("ModelSelection").field(idx).finish(),
        }
    }
}

#[derive(Debug, Clone)]
pub struct FeatureMapping {
    pub original_features: usize,
    pub final_features: usize,
    pub feature_names: Vec<String>,
    pub feature_origins: Vec<FeatureOrigin>,
    pub transformation_history: Vec<TransformationStep>,
}

#[derive(Debug, Clone)]
pub enum FeatureOrigin {
    Original(usize),
    Engineered {
        source_features: Vec<usize>,
        operation: String,
    },
    Transformed {
        source_feature: usize,
        transformation: String,
    },
}

#[derive(Debug, Clone)]
pub struct TransformationStep {
    pub step_name: String,
    pub input_features: usize,
    pub output_features: usize,
    pub transformation_type: TransformationType,
}

#[derive(Debug, Clone)]
pub enum TransformationType {
    OneToOne,
    OneToMany,
    ManyToOne,
    ManyToMany,
}

#[derive(Debug, Clone)]
pub struct PipelineMetadata {
    pub total_training_time: Duration,
    pub total_transform_time: Duration,
    pub memory_usage_peak: usize,
    pub feature_reduction_ratio: f64,
    pub performance_metrics: HashMap<String, f64>,
    pub validation_results: Option<ValidationResults>,
}

#[derive(Debug, Clone)]
pub struct ValidationResults {
    pub cross_validation_scores: Vec<f64>,
    pub stability_scores: Vec<f64>,
    pub robustness_scores: Vec<f64>,
    pub statistical_significance: bool,
}

impl Default for FeatureSelectionPipeline<Untrained> {
    fn default() -> Self {
        Self::new()
    }
}

impl FeatureSelectionPipeline<Untrained> {
    pub fn new() -> Self {
        Self {
            preprocessing_steps: Vec::new(),
            feature_engineering_steps: Vec::new(),
            selection_methods: Vec::new(),
            dimensionality_reduction: None,
            model_selection: None,
            pipeline_config: PipelineConfiguration::default(),
            optimization_config: OptimizationConfiguration::default(),
            _phantom: PhantomData,
        }
    }

    pub fn add_preprocessing_step(mut self, step: PreprocessingStep) -> Self {
        self.preprocessing_steps.push(step);
        self
    }

    pub fn add_feature_engineering_step(mut self, step: FeatureEngineeringStep) -> Self {
        self.feature_engineering_steps.push(step);
        self
    }

    pub fn add_selection_method(mut self, method: SelectionMethod) -> Self {
        self.selection_methods.push(method);
        self
    }

    pub fn with_dimensionality_reduction(mut self, reduction: DimensionalityReductionStep) -> Self {
        self.dimensionality_reduction = Some(reduction);
        self
    }

    pub fn with_model_selection(mut self, model_selection: ModelSelectionStep) -> Self {
        self.model_selection = Some(model_selection);
        self
    }

    pub fn with_config(mut self, config: PipelineConfiguration) -> Self {
        self.pipeline_config = config;
        self
    }

    pub fn with_optimization(mut self, config: OptimizationConfiguration) -> Self {
        self.optimization_config = config;
        self
    }

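    // A hedged, illustrative sketch (not part of the original API surface):
    // it shows how the builder methods above compose into a configured
    // pipeline. The concrete step choices are arbitrary examples.
    #[allow(dead_code)]
    fn example_configuration() -> Self {
        Self::new()
            .add_preprocessing_step(PreprocessingStep::StandardScaler {
                config: StandardScalerConfig {
                    with_mean: true,
                    with_std: true,
                },
                trained_params: None,
            })
            .add_selection_method(SelectionMethod::VarianceThreshold {
                threshold: 1e-3,
                feature_variance: None,
            })
            .with_config(PipelineConfiguration::default())
    }
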
    pub fn fit(
        mut self,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<FeatureSelectionPipeline<Trained>> {
        let start_time = Instant::now();
        let mut current_X = X.to_owned();
        let current_y = y.to_owned();
        let mut trained_steps = Vec::new();
        let original_features = X.ncols();

        let mut preprocessing_steps = std::mem::take(&mut self.preprocessing_steps);
        for (idx, step) in preprocessing_steps.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            current_X = Self::apply_preprocessing_step_static(step, current_X.view())?;

            trained_steps.push(TrainedStep {
                step_type: "Preprocessing".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::Preprocessing(Box::new(())),
            });
        }
        self.preprocessing_steps = preprocessing_steps;

        let mut feature_engineering_steps = std::mem::take(&mut self.feature_engineering_steps);
        for (idx, step) in feature_engineering_steps.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            current_X = Self::apply_feature_engineering_step_static(
                step,
                current_X.view(),
                current_y.view(),
            )?;

            trained_steps.push(TrainedStep {
                step_type: "FeatureEngineering".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::FeatureEngineering(Box::new(())),
            });
        }
        self.feature_engineering_steps = feature_engineering_steps;

        // Selection methods vote on a shared mask: a feature survives only if
        // every method keeps it.
        let mut selection_mask = Array1::from_elem(current_X.ncols(), true);
        let mut selection_methods = std::mem::take(&mut self.selection_methods);
        for (idx, method) in selection_methods.iter_mut().enumerate() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let method_mask =
                Self::apply_selection_method_static(method, current_X.view(), current_y.view())?;

            for (i, &selected) in method_mask.iter().enumerate() {
                if !selected {
                    selection_mask[i] = false;
                }
            }

            trained_steps.push(TrainedStep {
                step_type: "Selection".to_string(),
                step_index: idx,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: selection_mask.iter().filter(|&&x| x).count(),
                parameters: StepParameters::Selection(method_mask),
            });
        }
        self.selection_methods = selection_methods;

        let selected_indices: Vec<usize> = selection_mask
            .iter()
            .enumerate()
            .filter_map(|(i, &selected)| if selected { Some(i) } else { None })
            .collect();

        if !selected_indices.is_empty() {
            let mut selected_X = Array2::zeros((current_X.nrows(), selected_indices.len()));
            for (new_col, &old_col) in selected_indices.iter().enumerate() {
                for row in 0..current_X.nrows() {
                    selected_X[[row, new_col]] = current_X[[row, old_col]];
                }
            }
            current_X = selected_X;
        }

        if self.dimensionality_reduction.is_some() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let mut reduction = self.dimensionality_reduction.take().unwrap();
            current_X = self.apply_dimensionality_reduction(&mut reduction, current_X.view())?;
            self.dimensionality_reduction = Some(reduction);

            trained_steps.push(TrainedStep {
                step_type: "DimensionalityReduction".to_string(),
                step_index: 0,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::DimensionalityReduction(Array2::zeros((1, 1))),
            });
        }

        if self.model_selection.is_some() {
            let step_start = Instant::now();
            let features_before = current_X.ncols();
            let mut model_sel = self.model_selection.take().unwrap();
            let selected_features =
                self.apply_model_selection(&mut model_sel, current_X.view(), current_y.view())?;
            self.model_selection = Some(model_sel);

            if !selected_features.is_empty() {
                let mut model_selected_X =
                    Array2::zeros((current_X.nrows(), selected_features.len()));
                for (new_col, &old_col) in selected_features.iter().enumerate() {
                    for row in 0..current_X.nrows() {
                        model_selected_X[[row, new_col]] = current_X[[row, old_col]];
                    }
                }
                current_X = model_selected_X;
            }

            trained_steps.push(TrainedStep {
                step_type: "ModelSelection".to_string(),
                step_index: 0,
                training_time: step_start.elapsed(),
                feature_count_before: features_before,
                feature_count_after: current_X.ncols(),
                parameters: StepParameters::ModelSelection(selected_features),
            });
        }

        // The bookkeeping below is computed for diagnostics but not yet stored
        // in the returned pipeline (hence the leading underscores).
        let final_features = current_X.ncols();
        let _feature_mapping = FeatureMapping {
            original_features,
            final_features,
            feature_names: (0..final_features)
                .map(|i| format!("feature_{}", i))
                .collect(),
            feature_origins: (0..final_features).map(FeatureOrigin::Original).collect(),
            transformation_history: trained_steps
                .iter()
                .map(|step| TransformationStep {
                    step_name: step.step_type.clone(),
                    input_features: step.feature_count_before,
                    output_features: step.feature_count_after,
                    transformation_type: TransformationType::ManyToMany,
                })
                .collect(),
        };

        let total_training_time = start_time.elapsed();
        let feature_reduction_ratio = final_features as f64 / original_features as f64;

        let _pipeline_metadata = PipelineMetadata {
            total_training_time,
            total_transform_time: Duration::from_secs(0),
            memory_usage_peak: 0,
            feature_reduction_ratio,
            performance_metrics: HashMap::new(),
            validation_results: None,
        };

        Ok(FeatureSelectionPipeline {
            preprocessing_steps: self.preprocessing_steps,
            feature_engineering_steps: self.feature_engineering_steps,
            selection_methods: self.selection_methods,
            dimensionality_reduction: self.dimensionality_reduction,
            model_selection: self.model_selection,
            pipeline_config: self.pipeline_config,
            optimization_config: self.optimization_config,
            _phantom: PhantomData::<Trained>,
        })
    }

    fn apply_preprocessing_step(
        &self,
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_preprocessing_step_static(step, X)
    }

    fn apply_preprocessing_step_static(
        step: &mut PreprocessingStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            PreprocessingStep::StandardScaler {
                config,
                trained_params,
            } => Self::apply_standard_scaler_static(config, trained_params, X),
            PreprocessingStep::RobustScaler {
                config,
                trained_params,
            } => Self::apply_robust_scaler_static(config, trained_params, X),
            PreprocessingStep::MinMaxScaler {
                config,
                trained_params,
            } => Self::apply_minmax_scaler_static(config, trained_params, X),
            // Remaining preprocessing steps are not yet implemented and pass
            // the data through unchanged.
            _ => Ok(X.to_owned()),
        }
    }

    fn apply_standard_scaler(
        &self,
        config: &StandardScalerConfig,
        trained_params: &mut Option<ScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_standard_scaler_static(config, trained_params, X)
    }

    fn apply_standard_scaler_static(
        config: &StandardScalerConfig,
        trained_params: &mut Option<ScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        // First call: estimate per-column mean and standard deviation.
        if trained_params.is_none() {
            let mut mean = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            if config.with_mean {
                for col in 0..X.ncols() {
                    mean[col] = X.column(col).mean().unwrap_or(0.0);
                }
            }

            if config.with_std {
                for col in 0..X.ncols() {
                    let column = X.column(col);
                    let variance = column.var(1.0);
                    scale[col] = variance.sqrt().max(1e-8);
                }
            }

            *trained_params = Some(ScalerParams { mean, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    if config.with_mean {
                        result[[row, col]] -= params.mean[col];
                    }
                    if config.with_std {
                        result[[row, col]] /= params.scale[col];
                    }
                }
            }
        }

        Ok(result)
    }

    fn apply_robust_scaler(
        &self,
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_robust_scaler_static(config, trained_params, X)
    }

    fn apply_robust_scaler_static(
        config: &RobustScalerConfig,
        trained_params: &mut Option<RobustScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if trained_params.is_none() {
            let mut center = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            for col in 0..X.ncols() {
                let mut column_data: Vec<f64> = X.column(col).to_vec();
                // `total_cmp` avoids the panic that `partial_cmp(..).unwrap()`
                // would hit on NaN input.
                column_data.sort_by(|a, b| a.total_cmp(b));

                let n = column_data.len();
                if config.with_centering {
                    center[col] = if n % 2 == 0 {
                        (column_data[n / 2 - 1] + column_data[n / 2]) / 2.0
                    } else {
                        column_data[n / 2]
                    };
                }

                if config.with_scaling {
                    let q1_idx = ((n - 1) as f64 * config.quantile_range.0) as usize;
                    let q3_idx = ((n - 1) as f64 * config.quantile_range.1) as usize;
                    let iqr = column_data[q3_idx] - column_data[q1_idx];
                    scale[col] = iqr.max(1e-8);
                }
            }

            *trained_params = Some(RobustScalerParams { center, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    if config.with_centering {
                        result[[row, col]] -= params.center[col];
                    }
                    if config.with_scaling {
                        result[[row, col]] /= params.scale[col];
                    }
                }
            }
        }

        Ok(result)
    }

    fn apply_minmax_scaler(
        &self,
        config: &MinMaxScalerConfig,
        trained_params: &mut Option<MinMaxScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        Self::apply_minmax_scaler_static(config, trained_params, X)
    }

    fn apply_minmax_scaler_static(
        config: &MinMaxScalerConfig,
        trained_params: &mut Option<MinMaxScalerParams>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if trained_params.is_none() {
            let mut min = Array1::zeros(X.ncols());
            let mut scale = Array1::ones(X.ncols());

            for col in 0..X.ncols() {
                let column = X.column(col);
                let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

                min[col] = col_min;
                let range = col_max - col_min;
                if range > 1e-8 {
                    scale[col] = (config.feature_range.1 - config.feature_range.0) / range;
                }
            }

            *trained_params = Some(MinMaxScalerParams { min, scale });
        }

        if let Some(ref params) = trained_params {
            for col in 0..X.ncols() {
                for row in 0..X.nrows() {
                    let scaled = (result[[row, col]] - params.min[col]) * params.scale[col]
                        + config.feature_range.0;
                    result[[row, col]] = if config.clip {
                        scaled
                            .max(config.feature_range.0)
                            .min(config.feature_range.1)
                    } else {
                        scaled
                    };
                }
            }
        }

        Ok(result)
    }

    // NOTE: `fit` calls this static placeholder, which passes the data
    // through unchanged; the instance method below implements the actual
    // feature engineering but is not yet wired into the static dispatch.
    fn apply_feature_engineering_step_static(
        _step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        Ok(X.to_owned())
    }

    fn apply_feature_engineering_step(
        &self,
        step: &mut FeatureEngineeringStep,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array2<f64>> {
        match step {
            FeatureEngineeringStep::PolynomialFeatures {
                degree,
                interaction_only,
                include_bias,
                feature_mapping,
            } => self.apply_polynomial_features(
                *degree,
                *interaction_only,
                *include_bias,
                feature_mapping,
                X,
            ),
            FeatureEngineeringStep::InteractionFeatures {
                max_pairs,
                threshold,
                feature_pairs,
            } => self.apply_interaction_features(*max_pairs, *threshold, feature_pairs, X),
            FeatureEngineeringStep::BinningFeatures {
                n_bins,
                strategy,
                bin_edges,
            } => self.apply_binning_features(*n_bins, strategy, bin_edges, X),
            // Remaining steps are not yet implemented.
            _ => Ok(X.to_owned()),
        }
    }

    fn apply_polynomial_features(
        &self,
        degree: usize,
        interaction_only: bool,
        include_bias: bool,
        feature_mapping: &mut Option<Vec<(usize, usize)>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_features = X.ncols();
        let mut new_features = Vec::new();
        let mut mapping = Vec::new();

        if include_bias {
            let bias_feature = Array1::ones(X.nrows());
            new_features.push(bias_feature);
            mapping.push((0, 0));
        }

        // Degree-1 terms: the original features.
        for i in 0..n_features {
            new_features.push(X.column(i).to_owned());
            mapping.push((i, 1));
        }

        // Pure powers x_i^d for d >= 2.
        if !interaction_only {
            for d in 2..=degree {
                for i in 0..n_features {
                    let mut poly_feature = Array1::zeros(X.nrows());
                    for row in 0..X.nrows() {
                        poly_feature[row] = X[[row, i]].powi(d as i32);
                    }
                    new_features.push(poly_feature);
                    mapping.push((i, d));
                }
            }
        }

        // Pairwise interaction terms x_i * x_j, generated once (the previous
        // loop repeated the same products for every degree).
        if degree >= 2 {
            for i in 0..n_features {
                for j in (i + 1)..n_features {
                    let mut interaction_feature = Array1::zeros(X.nrows());
                    for row in 0..X.nrows() {
                        interaction_feature[row] = X[[row, i]] * X[[row, j]];
                    }
                    new_features.push(interaction_feature);
                    mapping.push((i * n_features + j, 2));
                }
            }
        }

        *feature_mapping = Some(mapping);

        let n_new_features = new_features.len();
        let mut result = Array2::zeros((X.nrows(), n_new_features));
        for (col, feature) in new_features.iter().enumerate() {
            for row in 0..X.nrows() {
                result[[row, col]] = feature[row];
            }
        }

        Ok(result)
    }

    fn apply_interaction_features(
        &self,
        max_pairs: Option<usize>,
        threshold: f64,
        feature_pairs: &mut Option<Vec<(usize, usize)>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_features = X.ncols();
        let mut interactions = Vec::new();
        let pairs: Vec<(usize, usize)>;

        if feature_pairs.is_none() {
            // Rank candidate pairs by absolute correlation and keep the top ones.
            let mut candidate_pairs = Vec::new();
            for i in 0..n_features {
                for j in (i + 1)..n_features {
                    let corr = self.compute_correlation(X.column(i), X.column(j));
                    if corr.abs() > threshold {
                        candidate_pairs.push((i, j, corr.abs()));
                    }
                }
            }

            candidate_pairs.sort_by(|a, b| b.2.total_cmp(&a.2));
            let limit = max_pairs.unwrap_or(candidate_pairs.len());
            pairs = candidate_pairs
                .into_iter()
                .take(limit)
                .map(|(i, j, _)| (i, j))
                .collect();
            *feature_pairs = Some(pairs.clone());
        } else {
            pairs = feature_pairs.as_ref().unwrap().clone();
        }

        for &(i, j) in &pairs {
            let mut interaction = Array1::zeros(X.nrows());
            for row in 0..X.nrows() {
                interaction[row] = X[[row, i]] * X[[row, j]];
            }
            interactions.push(interaction);
        }

        // Output = original features followed by the interaction columns.
        let total_features = n_features + interactions.len();
        let mut result = Array2::zeros((X.nrows(), total_features));

        for col in 0..n_features {
            for row in 0..X.nrows() {
                result[[row, col]] = X[[row, col]];
            }
        }

        for (idx, interaction) in interactions.iter().enumerate() {
            for row in 0..X.nrows() {
                result[[row, n_features + idx]] = interaction[row];
            }
        }

        Ok(result)
    }

    fn apply_binning_features(
        &self,
        n_bins: usize,
        strategy: &BinningStrategy,
        bin_edges: &mut Option<HashMap<usize, Vec<f64>>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let mut result = X.to_owned();

        if bin_edges.is_none() {
            let mut edges_map = HashMap::new();

            for col in 0..X.ncols() {
                let column = X.column(col);
                let edges = match strategy {
                    BinningStrategy::Uniform => {
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                    BinningStrategy::Quantile => {
                        let mut sorted_values: Vec<f64> = column.to_vec();
                        sorted_values.sort_by(|a, b| a.total_cmp(b));
                        let n = sorted_values.len();
                        (0..=n_bins)
                            .map(|i| {
                                let quantile = i as f64 / n_bins as f64;
                                let idx = ((n - 1) as f64 * quantile) as usize;
                                sorted_values[idx]
                            })
                            .collect::<Vec<f64>>()
                    }
                    BinningStrategy::KMeans => {
                        // K-means binning is not implemented yet; fall back to
                        // uniform edges.
                        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                        let step = (max_val - min_val) / n_bins as f64;
                        (0..=n_bins)
                            .map(|i| min_val + i as f64 * step)
                            .collect::<Vec<f64>>()
                    }
                };
                edges_map.insert(col, edges);
            }

            *bin_edges = Some(edges_map);
        }

        if let Some(ref edges_map) = bin_edges {
            for col in 0..X.ncols() {
                if let Some(edges) = edges_map.get(&col) {
                    for row in 0..X.nrows() {
                        let value = X[[row, col]];
                        // A value in (edges[k], edges[k + 1]] belongs to bin k,
                        // so the index of the first matching edge is shifted
                        // down by one and clamped to the valid bin range.
                        let bin = edges
                            .iter()
                            .position(|&edge| value <= edge)
                            .map(|p| p.saturating_sub(1))
                            .unwrap_or(n_bins - 1)
                            .min(n_bins - 1);
                        result[[row, col]] = bin as f64;
                    }
                }
            }
        }

        Ok(result)
    }

    // NOTE: like the feature-engineering dispatch, `fit` calls this static
    // placeholder, which keeps every feature; the instance method below holds
    // the actual selection logic.
    fn apply_selection_method_static(
        _method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        _y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        Ok(Array1::from_elem(X.ncols(), true))
    }

    fn apply_selection_method(
        &self,
        method: &mut SelectionMethod,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        match method {
            SelectionMethod::VarianceThreshold {
                threshold,
                feature_variance,
            } => self.apply_variance_threshold(*threshold, feature_variance, X),
            SelectionMethod::CorrelationFilter {
                threshold,
                method: corr_method,
                correlation_matrix,
            } => self.apply_correlation_filter(*threshold, corr_method, correlation_matrix, X),
            SelectionMethod::UnivariateFilter {
                method: uni_method,
                k,
                score_func,
            } => self.apply_univariate_filter(uni_method, k, score_func, X, y),
            // Unimplemented methods keep all features.
            _ => Ok(Array1::from_elem(X.ncols(), true)),
        }
    }

    fn apply_variance_threshold(
        &self,
        threshold: f64,
        feature_variance: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array1<bool>> {
        if feature_variance.is_none() {
            let mut variances = Array1::zeros(X.ncols());
            for col in 0..X.ncols() {
                variances[col] = X.column(col).var(1.0);
            }
            *feature_variance = Some(variances);
        }

        let variances = feature_variance.as_ref().unwrap();
        let selection = variances.mapv(|v| v > threshold);
        Ok(selection)
    }

    fn apply_correlation_filter(
        &self,
        threshold: f64,
        corr_method: &CorrelationMethod,
        correlation_matrix: &mut Option<Array2<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array1<bool>> {
        if correlation_matrix.is_none() {
            let n_features = X.ncols();
            let mut corr_matrix = Array2::zeros((n_features, n_features));

            for i in 0..n_features {
                for j in 0..n_features {
                    if i == j {
                        corr_matrix[[i, j]] = 1.0;
                    } else {
                        let corr = match corr_method {
                            CorrelationMethod::Pearson => {
                                self.compute_correlation(X.column(i), X.column(j))
                            }
                            // Spearman and Kendall are not implemented yet;
                            // fall back to Pearson.
                            _ => self.compute_correlation(X.column(i), X.column(j)),
                        };
                        corr_matrix[[i, j]] = corr;
                    }
                }
            }
            *correlation_matrix = Some(corr_matrix);
        }

        let corr_matrix = correlation_matrix.as_ref().unwrap();
        let mut selection = Array1::from_elem(X.ncols(), true);

        // For each highly correlated pair that is still selected, drop the
        // feature with the lower variance.
        for i in 0..X.ncols() {
            for j in (i + 1)..X.ncols() {
                if corr_matrix[[i, j]].abs() > threshold && selection[i] && selection[j] {
                    let var_i = X.column(i).var(1.0);
                    let var_j = X.column(j).var(1.0);
                    if var_i < var_j {
                        selection[i] = false;
                    } else {
                        selection[j] = false;
                    }
                }
            }
        }

        Ok(selection)
    }

    fn apply_univariate_filter(
        &self,
        _method: &UnivariateMethod,
        k: &SelectionCount,
        score_func: &UnivariateScoreFunction,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Array1<bool>> {
        let mut scores = Array1::zeros(X.ncols());

        for col in 0..X.ncols() {
            scores[col] = match score_func {
                UnivariateScoreFunction::Chi2 => self.compute_chi2_score(X.column(col), y),
                UnivariateScoreFunction::FClassif => self.compute_f_score(X.column(col), y),
                UnivariateScoreFunction::MutualInfoClassif => {
                    self.compute_mutual_info(X.column(col), y)
                }
                // Regression variants fall back to absolute correlation.
                _ => self.compute_correlation(X.column(col), y).abs(),
            };
        }

        // Keep the `k_val` highest-scoring features.
        let top_k = |k_val: usize| {
            let mut indexed_scores: Vec<(usize, f64)> = scores
                .iter()
                .enumerate()
                .map(|(i, &score)| (i, score))
                .collect();
            indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

            let mut selection = Array1::from_elem(X.ncols(), false);
            for &(idx, _) in indexed_scores.iter().take(k_val) {
                selection[idx] = true;
            }
            selection
        };

        let selection = match k {
            SelectionCount::K(k_val) => top_k(*k_val),
            SelectionCount::Percentile(p) => {
                let k_val = ((X.ncols() as f64 * p / 100.0).round() as usize).max(1);
                top_k(k_val)
            }
            // FDR/FPR/FWER criteria are not implemented yet; default to the
            // top half of the features.
            _ => top_k(X.ncols() / 2),
        };

        Ok(selection)
    }

    fn apply_dimensionality_reduction(
        &self,
        reduction: &mut DimensionalityReductionStep,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        match reduction {
            DimensionalityReductionStep::PCA {
                n_components,
                whiten,
                svd_solver,
                components,
                explained_variance,
            } => self.apply_pca(
                *n_components,
                *whiten,
                svd_solver,
                components,
                explained_variance,
                X,
            ),
            DimensionalityReductionStep::TruncatedSVD {
                n_components,
                algorithm,
                components,
                singular_values,
            } => self.apply_truncated_svd(*n_components, algorithm, components, singular_values, X),
            // ICA, FactorAnalysis, UMAP, and t-SNE are not implemented yet;
            // fall back to truncating to the requested number of components.
            _ => {
                let n_comp = match reduction {
                    DimensionalityReductionStep::ICA { n_components, .. } => *n_components,
                    DimensionalityReductionStep::FactorAnalysis { n_components, .. } => {
                        *n_components
                    }
                    DimensionalityReductionStep::UMAP { n_components, .. } => *n_components,
                    DimensionalityReductionStep::TSNE { n_components, .. } => *n_components,
                    _ => X.ncols().min(50),
                };
                let final_components = n_comp.min(X.ncols());
                let mut result = Array2::zeros((X.nrows(), final_components));
                for col in 0..final_components {
                    for row in 0..X.nrows() {
                        result[[row, col]] = X[[row, col]];
                    }
                }
                Ok(result)
            }
        }
    }

    fn apply_pca(
        &self,
        n_components: usize,
        _whiten: bool,
        _svd_solver: &SVDSolver,
        components: &mut Option<Array2<f64>>,
        explained_variance: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_comp = n_components.min(X.ncols()).min(X.nrows());

        // Center each column.
        let mut centered_X = X.to_owned();
        let mut means = Array1::zeros(X.ncols());
        for col in 0..X.ncols() {
            means[col] = X.column(col).mean().unwrap_or(0.0);
            for row in 0..X.nrows() {
                centered_X[[row, col]] -= means[col];
            }
        }

        // Placeholder: no eigendecomposition is performed. Identity
        // "components" are stored and the projection simply keeps the first
        // `n_comp` centered columns.
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *explained_variance = Some(Array1::ones(n_comp));
        }

        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = centered_X[[row, col]];
            }
        }

        Ok(result)
    }

    fn apply_truncated_svd(
        &self,
        n_components: usize,
        _algorithm: &SVDAlgorithm,
        components: &mut Option<Array2<f64>>,
        singular_values: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
    ) -> Result<Array2<f64>> {
        let n_comp = n_components.min(X.ncols()).min(X.nrows());

        // Placeholder: no SVD is computed; the first `n_comp` columns are kept.
        if components.is_none() {
            *components = Some(Array2::eye(X.ncols()));
            *singular_values = Some(Array1::ones(n_comp));
        }

        let mut result = Array2::zeros((X.nrows(), n_comp));
        for col in 0..n_comp {
            for row in 0..X.nrows() {
                result[[row, col]] = X[[row, col]];
            }
        }

        Ok(result)
    }

    fn apply_model_selection(
        &self,
        model_selection: &mut ModelSelectionStep,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        match model_selection {
            ModelSelectionStep::CrossValidationSelection {
                estimator,
                cv_folds,
                scoring,
                feature_scores,
            } => self.apply_cv_selection(estimator, *cv_folds, scoring, feature_scores, X, y),
            ModelSelectionStep::ForwardSelection {
                estimator,
                max_features,
                scoring,
                selected_features,
            } => self.apply_forward_selection(
                estimator,
                *max_features,
                scoring,
                selected_features,
                X,
                y,
            ),
            // Unimplemented strategies keep every feature.
            _ => Ok((0..X.ncols()).collect()),
        }
    }

    fn apply_cv_selection(
        &self,
        _estimator: &ModelEstimator,
        _cv_folds: usize,
        _scoring: &ScoringMetric,
        feature_scores: &mut Option<Array1<f64>>,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        // Placeholder scoring: absolute correlation with the target stands in
        // for cross-validated model performance.
        if feature_scores.is_none() {
            let mut scores = Array1::zeros(X.ncols());
            for col in 0..X.ncols() {
                scores[col] = self.compute_correlation(X.column(col), y).abs();
            }
            *feature_scores = Some(scores);
        }

        if let Some(ref scores) = feature_scores {
            let mut indexed_scores: Vec<(usize, f64)> = scores
                .iter()
                .enumerate()
                .map(|(i, &score)| (i, score))
                .collect();
            indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

            // Keep the top half of the features.
            let n_select = X.ncols() / 2;
            Ok(indexed_scores
                .into_iter()
                .take(n_select)
                .map(|(idx, _)| idx)
                .collect())
        } else {
            Ok((0..X.ncols()).collect())
        }
    }

    fn apply_forward_selection(
        &self,
        _estimator: &ModelEstimator,
        max_features: usize,
        _scoring: &ScoringMetric,
        selected_features: &mut Option<Vec<usize>>,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> Result<Vec<usize>> {
        if selected_features.is_none() {
            // Placeholder: rank features once by absolute correlation with the
            // target rather than re-fitting a model at each step.
            let mut scores = Vec::new();
            for col in 0..X.ncols() {
                let score = self.compute_correlation(X.column(col), y).abs();
                scores.push((col, score));
            }

            scores.sort_by(|a, b| b.1.total_cmp(&a.1));
            let features: Vec<usize> = scores
                .into_iter()
                .take(max_features.min(X.ncols()))
                .map(|(idx, _)| idx)
                .collect();

            *selected_features = Some(features.clone());
            Ok(features)
        } else {
            Ok(selected_features.as_ref().unwrap().clone())
        }
    }

    /// Pearson correlation coefficient between two equal-length vectors.
    fn compute_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        let n = x.len() as f64;
        if n < 2.0 {
            return 0.0;
        }

        let mean_x = x.mean().unwrap_or(0.0);
        let mean_y = y.mean().unwrap_or(0.0);

        let mut sum_xy = 0.0;
        let mut sum_x2 = 0.0;
        let mut sum_y2 = 0.0;

        for i in 0..x.len() {
            let dx = x[i] - mean_x;
            let dy = y[i] - mean_y;
            sum_xy += dx * dy;
            sum_x2 += dx * dx;
            sum_y2 += dy * dy;
        }

        let denom = (sum_x2 * sum_y2).sqrt();
        if denom < 1e-10 {
            0.0
        } else {
            sum_xy / denom
        }
    }

    // The three scores below are placeholder implementations that proxy the
    // named statistic with absolute Pearson correlation.
    fn compute_chi2_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }

    fn compute_f_score(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }

    fn compute_mutual_info(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        self.compute_correlation(x, y).abs()
    }
}

impl FeatureSelectionPipeline<Trained> {
    pub fn transform(&self, X: ArrayView2<f64>) -> Result<Array2<f64>> {
        // Placeholder: the trained step parameters are not yet replayed here,
        // so the input is currently passed through unchanged.
        let _start_time = Instant::now();
        let current_X = X.to_owned();

        Ok(current_X)
    }

    pub fn get_pipeline_info(&self) -> PipelineInfo {
        PipelineInfo {
            n_preprocessing_steps: self.preprocessing_steps.len(),
            n_feature_engineering_steps: self.feature_engineering_steps.len(),
            n_selection_methods: self.selection_methods.len(),
            has_dimensionality_reduction: self.dimensionality_reduction.is_some(),
            has_model_selection: self.model_selection.is_some(),
            config: self.pipeline_config.clone(),
        }
    }
}

#[derive(Debug, Clone)]
pub struct PipelineInfo {
    pub n_preprocessing_steps: usize,
    pub n_feature_engineering_steps: usize,
    pub n_selection_methods: usize,
    pub has_dimensionality_reduction: bool,
    pub has_model_selection: bool,
    pub config: PipelineConfiguration,
}

impl Default for PipelineConfiguration {
    fn default() -> Self {
        Self {
            parallel_execution: true,
            memory_optimization: MemoryOptimization::Conservative,
            caching_strategy: CachingStrategy::LRU { size: 1000 },
            validation_strategy: ValidationStrategy::Basic,
            error_handling: ErrorHandling::Graceful,
            logging_level: LoggingLevel::Info,
        }
    }
}

impl Default for OptimizationConfiguration {
    fn default() -> Self {
        Self {
            use_simd: true,
            chunk_size: 1000,
            thread_pool_size: None,
            memory_pool_size: 1024 * 1024,
            cache_size: 100,
            prefetch_strategy: PrefetchStrategy::Sequential,
            vectorization_threshold: 1000,
        }
    }
}

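// Hedged usage sketch: overriding a couple of optimization defaults with
// struct-update syntax. The field values here are illustrative only.
#[allow(dead_code)]
fn example_optimization_overrides() -> OptimizationConfiguration {
    OptimizationConfiguration {
        use_simd: false,
        chunk_size: 4096,
        ..OptimizationConfiguration::default()
    }
}
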
impl Estimator for FeatureSelectionPipeline<Untrained> {
    type Config = ();
    type Error = SklearsError;
    type Float = f64;

    fn config(&self) -> &Self::Config {
        &()
    }
}

impl Fit<ArrayView2<'_, f64>, ArrayView1<'_, f64>> for FeatureSelectionPipeline<Untrained> {
    type Fitted = FeatureSelectionPipeline<Trained>;

    fn fit(self, X: &ArrayView2<'_, f64>, y: &ArrayView1<'_, f64>) -> Result<Self::Fitted> {
        self.fit(*X, *y)
    }
}

impl Transform<ArrayView2<'_, f64>, Array2<f64>> for FeatureSelectionPipeline<Trained> {
    fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
        self.transform(*X)
    }
}
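
// Minimal smoke tests sketching how the pipeline is exercised end to end.
// They are a hedged addition: they rely only on items defined in this module
// plus the `Array1`/`Array2` constructors imported above, and the expected
// values follow directly from the z-score logic in
// `apply_standard_scaler_static` and the pass-through `transform`.
#[cfg(test)]
mod pipeline_smoke_tests {
    use super::*;

    #[test]
    fn standard_scaler_centers_the_data() {
        let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let config = StandardScalerConfig {
            with_mean: true,
            with_std: true,
        };
        let mut params = None;
        let scaled = FeatureSelectionPipeline::<Untrained>::apply_standard_scaler_static(
            &config,
            &mut params,
            x.view(),
        )
        .unwrap();

        // After centering, the column mean should be ~0 and the fitted
        // parameters should be cached for reuse.
        let mean = scaled.column(0).mean().unwrap_or(f64::NAN);
        assert!(mean.abs() < 1e-9);
        assert!(params.is_some());
    }

    #[test]
    fn fit_then_transform_preserves_row_count() {
        let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0]);

        let trained = FeatureSelectionPipeline::new()
            .add_selection_method(SelectionMethod::VarianceThreshold {
                threshold: 0.0,
                feature_variance: None,
            })
            .fit(x.view(), y.view())
            .unwrap();

        let out = trained.transform(x.view()).unwrap();
        assert_eq!(out.nrows(), 3);
    }

    #[test]
    fn correlation_of_identical_vectors_is_one() {
        let pipeline = FeatureSelectionPipeline::new();
        let v = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let corr = pipeline.compute_correlation(v.view(), v.view());
        assert!((corr - 1.0).abs() < 1e-12);
    }
}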