sklears_feature_selection/automl/
preprocessing_integration.rs

1//! Preprocessing Integration Module for AutoML Feature Selection
2//!
3//! Provides automated data preprocessing including scaling, missing value handling, and feature engineering.
4//! All implementations follow the SciRS2 policy using scirs2-core for numerical computations.
5
6use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
7
8use super::automl_core::DataCharacteristics;
9use sklears_core::error::Result as SklResult;
10
11type Result<T> = SklResult<T>;
12
/// Preprocessing integration for automated data preparation.
///
/// Bundles the configuration for the preprocessing pipeline. The steps are
/// applied by `preprocess_data` in this fixed order: missing-value handling,
/// outlier handling, scaling, feature engineering, dimensionality reduction.
#[derive(Debug, Clone)]
pub struct PreprocessingIntegration {
    /// Column-wise feature scaling method (pipeline step 3).
    scaler_type: ScalerType,
    /// NaN imputation strategy (pipeline step 1).
    missing_value_strategy: MissingValueStrategy,
    /// Outlier mitigation method (pipeline step 2).
    outlier_handling: OutlierHandling,
    /// Feature construction method (pipeline step 4).
    feature_engineering: FeatureEngineering,
    /// Optional final projection to fewer dimensions (pipeline step 5).
    dimensionality_reduction: Option<DimensionalityReduction>,
}
22
/// Column-wise feature scaling strategies (applied by `scale_features`).
#[derive(Debug, Clone, PartialEq)]
pub enum ScalerType {
    /// Zero-mean, unit-variance scaling: (x - mean) / std.
    StandardScaler,
    /// Rescale each feature to [0, 1]: (x - min) / (max - min).
    MinMaxScaler,
    /// Outlier-robust scaling: (x - median) / IQR.
    RobustScaler,
    /// Quantile transform to a uniform distribution.
    /// NOTE(review): no implementation in `scale_features`; currently a no-op.
    QuantileUniform,
    /// Quantile transform to a normal distribution.
    /// NOTE(review): no implementation in `scale_features`; currently a no-op.
    QuantileNormal,
    /// Leave features unscaled.
    None,
}
38
/// Strategies for imputing missing (NaN) entries, applied column-wise.
///
/// Only `Mean` and `Median` have dedicated implementations in
/// `handle_missing_values`; every other variant currently falls back to
/// mean imputation.
#[derive(Debug, Clone, PartialEq)]
pub enum MissingValueStrategy {
    /// Replace NaNs with the column mean.
    Mean,
    /// Replace NaNs with the column median.
    Median,
    /// Most-frequent-value imputation (currently falls back to mean).
    Mode,
    /// Forward fill from preceding rows (currently falls back to mean).
    Forward,
    /// Backward fill from following rows (currently falls back to mean).
    Backward,
    /// Interpolation between neighbouring values (currently falls back to mean).
    Interpolation,
    /// Removal of samples with missing values (currently falls back to mean).
    Remove,
    /// K-nearest-neighbour imputation with `k` neighbours
    /// (currently falls back to mean).
    KNN { k: usize },
}
58
/// How to mitigate outliers before scaling (applied by `handle_outliers`).
#[derive(Debug, Clone, PartialEq)]
pub enum OutlierHandling {
    /// Cap (winsorize) values outside [Q1 - multiplier*IQR, Q3 + multiplier*IQR];
    /// a `multiplier` of 1.5 is the conventional choice.
    IQR {
        multiplier: f64,
    },
    /// Replace values whose |z-score| exceeds `threshold` with the column mean.
    ZScore {
        threshold: f64,
    },
    /// Isolation-forest-based detection.
    /// NOTE(review): no implementation in `handle_outliers`; currently a no-op.
    Isolation,
    /// Local outlier factor with `k` neighbours.
    /// NOTE(review): no implementation in `handle_outliers`; currently a no-op.
    LocalOutlierFactor {
        k: usize,
    },
    /// Leave the data untouched.
    None,
}
78
/// Feature construction applied after scaling (see `apply_feature_engineering`
/// for which variants are actually implemented).
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureEngineering {
    /// Polynomial feature expansion up to `degree`.
    Polynomial {
        degree: usize,
    },
    /// Pairwise feature interactions.
    /// NOTE(review): no implementation; currently a no-op.
    Interaction,
    /// Target (mean) encoding of categorical features.
    /// NOTE(review): no implementation; currently a no-op.
    TargetEncoding,
    /// Frequency encoding of categorical features.
    /// NOTE(review): no implementation; currently a no-op.
    FrequencyEncoding,
    /// Discretization of continuous features into `bins` bins.
    /// NOTE(review): no implementation; currently a no-op.
    BinDiscretization {
        bins: usize,
    },
    /// No feature engineering.
    None,
}
98
/// Dimensionality-reduction methods applied as the final preprocessing step.
///
/// NOTE(review): `apply_dimensionality_reduction` currently implements only a
/// simplified PCA (plain column truncation); all other variants are no-ops.
#[derive(Debug, Clone, PartialEq)]
pub enum DimensionalityReduction {
    /// Principal component analysis keeping `n_components` components.
    PCA { n_components: usize },
    /// Independent component analysis keeping `n_components` components.
    ICA { n_components: usize },
    /// Truncated singular value decomposition keeping `n_components` components.
    TruncatedSVD { n_components: usize },
    /// Factor analysis keeping `n_components` components.
    FactorAnalysis { n_components: usize },
}
110
111impl PreprocessingIntegration {
112    pub fn new() -> Self {
113        Self {
114            scaler_type: ScalerType::StandardScaler,
115            missing_value_strategy: MissingValueStrategy::Mean,
116            outlier_handling: OutlierHandling::None,
117            feature_engineering: FeatureEngineering::None,
118            dimensionality_reduction: None,
119        }
120    }
121
122    pub fn with_scaler(mut self, scaler_type: ScalerType) -> Self {
123        self.scaler_type = scaler_type;
124        self
125    }
126
127    pub fn with_missing_value_strategy(mut self, strategy: MissingValueStrategy) -> Self {
128        self.missing_value_strategy = strategy;
129        self
130    }
131
132    pub fn with_outlier_handling(mut self, handling: OutlierHandling) -> Self {
133        self.outlier_handling = handling;
134        self
135    }
136
137    pub fn with_feature_engineering(mut self, engineering: FeatureEngineering) -> Self {
138        self.feature_engineering = engineering;
139        self
140    }
141
142    pub fn with_dimensionality_reduction(mut self, reduction: DimensionalityReduction) -> Self {
143        self.dimensionality_reduction = Some(reduction);
144        self
145    }
146
147    /// Apply preprocessing to the data
148    pub fn preprocess_data(
149        &self,
150        X: ArrayView2<f64>,
151        y: ArrayView1<f64>,
152    ) -> Result<(Array2<f64>, Array1<f64>)> {
153        let mut processed_X = X.to_owned();
154        let processed_y = y.to_owned();
155
156        // Step 1: Handle missing values
157        processed_X = self.handle_missing_values(processed_X)?;
158
159        // Step 2: Handle outliers
160        processed_X = self.handle_outliers(processed_X)?;
161
162        // Step 3: Scale features
163        processed_X = self.scale_features(processed_X)?;
164
165        // Step 4: Feature engineering
166        processed_X = self.apply_feature_engineering(processed_X)?;
167
168        // Step 5: Dimensionality reduction (if specified)
169        if let Some(ref reduction) = self.dimensionality_reduction {
170            processed_X = self.apply_dimensionality_reduction(processed_X, reduction)?;
171        }
172
173        Ok((processed_X, processed_y))
174    }
175
176    /// Auto-configure preprocessing based on data characteristics
177    pub fn auto_configure(characteristics: &DataCharacteristics) -> Self {
178        let mut config = Self::new();
179
180        // Choose scaler based on data properties
181        config.scaler_type = if characteristics
182            .feature_variance_distribution
183            .iter()
184            .any(|&v| v > 1000.0)
185        {
186            ScalerType::RobustScaler
187        } else {
188            ScalerType::StandardScaler
189        };
190
191        // Choose missing value strategy
192        config.missing_value_strategy = if characteristics.has_missing_values {
193            if characteristics.n_samples > 1000 {
194                MissingValueStrategy::KNN { k: 5 }
195            } else {
196                MissingValueStrategy::Mean
197            }
198        } else {
199            MissingValueStrategy::Mean // No missing values, strategy doesn't matter
200        };
201
202        // Configure outlier handling for high-dimensional data
203        config.outlier_handling = if characteristics.n_features > 100 {
204            OutlierHandling::IQR { multiplier: 1.5 }
205        } else {
206            OutlierHandling::None
207        };
208
209        // Feature engineering for small datasets
210        config.feature_engineering =
211            if characteristics.n_features < 50 && characteristics.n_samples > 200 {
212                FeatureEngineering::Polynomial { degree: 2 }
213            } else {
214                FeatureEngineering::None
215            };
216
217        // Dimensionality reduction for high-dimensional data
218        config.dimensionality_reduction = if characteristics.feature_to_sample_ratio > 2.0 {
219            Some(DimensionalityReduction::PCA {
220                n_components: (characteristics.n_samples / 2).min(100),
221            })
222        } else {
223            None
224        };
225
226        config
227    }
228
229    fn handle_missing_values(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
230        match &self.missing_value_strategy {
231            MissingValueStrategy::Mean => {
232                for col in 0..X.ncols() {
233                    let mut column = X.column_mut(col);
234                    let valid_values: Vec<f64> =
235                        column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
236                    if !valid_values.is_empty() {
237                        let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
238                        for val in column.iter_mut() {
239                            if val.is_nan() {
240                                *val = mean;
241                            }
242                        }
243                    }
244                }
245            }
246            MissingValueStrategy::Median => {
247                for col in 0..X.ncols() {
248                    let mut column = X.column_mut(col);
249                    let mut valid_values: Vec<f64> =
250                        column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
251                    if !valid_values.is_empty() {
252                        valid_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
253                        let median = if valid_values.len() % 2 == 0 {
254                            (valid_values[valid_values.len() / 2 - 1]
255                                + valid_values[valid_values.len() / 2])
256                                / 2.0
257                        } else {
258                            valid_values[valid_values.len() / 2]
259                        };
260                        for val in column.iter_mut() {
261                            if val.is_nan() {
262                                *val = median;
263                            }
264                        }
265                    }
266                }
267            }
268            // Simplified implementations for other strategies
269            _ => {
270                // For other strategies, use mean as fallback
271                return self.handle_missing_values_fallback(X);
272            }
273        }
274        Ok(X)
275    }
276
277    fn handle_missing_values_fallback(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
278        for col in 0..X.ncols() {
279            let mut column = X.column_mut(col);
280            let valid_values: Vec<f64> = column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
281            if !valid_values.is_empty() {
282                let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
283                for val in column.iter_mut() {
284                    if val.is_nan() {
285                        *val = mean;
286                    }
287                }
288            }
289        }
290        Ok(X)
291    }
292
293    fn handle_outliers(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
294        match &self.outlier_handling {
295            OutlierHandling::IQR { multiplier } => {
296                for col in 0..X.ncols() {
297                    let column = X.column(col);
298                    let mut values: Vec<f64> = column.to_vec();
299                    values.sort_by(|a, b| a.partial_cmp(b).unwrap());
300
301                    let q1_idx = values.len() / 4;
302                    let q3_idx = 3 * values.len() / 4;
303                    let q1 = values[q1_idx];
304                    let q3 = values[q3_idx];
305                    let iqr = q3 - q1;
306
307                    let lower_bound = q1 - multiplier * iqr;
308                    let upper_bound = q3 + multiplier * iqr;
309
310                    // Cap outliers
311                    for val in X.column_mut(col).iter_mut() {
312                        if *val < lower_bound {
313                            *val = lower_bound;
314                        } else if *val > upper_bound {
315                            *val = upper_bound;
316                        }
317                    }
318                }
319            }
320            OutlierHandling::ZScore { threshold } => {
321                for col in 0..X.ncols() {
322                    let column = X.column(col);
323                    let mean = column.mean().unwrap_or(0.0);
324                    let std = column.std(1.0);
325
326                    for val in X.column_mut(col).iter_mut() {
327                        let z_score = (*val - mean) / std;
328                        if z_score.abs() > *threshold {
329                            *val = mean; // Replace outliers with mean
330                        }
331                    }
332                }
333            }
334            _ => {
335                // No outlier handling
336            }
337        }
338        Ok(X)
339    }
340
341    fn scale_features(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
342        match &self.scaler_type {
343            ScalerType::StandardScaler => {
344                for col in 0..X.ncols() {
345                    let column = X.column(col);
346                    let mean = column.mean().unwrap_or(0.0);
347                    let std = column.std(1.0);
348
349                    if std > 1e-10 {
350                        for val in X.column_mut(col).iter_mut() {
351                            *val = (*val - mean) / std;
352                        }
353                    }
354                }
355            }
356            ScalerType::MinMaxScaler => {
357                for col in 0..X.ncols() {
358                    let column = X.column(col);
359                    let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
360                    let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
361                    let range = max_val - min_val;
362
363                    if range > 1e-10 {
364                        for val in X.column_mut(col).iter_mut() {
365                            *val = (*val - min_val) / range;
366                        }
367                    }
368                }
369            }
370            ScalerType::RobustScaler => {
371                for col in 0..X.ncols() {
372                    let mut values: Vec<f64> = X.column(col).to_vec();
373                    values.sort_by(|a, b| a.partial_cmp(b).unwrap());
374
375                    let median = if values.len() % 2 == 0 {
376                        (values[values.len() / 2 - 1] + values[values.len() / 2]) / 2.0
377                    } else {
378                        values[values.len() / 2]
379                    };
380
381                    let q1 = values[values.len() / 4];
382                    let q3 = values[3 * values.len() / 4];
383                    let iqr = q3 - q1;
384
385                    if iqr > 1e-10 {
386                        for val in X.column_mut(col).iter_mut() {
387                            *val = (*val - median) / iqr;
388                        }
389                    }
390                }
391            }
392            _ => {
393                // No scaling
394            }
395        }
396        Ok(X)
397    }
398
399    fn apply_feature_engineering(&self, X: Array2<f64>) -> Result<Array2<f64>> {
400        match &self.feature_engineering {
401            FeatureEngineering::Polynomial { degree: 2 } => {
402                // Simple quadratic features (x^2 for each feature)
403                let mut new_X = Array2::zeros((X.nrows(), X.ncols() * 2));
404
405                // Original features
406                for i in 0..X.nrows() {
407                    for j in 0..X.ncols() {
408                        new_X[[i, j]] = X[[i, j]];
409                    }
410                }
411
412                // Squared features
413                for i in 0..X.nrows() {
414                    for j in 0..X.ncols() {
415                        new_X[[i, X.ncols() + j]] = X[[i, j]] * X[[i, j]];
416                    }
417                }
418
419                Ok(new_X)
420            }
421            _ => Ok(X),
422        }
423    }
424
425    fn apply_dimensionality_reduction(
426        &self,
427        X: Array2<f64>,
428        reduction: &DimensionalityReduction,
429    ) -> Result<Array2<f64>> {
430        match reduction {
431            DimensionalityReduction::PCA { n_components } => {
432                // Simplified PCA implementation - just select first n_components features
433                let n_comp = (*n_components).min(X.ncols());
434                let mut reduced_X = Array2::zeros((X.nrows(), n_comp));
435
436                for i in 0..X.nrows() {
437                    for j in 0..n_comp {
438                        reduced_X[[i, j]] = X[[i, j]];
439                    }
440                }
441
442                Ok(reduced_X)
443            }
444            _ => Ok(X), // Other reduction methods not implemented
445        }
446    }
447}
448
impl Default for PreprocessingIntegration {
    /// Same configuration as [`PreprocessingIntegration::new`].
    fn default() -> Self {
        Self::new()
    }
}