sklears_feature_selection/automl/
preprocessing_integration.rs1use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
7
8use super::automl_core::DataCharacteristics;
9use sklears_core::error::Result as SklResult;
10
/// Module-local shorthand for the crate-wide result type.
type Result<T> = SklResult<T>;
12
/// Configuration for the data preprocessing pipeline run before
/// feature selection.
///
/// Built manually via the `with_*` builder methods or derived from
/// dataset statistics via `auto_configure`, then applied with
/// `preprocess_data`.
#[derive(Debug, Clone)]
pub struct PreprocessingIntegration {
    /// How feature columns are rescaled.
    scaler_type: ScalerType,
    /// How NaN entries are imputed.
    missing_value_strategy: MissingValueStrategy,
    /// How outliers are detected and mitigated.
    outlier_handling: OutlierHandling,
    /// Which derived features (if any) are generated.
    feature_engineering: FeatureEngineering,
    /// Optional projection to fewer dimensions, applied last.
    dimensionality_reduction: Option<DimensionalityReduction>,
}
22
/// Feature scaling strategies.
///
/// `StandardScaler`, `MinMaxScaler`, and `RobustScaler` are implemented
/// by `scale_features`; the quantile variants currently leave the data
/// unchanged (they fall into the catch-all arm).
#[derive(Debug, Clone, PartialEq)]
pub enum ScalerType {
    /// Zero-mean, unit-variance scaling per column.
    StandardScaler,
    /// Rescales each column to the [0, 1] range.
    MinMaxScaler,
    /// Centers on the median and scales by the IQR (robust to outliers).
    RobustScaler,
    /// Quantile transform to a uniform distribution (not yet implemented).
    QuantileUniform,
    /// Quantile transform to a normal distribution (not yet implemented).
    QuantileNormal,
    /// No scaling.
    None,
}
38
/// Strategies for imputing missing (NaN) values.
///
/// Only `Mean` and `Median` have dedicated implementations in
/// `handle_missing_values`; every other variant currently falls back to
/// mean imputation.
#[derive(Debug, Clone, PartialEq)]
pub enum MissingValueStrategy {
    /// Replace NaNs with the column mean.
    Mean,
    /// Replace NaNs with the column median.
    Median,
    /// Most-frequent-value imputation (currently falls back to mean).
    Mode,
    /// Forward-fill from the previous row (currently falls back to mean).
    Forward,
    /// Backward-fill from the next row (currently falls back to mean).
    Backward,
    /// Interpolate between neighbors (currently falls back to mean).
    Interpolation,
    /// Drop entries containing NaNs (currently falls back to mean).
    Remove,
    /// K-nearest-neighbors imputation with `k` neighbors
    /// (currently falls back to mean).
    KNN { k: usize },
}
58
/// Outlier detection and mitigation strategies.
///
/// Only `IQR` and `ZScore` are implemented by `handle_outliers`; the
/// remaining variants currently leave the data unchanged.
#[derive(Debug, Clone, PartialEq)]
pub enum OutlierHandling {
    /// Clamp values outside `[Q1 - multiplier*IQR, Q3 + multiplier*IQR]`.
    IQR {
        /// IQR multiplier (1.5 is the conventional Tukey fence).
        multiplier: f64,
    },
    /// Replace values whose absolute z-score exceeds `threshold` with
    /// the column mean.
    ZScore {
        /// Absolute z-score cutoff.
        threshold: f64,
    },
    /// Isolation-forest style detection (not yet implemented).
    Isolation,
    /// Local outlier factor with `k` neighbors (not yet implemented).
    LocalOutlierFactor {
        /// Number of neighbors to consider.
        k: usize,
    },
    /// No outlier handling.
    None,
}
78
/// Feature engineering transforms applied after scaling.
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureEngineering {
    /// Append polynomial powers of each feature
    /// (see `apply_feature_engineering`).
    Polynomial {
        /// Highest power to generate.
        degree: usize,
    },
    /// Pairwise feature interactions (not yet implemented).
    Interaction,
    /// Target encoding for categorical features (not yet implemented).
    TargetEncoding,
    /// Frequency encoding for categorical features (not yet implemented).
    FrequencyEncoding,
    /// Discretize continuous features into `bins` buckets
    /// (not yet implemented).
    BinDiscretization {
        /// Number of bins.
        bins: usize,
    },
    /// No feature engineering.
    None,
}
98
/// Dimensionality reduction techniques.
///
/// Only the `PCA` variant is handled by `apply_dimensionality_reduction`
/// (and only as a column-truncation placeholder there); the other
/// variants currently pass the data through unchanged.
#[derive(Debug, Clone, PartialEq)]
pub enum DimensionalityReduction {
    /// Principal component analysis keeping `n_components` components.
    PCA { n_components: usize },
    /// Independent component analysis (not yet implemented).
    ICA { n_components: usize },
    /// Truncated singular value decomposition (not yet implemented).
    TruncatedSVD { n_components: usize },
    /// Factor analysis (not yet implemented).
    FactorAnalysis { n_components: usize },
}
110
111impl PreprocessingIntegration {
112 pub fn new() -> Self {
113 Self {
114 scaler_type: ScalerType::StandardScaler,
115 missing_value_strategy: MissingValueStrategy::Mean,
116 outlier_handling: OutlierHandling::None,
117 feature_engineering: FeatureEngineering::None,
118 dimensionality_reduction: None,
119 }
120 }
121
122 pub fn with_scaler(mut self, scaler_type: ScalerType) -> Self {
123 self.scaler_type = scaler_type;
124 self
125 }
126
127 pub fn with_missing_value_strategy(mut self, strategy: MissingValueStrategy) -> Self {
128 self.missing_value_strategy = strategy;
129 self
130 }
131
132 pub fn with_outlier_handling(mut self, handling: OutlierHandling) -> Self {
133 self.outlier_handling = handling;
134 self
135 }
136
137 pub fn with_feature_engineering(mut self, engineering: FeatureEngineering) -> Self {
138 self.feature_engineering = engineering;
139 self
140 }
141
142 pub fn with_dimensionality_reduction(mut self, reduction: DimensionalityReduction) -> Self {
143 self.dimensionality_reduction = Some(reduction);
144 self
145 }
146
147 pub fn preprocess_data(
149 &self,
150 X: ArrayView2<f64>,
151 y: ArrayView1<f64>,
152 ) -> Result<(Array2<f64>, Array1<f64>)> {
153 let mut processed_X = X.to_owned();
154 let processed_y = y.to_owned();
155
156 processed_X = self.handle_missing_values(processed_X)?;
158
159 processed_X = self.handle_outliers(processed_X)?;
161
162 processed_X = self.scale_features(processed_X)?;
164
165 processed_X = self.apply_feature_engineering(processed_X)?;
167
168 if let Some(ref reduction) = self.dimensionality_reduction {
170 processed_X = self.apply_dimensionality_reduction(processed_X, reduction)?;
171 }
172
173 Ok((processed_X, processed_y))
174 }
175
176 pub fn auto_configure(characteristics: &DataCharacteristics) -> Self {
178 let mut config = Self::new();
179
180 config.scaler_type = if characteristics
182 .feature_variance_distribution
183 .iter()
184 .any(|&v| v > 1000.0)
185 {
186 ScalerType::RobustScaler
187 } else {
188 ScalerType::StandardScaler
189 };
190
191 config.missing_value_strategy = if characteristics.has_missing_values {
193 if characteristics.n_samples > 1000 {
194 MissingValueStrategy::KNN { k: 5 }
195 } else {
196 MissingValueStrategy::Mean
197 }
198 } else {
199 MissingValueStrategy::Mean };
201
202 config.outlier_handling = if characteristics.n_features > 100 {
204 OutlierHandling::IQR { multiplier: 1.5 }
205 } else {
206 OutlierHandling::None
207 };
208
209 config.feature_engineering =
211 if characteristics.n_features < 50 && characteristics.n_samples > 200 {
212 FeatureEngineering::Polynomial { degree: 2 }
213 } else {
214 FeatureEngineering::None
215 };
216
217 config.dimensionality_reduction = if characteristics.feature_to_sample_ratio > 2.0 {
219 Some(DimensionalityReduction::PCA {
220 n_components: (characteristics.n_samples / 2).min(100),
221 })
222 } else {
223 None
224 };
225
226 config
227 }
228
229 fn handle_missing_values(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
230 match &self.missing_value_strategy {
231 MissingValueStrategy::Mean => {
232 for col in 0..X.ncols() {
233 let mut column = X.column_mut(col);
234 let valid_values: Vec<f64> =
235 column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
236 if !valid_values.is_empty() {
237 let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
238 for val in column.iter_mut() {
239 if val.is_nan() {
240 *val = mean;
241 }
242 }
243 }
244 }
245 }
246 MissingValueStrategy::Median => {
247 for col in 0..X.ncols() {
248 let mut column = X.column_mut(col);
249 let mut valid_values: Vec<f64> =
250 column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
251 if !valid_values.is_empty() {
252 valid_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
253 let median = if valid_values.len() % 2 == 0 {
254 (valid_values[valid_values.len() / 2 - 1]
255 + valid_values[valid_values.len() / 2])
256 / 2.0
257 } else {
258 valid_values[valid_values.len() / 2]
259 };
260 for val in column.iter_mut() {
261 if val.is_nan() {
262 *val = median;
263 }
264 }
265 }
266 }
267 }
268 _ => {
270 return self.handle_missing_values_fallback(X);
272 }
273 }
274 Ok(X)
275 }
276
277 fn handle_missing_values_fallback(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
278 for col in 0..X.ncols() {
279 let mut column = X.column_mut(col);
280 let valid_values: Vec<f64> = column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
281 if !valid_values.is_empty() {
282 let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
283 for val in column.iter_mut() {
284 if val.is_nan() {
285 *val = mean;
286 }
287 }
288 }
289 }
290 Ok(X)
291 }
292
293 fn handle_outliers(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
294 match &self.outlier_handling {
295 OutlierHandling::IQR { multiplier } => {
296 for col in 0..X.ncols() {
297 let column = X.column(col);
298 let mut values: Vec<f64> = column.to_vec();
299 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
300
301 let q1_idx = values.len() / 4;
302 let q3_idx = 3 * values.len() / 4;
303 let q1 = values[q1_idx];
304 let q3 = values[q3_idx];
305 let iqr = q3 - q1;
306
307 let lower_bound = q1 - multiplier * iqr;
308 let upper_bound = q3 + multiplier * iqr;
309
310 for val in X.column_mut(col).iter_mut() {
312 if *val < lower_bound {
313 *val = lower_bound;
314 } else if *val > upper_bound {
315 *val = upper_bound;
316 }
317 }
318 }
319 }
320 OutlierHandling::ZScore { threshold } => {
321 for col in 0..X.ncols() {
322 let column = X.column(col);
323 let mean = column.mean().unwrap_or(0.0);
324 let std = column.std(1.0);
325
326 for val in X.column_mut(col).iter_mut() {
327 let z_score = (*val - mean) / std;
328 if z_score.abs() > *threshold {
329 *val = mean; }
331 }
332 }
333 }
334 _ => {
335 }
337 }
338 Ok(X)
339 }
340
341 fn scale_features(&self, mut X: Array2<f64>) -> Result<Array2<f64>> {
342 match &self.scaler_type {
343 ScalerType::StandardScaler => {
344 for col in 0..X.ncols() {
345 let column = X.column(col);
346 let mean = column.mean().unwrap_or(0.0);
347 let std = column.std(1.0);
348
349 if std > 1e-10 {
350 for val in X.column_mut(col).iter_mut() {
351 *val = (*val - mean) / std;
352 }
353 }
354 }
355 }
356 ScalerType::MinMaxScaler => {
357 for col in 0..X.ncols() {
358 let column = X.column(col);
359 let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
360 let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
361 let range = max_val - min_val;
362
363 if range > 1e-10 {
364 for val in X.column_mut(col).iter_mut() {
365 *val = (*val - min_val) / range;
366 }
367 }
368 }
369 }
370 ScalerType::RobustScaler => {
371 for col in 0..X.ncols() {
372 let mut values: Vec<f64> = X.column(col).to_vec();
373 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
374
375 let median = if values.len() % 2 == 0 {
376 (values[values.len() / 2 - 1] + values[values.len() / 2]) / 2.0
377 } else {
378 values[values.len() / 2]
379 };
380
381 let q1 = values[values.len() / 4];
382 let q3 = values[3 * values.len() / 4];
383 let iqr = q3 - q1;
384
385 if iqr > 1e-10 {
386 for val in X.column_mut(col).iter_mut() {
387 *val = (*val - median) / iqr;
388 }
389 }
390 }
391 }
392 _ => {
393 }
395 }
396 Ok(X)
397 }
398
399 fn apply_feature_engineering(&self, X: Array2<f64>) -> Result<Array2<f64>> {
400 match &self.feature_engineering {
401 FeatureEngineering::Polynomial { degree: 2 } => {
402 let mut new_X = Array2::zeros((X.nrows(), X.ncols() * 2));
404
405 for i in 0..X.nrows() {
407 for j in 0..X.ncols() {
408 new_X[[i, j]] = X[[i, j]];
409 }
410 }
411
412 for i in 0..X.nrows() {
414 for j in 0..X.ncols() {
415 new_X[[i, X.ncols() + j]] = X[[i, j]] * X[[i, j]];
416 }
417 }
418
419 Ok(new_X)
420 }
421 _ => Ok(X),
422 }
423 }
424
425 fn apply_dimensionality_reduction(
426 &self,
427 X: Array2<f64>,
428 reduction: &DimensionalityReduction,
429 ) -> Result<Array2<f64>> {
430 match reduction {
431 DimensionalityReduction::PCA { n_components } => {
432 let n_comp = (*n_components).min(X.ncols());
434 let mut reduced_X = Array2::zeros((X.nrows(), n_comp));
435
436 for i in 0..X.nrows() {
437 for j in 0..n_comp {
438 reduced_X[[i, j]] = X[[i, j]];
439 }
440 }
441
442 Ok(reduced_X)
443 }
444 _ => Ok(X), }
446 }
447}
448
449impl Default for PreprocessingIntegration {
450 fn default() -> Self {
451 Self::new()
452 }
453}