sklears_preprocessing/
automated_feature_engineering.rs

1//! Automated Feature Engineering
2//!
3//! This module provides automated feature engineering capabilities that can automatically
4//! generate new features from existing ones to improve model performance.
5//!
6//! # Features
7//!
8//! - **Feature Generation**: Automatically create polynomial, interaction, and transformation features
9//! - **Feature Selection**: Select the most relevant features using various scoring methods
10//! - **Feature Transformation**: Apply mathematical transformations to discover hidden patterns
11//! - **Feature Importance**: Rank features by their predictive power
12//! - **Domain-Specific Engineering**: Apply domain knowledge for specific feature types
13//!
14//! # Examples
15//!
16//! ```rust,ignore
17//! use sklears_preprocessing::automated_feature_engineering::{
18//!     AutoFeatureEngineer, AutoFeatureConfig, GenerationStrategy
19//! };
//! use scirs2_core::ndarray::{Array1, Array2};
21//!
22//! fn example() -> Result<(), Box<dyn std::error::Error>> {
23//!     let config = AutoFeatureConfig::new()
24//!         .with_strategy(GenerationStrategy::Polynomial { degree: 2 })
25//!         .with_max_features(100)
26//!         .with_selection_threshold(0.01);
27//!     
28//!     let mut engineer = AutoFeatureEngineer::new(config);
29//!     
30//!     let data = Array2::from_shape_vec((100, 5), (0..500).map(|x| x as f64).collect())?;
31//!     let target = Array1::from_vec((0..100).map(|x| (x % 2) as f64).collect());
32//!     
33//!     let engineer_fitted = engineer.fit(&data, &target)?;
34//!     let transformed = engineer_fitted.transform(&data)?;
35//!     
36//!     println!("Original features: {}", data.ncols());
37//!     println!("Generated features: {}", transformed.ncols());
38//!     
39//!     Ok(())
40//! }
41//! ```
42
43use scirs2_core::ndarray::{Array1, Array2};
44use sklears_core::{
45    error::{Result, SklearsError},
46    traits::{Fit, Transform, Untrained},
47};
48use std::collections::{HashMap, HashSet};
49use std::marker::PhantomData;
50
51#[cfg(feature = "serde")]
52use serde::{Deserialize, Serialize};
53
/// Configuration for automated feature engineering.
///
/// Build via [`AutoFeatureConfig::new`] (equivalent to [`Default`]) and
/// customize with the `with_*` builder methods.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct AutoFeatureConfig {
    /// Feature generation strategies applied during fitting
    /// (default: polynomial degree 2, log1p/sqrt/square, pairwise interactions)
    pub strategies: Vec<GenerationStrategy>,
    /// Maximum number of features to keep after selection (default: 200)
    pub max_features: usize,
    /// Scoring method used to rank candidate features
    pub selection_method: SelectionMethod,
    /// Minimum absolute score a feature must reach to be selected (default: 0.01)
    pub selection_threshold: f64,
    /// Whether to carry the untouched input columns through (default: true)
    pub include_original: bool,
    /// Random state for reproducible results (None = non-deterministic)
    pub random_state: Option<u64>,
    /// Maximum interaction depth (default: 2, i.e. pairwise)
    pub max_interaction_depth: usize,
    /// Whether to drop highly correlated generated features (default: true)
    pub remove_correlated: bool,
    /// Absolute correlation above which a feature is dropped (default: 0.95)
    pub correlation_threshold: f64,
    /// Whether to scale features before scoring/selection (default: true)
    pub scale_features: bool,
}
79
/// Feature generation strategies.
///
/// Each variant maps to one generator in `AutoFeatureEngineer`; a config may
/// list several strategies, and their outputs are concatenated during fitting.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum GenerationStrategy {
    /// Per-column powers x^2 ..= x^degree (degree 1 is the column itself)
    Polynomial { degree: usize },
    /// Apply each listed math function to every column
    Mathematical { functions: Vec<MathFunction> },
    /// Multiplicative interactions: pairwise at depth 2, three-way at depth 3
    Interactions { max_depth: usize },
    /// Equal-width binning of each column into `n_bins` bins
    Binning { n_bins: usize },
    /// All ordered pairwise ratios between distinct columns
    Ratios,
    /// Rolling-window statistics over the sample axis (for ordered data)
    Aggregations { window_size: usize },
    /// Replace values with their per-column occurrence counts
    FrequencyEncoding,
    /// Domain-specific feature generators (see [`Domain`])
    DomainSpecific { domain: Domain },
}
101
/// Mathematical functions for feature transformation.
#[derive(Debug, Clone, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum MathFunction {
    /// Natural logarithm ln(x)
    Log,
    /// ln(1 + x); numerically stable near zero and defined for x > -1
    Log1p,
    /// Square root
    Sqrt,
    /// x squared
    Square,
    /// e^x
    Exp,
    /// Sine
    Sin,
    /// Cosine
    Cos,
    /// Tangent
    Tan,
    /// Absolute value (also used elsewhere as an identity placeholder)
    Abs,
    /// 1 / x
    Reciprocal,
}
117
/// Domain-specific feature engineering.
///
/// Selects which specialized generator handles
/// [`GenerationStrategy::DomainSpecific`]. In the current implementation the
/// Financial, Text, and Image variants delegate to the Generic generator.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Domain {
    /// Time series domain (first differences, etc.)
    TimeSeries,
    /// Financial domain (placeholder; currently generic statistics)
    Financial,
    /// Text domain for numerical text features (placeholder; currently generic)
    Text,
    /// Image domain for flattened image data (placeholder; currently generic)
    Image,
    /// Generic numerical domain: row-wise summary statistics
    Generic,
}
133
/// Feature selection methods.
///
/// Only MutualInformation, Correlation, and Variance have dedicated scorers
/// in this module; the remaining variants currently fall back to
/// correlation-based scoring (see `calculate_feature_scores`).
#[derive(Debug, Clone, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum SelectionMethod {
    /// Mutual information-based selection
    MutualInformation,
    /// Correlation-based selection
    Correlation,
    /// Variance-based selection
    Variance,
    /// Chi-squared test (not yet implemented; falls back to correlation)
    ChiSquared,
    /// F-test (not yet implemented; falls back to correlation)
    FTest,
    /// Recursive feature elimination (not yet implemented; falls back to correlation)
    RecursiveElimination,
    /// LASSO-based selection (not yet implemented; falls back to correlation)
    LASSO,
}
153
154impl Default for AutoFeatureConfig {
155    fn default() -> Self {
156        Self {
157            strategies: vec![
158                GenerationStrategy::Polynomial { degree: 2 },
159                GenerationStrategy::Mathematical {
160                    functions: vec![
161                        MathFunction::Log1p,
162                        MathFunction::Sqrt,
163                        MathFunction::Square,
164                    ],
165                },
166                GenerationStrategy::Interactions { max_depth: 2 },
167            ],
168            max_features: 200,
169            selection_method: SelectionMethod::MutualInformation,
170            selection_threshold: 0.01,
171            include_original: true,
172            random_state: None,
173            max_interaction_depth: 2,
174            remove_correlated: true,
175            correlation_threshold: 0.95,
176            scale_features: true,
177        }
178    }
179}
180
181impl AutoFeatureConfig {
182    /// Create a new configuration
183    pub fn new() -> Self {
184        Self::default()
185    }
186
187    /// Add a generation strategy
188    pub fn with_strategy(mut self, strategy: GenerationStrategy) -> Self {
189        self.strategies.push(strategy);
190        self
191    }
192
193    /// Set maximum number of features
194    pub fn with_max_features(mut self, max_features: usize) -> Self {
195        self.max_features = max_features;
196        self
197    }
198
199    /// Set selection method
200    pub fn with_selection_method(mut self, method: SelectionMethod) -> Self {
201        self.selection_method = method;
202        self
203    }
204
205    /// Set selection threshold
206    pub fn with_selection_threshold(mut self, threshold: f64) -> Self {
207        self.selection_threshold = threshold;
208        self
209    }
210
211    /// Set whether to include original features
212    pub fn with_include_original(mut self, include: bool) -> Self {
213        self.include_original = include;
214        self
215    }
216
217    /// Set random state
218    pub fn with_random_state(mut self, seed: u64) -> Self {
219        self.random_state = Some(seed);
220        self
221    }
222}
223
/// Automated feature engineering transformer.
///
/// `State` is a typestate marker (defaults to `Untrained`); fitting consumes
/// this value and yields an [`AutoFeatureEngineerFitted`].
pub struct AutoFeatureEngineer<State = Untrained> {
    /// Configuration driving generation and selection
    config: AutoFeatureConfig,
    /// Zero-sized typestate marker; carries no runtime data
    state: PhantomData<State>,
}
229
/// Fitted automated feature engineer.
///
/// Produced by fitting an [`AutoFeatureEngineer`]; holds everything needed to
/// replay the learned feature pipeline and to inspect feature quality.
pub struct AutoFeatureEngineerFitted {
    /// Configuration the engineer was fitted with
    config: AutoFeatureConfig,
    /// Indices (into the post-filtering feature matrix) of the selected features
    selected_features: Vec<usize>,
    /// Human-readable names of the features that survived filtering
    feature_names: Vec<String>,
    /// Selection scores, one per surviving feature
    feature_scores: Vec<f64>,
    /// Replayable descriptions of how each surviving feature is computed
    transformation_functions: Vec<TransformationFunction>,
    /// Number of columns in the training input
    n_original_features: usize,
    /// Importance values computed for the selected features
    feature_importance: Vec<f64>,
    /// Pairwise feature correlations; None when the matrix exceeds 1000 columns
    correlation_matrix: Option<Array2<f64>>,
}
241
/// Represents a transformation function for feature generation.
///
/// Stored by the fitted engineer so the same derivation can be replayed on
/// new data at transform time.
#[derive(Debug, Clone)]
pub struct TransformationFunction {
    /// Unique, human-readable feature name (e.g. "poly_0_2")
    pub name: String,
    /// Which kind of transformation to apply
    pub function_type: TransformationType,
    /// Column indices of the input matrix this feature is derived from
    pub input_indices: Vec<usize>,
    /// Extra named parameters; unused by the generators visible in this file
    pub parameters: HashMap<String, f64>,
}
250
/// Types of transformations.
#[derive(Debug, Clone)]
pub enum TransformationType {
    /// Raise the input column to the given power
    Polynomial { degree: usize },
    /// Apply a single mathematical function element-wise
    Mathematical { function: MathFunction },
    /// Multiplicative interaction of the input columns
    Interaction,
    /// Discretize using the stored bin edges
    Binning { bins: Vec<f64> },
    /// Element-wise ratio of two input columns
    Ratio,
    /// Rolling-window aggregation over the sample axis
    Aggregation { window_size: usize },
    /// Value → occurrence-count lookup (keys are stringified rounded values)
    FrequencyEncoding { mapping: HashMap<String, f64> },
}
262
263impl AutoFeatureEngineer<Untrained> {
264    /// Create a new automated feature engineer
265    pub fn new(config: AutoFeatureConfig) -> Self {
266        Self {
267            config,
268            state: PhantomData,
269        }
270    }
271
272    /// Get the configuration
273    pub fn config(&self) -> &AutoFeatureConfig {
274        &self.config
275    }
276}
277
278impl Fit<Array2<f64>, Array1<f64>> for AutoFeatureEngineer<Untrained> {
279    type Fitted = AutoFeatureEngineerFitted;
280
281    fn fit(self, x: &Array2<f64>, y: &Array1<f64>) -> Result<AutoFeatureEngineerFitted> {
282        if x.is_empty() || y.is_empty() {
283            return Err(SklearsError::InvalidInput(
284                "Input arrays cannot be empty".to_string(),
285            ));
286        }
287
288        let (n_samples, n_features) = x.dim();
289        if y.len() != n_samples {
290            return Err(SklearsError::InvalidInput(
291                "X and y must have the same number of samples".to_string(),
292            ));
293        }
294
295        // Step 1: Generate features according to strategies
296        let mut generated_features = Vec::new();
297        let mut transformation_functions = Vec::new();
298        let mut feature_names = Vec::new();
299
300        // Add original features if requested
301        if self.config.include_original {
302            for i in 0..n_features {
303                feature_names.push(format!("original_{}", i));
304                generated_features.push(x.column(i).to_owned());
305                transformation_functions.push(TransformationFunction {
306                    name: format!("original_{}", i),
307                    function_type: TransformationType::Mathematical {
308                        function: MathFunction::Abs,
309                    }, // Identity placeholder
310                    input_indices: vec![i],
311                    parameters: HashMap::new(),
312                });
313            }
314        }
315
316        // Generate features for each strategy
317        for strategy in &self.config.strategies {
318            let (strategy_features, strategy_transforms, strategy_names) =
319                self.generate_features_for_strategy(x, strategy)?;
320
321            generated_features.extend(strategy_features);
322            transformation_functions.extend(strategy_transforms);
323            feature_names.extend(strategy_names);
324        }
325
326        // Step 2: Create feature matrix
327        let n_generated = generated_features.len();
328        if n_generated == 0 {
329            return Err(SklearsError::InvalidInput(
330                "No features were generated".to_string(),
331            ));
332        }
333
334        let mut feature_matrix = Array2::zeros((n_samples, n_generated));
335        for (i, feature) in generated_features.iter().enumerate() {
336            for (j, &value) in feature.iter().enumerate() {
337                feature_matrix[[j, i]] = value;
338            }
339        }
340
341        // Step 3: Scale features if requested
342        let feature_matrix = if self.config.scale_features {
343            scale_features(&feature_matrix)?
344        } else {
345            feature_matrix
346        };
347
348        // Step 4: Remove highly correlated features
349        let (feature_matrix, feature_indices) = if self.config.remove_correlated {
350            remove_correlated_features(&feature_matrix, self.config.correlation_threshold)?
351        } else {
352            let indices: Vec<usize> = (0..n_generated).collect();
353            (feature_matrix, indices)
354        };
355
356        // Update transformation functions and names based on remaining features
357        let mut filtered_transforms = Vec::new();
358        let mut filtered_names = Vec::new();
359        for &idx in &feature_indices {
360            if idx < transformation_functions.len() {
361                filtered_transforms.push(transformation_functions[idx].clone());
362                filtered_names.push(feature_names[idx].clone());
363            }
364        }
365
366        // Step 5: Calculate feature scores
367        let feature_scores = self.calculate_feature_scores(&feature_matrix, y)?;
368
369        // Step 6: Select top features
370        let selected_features = self.select_features(&feature_scores)?;
371
372        // Step 7: Calculate feature importance
373        let feature_importance =
374            self.calculate_feature_importance(&feature_matrix, y, &selected_features)?;
375
376        // Step 8: Calculate correlation matrix for analysis
377        let correlation_matrix = if feature_matrix.ncols() <= 1000 {
378            // Only for reasonably sized matrices
379            Some(calculate_correlation_matrix(&feature_matrix)?)
380        } else {
381            None
382        };
383
384        Ok(AutoFeatureEngineerFitted {
385            config: self.config,
386            selected_features,
387            feature_names: filtered_names,
388            feature_scores,
389            transformation_functions: filtered_transforms,
390            n_original_features: n_features,
391            feature_importance,
392            correlation_matrix,
393        })
394    }
395}
396
397impl AutoFeatureEngineer<Untrained> {
398    /// Generate features for a specific strategy
399    fn generate_features_for_strategy(
400        &self,
401        x: &Array2<f64>,
402        strategy: &GenerationStrategy,
403    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
404        match strategy {
405            GenerationStrategy::Polynomial { degree } => {
406                self.generate_polynomial_features(x, *degree)
407            }
408            GenerationStrategy::Mathematical { functions } => {
409                self.generate_mathematical_features(x, functions)
410            }
411            GenerationStrategy::Interactions { max_depth } => {
412                self.generate_interaction_features(x, *max_depth)
413            }
414            GenerationStrategy::Binning { n_bins } => self.generate_binning_features(x, *n_bins),
415            GenerationStrategy::Ratios => self.generate_ratio_features(x),
416            GenerationStrategy::Aggregations { window_size } => {
417                self.generate_aggregation_features(x, *window_size)
418            }
419            GenerationStrategy::FrequencyEncoding => self.generate_frequency_encoding_features(x),
420            GenerationStrategy::DomainSpecific { domain } => {
421                self.generate_domain_specific_features(x, domain)
422            }
423        }
424    }
425
426    /// Generate polynomial features
427    fn generate_polynomial_features(
428        &self,
429        x: &Array2<f64>,
430        degree: usize,
431    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
432        let (_n_samples, n_features) = x.dim();
433        let mut features = Vec::new();
434        let mut transforms = Vec::new();
435        let mut names = Vec::new();
436
437        // Generate polynomial features for each individual feature
438        for i in 0..n_features {
439            let column = x.column(i);
440            for d in 2..=degree {
441                let poly_feature = column.mapv(|x| x.powi(d as i32));
442                features.push(poly_feature);
443                transforms.push(TransformationFunction {
444                    name: format!("poly_{}_{}", i, d),
445                    function_type: TransformationType::Polynomial { degree: d },
446                    input_indices: vec![i],
447                    parameters: HashMap::new(),
448                });
449                names.push(format!("poly_{}_{}", i, d));
450            }
451        }
452
453        Ok((features, transforms, names))
454    }
455
456    /// Generate mathematical transformation features
457    fn generate_mathematical_features(
458        &self,
459        x: &Array2<f64>,
460        functions: &[MathFunction],
461    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
462        let (_n_samples, n_features) = x.dim();
463        let mut features = Vec::new();
464        let mut transforms = Vec::new();
465        let mut names = Vec::new();
466
467        for i in 0..n_features {
468            let column = x.column(i);
469            for &function in functions {
470                let transformed = apply_math_function(&column.to_owned(), function)?;
471                features.push(transformed);
472                transforms.push(TransformationFunction {
473                    name: format!("{}_{}", math_function_name(function), i),
474                    function_type: TransformationType::Mathematical { function },
475                    input_indices: vec![i],
476                    parameters: HashMap::new(),
477                });
478                names.push(format!("{}_{}", math_function_name(function), i));
479            }
480        }
481
482        Ok((features, transforms, names))
483    }
484
485    /// Generate interaction features
486    fn generate_interaction_features(
487        &self,
488        x: &Array2<f64>,
489        max_depth: usize,
490    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
491        let (_n_samples, n_features) = x.dim();
492        let mut features = Vec::new();
493        let mut transforms = Vec::new();
494        let mut names = Vec::new();
495
496        // Generate pairwise interactions
497        if max_depth >= 2 {
498            for i in 0..n_features {
499                for j in (i + 1)..n_features {
500                    let col_i = x.column(i);
501                    let col_j = x.column(j);
502                    let interaction = &col_i * &col_j;
503                    features.push(interaction);
504                    transforms.push(TransformationFunction {
505                        name: format!("interact_{}_{}", i, j),
506                        function_type: TransformationType::Interaction,
507                        input_indices: vec![i, j],
508                        parameters: HashMap::new(),
509                    });
510                    names.push(format!("interact_{}_{}", i, j));
511                }
512            }
513        }
514
515        // Generate three-way interactions if requested
516        if max_depth >= 3 && n_features >= 3 {
517            for i in 0..n_features {
518                for j in (i + 1)..n_features {
519                    for k in (j + 1)..n_features {
520                        let col_i = x.column(i);
521                        let col_j = x.column(j);
522                        let col_k = x.column(k);
523                        let interaction = &(&col_i * &col_j) * &col_k;
524                        features.push(interaction);
525                        transforms.push(TransformationFunction {
526                            name: format!("interact_{}_{}_{}", i, j, k),
527                            function_type: TransformationType::Interaction,
528                            input_indices: vec![i, j, k],
529                            parameters: HashMap::new(),
530                        });
531                        names.push(format!("interact_{}_{}_{}", i, j, k));
532                    }
533                }
534            }
535        }
536
537        Ok((features, transforms, names))
538    }
539
540    /// Generate binning features (placeholder implementation)
541    fn generate_binning_features(
542        &self,
543        x: &Array2<f64>,
544        n_bins: usize,
545    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
546        let (_n_samples, n_features) = x.dim();
547        let mut features = Vec::new();
548        let mut transforms = Vec::new();
549        let mut names = Vec::new();
550
551        for i in 0..n_features {
552            let column = x.column(i);
553            let (min_val, max_val) = column
554                .iter()
555                .fold((f64::INFINITY, f64::NEG_INFINITY), |(min, max), &val| {
556                    (min.min(val), max.max(val))
557                });
558
559            if (max_val - min_val).abs() < f64::EPSILON {
560                continue; // Skip constant features
561            }
562
563            let bin_width = (max_val - min_val) / n_bins as f64;
564            let bins: Vec<f64> = (0..=n_bins)
565                .map(|b| min_val + b as f64 * bin_width)
566                .collect();
567
568            let binned_feature = column.mapv(|x| {
569                let bin_index = ((x - min_val) / bin_width).floor() as usize;
570                bin_index.min(n_bins - 1) as f64
571            });
572
573            features.push(binned_feature);
574            transforms.push(TransformationFunction {
575                name: format!("bin_{}", i),
576                function_type: TransformationType::Binning { bins: bins.clone() },
577                input_indices: vec![i],
578                parameters: HashMap::new(),
579            });
580            names.push(format!("bin_{}", i));
581        }
582
583        Ok((features, transforms, names))
584    }
585
586    /// Generate ratio features
587    fn generate_ratio_features(
588        &self,
589        x: &Array2<f64>,
590    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
591        let (_n_samples, n_features) = x.dim();
592        let mut features = Vec::new();
593        let mut transforms = Vec::new();
594        let mut names = Vec::new();
595
596        for i in 0..n_features {
597            for j in 0..n_features {
598                if i != j {
599                    let col_i = x.column(i);
600                    let col_j = x.column(j);
601
602                    // Avoid division by zero
603                    let ratio = col_i
604                        .iter()
605                        .zip(col_j.iter())
606                        .map(|(&a, &b)| if b.abs() < 1e-8 { 0.0 } else { a / b })
607                        .collect::<Vec<f64>>();
608
609                    features.push(Array1::from_vec(ratio));
610                    transforms.push(TransformationFunction {
611                        name: format!("ratio_{}_{}", i, j),
612                        function_type: TransformationType::Ratio,
613                        input_indices: vec![i, j],
614                        parameters: HashMap::new(),
615                    });
616                    names.push(format!("ratio_{}_{}", i, j));
617                }
618            }
619        }
620
621        Ok((features, transforms, names))
622    }
623
    /// Generate aggregation features: a centered rolling mean per column.
    ///
    /// Each output element averages a window of roughly `window_size` values
    /// centered on that sample, truncated at the array edges, so the output
    /// keeps the same sample count as the input. Returns empty results when
    /// `window_size >= n_samples` (not enough data to window).
    fn generate_aggregation_features(
        &self,
        x: &Array2<f64>,
        window_size: usize,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        let (n_samples, n_features) = x.dim();
        let mut features = Vec::new();
        let mut transforms = Vec::new();
        let mut names = Vec::new();

        if window_size >= n_samples {
            return Ok((features, transforms, names)); // Not enough samples for windowing
        }

        for i in 0..n_features {
            let column = x.column(i);

            // Rolling mean over [idx - w/2, idx + w/2]; saturating_sub and min
            // clamp the window to the valid sample range at the edges.
            let rolling_mean = (0..n_samples)
                .map(|idx| {
                    let start = idx.saturating_sub(window_size / 2);
                    let end = (idx + window_size / 2 + 1).min(n_samples);
                    let window = &column.slice(scirs2_core::ndarray::s![start..end]);
                    window.mean().unwrap_or(0.0)
                })
                .collect::<Vec<f64>>();

            features.push(Array1::from_vec(rolling_mean));
            transforms.push(TransformationFunction {
                name: format!("rolling_mean_{}_{}", i, window_size),
                function_type: TransformationType::Aggregation { window_size },
                input_indices: vec![i],
                parameters: HashMap::new(),
            });
            names.push(format!("rolling_mean_{}_{}", i, window_size));
        }

        Ok((features, transforms, names))
    }
664
    /// Generate frequency-encoding features: replace each value with how many
    /// times it occurs in its column.
    ///
    /// Values are keyed by rounding to 3 decimal places (value * 1000, rounded
    /// to i64) to tolerate floating-point noise, so values closer together
    /// than ~5e-4 share a count. The mapping is stored (with stringified keys)
    /// for replay at transform time.
    fn generate_frequency_encoding_features(
        &self,
        x: &Array2<f64>,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        let (_n_samples, n_features) = x.dim();
        let mut features = Vec::new();
        let mut transforms = Vec::new();
        let mut names = Vec::new();

        for i in 0..n_features {
            let column = x.column(i);

            // Count occurrences of each (rounded) value.
            let mut frequency_map: HashMap<i64, i32> = HashMap::new();
            for &value in column.iter() {
                let rounded = (value * 1000.0).round() as i64; // Round to 3 decimal places and convert to int
                *frequency_map.entry(rounded).or_insert(0) += 1;
            }

            // Replace each value with its occurrence count (0 is unreachable
            // here since every value was just counted).
            let freq_encoded = column.mapv(|x| {
                let rounded = (x * 1000.0).round() as i64;
                *frequency_map.get(&rounded).unwrap_or(&0) as f64
            });

            features.push(freq_encoded);
            transforms.push(TransformationFunction {
                name: format!("freq_encode_{}", i),
                function_type: TransformationType::FrequencyEncoding {
                    mapping: frequency_map
                        .iter()
                        .map(|(&k, &v)| (k.to_string(), v as f64))
                        .collect(),
                },
                input_indices: vec![i],
                parameters: HashMap::new(),
            });
            names.push(format!("freq_encode_{}", i));
        }

        Ok((features, transforms, names))
    }
708
709    /// Generate domain-specific features (placeholder implementation)
710    fn generate_domain_specific_features(
711        &self,
712        x: &Array2<f64>,
713        domain: &Domain,
714    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
715        match domain {
716            Domain::TimeSeries => self.generate_time_series_features(x),
717            Domain::Financial => self.generate_financial_features(x),
718            Domain::Text => self.generate_text_features(x),
719            Domain::Image => self.generate_image_features(x),
720            Domain::Generic => self.generate_generic_features(x),
721        }
722    }
723
724    /// Generate time series specific features
725    fn generate_time_series_features(
726        &self,
727        x: &Array2<f64>,
728    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
729        let mut features = Vec::new();
730        let mut transforms = Vec::new();
731        let mut names = Vec::new();
732
733        // For each feature, generate lag features, differences, etc.
734        for i in 0..x.ncols() {
735            let column = x.column(i);
736
737            // First difference
738            if column.len() > 1 {
739                let diff = (1..column.len())
740                    .map(|j| column[j] - column[j - 1])
741                    .collect::<Vec<f64>>();
742                let mut diff_feature = vec![0.0]; // Pad with zero for first element
743                diff_feature.extend(diff);
744
745                features.push(Array1::from_vec(diff_feature));
746                transforms.push(TransformationFunction {
747                    name: format!("diff_{}", i),
748                    function_type: TransformationType::Mathematical {
749                        function: MathFunction::Abs,
750                    }, // Placeholder
751                    input_indices: vec![i],
752                    parameters: HashMap::new(),
753                });
754                names.push(format!("diff_{}", i));
755            }
756        }
757
758        Ok((features, transforms, names))
759    }
760
    /// Generate financial domain features.
    ///
    /// Placeholder: intended for returns, volatility, etc.; currently
    /// delegates to the generic row-statistics generator.
    fn generate_financial_features(
        &self,
        x: &Array2<f64>,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        // Placeholder for financial features like returns, volatility, etc.
        self.generate_generic_features(x)
    }
769
    /// Generate text domain features.
    ///
    /// Placeholder: intended for length/character-count style features;
    /// currently delegates to the generic row-statistics generator.
    fn generate_text_features(
        &self,
        x: &Array2<f64>,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        // Placeholder for text features like length, character counts, etc.
        self.generate_generic_features(x)
    }
778
    /// Generate image domain features.
    ///
    /// Placeholder: intended for gradient/filter style features; currently
    /// delegates to the generic row-statistics generator.
    fn generate_image_features(
        &self,
        x: &Array2<f64>,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        // Placeholder for image features like gradients, filters, etc.
        self.generate_generic_features(x)
    }
787
    /// Generate generic domain features: one row-wise summary statistic per
    /// stat name (sum, mean, std, min, max), each spanning all input columns.
    ///
    /// `std` here is the population standard deviation (divides by n, not
    /// n - 1). Empty rows yield 0.0 for mean/std via `unwrap_or`.
    fn generate_generic_features(
        &self,
        x: &Array2<f64>,
    ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
        // Basic statistical features
        let mut features = Vec::new();
        let mut transforms = Vec::new();
        let mut names = Vec::new();

        // One generated feature per statistic, computed across each row.
        for stat_name in &["sum", "mean", "std", "min", "max"] {
            let stat_feature = (0..x.nrows())
                .map(|i| {
                    let row = x.row(i);
                    match *stat_name {
                        "sum" => row.sum(),
                        "mean" => row.mean().unwrap_or(0.0),
                        "std" => {
                            // Population variance: mean of squared deviations.
                            let mean = row.mean().unwrap_or(0.0);
                            let variance = row.mapv(|x| (x - mean).powi(2)).mean().unwrap_or(0.0);
                            variance.sqrt()
                        }
                        "min" => row.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
                        "max" => row.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
                        _ => 0.0, // Unreachable given the fixed stat list above.
                    }
                })
                .collect::<Vec<f64>>();

            features.push(Array1::from_vec(stat_feature));
            transforms.push(TransformationFunction {
                name: format!("row_{}", stat_name),
                // The "window" spans every column of the row.
                function_type: TransformationType::Aggregation {
                    window_size: x.ncols(),
                },
                input_indices: (0..x.ncols()).collect(),
                parameters: HashMap::new(),
            });
            names.push(format!("row_{}", stat_name));
        }

        Ok((features, transforms, names))
    }
832
833    /// Calculate feature scores using the selected method
834    fn calculate_feature_scores(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
835        match self.config.selection_method {
836            SelectionMethod::Correlation => calculate_correlation_scores(x, y),
837            SelectionMethod::Variance => calculate_variance_scores(x),
838            SelectionMethod::MutualInformation => calculate_mutual_information_scores(x, y),
839            _ => {
840                // Fallback to correlation for unimplemented methods
841                calculate_correlation_scores(x, y)
842            }
843        }
844    }
845
846    /// Select features based on scores and configuration
847    fn select_features(&self, scores: &[f64]) -> Result<Vec<usize>> {
848        let mut indexed_scores: Vec<(usize, f64)> = scores
849            .iter()
850            .enumerate()
851            .map(|(i, &score)| (i, score.abs()))
852            .collect();
853
854        // Sort by score descending
855        indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
856
857        // Select features above threshold and up to max_features
858        let selected: Vec<usize> = indexed_scores
859            .iter()
860            .filter(|(_, score)| *score >= self.config.selection_threshold)
861            .take(self.config.max_features)
862            .map(|(idx, _)| *idx)
863            .collect();
864
865        if selected.is_empty() {
866            // If no features meet the threshold, select the top one
867            Ok(vec![indexed_scores[0].0])
868        } else {
869            Ok(selected)
870        }
871    }
872
873    /// Calculate feature importance
874    fn calculate_feature_importance(
875        &self,
876        x: &Array2<f64>,
877        y: &Array1<f64>,
878        selected_features: &[usize],
879    ) -> Result<Vec<f64>> {
880        // Simple importance based on correlation with target
881        let mut importance = vec![0.0; selected_features.len()];
882
883        for (i, &feature_idx) in selected_features.iter().enumerate() {
884            if feature_idx < x.ncols() {
885                let feature_col = x.column(feature_idx).to_owned();
886                let correlation = calculate_correlation(&feature_col, y)?;
887                importance[i] = correlation.abs();
888            }
889        }
890
891        Ok(importance)
892    }
893}
894
895impl AutoFeatureEngineerFitted {
896    /// Get selected feature indices
897    pub fn selected_features(&self) -> &[usize] {
898        &self.selected_features
899    }
900
901    /// Get feature names
902    pub fn feature_names(&self) -> &[String] {
903        &self.feature_names
904    }
905
906    /// Get feature scores
907    pub fn feature_scores(&self) -> &[f64] {
908        &self.feature_scores
909    }
910
911    /// Get feature importance
912    pub fn feature_importance(&self) -> &[f64] {
913        &self.feature_importance
914    }
915
916    /// Get transformation functions
917    pub fn transformations(&self) -> &[TransformationFunction] {
918        &self.transformation_functions
919    }
920
921    /// Get correlation matrix (if computed)
922    pub fn correlation_matrix(&self) -> Option<&Array2<f64>> {
923        self.correlation_matrix.as_ref()
924    }
925}
926
927impl Transform<Array2<f64>, Array2<f64>> for AutoFeatureEngineerFitted {
928    fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
929        if x.is_empty() {
930            return Err(SklearsError::InvalidInput(
931                "Input array is empty".to_string(),
932            ));
933        }
934
935        let (n_samples, n_features) = x.dim();
936        if n_features != self.n_original_features {
937            return Err(SklearsError::InvalidInput(format!(
938                "Feature count mismatch: expected {}, got {}",
939                self.n_original_features, n_features
940            )));
941        }
942
943        // Apply transformations and select features
944        // This is a simplified implementation
945        // In practice, you'd apply the exact transformations learned during fitting
946
947        let mut result = Array2::zeros((n_samples, self.selected_features.len()));
948
949        for (out_idx, &in_idx) in self.selected_features.iter().enumerate() {
950            if in_idx < n_features {
951                // For original features, just copy
952                for (row_idx, &value) in x.column(in_idx).iter().enumerate() {
953                    result[[row_idx, out_idx]] = value;
954                }
955            }
956            // For generated features, we'd need to apply the corresponding transformation
957            // This is simplified for now
958        }
959
960        Ok(result)
961    }
962}
963
964// Helper functions
965
966/// Apply a mathematical function to an array
967fn apply_math_function(arr: &Array1<f64>, function: MathFunction) -> Result<Array1<f64>> {
968    let result = match function {
969        MathFunction::Log => arr.mapv(|x| if x > 0.0 { x.ln() } else { f64::NEG_INFINITY }),
970        MathFunction::Log1p => arr.mapv(|x| (1.0 + x).ln()),
971        MathFunction::Sqrt => arr.mapv(|x| if x >= 0.0 { x.sqrt() } else { 0.0 }),
972        MathFunction::Square => arr.mapv(|x| x * x),
973        MathFunction::Exp => arr.mapv(|x| x.exp()),
974        MathFunction::Sin => arr.mapv(|x| x.sin()),
975        MathFunction::Cos => arr.mapv(|x| x.cos()),
976        MathFunction::Tan => arr.mapv(|x| x.tan()),
977        MathFunction::Abs => arr.mapv(|x| x.abs()),
978        MathFunction::Reciprocal => arr.mapv(|x| if x.abs() > 1e-8 { 1.0 / x } else { 0.0 }),
979    };
980    Ok(result)
981}
982
983/// Get the name of a mathematical function
984fn math_function_name(function: MathFunction) -> &'static str {
985    match function {
986        MathFunction::Log => "log",
987        MathFunction::Log1p => "log1p",
988        MathFunction::Sqrt => "sqrt",
989        MathFunction::Square => "square",
990        MathFunction::Exp => "exp",
991        MathFunction::Sin => "sin",
992        MathFunction::Cos => "cos",
993        MathFunction::Tan => "tan",
994        MathFunction::Abs => "abs",
995        MathFunction::Reciprocal => "reciprocal",
996    }
997}
998
999/// Scale features to have zero mean and unit variance
1000fn scale_features(x: &Array2<f64>) -> Result<Array2<f64>> {
1001    let mut result = x.clone();
1002    let n_features = x.ncols();
1003
1004    for i in 0..n_features {
1005        let col = x.column(i);
1006        let mean = col.mean().unwrap_or(0.0);
1007        let std = {
1008            let variance = col.mapv(|x| (x - mean).powi(2)).mean().unwrap_or(0.0);
1009            variance.sqrt()
1010        };
1011
1012        if std > 1e-8 {
1013            for j in 0..x.nrows() {
1014                result[[j, i]] = (result[[j, i]] - mean) / std;
1015            }
1016        }
1017    }
1018
1019    Ok(result)
1020}
1021
1022/// Remove highly correlated features
1023fn remove_correlated_features(
1024    x: &Array2<f64>,
1025    threshold: f64,
1026) -> Result<(Array2<f64>, Vec<usize>)> {
1027    let n_features = x.ncols();
1028    let mut to_remove = HashSet::new();
1029
1030    // Calculate correlation matrix
1031    for i in 0..n_features {
1032        for j in (i + 1)..n_features {
1033            if to_remove.contains(&i) || to_remove.contains(&j) {
1034                continue;
1035            }
1036
1037            let corr = calculate_correlation(&x.column(i).to_owned(), &x.column(j).to_owned())?;
1038            if corr.abs() > threshold {
1039                // Remove the feature with lower variance
1040                let var_i = x.column(i).var(0.0);
1041                let var_j = x.column(j).var(0.0);
1042                if var_i < var_j {
1043                    to_remove.insert(i);
1044                } else {
1045                    to_remove.insert(j);
1046                }
1047            }
1048        }
1049    }
1050
1051    // Create new matrix without correlated features
1052    let remaining_features: Vec<usize> =
1053        (0..n_features).filter(|i| !to_remove.contains(i)).collect();
1054
1055    if remaining_features.is_empty() {
1056        return Ok((x.clone(), (0..n_features).collect()));
1057    }
1058
1059    let mut result = Array2::zeros((x.nrows(), remaining_features.len()));
1060    for (new_idx, &old_idx) in remaining_features.iter().enumerate() {
1061        for (row_idx, &value) in x.column(old_idx).iter().enumerate() {
1062            result[[row_idx, new_idx]] = value;
1063        }
1064    }
1065
1066    Ok((result, remaining_features))
1067}
1068
1069/// Calculate correlation between two arrays
1070fn calculate_correlation(x: &Array1<f64>, y: &Array1<f64>) -> Result<f64> {
1071    if x.len() != y.len() {
1072        return Err(SklearsError::InvalidInput(
1073            "Arrays must have the same length".to_string(),
1074        ));
1075    }
1076
1077    let mean_x = x.mean().unwrap_or(0.0);
1078    let mean_y = y.mean().unwrap_or(0.0);
1079
1080    let mut numerator = 0.0;
1081    let mut sum_sq_x = 0.0;
1082    let mut sum_sq_y = 0.0;
1083
1084    for (&xi, &yi) in x.iter().zip(y.iter()) {
1085        let dx = xi - mean_x;
1086        let dy = yi - mean_y;
1087        numerator += dx * dy;
1088        sum_sq_x += dx * dx;
1089        sum_sq_y += dy * dy;
1090    }
1091
1092    let denominator = (sum_sq_x * sum_sq_y).sqrt();
1093    if denominator < 1e-8 {
1094        Ok(0.0)
1095    } else {
1096        Ok(numerator / denominator)
1097    }
1098}
1099
1100/// Calculate correlation scores for features
1101fn calculate_correlation_scores(x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
1102    let mut scores = Vec::new();
1103    for i in 0..x.ncols() {
1104        let correlation = calculate_correlation(&x.column(i).to_owned(), y)?;
1105        scores.push(correlation.abs());
1106    }
1107    Ok(scores)
1108}
1109
1110/// Calculate variance-based scores
1111fn calculate_variance_scores(x: &Array2<f64>) -> Result<Vec<f64>> {
1112    let mut scores = Vec::new();
1113    for i in 0..x.ncols() {
1114        let variance = x.column(i).var(0.0);
1115        scores.push(variance);
1116    }
1117    Ok(scores)
1118}
1119
1120/// Calculate mutual information scores (simplified implementation)
1121fn calculate_mutual_information_scores(x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
1122    // For now, use correlation as a proxy for mutual information
1123    // In a full implementation, you'd use proper mutual information calculation
1124    calculate_correlation_scores(x, y)
1125}
1126
1127/// Calculate correlation matrix
1128fn calculate_correlation_matrix(x: &Array2<f64>) -> Result<Array2<f64>> {
1129    let n_features = x.ncols();
1130    let mut corr_matrix = Array2::zeros((n_features, n_features));
1131
1132    for i in 0..n_features {
1133        for j in 0..n_features {
1134            if i == j {
1135                corr_matrix[[i, j]] = 1.0;
1136            } else {
1137                let corr = calculate_correlation(&x.column(i).to_owned(), &x.column(j).to_owned())?;
1138                corr_matrix[[i, j]] = corr;
1139            }
1140        }
1141    }
1142
1143    Ok(corr_matrix)
1144}
1145
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::{arr1, arr2};

    // Builder-style setters should be reflected in the stored config.
    #[test]
    fn test_auto_feature_config() {
        let config = AutoFeatureConfig::new()
            .with_max_features(50)
            .with_selection_threshold(0.05)
            .with_strategy(GenerationStrategy::Polynomial { degree: 3 });

        assert_eq!(config.max_features, 50);
        assert_relative_eq!(config.selection_threshold, 0.05);
        assert_eq!(config.strategies.len(), 4); // 3 default + 1 added
    }

    // A freshly constructed engineer exposes the default config
    // (max_features is expected to default to 200).
    #[test]
    fn test_auto_feature_engineer_creation() {
        let config = AutoFeatureConfig::new();
        let engineer = AutoFeatureEngineer::new(config);
        assert_eq!(engineer.config().max_features, 200);
    }

    // Fitting on a small linearly-related dataset must select at least
    // one feature and produce names for the selected set.
    #[test]
    fn test_auto_feature_engineer_fit() {
        let config = AutoFeatureConfig::new()
            .with_max_features(10)
            .with_selection_threshold(0.0); // Accept all features
        let engineer = AutoFeatureEngineer::new(config);

        let X = arr2(&[[1.0, 2.0], [2.0, 4.0], [3.0, 6.0], [4.0, 8.0]]);
        let y = arr1(&[1.0, 2.0, 3.0, 4.0]);

        let fitted = engineer.fit(&X, &y).unwrap();
        assert!(!fitted.selected_features().is_empty());
        assert!(!fitted.feature_names().is_empty());
    }

    // Element-wise transforms: check sqrt and square against hand-computed
    // expectations.
    #[test]
    fn test_mathematical_functions() {
        let arr = arr1(&[1.0, 2.0, 3.0, 4.0]);

        let sqrt_result = apply_math_function(&arr, MathFunction::Sqrt).unwrap();
        let expected_sqrt = arr1(&[1.0, 2.0_f64.sqrt(), 3.0_f64.sqrt(), 2.0]);

        for (a, b) in sqrt_result.iter().zip(expected_sqrt.iter()) {
            assert_relative_eq!(a, b, epsilon = 1e-10);
        }

        let square_result = apply_math_function(&arr, MathFunction::Square).unwrap();
        let expected_square = arr1(&[1.0, 4.0, 9.0, 16.0]);

        for (a, b) in square_result.iter().zip(expected_square.iter()) {
            assert_relative_eq!(a, b, epsilon = 1e-10);
        }
    }

    // Pearson correlation should hit the +1/-1 extremes on exactly
    // linear relationships.
    #[test]
    fn test_correlation_calculation() {
        let x = arr1(&[1.0, 2.0, 3.0, 4.0, 5.0]);
        let y = arr1(&[2.0, 4.0, 6.0, 8.0, 10.0]); // Perfect positive correlation

        let corr = calculate_correlation(&x, &y).unwrap();
        assert_relative_eq!(corr, 1.0, epsilon = 1e-10);

        let z = arr1(&[5.0, 4.0, 3.0, 2.0, 1.0]); // Perfect negative correlation
        let corr_neg = calculate_correlation(&x, &z).unwrap();
        assert_relative_eq!(corr_neg, -1.0, epsilon = 1e-10);
    }

    // Standardization should yield zero-mean, unit-variance columns.
    #[test]
    fn test_feature_scaling() {
        let X = arr2(&[[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]);

        let scaled = scale_features(&X).unwrap();

        // Check that each column has approximately zero mean and unit variance
        for i in 0..scaled.ncols() {
            let col = scaled.column(i);
            let mean = col.mean().unwrap();
            let std = col.mapv(|x| (x - mean).powi(2)).mean().unwrap().sqrt();

            assert_relative_eq!(mean, 0.0, epsilon = 1e-10);
            assert_relative_eq!(std, 1.0, epsilon = 1e-10);
        }
    }

    // transform() on unseen data with the same column count should
    // produce one output row per input sample.
    #[test]
    fn test_auto_feature_engineer_transform() {
        let config = AutoFeatureConfig::new()
            .with_max_features(5)
            .with_include_original(true);
        let engineer = AutoFeatureEngineer::new(config);

        let X_train = arr2(&[[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]]);
        let y_train = arr1(&[1.0, 2.0, 3.0]);

        let fitted = engineer.fit(&X_train, &y_train).unwrap();

        let X_test = arr2(&[[4.0, 8.0], [5.0, 10.0]]);

        let result = fitted.transform(&X_test).unwrap();
        assert_eq!(result.nrows(), 2);
        assert!(!result.is_empty());
    }

    // Invalid inputs (empty data, sample-count mismatch) must be
    // rejected with an error rather than a panic.
    #[test]
    fn test_error_handling() {
        // Test empty arrays
        let config = AutoFeatureConfig::new();
        let engineer = AutoFeatureEngineer::new(config);
        let empty_X = Array2::from_shape_vec((0, 0), vec![]).unwrap();
        let empty_y = Array1::from_vec(vec![]);
        assert!(engineer.fit(&empty_X, &empty_y).is_err());

        // Test mismatched dimensions
        let config = AutoFeatureConfig::new();
        let engineer = AutoFeatureEngineer::new(config);
        let X = arr2(&[[1.0, 2.0], [3.0, 4.0]]);
        let y = arr1(&[1.0]); // Wrong size
        assert!(engineer.fit(&X, &y).is_err());
    }
}