sklears_compose/
feature_engineering.rs

1//! Feature engineering and interaction detection
2//!
3//! Automated feature engineering, interaction detection, and feature selection.
4
5use scirs2_core::ndarray::{concatenate, Array1, Array2, ArrayView1, ArrayView2, Axis};
6use sklears_core::{
7    error::Result as SklResult,
8    prelude::SklearsError,
9    types::{Float, FloatBounds},
10};
11use std::collections::HashSet;
12
13/// Feature interaction detector
14pub struct FeatureInteractionDetector {
15    interaction_type: InteractionType,
16    max_interactions: usize,
17    min_correlation: f64,
18    method: DetectionMethod,
19    threshold: f64,
20}
21
/// Kinds of feature interaction a detector can report.
///
/// The value is a label: it is attached to each reported interaction.
/// `PartialEq`/`Eq` are derived so labels can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InteractionType {
    /// Linear interactions (correlation-based)
    Linear,
    /// Polynomial interactions of the given degree
    Polynomial {
        /// Degree of the polynomial interaction.
        degree: usize,
    },
    /// Multiplicative interactions
    Multiplicative,
    /// Statistical interactions (ANOVA-based)
    Statistical,
    /// Mutual information based
    MutualInformation,
}
36
/// Strategies available for detecting interactions.
///
/// `PartialEq`/`Eq` are derived so a configured method can be compared
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectionMethod {
    /// Correlation analysis
    Correlation,
    /// Mutual information
    MutualInfo,
    /// Statistical tests
    StatisticalTest,
    /// Tree-based importance
    TreeBased,
}
49
50impl FeatureInteractionDetector {
51    /// Create a new feature interaction detector
52    #[must_use]
53    pub fn new() -> Self {
54        Self {
55            interaction_type: InteractionType::Linear,
56            max_interactions: 100,
57            min_correlation: 0.1,
58            method: DetectionMethod::Correlation,
59            threshold: 0.05,
60        }
61    }
62
63    /// Set interaction type
64    #[must_use]
65    pub fn interaction_type(mut self, interaction_type: InteractionType) -> Self {
66        self.interaction_type = interaction_type;
67        self
68    }
69
70    /// Set maximum number of interactions to detect
71    #[must_use]
72    pub fn max_interactions(mut self, max: usize) -> Self {
73        self.max_interactions = max;
74        self
75    }
76
77    /// Set minimum correlation threshold
78    #[must_use]
79    pub fn min_correlation(mut self, min_corr: f64) -> Self {
80        self.min_correlation = min_corr;
81        self
82    }
83
84    /// Set detection method
85    #[must_use]
86    pub fn method(mut self, method: DetectionMethod) -> Self {
87        self.method = method;
88        self
89    }
90
91    /// Set threshold for detection
92    #[must_use]
93    pub fn threshold(mut self, threshold: f64) -> Self {
94        self.threshold = threshold;
95        self
96    }
97
98    /// Detect feature interactions
99    pub fn detect_interactions(
100        &self,
101        x: &ArrayView2<'_, Float>,
102        y: Option<&ArrayView1<'_, Float>>,
103    ) -> SklResult<Vec<FeatureInteraction>> {
104        match self.method {
105            DetectionMethod::Correlation => self.detect_correlation_interactions(x),
106            DetectionMethod::MutualInfo => self.detect_mutual_info_interactions(x, y),
107            DetectionMethod::StatisticalTest => self.detect_statistical_interactions(x, y),
108            DetectionMethod::TreeBased => self.detect_tree_based_interactions(x, y),
109        }
110    }
111
112    fn detect_correlation_interactions(
113        &self,
114        x: &ArrayView2<'_, Float>,
115    ) -> SklResult<Vec<FeatureInteraction>> {
116        let mut interactions = Vec::new();
117        let n_features = x.ncols();
118
119        for i in 0..n_features {
120            for j in (i + 1)..n_features {
121                let correlation = self.calculate_correlation(&x.column(i), &x.column(j))?;
122
123                if correlation.abs() >= self.min_correlation {
124                    interactions.push(FeatureInteraction {
125                        feature_indices: vec![i, j],
126                        interaction_type: self.interaction_type.clone(),
127                        strength: correlation.abs(),
128                        p_value: None,
129                    });
130                }
131            }
132        }
133
134        // Sort by strength and take top interactions
135        interactions.sort_by(|a, b| b.strength.partial_cmp(&a.strength).unwrap());
136        interactions.truncate(self.max_interactions);
137
138        Ok(interactions)
139    }
140
141    fn detect_mutual_info_interactions(
142        &self,
143        _x: &ArrayView2<'_, Float>,
144        _y: Option<&ArrayView1<'_, Float>>,
145    ) -> SklResult<Vec<FeatureInteraction>> {
146        // Placeholder implementation
147        Ok(Vec::new())
148    }
149
150    fn detect_statistical_interactions(
151        &self,
152        _x: &ArrayView2<'_, Float>,
153        _y: Option<&ArrayView1<'_, Float>>,
154    ) -> SklResult<Vec<FeatureInteraction>> {
155        // Placeholder implementation
156        Ok(Vec::new())
157    }
158
159    fn detect_tree_based_interactions(
160        &self,
161        _x: &ArrayView2<'_, Float>,
162        _y: Option<&ArrayView1<'_, Float>>,
163    ) -> SklResult<Vec<FeatureInteraction>> {
164        // Placeholder implementation
165        Ok(Vec::new())
166    }
167
168    fn calculate_correlation(
169        &self,
170        x1: &ArrayView1<'_, Float>,
171        x2: &ArrayView1<'_, Float>,
172    ) -> SklResult<f64> {
173        let n = x1.len();
174        if n != x2.len() {
175            return Err(SklearsError::ShapeMismatch {
176                expected: format!("{n}"),
177                actual: format!("{}", x2.len()),
178            });
179        }
180
181        let mean1 = x1.iter().copied().sum::<f64>() / n as f64;
182        let mean2 = x2.iter().copied().sum::<f64>() / n as f64;
183
184        let mut numerator = 0.0;
185        let mut sum_sq1 = 0.0;
186        let mut sum_sq2 = 0.0;
187
188        for i in 0..n {
189            let diff1 = x1[i] - mean1;
190            let diff2 = x2[i] - mean2;
191
192            numerator += diff1 * diff2;
193            sum_sq1 += diff1 * diff1;
194            sum_sq2 += diff2 * diff2;
195        }
196
197        let denominator = (sum_sq1 * sum_sq2).sqrt();
198        if denominator == 0.0 {
199            Ok(0.0)
200        } else {
201            Ok(numerator / denominator)
202        }
203    }
204}
205
206impl Default for FeatureInteractionDetector {
207    fn default() -> Self {
208        Self::new()
209    }
210}
211
212/// Represents a detected feature interaction
213#[derive(Debug, Clone)]
214pub struct FeatureInteraction {
215    /// Indices of features involved in the interaction
216    pub feature_indices: Vec<usize>,
217    /// Type of interaction
218    pub interaction_type: InteractionType,
219    /// Strength of interaction
220    pub strength: f64,
221    /// Statistical significance (p-value)
222    pub p_value: Option<f64>,
223}
224
/// Automatic feature engineering pipeline.
///
/// Each `enable_*` flag switches one stage on or off; the remaining fields
/// parameterise those stages. `Debug` and `Clone` are derived so a configured
/// pipeline can be logged and reused.
#[derive(Debug, Clone)]
pub struct AutoFeatureEngineer {
    /// Whether polynomial (power) columns are appended.
    enable_polynomial: bool,
    /// Highest power appended by the polynomial stage.
    polynomial_degree: usize,
    /// Whether pairwise product columns are appended.
    enable_interactions: bool,
    /// Whether binned copies of the columns are appended.
    enable_binning: bool,
    /// Number of bins used by the binning stage.
    n_bins: usize,
    /// Whether columns are standardised.
    enable_scaling: bool,
    /// Whether the feature-selection stage runs.
    enable_selection: bool,
    /// Maximum number of columns kept by selection; `None` keeps all of them.
    max_features: Option<usize>,
}
236
237impl AutoFeatureEngineer {
238    /// Create a new automatic feature engineer
239    #[must_use]
240    pub fn new() -> Self {
241        Self {
242            enable_polynomial: true,
243            polynomial_degree: 2,
244            enable_interactions: true,
245            enable_binning: false,
246            n_bins: 10,
247            enable_scaling: true,
248            enable_selection: true,
249            max_features: None,
250        }
251    }
252
253    /// Enable/disable polynomial features
254    #[must_use]
255    pub fn polynomial_features(mut self, enable: bool, degree: usize) -> Self {
256        self.enable_polynomial = enable;
257        self.polynomial_degree = degree;
258        self
259    }
260
261    /// Enable/disable interaction features
262    #[must_use]
263    pub fn interaction_features(mut self, enable: bool) -> Self {
264        self.enable_interactions = enable;
265        self
266    }
267
268    /// Enable/disable binning features
269    #[must_use]
270    pub fn binning_features(mut self, enable: bool, n_bins: usize) -> Self {
271        self.enable_binning = enable;
272        self.n_bins = n_bins;
273        self
274    }
275
276    /// Enable/disable scaling
277    #[must_use]
278    pub fn scaling(mut self, enable: bool) -> Self {
279        self.enable_scaling = enable;
280        self
281    }
282
283    /// Enable/disable feature selection
284    #[must_use]
285    pub fn feature_selection(mut self, enable: bool, max_features: Option<usize>) -> Self {
286        self.enable_selection = enable;
287        self.max_features = max_features;
288        self
289    }
290
291    /// Generate engineered features
292    pub fn generate_features(
293        &self,
294        x: &ArrayView2<'_, Float>,
295        y: Option<&ArrayView1<'_, Float>>,
296    ) -> SklResult<Array2<f64>> {
297        let mut engineered = x.mapv(|v| v);
298
299        if self.enable_polynomial {
300            engineered = self.add_polynomial_features(&engineered)?;
301        }
302
303        if self.enable_interactions {
304            engineered = self.add_interaction_features(&engineered)?;
305        }
306
307        if self.enable_binning {
308            engineered = self.add_binning_features(&engineered)?;
309        }
310
311        if self.enable_scaling {
312            engineered = self.apply_scaling(&engineered)?;
313        }
314
315        if self.enable_selection {
316            engineered = self.select_features(&engineered, y)?;
317        }
318
319        Ok(engineered)
320    }
321
322    fn add_polynomial_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
323        let (n_samples, n_features) = x.dim();
324        let mut features = x.clone();
325
326        for degree in 2..=self.polynomial_degree {
327            for i in 0..n_features {
328                let mut poly_col = Array1::zeros(n_samples);
329                for (j, &val) in x.column(i).iter().enumerate() {
330                    poly_col[j] = val.powi(degree as i32);
331                }
332
333                // Add polynomial feature as new column
334                let new_features = concatenate![Axis(1), features, poly_col.insert_axis(Axis(1))];
335                features = new_features;
336            }
337        }
338
339        Ok(features)
340    }
341
342    fn add_interaction_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
343        let (n_samples, n_features) = x.dim();
344        let mut features = x.clone();
345
346        for i in 0..n_features {
347            for j in (i + 1)..n_features {
348                let mut interaction_col = Array1::zeros(n_samples);
349                for k in 0..n_samples {
350                    interaction_col[k] = x[[k, i]] * x[[k, j]];
351                }
352
353                // Add interaction feature as new column
354                let new_features =
355                    concatenate![Axis(1), features, interaction_col.insert_axis(Axis(1))];
356                features = new_features;
357            }
358        }
359
360        Ok(features)
361    }
362
363    fn add_binning_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
364        let (n_samples, n_features) = x.dim();
365        let mut features = x.clone();
366
367        for i in 0..n_features {
368            let column = x.column(i);
369            let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
370            let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
371            let bin_width = (max_val - min_val) / self.n_bins as f64;
372
373            let mut binned_col = Array1::zeros(n_samples);
374            for (j, &val) in column.iter().enumerate() {
375                let bin = ((val - min_val) / bin_width).floor() as usize;
376                binned_col[j] = bin.min(self.n_bins - 1) as f64;
377            }
378
379            // Add binned feature as new column
380            let new_features = concatenate![Axis(1), features, binned_col.insert_axis(Axis(1))];
381            features = new_features;
382        }
383
384        Ok(features)
385    }
386
387    fn apply_scaling(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
388        let (n_samples, n_features) = x.dim();
389        let mut scaled = Array2::zeros((n_samples, n_features));
390
391        for i in 0..n_features {
392            let column = x.column(i);
393            let mean = column.mean().unwrap_or(0.0);
394            let std = column.var(0.0).sqrt();
395
396            for j in 0..n_samples {
397                scaled[[j, i]] = if std > 0.0 {
398                    (x[[j, i]] - mean) / std
399                } else {
400                    0.0
401                };
402            }
403        }
404
405        Ok(scaled)
406    }
407
408    fn select_features(
409        &self,
410        x: &Array2<f64>,
411        _y: Option<&ArrayView1<'_, Float>>,
412    ) -> SklResult<Array2<f64>> {
413        // Simple feature selection based on variance
414        let (n_samples, n_features) = x.dim();
415
416        if let Some(max_features) = self.max_features {
417            if max_features >= n_features {
418                return Ok(x.clone());
419            }
420
421            let mut feature_scores = Vec::new();
422
423            for i in 0..n_features {
424                let column = x.column(i);
425                let variance = column.var(0.0);
426                feature_scores.push((i, variance));
427            }
428
429            // Sort by variance (descending)
430            feature_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
431
432            // Select top features
433            let selected_indices: Vec<usize> = feature_scores
434                .into_iter()
435                .take(max_features)
436                .map(|(idx, _)| idx)
437                .collect();
438
439            // Create new array with selected features
440            let mut selected = Array2::zeros((n_samples, max_features));
441            for (new_idx, &old_idx) in selected_indices.iter().enumerate() {
442                for j in 0..n_samples {
443                    selected[[j, new_idx]] = x[[j, old_idx]];
444                }
445            }
446
447            Ok(selected)
448        } else {
449            Ok(x.clone())
450        }
451    }
452}
453
454impl Default for AutoFeatureEngineer {
455    fn default() -> Self {
456        Self::new()
457    }
458}
459
/// Column type detector for automatic preprocessing.
///
/// `Debug` and `Clone` are derived so a configured detector can be logged and
/// reused.
#[derive(Debug, Clone)]
pub struct ColumnTypeDetector {
    /// A column is categorical when `unique values / total values` is below
    /// this ratio.
    categorical_threshold: f64,
    /// Date pattern detection flag. Stored by the builder; no method visible
    /// in this file reads it yet.
    date_pattern_detection: bool,
    /// Text detection flag. Stored by the builder; no method visible in this
    /// file reads it yet.
    text_detection: bool,
}
466
/// Detected column types.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the variants can
/// be used as set members and map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ColumnType {
    /// Numeric (continuous)
    Numeric,
    /// Categorical
    Categorical,
    /// Boolean
    Boolean,
    /// Date/Time
    DateTime,
    /// Text
    Text,
    /// Binary (0/1)
    Binary,
    /// Ordinal
    Ordinal,
}
485
486impl ColumnTypeDetector {
487    /// Create a new column type detector
488    #[must_use]
489    pub fn new() -> Self {
490        Self {
491            categorical_threshold: 0.1, // If unique values / total values < threshold, consider categorical
492            date_pattern_detection: true,
493            text_detection: true,
494        }
495    }
496
497    /// Set categorical threshold
498    #[must_use]
499    pub fn categorical_threshold(mut self, threshold: f64) -> Self {
500        self.categorical_threshold = threshold;
501        self
502    }
503
504    /// Enable/disable date pattern detection
505    #[must_use]
506    pub fn date_pattern_detection(mut self, enable: bool) -> Self {
507        self.date_pattern_detection = enable;
508        self
509    }
510
511    /// Enable/disable text detection
512    #[must_use]
513    pub fn text_detection(mut self, enable: bool) -> Self {
514        self.text_detection = enable;
515        self
516    }
517
518    /// Detect column types
519    #[must_use]
520    pub fn detect_types(&self, x: &ArrayView2<'_, Float>) -> Vec<ColumnType> {
521        let mut column_types = Vec::new();
522
523        for i in 0..x.ncols() {
524            let column = x.column(i);
525            let column_type = self.detect_column_type(&column);
526            column_types.push(column_type);
527        }
528
529        column_types
530    }
531
532    fn detect_column_type(&self, column: &ArrayView1<'_, Float>) -> ColumnType {
533        let unique_values = self.count_unique_values(column);
534        let total_values = column.len();
535        let unique_ratio = unique_values as f64 / total_values as f64;
536
537        // Check for binary
538        if unique_values == 2 {
539            return ColumnType::Binary;
540        }
541
542        // Check for boolean (assuming 0.0 and 1.0 represent false and true)
543        if self.is_boolean_column(column) {
544            return ColumnType::Boolean;
545        }
546
547        // Check for categorical
548        if unique_ratio < self.categorical_threshold {
549            return ColumnType::Categorical;
550        }
551
552        // Default to numeric
553        ColumnType::Numeric
554    }
555
556    fn count_unique_values(&self, column: &ArrayView1<'_, Float>) -> usize {
557        let mut unique_set = HashSet::new();
558        for &value in column {
559            // Use a small epsilon for floating point comparison
560            let rounded = (value * 1000.0).round() / 1000.0;
561            unique_set.insert(rounded.to_bits());
562        }
563        unique_set.len()
564    }
565
566    fn is_boolean_column(&self, column: &ArrayView1<'_, Float>) -> bool {
567        for &value in column {
568            if value != 0.0 && value != 1.0 {
569                return false;
570            }
571        }
572        true
573    }
574}
575
576impl Default for ColumnTypeDetector {
577    fn default() -> Self {
578        Self::new()
579    }
580}
581
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Three perfectly correlated columns yield all three pairs at strength 1.
    #[test]
    fn test_feature_interaction_detector() {
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let detector = FeatureInteractionDetector::new().min_correlation(0.5);

        let interactions = detector.detect_interactions(&x.view(), None).unwrap();
        assert_eq!(interactions.len(), 3);
        for interaction in &interactions {
            assert_eq!(interaction.feature_indices.len(), 2);
            assert!((interaction.strength - 1.0).abs() < 1e-12);
            assert!(interaction.p_value.is_none());
        }
    }

    /// `max_interactions` caps the number of reported pairs.
    #[test]
    fn test_feature_interaction_detector_truncates() {
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let detector = FeatureInteractionDetector::new()
            .min_correlation(0.5)
            .max_interactions(1);

        let interactions = detector.detect_interactions(&x.view(), None).unwrap();
        assert_eq!(interactions.len(), 1);
    }

    /// A constant column has zero correlation with everything and is skipped.
    #[test]
    fn test_feature_interaction_detector_constant_column() {
        let x = array![[1.0, 5.0], [2.0, 5.0], [3.0, 5.0]];

        let detector = FeatureInteractionDetector::new().min_correlation(0.1);

        let interactions = detector.detect_interactions(&x.view(), None).unwrap();
        assert!(interactions.is_empty());
    }

    /// 2 inputs -> 4 after squares -> 10 after pairwise products; scaled to zero mean.
    #[test]
    fn test_auto_feature_engineer() {
        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let engineer = AutoFeatureEngineer::new()
            .polynomial_features(true, 2)
            .interaction_features(true);

        let engineered = engineer.generate_features(&x.view(), None).unwrap();
        assert!(engineered.ncols() > x.ncols());
        assert_eq!(engineered.dim(), (3, 10));
        for i in 0..engineered.ncols() {
            let mean = engineered.column(i).mean().unwrap();
            assert!(mean.abs() < 1e-9);
        }
    }

    /// Selection keeps at most `max_features` columns.
    #[test]
    fn test_auto_feature_engineer_selection_limit() {
        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let engineer = AutoFeatureEngineer::new().feature_selection(true, Some(3));

        let engineered = engineer.generate_features(&x.view(), None).unwrap();
        assert_eq!(engineered.dim(), (3, 3));
    }

    /// Equal-width binning appends one bin-index column per input column.
    #[test]
    fn test_auto_feature_engineer_binning() {
        let x = array![[0.0], [5.0], [10.0]];

        let engineer = AutoFeatureEngineer::new()
            .polynomial_features(false, 2)
            .interaction_features(false)
            .binning_features(true, 2)
            .scaling(false)
            .feature_selection(false, None);

        let engineered = engineer.generate_features(&x.view(), None).unwrap();
        assert_eq!(engineered.dim(), (3, 2));
        assert_eq!(engineered[[0, 1]], 0.0);
        assert_eq!(engineered[[1, 1]], 1.0);
        // The maximum falls on the upper edge and is clamped into the last bin.
        assert_eq!(engineered[[2, 1]], 1.0);
    }

    #[test]
    fn test_column_type_detector() {
        let x = array![[0.0, 1.0, 5.5], [1.0, 0.0, 6.2], [0.0, 1.0, 7.8]];

        let detector = ColumnTypeDetector::new();
        let types = detector.detect_types(&x.view());

        assert_eq!(types.len(), 3);
        assert_eq!(types[0], ColumnType::Binary); // Two distinct values
        assert_eq!(types[1], ColumnType::Binary); // Two distinct values
        assert_eq!(types[2], ColumnType::Numeric); // All values distinct
    }

    /// A column holding only ones has a single distinct value and is Boolean.
    #[test]
    fn test_column_type_detector_constant_boolean() {
        let x = array![[1.0], [1.0], [1.0]];

        let types = ColumnTypeDetector::new().detect_types(&x.view());

        assert_eq!(types, vec![ColumnType::Boolean]);
    }
}