Skip to main content

sklears_compose/
feature_engineering.rs

//! Feature engineering and interaction detection
//!
//! Automated feature engineering, interaction detection, and feature selection.
4
5use scirs2_core::ndarray::{concatenate, Array1, Array2, ArrayView1, ArrayView2, Axis};
6use sklears_core::{
7    error::Result as SklResult,
8    prelude::SklearsError,
9    types::{Float, FloatBounds},
10};
11use std::collections::HashSet;
12
13/// Feature interaction detector
14pub struct FeatureInteractionDetector {
15    interaction_type: InteractionType,
16    max_interactions: usize,
17    min_correlation: f64,
18    method: DetectionMethod,
19    threshold: f64,
20}
21
/// Types of feature interactions to detect.
///
/// The variant is a label: the detector stores the configured value and copies
/// it onto each interaction it reports.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InteractionType {
    /// Linear interactions (correlation-based)
    Linear,
    /// Polynomial interactions of the given degree
    Polynomial { degree: usize },
    /// Multiplicative interactions
    Multiplicative,
    /// Statistical interactions (ANOVA-based)
    Statistical,
    /// Mutual information based
    MutualInformation,
}
36
/// Methods for detecting interactions.
///
/// Selects which detection routine the interaction detector dispatches to.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectionMethod {
    /// Correlation analysis
    Correlation,
    /// Mutual information
    MutualInfo,
    /// Statistical tests
    StatisticalTest,
    /// Tree-based importance
    TreeBased,
}
49
50impl FeatureInteractionDetector {
51    /// Create a new feature interaction detector
52    #[must_use]
53    pub fn new() -> Self {
54        Self {
55            interaction_type: InteractionType::Linear,
56            max_interactions: 100,
57            min_correlation: 0.1,
58            method: DetectionMethod::Correlation,
59            threshold: 0.05,
60        }
61    }
62
63    /// Set interaction type
64    #[must_use]
65    pub fn interaction_type(mut self, interaction_type: InteractionType) -> Self {
66        self.interaction_type = interaction_type;
67        self
68    }
69
70    /// Set maximum number of interactions to detect
71    #[must_use]
72    pub fn max_interactions(mut self, max: usize) -> Self {
73        self.max_interactions = max;
74        self
75    }
76
77    /// Set minimum correlation threshold
78    #[must_use]
79    pub fn min_correlation(mut self, min_corr: f64) -> Self {
80        self.min_correlation = min_corr;
81        self
82    }
83
84    /// Set detection method
85    #[must_use]
86    pub fn method(mut self, method: DetectionMethod) -> Self {
87        self.method = method;
88        self
89    }
90
91    /// Set threshold for detection
92    #[must_use]
93    pub fn threshold(mut self, threshold: f64) -> Self {
94        self.threshold = threshold;
95        self
96    }
97
98    /// Detect feature interactions
99    pub fn detect_interactions(
100        &self,
101        x: &ArrayView2<'_, Float>,
102        y: Option<&ArrayView1<'_, Float>>,
103    ) -> SklResult<Vec<FeatureInteraction>> {
104        match self.method {
105            DetectionMethod::Correlation => self.detect_correlation_interactions(x),
106            DetectionMethod::MutualInfo => self.detect_mutual_info_interactions(x, y),
107            DetectionMethod::StatisticalTest => self.detect_statistical_interactions(x, y),
108            DetectionMethod::TreeBased => self.detect_tree_based_interactions(x, y),
109        }
110    }
111
112    fn detect_correlation_interactions(
113        &self,
114        x: &ArrayView2<'_, Float>,
115    ) -> SklResult<Vec<FeatureInteraction>> {
116        let mut interactions = Vec::new();
117        let n_features = x.ncols();
118
119        for i in 0..n_features {
120            for j in (i + 1)..n_features {
121                let correlation = self.calculate_correlation(&x.column(i), &x.column(j))?;
122
123                if correlation.abs() >= self.min_correlation {
124                    interactions.push(FeatureInteraction {
125                        feature_indices: vec![i, j],
126                        interaction_type: self.interaction_type.clone(),
127                        strength: correlation.abs(),
128                        p_value: None,
129                    });
130                }
131            }
132        }
133
134        // Sort by strength and take top interactions
135        interactions.sort_by(|a, b| {
136            b.strength
137                .partial_cmp(&a.strength)
138                .unwrap_or(std::cmp::Ordering::Equal)
139        });
140        interactions.truncate(self.max_interactions);
141
142        Ok(interactions)
143    }
144
145    fn detect_mutual_info_interactions(
146        &self,
147        _x: &ArrayView2<'_, Float>,
148        _y: Option<&ArrayView1<'_, Float>>,
149    ) -> SklResult<Vec<FeatureInteraction>> {
150        // Placeholder implementation
151        Ok(Vec::new())
152    }
153
154    fn detect_statistical_interactions(
155        &self,
156        _x: &ArrayView2<'_, Float>,
157        _y: Option<&ArrayView1<'_, Float>>,
158    ) -> SklResult<Vec<FeatureInteraction>> {
159        // Placeholder implementation
160        Ok(Vec::new())
161    }
162
163    fn detect_tree_based_interactions(
164        &self,
165        _x: &ArrayView2<'_, Float>,
166        _y: Option<&ArrayView1<'_, Float>>,
167    ) -> SklResult<Vec<FeatureInteraction>> {
168        // Placeholder implementation
169        Ok(Vec::new())
170    }
171
172    fn calculate_correlation(
173        &self,
174        x1: &ArrayView1<'_, Float>,
175        x2: &ArrayView1<'_, Float>,
176    ) -> SklResult<f64> {
177        let n = x1.len();
178        if n != x2.len() {
179            return Err(SklearsError::ShapeMismatch {
180                expected: format!("{n}"),
181                actual: format!("{}", x2.len()),
182            });
183        }
184
185        let mean1 = x1.iter().copied().sum::<f64>() / n as f64;
186        let mean2 = x2.iter().copied().sum::<f64>() / n as f64;
187
188        let mut numerator = 0.0;
189        let mut sum_sq1 = 0.0;
190        let mut sum_sq2 = 0.0;
191
192        for i in 0..n {
193            let diff1 = x1[i] - mean1;
194            let diff2 = x2[i] - mean2;
195
196            numerator += diff1 * diff2;
197            sum_sq1 += diff1 * diff1;
198            sum_sq2 += diff2 * diff2;
199        }
200
201        let denominator = (sum_sq1 * sum_sq2).sqrt();
202        if denominator == 0.0 {
203            Ok(0.0)
204        } else {
205            Ok(numerator / denominator)
206        }
207    }
208}
209
210impl Default for FeatureInteractionDetector {
211    fn default() -> Self {
212        Self::new()
213    }
214}
215
216/// Represents a detected feature interaction
217#[derive(Debug, Clone)]
218pub struct FeatureInteraction {
219    /// Indices of features involved in the interaction
220    pub feature_indices: Vec<usize>,
221    /// Type of interaction
222    pub interaction_type: InteractionType,
223    /// Strength of interaction
224    pub strength: f64,
225    /// Statistical significance (p-value)
226    pub p_value: Option<f64>,
227}
228
/// Automatic feature engineering pipeline.
///
/// Holds the on/off switches and parameters for each pipeline stage.
#[derive(Debug, Clone)]
pub struct AutoFeatureEngineer {
    /// Whether polynomial features are generated.
    enable_polynomial: bool,
    /// Highest power generated by the polynomial stage.
    polynomial_degree: usize,
    /// Whether pairwise product features are generated.
    enable_interactions: bool,
    /// Whether binned copies of the features are generated.
    enable_binning: bool,
    /// Number of bins used by the binning stage.
    n_bins: usize,
    /// Whether the columns are standardized.
    enable_scaling: bool,
    /// Whether feature selection runs.
    enable_selection: bool,
    /// Maximum number of columns kept by feature selection; `None` keeps all.
    max_features: Option<usize>,
}
240
241impl AutoFeatureEngineer {
242    /// Create a new automatic feature engineer
243    #[must_use]
244    pub fn new() -> Self {
245        Self {
246            enable_polynomial: true,
247            polynomial_degree: 2,
248            enable_interactions: true,
249            enable_binning: false,
250            n_bins: 10,
251            enable_scaling: true,
252            enable_selection: true,
253            max_features: None,
254        }
255    }
256
257    /// Enable/disable polynomial features
258    #[must_use]
259    pub fn polynomial_features(mut self, enable: bool, degree: usize) -> Self {
260        self.enable_polynomial = enable;
261        self.polynomial_degree = degree;
262        self
263    }
264
265    /// Enable/disable interaction features
266    #[must_use]
267    pub fn interaction_features(mut self, enable: bool) -> Self {
268        self.enable_interactions = enable;
269        self
270    }
271
272    /// Enable/disable binning features
273    #[must_use]
274    pub fn binning_features(mut self, enable: bool, n_bins: usize) -> Self {
275        self.enable_binning = enable;
276        self.n_bins = n_bins;
277        self
278    }
279
280    /// Enable/disable scaling
281    #[must_use]
282    pub fn scaling(mut self, enable: bool) -> Self {
283        self.enable_scaling = enable;
284        self
285    }
286
287    /// Enable/disable feature selection
288    #[must_use]
289    pub fn feature_selection(mut self, enable: bool, max_features: Option<usize>) -> Self {
290        self.enable_selection = enable;
291        self.max_features = max_features;
292        self
293    }
294
295    /// Generate engineered features
296    pub fn generate_features(
297        &self,
298        x: &ArrayView2<'_, Float>,
299        y: Option<&ArrayView1<'_, Float>>,
300    ) -> SklResult<Array2<f64>> {
301        let mut engineered = x.mapv(|v| v);
302
303        if self.enable_polynomial {
304            engineered = self.add_polynomial_features(&engineered)?;
305        }
306
307        if self.enable_interactions {
308            engineered = self.add_interaction_features(&engineered)?;
309        }
310
311        if self.enable_binning {
312            engineered = self.add_binning_features(&engineered)?;
313        }
314
315        if self.enable_scaling {
316            engineered = self.apply_scaling(&engineered)?;
317        }
318
319        if self.enable_selection {
320            engineered = self.select_features(&engineered, y)?;
321        }
322
323        Ok(engineered)
324    }
325
326    fn add_polynomial_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
327        let (n_samples, n_features) = x.dim();
328        let mut features = x.clone();
329
330        for degree in 2..=self.polynomial_degree {
331            for i in 0..n_features {
332                let mut poly_col = Array1::zeros(n_samples);
333                for (j, &val) in x.column(i).iter().enumerate() {
334                    poly_col[j] = val.powi(degree as i32);
335                }
336
337                // Add polynomial feature as new column
338                let new_features = concatenate![Axis(1), features, poly_col.insert_axis(Axis(1))];
339                features = new_features;
340            }
341        }
342
343        Ok(features)
344    }
345
346    fn add_interaction_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
347        let (n_samples, n_features) = x.dim();
348        let mut features = x.clone();
349
350        for i in 0..n_features {
351            for j in (i + 1)..n_features {
352                let mut interaction_col = Array1::zeros(n_samples);
353                for k in 0..n_samples {
354                    interaction_col[k] = x[[k, i]] * x[[k, j]];
355                }
356
357                // Add interaction feature as new column
358                let new_features =
359                    concatenate![Axis(1), features, interaction_col.insert_axis(Axis(1))];
360                features = new_features;
361            }
362        }
363
364        Ok(features)
365    }
366
367    fn add_binning_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
368        let (n_samples, n_features) = x.dim();
369        let mut features = x.clone();
370
371        for i in 0..n_features {
372            let column = x.column(i);
373            let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
374            let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
375            let bin_width = (max_val - min_val) / self.n_bins as f64;
376
377            let mut binned_col = Array1::zeros(n_samples);
378            for (j, &val) in column.iter().enumerate() {
379                let bin = ((val - min_val) / bin_width).floor() as usize;
380                binned_col[j] = bin.min(self.n_bins - 1) as f64;
381            }
382
383            // Add binned feature as new column
384            let new_features = concatenate![Axis(1), features, binned_col.insert_axis(Axis(1))];
385            features = new_features;
386        }
387
388        Ok(features)
389    }
390
391    fn apply_scaling(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
392        let (n_samples, n_features) = x.dim();
393        let mut scaled = Array2::zeros((n_samples, n_features));
394
395        for i in 0..n_features {
396            let column = x.column(i);
397            let mean = column.mean().unwrap_or(0.0);
398            let std = column.var(0.0).sqrt();
399
400            for j in 0..n_samples {
401                scaled[[j, i]] = if std > 0.0 {
402                    (x[[j, i]] - mean) / std
403                } else {
404                    0.0
405                };
406            }
407        }
408
409        Ok(scaled)
410    }
411
412    fn select_features(
413        &self,
414        x: &Array2<f64>,
415        _y: Option<&ArrayView1<'_, Float>>,
416    ) -> SklResult<Array2<f64>> {
417        // Simple feature selection based on variance
418        let (n_samples, n_features) = x.dim();
419
420        if let Some(max_features) = self.max_features {
421            if max_features >= n_features {
422                return Ok(x.clone());
423            }
424
425            let mut feature_scores = Vec::new();
426
427            for i in 0..n_features {
428                let column = x.column(i);
429                let variance = column.var(0.0);
430                feature_scores.push((i, variance));
431            }
432
433            // Sort by variance (descending)
434            feature_scores
435                .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
436
437            // Select top features
438            let selected_indices: Vec<usize> = feature_scores
439                .into_iter()
440                .take(max_features)
441                .map(|(idx, _)| idx)
442                .collect();
443
444            // Create new array with selected features
445            let mut selected = Array2::zeros((n_samples, max_features));
446            for (new_idx, &old_idx) in selected_indices.iter().enumerate() {
447                for j in 0..n_samples {
448                    selected[[j, new_idx]] = x[[j, old_idx]];
449                }
450            }
451
452            Ok(selected)
453        } else {
454            Ok(x.clone())
455        }
456    }
457}
458
459impl Default for AutoFeatureEngineer {
460    fn default() -> Self {
461        Self::new()
462    }
463}
464
/// Column type detector for automatic preprocessing.
#[derive(Debug, Clone)]
pub struct ColumnTypeDetector {
    /// A column whose ratio of unique values to total values is below this
    /// threshold is treated as categorical.
    categorical_threshold: f64,
    /// Date pattern detection switch; stored but not read by any method in this file.
    date_pattern_detection: bool,
    /// Text detection switch; stored but not read by any method in this file.
    text_detection: bool,
}
471
/// Detected column types.
///
/// The detector in this file only ever reports `Binary`, `Boolean`,
/// `Categorical` and `Numeric`; the remaining variants are not produced by it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ColumnType {
    /// Numeric (continuous)
    Numeric,
    /// Categorical
    Categorical,
    /// Boolean
    Boolean,
    /// Date/Time
    DateTime,
    /// Text
    Text,
    /// Binary (0/1)
    Binary,
    /// Ordinal
    Ordinal,
}
490
491impl ColumnTypeDetector {
492    /// Create a new column type detector
493    #[must_use]
494    pub fn new() -> Self {
495        Self {
496            categorical_threshold: 0.1, // If unique values / total values < threshold, consider categorical
497            date_pattern_detection: true,
498            text_detection: true,
499        }
500    }
501
502    /// Set categorical threshold
503    #[must_use]
504    pub fn categorical_threshold(mut self, threshold: f64) -> Self {
505        self.categorical_threshold = threshold;
506        self
507    }
508
509    /// Enable/disable date pattern detection
510    #[must_use]
511    pub fn date_pattern_detection(mut self, enable: bool) -> Self {
512        self.date_pattern_detection = enable;
513        self
514    }
515
516    /// Enable/disable text detection
517    #[must_use]
518    pub fn text_detection(mut self, enable: bool) -> Self {
519        self.text_detection = enable;
520        self
521    }
522
523    /// Detect column types
524    #[must_use]
525    pub fn detect_types(&self, x: &ArrayView2<'_, Float>) -> Vec<ColumnType> {
526        let mut column_types = Vec::new();
527
528        for i in 0..x.ncols() {
529            let column = x.column(i);
530            let column_type = self.detect_column_type(&column);
531            column_types.push(column_type);
532        }
533
534        column_types
535    }
536
537    fn detect_column_type(&self, column: &ArrayView1<'_, Float>) -> ColumnType {
538        let unique_values = self.count_unique_values(column);
539        let total_values = column.len();
540        let unique_ratio = unique_values as f64 / total_values as f64;
541
542        // Check for binary
543        if unique_values == 2 {
544            return ColumnType::Binary;
545        }
546
547        // Check for boolean (assuming 0.0 and 1.0 represent false and true)
548        if self.is_boolean_column(column) {
549            return ColumnType::Boolean;
550        }
551
552        // Check for categorical
553        if unique_ratio < self.categorical_threshold {
554            return ColumnType::Categorical;
555        }
556
557        // Default to numeric
558        ColumnType::Numeric
559    }
560
561    fn count_unique_values(&self, column: &ArrayView1<'_, Float>) -> usize {
562        let mut unique_set = HashSet::new();
563        for &value in column {
564            // Use a small epsilon for floating point comparison
565            let rounded = (value * 1000.0).round() / 1000.0;
566            unique_set.insert(rounded.to_bits());
567        }
568        unique_set.len()
569    }
570
571    fn is_boolean_column(&self, column: &ArrayView1<'_, Float>) -> bool {
572        for &value in column {
573            if value != 0.0 && value != 1.0 {
574                return false;
575            }
576        }
577        true
578    }
579}
580
581impl Default for ColumnTypeDetector {
582    fn default() -> Self {
583        Self::new()
584    }
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Every column is an exact linear function of the others, so all three
    /// column pairs must be reported with correlation 1.
    #[test]
    fn test_feature_interaction_detector() {
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let detector = FeatureInteractionDetector::new().min_correlation(0.5);

        // `expect` surfaces the real error instead of hiding it behind an empty result.
        let interactions = detector
            .detect_interactions(&x.view(), None)
            .expect("correlation detection should succeed on a well-formed matrix");

        assert_eq!(interactions.len(), 3);
        for interaction in &interactions {
            assert_eq!(interaction.feature_indices.len(), 2);
            assert!((interaction.strength - 1.0).abs() < 1e-12);
            assert!(interaction.p_value.is_none());
        }
    }

    /// 2 input columns + 2 squared columns = 4, plus the 6 pairwise products
    /// of those 4 columns = 10 columns; selection keeps all of them because
    /// `max_features` is `None`.
    #[test]
    fn test_auto_feature_engineer() {
        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let engineer = AutoFeatureEngineer::new()
            .polynomial_features(true, 2)
            .interaction_features(true);

        let engineered = engineer
            .generate_features(&x.view(), None)
            .expect("feature generation should succeed on a well-formed matrix");

        assert_eq!(engineered.dim(), (3, 10));
        assert!(engineered.iter().all(|v| v.is_finite()));
    }

    #[test]
    fn test_column_type_detector() {
        let x = array![[0.0, 1.0, 5.5], [1.0, 0.0, 6.2], [0.0, 1.0, 7.8]];

        let detector = ColumnTypeDetector::new();
        let types = detector.detect_types(&x.view());

        assert_eq!(types.len(), 3);
        assert_eq!(types[0], ColumnType::Binary); // Binary column
        assert_eq!(types[1], ColumnType::Binary); // Binary column
        assert_eq!(types[2], ColumnType::Numeric); // Numeric column
    }
}
631}