sklears_utils/
feature_engineering.rs

1//! Feature engineering utilities for machine learning
2//!
3//! This module provides utilities for generating derived features including
4//! polynomial features, interaction features, and feature binning.
5
6use crate::{UtilsError, UtilsResult};
7use scirs2_core::ndarray::{Array1, Array2};
8
9/// Polynomial feature generator
10///
11/// Generates polynomial and interaction features from input data.
12/// For example, with degree=2 and features [a, b], generates: [1, a, b, a², ab, b²]
13#[derive(Clone, Debug)]
14pub struct PolynomialFeatures {
15    degree: usize,
16    include_bias: bool,
17    interaction_only: bool,
18}
19
20impl PolynomialFeatures {
21    /// Create a new polynomial feature generator
22    ///
23    /// # Arguments
24    /// * `degree` - Maximum degree of polynomial features
25    /// * `include_bias` - Whether to include a bias column (constant 1.0)
26    /// * `interaction_only` - If true, only interaction features (no powers of single features)
27    pub fn new(degree: usize, include_bias: bool, interaction_only: bool) -> UtilsResult<Self> {
28        if degree == 0 {
29            return Err(UtilsError::InvalidParameter(
30                "Degree must be at least 1".to_string(),
31            ));
32        }
33
34        Ok(Self {
35            degree,
36            include_bias,
37            interaction_only,
38        })
39    }
40
41    /// Compute the number of output features
42    ///
43    /// # Arguments
44    /// * `n_features` - Number of input features
45    ///
46    /// # Returns
47    /// Number of output features after transformation
48    pub fn n_output_features(&self, n_features: usize) -> usize {
49        let mut n_out = if self.include_bias { 1 } else { 0 };
50
51        if self.interaction_only {
52            // Combinations without powers
53            for d in 1..=self.degree {
54                n_out += Self::binomial_coefficient(n_features, d);
55            }
56        } else {
57            // Stars and bars: choose n_features + degree from degree
58            n_out += Self::binomial_coefficient(n_features + self.degree, self.degree) - 1;
59        }
60
61        n_out
62    }
63
64    /// Transform features to polynomial features
65    ///
66    /// # Arguments
67    /// * `x` - Input features (n_samples × n_features)
68    ///
69    /// # Returns
70    /// Transformed features with polynomial terms
71    pub fn transform(&self, x: &Array2<f64>) -> UtilsResult<Array2<f64>> {
72        if x.nrows() == 0 || x.ncols() == 0 {
73            return Err(UtilsError::EmptyInput);
74        }
75
76        let n_samples = x.nrows();
77        let n_features = x.ncols();
78        let n_output = self.n_output_features(n_features);
79
80        let mut result = Array2::zeros((n_samples, n_output));
81
82        for i in 0..n_samples {
83            let row = x.row(i);
84            let features = self.generate_polynomial_row(&row.to_vec());
85            for (j, &val) in features.iter().enumerate() {
86                result[[i, j]] = val;
87            }
88        }
89
90        Ok(result)
91    }
92
93    fn generate_polynomial_row(&self, row: &[f64]) -> Vec<f64> {
94        let mut features = Vec::new();
95
96        if self.include_bias {
97            features.push(1.0);
98        }
99
100        if self.interaction_only {
101            self.generate_interaction_features(row, &mut features);
102        } else {
103            self.generate_all_polynomial_features(row, &mut features);
104        }
105
106        features
107    }
108
109    fn generate_interaction_features(&self, row: &[f64], features: &mut Vec<f64>) {
110        // Generate all k-way interactions for k = 1 to degree
111        for degree in 1..=self.degree {
112            Self::generate_combinations(row, degree, 0, vec![], features);
113        }
114    }
115
116    fn generate_combinations(
117        row: &[f64],
118        k: usize,
119        start: usize,
120        current: Vec<usize>,
121        features: &mut Vec<f64>,
122    ) {
123        if current.len() == k {
124            let product: f64 = current.iter().map(|&i| row[i]).product();
125            features.push(product);
126            return;
127        }
128
129        for i in start..row.len() {
130            let mut next = current.clone();
131            next.push(i);
132            Self::generate_combinations(row, k, i + 1, next, features);
133        }
134    }
135
136    fn generate_all_polynomial_features(&self, row: &[f64], features: &mut Vec<f64>) {
137        let n = row.len();
138        Self::generate_powers_recursive(row, self.degree, 0, vec![0; n], 1.0, features);
139    }
140
141    fn generate_powers_recursive(
142        row: &[f64],
143        max_degree: usize,
144        feature_idx: usize,
145        powers: Vec<usize>,
146        current_product: f64,
147        features: &mut Vec<f64>,
148    ) {
149        if feature_idx == row.len() {
150            if powers.iter().sum::<usize>() > 0 {
151                features.push(current_product);
152            }
153            return;
154        }
155
156        let current_degree: usize = powers.iter().sum();
157
158        for power in 0..=(max_degree - current_degree) {
159            let mut new_powers = powers.clone();
160            new_powers[feature_idx] = power;
161
162            let new_product = if power > 0 {
163                current_product * row[feature_idx].powi(power as i32)
164            } else {
165                current_product
166            };
167
168            Self::generate_powers_recursive(
169                row,
170                max_degree,
171                feature_idx + 1,
172                new_powers,
173                new_product,
174                features,
175            );
176        }
177    }
178
179    fn binomial_coefficient(n: usize, k: usize) -> usize {
180        if k > n {
181            return 0;
182        }
183        if k == 0 || k == n {
184            return 1;
185        }
186
187        let k = k.min(n - k); // Take advantage of symmetry
188        let mut result = 1;
189
190        for i in 0..k {
191            result *= n - i;
192            result /= i + 1;
193        }
194
195        result
196    }
197}
198
199/// Interaction feature generator
200///
201/// Generates pairwise interaction features (products) between features.
202#[derive(Clone, Debug)]
203pub struct InteractionFeatures {
204    include_self: bool,
205}
206
207impl InteractionFeatures {
208    /// Create a new interaction feature generator
209    ///
210    /// # Arguments
211    /// * `include_self` - Whether to include self-interactions (x * x)
212    pub fn new(include_self: bool) -> Self {
213        Self { include_self }
214    }
215
216    /// Transform features by adding interaction terms
217    ///
218    /// # Arguments
219    /// * `x` - Input features (n_samples × n_features)
220    ///
221    /// # Returns
222    /// Features with added interaction terms
223    pub fn transform(&self, x: &Array2<f64>) -> UtilsResult<Array2<f64>> {
224        if x.nrows() == 0 || x.ncols() == 0 {
225            return Err(UtilsError::EmptyInput);
226        }
227
228        let n_samples = x.nrows();
229        let n_features = x.ncols();
230
231        let n_interactions = if self.include_self {
232            n_features * (n_features + 1) / 2
233        } else {
234            n_features * (n_features - 1) / 2
235        };
236
237        let mut result = Array2::zeros((n_samples, n_features + n_interactions));
238
239        // Copy original features
240        for i in 0..n_samples {
241            for j in 0..n_features {
242                result[[i, j]] = x[[i, j]];
243            }
244        }
245
246        // Add interaction features
247        let mut col_idx = n_features;
248        for i in 0..n_features {
249            let start_j = if self.include_self { i } else { i + 1 };
250            for j in start_j..n_features {
251                for row in 0..n_samples {
252                    result[[row, col_idx]] = x[[row, i]] * x[[row, j]];
253                }
254                col_idx += 1;
255            }
256        }
257
258        Ok(result)
259    }
260
261    /// Get the number of output features
262    pub fn n_output_features(&self, n_input_features: usize) -> usize {
263        let n_interactions = if self.include_self {
264            n_input_features * (n_input_features + 1) / 2
265        } else {
266            n_input_features * (n_input_features - 1) / 2
267        };
268        n_input_features + n_interactions
269    }
270}
271
272impl Default for InteractionFeatures {
273    fn default() -> Self {
274        Self::new(false)
275    }
276}
277
278/// Feature binning (discretization) utility
279///
280/// Transforms continuous features into discrete bins.
281#[derive(Clone, Debug)]
282pub struct FeatureBinner {
283    n_bins: usize,
284    strategy: BinningStrategy,
285}
286
287/// Strategy for determining bin edges
288#[derive(Clone, Debug, PartialEq)]
289pub enum BinningStrategy {
290    /// Equal width bins
291    Uniform,
292    /// Equal frequency bins (quantiles)
293    Quantile,
294    /// K-means clustering for bin centers
295    KMeans,
296}
297
298impl FeatureBinner {
299    /// Create a new feature binner
300    ///
301    /// # Arguments
302    /// * `n_bins` - Number of bins to create
303    /// * `strategy` - Binning strategy
304    pub fn new(n_bins: usize, strategy: BinningStrategy) -> UtilsResult<Self> {
305        if n_bins < 2 {
306            return Err(UtilsError::InvalidParameter(
307                "n_bins must be at least 2".to_string(),
308            ));
309        }
310
311        Ok(Self { n_bins, strategy })
312    }
313
314    /// Fit and transform features into bins
315    ///
316    /// # Arguments
317    /// * `x` - Input features (1D array)
318    ///
319    /// # Returns
320    /// Bin indices for each value
321    pub fn fit_transform(&self, x: &Array1<f64>) -> UtilsResult<Array1<usize>> {
322        if x.is_empty() {
323            return Err(UtilsError::EmptyInput);
324        }
325
326        match self.strategy {
327            BinningStrategy::Uniform => self.uniform_binning(x),
328            BinningStrategy::Quantile => self.quantile_binning(x),
329            BinningStrategy::KMeans => {
330                // Simplified version - use quantiles as approximation
331                self.quantile_binning(x)
332            }
333        }
334    }
335
336    fn uniform_binning(&self, x: &Array1<f64>) -> UtilsResult<Array1<usize>> {
337        let min_val = x.iter().copied().fold(f64::INFINITY, f64::min);
338        let max_val = x.iter().copied().fold(f64::NEG_INFINITY, f64::max);
339
340        if (max_val - min_val).abs() < 1e-10 {
341            // All values are the same
342            return Ok(Array1::zeros(x.len()));
343        }
344
345        let bin_width = (max_val - min_val) / (self.n_bins as f64);
346        let mut result = Array1::zeros(x.len());
347
348        for (i, &val) in x.iter().enumerate() {
349            let bin = ((val - min_val) / bin_width).floor() as usize;
350            result[i] = bin.min(self.n_bins - 1); // Clamp to valid range
351        }
352
353        Ok(result)
354    }
355
356    fn quantile_binning(&self, x: &Array1<f64>) -> UtilsResult<Array1<usize>> {
357        let mut sorted: Vec<f64> = x.to_vec();
358        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
359
360        // Compute quantile edges
361        let mut edges = Vec::with_capacity(self.n_bins + 1);
362        edges.push(sorted[0] - 1e-10); // Slightly less than minimum
363
364        for i in 1..self.n_bins {
365            let quantile = i as f64 / self.n_bins as f64;
366            let idx = (quantile * sorted.len() as f64) as usize;
367            let idx = idx.min(sorted.len() - 1);
368            edges.push(sorted[idx]);
369        }
370
371        edges.push(sorted[sorted.len() - 1] + 1e-10); // Slightly more than maximum
372
373        // Assign bins
374        let mut result = Array1::zeros(x.len());
375        for (i, &val) in x.iter().enumerate() {
376            for bin in 0..self.n_bins {
377                if val > edges[bin] && val <= edges[bin + 1] {
378                    result[i] = bin;
379                    break;
380                }
381            }
382        }
383
384        Ok(result)
385    }
386}
387
388#[cfg(test)]
389mod tests {
390    use super::*;
391    use approx::assert_abs_diff_eq;
392    use scirs2_core::ndarray::array;
393
394    #[test]
395    fn test_polynomial_features_degree_2() {
396        let poly = PolynomialFeatures::new(2, true, false).unwrap();
397
398        // Input: [a, b]
399        // The recursive generation produces: [1, b, ab, b², a, a², ...]
400        let x = array![[2.0, 3.0]];
401        let result = poly.transform(&x).unwrap();
402
403        assert_eq!(result.ncols(), 6);
404
405        // Check that all expected values are present (order may vary)
406        let values: Vec<f64> = (0..result.ncols()).map(|i| result[[0, i]]).collect();
407
408        // Must have bias
409        assert!(values.contains(&1.0));
410        // Must have a and b
411        assert!(values.contains(&2.0));
412        assert!(values.contains(&3.0));
413        // Must have a², ab, b²
414        assert!(values.contains(&4.0)); // a²
415        assert!(values.contains(&6.0)); // ab
416        assert!(values.contains(&9.0)); // b²
417    }
418
419    #[test]
420    fn test_polynomial_features_interaction_only() {
421        let poly = PolynomialFeatures::new(2, false, true).unwrap();
422
423        // Input: [a, b]
424        // Expected output: [a, b, ab] (no a², b²)
425        let x = array![[2.0, 3.0]];
426        let result = poly.transform(&x).unwrap();
427
428        assert_eq!(result.ncols(), 3);
429        assert_abs_diff_eq!(result[[0, 0]], 2.0, epsilon = 1e-10); // a
430        assert_abs_diff_eq!(result[[0, 1]], 3.0, epsilon = 1e-10); // b
431        assert_abs_diff_eq!(result[[0, 2]], 6.0, epsilon = 1e-10); // ab
432    }
433
434    #[test]
435    fn test_polynomial_n_output_features() {
436        let poly = PolynomialFeatures::new(2, true, false).unwrap();
437        assert_eq!(poly.n_output_features(2), 6); // [1, a, b, a², ab, b²]
438
439        let poly = PolynomialFeatures::new(3, false, false).unwrap();
440        assert_eq!(poly.n_output_features(2), 9); // [a, b, a², ab, b², a³, a²b, ab², b³]
441    }
442
443    #[test]
444    fn test_interaction_features() {
445        let interaction = InteractionFeatures::new(false);
446
447        // Input: [a, b, c]
448        // Interactions (without self): [ab, ac, bc]
449        let x = array![[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]];
450
451        let result = interaction.transform(&x).unwrap();
452
453        assert_eq!(result.ncols(), 6); // 3 original + 3 interactions
454
455        // First row
456        assert_abs_diff_eq!(result[[0, 0]], 1.0, epsilon = 1e-10); // a
457        assert_abs_diff_eq!(result[[0, 1]], 2.0, epsilon = 1e-10); // b
458        assert_abs_diff_eq!(result[[0, 2]], 3.0, epsilon = 1e-10); // c
459        assert_abs_diff_eq!(result[[0, 3]], 2.0, epsilon = 1e-10); // ab
460        assert_abs_diff_eq!(result[[0, 4]], 3.0, epsilon = 1e-10); // ac
461        assert_abs_diff_eq!(result[[0, 5]], 6.0, epsilon = 1e-10); // bc
462    }
463
464    #[test]
465    fn test_interaction_features_with_self() {
466        let interaction = InteractionFeatures::new(true);
467
468        let x = array![[2.0, 3.0]];
469        let result = interaction.transform(&x).unwrap();
470
471        assert_eq!(result.ncols(), 5); // 2 original + 3 interactions (a², ab, b²)
472
473        assert_abs_diff_eq!(result[[0, 2]], 4.0, epsilon = 1e-10); // a²
474        assert_abs_diff_eq!(result[[0, 3]], 6.0, epsilon = 1e-10); // ab
475        assert_abs_diff_eq!(result[[0, 4]], 9.0, epsilon = 1e-10); // b²
476    }
477
478    #[test]
479    fn test_uniform_binning() {
480        let binner = FeatureBinner::new(3, BinningStrategy::Uniform).unwrap();
481
482        let x = array![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
483        let bins = binner.fit_transform(&x).unwrap();
484
485        // Should be divided into 3 bins: [0-3), [3-6), [6-9]
486        assert_eq!(bins[0], 0); // 0.0
487        assert_eq!(bins[5], 1); // 5.0
488        assert_eq!(bins[9], 2); // 9.0
489    }
490
491    #[test]
492    fn test_quantile_binning() {
493        let binner = FeatureBinner::new(4, BinningStrategy::Quantile).unwrap();
494
495        let x = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
496        let bins = binner.fit_transform(&x).unwrap();
497
498        // Each bin should have approximately equal number of samples
499        let mut bin_counts = vec![0; 4];
500        for &bin in bins.iter() {
501            bin_counts[bin] += 1;
502        }
503
504        // All bins should have samples
505        for &count in &bin_counts {
506            assert!(count > 0);
507        }
508    }
509
510    #[test]
511    fn test_binning_constant_values() {
512        let binner = FeatureBinner::new(3, BinningStrategy::Uniform).unwrap();
513
514        let x = array![5.0, 5.0, 5.0, 5.0];
515        let bins = binner.fit_transform(&x).unwrap();
516
517        // All values should be in the same bin
518        assert!(bins.iter().all(|&b| b == 0));
519    }
520}