scirs2_datasets/utils/feature_engineering.rs

//! Feature engineering utilities for creating and transforming features
//!
//! This module provides various methods for generating new features from existing data,
//! including polynomial features, statistical features, and binned features. These
//! transformations can help improve machine learning model performance by creating
//! more informative representations of the data.

use crate::error::{DatasetsError, Result};
use ndarray::{Array1, Array2, ArrayView1};

/// Binning strategies for discretization
#[derive(Debug, Clone, Copy)]
pub enum BinningStrategy {
    /// Uniform-width bins based on the min-max range
    Uniform,
    /// Quantile-based bins (equal frequency)
    Quantile,
}

/// Generates polynomial features up to a specified degree
///
/// Creates polynomial combinations of features up to the specified degree.
/// For example, with degree=2 and features [a, b], generates [1, a, b, a², ab, b²].
///
/// # Arguments
///
/// * `data` - Input feature matrix (n_samples, n_features)
/// * `degree` - Maximum polynomial degree (must be >= 1)
/// * `include_bias` - Whether to include the bias column (all ones)
///
/// # Returns
///
/// A new array with polynomial features
///
/// # Examples
///
/// ```rust
/// use ndarray::Array2;
/// use scirs2_datasets::utils::polynomial_features;
///
/// let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
/// let poly_features = polynomial_features(&data, 2, true).unwrap();
/// // Result columns: [1, x1, x2, x1², x1*x2, x2²]
/// ```
pub fn polynomial_features(
    data: &Array2<f64>,
    degree: usize,
    include_bias: bool,
) -> Result<Array2<f64>> {
    if degree == 0 {
        return Err(DatasetsError::InvalidFormat(
            "Polynomial degree must be at least 1".to_string(),
        ));
    }

    let n_samples = data.nrows();
    let n_features = data.ncols();

    // Calculate the number of output polynomial features
    let mut n_output_features = 0;
    if include_bias {
        n_output_features += 1;
    }

    // Count features for each degree
    for d in 1..=degree {
        // Number of multivariate monomials of degree d in n_features variables,
        // given by the formula for combinations with repetition: C(n+d-1, d)
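        // For example, with n_features = 2 and d = 2 this gives C(3, 2) = 3
        // monomials: x1², x1*x2, x2². The incremental product below evaluates
        // the binomial coefficient exactly without computing large factorials.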
        let mut combinations = 1;
        for i in 0..d {
            combinations = combinations * (n_features + i) / (i + 1);
        }
        n_output_features += combinations;
    }

    let mut output = Array2::zeros((n_samples, n_output_features));
    let mut col_idx = 0;

    // Add bias column if requested
    if include_bias {
        output.column_mut(col_idx).fill(1.0);
    }

    // Generate polynomial features
    for sample_idx in 0..n_samples {
        let sample = data.row(sample_idx);
        col_idx = if include_bias { 1 } else { 0 };

        // Degree-1 features (the original features)
        for &feature_val in sample.iter() {
            output[[sample_idx, col_idx]] = feature_val;
            col_idx += 1;
        }

        // Higher-degree features
        for deg in 2..=degree {
            generate_polynomial_combinations(
                &sample.to_owned(),
                deg,
                sample_idx,
                &mut output,
                &mut col_idx,
            );
        }
    }

    Ok(output)
}

/// Helper that writes all monomials of exactly `degree` into `output`,
/// enumerating feature-index combinations with repetition recursively
fn generate_polynomial_combinations(
    features: &Array1<f64>,
    degree: usize,
    sample_idx: usize,
    output: &mut Array2<f64>,
    col_idx: &mut usize,
) {
    fn combinations_recursive(
        features: &Array1<f64>,
        degree: usize,
        start_idx: usize,
        current_product: f64,
        sample_idx: usize,
        output: &mut Array2<f64>,
        col_idx: &mut usize,
    ) {
        if degree == 0 {
            output[[sample_idx, *col_idx]] = current_product;
            *col_idx += 1;
            return;
        }

        for i in start_idx..features.len() {
            combinations_recursive(
                features,
                degree - 1,
                i, // Allow repetition by recursing from i instead of i + 1
                current_product * features[i],
                sample_idx,
                output,
                col_idx,
            );
        }
    }

    combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
}
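
// Enumeration order sketch: for features [a, b] and degree = 2, the recursion
// above emits a², a*b, b², matching the column order the doc examples assume.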

/// Extracts statistical features from the data
///
/// Computes various statistical measures for each feature (column), including
/// central tendency, dispersion, and shape statistics. Because the statistics
/// are global per column, every row of the output is identical.
///
/// # Arguments
///
/// * `data` - Input feature matrix (n_samples, n_features)
///
/// # Returns
///
/// A new array with statistical features: [mean, std, min, max, median, q25, q75, skewness, kurtosis] for each original feature
///
/// # Examples
///
/// ```rust
/// use ndarray::Array2;
/// use scirs2_datasets::utils::statistical_features;
///
/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).unwrap();
/// let stats_features = statistical_features(&data).unwrap();
/// // Result includes 9 statistical measures for each of the 2 original features
/// ```
pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
    let n_samples = data.nrows();
    let n_features = data.ncols();

    if n_samples == 0 || n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "Data cannot be empty for statistical feature extraction".to_string(),
        ));
    }

    // 9 statistical features per original feature
    let n_stat_features = 9;
    let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));

    // Compute each column's statistics once, then broadcast them to every row
    for feature_idx in 0..n_features {
        let feature_values = data.column(feature_idx);

        // Basic statistics
        let mean = feature_values.mean().unwrap_or(0.0);
        let std = feature_values.std(0.0);
        let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let max_val = feature_values
            .iter()
            .fold(f64::NEG_INFINITY, |a, &b| a.max(b));

        // Quantiles
        let mut sorted_values: Vec<f64> = feature_values.to_vec();
        sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());

        let median = calculate_quantile(&sorted_values, 0.5);
        let q25 = calculate_quantile(&sorted_values, 0.25);
        let q75 = calculate_quantile(&sorted_values, 0.75);

        // Shape statistics
        let skewness = calculate_skewness(&feature_values, mean, std);
        let kurtosis = calculate_kurtosis(&feature_values, mean, std);

        let feature_stats = [
            mean, std, min_val, max_val, median, q25, q75, skewness, kurtosis,
        ];

        // Store the statistics in every row
        let base_idx = feature_idx * n_stat_features;
        for sample_idx in 0..n_samples {
            for (offset, &stat) in feature_stats.iter().enumerate() {
                stats[[sample_idx, base_idx + offset]] = stat;
            }
        }
    }

    Ok(stats)
}
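
// Output layout per original feature j: columns 9*j ..= 9*j + 8 hold
// [mean, std, min, max, median, q25, q75, skewness, kurtosis].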

/// Calculates a specific quantile from sorted data using linear interpolation
fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }

    let n = sorted_data.len();
    let index = quantile * (n - 1) as f64;
    let lower = index.floor() as usize;
    let upper = index.ceil() as usize;

    if lower == upper {
        sorted_data[lower]
    } else {
        let weight = index - lower as f64;
        sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
    }
}

/// Calculates skewness (third standardized moment, population form)
fn calculate_skewness(data: &ArrayView1<f64>, mean: f64, std: f64) -> f64 {
    if std <= 1e-10 {
        return 0.0;
    }

    let n = data.len() as f64;
    let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();

    sum_cubed_deviations / n
}

/// Calculates excess kurtosis (fourth standardized moment minus 3)
fn calculate_kurtosis(data: &ArrayView1<f64>, mean: f64, std: f64) -> f64 {
    if std <= 1e-10 {
        return 0.0;
    }

    let n = data.len() as f64;
    let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();

    (sum_fourth_deviations / n) - 3.0 // Subtract 3 so a normal distribution scores 0
}
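
// Reference formulas (population moments), matching the code above:
//   skewness g1 = (1/n) * Σ ((x_i - mean) / std)³
//   excess kurtosis g2 = (1/n) * Σ ((x_i - mean) / std)⁴ - 3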

/// Creates binned (discretized) features from continuous features
///
/// Transforms continuous features into categorical features by binning values
/// into specified ranges. This can be useful for creating non-linear features
/// or reducing the impact of outliers.
///
/// # Arguments
///
/// * `data` - Input feature matrix (n_samples, n_features)
/// * `n_bins` - Number of bins per feature
/// * `strategy` - Binning strategy to use
///
/// # Returns
///
/// A new array with binned features (encoded as bin indices)
///
/// # Examples
///
/// ```rust
/// use ndarray::Array2;
/// use scirs2_datasets::utils::{create_binned_features, BinningStrategy};
///
/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).unwrap();
/// let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
/// // Each feature is now discretized into 3 bins (values 0, 1, 2)
/// ```
pub fn create_binned_features(
    data: &Array2<f64>,
    n_bins: usize,
    strategy: BinningStrategy,
) -> Result<Array2<f64>> {
    if n_bins < 2 {
        return Err(DatasetsError::InvalidFormat(
            "Number of bins must be at least 2".to_string(),
        ));
    }

    let n_samples = data.nrows();
    let n_features = data.ncols();

    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "Data cannot be empty for binning".to_string(),
        ));
    }

    let mut binned = Array2::zeros((n_samples, n_features));

    for j in 0..n_features {
        let column = data.column(j);
        let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;

        for i in 0..n_samples {
            let value = column[i];
            let bin_idx = find_bin_index(value, &bin_edges);
            binned[[i, j]] = bin_idx as f64;
        }
    }

    Ok(binned)
}

/// Calculate bin edges based on the specified strategy
fn calculate_bin_edges(
    data: &ArrayView1<f64>,
    n_bins: usize,
    strategy: &BinningStrategy,
) -> Result<Vec<f64>> {
    match strategy {
        BinningStrategy::Uniform => {
            let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

            if (max_val - min_val).abs() <= 1e-10 {
                return Ok(vec![min_val, min_val + 1e-10]);
            }

            let bin_width = (max_val - min_val) / n_bins as f64;
            let mut edges = Vec::with_capacity(n_bins + 1);

            for i in 0..=n_bins {
                edges.push(min_val + i as f64 * bin_width);
            }

            Ok(edges)
        }
        BinningStrategy::Quantile => {
            let mut sorted_data: Vec<f64> = data.to_vec();
            sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());

            let mut edges = Vec::with_capacity(n_bins + 1);
            edges.push(sorted_data[0]);

            for i in 1..n_bins {
                let quantile = i as f64 / n_bins as f64;
                let edge = calculate_quantile(&sorted_data, quantile);
                edges.push(edge);
            }

            edges.push(sorted_data[sorted_data.len() - 1]);

            Ok(edges)
        }
    }
}
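
// Worked example (quantile strategy): for data [1, 2, 3, 4, 5, 6] with
// n_bins = 3, the 1/3 and 2/3 quantiles interpolate to 8/3 and 13/3, giving
// edges [1.0, 8/3, 13/3, 6.0] and two samples per bin.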

/// Find the bin index for a given value
fn find_bin_index(value: f64, bin_edges: &[f64]) -> usize {
    for (i, &edge) in bin_edges.iter().enumerate().skip(1) {
        if value <= edge {
            return i - 1;
        }
    }
    bin_edges.len() - 2 // Values beyond the last edge fall into the last bin
}
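
// For the uniform edges [1.0, 7/3, 11/3, 5.0] built from data [1, 2, 3, 4, 5]
// with n_bins = 3, find_bin_index maps 1.0 and 2.0 to bin 0, 3.0 to bin 1,
// and 4.0 and 5.0 to bin 2.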

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    #[test]
    fn test_polynomial_features_degree_2() {
        let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let poly = polynomial_features(&data, 2, true).unwrap();

        // Should have: [bias, x1, x2, x1², x1*x2, x2²] = 6 features
        assert_eq!(poly.ncols(), 6);
        assert_eq!(poly.nrows(), 2);

        // Check first sample: [1, 1, 2, 1, 2, 4]
        assert!((poly[[0, 0]] - 1.0).abs() < 1e-10); // bias
        assert!((poly[[0, 1]] - 1.0).abs() < 1e-10); // x1
        assert!((poly[[0, 2]] - 2.0).abs() < 1e-10); // x2
        assert!((poly[[0, 3]] - 1.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 4]] - 2.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 5]] - 4.0).abs() < 1e-10); // x2²
    }

    #[test]
    fn test_polynomial_features_no_bias() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();
        let poly = polynomial_features(&data, 2, false).unwrap();

        // Should have: [x1, x2, x1², x1*x2, x2²] = 5 features (no bias)
        assert_eq!(poly.ncols(), 5);

        // Check values: [2, 3, 4, 6, 9]
        assert!((poly[[0, 0]] - 2.0).abs() < 1e-10); // x1
        assert!((poly[[0, 1]] - 3.0).abs() < 1e-10); // x2
        assert!((poly[[0, 2]] - 4.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 3]] - 6.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 4]] - 9.0).abs() < 1e-10); // x2²
    }

    #[test]
    fn test_polynomial_features_invalid_degree() {
        let data = Array2::from_shape_vec((1, 1), vec![1.0]).unwrap();
        assert!(polynomial_features(&data, 0, true).is_err());
    }
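
    // Supplementary sketch of the C(n+d-1, d) column count at degree 3: for
    // n = 2 features, degrees 1..=3 contribute 2 + 3 + 4 monomials, plus bias.
    #[test]
    fn test_polynomial_features_degree_3() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();
        let poly = polynomial_features(&data, 3, true).unwrap();
        assert_eq!(poly.ncols(), 10);

        // Degree-3 block in combinations-with-repetition order:
        // x1³, x1²*x2, x1*x2², x2³ = 8, 12, 18, 27
        assert!((poly[[0, 6]] - 8.0).abs() < 1e-10);
        assert!((poly[[0, 7]] - 12.0).abs() < 1e-10);
        assert!((poly[[0, 8]] - 18.0).abs() < 1e-10);
        assert!((poly[[0, 9]] - 27.0).abs() < 1e-10);
    }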

    #[test]
    fn test_statistical_features() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let stats = statistical_features(&data).unwrap();

        // Should have 9 statistical features for 1 original feature
        assert_eq!(stats.ncols(), 9);
        assert_eq!(stats.nrows(), 5);

        // All samples should have the same statistical features (global statistics)
        for i in 0..stats.nrows() {
            assert!((stats[[i, 0]] - 3.0).abs() < 1e-10); // mean
            assert!((stats[[i, 2]] - 1.0).abs() < 1e-10); // min
            assert!((stats[[i, 3]] - 5.0).abs() < 1e-10); // max
            assert!((stats[[i, 4]] - 3.0).abs() < 1e-10); // median
        }
    }

    #[test]
    fn test_statistical_features_empty_data() {
        let data = Array2::zeros((0, 1));
        assert!(statistical_features(&data).is_err());
    }

    #[test]
    fn test_create_binned_features_uniform() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();

        assert_eq!(binned.nrows(), 5);
        assert_eq!(binned.ncols(), 1);

        // Check that all values are valid bin indices (0, 1, or 2)
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            assert!(bin_val < 3);
        }
    }

    #[test]
    fn test_create_binned_features_quantile() {
        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        assert_eq!(binned.nrows(), 6);
        assert_eq!(binned.ncols(), 1);

        // With quantile binning, each bin should have a roughly equal number of samples
        let mut bin_counts = vec![0; 3];
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            bin_counts[bin_val] += 1;
        }

        // Each bin should have 2 samples (6 samples / 3 bins)
        for &count in &bin_counts {
            assert_eq!(count, 2);
        }
    }

    #[test]
    fn test_create_binned_features_invalid_bins() {
        let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        assert!(create_binned_features(&data, 1, BinningStrategy::Uniform).is_err());
        assert!(create_binned_features(&data, 0, BinningStrategy::Uniform).is_err());
    }
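
    // Sketch of the degenerate case in calculate_bin_edges: a constant column
    // collapses to a single tiny bin, so every sample lands in bin 0.
    #[test]
    fn test_create_binned_features_constant_column() {
        let data = Array2::from_shape_vec((3, 1), vec![2.0, 2.0, 2.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
        for i in 0..binned.nrows() {
            assert_eq!(binned[[i, 0]], 0.0);
        }
    }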

    #[test]
    fn test_calculate_quantile() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        assert_eq!(calculate_quantile(&data, 0.0), 1.0);
        assert_eq!(calculate_quantile(&data, 0.5), 3.0);
        assert_eq!(calculate_quantile(&data, 1.0), 5.0);
        assert_eq!(calculate_quantile(&data, 0.25), 2.0);
        assert_eq!(calculate_quantile(&data, 0.75), 4.0);
    }
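
    // Sketch of the interpolation path: quantile 0.1 gives index 0.1 * 4 = 0.4,
    // so the result is 1.0 * 0.6 + 2.0 * 0.4 = 1.4.
    #[test]
    fn test_calculate_quantile_interpolation() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        assert!((calculate_quantile(&data, 0.1) - 1.4).abs() < 1e-10);
    }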

    #[test]
    fn test_calculate_skewness() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean().unwrap();
        let std = view.std(0.0);

        let skewness = calculate_skewness(&view, mean, std);
        // Symmetric distribution should have skewness near 0
        assert!(skewness.abs() < 1e-10);
    }

    #[test]
    fn test_calculate_kurtosis() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean().unwrap();
        let std = view.std(0.0);

        let kurtosis = calculate_kurtosis(&view, mean, std);
        // Uniform distribution should have negative excess kurtosis
        assert!(kurtosis < 0.0);
    }
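
    // Sketch: a right-skewed sample (one large outlier) should score positive
    // skewness under the population formula above.
    #[test]
    fn test_calculate_skewness_right_skewed() {
        let data = array![1.0, 1.0, 1.0, 10.0];
        let view = data.view();
        let mean = view.mean().unwrap();
        let std = view.std(0.0);
        assert!(calculate_skewness(&view, mean, std) > 0.0);
    }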

    #[test]
    fn test_feature_extraction_pipeline() {
        // Test a complete feature extraction pipeline
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
            .unwrap();

        // Step 1: Generate polynomial features
        let poly_data = polynomial_features(&data, 2, false).unwrap();

        // Step 2: Create binned features
        let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform).unwrap();

        // Step 3: Generate statistical features
        let stats_data = statistical_features(&data).unwrap();

        // Verify pipeline produces expected shapes
        assert_eq!(poly_data.ncols(), 5); // [x1, x2, x1², x1*x2, x2²]
        assert_eq!(binned_data.ncols(), 5); // Same number of features, but binned
        assert_eq!(stats_data.ncols(), 18); // 9 statistics × 2 original features
        assert_eq!(binned_data.nrows(), 4); // Same number of samples
        assert_eq!(stats_data.nrows(), 4); // Same number of samples
    }

    #[test]
    fn test_binning_strategies_comparison() {
        // Create data with an outlier
        let data =
            Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0]).unwrap();

        let uniform_binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
        let quantile_binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        // With uniform binning, the outlier dominates the range;
        // with quantile binning, each bin has roughly equal frequency.

        // Count bin distributions
        let mut uniform_counts = [0; 3];
        let mut quantile_counts = [0; 3];

        for i in 0..data.nrows() {
            uniform_counts[uniform_binned[[i, 0]] as usize] += 1;
            quantile_counts[quantile_binned[[i, 0]] as usize] += 1;
        }

        // Compare the spread of the bin counts under each strategy
        let uniform_max = *uniform_counts.iter().max().unwrap();
        let uniform_min = *uniform_counts.iter().min().unwrap();
        let quantile_max = *quantile_counts.iter().max().unwrap();
        let quantile_min = *quantile_counts.iter().min().unwrap();

        // Quantile binning should be more balanced (smaller spread between max and min)
        assert!((quantile_max - quantile_min) <= (uniform_max - uniform_min));
    }
}