Skip to main content

scirs2_datasets/utils/
feature_engineering.rs

1//! Feature engineering utilities for creating and transforming features
2//!
3//! This module provides various methods for generating new features from existing data,
4//! including polynomial features, statistical features, and binned features. These
5//! transformations can help improve machine learning model performance by creating
6//! more informative representations of the data.
7
8use crate::error::{DatasetsError, Result};
9use scirs2_core::ndarray::{Array1, Array2};
10use statrs::statistics::Statistics;
11
/// Binning strategies for discretization
///
/// `PartialEq`/`Eq` are derived so strategies can be compared directly
/// (e.g. in configuration handling or tests) instead of matching on variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinningStrategy {
    /// Uniform-width bins based on min-max range
    Uniform,
    /// Quantile-based bins (equal frequency)
    Quantile,
}
20
21/// Generates polynomial features up to a specified degree
22///
23/// Creates polynomial combinations of features up to the specified degree.
24/// For example, with degree=2 and features [a, b], generates [1, a, b, a², ab, b²].
25///
26/// # Arguments
27///
28/// * `data` - Input feature matrix (n_samples, n_features)
29/// * `degree` - Maximum polynomial degree (must be >= 1)
30/// * `include_bias` - Whether to include the bias column (all ones)
31///
32/// # Returns
33///
34/// A new array with polynomial features
35///
36/// # Examples
37///
38/// ```rust
39/// use scirs2_core::ndarray::Array2;
40/// use scirs2_datasets::utils::polynomial_features;
41///
42/// let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).expect("Operation failed");
43/// let poly_features = polynomial_features(&data, 2, true).expect("Operation failed");
44/// // Result includes: [1, x1, x2, x1², x1*x2, x2²]
45/// ```
46#[allow(dead_code)]
47pub fn polynomial_features(
48    data: &Array2<f64>,
49    degree: usize,
50    include_bias: bool,
51) -> Result<Array2<f64>> {
52    if degree == 0 {
53        return Err(DatasetsError::InvalidFormat(
54            "Polynomial degree must be at least 1".to_string(),
55        ));
56    }
57
58    let n_samples = data.nrows();
59    let n_features = data.ncols();
60
61    // Calculate number of polynomial features
62    let mut n_output_features = 0;
63    if include_bias {
64        n_output_features += 1;
65    }
66
67    // Count features for each degree
68    for d in 1..=degree {
69        // Number of multivariate polynomials of degree d with n_features variables
70        // This uses the formula for combinations with repetition: C(n+k-1, k)
71        let mut combinations = 1;
72        for i in 0..d {
73            combinations = combinations * (n_features + i) / (i + 1);
74        }
75        n_output_features += combinations;
76    }
77
78    let mut output = Array2::zeros((n_samples, n_output_features));
79    let mut col_idx = 0;
80
81    // Add _bias column if requested
82    if include_bias {
83        output.column_mut(col_idx).fill(1.0);
84    }
85
86    // Generate polynomial features
87    for sample_idx in 0..n_samples {
88        let sample = data.row(sample_idx);
89        col_idx = if include_bias { 1 } else { 0 };
90
91        // Degree 1 features (original features)
92        for &feature_val in sample.iter() {
93            output[[sample_idx, col_idx]] = feature_val;
94            col_idx += 1;
95        }
96
97        // Higher degree features
98        for deg in 2..=degree {
99            generate_polynomial_combinations(
100                &sample.to_owned(),
101                deg,
102                sample_idx,
103                &mut output,
104                &mut col_idx,
105            );
106        }
107    }
108
109    Ok(output)
110}
111
112/// Helper function to generate polynomial combinations recursively
113#[allow(dead_code)]
114fn generate_polynomial_combinations(
115    features: &Array1<f64>,
116    degree: usize,
117    sample_idx: usize,
118    output: &mut Array2<f64>,
119    col_idx: &mut usize,
120) {
121    fn combinations_recursive(
122        features: &Array1<f64>,
123        degree: usize,
124        start_idx: usize,
125        current_product: f64,
126        sample_idx: usize,
127        output: &mut Array2<f64>,
128        col_idx: &mut usize,
129    ) {
130        if degree == 0 {
131            output[[sample_idx, *col_idx]] = current_product;
132            *col_idx += 1;
133            return;
134        }
135
136        for i in start_idx..features.len() {
137            combinations_recursive(
138                features,
139                degree - 1,
140                i, // Allow repetition by using i instead of i+1
141                current_product * features[i],
142                sample_idx,
143                output,
144                col_idx,
145            );
146        }
147    }
148
149    combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
150}
151
152/// Extracts statistical features from the data
153///
154/// Computes various statistical measures for each feature, including central tendency,
155/// dispersion, and shape statistics.
156///
157/// # Arguments
158///
159/// * `data` - Input feature matrix (n_samples, n_features)
160///
161/// # Returns
162///
163/// A new array with statistical features: [mean, std, min, max, median, q25, q75, skewness, kurtosis] for each original feature
164///
165/// # Examples
166///
167/// ```rust
168/// use scirs2_core::ndarray::Array2;
169/// use scirs2_datasets::utils::statistical_features;
170///
171/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).expect("Operation failed");
172/// let stats_features = statistical_features(&data).expect("Operation failed");
173/// // Result includes 9 statistical measures for each of the 2 original features
174/// ```
175#[allow(dead_code)]
176pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
177    let n_samples = data.nrows();
178    let n_features = data.ncols();
179
180    if n_samples == 0 || n_features == 0 {
181        return Err(DatasetsError::InvalidFormat(
182            "Data cannot be empty for statistical feature extraction".to_string(),
183        ));
184    }
185
186    // 9 statistical features per original feature
187    let n_stat_features = 9;
188    let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));
189
190    for sample_idx in 0..n_samples {
191        for feature_idx in 0..n_features {
192            let feature_values = data.column(feature_idx);
193
194            // Calculate basic statistics
195            let mean = {
196                let val = feature_values.mean();
197                if val.is_nan() {
198                    0.0
199                } else {
200                    val
201                }
202            };
203            let std = feature_values.std(0.0);
204            let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
205            let max_val = feature_values
206                .iter()
207                .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
208
209            // Calculate quantiles
210            let mut sorted_values: Vec<f64> = feature_values.to_vec();
211            sorted_values.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
212
213            let median = calculate_quantile(&sorted_values, 0.5);
214            let q25 = calculate_quantile(&sorted_values, 0.25);
215            let q75 = calculate_quantile(&sorted_values, 0.75);
216
217            // Calculate skewness and kurtosis
218            let skewness = calculate_skewness(&feature_values, mean, std);
219            let kurtosis = calculate_kurtosis(&feature_values, mean, std);
220
221            // Store statistical features
222            let base_idx = feature_idx * n_stat_features;
223            stats[[sample_idx, base_idx]] = mean;
224            stats[[sample_idx, base_idx + 1]] = std;
225            stats[[sample_idx, base_idx + 2]] = min_val;
226            stats[[sample_idx, base_idx + 3]] = max_val;
227            stats[[sample_idx, base_idx + 4]] = median;
228            stats[[sample_idx, base_idx + 5]] = q25;
229            stats[[sample_idx, base_idx + 6]] = q75;
230            stats[[sample_idx, base_idx + 7]] = skewness;
231            stats[[sample_idx, base_idx + 8]] = kurtosis;
232        }
233    }
234
235    Ok(stats)
236}
237
/// Calculates a specific quantile from sorted data
///
/// Uses linear interpolation between the two nearest order statistics
/// (the same convention as NumPy's default `quantile`). An empty slice
/// yields 0.0.
#[allow(dead_code)]
fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
    match sorted_data.len() {
        0 => 0.0,
        n => {
            // Fractional rank within [0, n-1].
            let position = quantile * (n - 1) as f64;
            let lo = position.floor() as usize;
            let hi = position.ceil() as usize;

            if lo == hi {
                // Rank lands exactly on an element — no interpolation needed.
                sorted_data[lo]
            } else {
                // Blend the two neighbors by the fractional part of the rank.
                let frac = position - lo as f64;
                sorted_data[lo] * (1.0 - frac) + sorted_data[hi] * frac
            }
        }
    }
}
257
258/// Calculates skewness (third moment)
259#[allow(dead_code)]
260fn calculate_skewness(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
261    if std <= 1e-10 {
262        return 0.0;
263    }
264
265    let n = data.len() as f64;
266    let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();
267
268    sum_cubed_deviations / n
269}
270
271/// Calculates kurtosis (fourth moment)
272#[allow(dead_code)]
273fn calculate_kurtosis(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
274    if std <= 1e-10 {
275        return 0.0;
276    }
277
278    let n = data.len() as f64;
279    let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();
280
281    (sum_fourth_deviations / n) - 3.0 // Excess kurtosis (subtract 3 for normal distribution)
282}
283
284/// Creates binned (discretized) features from continuous features
285///
286/// Transforms continuous features into categorical features by binning values
287/// into specified ranges. This can be useful for creating non-linear features
288/// or reducing the impact of outliers.
289///
290/// # Arguments
291///
292/// * `data` - Input feature matrix (n_samples, n_features)
293/// * `n_bins` - Number of bins per feature
294/// * `strategy` - Binning strategy to use
295///
296/// # Returns
297///
298/// A new array with binned features (encoded as bin indices)
299///
300/// # Examples
301///
302/// ```rust
303/// use scirs2_core::ndarray::Array2;
304/// use scirs2_datasets::utils::{create_binned_features, BinningStrategy};
305///
306/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).expect("Operation failed");
307/// let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).expect("Operation failed");
308/// // Each feature is now discretized into 3 bins (values 0, 1, 2)
309/// ```
310#[allow(dead_code)]
311pub fn create_binned_features(
312    data: &Array2<f64>,
313    n_bins: usize,
314    strategy: BinningStrategy,
315) -> Result<Array2<f64>> {
316    if n_bins < 2 {
317        return Err(DatasetsError::InvalidFormat(
318            "Number of _bins must be at least 2".to_string(),
319        ));
320    }
321
322    let n_samples = data.nrows();
323    let n_features = data.ncols();
324    let mut binned = Array2::zeros((n_samples, n_features));
325
326    for j in 0..n_features {
327        let column = data.column(j);
328        let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;
329
330        for i in 0..n_samples {
331            let value = column[i];
332            let bin_idx = find_bin_index(value, &bin_edges);
333            binned[[i, j]] = bin_idx as f64;
334        }
335    }
336
337    Ok(binned)
338}
339
340/// Calculate bin edges based on the specified strategy
341#[allow(dead_code)]
342fn calculate_bin_edges(
343    data: &scirs2_core::ndarray::ArrayView1<f64>,
344    n_bins: usize,
345    strategy: &BinningStrategy,
346) -> Result<Vec<f64>> {
347    match strategy {
348        BinningStrategy::Uniform => {
349            let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
350            let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
351
352            if (max_val - min_val).abs() <= 1e-10 {
353                return Ok(vec![min_val, min_val + 1e-10]);
354            }
355
356            let bin_width = (max_val - min_val) / n_bins as f64;
357            let mut edges = Vec::with_capacity(n_bins + 1);
358
359            for i in 0..=n_bins {
360                edges.push(min_val + i as f64 * bin_width);
361            }
362
363            Ok(edges)
364        }
365        BinningStrategy::Quantile => {
366            let mut sorted_data: Vec<f64> = data.to_vec();
367            sorted_data.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
368
369            let mut edges = Vec::with_capacity(n_bins + 1);
370            edges.push(sorted_data[0]);
371
372            for i in 1..n_bins {
373                let quantile = i as f64 / n_bins as f64;
374                let edge = calculate_quantile(&sorted_data, quantile);
375                edges.push(edge);
376            }
377
378            edges.push(sorted_data[sorted_data.len() - 1]);
379
380            Ok(edges)
381        }
382    }
383}
384
/// Find the bin index for a given value
///
/// Scans the ascending bin edges and returns the index of the first bin whose
/// upper edge is >= `value`. Values below the first edge fall into bin 0;
/// values above the last edge are clamped into the last bin.
///
/// Fixes from review: the parameter was named `_value` (the underscore prefix
/// wrongly signaled "unused" for a value that is read every iteration), and
/// the `len() - 2` fallback could underflow `usize` on a degenerate edge list.
#[allow(dead_code)]
fn find_bin_index(value: f64, bin_edges: &[f64]) -> usize {
    bin_edges
        .iter()
        .enumerate()
        .skip(1) // edges[0] is the lower bound of bin 0, not an upper edge
        .find(|&(_, &edge)| value <= edge)
        .map(|(i, _)| i - 1)
        // Value exceeds every edge: clamp to the last bin; `saturating_sub`
        // guards against a pathological edge list with fewer than 2 entries.
        .unwrap_or_else(|| bin_edges.len().saturating_sub(2))
}
395
// Unit tests for the feature-engineering helpers above. The ndarray-based
// functions are exercised end-to-end; the slice-based helpers directly.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_polynomial_features_degree_2() {
        let data =
            Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).expect("Operation failed");
        let poly = polynomial_features(&data, 2, true).expect("Operation failed");

        // Should have: [bias, x1, x2, x1², x1*x2, x2²] = 6 features
        assert_eq!(poly.ncols(), 6);
        assert_eq!(poly.nrows(), 2);

        // Check first sample: [1, 1, 2, 1, 2, 4]
        assert!((poly[[0, 0]] - 1.0).abs() < 1e-10); // bias
        assert!((poly[[0, 1]] - 1.0).abs() < 1e-10); // x1
        assert!((poly[[0, 2]] - 2.0).abs() < 1e-10); // x2
        assert!((poly[[0, 3]] - 1.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 4]] - 2.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 5]] - 4.0).abs() < 1e-10); // x2²
    }

    #[test]
    fn test_polynomial_features_no_bias() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).expect("Operation failed");
        let poly = polynomial_features(&data, 2, false).expect("Operation failed");

        // Should have: [x1, x2, x1², x1*x2, x2²] = 5 features (no bias)
        assert_eq!(poly.ncols(), 5);

        // Check values: [2, 3, 4, 6, 9]
        assert!((poly[[0, 0]] - 2.0).abs() < 1e-10); // x1
        assert!((poly[[0, 1]] - 3.0).abs() < 1e-10); // x2
        assert!((poly[[0, 2]] - 4.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 3]] - 6.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 4]] - 9.0).abs() < 1e-10); // x2²
    }

    // degree = 0 is rejected with an error rather than silently producing
    // an empty feature set.
    #[test]
    fn test_polynomial_features_invalid_degree() {
        let data = Array2::from_shape_vec((1, 1), vec![1.0]).expect("Operation failed");
        assert!(polynomial_features(&data, 0, true).is_err());
    }

    #[test]
    fn test_statistical_features() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0])
            .expect("Operation failed");
        let stats = statistical_features(&data).expect("Operation failed");

        // Should have 9 statistical features for 1 original feature
        assert_eq!(stats.ncols(), 9);
        assert_eq!(stats.nrows(), 5);

        // All samples should have the same statistical features (global statistics)
        for i in 0..stats.nrows() {
            assert!((stats[[i, 0]] - 3.0).abs() < 1e-10); // mean
            assert!((stats[[i, 2]] - 1.0).abs() < 1e-10); // min
            assert!((stats[[i, 3]] - 5.0).abs() < 1e-10); // max
            assert!((stats[[i, 4]] - 3.0).abs() < 1e-10); // median
        }
    }

    #[test]
    fn test_statistical_features_empty_data() {
        let data = Array2::zeros((0, 1));
        assert!(statistical_features(&data).is_err());
    }

    #[test]
    fn test_create_binned_features_uniform() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0])
            .expect("Operation failed");
        let binned =
            create_binned_features(&data, 3, BinningStrategy::Uniform).expect("Operation failed");

        assert_eq!(binned.nrows(), 5);
        assert_eq!(binned.ncols(), 1);

        // Check that all values are valid bin indices (0, 1, or 2)
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            assert!(bin_val < 3);
        }
    }

    #[test]
    fn test_create_binned_features_quantile() {
        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
            .expect("Operation failed");
        let binned =
            create_binned_features(&data, 3, BinningStrategy::Quantile).expect("Operation failed");

        assert_eq!(binned.nrows(), 6);
        assert_eq!(binned.ncols(), 1);

        // With quantile binning, each bin should have roughly equal number of samples
        let mut bin_counts = vec![0; 3];
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            bin_counts[bin_val] += 1;
        }

        // Each bin should have 2 samples (6 samples / 3 bins)
        for &count in &bin_counts {
            assert_eq!(count, 2);
        }
    }

    // Fewer than 2 bins is meaningless, so both 1 and 0 must be rejected.
    #[test]
    fn test_create_binned_features_invalid_bins() {
        let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).expect("Operation failed");
        assert!(create_binned_features(&data, 1, BinningStrategy::Uniform).is_err());
        assert!(create_binned_features(&data, 0, BinningStrategy::Uniform).is_err());
    }

    // Exact quantiles: these ranks land on elements, so no interpolation occurs.
    #[test]
    fn test_calculate_quantile() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        assert_eq!(calculate_quantile(&data, 0.0), 1.0);
        assert_eq!(calculate_quantile(&data, 0.5), 3.0);
        assert_eq!(calculate_quantile(&data, 1.0), 5.0);
        assert_eq!(calculate_quantile(&data, 0.25), 2.0);
        assert_eq!(calculate_quantile(&data, 0.75), 4.0);
    }

    #[test]
    fn test_calculate_skewness() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        let skewness = calculate_skewness(&view, mean, std);
        // Symmetric distribution should have skewness near 0
        assert!(skewness.abs() < 1e-10);
    }

    #[test]
    fn test_calculate_kurtosis() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        let kurtosis = calculate_kurtosis(&view, mean, std);
        // Uniform distribution should have negative excess kurtosis
        assert!(kurtosis < 0.0);
    }

    // Integration test: chains the three transformations and checks only the
    // resulting shapes, not values.
    #[test]
    fn test_feature_extraction_pipeline() {
        // Test a complete feature extraction pipeline
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
            .expect("Failed to create features");

        // Step 1: Generate polynomial features
        let poly_data = polynomial_features(&data, 2, false).expect("Operation failed");

        // Step 2: Create binned features
        let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform)
            .expect("Operation failed");

        // Step 3: Generate statistical features
        let stats_data = statistical_features(&data).expect("Operation failed");

        // Verify pipeline produces expected shapes
        assert_eq!(poly_data.ncols(), 5); // [x1, x2, x1², x1*x2, x2²]
        assert_eq!(binned_data.ncols(), 5); // Same number of features, but binned
        assert_eq!(stats_data.ncols(), 18); // 9 statistics × 2 original features
        assert_eq!(binned_data.nrows(), 4); // Same number of samples
        assert_eq!(stats_data.nrows(), 4); // Same number of samples
    }

    #[test]
    fn test_binning_strategies_comparison() {
        // Create data with outliers
        let data = Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0])
            .expect("Operation failed");

        let uniform_binned =
            create_binned_features(&data, 3, BinningStrategy::Uniform).expect("Operation failed");
        let quantile_binned =
            create_binned_features(&data, 3, BinningStrategy::Quantile).expect("Operation failed");

        // With uniform binning, the outlier will dominate the range
        // With quantile binning, each bin should have roughly equal frequency

        // Count bin distributions
        let mut uniform_counts = [0; 3];
        let mut quantile_counts = [0; 3];

        for i in 0..data.nrows() {
            uniform_counts[uniform_binned[[i, 0]] as usize] += 1;
            quantile_counts[quantile_binned[[i, 0]] as usize] += 1;
        }

        // Quantile binning should have more balanced distribution
        let uniform_max = *uniform_counts.iter().max().expect("Operation failed");
        let uniform_min = *uniform_counts.iter().min().expect("Operation failed");
        let quantile_max = *quantile_counts.iter().max().expect("Operation failed");
        let quantile_min = *quantile_counts.iter().min().expect("Operation failed");

        // Quantile binning should be more balanced (smaller difference between max and min)
        assert!((quantile_max - quantile_min) <= (uniform_max - uniform_min));
    }
}
605}