scirs2_datasets/utils/
feature_engineering.rs

//! Feature engineering utilities for creating and transforming features
//!
//! This module provides various methods for generating new features from existing data,
//! including polynomial features, statistical features, and binned features. These
//! transformations can help improve machine learning model performance by creating
//! more informative representations of the data.

use crate::error::{DatasetsError, Result};
use scirs2_core::ndarray::{Array1, Array2};
use statrs::statistics::Statistics;
/// Binning strategies for discretization
///
/// Selects how `create_binned_features` computes bin edges when converting
/// continuous columns into discrete bin indices.
#[derive(Debug, Clone, Copy)]
pub enum BinningStrategy {
    /// Uniform-width bins based on min-max range
    Uniform,
    /// Quantile-based bins (equal frequency)
    Quantile,
}
20
21/// Generates polynomial features up to a specified degree
22///
23/// Creates polynomial combinations of features up to the specified degree.
24/// For example, with degree=2 and features [a, b], generates [1, a, b, a², ab, b²].
25///
26/// # Arguments
27///
28/// * `data` - Input feature matrix (n_samples, n_features)
29/// * `degree` - Maximum polynomial degree (must be >= 1)
30/// * `include_bias` - Whether to include the bias column (all ones)
31///
32/// # Returns
33///
34/// A new array with polynomial features
35///
36/// # Examples
37///
38/// ```rust
39/// use scirs2_core::ndarray::Array2;
40/// use scirs2_datasets::utils::polynomial_features;
41///
42/// let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
43/// let poly_features = polynomial_features(&data, 2, true).unwrap();
44/// // Result includes: [1, x1, x2, x1², x1*x2, x2²]
45/// ```
46#[allow(dead_code)]
47pub fn polynomial_features(
48    data: &Array2<f64>,
49    degree: usize,
50    include_bias: bool,
51) -> Result<Array2<f64>> {
52    if degree == 0 {
53        return Err(DatasetsError::InvalidFormat(
54            "Polynomial degree must be at least 1".to_string(),
55        ));
56    }
57
58    let n_samples = data.nrows();
59    let n_features = data.ncols();
60
61    // Calculate number of polynomial features
62    let mut n_output_features = 0;
63    if include_bias {
64        n_output_features += 1;
65    }
66
67    // Count features for each degree
68    for d in 1..=degree {
69        // Number of multivariate polynomials of degree d with n_features variables
70        // This uses the formula for combinations with repetition: C(n+k-1, k)
71        let mut combinations = 1;
72        for i in 0..d {
73            combinations = combinations * (n_features + i) / (i + 1);
74        }
75        n_output_features += combinations;
76    }
77
78    let mut output = Array2::zeros((n_samples, n_output_features));
79    let mut col_idx = 0;
80
81    // Add _bias column if requested
82    if include_bias {
83        output.column_mut(col_idx).fill(1.0);
84    }
85
86    // Generate polynomial features
87    for sample_idx in 0..n_samples {
88        let sample = data.row(sample_idx);
89        col_idx = if include_bias { 1 } else { 0 };
90
91        // Degree 1 features (original features)
92        for &feature_val in sample.iter() {
93            output[[sample_idx, col_idx]] = feature_val;
94            col_idx += 1;
95        }
96
97        // Higher degree features
98        for deg in 2..=degree {
99            generate_polynomial_combinations(
100                &sample.to_owned(),
101                deg,
102                sample_idx,
103                &mut output,
104                &mut col_idx,
105            );
106        }
107    }
108
109    Ok(output)
110}
111
112/// Helper function to generate polynomial combinations recursively
113#[allow(dead_code)]
114fn generate_polynomial_combinations(
115    features: &Array1<f64>,
116    degree: usize,
117    sample_idx: usize,
118    output: &mut Array2<f64>,
119    col_idx: &mut usize,
120) {
121    fn combinations_recursive(
122        features: &Array1<f64>,
123        degree: usize,
124        start_idx: usize,
125        current_product: f64,
126        sample_idx: usize,
127        output: &mut Array2<f64>,
128        col_idx: &mut usize,
129    ) {
130        if degree == 0 {
131            output[[sample_idx, *col_idx]] = current_product;
132            *col_idx += 1;
133            return;
134        }
135
136        for i in start_idx..features.len() {
137            combinations_recursive(
138                features,
139                degree - 1,
140                i, // Allow repetition by using i instead of i+1
141                current_product * features[i],
142                sample_idx,
143                output,
144                col_idx,
145            );
146        }
147    }
148
149    combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
150}
151
152/// Extracts statistical features from the data
153///
154/// Computes various statistical measures for each feature, including central tendency,
155/// dispersion, and shape statistics.
156///
157/// # Arguments
158///
159/// * `data` - Input feature matrix (n_samples, n_features)
160///
161/// # Returns
162///
163/// A new array with statistical features: [mean, std, min, max, median, q25, q75, skewness, kurtosis] for each original feature
164///
165/// # Examples
166///
167/// ```rust
168/// use scirs2_core::ndarray::Array2;
169/// use scirs2_datasets::utils::statistical_features;
170///
171/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).unwrap();
172/// let stats_features = statistical_features(&data).unwrap();
173/// // Result includes 9 statistical measures for each of the 2 original features
174/// ```
175#[allow(dead_code)]
176pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
177    let n_samples = data.nrows();
178    let n_features = data.ncols();
179
180    if n_samples == 0 || n_features == 0 {
181        return Err(DatasetsError::InvalidFormat(
182            "Data cannot be empty for statistical feature extraction".to_string(),
183        ));
184    }
185
186    // 9 statistical features per original feature
187    let n_stat_features = 9;
188    let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));
189
190    for sample_idx in 0..n_samples {
191        for feature_idx in 0..n_features {
192            let feature_values = data.column(feature_idx);
193
194            // Calculate basic statistics
195            let mean = {
196                let val = feature_values.mean();
197                if val.is_nan() {
198                    0.0
199                } else {
200                    val
201                }
202            };
203            let std = feature_values.std(0.0);
204            let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
205            let max_val = feature_values
206                .iter()
207                .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
208
209            // Calculate quantiles
210            let mut sorted_values: Vec<f64> = feature_values.to_vec();
211            sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
212
213            let median = calculate_quantile(&sorted_values, 0.5);
214            let q25 = calculate_quantile(&sorted_values, 0.25);
215            let q75 = calculate_quantile(&sorted_values, 0.75);
216
217            // Calculate skewness and kurtosis
218            let skewness = calculate_skewness(&feature_values, mean, std);
219            let kurtosis = calculate_kurtosis(&feature_values, mean, std);
220
221            // Store statistical features
222            let base_idx = feature_idx * n_stat_features;
223            stats[[sample_idx, base_idx]] = mean;
224            stats[[sample_idx, base_idx + 1]] = std;
225            stats[[sample_idx, base_idx + 2]] = min_val;
226            stats[[sample_idx, base_idx + 3]] = max_val;
227            stats[[sample_idx, base_idx + 4]] = median;
228            stats[[sample_idx, base_idx + 5]] = q25;
229            stats[[sample_idx, base_idx + 6]] = q75;
230            stats[[sample_idx, base_idx + 7]] = skewness;
231            stats[[sample_idx, base_idx + 8]] = kurtosis;
232        }
233    }
234
235    Ok(stats)
236}
237
/// Calculates a specific quantile from sorted data
///
/// Uses linear interpolation between the two nearest order statistics.
/// Returns 0.0 for empty input.
#[allow(dead_code)]
fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
    let n = sorted_data.len();
    if n == 0 {
        return 0.0;
    }

    // Fractional position of the requested quantile within the sorted values.
    let position = quantile * (n - 1) as f64;
    let lo = position.floor() as usize;
    let hi = position.ceil() as usize;

    if lo == hi {
        // Landed exactly on an element — no interpolation needed.
        sorted_data[lo]
    } else {
        // Linear interpolation between the two surrounding order statistics.
        let weight = position - lo as f64;
        sorted_data[lo] * (1.0 - weight) + sorted_data[hi] * weight
    }
}
257
258/// Calculates skewness (third moment)
259#[allow(dead_code)]
260fn calculate_skewness(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
261    if std <= 1e-10 {
262        return 0.0;
263    }
264
265    let n = data.len() as f64;
266    let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();
267
268    sum_cubed_deviations / n
269}
270
271/// Calculates kurtosis (fourth moment)
272#[allow(dead_code)]
273fn calculate_kurtosis(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
274    if std <= 1e-10 {
275        return 0.0;
276    }
277
278    let n = data.len() as f64;
279    let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();
280
281    (sum_fourth_deviations / n) - 3.0 // Excess kurtosis (subtract 3 for normal distribution)
282}
283
284/// Creates binned (discretized) features from continuous features
285///
286/// Transforms continuous features into categorical features by binning values
287/// into specified ranges. This can be useful for creating non-linear features
288/// or reducing the impact of outliers.
289///
290/// # Arguments
291///
292/// * `data` - Input feature matrix (n_samples, n_features)
293/// * `n_bins` - Number of bins per feature
294/// * `strategy` - Binning strategy to use
295///
296/// # Returns
297///
298/// A new array with binned features (encoded as bin indices)
299///
300/// # Examples
301///
302/// ```rust
303/// use scirs2_core::ndarray::Array2;
304/// use scirs2_datasets::utils::{create_binned_features, BinningStrategy};
305///
306/// let data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0]).unwrap();
307/// let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
308/// // Each feature is now discretized into 3 bins (values 0, 1, 2)
309/// ```
310#[allow(dead_code)]
311pub fn create_binned_features(
312    data: &Array2<f64>,
313    n_bins: usize,
314    strategy: BinningStrategy,
315) -> Result<Array2<f64>> {
316    if n_bins < 2 {
317        return Err(DatasetsError::InvalidFormat(
318            "Number of _bins must be at least 2".to_string(),
319        ));
320    }
321
322    let n_samples = data.nrows();
323    let n_features = data.ncols();
324    let mut binned = Array2::zeros((n_samples, n_features));
325
326    for j in 0..n_features {
327        let column = data.column(j);
328        let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;
329
330        for i in 0..n_samples {
331            let value = column[i];
332            let bin_idx = find_bin_index(value, &bin_edges);
333            binned[[i, j]] = bin_idx as f64;
334        }
335    }
336
337    Ok(binned)
338}
339
340/// Calculate bin edges based on the specified strategy
341#[allow(dead_code)]
342fn calculate_bin_edges(
343    data: &scirs2_core::ndarray::ArrayView1<f64>,
344    n_bins: usize,
345    strategy: &BinningStrategy,
346) -> Result<Vec<f64>> {
347    match strategy {
348        BinningStrategy::Uniform => {
349            let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
350            let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
351
352            if (max_val - min_val).abs() <= 1e-10 {
353                return Ok(vec![min_val, min_val + 1e-10]);
354            }
355
356            let bin_width = (max_val - min_val) / n_bins as f64;
357            let mut edges = Vec::with_capacity(n_bins + 1);
358
359            for i in 0..=n_bins {
360                edges.push(min_val + i as f64 * bin_width);
361            }
362
363            Ok(edges)
364        }
365        BinningStrategy::Quantile => {
366            let mut sorted_data: Vec<f64> = data.to_vec();
367            sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
368
369            let mut edges = Vec::with_capacity(n_bins + 1);
370            edges.push(sorted_data[0]);
371
372            for i in 1..n_bins {
373                let quantile = i as f64 / n_bins as f64;
374                let edge = calculate_quantile(&sorted_data, quantile);
375                edges.push(edge);
376            }
377
378            edges.push(sorted_data[sorted_data.len() - 1]);
379
380            Ok(edges)
381        }
382    }
383}
384
/// Find the bin index for a given value
///
/// `bin_edges` must contain at least two edges. Values at or below the
/// second edge map to bin 0; values above the last edge are clamped into
/// the final bin.
#[allow(dead_code)]
fn find_bin_index(value: f64, bin_edges: &[f64]) -> usize {
    // First edge (index 0) is the lower bound of bin 0, so scan from the
    // second edge onward; `position` on the skipped iterator is already the
    // zero-based bin index.
    bin_edges
        .iter()
        .skip(1)
        .position(|&edge| value <= edge)
        .unwrap_or(bin_edges.len() - 2)
}
395
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    // Degree-2 expansion with bias: output column order is
    // [bias, x1, x2, x1², x1*x2, x2²].
    #[test]
    fn test_polynomial_features_degree_2() {
        let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let poly = polynomial_features(&data, 2, true).unwrap();

        // Should have: [bias, x1, x2, x1², x1*x2, x2²] = 6 features
        assert_eq!(poly.ncols(), 6);
        assert_eq!(poly.nrows(), 2);

        // Check first sample: [1, 1, 2, 1, 2, 4]
        assert!((poly[[0, 0]] - 1.0).abs() < 1e-10); // bias
        assert!((poly[[0, 1]] - 1.0).abs() < 1e-10); // x1
        assert!((poly[[0, 2]] - 2.0).abs() < 1e-10); // x2
        assert!((poly[[0, 3]] - 1.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 4]] - 2.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 5]] - 4.0).abs() < 1e-10); // x2²
    }

    // Same expansion without the bias column.
    #[test]
    fn test_polynomial_features_no_bias() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();
        let poly = polynomial_features(&data, 2, false).unwrap();

        // Should have: [x1, x2, x1², x1*x2, x2²] = 5 features (no bias)
        assert_eq!(poly.ncols(), 5);

        // Check values: [2, 3, 4, 6, 9]
        assert!((poly[[0, 0]] - 2.0).abs() < 1e-10); // x1
        assert!((poly[[0, 1]] - 3.0).abs() < 1e-10); // x2
        assert!((poly[[0, 2]] - 4.0).abs() < 1e-10); // x1²
        assert!((poly[[0, 3]] - 6.0).abs() < 1e-10); // x1*x2
        assert!((poly[[0, 4]] - 9.0).abs() < 1e-10); // x2²
    }

    // Degree 0 is rejected with an error.
    #[test]
    fn test_polynomial_features_invalid_degree() {
        let data = Array2::from_shape_vec((1, 1), vec![1.0]).unwrap();
        assert!(polynomial_features(&data, 0, true).is_err());
    }

    #[test]
    fn test_statistical_features() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let stats = statistical_features(&data).unwrap();

        // Should have 9 statistical features for 1 original feature
        assert_eq!(stats.ncols(), 9);
        assert_eq!(stats.nrows(), 5);

        // All samples should have the same statistical features (global statistics)
        for i in 0..stats.nrows() {
            assert!((stats[[i, 0]] - 3.0).abs() < 1e-10); // mean
            assert!((stats[[i, 2]] - 1.0).abs() < 1e-10); // min
            assert!((stats[[i, 3]] - 5.0).abs() < 1e-10); // max
            assert!((stats[[i, 4]] - 3.0).abs() < 1e-10); // median
        }
    }

    // Zero-row input is an error, not a panic.
    #[test]
    fn test_statistical_features_empty_data() {
        let data = Array2::zeros((0, 1));
        assert!(statistical_features(&data).is_err());
    }

    #[test]
    fn test_create_binned_features_uniform() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();

        assert_eq!(binned.nrows(), 5);
        assert_eq!(binned.ncols(), 1);

        // Check that all values are valid bin indices (0, 1, or 2)
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            assert!(bin_val < 3);
        }
    }

    #[test]
    fn test_create_binned_features_quantile() {
        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        assert_eq!(binned.nrows(), 6);
        assert_eq!(binned.ncols(), 1);

        // With quantile binning, each bin should have roughly equal number of samples
        let mut bin_counts = vec![0; 3];
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            bin_counts[bin_val] += 1;
        }

        // Each bin should have 2 samples (6 samples / 3 bins)
        for &count in &bin_counts {
            assert_eq!(count, 2);
        }
    }

    // Fewer than 2 bins is rejected.
    #[test]
    fn test_create_binned_features_invalid_bins() {
        let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        assert!(create_binned_features(&data, 1, BinningStrategy::Uniform).is_err());
        assert!(create_binned_features(&data, 0, BinningStrategy::Uniform).is_err());
    }

    // Quantiles of [1..5] land exactly on elements — no interpolation.
    #[test]
    fn test_calculate_quantile() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        assert_eq!(calculate_quantile(&data, 0.0), 1.0);
        assert_eq!(calculate_quantile(&data, 0.5), 3.0);
        assert_eq!(calculate_quantile(&data, 1.0), 5.0);
        assert_eq!(calculate_quantile(&data, 0.25), 2.0);
        assert_eq!(calculate_quantile(&data, 0.75), 4.0);
    }

    #[test]
    fn test_calculate_skewness() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        let skewness = calculate_skewness(&view, mean, std);
        // Symmetric distribution should have skewness near 0
        assert!(skewness.abs() < 1e-10);
    }

    #[test]
    fn test_calculate_kurtosis() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        let kurtosis = calculate_kurtosis(&view, mean, std);
        // Uniform distribution should have negative excess kurtosis
        assert!(kurtosis < 0.0);
    }

    // End-to-end smoke test chaining the three public transforms.
    #[test]
    fn test_feature_extraction_pipeline() {
        // Test a complete feature extraction pipeline
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
            .unwrap();

        // Step 1: Generate polynomial features
        let poly_data = polynomial_features(&data, 2, false).unwrap();

        // Step 2: Create binned features
        let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform).unwrap();

        // Step 3: Generate statistical features
        let stats_data = statistical_features(&data).unwrap();

        // Verify pipeline produces expected shapes
        assert_eq!(poly_data.ncols(), 5); // [x1, x2, x1², x1*x2, x2²]
        assert_eq!(binned_data.ncols(), 5); // Same number of features, but binned
        assert_eq!(stats_data.ncols(), 18); // 9 statistics × 2 original features
        assert_eq!(binned_data.nrows(), 4); // Same number of samples
        assert_eq!(stats_data.nrows(), 4); // Same number of samples
    }

    // Uniform vs quantile binning in the presence of an outlier: quantile
    // should spread samples more evenly across bins.
    #[test]
    fn test_binning_strategies_comparison() {
        // Create data with outliers
        let data =
            Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0]).unwrap();

        let uniform_binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
        let quantile_binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        // With uniform binning, the outlier will dominate the range
        // With quantile binning, each bin should have roughly equal frequency

        // Count bin distributions
        let mut uniform_counts = [0; 3];
        let mut quantile_counts = [0; 3];

        for i in 0..data.nrows() {
            uniform_counts[uniform_binned[[i, 0]] as usize] += 1;
            quantile_counts[quantile_binned[[i, 0]] as usize] += 1;
        }

        // Quantile binning should have more balanced distribution
        let uniform_max = *uniform_counts.iter().max().unwrap();
        let uniform_min = *uniform_counts.iter().min().unwrap();
        let quantile_max = *quantile_counts.iter().max().unwrap();
        let quantile_min = *quantile_counts.iter().min().unwrap();

        // Quantile binning should be more balanced (smaller difference between max and min)
        assert!((quantile_max - quantile_min) <= (uniform_max - uniform_min));
    }
}