feature_extraction_demo/feature_extraction_demo.rs

//! Feature extraction utilities demonstration
//!
//! This example demonstrates the feature extraction and transformation utilities
//! used to preprocess datasets before training machine learning models.

use scirs2_core::ndarray::{Array1, Array2};
use scirs2_datasets::{
    create_binned_features, load_iris, min_max_scale, polynomial_features, robust_scale,
    statistical_features, BinningStrategy,
};

#[allow(dead_code)]
fn main() {
    println!("=== Feature Extraction Utilities Demonstration ===\n");

    // Create a sample dataset for demonstration; the last row is a deliberate outlier
    let data = Array2::from_shape_vec(
        (6, 2),
        vec![
            1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0, // normal rows
            100.0, 500.0, // outlier row
        ],
    )
    .unwrap();

    println!("Original dataset:");
    print_data_summary(&data, "Original");
    println!();

    // Demonstrate Min-Max Scaling
    println!("=== Min-Max Scaling ============================");
    let mut data_minmax = data.clone();
    min_max_scale(&mut data_minmax, (0.0, 1.0));
    print_data_summary(&data_minmax, "Min-Max Scaled [0, 1]");

    let mut data_custom_range = data.clone();
    min_max_scale(&mut data_custom_range, (-1.0, 1.0));
    print_data_summary(&data_custom_range, "Min-Max Scaled [-1, 1]");
    println!();
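
    // Min-max scaling conventionally rescales each value with the linear map
    //     x' = (x - min) / (max - min) * (hi - lo) + lo
    // The hand-rolled sketch below reproduces the [0, 1] case for column 0 of
    // `data` using only std iterators; it illustrates the formula and is not
    // the library's implementation.
    let column: Vec<f64> = data.column(0).iter().copied().collect();
    let (col_min, col_max) = column
        .iter()
        .fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &x| {
            (lo.min(x), hi.max(x))
        });
    let manual_minmax: Vec<String> = column
        .iter()
        .map(|&x| format!("{:.3}", (x - col_min) / (col_max - col_min)))
        .collect();
    println!("Hand-rolled min-max of column 0: {manual_minmax:?}");
    println!();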

    // Demonstrate Robust Scaling
    println!("=== Robust Scaling ==============================");
    let mut data_robust = data.clone();
    robust_scale(&mut data_robust);
    print_data_summary(&data_robust, "Robust Scaled (Median/IQR)");
    println!();
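
    // Robust scaling conventionally centers each feature on its median and
    // divides by the interquartile range:
    //     x' = (x - median) / (Q3 - Q1)
    // Because the median and IQR ignore the tails, the outlier row
    // (100.0, 500.0) barely distorts the scaled values of the other samples.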

    // Demonstrate Polynomial Features
    println!("=== Polynomial Feature Generation ==============");
    let smalldata = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0]).unwrap();

    println!("Small dataset for polynomial demonstration:");
    print_data_matrix(&smalldata, &["x1", "x2"]);

    let poly_with_bias = polynomial_features(&smalldata, 2, true).unwrap();
    println!("Polynomial features (degree=2, with bias):");
    print_data_matrix(&poly_with_bias, &["1", "x1", "x2", "x1²", "x1*x2", "x2²"]);

    let poly_no_bias = polynomial_features(&smalldata, 2, false).unwrap();
    println!("Polynomial features (degree=2, no bias):");
    print_data_matrix(&poly_no_bias, &["x1", "x2", "x1²", "x1*x2", "x2²"]);
    println!();
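
    // For n input features expanded to degree d, a full polynomial basis has
    // C(n + d, d) terms including the bias column. With n = 2 and d = 2 that is
    // C(4, 2) = 6 columns (1, x1, x2, x1², x1*x2, x2²), or 5 without the bias,
    // matching the headers printed above.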

    // Demonstrate Statistical Feature Extraction
    println!("=== Statistical Feature Extraction =============");
    let statsdata = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();

    let stats_features = statistical_features(&statsdata).unwrap();
    println!("Statistical features for data [1, 2, 3, 4, 5]:");
    println!("(Each sample gets the same global statistics)");
    print_statistical_features(stats_features.row(0).to_owned());
    println!();
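
    // As noted above, every row of `stats_features` carries the same global
    // summary statistics, so these columns describe the dataset as a whole
    // rather than individual samples.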

    // Demonstrate Binning/Discretization
    println!("=== Feature Binning/Discretization =============");
    let binningdata =
        Array2::from_shape_vec((8, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();

    println!("Original data for binning: [1, 2, 3, 4, 5, 6, 7, 8]");

    let uniform_binned = create_binned_features(&binningdata, 3, BinningStrategy::Uniform).unwrap();
    println!(
        "Uniform binning (3 bins): {:?}",
        uniform_binned
            .column(0)
            .iter()
            .map(|&x| x as usize)
            .collect::<Vec<_>>()
    );

    let quantile_binned =
        create_binned_features(&binningdata, 4, BinningStrategy::Quantile).unwrap();
    println!(
        "Quantile binning (4 bins): {:?}",
        quantile_binned
            .column(0)
            .iter()
            .map(|&x| x as usize)
            .collect::<Vec<_>>()
    );
    println!();
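
    // The two strategies place bin edges differently: Uniform splits the value
    // range [min, max] into equal-width intervals, while Quantile places edges
    // at empirical quantiles so each bin receives a roughly equal share of the
    // samples. (Exact edge and tie handling is implementation-specific.)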

    // Demonstrate Feature Extraction Pipeline
    println!("=== Complete Feature Extraction Pipeline =======");
    let iris = load_iris().unwrap();
    println!(
        "Using Iris dataset ({} samples, {} features)",
        iris.n_samples(),
        iris.n_features()
    );

    // Step 1: Robust scaling (handles outliers better)
    let mut scaled_iris = iris.data.clone();
    robust_scale(&mut scaled_iris);
    println!("Step 1: Applied robust scaling");

    // Step 2: Generate polynomial features (degree 2)
    let poly_iris = polynomial_features(&scaled_iris, 2, false).unwrap();
    println!("Step 2: Generated polynomial features");
    println!("  Original features: {}", scaled_iris.ncols());
    println!("  Polynomial features: {}", poly_iris.ncols());

    // Step 3: Create binned features for non-linearity
    let binned_iris = create_binned_features(&scaled_iris, 5, BinningStrategy::Quantile).unwrap();
    println!("Step 3: Created binned features");
    println!("  Binned features: {}", binned_iris.ncols());

    // Step 4: Extract statistical features
    let stats_iris = statistical_features(
        &iris
            .data
            .slice(scirs2_core::ndarray::s![0..20, ..])
            .to_owned(),
    )
    .unwrap();
    println!("Step 4: Extracted statistical features (from first 20 samples)");
    println!("  Statistical features: {}", stats_iris.ncols());
    println!();
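
    // The engineered blocks (scaled, polynomial, binned) could be stacked
    // column-wise into a single design matrix. A sketch, assuming the ndarray
    // re-export also provides `concatenate` and `Axis` (not verified here):
    //
    //     use scirs2_core::ndarray::{concatenate, Axis};
    //     let combined = concatenate(
    //         Axis(1),
    //         &[scaled_iris.view(), poly_iris.view(), binned_iris.view()],
    //     )
    //     .unwrap();
    //     println!("Combined design matrix: {} columns", combined.ncols());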

    // Comparison of scaling methods with outliers
    println!("=== Scaling Methods Comparison (with outliers) =");
    let outlierdata = Array2::from_shape_vec(
        (5, 1),
        vec![1.0, 2.0, 3.0, 4.0, 100.0], // 100.0 is a severe outlier
    )
    .unwrap();

    println!("Original data with outlier: [1, 2, 3, 4, 100]");

    let mut minmax_outlier = outlierdata.clone();
    min_max_scale(&mut minmax_outlier, (0.0, 1.0));
    println!(
        "Min-Max scaled: {:?}",
        minmax_outlier
            .column(0)
            .iter()
            .map(|&x| format!("{x:.3}"))
            .collect::<Vec<_>>()
    );

    let mut robust_outlier = outlierdata.clone();
    robust_scale(&mut robust_outlier);
    println!(
        "Robust scaled: {:?}",
        robust_outlier
            .column(0)
            .iter()
            .map(|&x| format!("{x:.3}"))
            .collect::<Vec<_>>()
    );

    println!("\nNotice how robust scaling is less affected by the outlier!");
    println!();
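
    // With the standard [0, 1] min-max formula, (x - 1) / 99 maps [1, 2, 3, 4, 100]
    // to roughly [0.000, 0.010, 0.020, 0.030, 1.000]: the four ordinary values are
    // squeezed into about 3% of the range by a single outlier, which is exactly
    // the effect robust scaling avoids.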

    // Feature engineering recommendations
    println!("=== Feature Engineering Recommendations ========");
    println!("1. **Scaling**: Use robust scaling when outliers are present");
    println!("2. **Polynomial**: Use degree 2-3 for non-linear relationships");
    println!("3. **Binning**: Use quantile binning for more evenly populated bins");
    println!("4. **Statistical**: Extract global statistics for dataset-level context");
    println!("5. **Pipeline**: Always scale → transform → engineer → validate");
    println!();

    println!("=== Feature Extraction Demo Complete ===========");
}

/// Print a summary of data statistics
#[allow(dead_code)]
fn print_data_summary(data: &Array2<f64>, title: &str) {
    println!("{}: shape=({}, {})", title, data.nrows(), data.ncols());
    for j in 0..data.ncols() {
        let col = data.column(j);
        let min_val = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let max_val = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let mean = col.iter().sum::<f64>() / col.len() as f64;
        println!("  Feature {j}: min={min_val:.3}, max={max_val:.3}, mean={mean:.3}");
    }
}

/// Print a data matrix with feature names
#[allow(dead_code)]
fn print_data_matrix(data: &Array2<f64>, featurenames: &[&str]) {
    // Print the header row of feature names
    print!("     ");
    for name in featurenames {
        print!("{name:>8}");
    }
    println!();

    // Print each data row, indexed by sample
    for i in 0..data.nrows() {
        print!("  {i}: ");
        for j in 0..data.ncols() {
            print!("{:8.3}", data[[i, j]]);
        }
        println!();
    }
}

/// Print statistical features with labels
#[allow(dead_code)]
fn print_statistical_features(stats: Array1<f64>) {
    let labels = [
        "mean", "std", "min", "max", "median", "q25", "q75", "skewness", "kurtosis",
    ];
    println!("  Statistical measures:");
    for (i, &value) in stats.iter().enumerate() {
        if i < labels.len() {
            println!("    {}: {:.3}", labels[i], value);
        }
    }
}