1use crate::error::{DatasetsError, Result};
9use scirs2_core::ndarray::{Array1, Array2};
10use statrs::statistics::Statistics;
11
12#[derive(Debug, Clone, Copy)]
14pub enum BinningStrategy {
15 Uniform,
17 Quantile,
19}
20
21#[allow(dead_code)]
47pub fn polynomial_features(
48 data: &Array2<f64>,
49 degree: usize,
50 include_bias: bool,
51) -> Result<Array2<f64>> {
52 if degree == 0 {
53 return Err(DatasetsError::InvalidFormat(
54 "Polynomial degree must be at least 1".to_string(),
55 ));
56 }
57
58 let n_samples = data.nrows();
59 let n_features = data.ncols();
60
61 let mut n_output_features = 0;
63 if include_bias {
64 n_output_features += 1;
65 }
66
67 for d in 1..=degree {
69 let mut combinations = 1;
72 for i in 0..d {
73 combinations = combinations * (n_features + i) / (i + 1);
74 }
75 n_output_features += combinations;
76 }
77
78 let mut output = Array2::zeros((n_samples, n_output_features));
79 let mut col_idx = 0;
80
81 if include_bias {
83 output.column_mut(col_idx).fill(1.0);
84 }
85
86 for sample_idx in 0..n_samples {
88 let sample = data.row(sample_idx);
89 col_idx = if include_bias { 1 } else { 0 };
90
91 for &feature_val in sample.iter() {
93 output[[sample_idx, col_idx]] = feature_val;
94 col_idx += 1;
95 }
96
97 for deg in 2..=degree {
99 generate_polynomial_combinations(
100 &sample.to_owned(),
101 deg,
102 sample_idx,
103 &mut output,
104 &mut col_idx,
105 );
106 }
107 }
108
109 Ok(output)
110}
111
112#[allow(dead_code)]
114fn generate_polynomial_combinations(
115 features: &Array1<f64>,
116 degree: usize,
117 sample_idx: usize,
118 output: &mut Array2<f64>,
119 col_idx: &mut usize,
120) {
121 fn combinations_recursive(
122 features: &Array1<f64>,
123 degree: usize,
124 start_idx: usize,
125 current_product: f64,
126 sample_idx: usize,
127 output: &mut Array2<f64>,
128 col_idx: &mut usize,
129 ) {
130 if degree == 0 {
131 output[[sample_idx, *col_idx]] = current_product;
132 *col_idx += 1;
133 return;
134 }
135
136 for i in start_idx..features.len() {
137 combinations_recursive(
138 features,
139 degree - 1,
140 i, current_product * features[i],
142 sample_idx,
143 output,
144 col_idx,
145 );
146 }
147 }
148
149 combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
150}
151
152#[allow(dead_code)]
176pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
177 let n_samples = data.nrows();
178 let n_features = data.ncols();
179
180 if n_samples == 0 || n_features == 0 {
181 return Err(DatasetsError::InvalidFormat(
182 "Data cannot be empty for statistical feature extraction".to_string(),
183 ));
184 }
185
186 let n_stat_features = 9;
188 let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));
189
190 for sample_idx in 0..n_samples {
191 for feature_idx in 0..n_features {
192 let feature_values = data.column(feature_idx);
193
194 let mean = {
196 let val = feature_values.mean();
197 if val.is_nan() {
198 0.0
199 } else {
200 val
201 }
202 };
203 let std = feature_values.std(0.0);
204 let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
205 let max_val = feature_values
206 .iter()
207 .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
208
209 let mut sorted_values: Vec<f64> = feature_values.to_vec();
211 sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
212
213 let median = calculate_quantile(&sorted_values, 0.5);
214 let q25 = calculate_quantile(&sorted_values, 0.25);
215 let q75 = calculate_quantile(&sorted_values, 0.75);
216
217 let skewness = calculate_skewness(&feature_values, mean, std);
219 let kurtosis = calculate_kurtosis(&feature_values, mean, std);
220
221 let base_idx = feature_idx * n_stat_features;
223 stats[[sample_idx, base_idx]] = mean;
224 stats[[sample_idx, base_idx + 1]] = std;
225 stats[[sample_idx, base_idx + 2]] = min_val;
226 stats[[sample_idx, base_idx + 3]] = max_val;
227 stats[[sample_idx, base_idx + 4]] = median;
228 stats[[sample_idx, base_idx + 5]] = q25;
229 stats[[sample_idx, base_idx + 6]] = q75;
230 stats[[sample_idx, base_idx + 7]] = skewness;
231 stats[[sample_idx, base_idx + 8]] = kurtosis;
232 }
233 }
234
235 Ok(stats)
236}
237
238#[allow(dead_code)]
240fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
241 if sorted_data.is_empty() {
242 return 0.0;
243 }
244
245 let n = sorted_data.len();
246 let index = quantile * (n - 1) as f64;
247 let lower = index.floor() as usize;
248 let upper = index.ceil() as usize;
249
250 if lower == upper {
251 sorted_data[lower]
252 } else {
253 let weight = index - lower as f64;
254 sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
255 }
256}
257
258#[allow(dead_code)]
260fn calculate_skewness(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
261 if std <= 1e-10 {
262 return 0.0;
263 }
264
265 let n = data.len() as f64;
266 let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();
267
268 sum_cubed_deviations / n
269}
270
271#[allow(dead_code)]
273fn calculate_kurtosis(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
274 if std <= 1e-10 {
275 return 0.0;
276 }
277
278 let n = data.len() as f64;
279 let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();
280
281 (sum_fourth_deviations / n) - 3.0 }
283
284#[allow(dead_code)]
311pub fn create_binned_features(
312 data: &Array2<f64>,
313 n_bins: usize,
314 strategy: BinningStrategy,
315) -> Result<Array2<f64>> {
316 if n_bins < 2 {
317 return Err(DatasetsError::InvalidFormat(
318 "Number of _bins must be at least 2".to_string(),
319 ));
320 }
321
322 let n_samples = data.nrows();
323 let n_features = data.ncols();
324 let mut binned = Array2::zeros((n_samples, n_features));
325
326 for j in 0..n_features {
327 let column = data.column(j);
328 let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;
329
330 for i in 0..n_samples {
331 let value = column[i];
332 let bin_idx = find_bin_index(value, &bin_edges);
333 binned[[i, j]] = bin_idx as f64;
334 }
335 }
336
337 Ok(binned)
338}
339
340#[allow(dead_code)]
342fn calculate_bin_edges(
343 data: &scirs2_core::ndarray::ArrayView1<f64>,
344 n_bins: usize,
345 strategy: &BinningStrategy,
346) -> Result<Vec<f64>> {
347 match strategy {
348 BinningStrategy::Uniform => {
349 let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
350 let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
351
352 if (max_val - min_val).abs() <= 1e-10 {
353 return Ok(vec![min_val, min_val + 1e-10]);
354 }
355
356 let bin_width = (max_val - min_val) / n_bins as f64;
357 let mut edges = Vec::with_capacity(n_bins + 1);
358
359 for i in 0..=n_bins {
360 edges.push(min_val + i as f64 * bin_width);
361 }
362
363 Ok(edges)
364 }
365 BinningStrategy::Quantile => {
366 let mut sorted_data: Vec<f64> = data.to_vec();
367 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
368
369 let mut edges = Vec::with_capacity(n_bins + 1);
370 edges.push(sorted_data[0]);
371
372 for i in 1..n_bins {
373 let quantile = i as f64 / n_bins as f64;
374 let edge = calculate_quantile(&sorted_data, quantile);
375 edges.push(edge);
376 }
377
378 edges.push(sorted_data[sorted_data.len() - 1]);
379
380 Ok(edges)
381 }
382 }
383}
384
385#[allow(dead_code)]
387fn find_bin_index(_value: f64, binedges: &[f64]) -> usize {
388 for (i, &edge) in binedges.iter().enumerate().skip(1) {
389 if _value <= edge {
390 return i - 1;
391 }
392 }
393 binedges.len() - 2 }
395
396#[cfg(test)]
397mod tests {
398 use super::*;
399 use scirs2_core::ndarray::array;
400
401 #[test]
402 fn test_polynomial_features_degree_2() {
403 let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
404 let poly = polynomial_features(&data, 2, true).unwrap();
405
406 assert_eq!(poly.ncols(), 6);
408 assert_eq!(poly.nrows(), 2);
409
410 assert!((poly[[0, 0]] - 1.0).abs() < 1e-10); assert!((poly[[0, 1]] - 1.0).abs() < 1e-10); assert!((poly[[0, 2]] - 2.0).abs() < 1e-10); assert!((poly[[0, 3]] - 1.0).abs() < 1e-10); assert!((poly[[0, 4]] - 2.0).abs() < 1e-10); assert!((poly[[0, 5]] - 4.0).abs() < 1e-10); }
418
419 #[test]
420 fn test_polynomial_features_no_bias() {
421 let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();
422 let poly = polynomial_features(&data, 2, false).unwrap();
423
424 assert_eq!(poly.ncols(), 5);
426
427 assert!((poly[[0, 0]] - 2.0).abs() < 1e-10); assert!((poly[[0, 1]] - 3.0).abs() < 1e-10); assert!((poly[[0, 2]] - 4.0).abs() < 1e-10); assert!((poly[[0, 3]] - 6.0).abs() < 1e-10); assert!((poly[[0, 4]] - 9.0).abs() < 1e-10); }
434
435 #[test]
436 fn test_polynomial_features_invalid_degree() {
437 let data = Array2::from_shape_vec((1, 1), vec![1.0]).unwrap();
438 assert!(polynomial_features(&data, 0, true).is_err());
439 }
440
441 #[test]
442 fn test_statistical_features() {
443 let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
444 let stats = statistical_features(&data).unwrap();
445
446 assert_eq!(stats.ncols(), 9);
448 assert_eq!(stats.nrows(), 5);
449
450 for i in 0..stats.nrows() {
452 assert!((stats[[i, 0]] - 3.0).abs() < 1e-10); assert!((stats[[i, 2]] - 1.0).abs() < 1e-10); assert!((stats[[i, 3]] - 5.0).abs() < 1e-10); assert!((stats[[i, 4]] - 3.0).abs() < 1e-10); }
457 }
458
459 #[test]
460 fn test_statistical_features_empty_data() {
461 let data = Array2::zeros((0, 1));
462 assert!(statistical_features(&data).is_err());
463 }
464
465 #[test]
466 fn test_create_binned_features_uniform() {
467 let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
468 let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
469
470 assert_eq!(binned.nrows(), 5);
471 assert_eq!(binned.ncols(), 1);
472
473 for i in 0..binned.nrows() {
475 let bin_val = binned[[i, 0]] as usize;
476 assert!(bin_val < 3);
477 }
478 }
479
480 #[test]
481 fn test_create_binned_features_quantile() {
482 let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
483 let binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();
484
485 assert_eq!(binned.nrows(), 6);
486 assert_eq!(binned.ncols(), 1);
487
488 let mut bin_counts = vec![0; 3];
490 for i in 0..binned.nrows() {
491 let bin_val = binned[[i, 0]] as usize;
492 bin_counts[bin_val] += 1;
493 }
494
495 for &count in &bin_counts {
497 assert_eq!(count, 2);
498 }
499 }
500
501 #[test]
502 fn test_create_binned_features_invalid_bins() {
503 let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
504 assert!(create_binned_features(&data, 1, BinningStrategy::Uniform).is_err());
505 assert!(create_binned_features(&data, 0, BinningStrategy::Uniform).is_err());
506 }
507
508 #[test]
509 fn test_calculate_quantile() {
510 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
511
512 assert_eq!(calculate_quantile(&data, 0.0), 1.0);
513 assert_eq!(calculate_quantile(&data, 0.5), 3.0);
514 assert_eq!(calculate_quantile(&data, 1.0), 5.0);
515 assert_eq!(calculate_quantile(&data, 0.25), 2.0);
516 assert_eq!(calculate_quantile(&data, 0.75), 4.0);
517 }
518
519 #[test]
520 fn test_calculate_skewness() {
521 let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
522 let view = data.view();
523 let mean = view.mean();
524 let std = view.std(0.0);
525
526 let skewness = calculate_skewness(&view, mean, std);
527 assert!(skewness.abs() < 1e-10);
529 }
530
531 #[test]
532 fn test_calculate_kurtosis() {
533 let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
534 let view = data.view();
535 let mean = view.mean();
536 let std = view.std(0.0);
537
538 let kurtosis = calculate_kurtosis(&view, mean, std);
539 assert!(kurtosis < 0.0);
541 }
542
543 #[test]
544 fn test_feature_extraction_pipeline() {
545 let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
547 .unwrap();
548
549 let poly_data = polynomial_features(&data, 2, false).unwrap();
551
552 let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform).unwrap();
554
555 let stats_data = statistical_features(&data).unwrap();
557
558 assert_eq!(poly_data.ncols(), 5); assert_eq!(binned_data.ncols(), 5); assert_eq!(stats_data.ncols(), 18); assert_eq!(binned_data.nrows(), 4); assert_eq!(stats_data.nrows(), 4); }
565
566 #[test]
567 fn test_binning_strategies_comparison() {
568 let data =
570 Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0]).unwrap();
571
572 let uniform_binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
573 let quantile_binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();
574
575 let mut uniform_counts = [0; 3];
580 let mut quantile_counts = [0; 3];
581
582 for i in 0..data.nrows() {
583 uniform_counts[uniform_binned[[i, 0]] as usize] += 1;
584 quantile_counts[quantile_binned[[i, 0]] as usize] += 1;
585 }
586
587 let uniform_max = *uniform_counts.iter().max().unwrap();
589 let uniform_min = *uniform_counts.iter().min().unwrap();
590 let quantile_max = *quantile_counts.iter().max().unwrap();
591 let quantile_min = *quantile_counts.iter().min().unwrap();
592
593 assert!((quantile_max - quantile_min) <= (uniform_max - uniform_min));
595 }
596}