1use crate::error::{DatasetsError, Result};
9use scirs2_core::ndarray::{Array1, Array2};
10use statrs::statistics::Statistics;
11
/// Strategy used to place bin boundaries when discretizing a continuous feature.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BinningStrategy {
    /// Split the observed value range into bins of equal width.
    Uniform,
    /// Place bin boundaries at quantiles of the observed values.
    Quantile,
}
20
21#[allow(dead_code)]
47pub fn polynomial_features(
48 data: &Array2<f64>,
49 degree: usize,
50 include_bias: bool,
51) -> Result<Array2<f64>> {
52 if degree == 0 {
53 return Err(DatasetsError::InvalidFormat(
54 "Polynomial degree must be at least 1".to_string(),
55 ));
56 }
57
58 let n_samples = data.nrows();
59 let n_features = data.ncols();
60
61 let mut n_output_features = 0;
63 if include_bias {
64 n_output_features += 1;
65 }
66
67 for d in 1..=degree {
69 let mut combinations = 1;
72 for i in 0..d {
73 combinations = combinations * (n_features + i) / (i + 1);
74 }
75 n_output_features += combinations;
76 }
77
78 let mut output = Array2::zeros((n_samples, n_output_features));
79 let mut col_idx = 0;
80
81 if include_bias {
83 output.column_mut(col_idx).fill(1.0);
84 }
85
86 for sample_idx in 0..n_samples {
88 let sample = data.row(sample_idx);
89 col_idx = if include_bias { 1 } else { 0 };
90
91 for &feature_val in sample.iter() {
93 output[[sample_idx, col_idx]] = feature_val;
94 col_idx += 1;
95 }
96
97 for deg in 2..=degree {
99 generate_polynomial_combinations(
100 &sample.to_owned(),
101 deg,
102 sample_idx,
103 &mut output,
104 &mut col_idx,
105 );
106 }
107 }
108
109 Ok(output)
110}
111
112#[allow(dead_code)]
114fn generate_polynomial_combinations(
115 features: &Array1<f64>,
116 degree: usize,
117 sample_idx: usize,
118 output: &mut Array2<f64>,
119 col_idx: &mut usize,
120) {
121 fn combinations_recursive(
122 features: &Array1<f64>,
123 degree: usize,
124 start_idx: usize,
125 current_product: f64,
126 sample_idx: usize,
127 output: &mut Array2<f64>,
128 col_idx: &mut usize,
129 ) {
130 if degree == 0 {
131 output[[sample_idx, *col_idx]] = current_product;
132 *col_idx += 1;
133 return;
134 }
135
136 for i in start_idx..features.len() {
137 combinations_recursive(
138 features,
139 degree - 1,
140 i, current_product * features[i],
142 sample_idx,
143 output,
144 col_idx,
145 );
146 }
147 }
148
149 combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
150}
151
152#[allow(dead_code)]
176pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
177 let n_samples = data.nrows();
178 let n_features = data.ncols();
179
180 if n_samples == 0 || n_features == 0 {
181 return Err(DatasetsError::InvalidFormat(
182 "Data cannot be empty for statistical feature extraction".to_string(),
183 ));
184 }
185
186 let n_stat_features = 9;
188 let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));
189
190 for sample_idx in 0..n_samples {
191 for feature_idx in 0..n_features {
192 let feature_values = data.column(feature_idx);
193
194 let mean = {
196 let val = feature_values.mean();
197 if val.is_nan() {
198 0.0
199 } else {
200 val
201 }
202 };
203 let std = feature_values.std(0.0);
204 let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
205 let max_val = feature_values
206 .iter()
207 .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
208
209 let mut sorted_values: Vec<f64> = feature_values.to_vec();
211 sorted_values.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
212
213 let median = calculate_quantile(&sorted_values, 0.5);
214 let q25 = calculate_quantile(&sorted_values, 0.25);
215 let q75 = calculate_quantile(&sorted_values, 0.75);
216
217 let skewness = calculate_skewness(&feature_values, mean, std);
219 let kurtosis = calculate_kurtosis(&feature_values, mean, std);
220
221 let base_idx = feature_idx * n_stat_features;
223 stats[[sample_idx, base_idx]] = mean;
224 stats[[sample_idx, base_idx + 1]] = std;
225 stats[[sample_idx, base_idx + 2]] = min_val;
226 stats[[sample_idx, base_idx + 3]] = max_val;
227 stats[[sample_idx, base_idx + 4]] = median;
228 stats[[sample_idx, base_idx + 5]] = q25;
229 stats[[sample_idx, base_idx + 6]] = q75;
230 stats[[sample_idx, base_idx + 7]] = skewness;
231 stats[[sample_idx, base_idx + 8]] = kurtosis;
232 }
233 }
234
235 Ok(stats)
236}
237
/// Returns the `quantile`-th quantile of `sorted_data` using linear
/// interpolation between the two nearest ranks.
///
/// `sorted_data` must already be sorted in ascending order. `quantile` is
/// clamped to `[0.0, 1.0]`, so out-of-range requests yield the first or last
/// element instead of indexing past the end of the slice. An empty slice
/// yields `0.0`.
#[allow(dead_code)]
fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }

    let last = sorted_data.len() - 1;
    // Fractional rank in [0, last]. A NaN quantile stays NaN here and both
    // casts below turn it into rank 0.
    let position = quantile.clamp(0.0, 1.0) * last as f64;
    let lower = position.floor() as usize;
    let upper = position.ceil() as usize;

    if lower == upper {
        sorted_data[lower]
    } else {
        let weight = position - lower as f64;
        sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
    }
}
257
258#[allow(dead_code)]
260fn calculate_skewness(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
261 if std <= 1e-10 {
262 return 0.0;
263 }
264
265 let n = data.len() as f64;
266 let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();
267
268 sum_cubed_deviations / n
269}
270
271#[allow(dead_code)]
273fn calculate_kurtosis(data: &scirs2_core::ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
274 if std <= 1e-10 {
275 return 0.0;
276 }
277
278 let n = data.len() as f64;
279 let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();
280
281 (sum_fourth_deviations / n) - 3.0 }
283
284#[allow(dead_code)]
311pub fn create_binned_features(
312 data: &Array2<f64>,
313 n_bins: usize,
314 strategy: BinningStrategy,
315) -> Result<Array2<f64>> {
316 if n_bins < 2 {
317 return Err(DatasetsError::InvalidFormat(
318 "Number of _bins must be at least 2".to_string(),
319 ));
320 }
321
322 let n_samples = data.nrows();
323 let n_features = data.ncols();
324 let mut binned = Array2::zeros((n_samples, n_features));
325
326 for j in 0..n_features {
327 let column = data.column(j);
328 let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;
329
330 for i in 0..n_samples {
331 let value = column[i];
332 let bin_idx = find_bin_index(value, &bin_edges);
333 binned[[i, j]] = bin_idx as f64;
334 }
335 }
336
337 Ok(binned)
338}
339
340#[allow(dead_code)]
342fn calculate_bin_edges(
343 data: &scirs2_core::ndarray::ArrayView1<f64>,
344 n_bins: usize,
345 strategy: &BinningStrategy,
346) -> Result<Vec<f64>> {
347 match strategy {
348 BinningStrategy::Uniform => {
349 let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
350 let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
351
352 if (max_val - min_val).abs() <= 1e-10 {
353 return Ok(vec![min_val, min_val + 1e-10]);
354 }
355
356 let bin_width = (max_val - min_val) / n_bins as f64;
357 let mut edges = Vec::with_capacity(n_bins + 1);
358
359 for i in 0..=n_bins {
360 edges.push(min_val + i as f64 * bin_width);
361 }
362
363 Ok(edges)
364 }
365 BinningStrategy::Quantile => {
366 let mut sorted_data: Vec<f64> = data.to_vec();
367 sorted_data.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
368
369 let mut edges = Vec::with_capacity(n_bins + 1);
370 edges.push(sorted_data[0]);
371
372 for i in 1..n_bins {
373 let quantile = i as f64 / n_bins as f64;
374 let edge = calculate_quantile(&sorted_data, quantile);
375 edges.push(edge);
376 }
377
378 edges.push(sorted_data[sorted_data.len() - 1]);
379
380 Ok(edges)
381 }
382 }
383}
384
/// Returns the zero-based index of the bin that `value` falls into.
///
/// `bin_edges` lists the bin boundaries in ascending order; bin `k` is
/// bounded above by `bin_edges[k + 1]`, and a value equal to a boundary
/// belongs to the lower bin. Values below the first edge map to bin 0.
/// Values above the last edge, and NaN, map to the last bin. With fewer than
/// two edges there is no real bin and `0` is returned.
#[allow(dead_code)]
fn find_bin_index(value: f64, bin_edges: &[f64]) -> usize {
    bin_edges
        .iter()
        // The first edge is only a lower bound; upper bounds start at index 1,
        // so the position within the skipped iterator is already the bin index.
        .skip(1)
        .position(|&upper_edge| value <= upper_edge)
        // `saturating_sub` keeps degenerate edge lists from underflowing.
        .unwrap_or_else(|| bin_edges.len().saturating_sub(2))
}
395
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Tolerance used for floating-point comparisons in these tests.
    const TOLERANCE: f64 = 1e-10;

    /// Panics unless `actual` is within `TOLERANCE` of `expected`.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < TOLERANCE);
    }

    #[test]
    fn test_polynomial_features_degree_2() {
        let data =
            Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).expect("Operation failed");
        let poly = polynomial_features(&data, 2, true).expect("Operation failed");

        assert_eq!(poly.ncols(), 6);
        assert_eq!(poly.nrows(), 2);

        // First sample is (1, 2): bias, x0, x1, x0^2, x0*x1, x1^2.
        let expected_first_row = [1.0, 1.0, 2.0, 1.0, 2.0, 4.0];
        for (col, &expected) in expected_first_row.iter().enumerate() {
            assert_close(poly[[0, col]], expected);
        }
    }

    #[test]
    fn test_polynomial_features_no_bias() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).expect("Operation failed");
        let poly = polynomial_features(&data, 2, false).expect("Operation failed");

        assert_eq!(poly.ncols(), 5);

        // Sample is (2, 3): x0, x1, x0^2, x0*x1, x1^2.
        let expected_row = [2.0, 3.0, 4.0, 6.0, 9.0];
        for (col, &expected) in expected_row.iter().enumerate() {
            assert_close(poly[[0, col]], expected);
        }
    }

    #[test]
    fn test_polynomial_features_invalid_degree() {
        let data = Array2::from_shape_vec((1, 1), vec![1.0]).expect("Operation failed");
        assert!(polynomial_features(&data, 0, true).is_err());
    }

    #[test]
    fn test_statistical_features() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0])
            .expect("Operation failed");
        let stats = statistical_features(&data).expect("Operation failed");

        assert_eq!(stats.ncols(), 9);
        assert_eq!(stats.nrows(), 5);

        // (column, expected): mean, min, max, median — identical in every row.
        let checks = [(0, 3.0), (2, 1.0), (3, 5.0), (4, 3.0)];
        for row in 0..stats.nrows() {
            for &(col, expected) in checks.iter() {
                assert_close(stats[[row, col]], expected);
            }
        }
    }

    #[test]
    fn test_statistical_features_empty_data() {
        let data = Array2::zeros((0, 1));
        assert!(statistical_features(&data).is_err());
    }

    #[test]
    fn test_create_binned_features_uniform() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0])
            .expect("Operation failed");
        let binned =
            create_binned_features(&data, 3, BinningStrategy::Uniform).expect("Operation failed");

        assert_eq!(binned.nrows(), 5);
        assert_eq!(binned.ncols(), 1);

        // Every sample must land in one of the three bins.
        for row in 0..binned.nrows() {
            let bin = binned[[row, 0]] as usize;
            assert!(bin < 3);
        }
    }

    #[test]
    fn test_create_binned_features_quantile() {
        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
            .expect("Operation failed");
        let binned =
            create_binned_features(&data, 3, BinningStrategy::Quantile).expect("Operation failed");

        assert_eq!(binned.nrows(), 6);
        assert_eq!(binned.ncols(), 1);

        let mut bin_counts = vec![0; 3];
        for row in 0..binned.nrows() {
            bin_counts[binned[[row, 0]] as usize] += 1;
        }

        // Six evenly spaced values over three quantile bins: two per bin.
        for &count in &bin_counts {
            assert_eq!(count, 2);
        }
    }

    #[test]
    fn test_create_binned_features_invalid_bins() {
        let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).expect("Operation failed");
        for n_bins in [1, 0] {
            assert!(create_binned_features(&data, n_bins, BinningStrategy::Uniform).is_err());
        }
    }

    #[test]
    fn test_calculate_quantile() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        let cases = [(0.0, 1.0), (0.5, 3.0), (1.0, 5.0), (0.25, 2.0), (0.75, 4.0)];
        for &(quantile, expected) in cases.iter() {
            assert_eq!(calculate_quantile(&data, quantile), expected);
        }
    }

    #[test]
    fn test_calculate_skewness() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        // Symmetric data has no skew.
        let skewness = calculate_skewness(&view, mean, std);
        assert!(skewness.abs() < 1e-10);
    }

    #[test]
    fn test_calculate_kurtosis() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean();
        let std = view.std(0.0);

        // Evenly spaced data is flatter than a normal distribution.
        let kurtosis = calculate_kurtosis(&view, mean, std);
        assert!(kurtosis < 0.0);
    }

    #[test]
    fn test_feature_extraction_pipeline() {
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
            .expect("Failed to create features");

        let poly_data = polynomial_features(&data, 2, false).expect("Operation failed");

        let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform)
            .expect("Operation failed");

        let stats_data = statistical_features(&data).expect("Operation failed");

        // 2 linear + 3 quadratic terms; binning keeps the shape; 9 stats per feature.
        assert_eq!(poly_data.ncols(), 5);
        assert_eq!(binned_data.ncols(), 5);
        assert_eq!(stats_data.ncols(), 18);
        assert_eq!(binned_data.nrows(), 4);
        assert_eq!(stats_data.nrows(), 4);
    }

    #[test]
    fn test_binning_strategies_comparison() {
        // One large outlier stretches the uniform bins but not the quantile bins.
        let data = Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0])
            .expect("Operation failed");

        let uniform_binned =
            create_binned_features(&data, 3, BinningStrategy::Uniform).expect("Operation failed");
        let quantile_binned =
            create_binned_features(&data, 3, BinningStrategy::Quantile).expect("Operation failed");

        let mut uniform_counts = [0; 3];
        let mut quantile_counts = [0; 3];

        for row in 0..data.nrows() {
            let uniform_bin = uniform_binned[[row, 0]] as usize;
            let quantile_bin = quantile_binned[[row, 0]] as usize;
            uniform_counts[uniform_bin] += 1;
            quantile_counts[quantile_bin] += 1;
        }

        let uniform_max = *uniform_counts.iter().max().expect("Operation failed");
        let uniform_min = *uniform_counts.iter().min().expect("Operation failed");
        let quantile_max = *quantile_counts.iter().max().expect("Operation failed");
        let quantile_min = *quantile_counts.iter().min().expect("Operation failed");

        let uniform_spread = uniform_max - uniform_min;
        let quantile_spread = quantile_max - quantile_min;
        assert!(quantile_spread <= uniform_spread);
    }
}