use crate::error::{DatasetsError, Result};
use ndarray::{Array1, Array2};

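/// Strategy for computing bin edges in [`create_binned_features`]:
/// `Uniform` uses equal-width bins over the observed value range, while
/// `Quantile` places edges at quantiles so bins hold roughly equal counts.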
#[derive(Debug, Clone, Copy)]
pub enum BinningStrategy {
    Uniform,
    Quantile,
}

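/// Expands `data` into polynomial features of total degree 1..=`degree`.
///
/// The output contains an optional leading bias column of ones, the original
/// features, and every product of features (with repetition) up to `degree`,
/// so for `n` input features the column count is
/// `bias + sum_{d=1..=degree} C(n + d - 1, d)`. Errors if `degree == 0`.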
pub fn polynomial_features(
    data: &Array2<f64>,
    degree: usize,
    include_bias: bool,
) -> Result<Array2<f64>> {
    if degree == 0 {
        return Err(DatasetsError::InvalidFormat(
            "Polynomial degree must be at least 1".to_string(),
        ));
    }

    let n_samples = data.nrows();
    let n_features = data.ncols();

    // Count output columns: optional bias plus, for each degree d, the number of
    // monomials of total degree d, i.e. C(n_features + d - 1, d) (combinations
    // with repetition), computed incrementally so every division is exact.
    let mut n_output_features = 0;
    if include_bias {
        n_output_features += 1;
    }

    for d in 1..=degree {
        let mut combinations = 1;
        for i in 0..d {
            combinations = combinations * (n_features + i) / (i + 1);
        }
        n_output_features += combinations;
    }

    let mut output = Array2::zeros((n_samples, n_output_features));
    let mut col_idx = 0;

    if include_bias {
        output.column_mut(col_idx).fill(1.0);
    }

    for sample_idx in 0..n_samples {
        let sample = data.row(sample_idx);
        col_idx = if include_bias { 1 } else { 0 };

        // Degree-1 terms: copy the original features.
        for &feature_val in sample.iter() {
            output[[sample_idx, col_idx]] = feature_val;
            col_idx += 1;
        }

        // Higher-degree terms: all products of features with repetition.
        for deg in 2..=degree {
            generate_polynomial_combinations(
                &sample.to_owned(),
                deg,
                sample_idx,
                &mut output,
                &mut col_idx,
            );
        }
    }

    Ok(output)
}

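/// Writes every degree-`degree` product of `features` (combinations with
/// repetition, in non-decreasing index order) into row `sample_idx` of
/// `output`, starting at `*col_idx` and advancing it past the written columns.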
fn generate_polynomial_combinations(
    features: &Array1<f64>,
    degree: usize,
    sample_idx: usize,
    output: &mut Array2<f64>,
    col_idx: &mut usize,
) {
    fn combinations_recursive(
        features: &Array1<f64>,
        degree: usize,
        start_idx: usize,
        current_product: f64,
        sample_idx: usize,
        output: &mut Array2<f64>,
        col_idx: &mut usize,
    ) {
        if degree == 0 {
            output[[sample_idx, *col_idx]] = current_product;
            *col_idx += 1;
            return;
        }

        for i in start_idx..features.len() {
            combinations_recursive(
                features,
                degree - 1,
                i,
                current_product * features[i],
                sample_idx,
                output,
                col_idx,
            );
        }
    }

    combinations_recursive(features, degree, 0, 1.0, sample_idx, output, col_idx);
}

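/// Computes nine summary statistics (mean, standard deviation, min, max,
/// median, 25th/75th percentiles, skewness, kurtosis) for every column of
/// `data`, producing `n_features * 9` output columns. The statistics are
/// computed over whole columns, so every row of the output carries the same
/// values. Errors if `data` has no rows or no columns.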
pub fn statistical_features(data: &Array2<f64>) -> Result<Array2<f64>> {
    let n_samples = data.nrows();
    let n_features = data.ncols();

    if n_samples == 0 || n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "Data cannot be empty for statistical feature extraction".to_string(),
        ));
    }

    // Nine summary statistics per input feature:
    // mean, std, min, max, median, q25, q75, skewness, kurtosis.
    let n_stat_features = 9;
    let mut stats = Array2::zeros((n_samples, n_features * n_stat_features));

    for sample_idx in 0..n_samples {
        for feature_idx in 0..n_features {
            let feature_values = data.column(feature_idx);

            let mean = feature_values.mean().unwrap_or(0.0);
            let std = feature_values.std(0.0);
            let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max_val = feature_values
                .iter()
                .fold(f64::NEG_INFINITY, |a, &b| a.max(b));

            let mut sorted_values: Vec<f64> = feature_values.to_vec();
            sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());

            let median = calculate_quantile(&sorted_values, 0.5);
            let q25 = calculate_quantile(&sorted_values, 0.25);
            let q75 = calculate_quantile(&sorted_values, 0.75);

            let skewness = calculate_skewness(&feature_values, mean, std);
            let kurtosis = calculate_kurtosis(&feature_values, mean, std);

            let base_idx = feature_idx * n_stat_features;
            stats[[sample_idx, base_idx]] = mean;
            stats[[sample_idx, base_idx + 1]] = std;
            stats[[sample_idx, base_idx + 2]] = min_val;
            stats[[sample_idx, base_idx + 3]] = max_val;
            stats[[sample_idx, base_idx + 4]] = median;
            stats[[sample_idx, base_idx + 5]] = q25;
            stats[[sample_idx, base_idx + 6]] = q75;
            stats[[sample_idx, base_idx + 7]] = skewness;
            stats[[sample_idx, base_idx + 8]] = kurtosis;
        }
    }

    Ok(stats)
}

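/// Returns the `quantile` (in `[0, 1]`) of `sorted_data` using linear
/// interpolation between the two nearest order statistics; returns 0.0 for
/// empty input.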
fn calculate_quantile(sorted_data: &[f64], quantile: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }

    let n = sorted_data.len();
    let index = quantile * (n - 1) as f64;
    let lower = index.floor() as usize;
    let upper = index.ceil() as usize;

    if lower == upper {
        sorted_data[lower]
    } else {
        let weight = index - lower as f64;
        sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
    }
}

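/// Population skewness: the mean of the cubed standardized deviations.
/// Returns 0.0 when `std` is (near) zero to avoid division by zero.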
fn calculate_skewness(data: &ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
    if std <= 1e-10 {
        return 0.0;
    }

    let n = data.len() as f64;
    let sum_cubed_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(3)).sum();

    sum_cubed_deviations / n
}

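/// Excess kurtosis: the mean of the fourth-power standardized deviations
/// minus 3, so a normal distribution scores approximately 0. Returns 0.0 when
/// `std` is (near) zero.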
fn calculate_kurtosis(data: &ndarray::ArrayView1<f64>, mean: f64, std: f64) -> f64 {
    if std <= 1e-10 {
        return 0.0;
    }

    let n = data.len() as f64;
    let sum_fourth_deviations: f64 = data.iter().map(|&x| ((x - mean) / std).powi(4)).sum();

    (sum_fourth_deviations / n) - 3.0
}

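/// Discretizes each feature column of `data` into `n_bins` bins using the
/// given [`BinningStrategy`], returning the zero-based bin index of every
/// value as an `f64`. Errors if `n_bins < 2`.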
pub fn create_binned_features(
    data: &Array2<f64>,
    n_bins: usize,
    strategy: BinningStrategy,
) -> Result<Array2<f64>> {
    if n_bins < 2 {
        return Err(DatasetsError::InvalidFormat(
            "Number of bins must be at least 2".to_string(),
        ));
    }

    let n_samples = data.nrows();
    let n_features = data.ncols();
    let mut binned = Array2::zeros((n_samples, n_features));

    for j in 0..n_features {
        let column = data.column(j);
        let bin_edges = calculate_bin_edges(&column, n_bins, &strategy)?;

        for i in 0..n_samples {
            let value = column[i];
            let bin_idx = find_bin_index(value, &bin_edges);
            binned[[i, j]] = bin_idx as f64;
        }
    }

    Ok(binned)
}

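/// Computes `n_bins + 1` bin edges for a single column: equally spaced edges
/// over `[min, max]` for `Uniform`, or edges at the `i / n_bins` quantiles for
/// `Quantile`.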
fn calculate_bin_edges(
    data: &ndarray::ArrayView1<f64>,
    n_bins: usize,
    strategy: &BinningStrategy,
) -> Result<Vec<f64>> {
    match strategy {
        BinningStrategy::Uniform => {
            let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

            // Degenerate case: a (near-)constant column collapses to a single tiny bin.
            if (max_val - min_val).abs() <= 1e-10 {
                return Ok(vec![min_val, min_val + 1e-10]);
            }

            let bin_width = (max_val - min_val) / n_bins as f64;
            let mut edges = Vec::with_capacity(n_bins + 1);

            for i in 0..=n_bins {
                edges.push(min_val + i as f64 * bin_width);
            }

            Ok(edges)
        }
        BinningStrategy::Quantile => {
            let mut sorted_data: Vec<f64> = data.to_vec();
            sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());

            let mut edges = Vec::with_capacity(n_bins + 1);
            edges.push(sorted_data[0]);

            for i in 1..n_bins {
                let quantile = i as f64 / n_bins as f64;
                let edge = calculate_quantile(&sorted_data, quantile);
                edges.push(edge);
            }

            edges.push(sorted_data[sorted_data.len() - 1]);

            Ok(edges)
        }
    }
}

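/// Maps `value` to a bin index given ordered `bin_edges`: the first bin whose
/// upper edge is at least `value`, with values above the last edge clamped
/// into the final bin.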
fn find_bin_index(value: f64, bin_edges: &[f64]) -> usize {
    for (i, &edge) in bin_edges.iter().enumerate().skip(1) {
        if value <= edge {
            return i - 1;
        }
    }
    bin_edges.len() - 2
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    #[test]
    fn test_polynomial_features_degree_2() {
        let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let poly = polynomial_features(&data, 2, true).unwrap();

        assert_eq!(poly.ncols(), 6);
        assert_eq!(poly.nrows(), 2);

        // Row 0 of [1, 2]: bias, x1, x2, x1^2, x1*x2, x2^2.
        assert!((poly[[0, 0]] - 1.0).abs() < 1e-10);
        assert!((poly[[0, 1]] - 1.0).abs() < 1e-10);
        assert!((poly[[0, 2]] - 2.0).abs() < 1e-10);
        assert!((poly[[0, 3]] - 1.0).abs() < 1e-10);
        assert!((poly[[0, 4]] - 2.0).abs() < 1e-10);
        assert!((poly[[0, 5]] - 4.0).abs() < 1e-10);
    }

    #[test]
    fn test_polynomial_features_no_bias() {
        let data = Array2::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();
        let poly = polynomial_features(&data, 2, false).unwrap();

        assert_eq!(poly.ncols(), 5);

        // x1, x2, x1^2, x1*x2, x2^2 for [2, 3].
        assert!((poly[[0, 0]] - 2.0).abs() < 1e-10);
        assert!((poly[[0, 1]] - 3.0).abs() < 1e-10);
        assert!((poly[[0, 2]] - 4.0).abs() < 1e-10);
        assert!((poly[[0, 3]] - 6.0).abs() < 1e-10);
        assert!((poly[[0, 4]] - 9.0).abs() < 1e-10);
    }

    #[test]
    fn test_polynomial_features_invalid_degree() {
        let data = Array2::from_shape_vec((1, 1), vec![1.0]).unwrap();
        assert!(polynomial_features(&data, 0, true).is_err());
    }
    #[test]
    fn test_statistical_features() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let stats = statistical_features(&data).unwrap();

        assert_eq!(stats.ncols(), 9);
        assert_eq!(stats.nrows(), 5);

        // Column statistics repeat on every row: mean 3, min 1, max 5, median 3.
        for i in 0..stats.nrows() {
            assert!((stats[[i, 0]] - 3.0).abs() < 1e-10);
            assert!((stats[[i, 2]] - 1.0).abs() < 1e-10);
            assert!((stats[[i, 3]] - 5.0).abs() < 1e-10);
            assert!((stats[[i, 4]] - 3.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_statistical_features_empty_data() {
        let data = Array2::zeros((0, 1));
        assert!(statistical_features(&data).is_err());
    }

    #[test]
    fn test_create_binned_features_uniform() {
        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();

        assert_eq!(binned.nrows(), 5);
        assert_eq!(binned.ncols(), 1);

        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            assert!(bin_val < 3);
        }
    }

    #[test]
    fn test_create_binned_features_quantile() {
        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        assert_eq!(binned.nrows(), 6);
        assert_eq!(binned.ncols(), 1);

        let mut bin_counts = vec![0; 3];
        for i in 0..binned.nrows() {
            let bin_val = binned[[i, 0]] as usize;
            bin_counts[bin_val] += 1;
        }

        for &count in &bin_counts {
            assert_eq!(count, 2);
        }
    }

    #[test]
    fn test_create_binned_features_invalid_bins() {
        let data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        assert!(create_binned_features(&data, 1, BinningStrategy::Uniform).is_err());
        assert!(create_binned_features(&data, 0, BinningStrategy::Uniform).is_err());
    }

    #[test]
    fn test_calculate_quantile() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        assert_eq!(calculate_quantile(&data, 0.0), 1.0);
        assert_eq!(calculate_quantile(&data, 0.5), 3.0);
        assert_eq!(calculate_quantile(&data, 1.0), 5.0);
        assert_eq!(calculate_quantile(&data, 0.25), 2.0);
        assert_eq!(calculate_quantile(&data, 0.75), 4.0);
    }

    #[test]
    fn test_calculate_skewness() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean().unwrap();
        let std = view.std(0.0);

        // Symmetric data should have zero skewness.
        let skewness = calculate_skewness(&view, mean, std);
        assert!(skewness.abs() < 1e-10);
    }

    #[test]
    fn test_calculate_kurtosis() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();
        let mean = view.mean().unwrap();
        let std = view.std(0.0);

        // Evenly spread data is platykurtic, so excess kurtosis is negative.
        let kurtosis = calculate_kurtosis(&view, mean, std);
        assert!(kurtosis < 0.0);
    }

    #[test]
    fn test_feature_extraction_pipeline() {
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
            .unwrap();

        let poly_data = polynomial_features(&data, 2, false).unwrap();

        let binned_data = create_binned_features(&poly_data, 2, BinningStrategy::Uniform).unwrap();

        let stats_data = statistical_features(&data).unwrap();

        // 2 features expand to 5 polynomial columns; stats give 2 * 9 = 18 columns.
        assert_eq!(poly_data.ncols(), 5);
        assert_eq!(binned_data.ncols(), 5);
        assert_eq!(stats_data.ncols(), 18);
        assert_eq!(binned_data.nrows(), 4);
        assert_eq!(stats_data.nrows(), 4);
    }

    #[test]
    fn test_binning_strategies_comparison() {
        let data =
            Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0]).unwrap();

        let uniform_binned = create_binned_features(&data, 3, BinningStrategy::Uniform).unwrap();
        let quantile_binned = create_binned_features(&data, 3, BinningStrategy::Quantile).unwrap();

        let mut uniform_counts = [0; 3];
        let mut quantile_counts = [0; 3];

        for i in 0..data.nrows() {
            uniform_counts[uniform_binned[[i, 0]] as usize] += 1;
            quantile_counts[quantile_binned[[i, 0]] as usize] += 1;
        }

        // With an outlier present, quantile bins stay more balanced than uniform bins.
        let uniform_max = *uniform_counts.iter().max().unwrap();
        let uniform_min = *uniform_counts.iter().min().unwrap();
        let quantile_max = *quantile_counts.iter().max().unwrap();
        let quantile_min = *quantile_counts.iter().min().unwrap();

        assert!((quantile_max - quantile_min) <= (uniform_max - uniform_min));
    }
}