//! Automated feature engineering: generates candidate feature transformations,
//! scores them, and selects a subset for downstream AutoML pipelines.

use crate::{automl_algorithm_selection::DatasetCharacteristics, scoring::TaskType};
use scirs2_core::ndarray::{concatenate, s, Array1, Array2, ArrayView1, Axis};
use scirs2_core::random::rngs::StdRng;
use scirs2_core::random::{Rng, SeedableRng};
use scirs2_core::SliceRandomExt;
use sklears_core::error::Result;
use std::collections::HashMap;
use std::fmt;

/// Feature transformations that the automated engineer can generate.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FeatureTransformationType {
    /// Raise a feature to the given power (degree 2 also adds pairwise cross terms).
    Polynomial { degree: usize },
    /// Signed natural logarithm.
    Logarithmic,
    /// Signed square root.
    SquareRoot,
    /// Exponential of the clipped value.
    Exponential,
    /// 1/x with a guard against division by zero.
    Reciprocal,
    /// Sine of the value.
    Sine,
    /// Cosine of the value.
    Cosine,
    /// Absolute value.
    Absolute,
    /// Sign (-1, 0, or 1).
    Sign,
    /// Equal-width binning into `n_bins` bins.
    Binning { n_bins: usize },
    /// Pairwise products of features.
    Interaction,
    /// Pairwise ratios of features.
    Ratio,
    /// Pairwise differences of features.
    Difference,
    /// Rolling mean and standard deviation over a window.
    RollingStatistics { window: usize },
    /// Lagged copy of the feature.
    Lag { lag: usize },
}

impl fmt::Display for FeatureTransformationType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            FeatureTransformationType::Polynomial { degree } => write!(f, "Polynomial({})", degree),
            FeatureTransformationType::Logarithmic => write!(f, "Logarithmic"),
            FeatureTransformationType::SquareRoot => write!(f, "SquareRoot"),
            FeatureTransformationType::Exponential => write!(f, "Exponential"),
            FeatureTransformationType::Reciprocal => write!(f, "Reciprocal"),
            FeatureTransformationType::Sine => write!(f, "Sine"),
            FeatureTransformationType::Cosine => write!(f, "Cosine"),
            FeatureTransformationType::Absolute => write!(f, "Absolute"),
            FeatureTransformationType::Sign => write!(f, "Sign"),
            FeatureTransformationType::Binning { n_bins } => write!(f, "Binning({})", n_bins),
            FeatureTransformationType::Interaction => write!(f, "Interaction"),
            FeatureTransformationType::Ratio => write!(f, "Ratio"),
            FeatureTransformationType::Difference => write!(f, "Difference"),
            FeatureTransformationType::RollingStatistics { window } => {
                write!(f, "RollingStats({})", window)
            }
            FeatureTransformationType::Lag { lag } => write!(f, "Lag({})", lag),
        }
    }
}

/// How aggressively new features are generated.
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureEngineeringStrategy {
    /// Only a small set of safe transformations.
    Conservative,
    /// A moderate mix of transformations (default).
    Balanced,
    /// Every available transformation, including trigonometric and time-series ones.
    Aggressive,
    /// A user-provided list of transformations.
    Custom(Vec<FeatureTransformationType>),
}

/// Strategy used to select features after generation.
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureSelectionMethod {
    SelectKBest { k: usize },
    SelectPercentile { percentile: f64 },
    RecursiveFeatureElimination { step: usize },
    L1Selection { alpha: f64 },
    VarianceThreshold { threshold: f64 },
    CorrelationThreshold { threshold: f64 },
    MutualInformation { k: usize },
    TreeImportance { threshold: f64 },
}

/// A feature produced (or carried over) by the feature engineer.
#[derive(Debug, Clone)]
pub struct GeneratedFeature {
    pub name: String,
    pub transformation: FeatureTransformationType,
    pub source_features: Vec<usize>,
    pub importance_score: f64,
    pub is_selected: bool,
    pub statistics: FeatureStatistics,
}

/// Descriptive statistics for a single feature column.
#[derive(Debug, Clone)]
pub struct FeatureStatistics {
    pub mean: f64,
    pub std: f64,
    pub min: f64,
    pub max: f64,
    pub n_unique: usize,
    pub missing_ratio: f64,
    pub skewness: f64,
    pub kurtosis: f64,
}

/// Configuration for automated feature engineering.
#[derive(Debug, Clone)]
pub struct AutoFeatureEngineering {
    pub strategy: FeatureEngineeringStrategy,
    pub selection_method: FeatureSelectionMethod,
    pub max_features: usize,
    pub max_selected_features: usize,
    pub cv_folds: usize,
    pub task_type: TaskType,
    pub random_seed: Option<u64>,
    pub enable_polynomial: bool,
    pub enable_math_transforms: bool,
    pub enable_interactions: bool,
    pub enable_time_series: bool,
    pub min_correlation_threshold: f64,
    pub max_correlation_threshold: f64,
}

impl Default for AutoFeatureEngineering {
    fn default() -> Self {
        Self {
            strategy: FeatureEngineeringStrategy::Balanced,
            selection_method: FeatureSelectionMethod::SelectKBest { k: 100 },
            max_features: 1000,
            max_selected_features: 100,
            cv_folds: 5,
            task_type: TaskType::Classification,
            random_seed: None,
            enable_polynomial: true,
            enable_math_transforms: true,
            enable_interactions: true,
            enable_time_series: false,
            min_correlation_threshold: 0.05,
            max_correlation_threshold: 0.95,
        }
    }
}

/// Summary of a feature engineering run.
#[derive(Debug, Clone)]
pub struct FeatureEngineeringResult {
    pub original_feature_count: usize,
    pub generated_feature_count: usize,
    pub selected_feature_count: usize,
    pub generated_features: Vec<GeneratedFeature>,
    pub selected_indices: Vec<usize>,
    pub feature_importances: Vec<f64>,
    pub transformation_info: TransformationInfo,
    pub performance_improvement: f64,
    pub processing_time: f64,
}

/// Everything needed to reproduce the fitted transformations on new data.
#[derive(Debug, Clone)]
pub struct TransformationInfo {
    pub transformations: Vec<(FeatureTransformationType, Vec<usize>)>,
    pub selected_indices: Vec<usize>,
    /// Per-feature `(mean, std)` used for scaling.
    pub scaling_params: HashMap<usize, (f64, f64)>,
    /// Per-feature bin boundaries for binning transforms.
    pub binning_boundaries: HashMap<usize, Vec<f64>>,
}

impl fmt::Display for FeatureEngineeringResult {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "Automated Feature Engineering Results")?;
        writeln!(f, "=====================================")?;
        writeln!(f, "Original features: {}", self.original_feature_count)?;
        writeln!(f, "Generated features: {}", self.generated_feature_count)?;
        writeln!(f, "Selected features: {}", self.selected_feature_count)?;
        writeln!(
            f,
            "Performance improvement: {:.4}",
            self.performance_improvement
        )?;
        writeln!(f, "Processing time: {:.2}s", self.processing_time)?;
        writeln!(f)?;
        writeln!(f, "Top 10 Generated Features:")?;

        let mut top_features: Vec<_> = self
            .generated_features
            .iter()
            .filter(|f| f.is_selected)
            .collect();
        top_features.sort_by(|a, b| b.importance_score.partial_cmp(&a.importance_score).unwrap());

        for (i, feature) in top_features.iter().take(10).enumerate() {
            writeln!(
                f,
                "{}. {} ({}): {:.4}",
                i + 1,
                feature.name,
                feature.transformation,
                feature.importance_score
            )?;
        }
        Ok(())
    }
}

/// Automated feature engineer: generates candidate features, scores them, and
/// selects a subset according to the configured strategy.
pub struct AutoFeatureEngineer {
    config: AutoFeatureEngineering,
    rng: StdRng,
}

impl Default for AutoFeatureEngineer {
    fn default() -> Self {
        Self::new(AutoFeatureEngineering::default())
    }
}

impl AutoFeatureEngineer {
    /// Create an engineer from a configuration, seeding the RNG if requested.
    pub fn new(config: AutoFeatureEngineering) -> Self {
        let rng = match config.random_seed {
            Some(seed) => StdRng::seed_from_u64(seed),
            None => StdRng::from_rng(&mut scirs2_core::random::thread_rng()),
        };

        Self { config, rng }
    }

    /// Run the full pipeline: analyze the dataset, generate transformations,
    /// compute statistics, select features, and report the results.
    pub fn engineer_features(
        &mut self,
        X: &Array2<f64>,
        y: &Array1<f64>,
    ) -> Result<FeatureEngineeringResult> {
        let start_time = std::time::Instant::now();
        let original_feature_count = X.ncols();

        // Characterize the dataset to guide transformation choices.
        let dataset_chars = self.analyze_dataset_for_features(X, y);

        // Pick the transformations allowed by the configured strategy.
        let transformations = self.generate_transformations(&dataset_chars);

        // Apply them, producing an enhanced feature matrix plus per-feature metadata.
        let (enhanced_X, generated_features) = self.apply_transformations(X, &transformations)?;

        let features_with_stats =
            self.calculate_feature_statistics(generated_features, &enhanced_X, y);

        let (selected_features, selected_indices) =
            self.select_features(&enhanced_X, y, features_with_stats)?;

        let performance_improvement =
            self.estimate_performance_improvement(X, &enhanced_X, y, &selected_indices)?;

        let transformation_info =
            self.create_transformation_info(&transformations, &selected_indices, &enhanced_X);

        let processing_time = start_time.elapsed().as_secs_f64();

        Ok(FeatureEngineeringResult {
            original_feature_count,
            generated_feature_count: enhanced_X.ncols(),
            selected_feature_count: selected_indices.len(),
            generated_features: selected_features,
            selected_indices: selected_indices.clone(),
            // Placeholder importances; per-feature scores live on `generated_features`.
            feature_importances: vec![0.0; selected_indices.len()],
            transformation_info,
            performance_improvement,
            processing_time,
        })
    }

    /// Re-apply fitted transformations to new data and return only the
    /// selected feature columns.
    pub fn transform(
        &self,
        X: &Array2<f64>,
        transformation_info: &TransformationInfo,
    ) -> Result<Array2<f64>> {
        let mut transformed_X = X.clone();

        // Regenerate the engineered columns.
        for (transformation, source_indices) in &transformation_info.transformations {
            let new_features =
                self.apply_single_transformation(&transformed_X, transformation, source_indices)?;
            transformed_X =
                concatenate(Axis(1), &[transformed_X.view(), new_features.view()]).unwrap();
        }

        // Standardize columns using the stored (mean, std) parameters.
        for (feature_idx, (mean, std)) in &transformation_info.scaling_params {
            if *feature_idx < transformed_X.ncols() {
                let mut column = transformed_X.column_mut(*feature_idx);
                for value in column.iter_mut() {
                    *value = (*value - mean) / std;
                }
            }
        }

        // Keep only selected indices that exist in the transformed matrix.
        let valid_indices: Vec<usize> = transformation_info
            .selected_indices
            .iter()
            .filter(|&&idx| idx < transformed_X.ncols())
            .copied()
            .collect();

        if valid_indices.is_empty() {
            return Err("No valid feature indices to select".into());
        }

        let selected_X = transformed_X.select(Axis(1), &valid_indices);
        Ok(selected_X)
    }

    fn analyze_dataset_for_features(
        &self,
        X: &Array2<f64>,
        y: &Array1<f64>,
    ) -> DatasetCharacteristics {
        let n_samples = X.nrows();
        let n_features = X.ncols();

        let sparsity = self.calculate_sparsity(X);
        let correlation_structure = self.analyze_correlation_structure(X);
        let linearity_score = self.estimate_linearity(X, y);

        let (n_classes, class_distribution, target_stats) = match self.config.task_type {
            TaskType::Classification => {
                let classes = self.get_unique_classes(y);
                let class_dist = self.calculate_class_distribution(y, &classes);
                (Some(classes.len()), Some(class_dist), None)
            }
            TaskType::Regression => {
                let stats = crate::automl_algorithm_selection::TargetStatistics {
                    mean: y.mean().unwrap(),
                    std: y.std(0.0),
                    // Higher moments and outlier counts are not computed here.
                    skewness: 0.0,
                    kurtosis: 0.0,
                    n_outliers: 0,
                };
                (None, None, Some(stats))
            }
        };

        crate::automl_algorithm_selection::DatasetCharacteristics {
            n_samples,
            n_features,
            n_classes,
            class_distribution,
            target_stats,
            // Missing-value and categorical ratios are not analyzed here.
            missing_ratio: 0.0,
            categorical_ratio: 0.0,
            correlation_condition_number: correlation_structure,
            sparsity,
            // Rough heuristic: assume 80% of the columns carry information.
            effective_dimensionality: Some((n_features as f64 * 0.8) as usize),
            noise_level: 0.1,
            linearity_score,
        }
    }

    /// Build the list of candidate transformations implied by the strategy,
    /// then shuffle and cap it so the feature budget is respected.
    fn generate_transformations(
        &mut self,
        dataset_chars: &DatasetCharacteristics,
    ) -> Vec<FeatureTransformationType> {
        let mut transformations = Vec::new();

        match &self.config.strategy {
            FeatureEngineeringStrategy::Conservative => {
                if self.config.enable_polynomial {
                    transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
                }
                if self.config.enable_math_transforms {
                    transformations.push(FeatureTransformationType::Logarithmic);
                    transformations.push(FeatureTransformationType::SquareRoot);
                }
            }

            FeatureEngineeringStrategy::Balanced => {
                if self.config.enable_polynomial {
                    transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
                    if dataset_chars.n_features < 20 {
                        transformations.push(FeatureTransformationType::Polynomial { degree: 3 });
                    }
                }

                if self.config.enable_math_transforms {
                    transformations.extend(vec![
                        FeatureTransformationType::Logarithmic,
                        FeatureTransformationType::SquareRoot,
                        FeatureTransformationType::Absolute,
                        FeatureTransformationType::Reciprocal,
                    ]);
                }

                if self.config.enable_interactions && dataset_chars.n_features < 50 {
                    transformations.push(FeatureTransformationType::Interaction);
                    transformations.push(FeatureTransformationType::Ratio);
                }

                transformations.push(FeatureTransformationType::Binning { n_bins: 10 });
            }

            FeatureEngineeringStrategy::Aggressive => {
                if self.config.enable_polynomial {
                    transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
                    if dataset_chars.n_features < 15 {
                        transformations.push(FeatureTransformationType::Polynomial { degree: 3 });
                    }
                }

                if self.config.enable_math_transforms {
                    transformations.extend(vec![
                        FeatureTransformationType::Logarithmic,
                        FeatureTransformationType::SquareRoot,
                        FeatureTransformationType::Exponential,
                        FeatureTransformationType::Reciprocal,
                        FeatureTransformationType::Sine,
                        FeatureTransformationType::Cosine,
                        FeatureTransformationType::Absolute,
                        FeatureTransformationType::Sign,
                    ]);
                }

                if self.config.enable_interactions {
                    transformations.push(FeatureTransformationType::Interaction);
                    transformations.push(FeatureTransformationType::Ratio);
                    transformations.push(FeatureTransformationType::Difference);
                }

                transformations.extend(vec![
                    FeatureTransformationType::Binning { n_bins: 5 },
                    FeatureTransformationType::Binning { n_bins: 10 },
                    FeatureTransformationType::Binning { n_bins: 20 },
                ]);

                if self.config.enable_time_series {
                    transformations.extend(vec![
                        FeatureTransformationType::RollingStatistics { window: 3 },
                        FeatureTransformationType::RollingStatistics { window: 5 },
                        FeatureTransformationType::Lag { lag: 1 },
                        FeatureTransformationType::Lag { lag: 2 },
                    ]);
                }
            }

            FeatureEngineeringStrategy::Custom(custom_transforms) => {
                transformations.extend(custom_transforms.clone());
            }
        }

        // Shuffle so the cap below does not always keep the same transformations.
        transformations.shuffle(&mut self.rng);

        // Cap the list so the generated matrix stays within the feature budget.
        let max_transforms = (self.config.max_features / dataset_chars.n_features).max(1);
        transformations.truncate(max_transforms);

        transformations
    }

    fn apply_transformations(
        &mut self,
        X: &Array2<f64>,
        transformations: &[FeatureTransformationType],
    ) -> Result<(Array2<f64>, Vec<GeneratedFeature>)> {
        let mut enhanced_X = X.clone();
        let mut generated_features = Vec::new();

        // Register the original columns first (the transformation field is a placeholder).
        for i in 0..X.ncols() {
            generated_features.push(GeneratedFeature {
                name: format!("original_feature_{}", i),
                transformation: FeatureTransformationType::Absolute,
                source_features: vec![i],
                importance_score: 0.0,
                is_selected: false,
                statistics: FeatureStatistics {
                    mean: 0.0,
                    std: 0.0,
                    min: 0.0,
                    max: 0.0,
                    n_unique: 0,
                    missing_ratio: 0.0,
                    skewness: 0.0,
                    kurtosis: 0.0,
                },
            });
        }

        for transformation in transformations {
            let source_indices: Vec<usize> = match transformation {
                FeatureTransformationType::Interaction
                | FeatureTransformationType::Ratio
                | FeatureTransformationType::Difference => {
                    // Pairwise transforms use a random subset of feature pairs.
                    self.select_feature_pairs(X.ncols())
                }
                _ => {
                    // Unary transforms apply to every original column.
                    (0..X.ncols()).collect()
                }
            };

            let new_features =
                self.apply_single_transformation(&enhanced_X, transformation, &source_indices)?;

            for (i, _) in new_features.columns().into_iter().enumerate() {
                generated_features.push(GeneratedFeature {
                    name: format!("{}_{}", transformation, i),
                    transformation: transformation.clone(),
                    source_features: source_indices.clone(),
                    importance_score: 0.0,
                    is_selected: false,
                    statistics: FeatureStatistics {
                        mean: 0.0,
                        std: 0.0,
                        min: 0.0,
                        max: 0.0,
                        n_unique: 0,
                        missing_ratio: 0.0,
                        skewness: 0.0,
                        kurtosis: 0.0,
                    },
                });
            }

            enhanced_X = concatenate(Axis(1), &[enhanced_X.view(), new_features.view()]).unwrap();

            // Stop once the feature budget is exhausted.
            if enhanced_X.ncols() >= self.config.max_features {
                break;
            }
        }

        Ok((enhanced_X, generated_features))
    }

    fn apply_single_transformation(
        &self,
        X: &Array2<f64>,
        transformation: &FeatureTransformationType,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        match transformation {
            FeatureTransformationType::Polynomial { degree } => {
                self.apply_polynomial_features(X, source_indices, *degree)
            }

            FeatureTransformationType::Logarithmic => {
                self.apply_logarithmic_transform(X, source_indices)
            }

            FeatureTransformationType::SquareRoot => self.apply_sqrt_transform(X, source_indices),

            FeatureTransformationType::Exponential => self.apply_exp_transform(X, source_indices),

            FeatureTransformationType::Reciprocal => {
                self.apply_reciprocal_transform(X, source_indices)
            }

            FeatureTransformationType::Sine => self.apply_sine_transform(X, source_indices),

            FeatureTransformationType::Cosine => self.apply_cosine_transform(X, source_indices),

            FeatureTransformationType::Absolute => self.apply_absolute_transform(X, source_indices),

            FeatureTransformationType::Sign => self.apply_sign_transform(X, source_indices),

            FeatureTransformationType::Binning { n_bins } => {
                self.apply_binning_transform(X, source_indices, *n_bins)
            }

            FeatureTransformationType::Interaction => {
                self.apply_interaction_features(X, source_indices)
            }

            FeatureTransformationType::Ratio => self.apply_ratio_features(X, source_indices),

            FeatureTransformationType::Difference => {
                self.apply_difference_features(X, source_indices)
            }

            FeatureTransformationType::RollingStatistics { window } => {
                self.apply_rolling_statistics(X, source_indices, *window)
            }

            FeatureTransformationType::Lag { lag } => {
                self.apply_lag_features(X, source_indices, *lag)
            }
        }
    }

    fn calculate_feature_statistics(
        &self,
        mut generated_features: Vec<GeneratedFeature>,
        X: &Array2<f64>,
        y: &Array1<f64>,
    ) -> Vec<GeneratedFeature> {
        for (i, feature) in generated_features.iter_mut().enumerate() {
            if i < X.ncols() {
                let column = X.column(i);

                feature.statistics = FeatureStatistics {
                    mean: column.mean().unwrap_or(0.0),
                    std: column.std(0.0),
                    min: column.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
                    max: column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
                    n_unique: self.count_unique_values(&column),
                    missing_ratio: column.iter().filter(|&&x| x.is_nan()).count() as f64
                        / column.len() as f64,
                    // Skewness and kurtosis are not computed in this simplified version.
                    skewness: 0.0,
                    kurtosis: 0.0,
                };

                feature.importance_score = self.calculate_feature_importance(&column, y);
            }
        }

        generated_features
    }

    fn select_features(
        &self,
        X: &Array2<f64>,
        _y: &Array1<f64>,
        mut generated_features: Vec<GeneratedFeature>,
    ) -> Result<(Vec<GeneratedFeature>, Vec<usize>)> {
        // Rank features by importance, highest first.
        let mut indexed_features: Vec<(usize, &mut GeneratedFeature)> =
            generated_features.iter_mut().enumerate().collect();

        indexed_features.sort_by(|a, b| {
            b.1.importance_score
                .partial_cmp(&a.1.importance_score)
                .unwrap()
        });

        let n_features_to_select = match &self.config.selection_method {
            FeatureSelectionMethod::SelectKBest { k } => (*k).min(X.ncols()),
            FeatureSelectionMethod::SelectPercentile { percentile } => {
                ((X.ncols() as f64 * percentile / 100.0) as usize).max(1)
            }
            _ => self.config.max_selected_features.min(X.ncols()),
        };

        let selected_indices = match &self.config.selection_method {
            FeatureSelectionMethod::SelectKBest { k: _ }
            | FeatureSelectionMethod::SelectPercentile { percentile: _ } => {
                // Take the top-ranked features.
                indexed_features
                    .iter()
                    .take(n_features_to_select)
                    .map(|(idx, _)| *idx)
                    .collect()
            }

            FeatureSelectionMethod::VarianceThreshold { threshold } => {
                self.select_by_variance_threshold(X, *threshold)
            }

            FeatureSelectionMethod::CorrelationThreshold { threshold } => {
                self.select_by_correlation_threshold(X, *threshold)
            }

            _ => {
                // Other methods fall back to importance ranking.
                indexed_features
                    .iter()
                    .take(n_features_to_select)
                    .map(|(idx, _)| *idx)
                    .collect()
            }
        };

        // Mark the chosen features.
        for (i, feature) in generated_features.iter_mut().enumerate() {
            feature.is_selected = selected_indices.contains(&i);
        }

        Ok((generated_features, selected_indices))
    }

    fn estimate_performance_improvement(
        &self,
        _original_X: &Array2<f64>,
        _enhanced_X: &Array2<f64>,
        _y: &Array1<f64>,
        _selected_indices: &[usize],
    ) -> Result<f64> {
        // Placeholder estimate: a full implementation would cross-validate a model
        // on the original and enhanced feature sets and compare the scores.
        let original_score = 0.7;
        let enhanced_score = 0.8;
        Ok(enhanced_score - original_score)
    }

    fn create_transformation_info(
        &self,
        transformations: &[FeatureTransformationType],
        selected_indices: &[usize],
        enhanced_X: &Array2<f64>,
    ) -> TransformationInfo {
        let mut scaling_params = HashMap::new();
        let binning_boundaries = HashMap::new();

        // Record (mean, std) for each selected column so `transform` can standardize new data.
        for &idx in selected_indices {
            if idx < enhanced_X.ncols() {
                let column = enhanced_X.column(idx);
                let mean = column.mean().unwrap_or(0.0);
                let std = column.std(0.0);
                scaling_params.insert(idx, (mean, std));
            }
        }

        TransformationInfo {
            // Source indices are not tracked per transformation yet, so they are stored empty.
            transformations: transformations
                .iter()
                .map(|t| (t.clone(), vec![]))
                .collect(),
            selected_indices: selected_indices.to_vec(),
            scaling_params,
            binning_boundaries,
        }
    }

    fn apply_polynomial_features(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
        degree: usize,
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let selected_X = X.select(Axis(1), source_indices);
        let n_features = selected_X.ncols();

        if degree == 2 {
            let mut poly_features = Vec::new();

            // Squared terms for every selected feature.
            for i in 0..n_features {
                let col = selected_X.column(i);
                let squared: Vec<f64> = col.iter().map(|&x| x * x).collect();
                poly_features.push(squared);
            }

            // Pairwise cross terms, but only when the feature count keeps this tractable.
            if n_features < 20 {
                for i in 0..n_features {
                    for j in (i + 1)..n_features {
                        let col_i = selected_X.column(i);
                        let col_j = selected_X.column(j);
                        let interaction: Vec<f64> = col_i
                            .iter()
                            .zip(col_j.iter())
                            .map(|(&xi, &xj)| xi * xj)
                            .collect();
                        poly_features.push(interaction);
                    }
                }
            }

            let n_poly_features = poly_features.len();
            let mut result = Array2::zeros((n_samples, n_poly_features));
            for (j, feature) in poly_features.iter().enumerate() {
                for (i, &value) in feature.iter().enumerate() {
                    result[[i, j]] = value;
                }
            }
            Ok(result)
        } else {
            // For other degrees, only raise each feature to the given power.
            let mut result = Array2::zeros((n_samples, n_features));
            for (j, i) in source_indices.iter().enumerate() {
                let col = X.column(*i);
                for (row, &value) in col.iter().enumerate() {
                    result[[row, j]] = value.powi(degree as i32);
                }
            }
            Ok(result)
        }
    }

    fn apply_logarithmic_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                let log_value = if value > 0.0 {
                    value.ln()
                } else if value == 0.0 {
                    0.0
                } else {
                    -(value.abs() + 1e-8).ln()
                };
                result[[row, j]] = log_value;
            }
        }
        Ok(result)
    }

    fn apply_sqrt_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                let sqrt_value = if value >= 0.0 {
                    value.sqrt()
                } else {
                    -(value.abs().sqrt())
                };
                result[[row, j]] = sqrt_value;
            }
        }
        Ok(result)
    }

    fn apply_exp_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                let clipped_value = value.clamp(-10.0, 10.0);
                result[[row, j]] = clipped_value.exp();
            }
        }
        Ok(result)
    }

    fn apply_reciprocal_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                let reciprocal = if value.abs() > 1e-8 { 1.0 / value } else { 0.0 };
                result[[row, j]] = reciprocal;
            }
        }
        Ok(result)
    }

    fn apply_sine_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                result[[row, j]] = value.sin();
            }
        }
        Ok(result)
    }

    fn apply_cosine_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                result[[row, j]] = value.cos();
            }
        }
        Ok(result)
    }

    fn apply_absolute_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                result[[row, j]] = value.abs();
            }
        }
        Ok(result)
    }

    fn apply_sign_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            for (row, &value) in col.iter().enumerate() {
                let sign = if value > 0.0 {
                    1.0
                } else if value < 0.0 {
                    -1.0
                } else {
                    0.0
                };
                result[[row, j]] = sign;
            }
        }
        Ok(result)
    }

    fn apply_binning_transform(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
        n_bins: usize,
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);
            let min_val = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max_val = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
            let bin_width = (max_val - min_val) / (n_bins as f64);

            for (row, &value) in col.iter().enumerate() {
                let bin = if bin_width > 0.0 {
                    ((value - min_val) / bin_width)
                        .floor()
                        .min((n_bins - 1) as f64)
                } else {
                    0.0
                };
                result[[row, j]] = bin;
            }
        }
        Ok(result)
    }

    fn apply_interaction_features(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let selected_X = X.select(Axis(1), source_indices);
        let n_features = selected_X.ncols();

        // Products of every unordered pair of selected features.
        let mut interactions = Vec::new();
        for i in 0..n_features {
            for j in (i + 1)..n_features {
                let col_i = selected_X.column(i);
                let col_j = selected_X.column(j);
                let interaction: Vec<f64> = col_i
                    .iter()
                    .zip(col_j.iter())
                    .map(|(&xi, &xj)| xi * xj)
                    .collect();
                interactions.push(interaction);
            }
        }

        if interactions.is_empty() {
            return Ok(Array2::zeros((n_samples, 1)));
        }

        let n_interactions = interactions.len();
        let mut result = Array2::zeros((n_samples, n_interactions));
        for (j, interaction) in interactions.iter().enumerate() {
            for (i, &value) in interaction.iter().enumerate() {
                result[[i, j]] = value;
            }
        }
        Ok(result)
    }

    fn apply_ratio_features(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let selected_X = X.select(Axis(1), source_indices);
        let n_features = selected_X.ncols();

        // Ratios of every ordered pair of selected features, guarding against division by zero.
        let mut ratios = Vec::new();
        for i in 0..n_features {
            for j in 0..n_features {
                if i != j {
                    let col_i = selected_X.column(i);
                    let col_j = selected_X.column(j);
                    let ratio: Vec<f64> = col_i
                        .iter()
                        .zip(col_j.iter())
                        .map(|(&xi, &xj)| if xj.abs() > 1e-8 { xi / xj } else { 0.0 })
                        .collect();
                    ratios.push(ratio);
                }
            }
        }

        if ratios.is_empty() {
            return Ok(Array2::zeros((n_samples, 1)));
        }

        // Cap the number of ratio features to keep the output manageable.
        ratios.truncate(20);

        let n_ratios = ratios.len();
        let mut result = Array2::zeros((n_samples, n_ratios));
        for (j, ratio) in ratios.iter().enumerate() {
            for (i, &value) in ratio.iter().enumerate() {
                result[[i, j]] = value;
            }
        }
        Ok(result)
    }

    fn apply_difference_features(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let selected_X = X.select(Axis(1), source_indices);
        let n_features = selected_X.ncols();

        let mut differences = Vec::new();
        for i in 0..n_features {
            for j in (i + 1)..n_features {
                let col_i = selected_X.column(i);
                let col_j = selected_X.column(j);
                let diff: Vec<f64> = col_i
                    .iter()
                    .zip(col_j.iter())
                    .map(|(&xi, &xj)| xi - xj)
                    .collect();
                differences.push(diff);
            }
        }

        if differences.is_empty() {
            return Ok(Array2::zeros((n_samples, 1)));
        }

        let n_differences = differences.len();
        let mut result = Array2::zeros((n_samples, n_differences));
        for (j, diff) in differences.iter().enumerate() {
            for (i, &value) in diff.iter().enumerate() {
                result[[i, j]] = value;
            }
        }
        Ok(result)
    }

    fn apply_rolling_statistics(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
        window: usize,
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        // Two output columns per source feature: rolling mean and rolling std.
        let mut result = Array2::zeros((n_samples, n_features * 2));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);

            for row in 0..n_samples {
                let start = row.saturating_sub(window - 1);
                let end = (row + 1).min(n_samples);
                let window_data: Vec<f64> = col.slice(s![start..end]).to_vec();

                let mean = window_data.iter().sum::<f64>() / window_data.len() as f64;
                let variance = window_data.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
                    / window_data.len() as f64;
                let std = variance.sqrt();

                result[[row, j * 2]] = mean;
                result[[row, j * 2 + 1]] = std;
            }
        }
        Ok(result)
    }

    fn apply_lag_features(
        &self,
        X: &Array2<f64>,
        source_indices: &[usize],
        lag: usize,
    ) -> Result<Array2<f64>> {
        let n_samples = X.nrows();
        let n_features = source_indices.len();
        let mut result = Array2::zeros((n_samples, n_features));

        for (j, &i) in source_indices.iter().enumerate() {
            let col = X.column(i);

            // Early rows saturate at index 0 instead of being dropped or padded with NaN.
            for row in 0..n_samples {
                let lag_row = row.saturating_sub(lag);
                result[[row, j]] = col[lag_row];
            }
        }
        Ok(result)
    }

    /// Randomly sample feature pairs for pairwise transforms and return the
    /// deduplicated set of indices.
    fn select_feature_pairs(&mut self, n_features: usize) -> Vec<usize> {
        let max_pairs = 10.min(n_features);
        let mut indices = Vec::new();

        for _ in 0..max_pairs {
            let i = self.rng.gen_range(0..n_features);
            let j = self.rng.gen_range(0..n_features);
            if i != j {
                indices.extend(vec![i, j]);
            }
        }

        indices.sort_unstable();
        indices.dedup();
        indices
    }

    fn calculate_sparsity(&self, X: &Array2<f64>) -> f64 {
        let total_values = X.len() as f64;
        let zero_count = X.iter().filter(|&&x| x == 0.0).count() as f64;
        zero_count / total_values
    }

    fn analyze_correlation_structure(&self, _X: &Array2<f64>) -> f64 {
        // Placeholder: a full implementation would compute the condition number
        // of the correlation matrix.
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(1.0..100.0)
    }

    fn estimate_linearity(&self, _X: &Array2<f64>, _y: &Array1<f64>) -> f64 {
        // Placeholder: a full implementation would compare a linear fit against
        // a non-linear baseline.
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(0.0..1.0)
    }

    fn get_unique_classes(&self, y: &Array1<f64>) -> Vec<i32> {
        let mut classes: Vec<i32> = y.iter().map(|&x| x as i32).collect();
        classes.sort_unstable();
        classes.dedup();
        classes
    }

    fn calculate_class_distribution(&self, y: &Array1<f64>, classes: &[i32]) -> Vec<f64> {
        let total = y.len() as f64;
        classes
            .iter()
            .map(|&class| {
                let count = y.iter().filter(|&&yi| yi as i32 == class).count() as f64;
                count / total
            })
            .collect()
    }

    fn count_unique_values(&self, column: &ArrayView1<f64>) -> usize {
        // Quantize to three decimal places so nearly-equal floats count as one value.
        let mut values: Vec<i64> = column.iter().map(|&x| (x * 1000.0) as i64).collect();
        values.sort_unstable();
        values.dedup();
        values.len()
    }

    fn calculate_feature_importance(&self, _column: &ArrayView1<f64>, _y: &Array1<f64>) -> f64 {
        // Placeholder: a full implementation would score the column against the
        // target (e.g. mutual information or a univariate statistical test).
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(0.0..1.0)
    }

    fn select_by_variance_threshold(&self, X: &Array2<f64>, threshold: f64) -> Vec<usize> {
        (0..X.ncols())
            .filter(|&i| {
                let col = X.column(i);
                col.std(0.0) > threshold
            })
            .collect()
    }

    fn select_by_correlation_threshold(&self, X: &Array2<f64>, _threshold: f64) -> Vec<usize> {
        // Placeholder: correlation-based filtering is not implemented yet, so all
        // columns are kept.
        (0..X.ncols()).collect()
    }
}

/// Convenience wrapper: run automated feature engineering with the default
/// configuration for the given task type.
pub fn engineer_features(
    X: &Array2<f64>,
    y: &Array1<f64>,
    task_type: TaskType,
) -> Result<FeatureEngineeringResult> {
    let config = AutoFeatureEngineering {
        task_type,
        ..Default::default()
    };

    let mut engineer = AutoFeatureEngineer::new(config);
    engineer.engineer_features(X, y)
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{Array1, Array2};

    #[allow(non_snake_case)]
    fn create_test_data() -> (Array2<f64>, Array1<f64>) {
        let X = Array2::from_shape_vec((100, 4), (0..400).map(|i| i as f64).collect()).unwrap();
        let y = Array1::from_vec((0..100).map(|i| (i % 3) as f64).collect());
        (X, y)
    }

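    // Illustrative sketch (added example, not part of the original test suite): the
    // fixture above fills X with 0..400, so exactly one entry (the leading 0.0) is
    // zero and `calculate_sparsity` should report 1/400.
    #[test]
    fn test_sparsity_helper_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let sparsity = engineer.calculate_sparsity(&X);
        assert!((sparsity - 1.0 / 400.0).abs() < 1e-12);
    }
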
    #[test]
    fn test_feature_engineering() {
        let (X, y) = create_test_data();
        let result = engineer_features(&X, &y, TaskType::Classification);
        assert!(result.is_ok());

        let result = result.unwrap();
        assert!(result.generated_feature_count > result.original_feature_count);
        assert!(result.selected_feature_count <= result.generated_feature_count);
    }

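    // Illustrative sketch (added example): equal-width binning should map every value
    // of a column into the integer range [0, n_bins - 1].
    #[test]
    fn test_binning_transform_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let binned = engineer.apply_binning_transform(&X, &[0], 5).unwrap();
        assert_eq!(binned.ncols(), 1);
        for &bin in binned.column(0).iter() {
            assert!(bin >= 0.0 && bin <= 4.0);
        }
    }
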
    #[test]
    fn test_polynomial_features() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let poly_features = engineer.apply_polynomial_features(&X, &[0, 1], 2);
        assert!(poly_features.is_ok());

        let poly_features = poly_features.unwrap();
        assert!(poly_features.ncols() > 0);
    }

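    // Illustrative sketch (added example): for degree 2 with fewer than 20 source
    // features the implementation above emits k squared terms plus k*(k-1)/2 pairwise
    // cross terms, so two source columns should yield exactly three polynomial features.
    #[test]
    fn test_polynomial_feature_count_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let poly = engineer.apply_polynomial_features(&X, &[0, 1], 2).unwrap();
        assert_eq!(poly.ncols(), 3);
        assert_eq!(poly.nrows(), X.nrows());
    }
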
    #[test]
    fn test_mathematical_transforms() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let log_features = engineer.apply_logarithmic_transform(&X, &[0, 1]);
        assert!(log_features.is_ok());

        let sqrt_features = engineer.apply_sqrt_transform(&X, &[0, 1]);
        assert!(sqrt_features.is_ok());
    }

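    // Illustrative sketch (added example): the sign transform only ever emits -1, 0, or 1,
    // and the reciprocal transform maps the zero entry of the fixture to 0.0 instead of
    // dividing by zero.
    #[test]
    fn test_sign_and_reciprocal_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let signs = engineer.apply_sign_transform(&X, &[0]).unwrap();
        for &sgn in signs.column(0).iter() {
            assert!(sgn == -1.0 || sgn == 0.0 || sgn == 1.0);
        }

        let recip = engineer.apply_reciprocal_transform(&X, &[0]).unwrap();
        // X[[0, 0]] is 0.0, which the near-zero guard maps to 0.0.
        assert_eq!(recip[[0, 0]], 0.0);
    }
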
    #[test]
    fn test_interaction_features() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let interaction_features = engineer.apply_interaction_features(&X, &[0, 1, 2]);
        assert!(interaction_features.is_ok());

        let interaction_features = interaction_features.unwrap();
        assert!(interaction_features.ncols() > 0);
    }

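    // Illustrative sketch (added example): difference features are built for every
    // unordered pair, so three source columns produce exactly three difference columns,
    // with column 0 holding feature0 - feature1.
    #[test]
    fn test_difference_feature_count_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let diffs = engineer.apply_difference_features(&X, &[0, 1, 2]).unwrap();
        assert_eq!(diffs.ncols(), 3);
        assert_eq!(diffs[[0, 0]], X[[0, 0]] - X[[0, 1]]);
    }
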
    #[test]
    fn test_custom_strategy() {
        let (X, y) = create_test_data();

        let config = AutoFeatureEngineering {
            strategy: FeatureEngineeringStrategy::Custom(vec![
                FeatureTransformationType::Polynomial { degree: 2 },
                FeatureTransformationType::Logarithmic,
            ]),
            max_features: 50,
            ..Default::default()
        };

        let mut engineer = AutoFeatureEngineer::new(config);
        let result = engineer.engineer_features(&X, &y);
        assert!(result.is_ok());
    }

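    // Illustrative sketch (added example): the time-series transforms have fixed output
    // shapes; rolling statistics emit a mean and a std column per source feature, and
    // the lag transform emits one column per source feature with early rows saturated
    // at index 0.
    #[test]
    fn test_time_series_transform_shapes_sketch() {
        let (X, _y) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let rolling = engineer.apply_rolling_statistics(&X, &[0, 1], 3).unwrap();
        assert_eq!(rolling.ncols(), 4);
        assert_eq!(rolling.nrows(), X.nrows());

        let lagged = engineer.apply_lag_features(&X, &[0], 1).unwrap();
        assert_eq!(lagged.ncols(), 1);
        assert_eq!(lagged[[0, 0]], X[[0, 0]]);
        assert_eq!(lagged[[2, 0]], X[[1, 0]]);
    }
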
    #[test]
    fn test_feature_selection_methods() {
        let (X, y) = create_test_data();

        let config = AutoFeatureEngineering {
            selection_method: FeatureSelectionMethod::SelectPercentile { percentile: 50.0 },
            ..Default::default()
        };

        let mut engineer = AutoFeatureEngineer::new(config);
        let result = engineer.engineer_features(&X, &y);
        assert!(result.is_ok());

        let result = result.unwrap();
        assert!(result.selected_feature_count > 0);
    }

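    // Illustrative sketch (added example): the Display impl encodes transformation
    // parameters in parentheses, which is what generated feature names rely on.
    #[test]
    fn test_transformation_display_sketch() {
        assert_eq!(
            format!("{}", FeatureTransformationType::Polynomial { degree: 2 }),
            "Polynomial(2)"
        );
        assert_eq!(
            format!("{}", FeatureTransformationType::Binning { n_bins: 10 }),
            "Binning(10)"
        );
        assert_eq!(
            format!("{}", FeatureTransformationType::RollingStatistics { window: 5 }),
            "RollingStats(5)"
        );
    }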
}