1use crate::{automl_algorithm_selection::DatasetCharacteristics, scoring::TaskType};
8use scirs2_core::ndarray::{concatenate, s, Array1, Array2, ArrayView1, Axis};
9use scirs2_core::SliceRandomExt;
10use sklears_core::error::Result;
11use std::collections::HashMap;
12use std::fmt;
13use scirs2_core::random::rngs::StdRng;
16use scirs2_core::random::{RngExt, SeedableRng};
17
/// The kinds of feature transformations the automated engineer can generate.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FeatureTransformationType {
    /// Powers of each feature up to `degree` (degree 2 also adds pairwise products).
    Polynomial { degree: usize },
    /// Signed natural log; negative inputs map to `-ln(|x| + 1e-8)`, zero maps to 0.
    Logarithmic,
    /// Signed square root; negative inputs map to `-sqrt(|x|)`.
    SquareRoot,
    /// `exp(x)` with inputs clamped to [-10, 10] to avoid overflow.
    Exponential,
    /// `1/x`, with near-zero inputs mapped to 0.
    Reciprocal,
    /// `sin(x)`.
    Sine,
    /// `cos(x)`.
    Cosine,
    /// `|x|`.
    Absolute,
    /// Sign of x: -1.0, 0.0, or 1.0.
    Sign,
    /// Equal-width discretization into `n_bins` bins.
    Binning { n_bins: usize },
    /// Elementwise products of feature pairs.
    Interaction,
    /// Elementwise quotients of feature pairs.
    Ratio,
    /// Elementwise differences of feature pairs.
    Difference,
    /// Rolling mean and std over a trailing window of `window` rows.
    RollingStatistics { window: usize },
    /// Value from `lag` rows earlier (clamped at the first row).
    Lag { lag: usize },
}
52
53impl fmt::Display for FeatureTransformationType {
54 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
55 match self {
56 FeatureTransformationType::Polynomial { degree } => write!(f, "Polynomial({})", degree),
57 FeatureTransformationType::Logarithmic => write!(f, "Logarithmic"),
58 FeatureTransformationType::SquareRoot => write!(f, "SquareRoot"),
59 FeatureTransformationType::Exponential => write!(f, "Exponential"),
60 FeatureTransformationType::Reciprocal => write!(f, "Reciprocal"),
61 FeatureTransformationType::Sine => write!(f, "Sine"),
62 FeatureTransformationType::Cosine => write!(f, "Cosine"),
63 FeatureTransformationType::Absolute => write!(f, "Absolute"),
64 FeatureTransformationType::Sign => write!(f, "Sign"),
65 FeatureTransformationType::Binning { n_bins } => write!(f, "Binning({})", n_bins),
66 FeatureTransformationType::Interaction => write!(f, "Interaction"),
67 FeatureTransformationType::Ratio => write!(f, "Ratio"),
68 FeatureTransformationType::Difference => write!(f, "Difference"),
69 FeatureTransformationType::RollingStatistics { window } => {
70 write!(f, "RollingStats({})", window)
71 }
72 FeatureTransformationType::Lag { lag } => write!(f, "Lag({})", lag),
73 }
74 }
75}
76
/// How aggressively to generate candidate features.
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureEngineeringStrategy {
    /// A small set of cheap transforms (degree-2 polynomial, log, sqrt).
    Conservative,
    /// A moderate set, adding interactions for narrow datasets and binning.
    Balanced,
    /// The full transform set, multiple binnings, optional time-series features.
    Aggressive,
    /// A caller-provided explicit transformation list.
    Custom(Vec<FeatureTransformationType>),
}
89
/// Strategies for choosing which generated features to keep.
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureSelectionMethod {
    /// Keep the `k` highest-importance features.
    SelectKBest { k: usize },
    /// Keep the top `percentile` percent of features by importance.
    SelectPercentile { percentile: f64 },
    /// Recursive feature elimination, removing `step` features per round.
    /// NOTE(review): falls back to top-k in the visible selection code.
    RecursiveFeatureElimination { step: usize },
    /// L1-regularized selection with strength `alpha`.
    /// NOTE(review): falls back to top-k in the visible selection code.
    L1Selection { alpha: f64 },
    /// Drop features whose variance is below `threshold`.
    VarianceThreshold { threshold: f64 },
    /// Drop features whose inter-feature correlation exceeds `threshold`.
    CorrelationThreshold { threshold: f64 },
    /// Mutual-information ranking keeping `k` features.
    /// NOTE(review): falls back to top-k in the visible selection code.
    MutualInformation { k: usize },
    /// Tree-based importance with cutoff `threshold`.
    /// NOTE(review): falls back to top-k in the visible selection code.
    TreeImportance { threshold: f64 },
}
110
/// Metadata for one column of the enhanced feature matrix (original or generated).
#[derive(Debug, Clone)]
pub struct GeneratedFeature {
    /// Human-readable name, e.g. "original_feature_3" or "Sine_0".
    pub name: String,
    /// Transformation that produced this column (placeholder `Absolute` is
    /// used for original, untransformed columns).
    pub transformation: FeatureTransformationType,
    /// Indices of the input columns this feature was derived from.
    pub source_features: Vec<usize>,
    /// Importance score used for ranking during selection.
    pub importance_score: f64,
    /// Whether this feature survived selection.
    pub is_selected: bool,
    /// Summary statistics of the column.
    pub statistics: FeatureStatistics,
}
127
/// Summary statistics for a single feature column.
#[derive(Debug, Clone)]
pub struct FeatureStatistics {
    /// Arithmetic mean.
    pub mean: f64,
    /// Population standard deviation (ddof = 0).
    pub std: f64,
    /// Minimum value.
    pub min: f64,
    /// Maximum value.
    pub max: f64,
    /// Number of distinct values (after rounding to 3 decimal places).
    pub n_unique: usize,
    /// Fraction of NaN entries.
    pub missing_ratio: f64,
    /// Skewness. NOTE(review): currently always 0.0 — not computed.
    pub skewness: f64,
    /// Kurtosis. NOTE(review): currently always 0.0 — not computed.
    pub kurtosis: f64,
}
148
/// Configuration for the automated feature engineering pipeline.
#[derive(Debug, Clone)]
pub struct AutoFeatureEngineering {
    /// How aggressively to generate candidate features.
    pub strategy: FeatureEngineeringStrategy,
    /// How to pick which generated features to keep.
    pub selection_method: FeatureSelectionMethod,
    /// Upper bound on total columns in the enhanced matrix.
    pub max_features: usize,
    /// Upper bound on features kept by fallback selection methods.
    pub max_selected_features: usize,
    /// Cross-validation folds. NOTE(review): not used by the visible code.
    pub cv_folds: usize,
    /// Classification vs regression; drives dataset profiling.
    pub task_type: TaskType,
    /// RNG seed for reproducible shuffling/sampling; `None` = entropy-seeded.
    pub random_seed: Option<u64>,
    /// Allow polynomial features.
    pub enable_polynomial: bool,
    /// Allow elementwise math transforms (log, sqrt, exp, ...).
    pub enable_math_transforms: bool,
    /// Allow pairwise interaction/ratio/difference features.
    pub enable_interactions: bool,
    /// Allow rolling-statistics and lag features (Aggressive strategy only).
    pub enable_time_series: bool,
    /// Minimum correlation threshold. NOTE(review): not used by the visible code.
    pub min_correlation_threshold: f64,
    /// Maximum correlation threshold. NOTE(review): not used by the visible code.
    pub max_correlation_threshold: f64,
}
179
impl Default for AutoFeatureEngineering {
    /// Balanced strategy, top-100 selection, classification task, and all
    /// non-time-series transform families enabled.
    fn default() -> Self {
        Self {
            strategy: FeatureEngineeringStrategy::Balanced,
            selection_method: FeatureSelectionMethod::SelectKBest { k: 100 },
            max_features: 1000,
            max_selected_features: 100,
            cv_folds: 5,
            task_type: TaskType::Classification,
            random_seed: None, // entropy-seeded RNG by default
            enable_polynomial: true,
            enable_math_transforms: true,
            enable_interactions: true,
            enable_time_series: false, // rolling/lag features off by default
            min_correlation_threshold: 0.05,
            max_correlation_threshold: 0.95,
        }
    }
}
199
/// Output of a full `engineer_features` run.
#[derive(Debug, Clone)]
pub struct FeatureEngineeringResult {
    /// Columns in the input matrix.
    pub original_feature_count: usize,
    /// Columns in the enhanced matrix (originals + generated).
    pub generated_feature_count: usize,
    /// Columns kept after selection.
    pub selected_feature_count: usize,
    /// Metadata for every column of the enhanced matrix.
    pub generated_features: Vec<GeneratedFeature>,
    /// Indices (into the enhanced matrix) of the kept columns.
    pub selected_indices: Vec<usize>,
    /// Importance per selected feature.
    /// NOTE(review): currently a zero-filled placeholder.
    pub feature_importances: Vec<f64>,
    /// Everything needed to re-apply the pipeline via `transform`.
    pub transformation_info: TransformationInfo,
    /// Estimated score gain from the engineered features.
    /// NOTE(review): currently a hard-coded placeholder, not a measured value.
    pub performance_improvement: f64,
    /// Wall-clock time of the run, in seconds.
    pub processing_time: f64,
}
222
/// Everything needed to re-apply a fitted feature-engineering pipeline to new
/// data (see `AutoFeatureEngineer::transform`).
#[derive(Debug, Clone)]
pub struct TransformationInfo {
    /// Ordered transformations with the source-column indices each one used.
    pub transformations: Vec<(FeatureTransformationType, Vec<usize>)>,
    /// Column indices (into the enhanced matrix) that were kept.
    pub selected_indices: Vec<usize>,
    /// Per-column (mean, std) standardization parameters.
    pub scaling_params: HashMap<usize, (f64, f64)>,
    /// Per-column bin edges for binning transforms.
    /// NOTE(review): left empty by `create_transformation_info`.
    pub binning_boundaries: HashMap<usize, Vec<f64>>,
}
235
236impl fmt::Display for FeatureEngineeringResult {
237 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238 writeln!(f, "Automated Feature Engineering Results")?;
239 writeln!(f, "=====================================")?;
240 writeln!(f, "Original features: {}", self.original_feature_count)?;
241 writeln!(f, "Generated features: {}", self.generated_feature_count)?;
242 writeln!(f, "Selected features: {}", self.selected_feature_count)?;
243 writeln!(
244 f,
245 "Performance improvement: {:.4}",
246 self.performance_improvement
247 )?;
248 writeln!(f, "Processing time: {:.2}s", self.processing_time)?;
249 writeln!(f)?;
250 writeln!(f, "Top 10 Generated Features:")?;
251
252 let mut top_features: Vec<_> = self
253 .generated_features
254 .iter()
255 .filter(|f| f.is_selected)
256 .collect();
257 top_features.sort_by(|a, b| {
258 b.importance_score
259 .partial_cmp(&a.importance_score)
260 .expect("operation should succeed")
261 });
262
263 for (i, feature) in top_features.iter().take(10).enumerate() {
264 writeln!(
265 f,
266 "{}. {} ({}): {:.4}",
267 i + 1,
268 feature.name,
269 feature.transformation,
270 feature.importance_score
271 )?;
272 }
273 Ok(())
274 }
275}
276
/// Runs the automated feature-engineering pipeline described by an
/// `AutoFeatureEngineering` configuration.
pub struct AutoFeatureEngineer {
    // Pipeline configuration.
    config: AutoFeatureEngineering,
    // RNG used for transformation shuffling and feature-pair sampling.
    rng: StdRng,
}
282
impl Default for AutoFeatureEngineer {
    /// Engineer using the default `AutoFeatureEngineering` configuration
    /// (entropy-seeded RNG).
    fn default() -> Self {
        Self::new(AutoFeatureEngineering::default())
    }
}
288
289impl AutoFeatureEngineer {
290 pub fn new(config: AutoFeatureEngineering) -> Self {
292 let rng = match config.random_seed {
293 Some(seed) => StdRng::seed_from_u64(seed),
294 None => StdRng::from_rng(&mut scirs2_core::random::thread_rng()),
295 };
296
297 Self { config, rng }
298 }
299
    /// Run the full feature-engineering pipeline on `X`/`y`.
    ///
    /// Steps: profile the dataset, pick a transformation list for the
    /// configured strategy, apply it to build an enhanced matrix, compute
    /// per-feature statistics and importances, select a subset of columns,
    /// and package the reproducible transformation info.
    pub fn engineer_features(
        &mut self,
        X: &Array2<f64>,
        y: &Array1<f64>,
    ) -> Result<FeatureEngineeringResult> {
        let start_time = std::time::Instant::now();
        let original_feature_count = X.ncols();

        // Dataset profile drives which transformations are worthwhile.
        let dataset_chars = self.analyze_dataset_for_features(X, y);

        let transformations = self.generate_transformations(&dataset_chars);

        // Enhanced matrix = original columns plus all generated columns.
        let (enhanced_X, generated_features) = self.apply_transformations(X, &transformations)?;

        let features_with_stats =
            self.calculate_feature_statistics(generated_features, &enhanced_X, y);

        let (selected_features, selected_indices) =
            self.select_features(&enhanced_X, y, features_with_stats)?;

        // NOTE(review): currently a hard-coded placeholder, not a measured score.
        let performance_improvement =
            self.estimate_performance_improvement(X, &enhanced_X, y, &selected_indices)?;

        let transformation_info =
            self.create_transformation_info(&transformations, &selected_indices, &enhanced_X);

        let processing_time = start_time.elapsed().as_secs_f64();

        Ok(FeatureEngineeringResult {
            original_feature_count,
            generated_feature_count: enhanced_X.ncols(),
            selected_feature_count: selected_indices.len(),
            generated_features: selected_features,
            selected_indices: selected_indices.clone(),
            // Placeholder: per-feature importances are not copied out yet.
            feature_importances: vec![0.0; selected_indices.len()],
            transformation_info,
            performance_improvement,
            processing_time,
        })
    }
348
349 pub fn transform(
351 &self,
352 X: &Array2<f64>,
353 transformation_info: &TransformationInfo,
354 ) -> Result<Array2<f64>> {
355 let mut transformed_X = X.clone();
357
358 for (transformation, source_indices) in &transformation_info.transformations {
360 let new_features =
361 self.apply_single_transformation(&transformed_X, transformation, source_indices)?;
362 transformed_X = concatenate(Axis(1), &[transformed_X.view(), new_features.view()])
364 .expect("operation should succeed");
365 }
366
367 for (feature_idx, (mean, std)) in &transformation_info.scaling_params {
369 if *feature_idx < transformed_X.ncols() {
370 let mut column = transformed_X.column_mut(*feature_idx);
371 for value in column.iter_mut() {
372 *value = (*value - mean) / std;
373 }
374 }
375 }
376
377 let valid_indices: Vec<usize> = transformation_info
379 .selected_indices
380 .iter()
381 .filter(|&&idx| idx < transformed_X.ncols())
382 .copied()
383 .collect();
384
385 if valid_indices.is_empty() {
386 return Err("No valid feature indices to select".into());
387 }
388
389 let selected_X = transformed_X.select(Axis(1), &valid_indices);
390 Ok(selected_X)
391 }
392
    /// Profile the dataset to guide transformation selection.
    ///
    /// NOTE(review): several output fields are placeholders (missing and
    /// categorical ratios, noise level, skewness/kurtosis/outliers), and the
    /// correlation/linearity estimates come from random stubs — confirm
    /// before relying on any of them downstream.
    fn analyze_dataset_for_features(
        &self,
        X: &Array2<f64>,
        y: &Array1<f64>,
    ) -> DatasetCharacteristics {
        let n_samples = X.nrows();
        let n_features = X.ncols();

        let sparsity = self.calculate_sparsity(X);
        let correlation_structure = self.analyze_correlation_structure(X);
        let linearity_score = self.estimate_linearity(X, y);

        // Classification gets class counts/distribution; regression gets
        // target summary statistics instead.
        let (n_classes, class_distribution, target_stats) = match self.config.task_type {
            TaskType::Classification => {
                let classes = self.get_unique_classes(y);
                let class_dist = self.calculate_class_distribution(y, &classes);
                (Some(classes.len()), Some(class_dist), None)
            }
            TaskType::Regression => {
                let stats = crate::automl_algorithm_selection::TargetStatistics {
                    mean: y.mean().expect("operation should succeed"),
                    std: y.std(0.0),
                    skewness: 0.0, // placeholder — not computed
                    kurtosis: 0.0, // placeholder — not computed
                    n_outliers: 0, // placeholder — not computed
                };
                (None, None, Some(stats))
            }
        };

        crate::automl_algorithm_selection::DatasetCharacteristics {
            n_samples,
            n_features,
            n_classes,
            class_distribution,
            target_stats,
            missing_ratio: 0.0,     // placeholder — NaNs are not counted here
            categorical_ratio: 0.0, // placeholder — all columns treated as numeric
            correlation_condition_number: correlation_structure,
            sparsity,
            // Rough heuristic: assume ~80% of columns carry information.
            effective_dimensionality: Some((n_features as f64 * 0.8) as usize),
            noise_level: 0.1, // placeholder
            linearity_score,
        }
    }
441
442 fn generate_transformations(
444 &mut self,
445 dataset_chars: &DatasetCharacteristics,
446 ) -> Vec<FeatureTransformationType> {
447 let mut transformations = Vec::new();
448
449 match &self.config.strategy {
450 FeatureEngineeringStrategy::Conservative => {
451 if self.config.enable_polynomial {
452 transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
453 }
454 if self.config.enable_math_transforms {
455 transformations.push(FeatureTransformationType::Logarithmic);
456 transformations.push(FeatureTransformationType::SquareRoot);
457 }
458 }
459
460 FeatureEngineeringStrategy::Balanced => {
461 if self.config.enable_polynomial {
462 transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
463 if dataset_chars.n_features < 20 {
464 transformations.push(FeatureTransformationType::Polynomial { degree: 3 });
465 }
466 }
467
468 if self.config.enable_math_transforms {
469 transformations.extend(vec![
470 FeatureTransformationType::Logarithmic,
471 FeatureTransformationType::SquareRoot,
472 FeatureTransformationType::Absolute,
473 FeatureTransformationType::Reciprocal,
474 ]);
475 }
476
477 if self.config.enable_interactions && dataset_chars.n_features < 50 {
478 transformations.push(FeatureTransformationType::Interaction);
479 transformations.push(FeatureTransformationType::Ratio);
480 }
481
482 transformations.push(FeatureTransformationType::Binning { n_bins: 10 });
483 }
484
485 FeatureEngineeringStrategy::Aggressive => {
486 if self.config.enable_polynomial {
487 transformations.push(FeatureTransformationType::Polynomial { degree: 2 });
488 if dataset_chars.n_features < 15 {
489 transformations.push(FeatureTransformationType::Polynomial { degree: 3 });
490 }
491 }
492
493 if self.config.enable_math_transforms {
494 transformations.extend(vec![
495 FeatureTransformationType::Logarithmic,
496 FeatureTransformationType::SquareRoot,
497 FeatureTransformationType::Exponential,
498 FeatureTransformationType::Reciprocal,
499 FeatureTransformationType::Sine,
500 FeatureTransformationType::Cosine,
501 FeatureTransformationType::Absolute,
502 FeatureTransformationType::Sign,
503 ]);
504 }
505
506 if self.config.enable_interactions {
507 transformations.push(FeatureTransformationType::Interaction);
508 transformations.push(FeatureTransformationType::Ratio);
509 transformations.push(FeatureTransformationType::Difference);
510 }
511
512 transformations.extend(vec![
513 FeatureTransformationType::Binning { n_bins: 5 },
514 FeatureTransformationType::Binning { n_bins: 10 },
515 FeatureTransformationType::Binning { n_bins: 20 },
516 ]);
517
518 if self.config.enable_time_series {
519 transformations.extend(vec![
520 FeatureTransformationType::RollingStatistics { window: 3 },
521 FeatureTransformationType::RollingStatistics { window: 5 },
522 FeatureTransformationType::Lag { lag: 1 },
523 FeatureTransformationType::Lag { lag: 2 },
524 ]);
525 }
526 }
527
528 FeatureEngineeringStrategy::Custom(custom_transforms) => {
529 transformations.extend(custom_transforms.clone());
530 }
531 }
532
533 transformations.shuffle(&mut self.rng);
535
536 let max_transforms = (self.config.max_features / dataset_chars.n_features).max(1);
538 transformations.truncate(max_transforms);
539
540 transformations
541 }
542
    /// Apply each transformation in order, growing the matrix, and return the
    /// enhanced matrix together with metadata for every column it contains.
    ///
    /// NOTE(review): each transformation is applied to `enhanced_X`, so later
    /// transformations also see columns produced by earlier ones — confirm
    /// this compounding is intended rather than applying each to `X` alone.
    fn apply_transformations(
        &mut self,
        X: &Array2<f64>,
        transformations: &[FeatureTransformationType],
    ) -> Result<(Array2<f64>, Vec<GeneratedFeature>)> {
        let mut enhanced_X = X.clone();
        let mut generated_features = Vec::new();

        // Seed the metadata list with one entry per original column.
        for i in 0..X.ncols() {
            generated_features.push(GeneratedFeature {
                name: format!("original_feature_{}", i),
                // Placeholder label: original columns are not transformed;
                // `Absolute` is used only because some variant is required.
                transformation: FeatureTransformationType::Absolute,
                source_features: vec![i],
                importance_score: 0.0,
                is_selected: false,
                // Zeroed stats; filled in later by calculate_feature_statistics.
                statistics: FeatureStatistics {
                    mean: 0.0,
                    std: 0.0,
                    min: 0.0,
                    max: 0.0,
                    n_unique: 0,
                    missing_ratio: 0.0,
                    skewness: 0.0,
                    kurtosis: 0.0,
                },
            });
        }

        for transformation in transformations {
            // Pairwise transforms sample a random subset of columns; all
            // other transforms operate on every column.
            let source_indices: Vec<usize> = match transformation {
                FeatureTransformationType::Interaction
                | FeatureTransformationType::Ratio
                | FeatureTransformationType::Difference => {
                    self.select_feature_pairs(X.ncols())
                }
                _ => {
                    (0..X.ncols()).collect()
                }
            };

            let new_features =
                self.apply_single_transformation(&enhanced_X, transformation, &source_indices)?;

            // One metadata entry per generated column.
            for (i, _) in new_features.columns().into_iter().enumerate() {
                generated_features.push(GeneratedFeature {
                    name: format!("{}_{}", transformation, i),
                    transformation: transformation.clone(),
                    source_features: source_indices.clone(),
                    importance_score: 0.0,
                    is_selected: false,
                    statistics: FeatureStatistics {
                        mean: 0.0,
                        std: 0.0,
                        min: 0.0,
                        max: 0.0,
                        n_unique: 0,
                        missing_ratio: 0.0,
                        skewness: 0.0,
                        kurtosis: 0.0,
                    },
                });
            }

            enhanced_X = concatenate(Axis(1), &[enhanced_X.view(), new_features.view()])
                .expect("operation should succeed");

            // Stop once the column budget is reached.
            if enhanced_X.ncols() >= self.config.max_features {
                break;
            }
        }

        Ok((enhanced_X, generated_features))
    }
623
624 fn apply_single_transformation(
626 &self,
627 X: &Array2<f64>,
628 transformation: &FeatureTransformationType,
629 source_indices: &[usize],
630 ) -> Result<Array2<f64>> {
631 match transformation {
632 FeatureTransformationType::Polynomial { degree } => {
633 self.apply_polynomial_features(X, source_indices, *degree)
634 }
635
636 FeatureTransformationType::Logarithmic => {
637 self.apply_logarithmic_transform(X, source_indices)
638 }
639
640 FeatureTransformationType::SquareRoot => self.apply_sqrt_transform(X, source_indices),
641
642 FeatureTransformationType::Exponential => self.apply_exp_transform(X, source_indices),
643
644 FeatureTransformationType::Reciprocal => {
645 self.apply_reciprocal_transform(X, source_indices)
646 }
647
648 FeatureTransformationType::Sine => self.apply_sine_transform(X, source_indices),
649
650 FeatureTransformationType::Cosine => self.apply_cosine_transform(X, source_indices),
651
652 FeatureTransformationType::Absolute => self.apply_absolute_transform(X, source_indices),
653
654 FeatureTransformationType::Sign => self.apply_sign_transform(X, source_indices),
655
656 FeatureTransformationType::Binning { n_bins } => {
657 self.apply_binning_transform(X, source_indices, *n_bins)
658 }
659
660 FeatureTransformationType::Interaction => {
661 self.apply_interaction_features(X, source_indices)
662 }
663
664 FeatureTransformationType::Ratio => self.apply_ratio_features(X, source_indices),
665
666 FeatureTransformationType::Difference => {
667 self.apply_difference_features(X, source_indices)
668 }
669
670 FeatureTransformationType::RollingStatistics { window } => {
671 self.apply_rolling_statistics(X, source_indices, *window)
672 }
673
674 FeatureTransformationType::Lag { lag } => {
675 self.apply_lag_features(X, source_indices, *lag)
676 }
677 }
678 }
679
680 fn calculate_feature_statistics(
682 &self,
683 mut generated_features: Vec<GeneratedFeature>,
684 X: &Array2<f64>,
685 y: &Array1<f64>,
686 ) -> Vec<GeneratedFeature> {
687 for (i, feature) in generated_features.iter_mut().enumerate() {
688 if i < X.ncols() {
689 let column = X.column(i);
690
691 feature.statistics = FeatureStatistics {
692 mean: column.mean().unwrap_or(0.0),
693 std: column.std(0.0),
694 min: column.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
695 max: column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
696 n_unique: self.count_unique_values(&column),
697 missing_ratio: column.iter().filter(|&&x| x.is_nan()).count() as f64
698 / column.len() as f64,
699 skewness: 0.0, kurtosis: 0.0, };
702
703 feature.importance_score = self.calculate_feature_importance(&column, y);
705 }
706 }
707
708 generated_features
709 }
710
    /// Choose which columns of `X` to keep and mark the corresponding
    /// metadata entries as selected.
    fn select_features(
        &self,
        X: &Array2<f64>,
        _y: &Array1<f64>,
        mut generated_features: Vec<GeneratedFeature>,
    ) -> Result<(Vec<GeneratedFeature>, Vec<usize>)> {
        // Rank features by importance, descending.
        // NOTE(review): the `expect` on partial_cmp panics if any importance
        // score is NaN.
        let mut indexed_features: Vec<(usize, &mut GeneratedFeature)> =
            generated_features.iter_mut().enumerate().collect();

        indexed_features.sort_by(|a, b| {
            b.1.importance_score
                .partial_cmp(&a.1.importance_score)
                .expect("operation should succeed")
        });

        // How many features to keep (used by the top-k style methods).
        let n_features_to_select = match &self.config.selection_method {
            FeatureSelectionMethod::SelectKBest { k } => (*k).min(X.ncols()),
            FeatureSelectionMethod::SelectPercentile { percentile } => {
                ((X.ncols() as f64 * percentile / 100.0) as usize).max(1)
            }
            _ => self.config.max_selected_features.min(X.ncols()),
        };

        let selected_indices = match &self.config.selection_method {
            // Top-k by importance rank.
            FeatureSelectionMethod::SelectKBest { k: _ }
            | FeatureSelectionMethod::SelectPercentile { percentile: _ } => {
                indexed_features
                    .iter()
                    .take(n_features_to_select)
                    .map(|(idx, _)| *idx)
                    .collect()
            }

            FeatureSelectionMethod::VarianceThreshold { threshold } => {
                self.select_by_variance_threshold(X, *threshold)
            }

            FeatureSelectionMethod::CorrelationThreshold { threshold } => {
                self.select_by_correlation_threshold(X, *threshold)
            }

            // All remaining methods fall back to top-k by importance.
            _ => {
                indexed_features
                    .iter()
                    .take(n_features_to_select)
                    .map(|(idx, _)| *idx)
                    .collect()
            }
        };

        // Flag the surviving entries in the metadata.
        for (i, feature) in generated_features.iter_mut().enumerate() {
            feature.is_selected = selected_indices.contains(&i);
        }

        Ok((generated_features, selected_indices))
    }
774
    /// Placeholder estimate of how much the engineered features improve model
    /// performance.
    ///
    /// NOTE(review): the scores are hard-coded (0.7 vs 0.8); no model is
    /// trained or cross-validated here. Replace with a real CV comparison
    /// before trusting `performance_improvement` in the results.
    fn estimate_performance_improvement(
        &self,
        _original_X: &Array2<f64>,
        _enhanced_X: &Array2<f64>,
        _y: &Array1<f64>,
        _selected_indices: &[usize],
    ) -> Result<f64> {
        let original_score = 0.7; // stand-in baseline score
        let enhanced_score = 0.8; // stand-in enhanced score
        Ok(enhanced_score - original_score)
    }
788
789 fn create_transformation_info(
791 &self,
792 transformations: &[FeatureTransformationType],
793 selected_indices: &[usize],
794 enhanced_X: &Array2<f64>,
795 ) -> TransformationInfo {
796 let mut scaling_params = HashMap::new();
797 let binning_boundaries = HashMap::new();
798
799 for &idx in selected_indices {
801 if idx < enhanced_X.ncols() {
802 let column = enhanced_X.column(idx);
803 let mean = column.mean().unwrap_or(0.0);
804 let std = column.std(0.0);
805 scaling_params.insert(idx, (mean, std));
806 }
807 }
808
809 TransformationInfo {
810 transformations: transformations
811 .iter()
812 .map(|t| (t.clone(), vec![]))
813 .collect(),
814 selected_indices: selected_indices.to_vec(),
815 scaling_params,
816 binning_boundaries,
817 }
818 }
819
820 fn apply_polynomial_features(
822 &self,
823 X: &Array2<f64>,
824 source_indices: &[usize],
825 degree: usize,
826 ) -> Result<Array2<f64>> {
827 let n_samples = X.nrows();
828 let selected_X = X.select(Axis(1), source_indices);
829 let n_features = selected_X.ncols();
830
831 if degree == 2 {
832 let mut poly_features = Vec::new();
834
835 for i in 0..n_features {
837 let col = selected_X.column(i);
838 let squared: Vec<f64> = col.iter().map(|&x| x * x).collect();
839 poly_features.push(squared);
840 }
841
842 if n_features < 20 {
844 for i in 0..n_features {
845 for j in (i + 1)..n_features {
846 let col_i = selected_X.column(i);
847 let col_j = selected_X.column(j);
848 let interaction: Vec<f64> = col_i
849 .iter()
850 .zip(col_j.iter())
851 .map(|(&xi, &xj)| xi * xj)
852 .collect();
853 poly_features.push(interaction);
854 }
855 }
856 }
857
858 let n_poly_features = poly_features.len();
860 let mut result = Array2::zeros((n_samples, n_poly_features));
861 for (j, feature) in poly_features.iter().enumerate() {
862 for (i, &value) in feature.iter().enumerate() {
863 result[[i, j]] = value;
864 }
865 }
866 Ok(result)
867 } else {
868 let mut result = Array2::zeros((n_samples, n_features));
870 for (j, i) in source_indices.iter().enumerate() {
871 let col = X.column(*i);
872 for (row, &value) in col.iter().enumerate() {
873 result[[row, j]] = value.powi(degree as i32);
874 }
875 }
876 Ok(result)
877 }
878 }
879
880 fn apply_logarithmic_transform(
881 &self,
882 X: &Array2<f64>,
883 source_indices: &[usize],
884 ) -> Result<Array2<f64>> {
885 let n_samples = X.nrows();
886 let n_features = source_indices.len();
887 let mut result = Array2::zeros((n_samples, n_features));
888
889 for (j, &i) in source_indices.iter().enumerate() {
890 let col = X.column(i);
891 for (row, &value) in col.iter().enumerate() {
892 let log_value = if value > 0.0 {
894 value.ln()
895 } else if value == 0.0 {
896 0.0
897 } else {
898 -(value.abs() + 1e-8).ln()
899 };
900 result[[row, j]] = log_value;
901 }
902 }
903 Ok(result)
904 }
905
906 fn apply_sqrt_transform(
907 &self,
908 X: &Array2<f64>,
909 source_indices: &[usize],
910 ) -> Result<Array2<f64>> {
911 let n_samples = X.nrows();
912 let n_features = source_indices.len();
913 let mut result = Array2::zeros((n_samples, n_features));
914
915 for (j, &i) in source_indices.iter().enumerate() {
916 let col = X.column(i);
917 for (row, &value) in col.iter().enumerate() {
918 let sqrt_value = if value >= 0.0 {
919 value.sqrt()
920 } else {
921 -(value.abs().sqrt())
922 };
923 result[[row, j]] = sqrt_value;
924 }
925 }
926 Ok(result)
927 }
928
929 fn apply_exp_transform(
930 &self,
931 X: &Array2<f64>,
932 source_indices: &[usize],
933 ) -> Result<Array2<f64>> {
934 let n_samples = X.nrows();
935 let n_features = source_indices.len();
936 let mut result = Array2::zeros((n_samples, n_features));
937
938 for (j, &i) in source_indices.iter().enumerate() {
939 let col = X.column(i);
940 for (row, &value) in col.iter().enumerate() {
941 let clipped_value = value.clamp(-10.0, 10.0);
943 result[[row, j]] = clipped_value.exp();
944 }
945 }
946 Ok(result)
947 }
948
949 fn apply_reciprocal_transform(
950 &self,
951 X: &Array2<f64>,
952 source_indices: &[usize],
953 ) -> Result<Array2<f64>> {
954 let n_samples = X.nrows();
955 let n_features = source_indices.len();
956 let mut result = Array2::zeros((n_samples, n_features));
957
958 for (j, &i) in source_indices.iter().enumerate() {
959 let col = X.column(i);
960 for (row, &value) in col.iter().enumerate() {
961 let reciprocal = if value.abs() > 1e-8 { 1.0 / value } else { 0.0 };
962 result[[row, j]] = reciprocal;
963 }
964 }
965 Ok(result)
966 }
967
968 fn apply_sine_transform(
969 &self,
970 X: &Array2<f64>,
971 source_indices: &[usize],
972 ) -> Result<Array2<f64>> {
973 let n_samples = X.nrows();
974 let n_features = source_indices.len();
975 let mut result = Array2::zeros((n_samples, n_features));
976
977 for (j, &i) in source_indices.iter().enumerate() {
978 let col = X.column(i);
979 for (row, &value) in col.iter().enumerate() {
980 result[[row, j]] = value.sin();
981 }
982 }
983 Ok(result)
984 }
985
986 fn apply_cosine_transform(
987 &self,
988 X: &Array2<f64>,
989 source_indices: &[usize],
990 ) -> Result<Array2<f64>> {
991 let n_samples = X.nrows();
992 let n_features = source_indices.len();
993 let mut result = Array2::zeros((n_samples, n_features));
994
995 for (j, &i) in source_indices.iter().enumerate() {
996 let col = X.column(i);
997 for (row, &value) in col.iter().enumerate() {
998 result[[row, j]] = value.cos();
999 }
1000 }
1001 Ok(result)
1002 }
1003
1004 fn apply_absolute_transform(
1005 &self,
1006 X: &Array2<f64>,
1007 source_indices: &[usize],
1008 ) -> Result<Array2<f64>> {
1009 let n_samples = X.nrows();
1010 let n_features = source_indices.len();
1011 let mut result = Array2::zeros((n_samples, n_features));
1012
1013 for (j, &i) in source_indices.iter().enumerate() {
1014 let col = X.column(i);
1015 for (row, &value) in col.iter().enumerate() {
1016 result[[row, j]] = value.abs();
1017 }
1018 }
1019 Ok(result)
1020 }
1021
1022 fn apply_sign_transform(
1023 &self,
1024 X: &Array2<f64>,
1025 source_indices: &[usize],
1026 ) -> Result<Array2<f64>> {
1027 let n_samples = X.nrows();
1028 let n_features = source_indices.len();
1029 let mut result = Array2::zeros((n_samples, n_features));
1030
1031 for (j, &i) in source_indices.iter().enumerate() {
1032 let col = X.column(i);
1033 for (row, &value) in col.iter().enumerate() {
1034 let sign = if value > 0.0 {
1035 1.0
1036 } else if value < 0.0 {
1037 -1.0
1038 } else {
1039 0.0
1040 };
1041 result[[row, j]] = sign;
1042 }
1043 }
1044 Ok(result)
1045 }
1046
1047 fn apply_binning_transform(
1048 &self,
1049 X: &Array2<f64>,
1050 source_indices: &[usize],
1051 n_bins: usize,
1052 ) -> Result<Array2<f64>> {
1053 let n_samples = X.nrows();
1054 let n_features = source_indices.len();
1055 let mut result = Array2::zeros((n_samples, n_features));
1056
1057 for (j, &i) in source_indices.iter().enumerate() {
1058 let col = X.column(i);
1059 let min_val = col.iter().fold(f64::INFINITY, |a, &b| a.min(b));
1060 let max_val = col.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
1061 let bin_width = (max_val - min_val) / (n_bins as f64);
1062
1063 for (row, &value) in col.iter().enumerate() {
1064 let bin = if bin_width > 0.0 {
1065 ((value - min_val) / bin_width)
1066 .floor()
1067 .min((n_bins - 1) as f64)
1068 } else {
1069 0.0
1070 };
1071 result[[row, j]] = bin;
1072 }
1073 }
1074 Ok(result)
1075 }
1076
1077 fn apply_interaction_features(
1078 &self,
1079 X: &Array2<f64>,
1080 source_indices: &[usize],
1081 ) -> Result<Array2<f64>> {
1082 let n_samples = X.nrows();
1083 let selected_X = X.select(Axis(1), source_indices);
1084 let n_features = selected_X.ncols();
1085
1086 let mut interactions = Vec::new();
1088 for i in 0..n_features {
1089 for j in (i + 1)..n_features {
1090 let col_i = selected_X.column(i);
1091 let col_j = selected_X.column(j);
1092 let interaction: Vec<f64> = col_i
1093 .iter()
1094 .zip(col_j.iter())
1095 .map(|(&xi, &xj)| xi * xj)
1096 .collect();
1097 interactions.push(interaction);
1098 }
1099 }
1100
1101 if interactions.is_empty() {
1102 return Ok(Array2::zeros((n_samples, 1)));
1103 }
1104
1105 let n_interactions = interactions.len();
1107 let mut result = Array2::zeros((n_samples, n_interactions));
1108 for (j, interaction) in interactions.iter().enumerate() {
1109 for (i, &value) in interaction.iter().enumerate() {
1110 result[[i, j]] = value;
1111 }
1112 }
1113 Ok(result)
1114 }
1115
1116 fn apply_ratio_features(
1117 &self,
1118 X: &Array2<f64>,
1119 source_indices: &[usize],
1120 ) -> Result<Array2<f64>> {
1121 let n_samples = X.nrows();
1122 let selected_X = X.select(Axis(1), source_indices);
1123 let n_features = selected_X.ncols();
1124
1125 let mut ratios = Vec::new();
1127 for i in 0..n_features {
1128 for j in 0..n_features {
1129 if i != j {
1130 let col_i = selected_X.column(i);
1131 let col_j = selected_X.column(j);
1132 let ratio: Vec<f64> = col_i
1133 .iter()
1134 .zip(col_j.iter())
1135 .map(|(&xi, &xj)| if xj.abs() > 1e-8 { xi / xj } else { 0.0 })
1136 .collect();
1137 ratios.push(ratio);
1138 }
1139 }
1140 }
1141
1142 if ratios.is_empty() {
1143 return Ok(Array2::zeros((n_samples, 1)));
1144 }
1145
1146 ratios.truncate(20);
1148
1149 let n_ratios = ratios.len();
1151 let mut result = Array2::zeros((n_samples, n_ratios));
1152 for (j, ratio) in ratios.iter().enumerate() {
1153 for (i, &value) in ratio.iter().enumerate() {
1154 result[[i, j]] = value;
1155 }
1156 }
1157 Ok(result)
1158 }
1159
1160 fn apply_difference_features(
1161 &self,
1162 X: &Array2<f64>,
1163 source_indices: &[usize],
1164 ) -> Result<Array2<f64>> {
1165 let n_samples = X.nrows();
1166 let selected_X = X.select(Axis(1), source_indices);
1167 let n_features = selected_X.ncols();
1168
1169 let mut differences = Vec::new();
1171 for i in 0..n_features {
1172 for j in (i + 1)..n_features {
1173 let col_i = selected_X.column(i);
1174 let col_j = selected_X.column(j);
1175 let diff: Vec<f64> = col_i
1176 .iter()
1177 .zip(col_j.iter())
1178 .map(|(&xi, &xj)| xi - xj)
1179 .collect();
1180 differences.push(diff);
1181 }
1182 }
1183
1184 if differences.is_empty() {
1185 return Ok(Array2::zeros((n_samples, 1)));
1186 }
1187
1188 let n_differences = differences.len();
1190 let mut result = Array2::zeros((n_samples, n_differences));
1191 for (j, diff) in differences.iter().enumerate() {
1192 for (i, &value) in diff.iter().enumerate() {
1193 result[[i, j]] = value;
1194 }
1195 }
1196 Ok(result)
1197 }
1198
1199 fn apply_rolling_statistics(
1200 &self,
1201 X: &Array2<f64>,
1202 source_indices: &[usize],
1203 window: usize,
1204 ) -> Result<Array2<f64>> {
1205 let n_samples = X.nrows();
1206 let n_features = source_indices.len();
1207 let mut result = Array2::zeros((n_samples, n_features * 2)); for (j, &i) in source_indices.iter().enumerate() {
1210 let col = X.column(i);
1211
1212 for row in 0..n_samples {
1213 let start = row.saturating_sub(window - 1);
1214 let end = (row + 1).min(n_samples);
1215 let window_data: Vec<f64> = col.slice(s![start..end]).to_vec();
1216
1217 let mean = window_data.iter().sum::<f64>() / window_data.len() as f64;
1218 let variance = window_data.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
1219 / window_data.len() as f64;
1220 let std = variance.sqrt();
1221
1222 result[[row, j * 2]] = mean;
1223 result[[row, j * 2 + 1]] = std;
1224 }
1225 }
1226 Ok(result)
1227 }
1228
1229 fn apply_lag_features(
1230 &self,
1231 X: &Array2<f64>,
1232 source_indices: &[usize],
1233 lag: usize,
1234 ) -> Result<Array2<f64>> {
1235 let n_samples = X.nrows();
1236 let n_features = source_indices.len();
1237 let mut result = Array2::zeros((n_samples, n_features));
1238
1239 for (j, &i) in source_indices.iter().enumerate() {
1240 let col = X.column(i);
1241
1242 for row in 0..n_samples {
1243 let lag_row = row.saturating_sub(lag);
1244 result[[row, j]] = col[lag_row];
1245 }
1246 }
1247 Ok(result)
1248 }
1249
1250 fn select_feature_pairs(&mut self, n_features: usize) -> Vec<usize> {
1252 let max_pairs = 10.min(n_features);
1254 let mut indices = Vec::new();
1255
1256 for _ in 0..max_pairs {
1257 let i = self.rng.random_range(0..n_features);
1258 let j = self.rng.random_range(0..n_features);
1259 if i != j {
1260 indices.extend(vec![i, j]);
1261 }
1262 }
1263
1264 indices.sort_unstable();
1265 indices.dedup();
1266 indices
1267 }
1268
1269 fn calculate_sparsity(&self, X: &Array2<f64>) -> f64 {
1270 let total_values = X.len() as f64;
1271 let zero_count = X.iter().filter(|&&x| x == 0.0).count() as f64;
1272 zero_count / total_values
1273 }
1274
    /// Placeholder correlation-structure score used for dataset profiling.
    ///
    /// NOTE(review): stub implementation — it ignores `_X` entirely and
    /// returns a random value in [1.0, 100.0), so the result is
    /// non-deterministic. Replace with a real measure (e.g. mean absolute
    /// pairwise correlation) before relying on it.
    fn analyze_correlation_structure(&self, _X: &Array2<f64>) -> f64 {
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(1.0..100.0)
    }
1281
    /// Placeholder linearity estimate for the feature/target relationship.
    ///
    /// NOTE(review): stub implementation — both inputs are ignored and a
    /// random value in [0.0, 1.0) is returned, so the result is
    /// non-deterministic. Replace with a real estimate (e.g. R^2 of a linear
    /// fit) before relying on it.
    fn estimate_linearity(&self, _X: &Array2<f64>, _y: &Array1<f64>) -> f64 {
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(0.0..1.0)
    }
1288
1289 fn get_unique_classes(&self, y: &Array1<f64>) -> Vec<i32> {
1290 let mut classes: Vec<i32> = y.iter().map(|&x| x as i32).collect();
1291 classes.sort_unstable();
1292 classes.dedup();
1293 classes
1294 }
1295
1296 fn calculate_class_distribution(&self, y: &Array1<f64>, classes: &[i32]) -> Vec<f64> {
1297 let total = y.len() as f64;
1298 classes
1299 .iter()
1300 .map(|&class| {
1301 let count = y.iter().filter(|&&yi| yi as i32 == class).count() as f64;
1302 count / total
1303 })
1304 .collect()
1305 }
1306
1307 fn count_unique_values(&self, column: &ArrayView1<f64>) -> usize {
1308 let mut values: Vec<i64> = column.iter().map(|&x| (x * 1000.0) as i64).collect();
1309 values.sort_unstable();
1310 values.dedup();
1311 values.len()
1312 }
1313
    /// Placeholder per-feature importance score.
    ///
    /// NOTE(review): stub implementation — both the feature column and the
    /// target are ignored and a random value in [0.0, 1.0) is returned, so
    /// any ranking built on this is noise. Replace with a real score (e.g.
    /// mutual information or univariate F-statistic) before relying on it.
    fn calculate_feature_importance(&self, _column: &ArrayView1<f64>, _y: &Array1<f64>) -> f64 {
        let mut rng = scirs2_core::random::thread_rng();
        rng.gen_range(0.0..1.0)
    }
1320
1321 fn select_by_variance_threshold(&self, X: &Array2<f64>, threshold: f64) -> Vec<usize> {
1322 (0..X.ncols())
1323 .filter(|&i| {
1324 let col = X.column(i);
1325 col.std(0.0) > threshold
1326 })
1327 .collect()
1328 }
1329
    /// Placeholder correlation-based feature filter.
    ///
    /// NOTE(review): stub — `_threshold` is ignored and every column index is
    /// returned, i.e. this selection method never drops a feature. Implement
    /// pairwise-correlation pruning before relying on it.
    fn select_by_correlation_threshold(&self, X: &Array2<f64>, _threshold: f64) -> Vec<usize> {
        (0..X.ncols()).collect()
    }
1334}
1335
1336pub fn engineer_features(
1338 X: &Array2<f64>,
1339 y: &Array1<f64>,
1340 task_type: TaskType,
1341) -> Result<FeatureEngineeringResult> {
1342 let config = AutoFeatureEngineering {
1343 task_type,
1344 ..Default::default()
1345 };
1346
1347 let mut engineer = AutoFeatureEngineer::new(config);
1348 engineer.engineer_features(X, y)
1349}
1350
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{Array1, Array2};

    /// Deterministic fixture: a 100x4 feature matrix holding 0..400 and a
    /// cyclic 3-class label vector.
    #[allow(non_snake_case)]
    fn create_test_data() -> (Array2<f64>, Array1<f64>) {
        let features = Array2::from_shape_vec((100, 4), (0..400).map(|v| v as f64).collect())
            .expect("operation should succeed");
        let labels: Array1<f64> = (0..100).map(|v| (v % 3) as f64).collect();
        (features, labels)
    }

    #[test]
    fn test_feature_engineering() {
        let (features, labels) = create_test_data();
        let outcome = engineer_features(&features, &labels, TaskType::Classification);
        assert!(outcome.is_ok());

        let outcome = outcome.expect("operation should succeed");
        // Engineering must add features, and selection can only prune them.
        assert!(outcome.generated_feature_count > outcome.original_feature_count);
        assert!(outcome.selected_feature_count <= outcome.generated_feature_count);
    }

    #[test]
    fn test_polynomial_features() {
        let (features, _labels) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let expanded = engineer.apply_polynomial_features(&features, &[0, 1], 2);
        assert!(expanded.is_ok());
        assert!(expanded.expect("operation should succeed").ncols() > 0);
    }

    #[test]
    fn test_mathematical_transforms() {
        let (features, _labels) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        // Both monotone transforms should succeed on the fixture columns.
        assert!(engineer.apply_logarithmic_transform(&features, &[0, 1]).is_ok());
        assert!(engineer.apply_sqrt_transform(&features, &[0, 1]).is_ok());
    }

    #[test]
    fn test_interaction_features() {
        let (features, _labels) = create_test_data();
        let engineer = AutoFeatureEngineer::default();

        let interactions = engineer.apply_interaction_features(&features, &[0, 1, 2]);
        assert!(interactions.is_ok());
        assert!(interactions.expect("operation should succeed").ncols() > 0);
    }

    #[test]
    fn test_custom_strategy() {
        let (features, labels) = create_test_data();

        // An explicit transformation list exercises the Custom strategy path.
        let config = AutoFeatureEngineering {
            strategy: FeatureEngineeringStrategy::Custom(vec![
                FeatureTransformationType::Polynomial { degree: 2 },
                FeatureTransformationType::Logarithmic,
            ]),
            max_features: 50,
            ..Default::default()
        };

        let mut engineer = AutoFeatureEngineer::new(config);
        assert!(engineer.engineer_features(&features, &labels).is_ok());
    }

    #[test]
    fn test_feature_selection_methods() {
        let (features, labels) = create_test_data();

        let config = AutoFeatureEngineering {
            selection_method: FeatureSelectionMethod::SelectPercentile { percentile: 50.0 },
            ..Default::default()
        };

        let mut engineer = AutoFeatureEngineer::new(config);
        let outcome = engineer.engineer_features(&features, &labels);
        assert!(outcome.is_ok());
        // Percentile selection must still retain at least one feature.
        assert!(outcome.expect("operation should succeed").selected_feature_count > 0);
    }
}