1use scirs2_core::ndarray::{Array1, Array2};
44use sklears_core::{
45 error::{Result, SklearsError},
46 traits::{Fit, Transform, Untrained},
47};
48use std::collections::{HashMap, HashSet};
49use std::marker::PhantomData;
50
51#[cfg(feature = "serde")]
52use serde::{Deserialize, Serialize};
53
/// Configuration for automated feature generation and selection.
///
/// Construct it with `AutoFeatureConfig::new()` (or `Default`) and adjust it
/// with the `with_*` builder methods.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct AutoFeatureConfig {
    /// Generation strategies, applied in order while fitting.
    pub strategies: Vec<GenerationStrategy>,
    /// Maximum number of features kept by the selection step.
    pub max_features: usize,
    /// Scoring method used to rank the generated features.
    pub selection_method: SelectionMethod,
    /// Minimum absolute score a feature needs to be selected.
    pub selection_threshold: f64,
    /// Whether the untouched input columns are added to the candidate set.
    pub include_original: bool,
    /// Optional random seed.
    // NOTE(review): not read by any code visible in this file — confirm intended use.
    pub random_state: Option<u64>,
    /// Maximum interaction depth.
    // NOTE(review): the interaction generator takes its depth from
    // `GenerationStrategy::Interactions`; this field is not read in this file.
    pub max_interaction_depth: usize,
    /// Whether highly correlated candidate features are pruned before scoring.
    pub remove_correlated: bool,
    /// Absolute correlation above which two features count as redundant.
    pub correlation_threshold: f64,
    /// Whether candidate features are standardised before pruning and scoring.
    pub scale_features: bool,
}
79
/// A family of derived features that the engineer can generate.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum GenerationStrategy {
    /// Powers `2..=degree` of every input column.
    Polynomial { degree: usize },
    /// Each listed function applied element-wise to every input column.
    Mathematical { functions: Vec<MathFunction> },
    /// Products of column pairs (`max_depth >= 2`) and triples (`max_depth >= 3`).
    Interactions { max_depth: usize },
    /// Equal-width binning of every non-constant column into `n_bins` bins.
    Binning { n_bins: usize },
    /// Ratios of every ordered pair of distinct columns.
    Ratios,
    /// Centred rolling mean over a window of roughly `window_size` rows.
    Aggregations { window_size: usize },
    /// Per-column occurrence counts of values rounded to three decimals.
    FrequencyEncoding,
    /// Features tailored to a data domain.
    DomainSpecific { domain: Domain },
}
101
/// Element-wise mathematical function usable as a feature transformation.
///
/// The enum is fieldless, so it also derives `PartialEq`, `Eq` and `Hash`;
/// this allows comparison in tests and use as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum MathFunction {
    /// Natural logarithm.
    Log,
    /// Natural logarithm of `1 + x`.
    Log1p,
    /// Square root.
    Sqrt,
    /// `x * x`.
    Square,
    /// Exponential `e^x`.
    Exp,
    /// Sine.
    Sin,
    /// Cosine.
    Cos,
    /// Tangent.
    Tan,
    /// Absolute value.
    Abs,
    /// `1 / x`.
    Reciprocal,
}
117
/// Data domain used to pick domain-specific feature generators.
///
/// The enum is fieldless, so it derives `Copy`, `PartialEq`, `Eq` and `Hash`
/// like the other fieldless enums in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Domain {
    /// Sequential data where row order is meaningful.
    TimeSeries,
    /// Financial data.
    Financial,
    /// Text-derived numeric data.
    Text,
    /// Image-derived numeric data.
    Image,
    /// No particular domain.
    Generic,
}
133
/// Scoring method used to rank generated features.
///
/// The enum is fieldless, so it also derives `PartialEq`, `Eq` and `Hash`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum SelectionMethod {
    /// Mutual information with the target.
    MutualInformation,
    /// Absolute Pearson correlation with the target.
    Correlation,
    /// Feature variance (target-independent).
    Variance,
    /// Chi-squared statistic.
    ChiSquared,
    /// F-test statistic.
    FTest,
    /// Recursive feature elimination.
    RecursiveElimination,
    /// LASSO-based selection.
    LASSO,
}
153
154impl Default for AutoFeatureConfig {
155 fn default() -> Self {
156 Self {
157 strategies: vec![
158 GenerationStrategy::Polynomial { degree: 2 },
159 GenerationStrategy::Mathematical {
160 functions: vec![
161 MathFunction::Log1p,
162 MathFunction::Sqrt,
163 MathFunction::Square,
164 ],
165 },
166 GenerationStrategy::Interactions { max_depth: 2 },
167 ],
168 max_features: 200,
169 selection_method: SelectionMethod::MutualInformation,
170 selection_threshold: 0.01,
171 include_original: true,
172 random_state: None,
173 max_interaction_depth: 2,
174 remove_correlated: true,
175 correlation_threshold: 0.95,
176 scale_features: true,
177 }
178 }
179}
180
181impl AutoFeatureConfig {
182 pub fn new() -> Self {
184 Self::default()
185 }
186
187 pub fn with_strategy(mut self, strategy: GenerationStrategy) -> Self {
189 self.strategies.push(strategy);
190 self
191 }
192
193 pub fn with_max_features(mut self, max_features: usize) -> Self {
195 self.max_features = max_features;
196 self
197 }
198
199 pub fn with_selection_method(mut self, method: SelectionMethod) -> Self {
201 self.selection_method = method;
202 self
203 }
204
205 pub fn with_selection_threshold(mut self, threshold: f64) -> Self {
207 self.selection_threshold = threshold;
208 self
209 }
210
211 pub fn with_include_original(mut self, include: bool) -> Self {
213 self.include_original = include;
214 self
215 }
216
217 pub fn with_random_state(mut self, seed: u64) -> Self {
219 self.random_state = Some(seed);
220 self
221 }
222}
223
224pub struct AutoFeatureEngineer<State = Untrained> {
226 config: AutoFeatureConfig,
227 state: PhantomData<State>,
228}
229
230pub struct AutoFeatureEngineerFitted {
232 config: AutoFeatureConfig,
233 selected_features: Vec<usize>,
234 feature_names: Vec<String>,
235 feature_scores: Vec<f64>,
236 transformation_functions: Vec<TransformationFunction>,
237 n_original_features: usize,
238 feature_importance: Vec<f64>,
239 correlation_matrix: Option<Array2<f64>>,
240}
241
/// Description of how one generated feature was derived from the input.
#[derive(Debug, Clone)]
pub struct TransformationFunction {
    /// Feature name, e.g. `poly_0_2` or `interact_0_1`.
    pub name: String,
    /// Kind of transformation together with its parameters.
    pub function_type: TransformationType,
    /// Indices of the input columns the feature is computed from.
    pub input_indices: Vec<usize>,
    /// Additional named parameters (left empty by the generators in this file).
    pub parameters: HashMap<String, f64>,
}
250
/// Kind of transformation recorded for a generated feature.
#[derive(Debug, Clone)]
pub enum TransformationType {
    /// Input column raised to `degree`.
    Polynomial { degree: usize },
    /// Element-wise mathematical function.
    Mathematical { function: MathFunction },
    /// Product of the input columns.
    Interaction,
    /// Equal-width binning; `bins` holds the bin edges.
    Binning { bins: Vec<f64> },
    /// Ratio of the first input column to the second.
    Ratio,
    /// Aggregate over a window of `window_size` values.
    Aggregation { window_size: usize },
    /// Frequency encoding; `mapping` maps a stringified value key to its count.
    FrequencyEncoding { mapping: HashMap<String, f64> },
}
262
impl AutoFeatureEngineer<Untrained> {
    /// Creates an untrained engineer that will use `config` when fitted.
    pub fn new(config: AutoFeatureConfig) -> Self {
        Self {
            config,
            state: PhantomData,
        }
    }

    /// Returns the configuration this engineer was created with.
    pub fn config(&self) -> &AutoFeatureConfig {
        &self.config
    }
}
277
278impl Fit<Array2<f64>, Array1<f64>> for AutoFeatureEngineer<Untrained> {
279 type Fitted = AutoFeatureEngineerFitted;
280
281 fn fit(self, x: &Array2<f64>, y: &Array1<f64>) -> Result<AutoFeatureEngineerFitted> {
282 if x.is_empty() || y.is_empty() {
283 return Err(SklearsError::InvalidInput(
284 "Input arrays cannot be empty".to_string(),
285 ));
286 }
287
288 let (n_samples, n_features) = x.dim();
289 if y.len() != n_samples {
290 return Err(SklearsError::InvalidInput(
291 "X and y must have the same number of samples".to_string(),
292 ));
293 }
294
295 let mut generated_features = Vec::new();
297 let mut transformation_functions = Vec::new();
298 let mut feature_names = Vec::new();
299
300 if self.config.include_original {
302 for i in 0..n_features {
303 feature_names.push(format!("original_{}", i));
304 generated_features.push(x.column(i).to_owned());
305 transformation_functions.push(TransformationFunction {
306 name: format!("original_{}", i),
307 function_type: TransformationType::Mathematical {
308 function: MathFunction::Abs,
309 }, input_indices: vec![i],
311 parameters: HashMap::new(),
312 });
313 }
314 }
315
316 for strategy in &self.config.strategies {
318 let (strategy_features, strategy_transforms, strategy_names) =
319 self.generate_features_for_strategy(x, strategy)?;
320
321 generated_features.extend(strategy_features);
322 transformation_functions.extend(strategy_transforms);
323 feature_names.extend(strategy_names);
324 }
325
326 let n_generated = generated_features.len();
328 if n_generated == 0 {
329 return Err(SklearsError::InvalidInput(
330 "No features were generated".to_string(),
331 ));
332 }
333
334 let mut feature_matrix = Array2::zeros((n_samples, n_generated));
335 for (i, feature) in generated_features.iter().enumerate() {
336 for (j, &value) in feature.iter().enumerate() {
337 feature_matrix[[j, i]] = value;
338 }
339 }
340
341 let feature_matrix = if self.config.scale_features {
343 scale_features(&feature_matrix)?
344 } else {
345 feature_matrix
346 };
347
348 let (feature_matrix, feature_indices) = if self.config.remove_correlated {
350 remove_correlated_features(&feature_matrix, self.config.correlation_threshold)?
351 } else {
352 let indices: Vec<usize> = (0..n_generated).collect();
353 (feature_matrix, indices)
354 };
355
356 let mut filtered_transforms = Vec::new();
358 let mut filtered_names = Vec::new();
359 for &idx in &feature_indices {
360 if idx < transformation_functions.len() {
361 filtered_transforms.push(transformation_functions[idx].clone());
362 filtered_names.push(feature_names[idx].clone());
363 }
364 }
365
366 let feature_scores = self.calculate_feature_scores(&feature_matrix, y)?;
368
369 let selected_features = self.select_features(&feature_scores)?;
371
372 let feature_importance =
374 self.calculate_feature_importance(&feature_matrix, y, &selected_features)?;
375
376 let correlation_matrix = if feature_matrix.ncols() <= 1000 {
378 Some(calculate_correlation_matrix(&feature_matrix)?)
380 } else {
381 None
382 };
383
384 Ok(AutoFeatureEngineerFitted {
385 config: self.config,
386 selected_features,
387 feature_names: filtered_names,
388 feature_scores,
389 transformation_functions: filtered_transforms,
390 n_original_features: n_features,
391 feature_importance,
392 correlation_matrix,
393 })
394 }
395}
396
397impl AutoFeatureEngineer<Untrained> {
/// Dispatches to the generator that implements `strategy`.
///
/// Returns the generated columns together with their transformation
/// descriptions and names, all three in the same order.
fn generate_features_for_strategy(
    &self,
    x: &Array2<f64>,
    strategy: &GenerationStrategy,
) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
    match strategy {
        GenerationStrategy::Polynomial { degree } => {
            self.generate_polynomial_features(x, *degree)
        }
        GenerationStrategy::Mathematical { functions } => {
            self.generate_mathematical_features(x, functions)
        }
        GenerationStrategy::Interactions { max_depth } => {
            self.generate_interaction_features(x, *max_depth)
        }
        GenerationStrategy::Binning { n_bins } => self.generate_binning_features(x, *n_bins),
        GenerationStrategy::Ratios => self.generate_ratio_features(x),
        GenerationStrategy::Aggregations { window_size } => {
            self.generate_aggregation_features(x, *window_size)
        }
        GenerationStrategy::FrequencyEncoding => self.generate_frequency_encoding_features(x),
        GenerationStrategy::DomainSpecific { domain } => {
            self.generate_domain_specific_features(x, domain)
        }
    }
}
425
426 fn generate_polynomial_features(
428 &self,
429 x: &Array2<f64>,
430 degree: usize,
431 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
432 let (_n_samples, n_features) = x.dim();
433 let mut features = Vec::new();
434 let mut transforms = Vec::new();
435 let mut names = Vec::new();
436
437 for i in 0..n_features {
439 let column = x.column(i);
440 for d in 2..=degree {
441 let poly_feature = column.mapv(|x| x.powi(d as i32));
442 features.push(poly_feature);
443 transforms.push(TransformationFunction {
444 name: format!("poly_{}_{}", i, d),
445 function_type: TransformationType::Polynomial { degree: d },
446 input_indices: vec![i],
447 parameters: HashMap::new(),
448 });
449 names.push(format!("poly_{}_{}", i, d));
450 }
451 }
452
453 Ok((features, transforms, names))
454 }
455
456 fn generate_mathematical_features(
458 &self,
459 x: &Array2<f64>,
460 functions: &[MathFunction],
461 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
462 let (_n_samples, n_features) = x.dim();
463 let mut features = Vec::new();
464 let mut transforms = Vec::new();
465 let mut names = Vec::new();
466
467 for i in 0..n_features {
468 let column = x.column(i);
469 for &function in functions {
470 let transformed = apply_math_function(&column.to_owned(), function)?;
471 features.push(transformed);
472 transforms.push(TransformationFunction {
473 name: format!("{}_{}", math_function_name(function), i),
474 function_type: TransformationType::Mathematical { function },
475 input_indices: vec![i],
476 parameters: HashMap::new(),
477 });
478 names.push(format!("{}_{}", math_function_name(function), i));
479 }
480 }
481
482 Ok((features, transforms, names))
483 }
484
485 fn generate_interaction_features(
487 &self,
488 x: &Array2<f64>,
489 max_depth: usize,
490 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
491 let (_n_samples, n_features) = x.dim();
492 let mut features = Vec::new();
493 let mut transforms = Vec::new();
494 let mut names = Vec::new();
495
496 if max_depth >= 2 {
498 for i in 0..n_features {
499 for j in (i + 1)..n_features {
500 let col_i = x.column(i);
501 let col_j = x.column(j);
502 let interaction = &col_i * &col_j;
503 features.push(interaction);
504 transforms.push(TransformationFunction {
505 name: format!("interact_{}_{}", i, j),
506 function_type: TransformationType::Interaction,
507 input_indices: vec![i, j],
508 parameters: HashMap::new(),
509 });
510 names.push(format!("interact_{}_{}", i, j));
511 }
512 }
513 }
514
515 if max_depth >= 3 && n_features >= 3 {
517 for i in 0..n_features {
518 for j in (i + 1)..n_features {
519 for k in (j + 1)..n_features {
520 let col_i = x.column(i);
521 let col_j = x.column(j);
522 let col_k = x.column(k);
523 let interaction = &(&col_i * &col_j) * &col_k;
524 features.push(interaction);
525 transforms.push(TransformationFunction {
526 name: format!("interact_{}_{}_{}", i, j, k),
527 function_type: TransformationType::Interaction,
528 input_indices: vec![i, j, k],
529 parameters: HashMap::new(),
530 });
531 names.push(format!("interact_{}_{}_{}", i, j, k));
532 }
533 }
534 }
535 }
536
537 Ok((features, transforms, names))
538 }
539
540 fn generate_binning_features(
542 &self,
543 x: &Array2<f64>,
544 n_bins: usize,
545 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
546 let (_n_samples, n_features) = x.dim();
547 let mut features = Vec::new();
548 let mut transforms = Vec::new();
549 let mut names = Vec::new();
550
551 for i in 0..n_features {
552 let column = x.column(i);
553 let (min_val, max_val) = column
554 .iter()
555 .fold((f64::INFINITY, f64::NEG_INFINITY), |(min, max), &val| {
556 (min.min(val), max.max(val))
557 });
558
559 if (max_val - min_val).abs() < f64::EPSILON {
560 continue; }
562
563 let bin_width = (max_val - min_val) / n_bins as f64;
564 let bins: Vec<f64> = (0..=n_bins)
565 .map(|b| min_val + b as f64 * bin_width)
566 .collect();
567
568 let binned_feature = column.mapv(|x| {
569 let bin_index = ((x - min_val) / bin_width).floor() as usize;
570 bin_index.min(n_bins - 1) as f64
571 });
572
573 features.push(binned_feature);
574 transforms.push(TransformationFunction {
575 name: format!("bin_{}", i),
576 function_type: TransformationType::Binning { bins: bins.clone() },
577 input_indices: vec![i],
578 parameters: HashMap::new(),
579 });
580 names.push(format!("bin_{}", i));
581 }
582
583 Ok((features, transforms, names))
584 }
585
586 fn generate_ratio_features(
588 &self,
589 x: &Array2<f64>,
590 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
591 let (_n_samples, n_features) = x.dim();
592 let mut features = Vec::new();
593 let mut transforms = Vec::new();
594 let mut names = Vec::new();
595
596 for i in 0..n_features {
597 for j in 0..n_features {
598 if i != j {
599 let col_i = x.column(i);
600 let col_j = x.column(j);
601
602 let ratio = col_i
604 .iter()
605 .zip(col_j.iter())
606 .map(|(&a, &b)| if b.abs() < 1e-8 { 0.0 } else { a / b })
607 .collect::<Vec<f64>>();
608
609 features.push(Array1::from_vec(ratio));
610 transforms.push(TransformationFunction {
611 name: format!("ratio_{}_{}", i, j),
612 function_type: TransformationType::Ratio,
613 input_indices: vec![i, j],
614 parameters: HashMap::new(),
615 });
616 names.push(format!("ratio_{}_{}", i, j));
617 }
618 }
619 }
620
621 Ok((features, transforms, names))
622 }
623
624 fn generate_aggregation_features(
626 &self,
627 x: &Array2<f64>,
628 window_size: usize,
629 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
630 let (n_samples, n_features) = x.dim();
631 let mut features = Vec::new();
632 let mut transforms = Vec::new();
633 let mut names = Vec::new();
634
635 if window_size >= n_samples {
636 return Ok((features, transforms, names)); }
638
639 for i in 0..n_features {
640 let column = x.column(i);
641
642 let rolling_mean = (0..n_samples)
644 .map(|idx| {
645 let start = idx.saturating_sub(window_size / 2);
646 let end = (idx + window_size / 2 + 1).min(n_samples);
647 let window = &column.slice(scirs2_core::ndarray::s![start..end]);
648 window.mean().unwrap_or(0.0)
649 })
650 .collect::<Vec<f64>>();
651
652 features.push(Array1::from_vec(rolling_mean));
653 transforms.push(TransformationFunction {
654 name: format!("rolling_mean_{}_{}", i, window_size),
655 function_type: TransformationType::Aggregation { window_size },
656 input_indices: vec![i],
657 parameters: HashMap::new(),
658 });
659 names.push(format!("rolling_mean_{}_{}", i, window_size));
660 }
661
662 Ok((features, transforms, names))
663 }
664
665 fn generate_frequency_encoding_features(
667 &self,
668 x: &Array2<f64>,
669 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
670 let (_n_samples, n_features) = x.dim();
671 let mut features = Vec::new();
672 let mut transforms = Vec::new();
673 let mut names = Vec::new();
674
675 for i in 0..n_features {
676 let column = x.column(i);
677
678 let mut frequency_map: HashMap<i64, i32> = HashMap::new();
680 for &value in column.iter() {
681 let rounded = (value * 1000.0).round() as i64; *frequency_map.entry(rounded).or_insert(0) += 1;
683 }
684
685 let freq_encoded = column.mapv(|x| {
687 let rounded = (x * 1000.0).round() as i64;
688 *frequency_map.get(&rounded).unwrap_or(&0) as f64
689 });
690
691 features.push(freq_encoded);
692 transforms.push(TransformationFunction {
693 name: format!("freq_encode_{}", i),
694 function_type: TransformationType::FrequencyEncoding {
695 mapping: frequency_map
696 .iter()
697 .map(|(&k, &v)| (k.to_string(), v as f64))
698 .collect(),
699 },
700 input_indices: vec![i],
701 parameters: HashMap::new(),
702 });
703 names.push(format!("freq_encode_{}", i));
704 }
705
706 Ok((features, transforms, names))
707 }
708
/// Dispatches to the feature generator for the given data `domain`.
fn generate_domain_specific_features(
    &self,
    x: &Array2<f64>,
    domain: &Domain,
) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
    match domain {
        Domain::TimeSeries => self.generate_time_series_features(x),
        Domain::Financial => self.generate_financial_features(x),
        Domain::Text => self.generate_text_features(x),
        Domain::Image => self.generate_image_features(x),
        Domain::Generic => self.generate_generic_features(x),
    }
}
723
724 fn generate_time_series_features(
726 &self,
727 x: &Array2<f64>,
728 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
729 let mut features = Vec::new();
730 let mut transforms = Vec::new();
731 let mut names = Vec::new();
732
733 for i in 0..x.ncols() {
735 let column = x.column(i);
736
737 if column.len() > 1 {
739 let diff = (1..column.len())
740 .map(|j| column[j] - column[j - 1])
741 .collect::<Vec<f64>>();
742 let mut diff_feature = vec![0.0]; diff_feature.extend(diff);
744
745 features.push(Array1::from_vec(diff_feature));
746 transforms.push(TransformationFunction {
747 name: format!("diff_{}", i),
748 function_type: TransformationType::Mathematical {
749 function: MathFunction::Abs,
750 }, input_indices: vec![i],
752 parameters: HashMap::new(),
753 });
754 names.push(format!("diff_{}", i));
755 }
756 }
757
758 Ok((features, transforms, names))
759 }
760
/// Generates features for financial data.
///
/// Currently delegates to the generic row-statistics generator.
fn generate_financial_features(
    &self,
    x: &Array2<f64>,
) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
    self.generate_generic_features(x)
}
769
/// Generates features for text-derived data.
///
/// Currently delegates to the generic row-statistics generator.
fn generate_text_features(
    &self,
    x: &Array2<f64>,
) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
    self.generate_generic_features(x)
}
778
/// Generates features for image-derived data.
///
/// Currently delegates to the generic row-statistics generator.
fn generate_image_features(
    &self,
    x: &Array2<f64>,
) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
    self.generate_generic_features(x)
}
787
788 fn generate_generic_features(
790 &self,
791 x: &Array2<f64>,
792 ) -> Result<(Vec<Array1<f64>>, Vec<TransformationFunction>, Vec<String>)> {
793 let mut features = Vec::new();
795 let mut transforms = Vec::new();
796 let mut names = Vec::new();
797
798 for stat_name in &["sum", "mean", "std", "min", "max"] {
800 let stat_feature = (0..x.nrows())
801 .map(|i| {
802 let row = x.row(i);
803 match *stat_name {
804 "sum" => row.sum(),
805 "mean" => row.mean().unwrap_or(0.0),
806 "std" => {
807 let mean = row.mean().unwrap_or(0.0);
808 let variance = row.mapv(|x| (x - mean).powi(2)).mean().unwrap_or(0.0);
809 variance.sqrt()
810 }
811 "min" => row.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
812 "max" => row.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
813 _ => 0.0,
814 }
815 })
816 .collect::<Vec<f64>>();
817
818 features.push(Array1::from_vec(stat_feature));
819 transforms.push(TransformationFunction {
820 name: format!("row_{}", stat_name),
821 function_type: TransformationType::Aggregation {
822 window_size: x.ncols(),
823 },
824 input_indices: (0..x.ncols()).collect(),
825 parameters: HashMap::new(),
826 });
827 names.push(format!("row_{}", stat_name));
828 }
829
830 Ok((features, transforms, names))
831 }
832
833 fn calculate_feature_scores(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
835 match self.config.selection_method {
836 SelectionMethod::Correlation => calculate_correlation_scores(x, y),
837 SelectionMethod::Variance => calculate_variance_scores(x),
838 SelectionMethod::MutualInformation => calculate_mutual_information_scores(x, y),
839 _ => {
840 calculate_correlation_scores(x, y)
842 }
843 }
844 }
845
846 fn select_features(&self, scores: &[f64]) -> Result<Vec<usize>> {
848 let mut indexed_scores: Vec<(usize, f64)> = scores
849 .iter()
850 .enumerate()
851 .map(|(i, &score)| (i, score.abs()))
852 .collect();
853
854 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
856
857 let selected: Vec<usize> = indexed_scores
859 .iter()
860 .filter(|(_, score)| *score >= self.config.selection_threshold)
861 .take(self.config.max_features)
862 .map(|(idx, _)| *idx)
863 .collect();
864
865 if selected.is_empty() {
866 Ok(vec![indexed_scores[0].0])
868 } else {
869 Ok(selected)
870 }
871 }
872
873 fn calculate_feature_importance(
875 &self,
876 x: &Array2<f64>,
877 y: &Array1<f64>,
878 selected_features: &[usize],
879 ) -> Result<Vec<f64>> {
880 let mut importance = vec![0.0; selected_features.len()];
882
883 for (i, &feature_idx) in selected_features.iter().enumerate() {
884 if feature_idx < x.ncols() {
885 let feature_col = x.column(feature_idx).to_owned();
886 let correlation = calculate_correlation(&feature_col, y)?;
887 importance[i] = correlation.abs();
888 }
889 }
890
891 Ok(importance)
892 }
893}
894
895impl AutoFeatureEngineerFitted {
896 pub fn selected_features(&self) -> &[usize] {
898 &self.selected_features
899 }
900
901 pub fn feature_names(&self) -> &[String] {
903 &self.feature_names
904 }
905
906 pub fn feature_scores(&self) -> &[f64] {
908 &self.feature_scores
909 }
910
911 pub fn feature_importance(&self) -> &[f64] {
913 &self.feature_importance
914 }
915
916 pub fn transformations(&self) -> &[TransformationFunction] {
918 &self.transformation_functions
919 }
920
921 pub fn correlation_matrix(&self) -> Option<&Array2<f64>> {
923 self.correlation_matrix.as_ref()
924 }
925}
926
927impl Transform<Array2<f64>, Array2<f64>> for AutoFeatureEngineerFitted {
928 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
929 if x.is_empty() {
930 return Err(SklearsError::InvalidInput(
931 "Input array is empty".to_string(),
932 ));
933 }
934
935 let (n_samples, n_features) = x.dim();
936 if n_features != self.n_original_features {
937 return Err(SklearsError::InvalidInput(format!(
938 "Feature count mismatch: expected {}, got {}",
939 self.n_original_features, n_features
940 )));
941 }
942
943 let mut result = Array2::zeros((n_samples, self.selected_features.len()));
948
949 for (out_idx, &in_idx) in self.selected_features.iter().enumerate() {
950 if in_idx < n_features {
951 for (row_idx, &value) in x.column(in_idx).iter().enumerate() {
953 result[[row_idx, out_idx]] = value;
954 }
955 }
956 }
959
960 Ok(result)
961 }
962}
963
964fn apply_math_function(arr: &Array1<f64>, function: MathFunction) -> Result<Array1<f64>> {
968 let result = match function {
969 MathFunction::Log => arr.mapv(|x| if x > 0.0 { x.ln() } else { f64::NEG_INFINITY }),
970 MathFunction::Log1p => arr.mapv(|x| (1.0 + x).ln()),
971 MathFunction::Sqrt => arr.mapv(|x| if x >= 0.0 { x.sqrt() } else { 0.0 }),
972 MathFunction::Square => arr.mapv(|x| x * x),
973 MathFunction::Exp => arr.mapv(|x| x.exp()),
974 MathFunction::Sin => arr.mapv(|x| x.sin()),
975 MathFunction::Cos => arr.mapv(|x| x.cos()),
976 MathFunction::Tan => arr.mapv(|x| x.tan()),
977 MathFunction::Abs => arr.mapv(|x| x.abs()),
978 MathFunction::Reciprocal => arr.mapv(|x| if x.abs() > 1e-8 { 1.0 / x } else { 0.0 }),
979 };
980 Ok(result)
981}
982
/// Returns the lowercase name used as a prefix in generated feature names.
fn math_function_name(function: MathFunction) -> &'static str {
    match function {
        MathFunction::Log => "log",
        MathFunction::Log1p => "log1p",
        MathFunction::Sqrt => "sqrt",
        MathFunction::Square => "square",
        MathFunction::Exp => "exp",
        MathFunction::Sin => "sin",
        MathFunction::Cos => "cos",
        MathFunction::Tan => "tan",
        MathFunction::Abs => "abs",
        MathFunction::Reciprocal => "reciprocal",
    }
}
998
999fn scale_features(x: &Array2<f64>) -> Result<Array2<f64>> {
1001 let mut result = x.clone();
1002 let n_features = x.ncols();
1003
1004 for i in 0..n_features {
1005 let col = x.column(i);
1006 let mean = col.mean().unwrap_or(0.0);
1007 let std = {
1008 let variance = col.mapv(|x| (x - mean).powi(2)).mean().unwrap_or(0.0);
1009 variance.sqrt()
1010 };
1011
1012 if std > 1e-8 {
1013 for j in 0..x.nrows() {
1014 result[[j, i]] = (result[[j, i]] - mean) / std;
1015 }
1016 }
1017 }
1018
1019 Ok(result)
1020}
1021
1022fn remove_correlated_features(
1024 x: &Array2<f64>,
1025 threshold: f64,
1026) -> Result<(Array2<f64>, Vec<usize>)> {
1027 let n_features = x.ncols();
1028 let mut to_remove = HashSet::new();
1029
1030 for i in 0..n_features {
1032 for j in (i + 1)..n_features {
1033 if to_remove.contains(&i) || to_remove.contains(&j) {
1034 continue;
1035 }
1036
1037 let corr = calculate_correlation(&x.column(i).to_owned(), &x.column(j).to_owned())?;
1038 if corr.abs() > threshold {
1039 let var_i = x.column(i).var(0.0);
1041 let var_j = x.column(j).var(0.0);
1042 if var_i < var_j {
1043 to_remove.insert(i);
1044 } else {
1045 to_remove.insert(j);
1046 }
1047 }
1048 }
1049 }
1050
1051 let remaining_features: Vec<usize> =
1053 (0..n_features).filter(|i| !to_remove.contains(i)).collect();
1054
1055 if remaining_features.is_empty() {
1056 return Ok((x.clone(), (0..n_features).collect()));
1057 }
1058
1059 let mut result = Array2::zeros((x.nrows(), remaining_features.len()));
1060 for (new_idx, &old_idx) in remaining_features.iter().enumerate() {
1061 for (row_idx, &value) in x.column(old_idx).iter().enumerate() {
1062 result[[row_idx, new_idx]] = value;
1063 }
1064 }
1065
1066 Ok((result, remaining_features))
1067}
1068
1069fn calculate_correlation(x: &Array1<f64>, y: &Array1<f64>) -> Result<f64> {
1071 if x.len() != y.len() {
1072 return Err(SklearsError::InvalidInput(
1073 "Arrays must have the same length".to_string(),
1074 ));
1075 }
1076
1077 let mean_x = x.mean().unwrap_or(0.0);
1078 let mean_y = y.mean().unwrap_or(0.0);
1079
1080 let mut numerator = 0.0;
1081 let mut sum_sq_x = 0.0;
1082 let mut sum_sq_y = 0.0;
1083
1084 for (&xi, &yi) in x.iter().zip(y.iter()) {
1085 let dx = xi - mean_x;
1086 let dy = yi - mean_y;
1087 numerator += dx * dy;
1088 sum_sq_x += dx * dx;
1089 sum_sq_y += dy * dy;
1090 }
1091
1092 let denominator = (sum_sq_x * sum_sq_y).sqrt();
1093 if denominator < 1e-8 {
1094 Ok(0.0)
1095 } else {
1096 Ok(numerator / denominator)
1097 }
1098}
1099
1100fn calculate_correlation_scores(x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
1102 let mut scores = Vec::new();
1103 for i in 0..x.ncols() {
1104 let correlation = calculate_correlation(&x.column(i).to_owned(), y)?;
1105 scores.push(correlation.abs());
1106 }
1107 Ok(scores)
1108}
1109
1110fn calculate_variance_scores(x: &Array2<f64>) -> Result<Vec<f64>> {
1112 let mut scores = Vec::new();
1113 for i in 0..x.ncols() {
1114 let variance = x.column(i).var(0.0);
1115 scores.push(variance);
1116 }
1117 Ok(scores)
1118}
1119
/// Scores each column of `x` against `y` for mutual-information selection.
///
/// Currently this delegates to the absolute-correlation scores; no mutual
/// information estimate is computed here.
fn calculate_mutual_information_scores(x: &Array2<f64>, y: &Array1<f64>) -> Result<Vec<f64>> {
    calculate_correlation_scores(x, y)
}
1126
1127fn calculate_correlation_matrix(x: &Array2<f64>) -> Result<Array2<f64>> {
1129 let n_features = x.ncols();
1130 let mut corr_matrix = Array2::zeros((n_features, n_features));
1131
1132 for i in 0..n_features {
1133 for j in 0..n_features {
1134 if i == j {
1135 corr_matrix[[i, j]] = 1.0;
1136 } else {
1137 let corr = calculate_correlation(&x.column(i).to_owned(), &x.column(j).to_owned())?;
1138 corr_matrix[[i, j]] = corr;
1139 }
1140 }
1141 }
1142
1143 Ok(corr_matrix)
1144}
1145
// Unit tests for configuration, fitting, transformation and the numeric helpers.
// `non_snake_case` is allowed so matrices can use the conventional `X` name.
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::{arr1, arr2};

    /// Builder methods override defaults and append strategies.
    #[test]
    fn test_auto_feature_config() {
        let config = AutoFeatureConfig::new()
            .with_max_features(50)
            .with_selection_threshold(0.05)
            .with_strategy(GenerationStrategy::Polynomial { degree: 3 });

        assert_eq!(config.max_features, 50);
        assert_relative_eq!(config.selection_threshold, 0.05);
        // Three default strategies plus the one added above.
        assert_eq!(config.strategies.len(), 4);
    }

    /// A new engineer exposes the configuration it was built with.
    #[test]
    fn test_auto_feature_engineer_creation() {
        let config = AutoFeatureConfig::new();
        let engineer = AutoFeatureEngineer::new(config);
        assert_eq!(engineer.config().max_features, 200);
    }

    /// Fitting on a small data set selects at least one feature.
    #[test]
    fn test_auto_feature_engineer_fit() {
        let config = AutoFeatureConfig::new()
            .with_max_features(10)
            // A zero threshold accepts every feature.
            .with_selection_threshold(0.0);
        let engineer = AutoFeatureEngineer::new(config);

        let X = arr2(&[[1.0, 2.0], [2.0, 4.0], [3.0, 6.0], [4.0, 8.0]]);
        let y = arr1(&[1.0, 2.0, 3.0, 4.0]);

        let fitted = engineer.fit(&X, &y).unwrap();
        assert!(!fitted.selected_features().is_empty());
        assert!(!fitted.feature_names().is_empty());
    }

    /// `Sqrt` and `Square` produce the expected element-wise values.
    #[test]
    fn test_mathematical_functions() {
        let arr = arr1(&[1.0, 2.0, 3.0, 4.0]);

        let sqrt_result = apply_math_function(&arr, MathFunction::Sqrt).unwrap();
        let expected_sqrt = arr1(&[1.0, 2.0_f64.sqrt(), 3.0_f64.sqrt(), 2.0]);

        for (a, b) in sqrt_result.iter().zip(expected_sqrt.iter()) {
            assert_relative_eq!(a, b, epsilon = 1e-10);
        }

        let square_result = apply_math_function(&arr, MathFunction::Square).unwrap();
        let expected_square = arr1(&[1.0, 4.0, 9.0, 16.0]);

        for (a, b) in square_result.iter().zip(expected_square.iter()) {
            assert_relative_eq!(a, b, epsilon = 1e-10);
        }
    }

    /// Perfectly linear relationships give correlations of +1 and -1.
    #[test]
    fn test_correlation_calculation() {
        let x = arr1(&[1.0, 2.0, 3.0, 4.0, 5.0]);
        // y = 2x: perfect positive correlation.
        let y = arr1(&[2.0, 4.0, 6.0, 8.0, 10.0]);
        let corr = calculate_correlation(&x, &y).unwrap();
        assert_relative_eq!(corr, 1.0, epsilon = 1e-10);

        // z decreases as x increases: perfect negative correlation.
        let z = arr1(&[5.0, 4.0, 3.0, 2.0, 1.0]);
        let corr_neg = calculate_correlation(&x, &z).unwrap();
        assert_relative_eq!(corr_neg, -1.0, epsilon = 1e-10);
    }

    /// Scaled columns have zero mean and unit population standard deviation.
    #[test]
    fn test_feature_scaling() {
        let X = arr2(&[[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]);

        let scaled = scale_features(&X).unwrap();

        for i in 0..scaled.ncols() {
            let col = scaled.column(i);
            let mean = col.mean().unwrap();
            let std = col.mapv(|x| (x - mean).powi(2)).mean().unwrap().sqrt();

            assert_relative_eq!(mean, 0.0, epsilon = 1e-10);
            assert_relative_eq!(std, 1.0, epsilon = 1e-10);
        }
    }

    /// A fitted engineer transforms new data with the same number of rows.
    #[test]
    fn test_auto_feature_engineer_transform() {
        let config = AutoFeatureConfig::new()
            .with_max_features(5)
            .with_include_original(true);
        let engineer = AutoFeatureEngineer::new(config);

        let X_train = arr2(&[[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]]);
        let y_train = arr1(&[1.0, 2.0, 3.0]);

        let fitted = engineer.fit(&X_train, &y_train).unwrap();

        let X_test = arr2(&[[4.0, 8.0], [5.0, 10.0]]);

        let result = fitted.transform(&X_test).unwrap();
        assert_eq!(result.nrows(), 2);
        assert!(!result.is_empty());
    }

    /// Empty input and mismatched sample counts are rejected by `fit`.
    #[test]
    fn test_error_handling() {
        let config = AutoFeatureConfig::new();
        // Empty input.
        let engineer = AutoFeatureEngineer::new(config);
        let empty_X = Array2::from_shape_vec((0, 0), vec![]).unwrap();
        let empty_y = Array1::from_vec(vec![]);
        assert!(engineer.fit(&empty_X, &empty_y).is_err());

        let config = AutoFeatureConfig::new();
        // Two samples in X but only one target value.
        let engineer = AutoFeatureEngineer::new(config);
        let X = arr2(&[[1.0, 2.0], [3.0, 4.0]]);
        let y = arr1(&[1.0]);
        assert!(engineer.fit(&X, &y).is_err());
    }
}