1use scirs2_core::ndarray::{concatenate, Array1, Array2, ArrayView1, ArrayView2, Axis};
6use sklears_core::{
7 error::Result as SklResult,
8 prelude::SklearsError,
9 types::{Float, FloatBounds},
10};
11use std::collections::HashSet;
12
13pub struct FeatureInteractionDetector {
15 interaction_type: InteractionType,
16 max_interactions: usize,
17 min_correlation: f64,
18 method: DetectionMethod,
19 threshold: f64,
20}
21
/// Kind of relationship a reported feature interaction is labelled with.
///
/// The detector attaches its configured `InteractionType` to every interaction it reports.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InteractionType {
    /// Linear relationship between the features.
    Linear,
    /// Polynomial relationship of the given `degree`.
    Polynomial { degree: usize },
    /// Product-style (multiplicative) relationship.
    Multiplicative,
    /// Relationship established by a statistical test.
    Statistical,
    /// Relationship measured through mutual information.
    MutualInformation,
}
36
/// Strategy the interaction detector uses to find interacting features.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionMethod {
    /// Pairwise correlation between feature columns.
    Correlation,
    /// Mutual-information based detection.
    MutualInfo,
    /// Detection through statistical testing.
    StatisticalTest,
    /// Detection derived from tree-based models.
    TreeBased,
}
49
50impl FeatureInteractionDetector {
51 #[must_use]
53 pub fn new() -> Self {
54 Self {
55 interaction_type: InteractionType::Linear,
56 max_interactions: 100,
57 min_correlation: 0.1,
58 method: DetectionMethod::Correlation,
59 threshold: 0.05,
60 }
61 }
62
63 #[must_use]
65 pub fn interaction_type(mut self, interaction_type: InteractionType) -> Self {
66 self.interaction_type = interaction_type;
67 self
68 }
69
70 #[must_use]
72 pub fn max_interactions(mut self, max: usize) -> Self {
73 self.max_interactions = max;
74 self
75 }
76
77 #[must_use]
79 pub fn min_correlation(mut self, min_corr: f64) -> Self {
80 self.min_correlation = min_corr;
81 self
82 }
83
84 #[must_use]
86 pub fn method(mut self, method: DetectionMethod) -> Self {
87 self.method = method;
88 self
89 }
90
91 #[must_use]
93 pub fn threshold(mut self, threshold: f64) -> Self {
94 self.threshold = threshold;
95 self
96 }
97
98 pub fn detect_interactions(
100 &self,
101 x: &ArrayView2<'_, Float>,
102 y: Option<&ArrayView1<'_, Float>>,
103 ) -> SklResult<Vec<FeatureInteraction>> {
104 match self.method {
105 DetectionMethod::Correlation => self.detect_correlation_interactions(x),
106 DetectionMethod::MutualInfo => self.detect_mutual_info_interactions(x, y),
107 DetectionMethod::StatisticalTest => self.detect_statistical_interactions(x, y),
108 DetectionMethod::TreeBased => self.detect_tree_based_interactions(x, y),
109 }
110 }
111
112 fn detect_correlation_interactions(
113 &self,
114 x: &ArrayView2<'_, Float>,
115 ) -> SklResult<Vec<FeatureInteraction>> {
116 let mut interactions = Vec::new();
117 let n_features = x.ncols();
118
119 for i in 0..n_features {
120 for j in (i + 1)..n_features {
121 let correlation = self.calculate_correlation(&x.column(i), &x.column(j))?;
122
123 if correlation.abs() >= self.min_correlation {
124 interactions.push(FeatureInteraction {
125 feature_indices: vec![i, j],
126 interaction_type: self.interaction_type.clone(),
127 strength: correlation.abs(),
128 p_value: None,
129 });
130 }
131 }
132 }
133
134 interactions.sort_by(|a, b| {
136 b.strength
137 .partial_cmp(&a.strength)
138 .unwrap_or(std::cmp::Ordering::Equal)
139 });
140 interactions.truncate(self.max_interactions);
141
142 Ok(interactions)
143 }
144
145 fn detect_mutual_info_interactions(
146 &self,
147 _x: &ArrayView2<'_, Float>,
148 _y: Option<&ArrayView1<'_, Float>>,
149 ) -> SklResult<Vec<FeatureInteraction>> {
150 Ok(Vec::new())
152 }
153
154 fn detect_statistical_interactions(
155 &self,
156 _x: &ArrayView2<'_, Float>,
157 _y: Option<&ArrayView1<'_, Float>>,
158 ) -> SklResult<Vec<FeatureInteraction>> {
159 Ok(Vec::new())
161 }
162
163 fn detect_tree_based_interactions(
164 &self,
165 _x: &ArrayView2<'_, Float>,
166 _y: Option<&ArrayView1<'_, Float>>,
167 ) -> SklResult<Vec<FeatureInteraction>> {
168 Ok(Vec::new())
170 }
171
172 fn calculate_correlation(
173 &self,
174 x1: &ArrayView1<'_, Float>,
175 x2: &ArrayView1<'_, Float>,
176 ) -> SklResult<f64> {
177 let n = x1.len();
178 if n != x2.len() {
179 return Err(SklearsError::ShapeMismatch {
180 expected: format!("{n}"),
181 actual: format!("{}", x2.len()),
182 });
183 }
184
185 let mean1 = x1.iter().copied().sum::<f64>() / n as f64;
186 let mean2 = x2.iter().copied().sum::<f64>() / n as f64;
187
188 let mut numerator = 0.0;
189 let mut sum_sq1 = 0.0;
190 let mut sum_sq2 = 0.0;
191
192 for i in 0..n {
193 let diff1 = x1[i] - mean1;
194 let diff2 = x2[i] - mean2;
195
196 numerator += diff1 * diff2;
197 sum_sq1 += diff1 * diff1;
198 sum_sq2 += diff2 * diff2;
199 }
200
201 let denominator = (sum_sq1 * sum_sq2).sqrt();
202 if denominator == 0.0 {
203 Ok(0.0)
204 } else {
205 Ok(numerator / denominator)
206 }
207 }
208}
209
210impl Default for FeatureInteractionDetector {
211 fn default() -> Self {
212 Self::new()
213 }
214}
215
/// A single detected interaction between features.
#[derive(Debug, Clone)]
pub struct FeatureInteraction {
    /// Column indices of the features taking part in the interaction.
    pub feature_indices: Vec<usize>,
    /// Kind of interaction this entry is labelled with.
    pub interaction_type: InteractionType,
    /// Interaction strength; the correlation strategy stores the absolute correlation here.
    pub strength: f64,
    /// Optional p-value; the correlation strategy leaves this as `None`.
    pub p_value: Option<f64>,
}
228
/// Configurable pipeline that derives additional features from a numeric matrix.
///
/// Each `enable_*` flag switches one pipeline stage on or off.
#[derive(Debug, Clone)]
pub struct AutoFeatureEngineer {
    /// Whether per-feature power columns are appended.
    enable_polynomial: bool,
    /// Highest power generated by the polynomial stage.
    polynomial_degree: usize,
    /// Whether pairwise product columns are appended.
    enable_interactions: bool,
    /// Whether equal-width bin-index columns are appended.
    enable_binning: bool,
    /// Number of bins used by the binning stage.
    n_bins: usize,
    /// Whether columns are standardised.
    enable_scaling: bool,
    /// Whether the feature-selection stage runs.
    enable_selection: bool,
    /// Maximum number of columns kept by the selection stage; `None` keeps everything.
    max_features: Option<usize>,
}
240
241impl AutoFeatureEngineer {
242 #[must_use]
244 pub fn new() -> Self {
245 Self {
246 enable_polynomial: true,
247 polynomial_degree: 2,
248 enable_interactions: true,
249 enable_binning: false,
250 n_bins: 10,
251 enable_scaling: true,
252 enable_selection: true,
253 max_features: None,
254 }
255 }
256
257 #[must_use]
259 pub fn polynomial_features(mut self, enable: bool, degree: usize) -> Self {
260 self.enable_polynomial = enable;
261 self.polynomial_degree = degree;
262 self
263 }
264
265 #[must_use]
267 pub fn interaction_features(mut self, enable: bool) -> Self {
268 self.enable_interactions = enable;
269 self
270 }
271
272 #[must_use]
274 pub fn binning_features(mut self, enable: bool, n_bins: usize) -> Self {
275 self.enable_binning = enable;
276 self.n_bins = n_bins;
277 self
278 }
279
280 #[must_use]
282 pub fn scaling(mut self, enable: bool) -> Self {
283 self.enable_scaling = enable;
284 self
285 }
286
287 #[must_use]
289 pub fn feature_selection(mut self, enable: bool, max_features: Option<usize>) -> Self {
290 self.enable_selection = enable;
291 self.max_features = max_features;
292 self
293 }
294
295 pub fn generate_features(
297 &self,
298 x: &ArrayView2<'_, Float>,
299 y: Option<&ArrayView1<'_, Float>>,
300 ) -> SklResult<Array2<f64>> {
301 let mut engineered = x.mapv(|v| v);
302
303 if self.enable_polynomial {
304 engineered = self.add_polynomial_features(&engineered)?;
305 }
306
307 if self.enable_interactions {
308 engineered = self.add_interaction_features(&engineered)?;
309 }
310
311 if self.enable_binning {
312 engineered = self.add_binning_features(&engineered)?;
313 }
314
315 if self.enable_scaling {
316 engineered = self.apply_scaling(&engineered)?;
317 }
318
319 if self.enable_selection {
320 engineered = self.select_features(&engineered, y)?;
321 }
322
323 Ok(engineered)
324 }
325
326 fn add_polynomial_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
327 let (n_samples, n_features) = x.dim();
328 let mut features = x.clone();
329
330 for degree in 2..=self.polynomial_degree {
331 for i in 0..n_features {
332 let mut poly_col = Array1::zeros(n_samples);
333 for (j, &val) in x.column(i).iter().enumerate() {
334 poly_col[j] = val.powi(degree as i32);
335 }
336
337 let new_features = concatenate![Axis(1), features, poly_col.insert_axis(Axis(1))];
339 features = new_features;
340 }
341 }
342
343 Ok(features)
344 }
345
346 fn add_interaction_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
347 let (n_samples, n_features) = x.dim();
348 let mut features = x.clone();
349
350 for i in 0..n_features {
351 for j in (i + 1)..n_features {
352 let mut interaction_col = Array1::zeros(n_samples);
353 for k in 0..n_samples {
354 interaction_col[k] = x[[k, i]] * x[[k, j]];
355 }
356
357 let new_features =
359 concatenate![Axis(1), features, interaction_col.insert_axis(Axis(1))];
360 features = new_features;
361 }
362 }
363
364 Ok(features)
365 }
366
367 fn add_binning_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
368 let (n_samples, n_features) = x.dim();
369 let mut features = x.clone();
370
371 for i in 0..n_features {
372 let column = x.column(i);
373 let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
374 let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
375 let bin_width = (max_val - min_val) / self.n_bins as f64;
376
377 let mut binned_col = Array1::zeros(n_samples);
378 for (j, &val) in column.iter().enumerate() {
379 let bin = ((val - min_val) / bin_width).floor() as usize;
380 binned_col[j] = bin.min(self.n_bins - 1) as f64;
381 }
382
383 let new_features = concatenate![Axis(1), features, binned_col.insert_axis(Axis(1))];
385 features = new_features;
386 }
387
388 Ok(features)
389 }
390
391 fn apply_scaling(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
392 let (n_samples, n_features) = x.dim();
393 let mut scaled = Array2::zeros((n_samples, n_features));
394
395 for i in 0..n_features {
396 let column = x.column(i);
397 let mean = column.mean().unwrap_or(0.0);
398 let std = column.var(0.0).sqrt();
399
400 for j in 0..n_samples {
401 scaled[[j, i]] = if std > 0.0 {
402 (x[[j, i]] - mean) / std
403 } else {
404 0.0
405 };
406 }
407 }
408
409 Ok(scaled)
410 }
411
412 fn select_features(
413 &self,
414 x: &Array2<f64>,
415 _y: Option<&ArrayView1<'_, Float>>,
416 ) -> SklResult<Array2<f64>> {
417 let (n_samples, n_features) = x.dim();
419
420 if let Some(max_features) = self.max_features {
421 if max_features >= n_features {
422 return Ok(x.clone());
423 }
424
425 let mut feature_scores = Vec::new();
426
427 for i in 0..n_features {
428 let column = x.column(i);
429 let variance = column.var(0.0);
430 feature_scores.push((i, variance));
431 }
432
433 feature_scores
435 .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
436
437 let selected_indices: Vec<usize> = feature_scores
439 .into_iter()
440 .take(max_features)
441 .map(|(idx, _)| idx)
442 .collect();
443
444 let mut selected = Array2::zeros((n_samples, max_features));
446 for (new_idx, &old_idx) in selected_indices.iter().enumerate() {
447 for j in 0..n_samples {
448 selected[[j, new_idx]] = x[[j, old_idx]];
449 }
450 }
451
452 Ok(selected)
453 } else {
454 Ok(x.clone())
455 }
456 }
457}
458
459impl Default for AutoFeatureEngineer {
460 fn default() -> Self {
461 Self::new()
462 }
463}
464
/// Heuristic detector that assigns a [`ColumnType`] to each column of a numeric matrix.
#[derive(Debug, Clone)]
pub struct ColumnTypeDetector {
    /// A column whose ratio of distinct values to total values falls below this value is
    /// reported as categorical.
    categorical_threshold: f64,
    /// Stored flag; no detection logic in this file reads it yet.
    date_pattern_detection: bool,
    /// Stored flag; no detection logic in this file reads it yet.
    text_detection: bool,
}
471
/// Semantic type assigned to a data column.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ColumnType {
    /// Continuous or otherwise unconstrained numeric values.
    Numeric,
    /// A small set of distinct values relative to the column length.
    Categorical,
    /// Values restricted to `0` and `1`.
    Boolean,
    /// Date or time values.
    DateTime,
    /// Free-form text.
    Text,
    /// Exactly two distinct values.
    Binary,
    /// Ordered categories.
    Ordinal,
}
490
491impl ColumnTypeDetector {
492 #[must_use]
494 pub fn new() -> Self {
495 Self {
496 categorical_threshold: 0.1, date_pattern_detection: true,
498 text_detection: true,
499 }
500 }
501
502 #[must_use]
504 pub fn categorical_threshold(mut self, threshold: f64) -> Self {
505 self.categorical_threshold = threshold;
506 self
507 }
508
509 #[must_use]
511 pub fn date_pattern_detection(mut self, enable: bool) -> Self {
512 self.date_pattern_detection = enable;
513 self
514 }
515
516 #[must_use]
518 pub fn text_detection(mut self, enable: bool) -> Self {
519 self.text_detection = enable;
520 self
521 }
522
523 #[must_use]
525 pub fn detect_types(&self, x: &ArrayView2<'_, Float>) -> Vec<ColumnType> {
526 let mut column_types = Vec::new();
527
528 for i in 0..x.ncols() {
529 let column = x.column(i);
530 let column_type = self.detect_column_type(&column);
531 column_types.push(column_type);
532 }
533
534 column_types
535 }
536
537 fn detect_column_type(&self, column: &ArrayView1<'_, Float>) -> ColumnType {
538 let unique_values = self.count_unique_values(column);
539 let total_values = column.len();
540 let unique_ratio = unique_values as f64 / total_values as f64;
541
542 if unique_values == 2 {
544 return ColumnType::Binary;
545 }
546
547 if self.is_boolean_column(column) {
549 return ColumnType::Boolean;
550 }
551
552 if unique_ratio < self.categorical_threshold {
554 return ColumnType::Categorical;
555 }
556
557 ColumnType::Numeric
559 }
560
561 fn count_unique_values(&self, column: &ArrayView1<'_, Float>) -> usize {
562 let mut unique_set = HashSet::new();
563 for &value in column {
564 let rounded = (value * 1000.0).round() / 1000.0;
566 unique_set.insert(rounded.to_bits());
567 }
568 unique_set.len()
569 }
570
571 fn is_boolean_column(&self, column: &ArrayView1<'_, Float>) -> bool {
572 for &value in column {
573 if value != 0.0 && value != 1.0 {
574 return false;
575 }
576 }
577 true
578 }
579}
580
581impl Default for ColumnTypeDetector {
582 fn default() -> Self {
583 Self::new()
584 }
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_feature_interaction_detector() {
        // Every column is the first column plus a constant, so all three column pairs are
        // perfectly correlated.
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let detector = FeatureInteractionDetector::new().min_correlation(0.5);

        // Fail loudly on an error instead of masking it as an empty result.
        let interactions = detector
            .detect_interactions(&x.view(), None)
            .expect("correlation detection should succeed on a well-formed matrix");

        assert_eq!(interactions.len(), 3);
        for interaction in &interactions {
            assert_eq!(interaction.feature_indices.len(), 2);
            assert!((interaction.strength - 1.0).abs() < 1e-12);
            assert!(interaction.p_value.is_none());
        }
    }

    #[test]
    fn test_auto_feature_engineer() {
        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let engineer = AutoFeatureEngineer::new()
            .polynomial_features(true, 2)
            .interaction_features(true);

        let engineered = engineer
            .generate_features(&x.view(), None)
            .expect("feature generation should succeed on a well-formed matrix");

        // 2 inputs + 2 squares = 4 columns, plus C(4, 2) = 6 pairwise products = 10 columns.
        // Scaling keeps the shape, and selection without a limit keeps every column.
        assert_eq!(engineered.dim(), (3, 10));
        assert!(engineered.iter().all(|value| value.is_finite()));
    }

    #[test]
    fn test_column_type_detector() {
        let x = array![[0.0, 1.0, 5.5], [1.0, 0.0, 6.2], [0.0, 1.0, 7.8]];

        let detector = ColumnTypeDetector::new();
        let types = detector.detect_types(&x.view());

        assert_eq!(types.len(), 3);
        // The 0/1 columns have exactly two distinct values, so the two-value check wins.
        assert_eq!(types[0], ColumnType::Binary);
        assert_eq!(types[1], ColumnType::Binary);
        assert_eq!(types[2], ColumnType::Numeric);
    }
}