1use scirs2_core::ndarray::{concatenate, Array1, Array2, ArrayView1, ArrayView2, Axis};
6use sklears_core::{
7 error::Result as SklResult,
8 prelude::SklearsError,
9 types::{Float, FloatBounds},
10};
11use std::collections::HashSet;
12
13pub struct FeatureInteractionDetector {
15 interaction_type: InteractionType,
16 max_interactions: usize,
17 min_correlation: f64,
18 method: DetectionMethod,
19 threshold: f64,
20}
21
/// Kind of relationship a detected feature interaction is labelled with.
///
/// The label is copied verbatim into each reported interaction; it can be
/// compared with `==`, which makes assertions and filtering straightforward.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InteractionType {
    /// Linear interaction (the detector's default label).
    Linear,
    /// Polynomial interaction of the given degree.
    Polynomial {
        /// Degree of the polynomial term.
        degree: usize,
    },
    /// Multiplicative interaction.
    Multiplicative,
    /// Statistical interaction.
    Statistical,
    /// Mutual-information interaction.
    MutualInformation,
}
36
/// Strategy used by the interaction detector to search for interactions.
///
/// Only [`DetectionMethod::Correlation`] is backed by a real implementation in
/// this file; the remaining strategies currently produce no interactions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectionMethod {
    /// Pairwise Pearson correlation between feature columns.
    Correlation,
    /// Mutual-information based detection (placeholder).
    MutualInfo,
    /// Statistical-test based detection (placeholder).
    StatisticalTest,
    /// Tree-based detection (placeholder).
    TreeBased,
}
49
50impl FeatureInteractionDetector {
51 #[must_use]
53 pub fn new() -> Self {
54 Self {
55 interaction_type: InteractionType::Linear,
56 max_interactions: 100,
57 min_correlation: 0.1,
58 method: DetectionMethod::Correlation,
59 threshold: 0.05,
60 }
61 }
62
63 #[must_use]
65 pub fn interaction_type(mut self, interaction_type: InteractionType) -> Self {
66 self.interaction_type = interaction_type;
67 self
68 }
69
70 #[must_use]
72 pub fn max_interactions(mut self, max: usize) -> Self {
73 self.max_interactions = max;
74 self
75 }
76
77 #[must_use]
79 pub fn min_correlation(mut self, min_corr: f64) -> Self {
80 self.min_correlation = min_corr;
81 self
82 }
83
84 #[must_use]
86 pub fn method(mut self, method: DetectionMethod) -> Self {
87 self.method = method;
88 self
89 }
90
91 #[must_use]
93 pub fn threshold(mut self, threshold: f64) -> Self {
94 self.threshold = threshold;
95 self
96 }
97
98 pub fn detect_interactions(
100 &self,
101 x: &ArrayView2<'_, Float>,
102 y: Option<&ArrayView1<'_, Float>>,
103 ) -> SklResult<Vec<FeatureInteraction>> {
104 match self.method {
105 DetectionMethod::Correlation => self.detect_correlation_interactions(x),
106 DetectionMethod::MutualInfo => self.detect_mutual_info_interactions(x, y),
107 DetectionMethod::StatisticalTest => self.detect_statistical_interactions(x, y),
108 DetectionMethod::TreeBased => self.detect_tree_based_interactions(x, y),
109 }
110 }
111
112 fn detect_correlation_interactions(
113 &self,
114 x: &ArrayView2<'_, Float>,
115 ) -> SklResult<Vec<FeatureInteraction>> {
116 let mut interactions = Vec::new();
117 let n_features = x.ncols();
118
119 for i in 0..n_features {
120 for j in (i + 1)..n_features {
121 let correlation = self.calculate_correlation(&x.column(i), &x.column(j))?;
122
123 if correlation.abs() >= self.min_correlation {
124 interactions.push(FeatureInteraction {
125 feature_indices: vec![i, j],
126 interaction_type: self.interaction_type.clone(),
127 strength: correlation.abs(),
128 p_value: None,
129 });
130 }
131 }
132 }
133
134 interactions.sort_by(|a, b| b.strength.partial_cmp(&a.strength).unwrap());
136 interactions.truncate(self.max_interactions);
137
138 Ok(interactions)
139 }
140
141 fn detect_mutual_info_interactions(
142 &self,
143 _x: &ArrayView2<'_, Float>,
144 _y: Option<&ArrayView1<'_, Float>>,
145 ) -> SklResult<Vec<FeatureInteraction>> {
146 Ok(Vec::new())
148 }
149
150 fn detect_statistical_interactions(
151 &self,
152 _x: &ArrayView2<'_, Float>,
153 _y: Option<&ArrayView1<'_, Float>>,
154 ) -> SklResult<Vec<FeatureInteraction>> {
155 Ok(Vec::new())
157 }
158
159 fn detect_tree_based_interactions(
160 &self,
161 _x: &ArrayView2<'_, Float>,
162 _y: Option<&ArrayView1<'_, Float>>,
163 ) -> SklResult<Vec<FeatureInteraction>> {
164 Ok(Vec::new())
166 }
167
168 fn calculate_correlation(
169 &self,
170 x1: &ArrayView1<'_, Float>,
171 x2: &ArrayView1<'_, Float>,
172 ) -> SklResult<f64> {
173 let n = x1.len();
174 if n != x2.len() {
175 return Err(SklearsError::ShapeMismatch {
176 expected: format!("{n}"),
177 actual: format!("{}", x2.len()),
178 });
179 }
180
181 let mean1 = x1.iter().copied().sum::<f64>() / n as f64;
182 let mean2 = x2.iter().copied().sum::<f64>() / n as f64;
183
184 let mut numerator = 0.0;
185 let mut sum_sq1 = 0.0;
186 let mut sum_sq2 = 0.0;
187
188 for i in 0..n {
189 let diff1 = x1[i] - mean1;
190 let diff2 = x2[i] - mean2;
191
192 numerator += diff1 * diff2;
193 sum_sq1 += diff1 * diff1;
194 sum_sq2 += diff2 * diff2;
195 }
196
197 let denominator = (sum_sq1 * sum_sq2).sqrt();
198 if denominator == 0.0 {
199 Ok(0.0)
200 } else {
201 Ok(numerator / denominator)
202 }
203 }
204}
205
206impl Default for FeatureInteractionDetector {
207 fn default() -> Self {
208 Self::new()
209 }
210}
211
212#[derive(Debug, Clone)]
214pub struct FeatureInteraction {
215 pub feature_indices: Vec<usize>,
217 pub interaction_type: InteractionType,
219 pub strength: f64,
221 pub p_value: Option<f64>,
223}
224
/// Configuration for the automatic feature-engineering pipeline.
///
/// Each `enable_*` flag switches one pipeline stage on or off; the remaining
/// fields parameterise the corresponding stage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AutoFeatureEngineer {
    /// Whether power features are appended.
    enable_polynomial: bool,
    /// Highest power generated when polynomial features are enabled.
    polynomial_degree: usize,
    /// Whether pairwise product features are appended.
    enable_interactions: bool,
    /// Whether equal-width bin-index features are appended.
    enable_binning: bool,
    /// Number of bins used by the binning stage.
    n_bins: usize,
    /// Whether columns are standardised.
    enable_scaling: bool,
    /// Whether the variance-based selection stage runs.
    enable_selection: bool,
    /// Maximum number of columns kept by selection; `None` keeps everything.
    max_features: Option<usize>,
}
236
237impl AutoFeatureEngineer {
238 #[must_use]
240 pub fn new() -> Self {
241 Self {
242 enable_polynomial: true,
243 polynomial_degree: 2,
244 enable_interactions: true,
245 enable_binning: false,
246 n_bins: 10,
247 enable_scaling: true,
248 enable_selection: true,
249 max_features: None,
250 }
251 }
252
253 #[must_use]
255 pub fn polynomial_features(mut self, enable: bool, degree: usize) -> Self {
256 self.enable_polynomial = enable;
257 self.polynomial_degree = degree;
258 self
259 }
260
261 #[must_use]
263 pub fn interaction_features(mut self, enable: bool) -> Self {
264 self.enable_interactions = enable;
265 self
266 }
267
268 #[must_use]
270 pub fn binning_features(mut self, enable: bool, n_bins: usize) -> Self {
271 self.enable_binning = enable;
272 self.n_bins = n_bins;
273 self
274 }
275
276 #[must_use]
278 pub fn scaling(mut self, enable: bool) -> Self {
279 self.enable_scaling = enable;
280 self
281 }
282
283 #[must_use]
285 pub fn feature_selection(mut self, enable: bool, max_features: Option<usize>) -> Self {
286 self.enable_selection = enable;
287 self.max_features = max_features;
288 self
289 }
290
291 pub fn generate_features(
293 &self,
294 x: &ArrayView2<'_, Float>,
295 y: Option<&ArrayView1<'_, Float>>,
296 ) -> SklResult<Array2<f64>> {
297 let mut engineered = x.mapv(|v| v);
298
299 if self.enable_polynomial {
300 engineered = self.add_polynomial_features(&engineered)?;
301 }
302
303 if self.enable_interactions {
304 engineered = self.add_interaction_features(&engineered)?;
305 }
306
307 if self.enable_binning {
308 engineered = self.add_binning_features(&engineered)?;
309 }
310
311 if self.enable_scaling {
312 engineered = self.apply_scaling(&engineered)?;
313 }
314
315 if self.enable_selection {
316 engineered = self.select_features(&engineered, y)?;
317 }
318
319 Ok(engineered)
320 }
321
322 fn add_polynomial_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
323 let (n_samples, n_features) = x.dim();
324 let mut features = x.clone();
325
326 for degree in 2..=self.polynomial_degree {
327 for i in 0..n_features {
328 let mut poly_col = Array1::zeros(n_samples);
329 for (j, &val) in x.column(i).iter().enumerate() {
330 poly_col[j] = val.powi(degree as i32);
331 }
332
333 let new_features = concatenate![Axis(1), features, poly_col.insert_axis(Axis(1))];
335 features = new_features;
336 }
337 }
338
339 Ok(features)
340 }
341
342 fn add_interaction_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
343 let (n_samples, n_features) = x.dim();
344 let mut features = x.clone();
345
346 for i in 0..n_features {
347 for j in (i + 1)..n_features {
348 let mut interaction_col = Array1::zeros(n_samples);
349 for k in 0..n_samples {
350 interaction_col[k] = x[[k, i]] * x[[k, j]];
351 }
352
353 let new_features =
355 concatenate![Axis(1), features, interaction_col.insert_axis(Axis(1))];
356 features = new_features;
357 }
358 }
359
360 Ok(features)
361 }
362
363 fn add_binning_features(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
364 let (n_samples, n_features) = x.dim();
365 let mut features = x.clone();
366
367 for i in 0..n_features {
368 let column = x.column(i);
369 let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
370 let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
371 let bin_width = (max_val - min_val) / self.n_bins as f64;
372
373 let mut binned_col = Array1::zeros(n_samples);
374 for (j, &val) in column.iter().enumerate() {
375 let bin = ((val - min_val) / bin_width).floor() as usize;
376 binned_col[j] = bin.min(self.n_bins - 1) as f64;
377 }
378
379 let new_features = concatenate![Axis(1), features, binned_col.insert_axis(Axis(1))];
381 features = new_features;
382 }
383
384 Ok(features)
385 }
386
387 fn apply_scaling(&self, x: &Array2<f64>) -> SklResult<Array2<f64>> {
388 let (n_samples, n_features) = x.dim();
389 let mut scaled = Array2::zeros((n_samples, n_features));
390
391 for i in 0..n_features {
392 let column = x.column(i);
393 let mean = column.mean().unwrap_or(0.0);
394 let std = column.var(0.0).sqrt();
395
396 for j in 0..n_samples {
397 scaled[[j, i]] = if std > 0.0 {
398 (x[[j, i]] - mean) / std
399 } else {
400 0.0
401 };
402 }
403 }
404
405 Ok(scaled)
406 }
407
408 fn select_features(
409 &self,
410 x: &Array2<f64>,
411 _y: Option<&ArrayView1<'_, Float>>,
412 ) -> SklResult<Array2<f64>> {
413 let (n_samples, n_features) = x.dim();
415
416 if let Some(max_features) = self.max_features {
417 if max_features >= n_features {
418 return Ok(x.clone());
419 }
420
421 let mut feature_scores = Vec::new();
422
423 for i in 0..n_features {
424 let column = x.column(i);
425 let variance = column.var(0.0);
426 feature_scores.push((i, variance));
427 }
428
429 feature_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
431
432 let selected_indices: Vec<usize> = feature_scores
434 .into_iter()
435 .take(max_features)
436 .map(|(idx, _)| idx)
437 .collect();
438
439 let mut selected = Array2::zeros((n_samples, max_features));
441 for (new_idx, &old_idx) in selected_indices.iter().enumerate() {
442 for j in 0..n_samples {
443 selected[[j, new_idx]] = x[[j, old_idx]];
444 }
445 }
446
447 Ok(selected)
448 } else {
449 Ok(x.clone())
450 }
451 }
452}
453
454impl Default for AutoFeatureEngineer {
455 fn default() -> Self {
456 Self::new()
457 }
458}
459
/// Heuristic detector that assigns a [`ColumnType`] to each column of a
/// numeric matrix.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnTypeDetector {
    /// Columns whose ratio of distinct values to total values is below this
    /// threshold are classified as categorical.
    categorical_threshold: f64,
    /// Stored flag for date-pattern detection; not read by the detection
    /// logic in this file.
    date_pattern_detection: bool,
    /// Stored flag for text detection; not read by the detection logic in
    /// this file.
    text_detection: bool,
}
466
/// Semantic type assigned to a data column.
///
/// Equality is total and the type is hashable, so it can be used as a key in
/// sets and maps (for example to count columns per type).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ColumnType {
    /// Numeric column; the fallback when no other rule matches.
    Numeric,
    /// Column with a low ratio of distinct values to total values.
    Categorical,
    /// Column whose values are all `0.0` or `1.0`.
    Boolean,
    /// Date/time column (not produced by the detector in this file).
    DateTime,
    /// Free-text column (not produced by the detector in this file).
    Text,
    /// Column with exactly two distinct values.
    Binary,
    /// Ordinal column (not produced by the detector in this file).
    Ordinal,
}
485
486impl ColumnTypeDetector {
487 #[must_use]
489 pub fn new() -> Self {
490 Self {
491 categorical_threshold: 0.1, date_pattern_detection: true,
493 text_detection: true,
494 }
495 }
496
497 #[must_use]
499 pub fn categorical_threshold(mut self, threshold: f64) -> Self {
500 self.categorical_threshold = threshold;
501 self
502 }
503
504 #[must_use]
506 pub fn date_pattern_detection(mut self, enable: bool) -> Self {
507 self.date_pattern_detection = enable;
508 self
509 }
510
511 #[must_use]
513 pub fn text_detection(mut self, enable: bool) -> Self {
514 self.text_detection = enable;
515 self
516 }
517
518 #[must_use]
520 pub fn detect_types(&self, x: &ArrayView2<'_, Float>) -> Vec<ColumnType> {
521 let mut column_types = Vec::new();
522
523 for i in 0..x.ncols() {
524 let column = x.column(i);
525 let column_type = self.detect_column_type(&column);
526 column_types.push(column_type);
527 }
528
529 column_types
530 }
531
532 fn detect_column_type(&self, column: &ArrayView1<'_, Float>) -> ColumnType {
533 let unique_values = self.count_unique_values(column);
534 let total_values = column.len();
535 let unique_ratio = unique_values as f64 / total_values as f64;
536
537 if unique_values == 2 {
539 return ColumnType::Binary;
540 }
541
542 if self.is_boolean_column(column) {
544 return ColumnType::Boolean;
545 }
546
547 if unique_ratio < self.categorical_threshold {
549 return ColumnType::Categorical;
550 }
551
552 ColumnType::Numeric
554 }
555
556 fn count_unique_values(&self, column: &ArrayView1<'_, Float>) -> usize {
557 let mut unique_set = HashSet::new();
558 for &value in column {
559 let rounded = (value * 1000.0).round() / 1000.0;
561 unique_set.insert(rounded.to_bits());
562 }
563 unique_set.len()
564 }
565
566 fn is_boolean_column(&self, column: &ArrayView1<'_, Float>) -> bool {
567 for &value in column {
568 if value != 0.0 && value != 1.0 {
569 return false;
570 }
571 }
572 true
573 }
574}
575
576impl Default for ColumnTypeDetector {
577 fn default() -> Self {
578 Self::new()
579 }
580}
581
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Perfectly collinear columns must produce at least one interaction.
    #[test]
    fn test_feature_interaction_detector() {
        let data = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let found = FeatureInteractionDetector::new()
            .min_correlation(0.5)
            .detect_interactions(&data.view(), None)
            .unwrap();

        assert!(!found.is_empty());
    }

    /// Polynomial and interaction stages must widen the feature matrix.
    #[test]
    fn test_auto_feature_engineer() {
        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let original_width = data.ncols();

        let pipeline = AutoFeatureEngineer::new()
            .polynomial_features(true, 2)
            .interaction_features(true);
        let output = pipeline.generate_features(&data.view(), None).unwrap();

        assert!(output.ncols() > original_width);
    }

    /// Two 0/1 columns are reported as binary, a continuous one as numeric.
    #[test]
    fn test_column_type_detector() {
        let data = array![[0.0, 1.0, 5.5], [1.0, 0.0, 6.2], [0.0, 1.0, 7.8]];

        let detected = ColumnTypeDetector::new().detect_types(&data.view());

        assert_eq!(detected.len(), 3);
        let expected = [ColumnType::Binary, ColumnType::Binary, ColumnType::Numeric];
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(&detected[index], want);
        }
    }
}