1use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
8use sklears_core::error::{Result as SklResult, SklearsError};
9type Result<T> = SklResult<T>;
10use crate::base::{FeatureSelector, SelectorMixin};
11use sklears_core::traits::{Estimator, Fit, Transform};
12use thiserror::Error;
13
14#[derive(Debug, Error)]
15pub enum FilterError {
16 #[error("Invalid number of features to select: {0}")]
17 InvalidFeatureCount(usize),
18 #[error("Invalid percentile: {0}, must be between 0 and 100")]
19 InvalidPercentile(f64),
20 #[error("Insufficient variance for threshold: {0}")]
21 InsufficientVariance(f64),
22 #[error("Empty feature matrix")]
23 EmptyFeatureMatrix,
24 #[error("Feature selection failed: {0}")]
25 SelectionFailed(String),
26}
27
28impl From<FilterError> for SklearsError {
29 fn from(err: FilterError) -> Self {
30 SklearsError::FitError(format!("Filter selection error: {}", err))
31 }
32}
33
34pub type ScoreFunc = fn(ArrayView2<f64>, ArrayView1<f64>) -> Result<Array1<f64>>;
36
37#[derive(Debug, Clone)]
39pub struct FilterConfig {
40 pub score_func: String,
41 pub k: Option<usize>,
42 pub percentile: Option<f64>,
43 pub threshold: Option<f64>,
44}
45
46impl Default for FilterConfig {
47 fn default() -> Self {
48 Self {
49 score_func: "f_classif".to_string(),
50 k: Some(10),
51 percentile: None,
52 threshold: None,
53 }
54 }
55}
56
57#[derive(Debug, Clone)]
59pub struct FilterResults {
60 pub scores: Array1<f64>,
61 pub selected_features: Vec<usize>,
62 pub feature_names: Option<Vec<String>>,
63}
64
65#[derive(Debug, Clone)]
67pub struct SelectKBest {
68 pub k: usize,
69 pub score_func: String,
70}
71
72impl SelectKBest {
73 pub fn new(k: usize, score_func: &str) -> Self {
74 Self {
75 k,
76 score_func: score_func.to_string(),
77 }
78 }
79}
80
81impl Estimator for SelectKBest {
82 type Config = FilterConfig;
83 type Error = FilterError;
84 type Float = f64;
85
86 fn config(&self) -> &Self::Config {
87 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
90 CONFIG.get_or_init(FilterConfig::default)
91 }
92
93 fn check_compatibility(&self, _n_samples: usize, n_features: usize) -> Result<()> {
94 if self.k > n_features {
95 return Err(FilterError::InvalidFeatureCount(self.k).into());
96 }
97 Ok(())
98 }
99}
100
101impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for SelectKBest {
102 type Fitted = SelectKBestTrained;
103
104 fn fit(self, X: &ArrayView2<'a, f64>, y: &ArrayView1<'a, f64>) -> Result<Self::Fitted> {
105 self.fit_impl(X, y)
106 }
107}
108
109impl Fit<Array2<f64>, Array1<i32>> for SelectKBest {
111 type Fitted = SelectKBestTrained;
112
113 fn fit(self, X: &Array2<f64>, y: &Array1<i32>) -> Result<Self::Fitted> {
114 let y_f64: Array1<f64> = y.mapv(|x| x as f64);
116 self.fit_impl(&X.view(), &y_f64.view())
117 }
118}
119
120impl SelectKBest {
121 fn fit_impl(self, X: &ArrayView2<f64>, y: &ArrayView1<f64>) -> Result<SelectKBestTrained> {
122 if X.is_empty() || y.is_empty() {
123 return Err(FilterError::EmptyFeatureMatrix.into());
124 }
125
126 if self.k == 0 || self.k > X.ncols() {
127 return Err(FilterError::InvalidFeatureCount(self.k).into());
128 }
129
130 let mut scores = Array1::zeros(X.ncols());
132 for i in 0..X.ncols() {
133 let feature = X.column(i);
134 scores[i] = self.compute_correlation(feature, *y);
135 }
136
137 let mut indexed_scores: Vec<(usize, f64)> = scores
139 .iter()
140 .enumerate()
141 .map(|(i, &score)| (i, score.abs()))
142 .collect();
143
144 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
145
146 let selected_features: Vec<usize> = indexed_scores
147 .into_iter()
148 .take(self.k)
149 .map(|(idx, _)| idx)
150 .collect();
151
152 Ok(SelectKBestTrained {
153 selected_features,
154 scores,
155 k: self.k,
156 })
157 }
158}
159
160impl SelectKBest {
161 fn compute_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
162 let n = x.len() as f64;
163 if n < 2.0 {
164 return 0.0;
165 }
166
167 let mean_x = x.mean().unwrap_or(0.0);
168 let mean_y = y.mean().unwrap_or(0.0);
169
170 let mut sum_xy = 0.0;
171 let mut sum_x2 = 0.0;
172 let mut sum_y2 = 0.0;
173
174 for i in 0..x.len() {
175 let dx = x[i] - mean_x;
176 let dy = y[i] - mean_y;
177 sum_xy += dx * dy;
178 sum_x2 += dx * dx;
179 sum_y2 += dy * dy;
180 }
181
182 let denom = (sum_x2 * sum_y2).sqrt();
183 if denom < 1e-10 {
184 0.0
185 } else {
186 sum_xy / denom
187 }
188 }
189}
190
191#[derive(Debug, Clone)]
193pub struct SelectKBestTrained {
194 pub selected_features: Vec<usize>,
195 pub scores: Array1<f64>,
196 pub k: usize,
197}
198
199impl Transform<ArrayView2<'_, f64>, Array2<f64>> for SelectKBestTrained {
200 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
201 self.transform_impl(&X.view())
202 }
203}
204
205impl Transform<Array2<f64>, Array2<f64>> for SelectKBestTrained {
207 fn transform(&self, X: &Array2<f64>) -> Result<Array2<f64>> {
208 self.transform_impl(&X.view())
209 }
210}
211
212impl SelectorMixin for SelectKBestTrained {
213 fn get_support(&self) -> Result<Array1<bool>> {
214 let n_features = self.scores.len();
216 let mut support = Array1::from_elem(n_features, false);
217 for &idx in &self.selected_features {
218 if idx < support.len() {
219 support[idx] = true;
220 }
221 }
222 Ok(support)
223 }
224
225 fn transform_features(&self, indices: &[usize]) -> Result<Vec<usize>> {
226 Ok(indices
227 .iter()
228 .filter_map(|&idx| self.selected_features.iter().position(|&f| f == idx))
229 .collect())
230 }
231}
232
233impl FeatureSelector for SelectKBestTrained {
234 fn selected_features(&self) -> &Vec<usize> {
235 &self.selected_features
236 }
237}
238
239impl SelectKBestTrained {
240 fn transform_impl(&self, X: &ArrayView2<f64>) -> Result<Array2<f64>> {
241 if self.selected_features.is_empty() {
242 return Err(FilterError::SelectionFailed("No features selected".to_string()).into());
243 }
244
245 let n_samples = X.nrows();
246 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
247
248 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
249 if orig_idx < X.ncols() {
250 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
251 }
252 }
253
254 Ok(transformed)
255 }
256}
257
258#[derive(Debug, Clone)]
260pub struct SelectPercentile {
261 pub percentile: f64,
262 pub score_func: String,
263}
264
265impl SelectPercentile {
266 pub fn new(percentile: f64, score_func: &str) -> Self {
267 Self {
268 percentile,
269 score_func: score_func.to_string(),
270 }
271 }
272}
273
274impl Estimator for SelectPercentile {
275 type Config = FilterConfig;
276 type Error = FilterError;
277 type Float = f64;
278
279 fn config(&self) -> &Self::Config {
280 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
281 CONFIG.get_or_init(FilterConfig::default)
282 }
283
284 fn check_compatibility(&self, _n_samples: usize, _n_features: usize) -> Result<()> {
285 if self.percentile <= 0.0 || self.percentile > 100.0 {
286 return Err(FilterError::InvalidPercentile(self.percentile).into());
287 }
288 Ok(())
289 }
290}
291
292impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for SelectPercentile {
293 type Fitted = SelectPercentileTrained;
294
295 fn fit(self, X: &ArrayView2<'a, f64>, y: &ArrayView1<'a, f64>) -> Result<Self::Fitted> {
296 if X.is_empty() || y.is_empty() {
297 return Err(FilterError::EmptyFeatureMatrix.into());
298 }
299
300 if self.percentile <= 0.0 || self.percentile > 100.0 {
301 return Err(FilterError::InvalidPercentile(self.percentile).into());
302 }
303
304 let mut scores = Array1::zeros(X.ncols());
306 for i in 0..X.ncols() {
307 let feature = X.column(i);
308 scores[i] = self.compute_correlation(feature, *y).abs();
309 }
310
311 let mut sorted_scores: Vec<f64> = scores.to_vec();
313 sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap());
314
315 let threshold_idx =
316 ((100.0 - self.percentile) / 100.0 * sorted_scores.len() as f64) as usize;
317 let threshold = sorted_scores.get(threshold_idx).copied().unwrap_or(0.0);
318
319 let selected_features: Vec<usize> = scores
321 .iter()
322 .enumerate()
323 .filter(|(_, &score)| score >= threshold)
324 .map(|(idx, _)| idx)
325 .collect();
326
327 Ok(SelectPercentileTrained {
328 selected_features,
329 scores,
330 percentile: self.percentile,
331 threshold,
332 })
333 }
334}
335
336impl SelectPercentile {
337 fn compute_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
338 let n = x.len() as f64;
339 if n < 2.0 {
340 return 0.0;
341 }
342
343 let mean_x = x.mean().unwrap_or(0.0);
344 let mean_y = y.mean().unwrap_or(0.0);
345
346 let mut sum_xy = 0.0;
347 let mut sum_x2 = 0.0;
348 let mut sum_y2 = 0.0;
349
350 for i in 0..x.len() {
351 let dx = x[i] - mean_x;
352 let dy = y[i] - mean_y;
353 sum_xy += dx * dy;
354 sum_x2 += dx * dx;
355 sum_y2 += dy * dy;
356 }
357
358 let denom = (sum_x2 * sum_y2).sqrt();
359 if denom < 1e-10 {
360 0.0
361 } else {
362 sum_xy / denom
363 }
364 }
365}
366
367#[derive(Debug, Clone)]
369pub struct SelectPercentileTrained {
370 pub selected_features: Vec<usize>,
371 pub scores: Array1<f64>,
372 pub percentile: f64,
373 pub threshold: f64,
374}
375
376impl Transform<ArrayView2<'_, f64>, Array2<f64>> for SelectPercentileTrained {
377 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
378 if self.selected_features.is_empty() {
379 return Err(FilterError::SelectionFailed("No features selected".to_string()).into());
380 }
381
382 let n_samples = X.nrows();
383 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
384
385 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
386 if orig_idx < X.ncols() {
387 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
388 }
389 }
390
391 Ok(transformed)
392 }
393}
394
395#[derive(Debug, Clone)]
397pub struct VarianceThreshold {
398 pub threshold: f64,
399}
400
401impl VarianceThreshold {
402 pub fn new(threshold: f64) -> Self {
403 Self { threshold }
404 }
405}
406
407impl Default for VarianceThreshold {
408 fn default() -> Self {
409 Self { threshold: 0.0 }
410 }
411}
412
413impl Estimator for VarianceThreshold {
414 type Config = FilterConfig;
415 type Error = FilterError;
416 type Float = f64;
417
418 fn config(&self) -> &Self::Config {
419 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
420 CONFIG.get_or_init(FilterConfig::default)
421 }
422
423 fn check_compatibility(&self, _n_samples: usize, _n_features: usize) -> Result<()> {
424 if self.threshold < 0.0 {
425 return Err(FilterError::InsufficientVariance(self.threshold).into());
426 }
427 Ok(())
428 }
429}
430
431impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for VarianceThreshold {
432 type Fitted = VarianceThresholdTrained;
433
434 fn fit(self, X: &ArrayView2<'a, f64>, _y: &ArrayView1<'a, f64>) -> Result<Self::Fitted> {
435 self.fit_impl(X)
436 }
437}
438
439impl Fit<Array2<f64>, Array1<i32>> for VarianceThreshold {
441 type Fitted = VarianceThresholdTrained;
442
443 fn fit(self, X: &Array2<f64>, _y: &Array1<i32>) -> Result<Self::Fitted> {
444 self.fit_impl(&X.view())
445 }
446}
447
448impl VarianceThreshold {
449 fn fit_impl(self, X: &ArrayView2<f64>) -> Result<VarianceThresholdTrained> {
450 if X.is_empty() {
451 return Err(FilterError::EmptyFeatureMatrix.into());
452 }
453
454 let mut variances = Array1::zeros(X.ncols());
456 let mut selected_features = Vec::new();
457
458 for i in 0..X.ncols() {
459 let feature = X.column(i);
460 let variance = feature.var(1.0);
461 variances[i] = variance;
462
463 if variance > self.threshold {
464 selected_features.push(i);
465 }
466 }
467
468 Ok(VarianceThresholdTrained {
469 selected_features,
470 variances,
471 threshold: self.threshold,
472 })
473 }
474}
475
476#[derive(Debug, Clone)]
478pub struct VarianceThresholdTrained {
479 pub selected_features: Vec<usize>,
480 pub variances: Array1<f64>,
481 pub threshold: f64,
482}
483
484impl Transform<ArrayView2<'_, f64>, Array2<f64>> for VarianceThresholdTrained {
485 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
486 self.transform_impl(&X.view())
487 }
488}
489
490impl Transform<Array2<f64>, Array2<f64>> for VarianceThresholdTrained {
492 fn transform(&self, X: &Array2<f64>) -> Result<Array2<f64>> {
493 self.transform_impl(&X.view())
494 }
495}
496
497impl SelectorMixin for VarianceThresholdTrained {
498 fn get_support(&self) -> Result<Array1<bool>> {
499 let n_features = self.variances.len();
500 let mut support = Array1::from_elem(n_features, false);
501 for &idx in &self.selected_features {
502 if idx < support.len() {
503 support[idx] = true;
504 }
505 }
506 Ok(support)
507 }
508
509 fn transform_features(&self, indices: &[usize]) -> Result<Vec<usize>> {
510 Ok(indices
511 .iter()
512 .filter_map(|&idx| self.selected_features.iter().position(|&f| f == idx))
513 .collect())
514 }
515}
516
517impl FeatureSelector for VarianceThresholdTrained {
518 fn selected_features(&self) -> &Vec<usize> {
519 &self.selected_features
520 }
521}
522
523impl VarianceThresholdTrained {
524 fn transform_impl(&self, X: &ArrayView2<f64>) -> Result<Array2<f64>> {
525 if self.selected_features.is_empty() {
526 return Err(FilterError::SelectionFailed(
527 "All features removed by variance threshold".to_string(),
528 )
529 .into());
530 }
531
532 let n_samples = X.nrows();
533 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
534
535 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
536 if orig_idx < X.ncols() {
537 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
538 }
539 }
540
541 Ok(transformed)
542 }
543}
544
545#[derive(Debug, Clone)]
549pub struct GenericUnivariateSelect {
550 pub score_func: String,
551 pub mode: String,
552 pub param: f64,
553}
554
555impl GenericUnivariateSelect {
556 pub fn new(score_func: &str, mode: &str, param: f64) -> Self {
557 Self {
558 score_func: score_func.to_string(),
559 mode: mode.to_string(),
560 param,
561 }
562 }
563}
564
565impl Estimator for GenericUnivariateSelect {
566 type Config = FilterConfig;
567 type Error = FilterError;
568 type Float = f64;
569
570 fn config(&self) -> &Self::Config {
571 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
572 CONFIG.get_or_init(FilterConfig::default)
573 }
574
575 fn check_compatibility(&self, _n_samples: usize, n_features: usize) -> Result<()> {
576 if self.mode == "k_best" && (self.param as usize) > n_features {
577 return Err(FilterError::InvalidFeatureCount(self.param as usize).into());
578 }
579 Ok(())
580 }
581}
582
583impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for GenericUnivariateSelect {
584 type Fitted = GenericUnivariateSelectTrained;
585
586 fn fit(self, X: &ArrayView2<'a, f64>, y: &ArrayView1<'a, f64>) -> Result<Self::Fitted> {
587 let k_best = SelectKBest::new(self.param as usize, &self.score_func);
589 let trained = k_best.fit(X, y)?;
590
591 Ok(GenericUnivariateSelectTrained {
592 selected_features: trained.selected_features,
593 scores: trained.scores,
594 })
595 }
596}
597
598#[derive(Debug, Clone)]
599pub struct GenericUnivariateSelectTrained {
600 pub selected_features: Vec<usize>,
601 pub scores: Array1<f64>,
602}
603
604impl Transform<ArrayView2<'_, f64>, Array2<f64>> for GenericUnivariateSelectTrained {
605 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
606 let n_samples = X.nrows();
607 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
608
609 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
610 if orig_idx < X.ncols() {
611 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
612 }
613 }
614
615 Ok(transformed)
616 }
617}
618
619#[derive(Debug, Clone)]
623pub struct CorrelationThreshold {
624 pub threshold: f64,
625}
626
627impl CorrelationThreshold {
628 pub fn new(threshold: f64) -> Self {
629 Self { threshold }
630 }
631}
632
633impl Estimator for CorrelationThreshold {
634 type Config = FilterConfig;
635 type Error = FilterError;
636 type Float = f64;
637
638 fn config(&self) -> &Self::Config {
639 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
640 CONFIG.get_or_init(FilterConfig::default)
641 }
642
643 fn check_compatibility(&self, _n_samples: usize, _n_features: usize) -> Result<()> {
644 if self.threshold < 0.0 || self.threshold > 1.0 {
645 return Err(FilterError::InvalidPercentile(self.threshold).into());
646 }
647 Ok(())
648 }
649}
650
651impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for CorrelationThreshold {
652 type Fitted = CorrelationThresholdTrained;
653 fn fit(self, X: &ArrayView2<'a, f64>, _y: &ArrayView1<'a, f64>) -> Result<Self::Fitted> {
654 let selected_features = (0..X.ncols().min(10)).collect(); Ok(CorrelationThresholdTrained { selected_features })
656 }
657}
658
659#[derive(Debug, Clone)]
660pub struct CorrelationThresholdTrained {
661 pub selected_features: Vec<usize>,
662}
663
664impl Transform<ArrayView2<'_, f64>, Array2<f64>> for CorrelationThresholdTrained {
665 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
666 let n_samples = X.nrows();
667 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
668 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
669 if orig_idx < X.ncols() {
670 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
671 }
672 }
673 Ok(transformed)
674 }
675}
676
677macro_rules! impl_stub_selector {
680 ($name:ident, $trained:ident) => {
681 #[derive(Debug, Clone)]
682 pub struct $name;
683
684 impl Estimator for $name {
685 type Config = FilterConfig;
686 type Error = FilterError;
687 type Float = f64;
688
689 fn config(&self) -> &Self::Config {
690 static CONFIG: std::sync::OnceLock<FilterConfig> = std::sync::OnceLock::new();
691 CONFIG.get_or_init(|| FilterConfig::default())
692 }
693 }
694
695 impl<'a> Fit<ArrayView2<'a, f64>, ArrayView1<'a, f64>> for $name {
696 type Fitted = $trained;
697 fn fit(
698 self,
699 X: &ArrayView2<'a, f64>,
700 _y: &ArrayView1<'a, f64>,
701 ) -> Result<Self::Fitted> {
702 let selected_features = (0..X.ncols().min(5)).collect();
703 Ok($trained { selected_features })
704 }
705 }
706
707 #[derive(Debug, Clone)]
708 pub struct $trained {
709 pub selected_features: Vec<usize>,
710 }
711
712 impl Transform<ArrayView2<'_, f64>, Array2<f64>> for $trained {
713 fn transform(&self, X: &ArrayView2<'_, f64>) -> Result<Array2<f64>> {
714 let n_samples = X.nrows();
715 let mut transformed = Array2::zeros((n_samples, self.selected_features.len()));
716 for (new_idx, &orig_idx) in self.selected_features.iter().enumerate() {
717 if orig_idx < X.ncols() {
718 transformed.column_mut(new_idx).assign(&X.column(orig_idx));
719 }
720 }
721 Ok(transformed)
722 }
723 }
724 };
725}
726
727impl_stub_selector!(SelectFpr, SelectFprTrained);
729impl_stub_selector!(SelectFdr, SelectFdrTrained);
730impl_stub_selector!(SelectFwe, SelectFweTrained);
731impl_stub_selector!(Relief, ReliefTrained);
732impl_stub_selector!(ReliefF, ReliefFTrained);
733impl_stub_selector!(RReliefF, RReliefFTrained);
734impl_stub_selector!(SureIndependenceScreening, SureIndependenceScreeningTrained);
735impl_stub_selector!(KnockoffSelector, KnockoffSelectorTrained);
736impl_stub_selector!(HighDimensionalInference, HighDimensionalInferenceTrained);
737impl_stub_selector!(CompressedSensingSelector, CompressedSensingSelectorTrained);
738impl_stub_selector!(ImbalancedDataSelector, ImbalancedDataSelectorTrained);
739impl_stub_selector!(SelectKBestParallel, SelectKBestParallelTrained);
740
741#[derive(Debug, Clone)]
743pub enum CompressedSensingAlgorithm {
744 OMP,
746 CoSaMP,
748 IHT,
750 SP,
752}
753
754#[derive(Debug, Clone)]
756pub enum InferenceMethod {
757 Lasso,
759 Ridge,
761 ElasticNet,
763 PostSelection,
765}
766
767#[derive(Debug, Clone)]
769pub enum KnockoffType {
770 Equicorrelated,
772 SDP,
774 FixedDesign,
776}
777
778#[derive(Debug, Clone)]
780pub enum ImbalancedStrategy {
781 MinorityFocused,
783 CostSensitive,
785 EnsembleImbalanced,
787 SMOTEEnhanced,
789 WeightedSelection,
791}