ferrolearn_preprocess/
stat_selectors.rs

1//! Statistical-test-based feature selectors.
2//!
3//! Three selectors that choose features based on p-values obtained from a
4//! statistical test (e.g., ANOVA F-test, chi-squared test):
5//!
6//! - [`SelectFpr`] — **False Positive Rate**: selects every feature whose
7//!   p-value is below `alpha`.
8//! - [`SelectFdr`] — **False Discovery Rate**: applies the Benjamini-Hochberg
9//!   procedure to control the expected proportion of false positives.
10//! - [`SelectFwe`] — **Family-Wise Error**: applies the Bonferroni correction
11//!   (`alpha / n_features`) to control the probability of any false positive.
12//!
13//! All three take a pre-computed vector of p-values (one per feature) at fit
14//! time, allowing integration with any upstream scoring function.
15//!
16//! ## REQ status
17//!
18//! Translation target: scikit-learn 1.5.2 `SelectFpr`/`SelectFdr`/`SelectFwe`
19//! (`sklearn/feature_selection/_univariate_selection.py:801,881,972`). Tracking:
20//! #1396. (The route's `parity_ops` lists the score functions
21//! `f_classif`/`f_regression`/`chi2` — those are translated in the sibling
22//! `feature_scoring.rs`; this unit owns the FPR/FDR/FWE selectors.) Each REQ is
23//! BINARY — SHIPPED (impl + non-test consumer + tests + green verification) or
24//! NOT-STARTED (with a concrete open blocker). HONEST scope: ferrolearn takes a
25//! static p-value vector; sklearn's `_BaseFilter` wraps a `score_func` and
26//! computes `pvalues_` internally — that wrapping is NOT-STARTED.
27//!
28//! | REQ | Scope | Status | Evidence / Blocker |
29//! |-----|-------|--------|--------------------|
30//! | REQ-1 | SelectFpr mask `p < alpha` (given static p-values) | SHIPPED | [`SelectFpr`] `fit` matches sklearn `_get_support_mask` `_univariate_selection.py:878` (strict `<`); oracle tests in `tests/divergence_stat_selectors.rs`. Consumer: re-export `lib.rs:181` |
31//! | REQ-2 | SelectFdr Benjamini-Hochberg mask (ties + non-monotone gap + none/all) | SHIPPED | [`SelectFdr`] `fit` (highest-qualifying-rank + `ranked[..=k]`) ≡ sklearn `pvalues_ <= sv[sv<=alpha/n·arange].max()` (`:959-969`); oracle tests (tie `[0.01,0.025,0.025,0.9]` → `[0,1,2]`, gap `[0.001,0.04,0.045,0.011]` → all) |
32//! | REQ-3 | SelectFwe mask `p < alpha/n` (given static p-values) | SHIPPED | [`SelectFwe`] `fit` matches sklearn `_get_support_mask` `:1044` (Bonferroni, strict `<`); oracle tests |
33//! | REQ-4 | Error/parameter contracts (empty p-values, `alpha ∈ [0,1]` closed-both, transform ncols) | SHIPPED | `validate_inputs` accepts `alpha=0` (matches sklearn `_parameter_constraints` `Interval(Real,0,1,closed="both")` `:868`, fixed #1397, see Changed); divergence error tests |
34//! | REQ-5 | `score_func` wrapping (f_classif/f_regression/chi2 → `scores_`/`pvalues_` at `fit(X,y)`) | NOT-STARTED | takes p-values directly; sklearn `_BaseFilter` `:526,569-570` — blocker #1398 |
35//! | REQ-6 | `_BaseFilter`/`SelectorMixin` surface (`get_support`/`inverse_transform`/`get_feature_names_out`) | NOT-STARTED | sklearn `_univariate_selection.py:526` — blocker #1399 |
36//! | REQ-7 | Computed `scores_`/`pvalues_` fitted attrs + `n_features_in_`/`feature_names_in_` | NOT-STARTED | sklearn `:569-570` — blocker #1400 |
37//! | REQ-8 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1401 |
38//! | REQ-9 | ferray substrate | NOT-STARTED | dense `Array1`/`Array2` + `num_traits::Float` only — blocker #1402 |
39
40use ferrolearn_core::error::FerroError;
41use ferrolearn_core::traits::{Fit, Transform};
42use ndarray::{Array1, Array2};
43use num_traits::Float;
44
45// ---------------------------------------------------------------------------
46// Shared helper
47// ---------------------------------------------------------------------------
48
49/// Build a new `Array2<F>` containing only the columns listed in `indices`.
50fn select_columns<F: Float>(x: &Array2<F>, indices: &[usize]) -> Array2<F> {
51    let nrows = x.nrows();
52    let ncols = indices.len();
53    if ncols == 0 {
54        return Array2::zeros((nrows, 0));
55    }
56    let mut out = Array2::zeros((nrows, ncols));
57    for (new_j, &old_j) in indices.iter().enumerate() {
58        for i in 0..nrows {
59            out[[i, new_j]] = x[[i, old_j]];
60        }
61    }
62    out
63}
64
65/// Validate common inputs for all three selectors.
66fn validate_inputs(n_features: usize, alpha: f64) -> Result<(), FerroError> {
67    if n_features == 0 {
68        return Err(FerroError::InvalidParameter {
69            name: "p_values".into(),
70            reason: "p-value vector must not be empty".into(),
71        });
72    }
73    if !(0.0..=1.0).contains(&alpha) {
74        return Err(FerroError::InvalidParameter {
75            name: "alpha".into(),
76            reason: format!("alpha must be in [0, 1], got {alpha}"),
77        });
78    }
79    Ok(())
80}
81
82// ===========================================================================
83// SelectFpr — False Positive Rate
84// ===========================================================================
85
86/// Select features with p-values below `alpha`.
87///
88/// A feature is selected if its p-value is strictly less than `alpha`.
89/// This controls the per-feature false positive rate but does not adjust
90/// for multiple comparisons.
91///
92/// # Examples
93///
94/// ```
95/// use ferrolearn_preprocess::stat_selectors::SelectFpr;
96/// use ferrolearn_core::traits::{Fit, Transform};
97/// use ndarray::array;
98///
99/// let sel = SelectFpr::<f64>::new(0.05);
100/// let p_values = array![0.01, 0.5, 0.03, 0.9];
101/// let fitted = sel.fit(&p_values, &()).unwrap();
102/// // Features 0 (p=0.01) and 2 (p=0.03) are below alpha=0.05
103/// assert_eq!(fitted.selected_indices(), &[0, 2]);
104/// ```
105#[must_use]
106#[derive(Debug, Clone)]
107pub struct SelectFpr<F> {
108    /// Significance threshold.
109    alpha: f64,
110    _marker: std::marker::PhantomData<F>,
111}
112
113impl<F: Float + Send + Sync + 'static> SelectFpr<F> {
114    /// Create a new `SelectFpr` with the given significance level.
115    pub fn new(alpha: f64) -> Self {
116        Self {
117            alpha,
118            _marker: std::marker::PhantomData,
119        }
120    }
121
122    /// Return the significance level.
123    #[must_use]
124    pub fn alpha(&self) -> f64 {
125        self.alpha
126    }
127}
128
129/// A fitted `SelectFpr` holding the selected indices.
130#[derive(Debug, Clone)]
131pub struct FittedSelectFpr<F> {
132    /// Number of features seen during fitting.
133    n_features_in: usize,
134    /// P-values supplied during fitting.
135    p_values: Array1<F>,
136    /// Indices of selected columns (sorted).
137    selected_indices: Vec<usize>,
138}
139
140impl<F: Float + Send + Sync + 'static> FittedSelectFpr<F> {
141    /// Return the p-values.
142    #[must_use]
143    pub fn p_values(&self) -> &Array1<F> {
144        &self.p_values
145    }
146
147    /// Return the indices of the selected columns.
148    #[must_use]
149    pub fn selected_indices(&self) -> &[usize] {
150        &self.selected_indices
151    }
152
153    /// Return the number of selected features.
154    #[must_use]
155    pub fn n_features_selected(&self) -> usize {
156        self.selected_indices.len()
157    }
158}
159
160impl<F: Float + Send + Sync + 'static> Fit<Array1<F>, ()> for SelectFpr<F> {
161    type Fitted = FittedSelectFpr<F>;
162    type Error = FerroError;
163
164    /// Fit by selecting features whose p-value is below `alpha`.
165    ///
166    /// # Errors
167    ///
168    /// - [`FerroError::InvalidParameter`] if p-values are empty or alpha is
169    ///   not in `(0, 1]`.
170    fn fit(&self, x: &Array1<F>, _y: &()) -> Result<FittedSelectFpr<F>, FerroError> {
171        let n = x.len();
172        validate_inputs(n, self.alpha)?;
173
174        let alpha_f = F::from(self.alpha).unwrap_or_else(F::zero);
175        let selected_indices: Vec<usize> = x
176            .iter()
177            .enumerate()
178            .filter(|&(_, &p)| p < alpha_f)
179            .map(|(j, _)| j)
180            .collect();
181
182        Ok(FittedSelectFpr {
183            n_features_in: n,
184            p_values: x.clone(),
185            selected_indices,
186        })
187    }
188}
189
190impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSelectFpr<F> {
191    type Output = Array2<F>;
192    type Error = FerroError;
193
194    /// Return a matrix containing only the selected columns.
195    ///
196    /// # Errors
197    ///
198    /// Returns [`FerroError::ShapeMismatch`] if column count does not match.
199    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
200        if x.ncols() != self.n_features_in {
201            return Err(FerroError::ShapeMismatch {
202                expected: vec![x.nrows(), self.n_features_in],
203                actual: vec![x.nrows(), x.ncols()],
204                context: "FittedSelectFpr::transform".into(),
205            });
206        }
207        Ok(select_columns(x, &self.selected_indices))
208    }
209}
210
211// ===========================================================================
212// SelectFdr — False Discovery Rate (Benjamini-Hochberg)
213// ===========================================================================
214
215/// Select features controlling the false discovery rate via the
216/// Benjamini-Hochberg procedure.
217///
218/// Features are sorted by p-value. Feature *i* (0-indexed, sorted ascending)
219/// is selected if `p_value[i] <= alpha * (i+1) / n_features`. All features
220/// with rank at or below the highest qualifying rank are selected.
221///
222/// # Examples
223///
224/// ```
225/// use ferrolearn_preprocess::stat_selectors::SelectFdr;
226/// use ferrolearn_core::traits::{Fit, Transform};
227/// use ndarray::array;
228///
229/// let sel = SelectFdr::<f64>::new(0.05);
230/// let p_values = array![0.01, 0.5, 0.03, 0.9];
231/// let fitted = sel.fit(&p_values, &()).unwrap();
232/// assert!(fitted.selected_indices().contains(&0));
233/// ```
234#[must_use]
235#[derive(Debug, Clone)]
236pub struct SelectFdr<F> {
237    /// Target false discovery rate.
238    alpha: f64,
239    _marker: std::marker::PhantomData<F>,
240}
241
242impl<F: Float + Send + Sync + 'static> SelectFdr<F> {
243    /// Create a new `SelectFdr` with the given FDR level.
244    pub fn new(alpha: f64) -> Self {
245        Self {
246            alpha,
247            _marker: std::marker::PhantomData,
248        }
249    }
250
251    /// Return the FDR level.
252    #[must_use]
253    pub fn alpha(&self) -> f64 {
254        self.alpha
255    }
256}
257
258/// A fitted `SelectFdr` holding the selected indices.
259#[derive(Debug, Clone)]
260pub struct FittedSelectFdr<F> {
261    /// Number of features seen during fitting.
262    n_features_in: usize,
263    /// P-values supplied during fitting.
264    p_values: Array1<F>,
265    /// Indices of selected columns (sorted in original order).
266    selected_indices: Vec<usize>,
267}
268
269impl<F: Float + Send + Sync + 'static> FittedSelectFdr<F> {
270    /// Return the p-values.
271    #[must_use]
272    pub fn p_values(&self) -> &Array1<F> {
273        &self.p_values
274    }
275
276    /// Return the indices of the selected columns.
277    #[must_use]
278    pub fn selected_indices(&self) -> &[usize] {
279        &self.selected_indices
280    }
281
282    /// Return the number of selected features.
283    #[must_use]
284    pub fn n_features_selected(&self) -> usize {
285        self.selected_indices.len()
286    }
287}
288
289impl<F: Float + Send + Sync + 'static> Fit<Array1<F>, ()> for SelectFdr<F> {
290    type Fitted = FittedSelectFdr<F>;
291    type Error = FerroError;
292
293    /// Fit using the Benjamini-Hochberg procedure.
294    ///
295    /// # Errors
296    ///
297    /// - [`FerroError::InvalidParameter`] if p-values are empty or alpha is
298    ///   not in `(0, 1]`.
299    fn fit(&self, x: &Array1<F>, _y: &()) -> Result<FittedSelectFdr<F>, FerroError> {
300        let n = x.len();
301        validate_inputs(n, self.alpha)?;
302
303        let alpha_f = F::from(self.alpha).unwrap_or_else(F::zero);
304        let n_f = F::from(n).unwrap_or_else(F::one);
305
306        // Sort features by p-value (ascending), keeping original indices
307        let mut ranked: Vec<(usize, F)> = x.iter().copied().enumerate().collect();
308        ranked.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
309
310        // Find the largest rank k where p_(k) <= alpha * (k+1) / n
311        let mut max_qualifying_rank: Option<usize> = None;
312        for (rank, &(_, p_val)) in ranked.iter().enumerate() {
313            let bh_threshold = alpha_f * F::from(rank + 1).unwrap_or_else(F::one) / n_f;
314            if p_val <= bh_threshold {
315                max_qualifying_rank = Some(rank);
316            }
317        }
318
319        // Select all features at or below the max qualifying rank
320        let mut selected_indices: Vec<usize> = match max_qualifying_rank {
321            Some(max_rank) => ranked[..=max_rank].iter().map(|&(idx, _)| idx).collect(),
322            None => Vec::new(),
323        };
324        selected_indices.sort_unstable();
325
326        Ok(FittedSelectFdr {
327            n_features_in: n,
328            p_values: x.clone(),
329            selected_indices,
330        })
331    }
332}
333
334impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSelectFdr<F> {
335    type Output = Array2<F>;
336    type Error = FerroError;
337
338    /// Return a matrix containing only the selected columns.
339    ///
340    /// # Errors
341    ///
342    /// Returns [`FerroError::ShapeMismatch`] if column count does not match.
343    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
344        if x.ncols() != self.n_features_in {
345            return Err(FerroError::ShapeMismatch {
346                expected: vec![x.nrows(), self.n_features_in],
347                actual: vec![x.nrows(), x.ncols()],
348                context: "FittedSelectFdr::transform".into(),
349            });
350        }
351        Ok(select_columns(x, &self.selected_indices))
352    }
353}
354
355// ===========================================================================
356// SelectFwe — Family-Wise Error (Bonferroni)
357// ===========================================================================
358
359/// Select features controlling the family-wise error rate via the
360/// Bonferroni correction.
361///
362/// A feature is selected if its p-value is strictly less than
363/// `alpha / n_features`.
364///
365/// # Examples
366///
367/// ```
368/// use ferrolearn_preprocess::stat_selectors::SelectFwe;
369/// use ferrolearn_core::traits::{Fit, Transform};
370/// use ndarray::array;
371///
372/// let sel = SelectFwe::<f64>::new(0.05);
373/// let p_values = array![0.001, 0.5, 0.03, 0.9];
374/// let fitted = sel.fit(&p_values, &()).unwrap();
375/// // Bonferroni threshold = 0.05/4 = 0.0125; only feature 0 qualifies
376/// assert_eq!(fitted.selected_indices(), &[0]);
377/// ```
378#[must_use]
379#[derive(Debug, Clone)]
380pub struct SelectFwe<F> {
381    /// Significance level before Bonferroni correction.
382    alpha: f64,
383    _marker: std::marker::PhantomData<F>,
384}
385
386impl<F: Float + Send + Sync + 'static> SelectFwe<F> {
387    /// Create a new `SelectFwe` with the given significance level.
388    pub fn new(alpha: f64) -> Self {
389        Self {
390            alpha,
391            _marker: std::marker::PhantomData,
392        }
393    }
394
395    /// Return the significance level.
396    #[must_use]
397    pub fn alpha(&self) -> f64 {
398        self.alpha
399    }
400}
401
402/// A fitted `SelectFwe` holding the selected indices.
403#[derive(Debug, Clone)]
404pub struct FittedSelectFwe<F> {
405    /// Number of features seen during fitting.
406    n_features_in: usize,
407    /// P-values supplied during fitting.
408    p_values: Array1<F>,
409    /// Indices of selected columns (sorted).
410    selected_indices: Vec<usize>,
411}
412
413impl<F: Float + Send + Sync + 'static> FittedSelectFwe<F> {
414    /// Return the p-values.
415    #[must_use]
416    pub fn p_values(&self) -> &Array1<F> {
417        &self.p_values
418    }
419
420    /// Return the indices of the selected columns.
421    #[must_use]
422    pub fn selected_indices(&self) -> &[usize] {
423        &self.selected_indices
424    }
425
426    /// Return the number of selected features.
427    #[must_use]
428    pub fn n_features_selected(&self) -> usize {
429        self.selected_indices.len()
430    }
431}
432
433impl<F: Float + Send + Sync + 'static> Fit<Array1<F>, ()> for SelectFwe<F> {
434    type Fitted = FittedSelectFwe<F>;
435    type Error = FerroError;
436
437    /// Fit using the Bonferroni correction: `p < alpha / n_features`.
438    ///
439    /// # Errors
440    ///
441    /// - [`FerroError::InvalidParameter`] if p-values are empty or alpha is
442    ///   not in `(0, 1]`.
443    fn fit(&self, x: &Array1<F>, _y: &()) -> Result<FittedSelectFwe<F>, FerroError> {
444        let n = x.len();
445        validate_inputs(n, self.alpha)?;
446
447        let adjusted_alpha = self.alpha / n as f64;
448        let adjusted_alpha_f = F::from(adjusted_alpha).unwrap_or_else(F::zero);
449
450        let selected_indices: Vec<usize> = x
451            .iter()
452            .enumerate()
453            .filter(|&(_, &p)| p < adjusted_alpha_f)
454            .map(|(j, _)| j)
455            .collect();
456
457        Ok(FittedSelectFwe {
458            n_features_in: n,
459            p_values: x.clone(),
460            selected_indices,
461        })
462    }
463}
464
465impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSelectFwe<F> {
466    type Output = Array2<F>;
467    type Error = FerroError;
468
469    /// Return a matrix containing only the selected columns.
470    ///
471    /// # Errors
472    ///
473    /// Returns [`FerroError::ShapeMismatch`] if column count does not match.
474    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
475        if x.ncols() != self.n_features_in {
476            return Err(FerroError::ShapeMismatch {
477                expected: vec![x.nrows(), self.n_features_in],
478                actual: vec![x.nrows(), x.ncols()],
479                context: "FittedSelectFwe::transform".into(),
480            });
481        }
482        Ok(select_columns(x, &self.selected_indices))
483    }
484}
485
486// ---------------------------------------------------------------------------
487// Tests
488// ---------------------------------------------------------------------------
489
#[cfg(test)]
mod tests {
    //! Unit tests for the three p-value selectors. Expected selections are
    //! derived by hand in the comments next to each case.

    use super::*;
    use ndarray::array;

    // ========================================================================
    // SelectFpr tests
    // ========================================================================

    #[test]
    fn test_fpr_selects_below_alpha() {
        let sel = SelectFpr::<f64>::new(0.05);
        let p = array![0.01, 0.5, 0.03, 0.9];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[0, 2]);
    }

    #[test]
    fn test_fpr_none_below_alpha() {
        let sel = SelectFpr::<f64>::new(0.001);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fpr_all_below_alpha() {
        let sel = SelectFpr::<f64>::new(0.99);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.n_features_selected(), 3);
    }

    #[test]
    fn test_fpr_nan_p_value_not_selected() {
        // `NaN < alpha` is false, so a NaN p-value is never selected.
        let sel = SelectFpr::<f64>::new(0.05);
        let p = array![f64::NAN, 0.01];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[1]);
    }

    #[test]
    fn test_fpr_transform() {
        let sel = SelectFpr::<f64>::new(0.05);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel.fit(&p, &()).unwrap();
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let out = fitted.transform(&x).unwrap();
        assert_eq!(out.ncols(), 2); // features 0 and 2
        assert_eq!(out[[0, 0]], 1.0);
        assert_eq!(out[[0, 1]], 3.0);
    }

    #[test]
    fn test_fpr_empty_error() {
        let sel = SelectFpr::<f64>::new(0.05);
        let p: Array1<f64> = Array1::zeros(0);
        assert!(sel.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fpr_invalid_alpha() {
        // sklearn `_parameter_constraints` alpha: Interval(Real, 0, 1,
        // closed="both") (_univariate_selection.py:868) -> only alpha < 0 or
        // alpha > 1 are rejected; alpha == 0 is VALID.
        let p = array![0.01];

        let neg = SelectFpr::<f64>::new(-0.1);
        assert!(neg.fit(&p, &()).is_err());

        let big = SelectFpr::<f64>::new(1.5);
        assert!(big.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fpr_alpha_zero_valid() {
        // alpha == 0 is the lower endpoint of sklearn's closed="both" interval
        // (_univariate_selection.py:868): fit succeeds and the FPR mask
        // `pvalues_ < 0` selects nothing for positive p-values.
        let sel = SelectFpr::<f64>::new(0.0);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel
            .fit(&p, &())
            .expect("alpha=0 is valid (closed=both)");
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fpr_shape_mismatch() {
        let sel = SelectFpr::<f64>::new(0.05);
        let p = array![0.01, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        let x_bad = array![[1.0, 2.0, 3.0]];
        assert!(fitted.transform(&x_bad).is_err());
    }

    #[test]
    fn test_fpr_accessor() {
        let sel = SelectFpr::<f64>::new(0.05);
        assert_eq!(sel.alpha(), 0.05);
    }

    #[test]
    fn test_fpr_p_values_accessor() {
        let sel = SelectFpr::<f64>::new(0.05);
        let p = array![0.01, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.p_values().len(), 2);
    }

    // ========================================================================
    // SelectFdr tests (Benjamini-Hochberg)
    // ========================================================================

    #[test]
    fn test_fdr_basic() {
        let sel = SelectFdr::<f64>::new(0.05);
        // Sorted p-values: 0.01 (feat 0), 0.03 (feat 2), 0.5 (feat 1), 0.9 (feat 3)
        // BH thresholds: 0.05*1/4=0.0125, 0.05*2/4=0.025, 0.05*3/4=0.0375, 0.05*4/4=0.05
        // 0.01 <= 0.0125 ✓ (rank 0)
        // 0.03 <= 0.025  ✗, 0.5 <= 0.0375 ✗, 0.9 <= 0.05 ✗
        // → max qualifying rank = 0, so only feature 0 is selected.
        let p = array![0.01, 0.5, 0.03, 0.9];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[0]);
    }

    #[test]
    fn test_fdr_multiple_pass() {
        let sel = SelectFdr::<f64>::new(0.10);
        // Sorted: 0.005 (rank 0), 0.02 (rank 1), 0.04 (rank 2), 0.5 (rank 3)
        // BH: 0.1*1/4=0.025, 0.1*2/4=0.05, 0.1*3/4=0.075, 0.1*4/4=0.1
        // 0.005 <= 0.025 ✓
        // 0.02  <= 0.05  ✓
        // 0.04  <= 0.075 ✓ → max rank = 2 → select rank 0,1,2
        // In column order those are features 0 (0.02), 2 (0.005) and 3 (0.04).
        let p = array![0.02, 0.5, 0.005, 0.04];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.n_features_selected(), 3);
        assert_eq!(fitted.selected_indices(), &[0, 2, 3]);
    }

    #[test]
    fn test_fdr_none_selected() {
        let sel = SelectFdr::<f64>::new(0.001);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fdr_transform() {
        let sel = SelectFdr::<f64>::new(0.10);
        let p = array![0.001, 0.5, 0.9];
        let fitted = sel.fit(&p, &()).unwrap();
        let x = array![[1.0, 2.0, 3.0]];
        let out = fitted.transform(&x).unwrap();
        // BH thresholds: 0.1*1/3 ≈ 0.033, 0.1*2/3 ≈ 0.067, 0.1*3/3 = 0.1.
        // Only feature 0 (p=0.001) qualifies, so exactly one column remains.
        assert_eq!(out.ncols(), 1);
        assert_eq!(out[[0, 0]], 1.0);
    }

    #[test]
    fn test_fdr_empty_error() {
        let sel = SelectFdr::<f64>::new(0.05);
        let p: Array1<f64> = Array1::zeros(0);
        assert!(sel.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fdr_invalid_alpha() {
        // sklearn closed="both" (_univariate_selection.py:952): alpha == 0 is
        // VALID; only out-of-range values (< 0 or > 1) are rejected.
        let p = array![0.01];

        let neg = SelectFdr::<f64>::new(-0.1);
        assert!(neg.fit(&p, &()).is_err());

        let big = SelectFdr::<f64>::new(1.5);
        assert!(big.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fdr_alpha_zero_valid() {
        // alpha == 0 lower endpoint (_univariate_selection.py:952): BH
        // threshold is all-zero, so no positive p-value qualifies -> empty.
        let sel = SelectFdr::<f64>::new(0.0);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel
            .fit(&p, &())
            .expect("alpha=0 is valid (closed=both)");
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fdr_shape_mismatch() {
        let sel = SelectFdr::<f64>::new(0.05);
        let p = array![0.01, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        let x_bad = array![[1.0, 2.0, 3.0]];
        assert!(fitted.transform(&x_bad).is_err());
    }

    #[test]
    fn test_fdr_accessor() {
        let sel = SelectFdr::<f64>::new(0.05);
        assert_eq!(sel.alpha(), 0.05);
    }

    // ========================================================================
    // SelectFwe tests (Bonferroni)
    // ========================================================================

    #[test]
    fn test_fwe_basic() {
        let sel = SelectFwe::<f64>::new(0.05);
        // Bonferroni threshold = 0.05/4 = 0.0125
        let p = array![0.001, 0.5, 0.03, 0.9];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[0]);
    }

    #[test]
    fn test_fwe_two_features() {
        let sel = SelectFwe::<f64>::new(0.10);
        // Bonferroni: 0.1/3 ≈ 0.0333
        let p = array![0.01, 0.02, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[0, 1]);
    }

    #[test]
    fn test_fwe_none_selected() {
        let sel = SelectFwe::<f64>::new(0.01);
        // Bonferroni: 0.01/3 ≈ 0.00333
        let p = array![0.005, 0.5, 0.03];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fwe_nan_p_value_not_selected() {
        // Bonferroni: 0.05/2 = 0.025; `NaN < 0.025` is false, 0.001 < 0.025 ✓
        let sel = SelectFwe::<f64>::new(0.05);
        let p = array![f64::NAN, 0.001];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[1]);
    }

    #[test]
    fn test_fwe_transform() {
        let sel = SelectFwe::<f64>::new(0.05);
        let p = array![0.001, 0.5, 0.9];
        let fitted = sel.fit(&p, &()).unwrap();
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let out = fitted.transform(&x).unwrap();
        assert_eq!(out.ncols(), 1);
        assert_eq!(out[[0, 0]], 1.0);
    }

    #[test]
    fn test_fwe_empty_error() {
        let sel = SelectFwe::<f64>::new(0.05);
        let p: Array1<f64> = Array1::zeros(0);
        assert!(sel.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fwe_invalid_alpha() {
        // sklearn closed="both" (_univariate_selection.py:1034): alpha == 0 is
        // VALID; only out-of-range values (< 0 or > 1) are rejected.
        let p = array![0.01];

        let neg = SelectFwe::<f64>::new(-0.1);
        assert!(neg.fit(&p, &()).is_err());

        let big = SelectFwe::<f64>::new(1.5);
        assert!(big.fit(&p, &()).is_err());
    }

    #[test]
    fn test_fwe_alpha_zero_valid() {
        // alpha == 0 lower endpoint (_univariate_selection.py:1034): the
        // Bonferroni mask `pvalues_ < 0/n` selects nothing for positive p.
        let sel = SelectFwe::<f64>::new(0.0);
        let p = array![0.01, 0.5, 0.03];
        let fitted = sel
            .fit(&p, &())
            .expect("alpha=0 is valid (closed=both)");
        assert_eq!(fitted.n_features_selected(), 0);
    }

    #[test]
    fn test_fwe_shape_mismatch() {
        let sel = SelectFwe::<f64>::new(0.05);
        let p = array![0.01, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        let x_bad = array![[1.0, 2.0, 3.0]];
        assert!(fitted.transform(&x_bad).is_err());
    }

    #[test]
    fn test_fwe_accessor() {
        let sel = SelectFwe::<f64>::new(0.05);
        assert_eq!(sel.alpha(), 0.05);
    }

    #[test]
    fn test_fwe_single_feature() {
        let sel = SelectFwe::<f64>::new(0.05);
        // Bonferroni: 0.05/1 = 0.05; p=0.01 < 0.05 ✓
        let p = array![0.01];
        let fitted = sel.fit(&p, &()).unwrap();
        assert_eq!(fitted.selected_indices(), &[0]);
    }

    #[test]
    fn test_fwe_f32() {
        let sel = SelectFwe::<f32>::new(0.05);
        let p: Array1<f32> = array![0.001f32, 0.5];
        let fitted = sel.fit(&p, &()).unwrap();
        // Bonferroni: 0.05/2 = 0.025; p=0.001 < 0.025 ✓
        assert_eq!(fitted.selected_indices(), &[0]);
    }
}
ferrolearn_preprocess/stat_selectors.rs

ferrolearn_preprocess/
stat_selectors.rs