Skip to main content

ferrolearn_preprocess/
rfe.rs

1//! Recursive Feature Elimination (RFE) and RFE with Cross-Validation (RFECV).
2//!
3//! [`RFE`] recursively removes the least-important features, ranking features
4//! by their importance at each elimination step. The importance is determined by
5//! an external importance vector that the user supplies via a callback.
6//!
7//! [`RFECV`] extends RFE by using cross-validation to find the optimal number
8//! of features to retain.
9//!
10//! Because `ferrolearn-preprocess` cannot depend on estimator crates (to avoid
11//! circular dependencies), these implementations accept feature importance
12//! vectors directly rather than wrapping fitted estimators.
13//!
14//! Translation target: scikit-learn 1.5.2 `class RFE` / `class RFECV`
15//! (`sklearn/feature_selection/_rfe.py`). Design: `.design/preprocess/rfe.md`.
16//! Tracking: #1294.
17//!
18//! Note: ferrolearn takes a static importance vector (RFE) / pre-computed CV
19//! scores (RFECV); sklearn wraps an estimator and re-fits it each elimination
20//! round, recomputing squared importances (RFE), and runs cross-validation
21//! internally (RFECV). The ranking/elimination + optimal-count SHAPE matches;
22//! the estimator/re-fit/CV machinery is NOT-STARTED (#1295/#1300).
23//!
24//! `## REQ status`
25//!
26//! | REQ | Status | Anchor |
27//! |---|---|---|
28//! | REQ-1 RFE ranking/support/elimination (static importances) | SHIPPED | `RFE::new`; sklearn `_rfe.py:337`,`:345-346` |
29//! | REQ-2 RFE transform + error contracts (scoped) | SHIPPED | `RFE::new` / `transform` guards; sklearn `_rfe.py:199-212` |
30//! | REQ-4 `n_features_to_select > n_features` → keep all | SHIPPED (#1296) | `RFE::new` clamp; sklearn `_rfe.py:290-297`,`:314` |
31//! | REQ-9 RFECV optimal-count selection (static scores) | SHIPPED | `RFECV::new` argmax first-max; sklearn `_rfe.py:786-788` |
32//! | REQ-3 wrapped estimator + per-round re-fit + squared importances | NOT-STARTED (#1295) | sklearn `_rfe.py:319-331` |
33//! | REQ-5 `n_features_to_select=None` default + float fraction | NOT-STARTED (#1297) | sklearn `_rfe.py:286-299` |
34//! | REQ-6 float `step` in (0,1) | NOT-STARTED (#1298) | sklearn `_rfe.py:301-302` |
35//! | REQ-7 `importance_getter` + `verbose` | NOT-STARTED (#1299) | sklearn `_rfe.py:211`,`:326-330` |
36//! | REQ-8 RFECV internal cross-validation | NOT-STARTED (#1300) | sklearn `_rfe.py:736-815` |
37//! | REQ-10 SelectorMixin surface + `estimator_`/`n_features_` | NOT-STARTED (#1301) | sklearn `_rfe.py:348-359` |
38//! | REQ-11 PyO3 binding | NOT-STARTED (#1302) | `ferrolearn-python/src/` (absent) |
39//! | REQ-12 ferray substrate | NOT-STARTED (#1303) | R-SUBSTRATE |
40
41use ferrolearn_core::error::FerroError;
42use ferrolearn_core::traits::Transform;
43use ndarray::{Array1, Array2};
44use num_traits::Float;
45
46/// Build a new `Array2<F>` containing only the columns listed in `indices`.
47fn select_columns<F: Float>(x: &Array2<F>, indices: &[usize]) -> Array2<F> {
48    let nrows = x.nrows();
49    let ncols = indices.len();
50    if ncols == 0 {
51        return Array2::zeros((nrows, 0));
52    }
53    let mut out = Array2::zeros((nrows, ncols));
54    for (new_j, &old_j) in indices.iter().enumerate() {
55        for i in 0..nrows {
56            out[[i, new_j]] = x[[i, old_j]];
57        }
58    }
59    out
60}
61
62// ===========================================================================
63// RFE
64// ===========================================================================
65
66/// Recursive Feature Elimination.
67///
68/// Starting from all features, repeatedly removes the `step` least-important
69/// features until `n_features_to_select` features remain. The ranking is
70/// determined by the importance vector supplied at construction.
71///
72/// # Examples
73///
74/// ```
75/// use ferrolearn_preprocess::rfe::RFE;
76/// use ferrolearn_core::traits::Transform;
77/// use ndarray::array;
78///
79/// // Feature importances: feature 0 is most important, feature 2 least
80/// let importances = array![0.6, 0.3, 0.1];
81/// let rfe = RFE::<f64>::new(&importances, 1, 1).unwrap();
82/// assert_eq!(rfe.support(), &[true, false, false]);
83/// assert_eq!(rfe.ranking(), &[1, 2, 3]);
84/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
85/// let out = rfe.transform(&x).unwrap();
86/// assert_eq!(out.ncols(), 1);
87/// ```
88#[derive(Debug, Clone)]
89pub struct RFE<F> {
90    /// Feature ranking: `ranking[j]` is the rank of feature `j` (1 = selected).
91    ranking: Vec<usize>,
92    /// Boolean mask: `support[j]` is `true` if feature `j` is selected.
93    support: Vec<bool>,
94    /// Indices of the selected features (sorted).
95    selected_indices: Vec<usize>,
96    /// Original number of features.
97    n_features_in: usize,
98    _marker: std::marker::PhantomData<F>,
99}
100
101impl<F: Float + Send + Sync + 'static> RFE<F> {
102    /// Create a new `RFE` from pre-computed feature importances.
103    ///
104    /// # Parameters
105    ///
106    /// - `importances` — per-feature importance scores (higher = more important).
107    /// - `n_features_to_select` — number of features to keep.
108    /// - `step` — number of features to remove per iteration.
109    ///
110    /// # Errors
111    ///
112    /// - [`FerroError::InvalidParameter`] if `importances` is empty, `step` is
113    ///   zero, or `n_features_to_select` is zero. If `n_features_to_select`
114    ///   exceeds the number of features it is clamped (all features kept),
115    ///   matching scikit-learn's warn-and-keep-all behavior
116    ///   (`_rfe.py:290-297`).
117    pub fn new(
118        importances: &Array1<F>,
119        n_features_to_select: usize,
120        step: usize,
121    ) -> Result<Self, FerroError> {
122        let n_features = importances.len();
123        if n_features == 0 {
124            return Err(FerroError::InvalidParameter {
125                name: "importances".into(),
126                reason: "importance vector must not be empty".into(),
127            });
128        }
129        if step == 0 {
130            return Err(FerroError::InvalidParameter {
131                name: "step".into(),
132                reason: "step must be at least 1".into(),
133            });
134        }
135        if n_features_to_select == 0 {
136            return Err(FerroError::InvalidParameter {
137                name: "n_features_to_select".into(),
138                reason: format!(
139                    "n_features_to_select ({n_features_to_select}) must be in [1, {n_features}]"
140                ),
141            });
142        }
143        // sklearn does NOT raise when n_features_to_select > n_features: it warns
144        // and keeps ALL features (`_rfe.py:290-297`), and the elimination loop
145        // (`_rfe.py:314`) never runs. Clamp so the loop below is a no-op and all
146        // features are kept. The UserWarning has no Rust analog (no log facade).
147        let n_features_to_select = n_features_to_select.min(n_features);
148
149        // Simulate the elimination process.
150        // Track which round each feature is eliminated in; features removed in
151        // the same step share the same rank. Selected features get rank 1,
152        // features removed in the last elimination round get rank 2, etc.
153        let mut ranking = vec![0usize; n_features];
154        let mut remaining: Vec<usize> = (0..n_features).collect();
155        let mut elimination_rounds: Vec<Vec<usize>> = Vec::new();
156
157        // Working copy of importances
158        let imp: Vec<F> = importances.iter().copied().collect();
159
160        while remaining.len() > n_features_to_select {
161            // Sort remaining by importance (ascending)
162            remaining.sort_by(|&a, &b| {
163                imp[a]
164                    .partial_cmp(&imp[b])
165                    .unwrap_or(std::cmp::Ordering::Equal)
166            });
167
168            // Remove the `step` least important features
169            let n_to_remove = step.min(remaining.len() - n_features_to_select);
170            let removed: Vec<usize> = remaining[..n_to_remove].to_vec();
171            elimination_rounds.push(removed);
172            remaining = remaining[n_to_remove..].to_vec();
173        }
174
175        // Assign ranks: selected features get rank 1, features removed in the
176        // last round get rank 2, second-to-last round gets rank 3, etc.
177        for &idx in &remaining {
178            ranking[idx] = 1;
179        }
180        for (round_idx, round) in elimination_rounds.iter().rev().enumerate() {
181            let rank = round_idx + 2;
182            for &idx in round {
183                ranking[idx] = rank;
184            }
185        }
186
187        let support: Vec<bool> = ranking.iter().map(|&r| r == 1).collect();
188        let mut selected_indices: Vec<usize> = remaining;
189        selected_indices.sort_unstable();
190
191        Ok(Self {
192            ranking,
193            support,
194            selected_indices,
195            n_features_in: n_features,
196            _marker: std::marker::PhantomData,
197        })
198    }
199
200    /// Return the feature ranking (1 = best, higher = eliminated earlier).
201    #[must_use]
202    pub fn ranking(&self) -> &[usize] {
203        &self.ranking
204    }
205
206    /// Return the boolean support mask.
207    #[must_use]
208    pub fn support(&self) -> &[bool] {
209        &self.support
210    }
211
212    /// Return the indices of the selected features.
213    #[must_use]
214    pub fn selected_indices(&self) -> &[usize] {
215        &self.selected_indices
216    }
217
218    /// Return the number of selected features.
219    #[must_use]
220    pub fn n_features_selected(&self) -> usize {
221        self.selected_indices.len()
222    }
223}
224
225impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for RFE<F> {
226    type Output = Array2<F>;
227    type Error = FerroError;
228
229    /// Return a matrix containing only the selected features.
230    ///
231    /// # Errors
232    ///
233    /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
234    /// from the number of features used at construction.
235    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
236        if x.ncols() != self.n_features_in {
237            return Err(FerroError::ShapeMismatch {
238                expected: vec![x.nrows(), self.n_features_in],
239                actual: vec![x.nrows(), x.ncols()],
240                context: "RFE::transform".into(),
241            });
242        }
243        Ok(select_columns(x, &self.selected_indices))
244    }
245}
246
247// ===========================================================================
248// RFECV
249// ===========================================================================
250
251/// Recursive Feature Elimination with Cross-Validation.
252///
253/// Like [`RFE`], but uses cross-validation scores to determine the optimal
254/// number of features. The user supplies a vector of per-feature-count CV
255/// scores (e.g., from running RFE with different `n_features_to_select`
256/// values), and RFECV picks the number that maximises the score.
257///
258/// # Examples
259///
260/// ```
261/// use ferrolearn_preprocess::rfe::RFECV;
262/// use ferrolearn_core::traits::Transform;
263/// use ndarray::array;
264///
265/// let importances = array![0.5, 0.3, 0.2];
266/// // CV scores for selecting 1, 2, 3 features:
267/// let cv_scores = vec![0.85, 0.95, 0.90];
268/// let rfecv = RFECV::<f64>::new(&importances, &cv_scores, 1).unwrap();
269/// // Best is 2 features (score 0.95)
270/// assert_eq!(rfecv.n_features_selected(), 2);
271/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
272/// let out = rfecv.transform(&x).unwrap();
273/// assert_eq!(out.ncols(), 2);
274/// ```
275#[derive(Debug, Clone)]
276pub struct RFECV<F> {
277    /// The underlying RFE with the optimal number of features.
278    rfe: RFE<F>,
279    /// CV scores for each number of features (1..=n_features).
280    cv_scores: Vec<f64>,
281    /// The optimal number of features (1-indexed).
282    optimal_n_features: usize,
283}
284
285impl<F: Float + Send + Sync + 'static> RFECV<F> {
286    /// Create a new `RFECV` from pre-computed importances and CV scores.
287    ///
288    /// # Parameters
289    ///
290    /// - `importances` — per-feature importance scores.
291    /// - `cv_scores` — CV score for each possible number of features
292    ///   (index 0 = 1 feature, index 1 = 2 features, ...).
293    /// - `step` — features removed per iteration.
294    ///
295    /// # Errors
296    ///
297    /// - [`FerroError::InvalidParameter`] if inputs are empty or mismatched.
298    pub fn new(
299        importances: &Array1<F>,
300        cv_scores: &[f64],
301        step: usize,
302    ) -> Result<Self, FerroError> {
303        let n_features = importances.len();
304        if n_features == 0 {
305            return Err(FerroError::InvalidParameter {
306                name: "importances".into(),
307                reason: "importance vector must not be empty".into(),
308            });
309        }
310        if cv_scores.len() != n_features {
311            return Err(FerroError::InvalidParameter {
312                name: "cv_scores".into(),
313                reason: format!(
314                    "cv_scores length ({}) must equal number of features ({})",
315                    cv_scores.len(),
316                    n_features
317                ),
318            });
319        }
320
321        // Find the optimal number of features (1-indexed)
322        let mut best_idx = 0;
323        let mut best_score = f64::NEG_INFINITY;
324        for (i, &score) in cv_scores.iter().enumerate() {
325            if score > best_score {
326                best_score = score;
327                best_idx = i;
328            }
329        }
330        let optimal_n_features = best_idx + 1;
331
332        let rfe = RFE::new(importances, optimal_n_features, step)?;
333
334        Ok(Self {
335            rfe,
336            cv_scores: cv_scores.to_vec(),
337            optimal_n_features,
338        })
339    }
340
341    /// Return the CV scores.
342    #[must_use]
343    pub fn cv_scores(&self) -> &[f64] {
344        &self.cv_scores
345    }
346
347    /// Return the optimal number of features.
348    #[must_use]
349    pub fn optimal_n_features(&self) -> usize {
350        self.optimal_n_features
351    }
352
353    /// Return the number of selected features.
354    #[must_use]
355    pub fn n_features_selected(&self) -> usize {
356        self.rfe.n_features_selected()
357    }
358
359    /// Return the feature ranking.
360    #[must_use]
361    pub fn ranking(&self) -> &[usize] {
362        self.rfe.ranking()
363    }
364
365    /// Return the boolean support mask.
366    #[must_use]
367    pub fn support(&self) -> &[bool] {
368        self.rfe.support()
369    }
370
371    /// Return the indices of the selected features.
372    #[must_use]
373    pub fn selected_indices(&self) -> &[usize] {
374        self.rfe.selected_indices()
375    }
376}
377
378impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for RFECV<F> {
379    type Output = Array2<F>;
380    type Error = FerroError;
381
382    /// Return a matrix containing only the optimally selected features.
383    ///
384    /// # Errors
385    ///
386    /// Returns [`FerroError::ShapeMismatch`] if column count does not match.
387    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
388        self.rfe.transform(x)
389    }
390}
391
392// ---------------------------------------------------------------------------
393// Tests
394// ---------------------------------------------------------------------------
395
396#[cfg(test)]
397mod tests {
398    use super::*;
399    use approx::assert_abs_diff_eq;
400    use ndarray::array;
401
402    // ========================================================================
403    // RFE tests
404    // ========================================================================
405
406    #[test]
407    fn test_rfe_basic_ranking() {
408        // Importances: [0.6, 0.3, 0.1]
409        // Select 1 feature, step 1
410        // Round 1: remove feature 2 (lowest 0.1) → remaining [0, 1]
411        // Round 2: remove feature 1 (lowest 0.3) → remaining [0]
412        let imp = array![0.6, 0.3, 0.1];
413        let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
414        assert_eq!(rfe.ranking(), &[1, 2, 3]);
415        assert_eq!(rfe.support(), &[true, false, false]);
416        assert_eq!(rfe.selected_indices(), &[0]);
417    }
418
419    #[test]
420    fn test_rfe_select_two() {
421        let imp = array![0.5, 0.3, 0.2];
422        let rfe = RFE::<f64>::new(&imp, 2, 1).unwrap();
423        assert_eq!(rfe.n_features_selected(), 2);
424        // Feature 2 (0.2) should be eliminated first
425        assert_eq!(rfe.ranking()[2], 2); // eliminated in round 1
426        assert_eq!(rfe.ranking()[0], 1);
427        assert_eq!(rfe.ranking()[1], 1);
428    }
429
430    #[test]
431    fn test_rfe_step_two() {
432        let imp = array![0.5, 0.3, 0.2, 0.1];
433        // Select 2, step 2: remove 2 features at once
434        let rfe = RFE::<f64>::new(&imp, 2, 2).unwrap();
435        assert_eq!(rfe.n_features_selected(), 2);
436        assert!(rfe.support()[0]);
437        assert!(rfe.support()[1]);
438        assert!(!rfe.support()[2]);
439        assert!(!rfe.support()[3]);
440    }
441
442    #[test]
443    fn test_rfe_transform() {
444        let imp = array![0.6, 0.3, 0.1];
445        let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
446        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
447        let out = rfe.transform(&x).unwrap();
448        assert_eq!(out.ncols(), 1);
449        // Feature 0 is selected
450        assert_abs_diff_eq!(out[[0, 0]], 1.0, epsilon = 1e-15);
451        assert_abs_diff_eq!(out[[1, 0]], 4.0, epsilon = 1e-15);
452    }
453
454    #[test]
455    fn test_rfe_all_features_selected() {
456        let imp = array![0.5, 0.3, 0.2];
457        let rfe = RFE::<f64>::new(&imp, 3, 1).unwrap();
458        assert_eq!(rfe.n_features_selected(), 3);
459        assert!(rfe.support().iter().all(|&s| s));
460    }
461
462    #[test]
463    fn test_rfe_empty_importances_error() {
464        let imp: Array1<f64> = Array1::zeros(0);
465        assert!(RFE::<f64>::new(&imp, 1, 1).is_err());
466    }
467
468    #[test]
469    fn test_rfe_zero_step_error() {
470        let imp = array![0.5, 0.3];
471        assert!(RFE::<f64>::new(&imp, 1, 0).is_err());
472    }
473
474    /// sklearn does not raise when `n_features_to_select > n_features`: it warns
475    /// and keeps ALL features (`sklearn/feature_selection/_rfe.py:290-297`), with
476    /// `support_` all true and `ranking_` all 1. We clamp to keep all features.
477    #[test]
478    fn test_rfe_n_features_too_large_keeps_all() {
479        let imp = array![0.5, 0.3];
480        let result = RFE::<f64>::new(&imp, 5, 1);
481        assert!(result.is_ok());
482        if let Ok(rfe) = result {
483            assert!(rfe.support().iter().all(|&s| s));
484            assert!(rfe.ranking().iter().all(|&r| r == 1));
485            assert_eq!(rfe.n_features_selected(), 2);
486        }
487    }
488
489    #[test]
490    fn test_rfe_n_features_zero_error() {
491        let imp = array![0.5, 0.3];
492        assert!(RFE::<f64>::new(&imp, 0, 1).is_err());
493    }
494
495    #[test]
496    fn test_rfe_shape_mismatch_error() {
497        let imp = array![0.5, 0.3];
498        let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
499        let x_bad = array![[1.0, 2.0, 3.0]];
500        assert!(rfe.transform(&x_bad).is_err());
501    }
502
503    // ========================================================================
504    // RFECV tests
505    // ========================================================================
506
507    #[test]
508    fn test_rfecv_selects_optimal() {
509        let imp = array![0.5, 0.3, 0.2];
510        // Best CV score at 2 features
511        let cv_scores = vec![0.85, 0.95, 0.90];
512        let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
513        assert_eq!(rfecv.optimal_n_features(), 2);
514        assert_eq!(rfecv.n_features_selected(), 2);
515    }
516
517    #[test]
518    fn test_rfecv_transform() {
519        let imp = array![0.5, 0.3, 0.2];
520        let cv_scores = vec![0.85, 0.95, 0.90];
521        let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
522        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
523        let out = rfecv.transform(&x).unwrap();
524        assert_eq!(out.ncols(), 2);
525    }
526
527    #[test]
528    fn test_rfecv_cv_scores_accessor() {
529        let imp = array![0.5, 0.3];
530        let cv_scores = vec![0.9, 0.8];
531        let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
532        assert_eq!(rfecv.cv_scores(), &[0.9, 0.8]);
533        // Best is 1 feature (score 0.9)
534        assert_eq!(rfecv.optimal_n_features(), 1);
535    }
536
537    #[test]
538    fn test_rfecv_mismatched_scores_error() {
539        let imp = array![0.5, 0.3, 0.2];
540        let cv_scores = vec![0.85, 0.95]; // wrong length
541        assert!(RFECV::<f64>::new(&imp, &cv_scores, 1).is_err());
542    }
543
544    #[test]
545    fn test_rfecv_empty_importances_error() {
546        let imp: Array1<f64> = Array1::zeros(0);
547        let cv_scores: Vec<f64> = vec![];
548        assert!(RFECV::<f64>::new(&imp, &cv_scores, 1).is_err());
549    }
550
551    #[test]
552    fn test_rfecv_ranking_and_support() {
553        let imp = array![0.5, 0.3, 0.2];
554        let cv_scores = vec![0.80, 0.95, 0.90];
555        let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
556        assert_eq!(rfecv.n_features_selected(), 2);
557        let support = rfecv.support();
558        assert_eq!(support.iter().filter(|&&s| s).count(), 2);
559    }
560}