ferrolearn_preprocess/rfe.rs
1//! Recursive Feature Elimination (RFE) and RFE with Cross-Validation (RFECV).
2//!
3//! [`RFE`] recursively removes the least-important features, ranking features
4//! by their importance at each elimination step. The importance is determined by
5//! an external importance vector that the user supplies via a callback.
6//!
7//! [`RFECV`] extends RFE by using cross-validation to find the optimal number
8//! of features to retain.
9//!
10//! Because `ferrolearn-preprocess` cannot depend on estimator crates (to avoid
11//! circular dependencies), these implementations accept feature importance
12//! vectors directly rather than wrapping fitted estimators.
13//!
14//! Translation target: scikit-learn 1.5.2 `class RFE` / `class RFECV`
15//! (`sklearn/feature_selection/_rfe.py`). Design: `.design/preprocess/rfe.md`.
16//! Tracking: #1294.
17//!
18//! Note: ferrolearn takes a static importance vector (RFE) / pre-computed CV
19//! scores (RFECV); sklearn wraps an estimator and re-fits it each elimination
20//! round, recomputing squared importances (RFE), and runs cross-validation
21//! internally (RFECV). The ranking/elimination + optimal-count SHAPE matches;
22//! the estimator/re-fit/CV machinery is NOT-STARTED (#1295/#1300).
23//!
24//! `## REQ status`
25//!
26//! | REQ | Status | Anchor |
27//! |---|---|---|
28//! | REQ-1 RFE ranking/support/elimination (static importances) | SHIPPED | `RFE::new`; sklearn `_rfe.py:337`,`:345-346` |
29//! | REQ-2 RFE transform + error contracts (scoped) | SHIPPED | `RFE::new` / `transform` guards; sklearn `_rfe.py:199-212` |
30//! | REQ-4 `n_features_to_select > n_features` → keep all | SHIPPED (#1296) | `RFE::new` clamp; sklearn `_rfe.py:290-297`,`:314` |
31//! | REQ-9 RFECV optimal-count selection (static scores) | SHIPPED | `RFECV::new` argmax first-max; sklearn `_rfe.py:786-788` |
32//! | REQ-3 wrapped estimator + per-round re-fit + squared importances | NOT-STARTED (#1295) | sklearn `_rfe.py:319-331` |
33//! | REQ-5 `n_features_to_select=None` default + float fraction | NOT-STARTED (#1297) | sklearn `_rfe.py:286-299` |
34//! | REQ-6 float `step` in (0,1) | NOT-STARTED (#1298) | sklearn `_rfe.py:301-302` |
35//! | REQ-7 `importance_getter` + `verbose` | NOT-STARTED (#1299) | sklearn `_rfe.py:211`,`:326-330` |
36//! | REQ-8 RFECV internal cross-validation | NOT-STARTED (#1300) | sklearn `_rfe.py:736-815` |
37//! | REQ-10 SelectorMixin surface + `estimator_`/`n_features_` | NOT-STARTED (#1301) | sklearn `_rfe.py:348-359` |
38//! | REQ-11 PyO3 binding | NOT-STARTED (#1302) | `ferrolearn-python/src/` (absent) |
39//! | REQ-12 ferray substrate | NOT-STARTED (#1303) | R-SUBSTRATE |
40
41use ferrolearn_core::error::FerroError;
42use ferrolearn_core::traits::Transform;
43use ndarray::{Array1, Array2};
44use num_traits::Float;
45
46/// Build a new `Array2<F>` containing only the columns listed in `indices`.
47fn select_columns<F: Float>(x: &Array2<F>, indices: &[usize]) -> Array2<F> {
48 let nrows = x.nrows();
49 let ncols = indices.len();
50 if ncols == 0 {
51 return Array2::zeros((nrows, 0));
52 }
53 let mut out = Array2::zeros((nrows, ncols));
54 for (new_j, &old_j) in indices.iter().enumerate() {
55 for i in 0..nrows {
56 out[[i, new_j]] = x[[i, old_j]];
57 }
58 }
59 out
60}
61
62// ===========================================================================
63// RFE
64// ===========================================================================
65
66/// Recursive Feature Elimination.
67///
68/// Starting from all features, repeatedly removes the `step` least-important
69/// features until `n_features_to_select` features remain. The ranking is
70/// determined by the importance vector supplied at construction.
71///
72/// # Examples
73///
74/// ```
75/// use ferrolearn_preprocess::rfe::RFE;
76/// use ferrolearn_core::traits::Transform;
77/// use ndarray::array;
78///
79/// // Feature importances: feature 0 is most important, feature 2 least
80/// let importances = array![0.6, 0.3, 0.1];
81/// let rfe = RFE::<f64>::new(&importances, 1, 1).unwrap();
82/// assert_eq!(rfe.support(), &[true, false, false]);
83/// assert_eq!(rfe.ranking(), &[1, 2, 3]);
84/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
85/// let out = rfe.transform(&x).unwrap();
86/// assert_eq!(out.ncols(), 1);
87/// ```
88#[derive(Debug, Clone)]
89pub struct RFE<F> {
90 /// Feature ranking: `ranking[j]` is the rank of feature `j` (1 = selected).
91 ranking: Vec<usize>,
92 /// Boolean mask: `support[j]` is `true` if feature `j` is selected.
93 support: Vec<bool>,
94 /// Indices of the selected features (sorted).
95 selected_indices: Vec<usize>,
96 /// Original number of features.
97 n_features_in: usize,
98 _marker: std::marker::PhantomData<F>,
99}
100
101impl<F: Float + Send + Sync + 'static> RFE<F> {
102 /// Create a new `RFE` from pre-computed feature importances.
103 ///
104 /// # Parameters
105 ///
106 /// - `importances` — per-feature importance scores (higher = more important).
107 /// - `n_features_to_select` — number of features to keep.
108 /// - `step` — number of features to remove per iteration.
109 ///
110 /// # Errors
111 ///
112 /// - [`FerroError::InvalidParameter`] if `importances` is empty, `step` is
113 /// zero, or `n_features_to_select` is zero. If `n_features_to_select`
114 /// exceeds the number of features it is clamped (all features kept),
115 /// matching scikit-learn's warn-and-keep-all behavior
116 /// (`_rfe.py:290-297`).
117 pub fn new(
118 importances: &Array1<F>,
119 n_features_to_select: usize,
120 step: usize,
121 ) -> Result<Self, FerroError> {
122 let n_features = importances.len();
123 if n_features == 0 {
124 return Err(FerroError::InvalidParameter {
125 name: "importances".into(),
126 reason: "importance vector must not be empty".into(),
127 });
128 }
129 if step == 0 {
130 return Err(FerroError::InvalidParameter {
131 name: "step".into(),
132 reason: "step must be at least 1".into(),
133 });
134 }
135 if n_features_to_select == 0 {
136 return Err(FerroError::InvalidParameter {
137 name: "n_features_to_select".into(),
138 reason: format!(
139 "n_features_to_select ({n_features_to_select}) must be in [1, {n_features}]"
140 ),
141 });
142 }
143 // sklearn does NOT raise when n_features_to_select > n_features: it warns
144 // and keeps ALL features (`_rfe.py:290-297`), and the elimination loop
145 // (`_rfe.py:314`) never runs. Clamp so the loop below is a no-op and all
146 // features are kept. The UserWarning has no Rust analog (no log facade).
147 let n_features_to_select = n_features_to_select.min(n_features);
148
149 // Simulate the elimination process.
150 // Track which round each feature is eliminated in; features removed in
151 // the same step share the same rank. Selected features get rank 1,
152 // features removed in the last elimination round get rank 2, etc.
153 let mut ranking = vec![0usize; n_features];
154 let mut remaining: Vec<usize> = (0..n_features).collect();
155 let mut elimination_rounds: Vec<Vec<usize>> = Vec::new();
156
157 // Working copy of importances
158 let imp: Vec<F> = importances.iter().copied().collect();
159
160 while remaining.len() > n_features_to_select {
161 // Sort remaining by importance (ascending)
162 remaining.sort_by(|&a, &b| {
163 imp[a]
164 .partial_cmp(&imp[b])
165 .unwrap_or(std::cmp::Ordering::Equal)
166 });
167
168 // Remove the `step` least important features
169 let n_to_remove = step.min(remaining.len() - n_features_to_select);
170 let removed: Vec<usize> = remaining[..n_to_remove].to_vec();
171 elimination_rounds.push(removed);
172 remaining = remaining[n_to_remove..].to_vec();
173 }
174
175 // Assign ranks: selected features get rank 1, features removed in the
176 // last round get rank 2, second-to-last round gets rank 3, etc.
177 for &idx in &remaining {
178 ranking[idx] = 1;
179 }
180 for (round_idx, round) in elimination_rounds.iter().rev().enumerate() {
181 let rank = round_idx + 2;
182 for &idx in round {
183 ranking[idx] = rank;
184 }
185 }
186
187 let support: Vec<bool> = ranking.iter().map(|&r| r == 1).collect();
188 let mut selected_indices: Vec<usize> = remaining;
189 selected_indices.sort_unstable();
190
191 Ok(Self {
192 ranking,
193 support,
194 selected_indices,
195 n_features_in: n_features,
196 _marker: std::marker::PhantomData,
197 })
198 }
199
200 /// Return the feature ranking (1 = best, higher = eliminated earlier).
201 #[must_use]
202 pub fn ranking(&self) -> &[usize] {
203 &self.ranking
204 }
205
206 /// Return the boolean support mask.
207 #[must_use]
208 pub fn support(&self) -> &[bool] {
209 &self.support
210 }
211
212 /// Return the indices of the selected features.
213 #[must_use]
214 pub fn selected_indices(&self) -> &[usize] {
215 &self.selected_indices
216 }
217
218 /// Return the number of selected features.
219 #[must_use]
220 pub fn n_features_selected(&self) -> usize {
221 self.selected_indices.len()
222 }
223}
224
225impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for RFE<F> {
226 type Output = Array2<F>;
227 type Error = FerroError;
228
229 /// Return a matrix containing only the selected features.
230 ///
231 /// # Errors
232 ///
233 /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
234 /// from the number of features used at construction.
235 fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
236 if x.ncols() != self.n_features_in {
237 return Err(FerroError::ShapeMismatch {
238 expected: vec![x.nrows(), self.n_features_in],
239 actual: vec![x.nrows(), x.ncols()],
240 context: "RFE::transform".into(),
241 });
242 }
243 Ok(select_columns(x, &self.selected_indices))
244 }
245}
246
247// ===========================================================================
248// RFECV
249// ===========================================================================
250
251/// Recursive Feature Elimination with Cross-Validation.
252///
253/// Like [`RFE`], but uses cross-validation scores to determine the optimal
254/// number of features. The user supplies a vector of per-feature-count CV
255/// scores (e.g., from running RFE with different `n_features_to_select`
256/// values), and RFECV picks the number that maximises the score.
257///
258/// # Examples
259///
260/// ```
261/// use ferrolearn_preprocess::rfe::RFECV;
262/// use ferrolearn_core::traits::Transform;
263/// use ndarray::array;
264///
265/// let importances = array![0.5, 0.3, 0.2];
266/// // CV scores for selecting 1, 2, 3 features:
267/// let cv_scores = vec![0.85, 0.95, 0.90];
268/// let rfecv = RFECV::<f64>::new(&importances, &cv_scores, 1).unwrap();
269/// // Best is 2 features (score 0.95)
270/// assert_eq!(rfecv.n_features_selected(), 2);
271/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
272/// let out = rfecv.transform(&x).unwrap();
273/// assert_eq!(out.ncols(), 2);
274/// ```
275#[derive(Debug, Clone)]
276pub struct RFECV<F> {
277 /// The underlying RFE with the optimal number of features.
278 rfe: RFE<F>,
279 /// CV scores for each number of features (1..=n_features).
280 cv_scores: Vec<f64>,
281 /// The optimal number of features (1-indexed).
282 optimal_n_features: usize,
283}
284
285impl<F: Float + Send + Sync + 'static> RFECV<F> {
286 /// Create a new `RFECV` from pre-computed importances and CV scores.
287 ///
288 /// # Parameters
289 ///
290 /// - `importances` — per-feature importance scores.
291 /// - `cv_scores` — CV score for each possible number of features
292 /// (index 0 = 1 feature, index 1 = 2 features, ...).
293 /// - `step` — features removed per iteration.
294 ///
295 /// # Errors
296 ///
297 /// - [`FerroError::InvalidParameter`] if inputs are empty or mismatched.
298 pub fn new(
299 importances: &Array1<F>,
300 cv_scores: &[f64],
301 step: usize,
302 ) -> Result<Self, FerroError> {
303 let n_features = importances.len();
304 if n_features == 0 {
305 return Err(FerroError::InvalidParameter {
306 name: "importances".into(),
307 reason: "importance vector must not be empty".into(),
308 });
309 }
310 if cv_scores.len() != n_features {
311 return Err(FerroError::InvalidParameter {
312 name: "cv_scores".into(),
313 reason: format!(
314 "cv_scores length ({}) must equal number of features ({})",
315 cv_scores.len(),
316 n_features
317 ),
318 });
319 }
320
321 // Find the optimal number of features (1-indexed)
322 let mut best_idx = 0;
323 let mut best_score = f64::NEG_INFINITY;
324 for (i, &score) in cv_scores.iter().enumerate() {
325 if score > best_score {
326 best_score = score;
327 best_idx = i;
328 }
329 }
330 let optimal_n_features = best_idx + 1;
331
332 let rfe = RFE::new(importances, optimal_n_features, step)?;
333
334 Ok(Self {
335 rfe,
336 cv_scores: cv_scores.to_vec(),
337 optimal_n_features,
338 })
339 }
340
341 /// Return the CV scores.
342 #[must_use]
343 pub fn cv_scores(&self) -> &[f64] {
344 &self.cv_scores
345 }
346
347 /// Return the optimal number of features.
348 #[must_use]
349 pub fn optimal_n_features(&self) -> usize {
350 self.optimal_n_features
351 }
352
353 /// Return the number of selected features.
354 #[must_use]
355 pub fn n_features_selected(&self) -> usize {
356 self.rfe.n_features_selected()
357 }
358
359 /// Return the feature ranking.
360 #[must_use]
361 pub fn ranking(&self) -> &[usize] {
362 self.rfe.ranking()
363 }
364
365 /// Return the boolean support mask.
366 #[must_use]
367 pub fn support(&self) -> &[bool] {
368 self.rfe.support()
369 }
370
371 /// Return the indices of the selected features.
372 #[must_use]
373 pub fn selected_indices(&self) -> &[usize] {
374 self.rfe.selected_indices()
375 }
376}
377
378impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for RFECV<F> {
379 type Output = Array2<F>;
380 type Error = FerroError;
381
382 /// Return a matrix containing only the optimally selected features.
383 ///
384 /// # Errors
385 ///
386 /// Returns [`FerroError::ShapeMismatch`] if column count does not match.
387 fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
388 self.rfe.transform(x)
389 }
390}
391
392// ---------------------------------------------------------------------------
393// Tests
394// ---------------------------------------------------------------------------
395
396#[cfg(test)]
397mod tests {
398 use super::*;
399 use approx::assert_abs_diff_eq;
400 use ndarray::array;
401
402 // ========================================================================
403 // RFE tests
404 // ========================================================================
405
406 #[test]
407 fn test_rfe_basic_ranking() {
408 // Importances: [0.6, 0.3, 0.1]
409 // Select 1 feature, step 1
410 // Round 1: remove feature 2 (lowest 0.1) → remaining [0, 1]
411 // Round 2: remove feature 1 (lowest 0.3) → remaining [0]
412 let imp = array![0.6, 0.3, 0.1];
413 let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
414 assert_eq!(rfe.ranking(), &[1, 2, 3]);
415 assert_eq!(rfe.support(), &[true, false, false]);
416 assert_eq!(rfe.selected_indices(), &[0]);
417 }
418
419 #[test]
420 fn test_rfe_select_two() {
421 let imp = array![0.5, 0.3, 0.2];
422 let rfe = RFE::<f64>::new(&imp, 2, 1).unwrap();
423 assert_eq!(rfe.n_features_selected(), 2);
424 // Feature 2 (0.2) should be eliminated first
425 assert_eq!(rfe.ranking()[2], 2); // eliminated in round 1
426 assert_eq!(rfe.ranking()[0], 1);
427 assert_eq!(rfe.ranking()[1], 1);
428 }
429
430 #[test]
431 fn test_rfe_step_two() {
432 let imp = array![0.5, 0.3, 0.2, 0.1];
433 // Select 2, step 2: remove 2 features at once
434 let rfe = RFE::<f64>::new(&imp, 2, 2).unwrap();
435 assert_eq!(rfe.n_features_selected(), 2);
436 assert!(rfe.support()[0]);
437 assert!(rfe.support()[1]);
438 assert!(!rfe.support()[2]);
439 assert!(!rfe.support()[3]);
440 }
441
442 #[test]
443 fn test_rfe_transform() {
444 let imp = array![0.6, 0.3, 0.1];
445 let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
446 let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
447 let out = rfe.transform(&x).unwrap();
448 assert_eq!(out.ncols(), 1);
449 // Feature 0 is selected
450 assert_abs_diff_eq!(out[[0, 0]], 1.0, epsilon = 1e-15);
451 assert_abs_diff_eq!(out[[1, 0]], 4.0, epsilon = 1e-15);
452 }
453
454 #[test]
455 fn test_rfe_all_features_selected() {
456 let imp = array![0.5, 0.3, 0.2];
457 let rfe = RFE::<f64>::new(&imp, 3, 1).unwrap();
458 assert_eq!(rfe.n_features_selected(), 3);
459 assert!(rfe.support().iter().all(|&s| s));
460 }
461
462 #[test]
463 fn test_rfe_empty_importances_error() {
464 let imp: Array1<f64> = Array1::zeros(0);
465 assert!(RFE::<f64>::new(&imp, 1, 1).is_err());
466 }
467
468 #[test]
469 fn test_rfe_zero_step_error() {
470 let imp = array![0.5, 0.3];
471 assert!(RFE::<f64>::new(&imp, 1, 0).is_err());
472 }
473
474 /// sklearn does not raise when `n_features_to_select > n_features`: it warns
475 /// and keeps ALL features (`sklearn/feature_selection/_rfe.py:290-297`), with
476 /// `support_` all true and `ranking_` all 1. We clamp to keep all features.
477 #[test]
478 fn test_rfe_n_features_too_large_keeps_all() {
479 let imp = array![0.5, 0.3];
480 let result = RFE::<f64>::new(&imp, 5, 1);
481 assert!(result.is_ok());
482 if let Ok(rfe) = result {
483 assert!(rfe.support().iter().all(|&s| s));
484 assert!(rfe.ranking().iter().all(|&r| r == 1));
485 assert_eq!(rfe.n_features_selected(), 2);
486 }
487 }
488
489 #[test]
490 fn test_rfe_n_features_zero_error() {
491 let imp = array![0.5, 0.3];
492 assert!(RFE::<f64>::new(&imp, 0, 1).is_err());
493 }
494
495 #[test]
496 fn test_rfe_shape_mismatch_error() {
497 let imp = array![0.5, 0.3];
498 let rfe = RFE::<f64>::new(&imp, 1, 1).unwrap();
499 let x_bad = array![[1.0, 2.0, 3.0]];
500 assert!(rfe.transform(&x_bad).is_err());
501 }
502
503 // ========================================================================
504 // RFECV tests
505 // ========================================================================
506
507 #[test]
508 fn test_rfecv_selects_optimal() {
509 let imp = array![0.5, 0.3, 0.2];
510 // Best CV score at 2 features
511 let cv_scores = vec![0.85, 0.95, 0.90];
512 let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
513 assert_eq!(rfecv.optimal_n_features(), 2);
514 assert_eq!(rfecv.n_features_selected(), 2);
515 }
516
517 #[test]
518 fn test_rfecv_transform() {
519 let imp = array![0.5, 0.3, 0.2];
520 let cv_scores = vec![0.85, 0.95, 0.90];
521 let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
522 let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
523 let out = rfecv.transform(&x).unwrap();
524 assert_eq!(out.ncols(), 2);
525 }
526
527 #[test]
528 fn test_rfecv_cv_scores_accessor() {
529 let imp = array![0.5, 0.3];
530 let cv_scores = vec![0.9, 0.8];
531 let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
532 assert_eq!(rfecv.cv_scores(), &[0.9, 0.8]);
533 // Best is 1 feature (score 0.9)
534 assert_eq!(rfecv.optimal_n_features(), 1);
535 }
536
537 #[test]
538 fn test_rfecv_mismatched_scores_error() {
539 let imp = array![0.5, 0.3, 0.2];
540 let cv_scores = vec![0.85, 0.95]; // wrong length
541 assert!(RFECV::<f64>::new(&imp, &cv_scores, 1).is_err());
542 }
543
544 #[test]
545 fn test_rfecv_empty_importances_error() {
546 let imp: Array1<f64> = Array1::zeros(0);
547 let cv_scores: Vec<f64> = vec![];
548 assert!(RFECV::<f64>::new(&imp, &cv_scores, 1).is_err());
549 }
550
551 #[test]
552 fn test_rfecv_ranking_and_support() {
553 let imp = array![0.5, 0.3, 0.2];
554 let cv_scores = vec![0.80, 0.95, 0.90];
555 let rfecv = RFECV::<f64>::new(&imp, &cv_scores, 1).unwrap();
556 assert_eq!(rfecv.n_features_selected(), 2);
557 let support = rfecv.support();
558 assert_eq!(support.iter().filter(|&&s| s).count(), 2);
559 }
560}