use anofox_ml_core::{Result, RustMlError};
use ndarray::{Array1, Array2};
pub type ImportanceFn = dyn Fn(&Array2<f64>, &Array1<f64>) -> Result<Array1<f64>> + Send + Sync;
pub struct Rfe {
pub n_features_to_select: usize,
pub step: usize,
importance: Box<ImportanceFn>,
}
impl Rfe {
pub fn new<F>(n_features_to_select: usize, importance_fn: F) -> Self
where
F: Fn(&Array2<f64>, &Array1<f64>) -> Result<Array1<f64>> + Send + Sync + 'static,
{
Self {
n_features_to_select,
step: 1,
importance: Box::new(importance_fn),
}
}
pub fn with_step(mut self, step: usize) -> Self {
self.step = step;
self
}
}
pub struct FittedRfe {
pub support: Vec<bool>,
pub ranking: Vec<usize>,
}
impl FittedRfe {
pub fn transform(&self, x: &Array2<f64>) -> Array2<f64> {
let cols: Vec<usize> = self
.support
.iter()
.enumerate()
.filter(|(_, &b)| b)
.map(|(i, _)| i)
.collect();
let mut out = Array2::<f64>::zeros((x.nrows(), cols.len()));
for (k, &c) in cols.iter().enumerate() {
for i in 0..x.nrows() {
out[[i, k]] = x[[i, c]];
}
}
out
}
}
fn select_cols(x: &Array2<f64>, cols: &[usize]) -> Array2<f64> {
let mut out = Array2::<f64>::zeros((x.nrows(), cols.len()));
for (k, &c) in cols.iter().enumerate() {
for i in 0..x.nrows() {
out[[i, k]] = x[[i, c]];
}
}
out
}
impl Rfe {
pub fn fit(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<FittedRfe> {
if x.nrows() != y.len() {
return Err(RustMlError::ShapeMismatch(format!(
"X has {} rows but y has {}",
x.nrows(),
y.len()
)));
}
let d = x.ncols();
if self.n_features_to_select == 0 || self.n_features_to_select > d {
return Err(RustMlError::InvalidParameter(format!(
"n_features_to_select must be in 1..={}",
d
)));
}
let mut active: Vec<usize> = (0..d).collect();
let mut ranking = vec![0usize; d];
while active.len() > self.n_features_to_select {
let sub = select_cols(x, &active);
let imp = (self.importance)(&sub, y)?;
if imp.len() != active.len() {
return Err(RustMlError::InvalidParameter(
"importance function returned wrong length".into(),
));
}
let n_drop = self.step.min(active.len() - self.n_features_to_select);
let mut order: Vec<usize> = (0..active.len()).collect();
order.sort_by(|&a, &b| imp[a].abs().partial_cmp(&imp[b].abs()).unwrap());
let to_drop: Vec<usize> = order[..n_drop].iter().map(|&i| active[i]).collect();
for &j in &to_drop {
ranking[j] = active.len(); }
active.retain(|i| !to_drop.contains(i));
}
for &j in &active {
ranking[j] = 1;
}
let mut support = vec![false; d];
for &j in &active {
support[j] = true;
}
Ok(FittedRfe { support, ranking })
}
}
pub struct Rfecv {
pub min_features_to_select: usize,
pub step: usize,
pub cv_folds: usize,
importance: Box<ImportanceFn>,
score: Box<ScoringFn>,
}
impl Rfecv {
pub fn new<I, S>(min_features_to_select: usize, importance_fn: I, score: S) -> Self
where
I: Fn(&Array2<f64>, &Array1<f64>) -> Result<Array1<f64>> + Send + Sync + 'static,
S: Fn(&Array2<f64>, &Array1<f64>) -> Result<f64> + Send + Sync + 'static,
{
Self {
min_features_to_select,
step: 1,
cv_folds: 5,
importance: Box::new(importance_fn),
score: Box::new(score),
}
}
pub fn with_cv_folds(mut self, k: usize) -> Self {
self.cv_folds = k;
self
}
pub fn with_step(mut self, step: usize) -> Self {
self.step = step;
self
}
pub fn fit(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<FittedRfecv> {
let n = x.nrows();
let d = x.ncols();
let k = self.cv_folds.min(n);
let folds = kfold(n, k);
let mut mean_scores = Vec::with_capacity(d - self.min_features_to_select + 1);
let mut sizes = Vec::new();
for size in self.min_features_to_select..=d {
let mut scores = Vec::with_capacity(k);
for (train_idx, test_idx) in &folds {
let x_train = select_rows(x, train_idx);
let y_train = select_elements(y, train_idx);
let x_test = select_rows(x, test_idx);
let y_test = select_elements(y, test_idx);
let support = run_rfe(
&x_train,
&y_train,
size,
self.step,
self.importance.as_ref(),
)?;
let x_test_sel = select_cols_mask(&x_test, &support);
let s = (self.score)(&x_test_sel, &y_test)?;
scores.push(s);
}
let mean = scores.iter().sum::<f64>() / scores.len() as f64;
mean_scores.push(mean);
sizes.push(size);
}
let (best_i, _) = mean_scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.unwrap();
let best_size = sizes[best_i];
let support = run_rfe(x, y, best_size, self.step, self.importance.as_ref())?;
let mut ranking = vec![0usize; d];
for (i, &b) in support.iter().enumerate() {
ranking[i] = if b { 1 } else { 2 };
}
Ok(FittedRfecv {
n_features_selected: best_size,
cv_scores: mean_scores,
sizes,
inner: FittedRfe { support, ranking },
})
}
}
fn run_rfe(
x: &Array2<f64>,
y: &Array1<f64>,
n_features_to_select: usize,
step: usize,
importance: &ImportanceFn,
) -> Result<Vec<bool>> {
let d = x.ncols();
let mut active: Vec<usize> = (0..d).collect();
while active.len() > n_features_to_select {
let sub = select_cols(x, &active);
let imp = importance(&sub, y)?;
let n_drop = step.min(active.len() - n_features_to_select);
let mut order: Vec<usize> = (0..active.len()).collect();
order.sort_by(|&a, &b| imp[a].abs().partial_cmp(&imp[b].abs()).unwrap());
let to_drop: Vec<usize> = order[..n_drop].iter().map(|&i| active[i]).collect();
active.retain(|i| !to_drop.contains(i));
}
let mut support = vec![false; d];
for &j in &active {
support[j] = true;
}
Ok(support)
}
fn select_cols_mask(x: &Array2<f64>, mask: &[bool]) -> Array2<f64> {
let cols: Vec<usize> = mask
.iter()
.enumerate()
.filter(|(_, &b)| b)
.map(|(i, _)| i)
.collect();
let mut out = Array2::<f64>::zeros((x.nrows(), cols.len()));
for (k, &c) in cols.iter().enumerate() {
for i in 0..x.nrows() {
out[[i, k]] = x[[i, c]];
}
}
out
}
pub struct FittedRfecv {
pub n_features_selected: usize,
pub cv_scores: Vec<f64>,
pub sizes: Vec<usize>,
pub inner: FittedRfe,
}
impl FittedRfecv {
pub fn transform(&self, x: &Array2<f64>) -> Array2<f64> {
self.inner.transform(x)
}
}
fn kfold(n: usize, k: usize) -> Vec<(Vec<usize>, Vec<usize>)> {
let fold_size = n / k;
let rem = n % k;
let mut folds = Vec::with_capacity(k);
let mut start = 0;
for f in 0..k {
let end = start + fold_size + if f < rem { 1 } else { 0 };
let test: Vec<usize> = (start..end).collect();
let train: Vec<usize> = (0..start).chain(end..n).collect();
folds.push((train, test));
start = end;
}
folds
}
fn select_rows(x: &Array2<f64>, idx: &[usize]) -> Array2<f64> {
let mut out = Array2::<f64>::zeros((idx.len(), x.ncols()));
for (k, &i) in idx.iter().enumerate() {
for j in 0..x.ncols() {
out[[k, j]] = x[[i, j]];
}
}
out
}
fn select_elements(y: &Array1<f64>, idx: &[usize]) -> Array1<f64> {
Array1::from_vec(idx.iter().map(|&i| y[i]).collect())
}
pub type ScoringFn = dyn Fn(&Array2<f64>, &Array1<f64>) -> Result<f64> + Send + Sync;
pub struct SequentialFeatureSelector {
pub n_features_to_select: usize,
score: Box<ScoringFn>,
}
impl SequentialFeatureSelector {
pub fn new<F>(n_features_to_select: usize, score: F) -> Self
where
F: Fn(&Array2<f64>, &Array1<f64>) -> Result<f64> + Send + Sync + 'static,
{
Self {
n_features_to_select,
score: Box::new(score),
}
}
}
pub struct FittedSequentialFeatureSelector {
pub support: Vec<bool>,
}
impl FittedSequentialFeatureSelector {
pub fn transform(&self, x: &Array2<f64>) -> Array2<f64> {
let cols: Vec<usize> = self
.support
.iter()
.enumerate()
.filter(|(_, &b)| b)
.map(|(i, _)| i)
.collect();
let mut out = Array2::<f64>::zeros((x.nrows(), cols.len()));
for (k, &c) in cols.iter().enumerate() {
for i in 0..x.nrows() {
out[[i, k]] = x[[i, c]];
}
}
out
}
}
impl SequentialFeatureSelector {
pub fn fit(&self, x: &Array2<f64>, y: &Array1<f64>) -> Result<FittedSequentialFeatureSelector> {
let d = x.ncols();
if self.n_features_to_select == 0 || self.n_features_to_select > d {
return Err(RustMlError::InvalidParameter("invalid k".into()));
}
let mut selected: Vec<usize> = Vec::with_capacity(self.n_features_to_select);
let mut remaining: Vec<usize> = (0..d).collect();
while selected.len() < self.n_features_to_select {
let mut best_score = f64::NEG_INFINITY;
let mut best_j = remaining[0];
for (i, &j) in remaining.iter().enumerate() {
let mut cols = selected.clone();
cols.push(j);
let sub = select_cols(x, &cols);
let s = (self.score)(&sub, y)?;
if s > best_score {
best_score = s;
best_j = j;
let _ = i; }
}
selected.push(best_j);
remaining.retain(|&j| j != best_j);
}
let mut support = vec![false; d];
for &j in &selected {
support[j] = true;
}
Ok(FittedSequentialFeatureSelector { support })
}
}
#[cfg(test)]
mod tests {
use super::*;
use ndarray::array;
fn dummy_importance(x: &Array2<f64>, y: &Array1<f64>) -> Result<Array1<f64>> {
let n = x.nrows() as f64;
let mut out = Array1::<f64>::zeros(x.ncols());
let y_mean = y.sum() / n;
for j in 0..x.ncols() {
let m = x.column(j).sum() / n;
let mut num = 0.0;
let mut sx = 0.0;
let mut sy = 0.0;
for i in 0..x.nrows() {
let dx = x[[i, j]] - m;
let dy = y[i] - y_mean;
num += dx * dy;
sx += dx * dx;
sy += dy * dy;
}
let den = (sx * sy).sqrt().max(1e-12);
out[j] = (num / den).abs();
}
Ok(out)
}
#[test]
fn test_rfe_keeps_correlated_features() {
let n = 40;
let mut xv = Vec::new();
let mut yv = Vec::new();
for i in 0..n {
let x0 = (i as f64) - 20.0;
let x1 = ((i * 11 % 13) as f64) - 6.0;
let x2 = ((i * 7 % 17) as f64) - 8.0;
let x3 = ((i * 5 % 11) as f64) - 5.0;
xv.extend([x0, x1, x2, x3]);
yv.push(3.0 * x0 + 2.0 * x2);
}
let x = Array2::from_shape_vec((n, 4), xv).unwrap();
let y = Array1::from_vec(yv);
let rfe = Rfe::new(2, dummy_importance);
let fitted = rfe.fit(&x, &y).unwrap();
assert!(fitted.support[0]);
assert!(fitted.support[2]);
assert!(!fitted.support[1]);
assert!(!fitted.support[3]);
let _ = array![1.0_f64];
}
#[test]
fn test_rfecv_finds_2_informative_features() {
let n = 60;
let mut xv = Vec::new();
let mut yv = Vec::new();
for i in 0..n {
let x0 = (i as f64) - 30.0;
let x1 = ((i * 11 % 13) as f64) - 6.0;
let x2 = ((i * 7 % 17) as f64) - 8.0;
let x3 = ((i * 5 % 11) as f64) - 5.0;
xv.extend([x0, x1, x2, x3]);
yv.push(3.0 * x0 + 2.0 * x2);
}
let x = Array2::from_shape_vec((n, 4), xv).unwrap();
let y = Array1::from_vec(yv);
let score_fn = |xs: &Array2<f64>, ys: &Array1<f64>| -> Result<f64> {
let n = xs.nrows() as f64;
let y_mean = ys.sum() / n.max(1.0);
let yc = ys.mapv(|v| v - y_mean);
let m = xs.ncols();
let mut xtx = Array2::<f64>::zeros((m, m));
let mut xty = Array1::<f64>::zeros(m);
for i in 0..m {
for j in 0..m {
let mut s = 0.0;
for k in 0..xs.nrows() {
s += xs[[k, i]] * xs[[k, j]];
}
xtx[[i, j]] = s;
}
xtx[[i, i]] += 1e-9; let mut s = 0.0;
for k in 0..xs.nrows() {
s += xs[[k, i]] * yc[k];
}
xty[i] = s;
}
let mut a = xtx.clone();
let mut b = xty.clone();
for col in 0..m {
let pv = a[[col, col]];
if pv.abs() < 1e-14 {
continue;
}
for r in (col + 1)..m {
let f = a[[r, col]] / pv;
for c in col..m {
a[[r, c]] -= f * a[[col, c]];
}
b[r] -= f * b[col];
}
}
let mut beta = Array1::<f64>::zeros(m);
for r in (0..m).rev() {
let mut s = b[r];
for c in (r + 1)..m {
s -= a[[r, c]] * beta[c];
}
let pv = a[[r, r]];
if pv.abs() > 1e-14 {
beta[r] = s / pv;
}
}
let mut pred = Array1::<f64>::zeros(xs.nrows());
for i in 0..xs.nrows() {
let mut p = y_mean;
for j in 0..m {
p += xs[[i, j]] * beta[j];
}
pred[i] = p;
}
let rss: f64 = pred
.iter()
.zip(ys.iter())
.map(|(p, t)| (p - t).powi(2))
.sum();
let tss: f64 = ys.iter().map(|t| (t - y_mean).powi(2)).sum();
Ok(1.0 - rss / tss.max(1e-12))
};
let rfecv = Rfecv::new(1, dummy_importance, score_fn).with_cv_folds(3);
let fitted = rfecv.fit(&x, &y).unwrap();
assert!(fitted.n_features_selected >= 2);
assert!(fitted.inner.support[0]);
assert!(fitted.inner.support[2]);
assert_eq!(fitted.cv_scores.len(), 4); }
}