use ferrolearn_core::error::FerroError;
use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
use ferrolearn_core::traits::{Fit, Transform};
use ndarray::{Array1, Array2};
use num_traits::Float;
/// Builds a new matrix from the columns of `x` listed in `indices`.
///
/// Column `j` of the result is a copy of column `indices[j]` of `x`, so the
/// output has `x.nrows()` rows and `indices.len()` columns. An empty
/// `indices` slice yields a matrix with zero columns.
///
/// # Panics
///
/// Panics if `x` has at least one row and an entry of `indices` is not a
/// valid column index of `x`.
fn select_columns<F: Float>(x: &Array2<F>, indices: &[usize]) -> Array2<F> {
    let shape = (x.nrows(), indices.len());
    // The closure is never invoked for an empty shape, so the zero-column
    // and zero-row cases need no special handling.
    Array2::from_shape_fn(shape, |(row, col)| x[[row, indices[col]]])
}
/// Unfitted feature selector configured with a variance threshold.
///
/// Fitting computes the per-column variance of the input and keeps the
/// columns whose variance is strictly greater than the threshold.
#[derive(Debug, Clone)]
pub struct VarianceThreshold<F> {
/// Variance cut-off; a feature is kept only when its variance exceeds it.
threshold: F,
}
impl<F: Float + Send + Sync + 'static> VarianceThreshold<F> {
    /// Creates a selector that keeps only features whose variance is strictly
    /// greater than `threshold`.
    ///
    /// The threshold is not validated here; `fit` reports an error for a
    /// negative value.
    #[must_use]
    pub fn new(threshold: F) -> Self {
        Self { threshold }
    }

    /// Returns the configured variance threshold.
    #[must_use]
    pub fn threshold(&self) -> F {
        self.threshold
    }
}
impl<F: Float + Send + Sync + 'static> Default for VarianceThreshold<F> {
    /// Returns a selector with a threshold of zero, i.e. one that only drops
    /// features whose variance is not strictly positive.
    fn default() -> Self {
        Self {
            threshold: F::zero(),
        }
    }
}
/// Result of fitting a `VarianceThreshold`: the per-feature variances and the
/// indices of the features that passed the threshold.
#[derive(Debug, Clone)]
pub struct FittedVarianceThreshold<F> {
/// Indices of the retained columns, in ascending order.
selected_indices: Vec<usize>,
/// Variance of every input column seen during fitting (one entry per column).
variances: Array1<F>,
}
impl<F: Float + Send + Sync + 'static> FittedVarianceThreshold<F> {
    /// Returns the indices of the columns that are kept by `transform`.
    #[must_use]
    pub fn selected_indices(&self) -> &[usize] {
        self.selected_indices.as_slice()
    }

    /// Returns the variance computed for each input column during fitting.
    #[must_use]
    pub fn variances(&self) -> &Array1<F> {
        let stored = &self.variances;
        stored
    }
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for VarianceThreshold<F> {
    type Fitted = FittedVarianceThreshold<F>;
    type Error = FerroError;

    /// Computes the variance of every column of `x` and records which columns
    /// have a variance strictly greater than the configured threshold.
    ///
    /// The variance is the population variance: the sum of squared deviations
    /// from the column mean divided by the number of rows.
    ///
    /// # Errors
    ///
    /// * `FerroError::InvalidParameter` if the threshold is negative or NaN.
    /// * `FerroError::InsufficientSamples` if `x` has no rows.
    fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedVarianceThreshold<F>, FerroError> {
        // NaN compares false against everything, so `threshold < 0` alone would
        // accept it and `var > threshold` would then silently drop every feature.
        if self.threshold.is_nan() || self.threshold < F::zero() {
            return Err(FerroError::InvalidParameter {
                name: "threshold".into(),
                reason: "variance threshold must be non-negative".into(),
            });
        }

        let n_samples = x.nrows();
        if n_samples == 0 {
            return Err(FerroError::InsufficientSamples {
                required: 1,
                actual: 0,
                context: "VarianceThreshold::fit".into(),
            });
        }

        let n = F::from(n_samples).unwrap_or(F::one());
        let n_features = x.ncols();
        let mut variances = Array1::zeros(n_features);
        let mut selected_indices = Vec::new();

        for j in 0..n_features {
            let col = x.column(j);
            let mean = col.iter().copied().fold(F::zero(), |acc, v| acc + v) / n;
            let var = col
                .iter()
                .copied()
                .map(|v| (v - mean) * (v - mean))
                .fold(F::zero(), |acc, v| acc + v)
                / n;
            variances[j] = var;
            // Strict comparison: a column exactly at the threshold is removed.
            if var > self.threshold {
                selected_indices.push(j);
            }
        }

        Ok(FittedVarianceThreshold {
            selected_indices,
            variances,
        })
    }
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedVarianceThreshold<F> {
    type Output = Array2<F>;
    type Error = FerroError;

    /// Returns a copy of `x` restricted to the columns selected during fitting.
    ///
    /// # Errors
    ///
    /// Returns `FerroError::ShapeMismatch` when `x` does not have the same
    /// number of columns as the data the selector was fitted on.
    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
        let rows = x.nrows();
        let cols = x.ncols();
        let fitted_cols = self.variances.len();

        if cols == fitted_cols {
            Ok(select_columns(x, &self.selected_indices))
        } else {
            Err(FerroError::ShapeMismatch {
                expected: vec![rows, fitted_cols],
                actual: vec![rows, cols],
                context: "FittedVarianceThreshold::transform".into(),
            })
        }
    }
}
impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for VarianceThreshold<F> {
    /// Fits the selector on `x` and returns it boxed as a pipeline step.
    ///
    /// The target `_y` is ignored: the variance criterion only looks at `x`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `fit`.
    fn fit_pipeline(
        &self,
        x: &Array2<F>,
        _y: &Array1<F>,
    ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
        self.fit(x, &())
            .map(|fitted| Box::new(fitted) as Box<dyn FittedPipelineTransformer<F>>)
    }
}
impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F> for FittedVarianceThreshold<F> {
fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
self.transform(x)
}
}
/// Scoring function used by `SelectKBest` to rank features.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScoreFunc {
/// Scores each feature with the F-statistic computed by `anova_f_scores`.
FClassif,
}
/// Unfitted selector that keeps the `k` highest-scoring features.
#[derive(Debug, Clone)]
pub struct SelectKBest<F> {
/// Number of features to keep.
k: usize,
/// Function used to score the features.
score_func: ScoreFunc,
/// Ties the struct to the float type `F` without storing a value of it.
_marker: std::marker::PhantomData<F>,
}
impl<F: Float + Send + Sync + 'static> SelectKBest<F> {
#[must_use]
pub fn new(k: usize, score_func: ScoreFunc) -> Self {
Self {
k,
score_func,
_marker: std::marker::PhantomData,
}
}
#[must_use]
pub fn k(&self) -> usize {
self.k
}
#[must_use]
pub fn score_func(&self) -> ScoreFunc {
self.score_func
}
}
/// Result of fitting a `SelectKBest`: the per-feature scores and the indices
/// of the selected features.
#[derive(Debug, Clone)]
pub struct FittedSelectKBest<F> {
/// Number of columns of the training data; `transform` requires the same count.
n_features_in: usize,
/// Score of every input feature (one entry per column).
scores: Array1<F>,
/// Indices of the retained columns, sorted in ascending order.
selected_indices: Vec<usize>,
}
impl<F: Float + Send + Sync + 'static> FittedSelectKBest<F> {
    /// Returns the score computed for each input feature during fitting.
    #[must_use]
    pub fn scores(&self) -> &Array1<F> {
        let stored = &self.scores;
        stored
    }

    /// Returns the indices of the columns that are kept by `transform`.
    #[must_use]
    pub fn selected_indices(&self) -> &[usize] {
        self.selected_indices.as_slice()
    }
}
/// Computes a one-way ANOVA F-statistic for every column of `x`, using the
/// class labels in `y` to group the rows.
///
/// For each feature the score is
/// `(ss_between / (n_classes - 1)) / (ss_within / (n_samples - n_classes))`.
///
/// Special cases:
/// * If there is a single class, or no within-class degrees of freedom
///   (`n_samples <= n_classes`), every score is zero.
/// * If the within-class mean square is zero while the between-class mean
///   square is not, the score is positive infinity (perfect separation).
/// * If both mean squares are zero (the feature is constant), the score is
///   zero: such a feature carries no information about the classes and must
///   not outrank informative features.
///
/// The caller is expected to pass a `y` with one label per row of `x`.
///
/// # Panics
///
/// Panics if a sample or class count cannot be converted to `F`, or if `y`
/// holds more labels than `x` has rows.
fn anova_f_scores<F: Float>(x: &Array2<F>, y: &Array1<usize>) -> Vec<F> {
    let n_samples = x.nrows();
    let n_features = x.ncols();

    // A BTreeMap iterates classes in label order, so the floating-point
    // accumulation order (and therefore the scores) is the same on every run.
    let mut class_indices: std::collections::BTreeMap<usize, Vec<usize>> =
        std::collections::BTreeMap::new();
    for (i, &label) in y.iter().enumerate() {
        class_indices.entry(label).or_default().push(i);
    }
    let n_classes = class_indices.len();

    // These quantities do not depend on the feature; compute them once.
    let n_total = F::from(n_samples).expect("sample count must be representable as a float");
    let df_between = F::from(n_classes.saturating_sub(1))
        .expect("class count must be representable as a float");
    let df_within = F::from(n_samples.saturating_sub(n_classes))
        .expect("sample count must be representable as a float");
    let degenerate = df_between == F::zero() || df_within == F::zero();

    let mut scores = Vec::with_capacity(n_features);
    for j in 0..n_features {
        let col = x.column(j);
        let grand_mean = col.iter().copied().fold(F::zero(), |acc, v| acc + v) / n_total;

        let mut ss_between = F::zero();
        let mut ss_within = F::zero();
        for rows in class_indices.values() {
            let n_k = F::from(rows.len()).expect("class size must be representable as a float");
            let class_mean = rows
                .iter()
                .map(|&i| col[i])
                .fold(F::zero(), |acc, v| acc + v)
                / n_k;
            let diff = class_mean - grand_mean;
            ss_between = ss_between + n_k * diff * diff;
            for &i in rows {
                let d = col[i] - class_mean;
                ss_within = ss_within + d * d;
            }
        }

        let f = if degenerate {
            F::zero()
        } else {
            let ms_between = ss_between / df_between;
            let ms_within = ss_within / df_within;
            if ms_within == F::zero() {
                if ms_between == F::zero() {
                    // Constant feature: 0/0 is treated as "no signal".
                    F::zero()
                } else {
                    F::infinity()
                }
            } else {
                ms_between / ms_within
            }
        };
        scores.push(f);
    }
    scores
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, Array1<usize>> for SelectKBest<F> {
    type Fitted = FittedSelectKBest<F>;
    type Error = FerroError;

    /// Scores every feature of `x` against the class labels `y` and keeps the
    /// `k` features with the highest scores.
    ///
    /// Ties are broken in favour of the lower column index. A NaN score is
    /// ranked below every other score. The selected indices are stored in
    /// ascending column order.
    ///
    /// # Errors
    ///
    /// * `FerroError::InsufficientSamples` if `x` has no rows.
    /// * `FerroError::ShapeMismatch` if `y` does not have one label per row.
    /// * `FerroError::InvalidParameter` if `k` exceeds the number of features.
    fn fit(&self, x: &Array2<F>, y: &Array1<usize>) -> Result<FittedSelectKBest<F>, FerroError> {
        let n_samples = x.nrows();
        if n_samples == 0 {
            return Err(FerroError::InsufficientSamples {
                required: 1,
                actual: 0,
                context: "SelectKBest::fit".into(),
            });
        }
        if y.len() != n_samples {
            return Err(FerroError::ShapeMismatch {
                expected: vec![n_samples],
                actual: vec![y.len()],
                context: "SelectKBest::fit — y must have the same length as x has rows".into(),
            });
        }
        let n_features = x.ncols();
        if self.k > n_features {
            return Err(FerroError::InvalidParameter {
                name: "k".into(),
                reason: format!(
                    "k ({}) cannot exceed the number of features ({})",
                    self.k, n_features
                ),
            });
        }

        let raw_scores = match self.score_func {
            ScoreFunc::FClassif => anova_f_scores(x, y),
        };

        // Map NaN to negative infinity for ranking purposes only. Treating an
        // incomparable pair as `Equal` would not be a total order, which makes
        // the ranking arbitrary and may make the sort panic.
        let rank_key = |j: usize| -> F {
            let score = raw_scores[j];
            if score.is_nan() {
                F::neg_infinity()
            } else {
                score
            }
        };

        let mut selected_indices: Vec<usize> = (0..n_features).collect();
        // Descending by score, ascending by index on ties.
        selected_indices.sort_by(|&a, &b| {
            rank_key(b)
                .partial_cmp(&rank_key(a))
                .unwrap_or(std::cmp::Ordering::Equal)
                .then(a.cmp(&b))
        });
        selected_indices.truncate(self.k);
        selected_indices.sort_unstable();

        Ok(FittedSelectKBest {
            n_features_in: n_features,
            scores: Array1::from_vec(raw_scores),
            selected_indices,
        })
    }
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSelectKBest<F> {
    type Output = Array2<F>;
    type Error = FerroError;

    /// Returns a copy of `x` restricted to the columns selected during fitting.
    ///
    /// # Errors
    ///
    /// Returns `FerroError::ShapeMismatch` when `x` does not have the same
    /// number of columns as the data the selector was fitted on.
    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
        let rows = x.nrows();
        let cols = x.ncols();

        if cols == self.n_features_in {
            Ok(select_columns(x, &self.selected_indices))
        } else {
            Err(FerroError::ShapeMismatch {
                expected: vec![rows, self.n_features_in],
                actual: vec![rows, cols],
                context: "FittedSelectKBest::transform".into(),
            })
        }
    }
}
impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for SelectKBest<F> {
fn fit_pipeline(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
let y_usize: Array1<usize> = y.mapv(|v| v.round().to_usize().unwrap_or(0));
let fitted = self.fit(x, &y_usize)?;
Ok(Box::new(fitted))
}
}
impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F> for FittedSelectKBest<F> {
fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
self.transform(x)
}
}
/// Selector built from a vector of precomputed feature importances.
///
/// Features whose importance is greater than or equal to the threshold are
/// kept; the selection is computed once, at construction time.
#[derive(Debug, Clone)]
pub struct SelectFromModel<F> {
/// Importance of every feature (one entry per column).
importances: Array1<F>,
/// Effective cut-off: the explicit value, or the mean importance if none was given.
threshold: F,
/// Indices of the retained columns, in ascending order.
selected_indices: Vec<usize>,
}
impl<F: Float + Send + Sync + 'static> SelectFromModel<F> {
    /// Builds a selector from a vector of feature importances.
    ///
    /// A feature is selected when its importance is greater than or equal to
    /// the threshold. When `threshold` is `None`, the mean of `importances`
    /// is used as the threshold.
    ///
    /// # Errors
    ///
    /// Returns `FerroError::InvalidParameter` if `importances` is empty.
    pub fn new_from_importances(
        importances: &Array1<F>,
        threshold: Option<F>,
    ) -> Result<Self, FerroError> {
        let n_features = importances.len();
        if n_features == 0 {
            return Err(FerroError::InvalidParameter {
                name: "importances".into(),
                reason: "importance vector must not be empty".into(),
            });
        }

        let cutoff = match threshold {
            Some(explicit) => explicit,
            None => {
                // Default threshold: arithmetic mean of the importances.
                let mut total = F::zero();
                for &imp in importances.iter() {
                    total = total + imp;
                }
                total / F::from(n_features).unwrap_or_else(F::one)
            }
        };

        let mut selected_indices: Vec<usize> = Vec::new();
        for (j, &imp) in importances.iter().enumerate() {
            if imp >= cutoff {
                selected_indices.push(j);
            }
        }

        Ok(Self {
            importances: importances.clone(),
            threshold: cutoff,
            selected_indices,
        })
    }

    /// Returns the threshold in effect (explicit or mean-derived).
    #[must_use]
    pub fn threshold(&self) -> F {
        let Self { threshold, .. } = self;
        *threshold
    }

    /// Returns the importances the selector was built from.
    #[must_use]
    pub fn importances(&self) -> &Array1<F> {
        let stored = &self.importances;
        stored
    }

    /// Returns the indices of the columns that are kept by `transform`.
    #[must_use]
    pub fn selected_indices(&self) -> &[usize] {
        self.selected_indices.as_slice()
    }
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for SelectFromModel<F> {
    type Output = Array2<F>;
    type Error = FerroError;

    /// Returns a copy of `x` restricted to the selected columns.
    ///
    /// # Errors
    ///
    /// Returns `FerroError::ShapeMismatch` when the number of columns of `x`
    /// differs from the length of the importance vector.
    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
        let rows = x.nrows();
        let cols = x.ncols();
        let expected_cols = self.importances.len();

        if cols == expected_cols {
            Ok(select_columns(x, &self.selected_indices))
        } else {
            Err(FerroError::ShapeMismatch {
                expected: vec![rows, expected_cols],
                actual: vec![rows, cols],
                context: "SelectFromModel::transform".into(),
            })
        }
    }
}
impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for SelectFromModel<F> {
    /// Returns a boxed clone of this selector as the fitted pipeline step.
    ///
    /// Both `_x` and `_y` are ignored: the selection was already determined
    /// from the importances when the selector was constructed.
    fn fit_pipeline(
        &self,
        _x: &Array2<F>,
        _y: &Array1<F>,
    ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
        let step: Box<dyn FittedPipelineTransformer<F>> = Box::new(self.clone());
        Ok(step)
    }
}
impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F> for SelectFromModel<F> {
fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
self.transform(x)
}
}
// Unit tests for VarianceThreshold, SelectKBest and SelectFromModel.
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use ndarray::array;
// ---- VarianceThreshold ----
// A column with identical values has zero variance and is dropped.
#[test]
fn test_variance_threshold_removes_constant_column() {
let sel = VarianceThreshold::<f64>::new(0.0);
let x = array![[1.0, 7.0], [2.0, 7.0], [3.0, 7.0]];
let fitted = sel.fit(&x, &()).unwrap();
assert_eq!(fitted.selected_indices(), &[0usize]);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 1);
assert_abs_diff_eq!(out[[0, 0]], 1.0, epsilon = 1e-15);
assert_abs_diff_eq!(out[[1, 0]], 2.0, epsilon = 1e-15);
}
// Every column varies, so nothing is removed.
#[test]
fn test_variance_threshold_keeps_all_when_above() {
let sel = VarianceThreshold::<f64>::new(0.0);
let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
let fitted = sel.fit(&x, &()).unwrap();
assert_eq!(fitted.selected_indices().len(), 2);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 2);
}
// Only the second column has a variance above the 1.5 threshold.
#[test]
fn test_variance_threshold_custom_threshold() {
let sel = VarianceThreshold::<f64>::new(1.5);
let x = array![[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]];
let fitted = sel.fit(&x, &()).unwrap();
assert_eq!(fitted.selected_indices(), &[1usize]);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 1);
}
// The computed variances are exposed through the accessor.
#[test]
fn test_variance_threshold_stores_variances() {
let sel = VarianceThreshold::<f64>::default();
let x = array![[0.0], [0.0], [0.0]]; let fitted = sel.fit(&x, &()).unwrap();
assert_abs_diff_eq!(fitted.variances()[0], 0.0, epsilon = 1e-15);
}
// Fitting on an input without rows is an error.
#[test]
fn test_variance_threshold_zero_rows_error() {
let sel = VarianceThreshold::<f64>::new(0.0);
let x: Array2<f64> = Array2::zeros((0, 2));
assert!(sel.fit(&x, &()).is_err());
}
// A negative threshold is rejected at fit time.
#[test]
fn test_variance_threshold_negative_threshold_error() {
let sel = VarianceThreshold::<f64>::new(-0.1);
let x = array![[1.0], [2.0]];
assert!(sel.fit(&x, &()).is_err());
}
// Transforming data with a different column count is an error.
#[test]
fn test_variance_threshold_shape_mismatch_on_transform() {
let sel = VarianceThreshold::<f64>::new(0.0);
let x_train = array![[1.0, 2.0], [3.0, 4.0]];
let fitted = sel.fit(&x_train, &()).unwrap();
let x_bad = array![[1.0, 2.0, 3.0]];
assert!(fitted.transform(&x_bad).is_err());
}
// When every column is constant the output keeps its rows but has no columns.
#[test]
fn test_variance_threshold_all_constant_columns() {
let sel = VarianceThreshold::<f64>::new(0.0);
let x = array![[5.0, 3.0], [5.0, 3.0], [5.0, 3.0]];
let fitted = sel.fit(&x, &()).unwrap();
assert_eq!(fitted.selected_indices().len(), 0);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 0);
assert_eq!(out.nrows(), 3);
}
// The selector works through the pipeline traits.
#[test]
fn test_variance_threshold_pipeline_integration() {
use ferrolearn_core::pipeline::PipelineTransformer;
let sel = VarianceThreshold::<f64>::new(0.0);
let x = array![[1.0, 7.0], [2.0, 7.0], [3.0, 7.0]];
let y = ndarray::array![0.0, 1.0, 0.0];
let fitted_box = sel.fit_pipeline(&x, &y).unwrap();
let out = fitted_box.transform_pipeline(&x).unwrap();
assert_eq!(out.ncols(), 1);
}
// The selector is generic over the float type.
#[test]
fn test_variance_threshold_f32() {
let sel = VarianceThreshold::<f32>::new(0.0f32);
let x: Array2<f32> = array![[1.0f32, 5.0], [2.0, 5.0], [3.0, 5.0]];
let fitted = sel.fit(&x, &()).unwrap();
assert_eq!(fitted.selected_indices(), &[0usize]);
}
// ---- SelectKBest ----
// Column 0 separates the two classes; column 1 does not.
#[test]
fn test_select_k_best_selects_highest_scoring_feature() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[1.0, 5.0], [1.0, 6.0], [10.0, 5.0], [10.0, 6.0]];
let y: Array1<usize> = array![0, 0, 1, 1];
let fitted = sel.fit(&x, &y).unwrap();
assert_eq!(fitted.selected_indices(), &[0usize]);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 1);
}
// With k equal to the feature count every column is kept.
#[test]
fn test_select_k_best_k_equals_n_features_keeps_all() {
let sel = SelectKBest::<f64>::new(2, ScoreFunc::FClassif);
let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
let y: Array1<usize> = array![0, 1, 0];
let fitted = sel.fit(&x, &y).unwrap();
assert_eq!(fitted.selected_indices().len(), 2);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.ncols(), 2);
}
// One score is stored per input feature.
#[test]
fn test_select_k_best_scores_stored() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[1.0, 2.0], [1.0, 4.0]];
let y: Array1<usize> = array![0, 1];
let fitted = sel.fit(&x, &y).unwrap();
assert_eq!(fitted.scores().len(), 2);
}
// Fitting on an input without rows is an error.
#[test]
fn test_select_k_best_zero_rows_error() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x: Array2<f64> = Array2::zeros((0, 3));
let y: Array1<usize> = Array1::zeros(0);
assert!(sel.fit(&x, &y).is_err());
}
// Requesting more features than exist is an error.
#[test]
fn test_select_k_best_k_exceeds_n_features_error() {
let sel = SelectKBest::<f64>::new(5, ScoreFunc::FClassif);
let x = array![[1.0, 2.0], [3.0, 4.0]];
let y: Array1<usize> = array![0, 1];
assert!(sel.fit(&x, &y).is_err());
}
// The label vector must have one entry per row.
#[test]
fn test_select_k_best_y_length_mismatch_error() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[1.0, 2.0], [3.0, 4.0]];
let y: Array1<usize> = array![0]; assert!(sel.fit(&x, &y).is_err());
}
// Transforming data with a different column count is an error.
#[test]
fn test_select_k_best_shape_mismatch_on_transform() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[1.0, 2.0], [3.0, 4.0]];
let y: Array1<usize> = array![0, 1];
let fitted = sel.fit(&x, &y).unwrap();
let x_bad = array![[1.0, 2.0, 3.0]];
assert!(fitted.transform(&x_bad).is_err());
}
// Selected indices are reported in strictly ascending column order.
#[test]
fn test_select_k_best_selected_indices_in_column_order() {
let sel = SelectKBest::<f64>::new(2, ScoreFunc::FClassif);
let x = array![[1.0, 100.0], [2.0, 200.0]];
let y: Array1<usize> = array![0, 1];
let fitted = sel.fit(&x, &y).unwrap();
let indices = fitted.selected_indices();
assert!(indices.windows(2).all(|w| w[0] < w[1]));
}
// The selector works through the pipeline traits with float labels.
#[test]
fn test_select_k_best_pipeline_integration() {
use ferrolearn_core::pipeline::PipelineTransformer;
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[1.0, 5.0], [1.0, 6.0], [10.0, 5.0], [10.0, 6.0]];
let y = ndarray::array![0.0, 0.0, 1.0, 1.0];
let fitted_box = sel.fit_pipeline(&x, &y).unwrap();
let out = fitted_box.transform_pipeline(&x).unwrap();
assert_eq!(out.ncols(), 1);
}
// Perfect class separation with zero within-class variance scores infinity.
#[test]
fn test_select_k_best_f_score_zero_within_class_variance() {
let sel = SelectKBest::<f64>::new(1, ScoreFunc::FClassif);
let x = array![[0.0], [0.0], [10.0], [10.0]];
let y: Array1<usize> = array![0, 0, 1, 1];
let fitted = sel.fit(&x, &y).unwrap();
assert!(fitted.scores()[0].is_infinite());
}
// ---- SelectFromModel ----
// Without an explicit threshold the mean importance is the cut-off.
#[test]
fn test_select_from_model_mean_threshold() {
let importances = array![0.1, 0.5, 0.4];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, None).unwrap();
assert_eq!(sel.selected_indices(), &[1usize, 2]);
let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
let out = sel.transform(&x).unwrap();
assert_eq!(out.ncols(), 2);
assert_abs_diff_eq!(out[[0, 0]], 2.0, epsilon = 1e-15);
assert_abs_diff_eq!(out[[0, 1]], 3.0, epsilon = 1e-15);
}
// An explicit threshold overrides the mean.
#[test]
fn test_select_from_model_explicit_threshold() {
let importances = array![0.1, 0.5, 0.4];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, Some(0.45)).unwrap();
assert_eq!(sel.selected_indices(), &[1usize]);
let x = array![[1.0, 2.0, 3.0]];
let out = sel.transform(&x).unwrap();
assert_eq!(out.ncols(), 1);
assert_abs_diff_eq!(out[[0, 0]], 2.0, epsilon = 1e-15);
}
// A zero threshold keeps every feature with a non-negative importance.
#[test]
fn test_select_from_model_all_selected_when_threshold_zero() {
let importances = array![0.1, 0.2, 0.3];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, Some(0.0)).unwrap();
assert_eq!(sel.selected_indices().len(), 3);
}
// A threshold above every importance selects nothing.
#[test]
fn test_select_from_model_none_selected_when_threshold_high() {
let importances = array![0.1, 0.2, 0.3];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, Some(1.0)).unwrap();
assert_eq!(sel.selected_indices().len(), 0);
let x = array![[1.0, 2.0, 3.0]];
let out = sel.transform(&x).unwrap();
assert_eq!(out.ncols(), 0);
}
// An empty importance vector is rejected.
#[test]
fn test_select_from_model_empty_importances_error() {
let importances: Array1<f64> = Array1::zeros(0);
assert!(SelectFromModel::<f64>::new_from_importances(&importances, None).is_err());
}
// Transforming data with a different column count is an error.
#[test]
fn test_select_from_model_shape_mismatch_on_transform() {
let importances = array![0.3, 0.7];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, None).unwrap();
let x_bad = array![[1.0, 2.0, 3.0]]; assert!(sel.transform(&x_bad).is_err());
}
// The effective threshold is exposed through the accessor.
#[test]
fn test_select_from_model_threshold_accessor() {
let importances = array![0.3, 0.7];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, Some(0.5)).unwrap();
assert_abs_diff_eq!(sel.threshold(), 0.5, epsilon = 1e-15);
}
// The selector works through the pipeline traits.
#[test]
fn test_select_from_model_pipeline_integration() {
use ferrolearn_core::pipeline::PipelineTransformer;
let importances = array![0.1, 0.9];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, None).unwrap();
let x = array![[1.0, 2.0], [3.0, 4.0]];
let y = ndarray::array![0.0, 1.0];
let fitted_box = sel.fit_pipeline(&x, &y).unwrap();
let out = fitted_box.transform_pipeline(&x).unwrap();
assert_eq!(out.ncols(), 1);
assert_abs_diff_eq!(out[[0, 0]], 2.0, epsilon = 1e-15);
}
// The stored importances are exposed unchanged.
#[test]
fn test_select_from_model_importances_accessor() {
let importances = array![0.2, 0.8];
let sel = SelectFromModel::<f64>::new_from_importances(&importances, None).unwrap();
assert_abs_diff_eq!(sel.importances()[0], 0.2, epsilon = 1e-15);
assert_abs_diff_eq!(sel.importances()[1], 0.8, epsilon = 1e-15);
}
}