use scirs2_core::ndarray::ArrayStatCompat;
use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use crate::error::{Result, TransformError};
use statrs::statistics::Statistics;
pub struct VarianceThreshold {
threshold: f64,
variances_: Option<Array1<f64>>,
selected_features_: Option<Vec<usize>>,
}
impl VarianceThreshold {
pub fn new(threshold: f64) -> Result<Self> {
if threshold < 0.0 {
return Err(TransformError::InvalidInput(
"Threshold must be non-negative".to_string(),
));
}
Ok(VarianceThreshold {
threshold,
variances_: None,
selected_features_: None,
})
}
pub fn with_defaults() -> Self {
Self::new(0.0).expect("Operation failed")
}
pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
where
S: Data,
S::Elem: Float + NumCast,
{
let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
let n_samples = x_f64.shape()[0];
let n_features = x_f64.shape()[1];
if n_samples == 0 || n_features == 0 {
return Err(TransformError::InvalidInput("Empty input data".to_string()));
}
if n_samples < 2 {
return Err(TransformError::InvalidInput(
"At least 2 samples required to compute variance".to_string(),
));
}
let mut variances = Array1::zeros(n_features);
let mut selected_features = Vec::new();
for j in 0..n_features {
let feature_data = x_f64.column(j);
let mean = feature_data.iter().sum::<f64>() / n_samples as f64;
let variance = feature_data
.iter()
.map(|&x| (x - mean).powi(2))
.sum::<f64>()
/ n_samples as f64;
variances[j] = variance;
if variance > self.threshold {
selected_features.push(j);
}
}
self.variances_ = Some(variances);
self.selected_features_ = Some(selected_features);
Ok(())
}
pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
where
S: Data,
S::Elem: Float + NumCast,
{
let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
let n_samples = x_f64.shape()[0];
let n_features = x_f64.shape()[1];
if self.selected_features_.is_none() {
return Err(TransformError::TransformationError(
"VarianceThreshold has not been fitted".to_string(),
));
}
let selected_features = self.selected_features_.as_ref().expect("Operation failed");
if let Some(ref variances) = self.variances_ {
if n_features != variances.len() {
return Err(TransformError::InvalidInput(format!(
"x has {} features, but VarianceThreshold was fitted with {} features",
n_features,
variances.len()
)));
}
}
let n_selected = selected_features.len();
let mut transformed = Array2::zeros((n_samples, n_selected));
for (new_idx, &old_idx) in selected_features.iter().enumerate() {
for i in 0..n_samples {
transformed[[i, new_idx]] = x_f64[[i, old_idx]];
}
}
Ok(transformed)
}
pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
where
S: Data,
S::Elem: Float + NumCast,
{
self.fit(x)?;
self.transform(x)
}
pub fn variances(&self) -> Option<&Array1<f64>> {
self.variances_.as_ref()
}
pub fn get_support(&self) -> Option<&Vec<usize>> {
self.selected_features_.as_ref()
}
pub fn get_support_mask(&self) -> Option<Array1<bool>> {
if let (Some(ref variances), Some(ref selected)) =
(&self.variances_, &self.selected_features_)
{
let n_features = variances.len();
let mut mask = Array1::from_elem(n_features, false);
for &idx in selected {
mask[idx] = true;
}
Some(mask)
} else {
None
}
}
pub fn n_features_selected(&self) -> Option<usize> {
self.selected_features_.as_ref().map(|s| s.len())
}
pub fn inverse_transform<S>(&self, _x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
where
S: Data,
S::Elem: Float + NumCast,
{
Err(TransformError::TransformationError(
"inverse_transform is not supported for feature selection".to_string(),
))
}
}
#[derive(Debug, Clone)]
pub struct RecursiveFeatureElimination<F>
where
F: Fn(&Array2<f64>, &Array1<f64>) -> Result<Array1<f64>>,
{
n_features_to_select: usize,
step: usize,
importance_func: F,
selected_features_: Option<Vec<usize>>,
ranking_: Option<Array1<usize>>,
scores_: Option<Array1<f64>>,
}
impl<F> RecursiveFeatureElimination<F>
where
F: Fn(&Array2<f64>, &Array1<f64>) -> Result<Array1<f64>>,
{
pub fn new(n_features_to_select: usize, importancefunc: F) -> Self {
RecursiveFeatureElimination {
n_features_to_select,
step: 1,
importance_func: importancefunc,
selected_features_: None,
ranking_: None,
scores_: None,
}
}
pub fn with_step(mut self, step: usize) -> Self {
self.step = step.max(1);
self
}
pub fn fit(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<()> {
let n_samples = x.shape()[0];
let n_features = x.shape()[1];
if n_samples != y.len() {
return Err(TransformError::InvalidInput(format!(
"X has {} samples but y has {} samples",
n_samples,
y.len()
)));
}
if self.n_features_to_select > n_features {
return Err(TransformError::InvalidInput(format!(
"n_features_to_select={} must be <= n_features={}",
self.n_features_to_select, n_features
)));
}
let mut remaining_features: Vec<usize> = (0..n_features).collect();
let mut ranking = Array1::zeros(n_features);
let mut current_rank = 1;
while remaining_features.len() > self.n_features_to_select {
let x_subset = self.subset_features(x, &remaining_features);
let importances = (self.importance_func)(&x_subset, y)?;
if importances.len() != remaining_features.len() {
return Err(TransformError::InvalidInput(
"Importance function returned wrong number of scores".to_string(),
));
}
let n_to_remove = (self.step).min(remaining_features.len() - self.n_features_to_select);
let mut indices: Vec<usize> = (0..importances.len()).collect();
indices.sort_by(|&i, &j| {
importances[i]
.partial_cmp(&importances[j])
.expect("Operation failed")
});
for i in 0..n_to_remove {
let feature_idx = remaining_features[indices[i]];
ranking[feature_idx] = n_features - current_rank + 1;
current_rank += 1;
}
let eliminated: std::collections::HashSet<usize> =
indices.iter().take(n_to_remove).cloned().collect();
let features_to_retain: Vec<usize> = remaining_features
.iter()
.filter(|&&idx| !eliminated.contains(&idx))
.cloned()
.collect();
remaining_features = features_to_retain;
}
for &feature_idx in &remaining_features {
ranking[feature_idx] = 1;
}
let x_final = self.subset_features(x, &remaining_features);
let final_scores = (self.importance_func)(&x_final, y)?;
let mut scores = Array1::zeros(n_features);
for (i, &feature_idx) in remaining_features.iter().enumerate() {
scores[feature_idx] = final_scores[i];
}
self.selected_features_ = Some(remaining_features);
self.ranking_ = Some(ranking);
self.scores_ = Some(scores);
Ok(())
}
fn subset_features(&self, x: &Array2<f64>, features: &[usize]) -> Array2<f64> {
let n_samples = x.shape()[0];
let n_selected = features.len();
let mut subset = Array2::zeros((n_samples, n_selected));
for (new_idx, &old_idx) in features.iter().enumerate() {
subset.column_mut(new_idx).assign(&x.column(old_idx));
}
subset
}
pub fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
if self.selected_features_.is_none() {
return Err(TransformError::TransformationError(
"RFE has not been fitted".to_string(),
));
}
let selected = self.selected_features_.as_ref().expect("Operation failed");
Ok(self.subset_features(x, selected))
}
pub fn fit_transform(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Array2<f64>> {
self.fit(x, y)?;
self.transform(x)
}
pub fn get_support(&self) -> Option<&Vec<usize>> {
self.selected_features_.as_ref()
}
pub fn ranking(&self) -> Option<&Array1<usize>> {
self.ranking_.as_ref()
}
pub fn scores(&self) -> Option<&Array1<f64>> {
self.scores_.as_ref()
}
}
#[derive(Debug, Clone)]
pub struct MutualInfoSelector {
k: usize,
discrete_target: bool,
n_neighbors: usize,
selected_features_: Option<Vec<usize>>,
scores_: Option<Array1<f64>>,
}
impl MutualInfoSelector {
pub fn new(k: usize) -> Self {
MutualInfoSelector {
k,
discrete_target: false,
n_neighbors: 3,
selected_features_: None,
scores_: None,
}
}
pub fn with_discrete_target(mut self) -> Self {
self.discrete_target = true;
self
}
pub fn with_n_neighbors(mut self, nneighbors: usize) -> Self {
self.n_neighbors = nneighbors;
self
}
fn estimate_mutual_info(&self, x: &Array1<f64>, y: &Array1<f64>) -> f64 {
let n = x.len();
if n < self.n_neighbors + 1 {
return 0.0;
}
if !self.discrete_target {
let x_mean = x.mean_or(0.0);
let y_mean = y.mean_or(0.0);
let x_std = x.std(0.0);
let y_std = y.std(0.0);
if x_std < 1e-10 || y_std < 1e-10 {
return 0.0;
}
let mut correlation = 0.0;
for i in 0..n {
correlation += (x[i] - x_mean) * (y[i] - y_mean);
}
correlation /= (n as f64 - 1.0) * x_std * y_std;
if correlation.abs() >= 1.0 {
return 5.0; }
(-0.5 * (1.0 - correlation * correlation).ln()).max(0.0)
} else {
let mut groups = std::collections::HashMap::new();
for i in 0..n {
let key = y[i].round() as i64;
groups.entry(key).or_insert_with(Vec::new).push(x[i]);
}
let total_mean = x.mean_or(0.0);
let total_var = x.variance();
if total_var < 1e-10 {
return 0.0;
}
let mut between_var = 0.0;
for (_, values) in groups {
let group_mean = values.iter().sum::<f64>() / values.len() as f64;
let weight = values.len() as f64 / n as f64;
between_var += weight * (group_mean - total_mean).powi(2);
}
(between_var / total_var).min(1.0) * 2.0 }
}
pub fn fit(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<()> {
let n_features = x.shape()[1];
if self.k > n_features {
return Err(TransformError::InvalidInput(format!(
"k={} must be <= n_features={}",
self.k, n_features
)));
}
let mut scores = Array1::zeros(n_features);
for j in 0..n_features {
let feature = x.column(j).to_owned();
scores[j] = self.estimate_mutual_info(&feature, y);
}
let mut indices: Vec<usize> = (0..n_features).collect();
indices.sort_by(|&i, &j| scores[j].partial_cmp(&scores[i]).expect("Operation failed"));
let selected_features = indices.into_iter().take(self.k).collect();
self.scores_ = Some(scores);
self.selected_features_ = Some(selected_features);
Ok(())
}
pub fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
if self.selected_features_.is_none() {
return Err(TransformError::TransformationError(
"MutualInfoSelector has not been fitted".to_string(),
));
}
let selected = self.selected_features_.as_ref().expect("Operation failed");
let n_samples = x.shape()[0];
let mut transformed = Array2::zeros((n_samples, self.k));
for (new_idx, &old_idx) in selected.iter().enumerate() {
transformed.column_mut(new_idx).assign(&x.column(old_idx));
}
Ok(transformed)
}
pub fn fit_transform(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Array2<f64>> {
self.fit(x, y)?;
self.transform(x)
}
pub fn get_support(&self) -> Option<&Vec<usize>> {
self.selected_features_.as_ref()
}
pub fn scores(&self) -> Option<&Array1<f64>> {
self.scores_.as_ref()
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use scirs2_core::ndarray::Array;
#[test]
fn test_variance_threshold_basic() {
let data = Array::from_shape_vec(
(3, 4),
vec![1.0, 1.0, 5.0, 1.0, 1.0, 2.0, 5.0, 3.0, 1.0, 3.0, 5.0, 5.0],
)
.expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
let transformed = selector.fit_transform(&data).expect("Operation failed");
assert_eq!(transformed.shape(), &[3, 2]);
let selected = selector.get_support().expect("Operation failed");
assert_eq!(selected, &[1, 3]);
assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[1, 0]], 2.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 0]], 3.0, epsilon = 1e-10);
assert_abs_diff_eq!(transformed[[0, 1]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[1, 1]], 3.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 1]], 5.0, epsilon = 1e-10); }
#[test]
fn test_variance_threshold_custom() {
let data = Array::from_shape_vec(
(4, 3),
vec![
1.0, 1.0, 1.0, 2.0, 1.1, 2.0, 3.0, 1.0, 3.0, 4.0, 1.1, 4.0, ],
)
.expect("Operation failed");
let mut selector = VarianceThreshold::new(0.1).expect("Operation failed");
let transformed = selector.fit_transform(&data).expect("Operation failed");
assert_eq!(transformed.shape(), &[4, 2]);
let selected = selector.get_support().expect("Operation failed");
assert_eq!(selected, &[0, 2]);
let variances = selector.variances().expect("Operation failed");
assert!(variances[0] > 0.1); assert!(variances[1] <= 0.1); assert!(variances[2] > 0.1); }
#[test]
fn test_variance_threshold_support_mask() {
let data = Array::from_shape_vec(
(3, 4),
vec![1.0, 1.0, 5.0, 1.0, 1.0, 2.0, 5.0, 3.0, 1.0, 3.0, 5.0, 5.0],
)
.expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
selector.fit(&data).expect("Operation failed");
let mask = selector.get_support_mask().expect("Operation failed");
assert_eq!(mask.len(), 4);
assert!(!mask[0]); assert!(mask[1]); assert!(!mask[2]); assert!(mask[3]);
assert_eq!(selector.n_features_selected().expect("Operation failed"), 2);
}
#[test]
fn test_variance_threshold_all_removed() {
let data = Array::from_shape_vec((3, 2), vec![5.0, 10.0, 5.0, 10.0, 5.0, 10.0])
.expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
let transformed = selector.fit_transform(&data).expect("Operation failed");
assert_eq!(transformed.shape(), &[3, 0]);
assert_eq!(selector.n_features_selected().expect("Operation failed"), 0);
}
#[test]
fn test_variance_threshold_errors() {
assert!(VarianceThreshold::new(-0.1).is_err());
let small_data = Array::from_shape_vec((1, 2), vec![1.0, 2.0]).expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
assert!(selector.fit(&small_data).is_err());
let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
.expect("Operation failed");
let selector_unfitted = VarianceThreshold::with_defaults();
assert!(selector_unfitted.transform(&data).is_err());
let mut selector = VarianceThreshold::with_defaults();
selector.fit(&data).expect("Operation failed");
assert!(selector.inverse_transform(&data).is_err());
}
#[test]
fn test_variance_threshold_feature_mismatch() {
let train_data =
Array::from_shape_vec((3, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
.expect("Operation failed");
let test_data =
Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
selector.fit(&train_data).expect("Operation failed");
assert!(selector.transform(&test_data).is_err());
}
#[test]
fn test_variance_calculation() {
let data = Array::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).expect("Operation failed");
let mut selector = VarianceThreshold::with_defaults();
selector.fit(&data).expect("Operation failed");
let variances = selector.variances().expect("Operation failed");
let expected_variance = 2.0 / 3.0;
assert_abs_diff_eq!(variances[0], expected_variance, epsilon = 1e-10);
}
#[test]
fn test_rfe_basic() {
let n_samples = 100;
let mut data_vec = Vec::new();
let mut target_vec = Vec::new();
for i in 0..n_samples {
let x1 = i as f64 / n_samples as f64;
let x2 = (i as f64 / n_samples as f64).sin();
let x3 = scirs2_core::random::random::<f64>(); let x4 = 2.0 * x1;
data_vec.extend_from_slice(&[x1, x2, x3, x4]);
target_vec.push(3.0 * x1 + x4 + 0.1 * scirs2_core::random::random::<f64>());
}
let x = Array::from_shape_vec((n_samples, 4), data_vec).expect("Operation failed");
let y = Array::from_vec(target_vec);
let importance_func = |x: &Array2<f64>, y: &Array1<f64>| -> Result<Array1<f64>> {
let n_features = x.shape()[1];
let mut scores = Array1::zeros(n_features);
for j in 0..n_features {
let feature = x.column(j);
let corr = pearson_correlation(&feature.to_owned(), y);
scores[j] = corr.abs();
}
Ok(scores)
};
let mut rfe = RecursiveFeatureElimination::new(2, importance_func);
let transformed = rfe.fit_transform(&x, &y).expect("Operation failed");
assert_eq!(transformed.shape()[1], 2);
let selected = rfe.get_support().expect("Operation failed");
assert!(selected.contains(&0) || selected.contains(&3));
}
#[test]
fn test_mutual_info_continuous() {
let n_samples = 100;
let mut x_data = Vec::new();
let mut y_data = Vec::new();
for i in 0..n_samples {
let t = i as f64 / n_samples as f64 * 2.0 * std::f64::consts::PI;
let x0 = t;
let x1 = scirs2_core::random::random::<f64>();
let x2 = t.sin();
x_data.extend_from_slice(&[x0, x1, x2]);
y_data.push(t + 0.5 * t.sin());
}
let x = Array::from_shape_vec((n_samples, 3), x_data).expect("Operation failed");
let y = Array::from_vec(y_data);
let mut selector = MutualInfoSelector::new(2);
selector.fit(&x, &y).expect("Operation failed");
let scores = selector.scores().expect("Operation failed");
assert!(scores[0] > scores[1]);
assert!(scores[2] > scores[1]);
}
#[test]
fn test_mutual_info_discrete() {
let x = Array::from_shape_vec(
(6, 3),
vec![
1.0, 0.1, 5.0, 1.1, 0.2, 5.1, 2.0, 0.1, 4.0, 2.1, 0.2, 4.1, 3.0, 0.1, 3.0, 3.1, 0.2, 3.1, ],
)
.expect("Operation failed");
let y = Array::from_vec(vec![0.0, 0.0, 1.0, 1.0, 2.0, 2.0]);
let mut selector = MutualInfoSelector::new(2).with_discrete_target();
let transformed = selector.fit_transform(&x, &y).expect("Operation failed");
assert_eq!(transformed.shape(), &[6, 2]);
let selected = selector.get_support().expect("Operation failed");
assert!(!selected.contains(&1));
}
fn pearson_correlation(x: &Array1<f64>, y: &Array1<f64>) -> f64 {
#[allow(unused_variables)]
let n = x.len() as f64;
let x_mean = x.mean_or(0.0);
let y_mean = y.mean_or(0.0);
let mut num = 0.0;
let mut x_var = 0.0;
let mut y_var = 0.0;
for i in 0..x.len() {
let x_diff = x[i] - x_mean;
let y_diff = y[i] - y_mean;
num += x_diff * y_diff;
x_var += x_diff * x_diff;
y_var += y_diff * y_diff;
}
if x_var * y_var > 0.0 {
num / (x_var * y_var).sqrt()
} else {
0.0
}
}
}