use crate::defaults::feature_selection as feature_selection_defaults;
/// Thresholds and switches that control feature selection.
///
/// Instances are normally created with `SelectionConfig::new()` (or
/// `Default::default()`) and then adjusted through the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct SelectionConfig {
/// Candidates whose variance is strictly below this value are discarded by
/// `FeatureSelector::select`.
pub min_variance: f32,
/// Candidates whose largest absolute correlation with any original feature
/// reaches or exceeds this value are discarded by `FeatureSelector::select`.
pub max_correlation: f32,
/// Upper bound on the number of candidates `FeatureSelector::select` returns.
pub max_features: usize,
/// When `true` (and targets of matching length are supplied), `select` ranks
/// the surviving candidates by absolute correlation with the targets.
pub use_target_selection: bool,
/// Switch for collinear-feature removal.
/// NOTE(review): nothing in this file reads this flag; presumably callers check
/// it before invoking `FeatureSelector::drop_collinear_features` — confirm.
pub drop_collinear: bool,
/// Two features count as collinear in `drop_collinear_features` when the
/// absolute value of their pairwise correlation is strictly above this value.
pub collinearity_threshold: f32,
}
impl Default for SelectionConfig {
    /// Builds a configuration in which every field takes its value from the
    /// corresponding constant in the crate's `feature_selection` defaults module.
    fn default() -> Self {
        let min_variance = feature_selection_defaults::DEFAULT_MIN_VARIANCE;
        let max_correlation = feature_selection_defaults::DEFAULT_MAX_CORRELATION;
        let max_features = feature_selection_defaults::DEFAULT_MAX_FEATURES;
        let use_target_selection = feature_selection_defaults::DEFAULT_USE_TARGET_SELECTION;
        let drop_collinear = feature_selection_defaults::DEFAULT_DROP_COLLINEAR;
        let collinearity_threshold = feature_selection_defaults::DEFAULT_COLLINEARITY_THRESHOLD;

        Self {
            min_variance,
            max_correlation,
            max_features,
            use_target_selection,
            drop_collinear,
            collinearity_threshold,
        }
    }
}
impl SelectionConfig {
    /// Creates a configuration holding the default value for every field.
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Returns this configuration with `min_variance` replaced by `min`.
    pub fn with_min_variance(self, min: f32) -> Self {
        Self {
            min_variance: min,
            ..self
        }
    }

    /// Returns this configuration with `max_correlation` replaced by `max`.
    pub fn with_max_correlation(self, max: f32) -> Self {
        Self {
            max_correlation: max,
            ..self
        }
    }

    /// Returns this configuration with `max_features` replaced by `max`.
    pub fn with_max_features(self, max: usize) -> Self {
        Self {
            max_features: max,
            ..self
        }
    }

    /// Returns this configuration with `use_target_selection` set to `enabled`.
    pub fn with_target_selection(self, enabled: bool) -> Self {
        Self {
            use_target_selection: enabled,
            ..self
        }
    }

    /// Returns this configuration with `drop_collinear` set to `enabled`.
    pub fn with_drop_collinear(self, enabled: bool) -> Self {
        Self {
            drop_collinear: enabled,
            ..self
        }
    }

    /// Returns this configuration with `collinearity_threshold` replaced by `threshold`.
    pub fn with_collinearity_threshold(self, threshold: f32) -> Self {
        Self {
            collinearity_threshold: threshold,
            ..self
        }
    }
}
/// Applies a `SelectionConfig` to flat, row-major feature matrices.
///
/// The selector only stores its configuration; every method borrows it immutably.
pub struct FeatureSelector {
// Thresholds and switches consulted by `select` and `drop_collinear_features`.
config: SelectionConfig,
}
impl FeatureSelector {
    /// Creates a selector that applies `config`.
    pub fn new(config: SelectionConfig) -> Self {
        Self { config }
    }

    /// Filters (and optionally ranks) candidate features.
    ///
    /// Both matrices are flat, row-major slices: element `(row, col)` lives at
    /// `row * num_features + col`. The number of rows is derived from
    /// `candidate_data.len() / candidate_num_features`.
    ///
    /// Stages, in order:
    /// 1. Drop candidates whose variance is below `min_variance`.
    /// 2. Drop candidates whose largest absolute correlation with any original
    ///    feature is `>= max_correlation`. The stage is skipped when there are no
    ///    original features or when `original_data` holds fewer rows than the
    ///    candidates (the correlation cannot be computed in that case).
    /// 3. If `use_target_selection` is set and `targets` has exactly one value per
    ///    row, order the survivors by descending absolute target correlation.
    ///    Otherwise the survivors stay in ascending column order.
    /// 4. Keep at most `max_features` candidates.
    ///
    /// Returns `(data, names, indices)` where `data` is the row-major matrix of
    /// the selected columns, `indices` are their column positions in
    /// `candidate_data`, and `names` are the matching entries of
    /// `candidate_names`. Indices with no entry in `candidate_names` contribute
    /// no name, so `names` can be shorter than `indices`.
    pub fn select(
        &self,
        original_data: &[f32],
        original_num_features: usize,
        candidate_data: &[f32],
        candidate_num_features: usize,
        candidate_names: &[String],
        targets: Option<&[f32]>,
    ) -> (Vec<f32>, Vec<String>, Vec<usize>) {
        if candidate_num_features == 0 || candidate_data.is_empty() {
            return (Vec::new(), Vec::new(), Vec::new());
        }
        let num_rows = candidate_data.len() / candidate_num_features;

        // Stage 1: variance filter. A NaN variance fails the comparison and is dropped.
        let mut candidates: Vec<(usize, f32)> = (0..candidate_num_features)
            .map(|f| {
                let variance =
                    compute_variance(candidate_data, num_rows, candidate_num_features, f);
                (f, variance)
            })
            .filter(|&(_, variance)| variance >= self.config.min_variance)
            .collect();

        // Stage 2: redundancy filter against the original features. The row-count
        // guard prevents an out-of-bounds read when the two matrices disagree.
        let originals_usable = original_num_features > 0
            && !original_data.is_empty()
            && original_data.len() / original_num_features >= num_rows;
        if originals_usable {
            candidates.retain(|&(f, _)| {
                let max_corr = compute_max_correlation_with_originals(
                    original_data,
                    original_num_features,
                    candidate_data,
                    candidate_num_features,
                    num_rows,
                    f,
                );
                max_corr < self.config.max_correlation
            });
        }

        // Stage 3: optional ranking by absolute correlation with the targets.
        let ranking_targets =
            targets.filter(|t| self.config.use_target_selection && t.len() == num_rows);
        if let Some(targets) = ranking_targets {
            let mut scored: Vec<(usize, f32)> = candidates
                .iter()
                .map(|&(f, _)| {
                    let corr = compute_target_correlation(
                        candidate_data,
                        candidate_num_features,
                        num_rows,
                        f,
                        targets,
                    );
                    // A non-finite score carries no ranking information; scoring it
                    // as 0 also keeps the comparator below a consistent ordering.
                    let score = if corr.is_finite() { corr.abs() } else { 0.0 };
                    (f, score)
                })
                .collect();
            scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
            candidates = scored;
        }

        // Stage 4: cap the number of features (no-op when already within the cap).
        candidates.truncate(self.config.max_features);

        let selected_indices: Vec<usize> = candidates.iter().map(|&(f, _)| f).collect();
        let selected_names: Vec<String> = selected_indices
            .iter()
            .filter_map(|&f| candidate_names.get(f).cloned())
            .collect();
        let selected_data = Self::gather_columns(
            candidate_data,
            num_rows,
            candidate_num_features,
            &selected_indices,
        );
        (selected_data, selected_names, selected_indices)
    }

    /// Removes one feature from every pair whose absolute pairwise correlation is
    /// strictly above `collinearity_threshold`.
    ///
    /// `data` is a flat, row-major matrix with `num_features` columns. For each
    /// collinear pair `(i, j)` with `i < j`:
    /// * when `targets` has exactly one value per row, the feature with the lower
    ///   absolute target correlation is dropped (a tie keeps `i`);
    /// * otherwise the feature with the lower variance is dropped (a tie keeps `i`).
    ///
    /// Targets whose length does not match the row count are ignored, and a
    /// non-finite target correlation is scored as 0 so that it never outranks a
    /// valid one.
    ///
    /// With at most one feature, or empty `data`, the input is returned unchanged.
    ///
    /// Returns `(data, names, indices)` for the kept columns, in ascending column
    /// order. Indices with no entry in `feature_names` contribute no name.
    pub fn drop_collinear_features(
        &self,
        data: &[f32],
        num_features: usize,
        feature_names: &[String],
        targets: Option<&[f32]>,
    ) -> (Vec<f32>, Vec<String>, Vec<usize>) {
        if num_features <= 1 || data.is_empty() {
            return (
                data.to_vec(),
                feature_names.to_vec(),
                (0..num_features).collect(),
            );
        }
        let num_rows = data.len() / num_features;
        let threshold = self.config.collinearity_threshold;

        let stats: Vec<(f32, f32, f32)> = (0..num_features)
            .map(|f| compute_feature_stats(data, num_rows, num_features, f))
            .collect();

        // Only use targets that line up with the rows; mismatched targets would
        // otherwise be indexed out of bounds or skew the correlation.
        let target_corrs: Option<Vec<f32>> =
            targets.filter(|t| t.len() == num_rows).map(|t| {
                (0..num_features)
                    .map(|f| {
                        let corr = compute_target_correlation(data, num_features, num_rows, f, t);
                        if corr.is_finite() {
                            corr.abs()
                        } else {
                            0.0
                        }
                    })
                    .collect()
            });

        let mut dropped = vec![false; num_features];
        for i in 0..num_features {
            if dropped[i] {
                continue;
            }
            for j in (i + 1)..num_features {
                if dropped[j] {
                    continue;
                }
                let corr = compute_pairwise_correlation(
                    data,
                    num_features,
                    num_rows,
                    i,
                    j,
                    &stats[i],
                    &stats[j],
                );
                if corr.abs() <= threshold {
                    continue;
                }
                let drop_j = match &target_corrs {
                    Some(tc) => tc[i] >= tc[j],
                    None => stats[i].2 >= stats[j].2,
                };
                if drop_j {
                    dropped[j] = true;
                } else {
                    // Feature `i` is gone, so there is nothing left to compare it with.
                    dropped[i] = true;
                    break;
                }
            }
        }

        let kept_indices: Vec<usize> = (0..num_features).filter(|&f| !dropped[f]).collect();
        let kept_names: Vec<String> = kept_indices
            .iter()
            .filter_map(|&f| feature_names.get(f).cloned())
            .collect();
        let kept_data = Self::gather_columns(data, num_rows, num_features, &kept_indices);
        (kept_data, kept_names, kept_indices)
    }

    /// Copies the given `columns` of a row-major matrix into a new row-major
    /// matrix with `columns.len()` columns, preserving the order of `columns`.
    fn gather_columns(
        data: &[f32],
        num_rows: usize,
        num_features: usize,
        columns: &[usize],
    ) -> Vec<f32> {
        let mut gathered = Vec::with_capacity(num_rows * columns.len());
        for r in 0..num_rows {
            let row = &data[r * num_features..(r + 1) * num_features];
            gathered.extend(columns.iter().map(|&c| row[c]));
        }
        gathered
    }
}
/// Computes `(mean, std, variance)` for column `feature_idx` of a flat,
/// row-major matrix with `num_features` columns and `num_rows` rows.
///
/// Non-finite entries are left out of both sums, but each sum is still divided
/// by `num_rows`. The returned standard deviation is floored at `1e-10` so it
/// can be used as a divisor. With zero rows the result is `(0.0, 1.0, 0.0)`.
fn compute_feature_stats(
    data: &[f32],
    num_rows: usize,
    num_features: usize,
    feature_idx: usize,
) -> (f32, f32, f32) {
    if num_rows == 0 {
        return (0.0, 1.0, 0.0);
    }

    let value_at = |r: usize| data[r * num_features + feature_idx];
    let row_count = num_rows as f32;

    let finite_total: f32 = (0..num_rows)
        .map(&value_at)
        .filter(|v| v.is_finite())
        .sum();
    let mean = finite_total / row_count;

    let squared_deviation_total: f32 = (0..num_rows)
        .map(|r| match value_at(r) {
            v if v.is_finite() => (v - mean) * (v - mean),
            _ => 0.0,
        })
        .sum();
    let variance = squared_deviation_total / row_count;

    (mean, variance.sqrt().max(1e-10), variance)
}
/// Correlation between columns `idx_a` and `idx_b` of a flat, row-major matrix,
/// using caller-supplied `(mean, std, variance)` tuples for both columns.
///
/// Rows in which either value is non-finite add nothing to the covariance sum,
/// yet the sum is still divided by `num_rows`. The result is the covariance
/// divided by the product of the two supplied standard deviations.
fn compute_pairwise_correlation(
    data: &[f32],
    num_features: usize,
    num_rows: usize,
    idx_a: usize,
    idx_b: usize,
    stats_a: &(f32, f32, f32),
    stats_b: &(f32, f32, f32),
) -> f32 {
    let &(mean_a, std_a, _) = stats_a;
    let &(mean_b, std_b, _) = stats_b;

    let cross_total: f32 = (0..num_rows)
        .map(|r| {
            let row_start = r * num_features;
            let a = data[row_start + idx_a];
            let b = data[row_start + idx_b];
            if !(a.is_finite() && b.is_finite()) {
                return 0.0;
            }
            (a - mean_a) * (b - mean_b)
        })
        .sum();

    let covariance = cross_total / num_rows as f32;
    covariance / (std_a * std_b)
}
/// Population variance (divides by `num_rows`) of column `feature_idx` in a
/// flat, row-major matrix with `num_features` columns.
///
/// Returns `0.0` when `num_rows` is zero. Values are not screened, so a
/// non-finite entry propagates into the result.
fn compute_variance(data: &[f32], num_rows: usize, num_features: usize, feature_idx: usize) -> f32 {
    if num_rows == 0 {
        return 0.0;
    }

    let value_at = |r: usize| data[r * num_features + feature_idx];
    let row_count = num_rows as f32;

    let mean = (0..num_rows).map(&value_at).sum::<f32>() / row_count;
    let squared_deviation_total = (0..num_rows)
        .map(|r| {
            let deviation = value_at(r) - mean;
            deviation * deviation
        })
        .sum::<f32>();

    squared_deviation_total / row_count
}
/// Largest absolute correlation between candidate column `candidate_idx` and
/// any column of the original matrix.
///
/// Both matrices are flat and row-major, and the first `num_rows` rows of each
/// are read. Standard deviations are floored at `1e-10` before dividing. The
/// running maximum starts at `0.0`, which is also the result when
/// `original_num_features` is zero; NaN correlations never replace the maximum.
fn compute_max_correlation_with_originals(
    original_data: &[f32],
    original_num_features: usize,
    candidate_data: &[f32],
    candidate_num_features: usize,
    num_rows: usize,
    candidate_idx: usize,
) -> f32 {
    let row_count = num_rows as f32;

    // Statistics of the candidate column are shared by every comparison below.
    let candidate_at = |r: usize| candidate_data[r * candidate_num_features + candidate_idx];
    let cand_mean = (0..num_rows).map(&candidate_at).sum::<f32>() / row_count;
    let cand_variance = (0..num_rows)
        .map(|r| {
            let deviation = candidate_at(r) - cand_mean;
            deviation * deviation
        })
        .sum::<f32>()
        / row_count;
    let cand_std = cand_variance.sqrt().max(1e-10);

    (0..original_num_features)
        .map(|orig_idx| {
            let original_at = |r: usize| original_data[r * original_num_features + orig_idx];

            let orig_mean = (0..num_rows).map(&original_at).sum::<f32>() / row_count;
            let orig_variance = (0..num_rows)
                .map(|r| {
                    let deviation = original_at(r) - orig_mean;
                    deviation * deviation
                })
                .sum::<f32>()
                / row_count;
            let orig_std = orig_variance.sqrt().max(1e-10);

            let covariance = (0..num_rows)
                .map(|r| {
                    let x = candidate_at(r) - cand_mean;
                    let y = original_at(r) - orig_mean;
                    x * y
                })
                .sum::<f32>()
                / row_count;

            (covariance / (cand_std * orig_std)).abs()
        })
        .fold(0.0f32, f32::max)
}
/// Correlation between candidate column `candidate_idx` and the target values.
///
/// `candidate_data` is a flat, row-major matrix with `candidate_num_features`
/// columns; its first `num_rows` rows are paired with the first `num_rows`
/// entries of `targets`. Any surplus targets are ignored so that they cannot
/// skew the target mean and standard deviation.
///
/// Returns `0.0` when `num_rows` is zero or when `targets` has fewer than
/// `num_rows` entries (there is no value to pair with every row). Standard
/// deviations are floored at `1e-10` before dividing, so a constant column or
/// constant targets give `0.0` rather than NaN.
fn compute_target_correlation(
    candidate_data: &[f32],
    candidate_num_features: usize,
    num_rows: usize,
    candidate_idx: usize,
    targets: &[f32],
) -> f32 {
    if num_rows == 0 || targets.len() < num_rows {
        return 0.0;
    }
    // Work only with the targets that have a matching row.
    let targets = &targets[..num_rows];
    let row_count = num_rows as f32;
    let candidate_at = |r: usize| candidate_data[r * candidate_num_features + candidate_idx];

    let cand_mean = (0..num_rows).map(&candidate_at).sum::<f32>() / row_count;
    let target_mean = targets.iter().sum::<f32>() / row_count;

    let cand_variance = (0..num_rows)
        .map(|r| {
            let diff = candidate_at(r) - cand_mean;
            diff * diff
        })
        .sum::<f32>()
        / row_count;
    let cand_std = cand_variance.sqrt().max(1e-10);

    let target_variance = targets
        .iter()
        .map(|&t| {
            let diff = t - target_mean;
            diff * diff
        })
        .sum::<f32>()
        / row_count;
    let target_std = target_variance.sqrt().max(1e-10);

    let covar = targets
        .iter()
        .enumerate()
        .map(|(r, &t)| (candidate_at(r) - cand_mean) * (t - target_mean))
        .sum::<f32>()
        / row_count;

    covar / (cand_std * target_std)
}
// Unit tests for the configuration builder, the selector, and the statistics helpers.
// All matrices below are flat, row-major slices: element (row, col) sits at
// `row * num_features + col`.
#[cfg(test)]
mod tests {
use super::*;
/// Pins the default values of the four selection-related fields.
#[test]
fn test_selection_config_default() {
let config = SelectionConfig::default();
assert!((config.min_variance - 0.01).abs() < 1e-6);
assert!((config.max_correlation - 0.95).abs() < 1e-6);
assert_eq!(config.max_features, 50);
assert!(config.use_target_selection);
}
/// Each builder method overrides exactly the field it is named after.
#[test]
fn test_selection_config_builder() {
let config = SelectionConfig::new()
.with_min_variance(0.1)
.with_max_correlation(0.9)
.with_max_features(10)
.with_target_selection(false);
assert!((config.min_variance - 0.1).abs() < 1e-6);
assert!((config.max_correlation - 0.9).abs() < 1e-6);
assert_eq!(config.max_features, 10);
assert!(!config.use_target_selection);
}
/// A constant column (variance 0) is rejected while a varying column is kept.
#[test]
fn test_variance_filter() {
let config = SelectionConfig::new()
.with_min_variance(0.5)
.with_max_features(100);
let selector = FeatureSelector::new(config);
// 4 rows x 2 features: column 0 is 1..4, column 1 is constant.
let candidates = vec![1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0];
let names = vec!["high_var".to_string(), "low_var".to_string()];
let (_, selected_names, _) = selector.select(&[], 0, &candidates, 2, &names, None);
assert_eq!(selected_names.len(), 1);
assert_eq!(selected_names[0], "high_var");
}
/// The result is capped at `max_features` even when more candidates survive.
#[test]
fn test_max_features_limit() {
let config = SelectionConfig::new()
.with_min_variance(0.0)
.with_max_features(2);
let selector = FeatureSelector::new(config);
// 2 rows x 4 features.
let candidates = vec![1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 5.0];
let names = vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
"d".to_string(),
];
let (_, selected_names, _) = selector.select(&[], 0, &candidates, 4, &names, None);
assert_eq!(selected_names.len(), 2);
}
/// With target ranking enabled and room for one feature, the column that
/// tracks the targets wins over the constant column.
#[test]
fn test_target_selection() {
let config = SelectionConfig::new()
.with_min_variance(0.0)
.with_max_features(1)
.with_target_selection(true);
let selector = FeatureSelector::new(config);
// 4 rows x 2 features: column 0 is constant, column 1 equals the targets.
let candidates = vec![
1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0, ];
let names = vec!["uncorrelated".to_string(), "correlated".to_string()];
let targets = vec![1.0, 2.0, 3.0, 4.0];
let (_, selected_names, _) =
selector.select(&[], 0, &candidates, 2, &names, Some(&targets));
assert_eq!(selected_names.len(), 1);
assert_eq!(selected_names[0], "correlated");
}
/// No candidates in, three empty vectors out.
#[test]
fn test_empty_input() {
let selector = FeatureSelector::new(SelectionConfig::default());
let (data, names, indices) = selector.select(&[], 0, &[], 0, &[], None);
assert!(data.is_empty());
assert!(names.is_empty());
assert!(indices.is_empty());
}
/// `compute_variance` divides by the row count (population variance).
#[test]
fn test_variance_computation() {
let data = vec![1.0, 2.0, 3.0, 4.0];
let var = compute_variance(&data, 4, 1, 0);
assert!((var - 1.25).abs() < 1e-6);
}
/// Pins the default values of the two collinearity-related fields.
#[test]
fn test_collinearity_config_defaults() {
let config = SelectionConfig::default();
assert!(!config.drop_collinear);
assert!((config.collinearity_threshold - 0.99).abs() < 1e-6);
}
/// The collinearity builder methods override their respective fields.
#[test]
fn test_collinearity_config_builder() {
let config = SelectionConfig::new()
.with_drop_collinear(true)
.with_collinearity_threshold(0.95);
assert!(config.drop_collinear);
assert!((config.collinearity_threshold - 0.95).abs() < 1e-6);
}
/// Perfectly correlated columns are reduced; names and indices stay in step.
#[test]
fn test_drop_collinear_perfect_correlation() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
// 4 rows x 3 features: f1 = 2 * f0 and f2 = 5 - f0.
let data = vec![
1.0, 2.0, 4.0, 2.0, 4.0, 3.0, 3.0, 6.0, 2.0, 4.0, 8.0, 1.0, ];
let names = vec!["f0".to_string(), "f1".to_string(), "f2".to_string()];
let (_, kept_names, kept_indices) =
selector.drop_collinear_features(&data, 3, &names, None);
assert!(kept_names.len() <= 2);
assert_eq!(kept_names.len(), kept_indices.len());
}
/// Columns below the collinearity threshold are all kept.
#[test]
fn test_drop_collinear_uncorrelated_features() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
// 4 rows x 3 features.
let data = vec![
1.0, 5.0, 9.0, 2.0, 3.0, 8.0, 3.0, 8.0, 2.0, 4.0, 1.0, 5.0, ];
let names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
let (_, kept_names, _) = selector.drop_collinear_features(&data, 3, &names, None);
assert_eq!(kept_names.len(), 3);
}
/// With targets supplied, the asserted outcome is that the earlier of two
/// collinear features ("f0") is the one kept.
#[test]
fn test_drop_collinear_with_target() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
// 4 rows x 2 features: f1 = 2 * f0, and f0 equals the targets.
let data = vec![
1.0, 2.0, 2.0, 4.0, 3.0, 6.0, 4.0, 8.0, ];
let names = vec!["f0".to_string(), "f1".to_string()];
let targets = vec![1.0, 2.0, 3.0, 4.0];
let (_, kept_names, _) = selector.drop_collinear_features(&data, 2, &names, Some(&targets));
assert_eq!(kept_names.len(), 1);
assert_eq!(kept_names[0], "f0");
}
/// Without targets, the higher-variance member of a collinear pair is kept.
#[test]
fn test_drop_collinear_keeps_higher_variance() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
// 4 rows x 2 features: column 1 is 10x column 0.
let data = vec![
1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, ];
let names = vec!["low_var".to_string(), "high_var".to_string()];
let (_, kept_names, _) = selector.drop_collinear_features(&data, 2, &names, None);
assert_eq!(kept_names.len(), 1);
assert_eq!(kept_names[0], "high_var");
}
/// A single feature has nothing to be collinear with; input is returned as-is.
#[test]
fn test_drop_collinear_single_feature() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
let data = vec![1.0, 2.0, 3.0, 4.0];
let names = vec!["only_one".to_string()];
let (kept_data, kept_names, kept_indices) =
selector.drop_collinear_features(&data, 1, &names, None);
assert_eq!(kept_data, data);
assert_eq!(kept_names, names);
assert_eq!(kept_indices, vec![0]);
}
/// Empty input yields three empty vectors.
#[test]
fn test_drop_collinear_empty_input() {
let config = SelectionConfig::new().with_collinearity_threshold(0.9);
let selector = FeatureSelector::new(config);
let (kept_data, kept_names, kept_indices) =
selector.drop_collinear_features(&[], 0, &[], None);
assert!(kept_data.is_empty());
assert!(kept_names.is_empty());
assert!(kept_indices.is_empty());
}
/// A column and its doubled copy have pairwise correlation 1.
#[test]
fn test_pairwise_correlation() {
// 4 rows x 2 features: column 1 = 2 * column 0.
let data = vec![
1.0, 2.0, 2.0, 4.0, 3.0, 6.0, 4.0, 8.0, ];
let stats_0 = compute_feature_stats(&data, 4, 2, 0);
let stats_1 = compute_feature_stats(&data, 4, 2, 1);
let corr = compute_pairwise_correlation(&data, 2, 4, 0, 1, &stats_0, &stats_1);
assert!((corr - 1.0).abs() < 1e-5);
}
/// `compute_feature_stats` returns (mean, std, variance) with std = sqrt(variance).
#[test]
fn test_feature_stats() {
let data = vec![1.0, 2.0, 3.0, 4.0];
let (mean, std, var) = compute_feature_stats(&data, 4, 1, 0);
assert!((mean - 2.5).abs() < 1e-6);
assert!((var - 1.25).abs() < 1e-6);
assert!((std - 1.25f32.sqrt()).abs() < 1e-6);
}
}