use crate::error::InsightError;
use u_numflow::matrix::Matrix;
#[derive(Debug, Clone)]
pub struct FeatureConfig {
pub variance_threshold: f64,
pub correlation_threshold: f64,
pub vif_threshold: f64,
}
impl Default for FeatureConfig {
fn default() -> Self {
Self {
variance_threshold: 1e-10,
correlation_threshold: 0.9,
vif_threshold: 10.0,
}
}
}
impl FeatureConfig {
pub fn variance_threshold(mut self, t: f64) -> Self {
self.variance_threshold = t;
self
}
pub fn correlation_threshold(mut self, t: f64) -> Self {
self.correlation_threshold = t;
self
}
pub fn vif_threshold(mut self, t: f64) -> Self {
self.vif_threshold = t;
self
}
}
#[derive(Debug, Clone)]
pub struct LowVarianceFeature {
pub name: String,
pub index: usize,
pub variance: f64,
}
#[derive(Debug, Clone)]
pub struct HighCorrelationPair {
pub feature_a: String,
pub feature_b: String,
pub index_a: usize,
pub index_b: usize,
pub correlation: f64,
}
#[derive(Debug, Clone)]
pub struct VifResult {
pub name: String,
pub index: usize,
pub vif: f64,
pub is_collinear: bool,
}
#[derive(Debug, Clone)]
pub struct FeatureScore {
pub name: String,
pub index: usize,
pub variance: f64,
pub max_abs_correlation: f64,
pub vif: f64,
pub importance: f64,
}
#[derive(Debug, Clone)]
pub struct FeatureAnalysisResult {
pub low_variance: Vec<LowVarianceFeature>,
pub high_correlations: Vec<HighCorrelationPair>,
pub vif_results: Vec<VifResult>,
pub condition_number: f64,
pub feature_scores: Vec<FeatureScore>,
pub n_features: usize,
pub n_observations: usize,
}
pub fn feature_analysis(
features: &[Vec<f64>],
names: &[String],
config: &FeatureConfig,
) -> Result<FeatureAnalysisResult, InsightError> {
let p = features.len();
if p == 0 {
return Err(InsightError::InvalidParameter {
name: "features".into(),
message: "at least 1 feature required".into(),
});
}
if names.len() != p {
return Err(InsightError::DimensionMismatch {
expected: p,
actual: names.len(),
});
}
let n = features[0].len();
if n < 2 {
return Err(InsightError::InsufficientData {
min_required: 2,
actual: n,
});
}
for (i, feat) in features.iter().enumerate() {
if feat.len() != n {
return Err(InsightError::DimensionMismatch {
expected: n,
actual: feat.len(),
});
}
for (j, &v) in feat.iter().enumerate() {
if !v.is_finite() {
return Err(InsightError::DegenerateData {
reason: format!(
"non-finite value at row {j} of feature '{}'",
names[i]
),
});
}
}
}
let variances: Vec<f64> = features.iter().map(|f| compute_variance(f)).collect();
let low_variance: Vec<LowVarianceFeature> = variances
.iter()
.enumerate()
.filter(|(_, &v)| v < config.variance_threshold)
.map(|(i, &v)| LowVarianceFeature {
name: names[i].clone(),
index: i,
variance: v,
})
.collect();
let means: Vec<f64> = features
.iter()
.map(|f| f.iter().sum::<f64>() / n as f64)
.collect();
let stds: Vec<f64> = variances.iter().map(|v| v.sqrt()).collect();
let mut correlation_matrix = vec![vec![0.0; p]; p];
for i in 0..p {
correlation_matrix[i][i] = 1.0;
for j in (i + 1)..p {
let r = if stds[i] > 1e-15 && stds[j] > 1e-15 {
let cov: f64 = features[i]
.iter()
.zip(features[j].iter())
.map(|(&a, &b)| (a - means[i]) * (b - means[j]))
.sum::<f64>()
/ (n - 1) as f64;
cov / (stds[i] * stds[j])
} else {
0.0
};
correlation_matrix[i][j] = r;
correlation_matrix[j][i] = r;
}
}
let high_correlations: Vec<HighCorrelationPair> = {
let mut pairs = Vec::new();
for i in 0..p {
for j in (i + 1)..p {
let r = correlation_matrix[i][j];
if r.abs() > config.correlation_threshold {
pairs.push(HighCorrelationPair {
feature_a: names[i].clone(),
feature_b: names[j].clone(),
index_a: i,
index_b: j,
correlation: r,
});
}
}
}
pairs.sort_by(|a, b| {
b.correlation
.abs()
.partial_cmp(&a.correlation.abs())
.unwrap_or(std::cmp::Ordering::Equal)
});
pairs
};
let vifs = compute_vif_values(features, n);
let vif_results: Vec<VifResult> = vifs
.iter()
.enumerate()
.map(|(i, &vif)| VifResult {
name: names[i].clone(),
index: i,
vif,
is_collinear: vif > config.vif_threshold,
})
.collect();
let condition_number = compute_condition_number(&correlation_matrix, p);
let max_abs_correlations: Vec<f64> = (0..p)
.map(|i| {
(0..p)
.filter(|&j| j != i)
.map(|j| correlation_matrix[i][j].abs())
.fold(0.0f64, f64::max)
})
.collect();
let mut feature_scores: Vec<FeatureScore> = (0..p)
.map(|i| {
let has_variance = variances[i] > config.variance_threshold;
let corr_score = 1.0 - max_abs_correlations[i].min(1.0);
let vif_score = if vifs[i].is_finite() && vifs[i] > 0.0 {
(1.0 / vifs[i]).min(1.0)
} else {
0.0
};
let importance = if has_variance {
(corr_score + vif_score) / 2.0
} else {
0.0
};
FeatureScore {
name: names[i].clone(),
index: i,
variance: variances[i],
max_abs_correlation: max_abs_correlations[i],
vif: vifs[i],
importance,
}
})
.collect();
feature_scores.sort_by(|a, b| {
b.importance
.partial_cmp(&a.importance)
.unwrap_or(std::cmp::Ordering::Equal)
});
Ok(FeatureAnalysisResult {
low_variance,
high_correlations,
vif_results,
condition_number,
feature_scores,
n_features: p,
n_observations: n,
})
}
fn compute_variance(data: &[f64]) -> f64 {
let n = data.len();
if n < 2 {
return 0.0;
}
let mean = data.iter().sum::<f64>() / n as f64;
let sum_sq: f64 = data.iter().map(|&x| (x - mean) * (x - mean)).sum();
sum_sq / (n - 1) as f64
}
fn compute_vif_values(features: &[Vec<f64>], n: usize) -> Vec<f64> {
let p = features.len();
if p <= 1 {
return vec![1.0; p];
}
let mut vifs = Vec::with_capacity(p);
for target in 0..p {
let predictors: Vec<&[f64]> = (0..p)
.filter(|&i| i != target)
.map(|i| features[i].as_slice())
.collect();
let r2 = compute_r2_multiple(&features[target], &predictors, n);
if r2 < 1.0 - 1e-15 {
vifs.push(1.0 / (1.0 - r2));
} else {
vifs.push(f64::INFINITY);
}
}
vifs
}
fn compute_r2_multiple(target: &[f64], predictors: &[&[f64]], n: usize) -> f64 {
let p = predictors.len();
if p == 0 || n < p + 1 {
return 0.0;
}
let y_mean = target.iter().sum::<f64>() / n as f64;
let p1 = p + 1; let mut xtx_data = vec![0.0; p1 * p1];
let mut xty = vec![0.0; p1];
for i in 0..n {
let y = target[i];
let mut x_row = Vec::with_capacity(p1);
x_row.push(1.0);
for pred in predictors {
x_row.push(pred[i]);
}
for j in 0..p1 {
xty[j] += x_row[j] * y;
for k in j..p1 {
xtx_data[j * p1 + k] += x_row[j] * x_row[k];
if j != k {
xtx_data[k * p1 + j] += x_row[j] * x_row[k];
}
}
}
}
let xtx = match Matrix::new(p1, p1, xtx_data) {
Ok(m) => m,
Err(_) => return 0.0,
};
let coeffs = match xtx.cholesky_solve(&xty) {
Ok(c) => c,
Err(_) => return 0.0, };
let mut ss_res = 0.0;
let mut ss_tot = 0.0;
for i in 0..n {
let mut y_hat = coeffs[0]; for (j, pred) in predictors.iter().enumerate() {
y_hat += coeffs[j + 1] * pred[i];
}
let y = target[i];
ss_res += (y - y_hat) * (y - y_hat);
ss_tot += (y - y_mean) * (y - y_mean);
}
if ss_tot < 1e-15 {
return 0.0;
}
1.0 - ss_res / ss_tot
}
fn compute_condition_number(corr: &[Vec<f64>], p: usize) -> f64 {
if p <= 1 {
return 1.0;
}
let mut data = vec![0.0; p * p];
for i in 0..p {
for j in 0..p {
data[i * p + j] = corr[i][j];
}
}
let mat = match Matrix::new(p, p, data) {
Ok(m) => m,
Err(_) => return f64::INFINITY,
};
match mat.eigen_symmetric() {
Ok((eigenvalues, _)) => {
let max_ev = eigenvalues
.iter()
.copied()
.fold(f64::NEG_INFINITY, f64::max);
let min_ev = eigenvalues.iter().copied().fold(f64::INFINITY, f64::min);
if min_ev.abs() < 1e-10 || min_ev <= 0.0 {
f64::INFINITY
} else {
(max_ev / min_ev).sqrt()
}
}
Err(_) => f64::INFINITY,
}
}
#[derive(Debug, Clone)]
pub struct PermutationImportanceFeature {
pub name: String,
pub index: usize,
pub importance: f64,
pub std_dev: f64,
}
#[derive(Debug, Clone)]
pub struct PermutationImportanceResult {
pub baseline_score: f64,
pub features: Vec<PermutationImportanceFeature>,
}
pub fn permutation_importance(
features: &[Vec<f64>],
feature_names: &[String],
target: &[f64],
n_repeats: usize,
seed: u64,
) -> Result<PermutationImportanceResult, InsightError> {
if features.is_empty() {
return Err(InsightError::InvalidParameter {
name: "features".into(),
message: "at least 1 feature required".into(),
});
}
let n = target.len();
let p = features.len();
if n < p + 2 {
return Err(InsightError::InsufficientData {
min_required: p + 2,
actual: n,
});
}
for (i, f) in features.iter().enumerate() {
if f.len() != n {
return Err(InsightError::DimensionMismatch {
expected: n,
actual: f.len(),
});
}
let nan_count = f.iter().filter(|v| v.is_nan()).count();
if nan_count > 0 {
let name = feature_names
.get(i)
.cloned()
.unwrap_or_else(|| format!("feature_{i}"));
return Err(InsightError::MissingValues {
column: name,
count: nan_count,
});
}
}
let pred_refs: Vec<&[f64]> = features.iter().map(|f| f.as_slice()).collect();
let baseline_score = compute_r2_multiple(target, &pred_refs, n).max(0.0);
let n_repeats = n_repeats.max(1);
let mut rng_state = seed;
let mut results: Vec<PermutationImportanceFeature> = Vec::with_capacity(p);
for fi in 0..p {
let mut decreases = Vec::with_capacity(n_repeats);
for _ in 0..n_repeats {
let mut permuted = features[fi].clone();
for i in (1..n).rev() {
rng_state = rng_state
.wrapping_mul(6364136223846793005)
.wrapping_add(1442695040888963407);
let j = (rng_state >> 33) as usize % (i + 1);
permuted.swap(i, j);
}
let mut perm_refs: Vec<&[f64]> = features.iter().map(|f| f.as_slice()).collect();
perm_refs[fi] = &permuted;
let perm_score = compute_r2_multiple(target, &perm_refs, n).max(0.0);
decreases.push(baseline_score - perm_score);
}
let mean_decrease = decreases.iter().sum::<f64>() / decreases.len() as f64;
let variance = if decreases.len() > 1 {
decreases
.iter()
.map(|&d| (d - mean_decrease).powi(2))
.sum::<f64>()
/ (decreases.len() - 1) as f64
} else {
0.0
};
let name = feature_names
.get(fi)
.cloned()
.unwrap_or_else(|| format!("feature_{fi}"));
results.push(PermutationImportanceFeature {
name,
index: fi,
importance: mean_decrease,
std_dev: variance.sqrt(),
});
}
results.sort_by(|a, b| {
b.importance
.partial_cmp(&a.importance)
.unwrap_or(std::cmp::Ordering::Equal)
});
Ok(PermutationImportanceResult {
baseline_score,
features: results,
})
}
#[cfg(test)]
mod tests {
use super::*;
fn names(n: &[&str]) -> Vec<String> {
n.iter().map(|s| s.to_string()).collect()
}
#[test]
fn constant_feature_flagged() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0],
vec![5.0, 5.0, 5.0, 5.0], ];
let result = feature_analysis(
&features,
&names(&["vary", "const"]),
&FeatureConfig::default(),
)
.unwrap();
assert_eq!(result.low_variance.len(), 1);
assert_eq!(result.low_variance[0].name, "const");
assert_eq!(result.low_variance[0].index, 1);
}
#[test]
fn all_varying_no_flags() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0], vec![10.0, 20.0, 30.0, 40.0]];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
assert!(result.low_variance.is_empty());
}
#[test]
fn custom_variance_threshold() {
let features = vec![
vec![1.0, 1.001, 1.002, 1.003], vec![1.0, 2.0, 3.0, 4.0],
];
let config = FeatureConfig::default().variance_threshold(0.01);
let result = feature_analysis(&features, &names(&["low", "high"]), &config).unwrap();
assert_eq!(result.low_variance.len(), 1);
assert_eq!(result.low_variance[0].name, "low");
}
#[test]
fn perfectly_correlated_detected() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![2.0, 4.0, 6.0, 8.0, 10.0], vec![5.0, 3.0, 7.0, 1.0, 9.0], ];
let result = feature_analysis(
&features,
&names(&["a", "b", "c"]),
&FeatureConfig::default(),
)
.unwrap();
assert!(!result.high_correlations.is_empty());
let pair = &result.high_correlations[0];
assert!((pair.correlation.abs() - 1.0).abs() < 1e-10);
}
#[test]
fn uncorrelated_no_pairs() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![5.0, 3.0, 1.0, 4.0, 2.0], ];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
assert!(result.high_correlations.is_empty());
}
#[test]
fn independent_features_low_vif() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
vec![8.0, 3.0, 6.0, 1.0, 7.0, 2.0, 5.0, 4.0],
vec![2.0, 7.0, 4.0, 5.0, 3.0, 8.0, 1.0, 6.0],
];
let result = feature_analysis(
&features,
&names(&["x1", "x2", "x3"]),
&FeatureConfig::default(),
)
.unwrap();
for vr in &result.vif_results {
assert!(
vr.vif < 5.0,
"VIF for {} = {}, expected < 5.0 for independent features",
vr.name,
vr.vif
);
assert!(!vr.is_collinear);
}
}
#[test]
fn collinear_features_high_vif() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
vec![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0], vec![5.0, 3.0, 7.0, 1.0, 9.0, 2.0, 6.0, 4.0], ];
let result = feature_analysis(
&features,
&names(&["x1", "x2", "x3"]),
&FeatureConfig::default(),
)
.unwrap();
assert!(
result.vif_results[0].vif > 10.0 || result.vif_results[0].vif.is_infinite(),
"VIF for x1 = {}, expected > 10",
result.vif_results[0].vif
);
}
#[test]
fn well_conditioned_features() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
vec![8.0, 3.0, 6.0, 1.0, 7.0, 2.0, 5.0, 4.0],
];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
assert!(
result.condition_number < 100.0,
"condition number = {}, expected reasonable for independent features",
result.condition_number
);
}
#[test]
fn ill_conditioned_features() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![1.001, 2.002, 3.003, 4.004, 5.005], ];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
assert!(
result.condition_number > 10.0,
"condition number = {}, expected high for near-identical features",
result.condition_number
);
}
#[test]
fn scores_sorted_descending() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![10.0, 20.0, 30.0, 40.0, 50.0],
vec![5.0, 5.0, 5.0, 5.0, 5.0], ];
let result = feature_analysis(
&features,
&names(&["a", "b", "const"]),
&FeatureConfig::default(),
)
.unwrap();
for i in 1..result.feature_scores.len() {
assert!(
result.feature_scores[i].importance <= result.feature_scores[i - 1].importance,
"scores not sorted: {} > {}",
result.feature_scores[i].importance,
result.feature_scores[i - 1].importance
);
}
let last = result.feature_scores.last().expect("non-empty");
assert_eq!(last.name, "const");
}
#[test]
fn scores_in_range() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0], vec![5.0, 3.0, 7.0, 1.0]];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
for score in &result.feature_scores {
assert!(
(0.0..=1.0).contains(&score.importance),
"{}: importance {} out of range",
score.name,
score.importance
);
}
}
#[test]
fn empty_features() {
let features: Vec<Vec<f64>> = vec![];
assert!(feature_analysis(&features, &[], &FeatureConfig::default()).is_err());
}
#[test]
fn name_count_mismatch() {
let features = vec![vec![1.0, 2.0]];
assert!(
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).is_err()
);
}
#[test]
fn insufficient_data() {
let features = vec![vec![1.0]];
assert!(feature_analysis(&features, &names(&["a"]), &FeatureConfig::default()).is_err());
}
#[test]
fn nan_rejected() {
let features = vec![vec![1.0, f64::NAN, 3.0]];
assert!(feature_analysis(&features, &names(&["a"]), &FeatureConfig::default()).is_err());
}
#[test]
fn single_feature_analysis() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0]];
let result =
feature_analysis(&features, &names(&["only"]), &FeatureConfig::default()).unwrap();
assert_eq!(result.n_features, 1);
assert_eq!(result.vif_results.len(), 1);
assert!((result.vif_results[0].vif - 1.0).abs() < 1e-10);
assert!(result.high_correlations.is_empty());
}
#[test]
fn result_metadata() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0], vec![5.0, 6.0, 7.0, 8.0]];
let result =
feature_analysis(&features, &names(&["a", "b"]), &FeatureConfig::default()).unwrap();
assert_eq!(result.n_features, 2);
assert_eq!(result.n_observations, 4);
}
#[test]
fn perm_importance_signal_vs_noise() {
let signal: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let noise: Vec<f64> = vec![5.0, 3.0, 7.0, 1.0, 8.0, 2.0, 6.0, 4.0, 9.0, 0.0];
let target: Vec<f64> = signal.iter().map(|&x| 2.0 * x + 1.0).collect();
let result = permutation_importance(
&[signal, noise],
&["signal".into(), "noise".into()],
&target,
5,
42,
)
.unwrap();
assert_eq!(result.features.len(), 2);
assert!(result.baseline_score > 0.5);
assert!(
result.features[0].importance > result.features[1].importance,
"signal imp={} should > noise imp={}",
result.features[0].importance,
result.features[1].importance,
);
}
#[test]
fn perm_importance_std_dev_nonneg() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
vec![5.0, 3.0, 7.0, 1.0, 8.0, 2.0, 6.0, 4.0],
];
let target: Vec<f64> = features[0].iter().map(|&x| x * 3.0).collect();
let result =
permutation_importance(&features, &["a".into(), "b".into()], &target, 3, 42).unwrap();
for f in &result.features {
assert!(f.std_dev >= 0.0, "{} std_dev={}", f.name, f.std_dev);
}
}
#[test]
fn perm_importance_sorted_desc() {
let f1: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let f2: Vec<f64> = vec![5.0, 3.0, 7.0, 1.0, 8.0, 2.0, 6.0, 4.0, 9.0, 0.0];
let target: Vec<f64> = f1.iter().map(|&x| x * 2.0).collect();
let result =
permutation_importance(&[f1, f2], &["f1".into(), "f2".into()], &target, 3, 42).unwrap();
for i in 1..result.features.len() {
assert!(result.features[i].importance <= result.features[i - 1].importance);
}
}
#[test]
fn perm_importance_rejects_nan() {
let features = vec![vec![1.0, f64::NAN, 3.0, 4.0, 5.0]];
let target = vec![1.0, 2.0, 3.0, 4.0, 5.0];
assert!(permutation_importance(&features, &["a".into()], &target, 3, 42).is_err());
}
#[test]
fn perm_importance_rejects_empty() {
let features: Vec<Vec<f64>> = vec![];
let target: Vec<f64> = vec![];
assert!(permutation_importance(&features, &[], &target, 3, 42).is_err());
}
}