// datasynth_eval/ml/feature_quality.rs

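//! Feature quality evaluation.
//!
//! Ranks features by the absolute Pearson correlation between their values and
//! the label, and flags multicollinearity via a simplified VIF (Variance
//! Inflation Factor) based on the largest pairwise squared correlation.
//!
//! NOTE: feature stability is listed as a goal of this module, but nothing in
//! this file computes it yet.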
5
6use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8
/// A single named feature column, optionally paired with label values.
///
/// `values` and `label_values` are expected to be aligned sample-by-sample.
/// When their lengths differ, the correlation code in this module only looks
/// at the common prefix of the two vectors.
#[derive(Debug, Clone, PartialEq)]
pub struct FeatureVector {
    /// Name of this feature; used to identify it in VIF and importance output.
    pub feature_name: String,
    /// Observed values for this feature, one per sample.
    pub values: Vec<f64>,
    /// Optional label values for correlation-based importance.
    ///
    /// When `None`, the feature is left out of the importance rankings.
    pub label_values: Option<Vec<f64>>,
}
19
/// Thresholds for feature quality analysis.
#[derive(Debug, Clone, PartialEq)]
pub struct FeatureQualityThresholds {
    /// Minimum overall feature quality score; a lower score is reported as an issue.
    pub min_feature_quality: f64,
    /// Maximum VIF before a feature is flagged as multicollinear.
    ///
    /// The comparison is strict: a feature is flagged only when its VIF is
    /// greater than this value.
    pub max_vif: f64,
}

impl Default for FeatureQualityThresholds {
    /// Defaults: quality score of at least `0.60` and VIF of at most `10.0`.
    fn default() -> Self {
        Self {
            min_feature_quality: 0.60,
            max_vif: 10.0,
        }
    }
}
37
/// Results of feature quality analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureQualityAnalysis {
    /// Overall feature quality score (0.0-1.0).
    ///
    /// Fraction of features that are not flagged as multicollinear, halved
    /// when no importance ranking could be computed.
    pub feature_quality_score: f64,
    /// Per-feature VIF values, one `(name, vif)` entry per input feature in
    /// input order. A feature that is perfectly correlated with another one
    /// is reported with `f64::MAX`.
    pub per_feature_vif: Vec<(String, f64)>,
    /// Names of the features whose VIF exceeds the configured threshold.
    pub multicollinear_features: Vec<String>,
    /// Feature importance rankings (descending by absolute correlation with label).
    ///
    /// Features without label values, or whose correlation cannot be
    /// computed, do not appear here.
    pub importance_rankings: Vec<(String, f64)>,
    /// Total number of features analyzed.
    pub total_features: usize,
    /// Whether the analysis passes all thresholds.
    ///
    /// This is `true` when `issues` is empty, with one exception: an empty
    /// input is reported as passing even though an issue is recorded.
    pub passes: bool,
    /// Human-readable descriptions of the issues found during analysis.
    pub issues: Vec<String>,
}
56
57/// Analyzer for feature quality metrics.
58pub struct FeatureQualityAnalyzer {
59    thresholds: FeatureQualityThresholds,
60}
61
62impl FeatureQualityAnalyzer {
63    /// Create a new analyzer with default thresholds.
64    pub fn new() -> Self {
65        Self {
66            thresholds: FeatureQualityThresholds::default(),
67        }
68    }
69
70    /// Create an analyzer with custom thresholds.
71    pub fn with_thresholds(thresholds: FeatureQualityThresholds) -> Self {
72        Self { thresholds }
73    }
74
75    /// Analyze feature quality.
76    pub fn analyze(&self, features: &[FeatureVector]) -> EvalResult<FeatureQualityAnalysis> {
77        let mut issues = Vec::new();
78        let total_features = features.len();
79
80        if features.is_empty() {
81            return Ok(FeatureQualityAnalysis {
82                feature_quality_score: 0.0,
83                per_feature_vif: Vec::new(),
84                multicollinear_features: Vec::new(),
85                importance_rankings: Vec::new(),
86                total_features: 0,
87                passes: true,
88                issues: vec!["No features provided".to_string()],
89            });
90        }
91
92        // Compute importance rankings via Pearson correlation with label
93        let mut importance_rankings: Vec<(String, f64)> = Vec::new();
94        for feature in features {
95            if let Some(ref label_vals) = feature.label_values {
96                if let Some(corr) = pearson_correlation(&feature.values, label_vals) {
97                    importance_rankings.push((feature.feature_name.clone(), corr.abs()));
98                }
99            }
100        }
101        importance_rankings
102            .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
103
104        // Compute pairwise correlations for simplified VIF
105        let per_feature_vif = self.compute_vif(features);
106        let multicollinear_features: Vec<String> = per_feature_vif
107            .iter()
108            .filter(|(_, vif)| *vif > self.thresholds.max_vif)
109            .map(|(name, _)| name.clone())
110            .collect();
111
112        if !multicollinear_features.is_empty() {
113            issues.push(format!(
114                "{} feature(s) have VIF > {:.1}: {}",
115                multicollinear_features.len(),
116                self.thresholds.max_vif,
117                multicollinear_features.join(", ")
118            ));
119        }
120
121        // Overall quality score: fraction of non-multicollinear features,
122        // penalized if no importance can be computed
123        let non_mc_fraction = if total_features > 0 {
124            (total_features - multicollinear_features.len()) as f64 / total_features as f64
125        } else {
126            1.0
127        };
128        let importance_available = if importance_rankings.is_empty() {
129            0.5 // partial penalty when label values are absent
130        } else {
131            1.0
132        };
133        let feature_quality_score = (non_mc_fraction * importance_available).clamp(0.0, 1.0);
134
135        if feature_quality_score < self.thresholds.min_feature_quality {
136            issues.push(format!(
137                "Feature quality score {:.4} < {:.4} (threshold)",
138                feature_quality_score, self.thresholds.min_feature_quality
139            ));
140        }
141
142        let passes = issues.is_empty();
143
144        Ok(FeatureQualityAnalysis {
145            feature_quality_score,
146            per_feature_vif,
147            multicollinear_features,
148            importance_rankings,
149            total_features,
150            passes,
151            issues,
152        })
153    }
154
155    /// Compute simplified VIF for each feature.
156    ///
157    /// VIF_i = 1 / (1 - R^2_i), where R^2_i is the maximum R^2 from
158    /// regressing feature i on any other feature (simplified as max pairwise r^2).
159    fn compute_vif(&self, features: &[FeatureVector]) -> Vec<(String, f64)> {
160        let mut vifs = Vec::new();
161
162        for (i, fi) in features.iter().enumerate() {
163            let mut max_r2 = 0.0_f64;
164
165            for (j, fj) in features.iter().enumerate() {
166                if i == j {
167                    continue;
168                }
169                if let Some(corr) = pearson_correlation(&fi.values, &fj.values) {
170                    let r2 = corr * corr;
171                    if r2 > max_r2 {
172                        max_r2 = r2;
173                    }
174                }
175            }
176
177            let vif = if (1.0 - max_r2).abs() < 1e-12 {
178                f64::MAX
179            } else {
180                1.0 / (1.0 - max_r2)
181            };
182
183            vifs.push((fi.feature_name.clone(), vif));
184        }
185
186        vifs
187    }
188}
189
/// Compute the Pearson correlation coefficient between two samples.
///
/// Only the common prefix of `x` and `y` is used when their lengths differ.
///
/// Returns `None` when the correlation is undefined or unreliable:
///
/// * fewer than three paired observations,
/// * either sample is (numerically) constant, i.e. the normalising term
///   `sqrt(var_x * var_y)` is below `1e-12`,
/// * the inputs contain NaN or infinite values, or the intermediate sums
///   overflow, so that the result would not be a finite number.
///
/// Otherwise returns `Some(r)` with `r` clamped to `[-1.0, 1.0]`, which
/// absorbs floating-point rounding that could push `|r|` just above one.
fn pearson_correlation(x: &[f64], y: &[f64]) -> Option<f64> {
    let n = x.len().min(y.len());
    if n < 3 {
        return None;
    }
    let (x, y) = (&x[..n], &y[..n]);

    let mean_x = x.iter().sum::<f64>() / n as f64;
    let mean_y = y.iter().sum::<f64>() / n as f64;

    // Accumulate the co-moment and both (unnormalised) variances in one pass.
    let (cov, var_x, var_y) = x.iter().zip(y).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(cov, var_x, var_y), (&xi, &yi)| {
            let dx = xi - mean_x;
            let dy = yi - mean_y;
            (cov + dx * dy, var_x + dx * dx, var_y + dy * dy)
        },
    );

    let denom = (var_x * var_y).sqrt();
    // A NaN or infinite denominator means the inputs were not finite or the
    // sums overflowed; a tiny one means at least one sample is constant.
    if !denom.is_finite() || denom < 1e-12 {
        return None;
    }

    let r = cov / denom;
    r.is_finite().then(|| r.clamp(-1.0, 1.0))
}
219
220impl Default for FeatureQualityAnalyzer {
221    fn default() -> Self {
222        Self::new()
223    }
224}
225
#[cfg(test)]
// Tests unwrap results directly: a failed analysis should abort the test at once.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Build a feature vector from slices to keep the fixtures compact.
    fn feature(name: &str, values: &[f64], labels: Option<&[f64]>) -> FeatureVector {
        FeatureVector {
            feature_name: name.to_string(),
            values: values.to_vec(),
            label_values: labels.map(<[f64]>::to_vec),
        }
    }

    /// Two moderately correlated (r = 0.9) labelled features.
    fn labelled_features() -> Vec<FeatureVector> {
        let labels = [0.0, 0.0, 1.0, 1.0, 1.0];
        vec![
            feature("amount", &[1.0, 2.0, 3.0, 4.0, 5.0], Some(&labels)),
            feature("count", &[10.0, 20.0, 15.0, 25.0, 30.0], Some(&labels)),
        ]
    }

    #[test]
    fn test_valid_features() {
        let analyzer = FeatureQualityAnalyzer::new();
        let result = analyzer.analyze(&labelled_features()).unwrap();

        assert_eq!(result.total_features, 2);
        assert!((result.feature_quality_score - 1.0).abs() < 1e-12);
        assert!(result.passes);
        assert!(result.issues.is_empty());
        assert!(result.multicollinear_features.is_empty());

        // r = 0.9 between the two features, so VIF = 1 / (1 - 0.81) for both.
        assert_eq!(result.per_feature_vif.len(), 2);
        for (_, vif) in &result.per_feature_vif {
            assert!((vif - 1.0 / 0.19).abs() < 1e-9);
        }

        // "amount" tracks the label more closely than "count".
        let ranked: Vec<&str> = result
            .importance_rankings
            .iter()
            .map(|(name, _)| name.as_str())
            .collect();
        assert_eq!(ranked, ["amount", "count"]);
        assert!(result.importance_rankings[0].1 > result.importance_rankings[1].1);
    }

    #[test]
    fn test_multicollinear_features() {
        // Two features that are perfectly correlated
        let features = vec![
            feature("f1", &[1.0, 2.0, 3.0, 4.0, 5.0], None),
            feature("f2", &[2.0, 4.0, 6.0, 8.0, 10.0], None),
        ];

        let analyzer = FeatureQualityAnalyzer::new();
        let result = analyzer.analyze(&features).unwrap();

        assert_eq!(result.multicollinear_features, ["f1", "f2"]);
        assert!(result.per_feature_vif.iter().all(|(_, vif)| *vif == f64::MAX));
        assert!(result.importance_rankings.is_empty());
        assert_eq!(result.feature_quality_score, 0.0);
        assert!(!result.passes);
        // One issue for the VIF violation, one for the low score.
        assert_eq!(result.issues.len(), 2);
    }

    #[test]
    fn test_custom_thresholds() {
        // VIF is about 5.26 for both features, which exceeds a limit of 5.0.
        let analyzer = FeatureQualityAnalyzer::with_thresholds(FeatureQualityThresholds {
            min_feature_quality: 0.60,
            max_vif: 5.0,
        });
        let result = analyzer.analyze(&labelled_features()).unwrap();

        assert_eq!(result.multicollinear_features, ["amount", "count"]);
        assert!(!result.passes);
    }

    #[test]
    fn test_unlabelled_features_are_penalized() {
        // Uncorrelated enough to stay below the VIF limit, but no labels.
        let features = vec![
            feature("a", &[1.0, 2.0, 3.0, 4.0, 5.0], None),
            feature("b", &[10.0, 20.0, 15.0, 25.0, 30.0], None),
        ];

        let analyzer = FeatureQualityAnalyzer::new();
        let result = analyzer.analyze(&features).unwrap();

        assert!(result.multicollinear_features.is_empty());
        assert!(result.importance_rankings.is_empty());
        assert!((result.feature_quality_score - 0.5).abs() < 1e-12);
        assert!(!result.passes);
    }

    #[test]
    fn test_empty_features() {
        let analyzer = FeatureQualityAnalyzer::new();
        let result = analyzer.analyze(&[]).unwrap();

        assert_eq!(result.total_features, 0);
        assert_eq!(result.feature_quality_score, 0.0);
        assert!(result.per_feature_vif.is_empty());
        assert!(result.importance_rankings.is_empty());
        // Empty input is reported as passing, with the reason recorded.
        assert!(result.passes);
        assert_eq!(result.issues, ["No features provided"]);
    }
}