1use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8
/// A single named feature column, optionally paired with the labels it should predict.
#[derive(Debug, Clone)]
pub struct FeatureVector {
    /// Name used to identify the feature in VIF results, rankings and issue messages.
    pub feature_name: String,
    /// Numeric observations of the feature. They are paired by index with `label_values`
    /// and with other features' `values` when correlations are computed.
    pub values: Vec<f64>,
    /// Optional label values aligned by index with `values`. When present, the absolute
    /// Pearson correlation between `values` and these labels is used as the feature's
    /// importance; when `None`, the feature is left out of the importance ranking.
    pub label_values: Option<Vec<f64>>,
}
19
/// Pass/fail limits applied by the feature quality analysis.
#[derive(Debug, Clone)]
pub struct FeatureQualityThresholds {
    /// Lowest acceptable feature quality score; a score below this value is reported as an issue.
    pub min_feature_quality: f64,
    /// Highest acceptable variance inflation factor; a feature whose VIF is strictly greater
    /// than this value is reported as multicollinear.
    pub max_vif: f64,
}

impl Default for FeatureQualityThresholds {
    /// Returns the stock limits: a minimum quality score of `0.60` and a maximum VIF of `10.0`.
    fn default() -> Self {
        let min_feature_quality = 0.60;
        let max_vif = 10.0;

        Self {
            min_feature_quality,
            max_vif,
        }
    }
}
37
/// Outcome of a feature quality analysis run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureQualityAnalysis {
    /// Overall score in `[0.0, 1.0]`: the fraction of features not flagged as multicollinear,
    /// halved when no importance ranking could be produced. `0.0` for an empty feature set.
    pub feature_quality_score: f64,
    /// `(feature name, VIF)` for every analysed feature, in input order.
    pub per_feature_vif: Vec<(String, f64)>,
    /// Names of the features whose VIF exceeded the configured `max_vif`.
    pub multicollinear_features: Vec<String>,
    /// `(feature name, |correlation with label|)` sorted from most to least important. Only
    /// features that carry label values and yield a computable correlation appear here.
    pub importance_rankings: Vec<(String, f64)>,
    /// Number of features that were passed to the analysis.
    pub total_features: usize,
    /// `true` when the analysis recorded no issues. An empty feature set is also reported as
    /// passing, even though `issues` then carries a "No features provided" note.
    pub passes: bool,
    /// Human-readable descriptions of every problem (or note) found during the analysis.
    pub issues: Vec<String>,
}
56
/// Analyzer that scores a set of features for multicollinearity and label relevance.
pub struct FeatureQualityAnalyzer {
    /// Limits used to decide which features are multicollinear and whether the score passes.
    thresholds: FeatureQualityThresholds,
}
61
62impl FeatureQualityAnalyzer {
63 pub fn new() -> Self {
65 Self {
66 thresholds: FeatureQualityThresholds::default(),
67 }
68 }
69
70 pub fn with_thresholds(thresholds: FeatureQualityThresholds) -> Self {
72 Self { thresholds }
73 }
74
75 pub fn analyze(&self, features: &[FeatureVector]) -> EvalResult<FeatureQualityAnalysis> {
77 let mut issues = Vec::new();
78 let total_features = features.len();
79
80 if features.is_empty() {
81 return Ok(FeatureQualityAnalysis {
82 feature_quality_score: 0.0,
83 per_feature_vif: Vec::new(),
84 multicollinear_features: Vec::new(),
85 importance_rankings: Vec::new(),
86 total_features: 0,
87 passes: true,
88 issues: vec!["No features provided".to_string()],
89 });
90 }
91
92 let mut importance_rankings: Vec<(String, f64)> = Vec::new();
94 for feature in features {
95 if let Some(ref label_vals) = feature.label_values {
96 if let Some(corr) = pearson_correlation(&feature.values, label_vals) {
97 importance_rankings.push((feature.feature_name.clone(), corr.abs()));
98 }
99 }
100 }
101 importance_rankings
102 .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
103
104 let per_feature_vif = self.compute_vif(features);
106 let multicollinear_features: Vec<String> = per_feature_vif
107 .iter()
108 .filter(|(_, vif)| *vif > self.thresholds.max_vif)
109 .map(|(name, _)| name.clone())
110 .collect();
111
112 if !multicollinear_features.is_empty() {
113 issues.push(format!(
114 "{} feature(s) have VIF > {:.1}: {}",
115 multicollinear_features.len(),
116 self.thresholds.max_vif,
117 multicollinear_features.join(", ")
118 ));
119 }
120
121 let non_mc_fraction = if total_features > 0 {
124 (total_features - multicollinear_features.len()) as f64 / total_features as f64
125 } else {
126 1.0
127 };
128 let importance_available = if importance_rankings.is_empty() {
129 0.5 } else {
131 1.0
132 };
133 let feature_quality_score = (non_mc_fraction * importance_available).clamp(0.0, 1.0);
134
135 if feature_quality_score < self.thresholds.min_feature_quality {
136 issues.push(format!(
137 "Feature quality score {:.4} < {:.4} (threshold)",
138 feature_quality_score, self.thresholds.min_feature_quality
139 ));
140 }
141
142 let passes = issues.is_empty();
143
144 Ok(FeatureQualityAnalysis {
145 feature_quality_score,
146 per_feature_vif,
147 multicollinear_features,
148 importance_rankings,
149 total_features,
150 passes,
151 issues,
152 })
153 }
154
155 fn compute_vif(&self, features: &[FeatureVector]) -> Vec<(String, f64)> {
160 let mut vifs = Vec::new();
161
162 for (i, fi) in features.iter().enumerate() {
163 let mut max_r2 = 0.0_f64;
164
165 for (j, fj) in features.iter().enumerate() {
166 if i == j {
167 continue;
168 }
169 if let Some(corr) = pearson_correlation(&fi.values, &fj.values) {
170 let r2 = corr * corr;
171 if r2 > max_r2 {
172 max_r2 = r2;
173 }
174 }
175 }
176
177 let vif = if (1.0 - max_r2).abs() < 1e-12 {
178 f64::MAX
179 } else {
180 1.0 / (1.0 - max_r2)
181 };
182
183 vifs.push((fi.feature_name.clone(), vif));
184 }
185
186 vifs
187 }
188}
189
/// Computes the Pearson correlation coefficient between two samples.
///
/// Only the first `min(x.len(), y.len())` elements of each slice are paired; surplus elements
/// of the longer slice are ignored.
///
/// Returns `None` when:
/// - fewer than three paired observations are available,
/// - the spread is numerically zero (`sqrt(var_x * var_y) < 1e-12`), e.g. a constant sample, or
/// - NaN/infinite inputs (or overflow) leave no finite coefficient to report.
///
/// Otherwise returns `Some(r)` with `r` clamped to `[-1.0, 1.0]`.
fn pearson_correlation(x: &[f64], y: &[f64]) -> Option<f64> {
    let n = x.len().min(y.len());
    if n < 3 {
        return None;
    }
    let (x, y) = (&x[..n], &y[..n]);

    let mean_x = x.iter().sum::<f64>() / n as f64;
    let mean_y = y.iter().sum::<f64>() / n as f64;

    // Accumulate the co-moment and both sums of squared deviations in a single pass.
    let (cov, var_x, var_y) = x.iter().zip(y).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(cov, var_x, var_y), (&xi, &yi)| {
            let dx = xi - mean_x;
            let dy = yi - mean_y;
            (cov + dx * dy, var_x + dx * dx, var_y + dy * dy)
        },
    );

    let denom = (var_x * var_y).sqrt();
    // NaN fails every ordered comparison, so it has to be rejected explicitly; otherwise a
    // NaN input would slip past the threshold test and be returned as `Some(NaN)`.
    if !denom.is_finite() || denom < 1e-12 {
        return None;
    }

    let r = cov / denom;
    // Rounding can push |r| marginally above 1; clamp to the mathematically valid range.
    r.is_finite().then(|| r.clamp(-1.0, 1.0))
}
219
220impl Default for FeatureQualityAnalyzer {
221 fn default() -> Self {
222 Self::new()
223 }
224}
225
#[cfg(test)]
// Unwrapping is fine in tests: a failed unwrap is itself a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a `FeatureVector` from borrowed fixture data.
    fn feature(name: &str, values: &[f64], labels: Option<&[f64]>) -> FeatureVector {
        FeatureVector {
            feature_name: name.to_string(),
            values: values.to_vec(),
            label_values: labels.map(|l| l.to_vec()),
        }
    }

    /// Two moderately correlated, labelled features should pass with a positive score.
    #[test]
    fn test_valid_features() {
        const LABELS: &[f64] = &[0.0, 0.0, 1.0, 1.0, 1.0];
        let features = [
            feature("amount", &[1.0, 2.0, 3.0, 4.0, 5.0], Some(LABELS)),
            feature("count", &[10.0, 20.0, 15.0, 25.0, 30.0], Some(LABELS)),
        ];

        let result = FeatureQualityAnalyzer::new().analyze(&features).unwrap();

        assert_eq!(result.total_features, 2);
        assert!(result.feature_quality_score > 0.0);
        assert!(result.passes);
    }

    /// `f2` is an exact multiple of `f1`, so multicollinearity must be reported.
    #[test]
    fn test_multicollinear_features() {
        let features = [
            feature("f1", &[1.0, 2.0, 3.0, 4.0, 5.0], None),
            feature("f2", &[2.0, 4.0, 6.0, 8.0, 10.0], None),
        ];

        let result = FeatureQualityAnalyzer::new().analyze(&features).unwrap();

        assert!(!result.multicollinear_features.is_empty());
    }

    /// An empty feature set reports zero features and a zero score.
    #[test]
    fn test_empty_features() {
        let result = FeatureQualityAnalyzer::new().analyze(&[]).unwrap();

        assert_eq!(result.total_features, 0);
        assert_eq!(result.feature_quality_score, 0.0);
    }
}