1use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::{HashMap, HashSet};
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct FeatureAnalysis {
12 pub feature_stats: Vec<FeatureStats>,
14 pub high_correlations: Vec<CorrelationPair>,
16 pub zero_variance_features: Vec<String>,
18 pub high_missing_features: Vec<String>,
20 pub quality_score: f64,
22 pub usable_features: usize,
24 pub total_features: usize,
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct FeatureStats {
31 pub name: String,
33 pub feature_type: FeatureType,
35 pub count: usize,
37 pub missing_rate: f64,
39 pub mean: Option<f64>,
41 pub std_dev: Option<f64>,
43 pub min: Option<f64>,
45 pub max: Option<f64>,
47 pub skewness: Option<f64>,
49 pub kurtosis: Option<f64>,
51 pub unique_values: Option<usize>,
53 pub is_usable: bool,
55 pub issues: Vec<String>,
57}
58
59#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
61pub enum FeatureType {
62 Numeric,
64 Categorical,
66 Boolean,
68 DateTime,
70 Text,
72}
73
74#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct CorrelationPair {
77 pub feature1: String,
79 pub feature2: String,
81 pub correlation: f64,
83}
84
/// Column-oriented input for feature analysis, keyed by feature name.
///
/// Each column is a vector of optional values; `None` marks a missing value.
#[derive(Debug, Clone, Default)]
pub struct FeatureData {
    /// Numeric columns.
    pub numeric_features: HashMap<String, Vec<Option<f64>>>,
    /// Categorical (string-labelled) columns.
    pub categorical_features: HashMap<String, Vec<Option<String>>>,
    /// Boolean columns.
    pub boolean_features: HashMap<String, Vec<Option<bool>>>,
}
95
/// Analyzer that computes per-feature statistics and flags quality problems.
#[derive(Debug, Clone)]
pub struct FeatureAnalyzer {
    /// Minimum absolute correlation at which a pair of numeric features is reported.
    correlation_threshold: f64,
    /// Missing rate above which a feature is flagged as having too many missing values.
    missing_threshold: f64,
    /// Largest number of distinct values a categorical feature may have before it is
    /// flagged as high-cardinality.
    max_categorical_cardinality: usize,
}
105
106impl FeatureAnalyzer {
107 pub fn new() -> Self {
109 Self {
110 correlation_threshold: 0.95,
111 missing_threshold: 0.20,
112 max_categorical_cardinality: 1000,
113 }
114 }
115
116 pub fn analyze(&self, data: &FeatureData) -> EvalResult<FeatureAnalysis> {
118 let mut feature_stats = Vec::new();
119 let mut zero_variance_features = Vec::new();
120 let mut high_missing_features = Vec::new();
121 let mut usable_features = 0;
122
123 for (name, values) in &data.numeric_features {
125 let stats = self.analyze_numeric_feature(name, values);
126 if stats.std_dev == Some(0.0) {
127 zero_variance_features.push(name.clone());
128 }
129 if stats.missing_rate > self.missing_threshold {
130 high_missing_features.push(name.clone());
131 }
132 if stats.is_usable {
133 usable_features += 1;
134 }
135 feature_stats.push(stats);
136 }
137
138 for (name, values) in &data.categorical_features {
140 let stats = self.analyze_categorical_feature(name, values);
141 if stats.missing_rate > self.missing_threshold {
142 high_missing_features.push(name.clone());
143 }
144 if stats.is_usable {
145 usable_features += 1;
146 }
147 feature_stats.push(stats);
148 }
149
150 for (name, values) in &data.boolean_features {
152 let stats = self.analyze_boolean_feature(name, values);
153 if stats.missing_rate > self.missing_threshold {
154 high_missing_features.push(name.clone());
155 }
156 if stats.is_usable {
157 usable_features += 1;
158 }
159 feature_stats.push(stats);
160 }
161
162 let high_correlations = self.find_high_correlations(&data.numeric_features);
164
165 let total_features = feature_stats.len();
166 let quality_score = if total_features > 0 {
167 usable_features as f64 / total_features as f64
168 } else {
169 1.0
170 };
171
172 Ok(FeatureAnalysis {
173 feature_stats,
174 high_correlations,
175 zero_variance_features,
176 high_missing_features,
177 quality_score,
178 usable_features,
179 total_features,
180 })
181 }
182
183 fn analyze_numeric_feature(&self, name: &str, values: &[Option<f64>]) -> FeatureStats {
185 let total = values.len();
186 let present: Vec<f64> = values.iter().filter_map(|v| *v).collect();
187 let count = present.len();
188 let missing_rate = if total > 0 {
189 (total - count) as f64 / total as f64
190 } else {
191 0.0
192 };
193
194 let mut issues = Vec::new();
195
196 if count == 0 {
197 issues.push("No non-null values".to_string());
198 return FeatureStats {
199 name: name.to_string(),
200 feature_type: FeatureType::Numeric,
201 count: 0,
202 missing_rate,
203 mean: None,
204 std_dev: None,
205 min: None,
206 max: None,
207 skewness: None,
208 kurtosis: None,
209 unique_values: None,
210 is_usable: false,
211 issues,
212 };
213 }
214
215 let mean = present.iter().sum::<f64>() / count as f64;
216 let variance: f64 = present.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count as f64;
217 let std_dev = variance.sqrt();
218
219 let min = present.iter().cloned().fold(f64::INFINITY, f64::min);
220 let max = present.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
221
222 let (skewness, kurtosis) = if std_dev > 0.0 {
224 let m3: f64 = present
225 .iter()
226 .map(|x| ((x - mean) / std_dev).powi(3))
227 .sum::<f64>()
228 / count as f64;
229 let m4: f64 = present
230 .iter()
231 .map(|x| ((x - mean) / std_dev).powi(4))
232 .sum::<f64>()
233 / count as f64;
234 (Some(m3), Some(m4 - 3.0)) } else {
236 (None, None)
237 };
238
239 if std_dev == 0.0 {
241 issues.push("Zero variance".to_string());
242 }
243 if missing_rate > self.missing_threshold {
244 issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
245 }
246 if let Some(s) = skewness {
247 if s.abs() > 2.0 {
248 issues.push(format!("High skewness: {:.2}", s));
249 }
250 }
251
252 let is_usable = std_dev > 0.0 && missing_rate < 0.5;
253
254 FeatureStats {
255 name: name.to_string(),
256 feature_type: FeatureType::Numeric,
257 count,
258 missing_rate,
259 mean: Some(mean),
260 std_dev: Some(std_dev),
261 min: Some(min),
262 max: Some(max),
263 skewness,
264 kurtosis,
265 unique_values: None,
266 is_usable,
267 issues,
268 }
269 }
270
271 fn analyze_categorical_feature(&self, name: &str, values: &[Option<String>]) -> FeatureStats {
273 let total = values.len();
274 let present: Vec<&String> = values.iter().filter_map(|v| v.as_ref()).collect();
275 let count = present.len();
276 let missing_rate = if total > 0 {
277 (total - count) as f64 / total as f64
278 } else {
279 0.0
280 };
281
282 let unique: HashSet<_> = present.iter().collect();
283 let unique_count = unique.len();
284
285 let mut issues = Vec::new();
286
287 if unique_count == 0 {
288 issues.push("No non-null values".to_string());
289 } else if unique_count == 1 {
290 issues.push("Only one unique value".to_string());
291 } else if unique_count > self.max_categorical_cardinality {
292 issues.push(format!("High cardinality: {} unique values", unique_count));
293 }
294
295 if missing_rate > self.missing_threshold {
296 issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
297 }
298
299 let is_usable = unique_count > 1
300 && unique_count <= self.max_categorical_cardinality
301 && missing_rate < 0.5;
302
303 FeatureStats {
304 name: name.to_string(),
305 feature_type: FeatureType::Categorical,
306 count,
307 missing_rate,
308 mean: None,
309 std_dev: None,
310 min: None,
311 max: None,
312 skewness: None,
313 kurtosis: None,
314 unique_values: Some(unique_count),
315 is_usable,
316 issues,
317 }
318 }
319
320 fn analyze_boolean_feature(&self, name: &str, values: &[Option<bool>]) -> FeatureStats {
322 let total = values.len();
323 let present: Vec<bool> = values.iter().filter_map(|v| *v).collect();
324 let count = present.len();
325 let missing_rate = if total > 0 {
326 (total - count) as f64 / total as f64
327 } else {
328 0.0
329 };
330
331 let true_count = present.iter().filter(|v| **v).count();
332 let true_rate = if count > 0 {
333 true_count as f64 / count as f64
334 } else {
335 0.0
336 };
337
338 let mut issues = Vec::new();
339
340 if count == 0 {
341 issues.push("No non-null values".to_string());
342 } else if true_rate == 0.0 || true_rate == 1.0 {
343 issues.push("No variance (all same value)".to_string());
344 }
345
346 if missing_rate > self.missing_threshold {
347 issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
348 }
349
350 let is_usable = count > 0 && true_rate > 0.0 && true_rate < 1.0 && missing_rate < 0.5;
351
352 FeatureStats {
353 name: name.to_string(),
354 feature_type: FeatureType::Boolean,
355 count,
356 missing_rate,
357 mean: Some(true_rate),
358 std_dev: None,
359 min: Some(0.0),
360 max: Some(1.0),
361 skewness: None,
362 kurtosis: None,
363 unique_values: Some(2),
364 is_usable,
365 issues,
366 }
367 }
368
369 fn find_high_correlations(
371 &self,
372 numeric_features: &HashMap<String, Vec<Option<f64>>>,
373 ) -> Vec<CorrelationPair> {
374 let mut correlations = Vec::new();
375
376 let feature_names: Vec<_> = numeric_features.keys().collect();
377
378 for i in 0..feature_names.len() {
379 for j in (i + 1)..feature_names.len() {
380 let name1 = feature_names[i];
381 let name2 = feature_names[j];
382
383 if let (Some(vals1), Some(vals2)) =
384 (numeric_features.get(name1), numeric_features.get(name2))
385 {
386 if let Some(corr) = self.calculate_correlation(vals1, vals2) {
387 if corr.abs() >= self.correlation_threshold {
388 correlations.push(CorrelationPair {
389 feature1: name1.clone(),
390 feature2: name2.clone(),
391 correlation: corr,
392 });
393 }
394 }
395 }
396 }
397 }
398
399 correlations
400 }
401
402 fn calculate_correlation(&self, vals1: &[Option<f64>], vals2: &[Option<f64>]) -> Option<f64> {
404 let pairs: Vec<(f64, f64)> = vals1
405 .iter()
406 .zip(vals2.iter())
407 .filter_map(|(a, b)| match (a, b) {
408 (Some(a), Some(b)) => Some((*a, *b)),
409 _ => None,
410 })
411 .collect();
412
413 if pairs.len() < 3 {
414 return None;
415 }
416
417 let n = pairs.len() as f64;
418 let mean1: f64 = pairs.iter().map(|(a, _)| a).sum::<f64>() / n;
419 let mean2: f64 = pairs.iter().map(|(_, b)| b).sum::<f64>() / n;
420
421 let cov: f64 = pairs
422 .iter()
423 .map(|(a, b)| (a - mean1) * (b - mean2))
424 .sum::<f64>()
425 / n;
426
427 let std1 = (pairs.iter().map(|(a, _)| (a - mean1).powi(2)).sum::<f64>() / n).sqrt();
428 let std2 = (pairs.iter().map(|(_, b)| (b - mean2).powi(2)).sum::<f64>() / n).sqrt();
429
430 if std1 == 0.0 || std2 == 0.0 {
431 return None;
432 }
433
434 Some(cov / (std1 * std2))
435 }
436}
437
438impl Default for FeatureAnalyzer {
439 fn default() -> Self {
440 Self::new()
441 }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a default analyzer over `data`; analysis is infallible in practice.
    fn analyze(data: &FeatureData) -> FeatureAnalysis {
        FeatureAnalyzer::new()
            .analyze(data)
            .expect("feature analysis does not fail")
    }

    #[test]
    fn test_numeric_feature() {
        let mut data = FeatureData::default();
        data.numeric_features.insert(
            "amount".to_string(),
            vec![Some(100.0), Some(200.0), Some(150.0), Some(175.0)],
        );

        let result = analyze(&data);

        assert_eq!(result.total_features, 1);
        assert_eq!(result.usable_features, 1);

        let stats = &result.feature_stats[0];
        assert!(stats.mean.is_some());
        assert!(stats.std_dev.is_some());
        assert!(stats.is_usable);
    }

    #[test]
    fn test_zero_variance_feature() {
        let mut data = FeatureData::default();
        data.numeric_features.insert(
            "constant".to_string(),
            vec![Some(100.0), Some(100.0), Some(100.0)],
        );

        let result = analyze(&data);

        assert_eq!(result.zero_variance_features.len(), 1);
        assert!(!result.feature_stats[0].is_usable);
    }

    #[test]
    fn test_categorical_feature() {
        let mut data = FeatureData::default();
        data.categorical_features.insert(
            "category".to_string(),
            vec![
                Some("A".to_string()),
                Some("B".to_string()),
                Some("A".to_string()),
            ],
        );

        let result = analyze(&data);

        let stats = &result.feature_stats[0];
        assert_eq!(stats.unique_values, Some(2));
        assert!(stats.is_usable);
    }

    #[test]
    fn test_boolean_feature() {
        let mut data = FeatureData::default();
        data.boolean_features.insert(
            "flag".to_string(),
            vec![Some(true), Some(false), Some(true), Some(false)],
        );

        let result = analyze(&data);

        let stats = &result.feature_stats[0];
        assert_eq!(stats.feature_type, FeatureType::Boolean);
        assert_eq!(stats.mean, Some(0.5));
        assert!(stats.is_usable);
    }

    #[test]
    fn test_high_missing_feature() {
        let mut data = FeatureData::default();
        data.numeric_features.insert(
            "sparse".to_string(),
            vec![Some(1.0), None, Some(3.0), None, None],
        );

        let result = analyze(&data);

        assert_eq!(result.high_missing_features, vec!["sparse".to_string()]);
        assert!(!result.feature_stats[0].is_usable);
        assert_eq!(result.usable_features, 0);
    }

    #[test]
    fn test_high_correlation_detected() {
        let mut data = FeatureData::default();
        data.numeric_features.insert(
            "x".to_string(),
            vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)],
        );
        data.numeric_features.insert(
            "y".to_string(),
            vec![Some(2.0), Some(4.0), Some(6.0), Some(8.0)],
        );

        let result = analyze(&data);

        assert_eq!(result.high_correlations.len(), 1);
        assert!((result.high_correlations[0].correlation - 1.0).abs() < 1e-9);
    }

    #[test]
    fn test_empty_data() {
        let result = analyze(&FeatureData::default());

        assert_eq!(result.total_features, 0);
        assert_eq!(result.usable_features, 0);
        assert_eq!(result.quality_score, 1.0);
        assert!(result.high_correlations.is_empty());
    }
}