Skip to main content

datasynth_eval/ml/
features.rs

//! Feature distribution analysis.
//!
//! Analyzes feature distributions for ML model training suitability.
4
5use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::{HashMap, HashSet};
8
/// Results of feature analysis, as returned by `FeatureAnalyzer::analyze`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureAnalysis {
    /// Per-feature statistics, one entry per analyzed feature.
    pub feature_stats: Vec<FeatureStats>,
    /// Numeric feature pairs whose absolute correlation reached the analyzer's threshold.
    pub high_correlations: Vec<CorrelationPair>,
    /// Names of numeric features whose standard deviation is exactly zero.
    pub zero_variance_features: Vec<String>,
    /// Names of features whose missing rate exceeds the analyzer's missing threshold.
    pub high_missing_features: Vec<String>,
    /// Overall feature quality score (0.0-1.0): `usable_features / total_features`,
    /// or 1.0 when no features were analyzed.
    pub quality_score: f64,
    /// Number of features whose `is_usable` flag is set.
    pub usable_features: usize,
    /// Total features analyzed (numeric + categorical + boolean).
    pub total_features: usize,
}
27
/// Statistics for a single feature.
///
/// Fields that do not apply to the feature's type are left as `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStats {
    /// Feature name.
    pub name: String,
    /// Feature type.
    pub feature_type: FeatureType,
    /// Count of non-null values.
    pub count: usize,
    /// Fraction of entries that are null, in 0.0-1.0 (0.0 for an empty column).
    pub missing_rate: f64,
    /// Mean of the non-null values (numeric); for boolean features, the fraction of `true`.
    pub mean: Option<f64>,
    /// Population standard deviation, i.e. divided by `count` (numeric only).
    pub std_dev: Option<f64>,
    /// Minimum (for numeric).
    pub min: Option<f64>,
    /// Maximum (for numeric).
    pub max: Option<f64>,
    /// Skewness: mean cubed z-score (numeric only; `None` when the deviation is zero).
    pub skewness: Option<f64>,
    /// Excess kurtosis: mean fourth-power z-score minus 3 (numeric only; `None` when the
    /// deviation is zero).
    pub kurtosis: Option<f64>,
    /// Number of unique values (set for categorical and boolean features).
    pub unique_values: Option<usize>,
    /// Whether feature is usable for ML.
    pub is_usable: bool,
    /// Human-readable descriptions of the problems found with this feature.
    pub issues: Vec<String>,
}
58
/// Type of feature.
///
/// NOTE(review): the analyzer code visible in this file only ever produces `Numeric`,
/// `Categorical` and `Boolean`; `DateTime` and `Text` are presumably used elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FeatureType {
    /// Numeric (continuous).
    Numeric,
    /// Categorical (discrete).
    Categorical,
    /// Boolean.
    Boolean,
    /// Date/Time.
    DateTime,
    /// Text (requires encoding).
    Text,
}
73
/// A pair of highly correlated numeric features.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorrelationPair {
    /// First feature name.
    pub feature1: String,
    /// Second feature name.
    pub feature2: String,
    /// Pearson correlation coefficient between the two features, computed over the rows
    /// where both values are present.
    pub correlation: f64,
}
84
/// Input for feature analysis.
///
/// Each map goes from feature name to that feature's column of values; a `None` entry
/// is counted as a missing value. Numeric columns are paired up by index when
/// correlations are computed.
#[derive(Debug, Clone, Default)]
pub struct FeatureData {
    /// Numeric features: feature_name -> values.
    pub numeric_features: HashMap<String, Vec<Option<f64>>>,
    /// Categorical features: feature_name -> values.
    pub categorical_features: HashMap<String, Vec<Option<String>>>,
    /// Boolean features: feature_name -> values.
    pub boolean_features: HashMap<String, Vec<Option<bool>>>,
}
95
/// Analyzer for feature distributions.
///
/// Holds the thresholds used when flagging features; `FeatureAnalyzer::new` sets them
/// to 0.95, 0.20 and 1000 respectively.
pub struct FeatureAnalyzer {
    /// Absolute correlation at or above which a numeric feature pair is reported.
    correlation_threshold: f64,
    /// Missing rate above which a feature is flagged as having a high missing rate.
    missing_threshold: f64,
    /// Maximum unique values for categorical features; columns with more distinct
    /// values are flagged as high-cardinality and treated as unusable.
    max_categorical_cardinality: usize,
}
105
106impl FeatureAnalyzer {
107    /// Create a new analyzer.
108    pub fn new() -> Self {
109        Self {
110            correlation_threshold: 0.95,
111            missing_threshold: 0.20,
112            max_categorical_cardinality: 1000,
113        }
114    }
115
116    /// Analyze feature distributions.
117    pub fn analyze(&self, data: &FeatureData) -> EvalResult<FeatureAnalysis> {
118        let mut feature_stats = Vec::new();
119        let mut zero_variance_features = Vec::new();
120        let mut high_missing_features = Vec::new();
121        let mut usable_features = 0;
122
123        // Analyze numeric features
124        for (name, values) in &data.numeric_features {
125            let stats = self.analyze_numeric_feature(name, values);
126            if stats.std_dev == Some(0.0) {
127                zero_variance_features.push(name.clone());
128            }
129            if stats.missing_rate > self.missing_threshold {
130                high_missing_features.push(name.clone());
131            }
132            if stats.is_usable {
133                usable_features += 1;
134            }
135            feature_stats.push(stats);
136        }
137
138        // Analyze categorical features
139        for (name, values) in &data.categorical_features {
140            let stats = self.analyze_categorical_feature(name, values);
141            if stats.missing_rate > self.missing_threshold {
142                high_missing_features.push(name.clone());
143            }
144            if stats.is_usable {
145                usable_features += 1;
146            }
147            feature_stats.push(stats);
148        }
149
150        // Analyze boolean features
151        for (name, values) in &data.boolean_features {
152            let stats = self.analyze_boolean_feature(name, values);
153            if stats.missing_rate > self.missing_threshold {
154                high_missing_features.push(name.clone());
155            }
156            if stats.is_usable {
157                usable_features += 1;
158            }
159            feature_stats.push(stats);
160        }
161
162        // Calculate correlations for numeric features
163        let high_correlations = self.find_high_correlations(&data.numeric_features);
164
165        let total_features = feature_stats.len();
166        let quality_score = if total_features > 0 {
167            usable_features as f64 / total_features as f64
168        } else {
169            1.0
170        };
171
172        Ok(FeatureAnalysis {
173            feature_stats,
174            high_correlations,
175            zero_variance_features,
176            high_missing_features,
177            quality_score,
178            usable_features,
179            total_features,
180        })
181    }
182
183    /// Analyze a numeric feature.
184    fn analyze_numeric_feature(&self, name: &str, values: &[Option<f64>]) -> FeatureStats {
185        let total = values.len();
186        let present: Vec<f64> = values.iter().filter_map(|v| *v).collect();
187        let count = present.len();
188        let missing_rate = if total > 0 {
189            (total - count) as f64 / total as f64
190        } else {
191            0.0
192        };
193
194        let mut issues = Vec::new();
195
196        if count == 0 {
197            issues.push("No non-null values".to_string());
198            return FeatureStats {
199                name: name.to_string(),
200                feature_type: FeatureType::Numeric,
201                count: 0,
202                missing_rate,
203                mean: None,
204                std_dev: None,
205                min: None,
206                max: None,
207                skewness: None,
208                kurtosis: None,
209                unique_values: None,
210                is_usable: false,
211                issues,
212            };
213        }
214
215        let mean = present.iter().sum::<f64>() / count as f64;
216        let variance: f64 = present.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count as f64;
217        let std_dev = variance.sqrt();
218
219        let min = present.iter().cloned().fold(f64::INFINITY, f64::min);
220        let max = present.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
221
222        // Skewness and kurtosis
223        let (skewness, kurtosis) = if std_dev > 0.0 {
224            let m3: f64 = present
225                .iter()
226                .map(|x| ((x - mean) / std_dev).powi(3))
227                .sum::<f64>()
228                / count as f64;
229            let m4: f64 = present
230                .iter()
231                .map(|x| ((x - mean) / std_dev).powi(4))
232                .sum::<f64>()
233                / count as f64;
234            (Some(m3), Some(m4 - 3.0)) // Excess kurtosis
235        } else {
236            (None, None)
237        };
238
239        // Check for issues
240        if std_dev == 0.0 {
241            issues.push("Zero variance".to_string());
242        }
243        if missing_rate > self.missing_threshold {
244            issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
245        }
246        if let Some(s) = skewness {
247            if s.abs() > 2.0 {
248                issues.push(format!("High skewness: {:.2}", s));
249            }
250        }
251
252        let is_usable = std_dev > 0.0 && missing_rate < 0.5;
253
254        FeatureStats {
255            name: name.to_string(),
256            feature_type: FeatureType::Numeric,
257            count,
258            missing_rate,
259            mean: Some(mean),
260            std_dev: Some(std_dev),
261            min: Some(min),
262            max: Some(max),
263            skewness,
264            kurtosis,
265            unique_values: None,
266            is_usable,
267            issues,
268        }
269    }
270
271    /// Analyze a categorical feature.
272    fn analyze_categorical_feature(&self, name: &str, values: &[Option<String>]) -> FeatureStats {
273        let total = values.len();
274        let present: Vec<&String> = values.iter().filter_map(|v| v.as_ref()).collect();
275        let count = present.len();
276        let missing_rate = if total > 0 {
277            (total - count) as f64 / total as f64
278        } else {
279            0.0
280        };
281
282        let unique: HashSet<_> = present.iter().collect();
283        let unique_count = unique.len();
284
285        let mut issues = Vec::new();
286
287        if unique_count == 0 {
288            issues.push("No non-null values".to_string());
289        } else if unique_count == 1 {
290            issues.push("Only one unique value".to_string());
291        } else if unique_count > self.max_categorical_cardinality {
292            issues.push(format!("High cardinality: {} unique values", unique_count));
293        }
294
295        if missing_rate > self.missing_threshold {
296            issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
297        }
298
299        let is_usable = unique_count > 1
300            && unique_count <= self.max_categorical_cardinality
301            && missing_rate < 0.5;
302
303        FeatureStats {
304            name: name.to_string(),
305            feature_type: FeatureType::Categorical,
306            count,
307            missing_rate,
308            mean: None,
309            std_dev: None,
310            min: None,
311            max: None,
312            skewness: None,
313            kurtosis: None,
314            unique_values: Some(unique_count),
315            is_usable,
316            issues,
317        }
318    }
319
320    /// Analyze a boolean feature.
321    fn analyze_boolean_feature(&self, name: &str, values: &[Option<bool>]) -> FeatureStats {
322        let total = values.len();
323        let present: Vec<bool> = values.iter().filter_map(|v| *v).collect();
324        let count = present.len();
325        let missing_rate = if total > 0 {
326            (total - count) as f64 / total as f64
327        } else {
328            0.0
329        };
330
331        let true_count = present.iter().filter(|v| **v).count();
332        let true_rate = if count > 0 {
333            true_count as f64 / count as f64
334        } else {
335            0.0
336        };
337
338        let mut issues = Vec::new();
339
340        if count == 0 {
341            issues.push("No non-null values".to_string());
342        } else if true_rate == 0.0 || true_rate == 1.0 {
343            issues.push("No variance (all same value)".to_string());
344        }
345
346        if missing_rate > self.missing_threshold {
347            issues.push(format!("High missing rate: {:.2}%", missing_rate * 100.0));
348        }
349
350        let is_usable = count > 0 && true_rate > 0.0 && true_rate < 1.0 && missing_rate < 0.5;
351
352        FeatureStats {
353            name: name.to_string(),
354            feature_type: FeatureType::Boolean,
355            count,
356            missing_rate,
357            mean: Some(true_rate),
358            std_dev: None,
359            min: Some(0.0),
360            max: Some(1.0),
361            skewness: None,
362            kurtosis: None,
363            unique_values: Some(2),
364            is_usable,
365            issues,
366        }
367    }
368
369    /// Find highly correlated feature pairs.
370    fn find_high_correlations(
371        &self,
372        numeric_features: &HashMap<String, Vec<Option<f64>>>,
373    ) -> Vec<CorrelationPair> {
374        let mut correlations = Vec::new();
375
376        let feature_names: Vec<_> = numeric_features.keys().collect();
377
378        for i in 0..feature_names.len() {
379            for j in (i + 1)..feature_names.len() {
380                let name1 = feature_names[i];
381                let name2 = feature_names[j];
382
383                if let (Some(vals1), Some(vals2)) =
384                    (numeric_features.get(name1), numeric_features.get(name2))
385                {
386                    if let Some(corr) = self.calculate_correlation(vals1, vals2) {
387                        if corr.abs() >= self.correlation_threshold {
388                            correlations.push(CorrelationPair {
389                                feature1: name1.clone(),
390                                feature2: name2.clone(),
391                                correlation: corr,
392                            });
393                        }
394                    }
395                }
396            }
397        }
398
399        correlations
400    }
401
402    /// Calculate Pearson correlation between two feature vectors.
403    fn calculate_correlation(&self, vals1: &[Option<f64>], vals2: &[Option<f64>]) -> Option<f64> {
404        let pairs: Vec<(f64, f64)> = vals1
405            .iter()
406            .zip(vals2.iter())
407            .filter_map(|(a, b)| match (a, b) {
408                (Some(a), Some(b)) => Some((*a, *b)),
409                _ => None,
410            })
411            .collect();
412
413        if pairs.len() < 3 {
414            return None;
415        }
416
417        let n = pairs.len() as f64;
418        let mean1: f64 = pairs.iter().map(|(a, _)| a).sum::<f64>() / n;
419        let mean2: f64 = pairs.iter().map(|(_, b)| b).sum::<f64>() / n;
420
421        let cov: f64 = pairs
422            .iter()
423            .map(|(a, b)| (a - mean1) * (b - mean2))
424            .sum::<f64>()
425            / n;
426
427        let std1 = (pairs.iter().map(|(a, _)| (a - mean1).powi(2)).sum::<f64>() / n).sqrt();
428        let std2 = (pairs.iter().map(|(_, b)| (b - mean2).powi(2)).sum::<f64>() / n).sqrt();
429
430        if std1 == 0.0 || std2 == 0.0 {
431            return None;
432        }
433
434        Some(cov / (std1 * std2))
435    }
436}
437
438impl Default for FeatureAnalyzer {
439    fn default() -> Self {
440        Self::new()
441    }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a freshly constructed analyzer over `data`, panicking if analysis fails.
    fn run_analysis(data: &FeatureData) -> FeatureAnalysis {
        FeatureAnalyzer::new().analyze(data).unwrap()
    }

    /// Builds a column with no missing entries by wrapping every value in `Some`.
    fn all_present<T: Clone>(values: &[T]) -> Vec<Option<T>> {
        values.iter().cloned().map(Some).collect()
    }

    /// A varied numeric column is summarized and counted as usable.
    #[test]
    fn test_numeric_feature() {
        let mut data = FeatureData::default();
        data.numeric_features.insert(
            "amount".to_string(),
            all_present(&[100.0, 200.0, 150.0, 175.0]),
        );

        let result = run_analysis(&data);

        assert_eq!(result.total_features, 1);
        assert_eq!(result.usable_features, 1);

        let stats = &result.feature_stats[0];
        assert!(stats.mean.is_some());
        assert!(stats.std_dev.is_some());
        assert!(stats.is_usable);
    }

    /// A constant numeric column is reported as zero-variance and unusable.
    #[test]
    fn test_zero_variance_feature() {
        let mut data = FeatureData::default();
        data.numeric_features
            .insert("constant".to_string(), all_present(&[100.0, 100.0, 100.0]));

        let result = run_analysis(&data);

        assert_eq!(result.zero_variance_features.len(), 1);
        assert!(!result.feature_stats[0].is_usable);
    }

    /// A categorical column with two distinct labels is usable.
    #[test]
    fn test_categorical_feature() {
        let labels = ["A", "B", "A"].map(|label| label.to_string());
        let mut data = FeatureData::default();
        data.categorical_features
            .insert("category".to_string(), all_present(&labels));

        let result = run_analysis(&data);

        let stats = &result.feature_stats[0];
        assert_eq!(stats.unique_values, Some(2));
        assert!(stats.is_usable);
    }
}