ghostflow_ml/
feature_selection.rs

1//! Feature Selection - Variance Threshold, SelectKBest, RFE, SelectFromModel
2
3use ghostflow_core::Tensor;
4
5/// Variance Threshold - remove low-variance features
/// Variance Threshold - remove low-variance features
///
/// Keeps only the features whose (population) variance is strictly greater
/// than `threshold`; mirrors scikit-learn's `VarianceThreshold`.
pub struct VarianceThreshold {
    /// Features with variance <= this value are removed.
    pub threshold: f32,
    // Per-feature variances computed by `fit` (None until fitted).
    variances_: Option<Vec<f32>>,
    // Keep-mask over features (true = retained), set by `fit`.
    mask_: Option<Vec<bool>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
12
13impl VarianceThreshold {
14    pub fn new(threshold: f32) -> Self {
15        VarianceThreshold {
16            threshold,
17            variances_: None,
18            mask_: None,
19            n_features_in_: 0,
20        }
21    }
22
23    pub fn fit(&mut self, x: &Tensor) {
24        let x_data = x.data_f32();
25        let n_samples = x.dims()[0];
26        let n_features = x.dims()[1];
27
28        self.n_features_in_ = n_features;
29
30        // Compute variance for each feature
31        let variances: Vec<f32> = (0..n_features)
32            .map(|j| {
33                let mean: f32 = (0..n_samples)
34                    .map(|i| x_data[i * n_features + j])
35                    .sum::<f32>() / n_samples as f32;
36                
37                (0..n_samples)
38                    .map(|i| (x_data[i * n_features + j] - mean).powi(2))
39                    .sum::<f32>() / n_samples as f32
40            })
41            .collect();
42
43        let mask: Vec<bool> = variances.iter().map(|&v| v > self.threshold).collect();
44
45        self.variances_ = Some(variances);
46        self.mask_ = Some(mask);
47    }
48
49    pub fn transform(&self, x: &Tensor) -> Tensor {
50        let x_data = x.data_f32();
51        let n_samples = x.dims()[0];
52        let n_features = x.dims()[1];
53
54        let mask = self.mask_.as_ref().expect("Model not fitted");
55        let n_selected: usize = mask.iter().filter(|&&m| m).count();
56
57        let mut result = Vec::with_capacity(n_samples * n_selected);
58
59        for i in 0..n_samples {
60            for (j, &keep) in mask.iter().enumerate() {
61                if keep {
62                    result.push(x_data[i * n_features + j]);
63                }
64            }
65        }
66
67        Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
68    }
69
70    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
71        self.fit(x);
72        self.transform(x)
73    }
74
75    pub fn get_support(&self) -> Vec<bool> {
76        self.mask_.clone().unwrap_or_default()
77    }
78}
79
80/// SelectKBest - select k features with highest scores
/// SelectKBest - select k features with highest scores
///
/// Scores every feature against the target with `score_func` and keeps the
/// `k` best-scoring ones.
pub struct SelectKBest {
    /// Number of features to keep (clamped to the feature count at fit time).
    pub k: usize,
    /// Scoring function used to rank features.
    pub score_func: ScoreFunction,
    // Per-feature scores computed by `fit` (None until fitted).
    scores_: Option<Vec<f32>>,
    // Reserved for p-values; not populated by the current implementation.
    #[allow(dead_code)]
    pvalues_: Option<Vec<f32>>,
    // Keep-mask over features (true = retained), set by `fit`.
    mask_: Option<Vec<bool>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
90
/// Scoring function used by `SelectKBest` to rank features.
#[derive(Clone, Copy, Debug)]
pub enum ScoreFunction {
    /// ANOVA F-value between feature and class label (classification).
    FClassif,
    /// Mutual information between binned feature and class label.
    MutualInfoClassif,
    /// Chi-squared test on binarized (> 0) feature presence vs class.
    Chi2,
    /// F-value from univariate linear regression (regression).
    FRegression,
    /// Mutual information between binned feature and binned target.
    MutualInfoRegression,
}
99
100impl SelectKBest {
101    pub fn new(k: usize, score_func: ScoreFunction) -> Self {
102        SelectKBest {
103            k,
104            score_func,
105            scores_: None,
106            pvalues_: None,
107            mask_: None,
108            n_features_in_: 0,
109        }
110    }
111
112    fn f_classif(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
113        let n_classes = y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
114
115        (0..n_features)
116            .map(|j| {
117                // Compute class means and overall mean
118                let mut class_sums = vec![0.0f32; n_classes];
119                let mut class_counts = vec![0usize; n_classes];
120                let mut total_sum = 0.0f32;
121
122                for i in 0..n_samples {
123                    let class = y[i] as usize;
124                    let val = x[i * n_features + j];
125                    class_sums[class] += val;
126                    class_counts[class] += 1;
127                    total_sum += val;
128                }
129
130                let overall_mean = total_sum / n_samples as f32;
131                let class_means: Vec<f32> = class_sums.iter()
132                    .zip(class_counts.iter())
133                    .map(|(&sum, &count)| if count > 0 { sum / count as f32 } else { 0.0 })
134                    .collect();
135
136                // Between-group variance
137                let ss_between: f32 = class_means.iter()
138                    .zip(class_counts.iter())
139                    .map(|(&mean, &count)| count as f32 * (mean - overall_mean).powi(2))
140                    .sum();
141
142                // Within-group variance
143                let mut ss_within = 0.0f32;
144                for i in 0..n_samples {
145                    let class = y[i] as usize;
146                    let val = x[i * n_features + j];
147                    ss_within += (val - class_means[class]).powi(2);
148                }
149
150                // F-statistic
151                let df_between = (n_classes - 1) as f32;
152                let df_within = (n_samples - n_classes) as f32;
153
154                if ss_within > 1e-10 && df_within > 0.0 {
155                    (ss_between / df_between) / (ss_within / df_within)
156                } else {
157                    0.0
158                }
159            })
160            .collect()
161    }
162
163    fn chi2(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
164        let n_classes = y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
165
166        (0..n_features)
167            .map(|j| {
168                // Compute observed and expected frequencies
169                let mut observed = vec![vec![0.0f32; 2]; n_classes];  // Binary: feature present/absent
170                let mut class_totals = vec![0.0f32; n_classes];
171                let mut feature_totals = [0.0f32; 2];
172
173                for i in 0..n_samples {
174                    let class = y[i] as usize;
175                    let val = if x[i * n_features + j] > 0.0 { 1 } else { 0 };
176                    observed[class][val] += 1.0;
177                    class_totals[class] += 1.0;
178                    feature_totals[val] += 1.0;
179                }
180
181                let total = n_samples as f32;
182                let mut chi2 = 0.0f32;
183
184                for c in 0..n_classes {
185                    for f in 0..2 {
186                        let expected = class_totals[c] * feature_totals[f] / total;
187                        if expected > 0.0 {
188                            chi2 += (observed[c][f] - expected).powi(2) / expected;
189                        }
190                    }
191                }
192
193                chi2
194            })
195            .collect()
196    }
197
198    fn f_regression(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
199        let y_mean: f32 = y.iter().sum::<f32>() / n_samples as f32;
200        let ss_tot: f32 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
201
202        (0..n_features)
203            .map(|j| {
204                // Simple linear regression for each feature
205                let x_mean: f32 = (0..n_samples)
206                    .map(|i| x[i * n_features + j])
207                    .sum::<f32>() / n_samples as f32;
208
209                let mut cov = 0.0f32;
210                let mut var_x = 0.0f32;
211
212                for i in 0..n_samples {
213                    let xi = x[i * n_features + j] - x_mean;
214                    let yi = y[i] - y_mean;
215                    cov += xi * yi;
216                    var_x += xi * xi;
217                }
218
219                if var_x < 1e-10 {
220                    return 0.0;
221                }
222
223                let slope = cov / var_x;
224                let intercept = y_mean - slope * x_mean;
225
226                // Compute R² and convert to F-statistic
227                let ss_res: f32 = (0..n_samples)
228                    .map(|i| {
229                        let pred = slope * x[i * n_features + j] + intercept;
230                        (y[i] - pred).powi(2)
231                    })
232                    .sum();
233
234                let r2 = 1.0 - ss_res / ss_tot.max(1e-10);
235                let df1 = 1.0f32;
236                let df2 = (n_samples - 2) as f32;
237
238                if r2 < 1.0 && df2 > 0.0 {
239                    (r2 / df1) / ((1.0 - r2) / df2)
240                } else {
241                    0.0
242                }
243            })
244            .collect()
245    }
246
247    fn mutual_info(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize, is_classification: bool) -> Vec<f32> {
248        // Simplified mutual information using binning
249        let n_bins = 10;
250
251        (0..n_features)
252            .map(|j| {
253                let feature_vals: Vec<f32> = (0..n_samples)
254                    .map(|i| x[i * n_features + j])
255                    .collect();
256
257                let min_val = feature_vals.iter().cloned().fold(f32::INFINITY, f32::min);
258                let max_val = feature_vals.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
259                let range = (max_val - min_val).max(1e-10);
260
261                // Bin the feature
262                let x_binned: Vec<usize> = feature_vals.iter()
263                    .map(|&v| (((v - min_val) / range * (n_bins - 1) as f32) as usize).min(n_bins - 1))
264                    .collect();
265
266                // Bin y for regression
267                let y_binned: Vec<usize> = if is_classification {
268                    y.iter().map(|&v| v as usize).collect()
269                } else {
270                    let y_min = y.iter().cloned().fold(f32::INFINITY, f32::min);
271                    let y_max = y.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
272                    let y_range = (y_max - y_min).max(1e-10);
273                    y.iter()
274                        .map(|&v| (((v - y_min) / y_range * (n_bins - 1) as f32) as usize).min(n_bins - 1))
275                        .collect()
276                };
277
278                let n_y_bins = if is_classification {
279                    y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1
280                } else {
281                    n_bins
282                };
283
284                // Compute joint and marginal probabilities
285                let mut joint = vec![vec![0.0f32; n_y_bins]; n_bins];
286                let mut p_x = vec![0.0f32; n_bins];
287                let mut p_y = vec![0.0f32; n_y_bins];
288
289                for i in 0..n_samples {
290                    joint[x_binned[i]][y_binned[i]] += 1.0;
291                    p_x[x_binned[i]] += 1.0;
292                    p_y[y_binned[i]] += 1.0;
293                }
294
295                let n = n_samples as f32;
296                for p in &mut p_x { *p /= n; }
297                for p in &mut p_y { *p /= n; }
298                for row in &mut joint {
299                    for p in row { *p /= n; }
300                }
301
302                // Compute mutual information
303                let mut mi = 0.0f32;
304                for xi in 0..n_bins {
305                    for yi in 0..n_y_bins {
306                        if joint[xi][yi] > 1e-10 && p_x[xi] > 1e-10 && p_y[yi] > 1e-10 {
307                            mi += joint[xi][yi] * (joint[xi][yi] / (p_x[xi] * p_y[yi])).ln();
308                        }
309                    }
310                }
311
312                mi.max(0.0)
313            })
314            .collect()
315    }
316
317    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
318        let x_data = x.data_f32();
319        let y_data = y.data_f32();
320        let n_samples = x.dims()[0];
321        let n_features = x.dims()[1];
322
323        self.n_features_in_ = n_features;
324
325        let scores = match self.score_func {
326            ScoreFunction::FClassif => self.f_classif(&x_data, &y_data, n_samples, n_features),
327            ScoreFunction::Chi2 => self.chi2(&x_data, &y_data, n_samples, n_features),
328            ScoreFunction::FRegression => self.f_regression(&x_data, &y_data, n_samples, n_features),
329            ScoreFunction::MutualInfoClassif => self.mutual_info(&x_data, &y_data, n_samples, n_features, true),
330            ScoreFunction::MutualInfoRegression => self.mutual_info(&x_data, &y_data, n_samples, n_features, false),
331        };
332
333        // Select top k features
334        let mut indexed_scores: Vec<(usize, f32)> = scores.iter().cloned().enumerate().collect();
335        indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
336
337        let k = self.k.min(n_features);
338        let selected_indices: Vec<usize> = indexed_scores.iter().take(k).map(|(i, _)| *i).collect();
339
340        let mut mask = vec![false; n_features];
341        for &idx in &selected_indices {
342            mask[idx] = true;
343        }
344
345        self.scores_ = Some(scores);
346        self.mask_ = Some(mask);
347    }
348
349    pub fn transform(&self, x: &Tensor) -> Tensor {
350        let x_data = x.data_f32();
351        let n_samples = x.dims()[0];
352        let n_features = x.dims()[1];
353
354        let mask = self.mask_.as_ref().expect("Model not fitted");
355        let n_selected: usize = mask.iter().filter(|&&m| m).count();
356
357        let mut result = Vec::with_capacity(n_samples * n_selected);
358
359        for i in 0..n_samples {
360            for (j, &keep) in mask.iter().enumerate() {
361                if keep {
362                    result.push(x_data[i * n_features + j]);
363                }
364            }
365        }
366
367        Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
368    }
369
370    pub fn fit_transform(&mut self, x: &Tensor, y: &Tensor) -> Tensor {
371        self.fit(x, y);
372        self.transform(x)
373    }
374
375    pub fn get_support(&self) -> Vec<bool> {
376        self.mask_.clone().unwrap_or_default()
377    }
378
379    pub fn scores(&self) -> Vec<f32> {
380        self.scores_.clone().unwrap_or_default()
381    }
382}
383
384/// Recursive Feature Elimination
/// Recursive Feature Elimination
///
/// Iteratively drops the least important features (ranked by correlation
/// with the target in `fit`) until `n_features_to_select` remain.
pub struct RFE {
    /// Number of features to retain after elimination.
    pub n_features_to_select: usize,
    /// Features eliminated per iteration (builder `step()` clamps to >= 1).
    pub step: usize,
    // Keep-mask over features (true = retained), set by `fit`.
    support_: Option<Vec<bool>>,
    // Per-feature elimination rank; selected features keep rank 1.
    ranking_: Option<Vec<usize>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
392
393impl RFE {
394    pub fn new(n_features_to_select: usize) -> Self {
395        RFE {
396            n_features_to_select,
397            step: 1,
398            support_: None,
399            ranking_: None,
400            n_features_in_: 0,
401        }
402    }
403
404    pub fn step(mut self, step: usize) -> Self {
405        self.step = step.max(1);
406        self
407    }
408
409    /// Fit RFE using feature importance from a simple model
410    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
411        let x_data = x.data_f32();
412        let y_data = y.data_f32();
413        let n_samples = x.dims()[0];
414        let n_features = x.dims()[1];
415
416        self.n_features_in_ = n_features;
417
418        let mut mask = vec![true; n_features];
419        let mut ranking = vec![1usize; n_features];
420        let mut current_rank = 1;
421
422        while mask.iter().filter(|&&m| m).count() > self.n_features_to_select {
423            // Compute feature importances using correlation with target
424            let importances: Vec<(usize, f32)> = (0..n_features)
425                .filter(|&j| mask[j])
426                .map(|j| {
427                    let x_mean: f32 = (0..n_samples)
428                        .map(|i| x_data[i * n_features + j])
429                        .sum::<f32>() / n_samples as f32;
430                    let y_mean: f32 = y_data.iter().sum::<f32>() / n_samples as f32;
431
432                    let mut cov = 0.0f32;
433                    let mut var_x = 0.0f32;
434                    let mut var_y = 0.0f32;
435
436                    for i in 0..n_samples {
437                        let xi = x_data[i * n_features + j] - x_mean;
438                        let yi = y_data[i] - y_mean;
439                        cov += xi * yi;
440                        var_x += xi * xi;
441                        var_y += yi * yi;
442                    }
443
444                    let corr = if var_x > 1e-10 && var_y > 1e-10 {
445                        (cov / (var_x.sqrt() * var_y.sqrt())).abs()
446                    } else {
447                        0.0
448                    };
449
450                    (j, corr)
451                })
452                .collect();
453
454            // Find features to eliminate
455            let mut sorted_importances = importances.clone();
456            sorted_importances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
457
458            let n_to_remove = self.step.min(mask.iter().filter(|&&m| m).count() - self.n_features_to_select);
459
460            current_rank += 1;
461            for (idx, _) in sorted_importances.iter().take(n_to_remove) {
462                mask[*idx] = false;
463                ranking[*idx] = current_rank;
464            }
465        }
466
467        self.support_ = Some(mask);
468        self.ranking_ = Some(ranking);
469    }
470
471    pub fn transform(&self, x: &Tensor) -> Tensor {
472        let x_data = x.data_f32();
473        let n_samples = x.dims()[0];
474        let n_features = x.dims()[1];
475
476        let mask = self.support_.as_ref().expect("Model not fitted");
477        let n_selected: usize = mask.iter().filter(|&&m| m).count();
478
479        let mut result = Vec::with_capacity(n_samples * n_selected);
480
481        for i in 0..n_samples {
482            for (j, &keep) in mask.iter().enumerate() {
483                if keep {
484                    result.push(x_data[i * n_features + j]);
485                }
486            }
487        }
488
489        Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
490    }
491
492    pub fn fit_transform(&mut self, x: &Tensor, y: &Tensor) -> Tensor {
493        self.fit(x, y);
494        self.transform(x)
495    }
496
497    pub fn get_support(&self) -> Vec<bool> {
498        self.support_.clone().unwrap_or_default()
499    }
500
501    pub fn ranking(&self) -> Vec<usize> {
502        self.ranking_.clone().unwrap_or_default()
503    }
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_variance_threshold() {
        // Columns 0 and 2 are constant; only column 1 carries any variance.
        let data = [
            0.0, 1.0, 0.0,
            0.0, 2.0, 0.0,
            0.0, 3.0, 0.0,
            0.0, 4.0, 0.0,
        ];
        let x = Tensor::from_slice(&data, &[4, 3]).unwrap();

        let mut selector = VarianceThreshold::new(0.0);
        let reduced = selector.fit_transform(&x);

        // The constant columns must not survive.
        assert!(reduced.dims()[1] <= 3);
    }

    #[test]
    fn test_select_k_best() {
        let data = [
            1.0, 0.0, 2.0,
            2.0, 0.0, 4.0,
            3.0, 0.0, 6.0,
            4.0, 0.0, 8.0,
        ];
        let x = Tensor::from_slice(&data, &[4, 3]).unwrap();
        let y = Tensor::from_slice(&[0.0, 0.0, 1.0, 1.0], &[4]).unwrap();

        let mut selector = SelectKBest::new(2, ScoreFunction::FClassif);
        let reduced = selector.fit_transform(&x, &y);

        // Exactly k = 2 columns remain.
        assert_eq!(reduced.dims()[1], 2);
    }

    #[test]
    fn test_rfe() {
        let data = [
            1.0, 0.0, 2.0, 0.5,
            2.0, 0.0, 4.0, 0.5,
            3.0, 0.0, 6.0, 0.5,
            4.0, 0.0, 8.0, 0.5,
        ];
        let x = Tensor::from_slice(&data, &[4, 4]).unwrap();
        let y = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0], &[4]).unwrap();

        let mut selector = RFE::new(2);
        let reduced = selector.fit_transform(&x, &y);

        // Elimination stops once 2 features remain.
        assert_eq!(reduced.dims()[1], 2);
    }
}