Skip to main content

scirs2_series/feature_selection/
filter.rs

//! Filter-based feature selection methods

use std::collections::HashMap;

use scirs2_core::ndarray::{Array1, Array2};

use super::FeatureSelectionResult;
use crate::error::{Result, TimeSeriesError};
use crate::utils::autocorrelation;
9
/// Filter-based feature selection methods.
///
/// Unit struct used purely as a namespace for associated functions; it
/// carries no state and is never instantiated.
pub struct FilterMethods;
12
13impl FilterMethods {
14    /// Variance-based feature selection
15    ///
16    /// Removes features with low variance (quasi-constant features).
17    ///
18    /// # Arguments
19    ///
20    /// * `features` - Feature matrix (samples x features)
21    /// * `threshold` - Minimum variance threshold
22    ///
23    /// # Returns
24    ///
25    /// * Feature selection result
26    ///
27    /// # Example
28    ///
29    /// ```
30    /// use scirs2_core::ndarray::Array2;
31    /// use scirs2_series::feature_selection::FilterMethods;
32    ///
33    /// let features = Array2::from_shape_vec((100, 5), (0..500).map(|x| x as f64).collect()).expect("Operation failed");
34    /// let result = FilterMethods::variance_threshold(&features, 0.1).expect("Operation failed");
35    /// println!("Selected {} features", result.selected_features.len());
36    /// ```
37    pub fn variance_threshold(
38        features: &Array2<f64>,
39        threshold: f64,
40    ) -> Result<FeatureSelectionResult> {
41        let (n_samples, n_features) = features.dim();
42
43        if n_samples < 2 {
44            return Err(TimeSeriesError::InsufficientData {
45                message: "Need at least 2 samples for variance calculation".to_string(),
46                required: 2,
47                actual: n_samples,
48            });
49        }
50
51        let mut selected_features = Vec::new();
52        let mut feature_scores = Array1::zeros(n_features);
53
54        for i in 0..n_features {
55            let feature_col = features.column(i);
56            let mean = feature_col.sum() / n_samples as f64;
57            let variance = feature_col.mapv(|x| (x - mean).powi(2)).sum() / (n_samples - 1) as f64;
58
59            feature_scores[i] = variance;
60
61            if variance >= threshold {
62                selected_features.push(i);
63            }
64        }
65
66        let mut metadata = HashMap::new();
67        metadata.insert("threshold".to_string(), threshold);
68        metadata.insert("n_selected".to_string(), selected_features.len() as f64);
69
70        Ok(FeatureSelectionResult {
71            selected_features,
72            feature_scores,
73            method: "VarianceThreshold".to_string(),
74            metadata,
75        })
76    }
77
78    /// Correlation-based feature selection
79    ///
80    /// Selects features based on their correlation with the target variable.
81    ///
82    /// # Arguments
83    ///
84    /// * `features` - Feature matrix (samples x features)
85    /// * `target` - Target variable
86    /// * `threshold` - Minimum absolute correlation threshold
87    ///
88    /// # Returns
89    ///
90    /// * Feature selection result
91    pub fn correlation_selection(
92        features: &Array2<f64>,
93        target: &Array1<f64>,
94        threshold: f64,
95    ) -> Result<FeatureSelectionResult> {
96        let (n_samples, n_features) = features.dim();
97
98        if n_samples != target.len() {
99            return Err(TimeSeriesError::DimensionMismatch {
100                expected: n_samples,
101                actual: target.len(),
102            });
103        }
104
105        if n_samples < 3 {
106            return Err(TimeSeriesError::InsufficientData {
107                message: "Need at least 3 samples for correlation calculation".to_string(),
108                required: 3,
109                actual: n_samples,
110            });
111        }
112
113        let mut selected_features = Vec::new();
114        let mut feature_scores = Array1::zeros(n_features);
115
116        let target_mean = target.sum() / n_samples as f64;
117
118        for i in 0..n_features {
119            let feature_col = features.column(i);
120            let feature_mean = feature_col.sum() / n_samples as f64;
121
122            let correlation =
123                Self::calculate_correlation(&feature_col, target, feature_mean, target_mean)?;
124            let abs_correlation = correlation.abs();
125
126            feature_scores[i] = abs_correlation;
127
128            if abs_correlation >= threshold {
129                selected_features.push(i);
130            }
131        }
132
133        let mut metadata = HashMap::new();
134        metadata.insert("threshold".to_string(), threshold);
135        metadata.insert("n_selected".to_string(), selected_features.len() as f64);
136
137        Ok(FeatureSelectionResult {
138            selected_features,
139            feature_scores,
140            method: "CorrelationSelection".to_string(),
141            metadata,
142        })
143    }
144
145    /// Mutual information-based feature selection for time series
146    ///
147    /// Estimates mutual information between features and target using histogram-based approach.
148    ///
149    /// # Arguments
150    ///
151    /// * `features` - Feature matrix (samples x features)
152    /// * `target` - Target variable
153    /// * `n_bins` - Number of bins for histogram estimation
154    /// * `n_features` - Number of top features to select
155    ///
156    /// # Returns
157    ///
158    /// * Feature selection result
159    pub fn mutual_information_selection(
160        features: &Array2<f64>,
161        target: &Array1<f64>,
162        n_bins: usize,
163        n_features: Option<usize>,
164    ) -> Result<FeatureSelectionResult> {
165        let (n_samples, n_feat) = features.dim();
166
167        if n_samples != target.len() {
168            return Err(TimeSeriesError::DimensionMismatch {
169                expected: n_samples,
170                actual: target.len(),
171            });
172        }
173
174        if n_samples < n_bins * 2 {
175            return Err(TimeSeriesError::InsufficientData {
176                message: "Insufficient samples for mutual information estimation".to_string(),
177                required: n_bins * 2,
178                actual: n_samples,
179            });
180        }
181
182        let mut feature_scores = Array1::zeros(n_feat);
183
184        for i in 0..n_feat {
185            let feature_col = features.column(i);
186            let mi = Self::calculate_mutual_information(&feature_col, target, n_bins)?;
187            feature_scores[i] = mi;
188        }
189
190        // Select top _features
191        let n_to_select = n_features.unwrap_or(n_feat / 2).min(n_feat);
192        let mut indexed_scores: Vec<(usize, f64)> = feature_scores
193            .iter()
194            .enumerate()
195            .map(|(i, &score)| (i, score))
196            .collect();
197
198        indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
199
200        let selected_features: Vec<usize> = indexed_scores
201            .into_iter()
202            .take(n_to_select)
203            .map(|(idx_, _)| idx_)
204            .collect();
205
206        let mut metadata = HashMap::new();
207        metadata.insert("n_bins".to_string(), n_bins as f64);
208        metadata.insert("n_selected".to_string(), selected_features.len() as f64);
209
210        Ok(FeatureSelectionResult {
211            selected_features,
212            feature_scores,
213            method: "MutualInformation".to_string(),
214            metadata,
215        })
216    }
217
218    /// F-test based feature selection
219    ///
220    /// Performs F-test for feature relevance.
221    ///
222    /// # Arguments
223    ///
224    /// * `features` - Feature matrix (samples x features)
225    /// * `target` - Target variable
226    /// * `alpha` - Significance level
227    ///
228    /// # Returns
229    ///
230    /// * Feature selection result
231    pub fn f_test_selection(
232        features: &Array2<f64>,
233        target: &Array1<f64>,
234        alpha: f64,
235    ) -> Result<FeatureSelectionResult> {
236        let (n_samples, n_features) = features.dim();
237
238        if n_samples != target.len() {
239            return Err(TimeSeriesError::DimensionMismatch {
240                expected: n_samples,
241                actual: target.len(),
242            });
243        }
244
245        if n_samples < 4 {
246            return Err(TimeSeriesError::InsufficientData {
247                message: "Need at least 4 samples for F-test".to_string(),
248                required: 4,
249                actual: n_samples,
250            });
251        }
252
253        let mut selected_features = Vec::new();
254        let mut feature_scores = Array1::zeros(n_features);
255
256        let target_mean = target.sum() / n_samples as f64;
257        let sst = target.mapv(|y| (y - target_mean).powi(2)).sum();
258
259        for i in 0..n_features {
260            let feature_col = features.column(i);
261            let f_stat = Self::calculate_f_statistic(&feature_col, target, target_mean, sst)?;
262
263            feature_scores[i] = f_stat;
264
265            // Critical value for F(1, n-2) distribution at significance level alpha
266            // Simplified approximation - in practice, use proper F-distribution
267            let critical_value = Self::f_critical_value(alpha, n_samples);
268
269            if f_stat > critical_value {
270                selected_features.push(i);
271            }
272        }
273
274        let mut metadata = HashMap::new();
275        metadata.insert("alpha".to_string(), alpha);
276        metadata.insert("n_selected".to_string(), selected_features.len() as f64);
277
278        Ok(FeatureSelectionResult {
279            selected_features,
280            feature_scores,
281            method: "FTest".to_string(),
282            metadata,
283        })
284    }
285
286    /// Autocorrelation-based feature filtering for time series
287    ///
288    /// Selects features based on their autocorrelation structure.
289    ///
290    /// # Arguments
291    ///
292    /// * `features` - Feature matrix (samples x features)
293    /// * `max_lag` - Maximum lag to consider
294    /// * `threshold` - Minimum autocorrelation threshold
295    ///
296    /// # Returns
297    ///
298    /// * Feature selection result
299    pub fn autocorrelation_filter(
300        features: &Array2<f64>,
301        max_lag: usize,
302        threshold: f64,
303    ) -> Result<FeatureSelectionResult> {
304        let (n_samples, n_features) = features.dim();
305
306        if n_samples <= max_lag + 1 {
307            return Err(TimeSeriesError::InsufficientData {
308                message: "Insufficient samples for autocorrelation calculation".to_string(),
309                required: max_lag + 2,
310                actual: n_samples,
311            });
312        }
313
314        let mut selected_features = Vec::new();
315        let mut feature_scores = Array1::zeros(n_features);
316
317        for i in 0..n_features {
318            let feature_col = features.column(i).to_owned();
319
320            // Calculate autocorrelation function, handle constant series
321            let max_acf = match autocorrelation(&feature_col, Some(max_lag)) {
322                Ok(acf) => {
323                    // Score is the maximum significant autocorrelation
324                    acf.slice(scirs2_core::ndarray::s![1..])
325                        .iter()
326                        .map(|&x| x.abs())
327                        .fold(0.0f64, |a, b| a.max(b))
328                }
329                Err(_) => {
330                    // Constant series or other error - assign zero autocorrelation
331                    0.0
332                }
333            };
334
335            feature_scores[i] = max_acf;
336
337            if max_acf >= threshold {
338                selected_features.push(i);
339            }
340        }
341
342        let mut metadata = HashMap::new();
343        metadata.insert("max_lag".to_string(), max_lag as f64);
344        metadata.insert("threshold".to_string(), threshold);
345        metadata.insert("n_selected".to_string(), selected_features.len() as f64);
346
347        Ok(FeatureSelectionResult {
348            selected_features,
349            feature_scores,
350            method: "AutocorrelationFilter".to_string(),
351            metadata,
352        })
353    }
354
355    // Helper methods
356
357    fn calculate_correlation(
358        x: &scirs2_core::ndarray::ArrayView1<f64>,
359        y: &Array1<f64>,
360        x_mean: f64,
361        y_mean: f64,
362    ) -> Result<f64> {
363        let n = x.len();
364
365        let mut numerator = 0.0;
366        let mut x_var = 0.0;
367        let mut y_var = 0.0;
368
369        for i in 0..n {
370            let x_dev = x[i] - x_mean;
371            let y_dev = y[i] - y_mean;
372
373            numerator += x_dev * y_dev;
374            x_var += x_dev * x_dev;
375            y_var += y_dev * y_dev;
376        }
377
378        let denominator = (x_var * y_var).sqrt();
379
380        if denominator == 0.0 {
381            Ok(0.0)
382        } else {
383            Ok(numerator / denominator)
384        }
385    }
386
387    fn calculate_mutual_information(
388        x: &scirs2_core::ndarray::ArrayView1<f64>,
389        y: &Array1<f64>,
390        n_bins: usize,
391    ) -> Result<f64> {
392        let n = x.len();
393
394        // Create _bins
395        let x_min = x.iter().fold(f64::INFINITY, |a, &b| a.min(b));
396        let x_max = x.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
397        let y_min = y.iter().fold(f64::INFINITY, |a, &b| a.min(b));
398        let y_max = y.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
399
400        if x_max == x_min || y_max == y_min {
401            return Ok(0.0);
402        }
403
404        let x_bin_size = (x_max - x_min) / n_bins as f64;
405        let y_bin_size = (y_max - y_min) / n_bins as f64;
406
407        // Count occurrences
408        let mut joint_counts = vec![vec![0; n_bins]; n_bins];
409        let mut x_counts = vec![0; n_bins];
410        let mut y_counts = vec![0; n_bins];
411
412        for i in 0..n {
413            let x_bin = ((x[i] - x_min) / x_bin_size).floor() as usize;
414            let y_bin = ((y[i] - y_min) / y_bin_size).floor() as usize;
415
416            let x_bin = x_bin.min(n_bins - 1);
417            let y_bin = y_bin.min(n_bins - 1);
418
419            joint_counts[x_bin][y_bin] += 1;
420            x_counts[x_bin] += 1;
421            y_counts[y_bin] += 1;
422        }
423
424        // Calculate mutual information
425        let mut mi = 0.0;
426
427        for (i_, _) in x_counts.iter().enumerate().take(n_bins) {
428            for (j_, _) in y_counts.iter().enumerate().take(n_bins) {
429                if joint_counts[i_][j_] > 0 && x_counts[i_] > 0 && y_counts[j_] > 0 {
430                    let p_xy = joint_counts[i_][j_] as f64 / n as f64;
431                    let p_x = x_counts[i_] as f64 / n as f64;
432                    let p_y = y_counts[j_] as f64 / n as f64;
433
434                    mi += p_xy * (p_xy / (p_x * p_y)).ln();
435                }
436            }
437        }
438
439        Ok(mi)
440    }
441
442    fn calculate_f_statistic(
443        x: &scirs2_core::ndarray::ArrayView1<f64>,
444        y: &Array1<f64>,
445        y_mean: f64,
446        sst: f64,
447    ) -> Result<f64> {
448        let n = x.len();
449
450        // Calculate regression coefficients
451        let x_mean = x.sum() / n as f64;
452
453        let mut num = 0.0;
454        let mut den = 0.0;
455
456        for i in 0..n {
457            let x_dev = x[i] - x_mean;
458            num += x_dev * (y[i] - y_mean);
459            den += x_dev * x_dev;
460        }
461
462        if den == 0.0 {
463            return Ok(0.0);
464        }
465
466        let beta = num / den;
467        let alpha = y_mean - beta * x_mean;
468
469        // Calculate sum of squared residuals
470        let mut ssr = 0.0;
471        for i in 0..n {
472            let y_pred = alpha + beta * x[i];
473            ssr += (y[i] - y_pred).powi(2);
474        }
475
476        let sse = sst - ssr;
477
478        if ssr == 0.0 {
479            return Ok(f64::INFINITY);
480        }
481
482        // F-statistic = (SSE/1) / (SSR/(n-2))
483        let f_stat = (sse * (n - 2) as f64) / ssr;
484
485        Ok(f_stat)
486    }
487
488    fn f_critical_value(alpha: f64, n: usize) -> f64 {
489        // Simplified approximation for F(1, n-2) critical values
490        // In practice, use proper F-distribution tables or functions
491        match alpha {
492            a if a <= 0.01 => 6.635 + 10.0 / (n as f64),
493            a if a <= 0.05 => 3.841 + 5.0 / (n as f64),
494            a if a <= 0.10 => 2.706 + 3.0 / (n as f64),
495            _ => 1.0,
496        }
497    }
498}