scirs2_series/feature_selection/
filter.rs1use scirs2_core::ndarray::{Array1, Array2};
4use std::collections::HashMap;
5
6use super::FeatureSelectionResult;
7use crate::error::{Result, TimeSeriesError};
8use crate::utils::autocorrelation;
9
10pub struct FilterMethods;
12
13impl FilterMethods {
14 pub fn variance_threshold(
38 features: &Array2<f64>,
39 threshold: f64,
40 ) -> Result<FeatureSelectionResult> {
41 let (n_samples, n_features) = features.dim();
42
43 if n_samples < 2 {
44 return Err(TimeSeriesError::InsufficientData {
45 message: "Need at least 2 samples for variance calculation".to_string(),
46 required: 2,
47 actual: n_samples,
48 });
49 }
50
51 let mut selected_features = Vec::new();
52 let mut feature_scores = Array1::zeros(n_features);
53
54 for i in 0..n_features {
55 let feature_col = features.column(i);
56 let mean = feature_col.sum() / n_samples as f64;
57 let variance = feature_col.mapv(|x| (x - mean).powi(2)).sum() / (n_samples - 1) as f64;
58
59 feature_scores[i] = variance;
60
61 if variance >= threshold {
62 selected_features.push(i);
63 }
64 }
65
66 let mut metadata = HashMap::new();
67 metadata.insert("threshold".to_string(), threshold);
68 metadata.insert("n_selected".to_string(), selected_features.len() as f64);
69
70 Ok(FeatureSelectionResult {
71 selected_features,
72 feature_scores,
73 method: "VarianceThreshold".to_string(),
74 metadata,
75 })
76 }
77
78 pub fn correlation_selection(
92 features: &Array2<f64>,
93 target: &Array1<f64>,
94 threshold: f64,
95 ) -> Result<FeatureSelectionResult> {
96 let (n_samples, n_features) = features.dim();
97
98 if n_samples != target.len() {
99 return Err(TimeSeriesError::DimensionMismatch {
100 expected: n_samples,
101 actual: target.len(),
102 });
103 }
104
105 if n_samples < 3 {
106 return Err(TimeSeriesError::InsufficientData {
107 message: "Need at least 3 samples for correlation calculation".to_string(),
108 required: 3,
109 actual: n_samples,
110 });
111 }
112
113 let mut selected_features = Vec::new();
114 let mut feature_scores = Array1::zeros(n_features);
115
116 let target_mean = target.sum() / n_samples as f64;
117
118 for i in 0..n_features {
119 let feature_col = features.column(i);
120 let feature_mean = feature_col.sum() / n_samples as f64;
121
122 let correlation =
123 Self::calculate_correlation(&feature_col, target, feature_mean, target_mean)?;
124 let abs_correlation = correlation.abs();
125
126 feature_scores[i] = abs_correlation;
127
128 if abs_correlation >= threshold {
129 selected_features.push(i);
130 }
131 }
132
133 let mut metadata = HashMap::new();
134 metadata.insert("threshold".to_string(), threshold);
135 metadata.insert("n_selected".to_string(), selected_features.len() as f64);
136
137 Ok(FeatureSelectionResult {
138 selected_features,
139 feature_scores,
140 method: "CorrelationSelection".to_string(),
141 metadata,
142 })
143 }
144
145 pub fn mutual_information_selection(
160 features: &Array2<f64>,
161 target: &Array1<f64>,
162 n_bins: usize,
163 n_features: Option<usize>,
164 ) -> Result<FeatureSelectionResult> {
165 let (n_samples, n_feat) = features.dim();
166
167 if n_samples != target.len() {
168 return Err(TimeSeriesError::DimensionMismatch {
169 expected: n_samples,
170 actual: target.len(),
171 });
172 }
173
174 if n_samples < n_bins * 2 {
175 return Err(TimeSeriesError::InsufficientData {
176 message: "Insufficient samples for mutual information estimation".to_string(),
177 required: n_bins * 2,
178 actual: n_samples,
179 });
180 }
181
182 let mut feature_scores = Array1::zeros(n_feat);
183
184 for i in 0..n_feat {
185 let feature_col = features.column(i);
186 let mi = Self::calculate_mutual_information(&feature_col, target, n_bins)?;
187 feature_scores[i] = mi;
188 }
189
190 let n_to_select = n_features.unwrap_or(n_feat / 2).min(n_feat);
192 let mut indexed_scores: Vec<(usize, f64)> = feature_scores
193 .iter()
194 .enumerate()
195 .map(|(i, &score)| (i, score))
196 .collect();
197
198 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
199
200 let selected_features: Vec<usize> = indexed_scores
201 .into_iter()
202 .take(n_to_select)
203 .map(|(idx_, _)| idx_)
204 .collect();
205
206 let mut metadata = HashMap::new();
207 metadata.insert("n_bins".to_string(), n_bins as f64);
208 metadata.insert("n_selected".to_string(), selected_features.len() as f64);
209
210 Ok(FeatureSelectionResult {
211 selected_features,
212 feature_scores,
213 method: "MutualInformation".to_string(),
214 metadata,
215 })
216 }
217
218 pub fn f_test_selection(
232 features: &Array2<f64>,
233 target: &Array1<f64>,
234 alpha: f64,
235 ) -> Result<FeatureSelectionResult> {
236 let (n_samples, n_features) = features.dim();
237
238 if n_samples != target.len() {
239 return Err(TimeSeriesError::DimensionMismatch {
240 expected: n_samples,
241 actual: target.len(),
242 });
243 }
244
245 if n_samples < 4 {
246 return Err(TimeSeriesError::InsufficientData {
247 message: "Need at least 4 samples for F-test".to_string(),
248 required: 4,
249 actual: n_samples,
250 });
251 }
252
253 let mut selected_features = Vec::new();
254 let mut feature_scores = Array1::zeros(n_features);
255
256 let target_mean = target.sum() / n_samples as f64;
257 let sst = target.mapv(|y| (y - target_mean).powi(2)).sum();
258
259 for i in 0..n_features {
260 let feature_col = features.column(i);
261 let f_stat = Self::calculate_f_statistic(&feature_col, target, target_mean, sst)?;
262
263 feature_scores[i] = f_stat;
264
265 let critical_value = Self::f_critical_value(alpha, n_samples);
268
269 if f_stat > critical_value {
270 selected_features.push(i);
271 }
272 }
273
274 let mut metadata = HashMap::new();
275 metadata.insert("alpha".to_string(), alpha);
276 metadata.insert("n_selected".to_string(), selected_features.len() as f64);
277
278 Ok(FeatureSelectionResult {
279 selected_features,
280 feature_scores,
281 method: "FTest".to_string(),
282 metadata,
283 })
284 }
285
286 pub fn autocorrelation_filter(
300 features: &Array2<f64>,
301 max_lag: usize,
302 threshold: f64,
303 ) -> Result<FeatureSelectionResult> {
304 let (n_samples, n_features) = features.dim();
305
306 if n_samples <= max_lag + 1 {
307 return Err(TimeSeriesError::InsufficientData {
308 message: "Insufficient samples for autocorrelation calculation".to_string(),
309 required: max_lag + 2,
310 actual: n_samples,
311 });
312 }
313
314 let mut selected_features = Vec::new();
315 let mut feature_scores = Array1::zeros(n_features);
316
317 for i in 0..n_features {
318 let feature_col = features.column(i).to_owned();
319
320 let max_acf = match autocorrelation(&feature_col, Some(max_lag)) {
322 Ok(acf) => {
323 acf.slice(scirs2_core::ndarray::s![1..])
325 .iter()
326 .map(|&x| x.abs())
327 .fold(0.0f64, |a, b| a.max(b))
328 }
329 Err(_) => {
330 0.0
332 }
333 };
334
335 feature_scores[i] = max_acf;
336
337 if max_acf >= threshold {
338 selected_features.push(i);
339 }
340 }
341
342 let mut metadata = HashMap::new();
343 metadata.insert("max_lag".to_string(), max_lag as f64);
344 metadata.insert("threshold".to_string(), threshold);
345 metadata.insert("n_selected".to_string(), selected_features.len() as f64);
346
347 Ok(FeatureSelectionResult {
348 selected_features,
349 feature_scores,
350 method: "AutocorrelationFilter".to_string(),
351 metadata,
352 })
353 }
354
355 fn calculate_correlation(
358 x: &scirs2_core::ndarray::ArrayView1<f64>,
359 y: &Array1<f64>,
360 x_mean: f64,
361 y_mean: f64,
362 ) -> Result<f64> {
363 let n = x.len();
364
365 let mut numerator = 0.0;
366 let mut x_var = 0.0;
367 let mut y_var = 0.0;
368
369 for i in 0..n {
370 let x_dev = x[i] - x_mean;
371 let y_dev = y[i] - y_mean;
372
373 numerator += x_dev * y_dev;
374 x_var += x_dev * x_dev;
375 y_var += y_dev * y_dev;
376 }
377
378 let denominator = (x_var * y_var).sqrt();
379
380 if denominator == 0.0 {
381 Ok(0.0)
382 } else {
383 Ok(numerator / denominator)
384 }
385 }
386
387 fn calculate_mutual_information(
388 x: &scirs2_core::ndarray::ArrayView1<f64>,
389 y: &Array1<f64>,
390 n_bins: usize,
391 ) -> Result<f64> {
392 let n = x.len();
393
394 let x_min = x.iter().fold(f64::INFINITY, |a, &b| a.min(b));
396 let x_max = x.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
397 let y_min = y.iter().fold(f64::INFINITY, |a, &b| a.min(b));
398 let y_max = y.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
399
400 if x_max == x_min || y_max == y_min {
401 return Ok(0.0);
402 }
403
404 let x_bin_size = (x_max - x_min) / n_bins as f64;
405 let y_bin_size = (y_max - y_min) / n_bins as f64;
406
407 let mut joint_counts = vec![vec![0; n_bins]; n_bins];
409 let mut x_counts = vec![0; n_bins];
410 let mut y_counts = vec![0; n_bins];
411
412 for i in 0..n {
413 let x_bin = ((x[i] - x_min) / x_bin_size).floor() as usize;
414 let y_bin = ((y[i] - y_min) / y_bin_size).floor() as usize;
415
416 let x_bin = x_bin.min(n_bins - 1);
417 let y_bin = y_bin.min(n_bins - 1);
418
419 joint_counts[x_bin][y_bin] += 1;
420 x_counts[x_bin] += 1;
421 y_counts[y_bin] += 1;
422 }
423
424 let mut mi = 0.0;
426
427 for (i_, _) in x_counts.iter().enumerate().take(n_bins) {
428 for (j_, _) in y_counts.iter().enumerate().take(n_bins) {
429 if joint_counts[i_][j_] > 0 && x_counts[i_] > 0 && y_counts[j_] > 0 {
430 let p_xy = joint_counts[i_][j_] as f64 / n as f64;
431 let p_x = x_counts[i_] as f64 / n as f64;
432 let p_y = y_counts[j_] as f64 / n as f64;
433
434 mi += p_xy * (p_xy / (p_x * p_y)).ln();
435 }
436 }
437 }
438
439 Ok(mi)
440 }
441
442 fn calculate_f_statistic(
443 x: &scirs2_core::ndarray::ArrayView1<f64>,
444 y: &Array1<f64>,
445 y_mean: f64,
446 sst: f64,
447 ) -> Result<f64> {
448 let n = x.len();
449
450 let x_mean = x.sum() / n as f64;
452
453 let mut num = 0.0;
454 let mut den = 0.0;
455
456 for i in 0..n {
457 let x_dev = x[i] - x_mean;
458 num += x_dev * (y[i] - y_mean);
459 den += x_dev * x_dev;
460 }
461
462 if den == 0.0 {
463 return Ok(0.0);
464 }
465
466 let beta = num / den;
467 let alpha = y_mean - beta * x_mean;
468
469 let mut ssr = 0.0;
471 for i in 0..n {
472 let y_pred = alpha + beta * x[i];
473 ssr += (y[i] - y_pred).powi(2);
474 }
475
476 let sse = sst - ssr;
477
478 if ssr == 0.0 {
479 return Ok(f64::INFINITY);
480 }
481
482 let f_stat = (sse * (n - 2) as f64) / ssr;
484
485 Ok(f_stat)
486 }
487
488 fn f_critical_value(alpha: f64, n: usize) -> f64 {
489 match alpha {
492 a if a <= 0.01 => 6.635 + 10.0 / (n as f64),
493 a if a <= 0.05 => 3.841 + 5.0 / (n as f64),
494 a if a <= 0.10 => 2.706 + 3.0 / (n as f64),
495 _ => 1.0,
496 }
497 }
498}