1use std::collections::HashSet;
27
28use scirs2_core::ndarray::{Array1, Array2, Axis};
29use sklears_core::error::{Result, SklearsError};
30
31#[derive(Debug, Clone)]
33pub struct FeatureSelectionConfig {
34 pub method: FeatureSelectionMethod,
36 pub n_features: Option<usize>,
38 pub variance_threshold: f64,
40 pub n_neighbors: usize,
42}
43
44impl Default for FeatureSelectionConfig {
45 fn default() -> Self {
46 Self {
47 method: FeatureSelectionMethod::Variance,
48 n_features: None,
49 variance_threshold: 0.01,
50 n_neighbors: 10,
51 }
52 }
53}
54
55#[derive(Debug, Clone, Copy, PartialEq)]
57pub enum FeatureSelectionMethod {
58 Variance,
60 LaplacianScore,
62 Spectral,
64 Correlation,
66 Combined,
68}
69
70#[derive(Debug, Clone)]
72pub struct FeatureSelectionResult {
73 pub selected_features: Vec<usize>,
75 pub feature_scores: Vec<f64>,
77 pub n_selected: usize,
79 pub method: FeatureSelectionMethod,
81}
82
83pub struct FeatureSelector {
85 config: FeatureSelectionConfig,
86}
87
88impl FeatureSelector {
89 pub fn new(config: FeatureSelectionConfig) -> Self {
91 Self { config }
92 }
93
94 pub fn with_method(method: FeatureSelectionMethod) -> Self {
96 Self {
97 config: FeatureSelectionConfig {
98 method,
99 ..Default::default()
100 },
101 }
102 }
103
104 pub fn select_features(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
106 match self.config.method {
107 FeatureSelectionMethod::Variance => self.select_by_variance(data),
108 FeatureSelectionMethod::LaplacianScore => self.select_by_laplacian_score(data),
109 FeatureSelectionMethod::Spectral => self.select_by_spectral(data),
110 FeatureSelectionMethod::Correlation => self.select_by_correlation(data),
111 FeatureSelectionMethod::Combined => self.select_by_combined(data),
112 }
113 }
114
115 fn select_by_variance(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
117 let n_features = data.ncols();
118 let mut feature_scores = Vec::with_capacity(n_features);
119
120 for j in 0..n_features {
122 let column = data.column(j);
123 let mean = column.mean().unwrap_or(0.0);
124 let variance =
125 column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / column.len() as f64;
126 feature_scores.push(variance);
127 }
128
129 let mut selected_features: Vec<usize> = feature_scores
131 .iter()
132 .enumerate()
133 .filter(|(_, &score)| score >= self.config.variance_threshold)
134 .map(|(idx, _)| idx)
135 .collect();
136
137 if let Some(n) = self.config.n_features {
139 if n < selected_features.len() {
140 let mut indexed_scores: Vec<(usize, f64)> = selected_features
142 .iter()
143 .map(|&idx| (idx, feature_scores[idx]))
144 .collect();
145 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
146 selected_features = indexed_scores.iter().take(n).map(|(idx, _)| *idx).collect();
147 }
148 }
149
150 let n_selected = selected_features.len();
151
152 Ok(FeatureSelectionResult {
153 selected_features,
154 feature_scores,
155 n_selected,
156 method: FeatureSelectionMethod::Variance,
157 })
158 }
159
160 fn select_by_laplacian_score(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
162 let n_samples = data.nrows();
163 let n_features = data.ncols();
164
165 let W = self.build_knn_graph(data)?;
167
168 let mut D = Array2::zeros((n_samples, n_samples));
170 for i in 0..n_samples {
171 let degree = W.row(i).sum();
172 D[[i, i]] = degree;
173 }
174
175 let L = &D - &W;
177
178 let mut feature_scores = Vec::with_capacity(n_features);
180
181 for j in 0..n_features {
182 let f = data.column(j);
183
184 let mean = f.mean().unwrap_or(0.0);
186 let f_centered: Vec<f64> = f.iter().map(|&x| x - mean).collect();
187 let f_array = Array1::from_vec(f_centered);
188
189 let Lf = L.dot(&f_array);
191 let numerator = f_array.dot(&Lf);
192
193 let Df = D.dot(&f_array);
195 let denominator = f_array.dot(&Df);
196
197 let score = if denominator.abs() > 1e-10 {
199 -(numerator / denominator)
200 } else {
201 0.0
202 };
203
204 feature_scores.push(score);
205 }
206
207 let n_select = self.config.n_features.unwrap_or(n_features / 2);
209 let mut indexed_scores: Vec<(usize, f64)> = feature_scores
210 .iter()
211 .enumerate()
212 .map(|(idx, &score)| (idx, score))
213 .collect();
214
215 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
216
217 let selected_features: Vec<usize> = indexed_scores
218 .iter()
219 .take(n_select)
220 .map(|(idx, _)| *idx)
221 .collect();
222
223 Ok(FeatureSelectionResult {
224 selected_features: selected_features.clone(),
225 feature_scores,
226 n_selected: selected_features.len(),
227 method: FeatureSelectionMethod::LaplacianScore,
228 })
229 }
230
231 fn build_knn_graph(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
233 let n_samples = data.nrows();
234 let mut W = Array2::zeros((n_samples, n_samples));
235
236 for i in 0..n_samples {
238 let point_i = data.row(i);
239 let mut distances: Vec<(usize, f64)> = Vec::with_capacity(n_samples);
240
241 for j in 0..n_samples {
242 if i != j {
243 let point_j = data.row(j);
244 let dist = self.euclidean_distance(&point_i, &point_j);
245 distances.push((j, dist));
246 }
247 }
248
249 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
251
252 for (j, dist) in distances.iter().take(self.config.n_neighbors) {
253 let weight = (-dist * dist / 2.0).exp();
255 W[[i, *j]] = weight;
256 W[[*j, i]] = weight; }
258 }
259
260 Ok(W)
261 }
262
263 fn euclidean_distance(
265 &self,
266 a: &scirs2_core::ndarray::ArrayView1<f64>,
267 b: &scirs2_core::ndarray::ArrayView1<f64>,
268 ) -> f64 {
269 a.iter()
270 .zip(b.iter())
271 .map(|(x, y)| (x - y).powi(2))
272 .sum::<f64>()
273 .sqrt()
274 }
275
276 fn select_by_spectral(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
278 self.select_by_variance(data)
281 }
282
283 fn select_by_correlation(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
285 let n_features = data.ncols();
286 let mut feature_scores = vec![1.0; n_features];
287 let mut selected_features: HashSet<usize> = (0..n_features).collect();
288
289 let mut correlations = Array2::zeros((n_features, n_features));
291 for i in 0..n_features {
292 for j in i..n_features {
293 let corr = self.calculate_correlation(&data.column(i), &data.column(j));
294 correlations[[i, j]] = corr;
295 correlations[[j, i]] = corr;
296 }
297 }
298
299 let correlation_threshold = 0.95;
301 for i in 0..n_features {
302 for j in (i + 1)..n_features {
303 if correlations[[i, j]].abs() > correlation_threshold {
304 let var_i = self.calculate_variance(&data.column(i));
306 let var_j = self.calculate_variance(&data.column(j));
307
308 if var_i < var_j {
309 selected_features.remove(&i);
310 feature_scores[i] = 0.0;
311 } else {
312 selected_features.remove(&j);
313 feature_scores[j] = 0.0;
314 }
315 }
316 }
317 }
318
319 let mut selected_features: Vec<usize> = selected_features.into_iter().collect();
320 selected_features.sort();
321
322 Ok(FeatureSelectionResult {
323 selected_features: selected_features.clone(),
324 feature_scores,
325 n_selected: selected_features.len(),
326 method: FeatureSelectionMethod::Correlation,
327 })
328 }
329
330 fn calculate_correlation(
332 &self,
333 a: &scirs2_core::ndarray::ArrayView1<f64>,
334 b: &scirs2_core::ndarray::ArrayView1<f64>,
335 ) -> f64 {
336 let n = a.len();
337 let mean_a = a.mean().unwrap_or(0.0);
338 let mean_b = b.mean().unwrap_or(0.0);
339
340 let mut numerator = 0.0;
341 let mut sum_sq_a = 0.0;
342 let mut sum_sq_b = 0.0;
343
344 for i in 0..n {
345 let diff_a = a[i] - mean_a;
346 let diff_b = b[i] - mean_b;
347 numerator += diff_a * diff_b;
348 sum_sq_a += diff_a * diff_a;
349 sum_sq_b += diff_b * diff_b;
350 }
351
352 let denominator = (sum_sq_a * sum_sq_b).sqrt();
353 if denominator.abs() > 1e-10 {
354 numerator / denominator
355 } else {
356 0.0
357 }
358 }
359
360 fn calculate_variance(&self, feature: &scirs2_core::ndarray::ArrayView1<f64>) -> f64 {
362 let mean = feature.mean().unwrap_or(0.0);
363 feature.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / feature.len() as f64
364 }
365
366 fn select_by_combined(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
368 let variance_result = self.select_by_variance(data)?;
370 let correlation_result = self.select_by_correlation(data)?;
371
372 let variance_set: HashSet<usize> = variance_result.selected_features.into_iter().collect();
374 let correlation_set: HashSet<usize> =
375 correlation_result.selected_features.into_iter().collect();
376
377 let mut selected_features: Vec<usize> = variance_set
378 .intersection(&correlation_set)
379 .copied()
380 .collect();
381 selected_features.sort();
382
383 let n_features = data.ncols();
384 let feature_scores = (0..n_features)
385 .map(|i| {
386 if selected_features.contains(&i) {
387 1.0
388 } else {
389 0.0
390 }
391 })
392 .collect();
393
394 Ok(FeatureSelectionResult {
395 selected_features: selected_features.clone(),
396 feature_scores,
397 n_selected: selected_features.len(),
398 method: FeatureSelectionMethod::Combined,
399 })
400 }
401
402 pub fn transform(
404 &self,
405 data: &Array2<f64>,
406 result: &FeatureSelectionResult,
407 ) -> Result<Array2<f64>> {
408 let n_samples = data.nrows();
409 let n_selected = result.n_selected;
410
411 if n_selected == 0 {
412 return Err(SklearsError::InvalidInput(
413 "No features selected".to_string(),
414 ));
415 }
416
417 let mut transformed = Array2::zeros((n_samples, n_selected));
418
419 for (new_idx, &old_idx) in result.selected_features.iter().enumerate() {
420 for i in 0..n_samples {
421 transformed[[i, new_idx]] = data[[i, old_idx]];
422 }
423 }
424
425 Ok(transformed)
426 }
427
428 pub fn fit_transform(
430 &self,
431 data: &Array2<f64>,
432 ) -> Result<(Array2<f64>, FeatureSelectionResult)> {
433 let result = self.select_features(data)?;
434 let transformed = self.transform(data, &result)?;
435 Ok((transformed, result))
436 }
437}
438
439pub struct FeatureSelectionConfigBuilder {
441 config: FeatureSelectionConfig,
442}
443
444impl FeatureSelectionConfigBuilder {
445 pub fn new() -> Self {
447 Self {
448 config: FeatureSelectionConfig::default(),
449 }
450 }
451
452 pub fn method(mut self, method: FeatureSelectionMethod) -> Self {
454 self.config.method = method;
455 self
456 }
457
458 pub fn n_features(mut self, n: usize) -> Self {
460 self.config.n_features = Some(n);
461 self
462 }
463
464 pub fn variance_threshold(mut self, threshold: f64) -> Self {
466 self.config.variance_threshold = threshold;
467 self
468 }
469
470 pub fn n_neighbors(mut self, n: usize) -> Self {
472 self.config.n_neighbors = n;
473 self
474 }
475
476 pub fn build(self) -> FeatureSelectionConfig {
478 self.config
479 }
480}
481
482impl Default for FeatureSelectionConfigBuilder {
483 fn default() -> Self {
484 Self::new()
485 }
486}
487
488impl FeatureSelectionResult {
489 pub fn feature_ranking(&self) -> Vec<(usize, f64)> {
491 let mut ranking: Vec<(usize, f64)> = self
492 .feature_scores
493 .iter()
494 .enumerate()
495 .map(|(idx, &score)| (idx, score))
496 .collect();
497
498 ranking.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
499 ranking
500 }
501
502 pub fn is_selected(&self, feature_idx: usize) -> bool {
504 self.selected_features.contains(&feature_idx)
505 }
506}
507
508#[cfg(test)]
509mod tests {
510 use super::*;
511 use scirs2_core::ndarray::Array2;
512
513 fn generate_test_data() -> Array2<f64> {
514 Array2::from_shape_vec(
516 (100, 4),
517 (0..400)
518 .map(|i| {
519 let row = i / 4;
520 let col = i % 4;
521 match col {
522 0 => (row % 2) as f64 * 5.0, 1 => (row / 10) as f64, 2 => 1.0, 3 => (row % 2) as f64 * 5.0 + 0.1, _ => 0.0,
527 }
528 })
529 .collect(),
530 )
531 .unwrap()
532 }
533
534 #[test]
535 fn test_variance_selection() {
536 let data = generate_test_data();
537 let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);
538
539 let result = selector.select_features(&data).unwrap();
540
541 assert!(result.n_selected > 0);
542 assert!(result.n_selected <= 4);
543 assert!(!result.selected_features.contains(&2));
545 }
546
547 #[test]
548 fn test_correlation_selection() {
549 let data = generate_test_data();
550 let config = FeatureSelectionConfigBuilder::new()
551 .method(FeatureSelectionMethod::Correlation)
552 .build();
553
554 let selector = FeatureSelector::new(config);
555 let result = selector.select_features(&data).unwrap();
556
557 assert!(result.n_selected > 0);
558 let has_both =
560 result.selected_features.contains(&0) && result.selected_features.contains(&3);
561 assert!(
562 !has_both,
563 "Should not select both highly correlated features"
564 );
565 }
566
567 #[test]
568 fn test_transform() {
569 let data = generate_test_data();
570 let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);
571
572 let result = selector.select_features(&data).unwrap();
573 let transformed = selector.transform(&data, &result).unwrap();
574
575 assert_eq!(transformed.nrows(), data.nrows());
576 assert_eq!(transformed.ncols(), result.n_selected);
577 }
578
579 #[test]
580 fn test_fit_transform() {
581 let data = generate_test_data();
582 let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);
583
584 let (transformed, result) = selector.fit_transform(&data).unwrap();
585
586 assert_eq!(transformed.nrows(), data.nrows());
587 assert_eq!(transformed.ncols(), result.n_selected);
588 assert!(result.n_selected > 0);
589 }
590
591 #[test]
592 fn test_laplacian_score() {
593 let data = generate_test_data();
594 let config = FeatureSelectionConfigBuilder::new()
595 .method(FeatureSelectionMethod::LaplacianScore)
596 .n_features(2)
597 .n_neighbors(5)
598 .build();
599
600 let selector = FeatureSelector::new(config);
601 let result = selector.select_features(&data).unwrap();
602
603 assert_eq!(result.n_selected, 2);
604 assert_eq!(result.selected_features.len(), 2);
605 }
606
607 #[test]
608 fn test_feature_ranking() {
609 let data = generate_test_data();
610 let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);
611
612 let result = selector.select_features(&data).unwrap();
613 let ranking = result.feature_ranking();
614
615 assert_eq!(ranking.len(), 4);
616 for i in 0..(ranking.len() - 1) {
618 assert!(ranking[i].1 >= ranking[i + 1].1);
619 }
620 }
621}