sklears_clustering/feature_selection.rs

//! Feature Selection for Clustering
//!
//! This module provides methods for selecting relevant features for clustering,
//! which can improve clustering quality, reduce computational cost, and enhance
//! interpretability for high-dimensional data.
//!
//! # Methods Provided
//! - **Variance-based selection**: remove features whose variance falls below a threshold
//! - **Laplacian Score**: select features that preserve local neighborhood structure
//! - **Spectral feature selection**: graph-based selection (currently approximated by
//!   the variance criterion)
//! - **Correlation-based selection**: remove redundant, highly correlated features
//! - **Combined selection**: intersection of the variance and correlation criteria
//!
//! # Mathematical Background
//!
//! ## Laplacian Score
//! For a feature vector f, the Laplacian score measures how well f preserves locality:
//!
//! L_f = (f^T L f) / (f^T D f)
//!
//! where f is the mean-centered feature, L = D - W is the Laplacian of a
//! k-nearest-neighbor graph with weight matrix W, and D is the degree matrix.
//! Lower scores indicate better locality preservation; this module negates the
//! score so that higher values are always better.
//!
//! ## SPEC (Spectral Feature Selection)
//! Selects features that are consistent with the cluster structure by:
//! 1. Computing the graph Laplacian eigenvectors
//! 2. Selecting features with high correlation to the leading eigenvectors
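//!
//! # Examples
//!
//! A minimal usage sketch (illustrative only; it assumes the crate is used as
//! `sklears_clustering` and that this module is reachable as `feature_selection`):
//!
//! ```ignore
//! use scirs2_core::ndarray::Array2;
//! use sklears_clustering::feature_selection::{
//!     FeatureSelectionConfigBuilder, FeatureSelectionMethod, FeatureSelector,
//! };
//!
//! // Configure a selector that keeps the two features with the best Laplacian scores.
//! let config = FeatureSelectionConfigBuilder::new()
//!     .method(FeatureSelectionMethod::LaplacianScore)
//!     .n_features(2)
//!     .n_neighbors(10)
//!     .build();
//! let selector = FeatureSelector::new(config);
//!
//! // `data` is any (n_samples, n_features) matrix of f64 values.
//! let data = Array2::<f64>::zeros((100, 4));
//! let (reduced, result) = selector.fit_transform(&data).unwrap();
//! assert_eq!(reduced.ncols(), result.n_selected);
//! ```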

use std::collections::HashSet;

use scirs2_core::ndarray::{Array1, Array2};
use sklears_core::error::{Result, SklearsError};

/// Feature selection configuration
#[derive(Debug, Clone)]
pub struct FeatureSelectionConfig {
    /// Method to use for feature selection
    pub method: FeatureSelectionMethod,
    /// Number of features to select (if None, determined automatically)
    pub n_features: Option<usize>,
    /// Variance threshold for variance-based selection
    pub variance_threshold: f64,
    /// Number of neighbors for graph-based methods
    pub n_neighbors: usize,
}

impl Default for FeatureSelectionConfig {
    fn default() -> Self {
        Self {
            method: FeatureSelectionMethod::Variance,
            n_features: None,
            variance_threshold: 0.01,
            n_neighbors: 10,
        }
    }
}

/// Feature selection methods
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FeatureSelectionMethod {
    /// Remove features with variance below threshold
    Variance,
    /// Laplacian score for locality preservation
    LaplacianScore,
    /// Spectral feature selection
    Spectral,
    /// Correlation-based feature selection
    Correlation,
    /// Combined method using multiple criteria
    Combined,
}

/// Result of feature selection
#[derive(Debug, Clone)]
pub struct FeatureSelectionResult {
    /// Indices of selected features
    pub selected_features: Vec<usize>,
    /// Scores for each feature (higher is better)
    pub feature_scores: Vec<f64>,
    /// Number of features selected
    pub n_selected: usize,
    /// Method used
    pub method: FeatureSelectionMethod,
}

/// Feature selector for clustering
pub struct FeatureSelector {
    config: FeatureSelectionConfig,
}

impl FeatureSelector {
    /// Create a new feature selector
    pub fn new(config: FeatureSelectionConfig) -> Self {
        Self { config }
    }

    /// Create a selector with the given method and a default configuration otherwise
    pub fn with_method(method: FeatureSelectionMethod) -> Self {
        Self {
            config: FeatureSelectionConfig {
                method,
                ..Default::default()
            },
        }
    }

    /// Select features from data
    pub fn select_features(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        match self.config.method {
            FeatureSelectionMethod::Variance => self.select_by_variance(data),
            FeatureSelectionMethod::LaplacianScore => self.select_by_laplacian_score(data),
            FeatureSelectionMethod::Spectral => self.select_by_spectral(data),
            FeatureSelectionMethod::Correlation => self.select_by_correlation(data),
            FeatureSelectionMethod::Combined => self.select_by_combined(data),
        }
    }

    /// Variance-based feature selection
    fn select_by_variance(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        let n_features = data.ncols();
        let mut feature_scores = Vec::with_capacity(n_features);

        // Calculate variance for each feature
        for j in 0..n_features {
            let column = data.column(j);
            let mean = column.mean().unwrap_or(0.0);
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / column.len() as f64;
            feature_scores.push(variance);
        }

        // Select features above threshold
        let mut selected_features: Vec<usize> = feature_scores
            .iter()
            .enumerate()
            .filter(|(_, &score)| score >= self.config.variance_threshold)
            .map(|(idx, _)| idx)
            .collect();

        // If n_features is specified, keep only the top n by variance
        if let Some(n) = self.config.n_features {
            if n < selected_features.len() {
                let mut indexed_scores: Vec<(usize, f64)> = selected_features
                    .iter()
                    .map(|&idx| (idx, feature_scores[idx]))
                    .collect();
                indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));
                selected_features = indexed_scores.iter().take(n).map(|(idx, _)| *idx).collect();
            }
        }

        let n_selected = selected_features.len();

        Ok(FeatureSelectionResult {
            selected_features,
            feature_scores,
            n_selected,
            method: FeatureSelectionMethod::Variance,
        })
    }

    /// Laplacian score feature selection
    fn select_by_laplacian_score(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        let n_samples = data.nrows();
        let n_features = data.ncols();

        // Build the k-nearest-neighbor graph with weight matrix W
        let weights = self.build_knn_graph(data)?;

        // Compute the degree matrix D (diagonal of row sums of W)
        let mut degree = Array2::zeros((n_samples, n_samples));
        for i in 0..n_samples {
            degree[[i, i]] = weights.row(i).sum();
        }

        // Compute the graph Laplacian L = D - W
        let laplacian = &degree - &weights;

        // Calculate Laplacian score for each feature
        let mut feature_scores = Vec::with_capacity(n_features);

        for j in 0..n_features {
            let f = data.column(j);

            // Center the feature
            let mean = f.mean().unwrap_or(0.0);
            let f_centered: Vec<f64> = f.iter().map(|&x| x - mean).collect();
            let f_array = Array1::from_vec(f_centered);

            // Numerator: f^T L f
            let lf = laplacian.dot(&f_array);
            let numerator = f_array.dot(&lf);

            // Denominator: f^T D f
            let df = degree.dot(&f_array);
            let denominator = f_array.dot(&df);

            // Laplacian score (lower is better), negated so that higher is better
            let score = if denominator.abs() > 1e-10 {
                -(numerator / denominator)
            } else {
                0.0
            };

            feature_scores.push(score);
        }

        // Select the top features (default: keep half of them)
        let n_select = self.config.n_features.unwrap_or(n_features / 2);
        let mut indexed_scores: Vec<(usize, f64)> = feature_scores
            .iter()
            .enumerate()
            .map(|(idx, &score)| (idx, score))
            .collect();

        indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

        let selected_features: Vec<usize> = indexed_scores
            .iter()
            .take(n_select)
            .map(|(idx, _)| *idx)
            .collect();

        Ok(FeatureSelectionResult {
            selected_features: selected_features.clone(),
            feature_scores,
            n_selected: selected_features.len(),
            method: FeatureSelectionMethod::LaplacianScore,
        })
    }

    /// Build k-nearest neighbor graph
    fn build_knn_graph(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
        let n_samples = data.nrows();
        let mut weights = Array2::zeros((n_samples, n_samples));

        // For each point, find the k nearest neighbors
        for i in 0..n_samples {
            let point_i = data.row(i);
            let mut distances: Vec<(usize, f64)> = Vec::with_capacity(n_samples);

            for j in 0..n_samples {
                if i != j {
                    let point_j = data.row(j);
                    let dist = self.euclidean_distance(&point_i, &point_j);
                    distances.push((j, dist));
                }
            }

            // Sort by distance and take the k nearest
            distances.sort_by(|a, b| a.1.total_cmp(&b.1));

            for (j, dist) in distances.iter().take(self.config.n_neighbors) {
                // Gaussian kernel weight with a fixed unit bandwidth (sigma = 1)
                let weight = (-dist * dist / 2.0).exp();
                weights[[i, *j]] = weight;
                weights[[*j, i]] = weight; // Symmetrize the graph
            }
        }

        Ok(weights)
    }

    /// Euclidean distance between two points
    fn euclidean_distance(
        &self,
        a: &scirs2_core::ndarray::ArrayView1<f64>,
        b: &scirs2_core::ndarray::ArrayView1<f64>,
    ) -> f64 {
        a.iter()
            .zip(b.iter())
            .map(|(x, y)| (x - y).powi(2))
            .sum::<f64>()
            .sqrt()
    }

    /// Spectral feature selection
    fn select_by_spectral(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        // Simplified spectral method: variance is used as a proxy for spectral relevance.
        // A full SPEC implementation would require an eigendecomposition of the graph Laplacian.
        let mut result = self.select_by_variance(data)?;
        result.method = FeatureSelectionMethod::Spectral;
        Ok(result)
    }

    /// Correlation-based feature selection
    fn select_by_correlation(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        let n_features = data.ncols();
        let mut feature_scores = vec![1.0; n_features];
        let mut selected_features: HashSet<usize> = (0..n_features).collect();

        // Calculate the (symmetric) correlation matrix
        let mut correlations = Array2::zeros((n_features, n_features));
        for i in 0..n_features {
            for j in i..n_features {
                let corr = self.calculate_correlation(&data.column(i), &data.column(j));
                correlations[[i, j]] = corr;
                correlations[[j, i]] = corr;
            }
        }

        // Remove highly correlated features, keeping the one with higher variance
        let correlation_threshold = 0.95;
        for i in 0..n_features {
            for j in (i + 1)..n_features {
                if correlations[[i, j]].abs() > correlation_threshold {
                    let var_i = self.calculate_variance(&data.column(i));
                    let var_j = self.calculate_variance(&data.column(j));

                    if var_i < var_j {
                        selected_features.remove(&i);
                        feature_scores[i] = 0.0;
                    } else {
                        selected_features.remove(&j);
                        feature_scores[j] = 0.0;
                    }
                }
            }
        }

        let mut selected_features: Vec<usize> = selected_features.into_iter().collect();
        selected_features.sort_unstable();

        Ok(FeatureSelectionResult {
            selected_features: selected_features.clone(),
            feature_scores,
            n_selected: selected_features.len(),
            method: FeatureSelectionMethod::Correlation,
        })
    }

    /// Calculate the Pearson correlation coefficient between two features
    fn calculate_correlation(
        &self,
        a: &scirs2_core::ndarray::ArrayView1<f64>,
        b: &scirs2_core::ndarray::ArrayView1<f64>,
    ) -> f64 {
        let n = a.len();
        let mean_a = a.mean().unwrap_or(0.0);
        let mean_b = b.mean().unwrap_or(0.0);

        let mut numerator = 0.0;
        let mut sum_sq_a = 0.0;
        let mut sum_sq_b = 0.0;

        for i in 0..n {
            let diff_a = a[i] - mean_a;
            let diff_b = b[i] - mean_b;
            numerator += diff_a * diff_b;
            sum_sq_a += diff_a * diff_a;
            sum_sq_b += diff_b * diff_b;
        }

        // Guard against zero-variance (constant) features: report zero correlation
        let denominator = (sum_sq_a * sum_sq_b).sqrt();
        if denominator.abs() > 1e-10 {
            numerator / denominator
        } else {
            0.0
        }
    }

    /// Calculate the variance of a feature
    fn calculate_variance(&self, feature: &scirs2_core::ndarray::ArrayView1<f64>) -> f64 {
        let mean = feature.mean().unwrap_or(0.0);
        feature.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / feature.len() as f64
    }

    /// Combined feature selection using multiple methods
    fn select_by_combined(&self, data: &Array2<f64>) -> Result<FeatureSelectionResult> {
        // Combine the variance and correlation methods
        let variance_result = self.select_by_variance(data)?;
        let correlation_result = self.select_by_correlation(data)?;

        // Keep only the features selected by both methods
        let variance_set: HashSet<usize> = variance_result.selected_features.into_iter().collect();
        let correlation_set: HashSet<usize> =
            correlation_result.selected_features.into_iter().collect();

        let mut selected_features: Vec<usize> = variance_set
            .intersection(&correlation_set)
            .copied()
            .collect();
        selected_features.sort_unstable();

        let n_features = data.ncols();
        let feature_scores = (0..n_features)
            .map(|i| {
                if selected_features.contains(&i) {
                    1.0
                } else {
                    0.0
                }
            })
            .collect();

        Ok(FeatureSelectionResult {
            selected_features: selected_features.clone(),
            feature_scores,
            n_selected: selected_features.len(),
            method: FeatureSelectionMethod::Combined,
        })
    }

    /// Transform data by keeping only the selected features
    pub fn transform(
        &self,
        data: &Array2<f64>,
        result: &FeatureSelectionResult,
    ) -> Result<Array2<f64>> {
        let n_samples = data.nrows();
        let n_selected = result.n_selected;

        if n_selected == 0 {
            return Err(SklearsError::InvalidInput(
                "No features selected".to_string(),
            ));
        }

        let mut transformed = Array2::zeros((n_samples, n_selected));

        // Copy the selected columns into the new array, preserving their order
        for (new_idx, &old_idx) in result.selected_features.iter().enumerate() {
            for i in 0..n_samples {
                transformed[[i, new_idx]] = data[[i, old_idx]];
            }
        }

        Ok(transformed)
    }

    /// Fit and transform in one step
    pub fn fit_transform(
        &self,
        data: &Array2<f64>,
    ) -> Result<(Array2<f64>, FeatureSelectionResult)> {
        let result = self.select_features(data)?;
        let transformed = self.transform(data, &result)?;
        Ok((transformed, result))
    }
}

/// Builder for feature selection configuration
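///
/// Illustrative sketch of the fluent API (the parameter values below are examples,
/// not recommended defaults):
///
/// ```ignore
/// let config = FeatureSelectionConfigBuilder::new()
///     .method(FeatureSelectionMethod::Variance)
///     .variance_threshold(0.05)
///     .n_features(10)
///     .build();
/// ```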
pub struct FeatureSelectionConfigBuilder {
    config: FeatureSelectionConfig,
}

impl FeatureSelectionConfigBuilder {
    /// Create new builder
    pub fn new() -> Self {
        Self {
            config: FeatureSelectionConfig::default(),
        }
    }

    /// Set selection method
    pub fn method(mut self, method: FeatureSelectionMethod) -> Self {
        self.config.method = method;
        self
    }

    /// Set number of features to select
    pub fn n_features(mut self, n: usize) -> Self {
        self.config.n_features = Some(n);
        self
    }

    /// Set variance threshold
    pub fn variance_threshold(mut self, threshold: f64) -> Self {
        self.config.variance_threshold = threshold;
        self
    }

    /// Set number of neighbors for graph-based methods
    pub fn n_neighbors(mut self, n: usize) -> Self {
        self.config.n_neighbors = n;
        self
    }

    /// Build configuration
    pub fn build(self) -> FeatureSelectionConfig {
        self.config
    }
}

impl Default for FeatureSelectionConfigBuilder {
    fn default() -> Self {
        Self::new()
    }
}

impl FeatureSelectionResult {
    /// Get feature importance ranking (highest score first)
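    ///
    /// A small illustrative sketch (assumes `result` is a `FeatureSelectionResult`
    /// produced by [`FeatureSelector::select_features`]):
    ///
    /// ```ignore
    /// let ranking = result.feature_ranking();
    /// // The first entry is the (feature index, score) pair with the highest score.
    /// assert!(ranking[0].1 >= ranking[ranking.len() - 1].1);
    /// ```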
    pub fn feature_ranking(&self) -> Vec<(usize, f64)> {
        let mut ranking: Vec<(usize, f64)> = self
            .feature_scores
            .iter()
            .enumerate()
            .map(|(idx, &score)| (idx, score))
            .collect();

        ranking.sort_by(|a, b| b.1.total_cmp(&a.1));
        ranking
    }

    /// Check if feature is selected
    pub fn is_selected(&self, feature_idx: usize) -> bool {
        self.selected_features.contains(&feature_idx)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    fn generate_test_data() -> Array2<f64> {
        // 4 features: 2 informative, 1 low variance, 1 redundant
        Array2::from_shape_vec(
            (100, 4),
            (0..400)
                .map(|i| {
                    let row = i / 4;
                    let col = i % 4;
                    match col {
                        0 => (row % 2) as f64 * 5.0,       // Informative
                        1 => (row / 10) as f64,            // Informative
                        2 => 1.0,                          // Low variance
                        3 => (row % 2) as f64 * 5.0 + 0.1, // Redundant with col 0
                        _ => 0.0,
                    }
                })
                .collect(),
        )
        .unwrap()
    }

    #[test]
    fn test_variance_selection() {
        let data = generate_test_data();
        let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);

        let result = selector.select_features(&data).unwrap();

        assert!(result.n_selected > 0);
        assert!(result.n_selected <= 4);
        // Low variance feature should not be selected
        assert!(!result.selected_features.contains(&2));
    }

    #[test]
    fn test_correlation_selection() {
        let data = generate_test_data();
        let config = FeatureSelectionConfigBuilder::new()
            .method(FeatureSelectionMethod::Correlation)
            .build();

        let selector = FeatureSelector::new(config);
        let result = selector.select_features(&data).unwrap();

        assert!(result.n_selected > 0);
        // Should remove one of the correlated features (0 or 3)
        let has_both =
            result.selected_features.contains(&0) && result.selected_features.contains(&3);
        assert!(
            !has_both,
            "Should not select both highly correlated features"
        );
    }

    #[test]
    fn test_transform() {
        let data = generate_test_data();
        let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);

        let result = selector.select_features(&data).unwrap();
        let transformed = selector.transform(&data, &result).unwrap();

        assert_eq!(transformed.nrows(), data.nrows());
        assert_eq!(transformed.ncols(), result.n_selected);
    }

    #[test]
    fn test_fit_transform() {
        let data = generate_test_data();
        let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);

        let (transformed, result) = selector.fit_transform(&data).unwrap();

        assert_eq!(transformed.nrows(), data.nrows());
        assert_eq!(transformed.ncols(), result.n_selected);
        assert!(result.n_selected > 0);
    }

    #[test]
    fn test_laplacian_score() {
        let data = generate_test_data();
        let config = FeatureSelectionConfigBuilder::new()
            .method(FeatureSelectionMethod::LaplacianScore)
            .n_features(2)
            .n_neighbors(5)
            .build();

        let selector = FeatureSelector::new(config);
        let result = selector.select_features(&data).unwrap();

        assert_eq!(result.n_selected, 2);
        assert_eq!(result.selected_features.len(), 2);
    }

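    // Additional illustrative test (not part of the original suite): the Combined
    // method should keep only features that pass both the variance and the
    // correlation criteria, so the constant feature 2 and the redundant feature 3
    // are expected to be dropped for this synthetic data set.
    #[test]
    fn test_combined_selection() {
        let data = generate_test_data();
        let selector = FeatureSelector::with_method(FeatureSelectionMethod::Combined);

        let result = selector.select_features(&data).unwrap();

        assert_eq!(result.method, FeatureSelectionMethod::Combined);
        assert!(result.is_selected(0));
        assert!(result.is_selected(1));
        assert!(!result.is_selected(2));
        assert!(!result.is_selected(3));
    }
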
    #[test]
    fn test_feature_ranking() {
        let data = generate_test_data();
        let selector = FeatureSelector::with_method(FeatureSelectionMethod::Variance);

        let result = selector.select_features(&data).unwrap();
        let ranking = result.feature_ranking();

        assert_eq!(ranking.len(), 4);
        // Verify sorted in descending order
        for i in 0..(ranking.len() - 1) {
            assert!(ranking[i].1 >= ranking[i + 1].1);
        }
    }
}