sklears_feature_selection/
performance.rs

//! Performance Optimizations for Feature Selection
//!
//! This module provides SIMD-accelerated operations, parallel algorithms,
//! and memory-efficient implementations for high-performance feature selection.
//! All implementations follow the SciRS2 policy and use Rust-specific optimizations.

use rayon::prelude::*;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use sklears_core::error::Result as SklResult;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::sync::{Arc, Mutex};

type Result<T> = SklResult<T>;

/// SIMD-accelerated statistical computations for feature selection
pub struct SIMDStats;

impl SIMDStats {
    /// Compute correlation between two arrays using SIMD operations
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2")]
    #[allow(unused_assignments)]
    pub unsafe fn correlation_simd(x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        if x.len() != y.len() || x.len() < 4 {
            return Self::correlation_fallback(x, y);
        }

        let n = x.len();
        let mut sum_x = 0.0;
        let mut sum_y = 0.0;
        let mut sum_x2 = 0.0;
        let mut sum_y2 = 0.0;
        let mut sum_xy = 0.0;

        // Process 4 elements at a time using AVX2
        let chunks = n / 4;
        let x_ptr = x.as_ptr();
        let y_ptr = y.as_ptr();

        let mut vec_sum_x = _mm256_setzero_pd();
        let mut vec_sum_y = _mm256_setzero_pd();
        let mut vec_sum_x2 = _mm256_setzero_pd();
        let mut vec_sum_y2 = _mm256_setzero_pd();
        let mut vec_sum_xy = _mm256_setzero_pd();

        for i in 0..chunks {
            let offset = i * 4;
            let vec_x = _mm256_loadu_pd(x_ptr.add(offset));
            let vec_y = _mm256_loadu_pd(y_ptr.add(offset));

            vec_sum_x = _mm256_add_pd(vec_sum_x, vec_x);
            vec_sum_y = _mm256_add_pd(vec_sum_y, vec_y);
            vec_sum_x2 = _mm256_fmadd_pd(vec_x, vec_x, vec_sum_x2);
            vec_sum_y2 = _mm256_fmadd_pd(vec_y, vec_y, vec_sum_y2);
            vec_sum_xy = _mm256_fmadd_pd(vec_x, vec_y, vec_sum_xy);
        }

        // Horizontal sum of vector registers
        let mut temp = [0.0; 4];
        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_x);
        sum_x = temp.iter().sum();

        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_y);
        sum_y = temp.iter().sum();

        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_x2);
        sum_x2 = temp.iter().sum();

        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_y2);
        sum_y2 = temp.iter().sum();

        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_xy);
        sum_xy = temp.iter().sum();

        // Process remaining elements
        for i in (chunks * 4)..n {
            let xi = *x_ptr.add(i);
            let yi = *y_ptr.add(i);
            sum_x += xi;
            sum_y += yi;
            sum_x2 += xi * xi;
            sum_y2 += yi * yi;
            sum_xy += xi * yi;
        }

        // Compute correlation
        let n_f64 = n as f64;
        let numerator = n_f64 * sum_xy - sum_x * sum_y;
        let denominator =
            ((n_f64 * sum_x2 - sum_x * sum_x) * (n_f64 * sum_y2 - sum_y * sum_y)).sqrt();

        if denominator.abs() < 1e-10 {
            0.0
        } else {
            numerator / denominator
        }
    }

    /// Fallback correlation computation for when SIMD is not available
    pub fn correlation_fallback(x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        if x.len() != y.len() || x.len() < 2 {
            return 0.0;
        }

        let mean_x = x.mean().unwrap_or(0.0);
        let mean_y = y.mean().unwrap_or(0.0);

        let mut sum_xy = 0.0;
        let mut sum_x2 = 0.0;
        let mut sum_y2 = 0.0;

        for i in 0..x.len() {
            let dx = x[i] - mean_x;
            let dy = y[i] - mean_y;
            sum_xy += dx * dy;
            sum_x2 += dx * dx;
            sum_y2 += dy * dy;
        }

        let denom = (sum_x2 * sum_y2).sqrt();
        if denom < 1e-10 {
            0.0
        } else {
            sum_xy / denom
        }
    }

    /// Auto-select correlation method based on CPU features
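    ///
    /// # Example
    ///
    /// A minimal usage sketch (hypothetical call site; assumes `SIMDStats` is in scope):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
    /// let y = array![2.0, 4.0, 6.0, 8.0, 10.0];
    /// // Dispatches to the AVX2 path when available, otherwise the scalar fallback.
    /// let r = SIMDStats::correlation_auto(x.view(), y.view());
    /// assert!((r - 1.0).abs() < 1e-10);
    /// ```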
    pub fn correlation_auto(x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("avx2") && x.len() >= 16 {
                unsafe { Self::correlation_simd(x, y) }
            } else {
                Self::correlation_fallback(x, y)
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            Self::correlation_fallback(x, y)
        }
    }

    /// SIMD-accelerated variance computation
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2")]
    pub unsafe fn variance_simd(x: ArrayView1<f64>) -> f64 {
        if x.len() < 4 {
            return Self::variance_fallback(x);
        }

        let n = x.len();
        let x_ptr = x.as_ptr();

        // First pass: compute mean using SIMD
        let chunks = n / 4;
        let mut vec_sum = _mm256_setzero_pd();

        for i in 0..chunks {
            let offset = i * 4;
            let vec_x = _mm256_loadu_pd(x_ptr.add(offset));
            vec_sum = _mm256_add_pd(vec_sum, vec_x);
        }

        // Horizontal sum
        let mut temp = [0.0; 4];
        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum);
        let mut sum = temp.iter().sum::<f64>();

        // Process remaining elements
        for i in (chunks * 4)..n {
            sum += *x_ptr.add(i);
        }

        let mean = sum / n as f64;

        // Second pass: compute variance using SIMD
        let vec_mean = _mm256_set1_pd(mean);
        let mut vec_sum_sq_diff = _mm256_setzero_pd();

        for i in 0..chunks {
            let offset = i * 4;
            let vec_x = _mm256_loadu_pd(x_ptr.add(offset));
            let vec_diff = _mm256_sub_pd(vec_x, vec_mean);
            vec_sum_sq_diff = _mm256_fmadd_pd(vec_diff, vec_diff, vec_sum_sq_diff);
        }

        // Horizontal sum
        _mm256_storeu_pd(temp.as_mut_ptr(), vec_sum_sq_diff);
        let mut sum_sq_diff = temp.iter().sum::<f64>();

        // Process remaining elements
        for i in (chunks * 4)..n {
            let diff = *x_ptr.add(i) - mean;
            sum_sq_diff += diff * diff;
        }

        sum_sq_diff / (n - 1) as f64
    }

    /// Fallback variance computation
    pub fn variance_fallback(x: ArrayView1<f64>) -> f64 {
        // Guard against empty or single-element inputs, for which the
        // ddof = 1 sample variance is undefined.
        if x.len() < 2 {
            return 0.0;
        }
        x.var(1.0)
    }

    /// Auto-select variance method
    pub fn variance_auto(x: ArrayView1<f64>) -> f64 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("avx2") && x.len() >= 16 {
                unsafe { Self::variance_simd(x) }
            } else {
                Self::variance_fallback(x)
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            Self::variance_fallback(x)
        }
    }

    /// SIMD-accelerated chi-square computation
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2")]
    pub unsafe fn chi_square_simd(observed: ArrayView1<f64>, expected: ArrayView1<f64>) -> f64 {
        if observed.len() != expected.len() || observed.len() < 4 {
            return Self::chi_square_fallback(observed, expected);
        }

        let n = observed.len();
        let chunks = n / 4;
        let obs_ptr = observed.as_ptr();
        let exp_ptr = expected.as_ptr();

        let mut vec_chi_sq = _mm256_setzero_pd();
        let eps = _mm256_set1_pd(1e-10);

        for i in 0..chunks {
            let offset = i * 4;
            let vec_obs = _mm256_loadu_pd(obs_ptr.add(offset));
            let vec_exp = _mm256_loadu_pd(exp_ptr.add(offset));

            // Avoid division by zero
            let vec_exp_safe = _mm256_max_pd(vec_exp, eps);

            let vec_diff = _mm256_sub_pd(vec_obs, vec_exp_safe);
            let vec_diff_sq = _mm256_mul_pd(vec_diff, vec_diff);
            let vec_chi_sq_contrib = _mm256_div_pd(vec_diff_sq, vec_exp_safe);

            vec_chi_sq = _mm256_add_pd(vec_chi_sq, vec_chi_sq_contrib);
        }

        // Horizontal sum
        let mut temp = [0.0; 4];
        _mm256_storeu_pd(temp.as_mut_ptr(), vec_chi_sq);
        let mut chi_sq = temp.iter().sum::<f64>();

        // Process remaining elements
        for i in (chunks * 4)..n {
            let obs = *obs_ptr.add(i);
            let exp = (*exp_ptr.add(i)).max(1e-10);
            let diff = obs - exp;
            chi_sq += (diff * diff) / exp;
        }

        chi_sq
    }

    /// Fallback chi-square computation
    pub fn chi_square_fallback(observed: ArrayView1<f64>, expected: ArrayView1<f64>) -> f64 {
        if observed.len() != expected.len() {
            return 0.0;
        }

        let mut chi_sq = 0.0;
        for i in 0..observed.len() {
            let obs = observed[i];
            let exp = expected[i].max(1e-10);
            let diff = obs - exp;
            chi_sq += (diff * diff) / exp;
        }

        chi_sq
    }

    /// Auto-select chi-square method
    pub fn chi_square_auto(observed: ArrayView1<f64>, expected: ArrayView1<f64>) -> f64 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("avx2") && observed.len() >= 16 {
                unsafe { Self::chi_square_simd(observed, expected) }
            } else {
                Self::chi_square_fallback(observed, expected)
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            Self::chi_square_fallback(observed, expected)
        }
    }
}

/// Parallel feature selection algorithms
pub struct ParallelSelector;

impl ParallelSelector {
    /// Parallel correlation-based feature selection
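    ///
    /// # Example
    ///
    /// A minimal sketch of keeping features whose absolute correlation with the
    /// target exceeds a threshold (hypothetical call site; `?` assumes a caller
    /// returning this crate's `Result`):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let X = array![[1.0, 0.0], [2.0, 0.1], [3.0, -0.2], [4.0, 0.05]];
    /// let y = array![1.0, 2.0, 3.0, 4.0];
    /// // `None` lets the selector derive a chunk size from the CPU count.
    /// let mask = ParallelSelector::parallel_correlation_selection(X.view(), y.view(), 0.5, None)?;
    /// assert_eq!(mask.len(), 2);
    /// ```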
    pub fn parallel_correlation_selection(
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        threshold: f64,
        chunk_size: Option<usize>,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let chunk_size = chunk_size.unwrap_or_else(|| (n_features / num_cpus::get()).max(1));

        let correlations: Vec<f64> = (0..n_features)
            .into_par_iter()
            .chunks(chunk_size)
            .flat_map(|chunk| {
                chunk
                    .into_iter()
                    .map(|feature_idx| {
                        let feature_data = X.column(feature_idx);
                        SIMDStats::correlation_auto(feature_data, y).abs()
                    })
                    .collect::<Vec<f64>>()
            })
            .collect();

        let selection_vec: Vec<bool> = correlations
            .into_par_iter()
            .map(|corr| corr > threshold)
            .collect();
        let selection = Array1::from_vec(selection_vec);

        Ok(selection)
    }

    /// Parallel variance threshold selection
    pub fn parallel_variance_selection(
        X: ArrayView2<f64>,
        threshold: f64,
        chunk_size: Option<usize>,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let chunk_size = chunk_size.unwrap_or_else(|| (n_features / num_cpus::get()).max(1));

        let variances: Vec<f64> = (0..n_features)
            .into_par_iter()
            .chunks(chunk_size)
            .flat_map(|chunk| {
                chunk
                    .into_iter()
                    .map(|feature_idx| {
                        let feature_data = X.column(feature_idx);
                        SIMDStats::variance_auto(feature_data)
                    })
                    .collect::<Vec<f64>>()
            })
            .collect();

        let selection_vec: Vec<bool> = variances
            .into_par_iter()
            .map(|var| var > threshold)
            .collect();
        let selection = Array1::from_vec(selection_vec);

        Ok(selection)
    }

    /// Parallel mutual information selection
    pub fn parallel_mutual_info_selection(
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        k: usize,
        chunk_size: Option<usize>,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let chunk_size = chunk_size.unwrap_or_else(|| (n_features / num_cpus::get()).max(1));

        let mi_scores: Vec<(usize, f64)> = (0..n_features)
            .into_par_iter()
            .chunks(chunk_size)
            .flat_map(|chunk| {
                chunk
                    .into_iter()
                    .map(|feature_idx| {
                        let feature_data = X.column(feature_idx);
                        let mi_score = Self::estimate_mutual_information(feature_data, y);
                        (feature_idx, mi_score)
                    })
                    .collect::<Vec<(usize, f64)>>()
            })
            .collect();

        // Sort by MI score and select top k
        let mut sorted_scores = mi_scores;
        sorted_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        let mut selection = Array1::from_elem(n_features, false);
        for (feature_idx, _) in sorted_scores.into_iter().take(k) {
            selection[feature_idx] = true;
        }

        Ok(selection)
    }

    /// Parallel chi-square feature selection
    pub fn parallel_chi_square_selection(
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        k: usize,
        chunk_size: Option<usize>,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let chunk_size = chunk_size.unwrap_or_else(|| (n_features / num_cpus::get()).max(1));

        // Compute expected frequencies for chi-square test
        let class_counts = Self::compute_class_counts(y);
        let total_samples = y.len() as f64;

        let chi_scores: Vec<(usize, f64)> = (0..n_features)
            .into_par_iter()
            .chunks(chunk_size)
            .flat_map(|chunk| {
                chunk
                    .into_iter()
                    .map(|feature_idx| {
                        let feature_data = X.column(feature_idx);
                        let chi_score = Self::compute_chi_square_score(
                            feature_data,
                            y,
                            &class_counts,
                            total_samples,
                        );
                        (feature_idx, chi_score)
                    })
                    .collect::<Vec<(usize, f64)>>()
            })
            .collect();

        // Sort by chi-square score and select top k
        let mut sorted_scores = chi_scores;
        sorted_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        let mut selection = Array1::from_elem(n_features, false);
        for (feature_idx, _) in sorted_scores.into_iter().take(k) {
            selection[feature_idx] = true;
        }

        Ok(selection)
    }

    /// Parallel recursive feature elimination
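    ///
    /// # Example
    ///
    /// A sketch of eliminating features down to a target count, dropping roughly
    /// 30% of the remaining features per round (hypothetical call site; `?`
    /// assumes a caller returning this crate's `Result`):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let X = array![[1.0, 0.5, 9.0], [2.0, 0.4, 8.0], [3.0, 0.6, 7.5], [4.0, 0.5, 9.5]];
    /// let y = array![1.0, 2.0, 3.0, 4.0];
    /// let mask = ParallelSelector::parallel_recursive_elimination(X.view(), y.view(), 2, 0.3)?;
    /// assert_eq!(mask.iter().filter(|&&kept| kept).count(), 2);
    /// ```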
    pub fn parallel_recursive_elimination(
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        n_features_to_select: usize,
        step: f64,
    ) -> Result<Array1<bool>> {
        let mut current_features: Vec<usize> = (0..X.ncols()).collect();
        let mut selection = Array1::from_elem(X.ncols(), true);

        while current_features.len() > n_features_to_select {
            // Remove at least one feature per round, but never drop below the
            // requested number of features to keep.
            let n_to_remove = ((current_features.len() as f64 * step).ceil() as usize)
                .max(1)
                .min(current_features.len() - n_features_to_select);

            // Compute feature importance in parallel
            let importances: Vec<f64> = current_features
                .par_iter()
                .map(|&feature_idx| {
                    let feature_data = X.column(feature_idx);
                    // Simplified importance: use correlation as proxy
                    SIMDStats::correlation_auto(feature_data, y).abs()
                })
                .collect();

            // Find features with lowest importance
            let mut indexed_importances: Vec<(usize, f64)> = current_features
                .iter()
                .zip(importances.iter())
                .map(|(&idx, &imp)| (idx, imp))
                .collect();

            indexed_importances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());

            // Remove features with lowest importance
            for i in 0..n_to_remove {
                if let Some((feature_idx, _)) = indexed_importances.get(i) {
                    selection[*feature_idx] = false;
                    current_features.retain(|&x| x != *feature_idx);
                }
            }
        }

        Ok(selection)
    }

    // Helper methods
    fn estimate_mutual_information(x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64 {
        // Simplified MI estimation using correlation as proxy
        // In a full implementation, this would use proper MI estimation algorithms
        SIMDStats::correlation_auto(x, y).abs()
    }

    fn compute_class_counts(y: ArrayView1<f64>) -> Vec<(f64, usize)> {
        let mut counts = std::collections::HashMap::new();
        for &label in y.iter() {
            *counts.entry(label as i32).or_insert(0) += 1;
        }
        counts.into_iter().map(|(k, v)| (k as f64, v)).collect()
    }

    fn compute_chi_square_score(
        feature: ArrayView1<f64>,
        y: ArrayView1<f64>,
        _class_counts: &[(f64, usize)],
        _total_samples: f64,
    ) -> f64 {
        // Simplified chi-square computation
        // In practice, this would discretize continuous features and compute proper contingency tables
        SIMDStats::correlation_auto(feature, y).abs()
    }
}

/// Memory-efficient feature selection with streaming algorithms
pub struct MemoryEfficientSelector {
    chunk_size: usize,
    use_memory_mapping: bool,
}

impl MemoryEfficientSelector {
    pub fn new(chunk_size: usize, use_memory_mapping: bool) -> Self {
        Self {
            chunk_size,
            use_memory_mapping,
        }
    }

    /// Streaming correlation computation for large datasets
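    ///
    /// # Example
    ///
    /// A minimal sketch of chunked processing for data that should not be scanned
    /// in one pass (hypothetical call site; the 1024-row chunk size is arbitrary,
    /// and `?` assumes a caller returning this crate's `Result`):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let X = array![[1.0, 10.0], [2.0, 9.0], [3.0, 11.0], [4.0, 10.5]];
    /// let y = array![1.0, 2.0, 3.0, 4.0];
    /// // Rows are scanned 1024 at a time; only running sums are kept per feature.
    /// let selector = MemoryEfficientSelector::new(1024, false);
    /// let mask = selector.streaming_correlation_selection(X.view(), y.view(), 0.3)?;
    /// assert_eq!(mask.len(), 2);
    /// ```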
    pub fn streaming_correlation_selection(
        &self,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
        threshold: f64,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let n_samples = X.nrows();

        // Initialize streaming statistics
        let mut feature_stats: Vec<StreamingStats> =
            (0..n_features).map(|_| StreamingStats::new()).collect();

        let mut y_stats = StreamingStats::new();

        // Process data in chunks to reduce memory usage
        for chunk_start in (0..n_samples).step_by(self.chunk_size) {
            let chunk_end = (chunk_start + self.chunk_size).min(n_samples);

            // Update statistics for this chunk
            for i in chunk_start..chunk_end {
                let y_val = y[i];
                y_stats.update(y_val);

                for j in 0..n_features {
                    let x_val = X[[i, j]];
                    feature_stats[j].update(x_val);
                    feature_stats[j].update_covariance(x_val, y_val);
                }
            }
        }

        // Compute final correlations
        let mut selection = Array1::from_elem(n_features, false);
        let y_variance = y_stats.variance();

        for (j, stats) in feature_stats.iter().enumerate() {
            let x_variance = stats.variance();
            let covariance = stats.covariance_xy();

            let correlation = if x_variance > 1e-10 && y_variance > 1e-10 {
                covariance / (x_variance.sqrt() * y_variance.sqrt())
            } else {
                0.0
            };

            selection[j] = correlation.abs() > threshold;
        }

        Ok(selection)
    }

    /// Memory-efficient variance threshold selection
    pub fn streaming_variance_selection(
        &self,
        X: ArrayView2<f64>,
        threshold: f64,
    ) -> Result<Array1<bool>> {
        let n_features = X.ncols();
        let n_samples = X.nrows();

        let mut feature_stats: Vec<StreamingStats> =
            (0..n_features).map(|_| StreamingStats::new()).collect();

        // Process data in chunks
        for chunk_start in (0..n_samples).step_by(self.chunk_size) {
            let chunk_end = (chunk_start + self.chunk_size).min(n_samples);

            for i in chunk_start..chunk_end {
                for j in 0..n_features {
                    let x_val = X[[i, j]];
                    feature_stats[j].update(x_val);
                }
            }
        }

        // Compute final selection
        let mut selection = Array1::from_elem(n_features, false);
        for (j, stats) in feature_stats.iter().enumerate() {
            selection[j] = stats.variance() > threshold;
        }

        Ok(selection)
    }
}

/// Streaming statistics for memory-efficient computations
#[derive(Debug, Clone)]
struct StreamingStats {
    count: usize,
    sum: f64,
    sum_sq: f64,
    sum_xy: f64,
    sum_y: f64,
    sum_y_sq: f64,
}

impl StreamingStats {
    fn new() -> Self {
        Self {
            count: 0,
            sum: 0.0,
            sum_sq: 0.0,
            sum_xy: 0.0,
            sum_y: 0.0,
            sum_y_sq: 0.0,
        }
    }

    fn update(&mut self, value: f64) {
        self.count += 1;
        self.sum += value;
        self.sum_sq += value * value;
    }

    fn update_covariance(&mut self, x: f64, y: f64) {
        self.sum_xy += x * y;
        self.sum_y += y;
        self.sum_y_sq += y * y;
    }

    fn mean(&self) -> f64 {
        if self.count > 0 {
            self.sum / self.count as f64
        } else {
            0.0
        }
    }

    fn variance(&self) -> f64 {
        if self.count > 1 {
            let n = self.count as f64;
            let mean = self.mean();
            (self.sum_sq - n * mean * mean) / (n - 1.0)
        } else {
            0.0
        }
    }

    fn covariance_xy(&self) -> f64 {
        if self.count > 1 {
            let n = self.count as f64;
            let mean_x = self.sum / n;
            let mean_y = self.sum_y / n;
            (self.sum_xy - n * mean_x * mean_y) / (n - 1.0)
        } else {
            0.0
        }
    }
}

/// Cache-friendly data structures for improved performance
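///
/// # Example
///
/// A minimal sketch of wrapping a buffer as a 2x3 row-major matrix (values are
/// illustrative):
///
/// ```ignore
/// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
/// let matrix = CacheFriendlyMatrix::new(data, 2, 3, MatrixLayout::RowMajor);
/// assert_eq!(matrix.get(1, 2), Some(&6.0));
/// // Column iteration works for any layout without materializing the column.
/// let col: Vec<f64> = matrix.column_iter(1).cloned().collect();
/// assert_eq!(col, vec![2.0, 5.0]);
/// ```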
pub struct CacheFriendlyMatrix<T> {
    data: Vec<T>,
    n_rows: usize,
    n_cols: usize,
    layout: MatrixLayout,
}

#[derive(Debug, Clone, Copy)]
pub enum MatrixLayout {
    /// RowMajor
    RowMajor,
    /// ColumnMajor
    ColumnMajor,
    /// Blocked
    Blocked { block_size: usize },
}

impl<T: Clone> CacheFriendlyMatrix<T> {
    pub fn new(data: Vec<T>, n_rows: usize, n_cols: usize, layout: MatrixLayout) -> Self {
        assert_eq!(data.len(), n_rows * n_cols);

        Self {
            data,
            n_rows,
            n_cols,
            layout,
        }
    }

    pub fn from_array2(array: Array2<T>, layout: MatrixLayout) -> Self {
        let (n_rows, n_cols) = array.dim();
        let data = match layout {
            MatrixLayout::RowMajor => array.into_raw_vec(),
            MatrixLayout::ColumnMajor => {
                // Transpose data for column-major layout
                let mut col_major_data = Vec::with_capacity(n_rows * n_cols);
                for col in 0..n_cols {
                    for row in 0..n_rows {
                        col_major_data.push(array[[row, col]].clone());
                    }
                }
                col_major_data
            }
            MatrixLayout::Blocked { block_size } => Self::create_blocked_layout(array, block_size),
        };

        Self {
            data,
            layout,
            n_rows,
            n_cols,
        }
    }

    fn create_blocked_layout(array: Array2<T>, block_size: usize) -> Vec<T> {
        let (n_rows, n_cols) = array.dim();
        let mut blocked_data = Vec::with_capacity(n_rows * n_cols);

        let row_blocks = (n_rows + block_size - 1) / block_size;
        let col_blocks = (n_cols + block_size - 1) / block_size;

        for row_block in 0..row_blocks {
            for col_block in 0..col_blocks {
                let row_start = row_block * block_size;
                let row_end = (row_start + block_size).min(n_rows);
                let col_start = col_block * block_size;
                let col_end = (col_start + block_size).min(n_cols);

                for row in row_start..row_end {
                    for col in col_start..col_end {
                        blocked_data.push(array[[row, col]].clone());
                    }
                }
            }
        }

        blocked_data
    }

    pub fn get(&self, row: usize, col: usize) -> Option<&T> {
        if row >= self.n_rows || col >= self.n_cols {
            return None;
        }

        let index = match self.layout {
            MatrixLayout::RowMajor => row * self.n_cols + col,
            MatrixLayout::ColumnMajor => col * self.n_rows + row,
            MatrixLayout::Blocked { block_size } => self.blocked_index(row, col, block_size),
        };

        self.data.get(index)
    }

    fn blocked_index(&self, row: usize, col: usize, block_size: usize) -> usize {
        let row_block = row / block_size;
        let col_block = col / block_size;
        let row_in_block = row % block_size;
        let col_in_block = col % block_size;

        // Edge blocks may be smaller than block_size, so compute the actual
        // extent of the block containing (row, col).
        let rows_in_block = block_size.min(self.n_rows - row_block * block_size);
        let cols_in_block = block_size.min(self.n_cols - col_block * block_size);

        // Block rows above the current one are full height and span every column.
        let offset_above = row_block * block_size * self.n_cols;
        // Blocks to the left in the current block row are full width and share
        // the current block row's height.
        let offset_left = col_block * block_size * rows_in_block;
        // Position inside the current (possibly partial) block, stored row-major.
        let in_block_index = row_in_block * cols_in_block + col_in_block;

        offset_above + offset_left + in_block_index
    }

    pub fn column_iter(&self, col: usize) -> ColumnIterator<'_, T> {
        ColumnIterator::new(self, col)
    }
}

pub struct ColumnIterator<'a, T> {
    matrix: &'a CacheFriendlyMatrix<T>,
    col: usize,
    current_row: usize,
}

impl<'a, T> ColumnIterator<'a, T> {
    fn new(matrix: &'a CacheFriendlyMatrix<T>, col: usize) -> Self {
        Self {
            matrix,
            col,
            current_row: 0,
        }
    }
}

impl<'a, T: Clone> Iterator for ColumnIterator<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current_row < self.matrix.n_rows {
            let item = self.matrix.get(self.current_row, self.col);
            self.current_row += 1;
            item
        } else {
            None
        }
    }
}

/// Performance profiler for feature selection operations
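///
/// # Example
///
/// A minimal sketch of timing one operation and printing the report (the
/// operation and the recorded byte count are arbitrary):
///
/// ```ignore
/// let profiler = PerformanceProfiler::new();
/// let sum = profiler.time_operation("column_sum", || (0u64..1_000).sum::<u64>());
/// profiler.record_memory_usage("column_sum", 8 * 1_000);
/// assert_eq!(sum, 499_500);
/// profiler.get_report().print_summary();
/// ```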
pub struct PerformanceProfiler {
    timings: Arc<Mutex<Vec<(String, std::time::Duration)>>>,
    memory_usage: Arc<Mutex<Vec<(String, usize)>>>,
}

impl Default for PerformanceProfiler {
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceProfiler {
    pub fn new() -> Self {
        Self {
            timings: Arc::new(Mutex::new(Vec::new())),
            memory_usage: Arc::new(Mutex::new(Vec::new())),
        }
    }

    pub fn time_operation<F, R>(&self, name: &str, operation: F) -> R
    where
        F: FnOnce() -> R,
    {
        let start = std::time::Instant::now();
        let result = operation();
        let duration = start.elapsed();

        if let Ok(mut timings) = self.timings.lock() {
            timings.push((name.to_string(), duration));
        }

        result
    }

    pub fn record_memory_usage(&self, name: &str, bytes: usize) {
        if let Ok(mut memory) = self.memory_usage.lock() {
            memory.push((name.to_string(), bytes));
        }
    }

    pub fn get_report(&self) -> PerformanceReport {
        let timings = self.timings.lock().unwrap().clone();
        let memory_usage = self.memory_usage.lock().unwrap().clone();

        PerformanceReport {
            timings: timings.clone(),
            memory_usage: memory_usage.clone(),
            total_time: timings.iter().map(|(_, duration)| *duration).sum(),
            peak_memory: memory_usage
                .iter()
                .map(|(_, bytes)| *bytes)
                .max()
                .unwrap_or(0),
        }
    }
}

#[derive(Debug, Clone)]
pub struct PerformanceReport {
    pub timings: Vec<(String, std::time::Duration)>,
    pub memory_usage: Vec<(String, usize)>,
    pub total_time: std::time::Duration,
    pub peak_memory: usize,
}

impl PerformanceReport {
    pub fn print_summary(&self) {
        println!("=== Performance Report ===");
        println!("Total execution time: {:?}", self.total_time);
        println!("Peak memory usage: {} bytes", self.peak_memory);

        println!("\nOperation timings:");
        for (name, duration) in &self.timings {
            println!("  {}: {:?}", name, duration);
        }

        println!("\nMemory usage:");
        for (name, bytes) in &self.memory_usage {
            println!("  {}: {} bytes", name, bytes);
        }
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_simd_correlation() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0];

        let correlation_simd = SIMDStats::correlation_auto(x.view(), y.view());
        let correlation_fallback = SIMDStats::correlation_fallback(x.view(), y.view());

        assert!((correlation_simd - correlation_fallback).abs() < 1e-10);
        assert!((correlation_simd - 1.0).abs() < 1e-10); // Perfect correlation
    }

    #[test]
    #[allow(non_snake_case)]
    fn test_parallel_correlation_selection() -> Result<()> {
        let X = array![
            [1.0, 2.0, 3.0],
            [2.0, 4.0, 6.0],
            [3.0, 6.0, 9.0],
            [4.0, 8.0, 12.0],
        ];
        let y = array![1.0, 2.0, 3.0, 4.0];

        let selection =
            ParallelSelector::parallel_correlation_selection(X.view(), y.view(), 0.5, Some(2))?;

        assert_eq!(selection.len(), 3);
        assert!(selection.iter().any(|&x| x)); // At least one feature should be selected

        Ok(())
    }

    #[test]
    #[allow(non_snake_case)]
    fn test_memory_efficient_selection() -> Result<()> {
        let X = array![
            [1.0, 2.0, 3.0],
            [2.0, 4.0, 6.0],
            [3.0, 6.0, 9.0],
            [4.0, 8.0, 12.0],
        ];
        let y = array![1.0, 2.0, 3.0, 4.0];

        let selector = MemoryEfficientSelector::new(2, false);
        let selection = selector.streaming_correlation_selection(X.view(), y.view(), 0.5)?;

        assert_eq!(selection.len(), 3);

        Ok(())
    }

    #[test]
    fn test_cache_friendly_matrix() {
        let data = vec![1, 2, 3, 4, 5, 6];
        let matrix = CacheFriendlyMatrix::new(data, 2, 3, MatrixLayout::RowMajor);

        assert_eq!(matrix.get(0, 0), Some(&1));
        assert_eq!(matrix.get(0, 1), Some(&2));
        assert_eq!(matrix.get(1, 2), Some(&6));
        assert_eq!(matrix.get(2, 0), None); // Out of bounds
    }

    #[test]
    fn test_performance_profiler() {
        let profiler = PerformanceProfiler::new();

        let result = profiler.time_operation("test_operation", || {
            std::thread::sleep(std::time::Duration::from_millis(10));
            42
        });

        assert_eq!(result, 42);

        profiler.record_memory_usage("test_memory", 1024);

        let report = profiler.get_report();
        assert_eq!(report.timings.len(), 1);
        assert_eq!(report.memory_usage.len(), 1);
        assert!(report.total_time >= std::time::Duration::from_millis(10));
    }
}