sklears_feature_selection/
domain_benchmark.rs

1//! Benchmarking framework for domain-specific feature selection methods
2
3use scirs2_core::ndarray::{Array1, Array2};
4use scirs2_core::random::{thread_rng, Distribution, StandardNormal};
5use sklears_core::error::SklearsError;
6use sklears_core::traits::{Fit, Transform};
7use std::time::{Duration, Instant};
8
9use crate::domain_specific::advanced_nlp::NLPStrategy;
10use crate::domain_specific::bioinformatics::BioinformaticsStrategy;
11use crate::domain_specific::finance::FinanceStrategy;
12use crate::domain_specific::*;
13
/// Result of a single benchmark run
///
/// One record describes one (dataset size, `k`, strategy) combination. When
/// produced by `DomainBenchmarkFramework`, the timing and quality fields are
/// averages over the repetitions that completed successfully.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Name of the benchmarked selector type (e.g. `"BioinformaticsFeatureSelector"`).
    pub method_name: String,
    /// Domain label the selector belongs to (e.g. `"bioinformatics"`, `"finance"`, `"nlp"`).
    pub domain: String,
    /// Strategy that was benchmarked; the framework stores its `Debug` rendering.
    pub strategy: String,
    /// Shape of the synthetic dataset the selector was run on.
    pub dataset_size: (usize, usize), // (n_samples, n_features)
    /// Number of features the selector was asked to keep.
    pub k_features: usize,
    /// Time spent in `fit`.
    pub fit_time: Duration,
    /// Time spent in `transform`.
    pub transform_time: Duration,
    /// Sum of `fit_time` and `transform_time`.
    pub total_time: Duration,
    /// Memory footprint in MiB; the framework fills this with an estimate, not a measurement.
    pub memory_usage_mb: f64,
    /// Number of features recorded as selected for this configuration.
    pub selected_features_count: usize,
    /// Quality score of the selected features; the framework uses the mean
    /// absolute Pearson correlation between each selected column and the target.
    pub feature_quality_score: f64,
}
29
/// Collection of benchmark results
///
/// Bundles the individual records with the summary derived from them.
#[derive(Debug, Clone)]
pub struct BenchmarkSuite {
    /// One record per benchmarked (dataset size, `k`, strategy) combination.
    pub results: Vec<BenchmarkResult>,
    /// Aggregate view over `results`.
    pub summary: BenchmarkSummary,
}
36
/// Summary statistics for benchmark results
///
/// The four "best of" fields are human-readable labels. The framework writes
/// them as `"<method_name> (<strategy>)"`, or `"None"` when there are no results.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    /// Number of benchmark records the summary was computed from.
    pub total_methods_tested: usize,
    /// Label of the record with the smallest `total_time`.
    pub fastest_method: String,
    /// Label of the record with the largest `total_time`.
    pub slowest_method: String,
    /// Label of the record with the smallest `memory_usage_mb`.
    pub most_memory_efficient: String,
    /// Label of the record with the largest `feature_quality_score`.
    pub highest_quality_score: String,
    /// Mean of `fit_time` over all records.
    pub average_fit_time: Duration,
    /// Mean of `transform_time` over all records.
    pub average_transform_time: Duration,
}
48
/// Configuration for benchmarking
///
/// Every enabled domain is benchmarked over the cross product of
/// `dataset_sizes` and `k_values`.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Synthetic dataset shapes to benchmark, as `(n_samples, n_features)` pairs.
    pub dataset_sizes: Vec<(usize, usize)>,
    /// Numbers of features to select; a value is skipped for datasets where
    /// it is not strictly smaller than `n_features`.
    pub k_values: Vec<usize>,
    /// How many times each combination is run; results are averaged over the
    /// repetitions that succeed.
    pub repetitions: usize,
    /// Whether to benchmark the bioinformatics selector.
    pub include_bioinformatics: bool,
    /// Whether to benchmark the finance selector.
    pub include_finance: bool,
    /// Whether to benchmark the NLP selector.
    pub include_nlp: bool,
    /// Intended to toggle memory measurement.
    /// NOTE(review): not read by any code visible in this module — confirm
    /// whether it is consumed elsewhere.
    pub measure_memory: bool,
}
60
61impl Default for BenchmarkConfig {
62    fn default() -> Self {
63        Self {
64            dataset_sizes: vec![(100, 50), (200, 100), (500, 200)],
65            k_values: vec![10, 20, 50],
66            repetitions: 3,
67            include_bioinformatics: true,
68            include_finance: true,
69            include_nlp: true,
70            measure_memory: false, // Simplified for demonstration
71        }
72    }
73}
74
/// Domain-specific benchmarking framework
///
/// Runs the bioinformatics, finance and NLP feature selectors on synthetic
/// data and collects timing, estimated memory and quality figures for them.
pub struct DomainBenchmarkFramework {
    /// Settings controlling which domains, dataset shapes and `k` values are run.
    config: BenchmarkConfig,
}
79
80impl DomainBenchmarkFramework {
81    /// Create a new benchmarking framework
82    pub fn new(config: BenchmarkConfig) -> Self {
83        Self { config }
84    }
85
86    /// Run comprehensive benchmarks across all domain-specific methods
87    pub fn run_comprehensive_benchmark(&self) -> Result<BenchmarkSuite, SklearsError> {
88        let mut all_results = Vec::new();
89
90        // Benchmark bioinformatics methods
91        if self.config.include_bioinformatics {
92            let bio_results = self.benchmark_bioinformatics_methods()?;
93            all_results.extend(bio_results);
94        }
95
96        // Benchmark finance methods
97        if self.config.include_finance {
98            let finance_results = self.benchmark_finance_methods()?;
99            all_results.extend(finance_results);
100        }
101
102        // Benchmark NLP methods
103        if self.config.include_nlp {
104            let nlp_results = self.benchmark_nlp_methods()?;
105            all_results.extend(nlp_results);
106        }
107
108        let summary = self.generate_summary(&all_results);
109
110        Ok(BenchmarkSuite {
111            results: all_results,
112            summary,
113        })
114    }
115
116    /// Benchmark bioinformatics feature selection methods
117    fn benchmark_bioinformatics_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
118        let mut results = Vec::new();
119
120        let strategies = vec![
121            BioinformaticsStrategy::DifferentialExpression,
122            BioinformaticsStrategy::FunctionalAnnotation,
123            BioinformaticsStrategy::PathwayEnrichment,
124            BioinformaticsStrategy::CoExpressionAnalysis,
125        ];
126
127        for &(n_samples, n_features) in &self.config.dataset_sizes {
128            for &k in &self.config.k_values {
129                if k >= n_features {
130                    continue;
131                }
132
133                for strategy in &strategies {
134                    let mut avg_fit_time = Duration::new(0, 0);
135                    let mut avg_transform_time = Duration::new(0, 0);
136                    let mut avg_quality = 0.0;
137                    let mut successful_runs = 0;
138
139                    for _ in 0..self.config.repetitions {
140                        match self.benchmark_single_bioinformatics_run(
141                            n_samples,
142                            n_features,
143                            k,
144                            strategy.clone(),
145                        ) {
146                            Ok((fit_time, transform_time, quality, _selected_count)) => {
147                                avg_fit_time += fit_time;
148                                avg_transform_time += transform_time;
149                                avg_quality += quality;
150                                successful_runs += 1;
151                            }
152                            Err(_) => continue, // Skip failed runs
153                        }
154                    }
155
156                    if successful_runs > 0 {
157                        avg_fit_time /= successful_runs as u32;
158                        avg_transform_time /= successful_runs as u32;
159                        avg_quality /= successful_runs as f64;
160
161                        results.push(BenchmarkResult {
162                            method_name: "BioinformaticsFeatureSelector".to_string(),
163                            domain: "bioinformatics".to_string(),
164                            strategy: format!("{:?}", strategy),
165                            dataset_size: (n_samples, n_features),
166                            k_features: k,
167                            fit_time: avg_fit_time,
168                            transform_time: avg_transform_time,
169                            total_time: avg_fit_time + avg_transform_time,
170                            memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
171                            selected_features_count: k,
172                            feature_quality_score: avg_quality,
173                        });
174                    }
175                }
176            }
177        }
178
179        Ok(results)
180    }
181
182    /// Benchmark finance feature selection methods
183    fn benchmark_finance_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
184        let mut results = Vec::new();
185
186        let strategies = vec![
187            FinanceStrategy::Momentum,
188            FinanceStrategy::TechnicalIndicators,
189            FinanceStrategy::RiskAdjusted,
190            FinanceStrategy::Volatility,
191        ];
192
193        for &(n_samples, n_features) in &self.config.dataset_sizes {
194            for &k in &self.config.k_values {
195                if k >= n_features {
196                    continue;
197                }
198
199                for strategy in &strategies {
200                    let mut avg_fit_time = Duration::new(0, 0);
201                    let mut avg_transform_time = Duration::new(0, 0);
202                    let mut avg_quality = 0.0;
203                    let mut successful_runs = 0;
204
205                    for _ in 0..self.config.repetitions {
206                        match self.benchmark_single_finance_run(
207                            n_samples,
208                            n_features,
209                            k,
210                            strategy.clone(),
211                        ) {
212                            Ok((fit_time, transform_time, quality, _selected_count)) => {
213                                avg_fit_time += fit_time;
214                                avg_transform_time += transform_time;
215                                avg_quality += quality;
216                                successful_runs += 1;
217                            }
218                            Err(_) => continue,
219                        }
220                    }
221
222                    if successful_runs > 0 {
223                        avg_fit_time /= successful_runs as u32;
224                        avg_transform_time /= successful_runs as u32;
225                        avg_quality /= successful_runs as f64;
226
227                        results.push(BenchmarkResult {
228                            method_name: "FinanceFeatureSelector".to_string(),
229                            domain: "finance".to_string(),
230                            strategy: format!("{:?}", strategy),
231                            dataset_size: (n_samples, n_features),
232                            k_features: k,
233                            fit_time: avg_fit_time,
234                            transform_time: avg_transform_time,
235                            total_time: avg_fit_time + avg_transform_time,
236                            memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
237                            selected_features_count: k,
238                            feature_quality_score: avg_quality,
239                        });
240                    }
241                }
242            }
243        }
244
245        Ok(results)
246    }
247
248    /// Benchmark NLP feature selection methods
249    fn benchmark_nlp_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
250        let mut results = Vec::new();
251
252        let strategies = vec![
253            NLPStrategy::InformationTheoretic,
254            NLPStrategy::SyntacticAnalysis,
255            NLPStrategy::SemanticAnalysis,
256            NLPStrategy::TransformerBased,
257        ];
258
259        for &(n_samples, n_features) in &self.config.dataset_sizes {
260            for &k in &self.config.k_values {
261                if k >= n_features {
262                    continue;
263                }
264
265                for strategy in &strategies {
266                    let mut avg_fit_time = Duration::new(0, 0);
267                    let mut avg_transform_time = Duration::new(0, 0);
268                    let mut avg_quality = 0.0;
269                    let mut successful_runs = 0;
270
271                    for _ in 0..self.config.repetitions {
272                        match self.benchmark_single_nlp_run(
273                            n_samples,
274                            n_features,
275                            k,
276                            strategy.clone(),
277                        ) {
278                            Ok((fit_time, transform_time, quality, _selected_count)) => {
279                                avg_fit_time += fit_time;
280                                avg_transform_time += transform_time;
281                                avg_quality += quality;
282                                successful_runs += 1;
283                            }
284                            Err(_) => continue,
285                        }
286                    }
287
288                    if successful_runs > 0 {
289                        avg_fit_time /= successful_runs as u32;
290                        avg_transform_time /= successful_runs as u32;
291                        avg_quality /= successful_runs as f64;
292
293                        results.push(BenchmarkResult {
294                            method_name: "AdvancedNLPFeatureSelector".to_string(),
295                            domain: "nlp".to_string(),
296                            strategy: format!("{:?}", strategy),
297                            dataset_size: (n_samples, n_features),
298                            k_features: k,
299                            fit_time: avg_fit_time,
300                            transform_time: avg_transform_time,
301                            total_time: avg_fit_time + avg_transform_time,
302                            memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
303                            selected_features_count: k,
304                            feature_quality_score: avg_quality,
305                        });
306                    }
307                }
308            }
309        }
310
311        Ok(results)
312    }
313
314    /// Benchmark a single bioinformatics run
315    #[allow(non_snake_case)]
316    fn benchmark_single_bioinformatics_run(
317        &self,
318        n_samples: usize,
319        n_features: usize,
320        k: usize,
321        strategy: BioinformaticsStrategy,
322    ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
323        // Generate synthetic genomic data
324        let X = Array2::from_shape_fn((n_samples, n_features), |_| {
325            let mut rng = thread_rng();
326            StandardNormal.sample(&mut rng)
327        });
328        let y = Array1::from_shape_fn(n_samples, |_| {
329            let mut rng = thread_rng();
330            StandardNormal.sample(&mut rng)
331        });
332
333        let selector = BioinformaticsFeatureSelector::new().k(k).strategy(strategy);
334
335        // Measure fit time
336        let fit_start = Instant::now();
337        let trained_selector = selector
338            .fit(&X, &y)
339            .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
340        let fit_time = fit_start.elapsed();
341
342        // Measure transform time
343        let transform_start = Instant::now();
344        let transformed = trained_selector
345            .transform(&X)
346            .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
347        let transform_time = transform_start.elapsed();
348
349        let selected_count = transformed.ncols();
350        let quality_score = self.calculate_feature_quality(&transformed, &y);
351
352        Ok((fit_time, transform_time, quality_score, selected_count))
353    }
354
355    /// Benchmark a single finance run
356    #[allow(non_snake_case)]
357    fn benchmark_single_finance_run(
358        &self,
359        n_samples: usize,
360        n_features: usize,
361        k: usize,
362        strategy: FinanceStrategy,
363    ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
364        // Generate synthetic financial time series data
365        let X = Array2::from_shape_fn((n_samples, n_features), |_| {
366            let mut rng = thread_rng();
367            StandardNormal.sample(&mut rng)
368        });
369        let y = Array1::from_shape_fn(n_samples, |_| {
370            let mut rng = thread_rng();
371            StandardNormal.sample(&mut rng)
372        });
373
374        let selector = FinanceFeatureSelector::new().k(k).strategy(strategy);
375
376        let fit_start = Instant::now();
377        let trained_selector = selector
378            .fit(&X, &y)
379            .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
380        let fit_time = fit_start.elapsed();
381
382        let transform_start = Instant::now();
383        let transformed = trained_selector
384            .transform(&X)
385            .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
386        let transform_time = transform_start.elapsed();
387
388        let selected_count = transformed.ncols();
389        let quality_score = self.calculate_feature_quality(&transformed, &y);
390
391        Ok((fit_time, transform_time, quality_score, selected_count))
392    }
393
394    /// Benchmark a single NLP run
395    #[allow(non_snake_case)]
396    fn benchmark_single_nlp_run(
397        &self,
398        n_samples: usize,
399        n_features: usize,
400        k: usize,
401        strategy: NLPStrategy,
402    ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
403        // Generate synthetic text feature data
404        let X = Array2::from_shape_fn((n_samples, n_features), |_| {
405            let mut rng = thread_rng();
406            StandardNormal.sample(&mut rng)
407        });
408        let y = Array1::from_shape_fn(n_samples, |_| {
409            let mut rng = thread_rng();
410            StandardNormal.sample(&mut rng)
411        });
412
413        let selector = AdvancedNLPFeatureSelector::new().k(k).strategy(strategy);
414
415        let fit_start = Instant::now();
416        let trained_selector = selector
417            .fit(&X, &y)
418            .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
419        let fit_time = fit_start.elapsed();
420
421        let transform_start = Instant::now();
422        let transformed = trained_selector
423            .transform(&X)
424            .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
425        let transform_time = transform_start.elapsed();
426
427        let selected_count = transformed.ncols();
428        let quality_score = self.calculate_feature_quality(&transformed, &y);
429
430        Ok((fit_time, transform_time, quality_score, selected_count))
431    }
432
433    /// Calculate feature quality score (simplified correlation-based metric)
434    fn calculate_feature_quality(&self, X: &Array2<f64>, y: &Array1<f64>) -> f64 {
435        let mut total_correlation = 0.0;
436        let n_features = X.ncols();
437
438        for i in 0..n_features {
439            let feature_values = X.column(i);
440            let correlation = self.pearson_correlation(&feature_values.to_owned(), y);
441            total_correlation += correlation.abs();
442        }
443
444        total_correlation / n_features as f64
445    }
446
447    /// Calculate Pearson correlation
448    fn pearson_correlation(&self, x: &Array1<f64>, y: &Array1<f64>) -> f64 {
449        let x_mean = x.mean().unwrap_or(0.0);
450        let y_mean = y.mean().unwrap_or(0.0);
451
452        let numerator: f64 = x
453            .iter()
454            .zip(y.iter())
455            .map(|(&xi, &yi)| (xi - x_mean) * (yi - y_mean))
456            .sum();
457
458        let x_sq_sum: f64 = x.iter().map(|&xi| (xi - x_mean).powi(2)).sum();
459        let y_sq_sum: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
460
461        let denominator = (x_sq_sum * y_sq_sum).sqrt();
462
463        if denominator != 0.0 {
464            numerator / denominator
465        } else {
466            0.0
467        }
468    }
469
470    /// Estimate memory usage (simplified calculation)
471    fn estimate_memory_usage(&self, n_samples: usize, n_features: usize, k: usize) -> f64 {
472        // Rough estimate: input data + selected features + metadata
473        let input_size = n_samples * n_features * 8; // 8 bytes per f64
474        let selected_size = n_samples * k * 8;
475        let metadata_size = 1024; // Approximate metadata overhead
476
477        (input_size + selected_size + metadata_size) as f64 / (1024.0 * 1024.0) // Convert to MB
478    }
479
480    /// Generate benchmark summary
481    fn generate_summary(&self, results: &[BenchmarkResult]) -> BenchmarkSummary {
482        if results.is_empty() {
483            return BenchmarkSummary {
484                total_methods_tested: 0,
485                fastest_method: "None".to_string(),
486                slowest_method: "None".to_string(),
487                most_memory_efficient: "None".to_string(),
488                highest_quality_score: "None".to_string(),
489                average_fit_time: Duration::new(0, 0),
490                average_transform_time: Duration::new(0, 0),
491            };
492        }
493
494        let fastest = results.iter().min_by_key(|r| r.total_time).unwrap();
495
496        let slowest = results.iter().max_by_key(|r| r.total_time).unwrap();
497
498        let most_memory_efficient = results
499            .iter()
500            .min_by(|a, b| a.memory_usage_mb.partial_cmp(&b.memory_usage_mb).unwrap())
501            .unwrap();
502
503        let highest_quality = results
504            .iter()
505            .max_by(|a, b| {
506                a.feature_quality_score
507                    .partial_cmp(&b.feature_quality_score)
508                    .unwrap()
509            })
510            .unwrap();
511
512        let total_fit_time: Duration = results.iter().map(|r| r.fit_time).sum();
513        let total_transform_time: Duration = results.iter().map(|r| r.transform_time).sum();
514        let n_results = results.len() as u32;
515
516        BenchmarkSummary {
517            total_methods_tested: results.len(),
518            fastest_method: format!("{} ({})", fastest.method_name, fastest.strategy),
519            slowest_method: format!("{} ({})", slowest.method_name, slowest.strategy),
520            most_memory_efficient: format!(
521                "{} ({})",
522                most_memory_efficient.method_name, most_memory_efficient.strategy
523            ),
524            highest_quality_score: format!(
525                "{} ({})",
526                highest_quality.method_name, highest_quality.strategy
527            ),
528            average_fit_time: total_fit_time / n_results,
529            average_transform_time: total_transform_time / n_results,
530        }
531    }
532
533    /// Export benchmark results to CSV format
534    pub fn export_to_csv(&self, results: &BenchmarkSuite) -> String {
535        let mut csv = String::new();
536        csv.push_str("method_name,domain,strategy,n_samples,n_features,k_features,fit_time_ms,transform_time_ms,total_time_ms,memory_mb,selected_count,quality_score\n");
537
538        for result in &results.results {
539            csv.push_str(&format!(
540                "{},{},{},{},{},{},{},{},{},{:.2},{},{:.4}\n",
541                result.method_name,
542                result.domain,
543                result.strategy,
544                result.dataset_size.0,
545                result.dataset_size.1,
546                result.k_features,
547                result.fit_time.as_millis(),
548                result.transform_time.as_millis(),
549                result.total_time.as_millis(),
550                result.memory_usage_mb,
551                result.selected_features_count,
552                result.feature_quality_score
553            ));
554        }
555
556        csv
557    }
558}
559
560/// Convenience function to run a quick benchmark with default settings
561pub fn run_quick_benchmark() -> Result<BenchmarkSuite, SklearsError> {
562    let config = BenchmarkConfig {
563        dataset_sizes: vec![(50, 30), (100, 50)],
564        k_values: vec![10, 20],
565        repetitions: 2,
566        ..Default::default()
567    };
568
569    let framework = DomainBenchmarkFramework::new(config);
570    framework.run_comprehensive_benchmark()
571}
572
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;

    /// Framework built from the default configuration.
    fn default_framework() -> DomainBenchmarkFramework {
        DomainBenchmarkFramework::new(BenchmarkConfig::default())
    }

    /// Draws one standard-normal sample.
    fn standard_normal() -> f64 {
        let mut rng = thread_rng();
        StandardNormal.sample(&mut rng)
    }

    /// The framework keeps the configuration it was constructed with.
    #[test]
    fn test_benchmark_framework_creation() {
        let framework = default_framework();
        let config = &framework.config;

        assert_eq!(config.repetitions, 3);
        assert!(config.include_bioinformatics);
        assert!(config.include_finance);
        assert!(config.include_nlp);
    }

    /// The memory estimate is strictly positive for a non-trivial shape.
    #[test]
    fn test_memory_estimation() {
        let memory_mb = default_framework().estimate_memory_usage(100, 50, 10);
        assert!(memory_mb > 0.0);
    }

    /// The quality score is a mean of absolute correlations, so it lies in [0, 1].
    #[test]
    fn test_feature_quality_calculation() {
        let framework = default_framework();

        let X = Array2::from_shape_fn((20, 5), |_| standard_normal());
        let y = Array1::from_shape_fn(20, |_| standard_normal());

        let quality = framework.calculate_feature_quality(&X, &y);
        assert!((0.0..=1.0).contains(&quality));
    }

    /// The CSV output carries both the header and the exported record.
    #[test]
    fn test_csv_export() {
        let framework = default_framework();

        let suite = BenchmarkSuite {
            results: vec![BenchmarkResult {
                method_name: "TestMethod".to_string(),
                domain: "test".to_string(),
                strategy: "TestStrategy".to_string(),
                dataset_size: (100, 50),
                k_features: 10,
                fit_time: Duration::from_millis(100),
                transform_time: Duration::from_millis(50),
                total_time: Duration::from_millis(150),
                memory_usage_mb: 5.0,
                selected_features_count: 10,
                feature_quality_score: 0.75,
            }],
            summary: BenchmarkSummary {
                total_methods_tested: 1,
                fastest_method: "TestMethod".to_string(),
                slowest_method: "TestMethod".to_string(),
                most_memory_efficient: "TestMethod".to_string(),
                highest_quality_score: "TestMethod".to_string(),
                average_fit_time: Duration::from_millis(100),
                average_transform_time: Duration::from_millis(50),
            },
        };

        let csv = framework.export_to_csv(&suite);
        assert!(csv.contains("method_name"));
        assert!(csv.contains("TestMethod"));
    }
}