1use scirs2_core::ndarray::{Array1, Array2};
4use scirs2_core::random::{thread_rng, Distribution, StandardNormal};
5use sklears_core::error::SklearsError;
6use sklears_core::traits::{Fit, Transform};
7use std::time::{Duration, Instant};
8
9use crate::domain_specific::advanced_nlp::NLPStrategy;
10use crate::domain_specific::bioinformatics::BioinformaticsStrategy;
11use crate::domain_specific::finance::FinanceStrategy;
12use crate::domain_specific::*;
13
/// Measurements recorded for one benchmarked selector configuration.
///
/// One value describes a single (method, strategy, dataset shape, `k`)
/// combination.
#[derive(Clone, Debug)]
pub struct BenchmarkResult {
    /// Name of the selector type that was benchmarked.
    pub method_name: String,
    /// Label of the domain the selector belongs to.
    pub domain: String,
    /// Textual name of the strategy that was benchmarked.
    pub strategy: String,
    /// Dataset shape as `(n_samples, n_features)`.
    pub dataset_size: (usize, usize),
    /// Number of features requested from the selector.
    pub k_features: usize,
    /// Time attributed to fitting the selector.
    pub fit_time: Duration,
    /// Time attributed to transforming the data with the fitted selector.
    pub transform_time: Duration,
    /// Combined fit and transform time.
    pub total_time: Duration,
    /// Memory footprint in MiB (the framework in this file stores an estimate here).
    pub memory_usage_mb: f64,
    /// Number of features reported as selected.
    pub selected_features_count: usize,
    /// Quality score of the selected features; higher is treated as better.
    pub feature_quality_score: f64,
}
29
30#[derive(Debug, Clone)]
32pub struct BenchmarkSuite {
33 pub results: Vec<BenchmarkResult>,
34 pub summary: BenchmarkSummary,
35}
36
/// Aggregate statistics over a collection of [`BenchmarkResult`] values.
///
/// The `String` fields hold a human-readable label of the winning entry;
/// the framework in this file uses `"None"` when there were no results.
#[derive(Clone, Debug)]
pub struct BenchmarkSummary {
    /// Number of results the summary was built from.
    pub total_methods_tested: usize,
    /// Label of the entry with the smallest total time.
    pub fastest_method: String,
    /// Label of the entry with the largest total time.
    pub slowest_method: String,
    /// Label of the entry with the smallest memory figure.
    pub most_memory_efficient: String,
    /// Label of the entry with the largest quality score.
    pub highest_quality_score: String,
    /// Mean fit time across all results.
    pub average_fit_time: Duration,
    /// Mean transform time across all results.
    pub average_transform_time: Duration,
}
48
/// Settings that control which benchmarks run and over which parameter grid.
#[derive(Clone, Debug)]
pub struct BenchmarkConfig {
    /// Dataset shapes to benchmark, each as `(n_samples, n_features)`.
    pub dataset_sizes: Vec<(usize, usize)>,
    /// Values of `k` (number of features to select) to benchmark.
    pub k_values: Vec<usize>,
    /// How many times each configuration is repeated before averaging.
    pub repetitions: usize,
    /// Whether the bioinformatics selectors are benchmarked.
    pub include_bioinformatics: bool,
    /// Whether the finance selectors are benchmarked.
    pub include_finance: bool,
    /// Whether the NLP selectors are benchmarked.
    pub include_nlp: bool,
    /// Memory-measurement flag.
    // NOTE(review): no code visible in this file reads this flag — confirm whether it is used elsewhere.
    pub measure_memory: bool,
}
60
61impl Default for BenchmarkConfig {
62 fn default() -> Self {
63 Self {
64 dataset_sizes: vec![(100, 50), (200, 100), (500, 200)],
65 k_values: vec![10, 20, 50],
66 repetitions: 3,
67 include_bioinformatics: true,
68 include_finance: true,
69 include_nlp: true,
70 measure_memory: false, }
72 }
73}
74
/// Runs feature-selection benchmarks for the domain-specific selectors
/// according to a [`BenchmarkConfig`].
pub struct DomainBenchmarkFramework {
    /// Settings that decide which domains run and over which parameter grid.
    config: BenchmarkConfig,
}
79
80impl DomainBenchmarkFramework {
81 pub fn new(config: BenchmarkConfig) -> Self {
83 Self { config }
84 }
85
86 pub fn run_comprehensive_benchmark(&self) -> Result<BenchmarkSuite, SklearsError> {
88 let mut all_results = Vec::new();
89
90 if self.config.include_bioinformatics {
92 let bio_results = self.benchmark_bioinformatics_methods()?;
93 all_results.extend(bio_results);
94 }
95
96 if self.config.include_finance {
98 let finance_results = self.benchmark_finance_methods()?;
99 all_results.extend(finance_results);
100 }
101
102 if self.config.include_nlp {
104 let nlp_results = self.benchmark_nlp_methods()?;
105 all_results.extend(nlp_results);
106 }
107
108 let summary = self.generate_summary(&all_results);
109
110 Ok(BenchmarkSuite {
111 results: all_results,
112 summary,
113 })
114 }
115
116 fn benchmark_bioinformatics_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
118 let mut results = Vec::new();
119
120 let strategies = vec![
121 BioinformaticsStrategy::DifferentialExpression,
122 BioinformaticsStrategy::FunctionalAnnotation,
123 BioinformaticsStrategy::PathwayEnrichment,
124 BioinformaticsStrategy::CoExpressionAnalysis,
125 ];
126
127 for &(n_samples, n_features) in &self.config.dataset_sizes {
128 for &k in &self.config.k_values {
129 if k >= n_features {
130 continue;
131 }
132
133 for strategy in &strategies {
134 let mut avg_fit_time = Duration::new(0, 0);
135 let mut avg_transform_time = Duration::new(0, 0);
136 let mut avg_quality = 0.0;
137 let mut successful_runs = 0;
138
139 for _ in 0..self.config.repetitions {
140 match self.benchmark_single_bioinformatics_run(
141 n_samples,
142 n_features,
143 k,
144 strategy.clone(),
145 ) {
146 Ok((fit_time, transform_time, quality, _selected_count)) => {
147 avg_fit_time += fit_time;
148 avg_transform_time += transform_time;
149 avg_quality += quality;
150 successful_runs += 1;
151 }
152 Err(_) => continue, }
154 }
155
156 if successful_runs > 0 {
157 avg_fit_time /= successful_runs as u32;
158 avg_transform_time /= successful_runs as u32;
159 avg_quality /= successful_runs as f64;
160
161 results.push(BenchmarkResult {
162 method_name: "BioinformaticsFeatureSelector".to_string(),
163 domain: "bioinformatics".to_string(),
164 strategy: format!("{:?}", strategy),
165 dataset_size: (n_samples, n_features),
166 k_features: k,
167 fit_time: avg_fit_time,
168 transform_time: avg_transform_time,
169 total_time: avg_fit_time + avg_transform_time,
170 memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
171 selected_features_count: k,
172 feature_quality_score: avg_quality,
173 });
174 }
175 }
176 }
177 }
178
179 Ok(results)
180 }
181
182 fn benchmark_finance_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
184 let mut results = Vec::new();
185
186 let strategies = vec![
187 FinanceStrategy::Momentum,
188 FinanceStrategy::TechnicalIndicators,
189 FinanceStrategy::RiskAdjusted,
190 FinanceStrategy::Volatility,
191 ];
192
193 for &(n_samples, n_features) in &self.config.dataset_sizes {
194 for &k in &self.config.k_values {
195 if k >= n_features {
196 continue;
197 }
198
199 for strategy in &strategies {
200 let mut avg_fit_time = Duration::new(0, 0);
201 let mut avg_transform_time = Duration::new(0, 0);
202 let mut avg_quality = 0.0;
203 let mut successful_runs = 0;
204
205 for _ in 0..self.config.repetitions {
206 match self.benchmark_single_finance_run(
207 n_samples,
208 n_features,
209 k,
210 strategy.clone(),
211 ) {
212 Ok((fit_time, transform_time, quality, _selected_count)) => {
213 avg_fit_time += fit_time;
214 avg_transform_time += transform_time;
215 avg_quality += quality;
216 successful_runs += 1;
217 }
218 Err(_) => continue,
219 }
220 }
221
222 if successful_runs > 0 {
223 avg_fit_time /= successful_runs as u32;
224 avg_transform_time /= successful_runs as u32;
225 avg_quality /= successful_runs as f64;
226
227 results.push(BenchmarkResult {
228 method_name: "FinanceFeatureSelector".to_string(),
229 domain: "finance".to_string(),
230 strategy: format!("{:?}", strategy),
231 dataset_size: (n_samples, n_features),
232 k_features: k,
233 fit_time: avg_fit_time,
234 transform_time: avg_transform_time,
235 total_time: avg_fit_time + avg_transform_time,
236 memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
237 selected_features_count: k,
238 feature_quality_score: avg_quality,
239 });
240 }
241 }
242 }
243 }
244
245 Ok(results)
246 }
247
248 fn benchmark_nlp_methods(&self) -> Result<Vec<BenchmarkResult>, SklearsError> {
250 let mut results = Vec::new();
251
252 let strategies = vec![
253 NLPStrategy::InformationTheoretic,
254 NLPStrategy::SyntacticAnalysis,
255 NLPStrategy::SemanticAnalysis,
256 NLPStrategy::TransformerBased,
257 ];
258
259 for &(n_samples, n_features) in &self.config.dataset_sizes {
260 for &k in &self.config.k_values {
261 if k >= n_features {
262 continue;
263 }
264
265 for strategy in &strategies {
266 let mut avg_fit_time = Duration::new(0, 0);
267 let mut avg_transform_time = Duration::new(0, 0);
268 let mut avg_quality = 0.0;
269 let mut successful_runs = 0;
270
271 for _ in 0..self.config.repetitions {
272 match self.benchmark_single_nlp_run(
273 n_samples,
274 n_features,
275 k,
276 strategy.clone(),
277 ) {
278 Ok((fit_time, transform_time, quality, _selected_count)) => {
279 avg_fit_time += fit_time;
280 avg_transform_time += transform_time;
281 avg_quality += quality;
282 successful_runs += 1;
283 }
284 Err(_) => continue,
285 }
286 }
287
288 if successful_runs > 0 {
289 avg_fit_time /= successful_runs as u32;
290 avg_transform_time /= successful_runs as u32;
291 avg_quality /= successful_runs as f64;
292
293 results.push(BenchmarkResult {
294 method_name: "AdvancedNLPFeatureSelector".to_string(),
295 domain: "nlp".to_string(),
296 strategy: format!("{:?}", strategy),
297 dataset_size: (n_samples, n_features),
298 k_features: k,
299 fit_time: avg_fit_time,
300 transform_time: avg_transform_time,
301 total_time: avg_fit_time + avg_transform_time,
302 memory_usage_mb: self.estimate_memory_usage(n_samples, n_features, k),
303 selected_features_count: k,
304 feature_quality_score: avg_quality,
305 });
306 }
307 }
308 }
309 }
310
311 Ok(results)
312 }
313
314 #[allow(non_snake_case)]
316 fn benchmark_single_bioinformatics_run(
317 &self,
318 n_samples: usize,
319 n_features: usize,
320 k: usize,
321 strategy: BioinformaticsStrategy,
322 ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
323 let X = Array2::from_shape_fn((n_samples, n_features), |_| {
325 let mut rng = thread_rng();
326 StandardNormal.sample(&mut rng)
327 });
328 let y = Array1::from_shape_fn(n_samples, |_| {
329 let mut rng = thread_rng();
330 StandardNormal.sample(&mut rng)
331 });
332
333 let selector = BioinformaticsFeatureSelector::new().k(k).strategy(strategy);
334
335 let fit_start = Instant::now();
337 let trained_selector = selector
338 .fit(&X, &y)
339 .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
340 let fit_time = fit_start.elapsed();
341
342 let transform_start = Instant::now();
344 let transformed = trained_selector
345 .transform(&X)
346 .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
347 let transform_time = transform_start.elapsed();
348
349 let selected_count = transformed.ncols();
350 let quality_score = self.calculate_feature_quality(&transformed, &y);
351
352 Ok((fit_time, transform_time, quality_score, selected_count))
353 }
354
355 #[allow(non_snake_case)]
357 fn benchmark_single_finance_run(
358 &self,
359 n_samples: usize,
360 n_features: usize,
361 k: usize,
362 strategy: FinanceStrategy,
363 ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
364 let X = Array2::from_shape_fn((n_samples, n_features), |_| {
366 let mut rng = thread_rng();
367 StandardNormal.sample(&mut rng)
368 });
369 let y = Array1::from_shape_fn(n_samples, |_| {
370 let mut rng = thread_rng();
371 StandardNormal.sample(&mut rng)
372 });
373
374 let selector = FinanceFeatureSelector::new().k(k).strategy(strategy);
375
376 let fit_start = Instant::now();
377 let trained_selector = selector
378 .fit(&X, &y)
379 .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
380 let fit_time = fit_start.elapsed();
381
382 let transform_start = Instant::now();
383 let transformed = trained_selector
384 .transform(&X)
385 .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
386 let transform_time = transform_start.elapsed();
387
388 let selected_count = transformed.ncols();
389 let quality_score = self.calculate_feature_quality(&transformed, &y);
390
391 Ok((fit_time, transform_time, quality_score, selected_count))
392 }
393
394 #[allow(non_snake_case)]
396 fn benchmark_single_nlp_run(
397 &self,
398 n_samples: usize,
399 n_features: usize,
400 k: usize,
401 strategy: NLPStrategy,
402 ) -> Result<(Duration, Duration, f64, usize), SklearsError> {
403 let X = Array2::from_shape_fn((n_samples, n_features), |_| {
405 let mut rng = thread_rng();
406 StandardNormal.sample(&mut rng)
407 });
408 let y = Array1::from_shape_fn(n_samples, |_| {
409 let mut rng = thread_rng();
410 StandardNormal.sample(&mut rng)
411 });
412
413 let selector = AdvancedNLPFeatureSelector::new().k(k).strategy(strategy);
414
415 let fit_start = Instant::now();
416 let trained_selector = selector
417 .fit(&X, &y)
418 .map_err(|_| SklearsError::InvalidInput("Fit failed".to_string()))?;
419 let fit_time = fit_start.elapsed();
420
421 let transform_start = Instant::now();
422 let transformed = trained_selector
423 .transform(&X)
424 .map_err(|_| SklearsError::InvalidInput("Transform failed".to_string()))?;
425 let transform_time = transform_start.elapsed();
426
427 let selected_count = transformed.ncols();
428 let quality_score = self.calculate_feature_quality(&transformed, &y);
429
430 Ok((fit_time, transform_time, quality_score, selected_count))
431 }
432
433 fn calculate_feature_quality(&self, X: &Array2<f64>, y: &Array1<f64>) -> f64 {
435 let mut total_correlation = 0.0;
436 let n_features = X.ncols();
437
438 for i in 0..n_features {
439 let feature_values = X.column(i);
440 let correlation = self.pearson_correlation(&feature_values.to_owned(), y);
441 total_correlation += correlation.abs();
442 }
443
444 total_correlation / n_features as f64
445 }
446
447 fn pearson_correlation(&self, x: &Array1<f64>, y: &Array1<f64>) -> f64 {
449 let x_mean = x.mean().unwrap_or(0.0);
450 let y_mean = y.mean().unwrap_or(0.0);
451
452 let numerator: f64 = x
453 .iter()
454 .zip(y.iter())
455 .map(|(&xi, &yi)| (xi - x_mean) * (yi - y_mean))
456 .sum();
457
458 let x_sq_sum: f64 = x.iter().map(|&xi| (xi - x_mean).powi(2)).sum();
459 let y_sq_sum: f64 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
460
461 let denominator = (x_sq_sum * y_sq_sum).sqrt();
462
463 if denominator != 0.0 {
464 numerator / denominator
465 } else {
466 0.0
467 }
468 }
469
470 fn estimate_memory_usage(&self, n_samples: usize, n_features: usize, k: usize) -> f64 {
472 let input_size = n_samples * n_features * 8; let selected_size = n_samples * k * 8;
475 let metadata_size = 1024; (input_size + selected_size + metadata_size) as f64 / (1024.0 * 1024.0) }
479
480 fn generate_summary(&self, results: &[BenchmarkResult]) -> BenchmarkSummary {
482 if results.is_empty() {
483 return BenchmarkSummary {
484 total_methods_tested: 0,
485 fastest_method: "None".to_string(),
486 slowest_method: "None".to_string(),
487 most_memory_efficient: "None".to_string(),
488 highest_quality_score: "None".to_string(),
489 average_fit_time: Duration::new(0, 0),
490 average_transform_time: Duration::new(0, 0),
491 };
492 }
493
494 let fastest = results.iter().min_by_key(|r| r.total_time).unwrap();
495
496 let slowest = results.iter().max_by_key(|r| r.total_time).unwrap();
497
498 let most_memory_efficient = results
499 .iter()
500 .min_by(|a, b| a.memory_usage_mb.partial_cmp(&b.memory_usage_mb).unwrap())
501 .unwrap();
502
503 let highest_quality = results
504 .iter()
505 .max_by(|a, b| {
506 a.feature_quality_score
507 .partial_cmp(&b.feature_quality_score)
508 .unwrap()
509 })
510 .unwrap();
511
512 let total_fit_time: Duration = results.iter().map(|r| r.fit_time).sum();
513 let total_transform_time: Duration = results.iter().map(|r| r.transform_time).sum();
514 let n_results = results.len() as u32;
515
516 BenchmarkSummary {
517 total_methods_tested: results.len(),
518 fastest_method: format!("{} ({})", fastest.method_name, fastest.strategy),
519 slowest_method: format!("{} ({})", slowest.method_name, slowest.strategy),
520 most_memory_efficient: format!(
521 "{} ({})",
522 most_memory_efficient.method_name, most_memory_efficient.strategy
523 ),
524 highest_quality_score: format!(
525 "{} ({})",
526 highest_quality.method_name, highest_quality.strategy
527 ),
528 average_fit_time: total_fit_time / n_results,
529 average_transform_time: total_transform_time / n_results,
530 }
531 }
532
533 pub fn export_to_csv(&self, results: &BenchmarkSuite) -> String {
535 let mut csv = String::new();
536 csv.push_str("method_name,domain,strategy,n_samples,n_features,k_features,fit_time_ms,transform_time_ms,total_time_ms,memory_mb,selected_count,quality_score\n");
537
538 for result in &results.results {
539 csv.push_str(&format!(
540 "{},{},{},{},{},{},{},{},{},{:.2},{},{:.4}\n",
541 result.method_name,
542 result.domain,
543 result.strategy,
544 result.dataset_size.0,
545 result.dataset_size.1,
546 result.k_features,
547 result.fit_time.as_millis(),
548 result.transform_time.as_millis(),
549 result.total_time.as_millis(),
550 result.memory_usage_mb,
551 result.selected_features_count,
552 result.feature_quality_score
553 ));
554 }
555
556 csv
557 }
558}
559
560pub fn run_quick_benchmark() -> Result<BenchmarkSuite, SklearsError> {
562 let config = BenchmarkConfig {
563 dataset_sizes: vec![(50, 30), (100, 50)],
564 k_values: vec![10, 20],
565 repetitions: 2,
566 ..Default::default()
567 };
568
569 let framework = DomainBenchmarkFramework::new(config);
570 framework.run_comprehensive_benchmark()
571}
572
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a framework around the default configuration.
    fn default_framework() -> DomainBenchmarkFramework {
        DomainBenchmarkFramework::new(BenchmarkConfig::default())
    }

    /// The framework stores the configuration it was constructed with.
    #[test]
    fn test_benchmark_framework_creation() {
        let framework = default_framework();
        let cfg = &framework.config;

        assert_eq!(cfg.repetitions, 3);
        assert!(cfg.include_bioinformatics);
        assert!(cfg.include_finance);
        assert!(cfg.include_nlp);
    }

    /// The memory estimate is strictly positive for a non-empty dataset.
    #[test]
    fn test_memory_estimation() {
        let estimate = default_framework().estimate_memory_usage(100, 50, 10);
        assert!(estimate > 0.0);
    }

    /// The quality score of random data lies within `[0, 1]`.
    #[test]
    #[allow(non_snake_case)]
    fn test_feature_quality_calculation() {
        let framework = default_framework();

        let mut rng = thread_rng();
        let X = Array2::from_shape_fn((20, 5), |_| StandardNormal.sample(&mut rng));
        let y = Array1::from_shape_fn(20, |_| StandardNormal.sample(&mut rng));

        let quality = framework.calculate_feature_quality(&X, &y);
        assert!((0.0..=1.0).contains(&quality));
    }

    /// The CSV export contains the header and the data of each result.
    #[test]
    fn test_csv_export() {
        let framework = default_framework();

        let suite = BenchmarkSuite {
            results: vec![BenchmarkResult {
                method_name: "TestMethod".to_string(),
                domain: "test".to_string(),
                strategy: "TestStrategy".to_string(),
                dataset_size: (100, 50),
                k_features: 10,
                fit_time: Duration::from_millis(100),
                transform_time: Duration::from_millis(50),
                total_time: Duration::from_millis(150),
                memory_usage_mb: 5.0,
                selected_features_count: 10,
                feature_quality_score: 0.75,
            }],
            summary: BenchmarkSummary {
                total_methods_tested: 1,
                fastest_method: "TestMethod".to_string(),
                slowest_method: "TestMethod".to_string(),
                most_memory_efficient: "TestMethod".to_string(),
                highest_quality_score: "TestMethod".to_string(),
                average_fit_time: Duration::from_millis(100),
                average_transform_time: Duration::from_millis(50),
            },
        };

        let csv = framework.export_to_csv(&suite);
        for expected in ["method_name", "TestMethod"].iter() {
            assert!(csv.contains(expected));
        }
    }
}