sklears_simd/
benchmark_framework.rs

//! Advanced benchmarking framework for SIMD operations
//!
//! Provides comprehensive benchmarking utilities including cross-platform performance tests,
//! regression detection, and automated optimization guidance.
//!
//! ## no-std Compatibility
//!
//! This module is compatible with both std and no-std environments. In no-std:
//! - `HashMap` is replaced with `BTreeMap`, which iterates in deterministic key order
//! - Timing is not measured: every benchmark reports a fixed mock duration of 1 ns
//!   and no throughput
//! - Operations are still executed, just without timing measurements
//! - All other functionality remains available
13
14#[cfg(feature = "no-std")]
15extern crate alloc;
16
17#[cfg(feature = "no-std")]
18use alloc::{
19    format,
20    string::{String, ToString},
21    vec::Vec,
22};
23
24use crate::SimdCapabilities;
25
26#[cfg(feature = "no-std")]
27use alloc::collections::BTreeMap as HashMap;
28#[cfg(not(feature = "no-std"))]
29use std::collections::HashMap;
30#[cfg(not(feature = "no-std"))]
31use std::string::ToString;
32#[cfg(not(feature = "no-std"))]
33pub use std::time::Duration;
34
35#[cfg(not(feature = "no-std"))]
36use std::time::Instant;
37
/// Minimal stand-in for `std::time::Duration`, compiled only with the `no-std` feature.
///
/// The span is stored as a whole number of nanoseconds in a `u64`, so the largest
/// representable value is `u64::MAX` nanoseconds.
#[cfg(feature = "no-std")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Duration(u64); // nanoseconds

#[cfg(feature = "no-std")]
impl Duration {
    /// Number of nanoseconds in one millisecond.
    const NANOS_PER_MILLI: u64 = 1_000_000;
    /// Number of nanoseconds in one second.
    const NANOS_PER_SEC: u64 = 1_000_000_000;

    /// Creates a duration from a number of nanoseconds.
    pub fn from_nanos(nanos: u64) -> Self {
        Self(nanos)
    }

    /// Creates a duration from whole milliseconds.
    ///
    /// Inputs too large to express in `u64` nanoseconds saturate at `u64::MAX`
    /// nanoseconds instead of overflowing (a panic in debug builds, silent
    /// wrap-around in release builds).
    pub fn from_millis(millis: u64) -> Self {
        Self(millis.saturating_mul(Self::NANOS_PER_MILLI))
    }

    /// Creates a duration from whole seconds.
    ///
    /// Saturates at `u64::MAX` nanoseconds for the same reason as
    /// [`Duration::from_millis`].
    pub fn from_secs(secs: u64) -> Self {
        Self(secs.saturating_mul(Self::NANOS_PER_SEC))
    }

    /// Returns the total number of nanoseconds.
    pub fn as_nanos(&self) -> u128 {
        u128::from(self.0)
    }

    /// Returns the number of whole milliseconds (fractional part truncated).
    pub fn as_millis(&self) -> u128 {
        u128::from(self.0 / Self::NANOS_PER_MILLI)
    }

    /// Returns the number of whole seconds (fractional part truncated).
    pub fn as_secs(&self) -> u64 {
        self.0 / Self::NANOS_PER_SEC
    }

    /// Returns the duration in seconds as a floating-point value.
    pub fn as_secs_f64(&self) -> f64 {
        self.0 as f64 / Self::NANOS_PER_SEC as f64
    }
}
73
/// Outcome of a single benchmark run.
///
/// Note: with the `no-std` feature the duration is a mock value kept only for
/// API compatibility, and no throughput is reported.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Label the benchmark was recorded under.
    pub name: String,
    /// Elapsed time recorded for the timed loop.
    pub duration: Duration,
    /// Operations per second (`None` in no-std).
    pub throughput: Option<f64>,
    /// SIMD width recorded alongside the measurement.
    pub simd_width: usize,
    /// Name of the instruction-set level recorded alongside the measurement.
    pub architecture: String,
    /// Number of timed iterations.
    pub iterations: u64,
}
87
88/// Cross-platform performance comparison
89#[derive(Debug, Clone)]
90pub struct CrossPlatformResult {
91    pub operation: String,
92    pub results: HashMap<String, BenchmarkResult>,
93    pub best_performance: String,
94    pub speedup_ratios: HashMap<String, f64>,
95}
96
97/// Performance regression detector
98#[derive(Debug)]
99pub struct RegressionDetector {
100    baseline_results: HashMap<String, BenchmarkResult>,
101    threshold: f64, // percentage threshold for regression detection
102}
103
104impl RegressionDetector {
105    /// Create a new regression detector with the given threshold
106    pub fn new(threshold_percent: f64) -> Self {
107        Self {
108            baseline_results: HashMap::new(),
109            threshold: threshold_percent / 100.0,
110        }
111    }
112
113    /// Set baseline results for comparison
114    pub fn set_baseline(&mut self, results: Vec<BenchmarkResult>) {
115        for result in results {
116            self.baseline_results.insert(result.name.clone(), result);
117        }
118    }
119
120    /// Check for performance regressions
121    pub fn check_regression(&self, current_results: &[BenchmarkResult]) -> Vec<RegressionReport> {
122        let mut regressions = Vec::new();
123
124        for current in current_results {
125            if let Some(baseline) = self.baseline_results.get(&current.name) {
126                let baseline_ns = baseline.duration.as_nanos() as f64;
127                let current_ns = current.duration.as_nanos() as f64;
128                let change_ratio = (current_ns - baseline_ns) / baseline_ns;
129
130                if change_ratio > self.threshold {
131                    regressions.push(RegressionReport {
132                        operation: current.name.clone(),
133                        baseline_duration: baseline.duration,
134                        current_duration: current.duration,
135                        regression_percent: change_ratio * 100.0,
136                        severity: if change_ratio > 0.2 {
137                            Severity::Critical
138                        } else if change_ratio > 0.1 {
139                            Severity::High
140                        } else {
141                            Severity::Medium
142                        },
143                    });
144                }
145            }
146        }
147
148        regressions
149    }
150}
151
152/// Performance regression report
153#[derive(Debug)]
154pub struct RegressionReport {
155    pub operation: String,
156    pub baseline_duration: Duration,
157    pub current_duration: Duration,
158    pub regression_percent: f64,
159    pub severity: Severity,
160}
161
/// Severity levels attached to a regression report, listed from least to most severe.
#[derive(Debug, Clone, Copy)]
pub enum Severity {
    /// Least severe level.
    Medium,
    /// Intermediate level.
    High,
    /// Most severe level.
    Critical,
}
168
169/// Comprehensive benchmark suite runner
170///
171/// Provides benchmarking capabilities for SIMD operations with cross-platform support.
172/// In no-std environments, timing functionality is limited and operations will be
173/// executed but without accurate timing measurements.
174pub struct BenchmarkSuite {
175    capabilities: SimdCapabilities,
176    results: Vec<BenchmarkResult>,
177}
178
179impl Default for BenchmarkSuite {
180    fn default() -> Self {
181        Self::new()
182    }
183}
184
185impl BenchmarkSuite {
186    /// Create a new benchmark suite
187    pub fn new() -> Self {
188        Self {
189            capabilities: SimdCapabilities::detect(),
190            results: Vec::new(),
191        }
192    }
193
194    /// Run a benchmark and record results
195    ///
196    /// In std environments, this provides accurate timing measurements.
197    /// In no-std environments, the operation is executed but timing is mocked.
198    pub fn benchmark<F>(&mut self, name: &str, iterations: u64, mut operation: F) -> BenchmarkResult
199    where
200        F: FnMut(),
201    {
202        // Warm up
203        for _ in 0..10 {
204            operation();
205        }
206
207        #[cfg(not(feature = "no-std"))]
208        let (duration, throughput) = {
209            let start = Instant::now();
210            for _ in 0..iterations {
211                operation();
212            }
213            let duration = start.elapsed();
214            let throughput = Some(iterations as f64 / duration.as_secs_f64());
215            (duration, throughput)
216        };
217
218        #[cfg(feature = "no-std")]
219        let (duration, throughput) = {
220            // Execute the operation without timing in no-std environments
221            for _ in 0..iterations {
222                operation();
223            }
224            // Return mock duration for no-std compatibility
225            (Duration::from_nanos(1), None)
226        };
227
228        let result = BenchmarkResult {
229            name: name.to_string(),
230            duration,
231            throughput,
232            simd_width: self.capabilities.best_f32_width(),
233            architecture: self.get_architecture_name(),
234            iterations,
235        };
236
237        self.results.push(result.clone());
238        result
239    }
240
241    /// Run cross-platform comparison
242    pub fn cross_platform_benchmark<F>(
243        &mut self,
244        operation_name: &str,
245        data_size: usize,
246        operation: F,
247    ) -> CrossPlatformResult
248    where
249        F: Fn(&[f32]) -> f32 + Copy,
250    {
251        let test_data: Vec<f32> = (0..data_size).map(|i| i as f32).collect();
252        let mut results = HashMap::new();
253
254        // Test scalar implementation
255        let scalar_result = self.benchmark(&format!("{}_scalar", operation_name), 1000, || {
256            let _ = operation(&test_data);
257        });
258        results.insert("scalar".to_string(), scalar_result);
259
260        // Test SIMD implementations based on available capabilities
261        if self.capabilities.sse2 {
262            let sse2_result = self.benchmark(&format!("{}_sse2", operation_name), 1000, || {
263                let _ = operation(&test_data);
264            });
265            results.insert("sse2".to_string(), sse2_result);
266        }
267
268        if self.capabilities.avx2 {
269            let avx2_result = self.benchmark(&format!("{}_avx2", operation_name), 1000, || {
270                let _ = operation(&test_data);
271            });
272            results.insert("avx2".to_string(), avx2_result);
273        }
274
275        if self.capabilities.avx512 {
276            let avx512_result = self.benchmark(&format!("{}_avx512", operation_name), 1000, || {
277                let _ = operation(&test_data);
278            });
279            results.insert("avx512".to_string(), avx512_result);
280        }
281
282        if self.capabilities.neon {
283            let neon_result = self.benchmark(&format!("{}_neon", operation_name), 1000, || {
284                let _ = operation(&test_data);
285            });
286            results.insert("neon".to_string(), neon_result);
287        }
288
289        // Find best performance and calculate speedup ratios
290        let best_duration = results
291            .values()
292            .map(|r| r.duration)
293            .min()
294            .unwrap_or(Duration::from_secs(1));
295
296        let best_performance = results
297            .iter()
298            .min_by_key(|(_, result)| result.duration)
299            .map(|(name, _)| name.clone())
300            .unwrap_or_else(|| "unknown".to_string());
301
302        let mut speedup_ratios = HashMap::new();
303        let baseline_duration = results
304            .get("scalar")
305            .map(|r| r.duration)
306            .unwrap_or(best_duration);
307
308        for (name, result) in &results {
309            let speedup = baseline_duration.as_nanos() as f64 / result.duration.as_nanos() as f64;
310            speedup_ratios.insert(name.clone(), speedup);
311        }
312
313        CrossPlatformResult {
314            operation: operation_name.to_string(),
315            results,
316            best_performance,
317            speedup_ratios,
318        }
319    }
320
321    /// Get all benchmark results
322    pub fn get_results(&self) -> &[BenchmarkResult] {
323        &self.results
324    }
325
326    /// Generate performance report
327    pub fn generate_report(&self) -> BenchmarkReport {
328        let total_benchmarks = self.results.len();
329        let avg_duration = if total_benchmarks > 0 {
330            let total_nanos: u128 = self.results.iter().map(|r| r.duration.as_nanos()).sum();
331            Duration::from_nanos((total_nanos / total_benchmarks as u128) as u64)
332        } else {
333            Duration::from_secs(0)
334        };
335
336        let fastest = self.results.iter().min_by_key(|r| r.duration).cloned();
337        let slowest = self.results.iter().max_by_key(|r| r.duration).cloned();
338
339        BenchmarkReport {
340            total_benchmarks,
341            avg_duration,
342            fastest,
343            slowest,
344            architecture: self.get_architecture_name(),
345            simd_width: self.capabilities.best_f32_width(),
346            capabilities: self.capabilities,
347        }
348    }
349
350    fn get_architecture_name(&self) -> String {
351        if self.capabilities.avx512 {
352            "AVX-512".to_string()
353        } else if self.capabilities.avx2 {
354            "AVX2".to_string()
355        } else if self.capabilities.avx {
356            "AVX".to_string()
357        } else if self.capabilities.sse42 {
358            "SSE4.2".to_string()
359        } else if self.capabilities.sse2 {
360            "SSE2".to_string()
361        } else if self.capabilities.neon {
362            "NEON".to_string()
363        } else {
364            "Scalar".to_string()
365        }
366    }
367}
368
369/// Comprehensive benchmark report
370#[derive(Debug)]
371pub struct BenchmarkReport {
372    pub total_benchmarks: usize,
373    pub avg_duration: Duration,
374    pub fastest: Option<BenchmarkResult>,
375    pub slowest: Option<BenchmarkResult>,
376    pub architecture: String,
377    pub simd_width: usize,
378    pub capabilities: SimdCapabilities,
379}
380
381impl BenchmarkReport {
382    /// Generate a formatted report string
383    pub fn format_report(&self) -> String {
384        let mut report = String::new();
385
386        report.push_str("=== SIMD Performance Benchmark Report ===\n");
387        report.push_str(&format!("Architecture: {}\n", self.architecture));
388        report.push_str(&format!("SIMD Width (f32): {}\n", self.simd_width));
389        report.push_str(&format!("Total Benchmarks: {}\n", self.total_benchmarks));
390        report.push_str(&format!("Average Duration: {:?}\n", self.avg_duration));
391
392        report.push_str("\nCapabilities:\n");
393        report.push_str(&format!("  SSE2: {}\n", self.capabilities.sse2));
394        report.push_str(&format!("  AVX2: {}\n", self.capabilities.avx2));
395        report.push_str(&format!("  AVX-512: {}\n", self.capabilities.avx512));
396        report.push_str(&format!("  NEON: {}\n", self.capabilities.neon));
397
398        if let Some(fastest) = &self.fastest {
399            report.push_str(&format!(
400                "\nFastest Operation: {} ({:?})\n",
401                fastest.name, fastest.duration
402            ));
403        }
404
405        if let Some(slowest) = &self.slowest {
406            report.push_str(&format!(
407                "Slowest Operation: {} ({:?})\n",
408                slowest.name, slowest.duration
409            ));
410        }
411
412        report.push_str("\n=== End Report ===\n");
413        report
414    }
415}
416
417/// Automated optimization recommendations
418pub struct OptimizationAdvisor {
419    results: Vec<CrossPlatformResult>,
420}
421
422impl Default for OptimizationAdvisor {
423    fn default() -> Self {
424        Self::new()
425    }
426}
427
428impl OptimizationAdvisor {
429    /// Create a new optimization advisor
430    pub fn new() -> Self {
431        Self {
432            results: Vec::new(),
433        }
434    }
435
436    /// Add cross-platform results for analysis
437    pub fn add_results(&mut self, result: CrossPlatformResult) {
438        self.results.push(result);
439    }
440
441    /// Generate optimization recommendations
442    pub fn generate_recommendations(&self) -> Vec<OptimizationRecommendation> {
443        let mut recommendations = Vec::new();
444
445        for result in &self.results {
446            // Check if SIMD provides significant speedup
447            if let Some(scalar_speedup) = result.speedup_ratios.get("scalar") {
448                if *scalar_speedup < 1.5 {
449                    recommendations.push(OptimizationRecommendation {
450                        operation: result.operation.clone(),
451                        recommendation_type: RecommendationType::AlgorithmOptimization,
452                        description: format!(
453                            "SIMD implementation for {} shows minimal speedup ({}x). Consider algorithm optimization or data layout changes.",
454                            result.operation, scalar_speedup
455                        ),
456                        priority: Priority::Medium,
457                    });
458                }
459            }
460
461            // Check for memory-bound operations
462            let best_speedup = result.speedup_ratios.values().cloned().fold(0.0, f64::max);
463            if best_speedup < 2.0 {
464                recommendations.push(OptimizationRecommendation {
465                    operation: result.operation.clone(),
466                    recommendation_type: RecommendationType::MemoryOptimization,
467                    description: format!(
468                        "Operation {} may be memory-bound. Consider cache optimization, prefetching, or data layout improvements.",
469                        result.operation
470                    ),
471                    priority: Priority::High,
472                });
473            }
474
475            // Check for underutilized SIMD width
476            if result.best_performance == "sse2" && result.speedup_ratios.contains_key("avx2") {
477                recommendations.push(OptimizationRecommendation {
478                    operation: result.operation.clone(),
479                    recommendation_type: RecommendationType::SimdWidthOptimization,
480                    description: format!(
481                        "Operation {} performs better with SSE2 than AVX2. Consider optimizing for wider SIMD or checking for overhead.",
482                        result.operation
483                    ),
484                    priority: Priority::Medium,
485                });
486            }
487        }
488
489        recommendations
490    }
491}
492
493/// Optimization recommendation
494#[derive(Debug)]
495pub struct OptimizationRecommendation {
496    pub operation: String,
497    pub recommendation_type: RecommendationType,
498    pub description: String,
499    pub priority: Priority,
500}
501
/// Category of an optimization recommendation.
#[derive(Debug)]
pub enum RecommendationType {
    /// Rework the algorithm or data layout.
    AlgorithmOptimization,
    /// Improve memory behavior (caching, prefetching, layout).
    MemoryOptimization,
    /// Revisit the chosen SIMD width.
    SimdWidthOptimization,
    /// Compiler-level optimization.
    CompilerOptimization,
}
509
/// Priority of a recommendation; variants are declared in ascending order, so the
/// derived ordering satisfies `Low < Medium < High < Critical`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Priority {
    /// Lowest priority.
    Low,
    /// Moderate priority.
    Medium,
    /// Elevated priority.
    High,
    /// Highest priority.
    Critical,
}
517
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;

    /// Builds a `test_op` result that took `millis` milliseconds.
    fn timed_result(millis: u64) -> BenchmarkResult {
        BenchmarkResult {
            name: "test_op".to_string(),
            duration: Duration::from_millis(millis),
            throughput: None,
            simd_width: 4,
            architecture: "test".to_string(),
            iterations: 1000,
        }
    }

    /// A new suite starts without any recorded results.
    #[test]
    fn test_benchmark_suite_creation() {
        let suite = BenchmarkSuite::new();
        assert_eq!(suite.results.len(), 0);
    }

    /// A benchmark run reports its name, iteration count and a non-zero duration.
    #[test]
    fn test_simple_benchmark() {
        let mut suite = BenchmarkSuite::new();
        let outcome = suite.benchmark("test_op", 100, || {
            // Simple operation
            let _sum: f32 = (0..1000).map(|i| i as f32).sum();
        });

        assert_eq!(outcome.name, "test_op");
        assert_eq!(outcome.iterations, 100);
        assert!(outcome.duration > Duration::from_nanos(0));
    }

    /// Slowdowns within the threshold are ignored; larger ones are reported.
    #[test]
    fn test_regression_detector() {
        let mut detector = RegressionDetector::new(10.0); // 10% threshold
        detector.set_baseline(vec![timed_result(100)]);

        // 5% slower, within threshold
        let within_threshold = vec![timed_result(105)];
        let reports = detector.check_regression(&within_threshold);
        assert_eq!(reports.len(), 0);

        // 20% slower, above threshold
        let above_threshold = vec![timed_result(120)];
        let reports = detector.check_regression(&above_threshold);
        assert_eq!(reports.len(), 1);
        assert_eq!(reports[0].operation, "test_op");
        assert!(reports[0].regression_percent > 10.0);
    }

    /// A low speedup entry yields at least one recommendation for the operation.
    #[test]
    fn test_optimization_advisor() {
        let mut speedup_ratios = HashMap::new();
        speedup_ratios.insert("scalar".to_string(), 1.2); // Low speedup

        let mut advisor = OptimizationAdvisor::new();
        advisor.add_results(CrossPlatformResult {
            operation: "slow_op".to_string(),
            results: HashMap::new(),
            best_performance: "sse2".to_string(),
            speedup_ratios,
        });

        let recommendations = advisor.generate_recommendations();
        assert!(!recommendations.is_empty());
        assert!(recommendations.iter().any(|r| r.operation == "slow_op"));
    }

    /// The formatted report contains the headline fields.
    #[test]
    fn test_benchmark_report_formatting() {
        let formatted = BenchmarkReport {
            total_benchmarks: 5,
            avg_duration: Duration::from_millis(10),
            fastest: None,
            slowest: None,
            architecture: "AVX2".to_string(),
            simd_width: 8,
            capabilities: SimdCapabilities::detect(),
        }
        .format_report();

        for expected in [
            "Architecture: AVX2",
            "SIMD Width (f32): 8",
            "Total Benchmarks: 5",
        ]
        .iter()
        {
            assert!(formatted.contains(expected));
        }
    }
}
sklears_simd/benchmark_framework.rs

sklears_simd/
benchmark_framework.rs