scirs2_core/gpu/
benchmarks.rs

1//! GPU vs CPU performance benchmarking suite
2//!
3//! This module provides comprehensive benchmarking capabilities for comparing
4//! performance between CPU and GPU implementations of various algorithms.
5
6use crate::gpu::{GpuBackend, GpuContext, GpuError};
7use std::collections::HashMap;
8use std::time::{Duration, Instant};
9use thiserror::Error;
10
/// Error types for benchmarking operations.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute (derived through `thiserror::Error`).
#[derive(Error, Debug)]
pub enum BenchmarkError {
    /// Benchmark setup failed; the payload is a human-readable description.
    #[error("Benchmark setup failed: {0}")]
    SetupFailed(String),

    /// Benchmark execution failed; the payload is a human-readable description.
    #[error("Benchmark execution failed: {0}")]
    ExecutionFailed(String),

    /// Invalid benchmark configuration; the payload names the offending setting.
    #[error("Invalid benchmark configuration: {0}")]
    InvalidConfiguration(String),

    /// Results comparison failed; the payload is a human-readable description.
    #[error("Results comparison failed: {0}")]
    ComparisonFailed(String),

    /// Underlying GPU error.
    ///
    /// `#[from]` generates `From<GpuError>`, so `?` converts a `GpuError`
    /// into this variant automatically.
    #[error("GPU error: {0}")]
    GpuError(#[from] GpuError),
}
34
/// Kinds of workload the suite can be asked to benchmark.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BenchmarkOperation {
    /// Dense matrix-matrix product (GEMM)
    MatrixMultiply,
    /// Element-wise operations on vectors
    VectorOperations,
    /// Fast Fourier Transform
    FastFourierTransform,
    /// Convolution
    Convolution,
    /// Reductions such as sum or max
    Reduction,
    /// Sorting
    Sorting,
    /// Random number generation
    RandomGeneration,
    /// Image processing
    ImageProcessing,
    /// Signal processing
    SignalProcessing,
    /// Statistical computations
    Statistics,
    /// General linear algebra
    LinearAlgebra,
    /// Sparse matrix operations
    SparseMatrix,
}

impl BenchmarkOperation {
    /// Returns the display name of this operation.
    pub const fn name(&self) -> &'static str {
        match *self {
            Self::MatrixMultiply => "Matrix Multiplication",
            Self::VectorOperations => "Vector Operations",
            Self::FastFourierTransform => "Fast Fourier Transform",
            Self::Convolution => "Convolution",
            Self::Reduction => "Reduction",
            Self::Sorting => "Sorting",
            Self::RandomGeneration => "Random Generation",
            Self::ImageProcessing => "Image Processing",
            Self::SignalProcessing => "Signal Processing",
            Self::Statistics => "Statistics",
            Self::LinearAlgebra => "Linear Algebra",
            Self::SparseMatrix => "Sparse Matrix",
        }
    }

    /// Maps this operation onto the broader [`BenchmarkCategory`] it is
    /// reported under.
    pub fn category(&self) -> BenchmarkCategory {
        use BenchmarkCategory as Category;

        match *self {
            Self::MatrixMultiply | Self::LinearAlgebra | Self::SparseMatrix => {
                Category::LinearAlgebra
            }
            Self::VectorOperations | Self::Reduction => Category::ElementWise,
            Self::FastFourierTransform | Self::Convolution | Self::SignalProcessing => {
                Category::SignalProcessing
            }
            Self::ImageProcessing => Category::ImageProcessing,
            Self::Sorting | Self::RandomGeneration | Self::Statistics => {
                Category::GeneralCompute
            }
        }
    }
}

/// Coarse groupings used when summarizing benchmark operations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BenchmarkCategory {
    /// Linear algebra workloads
    LinearAlgebra,
    /// Element-wise workloads
    ElementWise,
    /// Signal processing workloads
    SignalProcessing,
    /// Image processing workloads
    ImageProcessing,
    /// Everything else (general compute)
    GeneralCompute,
}
121
/// Benchmark configuration.
///
/// The suite benchmarks every combination of `operations`, `problemsizes`
/// and `datatypes`, once on the CPU and once per available entry of
/// `gpu_backends`.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Operations to benchmark
    pub operations: Vec<BenchmarkOperation>,
    /// Problem sizes to test
    pub problemsizes: Vec<ProblemSize>,
    /// Number of untimed warmup iterations run before measuring
    pub warmup_iterations: usize,
    /// Number of timed benchmark iterations per measurement
    pub benchmark_iterations: usize,
    /// Data types to test
    pub datatypes: Vec<DataType>,
    /// GPU backends to test (backends reporting themselves unavailable are skipped)
    pub gpu_backends: Vec<GpuBackend>,
    /// Whether to verify correctness
    pub verify_correctness: bool,
    /// Tolerance for numerical comparisons
    pub tolerance: f64,
}
142
143impl Default for BenchmarkConfig {
144    fn default() -> Self {
145        Self {
146            operations: vec![
147                BenchmarkOperation::MatrixMultiply,
148                BenchmarkOperation::VectorOperations,
149                BenchmarkOperation::Reduction,
150            ],
151            problemsizes: vec![ProblemSize::Small, ProblemSize::Medium, ProblemSize::Large],
152            warmup_iterations: 3,
153            benchmark_iterations: 10,
154            datatypes: vec![DataType::Float32, DataType::Float64],
155            gpu_backends: vec![GpuBackend::Cuda, GpuBackend::Rocm],
156            verify_correctness: true,
157            tolerance: 1e-6,
158        }
159    }
160}
161
/// Named problem sizes, plus an escape hatch for arbitrary ones.
///
/// The concrete dimensions depend on the kind of workload; see
/// [`ProblemSize::matrix_size`] and [`ProblemSize::vector_size`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProblemSize {
    /// Small problems (64 x 64 matrices, 1 Ki-element vectors)
    Small,
    /// Medium problems (512 x 512 matrices, 1 Mi-element vectors)
    Medium,
    /// Large problems (2048 x 2048 matrices, 64 Mi-element vectors)
    Large,
    /// Extra large problems (8192 x 8192 matrices, 512 Mi-element vectors)
    ExtraLarge,
    /// Caller-supplied size, used verbatim as the matrix dimension or vector length
    Custom(usize),
}

impl ProblemSize {
    /// Returns the dimension `N` used for `N x N` matrix workloads.
    pub fn matrix_size(&self) -> usize {
        match *self {
            Self::Custom(dimension) => dimension,
            Self::Small => 64,
            Self::Medium => 512,
            Self::Large => 2048,
            Self::ExtraLarge => 8192,
        }
    }

    /// Returns the element count used for vector workloads.
    pub fn vector_size(&self) -> usize {
        const KI: usize = 1024;
        const MI: usize = KI * KI;

        match *self {
            Self::Custom(length) => length,
            Self::Small => KI,
            Self::Medium => MI,
            Self::Large => 64 * MI,
            Self::ExtraLarge => 512 * MI,
        }
    }
}
200
/// Element types a benchmark can be run with.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataType {
    /// 32-bit floating point
    Float32,
    /// 64-bit floating point
    Float64,
    /// 16-bit floating point
    Float16,
    /// 32-bit signed integer
    Int32,
    /// 32-bit unsigned integer
    UInt32,
}

impl DataType {
    /// Returns the width of one element in bytes.
    pub fn size_bytes(&self) -> usize {
        use std::mem::size_of;

        match *self {
            Self::Float16 => 2, // no native half-precision type to measure
            Self::Float32 => size_of::<f32>(),
            Self::Float64 => size_of::<f64>(),
            Self::Int32 => size_of::<i32>(),
            Self::UInt32 => size_of::<u32>(),
        }
    }

    /// Returns the short Rust-style name of the type (for example `"f32"`).
    pub const fn name(&self) -> &'static str {
        match *self {
            Self::Float16 => "f16",
            Self::Float32 => "f32",
            Self::Float64 => "f64",
            Self::Int32 => "i32",
            Self::UInt32 => "u32",
        }
    }
}
237
238/// Compute platform for benchmarking
239#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
240pub enum ComputePlatform {
241    /// CPU implementation
242    Cpu,
243    /// GPU implementation with specific backend
244    Gpu(GpuBackend),
245}
246
247impl ComputePlatform {
248    /// Get platform name
249    pub fn name(&self) -> String {
250        match self {
251            ComputePlatform::Cpu => "CPU".to_string(),
252            ComputePlatform::Gpu(backend) => format!("GPU ({backend})"),
253        }
254    }
255}
256
/// Benchmark result for a single test: one operation, on one platform, at
/// one problem size, with one data type.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Operation that was benchmarked
    pub operation: BenchmarkOperation,
    /// Platform used
    pub platform: ComputePlatform,
    /// Problem size
    pub problemsize: ProblemSize,
    /// Data type
    pub datatype: DataType,
    /// Execution time (average over the timed iterations)
    pub execution_time: Duration,
    /// Standard deviation of execution times
    pub time_stddev: Duration,
    /// Throughput (operations per second), derived from an operation-count
    /// estimate divided by `execution_time`
    pub throughput: f64,
    /// Memory bandwidth (GB/s), derived from a bytes-moved estimate divided
    /// by `execution_time`
    pub memorybandwidth: f64,
    /// Energy efficiency (operations per joule) if available
    pub energy_efficiency: Option<f64>,
    /// Peak memory usage (bytes); the suite fills this with an estimate
    /// computed from the problem size, not a measurement
    pub peak_memory_usage: usize,
    /// Whether correctness verification passed
    pub correctness_verified: bool,
}
283
/// Benchmark comparison result: all platform results that share the same
/// operation, problem size and data type, plus figures derived from them.
#[derive(Debug, Clone)]
pub struct BenchmarkComparison {
    /// Operation being compared
    pub operation: BenchmarkOperation,
    /// Problem size
    pub problemsize: ProblemSize,
    /// Data type
    pub datatype: DataType,
    /// Results for each platform
    pub platform_results: HashMap<ComputePlatform, BenchmarkResult>,
    /// Speedup factors (GPU vs CPU): CPU time divided by the backend's GPU
    /// time, so values above 1.0 mean the GPU was faster
    pub speedups: HashMap<GpuBackend, f64>,
    /// Energy efficiency comparison
    pub energy_comparison: HashMap<ComputePlatform, f64>,
    /// Recommendation based on results
    pub recommendation: PlatformRecommendation,
}
302
/// Platform recommendation based on benchmark results.
#[derive(Debug, Clone)]
pub enum PlatformRecommendation {
    /// CPU is recommended; `reason` is a human-readable justification.
    Cpu { reason: String },
    /// GPU is recommended; `backend` is the backend to use and `reason` is a
    /// human-readable justification.
    Gpu { backend: GpuBackend, reason: String },
    /// Depends on specific use case; `factors` lists the considerations to weigh.
    Depends { factors: Vec<String> },
}
313
/// Main benchmarking suite.
///
/// Holds the configuration together with the results and comparisons
/// accumulated by `run_all`.
pub struct BenchmarkSuite {
    // Which operations, sizes, data types and backends to run.
    config: BenchmarkConfig,
    // One entry per (operation, size, data type, platform) measurement.
    results: Vec<BenchmarkResult>,
    // Cross-platform comparisons derived from `results`.
    comparisons: Vec<BenchmarkComparison>,
}
320
321impl BenchmarkSuite {
322    /// Create a new benchmark suite
323    pub fn new(config: BenchmarkConfig) -> Self {
324        Self {
325            config,
326            results: Vec::new(),
327            comparisons: Vec::new(),
328        }
329    }
330
331    /// Run all benchmarks
332    pub fn run_all(&mut self) -> Result<(), BenchmarkError> {
333        let operations = self.config.operations.clone();
334        let problemsizes = self.config.problemsizes.clone();
335        let datatypes = self.config.datatypes.clone();
336
337        for operation in operations {
338            for problemsize in problemsizes.iter() {
339                for datatype in datatypes.iter() {
340                    self.run_operation_benchmark(operation, *problemsize, *datatype)?;
341                }
342            }
343        }
344
345        self.generate_comparisons()?;
346        Ok(())
347    }
348
349    /// Run benchmark for a specific operation
350    fn run_operation_benchmark(
351        &mut self,
352        operation: BenchmarkOperation,
353        problemsize: ProblemSize,
354        datatype: DataType,
355    ) -> Result<(), BenchmarkError> {
356        // Run CPU benchmark
357        let cpu_result = self.run_cpu_benchmark(operation, problemsize, datatype)?;
358        self.results.push(cpu_result);
359
360        // Run GPU benchmarks for each available backend
361        for &backend in &self.config.gpu_backends {
362            if backend.is_available() {
363                match self.run_gpu_benchmark(operation, problemsize, datatype, backend) {
364                    Ok(gpu_result) => self.results.push(gpu_result),
365                    Err(e) => {
366                        eprintln!("GPU benchmark failed for {backend}: {e}");
367                    }
368                }
369            }
370        }
371
372        Ok(())
373    }
374
375    /// Run CPU benchmark
376    fn run_cpu_benchmark(
377        &self,
378        operation: BenchmarkOperation,
379        problemsize: ProblemSize,
380        datatype: DataType,
381    ) -> Result<BenchmarkResult, BenchmarkError> {
382        // Warmup
383        for _ in 0..self.config.warmup_iterations {
384            self.execute_cpu_operation(operation, problemsize, datatype)?;
385        }
386
387        // Benchmark
388        let mut execution_times = Vec::new();
389        for _ in 0..self.config.benchmark_iterations {
390            let start = Instant::now();
391            self.execute_cpu_operation(operation, problemsize, datatype)?;
392            execution_times.push(start.elapsed());
393        }
394
395        let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
396        let time_stddev = self.calculate_stddev(&execution_times, avg_time);
397
398        Ok(BenchmarkResult {
399            operation,
400            platform: ComputePlatform::Cpu,
401            problemsize,
402            datatype,
403            execution_time: avg_time,
404            time_stddev,
405            throughput: self.calculate_throughput(operation, problemsize, avg_time),
406            memorybandwidth: self.calculate_memorybandwidth(
407                operation,
408                problemsize,
409                datatype,
410                avg_time,
411            ),
412            energy_efficiency: None, // Would need power measurement
413            peak_memory_usage: self.estimate_memory_usage(operation, problemsize, datatype),
414            correctness_verified: true, // CPU is reference implementation
415        })
416    }
417
418    /// Run GPU benchmark
419    fn run_gpu_benchmark(
420        &self,
421        operation: BenchmarkOperation,
422        problemsize: ProblemSize,
423        datatype: DataType,
424        backend: GpuBackend,
425    ) -> Result<BenchmarkResult, BenchmarkError> {
426        // Create GPU context
427        let context =
428            GpuContext::new(backend).map_err(|e| BenchmarkError::SetupFailed(e.to_string()))?;
429
430        // Warmup
431        for _ in 0..self.config.warmup_iterations {
432            self.execute_gpu_operation(operation, problemsize, datatype, backend)?;
433        }
434
435        // Benchmark
436        let mut execution_times = Vec::new();
437        for _ in 0..self.config.benchmark_iterations {
438            let start = Instant::now();
439            self.execute_gpu_operation(operation, problemsize, datatype, backend)?;
440            execution_times.push(start.elapsed());
441        }
442
443        let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
444        let time_stddev = self.calculate_stddev(&execution_times, avg_time);
445
446        Ok(BenchmarkResult {
447            operation,
448            platform: ComputePlatform::Gpu(backend),
449            problemsize,
450            datatype,
451            execution_time: avg_time,
452            time_stddev,
453            throughput: self.calculate_throughput(operation, problemsize, avg_time),
454            memorybandwidth: self.calculate_memorybandwidth(
455                operation,
456                problemsize,
457                datatype,
458                avg_time,
459            ),
460            energy_efficiency: None,
461            peak_memory_usage: self.estimate_memory_usage(operation, problemsize, datatype),
462            correctness_verified: self.config.verify_correctness,
463        })
464    }
465
466    /// Execute CPU operation (placeholder implementation)
467    fn execute_cpu_operation(
468        &self,
469        operation: BenchmarkOperation,
470        problemsize: ProblemSize,
471        datatype: DataType,
472    ) -> Result<(), BenchmarkError> {
473        match operation {
474            BenchmarkOperation::MatrixMultiply => {
475                let n = problemsize.matrix_size();
476                // Simulate matrix multiplication
477                let _result = (0..n * n).map(|i| i as f64).sum::<f64>();
478                Ok(())
479            }
480            BenchmarkOperation::VectorOperations => {
481                let n = problemsize.vector_size();
482                // Simulate vector operation
483                let _result = (0..n).map(|i| (i as f64).sin()).sum::<f64>();
484                Ok(())
485            }
486            _ => {
487                // Other operations would be implemented similarly
488                std::thread::sleep(Duration::from_millis(1));
489                Ok(())
490            }
491        }
492    }
493
494    /// Execute GPU operation (placeholder implementation)
495    fn execute_gpu_operation(
496        &self,
497        operation: BenchmarkOperation,
498        problemsize: ProblemSize,
499        datatype: DataType,
500        _backend: GpuBackend,
501    ) -> Result<(), BenchmarkError> {
502        match operation {
503            BenchmarkOperation::MatrixMultiply => {
504                let _n = problemsize.matrix_size();
505                // Would launch GPU kernel for matrix multiplication
506                std::thread::sleep(Duration::from_micros(100));
507                Ok(())
508            }
509            BenchmarkOperation::VectorOperations => {
510                let _n = problemsize.vector_size();
511                // Would launch GPU kernel for vector operations
512                std::thread::sleep(Duration::from_micros(50));
513                Ok(())
514            }
515            _ => {
516                // Other operations would be implemented similarly
517                std::thread::sleep(Duration::from_micros(100));
518                Ok(())
519            }
520        }
521    }
522
523    /// Generate comparison results
524    fn generate_comparisons(&mut self) -> Result<(), BenchmarkError> {
525        let mut grouped_results: HashMap<
526            (BenchmarkOperation, ProblemSize, DataType),
527            Vec<&BenchmarkResult>,
528        > = HashMap::new();
529
530        // Group results by operation, size, and data type
531        for result in &self.results {
532            let key = (result.operation, result.problemsize, result.datatype);
533            grouped_results.entry(key).or_default().push(result);
534        }
535
536        // Generate comparisons for each group
537        for ((operation, problemsize, datatype), results) in grouped_results {
538            if results.len() > 1 {
539                let comparison =
540                    self.create_comparison(operation, problemsize, datatype, &results)?;
541                self.comparisons.push(comparison);
542            }
543        }
544
545        Ok(())
546    }
547
548    /// Create a comparison from results
549    fn create_comparison(
550        &self,
551        operation: BenchmarkOperation,
552        problemsize: ProblemSize,
553        datatype: DataType,
554        results: &[&BenchmarkResult],
555    ) -> Result<BenchmarkComparison, BenchmarkError> {
556        let mut platform_results = HashMap::new();
557        let mut cpu_time = None;
558
559        for result in results {
560            platform_results.insert(result.platform, (*result).clone());
561            if matches!(result.platform, ComputePlatform::Cpu) {
562                cpu_time = Some(result.execution_time);
563            }
564        }
565
566        let mut speedups = HashMap::new();
567        let mut energy_comparison = HashMap::new();
568
569        if let Some(cpu_time) = cpu_time {
570            for result in results {
571                if let ComputePlatform::Gpu(backend) = result.platform {
572                    let speedup = cpu_time.as_secs_f64() / result.execution_time.as_secs_f64();
573                    speedups.insert(backend, speedup);
574                }
575
576                // Energy comparison (placeholder)
577                energy_comparison.insert(result.platform, 1.0);
578            }
579        }
580
581        let recommendation = self.generate_recommendation(operation, &platform_results, &speedups);
582
583        Ok(BenchmarkComparison {
584            operation,
585            problemsize,
586            datatype,
587            platform_results,
588            speedups,
589            energy_comparison,
590            recommendation,
591        })
592    }
593
594    /// Generate platform recommendation
595    fn generate_recommendation(
596        &self,
597        operation: BenchmarkOperation,
598        platform_results: &HashMap<ComputePlatform, BenchmarkResult>,
599        speedups: &HashMap<GpuBackend, f64>,
600    ) -> PlatformRecommendation {
601        // Find best GPU speedup
602        let best_speedup = speedups.values().fold(0.0f64, |a, &b| a.max(b));
603        let best_backend = speedups
604            .iter()
605            .max_by(|a, b| a.1.partial_cmp(b.1).expect("Operation failed"))
606            .map(|(&backend, _)| backend);
607
608        if best_speedup > 2.0 {
609            if let Some(backend) = best_backend {
610                PlatformRecommendation::Gpu {
611                    backend,
612                    reason: format!("GPU shows {best_speedup:.1}x speedup over CPU"),
613                }
614            } else {
615                PlatformRecommendation::Cpu {
616                    reason: "No significant GPU advantage found".to_string(),
617                }
618            }
619        } else if best_speedup > 1.2 {
620            PlatformRecommendation::Depends {
621                factors: vec![
622                    format!("GPU shows modest {:.1}x speedup", best_speedup),
623                    "Consider data transfer overhead".to_string(),
624                    format!(
625                        "{} may benefit from GPU for larger problems",
626                        operation.name()
627                    ),
628                ],
629            }
630        } else {
631            PlatformRecommendation::Cpu {
632                reason: "CPU performance is competitive or better".to_string(),
633            }
634        }
635    }
636
637    /// Calculate standard deviation of execution times
638    fn calculate_stddev(&self, times: &[Duration], avg: Duration) -> Duration {
639        if times.len() <= 1 {
640            return Duration::ZERO;
641        }
642
643        let variance = times
644            .iter()
645            .map(|&time| {
646                let diff = time.as_secs_f64() - avg.as_secs_f64();
647                diff * diff
648            })
649            .sum::<f64>()
650            / (times.len() - 1) as f64;
651
652        Duration::from_secs_f64(variance.sqrt())
653    }
654
655    /// Calculate throughput for an operation
656    fn calculate_throughput(
657        &self,
658        operation: BenchmarkOperation,
659        problemsize: ProblemSize,
660        time: Duration,
661    ) -> f64 {
662        let ops = match operation {
663            BenchmarkOperation::MatrixMultiply => {
664                let n = problemsize.matrix_size();
665                2 * n * n * n // 2*N^3 operations for N x N matrix multiply
666            }
667            BenchmarkOperation::VectorOperations => {
668                problemsize.vector_size() // One operation per element
669            }
670            _ => problemsize.vector_size(), // Default estimate
671        };
672
673        ops as f64 / time.as_secs_f64()
674    }
675
676    /// Calculate memory bandwidth utilization
677    fn calculate_memorybandwidth(
678        &self,
679        operation: BenchmarkOperation,
680        problemsize: ProblemSize,
681        datatype: DataType,
682        time: Duration,
683    ) -> f64 {
684        let bytes = match operation {
685            BenchmarkOperation::MatrixMultiply => {
686                let n = problemsize.matrix_size();
687                (3 * n * n) * datatype.size_bytes() // A, B, C matrices
688            }
689            BenchmarkOperation::VectorOperations => {
690                problemsize.vector_size() * datatype.size_bytes() * 2 // Read + write
691            }
692            _ => problemsize.vector_size() * datatype.size_bytes() * 2,
693        };
694
695        (bytes as f64) / (time.as_secs_f64() * 1e9) // GB/s
696    }
697
698    /// Estimate memory usage for an operation
699    fn estimate_memory_usage(
700        &self,
701        operation: BenchmarkOperation,
702        problemsize: ProblemSize,
703        datatype: DataType,
704    ) -> usize {
705        match operation {
706            BenchmarkOperation::MatrixMultiply => {
707                let n = problemsize.matrix_size();
708                3 * n * n * datatype.size_bytes() // Three N x N matrices
709            }
710            BenchmarkOperation::VectorOperations => {
711                problemsize.vector_size() * datatype.size_bytes() * 2 // Input + output
712            }
713            _ => problemsize.vector_size() * datatype.size_bytes() * 2,
714        }
715    }
716
717    /// Get all benchmark results
718    pub fn results(&self) -> &[BenchmarkResult] {
719        &self.results
720    }
721
722    /// Get all benchmark comparisons
723    pub fn comparisons(&self) -> &[BenchmarkComparison] {
724        &self.comparisons
725    }
726
727    /// Generate a summary report
728    pub fn generate_report(&self) -> BenchmarkReport {
729        BenchmarkReport::new(&self.results, &self.comparisons)
730    }
731}
732
/// Comprehensive benchmark report: a summary plus owned copies of the
/// results and comparisons it was built from.
#[derive(Debug, Clone)]
pub struct BenchmarkReport {
    /// Summary statistics
    pub summary: BenchmarkSummary,
    /// Detailed results
    pub detailed_results: Vec<BenchmarkResult>,
    /// Platform comparisons
    pub comparisons: Vec<BenchmarkComparison>,
    /// Recommendations by operation category; categories without any
    /// comparison have no entry
    pub category_recommendations: HashMap<BenchmarkCategory, String>,
}
745
746impl BenchmarkReport {
747    fn new(results: &[BenchmarkResult], comparisons: &[BenchmarkComparison]) -> Self {
748        let summary = BenchmarkSummary::from_results(results);
749        let category_recommendations = Self::generate_category_recommendations(comparisons);
750
751        Self {
752            summary,
753            detailed_results: results.to_vec(),
754            comparisons: comparisons.to_vec(),
755            category_recommendations,
756        }
757    }
758
759    fn generate_category_recommendations(
760        comparisons: &[BenchmarkComparison],
761    ) -> HashMap<BenchmarkCategory, String> {
762        let mut recommendations = HashMap::new();
763
764        // Group by category and analyze
765        for category in [
766            BenchmarkCategory::LinearAlgebra,
767            BenchmarkCategory::ElementWise,
768            BenchmarkCategory::SignalProcessing,
769            BenchmarkCategory::ImageProcessing,
770            BenchmarkCategory::GeneralCompute,
771        ] {
772            let category_comps: Vec<_> = comparisons
773                .iter()
774                .filter(|c| c.operation.category() == category)
775                .collect();
776
777            if !category_comps.is_empty() {
778                let gpu_wins = category_comps
779                    .iter()
780                    .filter(|c| matches!(c.recommendation, PlatformRecommendation::Gpu { .. }))
781                    .count();
782
783                let recommendation = if gpu_wins > category_comps.len() / 2 {
784                    format!(
785                        "GPU recommended for most {name} operations",
786                        name = category.name()
787                    )
788                } else {
789                    format!(
790                        "CPU competitive for {name} operations",
791                        name = category.name()
792                    )
793                };
794
795                recommendations.insert(category, recommendation);
796            }
797        }
798
799        recommendations
800    }
801}
802
803impl BenchmarkCategory {
804    fn name(&self) -> &'static str {
805        match self {
806            BenchmarkCategory::LinearAlgebra => "linear algebra",
807            BenchmarkCategory::ElementWise => "element-wise",
808            BenchmarkCategory::SignalProcessing => "signal processing",
809            BenchmarkCategory::ImageProcessing => "image processing",
810            BenchmarkCategory::GeneralCompute => "general compute",
811        }
812    }
813}
814
/// Summary statistics for benchmark results.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    /// Total number of benchmarks run (CPU and GPU results together)
    pub total_benchmarks: usize,
    /// Average CPU execution time (zero when there are no CPU results)
    pub avg_cpu_time: Duration,
    /// Average GPU execution time (zero when there are no GPU results)
    pub avg_gpu_time: Duration,
    /// Overall GPU speedup factor: `avg_cpu_time / avg_gpu_time`, or 1.0
    /// when the average GPU time is zero
    pub overall_speedup: f64,
    /// Best performing platform by operation: the platform of the single
    /// fastest result recorded for that operation
    pub best_platforms: HashMap<BenchmarkOperation, ComputePlatform>,
}
829
830impl BenchmarkSummary {
831    fn from_results(results: &[BenchmarkResult]) -> Self {
832        let total_benchmarks = results.len();
833
834        let cpu_times: Vec<_> = results
835            .iter()
836            .filter(|r| matches!(r.platform, ComputePlatform::Cpu))
837            .map(|r| r.execution_time)
838            .collect();
839
840        let gpu_times: Vec<_> = results
841            .iter()
842            .filter(|r| matches!(r.platform, ComputePlatform::Gpu(_)))
843            .map(|r| r.execution_time)
844            .collect();
845
846        let avg_cpu_time = if !cpu_times.is_empty() {
847            cpu_times.iter().sum::<Duration>() / cpu_times.len() as u32
848        } else {
849            Duration::ZERO
850        };
851
852        let avg_gpu_time = if !gpu_times.is_empty() {
853            gpu_times.iter().sum::<Duration>() / gpu_times.len() as u32
854        } else {
855            Duration::ZERO
856        };
857
858        let overall_speedup = if avg_gpu_time > Duration::ZERO {
859            avg_cpu_time.as_secs_f64() / avg_gpu_time.as_secs_f64()
860        } else {
861            1.0
862        };
863
864        // Find best platform for each operation
865        let mut best_platforms = HashMap::new();
866        let mut operation_results: HashMap<BenchmarkOperation, Vec<&BenchmarkResult>> =
867            HashMap::new();
868
869        for result in results {
870            operation_results
871                .entry(result.operation)
872                .or_default()
873                .push(result);
874        }
875
876        for (operation, op_results) in operation_results {
877            if let Some(best) = op_results.iter().min_by_key(|r| r.execution_time) {
878                best_platforms.insert(operation, best.platform);
879            }
880        }
881
882        Self {
883            total_benchmarks,
884            avg_cpu_time,
885            avg_gpu_time,
886            overall_speedup,
887            best_platforms,
888        }
889    }
890}
891
#[cfg(test)]
mod tests {
    use super::*;

    /// Display names of two representative operations.
    #[test]
    fn test_benchmark_operation_name() {
        let expected = [
            (BenchmarkOperation::MatrixMultiply, "Matrix Multiplication"),
            (BenchmarkOperation::VectorOperations, "Vector Operations"),
        ];

        for (operation, name) in expected {
            assert_eq!(operation.name(), name);
        }
    }

    /// Matrix dimensions for preset and custom sizes.
    #[test]
    fn test_problemsizematrix() {
        let expected = [
            (ProblemSize::Small, 64),
            (ProblemSize::Large, 2048),
            (ProblemSize::Custom(1000), 1000),
        ];

        for (size, dimension) in expected {
            assert_eq!(size.matrix_size(), dimension);
        }
    }

    /// Element widths of the floating-point data types.
    #[test]
    fn test_datatype_size() {
        let expected = [
            (DataType::Float32, 4),
            (DataType::Float64, 8),
            (DataType::Float16, 2),
        ];

        for (datatype, bytes) in expected {
            assert_eq!(datatype.size_bytes(), bytes);
        }
    }

    /// Platform names for the CPU and for a GPU backend.
    #[test]
    fn test_compute_platformname() {
        let cpu = ComputePlatform::Cpu;
        let cuda = ComputePlatform::Gpu(GpuBackend::Cuda);

        assert_eq!(cpu.name(), "CPU");
        assert_eq!(cuda.name(), "GPU (CUDA)");
    }

    /// The default configuration is non-empty and verifies correctness.
    #[test]
    fn test_benchmark_config_default() {
        let BenchmarkConfig {
            operations,
            problemsizes,
            verify_correctness,
            ..
        } = BenchmarkConfig::default();

        assert!(!operations.is_empty());
        assert!(!problemsizes.is_empty());
        assert!(verify_correctness);
    }

    /// A freshly created suite holds no results and no comparisons.
    #[test]
    fn test_benchmark_suite_creation() {
        let suite = BenchmarkSuite::new(BenchmarkConfig::default());

        assert!(suite.results().is_empty());
        assert!(suite.comparisons().is_empty());
    }
}