use crate::gpu::{GpuBackend, GpuContext, GpuError};
use std::collections::HashMap;
use std::time::{Duration, Instant};
use thiserror::Error;
/// Errors produced while setting up, running, or analysing benchmarks.
#[derive(Error, Debug)]
pub enum BenchmarkError {
    /// Preparing a benchmark environment failed (e.g. GPU context creation).
    #[error("Benchmark setup failed: {0}")]
    SetupFailed(String),
    /// A benchmark workload failed while running.
    #[error("Benchmark execution failed: {0}")]
    ExecutionFailed(String),
    /// The supplied `BenchmarkConfig` is not usable.
    #[error("Invalid benchmark configuration: {0}")]
    InvalidConfiguration(String),
    /// Cross-platform result comparison could not be carried out.
    #[error("Results comparison failed: {0}")]
    ComparisonFailed(String),
    /// A lower-level GPU error, converted automatically via `#[from]`.
    #[error("GPU error: {0}")]
    GpuError(#[from] GpuError),
}
/// The individual workloads the suite knows how to benchmark.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BenchmarkOperation {
    MatrixMultiply,
    VectorOperations,
    FastFourierTransform,
    Convolution,
    Reduction,
    Sorting,
    RandomGeneration,
    ImageProcessing,
    SignalProcessing,
    Statistics,
    LinearAlgebra,
    SparseMatrix,
}
impl BenchmarkOperation {
    /// Human-readable display name for this operation, used in report text.
    pub const fn name(&self) -> &'static str {
        match self {
            Self::MatrixMultiply => "Matrix Multiplication",
            Self::VectorOperations => "Vector Operations",
            Self::FastFourierTransform => "Fast Fourier Transform",
            Self::Convolution => "Convolution",
            Self::Reduction => "Reduction",
            Self::Sorting => "Sorting",
            Self::RandomGeneration => "Random Generation",
            Self::ImageProcessing => "Image Processing",
            Self::SignalProcessing => "Signal Processing",
            Self::Statistics => "Statistics",
            Self::LinearAlgebra => "Linear Algebra",
            Self::SparseMatrix => "Sparse Matrix",
        }
    }

    /// Coarse category this operation belongs to, used when summarising results.
    pub fn category(&self) -> BenchmarkCategory {
        match self {
            Self::MatrixMultiply | Self::LinearAlgebra | Self::SparseMatrix => {
                BenchmarkCategory::LinearAlgebra
            }
            Self::VectorOperations | Self::Reduction => BenchmarkCategory::ElementWise,
            Self::FastFourierTransform | Self::Convolution | Self::SignalProcessing => {
                BenchmarkCategory::SignalProcessing
            }
            Self::ImageProcessing => BenchmarkCategory::ImageProcessing,
            Self::Sorting | Self::RandomGeneration | Self::Statistics => {
                BenchmarkCategory::GeneralCompute
            }
        }
    }
}
/// Coarse grouping of operations, used for per-category recommendations in reports.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BenchmarkCategory {
    LinearAlgebra,
    ElementWise,
    SignalProcessing,
    ImageProcessing,
    GeneralCompute,
}
/// Describes which benchmarks to run and how to run them.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Operations to benchmark.
    pub operations: Vec<BenchmarkOperation>,
    /// Problem sizes to sweep for each operation.
    pub problemsizes: Vec<ProblemSize>,
    /// Untimed iterations run before measurement begins.
    pub warmup_iterations: usize,
    /// Timed iterations that contribute to the reported statistics.
    pub benchmark_iterations: usize,
    /// Element types to sweep.
    pub datatypes: Vec<DataType>,
    /// GPU backends to try; unavailable backends are skipped at run time.
    pub gpu_backends: Vec<GpuBackend>,
    /// Whether GPU results should be checked against the CPU reference.
    pub verify_correctness: bool,
    /// Numeric tolerance intended for correctness verification.
    pub tolerance: f64,
}
impl Default for BenchmarkConfig {
    /// A sensible default plan: three common operations over small/medium/large
    /// problems, two float precisions, and both major GPU backends.
    fn default() -> Self {
        let operations = vec![
            BenchmarkOperation::MatrixMultiply,
            BenchmarkOperation::VectorOperations,
            BenchmarkOperation::Reduction,
        ];
        let problemsizes = vec![ProblemSize::Small, ProblemSize::Medium, ProblemSize::Large];
        let datatypes = vec![DataType::Float32, DataType::Float64];
        let gpu_backends = vec![GpuBackend::Cuda, GpuBackend::Rocm];
        Self {
            operations,
            problemsizes,
            warmup_iterations: 3,
            benchmark_iterations: 10,
            datatypes,
            gpu_backends,
            verify_correctness: true,
            tolerance: 1e-6,
        }
    }
}
/// Nominal benchmark problem sizes; `Custom` carries an explicit element count
/// that is used verbatim for both matrix side length and vector length.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProblemSize {
    Small,
    Medium,
    Large,
    ExtraLarge,
    Custom(usize),
}

impl ProblemSize {
    /// Side length `n` of the square `n x n` matrices used at this size.
    pub fn matrix_size(&self) -> usize {
        match *self {
            Self::Small => 64,
            Self::Medium => 512,
            Self::Large => 2048,
            Self::ExtraLarge => 8192,
            Self::Custom(n) => n,
        }
    }

    /// Element count of the vectors used at this size.
    pub fn vector_size(&self) -> usize {
        match *self {
            Self::Small => 1 << 10,        // 1 Ki elements
            Self::Medium => 1 << 20,       // 1 Mi elements
            Self::Large => 64 << 20,       // 64 Mi elements
            Self::ExtraLarge => 512 << 20, // 512 Mi elements
            Self::Custom(n) => n,
        }
    }
}
/// Element types a benchmark can run over.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataType {
    Float32,
    Float64,
    Float16,
    Int32,
    UInt32,
}

impl DataType {
    /// Width of one element of this type, in bytes.
    pub fn size_bytes(&self) -> usize {
        match self {
            Self::Float16 => 2,
            Self::Float32 => 4,
            Self::Int32 => 4,
            Self::UInt32 => 4,
            Self::Float64 => 8,
        }
    }

    /// Short Rust-style type label (e.g. "f32"), suitable for report output.
    pub const fn name(&self) -> &'static str {
        match self {
            Self::Float32 => "f32",
            Self::Float64 => "f64",
            Self::Float16 => "f16",
            Self::Int32 => "i32",
            Self::UInt32 => "u32",
        }
    }
}
/// A compute device a benchmark can run on: the host CPU or a specific GPU backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ComputePlatform {
    Cpu,
    Gpu(GpuBackend),
}
impl ComputePlatform {
pub fn name(&self) -> String {
match self {
ComputePlatform::Cpu => "CPU".to_string(),
ComputePlatform::Gpu(backend) => format!("GPU ({backend})"),
}
}
}
/// Measurements from benchmarking one operation on one platform.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    pub operation: BenchmarkOperation,
    pub platform: ComputePlatform,
    pub problemsize: ProblemSize,
    pub datatype: DataType,
    /// Mean wall-clock time across the timed iterations.
    pub execution_time: Duration,
    /// Sample standard deviation of the timed iterations.
    pub time_stddev: Duration,
    /// Estimated operations per second.
    pub throughput: f64,
    /// Estimated memory bandwidth in GB/s.
    pub memorybandwidth: f64,
    /// Energy efficiency metric; `None` when not measured.
    pub energy_efficiency: Option<f64>,
    /// Estimated peak working-set size in bytes.
    pub peak_memory_usage: usize,
    /// Whether the result was treated as verified against a reference.
    pub correctness_verified: bool,
}
/// Cross-platform comparison for one (operation, problem size, data type) combination.
#[derive(Debug, Clone)]
pub struct BenchmarkComparison {
    pub operation: BenchmarkOperation,
    pub problemsize: ProblemSize,
    pub datatype: DataType,
    /// The underlying result for each platform that was measured.
    pub platform_results: HashMap<ComputePlatform, BenchmarkResult>,
    /// GPU speedup relative to the CPU baseline, per backend (>1.0 means GPU faster).
    pub speedups: HashMap<GpuBackend, f64>,
    /// Relative energy figures per platform (currently placeholder values).
    pub energy_comparison: HashMap<ComputePlatform, f64>,
    /// Which platform this comparison suggests using.
    pub recommendation: PlatformRecommendation,
}
/// Outcome of a comparison: which platform to prefer, with a human-readable rationale.
#[derive(Debug, Clone)]
pub enum PlatformRecommendation {
    /// CPU is the better (or safe) choice.
    Cpu { reason: String },
    /// A specific GPU backend showed a clear advantage.
    Gpu { backend: GpuBackend, reason: String },
    /// No clear winner; lists the factors to weigh.
    Depends { factors: Vec<String> },
}
/// Orchestrates benchmark execution and accumulates results and comparisons.
pub struct BenchmarkSuite {
    // What to run and how.
    config: BenchmarkConfig,
    // Raw per-run measurements, appended as benchmarks complete.
    results: Vec<BenchmarkResult>,
    // Cross-platform comparisons, populated by `run_all`.
    comparisons: Vec<BenchmarkComparison>,
}
impl BenchmarkSuite {
pub fn new(config: BenchmarkConfig) -> Self {
Self {
config,
results: Vec::new(),
comparisons: Vec::new(),
}
}
pub fn run_all(&mut self) -> Result<(), BenchmarkError> {
let operations = self.config.operations.clone();
let problemsizes = self.config.problemsizes.clone();
let datatypes = self.config.datatypes.clone();
for operation in operations {
for problemsize in problemsizes.iter() {
for datatype in datatypes.iter() {
self.run_operation_benchmark(operation, *problemsize, *datatype)?;
}
}
}
self.generate_comparisons()?;
Ok(())
}
fn run_operation_benchmark(
&mut self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
) -> Result<(), BenchmarkError> {
let cpu_result = self.run_cpu_benchmark(operation, problemsize, datatype)?;
self.results.push(cpu_result);
for &backend in &self.config.gpu_backends {
if backend.is_available() {
match self.run_gpu_benchmark(operation, problemsize, datatype, backend) {
Ok(gpu_result) => self.results.push(gpu_result),
Err(e) => {
eprintln!("GPU benchmark failed for {backend}: {e}");
}
}
}
}
Ok(())
}
fn run_cpu_benchmark(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
) -> Result<BenchmarkResult, BenchmarkError> {
for _ in 0..self.config.warmup_iterations {
self.execute_cpu_operation(operation, problemsize, datatype)?;
}
let mut execution_times = Vec::new();
for _ in 0..self.config.benchmark_iterations {
let start = Instant::now();
self.execute_cpu_operation(operation, problemsize, datatype)?;
execution_times.push(start.elapsed());
}
let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
let time_stddev = self.calculate_stddev(&execution_times, avg_time);
Ok(BenchmarkResult {
operation,
platform: ComputePlatform::Cpu,
problemsize,
datatype,
execution_time: avg_time,
time_stddev,
throughput: self.calculate_throughput(operation, problemsize, avg_time),
memorybandwidth: self.calculate_memorybandwidth(
operation,
problemsize,
datatype,
avg_time,
),
energy_efficiency: None, peak_memory_usage: self.estimate_memory_usage(operation, problemsize, datatype),
correctness_verified: true, })
}
fn run_gpu_benchmark(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
backend: GpuBackend,
) -> Result<BenchmarkResult, BenchmarkError> {
let context =
GpuContext::new(backend).map_err(|e| BenchmarkError::SetupFailed(e.to_string()))?;
for _ in 0..self.config.warmup_iterations {
self.execute_gpu_operation(operation, problemsize, datatype, backend)?;
}
let mut execution_times = Vec::new();
for _ in 0..self.config.benchmark_iterations {
let start = Instant::now();
self.execute_gpu_operation(operation, problemsize, datatype, backend)?;
execution_times.push(start.elapsed());
}
let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
let time_stddev = self.calculate_stddev(&execution_times, avg_time);
Ok(BenchmarkResult {
operation,
platform: ComputePlatform::Gpu(backend),
problemsize,
datatype,
execution_time: avg_time,
time_stddev,
throughput: self.calculate_throughput(operation, problemsize, avg_time),
memorybandwidth: self.calculate_memorybandwidth(
operation,
problemsize,
datatype,
avg_time,
),
energy_efficiency: None,
peak_memory_usage: self.estimate_memory_usage(operation, problemsize, datatype),
correctness_verified: self.config.verify_correctness,
})
}
fn execute_cpu_operation(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
) -> Result<(), BenchmarkError> {
match operation {
BenchmarkOperation::MatrixMultiply => {
let n = problemsize.matrix_size();
let _result = (0..n * n).map(|i| i as f64).sum::<f64>();
Ok(())
}
BenchmarkOperation::VectorOperations => {
let n = problemsize.vector_size();
let _result = (0..n).map(|i| (i as f64).sin()).sum::<f64>();
Ok(())
}
_ => {
std::thread::sleep(Duration::from_millis(1));
Ok(())
}
}
}
fn execute_gpu_operation(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
_backend: GpuBackend,
) -> Result<(), BenchmarkError> {
match operation {
BenchmarkOperation::MatrixMultiply => {
let _n = problemsize.matrix_size();
std::thread::sleep(Duration::from_micros(100));
Ok(())
}
BenchmarkOperation::VectorOperations => {
let _n = problemsize.vector_size();
std::thread::sleep(Duration::from_micros(50));
Ok(())
}
_ => {
std::thread::sleep(Duration::from_micros(100));
Ok(())
}
}
}
fn generate_comparisons(&mut self) -> Result<(), BenchmarkError> {
let mut grouped_results: HashMap<
(BenchmarkOperation, ProblemSize, DataType),
Vec<&BenchmarkResult>,
> = HashMap::new();
for result in &self.results {
let key = (result.operation, result.problemsize, result.datatype);
grouped_results.entry(key).or_default().push(result);
}
for ((operation, problemsize, datatype), results) in grouped_results {
if results.len() > 1 {
let comparison =
self.create_comparison(operation, problemsize, datatype, &results)?;
self.comparisons.push(comparison);
}
}
Ok(())
}
fn create_comparison(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
results: &[&BenchmarkResult],
) -> Result<BenchmarkComparison, BenchmarkError> {
let mut platform_results = HashMap::new();
let mut cpu_time = None;
for result in results {
platform_results.insert(result.platform, (*result).clone());
if matches!(result.platform, ComputePlatform::Cpu) {
cpu_time = Some(result.execution_time);
}
}
let mut speedups = HashMap::new();
let mut energy_comparison = HashMap::new();
if let Some(cpu_time) = cpu_time {
for result in results {
if let ComputePlatform::Gpu(backend) = result.platform {
let speedup = cpu_time.as_secs_f64() / result.execution_time.as_secs_f64();
speedups.insert(backend, speedup);
}
energy_comparison.insert(result.platform, 1.0);
}
}
let recommendation = self.generate_recommendation(operation, &platform_results, &speedups);
Ok(BenchmarkComparison {
operation,
problemsize,
datatype,
platform_results,
speedups,
energy_comparison,
recommendation,
})
}
fn generate_recommendation(
&self,
operation: BenchmarkOperation,
platform_results: &HashMap<ComputePlatform, BenchmarkResult>,
speedups: &HashMap<GpuBackend, f64>,
) -> PlatformRecommendation {
let best_speedup = speedups.values().fold(0.0f64, |a, &b| a.max(b));
let best_backend = speedups
.iter()
.max_by(|a, b| a.1.partial_cmp(b.1).expect("Operation failed"))
.map(|(&backend, _)| backend);
if best_speedup > 2.0 {
if let Some(backend) = best_backend {
PlatformRecommendation::Gpu {
backend,
reason: format!("GPU shows {best_speedup:.1}x speedup over CPU"),
}
} else {
PlatformRecommendation::Cpu {
reason: "No significant GPU advantage found".to_string(),
}
}
} else if best_speedup > 1.2 {
PlatformRecommendation::Depends {
factors: vec![
format!("GPU shows modest {:.1}x speedup", best_speedup),
"Consider data transfer overhead".to_string(),
format!(
"{} may benefit from GPU for larger problems",
operation.name()
),
],
}
} else {
PlatformRecommendation::Cpu {
reason: "CPU performance is competitive or better".to_string(),
}
}
}
fn calculate_stddev(&self, times: &[Duration], avg: Duration) -> Duration {
if times.len() <= 1 {
return Duration::ZERO;
}
let variance = times
.iter()
.map(|&time| {
let diff = time.as_secs_f64() - avg.as_secs_f64();
diff * diff
})
.sum::<f64>()
/ (times.len() - 1) as f64;
Duration::from_secs_f64(variance.sqrt())
}
fn calculate_throughput(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
time: Duration,
) -> f64 {
let ops = match operation {
BenchmarkOperation::MatrixMultiply => {
let n = problemsize.matrix_size();
2 * n * n * n }
BenchmarkOperation::VectorOperations => {
problemsize.vector_size() }
_ => problemsize.vector_size(), };
ops as f64 / time.as_secs_f64()
}
fn calculate_memorybandwidth(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
time: Duration,
) -> f64 {
let bytes = match operation {
BenchmarkOperation::MatrixMultiply => {
let n = problemsize.matrix_size();
(3 * n * n) * datatype.size_bytes() }
BenchmarkOperation::VectorOperations => {
problemsize.vector_size() * datatype.size_bytes() * 2 }
_ => problemsize.vector_size() * datatype.size_bytes() * 2,
};
(bytes as f64) / (time.as_secs_f64() * 1e9) }
fn estimate_memory_usage(
&self,
operation: BenchmarkOperation,
problemsize: ProblemSize,
datatype: DataType,
) -> usize {
match operation {
BenchmarkOperation::MatrixMultiply => {
let n = problemsize.matrix_size();
3 * n * n * datatype.size_bytes() }
BenchmarkOperation::VectorOperations => {
problemsize.vector_size() * datatype.size_bytes() * 2 }
_ => problemsize.vector_size() * datatype.size_bytes() * 2,
}
}
pub fn results(&self) -> &[BenchmarkResult] {
&self.results
}
pub fn comparisons(&self) -> &[BenchmarkComparison] {
&self.comparisons
}
pub fn generate_report(&self) -> BenchmarkReport {
BenchmarkReport::new(&self.results, &self.comparisons)
}
}
/// Full benchmark report: summary statistics, raw results, comparisons, and
/// per-category platform recommendations.
#[derive(Debug, Clone)]
pub struct BenchmarkReport {
    pub summary: BenchmarkSummary,
    pub detailed_results: Vec<BenchmarkResult>,
    pub comparisons: Vec<BenchmarkComparison>,
    /// One recommendation sentence per benchmark category that has data.
    pub category_recommendations: HashMap<BenchmarkCategory, String>,
}
impl BenchmarkReport {
    /// Assembles a report from raw results and cross-platform comparisons.
    fn new(results: &[BenchmarkResult], comparisons: &[BenchmarkComparison]) -> Self {
        Self {
            summary: BenchmarkSummary::from_results(results),
            detailed_results: results.to_vec(),
            comparisons: comparisons.to_vec(),
            category_recommendations: Self::generate_category_recommendations(comparisons),
        }
    }

    /// For each benchmark category with data, recommends GPU or CPU depending on
    /// whether a strict majority of that category's comparisons favoured the GPU.
    fn generate_category_recommendations(
        comparisons: &[BenchmarkComparison],
    ) -> HashMap<BenchmarkCategory, String> {
        const ALL_CATEGORIES: [BenchmarkCategory; 5] = [
            BenchmarkCategory::LinearAlgebra,
            BenchmarkCategory::ElementWise,
            BenchmarkCategory::SignalProcessing,
            BenchmarkCategory::ImageProcessing,
            BenchmarkCategory::GeneralCompute,
        ];
        let mut recommendations = HashMap::new();
        for category in ALL_CATEGORIES {
            let total = comparisons
                .iter()
                .filter(|c| c.operation.category() == category)
                .count();
            if total == 0 {
                // No measurements for this category; omit it from the map.
                continue;
            }
            let gpu_wins = comparisons
                .iter()
                .filter(|c| c.operation.category() == category)
                .filter(|c| matches!(c.recommendation, PlatformRecommendation::Gpu { .. }))
                .count();
            let text = if gpu_wins > total / 2 {
                format!("GPU recommended for most {} operations", category.name())
            } else {
                format!("CPU competitive for {} operations", category.name())
            };
            recommendations.insert(category, text);
        }
        recommendations
    }
}
impl BenchmarkCategory {
fn name(&self) -> &'static str {
match self {
BenchmarkCategory::LinearAlgebra => "linear algebra",
BenchmarkCategory::ElementWise => "element-wise",
BenchmarkCategory::SignalProcessing => "signal processing",
BenchmarkCategory::ImageProcessing => "image processing",
BenchmarkCategory::GeneralCompute => "general compute",
}
}
}
/// Aggregate statistics over a set of benchmark results.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    /// Number of raw results the summary was built from.
    pub total_benchmarks: usize,
    /// Mean execution time across all CPU results (zero when there are none).
    pub avg_cpu_time: Duration,
    /// Mean execution time across all GPU results (zero when there are none).
    pub avg_gpu_time: Duration,
    /// Ratio `avg_cpu_time / avg_gpu_time`; 1.0 when no GPU results exist.
    pub overall_speedup: f64,
    /// Fastest platform observed for each operation.
    pub best_platforms: HashMap<BenchmarkOperation, ComputePlatform>,
}
impl BenchmarkSummary {
    /// Aggregates raw results into per-platform average times, an overall
    /// CPU-vs-GPU speedup ratio, and the fastest platform per operation.
    fn from_results(results: &[BenchmarkResult]) -> Self {
        // Mean of a duration slice; zero for an empty slice.
        fn mean(times: &[Duration]) -> Duration {
            if times.is_empty() {
                Duration::ZERO
            } else {
                times.iter().sum::<Duration>() / times.len() as u32
            }
        }
        let cpu_times: Vec<Duration> = results
            .iter()
            .filter(|r| matches!(r.platform, ComputePlatform::Cpu))
            .map(|r| r.execution_time)
            .collect();
        let gpu_times: Vec<Duration> = results
            .iter()
            .filter(|r| matches!(r.platform, ComputePlatform::Gpu(_)))
            .map(|r| r.execution_time)
            .collect();
        let avg_cpu_time = mean(&cpu_times);
        let avg_gpu_time = mean(&gpu_times);
        // Ratio of the average times; neutral 1.0 when there are no GPU samples.
        let overall_speedup = if avg_gpu_time > Duration::ZERO {
            avg_cpu_time.as_secs_f64() / avg_gpu_time.as_secs_f64()
        } else {
            1.0
        };
        // Track the fastest result seen per operation; strict `<` keeps the
        // first result on ties, matching a min-by-key scan over input order.
        let mut fastest: HashMap<BenchmarkOperation, &BenchmarkResult> = HashMap::new();
        for result in results {
            fastest
                .entry(result.operation)
                .and_modify(|best| {
                    if result.execution_time < best.execution_time {
                        *best = result;
                    }
                })
                .or_insert(result);
        }
        let best_platforms = fastest
            .into_iter()
            .map(|(operation, best)| (operation, best.platform))
            .collect();
        Self {
            total_benchmarks: results.len(),
            avg_cpu_time,
            avg_gpu_time,
            overall_speedup,
            best_platforms,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Display names must match the human-readable strings used in report text.
    #[test]
    fn test_benchmark_operation_name() {
        assert_eq!(
            BenchmarkOperation::MatrixMultiply.name(),
            "Matrix Multiplication"
        );
        assert_eq!(
            BenchmarkOperation::VectorOperations.name(),
            "Vector Operations"
        );
    }

    // Matrix side lengths for the nominal sizes, and pass-through for Custom.
    #[test]
    fn test_problemsizematrix() {
        assert_eq!(ProblemSize::Small.matrix_size(), 64);
        assert_eq!(ProblemSize::Large.matrix_size(), 2048);
        assert_eq!(ProblemSize::Custom(1000).matrix_size(), 1000);
    }

    // Per-element byte widths for each data type.
    #[test]
    fn test_datatype_size() {
        assert_eq!(DataType::Float32.size_bytes(), 4);
        assert_eq!(DataType::Float64.size_bytes(), 8);
        assert_eq!(DataType::Float16.size_bytes(), 2);
    }

    // Platform labels; the GPU form embeds the backend's Display output.
    #[test]
    fn test_compute_platformname() {
        assert_eq!(ComputePlatform::Cpu.name(), "CPU");
        assert_eq!(ComputePlatform::Gpu(GpuBackend::Cuda).name(), "GPU (CUDA)");
    }

    // The default config should provide a non-trivial, verification-enabled plan.
    #[test]
    fn test_benchmark_config_default() {
        let config = BenchmarkConfig::default();
        assert!(!config.operations.is_empty());
        assert!(!config.problemsizes.is_empty());
        assert!(config.verify_correctness);
    }

    // A freshly created suite has recorded nothing yet.
    #[test]
    fn test_benchmark_suite_creation() {
        let config = BenchmarkConfig::default();
        let suite = BenchmarkSuite::new(config);
        assert!(suite.results().is_empty());
        assert!(suite.comparisons().is_empty());
    }
}