use crate::array::Array;
use crate::error::{NumRs2Error, Result};
use crate::gpu::array::GpuArray;
use crate::gpu::context::GpuContextRef;
use crate::gpu::linalg;
use scirs2_core::ndarray::Array2;
use std::time::{Duration, Instant};
/// Knobs controlling how a benchmark is executed and what it reports.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of untimed runs executed before measurement starts.
    pub warmup_iterations: usize,
    /// Number of timed runs; reported times are the mean over these runs.
    pub benchmark_iterations: usize,
    /// When `true`, a CPU reference implementation is timed as well.
    pub include_cpu: bool,
    /// When `true`, host-to-GPU and GPU-to-host transfer times are reported.
    pub measure_transfers: bool,
}

impl Default for BenchmarkConfig {
    /// Returns the stock configuration: 3 warm-up runs, 10 timed runs,
    /// with both the CPU comparison and transfer measurement switched on.
    fn default() -> Self {
        let (warmup_iterations, benchmark_iterations) = (3, 10);
        BenchmarkConfig {
            warmup_iterations,
            benchmark_iterations,
            include_cpu: true,
            measure_transfers: true,
        }
    }
}
/// Timings and workload size collected by a single benchmark run.
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    /// Mean GPU compute time per iteration, in milliseconds.
    pub gpu_time_ms: f64,
    /// Mean CPU time per iteration in milliseconds, if the CPU was benchmarked.
    pub cpu_time_ms: Option<f64>,
    /// Host-to-GPU transfer time in milliseconds, if measured.
    pub transfer_to_gpu_ms: Option<f64>,
    /// GPU-to-host transfer time in milliseconds, if measured.
    pub transfer_from_gpu_ms: Option<f64>,
    /// Number of arithmetic operations performed by one iteration.
    pub operations: u64,
    /// Total number of bytes touched by one iteration.
    pub data_size_bytes: u64,
}

impl BenchmarkResults {
    /// Converts this run's operation count and a duration in milliseconds
    /// into billions of operations per second.
    fn gflops_at(&self, elapsed_ms: f64) -> f64 {
        (self.operations as f64) / (elapsed_ms * 1_000_000.0)
    }

    /// Ratio of CPU time to GPU compute time, or `None` without a CPU timing.
    pub fn speedup(&self) -> Option<f64> {
        let cpu_ms = self.cpu_time_ms?;
        Some(cpu_ms / self.gpu_time_ms)
    }

    /// GPU throughput in GFLOPS, based on the compute time only.
    pub fn gpu_gflops(&self) -> f64 {
        self.gflops_at(self.gpu_time_ms)
    }

    /// CPU throughput in GFLOPS, or `None` without a CPU timing.
    pub fn cpu_gflops(&self) -> Option<f64> {
        self.cpu_time_ms.map(|cpu_ms| self.gflops_at(cpu_ms))
    }

    /// GPU compute time plus whichever transfer times were measured, in milliseconds.
    pub fn total_time_ms(&self) -> f64 {
        // Upload first, then download, accumulated onto the compute time.
        [self.transfer_to_gpu_ms, self.transfer_from_gpu_ms]
            .iter()
            .flatten()
            .fold(self.gpu_time_ms, |running, transfer_ms| running + transfer_ms)
    }

    /// Ratio of CPU time to the total GPU time including transfers,
    /// or `None` without a CPU timing.
    pub fn effective_speedup(&self) -> Option<f64> {
        let cpu_ms = self.cpu_time_ms?;
        Some(cpu_ms / self.total_time_ms())
    }
}
/// Runs GPU benchmarks (optionally alongside a CPU reference) on one GPU context.
pub struct BenchmarkRunner {
    /// Context handle cloned into every `GpuArray` the benchmarks upload.
    context: GpuContextRef,
}

impl BenchmarkRunner {
    /// Creates a runner that executes every benchmark on `context`.
    pub fn new(context: GpuContextRef) -> Self {
        Self { context }
    }

    /// Runs `op` for the configured warm-up count, then times it and returns
    /// the mean wall-clock time per timed run in milliseconds.
    ///
    /// At least one timed run is always executed: with
    /// `benchmark_iterations == 0` the mean would otherwise be `0.0 / 0.0`
    /// (NaN) and poison every derived metric.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by `op`.
    fn mean_iteration_ms<F>(config: &BenchmarkConfig, mut op: F) -> Result<f64>
    where
        F: FnMut() -> Result<()>,
    {
        for _ in 0..config.warmup_iterations {
            op()?;
        }
        let timed_iterations = config.benchmark_iterations.max(1);
        let start = Instant::now();
        for _ in 0..timed_iterations {
            op()?;
        }
        let elapsed = start.elapsed();
        Ok(elapsed.as_secs_f64() * 1000.0 / timed_iterations as f64)
    }

    /// Converts a byte count moved in `elapsed` into GB/s (10^9 bytes per second).
    ///
    /// An empty transfer reports `0.0` rather than dividing zero by a possibly
    /// zero duration.
    fn bandwidth_gbps(bytes: usize, elapsed: Duration) -> f64 {
        if bytes == 0 {
            return 0.0;
        }
        (bytes as f64) / (elapsed.as_secs_f64() * 1_000_000_000.0)
    }

    /// Benchmarks an `m x k` by `k x n` `f32` matrix multiplication.
    ///
    /// Inputs are uniform samples in `[0, 1)` drawn from an RNG seeded with a
    /// fixed value (42). Both operands are always uploaded to the GPU; the
    /// upload and download times are only reported when
    /// `config.measure_transfers` is set, in which case one extra GPU
    /// multiplication is run to produce a result to download. When
    /// `config.include_cpu` is set, the same product is timed on the CPU via
    /// `ndarray`'s `dot`.
    ///
    /// The result reports `2 * m * k * n` operations and the byte size of the
    /// two inputs plus the output.
    ///
    /// # Errors
    ///
    /// Propagates errors from the GPU upload, multiplication and download, and
    /// returns `NumRs2Error::DimensionMismatch` if the CPU matrices cannot be
    /// built.
    ///
    /// # Panics
    ///
    /// Panics if the uniform distribution cannot be created.
    pub fn benchmark_matmul(
        &self,
        m: usize,
        k: usize,
        n: usize,
        config: &BenchmarkConfig,
    ) -> Result<BenchmarkResults> {
        use scirs2_core::random::rngs::StdRng;
        use scirs2_core::random::SeedableRng;
        use scirs2_core::random::*;

        let mut rng = StdRng::seed_from_u64(42);
        let dist = Uniform::new(0.0f32, 1.0f32).expect("Failed to create uniform distribution");
        let a_data: Vec<f32> = (0..m * k).map(|_| dist.sample(&mut rng)).collect();
        let b_data: Vec<f32> = (0..k * n).map(|_| dist.sample(&mut rng)).collect();
        // The raw vectors are kept for the CPU reference, hence the clones.
        let a_cpu = Array::from_vec(a_data.clone()).reshape(&[m, k]);
        let b_cpu = Array::from_vec(b_data.clone()).reshape(&[k, n]);

        let transfer_to_start = Instant::now();
        let a_gpu = GpuArray::from_array_with_context(&a_cpu, self.context.clone())?;
        let b_gpu = GpuArray::from_array_with_context(&b_cpu, self.context.clone())?;
        let transfer_to_ms = if config.measure_transfers {
            Some(transfer_to_start.elapsed().as_secs_f64() * 1000.0)
        } else {
            None
        };

        // NOTE(review): if `linalg::matmul` only enqueues work without waiting
        // for completion, this measures submission time — confirm it synchronizes.
        let gpu_time_ms = Self::mean_iteration_ms(config, || {
            let _ = linalg::matmul(&a_gpu, &b_gpu)?;
            Ok(())
        })?;

        let transfer_from_ms = if config.measure_transfers {
            // Compute outside the timed region so only the download is measured.
            let result_gpu = linalg::matmul(&a_gpu, &b_gpu)?;
            let transfer_from_start = Instant::now();
            let _ = result_gpu.to_array()?;
            Some(transfer_from_start.elapsed().as_secs_f64() * 1000.0)
        } else {
            None
        };

        let cpu_time_ms = if config.include_cpu {
            let a_nd = Array2::from_shape_vec((m, k), a_data).map_err(|e| {
                NumRs2Error::DimensionMismatch(format!("Failed to create ndarray: {}", e))
            })?;
            let b_nd = Array2::from_shape_vec((k, n), b_data).map_err(|e| {
                NumRs2Error::DimensionMismatch(format!("Failed to create ndarray: {}", e))
            })?;
            Some(Self::mean_iteration_ms(config, || {
                // `black_box` keeps the optimizer from discarding the unused product.
                std::hint::black_box(a_nd.dot(&b_nd));
                Ok(())
            })?)
        } else {
            None
        };

        // Widen before multiplying so sizes do not wrap on 32-bit targets.
        let (m64, k64, n64) = (m as u64, k as u64, n as u64);
        let operations = 2u64 * m64 * k64 * n64;
        let data_size_bytes =
            (m64 * k64 + k64 * n64 + m64 * n64) * std::mem::size_of::<f32>() as u64;

        Ok(BenchmarkResults {
            gpu_time_ms,
            cpu_time_ms,
            transfer_to_gpu_ms: transfer_to_ms,
            transfer_from_gpu_ms: transfer_from_ms,
            operations,
            data_size_bytes,
        })
    }

    /// Benchmarks element-wise addition of two `f32` vectors of length `size`.
    ///
    /// Inputs are uniform samples in `[0, 1)` drawn from an RNG seeded with a
    /// fixed value (42). Both operands are always uploaded to the GPU; the
    /// upload and download times are only reported when
    /// `config.measure_transfers` is set, in which case one extra GPU addition
    /// is run to produce a result to download. When `config.include_cpu` is
    /// set, a plain iterator-based addition is timed on the CPU.
    ///
    /// The result reports `size` operations and the byte size of the two
    /// inputs plus the output.
    ///
    /// # Errors
    ///
    /// Propagates errors from the GPU upload, addition and download.
    ///
    /// # Panics
    ///
    /// Panics if the uniform distribution cannot be created.
    pub fn benchmark_elementwise(
        &self,
        size: usize,
        config: &BenchmarkConfig,
    ) -> Result<BenchmarkResults> {
        use scirs2_core::random::rngs::StdRng;
        use scirs2_core::random::SeedableRng;
        use scirs2_core::random::*;

        let mut rng = StdRng::seed_from_u64(42);
        let dist = Uniform::new(0.0f32, 1.0f32).expect("Failed to create uniform distribution");
        let a_data: Vec<f32> = (0..size).map(|_| dist.sample(&mut rng)).collect();
        let b_data: Vec<f32> = (0..size).map(|_| dist.sample(&mut rng)).collect();
        // The raw vectors are kept for the CPU reference, hence the clones.
        let a_cpu = Array::from_vec(a_data.clone()).reshape(&[size]);
        let b_cpu = Array::from_vec(b_data.clone()).reshape(&[size]);

        let transfer_to_start = Instant::now();
        let a_gpu = GpuArray::from_array_with_context(&a_cpu, self.context.clone())?;
        let b_gpu = GpuArray::from_array_with_context(&b_cpu, self.context.clone())?;
        let transfer_to_ms = if config.measure_transfers {
            Some(transfer_to_start.elapsed().as_secs_f64() * 1000.0)
        } else {
            None
        };

        // NOTE(review): if `gpu::ops::add` only enqueues work without waiting
        // for completion, this measures submission time — confirm it synchronizes.
        let gpu_time_ms = Self::mean_iteration_ms(config, || {
            let _ = crate::gpu::ops::add(&a_gpu, &b_gpu)?;
            Ok(())
        })?;

        let transfer_from_ms = if config.measure_transfers {
            // Compute outside the timed region so only the download is measured.
            let result_gpu = crate::gpu::ops::add(&a_gpu, &b_gpu)?;
            let transfer_from_start = Instant::now();
            let _ = result_gpu.to_array()?;
            Some(transfer_from_start.elapsed().as_secs_f64() * 1000.0)
        } else {
            None
        };

        let cpu_time_ms = if config.include_cpu {
            Some(Self::mean_iteration_ms(config, || {
                let sum: Vec<f32> = a_data.iter().zip(&b_data).map(|(a, b)| a + b).collect();
                // Without `black_box` an optimizing build may elide the whole
                // loop because the sum is never read.
                std::hint::black_box(sum);
                Ok(())
            })?)
        } else {
            None
        };

        let operations = size as u64;
        // Two inputs plus one output; widened first so it cannot wrap on 32-bit targets.
        let data_size_bytes = 3 * (size as u64) * std::mem::size_of::<f32>() as u64;

        Ok(BenchmarkResults {
            gpu_time_ms,
            cpu_time_ms,
            transfer_to_gpu_ms: transfer_to_ms,
            transfer_from_gpu_ms: transfer_from_ms,
            operations,
            data_size_bytes,
        })
    }

    /// Measures host-to-GPU and GPU-to-host bandwidth for roughly `size_bytes` of `f32` data.
    ///
    /// `size_bytes` is rounded down to a whole number of `f32` elements, and
    /// the bandwidth is computed from the bytes actually transferred rather
    /// than from the requested size. Each direction is timed once, with no
    /// warm-up.
    ///
    /// Returns `(to_gpu, from_gpu)` in GB/s, where 1 GB is 10^9 bytes.
    ///
    /// # Errors
    ///
    /// Propagates errors from the upload or the download.
    ///
    /// # Panics
    ///
    /// Panics if the uniform distribution cannot be created.
    pub fn benchmark_memory_transfer(&self, size_bytes: usize) -> Result<(f64, f64)> {
        use scirs2_core::random::rngs::StdRng;
        use scirs2_core::random::SeedableRng;
        use scirs2_core::random::*;

        let mut rng = StdRng::seed_from_u64(42);
        let dist = Uniform::new(0.0f32, 1.0f32).expect("Failed to create uniform distribution");

        let element_bytes = std::mem::size_of::<f32>();
        let size = size_bytes / element_bytes;
        // Only whole elements are moved, so this may be below `size_bytes`.
        let transferred_bytes = size * element_bytes;

        let data: Vec<f32> = (0..size).map(|_| dist.sample(&mut rng)).collect();
        let cpu_array = Array::from_vec(data).reshape(&[size]);

        let to_gpu_start = Instant::now();
        let gpu_array = GpuArray::from_array_with_context(&cpu_array, self.context.clone())?;
        let to_gpu_time = to_gpu_start.elapsed();
        let to_gpu_bandwidth_gbps = Self::bandwidth_gbps(transferred_bytes, to_gpu_time);

        let from_gpu_start = Instant::now();
        let _ = gpu_array.to_array()?;
        let from_gpu_time = from_gpu_start.elapsed();
        let from_gpu_bandwidth_gbps = Self::bandwidth_gbps(transferred_bytes, from_gpu_time);

        Ok((to_gpu_bandwidth_gbps, from_gpu_bandwidth_gbps))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds results for a 10 ms GPU run of 10^9 operations over 4 MB of data,
    /// using `transfer_ms` for both transfer directions.
    fn sample_results(cpu_time_ms: Option<f64>, transfer_ms: Option<f64>) -> BenchmarkResults {
        BenchmarkResults {
            gpu_time_ms: 10.0,
            cpu_time_ms,
            transfer_to_gpu_ms: transfer_ms,
            transfer_from_gpu_ms: transfer_ms,
            operations: 1_000_000_000,
            data_size_bytes: 4_000_000,
        }
    }

    /// The default configuration uses 3 warm-up runs, 10 timed runs and enables
    /// both optional measurements.
    #[test]
    fn test_benchmark_config_default() {
        let defaults = BenchmarkConfig::default();
        assert_eq!(defaults.warmup_iterations, 3);
        assert_eq!(defaults.benchmark_iterations, 10);
        assert!(defaults.include_cpu);
        assert!(defaults.measure_transfers);
    }

    /// Derived metrics are consistent when CPU and transfer timings are present.
    #[test]
    fn test_benchmark_results_calculations() {
        let measured = sample_results(Some(100.0), Some(5.0));
        assert_eq!(measured.speedup(), Some(10.0));
        assert!((measured.gpu_gflops() - 100.0).abs() < 0.01);
        assert_eq!(measured.total_time_ms(), 20.0);
        assert_eq!(measured.effective_speedup(), Some(5.0));
    }

    /// CPU-dependent metrics are `None` and the total equals the GPU time when
    /// nothing but the GPU time was measured.
    #[test]
    fn test_benchmark_results_no_cpu() {
        let gpu_only = sample_results(None, None);
        assert_eq!(gpu_only.speedup(), None);
        assert_eq!(gpu_only.cpu_gflops(), None);
        assert_eq!(gpu_only.total_time_ms(), 10.0);
    }
}