use crate::Result;
use std::time::{Duration, Instant};
/// Timing statistics recorded for a single named benchmark.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Label identifying the benchmark.
    pub name: String,
    /// Number of timed iterations that were recorded.
    pub iterations: u64,
    /// Sum of every per-iteration duration.
    pub total_duration: Duration,
    /// Average duration of one iteration.
    pub mean_duration: Duration,
    /// Median of the per-iteration durations.
    pub median_duration: Duration,
    /// Fastest single iteration.
    pub min_duration: Duration,
    /// Slowest single iteration.
    pub max_duration: Duration,
    /// Standard deviation of the per-iteration durations.
    pub std_dev: Duration,
    /// Iterations completed per second.
    pub throughput_ops: f64,
    /// Bytes processed per second; `None` when no byte count was supplied.
    pub throughput_bytes: Option<f64>,
}

impl BenchmarkResult {
    /// Formats the headline numbers as a single line.
    ///
    /// The line has the shape
    /// `<name>: <mean>/iter (<iterations> iters, <total> total, <ops> ops/s)`,
    /// with durations printed to two decimals and the rate rounded to a
    /// whole number.
    pub fn summary(&self) -> String {
        // Pull out only the fields that appear in the summary line.
        let Self {
            name,
            iterations,
            total_duration,
            mean_duration,
            throughput_ops,
            ..
        } = self;

        format!(
            "{}: {:.2?}/iter ({} iters, {:.2?} total, {:.0} ops/s)",
            name, mean_duration, iterations, total_duration, throughput_ops,
        )
    }
}
/// Configurable driver that times a closure repeatedly and summarises the
/// collected samples as a [`BenchmarkResult`].
///
/// Settings are adjusted with the consuming builder methods
/// ([`warmup`](Self::warmup), [`min_iters`](Self::min_iters),
/// [`max_iters`](Self::max_iters), [`target_time`](Self::target_time)).
pub struct BenchmarkRunner {
    /// Untimed calls made before sampling starts.
    warmup_iterations: u64,
    /// Timed iterations that must complete before the time budget may end the run.
    min_iterations: u64,
    /// Hard upper bound on the number of timed iterations.
    max_iterations: u64,
    /// Wall-clock budget for the timed phase; only checked once
    /// `min_iterations` samples exist.
    target_time: Duration,
}

impl BenchmarkRunner {
    /// Cap on the number of sample slots reserved before measuring, so an
    /// extreme iteration setting cannot trigger a huge up-front allocation.
    const MAX_PREALLOCATED_SAMPLES: u64 = 4096;

    /// Creates a runner with the defaults: 10 warm-up calls, between 100 and
    /// 10 000 timed iterations, and a 2 second time budget.
    pub fn new() -> Self {
        Self {
            warmup_iterations: 10,
            min_iterations: 100,
            max_iterations: 10_000,
            target_time: Duration::from_secs(2),
        }
    }

    /// Sets the number of untimed warm-up calls.
    pub fn warmup(mut self, n: u64) -> Self {
        self.warmup_iterations = n;
        self
    }

    /// Sets the minimum number of timed iterations.
    pub fn min_iters(mut self, n: u64) -> Self {
        self.min_iterations = n;
        self
    }

    /// Sets the maximum number of timed iterations.
    pub fn max_iters(mut self, n: u64) -> Self {
        self.max_iterations = n;
        self
    }

    /// Sets the wall-clock budget for the timed phase.
    pub fn target_time(mut self, d: Duration) -> Self {
        self.target_time = d;
        self
    }

    /// Benchmarks `f` and returns its timing statistics under `name`.
    ///
    /// `throughput_bytes` of the result is `None`.
    pub fn bench<F>(&self, name: &str, f: F) -> BenchmarkResult
    where
        F: FnMut(),
    {
        let durations = self.measure(f);
        self.compute_result(name, &durations, durations.len() as u64, None)
    }

    /// Benchmarks `f` like [`bench`](Self::bench) and additionally reports a
    /// byte rate, assuming every call of `f` processes `bytes_per_iter` bytes.
    pub fn bench_throughput<F>(
        &self,
        name: &str,
        bytes_per_iter: usize,
        f: F,
    ) -> BenchmarkResult
    where
        F: FnMut(),
    {
        let durations = self.measure(f);
        self.compute_result(name, &durations, durations.len() as u64, Some(bytes_per_iter))
    }

    /// Runs the warm-up calls, then times `f` one call at a time and returns
    /// the per-call durations in execution order.
    ///
    /// Sampling stops at `max_iterations`, or earlier once at least
    /// `min_iterations` samples exist and `target_time` has elapsed.
    fn measure<F>(&self, mut f: F) -> Vec<Duration>
    where
        F: FnMut(),
    {
        for _ in 0..self.warmup_iterations {
            f();
        }

        // Bounded by MAX_PREALLOCATED_SAMPLES, so the cast cannot truncate.
        let reserve = self
            .min_iterations
            .min(self.max_iterations)
            .min(Self::MAX_PREALLOCATED_SAMPLES) as usize;
        let mut durations = Vec::with_capacity(reserve);

        let global_start = Instant::now();
        for _ in 0..self.max_iterations {
            let start = Instant::now();
            f();
            durations.push(start.elapsed());

            // Compare the number of *completed* samples, not the zero-based
            // loop index, so the run can stop at exactly `min_iterations`.
            let completed = durations.len() as u64;
            if completed >= self.min_iterations && global_start.elapsed() >= self.target_time {
                break;
            }
        }
        durations
    }

    /// Reduces raw samples to a [`BenchmarkResult`].
    ///
    /// * `durations` - per-iteration timings; all statistics are derived from it.
    /// * `iterations` - iteration count stored in the result.
    /// * `bytes_per_iter` - when present, enables `throughput_bytes`.
    ///
    /// An empty sample yields an all-zero result instead of panicking. The
    /// median of an even-sized sample is the average of the two middle
    /// values, and the standard deviation is the population form (divides by
    /// the sample count). A sample whose total time is zero reports an
    /// infinite operation rate.
    fn compute_result(
        &self,
        name: &str,
        durations: &[Duration],
        iterations: u64,
        bytes_per_iter: Option<usize>,
    ) -> BenchmarkResult {
        if durations.is_empty() {
            // Nothing was measured (e.g. `max_iters(0)`): there is no mean,
            // median or extreme value to compute.
            return BenchmarkResult {
                name: name.to_string(),
                iterations,
                total_duration: Duration::ZERO,
                mean_duration: Duration::ZERO,
                median_duration: Duration::ZERO,
                min_duration: Duration::ZERO,
                max_duration: Duration::ZERO,
                std_dev: Duration::ZERO,
                throughput_ops: 0.0,
                throughput_bytes: bytes_per_iter.map(|_| 0.0),
            };
        }

        let sample_count = durations.len();
        let total: Duration = durations.iter().sum();
        let total_nanos = total.as_nanos();

        // Divide in u128 so a large sample count is never truncated. The
        // quotient's seconds part is at most `total.as_secs()`, so the casts
        // below cannot overflow.
        let mean_whole_nanos = total_nanos / sample_count as u128;
        let mean = Duration::new(
            (mean_whole_nanos / 1_000_000_000) as u64,
            (mean_whole_nanos % 1_000_000_000) as u32,
        );

        let mut sorted = durations.to_vec();
        sorted.sort_unstable();
        let mid = sample_count / 2;
        let median = if sample_count % 2 == 0 {
            // Even count implies at least two samples, so `mid - 1` is valid.
            (sorted[mid - 1] + sorted[mid]) / 2
        } else {
            sorted[mid]
        };
        let min = sorted[0];
        let max = sorted[sample_count - 1];

        // Use the exact (fractional) mean for the floating-point statistics.
        let mean_nanos = total_nanos as f64 / sample_count as f64;
        let variance = durations
            .iter()
            .map(|d| {
                let diff = d.as_nanos() as f64 - mean_nanos;
                diff * diff
            })
            .sum::<f64>()
            / sample_count as f64;
        let std_dev = Duration::from_nanos(variance.sqrt() as u64);

        let throughput_ops = if total_nanos > 0 {
            1_000_000_000.0 / mean_nanos
        } else {
            f64::INFINITY
        };
        let throughput_bytes = bytes_per_iter.map(|bpi| {
            if bpi == 0 {
                // Avoids `INFINITY * 0.0 == NaN` for zero-time samples.
                0.0
            } else {
                throughput_ops * bpi as f64
            }
        });

        BenchmarkResult {
            name: name.to_string(),
            iterations,
            total_duration: total,
            mean_duration: mean,
            median_duration: median,
            min_duration: min,
            max_duration: max,
            std_dev,
            throughput_ops,
            throughput_bytes,
        }
    }
}

impl Default for BenchmarkRunner {
    /// Equivalent to [`BenchmarkRunner::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Named, insertion-ordered collection of [`BenchmarkResult`]s that can be
/// rendered as a plain-text table.
pub struct BenchmarkSuite {
    /// Title printed in the report banner.
    name: String,
    /// Results in the order they were added.
    results: Vec<BenchmarkResult>,
}

impl BenchmarkSuite {
    /// Creates an empty suite titled `name`.
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            results: Vec::new(),
        }
    }

    /// Appends `result` to the suite.
    pub fn add_result(&mut self, result: BenchmarkResult) {
        self.results.push(result);
    }

    /// Returns the collected results in insertion order.
    pub fn results(&self) -> &[BenchmarkResult] {
        &self.results
    }

    /// Returns the suite title.
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Renders the suite as a text table: a banner, a header row, a dashed
    /// separator, one row per result, and a trailing result count.
    ///
    /// The name column is as wide as the longest benchmark name (counted in
    /// characters) but never narrower than the "Benchmark" header label, so
    /// the header always lines up with the rows. An empty suite uses a
    /// 20-character name column.
    pub fn report(&self) -> String {
        const NAME_HEADER: &str = "Benchmark";
        const EMPTY_SUITE_NAME_WIDTH: usize = 20;
        const STAT_COLUMNS: usize = 5;
        const STAT_COLUMN_WIDTH: usize = 12;

        // Padding is applied per character, so measure names in characters
        // rather than bytes.
        let name_width = self
            .results
            .iter()
            .map(|r| r.name.chars().count())
            .max()
            .map_or(EMPTY_SUITE_NAME_WIDTH, |longest| longest.max(NAME_HEADER.len()));
        // Each statistic column is preceded by one separating space.
        let table_width = name_width + STAT_COLUMNS * (STAT_COLUMN_WIDTH + 1);

        let mut lines = vec![
            format!("=== Benchmark Suite: {} ===", self.name),
            String::new(),
            format!(
                "{:<width$} {:>col$} {:>col$} {:>col$} {:>col$} {:>col$}",
                NAME_HEADER,
                "Mean",
                "Median",
                "Min",
                "Max",
                "Ops/s",
                width = name_width,
                col = STAT_COLUMN_WIDTH
            ),
            "-".repeat(table_width),
        ];

        lines.extend(self.results.iter().map(|r| {
            format!(
                "{:<width$} {:>col$.2?} {:>col$.2?} {:>col$.2?} {:>col$.2?} {:>col$.0}",
                r.name,
                r.mean_duration,
                r.median_duration,
                r.min_duration,
                r.max_duration,
                r.throughput_ops,
                width = name_width,
                col = STAT_COLUMN_WIDTH
            )
        }));

        lines.push(String::new());
        lines.push(format!("Total benchmarks: {}", self.results.len()));
        lines.join("\n")
    }
}
/// Runs the crate's built-in benchmark set and returns the collected suite.
///
/// The benchmarks are registered in this order: memory-pool allocate and
/// deallocate (1 KiB, 64 KiB), host-buffer fill, no-op kernel launches
/// (1x1, 1x256, 4x256), transpilation with and without the two flags,
/// parsing, and two half-precision workloads. Every benchmark uses the same
/// runner: 5 warm-up calls, 50 to 1000 timed iterations, 500 ms budget.
///
/// The function always returns `Ok`; results of the benchmarked calls are
/// discarded, except that a failed `HostBuffer::new` panics via `unwrap`.
pub fn run_builtin_benchmarks() -> Result<BenchmarkSuite> {
    use crate::runtime::kernel::{KernelFunction, ThreadContext, LaunchConfig};
    use crate::runtime::grid::{Grid, Block};

    /// Kernel whose `execute` body is empty.
    struct NoopKernel;
    impl KernelFunction<()> for NoopKernel {
        fn execute(&self, _: (), _ctx: ThreadContext) {}
        fn name(&self) -> &str { "noop" }
    }

    let runner = BenchmarkRunner::new()
        .warmup(5)
        .min_iters(50)
        .max_iters(1000)
        .target_time(Duration::from_millis(500));
    let mut suite = BenchmarkSuite::new("cuda-rust-wasm");

    // Memory pool: a fresh pool is built inside the timed closure each time.
    for &(label, size) in &[("pool_allocate_1kb", 1024), ("pool_allocate_64kb", 65536)] {
        let outcome = runner.bench(label, || {
            let mem_pool = crate::memory::MemoryPool::new();
            let block = mem_pool.allocate(size);
            mem_pool.deallocate(block);
        });
        suite.add_result(outcome);
    }

    let fill_outcome = runner.bench_throughput("host_buffer_fill_1kb", 1024, || {
        let mut host = crate::memory::HostBuffer::<u8>::new(1024).unwrap();
        host.fill(0xFF);
    });
    suite.add_result(fill_outcome);

    // Kernel launches: (label, grid size, block size).
    let launch_shapes = [
        ("kernel_launch_1x1", 1u32, 1u32),
        ("kernel_launch_1x256", 1u32, 256u32),
        ("kernel_launch_4x256", 4u32, 256u32),
    ];
    for &(label, grid_dim, block_dim) in &launch_shapes {
        let outcome = runner.bench(label, || {
            let _ = crate::runtime::kernel::launch_kernel(
                NoopKernel,
                LaunchConfig::new(Grid::new(grid_dim), Block::new(block_dim)),
                (),
            );
        });
        suite.add_result(outcome);
    }

    // The raw string's lines intentionally start at column 0 so the CUDA
    // source text is unchanged.
    let simple_cuda = r#"
__global__ void add(float* a, float* b, float* c) {
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
"#;

    let transpile_plain = runner.bench("transpile_simple_kernel", || {
        let transpiler = crate::transpiler::CudaTranspiler::new();
        let _ = transpiler.transpile(simple_cuda, false, false);
    });
    suite.add_result(transpile_plain);

    let transpile_optimized = runner.bench("transpile_with_optimization", || {
        let transpiler = crate::transpiler::CudaTranspiler::new();
        let _ = transpiler.transpile(simple_cuda, true, true);
    });
    suite.add_result(transpile_optimized);

    let parse_outcome = runner.bench("parse_simple_kernel", || {
        let parser = crate::parser::CudaParser::new();
        let _ = parser.parse(simple_cuda);
    });
    suite.add_result(parse_outcome);

    let roundtrip_outcome = runner.bench("half_f32_roundtrip_1000", || {
        for i in 0..1000 {
            let half = crate::runtime::half::Half::from_f32(i as f32);
            // black_box keeps the conversion from being optimised away.
            std::hint::black_box(half.to_f32());
        }
    });
    suite.add_result(roundtrip_outcome);

    let dot_outcome = runner.bench("half_dot_product_256", || {
        let lhs: Vec<_> = (0..256)
            .map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01))
            .collect();
        let rhs: Vec<_> = (0..256)
            .map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01))
            .collect();
        std::hint::black_box(crate::runtime::half::half_dot(&lhs, &rhs));
    });
    suite.add_result(dot_outcome);

    Ok(suite)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The runner honours the minimum iteration count, calls the closure once
    /// per warm-up and per timed iteration, and produces ordered statistics.
    #[test]
    fn test_benchmark_runner_basic() {
        let runner = BenchmarkRunner::new()
            .warmup(2)
            .min_iters(10)
            .max_iters(100)
            .target_time(Duration::from_millis(100));
        let mut counter = 0u64;
        let result = runner.bench("counter_increment", || {
            counter += 1;
        });
        assert!(result.iterations >= 10);
        assert!(result.iterations <= 100);
        // Two warm-up calls plus one call per recorded iteration.
        assert_eq!(counter, 2 + result.iterations);
        assert_eq!(result.name, "counter_increment");
        assert!(result.throughput_bytes.is_none());
        assert!(result.throughput_ops > 0.0);
        assert!(result.mean_duration <= result.max_duration);
        assert!(result.min_duration <= result.mean_duration);
        assert!(result.min_duration <= result.median_duration);
        assert!(result.median_duration <= result.max_duration);
    }

    /// `bench_throughput` fills in a positive byte rate.
    #[test]
    fn test_benchmark_throughput() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(10)
            .max_iters(50)
            .target_time(Duration::from_millis(50));
        let result = runner.bench_throughput("memcpy_1kb", 1024, || {
            let src = vec![0u8; 1024];
            std::hint::black_box(&src);
        });
        assert!(result.throughput_bytes.is_some());
        assert!(result.throughput_bytes.unwrap() > 0.0);
    }

    /// A suite keeps its results in order and mentions each of them in the report.
    #[test]
    fn test_benchmark_suite() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(5)
            .max_iters(10)
            .target_time(Duration::from_millis(10));
        let mut suite = BenchmarkSuite::new("test_suite");
        // Distinctive names: single letters would match the fixed report text
        // ("Benchmark", "Total benchmarks") and prove nothing.
        suite.add_result(runner.bench("alpha_case", || {}));
        suite.add_result(runner.bench("beta_case", || {}));
        assert_eq!(suite.results().len(), 2);
        assert_eq!(suite.results()[0].name, "alpha_case");
        assert_eq!(suite.results()[1].name, "beta_case");
        assert_eq!(suite.name(), "test_suite");
        let report = suite.report();
        assert!(report.contains("=== Benchmark Suite: test_suite ==="));
        assert!(report.contains("alpha_case"));
        assert!(report.contains("beta_case"));
        assert!(report.ends_with("Total benchmarks: 2"));
    }

    /// Every built-in benchmark records at least one iteration.
    #[test]
    fn test_builtin_benchmarks() {
        let suite = run_builtin_benchmarks().unwrap();
        assert!(!suite.results().is_empty());
        for r in suite.results() {
            assert!(r.iterations > 0, "Benchmark {} had 0 iterations", r.name);
            assert!(r.throughput_ops > 0.0, "Benchmark {} had 0 throughput", r.name);
        }
    }

    /// `summary` renders the documented one-line layout.
    #[test]
    fn test_benchmark_result_summary() {
        let result = BenchmarkResult {
            name: "test".to_string(),
            iterations: 100,
            total_duration: Duration::from_millis(100),
            mean_duration: Duration::from_millis(1),
            median_duration: Duration::from_millis(1),
            min_duration: Duration::from_micros(500),
            max_duration: Duration::from_millis(2),
            std_dev: Duration::from_micros(200),
            throughput_ops: 1000.0,
            throughput_bytes: None,
        };
        let summary = result.summary();
        assert!(summary.contains("test"));
        assert!(summary.contains("100 iters"));
        assert_eq!(
            summary,
            "test: 1.00ms/iter (100 iters, 100.00ms total, 1000 ops/s)"
        );
    }
}