cuda_rust_wasm/runtime/
benchmark.rs

//! Built-in benchmark suite for measuring kernel and memory performance
//!
//! Provides self-contained benchmarks that run without external harnesses,
//! producing structured results suitable for comparison across runs.
5
6use crate::Result;
7use std::time::{Duration, Instant};
8
/// Aggregated timing statistics for a single benchmark.
///
/// All durations describe individually timed iterations. `PartialEq` is
/// derived so that results can be compared by value (for example in tests);
/// `Eq` is not available because the throughput fields are floating point.
#[derive(Debug, Clone, PartialEq)]
pub struct BenchmarkResult {
    /// Benchmark name
    pub name: String,
    /// Number of timed iterations (warmup calls are not counted)
    pub iterations: u64,
    /// Sum of the individually timed iterations
    pub total_duration: Duration,
    /// Mean duration per iteration
    pub mean_duration: Duration,
    /// Median duration per iteration
    pub median_duration: Duration,
    /// Fastest single iteration
    pub min_duration: Duration,
    /// Slowest single iteration
    pub max_duration: Duration,
    /// Standard deviation of the per-iteration durations
    pub std_dev: Duration,
    /// Throughput (operations per second)
    pub throughput_ops: f64,
    /// Optional throughput in bytes/second; `None` when the benchmark did
    /// not declare a byte count per iteration
    pub throughput_bytes: Option<f64>,
}
33
34impl BenchmarkResult {
35    /// Format a human-readable summary
36    pub fn summary(&self) -> String {
37        format!(
38            "{}: {:.2?}/iter ({} iters, {:.2?} total, {:.0} ops/s)",
39            self.name,
40            self.mean_duration,
41            self.iterations,
42            self.total_duration,
43            self.throughput_ops,
44        )
45    }
46}
47
/// Benchmark runner.
///
/// Holds the iteration and time limits that govern a measurement. Configure
/// it through the builder-style methods on the type.
#[derive(Debug, Clone)]
pub struct BenchmarkRunner {
    /// Untimed calls made before measurement starts
    warmup_iterations: u64,
    /// Measurement never stops early before this many timed iterations ran
    min_iterations: u64,
    /// Hard upper bound on the number of timed iterations
    max_iterations: u64,
    /// Time budget; once it is used up (and the minimum iteration count has
    /// been reached) measurement stops
    target_time: Duration,
}
55
56impl BenchmarkRunner {
57    /// Create a new benchmark runner with default settings
58    pub fn new() -> Self {
59        Self {
60            warmup_iterations: 10,
61            min_iterations: 100,
62            max_iterations: 10_000,
63            target_time: Duration::from_secs(2),
64        }
65    }
66
67    /// Set warmup iterations
68    pub fn warmup(mut self, n: u64) -> Self {
69        self.warmup_iterations = n;
70        self
71    }
72
73    /// Set minimum iterations
74    pub fn min_iters(mut self, n: u64) -> Self {
75        self.min_iterations = n;
76        self
77    }
78
79    /// Set maximum iterations
80    pub fn max_iters(mut self, n: u64) -> Self {
81        self.max_iterations = n;
82        self
83    }
84
85    /// Set target time
86    pub fn target_time(mut self, d: Duration) -> Self {
87        self.target_time = d;
88        self
89    }
90
91    /// Run a benchmark
92    pub fn bench<F>(&self, name: &str, mut f: F) -> BenchmarkResult
93    where
94        F: FnMut(),
95    {
96        // Warmup
97        for _ in 0..self.warmup_iterations {
98            f();
99        }
100
101        // Measure
102        let mut durations = Vec::new();
103        let global_start = Instant::now();
104
105        for i in 0..self.max_iterations {
106            let start = Instant::now();
107            f();
108            let elapsed = start.elapsed();
109            durations.push(elapsed);
110
111            if i >= self.min_iterations && global_start.elapsed() >= self.target_time {
112                break;
113            }
114        }
115
116        let iterations = durations.len() as u64;
117        self.compute_result(name, &durations, iterations, None)
118    }
119
120    /// Run a benchmark with throughput measured in bytes
121    pub fn bench_throughput<F>(
122        &self,
123        name: &str,
124        bytes_per_iter: usize,
125        mut f: F,
126    ) -> BenchmarkResult
127    where
128        F: FnMut(),
129    {
130        // Warmup
131        for _ in 0..self.warmup_iterations {
132            f();
133        }
134
135        // Measure
136        let mut durations = Vec::new();
137        let global_start = Instant::now();
138
139        for i in 0..self.max_iterations {
140            let start = Instant::now();
141            f();
142            let elapsed = start.elapsed();
143            durations.push(elapsed);
144
145            if i >= self.min_iterations && global_start.elapsed() >= self.target_time {
146                break;
147            }
148        }
149
150        let iterations = durations.len() as u64;
151        self.compute_result(name, &durations, iterations, Some(bytes_per_iter))
152    }
153
154    fn compute_result(
155        &self,
156        name: &str,
157        durations: &[Duration],
158        iterations: u64,
159        bytes_per_iter: Option<usize>,
160    ) -> BenchmarkResult {
161        let total: Duration = durations.iter().sum();
162        let mean = total / iterations as u32;
163
164        let mut sorted: Vec<Duration> = durations.to_vec();
165        sorted.sort();
166        let median = sorted[sorted.len() / 2];
167        let min = sorted[0];
168        let max = sorted[sorted.len() - 1];
169
170        // Standard deviation
171        let mean_nanos = mean.as_nanos() as f64;
172        let variance: f64 = durations
173            .iter()
174            .map(|d| {
175                let diff = d.as_nanos() as f64 - mean_nanos;
176                diff * diff
177            })
178            .sum::<f64>()
179            / iterations as f64;
180        let std_dev_nanos = variance.sqrt();
181        let std_dev = Duration::from_nanos(std_dev_nanos as u64);
182
183        let throughput_ops = if mean.as_nanos() > 0 {
184            1_000_000_000.0 / mean_nanos
185        } else {
186            f64::INFINITY
187        };
188
189        let throughput_bytes = bytes_per_iter.map(|bpi| {
190            throughput_ops * bpi as f64
191        });
192
193        BenchmarkResult {
194            name: name.to_string(),
195            iterations,
196            total_duration: total,
197            mean_duration: mean,
198            median_duration: median,
199            min_duration: min,
200            max_duration: max,
201            std_dev,
202            throughput_ops,
203            throughput_bytes,
204        }
205    }
206}
207
208impl Default for BenchmarkRunner {
209    fn default() -> Self {
210        Self::new()
211    }
212}
213
214/// Benchmark suite with named groups
215pub struct BenchmarkSuite {
216    name: String,
217    results: Vec<BenchmarkResult>,
218}
219
220impl BenchmarkSuite {
221    /// Create a new suite
222    pub fn new(name: &str) -> Self {
223        Self {
224            name: name.to_string(),
225            results: Vec::new(),
226        }
227    }
228
229    /// Add a result
230    pub fn add_result(&mut self, result: BenchmarkResult) {
231        self.results.push(result);
232    }
233
234    /// Get all results
235    pub fn results(&self) -> &[BenchmarkResult] {
236        &self.results
237    }
238
239    /// Get suite name
240    pub fn name(&self) -> &str {
241        &self.name
242    }
243
244    /// Print a formatted report
245    pub fn report(&self) -> String {
246        let mut lines = Vec::new();
247        lines.push(format!("=== Benchmark Suite: {} ===", self.name));
248        lines.push(String::new());
249
250        let max_name_len = self.results.iter().map(|r| r.name.len()).max().unwrap_or(20);
251
252        lines.push(format!(
253            "{:<width$}  {:>12}  {:>12}  {:>12}  {:>12}  {:>12}",
254            "Benchmark", "Mean", "Median", "Min", "Max", "Ops/s",
255            width = max_name_len
256        ));
257        lines.push("-".repeat(max_name_len + 66));
258
259        for r in &self.results {
260            lines.push(format!(
261                "{:<width$}  {:>12.2?}  {:>12.2?}  {:>12.2?}  {:>12.2?}  {:>12.0}",
262                r.name,
263                r.mean_duration,
264                r.median_duration,
265                r.min_duration,
266                r.max_duration,
267                r.throughput_ops,
268                width = max_name_len
269            ));
270        }
271
272        lines.push(String::new());
273        lines.push(format!("Total benchmarks: {}", self.results.len()));
274        lines.join("\n")
275    }
276}
277
278/// Run the built-in benchmark suite for this crate
279pub fn run_builtin_benchmarks() -> Result<BenchmarkSuite> {
280    let runner = BenchmarkRunner::new()
281        .warmup(5)
282        .min_iters(50)
283        .max_iters(1000)
284        .target_time(Duration::from_millis(500));
285
286    let mut suite = BenchmarkSuite::new("cuda-rust-wasm");
287
288    // --- Memory allocation benchmarks ---
289    suite.add_result(runner.bench("pool_allocate_1kb", || {
290        let pool = crate::memory::MemoryPool::new();
291        let buf = pool.allocate(1024);
292        pool.deallocate(buf);
293    }));
294
295    suite.add_result(runner.bench("pool_allocate_64kb", || {
296        let pool = crate::memory::MemoryPool::new();
297        let buf = pool.allocate(65536);
298        pool.deallocate(buf);
299    }));
300
301    suite.add_result(runner.bench_throughput("host_buffer_fill_1kb", 1024, || {
302        let mut buf = crate::memory::HostBuffer::<u8>::new(1024).unwrap();
303        buf.fill(0xFF);
304    }));
305
306    // --- Kernel launch benchmarks ---
307    use crate::runtime::kernel::{KernelFunction, ThreadContext, LaunchConfig};
308    use crate::runtime::grid::{Grid, Block};
309
310    struct NoopKernel;
311    impl KernelFunction<()> for NoopKernel {
312        fn execute(&self, _: (), _ctx: ThreadContext) {}
313        fn name(&self) -> &str { "noop" }
314    }
315
316    suite.add_result(runner.bench("kernel_launch_1x1", || {
317        let _ = crate::runtime::kernel::launch_kernel(
318            NoopKernel,
319            LaunchConfig::new(Grid::new(1u32), Block::new(1u32)),
320            (),
321        );
322    }));
323
324    suite.add_result(runner.bench("kernel_launch_1x256", || {
325        let _ = crate::runtime::kernel::launch_kernel(
326            NoopKernel,
327            LaunchConfig::new(Grid::new(1u32), Block::new(256u32)),
328            (),
329        );
330    }));
331
332    suite.add_result(runner.bench("kernel_launch_4x256", || {
333        let _ = crate::runtime::kernel::launch_kernel(
334            NoopKernel,
335            LaunchConfig::new(Grid::new(4u32), Block::new(256u32)),
336            (),
337        );
338    }));
339
340    // --- Transpiler benchmarks ---
341    let simple_cuda = r#"
342        __global__ void add(float* a, float* b, float* c) {
343            int i = threadIdx.x;
344            c[i] = a[i] + b[i];
345        }
346    "#;
347
348    suite.add_result(runner.bench("transpile_simple_kernel", || {
349        let t = crate::transpiler::CudaTranspiler::new();
350        let _ = t.transpile(simple_cuda, false, false);
351    }));
352
353    suite.add_result(runner.bench("transpile_with_optimization", || {
354        let t = crate::transpiler::CudaTranspiler::new();
355        let _ = t.transpile(simple_cuda, true, true);
356    }));
357
358    // --- Parser benchmarks ---
359    suite.add_result(runner.bench("parse_simple_kernel", || {
360        let p = crate::parser::CudaParser::new();
361        let _ = p.parse(simple_cuda);
362    }));
363
364    // --- Half-precision benchmarks ---
365    suite.add_result(runner.bench("half_f32_roundtrip_1000", || {
366        for i in 0..1000 {
367            let h = crate::runtime::half::Half::from_f32(i as f32);
368            std::hint::black_box(h.to_f32());
369        }
370    }));
371
372    suite.add_result(runner.bench("half_dot_product_256", || {
373        let a: Vec<_> = (0..256).map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01)).collect();
374        let b: Vec<_> = (0..256).map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01)).collect();
375        std::hint::black_box(crate::runtime::half::half_dot(&a, &b));
376    }));
377
378    Ok(suite)
379}
380
#[cfg(test)]
mod tests {
    use super::*;

    /// The runner honours the minimum/maximum iteration bounds, calls the
    /// closure once per warmup and once per timed iteration, and produces
    /// internally consistent statistics.
    #[test]
    fn test_benchmark_runner_basic() {
        let runner = BenchmarkRunner::new()
            .warmup(2)
            .min_iters(10)
            .max_iters(100)
            .target_time(Duration::from_millis(100));

        let mut counter = 0u64;
        let result = runner.bench("counter_increment", || {
            counter += 1;
        });

        assert!(result.iterations >= 10);
        assert!(result.iterations <= 100);
        // Two warmup calls plus one call per timed iteration.
        assert_eq!(counter, result.iterations + 2);
        assert!(result.throughput_ops > 0.0);
        assert!(result.mean_duration <= result.max_duration);
        assert!(result.min_duration <= result.mean_duration);
    }

    /// Throughput benchmarks report a positive byte rate.
    #[test]
    fn test_benchmark_throughput() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(10)
            .max_iters(50)
            .target_time(Duration::from_millis(50));

        let result = runner.bench_throughput("memcpy_1kb", 1024, || {
            let src = vec![0u8; 1024];
            std::hint::black_box(&src);
        });

        assert!(result.throughput_bytes.is_some());
        assert!(result.throughput_bytes.unwrap() > 0.0);
    }

    /// The suite stores results and names every benchmark in its report.
    #[test]
    fn test_benchmark_suite() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(5)
            .max_iters(10)
            .target_time(Duration::from_millis(10));

        // Distinctive names: single letters such as "a" also occur in the
        // fixed report text, which would make the checks below vacuous.
        let mut suite = BenchmarkSuite::new("test_suite");
        suite.add_result(runner.bench("alpha_case", || {}));
        suite.add_result(runner.bench("beta_case", || {}));

        assert_eq!(suite.results().len(), 2);
        assert_eq!(suite.name(), "test_suite");

        let report = suite.report();
        assert!(report.contains("test_suite"));
        assert!(report.contains("alpha_case"));
        assert!(report.contains("beta_case"));
        assert!(report.contains("Total benchmarks: 2"));
    }

    /// The built-in suite runs end to end and every entry did real work.
    #[test]
    fn test_builtin_benchmarks() {
        let suite = run_builtin_benchmarks().unwrap();
        assert!(!suite.results().is_empty());
        // Verify each benchmark has meaningful results
        for r in suite.results() {
            assert!(r.iterations > 0, "Benchmark {} had 0 iterations", r.name);
            assert!(r.throughput_ops > 0.0, "Benchmark {} had 0 throughput", r.name);
        }
    }

    /// The summary line mentions the name and the iteration count.
    #[test]
    fn test_benchmark_result_summary() {
        let result = BenchmarkResult {
            name: "test".to_string(),
            iterations: 100,
            total_duration: Duration::from_millis(100),
            mean_duration: Duration::from_millis(1),
            median_duration: Duration::from_millis(1),
            min_duration: Duration::from_micros(500),
            max_duration: Duration::from_millis(2),
            std_dev: Duration::from_micros(200),
            throughput_ops: 1000.0,
            throughput_bytes: None,
        };
        let summary = result.summary();
        assert!(summary.contains("test"));
        assert!(summary.contains("100 iters"));
    }
}
cuda_rust_wasm/runtime/benchmark.rs

cuda_rust_wasm/runtime/
benchmark.rs