1use crate::Result;
7use std::time::{Duration, Instant};
8
/// Timing statistics collected for one named benchmark.
///
/// Derives `PartialEq` so results can be compared directly in tests and
/// regression checks. `Eq` is intentionally absent because the throughput
/// fields are floating point.
#[derive(Debug, Clone, PartialEq)]
pub struct BenchmarkResult {
    /// Name the benchmark was registered under.
    pub name: String,
    /// Number of timed iterations the statistics cover.
    pub iterations: u64,
    /// Sum of all per-iteration durations.
    pub total_duration: Duration,
    /// Average per-iteration duration.
    pub mean_duration: Duration,
    /// Median per-iteration duration.
    pub median_duration: Duration,
    /// Shortest single iteration.
    pub min_duration: Duration,
    /// Longest single iteration.
    pub max_duration: Duration,
    /// Standard deviation of the per-iteration durations.
    pub std_dev: Duration,
    /// Operations (iterations) per second, derived from the mean duration.
    pub throughput_ops: f64,
    /// Bytes per second; `Some` only when a per-iteration byte count was
    /// supplied to the benchmark.
    pub throughput_bytes: Option<f64>,
}
33
34impl BenchmarkResult {
35 pub fn summary(&self) -> String {
37 format!(
38 "{}: {:.2?}/iter ({} iters, {:.2?} total, {:.0} ops/s)",
39 self.name,
40 self.mean_duration,
41 self.iterations,
42 self.total_duration,
43 self.throughput_ops,
44 )
45 }
46}
47
/// Configuration for timing closures: how long to warm up, how many samples
/// to take, and how much wall-clock time to spend.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so a configuration can be
/// logged, reused and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BenchmarkRunner {
    /// Untimed calls made before measurement starts.
    warmup_iterations: u64,
    /// Sample count that must be reached before the time budget may end a run.
    min_iterations: u64,
    /// Hard upper bound on the number of timed calls.
    max_iterations: u64,
    /// Wall-clock budget for the timed phase of a run.
    target_time: Duration,
}
55
56impl BenchmarkRunner {
57 pub fn new() -> Self {
59 Self {
60 warmup_iterations: 10,
61 min_iterations: 100,
62 max_iterations: 10_000,
63 target_time: Duration::from_secs(2),
64 }
65 }
66
67 pub fn warmup(mut self, n: u64) -> Self {
69 self.warmup_iterations = n;
70 self
71 }
72
73 pub fn min_iters(mut self, n: u64) -> Self {
75 self.min_iterations = n;
76 self
77 }
78
79 pub fn max_iters(mut self, n: u64) -> Self {
81 self.max_iterations = n;
82 self
83 }
84
85 pub fn target_time(mut self, d: Duration) -> Self {
87 self.target_time = d;
88 self
89 }
90
91 pub fn bench<F>(&self, name: &str, mut f: F) -> BenchmarkResult
93 where
94 F: FnMut(),
95 {
96 for _ in 0..self.warmup_iterations {
98 f();
99 }
100
101 let mut durations = Vec::new();
103 let global_start = Instant::now();
104
105 for i in 0..self.max_iterations {
106 let start = Instant::now();
107 f();
108 let elapsed = start.elapsed();
109 durations.push(elapsed);
110
111 if i >= self.min_iterations && global_start.elapsed() >= self.target_time {
112 break;
113 }
114 }
115
116 let iterations = durations.len() as u64;
117 self.compute_result(name, &durations, iterations, None)
118 }
119
120 pub fn bench_throughput<F>(
122 &self,
123 name: &str,
124 bytes_per_iter: usize,
125 mut f: F,
126 ) -> BenchmarkResult
127 where
128 F: FnMut(),
129 {
130 for _ in 0..self.warmup_iterations {
132 f();
133 }
134
135 let mut durations = Vec::new();
137 let global_start = Instant::now();
138
139 for i in 0..self.max_iterations {
140 let start = Instant::now();
141 f();
142 let elapsed = start.elapsed();
143 durations.push(elapsed);
144
145 if i >= self.min_iterations && global_start.elapsed() >= self.target_time {
146 break;
147 }
148 }
149
150 let iterations = durations.len() as u64;
151 self.compute_result(name, &durations, iterations, Some(bytes_per_iter))
152 }
153
154 fn compute_result(
155 &self,
156 name: &str,
157 durations: &[Duration],
158 iterations: u64,
159 bytes_per_iter: Option<usize>,
160 ) -> BenchmarkResult {
161 let total: Duration = durations.iter().sum();
162 let mean = total / iterations as u32;
163
164 let mut sorted: Vec<Duration> = durations.to_vec();
165 sorted.sort();
166 let median = sorted[sorted.len() / 2];
167 let min = sorted[0];
168 let max = sorted[sorted.len() - 1];
169
170 let mean_nanos = mean.as_nanos() as f64;
172 let variance: f64 = durations
173 .iter()
174 .map(|d| {
175 let diff = d.as_nanos() as f64 - mean_nanos;
176 diff * diff
177 })
178 .sum::<f64>()
179 / iterations as f64;
180 let std_dev_nanos = variance.sqrt();
181 let std_dev = Duration::from_nanos(std_dev_nanos as u64);
182
183 let throughput_ops = if mean.as_nanos() > 0 {
184 1_000_000_000.0 / mean_nanos
185 } else {
186 f64::INFINITY
187 };
188
189 let throughput_bytes = bytes_per_iter.map(|bpi| {
190 throughput_ops * bpi as f64
191 });
192
193 BenchmarkResult {
194 name: name.to_string(),
195 iterations,
196 total_duration: total,
197 mean_duration: mean,
198 median_duration: median,
199 min_duration: min,
200 max_duration: max,
201 std_dev,
202 throughput_ops,
203 throughput_bytes,
204 }
205 }
206}
207
208impl Default for BenchmarkRunner {
209 fn default() -> Self {
210 Self::new()
211 }
212}
213
214pub struct BenchmarkSuite {
216 name: String,
217 results: Vec<BenchmarkResult>,
218}
219
220impl BenchmarkSuite {
221 pub fn new(name: &str) -> Self {
223 Self {
224 name: name.to_string(),
225 results: Vec::new(),
226 }
227 }
228
229 pub fn add_result(&mut self, result: BenchmarkResult) {
231 self.results.push(result);
232 }
233
234 pub fn results(&self) -> &[BenchmarkResult] {
236 &self.results
237 }
238
239 pub fn name(&self) -> &str {
241 &self.name
242 }
243
244 pub fn report(&self) -> String {
246 let mut lines = Vec::new();
247 lines.push(format!("=== Benchmark Suite: {} ===", self.name));
248 lines.push(String::new());
249
250 let max_name_len = self.results.iter().map(|r| r.name.len()).max().unwrap_or(20);
251
252 lines.push(format!(
253 "{:<width$} {:>12} {:>12} {:>12} {:>12} {:>12}",
254 "Benchmark", "Mean", "Median", "Min", "Max", "Ops/s",
255 width = max_name_len
256 ));
257 lines.push("-".repeat(max_name_len + 66));
258
259 for r in &self.results {
260 lines.push(format!(
261 "{:<width$} {:>12.2?} {:>12.2?} {:>12.2?} {:>12.2?} {:>12.0}",
262 r.name,
263 r.mean_duration,
264 r.median_duration,
265 r.min_duration,
266 r.max_duration,
267 r.throughput_ops,
268 width = max_name_len
269 ));
270 }
271
272 lines.push(String::new());
273 lines.push(format!("Total benchmarks: {}", self.results.len()));
274 lines.join("\n")
275 }
276}
277
278pub fn run_builtin_benchmarks() -> Result<BenchmarkSuite> {
280 let runner = BenchmarkRunner::new()
281 .warmup(5)
282 .min_iters(50)
283 .max_iters(1000)
284 .target_time(Duration::from_millis(500));
285
286 let mut suite = BenchmarkSuite::new("cuda-rust-wasm");
287
288 suite.add_result(runner.bench("pool_allocate_1kb", || {
290 let pool = crate::memory::MemoryPool::new();
291 let buf = pool.allocate(1024);
292 pool.deallocate(buf);
293 }));
294
295 suite.add_result(runner.bench("pool_allocate_64kb", || {
296 let pool = crate::memory::MemoryPool::new();
297 let buf = pool.allocate(65536);
298 pool.deallocate(buf);
299 }));
300
301 suite.add_result(runner.bench_throughput("host_buffer_fill_1kb", 1024, || {
302 let mut buf = crate::memory::HostBuffer::<u8>::new(1024).unwrap();
303 buf.fill(0xFF);
304 }));
305
306 use crate::runtime::kernel::{KernelFunction, ThreadContext, LaunchConfig};
308 use crate::runtime::grid::{Grid, Block};
309
310 struct NoopKernel;
311 impl KernelFunction<()> for NoopKernel {
312 fn execute(&self, _: (), _ctx: ThreadContext) {}
313 fn name(&self) -> &str { "noop" }
314 }
315
316 suite.add_result(runner.bench("kernel_launch_1x1", || {
317 let _ = crate::runtime::kernel::launch_kernel(
318 NoopKernel,
319 LaunchConfig::new(Grid::new(1u32), Block::new(1u32)),
320 (),
321 );
322 }));
323
324 suite.add_result(runner.bench("kernel_launch_1x256", || {
325 let _ = crate::runtime::kernel::launch_kernel(
326 NoopKernel,
327 LaunchConfig::new(Grid::new(1u32), Block::new(256u32)),
328 (),
329 );
330 }));
331
332 suite.add_result(runner.bench("kernel_launch_4x256", || {
333 let _ = crate::runtime::kernel::launch_kernel(
334 NoopKernel,
335 LaunchConfig::new(Grid::new(4u32), Block::new(256u32)),
336 (),
337 );
338 }));
339
340 let simple_cuda = r#"
342 __global__ void add(float* a, float* b, float* c) {
343 int i = threadIdx.x;
344 c[i] = a[i] + b[i];
345 }
346 "#;
347
348 suite.add_result(runner.bench("transpile_simple_kernel", || {
349 let t = crate::transpiler::CudaTranspiler::new();
350 let _ = t.transpile(simple_cuda, false, false);
351 }));
352
353 suite.add_result(runner.bench("transpile_with_optimization", || {
354 let t = crate::transpiler::CudaTranspiler::new();
355 let _ = t.transpile(simple_cuda, true, true);
356 }));
357
358 suite.add_result(runner.bench("parse_simple_kernel", || {
360 let p = crate::parser::CudaParser::new();
361 let _ = p.parse(simple_cuda);
362 }));
363
364 suite.add_result(runner.bench("half_f32_roundtrip_1000", || {
366 for i in 0..1000 {
367 let h = crate::runtime::half::Half::from_f32(i as f32);
368 std::hint::black_box(h.to_f32());
369 }
370 }));
371
372 suite.add_result(runner.bench("half_dot_product_256", || {
373 let a: Vec<_> = (0..256).map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01)).collect();
374 let b: Vec<_> = (0..256).map(|i| crate::runtime::half::Half::from_f32(i as f32 * 0.01)).collect();
375 std::hint::black_box(crate::runtime::half::half_dot(&a, &b));
376 }));
377
378 Ok(suite)
379}
380
#[cfg(test)]
mod tests {
    use super::*;

    /// The closure must run once per warm-up call and once per timed
    /// iteration, and the reported statistics must be mutually consistent.
    #[test]
    fn test_benchmark_runner_basic() {
        let runner = BenchmarkRunner::new()
            .warmup(2)
            .min_iters(10)
            .max_iters(100)
            .target_time(Duration::from_millis(100));

        let mut counter = 0u64;
        let result = runner.bench("counter_increment", || {
            counter += 1;
        });

        assert_eq!(result.name, "counter_increment");
        assert!(result.iterations >= 10);
        assert!(result.iterations <= 100);
        // Two warm-up calls plus one call per timed iteration.
        assert_eq!(counter, 2 + result.iterations);
        assert!(result.throughput_ops > 0.0);
        assert!(result.mean_duration <= result.max_duration);
        assert!(result.min_duration <= result.mean_duration);
        assert!(result.min_duration <= result.median_duration);
        assert!(result.median_duration <= result.max_duration);
        assert!(result.throughput_bytes.is_none());
    }

    /// Throughput benchmarks must report a positive byte rate.
    #[test]
    fn test_benchmark_throughput() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(10)
            .max_iters(50)
            .target_time(Duration::from_millis(50));

        let result = runner.bench_throughput("memcpy_1kb", 1024, || {
            let src = vec![0u8; 1024];
            std::hint::black_box(&src);
        });

        assert!(result.iterations >= 10);
        assert!(result.iterations <= 50);
        let bytes_per_sec = result
            .throughput_bytes
            .expect("throughput benchmarks report a byte rate");
        assert!(bytes_per_sec > 0.0);
        assert!(bytes_per_sec >= result.throughput_ops);
    }

    /// The suite keeps results in order and the report names each of them.
    #[test]
    fn test_benchmark_suite() {
        let runner = BenchmarkRunner::new()
            .warmup(1)
            .min_iters(5)
            .max_iters(10)
            .target_time(Duration::from_millis(10));

        // Distinctive names: single letters would also match the fixed
        // header and footer text, making the `contains` checks vacuous.
        let mut suite = BenchmarkSuite::new("test_suite");
        suite.add_result(runner.bench("alpha_case", || {}));
        suite.add_result(runner.bench("beta_case", || {}));

        assert_eq!(suite.results().len(), 2);
        assert_eq!(suite.results()[0].name, "alpha_case");
        assert_eq!(suite.results()[1].name, "beta_case");
        assert_eq!(suite.name(), "test_suite");

        let report = suite.report();
        assert!(report.contains("=== Benchmark Suite: test_suite ==="));
        assert!(report.contains("alpha_case"));
        assert!(report.contains("beta_case"));
        assert!(report.contains("Total benchmarks: 2"));
    }

    /// Every built-in benchmark must run at least once and report a rate.
    #[test]
    fn test_builtin_benchmarks() {
        let suite = run_builtin_benchmarks().unwrap();
        assert!(!suite.results().is_empty());
        for r in suite.results() {
            assert!(r.iterations > 0, "Benchmark {} had 0 iterations", r.name);
            assert!(r.throughput_ops > 0.0, "Benchmark {} had 0 throughput", r.name);
        }
    }

    /// The summary line mentions the name and the iteration count.
    #[test]
    fn test_benchmark_result_summary() {
        let result = BenchmarkResult {
            name: "test".to_string(),
            iterations: 100,
            total_duration: Duration::from_millis(100),
            mean_duration: Duration::from_millis(1),
            median_duration: Duration::from_millis(1),
            min_duration: Duration::from_micros(500),
            max_duration: Duration::from_millis(2),
            std_dev: Duration::from_micros(200),
            throughput_ops: 1000.0,
            throughput_bytes: None,
        };
        let summary = result.summary();
        assert!(summary.starts_with("test: "));
        assert!(summary.contains("100 iters"));
        assert!(summary.ends_with("1000 ops/s)"));
    }
}