svod_runtime/benchmark.rs
1//! Kernel benchmarking infrastructure for auto-tuning.
2//!
3//! Provides timing utilities for measuring kernel execution performance,
4//! used by beam search optimization to compare candidate kernels.
5
6use std::sync::OnceLock;
7use std::time::{Duration, Instant};
8
9use svod_device::device::Program;
10
11use crate::Result;
12
/// Knobs controlling how [`benchmark_kernel`] measures a kernel.
#[derive(Clone, Debug)]
pub struct BenchmarkConfig {
    /// How many untimed executions precede the measurements.
    pub warmup_runs: usize,
    /// Upper bound on the number of timed executions.
    pub timing_runs: usize,
    /// Selects the statistic callers should read from the result:
    /// `true` for the minimum, `false` for the mean.
    pub take_minimum: bool,
    /// Optional cut-off for hopeless candidates. When set, the timing loop
    /// ends early once the fastest run recorded so far is above this
    /// threshold. Beam search uses it to skip candidates clearly slower than
    /// the current best (typically `early_stop = beam[0].timing * 3`).
    pub early_stop: Option<Duration>,
    /// Flush L2 between runs by streaming through a scratch buffer.
    /// Stabilises rankings — otherwise later runs hit hot caches, which
    /// biases beam search toward smaller-tile candidates.
    pub clear_l2: bool,
}
31
32impl Default for BenchmarkConfig {
33 fn default() -> Self {
34 // 3 timing runs, take the minimum — variance from rayon dispatch
35 // and OS scheduling is much larger than per-run overhead, so the
36 // min of 3 is a tighter estimate of the kernel's true cost than
37 // any longer-running statistic.
38 Self { warmup_runs: 0, timing_runs: 3, take_minimum: true, early_stop: None, clear_l2: false }
39 }
40}
41
/// Timing statistics produced by one benchmarking session.
#[derive(Clone, Debug)]
pub struct BenchmarkResult {
    /// Fastest of the recorded runs.
    pub min: Duration,
    /// Arithmetic mean of the recorded runs.
    pub mean: Duration,
    /// Every individual measurement, in execution order.
    pub runs: Vec<Duration>,
}
52
53impl BenchmarkResult {
54 /// Get the timing value based on config preference.
55 pub fn timing(&self, take_minimum: bool) -> Duration {
56 if take_minimum { self.min } else { self.mean }
57 }
58}
59
60/// Benchmark a compiled kernel's execution time.
61///
62/// Runs warmup iterations (discarded), then timing iterations.
63/// Returns min/mean/all timings.
64///
65/// # Safety
66///
67/// All buffer pointers must be valid for the duration of benchmarking.
68/// The kernel will be executed multiple times.
69///
70/// # Example
71///
72/// ```ignore
73/// let config = BenchmarkConfig::default();
74/// let result = unsafe { benchmark_kernel(&kernel, &buffers, &vals, None, None, &config)? };
75/// println!("Min time: {:?}", result.min);
76/// ```
77pub unsafe fn benchmark_kernel(
78 kernel: &dyn Program,
79 buffers: &[*mut u8],
80 vals: &[i64],
81 global_size: Option<[usize; 3]>,
82 local_size: Option<[usize; 3]>,
83 config: &BenchmarkConfig,
84) -> Result<BenchmarkResult> {
85 // Warmup runs (discard timing)
86 for _ in 0..config.warmup_runs {
87 unsafe { kernel.execute(buffers, vals, global_size, local_size)? };
88 }
89
90 // Timing runs
91 let mut runs = Vec::with_capacity(config.timing_runs);
92 for i in 0..config.timing_runs {
93 if config.clear_l2 && i > 0 {
94 invalidate_l2();
95 }
96 let start = Instant::now();
97 unsafe { kernel.execute(buffers, vals, global_size, local_size)? };
98 runs.push(start.elapsed());
99
100 // Min-of-runs early stop: abort only when the best run so far still
101 // exceeds the threshold. A single jitter outlier in an otherwise
102 // competitive candidate must not disqualify it — `take_minimum=true`
103 // already discards tail noise from the final result.
104 if let Some(threshold) = config.early_stop
105 && runs.iter().copied().min().expect("runs non-empty after push") > threshold
106 {
107 break;
108 }
109 }
110
111 // Calculate statistics
112 let min = runs.iter().copied().min().unwrap_or(Duration::ZERO);
113 let total: Duration = runs.iter().sum();
114 let mean = total / runs.len().max(1) as u32;
115
116 Ok(BenchmarkResult { min, mean, runs })
117}
118
119/// Force rayon's global thread pool to materialise.
120///
121/// Subsequent rayon calls dispatch in O(1), but the lazy initialisation can
122/// dominate the first 1-2 measurements at the small kernel sizes BEAM-time
123/// uses. Call this once before a benchmark loop to remove that bias.
124pub fn warmup_thread_pool() {
125 rayon::join(|| (), || ());
126}
127
/// Stream through a 16 MiB scratch buffer to evict L2 between timing runs.
///
/// The size was chosen to exceed the 12 MiB L2 of Apple M1 P-cores.
/// NOTE(review): parts whose L2 is 16 MiB or larger may not be fully
/// evicted by a buffer of this size — confirm for the targets that matter.
///
/// The buffer is allocated once per process via `OnceLock` and reused by
/// every call. It is filled with a non-zero byte on purpose: a zero-filled
/// allocation may be served from lazily mapped zeroed pages, which an OS can
/// alias to a single physical page for as long as they are only read. In
/// that case the scan would touch almost no distinct memory and evict
/// nothing. Writing the fill value forces every page to be backed.
///
/// `black_box` on both the input slice and the accumulated sum keeps the
/// compiler from eliding the reads.
fn invalidate_l2() {
    const SCRATCH_BYTES: usize = 16 * 1024 * 1024;
    static SCRATCH: OnceLock<Vec<u8>> = OnceLock::new();
    let scratch = SCRATCH.get_or_init(|| vec![1u8; SCRATCH_BYTES]);

    let checksum = sum_one_byte_per_cache_line(std::hint::black_box(scratch.as_slice()));
    std::hint::black_box(checksum);
}

/// Read one byte out of every 64-byte stride of `bytes` and return their wrapping sum.
fn sum_one_byte_per_cache_line(bytes: &[u8]) -> u8 {
    // One read per stride is enough to pull the whole line into the cache.
    const CACHE_LINE_BYTES: usize = 64;
    bytes
        .iter()
        .step_by(CACHE_LINE_BYTES)
        .fold(0u8, |acc, &byte| acc.wrapping_add(byte))
}
149
150#[cfg(test)]
151#[path = "test/unit/benchmark.rs"]
152mod tests;