Skip to main content

svod_runtime/
benchmark.rs

1//! Kernel benchmarking infrastructure for auto-tuning.
2//!
3//! Provides timing utilities for measuring kernel execution performance,
4//! used by beam search optimization to compare candidate kernels.
5
6use std::sync::OnceLock;
7use std::time::{Duration, Instant};
8
9use svod_device::device::Program;
10
11use crate::Result;
12
/// Configuration for kernel benchmarking.
///
/// Consumed by `benchmark_kernel`; see `Default` for the values used when the
/// caller does not override anything.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of warmup runs executed before timing starts (never timed).
    pub warmup_runs: usize,
    /// Maximum number of timed runs. Fewer samples are collected when
    /// `early_stop` cuts the loop short.
    pub timing_runs: usize,
    /// Which statistic the caller wants: minimum (`true`) or mean (`false`).
    /// `benchmark_kernel` always computes both; this flag is meant to be
    /// forwarded to `BenchmarkResult::timing`.
    pub take_minimum: bool,
    /// If set, abort the timing loop as soon as the *fastest* run observed so
    /// far still exceeds this threshold. A slow outlier that follows a run at
    /// or below the threshold does not abort the loop. Used by beam search to
    /// skip candidates clearly slower than the current best (typically
    /// `early_stop = beam[0].timing * 3`).
    pub early_stop: Option<Duration>,
    /// Invalidate L2 between runs by streaming through a scratch buffer.
    /// Stabilises rankings — without this, second/third runs hit hot caches
    /// and bias beam toward smaller-tile candidates.
    pub clear_l2: bool,
}
31
32impl Default for BenchmarkConfig {
33    fn default() -> Self {
34        // 3 timing runs, take the minimum — variance from rayon dispatch
35        // and OS scheduling is much larger than per-run overhead, so the
36        // min of 3 is a tighter estimate of the kernel's true cost than
37        // any longer-running statistic.
38        Self { warmup_runs: 0, timing_runs: 3, take_minimum: true, early_stop: None, clear_l2: false }
39    }
40}
41
/// Timing statistics produced by one benchmarking session.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Fastest of the recorded runs.
    pub min: Duration,
    /// Arithmetic mean of the recorded runs.
    pub mean: Duration,
    /// Every individual measurement that was recorded.
    pub runs: Vec<Duration>,
}
52
53impl BenchmarkResult {
54    /// Get the timing value based on config preference.
55    pub fn timing(&self, take_minimum: bool) -> Duration {
56        if take_minimum { self.min } else { self.mean }
57    }
58}
59
60/// Benchmark a compiled kernel's execution time.
61///
62/// Runs warmup iterations (discarded), then timing iterations.
63/// Returns min/mean/all timings.
64///
65/// # Safety
66///
67/// All buffer pointers must be valid for the duration of benchmarking.
68/// The kernel will be executed multiple times.
69///
70/// # Example
71///
72/// ```ignore
73/// let config = BenchmarkConfig::default();
74/// let result = unsafe { benchmark_kernel(&kernel, &buffers, &vals, None, None, &config)? };
75/// println!("Min time: {:?}", result.min);
76/// ```
77pub unsafe fn benchmark_kernel(
78    kernel: &dyn Program,
79    buffers: &[*mut u8],
80    vals: &[i64],
81    global_size: Option<[usize; 3]>,
82    local_size: Option<[usize; 3]>,
83    config: &BenchmarkConfig,
84) -> Result<BenchmarkResult> {
85    // Warmup runs (discard timing)
86    for _ in 0..config.warmup_runs {
87        unsafe { kernel.execute(buffers, vals, global_size, local_size)? };
88    }
89
90    // Timing runs
91    let mut runs = Vec::with_capacity(config.timing_runs);
92    for i in 0..config.timing_runs {
93        if config.clear_l2 && i > 0 {
94            invalidate_l2();
95        }
96        let start = Instant::now();
97        unsafe { kernel.execute(buffers, vals, global_size, local_size)? };
98        runs.push(start.elapsed());
99
100        // Min-of-runs early stop: abort only when the best run so far still
101        // exceeds the threshold. A single jitter outlier in an otherwise
102        // competitive candidate must not disqualify it — `take_minimum=true`
103        // already discards tail noise from the final result.
104        if let Some(threshold) = config.early_stop
105            && runs.iter().copied().min().expect("runs non-empty after push") > threshold
106        {
107            break;
108        }
109    }
110
111    // Calculate statistics
112    let min = runs.iter().copied().min().unwrap_or(Duration::ZERO);
113    let total: Duration = runs.iter().sum();
114    let mean = total / runs.len().max(1) as u32;
115
116    Ok(BenchmarkResult { min, mean, runs })
117}
118
119/// Force rayon's global thread pool to materialise.
120///
121/// Subsequent rayon calls dispatch in O(1), but the lazy initialisation can
122/// dominate the first 1-2 measurements at the small kernel sizes BEAM-time
123/// uses. Call this once before a benchmark loop to remove that bias.
124pub fn warmup_thread_pool() {
125    rayon::join(|| (), || ());
126}
127
/// Stream through a 16 MiB scratch buffer to evict L2 between timing runs.
///
/// Sizing rationale (from the original author): Apple M1 P-core L2 is
/// 12 MiB and A14/M2 L2 caches are similar, so 16 MiB is large enough to
/// fully evict L2 on common Apple Silicon and x86 desktop CPUs.
///
/// The scratch buffer is allocated once per process via `OnceLock` and reused
/// across calls. It is deliberately filled with a non-zero byte: a zero-filled
/// `Vec` that is never written may be backed lazily by a single shared zero
/// page (e.g. anonymous mappings on Linux), in which case the sweep would
/// read almost no distinct physical memory and evict nothing. Writing a
/// pattern forces every page to be committed. `black_box` prevents the
/// compiler from eliding the reads.
fn invalidate_l2() {
    const SCRATCH_BYTES: usize = 16 * 1024 * 1024;
    // One read per 64-byte line; this stride also reaches every line on
    // hardware with larger cache lines.
    const CACHE_LINE_BYTES: usize = 64;
    // Any non-zero value works; it only has to force real page commits.
    const FILL: u8 = 0xA5;

    static SCRATCH: OnceLock<Vec<u8>> = OnceLock::new();
    let scratch = SCRATCH.get_or_init(|| vec![FILL; SCRATCH_BYTES]);

    let checksum = scratch
        .iter()
        .step_by(CACHE_LINE_BYTES)
        .fold(0u8, |acc, &byte| acc.wrapping_add(byte));
    std::hint::black_box(checksum);
}
149
150#[cfg(test)]
151#[path = "test/unit/benchmark.rs"]
152mod tests;