rlx_ir/measure.rs
1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Cycle-accurate timing primitive (#66 in plan.md).
17//!
18//! `Instant::now()` on macOS already wraps `mach_continuous_time` (which on
19//! Apple Silicon ultimately reads `CNTVCT_EL0`), so the wall-clock precision
20//! is fine. The win from going direct is two-fold:
21//!
22//! 1. **Resolution.** `Instant` exposes nanoseconds via `Duration`, but the
23//! hardware tick is ~41 ns on M-series (24 MHz `CNTFRQ_EL0`). Tracking
24//! raw ticks lets the autotuner reason at the actual hardware grain
25//! instead of pretending it has 1 ns precision.
26//! 2. **Overhead.** `Instant::now()` is a few hundred cycles of wrapper
27//! and `Duration` math. The raw `mrs` is one instruction. For
28//! sub-microsecond kernels (small-tile probes in `calibrate.rs`) the
29//! wrapper itself becomes a measurable fraction of the timed region.
30//!
31//! Falls back to `Instant` on non-AArch64 / non-Apple targets so the API
32//! stays portable.
33
34#[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
35use std::time::Instant;
36
/// Opaque timestamp captured by [`Tick::now`].
///
/// A `Tick` carries no meaning on its own. Take one reading before and one
/// after the region of interest, then call [`Tick::elapsed_ns`] (or the
/// `_us` / `_ms` variants) on the later reading, passing the earlier one.
///
/// On AArch64 Apple targets the raw counter value is stored; on every other
/// target the reading is a [`std::time::Instant`].
#[derive(Copy, Clone, Debug)]
pub struct Tick {
    #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
    cycles: u64,
    #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
    instant: Instant,
}

impl Tick {
    /// Read the current tick.
    #[inline(always)]
    pub fn now() -> Self {
        #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
        {
            Tick {
                cycles: read_cntvct(),
            }
        }
        #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
        {
            Tick {
                instant: Instant::now(),
            }
        }
    }

    /// Nanoseconds elapsed from `start` up to `self`.
    ///
    /// `self` is the later reading and `start` the earlier one. The result
    /// saturates in both directions instead of panicking or wrapping:
    ///
    /// * `0` when `start` is not earlier than `self` (the clock shouldn't go
    ///   backwards, but the kernel is allowed to lie);
    /// * `u64::MAX` when the span does not fit in 64 bits of nanoseconds.
    #[inline(always)]
    pub fn elapsed_ns(&self, start: Tick) -> u64 {
        #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
        {
            let dt = self.cycles.saturating_sub(start.cycles);
            // Ticks -> ns is `dt * 1e9 / freq`. The product can exceed 64
            // bits, so the arithmetic is widened to u128. `cntfrq_hz` caches
            // the counter frequency after its first call.
            let freq = cntfrq_hz();
            let ns = u128::from(dt) * 1_000_000_000 / u128::from(freq);
            // Clamp rather than truncate: a wrapped value would look like a
            // plausible (but wrong) short duration.
            u64::try_from(ns).unwrap_or(u64::MAX)
        }
        #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
        {
            // `duration_since` is documented as free to panic again in future
            // Rust versions when `start` is later than `self`; the saturating
            // variant is the one that guarantees the contract stated above.
            let dt = self.instant.saturating_duration_since(start.instant);
            u64::try_from(dt.as_nanos()).unwrap_or(u64::MAX)
        }
    }

    /// Microseconds elapsed from `start` up to `self`, as a float.
    ///
    /// Derived from [`Tick::elapsed_ns`], so it inherits the same saturation
    /// behaviour.
    #[inline(always)]
    pub fn elapsed_us(&self, start: Tick) -> f64 {
        self.elapsed_ns(start) as f64 / 1_000.0
    }

    /// Milliseconds elapsed from `start` up to `self`, as a float.
    ///
    /// Derived from [`Tick::elapsed_ns`], so it inherits the same saturation
    /// behaviour.
    #[inline(always)]
    pub fn elapsed_ms(&self, start: Tick) -> f64 {
        self.elapsed_ns(start) as f64 / 1_000_000.0
    }
}
92
/// Return the current value of the AArch64 virtual counter register,
/// `CNTVCT_EL0`.
///
/// Only compiled for AArch64 Apple targets.
///
/// NOTE(review): the read is not preceded by an `isb`, and the asm block is
/// marked `nomem`, so neither the CPU nor the compiler is explicitly told to
/// keep it ordered against surrounding work — confirm this is acceptable for
/// the shortest regions being timed.
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
#[inline(always)]
fn read_cntvct() -> u64 {
    use std::arch::asm;

    let ticks: u64;
    // SAFETY: `mrs <Xt>, cntvct_el0` only copies the virtual count register
    // into a general-purpose register; it is readable from user space on the
    // targets this function is compiled for and touches neither memory nor
    // the stack.
    unsafe {
        asm!("mrs {0}, cntvct_el0", out(reg) ticks, options(nomem, nostack));
    }
    ticks
}
104
/// Counter frequency in Hz, as reported by `CNTFRQ_EL0`.
///
/// The register is read on the first call only; the result is stored in a
/// process-wide `OnceLock` and returned from there afterwards. A reading of
/// zero is replaced by 24 MHz so callers can always divide by the result.
///
/// Only compiled for AArch64 Apple targets.
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
fn cntfrq_hz() -> u64 {
    use std::arch::asm;
    use std::sync::OnceLock;

    // Substituted when the register reads back as zero (24 MHz is the value
    // Apple Silicon is expected to report).
    const FALLBACK_HZ: u64 = 24_000_000;

    static CACHED_HZ: OnceLock<u64> = OnceLock::new();

    let hz = CACHED_HZ.get_or_init(|| {
        let raw: u64;
        // SAFETY: `mrs <Xt>, cntfrq_el0` only copies the counter-frequency
        // register into a general-purpose register; no memory or stack is
        // touched.
        unsafe {
            asm!("mrs {0}, cntfrq_el0", out(reg) raw, options(nomem, nostack));
        }
        match raw {
            0 => FALLBACK_HZ,
            reported => reported,
        }
    });
    *hz
}
118
119/// Time `f`, returning `(result, elapsed_ns)`. Inlined so the surrounding
120/// loop can keep the closure body in registers.
121#[inline(always)]
122pub fn time_ns<R>(f: impl FnOnce() -> R) -> (R, u64) {
123 let t0 = Tick::now();
124 let r = f();
125 let t1 = Tick::now();
126 (r, t1.elapsed_ns(t0))
127}
128
/// Scratch buffer used to push a previous workload's data out of cache.
///
/// The default size targets L1 + L2 on Apple Silicon (the original sizing
/// note quotes 192 KB L1d per core and 16 MB L2 shared per cluster on
/// M-series). The idea is borrowed from MAX's
/// `internal_utils/_cache_busting.mojo` (#19).
///
/// Allocate one instance up front and call [`CacheBuster::thrash`] between
/// bench iterations. Otherwise a timing labelled "cache-cold" really measures
/// a warm cache and over-reports performance by 2-5×.
pub struct CacheBuster {
    buf: Vec<u8>,
}

impl CacheBuster {
    /// Create a buster with the default 32 MB buffer — twice the M-series L2
    /// size quoted on the type, chosen so a full walk evicts all of L2.
    ///
    /// Use [`CacheBuster::with_bytes`] to pick a different size (e.g. 256 KB
    /// to evict only L1).
    pub fn new() -> Self {
        const DEFAULT_BYTES: usize = 32 * 1024 * 1024;
        Self::with_bytes(DEFAULT_BYTES)
    }

    /// Create a buster backed by a zero-filled buffer of exactly `bytes`
    /// bytes. A size of zero is allowed; `thrash` then does nothing.
    pub fn with_bytes(bytes: usize) -> Self {
        let buf = vec![0u8; bytes];
        Self { buf }
    }

    /// Walk the buffer once, reading and rewriting one byte in every 64-byte
    /// step. The walk is intended to displace whatever the previous workload
    /// left in cache.
    ///
    /// Each visited byte is overwritten with the wrapping running sum of the
    /// visited bytes so far; bytes between steps are left untouched.
    #[inline(never)]
    pub fn thrash(&mut self) {
        // NOTE(review): Apple Silicon is commonly reported to use 128-byte
        // cache lines; a 64-byte step still lands in every such line (twice),
        // so coverage is unaffected — confirm whether 64 was intentional.
        const STRIDE: usize = 64;

        let mut running: u8 = 0;
        for byte in self.buf.iter_mut().step_by(STRIDE) {
            let slot: *mut u8 = byte;
            // SAFETY: `slot` comes from a live, exclusive `&mut u8` into
            // `self.buf`, so it is valid and aligned for one byte of reads
            // and writes. Volatile accesses stop the optimizer from eliding
            // the walk.
            unsafe {
                running = running.wrapping_add(std::ptr::read_volatile(slot));
                std::ptr::write_volatile(slot, running);
            }
        }
        // Keep the accumulator observable so the loop cannot be discarded as
        // dead work at aggressive optimization levels.
        std::hint::black_box(running);
    }
}
179
180impl Default for CacheBuster {
181 fn default() -> Self {
182 Self::new()
183 }
184}
185
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;
    use std::time::Duration;

    /// Take two readings separated by a short sleep: `(earlier, later)`.
    ///
    /// Back-to-back reads can land on the same counter tick (~41 ns at
    /// 24 MHz), so the 50 µs sleep — far longer than one tick — guarantees
    /// the two readings differ.
    fn spaced_readings() -> (Tick, Tick) {
        let earlier = Tick::now();
        sleep(Duration::from_micros(50));
        let later = Tick::now();
        (earlier, later)
    }

    #[test]
    fn tick_is_monotonic() {
        let (a, b) = spaced_readings();
        assert!(b.elapsed_ns(a) > 0);
    }

    #[test]
    fn elapsed_saturates_when_arguments_are_reversed() {
        let (a, b) = spaced_readings();
        // `a` is earlier than `b`, so "time since b" as seen from `a` must
        // clamp to zero rather than panic or wrap.
        assert_eq!(a.elapsed_ns(b), 0);
    }

    #[test]
    fn elapsed_units_agree() {
        let a = Tick::now();
        sleep(Duration::from_millis(2));
        let b = Tick::now();
        let ns = b.elapsed_ns(a);
        assert!(ns >= 1_500_000, "expected >=1.5ms, got {ns}ns");
        assert!((b.elapsed_ms(a) - ns as f64 / 1e6).abs() < 1e-6);
        assert!((b.elapsed_us(a) - ns as f64 / 1e3).abs() < 1e-6);
    }

    #[test]
    fn time_ns_returns_result_and_duration() {
        let (value, ns) = time_ns(|| {
            sleep(Duration::from_millis(2));
            7u32
        });
        assert_eq!(value, 7);
        assert!(ns >= 1_500_000, "expected >=1.5ms, got {ns}ns");
    }

    #[test]
    fn thrash_rewrites_one_byte_per_stride() {
        let mut cb = CacheBuster::with_bytes(256);
        cb.buf[0] = 1;
        cb.buf[1] = 9;
        cb.buf[64] = 2;
        cb.buf[128] = 3;
        cb.thrash();
        // Visited bytes hold the running wrapping sum of the visited bytes.
        assert_eq!(cb.buf[0], 1);
        assert_eq!(cb.buf[64], 3);
        assert_eq!(cb.buf[128], 6);
        assert_eq!(cb.buf[192], 6);
        // Bytes between strides are left alone.
        assert_eq!(cb.buf[1], 9);
    }

    #[test]
    fn thrash_handles_empty_buffer() {
        let mut cb = CacheBuster::with_bytes(0);
        cb.thrash();
        assert!(cb.buf.is_empty());
    }
}
210}