rlx_ir/
measure.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Cycle-accurate timing primitive (#66 in plan.md).
17//!
//! `Instant::now()` on macOS is already backed by the Mach monotonic uptime
//! clock (`mach_absolute_time` or its `clock_gettime` equivalent, which on
//! Apple Silicon ultimately reads `CNTVCT_EL0`), so the wall-clock precision
//! is fine. The win from going direct is two-fold:
21//!
22//!   1. **Resolution.** `Instant` exposes nanoseconds via `Duration`, but the
23//!      hardware tick is ~41 ns on M-series (24 MHz `CNTFRQ_EL0`). Tracking
24//!      raw ticks lets the autotuner reason at the actual hardware grain
25//!      instead of pretending it has 1 ns precision.
26//!   2. **Overhead.** `Instant::now()` is a few hundred cycles of wrapper
27//!      and `Duration` math. The raw `mrs` is one instruction. For
28//!      sub-microsecond kernels (small-tile probes in `calibrate.rs`) the
29//!      wrapper itself becomes a measurable fraction of the timed region.
30//!
31//! Falls back to `Instant` on non-AArch64 / non-Apple targets so the API
32//! stays portable.
33
34#[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
35use std::time::Instant;
36
/// Opaque reading of the timing source.
///
/// A `Tick` has no meaning on its own; pass an earlier reading to
/// [`Tick::elapsed_ns`] (or the `_us` / `_ms` variants) on a later one to
/// obtain the time between them.
#[derive(Copy, Clone, Debug)]
pub struct Tick {
    /// Raw counter value returned by `read_cntvct()` (Apple AArch64 builds).
    #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
    cycles: u64,
    /// Portable fallback reading used on every other target.
    #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
    instant: Instant,
}

impl Tick {
    /// Read the current tick.
    #[inline(always)]
    pub fn now() -> Self {
        #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
        {
            Tick {
                cycles: read_cntvct(),
            }
        }
        #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
        {
            Tick {
                instant: Instant::now(),
            }
        }
    }

    /// Elapsed nanoseconds between `start` and `self`.
    ///
    /// `self` is the later reading and `start` the earlier one. The result
    /// saturates at zero if `start` is actually newer than `self` (the clock
    /// shouldn't go backwards, but the kernel is allowed to lie), and clamps
    /// to `u64::MAX` instead of wrapping if the span does not fit in 64 bits.
    #[inline(always)]
    pub fn elapsed_ns(&self, start: Tick) -> u64 {
        #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
        let nanos: u128 = {
            let dt = self.cycles.saturating_sub(start.cycles);
            // The counter frequency is cached on first use; the conversion is
            // done in 128 bits so `dt * 1e9` cannot overflow.
            let freq = cntfrq_hz();
            u128::from(dt) * 1_000_000_000u128 / u128::from(freq)
        };
        // `duration_since` only saturates as a current implementation detail
        // (it panicked on older toolchains and its docs allow the panic to
        // return); the explicitly saturating variant is what the contract
        // above promises.
        #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))]
        let nanos: u128 = self
            .instant
            .saturating_duration_since(start.instant)
            .as_nanos();

        u64::try_from(nanos).unwrap_or(u64::MAX)
    }

    /// Elapsed microseconds between `start` and `self`, as a float.
    ///
    /// Derived from [`Tick::elapsed_ns`], so the same saturation rules apply.
    #[inline(always)]
    pub fn elapsed_us(&self, start: Tick) -> f64 {
        self.elapsed_ns(start) as f64 / 1_000.0
    }

    /// Elapsed milliseconds between `start` and `self`, as a float.
    ///
    /// Derived from [`Tick::elapsed_ns`], so the same saturation rules apply.
    #[inline(always)]
    pub fn elapsed_ms(&self, start: Tick) -> f64 {
        self.elapsed_ns(start) as f64 / 1_000_000.0
    }
}
92
/// Reads the AArch64 virtual counter register `CNTVCT_EL0` and returns its
/// raw value.
///
/// Only compiled for Apple AArch64 targets; other targets never call it.
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
#[inline(always)]
fn read_cntvct() -> u64 {
    let val: u64;
    // SAFETY: `mrs <Xt>, cntvct_el0` copies the virtual count register into a
    // general-purpose register. It does not touch memory, the stack, or the
    // condition flags, and the register is readable without privilege here.
    //
    // `nomem` is deliberately NOT set: that option tells the compiler the
    // block is independent of memory, which would let it move loads and
    // stores belonging to the timed region across the counter read. Leaving
    // it off keeps the read ordered against surrounding memory accesses at
    // the compiler level.
    //
    // NOTE(review): no `isb` precedes the read, so the CPU itself may still
    // execute it out of order relative to neighbouring instructions — add one
    // if that ever shows up in measurements.
    unsafe {
        std::arch::asm!(
            "mrs {0}, cntvct_el0",
            out(reg) val,
            options(nostack, preserves_flags),
        );
    }
    val
}
104
/// Counter frequency in Hz, as reported by `CNTFRQ_EL0`.
///
/// The register is read on the first call only; the value is then served
/// from a process-wide `OnceLock`.
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
fn cntfrq_hz() -> u64 {
    // Substituted when the register reads as zero, so callers never divide
    // by zero. 24 MHz is the value Apple Silicon is expected to report.
    const FALLBACK_HZ: u64 = 24_000_000;

    static CACHED_HZ: std::sync::OnceLock<u64> = std::sync::OnceLock::new();

    // One-shot hardware query, run at most once via `get_or_init`.
    fn probe() -> u64 {
        let raw: u64;
        // SAFETY: `mrs <Xt>, cntfrq_el0` only copies a system register into a
        // general-purpose register; no memory or stack is accessed.
        unsafe {
            std::arch::asm!("mrs {0}, cntfrq_el0", out(reg) raw, options(nomem, nostack));
        }
        match raw {
            0 => FALLBACK_HZ,
            hz => hz,
        }
    }

    *CACHED_HZ.get_or_init(probe)
}
118
119/// Time `f`, returning `(result, elapsed_ns)`. Inlined so the surrounding
120/// loop can keep the closure body in registers.
121#[inline(always)]
122pub fn time_ns<R>(f: impl FnOnce() -> R) -> (R, u64) {
123    let t0 = Tick::now();
124    let r = f();
125    let t1 = Tick::now();
126    (r, t1.elapsed_ns(t0))
127}
128
/// Scratch buffer for pushing a previous workload's data out of the CPU
/// caches between benchmark iterations.
///
/// Allocate one instance up front and call [`CacheBuster::thrash`] before
/// each timed iteration. Without that step a nominally "cache-cold" timing
/// really measures a warm cache and flatters the kernel (the original note
/// puts the error at 2-5×). The idea is borrowed from MAX's
/// `internal_utils/_cache_busting.mojo` (#19).
///
/// The sizing targets Apple M-series parts, which the original note describes
/// as 192 KB L1d per core and 16 MB of L2 shared per cluster.
/// NOTE(review): 192 KB looks like the L1 *instruction* cache figure —
/// confirm the L1d size before relying on it.
pub struct CacheBuster {
    buf: Vec<u8>,
}

impl CacheBuster {
    /// Creates a buster with the default 32 MB footprint — twice the 16 MB
    /// L2 quoted for M-series, chosen so that one pass displaces the whole L2.
    ///
    /// Use [`CacheBuster::with_bytes`] for a different footprint.
    pub fn new() -> Self {
        const DEFAULT_BYTES: usize = 32 << 20;
        Self::with_bytes(DEFAULT_BYTES)
    }

    /// Creates a buster backed by `bytes` zero-initialised bytes.
    ///
    /// Smaller sizes give finer control, e.g. 256 KB to target only L1.
    pub fn with_bytes(bytes: usize) -> Self {
        let buf = vec![0u8; bytes];
        Self { buf }
    }

    /// Walks the buffer once, reading and rewriting one byte in every
    /// 64-byte span, so the previous workload's lines are displaced.
    ///
    /// Kept out of line (`#[inline(never)]`) and built on volatile accesses
    /// so the optimizer cannot remove the walk.
    #[inline(never)]
    pub fn thrash(&mut self) {
        // The original comment treats 64 bytes as the cache-line size.
        // NOTE(review): if the real line is larger, this stride still touches
        // every line, just more than once.
        const STRIDE: usize = 64;

        let mut running: u8 = 0;
        for slot in self.buf.iter_mut().step_by(STRIDE) {
            let cell: *mut u8 = slot;
            // SAFETY: `cell` is derived from a live `&mut u8` into
            // `self.buf`, so it is valid, aligned and exclusively borrowed
            // for both the read and the write.
            unsafe {
                running = running.wrapping_add(std::ptr::read_volatile(cell));
                std::ptr::write_volatile(cell, running);
            }
        }
        // Keep the running sum observable so the loop has a visible result
        // even at aggressive optimisation levels.
        std::hint::black_box(running);
    }
}

impl Default for CacheBuster {
    /// Same as [`CacheBuster::new`].
    fn default() -> Self {
        CacheBuster::new()
    }
}
185
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;
    use std::time::Duration;

    /// A later reading must report strictly positive elapsed time.
    #[test]
    fn tick_is_monotonic() {
        // CNTVCT_EL0 ticks at ~24 MHz on Apple Silicon → ~41 ns per tick, so
        // two back-to-back reads can land on the same tick. Sleeping 50 µs
        // (far longer than one tick) guarantees a non-zero delta.
        let a = Tick::now();
        sleep(Duration::from_micros(50));
        let b = Tick::now();
        assert!(b.elapsed_ns(a) > 0);
    }

    /// Passing the readings in the wrong order must clamp to zero, as the
    /// `elapsed_ns` docs promise, rather than panic or wrap.
    #[test]
    fn reversed_operands_saturate_to_zero() {
        let earlier = Tick::now();
        sleep(Duration::from_micros(50));
        let later = Tick::now();
        assert_eq!(earlier.elapsed_ns(later), 0);
    }

    /// The µs and ms helpers must be plain rescalings of `elapsed_ns`.
    #[test]
    fn elapsed_units_agree() {
        let a = Tick::now();
        sleep(Duration::from_millis(2));
        let b = Tick::now();
        let ns = b.elapsed_ns(a);
        // `sleep` never returns early; the slack below 2 ms tolerates a
        // counter whose real rate differs slightly from the reported one.
        assert!(ns >= 1_500_000, "expected >=1.5ms, got {ns}ns");
        assert!((b.elapsed_us(a) - ns as f64 / 1e3).abs() < 1e-6);
        assert!((b.elapsed_ms(a) - ns as f64 / 1e6).abs() < 1e-6);
    }
}
rlx_ir/measure.rs

rlx_ir/
measure.rs