ferrum_kernels/backend/
timer.rs

1//! Cross-backend GPU-side timer trait — PLAYBOOK § Phase 1.1.
2//!
3//! Replaces the `Instant::now()` calls inside `FERRUM_*_PROF` probes
4//! (`crates/ferrum-models/src/moe/forward.rs`, `qwen3_moe.rs`, etc.).
5//! Those measure CPU-side dispatch + queue depth — they DON'T see GPU
6//! execution time, so the "per-op µs" they report has been misleading
7//! all the perf debugging this session has built on top of.
8//!
9//! ## Backend behaviour
10//!
11//! - **CUDA** (`cudarc::driver::sys` events) — `cuEventRecord` is
12//!   asynchronous on the stream; `elapsed_ms()` calls `cuEventSynchronize`
13//!   + `cuEventElapsedTime`. Overhead per scope: ~5µs (event create +
14//!   record × 2 + sync at read). Accuracy: ±0.5µs.
15//!
16//! - **Metal** — Metal's `MTLCommandBuffer` exposes `gpuStartTime` /
17//!   `gpuEndTime` per command buffer. For sub-command-buffer scope we
18//!   wrap the section in an explicit `sync()` boundary. This adds
19//!   command-buffer commit overhead (~50-100µs) but gives accurate
20//!   on-GPU timing. **Caveat**: on Metal the sync-wrap inflates each
21//!   timed scope's CPU side; use sparingly.
22//!
23//! - **CPU** — `Instant`. (CPU is the "GPU" here — wall-clock is correct.)
24//!
25//! ## Usage
26//!
27//! ```ignore
28//! use ferrum_kernels::backend::timer::BackendTimer;
29//!
30//! let mut timer = <B as Backend>::Timer::new();
31//! timer.record_start(&mut ctx);
32//! Backend::rms_norm(&mut ctx, &x, &w, eps, &mut out, tokens, dim);
33//! timer.record_end(&mut ctx);
34//! let us = timer.elapsed_ms() * 1000.0;
35//! tracing::info!("rms_norm: {us:.1} us");
36//! ```
37//!
38//! Hot loops should reuse a single `Timer` instance across scopes via
39//! `record_start` / `record_end` — `new()` allocates events on CUDA.
40
41use crate::backend::Backend;
42
43/// Start a timer iff `enabled` is true — `None` is the disabled state.
44/// Pair with [`finish_probe_timer`] at the end of the scope. The env/config
45/// gate is intentionally resolved by the caller so hot probes do not read
46/// process env while a token/layer loop is running.
47pub fn start_probe_timer_if<B: Backend>(enabled: bool, ctx: &mut B::Context) -> Option<B::Timer> {
48    if enabled {
49        let mut t = B::make_timer();
50        t.record_start(ctx);
51        Some(t)
52    } else {
53        None
54    }
55}
56
57/// Close a timer started by [`start_probe_timer`] and return the
58/// elapsed microseconds. `None` propagates the "disabled" state so the
59/// caller can keep the `if let Some(us) = ... { record(us) }` pattern.
60pub fn finish_probe_timer<B: Backend>(
61    timer: Option<B::Timer>,
62    ctx: &mut B::Context,
63) -> Option<u64> {
64    let mut t = timer?;
65    t.record_end(ctx);
66    Some((t.elapsed_ms() * 1000.0) as u64)
67}
68
69/// Convenience wrapper: close a timer AND push a chrome-trace event in
70/// one call. When `FERRUM_TRACE_OUT` is unset, the trace push is a
71/// no-op (cheap atomic check inside [`global_trace`]).
72///
73/// PLAYBOOK § 1.5 — Phase 4 `visualize_layerwise.py` reads chrome-trace
74/// JSON populated by these probe sites.
75pub fn finish_probe_timer_traced<B: Backend>(
76    timer: Option<B::Timer>,
77    ctx: &mut B::Context,
78    name: &str,
79    cat: &str,
80    tid: u32,
81) -> Option<u64> {
82    let us = finish_probe_timer::<B>(timer, ctx)?;
83    ferrum_bench_core::trace::global_trace().push(name, cat, (us as f64) / 1000.0, tid);
84    Some(us)
85}
86
87/// GPU-side timer scoped to a single Backend context.
88pub trait BackendTimer<B: Backend>: Send {
89    /// Allocate timer state. On CUDA this creates two `cuEvent_t`
90    /// handles; on Metal it's a no-op; on CPU it's two `Option<Instant>`.
91    fn new() -> Self
92    where
93        Self: Sized;
94
95    /// Record the "start" timestamp on the current ctx's stream/command
96    /// buffer. Returns immediately on CUDA (async); on Metal forces a
97    /// sync to flush any pending work first.
98    fn record_start(&mut self, ctx: &mut B::Context);
99
100    /// Record the "end" timestamp.
101    fn record_end(&mut self, ctx: &mut B::Context);
102
103    /// Synchronize on the recorded events and return the elapsed time
104    /// in milliseconds. Blocks the calling thread on CUDA; instant on CPU/Metal.
105    ///
106    /// Calling `elapsed_ms` before both record_start + record_end have
107    /// fired returns `0.0`.
108    fn elapsed_ms(&self) -> f64;
109}
110
111// ─────────────────────────────────────────────────────────────────────
112// CPU implementation
113// ─────────────────────────────────────────────────────────────────────
114
115/// CPU timer — wall-clock via `Instant`. There's no GPU to wait on,
116/// so the "GPU time" is just the CPU work duration.
117pub struct CpuTimer {
118    start: Option<std::time::Instant>,
119    end: Option<std::time::Instant>,
120}
121
122impl Default for CpuTimer {
123    fn default() -> Self {
124        Self::new()
125    }
126}
127
128impl CpuTimer {
129    pub fn new() -> Self {
130        Self {
131            start: None,
132            end: None,
133        }
134    }
135}
136
137impl BackendTimer<crate::backend::cpu::CpuBackend> for CpuTimer {
138    fn new() -> Self {
139        CpuTimer::new()
140    }
141
142    fn record_start(&mut self, _ctx: &mut <crate::backend::cpu::CpuBackend as Backend>::Context) {
143        self.start = Some(std::time::Instant::now());
144    }
145
146    fn record_end(&mut self, _ctx: &mut <crate::backend::cpu::CpuBackend as Backend>::Context) {
147        self.end = Some(std::time::Instant::now());
148    }
149
150    fn elapsed_ms(&self) -> f64 {
151        match (self.start, self.end) {
152            (Some(s), Some(e)) => e.duration_since(s).as_secs_f64() * 1000.0,
153            _ => 0.0,
154        }
155    }
156}
157
158// ─────────────────────────────────────────────────────────────────────
159// Metal implementation
160// ─────────────────────────────────────────────────────────────────────
161//
162// Metal exposes per-command-buffer `gpuStartTime`/`gpuEndTime`. To time
163// a sub-CB scope we explicitly `sync()` to commit the current CB,
164// record `Instant::now()` between syncs. The sync forces a flush+wait
165// so the wall-clock delta IS the GPU time — same property as CUDA events,
166// just paid with extra commits.
167//
168// Future improvement: use `MTLCounterSampleBuffer` with the timestamp
169// counter set when the backend's pipeline state is configured to sample
170// counters — would avoid the sync overhead. Not yet wired.
171
172#[cfg(all(target_os = "macos", feature = "metal"))]
173pub struct MetalTimer {
174    start: Option<std::time::Instant>,
175    end: Option<std::time::Instant>,
176}
177
178#[cfg(all(target_os = "macos", feature = "metal"))]
179impl Default for MetalTimer {
180    fn default() -> Self {
181        Self::new()
182    }
183}
184
185#[cfg(all(target_os = "macos", feature = "metal"))]
186impl MetalTimer {
187    pub fn new() -> Self {
188        Self {
189            start: None,
190            end: None,
191        }
192    }
193}
194
195#[cfg(all(target_os = "macos", feature = "metal"))]
196impl BackendTimer<crate::backend::metal::MetalBackend> for MetalTimer {
197    fn new() -> Self {
198        MetalTimer::new()
199    }
200
201    fn record_start(
202        &mut self,
203        ctx: &mut <crate::backend::metal::MetalBackend as Backend>::Context,
204    ) {
205        // Force any pending work to drain before we anchor the clock.
206        crate::backend::metal::MetalBackend::sync(ctx);
207        self.start = Some(std::time::Instant::now());
208    }
209
210    fn record_end(&mut self, ctx: &mut <crate::backend::metal::MetalBackend as Backend>::Context) {
211        // Sync so the wall-clock delta is bounded by actual GPU completion.
212        crate::backend::metal::MetalBackend::sync(ctx);
213        self.end = Some(std::time::Instant::now());
214    }
215
216    fn elapsed_ms(&self) -> f64 {
217        match (self.start, self.end) {
218            (Some(s), Some(e)) => e.duration_since(s).as_secs_f64() * 1000.0,
219            _ => 0.0,
220        }
221    }
222}
223
224// ─────────────────────────────────────────────────────────────────────
225// CUDA implementation
226// ─────────────────────────────────────────────────────────────────────
227//
228// Uses cudarc's raw event API — same one quant.rs:889 already uses for
229// Marlin split-K cross-stream coordination.
230
231#[cfg(feature = "cuda")]
232pub struct CudaTimer {
233    start: Option<cudarc::driver::sys::CUevent>,
234    end: Option<cudarc::driver::sys::CUevent>,
235    recorded_start: bool,
236    recorded_end: bool,
237}
238
239// `CUevent` is `*mut c_void` (a CUDA driver handle). The CUDA driver API
240// is documented as thread-safe for these handles; the trait bound
241// `BackendTimer<B>: Send` requires it.
242#[cfg(feature = "cuda")]
243unsafe impl Send for CudaTimer {}
244
245#[cfg(feature = "cuda")]
246impl Default for CudaTimer {
247    fn default() -> Self {
248        Self::new()
249    }
250}
251
252#[cfg(feature = "cuda")]
253impl Drop for CudaTimer {
254    fn drop(&mut self) {
255        // Best-effort destroy — ignore errors during drop.
256        use cudarc::driver::sys as cu;
257        unsafe {
258            if let Some(e) = self.start.take() {
259                let _ = cu::cuEventDestroy_v2(e);
260            }
261            if let Some(e) = self.end.take() {
262                let _ = cu::cuEventDestroy_v2(e);
263            }
264        }
265    }
266}
267
268#[cfg(feature = "cuda")]
269impl CudaTimer {
270    pub fn new() -> Self {
271        use cudarc::driver::sys as cu;
272        let mut start: cu::CUevent = std::ptr::null_mut();
273        let mut end: cu::CUevent = std::ptr::null_mut();
274        unsafe {
275            // Flag 0 = default (with timing). We don't disable timing
276            // (cuEventDisableTiming) since elapsed_ms needs it.
277            let _ = cu::cuEventCreate(&mut start, 0);
278            let _ = cu::cuEventCreate(&mut end, 0);
279        }
280        Self {
281            start: Some(start),
282            end: Some(end),
283            recorded_start: false,
284            recorded_end: false,
285        }
286    }
287}
288
289#[cfg(feature = "cuda")]
290impl BackendTimer<crate::backend::cuda::CudaBackend> for CudaTimer {
291    fn new() -> Self {
292        CudaTimer::new()
293    }
294
295    fn record_start(&mut self, ctx: &mut <crate::backend::cuda::CudaBackend as Backend>::Context) {
296        use cudarc::driver::sys as cu;
297        if let Some(evt) = self.start {
298            unsafe {
299                let _ = cu::cuEventRecord(evt, ctx.stream.cu_stream());
300            }
301            self.recorded_start = true;
302        }
303    }
304
305    fn record_end(&mut self, ctx: &mut <crate::backend::cuda::CudaBackend as Backend>::Context) {
306        use cudarc::driver::sys as cu;
307        if let Some(evt) = self.end {
308            unsafe {
309                let _ = cu::cuEventRecord(evt, ctx.stream.cu_stream());
310            }
311            self.recorded_end = true;
312        }
313    }
314
315    fn elapsed_ms(&self) -> f64 {
316        if !self.recorded_start || !self.recorded_end {
317            return 0.0;
318        }
319        use cudarc::driver::sys as cu;
320        let (Some(s), Some(e)) = (self.start, self.end) else {
321            return 0.0;
322        };
323        unsafe {
324            // cuEventSynchronize blocks until the event is observed
325            // on the stream. Required before reading elapsed.
326            let _ = cu::cuEventSynchronize(e);
327            // Safe wrapper dispatches cuEventElapsedTime vs _v2 based on
328            // cudarc's auto-detected CUDA version (cuda-version-from-
329            // build-system feature). The two are only visible under
330            // CUDA 12.x and 13.x respectively — calling either directly
331            // is non-portable.
332            cudarc::driver::result::event::elapsed(s, e)
333                .ok()
334                .map(|ms| ms as f64)
335                .unwrap_or(0.0)
336        }
337    }
338}
339
340#[cfg(test)]
341mod tests {
342    use super::*;
343
344    #[test]
345    fn cpu_timer_basic() {
346        let mut t = CpuTimer::new();
347        assert_eq!(t.elapsed_ms(), 0.0);
348        // Construct a CpuBackend Context — which is unit.
349        let mut ctx: <crate::backend::cpu::CpuBackend as Backend>::Context = ();
350        BackendTimer::<crate::backend::cpu::CpuBackend>::record_start(&mut t, &mut ctx);
351        std::thread::sleep(std::time::Duration::from_millis(2));
352        BackendTimer::<crate::backend::cpu::CpuBackend>::record_end(&mut t, &mut ctx);
353        let ms = BackendTimer::<crate::backend::cpu::CpuBackend>::elapsed_ms(&t);
354        assert!(ms >= 2.0 && ms < 50.0, "elapsed_ms = {ms}");
355    }
356
357    #[test]
358    fn cpu_timer_returns_zero_if_unrecorded() {
359        let t = CpuTimer::new();
360        let ms = BackendTimer::<crate::backend::cpu::CpuBackend>::elapsed_ms(&t);
361        assert_eq!(ms, 0.0);
362    }
363}
ferrum_kernels/backend/timer.rs

ferrum_kernels/backend/
timer.rs