ferrum_kernels/backend/timer.rs
1//! Cross-backend GPU-side timer trait — PLAYBOOK § Phase 1.1.
2//!
3//! Replaces the `Instant::now()` calls inside `FERRUM_*_PROF` probes
4//! (`crates/ferrum-models/src/moe/forward.rs`, `qwen3_moe.rs`, etc.).
5//! Those measure CPU-side dispatch + queue depth — they DON'T see GPU
6//! execution time, so the "per-op µs" they report has been misleading
7//! all the perf debugging this session has built on top of.
8//!
9//! ## Backend behaviour
10//!
11//! - **CUDA** (`cudarc::driver::sys` events) — `cuEventRecord` is
12//! asynchronous on the stream; `elapsed_ms()` calls `cuEventSynchronize`
13//! + `cuEventElapsedTime`. Overhead per scope: ~5µs (event create +
14//! record × 2 + sync at read). Accuracy: ±0.5µs.
15//!
16//! - **Metal** — Metal's `MTLCommandBuffer` exposes `gpuStartTime` /
17//! `gpuEndTime` per command buffer. For sub-command-buffer scope we
18//! wrap the section in an explicit `sync()` boundary. This adds
19//! command-buffer commit overhead (~50-100µs) but gives accurate
20//! on-GPU timing. **Caveat**: on Metal the sync-wrap inflates each
21//! timed scope's CPU side; use sparingly.
22//!
23//! - **CPU** — `Instant`. (CPU is the "GPU" here — wall-clock is correct.)
24//!
25//! ## Usage
26//!
27//! ```ignore
28//! use ferrum_kernels::backend::timer::BackendTimer;
29//!
30//! let mut timer = <B as Backend>::Timer::new();
31//! timer.record_start(&mut ctx);
32//! Backend::rms_norm(&mut ctx, &x, &w, eps, &mut out, tokens, dim);
33//! timer.record_end(&mut ctx);
34//! let us = timer.elapsed_ms() * 1000.0;
35//! tracing::info!("rms_norm: {us:.1} us");
36//! ```
37//!
38//! Hot loops should reuse a single `Timer` instance across scopes via
39//! `record_start` / `record_end` — `new()` allocates events on CUDA.
40
41use crate::backend::Backend;
42
43/// Start a timer iff `enabled` is true — `None` is the disabled state.
44/// Pair with [`finish_probe_timer`] at the end of the scope. The env/config
45/// gate is intentionally resolved by the caller so hot probes do not read
46/// process env while a token/layer loop is running.
47pub fn start_probe_timer_if<B: Backend>(enabled: bool, ctx: &mut B::Context) -> Option<B::Timer> {
48 if enabled {
49 let mut t = B::make_timer();
50 t.record_start(ctx);
51 Some(t)
52 } else {
53 None
54 }
55}
56
57/// Close a timer started by [`start_probe_timer`] and return the
58/// elapsed microseconds. `None` propagates the "disabled" state so the
59/// caller can keep the `if let Some(us) = ... { record(us) }` pattern.
60pub fn finish_probe_timer<B: Backend>(
61 timer: Option<B::Timer>,
62 ctx: &mut B::Context,
63) -> Option<u64> {
64 let mut t = timer?;
65 t.record_end(ctx);
66 Some((t.elapsed_ms() * 1000.0) as u64)
67}
68
69/// Convenience wrapper: close a timer AND push a chrome-trace event in
70/// one call. When `FERRUM_TRACE_OUT` is unset, the trace push is a
71/// no-op (cheap atomic check inside [`global_trace`]).
72///
73/// PLAYBOOK § 1.5 — Phase 4 `visualize_layerwise.py` reads chrome-trace
74/// JSON populated by these probe sites.
75pub fn finish_probe_timer_traced<B: Backend>(
76 timer: Option<B::Timer>,
77 ctx: &mut B::Context,
78 name: &str,
79 cat: &str,
80 tid: u32,
81) -> Option<u64> {
82 let us = finish_probe_timer::<B>(timer, ctx)?;
83 ferrum_bench_core::trace::global_trace().push(name, cat, (us as f64) / 1000.0, tid);
84 Some(us)
85}
86
87/// GPU-side timer scoped to a single Backend context.
88pub trait BackendTimer<B: Backend>: Send {
89 /// Allocate timer state. On CUDA this creates two `cuEvent_t`
90 /// handles; on Metal it's a no-op; on CPU it's two `Option<Instant>`.
91 fn new() -> Self
92 where
93 Self: Sized;
94
95 /// Record the "start" timestamp on the current ctx's stream/command
96 /// buffer. Returns immediately on CUDA (async); on Metal forces a
97 /// sync to flush any pending work first.
98 fn record_start(&mut self, ctx: &mut B::Context);
99
100 /// Record the "end" timestamp.
101 fn record_end(&mut self, ctx: &mut B::Context);
102
103 /// Synchronize on the recorded events and return the elapsed time
104 /// in milliseconds. Blocks the calling thread on CUDA; instant on CPU/Metal.
105 ///
106 /// Calling `elapsed_ms` before both record_start + record_end have
107 /// fired returns `0.0`.
108 fn elapsed_ms(&self) -> f64;
109}
110
111// ─────────────────────────────────────────────────────────────────────
112// CPU implementation
113// ─────────────────────────────────────────────────────────────────────
114
115/// CPU timer — wall-clock via `Instant`. There's no GPU to wait on,
116/// so the "GPU time" is just the CPU work duration.
117pub struct CpuTimer {
118 start: Option<std::time::Instant>,
119 end: Option<std::time::Instant>,
120}
121
122impl Default for CpuTimer {
123 fn default() -> Self {
124 Self::new()
125 }
126}
127
128impl CpuTimer {
129 pub fn new() -> Self {
130 Self {
131 start: None,
132 end: None,
133 }
134 }
135}
136
137impl BackendTimer<crate::backend::cpu::CpuBackend> for CpuTimer {
138 fn new() -> Self {
139 CpuTimer::new()
140 }
141
142 fn record_start(&mut self, _ctx: &mut <crate::backend::cpu::CpuBackend as Backend>::Context) {
143 self.start = Some(std::time::Instant::now());
144 }
145
146 fn record_end(&mut self, _ctx: &mut <crate::backend::cpu::CpuBackend as Backend>::Context) {
147 self.end = Some(std::time::Instant::now());
148 }
149
150 fn elapsed_ms(&self) -> f64 {
151 match (self.start, self.end) {
152 (Some(s), Some(e)) => e.duration_since(s).as_secs_f64() * 1000.0,
153 _ => 0.0,
154 }
155 }
156}
157
158// ─────────────────────────────────────────────────────────────────────
159// Metal implementation
160// ─────────────────────────────────────────────────────────────────────
161//
162// Metal exposes per-command-buffer `gpuStartTime`/`gpuEndTime`. To time
163// a sub-CB scope we explicitly `sync()` to commit the current CB,
164// record `Instant::now()` between syncs. The sync forces a flush+wait
165// so the wall-clock delta IS the GPU time — same property as CUDA events,
166// just paid with extra commits.
167//
168// Future improvement: use `MTLCounterSampleBuffer` with the timestamp
169// counter set when the backend's pipeline state is configured to sample
170// counters — would avoid the sync overhead. Not yet wired.
171
172#[cfg(all(target_os = "macos", feature = "metal"))]
173pub struct MetalTimer {
174 start: Option<std::time::Instant>,
175 end: Option<std::time::Instant>,
176}
177
178#[cfg(all(target_os = "macos", feature = "metal"))]
179impl Default for MetalTimer {
180 fn default() -> Self {
181 Self::new()
182 }
183}
184
185#[cfg(all(target_os = "macos", feature = "metal"))]
186impl MetalTimer {
187 pub fn new() -> Self {
188 Self {
189 start: None,
190 end: None,
191 }
192 }
193}
194
195#[cfg(all(target_os = "macos", feature = "metal"))]
196impl BackendTimer<crate::backend::metal::MetalBackend> for MetalTimer {
197 fn new() -> Self {
198 MetalTimer::new()
199 }
200
201 fn record_start(
202 &mut self,
203 ctx: &mut <crate::backend::metal::MetalBackend as Backend>::Context,
204 ) {
205 // Force any pending work to drain before we anchor the clock.
206 crate::backend::metal::MetalBackend::sync(ctx);
207 self.start = Some(std::time::Instant::now());
208 }
209
210 fn record_end(&mut self, ctx: &mut <crate::backend::metal::MetalBackend as Backend>::Context) {
211 // Sync so the wall-clock delta is bounded by actual GPU completion.
212 crate::backend::metal::MetalBackend::sync(ctx);
213 self.end = Some(std::time::Instant::now());
214 }
215
216 fn elapsed_ms(&self) -> f64 {
217 match (self.start, self.end) {
218 (Some(s), Some(e)) => e.duration_since(s).as_secs_f64() * 1000.0,
219 _ => 0.0,
220 }
221 }
222}
223
224// ─────────────────────────────────────────────────────────────────────
225// CUDA implementation
226// ─────────────────────────────────────────────────────────────────────
227//
228// Uses cudarc's raw event API — same one quant.rs:889 already uses for
229// Marlin split-K cross-stream coordination.
230
231#[cfg(feature = "cuda")]
232pub struct CudaTimer {
233 start: Option<cudarc::driver::sys::CUevent>,
234 end: Option<cudarc::driver::sys::CUevent>,
235 recorded_start: bool,
236 recorded_end: bool,
237}
238
239// `CUevent` is `*mut c_void` (a CUDA driver handle). The CUDA driver API
240// is documented as thread-safe for these handles; the trait bound
241// `BackendTimer<B>: Send` requires it.
242#[cfg(feature = "cuda")]
243unsafe impl Send for CudaTimer {}
244
245#[cfg(feature = "cuda")]
246impl Default for CudaTimer {
247 fn default() -> Self {
248 Self::new()
249 }
250}
251
252#[cfg(feature = "cuda")]
253impl Drop for CudaTimer {
254 fn drop(&mut self) {
255 // Best-effort destroy — ignore errors during drop.
256 use cudarc::driver::sys as cu;
257 unsafe {
258 if let Some(e) = self.start.take() {
259 let _ = cu::cuEventDestroy_v2(e);
260 }
261 if let Some(e) = self.end.take() {
262 let _ = cu::cuEventDestroy_v2(e);
263 }
264 }
265 }
266}
267
268#[cfg(feature = "cuda")]
269impl CudaTimer {
270 pub fn new() -> Self {
271 use cudarc::driver::sys as cu;
272 let mut start: cu::CUevent = std::ptr::null_mut();
273 let mut end: cu::CUevent = std::ptr::null_mut();
274 unsafe {
275 // Flag 0 = default (with timing). We don't disable timing
276 // (cuEventDisableTiming) since elapsed_ms needs it.
277 let _ = cu::cuEventCreate(&mut start, 0);
278 let _ = cu::cuEventCreate(&mut end, 0);
279 }
280 Self {
281 start: Some(start),
282 end: Some(end),
283 recorded_start: false,
284 recorded_end: false,
285 }
286 }
287}
288
289#[cfg(feature = "cuda")]
290impl BackendTimer<crate::backend::cuda::CudaBackend> for CudaTimer {
291 fn new() -> Self {
292 CudaTimer::new()
293 }
294
295 fn record_start(&mut self, ctx: &mut <crate::backend::cuda::CudaBackend as Backend>::Context) {
296 use cudarc::driver::sys as cu;
297 if let Some(evt) = self.start {
298 unsafe {
299 let _ = cu::cuEventRecord(evt, ctx.stream.cu_stream());
300 }
301 self.recorded_start = true;
302 }
303 }
304
305 fn record_end(&mut self, ctx: &mut <crate::backend::cuda::CudaBackend as Backend>::Context) {
306 use cudarc::driver::sys as cu;
307 if let Some(evt) = self.end {
308 unsafe {
309 let _ = cu::cuEventRecord(evt, ctx.stream.cu_stream());
310 }
311 self.recorded_end = true;
312 }
313 }
314
315 fn elapsed_ms(&self) -> f64 {
316 if !self.recorded_start || !self.recorded_end {
317 return 0.0;
318 }
319 use cudarc::driver::sys as cu;
320 let (Some(s), Some(e)) = (self.start, self.end) else {
321 return 0.0;
322 };
323 unsafe {
324 // cuEventSynchronize blocks until the event is observed
325 // on the stream. Required before reading elapsed.
326 let _ = cu::cuEventSynchronize(e);
327 // Safe wrapper dispatches cuEventElapsedTime vs _v2 based on
328 // cudarc's auto-detected CUDA version (cuda-version-from-
329 // build-system feature). The two are only visible under
330 // CUDA 12.x and 13.x respectively — calling either directly
331 // is non-portable.
332 cudarc::driver::result::event::elapsed(s, e)
333 .ok()
334 .map(|ms| ms as f64)
335 .unwrap_or(0.0)
336 }
337 }
338}
339
340#[cfg(test)]
341mod tests {
342 use super::*;
343
344 #[test]
345 fn cpu_timer_basic() {
346 let mut t = CpuTimer::new();
347 assert_eq!(t.elapsed_ms(), 0.0);
348 // Construct a CpuBackend Context — which is unit.
349 let mut ctx: <crate::backend::cpu::CpuBackend as Backend>::Context = ();
350 BackendTimer::<crate::backend::cpu::CpuBackend>::record_start(&mut t, &mut ctx);
351 std::thread::sleep(std::time::Duration::from_millis(2));
352 BackendTimer::<crate::backend::cpu::CpuBackend>::record_end(&mut t, &mut ctx);
353 let ms = BackendTimer::<crate::backend::cpu::CpuBackend>::elapsed_ms(&t);
354 assert!(ms >= 2.0 && ms < 50.0, "elapsed_ms = {ms}");
355 }
356
357 #[test]
358 fn cpu_timer_returns_zero_if_unrecorded() {
359 let t = CpuTimer::new();
360 let ms = BackendTimer::<crate::backend::cpu::CpuBackend>::elapsed_ms(&t);
361 assert_eq!(ms, 0.0);
362 }
363}