trueno/brick/profiler/mod.rs
1//! BrickProfiler: Token-Centric Profiling System
2//!
3//! TILING-SPEC-001: Tile-Level Profiling Support
4//!
5//! This module provides hierarchical profiling for compute bricks:
6//! - Per-brick timing and throughput (PAR-073)
7//! - O(1) hot path with BrickId enum (PAR-200)
8//! - Tile-level profiling for cache-blocked operations (TILING-SPEC-001)
9//! - Kernel checksum capture for divergence detection (CORRECTNESS-011)
10
11mod checksum;
12mod tile_stats;
13
14#[cfg(test)]
15mod tests;
16
17mod divergence;
18mod exec_graph_ext;
19mod recording;
20mod reporting;
21mod tiling;
22
23pub use checksum::{fnv1a_f32_checksum, DivergenceInfo, KernelChecksum};
24pub use tile_stats::{TileLevel, TileStats, TileTimer};
25
26use std::time::Instant;
27
28use super::exec_graph::{
29 BrickCategory, BrickId, BrickStats, CategoryStats, ExecutionGraph, SyncMode,
30};
31
32/// Pending measurement for deferred sync mode.
33#[derive(Debug, Clone)]
34struct PendingMeasurement {
35 /// Brick ID (if known)
36 brick_id: Option<BrickId>,
37 /// Brick name (for dynamic bricks)
38 name: Option<String>,
39 /// Start time in nanoseconds (from Instant::now())
40 start_ns: u64,
41 /// Number of elements processed
42 elements: u64,
43}
44
45/// Per-brick profiler using pure Rust timing.
46///
47/// # Design (PAR-073, PAR-200)
48///
49/// - Uses `std::time::Instant` for timing (no CUDA event FFI)
50/// - PAR-200: O(1) hot path with `BrickId` enum + array storage
51/// - GPU operations require explicit sync before timing point
52/// - Supports deferred sync mode for low-overhead production profiling
53/// - Aggregates statistics per brick name
54///
55/// # Usage
56///
57/// ```rust,ignore
58/// use trueno::brick::{BrickProfiler, BrickId, SyncMode};
59///
60/// let mut profiler = BrickProfiler::new();
61/// profiler.enable();
62///
63/// // Fast path: use BrickId for known bricks (PAR-200)
64/// let timer = profiler.start_brick(BrickId::RmsNorm);
65/// // ... do work ...
66/// // For GPU: cuda_stream.synchronize() HERE
67/// profiler.stop_brick(timer, 1);
68///
69/// // Legacy path: string-based (slower, for unknown bricks)
70/// let timer = profiler.start("CustomBrick");
71/// profiler.stop(timer, 1);
72///
73/// // Deferred sync mode (production)
74/// profiler.set_sync_mode(SyncMode::Deferred);
75/// profiler.record_deferred(BrickId::RmsNorm, start_ns, 1);
76/// // ... more operations ...
77/// cuda_stream.synchronize();
78/// profiler.finalize(end_ns);
79///
80/// // Get statistics
81/// let stats = profiler.brick_stats(BrickId::RmsNorm);
82/// println!("RmsNorm avg: {:.2}µs", stats.avg_us());
83///
84/// // Get category breakdown
85/// let cats = profiler.category_stats();
86/// println!("Attention: {:.1}%", cats[BrickCategory::Attention as usize].percentage(profiler.total_ns()));
87/// ```
88#[derive(Debug)]
89pub struct BrickProfiler {
90 // PAR-200: Fast path - pre-allocated array for known bricks
91 /// Per-brick statistics for known BrickId types (O(1) lookup)
92 brick_stats: [BrickStats; BrickId::COUNT],
93
94 // Legacy path - HashMap for dynamic/unknown brick names
95 /// Per-brick statistics for unknown brick names (slower, O(1) amortized)
96 dynamic_stats: std::collections::HashMap<String, BrickStats>,
97
98 // PAR-200: Deferred sync support
99 /// Pending measurements awaiting GPU sync
100 pending: Vec<PendingMeasurement>,
101 /// Synchronization mode
102 sync_mode: SyncMode,
103 /// Reference instant for deferred timing
104 epoch: Instant,
105
106 /// Whether profiling is enabled
107 enabled: bool,
108 /// Total tokens processed
109 total_tokens: u64,
110 /// Total time (ns) across all bricks
111 total_ns: u64,
112 /// L2 cache hit rate (0.0-1.0) - v1.1.0 OBSERVE phase
113 l2_cache_hit_rate: Option<f32>,
114 /// Whether zero-copy memory transfers are enabled - v1.1.0 OBSERVE phase
115 is_zero_copy: bool,
116 /// CORRECTNESS-011: Per-kernel checksums for divergence detection
117 kernel_checksums: Vec<KernelChecksum>,
118
119 // PAR-201: Execution path graph
120 /// Whether execution graph tracking is enabled
121 graph_enabled: bool,
122 /// Execution path graph for PTX→kernel→brick relationships
123 execution_graph: ExecutionGraph,
124
125 // TILING-SPEC-001: Tile-level profiling
126 /// Per-level tile statistics (Macro, Midi, Micro)
127 tile_stats: [TileStats; 3],
128 /// Whether tile profiling is enabled
129 tile_profiling_enabled: bool,
130}
131
/// Handle for an in-flight measurement on the legacy string-based API.
///
/// Returned by `start()`; it carries the brick name together with the
/// instant at which timing began.
#[derive(Debug)]
pub struct BrickTimer {
    /// Name of the brick being timed.
    name: String,
    /// Instant captured when the measurement started.
    start: Instant,
}
140
141/// Timer handle returned by `start_brick()` (PAR-200 fast path).
142#[derive(Debug)]
143pub struct BrickIdTimer {
144 /// Brick ID
145 brick_id: BrickId,
146 /// Start time
147 start: Instant,
148}
149
150impl Default for BrickProfiler {
151 fn default() -> Self {
152 Self::new()
153 }
154}
155
156impl BrickProfiler {
157 /// Create a new profiler (disabled by default for zero overhead).
158 pub fn new() -> Self {
159 Self {
160 brick_stats: std::array::from_fn(|i| BrickStats::new(BrickId::ALL[i].name())),
161 dynamic_stats: std::collections::HashMap::new(),
162 pending: Vec::new(),
163 sync_mode: SyncMode::Deferred,
164 epoch: Instant::now(),
165 enabled: false,
166 total_tokens: 0,
167 total_ns: 0,
168 l2_cache_hit_rate: None,
169 is_zero_copy: false,
170 kernel_checksums: Vec::new(),
171 graph_enabled: false,
172 execution_graph: ExecutionGraph::new(),
173 tile_stats: [
174 TileStats::new(TileLevel::Macro),
175 TileStats::new(TileLevel::Midi),
176 TileStats::new(TileLevel::Micro),
177 ],
178 tile_profiling_enabled: false,
179 }
180 }
181
182 /// Create an enabled profiler.
183 pub fn enabled() -> Self {
184 let mut profiler = Self::new();
185 profiler.enabled = true;
186 profiler
187 }
188
189 // ========================================================================
190 // PAR-200: Sync Mode Configuration
191 // ========================================================================
192
193 /// Set the synchronization mode for GPU profiling.
194 ///
195 /// # Modes
196 /// - `Immediate`: Sync after each kernel (accurate but slow)
197 /// - `PerLayer`: Sync once per transformer layer
198 /// - `Deferred`: Sync once per forward pass (default, fast)
199 /// - `None`: No synchronization
200 pub fn set_sync_mode(&mut self, mode: SyncMode) {
201 contract_pre_sync_verification!();
202 self.sync_mode = mode;
203 }
204
205 /// Get the current synchronization mode.
206 #[must_use]
207 pub fn sync_mode(&self) -> SyncMode {
208 self.sync_mode
209 }
210
211 /// Reset the epoch for deferred timing.
212 /// Call this at the start of a forward pass.
213 pub fn reset_epoch(&mut self) {
214 self.epoch = Instant::now();
215 }
216
217 /// Get nanoseconds elapsed since epoch.
218 #[inline]
219 pub fn elapsed_ns(&self) -> u64 {
220 self.epoch.elapsed().as_nanos() as u64
221 }
222
223 // ========================================================================
224 // PAR-200: Fast Path API (BrickId-based)
225 // ========================================================================
226
227 /// Start timing a brick using BrickId (O(1) hot path).
228 ///
229 /// This is the preferred API for known brick types.
230 /// For GPU operations, call `stream.synchronize()` before `stop_brick()`.
231 #[inline]
232 #[must_use]
233 pub fn start_brick(&self, brick_id: BrickId) -> BrickIdTimer {
234 BrickIdTimer { brick_id, start: Instant::now() }
235 }
236
237 /// Stop timing and record the sample (O(1) hot path).
238 #[inline]
239 pub fn stop_brick(&mut self, timer: BrickIdTimer, elements: u64) {
240 if !self.enabled {
241 return;
242 }
243
244 let elapsed = timer.start.elapsed();
245 let elapsed_ns = elapsed.as_nanos() as u64;
246
247 // O(1) array access — CB-BUDGET: bounds-check brick_id
248 debug_assert!(
249 (timer.brick_id as usize) < self.brick_stats.len(),
250 "CB-BUDGET: brick_id {} out of bounds (max {})",
251 timer.brick_id as usize,
252 self.brick_stats.len()
253 );
254 let stats = &mut self.brick_stats[timer.brick_id as usize];
255 stats.add_sample(elapsed_ns, elements);
256
257 // Update totals
258 self.total_tokens += elements;
259 self.total_ns += elapsed_ns;
260 }
261
262 /// Get statistics for a known brick type (O(1)).
263 #[inline]
264 #[must_use]
265 pub fn brick_stats(&self, brick_id: BrickId) -> &BrickStats {
266 contract_pre_brick_ordering!();
267 &self.brick_stats[brick_id as usize]
268 }
269
270 /// Get mutable statistics for a known brick type (O(1)).
271 #[inline]
272 pub fn brick_stats_mut(&mut self, brick_id: BrickId) -> &mut BrickStats {
273 &mut self.brick_stats[brick_id as usize]
274 }
275
276 // ========================================================================
277 // PAR-200: Deferred Sync API
278 // ========================================================================
279
280 /// Record a measurement without GPU sync (deferred mode).
281 ///
282 /// Call `finalize()` after GPU sync to apply all pending measurements.
283 ///
284 /// # Arguments
285 /// - `brick_id`: The brick type
286 /// - `start_ns`: Start time (from `elapsed_ns()` at operation start)
287 /// - `elements`: Number of elements processed
288 #[inline]
289 pub fn record_deferred(&mut self, brick_id: BrickId, start_ns: u64, elements: u64) {
290 if !self.enabled {
291 return;
292 }
293 self.pending.push(PendingMeasurement {
294 brick_id: Some(brick_id),
295 name: None,
296 start_ns,
297 elements,
298 });
299 }
300
301 /// Record a measurement for a dynamic brick (deferred mode).
302 #[inline]
303 pub fn record_deferred_dynamic(&mut self, name: &str, start_ns: u64, elements: u64) {
304 if !self.enabled {
305 return;
306 }
307 self.pending.push(PendingMeasurement {
308 brick_id: BrickId::from_str(name),
309 name: Some(name.to_string()),
310 start_ns,
311 elements,
312 });
313 }
314
315 /// Finalize all pending measurements after GPU sync.
316 ///
317 /// Must be called after `stream.synchronize()` to get accurate timing.
318 ///
319 /// # Arguments
320 /// - `end_ns`: End time (from `elapsed_ns()` after sync)
321 pub fn finalize(&mut self, end_ns: u64) {
322 if self.pending.is_empty() {
323 return;
324 }
325
326 // Calculate elapsed time for each pending measurement
327 for m in self.pending.drain(..) {
328 let elapsed_ns = end_ns.saturating_sub(m.start_ns);
329
330 if let Some(brick_id) = m.brick_id {
331 // Fast path: known brick
332 let stats = &mut self.brick_stats[brick_id as usize];
333 stats.add_sample(elapsed_ns, m.elements);
334 } else if let Some(name) = m.name {
335 // Fallback path: dynamic brick lookup
336 let stats = self
337 .dynamic_stats
338 .entry(name.clone())
339 .or_insert_with(|| BrickStats::new(&name));
340 stats.add_sample(elapsed_ns, m.elements);
341 }
342
343 self.total_tokens += m.elements;
344 self.total_ns += elapsed_ns;
345 }
346 }
347
348 /// Check if there are pending measurements.
349 #[inline]
350 #[must_use]
351 pub fn has_pending(&self) -> bool {
352 !self.pending.is_empty()
353 }
354
355 /// Get number of pending measurements.
356 #[inline]
357 #[must_use]
358 pub fn pending_count(&self) -> usize {
359 self.pending.len()
360 }
361
362 // ========================================================================
363 // PAR-200: Category Aggregation
364 // ========================================================================
365
366 /// Get aggregated statistics by category.
367 ///
368 /// Returns an array indexed by `BrickCategory as usize`.
369 #[must_use]
370 pub fn category_stats(&self) -> [CategoryStats; BrickCategory::COUNT] {
371 let mut result = [CategoryStats::default(); BrickCategory::COUNT];
372
373 for (i, stats) in self.brick_stats.iter().enumerate() {
374 let brick_id = BrickId::ALL[i];
375 let cat = brick_id.category() as usize;
376 result[cat].total_ns += stats.total_ns;
377 result[cat].total_elements += stats.total_elements;
378 result[cat].count += stats.count;
379 }
380
381 // Include dynamic stats in "Other" category
382 for stats in self.dynamic_stats.values() {
383 let cat = BrickCategory::Other as usize;
384 result[cat].total_ns += stats.total_ns;
385 result[cat].total_elements += stats.total_elements;
386 result[cat].count += stats.count;
387 }
388
389 result
390 }
391
392 /// Set L2 cache hit rate (v1.1.0 OBSERVE phase)
393 pub fn set_l2_cache_hit_rate(&mut self, rate: f32) {
394 self.l2_cache_hit_rate = Some(rate.clamp(0.0, 1.0));
395 }
396
397 /// Get L2 cache hit rate
398 pub fn l2_cache_hit_rate(&self) -> Option<f32> {
399 self.l2_cache_hit_rate
400 }
401
402 /// Set zero-copy mode (v1.1.0 OBSERVE phase)
403 pub fn set_zero_copy(&mut self, enabled: bool) {
404 self.is_zero_copy = enabled;
405 }
406
407 /// Check if zero-copy is enabled
408 pub fn is_zero_copy(&self) -> bool {
409 self.is_zero_copy
410 }
411
412 /// Enable profiling.
413 pub fn enable(&mut self) {
414 self.enabled = true;
415 }
416
417 /// Disable profiling.
418 pub fn disable(&mut self) {
419 self.enabled = false;
420 }
421
422 /// Check if profiling is enabled.
423 #[must_use]
424 pub fn is_enabled(&self) -> bool {
425 self.enabled
426 }
427
428 /// Get total throughput across all bricks.
429 #[must_use]
430 pub fn total_throughput(&self) -> f64 {
431 if self.total_ns == 0 {
432 0.0
433 } else {
434 self.total_tokens as f64 / (self.total_ns as f64 / 1_000_000_000.0)
435 }
436 }
437
438 /// Get total tokens processed.
439 #[must_use]
440 pub fn total_tokens(&self) -> u64 {
441 contract_pre_token_accounting!();
442 self.total_tokens
443 }
444
445 /// Get total time in nanoseconds.
446 #[must_use]
447 pub fn total_ns(&self) -> u64 {
448 contract_pre_wall_coverage!();
449 self.total_ns
450 }
451}