Skip to main content

trueno/brick/profiler/
mod.rs

1//! BrickProfiler: Token-Centric Profiling System
2//!
3//! TILING-SPEC-001: Tile-Level Profiling Support
4//!
5//! This module provides hierarchical profiling for compute bricks:
6//! - Per-brick timing and throughput (PAR-073)
7//! - O(1) hot path with BrickId enum (PAR-200)
8//! - Tile-level profiling for cache-blocked operations (TILING-SPEC-001)
9//! - Kernel checksum capture for divergence detection (CORRECTNESS-011)
10
11mod checksum;
12mod tile_stats;
13
14#[cfg(test)]
15mod tests;
16
17mod divergence;
18mod exec_graph_ext;
19mod recording;
20mod reporting;
21mod tiling;
22
23pub use checksum::{fnv1a_f32_checksum, DivergenceInfo, KernelChecksum};
24pub use tile_stats::{TileLevel, TileStats, TileTimer};
25
26use std::time::Instant;
27
28use super::exec_graph::{
29    BrickCategory, BrickId, BrickStats, CategoryStats, ExecutionGraph, SyncMode,
30};
31
32/// Pending measurement for deferred sync mode.
33#[derive(Debug, Clone)]
34struct PendingMeasurement {
35    /// Brick ID (if known)
36    brick_id: Option<BrickId>,
37    /// Brick name (for dynamic bricks)
38    name: Option<String>,
39    /// Start time in nanoseconds (from Instant::now())
40    start_ns: u64,
41    /// Number of elements processed
42    elements: u64,
43}
44
45/// Per-brick profiler using pure Rust timing.
46///
47/// # Design (PAR-073, PAR-200)
48///
49/// - Uses `std::time::Instant` for timing (no CUDA event FFI)
50/// - PAR-200: O(1) hot path with `BrickId` enum + array storage
51/// - GPU operations require explicit sync before timing point
52/// - Supports deferred sync mode for low-overhead production profiling
53/// - Aggregates statistics per brick name
54///
55/// # Usage
56///
57/// ```rust,ignore
58/// use trueno::brick::{BrickProfiler, BrickId, SyncMode};
59///
60/// let mut profiler = BrickProfiler::new();
61/// profiler.enable();
62///
63/// // Fast path: use BrickId for known bricks (PAR-200)
64/// let timer = profiler.start_brick(BrickId::RmsNorm);
65/// // ... do work ...
66/// // For GPU: cuda_stream.synchronize() HERE
67/// profiler.stop_brick(timer, 1);
68///
69/// // Legacy path: string-based (slower, for unknown bricks)
70/// let timer = profiler.start("CustomBrick");
71/// profiler.stop(timer, 1);
72///
73/// // Deferred sync mode (production)
74/// profiler.set_sync_mode(SyncMode::Deferred);
75/// profiler.record_deferred(BrickId::RmsNorm, start_ns, 1);
76/// // ... more operations ...
77/// cuda_stream.synchronize();
78/// profiler.finalize(end_ns);
79///
80/// // Get statistics
81/// let stats = profiler.brick_stats(BrickId::RmsNorm);
82/// println!("RmsNorm avg: {:.2}µs", stats.avg_us());
83///
84/// // Get category breakdown
85/// let cats = profiler.category_stats();
86/// println!("Attention: {:.1}%", cats[BrickCategory::Attention as usize].percentage(profiler.total_ns()));
87/// ```
88#[derive(Debug)]
89pub struct BrickProfiler {
90    // PAR-200: Fast path - pre-allocated array for known bricks
91    /// Per-brick statistics for known BrickId types (O(1) lookup)
92    brick_stats: [BrickStats; BrickId::COUNT],
93
94    // Legacy path - HashMap for dynamic/unknown brick names
95    /// Per-brick statistics for unknown brick names (slower, O(1) amortized)
96    dynamic_stats: std::collections::HashMap<String, BrickStats>,
97
98    // PAR-200: Deferred sync support
99    /// Pending measurements awaiting GPU sync
100    pending: Vec<PendingMeasurement>,
101    /// Synchronization mode
102    sync_mode: SyncMode,
103    /// Reference instant for deferred timing
104    epoch: Instant,
105
106    /// Whether profiling is enabled
107    enabled: bool,
108    /// Total tokens processed
109    total_tokens: u64,
110    /// Total time (ns) across all bricks
111    total_ns: u64,
112    /// L2 cache hit rate (0.0-1.0) - v1.1.0 OBSERVE phase
113    l2_cache_hit_rate: Option<f32>,
114    /// Whether zero-copy memory transfers are enabled - v1.1.0 OBSERVE phase
115    is_zero_copy: bool,
116    /// CORRECTNESS-011: Per-kernel checksums for divergence detection
117    kernel_checksums: Vec<KernelChecksum>,
118
119    // PAR-201: Execution path graph
120    /// Whether execution graph tracking is enabled
121    graph_enabled: bool,
122    /// Execution path graph for PTX→kernel→brick relationships
123    execution_graph: ExecutionGraph,
124
125    // TILING-SPEC-001: Tile-level profiling
126    /// Per-level tile statistics (Macro, Midi, Micro)
127    tile_stats: [TileStats; 3],
128    /// Whether tile profiling is enabled
129    tile_profiling_enabled: bool,
130}
131
/// Handle produced by `start()`, the legacy name-based timing API.
///
/// It carries the brick name together with the instant timing began.
#[derive(Debug)]
pub struct BrickTimer {
    /// Name of the brick being timed
    name: String,
    /// Instant at which timing began
    start: Instant,
}
140
141/// Timer handle returned by `start_brick()` (PAR-200 fast path).
142#[derive(Debug)]
143pub struct BrickIdTimer {
144    /// Brick ID
145    brick_id: BrickId,
146    /// Start time
147    start: Instant,
148}
149
150impl Default for BrickProfiler {
151    fn default() -> Self {
152        Self::new()
153    }
154}
155
156impl BrickProfiler {
157    /// Create a new profiler (disabled by default for zero overhead).
158    pub fn new() -> Self {
159        Self {
160            brick_stats: std::array::from_fn(|i| BrickStats::new(BrickId::ALL[i].name())),
161            dynamic_stats: std::collections::HashMap::new(),
162            pending: Vec::new(),
163            sync_mode: SyncMode::Deferred,
164            epoch: Instant::now(),
165            enabled: false,
166            total_tokens: 0,
167            total_ns: 0,
168            l2_cache_hit_rate: None,
169            is_zero_copy: false,
170            kernel_checksums: Vec::new(),
171            graph_enabled: false,
172            execution_graph: ExecutionGraph::new(),
173            tile_stats: [
174                TileStats::new(TileLevel::Macro),
175                TileStats::new(TileLevel::Midi),
176                TileStats::new(TileLevel::Micro),
177            ],
178            tile_profiling_enabled: false,
179        }
180    }
181
182    /// Create an enabled profiler.
183    pub fn enabled() -> Self {
184        let mut profiler = Self::new();
185        profiler.enabled = true;
186        profiler
187    }
188
189    // ========================================================================
190    // PAR-200: Sync Mode Configuration
191    // ========================================================================
192
193    /// Set the synchronization mode for GPU profiling.
194    ///
195    /// # Modes
196    /// - `Immediate`: Sync after each kernel (accurate but slow)
197    /// - `PerLayer`: Sync once per transformer layer
198    /// - `Deferred`: Sync once per forward pass (default, fast)
199    /// - `None`: No synchronization
200    pub fn set_sync_mode(&mut self, mode: SyncMode) {
201        contract_pre_sync_verification!();
202        self.sync_mode = mode;
203    }
204
205    /// Get the current synchronization mode.
206    #[must_use]
207    pub fn sync_mode(&self) -> SyncMode {
208        self.sync_mode
209    }
210
211    /// Reset the epoch for deferred timing.
212    /// Call this at the start of a forward pass.
213    pub fn reset_epoch(&mut self) {
214        self.epoch = Instant::now();
215    }
216
217    /// Get nanoseconds elapsed since epoch.
218    #[inline]
219    pub fn elapsed_ns(&self) -> u64 {
220        self.epoch.elapsed().as_nanos() as u64
221    }
222
223    // ========================================================================
224    // PAR-200: Fast Path API (BrickId-based)
225    // ========================================================================
226
227    /// Start timing a brick using BrickId (O(1) hot path).
228    ///
229    /// This is the preferred API for known brick types.
230    /// For GPU operations, call `stream.synchronize()` before `stop_brick()`.
231    #[inline]
232    #[must_use]
233    pub fn start_brick(&self, brick_id: BrickId) -> BrickIdTimer {
234        BrickIdTimer { brick_id, start: Instant::now() }
235    }
236
237    /// Stop timing and record the sample (O(1) hot path).
238    #[inline]
239    pub fn stop_brick(&mut self, timer: BrickIdTimer, elements: u64) {
240        if !self.enabled {
241            return;
242        }
243
244        let elapsed = timer.start.elapsed();
245        let elapsed_ns = elapsed.as_nanos() as u64;
246
247        // O(1) array access — CB-BUDGET: bounds-check brick_id
248        debug_assert!(
249            (timer.brick_id as usize) < self.brick_stats.len(),
250            "CB-BUDGET: brick_id {} out of bounds (max {})",
251            timer.brick_id as usize,
252            self.brick_stats.len()
253        );
254        let stats = &mut self.brick_stats[timer.brick_id as usize];
255        stats.add_sample(elapsed_ns, elements);
256
257        // Update totals
258        self.total_tokens += elements;
259        self.total_ns += elapsed_ns;
260    }
261
262    /// Get statistics for a known brick type (O(1)).
263    #[inline]
264    #[must_use]
265    pub fn brick_stats(&self, brick_id: BrickId) -> &BrickStats {
266        contract_pre_brick_ordering!();
267        &self.brick_stats[brick_id as usize]
268    }
269
270    /// Get mutable statistics for a known brick type (O(1)).
271    #[inline]
272    pub fn brick_stats_mut(&mut self, brick_id: BrickId) -> &mut BrickStats {
273        &mut self.brick_stats[brick_id as usize]
274    }
275
276    // ========================================================================
277    // PAR-200: Deferred Sync API
278    // ========================================================================
279
280    /// Record a measurement without GPU sync (deferred mode).
281    ///
282    /// Call `finalize()` after GPU sync to apply all pending measurements.
283    ///
284    /// # Arguments
285    /// - `brick_id`: The brick type
286    /// - `start_ns`: Start time (from `elapsed_ns()` at operation start)
287    /// - `elements`: Number of elements processed
288    #[inline]
289    pub fn record_deferred(&mut self, brick_id: BrickId, start_ns: u64, elements: u64) {
290        if !self.enabled {
291            return;
292        }
293        self.pending.push(PendingMeasurement {
294            brick_id: Some(brick_id),
295            name: None,
296            start_ns,
297            elements,
298        });
299    }
300
301    /// Record a measurement for a dynamic brick (deferred mode).
302    #[inline]
303    pub fn record_deferred_dynamic(&mut self, name: &str, start_ns: u64, elements: u64) {
304        if !self.enabled {
305            return;
306        }
307        self.pending.push(PendingMeasurement {
308            brick_id: BrickId::from_str(name),
309            name: Some(name.to_string()),
310            start_ns,
311            elements,
312        });
313    }
314
315    /// Finalize all pending measurements after GPU sync.
316    ///
317    /// Must be called after `stream.synchronize()` to get accurate timing.
318    ///
319    /// # Arguments
320    /// - `end_ns`: End time (from `elapsed_ns()` after sync)
321    pub fn finalize(&mut self, end_ns: u64) {
322        if self.pending.is_empty() {
323            return;
324        }
325
326        // Calculate elapsed time for each pending measurement
327        for m in self.pending.drain(..) {
328            let elapsed_ns = end_ns.saturating_sub(m.start_ns);
329
330            if let Some(brick_id) = m.brick_id {
331                // Fast path: known brick
332                let stats = &mut self.brick_stats[brick_id as usize];
333                stats.add_sample(elapsed_ns, m.elements);
334            } else if let Some(name) = m.name {
335                // Fallback path: dynamic brick lookup
336                let stats = self
337                    .dynamic_stats
338                    .entry(name.clone())
339                    .or_insert_with(|| BrickStats::new(&name));
340                stats.add_sample(elapsed_ns, m.elements);
341            }
342
343            self.total_tokens += m.elements;
344            self.total_ns += elapsed_ns;
345        }
346    }
347
348    /// Check if there are pending measurements.
349    #[inline]
350    #[must_use]
351    pub fn has_pending(&self) -> bool {
352        !self.pending.is_empty()
353    }
354
355    /// Get number of pending measurements.
356    #[inline]
357    #[must_use]
358    pub fn pending_count(&self) -> usize {
359        self.pending.len()
360    }
361
362    // ========================================================================
363    // PAR-200: Category Aggregation
364    // ========================================================================
365
366    /// Get aggregated statistics by category.
367    ///
368    /// Returns an array indexed by `BrickCategory as usize`.
369    #[must_use]
370    pub fn category_stats(&self) -> [CategoryStats; BrickCategory::COUNT] {
371        let mut result = [CategoryStats::default(); BrickCategory::COUNT];
372
373        for (i, stats) in self.brick_stats.iter().enumerate() {
374            let brick_id = BrickId::ALL[i];
375            let cat = brick_id.category() as usize;
376            result[cat].total_ns += stats.total_ns;
377            result[cat].total_elements += stats.total_elements;
378            result[cat].count += stats.count;
379        }
380
381        // Include dynamic stats in "Other" category
382        for stats in self.dynamic_stats.values() {
383            let cat = BrickCategory::Other as usize;
384            result[cat].total_ns += stats.total_ns;
385            result[cat].total_elements += stats.total_elements;
386            result[cat].count += stats.count;
387        }
388
389        result
390    }
391
392    /// Set L2 cache hit rate (v1.1.0 OBSERVE phase)
393    pub fn set_l2_cache_hit_rate(&mut self, rate: f32) {
394        self.l2_cache_hit_rate = Some(rate.clamp(0.0, 1.0));
395    }
396
397    /// Get L2 cache hit rate
398    pub fn l2_cache_hit_rate(&self) -> Option<f32> {
399        self.l2_cache_hit_rate
400    }
401
402    /// Set zero-copy mode (v1.1.0 OBSERVE phase)
403    pub fn set_zero_copy(&mut self, enabled: bool) {
404        self.is_zero_copy = enabled;
405    }
406
407    /// Check if zero-copy is enabled
408    pub fn is_zero_copy(&self) -> bool {
409        self.is_zero_copy
410    }
411
412    /// Enable profiling.
413    pub fn enable(&mut self) {
414        self.enabled = true;
415    }
416
417    /// Disable profiling.
418    pub fn disable(&mut self) {
419        self.enabled = false;
420    }
421
422    /// Check if profiling is enabled.
423    #[must_use]
424    pub fn is_enabled(&self) -> bool {
425        self.enabled
426    }
427
428    /// Get total throughput across all bricks.
429    #[must_use]
430    pub fn total_throughput(&self) -> f64 {
431        if self.total_ns == 0 {
432            0.0
433        } else {
434            self.total_tokens as f64 / (self.total_ns as f64 / 1_000_000_000.0)
435        }
436    }
437
438    /// Get total tokens processed.
439    #[must_use]
440    pub fn total_tokens(&self) -> u64 {
441        contract_pre_token_accounting!();
442        self.total_tokens
443    }
444
445    /// Get total time in nanoseconds.
446    #[must_use]
447    pub fn total_ns(&self) -> u64 {
448        contract_pre_wall_coverage!();
449        self.total_ns
450    }
451}