Skip to main content

trueno/brick/profiler/
tile_stats.rs

1//! Tile-level profiling statistics.
2
3use std::time::Instant;
4
5/// Tile-level profiling statistics.
6///
7/// Tracks per-tile performance metrics for hierarchical cache-blocked operations.
8/// Used in conjunction with `TcbGeometry` and `TilingConfig` from the tiling module.
9///
10/// # Example
11///
12/// ```ignore
13/// let mut profiler = BrickProfiler::new();
14/// profiler.enable();
15///
16/// let tile_timer = profiler.start_tile(TileLevel::Macro, 0, 0);
17/// // ... execute tile ...
18/// profiler.stop_tile(tile_timer, 1024 * 1024);
19/// ```
20#[derive(Debug, Clone, Default)]
21pub struct TileStats {
22    /// Tile level (Macro/Midi/Micro)
23    pub level: TileLevel,
24    /// Total samples collected
25    pub count: u64,
26    /// Total elapsed time (nanoseconds)
27    pub total_ns: u64,
28    /// Min elapsed time (nanoseconds)
29    pub min_ns: u64,
30    /// Max elapsed time (nanoseconds)
31    pub max_ns: u64,
32    /// Total elements processed
33    pub total_elements: u64,
34    /// Total cache misses (estimated)
35    pub cache_misses: u64,
36    /// Total arithmetic operations
37    pub total_flops: u64,
38}
39
/// Level of the tile hierarchy that a profiling sample belongs to.
///
/// `Macro` is the default level.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum TileLevel {
    /// Outermost tile, sized for L3 cache / GPU global memory.
    #[default]
    Macro,
    /// Intermediate tile, sized for L2 cache / GPU shared memory.
    Midi,
    /// Innermost tile, sized for registers / SIMD lanes.
    Micro,
}
51
52impl TileLevel {
53    /// Get the name of this tile level.
54    #[must_use]
55    pub const fn name(&self) -> &'static str {
56        match self {
57            TileLevel::Macro => "macro",
58            TileLevel::Midi => "midi",
59            TileLevel::Micro => "micro",
60        }
61    }
62}
63
64impl TileStats {
65    /// Create new tile stats for a given level.
66    pub fn new(level: TileLevel) -> Self {
67        Self {
68            level,
69            count: 0,
70            total_ns: 0,
71            min_ns: u64::MAX,
72            max_ns: 0,
73            total_elements: 0,
74            cache_misses: 0,
75            total_flops: 0,
76        }
77    }
78
79    /// Add a sample to statistics.
80    pub fn add_sample(&mut self, elapsed_ns: u64, elements: u64, flops: u64) {
81        debug_assert!(elements > 0, "CB-BUDGET: tile sample elements must be > 0");
82        self.count += 1;
83        self.total_ns += elapsed_ns;
84        self.min_ns = self.min_ns.min(elapsed_ns);
85        self.max_ns = self.max_ns.max(elapsed_ns);
86        self.total_elements += elements;
87        self.total_flops += flops;
88    }
89
90    /// Average time in microseconds.
91    #[must_use]
92    pub fn avg_us(&self) -> f64 {
93        if self.count == 0 {
94            0.0
95        } else {
96            self.total_ns as f64 / self.count as f64 / 1000.0
97        }
98    }
99
100    /// Throughput in elements/second.
101    #[must_use]
102    pub fn throughput(&self) -> f64 {
103        if self.total_ns == 0 {
104            0.0
105        } else {
106            self.total_elements as f64 / (self.total_ns as f64 / 1_000_000_000.0)
107        }
108    }
109
110    /// Compute throughput in GFLOP/s.
111    #[must_use]
112    pub fn gflops(&self) -> f64 {
113        if self.total_ns == 0 {
114            0.0
115        } else {
116            self.total_flops as f64 / (self.total_ns as f64 / 1_000_000_000.0) / 1e9
117        }
118    }
119
120    /// Arithmetic intensity (FLOP/byte) estimate.
121    ///
122    /// Assumes 4 bytes per element (f32).
123    #[must_use]
124    pub fn arithmetic_intensity(&self) -> f64 {
125        if self.total_elements == 0 {
126            0.0
127        } else {
128            self.total_flops as f64 / (self.total_elements as f64 * 4.0)
129        }
130    }
131
132    /// Estimated cache efficiency (0.0-1.0).
133    ///
134    /// Based on ratio of actual throughput vs theoretical peak.
135    #[must_use]
136    pub fn cache_efficiency(&self, peak_gflops: f64) -> f64 {
137        if peak_gflops <= 0.0 {
138            0.0
139        } else {
140            (self.gflops() / peak_gflops).min(1.0)
141        }
142    }
143}
144
145/// Timer handle for tile-level profiling.
146#[derive(Debug)]
147pub struct TileTimer {
148    /// Tile level
149    pub(crate) level: TileLevel,
150    /// Row index within parent tile (reserved for spatial analysis)
151    pub(crate) _row: u32,
152    /// Column index within parent tile (reserved for spatial analysis)
153    pub(crate) _col: u32,
154    /// Start time
155    pub(crate) start: Instant,
156}