// trueno_gpu/monitor/compute/mod.rs
//! Compute Flow Visualization (TRUENO-SPEC-022)
//!
//! Pipeline metrics for tracking compute throughput, kernel execution,
//! and efficiency across CPU and GPU devices.
//!
//! # Pipeline Stages
//!
//! ```text
//! INPUT → COMPUTE → REDUCE → OUTPUT
//! (H2D)   (Kernel)  (Tile)   (D2H)
//! ```
//!
//! # References
//!
//! - [Harris2007] Optimizing Parallel Reduction in CUDA
//! - [Volkov2008] Tile size optimization

use std::collections::VecDeque;
use std::time::{Duration, Instant};

use super::device::DeviceId;

// ============================================================================
// Compute Metrics (TRUENO-SPEC-022 Section 4.2)
// ============================================================================

/// Compute pipeline metrics
///
/// Aggregated snapshot of per-device activity, in-flight kernel executions,
/// per-stage pipeline latencies, and throughput/efficiency figures.
#[derive(Debug, Clone)]
pub struct ComputeMetrics {
    /// Per-device compute metrics
    pub devices: Vec<DeviceComputeMetrics>,

    /// Active kernel executions
    pub active_kernels: Vec<KernelExecution>,

    // Pipeline stage latencies (INPUT → COMPUTE → REDUCE → OUTPUT),
    // summed by `total_latency_ms`.
    /// Input stage latency (H2D transfers)
    pub input_latency_ms: f64,
    /// Compute stage latency (kernel execution)
    pub compute_latency_ms: f64,
    /// Reduce stage latency (tile reduction)
    pub reduce_latency_ms: f64,
    /// Output stage latency (D2H transfers)
    pub output_latency_ms: f64,

    /// Throughput in operations per second
    pub operations_per_second: f64,
    /// Achieved FLOPS
    pub flops_achieved: f64,
    /// Theoretical peak FLOPS (used as the denominator in
    /// `efficiency_percent`)
    pub flops_theoretical: f64,

    /// Compute efficiency percentage
    pub compute_efficiency_pct: f64,
    /// Memory efficiency percentage
    pub memory_efficiency_pct: f64,
}

59impl ComputeMetrics {
60    /// Create new compute metrics
61    #[must_use]
62    pub fn new() -> Self {
63        Self::default()
64    }
65
66    /// Calculate total pipeline latency
67    #[must_use]
68    pub fn total_latency_ms(&self) -> f64 {
69        self.input_latency_ms
70            + self.compute_latency_ms
71            + self.reduce_latency_ms
72            + self.output_latency_ms
73    }
74
75    /// Calculate throughput in operations per second
76    #[must_use]
77    pub fn throughput_ops(&self) -> f64 {
78        let latency_s = self.total_latency_ms() / 1000.0;
79        if latency_s > 0.0 {
80            1.0 / latency_s
81        } else {
82            0.0
83        }
84    }
85
86    /// Get compute efficiency as a percentage
87    #[must_use]
88    pub fn efficiency_percent(&self) -> f64 {
89        if self.flops_theoretical > 0.0 {
90            (self.flops_achieved / self.flops_theoretical) * 100.0
91        } else {
92            0.0
93        }
94    }
95
96    /// Add a device's compute metrics
97    pub fn add_device(&mut self, device_metrics: DeviceComputeMetrics) {
98        self.devices.push(device_metrics);
99    }
100
101    /// Track a kernel execution
102    pub fn track_kernel(&mut self, kernel: KernelExecution) {
103        self.active_kernels.push(kernel);
104    }
105
106    /// Clear completed kernels
107    pub fn clear_completed_kernels(&mut self) {
108        self.active_kernels
109            .retain(|k| k.status != KernelStatus::Completed);
110    }
111}
112
113impl Default for ComputeMetrics {
114    fn default() -> Self {
115        Self {
116            devices: Vec::new(),
117            active_kernels: Vec::new(),
118            input_latency_ms: 0.0,
119            compute_latency_ms: 0.0,
120            reduce_latency_ms: 0.0,
121            output_latency_ms: 0.0,
122            operations_per_second: 0.0,
123            flops_achieved: 0.0,
124            flops_theoretical: 0.0,
125            compute_efficiency_pct: 0.0,
126            memory_efficiency_pct: 0.0,
127        }
128    }
129}
130
// ============================================================================
// Device Compute Metrics
// ============================================================================

/// Per-device compute metrics
///
/// Live utilization/thermal/power sample for one device, plus a short
/// rolling history used to render a sparkline.
#[derive(Debug, Clone)]
pub struct DeviceComputeMetrics {
    /// Device ID
    pub device_id: DeviceId,
    /// Compute utilization (0.0-100.0)
    pub utilization_pct: f64,
    /// Streaming multiprocessor / Compute unit active percentage
    pub sm_active_pct: f64,
    /// Active warps
    pub warps_active: u32,
    /// Maximum warps
    pub warps_max: u32,
    /// Current clock speed in MHz
    pub clock_mhz: u32,
    /// Maximum clock speed in MHz
    pub clock_max_mhz: u32,
    /// Current power in watts
    pub power_watts: f64,
    /// Power limit in watts
    pub power_limit_watts: f64,
    /// Temperature in Celsius
    pub temperature_c: f64,
    /// Throttle reason (if any); `Some(ThrottleReason::None)` is treated
    /// as "not throttling" by `is_throttling`
    pub throttle_reason: Option<ThrottleReason>,
    /// Utilization history, most recent sample last; capped at
    /// `MAX_HISTORY_POINTS` entries (60-point sparkline)
    pub history: VecDeque<f64>,
}

164impl DeviceComputeMetrics {
165    /// Maximum history points
166    pub const MAX_HISTORY_POINTS: usize = 60;
167
168    /// Create new device compute metrics
169    #[must_use]
170    pub fn new(device_id: DeviceId) -> Self {
171        Self {
172            device_id,
173            utilization_pct: 0.0,
174            sm_active_pct: 0.0,
175            warps_active: 0,
176            warps_max: 0,
177            clock_mhz: 0,
178            clock_max_mhz: 0,
179            power_watts: 0.0,
180            power_limit_watts: 0.0,
181            temperature_c: 0.0,
182            throttle_reason: None,
183            history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
184        }
185    }
186
187    /// Update utilization and add to history
188    pub fn update_utilization(&mut self, pct: f64) {
189        self.utilization_pct = pct;
190        self.history.push_back(pct);
191        if self.history.len() > Self::MAX_HISTORY_POINTS {
192            self.history.pop_front();
193        }
194    }
195
196    /// Get warp occupancy percentage
197    #[must_use]
198    pub fn warp_occupancy_pct(&self) -> f64 {
199        if self.warps_max == 0 {
200            return 0.0;
201        }
202        (self.warps_active as f64 / self.warps_max as f64) * 100.0
203    }
204
205    /// Get clock ratio (current/max)
206    #[must_use]
207    pub fn clock_ratio(&self) -> f64 {
208        if self.clock_max_mhz == 0 {
209            return 0.0;
210        }
211        self.clock_mhz as f64 / self.clock_max_mhz as f64
212    }
213
214    /// Get power ratio (current/limit)
215    #[must_use]
216    pub fn power_ratio(&self) -> f64 {
217        if self.power_limit_watts == 0.0 {
218            return 0.0;
219        }
220        self.power_watts / self.power_limit_watts
221    }
222
223    /// Check if device is throttling
224    #[must_use]
225    pub fn is_throttling(&self) -> bool {
226        self.throttle_reason.is_some() && self.throttle_reason != Some(ThrottleReason::None)
227    }
228}
229
// ThrottleReason is defined in device.rs and re-exported from mod.rs
use super::device::ThrottleReason;

// ============================================================================
// Kernel Execution Tracking
// ============================================================================

/// Active kernel execution
///
/// Snapshot of one kernel launch: geometry, resource usage, status, and
/// timing. Built via `KernelExecution::new` plus the `with_*` builders.
#[derive(Debug, Clone)]
pub struct KernelExecution {
    /// Kernel name
    pub name: String,
    /// Grid dimensions (x, y, z)
    pub grid_dim: (u32, u32, u32),
    /// Block dimensions (x, y, z)
    pub block_dim: (u32, u32, u32),
    /// Shared memory per block in bytes
    pub shared_mem_bytes: usize,
    /// Registers per thread
    pub registers_per_thread: u32,
    /// Theoretical occupancy percentage
    pub occupancy_pct: f64,
    /// Elapsed time in milliseconds (written by `update_elapsed`/`complete`)
    pub elapsed_ms: f64,
    /// Execution status
    pub status: KernelStatus,
    /// Device executing this kernel
    pub device_id: DeviceId,
    /// Start time (reset when `start` is called)
    pub start_time: Instant,
}

262impl KernelExecution {
263    /// Create a new kernel execution tracker
264    #[must_use]
265    pub fn new(name: impl Into<String>, device_id: DeviceId) -> Self {
266        Self {
267            name: name.into(),
268            grid_dim: (1, 1, 1),
269            block_dim: (1, 1, 1),
270            shared_mem_bytes: 0,
271            registers_per_thread: 0,
272            occupancy_pct: 0.0,
273            elapsed_ms: 0.0,
274            status: KernelStatus::Pending,
275            device_id,
276            start_time: Instant::now(),
277        }
278    }
279
280    /// Set grid and block dimensions
281    #[must_use]
282    pub fn with_dims(mut self, grid: (u32, u32, u32), block: (u32, u32, u32)) -> Self {
283        self.grid_dim = grid;
284        self.block_dim = block;
285        self
286    }
287
288    /// Set shared memory usage
289    #[must_use]
290    pub fn with_shared_mem(mut self, bytes: usize) -> Self {
291        self.shared_mem_bytes = bytes;
292        self
293    }
294
295    /// Set register usage
296    #[must_use]
297    pub fn with_registers(mut self, regs: u32) -> Self {
298        self.registers_per_thread = regs;
299        self
300    }
301
302    /// Get total thread count
303    #[must_use]
304    pub fn total_threads(&self) -> u64 {
305        let grid_total = self.grid_dim.0 as u64 * self.grid_dim.1 as u64 * self.grid_dim.2 as u64;
306        let block_total =
307            self.block_dim.0 as u64 * self.block_dim.1 as u64 * self.block_dim.2 as u64;
308        grid_total * block_total
309    }
310
311    /// Get total blocks
312    #[must_use]
313    pub fn total_blocks(&self) -> u64 {
314        self.grid_dim.0 as u64 * self.grid_dim.1 as u64 * self.grid_dim.2 as u64
315    }
316
317    /// Get threads per block
318    #[must_use]
319    pub fn threads_per_block(&self) -> u32 {
320        self.block_dim.0 * self.block_dim.1 * self.block_dim.2
321    }
322
323    /// Mark kernel as running
324    pub fn start(&mut self) {
325        self.status = KernelStatus::Running;
326        self.start_time = Instant::now();
327    }
328
329    /// Mark kernel as completed and record elapsed time
330    pub fn complete(&mut self) {
331        self.status = KernelStatus::Completed;
332        self.elapsed_ms = self.start_time.elapsed().as_secs_f64() * 1000.0;
333    }
334
335    /// Update elapsed time for running kernel
336    pub fn update_elapsed(&mut self) {
337        if self.status == KernelStatus::Running {
338            self.elapsed_ms = self.start_time.elapsed().as_secs_f64() * 1000.0;
339        }
340    }
341
342    /// Get progress percentage (estimated based on time if available)
343    #[must_use]
344    pub fn progress_pct(&self) -> f64 {
345        match self.status {
346            KernelStatus::Pending | KernelStatus::Failed => 0.0,
347            KernelStatus::Completed => 100.0,
348            KernelStatus::Running => {
349                // Can't know actual progress without kernel instrumentation
350                // Return a placeholder based on typical kernel times
351                (self.elapsed_ms / 100.0).min(99.0)
352            }
353        }
354    }
355}
356
/// Kernel execution status
///
/// `Running` and `Completed` are set by `KernelExecution::start` and
/// `KernelExecution::complete` respectively; new trackers begin `Pending`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelStatus {
    /// Kernel queued but not started
    Pending,
    /// Kernel currently executing
    Running,
    /// Kernel completed successfully
    Completed,
    /// Kernel failed
    Failed,
}

// ============================================================================
// FLOPS Calculation Helpers
// ============================================================================

/// Theoretical floating-point operation count for a dense GEMM.
///
/// Each of the `M * N * K` fused multiply-adds counts as two FLOPs,
/// hence FLOPS = 2 * M * N * K.
#[must_use]
pub fn gemm_flops(m: u64, n: u64, k: u64) -> f64 {
    let fma_count = m as f64 * n as f64 * k as f64;
    fma_count * 2.0
}

/// Convert a raw FLOP count and wall-clock duration into GFLOPS.
///
/// A zero-length duration yields 0.0 rather than dividing by zero.
#[must_use]
pub fn achieved_gflops(flops: f64, duration: Duration) -> f64 {
    match duration.as_secs_f64() {
        secs if secs > 0.0 => flops / secs / 1e9,
        _ => 0.0,
    }
}

/// Ratio of achieved to theoretical GFLOPS, as a percentage.
///
/// A theoretical peak that is not strictly positive yields 0.0.
#[must_use]
pub fn compute_efficiency(achieved_gflops: f64, theoretical_gflops: f64) -> f64 {
    // Guard written as `!(.. > 0.0)` so a NaN peak also maps to 0.0.
    if !(theoretical_gflops > 0.0) {
        return 0.0;
    }
    achieved_gflops / theoretical_gflops * 100.0
}

403#[cfg(test)]
404mod tests;