trueno_gpu/monitor/compute/
mod.rs1use std::collections::VecDeque;
19use std::time::{Duration, Instant};
20
21use super::device::DeviceId;
22
23#[derive(Debug, Clone)]
29pub struct ComputeMetrics {
30 pub devices: Vec<DeviceComputeMetrics>,
32
33 pub active_kernels: Vec<KernelExecution>,
35
36 pub input_latency_ms: f64,
39 pub compute_latency_ms: f64,
41 pub reduce_latency_ms: f64,
43 pub output_latency_ms: f64,
45
46 pub operations_per_second: f64,
48 pub flops_achieved: f64,
50 pub flops_theoretical: f64,
52
53 pub compute_efficiency_pct: f64,
55 pub memory_efficiency_pct: f64,
57}
58
59impl ComputeMetrics {
60 #[must_use]
62 pub fn new() -> Self {
63 Self::default()
64 }
65
66 #[must_use]
68 pub fn total_latency_ms(&self) -> f64 {
69 self.input_latency_ms
70 + self.compute_latency_ms
71 + self.reduce_latency_ms
72 + self.output_latency_ms
73 }
74
75 #[must_use]
77 pub fn throughput_ops(&self) -> f64 {
78 let latency_s = self.total_latency_ms() / 1000.0;
79 if latency_s > 0.0 {
80 1.0 / latency_s
81 } else {
82 0.0
83 }
84 }
85
86 #[must_use]
88 pub fn efficiency_percent(&self) -> f64 {
89 if self.flops_theoretical > 0.0 {
90 (self.flops_achieved / self.flops_theoretical) * 100.0
91 } else {
92 0.0
93 }
94 }
95
96 pub fn add_device(&mut self, device_metrics: DeviceComputeMetrics) {
98 self.devices.push(device_metrics);
99 }
100
101 pub fn track_kernel(&mut self, kernel: KernelExecution) {
103 self.active_kernels.push(kernel);
104 }
105
106 pub fn clear_completed_kernels(&mut self) {
108 self.active_kernels
109 .retain(|k| k.status != KernelStatus::Completed);
110 }
111}
112
113impl Default for ComputeMetrics {
114 fn default() -> Self {
115 Self {
116 devices: Vec::new(),
117 active_kernels: Vec::new(),
118 input_latency_ms: 0.0,
119 compute_latency_ms: 0.0,
120 reduce_latency_ms: 0.0,
121 output_latency_ms: 0.0,
122 operations_per_second: 0.0,
123 flops_achieved: 0.0,
124 flops_theoretical: 0.0,
125 compute_efficiency_pct: 0.0,
126 memory_efficiency_pct: 0.0,
127 }
128 }
129}
130
131#[derive(Debug, Clone)]
137pub struct DeviceComputeMetrics {
138 pub device_id: DeviceId,
140 pub utilization_pct: f64,
142 pub sm_active_pct: f64,
144 pub warps_active: u32,
146 pub warps_max: u32,
148 pub clock_mhz: u32,
150 pub clock_max_mhz: u32,
152 pub power_watts: f64,
154 pub power_limit_watts: f64,
156 pub temperature_c: f64,
158 pub throttle_reason: Option<ThrottleReason>,
160 pub history: VecDeque<f64>,
162}
163
164impl DeviceComputeMetrics {
165 pub const MAX_HISTORY_POINTS: usize = 60;
167
168 #[must_use]
170 pub fn new(device_id: DeviceId) -> Self {
171 Self {
172 device_id,
173 utilization_pct: 0.0,
174 sm_active_pct: 0.0,
175 warps_active: 0,
176 warps_max: 0,
177 clock_mhz: 0,
178 clock_max_mhz: 0,
179 power_watts: 0.0,
180 power_limit_watts: 0.0,
181 temperature_c: 0.0,
182 throttle_reason: None,
183 history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
184 }
185 }
186
187 pub fn update_utilization(&mut self, pct: f64) {
189 self.utilization_pct = pct;
190 self.history.push_back(pct);
191 if self.history.len() > Self::MAX_HISTORY_POINTS {
192 self.history.pop_front();
193 }
194 }
195
196 #[must_use]
198 pub fn warp_occupancy_pct(&self) -> f64 {
199 if self.warps_max == 0 {
200 return 0.0;
201 }
202 (self.warps_active as f64 / self.warps_max as f64) * 100.0
203 }
204
205 #[must_use]
207 pub fn clock_ratio(&self) -> f64 {
208 if self.clock_max_mhz == 0 {
209 return 0.0;
210 }
211 self.clock_mhz as f64 / self.clock_max_mhz as f64
212 }
213
214 #[must_use]
216 pub fn power_ratio(&self) -> f64 {
217 if self.power_limit_watts == 0.0 {
218 return 0.0;
219 }
220 self.power_watts / self.power_limit_watts
221 }
222
223 #[must_use]
225 pub fn is_throttling(&self) -> bool {
226 self.throttle_reason.is_some() && self.throttle_reason != Some(ThrottleReason::None)
227 }
228}
229
230use super::device::ThrottleReason;
232
233#[derive(Debug, Clone)]
239pub struct KernelExecution {
240 pub name: String,
242 pub grid_dim: (u32, u32, u32),
244 pub block_dim: (u32, u32, u32),
246 pub shared_mem_bytes: usize,
248 pub registers_per_thread: u32,
250 pub occupancy_pct: f64,
252 pub elapsed_ms: f64,
254 pub status: KernelStatus,
256 pub device_id: DeviceId,
258 pub start_time: Instant,
260}
261
262impl KernelExecution {
263 #[must_use]
265 pub fn new(name: impl Into<String>, device_id: DeviceId) -> Self {
266 Self {
267 name: name.into(),
268 grid_dim: (1, 1, 1),
269 block_dim: (1, 1, 1),
270 shared_mem_bytes: 0,
271 registers_per_thread: 0,
272 occupancy_pct: 0.0,
273 elapsed_ms: 0.0,
274 status: KernelStatus::Pending,
275 device_id,
276 start_time: Instant::now(),
277 }
278 }
279
280 #[must_use]
282 pub fn with_dims(mut self, grid: (u32, u32, u32), block: (u32, u32, u32)) -> Self {
283 self.grid_dim = grid;
284 self.block_dim = block;
285 self
286 }
287
288 #[must_use]
290 pub fn with_shared_mem(mut self, bytes: usize) -> Self {
291 self.shared_mem_bytes = bytes;
292 self
293 }
294
295 #[must_use]
297 pub fn with_registers(mut self, regs: u32) -> Self {
298 self.registers_per_thread = regs;
299 self
300 }
301
302 #[must_use]
304 pub fn total_threads(&self) -> u64 {
305 let grid_total = self.grid_dim.0 as u64 * self.grid_dim.1 as u64 * self.grid_dim.2 as u64;
306 let block_total =
307 self.block_dim.0 as u64 * self.block_dim.1 as u64 * self.block_dim.2 as u64;
308 grid_total * block_total
309 }
310
311 #[must_use]
313 pub fn total_blocks(&self) -> u64 {
314 self.grid_dim.0 as u64 * self.grid_dim.1 as u64 * self.grid_dim.2 as u64
315 }
316
317 #[must_use]
319 pub fn threads_per_block(&self) -> u32 {
320 self.block_dim.0 * self.block_dim.1 * self.block_dim.2
321 }
322
323 pub fn start(&mut self) {
325 self.status = KernelStatus::Running;
326 self.start_time = Instant::now();
327 }
328
329 pub fn complete(&mut self) {
331 self.status = KernelStatus::Completed;
332 self.elapsed_ms = self.start_time.elapsed().as_secs_f64() * 1000.0;
333 }
334
335 pub fn update_elapsed(&mut self) {
337 if self.status == KernelStatus::Running {
338 self.elapsed_ms = self.start_time.elapsed().as_secs_f64() * 1000.0;
339 }
340 }
341
342 #[must_use]
344 pub fn progress_pct(&self) -> f64 {
345 match self.status {
346 KernelStatus::Pending | KernelStatus::Failed => 0.0,
347 KernelStatus::Completed => 100.0,
348 KernelStatus::Running => {
349 (self.elapsed_ms / 100.0).min(99.0)
352 }
353 }
354 }
355}
356
/// Lifecycle state of a tracked kernel execution.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelStatus {
    /// Created but not yet started (the state set by `KernelExecution::new`).
    Pending,
    /// Currently executing (set by `KernelExecution::start`).
    Running,
    /// Finished (set by `KernelExecution::complete`).
    Completed,
    /// Execution failed.
    Failed,
}
369
/// Returns `2 * m * n * k` as an `f64` — the conventional floating-point
/// operation count for a GEMM with dimensions `m`, `n`, and `k`.
///
/// The multiplication is carried out in `f64`, so large dimensions cannot
/// overflow an integer type.
#[must_use]
pub fn gemm_flops(m: u64, n: u64, k: u64) -> f64 {
    [m, n, k]
        .iter()
        .fold(2.0, |product, &dim| product * dim as f64)
}
381
/// Converts an operation count and a wall-clock duration into GFLOP/s
/// (`flops / seconds / 1e9`).
///
/// Returns `0.0` for a zero-length duration instead of dividing by zero.
#[must_use]
pub fn achieved_gflops(flops: f64, duration: Duration) -> f64 {
    // A `Duration` is never negative, so "not zero" means "strictly positive".
    if duration.is_zero() {
        return 0.0;
    }
    flops / duration.as_secs_f64() / 1e9
}
392
/// Returns achieved throughput as a percentage of the theoretical peak
/// (`achieved_gflops / theoretical_gflops * 100`).
///
/// Returns `0.0` when `theoretical_gflops` is not strictly positive
/// (zero, negative, or NaN).
#[must_use]
pub fn compute_efficiency(achieved_gflops: f64, theoretical_gflops: f64) -> f64 {
    let has_valid_peak = theoretical_gflops > 0.0;
    if !has_valid_peak {
        return 0.0;
    }
    (achieved_gflops / theoretical_gflops) * 100.0
}
402
403#[cfg(test)]
404mod tests;