use std::collections::VecDeque;
use std::time::{Duration, Instant};
use super::device::DeviceId;
#[derive(Debug, Clone)]
/// Aggregate compute metrics: per-device readings, tracked kernel executions,
/// and pipeline-level latency / throughput / efficiency figures.
pub struct ComputeMetrics {
/// Per-device metrics; entries are appended by `add_device`.
pub devices: Vec<DeviceComputeMetrics>,
/// Tracked kernel executions; appended by `track_kernel` and pruned of
/// completed entries by `clear_completed_kernels`.
pub active_kernels: Vec<KernelExecution>,
/// Input-stage latency in milliseconds; one term of `total_latency_ms`.
pub input_latency_ms: f64,
/// Compute-stage latency in milliseconds; one term of `total_latency_ms`.
pub compute_latency_ms: f64,
/// Reduce-stage latency in milliseconds; one term of `total_latency_ms`.
pub reduce_latency_ms: f64,
/// Output-stage latency in milliseconds; one term of `total_latency_ms`.
pub output_latency_ms: f64,
/// Operations per second. Not derived anywhere in this file — presumably
/// filled in by the caller (TODO confirm).
pub operations_per_second: f64,
/// Achieved floating-point throughput; numerator of `efficiency_percent`.
/// Must use the same unit as `flops_theoretical`.
pub flops_achieved: f64,
/// Theoretical peak throughput; denominator of `efficiency_percent`, which
/// returns 0.0 when this value is not positive.
pub flops_theoretical: f64,
/// Compute efficiency percentage. Stored as-is; NOTE(review): nothing in this
/// file keeps it in sync with `efficiency_percent()`.
pub compute_efficiency_pct: f64,
/// Memory efficiency percentage. Stored as-is; not computed in this file.
pub memory_efficiency_pct: f64,
}
impl ComputeMetrics {
    /// Creates an empty metrics snapshot; identical to `ComputeMetrics::default()`.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the sum of the input, compute, reduce and output stage
    /// latencies, in milliseconds.
    #[must_use]
    pub fn total_latency_ms(&self) -> f64 {
        // Accumulate stage by stage, in pipeline order.
        let through_compute = self.input_latency_ms + self.compute_latency_ms;
        let through_reduce = through_compute + self.reduce_latency_ms;
        through_reduce + self.output_latency_ms
    }

    /// Returns the reciprocal of the total latency expressed in seconds.
    ///
    /// Yields `0.0` when the total latency is not strictly positive (including NaN).
    #[must_use]
    pub fn throughput_ops(&self) -> f64 {
        match self.total_latency_ms() / 1000.0 {
            seconds if seconds > 0.0 => 1.0 / seconds,
            _ => 0.0,
        }
    }

    /// Returns `flops_achieved` as a percentage of `flops_theoretical`.
    ///
    /// Yields `0.0` when `flops_theoretical` is not strictly positive.
    #[must_use]
    pub fn efficiency_percent(&self) -> f64 {
        let has_peak = self.flops_theoretical > 0.0;
        if !has_peak {
            return 0.0;
        }
        let fraction = self.flops_achieved / self.flops_theoretical;
        fraction * 100.0
    }

    /// Appends one device's metrics to `devices`.
    pub fn add_device(&mut self, device_metrics: DeviceComputeMetrics) {
        let devices = &mut self.devices;
        devices.push(device_metrics);
    }

    /// Appends a kernel execution to `active_kernels`.
    pub fn track_kernel(&mut self, kernel: KernelExecution) {
        let tracked = &mut self.active_kernels;
        tracked.push(kernel);
    }

    /// Drops every tracked kernel whose status is `KernelStatus::Completed`;
    /// kernels in any other state (including `Failed`) are kept.
    pub fn clear_completed_kernels(&mut self) {
        self.active_kernels
            .retain(|kernel| !matches!(kernel.status, KernelStatus::Completed));
    }
}
impl Default for ComputeMetrics {
    /// Builds an empty snapshot: no devices, no tracked kernels, and every
    /// scalar metric set to zero.
    fn default() -> Self {
        // All scalar metrics share the same zero baseline.
        const ZERO: f64 = 0.0;
        Self {
            devices: Vec::default(),
            active_kernels: Vec::default(),
            input_latency_ms: ZERO,
            compute_latency_ms: ZERO,
            reduce_latency_ms: ZERO,
            output_latency_ms: ZERO,
            operations_per_second: ZERO,
            flops_achieved: ZERO,
            flops_theoretical: ZERO,
            compute_efficiency_pct: ZERO,
            memory_efficiency_pct: ZERO,
        }
    }
}
#[derive(Debug, Clone)]
/// Compute metrics for a single device, plus a bounded history of its
/// utilization samples.
pub struct DeviceComputeMetrics {
/// Identifier of the device these metrics belong to (supplied to `new`).
pub device_id: DeviceId,
/// Most recent utilization value passed to `update_utilization`.
pub utilization_pct: f64,
/// SM-active percentage (per the field name). Initialised to 0.0 by `new`
/// and not otherwise read or written in this file.
pub sm_active_pct: f64,
/// Active warp count; numerator of `warp_occupancy_pct`.
pub warps_active: u32,
/// Maximum warp count; denominator of `warp_occupancy_pct` (0 yields 0.0).
pub warps_max: u32,
/// Current clock; numerator of `clock_ratio`. MHz per the field name.
pub clock_mhz: u32,
/// Maximum clock; denominator of `clock_ratio` (0 yields 0.0).
pub clock_max_mhz: u32,
/// Current power draw; numerator of `power_ratio`. Watts per the field name.
pub power_watts: f64,
/// Power limit; denominator of `power_ratio`.
pub power_limit_watts: f64,
/// Temperature reading — presumably degrees Celsius, going by the name.
/// Initialised to 0.0 by `new` and not otherwise used in this file.
pub temperature_c: f64,
/// Throttle state inspected by `is_throttling`: both `None` and
/// `Some(ThrottleReason::None)` count as "not throttling".
pub throttle_reason: Option<ThrottleReason>,
/// Utilization samples, oldest at the front. `update_utilization` appends
/// to the back and drops from the front once the length exceeds
/// `MAX_HISTORY_POINTS`.
pub history: VecDeque<f64>,
}
impl DeviceComputeMetrics {
    /// Maximum number of utilization samples retained in `history`.
    pub const MAX_HISTORY_POINTS: usize = 60;

    /// Creates zeroed metrics for `device_id` with no throttle reason and an
    /// empty history pre-allocated for `MAX_HISTORY_POINTS` samples.
    #[must_use]
    pub fn new(device_id: DeviceId) -> Self {
        Self {
            device_id,
            utilization_pct: 0.0,
            sm_active_pct: 0.0,
            warps_active: 0,
            warps_max: 0,
            clock_mhz: 0,
            clock_max_mhz: 0,
            power_watts: 0.0,
            power_limit_watts: 0.0,
            temperature_c: 0.0,
            throttle_reason: None,
            history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
        }
    }

    /// Records a new utilization sample: stores it as the current value and
    /// appends it to `history`, discarding the oldest samples so that at most
    /// `MAX_HISTORY_POINTS` remain.
    pub fn update_utilization(&mut self, pct: f64) {
        self.utilization_pct = pct;
        self.history.push_back(pct);
        // `history` is a public field and may have been grown externally, so
        // trim until the cap holds rather than popping a single element.
        while self.history.len() > Self::MAX_HISTORY_POINTS {
            self.history.pop_front();
        }
    }

    /// Returns `warps_active / warps_max` as a percentage, or `0.0` when
    /// `warps_max` is zero.
    #[must_use]
    pub fn warp_occupancy_pct(&self) -> f64 {
        if self.warps_max == 0 {
            return 0.0;
        }
        (f64::from(self.warps_active) / f64::from(self.warps_max)) * 100.0
    }

    /// Returns `clock_mhz / clock_max_mhz` as a fraction, or `0.0` when
    /// `clock_max_mhz` is zero.
    #[must_use]
    pub fn clock_ratio(&self) -> f64 {
        if self.clock_max_mhz == 0 {
            return 0.0;
        }
        f64::from(self.clock_mhz) / f64::from(self.clock_max_mhz)
    }

    /// Returns `power_watts / power_limit_watts` as a fraction.
    ///
    /// Yields `0.0` when the limit is not strictly positive (zero, negative or
    /// NaN), matching the `> 0.0` guards used by the other ratio helpers in
    /// this module instead of producing a negative or NaN ratio.
    #[must_use]
    pub fn power_ratio(&self) -> f64 {
        if self.power_limit_watts > 0.0 {
            self.power_watts / self.power_limit_watts
        } else {
            0.0
        }
    }

    /// Returns `true` when a throttle reason is set and it is not
    /// `ThrottleReason::None`.
    #[must_use]
    pub fn is_throttling(&self) -> bool {
        matches!(&self.throttle_reason, Some(reason) if *reason != ThrottleReason::None)
    }
}
use super::device::ThrottleReason;
#[derive(Debug, Clone)]
/// Record of a single kernel launch: its launch configuration, resource
/// usage figures, timing and lifecycle status.
pub struct KernelExecution {
/// Kernel name supplied to `new`.
pub name: String,
/// Grid dimensions `(x, y, z)`; their product is the block count returned by
/// `total_blocks`. Defaults to `(1, 1, 1)`.
pub grid_dim: (u32, u32, u32),
/// Block dimensions `(x, y, z)`; their product is the value returned by
/// `threads_per_block`. Defaults to `(1, 1, 1)`.
pub block_dim: (u32, u32, u32),
/// Shared-memory size in bytes, set through `with_shared_mem`.
pub shared_mem_bytes: usize,
/// Registers per thread, set through `with_registers`.
pub registers_per_thread: u32,
/// Occupancy percentage. Initialised to 0.0 by `new`; never updated in this
/// file — presumably filled in by the caller (TODO confirm).
pub occupancy_pct: f64,
/// Elapsed time in milliseconds since `start_time`, refreshed by
/// `update_elapsed` (while running) and fixed by `complete`.
pub elapsed_ms: f64,
/// Current lifecycle state; starts as `KernelStatus::Pending`.
pub status: KernelStatus,
/// Device the kernel is associated with (supplied to `new`).
pub device_id: DeviceId,
/// Reference instant for elapsed-time measurement: set when the record is
/// created and reset by `start`.
pub start_time: Instant,
}
impl KernelExecution {
    /// Creates a kernel record in the `KernelStatus::Pending` state.
    ///
    /// Grid and block dimensions default to `(1, 1, 1)`, all counters to zero,
    /// and `start_time` to the moment of construction (it is reset by `start`).
    #[must_use]
    pub fn new(name: impl Into<String>, device_id: DeviceId) -> Self {
        Self {
            name: name.into(),
            grid_dim: (1, 1, 1),
            block_dim: (1, 1, 1),
            shared_mem_bytes: 0,
            registers_per_thread: 0,
            occupancy_pct: 0.0,
            elapsed_ms: 0.0,
            status: KernelStatus::Pending,
            device_id,
            start_time: Instant::now(),
        }
    }

    /// Builder-style setter for the grid and block dimensions.
    #[must_use]
    pub fn with_dims(mut self, grid: (u32, u32, u32), block: (u32, u32, u32)) -> Self {
        self.grid_dim = grid;
        self.block_dim = block;
        self
    }

    /// Builder-style setter for the shared-memory size in bytes.
    #[must_use]
    pub fn with_shared_mem(mut self, bytes: usize) -> Self {
        self.shared_mem_bytes = bytes;
        self
    }

    /// Builder-style setter for the per-thread register count.
    #[must_use]
    pub fn with_registers(mut self, regs: u32) -> Self {
        self.registers_per_thread = regs;
        self
    }

    /// Multiplies the three components of a dimension triple in `u64`,
    /// saturating at `u64::MAX` instead of overflowing.
    fn dim_volume((x, y, z): (u32, u32, u32)) -> u64 {
        // The product of two `u32` values always fits in `u64`; only the
        // third factor can push the result out of range.
        (u64::from(x) * u64::from(y)).saturating_mul(u64::from(z))
    }

    /// Milliseconds elapsed since `start_time`.
    fn elapsed_since_start_ms(&self) -> f64 {
        self.start_time.elapsed().as_secs_f64() * 1000.0
    }

    /// Returns the total thread count (blocks × threads per block),
    /// saturating at `u64::MAX` for dimensions whose product does not fit.
    #[must_use]
    pub fn total_threads(&self) -> u64 {
        Self::dim_volume(self.grid_dim).saturating_mul(Self::dim_volume(self.block_dim))
    }

    /// Returns the number of blocks in the grid, saturating at `u64::MAX`.
    #[must_use]
    pub fn total_blocks(&self) -> u64 {
        Self::dim_volume(self.grid_dim)
    }

    /// Returns the number of threads in one block, saturating at `u32::MAX`
    /// rather than panicking (debug) or wrapping (release) on overflow.
    #[must_use]
    pub fn threads_per_block(&self) -> u32 {
        let (x, y, z) = self.block_dim;
        x.saturating_mul(y).saturating_mul(z)
    }

    /// Marks the kernel as running and restarts the elapsed-time clock.
    pub fn start(&mut self) {
        self.status = KernelStatus::Running;
        self.start_time = Instant::now();
    }

    /// Marks the kernel as completed and records the final elapsed time
    /// measured from `start_time`.
    pub fn complete(&mut self) {
        self.status = KernelStatus::Completed;
        self.elapsed_ms = self.elapsed_since_start_ms();
    }

    /// Refreshes `elapsed_ms` from the clock; a no-op unless the kernel is
    /// currently running.
    pub fn update_elapsed(&mut self) {
        if self.status == KernelStatus::Running {
            self.elapsed_ms = self.elapsed_since_start_ms();
        }
    }

    /// Returns a progress percentage in `[0, 100]`.
    ///
    /// Pending and failed kernels report 0, completed kernels report 100.
    #[must_use]
    pub fn progress_pct(&self) -> f64 {
        match self.status {
            KernelStatus::Pending | KernelStatus::Failed => 0.0,
            KernelStatus::Completed => 100.0,
            // Time-based estimate only: 1% per 100 ms of recorded
            // `elapsed_ms`, capped at 99% until `complete` is called.
            KernelStatus::Running => (self.elapsed_ms / 100.0).min(99.0),
        }
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Lifecycle state of a `KernelExecution`.
pub enum KernelStatus {
/// Initial state assigned by `KernelExecution::new`; reports 0% progress.
Pending,
/// Set by `KernelExecution::start`; elapsed time is refreshed only in this state.
Running,
/// Set by `KernelExecution::complete`; reports 100% progress and is the
/// state removed by `ComputeMetrics::clear_completed_kernels`.
Completed,
/// Failure state; reports 0% progress. Never assigned within this file.
Failed,
}
/// Returns the floating-point operation count of an `m × n × k` matrix
/// multiplication, computed as `2 · m · n · k`.
///
/// The dimensions are converted to `f64` before multiplying, so very large
/// products do not overflow an integer type.
#[must_use]
pub fn gemm_flops(m: u64, n: u64, k: u64) -> f64 {
    let (rows, cols, inner) = (m as f64, n as f64, k as f64);
    2.0 * rows * cols * inner
}
/// Converts an operation count and the time it took into GFLOP/s
/// (`flops / seconds / 1e9`).
///
/// Returns `0.0` for a zero-length `duration`.
#[must_use]
pub fn achieved_gflops(flops: f64, duration: Duration) -> f64 {
    match duration.as_secs_f64() {
        elapsed if elapsed > 0.0 => flops / elapsed / 1e9,
        _ => 0.0,
    }
}
/// Returns `achieved_gflops` as a percentage of `theoretical_gflops`.
///
/// Returns `0.0` when `theoretical_gflops` is not strictly positive
/// (zero, negative or NaN).
#[must_use]
pub fn compute_efficiency(achieved_gflops: f64, theoretical_gflops: f64) -> f64 {
    // Guard first: anything that is not a positive peak has no meaningful ratio.
    if theoretical_gflops <= 0.0 || theoretical_gflops.is_nan() {
        return 0.0;
    }
    let fraction = achieved_gflops / theoretical_gflops;
    fraction * 100.0
}
#[cfg(test)]
mod tests;