use std::collections::HashMap;
/// Identifier for a profiling metric.
///
/// Every variant maps to a fixed metric-name string through
/// [`MetricId::cupti_name`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MetricId {
    SmUtilization,
    AchievedOccupancy,
    DramThroughput,
    L1HitRate,
    L2HitRate,
    WarpExecutionEfficiency,
    BranchEfficiency,
    GlobalLoadEfficiency,
    GlobalStoreEfficiency,
    SharedLoadEfficiency,
    SharedStoreEfficiency,
    TensorCoreUtilization,
    Flop32,
    Flop16,
    Ipc,
    /// Caller-defined metric carrying a `u32` payload. The payload is not
    /// reflected in [`MetricId::cupti_name`].
    Custom(u32),
}

impl MetricId {
    /// Returns the static metric-name string associated with this identifier.
    ///
    /// Every [`MetricId::Custom`] value, regardless of its payload, yields the
    /// literal `"custom"`.
    // NOTE(review): judging by the method name these are presumably the names
    // handed to NVIDIA's CUPTI profiling interface — confirm against the caller.
    pub fn cupti_name(&self) -> &'static str {
        let name = match *self {
            Self::SmUtilization => "sm__throughput.avg.pct_of_peak_sustained_elapsed",
            Self::AchievedOccupancy => "sm__warps_active.avg.pct_of_peak_sustained_elapsed",
            Self::DramThroughput => "dram__throughput.avg.pct_of_peak_sustained_elapsed",
            Self::L1HitRate => "l1tex__t_sector_hit_rate.pct",
            Self::L2HitRate => "lts__t_sector_hit_rate.pct",
            Self::WarpExecutionEfficiency => "smsp__thread_inst_executed_per_inst_executed.pct",
            Self::BranchEfficiency => "smsp__sass_average_branch_targets_threads_uniform.pct",
            Self::GlobalLoadEfficiency => {
                "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct_of_peak_sustained_elapsed"
            }
            Self::GlobalStoreEfficiency => {
                "smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct_of_peak_sustained_elapsed"
            }
            Self::SharedLoadEfficiency => {
                "smsp__sass_average_data_bytes_per_sector_mem_shared_op_ld.pct_of_peak_sustained_elapsed"
            }
            Self::SharedStoreEfficiency => {
                "smsp__sass_average_data_bytes_per_sector_mem_shared_op_st.pct_of_peak_sustained_elapsed"
            }
            Self::TensorCoreUtilization => {
                "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed"
            }
            Self::Flop32 => "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum",
            Self::Flop16 => "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum",
            Self::Ipc => "sm__inst_executed.avg.per_cycle_active",
            Self::Custom(_) => "custom",
        };
        name
    }
}
/// A single metric reading, tagged with the kind of quantity it holds.
///
/// Floating-point kinds wrap an `f64`; counting kinds wrap a `u64`.
#[derive(Debug, Clone)]
pub enum MetricValue {
    Percent(f64),
    Count(u64),
    Rate(f64),
    Bytes(u64),
    Throughput(f64),
}

impl MetricValue {
    /// Returns the wrapped number as an `f64`, discarding the kind tag.
    ///
    /// Integer payloads are converted with `as f64`, so values above 2^53 may
    /// lose precision.
    pub fn as_f64(&self) -> f64 {
        match *self {
            Self::Percent(real) | Self::Rate(real) | Self::Throughput(real) => real,
            Self::Count(whole) | Self::Bytes(whole) => whole as f64,
        }
    }

    /// Returns `true` when the wrapped number is strictly greater than
    /// `threshold`. A NaN on either side yields `false`.
    pub fn exceeds(&self, threshold: f64) -> bool {
        let value = self.as_f64();
        threshold < value
    }
}
/// Per-SM counters and derived figures.
///
/// All fields default to zero.
// NOTE(review): `utilization` is compared against 50.0 in `is_underutilized`, so
// it is presumably a percentage in 0..=100 — confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct SmMetrics {
    pub utilization: f64,
    pub occupancy: f64,
    pub active_warps: f64,
    pub active_cycles: u64,
    pub elapsed_cycles: u64,
    pub instructions_executed: u64,
}

impl SmMetrics {
    /// Instructions per active cycle: `instructions_executed / active_cycles`.
    ///
    /// Returns `0.0` when `active_cycles` is zero, so the division never
    /// produces NaN or infinity.
    pub fn ipc(&self) -> f64 {
        match self.active_cycles {
            0 => 0.0,
            cycles => self.instructions_executed as f64 / cycles as f64,
        }
    }

    /// Returns `true` when `utilization` is strictly below 50.0.
    pub fn is_underutilized(&self) -> bool {
        const UNDERUTILIZED_BELOW: f64 = 50.0;
        self.utilization < UNDERUTILIZED_BELOW
    }
}
/// Warp-level efficiency figures and a breakdown of stall reasons.
///
/// All numeric fields default to zero and `stall_reasons` defaults to empty.
#[derive(Debug, Clone, Default)]
pub struct WarpMetrics {
    pub execution_efficiency: f64,
    pub branch_efficiency: f64,
    /// Weight recorded for each stall reason; larger means more significant.
    pub stall_reasons: HashMap<WarpStallReason, f64>,
    pub active_threads_per_warp: f64,
}

impl WarpMetrics {
    /// Returns `true` when `execution_efficiency` is below 75.0 or
    /// `branch_efficiency` is below 90.0.
    pub fn has_divergence(&self) -> bool {
        self.execution_efficiency < 75.0 || self.branch_efficiency < 90.0
    }

    /// Returns the stall reason with the largest recorded weight.
    ///
    /// Returns `None` when no reason has a comparable weight, i.e. the map is
    /// empty or every weight is NaN. NaN weights are skipped rather than
    /// causing a panic. When several reasons share the maximum weight, which
    /// one is returned depends on the map's iteration order and is therefore
    /// unspecified.
    pub fn primary_stall_reason(&self) -> Option<WarpStallReason> {
        self.stall_reasons
            .iter()
            // A NaN weight carries no ordering information; skipping it keeps
            // one bad sample from hiding the real dominant reason.
            .filter(|(_, weight)| !weight.is_nan())
            // `total_cmp` is a total order on f64, so no `unwrap` is needed.
            .max_by(|a, b| a.1.total_cmp(b.1))
            .map(|(reason, _)| *reason)
    }
}

/// Categories of warp stall tracked in [`WarpMetrics::stall_reasons`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WarpStallReason {
    InstructionFetch,
    MemoryDependency,
    ExecutionDependency,
    Synchronization,
    TextureFetch,
    Other,
}
/// Memory-subsystem figures: DRAM throughput, cache hit rates and
/// global-memory access efficiency.
///
/// All fields default to zero.
#[derive(Debug, Clone, Default)]
pub struct MemoryMetrics {
    pub dram_throughput: f64,
    pub l1_hit_rate: f64,
    pub l2_hit_rate: f64,
    pub global_load_efficiency: f64,
    pub global_store_efficiency: f64,
    pub shared_throughput: f64,
}

impl MemoryMetrics {
    /// Returns `true` when `dram_throughput` exceeds 100.0 and at least one of
    /// the L1 / L2 hit rates is below 50.0.
    // NOTE(review): the unit of `dram_throughput` is not visible here. If it is a
    // percentage of peak, `> 100.0` can never hold — confirm the intended unit.
    pub fn is_memory_bound(&self) -> bool {
        let dram_over_limit = self.dram_throughput > 100.0;
        let cache_hit_rate_low = self.l1_hit_rate < 50.0 || self.l2_hit_rate < 50.0;
        dram_over_limit && cache_hit_rate_low
    }

    /// Returns `true` when either the global load or the global store
    /// efficiency is below 80.0.
    pub fn has_coalescing_issues(&self) -> bool {
        [self.global_load_efficiency, self.global_store_efficiency]
            .iter()
            .any(|&efficiency| efficiency < 80.0)
    }
}
/// One sample of SM, warp and memory metrics plus a timestamp.
#[derive(Debug, Clone, Default)]
pub struct MetricsSnapshot {
    pub sm: SmMetrics,
    pub warp: WarpMetrics,
    pub memory: MemoryMetrics,
    pub tensor_core_utilization: f64,
    pub timestamp_ns: u64,
}

impl MetricsSnapshot {
    /// Weighted blend of four figures into a single score.
    ///
    /// Weights: 0.3 SM utilization, 0.2 SM occupancy, 0.3 mean of the L1 and L2
    /// hit rates, 0.2 warp execution efficiency. The weights sum to 1.0, so the
    /// score stays on the same scale as its inputs.
    pub fn efficiency_score(&self) -> f64 {
        let cache_hit_sum = self.memory.l1_hit_rate + self.memory.l2_hit_rate;

        // Each term is computed and summed in the same order as a single
        // left-to-right expression, so rounding is unchanged.
        let sm_term = 0.3 * self.sm.utilization;
        let occupancy_term = 0.2 * self.sm.occupancy;
        let cache_term = 0.3 * cache_hit_sum / 2.0;
        let warp_term = 0.2 * self.warp.execution_efficiency;

        sm_term + occupancy_term + cache_term + warp_term
    }

    /// Classifies the snapshot into a single [`Bottleneck`].
    ///
    /// Decision order:
    /// 1. SM underutilized: `Memory` if memory-bound, else `WarpDivergence` if
    ///    the warps diverge, else `LaunchOverhead`.
    /// 2. Otherwise `MemoryCoalescing` if global accesses coalesce poorly.
    /// 3. Otherwise `Occupancy` if SM occupancy is below 50.0.
    /// 4. Otherwise `Compute`.
    pub fn primary_bottleneck(&self) -> Bottleneck {
        if self.sm.is_underutilized() {
            return if self.memory.is_memory_bound() {
                Bottleneck::Memory
            } else if self.warp.has_divergence() {
                Bottleneck::WarpDivergence
            } else {
                Bottleneck::LaunchOverhead
            };
        }

        if self.memory.has_coalescing_issues() {
            return Bottleneck::MemoryCoalescing;
        }

        if self.sm.occupancy < 50.0 {
            Bottleneck::Occupancy
        } else {
            Bottleneck::Compute
        }
    }
}
/// Classification produced by `MetricsSnapshot::primary_bottleneck`.
///
/// The per-variant notes describe when that method returns each value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Bottleneck {
/// Fallback: the SM is not underutilized, there are no coalescing issues and
/// occupancy is at least 50.0.
Compute,
/// The SM is underutilized and the memory metrics report memory-bound.
Memory,
/// The SM is not underutilized but global load/store efficiency is low.
MemoryCoalescing,
/// The SM is not underutilized, coalescing is fine, and occupancy is below 50.0.
Occupancy,
/// The SM is underutilized, not memory-bound, and the warp metrics report divergence.
WarpDivergence,
/// The SM is underutilized with neither a memory-bound nor a divergence signal.
LaunchOverhead,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used for floating-point comparisons.
    const EPSILON: f64 = 0.001;

    /// `SmUtilization` maps to its expected metric-name string.
    #[test]
    fn test_metric_id_name() {
        let expected = "sm__throughput.avg.pct_of_peak_sustained_elapsed";
        assert_eq!(MetricId::SmUtilization.cupti_name(), expected);
    }

    /// `as_f64` round-trips the payload and `exceeds` is a strict comparison.
    #[test]
    fn test_metric_value() {
        let pct = MetricValue::Percent(75.5);
        let delta = (pct.as_f64() - 75.5).abs();
        assert!(delta < EPSILON);
        assert!(pct.exceeds(50.0));
        assert!(!pct.exceeds(80.0));
    }

    /// 4000 instructions over 1000 active cycles gives an IPC of 4.
    #[test]
    fn test_sm_metrics_ipc() {
        let sm = SmMetrics {
            instructions_executed: 4000,
            active_cycles: 1000,
            ..Default::default()
        };
        let delta = (sm.ipc() - 4.0).abs();
        assert!(delta < EPSILON);
    }

    /// Divergence is reported once execution efficiency drops below its limit.
    #[test]
    fn test_warp_divergence() {
        let healthy = WarpMetrics {
            execution_efficiency: 90.0,
            branch_efficiency: 95.0,
            ..Default::default()
        };
        assert!(!healthy.has_divergence());

        let divergent = WarpMetrics {
            execution_efficiency: 60.0,
            ..healthy
        };
        assert!(divergent.has_divergence());
    }

    /// An underutilized SM with a memory-bound profile is classified as `Memory`.
    #[test]
    fn test_bottleneck_detection() {
        let snapshot = MetricsSnapshot {
            sm: SmMetrics {
                utilization: 30.0,
                ..Default::default()
            },
            memory: MemoryMetrics {
                dram_throughput: 200.0,
                l1_hit_rate: 30.0,
                ..Default::default()
            },
            ..Default::default()
        };
        assert_eq!(snapshot.primary_bottleneck(), Bottleneck::Memory);
    }

    /// A reasonably healthy snapshot scores strictly between 50 and 100.
    #[test]
    fn test_efficiency_score() {
        let snapshot = MetricsSnapshot {
            sm: SmMetrics {
                utilization: 80.0,
                occupancy: 70.0,
                ..Default::default()
            },
            memory: MemoryMetrics {
                l1_hit_rate: 60.0,
                l2_hit_rate: 80.0,
                ..Default::default()
            },
            warp: WarpMetrics {
                execution_efficiency: 90.0,
                ..Default::default()
            },
            ..Default::default()
        };
        let score = snapshot.efficiency_score();
        assert!(score > 50.0);
        assert!(score < 100.0);
    }
}