use std::time::Duration;
/// Category tag describing what sort of activity a record represents.
///
/// [`ActivityRecord::kind`] yields one of these for every record. In the code
/// visible in this file, `Kernel`, `MemoryCopy` and `Synchronization` are
/// reported by the dedicated `ActivityRecord` variants, while any value can be
/// carried in the `kind` field of a `GenericRecord`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ActivityKind {
    /// Kernel activity.
    Kernel,
    /// Memory copy activity.
    MemoryCopy,
    /// Memory set activity.
    MemorySet,
    /// Synchronization activity.
    Synchronization,
    /// Stream activity.
    Stream,
    /// Context activity.
    Context,
    /// Unified memory activity.
    UnifiedMemory,
    /// PC sampling activity.
    PcSampling,
}
/// One profiled activity, wrapped in the record type that matches its category.
#[derive(Debug, Clone)]
pub enum ActivityRecord {
    /// A [`KernelRecord`]; reported as [`ActivityKind::Kernel`].
    Kernel(KernelRecord),
    /// A [`MemcpyRecord`]; reported as [`ActivityKind::MemoryCopy`].
    Memcpy(MemcpyRecord),
    /// A [`SyncRecord`]; reported as [`ActivityKind::Synchronization`].
    Sync(SyncRecord),
    /// A [`GenericRecord`]; its category is whatever its `kind` field holds.
    Generic(GenericRecord),
}

impl ActivityRecord {
    /// Returns the category of this record.
    ///
    /// The dedicated variants map to a fixed [`ActivityKind`]; a generic
    /// record reports the kind it stores.
    pub fn kind(&self) -> ActivityKind {
        match self {
            Self::Kernel(_) => ActivityKind::Kernel,
            Self::Memcpy(_) => ActivityKind::MemoryCopy,
            Self::Sync(_) => ActivityKind::Synchronization,
            Self::Generic(record) => record.kind,
        }
    }

    /// Returns the `start_ns` timestamp of the wrapped record.
    pub fn start_ns(&self) -> u64 {
        match self {
            Self::Kernel(record) => record.start_ns,
            Self::Memcpy(record) => record.start_ns,
            Self::Sync(record) => record.start_ns,
            Self::Generic(record) => record.start_ns,
        }
    }

    /// Returns the `end_ns` timestamp of the wrapped record.
    pub fn end_ns(&self) -> u64 {
        match self {
            Self::Kernel(record) => record.end_ns,
            Self::Memcpy(record) => record.end_ns,
            Self::Sync(record) => record.end_ns,
            Self::Generic(record) => record.end_ns,
        }
    }

    /// Returns the elapsed time between `start_ns` and `end_ns`.
    ///
    /// If the end timestamp precedes the start timestamp the result is a zero
    /// duration. A plain subtraction would panic in debug builds and wrap to
    /// a huge value in release builds.
    pub fn duration(&self) -> Duration {
        Duration::from_nanos(self.end_ns().saturating_sub(self.start_ns()))
    }
}
/// Timing and launch configuration recorded for a single kernel.
#[derive(Debug, Clone)]
pub struct KernelRecord {
    /// Name of the kernel.
    pub name: String,
    /// Start timestamp in nanoseconds.
    pub start_ns: u64,
    /// End timestamp in nanoseconds.
    pub end_ns: u64,
    /// Identifier of the device.
    pub device_id: u32,
    /// Identifier of the context.
    pub context_id: u32,
    /// Identifier of the stream.
    pub stream_id: u32,
    /// Three-component grid dimensions; all three are multiplied together by
    /// [`KernelRecord::total_threads`].
    pub grid_dim: (u32, u32, u32),
    /// Three-component block dimensions; all three are multiplied together by
    /// [`KernelRecord::total_threads`].
    pub block_dim: (u32, u32, u32),
    /// Static shared memory amount (presumably bytes — confirm with the producer).
    pub static_shared_mem: u32,
    /// Dynamic shared memory amount (presumably bytes — confirm with the producer).
    pub dynamic_shared_mem: u32,
    /// Registers used per thread.
    pub registers_per_thread: u32,
    /// Occupancy value (presumably a fraction in `0.0..=1.0` — confirm with the producer).
    pub occupancy: f32,
}

impl KernelRecord {
    /// Returns `end_ns - start_ns` in nanoseconds.
    ///
    /// Yields `0` when `end_ns` is earlier than `start_ns`, rather than
    /// panicking in debug builds or wrapping in release builds.
    pub fn duration_ns(&self) -> u64 {
        self.end_ns.saturating_sub(self.start_ns)
    }

    /// Returns [`KernelRecord::duration_ns`] as a [`Duration`].
    pub fn duration(&self) -> Duration {
        Duration::from_nanos(self.duration_ns())
    }

    /// Returns the product of all grid and block dimension components.
    ///
    /// The multiplication saturates at `u64::MAX` instead of overflowing.
    pub fn total_threads(&self) -> u64 {
        Self::dim_product(self.grid_dim).saturating_mul(Self::dim_product(self.block_dim))
    }

    /// Returns the sum of static and dynamic shared memory.
    ///
    /// The addition saturates at `u32::MAX` instead of overflowing.
    pub fn total_shared_mem(&self) -> u32 {
        self.static_shared_mem
            .saturating_add(self.dynamic_shared_mem)
    }

    /// Multiplies the three components of a dimension triple, widening to
    /// `u64` first and saturating on overflow.
    fn dim_product((x, y, z): (u32, u32, u32)) -> u64 {
        u64::from(x)
            .saturating_mul(u64::from(y))
            .saturating_mul(u64::from(z))
    }
}
/// Direction of a memory copy, stored in the `kind` field of a `MemcpyRecord`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemcpyKind {
    /// Host to device.
    HostToDevice,
    /// Device to host.
    DeviceToHost,
    /// Device to device.
    DeviceToDevice,
    /// Host to host.
    HostToHost,
    /// Peer to peer.
    PeerToPeer,
}
/// Timing and size information recorded for a single memory copy.
#[derive(Debug, Clone)]
pub struct MemcpyRecord {
    /// Direction of the copy.
    pub kind: MemcpyKind,
    /// Start timestamp in nanoseconds.
    pub start_ns: u64,
    /// End timestamp in nanoseconds.
    pub end_ns: u64,
    /// Identifier of the device.
    pub device_id: u32,
    /// Identifier of the context.
    pub context_id: u32,
    /// Identifier of the stream.
    pub stream_id: u32,
    /// Number of bytes copied.
    pub bytes: u64,
    /// Source device, when one is recorded.
    pub src_device: Option<u32>,
    /// Destination device, when one is recorded.
    pub dst_device: Option<u32>,
}

impl MemcpyRecord {
    /// Returns `end_ns - start_ns` in nanoseconds.
    ///
    /// Yields `0` when `end_ns` is earlier than `start_ns`, rather than
    /// panicking in debug builds or wrapping in release builds.
    pub fn duration_ns(&self) -> u64 {
        self.end_ns.saturating_sub(self.start_ns)
    }

    /// Returns [`MemcpyRecord::duration_ns`] as a [`Duration`].
    pub fn duration(&self) -> Duration {
        Duration::from_nanos(self.duration_ns())
    }

    /// Returns the copy throughput in gigabytes per second.
    ///
    /// Despite the `gbps` suffix, the value is computed in bytes (with
    /// 1 GB = 1e9 bytes), not bits. Returns `0.0` when the duration is zero,
    /// which includes records whose end timestamp precedes the start.
    pub fn throughput_gbps(&self) -> f64 {
        let elapsed_ns = self.duration_ns();
        if elapsed_ns == 0 {
            // Avoid dividing by zero for instantaneous or malformed records.
            return 0.0;
        }
        let seconds = elapsed_ns as f64 / 1e9;
        let gigabytes = self.bytes as f64 / 1e9;
        gigabytes / seconds
    }
}
/// Timing information recorded for a single synchronization.
#[derive(Debug, Clone)]
pub struct SyncRecord {
    /// Which form of synchronization this record describes.
    pub kind: SyncKind,
    /// Start timestamp in nanoseconds.
    pub start_ns: u64,
    /// End timestamp in nanoseconds.
    pub end_ns: u64,
    /// Identifier of the device.
    pub device_id: u32,
    /// Identifier of the context.
    pub context_id: u32,
    /// Identifier of the stream, when one is recorded.
    pub stream_id: Option<u32>,
}

/// Form of synchronization stored in the `kind` field of a [`SyncRecord`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SyncKind {
    /// Device synchronization.
    Device,
    /// Stream synchronization.
    Stream,
    /// Event synchronization.
    Event,
    /// Stream wait.
    StreamWait,
}

impl SyncRecord {
    /// Returns `end_ns - start_ns` in nanoseconds.
    ///
    /// Yields `0` when `end_ns` is earlier than `start_ns`, rather than
    /// panicking in debug builds or wrapping in release builds.
    pub fn duration_ns(&self) -> u64 {
        self.end_ns.saturating_sub(self.start_ns)
    }

    /// Returns [`SyncRecord::duration_ns`] as a [`Duration`].
    pub fn duration(&self) -> Duration {
        Duration::from_nanos(self.duration_ns())
    }
}
/// Record for an activity that carries only its category, timestamps and
/// device/context identifiers.
///
/// It is wrapped by `ActivityRecord::Generic`, which reports the stored
/// `kind` as the record's category.
#[derive(Debug, Clone)]
pub struct GenericRecord {
    /// Category of the activity.
    pub kind: ActivityKind,
    /// Start timestamp in nanoseconds.
    pub start_ns: u64,
    /// End timestamp in nanoseconds.
    pub end_ns: u64,
    /// Identifier of the device.
    pub device_id: u32,
    /// Identifier of the context.
    pub context_id: u32,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the derived quantities of a hand-built kernel record.
    #[test]
    fn test_kernel_record() {
        let kernel = KernelRecord {
            name: "test_kernel".to_string(),
            start_ns: 1000,
            end_ns: 2000,
            device_id: 0,
            context_id: 1,
            stream_id: 0,
            grid_dim: (128, 1, 1),
            block_dim: (256, 1, 1),
            static_shared_mem: 1024,
            dynamic_shared_mem: 512,
            registers_per_thread: 32,
            occupancy: 0.75,
        };

        assert_eq!(kernel.duration_ns(), 1000);
        assert_eq!(kernel.total_threads(), 128 * 256);
        assert_eq!(kernel.total_shared_mem(), 1536);
    }

    /// 1e9 bytes moved in 1e6 ns (one millisecond) must come out as 1000 GB/s.
    #[test]
    fn test_memcpy_throughput() {
        let copy = MemcpyRecord {
            kind: MemcpyKind::HostToDevice,
            start_ns: 0,
            end_ns: 1_000_000,
            device_id: 0,
            context_id: 1,
            stream_id: 0,
            bytes: 1_000_000_000,
            src_device: None,
            dst_device: None,
        };

        let throughput = copy.throughput_gbps();
        assert!((throughput - 1000.0).abs() < 0.001);
    }

    /// A kernel record wrapped in `ActivityRecord` reports the kernel kind and
    /// the wrapped record's duration.
    #[test]
    fn test_activity_record_kind() {
        let inner = KernelRecord {
            name: "test".to_string(),
            start_ns: 0,
            end_ns: 100,
            device_id: 0,
            context_id: 0,
            stream_id: 0,
            grid_dim: (1, 1, 1),
            block_dim: (1, 1, 1),
            static_shared_mem: 0,
            dynamic_shared_mem: 0,
            registers_per_thread: 0,
            occupancy: 0.0,
        };
        let activity = ActivityRecord::Kernel(inner);

        assert_eq!(activity.kind(), ActivityKind::Kernel);
        assert_eq!(activity.duration(), Duration::from_nanos(100));
    }
}