use crate::{ProfileEvent, TorshError, TorshResult};
use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
pub struct AMDProfiler {
rocm_profiler: Option<ROCmProfiler>,
codexl_profiler: Option<CodeXLProfiler>,
uprof_profiler: Option<UProfProfiler>,
roctracer: Option<ROCTracer>,
enabled: bool,
config: AMDProfilerConfig,
events: Arc<Mutex<Vec<ProfileEvent>>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AMDProfilerConfig {
pub enable_rocm: bool,
pub enable_codexl: bool,
pub enable_uprof: bool,
pub enable_roctracer: bool,
pub gpu_device_id: u32,
pub cpu_sample_frequency: u32,
pub enable_kernel_profiling: bool,
pub enable_memory_profiling: bool,
pub enable_power_profiling: bool,
}
pub struct ROCmProfiler {
device_id: u32,
enabled: bool,
kernel_launches: Vec<ROCmKernelLaunch>,
memory_operations: Vec<ROCmMemoryOperation>,
hip_api_calls: Vec<HIPAPICall>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ROCmKernelLaunch {
pub kernel_name: String,
pub grid_size: (u32, u32, u32),
pub block_size: (u32, u32, u32),
pub shared_memory_bytes: u32,
pub registers_per_thread: u32,
pub start_time_us: u64,
pub duration_us: u64,
pub occupancy: f32,
pub wave_count: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ROCmMemoryOperation {
pub operation_type: String, pub size_bytes: usize,
pub start_time_us: u64,
pub duration_us: u64,
pub bandwidth_gbps: f64,
pub source_location: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HIPAPICall {
pub function_name: String,
pub start_time_us: u64,
pub duration_us: u64,
pub return_code: i32,
pub parameters: HashMap<String, String>,
}
pub struct CodeXLProfiler {
enabled: bool,
gpu_counters: Vec<GPUCounter>,
cpu_samples: Vec<CPUSample>,
power_samples: Vec<AMDPowerSample>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GPUCounter {
pub counter_name: String,
pub value: u64,
pub timestamp_us: u64,
pub unit: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CPUSample {
pub thread_id: u64,
pub instruction_pointer: u64,
pub timestamp_us: u64,
pub cpu_id: u32,
pub call_stack: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AMDPowerSample {
pub component: String, pub power_watts: f64,
pub temperature_celsius: f64,
pub voltage_volts: f64,
pub frequency_mhz: f64,
pub timestamp_us: u64,
}
pub struct UProfProfiler {
enabled: bool,
sample_frequency: u32,
hotspots: Vec<CPUHotspot>,
cache_analysis: CacheAnalysis,
branch_analysis: BranchAnalysis,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CPUHotspot {
pub function_name: String,
pub module_name: String,
pub source_file: Option<String>,
pub line_number: Option<u32>,
pub exclusive_time_ms: f64,
pub inclusive_time_ms: f64,
pub sample_count: u64,
pub cpu_cycles: u64,
pub instructions: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheAnalysis {
pub l1_data_cache_misses: u64,
pub l1_instruction_cache_misses: u64,
pub l2_cache_misses: u64,
pub l3_cache_misses: u64,
pub cache_miss_rate: f64,
pub memory_stall_cycles: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BranchAnalysis {
pub total_branches: u64,
pub branch_mispredictions: u64,
pub branch_misprediction_rate: f64,
pub indirect_branches: u64,
pub conditional_branches: u64,
}
pub struct ROCTracer {
enabled: bool,
hip_traces: Vec<HIPTrace>,
hsa_traces: Vec<HSATrace>,
kernel_traces: Vec<KernelTrace>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HIPTrace {
pub api_name: String,
pub start_time_us: u64,
pub end_time_us: u64,
pub thread_id: u64,
pub correlation_id: u64,
pub arguments: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HSATrace {
pub api_name: String,
pub start_time_us: u64,
pub end_time_us: u64,
pub agent_handle: u64,
pub queue_handle: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelTrace {
pub kernel_name: String,
pub start_time_us: u64,
pub end_time_us: u64,
pub grid_size: (u32, u32, u32),
pub workgroup_size: (u32, u32, u32),
pub correlation_id: u64,
pub queue_index: u32,
}
impl Default for AMDProfilerConfig {
fn default() -> Self {
Self {
enable_rocm: true,
enable_codexl: false,
enable_uprof: false,
enable_roctracer: false,
gpu_device_id: 0,
cpu_sample_frequency: 1000,
enable_kernel_profiling: true,
enable_memory_profiling: true,
enable_power_profiling: false,
}
}
}
impl Default for AMDProfiler {
fn default() -> Self {
Self::new()
}
}
impl AMDProfiler {
pub fn new() -> Self {
Self {
rocm_profiler: None,
codexl_profiler: None,
uprof_profiler: None,
roctracer: None,
enabled: false,
config: AMDProfilerConfig::default(),
events: Arc::new(Mutex::new(Vec::new())),
}
}
pub fn with_config(config: AMDProfilerConfig) -> Self {
let mut profiler = Self::new();
profiler.config = config;
profiler
}
pub fn enable(&mut self) -> TorshResult<()> {
if self.config.enable_rocm {
self.rocm_profiler = Some(ROCmProfiler::new(self.config.gpu_device_id)?);
}
if self.config.enable_codexl {
self.codexl_profiler = Some(CodeXLProfiler::new()?);
}
if self.config.enable_uprof {
self.uprof_profiler = Some(UProfProfiler::new(self.config.cpu_sample_frequency)?);
}
if self.config.enable_roctracer {
self.roctracer = Some(ROCTracer::new()?);
}
self.enabled = true;
Ok(())
}
pub fn disable(&mut self) {
self.enabled = false;
self.rocm_profiler = None;
self.codexl_profiler = None;
self.uprof_profiler = None;
self.roctracer = None;
}
pub fn record_hip_kernel_launch(
&mut self,
kernel_name: &str,
grid_size: (u32, u32, u32),
block_size: (u32, u32, u32),
shared_memory: u32,
duration: Duration,
) -> TorshResult<()> {
if !self.enabled {
return Ok(());
}
if let Some(ref mut rocm) = self.rocm_profiler {
rocm.record_kernel_launch(kernel_name, grid_size, block_size, shared_memory, duration)?;
}
let mut events = self.events.lock();
events.push(ProfileEvent {
name: format!("HIP::{kernel_name}"),
category: "hip_kernel".to_string(),
start_us: 0, duration_us: duration.as_micros() as u64,
thread_id: 0,
operation_count: Some(1),
flops: Some(0), bytes_transferred: Some(0), stack_trace: None,
});
Ok(())
}
pub fn record_hip_memory_operation(
&mut self,
operation_type: &str,
size_bytes: usize,
duration: Duration,
) -> TorshResult<()> {
if !self.enabled {
return Ok(());
}
if let Some(ref mut rocm) = self.rocm_profiler {
rocm.record_memory_operation(operation_type, size_bytes, duration)?;
}
let _bandwidth_gbps =
(size_bytes as f64) / (1024.0 * 1024.0 * 1024.0) / duration.as_secs_f64();
let mut events = self.events.lock();
events.push(ProfileEvent {
name: format!("HIP::Memory::{operation_type}"),
category: "hip_memory".to_string(),
start_us: 0,
duration_us: duration.as_micros() as u64,
thread_id: 0,
operation_count: Some(1),
flops: Some(0),
bytes_transferred: Some(size_bytes as u64),
stack_trace: None,
});
Ok(())
}
pub fn start_cpu_profiling(&mut self) -> TorshResult<()> {
if let Some(ref mut uprof) = self.uprof_profiler {
uprof.start_profiling()?;
}
Ok(())
}
pub fn stop_cpu_profiling(&mut self) -> TorshResult<Vec<CPUHotspot>> {
if let Some(ref mut uprof) = self.uprof_profiler {
return uprof.stop_profiling();
}
Ok(Vec::new())
}
pub fn get_rocm_stats(&self) -> Option<ROCmStats> {
self.rocm_profiler.as_ref().map(|rocm| rocm.get_stats())
}
pub fn get_events(&self) -> Vec<ProfileEvent> {
self.events.lock().clone()
}
pub fn export_data(&self, filename: &str) -> TorshResult<()> {
let data = AMDProfilingData {
rocm_kernels: self
.rocm_profiler
.as_ref()
.map(|r| r.kernel_launches.clone())
.unwrap_or_default(),
rocm_memory_ops: self
.rocm_profiler
.as_ref()
.map(|r| r.memory_operations.clone())
.unwrap_or_default(),
cpu_hotspots: self
.uprof_profiler
.as_ref()
.map(|u| u.hotspots.clone())
.unwrap_or_default(),
gpu_counters: self
.codexl_profiler
.as_ref()
.map(|c| c.gpu_counters.clone())
.unwrap_or_default(),
power_samples: self
.codexl_profiler
.as_ref()
.map(|c| c.power_samples.clone())
.unwrap_or_default(),
};
let json = serde_json::to_string_pretty(&data)
.map_err(|e| TorshError::SerializationError(e.to_string()))?;
std::fs::write(filename, json).map_err(|e| TorshError::IoError(e.to_string()))?;
Ok(())
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ROCmStats {
pub total_kernel_launches: usize,
pub total_kernel_time_us: u64,
pub average_kernel_time_us: f64,
pub total_memory_operations: usize,
pub total_bytes_transferred: u64,
pub average_bandwidth_gbps: f64,
pub peak_memory_bandwidth_gbps: f64,
pub gpu_utilization_percent: f64,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct AMDProfilingData {
pub rocm_kernels: Vec<ROCmKernelLaunch>,
pub rocm_memory_ops: Vec<ROCmMemoryOperation>,
pub cpu_hotspots: Vec<CPUHotspot>,
pub gpu_counters: Vec<GPUCounter>,
pub power_samples: Vec<AMDPowerSample>,
}
impl ROCmProfiler {
fn new(device_id: u32) -> TorshResult<Self> {
Ok(Self {
device_id,
enabled: true,
kernel_launches: Vec::new(),
memory_operations: Vec::new(),
hip_api_calls: Vec::new(),
})
}
fn record_kernel_launch(
&mut self,
kernel_name: &str,
grid_size: (u32, u32, u32),
block_size: (u32, u32, u32),
shared_memory: u32,
duration: Duration,
) -> TorshResult<()> {
let launch = ROCmKernelLaunch {
kernel_name: kernel_name.to_string(),
grid_size,
block_size,
shared_memory_bytes: shared_memory,
registers_per_thread: 64, start_time_us: 0, duration_us: duration.as_micros() as u64,
occupancy: self.calculate_occupancy(grid_size, block_size),
wave_count: self.calculate_wave_count(grid_size, block_size),
};
self.kernel_launches.push(launch);
Ok(())
}
fn record_memory_operation(
&mut self,
operation_type: &str,
size_bytes: usize,
duration: Duration,
) -> TorshResult<()> {
let bandwidth_gbps =
(size_bytes as f64) / (1024.0 * 1024.0 * 1024.0) / duration.as_secs_f64();
let operation = ROCmMemoryOperation {
operation_type: operation_type.to_string(),
size_bytes,
start_time_us: 0, duration_us: duration.as_micros() as u64,
bandwidth_gbps,
source_location: None, };
self.memory_operations.push(operation);
Ok(())
}
fn calculate_occupancy(&self, _grid_size: (u32, u32, u32), block_size: (u32, u32, u32)) -> f32 {
let threads_per_block = block_size.0 * block_size.1 * block_size.2;
let max_threads_per_cu = 2560; (threads_per_block as f32 / max_threads_per_cu as f32).min(1.0)
}
fn calculate_wave_count(&self, grid_size: (u32, u32, u32), block_size: (u32, u32, u32)) -> u32 {
let total_threads =
grid_size.0 * grid_size.1 * grid_size.2 * block_size.0 * block_size.1 * block_size.2;
total_threads.div_ceil(64) }
fn get_stats(&self) -> ROCmStats {
let total_kernel_time: u64 = self.kernel_launches.iter().map(|k| k.duration_us).sum();
let total_bytes: u64 = self
.memory_operations
.iter()
.map(|m| m.size_bytes as u64)
.sum();
let total_memory_time: f64 = self
.memory_operations
.iter()
.map(|m| m.duration_us as f64 / 1_000_000.0)
.sum();
ROCmStats {
total_kernel_launches: self.kernel_launches.len(),
total_kernel_time_us: total_kernel_time,
average_kernel_time_us: if !self.kernel_launches.is_empty() {
total_kernel_time as f64 / self.kernel_launches.len() as f64
} else {
0.0
},
total_memory_operations: self.memory_operations.len(),
total_bytes_transferred: total_bytes,
average_bandwidth_gbps: if total_memory_time > 0.0 {
(total_bytes as f64) / (1024.0 * 1024.0 * 1024.0) / total_memory_time
} else {
0.0
},
peak_memory_bandwidth_gbps: self
.memory_operations
.iter()
.map(|m| m.bandwidth_gbps)
.fold(0.0, f64::max),
gpu_utilization_percent: 85.0, }
}
}
impl CodeXLProfiler {
fn new() -> TorshResult<Self> {
Ok(Self {
enabled: true,
gpu_counters: Vec::new(),
cpu_samples: Vec::new(),
power_samples: Vec::new(),
})
}
}
impl UProfProfiler {
fn new(sample_frequency: u32) -> TorshResult<Self> {
Ok(Self {
enabled: true,
sample_frequency,
hotspots: Vec::new(),
cache_analysis: CacheAnalysis {
l1_data_cache_misses: 0,
l1_instruction_cache_misses: 0,
l2_cache_misses: 0,
l3_cache_misses: 0,
cache_miss_rate: 0.0,
memory_stall_cycles: 0,
},
branch_analysis: BranchAnalysis {
total_branches: 0,
branch_mispredictions: 0,
branch_misprediction_rate: 0.0,
indirect_branches: 0,
conditional_branches: 0,
},
})
}
fn start_profiling(&mut self) -> TorshResult<()> {
Ok(())
}
fn stop_profiling(&mut self) -> TorshResult<Vec<CPUHotspot>> {
Ok(self.hotspots.clone())
}
}
impl ROCTracer {
fn new() -> TorshResult<Self> {
Ok(Self {
enabled: true,
hip_traces: Vec::new(),
hsa_traces: Vec::new(),
kernel_traces: Vec::new(),
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_amd_profiler_creation() {
let profiler = AMDProfiler::new();
assert!(!profiler.enabled);
}
#[test]
fn test_amd_profiler_with_config() {
let config = AMDProfilerConfig {
enable_rocm: true,
enable_uprof: true,
gpu_device_id: 1,
cpu_sample_frequency: 2000,
..Default::default()
};
let profiler = AMDProfiler::with_config(config.clone());
assert_eq!(profiler.config.gpu_device_id, 1);
assert_eq!(profiler.config.cpu_sample_frequency, 2000);
}
#[test]
fn test_rocm_occupancy_calculation() -> TorshResult<()> {
let rocm = ROCmProfiler::new(0)?;
let occupancy = rocm.calculate_occupancy((32, 32, 1), (16, 16, 1));
assert!(occupancy > 0.0 && occupancy <= 1.0);
Ok(())
}
#[test]
fn test_wave_count_calculation() -> TorshResult<()> {
let rocm = ROCmProfiler::new(0)?;
let wave_count = rocm.calculate_wave_count((1, 1, 1), (64, 1, 1));
assert_eq!(wave_count, 1);
let wave_count = rocm.calculate_wave_count((1, 1, 1), (128, 1, 1));
assert_eq!(wave_count, 2); Ok(())
}
}