use crate::{ProfileEvent, TorshResult};
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use torsh_core::TorshError;
/// Feature switches and device selection for an `NsightProfiler` session.
///
/// The `Default` implementation in this file turns every feature on, leaves
/// `output_dir` unset and selects device 0.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NsightConfig {
/// When `true`, `NsightProfiler::enable` runs the NVTX initialisation hook and
/// `start_nvtx_range` returns live ranges instead of disabled ones.
pub nvtx_enabled: bool,
/// When `true`, `NsightProfiler::enable` runs the CUDA API tracing hook.
pub cuda_api_tracing: bool,
/// When `true`, `NsightProfiler::enable` runs the kernel analysis hook.
pub kernel_analysis: bool,
/// When `true`, `NsightProfiler::enable` runs the memory analysis hook.
pub memory_analysis: bool,
/// When `true`, `NsightProfiler::enable` runs the occupancy analysis hook.
pub occupancy_analysis: bool,
/// Optional output directory.
/// NOTE(review): nothing in this file reads this field — confirm where it is consumed.
pub output_dir: Option<String>,
/// Device index; stored (cast to `usize`) as the `thread_id` of every event
/// the profiler records.
pub device_id: i32,
}
impl Default for NsightConfig {
fn default() -> Self {
Self {
nvtx_enabled: true,
cuda_api_tracing: true,
kernel_analysis: true,
memory_analysis: true,
occupancy_analysis: true,
output_dir: None,
device_id: 0,
}
}
}
/// Collects kernel-launch and memory-operation events for one profiling
/// session and can summarise or export them.
pub struct NsightProfiler {
// Feature switches and device id supplied at construction.
config: NsightConfig,
// Recorded events, behind a mutex so `&self` methods can append to them.
events: Arc<Mutex<Vec<ProfileEvent>>>,
// Timing origin: set in `new`, reset by `enable`; event `start_us` values are
// microseconds elapsed since this instant.
start_time: Instant,
// While `false`, the `record_*` methods return early without storing anything.
enabled: bool,
// Identifier of the form `nsight_session_<unix timestamp>`, generated in `new`.
session_id: String,
}
impl NsightProfiler {
pub fn new(config: NsightConfig) -> Self {
Self {
config,
events: Arc::new(Mutex::new(Vec::new())),
start_time: Instant::now(),
enabled: false,
session_id: format!("nsight_session_{}", chrono::Utc::now().timestamp()),
}
}
pub fn enable(&mut self) -> TorshResult<()> {
self.enabled = true;
self.start_time = Instant::now();
if let Ok(mut events) = self.events.lock() {
events.clear();
}
if self.config.nvtx_enabled {
self.init_nvtx()?;
}
if self.config.cuda_api_tracing {
self.init_cuda_api_tracing()?;
}
if self.config.kernel_analysis {
self.init_kernel_analysis()?;
}
if self.config.memory_analysis {
self.init_memory_analysis()?;
}
if self.config.occupancy_analysis {
self.init_occupancy_analysis()?;
}
Ok(())
}
/// Stops recording and finalises the current session.
///
/// The enabled flag is cleared first, so the profiler ends up disabled even
/// when finalisation reports an error; that error is returned to the caller.
pub fn disable(&mut self) -> TorshResult<()> {
    self.enabled = false;
    self.finalize_session()
}
/// Opens a named NVTX range.
///
/// Returns a live range stamped with the current instant when the profiler
/// is enabled and NVTX is switched on in the configuration; otherwise
/// returns a disabled range with an empty name. This never returns `Err`.
pub fn start_nvtx_range(&self, name: &str) -> TorshResult<NvtxRange> {
    let nvtx_active = self.enabled && self.config.nvtx_enabled;
    let range = if nvtx_active {
        let opened_at = Instant::now();
        NvtxRange::new(name.to_owned(), opened_at)
    } else {
        NvtxRange::new_disabled()
    };
    Ok(range)
}
/// Records one kernel launch as an `nsight_kernel` event.
///
/// The event name embeds the kernel name, grid and block dimensions, shared
/// memory size in bytes and registers per thread. `start_us` is the time
/// elapsed since the session's timing origin at the moment of recording,
/// `duration_us` is `duration` in microseconds, and `thread_id` carries the
/// configured device id. Does nothing and returns `Ok(())` while the
/// profiler is disabled.
///
/// # Errors
///
/// Forwards any error from the occupancy calculation and returns
/// `TorshError::InvalidArgument` when the event buffer's mutex is poisoned.
pub fn record_kernel_launch(
    &self,
    kernel_name: &str,
    grid_size: (u32, u32, u32),
    block_size: (u32, u32, u32),
    shared_memory: usize,
    registers_per_thread: u32,
    duration: Duration,
) -> TorshResult<()> {
    if !self.enabled {
        return Ok(());
    }

    // Everything that can fail, panic or allocate runs before the lock is
    // taken: a panic while holding the guard would poison the mutex and make
    // every later record/stats/export call fail.
    let _theoretical_occupancy =
        self.calculate_theoretical_occupancy(block_size, shared_memory, registers_per_thread)?;
    let (grid_x, grid_y, grid_z) = grid_size;
    let (block_x, block_y, block_z) = block_size;
    let event_name = format!(
        "{} [grid:({},{},{}), block:({},{},{}), shared:{}B, regs:{}]",
        kernel_name,
        grid_x,
        grid_y,
        grid_z,
        block_x,
        block_y,
        block_z,
        shared_memory,
        registers_per_thread
    );
    let duration_us = duration.as_micros() as u64;

    let mut events = self.events.lock().map_err(|_| {
        TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
    })?;
    // Timestamp under the lock so buffer order matches `start_us` order.
    let start_us = self.start_time.elapsed().as_micros() as u64;
    events.push(ProfileEvent {
        name: event_name,
        category: "nsight_kernel".to_string(),
        start_us,
        duration_us,
        thread_id: self.config.device_id as usize,
        operation_count: Some(1),
        flops: Some(0),
        bytes_transferred: Some(0),
        stack_trace: None,
    });
    Ok(())
}
/// Records one memory operation as an `nsight_memory` event.
///
/// The event name has the form `<operation> [<src>-><dst>] <size>MB`, where
/// the size is `size_bytes / 1024 / 1024`. `bytes_transferred` stores
/// `size_bytes`, `duration_us` is `duration` in microseconds, and
/// `thread_id` carries the configured device id. Does nothing and returns
/// `Ok(())` while the profiler is disabled.
///
/// # Errors
///
/// Returns `TorshError::InvalidArgument` when the event buffer's mutex is
/// poisoned.
pub fn record_memory_operation(
    &self,
    operation: &str,
    src_device: i32,
    dst_device: i32,
    size_bytes: usize,
    duration: Duration,
) -> TorshResult<()> {
    if !self.enabled {
        return Ok(());
    }

    // Build the label before locking so the mutex is held only for the push.
    let size_mb = size_bytes as f64 / 1024.0 / 1024.0;
    let event_name = format!(
        "{} [{}->{}] {}MB",
        operation, src_device, dst_device, size_mb
    );
    let duration_us = duration.as_micros() as u64;

    let mut events = self.events.lock().map_err(|_| {
        TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
    })?;
    // Timestamp under the lock so buffer order matches `start_us` order.
    let start_us = self.start_time.elapsed().as_micros() as u64;
    events.push(ProfileEvent {
        name: event_name,
        category: "nsight_memory".to_string(),
        start_us,
        duration_us,
        thread_id: self.config.device_id as usize,
        operation_count: Some(1),
        flops: Some(0),
        bytes_transferred: Some(size_bytes as u64),
        stack_trace: None,
    });
    Ok(())
}
/// Writes the session's events and summary fields to `filename` as
/// pretty-printed JSON.
///
/// The events are copied out under the lock and the lock is released before
/// serialization and file I/O, so recording is not blocked while the file
/// is being written.
///
/// # Errors
///
/// Returns `TorshError::InvalidArgument` when the event buffer's mutex is
/// poisoned, when serialization fails, or when the file cannot be written.
pub fn export_nsight_data(&self, filename: &str) -> TorshResult<()> {
    // The guard is a temporary of this statement, so the mutex is unlocked
    // again as soon as the snapshot has been taken.
    let events = self
        .events
        .lock()
        .map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?
        .clone();

    let nsight_data = NsightExportData {
        session_id: self.session_id.clone(),
        config: self.config.clone(),
        total_events: events.len(),
        total_duration_us: events.iter().map(|e| e.duration_us).sum(),
        timestamp: chrono::Utc::now(),
        // Moved in last: the summary fields above still borrow the snapshot.
        events,
    };

    let json_data = serde_json::to_string_pretty(&nsight_data)
        .map_err(|e| TorshError::InvalidArgument(format!("Failed to serialize data: {e}")))?;
    std::fs::write(filename, json_data)
        .map_err(|e| TorshError::InvalidArgument(format!("Failed to write file: {e}")))?;
    Ok(())
}
/// Summarises the recorded events.
///
/// Events are split by category (`nsight_kernel` / `nsight_memory`); any
/// other category counts only towards `total_events`.
///
/// * `avg_kernel_duration_us` is the mean kernel duration, or `0.0` when no
///   kernel event was recorded.
/// * `avg_memory_bandwidth_gbps` is the aggregate throughput of all memory
///   events — total bytes over total transfer time — in GiB
///   (1024³ bytes) per second, or `0.0` when no transfer time was recorded.
///
/// # Errors
///
/// Returns `TorshError::InvalidArgument` when the event buffer's mutex is
/// poisoned.
pub fn get_nsight_stats(&self) -> TorshResult<NsightStats> {
    const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;
    const MICROS_PER_SECOND: f64 = 1_000_000.0;

    let events = self.events.lock().map_err(|_| {
        TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
    })?;

    // One pass over the buffer; no intermediate per-category vectors.
    let mut kernel_events = 0usize;
    let mut memory_events = 0usize;
    let mut total_kernel_time_us = 0u64;
    let mut total_memory_time_us = 0u64;
    let mut total_bytes_transferred = 0usize;
    for event in events.iter() {
        if event.category == "nsight_kernel" {
            kernel_events += 1;
            total_kernel_time_us += event.duration_us;
        } else if event.category == "nsight_memory" {
            memory_events += 1;
            total_memory_time_us += event.duration_us;
            total_bytes_transferred += event.bytes_transferred.unwrap_or(0) as usize;
        }
    }

    let avg_kernel_duration_us = if kernel_events > 0 {
        total_kernel_time_us as f64 / kernel_events as f64
    } else {
        0.0
    };

    // Total bytes over total time is already the (time-weighted) average
    // bandwidth; dividing it by the event count again would under-report the
    // figure by a factor of N.
    let avg_memory_bandwidth_gbps = if total_memory_time_us > 0 {
        (total_bytes_transferred as f64 / BYTES_PER_GIB)
            / (total_memory_time_us as f64 / MICROS_PER_SECOND)
    } else {
        0.0
    };

    Ok(NsightStats {
        total_events: events.len(),
        kernel_events,
        memory_events,
        total_kernel_time_us,
        total_memory_time_us,
        total_bytes_transferred,
        avg_kernel_duration_us,
        avg_memory_bandwidth_gbps,
        session_id: self.session_id.clone(),
    })
}
/// Placeholder hook run by `enable` when `config.nvtx_enabled` is set.
/// Currently does nothing and always returns `Ok(())`.
fn init_nvtx(&self) -> TorshResult<()> {
Ok(())
}
/// Placeholder hook run by `enable` when `config.cuda_api_tracing` is set.
/// Currently does nothing and always returns `Ok(())`.
fn init_cuda_api_tracing(&self) -> TorshResult<()> {
Ok(())
}
/// Placeholder hook run by `enable` when `config.kernel_analysis` is set.
/// Currently does nothing and always returns `Ok(())`.
fn init_kernel_analysis(&self) -> TorshResult<()> {
Ok(())
}
/// Placeholder hook run by `enable` when `config.memory_analysis` is set.
/// Currently does nothing and always returns `Ok(())`.
fn init_memory_analysis(&self) -> TorshResult<()> {
Ok(())
}
/// Placeholder hook run by `enable` when `config.occupancy_analysis` is set.
/// Currently does nothing and always returns `Ok(())`.
fn init_occupancy_analysis(&self) -> TorshResult<()> {
Ok(())
}
/// Placeholder hook run by `disable` after the enabled flag has been cleared.
/// Currently does nothing and always returns `Ok(())`.
fn finalize_session(&self) -> TorshResult<()> {
Ok(())
}
/// Estimates the fraction of a multiprocessor's thread slots that a kernel
/// with the given launch shape can keep resident, as a value in `0.0..=1.0`.
///
/// The number of resident blocks is the smallest of four limits: thread
/// slots, shared memory, registers, and the hard block cap. The result is
/// `resident_blocks * threads_per_block / max_threads`.
///
/// A launch shape that can never be resident — a block dimension of zero,
/// or more threads per block than there are thread slots — yields `0.0`
/// instead of dividing by zero.
///
/// # Errors
///
/// Never fails at present; the `Result` is kept for the callers' `?`.
fn calculate_theoretical_occupancy(
    &self,
    block_size: (u32, u32, u32),
    shared_memory: usize,
    registers_per_thread: u32,
) -> TorshResult<f64> {
    // NOTE(review): hard-coded limits, presumably per-SM figures of one
    // particular GPU generation — confirm, or query the device instead.
    const MAX_THREADS_PER_SM: u64 = 2048;
    const MAX_BLOCKS_PER_SM: u64 = 32;
    const SHARED_MEMORY_BUDGET: u64 = 48 * 1024;
    const REGISTER_BUDGET: u64 = 65536;

    // Widen before multiplying: the product of three u32 values does not fit
    // in u32 (nor always in u64, hence the saturating steps).
    let threads_per_block = u64::from(block_size.0)
        .saturating_mul(u64::from(block_size.1))
        .saturating_mul(u64::from(block_size.2));
    if threads_per_block == 0 || threads_per_block > MAX_THREADS_PER_SM {
        return Ok(0.0);
    }

    let max_blocks_by_threads = MAX_THREADS_PER_SM / threads_per_block;
    let max_blocks_by_shared_memory = if shared_memory > 0 {
        // Lossless widening; a `u32` cast here could truncate to zero and
        // turn the division into a panic.
        SHARED_MEMORY_BUDGET / u64::try_from(shared_memory).unwrap_or(u64::MAX)
    } else {
        MAX_BLOCKS_PER_SM
    };
    let max_blocks_by_registers = if registers_per_thread > 0 {
        (REGISTER_BUDGET / u64::from(registers_per_thread)) / threads_per_block
    } else {
        MAX_BLOCKS_PER_SM
    };

    let max_blocks = max_blocks_by_threads
        .min(max_blocks_by_shared_memory)
        .min(max_blocks_by_registers)
        .min(MAX_BLOCKS_PER_SM);

    // `max_blocks <= MAX_THREADS_PER_SM / threads_per_block`, so the product
    // cannot exceed MAX_THREADS_PER_SM and the ratio stays within 0..=1.
    Ok((max_blocks * threads_per_block) as f64 / MAX_THREADS_PER_SM as f64)
}
}
/// A named timing range, created either live (`new`) or disabled
/// (`new_disabled`).
pub struct NvtxRange {
    // Label given at creation; empty for disabled ranges.
    name: String,
    // Instant the range was opened; `duration` measures from here.
    start_time: Instant,
    // `true` for ranges built by `new`, `false` for `new_disabled`.
    enabled: bool,
}

impl NvtxRange {
    /// Builds a live range called `name` that started at `start_time`.
    fn new(name: String, start_time: Instant) -> Self {
        NvtxRange {
            enabled: true,
            name,
            start_time,
        }
    }

    /// Builds a disabled range: empty name, start time set to now.
    fn new_disabled() -> Self {
        let created_at = Instant::now();
        NvtxRange {
            enabled: false,
            name: String::new(),
            start_time: created_at,
        }
    }

    /// Time elapsed since the range's start instant (also reported for
    /// disabled ranges, measured from their creation).
    pub fn duration(&self) -> Duration {
        Instant::now().duration_since(self.start_time)
    }

    /// The label the range was created with.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }
}

impl Drop for NvtxRange {
    /// Closes the range. Nothing is released or recorded at present.
    fn drop(&mut self) {
        if self.enabled {
            // Intentionally empty: placeholder for closing a live range.
        }
    }
}
/// JSON payload written by `NsightProfiler::export_nsight_data`.
#[derive(Debug, Serialize, Deserialize)]
pub struct NsightExportData {
/// Session id of the profiler that produced the export.
pub session_id: String,
/// Copy of the profiler's configuration.
pub config: NsightConfig,
/// Copy of every event in the profiler's buffer at export time.
pub events: Vec<ProfileEvent>,
/// Number of entries in `events`.
pub total_events: usize,
/// Sum of `duration_us` over all exported events.
pub total_duration_us: u64,
/// UTC time at which the export was assembled.
pub timestamp: chrono::DateTime<chrono::Utc>,
}
/// Summary of a session's events, produced by `NsightProfiler::get_nsight_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NsightStats {
/// Number of events in the buffer, whatever their category.
pub total_events: usize,
/// Number of events with category `nsight_kernel`.
pub kernel_events: usize,
/// Number of events with category `nsight_memory`.
pub memory_events: usize,
/// Sum of `duration_us` over kernel events.
pub total_kernel_time_us: u64,
/// Sum of `duration_us` over memory events.
pub total_memory_time_us: u64,
/// Sum of `bytes_transferred` over memory events (missing values count as 0).
pub total_bytes_transferred: usize,
/// Mean kernel duration in microseconds; `0.0` when there are no kernel events.
pub avg_kernel_duration_us: f64,
/// Memory bandwidth figure in GiB (1024³ bytes) per second as computed by
/// `get_nsight_stats`; `0.0` when no memory transfer time was recorded.
pub avg_memory_bandwidth_gbps: f64,
/// Session id of the profiler the statistics were taken from.
pub session_id: String,
}
/// Creates a disabled profiler that uses `NsightConfig::default()`.
pub fn create_nsight_profiler() -> NsightProfiler {
    let default_config = NsightConfig::default();
    NsightProfiler::new(default_config)
}
/// Creates a disabled profiler that uses the supplied `config`; a free-function
/// alias for `NsightProfiler::new`.
pub fn create_nsight_profiler_with_config(config: NsightConfig) -> NsightProfiler {
NsightProfiler::new(config)
}
pub fn export_nsight_json(profiler: &NsightProfiler, filename: &str) -> TorshResult<()> {
profiler.export_nsight_data(filename)
}
pub fn get_nsight_statistics(profiler: &NsightProfiler) -> TorshResult<NsightStats> {
profiler.get_nsight_stats()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Builds a default profiler that is already recording.
    fn enabled_profiler() -> NsightProfiler {
        let mut profiler = create_nsight_profiler();
        profiler.enable().unwrap();
        profiler
    }

    /// Records the reference kernel launch (256-thread block, 100 µs) used
    /// by several tests.
    fn record_test_kernel(profiler: &NsightProfiler) -> TorshResult<()> {
        profiler.record_kernel_launch(
            "test_kernel",
            (1, 1, 1),
            (256, 1, 1),
            1024,
            32,
            Duration::from_micros(100),
        )
    }

    #[test]
    fn test_nsight_profiler_creation() {
        let profiler = create_nsight_profiler();
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_nsight_profiler_enable_disable() {
        let mut profiler = create_nsight_profiler();
        assert!(profiler.enable().is_ok());
        assert!(profiler.enabled);
        assert!(profiler.disable().is_ok());
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_disabled_profiler_records_nothing() {
        // A profiler that was never enabled accepts the call but stores nothing.
        let profiler = create_nsight_profiler();
        assert!(record_test_kernel(&profiler).is_ok());
        let stats = profiler.get_nsight_stats().unwrap();
        assert_eq!(stats.total_events, 0);
    }

    #[test]
    fn test_nvtx_range() {
        let profiler = enabled_profiler();
        let range = profiler.start_nvtx_range("test_range").unwrap();
        assert_eq!(range.name(), "test_range");
        std::thread::sleep(std::time::Duration::from_millis(1));
        assert!(range.duration().as_nanos() > 0);
    }

    #[test]
    fn test_kernel_recording() {
        let profiler = enabled_profiler();
        assert!(record_test_kernel(&profiler).is_ok());
        let stats = profiler.get_nsight_stats().unwrap();
        assert_eq!(stats.kernel_events, 1);
        assert_eq!(stats.total_kernel_time_us, 100);
    }

    #[test]
    fn test_memory_recording() {
        let profiler = enabled_profiler();
        let result = profiler.record_memory_operation(
            "cudaMemcpy",
            0,
            0,
            1024 * 1024,
            Duration::from_micros(50),
        );
        assert!(result.is_ok());
        let stats = profiler.get_nsight_stats().unwrap();
        assert_eq!(stats.memory_events, 1);
        assert_eq!(stats.total_memory_time_us, 50);
        assert_eq!(stats.total_bytes_transferred, 1024 * 1024);
    }

    #[test]
    fn test_theoretical_occupancy_calculation() {
        let profiler = create_nsight_profiler();
        let occupancy = profiler
            .calculate_theoretical_occupancy((256, 1, 1), 1024, 32)
            .unwrap();
        assert!(occupancy > 0.0);
        assert!(occupancy <= 1.0);
    }

    #[test]
    fn test_export_nsight_data() {
        let profiler = enabled_profiler();
        record_test_kernel(&profiler).unwrap();

        // Process-unique name so concurrent test runs sharing the temp
        // directory cannot clobber each other's file.
        let temp_file = std::env::temp_dir()
            .join(format!("test_nsight_export_{}.json", std::process::id()));
        let temp_str = temp_file.display().to_string();
        let result = profiler.export_nsight_data(&temp_str);
        let written_bytes = std::fs::metadata(&temp_file).map(|m| m.len()).unwrap_or(0);
        // Clean up before asserting so a failure does not leave the file behind.
        let _ = std::fs::remove_file(&temp_file);

        assert!(result.is_ok());
        assert!(written_bytes > 0);
    }
}