use crate::analysis::roofline::{Bound, MemoryLevel, Precision, RooflineModel};
use crate::metrics::catalog::*;
use crate::profilers::system;
use anyhow::{Context, Result};
use std::collections::HashMap;
use std::path::PathBuf;
use std::process::Command;
#[derive(Debug, Clone, Copy)]
pub enum NcuSection {
LaunchStats,
ComputeThroughput,
MemoryThroughput,
Occupancy,
Roofline,
WarpState,
}
impl NcuSection {
fn as_ncu_arg(&self) -> &str {
match self {
NcuSection::LaunchStats => "LaunchStats",
NcuSection::ComputeThroughput => "ComputeWorkloadAnalysis",
NcuSection::MemoryThroughput => "MemoryWorkloadAnalysis",
NcuSection::Occupancy => "Occupancy",
NcuSection::Roofline => "SpeedOfLight",
NcuSection::WarpState => "WarpStateStats",
}
}
}
pub struct NcuProfiler {
pub ncu_path: PathBuf,
pub sections: Vec<NcuSection>,
}
impl NcuProfiler {
pub fn detect() -> Option<Self> {
which::which("ncu").ok().map(|path| Self {
ncu_path: path,
sections: vec![
NcuSection::LaunchStats,
NcuSection::Roofline,
NcuSection::ComputeThroughput,
NcuSection::MemoryThroughput,
NcuSection::Occupancy,
],
})
}
pub fn profile(
&self,
binary: &str,
binary_args: &[&str],
kernel_regex: &str,
) -> Result<HashMap<String, String>> {
let mut cmd = Command::new(&self.ncu_path);
cmd.arg("--target-processes").arg("all");
cmd.arg("--kernel-name-base").arg("demangled");
if !kernel_regex.is_empty() {
cmd.arg("--kernel-id")
.arg(format!("::regex:{kernel_regex}:"));
}
for section in &self.sections {
cmd.arg("--section").arg(section.as_ncu_arg());
}
cmd.arg("--csv");
cmd.arg("--log-file").arg("/dev/null");
cmd.arg(binary);
cmd.args(binary_args);
let output = cmd
.output()
.with_context(|| format!("Failed to run ncu: {}", self.ncu_path.display()))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
if stderr.contains("permission") || stderr.contains("ERR_NVGPUCTRPERM") {
anyhow::bail!(
"ncu requires elevated permissions. Run with:\n \
sudo cgp profile kernel ...\n \
or set: sudo sysctl kernel.perf_event_paranoid=2"
);
}
anyhow::bail!("ncu failed (exit {}): {}", output.status, stderr.trim());
}
let stdout = String::from_utf8_lossy(&output.stdout);
parse_ncu_csv(&stdout)
}
}
pub fn parse_ncu_csv(csv: &str) -> Result<HashMap<String, String>> {
let mut metrics = HashMap::new();
for line in csv.lines() {
if line.starts_with('"') && !line.starts_with("\"ID\"") {
let fields: Vec<&str> = line.split(',').collect();
if fields.len() >= 4 {
let name = fields[1].trim_matches('"').to_string();
let value = fields[3].trim_matches('"').to_string();
metrics.insert(name, value);
}
}
}
Ok(metrics)
}
fn get_f64(metrics: &HashMap<String, String>, key: &str) -> f64 {
metrics
.get(key)
.and_then(|v| v.replace(',', "").parse::<f64>().ok())
.unwrap_or(0.0)
}
#[allow(dead_code)]
fn get_u64(metrics: &HashMap<String, String>, key: &str) -> u64 {
metrics
.get(key)
.and_then(|v| v.replace(',', "").parse::<u64>().ok())
.unwrap_or(0)
}
fn get_u32(metrics: &HashMap<String, String>, key: &str) -> u32 {
metrics
.get(key)
.and_then(|v| v.replace(',', "").parse::<u32>().ok())
.unwrap_or(0)
}
#[allow(clippy::implicit_hasher)]
pub fn ncu_metrics_to_profile(
metrics: &HashMap<String, String>,
kernel_name: &str,
size: u32,
) -> FullProfile {
let duration_us = get_f64(metrics, "gpu__time_duration.sum") / 1000.0; let flops = 2.0 * (size as f64).powi(3); let tflops = if duration_us > 0.0 {
flops / (duration_us * 1e-6) / 1e12
} else {
0.0
};
let sm_pct = get_f64(metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed");
let dram_pct = get_f64(
metrics,
"dram__throughput.avg.pct_of_peak_sustained_elapsed",
);
let occupancy_pct = get_f64(
metrics,
"sm__warps_active.avg.pct_of_peak_sustained_elapsed",
);
let warp_eff = get_f64(metrics, "smsp__thread_inst_executed_per_inst_executed.pct");
let tc_pct = get_f64(
metrics,
"sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
);
let regs = get_u32(metrics, "launch__registers_per_thread");
let smem = get_u32(metrics, "launch__shared_mem_per_block_driver");
let l2_hit = get_f64(metrics, "lts__t_sector_hit_rate.pct");
let global_load_eff = get_f64(
metrics,
"smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
);
let model = RooflineModel::rtx_4090();
let ai = if duration_us > 0.0 {
let dram_bw = 1008.0e9; let actual_bw = dram_bw * dram_pct / 100.0;
let bytes = actual_bw * duration_us * 1e-6;
if bytes > 0.0 {
flops / bytes
} else {
0.0
}
} else {
0.0
};
let roofline = model
.classify(ai, tflops * 1e12, Precision::Fp16, MemoryLevel::Dram)
.map(|point| {
let bound_str = match &point.bound {
Bound::Memory { .. } => "memory".to_string(),
Bound::Compute { .. } => "compute".to_string(),
};
RooflineMetrics {
peak_compute_tflops: 330.0,
peak_bandwidth_gbps: 1008.0,
ridge_point: 327.4,
bound: bound_str,
efficiency_pct: point.efficiency,
distance_to_ridge: point.distance_to_ridge,
}
});
let timestamp = chrono::Utc::now().to_rfc3339();
FullProfile {
version: "2.0".to_string(),
timestamp,
hardware: HardwareInfo {
gpu: Some("NVIDIA GeForce RTX 4090".to_string()),
gpu_sm: Some("8.9".to_string()),
gpu_memory_gb: Some(24.0),
gpu_bandwidth_gbps: Some(1008.0),
..Default::default()
},
kernel: Some(KernelInfo {
name: kernel_name.to_string(),
dimensions: vec![size, size, size],
shared_memory_bytes: Some(smem),
registers_per_thread: Some(regs),
..Default::default()
}),
timing: TimingMetrics {
wall_clock_time_us: duration_us,
samples: 1,
..Default::default()
},
throughput: ThroughputMetrics {
tflops,
bandwidth_gbps: 1008.0 * dram_pct / 100.0,
arithmetic_intensity: ai,
..Default::default()
},
roofline,
gpu_compute: Some(GpuComputeMetrics {
sm_utilization_pct: sm_pct,
achieved_occupancy_pct: occupancy_pct,
warp_execution_efficiency_pct: warp_eff,
tensor_core_utilization_pct: tc_pct,
register_usage_per_thread: regs,
shared_memory_per_block: smem,
..Default::default()
}),
gpu_memory: Some(GpuMemoryMetrics {
dram_throughput_pct: dram_pct,
l2_hit_rate_pct: l2_hit,
global_load_efficiency_pct: global_load_eff,
..Default::default()
}),
system_health: system::collect_system_health(),
vram: system::collect_vram(),
energy: system::collect_system_health()
.and_then(|h| system::compute_energy(h.gpu_power_watts, tflops, duration_us)),
..Default::default()
}
}
fn render_profile(profile: &FullProfile) {
render_profile_header(profile);
render_profile_execution(profile);
render_profile_roofline(profile);
render_profile_compute(profile);
render_profile_memory(profile);
render_profile_vram(profile);
render_profile_system_health(profile);
render_profile_energy(profile);
println!();
}
fn render_profile_header(profile: &FullProfile) {
let kernel = profile.kernel.as_ref().map_or("unknown", |k| &k.name);
let dims = profile
.kernel
.as_ref()
.map(|k| {
k.dimensions
.iter()
.map(|d| d.to_string())
.collect::<Vec<_>>()
.join("x")
})
.unwrap_or_default();
let gpu_name = profile.hardware.gpu.as_deref().unwrap_or("Unknown GPU");
let sm = profile.hardware.gpu_sm.as_deref().unwrap_or("?");
println!("\n=== CGP Kernel Profile: {kernel} ({dims}) ===\n");
println!(
"Backend: CUDA ({gpu_name}, SM {sm}, Driver {})",
detect_driver_version().unwrap_or_else(|| "?".to_string())
);
}
fn render_profile_execution(profile: &FullProfile) {
let t = &profile.timing;
let tp = &profile.throughput;
let peak_pct = if tp.tflops > 0.0 {
tp.tflops / 330.0 * 100.0
} else {
0.0
};
println!(
"Execution: {:.1} us | {:.1} TFLOP/s | {:.1}% of peak",
t.wall_clock_time_us, tp.tflops, peak_pct
);
}
fn render_profile_roofline(profile: &FullProfile) {
let Some(roof) = &profile.roofline else {
return;
};
let tp = &profile.throughput;
println!("\n Roofline Position:");
println!(
" Arithmetic Intensity: {:.1} FLOP/byte",
tp.arithmetic_intensity
);
println!(" Ridge Point: {:.1} FLOP/byte", roof.ridge_point);
let status = if roof.bound == "memory" {
format!("MEMORY-BOUND ({:.1}x below ridge)", roof.distance_to_ridge)
} else {
format!("COMPUTE-BOUND ({:.1}% efficiency)", roof.efficiency_pct)
};
println!(" Status: {status}");
}
fn render_profile_compute(profile: &FullProfile) {
let Some(gc) = &profile.gpu_compute else {
return;
};
println!("\n Compute:");
if gc.tensor_core_utilization_pct > 0.0 {
println!(
" Tensor core utilization: {:5.1}% {}",
gc.tensor_core_utilization_pct,
quality_badge(gc.tensor_core_utilization_pct, 50.0)
);
}
println!(
" Warp execution eff: {:5.1}% {}",
gc.warp_execution_efficiency_pct,
quality_badge(gc.warp_execution_efficiency_pct, 95.0)
);
println!(" SM utilization: {:5.1}%", gc.sm_utilization_pct);
println!(
" Achieved occupancy: {:5.1}%",
gc.achieved_occupancy_pct
);
println!(
" Register usage: {:3}/255",
gc.register_usage_per_thread
);
if gc.shared_memory_per_block > 0 {
println!(
" Shared memory/block: {:5} bytes",
gc.shared_memory_per_block
);
}
}
fn render_profile_memory(profile: &FullProfile) {
let Some(gm) = &profile.gpu_memory else {
return;
};
println!("\n Memory:");
println!(
" DRAM throughput: {:5.1}% of peak ({:.1} GB/s)",
gm.dram_throughput_pct,
1008.0 * gm.dram_throughput_pct / 100.0
);
println!(
" Global load coalescing: {:5.1}% {}",
gm.global_load_efficiency_pct,
quality_badge(gm.global_load_efficiency_pct, 60.0)
);
println!(
" L2 hit rate: {:5.1}% {}",
gm.l2_hit_rate_pct,
quality_badge(gm.l2_hit_rate_pct, 50.0)
);
}
fn render_profile_vram(profile: &FullProfile) {
let Some(vram) = &profile.vram else {
return;
};
println!("\n VRAM:");
println!(
" Used: {:.0} / {:.0} MB ({:.1}%)",
vram.vram_used_mb, vram.vram_total_mb, vram.vram_utilization_pct
);
}
fn render_profile_system_health(profile: &FullProfile) {
let Some(health) = &profile.system_health else {
return;
};
println!("\n System Health:");
println!(" GPU temp: {:.0}°C", health.gpu_temperature_celsius);
println!(" GPU power: {:.0} W", health.gpu_power_watts);
println!(
" GPU clock: {:.0} MHz (mem: {:.0} MHz)",
health.gpu_clock_mhz, health.gpu_memory_clock_mhz
);
if health.cpu_frequency_mhz > 0.0 {
println!(" CPU freq: {:.0} MHz", health.cpu_frequency_mhz);
}
}
fn render_profile_energy(profile: &FullProfile) {
let Some(energy) = &profile.energy else {
return;
};
println!("\n Energy:");
println!(
" Efficiency: {:.4} TFLOP/s per watt",
energy.tflops_per_watt
);
println!(
" Energy: {:.6} J per inference",
energy.joules_per_inference
);
}
fn quality_badge(value: f64, threshold: f64) -> &'static str {
if value >= threshold {
"[OK]"
} else {
"[WARN]"
}
}
fn detect_driver_version() -> Option<String> {
Command::new("nvidia-smi")
.args(["--query-gpu=driver_version", "--format=csv,noheader"])
.output()
.ok()
.filter(|o| o.status.success())
.map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
}
fn detect_gpu_name() -> Option<String> {
Command::new("nvidia-smi")
.args(["--query-gpu=name", "--format=csv,noheader"])
.output()
.ok()
.filter(|o| o.status.success())
.map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
}
pub fn profile_kernel(name: &str, size: u32, roofline: bool, _metrics: Option<&str>) -> Result<()> {
let profiler = match NcuProfiler::detect() {
Some(mut p) => {
if roofline {
p.sections = vec![
NcuSection::LaunchStats,
NcuSection::Roofline,
NcuSection::ComputeThroughput,
NcuSection::MemoryThroughput,
NcuSection::Occupancy,
];
}
p
}
None => {
println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
println!(" ncu not found. Showing static analysis only.\n");
let model = RooflineModel::rtx_4090();
let flops = 2.0 * (size as f64).powi(3);
println!(" Expected FLOPs: {:.2e}", flops);
if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
println!(" FP16 ridge point: {:.1} FLOP/byte", ridge);
}
println!(" Install NVIDIA Nsight Compute for runtime profiling.");
return Ok(());
}
};
let binary = find_kernel_binary(name);
match binary {
Some((bin_path, bin_args)) => {
eprintln!("Profiling {name} via ncu (this may take a moment)...");
let metrics = profiler.profile(
&bin_path,
&bin_args.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
name,
)?;
let profile = ncu_metrics_to_profile(&metrics, name, size);
render_profile(&profile);
let json_path = format!("/tmp/cgp-{name}-{size}.json");
let json = serde_json::to_string_pretty(&profile)?;
std::fs::write(&json_path, &json)?;
println!(" Profile exported: {json_path}");
}
None => {
println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
println!(" Backend: CUDA (ncu at {})", profiler.ncu_path.display());
println!(
" GPU: {}",
detect_gpu_name().unwrap_or_else(|| "Unknown".to_string())
);
println!(
" Driver: {}",
detect_driver_version().unwrap_or_else(|| "Unknown".to_string())
);
println!("\n No binary found for kernel '{name}'.");
println!(" To profile a specific binary, use:");
println!(" cgp profile binary ./your_binary --kernel-filter {name}");
println!();
if roofline {
let model = RooflineModel::rtx_4090();
let flops = 2.0 * (size as f64).powi(3);
println!(" Roofline analysis (static):");
println!(" Problem FLOPs: {:.2e}", flops);
if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
println!(" FP16 ridge: {:.1} FLOP/byte", ridge);
}
println!();
}
}
}
Ok(())
}
fn find_kernel_binary(_kernel_name: &str) -> Option<(String, Vec<String>)> {
let candidates = [
"/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo",
"./target/release/examples/gpu_batch_demo",
"/mnt/nvme-raid0/targets/trueno/release/examples/wgpu_backward_demo",
"./target/release/examples/wgpu_backward_demo",
];
for path in &candidates {
if std::path::Path::new(path).exists() {
return Some((path.to_string(), vec![]));
}
}
None
}
pub fn profile_cublas(op: &str, size: u32) -> Result<()> {
println!("\n=== CGP cuBLAS Profile: {op} ({size}x{size}) ===\n");
let gpu_name = detect_gpu_name().unwrap_or_else(|| "Unknown GPU".to_string());
let driver = detect_driver_version().unwrap_or_else(|| "?".to_string());
println!(" GPU: {gpu_name} (Driver {driver})");
let flops = 2.0 * (size as f64).powi(3);
println!(" Operation: {op}");
println!(" Problem size: {size}x{size}x{size}");
println!(" FLOPs: {flops:.2e}");
let (peak_tflops, precision) = match op {
"gemm_f16" | "hgemm" => (330.0, "FP16 Tensor"),
"gemm_f32" | "sgemm" => (82.6, "FP32"),
"gemm_tf32" => (165.0, "TF32 Tensor"),
_ => (82.6, "FP32 (default)"),
};
println!(" Precision: {precision} (peak: {peak_tflops:.1} TFLOP/s)");
let cublas_bin = find_cublas_binary();
match cublas_bin {
Some(bin) => {
println!(" Binary: {bin}");
if let Some(nsys) = NsysProfiler::detect() {
println!(" Profiling via nsys...");
match nsys.profile_binary(&bin, &[]) {
Ok(stats) => {
let cublas_kernels: Vec<_> = stats
.iter()
.filter(|s| {
s.name.contains("gemm")
|| s.name.contains("cublas")
|| s.name.contains("Gemm")
})
.collect();
if cublas_kernels.is_empty() {
println!(" No cuBLAS kernels found in trace.");
} else {
println!("\n cuBLAS Kernels:");
for k in &cublas_kernels {
let tflops = flops / (k.avg_us * 1e-6) / 1e12;
let eff = tflops / peak_tflops * 100.0;
println!(
" {:50} avg={:.1}us {:.1} TFLOP/s ({:.1}% eff)",
k.name, k.avg_us, tflops, eff
);
}
}
}
Err(e) => println!(" nsys profiling failed: {e}"),
}
}
}
None => {
let est_time_us = flops / (peak_tflops * 0.9 * 1e12) * 1e6; let est_tflops = flops / (est_time_us * 1e-6) / 1e12;
println!("\n Estimated (no binary found):");
println!(" Time: ~{est_time_us:.1} us");
println!(
" Throughput: ~{est_tflops:.1} TFLOP/s ({:.1}% of peak)",
est_tflops / peak_tflops * 100.0
);
println!("\n To profile real cuBLAS:");
println!(" cgp profile binary ./your_cublas_bench --kernel-filter gemm");
}
}
println!();
Ok(())
}
fn find_cublas_binary() -> Option<String> {
let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
let mut candidates: Vec<String> = Vec::new();
if !target_dir.is_empty() {
candidates.push(format!("{target_dir}/release/examples/bench_cublas_vs_ptx"));
candidates.push(format!("{target_dir}/release/examples/gpu_batch_demo"));
}
candidates.extend_from_slice(&[
"/mnt/nvme-raid0/targets/trueno/release/examples/bench_cublas_vs_ptx".to_string(),
"/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo".to_string(),
]);
for path in &candidates {
if std::path::Path::new(path).exists() {
return Some(path.clone());
}
}
None
}
pub struct NsysProfiler {
pub nsys_path: PathBuf,
}
#[derive(Debug, Clone)]
pub struct NsysKernelStat {
pub name: String,
pub calls: u64,
pub total_us: f64,
pub avg_us: f64,
pub min_us: f64,
pub max_us: f64,
}
impl NsysProfiler {
pub fn detect() -> Option<Self> {
which::which("nsys")
.ok()
.map(|path| Self { nsys_path: path })
}
pub fn profile_binary(
&self,
binary: &str,
binary_args: &[&str],
) -> Result<Vec<NsysKernelStat>> {
let report_path = format!("/tmp/cgp-nsys-{}", std::process::id());
let mut cmd = Command::new(&self.nsys_path);
cmd.arg("profile")
.arg("--stats=true")
.arg("--force-overwrite=true")
.arg("-o")
.arg(&report_path)
.arg(binary)
.args(binary_args);
let output = cmd.output().with_context(|| "Failed to run nsys")?;
let stdout = String::from_utf8_lossy(&output.stdout);
let stderr = String::from_utf8_lossy(&output.stderr);
let combined = format!("{stdout}\n{stderr}");
let stats = parse_nsys_stats(&combined);
let _ = std::fs::remove_file(format!("{report_path}.nsys-rep"));
let _ = std::fs::remove_file(format!("{report_path}.sqlite"));
Ok(stats)
}
}
fn parse_nsys_stats(output: &str) -> Vec<NsysKernelStat> {
let mut stats = Vec::new();
let mut in_kernel_section = false;
for line in output.lines() {
if line.contains("CUDA Kernel Statistics") || line.contains("cuda_gpu_kern_sum") {
in_kernel_section = true;
continue;
}
if in_kernel_section && line.trim().is_empty() {
in_kernel_section = false;
continue;
}
if in_kernel_section {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 8 {
if let (Ok(total_ns), Ok(instances)) = (
parts[1].replace(',', "").parse::<f64>(),
parts[2].replace(',', "").parse::<u64>(),
) {
let avg_ns = parts[3].replace(',', "").parse::<f64>().unwrap_or(0.0);
let min_ns = parts[5].replace(',', "").parse::<f64>().unwrap_or(0.0);
let max_ns = parts[6].replace(',', "").parse::<f64>().unwrap_or(0.0);
let name = parts[7..].join(" ");
stats.push(NsysKernelStat {
name,
calls: instances,
total_us: total_ns / 1000.0,
avg_us: avg_ns / 1000.0,
min_us: min_ns / 1000.0,
max_us: max_ns / 1000.0,
});
}
}
}
}
stats
}
pub fn profile_binary(
path: &str,
kernel_filter: Option<&str>,
_trace: bool,
_duration: Option<&str>,
) -> Result<()> {
println!("\n=== CGP Binary Profile: {path} ===\n");
let profiler = match NsysProfiler::detect() {
Some(p) => p,
None => {
println!(" nsys not found. Install NVIDIA Nsight Systems.");
println!(" Falling back to wall-clock timing...");
let start = std::time::Instant::now();
let status = Command::new(path).status()?;
let elapsed = start.elapsed();
println!(
" Wall time: {:.2}ms (exit code: {})",
elapsed.as_secs_f64() * 1000.0,
status.code().unwrap_or(-1)
);
return Ok(());
}
};
eprintln!("Running nsys profile (this may take a moment)...");
let stats = profiler.profile_binary(path, &[])?;
if stats.is_empty() {
println!(" No CUDA kernels detected. Binary may be CPU-only.");
println!(" For CPU profiling, use: cgp profile simd or cgp profile scalar");
} else {
println!(
" {:40} {:>8} {:>12} {:>12} {:>12}",
"Kernel", "Calls", "Avg (us)", "Min (us)", "Max (us)"
);
println!(" {}", "-".repeat(88));
for stat in &stats {
let name = if let Some(filter) = kernel_filter {
if !stat.name.contains(filter) {
continue;
}
&stat.name
} else {
&stat.name
};
let display_name = if name.len() > 40 {
format!("{}...", &name[..37])
} else {
name.to_string()
};
println!(
" {:40} {:>8} {:>12.1} {:>12.1} {:>12.1}",
display_name, stat.calls, stat.avg_us, stat.min_us, stat.max_us
);
}
println!();
println!(
" Total kernels: {}, Total CUDA time: {:.1} us",
stats.len(),
stats.iter().map(|s| s.total_us).sum::<f64>()
);
}
println!();
Ok(())
}
pub fn profile_python(args: &[String]) -> Result<()> {
let cmd_str = args.join(" ");
println!("\n=== CGP Python Profile ===\n");
println!(" Command: {cmd_str}");
if let Some(profiler) = NsysProfiler::detect() {
eprintln!("Running nsys profile on Python script...");
let python_cmd = if args.is_empty() {
anyhow::bail!(
"No Python script specified. Usage: cgp profile python -- uv run python script.py"
);
} else {
args[0].clone()
};
let python_args: Vec<&str> = args[1..].iter().map(|s| s.as_str()).collect();
let stats = profiler.profile_binary(&python_cmd, &python_args)?;
if stats.is_empty() {
println!(" No CUDA kernels detected (CPU-only Python workload).");
println!(" Use perf stat for CPU profiling.");
} else {
println!(" CUDA kernels captured via nsys:");
for stat in &stats {
println!(
" {} — {} calls, avg {:.1}us",
stat.name, stat.calls, stat.avg_us
);
}
}
} else {
println!(" nsys not found — cannot capture CUDA kernels.");
println!(" Falling back to wall-clock timing.");
if !args.is_empty() {
let start = std::time::Instant::now();
let status = Command::new(&args[0]).args(&args[1..]).status()?;
let elapsed = start.elapsed();
println!(
" Wall time: {:.2}ms (exit code: {})",
elapsed.as_secs_f64() * 1000.0,
status.code().unwrap_or(-1)
);
}
}
println!();
Ok(())
}
pub fn run_trace(binary: &str, duration: Option<&str>) -> Result<()> {
println!("\n=== CGP System Trace: {binary} ===\n");
let Some(profiler) = NsysProfiler::detect() else {
anyhow::bail!("nsys not found. Install NVIDIA Nsight Systems for timeline tracing.");
};
let report_path = format!("/tmp/cgp-trace-{}", std::process::id());
let cmd = build_nsys_trace_command(&profiler.nsys_path, &report_path, duration, binary);
let (stdout, stderr) = run_nsys_trace_command(cmd)?;
print_nsys_summary(&stdout, &stderr);
print_nsys_report_file(&report_path);
println!();
Ok(())
}
fn build_nsys_trace_command(
nsys_path: &std::path::Path,
report_path: &str,
duration: Option<&str>,
binary: &str,
) -> Command {
let mut cmd = Command::new(nsys_path);
cmd.arg("profile")
.arg("--stats=true")
.arg("--force-overwrite=true")
.arg("--trace=cuda,nvtx,osrt")
.arg("-o")
.arg(report_path);
if let Some(dur) = duration {
cmd.arg("--duration").arg(dur);
}
cmd.arg(binary);
cmd
}
fn run_nsys_trace_command(mut cmd: Command) -> Result<(String, String)> {
eprintln!("Running nsys trace (this may take a while)...");
let output = cmd.output().with_context(|| "Failed to run nsys trace")?;
Ok((
String::from_utf8_lossy(&output.stdout).to_string(),
String::from_utf8_lossy(&output.stderr).to_string(),
))
}
fn print_nsys_summary(stdout: &str, stderr: &str) {
let combined = format!("{stdout}\n{stderr}");
let mut in_summary = false;
for line in combined.lines() {
if is_nsys_summary_header(line) {
in_summary = true;
println!(" {line}");
continue;
}
if !in_summary {
continue;
}
if line.trim().is_empty() {
in_summary = false;
println!();
} else {
println!(" {line}");
}
}
}
fn is_nsys_summary_header(line: &str) -> bool {
line.contains("CUDA API Statistics")
|| line.contains("CUDA Kernel Statistics")
|| line.contains("OS Runtime Statistics")
}
fn print_nsys_report_file(report_path: &str) {
let report_file = format!("{report_path}.nsys-rep");
if std::path::Path::new(&report_file).exists() {
println!(" Report: {report_file}");
println!(" View: nsys-ui {report_file}");
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ncu_section_args() {
assert_eq!(NcuSection::LaunchStats.as_ncu_arg(), "LaunchStats");
assert_eq!(NcuSection::Roofline.as_ncu_arg(), "SpeedOfLight");
assert_eq!(NcuSection::WarpState.as_ncu_arg(), "WarpStateStats");
}
#[test]
fn test_ncu_command_build() {
if let Some(profiler) = NcuProfiler::detect() {
assert!(profiler.ncu_path.exists());
}
}
#[test]
fn test_parse_ncu_csv_empty() {
let metrics = parse_ncu_csv("").unwrap();
assert!(metrics.is_empty());
}
#[test]
fn test_parse_ncu_csv_sample() {
let csv = r#""ID","Metric Name","Metric Unit","Metric Value"
"0","sm__throughput.avg.pct_of_peak_sustained_elapsed","%","42.3"
"0","dram__throughput.avg.pct_of_peak_sustained_elapsed","%","7.8"
"0","launch__registers_per_thread","register/thread","48"
"0","gpu__time_duration.sum","nsecond","23200"
"#;
let metrics = parse_ncu_csv(csv).unwrap();
assert_eq!(
get_f64(&metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed"),
42.3
);
assert_eq!(get_u32(&metrics, "launch__registers_per_thread"), 48);
assert_eq!(get_f64(&metrics, "gpu__time_duration.sum"), 23200.0);
}
#[test]
fn test_ncu_metrics_to_profile() {
let mut metrics = HashMap::new();
metrics.insert(
"gpu__time_duration.sum".to_string(),
"23200".to_string(), );
metrics.insert(
"sm__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
"42.3".to_string(),
);
metrics.insert(
"dram__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
"7.8".to_string(),
);
metrics.insert("launch__registers_per_thread".to_string(), "48".to_string());
metrics.insert(
"smsp__thread_inst_executed_per_inst_executed.pct".to_string(),
"100.0".to_string(),
);
let profile = ncu_metrics_to_profile(&metrics, "gemm_cta_wmma_fp16", 512);
assert_eq!(profile.kernel.as_ref().unwrap().name, "gemm_cta_wmma_fp16");
assert!(profile.timing.wall_clock_time_us > 0.0);
assert!(profile.throughput.tflops > 0.0);
assert!(profile.gpu_compute.is_some());
}
#[test]
fn test_parse_nsys_stats_empty() {
let stats = parse_nsys_stats("");
assert!(stats.is_empty());
}
#[test]
fn test_render_profile_no_panic() {
let profile = FullProfile {
version: "2.0".to_string(),
hardware: HardwareInfo {
gpu: Some("Test GPU".to_string()),
gpu_sm: Some("8.9".to_string()),
..Default::default()
},
kernel: Some(KernelInfo {
name: "test_kernel".to_string(),
dimensions: vec![256, 256, 256],
..Default::default()
}),
timing: TimingMetrics {
wall_clock_time_us: 10.0,
samples: 1,
..Default::default()
},
throughput: ThroughputMetrics {
tflops: 5.0,
..Default::default()
},
gpu_compute: Some(GpuComputeMetrics {
sm_utilization_pct: 40.0,
warp_execution_efficiency_pct: 100.0,
achieved_occupancy_pct: 33.0,
register_usage_per_thread: 48,
..Default::default()
}),
gpu_memory: Some(GpuMemoryMetrics {
dram_throughput_pct: 7.8,
l2_hit_rate_pct: 87.0,
global_load_efficiency_pct: 72.0,
..Default::default()
}),
..Default::default()
};
render_profile(&profile);
}
}