1use crate::analysis::roofline::{Bound, MemoryLevel, Precision, RooflineModel};
5use crate::metrics::catalog::*;
6use crate::profilers::system;
7use anyhow::{Context, Result};
8use std::collections::HashMap;
9use std::path::PathBuf;
10use std::process::Command;
11
/// Nsight Compute report sections that can be requested via `ncu --section`.
///
/// `PartialEq`/`Eq` are derived so callers can compare and filter sections
/// (e.g. deduplicate a section list) — previously only `Copy`/`Clone` were
/// available.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NcuSection {
    /// Kernel launch configuration statistics.
    LaunchStats,
    /// Mapped to ncu's "ComputeWorkloadAnalysis" section.
    ComputeThroughput,
    /// Mapped to ncu's "MemoryWorkloadAnalysis" section.
    MemoryThroughput,
    /// Achieved occupancy statistics.
    Occupancy,
    /// Mapped to ncu's "SpeedOfLight" (roofline) section.
    Roofline,
    /// Warp stall/state statistics.
    WarpState,
}
22
23impl NcuSection {
24 fn as_ncu_arg(&self) -> &str {
25 match self {
26 NcuSection::LaunchStats => "LaunchStats",
27 NcuSection::ComputeThroughput => "ComputeWorkloadAnalysis",
28 NcuSection::MemoryThroughput => "MemoryWorkloadAnalysis",
29 NcuSection::Occupancy => "Occupancy",
30 NcuSection::Roofline => "SpeedOfLight",
31 NcuSection::WarpState => "WarpStateStats",
32 }
33 }
34}
35
/// Wrapper around the NVIDIA Nsight Compute (`ncu`) command-line profiler.
pub struct NcuProfiler {
    /// Resolved filesystem path to the `ncu` executable.
    pub ncu_path: PathBuf,
    /// Report sections requested on each profiling run.
    pub sections: Vec<NcuSection>,
}
41
impl NcuProfiler {
    /// Locate `ncu` on PATH; returns `None` when Nsight Compute is absent.
    ///
    /// The default section set covers launch stats, roofline (SpeedOfLight),
    /// compute/memory workload analysis, and occupancy.
    pub fn detect() -> Option<Self> {
        which::which("ncu").ok().map(|path| Self {
            ncu_path: path,
            sections: vec![
                NcuSection::LaunchStats,
                NcuSection::Roofline,
                NcuSection::ComputeThroughput,
                NcuSection::MemoryThroughput,
                NcuSection::Occupancy,
            ],
        })
    }

    /// Run `ncu` on `binary` (with `binary_args`) and return the parsed
    /// CSV metrics as a metric-name → value map.
    ///
    /// `kernel_regex`, when non-empty, is passed as a `--kernel-id` regex so
    /// only matching kernels are profiled. ncu's own log output is discarded
    /// via `--log-file /dev/null`; only the CSV on stdout is parsed.
    ///
    /// # Errors
    /// Fails if ncu cannot be spawned, exits non-zero (with a dedicated hint
    /// for the common permission error), or its output cannot be parsed.
    pub fn profile(
        &self,
        binary: &str,
        binary_args: &[&str],
        kernel_regex: &str,
    ) -> Result<HashMap<String, String>> {
        let mut cmd = Command::new(&self.ncu_path);
        cmd.arg("--target-processes").arg("all");
        cmd.arg("--kernel-name-base").arg("demangled");

        if !kernel_regex.is_empty() {
            cmd.arg("--kernel-id")
                .arg(format!("::regex:{kernel_regex}:"));
        }

        for section in &self.sections {
            cmd.arg("--section").arg(section.as_ncu_arg());
        }

        cmd.arg("--csv");
        cmd.arg("--log-file").arg("/dev/null");
        cmd.arg(binary);
        cmd.args(binary_args);

        let output = cmd
            .output()
            .with_context(|| format!("Failed to run ncu: {}", self.ncu_path.display()))?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            // ERR_NVGPUCTRPERM is NVIDIA's "counters restricted to admin" error.
            // NOTE(review): the sysctl hint below sets perf_event_paranoid=2,
            // which is the *restrictive* value — verify the intended setting.
            if stderr.contains("permission") || stderr.contains("ERR_NVGPUCTRPERM") {
                anyhow::bail!(
                    "ncu requires elevated permissions. Run with:\n \
                    sudo cgp profile kernel ...\n \
                    or set: sudo sysctl kernel.perf_event_paranoid=2"
                );
            }
            anyhow::bail!("ncu failed (exit {}): {}", output.status, stderr.trim());
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        parse_ncu_csv(&stdout)
    }
}
102
103pub fn parse_ncu_csv(csv: &str) -> Result<HashMap<String, String>> {
106 let mut metrics = HashMap::new();
107 for line in csv.lines() {
108 if line.starts_with('"') && !line.starts_with("\"ID\"") {
110 let fields: Vec<&str> = line.split(',').collect();
111 if fields.len() >= 4 {
112 let name = fields[1].trim_matches('"').to_string();
113 let value = fields[3].trim_matches('"').to_string();
114 metrics.insert(name, value);
115 }
116 }
117 }
118 Ok(metrics)
119}
120
/// Fetch `key` from the metric map as an `f64`, treating missing or
/// unparsable values as 0.0. Thousands separators ("23,200") are stripped
/// before parsing, matching ncu's numeric formatting.
fn get_f64(metrics: &HashMap<String, String>, key: &str) -> f64 {
    match metrics.get(key) {
        Some(raw) => raw.replace(',', "").parse().unwrap_or(0.0),
        None => 0.0,
    }
}
128
/// Fetch `key` from the metric map as a `u64`; missing or unparsable
/// values (after stripping thousands separators) yield 0.
#[allow(dead_code)]
fn get_u64(metrics: &HashMap<String, String>, key: &str) -> u64 {
    match metrics.get(key) {
        Some(raw) => raw.replace(',', "").parse().unwrap_or(0),
        None => 0,
    }
}
137
/// Fetch `key` from the metric map as a `u32`; missing or unparsable
/// values (after stripping thousands separators) yield 0.
fn get_u32(metrics: &HashMap<String, String>, key: &str) -> u32 {
    match metrics.get(key) {
        Some(raw) => raw.replace(',', "").parse().unwrap_or(0),
        None => 0,
    }
}
145
/// Build a [`FullProfile`] from raw ncu metric strings.
///
/// Assumes a cubic GEMM-like workload (FLOPs = 2·size³) and hard-codes
/// RTX 4090 peak figures (330 TFLOP/s, 1008 GB/s DRAM, SM 8.9, 24 GB).
/// NOTE(review): these constants should come from hardware detection if
/// other GPUs are ever targeted — confirm before reuse.
#[allow(clippy::implicit_hasher)]
pub fn ncu_metrics_to_profile(
    metrics: &HashMap<String, String>,
    kernel_name: &str,
    size: u32,
) -> FullProfile {
    // gpu__time_duration.sum is reported in nanoseconds (see the test
    // fixtures); /1000 converts to microseconds. tflops guards div-by-zero.
    let duration_us = get_f64(metrics, "gpu__time_duration.sum") / 1000.0; let flops = 2.0 * (size as f64).powi(3); let tflops = if duration_us > 0.0 {
        flops / (duration_us * 1e-6) / 1e12
    } else {
        0.0
    };

    // Percent-of-peak counters straight from ncu; missing keys read as 0.
    let sm_pct = get_f64(metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed");
    let dram_pct = get_f64(
        metrics,
        "dram__throughput.avg.pct_of_peak_sustained_elapsed",
    );
    let occupancy_pct = get_f64(
        metrics,
        "sm__warps_active.avg.pct_of_peak_sustained_elapsed",
    );
    let warp_eff = get_f64(metrics, "smsp__thread_inst_executed_per_inst_executed.pct");
    let tc_pct = get_f64(
        metrics,
        "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
    );
    let regs = get_u32(metrics, "launch__registers_per_thread");
    let smem = get_u32(metrics, "launch__shared_mem_per_block_driver");
    let l2_hit = get_f64(metrics, "lts__t_sector_hit_rate.pct");
    let global_load_eff = get_f64(
        metrics,
        "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
    );

    let model = RooflineModel::rtx_4090();
    // Arithmetic intensity is estimated from achieved DRAM bandwidth
    // (1008 GB/s peak × measured %) rather than from byte counters, so it
    // inherits any error in dram_pct.
    let ai = if duration_us > 0.0 {
        let dram_bw = 1008.0e9; let actual_bw = dram_bw * dram_pct / 100.0;
        let bytes = actual_bw * duration_us * 1e-6;
        if bytes > 0.0 {
            flops / bytes
        } else {
            0.0
        }
    } else {
        0.0
    };

    // Classify against the FP16/DRAM roofline; a `None` from classify()
    // simply leaves the profile's `roofline` field empty.
    let roofline = model
        .classify(ai, tflops * 1e12, Precision::Fp16, MemoryLevel::Dram)
        .map(|point| {
            let bound_str = match &point.bound {
                Bound::Memory { .. } => "memory".to_string(),
                Bound::Compute { .. } => "compute".to_string(),
            };
            RooflineMetrics {
                peak_compute_tflops: 330.0,
                peak_bandwidth_gbps: 1008.0,
                ridge_point: 327.4,
                bound: bound_str,
                efficiency_pct: point.efficiency,
                distance_to_ridge: point.distance_to_ridge,
            }
        });

    let timestamp = chrono::Utc::now().to_rfc3339();

    FullProfile {
        version: "2.0".to_string(),
        timestamp,
        hardware: HardwareInfo {
            gpu: Some("NVIDIA GeForce RTX 4090".to_string()),
            gpu_sm: Some("8.9".to_string()),
            gpu_memory_gb: Some(24.0),
            gpu_bandwidth_gbps: Some(1008.0),
            ..Default::default()
        },
        kernel: Some(KernelInfo {
            name: kernel_name.to_string(),
            dimensions: vec![size, size, size],
            shared_memory_bytes: Some(smem),
            registers_per_thread: Some(regs),
            ..Default::default()
        }),
        timing: TimingMetrics {
            wall_clock_time_us: duration_us,
            samples: 1,
            ..Default::default()
        },
        throughput: ThroughputMetrics {
            tflops,
            bandwidth_gbps: 1008.0 * dram_pct / 100.0,
            arithmetic_intensity: ai,
            ..Default::default()
        },
        roofline,
        gpu_compute: Some(GpuComputeMetrics {
            sm_utilization_pct: sm_pct,
            achieved_occupancy_pct: occupancy_pct,
            warp_execution_efficiency_pct: warp_eff,
            tensor_core_utilization_pct: tc_pct,
            register_usage_per_thread: regs,
            shared_memory_per_block: smem,
            ..Default::default()
        }),
        gpu_memory: Some(GpuMemoryMetrics {
            dram_throughput_pct: dram_pct,
            l2_hit_rate_pct: l2_hit,
            global_load_efficiency_pct: global_load_eff,
            ..Default::default()
        }),
        system_health: system::collect_system_health(),
        vram: system::collect_vram(),
        // NOTE(review): collect_system_health() is called twice (here and
        // two lines up); consider reusing the first result if it's costly.
        energy: system::collect_system_health()
            .and_then(|h| system::compute_energy(h.gpu_power_watts, tflops, duration_us)),
        ..Default::default()
    }
}
269
/// Pretty-print a [`FullProfile`] to stdout in the CGP report format.
///
/// Optional sections (roofline, compute, memory, VRAM, system health,
/// energy) are printed only when the corresponding data is present.
fn render_profile(profile: &FullProfile) {
    let kernel = profile.kernel.as_ref().map_or("unknown", |k| &k.name);
    // "512x512x512"-style dimension string; empty when no kernel info.
    let dims = profile
        .kernel
        .as_ref()
        .map(|k| {
            k.dimensions
                .iter()
                .map(|d| d.to_string())
                .collect::<Vec<_>>()
                .join("x")
        })
        .unwrap_or_default();

    let gpu_name = profile.hardware.gpu.as_deref().unwrap_or("Unknown GPU");
    let sm = profile.hardware.gpu_sm.as_deref().unwrap_or("?");

    println!("\n=== CGP Kernel Profile: {kernel} ({dims}) ===\n");
    println!(
        "Backend: CUDA ({gpu_name}, SM {sm}, Driver {})",
        detect_driver_version().unwrap_or_else(|| "?".to_string())
    );

    let t = &profile.timing;
    let tp = &profile.throughput;
    // NOTE(review): 330.0 is the hard-coded RTX 4090 peak used throughout
    // this file — confirm if other GPUs are ever supported.
    let peak_pct = if tp.tflops > 0.0 {
        tp.tflops / 330.0 * 100.0
    } else {
        0.0
    };
    println!(
        "Execution: {:.1} us | {:.1} TFLOP/s | {:.1}% of peak",
        t.wall_clock_time_us, tp.tflops, peak_pct
    );

    // Roofline placement: memory-bound vs compute-bound summary.
    if let Some(roof) = &profile.roofline {
        println!("\n Roofline Position:");
        println!(
            " Arithmetic Intensity: {:.1} FLOP/byte",
            tp.arithmetic_intensity
        );
        println!(" Ridge Point: {:.1} FLOP/byte", roof.ridge_point);
        let status = if roof.bound == "memory" {
            format!("MEMORY-BOUND ({:.1}x below ridge)", roof.distance_to_ridge)
        } else {
            format!("COMPUTE-BOUND ({:.1}% efficiency)", roof.efficiency_pct)
        };
        println!(" Status: {status}");
    }

    // Compute section; tensor-core line suppressed when utilization is 0.
    if let Some(gc) = &profile.gpu_compute {
        println!("\n Compute:");
        if gc.tensor_core_utilization_pct > 0.0 {
            println!(
                " Tensor core utilization: {:5.1}% {}",
                gc.tensor_core_utilization_pct,
                quality_badge(gc.tensor_core_utilization_pct, 50.0)
            );
        }
        println!(
            " Warp execution eff: {:5.1}% {}",
            gc.warp_execution_efficiency_pct,
            quality_badge(gc.warp_execution_efficiency_pct, 95.0)
        );
        println!(" SM utilization: {:5.1}%", gc.sm_utilization_pct);
        println!(
            " Achieved occupancy: {:5.1}%",
            gc.achieved_occupancy_pct
        );
        println!(
            " Register usage: {:3}/255",
            gc.register_usage_per_thread
        );
        if gc.shared_memory_per_block > 0 {
            println!(
                " Shared memory/block: {:5} bytes",
                gc.shared_memory_per_block
            );
        }
    }

    // Memory section; GB/s derived from the 1008 GB/s peak constant.
    if let Some(gm) = &profile.gpu_memory {
        println!("\n Memory:");
        println!(
            " DRAM throughput: {:5.1}% of peak ({:.1} GB/s)",
            gm.dram_throughput_pct,
            1008.0 * gm.dram_throughput_pct / 100.0
        );
        println!(
            " Global load coalescing: {:5.1}% {}",
            gm.global_load_efficiency_pct,
            quality_badge(gm.global_load_efficiency_pct, 60.0)
        );
        println!(
            " L2 hit rate: {:5.1}% {}",
            gm.l2_hit_rate_pct,
            quality_badge(gm.l2_hit_rate_pct, 50.0)
        );
    }

    if let Some(vram) = &profile.vram {
        println!("\n VRAM:");
        println!(
            " Used: {:.0} / {:.0} MB ({:.1}%)",
            vram.vram_used_mb, vram.vram_total_mb, vram.vram_utilization_pct
        );
    }

    if let Some(health) = &profile.system_health {
        println!("\n System Health:");
        println!(" GPU temp: {:.0}°C", health.gpu_temperature_celsius);
        println!(" GPU power: {:.0} W", health.gpu_power_watts);
        println!(
            " GPU clock: {:.0} MHz (mem: {:.0} MHz)",
            health.gpu_clock_mhz, health.gpu_memory_clock_mhz
        );
        if health.cpu_frequency_mhz > 0.0 {
            println!(" CPU freq: {:.0} MHz", health.cpu_frequency_mhz);
        }
    }

    if let Some(energy) = &profile.energy {
        println!("\n Energy:");
        println!(
            " Efficiency: {:.4} TFLOP/s per watt",
            energy.tflops_per_watt
        );
        println!(
            " Energy: {:.6} J per inference",
            energy.joules_per_inference
        );
    }

    println!();
}
412
/// Return "[OK]" when `value` meets `threshold`, "[WARN]" otherwise.
/// (A NaN value never satisfies `>=`, so NaN reports "[WARN]".)
fn quality_badge(value: f64, threshold: f64) -> &'static str {
    let meets_threshold = value >= threshold;
    if meets_threshold { "[OK]" } else { "[WARN]" }
}
420
/// Query a single field from `nvidia-smi` in machine-readable CSV form.
///
/// Returns `None` when the tool is missing or exits non-zero. Shared by
/// the driver-version and GPU-name detectors below, which previously
/// duplicated this invocation logic.
fn nvidia_smi_query(field: &str) -> Option<String> {
    Command::new("nvidia-smi")
        .arg(format!("--query-gpu={field}"))
        .arg("--format=csv,noheader")
        .output()
        .ok()
        .filter(|o| o.status.success())
        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
}

/// Installed NVIDIA driver version (e.g. "550.54"), if detectable.
fn detect_driver_version() -> Option<String> {
    nvidia_smi_query("driver_version")
}

/// GPU product name as reported by nvidia-smi, if detectable.
fn detect_gpu_name() -> Option<String> {
    nvidia_smi_query("name")
}
440
/// Profile a named kernel via ncu, or print static analysis when ncu or a
/// kernel binary cannot be found.
///
/// `size` is interpreted as a cubic problem (FLOPs = 2·size³).
/// `_metrics` is accepted for CLI compatibility but currently unused.
pub fn profile_kernel(name: &str, size: u32, roofline: bool, _metrics: Option<&str>) -> Result<()> {
    let profiler = match NcuProfiler::detect() {
        Some(mut p) => {
            if roofline {
                // NOTE(review): this list is identical to the default set in
                // NcuProfiler::detect(), so the reassignment is a no-op today.
                p.sections = vec![
                    NcuSection::LaunchStats,
                    NcuSection::Roofline,
                    NcuSection::ComputeThroughput,
                    NcuSection::MemoryThroughput,
                    NcuSection::Occupancy,
                ];
            }
            p
        }
        None => {
            // No ncu: print static expectations and exit successfully.
            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
            println!(" ncu not found. Showing static analysis only.\n");
            let model = RooflineModel::rtx_4090();
            let flops = 2.0 * (size as f64).powi(3);
            println!(" Expected FLOPs: {:.2e}", flops);
            if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
                println!(" FP16 ridge point: {:.1} FLOP/byte", ridge);
            }
            println!(" Install NVIDIA Nsight Compute for runtime profiling.");
            return Ok(());
        }
    };

    let binary = find_kernel_binary(name);

    match binary {
        Some((bin_path, bin_args)) => {
            eprintln!("Profiling {name} via ncu (this may take a moment)...");
            // `name` doubles as the kernel-name regex passed to ncu.
            let metrics = profiler.profile(
                &bin_path,
                &bin_args.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
                name,
            )?;
            let profile = ncu_metrics_to_profile(&metrics, name, size);
            render_profile(&profile);

            // Also persist the structured profile to a predictable /tmp path.
            let json_path = format!("/tmp/cgp-{name}-{size}.json");
            let json = serde_json::to_string_pretty(&profile)?;
            std::fs::write(&json_path, &json)?;
            println!(" Profile exported: {json_path}");
        }
        None => {
            // ncu exists but no known kernel binary was found: print
            // environment info plus usage hints instead of failing.
            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
            println!(" Backend: CUDA (ncu at {})", profiler.ncu_path.display());
            println!(
                " GPU: {}",
                detect_gpu_name().unwrap_or_else(|| "Unknown".to_string())
            );
            println!(
                " Driver: {}",
                detect_driver_version().unwrap_or_else(|| "Unknown".to_string())
            );
            println!("\n No binary found for kernel '{name}'.");
            println!(" To profile a specific binary, use:");
            println!(" cgp profile binary ./your_binary --kernel-filter {name}");
            println!();

            if roofline {
                let model = RooflineModel::rtx_4090();
                let flops = 2.0 * (size as f64).powi(3);
                println!(" Roofline analysis (static):");
                println!(" Problem FLOPs: {:.2e}", flops);
                if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
                    println!(" FP16 ridge: {:.1} FLOP/byte", ridge);
                }
                println!();
            }
        }
    }

    Ok(())
}
527
/// Return the first known demo binary that exists on disk, with an empty
/// argument list. The kernel name is currently ignored — the candidate
/// list is fixed.
fn find_kernel_binary(_kernel_name: &str) -> Option<(String, Vec<String>)> {
    // Checked in priority order: absolute build-cache paths first, then
    // workspace-relative fallbacks.
    const CANDIDATES: &[&str] = &[
        "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo",
        "./target/release/examples/gpu_batch_demo",
        "/mnt/nvme-raid0/targets/trueno/release/examples/wgpu_backward_demo",
        "./target/release/examples/wgpu_backward_demo",
    ];
    CANDIDATES
        .iter()
        .find(|candidate| std::path::Path::new(candidate).exists())
        .map(|candidate| ((*candidate).to_string(), Vec::new()))
}
544
/// Print a cuBLAS-oriented profile for `op` at a cubic problem `size`.
///
/// When a known benchmark binary and nsys are both available, real kernel
/// timings are captured; otherwise an estimate at 90% of peak is printed.
pub fn profile_cublas(op: &str, size: u32) -> Result<()> {
    println!("\n=== CGP cuBLAS Profile: {op} ({size}x{size}) ===\n");

    let gpu_name = detect_gpu_name().unwrap_or_else(|| "Unknown GPU".to_string());
    let driver = detect_driver_version().unwrap_or_else(|| "?".to_string());
    println!(" GPU: {gpu_name} (Driver {driver})");

    // Cubic GEMM FLOP count: 2·n³ (multiply + accumulate).
    let flops = 2.0 * (size as f64).powi(3);
    println!(" Operation: {op}");
    println!(" Problem size: {size}x{size}x{size}");
    println!(" FLOPs: {flops:.2e}");

    // Peak throughput per precision. NOTE(review): hard-coded RTX 4090
    // figures — confirm before reusing on other hardware.
    let (peak_tflops, precision) = match op {
        "gemm_f16" | "hgemm" => (330.0, "FP16 Tensor"),
        "gemm_f32" | "sgemm" => (82.6, "FP32"),
        "gemm_tf32" => (165.0, "TF32 Tensor"),
        _ => (82.6, "FP32 (default)"),
    };
    println!(" Precision: {precision} (peak: {peak_tflops:.1} TFLOP/s)");

    let cublas_bin = find_cublas_binary();
    match cublas_bin {
        Some(bin) => {
            println!(" Binary: {bin}");
            if let Some(nsys) = NsysProfiler::detect() {
                println!(" Profiling via nsys...");
                match nsys.profile_binary(&bin, &[]) {
                    Ok(stats) => {
                        // Keep only kernels whose names look cuBLAS/GEMM-related.
                        let cublas_kernels: Vec<_> = stats
                            .iter()
                            .filter(|s| {
                                s.name.contains("gemm")
                                    || s.name.contains("cublas")
                                    || s.name.contains("Gemm")
                            })
                            .collect();
                        if cublas_kernels.is_empty() {
                            println!(" No cuBLAS kernels found in trace.");
                        } else {
                            println!("\n cuBLAS Kernels:");
                            for k in &cublas_kernels {
                                let tflops = flops / (k.avg_us * 1e-6) / 1e12;
                                let eff = tflops / peak_tflops * 100.0;
                                println!(
                                    " {:50} avg={:.1}us {:.1} TFLOP/s ({:.1}% eff)",
                                    k.name, k.avg_us, tflops, eff
                                );
                            }
                        }
                    }
                    Err(e) => println!(" nsys profiling failed: {e}"),
                }
            }
        }
        None => {
            // No binary available: report an estimate at 90% of peak.
            let est_time_us = flops / (peak_tflops * 0.9 * 1e12) * 1e6; let est_tflops = flops / (est_time_us * 1e-6) / 1e12;
            println!("\n Estimated (no binary found):");
            println!(" Time: ~{est_time_us:.1} us");
            println!(
                " Throughput: ~{est_tflops:.1} TFLOP/s ({:.1}% of peak)",
                est_tflops / peak_tflops * 100.0
            );
            println!("\n To profile real cuBLAS:");
            println!(" cgp profile binary ./your_cublas_bench --kernel-filter gemm");
        }
    }

    println!();
    Ok(())
}
622
/// Return the first existing cuBLAS benchmark binary, preferring paths
/// under `$CARGO_TARGET_DIR` (when set) over the known absolute paths.
fn find_cublas_binary() -> Option<String> {
    let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();

    let mut candidates: Vec<String> = Vec::new();
    if !target_dir.is_empty() {
        for example in ["bench_cublas_vs_ptx", "gpu_batch_demo"] {
            candidates.push(format!("{target_dir}/release/examples/{example}"));
        }
    }
    candidates.push("/mnt/nvme-raid0/targets/trueno/release/examples/bench_cublas_vs_ptx".to_string());
    candidates.push("/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo".to_string());

    candidates
        .into_iter()
        .find(|path| std::path::Path::new(path).exists())
}
642
/// Wrapper around the NVIDIA Nsight Systems (`nsys`) command-line tracer.
pub struct NsysProfiler {
    /// Resolved filesystem path to the `nsys` executable.
    pub nsys_path: PathBuf,
}
649
/// Per-kernel timing summary parsed from nsys' textual statistics output.
#[derive(Debug, Clone)]
pub struct NsysKernelStat {
    /// Kernel name as reported by nsys (may contain spaces).
    pub name: String,
    /// Number of kernel launches observed.
    pub calls: u64,
    /// Cumulative GPU time across all launches, in microseconds.
    pub total_us: f64,
    /// Mean time per launch, in microseconds.
    pub avg_us: f64,
    /// Fastest single launch, in microseconds.
    pub min_us: f64,
    /// Slowest single launch, in microseconds.
    pub max_us: f64,
}
660
impl NsysProfiler {
    /// Locate `nsys` on PATH; returns `None` when Nsight Systems is absent.
    pub fn detect() -> Option<Self> {
        which::which("nsys")
            .ok()
            .map(|path| Self { nsys_path: path })
    }

    /// Profile `binary` under nsys and return per-kernel timing statistics.
    ///
    /// Writes a pid-suffixed report under /tmp, parses the `--stats=true`
    /// text summary, then removes the report artifacts best-effort.
    ///
    /// # Errors
    /// Fails only if the nsys process cannot be spawned; a trace with no
    /// kernels simply yields an empty vector.
    pub fn profile_binary(
        &self,
        binary: &str,
        binary_args: &[&str],
    ) -> Result<Vec<NsysKernelStat>> {
        let report_path = format!("/tmp/cgp-nsys-{}", std::process::id());

        let mut cmd = Command::new(&self.nsys_path);
        cmd.arg("profile")
            .arg("--stats=true")
            .arg("--force-overwrite=true")
            .arg("-o")
            .arg(&report_path)
            .arg(binary)
            .args(binary_args);

        let output = cmd.output().with_context(|| "Failed to run nsys")?;
        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        // The summary may land on either stream, so parse both concatenated.
        let combined = format!("{stdout}\n{stderr}");

        let stats = parse_nsys_stats(&combined);

        // Best-effort cleanup of report artifacts; failures are ignored.
        let _ = std::fs::remove_file(format!("{report_path}.nsys-rep"));
        let _ = std::fs::remove_file(format!("{report_path}.sqlite"));

        Ok(stats)
    }
}
699
700fn parse_nsys_stats(output: &str) -> Vec<NsysKernelStat> {
702 let mut stats = Vec::new();
703 let mut in_kernel_section = false;
704
705 for line in output.lines() {
706 if line.contains("CUDA Kernel Statistics") || line.contains("cuda_gpu_kern_sum") {
707 in_kernel_section = true;
708 continue;
709 }
710 if in_kernel_section && line.trim().is_empty() {
711 in_kernel_section = false;
712 continue;
713 }
714 if in_kernel_section {
715 let parts: Vec<&str> = line.split_whitespace().collect();
718 if parts.len() >= 8 {
719 if let (Ok(total_ns), Ok(instances)) = (
720 parts[1].replace(',', "").parse::<f64>(),
721 parts[2].replace(',', "").parse::<u64>(),
722 ) {
723 let avg_ns = parts[3].replace(',', "").parse::<f64>().unwrap_or(0.0);
724 let min_ns = parts[5].replace(',', "").parse::<f64>().unwrap_or(0.0);
725 let max_ns = parts[6].replace(',', "").parse::<f64>().unwrap_or(0.0);
726 let name = parts[7..].join(" ");
727 stats.push(NsysKernelStat {
728 name,
729 calls: instances,
730 total_us: total_ns / 1000.0,
731 avg_us: avg_ns / 1000.0,
732 min_us: min_ns / 1000.0,
733 max_us: max_ns / 1000.0,
734 });
735 }
736 }
737 }
738 }
739 stats
740}
741
742pub fn profile_binary(
744 path: &str,
745 kernel_filter: Option<&str>,
746 _trace: bool,
747 _duration: Option<&str>,
748) -> Result<()> {
749 println!("\n=== CGP Binary Profile: {path} ===\n");
750
751 let profiler = match NsysProfiler::detect() {
752 Some(p) => p,
753 None => {
754 println!(" nsys not found. Install NVIDIA Nsight Systems.");
755 println!(" Falling back to wall-clock timing...");
756
757 let start = std::time::Instant::now();
759 let status = Command::new(path).status()?;
760 let elapsed = start.elapsed();
761 println!(
762 " Wall time: {:.2}ms (exit code: {})",
763 elapsed.as_secs_f64() * 1000.0,
764 status.code().unwrap_or(-1)
765 );
766 return Ok(());
767 }
768 };
769
770 eprintln!("Running nsys profile (this may take a moment)...");
771 let stats = profiler.profile_binary(path, &[])?;
772
773 if stats.is_empty() {
774 println!(" No CUDA kernels detected. Binary may be CPU-only.");
775 println!(" For CPU profiling, use: cgp profile simd or cgp profile scalar");
776 } else {
777 println!(
778 " {:40} {:>8} {:>12} {:>12} {:>12}",
779 "Kernel", "Calls", "Avg (us)", "Min (us)", "Max (us)"
780 );
781 println!(" {}", "-".repeat(88));
782
783 for stat in &stats {
784 let name = if let Some(filter) = kernel_filter {
785 if !stat.name.contains(filter) {
786 continue;
787 }
788 &stat.name
789 } else {
790 &stat.name
791 };
792
793 let display_name = if name.len() > 40 {
794 format!("{}...", &name[..37])
795 } else {
796 name.to_string()
797 };
798
799 println!(
800 " {:40} {:>8} {:>12.1} {:>12.1} {:>12.1}",
801 display_name, stat.calls, stat.avg_us, stat.min_us, stat.max_us
802 );
803 }
804 println!();
805 println!(
806 " Total kernels: {}, Total CUDA time: {:.1} us",
807 stats.len(),
808 stats.iter().map(|s| s.total_us).sum::<f64>()
809 );
810 }
811
812 println!();
813 Ok(())
814}
815
/// Profile a Python command (given as argv) for CUDA activity via nsys.
///
/// Falls back to plain wall-clock timing when nsys is not installed.
///
/// # Errors
/// Fails when nsys is present but `args` is empty, or when spawning the
/// command / nsys fails.
pub fn profile_python(args: &[String]) -> Result<()> {
    let cmd_str = args.join(" ");
    println!("\n=== CGP Python Profile ===\n");
    println!(" Command: {cmd_str}");

    if let Some(profiler) = NsysProfiler::detect() {
        eprintln!("Running nsys profile on Python script...");
        // args[0] is the interpreter/launcher; the rest are forwarded as-is.
        let python_cmd = if args.is_empty() {
            anyhow::bail!(
                "No Python script specified. Usage: cgp profile python -- uv run python script.py"
            );
        } else {
            args[0].clone()
        };
        let python_args: Vec<&str> = args[1..].iter().map(|s| s.as_str()).collect();
        let stats = profiler.profile_binary(&python_cmd, &python_args)?;

        if stats.is_empty() {
            println!(" No CUDA kernels detected (CPU-only Python workload).");
            println!(" Use perf stat for CPU profiling.");
        } else {
            println!(" CUDA kernels captured via nsys:");
            for stat in &stats {
                println!(
                    " {} — {} calls, avg {:.1}us",
                    stat.name, stat.calls, stat.avg_us
                );
            }
        }
    } else {
        println!(" nsys not found — cannot capture CUDA kernels.");
        println!(" Falling back to wall-clock timing.");

        // Best-effort fallback: run the command directly and time it.
        // Note: empty `args` silently does nothing here (no error), unlike
        // the nsys branch above which bails.
        if !args.is_empty() {
            let start = std::time::Instant::now();
            let status = Command::new(&args[0]).args(&args[1..]).status()?;
            let elapsed = start.elapsed();
            println!(
                " Wall time: {:.2}ms (exit code: {})",
                elapsed.as_secs_f64() * 1000.0,
                status.code().unwrap_or(-1)
            );
        }
    }

    println!();
    Ok(())
}
866
/// Run a full nsys timeline trace (`cuda,nvtx,osrt`) on `binary`.
///
/// Prints nsys' statistics sections to stdout and leaves the `.nsys-rep`
/// report under /tmp for inspection with nsys-ui. `duration` is forwarded
/// to `nsys --duration` when supplied.
///
/// # Errors
/// Fails when nsys is not installed or cannot be spawned.
pub fn run_trace(binary: &str, duration: Option<&str>) -> Result<()> {
    println!("\n=== CGP System Trace: {binary} ===\n");

    let profiler = match NsysProfiler::detect() {
        Some(p) => p,
        None => {
            anyhow::bail!("nsys not found. Install NVIDIA Nsight Systems for timeline tracing.");
        }
    };

    // Pid-suffixed report path avoids collisions between concurrent runs.
    let report_path = format!("/tmp/cgp-trace-{}", std::process::id());

    let mut cmd = Command::new(&profiler.nsys_path);
    cmd.arg("profile")
        .arg("--stats=true")
        .arg("--force-overwrite=true")
        .arg("--trace=cuda,nvtx,osrt")
        .arg("-o")
        .arg(&report_path);

    if let Some(dur) = duration {
        cmd.arg("--duration").arg(dur);
    }

    cmd.arg(binary);

    eprintln!("Running nsys trace (this may take a while)...");
    let output = cmd.output().with_context(|| "Failed to run nsys trace")?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    let stderr = String::from_utf8_lossy(&output.stderr);

    // Echo only the statistics sections from nsys' combined output,
    // ending each section at the first blank line.
    let combined = format!("{stdout}\n{stderr}");
    let mut in_summary = false;
    for line in combined.lines() {
        if line.contains("CUDA API Statistics")
            || line.contains("CUDA Kernel Statistics")
            || line.contains("OS Runtime Statistics")
        {
            in_summary = true;
            println!(" {line}");
            continue;
        }
        if in_summary {
            if line.trim().is_empty() {
                in_summary = false;
                println!();
            } else {
                println!(" {line}");
            }
        }
    }

    // Point the user at the on-disk report if nsys produced one.
    let report_file = format!("{report_path}.nsys-rep");
    if std::path::Path::new(&report_file).exists() {
        println!(" Report: {report_file}");
        println!(" View: nsys-ui {report_file}");
    }

    println!();
    Ok(())
}
930
#[cfg(test)]
mod tests {
    use super::*;

    // Section-name mapping must stay in sync with ncu's CLI vocabulary.
    #[test]
    fn test_ncu_section_args() {
        assert_eq!(NcuSection::LaunchStats.as_ncu_arg(), "LaunchStats");
        assert_eq!(NcuSection::Roofline.as_ncu_arg(), "SpeedOfLight");
        assert_eq!(NcuSection::WarpState.as_ncu_arg(), "WarpStateStats");
    }

    // Only meaningful on hosts where ncu is installed; otherwise a no-op.
    #[test]
    fn test_ncu_command_build() {
        if let Some(profiler) = NcuProfiler::detect() {
            assert!(profiler.ncu_path.exists());
        }
    }

    #[test]
    fn test_parse_ncu_csv_empty() {
        let metrics = parse_ncu_csv("").unwrap();
        assert!(metrics.is_empty());
    }

    // Representative ncu CSV: header row plus quoted data rows.
    #[test]
    fn test_parse_ncu_csv_sample() {
        let csv = r#""ID","Metric Name","Metric Unit","Metric Value"
"0","sm__throughput.avg.pct_of_peak_sustained_elapsed","%","42.3"
"0","dram__throughput.avg.pct_of_peak_sustained_elapsed","%","7.8"
"0","launch__registers_per_thread","register/thread","48"
"0","gpu__time_duration.sum","nsecond","23200"
"#;
        let metrics = parse_ncu_csv(csv).unwrap();
        assert_eq!(
            get_f64(&metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed"),
            42.3
        );
        assert_eq!(get_u32(&metrics, "launch__registers_per_thread"), 48);
        assert_eq!(get_f64(&metrics, "gpu__time_duration.sum"), 23200.0);
    }

    // End-to-end conversion: raw metric strings → FullProfile fields.
    #[test]
    fn test_ncu_metrics_to_profile() {
        let mut metrics = HashMap::new();
        metrics.insert(
            "gpu__time_duration.sum".to_string(),
            "23200".to_string(), );
        metrics.insert(
            "sm__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
            "42.3".to_string(),
        );
        metrics.insert(
            "dram__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
            "7.8".to_string(),
        );
        metrics.insert("launch__registers_per_thread".to_string(), "48".to_string());
        metrics.insert(
            "smsp__thread_inst_executed_per_inst_executed.pct".to_string(),
            "100.0".to_string(),
        );

        let profile = ncu_metrics_to_profile(&metrics, "gemm_cta_wmma_fp16", 512);
        assert_eq!(profile.kernel.as_ref().unwrap().name, "gemm_cta_wmma_fp16");
        assert!(profile.timing.wall_clock_time_us > 0.0);
        assert!(profile.throughput.tflops > 0.0);
        assert!(profile.gpu_compute.is_some());
    }

    #[test]
    fn test_parse_nsys_stats_empty() {
        let stats = parse_nsys_stats("");
        assert!(stats.is_empty());
    }

    // Smoke test: rendering a minimal profile must not panic.
    #[test]
    fn test_render_profile_no_panic() {
        let profile = FullProfile {
            version: "2.0".to_string(),
            hardware: HardwareInfo {
                gpu: Some("Test GPU".to_string()),
                gpu_sm: Some("8.9".to_string()),
                ..Default::default()
            },
            kernel: Some(KernelInfo {
                name: "test_kernel".to_string(),
                dimensions: vec![256, 256, 256],
                ..Default::default()
            }),
            timing: TimingMetrics {
                wall_clock_time_us: 10.0,
                samples: 1,
                ..Default::default()
            },
            throughput: ThroughputMetrics {
                tflops: 5.0,
                ..Default::default()
            },
            gpu_compute: Some(GpuComputeMetrics {
                sm_utilization_pct: 40.0,
                warp_execution_efficiency_pct: 100.0,
                achieved_occupancy_pct: 33.0,
                register_usage_per_thread: 48,
                ..Default::default()
            }),
            gpu_memory: Some(GpuMemoryMetrics {
                dram_throughput_pct: 7.8,
                l2_hit_rate_pct: 87.0,
                global_load_efficiency_pct: 72.0,
                ..Default::default()
            }),
            ..Default::default()
        };
        render_profile(&profile);
    }
}