1use crate::analysis::roofline::{Bound, MemoryLevel, Precision, RooflineModel};
5use crate::metrics::catalog::*;
6use crate::profilers::system;
7use anyhow::{Context, Result};
8use std::collections::HashMap;
9use std::path::PathBuf;
10use std::process::Command;
11
12#[derive(Debug, Clone, Copy)]
14pub enum NcuSection {
15 LaunchStats,
16 ComputeThroughput,
17 MemoryThroughput,
18 Occupancy,
19 Roofline,
20 WarpState,
21}
22
23impl NcuSection {
24 fn as_ncu_arg(&self) -> &str {
25 match self {
26 NcuSection::LaunchStats => "LaunchStats",
27 NcuSection::ComputeThroughput => "ComputeWorkloadAnalysis",
28 NcuSection::MemoryThroughput => "MemoryWorkloadAnalysis",
29 NcuSection::Occupancy => "Occupancy",
30 NcuSection::Roofline => "SpeedOfLight",
31 NcuSection::WarpState => "WarpStateStats",
32 }
33 }
34}
35
36pub struct NcuProfiler {
38 pub ncu_path: PathBuf,
39 pub sections: Vec<NcuSection>,
40}
41
42impl NcuProfiler {
43 pub fn detect() -> Option<Self> {
44 which::which("ncu").ok().map(|path| Self {
45 ncu_path: path,
46 sections: vec![
47 NcuSection::LaunchStats,
48 NcuSection::Roofline,
49 NcuSection::ComputeThroughput,
50 NcuSection::MemoryThroughput,
51 NcuSection::Occupancy,
52 ],
53 })
54 }
55
56 pub fn profile(
58 &self,
59 binary: &str,
60 binary_args: &[&str],
61 kernel_regex: &str,
62 ) -> Result<HashMap<String, String>> {
63 let mut cmd = Command::new(&self.ncu_path);
64 cmd.arg("--target-processes").arg("all");
65 cmd.arg("--kernel-name-base").arg("demangled");
66
67 if !kernel_regex.is_empty() {
68 cmd.arg("--kernel-id")
69 .arg(format!("::regex:{kernel_regex}:"));
70 }
71
72 for section in &self.sections {
73 cmd.arg("--section").arg(section.as_ncu_arg());
74 }
75
76 cmd.arg("--csv");
77 cmd.arg("--log-file").arg("/dev/null");
78 cmd.arg(binary);
79 cmd.args(binary_args);
80
81 let output = cmd
82 .output()
83 .with_context(|| format!("Failed to run ncu: {}", self.ncu_path.display()))?;
84
85 if !output.status.success() {
86 let stderr = String::from_utf8_lossy(&output.stderr);
87 if stderr.contains("permission") || stderr.contains("ERR_NVGPUCTRPERM") {
89 anyhow::bail!(
90 "ncu requires elevated permissions. Run with:\n \
91 sudo cgp profile kernel ...\n \
92 or set: sudo sysctl kernel.perf_event_paranoid=2"
93 );
94 }
95 anyhow::bail!("ncu failed (exit {}): {}", output.status, stderr.trim());
96 }
97
98 let stdout = String::from_utf8_lossy(&output.stdout);
99 parse_ncu_csv(&stdout)
100 }
101}
102
103pub fn parse_ncu_csv(csv: &str) -> Result<HashMap<String, String>> {
106 let mut metrics = HashMap::new();
107 for line in csv.lines() {
108 if line.starts_with('"') && !line.starts_with("\"ID\"") {
110 let fields: Vec<&str> = line.split(',').collect();
111 if fields.len() >= 4 {
112 let name = fields[1].trim_matches('"').to_string();
113 let value = fields[3].trim_matches('"').to_string();
114 metrics.insert(name, value);
115 }
116 }
117 }
118 Ok(metrics)
119}
120
121fn get_f64(metrics: &HashMap<String, String>, key: &str) -> f64 {
123 metrics
124 .get(key)
125 .and_then(|v| v.replace(',', "").parse::<f64>().ok())
126 .unwrap_or(0.0)
127}
128
129#[allow(dead_code)]
131fn get_u64(metrics: &HashMap<String, String>, key: &str) -> u64 {
132 metrics
133 .get(key)
134 .and_then(|v| v.replace(',', "").parse::<u64>().ok())
135 .unwrap_or(0)
136}
137
138fn get_u32(metrics: &HashMap<String, String>, key: &str) -> u32 {
140 metrics
141 .get(key)
142 .and_then(|v| v.replace(',', "").parse::<u32>().ok())
143 .unwrap_or(0)
144}
145
146#[allow(clippy::implicit_hasher)]
148pub fn ncu_metrics_to_profile(
149 metrics: &HashMap<String, String>,
150 kernel_name: &str,
151 size: u32,
152) -> FullProfile {
153 let duration_us = get_f64(metrics, "gpu__time_duration.sum") / 1000.0; let flops = 2.0 * (size as f64).powi(3); let tflops = if duration_us > 0.0 {
156 flops / (duration_us * 1e-6) / 1e12
157 } else {
158 0.0
159 };
160
161 let sm_pct = get_f64(metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed");
162 let dram_pct = get_f64(
163 metrics,
164 "dram__throughput.avg.pct_of_peak_sustained_elapsed",
165 );
166 let occupancy_pct = get_f64(
167 metrics,
168 "sm__warps_active.avg.pct_of_peak_sustained_elapsed",
169 );
170 let warp_eff = get_f64(metrics, "smsp__thread_inst_executed_per_inst_executed.pct");
171 let tc_pct = get_f64(
172 metrics,
173 "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
174 );
175 let regs = get_u32(metrics, "launch__registers_per_thread");
176 let smem = get_u32(metrics, "launch__shared_mem_per_block_driver");
177 let l2_hit = get_f64(metrics, "lts__t_sector_hit_rate.pct");
178 let global_load_eff = get_f64(
179 metrics,
180 "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
181 );
182
183 let model = RooflineModel::rtx_4090();
185 let ai = if duration_us > 0.0 {
186 let dram_bw = 1008.0e9; let actual_bw = dram_bw * dram_pct / 100.0;
189 let bytes = actual_bw * duration_us * 1e-6;
190 if bytes > 0.0 {
191 flops / bytes
192 } else {
193 0.0
194 }
195 } else {
196 0.0
197 };
198
199 let roofline = model
200 .classify(ai, tflops * 1e12, Precision::Fp16, MemoryLevel::Dram)
201 .map(|point| {
202 let bound_str = match &point.bound {
203 Bound::Memory { .. } => "memory".to_string(),
204 Bound::Compute { .. } => "compute".to_string(),
205 };
206 RooflineMetrics {
207 peak_compute_tflops: 330.0,
208 peak_bandwidth_gbps: 1008.0,
209 ridge_point: 327.4,
210 bound: bound_str,
211 efficiency_pct: point.efficiency,
212 distance_to_ridge: point.distance_to_ridge,
213 }
214 });
215
216 let timestamp = chrono::Utc::now().to_rfc3339();
217
218 FullProfile {
219 version: "2.0".to_string(),
220 timestamp,
221 hardware: HardwareInfo {
222 gpu: Some("NVIDIA GeForce RTX 4090".to_string()),
223 gpu_sm: Some("8.9".to_string()),
224 gpu_memory_gb: Some(24.0),
225 gpu_bandwidth_gbps: Some(1008.0),
226 ..Default::default()
227 },
228 kernel: Some(KernelInfo {
229 name: kernel_name.to_string(),
230 dimensions: vec![size, size, size],
231 shared_memory_bytes: Some(smem),
232 registers_per_thread: Some(regs),
233 ..Default::default()
234 }),
235 timing: TimingMetrics {
236 wall_clock_time_us: duration_us,
237 samples: 1,
238 ..Default::default()
239 },
240 throughput: ThroughputMetrics {
241 tflops,
242 bandwidth_gbps: 1008.0 * dram_pct / 100.0,
243 arithmetic_intensity: ai,
244 ..Default::default()
245 },
246 roofline,
247 gpu_compute: Some(GpuComputeMetrics {
248 sm_utilization_pct: sm_pct,
249 achieved_occupancy_pct: occupancy_pct,
250 warp_execution_efficiency_pct: warp_eff,
251 tensor_core_utilization_pct: tc_pct,
252 register_usage_per_thread: regs,
253 shared_memory_per_block: smem,
254 ..Default::default()
255 }),
256 gpu_memory: Some(GpuMemoryMetrics {
257 dram_throughput_pct: dram_pct,
258 l2_hit_rate_pct: l2_hit,
259 global_load_efficiency_pct: global_load_eff,
260 ..Default::default()
261 }),
262 system_health: system::collect_system_health(),
263 vram: system::collect_vram(),
264 energy: system::collect_system_health()
265 .and_then(|h| system::compute_energy(h.gpu_power_watts, tflops, duration_us)),
266 ..Default::default()
267 }
268}
269
270fn render_profile(profile: &FullProfile) {
272 render_profile_header(profile);
273 render_profile_execution(profile);
274 render_profile_roofline(profile);
275 render_profile_compute(profile);
276 render_profile_memory(profile);
277 render_profile_vram(profile);
278 render_profile_system_health(profile);
279 render_profile_energy(profile);
280 println!();
281}
282
283fn render_profile_header(profile: &FullProfile) {
284 let kernel = profile.kernel.as_ref().map_or("unknown", |k| &k.name);
285 let dims = profile
286 .kernel
287 .as_ref()
288 .map(|k| {
289 k.dimensions
290 .iter()
291 .map(|d| d.to_string())
292 .collect::<Vec<_>>()
293 .join("x")
294 })
295 .unwrap_or_default();
296 let gpu_name = profile.hardware.gpu.as_deref().unwrap_or("Unknown GPU");
297 let sm = profile.hardware.gpu_sm.as_deref().unwrap_or("?");
298
299 println!("\n=== CGP Kernel Profile: {kernel} ({dims}) ===\n");
300 println!(
301 "Backend: CUDA ({gpu_name}, SM {sm}, Driver {})",
302 detect_driver_version().unwrap_or_else(|| "?".to_string())
303 );
304}
305
306fn render_profile_execution(profile: &FullProfile) {
307 let t = &profile.timing;
308 let tp = &profile.throughput;
309 let peak_pct = if tp.tflops > 0.0 {
310 tp.tflops / 330.0 * 100.0
311 } else {
312 0.0
313 };
314 println!(
315 "Execution: {:.1} us | {:.1} TFLOP/s | {:.1}% of peak",
316 t.wall_clock_time_us, tp.tflops, peak_pct
317 );
318}
319
320fn render_profile_roofline(profile: &FullProfile) {
321 let Some(roof) = &profile.roofline else {
322 return;
323 };
324 let tp = &profile.throughput;
325 println!("\n Roofline Position:");
326 println!(
327 " Arithmetic Intensity: {:.1} FLOP/byte",
328 tp.arithmetic_intensity
329 );
330 println!(" Ridge Point: {:.1} FLOP/byte", roof.ridge_point);
331 let status = if roof.bound == "memory" {
332 format!("MEMORY-BOUND ({:.1}x below ridge)", roof.distance_to_ridge)
333 } else {
334 format!("COMPUTE-BOUND ({:.1}% efficiency)", roof.efficiency_pct)
335 };
336 println!(" Status: {status}");
337}
338
339fn render_profile_compute(profile: &FullProfile) {
340 let Some(gc) = &profile.gpu_compute else {
341 return;
342 };
343 println!("\n Compute:");
344 if gc.tensor_core_utilization_pct > 0.0 {
345 println!(
346 " Tensor core utilization: {:5.1}% {}",
347 gc.tensor_core_utilization_pct,
348 quality_badge(gc.tensor_core_utilization_pct, 50.0)
349 );
350 }
351 println!(
352 " Warp execution eff: {:5.1}% {}",
353 gc.warp_execution_efficiency_pct,
354 quality_badge(gc.warp_execution_efficiency_pct, 95.0)
355 );
356 println!(" SM utilization: {:5.1}%", gc.sm_utilization_pct);
357 println!(
358 " Achieved occupancy: {:5.1}%",
359 gc.achieved_occupancy_pct
360 );
361 println!(
362 " Register usage: {:3}/255",
363 gc.register_usage_per_thread
364 );
365 if gc.shared_memory_per_block > 0 {
366 println!(
367 " Shared memory/block: {:5} bytes",
368 gc.shared_memory_per_block
369 );
370 }
371}
372
373fn render_profile_memory(profile: &FullProfile) {
374 let Some(gm) = &profile.gpu_memory else {
375 return;
376 };
377 println!("\n Memory:");
378 println!(
379 " DRAM throughput: {:5.1}% of peak ({:.1} GB/s)",
380 gm.dram_throughput_pct,
381 1008.0 * gm.dram_throughput_pct / 100.0
382 );
383 println!(
384 " Global load coalescing: {:5.1}% {}",
385 gm.global_load_efficiency_pct,
386 quality_badge(gm.global_load_efficiency_pct, 60.0)
387 );
388 println!(
389 " L2 hit rate: {:5.1}% {}",
390 gm.l2_hit_rate_pct,
391 quality_badge(gm.l2_hit_rate_pct, 50.0)
392 );
393}
394
395fn render_profile_vram(profile: &FullProfile) {
396 let Some(vram) = &profile.vram else {
397 return;
398 };
399 println!("\n VRAM:");
400 println!(
401 " Used: {:.0} / {:.0} MB ({:.1}%)",
402 vram.vram_used_mb, vram.vram_total_mb, vram.vram_utilization_pct
403 );
404}
405
406fn render_profile_system_health(profile: &FullProfile) {
407 let Some(health) = &profile.system_health else {
408 return;
409 };
410 println!("\n System Health:");
411 println!(" GPU temp: {:.0}°C", health.gpu_temperature_celsius);
412 println!(" GPU power: {:.0} W", health.gpu_power_watts);
413 println!(
414 " GPU clock: {:.0} MHz (mem: {:.0} MHz)",
415 health.gpu_clock_mhz, health.gpu_memory_clock_mhz
416 );
417 if health.cpu_frequency_mhz > 0.0 {
418 println!(" CPU freq: {:.0} MHz", health.cpu_frequency_mhz);
419 }
420}
421
422fn render_profile_energy(profile: &FullProfile) {
423 let Some(energy) = &profile.energy else {
424 return;
425 };
426 println!("\n Energy:");
427 println!(
428 " Efficiency: {:.4} TFLOP/s per watt",
429 energy.tflops_per_watt
430 );
431 println!(
432 " Energy: {:.6} J per inference",
433 energy.joules_per_inference
434 );
435}
436
437fn quality_badge(value: f64, threshold: f64) -> &'static str {
438 if value >= threshold {
439 "[OK]"
440 } else {
441 "[WARN]"
442 }
443}
444
445fn detect_driver_version() -> Option<String> {
447 Command::new("nvidia-smi")
448 .args(["--query-gpu=driver_version", "--format=csv,noheader"])
449 .output()
450 .ok()
451 .filter(|o| o.status.success())
452 .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
453}
454
455fn detect_gpu_name() -> Option<String> {
457 Command::new("nvidia-smi")
458 .args(["--query-gpu=name", "--format=csv,noheader"])
459 .output()
460 .ok()
461 .filter(|o| o.status.success())
462 .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
463}
464
465pub fn profile_kernel(name: &str, size: u32, roofline: bool, _metrics: Option<&str>) -> Result<()> {
469 let profiler = match NcuProfiler::detect() {
470 Some(mut p) => {
471 if roofline {
472 p.sections = vec![
474 NcuSection::LaunchStats,
475 NcuSection::Roofline,
476 NcuSection::ComputeThroughput,
477 NcuSection::MemoryThroughput,
478 NcuSection::Occupancy,
479 ];
480 }
481 p
482 }
483 None => {
484 println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
486 println!(" ncu not found. Showing static analysis only.\n");
487 let model = RooflineModel::rtx_4090();
488 let flops = 2.0 * (size as f64).powi(3);
489 println!(" Expected FLOPs: {:.2e}", flops);
490 if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
491 println!(" FP16 ridge point: {:.1} FLOP/byte", ridge);
492 }
493 println!(" Install NVIDIA Nsight Compute for runtime profiling.");
494 return Ok(());
495 }
496 };
497
498 let binary = find_kernel_binary(name);
500
501 match binary {
502 Some((bin_path, bin_args)) => {
503 eprintln!("Profiling {name} via ncu (this may take a moment)...");
504 let metrics = profiler.profile(
505 &bin_path,
506 &bin_args.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
507 name,
508 )?;
509 let profile = ncu_metrics_to_profile(&metrics, name, size);
510 render_profile(&profile);
511
512 let json_path = format!("/tmp/cgp-{name}-{size}.json");
514 let json = serde_json::to_string_pretty(&profile)?;
515 std::fs::write(&json_path, &json)?;
516 println!(" Profile exported: {json_path}");
517 }
518 None => {
519 println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
521 println!(" Backend: CUDA (ncu at {})", profiler.ncu_path.display());
522 println!(
523 " GPU: {}",
524 detect_gpu_name().unwrap_or_else(|| "Unknown".to_string())
525 );
526 println!(
527 " Driver: {}",
528 detect_driver_version().unwrap_or_else(|| "Unknown".to_string())
529 );
530 println!("\n No binary found for kernel '{name}'.");
531 println!(" To profile a specific binary, use:");
532 println!(" cgp profile binary ./your_binary --kernel-filter {name}");
533 println!();
534
535 if roofline {
537 let model = RooflineModel::rtx_4090();
538 let flops = 2.0 * (size as f64).powi(3);
539 println!(" Roofline analysis (static):");
540 println!(" Problem FLOPs: {:.2e}", flops);
541 if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
542 println!(" FP16 ridge: {:.1} FLOP/byte", ridge);
543 }
544 println!();
545 }
546 }
547 }
548
549 Ok(())
550}
551
552fn find_kernel_binary(_kernel_name: &str) -> Option<(String, Vec<String>)> {
554 let candidates = [
556 "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo",
557 "./target/release/examples/gpu_batch_demo",
558 "/mnt/nvme-raid0/targets/trueno/release/examples/wgpu_backward_demo",
559 "./target/release/examples/wgpu_backward_demo",
560 ];
561 for path in &candidates {
562 if std::path::Path::new(path).exists() {
563 return Some((path.to_string(), vec![]));
564 }
565 }
566 None
567}
568
569pub fn profile_cublas(op: &str, size: u32) -> Result<()> {
571 println!("\n=== CGP cuBLAS Profile: {op} ({size}x{size}) ===\n");
572
573 let gpu_name = detect_gpu_name().unwrap_or_else(|| "Unknown GPU".to_string());
574 let driver = detect_driver_version().unwrap_or_else(|| "?".to_string());
575 println!(" GPU: {gpu_name} (Driver {driver})");
576
577 let flops = 2.0 * (size as f64).powi(3);
579 println!(" Operation: {op}");
580 println!(" Problem size: {size}x{size}x{size}");
581 println!(" FLOPs: {flops:.2e}");
582
583 let (peak_tflops, precision) = match op {
585 "gemm_f16" | "hgemm" => (330.0, "FP16 Tensor"),
586 "gemm_f32" | "sgemm" => (82.6, "FP32"),
587 "gemm_tf32" => (165.0, "TF32 Tensor"),
588 _ => (82.6, "FP32 (default)"),
589 };
590 println!(" Precision: {precision} (peak: {peak_tflops:.1} TFLOP/s)");
591
592 let cublas_bin = find_cublas_binary();
594 match cublas_bin {
595 Some(bin) => {
596 println!(" Binary: {bin}");
597 if let Some(nsys) = NsysProfiler::detect() {
598 println!(" Profiling via nsys...");
599 match nsys.profile_binary(&bin, &[]) {
600 Ok(stats) => {
601 let cublas_kernels: Vec<_> = stats
603 .iter()
604 .filter(|s| {
605 s.name.contains("gemm")
606 || s.name.contains("cublas")
607 || s.name.contains("Gemm")
608 })
609 .collect();
610 if cublas_kernels.is_empty() {
611 println!(" No cuBLAS kernels found in trace.");
612 } else {
613 println!("\n cuBLAS Kernels:");
614 for k in &cublas_kernels {
615 let tflops = flops / (k.avg_us * 1e-6) / 1e12;
616 let eff = tflops / peak_tflops * 100.0;
617 println!(
618 " {:50} avg={:.1}us {:.1} TFLOP/s ({:.1}% eff)",
619 k.name, k.avg_us, tflops, eff
620 );
621 }
622 }
623 }
624 Err(e) => println!(" nsys profiling failed: {e}"),
625 }
626 }
627 }
628 None => {
629 let est_time_us = flops / (peak_tflops * 0.9 * 1e12) * 1e6; let est_tflops = flops / (est_time_us * 1e-6) / 1e12;
632 println!("\n Estimated (no binary found):");
633 println!(" Time: ~{est_time_us:.1} us");
634 println!(
635 " Throughput: ~{est_tflops:.1} TFLOP/s ({:.1}% of peak)",
636 est_tflops / peak_tflops * 100.0
637 );
638 println!("\n To profile real cuBLAS:");
639 println!(" cgp profile binary ./your_cublas_bench --kernel-filter gemm");
640 }
641 }
642
643 println!();
644 Ok(())
645}
646
647fn find_cublas_binary() -> Option<String> {
649 let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
650 let mut candidates: Vec<String> = Vec::new();
651 if !target_dir.is_empty() {
652 candidates.push(format!("{target_dir}/release/examples/bench_cublas_vs_ptx"));
653 candidates.push(format!("{target_dir}/release/examples/gpu_batch_demo"));
654 }
655 candidates.extend_from_slice(&[
656 "/mnt/nvme-raid0/targets/trueno/release/examples/bench_cublas_vs_ptx".to_string(),
657 "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo".to_string(),
658 ]);
659 for path in &candidates {
660 if std::path::Path::new(path).exists() {
661 return Some(path.clone());
662 }
663 }
664 None
665}
666
667pub struct NsysProfiler {
671 pub nsys_path: PathBuf,
672}
673
674#[derive(Debug, Clone)]
676pub struct NsysKernelStat {
677 pub name: String,
678 pub calls: u64,
679 pub total_us: f64,
680 pub avg_us: f64,
681 pub min_us: f64,
682 pub max_us: f64,
683}
684
685impl NsysProfiler {
686 pub fn detect() -> Option<Self> {
687 which::which("nsys")
688 .ok()
689 .map(|path| Self { nsys_path: path })
690 }
691
692 pub fn profile_binary(
694 &self,
695 binary: &str,
696 binary_args: &[&str],
697 ) -> Result<Vec<NsysKernelStat>> {
698 let report_path = format!("/tmp/cgp-nsys-{}", std::process::id());
699
700 let mut cmd = Command::new(&self.nsys_path);
701 cmd.arg("profile")
702 .arg("--stats=true")
703 .arg("--force-overwrite=true")
704 .arg("-o")
705 .arg(&report_path)
706 .arg(binary)
707 .args(binary_args);
708
709 let output = cmd.output().with_context(|| "Failed to run nsys")?;
710 let stdout = String::from_utf8_lossy(&output.stdout);
711 let stderr = String::from_utf8_lossy(&output.stderr);
712 let combined = format!("{stdout}\n{stderr}");
713
714 let stats = parse_nsys_stats(&combined);
715
716 let _ = std::fs::remove_file(format!("{report_path}.nsys-rep"));
718 let _ = std::fs::remove_file(format!("{report_path}.sqlite"));
719
720 Ok(stats)
721 }
722}
723
724fn parse_nsys_stats(output: &str) -> Vec<NsysKernelStat> {
726 let mut stats = Vec::new();
727 let mut in_kernel_section = false;
728
729 for line in output.lines() {
730 if line.contains("CUDA Kernel Statistics") || line.contains("cuda_gpu_kern_sum") {
731 in_kernel_section = true;
732 continue;
733 }
734 if in_kernel_section && line.trim().is_empty() {
735 in_kernel_section = false;
736 continue;
737 }
738 if in_kernel_section {
739 let parts: Vec<&str> = line.split_whitespace().collect();
742 if parts.len() >= 8 {
743 if let (Ok(total_ns), Ok(instances)) = (
744 parts[1].replace(',', "").parse::<f64>(),
745 parts[2].replace(',', "").parse::<u64>(),
746 ) {
747 let avg_ns = parts[3].replace(',', "").parse::<f64>().unwrap_or(0.0);
748 let min_ns = parts[5].replace(',', "").parse::<f64>().unwrap_or(0.0);
749 let max_ns = parts[6].replace(',', "").parse::<f64>().unwrap_or(0.0);
750 let name = parts[7..].join(" ");
751 stats.push(NsysKernelStat {
752 name,
753 calls: instances,
754 total_us: total_ns / 1000.0,
755 avg_us: avg_ns / 1000.0,
756 min_us: min_ns / 1000.0,
757 max_us: max_ns / 1000.0,
758 });
759 }
760 }
761 }
762 }
763 stats
764}
765
766pub fn profile_binary(
768 path: &str,
769 kernel_filter: Option<&str>,
770 _trace: bool,
771 _duration: Option<&str>,
772) -> Result<()> {
773 println!("\n=== CGP Binary Profile: {path} ===\n");
774
775 let profiler = match NsysProfiler::detect() {
776 Some(p) => p,
777 None => {
778 println!(" nsys not found. Install NVIDIA Nsight Systems.");
779 println!(" Falling back to wall-clock timing...");
780
781 let start = std::time::Instant::now();
783 let status = Command::new(path).status()?;
784 let elapsed = start.elapsed();
785 println!(
786 " Wall time: {:.2}ms (exit code: {})",
787 elapsed.as_secs_f64() * 1000.0,
788 status.code().unwrap_or(-1)
789 );
790 return Ok(());
791 }
792 };
793
794 eprintln!("Running nsys profile (this may take a moment)...");
795 let stats = profiler.profile_binary(path, &[])?;
796
797 if stats.is_empty() {
798 println!(" No CUDA kernels detected. Binary may be CPU-only.");
799 println!(" For CPU profiling, use: cgp profile simd or cgp profile scalar");
800 } else {
801 println!(
802 " {:40} {:>8} {:>12} {:>12} {:>12}",
803 "Kernel", "Calls", "Avg (us)", "Min (us)", "Max (us)"
804 );
805 println!(" {}", "-".repeat(88));
806
807 for stat in &stats {
808 let name = if let Some(filter) = kernel_filter {
809 if !stat.name.contains(filter) {
810 continue;
811 }
812 &stat.name
813 } else {
814 &stat.name
815 };
816
817 let display_name = if name.len() > 40 {
818 format!("{}...", &name[..37])
819 } else {
820 name.to_string()
821 };
822
823 println!(
824 " {:40} {:>8} {:>12.1} {:>12.1} {:>12.1}",
825 display_name, stat.calls, stat.avg_us, stat.min_us, stat.max_us
826 );
827 }
828 println!();
829 println!(
830 " Total kernels: {}, Total CUDA time: {:.1} us",
831 stats.len(),
832 stats.iter().map(|s| s.total_us).sum::<f64>()
833 );
834 }
835
836 println!();
837 Ok(())
838}
839
840pub fn profile_python(args: &[String]) -> Result<()> {
842 let cmd_str = args.join(" ");
843 println!("\n=== CGP Python Profile ===\n");
844 println!(" Command: {cmd_str}");
845
846 if let Some(profiler) = NsysProfiler::detect() {
847 eprintln!("Running nsys profile on Python script...");
848 let python_cmd = if args.is_empty() {
850 anyhow::bail!(
851 "No Python script specified. Usage: cgp profile python -- uv run python script.py"
852 );
853 } else {
854 args[0].clone()
855 };
856 let python_args: Vec<&str> = args[1..].iter().map(|s| s.as_str()).collect();
857 let stats = profiler.profile_binary(&python_cmd, &python_args)?;
858
859 if stats.is_empty() {
860 println!(" No CUDA kernels detected (CPU-only Python workload).");
861 println!(" Use perf stat for CPU profiling.");
862 } else {
863 println!(" CUDA kernels captured via nsys:");
864 for stat in &stats {
865 println!(
866 " {} — {} calls, avg {:.1}us",
867 stat.name, stat.calls, stat.avg_us
868 );
869 }
870 }
871 } else {
872 println!(" nsys not found — cannot capture CUDA kernels.");
873 println!(" Falling back to wall-clock timing.");
874
875 if !args.is_empty() {
876 let start = std::time::Instant::now();
877 let status = Command::new(&args[0]).args(&args[1..]).status()?;
878 let elapsed = start.elapsed();
879 println!(
880 " Wall time: {:.2}ms (exit code: {})",
881 elapsed.as_secs_f64() * 1000.0,
882 status.code().unwrap_or(-1)
883 );
884 }
885 }
886
887 println!();
888 Ok(())
889}
890
891pub fn run_trace(binary: &str, duration: Option<&str>) -> Result<()> {
893 println!("\n=== CGP System Trace: {binary} ===\n");
894
895 let Some(profiler) = NsysProfiler::detect() else {
896 anyhow::bail!("nsys not found. Install NVIDIA Nsight Systems for timeline tracing.");
897 };
898
899 let report_path = format!("/tmp/cgp-trace-{}", std::process::id());
900 let cmd = build_nsys_trace_command(&profiler.nsys_path, &report_path, duration, binary);
901 let (stdout, stderr) = run_nsys_trace_command(cmd)?;
902
903 print_nsys_summary(&stdout, &stderr);
904 print_nsys_report_file(&report_path);
905
906 println!();
907 Ok(())
908}
909
910fn build_nsys_trace_command(
911 nsys_path: &std::path::Path,
912 report_path: &str,
913 duration: Option<&str>,
914 binary: &str,
915) -> Command {
916 let mut cmd = Command::new(nsys_path);
917 cmd.arg("profile")
918 .arg("--stats=true")
919 .arg("--force-overwrite=true")
920 .arg("--trace=cuda,nvtx,osrt")
921 .arg("-o")
922 .arg(report_path);
923 if let Some(dur) = duration {
924 cmd.arg("--duration").arg(dur);
925 }
926 cmd.arg(binary);
927 cmd
928}
929
930fn run_nsys_trace_command(mut cmd: Command) -> Result<(String, String)> {
931 eprintln!("Running nsys trace (this may take a while)...");
932 let output = cmd.output().with_context(|| "Failed to run nsys trace")?;
933 Ok((
934 String::from_utf8_lossy(&output.stdout).to_string(),
935 String::from_utf8_lossy(&output.stderr).to_string(),
936 ))
937}
938
939fn print_nsys_summary(stdout: &str, stderr: &str) {
940 let combined = format!("{stdout}\n{stderr}");
941 let mut in_summary = false;
942 for line in combined.lines() {
943 if is_nsys_summary_header(line) {
944 in_summary = true;
945 println!(" {line}");
946 continue;
947 }
948 if !in_summary {
949 continue;
950 }
951 if line.trim().is_empty() {
952 in_summary = false;
953 println!();
954 } else {
955 println!(" {line}");
956 }
957 }
958}
959
960fn is_nsys_summary_header(line: &str) -> bool {
961 line.contains("CUDA API Statistics")
962 || line.contains("CUDA Kernel Statistics")
963 || line.contains("OS Runtime Statistics")
964}
965
966fn print_nsys_report_file(report_path: &str) {
967 let report_file = format!("{report_path}.nsys-rep");
968 if std::path::Path::new(&report_file).exists() {
969 println!(" Report: {report_file}");
970 println!(" View: nsys-ui {report_file}");
971 }
972}
973
974#[cfg(test)]
975mod tests {
976 use super::*;
977
978 #[test]
979 fn test_ncu_section_args() {
980 assert_eq!(NcuSection::LaunchStats.as_ncu_arg(), "LaunchStats");
981 assert_eq!(NcuSection::Roofline.as_ncu_arg(), "SpeedOfLight");
982 assert_eq!(NcuSection::WarpState.as_ncu_arg(), "WarpStateStats");
983 }
984
985 #[test]
986 fn test_ncu_command_build() {
987 if let Some(profiler) = NcuProfiler::detect() {
988 assert!(profiler.ncu_path.exists());
989 }
990 }
991
992 #[test]
993 fn test_parse_ncu_csv_empty() {
994 let metrics = parse_ncu_csv("").unwrap();
995 assert!(metrics.is_empty());
996 }
997
998 #[test]
999 fn test_parse_ncu_csv_sample() {
1000 let csv = r#""ID","Metric Name","Metric Unit","Metric Value"
1001"0","sm__throughput.avg.pct_of_peak_sustained_elapsed","%","42.3"
1002"0","dram__throughput.avg.pct_of_peak_sustained_elapsed","%","7.8"
1003"0","launch__registers_per_thread","register/thread","48"
1004"0","gpu__time_duration.sum","nsecond","23200"
1005"#;
1006 let metrics = parse_ncu_csv(csv).unwrap();
1007 assert_eq!(
1008 get_f64(&metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed"),
1009 42.3
1010 );
1011 assert_eq!(get_u32(&metrics, "launch__registers_per_thread"), 48);
1012 assert_eq!(get_f64(&metrics, "gpu__time_duration.sum"), 23200.0);
1013 }
1014
1015 #[test]
1016 fn test_ncu_metrics_to_profile() {
1017 let mut metrics = HashMap::new();
1018 metrics.insert(
1019 "gpu__time_duration.sum".to_string(),
1020 "23200".to_string(), );
1022 metrics.insert(
1023 "sm__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
1024 "42.3".to_string(),
1025 );
1026 metrics.insert(
1027 "dram__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
1028 "7.8".to_string(),
1029 );
1030 metrics.insert("launch__registers_per_thread".to_string(), "48".to_string());
1031 metrics.insert(
1032 "smsp__thread_inst_executed_per_inst_executed.pct".to_string(),
1033 "100.0".to_string(),
1034 );
1035
1036 let profile = ncu_metrics_to_profile(&metrics, "gemm_cta_wmma_fp16", 512);
1037 assert_eq!(profile.kernel.as_ref().unwrap().name, "gemm_cta_wmma_fp16");
1038 assert!(profile.timing.wall_clock_time_us > 0.0);
1039 assert!(profile.throughput.tflops > 0.0);
1040 assert!(profile.gpu_compute.is_some());
1041 }
1042
1043 #[test]
1044 fn test_parse_nsys_stats_empty() {
1045 let stats = parse_nsys_stats("");
1046 assert!(stats.is_empty());
1047 }
1048
1049 #[test]
1050 fn test_render_profile_no_panic() {
1051 let profile = FullProfile {
1052 version: "2.0".to_string(),
1053 hardware: HardwareInfo {
1054 gpu: Some("Test GPU".to_string()),
1055 gpu_sm: Some("8.9".to_string()),
1056 ..Default::default()
1057 },
1058 kernel: Some(KernelInfo {
1059 name: "test_kernel".to_string(),
1060 dimensions: vec![256, 256, 256],
1061 ..Default::default()
1062 }),
1063 timing: TimingMetrics {
1064 wall_clock_time_us: 10.0,
1065 samples: 1,
1066 ..Default::default()
1067 },
1068 throughput: ThroughputMetrics {
1069 tflops: 5.0,
1070 ..Default::default()
1071 },
1072 gpu_compute: Some(GpuComputeMetrics {
1073 sm_utilization_pct: 40.0,
1074 warp_execution_efficiency_pct: 100.0,
1075 achieved_occupancy_pct: 33.0,
1076 register_usage_per_thread: 48,
1077 ..Default::default()
1078 }),
1079 gpu_memory: Some(GpuMemoryMetrics {
1080 dram_throughput_pct: 7.8,
1081 l2_hit_rate_pct: 87.0,
1082 global_load_efficiency_pct: 72.0,
1083 ..Default::default()
1084 }),
1085 ..Default::default()
1086 };
1087 render_profile(&profile);
1089 }
1090}