Skip to main content

cgp/profilers/
cuda.rs

1//! CUDA profiler: wraps ncu, nsys, and CUPTI.
2//! See spec sections 4.1.1 (ncu), 4.1.2 (nsys), 4.1.3 (CUPTI).
3
4use crate::analysis::roofline::{Bound, MemoryLevel, Precision, RooflineModel};
5use crate::metrics::catalog::*;
6use crate::profilers::system;
7use anyhow::{Context, Result};
8use std::collections::HashMap;
9use std::path::PathBuf;
10use std::process::Command;
11
12/// ncu metric sections — lazily collect only what's requested.
13#[derive(Debug, Clone, Copy)]
14pub enum NcuSection {
15    LaunchStats,
16    ComputeThroughput,
17    MemoryThroughput,
18    Occupancy,
19    Roofline,
20    WarpState,
21}
22
23impl NcuSection {
24    fn as_ncu_arg(&self) -> &str {
25        match self {
26            NcuSection::LaunchStats => "LaunchStats",
27            NcuSection::ComputeThroughput => "ComputeWorkloadAnalysis",
28            NcuSection::MemoryThroughput => "MemoryWorkloadAnalysis",
29            NcuSection::Occupancy => "Occupancy",
30            NcuSection::Roofline => "SpeedOfLight",
31            NcuSection::WarpState => "WarpStateStats",
32        }
33    }
34}
35
36/// Wraps `ncu` CLI for kernel-level profiling.
37pub struct NcuProfiler {
38    pub ncu_path: PathBuf,
39    pub sections: Vec<NcuSection>,
40}
41
42impl NcuProfiler {
43    pub fn detect() -> Option<Self> {
44        which::which("ncu").ok().map(|path| Self {
45            ncu_path: path,
46            sections: vec![
47                NcuSection::LaunchStats,
48                NcuSection::Roofline,
49                NcuSection::ComputeThroughput,
50                NcuSection::MemoryThroughput,
51                NcuSection::Occupancy,
52            ],
53        })
54    }
55
56    /// Run ncu and collect metrics for a kernel.
57    pub fn profile(
58        &self,
59        binary: &str,
60        binary_args: &[&str],
61        kernel_regex: &str,
62    ) -> Result<HashMap<String, String>> {
63        let mut cmd = Command::new(&self.ncu_path);
64        cmd.arg("--target-processes").arg("all");
65        cmd.arg("--kernel-name-base").arg("demangled");
66
67        if !kernel_regex.is_empty() {
68            cmd.arg("--kernel-id")
69                .arg(format!("::regex:{kernel_regex}:"));
70        }
71
72        for section in &self.sections {
73            cmd.arg("--section").arg(section.as_ncu_arg());
74        }
75
76        cmd.arg("--csv");
77        cmd.arg("--log-file").arg("/dev/null");
78        cmd.arg(binary);
79        cmd.args(binary_args);
80
81        let output = cmd
82            .output()
83            .with_context(|| format!("Failed to run ncu: {}", self.ncu_path.display()))?;
84
85        if !output.status.success() {
86            let stderr = String::from_utf8_lossy(&output.stderr);
87            // ncu often needs root — provide helpful message
88            if stderr.contains("permission") || stderr.contains("ERR_NVGPUCTRPERM") {
89                anyhow::bail!(
90                    "ncu requires elevated permissions. Run with:\n  \
91                     sudo cgp profile kernel ...\n  \
92                     or set: sudo sysctl kernel.perf_event_paranoid=2"
93                );
94            }
95            anyhow::bail!("ncu failed (exit {}): {}", output.status, stderr.trim());
96        }
97
98        let stdout = String::from_utf8_lossy(&output.stdout);
99        parse_ncu_csv(&stdout)
100    }
101}
102
103/// Parse ncu CSV output into a metric name → value map.
104/// ncu CSV format: "ID","Metric Name","Metric Unit","Metric Value"
105pub fn parse_ncu_csv(csv: &str) -> Result<HashMap<String, String>> {
106    let mut metrics = HashMap::new();
107    for line in csv.lines() {
108        // Skip header and non-data lines
109        if line.starts_with('"') && !line.starts_with("\"ID\"") {
110            let fields: Vec<&str> = line.split(',').collect();
111            if fields.len() >= 4 {
112                let name = fields[1].trim_matches('"').to_string();
113                let value = fields[3].trim_matches('"').to_string();
114                metrics.insert(name, value);
115            }
116        }
117    }
118    Ok(metrics)
119}
120
121/// Extract a float metric, returning 0.0 if not found.
122fn get_f64(metrics: &HashMap<String, String>, key: &str) -> f64 {
123    metrics
124        .get(key)
125        .and_then(|v| v.replace(',', "").parse::<f64>().ok())
126        .unwrap_or(0.0)
127}
128
129/// Extract a u64 metric.
130#[allow(dead_code)]
131fn get_u64(metrics: &HashMap<String, String>, key: &str) -> u64 {
132    metrics
133        .get(key)
134        .and_then(|v| v.replace(',', "").parse::<u64>().ok())
135        .unwrap_or(0)
136}
137
138/// Extract a u32 metric.
139fn get_u32(metrics: &HashMap<String, String>, key: &str) -> u32 {
140    metrics
141        .get(key)
142        .and_then(|v| v.replace(',', "").parse::<u32>().ok())
143        .unwrap_or(0)
144}
145
146/// Build a FullProfile from ncu metrics.
147#[allow(clippy::implicit_hasher)]
148pub fn ncu_metrics_to_profile(
149    metrics: &HashMap<String, String>,
150    kernel_name: &str,
151    size: u32,
152) -> FullProfile {
153    let duration_us = get_f64(metrics, "gpu__time_duration.sum") / 1000.0; // ns → us
154    let flops = 2.0 * (size as f64).powi(3); // GEMM: 2*M*N*K
155    let tflops = if duration_us > 0.0 {
156        flops / (duration_us * 1e-6) / 1e12
157    } else {
158        0.0
159    };
160
161    let sm_pct = get_f64(metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed");
162    let dram_pct = get_f64(
163        metrics,
164        "dram__throughput.avg.pct_of_peak_sustained_elapsed",
165    );
166    let occupancy_pct = get_f64(
167        metrics,
168        "sm__warps_active.avg.pct_of_peak_sustained_elapsed",
169    );
170    let warp_eff = get_f64(metrics, "smsp__thread_inst_executed_per_inst_executed.pct");
171    let tc_pct = get_f64(
172        metrics,
173        "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
174    );
175    let regs = get_u32(metrics, "launch__registers_per_thread");
176    let smem = get_u32(metrics, "launch__shared_mem_per_block_driver");
177    let l2_hit = get_f64(metrics, "lts__t_sector_hit_rate.pct");
178    let global_load_eff = get_f64(
179        metrics,
180        "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
181    );
182
183    // Roofline analysis
184    let model = RooflineModel::rtx_4090();
185    let ai = if duration_us > 0.0 {
186        // Approximate: use dram throughput pct to estimate bytes
187        let dram_bw = 1008.0e9; // GB/s
188        let actual_bw = dram_bw * dram_pct / 100.0;
189        let bytes = actual_bw * duration_us * 1e-6;
190        if bytes > 0.0 {
191            flops / bytes
192        } else {
193            0.0
194        }
195    } else {
196        0.0
197    };
198
199    let roofline = model
200        .classify(ai, tflops * 1e12, Precision::Fp16, MemoryLevel::Dram)
201        .map(|point| {
202            let bound_str = match &point.bound {
203                Bound::Memory { .. } => "memory".to_string(),
204                Bound::Compute { .. } => "compute".to_string(),
205            };
206            RooflineMetrics {
207                peak_compute_tflops: 330.0,
208                peak_bandwidth_gbps: 1008.0,
209                ridge_point: 327.4,
210                bound: bound_str,
211                efficiency_pct: point.efficiency,
212                distance_to_ridge: point.distance_to_ridge,
213            }
214        });
215
216    let timestamp = chrono::Utc::now().to_rfc3339();
217
218    FullProfile {
219        version: "2.0".to_string(),
220        timestamp,
221        hardware: HardwareInfo {
222            gpu: Some("NVIDIA GeForce RTX 4090".to_string()),
223            gpu_sm: Some("8.9".to_string()),
224            gpu_memory_gb: Some(24.0),
225            gpu_bandwidth_gbps: Some(1008.0),
226            ..Default::default()
227        },
228        kernel: Some(KernelInfo {
229            name: kernel_name.to_string(),
230            dimensions: vec![size, size, size],
231            shared_memory_bytes: Some(smem),
232            registers_per_thread: Some(regs),
233            ..Default::default()
234        }),
235        timing: TimingMetrics {
236            wall_clock_time_us: duration_us,
237            samples: 1,
238            ..Default::default()
239        },
240        throughput: ThroughputMetrics {
241            tflops,
242            bandwidth_gbps: 1008.0 * dram_pct / 100.0,
243            arithmetic_intensity: ai,
244            ..Default::default()
245        },
246        roofline,
247        gpu_compute: Some(GpuComputeMetrics {
248            sm_utilization_pct: sm_pct,
249            achieved_occupancy_pct: occupancy_pct,
250            warp_execution_efficiency_pct: warp_eff,
251            tensor_core_utilization_pct: tc_pct,
252            register_usage_per_thread: regs,
253            shared_memory_per_block: smem,
254            ..Default::default()
255        }),
256        gpu_memory: Some(GpuMemoryMetrics {
257            dram_throughput_pct: dram_pct,
258            l2_hit_rate_pct: l2_hit,
259            global_load_efficiency_pct: global_load_eff,
260            ..Default::default()
261        }),
262        system_health: system::collect_system_health(),
263        vram: system::collect_vram(),
264        energy: system::collect_system_health()
265            .and_then(|h| system::compute_energy(h.gpu_power_watts, tflops, duration_us)),
266        ..Default::default()
267    }
268}
269
270/// Render the profile to stdout in the spec-mandated format.
271fn render_profile(profile: &FullProfile) {
272    render_profile_header(profile);
273    render_profile_execution(profile);
274    render_profile_roofline(profile);
275    render_profile_compute(profile);
276    render_profile_memory(profile);
277    render_profile_vram(profile);
278    render_profile_system_health(profile);
279    render_profile_energy(profile);
280    println!();
281}
282
283fn render_profile_header(profile: &FullProfile) {
284    let kernel = profile.kernel.as_ref().map_or("unknown", |k| &k.name);
285    let dims = profile
286        .kernel
287        .as_ref()
288        .map(|k| {
289            k.dimensions
290                .iter()
291                .map(|d| d.to_string())
292                .collect::<Vec<_>>()
293                .join("x")
294        })
295        .unwrap_or_default();
296    let gpu_name = profile.hardware.gpu.as_deref().unwrap_or("Unknown GPU");
297    let sm = profile.hardware.gpu_sm.as_deref().unwrap_or("?");
298
299    println!("\n=== CGP Kernel Profile: {kernel} ({dims}) ===\n");
300    println!(
301        "Backend: CUDA ({gpu_name}, SM {sm}, Driver {})",
302        detect_driver_version().unwrap_or_else(|| "?".to_string())
303    );
304}
305
306fn render_profile_execution(profile: &FullProfile) {
307    let t = &profile.timing;
308    let tp = &profile.throughput;
309    let peak_pct = if tp.tflops > 0.0 {
310        tp.tflops / 330.0 * 100.0
311    } else {
312        0.0
313    };
314    println!(
315        "Execution: {:.1} us  |  {:.1} TFLOP/s  |  {:.1}% of peak",
316        t.wall_clock_time_us, tp.tflops, peak_pct
317    );
318}
319
320fn render_profile_roofline(profile: &FullProfile) {
321    let Some(roof) = &profile.roofline else {
322        return;
323    };
324    let tp = &profile.throughput;
325    println!("\n  Roofline Position:");
326    println!(
327        "    Arithmetic Intensity: {:.1} FLOP/byte",
328        tp.arithmetic_intensity
329    );
330    println!("    Ridge Point: {:.1} FLOP/byte", roof.ridge_point);
331    let status = if roof.bound == "memory" {
332        format!("MEMORY-BOUND ({:.1}x below ridge)", roof.distance_to_ridge)
333    } else {
334        format!("COMPUTE-BOUND ({:.1}% efficiency)", roof.efficiency_pct)
335    };
336    println!("    Status: {status}");
337}
338
339fn render_profile_compute(profile: &FullProfile) {
340    let Some(gc) = &profile.gpu_compute else {
341        return;
342    };
343    println!("\n  Compute:");
344    if gc.tensor_core_utilization_pct > 0.0 {
345        println!(
346            "    Tensor core utilization: {:5.1}%   {}",
347            gc.tensor_core_utilization_pct,
348            quality_badge(gc.tensor_core_utilization_pct, 50.0)
349        );
350    }
351    println!(
352        "    Warp execution eff:     {:5.1}%   {}",
353        gc.warp_execution_efficiency_pct,
354        quality_badge(gc.warp_execution_efficiency_pct, 95.0)
355    );
356    println!("    SM utilization:         {:5.1}%", gc.sm_utilization_pct);
357    println!(
358        "    Achieved occupancy:     {:5.1}%",
359        gc.achieved_occupancy_pct
360    );
361    println!(
362        "    Register usage:          {:3}/255",
363        gc.register_usage_per_thread
364    );
365    if gc.shared_memory_per_block > 0 {
366        println!(
367            "    Shared memory/block:   {:5} bytes",
368            gc.shared_memory_per_block
369        );
370    }
371}
372
373fn render_profile_memory(profile: &FullProfile) {
374    let Some(gm) = &profile.gpu_memory else {
375        return;
376    };
377    println!("\n  Memory:");
378    println!(
379        "    DRAM throughput:        {:5.1}% of peak ({:.1} GB/s)",
380        gm.dram_throughput_pct,
381        1008.0 * gm.dram_throughput_pct / 100.0
382    );
383    println!(
384        "    Global load coalescing: {:5.1}%   {}",
385        gm.global_load_efficiency_pct,
386        quality_badge(gm.global_load_efficiency_pct, 60.0)
387    );
388    println!(
389        "    L2 hit rate:            {:5.1}%   {}",
390        gm.l2_hit_rate_pct,
391        quality_badge(gm.l2_hit_rate_pct, 50.0)
392    );
393}
394
395fn render_profile_vram(profile: &FullProfile) {
396    let Some(vram) = &profile.vram else {
397        return;
398    };
399    println!("\n  VRAM:");
400    println!(
401        "    Used: {:.0} / {:.0} MB ({:.1}%)",
402        vram.vram_used_mb, vram.vram_total_mb, vram.vram_utilization_pct
403    );
404}
405
406fn render_profile_system_health(profile: &FullProfile) {
407    let Some(health) = &profile.system_health else {
408        return;
409    };
410    println!("\n  System Health:");
411    println!("    GPU temp:  {:.0}°C", health.gpu_temperature_celsius);
412    println!("    GPU power: {:.0} W", health.gpu_power_watts);
413    println!(
414        "    GPU clock: {:.0} MHz (mem: {:.0} MHz)",
415        health.gpu_clock_mhz, health.gpu_memory_clock_mhz
416    );
417    if health.cpu_frequency_mhz > 0.0 {
418        println!("    CPU freq:  {:.0} MHz", health.cpu_frequency_mhz);
419    }
420}
421
422fn render_profile_energy(profile: &FullProfile) {
423    let Some(energy) = &profile.energy else {
424        return;
425    };
426    println!("\n  Energy:");
427    println!(
428        "    Efficiency: {:.4} TFLOP/s per watt",
429        energy.tflops_per_watt
430    );
431    println!(
432        "    Energy:     {:.6} J per inference",
433        energy.joules_per_inference
434    );
435}
436
437fn quality_badge(value: f64, threshold: f64) -> &'static str {
438    if value >= threshold {
439        "[OK]"
440    } else {
441        "[WARN]"
442    }
443}
444
445/// Get driver version from nvidia-smi.
446fn detect_driver_version() -> Option<String> {
447    Command::new("nvidia-smi")
448        .args(["--query-gpu=driver_version", "--format=csv,noheader"])
449        .output()
450        .ok()
451        .filter(|o| o.status.success())
452        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
453}
454
455/// Detect GPU name from nvidia-smi.
456fn detect_gpu_name() -> Option<String> {
457    Command::new("nvidia-smi")
458        .args(["--query-gpu=name", "--format=csv,noheader"])
459        .output()
460        .ok()
461        .filter(|o| o.status.success())
462        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
463}
464
465// ── Public API: profile commands ──
466
467/// Profile a CUDA PTX kernel via ncu.
468pub fn profile_kernel(name: &str, size: u32, roofline: bool, _metrics: Option<&str>) -> Result<()> {
469    let profiler = match NcuProfiler::detect() {
470        Some(mut p) => {
471            if roofline {
472                // Ensure we have all sections needed for roofline
473                p.sections = vec![
474                    NcuSection::LaunchStats,
475                    NcuSection::Roofline,
476                    NcuSection::ComputeThroughput,
477                    NcuSection::MemoryThroughput,
478                    NcuSection::Occupancy,
479                ];
480            }
481            p
482        }
483        None => {
484            // Fallback: show static roofline data
485            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
486            println!("  ncu not found. Showing static analysis only.\n");
487            let model = RooflineModel::rtx_4090();
488            let flops = 2.0 * (size as f64).powi(3);
489            println!("  Expected FLOPs: {:.2e}", flops);
490            if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
491                println!("  FP16 ridge point: {:.1} FLOP/byte", ridge);
492            }
493            println!("  Install NVIDIA Nsight Compute for runtime profiling.");
494            return Ok(());
495        }
496    };
497
498    // Try to find a trueno benchmark binary that exercises this kernel
499    let binary = find_kernel_binary(name);
500
501    match binary {
502        Some((bin_path, bin_args)) => {
503            eprintln!("Profiling {name} via ncu (this may take a moment)...");
504            let metrics = profiler.profile(
505                &bin_path,
506                &bin_args.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
507                name,
508            )?;
509            let profile = ncu_metrics_to_profile(&metrics, name, size);
510            render_profile(&profile);
511
512            // Export JSON alongside
513            let json_path = format!("/tmp/cgp-{name}-{size}.json");
514            let json = serde_json::to_string_pretty(&profile)?;
515            std::fs::write(&json_path, &json)?;
516            println!("  Profile exported: {json_path}");
517        }
518        None => {
519            // No binary found — run ncu with a placeholder message
520            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
521            println!("  Backend: CUDA (ncu at {})", profiler.ncu_path.display());
522            println!(
523                "  GPU: {}",
524                detect_gpu_name().unwrap_or_else(|| "Unknown".to_string())
525            );
526            println!(
527                "  Driver: {}",
528                detect_driver_version().unwrap_or_else(|| "Unknown".to_string())
529            );
530            println!("\n  No binary found for kernel '{name}'.");
531            println!("  To profile a specific binary, use:");
532            println!("    cgp profile binary ./your_binary --kernel-filter {name}");
533            println!();
534
535            // Still show roofline analysis
536            if roofline {
537                let model = RooflineModel::rtx_4090();
538                let flops = 2.0 * (size as f64).powi(3);
539                println!("  Roofline analysis (static):");
540                println!("    Problem FLOPs: {:.2e}", flops);
541                if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
542                    println!("    FP16 ridge: {:.1} FLOP/byte", ridge);
543                }
544                println!();
545            }
546        }
547    }
548
549    Ok(())
550}
551
552/// Try to find a built CUDA binary that exercises a given kernel.
553fn find_kernel_binary(_kernel_name: &str) -> Option<(String, Vec<String>)> {
554    // Look for GPU-specific examples/benches
555    let candidates = [
556        "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo",
557        "./target/release/examples/gpu_batch_demo",
558        "/mnt/nvme-raid0/targets/trueno/release/examples/wgpu_backward_demo",
559        "./target/release/examples/wgpu_backward_demo",
560    ];
561    for path in &candidates {
562        if std::path::Path::new(path).exists() {
563            return Some((path.to_string(), vec![]));
564        }
565    }
566    None
567}
568
569/// Profile cuBLAS operations.
570pub fn profile_cublas(op: &str, size: u32) -> Result<()> {
571    println!("\n=== CGP cuBLAS Profile: {op} ({size}x{size}) ===\n");
572
573    let gpu_name = detect_gpu_name().unwrap_or_else(|| "Unknown GPU".to_string());
574    let driver = detect_driver_version().unwrap_or_else(|| "?".to_string());
575    println!("  GPU: {gpu_name} (Driver {driver})");
576
577    // Calculate expected performance from roofline
578    let flops = 2.0 * (size as f64).powi(3);
579    println!("  Operation: {op}");
580    println!("  Problem size: {size}x{size}x{size}");
581    println!("  FLOPs: {flops:.2e}");
582
583    // cuBLAS peak estimates for RTX 4090
584    let (peak_tflops, precision) = match op {
585        "gemm_f16" | "hgemm" => (330.0, "FP16 Tensor"),
586        "gemm_f32" | "sgemm" => (82.6, "FP32"),
587        "gemm_tf32" => (165.0, "TF32 Tensor"),
588        _ => (82.6, "FP32 (default)"),
589    };
590    println!("  Precision: {precision} (peak: {peak_tflops:.1} TFLOP/s)");
591
592    // Try to find a cuBLAS benchmark binary
593    let cublas_bin = find_cublas_binary();
594    match cublas_bin {
595        Some(bin) => {
596            println!("  Binary: {bin}");
597            if let Some(nsys) = NsysProfiler::detect() {
598                println!("  Profiling via nsys...");
599                match nsys.profile_binary(&bin, &[]) {
600                    Ok(stats) => {
601                        // Find cuBLAS kernels
602                        let cublas_kernels: Vec<_> = stats
603                            .iter()
604                            .filter(|s| {
605                                s.name.contains("gemm")
606                                    || s.name.contains("cublas")
607                                    || s.name.contains("Gemm")
608                            })
609                            .collect();
610                        if cublas_kernels.is_empty() {
611                            println!("  No cuBLAS kernels found in trace.");
612                        } else {
613                            println!("\n  cuBLAS Kernels:");
614                            for k in &cublas_kernels {
615                                let tflops = flops / (k.avg_us * 1e-6) / 1e12;
616                                let eff = tflops / peak_tflops * 100.0;
617                                println!(
618                                    "    {:50} avg={:.1}us  {:.1} TFLOP/s  ({:.1}% eff)",
619                                    k.name, k.avg_us, tflops, eff
620                                );
621                            }
622                        }
623                    }
624                    Err(e) => println!("  nsys profiling failed: {e}"),
625                }
626            }
627        }
628        None => {
629            // Estimate from known measurements
630            let est_time_us = flops / (peak_tflops * 0.9 * 1e12) * 1e6; // 90% of peak
631            let est_tflops = flops / (est_time_us * 1e-6) / 1e12;
632            println!("\n  Estimated (no binary found):");
633            println!("    Time: ~{est_time_us:.1} us");
634            println!(
635                "    Throughput: ~{est_tflops:.1} TFLOP/s ({:.1}% of peak)",
636                est_tflops / peak_tflops * 100.0
637            );
638            println!("\n  To profile real cuBLAS:");
639            println!("    cgp profile binary ./your_cublas_bench --kernel-filter gemm");
640        }
641    }
642
643    println!();
644    Ok(())
645}
646
647/// Find a cuBLAS benchmark binary.
648fn find_cublas_binary() -> Option<String> {
649    let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
650    let mut candidates: Vec<String> = Vec::new();
651    if !target_dir.is_empty() {
652        candidates.push(format!("{target_dir}/release/examples/bench_cublas_vs_ptx"));
653        candidates.push(format!("{target_dir}/release/examples/gpu_batch_demo"));
654    }
655    candidates.extend_from_slice(&[
656        "/mnt/nvme-raid0/targets/trueno/release/examples/bench_cublas_vs_ptx".to_string(),
657        "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo".to_string(),
658    ]);
659    for path in &candidates {
660        if std::path::Path::new(path).exists() {
661            return Some(path.clone());
662        }
663    }
664    None
665}
666
667// ── nsys integration ──
668
669/// Wraps `nsys` CLI for system-wide timeline profiling.
670pub struct NsysProfiler {
671    pub nsys_path: PathBuf,
672}
673
674/// Parsed nsys stats output — one entry per kernel.
675#[derive(Debug, Clone)]
676pub struct NsysKernelStat {
677    pub name: String,
678    pub calls: u64,
679    pub total_us: f64,
680    pub avg_us: f64,
681    pub min_us: f64,
682    pub max_us: f64,
683}
684
685impl NsysProfiler {
686    pub fn detect() -> Option<Self> {
687        which::which("nsys")
688            .ok()
689            .map(|path| Self { nsys_path: path })
690    }
691
692    /// Run nsys profile and capture stats.
693    pub fn profile_binary(
694        &self,
695        binary: &str,
696        binary_args: &[&str],
697    ) -> Result<Vec<NsysKernelStat>> {
698        let report_path = format!("/tmp/cgp-nsys-{}", std::process::id());
699
700        let mut cmd = Command::new(&self.nsys_path);
701        cmd.arg("profile")
702            .arg("--stats=true")
703            .arg("--force-overwrite=true")
704            .arg("-o")
705            .arg(&report_path)
706            .arg(binary)
707            .args(binary_args);
708
709        let output = cmd.output().with_context(|| "Failed to run nsys")?;
710        let stdout = String::from_utf8_lossy(&output.stdout);
711        let stderr = String::from_utf8_lossy(&output.stderr);
712        let combined = format!("{stdout}\n{stderr}");
713
714        let stats = parse_nsys_stats(&combined);
715
716        // Clean up report files
717        let _ = std::fs::remove_file(format!("{report_path}.nsys-rep"));
718        let _ = std::fs::remove_file(format!("{report_path}.sqlite"));
719
720        Ok(stats)
721    }
722}
723
724/// Parse nsys stats output for CUDA kernel summary.
725fn parse_nsys_stats(output: &str) -> Vec<NsysKernelStat> {
726    let mut stats = Vec::new();
727    let mut in_kernel_section = false;
728
729    for line in output.lines() {
730        if line.contains("CUDA Kernel Statistics") || line.contains("cuda_gpu_kern_sum") {
731            in_kernel_section = true;
732            continue;
733        }
734        if in_kernel_section && line.trim().is_empty() {
735            in_kernel_section = false;
736            continue;
737        }
738        if in_kernel_section {
739            // nsys stats format varies; try to parse common patterns
740            // Typical: Time(%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  Name
741            let parts: Vec<&str> = line.split_whitespace().collect();
742            if parts.len() >= 8 {
743                if let (Ok(total_ns), Ok(instances)) = (
744                    parts[1].replace(',', "").parse::<f64>(),
745                    parts[2].replace(',', "").parse::<u64>(),
746                ) {
747                    let avg_ns = parts[3].replace(',', "").parse::<f64>().unwrap_or(0.0);
748                    let min_ns = parts[5].replace(',', "").parse::<f64>().unwrap_or(0.0);
749                    let max_ns = parts[6].replace(',', "").parse::<f64>().unwrap_or(0.0);
750                    let name = parts[7..].join(" ");
751                    stats.push(NsysKernelStat {
752                        name,
753                        calls: instances,
754                        total_us: total_ns / 1000.0,
755                        avg_us: avg_ns / 1000.0,
756                        min_us: min_ns / 1000.0,
757                        max_us: max_ns / 1000.0,
758                    });
759                }
760            }
761        }
762    }
763    stats
764}
765
766/// Profile an arbitrary binary via nsys.
767pub fn profile_binary(
768    path: &str,
769    kernel_filter: Option<&str>,
770    _trace: bool,
771    _duration: Option<&str>,
772) -> Result<()> {
773    println!("\n=== CGP Binary Profile: {path} ===\n");
774
775    let profiler = match NsysProfiler::detect() {
776        Some(p) => p,
777        None => {
778            println!("  nsys not found. Install NVIDIA Nsight Systems.");
779            println!("  Falling back to wall-clock timing...");
780
781            // Fallback: just run and time it
782            let start = std::time::Instant::now();
783            let status = Command::new(path).status()?;
784            let elapsed = start.elapsed();
785            println!(
786                "  Wall time: {:.2}ms (exit code: {})",
787                elapsed.as_secs_f64() * 1000.0,
788                status.code().unwrap_or(-1)
789            );
790            return Ok(());
791        }
792    };
793
794    eprintln!("Running nsys profile (this may take a moment)...");
795    let stats = profiler.profile_binary(path, &[])?;
796
797    if stats.is_empty() {
798        println!("  No CUDA kernels detected. Binary may be CPU-only.");
799        println!("  For CPU profiling, use: cgp profile simd or cgp profile scalar");
800    } else {
801        println!(
802            "  {:40} {:>8} {:>12} {:>12} {:>12}",
803            "Kernel", "Calls", "Avg (us)", "Min (us)", "Max (us)"
804        );
805        println!("  {}", "-".repeat(88));
806
807        for stat in &stats {
808            let name = if let Some(filter) = kernel_filter {
809                if !stat.name.contains(filter) {
810                    continue;
811                }
812                &stat.name
813            } else {
814                &stat.name
815            };
816
817            let display_name = if name.len() > 40 {
818                format!("{}...", &name[..37])
819            } else {
820                name.to_string()
821            };
822
823            println!(
824                "  {:40} {:>8} {:>12.1} {:>12.1} {:>12.1}",
825                display_name, stat.calls, stat.avg_us, stat.min_us, stat.max_us
826            );
827        }
828        println!();
829        println!(
830            "  Total kernels: {}, Total CUDA time: {:.1} us",
831            stats.len(),
832            stats.iter().map(|s| s.total_us).sum::<f64>()
833        );
834    }
835
836    println!();
837    Ok(())
838}
839
840/// Profile a Python script via nsys + perf stat.
841pub fn profile_python(args: &[String]) -> Result<()> {
842    let cmd_str = args.join(" ");
843    println!("\n=== CGP Python Profile ===\n");
844    println!("  Command: {cmd_str}");
845
846    if let Some(profiler) = NsysProfiler::detect() {
847        eprintln!("Running nsys profile on Python script...");
848        // Build the full command: uv run python <args>
849        let python_cmd = if args.is_empty() {
850            anyhow::bail!(
851                "No Python script specified. Usage: cgp profile python -- uv run python script.py"
852            );
853        } else {
854            args[0].clone()
855        };
856        let python_args: Vec<&str> = args[1..].iter().map(|s| s.as_str()).collect();
857        let stats = profiler.profile_binary(&python_cmd, &python_args)?;
858
859        if stats.is_empty() {
860            println!("  No CUDA kernels detected (CPU-only Python workload).");
861            println!("  Use perf stat for CPU profiling.");
862        } else {
863            println!("  CUDA kernels captured via nsys:");
864            for stat in &stats {
865                println!(
866                    "    {} — {} calls, avg {:.1}us",
867                    stat.name, stat.calls, stat.avg_us
868                );
869            }
870        }
871    } else {
872        println!("  nsys not found — cannot capture CUDA kernels.");
873        println!("  Falling back to wall-clock timing.");
874
875        if !args.is_empty() {
876            let start = std::time::Instant::now();
877            let status = Command::new(&args[0]).args(&args[1..]).status()?;
878            let elapsed = start.elapsed();
879            println!(
880                "  Wall time: {:.2}ms (exit code: {})",
881                elapsed.as_secs_f64() * 1000.0,
882                status.code().unwrap_or(-1)
883            );
884        }
885    }
886
887    println!();
888    Ok(())
889}
890
891/// Run `cgp trace` — system-wide timeline via nsys.
892pub fn run_trace(binary: &str, duration: Option<&str>) -> Result<()> {
893    println!("\n=== CGP System Trace: {binary} ===\n");
894
895    let Some(profiler) = NsysProfiler::detect() else {
896        anyhow::bail!("nsys not found. Install NVIDIA Nsight Systems for timeline tracing.");
897    };
898
899    let report_path = format!("/tmp/cgp-trace-{}", std::process::id());
900    let cmd = build_nsys_trace_command(&profiler.nsys_path, &report_path, duration, binary);
901    let (stdout, stderr) = run_nsys_trace_command(cmd)?;
902
903    print_nsys_summary(&stdout, &stderr);
904    print_nsys_report_file(&report_path);
905
906    println!();
907    Ok(())
908}
909
910fn build_nsys_trace_command(
911    nsys_path: &std::path::Path,
912    report_path: &str,
913    duration: Option<&str>,
914    binary: &str,
915) -> Command {
916    let mut cmd = Command::new(nsys_path);
917    cmd.arg("profile")
918        .arg("--stats=true")
919        .arg("--force-overwrite=true")
920        .arg("--trace=cuda,nvtx,osrt")
921        .arg("-o")
922        .arg(report_path);
923    if let Some(dur) = duration {
924        cmd.arg("--duration").arg(dur);
925    }
926    cmd.arg(binary);
927    cmd
928}
929
930fn run_nsys_trace_command(mut cmd: Command) -> Result<(String, String)> {
931    eprintln!("Running nsys trace (this may take a while)...");
932    let output = cmd.output().with_context(|| "Failed to run nsys trace")?;
933    Ok((
934        String::from_utf8_lossy(&output.stdout).to_string(),
935        String::from_utf8_lossy(&output.stderr).to_string(),
936    ))
937}
938
939fn print_nsys_summary(stdout: &str, stderr: &str) {
940    let combined = format!("{stdout}\n{stderr}");
941    let mut in_summary = false;
942    for line in combined.lines() {
943        if is_nsys_summary_header(line) {
944            in_summary = true;
945            println!("  {line}");
946            continue;
947        }
948        if !in_summary {
949            continue;
950        }
951        if line.trim().is_empty() {
952            in_summary = false;
953            println!();
954        } else {
955            println!("  {line}");
956        }
957    }
958}
959
960fn is_nsys_summary_header(line: &str) -> bool {
961    line.contains("CUDA API Statistics")
962        || line.contains("CUDA Kernel Statistics")
963        || line.contains("OS Runtime Statistics")
964}
965
966fn print_nsys_report_file(report_path: &str) {
967    let report_file = format!("{report_path}.nsys-rep");
968    if std::path::Path::new(&report_file).exists() {
969        println!("  Report: {report_file}");
970        println!("  View: nsys-ui {report_file}");
971    }
972}
973
974#[cfg(test)]
975mod tests {
976    use super::*;
977
978    #[test]
979    fn test_ncu_section_args() {
980        assert_eq!(NcuSection::LaunchStats.as_ncu_arg(), "LaunchStats");
981        assert_eq!(NcuSection::Roofline.as_ncu_arg(), "SpeedOfLight");
982        assert_eq!(NcuSection::WarpState.as_ncu_arg(), "WarpStateStats");
983    }
984
985    #[test]
986    fn test_ncu_command_build() {
987        if let Some(profiler) = NcuProfiler::detect() {
988            assert!(profiler.ncu_path.exists());
989        }
990    }
991
992    #[test]
993    fn test_parse_ncu_csv_empty() {
994        let metrics = parse_ncu_csv("").unwrap();
995        assert!(metrics.is_empty());
996    }
997
998    #[test]
999    fn test_parse_ncu_csv_sample() {
1000        let csv = r#""ID","Metric Name","Metric Unit","Metric Value"
1001"0","sm__throughput.avg.pct_of_peak_sustained_elapsed","%","42.3"
1002"0","dram__throughput.avg.pct_of_peak_sustained_elapsed","%","7.8"
1003"0","launch__registers_per_thread","register/thread","48"
1004"0","gpu__time_duration.sum","nsecond","23200"
1005"#;
1006        let metrics = parse_ncu_csv(csv).unwrap();
1007        assert_eq!(
1008            get_f64(&metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed"),
1009            42.3
1010        );
1011        assert_eq!(get_u32(&metrics, "launch__registers_per_thread"), 48);
1012        assert_eq!(get_f64(&metrics, "gpu__time_duration.sum"), 23200.0);
1013    }
1014
1015    #[test]
1016    fn test_ncu_metrics_to_profile() {
1017        let mut metrics = HashMap::new();
1018        metrics.insert(
1019            "gpu__time_duration.sum".to_string(),
1020            "23200".to_string(), // 23.2 us
1021        );
1022        metrics.insert(
1023            "sm__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
1024            "42.3".to_string(),
1025        );
1026        metrics.insert(
1027            "dram__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
1028            "7.8".to_string(),
1029        );
1030        metrics.insert("launch__registers_per_thread".to_string(), "48".to_string());
1031        metrics.insert(
1032            "smsp__thread_inst_executed_per_inst_executed.pct".to_string(),
1033            "100.0".to_string(),
1034        );
1035
1036        let profile = ncu_metrics_to_profile(&metrics, "gemm_cta_wmma_fp16", 512);
1037        assert_eq!(profile.kernel.as_ref().unwrap().name, "gemm_cta_wmma_fp16");
1038        assert!(profile.timing.wall_clock_time_us > 0.0);
1039        assert!(profile.throughput.tflops > 0.0);
1040        assert!(profile.gpu_compute.is_some());
1041    }
1042
1043    #[test]
1044    fn test_parse_nsys_stats_empty() {
1045        let stats = parse_nsys_stats("");
1046        assert!(stats.is_empty());
1047    }
1048
1049    #[test]
1050    fn test_render_profile_no_panic() {
1051        let profile = FullProfile {
1052            version: "2.0".to_string(),
1053            hardware: HardwareInfo {
1054                gpu: Some("Test GPU".to_string()),
1055                gpu_sm: Some("8.9".to_string()),
1056                ..Default::default()
1057            },
1058            kernel: Some(KernelInfo {
1059                name: "test_kernel".to_string(),
1060                dimensions: vec![256, 256, 256],
1061                ..Default::default()
1062            }),
1063            timing: TimingMetrics {
1064                wall_clock_time_us: 10.0,
1065                samples: 1,
1066                ..Default::default()
1067            },
1068            throughput: ThroughputMetrics {
1069                tflops: 5.0,
1070                ..Default::default()
1071            },
1072            gpu_compute: Some(GpuComputeMetrics {
1073                sm_utilization_pct: 40.0,
1074                warp_execution_efficiency_pct: 100.0,
1075                achieved_occupancy_pct: 33.0,
1076                register_usage_per_thread: 48,
1077                ..Default::default()
1078            }),
1079            gpu_memory: Some(GpuMemoryMetrics {
1080                dram_throughput_pct: 7.8,
1081                l2_hit_rate_pct: 87.0,
1082                global_load_efficiency_pct: 72.0,
1083                ..Default::default()
1084            }),
1085            ..Default::default()
1086        };
1087        // Should not panic
1088        render_profile(&profile);
1089    }
1090}