Skip to main content

cgp/profilers/
cuda.rs

1//! CUDA profiler: wraps ncu, nsys, and CUPTI.
2//! See spec sections 4.1.1 (ncu), 4.1.2 (nsys), 4.1.3 (CUPTI).
3
4use crate::analysis::roofline::{Bound, MemoryLevel, Precision, RooflineModel};
5use crate::metrics::catalog::*;
6use crate::profilers::system;
7use anyhow::{Context, Result};
8use std::collections::HashMap;
9use std::path::PathBuf;
10use std::process::Command;
11
12/// ncu metric sections — lazily collect only what's requested.
13#[derive(Debug, Clone, Copy)]
14pub enum NcuSection {
15    LaunchStats,
16    ComputeThroughput,
17    MemoryThroughput,
18    Occupancy,
19    Roofline,
20    WarpState,
21}
22
23impl NcuSection {
24    fn as_ncu_arg(&self) -> &str {
25        match self {
26            NcuSection::LaunchStats => "LaunchStats",
27            NcuSection::ComputeThroughput => "ComputeWorkloadAnalysis",
28            NcuSection::MemoryThroughput => "MemoryWorkloadAnalysis",
29            NcuSection::Occupancy => "Occupancy",
30            NcuSection::Roofline => "SpeedOfLight",
31            NcuSection::WarpState => "WarpStateStats",
32        }
33    }
34}
35
36/// Wraps `ncu` CLI for kernel-level profiling.
37pub struct NcuProfiler {
38    pub ncu_path: PathBuf,
39    pub sections: Vec<NcuSection>,
40}
41
42impl NcuProfiler {
43    pub fn detect() -> Option<Self> {
44        which::which("ncu").ok().map(|path| Self {
45            ncu_path: path,
46            sections: vec![
47                NcuSection::LaunchStats,
48                NcuSection::Roofline,
49                NcuSection::ComputeThroughput,
50                NcuSection::MemoryThroughput,
51                NcuSection::Occupancy,
52            ],
53        })
54    }
55
56    /// Run ncu and collect metrics for a kernel.
57    pub fn profile(
58        &self,
59        binary: &str,
60        binary_args: &[&str],
61        kernel_regex: &str,
62    ) -> Result<HashMap<String, String>> {
63        let mut cmd = Command::new(&self.ncu_path);
64        cmd.arg("--target-processes").arg("all");
65        cmd.arg("--kernel-name-base").arg("demangled");
66
67        if !kernel_regex.is_empty() {
68            cmd.arg("--kernel-id")
69                .arg(format!("::regex:{kernel_regex}:"));
70        }
71
72        for section in &self.sections {
73            cmd.arg("--section").arg(section.as_ncu_arg());
74        }
75
76        cmd.arg("--csv");
77        cmd.arg("--log-file").arg("/dev/null");
78        cmd.arg(binary);
79        cmd.args(binary_args);
80
81        let output = cmd
82            .output()
83            .with_context(|| format!("Failed to run ncu: {}", self.ncu_path.display()))?;
84
85        if !output.status.success() {
86            let stderr = String::from_utf8_lossy(&output.stderr);
87            // ncu often needs root — provide helpful message
88            if stderr.contains("permission") || stderr.contains("ERR_NVGPUCTRPERM") {
89                anyhow::bail!(
90                    "ncu requires elevated permissions. Run with:\n  \
91                     sudo cgp profile kernel ...\n  \
92                     or set: sudo sysctl kernel.perf_event_paranoid=2"
93                );
94            }
95            anyhow::bail!("ncu failed (exit {}): {}", output.status, stderr.trim());
96        }
97
98        let stdout = String::from_utf8_lossy(&output.stdout);
99        parse_ncu_csv(&stdout)
100    }
101}
102
103/// Parse ncu CSV output into a metric name → value map.
104/// ncu CSV format: "ID","Metric Name","Metric Unit","Metric Value"
105pub fn parse_ncu_csv(csv: &str) -> Result<HashMap<String, String>> {
106    let mut metrics = HashMap::new();
107    for line in csv.lines() {
108        // Skip header and non-data lines
109        if line.starts_with('"') && !line.starts_with("\"ID\"") {
110            let fields: Vec<&str> = line.split(',').collect();
111            if fields.len() >= 4 {
112                let name = fields[1].trim_matches('"').to_string();
113                let value = fields[3].trim_matches('"').to_string();
114                metrics.insert(name, value);
115            }
116        }
117    }
118    Ok(metrics)
119}
120
121/// Extract a float metric, returning 0.0 if not found.
122fn get_f64(metrics: &HashMap<String, String>, key: &str) -> f64 {
123    metrics
124        .get(key)
125        .and_then(|v| v.replace(',', "").parse::<f64>().ok())
126        .unwrap_or(0.0)
127}
128
129/// Extract a u64 metric.
130#[allow(dead_code)]
131fn get_u64(metrics: &HashMap<String, String>, key: &str) -> u64 {
132    metrics
133        .get(key)
134        .and_then(|v| v.replace(',', "").parse::<u64>().ok())
135        .unwrap_or(0)
136}
137
138/// Extract a u32 metric.
139fn get_u32(metrics: &HashMap<String, String>, key: &str) -> u32 {
140    metrics
141        .get(key)
142        .and_then(|v| v.replace(',', "").parse::<u32>().ok())
143        .unwrap_or(0)
144}
145
146/// Build a FullProfile from ncu metrics.
147#[allow(clippy::implicit_hasher)]
148pub fn ncu_metrics_to_profile(
149    metrics: &HashMap<String, String>,
150    kernel_name: &str,
151    size: u32,
152) -> FullProfile {
153    let duration_us = get_f64(metrics, "gpu__time_duration.sum") / 1000.0; // ns → us
154    let flops = 2.0 * (size as f64).powi(3); // GEMM: 2*M*N*K
155    let tflops = if duration_us > 0.0 {
156        flops / (duration_us * 1e-6) / 1e12
157    } else {
158        0.0
159    };
160
161    let sm_pct = get_f64(metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed");
162    let dram_pct = get_f64(
163        metrics,
164        "dram__throughput.avg.pct_of_peak_sustained_elapsed",
165    );
166    let occupancy_pct = get_f64(
167        metrics,
168        "sm__warps_active.avg.pct_of_peak_sustained_elapsed",
169    );
170    let warp_eff = get_f64(metrics, "smsp__thread_inst_executed_per_inst_executed.pct");
171    let tc_pct = get_f64(
172        metrics,
173        "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
174    );
175    let regs = get_u32(metrics, "launch__registers_per_thread");
176    let smem = get_u32(metrics, "launch__shared_mem_per_block_driver");
177    let l2_hit = get_f64(metrics, "lts__t_sector_hit_rate.pct");
178    let global_load_eff = get_f64(
179        metrics,
180        "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
181    );
182
183    // Roofline analysis
184    let model = RooflineModel::rtx_4090();
185    let ai = if duration_us > 0.0 {
186        // Approximate: use dram throughput pct to estimate bytes
187        let dram_bw = 1008.0e9; // GB/s
188        let actual_bw = dram_bw * dram_pct / 100.0;
189        let bytes = actual_bw * duration_us * 1e-6;
190        if bytes > 0.0 {
191            flops / bytes
192        } else {
193            0.0
194        }
195    } else {
196        0.0
197    };
198
199    let roofline = model
200        .classify(ai, tflops * 1e12, Precision::Fp16, MemoryLevel::Dram)
201        .map(|point| {
202            let bound_str = match &point.bound {
203                Bound::Memory { .. } => "memory".to_string(),
204                Bound::Compute { .. } => "compute".to_string(),
205            };
206            RooflineMetrics {
207                peak_compute_tflops: 330.0,
208                peak_bandwidth_gbps: 1008.0,
209                ridge_point: 327.4,
210                bound: bound_str,
211                efficiency_pct: point.efficiency,
212                distance_to_ridge: point.distance_to_ridge,
213            }
214        });
215
216    let timestamp = chrono::Utc::now().to_rfc3339();
217
218    FullProfile {
219        version: "2.0".to_string(),
220        timestamp,
221        hardware: HardwareInfo {
222            gpu: Some("NVIDIA GeForce RTX 4090".to_string()),
223            gpu_sm: Some("8.9".to_string()),
224            gpu_memory_gb: Some(24.0),
225            gpu_bandwidth_gbps: Some(1008.0),
226            ..Default::default()
227        },
228        kernel: Some(KernelInfo {
229            name: kernel_name.to_string(),
230            dimensions: vec![size, size, size],
231            shared_memory_bytes: Some(smem),
232            registers_per_thread: Some(regs),
233            ..Default::default()
234        }),
235        timing: TimingMetrics {
236            wall_clock_time_us: duration_us,
237            samples: 1,
238            ..Default::default()
239        },
240        throughput: ThroughputMetrics {
241            tflops,
242            bandwidth_gbps: 1008.0 * dram_pct / 100.0,
243            arithmetic_intensity: ai,
244            ..Default::default()
245        },
246        roofline,
247        gpu_compute: Some(GpuComputeMetrics {
248            sm_utilization_pct: sm_pct,
249            achieved_occupancy_pct: occupancy_pct,
250            warp_execution_efficiency_pct: warp_eff,
251            tensor_core_utilization_pct: tc_pct,
252            register_usage_per_thread: regs,
253            shared_memory_per_block: smem,
254            ..Default::default()
255        }),
256        gpu_memory: Some(GpuMemoryMetrics {
257            dram_throughput_pct: dram_pct,
258            l2_hit_rate_pct: l2_hit,
259            global_load_efficiency_pct: global_load_eff,
260            ..Default::default()
261        }),
262        system_health: system::collect_system_health(),
263        vram: system::collect_vram(),
264        energy: system::collect_system_health()
265            .and_then(|h| system::compute_energy(h.gpu_power_watts, tflops, duration_us)),
266        ..Default::default()
267    }
268}
269
270/// Render the profile to stdout in the spec-mandated format.
271fn render_profile(profile: &FullProfile) {
272    let kernel = profile.kernel.as_ref().map_or("unknown", |k| &k.name);
273    let dims = profile
274        .kernel
275        .as_ref()
276        .map(|k| {
277            k.dimensions
278                .iter()
279                .map(|d| d.to_string())
280                .collect::<Vec<_>>()
281                .join("x")
282        })
283        .unwrap_or_default();
284
285    let gpu_name = profile.hardware.gpu.as_deref().unwrap_or("Unknown GPU");
286    let sm = profile.hardware.gpu_sm.as_deref().unwrap_or("?");
287
288    println!("\n=== CGP Kernel Profile: {kernel} ({dims}) ===\n");
289    println!(
290        "Backend: CUDA ({gpu_name}, SM {sm}, Driver {})",
291        detect_driver_version().unwrap_or_else(|| "?".to_string())
292    );
293
294    let t = &profile.timing;
295    let tp = &profile.throughput;
296    let peak_pct = if tp.tflops > 0.0 {
297        tp.tflops / 330.0 * 100.0
298    } else {
299        0.0
300    };
301    println!(
302        "Execution: {:.1} us  |  {:.1} TFLOP/s  |  {:.1}% of peak",
303        t.wall_clock_time_us, tp.tflops, peak_pct
304    );
305
306    // Roofline
307    if let Some(roof) = &profile.roofline {
308        println!("\n  Roofline Position:");
309        println!(
310            "    Arithmetic Intensity: {:.1} FLOP/byte",
311            tp.arithmetic_intensity
312        );
313        println!("    Ridge Point: {:.1} FLOP/byte", roof.ridge_point);
314        let status = if roof.bound == "memory" {
315            format!("MEMORY-BOUND ({:.1}x below ridge)", roof.distance_to_ridge)
316        } else {
317            format!("COMPUTE-BOUND ({:.1}% efficiency)", roof.efficiency_pct)
318        };
319        println!("    Status: {status}");
320    }
321
322    // Compute metrics
323    if let Some(gc) = &profile.gpu_compute {
324        println!("\n  Compute:");
325        if gc.tensor_core_utilization_pct > 0.0 {
326            println!(
327                "    Tensor core utilization: {:5.1}%   {}",
328                gc.tensor_core_utilization_pct,
329                quality_badge(gc.tensor_core_utilization_pct, 50.0)
330            );
331        }
332        println!(
333            "    Warp execution eff:     {:5.1}%   {}",
334            gc.warp_execution_efficiency_pct,
335            quality_badge(gc.warp_execution_efficiency_pct, 95.0)
336        );
337        println!("    SM utilization:         {:5.1}%", gc.sm_utilization_pct);
338        println!(
339            "    Achieved occupancy:     {:5.1}%",
340            gc.achieved_occupancy_pct
341        );
342        println!(
343            "    Register usage:          {:3}/255",
344            gc.register_usage_per_thread
345        );
346        if gc.shared_memory_per_block > 0 {
347            println!(
348                "    Shared memory/block:   {:5} bytes",
349                gc.shared_memory_per_block
350            );
351        }
352    }
353
354    // Memory metrics
355    if let Some(gm) = &profile.gpu_memory {
356        println!("\n  Memory:");
357        println!(
358            "    DRAM throughput:        {:5.1}% of peak ({:.1} GB/s)",
359            gm.dram_throughput_pct,
360            1008.0 * gm.dram_throughput_pct / 100.0
361        );
362        println!(
363            "    Global load coalescing: {:5.1}%   {}",
364            gm.global_load_efficiency_pct,
365            quality_badge(gm.global_load_efficiency_pct, 60.0)
366        );
367        println!(
368            "    L2 hit rate:            {:5.1}%   {}",
369            gm.l2_hit_rate_pct,
370            quality_badge(gm.l2_hit_rate_pct, 50.0)
371        );
372    }
373
374    // VRAM metrics
375    if let Some(vram) = &profile.vram {
376        println!("\n  VRAM:");
377        println!(
378            "    Used: {:.0} / {:.0} MB ({:.1}%)",
379            vram.vram_used_mb, vram.vram_total_mb, vram.vram_utilization_pct
380        );
381    }
382
383    // System health
384    if let Some(health) = &profile.system_health {
385        println!("\n  System Health:");
386        println!("    GPU temp:  {:.0}°C", health.gpu_temperature_celsius);
387        println!("    GPU power: {:.0} W", health.gpu_power_watts);
388        println!(
389            "    GPU clock: {:.0} MHz (mem: {:.0} MHz)",
390            health.gpu_clock_mhz, health.gpu_memory_clock_mhz
391        );
392        if health.cpu_frequency_mhz > 0.0 {
393            println!("    CPU freq:  {:.0} MHz", health.cpu_frequency_mhz);
394        }
395    }
396
397    // Energy efficiency
398    if let Some(energy) = &profile.energy {
399        println!("\n  Energy:");
400        println!(
401            "    Efficiency: {:.4} TFLOP/s per watt",
402            energy.tflops_per_watt
403        );
404        println!(
405            "    Energy:     {:.6} J per inference",
406            energy.joules_per_inference
407        );
408    }
409
410    println!();
411}
412
413fn quality_badge(value: f64, threshold: f64) -> &'static str {
414    if value >= threshold {
415        "[OK]"
416    } else {
417        "[WARN]"
418    }
419}
420
421/// Get driver version from nvidia-smi.
422fn detect_driver_version() -> Option<String> {
423    Command::new("nvidia-smi")
424        .args(["--query-gpu=driver_version", "--format=csv,noheader"])
425        .output()
426        .ok()
427        .filter(|o| o.status.success())
428        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
429}
430
431/// Detect GPU name from nvidia-smi.
432fn detect_gpu_name() -> Option<String> {
433    Command::new("nvidia-smi")
434        .args(["--query-gpu=name", "--format=csv,noheader"])
435        .output()
436        .ok()
437        .filter(|o| o.status.success())
438        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
439}
440
441// ── Public API: profile commands ──
442
443/// Profile a CUDA PTX kernel via ncu.
444pub fn profile_kernel(name: &str, size: u32, roofline: bool, _metrics: Option<&str>) -> Result<()> {
445    let profiler = match NcuProfiler::detect() {
446        Some(mut p) => {
447            if roofline {
448                // Ensure we have all sections needed for roofline
449                p.sections = vec![
450                    NcuSection::LaunchStats,
451                    NcuSection::Roofline,
452                    NcuSection::ComputeThroughput,
453                    NcuSection::MemoryThroughput,
454                    NcuSection::Occupancy,
455                ];
456            }
457            p
458        }
459        None => {
460            // Fallback: show static roofline data
461            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
462            println!("  ncu not found. Showing static analysis only.\n");
463            let model = RooflineModel::rtx_4090();
464            let flops = 2.0 * (size as f64).powi(3);
465            println!("  Expected FLOPs: {:.2e}", flops);
466            if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
467                println!("  FP16 ridge point: {:.1} FLOP/byte", ridge);
468            }
469            println!("  Install NVIDIA Nsight Compute for runtime profiling.");
470            return Ok(());
471        }
472    };
473
474    // Try to find a trueno benchmark binary that exercises this kernel
475    let binary = find_kernel_binary(name);
476
477    match binary {
478        Some((bin_path, bin_args)) => {
479            eprintln!("Profiling {name} via ncu (this may take a moment)...");
480            let metrics = profiler.profile(
481                &bin_path,
482                &bin_args.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
483                name,
484            )?;
485            let profile = ncu_metrics_to_profile(&metrics, name, size);
486            render_profile(&profile);
487
488            // Export JSON alongside
489            let json_path = format!("/tmp/cgp-{name}-{size}.json");
490            let json = serde_json::to_string_pretty(&profile)?;
491            std::fs::write(&json_path, &json)?;
492            println!("  Profile exported: {json_path}");
493        }
494        None => {
495            // No binary found — run ncu with a placeholder message
496            println!("\n=== CGP Kernel Profile: {name} ({size}x{size}x{size}) ===\n");
497            println!("  Backend: CUDA (ncu at {})", profiler.ncu_path.display());
498            println!(
499                "  GPU: {}",
500                detect_gpu_name().unwrap_or_else(|| "Unknown".to_string())
501            );
502            println!(
503                "  Driver: {}",
504                detect_driver_version().unwrap_or_else(|| "Unknown".to_string())
505            );
506            println!("\n  No binary found for kernel '{name}'.");
507            println!("  To profile a specific binary, use:");
508            println!("    cgp profile binary ./your_binary --kernel-filter {name}");
509            println!();
510
511            // Still show roofline analysis
512            if roofline {
513                let model = RooflineModel::rtx_4090();
514                let flops = 2.0 * (size as f64).powi(3);
515                println!("  Roofline analysis (static):");
516                println!("    Problem FLOPs: {:.2e}", flops);
517                if let Some(ridge) = model.ridge_point(Precision::Fp16, MemoryLevel::Dram) {
518                    println!("    FP16 ridge: {:.1} FLOP/byte", ridge);
519                }
520                println!();
521            }
522        }
523    }
524
525    Ok(())
526}
527
528/// Try to find a built CUDA binary that exercises a given kernel.
529fn find_kernel_binary(_kernel_name: &str) -> Option<(String, Vec<String>)> {
530    // Look for GPU-specific examples/benches
531    let candidates = [
532        "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo",
533        "./target/release/examples/gpu_batch_demo",
534        "/mnt/nvme-raid0/targets/trueno/release/examples/wgpu_backward_demo",
535        "./target/release/examples/wgpu_backward_demo",
536    ];
537    for path in &candidates {
538        if std::path::Path::new(path).exists() {
539            return Some((path.to_string(), vec![]));
540        }
541    }
542    None
543}
544
545/// Profile cuBLAS operations.
546pub fn profile_cublas(op: &str, size: u32) -> Result<()> {
547    println!("\n=== CGP cuBLAS Profile: {op} ({size}x{size}) ===\n");
548
549    let gpu_name = detect_gpu_name().unwrap_or_else(|| "Unknown GPU".to_string());
550    let driver = detect_driver_version().unwrap_or_else(|| "?".to_string());
551    println!("  GPU: {gpu_name} (Driver {driver})");
552
553    // Calculate expected performance from roofline
554    let flops = 2.0 * (size as f64).powi(3);
555    println!("  Operation: {op}");
556    println!("  Problem size: {size}x{size}x{size}");
557    println!("  FLOPs: {flops:.2e}");
558
559    // cuBLAS peak estimates for RTX 4090
560    let (peak_tflops, precision) = match op {
561        "gemm_f16" | "hgemm" => (330.0, "FP16 Tensor"),
562        "gemm_f32" | "sgemm" => (82.6, "FP32"),
563        "gemm_tf32" => (165.0, "TF32 Tensor"),
564        _ => (82.6, "FP32 (default)"),
565    };
566    println!("  Precision: {precision} (peak: {peak_tflops:.1} TFLOP/s)");
567
568    // Try to find a cuBLAS benchmark binary
569    let cublas_bin = find_cublas_binary();
570    match cublas_bin {
571        Some(bin) => {
572            println!("  Binary: {bin}");
573            if let Some(nsys) = NsysProfiler::detect() {
574                println!("  Profiling via nsys...");
575                match nsys.profile_binary(&bin, &[]) {
576                    Ok(stats) => {
577                        // Find cuBLAS kernels
578                        let cublas_kernels: Vec<_> = stats
579                            .iter()
580                            .filter(|s| {
581                                s.name.contains("gemm")
582                                    || s.name.contains("cublas")
583                                    || s.name.contains("Gemm")
584                            })
585                            .collect();
586                        if cublas_kernels.is_empty() {
587                            println!("  No cuBLAS kernels found in trace.");
588                        } else {
589                            println!("\n  cuBLAS Kernels:");
590                            for k in &cublas_kernels {
591                                let tflops = flops / (k.avg_us * 1e-6) / 1e12;
592                                let eff = tflops / peak_tflops * 100.0;
593                                println!(
594                                    "    {:50} avg={:.1}us  {:.1} TFLOP/s  ({:.1}% eff)",
595                                    k.name, k.avg_us, tflops, eff
596                                );
597                            }
598                        }
599                    }
600                    Err(e) => println!("  nsys profiling failed: {e}"),
601                }
602            }
603        }
604        None => {
605            // Estimate from known measurements
606            let est_time_us = flops / (peak_tflops * 0.9 * 1e12) * 1e6; // 90% of peak
607            let est_tflops = flops / (est_time_us * 1e-6) / 1e12;
608            println!("\n  Estimated (no binary found):");
609            println!("    Time: ~{est_time_us:.1} us");
610            println!(
611                "    Throughput: ~{est_tflops:.1} TFLOP/s ({:.1}% of peak)",
612                est_tflops / peak_tflops * 100.0
613            );
614            println!("\n  To profile real cuBLAS:");
615            println!("    cgp profile binary ./your_cublas_bench --kernel-filter gemm");
616        }
617    }
618
619    println!();
620    Ok(())
621}
622
623/// Find a cuBLAS benchmark binary.
624fn find_cublas_binary() -> Option<String> {
625    let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
626    let mut candidates: Vec<String> = Vec::new();
627    if !target_dir.is_empty() {
628        candidates.push(format!("{target_dir}/release/examples/bench_cublas_vs_ptx"));
629        candidates.push(format!("{target_dir}/release/examples/gpu_batch_demo"));
630    }
631    candidates.extend_from_slice(&[
632        "/mnt/nvme-raid0/targets/trueno/release/examples/bench_cublas_vs_ptx".to_string(),
633        "/mnt/nvme-raid0/targets/trueno/release/examples/gpu_batch_demo".to_string(),
634    ]);
635    for path in &candidates {
636        if std::path::Path::new(path).exists() {
637            return Some(path.clone());
638        }
639    }
640    None
641}
642
643// ── nsys integration ──
644
645/// Wraps `nsys` CLI for system-wide timeline profiling.
646pub struct NsysProfiler {
647    pub nsys_path: PathBuf,
648}
649
650/// Parsed nsys stats output — one entry per kernel.
651#[derive(Debug, Clone)]
652pub struct NsysKernelStat {
653    pub name: String,
654    pub calls: u64,
655    pub total_us: f64,
656    pub avg_us: f64,
657    pub min_us: f64,
658    pub max_us: f64,
659}
660
661impl NsysProfiler {
662    pub fn detect() -> Option<Self> {
663        which::which("nsys")
664            .ok()
665            .map(|path| Self { nsys_path: path })
666    }
667
668    /// Run nsys profile and capture stats.
669    pub fn profile_binary(
670        &self,
671        binary: &str,
672        binary_args: &[&str],
673    ) -> Result<Vec<NsysKernelStat>> {
674        let report_path = format!("/tmp/cgp-nsys-{}", std::process::id());
675
676        let mut cmd = Command::new(&self.nsys_path);
677        cmd.arg("profile")
678            .arg("--stats=true")
679            .arg("--force-overwrite=true")
680            .arg("-o")
681            .arg(&report_path)
682            .arg(binary)
683            .args(binary_args);
684
685        let output = cmd.output().with_context(|| "Failed to run nsys")?;
686        let stdout = String::from_utf8_lossy(&output.stdout);
687        let stderr = String::from_utf8_lossy(&output.stderr);
688        let combined = format!("{stdout}\n{stderr}");
689
690        let stats = parse_nsys_stats(&combined);
691
692        // Clean up report files
693        let _ = std::fs::remove_file(format!("{report_path}.nsys-rep"));
694        let _ = std::fs::remove_file(format!("{report_path}.sqlite"));
695
696        Ok(stats)
697    }
698}
699
700/// Parse nsys stats output for CUDA kernel summary.
701fn parse_nsys_stats(output: &str) -> Vec<NsysKernelStat> {
702    let mut stats = Vec::new();
703    let mut in_kernel_section = false;
704
705    for line in output.lines() {
706        if line.contains("CUDA Kernel Statistics") || line.contains("cuda_gpu_kern_sum") {
707            in_kernel_section = true;
708            continue;
709        }
710        if in_kernel_section && line.trim().is_empty() {
711            in_kernel_section = false;
712            continue;
713        }
714        if in_kernel_section {
715            // nsys stats format varies; try to parse common patterns
716            // Typical: Time(%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  Name
717            let parts: Vec<&str> = line.split_whitespace().collect();
718            if parts.len() >= 8 {
719                if let (Ok(total_ns), Ok(instances)) = (
720                    parts[1].replace(',', "").parse::<f64>(),
721                    parts[2].replace(',', "").parse::<u64>(),
722                ) {
723                    let avg_ns = parts[3].replace(',', "").parse::<f64>().unwrap_or(0.0);
724                    let min_ns = parts[5].replace(',', "").parse::<f64>().unwrap_or(0.0);
725                    let max_ns = parts[6].replace(',', "").parse::<f64>().unwrap_or(0.0);
726                    let name = parts[7..].join(" ");
727                    stats.push(NsysKernelStat {
728                        name,
729                        calls: instances,
730                        total_us: total_ns / 1000.0,
731                        avg_us: avg_ns / 1000.0,
732                        min_us: min_ns / 1000.0,
733                        max_us: max_ns / 1000.0,
734                    });
735                }
736            }
737        }
738    }
739    stats
740}
741
742/// Profile an arbitrary binary via nsys.
743pub fn profile_binary(
744    path: &str,
745    kernel_filter: Option<&str>,
746    _trace: bool,
747    _duration: Option<&str>,
748) -> Result<()> {
749    println!("\n=== CGP Binary Profile: {path} ===\n");
750
751    let profiler = match NsysProfiler::detect() {
752        Some(p) => p,
753        None => {
754            println!("  nsys not found. Install NVIDIA Nsight Systems.");
755            println!("  Falling back to wall-clock timing...");
756
757            // Fallback: just run and time it
758            let start = std::time::Instant::now();
759            let status = Command::new(path).status()?;
760            let elapsed = start.elapsed();
761            println!(
762                "  Wall time: {:.2}ms (exit code: {})",
763                elapsed.as_secs_f64() * 1000.0,
764                status.code().unwrap_or(-1)
765            );
766            return Ok(());
767        }
768    };
769
770    eprintln!("Running nsys profile (this may take a moment)...");
771    let stats = profiler.profile_binary(path, &[])?;
772
773    if stats.is_empty() {
774        println!("  No CUDA kernels detected. Binary may be CPU-only.");
775        println!("  For CPU profiling, use: cgp profile simd or cgp profile scalar");
776    } else {
777        println!(
778            "  {:40} {:>8} {:>12} {:>12} {:>12}",
779            "Kernel", "Calls", "Avg (us)", "Min (us)", "Max (us)"
780        );
781        println!("  {}", "-".repeat(88));
782
783        for stat in &stats {
784            let name = if let Some(filter) = kernel_filter {
785                if !stat.name.contains(filter) {
786                    continue;
787                }
788                &stat.name
789            } else {
790                &stat.name
791            };
792
793            let display_name = if name.len() > 40 {
794                format!("{}...", &name[..37])
795            } else {
796                name.to_string()
797            };
798
799            println!(
800                "  {:40} {:>8} {:>12.1} {:>12.1} {:>12.1}",
801                display_name, stat.calls, stat.avg_us, stat.min_us, stat.max_us
802            );
803        }
804        println!();
805        println!(
806            "  Total kernels: {}, Total CUDA time: {:.1} us",
807            stats.len(),
808            stats.iter().map(|s| s.total_us).sum::<f64>()
809        );
810    }
811
812    println!();
813    Ok(())
814}
815
816/// Profile a Python script via nsys + perf stat.
817pub fn profile_python(args: &[String]) -> Result<()> {
818    let cmd_str = args.join(" ");
819    println!("\n=== CGP Python Profile ===\n");
820    println!("  Command: {cmd_str}");
821
822    if let Some(profiler) = NsysProfiler::detect() {
823        eprintln!("Running nsys profile on Python script...");
824        // Build the full command: uv run python <args>
825        let python_cmd = if args.is_empty() {
826            anyhow::bail!(
827                "No Python script specified. Usage: cgp profile python -- uv run python script.py"
828            );
829        } else {
830            args[0].clone()
831        };
832        let python_args: Vec<&str> = args[1..].iter().map(|s| s.as_str()).collect();
833        let stats = profiler.profile_binary(&python_cmd, &python_args)?;
834
835        if stats.is_empty() {
836            println!("  No CUDA kernels detected (CPU-only Python workload).");
837            println!("  Use perf stat for CPU profiling.");
838        } else {
839            println!("  CUDA kernels captured via nsys:");
840            for stat in &stats {
841                println!(
842                    "    {} — {} calls, avg {:.1}us",
843                    stat.name, stat.calls, stat.avg_us
844                );
845            }
846        }
847    } else {
848        println!("  nsys not found — cannot capture CUDA kernels.");
849        println!("  Falling back to wall-clock timing.");
850
851        if !args.is_empty() {
852            let start = std::time::Instant::now();
853            let status = Command::new(&args[0]).args(&args[1..]).status()?;
854            let elapsed = start.elapsed();
855            println!(
856                "  Wall time: {:.2}ms (exit code: {})",
857                elapsed.as_secs_f64() * 1000.0,
858                status.code().unwrap_or(-1)
859            );
860        }
861    }
862
863    println!();
864    Ok(())
865}
866
867/// Run `cgp trace` — system-wide timeline via nsys.
868pub fn run_trace(binary: &str, duration: Option<&str>) -> Result<()> {
869    println!("\n=== CGP System Trace: {binary} ===\n");
870
871    let profiler = match NsysProfiler::detect() {
872        Some(p) => p,
873        None => {
874            anyhow::bail!("nsys not found. Install NVIDIA Nsight Systems for timeline tracing.");
875        }
876    };
877
878    let report_path = format!("/tmp/cgp-trace-{}", std::process::id());
879
880    let mut cmd = Command::new(&profiler.nsys_path);
881    cmd.arg("profile")
882        .arg("--stats=true")
883        .arg("--force-overwrite=true")
884        .arg("--trace=cuda,nvtx,osrt")
885        .arg("-o")
886        .arg(&report_path);
887
888    if let Some(dur) = duration {
889        cmd.arg("--duration").arg(dur);
890    }
891
892    cmd.arg(binary);
893
894    eprintln!("Running nsys trace (this may take a while)...");
895    let output = cmd.output().with_context(|| "Failed to run nsys trace")?;
896    let stdout = String::from_utf8_lossy(&output.stdout);
897    let stderr = String::from_utf8_lossy(&output.stderr);
898
899    // Print stats summary
900    let combined = format!("{stdout}\n{stderr}");
901    let mut in_summary = false;
902    for line in combined.lines() {
903        if line.contains("CUDA API Statistics")
904            || line.contains("CUDA Kernel Statistics")
905            || line.contains("OS Runtime Statistics")
906        {
907            in_summary = true;
908            println!("  {line}");
909            continue;
910        }
911        if in_summary {
912            if line.trim().is_empty() {
913                in_summary = false;
914                println!();
915            } else {
916                println!("  {line}");
917            }
918        }
919    }
920
921    let report_file = format!("{report_path}.nsys-rep");
922    if std::path::Path::new(&report_file).exists() {
923        println!("  Report: {report_file}");
924        println!("  View: nsys-ui {report_file}");
925    }
926
927    println!();
928    Ok(())
929}
930
931#[cfg(test)]
932mod tests {
933    use super::*;
934
935    #[test]
936    fn test_ncu_section_args() {
937        assert_eq!(NcuSection::LaunchStats.as_ncu_arg(), "LaunchStats");
938        assert_eq!(NcuSection::Roofline.as_ncu_arg(), "SpeedOfLight");
939        assert_eq!(NcuSection::WarpState.as_ncu_arg(), "WarpStateStats");
940    }
941
942    #[test]
943    fn test_ncu_command_build() {
944        if let Some(profiler) = NcuProfiler::detect() {
945            assert!(profiler.ncu_path.exists());
946        }
947    }
948
949    #[test]
950    fn test_parse_ncu_csv_empty() {
951        let metrics = parse_ncu_csv("").unwrap();
952        assert!(metrics.is_empty());
953    }
954
955    #[test]
956    fn test_parse_ncu_csv_sample() {
957        let csv = r#""ID","Metric Name","Metric Unit","Metric Value"
958"0","sm__throughput.avg.pct_of_peak_sustained_elapsed","%","42.3"
959"0","dram__throughput.avg.pct_of_peak_sustained_elapsed","%","7.8"
960"0","launch__registers_per_thread","register/thread","48"
961"0","gpu__time_duration.sum","nsecond","23200"
962"#;
963        let metrics = parse_ncu_csv(csv).unwrap();
964        assert_eq!(
965            get_f64(&metrics, "sm__throughput.avg.pct_of_peak_sustained_elapsed"),
966            42.3
967        );
968        assert_eq!(get_u32(&metrics, "launch__registers_per_thread"), 48);
969        assert_eq!(get_f64(&metrics, "gpu__time_duration.sum"), 23200.0);
970    }
971
972    #[test]
973    fn test_ncu_metrics_to_profile() {
974        let mut metrics = HashMap::new();
975        metrics.insert(
976            "gpu__time_duration.sum".to_string(),
977            "23200".to_string(), // 23.2 us
978        );
979        metrics.insert(
980            "sm__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
981            "42.3".to_string(),
982        );
983        metrics.insert(
984            "dram__throughput.avg.pct_of_peak_sustained_elapsed".to_string(),
985            "7.8".to_string(),
986        );
987        metrics.insert("launch__registers_per_thread".to_string(), "48".to_string());
988        metrics.insert(
989            "smsp__thread_inst_executed_per_inst_executed.pct".to_string(),
990            "100.0".to_string(),
991        );
992
993        let profile = ncu_metrics_to_profile(&metrics, "gemm_cta_wmma_fp16", 512);
994        assert_eq!(profile.kernel.as_ref().unwrap().name, "gemm_cta_wmma_fp16");
995        assert!(profile.timing.wall_clock_time_us > 0.0);
996        assert!(profile.throughput.tflops > 0.0);
997        assert!(profile.gpu_compute.is_some());
998    }
999
1000    #[test]
1001    fn test_parse_nsys_stats_empty() {
1002        let stats = parse_nsys_stats("");
1003        assert!(stats.is_empty());
1004    }
1005
1006    #[test]
1007    fn test_render_profile_no_panic() {
1008        let profile = FullProfile {
1009            version: "2.0".to_string(),
1010            hardware: HardwareInfo {
1011                gpu: Some("Test GPU".to_string()),
1012                gpu_sm: Some("8.9".to_string()),
1013                ..Default::default()
1014            },
1015            kernel: Some(KernelInfo {
1016                name: "test_kernel".to_string(),
1017                dimensions: vec![256, 256, 256],
1018                ..Default::default()
1019            }),
1020            timing: TimingMetrics {
1021                wall_clock_time_us: 10.0,
1022                samples: 1,
1023                ..Default::default()
1024            },
1025            throughput: ThroughputMetrics {
1026                tflops: 5.0,
1027                ..Default::default()
1028            },
1029            gpu_compute: Some(GpuComputeMetrics {
1030                sm_utilization_pct: 40.0,
1031                warp_execution_efficiency_pct: 100.0,
1032                achieved_occupancy_pct: 33.0,
1033                register_usage_per_thread: 48,
1034                ..Default::default()
1035            }),
1036            gpu_memory: Some(GpuMemoryMetrics {
1037                dram_throughput_pct: 7.8,
1038                l2_hit_rate_pct: 87.0,
1039                global_load_efficiency_pct: 72.0,
1040                ..Default::default()
1041            }),
1042            ..Default::default()
1043        };
1044        // Should not panic
1045        render_profile(&profile);
1046    }
1047}