use std::io::{self, BufRead, Write};
use anyhow::Result;
use super::commands;
use super::db::GpuDb;
/// Runs the interactive `gpu>` prompt against `db`.
///
/// Prints the stats summary once, then loops: show the prompt, read one line
/// from stdin, split it on whitespace, and dispatch on the first word. The
/// loop ends on end-of-input or on `quit` / `exit` / `q`. Unknown commands
/// print a hint and the loop continues.
///
/// # Errors
///
/// Returns an error if flushing stdout or reading a line from stdin fails.
pub fn run(db: &mut GpuDb) -> Result<()> {
    let stdin = io::stdin();
    let mut stdout = io::stdout();

    // Greet the user with the overall summary before the first prompt.
    commands::cmd_stats(db);
    println!();

    // A single buffer is reused for every input line.
    let mut buf = String::new();
    loop {
        // `print!` does not flush, so push the prompt out explicitly.
        print!("gpu> ");
        stdout.flush()?;

        buf.clear();
        // The stdin lock lives only for this statement, so it is released
        // before any command handler runs.
        let bytes_read = stdin.lock().read_line(&mut buf)?;
        if bytes_read == 0 {
            // Zero bytes means end-of-input.
            break;
        }

        // First token is the command, everything after it is the argument list.
        let tokens: Vec<&str> = buf.split_whitespace().collect();
        let (cmd, args) = match tokens.split_first() {
            Some((head, tail)) => (*head, tail),
            // Blank or whitespace-only line: just prompt again.
            None => continue,
        };

        match cmd {
            // Session control
            "quit" | "exit" | "q" => break,
            "help" | "h" | "?" => cmd_help(args),

            // Everything else is forwarded to the `commands` module.
            "stats" => commands::cmd_stats(db),
            "kernels" | "k" => commands::cmd_kernels(db, args),
            "ops" => commands::cmd_ops(db, args),
            "inspect" | "i" => commands::cmd_inspect(db, args),
            "bound" => commands::cmd_bound(db, args),
            "roofline" | "roof" => commands::cmd_roofline(db, args),
            "occupancy" | "occ" => commands::cmd_occupancy(db, args),
            "transfers" | "xfer" => commands::cmd_transfers(db, args),
            "gaps" => commands::cmd_gaps(db, args),
            "overlap" => commands::cmd_overlap(db),
            "streams" => commands::cmd_streams(db),
            "timeline" | "tl" => commands::cmd_timeline(db, args),
            "trace" => commands::cmd_trace(db, args),
            "callers" => commands::cmd_callers(db, args),
            "layers" => commands::cmd_layers(db),
            "suggest" => commands::cmd_suggest(db),
            "save" => commands::cmd_save(db, args),
            "list" | "ls" => commands::cmd_list(),
            "diff" => commands::cmd_diff(db, args),
            "focus" => commands::cmd_focus(db, args),
            "ignore" => commands::cmd_ignore(db, args),
            "region" => commands::cmd_region(db, args),
            "reset" => commands::cmd_reset(db),
            "variance" | "var" => commands::cmd_variance(db, args),
            "warmup" => commands::cmd_warmup(db),
            "small" => commands::cmd_small(db, args),
            "fuse" => commands::cmd_fuse(db, args),
            "concurrency" | "conc" => commands::cmd_concurrency(db),
            "hotpath" | "hot" => commands::cmd_hotpath(db),
            "compare-ops" | "cmp" => commands::cmd_compare_ops(db, args),
            "top-ops" | "top" => commands::cmd_top_ops(db, args),
            "breakdown" | "br" => commands::cmd_breakdown(db, args),
            "idle-between" | "idle" => commands::cmd_idle_between(db, args),
            "outliers" => commands::cmd_outliers(db, args),
            "source" | "src" => commands::cmd_source(db, args),
            "memory" | "mem" => commands::cmd_memory(db, args),
            "bandwidth" | "bw" => commands::cmd_bandwidth(db, args),
            "critical-path" | "cp" => commands::cmd_critical_path(db, args),
            "stream-graph" | "sg" => commands::cmd_stream_graph(db, args),
            "hotspot" => commands::cmd_hotspot(db, args),
            "launches" => commands::cmd_launches(db, args),
            "compare" | "cmp-k" => commands::cmd_compare(db, args),
            "regressions" | "regr" => commands::cmd_regressions(db, args),

            // Anything unrecognized: report it and point at `help`.
            _ => {
                println!("unknown command: {cmd}");
                println!("type 'help' for available commands");
            }
        }
    }

    Ok(())
}
/// Prints REPL help to stdout.
///
/// With no arguments, prints the full command overview grouped by section.
/// With any arguments, prints a notice that no detailed help exists for the
/// space-joined argument text.
fn cmd_help(args: &[&str]) {
    // Per-topic help is not available; report the requested topic and stop.
    if !args.is_empty() {
        println!("no detailed help for '{}'", args.join(" "));
        return;
    }

    // One entry per printed line; a trailing "\n" inside an entry yields the
    // blank separator line after a section.
    const OVERVIEW: &[&str] = &[
        "GPU Profile REPL — Commands:\n",
        " Hotspots",
        " kernels [N] [pattern] Top kernels by total GPU time",
        " ops [N] [pattern] Top operators (needs torch/proton layer)",
        " stats Overall summary\n",
        " Analysis",
        " roofline [pattern] Classify compute-bound vs memory-bound",
        " bound <kernel> Detailed boundedness diagnosis",
        " occupancy [N] SM occupancy ranking",
        " variance <kernel> Launch-to-launch timing variance",
        " outliers <kernel> Slowest launches, timeline position, clustering",
        " source <kernel> Op/file that launched this kernel (needs torch/proton)",
        " memory [N] GPU allocations: peak, leaks, largest, churn",
        " bandwidth [N] [pat] Per-kernel achieved memory bandwidth (needs ncu)",
        " critical-path [gap_us] Longest same-stream kernel chain",
        " stream-graph [width] ASCII timeline with streams as rows",
        " hotspot <window_us> Hottest N-us window and what ran in it",
        " launches <kernel> [N] Every launch: time, grid/block, stream, gap",
        " compare <ka> <kb> Side-by-side stats for two kernels",
        " regressions <s> [%] [us] Diff filtered by % and absolute thresholds",
        " warmup Detect warmup launches before steady state",
        " small [N] Kernels where launch overhead > compute",
        " fuse [N] Sequential kernels that could be fused",
        " concurrency Stream utilization and parallelism gaps",
        " hotpath Critical path through ops (CPU vs GPU bound)",
        " compare-ops [N] CPU vs GPU time ratio per operator",
        " top-ops [N] [pattern] Ops ranked by GPU time (not CPU)",
        " breakdown <op> Which kernels an op expands into",
        " idle-between <a> <b> GPU idle gap between two ops\n",
        " Timeline",
        " transfers [N] Memory copies ranked by cost",
        " gaps [N] GPU idle periods",
        " overlap Compute/transfer concurrency",
        " streams Per-stream utilization",
        " timeline [N] Chronological kernel launches\n",
        " Drill-down",
        " inspect <kernel> Full detail from all layers",
        " trace <op> Op -> kernel mapping",
        " callers <kernel> Which op launched this kernel\n",
        " Data management",
        " layers Show loaded data layers",
        " suggest Suggest what data to collect next",
        " save <name> Save session for later",
        " list List saved sessions",
        " diff <name> Compare against saved session\n",
        " Filtering",
        " focus <pattern> Show only matching kernels",
        " ignore <pattern> Hide matching kernels",
        " region <name> Focus on NVTX / profiler step",
        " reset Clear all filters\n",
        " quit Exit REPL",
    ];

    for entry in OVERVIEW {
        println!("{entry}");
    }
}