use super::db::{GpuDb, escape_sql_like};
/// Interprets the first argument as a row count.
///
/// Returns 10 when there is no first argument or it does not parse as a
/// `usize`.
fn parse_count(args: &[&str]) -> usize {
    match args.first().map(|s| s.parse::<usize>()) {
        Some(Ok(count)) => count,
        _ => 10,
    }
}
/// Extracts the optional name pattern from a command's arguments.
///
/// If the first argument parses as a `usize` it is treated as a count and the
/// pattern is the second argument (if any); otherwise the first argument is
/// the pattern. Returns `None` when no pattern is present.
fn parse_pattern<'a>(args: &'a [&'a str]) -> Option<&'a str> {
    let (&head, rest) = args.split_first()?;
    match head.parse::<usize>() {
        Ok(_) => rest.first().copied(),
        Err(_) => Some(head),
    }
}
/// Formats a duration given in microseconds with an adaptive unit:
/// seconds (2 decimals) from 1,000,000 up, milliseconds (1 decimal) from
/// 1,000 up, otherwise microseconds (1 decimal).
fn fmt_us(us: f64) -> String {
    const US_PER_SEC: f64 = 1_000_000.0;
    const US_PER_MS: f64 = 1_000.0;
    match us {
        v if v >= US_PER_SEC => format!("{:.2}s", v / US_PER_SEC),
        v if v >= US_PER_MS => format!("{:.1}ms", v / US_PER_MS),
        v => format!("{:.1}us", v),
    }
}
/// Formats a byte count using binary units (GB/MB/KB with one decimal),
/// falling back to a plain `"<n> B"` for values below 1024.
fn fmt_bytes(b: i64) -> String {
    // Largest unit first so the first threshold reached wins.
    const UNITS: [(i64, &str); 3] = [
        (1_073_741_824, "GB"),
        (1_048_576, "MB"),
        (1024, "KB"),
    ];
    UNITS
        .iter()
        .find(|(size, _)| b >= *size)
        .map(|(size, unit)| format!("{:.1} {}", b as f64 / *size as f64, unit))
        .unwrap_or_else(|| format!("{b} B"))
}
/// Truncates `s` to at most `max` characters (not bytes).
///
/// Strings that already fit are returned unchanged. Longer strings are cut to
/// `max - 3` characters followed by `"..."`. When `max` is smaller than 3
/// there is no room for the ellipsis, so the first `max` characters are
/// returned as-is instead of underflowing `max - 3`.
pub(crate) fn trunc(s: &str, max: usize) -> String {
    if s.chars().count() <= max {
        return s.to_string();
    }
    if max < 3 {
        // Too narrow for "..."; a hard cut keeps the width guarantee.
        return s.chars().take(max).collect();
    }
    let mut out: String = s.chars().take(max - 3).collect();
    out.push_str("...");
    out
}
/// Builds a "contains" parameter for a SQL `LIKE` clause: the pattern is
/// passed through `escape_sql_like` and wrapped in `%` on both sides.
fn like_param(pattern: &str) -> String {
    let escaped = escape_sql_like(pattern);
    format!("%{}%", escaped)
}
/// Backslash-escapes the regex metacharacters `\ ^ $ . | ? * + ( ) [ ] { }`
/// in `s`, leaving every other character untouched.
pub(crate) fn escape_regex(s: &str) -> String {
    const META: &[char] = &[
        '\\', '^', '$', '.', '|', '?', '*', '+', '(', ')', '[', ']', '{', '}',
    ];
    s.chars()
        .fold(String::with_capacity(s.len() + 8), |mut escaped, c| {
            if META.contains(&c) {
                escaped.push('\\');
            }
            escaped.push(c);
            escaped
        })
}
pub(crate) fn compute_gpu_gaps(db: &GpuDb) -> Vec<(f64, f64)> {
let tl = db.timeline_filter();
let kernel_sql = format!(
"SELECT start_us, start_us + duration_us AS end_us
FROM launches WHERE start_us IS NOT NULL AND {tl}"
);
let mut intervals: Vec<(f64, f64)> = db.query_vec(&kernel_sql, [], |row| {
Ok((row.get::<_,f64>(0)?, row.get::<_,f64>(1)?))
});
let xfer_intervals: Vec<(f64, f64)> = db.query_vec(
"SELECT start_us, start_us + duration_us FROM transfers WHERE start_us IS NOT NULL",
[],
|row| Ok((row.get(0)?, row.get(1)?)),
);
intervals.extend(xfer_intervals);
intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let mut gaps = Vec::new();
if let Some(&(_, mut cur_end)) = intervals.first() {
for &(s, e) in &intervals[1..] {
if s <= cur_end {
if e > cur_end { cur_end = e; }
} else {
let gap = s - cur_end;
if gap > 1.0 {
gaps.push((cur_end, gap));
}
cur_end = e;
}
}
}
gaps
}
/// Total time during which at least one kernel or transfer was active.
///
/// Kernel launches (restricted by the timeline filter) and transfers with a
/// start time are combined, sorted by start, merged, and the lengths of the
/// merged intervals are summed so overlapping work is not double-counted.
///
/// Panics if two start values cannot be ordered (`partial_cmp` returns `None`).
pub(crate) fn gpu_busy_us(db: &GpuDb) -> f64 {
    let tl = db.timeline_filter();
    let k_sql = format!(
        "SELECT start_us, start_us + duration_us FROM launches
WHERE start_us IS NOT NULL AND {tl}"
    );
    let mut spans: Vec<(f64, f64)> =
        db.query_vec(&k_sql, [], |row| Ok((row.get(0)?, row.get(1)?)));
    let transfer_spans: Vec<(f64, f64)> = db.query_vec(
        "SELECT start_us, start_us + duration_us FROM transfers WHERE start_us IS NOT NULL",
        [],
        |row| Ok((row.get(0)?, row.get(1)?)),
    );
    spans.extend(transfer_spans);
    spans.sort_by(|lhs, rhs| lhs.0.partial_cmp(&rhs.0).unwrap());

    let merged = merge_intervals(&spans);
    merged.iter().map(|&(start, end)| end - start).sum()
}
/// Returns whether a `torch` or `proton` layer is loaded, printing a hint to
/// stdout when neither is.
fn require_op_layer(db: &GpuDb) -> bool {
    let available = db.has_layer("torch") || db.has_layer("proton");
    if !available {
        println!("no op data — need torch.profiler or proton layer");
    }
    available
}
/// Prints the profile summary: target/device, wall/kernel/transfer time,
/// efficiency (only with an `nsys` layer and a positive wall time), launch,
/// transfer and stream counts, loaded and missing layers, metric/op coverage,
/// recorded failures, and any consistency warnings.
pub fn cmd_stats(db: &GpuDb) {
    let target = db.meta("target");
    let device = db.meta("device");
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    let gpu_us = db.total_gpu_time_us();
    let xfer_us: f64 = db.scalar_f64("SELECT COALESCE(SUM(duration_us),0) FROM transfers");
    let has_wall = wall_us > 0.0;
    // Percentages are reported as 0 when the wall time is unknown.
    let pct_of_wall = |v: f64| if has_wall { v / wall_us * 100.0 } else { 0.0 };

    println!("GPU Profile Summary");
    println!(" Target: {target}");
    if !device.is_empty() {
        println!(" Device: {device}");
    }
    println!(" Wall time: {}", fmt_us(wall_us));
    println!(
        " Kernel time: {} ({:.1}% of wall)",
        fmt_us(gpu_us),
        pct_of_wall(gpu_us)
    );
    if xfer_us > 0.0 {
        println!(
            " Transfer time: {} ({:.1}% of wall)",
            fmt_us(xfer_us),
            pct_of_wall(xfer_us)
        );
    }
    if has_wall && db.has_layer("nsys") {
        let useful = gpu_busy_us(db);
        println!(
            " Efficiency: {:.1}% ({} useful GPU / {} wall)",
            useful / wall_us * 100.0,
            fmt_us(useful),
            fmt_us(wall_us)
        );
    }
    println!(
        " Kernels: {} launches, {} unique",
        db.total_launch_count(),
        db.unique_kernel_count()
    );
    println!(" Transfers: {}", db.transfer_count());
    println!(" Streams: {}", db.stream_count());

    let layers = db.layer_names();
    let has_nsys = db.has_layer("nsys");
    let has_ncu = db.has_layer("ncu");
    let has_torch = db.has_layer("torch");
    match layers.is_empty() {
        true => println!(" Layers: (none)"),
        false => println!(" Layers: {}", layers.join(" + ")),
    }

    // `torch` only counts as missing for Python targets.
    let missing: Vec<&str> = [
        (!has_nsys, "nsys"),
        (!has_ncu, "ncu"),
        (!has_torch && target.ends_with(".py"), "torch"),
    ]
    .iter()
    .filter(|(absent, _)| *absent)
    .map(|&(_, layer)| layer)
    .collect();
    if !missing.is_empty() {
        println!(" Missing: {} (run 'suggest')", missing.join(", "));
    }

    let uk = db.unique_kernel_count();
    let wm = db.kernels_with_metrics();
    println!(" Deep metrics: {wm}/{uk} kernels");
    let wo = db.kernels_with_ops();
    if wo > 0 {
        println!(" Op mapping: {wo}/{uk} kernels");
    }

    let failures = db.failures();
    if !failures.is_empty() {
        println!(" Failures: {} (run 'suggest')", failures.len());
    }

    let nsys_warn = db.meta("nsys_warning");
    if !nsys_warn.is_empty() {
        println!(" WARNING: {nsys_warn}");
    }
    if let Some(w) = db.check_target_consistency() {
        println!(" WARNING: {w}");
    }
    for w in db.check_kernel_consistency() {
        println!(" WARNING: {w}");
    }
}
/// Prints the top kernels by total GPU time.
///
/// `args` may hold an optional row count (default 10) and an optional
/// substring pattern restricting kernel names. For each kernel the table shows
/// total time, share of total GPU time, launch count, average, standard
/// deviation, the share of launches slower than twice the median, and the
/// boundedness classification from the `metrics` table when present.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_kernels(db: &GpuDb, args: &[&str]) {
    let n = parse_count(args);
    let pattern = parse_pattern(args);
    let filter = db.kernel_filter();
    let tl = db.timeline_filter();
    // The pattern is embedded inside a single-quoted SQL literal, so besides
    // escaping LIKE wildcards every `'` must be doubled; otherwise a pattern
    // containing a quote terminates the literal and corrupts the statement.
    // NOTE(review): assumes `escape_sql_like` only handles LIKE wildcards and
    // does not already double quotes — confirm against its definition.
    let pattern_clause = pattern
        .map(|p| {
            let escaped = escape_sql_like(p).replace('\'', "''");
            format!(r"AND launches.kernel_name LIKE '%{escaped}%' ESCAPE '\'")
        })
        .unwrap_or_default();
    let sql = format!(
        "SELECT launches.kernel_name,
COUNT(*) as cnt,
SUM(launches.duration_us) as total,
AVG(launches.duration_us) as avg,
AVG(launches.duration_us * launches.duration_us)
- AVG(launches.duration_us) * AVG(launches.duration_us) as var,
m.boundedness,
m.compute_throughput_pct,
m.memory_throughput_pct
FROM launches
LEFT JOIN metrics m ON m.kernel_name = launches.kernel_name
WHERE {filter} AND {tl} {pattern_clause}
GROUP BY launches.kernel_name
ORDER BY total DESC
LIMIT ?1"
    );
    let gpu_total = db.total_gpu_time_us();
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let rows: Vec<_> = stmt
        .query_map([n as i64], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, i64>(1)?,
                row.get::<_, f64>(2)?,
                row.get::<_, f64>(3)?,
                row.get::<_, f64>(4)?,
                row.get::<_, Option<String>>(5)?,
                row.get::<_, Option<f64>>(6)?,
                row.get::<_, Option<f64>>(7)?,
            ))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    println!(" # Kernel Time % Launches Avg Stddev Tail% Bound");
    println!(" ── ──────────────────────────────── ──────── ────── ────────── ───────── ───────── ────── ────────────");
    for (i, (name, cnt, total, avg, var, bound, cmp, mem)) in rows.iter().enumerate() {
        let pct = if gpu_total > 0.0 { total / gpu_total * 100.0 } else { 0.0 };
        // E[x^2] - E[x]^2 can dip slightly below zero from rounding.
        let stddev = var.max(0.0).sqrt();
        let tail_pct = tail_over_2x_median(db, name, &tl);
        let bound_str = match bound.as_deref() {
            Some("compute") => format!("cmp {:.0}%", cmp.unwrap_or(0.0)),
            Some("memory") => format!("mem {:.0}%", mem.unwrap_or(0.0)),
            Some("latency") => "latency".into(),
            _ => "[no ncu]".into(),
        };
        let tail_str = match tail_pct {
            Some(p) => format!("{p:.1}%"),
            None => "—".into(),
        };
        println!(" {:<2} {:<32} {:>8} {:>5.1}% {:>9} {:>9} {:>9} {:>6} {:<12}",
            i + 1, trunc(name, 32), fmt_us(*total), pct, cnt,
            fmt_us(*avg), fmt_us(stddev), tail_str, bound_str);
    }
}
/// Percentage of a kernel's launches whose duration exceeds twice the median.
///
/// `tl` is a SQL predicate appended to the `WHERE` clause. Returns `None`
/// when fewer than four launches match or the median is not positive. For an
/// even number of samples the upper of the two middle values is the median.
///
/// Panics if two durations cannot be ordered (`partial_cmp` returns `None`).
fn tail_over_2x_median(db: &GpuDb, kernel_name: &str, tl: &str) -> Option<f64> {
    let sql = format!(
        "SELECT duration_us FROM launches
WHERE kernel_name = ?1 AND {tl}"
    );
    let mut durations: Vec<f64> = db.query_vec(&sql, [kernel_name], |row| row.get(0));
    let total = durations.len();
    if total < 4 {
        return None;
    }
    // Counting is order-independent, so the samples can be sorted in place.
    durations.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
    let median = durations[total / 2];
    if median <= 0.0 {
        return None;
    }
    let cutoff = median * 2.0;
    let slow = durations.iter().filter(|&&d| d > cutoff).count();
    Some(slow as f64 / total as f64 * 100.0)
}
/// Prints the top ops by CPU time.
///
/// Requires a `torch` or `proton` layer. `args` may hold an optional row
/// count (default 10) and an optional substring pattern restricting op names.
/// Rows that cannot be decoded are skipped, matching the other commands in
/// this file, rather than aborting the whole listing.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_ops(db: &GpuDb, args: &[&str]) {
    if !require_op_layer(db) { return; }
    let n = parse_count(args);
    let pattern = parse_pattern(args);
    // The pattern lands inside a single-quoted SQL literal: double any `'` so
    // a quote in the pattern cannot terminate the literal.
    // NOTE(review): assumes `escape_sql_like` does not already double quotes.
    let pattern_clause = pattern
        .map(|p| {
            let escaped = escape_sql_like(p).replace('\'', "''");
            format!(r"AND name LIKE '%{escaped}%' ESCAPE '\'")
        })
        .unwrap_or_default();
    let sql = format!(
        "SELECT name, module_path, cpu_time_us, gpu_time_us, input_shapes
FROM ops WHERE 1=1 {pattern_clause}
ORDER BY cpu_time_us DESC LIMIT ?1"
    );
    let mut stmt = db.conn.prepare(&sql).unwrap();
    // Only the displayed columns are decoded, so a NULL in an unused column
    // (gpu_time_us, input_shapes) cannot invalidate an otherwise good row.
    let rows: Vec<(String, Option<String>, f64)> = stmt
        .query_map([n as i64], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, Option<String>>(1)?,
                row.get::<_, f64>(2)?,
            ))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    println!(" # Op CPU Time Module");
    println!(" ── ───────────────────────────────── ────────── ────────────");
    for (i, (name, module, cpu_time)) in rows.iter().enumerate() {
        println!(" {:<2} {:<34} {:>9} {}",
            i + 1, trunc(name, 34), fmt_us(*cpu_time),
            module.as_deref().unwrap_or(""));
    }
}
/// Prints a detailed report for the single kernel whose name contains the
/// given pattern: launch statistics, the five most common launch
/// configurations, hardware metrics from the `metrics` table, and the
/// originating ops from `op_kernel_map`.
///
/// If the pattern matches no kernel, or more than one, a message is printed
/// and nothing else is shown. In the ambiguous case every matching kernel
/// name is listed.
///
/// Panics if any statement cannot be prepared or executed.
pub fn cmd_inspect(db: &GpuDb, args: &[&str]) {
    let pattern = match args.first() {
        Some(p) => *p,
        None => { println!("usage: inspect <kernel_pattern>"); return; }
    };
    let tl = db.timeline_filter();
    let sql = format!(r"SELECT kernel_name, COUNT(*), SUM(duration_us), AVG(duration_us),
MIN(duration_us), MAX(duration_us)
FROM launches WHERE kernel_name LIKE ?1 ESCAPE '\' AND {tl}
GROUP BY kernel_name");
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let mut rows = stmt.query_map([like_param(pattern)], |row| {
        Ok((
            row.get::<_, String>(0)?,
            row.get::<_, i64>(1)?,
            row.get::<_, f64>(2)?,
            row.get::<_, f64>(3)?,
            row.get::<_, f64>(4)?,
            row.get::<_, f64>(5)?,
        ))
    }).unwrap();
    let (name, cnt, total, avg, min, max) = match rows.next() {
        Some(Ok(r)) => r,
        _ => { println!("no kernel matching '{pattern}'"); return; }
    };
    // Pulling a second row is how ambiguity is detected; keep that row so it
    // is listed too instead of being silently consumed by the check.
    if let Some(second) = rows.next() {
        println!("multiple matches for '{pattern}':");
        println!(" {name}");
        if let Ok((n, ..)) = second { println!(" {n}"); }
        for row in rows.by_ref() {
            if let Ok((n, ..)) = row { println!(" {n}"); }
        }
        println!("narrow the pattern");
        return;
    }
    // Release the borrow on the connection before preparing further statements.
    drop(rows);
    drop(stmt);
    println!("Kernel: {name}");
    println!(" Launches: {cnt}");
    println!(" Total: {}", fmt_us(total));
    println!(" Average: {}", fmt_us(avg));
    if cnt > 1 { println!(" Min: {}", fmt_us(min)); println!(" Max: {}", fmt_us(max)); }
    let config_sql = format!("SELECT grid_x, grid_y, grid_z, block_x, block_y, block_z,
COUNT(*) as cnt
FROM launches WHERE kernel_name = ?1
AND grid_x IS NOT NULL AND {tl}
GROUP BY grid_x, grid_y, grid_z, block_x, block_y, block_z
ORDER BY cnt DESC LIMIT 5");
    let mut stmt = db.conn.prepare(&config_sql).unwrap();
    let configs: Vec<_> = stmt.query_map([&name], |row| {
        Ok((row.get::<_,u32>(0)?, row.get::<_,u32>(1)?, row.get::<_,u32>(2)?,
            row.get::<_,u32>(3)?, row.get::<_,u32>(4)?, row.get::<_,u32>(5)?,
            row.get::<_,i64>(6)?))
    }).unwrap().filter_map(|r| r.ok()).collect();
    if !configs.is_empty() {
        println!();
        for (gx,gy,gz,bx,by,bz,c) in &configs {
            // Widen before multiplying so the product cannot overflow u32.
            let threads = *bx as u64 * *by as u64 * *bz as u64;
            println!(" grid=({gx},{gy},{gz}) block=({bx},{by},{bz}) threads/block={threads} x{c}");
        }
    }
    let m_sql = "SELECT occupancy_pct, compute_throughput_pct, memory_throughput_pct,
registers_per_thread, shared_mem_static_bytes, shared_mem_dynamic_bytes,
l2_hit_rate_pct, achieved_bandwidth_gb_s, peak_bandwidth_gb_s,
boundedness
FROM metrics WHERE kernel_name = ?1";
    if let Ok(m) = db.conn.query_row(m_sql, [&name], |row| {
        Ok((
            row.get::<_,Option<f64>>(0)?, row.get::<_,Option<f64>>(1)?,
            row.get::<_,Option<f64>>(2)?, row.get::<_,Option<i64>>(3)?,
            row.get::<_,Option<i64>>(4)?, row.get::<_,Option<i64>>(5)?,
            row.get::<_,Option<f64>>(6)?, row.get::<_,Option<f64>>(7)?,
            row.get::<_,Option<f64>>(8)?, row.get::<_,Option<String>>(9)?,
        ))
    }) {
        println!("\n Hardware Metrics (ncu):");
        if let Some(b) = &m.9 { println!(" Boundedness: {b}"); }
        if let Some(v) = m.0 { println!(" Occupancy: {v:.1}%"); }
        if let Some(v) = m.1 { println!(" Compute throughput: {v:.1}%"); }
        if let Some(v) = m.2 { println!(" Memory throughput: {v:.1}%"); }
        if let Some(v) = m.3 { println!(" Registers/thread: {v}"); }
        // Static plus dynamic shared memory; a missing value counts as 0.
        let shmem = m.4.unwrap_or(0) + m.5.unwrap_or(0);
        if shmem > 0 { println!(" Shared memory: {}", fmt_bytes(shmem)); }
        if let Some(v) = m.6 { println!(" L2 hit rate: {v:.1}%"); }
        if let (Some(a), Some(p)) = (m.7, m.8) {
            println!(" Bandwidth: {a:.1} / {p:.1} GB/s ({:.1}%)", a / p * 100.0);
        } else if let Some(a) = m.7 { println!(" Bandwidth: {a:.1} GB/s"); }
    } else {
        println!("\n No hardware metrics (need ncu)");
    }
    let op_sql = "SELECT o.name, o.module_path, o.input_shapes
FROM op_kernel_map okm JOIN ops o ON o.id = okm.op_id
WHERE okm.kernel_name = ?1";
    let mut stmt = db.conn.prepare(op_sql).unwrap();
    let ops: Vec<_> = stmt.query_map([&name], |row| {
        Ok((row.get::<_,String>(0)?, row.get::<_,Option<String>>(1)?, row.get::<_,Option<String>>(2)?))
    }).unwrap().filter_map(|r| r.ok()).collect();
    if !ops.is_empty() {
        println!("\n Origin (torch/proton):");
        for (opname, modpath, shapes) in &ops {
            println!(" Op: {opname}");
            if let Some(m) = modpath { println!(" Module: {m}"); }
            if let Some(s) = shapes { println!(" Shapes: {s}"); }
        }
    }
}
/// Prints the boundedness classification (compute / memory / latency) and the
/// supporting throughput figures for every kernel in `metrics` whose name
/// contains the given pattern.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_bound(db: &GpuDb, args: &[&str]) {
    let pattern = match args.first() {
        Some(p) => *p,
        None => { println!("usage: bound <kernel_pattern>"); return; }
    };
    // Must be a raw string: in a normal literal `\'` is an escape for `'`, so
    // the SQL would read `ESCAPE ''`, which SQLite rejects ("ESCAPE expression
    // must be a single character"), making every row fail and get dropped.
    let sql = r"SELECT m.kernel_name, m.boundedness,
m.compute_throughput_pct, m.memory_throughput_pct,
m.l2_hit_rate_pct, m.achieved_bandwidth_gb_s, m.peak_bandwidth_gb_s,
m.occupancy_pct
FROM metrics m WHERE m.kernel_name LIKE ?1 ESCAPE '\'";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let rows: Vec<_> = stmt.query_map([like_param(pattern)], |row| {
        Ok((
            row.get::<_,String>(0)?, row.get::<_,Option<String>>(1)?,
            row.get::<_,Option<f64>>(2)?, row.get::<_,Option<f64>>(3)?,
            row.get::<_,Option<f64>>(4)?, row.get::<_,Option<f64>>(5)?,
            row.get::<_,Option<f64>>(6)?, row.get::<_,Option<f64>>(7)?,
        ))
    }).unwrap().filter_map(|r| r.ok()).collect();
    if rows.is_empty() {
        println!("no metrics for kernel matching '{pattern}'");
        println!("need ncu data");
        return;
    }
    for (name, bound, cmp, mem, l2, bw, peak, occ) in &rows {
        println!("{name}:");
        match bound.as_deref() {
            Some("compute") => println!(" Compute-bound ({:.1}% compute, {:.1}% memory)", cmp.unwrap_or(0.0), mem.unwrap_or(0.0)),
            Some("memory") => {
                println!(" Memory-bound ({:.1}% memory, {:.1}% compute)", mem.unwrap_or(0.0), cmp.unwrap_or(0.0));
                if let Some(l) = l2 { println!(" L2 hit rate: {l:.1}%"); }
                if let (Some(a), Some(p)) = (bw, peak) {
                    println!(" Bandwidth: {a:.1} / {p:.1} GB/s ({:.1}% of peak)", a / p * 100.0);
                }
            }
            Some("latency") => {
                println!(" Latency-bound (low utilization)");
                if let Some(o) = occ { println!(" Occupancy: {o:.1}%"); }
            }
            // Unknown or missing classification: show the raw throughputs.
            _ => println!(" Compute: {:.1}%, Memory: {:.1}%", cmp.unwrap_or(0.0), mem.unwrap_or(0.0)),
        }
    }
}
pub fn cmd_roofline(db: &GpuDb, args: &[&str]) {
if !db.has_layer("ncu") {
println!("no ncu metrics — roofline requires hardware counters");
return;
}
let pattern = parse_pattern(args);
let pat = pattern.map(|p| like_param(p)).unwrap_or_else(|| "%".into());
let sql = "SELECT kernel_name, boundedness, compute_throughput_pct,
memory_throughput_pct, occupancy_pct
FROM metrics WHERE kernel_name LIKE ?1 ESCAPE '\'
ORDER BY kernel_name";
let mut stmt = db.conn.prepare(sql).unwrap();
let rows: Vec<_> = stmt.query_map([&pat], |row| {
Ok((row.get::<_,String>(0)?, row.get::<_,Option<String>>(1)?,
row.get::<_,Option<f64>>(2)?, row.get::<_,Option<f64>>(3)?,
row.get::<_,Option<f64>>(4)?))
}).unwrap().filter_map(|r| r.ok()).collect();
println!(" Kernel Bound Compute% Memory% Occupancy");
println!(" ────────────────────────────────── ──────── ──────── ──────── ──────────");
for (name, bound, cmp, mem, occ) in &rows {
println!(" {:<34} {:<8} {:>7.1}% {:>7.1}% {:>8}",
trunc(name, 34),
bound.as_deref().unwrap_or("?"),
cmp.unwrap_or(0.0), mem.unwrap_or(0.0),
occ.map(|v| format!("{v:.1}%")).unwrap_or_else(|| "?".into()));
}
}
/// Lists the kernels with the lowest occupancy (count taken from `args`,
/// default 10) together with registers per thread, shared memory in KiB and a
/// heuristic guess at the limiting resource: more than 64 registers →
/// "registers", more than 48 KiB shared memory → "shared mem", otherwise
/// "block size". Requires the `ncu` layer.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_occupancy(db: &GpuDb, args: &[&str]) {
    if !db.has_layer("ncu") {
        println!("no occupancy data — need ncu");
        return;
    }
    let n = parse_count(args);
    let sql = "SELECT kernel_name, occupancy_pct, registers_per_thread,
shared_mem_static_bytes + shared_mem_dynamic_bytes as shmem
FROM metrics WHERE occupancy_pct IS NOT NULL
ORDER BY occupancy_pct ASC LIMIT ?1";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let rows: Vec<(String, f64, Option<i64>, Option<i64>)> = stmt
        .query_map([n as i64], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();

    println!(" Kernel Occupancy Regs ShmemK Limiting");
    println!(" ────────────────────────────────── ───────── ───── ─────── ────────");
    for (name, occ, regs, shmem) in rows {
        // A missing value never exceeds its threshold.
        let limit = match (regs, shmem) {
            (Some(r), _) if r > 64 => "registers",
            (_, Some(s)) if s > 48 * 1024 => "shared mem",
            _ => "block size",
        };
        let regs_col = match regs {
            Some(v) => v.to_string(),
            None => "?".into(),
        };
        let shmem_col = match shmem {
            Some(v) => format!("{:.1}", v as f64 / 1024.0),
            None => "?".into(),
        };
        println!(" {:<34} {:>8.1}% {:>5} {:>6} {}",
            trunc(&name, 34), occ, regs_col, shmem_col, limit);
    }
}
/// Prints the memory-transfer report: overall totals, the transfer:compute
/// time ratio with a verdict, a per-kind breakdown, a small-transfer (<1 MB)
/// summary, a cumulative-time-by-size table, and the longest individual
/// transfers (count taken from `args`, default 10) with heuristic notes.
///
/// Bandwidth is computed as `bytes / duration_us / 1000` and labelled GB/s.
///
/// Panics if any statement cannot be prepared or executed.
pub fn cmd_transfers(db: &GpuDb, args: &[&str]) {
    /// bytes per microsecond, scaled by 1/1000.
    fn gb_per_s(bytes: i64, dur_us: f64) -> f64 {
        bytes as f64 / dur_us / 1000.0
    }

    if db.transfer_count() == 0 {
        println!("no memory transfers recorded");
        if !db.has_layer("nsys") {
            println!("need nsys layer for transfer data");
        }
        return;
    }
    let n = parse_count(args);

    // ---- overall totals -------------------------------------------------
    let (total_bytes, total_time): (i64, f64) = db.conn.query_row(
        "SELECT COALESCE(SUM(bytes),0), COALESCE(SUM(duration_us),0) FROM transfers",
        [],
        |row| Ok((row.get(0)?, row.get(1)?)),
    ).unwrap();
    let kernel_time = db.total_gpu_time_us();
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    println!(" Total: {} transfers, {}, {}",
        db.transfer_count(), fmt_bytes(total_bytes), fmt_us(total_time));
    if wall_us > 0.0 {
        println!(" {:.1}% of wall time spent on transfers", total_time / wall_us * 100.0);
    }
    if kernel_time > 0.0 {
        let ratio = total_time / kernel_time;
        let verdict = match ratio {
            r if r > 5.0 => "BANDWIDTH-BOUND — PCIe dominates",
            r if r > 1.5 => "transfer-heavy — consider async transfers or larger batches",
            r if r > 0.5 => "mixed compute/transfer",
            _ => "compute-dominated",
        };
        println!(" Transfer:compute ratio = {ratio:.2}:1 ({verdict})");
    }
    println!();

    // ---- per-kind breakdown ---------------------------------------------
    let kind_sql = "SELECT kind, COUNT(*), SUM(bytes), SUM(duration_us),
MIN(bytes), MAX(bytes)
FROM transfers GROUP BY kind ORDER BY SUM(duration_us) DESC";
    let mut stmt = db.conn.prepare(kind_sql).unwrap();
    let kinds: Vec<(String, i64, i64, f64, i64, i64)> = stmt
        .query_map([], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?, row.get(5)?))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    println!(" By Direction:");
    println!(" Kind Count Total Time Avg BW Size range");
    println!(" ───── ──────── ───────────── ─────────── ──────────── ────────────");
    for (kind, cnt, bytes, dur, min_b, max_b) in &kinds {
        let bw = if *dur > 0.0 {
            format!("{:.1} GB/s", gb_per_s(*bytes, *dur))
        } else {
            "?".into()
        };
        let range = if min_b == max_b {
            fmt_bytes(*min_b)
        } else {
            format!("{}-{}", fmt_bytes(*min_b), fmt_bytes(*max_b))
        };
        println!(" {:<5} {:>8} {:>13} {:>11} {:>12} {}",
            kind, cnt, fmt_bytes(*bytes), fmt_us(*dur), bw, range);
    }
    println!();

    // ---- small transfers --------------------------------------------------
    let (small_cnt, small_bytes, small_time): (i64, i64, f64) = db.conn.query_row(
        "SELECT COUNT(*), COALESCE(SUM(bytes),0), COALESCE(SUM(duration_us),0)
FROM transfers WHERE bytes < 1048576",
        [],
        |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
    ).unwrap();
    if small_cnt > 0 {
        let pct = if total_time > 0.0 { small_time / total_time * 100.0 } else { 0.0 };
        println!(" Small transfers: {} (<1 MB), {} total, {} time ({pct:.1}% of transfer time)",
            small_cnt, fmt_bytes(small_bytes), fmt_us(small_time));
        if small_cnt > 10 {
            println!(" → many small transfers — coalesce into fewer batched copies");
        }
    }
    if total_time > 0.0 {
        print_transfer_cdf(db, total_time);
    }

    // ---- longest individual transfers --------------------------------------
    let sql = "SELECT kind, bytes, duration_us, stream_id
FROM transfers ORDER BY duration_us DESC LIMIT ?1";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let rows: Vec<(String, i64, f64, Option<u32>)> = stmt
        .query_map([n as i64], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    println!("\n Top {} by Duration:", rows.len());
    println!(" # Kind Size Duration BW Stream Notes");
    println!(" ── ───── ────────── ────────── ──────────── ─────── ────────");
    for (i, (kind, bytes, dur, sid)) in rows.iter().enumerate() {
        let timed = *dur > 0.0;
        // An untimed transfer is treated as 0 GB/s for the note heuristic.
        let bw_num = if timed { gb_per_s(*bytes, *dur) } else { 0.0 };
        let bw_str = if timed { format!("{:.1} GB/s", bw_num) } else { "?".into() };
        let host_copy = kind == "H2D" || kind == "D2H";
        let notes = if *bytes >= 16 * 1024 * 1024 && host_copy && bw_num < 6.0 {
            "pageable? consider cudaMallocHost"
        } else if *bytes < 4096 {
            "tiny — overhead-dominated"
        } else {
            ""
        };
        let stream = match sid {
            Some(s) => s.to_string(),
            None => "?".into(),
        };
        println!(" {:<2} {:<5} {:>10} {:>10} {:>11} {:>6} {}",
            i + 1, kind, fmt_bytes(*bytes), fmt_us(*dur), bw_str, stream, notes);
    }
}
/// Prints how transfer time is distributed over transfer-size buckets.
///
/// Transfers are read in ascending size order and consumed bucket by bucket
/// (each bucket takes the rows with `bytes` strictly below its limit). Empty
/// buckets are omitted. Percentages are relative to `total_time`, which the
/// caller must ensure is non-zero.
fn print_transfer_cdf(db: &GpuDb, total_time: f64) {
    let buckets: [(i64, &str); 7] = [
        (4 * 1024, "<4 KB"),
        (64 * 1024, "<64 KB"),
        (1024 * 1024, "<1 MB"),
        (16 * 1024 * 1024, "<16 MB"),
        (128 * 1024 * 1024, "<128 MB"),
        (1024 * 1024 * 1024, "<1 GB"),
        (i64::MAX, ">=1 GB"),
    ];
    let rows: Vec<(i64, f64)> = db.query_vec(
        "SELECT bytes, duration_us FROM transfers ORDER BY bytes ASC",
        [],
        |row| Ok((row.get(0)?, row.get(1)?)),
    );
    if rows.is_empty() {
        return;
    }
    println!("\n Cumulative time by transfer size:");
    println!(" Size bucket Count Time Bucket % Cumulative %");
    println!(" ────────────── ──────── ──────────── ───────── ────────────");

    // `remaining` shrinks from the front as each bucket claims its rows.
    let mut remaining: &[(i64, f64)] = &rows;
    let mut cum = 0.0;
    for &(limit, label) in &buckets {
        let taken = remaining.iter().take_while(|r| r.0 < limit).count();
        if taken == 0 {
            continue;
        }
        let (bucket, rest) = remaining.split_at(taken);
        remaining = rest;
        let bucket_time = bucket.iter().fold(0.0, |acc, r| acc + r.1);
        cum += bucket_time;
        let bpct = bucket_time / total_time * 100.0;
        let cpct = cum / total_time * 100.0;
        println!(" {:<14} {:>8} {:>12} {:>8.1}% {:>11.1}%",
            label, taken, fmt_us(bucket_time), bpct, cpct);
    }
}
/// Prints the longest GPU idle gaps (count taken from `args`, default 10),
/// each annotated with the kernel that ended just before it and the kernel
/// that started just after it. Requires the `nsys` layer.
///
/// Panics if two gap lengths cannot be ordered.
pub fn cmd_gaps(db: &GpuDb, args: &[&str]) {
    if !db.has_layer("nsys") {
        println!("no timeline data — need nsys layer");
        return;
    }
    let n = parse_count(args);
    let mut gaps = compute_gpu_gaps(db);
    if gaps.is_empty() {
        println!("no GPU idle gaps detected");
        return;
    }
    // Totals cover every gap, not just the ones displayed.
    let total_gap: f64 = gaps.iter().map(|g| g.1).sum();
    let total_count = gaps.len();
    gaps.sort_by(|lhs, rhs| rhs.1.partial_cmp(&lhs.1).unwrap());
    gaps.truncate(n);
    let shown = gaps.len();

    println!(" {} GPU idle gaps (total idle: {})", total_count, fmt_us(total_gap));
    if shown < total_count {
        println!(" Showing top {shown} by duration:\n");
    } else {
        println!();
    }
    println!(" # Start Duration Before → After");
    println!(" ── ──────────── ──────────── ────────────────────────────────");
    for (i, &(start, dur)) in gaps.iter().enumerate() {
        let describe = |kernel: Option<String>| match kernel {
            Some(name) => trunc(&name, 22),
            None => "—".into(),
        };
        let before = describe(kernel_ending_at_or_before(db, start));
        let after = describe(kernel_starting_at_or_after(db, start + dur));
        let edge = format!("{} → {}", before, after);
        println!(" {:<2} {:>12} {:>12} {}", i + 1, fmt_us(start), fmt_us(dur), edge);
    }
}
/// Name of the launch (within the timeline filter) with the latest end time
/// that is no later than `t + 0.5`, or `None` if there is no such launch or
/// the query fails.
fn kernel_ending_at_or_before(db: &GpuDb, t: f64) -> Option<String> {
    let tl = db.timeline_filter();
    let sql = format!(
        "SELECT kernel_name FROM launches
WHERE start_us IS NOT NULL AND (start_us + duration_us) <= ?1 + 0.5 AND {tl}
ORDER BY (start_us + duration_us) DESC LIMIT 1"
    );
    let lookup = db.conn.query_row(&sql, [t], |row| row.get::<_, String>(0));
    lookup.ok()
}
/// Name of the launch (within the timeline filter) with the earliest start
/// time that is no earlier than `t - 0.5`, or `None` if there is no such
/// launch or the query fails.
fn kernel_starting_at_or_after(db: &GpuDb, t: f64) -> Option<String> {
    let tl = db.timeline_filter();
    let sql = format!(
        "SELECT kernel_name FROM launches
WHERE start_us IS NOT NULL AND start_us >= ?1 - 0.5 AND {tl}
ORDER BY start_us ASC LIMIT 1"
    );
    let lookup = db.conn.query_row(&sql, [t], |row| row.get::<_, String>(0));
    lookup.ok()
}
/// Prints how much transfer time ran concurrently with kernel execution,
/// overall and per transfer kind, plus kernel time as a share of wall time.
/// Requires the `nsys` layer.
///
/// Panics if the transfer-total query fails.
pub fn cmd_overlap(db: &GpuDb) {
    if !db.has_layer("nsys") {
        println!("no timeline data — need nsys layer");
        return;
    }
    let gpu_us = db.total_gpu_time_us();
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    let xfer_time: f64 = db
        .conn
        .query_row("SELECT COALESCE(SUM(duration_us),0) FROM transfers", [], |row| row.get(0))
        .unwrap();
    let overlap_us = compute_xfer_kernel_overlap(db);

    println!(" Compute/Transfer Overlap:");
    println!(" GPU kernel time: {}", fmt_us(gpu_us));
    println!(" Transfer time: {}", fmt_us(xfer_time));
    // Nothing is printed about concurrency when there were no transfers.
    if xfer_time > 0.0 {
        if overlap_us > 0.0 {
            println!(" Concurrent: {} ({:.1}% of transfers overlapped with compute)",
                fmt_us(overlap_us), overlap_us / xfer_time * 100.0);
        } else {
            println!(" Concurrent: none (transfers and compute are serialized)");
        }
    }
    if wall_us > 0.0 {
        println!(" GPU utilization: {:.1}%", gpu_us / wall_us * 100.0);
    }

    let kinds: Vec<String> = db.query_vec(
        "SELECT DISTINCT kind FROM transfers WHERE start_us IS NOT NULL",
        [],
        |row| row.get(0),
    );
    if kinds.is_empty() {
        return;
    }
    println!("\n By direction:");
    println!(" Kind Transfer Overlap %");
    println!(" ────── ────────── ────────── ──────");
    for kind in &kinds {
        let (dir_time, dir_overlap) = compute_xfer_kernel_overlap_for_kind(db, kind);
        if dir_time > 0.0 {
            let pct = dir_overlap / dir_time * 100.0;
            println!(" {:<6} {:>10} {:>10} {:>5.1}%",
                kind, fmt_us(dir_time), fmt_us(dir_overlap), pct);
        }
    }
}
/// For transfers of the given `kind`, returns `(total_transfer_time,
/// time_overlapping_kernels)`.
///
/// Kernel intervals (within the timeline filter, ordered by start) are merged
/// first so that concurrent kernels are not counted twice; each transfer is
/// then intersected with every merged kernel interval.
fn compute_xfer_kernel_overlap_for_kind(db: &GpuDb, kind: &str) -> (f64, f64) {
    let tl = db.timeline_filter();
    let k_sql = format!(
        "SELECT start_us, start_us + duration_us FROM launches
WHERE start_us IS NOT NULL AND {tl} ORDER BY start_us"
    );
    let kernel_spans: Vec<(f64, f64)> =
        db.query_vec(&k_sql, [], |row| Ok((row.get(0)?, row.get(1)?)));
    let busy = merge_intervals(&kernel_spans);
    let transfers: Vec<(f64, f64)> = db.query_vec(
        "SELECT start_us, start_us + duration_us FROM transfers
WHERE start_us IS NOT NULL AND kind = ?1 ORDER BY start_us",
        [kind],
        |row| Ok((row.get(0)?, row.get(1)?)),
    );

    let total_time: f64 = transfers.iter().map(|&(start, end)| end - start).sum();
    let overlap = transfers
        .iter()
        .flat_map(|&(ts, te)| busy.iter().map(move |&(ks, ke)| (ts.max(ks), te.min(ke))))
        .filter(|(lo, hi)| lo < hi)
        .fold(0.0, |acc, (lo, hi)| acc + (hi - lo));
    (total_time, overlap)
}
/// Total time during which any transfer overlapped kernel execution.
///
/// Kernel intervals (within the timeline filter, ordered by start) are merged
/// first; every transfer with a start time is then intersected with every
/// merged kernel interval and the intersection lengths are accumulated.
pub(crate) fn compute_xfer_kernel_overlap(db: &GpuDb) -> f64 {
    let tl = db.timeline_filter();
    let k_sql = format!(
        "SELECT start_us, start_us + duration_us AS end_us
FROM launches WHERE start_us IS NOT NULL AND {tl}
ORDER BY start_us"
    );
    let kernel_spans: Vec<(f64, f64)> =
        db.query_vec(&k_sql, [], |row| Ok((row.get(0)?, row.get(1)?)));
    let busy = merge_intervals(&kernel_spans);
    let transfers: Vec<(f64, f64)> = db.query_vec(
        "SELECT start_us, start_us + duration_us FROM transfers WHERE start_us IS NOT NULL ORDER BY start_us",
        [],
        |row| Ok((row.get(0)?, row.get(1)?)),
    );

    transfers
        .iter()
        .flat_map(|&(ts, te)| busy.iter().map(move |&(ks, ke)| (ts.max(ks), te.min(ke))))
        .filter(|(lo, hi)| lo < hi)
        .fold(0.0, |acc, (lo, hi)| acc + (hi - lo))
}
/// Coalesces `(start, end)` intervals that overlap or touch.
///
/// The input must already be sorted by start. An interval whose start is at
/// or before the end of the current run is folded into it (extending the end
/// if needed); otherwise it begins a new run.
fn merge_intervals(intervals: &[(f64, f64)]) -> Vec<(f64, f64)> {
    let mut merged: Vec<(f64, f64)> = Vec::new();
    for &(start, end) in intervals {
        match merged.last_mut() {
            Some(run) if start <= run.1 => {
                if end > run.1 {
                    run.1 = end;
                }
            }
            _ => merged.push((start, end)),
        }
    }
    merged
}
/// Prints, per stream, the number of kernel launches and their summed
/// duration (within the timeline filter), busiest stream first.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_streams(db: &GpuDb) {
    let tl = db.timeline_filter();
    let sql = format!("SELECT stream_id, COUNT(*) as cnt, SUM(duration_us) as total
FROM launches WHERE stream_id IS NOT NULL AND {tl}
GROUP BY stream_id ORDER BY total DESC");
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let per_stream: Vec<(u32, i64, f64)> = stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if per_stream.is_empty() {
        println!("no stream data");
        return;
    }
    println!(" Stream Kernels Active Time");
    println!(" ─────── ──────── ────────────");
    for (sid, cnt, total) in per_stream {
        println!(" {:>6} {:>7} {:>11}", sid, cnt, fmt_us(total));
    }
}
/// Prints the first launches in start-time order (count taken from `args`,
/// default 10) with start, duration, stream and kernel name.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_timeline(db: &GpuDb, args: &[&str]) {
    let n = parse_count(args);
    let tl = db.timeline_filter();
    let sql = format!("SELECT kernel_name, start_us, duration_us, stream_id
FROM launches WHERE start_us IS NOT NULL AND {tl}
ORDER BY start_us LIMIT ?1");
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let launches: Vec<(String, f64, f64, Option<u32>)> = stmt
        .query_map([n as i64], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if launches.is_empty() {
        println!("no timeline data");
        return;
    }
    println!(" # Start Duration Stream Kernel");
    println!(" ── ──────────── ────────── ──────── ────────────────────────────────");
    for (i, (name, start, dur, sid)) in launches.iter().enumerate() {
        let stream = match sid {
            Some(s) => s.to_string(),
            None => "?".into(),
        };
        println!(" {:<2} {:>12} {:>10} {:>7} {}",
            i + 1, fmt_us(*start), fmt_us(*dur), stream, trunc(name, 40));
    }
}
/// For every op whose name contains the given pattern, prints its module,
/// input shapes, CPU time and the kernels mapped to it in `op_kernel_map`.
/// Requires a `torch` or `proton` layer.
///
/// Panics if a statement cannot be prepared or executed.
pub fn cmd_trace(db: &GpuDb, args: &[&str]) {
    let pattern = match args.first() {
        Some(p) => *p,
        None => {
            println!("usage: trace <op_pattern>");
            return;
        }
    };
    if !require_op_layer(db) {
        return;
    }
    let sql = r"SELECT id, name, module_path, cpu_time_us, input_shapes
FROM ops WHERE name LIKE ?1 ESCAPE '\'";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let ops: Vec<(i64, String, Option<String>, f64, Option<String>)> = stmt
        .query_map([like_param(pattern)], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if ops.is_empty() {
        println!("no op matching '{pattern}'");
        return;
    }

    // The kernel lookup is the same for every op, so prepare it once.
    let k_sql = "SELECT kernel_name FROM op_kernel_map WHERE op_id = ?1";
    let mut k_stmt = db.conn.prepare(k_sql).unwrap();
    for (op_id, name, module, cpu_time, shapes) in &ops {
        println!("Op: {name}");
        if let Some(m) = module {
            println!(" Module: {m}");
        }
        if let Some(s) = shapes {
            println!(" Shapes: {s}");
        }
        println!(" CPU: {}", fmt_us(*cpu_time));
        let kernels: Vec<String> = k_stmt
            .query_map([op_id], |row| row.get(0))
            .unwrap()
            .filter_map(|r| r.ok())
            .collect();
        if !kernels.is_empty() {
            println!(" Kernels: {}", kernels.join(", "));
        }
    }
}
/// Lists the distinct ops (name and module path) mapped to any kernel whose
/// name contains the given pattern. Requires a `torch` or `proton` layer.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_callers(db: &GpuDb, args: &[&str]) {
    let pattern = match args.first() {
        Some(p) => *p,
        None => {
            println!("usage: callers <kernel_pattern>");
            return;
        }
    };
    if !require_op_layer(db) {
        return;
    }
    let sql = r"SELECT DISTINCT o.name, o.module_path
FROM op_kernel_map okm JOIN ops o ON o.id = okm.op_id
WHERE okm.kernel_name LIKE ?1 ESCAPE '\'";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let callers: Vec<(String, Option<String>)> = stmt
        .query_map([like_param(pattern)], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if callers.is_empty() {
        println!("no op mapping for kernels matching '{pattern}'");
        return;
    }
    for (name, module) in callers {
        let module_label = module.as_deref().unwrap_or("?");
        println!(" {} ({})", name, module_label);
    }
}
/// Lists the loaded layers (id, source, file, collection timestamp cut to 18
/// characters, collection seconds) followed by how many unique kernels have
/// metrics and op mappings.
///
/// Panics if the statement cannot be prepared or executed.
pub fn cmd_layers(db: &GpuDb) {
    let sql = "SELECT id, source, file, collected_at, collection_secs FROM layers ORDER BY id";
    let mut stmt = db.conn.prepare(sql).unwrap();
    let rows: Vec<_> = stmt.query_map([], |row| {
        Ok((row.get::<_,i64>(0)?, row.get::<_,String>(1)?,
            row.get::<_,String>(2)?, row.get::<_,String>(3)?,
            row.get::<_,Option<f64>>(4)?))
    }).unwrap().filter_map(|r| r.ok()).collect();
    if rows.is_empty() { println!("no layers loaded"); return; }
    println!(" # Source File Collected Secs");
    println!(" ── ─────── ──────────────────────────────────────── ────────────────── ─────");
    for (id, source, file, at, secs) in &rows {
        // Cut by characters, not bytes: slicing `at[..18]` would panic if
        // byte 18 fell inside a multi-byte character.
        let collected: String = at.chars().take(18).collect();
        println!(" {:<2} {:<7} {:<43} {:<18} {}",
            id, source, trunc(file, 43), collected,
            secs.map(|s| format!("{s:.1}")).unwrap_or_else(|| "?".into()));
    }
    let uk = db.unique_kernel_count();
    let wm = db.kernels_with_metrics();
    let wo = db.kernels_with_ops();
    println!("\n Correlation: {uk} unique kernels");
    println!(" With metrics: {wm}/{uk}");
    println!(" With op map: {wo}/{uk}");
}
/// `suggest`: prints a numbered list of collection failures followed by
/// follow-up suggestions derived from which layers (nsys / ncu / torch) are
/// loaded and from launch/transfer statistics in the database.
pub fn cmd_suggest(db: &GpuDb) {
let uk = db.unique_kernel_count();
let failures = db.failures();
let has_nsys = db.has_layer("nsys");
let has_ncu = db.has_layer("ncu");
let has_torch = db.has_layer("torch");
let target = db.meta("target");
// Nothing was collected and nothing failed: there is nothing to report.
if uk == 0 && failures.is_empty() {
println!("no profile data");
return;
}
// `n` numbers every printed item; failures and suggestions share one sequence.
let mut n = 1;
if !failures.is_empty() {
println!(" Collection failures:\n");
for (phase, error) in &failures {
println!(" {n}. {phase} failed: {error}");
n += 1;
}
println!();
}
if uk == 0 { println!(" No kernel data collected."); return; }
println!(" Suggestions:\n");
if !has_nsys {
println!(" {n}. No timeline data. Run gdbg with your target.");
println!(" This gives: kernel timeline, memory transfers, GPU idle gaps\n");
n += 1;
}
// Without ncu metrics, propose an ncu command restricted to the five
// kernels with the largest summed duration.
if !has_ncu {
let tl = db.timeline_filter();
let top_sql = format!("SELECT kernel_name, SUM(duration_us) as total
FROM launches WHERE {tl} GROUP BY kernel_name ORDER BY total DESC LIMIT 5");
let mut stmt = db.conn.prepare(&top_sql).unwrap();
let top: Vec<(String, f64)> = stmt.query_map([], |row| {
Ok((row.get(0)?, row.get(1)?))
}).unwrap().filter_map(|r| r.ok()).collect();
if !top.is_empty() {
let gpu_total = db.total_gpu_time_us();
// Combined share of total GPU time covered by the listed kernels.
let pct: f64 = top.iter().map(|t| if gpu_total > 0.0 { t.1 / gpu_total * 100.0 } else { 0.0 }).sum();
// Kernel names are regex-escaped and joined into one alternation.
let regex = top.iter().map(|t| escape_regex(&t.0)).collect::<Vec<_>>().join("|");
println!(" {n}. Top {} kernels ({pct:.0}% of GPU) lack hardware metrics.", top.len());
println!(" Collect: ncu --set full --kernel-name \"regex:{regex}\" {target}\n");
n += 1;
}
}
// The op-mapping hint is only shown when the target path ends in ".py".
if !has_torch && target.ends_with(".py") {
println!(" {n}. No op->kernel mapping. Can't trace kernels back to Python.");
println!(" Needed for: ops, callers, trace commands\n");
n += 1;
}
// Variance check: per kernel with more than 5 launches, variance is computed
// in SQL as AVG(x*x) - AVG(x)^2; only the 5 kernels with the largest total
// duration are examined.
let tl2 = db.timeline_filter();
let var_sql = format!("SELECT kernel_name, COUNT(*) as cnt, AVG(duration_us) as avg,
AVG(duration_us * duration_us) - AVG(duration_us) * AVG(duration_us) as var
FROM launches WHERE {tl2} GROUP BY kernel_name
HAVING cnt > 5 AND var > 0
ORDER BY SUM(duration_us) DESC LIMIT 5");
let mut stmt = db.conn.prepare(&var_sql).unwrap();
// Column 1 (cnt) is only used by the HAVING clause and is not read here.
let vars: Vec<_> = stmt.query_map([], |row| {
Ok((row.get::<_,String>(0)?, row.get::<_,f64>(2)?, row.get::<_,f64>(3)?))
}).unwrap().filter_map(|r| r.ok()).collect();
for (name, avg, var) in &vars {
let stddev = var.max(0.0).sqrt();
// Coefficient of variation (stddev / mean); reported when above 0.3.
let cv = if *avg > 0.0 { stddev / avg } else { 0.0 };
if cv > 0.3 {
println!(" {n}. '{}' has high variance (CV={cv:.2}).", name);
println!(" May indicate: data-dependent paths, cache effects, or varying input sizes.\n");
n += 1;
}
}
// The remaining checks are gated on the nsys layer being present.
if has_nsys {
let gpu_us = db.total_gpu_time_us();
// NOTE(review): unlike the launch queries, this sum over `transfers`
// does not apply the timeline filter; confirm that is intended.
let xfer_us: f64 = db.scalar_f64("SELECT COALESCE(SUM(duration_us),0) FROM transfers");
if gpu_us > 0.0 && xfer_us > 0.0 {
let ratio = xfer_us / gpu_us;
// Flag when transfer time exceeds 5x the GPU kernel time.
if ratio > 5.0 {
println!(" {n}. Transfer:compute ratio is {ratio:.1}:1 — PCIe dominates.");
println!(" Try: cudaMallocHost (pinned memory), overlap via CUDA streams, or increase batch size.\n");
n += 1;
}
}
let tl = db.timeline_filter();
// Count distinct kernel names whose average duration is below 10us.
let tiny_sql = format!(
"SELECT COUNT(*) FROM (
SELECT kernel_name FROM launches WHERE {tl}
GROUP BY kernel_name HAVING AVG(duration_us) < 10.0
)"
);
let tiny_count: i64 = db.scalar_f64(&tiny_sql) as i64;
if tiny_count > 10 {
println!(" {n}. {tiny_count} distinct kernels average under 10us — launch overhead likely dominates.");
println!(" Try: torch.compile(), CUDA graphs, or manual kernel fusion. See 'small' and 'fuse'.\n");
n += 1;
}
// Single kernel with the largest summed duration; flagged when it
// accounts for more than half of total GPU time.
let dom_sql = format!(
"SELECT kernel_name, SUM(duration_us) as t FROM launches WHERE {tl}
GROUP BY kernel_name ORDER BY t DESC LIMIT 1"
);
if let Ok((dom_name, dom_time)) = db.conn.query_row(
&dom_sql, [], |row| Ok((row.get::<_, String>(0)?, row.get::<_, f64>(1)?))
) {
if gpu_us > 0.0 && dom_time / gpu_us > 0.5 {
let pct = dom_time / gpu_us * 100.0;
println!(" {n}. One kernel accounts for {pct:.0}% of GPU time: {}", trunc(&dom_name, 60));
println!(" Try: bound '{}' — optimize the hotspot directly.\n", trunc(&dom_name, 40));
n += 1;
}
}
}
// torch is only required for ".py" targets.
if has_nsys && has_ncu && (has_torch || !target.ends_with(".py")) {
println!(" All layers loaded — full analysis available.");
}
// Final read of `n`; presumably here to silence the unused-assignment
// warning from the last `n += 1`.
let _ = n;
}
/// `save <name>`: persists the current session under `name` and reports the
/// resulting path, or the error if saving failed.
pub fn cmd_save(db: &GpuDb, args: &[&str]) {
    if let Some(&session) = args.first() {
        match db.save(session) {
            Err(e) => println!("save failed: {e}"),
            Ok(path) => println!("saved to {}", path.display()),
        }
    } else {
        println!("usage: save <name>");
    }
}
/// `list`: prints a table of saved sessions (name, device, kernel count,
/// layers, creation time), or a message when there are none or listing fails.
pub fn cmd_list() {
    let sessions = match GpuDb::list_saved() {
        Ok(sessions) => sessions,
        Err(e) => {
            println!("list failed: {e}");
            return;
        }
    };
    if sessions.is_empty() {
        println!("no saved sessions");
        return;
    }

    println!(" Name Device Kernels Layers Created");
    println!(" ─────────────────────── ─────────────── ──────── ──────────────── ────────────────");
    for s in &sessions {
        let dev = if s.device.is_empty() { "?" } else { &s.device };
        // Take the first 16 *characters*: slicing by byte index would panic
        // if byte 16 fell inside a multi-byte character.
        let created: String = s.created.chars().take(16).collect();
        println!(
            " {:<23} {:<15} {:>7} {:<16} {}",
            trunc(&s.name, 23),
            trunc(dev, 15),
            s.kernel_count,
            s.layers.join("+"),
            created
        );
    }
}
/// `diff <saved_session>`: attaches another session database as `other` and
/// prints the 15 kernels whose total duration differs most between it
/// ("Before") and the current session ("After").
///
/// `name` is used as a path when it ends in `.gpu.db` or contains `/`;
/// otherwise it is resolved inside `GpuDb::session_dir()`.
pub fn cmd_diff(db: &GpuDb, args: &[&str]) {
    let name = match args.first() {
        Some(n) => *n,
        None => {
            println!("usage: diff <saved_session>");
            return;
        }
    };
    let other_path = if name.ends_with(".gpu.db") || name.contains('/') {
        PathBuf::from(name)
    } else {
        GpuDb::session_dir().join(format!("{name}.gpu.db"))
    };
    if !other_path.exists() {
        println!("cannot load '{name}': no such session at {}", other_path.display());
        return;
    }
    // Refuse a non-UTF-8 path explicitly instead of attaching an empty path.
    let path_str = match other_path.to_str() {
        Some(p) => p,
        None => {
            println!(
                "cannot load '{name}': path {} is not valid UTF-8",
                other_path.display()
            );
            return;
        }
    };
    if let Err(e) = db.attach(path_str, "other") {
        println!("cannot load '{name}': {e}");
        return;
    }

    let sql = "SELECT\n\
               COALESCE(c.kernel_name, o.kernel_name) as name,\n\
               COALESCE(o.total, 0) as before,\n\
               COALESCE(c.total, 0) as after\n\
               FROM\n\
               (SELECT kernel_name, SUM(duration_us) as total FROM launches GROUP BY kernel_name) c\n\
               FULL OUTER JOIN\n\
               (SELECT kernel_name, SUM(duration_us) as total FROM other.launches GROUP BY kernel_name) o\n\
               ON c.kernel_name = o.kernel_name\n\
               ORDER BY ABS(COALESCE(c.total,0) - COALESCE(o.total,0)) DESC\n\
               LIMIT 15";
    // Run the query inside a closure so the statement is dropped before the
    // detach below, and so a prepare/query failure is reported instead of
    // panicking with `other` still attached.
    let queried = db.conn.prepare(sql).and_then(|mut stmt| {
        let mapped = stmt.query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, f64>(1)?,
                row.get::<_, f64>(2)?,
            ))
        })?;
        let rows: Vec<(String, f64, f64)> = mapped.filter_map(|r| r.ok()).collect();
        Ok(rows)
    });
    // Always detach, whether or not the query succeeded.
    let _ = db.detach("other");
    let rows = match queried {
        Ok(rows) => rows,
        Err(e) => {
            println!("diff failed: {e}");
            return;
        }
    };

    println!(" Diff: current vs {name}\n");
    println!(" Kernel Before After Delta");
    println!(" ────────────────────────────────── ────────── ────────── ──────────");
    for (kname, before, after) in &rows {
        // A kernel with no time in the saved session is reported as "new".
        let delta = if *before > 0.0 {
            let pct = (after - before) / before * 100.0;
            format!("{}{pct:.1}%", if pct >= 0.0 { "+" } else { "" })
        } else {
            "new".into()
        };
        println!(
            " {:<34} {:>10} {:>10} {:>10}",
            trunc(kname, 34),
            fmt_us(*before),
            fmt_us(*after),
            delta
        );
    }
}
/// `focus <pattern>`: stores the pattern in `db.focus` and confirms it.
pub fn cmd_focus(db: &mut GpuDb, args: &[&str]) {
    if let Some(pat) = args.first() {
        db.focus = Some(pat.to_string());
        println!("focus set to '{}'", pat);
    } else {
        println!("usage: focus <pattern>");
    }
}
/// `ignore <pattern>`: stores the pattern in `db.ignore` and confirms it.
pub fn cmd_ignore(db: &mut GpuDb, args: &[&str]) {
    if let Some(pat) = args.first() {
        db.ignore = Some(pat.to_string());
        println!("ignoring '{}'", pat);
    } else {
        println!("usage: ignore <pattern>");
    }
}
/// `region [pattern]`: with an argument, stores it in `db.region_filter`;
/// without one, lists the recorded regions and their durations ordered by
/// start time.
pub fn cmd_region(db: &mut GpuDb, args: &[&str]) {
    if let Some(pat) = args.first() {
        db.region_filter = Some(pat.to_string());
        println!("region filter set to '{}'", pat);
        return;
    }

    let mut stmt = db
        .conn
        .prepare("SELECT name, duration_us FROM regions ORDER BY start_us")
        .unwrap();
    let regions: Vec<(String, f64)> = stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    if regions.is_empty() {
        println!("no NVTX regions");
        return;
    }
    for (label, dur) in regions.iter() {
        println!(" {} ({})", label, fmt_us(*dur));
    }
}
/// `reset`: clears the focus, ignore and region filters.
pub fn cmd_reset(db: &mut GpuDb) {
    db.region_filter = None;
    db.ignore = None;
    db.focus = None;
    println!("all filters cleared");
}
/// `variance <kernel_pattern>`: for every kernel whose name contains the
/// pattern, prints launch count, mean, standard deviation (with coefficient
/// of variation), minimum and maximum duration.
pub fn cmd_variance(db: &GpuDb, args: &[&str]) {
    if args.is_empty() {
        println!("usage: variance <kernel_pattern>");
        return;
    }
    let pattern = args[0];
    let tl = db.timeline_filter();
    // Variance is computed in SQL as AVG(x*x) - AVG(x)^2.
    let sql = format!(
        "SELECT kernel_name, COUNT(*), AVG(duration_us),\n\
         MIN(duration_us), MAX(duration_us),\n\
         AVG(duration_us * duration_us) - AVG(duration_us) * AVG(duration_us)\n\
         FROM launches WHERE kernel_name LIKE ?1 ESCAPE '\\' AND {tl}\n\
         GROUP BY kernel_name"
    );
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let stats: Vec<(String, i64, f64, f64, f64, f64)> = stmt
        .query_map([like_param(pattern)], |row| {
            Ok((
                row.get(0)?,
                row.get(1)?,
                row.get(2)?,
                row.get(3)?,
                row.get(4)?,
                row.get(5)?,
            ))
        })
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    if stats.is_empty() {
        println!("no kernel matching '{}'", pattern);
        return;
    }

    for (kernel, launches, mean, lo, hi, variance) in stats.iter() {
        if *launches < 2 {
            println!("{}: only {} launch", kernel, launches);
            continue;
        }
        // Clamp before sqrt so a slightly negative variance cannot yield NaN.
        let stddev = variance.max(0.0).sqrt();
        let cv = if *mean > 0.0 { stddev / *mean } else { 0.0 };
        println!("{}:", kernel);
        println!(" Launches: {}", launches);
        println!(" Mean: {}", fmt_us(*mean));
        println!(" Stddev: {} (CV={:.3})", fmt_us(stddev), cv);
        println!(" Min: {}", fmt_us(*lo));
        println!(" Max: {}", fmt_us(*hi));
    }
}
/// Returns how many leading launches look like warmup.
///
/// The steady-state duration is taken as the median of the second half of
/// `durations`. The result is the index of the first launch whose duration is
/// within 20% above that median, i.e. the number of slower launches before it.
///
/// Returns 0 when there are fewer than 5 samples, when the second half holds
/// no usable (non-NaN) value, or when the steady-state median is not positive.
/// NaN samples are ignored for the median so they cannot cause a panic while
/// sorting; a NaN in the leading run never counts as "stabilized".
pub(crate) fn detect_warmup_count(durations: &[f64]) -> usize {
    if durations.len() < 5 {
        return 0;
    }
    let half = durations.len() / 2;
    let mut tail: Vec<f64> = durations[half..]
        .iter()
        .copied()
        .filter(|d| !d.is_nan())
        .collect();
    if tail.is_empty() {
        return 0;
    }
    // NaN has been filtered out, so partial_cmp is a total order here.
    tail.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let steady_median = tail[tail.len() / 2];
    if steady_median <= 0.0 {
        return 0;
    }
    let threshold = steady_median * 1.2;
    durations
        .iter()
        .position(|&d| d <= threshold)
        .unwrap_or(0)
}
/// `warmup`: for each kernel with at least 5 timed launches (largest total
/// duration first), inspects up to its first 200 launches and, when
/// `detect_warmup_count` reports a warmup phase, prints the per-launch
/// durations, warmup total and steady-state average.
pub fn cmd_warmup(db: &GpuDb) {
    let tl = db.timeline_filter();
    let kernel_sql = format!(
        "SELECT kernel_name, COUNT(*) as cnt\n\
         FROM launches WHERE start_us IS NOT NULL AND {tl}\n\
         GROUP BY kernel_name HAVING cnt >= 5\n\
         ORDER BY SUM(duration_us) DESC"
    );
    let mut k_stmt = db.conn.prepare(&kernel_sql).unwrap();
    let kernels: Vec<(String, i64)> = k_stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if kernels.is_empty() {
        println!("not enough launches to detect warmup (need ≥5 of the same kernel)");
        return;
    }
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);

    // The per-kernel query text is the same for every kernel (only the bound
    // name changes), so it is built and prepared once instead of per iteration.
    let launch_sql = format!(
        "SELECT start_us, duration_us\n\
         FROM launches WHERE kernel_name = ?1 AND start_us IS NOT NULL AND {tl}\n\
         ORDER BY start_us LIMIT 200"
    );
    let mut launch_stmt = db.conn.prepare(&launch_sql).unwrap();

    let mut found_warmup = false;
    for (kernel_name, _cnt) in &kernels {
        let launches: Vec<(f64, f64)> = launch_stmt
            .query_map([kernel_name], |row| Ok((row.get(0)?, row.get(1)?)))
            .unwrap()
            .filter_map(|r| r.ok())
            .collect();
        if launches.len() < 5 {
            continue;
        }
        let durs: Vec<f64> = launches.iter().map(|r| r.1).collect();
        // Median of the second half of the launches, shown in the header.
        let steady_median = {
            let half = durs.len() / 2;
            let mut tail = durs[half..].to_vec();
            tail.sort_by(|a, b| a.partial_cmp(b).unwrap());
            tail[tail.len() / 2]
        };
        let warmup_end = detect_warmup_count(&durs);
        if warmup_end == 0 {
            continue;
        }
        found_warmup = true;

        let warmup_total: f64 = launches[..warmup_end].iter().map(|r| r.1).sum();
        let steady_avg = if launches.len() > warmup_end {
            launches[warmup_end..].iter().map(|r| r.1).sum::<f64>()
                / (launches.len() - warmup_end) as f64
        } else {
            0.0
        };
        let warmup_pct = if wall_us > 0.0 { warmup_total / wall_us * 100.0 } else { 0.0 };

        println!(" Warmup: {} ({})\n", trunc(kernel_name, 50), fmt_us(steady_median));
        println!(" Launch Duration Cumulative");
        let mut cumulative = 0.0;
        // Show the warmup launches plus up to three launches after them.
        let show = (warmup_end + 3).min(launches.len());
        for (i, (_, dur)) in launches.iter().take(show).enumerate() {
            cumulative += dur;
            let marker = if i < warmup_end {
                " ← warmup"
            } else if i == warmup_end {
                " ← stabilized"
            } else {
                ""
            };
            println!(" {:<6} {:>10} {:>10}{marker}", i + 1, fmt_us(*dur), fmt_us(cumulative));
        }
        println!(
            "\n Warmup: {} launches ({}, {warmup_pct:.1}% of wall time)",
            warmup_end,
            fmt_us(warmup_total)
        );
        println!(" Steady state: {} avg/launch (excluding warmup)", fmt_us(steady_avg));
        // Extra time the warmup launches cost beyond the steady-state rate.
        let excess = warmup_total - steady_avg * warmup_end as f64;
        if excess > 0.0 {
            let wall_msg = if wall_us > 0.0 {
                format!(" out of {}", fmt_us(wall_us))
            } else {
                String::new()
            };
            println!(
                " Cold-start cost: first {} launch(es) cost {} extra{wall_msg} — dedicate a warmup pass to amortize",
                warmup_end,
                fmt_us(excess)
            );
        }
        println!();
    }
    if !found_warmup {
        println!("no warmup detected (all kernels stable from first launch)");
    }
}
/// `small [count]`: lists the most frequently launched kernels whose average
/// duration is below 10us and compares an estimated launch overhead
/// (5us per launch) against their total compute time.
pub fn cmd_small(db: &GpuDb, args: &[&str]) {
    let limit = parse_count(args);
    let cutoff_us = 10.0;
    let tl = db.timeline_filter();
    let kf = db.kernel_filter();
    let sql = format!(
        "SELECT kernel_name, COUNT(*) as cnt, AVG(duration_us) as avg,\n\
         SUM(duration_us) as total\n\
         FROM launches\n\
         WHERE {kf} AND {tl} GROUP BY kernel_name\n\
         HAVING avg < ?1\n\
         ORDER BY cnt DESC LIMIT ?2"
    );
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let small: Vec<(String, i64, f64, f64)> = stmt
        .query_map(rusqlite::params![cutoff_us, limit as i64], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
        })
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    if small.is_empty() {
        println!("no kernels averaging under {:.0}us", cutoff_us);
        return;
    }

    let launch_sum: i64 = small.iter().map(|k| k.1).sum();
    let compute_sum: f64 = small.iter().map(|k| k.3).sum();
    // Fixed per-launch overhead assumption of 5us.
    let overhead_est = launch_sum as f64 * 5.0;

    println!(" Small Kernels (avg < {:.0}us, launch overhead may dominate):\n", cutoff_us);
    println!(" # Kernel Avg Launches Total");
    println!(" ── ────────────────────────────────── ───────── ──────── ─────────");
    let mut rank = 0usize;
    for (kernel, launches, mean, total) in small.iter() {
        rank += 1;
        println!(
            " {:<2} {:<34} {:>9} {:>8} {:>9}",
            rank,
            trunc(kernel, 34),
            fmt_us(*mean),
            launches,
            fmt_us(*total)
        );
    }
    println!("\n {} kernels, {} total launches", small.len(), launch_sum);
    println!(" Estimated launch overhead: {} (at ~5us/launch)", fmt_us(overhead_est));
    println!(" Actual compute time: {}", fmt_us(compute_sum));
    if overhead_est > compute_sum {
        println!(" Launch overhead EXCEEDS compute — consider kernel fusion or torch.compile()");
    }
}
/// `fuse [count]`: finds timeline-adjacent launches on the same stream that
/// are separated by a gap of 0 to 5us, groups them by (kernel A, kernel B)
/// pair, and prints the most frequent pairs. Finishes by calling
/// `detect_kernel_sequences` with the same count.
///
/// Only the 500 smallest-gap adjacent launches are considered (SQL `LIMIT`).
pub fn cmd_fuse(db: &GpuDb, args: &[&str]) {
    let n = parse_count(args);
    if !db.has_layer("nsys") && !db.has_layer("torch") {
        println!("no timeline data — need nsys or torch layer");
        return;
    }
    let tl = db.timeline_filter();
    // `rn` numbers launches in global start order; a row is paired with the
    // next one (rn + 1) and kept only when both are on the same stream.
    let sql = format!(
        "WITH ordered AS (\n\
         SELECT kernel_name, start_us, duration_us, stream_id,\n\
         ROW_NUMBER() OVER (ORDER BY start_us) as rn\n\
         FROM launches WHERE start_us IS NOT NULL AND {tl})\n\
         SELECT a.kernel_name, b.kernel_name,\n\
         b.start_us - (a.start_us + a.duration_us) AS gap_us,\n\
         a.duration_us + b.duration_us AS combined_us\n\
         FROM ordered a\n\
         JOIN ordered b ON b.rn = a.rn + 1\n\
         WHERE gap_us >= 0 AND gap_us < 5.0\n\
         AND a.stream_id IS b.stream_id\n\
         ORDER BY gap_us ASC\n\
         LIMIT 500"
    );
    let mut stmt = db.conn.prepare(&sql).unwrap();
    let rows: Vec<_> = stmt
        .query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, f64>(2)?,
                row.get::<_, f64>(3)?,
            ))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if rows.is_empty() {
        println!("no fusion candidates found (no sequential same-stream kernels with < 5us gap)");
        return;
    }

    // (kernel A, kernel B) -> (summed gap, summed combined duration, occurrences)
    let mut pairs: std::collections::HashMap<(String, String), (f64, f64, usize)> =
        std::collections::HashMap::new();
    for (a, b, gap, combined) in &rows {
        let entry = pairs.entry((a.clone(), b.clone())).or_insert((0.0, 0.0, 0));
        entry.0 += gap;
        entry.1 += combined;
        entry.2 += 1;
    }
    let mut sorted: Vec<_> = pairs.into_iter().collect();
    // Most frequent pair first. HashMap iteration order is arbitrary, so ties
    // are broken by kernel names to keep the output (and which pairs survive
    // the truncation) stable from run to run.
    sorted.sort_by(|a, b| b.1.2.cmp(&a.1.2).then_with(|| a.0.cmp(&b.0)));
    sorted.truncate(n);

    let total_gap: f64 = rows.iter().map(|r| r.2).sum();
    println!(" Sequential Launch Candidates (same stream, < 5us gap):\n");
    println!(" # Kernel A → Kernel B Count Avg Gap Type");
    println!(" ── ──────────────────────────────────────────────── ────── ──────── ─────────");
    for (i, ((a, b), (gap_sum, _, count))) in sorted.iter().enumerate() {
        let avg_gap = gap_sum / *count as f64;
        let kind = if a == b { "batch" } else { "fuse" };
        println!(
            " {:<2} {} → {} {:>5} {:>7} {}",
            i + 1,
            trunc(a, 24),
            trunc(b, 24),
            count,
            fmt_us(avg_gap),
            kind
        );
    }
    println!("\n Total reclaimable gap: {} across {} pairs", fmt_us(total_gap), rows.len());
    println!(" 'batch' = same kernel, use CUDA graphs or larger batch sizes");
    println!(" 'fuse' = different kernels, use torch.compile() or manual fusion");
    detect_kernel_sequences(db, n);
}
/// Scans the launch timeline (ordered by start time) for short kernel
/// sequences of length 2..=5 that repeat at least 3 times back-to-back, and
/// prints up to `limit` of them ordered by the summed duration of the
/// launches they cover. Prints nothing when there are fewer than 6 launches
/// or no repeating sequence is found.
fn detect_kernel_sequences(db: &GpuDb, limit: usize) {
let tl = db.timeline_filter();
let sql = format!(
"SELECT kernel_name, duration_us FROM launches
WHERE start_us IS NOT NULL AND {tl}
ORDER BY start_us"
);
let launches: Vec<(String, f64)> = db.query_vec(&sql, [], |row| {
Ok((row.get(0)?, row.get(1)?))
});
if launches.len() < 6 { return; }
// A pattern is identified by the sequence of kernel ids it consists of.
type PatternKey = Vec<usize>;
struct Found { names: Vec<String>, reps: usize, total_us: f64 }
let mut found: std::collections::HashMap<PatternKey, Found> = std::collections::HashMap::new();
// Assign each distinct kernel name a small integer id in first-seen order so
// windows can be compared as integer slices instead of strings.
let mut id_of: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
let mut names: Vec<String> = Vec::new();
let ids: Vec<usize> = launches.iter().map(|(n, _)| {
if let Some(&i) = id_of.get(n) { i } else {
let i = names.len();
names.push(n.clone());
id_of.insert(n.clone(), i);
i
}
}).collect();
let durs: Vec<f64> = launches.iter().map(|(_, d)| *d).collect();
// Try the longest pattern length first. Each length is an independent pass
// over the whole timeline, so the same launches can contribute to patterns
// of more than one length.
for len in (2..=5).rev() {
if ids.len() < len * 2 { continue; }
let mut i = 0;
while i + 2 * len <= ids.len() {
let pat = &ids[i..i + len];
// Skip windows made of a single repeated kernel.
if pat.iter().all(|&x| x == pat[0]) { i += 1; continue; }
// Count how many times the window repeats immediately after itself.
let mut reps = 1usize;
let mut j = i + len;
while j + len <= ids.len() && ids[j..j + len] == *pat {
reps += 1;
j += len;
}
if reps >= 3 {
// Total duration of every launch covered by the repeated run.
let window_us: f64 = durs[i..j].iter().sum();
let key: Vec<usize> = pat.to_vec();
let entry = found.entry(key).or_insert(Found {
names: pat.iter().map(|&id| names[id].clone()).collect(),
reps: 0,
total_us: 0.0,
});
entry.reps += reps;
entry.total_us += window_us;
// Resume after the matched run; otherwise slide the window by one launch.
i = j; } else {
i += 1;
}
}
}
if found.is_empty() { return; }
let mut sorted: Vec<_> = found.into_iter().collect();
// Largest covered GPU time first.
sorted.sort_by(|a, b| b.1.total_us.partial_cmp(&a.1.total_us).unwrap());
sorted.truncate(limit);
println!("\n Repeating Kernel Sequences (CUDA graph candidates):\n");
println!(" # Length Reps GPU Time Sequence");
println!(" ── ─────── ─────── ──────────── ─────────────────────────────────────────");
for (i, (_, f)) in sorted.iter().enumerate() {
let seq = f.names.iter().map(|n| trunc(n, 20)).collect::<Vec<_>>().join(" → ");
println!(" {:<2} {:>7} {:>7} {:>12} {}",
i + 1, f.names.len(), f.reps, fmt_us(f.total_us), seq);
}
println!(" → capture these with torch.cuda.graph or cudaGraph APIs to remove launch overhead");
}
/// `concurrency`: reports per-stream launch counts and active time, a
/// parallelism index (summed kernel time divided by merged active time), and
/// the GPU idle total obtained from `compute_gpu_gaps`.
pub fn cmd_concurrency(db: &GpuDb) {
    let launch_total = db.total_launch_count();
    if launch_total == 0 {
        println!("no launch data");
        return;
    }
    let tl = db.timeline_filter();

    let stream_sql = format!(
        "SELECT stream_id, COUNT(*) as cnt, SUM(duration_us) as total\n\
         FROM launches WHERE stream_id IS NOT NULL AND {tl}\n\
         GROUP BY stream_id ORDER BY total DESC"
    );
    let mut stmt = db.conn.prepare(&stream_sql).unwrap();
    let per_stream: Vec<(u32, i64, f64)> = stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))
        .unwrap()
        .filter_map(Result::ok)
        .collect();

    println!(" Stream Concurrency Analysis:\n");
    let single_stream = per_stream.len() <= 1;
    if single_stream {
        let sid = match per_stream.first() {
            Some(s) => s.0.to_string(),
            None => "?".into(),
        };
        println!(" All {} launches on stream {} (single stream)\n", launch_total, sid);
    } else {
        println!(" {} streams active:\n", per_stream.len());
        println!(" Stream Launches Active Time % of Total");
        println!(" ─────── ──────── ──────────── ──────────");
        let busy_us = db.total_gpu_time_us();
        for &(sid, cnt, active) in per_stream.iter() {
            let share = if busy_us > 0.0 { active / busy_us * 100.0 } else { 0.0 };
            println!(" {:>6} {:>7} {:>11} {:>9.1}%", sid, cnt, fmt_us(active), share);
        }
        println!();
    }

    // Parallelism index: summed kernel time over the length of the union of
    // all kernel intervals.
    let gpu_total = db.total_gpu_time_us();
    let interval_sql = format!(
        "SELECT start_us, start_us + duration_us FROM launches\n\
         WHERE start_us IS NOT NULL AND {tl} ORDER BY start_us"
    );
    let k_intervals: Vec<(f64, f64)> =
        db.query_vec(&interval_sql, [], |row| Ok((row.get(0)?, row.get(1)?)));
    let merged_active: f64 = merge_intervals(&k_intervals).iter().map(|(s, e)| e - s).sum();
    if merged_active > 0.0 && gpu_total > 0.0 {
        let pindex = gpu_total / merged_active;
        let verdict = match pindex {
            p if p < 1.05 => "serial — no overlap",
            p if p < 1.5 => "light overlap",
            p if p < 2.5 => "moderate overlap",
            _ => "high overlap",
        };
        println!(" Parallelism index: {:.2}x ({})", pindex, verdict);
        println!(" (sum of per-kernel time / merged active time — 1.0 = serial, N = perfect N-way overlap)\n");
    }

    let idle_gaps = compute_gpu_gaps(db);
    let idle_total: f64 = idle_gaps.iter().map(|g| g.1).sum();
    let gap_count = idle_gaps.len() as i64;
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    if idle_total > 0.0 {
        let idle_pct = if wall_us > 0.0 { idle_total / wall_us * 100.0 } else { 0.0 };
        println!(
            " GPU idle: {} across {} gaps ({:.1}% of wall time)",
            fmt_us(idle_total),
            gap_count,
            idle_pct
        );
    }
    if single_stream && gap_count > 10 {
        println!(" Multiple streams could reduce idle time by overlapping independent kernels");
        println!(" Tip: torch.cuda.Stream() for manual overlap, or CUDA graphs for replay");
    }
}
/// `hotpath`: lists the 20 ops with the highest CPU time, labels each as
/// overhead / CPU / GPU / balanced from its CPU:GPU time ratio, and prints an
/// overall verdict for the workload.
pub fn cmd_hotpath(db: &GpuDb) {
    if !require_op_layer(db) {
        return;
    }
    let query = "SELECT name, cpu_time_us, gpu_time_us, module_path\n\
                 FROM ops\n\
                 WHERE cpu_time_us > 0\n\
                 ORDER BY cpu_time_us DESC\n\
                 LIMIT 20";
    let mut stmt = db.conn.prepare(query).unwrap();
    let ops: Vec<(String, f64, f64, Option<String>)> = stmt
        .query_map([], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
        })
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    if ops.is_empty() {
        println!("no op data");
        return;
    }

    let cpu_sum: f64 = ops.iter().map(|o| o.1).sum();
    let gpu_sum = db.total_gpu_time_us();
    println!(" Critical Path (ops by CPU time):\n");
    println!(" Op CPU Time GPU Time Bound");
    println!(" ───────────────────────────────── ────────── ────────── ─────");
    for op in ops.iter() {
        let (cpu, gpu) = (op.1, op.2);
        // Denominators are floored at 0.01 to avoid dividing by zero.
        let bound = if gpu < 0.01 {
            "overhead"
        } else if cpu / gpu.max(0.01) > 10.0 {
            "CPU"
        } else if gpu / cpu.max(0.01) > 2.0 {
            "GPU"
        } else {
            "balanced"
        };
        println!(
            " {:<34} {:>9} {:>9} {}",
            trunc(&op.0, 34),
            fmt_us(cpu),
            fmt_us(gpu),
            bound
        );
    }

    println!("\n Total CPU: {} Total GPU: {}", fmt_us(cpu_sum), fmt_us(gpu_sum));
    let ratio = cpu_sum / gpu_sum.max(0.01);
    if ratio > 10.0 {
        println!(" Workload is CPU-bound ({:.0}:1 CPU:GPU ratio)", ratio);
        println!(" Consider: larger batch size, torch.compile(), or CUDA graphs");
        return;
    }
    if ratio < 0.5 {
        println!(" Workload is GPU-bound — optimize kernel efficiency");
    } else {
        println!(" Workload is balanced between CPU and GPU");
    }
}
/// `compare-ops [count]`: prints CPU time, GPU time, their ratio and an
/// assessment for the ops with the highest CPU time, then the summed
/// GPU-to-CPU time percentage over the listed ops.
pub fn cmd_compare_ops(db: &GpuDb, args: &[&str]) {
    if !require_op_layer(db) {
        return;
    }
    let limit = parse_count(args);
    let query = "SELECT name, cpu_time_us, gpu_time_us\n\
                 FROM ops\n\
                 WHERE cpu_time_us > 0\n\
                 ORDER BY cpu_time_us DESC\n\
                 LIMIT ?1";
    let mut stmt = db.conn.prepare(query).unwrap();
    let ops: Vec<(String, f64, f64)> = stmt
        .query_map([limit as i64], |row| {
            Ok((row.get(0)?, row.get(1)?, row.get(2)?))
        })
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    if ops.is_empty() {
        println!("no op data");
        return;
    }

    // Maps a CPU:GPU time ratio onto its label.
    let classify = |r: f64| -> &'static str {
        if r > 100.0 {
            "CPU-bound"
        } else if r > 10.0 {
            "CPU-heavy"
        } else if r > 2.0 {
            "balanced"
        } else if r > 0.5 {
            "GPU-heavy"
        } else {
            "GPU-bound"
        }
    };

    println!(" CPU vs GPU Time by Operator:\n");
    println!(" Op CPU Time GPU Time Ratio Assessment");
    println!(" ───────────────────────────────── ────────── ────────── ────────── ──────────");
    for op in ops.iter() {
        let (cpu, gpu) = (op.1, op.2);
        let ratio_str;
        let assessment;
        if gpu < 0.01 {
            // No measurable GPU time: the ratio is reported as infinite.
            ratio_str = "∞".to_string();
            assessment = "pure overhead";
        } else {
            let r = cpu / gpu;
            ratio_str = format!("{:.0}:1", r);
            assessment = classify(r);
        }
        println!(
            " {:<34} {:>9} {:>9} {:>10} {}",
            trunc(&op.0, 34),
            fmt_us(cpu),
            fmt_us(gpu),
            ratio_str,
            assessment
        );
    }

    let cpu_sum: f64 = ops.iter().map(|o| o.1).sum();
    let gpu_sum: f64 = ops.iter().map(|o| o.2).sum();
    let gpu_util = if cpu_sum > 0.0 { gpu_sum / cpu_sum * 100.0 } else { 0.0 };
    println!("\n GPU utilization: {:.1}% (GPU active time / CPU wall time)", gpu_util);
}
pub fn cmd_top_ops(db: &GpuDb, args: &[&str]) {
if !require_op_layer(db) { return; }
let n = parse_count(args);
let pattern = parse_pattern(args);
let pat_clause = pattern
.map(|p| format!(r"AND o.name LIKE '%{}%' ESCAPE '\'", escape_sql_like(p)))
.unwrap_or_default();
let sql = format!(
"SELECT o.name, o.cpu_time_us, o.gpu_time_us, o.module_path
FROM ops o
WHERE o.gpu_time_us > 0 {pat_clause}
ORDER BY o.gpu_time_us DESC
LIMIT ?1"
);
let mut stmt = db.conn.prepare(&sql).unwrap();
let rows: Vec<_> = stmt.query_map([n as i64], |row| {
Ok((row.get::<_,String>(0)?, row.get::<_,f64>(1)?,
row.get::<_,f64>(2)?, row.get::<_,Option<String>>(3)?))
}).unwrap().filter_map(|r| r.ok()).collect();
if rows.is_empty() {
println!("no ops with GPU time (op->kernel correlation may be missing)");
return;
}
let total_gpu = db.total_gpu_time_us();
println!(" Ops by GPU Time:\n");
println!(" # Op GPU Time % GPU CPU Time Ratio");
println!(" ── ───────────────────────────────── ────────── ──────── ────────── ──────");
for (i, (name, cpu, gpu, _)) in rows.iter().enumerate() {
let pct = if total_gpu > 0.0 { gpu / total_gpu * 100.0 } else { 0.0 };
let ratio = if *gpu > 0.01 { format!("{:.0}:1", cpu / gpu) } else { "∞".into() };
println!(" {:<2} {:<34} {:>9} {:>7.1}% {:>9} {:>6}",
i + 1, trunc(name, 34), fmt_us(*gpu), pct, fmt_us(*cpu), ratio);
}
}
/// `breakdown <op_pattern>`: for every op whose name contains the pattern,
/// prints its CPU/GPU time and a table of the kernels mapped to it (total
/// time, share of the op's kernel time, average time and launch count).
pub fn cmd_breakdown(db: &GpuDb, args: &[&str]) {
    let pattern = match args.first() {
        Some(p) => *p,
        None => {
            println!("usage: breakdown <op_pattern>");
            return;
        }
    };
    if !require_op_layer(db) {
        return;
    }
    let op_sql = r"SELECT id, name, cpu_time_us, gpu_time_us FROM ops WHERE name LIKE ?1 ESCAPE '\'";
    let mut stmt = db.conn.prepare(op_sql).unwrap();
    let ops: Vec<_> = stmt
        .query_map([like_param(pattern)], |row| {
            Ok((
                row.get::<_, i64>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, f64>(2)?,
                row.get::<_, f64>(3)?,
            ))
        })
        .unwrap()
        .filter_map(|r| r.ok())
        .collect();
    if ops.is_empty() {
        println!("no op matching '{pattern}'");
        return;
    }

    // The kernel query text does not depend on the op (only the bound id
    // changes), so it is built and prepared once instead of once per op.
    let tl_l = db.timeline_filter_for("l");
    let k_sql = format!(
        "SELECT okm.kernel_name,\n\
         COUNT(*) as launches,\n\
         SUM(l.duration_us) as total_us,\n\
         AVG(l.duration_us) as avg_us\n\
         FROM op_kernel_map okm\n\
         JOIN launches l ON l.kernel_name = okm.kernel_name AND {tl_l}\n\
         WHERE okm.op_id = ?1\n\
         GROUP BY okm.kernel_name\n\
         ORDER BY total_us DESC"
    );
    let mut k_stmt = db.conn.prepare(&k_sql).unwrap();

    for (op_id, op_name, cpu_time, gpu_time) in &ops {
        println!("Op: {op_name}");
        println!(" CPU: {} GPU: {}", fmt_us(*cpu_time), fmt_us(*gpu_time));
        let kernels: Vec<_> = k_stmt
            .query_map([op_id], |row| {
                Ok((
                    row.get::<_, String>(0)?,
                    row.get::<_, i64>(1)?,
                    row.get::<_, f64>(2)?,
                    row.get::<_, f64>(3)?,
                ))
            })
            .unwrap()
            .filter_map(|r| r.ok())
            .collect();
        if kernels.is_empty() {
            println!(" (no correlated kernels)\n");
            continue;
        }
        let kernel_total: f64 = kernels.iter().map(|k| k.2).sum();
        println!(" Kernels ({} total GPU time):\n", fmt_us(kernel_total));
        println!(" Kernel Total % Avg Launches");
        println!(" ────────────────────────────────── ───────── ────── ───────── ────────");
        for (kname, launches, total, avg) in &kernels {
            // Share of this op's summed kernel time.
            let pct = if kernel_total > 0.0 { total / kernel_total * 100.0 } else { 0.0 };
            println!(
                " {:<34} {:>9} {:>5.1}% {:>9} {:>7}",
                trunc(kname, 34),
                fmt_us(*total),
                pct,
                fmt_us(*avg),
                launches
            );
        }
        println!();
    }
}
/// `idle-between <op_a_pattern> <op_b_pattern>`: measures the gap between the
/// end of each kernel belonging to ops matching A and the start of the next
/// kernel belonging to ops matching B, then prints count, total, average,
/// min, max and (when wall time is known) the share of wall time.
pub fn cmd_idle_between(db: &GpuDb, args: &[&str]) {
    if args.len() < 2 {
        println!("usage: idle-between <op_a_pattern> <op_b_pattern>");
        return;
    }
    let (pat_a, pat_b) = (args[0], args[1]);
    if !require_op_layer(db) {
        return;
    }

    // Restrict launches to the first torch/proton layer when one exists;
    // otherwise fall back to the regular timeline filter.
    let layer_lookup = db.conn.query_row(
        "SELECT id FROM layers WHERE source IN ('torch', 'proton') ORDER BY id LIMIT 1",
        [],
        |row| row.get::<_, i64>(0),
    );
    let tl = if let Ok(id) = layer_lookup {
        format!("launches.layer_id = {id}")
    } else {
        db.timeline_filter()
    };

    // Distinct kernel names mapped to ops whose name contains the pattern.
    let lookup_sql = "SELECT DISTINCT kernel_name FROM op_kernel_map okm\n\
                      JOIN ops o ON o.id = okm.op_id\n\
                      WHERE o.name LIKE ?1 ESCAPE '\\'";
    let kernels_for = |op_pattern: &str| -> Vec<String> {
        let mut stmt = db.conn.prepare(lookup_sql).unwrap();
        let found: Vec<String> = stmt
            .query_map([like_param(op_pattern)], |row| row.get(0))
            .unwrap()
            .filter_map(Result::ok)
            .collect();
        found
    };
    let kernels_a = kernels_for(pat_a);
    let kernels_b = kernels_for(pat_b);
    if kernels_a.is_empty() {
        println!("no kernels found for op '{}'", pat_a);
        return;
    }
    if kernels_b.is_empty() {
        println!("no kernels found for op '{}'", pat_b);
        return;
    }

    // Render names as a quoted SQL list, doubling embedded single quotes.
    let quoted_list = |names: &[String]| -> String {
        let quoted: Vec<String> = names
            .iter()
            .map(|k| format!("'{}'", k.replace('\'', "''")))
            .collect();
        quoted.join(",")
    };
    let placeholders_a = quoted_list(&kernels_a);
    let placeholders_b = quoted_list(&kernels_b);

    let a_sql = format!(
        "SELECT start_us + duration_us AS end_us FROM launches\n\
         WHERE kernel_name IN ({placeholders_a}) AND start_us IS NOT NULL AND {tl}\n\
         ORDER BY start_us"
    );
    let b_sql = format!(
        "SELECT start_us FROM launches\n\
         WHERE kernel_name IN ({placeholders_b}) AND start_us IS NOT NULL AND {tl}\n\
         ORDER BY start_us"
    );
    let mut a_stmt = db.conn.prepare(&a_sql).unwrap();
    let a_ends: Vec<f64> = a_stmt
        .query_map([], |row| row.get(0))
        .unwrap()
        .filter_map(Result::ok)
        .collect();
    let mut b_stmt = db.conn.prepare(&b_sql).unwrap();
    let b_starts: Vec<f64> = b_stmt
        .query_map([], |row| row.get(0))
        .unwrap()
        .filter_map(Result::ok)
        .collect();

    // For each A end time, advance a forward-only cursor to the first B start
    // that is not earlier than it and record the distance between the two.
    let mut gaps: Vec<f64> = Vec::new();
    let mut cursor = 0usize;
    for &a_end in a_ends.iter() {
        cursor += b_starts[cursor..]
            .iter()
            .take_while(|&&start| start < a_end)
            .count();
        if let Some(&next_start) = b_starts.get(cursor) {
            let gap = next_start - a_end;
            if gap >= 0.0 {
                gaps.push(gap);
            }
        }
    }
    if gaps.is_empty() {
        println!("no transitions found from '{}' to '{}'", pat_a, pat_b);
        return;
    }

    let total: f64 = gaps.iter().sum();
    let avg = total / gaps.len() as f64;
    let min = gaps.iter().copied().fold(f64::INFINITY, f64::min);
    let max = gaps.iter().copied().fold(0.0_f64, f64::max);
    println!(" Idle Between '{}' → '{}':\n", pat_a, pat_b);
    println!(" Transitions: {}", gaps.len());
    println!(" Total idle: {}", fmt_us(total));
    println!(" Average: {}", fmt_us(avg));
    println!(" Min: {}", fmt_us(min));
    println!(" Max: {}", fmt_us(max));
    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    if wall_us > 0.0 {
        println!(" % of wall: {:.1}%", total / wall_us * 100.0);
    }
}
/// `outliers <kernel_pattern>`: picks the matching kernel with the most
/// launches, prints its duration distribution (median / p90 / p99 / max) and
/// its slowest launches, and reports whether those slow launches cluster in
/// the first or last third of the launch sequence.
pub fn cmd_outliers(db: &GpuDb, args: &[&str]) {
let pattern = match args.first() {
Some(p) => *p,
None => { println!("usage: outliers <kernel_pattern>"); return; }
};
let tl = db.timeline_filter();
// Resolve the pattern to a single kernel: the match with the highest launch count.
let resolve_sql = format!(
r"SELECT kernel_name, COUNT(*) FROM launches
WHERE kernel_name LIKE ?1 ESCAPE '\' AND {tl}
GROUP BY kernel_name ORDER BY COUNT(*) DESC LIMIT 1"
);
let kernel = match db.conn.query_row(
&resolve_sql, [like_param(pattern)],
|row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
) {
Ok(x) => x,
Err(_) => { println!("no kernel matching '{pattern}'"); return; }
};
// `total_cnt` counts every matching launch; `launches` below additionally
// requires a non-NULL start_us, so the two counts can differ.
let (name, total_cnt) = kernel;
let all_sql = format!(
"SELECT start_us, duration_us FROM launches
WHERE kernel_name = ?1 AND start_us IS NOT NULL AND {tl}
ORDER BY start_us"
);
let launches: Vec<(f64, f64)> = db.query_vec(&all_sql, [&name], |row| {
Ok((row.get(0)?, row.get(1)?))
});
if launches.len() < 4 {
println!("{name}: only {} launches — need ≥4 for outlier analysis", launches.len());
return;
}
let cnt = launches.len();
// Durations in ascending order, used for the percentile lookups.
let mut sorted: Vec<f64> = launches.iter().map(|(_, d)| *d).collect();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
// Index of percentile `p` in `sorted`: ceil(p * cnt) - 1, clamped to a valid index.
let pct_idx = |p: f64| -> usize {
let k = (p * cnt as f64).ceil() as isize - 1;
k.clamp(0, cnt as isize - 1) as usize
};
let median = sorted[pct_idx(0.50)];
let p90 = sorted[pct_idx(0.90)];
let p99 = sorted[pct_idx(0.99)];
// (position in start order, start, duration), slowest first.
let mut indexed: Vec<(usize, f64, f64)> = launches.iter().enumerate()
.map(|(i, (s, d))| (i, *s, *d)).collect();
indexed.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
// Examine the slowest 10% of launches, but never fewer than 3.
let top_n = (cnt / 10).max(3).min(cnt);
let outliers = &indexed[..top_n];
// Count how many outliers fall in the first and in the last third of the
// launch sequence (by position, not by timestamp).
let third = cnt / 3;
let mut early = 0;
let mut late = 0;
for &(idx, _, _) in outliers {
if idx < third { early += 1; }
else if idx >= cnt - third { late += 1; }
}
// Time span from the first launch's start to the last launch's end.
let t_min = launches.first().map(|(s, _)| *s).unwrap_or(0.0);
let t_max = launches.last().map(|(s, d)| *s + *d).unwrap_or(0.0);
let span = t_max - t_min;
println!(" Outliers: {} ({} launches)\n", trunc(&name, 60), total_cnt);
println!(" Distribution:");
println!(" median: {} p90: {} p99: {} max: {}",
fmt_us(median), fmt_us(p90), fmt_us(p99), fmt_us(sorted[cnt - 1]));
// The median is floored at 1e-9 to avoid dividing by zero.
println!(" worst is {:.1}x median\n", sorted[cnt - 1] / median.max(1e-9));
println!(" Worst {} launches (top {:.0}%):", top_n, top_n as f64 / cnt as f64 * 100.0);
println!(" # Idx Timeline Start Duration vs median");
println!(" ─── ───── ──────────── ──────────── ─────────── ─────────");
for (i, &(idx, start, dur)) in outliers.iter().enumerate() {
// Position of this launch's start within the span, as a percentage.
let tpos = if span > 0.0 { (start - t_min) / span * 100.0 } else { 0.0 };
let ratio = dur / median.max(1e-9);
println!(" {:<3} {:>5} {:>11.1}% {:>12} {:>11} {:>7.1}x",
i + 1, idx, tpos, fmt_us(start), fmt_us(dur), ratio);
}
println!();
let worst_ratio = sorted[cnt - 1] / median.max(1e-9);
// Verdict, checked in order: too few samples, no meaningful outliers,
// early clustering, late clustering, otherwise spread out.
if cnt < 20 {
println!(" → {cnt} launches — too few to distinguish clustering from noise");
} else if worst_ratio < 1.5 {
println!(" → launches are uniform (worst {:.2}x median) — no meaningful outliers", worst_ratio);
} else if early > 2 * late && early >= top_n / 2 {
println!(" → clusters EARLY ({}/{} outliers in first third) — likely warmup / JIT / cache cold", early, top_n);
} else if late > 2 * early && late >= top_n / 2 {
println!(" → clusters LATE ({}/{} outliers in last third) — thermal throttling, memory fragmentation, or contention", late, top_n);
} else {
println!(" → outliers spread across the timeline — likely data-dependent work or scheduler jitter");
}
}
/// `source <kernel_pattern>`: lists the ops (and their module paths) that are
/// mapped to kernels whose name contains the pattern.
///
/// Requires the `torch` or `proton` layer. Up to 20 ops are shown, ordered by
/// summed GPU time. All output goes to stdout.
pub fn cmd_source(db: &GpuDb, args: &[&str]) {
    const SQL: &str = "SELECT o.name, COALESCE(o.module_path, '') AS mp,\n\
                       COUNT(DISTINCT o.id) AS op_hits,\n\
                       SUM(COALESCE(o.gpu_time_us, 0)) AS gpu_us\n\
                       FROM op_kernel_map m\n\
                       JOIN ops o ON o.id = m.op_id\n\
                       WHERE m.kernel_name LIKE ?1 ESCAPE '\\'\n\
                       GROUP BY o.name, mp\n\
                       ORDER BY gpu_us DESC\n\
                       LIMIT 20";

    let Some(&pattern) = args.first() else {
        println!("usage: source <kernel_pattern>");
        return;
    };

    let has_mapping = db.has_layer("torch") || db.has_layer("proton");
    if !has_mapping {
        println!("no op-to-kernel mapping — need torch.profiler or proton layer");
        println!("(run 'suggest' for how to collect it)");
        return;
    }

    // (op name, module path or "", distinct op hits, summed GPU time in us)
    let rows: Vec<(String, String, i64, f64)> = db.query_vec(SQL, [like_param(pattern)], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
    });
    if rows.is_empty() {
        println!("no op mapping found for pattern '{pattern}'");
        return;
    }

    println!(" Launch sites for kernels matching '{pattern}':\n");
    println!(" Op Hits GPU Time Source");
    println!(" ──────────────────────────────────────── ───── ─────────── ──────────────────────────────");
    for (op_name, module_path, hits, gpu_us) in &rows {
        // An empty module path (NULL in the database) is rendered as a dash.
        let src = match module_path.as_str() {
            "" => "—".to_string(),
            path => trunc(path, 40),
        };
        println!(" {:<40} {:>5} {:>11} {}",
            trunc(op_name, 40), hits, fmt_us(*gpu_us), src);
    }
}
/// `memory [n]`: summarises the recorded allocation events.
///
/// Prints event counts, total bytes allocated, the peak of live bytes, what
/// was still live at the end of the trace, the `n` largest allocations
/// (`n` comes from `parse_count`) with their lifetimes, and a churn hint when
/// more than ten allocations lived for under 100us. All output goes to stdout.
pub fn cmd_memory(db: &GpuDb, args: &[&str]) {
    use std::collections::HashMap;

    // Allocations freed sooner than this many microseconds count as churn.
    const SHORT_LIVED_US: f64 = 100.0;

    let total = db.scalar_f64("SELECT COUNT(*) FROM allocations") as i64;
    if total == 0 {
        println!("no allocation data");
        println!("(re-profile to capture it — memory tracking is enabled by default in this build)");
        return;
    }
    let n = parse_count(args);

    let (n_alloc, n_free, sum_alloc): (i64, i64, i64) = db.conn.query_row(
        "SELECT SUM(CASE WHEN op = 'alloc' THEN 1 ELSE 0 END),\n\
         SUM(CASE WHEN op = 'free' THEN 1 ELSE 0 END),\n\
         COALESCE(SUM(CASE WHEN op = 'alloc' THEN bytes ELSE 0 END), 0)\n\
         FROM allocations",
        [],
        |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
    ).unwrap_or((0, 0, 0));

    // Replay the events in time order to track live bytes.
    let events: Vec<(f64, String, i64, i64)> = db.query_vec(
        "SELECT start_us, op, address, bytes FROM allocations ORDER BY start_us",
        [],
        |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
    );
    // address -> (bytes, allocation time) for every allocation not yet freed
    let mut live: HashMap<i64, (i64, f64)> = HashMap::new();
    let mut live_bytes: i64 = 0;
    let mut peak: i64 = 0;
    let mut peak_time: f64 = 0.0;
    // (bytes, lifetime in us) for every allocation whose free was seen
    let mut alloc_lifetimes: Vec<(i64, f64)> = Vec::new();
    for (t, op, addr, bytes) in &events {
        match op.as_str() {
            "alloc" => {
                // The address is being handed out again without a recorded
                // free: retire the stale entry so its bytes are not counted
                // twice in `live_bytes` and the peak.
                if let Some((stale_bytes, _)) = live.insert(*addr, (*bytes, *t)) {
                    live_bytes -= stale_bytes;
                }
                live_bytes += *bytes;
                if live_bytes > peak {
                    peak = live_bytes;
                    peak_time = *t;
                }
            }
            "free" => {
                // Frees of addresses that were never seen allocated are ignored.
                if let Some((freed_bytes, since)) = live.remove(addr) {
                    live_bytes -= freed_bytes;
                    alloc_lifetimes.push((freed_bytes, *t - since));
                }
            }
            _ => {}
        }
    }
    let leaked: i64 = live.values().map(|&(b, _)| b).sum();
    let leak_count = live.len();

    println!(" GPU Memory Summary\n");
    println!(" Events: {n_alloc} allocs, {n_free} frees");
    println!(" Total: {} allocated across {n_alloc} events", fmt_bytes(sum_alloc));
    println!(" Peak live: {} at t={}", fmt_bytes(peak), fmt_us(peak_time));
    if leak_count > 0 {
        println!(" Leaked: {} across {leak_count} allocations (not freed by exit)", fmt_bytes(leaked));
    } else {
        println!(" Leaked: none");
    }
    println!();

    let big_sql = "SELECT address, bytes, start_us FROM allocations\n\
                   WHERE op = 'alloc' ORDER BY bytes DESC LIMIT ?1";
    let limit = i64::try_from(n).unwrap_or(i64::MAX);
    let bigs: Vec<(i64, i64, f64)> = db.query_vec(big_sql, [limit], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?))
    });
    if !bigs.is_empty() {
        println!(" Largest Allocations:");
        println!(" # Size Start Lifetime");
        println!(" ── ──────────── ──────────── ──────────────");
        for (i, (addr, bytes, start)) in bigs.iter().enumerate() {
            // Lifetime = time until the first free of the same address after
            // this allocation; if there is none it is reported as leaked.
            let lifetime = db.conn.query_row(
                "SELECT start_us FROM allocations\n\
                 WHERE op = 'free' AND address = ?1 AND start_us > ?2\n\
                 ORDER BY start_us LIMIT 1",
                rusqlite::params![addr, start],
                |row| row.get::<_, f64>(0),
            )
            .ok()
            .map(|freed_at| fmt_us(freed_at - *start))
            .unwrap_or_else(|| "leaked".into());
            println!(" {:<2} {:>12} {:>12} {}",
                i + 1, fmt_bytes(*bytes), fmt_us(*start), lifetime);
        }
    }

    // Churn: count and total size of short-lived allocations in one pass.
    let (short_cnt, bytes_churned) = alloc_lifetimes
        .iter()
        .filter(|(_, lifetime)| *lifetime < SHORT_LIVED_US)
        .fold((0usize, 0i64), |(count, sum), (b, _)| (count + 1, sum + b));
    if short_cnt > 10 {
        println!("\n Churn: {short_cnt} allocations lived < 100us ({} total) — consider a pool allocator",
            fmt_bytes(bytes_churned));
    }
}
use std::path::PathBuf;
pub fn cmd_bandwidth(db: &GpuDb, args: &[&str]) {
if !db.has_layer("ncu") {
println!("no bandwidth data — need ncu layer (achieved_bandwidth_gb_s)");
return;
}
let n = parse_count(args);
let pattern = parse_pattern(args);
let pat_clause = pattern
.map(|p| format!(r"AND kernel_name LIKE '%{}%' ESCAPE '\'", escape_sql_like(p)))
.unwrap_or_default();
let sql = format!(
"SELECT kernel_name, achieved_bandwidth_gb_s, peak_bandwidth_gb_s, boundedness
FROM metrics
WHERE achieved_bandwidth_gb_s IS NOT NULL {pat_clause}
ORDER BY achieved_bandwidth_gb_s DESC"
);
let rows: Vec<(String, f64, Option<f64>, Option<String>)> = db.query_vec(
&sql, [], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
);
if rows.is_empty() {
println!("no kernels have bandwidth metrics");
return;
}
let tl = db.timeline_filter();
let time_sql = format!(
"SELECT kernel_name, SUM(duration_us) FROM launches WHERE {tl} GROUP BY kernel_name"
);
let time_rows: Vec<(String, f64)> = db.query_vec(&time_sql, [], |row| {
Ok((row.get(0)?, row.get(1)?))
});
let time_of: std::collections::HashMap<String, f64> = time_rows.into_iter().collect();
println!(" Per-kernel Memory Bandwidth:\n");
println!(" # Kernel Achieved Peak % peak Bound GPU Time");
println!(" ── ────────────────────────────────── ──────────── ────────── ─────── ──────── ──────────");
let shown = rows.iter().take(n);
let mut flagged = 0usize;
for (i, (name, ach, peak, bound)) in shown.enumerate() {
let pct = peak.filter(|&p| p > 0.0).map(|p| ach / p * 100.0);
let pct_str = pct.map(|v| format!("{v:.1}%")).unwrap_or_else(|| "?".into());
let peak_str = peak.map(|v| format!("{v:.1}")).unwrap_or_else(|| "?".into());
let gpu_us = time_of.get(name).copied().unwrap_or(0.0);
let flag = match pct {
Some(v) if v < 50.0 => { flagged += 1; " ←low" }
_ => "",
};
println!(" {:<2} {:<34} {:>9.1} GB/s {:>6} GB/s {:>6} {:<8} {:>10}{flag}",
i + 1, trunc(name, 34), ach, peak_str, pct_str,
bound.as_deref().unwrap_or("?"), fmt_us(gpu_us));
}
if flagged > 0 {
println!("\n {flagged} kernel(s) under 50% of peak bandwidth — likely memory-access bound");
println!(" (poor coalescing, low L2 hit rate, or uncoalesced strided loads)");
}
}
/// `critical-path [gap_us]`: groups launches on the same stream into chains
/// and reports the longest one.
///
/// A launch joins the current chain when it is on the same stream and starts
/// no more than `gap_us` (default 100) after the chain's end. Chains are
/// ranked by span, then by summed kernel time. Requires the `nsys` or `torch`
/// layer. All output goes to stdout.
pub fn cmd_critical_path(db: &GpuDb, args: &[&str]) {
    // One run of closely spaced launches on a single stream.
    struct Chain {
        stream: u32,
        start: f64,
        end: f64,
        kernel_time: f64,
        kernels: Vec<(String, f64)>,
    }

    if !db.has_layer("nsys") && !db.has_layer("torch") {
        println!("no timeline data — need nsys or torch layer");
        return;
    }
    let gap_thresh: f64 = args.first()
        .and_then(|s| s.parse().ok())
        .unwrap_or(100.0);
    let tl = db.timeline_filter();
    let sql = format!(
        "SELECT kernel_name, start_us, duration_us, stream_id\n\
         FROM launches\n\
         WHERE start_us IS NOT NULL AND stream_id IS NOT NULL AND {tl}\n\
         ORDER BY stream_id, start_us"
    );
    let rows: Vec<(String, f64, f64, u32)> = db.query_vec(&sql, [], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
    });
    if rows.is_empty() {
        println!("no stream-tagged launches available");
        return;
    }

    // Rows arrive grouped by stream and ordered by start, so a single pass
    // with one open chain is enough.
    let mut chains: Vec<Chain> = Vec::new();
    let mut cur: Option<Chain> = None;
    for (name, start, dur, stream) in &rows {
        let end = start + dur;
        let extend = cur.as_ref().is_some_and(|c| {
            c.stream == *stream && start - c.end <= gap_thresh
        });
        if !extend {
            if let Some(c) = cur.take() { chains.push(c); }
            cur = Some(Chain {
                stream: *stream, start: *start, end,
                kernel_time: *dur, kernels: vec![(name.clone(), *dur)],
            });
        } else if let Some(c) = cur.as_mut() {
            // Never let the chain end move backwards: a launch that finishes
            // before the running end must not shrink the span or distort the
            // gap test for the next launch.
            c.end = c.end.max(end);
            c.kernel_time += dur;
            c.kernels.push((name.clone(), *dur));
        }
    }
    if let Some(c) = cur.take() { chains.push(c); }

    // Longest span first; ties broken by larger summed kernel time.
    chains.sort_by(|a, b| {
        let span_a = a.end - a.start;
        let span_b = b.end - b.start;
        span_b.total_cmp(&span_a)
            .then_with(|| b.kernel_time.total_cmp(&a.kernel_time))
    });

    println!(" Critical path chains (same stream, gap ≤ {}):\n", fmt_us(gap_thresh));
    let Some(best) = chains.first() else {
        println!(" (no chains to report)");
        return;
    };
    let best_span = best.end - best.start;
    let utilization = if best_span > 0.0 { best.kernel_time / best_span * 100.0 } else { 0.0 };
    println!(" Longest chain: stream {} span {} active {} ({utilization:.0}%) {} kernel(s)",
        best.stream, fmt_us(best_span), fmt_us(best.kernel_time), best.kernels.len());

    // Per-kernel (launch count, total time) within the longest chain.
    let mut agg: std::collections::HashMap<&str, (usize, f64)> = std::collections::HashMap::new();
    for (name, dur) in &best.kernels {
        let e = agg.entry(name.as_str()).or_insert((0, 0.0));
        e.0 += 1;
        e.1 += dur;
    }
    let mut ranked: Vec<_> = agg.into_iter().collect();
    ranked.sort_by(|a, b| (b.1).1.total_cmp(&(a.1).1));
    println!("\n Top kernels on chain:");
    println!(" Kernel Launches Time % chain");
    println!(" ────────────────────────────────────────── ──────── ────────── ────────");
    for (name, (cnt, total)) in ranked.iter().take(8) {
        let pct = if best.kernel_time > 0.0 { total / best.kernel_time * 100.0 } else { 0.0 };
        println!(" {:<42} {:>8} {:>10} {:>6.1}%",
            trunc(name, 42), cnt, fmt_us(*total), pct);
    }

    if chains.len() > 1 {
        println!("\n Other long chains:");
        println!(" # Stream Span Active Util Kernels");
        println!(" ── ─────── ─────────── ─────────── ────── ────────");
        for (i, c) in chains.iter().skip(1).take(5).enumerate() {
            let span = c.end - c.start;
            let util = if span > 0.0 { c.kernel_time / span * 100.0 } else { 0.0 };
            println!(" {:<2} {:>7} {:>11} {:>11} {:>5.0}% {:>7}",
                i + 2, c.stream, fmt_us(span), fmt_us(c.kernel_time), util, c.kernels.len());
        }
    }

    let wall_us: f64 = db.meta("wall_time_us").parse().unwrap_or(0.0);
    if wall_us > 0.0 {
        println!("\n Chain wall fraction: {:.1}% of wall time ({})",
            best_span / wall_us * 100.0, fmt_us(wall_us));
    }
}
/// `stream-graph [width]`: draws one text row per stream showing which kernel
/// occupies each slice of the timeline.
///
/// `width` is the number of character cells (accepted range 20..=500, default
/// 100). Kernels are ranked by total time and assigned the glyphs `A-Z`,
/// `a-z`, `0-9` in that order; any further kernels share `*`. A legend for the
/// top 20 kernels follows the graph. All output goes to stdout.
pub fn cmd_stream_graph(db: &GpuDb, args: &[&str]) {
    use std::collections::{BTreeMap, HashMap};

    const LEGEND_ROWS: usize = 20;

    let width = args
        .first()
        .and_then(|s| s.parse::<usize>().ok())
        .filter(|w| (20..=500).contains(w))
        .unwrap_or(100);

    let tl = db.timeline_filter();
    let sql = format!(
        "SELECT kernel_name, start_us, duration_us, stream_id\n\
         FROM launches\n\
         WHERE start_us IS NOT NULL AND stream_id IS NOT NULL AND {tl}\n\
         ORDER BY stream_id, start_us"
    );
    let rows: Vec<(String, f64, f64, u32)> = db.query_vec(&sql, [], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
    });
    if rows.is_empty() {
        println!("no timeline data");
        return;
    }

    // Overall time range covered by the launches.
    let mut t_min = f64::INFINITY;
    let mut t_max = f64::NEG_INFINITY;
    for (_, start, dur, _) in &rows {
        t_min = t_min.min(*start);
        t_max = t_max.max(start + dur);
    }
    let span = t_max - t_min;
    if span <= 0.0 {
        println!("timeline has zero span");
        return;
    }

    // Launches per stream (BTreeMap keeps streams in ascending id order) and
    // total time per kernel name.
    let mut by_stream: BTreeMap<u32, Vec<(&str, f64, f64)>> = BTreeMap::new();
    let mut kernel_time: HashMap<String, f64> = HashMap::new();
    for (name, start, dur, stream) in &rows {
        by_stream.entry(*stream).or_default().push((name.as_str(), *start, *dur));
        *kernel_time.entry(name.clone()).or_insert(0.0) += dur;
    }

    // Kernels ordered by total time, largest first; rank decides the glyph.
    let mut kernel_rank: Vec<(String, f64)> = kernel_time.into_iter().collect();
    kernel_rank.sort_by(|x, y| y.1.partial_cmp(&x.1).unwrap());
    let glyphs: Vec<char> = ('A'..='Z').chain('a'..='z').chain('0'..='9').collect();
    let glyph_for_rank = |rank: usize| glyphs.get(rank).copied().unwrap_or('*');
    let glyph_of: HashMap<&str, char> = kernel_rank
        .iter()
        .enumerate()
        .map(|(rank, (name, _))| (name.as_str(), glyph_for_rank(rank)))
        .collect();

    println!(" Stream Graph ({} → {}, span {})\n",
        fmt_us(t_min), fmt_us(t_max), fmt_us(span));
    let cells = width as f64;
    for (stream, launches) in &by_stream {
        let mut line = vec![' '; width];
        for &(name, start, dur) in launches {
            // Map the launch to a cell range; every launch paints at least one
            // cell, and later launches overwrite earlier ones.
            let first = (((start - t_min) / span * cells).floor() as usize).min(width - 1);
            let end_raw = ((start + dur - t_min) / span * cells).ceil() as usize;
            let last = end_raw.clamp(first + 1, width);
            let g = glyph_of.get(name).copied().unwrap_or('?');
            line[first..last].fill(g);
        }
        let row: String = line.into_iter().collect();
        println!(" s{:<4} │{row}│", stream);
    }
    let axis: String = "─".repeat(width);
    println!(" └{axis}┘");

    println!("\n Legend:");
    let legend_len = glyphs.len().min(LEGEND_ROWS);
    for (rank, (name, total)) in kernel_rank.iter().take(legend_len).enumerate() {
        let g = glyph_for_rank(rank);
        println!(" {g} {:<50} {}", trunc(name, 50), fmt_us(*total));
    }
    if kernel_rank.len() > LEGEND_ROWS {
        println!(" ({} more kernels not shown)", kernel_rank.len() - LEGEND_ROWS);
    }
}
/// Finds the window of length `window_us` that contains the most launch time.
///
/// `intervals` holds `(start, duration)` pairs and is expected to be sorted
/// by ascending start; the scan stops at the first interval that starts at or
/// after the window end. Overlapping intervals each contribute their own
/// overlap, so the busy time can exceed `window_us`.
///
/// Candidate windows either begin at an interval's start or end at an
/// interval's end. Among equally busy candidates the earliest one wins.
///
/// Returns `(busy, window_start, lo, hi)`. `lo..hi` is the index range that
/// was scanned for the winning window: `lo` is the first interval ending
/// after the window start and `hi` is one past the last interval starting
/// before the window end. Intervals inside that range do not necessarily all
/// intersect the window. Returns `(0.0, 0.0, 0, 0)` when `intervals` is
/// empty, `window_us <= 0`, or no candidate has positive busy time.
///
/// NaN values never cause a panic: candidates are ordered with
/// `f64::total_cmp`, and a NaN candidate simply yields zero busy time.
pub(crate) fn find_hottest_window(
    intervals: &[(f64, f64)],
    window_us: f64,
) -> (f64, f64, usize, usize) {
    let n = intervals.len();
    if n == 0 || window_us <= 0.0 {
        return (0.0, 0.0, 0, 0);
    }

    let mut candidates: Vec<f64> = Vec::with_capacity(2 * n);
    for &(s, d) in intervals {
        candidates.push(s); // window begins where this interval begins
        candidates.push(s + d - window_us); // window ends where this interval ends
    }
    // total_cmp gives a total order, so a NaN candidate cannot panic the sort.
    candidates.sort_by(f64::total_cmp);

    let mut best = (0.0_f64, 0.0_f64, 0usize, 0usize);
    let mut lo = 0usize;
    for &w_start in &candidates {
        let w_end = w_start + window_us;
        // Candidates ascend, so `lo` only ever moves forward.
        while lo < n && intervals[lo].0 + intervals[lo].1 <= w_start {
            lo += 1;
        }
        let mut busy = 0.0_f64;
        let mut hi_scan = lo;
        while hi_scan < n && intervals[hi_scan].0 < w_end {
            let (s, d) = intervals[hi_scan];
            let overlap_start = s.max(w_start);
            let overlap_end = (s + d).min(w_end);
            if overlap_start < overlap_end {
                busy += overlap_end - overlap_start;
            }
            hi_scan += 1;
        }
        if busy > best.0 {
            best = (busy, w_start, lo, hi_scan);
        }
    }
    best
}
/// `hotspot <window_us>`: finds the busiest window of the given length and
/// breaks its busy time down by kernel.
///
/// The window length must be a finite, positive number of microseconds. The
/// "Launches" figure counts only launches that actually intersect the chosen
/// window. All output goes to stdout.
pub fn cmd_hotspot(db: &GpuDb, args: &[&str]) {
    let window_us: f64 = match args.first().and_then(|s| s.parse::<f64>().ok()) {
        // "inf" parses as a float but is not a usable window length.
        Some(v) if v.is_finite() && v > 0.0 => v,
        _ => { println!("usage: hotspot <window_us> (e.g. 10000 for 10ms)"); return; }
    };
    let tl = db.timeline_filter();
    let sql = format!(
        "SELECT kernel_name, start_us, duration_us\n\
         FROM launches\n\
         WHERE start_us IS NOT NULL AND {tl}\n\
         ORDER BY start_us"
    );
    let rows: Vec<(String, f64, f64)> = db.query_vec(&sql, [], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?))
    });
    if rows.is_empty() { println!("no timeline data"); return; }

    let intervals: Vec<(f64, f64)> = rows.iter().map(|(_, s, d)| (*s, *d)).collect();
    let (busy, w_start, lo, hi_end) = find_hottest_window(&intervals, window_us);
    if busy == 0.0 {
        println!("no activity found in any window");
        return;
    }
    let w_end = w_start + window_us;
    let util = busy / window_us * 100.0;

    // The scanned index range can contain launches that ended before the
    // window began, so count and aggregate only those that intersect it.
    let mut launches_in_window = 0usize;
    let mut agg: std::collections::HashMap<&str, (usize, f64)> = std::collections::HashMap::new();
    for (name, s, d) in rows.iter().take(hi_end).skip(lo) {
        let end = s + d;
        if *s < w_end && end > w_start {
            launches_in_window += 1;
        }
        let os = s.max(w_start);
        let oe = end.min(w_end);
        if os < oe {
            let e = agg.entry(name.as_str()).or_insert((0, 0.0));
            e.0 += 1;
            e.1 += oe - os;
        }
    }

    println!(" Hottest {} window:\n", fmt_us(window_us));
    println!(" Window: {} → {}", fmt_us(w_start), fmt_us(w_end));
    println!(" Busy time: {} ({util:.1}% of window)", fmt_us(busy));
    println!(" Launches: {}", launches_in_window);

    let mut ranked: Vec<_> = agg.into_iter().collect();
    ranked.sort_by(|a, b| (b.1).1.total_cmp(&(a.1).1));
    println!("\n Kernel Launches Time in window % busy");
    println!(" ────────────────────────────────────────── ──────── ─────────────── ───────");
    for (name, (cnt, t)) in ranked.iter().take(15) {
        let pct = if busy > 0.0 { t / busy * 100.0 } else { 0.0 };
        println!(" {:<42} {:>8} {:>15} {:>6.1}%",
            trunc(name, 42), cnt, fmt_us(*t), pct);
    }
}
/// `launches <kernel_pattern> [limit]`: lists individual launches of one
/// kernel in start order, with the gap to the previous listed launch.
///
/// The pattern is matched as a substring; when several kernels match, the one
/// with the most launches is used. At most `limit` rows (default 50) are
/// shown, followed by mean/min/max of the non-negative gaps. All output goes
/// to stdout.
pub fn cmd_launches(db: &GpuDb, args: &[&str]) {
    // One listed launch; grid/block hold the x, y, z dimensions.
    struct LaunchRow {
        start: f64,
        dur: f64,
        grid: [Option<u32>; 3],
        block: [Option<u32>; 3],
        stream: Option<u32>,
    }

    // Renders "(x,y,z)", or a dash when any dimension is missing.
    fn fmt_dims(dims: &[Option<u32>; 3]) -> String {
        match dims {
            [Some(x), Some(y), Some(z)] => format!("({x},{y},{z})"),
            _ => "—".into(),
        }
    }

    let Some(&pattern) = args.first() else {
        println!("usage: launches <kernel_pattern> [limit]");
        return;
    };
    let limit: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(50);
    let tl = db.timeline_filter();

    // Resolve the pattern to the matching kernel that has the most launches.
    let resolve_sql = format!(
        "SELECT kernel_name, COUNT(*) FROM launches\n\
         WHERE kernel_name LIKE ?1 ESCAPE '\\' AND {tl}\n\
         GROUP BY kernel_name ORDER BY COUNT(*) DESC LIMIT 1"
    );
    let resolved = db.conn.query_row(&resolve_sql, [like_param(pattern)], |row| {
        Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
    });
    let Ok((name, cnt)) = resolved else {
        println!("no kernel matching '{pattern}'");
        return;
    };

    let sql = format!(
        "SELECT start_us, duration_us, grid_x, grid_y, grid_z,\n\
         block_x, block_y, block_z, stream_id\n\
         FROM launches\n\
         WHERE kernel_name = ?1 AND start_us IS NOT NULL AND {tl}\n\
         ORDER BY start_us LIMIT ?2"
    );
    let rows: Vec<LaunchRow> = db.query_vec(&sql, rusqlite::params![name, limit as i64], |row| {
        Ok(LaunchRow {
            start: row.get(0)?,
            dur: row.get(1)?,
            grid: [row.get(2)?, row.get(3)?, row.get(4)?],
            block: [row.get(5)?, row.get(6)?, row.get(7)?],
            stream: row.get(8)?,
        })
    });

    println!(" Launches of {} ({} total, showing {})\n",
        trunc(&name, 70), cnt, rows.len());
    println!(" # Start Duration Gap Stream Grid Block");
    println!(" ──── ───────────── ─────────── ────────── ─────── ─────────────── ───────────────");

    // Non-negative gaps are collected while printing, for the summary below.
    let mut gaps: Vec<f64> = Vec::new();
    let mut prev_end: Option<f64> = None;
    for (i, launch) in rows.iter().enumerate() {
        let gap_s = match prev_end.map(|e| launch.start - e) {
            None => "—".to_string(),
            Some(g) if g >= 0.0 => {
                gaps.push(g);
                fmt_us(g)
            }
            // Overlap with the previous launch is shown as a negative gap.
            Some(g) => format!("-{}", fmt_us(-g)),
        };
        let sid_s = match launch.stream {
            Some(s) => s.to_string(),
            None => "?".into(),
        };
        println!(" {:<4} {:>13} {:>11} {:>10} {:>7} {:<15} {:<15}",
            i + 1, fmt_us(launch.start), fmt_us(launch.dur), gap_s, sid_s,
            trunc(&fmt_dims(&launch.grid), 15), trunc(&fmt_dims(&launch.block), 15));
        prev_end = Some(launch.start + launch.dur);
    }

    if !gaps.is_empty() {
        let mean = gaps.iter().sum::<f64>() / gaps.len() as f64;
        let min = gaps.iter().copied().fold(f64::INFINITY, f64::min);
        let max = gaps.iter().copied().fold(0.0_f64, f64::max);
        println!("\n Gap between consecutive launches: mean {}, min {}, max {}",
            fmt_us(mean), fmt_us(min), fmt_us(max));
    }
}
/// `compare <kernel_a> <kernel_b>`: side-by-side timing statistics for two
/// kernels, plus their rows from `metrics` when either has one.
///
/// Each pattern is matched as a substring; when several kernels match, the
/// one with the largest total duration is used. All output goes to stdout.
pub fn cmd_compare(db: &GpuDb, args: &[&str]) {
    // Aggregate timing of one kernel; `variance` is AVG(d*d) - AVG(d)^2.
    struct KernelStats {
        name: String,
        launches: i64,
        mean: f64,
        min: f64,
        max: f64,
        total: f64,
        variance: f64,
    }

    // Hardware metrics of one kernel; all fields absent when it has no row.
    #[derive(Default)]
    struct HwMetrics {
        occupancy: Option<f64>,
        compute: Option<f64>,
        memory: Option<f64>,
        bandwidth: Option<f64>,
        boundedness: Option<String>,
    }

    if args.len() < 2 {
        println!("usage: compare <kernel_a> <kernel_b>");
        return;
    }
    let tl = db.timeline_filter();

    let stats_sql = format!(
        "SELECT kernel_name,\n\
         COUNT(*),\n\
         AVG(duration_us),\n\
         MIN(duration_us),\n\
         MAX(duration_us),\n\
         SUM(duration_us),\n\
         AVG(duration_us * duration_us) - AVG(duration_us) * AVG(duration_us)\n\
         FROM launches\n\
         WHERE kernel_name LIKE ?1 ESCAPE '\\' AND {tl}\n\
         GROUP BY kernel_name\n\
         ORDER BY SUM(duration_us) DESC LIMIT 1"
    );
    let resolve = |pattern: &str| -> Option<KernelStats> {
        db.conn
            .query_row(&stats_sql, [like_param(pattern)], |row| {
                Ok(KernelStats {
                    name: row.get(0)?,
                    launches: row.get(1)?,
                    mean: row.get(2)?,
                    min: row.get(3)?,
                    max: row.get(4)?,
                    total: row.get(5)?,
                    variance: row.get(6)?,
                })
            })
            .ok()
    };

    let Some(a) = resolve(args[0]) else {
        println!("no kernel matching '{}'", args[0]);
        return;
    };
    let Some(b) = resolve(args[1]) else {
        println!("no kernel matching '{}'", args[1]);
        return;
    };
    if a.name == b.name {
        println!("both patterns resolved to the same kernel: {}", a.name);
        return;
    }

    let metrics_of = |name: &str| -> Option<HwMetrics> {
        db.conn
            .query_row(
                "SELECT occupancy_pct, compute_throughput_pct, memory_throughput_pct,\n\
                 achieved_bandwidth_gb_s, boundedness\n\
                 FROM metrics WHERE kernel_name = ?1",
                [name],
                |row| {
                    Ok(HwMetrics {
                        occupancy: row.get(0)?,
                        compute: row.get(1)?,
                        memory: row.get(2)?,
                        bandwidth: row.get(3)?,
                        boundedness: row.get(4)?,
                    })
                },
            )
            .ok()
    };
    let ma = metrics_of(&a.name);
    let mb = metrics_of(&b.name);

    // The SQL variance can dip slightly below zero from rounding; clamp first.
    let stddev = |var: f64| var.max(0.0).sqrt();
    let cv = |mean: f64, var: f64| if mean > 0.0 { stddev(var) / mean } else { 0.0 };
    // B/A, defined only for a positive denominator.
    let ratio = |num: f64, den: f64| (den > 0.0).then(|| num / den);

    println!(" Kernel A: {}", trunc(&a.name, 70));
    println!(" Kernel B: {}\n", trunc(&b.name, 70));
    println!(" Metric A B Ratio B/A");
    println!(" ──────────────── ──────────────── ──────────────── ──────────");
    let print_row = |label: &str, va: String, vb: String, ratio: Option<f64>| {
        let r = match ratio {
            Some(x) => format!("{x:.2}x"),
            None => "—".to_string(),
        };
        println!(" {:<16} {:<16} {:<16} {r}", label, va, vb);
    };
    print_row("Launches", a.launches.to_string(), b.launches.to_string(),
        (a.launches > 0).then(|| b.launches as f64 / a.launches as f64));
    print_row("Total time", fmt_us(a.total), fmt_us(b.total), ratio(b.total, a.total));
    print_row("Mean", fmt_us(a.mean), fmt_us(b.mean), ratio(b.mean, a.mean));
    print_row("Min", fmt_us(a.min), fmt_us(b.min), None);
    print_row("Max", fmt_us(a.max), fmt_us(b.max), None);
    print_row("Stddev", fmt_us(stddev(a.variance)), fmt_us(stddev(b.variance)), None);
    print_row("CV",
        format!("{:.3}", cv(a.mean, a.variance)),
        format!("{:.3}", cv(b.mean, b.variance)), None);

    if ma.is_none() && mb.is_none() {
        return;
    }
    println!("\n Hardware metrics (ncu):");
    let pct = |v: Option<f64>| v.map_or_else(|| "?".to_string(), |x| format!("{x:.1}%"));
    let bw = |v: Option<f64>| v.map_or_else(|| "?".to_string(), |x| format!("{x:.1} GB/s"));
    let text = |v: Option<String>| v.unwrap_or_else(|| "?".into());
    let ha = ma.unwrap_or_default();
    let hb = mb.unwrap_or_default();
    println!(" Occupancy {:<16} {:<16}", pct(ha.occupancy), pct(hb.occupancy));
    println!(" Compute tput {:<16} {:<16}", pct(ha.compute), pct(hb.compute));
    println!(" Memory tput {:<16} {:<16}", pct(ha.memory), pct(hb.memory));
    println!(" Bandwidth {:<16} {:<16}", bw(ha.bandwidth), bw(hb.bandwidth));
    println!(" Boundedness {:<16} {:<16}", text(ha.boundedness), text(hb.boundedness));
}
/// `regressions <saved_session> [pct=5] [min_us=10]`: compares per-kernel
/// total time of the current session against a saved one.
///
/// `saved_session` is used as a path when it ends in `.gpu.db` or contains a
/// `/`; otherwise it is looked up as `<name>.gpu.db` in the session
/// directory. The saved database is attached as `other` for the comparison
/// and detached again at the end. A kernel is listed as slower/faster only
/// when its total changed by at least `min_us` microseconds AND at least
/// `pct` percent; kernels present in only one run are listed separately.
/// All output goes to stdout.
pub fn cmd_regressions(db: &GpuDb, args: &[&str]) {
    // Per-kernel totals before (saved run) and after (current run).
    struct Change {
        name: String,
        before: f64,
        after: f64,
        delta_us: f64,
        delta_pct: f64,
        before_cnt: i64,
        after_cnt: i64,
    }

    // Prints one table of changes (at most 15 rows); nothing when empty.
    fn print_changes(label: &str, changes: &[Change]) {
        if changes.is_empty() { return; }
        println!(" {label} ({})", changes.len());
        println!(" Kernel Before After Delta % Launches");
        println!(" ────────────────────────────────────────── ─────────── ─────────── ──────────── ──────── ─────────");
        for c in changes.iter().take(15) {
            let launches = if c.before_cnt == c.after_cnt {
                format!("{}", c.after_cnt)
            } else {
                format!("{}→{}", c.before_cnt, c.after_cnt)
            };
            // `+` inside the spec keeps the sign next to the number and gives
            // positive and negative rows the same width.
            println!(" {:<42} {:>11} {:>11} {:>11} {:>+7.1}% {:>9}",
                trunc(&c.name, 42), fmt_us(c.before), fmt_us(c.after),
                fmt_us(c.delta_us.abs()), c.delta_pct, launches);
        }
        println!();
    }

    let name = match args.first() {
        Some(n) => *n,
        None => { println!("usage: regressions <saved_session> [pct=5] [min_us=10]"); return; }
    };
    let pct_thresh: f64 = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(5.0);
    let abs_thresh_us: f64 = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(10.0);

    let other_path = if name.ends_with(".gpu.db") || name.contains('/') {
        PathBuf::from(name)
    } else {
        GpuDb::session_dir().join(format!("{name}.gpu.db"))
    };
    if !other_path.exists() {
        println!("cannot load '{name}': no such session at {}", other_path.display());
        return;
    }
    if let Err(e) = db.attach(other_path.to_str().unwrap_or(""), "other") {
        println!("cannot load '{name}': {e}");
        return;
    }

    // c = current run, o = saved ("other") run; a kernel missing on one side
    // shows up with total and count 0 on that side.
    let sql = "SELECT COALESCE(c.kernel_name, o.kernel_name),\n\
               COALESCE(o.total, 0), COALESCE(c.total, 0),\n\
               COALESCE(o.cnt, 0), COALESCE(c.cnt, 0)\n\
               FROM\n\
               (SELECT kernel_name, SUM(duration_us) AS total, COUNT(*) AS cnt\n\
               FROM launches GROUP BY kernel_name) c\n\
               FULL OUTER JOIN\n\
               (SELECT kernel_name, SUM(duration_us) AS total, COUNT(*) AS cnt\n\
               FROM other.launches GROUP BY kernel_name) o\n\
               ON c.kernel_name = o.kernel_name";
    let all: Vec<(String, f64, f64, i64, i64)> = db.query_vec(sql, [], |row| {
        Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?))
    });

    let mut regressions: Vec<Change> = Vec::new();
    let mut improvements: Vec<Change> = Vec::new();
    let mut new_kernels: Vec<(String, f64, i64)> = Vec::new();
    let mut gone_kernels: Vec<(String, f64, i64)> = Vec::new();
    for (kname, before, after, bc, ac) in all {
        if before <= 0.0 && after > 0.0 {
            new_kernels.push((kname, after, ac));
            continue;
        }
        if after <= 0.0 && before > 0.0 {
            gone_kernels.push((kname, before, bc));
            continue;
        }
        // Both thresholds must be met for a change to be reported.
        let delta = after - before;
        if delta.abs() < abs_thresh_us { continue; }
        let pct = if before > 0.0 { delta / before * 100.0 } else { 0.0 };
        if pct.abs() < pct_thresh { continue; }
        let ch = Change {
            name: kname, before, after,
            delta_us: delta, delta_pct: pct,
            before_cnt: bc, after_cnt: ac,
        };
        if delta > 0.0 { regressions.push(ch); } else { improvements.push(ch); }
    }
    // Largest slowdown first / largest speedup first.
    regressions.sort_by(|a, b| b.delta_us.total_cmp(&a.delta_us));
    improvements.sort_by(|a, b| a.delta_us.total_cmp(&b.delta_us));

    println!(" Regressions vs {name} (threshold: ≥{pct_thresh}% AND ≥{abs_thresh_us}us)\n");
    print_changes("SLOWER", &regressions);
    print_changes("FASTER", &improvements);

    if !new_kernels.is_empty() {
        println!(" NEW kernels in current run ({}):", new_kernels.len());
        new_kernels.sort_by(|a, b| b.1.total_cmp(&a.1));
        for (n, t, c) in new_kernels.iter().take(10) {
            println!(" + {:<50} {} ({} launches)", trunc(n, 50), fmt_us(*t), c);
        }
        println!();
    }
    if !gone_kernels.is_empty() {
        println!(" GONE from current run ({}):", gone_kernels.len());
        gone_kernels.sort_by(|a, b| b.1.total_cmp(&a.1));
        for (n, t, c) in gone_kernels.iter().take(10) {
            println!(" - {:<50} {} ({} launches)", trunc(n, 50), fmt_us(*t), c);
        }
        println!();
    }

    let net_delta: f64 = regressions.iter().map(|c| c.delta_us).sum::<f64>()
        + improvements.iter().map(|c| c.delta_us).sum::<f64>();
    let sign = if net_delta >= 0.0 { "+" } else { "-" };
    println!(" Net change on filtered kernels: {sign}{} ({} regressions, {} improvements)",
        fmt_us(net_delta.abs()), regressions.len(), improvements.len());

    let _ = db.detach("other");
}