#[cfg(feature = "inference")]
/// Profiles GPU-resident token generation for a model on disk.
///
/// Loads the model (GGUF directly; APR via its mapper; SafeTensors through a
/// one-time Q4K conversion to a temp APR file), moves it onto CUDA device 0,
/// runs `warmup_passes` untimed generations, then `measure_passes` timed
/// generations of `tokens_per_pass` tokens each, and finally a separate
/// per-kernel ("brick") profiling pass. Returns aggregated latency/throughput
/// stats plus per-kernel hotspots, or a `CliError` on load/init failure.
fn profile_gpu_generation(
path: &Path,
tokens_per_pass: usize,
warmup_passes: usize,
measure_passes: usize,
) -> Result<RealProfileResults, CliError> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, QuantizedGenerateConfig};
let format = detect_format(path);
println!(
"{}",
format!("Loading {format} model for GPU generation profiling...").dimmed()
);
// Resolve (quantized model, architecture name) per container format.
let (model, architecture) = match format {
"gguf" => {
let mapped = MappedGGUFModel::from_path(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load GGUF: {e}")))?;
let arch = mapped.model.architecture().unwrap_or("unknown").to_string();
let m = OwnedQuantizedModel::from_mapped(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
(m, arch)
}
"apr" => {
let mapped = realizar::apr::MappedAprModel::from_path(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load APR: {e}")))?;
let arch = mapped.metadata.architecture.clone().unwrap_or_else(|| "unknown".to_string());
let m = OwnedQuantizedModel::from_apr(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model from APR: {e}")))?;
(m, arch)
}
"safetensors" => {
// SafeTensors has no quantized representation we can map directly:
// import it once as a Q4K APR file in the temp dir, load that, then
// delete the temp file (best-effort) once the model is in memory.
let tmp_apr = std::env::temp_dir().join("profile-safetensors-q4k.apr");
let import_opts = aprender::format::ImportOptions {
quantize: Some(aprender::format::QuantizationType::Q4K),
..aprender::format::ImportOptions::default()
};
println!("{}", "Converting SafeTensors → Q4K (one-time)...".dimmed());
aprender::format::apr_import(&path.display().to_string(), &tmp_apr, import_opts)
.map_err(|e| CliError::ValidationFailed(format!("SafeTensors→Q4K failed: {e}")))?;
let mapped = realizar::apr::MappedAprModel::from_path(&tmp_apr)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load temp APR: {e}")))?;
let arch = mapped.metadata.architecture.clone().unwrap_or_else(|| "unknown".to_string());
let m = OwnedQuantizedModel::from_apr(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
// Best-effort cleanup; profiling proceeds even if the unlink fails.
let _ = std::fs::remove_file(&tmp_apr);
(m, arch)
}
_ => {
return Err(CliError::ValidationFailed(format!(
"GPU profiling unsupported for format '{format}' (expected gguf, apr, or safetensors)"
)));
}
};
// Capture config dimensions now — `model` is moved into the CUDA wrapper below.
let num_layers = model.config().num_layers;
let vocab_size = model.config().vocab_size;
let hidden_dim = model.config().hidden_dim;
// Device ordinal 0 is hard-coded; init failure is surfaced as a CLI error.
let mut cuda_model = match realizar::gguf::OwnedQuantizedModelCuda::new(model, 0) {
Ok(m) => m,
Err(e) => {
return Err(CliError::ValidationFailed(format!("CUDA init failed: {e}")));
}
};
// Fixed prompt token ids — presumably a short English phrase in the model's
// vocabulary; TODO confirm they are valid ids for all profiled tokenizers.
let test_tokens: Vec<u32> = vec![791, 7438, 315, 2324, 374];
// Greedy decoding (temperature 0, top-1) so every pass does identical work.
let gen_config = QuantizedGenerateConfig {
max_tokens: tokens_per_pass,
temperature: 0.0, top_k: 1,
stop_tokens: vec![],
trace: false,
..Default::default()
};
println!(
"{}",
format!(
"GPU warmup: {} passes x {} tokens...",
warmup_passes, tokens_per_pass
)
.dimmed()
);
// Warmup passes are untimed; failures are warnings, not fatal.
for i in 0..warmup_passes {
if let Err(e) = cuda_model.generate_gpu_resident(&test_tokens, &gen_config) {
eprintln!("Warning: GPU warmup pass {i} failed: {e}");
}
}
println!(
"{}",
format!(
"GPU measurement: {} passes x {} tokens...",
measure_passes, tokens_per_pass
)
.dimmed()
);
let mut per_pass_decode_times: Vec<f64> = Vec::new(); let mut per_pass_prefill_times: Vec<f64> = Vec::new(); let mut per_pass_total_times: Vec<f64> = Vec::new(); let mut total_tokens_generated: usize = 0;
for pass in 0..measure_passes {
let total_start = Instant::now();
// Prefill is approximated by timing a 1-token generation over the full
// prompt; decode time is then the full run minus this prefill estimate.
let prefill_start = Instant::now();
let prefill_config = QuantizedGenerateConfig {
max_tokens: 1,
temperature: 0.0,
top_k: 1,
stop_tokens: vec![],
trace: false,
..Default::default()
};
if let Err(e) = cuda_model.generate_gpu_resident(&test_tokens, &prefill_config) {
eprintln!("Warning: prefill pass {pass} failed: {e}");
}
let prefill_ms = prefill_start.elapsed().as_secs_f64() * 1000.0;
per_pass_prefill_times.push(prefill_ms);
let gen_start = Instant::now();
let result = cuda_model.generate_gpu_resident(&test_tokens, &gen_config);
let gen_ms = gen_start.elapsed().as_secs_f64() * 1000.0;
let total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
per_pass_total_times.push(total_ms);
// Only successful passes contribute decode samples and token counts.
if let Ok(ref tokens) = result {
// Generated count excludes the echoed prompt tokens.
let generated = tokens.len().saturating_sub(test_tokens.len());
total_tokens_generated += generated;
// Clamp to 0.1ms so later tok/s divisions never hit zero.
let decode_ms = (gen_ms - prefill_ms).max(0.1);
per_pass_decode_times.push(decode_ms);
if pass == 0 {
println!(
"{}",
format!(
" Pass 0: {} tokens in {:.1}ms (prefill: {:.1}ms, decode: {:.1}ms = {:.1} tok/s)",
generated,
gen_ms,
prefill_ms,
decode_ms,
generated as f64 / (decode_ms / 1000.0)
)
.dimmed()
);
}
}
}
let stats = compute_profile_stats(
&per_pass_decode_times,
&per_pass_prefill_times,
&per_pass_total_times,
total_tokens_generated,
measure_passes,
test_tokens.len(),
);
// Separate per-kernel profiling pass (runs with CUDA graphs disabled).
let hotspots = run_brick_profiler_pass(
&mut cuda_model, &test_tokens, num_layers, hidden_dim, vocab_size,
);
let category_summary = Some(compute_category_summary(&hotspots));
let total_decode_us = stats.avg_decode_ms * 1000.0;
let (launch_overhead_us, launch_overhead_pct) =
compute_kernel_launch_overhead(&hotspots, total_decode_us);
let mut results = RealProfileResults {
model_path: path.display().to_string(),
architecture,
num_layers,
vocab_size,
hidden_dim,
warmup_passes,
measure_passes,
total_inference_us: stats.avg_total_ms * 1000.0,
throughput_tok_s: stats.decode_tok_s,
tokens_per_pass: stats.tokens_per_decode,
hotspots,
per_layer_us: vec![],
is_real_data: true,
roofline: None,
category_summary,
backend: "cuda".to_string(),
latency_p50_ms: stats.p50,
latency_p95_ms: stats.p95,
latency_p99_ms: stats.p99,
latency_min_ms: stats.lat_min,
latency_max_ms: stats.lat_max,
prefill_tok_s: stats.prefill_tok_s,
decode_tok_s: stats.decode_tok_s,
total_tokens_generated,
kernel_launch_overhead_pct: launch_overhead_pct,
kernel_launch_overhead_us: launch_overhead_us,
};
// Roofline needs the assembled results, so it is filled in after construction.
results.roofline = Some(compute_roofline(&results));
// Unset the env var enabled during the brick-profiler pass so a later
// caller in this process gets CUDA graphs back.
std::env::remove_var("SKIP_CUDA_GRAPH");
Ok(results)
}
#[cfg(feature = "inference")]
/// Aggregated latency/throughput statistics over the measured decode passes.
/// All `_ms` fields are milliseconds; `_tok_s` fields are tokens per second.
struct ProfileStats {
// Median per-pass decode latency.
p50: f64,
// 95th-percentile per-pass decode latency.
p95: f64,
// 99th-percentile per-pass decode latency.
p99: f64,
// Fastest observed decode pass.
lat_min: f64,
// Slowest observed decode pass.
lat_max: f64,
// Mean decode time per pass.
avg_decode_ms: f64,
// Average number of tokens generated per measured pass.
tokens_per_decode: usize,
// Decode throughput derived from avg_decode_ms and tokens_per_decode.
decode_tok_s: f64,
// Prefill throughput derived from prompt length and mean prefill time.
prefill_tok_s: f64,
// Mean wall time per pass (prefill + decode).
avg_total_ms: f64,
}
#[cfg(feature = "inference")]
/// Aggregates raw per-pass timings into a `ProfileStats` summary.
///
/// Percentiles use the nearest-rank (truncating-index) method over the sorted
/// decode samples. Empty sample sets and zero averages yield 0 rather than
/// NaN/inf, and `tokens_per_decode` is an integer division (0 when there were
/// no measurement passes).
fn compute_profile_stats(
    decode_times: &[f64],
    prefill_times: &[f64],
    total_times: &[f64],
    total_tokens: usize,
    measure_passes: usize,
    prompt_len: usize,
) -> ProfileStats {
    // Arithmetic mean; defined as 0.0 for an empty sample.
    let mean = |xs: &[f64]| -> f64 {
        if xs.is_empty() {
            0.0
        } else {
            xs.iter().sum::<f64>() / xs.len() as f64
        }
    };
    // Sorted copy of the decode latencies for percentile/min/max extraction.
    // NaNs compare as Equal, matching a total-order fallback.
    let mut ranked = decode_times.to_vec();
    ranked.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    // Nearest-rank percentile: truncate the fractional index.
    let percentile = |pct: f64| -> f64 {
        match ranked.len() {
            0 => 0.0,
            n => {
                let idx = ((pct / 100.0) * (n - 1) as f64) as usize;
                ranked[idx.min(n - 1)]
            }
        }
    };
    let avg_decode_ms = mean(decode_times);
    let avg_prefill_ms = mean(prefill_times);
    let avg_total_ms = mean(total_times);
    // Average tokens generated per measured pass (integer division).
    let tokens_per_decode = if measure_passes > 0 {
        total_tokens / measure_passes
    } else {
        0
    };
    // Throughputs guard against division by a zero average.
    let decode_tok_s = if avg_decode_ms > 0.0 {
        tokens_per_decode as f64 / (avg_decode_ms / 1000.0)
    } else {
        0.0
    };
    let prefill_tok_s = if avg_prefill_ms > 0.0 {
        prompt_len as f64 / (avg_prefill_ms / 1000.0)
    } else {
        0.0
    };
    ProfileStats {
        p50: percentile(50.0),
        p95: percentile(95.0),
        p99: percentile(99.0),
        lat_min: ranked.first().copied().unwrap_or(0.0),
        lat_max: ranked.last().copied().unwrap_or(0.0),
        avg_decode_ms,
        tokens_per_decode,
        decode_tok_s,
        prefill_tok_s,
        avg_total_ms,
    }
}
#[cfg(feature = "inference")]
/// Runs one short profiled generation with CUDA graph capture disabled so
/// each kernel ("brick") is timed individually, then returns the extracted
/// hotspot list sorted by total time.
///
/// Fix: this function sets the process-wide `SKIP_CUDA_GRAPH` env var but
/// previously relied on the caller to unset it afterwards; it now restores
/// the environment itself before returning, so process state is not leaked
/// if a caller forgets the cleanup.
fn run_brick_profiler_pass(
cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
test_tokens: &[u32],
num_layers: usize,
hidden_dim: usize,
vocab_size: usize,
) -> Vec<Hotspot> {
use realizar::gguf::QuantizedGenerateConfig;
println!(
"{}",
"Per-operation profiling pass (no CUDA graph)...".dimmed()
);
// Disable graph capture so the profiler sees individual kernel launches.
std::env::set_var("SKIP_CUDA_GRAPH", "1");
cuda_model.clear_decode_graph();
cuda_model.enable_profiling();
cuda_model.reset_profiler();
// Greedy decode of a small fixed budget — enough iterations through every
// layer to populate per-brick timing stats.
let profile_config = QuantizedGenerateConfig {
max_tokens: 16,
temperature: 0.0,
top_k: 1,
stop_tokens: vec![],
trace: false,
..Default::default()
};
// A failed pass is non-fatal: it simply yields empty profiler stats.
let _ = cuda_model.generate_gpu_resident(test_tokens, &profile_config);
let hotspots = extract_gpu_hotspots(cuda_model, num_layers, hidden_dim, vocab_size);
// Restore the environment so later passes can use CUDA graphs again.
std::env::remove_var("SKIP_CUDA_GRAPH");
hotspots
}
#[cfg(feature = "inference")]
/// Rough per-call data-traffic estimate (bytes) for a named kernel, or
/// `None` when the name cannot be classified into a known operation.
fn estimate_kernel_data_bytes(name: &str, hidden_dim: usize, vocab_size: usize) -> Option<u64> {
    // Classification is case-insensitive, so lower-case the name first.
    let lowered = name.to_lowercase();
    compute_kernel_bytes(classify_kernel_op(&lowered), hidden_dim, vocab_size)
}
#[cfg(feature = "inference")]
/// Coarse classification of a GPU kernel by transformer operation, used to
/// estimate per-call data traffic in `compute_kernel_bytes`.
#[derive(Clone, Copy)]
enum KernelOp {
// Query/key/value projection matmul.
QkvProj,
// Attention output projection matmul.
OutProj,
// FFN gate/up projection matmul.
FfnGateUp,
// FFN down projection matmul.
FfnDown,
// Final vocabulary projection (logits).
LmHead,
// RMSNorm / LayerNorm.
Norm,
// Rotary position embedding.
Rope,
// Attention score / softmax kernels.
Attention,
// Token embedding lookup.
Embed,
// Unclassified kernel — no traffic estimate is produced.
Unknown,
}
#[cfg(feature = "inference")]
/// Maps a (lower-cased) kernel name onto a coarse `KernelOp` by substring
/// matching. Rules are checked in declaration order; the first rule with any
/// matching pattern wins, and unmatched names fall back to `Unknown`.
fn classify_kernel_op(name: &str) -> KernelOp {
    const RULES: &[(&[&str], KernelOp)] = &[
        (&["q_proj", "k_proj", "v_proj"], KernelOp::QkvProj),
        (&["o_proj", "out_proj"], KernelOp::OutProj),
        (&["gate_proj", "up_proj"], KernelOp::FfnGateUp),
        (&["down_proj"], KernelOp::FfnDown),
        (&["lm_head", "output"], KernelOp::LmHead),
        (&["rmsnorm", "layernorm"], KernelOp::Norm),
        (&["rope", "rotary"], KernelOp::Rope),
        (&["softmax", "attention"], KernelOp::Attention),
        (&["embed"], KernelOp::Embed),
    ];
    RULES
        .iter()
        .find(|(patterns, _)| patterns.iter().any(|p| name.contains(p)))
        .map_or(KernelOp::Unknown, |&(_, op)| op)
}
#[cfg(feature = "inference")]
/// Estimates bytes moved per kernel invocation for a classified operation,
/// assuming Q4_K-quantized weights and f32 activations. Returns `None` for
/// `KernelOp::Unknown`.
fn compute_kernel_bytes(op: KernelOp, hidden_dim: usize, vocab_size: usize) -> Option<u64> {
    // Q4_K packs a 256-value super-block into 144 bytes.
    let q4k_bpe: f64 = 144.0 / 256.0;
    // Activation traffic: one f32 hidden vector read plus one written.
    let act_rw = (hidden_dim * 8) as u64;
    let h = hidden_dim as f64;
    // assumes the FFN intermediate size is 4*hidden_dim — TODO confirm per arch
    let ffn = (hidden_dim * 4) as f64;
    match op {
        // Square hidden-to-hidden weight matrix plus activation I/O.
        KernelOp::QkvProj | KernelOp::OutProj => Some((h * h * q4k_bpe) as u64 + act_rw),
        KernelOp::FfnGateUp => Some((h * ffn * q4k_bpe) as u64 + act_rw),
        KernelOp::FfnDown => Some((ffn * h * q4k_bpe) as u64 + act_rw),
        // LM head reads the full vocab projection and writes f32 logits.
        KernelOp::LmHead => {
            Some((h * vocab_size as f64 * q4k_bpe) as u64 + (vocab_size * 4 + hidden_dim * 4) as u64)
        }
        // Norms also read their f32 scale weights.
        KernelOp::Norm => Some(act_rw + (hidden_dim * 4) as u64),
        KernelOp::Rope => Some(act_rw),
        KernelOp::Attention => Some(act_rw * 2),
        KernelOp::Embed => Some((hidden_dim * 4) as u64),
        KernelOp::Unknown => None,
    }
}
#[cfg(feature = "inference")]
/// Converts the CUDA model's accumulated brick-profiler stats into a list of
/// `Hotspot`s sorted by total time (descending). Percentages are relative to
/// the profiler's total time; bandwidth/efficiency are derived from the
/// heuristic per-call byte estimates and are `None` for unclassified kernels.
fn extract_gpu_hotspots(
cuda_model: &realizar::gguf::OwnedQuantizedModelCuda,
_num_layers: usize,
hidden_dim: usize,
vocab_size: usize,
) -> Vec<Hotspot> {
let profiler = cuda_model.profiler();
let total_ns = profiler.total_ns();
let mut hotspots: Vec<Hotspot> = profiler
.all_brick_stats()
.map(|stats| {
let total_us = stats.total_ns as f64 / 1000.0;
// Share of total profiled time; 0 when nothing was recorded.
let pct = if total_ns > 0 {
100.0 * stats.total_ns as f64 / total_ns as f64
} else {
0.0
};
let avg_us = if stats.count > 0 {
total_us / stats.count as f64
} else {
0.0
};
let data_bytes = estimate_kernel_data_bytes(&stats.name, hidden_dim, vocab_size);
// bytes / (us * 1000) = bytes per nanosecond = GB/s.
let bandwidth = data_bytes.and_then(|bytes| {
if avg_us > 0.0 {
Some(bytes as f64 / (avg_us * 1000.0))
} else {
None
}
});
Hotspot {
name: stats.name.clone(),
time_us: total_us,
percent: pct,
count: stats.count as usize,
avg_us,
min_us: stats.min_us(),
max_us: stats.max_us(),
bottleneck: Some(classify_operation_bottleneck(&stats.name)),
// Efficiency vs a 1008 GB/s peak — presumably the target GPU's
// memory bandwidth (e.g. RTX 4090); TODO confirm, capped at 100%.
efficiency_pct: bandwidth.map(|bw| (bw / 1008.0 * 100.0).min(100.0)), category: Some(classify_operation_category(&stats.name)),
bandwidth_gbs: bandwidth,
data_bytes_per_call: data_bytes,
}
})
.collect();
// Sort hottest-first; NaN times compare as Equal.
hotspots.sort_by(|a, b| {
b.time_us
.partial_cmp(&a.time_us)
.unwrap_or(std::cmp::Ordering::Equal)
});
hotspots
}
#[cfg(feature = "inference")]
/// Attributes the gap between measured decode wall time and the summed
/// profiled kernel time to launch/dispatch overhead.
///
/// Returns `(overhead_us, overhead_pct)`. The gap is clamped at zero (kernels
/// can legitimately sum past the wall time when estimates overlap), and the
/// percentage is zero when `total_decode_us` is not positive.
fn compute_kernel_launch_overhead(hotspots: &[Hotspot], total_decode_us: f64) -> (f64, f64) {
    let kernel_time_us: f64 = hotspots.iter().map(|h| h.time_us).sum();
    let gap_us = (total_decode_us - kernel_time_us).max(0.0);
    if total_decode_us > 0.0 {
        (gap_us, gap_us / total_decode_us * 100.0)
    } else {
        (gap_us, 0.0)
    }
}