/// Run the headless benchmark against simulated pipeline data.
///
/// Drives `PipelineState::update_demo` for the configured warmup and
/// measurement iterations, then emits the report as JSON (to stdout or
/// `config.output`) or as plain text. Returns an error when `config.ci`
/// is set and the CI thresholds are not met.
#[allow(clippy::needless_pass_by_value)]
fn run_headless_simulated(config: CbtopConfig) -> Result<()> {
    let model_name = config.model.as_deref().unwrap_or("qwen2.5-coder-1.5b");
    eprintln!("cbtop: Running headless benchmark (SIMULATED)...");
    eprintln!(" Model: {model_name}");
    eprintln!(" Warmup: {} iterations", config.warmup);
    eprintln!(" Measurement: {} iterations", config.iterations);
    eprintln!();
    eprintln!(" WARNING: Using simulated data. For real profiling, use:");
    eprintln!(" apr cbtop --model-path model.gguf --headless --json # GGUF");
    eprintln!(" apr cbtop --model-path model.safetensors --headless --json # SafeTensors");
    eprintln!(" apr cbtop --model-path model.apr --headless --json # APR");

    let mut pipeline = PipelineState::new();

    // Warmup pass: advance the demo pipeline without keeping the samples.
    (0..config.warmup).for_each(|_| pipeline.update_demo());

    // Discard whatever the warmup recorded before measuring.
    pipeline.bricks.iter_mut().for_each(|brick| {
        brick.samples.clear();
        brick.actual_us = 0.0;
    });

    // Measurement pass.
    (0..config.iterations).for_each(|_| pipeline.update_demo());

    let report = generate_headless_report_simulated(model_name, &pipeline, &config);
    let ci_passed = check_ci_thresholds(&report, &config);

    if config.json {
        let json_output = format_report_as_json(&report);
        match config.output {
            Some(ref path) => {
                std::fs::write(path, &json_output).map_err(|e| {
                    CliError::ValidationFailed(format!("Failed to write output file: {e}"))
                })?;
                eprintln!("cbtop: Results written to {}", path.display());
            }
            None => println!("{json_output}"),
        }
    } else {
        print_report_text(&report);
    }

    if config.ci && !ci_passed {
        eprintln!("cbtop: CI thresholds not met!");
        return Err(CliError::ValidationFailed(
            "CI thresholds not met".to_string(),
        ));
    }
    Ok(())
}
/// Profile an APR-format model on CPU via the §12.11 `BrickProfiler`.
///
/// Loads the model from `model_path`, runs `config.warmup` warmup forward
/// passes and `config.iterations` measured passes over a fixed 4-token
/// prompt, then prints a brick-timing summary table to stderr. When
/// `config.json` is set, a compact JSON summary is written to
/// `config.output` (or stdout when no output path is given).
///
/// # Errors
/// Returns `CliError::ValidationFailed` if the model cannot be loaded or
/// the JSON output file cannot be written.
#[cfg(feature = "inference")]
#[allow(clippy::needless_pass_by_value)]
fn run_headless_apr(
    config: CbtopConfig,
    model_path: &std::path::Path,
    model_name: &str,
) -> Result<()> {
    use realizar::apr::AprV2Model;
    use trueno::brick::BrickProfiler;

    eprintln!("cbtop: APR format profiling (CPU, §12.11 BrickProfiler)");
    eprintln!();
    eprintln!("cbtop: Loading APR model...");
    let load_start = Instant::now();
    let model = AprV2Model::load(model_path)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to load APR model: {e}")))?;
    let load_time = load_start.elapsed();
    eprintln!("cbtop: APR model loaded in {:.2}s", load_time.as_secs_f32());

    // Dimensions may be absent from APR metadata; display 0 rather than fail.
    let hidden_dim = model.metadata().hidden_size.unwrap_or(0);
    let num_layers = model.metadata().num_layers.unwrap_or(0);
    let vocab_size = model.metadata().vocab_size.unwrap_or(0);
    eprintln!("cbtop: APR model config:");
    eprintln!(" Hidden: {}", hidden_dim);
    eprintln!(" Layers: {}", num_layers);
    eprintln!(" Vocab: {}", vocab_size);
    eprintln!();

    // Fixed short prompt so every iteration does identical work.
    let prompt_tokens: Vec<u32> = vec![1, 25580, 264, 2566];

    // NOTE(review): `profiler` is never passed to `model.forward`; presumably
    // BrickProfiler records through shared state inside trueno — confirm.
    let mut profiler = BrickProfiler::enabled();
    eprintln!("cbtop: Warmup ({} iterations)...", config.warmup);
    for i in 0..config.warmup {
        let _ = model.forward(&prompt_tokens);
        eprint!("\r Warmup {}/{}", i + 1, config.warmup);
    }
    eprintln!();

    eprintln!("cbtop: Measurement ({} iterations)...", config.iterations);
    let measure_start = Instant::now();
    for i in 0..config.iterations {
        // NOTE(review): resetting every iteration means the summary below
        // reflects only the final iteration's samples — confirm intended.
        profiler.reset();
        let _ = model.forward(&prompt_tokens);
        eprint!("\r Iteration {}/{}", i + 1, config.iterations);
    }
    eprintln!();
    let total_time = measure_start.elapsed();

    // Each forward pass processes the whole prompt once.
    let tokens_generated = config.iterations * prompt_tokens.len();
    let throughput = tokens_generated as f64 / total_time.as_secs_f64();

    eprintln!();
    eprintln!("╔═══════════════════════════════════════════════════════════╗");
    eprintln!("║ APR BRICKPROFILER SUMMARY (§12.11) ║");
    eprintln!("╠═══════════════════════════════════════════════════════════╣");
    eprintln!("║ Model: {:50} ║", model_name);
    eprintln!("║ Format: APR (brick prefix: apr.*) ║");
    eprintln!(
        "║ Throughput: {:8.1} tok/s ║",
        throughput
    );
    eprintln!("╠═══════════════════════════════════════════════════════════╣");
    eprintln!("║ Brick Timing Summary: ║");
    eprintln!(
        "║ {:20} │ {:10} │ {:6} │ {:8} ║",
        "Brick", "Mean µs", "% Tot", "Samples"
    );
    eprintln!("╠═══════════════════════════════════════════════════════════╣");
    #[allow(deprecated)]
    let all_stats = profiler.all_stats();
    // Hottest bricks first, by total accumulated time.
    let mut sorted_stats: Vec<_> = all_stats.iter().collect();
    sorted_stats.sort_by(|a, b| b.1.total_ns.cmp(&a.1.total_ns));
    // max(1) guards the percentage division when nothing was recorded.
    let summary_total = profiler.total_ns().max(1);
    for (name, stat) in sorted_stats.iter().take(12) {
        let mean_us = stat.avg_us();
        let total_ns = stat.total_ns;
        let pct = (total_ns as f64 / summary_total as f64) * 100.0;
        let samples = stat.count;
        eprintln!(
            "║ {:20} │ {:10.2} │ {:5.1}% │ {:8} ║",
            name, mean_us, pct, samples
        );
    }
    eprintln!("╚═══════════════════════════════════════════════════════════╝");

    if config.json {
        // NOTE(review): model_name is interpolated unescaped; a name
        // containing '"' or '\' would produce invalid JSON.
        let json = format!(
            r#"{{"model":"{}","format":"apr","throughput":{:.1},"total_time_ms":{:.1},"iterations":{}}}"#,
            model_name,
            throughput,
            total_time.as_secs_f64() * 1000.0,
            config.iterations
        );
        if let Some(ref output_path) = config.output {
            // Map the I/O error explicitly, matching the error style of the
            // simulated-mode writer elsewhere in this file.
            std::fs::write(output_path, &json).map_err(|e| {
                CliError::ValidationFailed(format!("Failed to write output file: {e}"))
            })?;
            eprintln!("cbtop: JSON output written to {}", output_path.display());
        } else {
            println!("{json}");
        }
    }
    Ok(())
}
/// Run the headless benchmark with real GPU profiling, or dispatch to
/// `run_headless_apr` for APR-format models.
///
/// Requires `config.model_path`; the displayed model name falls back to the
/// file stem, then "unknown". Measures decode throughput, prints the
/// PAR-073 BrickProfiler results, and hands the final figures to
/// `build_and_output_report`.
///
/// # Errors
/// `CliError::ValidationFailed` for a missing or unsupported model path;
/// `CliError::InferenceFailed` when the GGUF model has no tokenizer.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn run_headless_real(config: CbtopConfig) -> Result<()> {
    use realizar::gguf::QuantizedGenerateConfig;
    // CUDA graphs are disabled so each brick can be timed individually
    // (see the PAR-073 messages printed below).
    std::env::set_var("CUDA_GRAPH_DISABLE", "1");
    let model_path = config.model_path.clone().ok_or_else(|| {
        CliError::ValidationFailed("model_path is required for real profiling".to_string())
    })?;
    let format = ModelFormat::from_path(&model_path).ok_or_else(|| {
        CliError::ValidationFailed(format!(
            "Unsupported model format: {}. Supported: .gguf, .safetensors, .apr",
            model_path.display()
        ))
    })?;
    // Display name: explicit --model flag, else file stem, else "unknown".
    let model_name: String = config.model.clone().unwrap_or_else(|| {
        model_path
            .file_stem()
            .and_then(|s| s.to_str())
            .map_or_else(|| "unknown".to_string(), std::string::ToString::to_string)
    });
    eprintln!("cbtop: Running headless benchmark (REAL PROFILING)...");
    eprintln!(" Model: {model_name}");
    eprintln!(" Path: {}", model_path.display());
    eprintln!(
        " Format: {:?} (brick prefix: {}.*)",
        format,
        format.brick_prefix()
    );
    eprintln!(" Warmup: {} iterations", config.warmup);
    eprintln!(" Measurement: {} iterations", config.iterations);
    eprintln!();
    // APR models are profiled on CPU by a dedicated path.
    if format == ModelFormat::Apr {
        return run_headless_apr(config, &model_path, &model_name);
    }
    let (mapped, mut cuda_model) = load_gguf_cuda_for_profiling(&model_path)?;
    // Optional draft model for speculative decoding.
    let mut draft_cuda_model = load_draft_model(&config)?;
    let (hidden_dim, num_heads, num_kv_heads, num_layers, _head_dim, intermediate_dim) =
        extract_model_dims(&mapped);
    eprintln!("cbtop: Model config:");
    eprintln!(" Hidden: {}", hidden_dim);
    eprintln!(" Heads: {} (KV: {})", num_heads, num_kv_heads);
    eprintln!(" FFN: {}", intermediate_dim);
    eprintln!(" Layers: {}", num_layers);
    eprintln!();
    let prompt = "Hello, I am a coding assistant.";
    // A GGUF model without an embedded tokenizer cannot be benchmarked.
    let prompt_tokens: Vec<u32> = mapped.model.encode(prompt).ok_or_else(|| {
        CliError::InferenceFailed(
            "FATAL: GGUF model has no tokenizer - cannot encode prompt for cbtop benchmark"
                .to_string(),
        )
    })?;
    // Greedy decoding (temperature 0, top_k 1) keeps each run deterministic.
    let gen_config = QuantizedGenerateConfig {
        max_tokens: 32,
        temperature: 0.0,
        top_k: 1,
        ..Default::default()
    };
    eprintln!("cbtop: Warmup ({} iterations)...", config.warmup);
    for i in 0..config.warmup {
        let _ = cuda_model.generate_gpu_resident(&prompt_tokens, &gen_config);
        eprint!("\r Warmup {}/{}", i + 1, config.warmup);
    }
    eprintln!();
    // Profiling is enabled only after warmup so steady-state timings are kept.
    cuda_model.enable_profiling();
    cuda_model.reset_profiler();
    eprintln!("cbtop: BrickProfiler enabled (PAR-073, Immediate sync)");
    eprintln!();
    let mode_str = describe_measurement_mode(&config, draft_cuda_model.is_some());
    eprintln!(
        "cbtop: Measuring throughput ({} iterations, {} mode)...",
        config.iterations, mode_str
    );
    // Batch mode when concurrent > 1, otherwise single-stream measurement
    // (optionally speculative via the draft model).
    let (total_tokens, latencies_us) = if config.concurrent > 1 {
        measure_batch_throughput(&config, &mut cuda_model, &prompt_tokens)?
    } else {
        measure_standard_throughput(
            &config,
            &mut cuda_model,
            &mut draft_cuda_model,
            &prompt_tokens,
            &gen_config,
        )?
    };
    eprintln!();
    // Throughput from the summed per-iteration latencies (µs → s), guarded
    // against a zero total time.
    let total_time_us: f64 = latencies_us.iter().sum();
    let total_time_s = total_time_us / 1_000_000.0;
    let tokens_per_sec = if total_time_s > 0.0 {
        total_tokens as f64 / total_time_s
    } else {
        0.0
    };
    eprintln!();
    eprintln!("cbtop: Throughput: {:.1} tok/s (MEASURED)", tokens_per_sec);
    let measured_per_token_us = if tokens_per_sec > 0.0 { 1_000_000.0 / tokens_per_sec } else { 0.0 };
    let measured_per_layer_us = if num_layers > 0 { measured_per_token_us / num_layers as f64 } else { 0.0 };
    // NOTE(review): the 35.7µs/layer budget's source is not visible here —
    // presumably a project performance spec. Confirm.
    let target_per_layer_us = 35.7;
    eprintln!(
        "cbtop: Per-layer time: {:.1}µs (MEASURED), budget: {:.1}µs ({:.1}x)",
        measured_per_layer_us,
        target_per_layer_us,
        measured_per_layer_us / target_per_layer_us
    );
    eprintln!();
    eprintln!("=== PAR-073 BrickProfiler Results ===");
    let profiler_summary = cuda_model.profiler_summary();
    eprintln!("{}", profiler_summary);
    print_profiler_brick_stats(&cuda_model);
    eprintln!();
    let brick_reports = brick_scores_from_profiler(&cuda_model, num_layers);
    let cv_percent = compute_cv_percent(&latencies_us);
    #[cfg(feature = "visualization")]
    check_renacer_escalation(tokens_per_sec, cv_percent);
    let gpu_name = cuda_model.device_name().to_string();
    build_and_output_report(
        &config,
        &model_name,
        &gpu_name,
        tokens_per_sec,
        cv_percent,
        &latencies_us,
        brick_reports,
    )
}
/// Convert raw `BrickProfiler` stats into per-brick `BrickScore`s, printing
/// a per-brick breakdown to stderr along the way.
///
/// Per-token figures are normalized by the sample count of the "LmHead"
/// brick (presumably one run per decoded token — confirm), falling back to
/// 1 when that brick is absent or has no samples.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn brick_scores_from_profiler(
    cuda_model: &realizar::gguf::OwnedQuantizedModelCuda,
    num_layers: usize,
) -> Vec<BrickScore> {
    if num_layers > 0 {
        eprintln!(" Layers: {} (per-layer normalization not yet implemented)", num_layers);
    }
    let profiler = cuda_model.profiler();
    let mut scores = Vec::new();
    // Hottest bricks first, by total accumulated time.
    let mut all: Vec<_> = profiler.all_brick_stats().collect();
    all.sort_by(|a, b| b.total_ns.cmp(&a.total_ns));
    let total_ns: u64 = all.iter().map(|s| s.total_ns).sum();
    let total_us = total_ns as f64 / 1000.0;
    // max(1) guards the divisions below when LmHead recorded no samples.
    let decoded_tokens = all.iter()
        .find(|s| s.name == "LmHead")
        .map_or(1u64, |s| s.count.max(1));
    let wall_us_per_token = total_us / decoded_tokens as f64;
    eprintln!("=== Real Brick Scores (from BrickProfiler) ===");
    eprintln!(
        " Total: {:.1}µs across {} decoded tokens ({:.1}µs/decoded_tok)",
        total_us, decoded_tokens, wall_us_per_token,
    );
    for stats in &all {
        let avg_us = stats.avg_us();
        // Total brick time spread across decoded tokens.
        let per_decoded_tok_us = (stats.count as f64 * avg_us) / decoded_tokens as f64;
        let pct = if total_ns > 0 { 100.0 * stats.total_ns as f64 / total_ns as f64 } else { 0.0 };
        eprintln!(
            " {:30} avg={:8.1}µs per_tok={:8.1}µs ({:5.1}%) n={} calls/tok={}",
            stats.name, avg_us, per_decoded_tok_us, pct, stats.count,
            // Integer division: whole calls per decoded token.
            stats.count / decoded_tokens,
        );
        // NOTE(review): budget_us = wall_us_per_token * (share of total),
        // which appears to reduce algebraically to the same value as
        // per_decoded_tok_us, making gap_factor ~1.0 for every brick —
        // confirm the intended budget source.
        let budget_us = wall_us_per_token * (pct / 100.0);
        let score = compute_brick_score(per_decoded_tok_us, budget_us);
        let grade = score_to_grade(score);
        scores.push(BrickScore {
            name: stats.name.clone(),
            score,
            grade: grade.to_string(),
            budget_us,
            actual_us: per_decoded_tok_us,
            // Avoid dividing by a zero budget.
            gap_factor: if budget_us > 0.0 { per_decoded_tok_us / budget_us } else { 1.0 },
        });
    }
    eprintln!();
    scores
}
/// Map a GGUF model from disk and move it onto CUDA device 0 for profiling.
///
/// Fails early when no CUDA device is available, since real profiling is
/// GPU-only. Returns both the memory-mapped model (for metadata/tokenizer
/// access) and the CUDA-resident quantized model.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn load_gguf_cuda_for_profiling(
    model_path: &std::path::Path,
) -> Result<(
    realizar::gguf::MappedGGUFModel,
    realizar::gguf::OwnedQuantizedModelCuda,
)> {
    use realizar::cuda::CudaExecutor;
    use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};

    // Bail out before touching the model if there is no usable GPU.
    let device_count = CudaExecutor::num_devices();
    if !CudaExecutor::is_available() || device_count == 0 {
        eprintln!("cbtop: ERROR - CUDA not available. Real profiling requires CUDA GPU.");
        return Err(CliError::ValidationFailed(
            "CUDA not available for real profiling".to_string(),
        ));
    }
    eprintln!(" CUDA: {} GPU(s) detected", device_count);
    eprintln!();
    eprintln!("cbtop: Loading model...");

    let started = Instant::now();
    let mapped = MappedGGUFModel::from_path(model_path)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to map model: {e}")))?;
    let quantized = OwnedQuantizedModel::from_mapped(&mapped)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
    // Device 0 is used unconditionally.
    let cuda_model = OwnedQuantizedModelCuda::new(quantized, 0)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to initialize CUDA: {e}")))?;

    eprintln!("cbtop: Model loaded in {:.2}s", started.elapsed().as_secs_f32());
    eprintln!("cbtop: CUDA graphs DISABLED for per-brick profiling (PAR-073)");
    eprintln!();
    Ok((mapped, cuda_model))
}
/// Pull the key architecture dimensions out of a mapped GGUF model.
///
/// Returns `(hidden_dim, num_heads, num_kv_heads, num_layers, head_dim,
/// intermediate_dim)`, substituting 0 for anything the metadata does not
/// provide. `intermediate_dim` is read from the leading dimension of the
/// `blk.0.ffn_up.weight` tensor.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn extract_model_dims(
    mapped: &realizar::gguf::MappedGGUFModel,
) -> (usize, usize, usize, usize, usize, usize) {
    let model = &mapped.model;
    let hidden = model.embedding_dim().unwrap_or(0);
    let heads = model.num_heads().unwrap_or(0);
    let kv_heads = model.num_kv_heads().unwrap_or(0);
    let layers = model.num_layers().unwrap_or(0);
    // Guard the division when the head count is missing from metadata.
    let head_dim = if heads == 0 { 0 } else { hidden / heads };
    // FFN width comes from the first FFN up-projection tensor's leading dim.
    let intermediate = model
        .tensors
        .iter()
        .find(|t| t.name == "blk.0.ffn_up.weight")
        .and_then(|t| t.dims.first())
        .map_or(0, |&d| d as usize);
    (hidden, heads, kv_heads, layers, head_dim, intermediate)
}
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
/// Coefficient of variation (stddev / mean) of the latencies, in percent.
///
/// Uses the population variance (divide by n), matching the original
/// implementation. Returns 0.0 for an empty slice or a zero mean instead
/// of propagating NaN/inf from the unguarded divisions the previous
/// version performed.
fn compute_cv_percent(latencies_us: &[f64]) -> f64 {
    // Guard: no samples means no measurable variation (previously 0/0 → NaN).
    if latencies_us.is_empty() {
        return 0.0;
    }
    let n = latencies_us.len() as f64;
    let mean = latencies_us.iter().sum::<f64>() / n;
    // Guard: CV is undefined for a zero mean (previously inf/NaN).
    if mean == 0.0 {
        return 0.0;
    }
    let variance = latencies_us
        .iter()
        .map(|x| (x - mean).powi(2))
        .sum::<f64>()
        / n;
    (variance.sqrt() / mean) * 100.0
}
#[cfg(all(feature = "inference", feature = "cuda"))]
/// Human-readable label for the throughput-measurement mode implied by the
/// config: batch, speculative (with or without a draft model), or standard.
fn describe_measurement_mode(config: &CbtopConfig, has_draft: bool) -> String {
    match (config.concurrent > 1, config.speculative, has_draft) {
        // Batch mode wins regardless of the speculative flags.
        (true, _, _) => format!("batch (concurrent={})", config.concurrent),
        (false, true, true) => format!("speculative with draft (k={})", config.speculation_k),
        (false, true, false) => format!("speculative self (k={})", config.speculation_k),
        (false, false, _) => "standard".to_string(),
    }
}