fn run_bench_warmup<F: FnMut()>(config: &BenchConfig, count: usize, mut f: F) {
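// Warmup runs the same closure as the timed loop so that one-time costs
// (lazy allocations, cache population) are paid before measurement starts.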
if !config.quiet {
eprintln!("{}", "Running warmup...".yellow());
}
for i in 0..count {
f();
if !config.quiet {
eprint!(" Warmup {}/{}\r", i + 1, count);
std::io::Write::flush(&mut std::io::stderr()).ok();
}
}
if !config.quiet {
eprintln!(" Warmup complete ");
eprintln!();
}
}
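/// Emit a single in-place progress line (carriage return, no trailing
/// newline) for one timed iteration; suppressed in quiet mode.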
fn print_bench_progress(config: &BenchConfig, i: usize, tokens: usize, time: Duration) {
if !config.quiet {
eprint!(
" Iteration {}/{}: {} tokens in {:.2}s\r",
i + 1,
config.iterations,
tokens,
time.as_secs_f32()
);
std::io::Write::flush(&mut std::io::stderr()).ok();
}
}
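/// Print the benchmark summary: pass/fail throughput against the
/// 10 tok/s threshold, timing statistics, and a letter grade.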
fn print_results(result: &BenchResult) {
output::section("Results");
println!();
let throughput_str = format!("{:.1} tok/s", result.tokens_per_second);
if result.passed {
println!(
"{} {} {}",
"Throughput:".white().bold(),
throughput_str.green().bold(),
"(PASS: >= 10 tok/s)".green()
);
} else {
println!(
"{} {} {}",
"Throughput:".white().bold(),
throughput_str.red().bold(),
"(FAIL: < 10 tok/s)".red()
);
}
println!();
output::kv("Total tokens", result.total_tokens);
output::kv(
"Total time",
format!("{:.2}s", result.total_time.as_secs_f32()),
);
output::kv(
"Time to first token",
format!("{:.0}ms", result.time_to_first_token.as_secs_f64() * 1000.0),
);
println!();
output::kv(
"Mean iteration time",
format!("{:.2}s", result.mean_time.as_secs_f32()),
);
output::kv(
"Median iteration time",
format!("{:.2}s", result.median_time.as_secs_f32()),
);
output::kv(
"Std deviation",
format!("{:.0}ms", result.std_dev.as_secs_f64() * 1000.0),
);
println!();
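// Letter grades are fixed tok/s bands; the C boundary matches the
// 10 tok/s pass threshold reported above.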
let grade = if result.tokens_per_second >= 100.0 {
"A+ (Excellent)".green()
} else if result.tokens_per_second >= 50.0 {
"A (Very Good)".green()
} else if result.tokens_per_second >= 20.0 {
"B (Good)".blue()
} else if result.tokens_per_second >= 10.0 {
"C (Acceptable)".yellow()
} else {
"F (Below Threshold)".red()
};
output::kv("Performance Grade", grade);
}
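/// Top-level dispatcher: detects the model format from the file header
/// and routes to the GGUF, APR, or SafeTensors benchmark path. CUDA is
/// used when the feature is compiled in and a device is detected.
///
/// A minimal sketch of a call site (assuming `BenchConfig` implements
/// `Default`; construction is not shown in this module):
///
/// ```ignore
/// let config = BenchConfig::default();
/// let result = run_realizar_benchmark(Path::new("model.gguf"), &config)?;
/// print_results(&result);
/// ```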
#[cfg(feature = "inference")]
fn run_realizar_benchmark(path: &Path, config: &BenchConfig) -> Result<BenchResult> {
use realizar::format::{detect_format, ModelFormat};
let header_bytes = std::fs::read(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read model: {e}")))?;
let format = detect_format(&header_bytes[..8.min(header_bytes.len())])
.map_err(|e| CliError::ValidationFailed(format!("Failed to detect format: {e}")))?;
if !config.quiet {
eprintln!("{} {}", "Format:".cyan().bold(), format.to_string().green());
}
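// Probe CUDA at runtime. With the `cuda` feature compiled out this
// collapses to a constant (false, 0) and the CPU path is taken.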
#[cfg(feature = "cuda")]
let (cuda_available, cuda_devices) = {
use realizar::cuda::CudaExecutor;
(CudaExecutor::is_available(), CudaExecutor::num_devices())
};
#[cfg(not(feature = "cuda"))]
let (cuda_available, cuda_devices) = (false, 0usize);
if !config.quiet {
if cuda_available && cuda_devices > 0 {
eprintln!(
"{} {} GPU(s) detected",
"CUDA:".cyan().bold(),
cuda_devices.to_string().green()
);
} else {
eprintln!(
"{} {}",
"CUDA:".cyan().bold(),
"Not available (CPU mode)".yellow()
);
}
}
let use_cuda = cuda_available && cuda_devices > 0;
let tracer = TracerImpl::new_local();
match format {
ModelFormat::Gguf => run_gguf_benchmark(path, config, use_cuda, &tracer),
ModelFormat::Apr => run_apr_benchmark(path, config, use_cuda, &tracer),
ModelFormat::SafeTensors => run_safetensors_benchmark(path, config, use_cuda, &tracer),
}
}
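/// Benchmark a GGUF model: parse the file, tokenize the prompt (with a
/// fixed fallback sequence if the model carries no tokenizer), then run
/// the CUDA path when available, falling back to CPU on init failure.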
#[cfg(feature = "inference")]
fn run_gguf_benchmark(
path: &Path,
config: &BenchConfig,
use_cuda: bool,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::gguf::{GGUFModel, QuantizedGenerateConfig};
if !config.quiet {
eprintln!("{}", "Loading GGUF model...".yellow());
}
let start = Instant::now();
let model_bytes = std::fs::read(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read model: {e}")))?;
let gguf = GGUFModel::from_bytes(&model_bytes)
.map_err(|e| CliError::ValidationFailed(format!("Failed to parse GGUF: {e}")))?;
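// Fallback prompt when the GGUF has no usable tokenizer: BOS plus a
// short hardcoded token sequence assuming a Qwen2-style vocabulary.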
let bos = aprender::demo::SpecialTokens::qwen2().bos_id;
let prompt_tokens: Vec<u32> = gguf
.encode(&config.prompt)
.unwrap_or_else(|| vec![bos, 9707, 11, 358, 1079, 264, 11761, 18328, 13, 9842]);
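// Greedy decoding (temperature 0, top_k 1) keeps runs deterministic so
// iteration times are comparable; generation is capped at 128 tokens.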
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens.min(128),
temperature: 0.0,
top_k: 1,
..Default::default()
};
#[cfg(feature = "cuda")]
if use_cuda {
match run_cuda_benchmark(
&gguf,
&model_bytes,
&prompt_tokens,
&gen_config,
config,
start,
path,
tracer,
) {
Ok(result) => return Ok(result),
Err(e) => {
if !config.quiet {
eprintln!(
"{}",
format!("CUDA init failed, falling back to CPU: {e}").yellow()
);
}
let cpu_start = Instant::now();
return run_cpu_benchmark(&prompt_tokens, &gen_config, config, cpu_start, path, tracer);
}
}
}
run_cpu_benchmark(&prompt_tokens, &gen_config, config, start, path, tracer)
}
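/// Resolve prompt tokens for an APR model, in order of preference: a
/// sibling `tokenizer.json`, a sibling vocab file with naive
/// whitespace-token lookup, or a fixed placeholder sequence.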
#[cfg(feature = "inference")]
fn resolve_apr_prompt_tokens(path: &Path, prompt: &str) -> Vec<u32> {
use realizar::apr::AprV2Model;
if let Some(tokenizer) = realizar::safetensors::find_sibling_file(path, "tokenizer.json")
.and_then(|tp| AprV2Model::load_tokenizer_from_path(&tp))
{
tokenizer.encode(prompt)
} else if let Some((vocab, _, _)) = AprV2Model::load_tokenizer_from_sibling(path) {
let token_to_id: std::collections::HashMap<String, u32> = vocab
.iter()
.enumerate()
.map(|(i, t)| (t.clone(), i as u32))
.collect();
prompt
.split_whitespace()
.filter_map(|w| token_to_id.get(w).copied())
.collect()
} else {
vec![1, 2, 3, 4, 5]
}
}
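/// When every iteration produced zero new tokens, fall back to counting
/// prompt tokens (forward-pass throughput) so the result is non-zero;
/// the explanatory note is suppressed in quiet mode.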
#[cfg(feature = "inference")]
fn handle_zero_generation_fallback(
generation_failed: bool,
total_tokens: usize,
iterations: usize,
prompt_len: usize,
quiet: bool,
) -> usize {
if generation_failed && total_tokens == 0 {
if !quiet {
eprintln!(
"{}",
"Note: Generation produced 0 new tokens, reporting forward-pass throughput.".yellow()
);
}
iterations * prompt_len
} else {
total_tokens
}
}
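/// Attempt the GPU benchmark; `Ok(None)` signals the caller to fall
/// back to CPU when the GPU produced no tokens.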
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn try_cuda_benchmark(
path: &Path,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<Option<BenchResult>> {
let result = run_apr_cuda_benchmark(path, config, tracer)?;
if result.total_tokens > 0 {
return Ok(Some(result));
}
if !config.quiet {
eprintln!(
"{}",
"GPU generated 0 tokens, falling back to CPU...".yellow()
);
}
Ok(None)
}
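/// Benchmark an APR model: try the CUDA path first when available, then
/// load the transformer on CPU, warm up, and run timed iterations.
///
/// A sketch of the CPU flow (using the realizar calls below; `gen_config`
/// is assumed to be in scope):
///
/// ```ignore
/// let transformer = AprTransformer::from_apr_file(path)?;
/// let tokens = resolve_apr_prompt_tokens(path, &config.prompt);
/// let output = transformer.generate_with_cache(&tokens, &gen_config)?;
/// ```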
#[cfg(feature = "inference")]
fn run_apr_benchmark(
path: &Path,
config: &BenchConfig,
use_cuda: bool,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::apr_transformer::{AprTransformer, GenerateConfig};
#[cfg(feature = "cuda")]
if use_cuda {
if let Some(result) = try_cuda_benchmark(path, config, tracer)? {
return Ok(result);
}
}
if !config.quiet {
eprintln!("{}", "Loading APR model (CPU)...".yellow());
}
let start = Instant::now();
let transformer = AprTransformer::from_apr_file(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load APR: {e}")))?;
let load_time = start.elapsed();
if !config.quiet {
eprintln!(
"{} in {:.2}s",
"Model ready".green(),
load_time.as_secs_f32()
);
eprintln!();
}
let prompt_tokens = resolve_apr_prompt_tokens(path, &config.prompt);
let gen_config = GenerateConfig {
max_tokens: config.max_tokens.min(32),
temperature: 0.0,
top_p: 1.0,
top_k: 0,
repetition_penalty: 1.0,
trace: false,
stop_tokens: vec![],
};
run_bench_warmup(config, config.warmup, || {
let _ = transformer.generate_with_cache(&prompt_tokens, &gen_config);
});
let (iteration_times, total_tokens, first_token_time) =
run_apr_measurement(&transformer, &prompt_tokens, &gen_config, config, tracer)?;
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
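/// Timed measurement loop for the APR CPU path. Each iteration runs
/// under the tracer; generation errors are counted and the output falls
/// back to the prompt (zero new tokens) so timing data is still recorded.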
#[cfg(feature = "inference")]
fn run_apr_measurement(
transformer: &realizar::apr_transformer::AprTransformer,
prompt_tokens: &[u32],
gen_config: &realizar::apr_transformer::GenerateConfig,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<(Vec<Duration>, usize, Duration)> {
if !config.quiet {
eprintln!("{}", "Running benchmark...".yellow());
}
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let mut generation_failed = false;
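// Per-iteration tracing budget: 100 ms (100_000 us) per requested token.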
let budget_us = config.max_tokens as u64 * 100_000;
let mut generate_errors = 0usize;
for i in 0..config.iterations {
let traced = tracer.trace("bench_apr_iter", budget_us, || {
transformer.generate_with_cache(prompt_tokens, gen_config)
});
let output = match traced.result {
Ok(tokens) => tokens,
Err(e) => {
if generate_errors == 0 {
eprintln!(
"{}",
format!("Warning: generate_with_cache() failed on iteration {i}: {e}").yellow()
);
}
generate_errors += 1;
prompt_tokens.to_vec()
}
};
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
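// Approximate time-to-first-token as the mean per-token time of the
// first iteration; a true TTFT would require per-token timestamps.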
if i == 0 {
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
if tokens_generated == 0 {
generation_failed = true;
}
}
print_bench_progress(config, i, tokens_generated, iter_time);
}
if !config.quiet {
eprintln!();
}
total_tokens = handle_zero_generation_fallback(
generation_failed,
total_tokens,
config.iterations,
prompt_tokens.len(),
config.quiet,
);
if !config.quiet {
eprintln!();
}
Ok((iteration_times, total_tokens, first_token_time))
}
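/// GPU benchmark for APR models: memory-map the file, build a quantized
/// model with fused Q4K kernels on CUDA device 0, and time GPU-resident
/// generation.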
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn run_apr_cuda_benchmark(
path: &Path,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::apr::{AprV2Model, MappedAprModel};
use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda, QuantizedGenerateConfig};
if !config.quiet {
eprintln!("{}", "Loading APR model (GPU, fused Q4K kernels)...".yellow());
}
let start = Instant::now();
let mapped = MappedAprModel::from_path(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to map APR: {e}")))?;
let tensor_count = mapped.tensors.len();
let prompt_tokens: Vec<u32> = {
let cpu_model = AprV2Model::load(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load APR for tokenizer: {e}")))?;
if let Some(tokenizer) = cpu_model.load_embedded_bpe_tokenizer() {
tokenizer.encode(&config.prompt)
} else {
resolve_apr_prompt_tokens(path, &config.prompt)
}
};
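// Convert the mapped APR tensors into the owned quantized model, then
// move it onto CUDA device 0.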
let model = OwnedQuantizedModel::from_apr(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create quantized model from APR: {e}")))?;
let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)
.map_err(|e| CliError::ValidationFailed(format!("Failed to init CUDA: {e}")))?;
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens.min(128),
temperature: 0.0,
top_k: 1,
..Default::default()
};
let load_time = start.elapsed();
if !config.quiet {
eprintln!(
"{} in {:.2}s ({} tensors, GPU device 0, fused Q4K kernels)",
"Model ready".green(),
load_time.as_secs_f32(),
tensor_count
);
eprintln!();
}
run_bench_warmup(config, config.warmup, || {
let _ = cuda_model.generate_gpu_resident(&prompt_tokens, &gen_config);
});
if !config.quiet {
eprintln!("{}", "Running benchmark (GPU)...".yellow());
}
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let budget_us = config.max_tokens as u64 * 100_000;
for i in 0..config.iterations {
let traced = tracer.trace("bench_apr_gpu_iter", budget_us, || {
cuda_model
.generate_gpu_resident(&prompt_tokens, &gen_config)
.unwrap_or_default()
});
let output = traced.result;
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
if i == 0 {
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
}
print_bench_progress(config, i, tokens_generated, iter_time);
}
if !config.quiet {
eprintln!();
eprintln!();
}
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}