#![allow(dead_code)]
#![allow(unused_imports)]
#![allow(unused_variables)]
#![allow(clippy::needless_return)]
use crate::error::{CliError, Result};
use colored::Colorize;
use std::path::Path;
use std::process::Command;
use std::time::{Duration, Instant};
use super::types::{
    Baseline, BenchMeasurement, BenchmarkComparison, ExportFormat, ModelTier, ShowcaseConfig,
};
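/// Write benchmark results to disk in the format requested by `config.export_format`.
///
/// JSON output is pretty-printed via `serde_json`; CSV output comes from
/// `format_benchmark_csv`. When `config.export_path` is unset, results default to
/// `benchmark-results.json` / `benchmark-results.csv` inside `config.model_dir`.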
pub(super) fn export_benchmark_results(
bench: &BenchmarkComparison,
config: &ShowcaseConfig,
) -> Result<()> {
    match config.export_format {
        ExportFormat::None => Ok(()),
        ExportFormat::Json => {
let path = config
.export_path
.clone()
.unwrap_or_else(|| config.model_dir.join("benchmark-results.json"));
let json = serde_json::to_string_pretty(bench).map_err(|e| {
CliError::ValidationFailed(format!("JSON serialization failed: {e}"))
})?;
std::fs::write(&path, &json)
.map_err(|e| CliError::ValidationFailed(format!("Failed to write JSON: {e}")))?;
println!(
"{} Benchmark results exported to {} ({} bytes)",
"✓".green(),
path.display(),
json.len()
);
Ok(())
}
        ExportFormat::Csv => {
let path = config
.export_path
.clone()
.unwrap_or_else(|| config.model_dir.join("benchmark-results.csv"));
let csv = format_benchmark_csv(bench);
std::fs::write(&path, &csv)
.map_err(|e| CliError::ValidationFailed(format!("Failed to write CSV: {e}")))?;
println!(
"{} Benchmark results exported to {} ({} bytes)",
"✓".green(),
path.display(),
csv.len()
);
Ok(())
}
}
}
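/// Render a `BenchmarkComparison` as CSV: one row for APR and one row per baseline
/// that produced measurements. Baseline rows carry `N/A` for stddev and run count,
/// since those statistics are only tracked for APR.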
pub(super) fn format_benchmark_csv(bench: &BenchmarkComparison) -> String {
use std::fmt::Write;
let mut csv = String::new();
csv.push_str("system,tokens_per_sec,ttft_ms,speedup_pct,stddev,runs\n");
let _ = writeln!(
csv,
"APR,{:.2},{:.2},,{:.2},{}",
bench.apr_tps, bench.apr_ttft_ms, bench.apr_tps_stddev, bench.runs
);
    // Row layout: (name, tokens/sec, TTFT ms, speedup %). Named `BaselineRow` to
    // avoid shadowing the `Baseline` enum imported from `super::types`.
    type BaselineRow = (&'static str, Option<f64>, Option<f64>, Option<f64>);
    let baselines: &[BaselineRow] = &[
(
"llama.cpp",
bench.llama_cpp_tps,
bench.llama_cpp_ttft_ms,
bench.speedup_vs_llama,
),
(
"Ollama",
bench.ollama_tps,
bench.ollama_ttft_ms,
bench.speedup_vs_ollama,
),
];
for &(name, tps_opt, ttft_opt, speedup_opt) in baselines {
if let Some(tps) = tps_opt {
let ttft = ttft_opt.unwrap_or(0.0);
let speedup = speedup_opt.map_or(String::new(), |s| format!("{s:.2}"));
let _ = writeln!(csv, "{name},{tps:.2},{ttft:.2},{speedup},N/A,N/A");
}
}
csv
}
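/// Run any configured baseline benchmarks (llama.cpp, Ollama) and combine them with
/// the APR measurements into a `BenchmarkComparison`. Speedup is reported as a
/// percentage, `(apr_tps - baseline_tps) / baseline_tps * 100.0`, and baseline
/// failures are discarded with `.ok()` so a missing baseline never aborts the step.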
fn build_comparison(
apr_tps: f64,
apr_ttft_ms: f64,
apr_tps_stddev: f64,
runs: usize,
config: &ShowcaseConfig,
) -> BenchmarkComparison {
let llama_results = if config.baselines.contains(&Baseline::LlamaCpp) {
println!();
println!("{}", "Running llama.cpp benchmark...".yellow());
run_llama_cpp_bench(config).ok()
} else {
None
};
let ollama_results = if config.baselines.contains(&Baseline::Ollama) {
println!();
println!("{}", "Running Ollama benchmark...".yellow());
run_ollama_bench(config).ok()
} else {
None
};
let speedup_vs_llama = llama_results.map(|(tps, _)| ((apr_tps - tps) / tps) * 100.0);
let speedup_vs_ollama = ollama_results.map(|(tps, _)| ((apr_tps - tps) / tps) * 100.0);
BenchmarkComparison {
apr_tps,
apr_ttft_ms,
apr_tps_stddev,
runs,
llama_cpp_tps: llama_results.map(|(tps, _)| tps),
llama_cpp_ttft_ms: llama_results.map(|(_, ttft)| ttft),
ollama_tps: ollama_results.map(|(tps, _)| tps),
ollama_ttft_ms: ollama_results.map(|(_, ttft)| ttft),
speedup_vs_llama,
speedup_vs_ollama,
}
}
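/// Step E of the showcase: load the GGUF model and benchmark real inference.
///
/// Uses the GPU (CUDA) path when `config.gpu` is set, falling back to CPU if CUDA
/// initialization fails. Reports mean tokens/sec, mean TTFT, and the standard
/// deviation across runs, then compares against any enabled baselines.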
#[cfg(feature = "inference")]
pub(super) fn run_benchmark(config: &ShowcaseConfig) -> Result<BenchmarkComparison> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};
println!();
println!("{}", "═══ Step E: Performance Benchmark ═══".cyan().bold());
println!();
println!("Benchmark configuration:");
println!(
" Runs: {} (per Hoefler & Belli 2015)",
        config.bench_runs.clamp(5, 100)
);
println!(" Warmup: 5 iterations");
println!(" Baselines: {:?}", config.baselines);
println!(
" Backend: {}",
if config.gpu { "GPU (CUDA)" } else { "CPU" }
);
println!();
let gguf_path = config.model_dir.join(config.tier.gguf_filename());
if !gguf_path.exists() {
return Err(CliError::ValidationFailed(format!(
"Model not found: {}. Run 'apr showcase --step import' first.",
gguf_path.display()
)));
}
println!(
"Loading model for benchmark: {} ({})",
config.tier.gguf_filename(),
config.tier.params()
);
let mapped = MappedGGUFModel::from_path(&gguf_path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load GGUF: {e}")))?;
let model = OwnedQuantizedModel::from_mapped(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
println!("{} Model loaded", "✓".green());
println!();
let apr_results = if config.gpu {
println!("{}", "Running APR benchmark (GPU)...".yellow());
match OwnedQuantizedModelCuda::new(model, 0) {
Ok(mut cuda_model) => {
println!("{} CUDA model created", "✓".green());
run_real_benchmark_cuda(&mut cuda_model, &mapped, config)?
}
Err(e) => {
println!(
"{} CUDA unavailable ({}), falling back to CPU",
"⚠".yellow(),
e
);
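                // `model` was moved into `OwnedQuantizedModelCuda::new` above, so
                // rebuild it from the mapped GGUF for the CPU fallback path.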
let model = OwnedQuantizedModel::from_mapped(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to reload: {e}")))?;
run_real_benchmark(&model, &mapped, config)?
}
}
} else {
println!("{}", "Running APR benchmark (CPU)...".yellow());
run_real_benchmark(&model, &mapped, config)?
};
let apr_tps = apr_results
.iter()
.map(BenchMeasurement::tokens_per_second)
.sum::<f64>()
/ apr_results.len() as f64;
let apr_ttft_ms = apr_results
.iter()
.map(|m| m.ttft.as_secs_f64() * 1000.0)
.sum::<f64>()
/ apr_results.len() as f64;
let apr_tps_stddev = calculate_stddev(
&apr_results
.iter()
.map(BenchMeasurement::tokens_per_second)
.collect::<Vec<_>>(),
);
println!(
" APR: {:.1} ± {:.1} tok/s, TTFT: {:.1}ms ({} runs)",
apr_tps,
apr_tps_stddev,
apr_ttft_ms,
apr_results.len()
);
let comparison = build_comparison(
apr_tps,
apr_ttft_ms,
apr_tps_stddev,
apr_results.len(),
config,
);
print_benchmark_results(&comparison);
Ok(comparison)
}
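/// Build the shared benchmark inputs: a fixed test prompt encoded with the model's
/// tokenizer (falling back to a hard-coded Qwen2 token sequence if encoding fails)
/// and a greedy, 32-token generation config.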
#[cfg(feature = "inference")]
fn bench_setup(
mapped: &realizar::gguf::MappedGGUFModel,
) -> (Vec<u32>, realizar::gguf::QuantizedGenerateConfig) {
let test_prompt = "Hello, I am a coding assistant. Write a function that calculates";
let bos = aprender::demo::SpecialTokens::qwen2().bos_id;
let prompt_tokens: Vec<u32> = mapped.model.encode(test_prompt).unwrap_or_else(|| {
vec![bos, 9707, 11, 358, 1079, 264, 11761, 18328, 13, 9842]
});
println!(
" Prompt: {} tokens (\"{}...\")",
prompt_tokens.len(),
&test_prompt[..test_prompt.len().min(30)]
);
    let gen_config = realizar::gguf::QuantizedGenerateConfig {
        max_tokens: 32,
        temperature: 0.0,
        top_k: 1,
        ..Default::default()
    };
(prompt_tokens, gen_config)
}
#[cfg(feature = "inference")]
fn record_measurement(
output_len: usize,
prompt_len: usize,
duration: Duration,
) -> BenchMeasurement {
let tokens_generated = output_len.saturating_sub(prompt_len);
let ttft = if tokens_generated > 0 {
Duration::from_secs_f64(duration.as_secs_f64() / tokens_generated as f64)
} else {
duration
};
BenchMeasurement {
tokens_generated,
duration,
ttft,
}
}
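/// CPU benchmark loop: 5 warmup generations followed by `bench_runs` measured
/// generations (clamped to 5..=100), each timed end to end.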
#[cfg(feature = "inference")]
pub(super) fn run_real_benchmark(
model: &realizar::gguf::OwnedQuantizedModel,
mapped: &realizar::gguf::MappedGGUFModel,
config: &ShowcaseConfig,
) -> Result<Vec<BenchMeasurement>> {
let (prompt_tokens, gen_config) = bench_setup(mapped);
print!(" Warmup: ");
for i in 0..5 {
let _ = model.generate_with_cache(&prompt_tokens, &gen_config);
print!("{} ", i + 1);
std::io::Write::flush(&mut std::io::stdout()).ok();
}
println!("done");
let runs = config.bench_runs.clamp(5, 100);
let mut measurements = Vec::with_capacity(runs);
print!(" Measuring: ");
for i in 0..runs {
let start = Instant::now();
let output = model
.generate_with_cache(&prompt_tokens, &gen_config)
.unwrap_or_default();
let duration = start.elapsed();
measurements.push(record_measurement(
output.len(),
prompt_tokens.len(),
duration,
));
if (i + 1) % 5 == 0 {
print!("{} ", i + 1);
std::io::Write::flush(&mut std::io::stdout()).ok();
}
}
println!("done");
Ok(measurements)
}
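/// GPU-resident benchmark loop mirroring `run_real_benchmark`. A failure during
/// warmup aborts; a failure mid-measurement keeps the measurements collected so
/// far, provided at least one run succeeded.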
#[cfg(feature = "inference")]
pub(super) fn run_real_benchmark_cuda(
model: &mut realizar::gguf::OwnedQuantizedModelCuda,
mapped: &realizar::gguf::MappedGGUFModel,
config: &ShowcaseConfig,
) -> Result<Vec<BenchMeasurement>> {
let (prompt_tokens, gen_config) = bench_setup(mapped);
print!(" Warmup: ");
for i in 0..5 {
if let Err(e) = model.generate_gpu_resident(&prompt_tokens, &gen_config) {
eprintln!("\n Warmup error: {e}");
return Err(CliError::ValidationFailed(format!(
"GPU warmup failed: {e}"
)));
}
print!("{} ", i + 1);
std::io::Write::flush(&mut std::io::stdout()).ok();
}
println!("done");
let runs = config.bench_runs.clamp(5, 100);
let mut measurements = Vec::with_capacity(runs);
print!(" Measuring: ");
for i in 0..runs {
let start = Instant::now();
let output = match model.generate_gpu_resident(&prompt_tokens, &gen_config) {
Ok(tokens) => tokens,
Err(e) => {
eprintln!("\n Generation error: {e}");
if measurements.is_empty() {
return Err(CliError::ValidationFailed(format!(
"GPU generation failed: {e}"
)));
}
break;
}
};
let duration = start.elapsed();
measurements.push(record_measurement(
output.len(),
prompt_tokens.len(),
duration,
));
if (i + 1) % 5 == 0 {
print!("{} ", i + 1);
std::io::Write::flush(&mut std::io::stdout()).ok();
}
}
println!("done");
Ok(measurements)
}
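/// Fallback when the `inference` feature is disabled: report simulated numbers
/// (fixed means plus jitter from `generate_jitter`) so the rest of the showcase
/// flow can still be demonstrated.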
#[cfg(not(feature = "inference"))]
pub(super) fn run_benchmark(config: &ShowcaseConfig) -> Result<BenchmarkComparison> {
println!();
println!("{}", "═══ Step E: Performance Benchmark ═══".cyan().bold());
println!();
println!(
"{} Inference feature not enabled. Using simulated benchmarks.",
"⚠".yellow()
);
let apr_tps = 44.0 + generate_jitter() * 2.0;
let apr_ttft_ms = 78.0 + generate_jitter() * 5.0;
let comparison = build_comparison(apr_tps, apr_ttft_ms, 2.0, config.bench_runs, config);
print_benchmark_results(&comparison);
Ok(comparison)
}
include!("benchmark_helpers.rs");