#[cfg(feature = "inference")]
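/// Benchmarks forward-pass latency for a SafeTensors model on the CPU.
/// When the `cuda` feature is enabled and `use_cuda` is set, dispatches to
/// the Q4K CUDA path instead.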
fn run_safetensors_benchmark(
path: &Path,
config: &BenchConfig,
use_cuda: bool,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::safetensors_infer::SafetensorsToAprConverter;
#[cfg(feature = "cuda")]
if use_cuda {
return run_safetensors_cuda_benchmark(path, config, tracer);
}
// Without the `cuda` feature the flag has no effect; mark it used to avoid a lint.
#[cfg(not(feature = "cuda"))]
let _ = use_cuda;
bench_log(config, &"Loading SafeTensors model (CPU)...".yellow().to_string());
let start = Instant::now();
let transformer = SafetensorsToAprConverter::convert(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load SafeTensors: {e}")))?;
bench_log_ready(config, start.elapsed(), "");
let prompt_tokens = resolve_safetensors_tokens(path, &config.prompt);
run_forward_warmup(config, &transformer, &prompt_tokens);
bench_log(config, &"Running benchmark (forward pass)...".yellow().to_string());
let mut iteration_times = Vec::with_capacity(config.iterations);
// Each iteration is one forward pass over the full prompt, so throughput is
// measured in prompt tokens processed rather than tokens generated.
let total_tokens = config.iterations * prompt_tokens.len();
// Trace budget: 100 ms per requested token.
let budget_us = config.max_tokens as u64 * 100_000;
for i in 0..config.iterations {
let traced = tracer.trace("bench_safetensors_cpu_iter", budget_us, || {
transformer.forward(&prompt_tokens)
});
// Only latency matters for this benchmark; the forward output is discarded.
let _ = traced.result;
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
bench_log_iter(config, i, iter_time, None);
}
// Treat the first full forward pass as the time-to-first-token for this benchmark.
let first_token_time = iteration_times.first().copied().unwrap_or(Duration::ZERO);
bench_log_done(config);
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
#[cfg(feature = "inference")]
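/// Encodes the benchmark prompt with a sibling `tokenizer.json` when one
/// exists next to the model file; otherwise falls back to a fixed Qwen2
/// token sequence.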
fn resolve_safetensors_tokens(path: &Path, prompt: &str) -> Vec<u32> {
use realizar::apr::AprV2Model;
if let Some(tokenizer) = realizar::safetensors::find_sibling_file(path, "tokenizer.json")
.and_then(|tp| AprV2Model::load_tokenizer_from_path(&tp))
{
tokenizer.encode(prompt)
} else {
// No sibling tokenizer.json was found: fall back to the Qwen2 BOS id plus
// a fixed, pre-tokenized sample prompt so the benchmark can still run.
let bos = aprender::demo::SpecialTokens::qwen2().bos_id;
vec![bos, 9707, 11, 358, 1079, 264, 11761, 18328, 13, 9842]
}
}
#[cfg(feature = "inference")]
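/// Runs `config.warmup` untimed forward passes so caches are warm before
/// the measured iterations begin.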
fn run_forward_warmup(
config: &BenchConfig,
transformer: &realizar::apr_transformer::AprTransformer,
prompt_tokens: &[u32],
) {
bench_log(config, &"Running warmup...".yellow().to_string());
for i in 0..config.warmup {
let _ = transformer.forward(prompt_tokens);
bench_log_iter(config, i, Duration::ZERO, None);
}
bench_log_done(config);
}
#[cfg(feature = "inference")]
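/// Logs a status message to stderr unless quiet mode is enabled.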
fn bench_log(config: &BenchConfig, msg: &str) {
if !config.quiet {
eprintln!("{msg}");
}
}
#[cfg(feature = "inference")]
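/// Logs "Model ready in X.XXs" (plus an optional suffix) after loading.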
fn bench_log_ready(config: &BenchConfig, elapsed: Duration, suffix: &str) {
if !config.quiet {
eprintln!("{} in {:.2}s{suffix}", "Model ready".green(), elapsed.as_secs_f32());
eprintln!();
}
}
#[cfg(feature = "inference")]
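/// Emits an in-place (`\r`) progress line: iteration progress with a token
/// count when one is supplied, elapsed time alone otherwise, or a warmup
/// counter when `time` is zero.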
fn bench_log_iter(config: &BenchConfig, i: usize, time: Duration, tokens: Option<usize>) {
if config.quiet {
return;
}
if let Some(tok) = tokens {
eprint!(" Iteration {}/{}: {} tokens in {:.2}s\r", i + 1, config.iterations, tok, time.as_secs_f32());
} else if time > Duration::ZERO {
eprint!(" Iteration {}/{}: {:.2}s\r", i + 1, config.iterations, time.as_secs_f32());
} else {
eprint!(" Warmup {}/{}\r", i + 1, config.warmup);
}
std::io::Write::flush(&mut std::io::stderr()).ok();
}
#[cfg(feature = "inference")]
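/// Finishes the in-place progress line with "Complete" and a blank line.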
fn bench_log_done(config: &BenchConfig) {
if !config.quiet {
eprintln!(" Complete ");
eprintln!();
}
}
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
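/// Benchmarks a SafeTensors model on the GPU: converts it once to a
/// temporary Q4K-quantized APR file, loads that onto CUDA device 0, and
/// measures GPU-resident generation.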
fn run_safetensors_cuda_benchmark(
path: &Path,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use aprender::format::{ImportOptions, QuantizationType};
use realizar::apr::MappedAprModel;
use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda, QuantizedGenerateConfig};
bench_log(config, &"Converting SafeTensors → Q4K (one-time)...".yellow().to_string());
let start = Instant::now();
// Name the temp file per-process so concurrent benchmark runs don't collide.
let tmp_apr = std::env::temp_dir()
.join(format!("bench-safetensors-q4k-{}.apr", std::process::id()));
let import_opts = ImportOptions {
quantize: Some(QuantizationType::Q4K),
..ImportOptions::default()
};
aprender::format::apr_import(&path.display().to_string(), &tmp_apr, import_opts)
.map_err(|e| CliError::ValidationFailed(format!("SafeTensors→APR Q4K conversion failed: {e}")))?;
bench_log(config, &"Loading Q4K model (GPU, fused kernels)...".yellow().to_string());
let mapped = MappedAprModel::from_path(&tmp_apr)
.map_err(|e| CliError::ValidationFailed(format!("Failed to map temp APR: {e}")))?;
let prompt_tokens = resolve_safetensors_tokens(path, &config.prompt);
let model = OwnedQuantizedModel::from_apr(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create quantized model: {e}")))?;
let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)
.map_err(|e| CliError::ValidationFailed(format!("Failed to init CUDA: {e}")))?;
// Greedy decoding (temperature 0, top-k 1) keeps iterations deterministic;
// generation is capped at 128 tokens to bound per-iteration runtime.
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens.min(128),
temperature: 0.0,
top_k: 1,
..Default::default()
};
let load_time = start.elapsed();
bench_log_ready(config, load_time, " (GPU device 0, fused Q4K kernels)");
run_cuda_warmup(&mut cuda_model, &prompt_tokens, &gen_config, config)?;
let (iteration_times, total_tokens, first_token_time) =
run_cuda_measurement(&mut cuda_model, &prompt_tokens, &gen_config, config, tracer)?;
// Best-effort cleanup of the temporary Q4K APR file.
let _ = std::fs::remove_file(&tmp_apr);
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
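/// Runs `config.warmup` untimed GPU generations; any failure aborts the run.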
fn run_cuda_warmup(
cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
) -> Result<()> {
bench_log(config, &"Running warmup (GPU)...".yellow().to_string());
for i in 0..config.warmup {
cuda_model
.generate_gpu_resident(prompt_tokens, gen_config)
.map_err(|e| {
eprintln!("\n Warmup error: {e}");
CliError::ValidationFailed(format!("GPU warmup failed: {e}"))
})?;
bench_log_iter(config, i, Duration::ZERO, None);
}
bench_log_done(config);
Ok(())
}
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
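/// Timed GPU measurement loop. Returns per-iteration latencies, the total
/// number of generated tokens, and an approximate time-to-first-token.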
fn run_cuda_measurement(
cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<(Vec<Duration>, usize, Duration)> {
bench_log(config, &"Running benchmark (GPU)...".yellow().to_string());
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let budget_us = config.max_tokens as u64 * 100_000;
for i in 0..config.iterations {
let traced = tracer.trace("bench_gpu_iter", budget_us, || {
cuda_model
.generate_gpu_resident(prompt_tokens, gen_config)
});
let output = traced.result.map_err(|e| {
eprintln!("\n Generation error: {e}");
CliError::ValidationFailed(format!("GPU generation failed: {e}"))
})?;
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
if i == 0 {
// Approximate time-to-first-token as the mean per-token latency of the
// first iteration.
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
}
bench_log_iter(config, i, iter_time, Some(tokens_generated));
}
bench_log_done(config);
Ok((iteration_times, total_tokens, first_token_time))
}
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
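/// Benchmarks a GGUF model on the GPU: maps the file, builds an owned
/// quantized model, moves it to CUDA device 0, then reuses the shared
/// warmup and measurement loops.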
fn run_cuda_benchmark(
_gguf: &realizar::gguf::GGUFModel,
_model_bytes: &[u8],
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
start: Instant,
model_path: &Path,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};
bench_log(config, &"Initializing CUDA model...".cyan().to_string());
let mapped = MappedGGUFModel::from_path(model_path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to map model: {e}")))?;
let model = OwnedQuantizedModel::from_mapped(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)
.map_err(|e| CliError::ValidationFailed(format!("Failed to initialize CUDA: {e}")))?;
let load_time = start.elapsed();
bench_log_ready(config, load_time, " (GPU device 0)");
run_cuda_warmup(&mut cuda_model, prompt_tokens, gen_config, config)?;
let (iteration_times, total_tokens, first_token_time) =
run_cuda_measurement(&mut cuda_model, prompt_tokens, gen_config, config, tracer)?;
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
#[cfg(feature = "inference")]
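/// Benchmarks a GGUF model on the CPU via the memory-mapped quantized path.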
fn run_cpu_benchmark(
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
start: Instant,
path: &Path,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel};
let mapped = MappedGGUFModel::from_path(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to mmap model: {e}")))?;
let model = OwnedQuantizedModel::from_mapped(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
bench_log_ready(config, start.elapsed(), " (CPU)");
run_cpu_warmup(&model, prompt_tokens, gen_config, config);
let (iteration_times, total_tokens, first_token_time) =
run_cpu_measurement(&model, prompt_tokens, gen_config, config, tracer);
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
#[cfg(feature = "inference")]
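/// Runs `config.warmup` untimed CPU generations, discarding the outputs.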
fn run_cpu_warmup(
model: &realizar::gguf::OwnedQuantizedModel,
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
) {
bench_log(config, &"Running warmup (CPU)...".yellow().to_string());
for i in 0..config.warmup {
let _ = model.generate_with_cache(prompt_tokens, gen_config);
bench_log_iter(config, i, Duration::ZERO, None);
}
bench_log_done(config);
}
#[cfg(feature = "inference")]
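/// Timed CPU measurement loop mirroring the GPU variant: returns
/// per-iteration latencies, total generated tokens, and an approximate
/// time-to-first-token.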
fn run_cpu_measurement(
model: &realizar::gguf::OwnedQuantizedModel,
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
config: &BenchConfig,
tracer: &TracerImpl,
) -> (Vec<Duration>, usize, Duration) {
bench_log(config, &"Running benchmark (CPU)...".yellow().to_string());
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let budget_us = config.max_tokens as u64 * 100_000;
for i in 0..config.iterations {
let traced = tracer.trace("bench_cpu_iter", budget_us, || {
// A failed generation yields an empty output and thus zero counted tokens.
model
.generate_with_cache(prompt_tokens, gen_config)
.unwrap_or_default()
});
let output = traced.result;
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
if i == 0 {
// Approximate time-to-first-token as the mean per-token latency of the
// first iteration.
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
}
bench_log_iter(config, i, iter_time, Some(tokens_generated));
}
bench_log_done(config);
(iteration_times, total_tokens, first_token_time)
}
#[cfg(feature = "inference")]
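/// Aggregates per-iteration latencies into throughput, mean/median/std-dev
/// statistics, and a pass/fail verdict against a 10 tokens/sec threshold.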
fn calculate_benchmark_stats(
iteration_times: Vec<Duration>,
total_tokens: usize,
first_token_time: Duration,
config: &BenchConfig,
) -> Result<BenchResult> {
// Guard against a zero-iteration run so the statistics below cannot panic.
if iteration_times.is_empty() || config.iterations == 0 {
return Err(CliError::ValidationFailed(
"benchmark recorded no iterations".to_string(),
));
}
let total_time: Duration = iteration_times.iter().sum();
let tokens_per_second = if total_tokens == 0 || total_time.as_secs_f64() <= 0.0 {
0.0
} else {
total_tokens as f64 / total_time.as_secs_f64()
};
let mean_time = total_time / config.iterations as u32;
let mut sorted_times = iteration_times.clone();
sorted_times.sort();
// Median: average of the two middle samples for even counts, middle sample otherwise.
let median_time = if config.iterations % 2 == 0 && config.iterations >= 2 {
let mid = config.iterations / 2;
(sorted_times[mid - 1] + sorted_times[mid]) / 2
} else {
sorted_times[config.iterations / 2]
};
let mean_ms = mean_time.as_secs_f64() * 1000.0;
// Population variance of the per-iteration latencies, in milliseconds squared.
let variance: f64 = iteration_times
.iter()
.map(|t| {
let diff = t.as_secs_f64() * 1000.0 - mean_ms;
diff * diff
})
.sum::<f64>()
/ config.iterations as f64;
let std_dev = Duration::from_secs_f64(variance.sqrt() / 1000.0);
// Pass threshold: sustained throughput of at least 10 tokens per second.
let passed = tokens_per_second >= 10.0;
Ok(BenchResult {
total_tokens,
total_time,
tokens_per_second,
time_to_first_token: first_token_time,
iteration_times,
mean_time,
median_time,
std_dev,
passed,
})
}