impl HttpBenchmarkRunner {
#[must_use]
/// Creates a runner for the given benchmark configuration.
///
/// The HTTP client uses a fixed 120-second request timeout; the outlier
/// detector is seeded from `config.outlier_k_factor`.
pub fn new(config: HttpBenchmarkConfig) -> Self {
    Self {
        // Read the k-factor before `config` is moved into the struct.
        outlier_detector: OutlierDetector::new(config.outlier_k_factor),
        // 120 s: generous enough for slow cold-start inference requests.
        client: ModelHttpClient::with_timeout(120),
        preflight_runner: None,
        config,
    }
}
#[must_use]
/// Creates a runner using the default benchmark configuration.
pub fn with_defaults() -> Self {
    let config = HttpBenchmarkConfig::default();
    Self::new(config)
}
#[must_use]
/// Creates a runner using the relaxed (looser convergence) configuration.
pub fn with_relaxed() -> Self {
    let config = HttpBenchmarkConfig::relaxed();
    Self::new(config)
}
#[must_use]
/// Creates a runner using the reproducible (deterministic-settings) configuration.
pub fn with_reproducible() -> Self {
    let config = HttpBenchmarkConfig::reproducible();
    Self::new(config)
}
/// Shared preflight driver: probes the server's health endpoint, records the
/// outcome on an availability check, runs the preflight suite, and stores the
/// runner so passed checks can be reported later.
///
/// # Errors
/// Returns `RealizarError::ConnectionError` when the preflight run fails.
fn run_preflight_impl(
    &mut self,
    base_url: &str,
    default_port: u16,
    make_check: fn(u16) -> ServerAvailabilityCheck,
    health_check: impl FnOnce(&ModelHttpClient, &str) -> Result<bool>,
) -> Result<Vec<String>> {
    // Take the port from the last ':'-separated segment of the URL.
    // NOTE(review): a URL with a path (e.g. "host:8080/v1") makes the parse
    // fail and falls back to `default_port` — confirm callers pass bare
    // scheme://host:port URLs.
    let trimmed = base_url.trim_end_matches('/');
    let port = trimmed
        .rsplit(':')
        .next()
        .and_then(|segment| segment.parse::<u16>().ok())
        .unwrap_or(default_port);

    // Map the health probe outcome onto an HTTP-like status code:
    // healthy -> 200, unhealthy -> 500, unreachable -> 0.
    let mut server_check = make_check(port);
    let status = match health_check(&self.client, base_url) {
        Ok(true) => 200,
        Ok(false) => 500,
        Err(_) => 0,
    };
    server_check.set_health_status(status);

    let mut runner = PreflightRunner::new();
    runner.add_check(Box::new(server_check));
    let passed = runner
        .run()
        .map_err(|e| RealizarError::ConnectionError(format!("Preflight failed: {}", e)))?;
    // Keep the runner so `preflight_checks_passed` can report results later.
    self.preflight_runner = Some(runner);
    Ok(passed)
}
/// Runs preflight checks against a llama.cpp server at `base_url`.
///
/// # Errors
/// Returns an error when the preflight run fails.
pub fn run_preflight_llamacpp(&mut self, base_url: &str) -> Result<Vec<String>> {
    // llama.cpp's conventional default HTTP port.
    let fallback_port = 8080;
    self.run_preflight_impl(
        base_url,
        fallback_port,
        ServerAvailabilityCheck::llama_cpp,
        |http, url| http.health_check_openai(url),
    )
}
/// Runs preflight checks against an Ollama server at `base_url`.
///
/// # Errors
/// Returns an error when the preflight run fails.
pub fn run_preflight_ollama(&mut self, base_url: &str) -> Result<Vec<String>> {
    // Ollama's conventional default HTTP port.
    let fallback_port = 11434;
    self.run_preflight_impl(
        base_url,
        fallback_port,
        ServerAvailabilityCheck::ollama,
        |http, url| http.health_check_ollama(url),
    )
}
#[must_use]
/// Returns the names of preflight checks that passed on the most recent run,
/// or an empty list when no preflight has been run yet.
pub fn preflight_checks_passed(&self) -> Vec<String> {
    match self.preflight_runner.as_ref() {
        Some(runner) => runner.passed_checks().to_vec(),
        None => Vec::new(),
    }
}
/// Core warmup/measure loop shared by all backends.
///
/// Fires `warmup_iterations` throwaway requests, then measures up to
/// `max_samples()` requests, stopping early once the CV stopping criterion is
/// satisfied. Optionally filters latency outliers before computing results.
///
/// # Errors
/// Propagates the first failed request from the measurement phase; warmup
/// errors are deliberately discarded.
fn benchmark_measure_loop(
&mut self,
mut send_request: impl FnMut(&ModelHttpClient) -> Result<InferenceTiming>,
) -> Result<HttpBenchmarkResult> {
let mut latencies = Vec::with_capacity(self.config.max_samples());
let mut throughputs = Vec::with_capacity(self.config.max_samples());
let mut cold_start_ms = 0.0;
// Warmup phase: results (including errors) are intentionally dropped.
for _ in 0..self.config.warmup_iterations {
drop(send_request(&self.client));
}
// Measurement phase.
for i in 0..self.config.max_samples() {
let timing = send_request(&self.client)?;
latencies.push(timing.total_time_ms);
// Only record throughput when the server reported generated tokens,
// avoiding a division that would yield 0 tok/s for empty responses.
if timing.tokens_generated > 0 {
let tps = (timing.tokens_generated as f64) / (timing.total_time_ms / 1000.0);
throughputs.push(tps);
}
// The first measured (post-warmup) sample is reported as cold start.
if i == 0 {
cold_start_ms = timing.total_time_ms;
}
// Early exit once latency variability (CV) has converged.
if let StopDecision::Stop(_) = self.config.cv_criterion.should_stop(&latencies) {
break;
}
}
// Optional outlier pass: count flagged samples and build the filtered
// series. NOTE(review): `detect` and `filter` each traverse the data; if
// `filter` re-runs detection internally this does the work twice.
let (filtered_latencies, outliers_detected, outliers_excluded) =
if self.config.filter_outliers {
let outliers = self.outlier_detector.detect(&latencies);
let outlier_count = outliers.iter().filter(|&&x| x).count();
let filtered = self.outlier_detector.filter(&latencies);
(filtered, outlier_count, outlier_count)
} else {
(latencies.clone(), 0, 0)
};
self.compute_results_with_quality(
&latencies,
&filtered_latencies,
&throughputs,
cold_start_ms,
outliers_detected,
outliers_excluded,
)
}
/// Benchmarks a llama.cpp server at `base_url` with the configured prompt.
///
/// Runs preflight checks first when `config.run_preflight` is enabled.
///
/// # Errors
/// Returns an error if preflight fails or a measured request fails.
pub fn benchmark_llamacpp(&mut self, base_url: &str) -> Result<HttpBenchmarkResult> {
    if self.config.run_preflight {
        self.run_preflight_llamacpp(base_url)?;
    }
    // The request payload is identical on every iteration, so build it once
    // and let the closure borrow it, instead of cloning the prompt per call.
    let request = CompletionRequest {
        model: "default".to_string(),
        prompt: self.config.prompt.clone(),
        max_tokens: self.config.max_tokens,
        temperature: Some(self.config.temperature),
        stream: false,
    };
    let base = base_url.to_owned();
    self.benchmark_measure_loop(move |client| client.llamacpp_completion(&base, &request))
}
/// Benchmarks an Ollama server at `base_url` for the given `model`.
///
/// Runs preflight checks first when `config.run_preflight` is enabled.
///
/// # Errors
/// Returns an error if preflight fails or a measured request fails.
pub fn benchmark_ollama(&mut self, base_url: &str, model: &str) -> Result<HttpBenchmarkResult> {
    if self.config.run_preflight {
        self.run_preflight_ollama(base_url)?;
    }
    // The request payload is identical on every iteration, so build it once
    // and let the closure borrow it, instead of cloning model/prompt per call.
    let request = OllamaRequest {
        model: model.to_owned(),
        prompt: self.config.prompt.clone(),
        stream: false,
        options: Some(OllamaOptions {
            num_predict: Some(self.config.max_tokens),
            temperature: Some(self.config.temperature),
        }),
    };
    let base = base_url.to_owned();
    self.benchmark_measure_loop(move |client| client.ollama_generate(&base, &request))
}
#[cfg(test)]
/// Test-only helper: coefficient of variation (sample std dev / mean).
///
/// Returns `f64::MAX` when fewer than two samples exist or the mean is
/// (effectively) zero, signalling "not converged".
fn calculate_cv(samples: &[f64]) -> f64 {
    let count = samples.len();
    if count < 2 {
        return f64::MAX;
    }
    let n = count as f64;
    let mean = samples.iter().sum::<f64>() / n;
    if mean.abs() < f64::EPSILON {
        return f64::MAX;
    }
    // Bessel-corrected (n - 1) sample variance.
    let sum_sq: f64 = samples.iter().map(|x| (x - mean) * (x - mean)).sum();
    (sum_sq / (n - 1.0)).sqrt() / mean
}
/// Assembles the final benchmark result plus quality metrics from the raw
/// and outlier-filtered latency series.
///
/// Statistics are computed over the filtered series; when filtering removed
/// every sample, the raw series is used instead.
///
/// # Errors
/// Returns `RealizarError::InferenceError` when no samples were collected.
fn compute_results_with_quality(
    &self,
    raw_latencies: &[f64],
    filtered_latencies: &[f64],
    throughputs: &[f64],
    cold_start_ms: f64,
    outliers_detected: usize,
    outliers_excluded: usize,
) -> Result<HttpBenchmarkResult> {
    // Fall back to raw samples when filtering removed everything.
    let samples = if filtered_latencies.is_empty() {
        raw_latencies
    } else {
        filtered_latencies
    };
    if samples.is_empty() {
        return Err(RealizarError::InferenceError(
            "No valid samples collected".to_string(),
        ));
    }
    let (mean_latency_ms, std_dev_ms, cv, p50_latency_ms, p99_latency_ms) =
        Self::latency_stats(samples);
    let cv_converged = cv < self.config.cv_threshold();
    let quality_metrics = QualityMetrics {
        cv_at_stop: cv,
        cv_converged,
        outliers_detected,
        outliers_excluded,
        preflight_checks_passed: self.preflight_checks_passed(),
    };
    Ok(HttpBenchmarkResult {
        latency_samples: raw_latencies.to_vec(),
        latency_samples_filtered: filtered_latencies.to_vec(),
        mean_latency_ms,
        p50_latency_ms,
        p99_latency_ms,
        std_dev_ms,
        cv_at_stop: cv,
        throughput_tps: Self::mean_throughput(throughputs),
        cold_start_ms,
        sample_count: raw_latencies.len(),
        filtered_sample_count: filtered_latencies.len(),
        cv_converged,
        quality_metrics,
    })
}
/// Computes `(mean, std_dev, cv, p50, p99)` over a latency series (ms).
///
/// Empty input yields zeros for mean/std-dev/percentiles; CV is `f64::MAX`
/// whenever the mean is (effectively) zero. Variance uses the Bessel-corrected
/// `n - 1` denominator; percentiles are nearest-rank on the sorted copy.
fn latency_stats(latencies: &[f64]) -> (f64, f64, f64, f64, f64) {
    let count = latencies.len();
    let n = count as f64;
    let mean = if count > 0 {
        latencies.iter().sum::<f64>() / n
    } else {
        0.0
    };
    let variance = if count > 1 {
        latencies.iter().map(|x| (x - mean) * (x - mean)).sum::<f64>() / (n - 1.0)
    } else {
        0.0
    };
    let std_dev = variance.sqrt();
    // A (near-)zero mean makes CV meaningless; report MAX ("not converged").
    let cv = if mean.abs() <= f64::EPSILON {
        f64::MAX
    } else {
        std_dev / mean
    };
    let mut ranked = latencies.to_vec();
    ranked.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p50 = ranked.get(count / 2).copied().unwrap_or(0.0);
    let p99_rank = (count * 99 / 100).min(count.saturating_sub(1));
    let p99 = ranked.get(p99_rank).copied().unwrap_or(0.0);
    (mean, std_dev, cv, p50, p99)
}
/// Arithmetic mean of the throughput samples; 0.0 when none were recorded.
fn mean_throughput(throughputs: &[f64]) -> f64 {
    match throughputs.len() {
        0 => 0.0,
        len => throughputs.iter().sum::<f64>() / len as f64,
    }
}
#[cfg(test)]
/// Test-only helper: builds a result directly from unfiltered samples,
/// mirroring `compute_results_with_quality` without outlier bookkeeping
/// (raw and filtered series are the same data; outlier counts are zero).
fn compute_results(
    latencies: &[f64],
    throughputs: &[f64],
    cold_start_ms: f64,
    cv_threshold: f64,
) -> HttpBenchmarkResult {
    let (mean_latency_ms, std_dev_ms, cv, p50_latency_ms, p99_latency_ms) =
        Self::latency_stats(latencies);
    let cv_converged = cv < cv_threshold;
    let quality_metrics = QualityMetrics {
        cv_at_stop: cv,
        cv_converged,
        outliers_detected: 0,
        outliers_excluded: 0,
        preflight_checks_passed: Vec::new(),
    };
    HttpBenchmarkResult {
        latency_samples: latencies.to_vec(),
        latency_samples_filtered: latencies.to_vec(),
        mean_latency_ms,
        p50_latency_ms,
        p99_latency_ms,
        std_dev_ms,
        cv_at_stop: cv,
        throughput_tps: Self::mean_throughput(throughputs),
        cold_start_ms,
        sample_count: latencies.len(),
        filtered_sample_count: latencies.len(),
        cv_converged,
        quality_metrics,
    }
}
}