use std::time::Instant;
use reqwest::blocking::Client;
use serde::{Deserialize, Serialize};
use crate::bench_preflight::{
    canonical_inputs, CvStoppingCriterion, OutlierDetector, PreflightRunner, QualityMetrics,
    ServerAvailabilityCheck, StopDecision,
};
use crate::error::{RealizarError, Result};
/// Request body for OpenAI-compatible `/v1/completions` endpoints.
#[derive(Debug, Clone, Serialize)]
pub struct CompletionRequest {
    pub model: String,
    pub prompt: String,
    pub max_tokens: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    pub stream: bool,
}
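
// A minimal serialization sketch for `CompletionRequest`. The model name and
// prompt are placeholders, not values used elsewhere in this crate; the point
// is that with `temperature: None`, serde omits the field entirely.
#[cfg(test)]
mod completion_request_shape {
    use super::*;

    #[test]
    fn serializes_without_temperature_when_none() {
        let req = CompletionRequest {
            model: "example-model".to_string(),
            prompt: "Hello".to_string(),
            max_tokens: 16,
            temperature: None,
            stream: false,
        };
        let json = serde_json::to_string(&req).expect("serialization should succeed");
        assert!(!json.contains("temperature"));
        assert!(json.contains("\"max_tokens\":16"));
    }
}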
/// Response body from OpenAI-compatible `/v1/completions` endpoints.
#[derive(Debug, Clone, Deserialize)]
pub struct CompletionResponse {
    pub id: String,
    pub choices: Vec<CompletionChoice>,
    pub usage: Option<UsageStats>,
}

/// A single completion choice within a `CompletionResponse`.
#[derive(Debug, Clone, Deserialize)]
pub struct CompletionChoice {
    pub text: String,
    pub finish_reason: Option<String>,
}

/// Token accounting reported by OpenAI-compatible servers.
#[derive(Debug, Clone, Deserialize)]
pub struct UsageStats {
    pub prompt_tokens: usize,
    pub completion_tokens: usize,
    pub total_tokens: usize,
}
/// Request body for Ollama's `/api/generate` endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaRequest {
    pub model: String,
    pub prompt: String,
    pub stream: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub options: Option<OllamaOptions>,
}

/// Generation options forwarded to Ollama under the `options` key.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaOptions {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_predict: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
}
/// Non-streaming response from Ollama's `/api/generate` endpoint.
/// All `*_duration` fields are reported by Ollama in nanoseconds.
#[derive(Debug, Clone, Deserialize)]
pub struct OllamaResponse {
    pub model: String,
    pub response: String,
    pub done: bool,
    #[serde(default)]
    pub total_duration: u64,
    #[serde(default)]
    pub load_duration: u64,
    #[serde(default)]
    pub prompt_eval_count: usize,
    #[serde(default)]
    pub prompt_eval_duration: u64,
    #[serde(default)]
    pub eval_count: usize,
    #[serde(default)]
    pub eval_duration: u64,
}
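
// Illustrative helper, not part of the original module: since Ollama reports
// durations in nanoseconds, decode throughput follows directly from
// `eval_count` and `eval_duration`.
impl OllamaResponse {
    /// Decode speed in tokens per second, or `None` if no timing was reported.
    pub fn eval_tokens_per_second(&self) -> Option<f64> {
        if self.eval_duration == 0 {
            return None;
        }
        Some(self.eval_count as f64 / (self.eval_duration as f64 / 1_000_000_000.0))
    }
}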
/// Response from llama.cpp's native `/completion` endpoint.
#[derive(Debug, Clone, Deserialize)]
pub struct LlamaCppResponse {
    pub content: String,
    #[serde(default)]
    pub model: String,
    #[serde(default)]
    pub tokens_predicted: usize,
    #[serde(default)]
    pub tokens_evaluated: usize,
    #[serde(default)]
    pub stop: bool,
    #[serde(default)]
    pub timings: Option<LlamaCppTimings>,
}

/// Timing block optionally included in llama.cpp responses.
#[derive(Debug, Clone, Deserialize)]
pub struct LlamaCppTimings {
    #[serde(default)]
    pub prompt_n: usize,
    #[serde(default)]
    pub prompt_ms: f64,
    #[serde(default)]
    pub predicted_n: usize,
    #[serde(default)]
    pub predicted_ms: f64,
    #[serde(default)]
    pub predicted_per_second: f64,
}
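
// Illustrative helper, not part of the original module: prompt-processing
// speed in tokens per second, complementing the decode speed llama.cpp
// already reports in `predicted_per_second`.
impl LlamaCppTimings {
    /// Prompt-evaluation speed in tokens per second, or `None` if untimed.
    pub fn prompt_tokens_per_second(&self) -> Option<f64> {
        if self.prompt_ms <= 0.0 {
            return None;
        }
        Some(self.prompt_n as f64 / (self.prompt_ms / 1000.0))
    }
}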
/// Timing captured for a single inference request. For the non-streaming
/// requests issued below, `ttft_ms` measures time until the response headers
/// arrive, an approximation of true time-to-first-token.
#[derive(Debug, Clone)]
pub struct InferenceTiming {
    pub ttft_ms: f64,
    pub total_time_ms: f64,
    pub tokens_generated: usize,
    pub text: String,
}
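
// Illustrative convenience method, not part of the original module: overall
// generation throughput derived from the captured timing fields.
impl InferenceTiming {
    /// Overall throughput in tokens per second; zero if no time elapsed.
    pub fn tokens_per_second(&self) -> f64 {
        if self.total_time_ms <= 0.0 {
            return 0.0;
        }
        self.tokens_generated as f64 / (self.total_time_ms / 1000.0)
    }
}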
/// Blocking HTTP client for querying local model servers (OpenAI-compatible,
/// Ollama, and llama.cpp) and timing their responses.
pub struct ModelHttpClient {
    client: Client,
    timeout_secs: u64,
}

impl Default for ModelHttpClient {
    fn default() -> Self {
        Self::new()
    }
}

impl ModelHttpClient {
    /// Create a client with the default 60-second request timeout.
    #[must_use]
    pub fn new() -> Self {
        Self::with_timeout(60)
    }

    /// Create a client with a custom request timeout in seconds.
    #[must_use]
    pub fn with_timeout(timeout_secs: u64) -> Self {
        Self {
            client: Client::builder()
                .timeout(std::time::Duration::from_secs(timeout_secs))
                .build()
                .expect("Failed to create HTTP client"),
            timeout_secs,
        }
    }

    /// The configured request timeout in seconds.
    #[must_use]
    pub fn timeout_secs(&self) -> u64 {
        self.timeout_secs
    }
    /// Run a non-streaming completion against an OpenAI-compatible server and
    /// time it. `api_key`, when provided, is sent as a bearer token.
    pub fn openai_completion(
        &self,
        base_url: &str,
        request: &CompletionRequest,
        api_key: Option<&str>,
    ) -> Result<InferenceTiming> {
        let url = format!("{}/v1/completions", base_url.trim_end_matches('/'));
        let start = Instant::now();
        let mut req_builder = self.client.post(&url).json(request);
        if let Some(key) = api_key {
            req_builder = req_builder.header("Authorization", format!("Bearer {}", key));
        }
        let response = req_builder
            .send()
            .map_err(|e| RealizarError::ConnectionError(format!("HTTP request failed: {}", e)))?;
        // `send()` returns once headers arrive, so this is a TTFB-style approximation.
        let ttft_ms = start.elapsed().as_secs_f64() * 1000.0;
        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().unwrap_or_default();
            return Err(RealizarError::ConnectionError(format!(
                "HTTP {} from {}: {}",
                status, url, body
            )));
        }
        let completion: CompletionResponse =
            response.json().map_err(|e| RealizarError::FormatError {
                reason: format!("Failed to parse completion response: {}", e),
            })?;
        let total_time_ms = start.elapsed().as_secs_f64() * 1000.0;
        let text = completion
            .choices
            .first()
            .map(|c| c.text.clone())
            .unwrap_or_default();
        // Fall back to zero if the server omitted usage accounting.
        let tokens_generated = completion.usage.map_or(0, |u| u.completion_tokens);
        Ok(InferenceTiming {
            ttft_ms,
            total_time_ms,
            tokens_generated,
            text,
        })
    }
    /// Run a non-streaming generation against Ollama's `/api/generate` and time it.
    pub fn ollama_generate(
        &self,
        base_url: &str,
        request: &OllamaRequest,
    ) -> Result<InferenceTiming> {
        let url = format!("{}/api/generate", base_url.trim_end_matches('/'));
        let start = Instant::now();
        let response = self
            .client
            .post(&url)
            .json(request)
            .send()
            .map_err(|e| RealizarError::ConnectionError(format!("HTTP request failed: {}", e)))?;
        let ttft_ms = start.elapsed().as_secs_f64() * 1000.0;
        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().unwrap_or_default();
            return Err(RealizarError::ConnectionError(format!(
                "HTTP {} from {}: {}",
                status, url, body
            )));
        }
        let ollama_resp: OllamaResponse =
            response.json().map_err(|e| RealizarError::FormatError {
                reason: format!("Failed to parse Ollama response: {}", e),
            })?;
        let total_time_ms = start.elapsed().as_secs_f64() * 1000.0;
        // Prefer the server-reported token count; otherwise estimate at roughly
        // four characters per token, clamped to at least one.
        let tokens_generated = if ollama_resp.eval_count > 0 {
            ollama_resp.eval_count
        } else {
            (ollama_resp.response.len() / 4).max(1)
        };
        Ok(InferenceTiming {
            ttft_ms,
            total_time_ms,
            tokens_generated,
            text: ollama_resp.response,
        })
    }
    /// Run a completion against llama.cpp's native `/completion` endpoint and
    /// time it. Only the prompt, token limit, and temperature of `request` are
    /// used; the body is rebuilt in llama.cpp's native shape.
    pub fn llamacpp_completion(
        &self,
        base_url: &str,
        request: &CompletionRequest,
    ) -> Result<InferenceTiming> {
        let url = format!("{}/completion", base_url.trim_end_matches('/'));
        let start = Instant::now();
        // llama.cpp expects `n_predict` rather than `max_tokens`; 0.8 mirrors
        // its default sampling temperature.
        let body = serde_json::json!({
            "prompt": request.prompt,
            "n_predict": request.max_tokens,
            "temperature": request.temperature.unwrap_or(0.8),
            "stream": false
        });
        let response = self
            .client
            .post(&url)
            .json(&body)
            .send()
            .map_err(|e| RealizarError::ConnectionError(format!("HTTP request failed: {}", e)))?;
        let ttft_ms = start.elapsed().as_secs_f64() * 1000.0;
        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().unwrap_or_default();
            return Err(RealizarError::ConnectionError(format!(
                "HTTP {} from {}: {}",
                status, url, body
            )));
        }
        let llama_resp: LlamaCppResponse =
            response.json().map_err(|e| RealizarError::FormatError {
                reason: format!("Failed to parse llama.cpp response: {}", e),
            })?;
        let total_time_ms = start.elapsed().as_secs_f64() * 1000.0;
        Ok(InferenceTiming {
            ttft_ms,
            total_time_ms,
            tokens_generated: llama_resp.tokens_predicted,
            text: llama_resp.content,
        })
    }
    /// Check whether an OpenAI-compatible server responds on `/v1/models`.
    pub fn health_check_openai(&self, base_url: &str) -> Result<bool> {
        let url = format!("{}/v1/models", base_url.trim_end_matches('/'));
        let response = self
            .client
            .get(&url)
            .send()
            .map_err(|e| RealizarError::ConnectionError(format!("Health check failed: {}", e)))?;
        Ok(response.status().is_success())
    }

    /// Check whether an Ollama server responds on `/api/tags`.
    pub fn health_check_ollama(&self, base_url: &str) -> Result<bool> {
        let url = format!("{}/api/tags", base_url.trim_end_matches('/'));
        let response = self
            .client
            .get(&url)
            .send()
            .map_err(|e| RealizarError::ConnectionError(format!("Health check failed: {}", e)))?;
        Ok(response.status().is_success())
    }
}
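
// A usage sketch for `ModelHttpClient` against a local Ollama server. The URL,
// model name, and prompt are assumptions for illustration; the test is ignored
// because it needs a live server.
#[cfg(test)]
mod client_usage_example {
    use super::*;

    #[test]
    #[ignore = "requires a running Ollama server on localhost:11434"]
    fn ollama_roundtrip() {
        let client = ModelHttpClient::with_timeout(120);
        let request = OllamaRequest {
            model: "llama3".to_string(),
            prompt: "Why is the sky blue?".to_string(),
            stream: false,
            options: Some(OllamaOptions {
                num_predict: Some(64),
                temperature: Some(0.0),
            }),
        };
        let timing = client
            .ollama_generate("http://localhost:11434", &request)
            .expect("request should succeed against a live server");
        assert!(timing.tokens_generated > 0);
        println!(
            "ttft={:.1}ms total={:.1}ms tokens={}",
            timing.ttft_ms, timing.total_time_ms, timing.tokens_generated
        );
    }
}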
include!("mod_http_benchmark.rs");
include!("benchmark_runner.rs");
include!("mod_04.rs");