#![allow(dead_code)]
#![allow(unused_imports)]
#![allow(unused_variables)]
#![allow(clippy::needless_return)]
#![allow(clippy::format_push_string)]
#![allow(clippy::map_unwrap_or)]
#![allow(clippy::if_not_else)]
#![allow(clippy::disallowed_methods)]
#![allow(clippy::redundant_closure_for_method_calls)]
#![allow(clippy::inefficient_to_string)]
use crate::error::{CliError, Result};
use colored::Colorize;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
/// Runtime configuration for the inference HTTP server.
///
/// Built from CLI flags; see `Default` for the local-development defaults.
#[derive(Debug, Clone)]
pub struct ServerConfig {
    /// TCP port to listen on.
    pub port: u16,
    /// Interface/host to bind, e.g. "127.0.0.1".
    pub host: String,
    /// Whether to enable CORS headers. // NOTE(review): currently unread (dead_code allowed)
    #[allow(dead_code)]
    pub cors: bool,
    /// Request timeout in seconds. // NOTE(review): currently unread (dead_code allowed)
    #[allow(dead_code)]
    pub timeout_secs: u64,
    /// Maximum concurrent in-flight requests. // NOTE(review): currently unread
    #[allow(dead_code)]
    pub max_concurrent: usize,
    /// Whether the Prometheus metrics endpoint is enabled.
    pub metrics: bool,
    /// Force-disable GPU usage. // NOTE(review): coexists with `gpu` below — confirm precedence at the flag-parsing site
    #[allow(dead_code)]
    pub no_gpu: bool,
    /// Request GPU acceleration.
    pub gpu: bool,
    /// Enable batched inference.
    pub batch: bool,
    /// Enable request tracing.
    pub trace: bool,
    /// Tracing verbosity level (default "basic").
    pub trace_level: String,
    /// Enable profiling instrumentation.
    pub profile: bool,
    /// Verbose logging.
    pub verbose: bool,
    /// Optional inference backend override.
    pub backend: Option<String>,
    /// Optional OTLP collector endpoint for trace export.
    pub otlp_endpoint: Option<String>,
    /// Model context window length in tokens (default 4096).
    pub context_length: usize,
    /// Disable the FP8 KV cache.
    pub no_fp8_cache: bool,
}
impl Default for ServerConfig {
    /// Conservative local-development defaults: loopback on port 8080,
    /// GPU/batching/tracing/profiling all off, 4k-token context window.
    fn default() -> Self {
        Self {
            host: String::from("127.0.0.1"),
            port: 8080,
            cors: true,
            timeout_secs: 30,
            max_concurrent: 10,
            metrics: true,
            no_gpu: false,
            gpu: false,
            batch: false,
            trace: false,
            trace_level: String::from("basic"),
            profile: false,
            verbose: false,
            backend: None,
            otlp_endpoint: None,
            context_length: 4096,
            no_fp8_cache: false,
        }
    }
}
impl ServerConfig {
    /// Test helper: returns the config with `port` replaced.
    #[cfg(test)]
    pub(crate) fn with_port(mut self, port: u16) -> Self {
        self.port = port;
        self
    }

    /// Test helper: returns the config with `host` replaced.
    #[cfg(test)]
    pub(crate) fn with_host(mut self, host: impl Into<String>) -> Self {
        self.host = host.into();
        self
    }

    /// Socket address string ("host:port") the listener binds to.
    pub(super) fn bind_addr(&self) -> String {
        let Self { host, port, .. } = self;
        format!("{host}:{port}")
    }
}
/// Lock-free request/usage counters for the server, shared across handlers
/// via `Arc` (see `ServerMetrics::new`). All counters use relaxed atomics.
#[derive(Debug, Default)]
pub struct ServerMetrics {
    /// Total HTTP requests observed (success + client error + server error).
    pub requests_total: AtomicU64,
    /// Requests that completed successfully (2xx).
    pub requests_success: AtomicU64,
    /// Requests rejected with a client error (4xx).
    pub requests_client_error: AtomicU64,
    /// Requests that failed with a server error (5xx).
    pub requests_server_error: AtomicU64,
    /// Total tokens generated across all requests.
    pub tokens_generated: AtomicU64,
    /// Cumulative inference time, in milliseconds.
    pub inference_time_ms: AtomicU64,
    /// Resident model memory, in bytes (reported as a gauge).
    pub model_memory_bytes: AtomicU64,
    // Set exactly once in `new()`; basis for `uptime_seconds()`.
    start_time: std::sync::OnceLock<Instant>,
}
impl ServerMetrics {
    /// Creates a freshly-zeroed metrics instance wrapped in `Arc` and stamps
    /// the current instant as the server start time.
    pub fn new() -> Arc<Self> {
        let metrics = Arc::new(Self::default());
        // set() only fails if already initialized — impossible on a fresh value.
        let _ = metrics.start_time.set(Instant::now());
        metrics
    }
    /// Records one finished inference request.
    ///
    /// `success == false` is counted as a *server* error (5xx); client errors
    /// (4xx) are recorded separately via [`record_client_error`].
    /// `tokens` and `duration_ms` are added to the running totals.
    pub fn record_request(&self, success: bool, tokens: u64, duration_ms: u64) {
        self.requests_total.fetch_add(1, Ordering::Relaxed);
        if success {
            self.requests_success.fetch_add(1, Ordering::Relaxed);
        } else {
            self.requests_server_error.fetch_add(1, Ordering::Relaxed);
        }
        self.tokens_generated.fetch_add(tokens, Ordering::Relaxed);
        self.inference_time_ms
            .fetch_add(duration_ms, Ordering::Relaxed);
    }
    /// Records a request rejected with a client (4xx) error.
    pub fn record_client_error(&self) {
        self.requests_total.fetch_add(1, Ordering::Relaxed);
        self.requests_client_error.fetch_add(1, Ordering::Relaxed);
    }
    /// Seconds elapsed since `new()`; 0 if the start time was never set.
    pub fn uptime_seconds(&self) -> u64 {
        self.start_time
            .get()
            .map(|t| t.elapsed().as_secs())
            .unwrap_or(0)
    }
    /// Renders all counters/gauges in the Prometheus text exposition format.
    pub fn prometheus_output(&self) -> String {
        let total = self.requests_total.load(Ordering::Relaxed);
        let success = self.requests_success.load(Ordering::Relaxed);
        let client_errors = self.requests_client_error.load(Ordering::Relaxed);
        let server_errors = self.requests_server_error.load(Ordering::Relaxed);
        let tokens = self.tokens_generated.load(Ordering::Relaxed);
        let inference_ms = self.inference_time_ms.load(Ordering::Relaxed);
        let model_bytes = self.model_memory_bytes.load(Ordering::Relaxed);
        let uptime = self.uptime_seconds();
        // NOTE: the raw-string lines below are flush-left on purpose — any
        // indentation would leak into the exposition output.
        format!(
            r#"# HELP apr_requests_total Total number of HTTP requests
# TYPE apr_requests_total counter
apr_requests_total {total}
# HELP apr_requests_success Successful requests (2xx)
# TYPE apr_requests_success counter
apr_requests_success {success}
# HELP apr_requests_client_error Client error requests (4xx)
# TYPE apr_requests_client_error counter
apr_requests_client_error {client_errors}
# HELP apr_requests_server_error Server error requests (5xx)
# TYPE apr_requests_server_error counter
apr_requests_server_error {server_errors}
# HELP apr_tokens_generated_total Total tokens generated
# TYPE apr_tokens_generated_total counter
apr_tokens_generated_total {tokens}
# HELP apr_inference_duration_seconds_total Total inference time in seconds
# TYPE apr_inference_duration_seconds_total counter
apr_inference_duration_seconds_total {:.3}
# HELP apr_memory_bytes Memory usage by type
# TYPE apr_memory_bytes gauge
apr_memory_bytes{{type="model"}} {model_bytes}
# HELP apr_uptime_seconds Server uptime in seconds
# TYPE apr_uptime_seconds gauge
apr_uptime_seconds {uptime}
"#,
            // sole positional argument ({:.3}): total inference time, ms -> s
            inference_ms as f64 / 1000.0
        )
    }
}
/// An OpenAI-style tool definition supplied by the client in a request.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Tool {
    /// Tool kind; serialized as "type". Presumably always "function" — the
    /// only variant this module handles. // NOTE(review): confirm with callers
    #[serde(rename = "type")]
    pub tool_type: String,
    /// The callable function this tool exposes.
    pub function: FunctionDef,
}
/// Declaration of a callable function offered to the model (name plus
/// optional description and JSON-schema parameters).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FunctionDef {
    /// Function name the model should emit when calling this tool.
    pub name: String,
    /// Human-readable description; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Parameter schema as free-form JSON; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parameters: Option<serde_json::Value>,
}
/// A tool invocation emitted by the model (see `extract_tool_call`).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ToolCall {
    /// Unique call id; generated here as "call_<uuid>".
    pub id: String,
    /// Call kind; serialized as "type". Set to "function" by this module.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// The function name and serialized arguments being invoked.
    pub function: FunctionCall,
}
/// The function half of a [`ToolCall`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FunctionCall {
    /// Name of the function being called.
    pub name: String,
    /// Arguments as a JSON-encoded string (OpenAI wire convention), not a
    /// parsed object.
    pub arguments: String,
}
/// Client's `tool_choice` field: either a mode string (e.g. "auto"/"none")
/// or a specific function selection. `untagged` lets serde accept both
/// JSON shapes without a discriminator.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(untagged)]
pub enum ToolChoice {
    /// Bare mode string, e.g. "auto" or "none".
    Mode(String),
    /// Explicit selection of a single function.
    Function {
        /// Serialized as "type".
        #[serde(rename = "type")]
        tool_type: String,
        function: ToolChoiceFunction,
    },
}
/// Function reference inside [`ToolChoice::Function`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ToolChoiceFunction {
    /// Name of the function the client forces the model to call.
    pub name: String,
}
/// A single chat message (request or response side). Optional fields are
/// omitted from JSON when `None`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ChatMessage {
    /// Message role, e.g. "system" / "user" / "assistant" / "tool".
    pub role: String,
    /// Text content; may be absent for tool-call-only assistant messages.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Tool calls emitted by the assistant, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_calls: Option<Vec<ToolCall>>,
    /// For role "tool": id of the call this message responds to.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Optional participant/function name.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
/// Request body for the chat-completions endpoint (OpenAI-compatible subset).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ChatCompletionRequest {
    /// Model identifier; defaults to "" when omitted from the JSON.
    #[serde(default)]
    pub model: String,
    /// Conversation messages.
    pub messages: Vec<ChatMessage>,
    /// Tools the model may call; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<Tool>>,
    /// How the model should pick a tool; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,
    /// Generation cap in tokens; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<u32>,
    /// Stream the response as server-sent events; defaults to false.
    #[serde(default)]
    pub stream: bool,
    /// Sampling temperature; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// Nucleus-sampling cutoff; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,
}
/// Response body for the chat-completions endpoint (OpenAI-compatible).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ChatCompletionResponse {
    /// Response identifier.
    pub id: String,
    /// Object tag, e.g. "chat.completion". // NOTE(review): value set elsewhere — confirm
    pub object: String,
    /// Creation time as a Unix timestamp (seconds).
    pub created: u64,
    /// Model that produced the completion.
    pub model: String,
    /// One or more completion choices.
    pub choices: Vec<ChatChoice>,
    /// Token accounting; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub usage: Option<TokenUsage>,
}
/// A single completion choice within a [`ChatCompletionResponse`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ChatChoice {
    /// Zero-based index of this choice.
    pub index: u32,
    /// The generated assistant message.
    pub message: ChatMessage,
    /// Why generation stopped (e.g. "stop", "length"); serialized as null
    /// when `None`. // NOTE(review): exact values set at the generation site
    pub finish_reason: Option<String>,
}
/// Token accounting for one completion.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[allow(clippy::struct_field_names)]
pub struct TokenUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: u32,
    /// Tokens generated in the completion.
    pub completion_tokens: u32,
    /// prompt_tokens + completion_tokens.
    pub total_tokens: u32,
}
/// Renders the available tools as a prompt section the model can read,
/// ending with instructions on the JSON shape to emit for a tool call.
/// Returns an empty string when no tools are provided.
pub(super) fn format_tools_prompt(tools: &[Tool]) -> String {
    // Project contract macro — defined elsewhere in the crate.
    contract_pre_idempotency_classification!();
    if tools.is_empty() {
        return String::new();
    }
    let mut prompt = String::from("\n\nYou have access to the following tools:\n\n");
    for tool in tools {
        let func = &tool.function;
        prompt.push_str("### ");
        prompt.push_str(&func.name);
        prompt.push('\n');
        if let Some(desc) = &func.description {
            prompt.push_str(desc);
            prompt.push('\n');
        }
        if let Some(params) = &func.parameters {
            prompt.push_str("Parameters: ");
            prompt.push_str(&params.to_string());
            prompt.push('\n');
        }
        prompt.push('\n');
    }
    prompt.push_str("To use a tool, respond with a JSON object in this format:\n");
    prompt.push_str(r#"{"tool_call": {"name": "function_name", "arguments": {...}}}"#);
    prompt.push_str("\n\nIf you don't need to use a tool, respond normally.\n");
    prompt
}
pub(super) fn parse_tool_calls(output: &str) -> Option<Vec<ToolCall>> {
let output_trimmed = output.trim();
if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(output_trimmed) {
if let Some(call) = extract_tool_call(&parsed) {
return Some(vec![call]);
}
}
if let Some(json_str) = find_embedded_tool_json(output_trimmed) {
if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&json_str) {
if let Some(call) = extract_tool_call(&parsed) {
return Some(vec![call]);
}
}
}
None
}
/// Builds a [`ToolCall`] from a parsed JSON value of the shape
/// `{"tool_call": {"name": "...", "arguments": ...}}`.
/// Returns `None` when any required field is missing or has the wrong type.
fn extract_tool_call(parsed: &serde_json::Value) -> Option<ToolCall> {
    // NOTE(review): contract macros are defined elsewhere in the crate;
    // presumed to assert tool-schema invariants — confirm at definition site.
    contract_pre_tool_schema_fidelity!();
    let tool_call = parsed.get("tool_call")?;
    let name = tool_call.get("name")?.as_str()?;
    let arguments = tool_call.get("arguments")?;
    let result = ToolCall {
        // uuid_simple() comes from the include!d helper at the end of this file.
        id: format!("call_{}", uuid_simple()),
        tool_type: "function".to_string(),
        function: FunctionCall {
            name: name.to_string(),
            // Re-serialize arguments to a JSON string (OpenAI wire convention).
            arguments: arguments.to_string(),
        },
    };
    contract_post_tool_schema_fidelity!(&result);
    Some(result)
}
/// Locates a `{"tool_call"...}` JSON object embedded in free-form text and
/// returns the balanced-brace substring starting at that marker.
///
/// Brace depth is only counted *outside* JSON string literals, with
/// backslash-escape handling, so braces inside string values (e.g.
/// `"arguments": {"text": "}"}`) no longer truncate the result — the
/// previous version counted every brace and could return unparsable JSON.
/// Returns `None` if the marker is absent or the braces never balance.
fn find_embedded_tool_json(text: &str) -> Option<String> {
    let start = text.find(r#"{"tool_call""#)?;
    let json_part = &text[start..];
    let mut depth: i32 = 0;
    let mut in_string = false;
    let mut escaped = false;
    for (i, c) in json_part.char_indices() {
        if escaped {
            // Previous char was a backslash inside a string: skip this one.
            escaped = false;
            continue;
        }
        match c {
            '\\' if in_string => escaped = true,
            '"' => in_string = !in_string,
            '{' if !in_string => depth += 1,
            '}' if !in_string => {
                depth -= 1;
                if depth == 0 {
                    // Inclusive slice: `i` is the closing brace's byte index.
                    return Some(json_part[..=i].to_string());
                }
            }
            _ => {}
        }
    }
    None
}
// Pulls in `uuid_simple()`, used above when generating tool-call ids.
include!("types_uuid_simple_server.rs");