use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
/// Parameters for loading a model; the payload of `Request::LoadModel`.
///
/// `alias` and `path` must be present when deserializing. Every other field is
/// marked `#[serde(default)]`, so an omitted field falls back to its type's
/// default (`0`, `false` or `None`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelLoadParams {
/// Alias for the model (required).
/// NOTE(review): presumably the same alias later passed to
/// `Request::UnloadModel` / `Request::SetDefaultModel` — confirm.
pub alias: String,
/// Location of the model (required).
/// NOTE(review): presumably a filesystem path — confirm against the loader.
pub path: String,
/// Defaults to `0` when omitted.
/// NOTE(review): presumably the number of layers offloaded to the GPU — confirm.
#[serde(default)]
pub gpu_layers: i32,
/// Defaults to `0` when omitted.
/// NOTE(review): how a value of `0` is interpreted is decided by the consumer, not here.
#[serde(default)]
pub context_size: u32,
/// Tri-state flag: `None` when omitted, otherwise an explicit `true`/`false`.
#[serde(default)]
pub use_mmap: Option<bool>,
/// Defaults to `false` when omitted.
#[serde(default)]
pub use_mlock: bool,
/// Defaults to `false` when omitted.
#[serde(default)]
pub flash_attn: bool,
/// Free-form string, `None` when omitted; valid values are not checked in this type.
#[serde(default)]
pub cache_type_k: Option<String>,
/// Free-form string, `None` when omitted; valid values are not checked in this type.
#[serde(default)]
pub cache_type_v: Option<String>,
/// Optional override, `None` when omitted.
#[serde(default)]
pub rope_freq_base: Option<f32>,
/// Optional override, `None` when omitted.
#[serde(default)]
pub rope_freq_scale: Option<f32>,
/// Optional override, `None` when omitted.
#[serde(default)]
pub n_batch: Option<u32>,
/// Optional override, `None` when omitted.
#[serde(default)]
pub defrag_thold: Option<f32>,
/// Free-form string, `None` when omitted; valid values are not checked in this type.
#[serde(default)]
pub split_mode: Option<String>,
}
/// Parameters of a chat completion; the payload of `Request::ChatCompletion`.
///
/// Only `messages` is required when deserializing: `max_tokens` falls back to
/// `default_max_tokens()`, and every other field falls back to its type's
/// default (`None`, `false` or an empty `Vec`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionParams {
/// Optional model name.
/// NOTE(review): presumably the default model is used when `None` — confirm in the handler.
pub model: Option<String>,
/// The conversation messages (required).
pub messages: Vec<ChatMessage>,
/// Uses `default_max_tokens()` when omitted.
#[serde(default = "default_max_tokens")]
pub max_tokens: u32,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub temperature: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub top_p: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub top_k: Option<i32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub frequency_penalty: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub presence_penalty: Option<f32>,
/// Defaults to `false` when omitted.
/// NOTE(review): presumably selects `Response::StreamChunk` delivery — confirm.
#[serde(default)]
pub stream: bool,
/// Stop strings; an empty list when omitted.
#[serde(default)]
pub stop: Vec<String>,
/// Optional output format constraint, `None` when omitted.
#[serde(default)]
pub response_format: Option<ResponseFormat>,
/// Optional tool definitions, `None` when omitted.
#[serde(default)]
pub tools: Option<Vec<Tool>>,
/// Optional tool selection, `None` when omitted.
#[serde(default)]
pub tool_choice: Option<ToolChoice>,
/// Optional thinking configuration, `None` when omitted.
#[serde(default)]
pub thinking: Option<ThinkingConfig>,
}
/// Parameters of a plain-text completion; the payload of `Request::Completion`.
///
/// Only `prompt` is required when deserializing: `max_tokens` falls back to
/// `default_max_tokens()`, and every other field falls back to its type's
/// default (`None`, `false` or an empty `Vec`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionParams {
/// Optional model name.
/// NOTE(review): presumably the default model is used when `None` — confirm in the handler.
pub model: Option<String>,
/// The prompt text (required).
pub prompt: String,
/// Uses `default_max_tokens()` when omitted.
#[serde(default = "default_max_tokens")]
pub max_tokens: u32,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub temperature: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub top_p: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub top_k: Option<i32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub frequency_penalty: Option<f32>,
/// Optional sampling parameter, `None` when omitted.
#[serde(default)]
pub presence_penalty: Option<f32>,
/// Defaults to `false` when omitted.
#[serde(default)]
pub stream: bool,
/// Stop strings; an empty list when omitted.
#[serde(default)]
pub stop: Vec<String>,
}
/// A request message.
///
/// Serialized with serde's adjacently tagged representation: the variant name
/// is stored under the `"type"` key and the variant's payload (if any) under
/// the `"data"` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", content = "data")]
pub enum Request {
/// Carries no payload. NOTE(review): presumably answered with `Response::Pong` — confirm.
Ping,
/// Carries no payload. NOTE(review): presumably answered with `Response::Status` — confirm.
Status,
/// Carries no payload. NOTE(review): presumably answered with `Response::Models` — confirm.
ListModels,
/// Load a model described by [`ModelLoadParams`].
LoadModel(ModelLoadParams),
/// Names a model by `alias` for unloading.
UnloadModel { alias: String },
/// Names a model by `alias` to become the default.
SetDefaultModel { alias: String },
/// Chat completion described by [`ChatCompletionParams`].
ChatCompletion(ChatCompletionParams),
/// Text completion described by [`CompletionParams`].
Completion(CompletionParams),
/// Embedding request for one string or a list of strings, with an optional model name.
Embeddings {
model: Option<String>,
input: EmbeddingInput,
},
/// Tokenization request for `text`, with an optional model name.
Tokenize { model: Option<String>, text: String },
/// Names a request by its `request_id` for cancellation.
Cancel { request_id: String },
/// Carries no payload. NOTE(review): presumably answered with `Response::ShuttingDown` — confirm.
Shutdown,
}
/// Serde default for the `max_tokens` fields: used when a request omits the field.
fn default_max_tokens() -> u32 {
    const DEFAULT_MAX_TOKENS: u32 = 512;
    DEFAULT_MAX_TOKENS
}
/// A single chat message, used both in `ChatCompletionParams::messages` and in
/// `ChatChoice::message`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
/// Role of the message author, kept as a free-form string (not validated here).
pub role: String,
/// Message body; when omitted it deserializes to `MessageContent::default()`
/// (an empty text string).
#[serde(default)]
pub content: MessageContent,
/// Optional name; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
/// Optional tool calls attached to the message; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_calls: Option<Vec<ToolCall>>,
/// Optional tool-call identifier; left out of the serialized form when `None`.
/// NOTE(review): presumably links a tool result to `ToolCall::id` — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_call_id: Option<String>,
}
/// Body of a chat message: either a plain string or a list of typed parts.
///
/// The enum is `#[serde(untagged)]`, so no variant tag is written. On
/// deserialization serde tries the variants in declaration order: `Text`
/// first, then `Parts`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MessageContent {
/// Plain text content.
Text(String),
/// A sequence of [`ContentPart`]s (text and/or image references).
Parts(Vec<ContentPart>),
}
impl MessageContent {
    /// Returns the textual content as an owned `String`.
    ///
    /// For `Text` this is a copy of the string. For `Parts` the text of every
    /// `ContentPart::Text` is concatenated in order, separated by a single
    /// space; non-text parts are ignored.
    pub fn text(&self) -> String {
        let parts = match self {
            MessageContent::Text(plain) => return plain.clone(),
            MessageContent::Parts(parts) => parts,
        };

        let mut joined = String::new();
        let mut seen_text = false;
        for part in parts {
            if let ContentPart::Text { text } = part {
                // Separator goes between text parts only, never before the first one.
                if seen_text {
                    joined.push(' ');
                }
                joined.push_str(text);
                seen_text = true;
            }
        }
        joined
    }

    /// Returns `true` when at least one part is a `ContentPart::ImageUrl`.
    /// Plain text content never contains images.
    pub fn has_images(&self) -> bool {
        if let MessageContent::Parts(parts) = self {
            for part in parts {
                if let ContentPart::ImageUrl { .. } = part {
                    return true;
                }
            }
        }
        false
    }

    /// Returns references to every image in the content, in order.
    /// The result is empty for plain text content.
    pub fn images(&self) -> Vec<&ImageUrl> {
        let mut found = Vec::new();
        if let MessageContent::Parts(parts) = self {
            for part in parts {
                if let ContentPart::ImageUrl { image_url } = part {
                    found.push(image_url);
                }
            }
        }
        found
    }
}
impl Default for MessageContent {
fn default() -> Self {
MessageContent::Text(String::new())
}
}
impl From<String> for MessageContent {
fn from(s: String) -> Self {
MessageContent::Text(s)
}
}
impl From<&str> for MessageContent {
fn from(s: &str) -> Self {
MessageContent::Text(s.to_string())
}
}
/// One element of a multi-part message body.
///
/// Internally tagged: the serialized object carries a `"type"` key whose value
/// is `"text"` or `"image_url"`, alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum ContentPart {
/// A text fragment (`"type": "text"`).
#[serde(rename = "text")]
Text { text: String },
/// An image reference (`"type": "image_url"`).
#[serde(rename = "image_url")]
ImageUrl { image_url: ImageUrl },
}
/// Image reference carried by `ContentPart::ImageUrl`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageUrl {
/// The image URL, stored as an unvalidated string.
pub url: String,
/// Optional free-form detail hint; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub detail: Option<String>,
}
/// Input of `Request::Embeddings`: one string or a list of strings.
///
/// The enum is `#[serde(untagged)]`, so no variant tag is written. On
/// deserialization serde tries `Single` first, then `Multiple`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum EmbeddingInput {
/// A single input string.
Single(String),
/// Several input strings.
Multiple(Vec<String>),
}
/// A response message.
///
/// Serialized with serde's adjacently tagged representation: the variant name
/// is stored under the `"type"` key and the variant's payload (if any) under
/// the `"data"` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", content = "data")]
pub enum Response {
/// Carries an uptime in seconds and a version string.
/// NOTE(review): presumably the reply to `Request::Ping` — confirm.
Pong { uptime_secs: u64, version: String },
/// Daemon status snapshot.
Status(DaemonStatus),
/// A list of per-model status entries.
Models(Vec<ModelStatus>),
/// Reports the alias and [`ModelInfo`] of a loaded model.
ModelLoaded { alias: String, info: ModelInfo },
/// Reports the alias of an unloaded model.
ModelUnloaded { alias: String },
/// Reports the alias that was set as default.
DefaultModelSet { alias: String },
/// A complete chat completion result.
ChatCompletion(ChatCompletionResponse),
/// A complete text completion result.
Completion(CompletionResponse),
/// One incremental piece of a streamed generation.
StreamChunk(StreamChunk),
/// Carries the request id and token usage.
/// NOTE(review): presumably marks the end of a stream — confirm.
StreamEnd { request_id: String, usage: Usage },
/// Embedding vectors.
Embeddings(EmbeddingsResponse),
/// Token ids together with a count.
/// NOTE(review): `count` is presumably `tokens.len()`; not enforced by this type.
Tokens { tokens: Vec<i32>, count: usize },
/// Reports the id of a cancelled request.
Cancelled { request_id: String },
/// Carries no payload.
ShuttingDown,
/// An error with a machine-readable code and a human-readable message.
Error {
code: ErrorCode,
message: String,
/// Id of the request the error refers to, if any; left out of the
/// serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
request_id: Option<String>,
},
}
/// Status snapshot; the payload of `Response::Status`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DaemonStatus {
/// Version string.
pub version: String,
/// Uptime in seconds.
pub uptime_secs: u64,
/// Number of loaded models.
pub models_loaded: usize,
/// Default model name, if one is set.
pub default_model: Option<String>,
/// HTTP endpoint, if any. NOTE(review): format of the string is not defined here.
pub http_endpoint: Option<String>,
/// IPC endpoint. NOTE(review): format of the string is not defined here.
pub ipc_endpoint: String,
/// Aggregate counters and memory figures.
pub stats: DaemonStats,
}
/// Aggregate statistics, embedded in [`DaemonStatus`].
///
/// Derives both the serde traits and the rkyv `Archive` traits (with
/// `check_bytes` validation enabled for the archived form). The last four
/// fields are `#[serde(default)]` and fall back to `0`, an empty string or an
/// empty list when missing from serde input.
#[derive(
Debug, Clone, Default, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
#[archive(check_bytes)]
pub struct DaemonStats {
/// Total request count.
pub requests_total: u64,
/// Total generated-token count.
pub tokens_generated: u64,
/// Number of requests currently in flight.
pub active_requests: u32,
/// Memory in use, in megabytes (per the field name).
pub memory_used_mb: u64,
/// Whether a GPU is available.
pub gpu_available: bool,
/// Total memory, in megabytes (per the field name); `0` when omitted.
#[serde(default)]
pub memory_total_mb: u64,
/// Available memory, in megabytes (per the field name); `0` when omitted.
#[serde(default)]
pub memory_available_mb: u64,
/// Free-form memory pressure label; empty when omitted.
/// NOTE(review): the set of values is not defined in this file.
#[serde(default)]
pub memory_pressure: String,
/// Per-model statistics; empty when omitted.
#[serde(default)]
pub model_details: Vec<ModelDetailedStats>,
}
/// Per-model statistics, listed in `DaemonStats::model_details`.
///
/// Derives both the serde traits and the rkyv `Archive` traits (with
/// `check_bytes` validation enabled for the archived form).
#[derive(
Debug, Clone, Default, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
#[archive(check_bytes)]
pub struct ModelDetailedStats {
/// Alias of the model these figures belong to.
pub alias: String,
/// Request count for this model.
pub requests_total: u64,
/// Generated-token count for this model.
pub tokens_generated: u64,
/// Prompt-token count for this model.
pub tokens_prompt: u64,
/// Average generation speed in tokens per second (per the field name).
pub avg_tokens_per_sec: f32,
/// Memory footprint in bytes (per the field name).
pub memory_bytes: u64,
/// Number of requests currently in flight for this model.
pub active_requests: u32,
/// Seconds elapsed since the model was last used (per the field name).
pub last_used_secs_ago: u64,
/// Load duration in milliseconds (per the field name).
pub load_time_ms: u64,
/// NOTE(review): presumably the size of a per-model context/worker pool — confirm.
pub pool_size: usize,
}
/// Status of one model; the element type of `Response::Models`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelStatus {
/// Alias of the model.
pub alias: String,
/// Descriptive information about the model.
pub info: ModelInfo,
/// Whether this model is the default one.
pub is_default: bool,
/// Number of requests currently in flight for this model.
pub active_requests: u32,
}
/// Descriptive information about a model, carried by `Response::ModelLoaded`
/// and [`ModelStatus`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
/// Location of the model, as a string.
pub path: String,
/// Parameter count.
pub parameters: u64,
/// Context size. NOTE(review): presumably measured in tokens — confirm.
pub context_size: u32,
/// Vocabulary size.
pub vocab_size: u32,
/// NOTE(review): presumably the number of layers offloaded to the GPU — confirm.
pub gpu_layers: i32,
/// Quantization label, if known; free-form string.
pub quantization: Option<String>,
}
/// Result of a chat completion; the payload of `Response::ChatCompletion`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionResponse {
/// Identifier of this completion.
pub id: String,
/// Object type label; free-form string whose value is set by the producer.
pub object: String,
/// Creation time. NOTE(review): presumably Unix seconds — confirm at the construction site.
pub created: u64,
/// Name of the model that produced the result.
pub model: String,
/// The generated choices.
pub choices: Vec<ChatChoice>,
/// Token accounting for the request.
pub usage: Usage,
/// Optional thinking output; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub thinking: Option<ThinkingContent>,
}
/// One generated alternative inside a [`ChatCompletionResponse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatChoice {
/// Position of this choice in the response.
pub index: u32,
/// The generated message.
pub message: ChatMessage,
/// Why generation stopped, if reported; free-form string.
pub finish_reason: Option<String>,
}
/// Result of a text completion; the payload of `Response::Completion`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionResponse {
/// Identifier of this completion.
pub id: String,
/// Object type label; free-form string whose value is set by the producer.
pub object: String,
/// Creation time. NOTE(review): presumably Unix seconds — confirm at the construction site.
pub created: u64,
/// Name of the model that produced the result.
pub model: String,
/// The generated choices.
pub choices: Vec<CompletionChoice>,
/// Token accounting for the request.
pub usage: Usage,
}
/// One generated alternative inside a [`CompletionResponse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionChoice {
/// Position of this choice in the response.
pub index: u32,
/// The generated text.
pub text: String,
/// Why generation stopped, if reported; free-form string.
pub finish_reason: Option<String>,
}
/// One incremental piece of a streamed generation; the payload of
/// `Response::StreamChunk`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamChunk {
/// Id of the request this chunk belongs to. Held as `Arc<str>` and converted
/// to/from a plain string by `serialize_arc_str` / `deserialize_arc_str`.
#[serde(
serialize_with = "serialize_arc_str",
deserialize_with = "deserialize_arc_str"
)]
pub request_id: Arc<str>,
/// Index value. NOTE(review): whether this is a choice index or a chunk
/// sequence number is not determinable from this file — confirm.
pub index: u32,
/// Text carried by this chunk.
pub delta: String,
/// Token id associated with this chunk.
pub token_id: i32,
/// Optional thinking marker; left out of the serialized form when `None`.
/// NOTE(review): presumably `Some(true)` flags thinking output — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub thinking: Option<bool>,
/// Optional incremental tool-call data; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_calls: Option<Vec<ToolCallDelta>>,
}
/// Serde `serialize_with` helper: writes an `Arc<str>` as a plain string.
fn serialize_arc_str<S: serde::Serializer>(
    value: &Arc<str>,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    // Borrow the shared string as `&str`; no allocation is needed.
    let text: &str = &**value;
    serializer.serialize_str(text)
}
/// Serde `deserialize_with` helper: reads a string and stores it as `Arc<str>`.
///
/// The value is first deserialized as an owned `String` and then converted,
/// so any deserializer error for `String` is returned unchanged.
fn deserialize_arc_str<'de, D: serde::Deserializer<'de>>(
    deserializer: D,
) -> Result<Arc<str>, D::Error> {
    <String as serde::Deserialize>::deserialize(deserializer).map(Arc::<str>::from)
}
/// Incremental tool-call data carried by `StreamChunk::tool_calls`.
///
/// Every optional field is left out of the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallDelta {
/// Index of the tool call this delta applies to.
pub index: u32,
/// Tool-call id, when present in this delta.
#[serde(skip_serializing_if = "Option::is_none")]
pub id: Option<String>,
/// Call type, serialized under the key `"type"`.
#[serde(rename = "type", skip_serializing_if = "Option::is_none")]
pub call_type: Option<String>,
/// Function name/argument data, when present in this delta.
#[serde(skip_serializing_if = "Option::is_none")]
pub function: Option<FunctionCallDelta>,
}
/// Function portion of a [`ToolCallDelta`].
///
/// Both fields are optional and left out of the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionCallDelta {
/// Function name, when present in this delta.
#[serde(skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
/// Argument text, when present in this delta.
/// NOTE(review): presumably a fragment of a JSON string to be concatenated — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub arguments: Option<String>,
}
/// Token accounting attached to completion, embedding and stream-end responses.
///
/// Derives both the serde traits and the rkyv `Archive` traits (with
/// `check_bytes` validation enabled for the archived form). `Default` yields
/// all-zero counters.
#[derive(
Debug, Clone, Default, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
#[archive(check_bytes)]
pub struct Usage {
/// Number of prompt tokens.
pub prompt_tokens: u32,
/// Number of generated tokens.
pub completion_tokens: u32,
/// Total token count.
/// NOTE(review): presumably `prompt_tokens + completion_tokens`; not enforced by this type.
pub total_tokens: u32,
}
/// Result of an embeddings request; the payload of `Response::Embeddings`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingsResponse {
/// Object type label; free-form string whose value is set by the producer.
pub object: String,
/// The embedding entries.
pub data: Vec<EmbeddingData>,
/// Name of the model that produced the embeddings.
pub model: String,
/// Token accounting for the request.
pub usage: Usage,
}
/// One embedding vector inside an [`EmbeddingsResponse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingData {
/// Object type label; free-form string whose value is set by the producer.
pub object: String,
/// The embedding values.
pub embedding: Vec<f32>,
/// Index of this entry.
/// NOTE(review): presumably the position of the corresponding input — confirm.
pub index: u32,
}
/// Machine-readable error category carried by `Response::Error`.
///
/// With `rename_all = "snake_case"` each variant is serialized by serde as its
/// snake_case name (for example `ModelNotFound` becomes `"model_not_found"`).
/// The type is `Copy` and comparable, and also derives the rkyv `Archive`
/// traits with `check_bytes` validation enabled.
#[derive(
Debug,
Clone,
Copy,
Serialize,
Deserialize,
PartialEq,
Eq,
Archive,
RkyvSerialize,
RkyvDeserialize,
)]
#[archive(check_bytes)]
#[serde(rename_all = "snake_case")]
pub enum ErrorCode {
/// Serialized as `"model_not_found"`.
ModelNotFound,
/// Serialized as `"model_load_failed"`.
ModelLoadFailed,
/// Serialized as `"no_default_model"`.
NoDefaultModel,
/// Serialized as `"invalid_request"`.
InvalidRequest,
/// Serialized as `"generation_failed"`.
GenerationFailed,
/// Serialized as `"cancelled"`.
Cancelled,
/// Serialized as `"rate_limited"`.
RateLimited,
/// Serialized as `"internal"`.
Internal,
/// Serialized as `"timeout"`.
Timeout,
}
/// Requested output format of a chat completion
/// (`ChatCompletionParams::response_format`).
///
/// Internally tagged: the serialized object carries a `"type"` key whose value
/// is `"text"`, `"json_object"` or `"json_schema"`. `Default` yields `Text`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(tag = "type")]
pub enum ResponseFormat {
/// Plain text (`"type": "text"`); the default variant.
#[serde(rename = "text")]
#[default]
Text,
/// `"type": "json_object"`, with no further fields.
#[serde(rename = "json_object")]
JsonObject,
/// `"type": "json_schema"`, with the schema description in `json_schema`.
#[serde(rename = "json_schema")]
JsonSchema { json_schema: JsonSchemaSpec },
}
/// Schema description carried by `ResponseFormat::JsonSchema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JsonSchemaSpec {
/// Name of the schema.
pub name: String,
/// The schema itself, kept as an arbitrary JSON value (not validated here).
pub schema: serde_json::Value,
/// Defaults to `false` when omitted.
#[serde(default)]
pub strict: bool,
}
/// A tool definition supplied in `ChatCompletionParams::tools`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Tool {
/// Tool kind, serialized under the key `"type"`; free-form string.
#[serde(rename = "type")]
pub tool_type: String,
/// Definition of the function this tool exposes.
pub function: FunctionDefinition,
}
/// Describes the function exposed by a [`Tool`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionDefinition {
/// Function name.
pub name: String,
/// Optional description; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Optional parameter description as an arbitrary JSON value; left out of
/// the serialized form when `None`.
/// NOTE(review): presumably a JSON Schema object — not validated here.
#[serde(skip_serializing_if = "Option::is_none")]
pub parameters: Option<serde_json::Value>,
/// Defaults to `false` when omitted.
#[serde(default)]
pub strict: bool,
}
/// Tool selection for a chat completion (`ChatCompletionParams::tool_choice`).
///
/// The enum is `#[serde(untagged)]`: a bare string deserializes to `Mode`, and
/// an object with `"type"` and `"function"` keys deserializes to `Specific`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolChoice {
/// A mode given as a free-form string; accepted values are not defined in this file.
Mode(String),
/// Selects one particular function by name.
Specific {
/// Serialized under the key `"type"`; free-form string.
#[serde(rename = "type")]
choice_type: String,
function: ToolChoiceFunction,
},
}
/// Names the function selected by `ToolChoice::Specific`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolChoiceFunction {
/// Name of the selected function.
pub name: String,
}
/// A complete tool call attached to a [`ChatMessage`] via `tool_calls`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCall {
/// Identifier of this call.
pub id: String,
/// Call kind, serialized under the key `"type"`; free-form string.
#[serde(rename = "type")]
pub call_type: String,
/// The function being called and its arguments.
pub function: FunctionCall,
}
/// Function name and arguments of a [`ToolCall`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionCall {
/// Name of the function.
pub name: String,
/// Arguments as a string.
/// NOTE(review): presumably JSON-encoded text — it is not parsed or validated here.
pub arguments: String,
}
/// Thinking options of a chat completion (`ChatCompletionParams::thinking`).
///
/// Every field is `#[serde(default)]`, so an empty object deserializes to the
/// same value as `ThinkingConfig::default()` (`false`, `0`, `false`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ThinkingConfig {
/// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
/// Defaults to `0` when omitted.
/// NOTE(review): how a budget of `0` is interpreted is decided by the consumer — confirm.
#[serde(default)]
pub budget_tokens: u32,
/// Defaults to `false` when omitted.
#[serde(default)]
pub stream_thinking: bool,
}
/// Thinking output attached to a [`ChatCompletionResponse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingContent {
/// The thinking text.
pub content: String,
/// Token count associated with the thinking text.
pub tokens: u32,
}
/// JSON byte encoding and decoding for [`Request`].
impl Request {
/// Serializes this request to a JSON byte vector via `serde_json::to_vec`.
///
/// # Errors
/// Returns the `serde_json::Error` reported by the serializer.
pub fn to_bytes(&self) -> Result<Vec<u8>, serde_json::Error> {
serde_json::to_vec(self)
}
/// Parses a request from JSON bytes via `serde_json::from_slice`.
///
/// # Errors
/// Returns the `serde_json::Error` if `bytes` is not valid JSON or does not
/// match the shape of any `Request` variant.
pub fn from_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice(bytes)
}
}
/// JSON byte encoding/decoding and error constructors for [`Response`].
impl Response {
    /// Serializes this response to a JSON byte vector.
    ///
    /// # Errors
    /// Returns the `serde_json::Error` reported by the serializer.
    pub fn to_bytes(&self) -> Result<Vec<u8>, serde_json::Error> {
        serde_json::to_vec(self)
    }

    /// Parses a response from JSON bytes.
    ///
    /// # Errors
    /// Returns the `serde_json::Error` if `bytes` is not valid JSON or does
    /// not match the shape of any `Response` variant.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice::<Self>(bytes)
    }

    /// Builds a `Response::Error` that is not tied to a particular request
    /// (`request_id` is `None`).
    pub fn error(code: ErrorCode, message: impl Into<String>) -> Self {
        let message = message.into();
        Self::Error {
            code,
            message,
            request_id: None,
        }
    }

    /// Builds a `Response::Error` that refers to the request `request_id`.
    pub fn error_with_id(code: ErrorCode, message: impl Into<String>, request_id: String) -> Self {
        let message = message.into();
        let request_id = Some(request_id);
        Self::Error {
            code,
            message,
            request_id,
        }
    }
}
/// Generates a request identifier of the form `req_<hex>`.
///
/// The hexadecimal part is the current time in nanoseconds since the Unix
/// epoch. A timestamp alone is not unique: two calls inside one clock tick (or
/// every call, if the system clock is set before the epoch) would produce the
/// same id. To prevent that, the last issued value is remembered process-wide
/// and each new value is forced to be strictly greater than it, so ids
/// returned within one process never repeat, including across threads.
// Not referenced by every build of the crate, hence the lint exemption.
#[allow(dead_code)]
pub fn generate_request_id() -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    // Last value handed out by this function in this process.
    static LAST_ISSUED: AtomicU64 = AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    // Nanoseconds since the epoch fit in a u64 for several centuries; clamp
    // instead of truncating if they ever do not.
    let now = u64::try_from(nanos).unwrap_or(u64::MAX);

    // Use the timestamp when it is ahead of the last id, otherwise bump by one.
    let next_after = |last: u64| now.max(last.saturating_add(1));
    // The closure always returns `Some`, so both arms carry the previous value.
    let previous = match LAST_ISSUED.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |last| {
        Some(next_after(last))
    }) {
        Ok(previous) | Err(previous) => previous,
    };

    format!("req_{:x}", next_after(previous))
}
/// Returns the current time as whole seconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
pub fn unix_timestamp_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        // Clock is set before 1970: fall back to zero rather than failing.
        Err(_) => 0,
    }
}
/// Formats a byte count as a human-readable string using 1024-based units.
///
/// Values of at least 1 KB, 1 MB or 1 GB are shown with one decimal place in
/// the largest unit that applies (`"1.5 KB"`, `"2.0 GB"`); smaller values are
/// shown as a whole number of bytes (`"512 B"`). GB is the largest unit used.
pub fn format_size(bytes: u64) -> String {
    // Largest unit first so the first match is the one to display.
    const UNITS: [(u64, &str); 3] = [(1 << 30, "GB"), (1 << 20, "MB"), (1 << 10, "KB")];

    let unit = UNITS
        .iter()
        .copied()
        .find(|&(threshold, _)| bytes >= threshold);

    match unit {
        Some((threshold, suffix)) => {
            let scaled = bytes as f64 / threshold as f64;
            format!("{:.1} {}", scaled, suffix)
        }
        None => format!("{} B", bytes),
    }
}
/// Generates a completion identifier of the form `cmpl_<hex>`.
///
/// The hexadecimal part is the current time in nanoseconds since the Unix
/// epoch. A timestamp alone is not unique: two calls inside one clock tick (or
/// every call, if the system clock is set before the epoch) would produce the
/// same id. To prevent that, the last issued value is remembered process-wide
/// and each new value is forced to be strictly greater than it, so ids
/// returned within one process never repeat, including across threads.
pub fn generate_completion_id() -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    // Last value handed out by this function in this process.
    static LAST_ISSUED: AtomicU64 = AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    // Nanoseconds since the epoch fit in a u64 for several centuries; clamp
    // instead of truncating if they ever do not.
    let now = u64::try_from(nanos).unwrap_or(u64::MAX);

    // Use the timestamp when it is ahead of the last id, otherwise bump by one.
    let next_after = |last: u64| now.max(last.saturating_add(1));
    // The closure always returns `Some`, so both arms carry the previous value.
    let previous = match LAST_ISSUED.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |last| {
        Some(next_after(last))
    }) {
        Ok(previous) | Err(previous) => previous,
    };

    format!("cmpl_{:x}", next_after(previous))
}