use std::path::PathBuf;
use std::time::Duration;
use serde::{Deserialize, Serialize};
/// State tag tracked for a model entry.
///
/// `Loaded` carries a `port` and a `pid`, and `Failed` carries an error
/// message. NOTE(review): presumably the port/pid identify the serving
/// process for the model — confirm against the code that builds `Loaded`.
#[derive(Debug, Clone, PartialEq)]
pub enum ModelState {
Available,
Loading,
Benchmarking,
Loaded { port: u16, pid: u32 },
Failed { error: String },
}
/// Sort key selectable for search results.
///
/// The variants form a fixed cycle that [`SearchSort::next`] walks through.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchSort {
    Relevance,
    Downloads,
    Likes,
    Trending,
    CreatedAt,
}

impl SearchSort {
    /// Returns the sort key that follows `self` in the cycle
    /// `Relevance -> Downloads -> Likes -> Trending -> CreatedAt -> Relevance`.
    pub fn next(self) -> Self {
        match self {
            Self::Relevance => Self::Downloads,
            Self::Downloads => Self::Likes,
            Self::Likes => Self::Trending,
            Self::Trending => Self::CreatedAt,
            // Last entry wraps back to the start of the cycle.
            Self::CreatedAt => Self::Relevance,
        }
    }

    /// Returns the short display label for this sort key.
    ///
    /// Note that `CreatedAt` is labelled `"Created"`, not `"CreatedAt"`.
    pub fn label(self) -> &'static str {
        match self {
            Self::Relevance => "Relevance",
            Self::Downloads => "Downloads",
            Self::Likes => "Likes",
            Self::Trending => "Trending",
            Self::CreatedAt => "Created",
        }
    }
}
/// One entry of a model search result list.
///
/// All fields are (de)serialized under their Rust names. `downloaded` is the
/// only field marked `#[serde(default)]`, so it may be absent from the input
/// and then deserializes as `false`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SearchResult {
pub model_id: String,
pub model_name: String,
pub tags: Vec<String>,
pub downloads: u64,
pub likes: u64,
pub pipeline_tag: Option<String>,
// NOTE(review): unit of `size` is not visible here (presumably bytes) — confirm.
pub size: Option<u64>,
pub parameters: Option<String>,
pub capabilities: Vec<String>,
pub context_length: Option<u32>,
pub readme: Option<String>,
pub quantization: Option<String>,
pub license: Option<String>,
pub trending_score: i64,
// Kept as a string; the timestamp format is not established in this file.
pub created_at: Option<String>,
// Optional in serialized form; missing values become `false`.
#[serde(default)]
pub downloaded: bool,
}
/// Bookkeeping for a single file download.
///
/// Built by [`DownloadState::new`], which starts every record in
/// `DownloadStatus::Downloading` with zero bytes transferred.
#[derive(Debug, Clone)]
pub struct DownloadState {
pub model_id: String,
pub filename: String,
pub total_bytes: u64,
pub downloaded_bytes: u64,
pub status: DownloadStatus,
pub cancelled: bool,
// Shared flag; NOTE(review): presumably set by another task to request
// cancellation — confirm with the download worker.
pub cancel_token: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
// Numeric state code; `new` initialises it to 1. The meaning of the
// individual values is not defined in this file.
pub download_state: u8,
// Shared, atomically updated counterpart of `download_state` (assumed — TODO confirm).
pub download_state_arc: Option<std::sync::Arc<std::sync::atomic::AtomicU8>>,
// Captured with `Instant::now()` when the record is created.
pub start_time: std::time::Instant,
pub bytes_per_second: f64,
pub dest: Option<std::path::PathBuf>,
}
impl DownloadState {
    /// Creates the record for a download that is just starting.
    ///
    /// The caller supplies the model id, file name and expected total size;
    /// everything else starts from a fixed baseline: status `Downloading`,
    /// state code `1`, no bytes transferred, zero throughput, not cancelled,
    /// and no cancel token, shared state handle or destination path.
    /// `start_time` is taken from `Instant::now()` at the time of the call.
    pub fn new(model_id: String, filename: String, total_bytes: u64) -> Self {
        let started_at = std::time::Instant::now();
        Self {
            // Caller-provided identity of the download.
            model_id,
            filename,
            total_bytes,
            // Progress counters start at zero.
            downloaded_bytes: 0,
            bytes_per_second: 0.0,
            start_time: started_at,
            // Initial state markers.
            status: DownloadStatus::Downloading,
            download_state: 1,
            cancelled: false,
            // Optional handles are attached later by whoever owns them.
            cancel_token: None,
            download_state_arc: None,
            dest: None,
        }
    }
}
impl ModelSettings {
    /// Returns the llama.cpp version tag pinned for the currently selected
    /// backend, if any.
    ///
    /// Only `Cpu`, `Vulkan`, `Rocm`, `RocmLemonade` and `Cuda` have a version
    /// slot; every other backend yields `None`.
    pub fn get_active_backend_version(&self) -> Option<&String> {
        let slot = match self.backend {
            Backend::Cpu => &self.llama_cpp_version_cpu,
            Backend::Vulkan => &self.llama_cpp_version_vulkan,
            Backend::Rocm => &self.llama_cpp_version_rocm,
            Backend::RocmLemonade => &self.llama_cpp_version_rocm_lemonade,
            Backend::Cuda => &self.llama_cpp_version_cuda,
            // Backends without a dedicated version field.
            _ => return None,
        };
        slot.as_ref()
    }

    /// Returns the pinned version tag of the active backend as a string
    /// slice, or `"latest"` when nothing is pinned.
    pub fn get_active_backend_version_display(&self) -> &str {
        match self.get_active_backend_version() {
            Some(tag) => tag.as_str(),
            None => "latest",
        }
    }

    /// Stores `tag` as the pinned version of the currently selected backend.
    ///
    /// For backends without a version slot the call is a no-op and `tag` is
    /// dropped.
    pub fn set_active_backend_version(&mut self, tag: Option<String>) {
        let slot = match self.backend {
            Backend::Cpu => &mut self.llama_cpp_version_cpu,
            Backend::Vulkan => &mut self.llama_cpp_version_vulkan,
            Backend::Rocm => &mut self.llama_cpp_version_rocm,
            Backend::RocmLemonade => &mut self.llama_cpp_version_rocm_lemonade,
            Backend::Cuda => &mut self.llama_cpp_version_cuda,
            _ => return,
        };
        *slot = tag;
    }
}
/// Removes a trailing `.gguf` extension from `name`, if present.
///
/// The extension is matched ASCII case-insensitively, so `.gguf`, `.GGUF`
/// and mixed-case spellings such as `.Gguf` are all stripped. At most one
/// extension is removed. When `name` does not end in the extension it is
/// returned unchanged.
pub fn strip_gguf(name: &str) -> &str {
    const EXT: &str = ".gguf";
    // Too short to carry the extension at all.
    let Some(split) = name.len().checked_sub(EXT.len()) else {
        return name;
    };
    // `get` returns `None` when `split` falls inside a multi-byte character,
    // in which case the tail cannot be the (pure ASCII) extension.
    match name.get(split..) {
        Some(tail) if tail.eq_ignore_ascii_case(EXT) => &name[..split],
        _ => name,
    }
}
/// Normalises a user-supplied host string.
///
/// * Blank input (empty or whitespace only) becomes `"127.0.0.1"`.
/// * Otherwise only the first whitespace-separated token is kept.
/// * A token that contains `:` and does not already start with `[` is
///   wrapped in square brackets.
///
/// NOTE(review): the bracket rule fires on *any* colon, so a `host:port`
/// string such as `"host:8080"` is returned as `"[host:8080]"`. That looks
/// intended for bare IPv6 literals only — confirm callers never pass a port.
pub fn clean_host(host: &str) -> String {
    // The first token of the raw input equals the first token of the trimmed
    // input, and no token at all means the input was blank.
    let Some(token) = host.split_whitespace().next() else {
        return String::from("127.0.0.1");
    };
    let needs_brackets = token.contains(':') && !token.starts_with('[');
    if needs_brackets {
        format!("[{}]", token)
    } else {
        token.to_owned()
    }
}
/// Returns the text to show for `host`.
///
/// The empty string and `"127.0.0.1"` are both shown as
/// `"localhost (127.0.0.1)"`; any other value is returned unchanged.
pub fn format_host(host: &str) -> &str {
    let is_loopback_default = host.is_empty() || host == "127.0.0.1";
    if is_loopback_default {
        "localhost (127.0.0.1)"
    } else {
        host
    }
}
impl From<crate::config::DefaultParams> for ModelSettings {
    /// Builds a `ModelSettings` from the configured default parameters.
    ///
    /// Every field is taken over unchanged from the field of the same name,
    /// with one exception: when `dp.gpu_layers` is negative the resulting
    /// `gpu_layers_mode` is forced to `GpuLayersMode::All`; otherwise
    /// `dp.gpu_layers_mode` is used as-is.
    fn from(dp: crate::config::DefaultParams) -> Self {
        // A negative layer count overrides whatever mode was configured.
        let gpu_layers_mode = if dp.gpu_layers < 0 {
            GpuLayersMode::All
        } else {
            dp.gpu_layers_mode
        };

        Self {
            // Context, threading and batching.
            context_length: dp.context_length,
            threads: dp.threads,
            threads_batch: dp.threads_batch,
            batch_size: dp.batch_size,
            ubatch_size: dp.ubatch_size,
            parallel: dp.parallel,
            max_concurrent_predictions: dp.max_concurrent_predictions,

            // KV cache and memory handling.
            uniform_cache: dp.uniform_cache,
            kv_cache_offload: dp.kv_cache_offload,
            cache_type_k: dp.cache_type_k,
            cache_type_v: dp.cache_type_v,
            keep: dp.keep,
            swa_full: dp.swa_full,
            mlock: dp.mlock,
            mmap: dp.mmap,
            numa: dp.numa,

            // Prompting.
            system_prompt: dp.system_prompt,
            system_prompt_preset_name: dp.system_prompt_preset_name,

            // GPU placement.
            gpu_layers_mode,
            split_mode: dp.split_mode,
            tensor_split: dp.tensor_split,
            main_gpu: dp.main_gpu,
            fit: dp.fit,

            // Adapters, RPC and model features.
            lora: dp.lora,
            lora_scaled: dp.lora_scaled,
            rpc: dp.rpc,
            embedding: dp.embedding,
            flash_attn: dp.flash_attn,
            expert_count: dp.expert_count,
            jinja: dp.jinja,
            chat_template: dp.chat_template,
            chat_template_kwargs: dp.chat_template_kwargs,

            // Sampling.
            seed: dp.seed,
            temperature: dp.temperature,
            top_k: dp.top_k,
            top_p: dp.top_p,
            min_p: dp.min_p,
            typical_p: dp.typical_p,
            mirostat: dp.mirostat,
            mirostat_lr: dp.mirostat_lr,
            mirostat_ent: dp.mirostat_ent,
            ignore_eos: dp.ignore_eos,
            samplers: dp.samplers,
            repeat_penalty: dp.repeat_penalty,
            repeat_last_n: dp.repeat_last_n,
            presence_penalty: dp.presence_penalty,
            frequency_penalty: dp.frequency_penalty,
            dry_multiplier: dp.dry_multiplier,
            dry_base: dp.dry_base,
            dry_allowed_length: dp.dry_allowed_length,
            dry_penalty_last_n: dp.dry_penalty_last_n,

            // RoPE.
            rope_scaling: dp.rope_scaling,
            rope_scale: dp.rope_scale,
            rope_freq_base: dp.rope_freq_base,
            rope_freq_scale: dp.rope_freq_scale,
            rope_yarn_enabled: dp.rope_yarn_enabled,

            // Server endpoint and request handling.
            host: dp.host,
            port: dp.port,
            timeout: dp.timeout,
            cache_prompt: dp.cache_prompt,
            cache_reuse: dp.cache_reuse,
            webui: dp.webui,
            max_tokens: dp.max_tokens,
            cache_type: dp.cache_type,

            // Backend selection and per-backend version pins.
            backend: dp.backend,
            llama_cpp_version_cpu: dp.llama_cpp_version_cpu,
            llama_cpp_version_vulkan: dp.llama_cpp_version_vulkan,
            llama_cpp_version_rocm: dp.llama_cpp_version_rocm,
            llama_cpp_version_rocm_lemonade: dp.llama_cpp_version_rocm_lemonade,
            llama_cpp_version_cuda: dp.llama_cpp_version_cuda,

            // API endpoint, speculative decoding and tags.
            api_endpoint_enabled: dp.api_endpoint_enabled,
            api_endpoint_port: dp.api_endpoint_port,
            spec_type: dp.spec_type,
            draft_tokens: dp.draft_tokens,
            tags: dp.tags,
        }
    }
}
/// How many model layers should be placed on the GPU.
///
/// `Auto` is the default. `Specific(n)` carries an explicit layer count and
/// `All` requests every layer. How each mode is turned into a concrete count
/// is decided by the consumers (see `estimate_vram_mib` in this file for one
/// such interpretation).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize, Hash)]
#[derive(Default)]
pub enum GpuLayersMode {
#[default]
Auto,
Specific(u32),
All,
}
/// Status of a download tracked by `DownloadState`.
///
/// `Error` carries a message string; the other variants carry no data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DownloadStatus {
Downloading,
Paused,
Complete,
Error(String),
Cancelled,
}
/// Cache type selector stored in `ModelSettings::cache_type`.
///
/// Each variant is (de)serialized under the lowercase name given in its
/// `#[serde(rename)]` attribute; the `Display` impl in this file prints the
/// same strings. `F16` is the default.
///
/// NOTE(review): the `fq8_0` / `fq4_1` spellings differ from the `q8_0` /
/// `q4_1` names used by `CacheQuantType` — confirm these are intentional and
/// not typos before relying on them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash, Default)]
pub enum CacheType {
#[serde(rename = "f16")]
#[default]
F16,
#[serde(rename = "bf16")]
BF16,
#[serde(rename = "fq8_0")]
Fq8_0,
#[serde(rename = "fq4_1")]
Fq4_1,
}
impl std::fmt::Display for CacheType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
CacheType::F16 => write!(f, "f16"),
CacheType::BF16 => write!(f, "bf16"),
CacheType::Fq8_0 => write!(f, "fq8_0"),
CacheType::Fq4_1 => write!(f, "fq4_1"),
}
}
}
/// Element type used for a KV-cache tensor.
///
/// Variants are (de)serialized under the lowercase names in their
/// `#[serde(rename)]` attributes, which are also the strings printed by the
/// `Display` impl in this file. `F16` is the default.
///
/// Note that the declaration order here is not the order used by
/// `from_u8` / `next` / `prev`; those follow their own cycle.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default, Hash)]
pub enum CacheQuantType {
#[serde(rename = "f32")]
F32,
#[serde(rename = "f16")]
#[default]
F16,
#[serde(rename = "bf16")]
BF16,
#[serde(rename = "q8_0")]
Q8_0,
#[serde(rename = "q4_0")]
Q4_0,
#[serde(rename = "q4_1")]
Q4_1,
#[serde(rename = "iq4_nl")]
Iq4Nl,
#[serde(rename = "q5_0")]
Q5_0,
#[serde(rename = "q5_1")]
Q5_1,
}
/// Type of the K half of the KV cache; an alias of [`CacheQuantType`].
pub type CacheTypeK = CacheQuantType;
/// Type of the V half of the KV cache; an alias of [`CacheQuantType`].
pub type CacheTypeV = CacheQuantType;
impl CacheQuantType {
    /// Cycle order shared by `from_u8`, `next` and `prev`.
    const CYCLE: [Self; 9] = [
        Self::F32,
        Self::F16,
        Self::BF16,
        Self::Q8_0,
        Self::Q5_1,
        Self::Q5_0,
        Self::Q4_1,
        Self::Q4_0,
        Self::Iq4Nl,
    ];

    /// Position of `self` inside [`Self::CYCLE`].
    fn cycle_index(self) -> usize {
        match self {
            Self::F32 => 0,
            Self::F16 => 1,
            Self::BF16 => 2,
            Self::Q8_0 => 3,
            Self::Q5_1 => 4,
            Self::Q5_0 => 5,
            Self::Q4_1 => 6,
            Self::Q4_0 => 7,
            Self::Iq4Nl => 8,
        }
    }

    /// Maps an index in `0..=8` to the type at that position of the cycle
    /// (`0 = F32`, `1 = F16`, `2 = BF16`, `3 = Q8_0`, `4 = Q5_1`, `5 = Q5_0`,
    /// `6 = Q4_1`, `7 = Q4_0`, `8 = Iq4Nl`). Any other value yields `F16`.
    pub fn from_u8(n: u8) -> Self {
        Self::CYCLE
            .get(usize::from(n))
            .copied()
            .unwrap_or(Self::F16)
    }

    /// Returns the next type in the cycle, wrapping from `Iq4Nl` to `F32`.
    pub fn next(&self) -> Self {
        let count = Self::CYCLE.len();
        Self::CYCLE[(self.cycle_index() + 1) % count]
    }

    /// Returns the previous type in the cycle, wrapping from `F32` to `Iq4Nl`.
    pub fn prev(&self) -> Self {
        let count = Self::CYCLE.len();
        // Adding `count - 1` steps backwards without underflowing at index 0.
        Self::CYCLE[(self.cycle_index() + count - 1) % count]
    }
}
impl std::fmt::Display for CacheQuantType {
    /// Writes the lowercase name of the type (for example `q8_0` or
    /// `iq4_nl`). Width and padding flags are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::F32 => "f32",
            Self::F16 => "f16",
            Self::BF16 => "bf16",
            Self::Q8_0 => "q8_0",
            Self::Q4_0 => "q4_0",
            Self::Q4_1 => "q4_1",
            Self::Iq4Nl => "iq4_nl",
            Self::Q5_0 => "q5_0",
            Self::Q5_1 => "q5_1",
        };
        f.write_str(name)
    }
}
impl From<&str> for CacheQuantType {
    /// Parses a cache type name, falling back to `F16` for unknown input.
    ///
    /// Matching is ASCII case-insensitive, so both the variant-style names
    /// (`"Q8_0"`, `"Iq4Nl"`) and the lowercase names produced by `Display`
    /// and serde (`"q8_0"`, `"iq4_nl"`) are understood. This keeps
    /// `CacheQuantType::from(t.to_string().as_str()) == t` for every variant;
    /// a case-sensitive match would send those strings to the `F16` fallback.
    fn from(s: &str) -> Self {
        match s.to_ascii_lowercase().as_str() {
            "f32" => Self::F32,
            "f16" => Self::F16,
            "bf16" => Self::BF16,
            "q8_0" => Self::Q8_0,
            "q4_0" => Self::Q4_0,
            "q4_1" => Self::Q4_1,
            // "iq4nl" is the lowercased variant name, "iq4_nl" the Display/serde name.
            "iq4nl" | "iq4_nl" => Self::Iq4Nl,
            "q5_0" => Self::Q5_0,
            "q5_1" => Self::Q5_1,
            // Unknown names keep the historical default.
            _ => Self::F16,
        }
    }
}
/// Split mode stored in `ModelSettings::split_mode`.
///
/// Serialized as `none`, `layer`, `row` or `tensor`; `Layer` is the default.
/// NOTE(review): presumably this selects how a model is divided across
/// several GPUs — confirm against the code that consumes it.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Hash, Default)]
pub enum SplitMode {
#[serde(rename = "none")]
None,
#[serde(rename = "layer")]
#[default]
Layer,
#[serde(rename = "row")]
Row,
#[serde(rename = "tensor")]
Tensor,
}
impl std::fmt::Display for SplitMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SplitMode::None => write!(f, "none"),
SplitMode::Layer => write!(f, "layer"),
SplitMode::Row => write!(f, "row"),
SplitMode::Tensor => write!(f, "tensor"),
}
}
}
/// NUMA mode stored in `ModelSettings::numa`.
///
/// Serialized as `none`, `distribute`, `isolate` or `numactl`; `None` is the
/// default.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Hash, Default)]
pub enum NumMode {
#[serde(rename = "none")]
#[default]
None,
#[serde(rename = "distribute")]
Distribute,
#[serde(rename = "isolate")]
Isolate,
#[serde(rename = "numactl")]
Numactl,
}
impl std::fmt::Display for NumMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NumMode::None => write!(f, "none"),
NumMode::Distribute => write!(f, "distribute"),
NumMode::Isolate => write!(f, "isolate"),
NumMode::Numactl => write!(f, "numactl"),
}
}
}
/// RoPE scaling method stored in `ModelSettings::rope_scaling`.
///
/// Serialized as `none`, `linear` or `yarn`; `None` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash, Default)]
pub enum RopeScaling {
#[serde(rename = "none")]
#[default]
None,
#[serde(rename = "linear")]
Linear,
#[serde(rename = "yarn")]
Yarn,
}
impl std::fmt::Display for RopeScaling {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
RopeScaling::None => write!(f, "none"),
RopeScaling::Linear => write!(f, "linear"),
RopeScaling::Yarn => write!(f, "yarn"),
}
}
}
/// Mirostat sampling mode stored in `ModelSettings::mirostat`.
///
/// Serialized as the strings `"0"`, `"1"` and `"2"`; `Off` is the default.
/// The `Display` impl in this file prints `off` (not `0`) for `Off`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash, Default)]
pub enum Mirostat {
#[serde(rename = "0")]
#[default]
Off,
#[serde(rename = "1")]
V1,
#[serde(rename = "2")]
Mirostat2,
}
impl std::fmt::Display for Mirostat {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Mirostat::Off => write!(f, "off"),
Mirostat::V1 => write!(f, "1"),
Mirostat::Mirostat2 => write!(f, "2"),
}
}
}
/// Sampler chain, kept as a single string.
///
/// The default value in this file is a `;`-separated list of sampler names;
/// the string itself is not validated here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Samplers(pub String);
impl Default for Samplers {
fn default() -> Self {
Self("penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature".to_string())
}
}
impl std::fmt::Display for Samplers {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
/// Backend/build flavour selectable in `ModelSettings::backend`.
///
/// Variants are (de)serialized under the snake_case names in their
/// `#[serde(rename)]` attributes; `Cpu` is the default. These serde names
/// use underscores (for example `rocm_lemonade`, `win_cuda_12_4`), whereas
/// `Backend::slug` in this file returns hyphenated names (`rocm-lemonade`,
/// `win-cuda-12.4`), so the two are not interchangeable.
///
/// `is_linux`, `is_windows` and `is_macos` in this file group the variants
/// by platform.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum Backend {
#[serde(rename = "cpu")]
#[default]
Cpu,
#[serde(rename = "vulkan")]
Vulkan,
#[serde(rename = "rocm")]
Rocm,
#[serde(rename = "rocm_lemonade")]
RocmLemonade,
#[serde(rename = "cuda")]
Cuda,
#[serde(rename = "cpu_arm64")]
CpuArm64,
#[serde(rename = "win_cpu")]
CpuWindows,
#[serde(rename = "win_vulkan")]
VulkanWindows,
#[serde(rename = "win_cuda_12_4")]
CudaWindows12_4,
#[serde(rename = "win_cuda_13_1")]
CudaWindows13_1,
#[serde(rename = "win_hip")]
HipWindows,
#[serde(rename = "macos_arm64")]
CpuMacosArm64,
#[serde(rename = "macos_x64")]
CpuMacosX64,
}
impl Backend {
pub fn slug(&self) -> &'static str {
match self {
Backend::Cpu => "cpu",
Backend::Vulkan => "vulkan",
Backend::Rocm => "rocm",
Backend::RocmLemonade => "rocm-lemonade",
Backend::Cuda => "cuda",
Backend::CpuArm64 => "cpu-arm64",
Backend::CpuWindows => "win-cpu",
Backend::VulkanWindows => "win-vulkan",
Backend::CudaWindows12_4 => "win-cuda-12.4",
Backend::CudaWindows13_1 => "win-cuda-13.1",
Backend::HipWindows => "win-hip",
Backend::CpuMacosArm64 => "macos-arm64",
Backend::CpuMacosX64 => "macos-x64",
}
}
pub fn is_linux(self) -> bool {
matches!(
self,
Backend::Cpu
| Backend::Vulkan
| Backend::Rocm
| Backend::RocmLemonade
| Backend::Cuda
| Backend::CpuArm64
)
}
pub fn is_windows(self) -> bool {
matches!(
self,
Backend::CpuWindows
| Backend::VulkanWindows
| Backend::CudaWindows12_4
| Backend::CudaWindows13_1
| Backend::HipWindows
)
}
pub fn is_macos(self) -> bool {
matches!(self, Backend::CpuMacosArm64 | Backend::CpuMacosX64)
}
pub fn from_str(s: &str) -> Self {
let s = s.to_lowercase();
if s.starts_with("vulkan") || s.starts_with("vk") {
Backend::Vulkan
} else if s.starts_with("rocm") || s.starts_with("ro") {
if s.contains("lemonade") {
Backend::RocmLemonade
} else {
Backend::Rocm
}
} else if s.starts_with("cuda") || s.starts_with("cu") {
Backend::Cuda
} else {
Backend::Cpu }
}
}
impl std::fmt::Display for Backend {
    /// Writes the backend's slug (see `Backend::slug`). Width and padding
    /// flags are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.slug())
    }
}
/// Server mode selector.
///
/// Serialized as `normal`, `router`, `bench_gpu` or `bench_tune`; `Normal`
/// is the default. `Bench` additionally accepts the alias `bench` when
/// deserializing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum ServerMode {
#[serde(rename = "normal")]
#[default]
Normal,
#[serde(rename = "router")]
Router,
#[serde(rename = "bench_gpu", alias = "bench")]
Bench,
#[serde(rename = "bench_tune")]
BenchTune,
}
impl std::fmt::Display for ServerMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ServerMode::Normal => write!(f, "Normal"),
ServerMode::Router => write!(f, "Router (XP!)"),
ServerMode::Bench => write!(f, "Bench GPU"),
ServerMode::BenchTune => write!(f, "BenchTune"),
}
}
}
/// Full set of settings kept per model: context/threading, KV cache, GPU
/// placement, sampling, RoPE, server endpoint, backend selection and
/// speculative-decoding options.
///
/// Every field has a same-named counterpart in `crate::config::DefaultParams`
/// (see the `From<DefaultParams>` impl in this file). The derived
/// `PartialEq` is what `ModelSettings::is_dirty` uses to detect changes.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ModelSettings {
// --- Context, threading and batching ---
pub context_length: u32,
pub threads: u32,
pub threads_batch: u32,
pub batch_size: u32,
pub ubatch_size: u32,
pub parallel: u32,
pub max_concurrent_predictions: Option<u32>,
// --- KV cache and memory handling ---
pub uniform_cache: bool,
pub kv_cache_offload: bool,
pub cache_type_k: Option<CacheTypeK>,
pub cache_type_v: Option<CacheTypeV>,
pub keep: i32,
pub swa_full: bool,
pub mlock: bool,
pub mmap: bool,
pub numa: NumMode,
// --- Prompting ---
pub system_prompt: String,
pub system_prompt_preset_name: String,
// --- GPU placement ---
pub gpu_layers_mode: GpuLayersMode,
pub split_mode: SplitMode,
pub tensor_split: String,
pub main_gpu: i32,
pub fit: bool,
// --- Adapters, RPC and model features ---
pub lora: Option<PathBuf>,
// Adapter path paired with an `f32` (presumably its scale — TODO confirm).
pub lora_scaled: Option<(PathBuf, f32)>,
pub rpc: String,
pub embedding: bool,
pub flash_attn: bool,
pub expert_count: i32,
pub jinja: bool,
pub chat_template: Option<String>,
pub chat_template_kwargs: Option<String>,
// --- Sampling ---
pub seed: i32,
pub temperature: f32,
pub top_k: i32,
pub top_p: f32,
pub min_p: f32,
pub typical_p: f32,
pub mirostat: Mirostat,
pub mirostat_lr: f32,
pub mirostat_ent: f32,
pub ignore_eos: bool,
pub samplers: Samplers,
pub repeat_penalty: f32,
pub repeat_last_n: i32,
pub presence_penalty: Option<f32>,
pub frequency_penalty: Option<f32>,
pub dry_multiplier: f32,
pub dry_base: f32,
pub dry_allowed_length: i32,
pub dry_penalty_last_n: i32,
// --- RoPE ---
pub rope_scaling: RopeScaling,
pub rope_scale: f32,
pub rope_freq_base: f32,
pub rope_freq_scale: f32,
pub rope_yarn_enabled: bool,
// --- Server endpoint and request handling ---
pub host: String,
pub port: u16,
// NOTE(review): unit of `timeout` is not visible in this file — confirm.
pub timeout: u32,
pub cache_prompt: bool,
pub cache_reuse: u32,
pub webui: bool,
pub max_tokens: Option<u32>,
pub cache_type: CacheType,
// --- Backend selection; one optional version pin per backend that has one ---
pub backend: Backend,
pub llama_cpp_version_cpu: Option<String>,
pub llama_cpp_version_vulkan: Option<String>,
pub llama_cpp_version_rocm: Option<String>,
pub llama_cpp_version_rocm_lemonade: Option<String>,
pub llama_cpp_version_cuda: Option<String>,
// --- API endpoint, speculative decoding and tags ---
pub api_endpoint_enabled: bool,
pub api_endpoint_port: u16,
pub spec_type: String,
pub draft_tokens: u32,
pub tags: Vec<String>,
}
impl Default for ModelSettings {
    /// Returns the built-in defaults.
    ///
    /// Starts from `crate::config::DefaultParams::default()` converted into
    /// `ModelSettings`, then pins a handful of fields regardless of what that
    /// conversion produced: uniform cache off, both KV cache types `F16`,
    /// the default `CacheType`, the `Cpu` backend, and presence/frequency
    /// penalties of `Some(0.0)`.
    fn default() -> Self {
        let mut settings = Self::from(crate::config::DefaultParams::default());

        // Backend and cache selection.
        settings.backend = Backend::Cpu;
        settings.cache_type = CacheType::default();
        settings.uniform_cache = false;
        settings.cache_type_k = Some(CacheTypeK::F16);
        settings.cache_type_v = Some(CacheTypeV::F16);

        // Penalties are explicitly present, but neutral.
        settings.presence_penalty = Some(0.0);
        settings.frequency_penalty = Some(0.0);

        settings
    }
}
impl ModelSettings {
    /// Builds settings from the `default` section of `config`.
    ///
    /// The section is cloned and converted through the
    /// `From<DefaultParams>` impl; `config` itself is left untouched.
    pub fn from_config(config: &crate::config::Config) -> Self {
        let defaults = config.default.clone();
        Self::from(defaults)
    }
}
/// Fixed prompt text exported for benchmarking. The wording is part of the
/// program's runtime behaviour and must not be edited for style.
pub const BENCHMARK_PROMPT: &str = "Create Mona Lisa image in ascii art using text, number, symbol, everything possible. this should be the perfect painting.";
/// A model file found on disk: its path, a name, a display name and the
/// file size.
///
/// NOTE(review): how `name` and `display_name` differ, and the unit of
/// `file_size` (presumably bytes), are set by the discovery code, which is
/// not part of this file.
#[derive(Debug, Clone)]
pub struct DiscoveredModel {
pub path: PathBuf,
pub name: String,
pub file_size: u64,
pub display_name: String, }
/// Subset of GGUF metadata extracted by `GgufMetadata::from_path`.
///
/// Numeric fields are `0` and string fields are empty when the corresponding
/// metadata key is missing. `quantization` is never assigned by `from_path`
/// in this file, so it keeps its default (empty) value unless set elsewhere.
#[derive(Debug, Clone, Default)]
pub struct GgufMetadata {
// From `<arch>.block_count`.
pub layers: u32,
// From `<arch>.embedding_length`.
pub hidden_size: u32,
// From `<arch>.context_length`.
pub n_ctx_train: u32,
// From `<arch>.attention.head_count`.
pub n_head: u32,
// From `<arch>.attention.head_count_kv`.
pub n_kv_head: u32,
// From `general.architecture`.
pub arch: String,
// Name derived from the numeric `general.file_type` value.
pub file_type: String,
pub quantization: String,
// Filled from the decoded model's `model_parameters()`.
pub model_parameters: String,
// From `general.domain`.
pub domain: String,
// Entries of `general.capabilities`, plus "chat" when a chat template exists.
pub capabilities: Vec<String>,
// From `tokenizer.ggml.model`.
pub tokenizer: String,
// Length of the `tokenizer.ggml.tokens` array.
pub vocab_size: u32,
// From `mtp.draft_tokens`, only read when the architecture is "mtp".
pub draft_tokens: u32,
}
impl GgufMetadata {
    /// Reads the GGUF file at `path` and extracts the metadata this
    /// application cares about.
    ///
    /// Architecture-specific keys are looked up under the prefix given by
    /// `general.architecture` (or `llama` when that key is missing) and fall
    /// back to the `llama.` prefix when the architecture-specific key is
    /// absent. Missing numeric keys yield `0`; missing string keys yield an
    /// empty string. `"chat"` is added to `capabilities` when the file has a
    /// `tokenizer.chat_template` key and does not already list it.
    ///
    /// # Errors
    ///
    /// Returns an error when the GGUF container cannot be opened or decoded.
    pub fn from_path(path: &std::path::Path) -> anyhow::Result<Self> {
        /// Narrows to `u32`, saturating instead of silently wrapping.
        fn clamp_to_u32(n: u64) -> u32 {
            u32::try_from(n).unwrap_or(u32::MAX)
        }

        /// Maps a `general.file_type` id to its name, following llama.cpp's
        /// `llama_ftype` enum (ids 4-6 and 33-35 are retired there).
        fn file_type_label(ftype: u64) -> String {
            let label = match ftype {
                0 => "F32",
                1 => "F16",
                2 => "Q4_0",
                3 => "Q4_1",
                7 => "Q8_0",
                8 => "Q5_0",
                9 => "Q5_1",
                10 => "Q2_K",
                11 => "Q3_K_S",
                12 => "Q3_K_M",
                13 => "Q3_K_L",
                14 => "Q4_K_S",
                15 => "Q4_K_M",
                16 => "Q5_K_S",
                17 => "Q5_K_M",
                18 => "Q6_K",
                19 => "IQ2_XXS",
                20 => "IQ2_XS",
                21 => "Q2_K_S",
                22 => "IQ3_XS",
                23 => "IQ3_XXS",
                24 => "IQ1_S",
                25 => "IQ4_NL",
                26 => "IQ3_S",
                27 => "IQ3_M",
                28 => "IQ2_S",
                29 => "IQ2_M",
                30 => "IQ4_XS",
                31 => "IQ1_M",
                32 => "BF16",
                36 => "TQ1_0",
                37 => "TQ2_0",
                other => return format!("Unknown ({})", other),
            };
            label.to_string()
        }

        let path_str = path.to_string_lossy();
        let mut container = gguf_rs::get_gguf_container(&path_str)
            .map_err(|e| anyhow::anyhow!("Failed to get GGUF container: {}", e))?;
        let model_data = container
            .decode()
            .map_err(|e| anyhow::anyhow!("Failed to decode GGUF: {}", e))?;
        let mut meta = Self::default();

        // String value of a metadata key, or "" when missing / not a string.
        let extract_str = |key: &str| -> String {
            model_data
                .metadata()
                .get(key)
                .and_then(|v| v.as_str().map(|s| s.to_string()))
                .unwrap_or_default()
        };
        // Numeric value of a metadata key as `u64`. Negative integers are not
        // reinterpreted as huge unsigned values; they fall through to the
        // float path, whose cast saturates at 0.
        let extract_num = |key: &str| -> Option<u64> {
            model_data.metadata().get(key).and_then(|v| {
                v.as_u64()
                    .or_else(|| v.as_i64().and_then(|x| u64::try_from(x).ok()))
                    .or_else(|| v.as_f64().map(|x| x as u64))
            })
        };

        meta.arch = extract_str("general.architecture");
        let prefix = if meta.arch.is_empty() {
            "llama"
        } else {
            meta.arch.as_str()
        };
        let get_num_with_fallback = |suffix: &str| -> u32 {
            let value = extract_num(&format!("{}.{}", prefix, suffix)).or_else(|| {
                if prefix != "llama" {
                    extract_num(&format!("llama.{}", suffix))
                } else {
                    None
                }
            });
            clamp_to_u32(value.unwrap_or(0))
        };

        meta.layers = get_num_with_fallback("block_count");
        meta.hidden_size = get_num_with_fallback("embedding_length");
        meta.n_ctx_train = get_num_with_fallback("context_length");
        meta.n_head = get_num_with_fallback("attention.head_count");
        meta.n_kv_head = get_num_with_fallback("attention.head_count_kv");

        if let Some(value) = model_data.metadata().get("tokenizer.ggml.tokens")
            && let Some(arr) = value.as_array()
        {
            meta.vocab_size = u32::try_from(arr.len()).unwrap_or(u32::MAX);
        }
        if meta.arch == "mtp" {
            meta.draft_tokens = clamp_to_u32(extract_num("mtp.draft_tokens").unwrap_or(0));
        }
        if let Some(v) = extract_num("general.file_type") {
            meta.file_type = file_type_label(v);
        }
        if let Some(value) = model_data.metadata().get("general.capabilities")
            && let Some(arr) = value.as_array()
        {
            meta.capabilities
                .extend(arr.iter().filter_map(|v| v.as_str()).map(str::to_string));
        }
        // A chat template implies chat capability; avoid listing it twice
        // when the file already declares it.
        if model_data
            .metadata()
            .contains_key("tokenizer.chat_template")
            && !meta.capabilities.iter().any(|c| c == "chat")
        {
            meta.capabilities.push("chat".to_string());
        }
        meta.tokenizer = extract_str("tokenizer.ggml.model");
        meta.domain = extract_str("general.domain");
        meta.model_parameters = model_data.model_parameters();
        Ok(meta)
    }
}
/// Runtime metrics sampled for a running server.
///
/// The `Default` impl in this file starts with `loaded == false` and every
/// numeric field at zero. Units of the memory fields are not established in
/// this file; the `*_ms` fields are named as milliseconds.
#[derive(Debug, Clone)]
pub struct ServerMetrics {
pub loaded: bool,
pub tps: f64,
pub prompt_tps: f64,
pub cpu_usage: f64,
pub gpu_mem_used: u64,
pub gpu_mem_total: u64,
pub ram_used: u64,
pub ctx_used: u32,
pub ctx_max: u32,
pub total_vram_used: u64,
pub decoded_tokens: u64,
pub gen_tps: f64,
pub latency_per_token_ms: f64,
pub prompt_latency_ms: f64,
}
/// A device name paired with a buffer size in MiB.
#[derive(Debug, Clone)]
pub struct GPUBuffer {
pub device: String,
pub buffer_size_mib: f64,
}
/// Progress counters for a model load.
///
/// The layer counts and the tensor total are optional (unknown until
/// reported); `tensors_loaded` is a plain counter starting at 0. `buffers`
/// collects the `GPUBuffer` entries seen so far.
#[derive(Debug, Clone, Default)]
pub struct LoadProgress {
pub layers_total: Option<u32>,
pub layers_loaded: Option<u32>,
pub tensors_total: Option<u32>,
pub tensors_loaded: u32,
pub buffers: Vec<GPUBuffer>,
}
impl Default for ServerMetrics {
    /// Returns the metrics of a server that has reported nothing yet:
    /// not loaded, with every rate, gauge and counter at zero.
    fn default() -> Self {
        let (no_rate, no_bytes, no_tokens) = (0.0_f64, 0_u64, 0_u32);
        Self {
            loaded: false,
            // Throughput and latency.
            tps: no_rate,
            prompt_tps: no_rate,
            gen_tps: no_rate,
            latency_per_token_ms: no_rate,
            prompt_latency_ms: no_rate,
            // Resource usage.
            cpu_usage: no_rate,
            gpu_mem_used: no_bytes,
            gpu_mem_total: no_bytes,
            total_vram_used: no_bytes,
            ram_used: no_bytes,
            // Context window and token counters.
            ctx_used: no_tokens,
            ctx_max: no_tokens,
            decoded_tokens: 0,
        }
    }
}
/// Serializable snapshot that combines live `ServerMetrics` values with a
/// selection of the `ModelSettings` in effect.
///
/// Built by `WsMetrics::from_metrics`. Enum-typed settings are flattened to
/// strings or numbers so the payload is self-describing.
/// NOTE(review): the `Ws` prefix suggests a WebSocket payload — confirm with
/// the code that sends it.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WsMetrics {
// --- Identity and state ---
pub model_name: String,
pub loaded: bool,
pub state: String,
// --- Live metrics copied from `ServerMetrics` ---
pub tps: f64,
pub prompt_tps: f64,
pub ctx_used: u32,
pub ctx_max: u32,
pub cpu_usage: f64,
pub gpu_mem_used: u64,
pub gpu_mem_total: u64,
pub ram_used: u64,
pub latency_per_token_ms: f64,
pub decoded_tokens: u64,
pub gen_tps: f64,
// Seconds since the Unix epoch at the time the snapshot was built.
pub timestamp: u64,
pub cmd_display: Option<String>,
// --- Settings copied from `ModelSettings` ---
pub threads: u32,
pub threads_batch: u32,
pub context_length: u32,
pub ubatch_size: u32,
pub batch_size: u32,
pub temperature: f32,
pub top_k: u32,
pub top_p: f32,
pub min_p: f32,
pub typical_p: f32,
pub seed: i32,
pub repeat_penalty: f32,
pub repeat_last_n: i32,
pub presence_penalty: Option<f32>,
pub frequency_penalty: Option<f32>,
// Mirostat mode as a number: 0 = off, 1, or 2.
pub mirostat: Option<u32>,
pub mirostat_lr: Option<f32>,
pub mirostat_ent: Option<f32>,
pub max_tokens: Option<u32>,
pub flash_attn: bool,
pub kv_cache_offload: bool,
// `Display` form of the configured KV cache types, when set.
pub cache_type_k: Option<String>,
pub cache_type_v: Option<String>,
pub uniform_cache: bool,
pub mlock: bool,
pub mmap: bool,
pub embedding: bool,
pub jinja: bool,
pub ignore_eos: bool,
pub samplers: String,
pub expert_count: u32,
// "Auto", "All", or the explicit layer count as decimal text.
pub gpu_layers: String,
// Backend slug and the pinned llama.cpp version ("latest" when unpinned).
pub backend: String,
pub llama_cpp_version: String,
pub spec_type: String,
pub draft_tokens: u32,
}
impl WsMetrics {
    /// Builds a snapshot from live metrics plus the settings in effect.
    ///
    /// * `metrics` supplies the live counters, copied unchanged.
    /// * `model_name` and `state` are stored as owned strings.
    /// * `settings` supplies the configuration values; enum-typed settings
    ///   are flattened (`gpu_layers` becomes `"Auto"`, `"All"` or a number,
    ///   `mirostat` becomes `0`/`1`/`2`, cache types and backend use their
    ///   `Display` form).
    /// * `cmd_display` is an optional command line to show alongside.
    ///
    /// `timestamp` is the current time in seconds since the Unix epoch, or
    /// `0` if the system clock is before the epoch.
    ///
    /// `top_k` and `expert_count` are signed in `ModelSettings` but unsigned
    /// here; negative values are reported as `0` rather than wrapping around
    /// to values near `u32::MAX`.
    pub fn from_metrics(
        metrics: &ServerMetrics,
        model_name: &str,
        state: &str,
        settings: &crate::models::ModelSettings,
        cmd_display: Option<&str>,
    ) -> Self {
        use std::time::{SystemTime, UNIX_EPOCH};

        /// Converts a signed setting to `u32`, mapping negatives to 0.
        fn non_negative(value: i32) -> u32 {
            u32::try_from(value).unwrap_or(0)
        }

        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);
        let gpu_layers = match settings.gpu_layers_mode {
            crate::models::GpuLayersMode::Auto => "Auto".to_string(),
            crate::models::GpuLayersMode::Specific(n) => n.to_string(),
            crate::models::GpuLayersMode::All => "All".to_string(),
        };
        let mirostat_mode = match settings.mirostat {
            crate::models::Mirostat::Off => 0,
            crate::models::Mirostat::V1 => 1,
            crate::models::Mirostat::Mirostat2 => 2,
        };

        Self {
            model_name: model_name.to_string(),
            loaded: metrics.loaded,
            state: state.to_string(),
            tps: metrics.tps,
            prompt_tps: metrics.prompt_tps,
            ctx_used: metrics.ctx_used,
            ctx_max: metrics.ctx_max,
            cpu_usage: metrics.cpu_usage,
            gpu_mem_used: metrics.gpu_mem_used,
            gpu_mem_total: metrics.gpu_mem_total,
            ram_used: metrics.ram_used,
            latency_per_token_ms: metrics.latency_per_token_ms,
            decoded_tokens: metrics.decoded_tokens,
            gen_tps: metrics.gen_tps,
            timestamp,
            cmd_display: cmd_display.map(String::from),
            threads: settings.threads,
            threads_batch: settings.threads_batch,
            context_length: settings.context_length,
            ubatch_size: settings.ubatch_size,
            batch_size: settings.batch_size,
            temperature: settings.temperature,
            top_k: non_negative(settings.top_k),
            top_p: settings.top_p,
            min_p: settings.min_p,
            typical_p: settings.typical_p,
            seed: settings.seed,
            repeat_penalty: settings.repeat_penalty,
            repeat_last_n: settings.repeat_last_n,
            presence_penalty: settings.presence_penalty,
            frequency_penalty: settings.frequency_penalty,
            mirostat: Some(mirostat_mode),
            mirostat_lr: Some(settings.mirostat_lr),
            mirostat_ent: Some(settings.mirostat_ent),
            max_tokens: settings.max_tokens,
            flash_attn: settings.flash_attn,
            kv_cache_offload: settings.kv_cache_offload,
            cache_type_k: settings.cache_type_k.map(|k| k.to_string()),
            cache_type_v: settings.cache_type_v.map(|k| k.to_string()),
            uniform_cache: settings.uniform_cache,
            mlock: settings.mlock,
            mmap: settings.mmap,
            embedding: settings.embedding,
            jinja: settings.jinja,
            ignore_eos: settings.ignore_eos,
            samplers: settings.samplers.to_string(),
            expert_count: non_negative(settings.expert_count),
            gpu_layers,
            backend: settings.backend.to_string(),
            llama_cpp_version: settings.get_active_backend_version_display().to_string(),
            spec_type: settings.spec_type.clone(),
            draft_tokens: settings.draft_tokens,
        }
    }
}
/// Estimates the VRAM, in MiB, needed to run a model with `settings`.
///
/// The estimate is the sum of:
/// * the share of the model weights placed on the GPU,
/// * the KV cache for the offloaded layers,
/// * an activation buffer sized from the batch size,
/// * a fixed overhead (3.8% of `gpu_mem_total_mib`, or 500 MiB when the
///   total is unknown, i.e. `0`), and
/// * a flat 550 MiB margin.
///
/// # Parameters
/// * `model_mib` - size of the model file in MiB.
/// * `total_layers` - layer count of the model; `0` means unknown, in which
///   case `Auto` assumes 20 offloaded layers, `All` assumes 32, and the
///   whole model is counted as resident.
/// * `hidden_size_opt` - embedding width; guessed from the model size when
///   `None`.
/// * `n_head_opt` / `n_kv_head_opt` - head counts used for the GQA ratio;
///   a ratio of 1 is assumed when either is missing or `n_head` is 0.
/// * `gpu_mem_total_mib` - total VRAM of the device, `0` if unknown.
///
/// Returns `0` when the settings pin exactly zero GPU layers.
///
/// The KV term is computed from the number of offloaded layers directly, so
/// an unknown layer count (`total_layers == 0`) still produces a finite
/// estimate, and `settings.parallel == 0` is treated as 1 when dividing the
/// uniform cache.
pub fn estimate_vram_mib(
    model_mib: u64,
    settings: &ModelSettings,
    total_layers: u32,
    hidden_size_opt: Option<u32>,
    n_head_opt: Option<u32>,
    n_kv_head_opt: Option<u32>,
    gpu_mem_total_mib: u64,
) -> u64 {
    const BYTES_PER_MIB: f64 = 1024.0 * 1024.0;

    // Nothing is offloaded when the user explicitly asks for zero layers.
    if matches!(settings.gpu_layers_mode, GpuLayersMode::Specific(0)) {
        return 0;
    }

    let model_mib_f = model_mib as f64;
    let layers_known = total_layers > 0;

    // Number of layers assumed to live on the GPU.
    let gpu_layers = match settings.gpu_layers_mode {
        GpuLayersMode::Auto if layers_known => (total_layers as f64 * 0.6) as u32,
        GpuLayersMode::Auto => 20,
        GpuLayersMode::Specific(n) if layers_known => n.min(total_layers),
        GpuLayersMode::Specific(n) => n,
        GpuLayersMode::All if layers_known => total_layers,
        GpuLayersMode::All => 32,
    };

    // Share of the weights that ends up in VRAM.
    let model_vram = if gpu_layers == 0 {
        0.0
    } else if layers_known {
        model_mib_f * (gpu_layers as f64 / total_layers as f64).min(1.0)
    } else {
        model_mib_f
    };

    let hidden_size = match hidden_size_opt {
        Some(h) => h as f64,
        None => {
            // Rough guess from the file size when the metadata is missing.
            let params_est = model_mib_f / 550.0;
            (1024.0 * params_est.sqrt().max(1.0) * 1.5).max(512.0)
        }
    };
    let gqa_ratio = match (n_head_opt, n_kv_head_opt) {
        (Some(n_head), Some(n_kv_head)) if n_head > 0 => n_kv_head as f64 / n_head as f64,
        _ => 1.0,
    };
    let flash_attn_factor = if settings.flash_attn { 0.5 } else { 1.0 };
    let uniform_cache_factor = if settings.uniform_cache {
        // Guard against a zero slot count, which would make the estimate infinite.
        1.0 / f64::from(settings.parallel.max(1))
    } else {
        1.0
    };

    // K and V (factor 2) for every offloaded layer. Multiplying by and then
    // dividing by `total_layers` would cancel out, but yields NaN when the
    // layer count is unknown, so the offloaded count is used directly.
    let kv_bytes = 2.0
        * hidden_size
        * settings.context_length as f64
        * gqa_ratio
        * gpu_layers as f64
        * flash_attn_factor
        * uniform_cache_factor
        * kv_quant_bytes(
            settings.cache_type_k.unwrap_or(CacheTypeK::F16),
            settings.cache_type_v.unwrap_or(CacheTypeV::F16),
        );
    let kv_mib = kv_bytes / BYTES_PER_MIB;

    let activation_mib = (settings.batch_size as f64 * hidden_size * 8.0) / BYTES_PER_MIB;
    let fixed_overhead = if gpu_mem_total_mib > 0 {
        gpu_mem_total_mib as f64 * 0.038
    } else {
        500.0
    };

    let total_mib = model_vram + kv_mib + activation_mib + fixed_overhead + 550.0;
    total_mib.ceil() as u64
}
/// Returns the average number of bytes per cached element across the K and
/// V cache types.
///
/// Per-type sizes used: `F32` = 4, `F16`/`BF16` = 2, `Q8_0` = 1,
/// `Q5_0`/`Q5_1` = 0.625, `Q4_0`/`Q4_1`/`Iq4Nl` = 0.5.
fn kv_quant_bytes(k_type: CacheQuantType, v_type: CacheQuantType) -> f64 {
    /// Bytes per element assumed for a single cache type.
    fn bytes_per_element(kind: CacheQuantType) -> f64 {
        match kind {
            CacheQuantType::F32 => 4.0,
            CacheQuantType::F16 | CacheQuantType::BF16 => 2.0,
            CacheQuantType::Q8_0 => 1.0,
            CacheQuantType::Q5_0 | CacheQuantType::Q5_1 => 0.625,
            CacheQuantType::Q4_0 | CacheQuantType::Q4_1 | CacheQuantType::Iq4Nl => 0.5,
        }
    }

    let k_bytes = bytes_per_element(k_type);
    let v_bytes = bytes_per_element(v_type);
    (k_bytes + v_bytes) / 2.0
}
impl ModelSettings {
    /// Returns `true` when `self` differs from `other` in any field,
    /// according to the derived `PartialEq`.
    pub fn is_dirty(&self, other: &Self) -> bool {
        PartialEq::ne(self, other)
    }
}
/// Configuration of a bench-tune run: which model to test, the prompt, how
/// many iterations to run and which parameters to sweep.
///
/// `BenchTuneConfig::new` in this file fills in the default parameter list,
/// a 30 second `test_duration`, `n_predict = 512` and the default
/// `bench_mode`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct BenchTuneConfig {
pub model_path: PathBuf,
pub num_iterations: u32,
pub prompt: String,
// Sweep definitions; only entries with `enabled == true` are expanded.
pub params_to_test: Vec<BenchTuneParam>,
pub test_duration: Duration,
pub bench_mode: BenchTuneMode,
pub n_predict: u32,
// Raw string passed through as-is; the default is a JSON object literal.
pub chat_template_kwargs: Option<String>,
}
/// One sweepable parameter: its name, the inclusive `min..=max` range, the
/// step between tested values, and whether the sweep is enabled.
///
/// Equality is implemented by hand below so the type can be `Eq` despite
/// its `f64` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchTuneParam {
pub name: String,
pub min: f64,
pub max: f64,
pub step: f64,
pub enabled: bool,
}
impl PartialEq for BenchTuneParam {
    /// Compares all fields, with the floating-point fields compared by bit
    /// pattern. Unlike `==` on `f64`, this makes equality reflexive (a NaN
    /// equals an identical NaN) and distinguishes `0.0` from `-0.0`.
    fn eq(&self, other: &Self) -> bool {
        let range_bits = |p: &Self| (p.min.to_bits(), p.max.to_bits(), p.step.to_bits());
        self.name == other.name
            && range_bits(self) == range_bits(other)
            && self.enabled == other.enabled
    }
}

// Sound because `eq` above is bitwise on the float fields and therefore reflexive.
impl Eq for BenchTuneParam {}
/// One concrete combination of parameter values for a bench-tune test.
///
/// Every field is optional. NOTE(review): presumably `None` means "leave the
/// base setting unchanged" — confirm with the benchmark runner.
///
/// Equality is implemented by hand below so the type can be `Eq` despite
/// its `f64` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchTuneParamValue {
pub temperature: Option<f64>,
pub top_p: Option<f64>,
pub top_k: Option<i64>,
pub repeat_penalty: Option<f64>,
pub context_length: Option<u32>,
pub batch_size: Option<u32>,
pub flash_attn: Option<bool>,
pub threads: Option<u32>,
pub expert_count: Option<i32>,
pub spec_type: Option<String>,
pub draft_tokens: Option<u32>,
}
impl PartialEq for BenchTuneParamValue {
    /// Compares all fields, with the optional floating-point fields compared
    /// by bit pattern so that equality is reflexive even for NaN.
    fn eq(&self, other: &Self) -> bool {
        /// Bit pattern of an optional float, for exact comparison.
        fn bits(value: Option<f64>) -> Option<u64> {
            value.map(f64::to_bits)
        }

        bits(self.temperature) == bits(other.temperature)
            && bits(self.top_p) == bits(other.top_p)
            && self.top_k == other.top_k
            && bits(self.repeat_penalty) == bits(other.repeat_penalty)
            && self.context_length == other.context_length
            && self.batch_size == other.batch_size
            && self.flash_attn == other.flash_attn
            && self.threads == other.threads
            && self.expert_count == other.expert_count
            && self.spec_type == other.spec_type
            && self.draft_tokens == other.draft_tokens
    }
}

// Sound because `eq` above is bitwise on the float fields and therefore reflexive.
impl Eq for BenchTuneParamValue {}
/// Outcome of one tested parameter combination: the parameters, a summary
/// `metrics` value, the collected outputs, the metrics of each iteration,
/// and optionally the base settings.
///
/// NOTE(review): how `metrics` is derived from `per_iteration_metrics`
/// (mean, best, last) is decided by the runner, not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchTuneResult {
pub params: BenchTuneParamValue,
pub metrics: BenchTuneMetrics,
pub outputs: Vec<String>,
pub per_iteration_metrics: Vec<BenchTuneMetrics>,
pub base_settings: Option<ModelSettings>,
}
/// Throughput and latency figures measured for a bench-tune test.
///
/// The `*_tps` fields are tokens-per-second style rates by name; the units
/// of `latency_per_token` and `first_token_time` are not established in
/// this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchTuneMetrics {
pub prompt_tps: f64,
pub generation_tps: f64,
pub combined_tps: f64,
pub latency_per_token: f64,
pub first_token_time: f64,
}
/// Serializable status of a bench-tune run.
///
/// `BenchTuneProgress` in this file has the same variants and fields but is
/// not serializable; `BenchTuneProgress::from_status` converts between them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BenchTuneStatus {
// A test is in flight; carries the position in the run and its parameters.
Running {
current: usize,
total: usize,
progress: f32,
current_params: BenchTuneParamValue,
},
// The run finished; carries totals and the elapsed time.
Completed {
total_tests: usize,
successful_tests: usize,
elapsed: Duration,
},
// The run finished and additionally reports a count of failed tests.
PartiallyCompleted {
total_tests: usize,
successful_tests: usize,
failed_tests: usize,
elapsed: Duration,
},
// The run was cancelled; carries the counts gathered up to that point.
Cancelled {
total_tests: usize,
successful_tests: usize,
failed_tests: usize,
elapsed: Duration,
},
// The run ended with an error message.
Error {
error: String,
},
}
/// Mode of a bench-tune run; `Full` is the default.
///
/// NOTE(review): presumably `RuntimeOnly` restricts the sweep to parameters
/// that can change without reloading the model — confirm with the runner.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Default)]
pub enum BenchTuneMode {
RuntimeOnly,
#[default]
Full,
}
/// Non-serializable mirror of `BenchTuneStatus`, with identical variants and
/// fields. Values are produced by `BenchTuneProgress::from_status`.
#[derive(Debug, Clone)]
pub enum BenchTuneProgress {
// A test is in flight; carries the position in the run and its parameters.
Running {
current: usize,
total: usize,
progress: f32,
current_params: BenchTuneParamValue,
},
// The run finished; carries totals and the elapsed time.
Completed {
total_tests: usize,
successful_tests: usize,
elapsed: Duration,
},
// The run finished and additionally reports a count of failed tests.
PartiallyCompleted {
total_tests: usize,
successful_tests: usize,
failed_tests: usize,
elapsed: Duration,
},
// The run was cancelled; carries the counts gathered up to that point.
Cancelled {
total_tests: usize,
successful_tests: usize,
failed_tests: usize,
elapsed: Duration,
},
// The run ended with an error message.
Error { error: String },
}
impl BenchTuneProgress {
pub fn from_status(status: &BenchTuneStatus) -> Option<Self> {
match status {
BenchTuneStatus::Running {
current,
total,
progress,
current_params,
} => Some(BenchTuneProgress::Running {
current: *current,
total: *total,
progress: *progress,
current_params: current_params.clone(),
}),
BenchTuneStatus::Completed {
total_tests,
successful_tests,
elapsed,
} => Some(BenchTuneProgress::Completed {
total_tests: *total_tests,
successful_tests: *successful_tests,
elapsed: *elapsed,
}),
BenchTuneStatus::PartiallyCompleted {
total_tests,
successful_tests,
failed_tests,
elapsed,
} => Some(BenchTuneProgress::PartiallyCompleted {
total_tests: *total_tests,
successful_tests: *successful_tests,
failed_tests: *failed_tests,
elapsed: *elapsed,
}),
BenchTuneStatus::Cancelled {
total_tests,
successful_tests,
failed_tests,
elapsed,
} => Some(BenchTuneProgress::Cancelled {
total_tests: *total_tests,
successful_tests: *successful_tests,
failed_tests: *failed_tests,
elapsed: *elapsed,
}),
BenchTuneStatus::Error { error } => Some(BenchTuneProgress::Error {
error: error.clone(),
}),
}
}
}
impl BenchTuneConfig {
pub fn new(model_path: PathBuf, num_iterations: u32, prompt: String) -> Self {
Self {
model_path,
num_iterations,
prompt,
params_to_test: vec![
BenchTuneParam {
name: "temperature".to_string(),
min: 0.4,
max: 1.0,
step: 0.1,
enabled: false,
},
BenchTuneParam {
name: "top_p".to_string(),
min: 0.8,
max: 1.0,
step: 0.1,
enabled: false,
},
BenchTuneParam {
name: "top_k".to_string(),
min: 10.0,
max: 40.0,
step: 5.0,
enabled: false,
},
BenchTuneParam {
name: "repeat_penalty".to_string(),
min: 1.0,
max: 1.2,
step: 0.1,
enabled: false,
},
BenchTuneParam {
name: "flash_attn".to_string(),
min: 0.0,
max: 1.0,
step: 1.0,
enabled: false,
},
BenchTuneParam {
name: "threads".to_string(),
min: 4.0,
max: 16.0,
step: 4.0,
enabled: false,
},
BenchTuneParam {
name: "batch_size".to_string(),
min: 512.0,
max: 2048.0,
step: 512.0,
enabled: false,
},
BenchTuneParam {
name: "expert_count".to_string(),
min: 1.0,
max: 4.0,
step: 1.0,
enabled: false,
},
],
test_duration: Duration::from_secs(30),
bench_mode: BenchTuneMode::default(),
n_predict: 512,
chat_template_kwargs: Some(r#"{"enable_thinking": false}"#.to_string()),
}
}
pub fn generate_combinations(&self) -> Vec<BenchTuneParamValue> {
let mut temp_values = vec![None];
let mut top_p_values = vec![None];
let mut top_k_values = vec![None];
let mut repeat_penalty_values = vec![None];
let mut flash_attn_values = vec![None];
let mut threads_values = vec![None];
let mut batch_size_values = vec![None];
let mut expert_count_values = vec![None];
for p in &self.params_to_test {
if !p.enabled {
continue;
}
let vals: Vec<f64> = {
let step_count = ((p.max - p.min) / p.step).ceil() as usize;
(0..=step_count)
.map(|i| (p.min + (i as f64 * p.step)).min(p.max))
.collect()
};
match p.name.as_str() {
"temperature" => temp_values = vals.into_iter().map(Some).collect(),
"top_p" => top_p_values = vals.into_iter().map(Some).collect(),
"top_k" => top_k_values = vals.into_iter().map(|v| Some(v as i64)).collect(),
"repeat_penalty" => repeat_penalty_values = vals.into_iter().map(Some).collect(),
"flash_attn" => {
flash_attn_values = vals.into_iter().map(|v| Some(v >= 0.5)).collect()
}
"threads" => threads_values = vals.into_iter().map(|v| Some(v as u32)).collect(),
"batch_size" => {
batch_size_values = vals.into_iter().map(|v| Some(v as u32)).collect()
}
"expert_count" => {
expert_count_values = vals.into_iter().map(|v| Some(v as i32)).collect()
}
_ => {}
}
}
let mut combinations = Vec::new();
for &temp in &temp_values {
for &top_p in &top_p_values {
for &top_k in &top_k_values {
for &rp in &repeat_penalty_values {
for &fa in &flash_attn_values {
for &th in &threads_values {
for &bs in &batch_size_values {
for &ec in &expert_count_values {
combinations.push(BenchTuneParamValue {
temperature: temp,
top_p,
top_k,
repeat_penalty: rp,
context_length: None,
batch_size: bs,
flash_attn: fa,
threads: th,
expert_count: ec,
spec_type: None,
draft_tokens: None,
});
}
}
}
}
}
}
}
}
combinations
}
pub fn get_total_tests_count(&self) -> usize {
self.generate_combinations().len()
}
}
#[cfg(test)]
mod field_count_tests {
    use super::*;

    /// Guards the hand-maintained `ModelSettings` field list: the number of
    /// fields named in `count_model_settings_fields` must stay at 75.
    #[test]
    fn test_model_settings_field_count() {
        let s = ModelSettings::default();
        let field_count = count_model_settings_fields(&s);
        assert_eq!(
            field_count, 75,
            "ModelSettings has {} fields (expected 75). \
             Update the checklist at src/models.rs:665 and all locations listed there.",
            field_count
        );
    }

    /// Returns the number of `ModelSettings` fields named in the list below.
    ///
    /// The count is derived from the list itself rather than hard-coded, so
    /// editing the list without updating the expected value in the test above
    /// now makes that test fail. Removing or renaming a listed field is still
    /// a compile error.
    // NOTE(review): a field added to `ModelSettings` but not to this list is
    // still not detected; an exhaustive `let ModelSettings { .. }` destructuring
    // without `..` would catch that, but needs the full struct definition.
    #[allow(clippy::too_many_lines)]
    fn count_model_settings_fields(s: &ModelSettings) -> usize {
        // Touches every listed field (so each must exist) and counts them by
        // building an array with one unit element per field.
        macro_rules! count_fields {
            ($settings:expr; $($field:ident),+ $(,)?) => {
                [$({ let _ = &$settings.$field; }),+].len()
            };
        }

        count_fields!(
            s;
            context_length,
            threads,
            threads_batch,
            batch_size,
            ubatch_size,
            parallel,
            max_concurrent_predictions,
            uniform_cache,
            kv_cache_offload,
            cache_type_k,
            cache_type_v,
            keep,
            swa_full,
            mlock,
            mmap,
            numa,
            system_prompt,
            system_prompt_preset_name,
            gpu_layers_mode,
            split_mode,
            tensor_split,
            main_gpu,
            fit,
            lora,
            lora_scaled,
            rpc,
            embedding,
            flash_attn,
            expert_count,
            jinja,
            chat_template,
            chat_template_kwargs,
            seed,
            temperature,
            top_k,
            top_p,
            min_p,
            typical_p,
            mirostat,
            mirostat_lr,
            mirostat_ent,
            ignore_eos,
            samplers,
            repeat_penalty,
            repeat_last_n,
            presence_penalty,
            frequency_penalty,
            dry_multiplier,
            dry_base,
            dry_allowed_length,
            dry_penalty_last_n,
            rope_scaling,
            rope_scale,
            rope_freq_base,
            rope_freq_scale,
            rope_yarn_enabled,
            host,
            port,
            timeout,
            cache_prompt,
            cache_reuse,
            webui,
            max_tokens,
            cache_type,
            backend,
            llama_cpp_version_cpu,
            llama_cpp_version_vulkan,
            llama_cpp_version_rocm,
            llama_cpp_version_rocm_lemonade,
            llama_cpp_version_cuda,
            api_endpoint_enabled,
            api_endpoint_port,
            spec_type,
            draft_tokens,
            tags,
        )
    }

    /// `is_dirty` must agree with `!=` for equal settings (two defaults and a clone).
    #[test]
    fn test_is_dirty_uses_derived_eq() {
        let s1 = ModelSettings::default();
        let s2 = ModelSettings::default();
        let s3 = s1.clone();
        assert!(!s1.is_dirty(&s2));
        assert!(!s1.is_dirty(&s3));
        assert_eq!(s1 != s2, s1.is_dirty(&s2));
        assert_eq!(s1 != s3, s1.is_dirty(&s3));
    }

    /// Spot-checks the `DefaultParams` -> `ModelSettings` conversion on four fields.
    #[test]
    fn test_from_default_params_completeness() {
        let dp = crate::config::DefaultParams::default();
        let ms: ModelSettings = dp.clone().into();
        assert_eq!(ms.context_length, 131072);
        assert_eq!(ms.threads, dp.threads);
        assert_eq!(ms.temperature, 0.8);
        assert_eq!(ms.backend, dp.backend);
    }
}