/// Context pool size that `ModelLoadConfig::new` assigns to `context_pool_size`.
pub const DEFAULT_CONTEXT_POOL_SIZE: usize = 4;
/// Guesses a quantization label from the file name of a model path.
///
/// Only the final path component is inspected. It is upper-cased (ASCII) and
/// searched for each entry of a fixed label list, in list order; the first
/// label found is returned in its upper-case spelling. Because the list holds
/// `Q3_K` but no longer `Q3_K_*` variants, a name such as `x.Q3_K_M.gguf`
/// yields `"Q3_K"`.
///
/// A label only counts when it sits at the start of the name or right after a
/// character that is not an ASCII letter or digit. Without that check `F16`
/// would be reported for `BF16` files, which use a different format.
///
/// Returns `None` when the path has no file name component or no label is
/// found.
pub(super) fn detect_quantization_from_path(path: &str) -> Option<String> {
    // Order matters: the first label that matches wins.
    const KNOWN: &[&str] = &[
        "Q2_K", "Q3_K", "Q4_0", "Q4_1", "Q4_K_M", "Q4_K_S", "Q5_0", "Q5_1", "Q5_K_M", "Q5_K_S",
        "Q6_K", "Q8_0", "F16", "F32",
    ];

    let filename = std::path::Path::new(path)
        .file_name()?
        .to_str()?
        .to_ascii_uppercase();

    // True when the byte before `idx` (if any) is not an ASCII letter or digit.
    // `idx` always comes from `match_indices`, so it is a valid char boundary.
    let starts_token = |idx: usize| {
        filename[..idx]
            .bytes()
            .next_back()
            .map_or(true, |prev| !prev.is_ascii_alphanumeric())
    };

    KNOWN
        .iter()
        .find(|label| {
            // Check every occurrence, not just the first, so an earlier
            // embedded hit (e.g. inside "BF16") cannot hide a later valid one.
            filename
                .match_indices(**label)
                .any(|(idx, _)| starts_token(idx))
        })
        .map(|label| (*label).to_string())
}
/// Optional per-model settings.
///
/// With the derived `Default`, `stop_sequences` is empty and every other
/// field is `None`.
#[derive(Debug, Clone, Default)]
pub struct ModelConfig {
/// Stop sequences; presumably strings that end generation when produced (confirm against the consumer).
pub stop_sequences: Vec<String>,
/// Optional system prompt text.
pub system_prompt: Option<String>,
/// Optional temperature; presumably the sampling temperature (confirm).
pub temperature: Option<f32>,
/// Optional top-p value; presumably the nucleus-sampling threshold (confirm).
pub top_p: Option<f32>,
/// Optional top-k value; presumably the sampling candidate count (confirm).
pub top_k: Option<i32>,
/// Optional context size; units are not visible here (presumably tokens; confirm).
pub context_size: Option<u32>,
}
/// Settings describing how a model should be loaded.
///
/// Built with `ModelLoadConfig::new` and then adjusted through the chained
/// setter methods of the same name as each field.
#[derive(Debug, Clone)]
pub struct ModelLoadConfig {
/// Alias given to the model; first argument of `new`.
pub alias: String,
/// Path string for the model; second argument of `new`.
pub path: String,
/// GPU layer count; `new` starts it at 0. Presumably the number of layers offloaded to the GPU (confirm).
pub gpu_layers: i32,
/// Context size; `new` starts it at 4096.
pub context_size: u32,
/// Thread count; `new` starts it at `num_cpus::get()` cast to `i32`.
pub threads: i32,
/// Context pool size; `new` starts it at `DEFAULT_CONTEXT_POOL_SIZE`, and the setter never stores less than 1.
pub context_pool_size: usize,
/// Optional mmproj path; `None` unless set. Presumably a multimodal projector file (confirm).
pub mmproj_path: Option<String>,
/// Optional `ModelConfig` attached via `with_config`.
pub model_config: Option<ModelConfig>,
/// Optional mmap flag; `None` leaves the choice unspecified here.
pub use_mmap: Option<bool>,
/// mlock flag; `new` starts it at `false`.
pub use_mlock: bool,
/// Flash-attention flag; `new` starts it at `false`.
pub flash_attn: bool,
/// Optional cache type name for K; accepted values are not visible here.
pub cache_type_k: Option<String>,
/// Optional cache type name for V; accepted values are not visible here.
pub cache_type_v: Option<String>,
/// Optional RoPE frequency base.
pub rope_freq_base: Option<f32>,
/// Optional RoPE frequency scale.
pub rope_freq_scale: Option<f32>,
/// Optional batch size.
pub n_batch: Option<u32>,
/// Optional defragmentation threshold.
pub defrag_thold: Option<f32>,
/// Optional split mode name; accepted values are not visible here.
pub split_mode: Option<String>,
}
impl ModelLoadConfig {
    /// Creates a configuration for `alias` and `path` with all other fields
    /// at their starting values.
    ///
    /// Starting values: `gpu_layers` 0, `context_size` 4096, `threads` from
    /// `num_cpus::get()`, `context_pool_size` `DEFAULT_CONTEXT_POOL_SIZE`,
    /// `use_mlock` and `flash_attn` `false`, and every optional field `None`.
    pub fn new(alias: impl Into<String>, path: impl Into<String>) -> Self {
        // Computed up front, in the same order the conversions ran before.
        let alias = alias.into();
        let path = path.into();
        let threads = num_cpus::get() as i32;

        Self {
            alias,
            path,
            threads,
            gpu_layers: 0,
            context_size: 4096,
            context_pool_size: DEFAULT_CONTEXT_POOL_SIZE,
            use_mlock: false,
            flash_attn: false,
            mmproj_path: None,
            model_config: None,
            use_mmap: None,
            cache_type_k: None,
            cache_type_v: None,
            rope_freq_base: None,
            rope_freq_scale: None,
            n_batch: None,
            defrag_thold: None,
            split_mode: None,
        }
    }

    /// Returns the configuration with `gpu_layers` replaced by `layers`.
    pub fn gpu_layers(self, layers: i32) -> Self {
        Self {
            gpu_layers: layers,
            ..self
        }
    }

    /// Returns the configuration with `context_size` replaced by `size`.
    pub fn context_size(self, size: u32) -> Self {
        Self {
            context_size: size,
            ..self
        }
    }

    /// Returns the configuration with `threads` replaced by the given value.
    pub fn threads(self, threads: i32) -> Self {
        Self { threads, ..self }
    }

    /// Returns the configuration with `context_pool_size` set to `size`,
    /// except that a request for 0 is stored as 1.
    pub fn context_pool_size(self, size: usize) -> Self {
        let context_pool_size = if size == 0 { 1 } else { size };
        Self {
            context_pool_size,
            ..self
        }
    }

    /// Returns the configuration with `mmproj_path` set to `Some(path)`.
    pub fn mmproj(self, path: impl Into<String>) -> Self {
        Self {
            mmproj_path: Some(path.into()),
            ..self
        }
    }

    /// Returns the configuration with `model_config` set to `Some(config)`.
    pub fn with_config(self, config: ModelConfig) -> Self {
        Self {
            model_config: Some(config),
            ..self
        }
    }

    /// Returns the configuration with `use_mmap` set to `Some(use_mmap)`.
    pub fn use_mmap(self, use_mmap: bool) -> Self {
        Self {
            use_mmap: Some(use_mmap),
            ..self
        }
    }

    /// Returns the configuration with `use_mlock` replaced by `mlock`.
    pub fn use_mlock(self, mlock: bool) -> Self {
        Self {
            use_mlock: mlock,
            ..self
        }
    }

    /// Returns the configuration with `flash_attn` replaced by `enabled`.
    pub fn flash_attn(self, enabled: bool) -> Self {
        Self {
            flash_attn: enabled,
            ..self
        }
    }

    /// Returns the configuration with `cache_type_k` set to `Some(cache_type)`.
    pub fn cache_type_k(self, cache_type: impl Into<String>) -> Self {
        Self {
            cache_type_k: Some(cache_type.into()),
            ..self
        }
    }

    /// Returns the configuration with `cache_type_v` set to `Some(cache_type)`.
    pub fn cache_type_v(self, cache_type: impl Into<String>) -> Self {
        Self {
            cache_type_v: Some(cache_type.into()),
            ..self
        }
    }

    /// Returns the configuration with `rope_freq_base` set to `Some(base)`.
    pub fn rope_freq_base(self, base: f32) -> Self {
        Self {
            rope_freq_base: Some(base),
            ..self
        }
    }

    /// Returns the configuration with `rope_freq_scale` set to `Some(scale)`.
    pub fn rope_freq_scale(self, scale: f32) -> Self {
        Self {
            rope_freq_scale: Some(scale),
            ..self
        }
    }

    /// Returns the configuration with `n_batch` set to `Some(batch)`.
    pub fn n_batch(self, batch: u32) -> Self {
        Self {
            n_batch: Some(batch),
            ..self
        }
    }

    /// Returns the configuration with `defrag_thold` set to `Some(thold)`.
    pub fn defrag_thold(self, thold: f32) -> Self {
        Self {
            defrag_thold: Some(thold),
            ..self
        }
    }

    /// Returns the configuration with `split_mode` set to `Some(mode)`.
    pub fn split_mode(self, mode: impl Into<String>) -> Self {
        Self {
            split_mode: Some(mode.into()),
            ..self
        }
    }
}