pub struct ModelSettings {
pub context_length: u32,
pub threads: u32,
pub threads_batch: u32,
pub batch_size: u32,
pub ubatch_size: u32,
pub parallel: u32,
pub max_concurrent_predictions: Option<u32>,
pub uniform_cache: bool,
pub kv_cache_offload: bool,
pub cache_type_k: Option<CacheTypeK>,
pub cache_type_v: Option<CacheTypeV>,
pub keep: i32,
pub swa_full: bool,
pub mlock: bool,
pub mmap: bool,
pub numa: NumMode,
pub system_prompt: String,
pub system_prompt_preset_name: String,
pub gpu_layers_mode: GpuLayersMode,
pub split_mode: SplitMode,
pub tensor_split: String,
pub main_gpu: i32,
pub fit: bool,
pub lora: Option<PathBuf>,
pub lora_scaled: Option<(PathBuf, f32)>,
pub rpc: String,
pub embedding: bool,
pub flash_attn: bool,
pub expert_count: i32,
pub jinja: bool,
pub chat_template: Option<String>,
pub chat_template_kwargs: Option<String>,
pub seed: i32,
pub temperature: f32,
pub top_k: i32,
pub top_p: f32,
pub min_p: f32,
pub typical_p: f32,
pub mirostat: Mirostat,
pub mirostat_lr: f32,
pub mirostat_ent: f32,
pub ignore_eos: bool,
pub samplers: Samplers,
pub repeat_penalty: f32,
pub repeat_last_n: i32,
pub presence_penalty: Option<f32>,
pub frequency_penalty: Option<f32>,
pub dry_multiplier: f32,
pub dry_base: f32,
pub dry_allowed_length: i32,
pub dry_penalty_last_n: i32,
pub rope_scaling: RopeScaling,
pub rope_scale: f32,
pub rope_freq_base: f32,
pub rope_freq_scale: f32,
pub rope_yarn_enabled: bool,
pub host: String,
pub port: u16,
pub timeout: u32,
pub cache_prompt: bool,
pub cache_reuse: u32,
pub webui: bool,
pub max_tokens: Option<u32>,
pub cache_type: CacheType,
pub backend: Backend,
pub llama_cpp_version_cpu: Option<String>,
pub llama_cpp_version_vulkan: Option<String>,
pub llama_cpp_version_rocm: Option<String>,
pub llama_cpp_version_rocm_lemonade: Option<String>,
pub llama_cpp_version_cuda: Option<String>,
pub api_endpoint_enabled: bool,
pub api_endpoint_port: u16,
pub spec_type: String,
pub draft_tokens: u32,
pub tags: Vec<String>,
}
Settings for loading a model via llama.cpp server.
Fields§
context_length: u32Size of the prompt context.
threads: u32Number of CPU threads for generation.
threads_batch: u32Number of CPU threads for batch processing.
batch_size: u32Logical maximum batch size.
ubatch_size: u32Physical maximum batch size (micro-batch).
parallel: u32Max concurrent predictions (sequences).
max_concurrent_predictions: Option<u32>Max concurrent predictions (requests in flight). None means no --parallel argument.
uniform_cache: boolUse uniform (unified) KV cache across all sequences.
kv_cache_offload: boolOffload KV cache to system RAM.
cache_type_k: Option<CacheTypeK>KV cache data type for K.
cache_type_v: Option<CacheTypeV>KV cache data type for V.
keep: i32Keep N tokens from the initial prompt.
swa_full: boolUse full-size SWA cache.
mlock: boolForce system to keep model in RAM.
mmap: boolMemory-map the model.
numa: NumModeNUMA optimization.
system_prompt: StringSystem prompt.
system_prompt_preset_name: StringName of the system prompt preset currently selected.
gpu_layers_mode: GpuLayersModeGPU layer offloading mode.
split_mode: SplitModeSplit mode across multiple GPUs.
tensor_split: StringFraction of model offloaded to each GPU (comma-separated).
main_gpu: i32Main GPU index.
fit: boolWhether to adjust arguments to fit device memory.
lora: Option<PathBuf>Path to LoRA adapter.
lora_scaled: Option<(PathBuf, f32)>Path to LoRA adapter with scale.
rpc: StringRPC servers.
embedding: boolRestrict to embedding use case.
flash_attn: boolEnable Flash Attention.
expert_count: i32Active experts per token (MoE models, -1 = model default).
jinja: boolUse Jinja template engine for chat.
chat_template: Option<String>Custom chat template string.
chat_template_kwargs: Option<String>JSON string for --chat-template-kwargs (e.g. {"enable_thinking": false}).
seed: i32RNG seed (-1 = random).
temperature: f32Temperature.
top_k: i32Top-k sampling (0 = disabled).
top_p: f32Top-p sampling (1.0 = disabled).
min_p: f32Minimum probability for a token.
typical_p: f32Locally typical sampling parameter p.
mirostat: MirostatMirostat version (0=off, 1=Mirostat, 2=Mirostat2).
mirostat_lr: f32Mirostat learning rate (eta).
mirostat_ent: f32Mirostat target entropy (tau).
ignore_eos: boolIgnore end-of-stream token.
samplers: SamplersSampler order string.
repeat_penalty: f32Penalize repeat sequence of tokens.
repeat_last_n: i32Last N tokens to consider for repeat penalty.
presence_penalty: Option<f32>Repeat alpha presence penalty.
frequency_penalty: Option<f32>Repeat alpha frequency penalty.
dry_multiplier: f32DRY sampling multiplier.
dry_base: f32DRY sampling base value.
dry_allowed_length: i32DRY allowed length.
dry_penalty_last_n: i32DRY penalty last N.
rope_scaling: RopeScalingRoPE frequency scaling method.
rope_scale: f32RoPE context scaling factor.
rope_freq_base: f32RoPE base frequency.
rope_freq_scale: f32RoPE frequency scaling factor.
rope_yarn_enabled: boolEnable Yarn RoPE scaling mode.
host: StringHost address.
port: u16Port.
timeout: u32Server timeout in seconds.
cache_prompt: boolWhether to enable prompt caching.
cache_reuse: u32Min chunk size for cache reuse.
webui: boolWhether to enable WebUI.
max_tokens: Option<u32>Max tokens to predict.
cache_type: CacheTypeCache type (legacy, kept for compatibility).
backend: BackendBackend (cpu/vulkan).
llama_cpp_version_cpu: Option<String>llama.cpp release tag for CPU backend (e.g. “b1234” or None for latest).
llama_cpp_version_vulkan: Option<String>llama.cpp release tag for Vulkan backend (e.g. “b1234” or None for latest).
llama_cpp_version_rocm: Option<String>llama.cpp release tag for ROCm backend (e.g. “b1234” or None for latest).
llama_cpp_version_rocm_lemonade: Option<String>Lemonade llama.cpp release tag for ROCm backend.
llama_cpp_version_cuda: Option<String>llama.cpp release tag for CUDA backend.
api_endpoint_enabled: boolWhether to enable the API proxy server.
api_endpoint_port: u16Port for the API proxy server.
spec_type: StringSpeculative decoding type (e.g., “draft-mtp”, “ngram-simple”, “” for off).
draft_tokens: u32Number of draft tokens for MTP.
tags: Vec<String>Tags for the model.
Implementations§
Source§impl ModelSettings
impl ModelSettings
Sourcepub fn get_active_backend_version(&self) -> Option<&String>
pub fn get_active_backend_version(&self) -> Option<&String>
Get the version string for the currently active backend.
Sourcepub fn get_active_backend_version_display(&self) -> &str
pub fn get_active_backend_version_display(&self) -> &str
Get the display version string for the currently active backend (defaults to “latest”).
Sourcepub fn set_active_backend_version(&mut self, tag: Option<String>)
pub fn set_active_backend_version(&mut self, tag: Option<String>)
Set the version string for the currently active backend.
Source§impl ModelSettings
impl ModelSettings
Sourcepub fn from_config(config: &Config) -> Self
pub fn from_config(config: &Config) -> Self
Create ModelSettings from config defaults, applying model-specific overrides.
Trait Implementations§
Source§impl Clone for ModelSettings
impl Clone for ModelSettings
Source§fn clone(&self) -> ModelSettings
fn clone(&self) -> ModelSettings
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for ModelSettings
impl Debug for ModelSettings
Source§impl Default for ModelSettings
impl Default for ModelSettings
Source§impl<'de> Deserialize<'de> for ModelSettings
impl<'de> Deserialize<'de> for ModelSettings
Source§fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where
__D: Deserializer<'de>,
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where
__D: Deserializer<'de>,
Source§impl From<DefaultParams> for ModelSettings
impl From<DefaultParams> for ModelSettings
Source§fn from(dp: DefaultParams) -> Self
fn from(dp: DefaultParams) -> Self
Source§impl PartialEq for ModelSettings
impl PartialEq for ModelSettings
Source§fn eq(&self, other: &ModelSettings) -> bool
fn eq(&self, other: &ModelSettings) -> bool
Tests for self and other values to be equal, and is used by ==.
Source§impl Serialize for ModelSettings
impl Serialize for ModelSettings
impl StructuralPartialEq for ModelSettings
Auto Trait Implementations§
impl Freeze for ModelSettings
impl RefUnwindSafe for ModelSettings
impl Send for ModelSettings
impl Sync for ModelSettings
impl Unpin for ModelSettings
impl UnsafeUnpin for ModelSettings
impl UnwindSafe for ModelSettings
Blanket Implementations§
Source§impl<'a, T, E> AsTaggedExplicit<'a, E> for T where
T: 'a,
impl<'a, T, E> AsTaggedExplicit<'a, E> for T where
T: 'a,
Source§impl<'a, T, E> AsTaggedImplicit<'a, E> for T where
T: 'a,
impl<'a, T, E> AsTaggedImplicit<'a, E> for T where
T: 'a,
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more