pub struct LlamaContextParams { /* private fields */ }Expand description
A safe wrapper around llama_context_params.
Generally this should be created with Default::default() and then modified with with_* methods.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use std::num::NonZeroU32;
let ctx_params = LlamaContextParams::default()
.with_n_ctx(NonZeroU32::new(2048))
.with_seed(1234);
assert_eq!(ctx_params.seed(), 1234);
assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));Implementations§
Source§impl LlamaContextParams
impl LlamaContextParams
Sourcepub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self
pub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self
Set the size of the context
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use std::num::NonZeroU32;
let params = LlamaContextParams::default();
let params = params.with_n_ctx(NonZeroU32::new(2048));
assert_eq!(params.n_ctx(), NonZeroU32::new(2048));Sourcepub fn n_ctx(&self) -> Option<NonZeroU32>
pub fn n_ctx(&self) -> Option<NonZeroU32>
Sourcepub fn with_n_batch(self, n_batch: u32) -> Self
pub fn with_n_batch(self, n_batch: u32) -> Self
Set the n_batch
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_batch(2048);
assert_eq!(params.n_batch(), 2048);Sourcepub fn n_batch(&self) -> u32
pub fn n_batch(&self) -> u32
Get the n_batch
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_batch(), 2048);Sourcepub fn with_n_ubatch(self, n_ubatch: u32) -> Self
pub fn with_n_ubatch(self, n_ubatch: u32) -> Self
Set the n_ubatch
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_ubatch(512);
assert_eq!(params.n_ubatch(), 512);Sourcepub fn n_ubatch(&self) -> u32
pub fn n_ubatch(&self) -> u32
Get the n_ubatch
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_ubatch(), 512);Sourcepub fn with_flash_attention(self, enabled: bool) -> Self
pub fn with_flash_attention(self, enabled: bool) -> Self
Set the flash_attention parameter
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_flash_attention(true);
assert_eq!(params.flash_attention(), true);Sourcepub fn flash_attention(&self) -> bool
pub fn flash_attention(&self) -> bool
Get the flash_attention parameter
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.flash_attention(), false);Sourcepub fn with_offload_kqv(self, enabled: bool) -> Self
pub fn with_offload_kqv(self, enabled: bool) -> Self
Set the offload_kqv parameter to control offloading KV cache & KQV ops to GPU
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_offload_kqv(false);
assert_eq!(params.offload_kqv(), false);Sourcepub fn offload_kqv(&self) -> bool
pub fn offload_kqv(&self) -> bool
Get the offload_kqv parameter
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.offload_kqv(), true);Sourcepub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self
pub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self
Set the type of rope scaling.
§Examples
use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
let params = LlamaContextParams::default()
.with_rope_scaling_type(RopeScalingType::Linear);
assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);Sourcepub fn rope_scaling_type(&self) -> RopeScalingType
pub fn rope_scaling_type(&self) -> RopeScalingType
Get the type of rope scaling.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_scaling_type(), llama_cpp_4::context::params::RopeScalingType::Unspecified);Sourcepub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self
pub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self
Set the rope frequency base.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_rope_freq_base(0.5);
assert_eq!(params.rope_freq_base(), 0.5);Sourcepub fn rope_freq_base(&self) -> f32
pub fn rope_freq_base(&self) -> f32
Get the rope frequency base.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_base(), 0.0);Sourcepub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self
pub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self
Set the rope frequency scale.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_rope_freq_scale(0.5);
assert_eq!(params.rope_freq_scale(), 0.5);Sourcepub fn rope_freq_scale(&self) -> f32
pub fn rope_freq_scale(&self) -> f32
Get the rope frequency scale.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_scale(), 0.0);Sourcepub fn n_threads(&self) -> i32
pub fn n_threads(&self) -> i32
Get the number of threads.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads(), 4);Sourcepub fn n_threads_batch(&self) -> i32
pub fn n_threads_batch(&self) -> i32
Get the number of threads allocated for batches.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads_batch(), 4);Sourcepub fn with_n_threads(self, n_threads: i32) -> Self
pub fn with_n_threads(self, n_threads: i32) -> Self
Set the number of threads.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_threads(8);
assert_eq!(params.n_threads(), 8);Sourcepub fn with_n_threads_batch(self, n_threads: i32) -> Self
pub fn with_n_threads_batch(self, n_threads: i32) -> Self
Set the number of threads allocated for batches.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_threads_batch(8);
assert_eq!(params.n_threads_batch(), 8);Sourcepub fn embeddings(&self) -> bool
pub fn embeddings(&self) -> bool
Check whether embeddings are enabled
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.embeddings());Sourcepub fn with_embeddings(self, embedding: bool) -> Self
pub fn with_embeddings(self, embedding: bool) -> Self
Enable the use of embeddings
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_embeddings(true);
assert!(params.embeddings());Sourcepub fn with_cb_eval(self, cb_eval: ggml_backend_sched_eval_callback) -> Self
pub fn with_cb_eval(self, cb_eval: ggml_backend_sched_eval_callback) -> Self
Set the evaluation callback.
§Examples
extern "C" fn cb_eval_fn(
t: *mut llama_cpp_sys_4::ggml_tensor,
ask: bool,
user_data: *mut std::ffi::c_void,
) -> bool {
false
}
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));Sourcepub fn with_cb_eval_user_data(self, cb_eval_user_data: *mut c_void) -> Self
pub fn with_cb_eval_user_data(self, cb_eval_user_data: *mut c_void) -> Self
Set the evaluation callback user data.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
let user_data = std::ptr::null_mut();
let params = params.with_cb_eval_user_data(user_data);Sourcepub fn with_tensor_capture(self, capture: &mut TensorCapture) -> Self
pub fn with_tensor_capture(self, capture: &mut TensorCapture) -> Self
Attach a TensorCapture to
intercept intermediate tensor outputs during decode().
This sets up the cb_eval callback to capture tensors matching the
capture’s filter (e.g. specific layer outputs). After decode() the
captured data can be read from the TensorCapture.
§Example
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::context::tensor_capture::TensorCapture;
let mut capture = TensorCapture::for_layers(&[13, 20, 27]);
let ctx_params = LlamaContextParams::default()
.with_embeddings(true)
.with_tensor_capture(&mut capture);Sourcepub fn with_cache_type_k(self, ty: GgmlType) -> Self
pub fn with_cache_type_k(self, ty: GgmlType) -> Self
Set the storage type for the K (key) KV cache tensors.
The default is GgmlType::F16. Quantized types like GgmlType::Q5_0
or GgmlType::Q4_0 reduce VRAM usage significantly; combining them with
TurboQuant attention rotation (the default) keeps quality high.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
.with_cache_type_k(GgmlType::Q5_0);Sourcepub fn cache_type_k(&self) -> ggml_type
pub fn cache_type_k(&self) -> ggml_type
Get the K-cache storage type.
Sourcepub fn with_cache_type_v(self, ty: GgmlType) -> Self
pub fn with_cache_type_v(self, ty: GgmlType) -> Self
Set the storage type for the V (value) KV cache tensors.
See with_cache_type_k for details.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
.with_cache_type_v(GgmlType::Q5_0);Sourcepub fn cache_type_v(&self) -> ggml_type
pub fn cache_type_v(&self) -> ggml_type
Get the V-cache storage type.
Sourcepub fn with_attn_rot_disabled(self, disabled: bool) -> Self
pub fn with_attn_rot_disabled(self, disabled: bool) -> Self
Control the TurboQuant attention-rotation feature (llama.cpp PR #21038).
By default, llama.cpp applies a Hadamard rotation to Q/K/V tensors before writing them into the KV cache. This significantly improves quantized KV-cache quality at near-zero overhead, and is enabled automatically for models whose head dimension is a power of two.
Set disabled = true to opt out (equivalent to LLAMA_ATTN_ROT_DISABLE=1).
The env-var is applied just before the context is created and restored
afterwards, so this is safe to call from a single thread.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
// Disable rotation for this context only:
let params = LlamaContextParams::default().with_attn_rot_disabled(true);
assert!(params.attn_rot_disabled());Sourcepub fn attn_rot_disabled(&self) -> bool
pub fn attn_rot_disabled(&self) -> bool
Returns true if TurboQuant attention rotation is disabled for this context.
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.attn_rot_disabled());Sourcepub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self
pub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self
Set the type of pooling.
§Examples
use llama_cpp_4::context::params::{LlamaContextParams, LlamaPoolingType};
let params = LlamaContextParams::default()
.with_pooling_type(LlamaPoolingType::Last);
assert_eq!(params.pooling_type(), LlamaPoolingType::Last);Sourcepub fn pooling_type(&self) -> LlamaPoolingType
pub fn pooling_type(&self) -> LlamaPoolingType
Get the type of pooling.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.pooling_type(), llama_cpp_4::context::params::LlamaPoolingType::Unspecified);Trait Implementations§
Source§impl Clone for LlamaContextParams
impl Clone for LlamaContextParams
Source§fn clone(&self) -> LlamaContextParams
fn clone(&self) -> LlamaContextParams
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
source. Read moreSource§impl Debug for LlamaContextParams
impl Debug for LlamaContextParams
Source§impl Default for LlamaContextParams
Default parameters for LlamaContext. (as defined in llama.cpp by llama_context_default_params)
impl Default for LlamaContextParams
Default parameters for LlamaContext. (as defined in llama.cpp by llama_context_default_params)
use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
use std::num::NonZeroU32;
let params = LlamaContextParams::default();
assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);impl Send for LlamaContextParams
SAFETY: we do not currently allow setting or reading the pointers that cause this to not be automatically send or sync.