Struct LlamaContextParams

Source

pub struct LlamaContextParams { /* private fields */ }

Expand description

Builder for llama_context_params.

Construct with Default::default(), chain with_* setters, then pass the value to crate::model::LlamaModel::new_context. Getter methods mirror the fields that exist on the underlying C struct.

§Sampler ownership

Self::with_sampler_seq_configs stores owned LlamaSampler chains inside this struct until the context is created. Clone clears sampler configs because the underlying chains cannot be duplicated safely.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;

let ctx_params = LlamaContextParams::default()
    .with_n_ctx(NonZeroU32::new(2048));

assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));

Implementations§

Source §

impl LlamaContextParams

Source

pub fn with_flash_attn_type(self, flash_attn_type: LlamaFlashAttnType) -> Self

Set the flash-attention mode (Auto, Enabled, or Disabled).

Maps to llama_context_params.flash_attn_type. Use LlamaFlashAttnType::Auto to match llama.cpp defaults.

§Examples

use llama_cpp_4::context::params::{LlamaContextParams, LlamaFlashAttnType};
let params = LlamaContextParams::default()
    .with_flash_attn_type(LlamaFlashAttnType::Auto);
assert_eq!(params.flash_attn_type(), LlamaFlashAttnType::Auto);

Source

pub fn flash_attn_type(&self) -> LlamaFlashAttnType

Get the configured flash-attention mode.

Source

pub fn with_attention_type(self, attention_type: LlamaAttentionType) -> Self

Set the attention type used when extracting embeddings.

Maps to llama_context_params.attention_type. Embedding models often need LlamaAttentionType::NonCausal; generative decoding uses LlamaAttentionType::Causal.

§Examples

use llama_cpp_4::context::params::{LlamaAttentionType, LlamaContextParams};
let params = LlamaContextParams::default()
    .with_attention_type(LlamaAttentionType::Causal);
assert_eq!(params.attention_type(), LlamaAttentionType::Causal);

Source

pub fn attention_type(&self) -> LlamaAttentionType

Get the attention type used when extracting embeddings.

Source

pub fn with_n_outputs_max(self, n_outputs_max: u32) -> Self

Set the maximum number of outputs per micro-batch.

Maps to llama_context_params.n_outputs_max. When 0, llama.cpp uses n_batch as the cap.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_n_outputs_max(256);
assert_eq!(params.n_outputs_max(), 256);

Source

pub fn n_outputs_max(&self) -> u32

Get the maximum number of outputs per micro-batch.

Source

pub fn with_kv_unified(self, kv_unified: bool) -> Self

Use a unified KV buffer across input sequences.

Maps to llama_context_params.kv_unified. Disabling can improve throughput for batched decoding when sequences do not share a long prefix.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_kv_unified(false);
assert!(!params.kv_unified());

Source

pub fn kv_unified(&self) -> bool

Returns true when a unified KV buffer is enabled.

Source

pub fn with_swa_full(self, swa_full: bool) -> Self

Use a full-size sliding-window-attention (SWA) KV cache.

Maps to llama_context_params.swa_full. When false and n_seq_max > 1, llama.cpp may use a smaller per-sequence SWA window for better performance.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_swa_full(true);
assert!(params.swa_full());

Source

pub fn swa_full(&self) -> bool

Returns true when full SWA cache is enabled.

Source

pub fn with_op_offload(self, op_offload: bool) -> Self

Offload eligible host tensor operations to the active device.

Maps to llama_context_params.op_offload.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_op_offload(true);
assert!(params.op_offload());

Source

pub fn op_offload(&self) -> bool

Returns true when host tensor ops are offloaded to device.

Source

pub fn with_ctx_other(self, other: &LlamaContext<'_>) -> Self

Pair this context with another for shared memory or cross-context results.

Maps to llama_context_params.ctx_other. The paired context is returned by crate::context::LlamaContext::ctx_other after creation.

other must remain alive until crate::model::LlamaModel::new_context returns.

§Examples

let target = model.new_context(&backend, LlamaContextParams::default())?;
let draft = model.new_context(
    &backend,
    LlamaContextParams::default().with_ctx_other(&target),
)?;

Source

pub fn with_yarn_ext_factor(self, yarn_ext_factor: f32) -> Self

Set YaRN extrapolation mix factor.

Maps to llama_context_params.yarn_ext_factor. Negative values use the model default. Only meaningful when super::RopeScalingType::Yarn is active.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_ext_factor(1.0);
assert_eq!(params.yarn_ext_factor(), 1.0);

Source

pub fn yarn_ext_factor(&self) -> f32

Get YaRN extrapolation mix factor (yarn_ext_factor).

Source

pub fn with_yarn_attn_factor(self, yarn_attn_factor: f32) -> Self

Set YaRN magnitude scaling factor.

Maps to llama_context_params.yarn_attn_factor.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_attn_factor(1.0);
assert_eq!(params.yarn_attn_factor(), 1.0);

Source

pub fn yarn_attn_factor(&self) -> f32

Get YaRN magnitude scaling factor (yarn_attn_factor).

Source

pub fn with_yarn_beta_fast(self, yarn_beta_fast: f32) -> Self

Set YaRN low correction dimension (yarn_beta_fast).

Maps to llama_context_params.yarn_beta_fast.

Source

pub fn yarn_beta_fast(&self) -> f32

Get YaRN low correction dimension.

Source

pub fn with_yarn_beta_slow(self, yarn_beta_slow: f32) -> Self

Set YaRN high correction dimension (yarn_beta_slow).

Maps to llama_context_params.yarn_beta_slow.

Source

pub fn yarn_beta_slow(&self) -> f32

Get YaRN high correction dimension.

Source

pub fn with_yarn_orig_ctx(self, yarn_orig_ctx: u32) -> Self

Set YaRN original context size.

Maps to llama_context_params.yarn_orig_ctx. 0 uses the model default.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_orig_ctx(8192);
assert_eq!(params.yarn_orig_ctx(), 8192);

Source

pub fn yarn_orig_ctx(&self) -> u32

Get YaRN original context size (yarn_orig_ctx).

Source

pub fn with_no_perf(self, no_perf: bool) -> Self

Disable performance timing collection for this context.

Maps to llama_context_params.no_perf. When true, calls such as crate::context::LlamaContext::timings return empty counters.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_no_perf(true);
assert!(params.no_perf());

Source

pub fn no_perf(&self) -> bool

Returns true when perf timings are disabled for this context.

Source

pub fn with_abort_callback( self, callback: ggml_abort_callback, user_data: *mut c_void, ) -> Self

Register an abort callback checked during decode() on CPU backends.

Maps to llama_context_params.abort_callback / abort_callback_data. The callback is invoked periodically during long decodes; return true to stop the current operation.

user_data is passed through unchanged and must remain valid for the lifetime of any context created from these params.

Source

pub fn with_sampler_seq_configs( self, configs: impl IntoIterator<Item = (i32, LlamaSampler)>, ) -> Self

Assign per-sequence backend sampler chains.

Maps to llama_context_params.samplers / n_samplers. Each LlamaSampler must be a sampler chain created with llama_sampler_chain_init. The samplers are kept alive inside these params until crate::model::LlamaModel::new_context returns.

Pair sequence ids with the chains that should run when decoding those sequences on the backend.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::sampling::LlamaSampler;

let chain = LlamaSampler::chain_default(&model)?;
let params = LlamaContextParams::default()
    .with_sampler_seq_configs([(0, chain)]);
assert_eq!(params.n_sampler_seq_configs(), 1);

Source

pub fn n_sampler_seq_configs(&self) -> usize

Number of per-sequence sampler configs attached to these params.

Returns 0 when no chains were set or after Clone (sampler chains are not duplicated).

Source §

impl LlamaContextParams

Source

pub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self

Set the size of the context.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
let params = params.with_n_ctx(NonZeroU32::new(2048));
assert_eq!(params.n_ctx(), NonZeroU32::new(2048));

Source

pub fn n_ctx(&self) -> Option<NonZeroU32>

Get the size of the context.

None if the context size is specified by the model and not the context.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512));

Source

pub fn with_n_seq_max(self, n_seq_max: u32) -> Self

Set the maximum number of independent sequence states in the context.

This maps to llama.cpp’s llama_context_params.n_seq_max. Every sequence id used by batched decoding must be less than this value (valid ids are 0..n_seq_max).

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
    .with_n_seq_max(16);
assert_eq!(params.n_seq_max(), 16);

Source

pub fn n_seq_max(&self) -> u32

Get the configured maximum number of independent sequence states.

Source

pub fn with_n_batch(self, n_batch: u32) -> Self

Set n_batch, the logical maximum batch size that can be submitted to a single decode call.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
    .with_n_batch(2048);
assert_eq!(params.n_batch(), 2048);

Source

pub fn n_batch(&self) -> u32

Get n_batch, the logical maximum batch size that can be submitted to a single decode call.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_batch(), 2048);

Source

pub fn with_n_ubatch(self, n_ubatch: u32) -> Self

Set n_ubatch, the physical maximum batch size (micro-batch) processed at once.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
    .with_n_ubatch(512);
assert_eq!(params.n_ubatch(), 512);

Source

pub fn n_ubatch(&self) -> u32

Get n_ubatch, the physical maximum batch size (micro-batch) processed at once.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_ubatch(), 512);

Source

pub fn with_ctx_type(self, ctx_type: LlamaContextType) -> Self

Set the context type (e.g. LlamaContextType::Mtp for the draft context in crate::mtp::MtpSession).

Source

pub fn ctx_type(&self) -> LlamaContextType

Get the configured context type.

Source

pub fn with_n_rs_seq(self, n_rs_seq: u32) -> Self

Set the number of recurrent-state snapshots per sequence (MTP rollback).

Must be >= MtpSessionConfig::n_draft_max on the draft context. See crate::mtp.

Source

pub fn n_rs_seq(&self) -> u32

Get the number of recurrent-state snapshots per sequence used for MTP rollback.

Source

pub fn with_offload_kqv(self, enabled: bool) -> Self

Set the offload_kqv parameter, which controls whether the KV cache and KQV ops are offloaded to the GPU.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
    .with_offload_kqv(false);
assert_eq!(params.offload_kqv(), false);

Source

pub fn offload_kqv(&self) -> bool

Get the offload_kqv parameter (whether the KV cache and KQV ops are offloaded to the GPU).

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.offload_kqv(), true);

Source

pub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self

Set the type of rope scaling.

§Examples

use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
let params = LlamaContextParams::default()
    .with_rope_scaling_type(RopeScalingType::Linear);
assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);

Source

pub fn rope_scaling_type(&self) -> RopeScalingType

Get the type of rope scaling.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_scaling_type(), llama_cpp_4::context::params::RopeScalingType::Unspecified);

Source

pub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self

Set the rope frequency base.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
   .with_rope_freq_base(0.5);
assert_eq!(params.rope_freq_base(), 0.5);

Source

pub fn rope_freq_base(&self) -> f32

Get the rope frequency base.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_base(), 0.0);

Source

pub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self

Set the rope frequency scale.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
  .with_rope_freq_scale(0.5);
assert_eq!(params.rope_freq_scale(), 0.5);

Source

pub fn rope_freq_scale(&self) -> f32

Get the rope frequency scale.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_scale(), 0.0);

Source

pub fn n_threads(&self) -> i32

Get the number of threads.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads(), 4);

Source

pub fn n_threads_batch(&self) -> i32

Get the number of threads allocated for batches.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads_batch(), 4);

Source

pub fn with_n_threads(self, n_threads: i32) -> Self

Set the number of threads.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
   .with_n_threads(8);
assert_eq!(params.n_threads(), 8);

Source

pub fn with_n_threads_batch(self, n_threads: i32) -> Self

Set the number of threads allocated for batches.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
   .with_n_threads_batch(8);
assert_eq!(params.n_threads_batch(), 8);

Source

pub fn embeddings(&self) -> bool

Check whether embeddings are enabled.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.embeddings());

Source

pub fn with_embeddings(self, embedding: bool) -> Self

Enable or disable the use of embeddings.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
   .with_embeddings(true);
assert!(params.embeddings());

Source

pub fn with_cb_eval(self, cb_eval: ggml_backend_sched_eval_callback) -> Self

Set the evaluation callback.

§Examples

extern "C" fn cb_eval_fn(
    t: *mut llama_cpp_sys_4::ggml_tensor,
    ask: bool,
    user_data: *mut std::ffi::c_void,
) -> bool {
    false
}

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));

Source

pub fn with_cb_eval_user_data(self, cb_eval_user_data: *mut c_void) -> Self

Set the evaluation callback user data.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
let user_data = std::ptr::null_mut();
let params = params.with_cb_eval_user_data(user_data);

Source

pub fn with_tensor_capture(self, capture: &mut TensorCapture) -> Self

Attach a TensorCapture to intercept intermediate tensor outputs during crate::LlamaContext::decode.

Sets cb_eval to copy tensors matching the capture filter (layer outputs, named nodes, prefix, or all). After decode(), read results from the capture — see crate::TensorCapture and crate::context::tensor_capture.

The capture must outlive the context. Call TensorCapture::clear before reusing it on another batch.

§Example

use llama_cpp_4::prelude::*;

fn main() {
    let backend = LlamaBackend::init().unwrap();
    let model = LlamaModel::load_from_file(
        &backend,
        "model.gguf",
        &LlamaModelParams::default(),
    )
    .unwrap();

    let mut capture = TensorCapture::for_layers(&[13, 20, 27]);
    let ctx_params = LlamaContextParams::default().with_tensor_capture(&mut capture);
    let _ctx = model.new_context(&backend, ctx_params).unwrap();
}

Source

pub fn with_cache_type_k(self, ty: GgmlType) -> Self

Set the storage type for the K (key) KV cache tensors.

The default is GgmlType::F16. Quantized types like GgmlType::Q5_0 or GgmlType::Q4_0 reduce VRAM usage significantly; combining them with TurboQuant attention rotation (the default) keeps quality high.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
    .with_cache_type_k(GgmlType::Q5_0);

Source

pub fn cache_type_k(&self) -> ggml_type

Get the K-cache storage type.

Source

pub fn with_cache_type_v(self, ty: GgmlType) -> Self

Set the storage type for the V (value) KV cache tensors.

See with_cache_type_k for details.

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
    .with_cache_type_v(GgmlType::Q5_0);

Source

pub fn cache_type_v(&self) -> ggml_type

Get the V-cache storage type.

Source

pub fn with_attn_rot_disabled(self, disabled: bool) -> Self

Control the TurboQuant attention-rotation feature (llama.cpp PR #21038).

By default, llama.cpp applies a Hadamard rotation to Q/K/V tensors before writing them into the KV cache. This significantly improves quantized KV-cache quality at near-zero overhead, and is enabled automatically for models whose head dimension is a power of two.

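Set disabled = true to opt out (equivalent to LLAMA_ATTN_ROT_DISABLE=1). The environment variable is set just before the context is created and restored afterwards. Because this temporarily changes process-wide state, it is only safe when contexts are not being created concurrently from multiple threads.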

§Examples

use llama_cpp_4::context::params::LlamaContextParams;
// Disable rotation for this context only:
let params = LlamaContextParams::default().with_attn_rot_disabled(true);
assert!(params.attn_rot_disabled());

Source

pub fn attn_rot_disabled(&self) -> bool

Returns true if TurboQuant attention rotation is disabled for this context.

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.attn_rot_disabled());

Source

pub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self

Set the type of pooling.

§Examples

use llama_cpp_4::context::params::{LlamaContextParams, LlamaPoolingType};
let params = LlamaContextParams::default()
    .with_pooling_type(LlamaPoolingType::Last);
assert_eq!(params.pooling_type(), LlamaPoolingType::Last);

Source

pub fn pooling_type(&self) -> LlamaPoolingType

Get the type of pooling.

§Examples

let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.pooling_type(), llama_cpp_4::context::params::LlamaPoolingType::Unspecified);

Source

pub fn try_clone(&self) -> Result<Self, ParamsCloneError>

Clone these params, failing when sampler chains are attached.

Prefer this over Clone::clone when you need to detect dropped sampler configuration.

§Errors

Returns ParamsCloneError::SamplerChains when per-sequence sampler chains are attached and cannot be duplicated.

Trait Implementations§

Source §

impl Clone for LlamaContextParams

Duplicate context params for reuse.

Sampler chains attached via LlamaContextParams::with_sampler_seq_configs are not cloned — the copy clears samplers / n_samplers because the underlying C chains cannot be duplicated safely.

Source §

fn clone(&self) -> Self

Returns a duplicate of the value. Read more

1.0.0 (const: unstable) · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for LlamaContextParams

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for LlamaContextParams

Default parameters for LlamaContext, as defined in llama.cpp by llama_context_default_params.

use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
let params = LlamaContextParams::default();
assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);

Source §