pub struct LlamaContextParams { /* private fields */ }
Builder for llama_context_params.
Construct with Default::default(), chain with_* setters, then pass the
value to crate::model::LlamaModel::new_context. Getter methods mirror
the fields that exist on the underlying C struct.
§Sampler ownership
Self::with_sampler_seq_configs stores owned LlamaSampler chains inside
this struct until the context is created. Clone clears sampler configs
because the underlying chains cannot be duplicated safely.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let ctx_params = LlamaContextParams::default()
.with_n_ctx(NonZeroU32::new(2048));
assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));Implementations§
Source§impl LlamaContextParams
impl LlamaContextParams
Sourcepub fn with_flash_attn_type(self, flash_attn_type: LlamaFlashAttnType) -> Self
pub fn with_flash_attn_type(self, flash_attn_type: LlamaFlashAttnType) -> Self
Set the flash-attention mode (Auto, Enabled, or Disabled).
Maps to llama_context_params.flash_attn_type. Use
LlamaFlashAttnType::Auto to match llama.cpp defaults.
§Examples
use llama_cpp_4::context::params::{LlamaContextParams, LlamaFlashAttnType};
let params = LlamaContextParams::default()
.with_flash_attn_type(LlamaFlashAttnType::Auto);
assert_eq!(params.flash_attn_type(), LlamaFlashAttnType::Auto);Sourcepub fn flash_attn_type(&self) -> LlamaFlashAttnType
pub fn flash_attn_type(&self) -> LlamaFlashAttnType
Get the configured flash-attention mode.
Sourcepub fn with_attention_type(self, attention_type: LlamaAttentionType) -> Self
pub fn with_attention_type(self, attention_type: LlamaAttentionType) -> Self
Set the attention type used when extracting embeddings.
Maps to llama_context_params.attention_type. Embedding models often
need LlamaAttentionType::NonCausal; generative decoding uses
LlamaAttentionType::Causal.
§Examples
use llama_cpp_4::context::params::{LlamaAttentionType, LlamaContextParams};
let params = LlamaContextParams::default()
.with_attention_type(LlamaAttentionType::Causal);
assert_eq!(params.attention_type(), LlamaAttentionType::Causal);Sourcepub fn attention_type(&self) -> LlamaAttentionType
pub fn attention_type(&self) -> LlamaAttentionType
Get the attention type used when extracting embeddings.
Sourcepub fn with_n_outputs_max(self, n_outputs_max: u32) -> Self
pub fn with_n_outputs_max(self, n_outputs_max: u32) -> Self
Set the maximum number of outputs per micro-batch.
Maps to llama_context_params.n_outputs_max. When 0, llama.cpp uses
n_batch as the cap.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_n_outputs_max(256);
assert_eq!(params.n_outputs_max(), 256);Sourcepub fn n_outputs_max(&self) -> u32
pub fn n_outputs_max(&self) -> u32
Get the maximum number of outputs per micro-batch.
Sourcepub fn with_kv_unified(self, kv_unified: bool) -> Self
pub fn with_kv_unified(self, kv_unified: bool) -> Self
Use a unified KV buffer across input sequences.
Maps to llama_context_params.kv_unified. Disabling can improve
throughput for batched decoding when sequences do not share a long prefix.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_kv_unified(false);
assert!(!params.kv_unified());Sourcepub fn kv_unified(&self) -> bool
pub fn kv_unified(&self) -> bool
Returns true when a unified KV buffer is enabled.
Sourcepub fn with_swa_full(self, swa_full: bool) -> Self
pub fn with_swa_full(self, swa_full: bool) -> Self
Use a full-size sliding-window-attention (SWA) KV cache.
Maps to llama_context_params.swa_full. When false and n_seq_max > 1,
llama.cpp may use a smaller per-sequence SWA window for better performance.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_swa_full(true);
assert!(params.swa_full());Sourcepub fn with_op_offload(self, op_offload: bool) -> Self
pub fn with_op_offload(self, op_offload: bool) -> Self
Offload eligible host tensor operations to the active device.
Maps to llama_context_params.op_offload.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_op_offload(true);
assert!(params.op_offload());Sourcepub fn op_offload(&self) -> bool
pub fn op_offload(&self) -> bool
Returns true when host tensor ops are offloaded to device.
Sourcepub fn with_ctx_other(self, other: &LlamaContext<'_>) -> Self
pub fn with_ctx_other(self, other: &LlamaContext<'_>) -> Self
Pair this context with another for shared memory or cross-context results.
Maps to llama_context_params.ctx_other. The paired context is returned
by crate::context::LlamaContext::ctx_other after creation.
The `other` context must remain alive until
crate::model::LlamaModel::new_context returns.
§Examples
let target = model.new_context(&backend, LlamaContextParams::default())?;
let draft = model.new_context(
&backend,
LlamaContextParams::default().with_ctx_other(&target),
)?;Sourcepub fn with_yarn_ext_factor(self, yarn_ext_factor: f32) -> Self
pub fn with_yarn_ext_factor(self, yarn_ext_factor: f32) -> Self
Set YaRN extrapolation mix factor.
Maps to llama_context_params.yarn_ext_factor. Negative values use the
model default. Only meaningful when super::RopeScalingType::Yarn is active.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_ext_factor(1.0);
assert_eq!(params.yarn_ext_factor(), 1.0);Sourcepub fn yarn_ext_factor(&self) -> f32
pub fn yarn_ext_factor(&self) -> f32
Get YaRN extrapolation mix factor (yarn_ext_factor).
Sourcepub fn with_yarn_attn_factor(self, yarn_attn_factor: f32) -> Self
pub fn with_yarn_attn_factor(self, yarn_attn_factor: f32) -> Self
Set YaRN magnitude scaling factor.
Maps to llama_context_params.yarn_attn_factor.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_attn_factor(1.0);
assert_eq!(params.yarn_attn_factor(), 1.0);Sourcepub fn yarn_attn_factor(&self) -> f32
pub fn yarn_attn_factor(&self) -> f32
Get YaRN magnitude scaling factor (yarn_attn_factor).
Sourcepub fn with_yarn_beta_fast(self, yarn_beta_fast: f32) -> Self
pub fn with_yarn_beta_fast(self, yarn_beta_fast: f32) -> Self
Set YaRN low correction dimension (yarn_beta_fast).
Maps to llama_context_params.yarn_beta_fast.
Sourcepub fn yarn_beta_fast(&self) -> f32
pub fn yarn_beta_fast(&self) -> f32
Get YaRN low correction dimension.
Sourcepub fn with_yarn_beta_slow(self, yarn_beta_slow: f32) -> Self
pub fn with_yarn_beta_slow(self, yarn_beta_slow: f32) -> Self
Set YaRN high correction dimension (yarn_beta_slow).
Maps to llama_context_params.yarn_beta_slow.
Sourcepub fn yarn_beta_slow(&self) -> f32
pub fn yarn_beta_slow(&self) -> f32
Get YaRN high correction dimension.
Sourcepub fn with_yarn_orig_ctx(self, yarn_orig_ctx: u32) -> Self
pub fn with_yarn_orig_ctx(self, yarn_orig_ctx: u32) -> Self
Set YaRN original context size.
Maps to llama_context_params.yarn_orig_ctx. 0 uses the model default.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_yarn_orig_ctx(8192);
assert_eq!(params.yarn_orig_ctx(), 8192);Sourcepub fn yarn_orig_ctx(&self) -> u32
pub fn yarn_orig_ctx(&self) -> u32
Get YaRN original context size (yarn_orig_ctx).
Sourcepub fn with_no_perf(self, no_perf: bool) -> Self
pub fn with_no_perf(self, no_perf: bool) -> Self
Disable performance timing collection for this context.
Maps to llama_context_params.no_perf. When true, calls such as
crate::context::LlamaContext::timings return empty counters.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_no_perf(true);
assert!(params.no_perf());Sourcepub fn with_abort_callback(
self,
callback: ggml_abort_callback,
user_data: *mut c_void,
) -> Self
pub fn with_abort_callback( self, callback: ggml_abort_callback, user_data: *mut c_void, ) -> Self
Register an abort callback checked during decode() on CPU backends.
Maps to llama_context_params.abort_callback / abort_callback_data.
The callback is invoked periodically during long decodes; return a
non-zero value to stop the current operation.
The `user_data` pointer is passed through unchanged and must remain valid for
the lifetime of any context created from these params.
Sourcepub fn with_sampler_seq_configs(
self,
configs: impl IntoIterator<Item = (i32, LlamaSampler)>,
) -> Self
pub fn with_sampler_seq_configs( self, configs: impl IntoIterator<Item = (i32, LlamaSampler)>, ) -> Self
Assign per-sequence backend sampler chains.
Maps to llama_context_params.samplers / n_samplers. Each
LlamaSampler must be a sampler chain created with
llama_sampler_chain_init. The samplers are kept alive inside these
params until crate::model::LlamaModel::new_context returns.
Pair sequence ids with the chains that should run when decoding those sequences on the backend.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::sampling::LlamaSampler;
let chain = LlamaSampler::chain_default(&model)?;
let params = LlamaContextParams::default()
.with_sampler_seq_configs([(0, chain)]);
assert_eq!(params.n_sampler_seq_configs(), 1);Sourcepub fn n_sampler_seq_configs(&self) -> usize
pub fn n_sampler_seq_configs(&self) -> usize
Number of per-sequence sampler configs attached to these params.
Returns 0 when no chains were set or after Clone (sampler chains
are not duplicated).
Source§impl LlamaContextParams
impl LlamaContextParams
Sourcepub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self
pub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self
Set the size of the context.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
let params = params.with_n_ctx(NonZeroU32::new(2048));
assert_eq!(params.n_ctx(), NonZeroU32::new(2048));Sourcepub fn n_ctx(&self) -> Option<NonZeroU32>
pub fn n_ctx(&self) -> Option<NonZeroU32>
Get the configured size of the context.
Sourcepub fn with_n_seq_max(self, n_seq_max: u32) -> Self
pub fn with_n_seq_max(self, n_seq_max: u32) -> Self
Set the maximum number of independent sequence states in the context.
This maps to llama.cpp’s llama_context_params.n_seq_max and must be greater
than the highest sequence id used by batched decoding (sequence ids are zero-based).
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_seq_max(16);
assert_eq!(params.n_seq_max(), 16);Sourcepub fn n_seq_max(&self) -> u32
pub fn n_seq_max(&self) -> u32
Get the configured maximum number of independent sequence states.
Sourcepub fn with_n_batch(self, n_batch: u32) -> Self
pub fn with_n_batch(self, n_batch: u32) -> Self
Set the logical maximum batch size (n_batch) that can be submitted in a single decode call.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_batch(2048);
assert_eq!(params.n_batch(), 2048);Sourcepub fn n_batch(&self) -> u32
pub fn n_batch(&self) -> u32
Get the logical maximum batch size (n_batch).
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_batch(), 2048);Sourcepub fn with_n_ubatch(self, n_ubatch: u32) -> Self
pub fn with_n_ubatch(self, n_ubatch: u32) -> Self
Set the physical maximum micro-batch size (n_ubatch).
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_ubatch(512);
assert_eq!(params.n_ubatch(), 512);Sourcepub fn n_ubatch(&self) -> u32
pub fn n_ubatch(&self) -> u32
Get the physical maximum micro-batch size (n_ubatch).
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.n_ubatch(), 512);Sourcepub fn with_ctx_type(self, ctx_type: LlamaContextType) -> Self
pub fn with_ctx_type(self, ctx_type: LlamaContextType) -> Self
Set the context type (e.g. LlamaContextType::Mtp for the draft context in
crate::mtp::MtpSession).
Sourcepub fn ctx_type(&self) -> LlamaContextType
pub fn ctx_type(&self) -> LlamaContextType
Get the configured context type.
Sourcepub fn with_n_rs_seq(self, n_rs_seq: u32) -> Self
pub fn with_n_rs_seq(self, n_rs_seq: u32) -> Self
Set the number of recurrent-state snapshots per sequence (MTP rollback).
Must be >= MtpSessionConfig::n_draft_max
on the draft context. See crate::mtp.
Sourcepub fn n_rs_seq(&self) -> u32
pub fn n_rs_seq(&self) -> u32
Get the number of recurrent-state snapshots per sequence used for MTP rollback.
Sourcepub fn with_offload_kqv(self, enabled: bool) -> Self
pub fn with_offload_kqv(self, enabled: bool) -> Self
Set the offload_kqv parameter to control offloading KV cache & KQV ops to GPU
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_offload_kqv(false);
assert_eq!(params.offload_kqv(), false);Sourcepub fn offload_kqv(&self) -> bool
pub fn offload_kqv(&self) -> bool
Get the offload_kqv parameter
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
assert_eq!(params.offload_kqv(), true);Sourcepub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self
pub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self
Set the type of rope scaling.
§Examples
use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
let params = LlamaContextParams::default()
.with_rope_scaling_type(RopeScalingType::Linear);
assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);Sourcepub fn rope_scaling_type(&self) -> RopeScalingType
pub fn rope_scaling_type(&self) -> RopeScalingType
Get the type of rope scaling.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_scaling_type(), llama_cpp_4::context::params::RopeScalingType::Unspecified);Sourcepub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self
pub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self
Set the rope frequency base.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_rope_freq_base(0.5);
assert_eq!(params.rope_freq_base(), 0.5);Sourcepub fn rope_freq_base(&self) -> f32
pub fn rope_freq_base(&self) -> f32
Get the rope frequency base.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_base(), 0.0);Sourcepub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self
pub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self
Set the rope frequency scale.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_rope_freq_scale(0.5);
assert_eq!(params.rope_freq_scale(), 0.5);Sourcepub fn rope_freq_scale(&self) -> f32
pub fn rope_freq_scale(&self) -> f32
Get the rope frequency scale.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.rope_freq_scale(), 0.0);Sourcepub fn n_threads(&self) -> i32
pub fn n_threads(&self) -> i32
Get the number of threads.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads(), 4);Sourcepub fn n_threads_batch(&self) -> i32
pub fn n_threads_batch(&self) -> i32
Get the number of threads allocated for batches.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.n_threads_batch(), 4);Sourcepub fn with_n_threads(self, n_threads: i32) -> Self
pub fn with_n_threads(self, n_threads: i32) -> Self
Set the number of threads.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_threads(8);
assert_eq!(params.n_threads(), 8);Sourcepub fn with_n_threads_batch(self, n_threads: i32) -> Self
pub fn with_n_threads_batch(self, n_threads: i32) -> Self
Set the number of threads allocated for batches.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_n_threads_batch(8);
assert_eq!(params.n_threads_batch(), 8);Sourcepub fn embeddings(&self) -> bool
pub fn embeddings(&self) -> bool
Check whether embeddings are enabled
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.embeddings());Sourcepub fn with_embeddings(self, embedding: bool) -> Self
pub fn with_embeddings(self, embedding: bool) -> Self
Enable or disable the extraction of embeddings.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default()
.with_embeddings(true);
assert!(params.embeddings());Sourcepub fn with_cb_eval(self, cb_eval: ggml_backend_sched_eval_callback) -> Self
pub fn with_cb_eval(self, cb_eval: ggml_backend_sched_eval_callback) -> Self
Set the evaluation callback.
§Examples
extern "C" fn cb_eval_fn(
t: *mut llama_cpp_sys_4::ggml_tensor,
ask: bool,
user_data: *mut std::ffi::c_void,
) -> bool {
false
}
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));Sourcepub fn with_cb_eval_user_data(self, cb_eval_user_data: *mut c_void) -> Self
pub fn with_cb_eval_user_data(self, cb_eval_user_data: *mut c_void) -> Self
Set the evaluation callback user data.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
let params = LlamaContextParams::default();
let user_data = std::ptr::null_mut();
let params = params.with_cb_eval_user_data(user_data);Sourcepub fn with_tensor_capture(self, capture: &mut TensorCapture) -> Self
pub fn with_tensor_capture(self, capture: &mut TensorCapture) -> Self
Attach a TensorCapture to
intercept intermediate tensor outputs during crate::LlamaContext::decode.
Sets cb_eval to copy tensors matching the capture filter (layer outputs,
named nodes, prefix, or all). After decode(), read results from the
capture — see crate::TensorCapture and crate::context::tensor_capture.
The capture must outlive the context. Call TensorCapture::clear before
reusing it on another batch.
§Examples
use llama_cpp_4::prelude::*;
fn main() {
let backend = LlamaBackend::init().unwrap();
let model = LlamaModel::load_from_file(
&backend,
"model.gguf",
&LlamaModelParams::default(),
)
.unwrap();
let mut capture = TensorCapture::for_layers(&[13, 20, 27]);
let ctx_params = LlamaContextParams::default().with_tensor_capture(&mut capture);
let _ctx = model.new_context(&backend, ctx_params).unwrap();
}Sourcepub fn with_cache_type_k(self, ty: GgmlType) -> Self
pub fn with_cache_type_k(self, ty: GgmlType) -> Self
Set the storage type for the K (key) KV cache tensors.
The default is GgmlType::F16. Quantized types like GgmlType::Q5_0
or GgmlType::Q4_0 reduce VRAM usage significantly; combining them with
TurboQuant attention rotation (the default) keeps quality high.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
.with_cache_type_k(GgmlType::Q5_0);Sourcepub fn cache_type_k(&self) -> ggml_type
pub fn cache_type_k(&self) -> ggml_type
Get the K-cache storage type.
Sourcepub fn with_cache_type_v(self, ty: GgmlType) -> Self
pub fn with_cache_type_v(self, ty: GgmlType) -> Self
Set the storage type for the V (value) KV cache tensors.
See with_cache_type_k for details.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
use llama_cpp_4::quantize::GgmlType;
let params = LlamaContextParams::default()
.with_cache_type_v(GgmlType::Q5_0);Sourcepub fn cache_type_v(&self) -> ggml_type
pub fn cache_type_v(&self) -> ggml_type
Get the V-cache storage type.
Sourcepub fn with_attn_rot_disabled(self, disabled: bool) -> Self
pub fn with_attn_rot_disabled(self, disabled: bool) -> Self
Control the TurboQuant attention-rotation feature (llama.cpp PR #21038).
By default, llama.cpp applies a Hadamard rotation to Q/K/V tensors before writing them into the KV cache. This significantly improves quantized KV-cache quality at near-zero overhead, and is enabled automatically for models whose head dimension is a power of two.
Set disabled = true to opt out (equivalent to LLAMA_ATTN_ROT_DISABLE=1).
The env-var is applied just before the context is created and restored
afterwards. Because environment variables are process-wide, this is only safe
when contexts are created from a single thread at a time.
§Examples
use llama_cpp_4::context::params::LlamaContextParams;
// Disable rotation for this context only:
let params = LlamaContextParams::default().with_attn_rot_disabled(true);
assert!(params.attn_rot_disabled());Sourcepub fn attn_rot_disabled(&self) -> bool
pub fn attn_rot_disabled(&self) -> bool
Returns true if TurboQuant attention rotation is disabled for this context.
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert!(!params.attn_rot_disabled());Sourcepub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self
pub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self
Set the type of pooling.
§Examples
use llama_cpp_4::context::params::{LlamaContextParams, LlamaPoolingType};
let params = LlamaContextParams::default()
.with_pooling_type(LlamaPoolingType::Last);
assert_eq!(params.pooling_type(), LlamaPoolingType::Last);Sourcepub fn pooling_type(&self) -> LlamaPoolingType
pub fn pooling_type(&self) -> LlamaPoolingType
Get the type of pooling.
§Examples
let params = llama_cpp_4::context::params::LlamaContextParams::default();
assert_eq!(params.pooling_type(), llama_cpp_4::context::params::LlamaPoolingType::Unspecified);Sourcepub fn try_clone(&self) -> Result<Self, ParamsCloneError>
pub fn try_clone(&self) -> Result<Self, ParamsCloneError>
Clone these params, failing when sampler chains are attached.
Prefer this over Clone::clone when you need to detect dropped sampler
configuration.
§Errors
Returns ParamsCloneError::SamplerChains when per-sequence sampler
chains are attached and cannot be duplicated.
Trait Implementations§
Source§impl Clone for LlamaContextParams
Duplicate context params for reuse.
impl Clone for LlamaContextParams
Duplicate context params for reuse.
Sampler chains attached via LlamaContextParams::with_sampler_seq_configs
are not cloned — the copy clears samplers / n_samplers because the
underlying C chains cannot be duplicated safely.
Source§impl Debug for LlamaContextParams
impl Debug for LlamaContextParams
Source§impl Default for LlamaContextParams
Default parameters for LlamaContext. (as defined in llama.cpp by llama_context_default_params)
impl Default for LlamaContextParams
Default parameters for LlamaContext. (as defined in llama.cpp by llama_context_default_params)
use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
let params = LlamaContextParams::default();
assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);
impl Send for LlamaContextParams
SAFETY: we do not currently allow setting or reading the pointers that would prevent this type from being automatically `Send` or `Sync`.