Struct LlamaContextParams

Source

pub struct LlamaContextParams { /* private fields */ }

Expand description

A safe wrapper around llama_context_params.

Generally this should be created with Default::default() and then modified with with_* methods.

§Examples


let ctx_params = LlamaContextParams::default()
    .with_n_ctx(NonZeroU32::new(2048));

assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));

Implementations§

Source §

impl LlamaContextParams

Source

pub fn with_n_ctx(self, n_ctx: Option<NonZeroU32>) -> Self

Set the size of the context

§Examples

let params = LlamaContextParams::default();
let params = params.with_n_ctx(NonZeroU32::new(2048));
assert_eq!(params.n_ctx(), NonZeroU32::new(2048));

Source

pub fn n_ctx(&self) -> Option<NonZeroU32>

Get the size of the context.

None if the context size is specified by the model and not the context.

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512));

Source

pub fn with_n_batch(self, n_batch: u32) -> Self

Set the n_batch

§Examples

let params = LlamaContextParams::default()
    .with_n_batch(2048);
assert_eq!(params.n_batch(), 2048);

Source

pub fn n_batch(&self) -> u32

Get the n_batch

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_batch(), 2048);

Source

pub fn with_n_ubatch(self, n_ubatch: u32) -> Self

Set the n_ubatch

§Examples

let params = LlamaContextParams::default()
    .with_n_ubatch(512);
assert_eq!(params.n_ubatch(), 512);

Source

pub fn n_ubatch(&self) -> u32

Get the n_ubatch

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_ubatch(), 512);

Source

pub fn with_n_seq_max(self, n_seq_max: u32) -> Self

Set the max number of sequences (i.e. distinct states for recurrent models)

§Examples

let params = LlamaContextParams::default()
    .with_n_seq_max(64);
assert_eq!(params.n_seq_max(), 64);

Source

pub fn n_seq_max(&self) -> u32

Get the max number of sequences (i.e. distinct states for recurrent models)

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_seq_max(), 1);

Source

pub fn with_n_threads(self, n_threads: i32) -> Self

Set the number of threads

§Examples

let params = LlamaContextParams::default()
   .with_n_threads(8);
assert_eq!(params.n_threads(), 8);

Source

pub fn n_threads(&self) -> i32

Get the number of threads

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_threads(), 4);

Source

pub fn with_n_threads_batch(self, n_threads: i32) -> Self

Set the number of threads allocated for batches

§Examples

let params = LlamaContextParams::default()
   .with_n_threads_batch(8);
assert_eq!(params.n_threads_batch(), 8);

Source

pub fn n_threads_batch(&self) -> i32

Get the number of threads allocated for batches

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.n_threads_batch(), 4);

Source

pub fn with_rope_scaling_type(self, rope_scaling_type: RopeScalingType) -> Self

Set the type of rope scaling

§Examples

let params = LlamaContextParams::default()
    .with_rope_scaling_type(RopeScalingType::Linear);
assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);

Source

pub fn rope_scaling_type(&self) -> RopeScalingType

Get the type of rope scaling

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);

Source

pub fn with_pooling_type(self, pooling_type: LlamaPoolingType) -> Self

Set the type of pooling

§Examples

let params = LlamaContextParams::default()
    .with_pooling_type(LlamaPoolingType::Last);
assert_eq!(params.pooling_type(), LlamaPoolingType::Last);

Source

pub fn pooling_type(&self) -> LlamaPoolingType

Get the type of pooling

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.pooling_type(), LlamaPoolingType::Unspecified);

Source

pub fn with_attention_type(self, attention_type: LlamaAttentionType) -> Self

Set the attention type for embeddings

§Examples

let params = LlamaContextParams::default()
    .with_attention_type(LlamaAttentionType::Causal);
assert_eq!(params.attention_type(), LlamaAttentionType::Causal);

Source

pub fn attention_type(&self) -> LlamaAttentionType

Get the attention type for embeddings

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.attention_type(), LlamaAttentionType::Unspecified);

Source

pub fn with_flash_attention_policy(self, policy: llama_flash_attn_type) -> Self

Set the flash attention policy using llama.cpp enum

Source

pub fn flash_attention_policy(&self) -> llama_flash_attn_type

Get the flash attention policy

Source

pub fn with_rope_freq_base(self, rope_freq_base: f32) -> Self

Set the rope frequency base

§Examples

let params = LlamaContextParams::default()
   .with_rope_freq_base(0.5);
assert_eq!(params.rope_freq_base(), 0.5);

Source

pub fn rope_freq_base(&self) -> f32

Get the rope frequency base

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.rope_freq_base(), 0.0);

Source

pub fn with_rope_freq_scale(self, rope_freq_scale: f32) -> Self

Set the rope frequency scale

§Examples

let params = LlamaContextParams::default()
  .with_rope_freq_scale(0.5);
assert_eq!(params.rope_freq_scale(), 0.5);

Source

pub fn rope_freq_scale(&self) -> f32

Get the rope frequency scale

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.rope_freq_scale(), 0.0);

Source

pub fn with_yarn_ext_factor(self, yarn_ext_factor: f32) -> Self

Set the YaRN extrapolation mix factor

§Examples

let params = LlamaContextParams::default().with_yarn_ext_factor(1.0);
assert_eq!(params.yarn_ext_factor(), 1.0);

Source

pub fn yarn_ext_factor(&self) -> f32

Get the YaRN extrapolation mix factor

Source

pub fn with_yarn_attn_factor(self, yarn_attn_factor: f32) -> Self

Set the YaRN magnitude scaling factor

§Examples

let params = LlamaContextParams::default().with_yarn_attn_factor(2.0);
assert_eq!(params.yarn_attn_factor(), 2.0);

Source

pub fn yarn_attn_factor(&self) -> f32

Get the YaRN magnitude scaling factor

Source

pub fn with_yarn_beta_fast(self, yarn_beta_fast: f32) -> Self

Set the YaRN low correction dim

§Examples

let params = LlamaContextParams::default().with_yarn_beta_fast(16.0);
assert_eq!(params.yarn_beta_fast(), 16.0);

Source

pub fn yarn_beta_fast(&self) -> f32

Get the YaRN low correction dim

Source

pub fn with_yarn_beta_slow(self, yarn_beta_slow: f32) -> Self

Set the YaRN high correction dim

§Examples

let params = LlamaContextParams::default().with_yarn_beta_slow(2.0);
assert_eq!(params.yarn_beta_slow(), 2.0);

Source

pub fn yarn_beta_slow(&self) -> f32

Get the YaRN high correction dim

Source

pub fn with_yarn_orig_ctx(self, yarn_orig_ctx: u32) -> Self

Set the YaRN original context size

§Examples

let params = LlamaContextParams::default().with_yarn_orig_ctx(4096);
assert_eq!(params.yarn_orig_ctx(), 4096);

Source

pub fn yarn_orig_ctx(&self) -> u32

Get the YaRN original context size

Source

pub fn with_defrag_thold(self, defrag_thold: f32) -> Self

Set the KV cache defragmentation threshold

§Examples

let params = LlamaContextParams::default().with_defrag_thold(0.1);
assert_eq!(params.defrag_thold(), 0.1);

Source

pub fn defrag_thold(&self) -> f32

Get the KV cache defragmentation threshold

Source

pub fn with_type_k(self, type_k: KvCacheType) -> Self

Set the KV cache data type for K

§Examples

let params = LlamaContextParams::default().with_type_k(KvCacheType::Q4_0);
assert_eq!(params.type_k(), KvCacheType::Q4_0);

Source

pub fn type_k(&self) -> KvCacheType

Get the KV cache data type for K

§Examples

let params = LlamaContextParams::default();
let _ = params.type_k();

Source

pub fn with_type_v(self, type_v: KvCacheType) -> Self

Set the KV cache data type for V

§Examples

let params = LlamaContextParams::default().with_type_v(KvCacheType::Q4_1);
assert_eq!(params.type_v(), KvCacheType::Q4_1);

Source

pub fn type_v(&self) -> KvCacheType

Get the KV cache data type for V

§Examples

let params = LlamaContextParams::default();
let _ = params.type_v();

Source

pub fn with_embeddings(self, embedding: bool) -> Self

Set whether embeddings are enabled

§Examples

let params = LlamaContextParams::default()
   .with_embeddings(true);
assert!(params.embeddings());

Source

pub fn embeddings(&self) -> bool

Get whether embeddings are enabled

§Examples

let params = LlamaContextParams::default();
assert!(!params.embeddings());

Source

pub fn with_offload_kqv(self, enabled: bool) -> Self

Set whether to offload KQV ops to GPU

§Examples

let params = LlamaContextParams::default()
    .with_offload_kqv(false);
assert_eq!(params.offload_kqv(), false);

Source

pub fn offload_kqv(&self) -> bool

Get whether KQV ops are offloaded to GPU

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.offload_kqv(), true);

Source

pub fn with_no_perf(self, no_perf: bool) -> Self

Set whether to disable performance timings

§Examples

let params = LlamaContextParams::default().with_no_perf(true);
assert!(params.no_perf());

Source

pub fn no_perf(&self) -> bool

Get whether performance timings are disabled

Source

pub fn with_op_offload(self, op_offload: bool) -> Self

Set whether to offload ops to GPU

§Examples

let params = LlamaContextParams::default().with_op_offload(false);
assert_eq!(params.op_offload(), false);

Source

pub fn op_offload(&self) -> bool

Get whether ops are offloaded to GPU

Source

pub fn with_swa_full(self, enabled: bool) -> Self

Set whether to use full sliding window attention

§Examples

let params = LlamaContextParams::default()
    .with_swa_full(false);
assert_eq!(params.swa_full(), false);

Source

pub fn swa_full(&self) -> bool

Get whether full sliding window attention is enabled

§Examples

let params = LlamaContextParams::default();
assert_eq!(params.swa_full(), true);

Source

pub fn with_kv_unified(self, kv_unified: bool) -> Self

Set whether to use a unified KV cache buffer across input sequences

§Examples

let params = LlamaContextParams::default().with_kv_unified(true);
assert!(params.kv_unified());

Source

pub fn kv_unified(&self) -> bool

Get whether a unified KV cache buffer is used across input sequences

§Examples

let params = LlamaContextParams::default();
let _ = params.kv_unified();

Trait Implementations§

Source §

impl Clone for LlamaContextParams

Source §

fn clone(&self) -> LlamaContextParams

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for LlamaContextParams

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for LlamaContextParams

Default parameters for LlamaContext. (as defined in llama.cpp by llama_context_default_params)

let params = LlamaContextParams::default();
assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);

Source §