llama_cpp_4/
common.rs

1//! Exposes a small subset of llama.cpp `common/` helpers and parameter structs.
2//!
3//! ## Upstream `common_init_from_params`
4//!
5//! llama.cpp's [`common_init_from_params`](https://github.com/ggml-org/llama.cpp/blob/master/common/common.h)
6//! loads a model and context (and samplers) from a parsed CLI-style
7//! [`common_params`](https://github.com/ggml-org/llama.cpp/blob/master/common/common.h).
8//! Its second argument, `model_only`, skips context creation when `true` (used
9//! by tests that construct contexts manually).
10//!
11//! This crate does not wrap the full C++ `common_params` tree. The Rust
12//! equivalent of `model_only = true` is [`crate::model::LlamaModel::load_from_file`]
13//! followed by [`crate::model::LlamaModel::new_context`] when you need inference.
14pub use llama_cpp_sys_4::common::*;
15
/// Common generation and context parameters, modelled on a subset of the
/// fields of llama.cpp's `common_params`.
///
/// Use [`Default::default`] for a baseline configuration and override
/// individual fields with struct-update syntax. The type is a plain data
/// holder: it can be cloned, printed with `{:?}` and compared with `==`.
///
/// ## See more
/// <https://github.com/ggerganov/llama.cpp/blob/master/common/common.h#L109>
#[derive(Debug, Clone, PartialEq)]
pub struct CommonParams {
    /// Number of new tokens to predict.
    pub n_predict: i32,

    /// Context size.
    pub n_ctx: i32,

    /// Logical batch size for prompt processing (must be >=32 to use BLAS).
    pub n_batch: i32,

    /// Physical batch size for prompt processing (must be >=32 to use BLAS).
    pub n_ubatch: i32,

    /// Number of tokens to keep from the initial prompt.
    pub n_keep: i32,

    /// Maximum number of chunks to process (-1 = unlimited).
    pub n_chunks: i32,

    /// Number of parallel sequences to decode.
    pub n_parallel: i32,

    /// Number of sequences to decode.
    pub n_sequences: i32,

    /// Group-attention factor.
    pub grp_attn_n: i32,

    /// Group-attention width.
    pub grp_attn_w: i32,

    /// Print the token count every n tokens (-1 = disabled).
    pub n_print: i32,

    /// `RoPE` base frequency.
    pub rope_freq_base: f32,

    /// `RoPE` frequency scaling factor.
    pub rope_freq_scale: f32,

    /// `YaRN` extrapolation mix factor.
    pub yarn_ext_factor: f32,

    /// `YaRN` magnitude scaling factor.
    pub yarn_attn_factor: f32,

    /// `YaRN` low correction dim.
    pub yarn_beta_fast: f32,

    /// `YaRN` high correction dim.
    pub yarn_beta_slow: f32,

    /// `YaRN` original context length.
    pub yarn_orig_ctx: i32,

    /// KV cache defragmentation threshold.
    pub defrag_thold: f32,

    /// Prompt for the model to consume.
    pub prompt: String,
}
81
82impl Default for CommonParams {
83    fn default() -> Self {
84        CommonParams {
85            n_predict: -1,
86            n_ctx: 4096,
87            n_batch: 2048,
88            n_ubatch: 512,
89            n_keep: 0,
90            n_chunks: -1,
91            n_parallel: 1,
92            n_sequences: 1,
93            grp_attn_n: 1,
94            grp_attn_w: 512,
95            n_print: -1,
96            rope_freq_base: 0.0,
97            rope_freq_scale: 0.0,
98            yarn_ext_factor: -1.0,
99            yarn_attn_factor: 1.0,
100            yarn_beta_fast: 32.0,
101            yarn_beta_slow: 1.0,
102            yarn_orig_ctx: 0,
103            defrag_thold: 0.1,
104            prompt: String::new(),
105        }
106    }
107}
llama_cpp_4/common.rs

llama_cpp_4/
common.rs