llama_cpp_2/model/params.rs

//! A safe wrapper around `llama_model_params`.

use crate::model::params::kv_overrides::KvOverrides;
use std::ffi::{c_char, CStr};
use std::fmt::{Debug, Formatter};
use std::pin::Pin;
use std::ptr::null;

pub mod kv_overrides;

/// A safe wrapper around `llama_model_params`.
#[allow(clippy::module_name_repetitions)]
pub struct LlamaModelParams {
    pub(crate) params: llama_cpp_sys_2::llama_model_params,
    kv_overrides: Vec<llama_cpp_sys_2::llama_model_kv_override>,
    buft_overrides: Vec<llama_cpp_sys_2::llama_model_tensor_buft_override>,
}

impl Debug for LlamaModelParams {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LlamaModelParams")
            .field("n_gpu_layers", &self.params.n_gpu_layers)
            .field("main_gpu", &self.params.main_gpu)
            .field("vocab_only", &self.params.vocab_only)
            .field("use_mmap", &self.params.use_mmap)
            .field("use_mlock", &self.params.use_mlock)
            .field("kv_overrides", &"vec of kv_overrides")
            .finish()
    }
}

impl LlamaModelParams {
    /// See [`KvOverrides`]
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let params = Box::pin(LlamaModelParams::default());
    /// let kv_overrides = params.kv_overrides();
    /// let count = kv_overrides.into_iter().count();
    /// assert_eq!(count, 0);
    /// ```
    #[must_use]
    pub fn kv_overrides(&self) -> KvOverrides {
        KvOverrides::new(self)
    }

    /// Appends a key-value override to the model parameters. It must be pinned as this creates a self-referential struct.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use std::ffi::{CStr, CString};
    /// use std::pin::pin;
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// # use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
    /// let mut params = pin!(LlamaModelParams::default());
    /// let key = CString::new("key").expect("CString::new failed");
    /// params.as_mut().append_kv_override(&key, ParamOverrideValue::Int(50));
    ///
    /// let kv_overrides = params.kv_overrides().into_iter().collect::<Vec<_>>();
    /// assert_eq!(kv_overrides.len(), 1);
    ///
    /// let (k, v) = &kv_overrides[0];
    /// assert_eq!(v, &ParamOverrideValue::Int(50));
    ///
    /// assert_eq!(k.to_bytes(), b"key", "expected key to be 'key', was {:?}", k);
    /// ```
    #[allow(clippy::missing_panics_doc)] // panics are just to enforce internal invariants, not user errors
    pub fn append_kv_override(
        mut self: Pin<&mut Self>,
        key: &CStr,
        value: kv_overrides::ParamOverrideValue,
    ) {
        let kv_override = self
            .kv_overrides
            .get_mut(0)
            .expect("kv_overrides did not have a next allocated");

        assert_eq!(kv_override.key[0], 0, "last kv_override was not empty");

        // There should be some way to do this without iterating over everything.
        for (i, &c) in key.to_bytes_with_nul().iter().enumerate() {
            kv_override.key[i] = c_char::try_from(c).expect("invalid character in key");
        }

        kv_override.tag = value.tag();
        kv_override.__bindgen_anon_1 = value.value();

        // set to null pointer for panic safety (as push may move the vector, invalidating the pointer)
        self.params.kv_overrides = null();

        // push the next one to ensure we maintain the iterator invariant of ending with a 0
        self.kv_overrides
            .push(llama_cpp_sys_2::llama_model_kv_override {
                key: [0; 128],
                tag: 0,
                __bindgen_anon_1: llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 {
                    val_i64: 0,
                },
            });

        // set the pointer to the (potentially) new vector
        self.params.kv_overrides = self.kv_overrides.as_ptr();
    }
}

impl LlamaModelParams {
    /// Adds a buffer type override that moves all mixture-of-experts layers to the CPU.
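    ///
    /// # Examples
    ///
    /// A minimal sketch showing the call on pinned parameters:
    ///
    /// ```rust
    /// # use std::pin::pin;
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let mut params = pin!(LlamaModelParams::default());
    /// params.as_mut().add_cpu_moe_override();
    /// ```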
    pub fn add_cpu_moe_override(self: Pin<&mut Self>) {
        self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps");
    }

    /// Appends a buffer type override to the model parameters, moving tensors whose names match the given pattern to the CPU.
    /// It must be pinned as this creates a self-referential struct.
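    ///
    /// # Examples
    ///
    /// A minimal sketch; the pattern below is illustrative only:
    ///
    /// ```rust
    /// # use std::pin::pin;
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let mut params = pin!(LlamaModelParams::default());
    /// // keep tensors matching this (hypothetical) pattern on the CPU buffer type
    /// params.as_mut().add_cpu_buft_override(c"\\.ffn_up_exps");
    /// ```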
    pub fn add_cpu_buft_override(mut self: Pin<&mut Self>, key: &CStr) {
        let buft_override = self
            .buft_overrides
            .get_mut(0)
            .expect("buft_overrides did not have a next allocated");

        assert!(
            buft_override.pattern.is_null(),
            "last buft_override was not empty"
        );

        // validate that every byte of the pattern is a valid `c_char` before storing the pointer
        for &c in key.to_bytes_with_nul() {
            c_char::try_from(c).expect("invalid character in key");
        }

        buft_override.pattern = key.as_ptr();
        buft_override.buft = unsafe { llama_cpp_sys_2::ggml_backend_cpu_buffer_type() };

        // set to null pointer for panic safety (as push may move the vector, invalidating the pointer)
        self.params.tensor_buft_overrides = null();

        // push the next one to ensure we maintain the iterator invariant of ending with a 0
        self.buft_overrides
            .push(llama_cpp_sys_2::llama_model_tensor_buft_override {
                pattern: std::ptr::null(),
                buft: std::ptr::null_mut(),
            });

        // set the pointer to the (potentially) new vector
        self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
    }
}

impl LlamaModelParams {
    /// Get the number of layers to offload to the GPU.
    #[must_use]
    pub fn n_gpu_layers(&self) -> i32 {
        self.params.n_gpu_layers
    }

    /// The GPU that is used for scratch and small tensors
    #[must_use]
    pub fn main_gpu(&self) -> i32 {
        self.params.main_gpu
    }

    /// only load the vocabulary, no weights
    #[must_use]
    pub fn vocab_only(&self) -> bool {
        self.params.vocab_only
    }

    /// use mmap if possible
    #[must_use]
    pub fn use_mmap(&self) -> bool {
        self.params.use_mmap
    }

    /// force system to keep model in RAM
    #[must_use]
    pub fn use_mlock(&self) -> bool {
        self.params.use_mlock
    }

    /// sets the number of layers to offload to the GPU.
    /// ```
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let params = LlamaModelParams::default();
    /// let params = params.with_n_gpu_layers(1);
    /// assert_eq!(params.n_gpu_layers(), 1);
    /// ```
    #[must_use]
    pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self {
        // The only way this conversion can fail is if u32 overflows the i32 - in which case we set
        // to MAX
        let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX);
        self.params.n_gpu_layers = n_gpu_layers;
        self
    }

    /// sets the main GPU
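    ///
    /// The example below just round-trips the default device index:
    ///
    /// ```
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let params = LlamaModelParams::default().with_main_gpu(0);
    /// assert_eq!(params.main_gpu(), 0);
    /// ```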
    #[must_use]
    pub fn with_main_gpu(mut self, main_gpu: i32) -> Self {
        self.params.main_gpu = main_gpu;
        self
    }

    /// sets `vocab_only`
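    ///
    /// Useful when only the tokenizer is needed and no weights should be loaded:
    ///
    /// ```
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let params = LlamaModelParams::default().with_vocab_only(true);
    /// assert!(params.vocab_only());
    /// ```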
    #[must_use]
    pub fn with_vocab_only(mut self, vocab_only: bool) -> Self {
        self.params.vocab_only = vocab_only;
        self
    }

    /// sets `use_mlock`
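    ///
    /// This only toggles the flag; memory is locked when a model is actually loaded:
    ///
    /// ```
    /// # use llama_cpp_2::model::params::LlamaModelParams;
    /// let params = LlamaModelParams::default().with_use_mlock(true);
    /// assert!(params.use_mlock());
    /// ```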
    #[must_use]
    pub fn with_use_mlock(mut self, use_mlock: bool) -> Self {
        self.params.use_mlock = use_mlock;
        self
    }
}

/// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`)
/// ```
/// # use llama_cpp_2::model::params::LlamaModelParams;
/// let params = LlamaModelParams::default();
/// assert_eq!(params.n_gpu_layers(), 999, "n_gpu_layers should be 999");
/// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0");
/// assert_eq!(params.vocab_only(), false, "vocab_only should be false");
/// assert_eq!(params.use_mmap(), true, "use_mmap should be true");
/// assert_eq!(params.use_mlock(), false, "use_mlock should be false");
/// ```
impl Default for LlamaModelParams {
    fn default() -> Self {
        let default_params = unsafe { llama_cpp_sys_2::llama_model_default_params() };
        LlamaModelParams {
            params: default_params,
            // start with a single zeroed entry to maintain the invariant that the override list ends with a 0
            kv_overrides: vec![llama_cpp_sys_2::llama_model_kv_override {
                key: [0; 128],
                tag: 0,
                __bindgen_anon_1: llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 {
                    val_i64: 0,
                },
            }],
            buft_overrides: vec![llama_cpp_sys_2::llama_model_tensor_buft_override {
                pattern: std::ptr::null(),
                buft: std::ptr::null_mut(),
            }],
        }
    }
}