//! A safe wrapper around `llama_model_params`.

use crate::LlamaCppError;
use crate::error::ModelParamsError;
use crate::model::params::kv_overrides::KvOverrides;
use crate::model::split_mode::{LlamaSplitMode, LlamaSplitModeParseError};
use std::ffi::{CStr, c_char};
use std::fmt::{Debug, Formatter};
use std::pin::Pin;
use std::ptr::null;

pub mod kv_overrides;
pub mod param_override_value;

/// The maximum number of devices supported.
///
/// The real maximum number of devices is the lesser one of this value and the value returned by
/// `llama_cpp_bindings::max_devices()`.
pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
21/// A safe wrapper around `llama_model_params`.
22pub struct LlamaModelParams {
23    /// The underlying `llama_model_params` from the C API.
24    pub params: llama_cpp_bindings_sys::llama_model_params,
25    kv_overrides: Vec<llama_cpp_bindings_sys::llama_model_kv_override>,
26    buft_overrides: Vec<llama_cpp_bindings_sys::llama_model_tensor_buft_override>,
27    devices: Pin<Box<[llama_cpp_bindings_sys::ggml_backend_dev_t; LLAMA_CPP_MAX_DEVICES]>>,
28}
29
30impl Debug for LlamaModelParams {
31    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
32        f.debug_struct("LlamaModelParams")
33            .field("n_gpu_layers", &self.params.n_gpu_layers)
34            .field("main_gpu", &self.params.main_gpu)
35            .field("vocab_only", &self.params.vocab_only)
36            .field("use_mmap", &self.params.use_mmap)
37            .field("use_mlock", &self.params.use_mlock)
38            .field("split_mode", &self.split_mode())
39            .field("devices", &self.devices)
40            .field("kv_overrides", &"vec of kv_overrides")
41            .finish()
42    }
43}
44
45impl LlamaModelParams {
46    /// See [`KvOverrides`]
47    ///
48    /// # Examples
49    ///
50    /// ```rust
51    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
52    /// let params = Box::pin(LlamaModelParams::default());
53    /// let kv_overrides = params.kv_overrides();
54    /// let count = kv_overrides.into_iter().count();
55    /// assert_eq!(count, 0);
56    /// ```
57    #[must_use]
58    pub fn kv_overrides(&self) -> KvOverrides<'_> {
59        KvOverrides::new(self)
60    }
61
62    /// Appends a key-value override to the model parameters. It must be pinned as this creates a self-referential struct.
63    ///
64    /// # Errors
65    /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
66    /// the slot is not empty, or the key contains invalid characters.
67    ///
68    /// # Examples
69    ///
70    /// ```rust
71    /// # use std::ffi::{CStr, CString};
72    /// use std::pin::pin;
73    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
74    /// # use llama_cpp_bindings::model::params::param_override_value::ParamOverrideValue;
75    /// let mut params = pin!(LlamaModelParams::default());
76    /// let key = CString::new("key").expect("CString::new failed");
77    /// params.as_mut().append_kv_override(&key, ParamOverrideValue::Int(50)).unwrap();
78    ///
79    /// let kv_overrides = params.kv_overrides().into_iter().collect::<Vec<_>>();
80    /// assert_eq!(kv_overrides.len(), 1);
81    ///
82    /// let (k, v) = &kv_overrides[0];
83    /// assert_eq!(v, &ParamOverrideValue::Int(50));
84    ///
85    /// assert_eq!(k.to_bytes(), b"key", "expected key to be 'key', was {:?}", k);
86    /// ```
87    pub fn append_kv_override(
88        mut self: Pin<&mut Self>,
89        key: &CStr,
90        value: param_override_value::ParamOverrideValue,
91    ) -> Result<(), ModelParamsError> {
92        let kv_override = self
93            .kv_overrides
94            .get_mut(0)
95            .ok_or(ModelParamsError::NoAvailableSlot)?;
96
97        if kv_override.key[0] != 0 {
98            return Err(ModelParamsError::SlotNotEmpty);
99        }
100
101        for (i, &byte) in key.to_bytes_with_nul().iter().enumerate() {
102            kv_override.key[i] = c_char::try_from(byte).map_err(|convert_error| {
103                ModelParamsError::InvalidCharacterInKey {
104                    byte,
105                    reason: convert_error.to_string(),
106                }
107            })?;
108        }
109
110        kv_override.tag = value.tag();
111        kv_override.__bindgen_anon_1 = value.value();
112
113        // set to null pointer for panic safety (as push may move the vector, invalidating the pointer)
114        self.params.kv_overrides = null();
115
116        // push the next one to ensure we maintain the iterator invariant of ending with a 0
117        self.kv_overrides
118            .push(llama_cpp_bindings_sys::llama_model_kv_override {
119                key: [0; 128],
120                tag: 0,
121                __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
122                    val_i64: 0,
123                },
124            });
125
126        // set the pointer to the (potentially) new vector
127        self.params.kv_overrides = self.kv_overrides.as_ptr();
128
129        Ok(())
130    }
131}
132
133impl LlamaModelParams {
134    /// Adds buffer type overrides to move all mixture-of-experts layers to CPU.
135    ///
136    /// # Errors
137    /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
138    /// the slot is not empty, or the key contains invalid characters.
139    pub fn add_cpu_moe_override(self: Pin<&mut Self>) -> Result<(), ModelParamsError> {
140        self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps")
141    }
142
143    /// Appends a buffer type override to the model parameters, to move layers matching pattern to CPU.
144    /// It must be pinned as this creates a self-referential struct.
145    ///
146    /// # Errors
147    /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
148    /// the slot is not empty, or the key contains invalid characters.
149    pub fn add_cpu_buft_override(
150        mut self: Pin<&mut Self>,
151        key: &CStr,
152    ) -> Result<(), ModelParamsError> {
153        let buft_override = self
154            .buft_overrides
155            .get_mut(0)
156            .ok_or(ModelParamsError::NoAvailableSlot)?;
157
158        if !buft_override.pattern.is_null() {
159            return Err(ModelParamsError::SlotNotEmpty);
160        }
161
162        for &byte in key.to_bytes_with_nul() {
163            c_char::try_from(byte).map_err(|convert_error| {
164                ModelParamsError::InvalidCharacterInKey {
165                    byte,
166                    reason: convert_error.to_string(),
167                }
168            })?;
169        }
170
171        buft_override.pattern = key.as_ptr();
172        buft_override.buft = unsafe { llama_cpp_bindings_sys::ggml_backend_cpu_buffer_type() };
173
174        // set to null pointer for panic safety (as push may move the vector, invalidating the pointer)
175        self.params.tensor_buft_overrides = null();
176
177        // push the next one to ensure we maintain the iterator invariant of ending with a 0
178        self.buft_overrides
179            .push(llama_cpp_bindings_sys::llama_model_tensor_buft_override {
180                pattern: std::ptr::null(),
181                buft: std::ptr::null_mut(),
182            });
183
184        // set the pointer to the (potentially) new vector
185        self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
186
187        Ok(())
188    }
189}
190
191impl LlamaModelParams {
192    /// Get the number of layers to offload to the GPU.
193    #[must_use]
194    pub fn n_gpu_layers(&self) -> i32 {
195        self.params.n_gpu_layers
196    }
197
198    /// The GPU that is used for scratch and small tensors
199    #[must_use]
200    pub fn main_gpu(&self) -> i32 {
201        self.params.main_gpu
202    }
203
204    /// only load the vocabulary, no weights
205    #[must_use]
206    pub fn vocab_only(&self) -> bool {
207        self.params.vocab_only
208    }
209
210    /// use mmap if possible
211    #[must_use]
212    pub fn use_mmap(&self) -> bool {
213        self.params.use_mmap
214    }
215
216    /// force system to keep model in RAM
217    #[must_use]
218    pub fn use_mlock(&self) -> bool {
219        self.params.use_mlock
220    }
221
222    /// get the split mode
223    ///
224    /// # Errors
225    /// Returns `LlamaSplitModeParseError` if the unknown split mode is encountered.
226    pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
227        LlamaSplitMode::try_from(self.params.split_mode)
228    }
229
230    /// get the devices
231    #[must_use]
232    pub fn devices(&self) -> Vec<usize> {
233        let mut backend_devices = Vec::new();
234        for i in 0..unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
235            let dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(i) };
236            backend_devices.push(dev);
237        }
238        let mut devices = Vec::new();
239        for &dev in self.devices.iter() {
240            if dev.is_null() {
241                break;
242            }
243            if let Some((index, _)) = backend_devices
244                .iter()
245                .enumerate()
246                .find(|&(_i, &d)| d == dev)
247            {
248                devices.push(index);
249            }
250        }
251        devices
252    }
253
254    /// sets the number of gpu layers to offload to the GPU.
255    /// ```
256    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
257    /// let params = LlamaModelParams::default();
258    /// let params = params.with_n_gpu_layers(1);
259    /// assert_eq!(params.n_gpu_layers(), 1);
260    /// ```
261    #[must_use]
262    pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self {
263        // The only way this conversion can fail is if u32 overflows the i32 - in which case we set
264        // to MAX
265        let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX);
266        self.params.n_gpu_layers = n_gpu_layers;
267        self
268    }
269
270    /// sets the main GPU
271    ///
272    /// To enable this option, you must set `split_mode` to `LlamaSplitMode::None` to enable single GPU mode.
273    #[must_use]
274    pub fn with_main_gpu(mut self, main_gpu: i32) -> Self {
275        self.params.main_gpu = main_gpu;
276        self
277    }
278
279    /// sets `vocab_only`
280    #[must_use]
281    pub fn with_vocab_only(mut self, vocab_only: bool) -> Self {
282        self.params.vocab_only = vocab_only;
283        self
284    }
285
286    /// sets `use_mlock`
287    #[must_use]
288    pub fn with_use_mlock(mut self, use_mlock: bool) -> Self {
289        self.params.use_mlock = use_mlock;
290        self
291    }
292
293    /// sets `split_mode`
294    #[must_use]
295    pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
296        self.params.split_mode = split_mode.into();
297        self
298    }
299
300    /// sets `devices`
301    ///
302    /// The devices are specified as indices that correspond to the ggml backend device indices.
303    ///
304    /// The maximum number of devices is 16.
305    ///
306    /// You don't need to specify CPU or ACCEL devices.
307    ///
308    /// # Errors
309    /// Returns `LlamaCppError::BackendDeviceNotFound` if any device index is invalid.
310    pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LlamaCppError> {
311        for dev in self.devices.iter_mut() {
312            *dev = std::ptr::null_mut();
313        }
314        // Check device count
315        let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES);
316        if devices.len() > max_devices {
317            return Err(LlamaCppError::MaxDevicesExceeded(max_devices));
318        }
319        for (i, &dev) in devices.iter().enumerate() {
320            if dev >= unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
321                return Err(LlamaCppError::BackendDeviceNotFound(dev));
322            }
323            let backend_dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(dev) };
324            self.devices[i] = backend_dev;
325        }
326        if self.devices.is_empty() {
327            self.params.devices = std::ptr::null_mut();
328        } else {
329            self.params.devices = self.devices.as_mut_ptr();
330        }
331        Ok(self)
332    }
333}
334
335/// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`)
336/// ```
337/// # use llama_cpp_bindings::model::params::LlamaModelParams;
338/// use llama_cpp_bindings::model::split_mode::LlamaSplitMode;
339/// let params = LlamaModelParams::default();
340/// assert_eq!(params.n_gpu_layers(), -1, "n_gpu_layers should be -1");
341/// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0");
342/// assert_eq!(params.vocab_only(), false, "vocab_only should be false");
343/// assert_eq!(params.use_mmap(), true, "use_mmap should be true");
344/// assert_eq!(params.use_mlock(), false, "use_mlock should be false");
345/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER");
346/// assert_eq!(params.devices().len(), 0, "devices should be empty");
347/// ```
348impl Default for LlamaModelParams {
349    fn default() -> Self {
350        let default_params = unsafe { llama_cpp_bindings_sys::llama_model_default_params() };
351        LlamaModelParams {
352            params: default_params,
353            // push the next one to ensure we maintain the iterator invariant of ending with a 0
354            kv_overrides: vec![llama_cpp_bindings_sys::llama_model_kv_override {
355                key: [0; 128],
356                tag: 0,
357                __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
358                    val_i64: 0,
359                },
360            }],
361            buft_overrides: vec![llama_cpp_bindings_sys::llama_model_tensor_buft_override {
362                pattern: std::ptr::null(),
363                buft: std::ptr::null_mut(),
364            }],
365            devices: Box::pin([std::ptr::null_mut(); 16]),
366        }
367    }
368}
369
#[cfg(test)]
mod tests {
    use super::LlamaModelParams;
    use crate::model::split_mode::LlamaSplitMode;

    /// The defaults must mirror `llama_model_default_params` from llama.cpp.
    #[test]
    fn default_params_have_expected_values() {
        let params = LlamaModelParams::default();

        assert_eq!(params.n_gpu_layers(), -1);
        assert_eq!(params.main_gpu(), 0);
        assert!(!params.vocab_only());
        assert!(params.use_mmap());
        assert!(!params.use_mlock());
        assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer));
        assert!(params.devices().is_empty());
    }

    /// A `u32` layer count that overflows `i32` must clamp to `i32::MAX`, not wrap.
    #[test]
    fn n_gpu_layers_overflow_clamps_to_max() {
        let params = LlamaModelParams::default().with_n_gpu_layers(u32::MAX);

        assert_eq!(params.n_gpu_layers(), i32::MAX);
    }
}
395}