llama_cpp_bindings/model/
params.rs1use crate::LlamaCppError;
2use crate::context::params::LlamaContextParams;
3use crate::error::{FitError, ModelParamsError};
4use crate::model::llama_split_mode_parse_error::LlamaSplitModeParseError;
5use crate::model::params::fit_result::FitResult;
6use crate::model::params::kv_overrides::KvOverrides;
7use crate::model::split_mode::LlamaSplitMode;
8use std::ffi::{CStr, c_char};
9use std::fmt::{Debug, Formatter};
10use std::pin::Pin;
11use std::ptr::null;
12
13pub mod fit_result;
14pub mod kv_override_value_iterator;
15pub mod kv_overrides;
16pub mod param_override_value;
17pub mod unknown_kv_override_tag;
18
/// Number of slots in the fixed-size device array stored by [`LlamaModelParams`].
pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
20
/// Model-loading parameters together with the buffers that the raw parameter
/// struct points into.
///
/// The raw `params` struct only stores pointers; the remaining fields own the
/// memory those pointers refer to.
pub struct LlamaModelParams {
    /// Raw parameter struct handed to the native library.
    pub params: llama_cpp_bindings_sys::llama_model_params,
    /// Backing storage for `params.kv_overrides`.
    kv_overrides: Vec<llama_cpp_bindings_sys::llama_model_kv_override>,
    /// Backing storage for `params.tensor_buft_overrides`.
    buft_overrides: Vec<llama_cpp_bindings_sys::llama_model_tensor_buft_override>,
    /// Backing storage for `params.devices`; unused slots hold null pointers.
    devices: Pin<Box<[llama_cpp_bindings_sys::ggml_backend_dev_t; LLAMA_CPP_MAX_DEVICES]>>,
    /// Backing storage for `params.tensor_split`; (re)sized by `fit_params`.
    tensor_split: Vec<f32>,
}
28
29impl Debug for LlamaModelParams {
30 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
31 f.debug_struct("LlamaModelParams")
32 .field("n_gpu_layers", &self.params.n_gpu_layers)
33 .field("main_gpu", &self.params.main_gpu)
34 .field("vocab_only", &self.params.vocab_only)
35 .field("use_mmap", &self.params.use_mmap)
36 .field("use_mlock", &self.params.use_mlock)
37 .field("split_mode", &self.split_mode())
38 .field("devices", &self.devices)
39 .field("kv_overrides", &"vec of kv_overrides")
40 .finish_non_exhaustive()
41 }
42}
43
44impl LlamaModelParams {
45 #[must_use]
46 pub const fn kv_overrides(&self) -> KvOverrides<'_> {
47 KvOverrides::new(self)
48 }
49
50 pub fn append_kv_override(
55 mut self: Pin<&mut Self>,
56 key: &CStr,
57 value: param_override_value::ParamOverrideValue,
58 ) -> Result<(), ModelParamsError> {
59 let kv_override = self
60 .kv_overrides
61 .get_mut(0)
62 .ok_or(ModelParamsError::NoAvailableSlot)?;
63
64 if kv_override.key[0] != 0 {
65 return Err(ModelParamsError::SlotNotEmpty);
66 }
67
68 for (i, &byte) in key.to_bytes_with_nul().iter().enumerate() {
69 kv_override.key[i] = c_char::try_from(byte).map_err(|convert_error| {
70 ModelParamsError::InvalidCharacterInKey {
71 byte,
72 reason: convert_error.to_string(),
73 }
74 })?;
75 }
76
77 kv_override.tag = value.tag();
78 kv_override.__bindgen_anon_1 = value.value();
79
80 self.push_kv_override_terminator();
81
82 Ok(())
83 }
84
85 fn push_kv_override_terminator(mut self: Pin<&mut Self>) {
86 self.params.kv_overrides = null();
87
88 self.kv_overrides
89 .push(llama_cpp_bindings_sys::llama_model_kv_override {
90 key: [0; 128],
91 tag: 0,
92 __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
93 val_i64: 0,
94 },
95 });
96
97 self.params.kv_overrides = self.kv_overrides.as_ptr();
98 }
99}
100
101impl LlamaModelParams {
102 pub fn add_cpu_moe_override(self: Pin<&mut Self>) -> Result<(), ModelParamsError> {
106 self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps")
107 }
108
109 pub fn add_cpu_buft_override(
113 mut self: Pin<&mut Self>,
114 key: &CStr,
115 ) -> Result<(), ModelParamsError> {
116 let buft_override = self
117 .buft_overrides
118 .get_mut(0)
119 .ok_or(ModelParamsError::NoAvailableSlot)?;
120
121 if !buft_override.pattern.is_null() {
122 return Err(ModelParamsError::SlotNotEmpty);
123 }
124
125 for &byte in key.to_bytes_with_nul() {
126 c_char::try_from(byte).map_err(|convert_error| {
127 ModelParamsError::InvalidCharacterInKey {
128 byte,
129 reason: convert_error.to_string(),
130 }
131 })?;
132 }
133
134 buft_override.pattern = key.as_ptr();
135 buft_override.buft = unsafe { llama_cpp_bindings_sys::ggml_backend_cpu_buffer_type() };
136
137 self.push_buft_override_terminator();
138
139 Ok(())
140 }
141
142 fn push_buft_override_terminator(mut self: Pin<&mut Self>) {
143 self.params.tensor_buft_overrides = null();
144
145 self.buft_overrides
146 .push(llama_cpp_bindings_sys::llama_model_tensor_buft_override {
147 pattern: null(),
148 buft: std::ptr::null_mut(),
149 });
150
151 self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
152 }
153}
154
155impl LlamaModelParams {
156 #[must_use]
157 pub const fn n_gpu_layers(&self) -> i32 {
158 self.params.n_gpu_layers
159 }
160
161 #[must_use]
162 pub const fn main_gpu(&self) -> i32 {
163 self.params.main_gpu
164 }
165
166 #[must_use]
167 pub const fn vocab_only(&self) -> bool {
168 self.params.vocab_only
169 }
170
171 #[must_use]
172 pub const fn use_mmap(&self) -> bool {
173 self.params.use_mmap
174 }
175
176 #[must_use]
177 pub const fn use_mlock(&self) -> bool {
178 self.params.use_mlock
179 }
180
181 pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
184 LlamaSplitMode::try_from(self.params.split_mode)
185 }
186
187 #[must_use]
188 pub fn devices(&self) -> Vec<usize> {
189 let mut backend_devices = Vec::new();
190 for i in 0..unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
191 let dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(i) };
192 backend_devices.push(dev);
193 }
194 let mut devices = Vec::new();
195 for &dev in self.devices.iter() {
196 if dev.is_null() {
197 break;
198 }
199 let matched_index = backend_devices
200 .iter()
201 .enumerate()
202 .find(|&(_i, &d)| d == dev)
203 .map(|(index, _)| index);
204
205 if let Some(index) = matched_index {
206 devices.push(index);
207 }
208 }
209 devices
210 }
211
212 #[must_use]
213 pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self {
214 let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX);
215 self.params.n_gpu_layers = n_gpu_layers;
216 self
217 }
218
219 #[must_use]
220 pub const fn with_main_gpu(mut self, main_gpu: i32) -> Self {
221 self.params.main_gpu = main_gpu;
222 self
223 }
224
225 #[must_use]
226 pub const fn with_vocab_only(mut self, vocab_only: bool) -> Self {
227 self.params.vocab_only = vocab_only;
228 self
229 }
230
231 #[must_use]
232 pub const fn with_use_mmap(mut self, use_mmap: bool) -> Self {
233 self.params.use_mmap = use_mmap;
234 self
235 }
236
237 #[must_use]
238 pub const fn no_alloc(&self) -> bool {
239 self.params.no_alloc
240 }
241
242 #[must_use]
243 pub const fn with_no_alloc(mut self, no_alloc: bool) -> Self {
244 self.params.no_alloc = no_alloc;
245 if no_alloc {
246 self.params.use_mmap = false;
247 }
248 self
249 }
250
251 #[must_use]
252 pub const fn with_use_mlock(mut self, use_mlock: bool) -> Self {
253 self.params.use_mlock = use_mlock;
254 self
255 }
256
257 #[must_use]
258 pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
259 self.params.split_mode = split_mode.into();
260 self
261 }
262
263 pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LlamaCppError> {
266 for dev in self.devices.iter_mut() {
267 *dev = std::ptr::null_mut();
268 }
269 let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES);
270 if devices.len() > max_devices {
271 return Err(LlamaCppError::MaxDevicesExceeded(max_devices));
272 }
273 for (i, &dev) in devices.iter().enumerate() {
274 if dev >= unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
275 return Err(LlamaCppError::BackendDeviceNotFound(dev));
276 }
277 let backend_dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(dev) };
278 self.devices[i] = backend_dev;
279 }
280 self.params.devices = self.devices.as_mut_ptr();
281
282 Ok(self)
283 }
284}
285
286impl LlamaModelParams {
287 pub fn fit_params(
291 mut self: Pin<&mut Self>,
292 model_path: &CStr,
293 context_params: &mut LlamaContextParams,
294 margins: &mut [usize],
295 n_ctx_min: u32,
296 log_level: llama_cpp_bindings_sys::ggml_log_level,
297 ) -> Result<FitResult, FitError> {
298 let max_devices = unsafe { llama_cpp_bindings_sys::llama_max_devices() };
299 let max_buft = unsafe { llama_cpp_bindings_sys::llama_max_tensor_buft_overrides() };
300
301 self.tensor_split.clear();
302 self.tensor_split.resize(max_devices, 0.0);
303
304 self.buft_overrides.clear();
305 self.buft_overrides.resize(
306 max_buft + 1,
307 llama_cpp_bindings_sys::llama_model_tensor_buft_override {
308 pattern: null(),
309 buft: std::ptr::null_mut(),
310 },
311 );
312
313 self.params.tensor_split = null::<f32>();
314 self.params.tensor_buft_overrides = null();
315
316 let mut out_unrecognized_status_code: i32 = 0;
317 let mut out_error: *mut c_char = std::ptr::null_mut();
318
319 let status = unsafe {
320 llama_cpp_bindings_sys::llama_rs_fit_params(
321 model_path.as_ptr(),
322 &raw mut self.params,
323 &raw mut context_params.context_params,
324 self.tensor_split.as_mut_ptr(),
325 self.buft_overrides.as_mut_ptr(),
326 margins.as_mut_ptr(),
327 n_ctx_min,
328 log_level,
329 &raw mut out_unrecognized_status_code,
330 &raw mut out_error,
331 )
332 };
333
334 match status {
335 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_OK => {}
336 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_REPORTED_FAILURE => {
337 return Err(FitError::NoFittingMemoryLayout);
338 }
339 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_REPORTED_ERROR => {
340 return Err(FitError::Aborted);
341 }
342 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_RETURNED_UNRECOGNIZED_STATUS_CODE => {
343 return Err(FitError::UnknownStatus {
344 code: out_unrecognized_status_code,
345 });
346 }
347 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_ERROR_STRING_ALLOCATION_FAILED => {
348 return Err(FitError::NotEnoughMemory);
349 }
350 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_THREW_CXX_EXCEPTION => {
351 let message =
352 unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
353 return Err(FitError::Reported { message });
354 }
355 other => unreachable!("llama_rs_fit_params returned unrecognized wrapper status: {other}"),
356 }
357
358 self.params.tensor_split = self.tensor_split.as_ptr();
359 self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
360
361 Ok(FitResult {
362 n_ctx: context_params.context_params.n_ctx,
363 })
364 }
365}
366
367impl Default for LlamaModelParams {
368 fn default() -> Self {
369 let default_params = unsafe { llama_cpp_bindings_sys::llama_model_default_params() };
370 Self {
371 params: default_params,
372 kv_overrides: vec![llama_cpp_bindings_sys::llama_model_kv_override {
373 key: [0; 128],
374 tag: 0,
375 __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
376 val_i64: 0,
377 },
378 }],
379 buft_overrides: vec![llama_cpp_bindings_sys::llama_model_tensor_buft_override {
380 pattern: null(),
381 buft: std::ptr::null_mut(),
382 }],
383 devices: Box::pin([std::ptr::null_mut(); 16]),
384 tensor_split: Vec::new(),
385 }
386 }
387}
388
#[cfg(test)]
mod tests {
    use std::ffi::CString;
    use std::pin::pin;

    use super::LlamaModelParams;
    use crate::LlamaCppError;
    use crate::context::params::LlamaContextParams;
    use crate::error::{FitError, ModelParamsError};
    use crate::llama_backend::LlamaBackend;
    use crate::model::params::param_override_value::ParamOverrideValue;
    use crate::model::split_mode::LlamaSplitMode;

    // ---- defaults -------------------------------------------------------

    #[test]
    fn default_params_have_expected_values() {
        let defaults = LlamaModelParams::default();

        assert_eq!(defaults.n_gpu_layers(), -1);
        assert_eq!(defaults.main_gpu(), 0);
        assert!(!defaults.vocab_only());
        assert!(defaults.use_mmap());
        assert!(!defaults.use_mlock());
        assert_eq!(defaults.split_mode(), Ok(LlamaSplitMode::Layer));
        assert!(defaults.devices().is_empty());
    }

    // ---- scalar builders ------------------------------------------------

    #[test]
    fn n_gpu_layers_overflow_clamps_to_max() {
        let configured = LlamaModelParams::default().with_n_gpu_layers(u32::MAX);

        assert_eq!(configured.n_gpu_layers(), i32::MAX);
    }

    #[test]
    fn with_n_gpu_layers_sets_value() {
        let configured = LlamaModelParams::default().with_n_gpu_layers(32);

        assert_eq!(configured.n_gpu_layers(), 32);
    }

    #[test]
    fn with_main_gpu_sets_value() {
        let configured = LlamaModelParams::default().with_main_gpu(2);

        assert_eq!(configured.main_gpu(), 2);
    }

    #[test]
    fn with_split_mode_none() {
        let configured = LlamaModelParams::default().with_split_mode(LlamaSplitMode::None);

        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::None));
    }

    #[test]
    fn with_split_mode_row() {
        let configured = LlamaModelParams::default().with_split_mode(LlamaSplitMode::Row);

        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::Row));
    }

    #[test]
    fn with_vocab_only_enables() {
        let configured = LlamaModelParams::default().with_vocab_only(true);

        assert!(configured.vocab_only());
    }

    #[test]
    fn with_vocab_only_disables() {
        let configured = LlamaModelParams::default().with_vocab_only(false);

        assert!(!configured.vocab_only());
    }

    #[test]
    fn with_use_mmap_enables() {
        let configured = LlamaModelParams::default().with_use_mmap(true);

        assert!(configured.use_mmap());
    }

    #[test]
    fn with_use_mmap_disables() {
        let configured = LlamaModelParams::default().with_use_mmap(false);

        assert!(!configured.use_mmap());
    }

    #[test]
    fn with_no_alloc_enables() {
        let configured = LlamaModelParams::default().with_no_alloc(true);

        assert!(configured.no_alloc());
    }

    #[test]
    fn with_no_alloc_disables() {
        let configured = LlamaModelParams::default().with_no_alloc(false);

        assert!(!configured.no_alloc());
    }

    #[test]
    fn with_no_alloc_true_disables_mmap() {
        let with_mmap = LlamaModelParams::default().with_use_mmap(true);
        let configured = with_mmap.with_no_alloc(true);

        assert!(configured.no_alloc());
        assert!(!configured.use_mmap());
    }

    #[test]
    fn default_no_alloc_is_false() {
        let defaults = LlamaModelParams::default();

        assert!(!defaults.no_alloc());
    }

    #[test]
    fn with_use_mlock_enables() {
        let configured = LlamaModelParams::default().with_use_mlock(true);

        assert!(configured.use_mlock());
    }

    #[test]
    fn with_use_mlock_disables() {
        let configured = LlamaModelParams::default().with_use_mlock(false);

        assert!(!configured.use_mlock());
    }

    #[test]
    fn debug_format_contains_field_names() {
        let rendered = format!("{:?}", LlamaModelParams::default());

        for field_name in [
            "n_gpu_layers",
            "main_gpu",
            "vocab_only",
            "use_mmap",
            "use_mlock",
            "split_mode",
        ] {
            assert!(rendered.contains(field_name));
        }
    }

    #[test]
    fn builder_chaining_preserves_all_values() {
        let configured = LlamaModelParams::default()
            .with_n_gpu_layers(10)
            .with_main_gpu(1)
            .with_split_mode(LlamaSplitMode::Row)
            .with_vocab_only(true)
            .with_use_mlock(true);

        assert_eq!(configured.n_gpu_layers(), 10);
        assert_eq!(configured.main_gpu(), 1);
        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::Row));
        assert!(configured.vocab_only());
        assert!(configured.use_mlock());
    }

    // ---- device selection -----------------------------------------------

    #[test]
    fn with_devices_empty_list_succeeds() {
        let outcome = LlamaModelParams::default().with_devices(&[]);

        assert!(outcome.is_ok());
        let configured = outcome.unwrap();
        assert!(configured.devices().is_empty());
    }

    #[test]
    fn with_devices_invalid_index_returns_error() {
        let outcome = LlamaModelParams::default().with_devices(&[999_999]);
        let error = outcome.unwrap_err();

        assert!(matches!(
            error,
            LlamaCppError::BackendDeviceNotFound(999_999)
        ));
    }

    // ---- buffer-type overrides ------------------------------------------

    #[test]
    fn add_cpu_buft_override_succeeds() {
        let mut pinned = pin!(LlamaModelParams::default());

        let outcome = pinned.as_mut().add_cpu_buft_override(c"test_pattern");

        assert!(outcome.is_ok());
    }

    #[test]
    fn add_cpu_buft_override_twice_fails_with_slot_not_empty() {
        let mut pinned = pin!(LlamaModelParams::default());
        pinned
            .as_mut()
            .add_cpu_buft_override(c"first_pattern")
            .unwrap();

        let second = pinned.as_mut().add_cpu_buft_override(c"second_pattern");

        assert_eq!(second.unwrap_err(), ModelParamsError::SlotNotEmpty);
    }

    #[test]
    fn add_cpu_moe_override_succeeds() {
        let mut pinned = pin!(LlamaModelParams::default());

        let outcome = pinned.as_mut().add_cpu_moe_override();

        assert!(outcome.is_ok());
    }

    // ---- key/value overrides --------------------------------------------

    #[test]
    fn append_kv_override_twice_fails_with_slot_not_empty() {
        let mut pinned = pin!(LlamaModelParams::default());

        let first_key = CString::new("first_key").unwrap();
        pinned
            .as_mut()
            .append_kv_override(&first_key, ParamOverrideValue::Int(1))
            .unwrap();

        let second_key = CString::new("second_key").unwrap();
        let second = pinned
            .as_mut()
            .append_kv_override(&second_key, ParamOverrideValue::Int(2));

        assert_eq!(second.unwrap_err(), ModelParamsError::SlotNotEmpty);
    }

    #[test]
    fn with_devices_too_many_returns_max_exceeded() {
        let too_many = (0..=16).collect::<Vec<usize>>();

        let outcome = LlamaModelParams::default().with_devices(&too_many);
        let message = outcome.unwrap_err().to_string();

        assert!(message.contains("Max devices exceeded"));
    }

    #[test]
    fn with_devices_sets_devices_when_available() {
        #[cfg(feature = "dynamic-backends")]
        crate::load_backends::load_backends().unwrap();

        let dev_count = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() };
        assert!(dev_count > 0, "Test requires at least one backend device");

        let configured = LlamaModelParams::default().with_devices(&[0]).unwrap();

        assert_eq!(configured.devices().len(), 1);
        assert_eq!(configured.devices()[0], 0);
    }

    #[test]
    fn with_devices_invalid_index_returns_not_found() {
        let outcome = LlamaModelParams::default().with_devices(&[usize::MAX]);
        let message = outcome.unwrap_err().to_string();

        assert!(message.contains("Backend device"));
    }

    // ---- key validation -------------------------------------------------

    #[test]
    #[cfg(not(target_os = "windows"))]
    fn append_kv_override_with_high_byte_returns_invalid_character_error() {
        let high_byte_key = std::ffi::CStr::from_bytes_with_nul(b"\xff\0").unwrap();
        let mut pinned = pin!(LlamaModelParams::default());

        let outcome = pinned
            .as_mut()
            .append_kv_override(high_byte_key, ParamOverrideValue::Int(1));

        assert!(matches!(
            outcome,
            Err(ModelParamsError::InvalidCharacterInKey { byte: 0xff, .. })
        ));
    }

    #[test]
    #[cfg(not(target_os = "windows"))]
    fn add_cpu_buft_override_with_high_byte_returns_invalid_character_error() {
        let high_byte_key = std::ffi::CStr::from_bytes_with_nul(b"\xff\0").unwrap();
        let mut pinned = pin!(LlamaModelParams::default());

        let outcome = pinned.as_mut().add_cpu_buft_override(high_byte_key);

        assert!(matches!(
            outcome,
            Err(ModelParamsError::InvalidCharacterInKey { byte: 0xff, .. })
        ));
    }

    // ---- parameter fitting ----------------------------------------------

    #[test]
    #[serial_test::serial]
    fn fit_params_invalid_model_path_returns_error() {
        let _backend = LlamaBackend::init();
        let mut pinned = pin!(LlamaModelParams::default());
        let mut context_params = LlamaContextParams::default();
        let mut margins = vec![0usize; crate::max_devices()];

        let result = pinned.as_mut().fit_params(
            c"/nonexistent/path/to/model.gguf",
            &mut context_params,
            &mut margins,
            512,
            llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
        );

        assert!(
            matches!(result, Err(FitError::Aborted | FitError::Reported { .. })),
            "expected Aborted or Reported, got {result:?}"
        );
    }
}