llama_cpp_bindings/model/
params.rs1use crate::LlamaCppError;
4use crate::context::params::LlamaContextParams;
5use crate::error::{FitError, ModelParamsError};
6use crate::model::llama_split_mode_parse_error::LlamaSplitModeParseError;
7use crate::model::params::fit_result::FitResult;
8use crate::model::params::kv_overrides::KvOverrides;
9use crate::model::split_mode::LlamaSplitMode;
10use std::ffi::{CStr, c_char};
11use std::fmt::{Debug, Formatter};
12use std::pin::Pin;
13use std::ptr::null;
14
15pub mod fit_result;
16pub mod kv_override_value_iterator;
17pub mod kv_overrides;
18pub mod param_override_value;
19pub mod unknown_kv_override_tag;
20
21pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
26
27pub struct LlamaModelParams {
29 pub params: llama_cpp_bindings_sys::llama_model_params,
31 kv_overrides: Vec<llama_cpp_bindings_sys::llama_model_kv_override>,
32 buft_overrides: Vec<llama_cpp_bindings_sys::llama_model_tensor_buft_override>,
33 devices: Pin<Box<[llama_cpp_bindings_sys::ggml_backend_dev_t; LLAMA_CPP_MAX_DEVICES]>>,
34 tensor_split: Vec<f32>,
35}
36
37impl Debug for LlamaModelParams {
38 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
39 f.debug_struct("LlamaModelParams")
40 .field("n_gpu_layers", &self.params.n_gpu_layers)
41 .field("main_gpu", &self.params.main_gpu)
42 .field("vocab_only", &self.params.vocab_only)
43 .field("use_mmap", &self.params.use_mmap)
44 .field("use_mlock", &self.params.use_mlock)
45 .field("split_mode", &self.split_mode())
46 .field("devices", &self.devices)
47 .field("kv_overrides", &"vec of kv_overrides")
48 .finish_non_exhaustive()
49 }
50}
51
52impl LlamaModelParams {
53 #[must_use]
65 pub const fn kv_overrides(&self) -> KvOverrides<'_> {
66 KvOverrides::new(self)
67 }
68
69 pub fn append_kv_override(
95 mut self: Pin<&mut Self>,
96 key: &CStr,
97 value: param_override_value::ParamOverrideValue,
98 ) -> Result<(), ModelParamsError> {
99 let kv_override = self
100 .kv_overrides
101 .get_mut(0)
102 .ok_or(ModelParamsError::NoAvailableSlot)?;
103
104 if kv_override.key[0] != 0 {
105 return Err(ModelParamsError::SlotNotEmpty);
106 }
107
108 for (i, &byte) in key.to_bytes_with_nul().iter().enumerate() {
109 kv_override.key[i] = c_char::try_from(byte).map_err(|convert_error| {
110 ModelParamsError::InvalidCharacterInKey {
111 byte,
112 reason: convert_error.to_string(),
113 }
114 })?;
115 }
116
117 kv_override.tag = value.tag();
118 kv_override.__bindgen_anon_1 = value.value();
119
120 self.push_kv_override_terminator();
121
122 Ok(())
123 }
124
125 fn push_kv_override_terminator(mut self: Pin<&mut Self>) {
130 self.params.kv_overrides = null();
131
132 self.kv_overrides
133 .push(llama_cpp_bindings_sys::llama_model_kv_override {
134 key: [0; 128],
135 tag: 0,
136 __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
137 val_i64: 0,
138 },
139 });
140
141 self.params.kv_overrides = self.kv_overrides.as_ptr();
142 }
143}
144
145impl LlamaModelParams {
146 pub fn add_cpu_moe_override(self: Pin<&mut Self>) -> Result<(), ModelParamsError> {
152 self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps")
153 }
154
155 pub fn add_cpu_buft_override(
162 mut self: Pin<&mut Self>,
163 key: &CStr,
164 ) -> Result<(), ModelParamsError> {
165 let buft_override = self
166 .buft_overrides
167 .get_mut(0)
168 .ok_or(ModelParamsError::NoAvailableSlot)?;
169
170 if !buft_override.pattern.is_null() {
171 return Err(ModelParamsError::SlotNotEmpty);
172 }
173
174 for &byte in key.to_bytes_with_nul() {
175 c_char::try_from(byte).map_err(|convert_error| {
176 ModelParamsError::InvalidCharacterInKey {
177 byte,
178 reason: convert_error.to_string(),
179 }
180 })?;
181 }
182
183 buft_override.pattern = key.as_ptr();
184 buft_override.buft = unsafe { llama_cpp_bindings_sys::ggml_backend_cpu_buffer_type() };
185
186 self.push_buft_override_terminator();
187
188 Ok(())
189 }
190
191 fn push_buft_override_terminator(mut self: Pin<&mut Self>) {
196 self.params.tensor_buft_overrides = null();
197
198 self.buft_overrides
199 .push(llama_cpp_bindings_sys::llama_model_tensor_buft_override {
200 pattern: null(),
201 buft: std::ptr::null_mut(),
202 });
203
204 self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
205 }
206}
207
208impl LlamaModelParams {
209 #[must_use]
211 pub const fn n_gpu_layers(&self) -> i32 {
212 self.params.n_gpu_layers
213 }
214
215 #[must_use]
217 pub const fn main_gpu(&self) -> i32 {
218 self.params.main_gpu
219 }
220
221 #[must_use]
223 pub const fn vocab_only(&self) -> bool {
224 self.params.vocab_only
225 }
226
227 #[must_use]
229 pub const fn use_mmap(&self) -> bool {
230 self.params.use_mmap
231 }
232
233 #[must_use]
235 pub const fn use_mlock(&self) -> bool {
236 self.params.use_mlock
237 }
238
239 pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
244 LlamaSplitMode::try_from(self.params.split_mode)
245 }
246
247 #[must_use]
249 pub fn devices(&self) -> Vec<usize> {
250 let mut backend_devices = Vec::new();
251 for i in 0..unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
252 let dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(i) };
253 backend_devices.push(dev);
254 }
255 let mut devices = Vec::new();
256 for &dev in self.devices.iter() {
257 if dev.is_null() {
258 break;
259 }
260 let matched_index = backend_devices
261 .iter()
262 .enumerate()
263 .find(|&(_i, &d)| d == dev)
264 .map(|(index, _)| index);
265
266 if let Some(index) = matched_index {
267 devices.push(index);
268 }
269 }
270 devices
271 }
272
273 #[must_use]
281 pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self {
282 let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX);
283 self.params.n_gpu_layers = n_gpu_layers;
284 self
285 }
286
287 #[must_use]
291 pub const fn with_main_gpu(mut self, main_gpu: i32) -> Self {
292 self.params.main_gpu = main_gpu;
293 self
294 }
295
296 #[must_use]
298 pub const fn with_vocab_only(mut self, vocab_only: bool) -> Self {
299 self.params.vocab_only = vocab_only;
300 self
301 }
302
303 #[must_use]
313 pub const fn with_use_mmap(mut self, use_mmap: bool) -> Self {
314 self.params.use_mmap = use_mmap;
315 self
316 }
317
318 #[must_use]
320 pub const fn no_alloc(&self) -> bool {
321 self.params.no_alloc
322 }
323
324 #[must_use]
336 pub const fn with_no_alloc(mut self, no_alloc: bool) -> Self {
337 self.params.no_alloc = no_alloc;
338 if no_alloc {
339 self.params.use_mmap = false;
340 }
341 self
342 }
343
344 #[must_use]
346 pub const fn with_use_mlock(mut self, use_mlock: bool) -> Self {
347 self.params.use_mlock = use_mlock;
348 self
349 }
350
351 #[must_use]
353 pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
354 self.params.split_mode = split_mode.into();
355 self
356 }
357
358 pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LlamaCppError> {
369 for dev in self.devices.iter_mut() {
370 *dev = std::ptr::null_mut();
371 }
372 let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES);
373 if devices.len() > max_devices {
374 return Err(LlamaCppError::MaxDevicesExceeded(max_devices));
375 }
376 for (i, &dev) in devices.iter().enumerate() {
377 if dev >= unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() } {
378 return Err(LlamaCppError::BackendDeviceNotFound(dev));
379 }
380 let backend_dev = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_get(dev) };
381 self.devices[i] = backend_dev;
382 }
383 self.params.devices = self.devices.as_mut_ptr();
384
385 Ok(self)
386 }
387}
388
389impl LlamaModelParams {
390 pub fn fit_params(
426 mut self: Pin<&mut Self>,
427 model_path: &CStr,
428 context_params: &mut LlamaContextParams,
429 margins: &mut [usize],
430 n_ctx_min: u32,
431 log_level: llama_cpp_bindings_sys::ggml_log_level,
432 ) -> Result<FitResult, FitError> {
433 let max_devices = unsafe { llama_cpp_bindings_sys::llama_max_devices() };
434 let max_buft = unsafe { llama_cpp_bindings_sys::llama_max_tensor_buft_overrides() };
435
436 self.tensor_split.clear();
437 self.tensor_split.resize(max_devices, 0.0);
438
439 self.buft_overrides.clear();
440 self.buft_overrides.resize(
441 max_buft + 1,
442 llama_cpp_bindings_sys::llama_model_tensor_buft_override {
443 pattern: null(),
444 buft: std::ptr::null_mut(),
445 },
446 );
447
448 self.params.tensor_split = null::<f32>();
449 self.params.tensor_buft_overrides = null();
450
451 let mut out_unrecognized_status_code: i32 = 0;
452 let mut out_error: *mut c_char = std::ptr::null_mut();
453
454 let status = unsafe {
455 llama_cpp_bindings_sys::llama_rs_fit_params(
456 model_path.as_ptr(),
457 &raw mut self.params,
458 &raw mut context_params.context_params,
459 self.tensor_split.as_mut_ptr(),
460 self.buft_overrides.as_mut_ptr(),
461 margins.as_mut_ptr(),
462 n_ctx_min,
463 log_level,
464 &raw mut out_unrecognized_status_code,
465 &raw mut out_error,
466 )
467 };
468
469 match status {
470 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_OK => {}
471 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_REPORTED_FAILURE => {
472 return Err(FitError::NoFittingMemoryLayout);
473 }
474 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_REPORTED_ERROR => {
475 return Err(FitError::Aborted);
476 }
477 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_RETURNED_UNRECOGNIZED_STATUS_CODE => {
478 return Err(FitError::UnknownStatus {
479 code: out_unrecognized_status_code,
480 });
481 }
482 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_ERROR_STRING_ALLOCATION_FAILED => {
483 return Err(FitError::NotEnoughMemory);
484 }
485 llama_cpp_bindings_sys::LLAMA_RS_FIT_PARAMS_VENDORED_THREW_CXX_EXCEPTION => {
486 let message =
487 unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
488 return Err(FitError::Reported { message });
489 }
490 other => unreachable!("llama_rs_fit_params returned unrecognized wrapper status: {other}"),
491 }
492
493 self.params.tensor_split = self.tensor_split.as_ptr();
494 self.params.tensor_buft_overrides = self.buft_overrides.as_ptr();
495
496 Ok(FitResult {
497 n_ctx: context_params.context_params.n_ctx,
498 })
499 }
500}
501
502impl Default for LlamaModelParams {
516 fn default() -> Self {
517 let default_params = unsafe { llama_cpp_bindings_sys::llama_model_default_params() };
518 Self {
519 params: default_params,
520 kv_overrides: vec![llama_cpp_bindings_sys::llama_model_kv_override {
521 key: [0; 128],
522 tag: 0,
523 __bindgen_anon_1: llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
524 val_i64: 0,
525 },
526 }],
527 buft_overrides: vec![llama_cpp_bindings_sys::llama_model_tensor_buft_override {
528 pattern: null(),
529 buft: std::ptr::null_mut(),
530 }],
531 devices: Box::pin([std::ptr::null_mut(); 16]),
532 tensor_split: Vec::new(),
533 }
534 }
535}
536
#[cfg(test)]
mod tests {
    use super::LlamaModelParams;
    use crate::model::split_mode::LlamaSplitMode;

    /// Pins the values a defaulted parameter set is expected to expose.
    #[test]
    fn default_params_have_expected_values() {
        let defaults = LlamaModelParams::default();

        assert_eq!(defaults.n_gpu_layers(), -1);
        assert_eq!(defaults.main_gpu(), 0);
        assert!(!defaults.vocab_only());
        assert!(defaults.use_mmap());
        assert!(!defaults.use_mlock());
        assert_eq!(defaults.split_mode(), Ok(LlamaSplitMode::Layer));
        assert!(defaults.devices().is_empty());
    }

    /// A layer count that does not fit in `i32` is clamped, not wrapped.
    #[test]
    fn n_gpu_layers_overflow_clamps_to_max() {
        let clamped = LlamaModelParams::default().with_n_gpu_layers(u32::MAX);

        assert_eq!(clamped.n_gpu_layers(), i32::MAX);
    }

    /// An in-range layer count is stored unchanged.
    #[test]
    fn with_n_gpu_layers_sets_value() {
        let configured = LlamaModelParams::default().with_n_gpu_layers(32);

        assert_eq!(configured.n_gpu_layers(), 32);
    }

    /// The main GPU index is stored unchanged.
    #[test]
    fn with_main_gpu_sets_value() {
        let configured = LlamaModelParams::default().with_main_gpu(2);

        assert_eq!(configured.main_gpu(), 2);
    }

    /// `LlamaSplitMode::None` round-trips through the raw parameters.
    #[test]
    fn with_split_mode_none() {
        let configured = LlamaModelParams::default().with_split_mode(LlamaSplitMode::None);

        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::None));
    }

    /// `LlamaSplitMode::Row` round-trips through the raw parameters.
    #[test]
    fn with_split_mode_row() {
        let configured = LlamaModelParams::default().with_split_mode(LlamaSplitMode::Row);

        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::Row));
    }

    /// `vocab_only` can be switched on.
    #[test]
    fn with_vocab_only_enables() {
        let configured = LlamaModelParams::default().with_vocab_only(true);

        assert!(configured.vocab_only());
    }

    /// `vocab_only` can be switched off.
    #[test]
    fn with_vocab_only_disables() {
        let configured = LlamaModelParams::default().with_vocab_only(false);

        assert!(!configured.vocab_only());
    }

    /// `use_mmap` can be switched on.
    #[test]
    fn with_use_mmap_enables() {
        let configured = LlamaModelParams::default().with_use_mmap(true);

        assert!(configured.use_mmap());
    }

    /// `use_mmap` can be switched off.
    #[test]
    fn with_use_mmap_disables() {
        let configured = LlamaModelParams::default().with_use_mmap(false);

        assert!(!configured.use_mmap());
    }

    /// `no_alloc` can be switched on.
    #[test]
    fn with_no_alloc_enables() {
        let configured = LlamaModelParams::default().with_no_alloc(true);

        assert!(configured.no_alloc());
    }

    /// `no_alloc` can be switched off.
    #[test]
    fn with_no_alloc_disables() {
        let configured = LlamaModelParams::default().with_no_alloc(false);

        assert!(!configured.no_alloc());
    }

    /// Enabling `no_alloc` overrides an earlier request for memory mapping.
    #[test]
    fn with_no_alloc_true_disables_mmap() {
        let configured = LlamaModelParams::default()
            .with_use_mmap(true)
            .with_no_alloc(true);

        assert!(configured.no_alloc());
        assert!(!configured.use_mmap());
    }

    /// `no_alloc` is off unless explicitly requested.
    #[test]
    fn default_no_alloc_is_false() {
        let defaults = LlamaModelParams::default();

        assert!(!defaults.no_alloc());
    }

    /// `use_mlock` can be switched on.
    #[test]
    fn with_use_mlock_enables() {
        let configured = LlamaModelParams::default().with_use_mlock(true);

        assert!(configured.use_mlock());
    }

    /// `use_mlock` can be switched off.
    #[test]
    fn with_use_mlock_disables() {
        let configured = LlamaModelParams::default().with_use_mlock(false);

        assert!(!configured.use_mlock());
    }

    /// The `Debug` output names each of the printed scalar fields.
    #[test]
    fn debug_format_contains_field_names() {
        let defaults = LlamaModelParams::default();
        let rendered = format!("{defaults:?}");

        assert!(rendered.contains("n_gpu_layers"));
        assert!(rendered.contains("main_gpu"));
        assert!(rendered.contains("vocab_only"));
        assert!(rendered.contains("use_mmap"));
        assert!(rendered.contains("use_mlock"));
        assert!(rendered.contains("split_mode"));
    }

    /// Chained builder calls do not overwrite each other's settings.
    #[test]
    fn builder_chaining_preserves_all_values() {
        let configured = LlamaModelParams::default()
            .with_n_gpu_layers(10)
            .with_main_gpu(1)
            .with_split_mode(LlamaSplitMode::Row)
            .with_vocab_only(true)
            .with_use_mlock(true);

        assert_eq!(configured.n_gpu_layers(), 10);
        assert_eq!(configured.main_gpu(), 1);
        assert_eq!(configured.split_mode(), Ok(LlamaSplitMode::Row));
        assert!(configured.vocab_only());
        assert!(configured.use_mlock());
    }

    /// An empty device list is accepted and reads back as empty.
    #[test]
    fn with_devices_empty_list_succeeds() {
        let outcome = LlamaModelParams::default().with_devices(&[]);

        assert!(outcome.is_ok());
        assert!(outcome.unwrap().devices().is_empty());
    }

    /// An index beyond the registered devices is reported with that index.
    #[test]
    fn with_devices_invalid_index_returns_error() {
        let outcome = LlamaModelParams::default().with_devices(&[999_999]);

        assert!(matches!(
            outcome.unwrap_err(),
            crate::LlamaCppError::BackendDeviceNotFound(999_999)
        ));
    }

    /// The first buffer-type override on fresh parameters is accepted.
    #[test]
    fn add_cpu_buft_override_succeeds() {
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let outcome = pinned.as_mut().add_cpu_buft_override(c"test_pattern");

        assert!(outcome.is_ok());
    }

    /// A second buffer-type override is rejected because the slot is taken.
    #[test]
    fn add_cpu_buft_override_twice_fails_with_slot_not_empty() {
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        pinned
            .as_mut()
            .add_cpu_buft_override(c"first_pattern")
            .unwrap();
        let second_outcome = pinned.as_mut().add_cpu_buft_override(c"second_pattern");

        assert_eq!(
            second_outcome.unwrap_err(),
            crate::error::ModelParamsError::SlotNotEmpty
        );
    }

    /// The predefined MoE override is accepted on fresh parameters.
    #[test]
    fn add_cpu_moe_override_succeeds() {
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let outcome = pinned.as_mut().add_cpu_moe_override();

        assert!(outcome.is_ok());
    }

    /// A second key/value override is rejected because the slot is taken.
    #[test]
    fn append_kv_override_twice_fails_with_slot_not_empty() {
        use crate::model::params::param_override_value::ParamOverrideValue;
        use std::ffi::CString;

        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let first_key = CString::new("first_key").unwrap();
        pinned
            .as_mut()
            .append_kv_override(&first_key, ParamOverrideValue::Int(1))
            .unwrap();

        let second_key = CString::new("second_key").unwrap();
        let second_outcome = pinned
            .as_mut()
            .append_kv_override(&second_key, ParamOverrideValue::Int(2));

        assert_eq!(
            second_outcome.unwrap_err(),
            crate::error::ModelParamsError::SlotNotEmpty
        );
    }

    /// More device indices than can be stored are rejected up front.
    #[test]
    fn with_devices_too_many_returns_max_exceeded() {
        let oversized: Vec<usize> = (0..17).collect();
        let outcome = LlamaModelParams::default().with_devices(&oversized);

        assert!(
            outcome
                .unwrap_err()
                .to_string()
                .contains("Max devices exceeded")
        );
    }

    /// Selecting backend device 0 reads back as index 0.
    #[test]
    fn with_devices_sets_devices_when_available() {
        #[cfg(feature = "dynamic-backends")]
        crate::load_backends::load_backends().unwrap();

        let available = unsafe { llama_cpp_bindings_sys::ggml_backend_dev_count() };
        assert!(available > 0, "Test requires at least one backend device");

        let configured = LlamaModelParams::default().with_devices(&[0]).unwrap();

        assert_eq!(configured.devices().len(), 1);
        assert_eq!(configured.devices()[0], 0);
    }

    /// The largest possible index is reported as a missing backend device.
    #[test]
    fn with_devices_invalid_index_returns_not_found() {
        let out_of_range = usize::MAX;
        let outcome = LlamaModelParams::default().with_devices(&[out_of_range]);

        assert!(outcome.unwrap_err().to_string().contains("Backend device"));
    }

    /// A key byte above 0x7f is rejected and named in the error.
    #[test]
    #[cfg(not(target_os = "windows"))]
    fn append_kv_override_with_high_byte_returns_invalid_character_error() {
        use crate::model::params::param_override_value::ParamOverrideValue;

        let raw_key: &[u8] = b"\xff\0";
        let high_byte_key = std::ffi::CStr::from_bytes_with_nul(raw_key).unwrap();
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let outcome = pinned
            .as_mut()
            .append_kv_override(high_byte_key, ParamOverrideValue::Int(1));

        assert!(matches!(
            outcome,
            Err(crate::error::ModelParamsError::InvalidCharacterInKey { byte: 0xff, .. })
        ));
    }

    /// A pattern byte above 0x7f is rejected and named in the error.
    #[test]
    #[cfg(not(target_os = "windows"))]
    fn add_cpu_buft_override_with_high_byte_returns_invalid_character_error() {
        let raw_key: &[u8] = b"\xff\0";
        let high_byte_key = std::ffi::CStr::from_bytes_with_nul(raw_key).unwrap();
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let outcome = pinned.as_mut().add_cpu_buft_override(high_byte_key);

        assert!(matches!(
            outcome,
            Err(crate::error::ModelParamsError::InvalidCharacterInKey { byte: 0xff, .. })
        ));
    }

    /// Fitting against a model file that does not exist yields an error.
    #[test]
    #[serial_test::serial]
    fn fit_params_invalid_model_path_returns_error() {
        use crate::context::params::LlamaContextParams;
        use crate::error::FitError;
        use crate::llama_backend::LlamaBackend;

        let _backend = LlamaBackend::init();
        let mut pinned = std::pin::pin!(LlamaModelParams::default());
        let mut context_params = LlamaContextParams::default();
        let mut margins = vec![0usize; crate::max_devices()];

        let missing_model = c"/nonexistent/path/to/model.gguf";
        let result = pinned.as_mut().fit_params(
            missing_model,
            &mut context_params,
            &mut margins,
            512,
            llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
        );

        assert!(
            matches!(result, Err(FitError::Aborted | FitError::Reported { .. })),
            "expected Aborted or Reported, got {result:?}"
        );
    }
}