1use std::ffi::NulError;
27use std::fmt::Debug;
28use std::num::NonZeroI32;
29
30use crate::llama_batch::BatchAddError;
31use std::os::raw::c_int;
32use std::path::PathBuf;
33use std::string::FromUtf8Error;
34
35pub mod common;
36pub mod context;
37pub mod eagle;
38#[cfg(feature = "ggml")]
39pub mod ggml;
40pub mod llama_backend;
41pub mod llama_batch;
42pub mod model;
43pub mod mtp;
44pub mod quantize;
45pub mod sampling;
46pub mod token;
47pub mod token_type;
48
49#[cfg(feature = "rpc")]
50pub mod rpc;
51
52#[cfg(feature = "mtmd")]
53pub mod mtmd;
54
55pub type Result<T> = std::result::Result<T, LLamaCppError>;
57
58#[derive(Debug, Eq, PartialEq, thiserror::Error)]
60pub enum LLamaCppError {
61 #[error("BackendAlreadyInitialized")]
64 BackendAlreadyInitialized,
65 #[error("{0}")]
67 ChatTemplateError(#[from] ChatTemplateError),
68 #[error("{0}")]
70 DecodeError(#[from] DecodeError),
71 #[error("{0}")]
73 EncodeError(#[from] EncodeError),
74 #[error("{0}")]
76 LlamaModelLoadError(#[from] LlamaModelLoadError),
77 #[error("{0}")]
79 LlamaContextLoadError(#[from] LlamaContextLoadError),
80 #[error["{0}"]]
82 BatchAddError(#[from] BatchAddError),
83 #[error(transparent)]
85 EmbeddingError(#[from] EmbeddingsError),
86}
87
88#[derive(Debug, Eq, PartialEq, thiserror::Error)]
90pub enum ChatTemplateError {
91 #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
93 BuffSizeError(usize),
94 #[error("the model has no meta val - returned code {0}")]
96 MissingTemplate(i32),
97 #[error(transparent)]
99 Utf8Error(#[from] std::str::Utf8Error),
100}
101
102#[derive(Debug, Eq, PartialEq, thiserror::Error)]
104pub enum StringFromModelError {
105 #[error("llama.cpp returned error code {0}")]
107 ReturnedError(i32),
108 #[error(transparent)]
110 Utf8Error(#[from] std::str::Utf8Error),
111}
112
113#[derive(Debug, Eq, PartialEq, thiserror::Error)]
115pub enum LlamaContextLoadError {
116 #[error("null reference from llama.cpp")]
118 NullReturn,
119}
120
121#[derive(Debug, Eq, PartialEq, thiserror::Error)]
123pub enum DecodeError {
124 #[error("Decode Error 1: NoKvCacheSlot")]
126 NoKvCacheSlot,
127 #[error("Decode Error -1: n_tokens == 0")]
129 NTokensZero,
130 #[error("Decode Error {0}: unknown")]
132 Unknown(c_int),
133}
134
135#[derive(Debug, Eq, PartialEq, thiserror::Error)]
137pub enum EncodeError {
138 #[error("Encode Error 1: NoKvCacheSlot")]
140 NoKvCacheSlot,
141 #[error("Encode Error -1: n_tokens == 0")]
143 NTokensZero,
144 #[error("Encode Error {0}: unknown")]
146 Unknown(c_int),
147}
148
149#[derive(Debug, Eq, PartialEq, thiserror::Error)]
151pub enum EmbeddingsError {
152 #[error("Embeddings weren't enabled in the context options")]
154 NotEnabled,
155 #[error("Logits were not enabled for the given token")]
157 LogitsNotEnabled,
158 #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
160 NonePoolType,
161}
162
163impl From<NonZeroI32> for DecodeError {
165 fn from(value: NonZeroI32) -> Self {
166 match value.get() {
167 1 => DecodeError::NoKvCacheSlot,
168 -1 => DecodeError::NTokensZero,
169 i => DecodeError::Unknown(i),
170 }
171 }
172}
173
174impl From<NonZeroI32> for EncodeError {
176 fn from(value: NonZeroI32) -> Self {
177 match value.get() {
178 1 => EncodeError::NoKvCacheSlot,
179 -1 => EncodeError::NTokensZero,
180 i => EncodeError::Unknown(i),
181 }
182 }
183}
184
185#[derive(Debug, Eq, PartialEq, thiserror::Error)]
187pub enum LlamaModelLoadError {
188 #[error("null byte in string {0}")]
190 NullError(#[from] NulError),
191 #[error("null result from llama cpp")]
193 NullResult,
194 #[error("failed to convert path {0} to str")]
196 PathToStrError(PathBuf),
197}
198
199#[derive(Debug, Eq, PartialEq, thiserror::Error)]
201pub enum LlamaLoraAdapterInitError {
202 #[error("null byte in string {0}")]
204 NullError(#[from] NulError),
205 #[error("null result from llama cpp")]
207 NullResult,
208 #[error("failed to convert path {0} to str")]
210 PathToStrError(PathBuf),
211}
212
213#[derive(Debug, Eq, PartialEq, thiserror::Error)]
215pub enum LlamaLoraAdapterSetError {
216 #[error("error code from llama cpp")]
218 ErrorResult(i32),
219}
220
221#[derive(Debug, Eq, PartialEq, thiserror::Error)]
223pub enum LlamaLoraAdapterRemoveError {
224 #[error("error code from llama cpp")]
226 ErrorResult(i32),
227}
228
229#[must_use]
236pub fn llama_time_us() -> i64 {
237 unsafe { llama_cpp_sys_4::llama_time_us() }
238}
239
240#[must_use]
247pub fn max_devices() -> usize {
248 unsafe { llama_cpp_sys_4::llama_max_devices() }
249}
250
251#[must_use]
260pub fn mmap_supported() -> bool {
261 unsafe { llama_cpp_sys_4::llama_supports_mmap() }
262}
263
264#[must_use]
273pub fn mlock_supported() -> bool {
274 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
275}
276
277#[derive(Debug, thiserror::Error, Clone)]
279#[non_exhaustive]
280pub enum TokenToStringError {
281 #[error("Unknown Token Type")]
283 UnknownTokenType,
284 #[error("Insufficient Buffer Space {0}")]
286 InsufficientBufferSpace(c_int),
287 #[error("FromUtf8Error {0}")]
289 FromUtf8Error(#[from] FromUtf8Error),
290}
291
292#[derive(Debug, thiserror::Error)]
294pub enum StringToTokenError {
295 #[error("{0}")]
297 NulError(#[from] NulError),
298 #[error("{0}")]
299 CIntConversionError(#[from] std::num::TryFromIntError),
301}
302
303#[derive(Debug, thiserror::Error)]
305pub enum NewLlamaChatMessageError {
306 #[error("{0}")]
308 NulError(#[from] NulError),
309}
310
311#[derive(Debug, thiserror::Error)]
313pub enum ApplyChatTemplateError {
314 #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
316 BuffSizeError,
317 #[error("{0}")]
319 NulError(#[from] NulError),
320 #[error("{0}")]
322 FromUtf8Error(#[from] FromUtf8Error),
323}
324
325#[must_use]
341pub fn ggml_time_us() -> i64 {
342 unsafe { llama_cpp_sys_4::ggml_time_us() }
343}
344
345#[must_use]
357pub fn llama_supports_mlock() -> bool {
358 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
359}
360
361#[must_use]
365pub fn supports_gpu_offload() -> bool {
366 unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
367}
368
369#[must_use]
373pub fn supports_rpc() -> bool {
374 unsafe { llama_cpp_sys_4::llama_supports_rpc() }
375}
376
377#[must_use]
385pub fn print_system_info() -> String {
386 let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
387 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
388 c_str
389 .to_str()
390 .expect("system info is not valid UTF-8")
391 .to_owned()
392}
393
394#[must_use]
396pub fn max_parallel_sequences() -> usize {
397 unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
398}
399
400#[must_use]
402pub fn max_tensor_buft_overrides() -> usize {
403 unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
404}
405
406#[must_use]
412pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
413 let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
414 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
415 c_str
416 .to_str()
417 .expect("flash_attn_type_name is not valid UTF-8")
418 .to_owned()
419}
420
421#[must_use]
427pub fn model_meta_key_str(key: u32) -> String {
428 let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
429 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
430 c_str
431 .to_str()
432 .expect("meta_key_str is not valid UTF-8")
433 .to_owned()
434}
435
436pub fn model_quantize(
457 fname_inp: &str,
458 fname_out: &str,
459 params: &quantize::QuantizeParams,
460) -> std::result::Result<(), u32> {
461 let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
462 let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
463 let guard = params.to_raw();
464 let rc = unsafe {
465 llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
466 };
467 if rc == 0 {
468 Ok(())
469 } else {
470 Err(rc)
471 }
472}
473
474#[must_use]
478#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
479pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
480 unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
481}
482
483pub unsafe fn log_set(
490 callback: llama_cpp_sys_4::ggml_log_callback,
491 user_data: *mut std::ffi::c_void,
492) {
493 llama_cpp_sys_4::llama_log_set(callback, user_data);
494}
495
496pub unsafe fn log_get(
502 log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
503 user_data: *mut *mut std::ffi::c_void,
504) {
505 llama_cpp_sys_4::llama_log_get(log_callback, user_data);
506}
507
508pub unsafe fn opt_init(
514 ctx: *mut llama_cpp_sys_4::llama_context,
515 model: *mut llama_cpp_sys_4::llama_model,
516 params: llama_cpp_sys_4::llama_opt_params,
517) {
518 llama_cpp_sys_4::llama_opt_init(ctx, model, params);
519}
520
521#[allow(clippy::too_many_arguments)]
527pub unsafe fn opt_epoch(
528 ctx: *mut llama_cpp_sys_4::llama_context,
529 dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
530 result_train: llama_cpp_sys_4::ggml_opt_result_t,
531 result_eval: llama_cpp_sys_4::ggml_opt_result_t,
532 idata_split: i64,
533 callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
534 callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
535) {
536 llama_cpp_sys_4::llama_opt_epoch(
537 ctx,
538 dataset,
539 result_train,
540 result_eval,
541 idata_split,
542 callback_train,
543 callback_eval,
544 );
545}
546
547pub unsafe fn opt_param_filter_all(
553 tensor: *const llama_cpp_sys_4::ggml_tensor,
554 userdata: *mut std::ffi::c_void,
555) -> bool {
556 llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
557}
558
559#[allow(clippy::too_many_arguments)]
565pub unsafe fn params_fit(
566 path_model: *const std::ffi::c_char,
567 mparams: *mut llama_cpp_sys_4::llama_model_params,
568 cparams: *mut llama_cpp_sys_4::llama_context_params,
569 tensor_split: *mut f32,
570 tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
571 margins: *mut usize,
572 n_ctx_min: u32,
573 log_level: llama_cpp_sys_4::ggml_log_level,
574) -> llama_cpp_sys_4::common_params_fit_status {
575 llama_cpp_sys_4::common_fit_params(
576 path_model,
577 mparams,
578 cparams,
579 tensor_split,
580 tensor_buft_overrides,
581 margins,
582 n_ctx_min,
583 log_level,
584 )
585}