1use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
33pub mod common;
34pub mod context;
35#[cfg(feature = "ggml")]
36pub mod ggml;
37pub mod llama_backend;
38pub mod llama_batch;
39pub mod model;
40pub mod mtp;
41pub mod quantize;
42pub mod sampling;
43pub mod token;
44pub mod token_type;
45
46#[cfg(feature = "rpc")]
47pub mod rpc;
48
49#[cfg(feature = "mtmd")]
50pub mod mtmd;
51
52pub type Result<T> = std::result::Result<T, LLamaCppError>;
54
55#[derive(Debug, Eq, PartialEq, thiserror::Error)]
57pub enum LLamaCppError {
58 #[error("BackendAlreadyInitialized")]
61 BackendAlreadyInitialized,
62 #[error("{0}")]
64 ChatTemplateError(#[from] ChatTemplateError),
65 #[error("{0}")]
67 DecodeError(#[from] DecodeError),
68 #[error("{0}")]
70 EncodeError(#[from] EncodeError),
71 #[error("{0}")]
73 LlamaModelLoadError(#[from] LlamaModelLoadError),
74 #[error("{0}")]
76 LlamaContextLoadError(#[from] LlamaContextLoadError),
77 #[error["{0}"]]
79 BatchAddError(#[from] BatchAddError),
80 #[error(transparent)]
82 EmbeddingError(#[from] EmbeddingsError),
83}
84
85#[derive(Debug, Eq, PartialEq, thiserror::Error)]
87pub enum ChatTemplateError {
88 #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
90 BuffSizeError(usize),
91 #[error("the model has no meta val - returned code {0}")]
93 MissingTemplate(i32),
94 #[error(transparent)]
96 Utf8Error(#[from] std::str::Utf8Error),
97}
98
99#[derive(Debug, Eq, PartialEq, thiserror::Error)]
101pub enum StringFromModelError {
102 #[error("llama.cpp returned error code {0}")]
104 ReturnedError(i32),
105 #[error(transparent)]
107 Utf8Error(#[from] std::str::Utf8Error),
108}
109
110#[derive(Debug, Eq, PartialEq, thiserror::Error)]
112pub enum LlamaContextLoadError {
113 #[error("null reference from llama.cpp")]
115 NullReturn,
116}
117
118#[derive(Debug, Eq, PartialEq, thiserror::Error)]
120pub enum DecodeError {
121 #[error("Decode Error 1: NoKvCacheSlot")]
123 NoKvCacheSlot,
124 #[error("Decode Error -1: n_tokens == 0")]
126 NTokensZero,
127 #[error("Decode Error {0}: unknown")]
129 Unknown(c_int),
130}
131
132#[derive(Debug, Eq, PartialEq, thiserror::Error)]
134pub enum EncodeError {
135 #[error("Encode Error 1: NoKvCacheSlot")]
137 NoKvCacheSlot,
138 #[error("Encode Error -1: n_tokens == 0")]
140 NTokensZero,
141 #[error("Encode Error {0}: unknown")]
143 Unknown(c_int),
144}
145
146#[derive(Debug, Eq, PartialEq, thiserror::Error)]
148pub enum EmbeddingsError {
149 #[error("Embeddings weren't enabled in the context options")]
151 NotEnabled,
152 #[error("Logits were not enabled for the given token")]
154 LogitsNotEnabled,
155 #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
157 NonePoolType,
158}
159
160impl From<NonZeroI32> for DecodeError {
162 fn from(value: NonZeroI32) -> Self {
163 match value.get() {
164 1 => DecodeError::NoKvCacheSlot,
165 -1 => DecodeError::NTokensZero,
166 i => DecodeError::Unknown(i),
167 }
168 }
169}
170
171impl From<NonZeroI32> for EncodeError {
173 fn from(value: NonZeroI32) -> Self {
174 match value.get() {
175 1 => EncodeError::NoKvCacheSlot,
176 -1 => EncodeError::NTokensZero,
177 i => EncodeError::Unknown(i),
178 }
179 }
180}
181
182#[derive(Debug, Eq, PartialEq, thiserror::Error)]
184pub enum LlamaModelLoadError {
185 #[error("null byte in string {0}")]
187 NullError(#[from] NulError),
188 #[error("null result from llama cpp")]
190 NullResult,
191 #[error("failed to convert path {0} to str")]
193 PathToStrError(PathBuf),
194}
195
196#[derive(Debug, Eq, PartialEq, thiserror::Error)]
198pub enum LlamaLoraAdapterInitError {
199 #[error("null byte in string {0}")]
201 NullError(#[from] NulError),
202 #[error("null result from llama cpp")]
204 NullResult,
205 #[error("failed to convert path {0} to str")]
207 PathToStrError(PathBuf),
208}
209
210#[derive(Debug, Eq, PartialEq, thiserror::Error)]
212pub enum LlamaLoraAdapterSetError {
213 #[error("error code from llama cpp")]
215 ErrorResult(i32),
216}
217
218#[derive(Debug, Eq, PartialEq, thiserror::Error)]
220pub enum LlamaLoraAdapterRemoveError {
221 #[error("error code from llama cpp")]
223 ErrorResult(i32),
224}
225
226#[must_use]
233pub fn llama_time_us() -> i64 {
234 unsafe { llama_cpp_sys_4::llama_time_us() }
235}
236
237#[must_use]
244pub fn max_devices() -> usize {
245 unsafe { llama_cpp_sys_4::llama_max_devices() }
246}
247
248#[must_use]
257pub fn mmap_supported() -> bool {
258 unsafe { llama_cpp_sys_4::llama_supports_mmap() }
259}
260
261#[must_use]
270pub fn mlock_supported() -> bool {
271 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
272}
273
274#[derive(Debug, thiserror::Error, Clone)]
276#[non_exhaustive]
277pub enum TokenToStringError {
278 #[error("Unknown Token Type")]
280 UnknownTokenType,
281 #[error("Insufficient Buffer Space {0}")]
283 InsufficientBufferSpace(c_int),
284 #[error("FromUtf8Error {0}")]
286 FromUtf8Error(#[from] FromUtf8Error),
287}
288
289#[derive(Debug, thiserror::Error)]
291pub enum StringToTokenError {
292 #[error("{0}")]
294 NulError(#[from] NulError),
295 #[error("{0}")]
296 CIntConversionError(#[from] std::num::TryFromIntError),
298}
299
300#[derive(Debug, thiserror::Error)]
302pub enum NewLlamaChatMessageError {
303 #[error("{0}")]
305 NulError(#[from] NulError),
306}
307
308#[derive(Debug, thiserror::Error)]
310pub enum ApplyChatTemplateError {
311 #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
313 BuffSizeError,
314 #[error("{0}")]
316 NulError(#[from] NulError),
317 #[error("{0}")]
319 FromUtf8Error(#[from] FromUtf8Error),
320}
321
322#[must_use]
338pub fn ggml_time_us() -> i64 {
339 unsafe { llama_cpp_sys_4::ggml_time_us() }
340}
341
342#[must_use]
354pub fn llama_supports_mlock() -> bool {
355 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
356}
357
358#[must_use]
362pub fn supports_gpu_offload() -> bool {
363 unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
364}
365
366#[must_use]
370pub fn supports_rpc() -> bool {
371 unsafe { llama_cpp_sys_4::llama_supports_rpc() }
372}
373
374#[must_use]
382pub fn print_system_info() -> String {
383 let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
384 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
385 c_str
386 .to_str()
387 .expect("system info is not valid UTF-8")
388 .to_owned()
389}
390
391#[must_use]
393pub fn max_parallel_sequences() -> usize {
394 unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
395}
396
397#[must_use]
399pub fn max_tensor_buft_overrides() -> usize {
400 unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
401}
402
403#[must_use]
409pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
410 let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
411 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
412 c_str
413 .to_str()
414 .expect("flash_attn_type_name is not valid UTF-8")
415 .to_owned()
416}
417
418#[must_use]
424pub fn model_meta_key_str(key: u32) -> String {
425 let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
426 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
427 c_str
428 .to_str()
429 .expect("meta_key_str is not valid UTF-8")
430 .to_owned()
431}
432
433pub fn model_quantize(
454 fname_inp: &str,
455 fname_out: &str,
456 params: &quantize::QuantizeParams,
457) -> std::result::Result<(), u32> {
458 let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
459 let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
460 let guard = params.to_raw();
461 let rc = unsafe {
462 llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
463 };
464 if rc == 0 {
465 Ok(())
466 } else {
467 Err(rc)
468 }
469}
470
471#[must_use]
475#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
476pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
477 unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
478}
479
480pub unsafe fn log_set(
487 callback: llama_cpp_sys_4::ggml_log_callback,
488 user_data: *mut std::ffi::c_void,
489) {
490 llama_cpp_sys_4::llama_log_set(callback, user_data);
491}
492
493pub unsafe fn log_get(
499 log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
500 user_data: *mut *mut std::ffi::c_void,
501) {
502 llama_cpp_sys_4::llama_log_get(log_callback, user_data);
503}
504
505pub unsafe fn opt_init(
511 ctx: *mut llama_cpp_sys_4::llama_context,
512 model: *mut llama_cpp_sys_4::llama_model,
513 params: llama_cpp_sys_4::llama_opt_params,
514) {
515 llama_cpp_sys_4::llama_opt_init(ctx, model, params);
516}
517
518#[allow(clippy::too_many_arguments)]
524pub unsafe fn opt_epoch(
525 ctx: *mut llama_cpp_sys_4::llama_context,
526 dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
527 result_train: llama_cpp_sys_4::ggml_opt_result_t,
528 result_eval: llama_cpp_sys_4::ggml_opt_result_t,
529 idata_split: i64,
530 callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
531 callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
532) {
533 llama_cpp_sys_4::llama_opt_epoch(
534 ctx,
535 dataset,
536 result_train,
537 result_eval,
538 idata_split,
539 callback_train,
540 callback_eval,
541 );
542}
543
544pub unsafe fn opt_param_filter_all(
550 tensor: *const llama_cpp_sys_4::ggml_tensor,
551 userdata: *mut std::ffi::c_void,
552) -> bool {
553 llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
554}
555
556#[allow(clippy::too_many_arguments)]
562pub unsafe fn params_fit(
563 path_model: *const std::ffi::c_char,
564 mparams: *mut llama_cpp_sys_4::llama_model_params,
565 cparams: *mut llama_cpp_sys_4::llama_context_params,
566 tensor_split: *mut f32,
567 tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
568 margins: *mut usize,
569 n_ctx_min: u32,
570 log_level: llama_cpp_sys_4::ggml_log_level,
571) -> llama_cpp_sys_4::common_params_fit_status {
572 llama_cpp_sys_4::common_fit_params(
573 path_model,
574 mparams,
575 cparams,
576 tensor_split,
577 tensor_buft_overrides,
578 margins,
579 n_ctx_min,
580 log_level,
581 )
582}