1use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
33pub mod common;
34pub mod context;
35#[cfg(feature = "ggml")]
36pub mod ggml;
37pub mod llama_backend;
38pub mod llama_batch;
39pub mod model;
40pub mod quantize;
41pub mod sampling;
42pub mod token;
43pub mod token_type;
44
45#[cfg(feature = "rpc")]
46pub mod rpc;
47
48#[cfg(feature = "mtmd")]
49pub mod mtmd;
50
/// Convenience alias: a [`std::result::Result`] whose error type is the
/// crate-wide [`LLamaCppError`].
pub type Result<T> = std::result::Result<T, LLamaCppError>;
53
54#[derive(Debug, Eq, PartialEq, thiserror::Error)]
56pub enum LLamaCppError {
57 #[error("BackendAlreadyInitialized")]
60 BackendAlreadyInitialized,
61 #[error("{0}")]
63 ChatTemplateError(#[from] ChatTemplateError),
64 #[error("{0}")]
66 DecodeError(#[from] DecodeError),
67 #[error("{0}")]
69 EncodeError(#[from] EncodeError),
70 #[error("{0}")]
72 LlamaModelLoadError(#[from] LlamaModelLoadError),
73 #[error("{0}")]
75 LlamaContextLoadError(#[from] LlamaContextLoadError),
76 #[error["{0}"]]
78 BatchAddError(#[from] BatchAddError),
79 #[error(transparent)]
81 EmbeddingError(#[from] EmbeddingsError),
82}
83
/// Errors that can occur while fetching a chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The caller-provided buffer was too small; the payload is the size
    /// that would be just large enough.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The model has no template metadata; the payload is the code returned
    /// by llama.cpp.
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    /// The template bytes were not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
97
/// Errors that can occur while reading a string value out of a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// llama.cpp reported a failure; the payload is the raw error code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The bytes returned by llama.cpp were not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
108
/// Errors that can occur while creating a llama context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer instead of a context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
116
/// Errors mapped from the nonzero status code returned by decoding
/// (see the `From<NonZeroI32>` impl below).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// Status 1: no free slot in the KV cache.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// Status -1: the batch contained zero tokens.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// Any other nonzero status code (meaning unknown to this crate).
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
130
/// Errors mapped from the nonzero status code returned by encoding
/// (see the `From<NonZeroI32>` impl below). Mirrors [`DecodeError`].
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// Status 1: no free slot in the KV cache.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// Status -1: the batch contained zero tokens.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// Any other nonzero status code (meaning unknown to this crate).
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
144
/// Errors that can occur while fetching embeddings from a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings were not enabled in the context options.
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits were not enabled for the requested token.
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings are unavailable for models that only support
    /// `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
158
159impl From<NonZeroI32> for DecodeError {
161 fn from(value: NonZeroI32) -> Self {
162 match value.get() {
163 1 => DecodeError::NoKvCacheSlot,
164 -1 => DecodeError::NTokensZero,
165 i => DecodeError::Unknown(i),
166 }
167 }
168}
169
170impl From<NonZeroI32> for EncodeError {
172 fn from(value: NonZeroI32) -> Self {
173 match value.get() {
174 1 => EncodeError::NoKvCacheSlot,
175 -1 => EncodeError::NTokensZero,
176 i => EncodeError::Unknown(i),
177 }
178 }
179}
180
/// Errors that can occur while loading a model from disk.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// The path contained an interior null byte and could not become a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer instead of a model.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be represented as UTF-8 (`&str`).
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
194
/// Errors that can occur while initializing a LoRA adapter.
/// Mirrors [`LlamaModelLoadError`] variant-for-variant.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// The path contained an interior null byte and could not become a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer instead of an adapter.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be represented as UTF-8 (`&str`).
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
208
/// Errors that can occur while applying a LoRA adapter to a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp reported a failure; the payload is the raw error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
216
/// Errors that can occur while removing a LoRA adapter from a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp reported a failure; the payload is the raw error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
224
/// Current time from llama.cpp's `llama_time_us` — presumably a microsecond
/// timestamp, per the function name; confirm against `llama.h`.
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
235
/// Maximum number of devices supported by the llama.cpp build
/// (`llama_max_devices`).
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
246
/// Whether the llama.cpp build supports memory-mapped model loading
/// (`llama_supports_mmap`).
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
259
/// Whether the llama.cpp build supports `mlock` (`llama_supports_mlock`).
///
/// NOTE(review): [`llama_supports_mlock`] below wraps the same FFI call.
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
272
273#[derive(Debug, thiserror::Error, Clone)]
275#[non_exhaustive]
276pub enum TokenToStringError {
277 #[error("Unknown Token Type")]
279 UnknownTokenType,
280 #[error("Insufficient Buffer Space {0}")]
282 InsufficientBufferSpace(c_int),
283 #[error("FromUtf8Error {0}")]
285 FromUtf8Error(#[from] FromUtf8Error),
286}
287
/// Errors that can occur while converting a string to tokens.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained an interior null byte and could not become a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// A length did not fit in the C integer type expected by the FFI.
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}
298
/// Errors that can occur while constructing a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// The message contained an interior null byte and could not become a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
306
/// Errors that can occur while applying a chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// The internal buffer was too small. This is considered a crate bug,
    /// hence the "contact a maintainer" wording.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// A string contained an interior null byte and could not become a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// The rendered template bytes were not valid UTF-8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
320
/// Current time from ggml's `ggml_time_us` — presumably a microsecond
/// timestamp, per the function name; confirm against the ggml headers.
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}
340
/// Whether the llama.cpp build supports `mlock` (`llama_supports_mlock`).
///
/// NOTE(review): duplicates [`mlock_supported`] above (same FFI call);
/// consider deprecating one of the two in a future release.
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
356
/// Whether the llama.cpp build supports GPU offload
/// (`llama_supports_gpu_offload`).
#[must_use]
pub fn supports_gpu_offload() -> bool {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}
364
/// Whether the llama.cpp build supports RPC (`llama_supports_rpc`).
#[must_use]
pub fn supports_rpc() -> bool {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}
372
/// System-info string from llama.cpp (`llama_print_system_info`).
///
/// # Panics
/// Panics if the bytes returned by llama.cpp are not valid UTF-8.
#[must_use]
pub fn print_system_info() -> String {
    // SAFETY: FFI call with no arguments.
    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
    // SAFETY: assumes llama.cpp returns a valid nul-terminated string that
    // outlives this call — TODO confirm against llama.h.
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("system info is not valid UTF-8").to_owned()
}
386
/// Maximum number of parallel sequences supported by the llama.cpp build
/// (`llama_max_parallel_sequences`).
#[must_use]
pub fn max_parallel_sequences() -> usize {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}
392
/// Maximum number of tensor buffer-type overrides supported by the llama.cpp
/// build (`llama_max_tensor_buft_overrides`).
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    // SAFETY: FFI call with no arguments; no preconditions to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}
398
/// Human-readable name for a flash-attention type code
/// (`llama_flash_attn_type_name`).
///
/// # Panics
/// Panics if the bytes returned by llama.cpp are not valid UTF-8.
#[must_use]
pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
    // SAFETY: forwards the raw code to llama.cpp; presumably any i32 is
    // accepted and unknown codes get a fallback name — TODO confirm.
    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
    // SAFETY: assumes llama.cpp returns a valid nul-terminated string that
    // outlives this call — TODO confirm against llama.h.
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("flash_attn_type_name is not valid UTF-8").to_owned()
}
410
/// String form of a model metadata key (`llama_model_meta_key_str`).
///
/// # Panics
/// Panics if `key` does not fit the FFI parameter type (the
/// `try_into().unwrap()` below), or if the returned bytes are not valid UTF-8.
#[must_use]
pub fn model_meta_key_str(key: u32) -> String {
    // SAFETY: forwards the converted key to llama.cpp; presumably out-of-range
    // keys yield a fallback string rather than UB — TODO confirm.
    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
    // SAFETY: assumes llama.cpp returns a valid nul-terminated string that
    // outlives this call — TODO confirm against llama.h.
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("meta_key_str is not valid UTF-8").to_owned()
}
422
/// Quantize the model file at `fname_inp`, writing the result to `fname_out`.
///
/// # Errors
/// Returns `Err(rc)` carrying the nonzero status code from
/// `llama_model_quantize` on failure.
///
/// # Panics
/// Panics if either path contains an interior null byte.
pub fn model_quantize(
    fname_inp: &str,
    fname_out: &str,
    params: &quantize::QuantizeParams,
) -> std::result::Result<(), u32> {
    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
    // NOTE(review): `guard` presumably owns storage that `guard.raw` points
    // into, so it must stay alive across the FFI call — confirm in
    // `quantize::QuantizeParams::to_raw`.
    let guard = params.to_raw();
    // SAFETY: the CStrings and `guard` live until after the call returns.
    let rc =
        unsafe { llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &guard.raw) };
    if rc == 0 { Ok(()) } else { Err(rc) }
}
455
/// Raw default quantization parameters from llama.cpp
/// (`llama_model_quantize_default_params`).
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    // SAFETY: FFI call with no arguments returning a plain struct by value.
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}
464
/// Install a log callback in llama.cpp (`llama_log_set`).
///
/// # Safety
/// Forwards both arguments unchecked. The caller must uphold whatever
/// contract llama.cpp documents for `llama_log_set` — in particular,
/// `user_data` presumably must remain valid for as long as the callback may
/// fire, and the callback must be safe to invoke from llama.cpp's threads
/// — confirm against llama.h.
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}
477
/// Read back the currently installed log callback (`llama_log_get`).
///
/// # Safety
/// Forwards both pointers unchecked. The caller must ensure `log_callback`
/// and `user_data` are valid, writable pointers for llama.cpp to store the
/// current callback and its user data into.
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}
489
/// Initialize llama.cpp's training/optimization state (`llama_opt_init`).
///
/// # Safety
/// Forwards the raw pointers unchecked. The caller must ensure `ctx` and
/// `model` are valid, live objects obtained from llama.cpp, and that
/// `params` satisfies whatever contract `llama_opt_init` documents.
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}
502
/// Run one optimization epoch (`llama_opt_epoch`).
///
/// # Safety
/// Forwards all arguments unchecked. The caller must ensure `ctx`,
/// `dataset`, `result_train`, and `result_eval` are valid, live objects
/// obtained from llama.cpp, and that the callbacks (which may be null per
/// the ggml callback type) satisfy `llama_opt_epoch`'s contract — confirm
/// against llama.h. The meaning of `idata_split` is defined by llama.cpp
/// (presumably the dataset index where train data ends and eval begins —
/// TODO confirm).
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}
528
/// Parameter filter that accepts every tensor
/// (`llama_opt_param_filter_all`); usable as the filter callback for
/// optimization parameter selection.
///
/// # Safety
/// Forwards the pointers unchecked. The caller must ensure `tensor` is
/// valid (or satisfies whatever nullability contract llama.cpp documents);
/// `userdata` is presumably ignored by the "all" filter — TODO confirm.
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}
540
/// Ask llama.cpp to adjust model/context parameters so the model fits the
/// available resources (`llama_params_fit`), returning its status enum.
///
/// # Safety
/// Forwards all pointers unchecked. The caller must ensure `path_model` is a
/// valid nul-terminated C string and that `mparams`, `cparams`,
/// `tensor_split`, `tensor_buft_overrides`, and `margins` point to storage
/// matching whatever sizes/contracts `llama_params_fit` documents (the
/// required lengths of the array arguments are not visible here — confirm
/// against llama.h).
#[allow(clippy::too_many_arguments)]
pub unsafe fn params_fit(
    path_model: *const std::ffi::c_char,
    mparams: *mut llama_cpp_sys_4::llama_model_params,
    cparams: *mut llama_cpp_sys_4::llama_context_params,
    tensor_split: *mut f32,
    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
    margins: *mut usize,
    n_ctx_min: u32,
    log_level: llama_cpp_sys_4::ggml_log_level,
) -> llama_cpp_sys_4::llama_params_fit_status {
    llama_cpp_sys_4::llama_params_fit(
        path_model,
        mparams,
        cparams,
        tensor_split,
        tensor_buft_overrides,
        margins,
        n_ctx_min,
        log_level,
    )
}