use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod common;
pub mod context;
#[cfg(feature = "ggml")]
pub mod ggml;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod sampling;
pub mod token;
pub mod token_type;

#[cfg(feature = "rpc")]
pub mod rpc;

#[cfg(feature = "mtmd")]
pub mod mtmd;

pub type Result<T> = std::result::Result<T, LLamaCppError>;

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    #[error("null reference from llama.cpp")]
    NullReturn,
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    #[error("null result from llama cpp")]
    NullResult,
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    #[error("null result from llama cpp")]
    NullResult,
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

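/// Get the current time in microseconds, as reported by llama.cpp's `llama_time_us`.
///
/// A minimal usage sketch for measuring elapsed time (the crate path `llama_cpp_4`
/// is assumed here and may differ):
///
/// ```ignore
/// let start = llama_cpp_4::llama_time_us();
/// // ... do some work ...
/// let elapsed_us = llama_cpp_4::llama_time_us() - start;
/// println!("took {elapsed_us} microseconds");
/// ```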
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::llama_time_us() }
}

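/// Get the maximum number of devices supported by llama.cpp (`llama_max_devices`).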
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}

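/// Returns `true` if memory mapping (mmap) is supported by this llama.cpp build.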
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}

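/// Returns `true` if memory locking (mlock) is supported by this llama.cpp build.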
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}

#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    #[error("Unknown Token Type")]
    UnknownTokenType,
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    #[error("{0}")]
    NulError(#[from] NulError),
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    #[error("{0}")]
    NulError(#[from] NulError),
}

#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    #[error("{0}")]
    NulError(#[from] NulError),
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

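/// Get the current time in microseconds from ggml (`ggml_time_us`).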
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}

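/// Returns `true` if mlock is supported; this calls the same underlying
/// `llama_supports_mlock` as [`mlock_supported`].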
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}

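/// Returns `true` if this llama.cpp build can offload layers to a GPU backend.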
#[must_use]
pub fn supports_gpu_offload() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}

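/// Returns `true` if the RPC backend is available in this llama.cpp build.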
#[must_use]
pub fn supports_rpc() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}

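/// Get a string describing the system and the backend features llama.cpp was built
/// with, as reported by `llama_print_system_info`.
///
/// # Panics
///
/// Panics if llama.cpp returns a string that is not valid UTF-8.
///
/// A usage sketch (the crate path `llama_cpp_4` is assumed and may differ):
///
/// ```ignore
/// println!("{}", llama_cpp_4::print_system_info());
/// ```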
#[must_use]
pub fn print_system_info() -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("system info is not valid UTF-8").to_owned()
}

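/// Get the maximum number of parallel sequences supported
/// (`llama_max_parallel_sequences`).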
#[must_use]
pub fn max_parallel_sequences() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}

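/// Get the maximum number of tensor buffer-type overrides supported
/// (`llama_max_tensor_buft_overrides`).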
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}

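/// Get the human-readable name for a flash-attention type value, as returned by
/// `llama_flash_attn_type_name`.
///
/// # Panics
///
/// Panics if llama.cpp returns a name that is not valid UTF-8.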
#[must_use]
pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("flash_attn_type_name is not valid UTF-8").to_owned()
}

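/// Get the string form of a model metadata key id, as returned by
/// `llama_model_meta_key_str`.
///
/// # Panics
///
/// Panics if llama.cpp returns a key that is not valid UTF-8.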
#[must_use]
pub fn model_meta_key_str(key: u32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str.to_str().expect("meta_key_str is not valid UTF-8").to_owned()
}

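/// Quantize a model file on disk via `llama_model_quantize`, using the default
/// quantization parameters when `params` is `None`, and returning the status code
/// reported by llama.cpp.
///
/// # Panics
///
/// Panics if either path contains an interior null byte.
///
/// A usage sketch with hypothetical file names (the crate path `llama_cpp_4` is
/// assumed and may differ); a return value of `0` is expected to mean success:
///
/// ```ignore
/// let status = llama_cpp_4::model_quantize("model-f16.gguf", "model-q4_0.gguf", None);
/// assert_eq!(status, 0);
/// ```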
#[must_use]
pub fn model_quantize(
    fname_inp: &str,
    fname_out: &str,
    params: Option<&llama_cpp_sys_4::llama_model_quantize_params>,
) -> u32 {
    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
    let default_params = unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() };
    let params = params.unwrap_or(&default_params);
    unsafe { llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), params) }
}

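/// Get the default model quantization parameters
/// (`llama_model_quantize_default_params`).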
#[must_use]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}

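/// Set the llama.cpp log callback via `llama_log_set`.
///
/// # Safety
///
/// If `callback` is `Some`, it must point to a valid `extern "C"` function, and
/// `user_data` must stay valid for as long as llama.cpp may invoke the callback,
/// potentially from other threads.
///
/// A hedged sketch of installing a callback that discards all log output; the
/// callback signature follows the `ggml_log_callback` type from the sys crate:
///
/// ```ignore
/// unsafe extern "C" fn silent(
///     _level: llama_cpp_sys_4::ggml_log_level,
///     _text: *const std::ffi::c_char,
///     _user_data: *mut std::ffi::c_void,
/// ) {
/// }
///
/// unsafe { log_set(Some(silent), std::ptr::null_mut()) };
/// ```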
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}

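/// Read back the currently installed log callback and its user data via
/// `llama_log_get`.
///
/// # Safety
///
/// `log_callback` and `user_data` must be valid, writable pointers; llama.cpp writes
/// the current callback and user-data values through them.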
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}

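/// Initialize optimization (training) state for a context via `llama_opt_init`.
///
/// # Safety
///
/// `ctx` and `model` must be valid, non-null pointers obtained from llama.cpp, and
/// `params` must be a properly initialized `llama_opt_params` value.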
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}

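/// Run one optimization epoch over a dataset via `llama_opt_epoch`.
///
/// # Safety
///
/// `ctx` must be a valid context pointer, `dataset`, `result_train`, and
/// `result_eval` must be valid ggml-opt handles, and any callbacks provided must be
/// safe to call with the values llama.cpp passes to them.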
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}

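/// Parameter filter that accepts every tensor, forwarding to
/// `llama_opt_param_filter_all`.
///
/// # Safety
///
/// `tensor` must be a valid pointer to a `ggml_tensor`; `userdata` is passed through
/// to llama.cpp unchanged.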
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}

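/// Ask llama.cpp to fit model and context parameters to the available resources via
/// `llama_params_fit`; see the upstream documentation for the exact semantics.
///
/// # Safety
///
/// `path_model` must be a valid, null-terminated C string, and the out-pointers
/// (`mparams`, `cparams`, `tensor_split`, `tensor_buft_overrides`, `margins`) must be
/// valid and writable with the sizes llama.cpp expects for this call.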
#[allow(clippy::too_many_arguments)]
pub unsafe fn params_fit(
    path_model: *const std::ffi::c_char,
    mparams: *mut llama_cpp_sys_4::llama_model_params,
    cparams: *mut llama_cpp_sys_4::llama_context_params,
    tensor_split: *mut f32,
    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
    margins: *mut usize,
    n_ctx_min: u32,
    log_level: llama_cpp_sys_4::ggml_log_level,
) -> llama_cpp_sys_4::llama_params_fit_status {
    llama_cpp_sys_4::llama_params_fit(
        path_model,
        mparams,
        cparams,
        tensor_split,
        tensor_buft_overrides,
        margins,
        n_ctx_min,
        log_level,
    )
}