use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

use crate::llama_batch::BatchAddError;

pub mod common;
pub mod context;
#[cfg(feature = "ggml")]
pub mod ggml;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod quantize;
pub mod sampling;
pub mod token;
pub mod token_type;

#[cfg(feature = "rpc")]
pub mod rpc;

#[cfg(feature = "mtmd")]
pub mod mtmd;

/// Convenience alias for `Result` with [`LLamaCppError`] as the error type.
pub type Result<T> = std::result::Result<T, LLamaCppError>;

/// All errors that can occur in this crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The llama.cpp backend was already initialized.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// See [`ChatTemplateError`].
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// See [`DecodeError`].
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// See [`EncodeError`].
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// See [`LlamaModelLoadError`].
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// See [`LlamaContextLoadError`].
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// See [`BatchAddError`].
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`].
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
}
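
// A minimal sketch of how the `#[from]` conversions above are meant to be used:
// a specific error such as `DecodeError` bubbles up into `LLamaCppError` via `?`.
// The `fail_decode` helper below is purely illustrative and not part of the public API.
#[cfg(test)]
mod llama_cpp_error_conversion_sketch {
    use super::{DecodeError, LLamaCppError, Result};

    /// Hypothetical helper that fails with a specific decode error.
    fn fail_decode() -> std::result::Result<(), DecodeError> {
        Err(DecodeError::NoKvCacheSlot)
    }

    fn run() -> Result<()> {
        // `?` converts `DecodeError` into `LLamaCppError` through the derived `From` impl.
        fail_decode()?;
        Ok(())
    }

    #[test]
    fn decode_error_converts_into_llama_cpp_error() {
        assert_eq!(
            run(),
            Err(LLamaCppError::DecodeError(DecodeError::NoKvCacheSlot))
        );
    }
}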

/// Errors that can occur when fetching a chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The buffer was too small; a buffer of the given size would be large enough.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The model has no chat template metadata value; llama.cpp returned the given code.
    #[error("the model has no chat template metadata value - llama.cpp returned code {0}")]
    MissingTemplate(i32),
    /// The template was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Errors that can occur when reading a string out of a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// llama.cpp returned an error code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The string was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Errors that can occur when creating a context from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer.
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Errors that can occur when decoding a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No KV cache slot was available for the batch.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The batch contained zero tokens.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// llama.cpp returned an unrecognized error code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Errors that can occur when encoding a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No KV cache slot was available for the batch.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The batch contained zero tokens.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// llama.cpp returned an unrecognized error code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// Errors that can occur when fetching embeddings.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings were not enabled in the context options.
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits were not enabled for the given token.
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings are not available when the model only supports `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}
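
// A small sketch of the return-code mapping implemented above: llama.cpp's
// non-zero decode/encode status codes become typed errors. The concrete codes
// used here (1, -1, 7) are chosen for illustration only.
#[cfg(test)]
mod return_code_mapping_sketch {
    use super::{DecodeError, EncodeError};
    use std::num::NonZeroI32;

    #[test]
    fn decode_codes_map_to_variants() {
        assert_eq!(
            DecodeError::from(NonZeroI32::new(1).unwrap()),
            DecodeError::NoKvCacheSlot
        );
        assert_eq!(
            DecodeError::from(NonZeroI32::new(-1).unwrap()),
            DecodeError::NTokensZero
        );
        assert_eq!(
            DecodeError::from(NonZeroI32::new(7).unwrap()),
            DecodeError::Unknown(7)
        );
    }

    #[test]
    fn encode_codes_map_to_variants() {
        assert_eq!(
            EncodeError::from(NonZeroI32::new(1).unwrap()),
            EncodeError::NoKvCacheSlot
        );
        assert_eq!(
            EncodeError::from(NonZeroI32::new(-1).unwrap()),
            EncodeError::NTokensZero
        );
    }
}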

/// Errors that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// The path contained a null byte and could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be converted to a `str`.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// Errors that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// The path contained a null byte and could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be converted to a `str`.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// Errors that can occur when applying a LoRA adapter to a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

/// Errors that can occur when removing a LoRA adapter from a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

/// Returns the current time in microseconds, as reported by llama.cpp.
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::llama_time_us() }
}

/// Returns the maximum number of devices supported by llama.cpp.
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}

/// Returns `true` if memory mapping (`mmap`) is supported by this build of llama.cpp.
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}

/// Returns `true` if memory locking (`mlock`) is supported by this build of llama.cpp.
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
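
// A short capability-query sketch: these wrappers are plain getters that report
// what the linked llama.cpp build supports. The test assumes that these constant
// feature queries are safe to call without initializing the backend first.
#[cfg(test)]
mod capability_query_sketch {
    #[test]
    fn capability_queries_return_without_panicking() {
        // The concrete values depend on how llama.cpp was built.
        let _ = super::max_devices();
        let _ = super::mmap_supported();
        let _ = super::mlock_supported();
        let _ = super::supports_gpu_offload();
        let _ = super::supports_rpc();
    }
}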

/// Errors that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// The token type was unknown.
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// The buffer was too small; the required size is given.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid UTF-8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Errors that can occur when converting a string to a token.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// An integer conversion to C `int` failed.
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Errors that can occur when creating a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Errors that can occur when applying a chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// The internal buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// The result was not valid UTF-8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
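
// A minimal sketch of the `#[from]` conversions on the tokenization errors above:
// a `NulError` produced while building a `CString` converts directly into
// `StringToTokenError`. The embedded NUL byte below is deliberate.
#[cfg(test)]
mod nul_error_conversion_sketch {
    use super::StringToTokenError;
    use std::ffi::CString;

    #[test]
    fn nul_error_converts_into_string_to_token_error() {
        let nul_error = CString::new("a\0b").unwrap_err();
        let err: StringToTokenError = nul_error.into();
        assert!(matches!(err, StringToTokenError::NulError(_)));
    }
}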

/// Returns the current time in microseconds, as reported by ggml.
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}

/// Returns `true` if memory locking (`mlock`) is supported. Equivalent to [`mlock_supported`].
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}

/// Returns `true` if this build of llama.cpp supports GPU offload.
#[must_use]
pub fn supports_gpu_offload() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}

/// Returns `true` if this build of llama.cpp supports RPC.
#[must_use]
pub fn supports_rpc() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}

/// Returns llama.cpp's system info string (build features and hardware capabilities).
///
/// # Panics
///
/// Panics if the system info string is not valid UTF-8.
#[must_use]
pub fn print_system_info() -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("system info is not valid UTF-8")
        .to_owned()
}
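
// A hedged usage sketch for `print_system_info`: it returns a human-readable
// feature summary from llama.cpp. The test is ignored by default because it
// assumes calling into llama.cpp without prior backend initialization is safe
// in this build, which may not hold everywhere.
#[cfg(test)]
mod system_info_sketch {
    #[test]
    #[ignore = "calls into llama.cpp; run manually"]
    fn system_info_is_not_empty() {
        let info = super::print_system_info();
        assert!(!info.is_empty());
    }
}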

/// Returns the maximum number of parallel sequences supported by llama.cpp.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}

/// Returns the maximum number of tensor buffer-type overrides supported by llama.cpp.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}

/// Returns the name of the given flash-attention type.
///
/// # Panics
///
/// Panics if the returned name is not valid UTF-8.
#[must_use]
pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("flash_attn_type_name is not valid UTF-8")
        .to_owned()
}

/// Returns the string name of the given model metadata key.
///
/// # Panics
///
/// Panics if `key` does not fit the expected integer type or if the returned
/// string is not valid UTF-8.
#[must_use]
pub fn model_meta_key_str(key: u32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("meta_key_str is not valid UTF-8")
        .to_owned()
}

/// Quantizes the model at `fname_inp` and writes the result to `fname_out`.
///
/// # Errors
///
/// Returns the raw llama.cpp status code if quantization fails.
///
/// # Panics
///
/// Panics if either path contains a null byte.
pub fn model_quantize(
    fname_inp: &str,
    fname_out: &str,
    params: &quantize::QuantizeParams,
) -> std::result::Result<(), u32> {
    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
    let guard = params.to_raw();
    let rc = unsafe {
        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
    };
    if rc == 0 {
        Ok(())
    } else {
        Err(rc)
    }
}

/// Returns llama.cpp's default quantization parameters.
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}

/// Sets the llama.cpp log callback.
///
/// # Safety
///
/// `callback` and `user_data` must remain valid for as long as llama.cpp may invoke
/// the callback.
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}

/// Retrieves the current llama.cpp log callback and its user data.
///
/// # Safety
///
/// `log_callback` and `user_data` must be valid, writable pointers.
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}

/// Thin wrapper around `llama_opt_init`, which prepares a context for optimization (training).
///
/// # Safety
///
/// `ctx` and `model` must be valid, non-dangling pointers obtained from llama.cpp.
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}

/// Thin wrapper around `llama_opt_epoch`, which runs one optimization epoch over `dataset`.
///
/// # Safety
///
/// All pointers and handles must be valid llama.cpp/ggml objects, and the context must
/// have been prepared with [`opt_init`].
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}

/// Parameter filter that accepts every tensor. Thin wrapper around
/// `llama_opt_param_filter_all`.
///
/// # Safety
///
/// `tensor` must be a valid ggml tensor pointer; `userdata` is passed through unchanged.
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}

/// Thin wrapper around `llama_cpp_sys_4::common_fit_params`.
///
/// # Safety
///
/// All pointers must be valid and point to properly initialized llama.cpp structures;
/// `tensor_split`, `tensor_buft_overrides`, and `margins` must point to buffers of the
/// sizes expected by `common_fit_params`.
#[allow(clippy::too_many_arguments)]
pub unsafe fn params_fit(
    path_model: *const std::ffi::c_char,
    mparams: *mut llama_cpp_sys_4::llama_model_params,
    cparams: *mut llama_cpp_sys_4::llama_context_params,
    tensor_split: *mut f32,
    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
    margins: *mut usize,
    n_ctx_min: u32,
    log_level: llama_cpp_sys_4::ggml_log_level,
) -> llama_cpp_sys_4::common_params_fit_status {
    llama_cpp_sys_4::common_fit_params(
        path_model,
        mparams,
        cparams,
        tensor_split,
        tensor_buft_overrides,
        margins,
        n_ctx_min,
        log_level,
    )
}