1use std::ffi::NulError;
26use std::fmt::Debug;
27use std::num::NonZeroI32;
28
29use crate::llama_batch::BatchAddError;
30use std::os::raw::c_int;
31use std::path::PathBuf;
32use std::string::FromUtf8Error;
33
34pub mod common;
35pub mod context;
36#[cfg(feature = "ggml")]
37pub mod ggml;
38pub mod llama_backend;
39pub mod llama_batch;
40pub mod model;
41pub mod mtp;
42pub mod quantize;
43pub mod sampling;
44pub mod token;
45pub mod token_type;
46
47#[cfg(feature = "rpc")]
48pub mod rpc;
49
50#[cfg(feature = "mtmd")]
51pub mod mtmd;
52
53pub type Result<T> = std::result::Result<T, LLamaCppError>;
55
56#[derive(Debug, Eq, PartialEq, thiserror::Error)]
58pub enum LLamaCppError {
59 #[error("BackendAlreadyInitialized")]
62 BackendAlreadyInitialized,
63 #[error("{0}")]
65 ChatTemplateError(#[from] ChatTemplateError),
66 #[error("{0}")]
68 DecodeError(#[from] DecodeError),
69 #[error("{0}")]
71 EncodeError(#[from] EncodeError),
72 #[error("{0}")]
74 LlamaModelLoadError(#[from] LlamaModelLoadError),
75 #[error("{0}")]
77 LlamaContextLoadError(#[from] LlamaContextLoadError),
78 #[error["{0}"]]
80 BatchAddError(#[from] BatchAddError),
81 #[error(transparent)]
83 EmbeddingError(#[from] EmbeddingsError),
84}
85
86#[derive(Debug, Eq, PartialEq, thiserror::Error)]
88pub enum ChatTemplateError {
89 #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
91 BuffSizeError(usize),
92 #[error("the model has no meta val - returned code {0}")]
94 MissingTemplate(i32),
95 #[error(transparent)]
97 Utf8Error(#[from] std::str::Utf8Error),
98}
99
100#[derive(Debug, Eq, PartialEq, thiserror::Error)]
102pub enum StringFromModelError {
103 #[error("llama.cpp returned error code {0}")]
105 ReturnedError(i32),
106 #[error(transparent)]
108 Utf8Error(#[from] std::str::Utf8Error),
109}
110
111#[derive(Debug, Eq, PartialEq, thiserror::Error)]
113pub enum LlamaContextLoadError {
114 #[error("null reference from llama.cpp")]
116 NullReturn,
117}
118
119#[derive(Debug, Eq, PartialEq, thiserror::Error)]
121pub enum DecodeError {
122 #[error("Decode Error 1: NoKvCacheSlot")]
124 NoKvCacheSlot,
125 #[error("Decode Error -1: n_tokens == 0")]
127 NTokensZero,
128 #[error("Decode Error {0}: unknown")]
130 Unknown(c_int),
131}
132
133#[derive(Debug, Eq, PartialEq, thiserror::Error)]
135pub enum EncodeError {
136 #[error("Encode Error 1: NoKvCacheSlot")]
138 NoKvCacheSlot,
139 #[error("Encode Error -1: n_tokens == 0")]
141 NTokensZero,
142 #[error("Encode Error {0}: unknown")]
144 Unknown(c_int),
145}
146
147#[derive(Debug, Eq, PartialEq, thiserror::Error)]
149pub enum EmbeddingsError {
150 #[error("Embeddings weren't enabled in the context options")]
152 NotEnabled,
153 #[error("Logits were not enabled for the given token")]
155 LogitsNotEnabled,
156 #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
158 NonePoolType,
159}
160
161impl From<NonZeroI32> for DecodeError {
163 fn from(value: NonZeroI32) -> Self {
164 match value.get() {
165 1 => DecodeError::NoKvCacheSlot,
166 -1 => DecodeError::NTokensZero,
167 i => DecodeError::Unknown(i),
168 }
169 }
170}
171
172impl From<NonZeroI32> for EncodeError {
174 fn from(value: NonZeroI32) -> Self {
175 match value.get() {
176 1 => EncodeError::NoKvCacheSlot,
177 -1 => EncodeError::NTokensZero,
178 i => EncodeError::Unknown(i),
179 }
180 }
181}
182
183#[derive(Debug, Eq, PartialEq, thiserror::Error)]
185pub enum LlamaModelLoadError {
186 #[error("null byte in string {0}")]
188 NullError(#[from] NulError),
189 #[error("null result from llama cpp")]
191 NullResult,
192 #[error("failed to convert path {0} to str")]
194 PathToStrError(PathBuf),
195}
196
197#[derive(Debug, Eq, PartialEq, thiserror::Error)]
199pub enum LlamaLoraAdapterInitError {
200 #[error("null byte in string {0}")]
202 NullError(#[from] NulError),
203 #[error("null result from llama cpp")]
205 NullResult,
206 #[error("failed to convert path {0} to str")]
208 PathToStrError(PathBuf),
209}
210
211#[derive(Debug, Eq, PartialEq, thiserror::Error)]
213pub enum LlamaLoraAdapterSetError {
214 #[error("error code from llama cpp")]
216 ErrorResult(i32),
217}
218
219#[derive(Debug, Eq, PartialEq, thiserror::Error)]
221pub enum LlamaLoraAdapterRemoveError {
222 #[error("error code from llama cpp")]
224 ErrorResult(i32),
225}
226
227#[must_use]
234pub fn llama_time_us() -> i64 {
235 unsafe { llama_cpp_sys_4::llama_time_us() }
236}
237
238#[must_use]
245pub fn max_devices() -> usize {
246 unsafe { llama_cpp_sys_4::llama_max_devices() }
247}
248
249#[must_use]
258pub fn mmap_supported() -> bool {
259 unsafe { llama_cpp_sys_4::llama_supports_mmap() }
260}
261
262#[must_use]
271pub fn mlock_supported() -> bool {
272 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
273}
274
275#[derive(Debug, thiserror::Error, Clone)]
277#[non_exhaustive]
278pub enum TokenToStringError {
279 #[error("Unknown Token Type")]
281 UnknownTokenType,
282 #[error("Insufficient Buffer Space {0}")]
284 InsufficientBufferSpace(c_int),
285 #[error("FromUtf8Error {0}")]
287 FromUtf8Error(#[from] FromUtf8Error),
288}
289
290#[derive(Debug, thiserror::Error)]
292pub enum StringToTokenError {
293 #[error("{0}")]
295 NulError(#[from] NulError),
296 #[error("{0}")]
297 CIntConversionError(#[from] std::num::TryFromIntError),
299}
300
301#[derive(Debug, thiserror::Error)]
303pub enum NewLlamaChatMessageError {
304 #[error("{0}")]
306 NulError(#[from] NulError),
307}
308
309#[derive(Debug, thiserror::Error)]
311pub enum ApplyChatTemplateError {
312 #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
314 BuffSizeError,
315 #[error("{0}")]
317 NulError(#[from] NulError),
318 #[error("{0}")]
320 FromUtf8Error(#[from] FromUtf8Error),
321}
322
323#[must_use]
339pub fn ggml_time_us() -> i64 {
340 unsafe { llama_cpp_sys_4::ggml_time_us() }
341}
342
343#[must_use]
355pub fn llama_supports_mlock() -> bool {
356 unsafe { llama_cpp_sys_4::llama_supports_mlock() }
357}
358
359#[must_use]
363pub fn supports_gpu_offload() -> bool {
364 unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
365}
366
367#[must_use]
371pub fn supports_rpc() -> bool {
372 unsafe { llama_cpp_sys_4::llama_supports_rpc() }
373}
374
375#[must_use]
383pub fn print_system_info() -> String {
384 let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
385 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
386 c_str
387 .to_str()
388 .expect("system info is not valid UTF-8")
389 .to_owned()
390}
391
392#[must_use]
394pub fn max_parallel_sequences() -> usize {
395 unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
396}
397
398#[must_use]
400pub fn max_tensor_buft_overrides() -> usize {
401 unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
402}
403
404#[must_use]
410pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
411 let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
412 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
413 c_str
414 .to_str()
415 .expect("flash_attn_type_name is not valid UTF-8")
416 .to_owned()
417}
418
419#[must_use]
425pub fn model_meta_key_str(key: u32) -> String {
426 let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
427 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
428 c_str
429 .to_str()
430 .expect("meta_key_str is not valid UTF-8")
431 .to_owned()
432}
433
434pub fn model_quantize(
455 fname_inp: &str,
456 fname_out: &str,
457 params: &quantize::QuantizeParams,
458) -> std::result::Result<(), u32> {
459 let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
460 let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
461 let guard = params.to_raw();
462 let rc = unsafe {
463 llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
464 };
465 if rc == 0 {
466 Ok(())
467 } else {
468 Err(rc)
469 }
470}
471
472#[must_use]
476#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
477pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
478 unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
479}
480
481pub unsafe fn log_set(
488 callback: llama_cpp_sys_4::ggml_log_callback,
489 user_data: *mut std::ffi::c_void,
490) {
491 llama_cpp_sys_4::llama_log_set(callback, user_data);
492}
493
494pub unsafe fn log_get(
500 log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
501 user_data: *mut *mut std::ffi::c_void,
502) {
503 llama_cpp_sys_4::llama_log_get(log_callback, user_data);
504}
505
506pub unsafe fn opt_init(
512 ctx: *mut llama_cpp_sys_4::llama_context,
513 model: *mut llama_cpp_sys_4::llama_model,
514 params: llama_cpp_sys_4::llama_opt_params,
515) {
516 llama_cpp_sys_4::llama_opt_init(ctx, model, params);
517}
518
519#[allow(clippy::too_many_arguments)]
525pub unsafe fn opt_epoch(
526 ctx: *mut llama_cpp_sys_4::llama_context,
527 dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
528 result_train: llama_cpp_sys_4::ggml_opt_result_t,
529 result_eval: llama_cpp_sys_4::ggml_opt_result_t,
530 idata_split: i64,
531 callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
532 callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
533) {
534 llama_cpp_sys_4::llama_opt_epoch(
535 ctx,
536 dataset,
537 result_train,
538 result_eval,
539 idata_split,
540 callback_train,
541 callback_eval,
542 );
543}
544
545pub unsafe fn opt_param_filter_all(
551 tensor: *const llama_cpp_sys_4::ggml_tensor,
552 userdata: *mut std::ffi::c_void,
553) -> bool {
554 llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
555}
556
557#[allow(clippy::too_many_arguments)]
563pub unsafe fn params_fit(
564 path_model: *const std::ffi::c_char,
565 mparams: *mut llama_cpp_sys_4::llama_model_params,
566 cparams: *mut llama_cpp_sys_4::llama_context_params,
567 tensor_split: *mut f32,
568 tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
569 margins: *mut usize,
570 n_ctx_min: u32,
571 log_level: llama_cpp_sys_4::ggml_log_level,
572) -> llama_cpp_sys_4::common_params_fit_status {
573 llama_cpp_sys_4::common_fit_params(
574 path_model,
575 mparams,
576 cparams,
577 tensor_split,
578 tensor_buft_overrides,
579 margins,
580 n_ctx_min,
581 log_level,
582 )
583}