llama_cpp_4/lib.rs
1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
11//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
12//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
13//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
14//!
15//! # Feature Flags
16//!
17//! - `cuda` enables CUDA GPU support.
18//! - `metal` enables Apple Metal GPU support.
19//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
20//! - `native` enables host-CPU optimisations (`-march=native`).
21//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
22//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
23//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
24use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
33pub mod common;
34pub mod context;
35pub mod llama_backend;
36pub mod llama_batch;
37pub mod model;
38pub mod sampling;
39pub mod token;
40pub mod token_type;
41
42#[cfg(feature = "rpc")]
43pub mod rpc;
44
45#[cfg(feature = "mtmd")]
46pub mod mtmd;
47
/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LLamaCppError>;
50
51/// All errors that can occur in the llama-cpp crate.
52#[derive(Debug, Eq, PartialEq, thiserror::Error)]
53pub enum LLamaCppError {
54 /// The backend was already initialized. This can generally be ignored as initializing the backend
55 /// is idempotent.
56 #[error("BackendAlreadyInitialized")]
57 BackendAlreadyInitialized,
58 /// There was an error while get the chat template from model.
59 #[error("{0}")]
60 ChatTemplateError(#[from] ChatTemplateError),
61 /// There was an error while decoding a batch.
62 #[error("{0}")]
63 DecodeError(#[from] DecodeError),
64 /// There was an error while encoding a batch.
65 #[error("{0}")]
66 EncodeError(#[from] EncodeError),
67 /// There was an error loading a model.
68 #[error("{0}")]
69 LlamaModelLoadError(#[from] LlamaModelLoadError),
70 /// There was an error creating a new model context.
71 #[error("{0}")]
72 LlamaContextLoadError(#[from] LlamaContextLoadError),
73 /// There was an error adding a token to a batch.
74 #[error["{0}"]]
75 BatchAddError(#[from] BatchAddError),
76 /// see [`EmbeddingsError`]
77 #[error(transparent)]
78 EmbeddingError(#[from] EmbeddingsError),
79}
80
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The buffer passed to llama.cpp was too small; the payload is the buffer
    /// size that would be just large enough, so the caller can retry.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The gguf model has no chat template; the payload is the raw return code
    /// from llama.cpp's metadata lookup.
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
94
/// Failed to load (create) a model context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer when asked to create the context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
102
/// Failed to decode a batch.
///
/// Variants mirror the raw return codes of `llama_decode`; see the
/// `From<NonZeroI32>` impl below for the mapping.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available (return code 1).
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code -1).
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
116
/// Failed to encode a batch.
///
/// Variants mirror the raw return codes of `llama_encode`; see the
/// `From<NonZeroI32>` impl below for the mapping.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available (return code 1).
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code -1).
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
130
/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings were requested, but the model only supports
    /// `LLAMA_POOLING_TYPE_NONE` (no pooling), so they cannot be produced.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
144
145/// Decode a error from llama.cpp into a [`DecodeError`].
146impl From<NonZeroI32> for DecodeError {
147 fn from(value: NonZeroI32) -> Self {
148 match value.get() {
149 1 => DecodeError::NoKvCacheSlot,
150 -1 => DecodeError::NTokensZero,
151 i => DecodeError::Unknown(i),
152 }
153 }
154}
155
156/// Encode a error from llama.cpp into a [`EncodeError`].
157impl From<NonZeroI32> for EncodeError {
158 fn from(value: NonZeroI32) -> Self {
159 match value.get() {
160 1 => EncodeError::NoKvCacheSlot,
161 -1 => EncodeError::NTokensZero,
162 i => EncodeError::Unknown(i),
163 }
164 }
165}
166
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
180
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
194
195/// An error that can occur when loading a model.
196#[derive(Debug, Eq, PartialEq, thiserror::Error)]
197pub enum LlamaLoraAdapterSetError {
198 /// llama.cpp returned a non-zero error code.
199 #[error("error code from llama cpp")]
200 ErrorResult(i32),
201}
202
203/// An error that can occur when loading a model.
204#[derive(Debug, Eq, PartialEq, thiserror::Error)]
205pub enum LlamaLoraAdapterRemoveError {
206 /// llama.cpp returned a non-zero error code.
207 #[error("error code from llama cpp")]
208 ErrorResult(i32),
209}
210
/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_4::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: plain FFI call into llama.cpp that takes no arguments and reads a clock.
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
221
/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_4::max_devices;
/// let max_devices = max_devices();
/// println!("max devices: {max_devices}");
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: plain FFI call into llama.cpp that takes no arguments.
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
232
/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_4::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///     println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: plain FFI call into llama.cpp that takes no arguments.
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
245
/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_4::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///     println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: plain FFI call into llama.cpp that takes no arguments.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
258
/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string;
    /// the payload is the size reported by llama.cpp.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
273
274/// Failed to convert a string to a token sequence.
275#[derive(Debug, thiserror::Error)]
276pub enum StringToTokenError {
277 /// the string contained a null byte and thus could not be converted to a c string.
278 #[error("{0}")]
279 NulError(#[from] NulError),
280 #[error("{0}")]
281 /// Failed to convert a provided integer to a [`c_int`].
282 CIntConversionError(#[from] std::num::TryFromIntError),
283}
284
/// Failed to construct a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
292
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
306
/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// use llama_cpp_4::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: plain FFI call into ggml that takes no arguments and reads a clock.
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}
326
/// checks if mlock is supported
///
/// NOTE(review): this duplicates [`mlock_supported`] above (both call the same
/// FFI function); kept because both are public API.
///
/// ```
/// # use llama_cpp_4::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///     println!("mlock is supported!");
/// } else {
///     println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: plain FFI call into llama.cpp that takes no arguments.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
341}