// llama_cpp_4 — crate root (lib.rs)
//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `metal` enables Apple Metal GPU support.
//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
//! - `native` enables host-CPU optimisations (`-march=native`).
//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
24use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
33pub mod common;
34pub mod context;
35pub mod llama_backend;
36pub mod llama_batch;
37pub mod model;
38pub mod sampling;
39pub mod token;
40pub mod token_type;
41
42#[cfg(feature = "rpc")]
43pub mod rpc;
44
45#[cfg(feature = "mtmd")]
46pub mod mtmd;
47
48/// A failable result from a llama.cpp function.
49pub type Result<T> = std::result::Result<T, LLamaCppError>;
50
51/// All errors that can occur in the llama-cpp crate.
52#[derive(Debug, Eq, PartialEq, thiserror::Error)]
53pub enum LLamaCppError {
54    /// The backend was already initialized. This can generally be ignored as initializing the backend
55    /// is idempotent.
56    #[error("BackendAlreadyInitialized")]
57    BackendAlreadyInitialized,
58    /// There was an error while get the chat template from model.
59    #[error("{0}")]
60    ChatTemplateError(#[from] ChatTemplateError),
61    /// There was an error while decoding a batch.
62    #[error("{0}")]
63    DecodeError(#[from] DecodeError),
64    /// There was an error while encoding a batch.
65    #[error("{0}")]
66    EncodeError(#[from] EncodeError),
67    /// There was an error loading a model.
68    #[error("{0}")]
69    LlamaModelLoadError(#[from] LlamaModelLoadError),
70    /// There was an error creating a new model context.
71    #[error("{0}")]
72    LlamaContextLoadError(#[from] LlamaContextLoadError),
73    /// There was an error adding a token to a batch.
74    #[error["{0}"]]
75    BatchAddError(#[from] BatchAddError),
76    /// see [`EmbeddingsError`]
77    #[error(transparent)]
78    EmbeddingError(#[from] EmbeddingsError),
79}
80
81/// There was an error while getting the chat template from a model.
82#[derive(Debug, Eq, PartialEq, thiserror::Error)]
83pub enum ChatTemplateError {
84    /// the buffer was too small.
85    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
86    BuffSizeError(usize),
87    /// gguf has no chat template
88    #[error("the model has no meta val - returned code {0}")]
89    MissingTemplate(i32),
90    /// The chat template was not valid utf8.
91    #[error(transparent)]
92    Utf8Error(#[from] std::str::Utf8Error),
93}
94
95/// Failed to Load context
96#[derive(Debug, Eq, PartialEq, thiserror::Error)]
97pub enum LlamaContextLoadError {
98    /// llama.cpp returned null
99    #[error("null reference from llama.cpp")]
100    NullReturn,
101}
102
103/// Failed to decode a batch.
104#[derive(Debug, Eq, PartialEq, thiserror::Error)]
105pub enum DecodeError {
106    /// No kv cache slot was available.
107    #[error("Decode Error 1: NoKvCacheSlot")]
108    NoKvCacheSlot,
109    /// The number of tokens in the batch was 0.
110    #[error("Decode Error -1: n_tokens == 0")]
111    NTokensZero,
112    /// An unknown error occurred.
113    #[error("Decode Error {0}: unknown")]
114    Unknown(c_int),
115}
116
117/// Failed to decode a batch.
118#[derive(Debug, Eq, PartialEq, thiserror::Error)]
119pub enum EncodeError {
120    /// No kv cache slot was available.
121    #[error("Encode Error 1: NoKvCacheSlot")]
122    NoKvCacheSlot,
123    /// The number of tokens in the batch was 0.
124    #[error("Encode Error -1: n_tokens == 0")]
125    NTokensZero,
126    /// An unknown error occurred.
127    #[error("Encode Error {0}: unknown")]
128    Unknown(c_int),
129}
130
131/// When embedding related functions fail
132#[derive(Debug, Eq, PartialEq, thiserror::Error)]
133pub enum EmbeddingsError {
134    /// Embeddings weren't enabled in the context options
135    #[error("Embeddings weren't enabled in the context options")]
136    NotEnabled,
137    /// Logits weren't enabled for the given token
138    #[error("Logits were not enabled for the given token")]
139    LogitsNotEnabled,
140    /// The given sequence index exceeds the max sequence id
141    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
142    NonePoolType,
143}
144
145/// Decode a error from llama.cpp into a [`DecodeError`].
146impl From<NonZeroI32> for DecodeError {
147    fn from(value: NonZeroI32) -> Self {
148        match value.get() {
149            1 => DecodeError::NoKvCacheSlot,
150            -1 => DecodeError::NTokensZero,
151            i => DecodeError::Unknown(i),
152        }
153    }
154}
155
156/// Encode a error from llama.cpp into a [`EncodeError`].
157impl From<NonZeroI32> for EncodeError {
158    fn from(value: NonZeroI32) -> Self {
159        match value.get() {
160            1 => EncodeError::NoKvCacheSlot,
161            -1 => EncodeError::NTokensZero,
162            i => EncodeError::Unknown(i),
163        }
164    }
165}
166
167/// An error that can occur when loading a model.
168#[derive(Debug, Eq, PartialEq, thiserror::Error)]
169pub enum LlamaModelLoadError {
170    /// There was a null byte in a provided string and thus it could not be converted to a C string.
171    #[error("null byte in string {0}")]
172    NullError(#[from] NulError),
173    /// llama.cpp returned a nullptr - this could be many different causes.
174    #[error("null result from llama cpp")]
175    NullResult,
176    /// Failed to convert the path to a rust str. This means the path was not valid unicode
177    #[error("failed to convert path {0} to str")]
178    PathToStrError(PathBuf),
179}
180
181/// An error that can occur when loading a model.
182#[derive(Debug, Eq, PartialEq, thiserror::Error)]
183pub enum LlamaLoraAdapterInitError {
184    /// There was a null byte in a provided string and thus it could not be converted to a C string.
185    #[error("null byte in string {0}")]
186    NullError(#[from] NulError),
187    /// llama.cpp returned a nullptr - this could be many different causes.
188    #[error("null result from llama cpp")]
189    NullResult,
190    /// Failed to convert the path to a rust str. This means the path was not valid unicode
191    #[error("failed to convert path {0} to str")]
192    PathToStrError(PathBuf),
193}
194
195/// An error that can occur when loading a model.
196#[derive(Debug, Eq, PartialEq, thiserror::Error)]
197pub enum LlamaLoraAdapterSetError {
198    /// llama.cpp returned a non-zero error code.
199    #[error("error code from llama cpp")]
200    ErrorResult(i32),
201}
202
203/// An error that can occur when loading a model.
204#[derive(Debug, Eq, PartialEq, thiserror::Error)]
205pub enum LlamaLoraAdapterRemoveError {
206    /// llama.cpp returned a non-zero error code.
207    #[error("error code from llama cpp")]
208    ErrorResult(i32),
209}
210
211/// get the time (in microseconds) according to llama.cpp
212/// ```
213/// # use llama_cpp_4::llama_time_us;
214/// let time = llama_time_us();
215/// assert!(time > 0);
216/// ```
217#[must_use]
218pub fn llama_time_us() -> i64 {
219    unsafe { llama_cpp_sys_4::llama_time_us() }
220}
221
222/// get the max number of devices according to llama.cpp (this is generally cuda devices)
223/// ```
224/// # use llama_cpp_4::max_devices;
225/// let max_devices = max_devices();
226/// assert!(max_devices >= 0);
227/// ```
228#[must_use]
229pub fn max_devices() -> usize {
230    unsafe { llama_cpp_sys_4::llama_max_devices() }
231}
232
233/// is memory mapping supported according to llama.cpp
234/// ```
235/// # use llama_cpp_4::mmap_supported;
236/// let mmap_supported = mmap_supported();
237/// if mmap_supported {
238///   println!("mmap_supported!");
239/// }
240/// ```
241#[must_use]
242pub fn mmap_supported() -> bool {
243    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
244}
245
246/// is memory locking supported according to llama.cpp
247/// ```
248/// # use llama_cpp_4::mlock_supported;
249/// let mlock_supported = mlock_supported();
250/// if mlock_supported {
251///    println!("mlock_supported!");
252/// }
253/// ```
254#[must_use]
255pub fn mlock_supported() -> bool {
256    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
257}
258
259/// An error that can occur when converting a token to a string.
260#[derive(Debug, thiserror::Error, Clone)]
261#[non_exhaustive]
262pub enum TokenToStringError {
263    /// the token type was unknown
264    #[error("Unknown Token Type")]
265    UnknownTokenType,
266    /// There was insufficient buffer space to convert the token to a string.
267    #[error("Insufficient Buffer Space {0}")]
268    InsufficientBufferSpace(c_int),
269    /// The token was not valid utf8.
270    #[error("FromUtf8Error {0}")]
271    FromUtf8Error(#[from] FromUtf8Error),
272}
273
274/// Failed to convert a string to a token sequence.
275#[derive(Debug, thiserror::Error)]
276pub enum StringToTokenError {
277    /// the string contained a null byte and thus could not be converted to a c string.
278    #[error("{0}")]
279    NulError(#[from] NulError),
280    #[error("{0}")]
281    /// Failed to convert a provided integer to a [`c_int`].
282    CIntConversionError(#[from] std::num::TryFromIntError),
283}
284
285/// Failed to apply model chat template.
286#[derive(Debug, thiserror::Error)]
287pub enum NewLlamaChatMessageError {
288    /// the string contained a null byte and thus could not be converted to a c string.
289    #[error("{0}")]
290    NulError(#[from] NulError),
291}
292
293/// Failed to apply model chat template.
294#[derive(Debug, thiserror::Error)]
295pub enum ApplyChatTemplateError {
296    /// the buffer was too small.
297    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
298    BuffSizeError,
299    /// the string contained a null byte and thus could not be converted to a c string.
300    #[error("{0}")]
301    NulError(#[from] NulError),
302    /// the string could not be converted to utf8.
303    #[error("{0}")]
304    FromUtf8Error(#[from] FromUtf8Error),
305}
306
307/// Get the time in microseconds according to ggml
308///
309/// ```
310/// # use std::time::Duration;
311/// use llama_cpp_4::ggml_time_us;
312///
313/// let start = ggml_time_us();
314///
315/// std::thread::sleep(Duration::from_micros(10));
316///
317/// let end = ggml_time_us();
318///
319/// let elapsed = end - start;
320///
321/// assert!(elapsed >= 10)
322#[must_use]
323pub fn ggml_time_us() -> i64 {
324    unsafe { llama_cpp_sys_4::ggml_time_us() }
325}
326
327/// checks if mlock is supported
328///
329/// ```
330/// # use llama_cpp_4::llama_supports_mlock;
331///
332/// if llama_supports_mlock() {
333///   println!("mlock is supported!");
334/// } else {
335///   println!("mlock is not supported!");
336/// }
337/// ```
338#[must_use]
339pub fn llama_supports_mlock() -> bool {
340    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
341}