llama_cpp_2/
lib.rs

//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
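//!
//! # Quick Start
//!
//! A minimal sketch of bootstrapping the library, using only items defined in this crate root;
//! model loading and inference are covered by the `simple` example linked above:
//!
//! ```no_run
//! use llama_cpp_2::llama_backend::LlamaBackend;
//! use llama_cpp_2::{max_devices, mmap_supported, send_logs_to_tracing, LogOptions};
//!
//! // Route llama.cpp / ggml logs into `tracing` before doing any real work.
//! send_logs_to_tracing(LogOptions::default());
//!
//! // Initializing the backend is idempotent; re-initialization errors can generally be ignored.
//! let backend = LlamaBackend::init().expect("failed to initialize the llama.cpp backend");
//!
//! println!("max devices: {}, mmap supported: {}", max_devices(), mmap_supported());
//! ```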
use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

/// A fallible result from a llama.cpp function.
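///
/// For illustration, a small sketch of propagating crate errors through this alias with `?`,
/// using [`llama_backend::LlamaBackend::init`], which already returns this `Result`:
///
/// ```no_run
/// use llama_cpp_2::Result;
/// use llama_cpp_2::llama_backend::LlamaBackend;
///
/// fn init_backend() -> Result<LlamaBackend> {
///     // `?` forwards any `LLamaCppError` to the caller unchanged.
///     let backend = LlamaBackend::init()?;
///     Ok(backend)
/// }
/// ```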
pub type Result<T> = std::result::Result<T, LLamaCppError>;

/// All errors that can occur in the llama-cpp crate.
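///
/// As a sketch, the backend-initialization variant can usually be treated as success, since
/// initializing the backend is idempotent:
///
/// ```no_run
/// use llama_cpp_2::{LLamaCppError, Result};
/// use llama_cpp_2::llama_backend::LlamaBackend;
///
/// fn init_backend_tolerant() -> Result<()> {
///     match LlamaBackend::init() {
///         // Keep or drop the backend handle as the application requires.
///         Ok(_backend) => Ok(()),
///         // Double initialization is harmless, so this variant can be swallowed.
///         Err(LLamaCppError::BackendAlreadyInitialized) => Ok(()),
///         Err(other) => Err(other),
///     }
/// }
/// ```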
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`]
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    // See [`LlamaSamplerError`]
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load context
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding-related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be used when the context's pooling type is LLAMA_POOLING_TYPE_NONE
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Convert a non-zero error code from llama.cpp into a [`DecodeError`].
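///
/// For illustration, a minimal sketch of the mapping (the values mirror the match arms below):
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// assert_eq!(DecodeError::from(NonZeroI32::new(7).unwrap()), DecodeError::Unknown(7));
/// ```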
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert a non-zero error code from llama.cpp into an [`EncodeError`].
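///
/// For illustration, a minimal sketch of the mapping (the values mirror the match arms below):
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::EncodeError;
/// assert_eq!(EncodeError::from(NonZeroI32::new(1).unwrap()), EncodeError::NoKvCacheSlot);
/// assert_eq!(EncodeError::from(NonZeroI32::new(-1).unwrap()), EncodeError::NTokensZero);
/// ```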
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code {0} from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code {0} from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new `LlamaChatMessage`.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// # use llama_cpp_2::ggml_time_us;
/// let backend = LlamaBackend::init().unwrap();
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
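    ///
    /// For example, a sketch of silencing all llama.cpp / ggml output:
    ///
    /// ```
    /// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
    ///
    /// // Install the log callback but suppress every message.
    /// send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));
    /// ```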
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
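///
/// As a minimal sketch, this is typically called once, early at startup; installing a `tracing`
/// subscriber (e.g. via the `tracing-subscriber` crate) is a separate concern:
///
/// ```
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// // Forward llama.cpp and ggml log callbacks to `tracing` events.
/// send_logs_to_tracing(LogOptions::default());
/// ```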
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and then calls a GGML function, the logs won't be weirdly intermixed; instead, llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}