llama_cpp_2/
lib.rs

//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast-moving target, this crate does not attempt to create a stable API
//! with all the Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with changes in llama.cpp, but it does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//!
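//! As a minimal sketch, the first step in any program is initializing the backend; the linked
//! example covers model loading and decoding:
//!
//! ```no_run
//! use llama_cpp_2::llama_backend::LlamaBackend;
//!
//! // Initialize the llama.cpp backend once, before loading any models.
//! let _backend = LlamaBackend::init().expect("failed to initialize llama.cpp backend");
//! ```
//!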
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more Rust-idiomatic way of sampling.
use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LLamaCppError>;

/// All errors that can occur in the llama-cpp crate.
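///
/// Each variant wraps one of the more specific error types below. As an illustrative sketch,
/// any of them converts into this type via `From`:
///
/// ```
/// use llama_cpp_2::{DecodeError, LLamaCppError};
///
/// let err = LLamaCppError::from(DecodeError::NTokensZero);
/// assert_eq!(err, LLamaCppError::DecodeError(DecodeError::NTokensZero));
/// ```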
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`].
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    // See [`LlamaSamplerError`]
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got a negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding-related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be used with a model that only supports LLAMA_POOLING_TYPE_NONE
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Convert a non-zero error code from llama.cpp into a [`DecodeError`].
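///
/// A minimal illustration of the mapping:
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// ```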
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert a non-zero error code from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust str. This means the path was not valid Unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust str. This means the path was not valid Unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally CUDA devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply the model's chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// use llama_cpp_2::ggml_time_us;
///
/// let backend = LlamaBackend::init().unwrap();
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
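    ///
    /// A minimal sketch of disabling log forwarding:
    ///
    /// ```
    /// # use llama_cpp_2::{send_logs_to_tracing, LogOptions};
    /// // Suppress all llama.cpp / ggml log output.
    /// send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));
    /// ```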
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the fast path (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated, and limiting the slow path of locks and/or heap allocations to the other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell, llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that more logs are expected via CONT (or there's a typo in the codebase). To
    // distinguish a typo from intentional use of CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
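///
/// A minimal sketch (typically called once at startup):
///
/// ```
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// send_logs_to_tracing(LogOptions::default());
/// ```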
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs from the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and then calls a GGML function, the logs won't be weirdly intermixed: llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous GGML logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}