llama_cpp_2/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provided safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
11//!
12//! # Feature Flags
13//!
14//! - `cuda` enables CUDA gpu support.
15//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
16use std::ffi::NulError;
17use std::fmt::Debug;
18use std::num::NonZeroI32;
19
20use crate::llama_batch::BatchAddError;
21use std::os::raw::c_int;
22use std::path::PathBuf;
23use std::string::FromUtf8Error;
24
25pub mod context;
26pub mod llama_backend;
27pub mod llama_batch;
28mod log;
29pub mod model;
30#[cfg(feature = "mtmd")]
31pub mod mtmd;
32pub mod sampling;
33pub mod timing;
34pub mod token;
35pub mod token_type;
36
37/// A failable result from a llama.cpp function.
38pub type Result<T> = std::result::Result<T, LLamaCppError>;
39
40/// All errors that can occur in the llama-cpp crate.
41#[derive(Debug, Eq, PartialEq, thiserror::Error)]
42pub enum LLamaCppError {
43    /// The backend was already initialized. This can generally be ignored as initializing the backend
44    /// is idempotent.
45    #[error("BackendAlreadyInitialized")]
46    BackendAlreadyInitialized,
47    /// There was an error while get the chat template from model.
48    #[error("{0}")]
49    ChatTemplateError(#[from] ChatTemplateError),
50    /// There was an error while decoding a batch.
51    #[error("{0}")]
52    DecodeError(#[from] DecodeError),
53    /// There was an error while encoding a batch.
54    #[error("{0}")]
55    EncodeError(#[from] EncodeError),
56    /// There was an error loading a model.
57    #[error("{0}")]
58    LlamaModelLoadError(#[from] LlamaModelLoadError),
59    /// There was an error creating a new model context.
60    #[error("{0}")]
61    LlamaContextLoadError(#[from] LlamaContextLoadError),
62    /// There was an error adding a token to a batch.
63    #[error["{0}"]]
64    BatchAddError(#[from] BatchAddError),
65    /// see [`EmbeddingsError`]
66    #[error(transparent)]
67    EmbeddingError(#[from] EmbeddingsError),
68    // See [`LlamaSamplerError`]
69    /// Backend device not found
70    #[error("Backend device {0} not found")]
71    BackendDeviceNotFound(usize),
72    /// Max devices exceeded
73    #[error("Max devices exceeded. Max devices is {0}")]
74    MaxDevicesExceeded(usize),
75}
76
77/// There was an error while getting the chat template from a model.
78#[derive(Debug, Eq, PartialEq, thiserror::Error)]
79pub enum ChatTemplateError {
80    /// gguf has no chat template (by that name)
81    #[error("chat template not found - returned null pointer")]
82    MissingTemplate,
83
84    /// chat template contained a null byte
85    #[error("null byte in string {0}")]
86    NullError(#[from] NulError),
87
88    /// The chat template was not valid utf8.
89    #[error(transparent)]
90    Utf8Error(#[from] std::str::Utf8Error),
91}
92
93/// Failed fetching metadata value
94#[derive(Debug, Eq, PartialEq, thiserror::Error)]
95pub enum MetaValError {
96    /// The provided string contains an unexpected null-byte
97    #[error("null byte in string {0}")]
98    NullError(#[from] NulError),
99
100    /// The returned data contains invalid UTF8 data
101    #[error("FromUtf8Error {0}")]
102    FromUtf8Error(#[from] FromUtf8Error),
103
104    /// Got negative return value. This happens if the key or index queried does not exist.
105    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
106    NegativeReturn(i32),
107}
108
109/// Failed to Load context
110#[derive(Debug, Eq, PartialEq, thiserror::Error)]
111pub enum LlamaContextLoadError {
112    /// llama.cpp returned null
113    #[error("null reference from llama.cpp")]
114    NullReturn,
115}
116
117/// Failed to decode a batch.
118#[derive(Debug, Eq, PartialEq, thiserror::Error)]
119pub enum DecodeError {
120    /// No kv cache slot was available.
121    #[error("Decode Error 1: NoKvCacheSlot")]
122    NoKvCacheSlot,
123    /// The number of tokens in the batch was 0.
124    #[error("Decode Error -1: n_tokens == 0")]
125    NTokensZero,
126    /// An unknown error occurred.
127    #[error("Decode Error {0}: unknown")]
128    Unknown(c_int),
129}
130
131/// Failed to decode a batch.
132#[derive(Debug, Eq, PartialEq, thiserror::Error)]
133pub enum EncodeError {
134    /// No kv cache slot was available.
135    #[error("Encode Error 1: NoKvCacheSlot")]
136    NoKvCacheSlot,
137    /// The number of tokens in the batch was 0.
138    #[error("Encode Error -1: n_tokens == 0")]
139    NTokensZero,
140    /// An unknown error occurred.
141    #[error("Encode Error {0}: unknown")]
142    Unknown(c_int),
143}
144
145/// When embedding related functions fail
146#[derive(Debug, Eq, PartialEq, thiserror::Error)]
147pub enum EmbeddingsError {
148    /// Embeddings weren't enabled in the context options
149    #[error("Embeddings weren't enabled in the context options")]
150    NotEnabled,
151    /// Logits weren't enabled for the given token
152    #[error("Logits were not enabled for the given token")]
153    LogitsNotEnabled,
154    /// The given sequence index exceeds the max sequence id
155    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
156    NonePoolType,
157}
158
159/// Decode a error from llama.cpp into a [`DecodeError`].
160impl From<NonZeroI32> for DecodeError {
161    fn from(value: NonZeroI32) -> Self {
162        match value.get() {
163            1 => DecodeError::NoKvCacheSlot,
164            -1 => DecodeError::NTokensZero,
165            i => DecodeError::Unknown(i),
166        }
167    }
168}
169
170/// Encode a error from llama.cpp into a [`EncodeError`].
171impl From<NonZeroI32> for EncodeError {
172    fn from(value: NonZeroI32) -> Self {
173        match value.get() {
174            1 => EncodeError::NoKvCacheSlot,
175            -1 => EncodeError::NTokensZero,
176            i => EncodeError::Unknown(i),
177        }
178    }
179}
180
181/// An error that can occur when loading a model.
182#[derive(Debug, Eq, PartialEq, thiserror::Error)]
183pub enum LlamaModelLoadError {
184    /// There was a null byte in a provided string and thus it could not be converted to a C string.
185    #[error("null byte in string {0}")]
186    NullError(#[from] NulError),
187    /// llama.cpp returned a nullptr - this could be many different causes.
188    #[error("null result from llama cpp")]
189    NullResult,
190    /// Failed to convert the path to a rust str. This means the path was not valid unicode
191    #[error("failed to convert path {0} to str")]
192    PathToStrError(PathBuf),
193}
194
195/// An error that can occur when loading a model.
196#[derive(Debug, Eq, PartialEq, thiserror::Error)]
197pub enum LlamaLoraAdapterInitError {
198    /// There was a null byte in a provided string and thus it could not be converted to a C string.
199    #[error("null byte in string {0}")]
200    NullError(#[from] NulError),
201    /// llama.cpp returned a nullptr - this could be many different causes.
202    #[error("null result from llama cpp")]
203    NullResult,
204    /// Failed to convert the path to a rust str. This means the path was not valid unicode
205    #[error("failed to convert path {0} to str")]
206    PathToStrError(PathBuf),
207}
208
209/// An error that can occur when loading a model.
210#[derive(Debug, Eq, PartialEq, thiserror::Error)]
211pub enum LlamaLoraAdapterSetError {
212    /// llama.cpp returned a non-zero error code.
213    #[error("error code from llama cpp")]
214    ErrorResult(i32),
215}
216
217/// An error that can occur when loading a model.
218#[derive(Debug, Eq, PartialEq, thiserror::Error)]
219pub enum LlamaLoraAdapterRemoveError {
220    /// llama.cpp returned a non-zero error code.
221    #[error("error code from llama cpp")]
222    ErrorResult(i32),
223}
224
225/// get the time (in microseconds) according to llama.cpp
226/// ```
227/// # use llama_cpp_2::llama_time_us;
228/// # use llama_cpp_2::llama_backend::LlamaBackend;
229/// let backend = LlamaBackend::init().unwrap();
230/// let time = llama_time_us();
231/// assert!(time > 0);
232/// ```
233#[must_use]
234pub fn llama_time_us() -> i64 {
235    unsafe { llama_cpp_sys_2::llama_time_us() }
236}
237
238/// get the max number of devices according to llama.cpp (this is generally cuda devices)
239/// ```
240/// # use llama_cpp_2::max_devices;
241/// let max_devices = max_devices();
242/// assert!(max_devices >= 0);
243/// ```
244#[must_use]
245pub fn max_devices() -> usize {
246    unsafe { llama_cpp_sys_2::llama_max_devices() }
247}
248
249/// is memory mapping supported according to llama.cpp
250/// ```
251/// # use llama_cpp_2::mmap_supported;
252/// let mmap_supported = mmap_supported();
253/// if mmap_supported {
254///   println!("mmap_supported!");
255/// }
256/// ```
257#[must_use]
258pub fn mmap_supported() -> bool {
259    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
260}
261
262/// is memory locking supported according to llama.cpp
263/// ```
264/// # use llama_cpp_2::mlock_supported;
265/// let mlock_supported = mlock_supported();
266/// if mlock_supported {
267///    println!("mlock_supported!");
268/// }
269/// ```
270#[must_use]
271pub fn mlock_supported() -> bool {
272    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
273}
274
275/// An error that can occur when converting a token to a string.
276#[derive(Debug, thiserror::Error, Clone)]
277#[non_exhaustive]
278pub enum TokenToStringError {
279    /// the token type was unknown
280    #[error("Unknown Token Type")]
281    UnknownTokenType,
282    /// There was insufficient buffer space to convert the token to a string.
283    #[error("Insufficient Buffer Space {0}")]
284    InsufficientBufferSpace(c_int),
285    /// The token was not valid utf8.
286    #[error("FromUtf8Error {0}")]
287    FromUtf8Error(#[from] FromUtf8Error),
288}
289
290/// Failed to convert a string to a token sequence.
291#[derive(Debug, thiserror::Error)]
292pub enum StringToTokenError {
293    /// the string contained a null byte and thus could not be converted to a c string.
294    #[error("{0}")]
295    NulError(#[from] NulError),
296    #[error("{0}")]
297    /// Failed to convert a provided integer to a [`c_int`].
298    CIntConversionError(#[from] std::num::TryFromIntError),
299}
300
301/// Failed to apply model chat template.
302#[derive(Debug, thiserror::Error)]
303pub enum NewLlamaChatMessageError {
304    /// the string contained a null byte and thus could not be converted to a c string.
305    #[error("{0}")]
306    NulError(#[from] NulError),
307}
308
309/// Failed to apply model chat template.
310#[derive(Debug, thiserror::Error)]
311pub enum ApplyChatTemplateError {
312    /// the string contained a null byte and thus could not be converted to a c string.
313    #[error("{0}")]
314    NulError(#[from] NulError),
315    /// the string could not be converted to utf8.
316    #[error("{0}")]
317    FromUtf8Error(#[from] FromUtf8Error),
318}
319
320/// Get the time in microseconds according to ggml
321///
322/// ```
323/// # use std::time::Duration;
324/// # use llama_cpp_2::llama_backend::LlamaBackend;
325/// let backend = LlamaBackend::init().unwrap();
326/// use llama_cpp_2::ggml_time_us;
327///
328/// let start = ggml_time_us();
329///
330/// std::thread::sleep(Duration::from_micros(10));
331///
332/// let end = ggml_time_us();
333///
334/// let elapsed = end - start;
335///
336/// assert!(elapsed >= 10)
337#[must_use]
338pub fn ggml_time_us() -> i64 {
339    unsafe { llama_cpp_sys_2::ggml_time_us() }
340}
341
342/// checks if mlock is supported
343///
344/// ```
345/// # use llama_cpp_2::llama_supports_mlock;
346///
347/// if llama_supports_mlock() {
348///   println!("mlock is supported!");
349/// } else {
350///   println!("mlock is not supported!");
351/// }
352/// ```
353#[must_use]
354pub fn llama_supports_mlock() -> bool {
355    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
356}
357
358/// Backend device type
359#[derive(Debug, Clone, Copy, PartialEq, Eq)]
360pub enum LlamaBackendDeviceType {
361    /// CPU device
362    Cpu,
363    /// ACCEL device
364    Accelerator,
365    /// GPU device
366    Gpu,
367    /// iGPU device
368    IntegratedGpu,
369    /// Unknown device type
370    Unknown,
371}
372
373/// A ggml backend device
374///
375/// The index is can be used from `LlamaModelParams::with_devices` to select specific devices.
376#[derive(Debug, Clone)]
377pub struct LlamaBackendDevice {
378    /// The index of the device
379    ///
380    /// The index is can be used from `LlamaModelParams::with_devices` to select specific devices.
381    pub index: usize,
382    /// The name of the device (e.g. "Vulkan0")
383    pub name: String,
384    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
385    pub description: String,
386    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
387    pub backend: String,
388    /// Total memory of the device in bytes
389    pub memory_total: usize,
390    /// Free memory of the device in bytes
391    pub memory_free: usize,
392    /// Device type
393    pub device_type: LlamaBackendDeviceType,
394}
395
396/// List ggml backend devices
397#[must_use]
398pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
399    let mut devices = Vec::new();
400    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
401        fn cstr_to_string(ptr: *const i8) -> String {
402            if ptr.is_null() {
403                String::new()
404            } else {
405                unsafe { std::ffi::CStr::from_ptr(ptr) }
406                    .to_string_lossy()
407                    .to_string()
408            }
409        }
410        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
411        let props = unsafe {
412            let mut props = std::mem::zeroed();
413            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
414            props
415        };
416        let name = cstr_to_string(props.name);
417        let description = cstr_to_string(props.description);
418        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
419        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
420        let backend = cstr_to_string(backend_name);
421        let memory_total = props.memory_total;
422        let memory_free = props.memory_free;
423        let device_type = match props.type_ {
424            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
425            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
426            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
427            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
428            _ => LlamaBackendDeviceType::Unknown,
429        };
430        devices.push(LlamaBackendDevice {
431            index: i,
432            name,
433            description,
434            backend,
435            memory_total,
436            memory_free,
437            device_type,
438        });
439    }
440    devices
441}
442
443/// Options to configure how llama.cpp logs are intercepted.
444#[derive(Default, Debug, Clone)]
445pub struct LogOptions {
446    disabled: bool,
447}
448
449impl LogOptions {
450    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
451    /// logs to be sent to tracing.
452    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
453        self.disabled = !enabled;
454        self
455    }
456}
457
458extern "C" fn logs_to_trace(
459    level: llama_cpp_sys_2::ggml_log_level,
460    text: *const ::std::os::raw::c_char,
461    data: *mut ::std::os::raw::c_void,
462) {
463    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
464    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
465    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
466    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
467    use std::borrow::Borrow;
468
469    let log_state = unsafe { &*(data as *const log::State) };
470
471    if log_state.options.disabled {
472        return;
473    }
474
475    // If the log level is disabled, we can just return early
476    if !log_state.is_enabled_for_level(level) {
477        log_state.update_previous_level_for_disabled_log(level);
478        return;
479    }
480
481    let text = unsafe { std::ffi::CStr::from_ptr(text) };
482    let text = text.to_string_lossy();
483    let text: &str = text.borrow();
484
485    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
486    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
487    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
488    // to know how to flush it.
489
490    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
491        log_state.cont_buffered_log(text);
492    } else if text.ends_with('\n') {
493        log_state.emit_non_cont_line(level, text);
494    } else {
495        log_state.buffer_non_cont(level, text);
496    }
497}
498
499/// Redirect llama.cpp logs into tracing.
500pub fn send_logs_to_tracing(options: LogOptions) {
501    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.
502
503    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
504    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
505    // newline and calls a GGML function, the logs won't be weirdly intermixed and instead we'll llama.cpp logs
506    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
507    let llama_heap_state = Box::as_ref(
508        log::LLAMA_STATE
509            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
510    ) as *const _;
511    let ggml_heap_state = Box::as_ref(
512        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
513    ) as *const _;
514
515    unsafe {
516        // GGML has to be set after llama since setting llama sets ggml as well.
517        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
518        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
519    }
520}
llama_cpp_2/lib.rs

llama_cpp_2/
lib.rs