llama_cpp_2/
lib.rs

//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast-moving target, this crate does not attempt to create a stable API
//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
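//!
//! # Capability checks
//!
//! A minimal sketch of querying llama.cpp's compiled-in capabilities with helpers from this
//! crate (as with the examples on those functions, no model or backend initialization is
//! assumed to be needed):
//!
//! ```
//! use llama_cpp_2::{max_devices, mlock_supported, mmap_supported};
//!
//! println!("max devices: {}", max_devices());
//! println!("mmap supported: {}", mmap_supported());
//! println!("mlock supported: {}", mlock_supported());
//! ```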
use std::ffi::{c_char, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

/// A fallible result from a llama.cpp function.
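///
/// A trivial illustration of the alias (the helper function below is hypothetical, just for
/// the doctest):
///
/// ```
/// # use llama_cpp_2::{LlamaCppError, Result};
/// fn ensure_not_initialized(already_initialized: bool) -> Result<()> {
///     if already_initialized {
///         Err(LlamaCppError::BackendAlreadyInitialized)
///     } else {
///         Ok(())
///     }
/// }
/// assert!(ensure_not_initialized(false).is_ok());
/// assert!(ensure_not_initialized(true).is_err());
/// ```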
pub type Result<T> = std::result::Result<T, LlamaCppError>;

/// All errors that can occur in the llama-cpp crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`].
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    /// The requested backend device was not found.
    #[error("Backend device {0} not found")]
    BackendDeviceNotFound(usize),
    /// The maximum number of devices was exceeded.
    #[error("Max devices exceeded. Max devices is {0}")]
    MaxDevicesExceeded(usize),
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed to fetch a metadata value.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// An error that can occur when embedding-related functions fail.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be used because the pooling type is `LLAMA_POOLING_TYPE_NONE`
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// The trigger word contains null bytes
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains null bytes
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar call returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}

/// Convert an error code from llama.cpp into a [`DecodeError`].
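///
/// A small doctest illustrating the mapping implemented below:
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// assert_eq!(DecodeError::from(NonZeroI32::new(42).unwrap()), DecodeError::Unknown(42));
/// ```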
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert an error code from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices > 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new llama chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU device
    IntegratedGpu,
    /// Unknown device type
    Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
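///
/// A minimal sketch of enumerating devices and printing a short summary. As with the other
/// helpers above, this assumes the ggml device registry can be queried without any further
/// initialization:
///
/// ```
/// use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};
///
/// for device in list_llama_ggml_backend_devices() {
///     let kind = match device.device_type {
///         LlamaBackendDeviceType::Cpu => "cpu",
///         LlamaBackendDeviceType::Gpu | LlamaBackendDeviceType::IntegratedGpu => "gpu",
///         _ => "other",
///     };
///     println!(
///         "#{} {} [{}] ({}): {} of {} bytes free",
///         device.index, device.name, device.backend, kind, device.memory_free, device.memory_total
///     );
/// }
/// ```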
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        fn cstr_to_string(ptr: *const c_char) -> String {
            if ptr.is_null() {
                String::new()
            } else {
                unsafe { std::ffi::CStr::from_ptr(ptr) }
                    .to_string_lossy()
                    .to_string()
            }
        }
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
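    ///
    /// For example, a minimal sketch that suppresses all llama.cpp logs:
    ///
    /// ```
    /// # use llama_cpp_2::LogOptions;
    /// let _options = LogOptions::default().with_logs_enabled(false);
    /// ```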
    #[must_use]
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
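///
/// A minimal sketch of wiring llama.cpp and ggml logging into `tracing`:
///
/// ```
/// # use llama_cpp_2::{send_logs_to_tracing, LogOptions};
/// send_logs_to_tracing(LogOptions::default());
/// ```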
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and then calls a GGML function, the logs won't be weirdly intermixed; instead llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous GGML logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}