// llama_cpp_2/lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
11//!
12//! # Feature Flags
13//!
14//! - `cuda` enables CUDA gpu support.
15//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
16use std::ffi::{c_char, CStr, CString, NulError};
17use std::fmt::Debug;
18use std::num::NonZeroI32;
19
20use crate::llama_batch::BatchAddError;
21use std::os::raw::c_int;
22use std::path::PathBuf;
23use std::string::FromUtf8Error;
24
25pub mod context;
26pub mod gguf;
27pub mod llama_backend;
28pub mod llama_batch;
29#[cfg(feature = "llguidance")]
30pub(crate) mod llguidance_sampler;
31mod log;
32pub mod model;
33#[cfg(feature = "mtmd")]
34pub mod mtmd;
35pub mod openai;
36pub mod sampling;
37pub mod timing;
38pub mod token;
39pub mod token_type;
40
41pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
42    status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
43}
44
/// Convert an FFI status code into a plain `i32`, e.g. for embedding in error messages.
pub(crate) fn status_to_i32(status: llama_cpp_sys_2::llama_rs_status) -> i32 {
    // NOTE(review): assumes `llama_rs_status` is a C-style integer enum/typedef so the
    // `as` cast preserves the raw value — confirm against the generated sys bindings.
    status as i32
}
48
/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LlamaCppError>;
51
52/// All errors that can occur in the llama-cpp crate.
53#[derive(Debug, Eq, PartialEq, thiserror::Error)]
54pub enum LlamaCppError {
55    /// The backend was already initialized. This can generally be ignored as initializing the backend
56    /// is idempotent.
57    #[error("BackendAlreadyInitialized")]
58    BackendAlreadyInitialized,
59    /// There was an error while get the chat template from model.
60    #[error("{0}")]
61    ChatTemplateError(#[from] ChatTemplateError),
62    /// There was an error while decoding a batch.
63    #[error("{0}")]
64    DecodeError(#[from] DecodeError),
65    /// There was an error while encoding a batch.
66    #[error("{0}")]
67    EncodeError(#[from] EncodeError),
68    /// There was an error loading a model.
69    #[error("{0}")]
70    LlamaModelLoadError(#[from] LlamaModelLoadError),
71    /// There was an error creating a new model context.
72    #[error("{0}")]
73    LlamaContextLoadError(#[from] LlamaContextLoadError),
74    /// There was an error adding a token to a batch.
75    #[error["{0}"]]
76    BatchAddError(#[from] BatchAddError),
77    /// see [`EmbeddingsError`]
78    #[error(transparent)]
79    EmbeddingError(#[from] EmbeddingsError),
80    // See [`LlamaSamplerError`]
81    /// Backend device not found
82    #[error("Backend device {0} not found")]
83    BackendDeviceNotFound(usize),
84    /// Max devices exceeded
85    #[error("Max devices exceeded. Max devices is {0}")]
86    MaxDevicesExceeded(usize),
87    /// Failed to convert JSON schema to grammar.
88    #[error("JsonSchemaToGrammarError: {0}")]
89    JsonSchemaToGrammarError(String),
90}
91
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name); llama.cpp returned a null pointer.
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte and could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
107
/// Failed fetching a metadata value from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}
123
/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null when creating the context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
131
/// Failed to decode a batch.
///
/// The numeric codes in the messages mirror the raw return values of `llama_decode`;
/// see the `From<NonZeroI32>` impl below for the mapping.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
145
/// Failed to encode a batch.
///
/// The numeric codes in the messages mirror the raw return values of `llama_encode`;
/// see the `From<NonZeroI32>` impl below for the mapping.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
159
/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings were requested but the model only supports `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
173
/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// The trigger word contains null bytes (cannot be converted to a C string)
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains null bytes (cannot be converted to a C string)
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar call returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}
190
191/// Decode a error from llama.cpp into a [`DecodeError`].
192impl From<NonZeroI32> for DecodeError {
193    fn from(value: NonZeroI32) -> Self {
194        match value.get() {
195            1 => DecodeError::NoKvCacheSlot,
196            -1 => DecodeError::NTokensZero,
197            i => DecodeError::Unknown(i),
198        }
199    }
200}
201
202/// Encode a error from llama.cpp into a [`EncodeError`].
203impl From<NonZeroI32> for EncodeError {
204    fn from(value: NonZeroI32) -> Self {
205        match value.get() {
206            1 => EncodeError::NoKvCacheSlot,
207            -1 => EncodeError::NTokensZero,
208            i => EncodeError::Unknown(i),
209        }
210    }
211}
212
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
226
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
240
/// An error that can occur when applying a LoRA adapter to a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
248
/// An error that can occur when removing a LoRA adapter from a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
256
/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::llama_time_us() }
}
269
/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}
280
/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}
293
/// is memory locking supported according to llama.cpp
///
/// See also [`llama_supports_mlock`], which wraps the same llama.cpp call.
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
306
307/// Convert a JSON schema string into a llama.cpp grammar string.
308pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
309    let schema_cstr = CString::new(schema_json)
310        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
311    let mut out = std::ptr::null_mut();
312    let rc = unsafe {
313        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
314    };
315
316    let result = {
317        if !status_is_ok(rc) || out.is_null() {
318            return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
319                "ffi error {}",
320                status_to_i32(rc)
321            )));
322        }
323        let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
324        let grammar = String::from_utf8(grammar_bytes)
325            .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
326        Ok(grammar)
327    };
328
329    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };
330    result
331}
332
/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    /// The payload is the buffer size reported by llama.cpp.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
347
/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}
358
/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
366
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the template result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
    /// invalid grammar trigger data returned by llama.cpp.
    #[error("invalid grammar trigger data")]
    InvalidGrammarTriggerType,
}
386
/// Failed to parse a chat response.
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    Utf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the parse result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}
403
/// Failed to accept a token in a sampler.
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}
411
/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}
433
/// checks if mlock is supported
///
/// Equivalent to [`mlock_supported`]; both wrap the same llama.cpp call.
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: no-argument FFI call; no pointers are exchanged.
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
449
/// Backend device type, mirroring ggml's `GGML_BACKEND_DEVICE_TYPE_*` constants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU device
    IntegratedGpu,
    /// Any device type ggml reports that this crate does not recognize
    Unknown,
}
464
/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}
487
/// List ggml backend devices
///
/// Queries ggml for every registered backend device and returns a snapshot of its
/// name, description, backend registry name, memory figures, and device type.
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        // Convert a possibly-null C string to an owned String (empty when null).
        fn cstr_to_string(ptr: *const c_char) -> String {
            if ptr.is_null() {
                String::new()
            } else {
                unsafe { std::ffi::CStr::from_ptr(ptr) }
                    .to_string_lossy()
                    .to_string()
            }
        }
        // `i` is within 0..ggml_backend_dev_count(), so the lookup is in range.
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            // NOTE(review): assumes the props struct is plain-old-data and valid when
            // zero-initialized before ggml fills it in — confirm against the sys crate.
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        // Map ggml's raw device-type constant onto the crate enum.
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}
534
/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    // When true, all intercepted llama.cpp/ggml logs are suppressed instead of
    // forwarded to tracing. Default (false) forwards logs to tracing.
    disabled: bool,
}
540
541impl LogOptions {
542    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
543    /// logs to be sent to tracing.
544    #[must_use]
545    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
546        self.disabled = !enabled;
547        self
548    }
549}
550
// C callback registered via `llama_log_set`/`ggml_log_set` in `send_logs_to_tracing`.
// Forwards each llama.cpp/ggml log line into `tracing` using the `log::State`
// smuggled through the opaque `data` pointer.
extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    // SAFETY: `data` is the `*const log::State` registered in `send_logs_to_tracing`;
    // that state lives in a OnceLock-held Box, so the pointer remains valid here.
    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    // SAFETY: assumes llama.cpp passes a valid NUL-terminated string for `text`,
    // per the ggml log-callback contract — TODO confirm against the sys crate docs.
    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        // Continuation of the previous message: append to its buffer.
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        // Complete line: emit immediately.
        log_state.emit_non_cont_line(level, text);
    } else {
        // Incomplete line: buffer until we see whether a CONT follows.
        log_state.buffer_non_cont(level, text);
    }
}
591
/// Redirect llama.cpp logs into tracing.
///
/// Installs [`logs_to_trace`] as the log callback for both llama.cpp and ggml.
/// The callback state is stored in `OnceLock`-held boxes, so the raw pointers
/// handed to the C side stay valid for the lifetime of the process.
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and calls a GGML function, the logs won't be weirdly intermixed and instead we'll llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    // SAFETY: both pointers reference state kept alive for the program's lifetime by
    // the OnceLock statics above, so the C side never observes a dangling pointer.
    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}