Skip to main content

llama_cpp_2/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
11//! - [tools](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/tools)
12//!
13//! # Feature Flags
14//!
15//! - `cuda` enables CUDA gpu support.
16//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
17use std::ffi::{c_char, CStr, CString, NulError};
18use std::fmt::Debug;
19use std::num::NonZeroI32;
20
21use crate::llama_batch::BatchAddError;
22use std::os::raw::c_int;
23use std::path::PathBuf;
24use std::string::FromUtf8Error;
25
26pub mod context;
27pub mod gguf;
28pub mod llama_backend;
29pub mod llama_batch;
30#[cfg(feature = "llguidance")]
31pub(crate) mod llguidance_sampler;
32mod log;
33pub mod model;
34#[cfg(feature = "mtmd")]
35pub mod mtmd;
36pub mod openai;
37pub mod sampling;
38pub mod timing;
39pub mod token;
40pub mod token_type;
41
42pub use crate::context::session::LlamaStateSeqFlags;
43
44pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
45    status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
46}
47
/// Convert a llama.cpp FFI status code to a plain `i32` for error reporting.
///
/// Used when embedding the raw code in error messages (see `json_schema_to_grammar`).
pub(crate) fn status_to_i32(status: llama_cpp_sys_2::llama_rs_status) -> i32 {
    status as i32
}
51
/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LlamaCppError>;
54
55/// All errors that can occur in the llama-cpp crate.
56#[derive(Debug, Eq, PartialEq, thiserror::Error)]
57pub enum LlamaCppError {
58    /// The backend was already initialized. This can generally be ignored as initializing the backend
59    /// is idempotent.
60    #[error("BackendAlreadyInitialized")]
61    BackendAlreadyInitialized,
62    /// There was an error while get the chat template from model.
63    #[error("{0}")]
64    ChatTemplateError(#[from] ChatTemplateError),
65    /// There was an error while decoding a batch.
66    #[error("{0}")]
67    DecodeError(#[from] DecodeError),
68    /// There was an error while encoding a batch.
69    #[error("{0}")]
70    EncodeError(#[from] EncodeError),
71    /// There was an error loading a model.
72    #[error("{0}")]
73    LlamaModelLoadError(#[from] LlamaModelLoadError),
74    /// There was an error creating a new model context.
75    #[error("{0}")]
76    LlamaContextLoadError(#[from] LlamaContextLoadError),
77    /// There was an error adding a token to a batch.
78    #[error["{0}"]]
79    BatchAddError(#[from] BatchAddError),
80    /// see [`EmbeddingsError`]
81    #[error(transparent)]
82    EmbeddingError(#[from] EmbeddingsError),
83    // See [`LlamaSamplerError`]
84    /// Backend device not found
85    #[error("Backend device {0} not found")]
86    BackendDeviceNotFound(usize),
87    /// Max devices exceeded
88    #[error("Max devices exceeded. Max devices is {0}")]
89    MaxDevicesExceeded(usize),
90    /// Failed to convert JSON schema to grammar.
91    #[error("JsonSchemaToGrammarError: {0}")]
92    JsonSchemaToGrammarError(String),
93}
94
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The gguf has no chat template (by that name); llama.cpp returned a null pointer.
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// A provided string contained an interior null byte and could not become a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
110
/// Failed fetching a metadata value from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte and could not become a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}
126
/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null when creating the context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
134
/// Failed to decode a batch.
///
/// The numbers in the messages mirror llama.cpp's raw return codes
/// (see the `From<NonZeroI32>` impl below).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available (llama.cpp return code 1).
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (llama.cpp return code -1).
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; carries the raw llama.cpp return code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
148
/// Failed to encode a batch.
///
/// The numbers in the messages mirror llama.cpp's raw return codes
/// (see the `From<NonZeroI32>` impl below).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available (llama.cpp return code 1).
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (llama.cpp return code -1).
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; carries the raw llama.cpp return code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
162
/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// The model's pooling type is `LLAMA_POOLING_TYPE_NONE`, which does not support
    /// sequence embeddings.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
176
/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// A trigger word contains interior null bytes and cannot become a C string
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains interior null bytes
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar constructor in llama.cpp returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}
193
194/// Decode a error from llama.cpp into a [`DecodeError`].
195impl From<NonZeroI32> for DecodeError {
196    fn from(value: NonZeroI32) -> Self {
197        match value.get() {
198            1 => DecodeError::NoKvCacheSlot,
199            -1 => DecodeError::NTokensZero,
200            i => DecodeError::Unknown(i),
201        }
202    }
203}
204
205/// Encode a error from llama.cpp into a [`EncodeError`].
206impl From<NonZeroI32> for EncodeError {
207    fn from(value: NonZeroI32) -> Self {
208        match value.get() {
209            1 => EncodeError::NoKvCacheSlot,
210            -1 => EncodeError::NTokensZero,
211            i => EncodeError::Unknown(i),
212        }
213    }
214}
215
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
229
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
243
244/// An error that can occur when loading a model.
245#[derive(Debug, Eq, PartialEq, thiserror::Error)]
246pub enum LlamaLoraAdapterSetError {
247    /// llama.cpp returned a non-zero error code.
248    #[error("error code from llama cpp")]
249    ErrorResult(i32),
250}
251
252/// An error that can occur when loading a model.
253#[derive(Debug, Eq, PartialEq, thiserror::Error)]
254pub enum LlamaLoraAdapterRemoveError {
255    /// llama.cpp returned a non-zero error code.
256    #[error("error code from llama cpp")]
257    ErrorResult(i32),
258}
259
/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::llama_time_us() }
}
272
/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// // previously asserted `>= 0`, which is always true for a usize
/// assert!(max_devices > 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}
283
/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}
296
/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
309
310/// Convert a JSON schema string into a llama.cpp grammar string.
311pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
312    let schema_cstr = CString::new(schema_json)
313        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
314    let mut out = std::ptr::null_mut();
315    let rc = unsafe {
316        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
317    };
318
319    let result = {
320        if !status_is_ok(rc) || out.is_null() {
321            return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
322                "ffi error {}",
323                status_to_i32(rc)
324            )));
325        }
326        let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
327        let grammar = String::from_utf8(grammar_bytes)
328            .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
329        Ok(grammar)
330    };
331
332    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };
333    result
334}
335
#[cfg(test)]
mod tests {
    use super::json_schema_to_grammar;

    // End-to-end check that the JSON-schema -> grammar FFI round-trip succeeds and
    // yields a grammar that defines a `root` rule (requires linked llama.cpp).
    #[test]
    fn json_schema_string_api_returns_grammar() {
        let schema = r#"{
            "type": "object",
            "properties": {
                "city": { "type": "string" },
                "unit": { "enum": ["c", "f"] }
            },
            "required": ["city"]
        }"#;

        let grammar =
            json_schema_to_grammar(schema).expect("string-based schema conversion should succeed");

        assert!(grammar.contains("root ::="));
    }
}
357
/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string;
    /// carries the size reported by llama.cpp.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
372
/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}
383
/// Failed to create a new llama chat message.
// NOTE(review): the previous doc said "Failed to apply model chat template", which
// looks copy-pasted from `ApplyChatTemplateError` — confirm against callers.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
391
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the template result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code; carries the raw code.
    #[error("ffi error {0}")]
    FfiError(i32),
    /// invalid grammar trigger data returned by llama.cpp.
    #[error("invalid grammar trigger data")]
    InvalidGrammarTriggerType,
}
411
/// Failed to parse a chat response.
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    Utf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the parse result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code; carries the raw code.
    #[error("ffi error {0}")]
    FfiError(i32),
}
428
/// Failed to accept a token in a sampler.
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
    /// llama.cpp returned an error code; carries the raw code.
    #[error("ffi error {0}")]
    FfiError(i32),
}
436
/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}
458
/// checks if mlock is supported
///
/// NOTE(review): duplicates [`mlock_supported`] above; consider deprecating one of
/// the two in a future release (both are public, so neither can simply be removed).
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: plain FFI call with no pointer arguments.
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
474
/// Backend device type, mapped from ggml's `GGML_BACKEND_DEVICE_TYPE_*` constants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU (integrated GPU) device
    IntegratedGpu,
    /// Any device type this crate does not recognize
    Unknown,
}
489
/// A ggml backend device
///
/// The index can be used from `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used from `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}
512
/// List ggml backend devices
///
/// Queries every device registered with ggml and returns its name, description,
/// backend name, memory statistics and device type.
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        // Convert a possibly-null C string into an owned String (empty when null).
        fn cstr_to_string(ptr: *const c_char) -> String {
            if ptr.is_null() {
                String::new()
            } else {
                unsafe { std::ffi::CStr::from_ptr(ptr) }
                    .to_string_lossy()
                    .to_string()
            }
        }
        // SAFETY: `i` is in 0..ggml_backend_dev_count(), so the lookup is in range.
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            // NOTE(review): assumes the bindgen-generated props struct is valid when
            // zero-initialized before ggml fills it in — confirm against the
            // `ggml_backend_dev_props` definition.
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        // Resolve the device's backend registry entry to get a human-readable backend name.
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        // Map ggml's raw device-type constant onto the crate's enum; anything
        // unrecognized becomes `Unknown` rather than failing.
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}
559
/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    // When true, the log callback drops every message instead of forwarding to tracing.
    // Default (false) forwards logs.
    disabled: bool,
}
565
566impl LogOptions {
567    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
568    /// logs to be sent to tracing.
569    #[must_use]
570    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
571        self.disabled = !enabled;
572        self
573    }
574}
575
/// C-ABI log callback installed via `llama_log_set` / `ggml_log_set`; forwards
/// llama.cpp/ggml log lines into `tracing` through the shared `log::State`.
extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    // `data` is the `*const log::State` registered in `send_logs_to_tracing`; the
    // state is held in a `OnceLock`-kept Box, so the pointer stays valid for the
    // life of the program.
    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    // NOTE(review): assumes llama.cpp always passes a non-null, NUL-terminated
    // string here — confirm against the ggml logging contract.
    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        // Continuation of the previous (buffered) line.
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        // Complete line: emit immediately.
        log_state.emit_non_cont_line(level, text);
    } else {
        // Incomplete line: buffer until we see a CONT or a new message.
        log_state.buffer_non_cont(level, text);
    }
}
616
/// Redirect llama.cpp logs into tracing.
///
/// Installs [`logs_to_trace`] as the log callback for both llama.cpp and ggml.
/// The per-module `log::State` values live in `OnceLock`-held Boxes, so the raw
/// pointers handed to the C side remain valid for the life of the program.
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and calls a GGML function, the logs won't be weirdly intermixed and instead we'll llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    // SAFETY: both pointers reference OnceLock-held state that is never dropped,
    // matching the lifetime the C callback registration requires.
    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}