//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the usual Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//! - [tools](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/tools)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more Rust-idiomatic way of sampling.
use std::ffi::{c_char, CStr, CString, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod gguf;
pub mod llama_backend;
pub mod llama_batch;
#[cfg(feature = "llguidance")]
pub(crate) mod llguidance_sampler;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod openai;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

pub use crate::context::session::LlamaStateSeqFlags;

pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
    status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
}

/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LlamaCppError>;

/// All errors that can occur in the llama-cpp crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`]
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    /// Backend device not found
    #[error("Backend device {0} not found")]
    BackendDeviceNotFound(usize),
    /// Max devices exceeded
    #[error("Max devices exceeded. Max devices is {0}")]
    MaxDevicesExceeded(usize),
    /// Failed to convert JSON schema to grammar.
    #[error("JsonSchemaToGrammarError: {0}")]
    JsonSchemaToGrammarError(String),
    /// There was an error fitting model parameters to available memory.
    #[error("{0}")]
    FitError(#[from] crate::model::params::FitError),
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be used with a model whose pooling type is `LLAMA_POOLING_TYPE_NONE`
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// The trigger word contains null bytes
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains null bytes
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar call returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}

/// Convert a non-zero return code from llama.cpp into a [`DecodeError`].
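///
/// A small illustration of the mapping implemented below (`1` and `-1` are the only return
/// codes given dedicated variants; everything else becomes [`DecodeError::Unknown`]):
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// ```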
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert a non-zero return code from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp: {0}")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp: {0}")]
    ErrorResult(i32),
}

/// Get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// Get the max number of devices according to llama.cpp (this is generally the number of CUDA devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// Is memory mapping (mmap) supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// Is memory locking (mlock) supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Convert a JSON schema string into a llama.cpp grammar string.
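///
/// # Example
///
/// A minimal sketch mirroring the unit test below; the exact grammar text depends on the
/// linked llama.cpp version, so only the presence of a `root` rule is checked.
///
/// ```
/// # use llama_cpp_2::json_schema_to_grammar;
/// let schema = r#"{ "type": "object", "properties": { "city": { "type": "string" } } }"#;
/// let grammar = json_schema_to_grammar(schema).unwrap();
/// assert!(grammar.contains("root ::="));
/// ```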
pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
    let schema_cstr = CString::new(schema_json)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
    let mut out = std::ptr::null_mut();
    let rc = unsafe {
        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
    };

    if !status_is_ok(rc) || out.is_null() {
        if !out.is_null() {
            // Free the buffer even on a non-OK status so it cannot leak.
            unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };
        }
        return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
            "ffi error {}",
            rc
        )));
    }

    // Copy the grammar out of the C buffer and free it before the UTF-8 conversion, so the
    // buffer is released even if that conversion fails.
    let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };

    String::from_utf8(grammar_bytes)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))
}

#[cfg(test)]
mod tests {
    use super::json_schema_to_grammar;

    #[test]
    fn json_schema_string_api_returns_grammar() {
        let schema = r#"{
            "type": "object",
            "properties": {
                "city": { "type": "string" },
                "unit": { "enum": ["c", "f"] }
            },
            "required": ["city"]
        }"#;

        let grammar =
            json_schema_to_grammar(schema).expect("string-based schema conversion should succeed");

        assert!(grammar.contains("root ::="));
    }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the template result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
    /// invalid grammar trigger data returned by llama.cpp.
    #[error("invalid grammar trigger data")]
    InvalidGrammarTriggerType,
}

/// Failed to parse a chat response.
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    Utf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the parse result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Failed to accept a token in a sampler.
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU device
    IntegratedGpu,
    /// Unknown device type
    Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
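///
/// The devices returned (and their memory figures) depend on the detected hardware and the
/// backend features the crate was built with, so this sketch only prints whatever is found:
///
/// ```
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// # use llama_cpp_2::list_llama_ggml_backend_devices;
/// let backend = LlamaBackend::init().unwrap();
/// for device in list_llama_ggml_backend_devices() {
///     println!(
///         "{} ({}): {} of {} bytes free",
///         device.name, device.backend, device.memory_free, device.memory_total
///     );
/// }
/// ```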
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    fn cstr_to_string(ptr: *const c_char) -> String {
        if ptr.is_null() {
            String::new()
        } else {
            unsafe { std::ffi::CStr::from_ptr(ptr) }
                .to_string_lossy()
                .to_string()
        }
    }

    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
    #[must_use]
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
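///
/// Typically called once at startup, before initializing the backend (a sketch; whether to
/// enable or suppress the logs is up to the application):
///
/// ```
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// send_logs_to_tracing(LogOptions::default().with_logs_enabled(true));
/// ```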
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and then calls a GGML function, the logs won't be weirdly intermixed: llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}