// llama_cpp_2/lib.rs

//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA gpu support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
use std::ffi::{c_char, CStr, CString, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;
pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod openai;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
    status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
}

pub(crate) fn status_to_i32(status: llama_cpp_sys_2::llama_rs_status) -> i32 {
    status as i32
}

/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LlamaCppError>;

/// All errors that can occur in the llama-cpp crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// see [`EmbeddingsError`]
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    // See [`LlamaSamplerError`]
    /// Backend device not found
    #[error("Backend device {0} not found")]
    BackendDeviceNotFound(usize),
    /// Max devices exceeded
    #[error("Max devices exceeded. Max devices is {0}")]
    MaxDevicesExceeded(usize),
    /// Failed to convert JSON schema to grammar.
    #[error("JsonSchemaToGrammarError: {0}")]
    JsonSchemaToGrammarError(String),
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be used with a model that only supports `LLAMA_POOLING_TYPE_NONE`
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// The trigger word contains null bytes
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains null bytes
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar call returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}

/// Convert an error code from llama.cpp into a [`DecodeError`].
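///
/// The mapping can be exercised directly with hand-constructed codes (in real use the codes
/// come back from llama.cpp's decode call):
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// assert_eq!(DecodeError::from(NonZeroI32::new(7).unwrap()), DecodeError::Unknown(7));
/// ```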
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert an error code from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Convert a JSON schema string into a llama.cpp grammar string.
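///
/// A minimal usage sketch (marked `no_run` since it calls into native llama.cpp code, and the
/// exact grammar text produced for a given schema is not guaranteed by this crate):
///
/// ```no_run
/// # use llama_cpp_2::json_schema_to_grammar;
/// let schema = r#"{"type": "object", "properties": {"answer": {"type": "string"}}}"#;
/// let grammar = json_schema_to_grammar(schema).expect("schema should convert");
/// assert!(!grammar.is_empty());
/// ```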
pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
    let schema_cstr = CString::new(schema_json)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
    let mut out = std::ptr::null_mut();
    let rc = unsafe {
        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
    };

    if !status_is_ok(rc) || out.is_null() {
        return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
            "ffi error {}",
            status_to_i32(rc)
        )));
    }

    let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
    let result = String::from_utf8(grammar_bytes)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()));

    // Free the llama.cpp-allocated string whether or not the UTF-8 conversion succeeded.
    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };
    result
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new llama chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the template result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
    /// invalid grammar trigger data returned by llama.cpp.
    #[error("invalid grammar trigger data")]
    InvalidGrammarTriggerType,
}

/// Failed to parse a chat response.
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    Utf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the parse result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Failed to accept a token in a sampler.
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU device
    IntegratedGpu,
    /// Unknown device type
    Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
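///
/// A sketch of how the returned list might be inspected (marked `no_run`: the device set
/// depends entirely on how llama.cpp was built and what hardware is present):
///
/// ```no_run
/// # use llama_cpp_2::list_llama_ggml_backend_devices;
/// for device in list_llama_ggml_backend_devices() {
///     println!(
///         "{} {} ({}): {} bytes free of {}",
///         device.index, device.name, device.backend, device.memory_free, device.memory_total
///     );
/// }
/// ```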
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        fn cstr_to_string(ptr: *const c_char) -> String {
            if ptr.is_null() {
                String::new()
            } else {
                unsafe { std::ffi::CStr::from_ptr(ptr) }
                    .to_string_lossy()
                    .to_string()
            }
        }
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
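    ///
    /// For example, llama.cpp log output could be silenced entirely like this:
    ///
    /// ```
    /// use llama_cpp_2::LogOptions;
    ///
    /// let _options = LogOptions::default().with_logs_enabled(false);
    /// ```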
    #[must_use]
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
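///
/// A typical call at startup might look like the following (marked `no_run`: it installs a log
/// callback via FFI, and it assumes a `tracing` subscriber is registered separately so the
/// forwarded logs are actually visible):
///
/// ```no_run
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// send_logs_to_tracing(LogOptions::default());
/// ```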
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and calls a GGML function, the logs won't be weirdly intermixed: llama.cpp logs will CONT
    // previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}