//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast-moving target, this crate does not attempt to create a stable API
//! with all the usual Rust idioms. Instead, it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with changes in llama.cpp, but it does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//! - [tools](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/tools)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more Rust-idiomatic way of sampling.
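//!
//! # Getting Started
//!
//! A minimal, illustrative sketch of initializing the backend (see the linked examples for full
//! model loading and inference):
//!
//! ```no_run
//! use llama_cpp_2::llama_backend::LlamaBackend;
//!
//! // The llama.cpp backend is typically initialized once, before loading models or creating contexts.
//! let backend = LlamaBackend::init().expect("failed to initialize llama.cpp backend");
//! ```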
use std::ffi::{c_char, CStr, CString, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod gguf;
pub mod llama_backend;
pub mod llama_batch;
#[cfg(feature = "llguidance")]
pub(crate) mod llguidance_sampler;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod openai;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

pub use crate::context::session::LlamaStateSeqFlags;

pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
    status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
}

/// A fallible result from a llama.cpp function.
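///
/// A minimal sketch of handling the error side (the variants shown are defined in this module):
///
/// ```
/// use llama_cpp_2::LlamaCppError;
///
/// fn describe(result: llama_cpp_2::Result<()>) -> &'static str {
///     match result {
///         Ok(()) => "ok",
///         Err(LlamaCppError::BackendAlreadyInitialized) => "backend already initialized",
///         Err(_) => "some other llama.cpp error",
///     }
/// }
///
/// assert_eq!(describe(Ok(())), "ok");
/// ```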
pub type Result<T> = std::result::Result<T, LlamaCppError>;

/// All errors that can occur in the llama-cpp crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`].
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    /// Backend device not found
    #[error("Backend device {0} not found")]
    BackendDeviceNotFound(usize),
    /// Max devices exceeded
    #[error("Max devices exceeded. Max devices is {0}")]
    MaxDevicesExceeded(usize),
    /// Failed to convert JSON schema to grammar.
    #[error("JsonSchemaToGrammarError: {0}")]
    JsonSchemaToGrammarError(String),
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null-byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding-related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings are unavailable because the model only supports LLAMA_POOLING_TYPE_NONE
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Errors that can occur when initializing a grammar sampler
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
    /// The grammar root was not found in the grammar string
    #[error("Grammar root not found in grammar string")]
    RootNotFound,
    /// The trigger word contains null bytes
    #[error("Trigger word contains null bytes")]
    TriggerWordNullBytes,
    /// The grammar string or root contains null bytes
    #[error("Grammar string or root contains null bytes")]
    GrammarNullBytes,
    /// The grammar call returned null
    #[error("Grammar call returned null")]
    NullGrammar,
}

/// Convert an error code from llama.cpp into a [`DecodeError`].
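///
/// A small illustration of the mapping (these are the same return codes documented on the variants):
///
/// ```
/// use std::num::NonZeroI32;
/// use llama_cpp_2::DecodeError;
///
/// // A return code of 1 means no KV cache slot was available; -1 means the batch was empty.
/// assert_eq!(DecodeError::from(NonZeroI32::new(1).unwrap()), DecodeError::NoKvCacheSlot);
/// assert_eq!(DecodeError::from(NonZeroI32::new(-1).unwrap()), DecodeError::NTokensZero);
/// ```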
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Convert an error code from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust str. This means the path was not valid Unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust str. This means the path was not valid Unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally the number of CUDA devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///     println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///     println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Convert a JSON schema string into a llama.cpp grammar string.
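///
/// An illustrative example (the exact grammar text depends on the linked llama.cpp version):
///
/// ```no_run
/// use llama_cpp_2::json_schema_to_grammar;
///
/// // Illustrative schema; any valid JSON schema string can be passed.
/// let schema = r#"{"type": "object", "properties": {"name": {"type": "string"}}}"#;
/// let grammar = json_schema_to_grammar(schema).expect("schema should convert");
/// assert!(grammar.contains("root ::="));
/// ```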
pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
    let schema_cstr = CString::new(schema_json)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
    let mut out = std::ptr::null_mut();
    let rc = unsafe {
        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
    };

    if !status_is_ok(rc) || out.is_null() {
        return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
            "ffi error {}",
            rc
        )));
    }

    // Copy the grammar out of the C buffer, then free the buffer before the UTF-8 conversion so it
    // is released even if the conversion fails.
    let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };

    String::from_utf8(grammar_bytes)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))
}

#[cfg(test)]
mod tests {
    use super::json_schema_to_grammar;

    #[test]
    fn json_schema_string_api_returns_grammar() {
        let schema = r#"{
            "type": "object",
            "properties": {
                "city": { "type": "string" },
                "unit": { "enum": ["c", "f"] }
            },
            "required": ["city"]
        }"#;

        let grammar =
            json_schema_to_grammar(schema).expect("string-based schema conversion should succeed");

        assert!(grammar.contains("root ::="));
    }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the template result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
    /// invalid grammar trigger data returned by llama.cpp.
    #[error("invalid grammar trigger data")]
    InvalidGrammarTriggerType,
}

/// Failed to parse a chat response.
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    Utf8Error(#[from] FromUtf8Error),
    /// llama.cpp returned a null pointer for the parse result.
    #[error("null result from llama.cpp")]
    NullResult,
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Failed to accept a token in a sampler.
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
    /// llama.cpp returned an error code.
    #[error("ffi error {0}")]
    FfiError(i32),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
    /// CPU device
    Cpu,
    /// ACCEL device
    Accelerator,
    /// GPU device
    Gpu,
    /// iGPU device
    IntegratedGpu,
    /// Unknown device type
    Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
    /// The index of the device
    ///
    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
    pub index: usize,
    /// The name of the device (e.g. "Vulkan0")
    pub name: String,
    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
    pub description: String,
    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
    pub backend: String,
    /// Total memory of the device in bytes
    pub memory_total: usize,
    /// Free memory of the device in bytes
    pub memory_free: usize,
    /// Device type
    pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
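///
/// An illustrative sketch of enumerating devices (the exact set depends on the backends compiled in):
///
/// ```no_run
/// use llama_cpp_2::list_llama_ggml_backend_devices;
///
/// for device in list_llama_ggml_backend_devices() {
///     println!(
///         "{} ({}): {} / {} bytes free",
///         device.name, device.backend, device.memory_free, device.memory_total
///     );
/// }
/// ```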
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
        fn cstr_to_string(ptr: *const c_char) -> String {
            if ptr.is_null() {
                String::new()
            } else {
                unsafe { std::ffi::CStr::from_ptr(ptr) }
                    .to_string_lossy()
                    .to_string()
            }
        }
        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
        let props = unsafe {
            let mut props = std::mem::zeroed();
            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
            props
        };
        let name = cstr_to_string(props.name);
        let description = cstr_to_string(props.description);
        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
        let backend = cstr_to_string(backend_name);
        let memory_total = props.memory_total;
        let memory_free = props.memory_free;
        let device_type = match props.type_ {
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
            _ => LlamaBackendDeviceType::Unknown,
        };
        devices.push(LlamaBackendDevice {
            index: i,
            name,
            description,
            backend,
            memory_total,
            memory_free,
            device_type,
        });
    }
    devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
    /// logs to be sent to tracing.
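    ///
    /// A small illustrative example of suppressing all llama.cpp / ggml log output:
    ///
    /// ```no_run
    /// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
    ///
    /// send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));
    /// ```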
    #[must_use]
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
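///
/// An illustrative sketch of wiring llama.cpp / ggml logs into whatever `tracing` subscriber the
/// application has installed:
///
/// ```no_run
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// send_logs_to_tracing(LogOptions::default());
/// ```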
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and calls a GGML function, the logs won't be weirdly intermixed; instead, llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}