Skip to main content

llama_cpp_4/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Quick start
9//!
10//! ```no_run
11//! use llama_cpp_4::prelude::*;
12//! use std::num::NonZeroU32;
13//!
14//! fn main() {
15//!     let backend = LlamaBackend::init().unwrap();
16//!     let model = LlamaModel::load_from_file(
17//!         &backend,
18//!         "model.gguf",
19//!         &LlamaModelParams::default(),
20//!     )
21//!     .unwrap();
22//!     let mut ctx = model
23//!         .new_context(
24//!             &backend,
25//!             LlamaContextParams::default().with_n_ctx(NonZeroU32::new(2048)),
26//!         )
27//!         .unwrap();
28//!
29//!     let tokens = model.str_to_token("Hello, world!", AddBos::Always).unwrap();
30//!     let mut batch = LlamaBatch::new(512, 1);
31//!     for (i, &tok) in tokens.iter().enumerate() {
32//!         batch
33//!             .add(tok, i as i32, &[0], i == tokens.len() - 1)
34//!             .unwrap();
35//!     }
36//!     ctx.decode(&mut batch).unwrap();
37//!
38//!     let token = LlamaSampler::greedy().sample(&ctx, 0);
39//!     let _piece = model.token_to_bytes(token, Special::Plaintext).unwrap();
40//! }
41//! ```
42//!
43//! # Examples in this repository
44//!
45//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
46//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
47//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
48//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
49//! - [mtp](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/mtp) — MTP speculative decoding via [`crate::mtp::MtpSession`]
50//! - [eagle](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/eagle) — EAGLE-3 speculative decoding via [`crate::eagle::Eagle3Session`]
51//!
52//! # Advanced: tensor capture
53//!
54//! Use [`TensorCapture`] with [`LlamaContextParams::with_tensor_capture`] to read
55//! per-layer hidden states (or other named graph nodes) during
56//! [`LlamaContext::decode`]. See [`context::tensor_capture`] for a full example.
57//!
58//! # Prelude
59//!
60//! For the types used in most inference programs, import [`prelude`]:
61//!
62//! ```
63//! use llama_cpp_4::prelude::*;
64//! ```
65//!
66//! The same core types are also re-exported at the crate root (e.g.
67//! [`LlamaModel`], [`LlamaBackend`]) so you can pick whichever import style
68//! you prefer. See [`prelude`] for a full list and additional examples (chat,
69//! embeddings, memory estimation).
70//!
71//! # Feature Flags
72//!
73//! - `cuda` enables CUDA GPU support.
74//! - `metal` enables Apple Metal GPU support.
75//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
76//! - `native` enables host-CPU optimisations (`-march=native`).
77//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
78//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
79//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
80use std::ffi::NulError;
81use std::fmt::Debug;
82use std::num::NonZeroI32;
83
84use crate::llama_batch::BatchAddError;
85use std::os::raw::c_int;
86use std::path::PathBuf;
87use std::string::FromUtf8Error;
88
89pub mod common;
90pub mod context;
91pub mod eagle;
92pub mod fit;
93#[cfg(feature = "ggml")]
94pub mod ggml;
95pub mod llama_backend;
96pub mod llama_batch;
97pub mod model;
98pub mod mtp;
99pub mod prelude;
100pub mod quantize;
101pub mod sampling;
102pub mod token;
103pub mod token_type;
104
105#[cfg(feature = "rpc")]
106pub mod rpc;
107
108#[cfg(feature = "mtmd")]
109pub mod mtmd;
110
111/// A failable result from a llama.cpp function.
112pub type Result<T> = std::result::Result<T, LLamaCppError>;
113
114/// All errors that can occur in the llama-cpp crate.
115#[derive(Debug, Eq, PartialEq, thiserror::Error)]
116pub enum LLamaCppError {
117    /// The backend was already initialized. This can generally be ignored as initializing the backend
118    /// is idempotent.
119    #[error("BackendAlreadyInitialized")]
120    BackendAlreadyInitialized,
121    /// There was an error while get the chat template from model.
122    #[error("{0}")]
123    ChatTemplateError(#[from] ChatTemplateError),
124    /// There was an error while decoding a batch.
125    #[error("{0}")]
126    DecodeError(#[from] DecodeError),
127    /// There was an error while encoding a batch.
128    #[error("{0}")]
129    EncodeError(#[from] EncodeError),
130    /// There was an error loading a model.
131    #[error("{0}")]
132    LlamaModelLoadError(#[from] LlamaModelLoadError),
133    /// There was an error creating a new model context.
134    #[error("{0}")]
135    LlamaContextLoadError(#[from] LlamaContextLoadError),
136    /// There was an error adding a token to a batch.
137    #[error["{0}"]]
138    BatchAddError(#[from] BatchAddError),
139    /// see [`EmbeddingsError`]
140    #[error(transparent)]
141    EmbeddingError(#[from] EmbeddingsError),
142}
143
144/// There was an error while getting the chat template from a model.
145#[derive(Debug, Eq, PartialEq, thiserror::Error)]
146pub enum ChatTemplateError {
147    /// the buffer was too small.
148    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
149    BuffSizeError(usize),
150    /// gguf has no chat template
151    #[error("the model has no meta val - returned code {0}")]
152    MissingTemplate(i32),
153    /// The chat template was not valid utf8.
154    #[error(transparent)]
155    Utf8Error(#[from] std::str::Utf8Error),
156}
157
158/// Error retrieving a string from the model (e.g. description, metadata key/value).
159#[derive(Debug, Eq, PartialEq, thiserror::Error)]
160pub enum StringFromModelError {
161    /// The C function returned a negative error code.
162    #[error("llama.cpp returned error code {0}")]
163    ReturnedError(i32),
164    /// The returned bytes were not valid UTF-8.
165    #[error(transparent)]
166    Utf8Error(#[from] std::str::Utf8Error),
167}
168
169/// Failed to Load context
170#[derive(Debug, Eq, PartialEq, thiserror::Error)]
171pub enum LlamaContextLoadError {
172    /// llama.cpp returned null
173    #[error("null reference from llama.cpp")]
174    NullReturn,
175}
176
177/// Failed to decode a batch.
178#[derive(Debug, Eq, PartialEq, thiserror::Error)]
179pub enum DecodeError {
180    /// No kv cache slot was available.
181    #[error("Decode Error 1: NoKvCacheSlot")]
182    NoKvCacheSlot,
183    /// The number of tokens in the batch was 0.
184    #[error("Decode Error -1: n_tokens == 0")]
185    NTokensZero,
186    /// An unknown error occurred.
187    #[error("Decode Error {0}: unknown")]
188    Unknown(c_int),
189}
190
191/// Failed to decode a batch.
192#[derive(Debug, Eq, PartialEq, thiserror::Error)]
193pub enum EncodeError {
194    /// No kv cache slot was available.
195    #[error("Encode Error 1: NoKvCacheSlot")]
196    NoKvCacheSlot,
197    /// The number of tokens in the batch was 0.
198    #[error("Encode Error -1: n_tokens == 0")]
199    NTokensZero,
200    /// An unknown error occurred.
201    #[error("Encode Error {0}: unknown")]
202    Unknown(c_int),
203}
204
205/// When embedding related functions fail
206#[derive(Debug, Eq, PartialEq, thiserror::Error)]
207pub enum EmbeddingsError {
208    /// Embeddings weren't enabled in the context options
209    #[error("Embeddings weren't enabled in the context options")]
210    NotEnabled,
211    /// Logits weren't enabled for the given token
212    #[error("Logits were not enabled for the given token")]
213    LogitsNotEnabled,
214    /// The given sequence index exceeds the max sequence id
215    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
216    NonePoolType,
217}
218
219/// Decode a error from llama.cpp into a [`DecodeError`].
220impl From<NonZeroI32> for DecodeError {
221    fn from(value: NonZeroI32) -> Self {
222        match value.get() {
223            1 => DecodeError::NoKvCacheSlot,
224            -1 => DecodeError::NTokensZero,
225            i => DecodeError::Unknown(i),
226        }
227    }
228}
229
230/// Encode a error from llama.cpp into a [`EncodeError`].
231impl From<NonZeroI32> for EncodeError {
232    fn from(value: NonZeroI32) -> Self {
233        match value.get() {
234            1 => EncodeError::NoKvCacheSlot,
235            -1 => EncodeError::NTokensZero,
236            i => EncodeError::Unknown(i),
237        }
238    }
239}
240
241/// An error that can occur when loading a model.
242#[derive(Debug, Eq, PartialEq, thiserror::Error)]
243pub enum LlamaModelLoadError {
244    /// There was a null byte in a provided string and thus it could not be converted to a C string.
245    #[error("null byte in string {0}")]
246    NullError(#[from] NulError),
247    /// llama.cpp returned a nullptr - this could be many different causes.
248    #[error("null result from llama cpp")]
249    NullResult,
250    /// Failed to convert the path to a rust str. This means the path was not valid unicode
251    #[error("failed to convert path {0} to str")]
252    PathToStrError(PathBuf),
253}
254
255/// An error that can occur when loading a model.
256#[derive(Debug, Eq, PartialEq, thiserror::Error)]
257pub enum LlamaLoraAdapterInitError {
258    /// There was a null byte in a provided string and thus it could not be converted to a C string.
259    #[error("null byte in string {0}")]
260    NullError(#[from] NulError),
261    /// llama.cpp returned a nullptr - this could be many different causes.
262    #[error("null result from llama cpp")]
263    NullResult,
264    /// Failed to convert the path to a rust str. This means the path was not valid unicode
265    #[error("failed to convert path {0} to str")]
266    PathToStrError(PathBuf),
267}
268
269/// An error that can occur when loading a model.
270#[derive(Debug, Eq, PartialEq, thiserror::Error)]
271pub enum LlamaLoraAdapterSetError {
272    /// llama.cpp returned a non-zero error code.
273    #[error("error code from llama cpp")]
274    ErrorResult(i32),
275}
276
277/// An error that can occur when loading a model.
278#[derive(Debug, Eq, PartialEq, thiserror::Error)]
279pub enum LlamaLoraAdapterRemoveError {
280    /// llama.cpp returned a non-zero error code.
281    #[error("error code from llama cpp")]
282    ErrorResult(i32),
283}
284
285/// get the time (in microseconds) according to llama.cpp
286/// ```
287/// # use llama_cpp_4::llama_time_us;
288/// let time = llama_time_us();
289/// assert!(time > 0);
290/// ```
291#[must_use]
292pub fn llama_time_us() -> i64 {
293    unsafe { llama_cpp_sys_4::llama_time_us() }
294}
295
296/// get the max number of devices according to llama.cpp (this is generally cuda devices)
297/// ```
298/// # use llama_cpp_4::max_devices;
299/// let max_devices = max_devices();
300/// assert!(max_devices >= 0);
301/// ```
302#[must_use]
303pub fn max_devices() -> usize {
304    unsafe { llama_cpp_sys_4::llama_max_devices() }
305}
306
307/// is memory mapping supported according to llama.cpp
308/// ```
309/// # use llama_cpp_4::mmap_supported;
310/// let mmap_supported = mmap_supported();
311/// if mmap_supported {
312///   println!("mmap_supported!");
313/// }
314/// ```
315#[must_use]
316pub fn mmap_supported() -> bool {
317    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
318}
319
320/// is memory locking supported according to llama.cpp
321/// ```
322/// # use llama_cpp_4::mlock_supported;
323/// let mlock_supported = mlock_supported();
324/// if mlock_supported {
325///    println!("mlock_supported!");
326/// }
327/// ```
328#[must_use]
329pub fn mlock_supported() -> bool {
330    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
331}
332
333/// An error that can occur when converting a token to a string.
334#[derive(Debug, thiserror::Error, Clone)]
335#[non_exhaustive]
336pub enum TokenToStringError {
337    /// the token type was unknown
338    #[error("Unknown Token Type")]
339    UnknownTokenType,
340    /// There was insufficient buffer space to convert the token to a string.
341    #[error("Insufficient Buffer Space {0}")]
342    InsufficientBufferSpace(c_int),
343    /// The token was not valid utf8.
344    #[error("FromUtf8Error {0}")]
345    FromUtf8Error(#[from] FromUtf8Error),
346}
347
348/// Failed to convert a string to a token sequence.
349#[derive(Debug, thiserror::Error)]
350pub enum StringToTokenError {
351    /// the string contained a null byte and thus could not be converted to a c string.
352    #[error("{0}")]
353    NulError(#[from] NulError),
354    #[error("{0}")]
355    /// Failed to convert a provided integer to a [`c_int`].
356    CIntConversionError(#[from] std::num::TryFromIntError),
357}
358
359/// Failed to apply model chat template.
360#[derive(Debug, thiserror::Error)]
361pub enum NewLlamaChatMessageError {
362    /// the string contained a null byte and thus could not be converted to a c string.
363    #[error("{0}")]
364    NulError(#[from] NulError),
365}
366
367/// Failed to apply model chat template.
368#[derive(Debug, thiserror::Error)]
369pub enum ApplyChatTemplateError {
370    /// the buffer was too small.
371    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
372    BuffSizeError,
373    /// the string contained a null byte and thus could not be converted to a c string.
374    #[error("{0}")]
375    NulError(#[from] NulError),
376    /// the string could not be converted to utf8.
377    #[error("{0}")]
378    FromUtf8Error(#[from] FromUtf8Error),
379}
380
381/// Get the time in microseconds according to ggml
382///
383/// ```
384/// # use std::time::Duration;
385/// use llama_cpp_4::ggml_time_us;
386///
387/// let start = ggml_time_us();
388///
389/// std::thread::sleep(Duration::from_micros(10));
390///
391/// let end = ggml_time_us();
392///
393/// let elapsed = end - start;
394///
395/// assert!(elapsed >= 10)
396#[must_use]
397pub fn ggml_time_us() -> i64 {
398    unsafe { llama_cpp_sys_4::ggml_time_us() }
399}
400
401/// Checks if mlock is supported.
402///
403/// ```
404/// # use llama_cpp_4::llama_supports_mlock;
405///
406/// if llama_supports_mlock() {
407///   println!("mlock is supported!");
408/// } else {
409///   println!("mlock is not supported!");
410/// }
411/// ```
412#[must_use]
413pub fn llama_supports_mlock() -> bool {
414    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
415}
416
417/// Checks if GPU offload is supported.
418///
419/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
420#[must_use]
421pub fn supports_gpu_offload() -> bool {
422    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
423}
424
425/// Checks if RPC backend is supported.
426///
427/// Returns `true` if the library was compiled with RPC support.
428#[must_use]
429pub fn supports_rpc() -> bool {
430    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
431}
432
433/// Get system information string.
434///
435/// Returns a string containing CPU features, build info, and other system details.
436///
437/// # Panics
438///
439/// Panics if the returned string is not valid UTF-8.
440#[must_use]
441pub fn print_system_info() -> String {
442    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
443    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
444    c_str
445        .to_str()
446        .expect("system info is not valid UTF-8")
447        .to_owned()
448}
449
450/// Get the maximum number of parallel sequences supported.
451#[must_use]
452pub fn max_parallel_sequences() -> usize {
453    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
454}
455
456/// Get the maximum number of tensor buffer type overrides.
457#[must_use]
458pub fn max_tensor_buft_overrides() -> usize {
459    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
460}
461
462/// Get the name of a flash attention type.
463///
464/// # Panics
465///
466/// Panics if the returned string is not valid UTF-8.
467#[must_use]
468pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
469    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
470    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
471    c_str
472        .to_str()
473        .expect("flash_attn_type_name is not valid UTF-8")
474        .to_owned()
475}
476
477/// Get the string representation of a model metadata key.
478///
479/// # Panics
480///
481/// Panics if the returned string is not valid UTF-8.
482#[must_use]
483pub fn model_meta_key_str(key: u32) -> String {
484    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key) };
485    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
486    c_str
487        .to_str()
488        .expect("meta_key_str is not valid UTF-8")
489        .to_owned()
490}
491
492/// Quantize a model file using typed [`crate::quantize::QuantizeParams`].
493///
494/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
495/// returned by `llama_model_quantize`.
496///
497/// # Panics
498///
499/// Panics if either path contains an interior null byte.
500///
501/// # Errors
502///
503/// Returns `Err(code)` with the non-zero status code from `llama_model_quantize`
504/// when quantization fails.
505///
506/// # Example
507///
508/// ```no_run
509/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
510///
511/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
512///     .with_nthread(8)
513///     .with_quantize_output_tensor(true);
514///
515/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
516/// ```
517pub fn model_quantize(
518    fname_inp: &str,
519    fname_out: &str,
520    params: &quantize::QuantizeParams,
521) -> std::result::Result<(), u32> {
522    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
523    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
524    let guard = params.to_raw();
525    let rc = unsafe {
526        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
527    };
528    if rc == 0 {
529        Ok(())
530    } else {
531        Err(rc)
532    }
533}
534
535/// Set the log callback.
536///
537/// # Safety
538///
539/// The callback and user data must remain valid for the lifetime of the application
540/// or until the callback is replaced.
541pub unsafe fn log_set(
542    callback: llama_cpp_sys_4::ggml_log_callback,
543    user_data: *mut std::ffi::c_void,
544) {
545    llama_cpp_sys_4::llama_log_set(callback, user_data);
546}
547
548/// Get the current log callback and user data.
549///
550/// # Safety
551///
552/// The caller must ensure the pointers are valid.
553pub unsafe fn log_get(
554    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
555    user_data: *mut *mut std::ffi::c_void,
556) {
557    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
558}
559
560/// Initialize optimizer state for fine-tuning.
561///
562/// # Safety
563///
564/// The context and model must be valid and compatible.
565pub unsafe fn opt_init(
566    ctx: *mut llama_cpp_sys_4::llama_context,
567    model: *mut llama_cpp_sys_4::llama_model,
568    params: llama_cpp_sys_4::llama_opt_params,
569) {
570    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
571}
572
573/// Run one training epoch.
574///
575/// # Safety
576///
577/// All pointers and handles must be valid.
578#[allow(clippy::too_many_arguments)]
579pub unsafe fn opt_epoch(
580    ctx: *mut llama_cpp_sys_4::llama_context,
581    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
582    result_train: llama_cpp_sys_4::ggml_opt_result_t,
583    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
584    idata_split: i64,
585    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
586    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
587) {
588    llama_cpp_sys_4::llama_opt_epoch(
589        ctx,
590        dataset,
591        result_train,
592        result_eval,
593        idata_split,
594        callback_train,
595        callback_eval,
596    );
597}
598
599/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
600///
601/// # Safety
602///
603/// The tensor pointer must be valid.
604pub unsafe fn opt_param_filter_all(
605    tensor: *const llama_cpp_sys_4::ggml_tensor,
606    userdata: *mut std::ffi::c_void,
607) -> bool {
608    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
609}
610
611// ── Crate-root re-exports (see also [`prelude`]) ────────────────────────────
612//
613// These mirror the most common [`prelude`] exports so callers can write
614// `llama_cpp_4::LlamaModel` without a glob import.
615
616/// Parameters used when creating a context.
617pub use context::params::LlamaContextParams;
618/// One captured intermediate tensor from [`TensorCapture`].
619pub use context::CapturedTensor;
620/// An inference context tied to a model.
621pub use context::LlamaContext;
622/// Per-buffer memory usage entry from [`LlamaContext::memory_breakdown`].
623pub use context::MemoryBreakdownEntry;
624/// Hook `cb_eval` during decode to copy named graph tensors (layer hidden states, …).
625pub use context::TensorCapture;
626/// Initialise the llama.cpp backend and hardware drivers.
627pub use llama_backend::LlamaBackend;
628/// Micro-batch submitted to [`LlamaContext::decode`].
629pub use llama_batch::LlamaBatch;
630/// Parameters used when loading a model.
631pub use model::params::LlamaModelParams;
632/// Controls whether tokenisation prepends a BOS token.
633pub use model::AddBos;
634/// A loaded GGUF model.
635pub use model::LlamaModel;
636/// Controls how special tokens are rendered as text.
637pub use model::Special;
638/// Sampler chain for token selection.
639pub use sampling::LlamaSampler;
640/// A single vocabulary token id.
641pub use token::LlamaToken;