Skip to main content

llama_cpp_4/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
11//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
12//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
13//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
14//! - [mtp](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/mtp) — MTP speculative decoding via [`crate::mtp::MtpSession`]
15//! - [eagle](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/eagle) — EAGLE-3 speculative decoding via [`crate::eagle::Eagle3Session`]
16//!
17//! # Feature Flags
18//!
19//! - `cuda` enables CUDA GPU support.
20//! - `metal` enables Apple Metal GPU support.
21//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
22//! - `native` enables host-CPU optimisations (`-march=native`).
23//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
24//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
25//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
26use std::ffi::NulError;
27use std::fmt::Debug;
28use std::num::NonZeroI32;
29
30use crate::llama_batch::BatchAddError;
31use std::os::raw::c_int;
32use std::path::PathBuf;
33use std::string::FromUtf8Error;
34
35pub mod common;
36pub mod context;
37pub mod eagle;
38#[cfg(feature = "ggml")]
39pub mod ggml;
40pub mod llama_backend;
41pub mod llama_batch;
42pub mod model;
43pub mod mtp;
44pub mod quantize;
45pub mod sampling;
46pub mod token;
47pub mod token_type;
48
49#[cfg(feature = "rpc")]
50pub mod rpc;
51
52#[cfg(feature = "mtmd")]
53pub mod mtmd;
54
/// A fallible result from a llama.cpp function.
///
/// Shorthand for [`std::result::Result`] with the error type fixed to [`LLamaCppError`].
pub type Result<T> = std::result::Result<T, LLamaCppError>;
57
58/// All errors that can occur in the llama-cpp crate.
59#[derive(Debug, Eq, PartialEq, thiserror::Error)]
60pub enum LLamaCppError {
61    /// The backend was already initialized. This can generally be ignored as initializing the backend
62    /// is idempotent.
63    #[error("BackendAlreadyInitialized")]
64    BackendAlreadyInitialized,
65    /// There was an error while get the chat template from model.
66    #[error("{0}")]
67    ChatTemplateError(#[from] ChatTemplateError),
68    /// There was an error while decoding a batch.
69    #[error("{0}")]
70    DecodeError(#[from] DecodeError),
71    /// There was an error while encoding a batch.
72    #[error("{0}")]
73    EncodeError(#[from] EncodeError),
74    /// There was an error loading a model.
75    #[error("{0}")]
76    LlamaModelLoadError(#[from] LlamaModelLoadError),
77    /// There was an error creating a new model context.
78    #[error("{0}")]
79    LlamaContextLoadError(#[from] LlamaContextLoadError),
80    /// There was an error adding a token to a batch.
81    #[error["{0}"]]
82    BatchAddError(#[from] BatchAddError),
83    /// see [`EmbeddingsError`]
84    #[error(transparent)]
85    EmbeddingError(#[from] EmbeddingsError),
86}
87
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The buffer was too small; the payload is the buffer size that would be just large enough.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The GGUF has no chat template; the payload is the code that was returned.
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    /// The chat template was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
101
/// Error retrieving a string from the model (e.g. description, metadata key/value).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// The C function returned a negative error code; the payload is that code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The returned bytes were not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
112
/// Failed to load (create) a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer instead of a context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
120
/// Failed to decode a batch.
///
/// Built from a non-zero llama.cpp return code via `From<NonZeroI32>`.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available (return code `1`).
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code `-1`).
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
134
/// Failed to encode a batch.
///
/// Built from a non-zero llama.cpp return code via `From<NonZeroI32>`.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available (return code `1`).
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code `-1`).
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
148
/// Errors returned when embedding-related functions fail.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings were requested from a model that only supports
    /// `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
162
163/// Decode a error from llama.cpp into a [`DecodeError`].
164impl From<NonZeroI32> for DecodeError {
165    fn from(value: NonZeroI32) -> Self {
166        match value.get() {
167            1 => DecodeError::NoKvCacheSlot,
168            -1 => DecodeError::NTokensZero,
169            i => DecodeError::Unknown(i),
170        }
171    }
172}
173
174/// Encode a error from llama.cpp into a [`EncodeError`].
175impl From<NonZeroI32> for EncodeError {
176    fn from(value: NonZeroI32) -> Self {
177        match value.get() {
178            1 => EncodeError::NoKvCacheSlot,
179            -1 => EncodeError::NTokensZero,
180            i => EncodeError::Unknown(i),
181        }
182    }
183}
184
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust `str`, meaning the path was not valid Unicode.
    /// The payload is the offending path.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
198
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust `str`, meaning the path was not valid Unicode.
    /// The payload is the offending path.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
212
/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code; the payload is that code.
    // NOTE(review): the Display message does not include the wrapped code; read it from the payload.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
220
/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code; the payload is that code.
    // NOTE(review): the Display message does not include the wrapped code; read it from the payload.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}
228
/// Get the time (in microseconds) according to llama.cpp.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_time_us`.
///
/// ```
/// # use llama_cpp_4::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
239
/// Get the max number of devices according to llama.cpp (this is generally cuda devices).
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_devices`.
///
/// ```
/// # use llama_cpp_4::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
250
/// Is memory mapping supported according to llama.cpp?
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mmap`.
///
/// ```
/// # use llama_cpp_4::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
263
/// Is memory locking supported according to llama.cpp?
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mlock`; [`llama_supports_mlock`]
/// calls the same function.
///
/// ```
/// # use llama_cpp_4::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
276
/// An error that can occur when converting a token to a string.
///
/// Marked `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// The token type was unknown.
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    /// The payload is the integer reported alongside the failure.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid UTF-8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
291
/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}
302
/// Failed to create a new llama chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
310
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// The buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// The string could not be converted to UTF-8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
324
/// Get the time in microseconds according to ggml.
///
/// Thin wrapper around `llama_cpp_sys_4::ggml_time_us`.
///
/// ```
/// # use std::time::Duration;
/// use llama_cpp_4::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}
344
/// Checks if mlock is supported.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mlock`; [`mlock_supported`]
/// calls the same function.
///
/// ```
/// # use llama_cpp_4::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
360
/// Checks if GPU offload is supported.
///
/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_gpu_offload`.
#[must_use]
pub fn supports_gpu_offload() -> bool {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}
368
/// Checks if RPC backend is supported.
///
/// Returns `true` if the library was compiled with RPC support.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_rpc`.
#[must_use]
pub fn supports_rpc() -> bool {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}
376
377/// Get system information string.
378///
379/// Returns a string containing CPU features, build info, and other system details.
380///
381/// # Panics
382///
383/// Panics if the returned string is not valid UTF-8.
384#[must_use]
385pub fn print_system_info() -> String {
386    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
387    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
388    c_str
389        .to_str()
390        .expect("system info is not valid UTF-8")
391        .to_owned()
392}
393
/// Get the maximum number of parallel sequences supported.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_parallel_sequences`.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}
399
/// Get the maximum number of tensor buffer type overrides.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_tensor_buft_overrides`.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}
405
406/// Get the name of a flash attention type.
407///
408/// # Panics
409///
410/// Panics if the returned string is not valid UTF-8.
411#[must_use]
412pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
413    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
414    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
415    c_str
416        .to_str()
417        .expect("flash_attn_type_name is not valid UTF-8")
418        .to_owned()
419}
420
421/// Get the string representation of a model metadata key.
422///
423/// # Panics
424///
425/// Panics if the returned string is not valid UTF-8.
426#[must_use]
427pub fn model_meta_key_str(key: u32) -> String {
428    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
429    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
430    c_str
431        .to_str()
432        .expect("meta_key_str is not valid UTF-8")
433        .to_owned()
434}
435
436/// Quantize a model file using typed [`QuantizeParams`].
437///
438/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
439/// returned by `llama_model_quantize`.
440///
441/// # Panics
442///
443/// Panics if either path contains an interior null byte.
444///
445/// # Example
446///
447/// ```no_run
448/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
449///
450/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
451///     .with_nthread(8)
452///     .with_quantize_output_tensor(true);
453///
454/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
455/// ```
456pub fn model_quantize(
457    fname_inp: &str,
458    fname_out: &str,
459    params: &quantize::QuantizeParams,
460) -> std::result::Result<(), u32> {
461    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
462    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
463    let guard = params.to_raw();
464    let rc = unsafe {
465        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
466    };
467    if rc == 0 {
468        Ok(())
469    } else {
470        Err(rc)
471    }
472}
473
/// Get default quantization parameters (raw sys type).
///
/// Thin wrapper around `llama_cpp_sys_4::llama_model_quantize_default_params`.
/// Deprecated since 0.2.19: prefer [`quantize::QuantizeParams::new`] for the typed Rust API.
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    // The FFI call takes no arguments, so no Rust-owned memory crosses the boundary.
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}
482
483/// Set the log callback.
484///
485/// # Safety
486///
487/// The callback and user data must remain valid for the lifetime of the application
488/// or until the callback is replaced.
489pub unsafe fn log_set(
490    callback: llama_cpp_sys_4::ggml_log_callback,
491    user_data: *mut std::ffi::c_void,
492) {
493    llama_cpp_sys_4::llama_log_set(callback, user_data);
494}
495
496/// Get the current log callback and user data.
497///
498/// # Safety
499///
500/// The caller must ensure the pointers are valid.
501pub unsafe fn log_get(
502    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
503    user_data: *mut *mut std::ffi::c_void,
504) {
505    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
506}
507
508/// Initialize optimizer state for fine-tuning.
509///
510/// # Safety
511///
512/// The context and model must be valid and compatible.
513pub unsafe fn opt_init(
514    ctx: *mut llama_cpp_sys_4::llama_context,
515    model: *mut llama_cpp_sys_4::llama_model,
516    params: llama_cpp_sys_4::llama_opt_params,
517) {
518    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
519}
520
521/// Run one training epoch.
522///
523/// # Safety
524///
525/// All pointers and handles must be valid.
526#[allow(clippy::too_many_arguments)]
527pub unsafe fn opt_epoch(
528    ctx: *mut llama_cpp_sys_4::llama_context,
529    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
530    result_train: llama_cpp_sys_4::ggml_opt_result_t,
531    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
532    idata_split: i64,
533    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
534    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
535) {
536    llama_cpp_sys_4::llama_opt_epoch(
537        ctx,
538        dataset,
539        result_train,
540        result_eval,
541        idata_split,
542        callback_train,
543        callback_eval,
544    );
545}
546
547/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
548///
549/// # Safety
550///
551/// The tensor pointer must be valid.
552pub unsafe fn opt_param_filter_all(
553    tensor: *const llama_cpp_sys_4::ggml_tensor,
554    userdata: *mut std::ffi::c_void,
555) -> bool {
556    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
557}
558
559/// Auto-fit model and context parameters for available memory.
560///
561/// # Safety
562///
563/// All pointers must be valid.
564#[allow(clippy::too_many_arguments)]
565pub unsafe fn params_fit(
566    path_model: *const std::ffi::c_char,
567    mparams: *mut llama_cpp_sys_4::llama_model_params,
568    cparams: *mut llama_cpp_sys_4::llama_context_params,
569    tensor_split: *mut f32,
570    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
571    margins: *mut usize,
572    n_ctx_min: u32,
573    log_level: llama_cpp_sys_4::ggml_log_level,
574) -> llama_cpp_sys_4::common_params_fit_status {
575    llama_cpp_sys_4::common_fit_params(
576        path_model,
577        mparams,
578        cparams,
579        tensor_split,
580        tensor_buft_overrides,
581        margins,
582        n_ctx_min,
583        log_level,
584    )
585}