Skip to main content

llama_cpp_4/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
11//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
12//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
13//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
14//! - [mtp](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/mtp) — MTP speculative decoding via [`crate::mtp::MtpSession`]
15//!
16//! # Feature Flags
17//!
18//! - `cuda` enables CUDA GPU support.
19//! - `metal` enables Apple Metal GPU support.
20//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
21//! - `native` enables host-CPU optimisations (`-march=native`).
22//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
23//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
24//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
25use std::ffi::NulError;
26use std::fmt::Debug;
27use std::num::NonZeroI32;
28
29use crate::llama_batch::BatchAddError;
30use std::os::raw::c_int;
31use std::path::PathBuf;
32use std::string::FromUtf8Error;
33
34pub mod common;
35pub mod context;
36#[cfg(feature = "ggml")]
37pub mod ggml;
38pub mod llama_backend;
39pub mod llama_batch;
40pub mod model;
41pub mod mtp;
42pub mod quantize;
43pub mod sampling;
44pub mod token;
45pub mod token_type;
46
47#[cfg(feature = "rpc")]
48pub mod rpc;
49
50#[cfg(feature = "mtmd")]
51pub mod mtmd;
52
/// A fallible result from a llama.cpp function, using [`LLamaCppError`] as the error type.
pub type Result<T> = std::result::Result<T, LLamaCppError>;
55
56/// All errors that can occur in the llama-cpp crate.
57#[derive(Debug, Eq, PartialEq, thiserror::Error)]
58pub enum LLamaCppError {
59    /// The backend was already initialized. This can generally be ignored as initializing the backend
60    /// is idempotent.
61    #[error("BackendAlreadyInitialized")]
62    BackendAlreadyInitialized,
63    /// There was an error while get the chat template from model.
64    #[error("{0}")]
65    ChatTemplateError(#[from] ChatTemplateError),
66    /// There was an error while decoding a batch.
67    #[error("{0}")]
68    DecodeError(#[from] DecodeError),
69    /// There was an error while encoding a batch.
70    #[error("{0}")]
71    EncodeError(#[from] EncodeError),
72    /// There was an error loading a model.
73    #[error("{0}")]
74    LlamaModelLoadError(#[from] LlamaModelLoadError),
75    /// There was an error creating a new model context.
76    #[error("{0}")]
77    LlamaContextLoadError(#[from] LlamaContextLoadError),
78    /// There was an error adding a token to a batch.
79    #[error["{0}"]]
80    BatchAddError(#[from] BatchAddError),
81    /// see [`EmbeddingsError`]
82    #[error(transparent)]
83    EmbeddingError(#[from] EmbeddingsError),
84}
85
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The buffer was too small. The payload is the buffer size that would be
    /// just large enough (as reported in the error message).
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The gguf has no chat template. The payload is the returned code.
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    /// The chat template was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
99
/// Error retrieving a string from the model (e.g. description, metadata key/value).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// The C function returned a negative error code; the payload is that code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The returned bytes were not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
110
/// Failed to load (create) a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null instead of a context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
118
/// Failed to decode a batch.
///
/// The numeric code associated with each variant is shown in its error message.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available (code `1`).
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (code `-1`).
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
132
/// Failed to encode a batch.
///
/// The numeric code associated with each variant is shown in its error message.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available (code `1`).
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (code `-1`).
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
146
/// Errors returned when embedding-related functions fail.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options.
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token.
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings were requested, but the model only supports
    /// `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
160
161/// Decode a error from llama.cpp into a [`DecodeError`].
162impl From<NonZeroI32> for DecodeError {
163    fn from(value: NonZeroI32) -> Self {
164        match value.get() {
165            1 => DecodeError::NoKvCacheSlot,
166            -1 => DecodeError::NTokensZero,
167            i => DecodeError::Unknown(i),
168        }
169    }
170}
171
172/// Encode a error from llama.cpp into a [`EncodeError`].
173impl From<NonZeroI32> for EncodeError {
174    fn from(value: NonZeroI32) -> Self {
175        match value.get() {
176            1 => EncodeError::NoKvCacheSlot,
177            -1 => EncodeError::NTokensZero,
178            i => EncodeError::Unknown(i),
179        }
180    }
181}
182
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust `str`, meaning the path was not valid Unicode.
    /// The payload is the offending path.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
196
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust `str`, meaning the path was not valid Unicode.
    /// The payload is the offending path.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
210
211/// An error that can occur when loading a model.
212#[derive(Debug, Eq, PartialEq, thiserror::Error)]
213pub enum LlamaLoraAdapterSetError {
214    /// llama.cpp returned a non-zero error code.
215    #[error("error code from llama cpp")]
216    ErrorResult(i32),
217}
218
219/// An error that can occur when loading a model.
220#[derive(Debug, Eq, PartialEq, thiserror::Error)]
221pub enum LlamaLoraAdapterRemoveError {
222    /// llama.cpp returned a non-zero error code.
223    #[error("error code from llama cpp")]
224    ErrorResult(i32),
225}
226
/// Get the time (in microseconds) according to llama.cpp.
///
/// ```
/// # use llama_cpp_4::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here. NOTE(review): assumes the C function has no
    // initialisation precondition - confirm.
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
237
/// Get the max number of devices according to llama.cpp (this is generally cuda devices).
///
/// ```
/// # use llama_cpp_4::max_devices;
/// let n_devices = max_devices();
/// println!("llama.cpp supports up to {n_devices} devices");
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
248
/// Is memory mapping supported according to llama.cpp?
///
/// ```
/// # use llama_cpp_4::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
261
/// Is memory locking supported according to llama.cpp?
///
/// Calls the same underlying function as [`llama_supports_mlock`].
///
/// ```
/// # use llama_cpp_4::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
274
/// An error that can occur when converting a token to a string.
///
/// Marked `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// The token type was unknown.
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    /// NOTE(review): the payload is presumably the size-related code returned by
    /// llama.cpp - confirm against the call site.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid UTF-8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
289
/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}
300
/// Failed to create a new llama chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
308
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// The buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// The string could not be converted to UTF-8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
322
/// Get the time in microseconds according to ggml.
///
/// ```
/// # use std::time::Duration;
/// use llama_cpp_4::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here. NOTE(review): assumes ggml's timer needs no
    // prior initialisation - confirm.
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}
342
/// Checks if mlock is supported.
///
/// Calls the same underlying function as [`mlock_supported`].
///
/// ```
/// # use llama_cpp_4::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
358
/// Checks if GPU offload is supported.
///
/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
#[must_use]
pub fn supports_gpu_offload() -> bool {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}
366
/// Checks if RPC backend is supported.
///
/// Returns `true` if the library was compiled with RPC support.
#[must_use]
pub fn supports_rpc() -> bool {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}
374
375/// Get system information string.
376///
377/// Returns a string containing CPU features, build info, and other system details.
378///
379/// # Panics
380///
381/// Panics if the returned string is not valid UTF-8.
382#[must_use]
383pub fn print_system_info() -> String {
384    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
385    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
386    c_str
387        .to_str()
388        .expect("system info is not valid UTF-8")
389        .to_owned()
390}
391
/// Get the maximum number of parallel sequences supported.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}
397
/// Get the maximum number of tensor buffer type overrides.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}
403
404/// Get the name of a flash attention type.
405///
406/// # Panics
407///
408/// Panics if the returned string is not valid UTF-8.
409#[must_use]
410pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
411    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
412    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
413    c_str
414        .to_str()
415        .expect("flash_attn_type_name is not valid UTF-8")
416        .to_owned()
417}
418
419/// Get the string representation of a model metadata key.
420///
421/// # Panics
422///
423/// Panics if the returned string is not valid UTF-8.
424#[must_use]
425pub fn model_meta_key_str(key: u32) -> String {
426    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
427    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
428    c_str
429        .to_str()
430        .expect("meta_key_str is not valid UTF-8")
431        .to_owned()
432}
433
434/// Quantize a model file using typed [`QuantizeParams`].
435///
436/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
437/// returned by `llama_model_quantize`.
438///
439/// # Panics
440///
441/// Panics if either path contains an interior null byte.
442///
443/// # Example
444///
445/// ```no_run
446/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
447///
448/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
449///     .with_nthread(8)
450///     .with_quantize_output_tensor(true);
451///
452/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
453/// ```
454pub fn model_quantize(
455    fname_inp: &str,
456    fname_out: &str,
457    params: &quantize::QuantizeParams,
458) -> std::result::Result<(), u32> {
459    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
460    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
461    let guard = params.to_raw();
462    let rc = unsafe {
463        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
464    };
465    if rc == 0 {
466        Ok(())
467    } else {
468        Err(rc)
469    }
470}
471
/// Get default quantization parameters (raw sys type).
///
/// Deprecated: prefer [`quantize::QuantizeParams::new`] for the typed Rust API.
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    // SAFETY: no arguments are passed, so there is no pointer or lifetime
    // invariant to uphold here.
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}
480
481/// Set the log callback.
482///
483/// # Safety
484///
485/// The callback and user data must remain valid for the lifetime of the application
486/// or until the callback is replaced.
487pub unsafe fn log_set(
488    callback: llama_cpp_sys_4::ggml_log_callback,
489    user_data: *mut std::ffi::c_void,
490) {
491    llama_cpp_sys_4::llama_log_set(callback, user_data);
492}
493
494/// Get the current log callback and user data.
495///
496/// # Safety
497///
498/// The caller must ensure the pointers are valid.
499pub unsafe fn log_get(
500    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
501    user_data: *mut *mut std::ffi::c_void,
502) {
503    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
504}
505
506/// Initialize optimizer state for fine-tuning.
507///
508/// # Safety
509///
510/// The context and model must be valid and compatible.
511pub unsafe fn opt_init(
512    ctx: *mut llama_cpp_sys_4::llama_context,
513    model: *mut llama_cpp_sys_4::llama_model,
514    params: llama_cpp_sys_4::llama_opt_params,
515) {
516    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
517}
518
519/// Run one training epoch.
520///
521/// # Safety
522///
523/// All pointers and handles must be valid.
524#[allow(clippy::too_many_arguments)]
525pub unsafe fn opt_epoch(
526    ctx: *mut llama_cpp_sys_4::llama_context,
527    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
528    result_train: llama_cpp_sys_4::ggml_opt_result_t,
529    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
530    idata_split: i64,
531    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
532    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
533) {
534    llama_cpp_sys_4::llama_opt_epoch(
535        ctx,
536        dataset,
537        result_train,
538        result_eval,
539        idata_split,
540        callback_train,
541        callback_eval,
542    );
543}
544
545/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
546///
547/// # Safety
548///
549/// The tensor pointer must be valid.
550pub unsafe fn opt_param_filter_all(
551    tensor: *const llama_cpp_sys_4::ggml_tensor,
552    userdata: *mut std::ffi::c_void,
553) -> bool {
554    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
555}
556
557/// Auto-fit model and context parameters for available memory.
558///
559/// # Safety
560///
561/// All pointers must be valid.
562#[allow(clippy::too_many_arguments)]
563pub unsafe fn params_fit(
564    path_model: *const std::ffi::c_char,
565    mparams: *mut llama_cpp_sys_4::llama_model_params,
566    cparams: *mut llama_cpp_sys_4::llama_context_params,
567    tensor_split: *mut f32,
568    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
569    margins: *mut usize,
570    n_ctx_min: u32,
571    log_level: llama_cpp_sys_4::ggml_log_level,
572) -> llama_cpp_sys_4::common_params_fit_status {
573    llama_cpp_sys_4::common_fit_params(
574        path_model,
575        mparams,
576        cparams,
577        tensor_split,
578        tensor_buft_overrides,
579        margins,
580        n_ctx_min,
581        log_level,
582    )
583}