Skip to main content

llama_cpp_4/
lib.rs

1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
11//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
12//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
13//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
14//!
15//! # Feature Flags
16//!
17//! - `cuda` enables CUDA GPU support.
18//! - `metal` enables Apple Metal GPU support.
19//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
20//! - `native` enables host-CPU optimisations (`-march=native`).
21//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
22//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
23//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
24use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
33pub mod common;
34pub mod context;
35#[cfg(feature = "ggml")]
36pub mod ggml;
37pub mod llama_backend;
38pub mod llama_batch;
39pub mod model;
40pub mod mtp;
41pub mod quantize;
42pub mod sampling;
43pub mod token;
44pub mod token_type;
45
46#[cfg(feature = "rpc")]
47pub mod rpc;
48
49#[cfg(feature = "mtmd")]
50pub mod mtmd;
51
52/// A failable result from a llama.cpp function.
53pub type Result<T> = std::result::Result<T, LLamaCppError>;
54
55/// All errors that can occur in the llama-cpp crate.
56#[derive(Debug, Eq, PartialEq, thiserror::Error)]
57pub enum LLamaCppError {
58    /// The backend was already initialized. This can generally be ignored as initializing the backend
59    /// is idempotent.
60    #[error("BackendAlreadyInitialized")]
61    BackendAlreadyInitialized,
62    /// There was an error while get the chat template from model.
63    #[error("{0}")]
64    ChatTemplateError(#[from] ChatTemplateError),
65    /// There was an error while decoding a batch.
66    #[error("{0}")]
67    DecodeError(#[from] DecodeError),
68    /// There was an error while encoding a batch.
69    #[error("{0}")]
70    EncodeError(#[from] EncodeError),
71    /// There was an error loading a model.
72    #[error("{0}")]
73    LlamaModelLoadError(#[from] LlamaModelLoadError),
74    /// There was an error creating a new model context.
75    #[error("{0}")]
76    LlamaContextLoadError(#[from] LlamaContextLoadError),
77    /// There was an error adding a token to a batch.
78    #[error["{0}"]]
79    BatchAddError(#[from] BatchAddError),
80    /// see [`EmbeddingsError`]
81    #[error(transparent)]
82    EmbeddingError(#[from] EmbeddingsError),
83}
84
85/// There was an error while getting the chat template from a model.
86#[derive(Debug, Eq, PartialEq, thiserror::Error)]
87pub enum ChatTemplateError {
88    /// the buffer was too small.
89    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
90    BuffSizeError(usize),
91    /// gguf has no chat template
92    #[error("the model has no meta val - returned code {0}")]
93    MissingTemplate(i32),
94    /// The chat template was not valid utf8.
95    #[error(transparent)]
96    Utf8Error(#[from] std::str::Utf8Error),
97}
98
99/// Error retrieving a string from the model (e.g. description, metadata key/value).
100#[derive(Debug, Eq, PartialEq, thiserror::Error)]
101pub enum StringFromModelError {
102    /// The C function returned a negative error code.
103    #[error("llama.cpp returned error code {0}")]
104    ReturnedError(i32),
105    /// The returned bytes were not valid UTF-8.
106    #[error(transparent)]
107    Utf8Error(#[from] std::str::Utf8Error),
108}
109
110/// Failed to Load context
111#[derive(Debug, Eq, PartialEq, thiserror::Error)]
112pub enum LlamaContextLoadError {
113    /// llama.cpp returned null
114    #[error("null reference from llama.cpp")]
115    NullReturn,
116}
117
118/// Failed to decode a batch.
119#[derive(Debug, Eq, PartialEq, thiserror::Error)]
120pub enum DecodeError {
121    /// No kv cache slot was available.
122    #[error("Decode Error 1: NoKvCacheSlot")]
123    NoKvCacheSlot,
124    /// The number of tokens in the batch was 0.
125    #[error("Decode Error -1: n_tokens == 0")]
126    NTokensZero,
127    /// An unknown error occurred.
128    #[error("Decode Error {0}: unknown")]
129    Unknown(c_int),
130}
131
132/// Failed to decode a batch.
133#[derive(Debug, Eq, PartialEq, thiserror::Error)]
134pub enum EncodeError {
135    /// No kv cache slot was available.
136    #[error("Encode Error 1: NoKvCacheSlot")]
137    NoKvCacheSlot,
138    /// The number of tokens in the batch was 0.
139    #[error("Encode Error -1: n_tokens == 0")]
140    NTokensZero,
141    /// An unknown error occurred.
142    #[error("Encode Error {0}: unknown")]
143    Unknown(c_int),
144}
145
146/// When embedding related functions fail
147#[derive(Debug, Eq, PartialEq, thiserror::Error)]
148pub enum EmbeddingsError {
149    /// Embeddings weren't enabled in the context options
150    #[error("Embeddings weren't enabled in the context options")]
151    NotEnabled,
152    /// Logits weren't enabled for the given token
153    #[error("Logits were not enabled for the given token")]
154    LogitsNotEnabled,
155    /// The given sequence index exceeds the max sequence id
156    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
157    NonePoolType,
158}
159
160/// Decode a error from llama.cpp into a [`DecodeError`].
161impl From<NonZeroI32> for DecodeError {
162    fn from(value: NonZeroI32) -> Self {
163        match value.get() {
164            1 => DecodeError::NoKvCacheSlot,
165            -1 => DecodeError::NTokensZero,
166            i => DecodeError::Unknown(i),
167        }
168    }
169}
170
171/// Encode a error from llama.cpp into a [`EncodeError`].
172impl From<NonZeroI32> for EncodeError {
173    fn from(value: NonZeroI32) -> Self {
174        match value.get() {
175            1 => EncodeError::NoKvCacheSlot,
176            -1 => EncodeError::NTokensZero,
177            i => EncodeError::Unknown(i),
178        }
179    }
180}
181
182/// An error that can occur when loading a model.
183#[derive(Debug, Eq, PartialEq, thiserror::Error)]
184pub enum LlamaModelLoadError {
185    /// There was a null byte in a provided string and thus it could not be converted to a C string.
186    #[error("null byte in string {0}")]
187    NullError(#[from] NulError),
188    /// llama.cpp returned a nullptr - this could be many different causes.
189    #[error("null result from llama cpp")]
190    NullResult,
191    /// Failed to convert the path to a rust str. This means the path was not valid unicode
192    #[error("failed to convert path {0} to str")]
193    PathToStrError(PathBuf),
194}
195
196/// An error that can occur when loading a model.
197#[derive(Debug, Eq, PartialEq, thiserror::Error)]
198pub enum LlamaLoraAdapterInitError {
199    /// There was a null byte in a provided string and thus it could not be converted to a C string.
200    #[error("null byte in string {0}")]
201    NullError(#[from] NulError),
202    /// llama.cpp returned a nullptr - this could be many different causes.
203    #[error("null result from llama cpp")]
204    NullResult,
205    /// Failed to convert the path to a rust str. This means the path was not valid unicode
206    #[error("failed to convert path {0} to str")]
207    PathToStrError(PathBuf),
208}
209
210/// An error that can occur when loading a model.
211#[derive(Debug, Eq, PartialEq, thiserror::Error)]
212pub enum LlamaLoraAdapterSetError {
213    /// llama.cpp returned a non-zero error code.
214    #[error("error code from llama cpp")]
215    ErrorResult(i32),
216}
217
218/// An error that can occur when loading a model.
219#[derive(Debug, Eq, PartialEq, thiserror::Error)]
220pub enum LlamaLoraAdapterRemoveError {
221    /// llama.cpp returned a non-zero error code.
222    #[error("error code from llama cpp")]
223    ErrorResult(i32),
224}
225
226/// get the time (in microseconds) according to llama.cpp
227/// ```
228/// # use llama_cpp_4::llama_time_us;
229/// let time = llama_time_us();
230/// assert!(time > 0);
231/// ```
232#[must_use]
233pub fn llama_time_us() -> i64 {
234    unsafe { llama_cpp_sys_4::llama_time_us() }
235}
236
237/// get the max number of devices according to llama.cpp (this is generally cuda devices)
238/// ```
239/// # use llama_cpp_4::max_devices;
240/// let max_devices = max_devices();
241/// assert!(max_devices >= 0);
242/// ```
243#[must_use]
244pub fn max_devices() -> usize {
245    unsafe { llama_cpp_sys_4::llama_max_devices() }
246}
247
248/// is memory mapping supported according to llama.cpp
249/// ```
250/// # use llama_cpp_4::mmap_supported;
251/// let mmap_supported = mmap_supported();
252/// if mmap_supported {
253///   println!("mmap_supported!");
254/// }
255/// ```
256#[must_use]
257pub fn mmap_supported() -> bool {
258    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
259}
260
261/// is memory locking supported according to llama.cpp
262/// ```
263/// # use llama_cpp_4::mlock_supported;
264/// let mlock_supported = mlock_supported();
265/// if mlock_supported {
266///    println!("mlock_supported!");
267/// }
268/// ```
269#[must_use]
270pub fn mlock_supported() -> bool {
271    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
272}
273
274/// An error that can occur when converting a token to a string.
275#[derive(Debug, thiserror::Error, Clone)]
276#[non_exhaustive]
277pub enum TokenToStringError {
278    /// the token type was unknown
279    #[error("Unknown Token Type")]
280    UnknownTokenType,
281    /// There was insufficient buffer space to convert the token to a string.
282    #[error("Insufficient Buffer Space {0}")]
283    InsufficientBufferSpace(c_int),
284    /// The token was not valid utf8.
285    #[error("FromUtf8Error {0}")]
286    FromUtf8Error(#[from] FromUtf8Error),
287}
288
289/// Failed to convert a string to a token sequence.
290#[derive(Debug, thiserror::Error)]
291pub enum StringToTokenError {
292    /// the string contained a null byte and thus could not be converted to a c string.
293    #[error("{0}")]
294    NulError(#[from] NulError),
295    #[error("{0}")]
296    /// Failed to convert a provided integer to a [`c_int`].
297    CIntConversionError(#[from] std::num::TryFromIntError),
298}
299
300/// Failed to apply model chat template.
301#[derive(Debug, thiserror::Error)]
302pub enum NewLlamaChatMessageError {
303    /// the string contained a null byte and thus could not be converted to a c string.
304    #[error("{0}")]
305    NulError(#[from] NulError),
306}
307
308/// Failed to apply model chat template.
309#[derive(Debug, thiserror::Error)]
310pub enum ApplyChatTemplateError {
311    /// the buffer was too small.
312    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
313    BuffSizeError,
314    /// the string contained a null byte and thus could not be converted to a c string.
315    #[error("{0}")]
316    NulError(#[from] NulError),
317    /// the string could not be converted to utf8.
318    #[error("{0}")]
319    FromUtf8Error(#[from] FromUtf8Error),
320}
321
322/// Get the time in microseconds according to ggml
323///
324/// ```
325/// # use std::time::Duration;
326/// use llama_cpp_4::ggml_time_us;
327///
328/// let start = ggml_time_us();
329///
330/// std::thread::sleep(Duration::from_micros(10));
331///
332/// let end = ggml_time_us();
333///
334/// let elapsed = end - start;
335///
336/// assert!(elapsed >= 10)
337#[must_use]
338pub fn ggml_time_us() -> i64 {
339    unsafe { llama_cpp_sys_4::ggml_time_us() }
340}
341
342/// Checks if mlock is supported.
343///
344/// ```
345/// # use llama_cpp_4::llama_supports_mlock;
346///
347/// if llama_supports_mlock() {
348///   println!("mlock is supported!");
349/// } else {
350///   println!("mlock is not supported!");
351/// }
352/// ```
353#[must_use]
354pub fn llama_supports_mlock() -> bool {
355    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
356}
357
358/// Checks if GPU offload is supported.
359///
360/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
361#[must_use]
362pub fn supports_gpu_offload() -> bool {
363    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
364}
365
366/// Checks if RPC backend is supported.
367///
368/// Returns `true` if the library was compiled with RPC support.
369#[must_use]
370pub fn supports_rpc() -> bool {
371    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
372}
373
374/// Get system information string.
375///
376/// Returns a string containing CPU features, build info, and other system details.
377///
378/// # Panics
379///
380/// Panics if the returned string is not valid UTF-8.
381#[must_use]
382pub fn print_system_info() -> String {
383    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
384    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
385    c_str
386        .to_str()
387        .expect("system info is not valid UTF-8")
388        .to_owned()
389}
390
391/// Get the maximum number of parallel sequences supported.
392#[must_use]
393pub fn max_parallel_sequences() -> usize {
394    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
395}
396
397/// Get the maximum number of tensor buffer type overrides.
398#[must_use]
399pub fn max_tensor_buft_overrides() -> usize {
400    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
401}
402
403/// Get the name of a flash attention type.
404///
405/// # Panics
406///
407/// Panics if the returned string is not valid UTF-8.
408#[must_use]
409pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
410    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
411    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
412    c_str
413        .to_str()
414        .expect("flash_attn_type_name is not valid UTF-8")
415        .to_owned()
416}
417
418/// Get the string representation of a model metadata key.
419///
420/// # Panics
421///
422/// Panics if the returned string is not valid UTF-8.
423#[must_use]
424pub fn model_meta_key_str(key: u32) -> String {
425    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
426    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
427    c_str
428        .to_str()
429        .expect("meta_key_str is not valid UTF-8")
430        .to_owned()
431}
432
433/// Quantize a model file using typed [`QuantizeParams`].
434///
435/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
436/// returned by `llama_model_quantize`.
437///
438/// # Panics
439///
440/// Panics if either path contains an interior null byte.
441///
442/// # Example
443///
444/// ```no_run
445/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
446///
447/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
448///     .with_nthread(8)
449///     .with_quantize_output_tensor(true);
450///
451/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
452/// ```
453pub fn model_quantize(
454    fname_inp: &str,
455    fname_out: &str,
456    params: &quantize::QuantizeParams,
457) -> std::result::Result<(), u32> {
458    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
459    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
460    let guard = params.to_raw();
461    let rc = unsafe {
462        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
463    };
464    if rc == 0 {
465        Ok(())
466    } else {
467        Err(rc)
468    }
469}
470
471/// Get default quantization parameters (raw sys type).
472///
473/// Prefer [`QuantizeParams::new`] for the typed Rust API.
474#[must_use]
475#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
476pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
477    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
478}
479
480/// Set the log callback.
481///
482/// # Safety
483///
484/// The callback and user data must remain valid for the lifetime of the application
485/// or until the callback is replaced.
486pub unsafe fn log_set(
487    callback: llama_cpp_sys_4::ggml_log_callback,
488    user_data: *mut std::ffi::c_void,
489) {
490    llama_cpp_sys_4::llama_log_set(callback, user_data);
491}
492
493/// Get the current log callback and user data.
494///
495/// # Safety
496///
497/// The caller must ensure the pointers are valid.
498pub unsafe fn log_get(
499    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
500    user_data: *mut *mut std::ffi::c_void,
501) {
502    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
503}
504
505/// Initialize optimizer state for fine-tuning.
506///
507/// # Safety
508///
509/// The context and model must be valid and compatible.
510pub unsafe fn opt_init(
511    ctx: *mut llama_cpp_sys_4::llama_context,
512    model: *mut llama_cpp_sys_4::llama_model,
513    params: llama_cpp_sys_4::llama_opt_params,
514) {
515    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
516}
517
518/// Run one training epoch.
519///
520/// # Safety
521///
522/// All pointers and handles must be valid.
523#[allow(clippy::too_many_arguments)]
524pub unsafe fn opt_epoch(
525    ctx: *mut llama_cpp_sys_4::llama_context,
526    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
527    result_train: llama_cpp_sys_4::ggml_opt_result_t,
528    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
529    idata_split: i64,
530    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
531    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
532) {
533    llama_cpp_sys_4::llama_opt_epoch(
534        ctx,
535        dataset,
536        result_train,
537        result_eval,
538        idata_split,
539        callback_train,
540        callback_eval,
541    );
542}
543
544/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
545///
546/// # Safety
547///
548/// The tensor pointer must be valid.
549pub unsafe fn opt_param_filter_all(
550    tensor: *const llama_cpp_sys_4::ggml_tensor,
551    userdata: *mut std::ffi::c_void,
552) -> bool {
553    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
554}
555
556/// Auto-fit model and context parameters for available memory.
557///
558/// # Safety
559///
560/// All pointers must be valid.
561#[allow(clippy::too_many_arguments)]
562pub unsafe fn params_fit(
563    path_model: *const std::ffi::c_char,
564    mparams: *mut llama_cpp_sys_4::llama_model_params,
565    cparams: *mut llama_cpp_sys_4::llama_context_params,
566    tensor_split: *mut f32,
567    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
568    margins: *mut usize,
569    n_ctx_min: u32,
570    log_level: llama_cpp_sys_4::ggml_log_level,
571) -> llama_cpp_sys_4::common_params_fit_status {
572    llama_cpp_sys_4::common_fit_params(
573        path_model,
574        mparams,
575        cparams,
576        tensor_split,
577        tensor_buft_overrides,
578        margins,
579        n_ctx_min,
580        log_level,
581    )
582}