// llama_cpp_4/lib.rs
1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Examples
9//!
10//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
11//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
12//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
13//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
14//!
15//! # Feature Flags
16//!
17//! - `cuda` enables CUDA GPU support.
18//! - `metal` enables Apple Metal GPU support.
19//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
20//! - `native` enables host-CPU optimisations (`-march=native`).
21//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
22//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
23//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
24use std::ffi::NulError;
25use std::fmt::Debug;
26use std::num::NonZeroI32;
27
28use crate::llama_batch::BatchAddError;
29use std::os::raw::c_int;
30use std::path::PathBuf;
31use std::string::FromUtf8Error;
32
// Public modules exposing safe wrappers over the llama.cpp C API.
pub mod common;
pub mod context;
// Low-level ggml helpers; only built when the `ggml` feature is enabled.
#[cfg(feature = "ggml")]
pub mod ggml;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod quantize;
pub mod sampling;
pub mod token;
pub mod token_type;

// RPC backend for distributed inference (see the `rpc` feature flag above).
#[cfg(feature = "rpc")]
pub mod rpc;

// Multimodal (image + audio) support via libmtmd (see the `mtmd` feature flag).
#[cfg(feature = "mtmd")]
pub mod mtmd;
50
/// A fallible result from a llama.cpp function, with [`LLamaCppError`] as the error type.
pub type Result<T> = std::result::Result<T, LLamaCppError>;
53
54/// All errors that can occur in the llama-cpp crate.
55#[derive(Debug, Eq, PartialEq, thiserror::Error)]
56pub enum LLamaCppError {
57    /// The backend was already initialized. This can generally be ignored as initializing the backend
58    /// is idempotent.
59    #[error("BackendAlreadyInitialized")]
60    BackendAlreadyInitialized,
61    /// There was an error while get the chat template from model.
62    #[error("{0}")]
63    ChatTemplateError(#[from] ChatTemplateError),
64    /// There was an error while decoding a batch.
65    #[error("{0}")]
66    DecodeError(#[from] DecodeError),
67    /// There was an error while encoding a batch.
68    #[error("{0}")]
69    EncodeError(#[from] EncodeError),
70    /// There was an error loading a model.
71    #[error("{0}")]
72    LlamaModelLoadError(#[from] LlamaModelLoadError),
73    /// There was an error creating a new model context.
74    #[error("{0}")]
75    LlamaContextLoadError(#[from] LlamaContextLoadError),
76    /// There was an error adding a token to a batch.
77    #[error["{0}"]]
78    BatchAddError(#[from] BatchAddError),
79    /// see [`EmbeddingsError`]
80    #[error(transparent)]
81    EmbeddingError(#[from] EmbeddingsError),
82}
83
/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The provided buffer was too small; the payload is the buffer size that
    /// would be just large enough.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The gguf model has no chat template; the payload is the raw code returned by llama.cpp.
    #[error("the model has no meta val - returned code {0}")]
    MissingTemplate(i32),
    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
97
/// Error retrieving a string from the model (e.g. description, metadata key/value).
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// The C function returned a negative error code; the payload is that code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The returned bytes were not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}
108
/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer when creating the context.
    #[error("null reference from llama.cpp")]
    NullReturn,
}
116
/// Failed to decode a batch.
///
/// Produced from the non-zero return code of `llama_decode` via the
/// `From<NonZeroI32>` impl below.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available (return code 1).
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code -1).
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
130
/// Failed to encode a batch.
///
/// (The original header said "decode" — a copy-paste slip; this type is used
/// for encode results, mirroring [`DecodeError`].)
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available (return code 1).
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0 (return code -1).
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred; the payload is the raw return code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}
144
/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings were requested from a model whose pooling type is
    /// `LLAMA_POOLING_TYPE_NONE`. (The previous doc comment about a sequence
    /// index exceeding the max sequence id did not match this variant.)
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}
158
159/// Decode a error from llama.cpp into a [`DecodeError`].
160impl From<NonZeroI32> for DecodeError {
161    fn from(value: NonZeroI32) -> Self {
162        match value.get() {
163            1 => DecodeError::NoKvCacheSlot,
164            -1 => DecodeError::NTokensZero,
165            i => DecodeError::Unknown(i),
166        }
167    }
168}
169
170/// Encode a error from llama.cpp into a [`EncodeError`].
171impl From<NonZeroI32> for EncodeError {
172    fn from(value: NonZeroI32) -> Self {
173        match value.get() {
174            1 => EncodeError::NoKvCacheSlot,
175            -1 => EncodeError::NTokensZero,
176            i => EncodeError::Unknown(i),
177        }
178    }
179}
180
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
194
/// An error that can occur when initializing a LoRA adapter.
/// (The previous header said "loading a model" — a copy-paste slip; the
/// variants mirror [`LlamaModelLoadError`] because both paths go through the
/// same C-string/path conversions.)
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
208
209/// An error that can occur when loading a model.
210#[derive(Debug, Eq, PartialEq, thiserror::Error)]
211pub enum LlamaLoraAdapterSetError {
212    /// llama.cpp returned a non-zero error code.
213    #[error("error code from llama cpp")]
214    ErrorResult(i32),
215}
216
217/// An error that can occur when loading a model.
218#[derive(Debug, Eq, PartialEq, thiserror::Error)]
219pub enum LlamaLoraAdapterRemoveError {
220    /// llama.cpp returned a non-zero error code.
221    #[error("error code from llama cpp")]
222    ErrorResult(i32),
223}
224
/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_4::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: takes no arguments and reads no caller state; assumed callable
    // at any time per the llama.cpp API.
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
235
/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_4::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: simple query with no preconditions; assumed callable at any time.
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
246
/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_4::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///   println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: simple capability query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
259
/// is memory locking supported according to llama.cpp
///
/// See also [`llama_supports_mlock`], which wraps the same C function under
/// the llama.cpp-style name.
/// ```
/// # use llama_cpp_4::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///    println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: simple capability query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
272
273/// An error that can occur when converting a token to a string.
274#[derive(Debug, thiserror::Error, Clone)]
275#[non_exhaustive]
276pub enum TokenToStringError {
277    /// the token type was unknown
278    #[error("Unknown Token Type")]
279    UnknownTokenType,
280    /// There was insufficient buffer space to convert the token to a string.
281    #[error("Insufficient Buffer Space {0}")]
282    InsufficientBufferSpace(c_int),
283    /// The token was not valid utf8.
284    #[error("FromUtf8Error {0}")]
285    FromUtf8Error(#[from] FromUtf8Error),
286}
287
288/// Failed to convert a string to a token sequence.
289#[derive(Debug, thiserror::Error)]
290pub enum StringToTokenError {
291    /// the string contained a null byte and thus could not be converted to a c string.
292    #[error("{0}")]
293    NulError(#[from] NulError),
294    #[error("{0}")]
295    /// Failed to convert a provided integer to a [`c_int`].
296    CIntConversionError(#[from] std::num::TryFromIntError),
297}
298
/// Failed to construct a new chat message.
/// (NOTE(review): the original header said "apply model chat template", same as
/// [`ApplyChatTemplateError`] — likely a copy-paste; the type name suggests
/// message construction. Confirm against `model::LlamaChatMessage`.)
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}
306
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
320
321/// Get the time in microseconds according to ggml
322///
323/// ```
324/// # use std::time::Duration;
325/// use llama_cpp_4::ggml_time_us;
326///
327/// let start = ggml_time_us();
328///
329/// std::thread::sleep(Duration::from_micros(10));
330///
331/// let end = ggml_time_us();
332///
333/// let elapsed = end - start;
334///
335/// assert!(elapsed >= 10)
336#[must_use]
337pub fn ggml_time_us() -> i64 {
338    unsafe { llama_cpp_sys_4::ggml_time_us() }
339}
340
/// Checks if mlock is supported.
///
/// NOTE(review): functionally identical to [`mlock_supported`]; both wrap the
/// same C function. Kept for callers using the llama.cpp-style name.
///
/// ```
/// # use llama_cpp_4::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///   println!("mlock is supported!");
/// } else {
///   println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: simple capability query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
356
/// Checks if GPU offload is supported.
///
/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
#[must_use]
pub fn supports_gpu_offload() -> bool {
    // SAFETY: simple capability query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}
364
/// Checks if RPC backend is supported.
///
/// Returns `true` if the library was compiled with RPC support.
#[must_use]
pub fn supports_rpc() -> bool {
    // SAFETY: simple capability query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}
372
373/// Get system information string.
374///
375/// Returns a string containing CPU features, build info, and other system details.
376///
377/// # Panics
378///
379/// Panics if the returned string is not valid UTF-8.
380#[must_use]
381pub fn print_system_info() -> String {
382    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
383    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
384    c_str.to_str().expect("system info is not valid UTF-8").to_owned()
385}
386
/// Get the maximum number of parallel sequences supported.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    // SAFETY: simple query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}
392
/// Get the maximum number of tensor buffer type overrides.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    // SAFETY: simple query with no preconditions.
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}
398
399/// Get the name of a flash attention type.
400///
401/// # Panics
402///
403/// Panics if the returned string is not valid UTF-8.
404#[must_use]
405pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
406    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
407    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
408    c_str.to_str().expect("flash_attn_type_name is not valid UTF-8").to_owned()
409}
410
411/// Get the string representation of a model metadata key.
412///
413/// # Panics
414///
415/// Panics if the returned string is not valid UTF-8.
416#[must_use]
417pub fn model_meta_key_str(key: u32) -> String {
418    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
419    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
420    c_str.to_str().expect("meta_key_str is not valid UTF-8").to_owned()
421}
422
/// Quantize a model file using typed [`QuantizeParams`].
///
/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
/// returned by `llama_model_quantize`.
///
/// # Panics
///
/// Panics if either path contains an interior null byte.
///
/// # Example
///
/// ```no_run
/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
///
/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
///     .with_nthread(8)
///     .with_quantize_output_tensor(true);
///
/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
/// ```
pub fn model_quantize(
    fname_inp: &str,
    fname_out: &str,
    params: &quantize::QuantizeParams,
) -> std::result::Result<(), u32> {
    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
    // `guard` must stay in scope until the FFI call returns: it owns `raw`
    // (and presumably any buffers it points into — see `QuantizeParams::to_raw`).
    let guard = params.to_raw();
    // SAFETY: both CStrings and `guard.raw` are alive for the duration of the call.
    let rc =
        unsafe { llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &guard.raw) };
    if rc == 0 { Ok(()) } else { Err(rc) }
}
455
/// Get default quantization parameters (raw sys type).
///
/// Prefer [`QuantizeParams::new`] for the typed Rust API.
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    // SAFETY: returns a plain parameter struct by value; no preconditions.
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}
464
/// Set the log callback.
///
/// Thin wrapper over `llama_log_set`; both arguments are forwarded unchanged.
/// Pass a `None`/null callback to restore default logging — TODO confirm
/// against the llama.cpp docs for `llama_log_set`.
///
/// # Safety
///
/// The callback and user data must remain valid for the lifetime of the application
/// or until the callback is replaced.
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}
477
/// Get the current log callback and user data.
///
/// Thin wrapper over `llama_log_get`; the current callback and user-data
/// pointer are written through the two out-pointers.
///
/// # Safety
///
/// The caller must ensure the pointers are valid.
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}
489
/// Initialize optimizer state for fine-tuning.
///
/// Thin wrapper over `llama_opt_init`; all arguments are forwarded unchanged.
///
/// # Safety
///
/// The context and model must be valid and compatible.
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}
502
/// Run one training epoch.
///
/// Thin wrapper over `llama_opt_epoch`; all arguments are forwarded unchanged.
///
/// # Safety
///
/// All pointers and handles must be valid.
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    // Opaque ggml dataset handle.
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    // Opaque ggml result accumulators for the training and evaluation phases.
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    // NOTE(review): presumably the dataset index where train data ends and
    // eval data begins — confirm against llama.cpp's `llama_opt_epoch`.
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}
528
/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
///
/// Thin wrapper over `llama_opt_param_filter_all`.
///
/// # Safety
///
/// The tensor pointer must be valid.
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}
540
/// Auto-fit model and context parameters for available memory.
///
/// Thin wrapper over `llama_params_fit`; all arguments are forwarded unchanged
/// and the raw status code is returned to the caller.
///
/// # Safety
///
/// All pointers must be valid.
#[allow(clippy::too_many_arguments)]
pub unsafe fn params_fit(
    path_model: *const std::ffi::c_char,
    // In/out: model and context parameters, adjusted in place by llama.cpp.
    mparams: *mut llama_cpp_sys_4::llama_model_params,
    cparams: *mut llama_cpp_sys_4::llama_context_params,
    tensor_split: *mut f32,
    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
    // NOTE(review): presumably per-device memory margins in bytes — confirm
    // against llama.cpp's `llama_params_fit` declaration.
    margins: *mut usize,
    n_ctx_min: u32,
    log_level: llama_cpp_sys_4::ggml_log_level,
) -> llama_cpp_sys_4::llama_params_fit_status {
    llama_cpp_sys_4::llama_params_fit(
        path_model,
        mparams,
        cparams,
        tensor_split,
        tensor_buft_overrides,
        margins,
        n_ctx_min,
        log_level,
    )
}