use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

use crate::llama_batch::BatchAddError;

pub mod common;
pub mod context;
#[cfg(feature = "ggml")]
pub mod ggml;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod quantize;
pub mod sampling;
pub mod token;
pub mod token_type;

#[cfg(feature = "rpc")]
pub mod rpc;

#[cfg(feature = "mtmd")]
pub mod mtmd;

/// Convenience alias for `Result` with [`LLamaCppError`] as the error type.
pub type Result<T> = std::result::Result<T, LLamaCppError>;

/// All errors that can occur in this crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The llama.cpp backend was already initialized.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// See [`ChatTemplateError`].
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// See [`DecodeError`].
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// See [`EncodeError`].
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// See [`LlamaModelLoadError`].
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// See [`LlamaContextLoadError`].
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// See [`BatchAddError`].
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`].
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
}
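
// A minimal sketch of how the `#[from]` conversions above are meant to be used:
// a specific error such as `DecodeError` bubbles up into `LLamaCppError` via `?`.
// The `fail_decode` helper below is purely illustrative and not part of the public API.
#[cfg(test)]
mod llama_cpp_error_conversion_sketch {
    use super::{DecodeError, LLamaCppError, Result};

    /// Hypothetical helper that fails with a specific decode error.
    fn fail_decode() -> std::result::Result<(), DecodeError> {
        Err(DecodeError::NoKvCacheSlot)
    }

    fn run() -> Result<()> {
        // `?` converts `DecodeError` into `LLamaCppError` through the derived `From` impl.
        fail_decode()?;
        Ok(())
    }

    #[test]
    fn decode_error_converts_into_llama_cpp_error() {
        assert_eq!(
            run(),
            Err(LLamaCppError::DecodeError(DecodeError::NoKvCacheSlot))
        );
    }
}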

/// Errors that can occur when fetching a chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// The buffer was too small; a buffer of the given size would be large enough.
    #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
    BuffSizeError(usize),
    /// The model has no chat template metadata value; llama.cpp returned the given code.
    #[error("the model has no chat template metadata value - llama.cpp returned code {0}")]
    MissingTemplate(i32),
    /// The template was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Errors that can occur when reading a string out of a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum StringFromModelError {
    /// llama.cpp returned an error code.
    #[error("llama.cpp returned error code {0}")]
    ReturnedError(i32),
    /// The string was not valid UTF-8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Errors that can occur when creating a context from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned a null pointer.
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Errors that can occur when decoding a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No KV cache slot was available for the batch.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The batch contained zero tokens.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// llama.cpp returned an unrecognized error code.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Errors that can occur when encoding a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No KV cache slot was available for the batch.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The batch contained zero tokens.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// llama.cpp returned an unrecognized error code.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// Errors that can occur when fetching embeddings.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings were not enabled in the context options.
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits were not enabled for the given token.
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings are not available when the model only supports `LLAMA_POOLING_TYPE_NONE`.
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}
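
// A small sketch of the return-code mapping implemented above: llama.cpp's
// non-zero decode/encode status codes become typed errors. The concrete codes
// used here (1, -1, 7) are chosen for illustration only.
#[cfg(test)]
mod return_code_mapping_sketch {
    use super::{DecodeError, EncodeError};
    use std::num::NonZeroI32;

    #[test]
    fn decode_codes_map_to_variants() {
        assert_eq!(
            DecodeError::from(NonZeroI32::new(1).unwrap()),
            DecodeError::NoKvCacheSlot
        );
        assert_eq!(
            DecodeError::from(NonZeroI32::new(-1).unwrap()),
            DecodeError::NTokensZero
        );
        assert_eq!(
            DecodeError::from(NonZeroI32::new(7).unwrap()),
            DecodeError::Unknown(7)
        );
    }

    #[test]
    fn encode_codes_map_to_variants() {
        assert_eq!(
            EncodeError::from(NonZeroI32::new(1).unwrap()),
            EncodeError::NoKvCacheSlot
        );
        assert_eq!(
            EncodeError::from(NonZeroI32::new(-1).unwrap()),
            EncodeError::NTokensZero
        );
    }
}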

/// Errors that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// The path contained a null byte and could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be converted to a `str`.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// Errors that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// The path contained a null byte and could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a null pointer.
    #[error("null result from llama cpp")]
    NullResult,
    /// The path could not be converted to a `str`.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// Errors that can occur when applying a LoRA adapter to a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

/// Errors that can occur when removing a LoRA adapter from a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("llama.cpp returned error code {0}")]
    ErrorResult(i32),
}

/// Returns the current time in microseconds, as reported by llama.cpp.
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::llama_time_us() }
}

/// Returns the maximum number of devices supported by llama.cpp.
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}

/// Returns `true` if memory mapping (`mmap`) is supported by this build of llama.cpp.
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}

/// Returns `true` if memory locking (`mlock`) is supported by this build of llama.cpp.
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
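
// A short capability-query sketch: these wrappers are plain getters that report
// what the linked llama.cpp build supports. The test assumes that these constant
// feature queries are safe to call without initializing the backend first.
#[cfg(test)]
mod capability_query_sketch {
    #[test]
    fn capability_queries_return_without_panicking() {
        // The concrete values depend on how llama.cpp was built.
        let _ = super::max_devices();
        let _ = super::mmap_supported();
        let _ = super::mlock_supported();
        let _ = super::supports_gpu_offload();
        let _ = super::supports_rpc();
    }
}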

/// Errors that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// The token type was unknown.
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// The buffer was too small; the required size is given.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid UTF-8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Errors that can occur when converting a string to a token.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// An integer conversion to C `int` failed.
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Errors that can occur when creating a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Errors that can occur when applying a chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// The internal buffer was too small.
    #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
    BuffSizeError,
    /// The string contained a null byte and could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// The result was not valid UTF-8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
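
// A minimal sketch of the `#[from]` conversions on the tokenization errors above:
// a `NulError` produced while building a `CString` converts directly into
// `StringToTokenError`. The embedded NUL byte below is deliberate.
#[cfg(test)]
mod nul_error_conversion_sketch {
    use super::StringToTokenError;
    use std::ffi::CString;

    #[test]
    fn nul_error_converts_into_string_to_token_error() {
        let nul_error = CString::new("a\0b").unwrap_err();
        let err: StringToTokenError = nul_error.into();
        assert!(matches!(err, StringToTokenError::NulError(_)));
    }
}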

/// Returns the current time in microseconds, as reported by ggml.
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}

/// Returns `true` if memory locking (`mlock`) is supported. Equivalent to [`mlock_supported`].
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}

/// Returns `true` if this build of llama.cpp supports GPU offload.
#[must_use]
pub fn supports_gpu_offload() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}

/// Returns `true` if this build of llama.cpp supports RPC.
#[must_use]
pub fn supports_rpc() -> bool {
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}

/// Returns llama.cpp's system info string (build features and hardware capabilities).
///
/// # Panics
///
/// Panics if the system info string is not valid UTF-8.
#[must_use]
pub fn print_system_info() -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("system info is not valid UTF-8")
        .to_owned()
}
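
// A hedged usage sketch for `print_system_info`: it returns a human-readable
// feature summary from llama.cpp. The test is ignored by default because it
// assumes calling into llama.cpp without prior backend initialization is safe
// in this build, which may not hold everywhere.
#[cfg(test)]
mod system_info_sketch {
    #[test]
    #[ignore = "calls into llama.cpp; run manually"]
    fn system_info_is_not_empty() {
        let info = super::print_system_info();
        assert!(!info.is_empty());
    }
}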

/// Returns the maximum number of parallel sequences supported by llama.cpp.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}

/// Returns the maximum number of tensor buffer-type overrides supported by llama.cpp.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}

/// Returns the name of the given flash-attention type.
///
/// # Panics
///
/// Panics if the returned name is not valid UTF-8.
#[must_use]
pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("flash_attn_type_name is not valid UTF-8")
        .to_owned()
}

/// Returns the string name of the given model metadata key.
///
/// # Panics
///
/// Panics if `key` does not fit the expected integer type or if the returned
/// string is not valid UTF-8.
#[must_use]
pub fn model_meta_key_str(key: u32) -> String {
    let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key.try_into().unwrap()) };
    let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
    c_str
        .to_str()
        .expect("meta_key_str is not valid UTF-8")
        .to_owned()
}

/// Quantizes the model at `fname_inp` and writes the result to `fname_out`.
///
/// # Errors
///
/// Returns the raw llama.cpp status code if quantization fails.
///
/// # Panics
///
/// Panics if either path contains a null byte.
pub fn model_quantize(
    fname_inp: &str,
    fname_out: &str,
    params: &quantize::QuantizeParams,
) -> std::result::Result<(), u32> {
    let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
    let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
    let guard = params.to_raw();
    let rc = unsafe {
        llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
    };
    if rc == 0 {
        Ok(())
    } else {
        Err(rc)
    }
}

/// Returns llama.cpp's default quantization parameters.
#[must_use]
#[deprecated(since = "0.2.19", note = "use `QuantizeParams::new` instead")]
pub fn model_quantize_default_params() -> llama_cpp_sys_4::llama_model_quantize_params {
    unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() }
}

/// Sets the llama.cpp log callback.
///
/// # Safety
///
/// `callback` and `user_data` must remain valid for as long as llama.cpp may invoke
/// the callback.
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}

/// Retrieves the current llama.cpp log callback and its user data.
///
/// # Safety
///
/// `log_callback` and `user_data` must be valid, writable pointers.
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}

/// Thin wrapper around `llama_opt_init`, which prepares a context for optimization (training).
///
/// # Safety
///
/// `ctx` and `model` must be valid, non-dangling pointers obtained from llama.cpp.
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}

/// Thin wrapper around `llama_opt_epoch`, which runs one optimization epoch over `dataset`.
///
/// # Safety
///
/// All pointers and handles must be valid llama.cpp/ggml objects, and the context must
/// have been prepared with [`opt_init`].
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}

/// Parameter filter that accepts every tensor. Thin wrapper around
/// `llama_opt_param_filter_all`.
///
/// # Safety
///
/// `tensor` must be a valid ggml tensor pointer; `userdata` is passed through unchanged.
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}

/// Thin wrapper around `llama_cpp_sys_4::common_fit_params`.
///
/// # Safety
///
/// All pointers must be valid and point to properly initialized llama.cpp structures;
/// `tensor_split`, `tensor_buft_overrides`, and `margins` must point to buffers of the
/// sizes expected by `common_fit_params`.
#[allow(clippy::too_many_arguments)]
pub unsafe fn params_fit(
    path_model: *const std::ffi::c_char,
    mparams: *mut llama_cpp_sys_4::llama_model_params,
    cparams: *mut llama_cpp_sys_4::llama_context_params,
    tensor_split: *mut f32,
    tensor_buft_overrides: *mut llama_cpp_sys_4::llama_model_tensor_buft_override,
    margins: *mut usize,
    n_ctx_min: u32,
    log_level: llama_cpp_sys_4::ggml_log_level,
) -> llama_cpp_sys_4::common_params_fit_status {
    llama_cpp_sys_4::common_fit_params(
        path_model,
        mparams,
        cparams,
        tensor_split,
        tensor_buft_overrides,
        margins,
        n_ctx_min,
        log_level,
    )
}