llama_cpp_4/lib.rs
1//! Bindings to the llama.cpp library.
2//!
3//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
4//! with all the rust idioms. Instead it provides safe wrappers around nearly direct bindings to
5//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
6//! the API is not as nice as it could be.
7//!
8//! # Quick start
9//!
10//! ```no_run
11//! use llama_cpp_4::prelude::*;
12//! use std::num::NonZeroU32;
13//!
14//! fn main() {
15//! let backend = LlamaBackend::init().unwrap();
16//! let model = LlamaModel::load_from_file(
17//! &backend,
18//! "model.gguf",
19//! &LlamaModelParams::default(),
20//! )
21//! .unwrap();
22//! let mut ctx = model
23//! .new_context(
24//! &backend,
25//! LlamaContextParams::default().with_n_ctx(NonZeroU32::new(2048)),
26//! )
27//! .unwrap();
28//!
29//! let tokens = model.str_to_token("Hello, world!", AddBos::Always).unwrap();
30//! let mut batch = LlamaBatch::new(512, 1);
31//! for (i, &tok) in tokens.iter().enumerate() {
32//! batch
33//! .add(tok, i as i32, &[0], i == tokens.len() - 1)
34//! .unwrap();
35//! }
36//! ctx.decode(&mut batch).unwrap();
37//!
38//! let token = LlamaSampler::greedy().sample(&ctx, 0);
39//! let _piece = model.token_to_bytes(token, Special::Plaintext).unwrap();
40//! }
41//! ```
42//!
43//! # Examples in this repository
44//!
45//! - [simple](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/simple)
46//! - [chat](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/chat)
47//! - [embeddings](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/embeddings)
48//! - [server](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/server)
49//! - [mtp](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/mtp) — MTP speculative decoding via [`crate::mtp::MtpSession`]
50//! - [eagle](https://github.com/eugenehp/llama-cpp-rs/tree/main/examples/eagle) — EAGLE-3 speculative decoding via [`crate::eagle::Eagle3Session`]
51//!
52//! # Advanced: tensor capture
53//!
54//! Use [`TensorCapture`] with [`LlamaContextParams::with_tensor_capture`] to read
55//! per-layer hidden states (or other named graph nodes) during
56//! [`LlamaContext::decode`]. See [`context::tensor_capture`] for a full example.
57//!
58//! # Prelude
59//!
60//! For the types used in most inference programs, import [`prelude`]:
61//!
62//! ```
63//! use llama_cpp_4::prelude::*;
64//! ```
65//!
66//! The same core types are also re-exported at the crate root (e.g.
67//! [`LlamaModel`], [`LlamaBackend`]) so you can pick whichever import style
68//! you prefer. See [`prelude`] for a full list and additional examples (chat,
69//! embeddings, memory estimation).
70//!
71//! # Feature Flags
72//!
73//! - `cuda` enables CUDA GPU support.
74//! - `metal` enables Apple Metal GPU support.
75//! - `vulkan` enables Vulkan GPU support (AMD / Intel / cross-platform).
76//! - `native` enables host-CPU optimisations (`-march=native`).
77//! - `openmp` enables OpenMP multi-core CPU parallelism (on by default).
78//! - `rpc` enables RPC backend support for distributed inference across multiple machines.
79//! - `mtmd` enables multimodal (image + audio) support via `libmtmd`.
80use std::ffi::NulError;
81use std::fmt::Debug;
82use std::num::NonZeroI32;
83
84use crate::llama_batch::BatchAddError;
85use std::os::raw::c_int;
86use std::path::PathBuf;
87use std::string::FromUtf8Error;
88
89pub mod common;
90pub mod context;
91pub mod eagle;
92pub mod fit;
93#[cfg(feature = "ggml")]
94pub mod ggml;
95pub mod llama_backend;
96pub mod llama_batch;
97pub mod model;
98pub mod mtp;
99pub mod prelude;
100pub mod quantize;
101pub mod sampling;
102pub mod token;
103pub mod token_type;
104
105#[cfg(feature = "rpc")]
106pub mod rpc;
107
108#[cfg(feature = "mtmd")]
109pub mod mtmd;
110
/// A fallible result from a llama.cpp function.
///
/// Shorthand for [`std::result::Result`] with the error type fixed to [`LLamaCppError`].
pub type Result<T> = std::result::Result<T, LLamaCppError>;
113
114/// All errors that can occur in the llama-cpp crate.
115#[derive(Debug, Eq, PartialEq, thiserror::Error)]
116pub enum LLamaCppError {
117 /// The backend was already initialized. This can generally be ignored as initializing the backend
118 /// is idempotent.
119 #[error("BackendAlreadyInitialized")]
120 BackendAlreadyInitialized,
121 /// There was an error while get the chat template from model.
122 #[error("{0}")]
123 ChatTemplateError(#[from] ChatTemplateError),
124 /// There was an error while decoding a batch.
125 #[error("{0}")]
126 DecodeError(#[from] DecodeError),
127 /// There was an error while encoding a batch.
128 #[error("{0}")]
129 EncodeError(#[from] EncodeError),
130 /// There was an error loading a model.
131 #[error("{0}")]
132 LlamaModelLoadError(#[from] LlamaModelLoadError),
133 /// There was an error creating a new model context.
134 #[error("{0}")]
135 LlamaContextLoadError(#[from] LlamaContextLoadError),
136 /// There was an error adding a token to a batch.
137 #[error["{0}"]]
138 BatchAddError(#[from] BatchAddError),
139 /// see [`EmbeddingsError`]
140 #[error(transparent)]
141 EmbeddingError(#[from] EmbeddingsError),
142}
143
144/// There was an error while getting the chat template from a model.
145#[derive(Debug, Eq, PartialEq, thiserror::Error)]
146pub enum ChatTemplateError {
147 /// the buffer was too small.
148 #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
149 BuffSizeError(usize),
150 /// gguf has no chat template
151 #[error("the model has no meta val - returned code {0}")]
152 MissingTemplate(i32),
153 /// The chat template was not valid utf8.
154 #[error(transparent)]
155 Utf8Error(#[from] std::str::Utf8Error),
156}
157
158/// Error retrieving a string from the model (e.g. description, metadata key/value).
159#[derive(Debug, Eq, PartialEq, thiserror::Error)]
160pub enum StringFromModelError {
161 /// The C function returned a negative error code.
162 #[error("llama.cpp returned error code {0}")]
163 ReturnedError(i32),
164 /// The returned bytes were not valid UTF-8.
165 #[error(transparent)]
166 Utf8Error(#[from] std::str::Utf8Error),
167}
168
169/// Failed to Load context
170#[derive(Debug, Eq, PartialEq, thiserror::Error)]
171pub enum LlamaContextLoadError {
172 /// llama.cpp returned null
173 #[error("null reference from llama.cpp")]
174 NullReturn,
175}
176
177/// Failed to decode a batch.
178#[derive(Debug, Eq, PartialEq, thiserror::Error)]
179pub enum DecodeError {
180 /// No kv cache slot was available.
181 #[error("Decode Error 1: NoKvCacheSlot")]
182 NoKvCacheSlot,
183 /// The number of tokens in the batch was 0.
184 #[error("Decode Error -1: n_tokens == 0")]
185 NTokensZero,
186 /// An unknown error occurred.
187 #[error("Decode Error {0}: unknown")]
188 Unknown(c_int),
189}
190
191/// Failed to decode a batch.
192#[derive(Debug, Eq, PartialEq, thiserror::Error)]
193pub enum EncodeError {
194 /// No kv cache slot was available.
195 #[error("Encode Error 1: NoKvCacheSlot")]
196 NoKvCacheSlot,
197 /// The number of tokens in the batch was 0.
198 #[error("Encode Error -1: n_tokens == 0")]
199 NTokensZero,
200 /// An unknown error occurred.
201 #[error("Encode Error {0}: unknown")]
202 Unknown(c_int),
203}
204
205/// When embedding related functions fail
206#[derive(Debug, Eq, PartialEq, thiserror::Error)]
207pub enum EmbeddingsError {
208 /// Embeddings weren't enabled in the context options
209 #[error("Embeddings weren't enabled in the context options")]
210 NotEnabled,
211 /// Logits weren't enabled for the given token
212 #[error("Logits were not enabled for the given token")]
213 LogitsNotEnabled,
214 /// The given sequence index exceeds the max sequence id
215 #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
216 NonePoolType,
217}
218
219/// Decode a error from llama.cpp into a [`DecodeError`].
220impl From<NonZeroI32> for DecodeError {
221 fn from(value: NonZeroI32) -> Self {
222 match value.get() {
223 1 => DecodeError::NoKvCacheSlot,
224 -1 => DecodeError::NTokensZero,
225 i => DecodeError::Unknown(i),
226 }
227 }
228}
229
230/// Encode a error from llama.cpp into a [`EncodeError`].
231impl From<NonZeroI32> for EncodeError {
232 fn from(value: NonZeroI32) -> Self {
233 match value.get() {
234 1 => EncodeError::NoKvCacheSlot,
235 -1 => EncodeError::NTokensZero,
236 i => EncodeError::Unknown(i),
237 }
238 }
239}
240
241/// An error that can occur when loading a model.
242#[derive(Debug, Eq, PartialEq, thiserror::Error)]
243pub enum LlamaModelLoadError {
244 /// There was a null byte in a provided string and thus it could not be converted to a C string.
245 #[error("null byte in string {0}")]
246 NullError(#[from] NulError),
247 /// llama.cpp returned a nullptr - this could be many different causes.
248 #[error("null result from llama cpp")]
249 NullResult,
250 /// Failed to convert the path to a rust str. This means the path was not valid unicode
251 #[error("failed to convert path {0} to str")]
252 PathToStrError(PathBuf),
253}
254
255/// An error that can occur when loading a model.
256#[derive(Debug, Eq, PartialEq, thiserror::Error)]
257pub enum LlamaLoraAdapterInitError {
258 /// There was a null byte in a provided string and thus it could not be converted to a C string.
259 #[error("null byte in string {0}")]
260 NullError(#[from] NulError),
261 /// llama.cpp returned a nullptr - this could be many different causes.
262 #[error("null result from llama cpp")]
263 NullResult,
264 /// Failed to convert the path to a rust str. This means the path was not valid unicode
265 #[error("failed to convert path {0} to str")]
266 PathToStrError(PathBuf),
267}
268
269/// An error that can occur when loading a model.
270#[derive(Debug, Eq, PartialEq, thiserror::Error)]
271pub enum LlamaLoraAdapterSetError {
272 /// llama.cpp returned a non-zero error code.
273 #[error("error code from llama cpp")]
274 ErrorResult(i32),
275}
276
277/// An error that can occur when loading a model.
278#[derive(Debug, Eq, PartialEq, thiserror::Error)]
279pub enum LlamaLoraAdapterRemoveError {
280 /// llama.cpp returned a non-zero error code.
281 #[error("error code from llama cpp")]
282 ErrorResult(i32),
283}
284
/// Get the time (in microseconds) according to llama.cpp.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_time_us`.
///
/// ```
/// # use llama_cpp_4::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    // SAFETY: argument-less FFI call returning a plain integer; assumed to
    // have no preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_time_us() }
}
295
/// Get the max number of devices according to llama.cpp (this is generally cuda devices).
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_devices`.
///
/// ```
/// # use llama_cpp_4::max_devices;
/// let n_devices = max_devices();
/// println!("llama.cpp supports up to {n_devices} devices");
/// ```
#[must_use]
pub fn max_devices() -> usize {
    // SAFETY: argument-less FFI call returning a plain integer; assumed to
    // have no preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_max_devices() }
}
306
/// Is memory mapping supported according to llama.cpp?
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mmap`.
///
/// ```
/// # use llama_cpp_4::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///     println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    // SAFETY: argument-less FFI call returning a bool; assumed to have no
    // preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_supports_mmap() }
}
319
/// Is memory locking supported according to llama.cpp?
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mlock`, the same
/// binding that [`llama_supports_mlock`] calls.
///
/// ```
/// # use llama_cpp_4::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///     println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    // SAFETY: argument-less FFI call returning a bool; assumed to have no
    // preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
332
333/// An error that can occur when converting a token to a string.
334#[derive(Debug, thiserror::Error, Clone)]
335#[non_exhaustive]
336pub enum TokenToStringError {
337 /// the token type was unknown
338 #[error("Unknown Token Type")]
339 UnknownTokenType,
340 /// There was insufficient buffer space to convert the token to a string.
341 #[error("Insufficient Buffer Space {0}")]
342 InsufficientBufferSpace(c_int),
343 /// The token was not valid utf8.
344 #[error("FromUtf8Error {0}")]
345 FromUtf8Error(#[from] FromUtf8Error),
346}
347
348/// Failed to convert a string to a token sequence.
349#[derive(Debug, thiserror::Error)]
350pub enum StringToTokenError {
351 /// the string contained a null byte and thus could not be converted to a c string.
352 #[error("{0}")]
353 NulError(#[from] NulError),
354 #[error("{0}")]
355 /// Failed to convert a provided integer to a [`c_int`].
356 CIntConversionError(#[from] std::num::TryFromIntError),
357}
358
359/// Failed to apply model chat template.
360#[derive(Debug, thiserror::Error)]
361pub enum NewLlamaChatMessageError {
362 /// the string contained a null byte and thus could not be converted to a c string.
363 #[error("{0}")]
364 NulError(#[from] NulError),
365}
366
367/// Failed to apply model chat template.
368#[derive(Debug, thiserror::Error)]
369pub enum ApplyChatTemplateError {
370 /// the buffer was too small.
371 #[error("The buffer was too small. Please contact a maintainer and we will update it.")]
372 BuffSizeError,
373 /// the string contained a null byte and thus could not be converted to a c string.
374 #[error("{0}")]
375 NulError(#[from] NulError),
376 /// the string could not be converted to utf8.
377 #[error("{0}")]
378 FromUtf8Error(#[from] FromUtf8Error),
379}
380
/// Get the time in microseconds according to ggml.
///
/// Thin wrapper around `llama_cpp_sys_4::ggml_time_us`.
///
/// ```
/// # use std::time::Duration;
/// use llama_cpp_4::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    // SAFETY: argument-less FFI call returning a plain integer; assumed to
    // have no preconditions (confirm against ggml.h).
    unsafe { llama_cpp_sys_4::ggml_time_us() }
}
400
/// Checks if mlock is supported.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_mlock`, the same
/// binding that [`mlock_supported`] calls.
///
/// ```
/// # use llama_cpp_4::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///     println!("mlock is supported!");
/// } else {
///     println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    // SAFETY: argument-less FFI call returning a bool; assumed to have no
    // preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_supports_mlock() }
}
416
/// Checks if GPU offload is supported.
///
/// Returns `true` if the library was compiled with GPU support (CUDA, Metal, Vulkan, etc.).
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_gpu_offload`.
#[must_use]
pub fn supports_gpu_offload() -> bool {
    // SAFETY: argument-less FFI call returning a bool; assumed to have no
    // preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_supports_gpu_offload() }
}
424
/// Checks if RPC backend is supported.
///
/// Returns `true` if the library was compiled with RPC support.
/// Thin wrapper around `llama_cpp_sys_4::llama_supports_rpc`.
#[must_use]
pub fn supports_rpc() -> bool {
    // SAFETY: argument-less FFI call returning a bool; assumed to have no
    // preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_supports_rpc() }
}
432
433/// Get system information string.
434///
435/// Returns a string containing CPU features, build info, and other system details.
436///
437/// # Panics
438///
439/// Panics if the returned string is not valid UTF-8.
440#[must_use]
441pub fn print_system_info() -> String {
442 let c_str = unsafe { llama_cpp_sys_4::llama_print_system_info() };
443 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
444 c_str
445 .to_str()
446 .expect("system info is not valid UTF-8")
447 .to_owned()
448}
449
/// Get the maximum number of parallel sequences supported.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_parallel_sequences`.
#[must_use]
pub fn max_parallel_sequences() -> usize {
    // SAFETY: argument-less FFI call returning a plain integer; assumed to
    // have no preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_max_parallel_sequences() }
}
455
/// Get the maximum number of tensor buffer type overrides.
///
/// Thin wrapper around `llama_cpp_sys_4::llama_max_tensor_buft_overrides`.
#[must_use]
pub fn max_tensor_buft_overrides() -> usize {
    // SAFETY: argument-less FFI call returning a plain integer; assumed to
    // have no preconditions (confirm against llama.h).
    unsafe { llama_cpp_sys_4::llama_max_tensor_buft_overrides() }
}
461
462/// Get the name of a flash attention type.
463///
464/// # Panics
465///
466/// Panics if the returned string is not valid UTF-8.
467#[must_use]
468pub fn flash_attn_type_name(flash_attn_type: i32) -> String {
469 let c_str = unsafe { llama_cpp_sys_4::llama_flash_attn_type_name(flash_attn_type) };
470 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
471 c_str
472 .to_str()
473 .expect("flash_attn_type_name is not valid UTF-8")
474 .to_owned()
475}
476
477/// Get the string representation of a model metadata key.
478///
479/// # Panics
480///
481/// Panics if the returned string is not valid UTF-8.
482#[must_use]
483pub fn model_meta_key_str(key: u32) -> String {
484 let c_str = unsafe { llama_cpp_sys_4::llama_model_meta_key_str(key) };
485 let c_str = unsafe { std::ffi::CStr::from_ptr(c_str) };
486 c_str
487 .to_str()
488 .expect("meta_key_str is not valid UTF-8")
489 .to_owned()
490}
491
492/// Quantize a model file using typed [`crate::quantize::QuantizeParams`].
493///
494/// Returns `Ok(())` on success, or `Err(code)` with the non-zero error code
495/// returned by `llama_model_quantize`.
496///
497/// # Panics
498///
499/// Panics if either path contains an interior null byte.
500///
501/// # Errors
502///
503/// Returns `Err(code)` with the non-zero status code from `llama_model_quantize`
504/// when quantization fails.
505///
506/// # Example
507///
508/// ```no_run
509/// use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
510///
511/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
512/// .with_nthread(8)
513/// .with_quantize_output_tensor(true);
514///
515/// llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", ¶ms).unwrap();
516/// ```
517pub fn model_quantize(
518 fname_inp: &str,
519 fname_out: &str,
520 params: &quantize::QuantizeParams,
521) -> std::result::Result<(), u32> {
522 let c_inp = std::ffi::CString::new(fname_inp).expect("input path contains null bytes");
523 let c_out = std::ffi::CString::new(fname_out).expect("output path contains null bytes");
524 let guard = params.to_raw();
525 let rc = unsafe {
526 llama_cpp_sys_4::llama_model_quantize(c_inp.as_ptr(), c_out.as_ptr(), &raw const guard.raw)
527 };
528 if rc == 0 {
529 Ok(())
530 } else {
531 Err(rc)
532 }
533}
534
/// Set the log callback.
///
/// Forwards `callback` and `user_data` unchanged to `llama_log_set`.
///
/// # Safety
///
/// The callback and user data must remain valid for the lifetime of the application
/// or until the callback is replaced.
pub unsafe fn log_set(
    callback: llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut std::ffi::c_void,
) {
    // Validity of both arguments is the caller's obligation (see `# Safety`).
    llama_cpp_sys_4::llama_log_set(callback, user_data);
}
547
/// Get the current log callback and user data.
///
/// Forwards both out-pointers unchanged to `llama_log_get`.
///
/// # Safety
///
/// The caller must ensure the pointers are valid.
/// NOTE(review): presumably llama.cpp writes the current callback and user
/// data through them, so they should be valid for writes — confirm against llama.h.
pub unsafe fn log_get(
    log_callback: *mut llama_cpp_sys_4::ggml_log_callback,
    user_data: *mut *mut std::ffi::c_void,
) {
    // Validity of both pointers is the caller's obligation (see `# Safety`).
    llama_cpp_sys_4::llama_log_get(log_callback, user_data);
}
559
/// Initialize optimizer state for fine-tuning.
///
/// Forwards `ctx`, `model` and `params` unchanged to `llama_opt_init`.
///
/// # Safety
///
/// The context and model must be valid and compatible.
pub unsafe fn opt_init(
    ctx: *mut llama_cpp_sys_4::llama_context,
    model: *mut llama_cpp_sys_4::llama_model,
    params: llama_cpp_sys_4::llama_opt_params,
) {
    // Validity of the raw pointers is the caller's obligation (see `# Safety`).
    llama_cpp_sys_4::llama_opt_init(ctx, model, params);
}
572
/// Run one training epoch.
///
/// Forwards every argument unchanged, in the same order, to `llama_opt_epoch`.
///
/// # Safety
///
/// All pointers and handles must be valid.
#[allow(clippy::too_many_arguments)]
pub unsafe fn opt_epoch(
    ctx: *mut llama_cpp_sys_4::llama_context,
    dataset: llama_cpp_sys_4::ggml_opt_dataset_t,
    result_train: llama_cpp_sys_4::ggml_opt_result_t,
    result_eval: llama_cpp_sys_4::ggml_opt_result_t,
    idata_split: i64,
    callback_train: llama_cpp_sys_4::ggml_opt_epoch_callback,
    callback_eval: llama_cpp_sys_4::ggml_opt_epoch_callback,
) {
    // Validity of every pointer and handle is the caller's obligation (see `# Safety`).
    llama_cpp_sys_4::llama_opt_epoch(
        ctx,
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval,
    );
}
598
/// Parameter filter that accepts all tensors (for use with [`opt_init`]).
///
/// Forwards `tensor` and `userdata` unchanged to `llama_opt_param_filter_all`
/// and returns its result.
///
/// # Safety
///
/// The tensor pointer must be valid.
pub unsafe fn opt_param_filter_all(
    tensor: *const llama_cpp_sys_4::ggml_tensor,
    userdata: *mut std::ffi::c_void,
) -> bool {
    // Validity of `tensor` is the caller's obligation (see `# Safety`).
    llama_cpp_sys_4::llama_opt_param_filter_all(tensor, userdata)
}
610
611// ── Crate-root re-exports (see also [`prelude`]) ────────────────────────────
612//
613// These mirror the most common [`prelude`] exports so callers can write
614// `llama_cpp_4::LlamaModel` without a glob import.
615
616/// Parameters used when creating a context.
617pub use context::params::LlamaContextParams;
618/// One captured intermediate tensor from [`TensorCapture`].
619pub use context::CapturedTensor;
620/// An inference context tied to a model.
621pub use context::LlamaContext;
622/// Per-buffer memory usage entry from [`LlamaContext::memory_breakdown`].
623pub use context::MemoryBreakdownEntry;
624/// Hook `cb_eval` during decode to copy named graph tensors (layer hidden states, …).
625pub use context::TensorCapture;
626/// Initialise the llama.cpp backend and hardware drivers.
627pub use llama_backend::LlamaBackend;
628/// Micro-batch submitted to [`LlamaContext::decode`].
629pub use llama_batch::LlamaBatch;
630/// Parameters used when loading a model.
631pub use model::params::LlamaModelParams;
632/// Controls whether tokenisation prepends a BOS token.
633pub use model::AddBos;
634/// A loaded GGUF model.
635pub use model::LlamaModel;
636/// Controls how special tokens are rendered as text.
637pub use model::Special;
638/// Sampler chain for token selection.
639pub use sampling::LlamaSampler;
640/// A single vocabulary token id.
641pub use token::LlamaToken;