llama_cpp_2/lib.rs
//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/examples/simple)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;

use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;

pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;

/// A fallible result from a llama.cpp function.
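///
/// A minimal sketch of how the alias composes with the `#[from]` conversions on
/// [`LLamaCppError`]; the `might_decode` helper is hypothetical and purely illustrative.
///
/// ```
/// use llama_cpp_2::{DecodeError, LLamaCppError};
///
/// // Hypothetical helper: `?` promotes the `DecodeError` into `LLamaCppError` via `#[from]`.
/// fn might_decode(raw: Result<(), DecodeError>) -> llama_cpp_2::Result<()> {
///     raw?;
///     Ok(())
/// }
///
/// assert_eq!(
///     might_decode(Err(DecodeError::NoKvCacheSlot)),
///     Err(LLamaCppError::DecodeError(DecodeError::NoKvCacheSlot))
/// );
/// ```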
pub type Result<T> = std::result::Result<T, LLamaCppError>;

/// All errors that can occur in the llama-cpp crate.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while getting the chat template from the model.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error while encoding a batch.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// There was an error adding a token to a batch.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// See [`EmbeddingsError`]
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    // See [`LlamaSamplerError`]
}

/// There was an error while getting the chat template from a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
    /// gguf has no chat template (by that name)
    #[error("chat template not found - returned null pointer")]
    MissingTemplate,

    /// chat template contained a null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The chat template was not valid utf8.
    #[error(transparent)]
    Utf8Error(#[from] std::str::Utf8Error),
}

/// Failed fetching metadata value
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
    /// The provided string contains an unexpected null byte
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),

    /// The returned data contains invalid UTF8 data
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),

    /// Got negative return value. This happens if the key or index queried does not exist.
    #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
    NegativeReturn(i32),
}

/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}

/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}

/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
    /// No kv cache slot was available.
    #[error("Encode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Encode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Encode Error {0}: unknown")]
    Unknown(c_int),
}

/// When embedding related functions fail
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
    /// Embeddings weren't enabled in the context options
    #[error("Embeddings weren't enabled in the context options")]
    NotEnabled,
    /// Logits weren't enabled for the given token
    #[error("Logits were not enabled for the given token")]
    LogitsNotEnabled,
    /// Sequence embeddings can't be retrieved because the model only supports LLAMA_POOLING_TYPE_NONE
    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
    NonePoolType,
}

/// Decode an error from llama.cpp into a [`DecodeError`].
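///
/// A small doctest-style sketch of the mapping implemented by the match arms below:
///
/// ```
/// use std::num::NonZeroI32;
/// use llama_cpp_2::DecodeError;
///
/// // A return code of 1 means no kv cache slot was available for the batch.
/// let err = DecodeError::from(NonZeroI32::new(1).unwrap());
/// assert_eq!(err, DecodeError::NoKvCacheSlot);
/// ```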
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}

/// Encode an error from llama.cpp into an [`EncodeError`].
impl From<NonZeroI32> for EncodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => EncodeError::NoKvCacheSlot,
            -1 => EncodeError::NTokensZero,
            i => EncodeError::Unknown(i),
        }
    }
}

/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could be many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a rust str. This means the path was not valid unicode
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}

/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
    /// llama.cpp returned a non-zero error code.
    #[error("error code from llama cpp")]
    ErrorResult(i32),
}

/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}

/// get the max number of devices according to llama.cpp (this is generally cuda devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}

/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///     println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}

/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///     println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// An error that can occur when converting a token to a string.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// the token type was unknown
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid utf8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a [`c_int`].
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}

/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
}

/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
    /// the string contained a null byte and thus could not be converted to a c string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// the string could not be converted to utf8.
    #[error("{0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}

/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// # use llama_cpp_2::llama_backend::LlamaBackend;
/// let backend = LlamaBackend::init().unwrap();
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10)
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::ggml_time_us() }
}

/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
///     println!("mlock is supported!");
/// } else {
///     println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
    unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    disabled: bool,
}

impl LogOptions {
    /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. The default is for
    /// logs to be sent to tracing.
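    ///
    /// # Example
    ///
    /// A minimal sketch of the builder in use, suppressing all llama.cpp / ggml logs:
    ///
    /// ```no_run
    /// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
    ///
    /// // After this call nothing from llama.cpp or ggml reaches tracing.
    /// send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));
    /// ```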
    pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
        self.disabled = !enabled;
        self
    }
}

extern "C" fn logs_to_trace(
    level: llama_cpp_sys_2::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
    // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
    // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
    // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
    use std::borrow::Borrow;

    let log_state = unsafe { &*(data as *const log::State) };

    if log_state.options.disabled {
        return;
    }

    // If the log level is disabled, we can just return early
    if !log_state.is_enabled_for_level(level) {
        log_state.update_previous_level_for_disabled_log(level);
        return;
    }

    let text = unsafe { std::ffi::CStr::from_ptr(text) };
    let text = text.to_string_lossy();
    let text: &str = text.borrow();

    // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'.
    // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To
    // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in
    // to know how to flush it.

    if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
        log_state.cont_buffered_log(text);
    } else if text.ends_with('\n') {
        log_state.emit_non_cont_line(level, text);
    } else {
        log_state.buffer_non_cont(level, text);
    }
}

/// Redirect llama.cpp logs into tracing.
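///
/// A minimal sketch of a call site, assuming a `tracing` subscriber is installed elsewhere
/// in the program:
///
/// ```no_run
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// // Forward all llama.cpp and ggml logs to the active tracing subscriber.
/// send_logs_to_tracing(LogOptions::default());
/// ```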
pub fn send_logs_to_tracing(options: LogOptions) {
    // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

    // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two
    // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing
    // newline and calls a GGML function, the logs won't be weirdly intermixed; instead, llama.cpp logs
    // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs.
    let llama_heap_state = Box::as_ref(
        log::LLAMA_STATE
            .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
    ) as *const _;
    let ggml_heap_state = Box::as_ref(
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
    ) as *const _;

    unsafe {
        // GGML has to be set after llama since setting llama sets ggml as well.
        llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
        llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
    }
}