//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
//! with all the Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! ## Inference
//!
//! ```no_run
//! use llama_cpp_2::model::LlamaModel;
//! use llama_cpp_2::llama_backend::LlamaBackend;
//! use llama_cpp_2::context::params::LlamaContextParams;
//! use llama_cpp_2::llama_batch::LlamaBatch;
//! use llama_cpp_2::model::params::LlamaModelParams;
//! use llama_cpp_2::token::data_array::LlamaTokenDataArray;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//!
//! // initialize GGML
//! let backend = LlamaBackend::init()?;
//!
//! // load the model (this may be slow)
//! let model = LlamaModel::load_from_file(&backend, "path/to/model", &LlamaModelParams::default())?;
//! let prompt = "How do I kill a process on linux?";
//! let tokens = model.str_to_token(prompt, true)?;
//!
//! // create a context and batch
//! let mut context = model.new_context(&backend, &LlamaContextParams::default())?;
//! let mut batch = LlamaBatch::new(512, 1);
//! let mut pos: i32 = 0;
//!
//! // add the prompt to the batch
//! let last_index = i32::try_from(tokens.len())? - 1;
//! for token in tokens {
//!     batch.add(token, pos, &[0], pos == last_index);
//!     pos += 1;
//! }
//!
//! let mut response = vec![];
//!
//! // decode the batch and greedily sample the next token, ten times
//! for _ in 0..10 {
//!     context.decode(&mut batch)?;
//!     let token = context.sample_token_greedy(LlamaTokenDataArray::from_iter(context.candidates_ith(batch.n_tokens() - 1), false));
//!     response.push(token);
//!     batch.clear();
//!     batch.add(token, pos, &[0], true);
//!     pos += 1;
//! }
//!
//! let response_str = model.tokens_to_str(&response)?;
//! println!("{}", response_str);
//!
//! # Ok(())
//! # }
//! ```
use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;
pub mod context;
pub mod grammar;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod timing;
pub mod token;
pub mod token_type;
/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LLamaCppError>;
/// All errors that can occur in the llama-cpp crate.
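///
/// A minimal sketch of handling one of these errors, assuming [`crate::llama_backend::LlamaBackend::init`]
/// reports [`LLamaCppError::BackendAlreadyInitialized`] when the backend is initialized a second time:
/// ```no_run
/// use llama_cpp_2::LLamaCppError;
/// use llama_cpp_2::llama_backend::LlamaBackend;
///
/// let _backend = LlamaBackend::init().unwrap();
/// match LlamaBackend::init() {
///     // a second initialization can generally be ignored
///     Err(LLamaCppError::BackendAlreadyInitialized) => {}
///     Err(other) => panic!("unexpected error: {other}"),
///     Ok(_) => {}
/// }
/// ```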
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
    /// The backend was already initialized. This can generally be ignored as initializing the backend
    /// is idempotent.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// There was an error while decoding a batch.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// There was an error loading a model.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// There was an error creating a new model context.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
}
/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
    /// llama.cpp returned null
    #[error("null reference from llama.cpp")]
    NullReturn,
}
/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
    /// No kv cache slot was available.
    #[error("Decode Error 1: NoKvCacheSlot")]
    NoKvCacheSlot,
    /// The number of tokens in the batch was 0.
    #[error("Decode Error -1: n_tokens == 0")]
    NTokensZero,
    /// An unknown error occurred.
    #[error("Decode Error {0}: unknown")]
    Unknown(c_int),
}
/// Decode an error code returned by llama.cpp into a [`DecodeError`].
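///
/// For example, the non-zero return codes map as follows (a short sketch mirroring the
/// conversion in this impl):
/// ```
/// # use llama_cpp_2::DecodeError;
/// # use std::num::NonZeroI32;
/// let err = DecodeError::from(NonZeroI32::new(1).unwrap());
/// assert_eq!(err, DecodeError::NoKvCacheSlot);
///
/// let err = DecodeError::from(NonZeroI32::new(-1).unwrap());
/// assert_eq!(err, DecodeError::NTokensZero);
/// ```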
impl From<NonZeroI32> for DecodeError {
    fn from(value: NonZeroI32) -> Self {
        match value.get() {
            1 => DecodeError::NoKvCacheSlot,
            -1 => DecodeError::NTokensZero,
            i => DecodeError::Unknown(i),
        }
    }
}
/// An error that can occur when loading a model.
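///
/// A small illustration of the [`NulError`] conversion; the other variants are produced inside
/// the library while loading a model:
/// ```
/// # use llama_cpp_2::LlamaModelLoadError;
/// // an interior null byte cannot be represented in a C string
/// let nul_err = std::ffi::CString::new("model\0path").unwrap_err();
/// assert!(matches!(LlamaModelLoadError::from(nul_err), LlamaModelLoadError::NullError(_)));
/// ```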
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
    /// There was a null byte in a provided string and thus it could not be converted to a C string.
    #[error("null byte in string {0}")]
    NullError(#[from] NulError),
    /// llama.cpp returned a nullptr - this could have many different causes.
    #[error("null result from llama cpp")]
    NullResult,
    /// Failed to convert the path to a Rust str. This means the path was not valid Unicode.
    #[error("failed to convert path {0} to str")]
    PathToStrError(PathBuf),
}
/// Get the time (in microseconds) according to llama.cpp.
/// ```
/// # use llama_cpp_2::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
    unsafe { llama_cpp_sys_2::llama_time_us() }
}
/// Get the maximum number of devices according to llama.cpp (these are generally CUDA devices).
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> c_int {
    unsafe { llama_cpp_sys_2::llama_max_devices() }
}
/// Check whether memory mapping is supported according to llama.cpp.
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
///     println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_mmap_supported() }
}
/// Check whether memory locking is supported according to llama.cpp.
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
///     println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
    unsafe { llama_cpp_sys_2::llama_mlock_supported() }
}
/// An error that can occur when converting a token to a string.
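///
/// A short sketch of the [`FromUtf8Error`] conversion; the other variants are produced by the
/// library itself during detokenization:
/// ```
/// # use llama_cpp_2::TokenToStringError;
/// // 0xC0 is never valid UTF-8
/// let utf8_err = String::from_utf8(vec![0xC0]).unwrap_err();
/// assert!(matches!(TokenToStringError::from(utf8_err), TokenToStringError::FromUtf8Error(_)));
/// ```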
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
    /// The token type was unknown.
    #[error("Unknown Token Type")]
    UnknownTokenType,
    /// There was insufficient buffer space to convert the token to a string.
    #[error("Insufficient Buffer Space {0}")]
    InsufficientBufferSpace(c_int),
    /// The token was not valid UTF-8.
    #[error("FromUtf8Error {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
}
/// Failed to convert a string to a token sequence.
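///
/// A small sketch of the [`NulError`] conversion (a string with an interior null byte cannot be
/// passed to llama.cpp as a C string):
/// ```
/// # use llama_cpp_2::StringToTokenError;
/// let nul_err = std::ffi::CString::new("hello\0world").unwrap_err();
/// assert!(matches!(StringToTokenError::from(nul_err), StringToTokenError::NulError(_)));
/// ```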
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
    /// The string contained a null byte and thus could not be converted to a C string.
    #[error("{0}")]
    NulError(#[from] NulError),
    /// Failed to convert a provided integer to a `c_int`.
    #[error("{0}")]
    CIntConversionError(#[from] std::num::TryFromIntError),
}