#![allow(clippy::pedantic)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::similar_names)]
#![allow(clippy::too_many_lines)]
#![allow(clippy::cognitive_complexity)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]
#![allow(clippy::doc_markdown)]
#![allow(clippy::needless_continue)]
#![allow(clippy::match_same_arms)]
#![allow(clippy::unnested_or_patterns)]
#![allow(clippy::unreadable_literal)]
#![allow(clippy::type_complexity)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::must_use_candidate)]
#![allow(clippy::double_must_use)]
#![allow(clippy::missing_safety_doc)]
#![allow(clippy::upper_case_acronyms)]
use std::ffi::{c_char, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;
use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;
pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;
/// Crate-level `Result` alias whose error type is fixed to [`LlamaCppError`].
pub type Result<T> = std::result::Result<T, LlamaCppError>;
/// Top-level error type for this crate; it is the error half of the crate's `Result` alias.
///
/// Most variants wrap one of the narrower error enums through `#[from]`, so those errors
/// convert into `LlamaCppError` with the `?` operator.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
    /// NOTE(review): presumably raised when backend initialization is attempted while a
    /// backend is already initialized — confirm against the `llama_backend` module.
    #[error("BackendAlreadyInitialized")]
    BackendAlreadyInitialized,
    /// A [`ChatTemplateError`], displayed using the inner error's message.
    #[error("{0}")]
    ChatTemplateError(#[from] ChatTemplateError),
    /// A [`DecodeError`], displayed using the inner error's message.
    #[error("{0}")]
    DecodeError(#[from] DecodeError),
    /// An [`EncodeError`], displayed using the inner error's message.
    #[error("{0}")]
    EncodeError(#[from] EncodeError),
    /// A [`LlamaModelLoadError`], displayed using the inner error's message.
    #[error("{0}")]
    LlamaModelLoadError(#[from] LlamaModelLoadError),
    /// A [`LlamaContextLoadError`], displayed using the inner error's message.
    #[error("{0}")]
    LlamaContextLoadError(#[from] LlamaContextLoadError),
    /// A [`BatchAddError`] from the `llama_batch` module, displayed using the inner
    /// error's message.
    #[error("{0}")]
    BatchAddError(#[from] BatchAddError),
    /// An [`EmbeddingsError`]; `transparent` forwards both `Display` and `source` to it.
    #[error(transparent)]
    EmbeddingError(#[from] EmbeddingsError),
    /// No backend device was found for the carried index.
    #[error("Backend device {0} not found")]
    BackendDeviceNotFound(usize),
    /// The device limit was exceeded; the payload is the maximum reported in the message.
    #[error("Max devices exceeded. Max devices is {0}")]
    MaxDevicesExceeded(usize),
}
/// Errors related to obtaining a chat template.
///
/// NOTE(review): the code that produces these errors is outside this section — confirm
/// the exact call sites before relying on the descriptions below.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
/// No chat template was found; per the message, a null pointer was returned.
#[error("chat template not found - returned null pointer")]
MissingTemplate,
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// Wraps a [`std::str::Utf8Error`]; `transparent` forwards `Display` and `source` to it.
#[error(transparent)]
Utf8Error(#[from] std::str::Utf8Error),
}
/// Errors for metadata value lookups.
///
/// NOTE(review): presumably produced when reading model metadata — the call sites are
/// outside this section, confirm there.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// Wraps a [`FromUtf8Error`] raised while converting bytes into a `String`.
#[error("FromUtf8Error {0}")]
FromUtf8Error(#[from] FromUtf8Error),
/// A negative value was returned; the payload is that value. Per the message this is
/// likely caused by a missing index or key.
#[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
NegativeReturn(i32),
}
/// Error for a failed context load.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
/// llama.cpp handed back a null reference instead of a context.
#[error("null reference from llama.cpp")]
NullReturn,
}
/// Failure modes of a decode call, distinguished by status code.
///
/// The `From<NonZeroI32>` impl in this file maps `1` to [`DecodeError::NoKvCacheSlot`],
/// `-1` to [`DecodeError::NTokensZero`] and every other value to [`DecodeError::Unknown`].
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
/// Status code `1`.
/// NOTE(review): presumably no free slot was available in the KV cache — confirm
/// against llama.cpp.
#[error("Decode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
/// Status code `-1`; per the message, `n_tokens == 0`.
#[error("Decode Error -1: n_tokens == 0")]
NTokensZero,
/// Any other status code; the payload is the raw code.
#[error("Decode Error {0}: unknown")]
Unknown(c_int),
}
/// Failure modes of an encode call, distinguished by status code.
///
/// The `From<NonZeroI32>` impl in this file maps `1` to [`EncodeError::NoKvCacheSlot`],
/// `-1` to [`EncodeError::NTokensZero`] and every other value to [`EncodeError::Unknown`].
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
/// Status code `1`.
/// NOTE(review): presumably no free slot was available in the KV cache — confirm
/// against llama.cpp.
#[error("Encode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
/// Status code `-1`; per the message, `n_tokens == 0`.
#[error("Encode Error -1: n_tokens == 0")]
NTokensZero,
/// Any other status code; the payload is the raw code.
#[error("Encode Error {0}: unknown")]
Unknown(c_int),
}
/// Errors for embedding retrieval.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
/// Embeddings were not enabled in the context options.
#[error("Embeddings weren't enabled in the context options")]
NotEnabled,
/// Logits were not enabled for the requested token.
#[error("Logits were not enabled for the given token")]
LogitsNotEnabled,
/// Sequence embeddings were requested from a model that only supports
/// `LLAMA_POOLING_TYPE_NONE`.
#[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
NonePoolType,
}
/// Errors for grammar handling.
///
/// NOTE(review): presumably raised while building a grammar-based sampler — the call
/// sites are outside this section, confirm there.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
/// The grammar root was not found in the grammar string.
#[error("Grammar root not found in grammar string")]
RootNotFound,
/// A trigger word contained null bytes.
#[error("Trigger word contains null bytes")]
TriggerWordNullBytes,
/// The grammar string or the root contained null bytes.
#[error("Grammar string or root contains null bytes")]
GrammarNullBytes,
/// The grammar call returned null.
#[error("Grammar call returned null")]
NullGrammar,
}
impl From<NonZeroI32> for DecodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => DecodeError::NoKvCacheSlot,
-1 => DecodeError::NTokensZero,
i => DecodeError::Unknown(i),
}
}
}
impl From<NonZeroI32> for EncodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => EncodeError::NoKvCacheSlot,
-1 => EncodeError::NTokensZero,
i => EncodeError::Unknown(i),
}
}
}
/// Errors for loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// llama.cpp returned a null result.
#[error("null result from llama cpp")]
NullResult,
/// The carried path could not be converted to a `str`.
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
/// Errors for initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// llama.cpp returned a null result.
#[error("null result from llama cpp")]
NullResult,
/// The carried path could not be converted to a `str`.
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
#[error("error code from llama cpp")]
ErrorResult(i32),
}
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
#[error("error code from llama cpp")]
ErrorResult(i32),
}
/// Returns the value of the `llama_time_us` binding.
///
/// NOTE(review): presumably a timestamp in microseconds (per the `_us` suffix) — confirm
/// against the llama.cpp header.
#[must_use]
pub fn llama_time_us() -> i64 {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::llama_time_us() }
}
/// Returns the value reported by the `llama_max_devices` binding.
///
/// NOTE(review): presumably the maximum number of devices llama.cpp supports — confirm
/// against the llama.cpp header.
#[must_use]
pub fn max_devices() -> usize {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::llama_max_devices() }
}
/// Returns what the `llama_supports_mmap` binding reports.
#[must_use]
pub fn mmap_supported() -> bool {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::llama_supports_mmap() }
}
/// Returns what the `llama_supports_mlock` binding reports.
///
/// [`llama_supports_mlock`] in this file calls the same binding.
#[must_use]
pub fn mlock_supported() -> bool {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::llama_supports_mlock() }
}
/// Errors for converting a token into a string.
///
/// Marked `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
/// The token's type was not recognized.
#[error("Unknown Token Type")]
UnknownTokenType,
/// The output buffer was too small.
/// NOTE(review): the meaning of the carried `c_int` (e.g. required size vs. raw return
/// code) is not visible here — confirm at the call site.
#[error("Insufficient Buffer Space {0}")]
InsufficientBufferSpace(c_int),
/// Wraps a [`FromUtf8Error`] raised while converting bytes into a `String`.
#[error("FromUtf8Error {0}")]
FromUtf8Error(#[from] FromUtf8Error),
}
/// Errors for converting a string into tokens.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
/// Wraps a [`NulError`], i.e. the string contained an interior null byte.
#[error("{0}")]
NulError(#[from] NulError),
/// Wraps a [`std::num::TryFromIntError`] from a failed integer conversion.
/// NOTE(review): presumably a length that does not fit in `c_int` — confirm at the
/// call site.
#[error("{0}")]
CIntConversionError(#[from] std::num::TryFromIntError),
}
/// Error for constructing a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("{0}")]
NulError(#[from] NulError),
}
/// Errors for applying a chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
/// Wraps a [`NulError`], i.e. a string contained an interior null byte.
#[error("{0}")]
NulError(#[from] NulError),
/// Wraps a [`FromUtf8Error`] raised while converting bytes into a `String`.
#[error("{0}")]
FromUtf8Error(#[from] FromUtf8Error),
}
/// Returns the value of the `ggml_time_us` binding.
///
/// NOTE(review): presumably a timestamp in microseconds (per the `_us` suffix) — confirm
/// against the ggml header.
#[must_use]
pub fn ggml_time_us() -> i64 {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::ggml_time_us() }
}
/// Returns what the `llama_supports_mlock` binding reports.
///
/// This calls the same binding as [`mlock_supported`]; both return the same answer.
#[must_use]
pub fn llama_supports_mlock() -> bool {
// SAFETY: the call passes no arguments, so there are no pointer or lifetime invariants
// for the caller to uphold here.
unsafe { infrastructure_llama_bindings::llama_supports_mlock() }
}
/// Kind of backend device, as classified by `list_llama_ggml_backend_devices`.
///
/// Each variant corresponds to one `GGML_BACKEND_DEVICE_TYPE_*` constant; any other raw
/// value becomes [`LlamaBackendDeviceType::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
/// Mapped from `GGML_BACKEND_DEVICE_TYPE_CPU`.
Cpu,
/// Mapped from `GGML_BACKEND_DEVICE_TYPE_ACCEL`.
Accelerator,
/// Mapped from `GGML_BACKEND_DEVICE_TYPE_GPU`.
Gpu,
/// Mapped from `GGML_BACKEND_DEVICE_TYPE_IGPU`.
IntegratedGpu,
/// Fallback for any device type value not covered by the variants above.
Unknown,
}
/// Description of one backend device, as produced by `list_llama_ggml_backend_devices`.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
/// Index that was passed to `ggml_backend_dev_get` to obtain this device.
pub index: usize,
/// Device name from the device properties (empty if the C pointer was null).
pub name: String,
/// Device description from the device properties (empty if the C pointer was null).
pub description: String,
/// Name of the backend registry the device belongs to (empty if the C pointer was null).
pub backend: String,
/// `memory_total` from the device properties — presumably bytes, TODO confirm.
pub memory_total: usize,
/// `memory_free` from the device properties — presumably bytes, TODO confirm.
pub memory_free: usize,
/// Device kind derived from the raw `type_` property.
pub device_type: LlamaBackendDeviceType,
}
/// Lists every device reported by the ggml backend registry.
///
/// For each index below `ggml_backend_dev_count()` this fetches the device handle, reads
/// its properties and the name of its backend registry, and converts them into a
/// [`LlamaBackendDevice`]. Null C strings become empty strings and invalid UTF-8 is
/// replaced lossily.
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    // Copies a C string into an owned `String`; a null pointer yields an empty string.
    fn owned_or_empty(raw: *const c_char) -> String {
        if raw.is_null() {
            return String::new();
        }
        // SAFETY: `raw` is non-null; assumes ggml hands out valid NUL-terminated strings
        // that stay alive for the duration of this call — TODO confirm.
        let c_text = unsafe { std::ffi::CStr::from_ptr(raw) };
        c_text.to_string_lossy().into_owned()
    }

    // SAFETY: the call passes no arguments.
    let device_count = unsafe { infrastructure_llama_bindings::ggml_backend_dev_count() };

    (0..device_count)
        .map(|index| {
            // SAFETY: `index` is below the device count reported just above.
            let device = unsafe { infrastructure_llama_bindings::ggml_backend_dev_get(index) };
            // SAFETY: the props struct is zero-initialised and then filled in by ggml;
            // assumes all-zero is a valid bit pattern for it — TODO confirm.
            let props = unsafe {
                let mut raw_props = std::mem::zeroed();
                infrastructure_llama_bindings::ggml_backend_dev_get_props(
                    device,
                    &raw mut raw_props,
                );
                raw_props
            };

            let name = owned_or_empty(props.name);
            let description = owned_or_empty(props.description);

            // SAFETY: `device` came from `ggml_backend_dev_get`; assumes the registry handle
            // it returns is valid for `ggml_backend_reg_name` — TODO confirm.
            let registry_name = unsafe {
                let registry = infrastructure_llama_bindings::ggml_backend_dev_backend_reg(device);
                infrastructure_llama_bindings::ggml_backend_reg_name(registry)
            };
            let backend = owned_or_empty(registry_name);

            let device_type = match props.type_ {
                infrastructure_llama_bindings::GGML_BACKEND_DEVICE_TYPE_CPU => {
                    LlamaBackendDeviceType::Cpu
                }
                infrastructure_llama_bindings::GGML_BACKEND_DEVICE_TYPE_ACCEL => {
                    LlamaBackendDeviceType::Accelerator
                }
                infrastructure_llama_bindings::GGML_BACKEND_DEVICE_TYPE_GPU => {
                    LlamaBackendDeviceType::Gpu
                }
                infrastructure_llama_bindings::GGML_BACKEND_DEVICE_TYPE_IGPU => {
                    LlamaBackendDeviceType::IntegratedGpu
                }
                _ => LlamaBackendDeviceType::Unknown,
            };

            LlamaBackendDevice {
                index,
                name,
                description,
                backend,
                memory_total: props.memory_total,
                memory_free: props.memory_free,
                device_type,
            }
        })
        .collect()
}
/// Options for the log forwarding installed by `send_logs_to_tracing`.
///
/// The `Default` value leaves logging enabled (`disabled` is `false`).
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
    /// When `true`, the log callback returns without handling the message.
    disabled: bool,
}

impl LogOptions {
    /// Builder-style setter: returns the options with logging switched on when `enabled`
    /// is `true` and off when it is `false`.
    #[must_use]
    pub fn with_logs_enabled(self, enabled: bool) -> Self {
        let mut updated = self;
        // The flag is stored inverted: the field tracks "disabled".
        updated.disabled = !enabled;
        updated
    }
}
/// C-ABI log callback registered by `send_logs_to_tracing`.
///
/// `data` is reinterpreted as a `log::State`. The message is dropped when the state's
/// options are disabled or when the state reports `level` as not enabled. Otherwise the
/// text is decoded lossily and routed to one of three `log::State` methods: continuation
/// messages, complete lines (ending in `'\n'`), or partial lines.
extern "C" fn logs_to_trace(
    level: infrastructure_llama_bindings::ggml_log_level,
    text: *const ::std::os::raw::c_char,
    data: *mut ::std::os::raw::c_void,
) {
    // SAFETY: `send_logs_to_tracing` registers this callback together with a pointer to a
    // boxed `log::State` held in a static of the `log` module; assumes that state is
    // never dropped while the callback is installed — TODO confirm.
    let state = unsafe { &*data.cast::<log::State>() };

    if state.options.disabled {
        return;
    }

    let level_enabled = state.is_enabled_for_level(level);
    if !level_enabled {
        state.update_previous_level_for_disabled_log(level);
        return;
    }

    // SAFETY: assumes the logger always passes a valid, NUL-terminated string that
    // outlives this call — TODO confirm.
    let decoded = unsafe { std::ffi::CStr::from_ptr(text) }.to_string_lossy();
    let message: &str = &decoded;

    let is_continuation = level == infrastructure_llama_bindings::GGML_LOG_LEVEL_CONT;
    if is_continuation {
        state.cont_buffered_log(message);
        return;
    }

    if message.ends_with('\n') {
        state.emit_non_cont_line(level, message);
    } else {
        state.buffer_non_cont(level, message);
    }
}
/// Installs `logs_to_trace` as the log callback for both llama.cpp and ggml.
///
/// A `log::State` is obtained for each library from `log::LLAMA_STATE` and
/// `log::GGML_STATE` via `get_or_init`, and its address is handed to the C side as the
/// callback's user data.
///
/// NOTE(review): because the states are created through `get_or_init`, `options` passed
/// on a later call appear to be ignored once a state exists — confirm against the `log`
/// module.
pub fn send_logs_to_tracing(options: LogOptions) {
    let llama_boxed = log::LLAMA_STATE
        .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone())));
    let llama_state: *const log::State = Box::as_ref(llama_boxed);

    let ggml_boxed =
        log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options)));
    let ggml_state: *const log::State = Box::as_ref(ggml_boxed);

    // SAFETY: both pointers refer to boxed states stored in statics of the `log` module;
    // the callback only ever reads them through a shared reference. Assumes the statics
    // keep the boxes alive for as long as the callbacks stay registered — TODO confirm.
    unsafe {
        infrastructure_llama_bindings::llama_log_set(
            Some(logs_to_trace),
            llama_state.cast_mut().cast(),
        );
        infrastructure_llama_bindings::ggml_log_set(
            Some(logs_to_trace),
            ggml_state.cast_mut().cast(),
        );
    }
}