use crate::llama_batch::BatchAddError;
use std::ffi::{c_char, CStr, CString, NulError};
use std::fmt::Debug;
use std::num::NonZeroI32;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;
pub mod context;
pub mod gguf;
pub mod llama_backend;
pub mod llama_batch;
#[cfg(feature = "llguidance")]
pub(crate) mod llguidance_sampler;
mod log;
pub mod model;
#[cfg(feature = "mtmd")]
pub mod mtmd;
pub mod openai;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;
pub use crate::context::session::LlamaStateSeqFlags;
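/// Returns `true` if an FFI status code from llama.cpp indicates success.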
pub(crate) fn status_is_ok(status: llama_cpp_sys_2::llama_rs_status) -> bool {
status == llama_cpp_sys_2::LLAMA_RS_STATUS_OK
}
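/// A `Result` alias with [`LlamaCppError`] as the error type.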
pub type Result<T> = std::result::Result<T, LlamaCppError>;
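/// All errors that can be returned by this crate's safe wrappers around llama.cpp.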
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaCppError {
#[error("BackendAlreadyInitialized")]
BackendAlreadyInitialized,
#[error("{0}")]
ChatTemplateError(#[from] ChatTemplateError),
#[error("{0}")]
DecodeError(#[from] DecodeError),
#[error("{0}")]
EncodeError(#[from] EncodeError),
#[error("{0}")]
LlamaModelLoadError(#[from] LlamaModelLoadError),
#[error("{0}")]
LlamaContextLoadError(#[from] LlamaContextLoadError),
#[error("{0}")]
BatchAddError(#[from] BatchAddError),
#[error(transparent)]
EmbeddingError(#[from] EmbeddingsError),
#[error("Backend device {0} not found")]
BackendDeviceNotFound(usize),
#[error("Max devices exceeded. Max devices is {0}")]
MaxDevicesExceeded(usize),
#[error("JsonSchemaToGrammarError: {0}")]
JsonSchemaToGrammarError(String),
}
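/// Errors that can occur while fetching a model's chat template.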
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
#[error("chat template not found - returned null pointer")]
MissingTemplate,
#[error("null byte in string {0}")]
NullError(#[from] NulError),
#[error(transparent)]
Utf8Error(#[from] std::str::Utf8Error),
}
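/// Errors that can occur while reading metadata values from a model.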
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum MetaValError {
#[error("null byte in string {0}")]
NullError(#[from] NulError),
#[error("FromUtf8Error {0}")]
FromUtf8Error(#[from] FromUtf8Error),
#[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
NegativeReturn(i32),
}
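/// Errors that can occur while creating a context from a model.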
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
#[error("null reference from llama.cpp")]
NullReturn,
}
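/// Errors that can occur while decoding a batch of tokens.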
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
#[error("Decode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
#[error("Decode Error -1: n_tokens == 0")]
NTokensZero,
#[error("Decode Error {0}: unknown")]
Unknown(c_int),
}
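/// Errors that can occur while encoding a batch of tokens.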
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
#[error("Encode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
#[error("Encode Error -1: n_tokens == 0")]
NTokensZero,
#[error("Encode Error {0}: unknown")]
Unknown(c_int),
}
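/// Errors that can occur while fetching embeddings from a context.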
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
#[error("Embeddings weren't enabled in the context options")]
NotEnabled,
#[error("Logits were not enabled for the given token")]
LogitsNotEnabled,
#[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
NonePoolType,
}
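/// Errors that can occur while building a grammar sampler from a grammar string.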
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum GrammarError {
#[error("Grammar root not found in grammar string")]
RootNotFound,
#[error("Trigger word contains null bytes")]
TriggerWordNullBytes,
#[error("Grammar string or root contains null bytes")]
GrammarNullBytes,
#[error("Grammar call returned null")]
NullGrammar,
}
impl From<NonZeroI32> for DecodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => DecodeError::NoKvCacheSlot,
-1 => DecodeError::NTokensZero,
i => DecodeError::Unknown(i),
}
}
}
impl From<NonZeroI32> for EncodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => EncodeError::NoKvCacheSlot,
-1 => EncodeError::NTokensZero,
i => EncodeError::Unknown(i),
}
}
}
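/// Errors that can occur while loading a model from a file.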
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
#[error("null byte in string {0}")]
NullError(#[from] NulError),
#[error("null result from llama cpp")]
NullResult,
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
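/// Errors that can occur while initializing a LoRA adapter from a file.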
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
#[error("null byte in string {0}")]
NullError(#[from] NulError),
#[error("null result from llama cpp")]
NullResult,
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
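/// Errors that can occur while applying a LoRA adapter to a context.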
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
#[error("error code from llama cpp")]
ErrorResult(i32),
}
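/// Errors that can occur while removing a LoRA adapter from a context.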
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
#[error("error code from llama cpp")]
ErrorResult(i32),
}
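/// Returns the current time in microseconds, as reported by llama.cpp.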
#[must_use]
pub fn llama_time_us() -> i64 {
unsafe { llama_cpp_sys_2::llama_time_us() }
}
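/// Returns the maximum number of devices supported by llama.cpp.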
#[must_use]
pub fn max_devices() -> usize {
unsafe { llama_cpp_sys_2::llama_max_devices() }
}
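/// Returns whether llama.cpp supports memory-mapped (`mmap`) model loading.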
#[must_use]
pub fn mmap_supported() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}
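/// Returns whether llama.cpp supports locking model memory with `mlock`.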
#[must_use]
pub fn mlock_supported() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
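/// Converts a JSON schema into a GBNF grammar string using llama.cpp's built-in
/// converter.
///
/// # Errors
///
/// Returns [`LlamaCppError::JsonSchemaToGrammarError`] if the schema contains a
/// null byte, the FFI call fails, or the returned grammar is not valid UTF-8.
///
/// # Example
///
/// A minimal sketch, assuming this crate is used under the package name
/// `llama-cpp-2`:
///
/// ```no_run
/// let schema = r#"{ "type": "object", "properties": { "city": { "type": "string" } } }"#;
/// let grammar = llama_cpp_2::json_schema_to_grammar(schema)?;
/// assert!(grammar.contains("root ::="));
/// # Ok::<(), llama_cpp_2::LlamaCppError>(())
/// ```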
pub fn json_schema_to_grammar(schema_json: &str) -> Result<String> {
let schema_cstr = CString::new(schema_json)
.map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))?;
    let mut out = std::ptr::null_mut();
    let rc = unsafe {
        llama_cpp_sys_2::llama_rs_json_schema_to_grammar(schema_cstr.as_ptr(), false, &mut out)
    };
    if !status_is_ok(rc) || out.is_null() {
        return Err(LlamaCppError::JsonSchemaToGrammarError(format!(
            "ffi error {rc}"
        )));
    }
    // Copy the grammar out of the C string, then free the allocation before the
    // fallible UTF-8 conversion so it is released on every path.
    let grammar_bytes = unsafe { CStr::from_ptr(out) }.to_bytes().to_vec();
    unsafe { llama_cpp_sys_2::llama_rs_string_free(out) };
    String::from_utf8(grammar_bytes)
        .map_err(|err| LlamaCppError::JsonSchemaToGrammarError(err.to_string()))
}
#[cfg(test)]
mod tests {
use super::json_schema_to_grammar;
#[test]
fn json_schema_string_api_returns_grammar() {
let schema = r#"{
"type": "object",
"properties": {
"city": { "type": "string" },
"unit": { "enum": ["c", "f"] }
},
"required": ["city"]
}"#;
let grammar =
json_schema_to_grammar(schema).expect("string-based schema conversion should succeed");
assert!(grammar.contains("root ::="));
}
}
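/// Errors that can occur while converting a token to a string.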
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
#[error("Unknown Token Type")]
UnknownTokenType,
#[error("Insufficient Buffer Space {0}")]
InsufficientBufferSpace(c_int),
#[error("FromUtf8Error {0}")]
FromUtf8Error(#[from] FromUtf8Error),
}
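/// Errors that can occur while converting a string into tokens.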
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
#[error("{0}")]
NulError(#[from] NulError),
#[error("{0}")]
CIntConversionError(#[from] std::num::TryFromIntError),
}
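/// Errors that can occur while constructing a new chat message.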
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
#[error("{0}")]
NulError(#[from] NulError),
}
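/// Errors that can occur while applying a chat template.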
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
#[error("{0}")]
NulError(#[from] NulError),
#[error("{0}")]
FromUtf8Error(#[from] FromUtf8Error),
#[error("null result from llama.cpp")]
NullResult,
#[error("ffi error {0}")]
FfiError(i32),
#[error("invalid grammar trigger data")]
InvalidGrammarTriggerType,
}
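/// Errors that can occur while parsing chat output.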
#[derive(Debug, thiserror::Error)]
pub enum ChatParseError {
#[error("{0}")]
NulError(#[from] NulError),
#[error("{0}")]
Utf8Error(#[from] FromUtf8Error),
#[error("null result from llama.cpp")]
NullResult,
#[error("ffi error {0}")]
FfiError(i32),
}
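/// Errors that can occur while accepting a token into a sampler.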
#[derive(Debug, thiserror::Error)]
pub enum SamplerAcceptError {
#[error("ffi error {0}")]
FfiError(i32),
}
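/// Returns the current time in microseconds, as reported by ggml.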
#[must_use]
pub fn ggml_time_us() -> i64 {
unsafe { llama_cpp_sys_2::ggml_time_us() }
}
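/// Returns whether llama.cpp supports `mlock`; equivalent to [`mlock_supported`].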
#[must_use]
pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
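/// The kind of device reported by a ggml backend.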
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
Cpu,
Accelerator,
Gpu,
IntegratedGpu,
Unknown,
}
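/// A ggml backend device, together with its reported memory and device type.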
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
pub index: usize,
pub name: String,
pub description: String,
pub backend: String,
pub memory_total: usize,
pub memory_free: usize,
pub device_type: LlamaBackendDeviceType,
}
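/// Enumerates the devices exposed by the loaded ggml backends, including their
/// reported total and free memory and their device type.
///
/// # Example
///
/// A minimal sketch, assuming this crate is used under the package name
/// `llama-cpp-2`:
///
/// ```no_run
/// for dev in llama_cpp_2::list_llama_ggml_backend_devices() {
///     println!("{} [{}]: {} bytes free", dev.name, dev.backend, dev.memory_free);
/// }
/// ```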
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
    fn cstr_to_string(ptr: *const c_char) -> String {
        if ptr.is_null() {
            String::new()
        } else {
            unsafe { std::ffi::CStr::from_ptr(ptr) }
                .to_string_lossy()
                .to_string()
        }
    }
    let mut devices = Vec::new();
    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
let props = unsafe {
let mut props = std::mem::zeroed();
llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
props
};
let name = cstr_to_string(props.name);
let description = cstr_to_string(props.description);
let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
let backend = cstr_to_string(backend_name);
let memory_total = props.memory_total;
let memory_free = props.memory_free;
let device_type = match props.type_ {
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
_ => LlamaBackendDeviceType::Unknown,
};
devices.push(LlamaBackendDevice {
index: i,
name,
description,
backend,
memory_total,
memory_free,
device_type,
});
}
devices
}
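/// Options controlling how llama.cpp and ggml logs are forwarded to `tracing`.
/// Forwarding is enabled by default.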
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
disabled: bool,
}
impl LogOptions {
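    /// Controls whether log records are forwarded to `tracing`. Passing `false`
    /// silences all llama.cpp and ggml log output.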
#[must_use]
pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
self.disabled = !enabled;
self
}
}
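// C callback installed by `send_logs_to_tracing`. It looks up the per-module
// `log::State`, drops records when forwarding is disabled or the level is not
// enabled, and buffers partial lines and continuation (`GGML_LOG_LEVEL_CONT`)
// records until a complete line can be emitted.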
extern "C" fn logs_to_trace(
level: llama_cpp_sys_2::ggml_log_level,
text: *const ::std::os::raw::c_char,
data: *mut ::std::os::raw::c_void,
) {
use std::borrow::Borrow;
let log_state = unsafe { &*(data as *const log::State) };
if log_state.options.disabled {
return;
}
if !log_state.is_enabled_for_level(level) {
log_state.update_previous_level_for_disabled_log(level);
return;
}
let text = unsafe { std::ffi::CStr::from_ptr(text) };
let text = text.to_string_lossy();
let text: &str = text.borrow();
if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
log_state.cont_buffered_log(text);
} else if text.ends_with('\n') {
log_state.emit_non_cont_line(level, text);
} else {
log_state.buffer_non_cont(level, text);
}
}
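/// Redirects llama.cpp and ggml log output to the `tracing` crate.
///
/// The log state is created on the first call and reused afterwards, so the
/// [`LogOptions`] passed to the first call take effect; later options are ignored.
///
/// # Example
///
/// A minimal sketch, assuming this crate is used under the package name
/// `llama-cpp-2`:
///
/// ```no_run
/// use llama_cpp_2::{send_logs_to_tracing, LogOptions};
///
/// send_logs_to_tracing(LogOptions::default().with_logs_enabled(true));
/// ```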
pub fn send_logs_to_tracing(options: LogOptions) {
let llama_heap_state = Box::as_ref(
log::LLAMA_STATE
.get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
) as *const _;
let ggml_heap_state = Box::as_ref(
log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
) as *const _;
unsafe {
llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
}
}