//! # rig-llama-cpp
//!
//! A [Rig](https://docs.rs/rig-core) provider that runs GGUF models locally
//! via [llama.cpp](https://github.com/ggml-org/llama.cpp), with optional GPU
//! acceleration (Vulkan, CUDA, Metal, or ROCm).
//!
//! This crate implements Rig's [`rig::completion::CompletionModel`] and [`rig::embeddings::EmbeddingModel`] traits
//! so that any GGUF model can be used as a drop-in replacement for cloud-based providers. It supports:
//!
//! - **Completion and streaming** — both one-shot and token-by-token responses.
//! - **Tool calling** — models with OpenAI-compatible chat templates can invoke tools.
//! - **Reasoning / thinking** — extended thinking output is forwarded when the model supports it.
//! - **Configurable sampling** — top-p, top-k, min-p, temperature, presence and repetition penalties.
//! - **Embeddings** — generate text embeddings using GGUF embedding models.
//!
//! # Feature flags
//!
//! There is **no default GPU backend** — pick exactly the one that matches
//! your hardware. With no feature enabled the build is CPU-only.
//!
//! GPU backends (forwarded to `llama-cpp-2`):
//!
//! - `vulkan` — cross-vendor GPU (recommended on Linux/Windows when CUDA/ROCm aren't set up).
//! - `cuda` — NVIDIA GPUs with the CUDA toolkit installed.
//! - `metal` — Apple Silicon / macOS.
//! - `rocm` — AMD GPUs on Linux with the ROCm toolchain.
//!
//! Other:
//!
//! - `openmp` — OpenMP CPU threading; orthogonal to the GPU backends and may be combined with any of them.
//! - `mtmd` — multimodal (vision) inference; required for `Client::from_gguf_with_mmproj` and `ClientBuilder::mmproj`.
//!
//! Examples:
//!
//! ```text
//! cargo build --features vulkan
//! cargo build --features cuda
//! cargo build --features "vulkan,mtmd"
//! ```
//!
//! Backend support depends on the corresponding `llama-cpp-2` feature and any required
//! native toolchain or system libraries being available on the host machine.
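//!
//! In a downstream project the same features are enabled on the dependency
//! itself. A minimal `Cargo.toml` sketch (the wildcard version is a
//! placeholder, not a pinned release):
//!
//! ```toml
//! [dependencies]
//! rig-llama-cpp = { version = "*", features = ["vulkan"] }
//! ```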
//!
//! # Quick start
//!
//! ```rust,no_run
//! use rig::client::CompletionClient;
//! use rig::completion::Prompt;
//!
//! # #[tokio::main]
//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let client = rig_llama_cpp::Client::builder("path/to/model.gguf")
//!     .n_ctx(8192)
//!     .build()?;
//!
//! let agent = client
//!     .agent("local")
//!     .preamble("You are a helpful assistant.")
//!     .max_tokens(512)
//!     .build();
//!
//! let response = agent.prompt("Hello!").await?;
//! println!("{response}");
//! # Ok(())
//! # }
//! ```

mod checkpoint;
mod client;
mod embedding;
mod error;
#[cfg(feature = "mtmd")]
mod image;
mod loader;
mod parsing;
mod prompt;
mod request;
mod sampling;
mod slot;
mod types;
mod worker;

pub use client::{Client, ClientBuilder, Model};
pub use embedding::{EmbeddingClient, EmbeddingModelHandle};
pub use error::LoadError;
pub use types::{
    CheckpointParams, FitParams, KvCacheParams, KvCacheType, RawResponse, SamplingParams,
    StreamChunk,
};

/// Returns `true` when the environment variable `name` is set to a truthy
/// value (`1`, `true`, `yes`, or `on`, case-insensitive).
fn env_flag_enabled(name: &str) -> bool {
    match std::env::var(name) {
        Ok(value) => matches!(
            value.trim().to_ascii_lowercase().as_str(),
            "1" | "true" | "yes" | "on"
        ),
        Err(_) => false,
    }
}

/// Whether to forward llama.cpp's *C-side* logging to stderr.
///
/// This only controls log lines that originate inside the `llama-cpp-2` /
/// `llama-cpp-sys-2` C++ code (via `printf`-style writes that bypass Rust's
/// `log` facade). Library-level diagnostics from `rig-llama-cpp` itself go
/// through the [`log`] crate and are controlled by the consumer's logger
/// configuration (e.g. `RUST_LOG=rig_llama_cpp=debug`), not this env var.
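///
/// For example, to see both the forwarded llama.cpp output and this crate's
/// own debug logs while running a consuming binary (invocation illustrative):
///
/// ```text
/// RIG_LLAMA_CPP_LOGS=1 RUST_LOG=rig_llama_cpp=debug cargo run
/// ```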
fn llama_logs_enabled() -> bool {
    env_flag_enabled("RIG_LLAMA_CPP_LOGS")
}

/// Process-wide [`LlamaBackend`] initialised on first use and shared by every
/// worker (chat + embedding). The underlying llama.cpp backend is a global
/// singleton — calling `LlamaBackend::init()` twice in the same process
/// returns `BackendAlreadyInitialized`. Routing all callers through this
/// helper means a chat client and an embedding client can coexist without
/// racing on the C-side init flag.
///
/// Returns `Ok(&'static LlamaBackend)` once the backend is up; subsequent
/// calls are cheap (single `OnceLock::get`). On platforms where init can
/// fail (e.g. no Vulkan device) the error is sticky for the lifetime of
/// the process — there's no recovering anyway.
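///
/// A crate-internal caller typically passes the returned handle straight to
/// model loading in `llama-cpp-2`; a minimal sketch (assuming `llama-cpp-2`'s
/// `LlamaModel::load_from_file` API, not code from this module):
///
/// ```ignore
/// let backend = shared_backend()?;
/// let params = llama_cpp_2::model::params::LlamaModelParams::default();
/// let model = llama_cpp_2::model::LlamaModel::load_from_file(backend, "model.gguf", &params)?;
/// ```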
pub(crate) fn shared_backend() -> Result<&'static llama_cpp_2::llama_backend::LlamaBackend, String>
{
    use llama_cpp_2::llama_backend::LlamaBackend;
    use std::sync::{Mutex, OnceLock};

    static BACKEND: OnceLock<LlamaBackend> = OnceLock::new();
    static INIT_LOCK: Mutex<()> = Mutex::new(());

    if let Some(b) = BACKEND.get() {
        return Ok(b);
    }
    // Serialise concurrent first-time initialisations. The C-side init flag
    // is process-global so multiple threads racing on `LlamaBackend::init`
    // will produce `BackendAlreadyInitialized` for the loser even though
    // they all want the same handle.
    let _guard = INIT_LOCK.lock().map_err(|e| e.to_string())?;
    if let Some(b) = BACKEND.get() {
        return Ok(b);
    }

    let mut backend = LlamaBackend::init().map_err(|e| format!("Backend init failed: {e}"))?;
    if !llama_logs_enabled() {
        backend.void_logs();
        // NOTE: upstream llama-cpp-2 0.1.146 does not yet expose a way to
        // silence mtmd's own log stream — when the `mtmd` feature is on,
        // mmproj init may print to stderr. Track upstream for an mtmd
        // log-silencing API and re-enable suppression here.
    }
    let _ = BACKEND.set(backend);
    // INVARIANT: we hold `INIT_LOCK` for the duration of this function and
    // just called `BACKEND.set(backend)`. Any concurrent caller that
    // reached the second `BACKEND.get()` check above already returned,
    // so reaching this line means we are the unique writer and `BACKEND`
    // is now `Some`. Even if `set()` raced (`Err`-returning), the "loser"
    // still observes the state filled by the winner — `get()` is
    // guaranteed to return `Some`.
    Ok(BACKEND.get().expect("BACKEND set above under INIT_LOCK"))
}