llama-cpp-4
Safe Rust bindings to llama.cpp.
Tracks upstream closely — designed to stay current rather than provide a thick abstraction layer.
llama.cpp version: b8533 · Crate version: 0.2.13
Add to your project
```toml
[dependencies]
llama-cpp-4 = "0.2.13"
```
# GPU support (pick one or more)
# llama-cpp-4 = { version = "0.2.13", features = ["cuda"] }
# llama-cpp-4 = { version = "0.2.13", features = ["metal"] }
# llama-cpp-4 = { version = "0.2.13", features = ["vulkan"] }
Feature flags
| Feature | Default | Description |
|---|---|---|
| `openmp` | ✅ | Multi-threaded CPU inference via OpenMP |
| `cuda` | | NVIDIA GPU via CUDA |
| `metal` | | Apple GPU via Metal |
| `vulkan` | | Cross-platform GPU via Vulkan |
| `native` | | CPU auto-tune for current arch (AVX2, NEON, …) |
| `rpc` | | Remote compute backend |
| `dynamic-link` | | Link llama.cpp as a shared library instead of static |
API overview
Backend
use LlamaBackend;
// Initialise once per process. Configures hardware backends (CUDA, Metal, …).
let backend = init?;
// Optional: suppress llama.cpp's stderr log spam
let backend = init_numa?;
Loading a model
use ;
let mut params = default;
params = params.with_n_gpu_layers; // offload all layers to GPU
let model = load_from_file?;
// Metadata
println!;
println!;
println!;
// Chat template (Jinja, if the model includes one)
if let Ok = model.get_chat_template
Tokenising
use ;
let tokens = model.str_to_token?;
let text = model.token_to_str?;
let bytes = model.token_to_bytes?;
// Batch: token_to_piece is available on the context too
Chat template
use LlamaChatMessage;
let messages = vec!;
// Pass None to use the model's built-in template
let prompt = model.apply_chat_template?;
Creating a context
use LlamaContextParams;
use NonZeroU32;
let params = default
.with_n_ctx
.with_n_batch
.with_n_threads
.with_flash_attn; // Flash Attention 2
let mut ctx = model.new_context?;
Batched decode (prefill + generation)
use LlamaBatch;
let mut batch = new;
// Add prompt tokens; only the last token needs logits
for in tokens.iter.enumerate
ctx.decode?;
// Generate one token at a time
batch.clear;
batch.add?;
ctx.decode?;
Sampling
use LlamaSampler;
// Simple chain
let sampler = chain_simple;
// Or greedy
let sampler = chain_simple;
// Or GBNF grammar (constrained decoding)
let grammar = r#"root ::= "yes" | "no""#;
let sampler = chain_simple;
// Sample a token
let token = sampler.sample;
// Check for end-of-generation
if model.is_eog_token
// Decode to text
let bytes = model.token_to_bytes?;
KV cache
// The KV cache is managed through the memory handle
use SeqRm;
ctx.clear_kv_cache_seq?; // clear sequence 0
Embeddings
use LlamaContextParams;
let params = default
.with_embeddings
.with_n_ctx;
let mut ctx = model.new_context?;
// ... fill batch, decode ...
// Pooled (sequence-level) embedding
let vec = ctx.embeddings_seq_ith?;
// Per-token embedding
let vec = ctx.embeddings_ith?;
LoRA adapters
let adapter = model.load_lora_adapter?;
ctx.set_lora_adapter?;
// Remove all adapters
ctx.lora_adapter_remove?;
Performance counters
let perf = ctx.perf_context;
println!;
println!;
ctx.perf_context_reset;
Full example: text generation
use ;
use NonZeroU32;
Safety
This crate wraps a C++ library via FFI. The safe API prevents most misuse, but some patterns (e.g. using a context after its model is dropped) can still cause UB. File an issue if you spot any.
Requirements
- Rust 1.75+
- `clang` (for bindgen at build time)
- A C++17 compiler (GCC 9+, Clang 10+, MSVC 2019+)
- For CUDA: CUDA toolkit 11.8+
- For Metal: Xcode 14+