llama-cpp-4

Safe Rust bindings to llama.cpp.
Tracks upstream closely: the goal is to stay current with llama.cpp rather than to provide a thick abstraction layer.

llama.cpp version: b8249 · Crate version: 0.2.5


Add to your project

[dependencies]
llama-cpp-4 = "0.2.5"

# GPU support (pick one or more)
# llama-cpp-4 = { version = "0.2.5", features = ["cuda"] }
# llama-cpp-4 = { version = "0.2.5", features = ["metal"] }
# llama-cpp-4 = { version = "0.2.5", features = ["vulkan"] }

Feature flags

Feature       Description
openmp        Multi-threaded CPU inference via OpenMP
cuda          NVIDIA GPU via CUDA
metal         Apple GPU via Metal
vulkan        Cross-platform GPU via Vulkan
native        CPU auto-tune for the current arch (AVX2, NEON, …)
rpc           Remote compute backend
dynamic-link  Link llama.cpp as a shared library instead of statically
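
Features combine freely. For example, CPU auto-tuning plus CUDA offload:

[dependencies]
llama-cpp-4 = { version = "0.2.5", features = ["native", "cuda"] }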

API overview

Backend

use llama_cpp_4::llama_backend::LlamaBackend;

// Initialise once per process. Configures hardware backends (CUDA, Metal, …).
let backend = LlamaBackend::init()?;
// Or initialise with an explicit NUMA strategy instead:
let backend = LlamaBackend::init_numa(NumaStrategy::Disabled)?;

Loading a model

use llama_cpp_4::model::{params::LlamaModelParams, LlamaModel};

let mut params = LlamaModelParams::default();
params = params.with_n_gpu_layers(99); // offload all layers to GPU

let model = LlamaModel::load_from_file(&backend, "model.gguf", &params)?;

// Metadata
println!("vocab size : {}", model.n_vocab());
println!("context len: {}", model.n_ctx_train());
println!("embed dim  : {}", model.n_embd());

// Chat template (Jinja, if the model includes one)
if let Ok(tmpl) = model.get_chat_template(4096) {
    println!("template   : {tmpl}");
}

Tokenising

use llama_cpp_4::model::{AddBos, Special};

let tokens = model.str_to_token("Hello, world!", AddBos::Always)?;
let text   = model.token_to_str(tokens[0], Special::Plaintext)?;
let bytes  = model.token_to_bytes(tokens[0], Special::Plaintext)?;

// token_to_piece is also available on the context
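
A quick round trip composes the two directions, using only the calls above (exact whitespace and BOS handling vary by tokeniser):

let tokens = model.str_to_token("Hello, world!", AddBos::Always)?;
let mut round_trip = String::new();
for &tok in &tokens {
    round_trip.push_str(&model.token_to_str(tok, Special::Plaintext)?);
}
println!("round trip: {round_trip}"); // should closely match the input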

Chat template

use llama_cpp_4::model::LlamaChatMessage;

let messages = vec![
    LlamaChatMessage::new("system".into(),    "You are helpful.".into())?,
    LlamaChatMessage::new("user".into(),      "What is 2+2?".into())?,
];
// Pass None to use the model's built-in template; `true` appends
// the assistant generation prompt
let prompt = model.apply_chat_template(None, messages, true)?;
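
The rendered prompt is an ordinary string, so it feeds straight into str_to_token from the section above (whether BOS should also be added here depends on the model's template):

let tokens = model.str_to_token(&prompt, AddBos::Always)?;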

Creating a context

use llama_cpp_4::context::params::LlamaContextParams;
use std::num::NonZeroU32;

let params = LlamaContextParams::default()
    .with_n_ctx(NonZeroU32::new(4096))
    .with_n_batch(512)
    .with_n_threads(8)
    .with_flash_attn(true);   // enable flash attention kernels

let mut ctx = model.new_context(&backend, params)?;
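
Rather than hardcoding a thread count, you can derive one from the machine. A small sketch using only std and the builder methods shown above:

let threads = std::thread::available_parallelism()
    .map(|n| n.get() as i32)
    .unwrap_or(4);
let params = LlamaContextParams::default()
    .with_n_ctx(NonZeroU32::new(4096))
    .with_n_threads(threads);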

Batched decode (prefill + generation)

use llama_cpp_4::llama_batch::LlamaBatch;

let mut batch = LlamaBatch::new(512, 1);

// Add prompt tokens; only the last token needs logits
for (i, &tok) in tokens.iter().enumerate() {
    let last = i == tokens.len() - 1;
    batch.add(tok, i as i32, &[0], last)?;
}
ctx.decode(&mut batch)?;

// Generation step: new_token is the token just sampled; pos is its
// position (prompt length + number of tokens generated so far)
batch.clear();
batch.add(new_token, pos, &[0], true)?;
ctx.decode(&mut batch)?;
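
Prompts longer than n_batch must be split across several decode calls. A sketch of chunked prefill using only the batch API above (512 matches the n_batch chosen for this context):

let n_batch = 512;
for (chunk_idx, chunk) in tokens.chunks(n_batch).enumerate() {
    batch.clear();
    for (i, &tok) in chunk.iter().enumerate() {
        let pos = chunk_idx * n_batch + i;
        let last = pos == tokens.len() - 1;
        batch.add(tok, pos as i32, &[0], last)?;
    }
    ctx.decode(&mut batch)?;
}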

Sampling

use llama_cpp_4::sampling::LlamaSampler;

// Simple chain
let mut sampler = LlamaSampler::chain_simple([
    LlamaSampler::top_k(40),
    LlamaSampler::top_p(0.95, 1),
    LlamaSampler::temp(0.8),
    LlamaSampler::dist(/* seed */ 42),
]);

// Or greedy
let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);

// Or GBNF grammar (constrained decoding)
let grammar = r#"root ::= "yes" | "no""#;
let mut sampler = LlamaSampler::chain_simple([
    LlamaSampler::grammar(&model, grammar, "root"),
    LlamaSampler::greedy(),
]);

// Sample a token
let token = sampler.sample(&ctx, batch.n_tokens() - 1);

// Check for end-of-generation
if model.is_eog_token(token) { break; }

// Decode to text
let bytes = model.token_to_bytes(token, Special::Plaintext)?;

KV cache

// The KV cache is managed per sequence through the context
ctx.clear_kv_cache_seq(0, None, None)?; // clear all of sequence 0
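
A common pattern keeps a shared prompt prefix in the cache and drops everything after it, so the prefix is reused across requests. A sketch, assuming llama.cpp's usual half-open [p0, p1) range semantics with None meaning "to the end":

let prefix_len = 128; // number of tokens of sequence 0 to keep
ctx.clear_kv_cache_seq(0, Some(prefix_len), None)?;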

Embeddings

use llama_cpp_4::context::params::LlamaContextParams;
use std::num::NonZeroU32;

let params = LlamaContextParams::default()
    .with_embeddings(true)
    .with_n_ctx(NonZeroU32::new(512));
let mut ctx = model.new_context(&backend, params)?;

// ... fill batch, decode ...

// Pooled (sequence-level) embedding
let vec = ctx.embeddings_seq_ith(0)?;

// Per-token embedding
let vec = ctx.embeddings_ith(last_token_pos)?;
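
The returned embeddings are plain f32 slices, so similarity needs no crate-specific API. A minimal cosine similarity:

fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    dot / (na * nb)
}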

LoRA adapters

let adapter = model.load_lora_adapter("adapter.gguf", 1.0)?;
ctx.set_lora_adapter(&adapter, 1.0)?;
// Remove all adapters
ctx.lora_adapter_remove()?;

Performance counters

let perf = ctx.perf_context();
println!("prompt eval: {:.2} ms", perf.t_p_eval_ms);
// tokens/s derived from the counters (field names follow llama.cpp's
// llama_perf_context_data)
let tps = perf.n_eval as f64 * 1000.0 / perf.t_eval_ms;
println!("generation : {:.2} ms  ({tps:.1} t/s)", perf.t_eval_ms);
ctx.perf_context_reset();

Full example: text generation

use llama_cpp_4::{
    context::params::LlamaContextParams,
    llama_backend::LlamaBackend,
    llama_batch::LlamaBatch,
    model::{params::LlamaModelParams, AddBos, LlamaModel, Special},
    sampling::LlamaSampler,
};
use std::io::Write;
use std::num::NonZeroU32;

fn main() -> anyhow::Result<()> {
    let backend = LlamaBackend::init()?;
    let model = LlamaModel::load_from_file(
        &backend, "model.gguf", &LlamaModelParams::default()
    )?;

    let ctx_params = LlamaContextParams::default()
        .with_n_ctx(NonZeroU32::new(2048));
    let mut ctx = model.new_context(&backend, ctx_params)?;

    let tokens = model.str_to_token("The answer is", AddBos::Always)?;
    let n_prompt = tokens.len();

    let mut batch = LlamaBatch::new(2048, 1);
    for (i, &tok) in tokens.iter().enumerate() {
        batch.add(tok, i as i32, &[0], i == n_prompt - 1)?;
    }
    ctx.decode(&mut batch)?;

    let mut sampler = LlamaSampler::chain_simple([
        LlamaSampler::temp(0.8),
        LlamaSampler::dist(0),
    ]);

    let mut pos = n_prompt as i32;
    // incremental UTF-8 decoding of token bytes (encoding_rs crate)
    let mut decoder = encoding_rs::UTF_8.new_decoder();

    for _ in 0..256 {
        // sample from the logits of the batch's last token
        let token = sampler.sample(&ctx, batch.n_tokens() - 1);
        if model.is_eog_token(token) { break; }

        let bytes = model.token_to_bytes(token, Special::Plaintext)?;
        let mut piece = String::new();
        decoder.decode_to_string(&bytes, &mut piece, false);
        print!("{piece}");

        batch.clear();
        batch.add(token, pos, &[0], true)?;
        ctx.decode(&mut batch)?;
        pos += 1;
    }
    Ok(())
}

Safety

This crate wraps a C++ library via FFI. The safe API prevents most misuse, but some patterns (e.g. using a context after its model is dropped) can still cause UB. File an issue if you spot any.

Requirements

  • Rust 1.75+
  • clang (for bindgen at build time)
  • A C++17 compiler (GCC 9+, Clang 10+, MSVC 2019+)
  • For CUDA: CUDA toolkit 11.8+
  • For Metal: Xcode 14+