Skip to main content

Module prelude

Module prelude 

Source
Expand description

Convenience re-exports for typical inference workflows.

§Quick start

use llama_cpp_4::prelude::*;

Core types are also available at the crate root (llama_cpp_4::LlamaModel, …) if you prefer explicit paths over a glob import.

§What’s included

| Category | Re-exported types |
| --- | --- |
| Inference | LlamaBackend, LlamaModel, LlamaModelParams, LlamaContext, LlamaContextParams, LlamaBatch, LlamaSampler, LlamaSamplerParams, LlamaToken, LlamaTokenDataArray |
| Tokenising | AddBos, Special |
| Chat | LlamaChatMessage |
| Model introspection | LlamaBackendDevice, LlamaBackendDeviceType |
| Context params | LlamaFlashAttnType, LlamaContextType, LlamaAttentionType, RopeScalingType, LlamaPoolingType, ParamsCloneError |
| KV overrides | ParamOverrideValue |
| Errors | Result, LLamaCppError, DecodeError, EncodeError, EmbeddingsError, BatchAddError, ApplyChatTemplateError, NewLlamaChatMessageError, LlamaModelLoadError, LlamaContextLoadError |
| Memory / fit | get_device_memory_data, fit_params, FitParams, FitParamsResult, FitParamsError, DeviceMemoryReport, DeviceMemoryEntry, DeviceMemoryError, DeviceMemoryHyperParams, MemoryBreakdownEntry |
| Tensor capture | TensorCapture, CapturedTensor |
| Speculative | MtpSession, MtpSessionConfig, Eagle3Session, Eagle3SessionConfig |
| Quantization | QuantizeParams, TensorTypeOverride, GgmlType, LlamaFtype, model_quantize, attn_rot_disabled, set_attn_rot_disabled |
| Utilities | ggml_time_us, llama_time_us, print_system_info, supports_gpu_offload, max_devices |

With the mtmd feature: MtmdContext, MtmdBitmap, …
With the rpc feature: RpcBackend, RpcServer, and RpcError in llama_cpp_4::rpc.

§Text generation

use llama_cpp_4::prelude::*;
use std::num::NonZeroU32;

/// Minimal text-generation walkthrough: load `model.gguf`, decode the prompt
/// `"The answer is"`, and sample a single follow-up token.
///
/// Every fallible step is `unwrap`ped to keep the example short, so any
/// failure (backend init, model load, context creation, tokenising, batching,
/// decoding, detokenising) panics.
fn main() {
    let backend = LlamaBackend::init().unwrap();
    let model = LlamaModel::load_from_file(
        &backend,
        "model.gguf",
        &LlamaModelParams::default(),
    )
    .unwrap();
    let mut ctx = model
        .new_context(
            &backend,
            LlamaContextParams::default().with_n_ctx(NonZeroU32::new(2048)),
        )
        .unwrap();

    let tokens = model.str_to_token("The answer is", AddBos::Always).unwrap();
    let n_prompt = tokens.len();
    let mut batch = LlamaBatch::new(2048, 1);
    for (i, &tok) in tokens.iter().enumerate() {
        // Position `i`, sequence 0. The final flag is true only for the last
        // prompt token, so that is the only position with an output to sample.
        batch
            .add(tok, i as i32, &[0], i == n_prompt - 1)
            .unwrap();
    }
    ctx.decode(&mut batch).unwrap();

    let sampler = LlamaSampler::chain_simple([
        LlamaSampler::temp(0.8),
        LlamaSampler::dist(0),
    ]);
    // Sample from the last output (-1). Index 0 would address the first prompt
    // token, which was not flagged for output above whenever the prompt is
    // longer than one token.
    let token = sampler.sample(&ctx, -1);
    let _text = model.token_to_bytes(token, Special::Plaintext).unwrap();
}

§Chat template

use llama_cpp_4::prelude::*;

/// Chat-template walkthrough: load `model.gguf`, render a system + user
/// conversation through `apply_chat_template`, and tokenise the resulting
/// prompt. Every fallible call is `unwrap`ped, so any failure panics.
fn main() {
    let backend = LlamaBackend::init().unwrap();
    let model_params = LlamaModelParams::default();
    let model = LlamaModel::load_from_file(&backend, "model.gguf", &model_params).unwrap();

    // Build the conversation turn by turn, in the order it should be rendered.
    let system_turn = LlamaChatMessage::new("system".into(), "You are helpful.".into()).unwrap();
    let user_turn = LlamaChatMessage::new("user".into(), "What is 2+2?".into()).unwrap();
    let messages = vec![system_turn, user_turn];

    // `None` passes no explicit template argument.
    let prompt = model.apply_chat_template(None, &messages, true).unwrap();
    let _tokens = model.str_to_token(&prompt, AddBos::Always).unwrap();
}

§Embeddings

use llama_cpp_4::prelude::*;
use std::num::NonZeroU32;

/// Embeddings walkthrough: load `model.gguf`, create a context with
/// embeddings enabled, decode the prompt `"Hello"`, and read the embedding
/// for sequence 0. Every fallible call is `unwrap`ped, so any failure panics.
fn main() {
    let backend = LlamaBackend::init().unwrap();
    let model_params = LlamaModelParams::default();
    let model = LlamaModel::load_from_file(&backend, "model.gguf", &model_params).unwrap();

    // Context configured for embeddings with n_ctx = 512.
    let ctx_params = LlamaContextParams::default()
        .with_embeddings(true)
        .with_n_ctx(NonZeroU32::new(512));
    let mut ctx = model.new_context(&backend, ctx_params).unwrap();

    let tokens = model.str_to_token("Hello", AddBos::Always).unwrap();
    let mut batch = LlamaBatch::new(512, 1);
    for (pos, token) in tokens.iter().copied().enumerate() {
        // The final flag is true only for the last token of the prompt.
        let is_last = pos + 1 == tokens.len();
        batch.add(token, pos as i32, &[0], is_last).unwrap();
    }
    ctx.decode(&mut batch).unwrap();

    let _vec = ctx.embeddings_seq_ith(0).unwrap();
}

§Memory estimation (before fully loading the model)

use llama_cpp_4::prelude::*;
use std::path::Path;

/// Memory-estimation walkthrough: request a device memory report for
/// `model.gguf` using default model and context parameters, then print the
/// `used()` figure of every entry in the report. Panics if the report cannot
/// be produced.
fn main() {
    let model_path = Path::new("model.gguf");
    let model_params = LlamaModelParams::default();
    let ctx_params = LlamaContextParams::default();

    let report = get_device_memory_data(
        model_path,
        &model_params,
        &ctx_params,
        llama_cpp_sys_4::GGML_LOG_LEVEL_ERROR,
    )
    .unwrap();

    // One output line per report entry.
    for row in &report.entries {
        let used_bytes = row.used();
        println!("projected used: {} bytes", used_bytes);
    }
}

Re-exports§

pub use crate::context::params::LlamaAttentionType;
pub use crate::context::params::LlamaContextParams;
pub use crate::context::params::LlamaContextType;
pub use crate::context::params::LlamaFlashAttnType;
pub use crate::context::params::LlamaPoolingType;
pub use crate::context::params::ParamsCloneError;
pub use crate::context::params::RopeScalingType;
pub use crate::context::CapturedTensor;
pub use crate::context::LlamaContext;
pub use crate::context::MemoryBreakdownEntry;
pub use crate::context::TensorCapture;
pub use crate::llama_backend::LlamaBackend;
pub use crate::llama_batch::BatchAddError;
pub use crate::llama_batch::LlamaBatch;
pub use crate::model::params::kv_overrides::ParamOverrideValue;
pub use crate::model::params::LlamaModelParams;
pub use crate::model::AddBos;
pub use crate::model::LlamaBackendDevice;
pub use crate::model::LlamaBackendDeviceType;
pub use crate::model::LlamaChatMessage;
pub use crate::model::LlamaModel;
pub use crate::model::Special;
pub use crate::sampling::LlamaSampler;
pub use crate::sampling::LlamaSamplerParams;
pub use crate::token::data_array::LlamaTokenDataArray;
pub use crate::token::LlamaToken;
pub use crate::ApplyChatTemplateError;
pub use crate::DecodeError;
pub use crate::EmbeddingsError;
pub use crate::EncodeError;
pub use crate::LLamaCppError;
pub use crate::LlamaContextLoadError;
pub use crate::LlamaModelLoadError;
pub use crate::NewLlamaChatMessageError;
pub use crate::Result;
pub use crate::fit::fit_params;
pub use crate::fit::get_device_memory_data;
pub use crate::fit::DeviceMemoryEntry;
pub use crate::fit::DeviceMemoryError;
pub use crate::fit::DeviceMemoryHyperParams;
pub use crate::fit::DeviceMemoryReport;
pub use crate::fit::FitParams;
pub use crate::fit::FitParamsError;
pub use crate::fit::FitParamsResult;
pub use crate::eagle::Eagle3Session;
pub use crate::eagle::Eagle3SessionConfig;
pub use crate::mtp::MtpSession;
pub use crate::mtp::MtpSessionConfig;
pub use crate::mtmd::MtmdBitmap;
pub use crate::mtmd::MtmdContext;
pub use crate::mtmd::MtmdContextParams;
pub use crate::mtmd::MtmdInputChunks;
pub use crate::mtmd::MtmdInputText;
pub use crate::mtmd::MtmdProgressCallback;
pub use crate::rpc::RpcBackend;
pub use crate::rpc::RpcError;
pub use crate::rpc::RpcServer;
pub use crate::quantize::attn_rot_disabled;
pub use crate::quantize::set_attn_rot_disabled;
pub use crate::quantize::GgmlType;
pub use crate::quantize::LlamaFtype;
pub use crate::quantize::QuantizeParams;
pub use crate::quantize::TensorTypeOverride;
pub use crate::ggml_time_us;
pub use crate::llama_time_us;
pub use crate::max_devices;
pub use crate::model_quantize;
pub use crate::print_system_info;
pub use crate::supports_gpu_offload;