Expand description
Convenience re-exports for typical inference workflows.
§Quick start
use llama_cpp_4::prelude::*;
Core types are also available at the crate root (llama_cpp_4::LlamaModel, …)
if you prefer explicit paths over a glob import.
§What’s included
With the mtmd feature: MtmdContext, MtmdBitmap, …
With the rpc feature: RpcBackend, RpcServer, and RpcError in llama_cpp_4::rpc.
§Text generation
use llama_cpp_4::prelude::*;
use std::num::NonZeroU32;
fn main() {
let backend = LlamaBackend::init().unwrap();
let model = LlamaModel::load_from_file(
&backend,
"model.gguf",
&LlamaModelParams::default(),
)
.unwrap();
let mut ctx = model
.new_context(
&backend,
LlamaContextParams::default().with_n_ctx(NonZeroU32::new(2048)),
)
.unwrap();
let tokens = model.str_to_token("The answer is", AddBos::Always).unwrap();
let n_prompt = tokens.len();
let mut batch = LlamaBatch::new(2048, 1);
for (i, &tok) in tokens.iter().enumerate() {
batch
.add(tok, i as i32, &[0], i == n_prompt - 1)
.unwrap();
}
ctx.decode(&mut batch).unwrap();
let sampler = LlamaSampler::chain_simple([
LlamaSampler::temp(0.8),
LlamaSampler::dist(0),
]);
let token = sampler.sample(&ctx, 0);
let _text = model.token_to_bytes(token, Special::Plaintext).unwrap();
}
§Chat template
use llama_cpp_4::prelude::*;
fn main() {
let backend = LlamaBackend::init().unwrap();
let model = LlamaModel::load_from_file(
&backend,
"model.gguf",
&LlamaModelParams::default(),
)
.unwrap();
let messages = vec![
LlamaChatMessage::new("system".into(), "You are helpful.".into()).unwrap(),
LlamaChatMessage::new("user".into(), "What is 2+2?".into()).unwrap(),
];
let prompt = model.apply_chat_template(None, &messages, true).unwrap();
let _tokens = model.str_to_token(&prompt, AddBos::Always).unwrap();
}
§Embeddings
use llama_cpp_4::prelude::*;
use std::num::NonZeroU32;
fn main() {
let backend = LlamaBackend::init().unwrap();
let model = LlamaModel::load_from_file(
&backend,
"model.gguf",
&LlamaModelParams::default(),
)
.unwrap();
let mut ctx = model
.new_context(
&backend,
LlamaContextParams::default()
.with_embeddings(true)
.with_n_ctx(NonZeroU32::new(512)),
)
.unwrap();
let tokens = model.str_to_token("Hello", AddBos::Always).unwrap();
let mut batch = LlamaBatch::new(512, 1);
for (i, &tok) in tokens.iter().enumerate() {
batch
.add(tok, i as i32, &[0], i == tokens.len() - 1)
.unwrap();
}
ctx.decode(&mut batch).unwrap();
let _vec = ctx.embeddings_seq_ith(0).unwrap();
}
§Memory estimation (before loading fully)
use llama_cpp_4::prelude::*;
use std::path::Path;
fn main() {
let report = get_device_memory_data(
Path::new("model.gguf"),
&LlamaModelParams::default(),
&LlamaContextParams::default(),
llama_cpp_sys_4::GGML_LOG_LEVEL_ERROR,
)
.unwrap();
for entry in &report.entries {
println!("projected used: {} bytes", entry.used());
}
}
§Re-exports
pub use crate::context::params::LlamaAttentionType;
pub use crate::context::params::LlamaContextParams;
pub use crate::context::params::LlamaContextType;
pub use crate::context::params::LlamaFlashAttnType;
pub use crate::context::params::LlamaPoolingType;
pub use crate::context::params::ParamsCloneError;
pub use crate::context::params::RopeScalingType;
pub use crate::context::CapturedTensor;
pub use crate::context::LlamaContext;
pub use crate::context::MemoryBreakdownEntry;
pub use crate::context::TensorCapture;
pub use crate::llama_backend::LlamaBackend;
pub use crate::llama_batch::BatchAddError;
pub use crate::llama_batch::LlamaBatch;
pub use crate::model::params::kv_overrides::ParamOverrideValue;
pub use crate::model::params::LlamaModelParams;
pub use crate::model::AddBos;
pub use crate::model::LlamaBackendDevice;
pub use crate::model::LlamaBackendDeviceType;
pub use crate::model::LlamaChatMessage;
pub use crate::model::LlamaModel;
pub use crate::model::Special;
pub use crate::sampling::LlamaSampler;
pub use crate::sampling::LlamaSamplerParams;
pub use crate::token::data_array::LlamaTokenDataArray;
pub use crate::token::LlamaToken;
pub use crate::ApplyChatTemplateError;
pub use crate::DecodeError;
pub use crate::EmbeddingsError;
pub use crate::EncodeError;
pub use crate::LLamaCppError;
pub use crate::LlamaContextLoadError;
pub use crate::LlamaModelLoadError;
pub use crate::NewLlamaChatMessageError;
pub use crate::Result;
pub use crate::fit::fit_params;
pub use crate::fit::get_device_memory_data;
pub use crate::fit::DeviceMemoryEntry;
pub use crate::fit::DeviceMemoryError;
pub use crate::fit::DeviceMemoryHyperParams;
pub use crate::fit::DeviceMemoryReport;
pub use crate::fit::FitParams;
pub use crate::fit::FitParamsError;
pub use crate::fit::FitParamsResult;
pub use crate::eagle::Eagle3Session;
pub use crate::eagle::Eagle3SessionConfig;
pub use crate::mtp::MtpSession;
pub use crate::mtp::MtpSessionConfig;
pub use crate::mtmd::MtmdBitmap;
pub use crate::mtmd::MtmdContext;
pub use crate::mtmd::MtmdContextParams;
pub use crate::mtmd::MtmdInputChunks;
pub use crate::mtmd::MtmdInputText;
pub use crate::mtmd::MtmdProgressCallback;
pub use crate::rpc::RpcBackend;
pub use crate::rpc::RpcError;
pub use crate::rpc::RpcServer;
pub use crate::quantize::attn_rot_disabled;
pub use crate::quantize::set_attn_rot_disabled;
pub use crate::quantize::GgmlType;
pub use crate::quantize::LlamaFtype;
pub use crate::quantize::QuantizeParams;
pub use crate::quantize::TensorTypeOverride;
pub use crate::ggml_time_us;
pub use crate::llama_time_us;
pub use crate::max_devices;
pub use crate::model_quantize;
pub use crate::print_system_info;
pub use crate::supports_gpu_offload;