llama_cpp_4/prelude.rs
1//! Convenience re-exports for typical inference workflows.
2//!
3//! # Quick start
4//!
5//! ```
6//! use llama_cpp_4::prelude::*;
7//! ```
8//!
9//! Core types are also available at the crate root (`llama_cpp_4::LlamaModel`, …)
10//! if you prefer explicit paths over a glob import.
11//!
12//! # What's included
13//!
14//! | Category | Re-exported types |
15//! |---|---|
16//! | Inference | [`LlamaBackend`], [`LlamaModel`], [`LlamaModelParams`], [`LlamaContext`], [`LlamaContextParams`], [`LlamaBatch`], [`LlamaSampler`], [`LlamaSamplerParams`], [`LlamaToken`], [`LlamaTokenDataArray`] |
17//! | Tokenising | [`AddBos`], [`Special`] |
18//! | Chat | [`LlamaChatMessage`] |
19//! | Model introspection | [`LlamaBackendDevice`], [`LlamaBackendDeviceType`] |
20//! | Context params | [`LlamaFlashAttnType`], [`LlamaContextType`], [`LlamaAttentionType`], [`RopeScalingType`], [`LlamaPoolingType`], [`ParamsCloneError`] |
21//! | KV overrides | [`ParamOverrideValue`] |
22//! | Errors | [`Result`], [`LLamaCppError`], [`LlamaModelLoadError`], [`LlamaContextLoadError`], [`DecodeError`], [`EncodeError`], [`EmbeddingsError`], [`BatchAddError`], [`ApplyChatTemplateError`], [`NewLlamaChatMessageError`] |
23//! | Memory / fit | [`get_device_memory_data`], [`fit_params`], [`FitParams`], [`FitParamsResult`], [`FitParamsError`], [`DeviceMemoryReport`], [`DeviceMemoryEntry`], [`DeviceMemoryHyperParams`], [`DeviceMemoryError`], [`MemoryBreakdownEntry`] |
24//! | Tensor capture | [`TensorCapture`], [`CapturedTensor`] |
25//! | Speculative | [`MtpSession`], [`MtpSessionConfig`], [`Eagle3Session`], [`Eagle3SessionConfig`] |
26//! | Quantization | [`QuantizeParams`], [`TensorTypeOverride`], [`GgmlType`], [`LlamaFtype`], [`model_quantize`], [`attn_rot_disabled`], [`set_attn_rot_disabled`] |
27//! | Utilities | [`ggml_time_us`], [`llama_time_us`], [`print_system_info`], [`supports_gpu_offload`], [`max_devices`] |
28//!
29//! With the `mtmd` feature: `MtmdContext`, `MtmdContextParams`, `MtmdBitmap`, `MtmdInputChunks`, `MtmdInputText`, and `MtmdProgressCallback`.\
30//! With the `rpc` feature: `RpcBackend`, `RpcServer`, and `RpcError` (re-exported from `llama_cpp_4::rpc`).
31//!
32//! # Text generation
33//!
34//! ```no_run
35//! use llama_cpp_4::prelude::*;
36//! use std::num::NonZeroU32;
37//!
38//! fn main() {
39//! let backend = LlamaBackend::init().unwrap();
40//! let model = LlamaModel::load_from_file(
41//! &backend,
42//! "model.gguf",
43//! &LlamaModelParams::default(),
44//! )
45//! .unwrap();
46//! let mut ctx = model
47//! .new_context(
48//! &backend,
49//! LlamaContextParams::default().with_n_ctx(NonZeroU32::new(2048)),
50//! )
51//! .unwrap();
52//!
53//! let tokens = model.str_to_token("The answer is", AddBos::Always).unwrap();
54//! let n_prompt = tokens.len();
55//! let mut batch = LlamaBatch::new(2048, 1);
56//! for (i, &tok) in tokens.iter().enumerate() {
57//! batch
58//! .add(tok, i as i32, &[0], i == n_prompt - 1)
59//! .unwrap();
60//! }
61//! ctx.decode(&mut batch).unwrap();
62//!
63//! let sampler = LlamaSampler::chain_simple([
64//! LlamaSampler::temp(0.8),
65//! LlamaSampler::dist(0),
66//! ]);
67//! let token = sampler.sample(&ctx, -1); // -1 = last position in the batch, the only one flagged in the loop above
68//! let _text = model.token_to_bytes(token, Special::Plaintext).unwrap();
69//! }
70//! ```
71//!
72//! # Chat template
73//!
74//! ```no_run
75//! use llama_cpp_4::prelude::*;
76//!
77//! fn main() {
78//! let backend = LlamaBackend::init().unwrap();
79//! let model = LlamaModel::load_from_file(
80//! &backend,
81//! "model.gguf",
82//! &LlamaModelParams::default(),
83//! )
84//! .unwrap();
85//!
86//! let messages = vec![
87//! LlamaChatMessage::new("system".into(), "You are helpful.".into()).unwrap(),
88//! LlamaChatMessage::new("user".into(), "What is 2+2?".into()).unwrap(),
89//! ];
90//! let prompt = model.apply_chat_template(None, &messages, true).unwrap();
91//! let _tokens = model.str_to_token(&prompt, AddBos::Always).unwrap();
92//! }
93//! ```
94//!
95//! # Embeddings
96//!
97//! ```no_run
98//! use llama_cpp_4::prelude::*;
99//! use std::num::NonZeroU32;
100//!
101//! fn main() {
102//! let backend = LlamaBackend::init().unwrap();
103//! let model = LlamaModel::load_from_file(
104//! &backend,
105//! "model.gguf",
106//! &LlamaModelParams::default(),
107//! )
108//! .unwrap();
109//! let mut ctx = model
110//! .new_context(
111//! &backend,
112//! LlamaContextParams::default()
113//! .with_embeddings(true)
114//! .with_n_ctx(NonZeroU32::new(512)),
115//! )
116//! .unwrap();
117//!
118//! let tokens = model.str_to_token("Hello", AddBos::Always).unwrap();
119//! let mut batch = LlamaBatch::new(512, 1);
120//! for (i, &tok) in tokens.iter().enumerate() {
121//! batch
122//! .add(tok, i as i32, &[0], i == tokens.len() - 1)
123//! .unwrap();
124//! }
125//! ctx.decode(&mut batch).unwrap();
126//! let _vec = ctx.embeddings_seq_ith(0).unwrap();
127//! }
128//! ```
129//!
130//! # Memory estimation (before loading fully)
131//!
132//! ```no_run
133//! use llama_cpp_4::prelude::*;
134//! use std::path::Path;
135//!
136//! fn main() {
137//! let report = get_device_memory_data(
138//! Path::new("model.gguf"),
139//! &LlamaModelParams::default(),
140//! &LlamaContextParams::default(),
141//! llama_cpp_sys_4::GGML_LOG_LEVEL_ERROR,
142//! )
143//! .unwrap();
144//! for entry in &report.entries {
145//! println!("projected used: {} bytes", entry.used());
146//! }
147//! }
148//! ```
149
150// ── Core inference ────────────────────────────────────────────────────────────
151
152pub use crate::context::params::{
153 LlamaAttentionType, LlamaContextParams, LlamaContextType, LlamaFlashAttnType, LlamaPoolingType,
154 ParamsCloneError, RopeScalingType,
155};
156pub use crate::context::{CapturedTensor, LlamaContext, MemoryBreakdownEntry, TensorCapture};
157pub use crate::llama_backend::LlamaBackend;
158pub use crate::llama_batch::{BatchAddError, LlamaBatch};
159pub use crate::model::params::kv_overrides::ParamOverrideValue;
160pub use crate::model::params::LlamaModelParams;
161pub use crate::model::{
162 AddBos, LlamaBackendDevice, LlamaBackendDeviceType, LlamaChatMessage, LlamaModel, Special,
163};
164pub use crate::sampling::{LlamaSampler, LlamaSamplerParams};
165pub use crate::token::data_array::LlamaTokenDataArray;
166pub use crate::token::LlamaToken;
167
168// ── Errors & results ────────────────────────────────────────────────────────
169
170pub use crate::{
171 ApplyChatTemplateError, DecodeError, EmbeddingsError, EncodeError, LLamaCppError,
172 LlamaContextLoadError, LlamaModelLoadError, NewLlamaChatMessageError, Result,
173};
174
175// ── Memory / fit helpers ────────────────────────────────────────────────────
176
177pub use crate::fit::{
178 fit_params, get_device_memory_data, DeviceMemoryEntry, DeviceMemoryError,
179 DeviceMemoryHyperParams, DeviceMemoryReport, FitParams, FitParamsError, FitParamsResult,
180};
181
182// ── Speculative decoding ────────────────────────────────────────────────────
183
184pub use crate::eagle::{Eagle3Session, Eagle3SessionConfig};
185pub use crate::mtp::{MtpSession, MtpSessionConfig};
186
187// ── Multimodal (feature `mtmd`) ─────────────────────────────────────────────
188
189#[cfg(feature = "mtmd")]
190pub use crate::mtmd::{
191 MtmdBitmap, MtmdContext, MtmdContextParams, MtmdInputChunks, MtmdInputText,
192 MtmdProgressCallback,
193};
194
195// ── Remote backend (feature `rpc`) ──────────────────────────────────────────
196
197#[cfg(feature = "rpc")]
198pub use crate::rpc::{RpcBackend, RpcError, RpcServer};
199
200// ── Quantization ────────────────────────────────────────────────────────────
201
202pub use crate::quantize::{
203 attn_rot_disabled, set_attn_rot_disabled, GgmlType, LlamaFtype, QuantizeParams,
204 TensorTypeOverride,
205};
206
207// ── Utilities ───────────────────────────────────────────────────────────────
208
209pub use crate::{
210 ggml_time_us, llama_time_us, max_devices, model_quantize, print_system_info,
211 supports_gpu_offload,
212};