llama_cpp_4/context/memory_breakdown.rs
1//! Per-buffer-type memory usage reported by llama.cpp.
2//!
3//! [`MemoryBreakdownEntry`] values are produced by
4//! [`crate::context::LlamaContext::memory_breakdown`] and classify bytes into
5//! model weights, KV / recurrent cache, and temporary compute buffers for each
6//! ggml backend buffer type (e.g. `CUDA0`, `Metal`, `Host`).
7//!
8//! # Examples
9//!
10//! ```no_run
11//! use llama_cpp_4::prelude::*;
12//!
13//! fn main() {
14//! let backend = LlamaBackend::init().unwrap();
15//! let model = LlamaModel::load_from_file(&backend, "model.gguf", &LlamaModelParams::default()).unwrap();
16//! let ctx = model.new_context(&backend, LlamaContextParams::default()).unwrap();
17//! for entry in ctx.memory_breakdown() {
18//! println!("{}: {} bytes total", entry.buft_name, entry.total());
19//! }
20//! }
21//! ```
22
23use std::ffi::CStr;
24
/// Memory attributed to a single backend buffer type (e.g. CUDA0, Host).
///
/// All counters are byte counts; use [`MemoryBreakdownEntry::total`] for
/// their sum.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MemoryBreakdownEntry {
    /// Human-readable buffer-type name from ggml.
    pub buft_name: String,
    /// Bytes used by model weights on this buffer type.
    pub model: usize,
    /// Bytes used by the KV / recurrent context cache.
    pub context: usize,
    /// Bytes used by temporary compute buffers.
    pub compute: usize,
}

impl MemoryBreakdownEntry {
    /// Sum of model, context, and compute bytes.
    ///
    /// The addition saturates at [`usize::MAX`] instead of panicking (debug
    /// builds) or wrapping (release builds) when the three counters do not
    /// fit in a `usize`, e.g. on 32-bit targets.
    #[must_use]
    pub fn total(&self) -> usize {
        self.model
            .saturating_add(self.context)
            .saturating_add(self.compute)
    }
}
45
46fn raw_entry_to_rust(
47 entry: &llama_cpp_sys_4::llama_memory_breakdown_entry,
48) -> MemoryBreakdownEntry {
49 let bytes: &[u8] = unsafe {
50 std::slice::from_raw_parts(entry.buft_name.as_ptr().cast(), entry.buft_name.len())
51 };
52 let name = CStr::from_bytes_until_nul(bytes)
53 .map(|c| c.to_string_lossy().into_owned())
54 .unwrap_or_default();
55 MemoryBreakdownEntry {
56 buft_name: name,
57 model: entry.model,
58 context: entry.context,
59 compute: entry.compute,
60 }
61}
62
63/// Collect memory breakdown entries for a live context.
64///
65/// Wraps the `ext_shim` helper around `llama_get_memory_breakdown`. Grows the
66/// output buffer until every entry fits. Returns an empty vector when the
67/// context pointer is invalid or no buffers are registered yet.
68///
69/// Prefer [`crate::context::LlamaContext::memory_breakdown`] in application code.
70#[must_use]
71pub(crate) fn collect_memory_breakdown(
72 ctx: *const llama_cpp_sys_4::llama_context,
73) -> Vec<MemoryBreakdownEntry> {
74 if ctx.is_null() {
75 return Vec::new();
76 }
77
78 let mut capacity = 16usize;
79 loop {
80 let mut raw = vec![
81 llama_cpp_sys_4::llama_memory_breakdown_entry {
82 buft_name: [0; 128],
83 model: 0,
84 context: 0,
85 compute: 0,
86 };
87 capacity
88 ];
89
90 let n = unsafe {
91 llama_cpp_sys_4::llama_memory_breakdown_collect(ctx, raw.as_mut_ptr(), capacity)
92 };
93
94 if n < capacity {
95 return raw
96 .into_iter()
97 .take(n)
98 .map(|e| raw_entry_to_rust(&e))
99 .collect();
100 }
101
102 capacity = capacity.saturating_mul(2);
103 if capacity > 4096 {
104 return raw.into_iter().map(|e| raw_entry_to_rust(&e)).collect();
105 }
106 }
107}