llama_cpp_4/context/
memory_breakdown.rs

1//! Per-buffer-type memory usage reported by llama.cpp.
2//!
3//! [`MemoryBreakdownEntry`] values are produced by
4//! [`crate::context::LlamaContext::memory_breakdown`] and classify bytes into
5//! model weights, KV / recurrent cache, and temporary compute buffers for each
6//! ggml backend buffer type (e.g. `CUDA0`, `Metal`, `Host`).
7//!
8//! # Examples
9//!
10//! ```no_run
11//! use llama_cpp_4::prelude::*;
12//!
13//! fn main() {
14//!     let backend = LlamaBackend::init().unwrap();
15//!     let model = LlamaModel::load_from_file(&backend, "model.gguf", &LlamaModelParams::default()).unwrap();
16//!     let ctx = model.new_context(&backend, LlamaContextParams::default()).unwrap();
17//!     for entry in ctx.memory_breakdown() {
18//!         println!("{}: {} bytes total", entry.buft_name, entry.total());
19//!     }
20//! }
21//! ```
22
23use std::ffi::CStr;
24
/// Memory usage reported for one backend buffer type (for example `CUDA0` or `Host`).
///
/// Every counter is expressed in bytes. Use [`MemoryBreakdownEntry::total`] for
/// the combined figure.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MemoryBreakdownEntry {
    /// Name of the buffer type as reported by ggml, in human-readable form.
    pub buft_name: String,
    /// Bytes occupied by model weights placed on this buffer type.
    pub model: usize,
    /// Bytes occupied by the KV / recurrent context cache.
    pub context: usize,
    /// Bytes occupied by temporary compute buffers.
    pub compute: usize,
}
37
38impl MemoryBreakdownEntry {
39    /// Sum of model, context, and compute bytes.
40    #[must_use]
41    pub fn total(&self) -> usize {
42        self.model + self.context + self.compute
43    }
44}
45
46fn raw_entry_to_rust(
47    entry: &llama_cpp_sys_4::llama_memory_breakdown_entry,
48) -> MemoryBreakdownEntry {
49    let bytes: &[u8] = unsafe {
50        std::slice::from_raw_parts(entry.buft_name.as_ptr().cast(), entry.buft_name.len())
51    };
52    let name = CStr::from_bytes_until_nul(bytes)
53        .map(|c| c.to_string_lossy().into_owned())
54        .unwrap_or_default();
55    MemoryBreakdownEntry {
56        buft_name: name,
57        model: entry.model,
58        context: entry.context,
59        compute: entry.compute,
60    }
61}
62
63/// Collect memory breakdown entries for a live context.
64///
65/// Wraps the `ext_shim` helper around `llama_get_memory_breakdown`. Grows the
66/// output buffer until every entry fits. Returns an empty vector when the
67/// context pointer is invalid or no buffers are registered yet.
68///
69/// Prefer [`crate::context::LlamaContext::memory_breakdown`] in application code.
70#[must_use]
71pub(crate) fn collect_memory_breakdown(
72    ctx: *const llama_cpp_sys_4::llama_context,
73) -> Vec<MemoryBreakdownEntry> {
74    if ctx.is_null() {
75        return Vec::new();
76    }
77
78    let mut capacity = 16usize;
79    loop {
80        let mut raw = vec![
81            llama_cpp_sys_4::llama_memory_breakdown_entry {
82                buft_name: [0; 128],
83                model: 0,
84                context: 0,
85                compute: 0,
86            };
87            capacity
88        ];
89
90        let n = unsafe {
91            llama_cpp_sys_4::llama_memory_breakdown_collect(ctx, raw.as_mut_ptr(), capacity)
92        };
93
94        if n < capacity {
95            return raw
96                .into_iter()
97                .take(n)
98                .map(|e| raw_entry_to_rust(&e))
99                .collect();
100        }
101
102        capacity = capacity.saturating_mul(2);
103        if capacity > 4096 {
104            return raw.into_iter().map(|e| raw_entry_to_rust(&e)).collect();
105        }
106    }
107}
llama_cpp_4/context/memory_breakdown.rs

llama_cpp_4/context/
memory_breakdown.rs