ferrum_quantization/gguf/file.rs
1//! `GgufFile`: mmap-backed reader for a single GGUF file.
2//!
3//! Lifecycle:
4//! 1. `GgufFile::open(path)` — mmaps the file and parses the header.
5//! No tensor payloads are read at this stage.
6//! 2. `architecture()`, `metadata_*()`, `tensor_names()`, `tensor_info()` —
7//! cheap lookups, all served from the parsed header in memory.
8//! 3. `read_tensor(name, device)` — slices the mmap at the right offset
9//! and asks candle to materialise a `QTensor` (still quantized).
10//!
11//! Tensor reads only need a shared `&self` because the mmap is immutable; the
12//! file is safe to share across threads. (Candle's `Content::tensor` wants
13//! a `&mut R: Read + Seek`, but we satisfy it with a fresh `Cursor<&[u8]>`
14//! on each call — the cursor's mutable state is local to the call.)
15
16use std::fs::File;
17use std::io::Cursor;
18use std::path::Path;
19
20use candle_core::quantized::gguf_file::{Content, TensorInfo, Value};
21use candle_core::quantized::QTensor;
22use candle_core::{Device, Error as CandleError, Result as CandleResult};
23use memmap2::Mmap;
24
/// Read-only handle to a memory-mapped GGUF file.
///
/// Holds the file mapping together with the header candle parsed out of it.
/// Metadata and tensor-descriptor lookups are answered from `content`;
/// tensor payload bytes are only ever taken from `mmap`.
pub struct GgufFile {
    /// Memory-mapped file payload. Kept alive for the lifetime of `self`
    /// because `read_tensor` and the byte-slice accessors slice into it.
    mmap: Mmap,
    /// Parsed header / metadata / tensor descriptors. No payload bytes.
    content: Content,
}
33
34impl GgufFile {
35 /// Open and parse the header of a GGUF file.
36 ///
37 /// Returns immediately after the descriptor table is read — no tensor
38 /// data is materialised. `read_tensor` lazy-loads individual tensors.
39 pub fn open(path: impl AsRef<Path>) -> CandleResult<Self> {
40 let path_ref = path.as_ref();
41 let file = File::open(path_ref).map_err(|e| {
42 CandleError::Msg(format!(
43 "failed to open GGUF file '{}': {e}",
44 path_ref.display()
45 ))
46 })?;
47 // SAFETY: `Mmap::map` requires that the underlying file is not modified
48 // while the mapping is live. We treat the file as read-only for the
49 // entire lifetime of `self`. `Mmap` itself only exposes `&[u8]`.
50 let mmap = unsafe { Mmap::map(&file) }.map_err(|e| {
51 CandleError::Msg(format!(
52 "failed to mmap GGUF file '{}': {e}",
53 path_ref.display()
54 ))
55 })?;
56 let mut cursor = Cursor::new(&mmap[..]);
57 let content = Content::read(&mut cursor)?;
58 Ok(Self { mmap, content })
59 }
60
61 /// Raw access to candle's parsed header — for callers that need the full
62 /// `metadata` / `tensor_infos` maps. Prefer the typed accessors below.
63 pub fn content(&self) -> &Content {
64 &self.content
65 }
66
67 // ── Metadata: typed accessors ─────────────────────────────────────────
68 //
69 // GGUF metadata keys are conventionally `<scope>.<field>` strings, e.g.
70 // `general.architecture` or `qwen3.block_count`. Different model families
// namespace under their architecture id. `general.architecture` (exposed
// via `architecture()`) is the one key that's always present and tells you
// which scope to read the rest from.
73
74 /// Architecture string, e.g. `"qwen3"`, `"llama"`. Read from
75 /// `general.architecture`. Errors if the key is missing or non-string.
76 pub fn architecture(&self) -> CandleResult<&str> {
77 self.metadata_string("general.architecture")
78 }
79
80 /// Raw metadata value lookup. Returns `None` if the key is absent.
81 pub fn metadata(&self, key: &str) -> Option<&Value> {
82 self.content.metadata.get(key)
83 }
84
85 /// Read a string-typed metadata field. Errors if missing or wrong type.
86 pub fn metadata_string(&self, key: &str) -> CandleResult<&str> {
87 self.require_metadata(key)?.to_string().map(|s| s.as_str())
88 }
89
90 /// Read a u32-typed metadata field. Errors if missing or wrong type.
91 pub fn metadata_u32(&self, key: &str) -> CandleResult<u32> {
92 self.require_metadata(key)?.to_u32()
93 }
94
95 /// Read a u64-typed metadata field. Errors if missing or wrong type.
96 pub fn metadata_u64(&self, key: &str) -> CandleResult<u64> {
97 self.require_metadata(key)?.to_u64()
98 }
99
100 /// Read an f32-typed metadata field. Errors if missing or wrong type.
101 pub fn metadata_f32(&self, key: &str) -> CandleResult<f32> {
102 self.require_metadata(key)?.to_f32()
103 }
104
105 /// Read a bool-typed metadata field. Errors if missing or wrong type.
106 pub fn metadata_bool(&self, key: &str) -> CandleResult<bool> {
107 self.require_metadata(key)?.to_bool()
108 }
109
110 fn require_metadata(&self, key: &str) -> CandleResult<&Value> {
111 self.metadata(key)
112 .ok_or_else(|| CandleError::Msg(format!("GGUF metadata key missing: '{key}'")))
113 }
114
115 // ── Tensor enumeration ────────────────────────────────────────────────
116
117 /// Total number of tensors declared in the header.
118 pub fn tensor_count(&self) -> usize {
119 self.content.tensor_infos.len()
120 }
121
122 /// Iterate over every tensor name in the file. Order is whatever the
123 /// underlying `HashMap` yields — do not rely on it being deterministic.
124 pub fn tensor_names(&self) -> impl Iterator<Item = &str> {
125 self.content.tensor_infos.keys().map(|s| s.as_str())
126 }
127
128 /// Look up a tensor descriptor (shape, dtype, byte offset) without
129 /// touching the payload. `None` if the tensor isn't in the file.
130 pub fn tensor_info(&self, name: &str) -> Option<&TensorInfo> {
131 self.content.tensor_infos.get(name)
132 }
133
134 /// Whether a tensor with `name` is declared in the header.
135 pub fn has_tensor(&self, name: &str) -> bool {
136 self.content.tensor_infos.contains_key(name)
137 }
138
139 // ── Tensor read ───────────────────────────────────────────────────────
140
141 /// Materialise a tensor as a candle `QTensor` on the target device.
142 ///
143 /// The returned tensor is **still quantized** — no dequant happens here.
144 /// Wrap it in `QMatMul::from_qtensor` for inference, or call
145 /// `QTensor::dequantize(device)` to get a fp32 `Tensor`.
146 ///
147 /// **Beware:** candle copies the bytes into an owned `Vec<u8>` (see
148 /// `TensorInfo::read`). For the steady-state weight upload path use
149 /// [`Self::tensor_byte_slice`] instead — it returns a slice directly
150 /// into the mmap with no allocation.
151 pub fn read_tensor(&self, name: &str, device: &Device) -> CandleResult<QTensor> {
152 let mut cursor = Cursor::new(&self.mmap[..]);
153 self.content.tensor(&mut cursor, name, device)
154 }
155
156 /// Whole mmap region as a byte slice. Used to wrap the file as a single
157 /// zero-copy `MTLBuffer` on Metal — the lifetime of the slice is tied to
158 /// `&self`, so the caller is expected to keep an `Arc<GgufFile>` alive
159 /// for as long as anything references the mmap.
160 pub fn mmap_bytes(&self) -> &[u8] {
161 &self.mmap[..]
162 }
163
164 /// Byte slice covering exactly tensor `name` inside the mmap. The slice
165 /// points into the file mapping, so reads are demand-paged and there is
166 /// no heap allocation. Returns `None` if the tensor isn't declared.
167 ///
168 /// The byte length is computed from the tensor's `(elem_count, ggml_dtype)`
169 /// using candle's `block_size()` / `type_size()`. For raw quant tensors
170 /// (Q4K / Q6K / etc.), these bytes are exactly what `QTensor::data()`
171 /// would return — but with no copy.
172 pub fn tensor_byte_slice(&self, name: &str) -> Option<&[u8]> {
173 let info = self.content.tensor_infos.get(name)?;
174 let elem_count = info.shape.elem_count();
175 let block_size = info.ggml_dtype.block_size();
176 if !elem_count.is_multiple_of(block_size) {
177 return None;
178 }
179 let size_in_bytes = elem_count / block_size * info.ggml_dtype.type_size();
180 let abs_start = (self.content.tensor_data_offset + info.offset) as usize;
181 let abs_end = abs_start.checked_add(size_in_bytes)?;
182 if abs_end > self.mmap.len() {
183 return None;
184 }
185 Some(&self.mmap[abs_start..abs_end])
186 }
187
188 /// `(byte_offset_in_mmap, byte_length)` for tensor `name`. Same
189 /// computation as [`Self::tensor_byte_slice`] but returns the indices
190 /// rather than the slice — useful when the caller already has the
191 /// mmap base pointer (e.g. when binding a region of a shared buffer
192 /// at a given offset).
193 pub fn tensor_byte_range(&self, name: &str) -> Option<(usize, usize)> {
194 let info = self.content.tensor_infos.get(name)?;
195 let elem_count = info.shape.elem_count();
196 let block_size = info.ggml_dtype.block_size();
197 if !elem_count.is_multiple_of(block_size) {
198 return None;
199 }
200 let size_in_bytes = elem_count / block_size * info.ggml_dtype.type_size();
201 let abs_start = (self.content.tensor_data_offset + info.offset) as usize;
202 let abs_end = abs_start.checked_add(size_in_bytes)?;
203 if abs_end > self.mmap.len() {
204 return None;
205 }
206 Some((abs_start, size_in_bytes))
207 }
208}
209
210impl std::fmt::Debug for GgufFile {
211 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
212 f.debug_struct("GgufFile")
213 .field("size_bytes", &self.mmap.len())
214 .field("metadata_keys", &self.content.metadata.len())
215 .field("tensor_count", &self.content.tensor_infos.len())
216 .field("tensor_data_offset", &self.content.tensor_data_offset)
217 .finish()
218 }
219}