ferrum_quantization/gguf/
file.rs

1//! `GgufFile`: mmap-backed reader for a single GGUF file.
2//!
3//! Lifecycle:
4//!   1. `GgufFile::open(path)` — mmaps the file and parses the header.
5//!      No tensor payloads are read at this stage.
6//!   2. `architecture()`, `metadata_*()`, `tensor_names()`, `tensor_info()` —
7//!      cheap lookups, all served from the parsed header in memory.
8//!   3. `read_tensor(name, device)` — slices the mmap at the right offset
9//!      and asks candle to materialise a `QTensor` (still quantized).
10//!
11//! Tensor reads only need a shared `&self` because the mmap is immutable; the
12//! file is safe to share across threads. (Candle's `Content::tensor` wants
13//! a `&mut R: Read + Seek`, but we satisfy it with a fresh `Cursor<&[u8]>`
14//! on each call — the cursor's mutable state is local to the call.)
15
16use std::fs::File;
17use std::io::Cursor;
18use std::path::Path;
19
20use candle_core::quantized::gguf_file::{Content, TensorInfo, Value};
21use candle_core::quantized::QTensor;
22use candle_core::{Device, Error as CandleError, Result as CandleResult};
23use memmap2::Mmap;
24
25/// Read-only handle to a memory-mapped GGUF file.
26pub struct GgufFile {
27    /// memory-mapped file payload. Kept alive for the lifetime of `self`
28    /// because `read_tensor` slices into it.
29    mmap: Mmap,
30    /// Parsed header / metadata / tensor descriptors. No payload bytes.
31    content: Content,
32}
33
34impl GgufFile {
35    /// Open and parse the header of a GGUF file.
36    ///
37    /// Returns immediately after the descriptor table is read — no tensor
38    /// data is materialised. `read_tensor` lazy-loads individual tensors.
39    pub fn open(path: impl AsRef<Path>) -> CandleResult<Self> {
40        let path_ref = path.as_ref();
41        let file = File::open(path_ref).map_err(|e| {
42            CandleError::Msg(format!(
43                "failed to open GGUF file '{}': {e}",
44                path_ref.display()
45            ))
46        })?;
47        // SAFETY: `Mmap::map` requires that the underlying file is not modified
48        // while the mapping is live. We treat the file as read-only for the
49        // entire lifetime of `self`. `Mmap` itself only exposes `&[u8]`.
50        let mmap = unsafe { Mmap::map(&file) }.map_err(|e| {
51            CandleError::Msg(format!(
52                "failed to mmap GGUF file '{}': {e}",
53                path_ref.display()
54            ))
55        })?;
56        let mut cursor = Cursor::new(&mmap[..]);
57        let content = Content::read(&mut cursor)?;
58        Ok(Self { mmap, content })
59    }
60
61    /// Raw access to candle's parsed header — for callers that need the full
62    /// `metadata` / `tensor_infos` maps. Prefer the typed accessors below.
63    pub fn content(&self) -> &Content {
64        &self.content
65    }
66
67    // ── Metadata: typed accessors ─────────────────────────────────────────
68    //
69    // GGUF metadata keys are conventionally `<scope>.<field>` strings, e.g.
70    // `general.architecture` or `qwen3.block_count`. Different model families
71    // namespace under their architecture id. `architecture()` is the one key
72    // that's always present and tells you which scope to read the rest from.
73
74    /// Architecture string, e.g. `"qwen3"`, `"llama"`. Read from
75    /// `general.architecture`. Errors if the key is missing or non-string.
76    pub fn architecture(&self) -> CandleResult<&str> {
77        self.metadata_string("general.architecture")
78    }
79
80    /// Raw metadata value lookup. Returns `None` if the key is absent.
81    pub fn metadata(&self, key: &str) -> Option<&Value> {
82        self.content.metadata.get(key)
83    }
84
85    /// Read a string-typed metadata field. Errors if missing or wrong type.
86    pub fn metadata_string(&self, key: &str) -> CandleResult<&str> {
87        self.require_metadata(key)?.to_string().map(|s| s.as_str())
88    }
89
90    /// Read a u32-typed metadata field. Errors if missing or wrong type.
91    pub fn metadata_u32(&self, key: &str) -> CandleResult<u32> {
92        self.require_metadata(key)?.to_u32()
93    }
94
95    /// Read a u64-typed metadata field. Errors if missing or wrong type.
96    pub fn metadata_u64(&self, key: &str) -> CandleResult<u64> {
97        self.require_metadata(key)?.to_u64()
98    }
99
100    /// Read an f32-typed metadata field. Errors if missing or wrong type.
101    pub fn metadata_f32(&self, key: &str) -> CandleResult<f32> {
102        self.require_metadata(key)?.to_f32()
103    }
104
105    /// Read a bool-typed metadata field. Errors if missing or wrong type.
106    pub fn metadata_bool(&self, key: &str) -> CandleResult<bool> {
107        self.require_metadata(key)?.to_bool()
108    }
109
110    fn require_metadata(&self, key: &str) -> CandleResult<&Value> {
111        self.metadata(key)
112            .ok_or_else(|| CandleError::Msg(format!("GGUF metadata key missing: '{key}'")))
113    }
114
115    // ── Tensor enumeration ────────────────────────────────────────────────
116
117    /// Total number of tensors declared in the header.
118    pub fn tensor_count(&self) -> usize {
119        self.content.tensor_infos.len()
120    }
121
122    /// Iterate over every tensor name in the file. Order is whatever the
123    /// underlying `HashMap` yields — do not rely on it being deterministic.
124    pub fn tensor_names(&self) -> impl Iterator<Item = &str> {
125        self.content.tensor_infos.keys().map(|s| s.as_str())
126    }
127
128    /// Look up a tensor descriptor (shape, dtype, byte offset) without
129    /// touching the payload. `None` if the tensor isn't in the file.
130    pub fn tensor_info(&self, name: &str) -> Option<&TensorInfo> {
131        self.content.tensor_infos.get(name)
132    }
133
134    /// Whether a tensor with `name` is declared in the header.
135    pub fn has_tensor(&self, name: &str) -> bool {
136        self.content.tensor_infos.contains_key(name)
137    }
138
139    // ── Tensor read ───────────────────────────────────────────────────────
140
141    /// Materialise a tensor as a candle `QTensor` on the target device.
142    ///
143    /// The returned tensor is **still quantized** — no dequant happens here.
144    /// Wrap it in `QMatMul::from_qtensor` for inference, or call
145    /// `QTensor::dequantize(device)` to get a fp32 `Tensor`.
146    ///
147    /// **Beware:** candle copies the bytes into an owned `Vec<u8>` (see
148    /// `TensorInfo::read`). For the steady-state weight upload path use
149    /// [`Self::tensor_byte_slice`] instead — it returns a slice directly
150    /// into the mmap with no allocation.
151    pub fn read_tensor(&self, name: &str, device: &Device) -> CandleResult<QTensor> {
152        let mut cursor = Cursor::new(&self.mmap[..]);
153        self.content.tensor(&mut cursor, name, device)
154    }
155
156    /// Whole mmap region as a byte slice. Used to wrap the file as a single
157    /// zero-copy `MTLBuffer` on Metal — the lifetime of the slice is tied to
158    /// `&self`, so the caller is expected to keep an `Arc<GgufFile>` alive
159    /// for as long as anything references the mmap.
160    pub fn mmap_bytes(&self) -> &[u8] {
161        &self.mmap[..]
162    }
163
164    /// Byte slice covering exactly tensor `name` inside the mmap. The slice
165    /// points into the file mapping, so reads are demand-paged and there is
166    /// no heap allocation. Returns `None` if the tensor isn't declared.
167    ///
168    /// The byte length is computed from the tensor's `(elem_count, ggml_dtype)`
169    /// using candle's `block_size()` / `type_size()`. For raw quant tensors
170    /// (Q4K / Q6K / etc.), these bytes are exactly what `QTensor::data()`
171    /// would return — but with no copy.
172    pub fn tensor_byte_slice(&self, name: &str) -> Option<&[u8]> {
173        let info = self.content.tensor_infos.get(name)?;
174        let elem_count = info.shape.elem_count();
175        let block_size = info.ggml_dtype.block_size();
176        if !elem_count.is_multiple_of(block_size) {
177            return None;
178        }
179        let size_in_bytes = elem_count / block_size * info.ggml_dtype.type_size();
180        let abs_start = (self.content.tensor_data_offset + info.offset) as usize;
181        let abs_end = abs_start.checked_add(size_in_bytes)?;
182        if abs_end > self.mmap.len() {
183            return None;
184        }
185        Some(&self.mmap[abs_start..abs_end])
186    }
187
188    /// `(byte_offset_in_mmap, byte_length)` for tensor `name`. Same
189    /// computation as [`Self::tensor_byte_slice`] but returns the indices
190    /// rather than the slice — useful when the caller already has the
191    /// mmap base pointer (e.g. when binding a region of a shared buffer
192    /// at a given offset).
193    pub fn tensor_byte_range(&self, name: &str) -> Option<(usize, usize)> {
194        let info = self.content.tensor_infos.get(name)?;
195        let elem_count = info.shape.elem_count();
196        let block_size = info.ggml_dtype.block_size();
197        if !elem_count.is_multiple_of(block_size) {
198            return None;
199        }
200        let size_in_bytes = elem_count / block_size * info.ggml_dtype.type_size();
201        let abs_start = (self.content.tensor_data_offset + info.offset) as usize;
202        let abs_end = abs_start.checked_add(size_in_bytes)?;
203        if abs_end > self.mmap.len() {
204            return None;
205        }
206        Some((abs_start, size_in_bytes))
207    }
208}
209
210impl std::fmt::Debug for GgufFile {
211    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
212        f.debug_struct("GgufFile")
213            .field("size_bytes", &self.mmap.len())
214            .field("metadata_keys", &self.content.metadata.len())
215            .field("tensor_count", &self.content.tensor_infos.len())
216            .field("tensor_data_offset", &self.content.tensor_data_offset)
217            .finish()
218    }
219}
ferrum_quantization/gguf/file.rs

ferrum_quantization/gguf/
file.rs