ferrum_quantization/gguf/
mod.rs

1//! GGUF (GGML Universal Format) reader.
2//!
3//! Phase 1A scope: parse the file header, expose metadata + tensor descriptors,
4//! and load individual tensors as candle `QTensor`s (which already handle
5//! dequantization for every K-quant variant on CPU / Metal / CUDA).
6//!
7//! Phase 1A added the `GgufFile` reader. Phase 1B added `GgufLinear<B>`.
8//! Phase 1C (this commit) adds `GgufLoader<B>`, which implements
9//! `WeightLoader<B>` against ferrum's HuggingFace-style tensor names by
10//! translating them to GGUF's `blk.{i}.attn_q.weight` shorthand and handling
11//! `qkv_proj` / `gate_up_proj` fusion on the fly.
12//!
13//! ## Why wrap candle instead of writing a parser from scratch
14//!
15//! `candle_core::quantized::gguf_file::Content` already implements the full
16//! GGUF v1/v2/v3 spec, including all current GGML K-quant variants and
17//! Metal/CUDA/CPU dequant kernels. Re-implementing that for ferrum would be
18//! 3-5 weeks of work duplicating well-tested code. Instead this module
19//! provides a small adapter that:
20//!
21//!   1. Adds an `mmap`-backed `open(path)` constructor (candle's API takes
22//!      a generic `Read + Seek` and pushes file handling to the caller).
23//!   2. Provides typed metadata accessors keyed by string (`metadata_string`,
24//!      `metadata_u32`, …) so callers don't pattern-match on `Value` everywhere.
25//!   3. Documents the GGUF metadata key conventions ferrum relies on
26//!      (`general.architecture`, `<arch>.block_count`, …) in one place.
27
28pub mod file;
29pub mod linear;
30pub mod loader;
31pub mod names;
32
33pub use file::GgufFile;
34pub use linear::{linear_from_qtensor, GgufLinear};
35pub use loader::GgufLoader;
36pub use names::{ferrum_to_gguf, gate_up_split_parts, qkv_split_parts};
37
38// Re-exports — callers can import these from `ferrum_quantization::gguf` rather
39// than reaching into `candle_core::quantized::*` directly. Keeps the dep
40// surface explicit and lets us swap in a native parser later without churning
41// downstream call sites.
42pub use candle_core::quantized::gguf_file::{TensorInfo, Value, ValueType};
43pub use candle_core::quantized::{GgmlDType, QTensor};
ferrum_quantization/gguf/mod.rs

ferrum_quantization/gguf/
mod.rs