ferrum_quantization/gguf/
names.rs

1//! GGUF ↔ ferrum tensor-name translation.
2//!
3//! Ferrum models address weights using HuggingFace-style names
4//! (`model.layers.0.self_attn.q_proj.weight`). GGUF files use llama.cpp's
5//! shorthand (`blk.0.attn_q.weight`). This module is the single source of
6//! truth for that mapping; both `GgufLoader` and any future tooling go
7//! through `ferrum_to_gguf`.
8//!
9//! Scope: dense Llama-family models (Qwen3, Qwen2.x, Llama-3.x, Mistral,
10//! TinyLlama) and Qwen-style MoE families (Qwen3-MoE, Mixtral, DeepSeek-V2 —
11//! they all use the same GGUF layout: per-layer router `ffn_gate_inp` plus
12//! three stacked-expert tensors `ffn_{gate,up,down}_exps` with shape
13//! `[num_experts, ...]`).
14//!
15//! ## ferrum-side naming convention for MoE tensors
16//!
17//! ferrum mirrors GGUF's stacked layout rather than HuggingFace's
18//! `experts.{e}.gate_proj` per-expert layout. Reasons:
19//!   1. The stacked form is what candle's `QMatMul::indexed_moe_forward`
20//!      expects — slicing per-expert is a runtime concern, not a
21//!      storage concern.
22//!   2. Loading per-expert from GGUF would require N reads + concat per
23//!      layer (the dense path's qkv-fusion shim works the other direction
24//!      and only does 3, not N=128).
25//!   3. If a future safetensors-MoE loader needs to consume per-expert
26//!      tensors, it can do its own concat just like the dense Qwen2.5
27//!      path concatenates q/k/v.
28
29/// Translate a ferrum tensor name to its GGUF equivalent.
30///
31/// Returns `None` for names that have no GGUF counterpart (yet) or aren't
32/// recognised — caller treats this as "tensor not found".
33///
34/// Accepts both bare stems (`"lm_head"`, `"model.layers.0.self_attn.o_proj"`)
35/// and fully-qualified names (`"...weight"`, `"...bias"`). The `.weight` /
36/// `.bias` suffix passes through unchanged.
37pub fn ferrum_to_gguf(name: &str) -> Option<String> {
38    // Top-level tensors first — they don't fit the layer pattern.
39    if let Some(out) = map_top_level(name) {
40        return Some(out);
41    }
42
43    // Layer-scoped: must be "model.layers.{idx}.<rest>"
44    let rest = name.strip_prefix("model.layers.")?;
45    let (idx_str, after_idx) = rest.split_once('.')?;
46    let idx: usize = idx_str.parse().ok()?;
47    let mapped = map_layer_scoped(after_idx)?;
48    Some(format!("blk.{idx}.{mapped}"))
49}
50
/// Look up a tensor that lives outside any transformer layer.
///
/// Only exact matches are recognised: each supported tensor is listed both as
/// a bare stem and with its `.weight` suffix. Anything else yields `None`.
fn map_top_level(name: &str) -> Option<String> {
    // (ferrum name, GGUF name) pairs; the match is exact, no prefix logic.
    const TOP_LEVEL: [(&str, &str); 6] = [
        ("model.embed_tokens", "token_embd"),
        ("model.embed_tokens.weight", "token_embd.weight"),
        ("model.norm", "output_norm"),
        ("model.norm.weight", "output_norm.weight"),
        ("lm_head", "output"),
        ("lm_head.weight", "output.weight"),
    ];

    TOP_LEVEL
        .iter()
        .find(|(ferrum, _)| *ferrum == name)
        .map(|(_, gguf)| (*gguf).to_owned())
}
63
/// Map the part of a tensor name that follows `model.layers.{idx}.` to the
/// part that follows `blk.{idx}.` on the GGUF side.
///
/// A trailing `.weight` or `.bias` is detached, the remaining stem is looked
/// up in the table below, and the suffix is re-attached verbatim. Returns
/// `None` when the stem is not in the table.
fn map_layer_scoped(rest: &str) -> Option<String> {
    // (ferrum stem, GGUF stem) pairs.
    const STEMS: &[(&str, &str)] = &[
        // RMSNorms
        ("input_layernorm", "attn_norm"),
        ("post_attention_layernorm", "ffn_norm"),
        // Attention projections
        ("self_attn.q_proj", "attn_q"),
        ("self_attn.k_proj", "attn_k"),
        ("self_attn.v_proj", "attn_v"),
        ("self_attn.o_proj", "attn_output"),
        // Qwen3 QK-norm — only present on that family
        ("self_attn.q_norm", "attn_q_norm"),
        ("self_attn.k_norm", "attn_k_norm"),
        // Dense MLP projections
        ("mlp.gate_proj", "ffn_gate"),
        ("mlp.up_proj", "ffn_up"),
        ("mlp.down_proj", "ffn_down"),
        // MoE: router (gating) + stacked expert weights. Shape conventions
        // recorded alongside this table:
        //   router:    [hidden_size, num_experts]
        //   gate_exps: [num_experts, expert_intermediate, hidden_size]
        //   up_exps:   [num_experts, expert_intermediate, hidden_size]
        //   down_exps: [num_experts, hidden_size, expert_intermediate]
        // NOTE(review): the loader is said to read these as flat fp32 buffers
        // and slice per-expert at forward time — confirm against the loader.
        ("mlp.router", "ffn_gate_inp"),
        ("mlp.gate_exps", "ffn_gate_exps"),
        ("mlp.up_exps", "ffn_up_exps"),
        ("mlp.down_exps", "ffn_down_exps"),
    ];

    // Split off a recognised suffix, if any; otherwise treat the whole input
    // as a bare stem with an empty suffix.
    let (stem, suffix) = [".weight", ".bias"]
        .iter()
        .find_map(|&sfx| rest.strip_suffix(sfx).map(|head| (head, sfx)))
        .unwrap_or((rest, ""));

    STEMS
        .iter()
        .find(|(ferrum, _)| *ferrum == stem)
        .map(|(_, gguf)| format!("{gguf}{suffix}"))
}
106
/// Names of the three attention projections that are fused into `qkv_proj`.
///
/// The result is ordered q, k, v — the order in which the model expects them
/// stacked along axis 0 (rows = output neurons). `layer_prefix` is prepended
/// verbatim, so it must already carry its trailing separator
/// (e.g. `"model.layers.3."`).
pub fn qkv_split_parts(layer_prefix: &str) -> [String; 3] {
    ["q_proj", "k_proj", "v_proj"].map(|proj| format!("{layer_prefix}self_attn.{proj}"))
}
116
/// Names of the two MLP projections that are fused into `gate_up_proj`.
///
/// The result is ordered gate first, then up — the stacking order along
/// axis 0. `layer_prefix` is prepended verbatim, so it must already carry its
/// trailing separator (e.g. `"model.layers.3."`).
pub fn gate_up_split_parts(layer_prefix: &str) -> [String; 2] {
    ["gate_proj", "up_proj"].map(|proj| format!("{layer_prefix}mlp.{proj}"))
}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(ferrum name, GGUF name)` pair translates as given.
    fn assert_maps(cases: &[(&str, &str)]) {
        for &(ferrum, gguf) in cases {
            assert_eq!(
                ferrum_to_gguf(ferrum),
                Some(gguf.to_string()),
                "translating {:?}",
                ferrum
            );
        }
    }

    /// Assert that none of the given names has a GGUF translation.
    fn assert_unmapped(names: &[&str]) {
        for &name in names {
            assert_eq!(ferrum_to_gguf(name), None, "translating {:?}", name);
        }
    }

    #[test]
    fn maps_top_level_tensors() {
        assert_maps(&[
            ("model.embed_tokens.weight", "token_embd.weight"),
            ("model.embed_tokens", "token_embd"),
            ("model.norm.weight", "output_norm.weight"),
            ("lm_head", "output"),
            ("lm_head.weight", "output.weight"),
        ]);
    }

    #[test]
    fn maps_layer_attention_weights() {
        assert_maps(&[
            ("model.layers.0.self_attn.q_proj.weight", "blk.0.attn_q.weight"),
            ("model.layers.27.self_attn.k_proj.weight", "blk.27.attn_k.weight"),
            ("model.layers.5.self_attn.v_proj.weight", "blk.5.attn_v.weight"),
            ("model.layers.0.self_attn.o_proj.weight", "blk.0.attn_output.weight"),
            // bare stem (load_linear-style)
            ("model.layers.0.self_attn.o_proj", "blk.0.attn_output"),
        ]);
    }

    #[test]
    fn maps_qwen3_qk_norm() {
        assert_maps(&[
            ("model.layers.0.self_attn.q_norm.weight", "blk.0.attn_q_norm.weight"),
            ("model.layers.0.self_attn.k_norm.weight", "blk.0.attn_k_norm.weight"),
        ]);
    }

    #[test]
    fn maps_attention_bias() {
        assert_maps(&[("model.layers.0.self_attn.q_proj.bias", "blk.0.attn_q.bias")]);
    }

    #[test]
    fn maps_layer_norms() {
        assert_maps(&[
            ("model.layers.0.input_layernorm.weight", "blk.0.attn_norm.weight"),
            (
                "model.layers.0.post_attention_layernorm.weight",
                "blk.0.ffn_norm.weight",
            ),
        ]);
    }

    #[test]
    fn maps_mlp_projections() {
        assert_maps(&[
            ("model.layers.0.mlp.gate_proj.weight", "blk.0.ffn_gate.weight"),
            ("model.layers.0.mlp.up_proj.weight", "blk.0.ffn_up.weight"),
            ("model.layers.0.mlp.down_proj.weight", "blk.0.ffn_down.weight"),
        ]);
    }

    #[test]
    fn maps_moe_router_and_stacked_experts() {
        assert_maps(&[
            // Router (2-D, [hidden, num_experts])
            ("model.layers.0.mlp.router.weight", "blk.0.ffn_gate_inp.weight"),
            // Stacked expert weights (3-D, [num_experts, ...])
            ("model.layers.0.mlp.gate_exps.weight", "blk.0.ffn_gate_exps.weight"),
            ("model.layers.27.mlp.up_exps.weight", "blk.27.ffn_up_exps.weight"),
            ("model.layers.0.mlp.down_exps.weight", "blk.0.ffn_down_exps.weight"),
            // Bare stems (load_linear-style for 2-D router)
            ("model.layers.0.mlp.router", "blk.0.ffn_gate_inp"),
        ]);
    }

    #[test]
    fn rejects_unknown_names() {
        assert_unmapped(&[
            "totally_made_up",
            "model.layers.0.unknown_part.weight",
            "model.layers.bad_idx.input_layernorm.weight",
            // HF-style per-expert names are NOT supported (deliberately —
            // the loader expects stacked names).
            "model.layers.0.mlp.experts.0.gate_proj.weight",
        ]);
    }

    #[test]
    fn split_parts_helpers() {
        let expected_qkv = [
            "model.layers.3.self_attn.q_proj",
            "model.layers.3.self_attn.k_proj",
            "model.layers.3.self_attn.v_proj",
        ]
        .map(String::from);
        assert_eq!(qkv_split_parts("model.layers.3."), expected_qkv);

        let expected_gate_up =
            ["model.layers.3.mlp.gate_proj", "model.layers.3.mlp.up_proj"].map(String::from);
        assert_eq!(gate_up_split_parts("model.layers.3."), expected_gate_up);
    }
}
ferrum_quantization/gguf/names.rs

ferrum_quantization/gguf/
names.rs