1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
//! GGUF metadata key schema — GH-322
//!
//! Central definition of all GGUF metadata keys used by realizar.
//! All GGUF key construction MUST go through this module.
//!
//! Architecture-parameterized keys follow the GGUF spec pattern:
//! `{arch}.{suffix}` where `arch` = `general.architecture` value
//!
//! Reference: <https://github.com/ggerganov/ggml/blob/master/docs/gguf.md>
// ─── Fixed keys (no architecture prefix) ─────────────────────────────────────
/// `general.architecture` — the model architecture string
pub const GENERAL_ARCHITECTURE: &str = "general.architecture";
/// `tokenizer.ggml.model` — tokenizer type (e.g. "gpt2", "llama", "bpe")
pub const TOKENIZER_MODEL: &str = "tokenizer.ggml.model";
/// `tokenizer.ggml.tokens` — vocabulary token array
pub const TOKENIZER_TOKENS: &str = "tokenizer.ggml.tokens";
/// `tokenizer.ggml.bos_token_id` — beginning-of-sentence token ID
pub const TOKENIZER_BOS_ID: &str = "tokenizer.ggml.bos_token_id";
/// `tokenizer.ggml.eos_token_id` — end-of-sentence token ID
pub const TOKENIZER_EOS_ID: &str = "tokenizer.ggml.eos_token_id";
/// `tokenizer.ggml.add_bos_token` — whether to prepend BOS token
pub const TOKENIZER_ADD_BOS: &str = "tokenizer.ggml.add_bos_token";
/// `tokenizer.ggml.vocab_size` — vocabulary size (alternative location)
pub const TOKENIZER_VOCAB_SIZE: &str = "tokenizer.ggml.vocab_size";
/// `tokenizer.ggml.tokens.size` — vocabulary size (test factory location)
pub const TOKENIZER_TOKENS_SIZE: &str = "tokenizer.ggml.tokens.size";
// ─── Architecture-parameterized key suffixes ─────────────────────────────────
//
// Use with `arch_key(arch, SUFFIX)` to get the full `{arch}.{suffix}` key.
/// `{arch}.embedding_length` — hidden dimension / embedding size
pub const EMBEDDING_LENGTH: &str = "embedding_length";
/// `{arch}.block_count` — number of transformer layers
pub const BLOCK_COUNT: &str = "block_count";
/// `{arch}.attention.head_count` — number of attention heads
pub const ATTENTION_HEAD_COUNT: &str = "attention.head_count";
/// `{arch}.attention.head_count_kv` — number of key-value heads (GQA)
pub const ATTENTION_HEAD_COUNT_KV: &str = "attention.head_count_kv";
/// `{arch}.attention.key_length` — per-head key dimension
pub const ATTENTION_KEY_LENGTH: &str = "attention.key_length";
/// `{arch}.attention.value_length` — per-head value dimension
pub const ATTENTION_VALUE_LENGTH: &str = "attention.value_length";
/// `{arch}.attention.layer_norm_rms_epsilon` — RMSNorm epsilon
pub const ATTENTION_LAYER_NORM_RMS_EPSILON: &str = "attention.layer_norm_rms_epsilon";
/// `{arch}.attention.layer_norm_epsilon` — LayerNorm epsilon (GPT-2/Phi-2)
pub const ATTENTION_LAYER_NORM_EPSILON: &str = "attention.layer_norm_epsilon";
/// `{arch}.context_length` — maximum sequence length
pub const CONTEXT_LENGTH: &str = "context_length";
/// `{arch}.rope.freq_base` — RoPE frequency base (LLaMA: 10000, Qwen2: 1000000)
pub const ROPE_FREQ_BASE: &str = "rope.freq_base";
/// `{arch}.rope.scaling.type` — RoPE scaling type (none, linear, yarn, neox)
pub const ROPE_SCALING_TYPE: &str = "rope.scaling.type";
/// `{arch}.feed_forward_length` — FFN intermediate dimension
pub const FEED_FORWARD_LENGTH: &str = "feed_forward_length";
/// `{arch}.vocab_size` — vocabulary size (architecture-specific location)
pub const VOCAB_SIZE: &str = "vocab_size";
// ─── Key construction ────────────────────────────────────────────────────────
/// Construct an architecture-parameterized GGUF metadata key.
///
/// Returns `"{arch}.{suffix}"` where `suffix` is one of the constants above.
///
/// # Examples
///
/// ```rust,ignore
/// use realizar::gguf::keys;
/// let key = keys::arch_key("llama", keys::EMBEDDING_LENGTH);
/// assert_eq!(key, "llama.embedding_length");
/// ```
pub