// model_artifact/gguf.rs

1use std::io::{Read, Seek, SeekFrom};
2use std::path::Path;
3
// Parser safety limits. Lengths and counts read from a GGUF file are rejected
// as invalid data when they exceed these bounds.

// Upper bound on the byte length of a single GGUF string.
4const MAX_GGUF_STRING_BYTES: u64 = 1_000_000;
// Upper bound on the element count of a single GGUF array value.
5const MAX_GGUF_ARRAY_ELEMENTS: u64 = 1_000_000;
// Upper bound on array nesting depth while a value is being skipped.
6const MAX_GGUF_ARRAY_DEPTH: u32 = 64;
// Upper bound on the number of dimensions in one tensor-info record.
7const MAX_GGUF_TENSOR_DIMS: u32 = 8;
// Upper bound on the header's key/value entry count.
8const MAX_GGUF_HEADER_KV_COUNT: usize = 1_000_000;
// Upper bound on the header's tensor count.
9const MAX_GGUF_TENSOR_COUNT: usize = 1_000_000;
10
/// Type tags carried by GGUF metadata values (numbering matches the gguf.h enum).
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq)]
enum GgufType {
    Uint8 = 0,
    Int8 = 1,
    Uint16 = 2,
    Int16 = 3,
    Uint32 = 4,
    Int32 = 5,
    Float32 = 6,
    Bool = 7,
    String = 8,
    Array = 9,
    Uint64 = 10,
    Int64 = 11,
    Float64 = 12,
}

impl GgufType {
    /// Converts a raw type tag into its variant.
    ///
    /// Returns `None` for any tag outside `0..=12`.
    fn from_u32(v: u32) -> Option<Self> {
        const ALL: [GgufType; 13] = [
            GgufType::Uint8,
            GgufType::Int8,
            GgufType::Uint16,
            GgufType::Int16,
            GgufType::Uint32,
            GgufType::Int32,
            GgufType::Float32,
            GgufType::Bool,
            GgufType::String,
            GgufType::Array,
            GgufType::Uint64,
            GgufType::Int64,
            GgufType::Float64,
        ];
        ALL.iter().copied().find(|candidate| *candidate as u32 == v)
    }

    /// Encoded width in bytes of a scalar value of this type.
    ///
    /// Returns `None` for `String` and `Array`, whose length depends on their contents.
    fn fixed_size(self) -> Option<usize> {
        let width = match self {
            Self::String | Self::Array => return None,
            Self::Uint8 | Self::Int8 | Self::Bool => 1,
            Self::Uint16 | Self::Int16 => 2,
            Self::Uint32 | Self::Int32 | Self::Float32 => 4,
            Self::Uint64 | Self::Int64 | Self::Float64 => 8,
        };
        Some(width)
    }
}
60
/// Reads the next four bytes of `f` and decodes them as a little-endian `u32`.
///
/// Fails with the underlying I/O error when four bytes cannot be read.
fn read_u32(f: &mut std::fs::File) -> std::io::Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    f.read_exact(&mut raw).map(|()| u32::from_le_bytes(raw))
}
66
/// Reads the next eight bytes of `f` and decodes them as a little-endian `u64`.
///
/// Fails with the underlying I/O error when eight bytes cannot be read.
fn read_u64(f: &mut std::fs::File) -> std::io::Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    f.read_exact(&mut raw).map(|()| u64::from_le_bytes(raw))
}
72
/// Reads the next four bytes of `f` and decodes them as a little-endian `i32`.
///
/// Fails with the underlying I/O error when four bytes cannot be read.
fn read_i32(f: &mut std::fs::File) -> std::io::Result<i32> {
    let mut raw = [0u8; std::mem::size_of::<i32>()];
    f.read_exact(&mut raw).map(|()| i32::from_le_bytes(raw))
}
78
/// Reads the next eight bytes of `f` and decodes them as a little-endian `i64`.
///
/// Fails with the underlying I/O error when eight bytes cannot be read.
fn read_i64(f: &mut std::fs::File) -> std::io::Result<i64> {
    let mut raw = [0u8; std::mem::size_of::<i64>()];
    f.read_exact(&mut raw).map(|()| i64::from_le_bytes(raw))
}
84
85fn read_gguf_header_count(
86    f: &mut std::fs::File,
87    max: usize,
88    label: &str,
89) -> std::io::Result<usize> {
90    let value = read_i64(f)?;
91    let count = usize::try_from(value).map_err(|_| {
92        std::io::Error::new(std::io::ErrorKind::InvalidData, format!("negative {label}"))
93    })?;
94    if count > max {
95        return Err(std::io::Error::new(
96            std::io::ErrorKind::InvalidData,
97            format!("{label} too large"),
98        ));
99    }
100    Ok(count)
101}
102
103fn read_bounded_len(f: &mut std::fs::File, max: u64, label: &str) -> std::io::Result<usize> {
104    let len = read_u64(f)?;
105    if len > max {
106        return Err(std::io::Error::new(
107            std::io::ErrorKind::InvalidData,
108            format!("{label} too long"),
109        ));
110    }
111    usize::try_from(len).map_err(|_| {
112        std::io::Error::new(
113            std::io::ErrorKind::InvalidData,
114            format!("{label} too large"),
115        )
116    })
117}
118
119fn read_gguf_string(f: &mut std::fs::File) -> std::io::Result<String> {
120    let len = read_bounded_len(f, MAX_GGUF_STRING_BYTES, "string")?;
121    let mut buf = vec![0u8; len];
122    f.read_exact(&mut buf)?;
123    String::from_utf8(buf).map_err(|_| {
124        std::io::Error::new(
125            std::io::ErrorKind::InvalidData,
126            "invalid UTF-8 in GGUF string",
127        )
128    })
129}
130
131fn skip_gguf_value(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<()> {
132    skip_gguf_value_with_depth(f, typ, 0)
133}
134
135fn skip_gguf_value_with_depth(
136    f: &mut std::fs::File,
137    typ: GgufType,
138    depth: u32,
139) -> std::io::Result<()> {
140    match typ {
141        GgufType::String => {
142            let _ = read_gguf_string(f)?;
143        }
144        GgufType::Array => {
145            if depth >= MAX_GGUF_ARRAY_DEPTH {
146                return Err(std::io::Error::new(
147                    std::io::ErrorKind::InvalidData,
148                    "GGUF nesting too deep",
149                ));
150            }
151            let elem_type = GgufType::from_u32(read_u32(f)?).ok_or_else(|| {
152                std::io::Error::new(std::io::ErrorKind::InvalidData, "bad array type")
153            })?;
154            let count = read_bounded_len(f, MAX_GGUF_ARRAY_ELEMENTS, "array")?;
155            for _ in 0..count {
156                skip_gguf_value_with_depth(f, elem_type, depth + 1)?;
157            }
158        }
159        other => {
160            let size = other.fixed_size().unwrap_or(0);
161            f.seek(SeekFrom::Current(size as i64))?;
162        }
163    }
164    Ok(())
165}
166
167fn read_gguf_value_as_u32(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<Option<u32>> {
168    match typ {
169        GgufType::Uint32 => Ok(Some(read_u32(f)?)),
170        GgufType::Int32 => {
171            let value = read_i32(f)?;
172            let value = u32::try_from(value).map_err(|_| {
173                std::io::Error::new(
174                    std::io::ErrorKind::InvalidData,
175                    "negative Int32 where unsigned GGUF value was expected",
176                )
177            })?;
178            Ok(Some(value))
179        }
180        GgufType::Uint16 => {
181            let mut buf = [0u8; 2];
182            f.read_exact(&mut buf)?;
183            Ok(Some(u16::from_le_bytes(buf) as u32))
184        }
185        GgufType::Uint8 => {
186            let mut buf = [0u8; 1];
187            f.read_exact(&mut buf)?;
188            Ok(Some(buf[0] as u32))
189        }
190        _ => {
191            skip_gguf_value(f, typ)?;
192            Ok(None)
193        }
194    }
195}
196
197fn read_gguf_value_as_f32(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<Option<f32>> {
198    match typ {
199        GgufType::Float32 => {
200            let mut buf = [0u8; 4];
201            f.read_exact(&mut buf)?;
202            Ok(Some(f32::from_le_bytes(buf)))
203        }
204        _ => {
205            skip_gguf_value(f, typ)?;
206            Ok(None)
207        }
208    }
209}
210
211fn read_gguf_value_as_string_opt(
212    f: &mut std::fs::File,
213    typ: GgufType,
214) -> std::io::Result<Option<String>> {
215    match typ {
216        GgufType::String => Ok(Some(read_gguf_string(f)?)),
217        _ => {
218            skip_gguf_value(f, typ)?;
219            Ok(None)
220        }
221    }
222}
223
224#[derive(Clone, Debug, Default)]
225pub struct GgufCompactMeta {
226    pub architecture: String,
227    pub context_length: u32,
228    pub vocab_size: u32,
229    pub embedding_size: u32,
230    pub head_count: u32,
231    pub kv_head_count: u32,
232    pub layer_count: u32,
233    pub feed_forward_length: u32,
234    pub key_length: u32,
235    pub value_length: u32,
236    pub tokenizer_model_name: String,
237    pub rope_scale: f32,
238    pub rope_freq_base: f32,
239    pub expert_count: u32,
240    pub expert_used_count: u32,
241}
242
243impl GgufCompactMeta {
244    pub fn effective_kv_head_count(&self) -> Option<u32> {
245        if self.kv_head_count > 0 {
246            Some(self.kv_head_count)
247        } else if self.head_count > 0 {
248            Some(self.head_count)
249        } else {
250            None
251        }
252    }
253
254    pub fn k_cache_bytes_per_token_f16(&self) -> Option<u64> {
255        GgufKvCacheQuant::f16().k_cache_bytes_per_token(self)
256    }
257
258    pub fn v_cache_bytes_per_token_f16(&self) -> Option<u64> {
259        GgufKvCacheQuant::f16().v_cache_bytes_per_token(self)
260    }
261
262    pub fn kv_cache_bytes_per_token_f16(&self) -> Option<u64> {
263        GgufKvCacheQuant::f16().kv_cache_bytes_per_token(self)
264    }
265}
266
/// Storage format of one half (K or V) of the KV cache.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GgufKvCacheType {
    F16,
    Q8_0,
    Q4_0,
}

impl GgufKvCacheType {
    /// Parses a cache-type argument string, ignoring ASCII case.
    ///
    /// Returns `None` for anything other than `f16`, `q8_0` or `q4_0`.
    pub fn from_llama_arg(value: &str) -> Option<Self> {
        [Self::F16, Self::Q8_0, Self::Q4_0]
            .iter()
            .copied()
            .find(|kind| value.eq_ignore_ascii_case(kind.as_llama_arg()))
    }

    /// Lower-case argument spelling of this cache type.
    pub const fn as_llama_arg(self) -> &'static str {
        match self {
            Self::Q4_0 => "q4_0",
            Self::Q8_0 => "q8_0",
            Self::F16 => "f16",
        }
    }

    /// Returns `(elements per block, bytes per block)` for this type.
    fn block_shape(self) -> (u64, u64) {
        let (elements, bytes) = match self {
            Self::F16 => (1, 2),
            Self::Q8_0 => (32, 34),
            Self::Q4_0 => (32, 18),
        };
        (elements, bytes)
    }

    /// Bytes needed to store `elements` values, rounding up to whole blocks.
    ///
    /// Returns `None` when an intermediate step overflows `u64`.
    fn bytes_for_elements(self, elements: u64) -> Option<u64> {
        let (per_block, block_bytes) = self.block_shape();
        let round_up = per_block.checked_sub(1)?;
        let padded = elements.checked_add(round_up)?;
        let block_count = padded.checked_div(per_block)?;
        block_count.checked_mul(block_bytes)
    }
}
308
309#[derive(Clone, Copy, Debug, Eq, PartialEq)]
310pub struct GgufKvCacheQuant {
311    pub k: GgufKvCacheType,
312    pub v: GgufKvCacheType,
313}
314
315impl GgufKvCacheQuant {
316    /// f16 K + f16 V — highest quality, largest KV cache.
317    pub const F16: Self = Self {
318        k: GgufKvCacheType::F16,
319        v: GgufKvCacheType::F16,
320    };
321
322    /// q8_0 K + q8_0 V — moderate compression.
323    pub const Q8_0: Self = Self {
324        k: GgufKvCacheType::Q8_0,
325        v: GgufKvCacheType::Q8_0,
326    };
327
328    /// q4_0 K + q4_0 V — most aggressive compression, smallest KV cache.
329    pub const Q4_0: Self = Self {
330        k: GgufKvCacheType::Q4_0,
331        v: GgufKvCacheType::Q4_0,
332    };
333
334    pub const fn new(k: GgufKvCacheType, v: GgufKvCacheType) -> Self {
335        Self { k, v }
336    }
337
338    pub const fn f16() -> Self {
339        Self::F16
340    }
341
342    /// Returns `true` if `self` uses more aggressive (smaller) quantisation
343    /// than `other`.
344    pub const fn is_more_aggressive_than(self, other: Self) -> bool {
345        Self::aggressiveness(self) > Self::aggressiveness(other)
346    }
347
348    const fn aggressiveness(q: Self) -> u8 {
349        Self::type_aggressiveness(q.k) + Self::type_aggressiveness(q.v)
350    }
351
352    const fn type_aggressiveness(t: GgufKvCacheType) -> u8 {
353        match t {
354            GgufKvCacheType::F16 => 0,
355            GgufKvCacheType::Q8_0 => 1,
356            GgufKvCacheType::Q4_0 => 2,
357        }
358    }
359
360    pub fn from_llama_args(cache_type_k: &str, cache_type_v: &str) -> Option<Self> {
361        Some(Self {
362            k: GgufKvCacheType::from_llama_arg(cache_type_k)?,
363            v: GgufKvCacheType::from_llama_arg(cache_type_v)?,
364        })
365    }
366
367    pub fn k_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
368        cache_bytes_per_token(meta, meta.key_length, self.k)
369    }
370
371    pub fn v_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
372        cache_bytes_per_token(meta, meta.value_length, self.v)
373    }
374
375    pub fn kv_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
376        self.k_cache_bytes_per_token(meta)?
377            .checked_add(self.v_cache_bytes_per_token(meta)?)
378    }
379}
380
381fn cache_bytes_per_token(
382    meta: &GgufCompactMeta,
383    vector_length: u32,
384    cache_type: GgufKvCacheType,
385) -> Option<u64> {
386    let kv_heads = u64::from(meta.effective_kv_head_count()?);
387    let vector_length = u64::from((vector_length > 0).then_some(vector_length)?);
388    let layers = u64::from((meta.layer_count > 0).then_some(meta.layer_count)?);
389    let elements_per_layer = kv_heads.checked_mul(vector_length)?;
390    cache_type
391        .bytes_for_elements(elements_per_layer)?
392        .checked_mul(layers)
393}
394
// Byte breakdown of a GGUF file, as produced by `scan_gguf_tensor_byte_profile`.
395#[derive(Clone, Debug, Default, Eq, PartialEq)]
396pub struct GgufTensorByteProfile {
    // Value of the metadata key ending in `.expert_count` (0 when absent).
397    pub expert_count: u32,
    // Value of the metadata key ending in `.expert_used_count` (0 when absent).
398    pub expert_used_count: u32,
    // Length of the whole file in bytes.
399    pub full_model_bytes: u64,
    // Bytes of tensors whose names are not classified as expert-partitioned.
400    pub base_resident_bytes: u64,
    // Bytes of tensors whose names are classified as expert-partitioned.
401    pub expert_tensor_bytes: u64,
    // File length minus the two tensor byte totals above.
402    pub file_overhead_bytes: u64,
403}
404
// The parts of one tensor-info record that the byte profile needs.
405#[derive(Clone, Debug)]
406struct GgufTensorInfo {
    // Tensor name as stored in the file.
407    name: String,
    // Offset read from the record; the profile scan compares it against the
    // length of the data section that follows the aligned tensor-info table.
408    offset: u64,
409}
410
411/// Scan a GGUF file header and return compact structural metadata.
412/// Reads only the KV section, never tensor data. Returns None on any parse failure.
413pub fn scan_gguf_compact_meta(path: &Path) -> Option<GgufCompactMeta> {
414    let mut f = std::fs::File::open(path).ok()?;
415
416    let mut magic = [0u8; 4];
417    f.read_exact(&mut magic).ok()?;
418    if &magic != b"GGUF" {
419        return None;
420    }
421    let version = read_u32(&mut f).ok()?;
422    if version < 2 {
423        return None;
424    }
425    let _n_tensors = read_gguf_header_count(&mut f, MAX_GGUF_TENSOR_COUNT, "tensor count").ok()?;
426    let n_kv = read_gguf_header_count(&mut f, MAX_GGUF_HEADER_KV_COUNT, "KV count").ok()?;
427
428    let mut meta = GgufCompactMeta::default();
429    for _ in 0..n_kv {
430        let key = read_gguf_string(&mut f).ok()?;
431        let vtype = GgufType::from_u32(read_u32(&mut f).ok()?)?;
432
433        if key == "general.architecture" {
434            meta.architecture = read_gguf_value_as_string_opt(&mut f, vtype).ok()??;
435        } else if key == "tokenizer.ggml.model" {
436            meta.tokenizer_model_name = read_gguf_value_as_string_opt(&mut f, vtype).ok()??;
437        } else if key.ends_with(".context_length") {
438            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
439                meta.context_length = v;
440            }
441        } else if key.ends_with(".embedding_length") {
442            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
443                meta.embedding_size = v;
444            }
445        } else if key.ends_with(".head_count") && !key.ends_with("_kv") {
446            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
447                meta.head_count = v;
448            }
449        } else if key.ends_with(".attention.head_count_kv") {
450            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
451                meta.kv_head_count = v;
452            }
453        } else if key.ends_with(".block_count") {
454            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
455                meta.layer_count = v;
456            }
457        } else if key.ends_with(".feed_forward_length") {
458            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
459                meta.feed_forward_length = v;
460            }
461        } else if key.ends_with(".attention.key_length") {
462            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
463                meta.key_length = v;
464            }
465        } else if key.ends_with(".attention.value_length") {
466            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
467                meta.value_length = v;
468            }
469        } else if key.ends_with(".rope.scale") {
470            if let Ok(Some(v)) = read_gguf_value_as_f32(&mut f, vtype) {
471                meta.rope_scale = v;
472            }
473        } else if key.ends_with(".rope.freq_base") {
474            if let Ok(Some(v)) = read_gguf_value_as_f32(&mut f, vtype) {
475                meta.rope_freq_base = v;
476            }
477        } else if key.ends_with(".vocab_size") {
478            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
479                meta.vocab_size = v;
480            }
481        } else if key.ends_with(".expert_count") {
482            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
483                meta.expert_count = v;
484            }
485        } else if key.ends_with(".expert_used_count") {
486            if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
487                meta.expert_used_count = v;
488            }
489        } else {
490            skip_gguf_value(&mut f, vtype).ok()?;
491        }
492    }
493
494    if meta.key_length == 0
495        && meta.head_count > 0
496        && let Some(key_length) = meta.embedding_size.checked_div(meta.head_count)
497    {
498        meta.key_length = key_length;
499    }
500    if meta.value_length == 0
501        && let Some(effective_kv) = meta.effective_kv_head_count()
502        && let Some(value_length) = meta.embedding_size.checked_div(effective_kv)
503    {
504        meta.value_length = value_length;
505    }
506
507    Some(meta)
508}
509
/// Rounds `value` up to the next multiple of `alignment`.
///
/// An `alignment` of zero is treated as one, so `value` is returned unchanged.
fn align_offset(value: u64, alignment: u32) -> u64 {
    let step = u64::from(std::cmp::max(alignment, 1));
    match value % step {
        0 => value,
        excess => value + (step - excess),
    }
}
519
520fn read_tensor_infos(
521    f: &mut std::fs::File,
522    n_tensors: usize,
523) -> std::io::Result<Vec<GgufTensorInfo>> {
524    let mut tensors = Vec::new();
525    tensors.try_reserve(n_tensors).map_err(|_| {
526        std::io::Error::new(
527            std::io::ErrorKind::InvalidData,
528            "GGUF tensor count requires too much memory",
529        )
530    })?;
531    for _ in 0..n_tensors {
532        let name = read_gguf_string(f)?;
533        let n_dims = read_u32(f)?;
534        if n_dims > MAX_GGUF_TENSOR_DIMS {
535            return Err(std::io::Error::new(
536                std::io::ErrorKind::InvalidData,
537                "too many GGUF tensor dimensions",
538            ));
539        }
540        for _ in 0..n_dims {
541            let _ = read_u64(f)?;
542        }
543        let _ = read_u32(f)?;
544        let offset = read_u64(f)?;
545        tensors.push(GgufTensorInfo { name, offset });
546    }
547    Ok(tensors)
548}
549
/// Decides from a tensor's name whether it counts as expert-partitioned.
///
/// Matching ignores ASCII case. A name containing a shared-expert marker is
/// never expert-partitioned; otherwise it is when it contains any expert marker.
fn is_expert_partitioned_tensor(name: &str) -> bool {
    const SHARED_MARKERS: [&str; 3] = ["shared_expert", "sharedexpert", "shexp"];
    const EXPERT_MARKERS: [&str; 6] = [
        "ffn_gate_exps",
        "ffn_up_exps",
        "ffn_down_exps",
        "exp_probs",
        ".expert",
        "_expert",
    ];

    fn contains_any(haystack: &str, markers: &[&str]) -> bool {
        markers.iter().any(|marker| haystack.contains(*marker))
    }

    let lowered = name.to_ascii_lowercase();
    !contains_any(&lowered, &SHARED_MARKERS) && contains_any(&lowered, &EXPERT_MARKERS)
}
564
565/// Scan GGUF tensor metadata and estimate which bytes are always resident versus
566/// expert-partitioned. Reads only the header and tensor-info tables.
567pub fn scan_gguf_tensor_byte_profile(path: &Path) -> Option<GgufTensorByteProfile> {
568    let mut f = std::fs::File::open(path).ok()?;
569    let file_len = f.metadata().ok()?.len();
570
571    let mut magic = [0u8; 4];
572    f.read_exact(&mut magic).ok()?;
573    if &magic != b"GGUF" {
574        return None;
575    }
576    let version = read_u32(&mut f).ok()?;
577    if version < 2 {
578        return None;
579    }
580
581    let n_tensors = read_gguf_header_count(&mut f, MAX_GGUF_TENSOR_COUNT, "tensor count").ok()?;
582    let n_kv = read_gguf_header_count(&mut f, MAX_GGUF_HEADER_KV_COUNT, "KV count").ok()?;
583
584    let mut expert_count = 0u32;
585    let mut expert_used_count = 0u32;
586    let mut alignment = 32u32;
587
588    for _ in 0..n_kv {
589        let key = read_gguf_string(&mut f).ok()?;
590        let vtype = GgufType::from_u32(read_u32(&mut f).ok()?)?;
591
592        if key == "general.alignment" {
593            if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
594                alignment = value.max(1);
595            }
596        } else if key.ends_with(".expert_count") {
597            if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
598                expert_count = value;
599            }
600        } else if key.ends_with(".expert_used_count") {
601            if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
602                expert_used_count = value;
603            }
604        } else {
605            skip_gguf_value(&mut f, vtype).ok()?;
606        }
607    }
608
609    let mut tensors = read_tensor_infos(&mut f, n_tensors).ok()?;
610    if tensors.is_empty() {
611        return Some(GgufTensorByteProfile {
612            expert_count,
613            expert_used_count,
614            full_model_bytes: file_len,
615            base_resident_bytes: 0,
616            expert_tensor_bytes: 0,
617            file_overhead_bytes: file_len,
618        });
619    }
620
621    let tensor_info_end = f.stream_position().ok()?;
622    let data_start = align_offset(tensor_info_end, alignment);
623    if data_start > file_len {
624        return None;
625    }
626    let data_len = file_len - data_start;
627
628    tensors.sort_by_key(|tensor| tensor.offset);
629    if tensors.first()?.offset > data_len {
630        return None;
631    }
632
633    let mut base_resident_bytes = 0u64;
634    let mut expert_tensor_bytes = 0u64;
635    for (index, tensor) in tensors.iter().enumerate() {
636        let next_offset = tensors
637            .get(index + 1)
638            .map(|next| next.offset)
639            .unwrap_or(data_len);
640        if next_offset < tensor.offset || next_offset > data_len {
641            return None;
642        }
643        let tensor_bytes = next_offset - tensor.offset;
644        if is_expert_partitioned_tensor(&tensor.name) {
645            expert_tensor_bytes = expert_tensor_bytes.saturating_add(tensor_bytes);
646        } else {
647            base_resident_bytes = base_resident_bytes.saturating_add(tensor_bytes);
648        }
649    }
650
651    let file_overhead_bytes = file_len.saturating_sub(base_resident_bytes + expert_tensor_bytes);
652    Some(GgufTensorByteProfile {
653        expert_count,
654        expert_used_count,
655        full_model_bytes: file_len,
656        base_resident_bytes,
657        expert_tensor_bytes,
658        file_overhead_bytes,
659    })
660}
661
// Unit tests. Each test writes a small hand-built GGUF byte stream to a
// temporary file and runs the parser on it; the KV-cache sizing tests build
// `GgufCompactMeta` values directly.
662#[cfg(test)]
663mod tests {
664    use super::*;
665    use std::io::Write;
666    use std::path::PathBuf;
667    use std::time::{SystemTime, UNIX_EPOCH};
668
    // Builds a path in the system temp directory from `prefix` and the current
    // time in nanoseconds since the Unix epoch.
    // NOTE(review): uniqueness relies on the prefix and the clock reading only.
669    fn temp_file_path(prefix: &str) -> PathBuf {
670        let unique = SystemTime::now()
671            .duration_since(UNIX_EPOCH)
672            .unwrap()
673            .as_nanos();
674        std::env::temp_dir().join(format!("{prefix}-{unique}.gguf"))
675    }
676
    // Writes `bytes` to a fresh temp file and returns its path; callers remove
    // the file themselves.
677    fn write_bytes(prefix: &str, bytes: &[u8]) -> PathBuf {
678        let path = temp_file_path(prefix);
679        let mut file = std::fs::File::create(&path).unwrap();
680        file.write_all(bytes).unwrap();
681        file.flush().unwrap();
682        path
683    }
684
    // Appends an array header: element type tag (u32 LE) then element count (u64 LE).
685    fn push_array_header(bytes: &mut Vec<u8>, elem_type: GgufType, count: u64) {
686        bytes.extend_from_slice(&(elem_type as u32).to_le_bytes());
687        bytes.extend_from_slice(&count.to_le_bytes());
688    }
689
    // Appends a string: byte length (u64 LE) then the raw bytes.
690    fn push_gguf_string(bytes: &mut Vec<u8>, value: &str) {
691        bytes.extend_from_slice(&(value.len() as u64).to_le_bytes());
692        bytes.extend_from_slice(value.as_bytes());
693    }
694
    // Appends one key/value entry whose value is a `Uint32`.
695    fn push_u32_kv(bytes: &mut Vec<u8>, key: &str, value: u32) {
696        push_gguf_string(bytes, key);
697        bytes.extend_from_slice(&(GgufType::Uint32 as u32).to_le_bytes());
698        bytes.extend_from_slice(&value.to_le_bytes());
699    }
700
    // Appends one tensor-info record: name, a dimension count of 1, one
    // dimension of 16, a u32 tag, and the given offset.
701    fn push_tensor_info(bytes: &mut Vec<u8>, name: &str, offset: u64) {
702        push_gguf_string(bytes, name);
703        bytes.extend_from_slice(&1u32.to_le_bytes());
704        bytes.extend_from_slice(&16u64.to_le_bytes());
705        bytes.extend_from_slice(&(GgufType::Uint8 as u32).to_le_bytes());
706        bytes.extend_from_slice(&offset.to_le_bytes());
707    }
708
    // One more nested array header than the depth limit allows must be rejected.
709    #[test]
710    fn skip_gguf_value_rejects_excessive_array_depth() {
711        let mut bytes = Vec::new();
712        for _ in 0..=MAX_GGUF_ARRAY_DEPTH {
713            push_array_header(&mut bytes, GgufType::Array, 1);
714        }
715        push_array_header(&mut bytes, GgufType::Uint8, 1);
716        bytes.push(0);
717
718        let path = write_bytes("model-artifact-gguf-depth", &bytes);
719        let mut file = std::fs::File::open(&path).unwrap();
720        let err = skip_gguf_value(&mut file, GgufType::Array).unwrap_err();
721        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
722        assert!(err.to_string().contains("nesting too deep"));
723        let _ = std::fs::remove_file(path);
724    }
725
    // An element count one above the limit must be rejected from the header alone.
726    #[test]
727    fn skip_gguf_value_rejects_excessive_array_count() {
728        let mut bytes = Vec::new();
729        push_array_header(&mut bytes, GgufType::Uint8, MAX_GGUF_ARRAY_ELEMENTS + 1);
730
731        let path = write_bytes("model-artifact-gguf-count", &bytes);
732        let mut file = std::fs::File::open(&path).unwrap();
733        let err = skip_gguf_value(&mut file, GgufType::Array).unwrap_err();
734        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
735        assert!(err.to_string().contains("array too long"));
736        let _ = std::fs::remove_file(path);
737    }
738
    // A full header whose single KV entry is an over-nested array yields `None`.
739    #[test]
740    fn scan_gguf_compact_meta_returns_none_on_malicious_nested_array() {
741        let mut bytes = Vec::new();
742        bytes.extend_from_slice(b"GGUF");
743        bytes.extend_from_slice(&2u32.to_le_bytes());
744        bytes.extend_from_slice(&0i64.to_le_bytes());
745        bytes.extend_from_slice(&1i64.to_le_bytes());
746        push_gguf_string(&mut bytes, "general.architecture");
747        bytes.extend_from_slice(&(GgufType::Array as u32).to_le_bytes());
748        for _ in 0..=MAX_GGUF_ARRAY_DEPTH {
749            push_array_header(&mut bytes, GgufType::Array, 1);
750        }
751        push_array_header(&mut bytes, GgufType::Uint8, 1);
752        bytes.push(0);
753
754        let path = write_bytes("model-artifact-gguf-malicious", &bytes);
755        assert!(scan_gguf_compact_meta(&path).is_none());
756        let _ = std::fs::remove_file(path);
757    }
758
    // With no head_count key, value_length is derived as 4096 / 8 = 512 and
    // key_length stays 0.
759    #[test]
760    fn scan_gguf_compact_meta_derives_value_length_from_kv_heads_without_head_count() {
761        let mut bytes = Vec::new();
762        bytes.extend_from_slice(b"GGUF");
763        bytes.extend_from_slice(&2u32.to_le_bytes());
764        bytes.extend_from_slice(&0i64.to_le_bytes());
765        bytes.extend_from_slice(&2i64.to_le_bytes());
766        push_u32_kv(&mut bytes, "llama.embedding_length", 4096);
767        push_u32_kv(&mut bytes, "llama.attention.head_count_kv", 8);
768
769        let path = write_bytes("model-artifact-gguf-kv-heads", &bytes);
770        let meta = scan_gguf_compact_meta(&path).expect("should parse GGUF");
771        assert_eq!(meta.head_count, 0);
772        assert_eq!(meta.kv_head_count, 8);
773        assert_eq!(meta.key_length, 0);
774        assert_eq!(meta.value_length, 512);
775        let _ = std::fs::remove_file(path);
776    }
777
    // 8 KV heads * 128 elements * 2 bytes (f16) * 24 layers = 49_152 bytes.
778    #[test]
779    fn scan_gguf_compact_meta_preserves_kv_head_count() {
780        let mut bytes = Vec::new();
781        bytes.extend_from_slice(b"GGUF");
782        bytes.extend_from_slice(&2u32.to_le_bytes());
783        bytes.extend_from_slice(&0i64.to_le_bytes());
784        bytes.extend_from_slice(&6i64.to_le_bytes());
785        push_u32_kv(&mut bytes, "llama.embedding_length", 4096);
786        push_u32_kv(&mut bytes, "llama.attention.head_count", 32);
787        push_u32_kv(&mut bytes, "llama.attention.head_count_kv", 8);
788        push_u32_kv(&mut bytes, "llama.block_count", 24);
789        push_u32_kv(&mut bytes, "llama.attention.key_length", 128);
790        push_u32_kv(&mut bytes, "llama.attention.value_length", 128);
791
792        let path = write_bytes("model-artifact-gguf-kv-head-count", &bytes);
793        let meta = scan_gguf_compact_meta(&path).expect("should parse GGUF");
794        assert_eq!(meta.head_count, 32);
795        assert_eq!(meta.kv_head_count, 8);
796        assert_eq!(meta.effective_kv_head_count(), Some(8));
797        assert_eq!(meta.k_cache_bytes_per_token_f16(), Some(49_152));
798        assert_eq!(meta.v_cache_bytes_per_token_f16(), Some(49_152));
799        let _ = std::fs::remove_file(path);
800    }
801
    // 1024 elements per layer: q8_0 -> 32 blocks * 34 bytes, q4_0 -> 32 blocks
    // * 18 bytes, each multiplied by 24 layers.
802    #[test]
803    fn kv_cache_quant_prices_key_and_value_types_independently() {
804        let meta = GgufCompactMeta {
805            head_count: 32,
806            kv_head_count: 8,
807            layer_count: 24,
808            key_length: 128,
809            value_length: 128,
810            ..Default::default()
811        };
812        let quant = GgufKvCacheQuant::new(GgufKvCacheType::Q8_0, GgufKvCacheType::Q4_0);
813
814        assert_eq!(quant.k_cache_bytes_per_token(&meta), Some(26_112));
815        assert_eq!(quant.v_cache_bytes_per_token(&meta), Some(13_824));
816        assert_eq!(quant.kv_cache_bytes_per_token(&meta), Some(39_936));
817    }
818
    // K uses 8 * 64 = 512 elements per layer, V uses 8 * 256 = 2048.
819    #[test]
820    fn kv_cache_quant_prices_key_and_value_widths_independently() {
821        let meta = GgufCompactMeta {
822            head_count: 32,
823            kv_head_count: 8,
824            layer_count: 24,
825            key_length: 64,
826            value_length: 256,
827            ..Default::default()
828        };
829        let quant = GgufKvCacheQuant::new(GgufKvCacheType::Q8_0, GgufKvCacheType::Q4_0);
830
831        assert_eq!(quant.k_cache_bytes_per_token(&meta), Some(13_056));
832        assert_eq!(quant.v_cache_bytes_per_token(&meta), Some(27_648));
833        assert_eq!(quant.kv_cache_bytes_per_token(&meta), Some(40_704));
834    }
835
    // value_length is left at 0 here, so the V figure and the combined figure
    // are `None`; K falls back to head_count (32 * 128 * 2 * 24 = 196_608).
836    #[test]
837    fn kv_cache_bytes_per_token_returns_none_when_required_fields_are_missing() {
838        let meta = GgufCompactMeta {
839            head_count: 32,
840            layer_count: 24,
841            key_length: 128,
842            ..Default::default()
843        };
844
845        assert_eq!(meta.k_cache_bytes_per_token_f16(), Some(196_608));
846        assert_eq!(meta.v_cache_bytes_per_token_f16(), None);
847        assert_eq!(
848            GgufKvCacheQuant::f16().kv_cache_bytes_per_token(&meta),
849            None
850        );
851    }
852
    // A KV count of -1 in the header must make the scan return `None`.
853    #[test]
854    fn scan_gguf_compact_meta_rejects_negative_kv_count() {
855        let mut bytes = Vec::new();
856        bytes.extend_from_slice(b"GGUF");
857        bytes.extend_from_slice(&2u32.to_le_bytes());
858        bytes.extend_from_slice(&0i64.to_le_bytes());
859        bytes.extend_from_slice(&(-1i64).to_le_bytes());
860
861        let path = write_bytes("model-artifact-gguf-negative-kv", &bytes);
862        assert!(scan_gguf_compact_meta(&path).is_none());
863        let _ = std::fs::remove_file(path);
864    }
865
    // A tensor count one above the limit must make the scan return `None`.
866    #[test]
867    fn scan_gguf_tensor_byte_profile_rejects_excessive_tensor_count() {
868        let mut bytes = Vec::new();
869        bytes.extend_from_slice(b"GGUF");
870        bytes.extend_from_slice(&2u32.to_le_bytes());
871        bytes.extend_from_slice(&((MAX_GGUF_TENSOR_COUNT as i64) + 1).to_le_bytes());
872        bytes.extend_from_slice(&0i64.to_le_bytes());
873
874        let path = write_bytes("model-artifact-gguf-too-many-tensors", &bytes);
875        assert!(scan_gguf_tensor_byte_profile(&path).is_none());
876        let _ = std::fs::remove_file(path);
877    }
878
    // An Int32 of -1 must be reported as an error rather than converted.
879    #[test]
880    fn read_gguf_value_as_u32_rejects_negative_int32() {
881        let path = write_bytes("model-artifact-gguf-negative-int32", &(-1i32).to_le_bytes());
882        let mut file = std::fs::File::open(&path).unwrap();
883        let err = read_gguf_value_as_u32(&mut file, GgufType::Int32).unwrap_err();
884        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
885        assert!(
886            err.to_string()
887                .contains("negative Int32 where unsigned GGUF value was expected")
888        );
889        let _ = std::fs::remove_file(path);
890    }
891
    // Two tensors at offsets 0 and 64 in a 96-byte data section: the expert
    // tensor is charged 64 bytes and the attention tensor the remaining 32.
892    #[test]
893    fn scan_gguf_tensor_byte_profile_splits_base_and_expert_bytes() {
894        let mut bytes = Vec::new();
895        bytes.extend_from_slice(b"GGUF");
896        bytes.extend_from_slice(&2u32.to_le_bytes());
897        bytes.extend_from_slice(&2i64.to_le_bytes());
898        bytes.extend_from_slice(&3i64.to_le_bytes());
899
900        push_u32_kv(&mut bytes, "general.alignment", 32);
901        push_u32_kv(&mut bytes, "llama.expert_count", 8);
902        push_u32_kv(&mut bytes, "llama.expert_used_count", 2);
903
904        push_tensor_info(&mut bytes, "blk.0.ffn_up_exps.weight", 0);
905        push_tensor_info(&mut bytes, "blk.0.attn_q.weight", 64);
906
        // Pad the header to the aligned data start, then append 96 data bytes.
907        let data_start = align_offset(bytes.len() as u64, 32) as usize;
908        bytes.resize(data_start, 0);
909        bytes.resize(data_start + 96, 0);
910
911        let path = write_bytes("model-artifact-gguf-tensors", &bytes);
912        let profile = scan_gguf_tensor_byte_profile(&path).unwrap();
913        assert_eq!(profile.expert_count, 8);
914        assert_eq!(profile.expert_used_count, 2);
915        assert_eq!(profile.expert_tensor_bytes, 64);
916        assert_eq!(profile.base_resident_bytes, 32);
917        assert_eq!(profile.full_model_bytes, bytes.len() as u64);
918        assert_eq!(
919            profile.full_model_bytes,
920            profile.base_resident_bytes + profile.expert_tensor_bytes + profile.file_overhead_bytes
921        );
922        let _ = std::fs::remove_file(path);
923    }
924}
// model_artifact/gguf.rs