1use std::io::{Read, Seek, SeekFrom};
2use std::path::Path;
3
/// Largest byte length accepted for a single length-prefixed GGUF string.
const MAX_GGUF_STRING_BYTES: u64 = 1_000_000;
/// Largest element count accepted in a GGUF array header.
const MAX_GGUF_ARRAY_ELEMENTS: u64 = 1_000_000;
/// Deepest array-inside-array nesting that value skipping will descend into.
const MAX_GGUF_ARRAY_DEPTH: u32 = 64;
/// Largest dimension count accepted in one tensor-info record.
const MAX_GGUF_TENSOR_DIMS: u32 = 8;
/// Largest metadata key/value count accepted from the file header.
const MAX_GGUF_HEADER_KV_COUNT: usize = 1_000_000;
/// Largest tensor count accepted from the file header.
const MAX_GGUF_TENSOR_COUNT: usize = 1_000_000;
10
/// Type tag that precedes every GGUF metadata value.
///
/// Each discriminant is the raw `u32` tag as it is read from the file.
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq)]
enum GgufType {
    Uint8 = 0,
    Int8 = 1,
    Uint16 = 2,
    Int16 = 3,
    Uint32 = 4,
    Int32 = 5,
    Float32 = 6,
    Bool = 7,
    String = 8,
    Array = 9,
    Uint64 = 10,
    Int64 = 11,
    Float64 = 12,
}

impl GgufType {
    /// Every variant; searched by discriminant when decoding a raw tag.
    const ALL: [Self; 13] = [
        Self::Uint8,
        Self::Int8,
        Self::Uint16,
        Self::Int16,
        Self::Uint32,
        Self::Int32,
        Self::Float32,
        Self::Bool,
        Self::String,
        Self::Array,
        Self::Uint64,
        Self::Int64,
        Self::Float64,
    ];

    /// Decodes a raw tag, returning `None` for any value that is not a known
    /// discriminant.
    fn from_u32(v: u32) -> Option<Self> {
        Self::ALL.iter().copied().find(|kind| *kind as u32 == v)
    }

    /// Encoded width in bytes of one value of this type, or `None` for the
    /// variable-length `String` and `Array` types.
    fn fixed_size(self) -> Option<usize> {
        let width = match self {
            Self::String | Self::Array => return None,
            Self::Uint8 | Self::Int8 | Self::Bool => 1,
            Self::Uint16 | Self::Int16 => 2,
            Self::Uint32 | Self::Int32 | Self::Float32 => 4,
            Self::Uint64 | Self::Int64 | Self::Float64 => 8,
        };
        Some(width)
    }
}
60
/// Reads the next four bytes of `f` and decodes them as a little-endian `u32`.
///
/// Returns the `read_exact` error when four bytes cannot be read.
fn read_u32(f: &mut std::fs::File) -> std::io::Result<u32> {
    let mut raw = [0u8; 4];
    f.read_exact(&mut raw).map(|()| u32::from_le_bytes(raw))
}
66
/// Reads the next eight bytes of `f` and decodes them as a little-endian `u64`.
///
/// Returns the `read_exact` error when eight bytes cannot be read.
fn read_u64(f: &mut std::fs::File) -> std::io::Result<u64> {
    let mut raw = [0u8; 8];
    f.read_exact(&mut raw).map(|()| u64::from_le_bytes(raw))
}
72
/// Reads the next four bytes of `f` and decodes them as a little-endian `i32`.
///
/// Returns the `read_exact` error when four bytes cannot be read.
fn read_i32(f: &mut std::fs::File) -> std::io::Result<i32> {
    let mut raw = [0u8; 4];
    f.read_exact(&mut raw).map(|()| i32::from_le_bytes(raw))
}
78
/// Reads the next eight bytes of `f` and decodes them as a little-endian `i64`.
///
/// Returns the `read_exact` error when eight bytes cannot be read.
fn read_i64(f: &mut std::fs::File) -> std::io::Result<i64> {
    let mut raw = [0u8; 8];
    f.read_exact(&mut raw).map(|()| i64::from_le_bytes(raw))
}
84
85fn read_gguf_header_count(
86 f: &mut std::fs::File,
87 max: usize,
88 label: &str,
89) -> std::io::Result<usize> {
90 let value = read_i64(f)?;
91 let count = usize::try_from(value).map_err(|_| {
92 std::io::Error::new(std::io::ErrorKind::InvalidData, format!("negative {label}"))
93 })?;
94 if count > max {
95 return Err(std::io::Error::new(
96 std::io::ErrorKind::InvalidData,
97 format!("{label} too large"),
98 ));
99 }
100 Ok(count)
101}
102
103fn read_bounded_len(f: &mut std::fs::File, max: u64, label: &str) -> std::io::Result<usize> {
104 let len = read_u64(f)?;
105 if len > max {
106 return Err(std::io::Error::new(
107 std::io::ErrorKind::InvalidData,
108 format!("{label} too long"),
109 ));
110 }
111 usize::try_from(len).map_err(|_| {
112 std::io::Error::new(
113 std::io::ErrorKind::InvalidData,
114 format!("{label} too large"),
115 )
116 })
117}
118
119fn read_gguf_string(f: &mut std::fs::File) -> std::io::Result<String> {
120 let len = read_bounded_len(f, MAX_GGUF_STRING_BYTES, "string")?;
121 let mut buf = vec![0u8; len];
122 f.read_exact(&mut buf)?;
123 String::from_utf8(buf).map_err(|_| {
124 std::io::Error::new(
125 std::io::ErrorKind::InvalidData,
126 "invalid UTF-8 in GGUF string",
127 )
128 })
129}
130
/// Advances `f` past one metadata value of type `typ` without keeping it.
///
/// Entry point for `skip_gguf_value_with_depth`, starting at nesting depth 0.
fn skip_gguf_value(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<()> {
    skip_gguf_value_with_depth(f, typ, 0)
}
134
135fn skip_gguf_value_with_depth(
136 f: &mut std::fs::File,
137 typ: GgufType,
138 depth: u32,
139) -> std::io::Result<()> {
140 match typ {
141 GgufType::String => {
142 let _ = read_gguf_string(f)?;
143 }
144 GgufType::Array => {
145 if depth >= MAX_GGUF_ARRAY_DEPTH {
146 return Err(std::io::Error::new(
147 std::io::ErrorKind::InvalidData,
148 "GGUF nesting too deep",
149 ));
150 }
151 let elem_type = GgufType::from_u32(read_u32(f)?).ok_or_else(|| {
152 std::io::Error::new(std::io::ErrorKind::InvalidData, "bad array type")
153 })?;
154 let count = read_bounded_len(f, MAX_GGUF_ARRAY_ELEMENTS, "array")?;
155 for _ in 0..count {
156 skip_gguf_value_with_depth(f, elem_type, depth + 1)?;
157 }
158 }
159 other => {
160 let size = other.fixed_size().unwrap_or(0);
161 f.seek(SeekFrom::Current(size as i64))?;
162 }
163 }
164 Ok(())
165}
166
167fn read_gguf_value_as_u32(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<Option<u32>> {
168 match typ {
169 GgufType::Uint32 => Ok(Some(read_u32(f)?)),
170 GgufType::Int32 => {
171 let value = read_i32(f)?;
172 let value = u32::try_from(value).map_err(|_| {
173 std::io::Error::new(
174 std::io::ErrorKind::InvalidData,
175 "negative Int32 where unsigned GGUF value was expected",
176 )
177 })?;
178 Ok(Some(value))
179 }
180 GgufType::Uint16 => {
181 let mut buf = [0u8; 2];
182 f.read_exact(&mut buf)?;
183 Ok(Some(u16::from_le_bytes(buf) as u32))
184 }
185 GgufType::Uint8 => {
186 let mut buf = [0u8; 1];
187 f.read_exact(&mut buf)?;
188 Ok(Some(buf[0] as u32))
189 }
190 _ => {
191 skip_gguf_value(f, typ)?;
192 Ok(None)
193 }
194 }
195}
196
197fn read_gguf_value_as_f32(f: &mut std::fs::File, typ: GgufType) -> std::io::Result<Option<f32>> {
198 match typ {
199 GgufType::Float32 => {
200 let mut buf = [0u8; 4];
201 f.read_exact(&mut buf)?;
202 Ok(Some(f32::from_le_bytes(buf)))
203 }
204 _ => {
205 skip_gguf_value(f, typ)?;
206 Ok(None)
207 }
208 }
209}
210
211fn read_gguf_value_as_string_opt(
212 f: &mut std::fs::File,
213 typ: GgufType,
214) -> std::io::Result<Option<String>> {
215 match typ {
216 GgufType::String => Ok(Some(read_gguf_string(f)?)),
217 _ => {
218 skip_gguf_value(f, typ)?;
219 Ok(None)
220 }
221 }
222}
223
224#[derive(Clone, Debug, Default)]
225pub struct GgufCompactMeta {
226 pub architecture: String,
227 pub context_length: u32,
228 pub vocab_size: u32,
229 pub embedding_size: u32,
230 pub head_count: u32,
231 pub kv_head_count: u32,
232 pub layer_count: u32,
233 pub feed_forward_length: u32,
234 pub key_length: u32,
235 pub value_length: u32,
236 pub tokenizer_model_name: String,
237 pub rope_scale: f32,
238 pub rope_freq_base: f32,
239 pub expert_count: u32,
240 pub expert_used_count: u32,
241}
242
243impl GgufCompactMeta {
244 pub fn effective_kv_head_count(&self) -> Option<u32> {
245 if self.kv_head_count > 0 {
246 Some(self.kv_head_count)
247 } else if self.head_count > 0 {
248 Some(self.head_count)
249 } else {
250 None
251 }
252 }
253
254 pub fn k_cache_bytes_per_token_f16(&self) -> Option<u64> {
255 GgufKvCacheQuant::f16().k_cache_bytes_per_token(self)
256 }
257
258 pub fn v_cache_bytes_per_token_f16(&self) -> Option<u64> {
259 GgufKvCacheQuant::f16().v_cache_bytes_per_token(self)
260 }
261
262 pub fn kv_cache_bytes_per_token_f16(&self) -> Option<u64> {
263 GgufKvCacheQuant::f16().kv_cache_bytes_per_token(self)
264 }
265}
266
/// Storage format of one half (keys or values) of the KV cache.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GgufKvCacheType {
    F16,
    Q8_0,
    Q4_0,
}

impl GgufKvCacheType {
    /// Parses a cache-type argument, ignoring ASCII case.
    ///
    /// Returns `None` for anything other than `f16`, `q8_0` or `q4_0`.
    pub fn from_llama_arg(value: &str) -> Option<Self> {
        [Self::F16, Self::Q8_0, Self::Q4_0]
            .iter()
            .copied()
            .find(|kind| value.eq_ignore_ascii_case(kind.as_llama_arg()))
    }

    /// Lower-case argument spelling of this cache type.
    pub const fn as_llama_arg(self) -> &'static str {
        match self {
            Self::F16 => "f16",
            Self::Q8_0 => "q8_0",
            Self::Q4_0 => "q4_0",
        }
    }

    /// `(elements per block, bytes per block)` for this format.
    fn block_shape(self) -> (u64, u64) {
        match self {
            Self::Q4_0 => (32, 18),
            Self::Q8_0 => (32, 34),
            Self::F16 => (1, 2),
        }
    }

    /// Bytes needed to store `elements` values, rounding up to whole blocks.
    ///
    /// Returns `None` if any intermediate step overflows `u64`.
    fn bytes_for_elements(self, elements: u64) -> Option<u64> {
        let (per_block, bytes_per_block) = self.block_shape();
        let round_up = per_block.checked_sub(1)?;
        let block_count = elements.checked_add(round_up)?.checked_div(per_block)?;
        block_count.checked_mul(bytes_per_block)
    }
}
308
309#[derive(Clone, Copy, Debug, Eq, PartialEq)]
310pub struct GgufKvCacheQuant {
311 pub k: GgufKvCacheType,
312 pub v: GgufKvCacheType,
313}
314
315impl GgufKvCacheQuant {
316 pub const F16: Self = Self {
318 k: GgufKvCacheType::F16,
319 v: GgufKvCacheType::F16,
320 };
321
322 pub const Q8_0: Self = Self {
324 k: GgufKvCacheType::Q8_0,
325 v: GgufKvCacheType::Q8_0,
326 };
327
328 pub const Q4_0: Self = Self {
330 k: GgufKvCacheType::Q4_0,
331 v: GgufKvCacheType::Q4_0,
332 };
333
334 pub const fn new(k: GgufKvCacheType, v: GgufKvCacheType) -> Self {
335 Self { k, v }
336 }
337
338 pub const fn f16() -> Self {
339 Self::F16
340 }
341
342 pub const fn is_more_aggressive_than(self, other: Self) -> bool {
345 Self::aggressiveness(self) > Self::aggressiveness(other)
346 }
347
348 const fn aggressiveness(q: Self) -> u8 {
349 Self::type_aggressiveness(q.k) + Self::type_aggressiveness(q.v)
350 }
351
352 const fn type_aggressiveness(t: GgufKvCacheType) -> u8 {
353 match t {
354 GgufKvCacheType::F16 => 0,
355 GgufKvCacheType::Q8_0 => 1,
356 GgufKvCacheType::Q4_0 => 2,
357 }
358 }
359
360 pub fn from_llama_args(cache_type_k: &str, cache_type_v: &str) -> Option<Self> {
361 Some(Self {
362 k: GgufKvCacheType::from_llama_arg(cache_type_k)?,
363 v: GgufKvCacheType::from_llama_arg(cache_type_v)?,
364 })
365 }
366
367 pub fn k_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
368 cache_bytes_per_token(meta, meta.key_length, self.k)
369 }
370
371 pub fn v_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
372 cache_bytes_per_token(meta, meta.value_length, self.v)
373 }
374
375 pub fn kv_cache_bytes_per_token(self, meta: &GgufCompactMeta) -> Option<u64> {
376 self.k_cache_bytes_per_token(meta)?
377 .checked_add(self.v_cache_bytes_per_token(meta)?)
378 }
379}
380
381fn cache_bytes_per_token(
382 meta: &GgufCompactMeta,
383 vector_length: u32,
384 cache_type: GgufKvCacheType,
385) -> Option<u64> {
386 let kv_heads = u64::from(meta.effective_kv_head_count()?);
387 let vector_length = u64::from((vector_length > 0).then_some(vector_length)?);
388 let layers = u64::from((meta.layer_count > 0).then_some(meta.layer_count)?);
389 let elements_per_layer = kv_heads.checked_mul(vector_length)?;
390 cache_type
391 .bytes_for_elements(elements_per_layer)?
392 .checked_mul(layers)
393}
394
/// Byte breakdown of a GGUF file produced by `scan_gguf_tensor_byte_profile`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct GgufTensorByteProfile {
    /// Value of the metadata key ending in `.expert_count` (0 when absent).
    pub expert_count: u32,
    /// Value of the metadata key ending in `.expert_used_count` (0 when absent).
    pub expert_used_count: u32,
    /// Total length of the file on disk.
    pub full_model_bytes: u64,
    /// Bytes attributed to tensors not classified as expert-partitioned.
    pub base_resident_bytes: u64,
    /// Bytes attributed to tensors classified as expert-partitioned.
    pub expert_tensor_bytes: u64,
    /// File bytes not attributed to any tensor: `full_model_bytes` minus the
    /// two tensor totals.
    pub file_overhead_bytes: u64,
}
404
/// The parts of one tensor-info record that the byte profiler keeps.
#[derive(Clone, Debug)]
struct GgufTensorInfo {
    /// Tensor name as stored in the file.
    name: String,
    /// Offset stored in the record; the profiler interprets it relative to the
    /// start of the aligned tensor-data section.
    offset: u64,
}
410
411pub fn scan_gguf_compact_meta(path: &Path) -> Option<GgufCompactMeta> {
414 let mut f = std::fs::File::open(path).ok()?;
415
416 let mut magic = [0u8; 4];
417 f.read_exact(&mut magic).ok()?;
418 if &magic != b"GGUF" {
419 return None;
420 }
421 let version = read_u32(&mut f).ok()?;
422 if version < 2 {
423 return None;
424 }
425 let _n_tensors = read_gguf_header_count(&mut f, MAX_GGUF_TENSOR_COUNT, "tensor count").ok()?;
426 let n_kv = read_gguf_header_count(&mut f, MAX_GGUF_HEADER_KV_COUNT, "KV count").ok()?;
427
428 let mut meta = GgufCompactMeta::default();
429 for _ in 0..n_kv {
430 let key = read_gguf_string(&mut f).ok()?;
431 let vtype = GgufType::from_u32(read_u32(&mut f).ok()?)?;
432
433 if key == "general.architecture" {
434 meta.architecture = read_gguf_value_as_string_opt(&mut f, vtype).ok()??;
435 } else if key == "tokenizer.ggml.model" {
436 meta.tokenizer_model_name = read_gguf_value_as_string_opt(&mut f, vtype).ok()??;
437 } else if key.ends_with(".context_length") {
438 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
439 meta.context_length = v;
440 }
441 } else if key.ends_with(".embedding_length") {
442 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
443 meta.embedding_size = v;
444 }
445 } else if key.ends_with(".head_count") && !key.ends_with("_kv") {
446 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
447 meta.head_count = v;
448 }
449 } else if key.ends_with(".attention.head_count_kv") {
450 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
451 meta.kv_head_count = v;
452 }
453 } else if key.ends_with(".block_count") {
454 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
455 meta.layer_count = v;
456 }
457 } else if key.ends_with(".feed_forward_length") {
458 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
459 meta.feed_forward_length = v;
460 }
461 } else if key.ends_with(".attention.key_length") {
462 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
463 meta.key_length = v;
464 }
465 } else if key.ends_with(".attention.value_length") {
466 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
467 meta.value_length = v;
468 }
469 } else if key.ends_with(".rope.scale") {
470 if let Ok(Some(v)) = read_gguf_value_as_f32(&mut f, vtype) {
471 meta.rope_scale = v;
472 }
473 } else if key.ends_with(".rope.freq_base") {
474 if let Ok(Some(v)) = read_gguf_value_as_f32(&mut f, vtype) {
475 meta.rope_freq_base = v;
476 }
477 } else if key.ends_with(".vocab_size") {
478 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
479 meta.vocab_size = v;
480 }
481 } else if key.ends_with(".expert_count") {
482 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
483 meta.expert_count = v;
484 }
485 } else if key.ends_with(".expert_used_count") {
486 if let Ok(Some(v)) = read_gguf_value_as_u32(&mut f, vtype) {
487 meta.expert_used_count = v;
488 }
489 } else {
490 skip_gguf_value(&mut f, vtype).ok()?;
491 }
492 }
493
494 if meta.key_length == 0
495 && meta.head_count > 0
496 && let Some(key_length) = meta.embedding_size.checked_div(meta.head_count)
497 {
498 meta.key_length = key_length;
499 }
500 if meta.value_length == 0
501 && let Some(effective_kv) = meta.effective_kv_head_count()
502 && let Some(value_length) = meta.embedding_size.checked_div(effective_kv)
503 {
504 meta.value_length = value_length;
505 }
506
507 Some(meta)
508}
509
/// Rounds `value` up to the next multiple of `alignment`.
///
/// An `alignment` of 0 is treated as 1, so `value` is returned unchanged.
/// If rounding up would exceed `u64::MAX` the result saturates at `u64::MAX`
/// instead of overflowing.
fn align_offset(value: u64, alignment: u32) -> u64 {
    let alignment = u64::from(alignment.max(1));
    match value % alignment {
        0 => value,
        remainder => value.saturating_add(alignment - remainder),
    }
}
519
520fn read_tensor_infos(
521 f: &mut std::fs::File,
522 n_tensors: usize,
523) -> std::io::Result<Vec<GgufTensorInfo>> {
524 let mut tensors = Vec::new();
525 tensors.try_reserve(n_tensors).map_err(|_| {
526 std::io::Error::new(
527 std::io::ErrorKind::InvalidData,
528 "GGUF tensor count requires too much memory",
529 )
530 })?;
531 for _ in 0..n_tensors {
532 let name = read_gguf_string(f)?;
533 let n_dims = read_u32(f)?;
534 if n_dims > MAX_GGUF_TENSOR_DIMS {
535 return Err(std::io::Error::new(
536 std::io::ErrorKind::InvalidData,
537 "too many GGUF tensor dimensions",
538 ));
539 }
540 for _ in 0..n_dims {
541 let _ = read_u64(f)?;
542 }
543 let _ = read_u32(f)?;
544 let offset = read_u64(f)?;
545 tensors.push(GgufTensorInfo { name, offset });
546 }
547 Ok(tensors)
548}
549
/// Classifies a tensor by name as expert-partitioned or not.
///
/// The name is compared in ASCII lower case. Any shared-expert marker makes
/// the result `false`; otherwise the result is `true` when the name contains
/// one of the expert markers.
fn is_expert_partitioned_tensor(name: &str) -> bool {
    const SHARED_MARKERS: [&str; 3] = ["shared_expert", "sharedexpert", "shexp"];
    const EXPERT_MARKERS: [&str; 6] = [
        "ffn_gate_exps",
        "ffn_up_exps",
        "ffn_down_exps",
        "exp_probs",
        ".expert",
        "_expert",
    ];

    let lowered = name.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    !mentions_any(&SHARED_MARKERS) && mentions_any(&EXPERT_MARKERS)
}
564
565pub fn scan_gguf_tensor_byte_profile(path: &Path) -> Option<GgufTensorByteProfile> {
568 let mut f = std::fs::File::open(path).ok()?;
569 let file_len = f.metadata().ok()?.len();
570
571 let mut magic = [0u8; 4];
572 f.read_exact(&mut magic).ok()?;
573 if &magic != b"GGUF" {
574 return None;
575 }
576 let version = read_u32(&mut f).ok()?;
577 if version < 2 {
578 return None;
579 }
580
581 let n_tensors = read_gguf_header_count(&mut f, MAX_GGUF_TENSOR_COUNT, "tensor count").ok()?;
582 let n_kv = read_gguf_header_count(&mut f, MAX_GGUF_HEADER_KV_COUNT, "KV count").ok()?;
583
584 let mut expert_count = 0u32;
585 let mut expert_used_count = 0u32;
586 let mut alignment = 32u32;
587
588 for _ in 0..n_kv {
589 let key = read_gguf_string(&mut f).ok()?;
590 let vtype = GgufType::from_u32(read_u32(&mut f).ok()?)?;
591
592 if key == "general.alignment" {
593 if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
594 alignment = value.max(1);
595 }
596 } else if key.ends_with(".expert_count") {
597 if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
598 expert_count = value;
599 }
600 } else if key.ends_with(".expert_used_count") {
601 if let Ok(Some(value)) = read_gguf_value_as_u32(&mut f, vtype) {
602 expert_used_count = value;
603 }
604 } else {
605 skip_gguf_value(&mut f, vtype).ok()?;
606 }
607 }
608
609 let mut tensors = read_tensor_infos(&mut f, n_tensors).ok()?;
610 if tensors.is_empty() {
611 return Some(GgufTensorByteProfile {
612 expert_count,
613 expert_used_count,
614 full_model_bytes: file_len,
615 base_resident_bytes: 0,
616 expert_tensor_bytes: 0,
617 file_overhead_bytes: file_len,
618 });
619 }
620
621 let tensor_info_end = f.stream_position().ok()?;
622 let data_start = align_offset(tensor_info_end, alignment);
623 if data_start > file_len {
624 return None;
625 }
626 let data_len = file_len - data_start;
627
628 tensors.sort_by_key(|tensor| tensor.offset);
629 if tensors.first()?.offset > data_len {
630 return None;
631 }
632
633 let mut base_resident_bytes = 0u64;
634 let mut expert_tensor_bytes = 0u64;
635 for (index, tensor) in tensors.iter().enumerate() {
636 let next_offset = tensors
637 .get(index + 1)
638 .map(|next| next.offset)
639 .unwrap_or(data_len);
640 if next_offset < tensor.offset || next_offset > data_len {
641 return None;
642 }
643 let tensor_bytes = next_offset - tensor.offset;
644 if is_expert_partitioned_tensor(&tensor.name) {
645 expert_tensor_bytes = expert_tensor_bytes.saturating_add(tensor_bytes);
646 } else {
647 base_resident_bytes = base_resident_bytes.saturating_add(tensor_bytes);
648 }
649 }
650
651 let file_overhead_bytes = file_len.saturating_sub(base_resident_bytes + expert_tensor_bytes);
652 Some(GgufTensorByteProfile {
653 expert_count,
654 expert_used_count,
655 full_model_bytes: file_len,
656 base_resident_bytes,
657 expert_tensor_bytes,
658 file_overhead_bytes,
659 })
660}
661
// Unit tests. Each test writes a small hand-built byte stream to a temporary
// file and runs the parser on it.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Builds a temp-dir path named from `prefix` and the current time in
    /// nanoseconds.
    fn temp_file_path(prefix: &str) -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        std::env::temp_dir().join(format!("{prefix}-{unique}.gguf"))
    }

    /// Writes `bytes` to a fresh temp file and returns its path.
    fn write_bytes(prefix: &str, bytes: &[u8]) -> PathBuf {
        let path = temp_file_path(prefix);
        let mut file = std::fs::File::create(&path).unwrap();
        file.write_all(bytes).unwrap();
        file.flush().unwrap();
        path
    }

    /// Appends an array header: element type tag, then element count.
    fn push_array_header(bytes: &mut Vec<u8>, elem_type: GgufType, count: u64) {
        bytes.extend_from_slice(&(elem_type as u32).to_le_bytes());
        bytes.extend_from_slice(&count.to_le_bytes());
    }

    /// Appends a length-prefixed string.
    fn push_gguf_string(bytes: &mut Vec<u8>, value: &str) {
        bytes.extend_from_slice(&(value.len() as u64).to_le_bytes());
        bytes.extend_from_slice(value.as_bytes());
    }

    /// Appends a metadata entry: key, `Uint32` type tag, value.
    fn push_u32_kv(bytes: &mut Vec<u8>, key: &str, value: u32) {
        push_gguf_string(bytes, key);
        bytes.extend_from_slice(&(GgufType::Uint32 as u32).to_le_bytes());
        bytes.extend_from_slice(&value.to_le_bytes());
    }

    /// Appends a tensor-info record with one dimension of 16, a `u32` field,
    /// and the given offset.
    fn push_tensor_info(bytes: &mut Vec<u8>, name: &str, offset: u64) {
        push_gguf_string(bytes, name);
        bytes.extend_from_slice(&1u32.to_le_bytes());
        bytes.extend_from_slice(&16u64.to_le_bytes());
        bytes.extend_from_slice(&(GgufType::Uint8 as u32).to_le_bytes());
        bytes.extend_from_slice(&offset.to_le_bytes());
    }

    #[test]
    fn skip_gguf_value_rejects_excessive_array_depth() {
        // One more nested array header than the depth limit allows.
        let mut bytes = Vec::new();
        for _ in 0..=MAX_GGUF_ARRAY_DEPTH {
            push_array_header(&mut bytes, GgufType::Array, 1);
        }
        push_array_header(&mut bytes, GgufType::Uint8, 1);
        bytes.push(0);

        let path = write_bytes("model-artifact-gguf-depth", &bytes);
        let mut file = std::fs::File::open(&path).unwrap();
        let err = skip_gguf_value(&mut file, GgufType::Array).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("nesting too deep"));
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn skip_gguf_value_rejects_excessive_array_count() {
        // The header alone is enough: the count is rejected before any
        // element is read.
        let mut bytes = Vec::new();
        push_array_header(&mut bytes, GgufType::Uint8, MAX_GGUF_ARRAY_ELEMENTS + 1);

        let path = write_bytes("model-artifact-gguf-count", &bytes);
        let mut file = std::fs::File::open(&path).unwrap();
        let err = skip_gguf_value(&mut file, GgufType::Array).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("array too long"));
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn scan_gguf_compact_meta_returns_none_on_malicious_nested_array() {
        // Header: magic, version 2, 0 tensors, 1 metadata entry.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&0i64.to_le_bytes());
        bytes.extend_from_slice(&1i64.to_le_bytes());
        // The entry's value is an array nested past the depth limit.
        push_gguf_string(&mut bytes, "general.architecture");
        bytes.extend_from_slice(&(GgufType::Array as u32).to_le_bytes());
        for _ in 0..=MAX_GGUF_ARRAY_DEPTH {
            push_array_header(&mut bytes, GgufType::Array, 1);
        }
        push_array_header(&mut bytes, GgufType::Uint8, 1);
        bytes.push(0);

        let path = write_bytes("model-artifact-gguf-malicious", &bytes);
        assert!(scan_gguf_compact_meta(&path).is_none());
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn scan_gguf_compact_meta_derives_value_length_from_kv_heads_without_head_count() {
        // Header: magic, version 2, 0 tensors, 2 metadata entries.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&0i64.to_le_bytes());
        bytes.extend_from_slice(&2i64.to_le_bytes());
        push_u32_kv(&mut bytes, "llama.embedding_length", 4096);
        push_u32_kv(&mut bytes, "llama.attention.head_count_kv", 8);

        let path = write_bytes("model-artifact-gguf-kv-heads", &bytes);
        let meta = scan_gguf_compact_meta(&path).expect("should parse GGUF");
        assert_eq!(meta.head_count, 0);
        assert_eq!(meta.kv_head_count, 8);
        // No head count, so the key width cannot be derived.
        assert_eq!(meta.key_length, 0);
        // 4096 / 8 KV heads.
        assert_eq!(meta.value_length, 512);
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn scan_gguf_compact_meta_preserves_kv_head_count() {
        // Header: magic, version 2, 0 tensors, 6 metadata entries.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&0i64.to_le_bytes());
        bytes.extend_from_slice(&6i64.to_le_bytes());
        push_u32_kv(&mut bytes, "llama.embedding_length", 4096);
        push_u32_kv(&mut bytes, "llama.attention.head_count", 32);
        push_u32_kv(&mut bytes, "llama.attention.head_count_kv", 8);
        push_u32_kv(&mut bytes, "llama.block_count", 24);
        push_u32_kv(&mut bytes, "llama.attention.key_length", 128);
        push_u32_kv(&mut bytes, "llama.attention.value_length", 128);

        let path = write_bytes("model-artifact-gguf-kv-head-count", &bytes);
        let meta = scan_gguf_compact_meta(&path).expect("should parse GGUF");
        assert_eq!(meta.head_count, 32);
        assert_eq!(meta.kv_head_count, 8);
        assert_eq!(meta.effective_kv_head_count(), Some(8));
        // 8 KV heads * 128 wide * 2 bytes (F16) * 24 layers.
        assert_eq!(meta.k_cache_bytes_per_token_f16(), Some(49_152));
        assert_eq!(meta.v_cache_bytes_per_token_f16(), Some(49_152));
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn kv_cache_quant_prices_key_and_value_types_independently() {
        let meta = GgufCompactMeta {
            head_count: 32,
            kv_head_count: 8,
            layer_count: 24,
            key_length: 128,
            value_length: 128,
            ..Default::default()
        };
        let quant = GgufKvCacheQuant::new(GgufKvCacheType::Q8_0, GgufKvCacheType::Q4_0);

        // 1024 elements = 32 blocks; 34 bytes per Q8_0 block, 18 per Q4_0
        // block; 24 layers.
        assert_eq!(quant.k_cache_bytes_per_token(&meta), Some(26_112));
        assert_eq!(quant.v_cache_bytes_per_token(&meta), Some(13_824));
        assert_eq!(quant.kv_cache_bytes_per_token(&meta), Some(39_936));
    }

    #[test]
    fn kv_cache_quant_prices_key_and_value_widths_independently() {
        let meta = GgufCompactMeta {
            head_count: 32,
            kv_head_count: 8,
            layer_count: 24,
            key_length: 64,
            value_length: 256,
            ..Default::default()
        };
        let quant = GgufKvCacheQuant::new(GgufKvCacheType::Q8_0, GgufKvCacheType::Q4_0);

        // Keys: 512 elements = 16 Q8_0 blocks. Values: 2048 elements = 64
        // Q4_0 blocks.
        assert_eq!(quant.k_cache_bytes_per_token(&meta), Some(13_056));
        assert_eq!(quant.v_cache_bytes_per_token(&meta), Some(27_648));
        assert_eq!(quant.kv_cache_bytes_per_token(&meta), Some(40_704));
    }

    #[test]
    fn kv_cache_bytes_per_token_returns_none_when_required_fields_are_missing() {
        // No value_length and no kv_head_count.
        let meta = GgufCompactMeta {
            head_count: 32,
            layer_count: 24,
            key_length: 128,
            ..Default::default()
        };

        // Falls back to head_count: 32 heads * 128 * 2 bytes * 24 layers.
        assert_eq!(meta.k_cache_bytes_per_token_f16(), Some(196_608));
        assert_eq!(meta.v_cache_bytes_per_token_f16(), None);
        assert_eq!(
            GgufKvCacheQuant::f16().kv_cache_bytes_per_token(&meta),
            None
        );
    }

    #[test]
    fn scan_gguf_compact_meta_rejects_negative_kv_count() {
        // Header: magic, version 2, 0 tensors, -1 metadata entries.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&0i64.to_le_bytes());
        bytes.extend_from_slice(&(-1i64).to_le_bytes());

        let path = write_bytes("model-artifact-gguf-negative-kv", &bytes);
        assert!(scan_gguf_compact_meta(&path).is_none());
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn scan_gguf_tensor_byte_profile_rejects_excessive_tensor_count() {
        // Header: magic, version 2, tensor count one above the limit,
        // 0 metadata entries.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&((MAX_GGUF_TENSOR_COUNT as i64) + 1).to_le_bytes());
        bytes.extend_from_slice(&0i64.to_le_bytes());

        let path = write_bytes("model-artifact-gguf-too-many-tensors", &bytes);
        assert!(scan_gguf_tensor_byte_profile(&path).is_none());
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn read_gguf_value_as_u32_rejects_negative_int32() {
        let path = write_bytes("model-artifact-gguf-negative-int32", &(-1i32).to_le_bytes());
        let mut file = std::fs::File::open(&path).unwrap();
        let err = read_gguf_value_as_u32(&mut file, GgufType::Int32).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(
            err.to_string()
                .contains("negative Int32 where unsigned GGUF value was expected")
        );
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn scan_gguf_tensor_byte_profile_splits_base_and_expert_bytes() {
        // Header: magic, version 2, 2 tensors, 3 metadata entries.
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"GGUF");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&2i64.to_le_bytes());
        bytes.extend_from_slice(&3i64.to_le_bytes());

        push_u32_kv(&mut bytes, "general.alignment", 32);
        push_u32_kv(&mut bytes, "llama.expert_count", 8);
        push_u32_kv(&mut bytes, "llama.expert_used_count", 2);

        // Expert tensor at data offset 0, non-expert tensor at offset 64.
        push_tensor_info(&mut bytes, "blk.0.ffn_up_exps.weight", 0);
        push_tensor_info(&mut bytes, "blk.0.attn_q.weight", 64);

        // Pad to the aligned data start, then add a 96-byte data section.
        let data_start = align_offset(bytes.len() as u64, 32) as usize;
        bytes.resize(data_start, 0);
        bytes.resize(data_start + 96, 0);

        let path = write_bytes("model-artifact-gguf-tensors", &bytes);
        let profile = scan_gguf_tensor_byte_profile(&path).unwrap();
        assert_eq!(profile.expert_count, 8);
        assert_eq!(profile.expert_used_count, 2);
        // The expert tensor spans offsets 0..64; the other spans 64..96.
        assert_eq!(profile.expert_tensor_bytes, 64);
        assert_eq!(profile.base_resident_bytes, 32);
        assert_eq!(profile.full_model_bytes, bytes.len() as u64);
        assert_eq!(
            profile.full_model_bytes,
            profile.base_resident_bytes + profile.expert_tensor_bytes + profile.file_overhead_bytes
        );
        let _ = std::fs::remove_file(path);
    }
}