use crate::error::{GgufError, SparseInferenceError};
use crate::model::types::Tensor;
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
use std::io::{Cursor, Read};

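/// The ASCII bytes "GGUF" read as a little-endian u32.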
pub const GGUF_MAGIC: u32 = 0x46554747;

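/// GGUF container version this parser accepts.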
pub const GGUF_VERSION: u32 = 3;

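/// Fixed-size header at the start of every GGUF file; all fields are little-endian.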
#[derive(Debug, Clone)]
pub struct GgufHeader {
    pub magic: u32,
    pub version: u32,
    pub tensor_count: u64,
    pub metadata_kv_count: u64,
}

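/// A single metadata value; the variants mirror the GGUF metadata value types.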
#[derive(Debug, Clone)]
pub enum GgufValue {
    Uint8(u8),
    Int8(i8),
    Uint16(u16),
    Int16(i16),
    Uint32(u32),
    Int32(i32),
    Float32(f32),
    Bool(bool),
    String(String),
    Array(Vec<GgufValue>),
    Uint64(u64),
    Int64(i64),
    Float64(f64),
}

impl GgufValue {
    /// Best-effort conversion to `u32`; 64-bit and negative values are cast with `as`
    /// and may not round-trip.
    pub fn as_u32(&self) -> Option<u32> {
        match self {
            GgufValue::Uint8(v) => Some(*v as u32),
            GgufValue::Uint16(v) => Some(*v as u32),
            GgufValue::Uint32(v) => Some(*v),
            GgufValue::Uint64(v) => Some(*v as u32),
            GgufValue::Int8(v) => Some(*v as u32),
            GgufValue::Int16(v) => Some(*v as u32),
            GgufValue::Int32(v) => Some(*v as u32),
            GgufValue::Int64(v) => Some(*v as u32),
            _ => None,
        }
    }

    /// Integer conversion that preserves the full range of 64-bit values instead of
    /// funnelling them through `as_u32`.
    pub fn as_usize(&self) -> Option<usize> {
        match self {
            GgufValue::Uint64(v) => usize::try_from(*v).ok(),
            GgufValue::Int64(v) => usize::try_from(*v).ok(),
            _ => self.as_u32().map(|v| v as usize),
        }
    }

    pub fn as_f32(&self) -> Option<f32> {
        match self {
            GgufValue::Float32(v) => Some(*v),
            GgufValue::Float64(v) => Some(*v as f32),
            GgufValue::Uint8(v) => Some(*v as f32),
            GgufValue::Int8(v) => Some(*v as f32),
            GgufValue::Uint16(v) => Some(*v as f32),
            GgufValue::Int16(v) => Some(*v as f32),
            GgufValue::Uint32(v) => Some(*v as f32),
            GgufValue::Int32(v) => Some(*v as f32),
            _ => None,
        }
    }
}

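/// Tensor element types recognised by this loader (a subset of the ggml type ids).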
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufTensorType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    Q2_K = 10,
    Q3_K = 11,
    Q4_K = 12,
    Q5_K = 13,
    Q6_K = 14,
}

impl GgufTensorType {
    pub fn from_u32(value: u32) -> Result<Self, GgufError> {
        match value {
            0 => Ok(Self::F32),
            1 => Ok(Self::F16),
            2 => Ok(Self::Q4_0),
            3 => Ok(Self::Q4_1),
            6 => Ok(Self::Q5_0),
            7 => Ok(Self::Q5_1),
            8 => Ok(Self::Q8_0),
            9 => Ok(Self::Q8_1),
            10 => Ok(Self::Q2_K),
            11 => Ok(Self::Q3_K),
            12 => Ok(Self::Q4_K),
            13 => Ok(Self::Q5_K),
            14 => Ok(Self::Q6_K),
            _ => Err(GgufError::InvalidTensorType(value)),
        }
    }

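    /// Number of elements packed into one quantization block.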
    pub fn block_size(&self) -> usize {
        match self {
            Self::F32 => 1,
            Self::F16 => 1,
            Self::Q4_0 | Self::Q4_1 => 32,
            Self::Q5_0 | Self::Q5_1 => 32,
            Self::Q8_0 | Self::Q8_1 => 32,
            Self::Q2_K | Self::Q3_K | Self::Q4_K | Self::Q5_K | Self::Q6_K => 256,
        }
    }

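    /// On-disk size in bytes of one quantization block. A tensor occupies
    /// `n_elements / block_size() * bytes_per_block()` bytes; e.g. a 4096 x 4096
    /// Q4_0 tensor is 16_777_216 / 32 * 18 = 9_437_184 bytes.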
    pub fn bytes_per_block(&self) -> usize {
        match self {
            Self::F32 => 4,
            Self::F16 => 2,
            Self::Q4_0 => 18,
            Self::Q4_1 => 20,
            Self::Q5_0 => 22,
            Self::Q5_1 => 24,
            Self::Q8_0 => 34,
            Self::Q8_1 => 36,
            Self::Q2_K => 84,
            Self::Q3_K => 110,
            Self::Q4_K => 144,
            Self::Q5_K => 176,
            Self::Q6_K => 210,
        }
    }
}

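/// Shape, element type, and data offset for a single tensor. The offset is relative
/// to the start of the tensor data section, not the start of the file.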
#[derive(Debug, Clone)]
pub struct GgufTensorInfo {
    pub name: String,
    pub dimensions: Vec<u64>,
    pub tensor_type: GgufTensorType,
    pub offset: u64,
}

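/// Parsed GGUF contents: header, metadata, and tensor descriptors. Raw tensor bytes
/// stay in the caller's buffer and are dequantized on demand by
/// [`GgufParser::load_tensor`].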
#[derive(Debug, Clone)]
pub struct GgufModel {
    pub header: GgufHeader,
    pub metadata: HashMap<String, GgufValue>,
    pub tensors: HashMap<String, GgufTensorInfo>,
    pub tensor_data_offset: u64,
}

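/// Stateless parser for GGUF files held entirely in memory.
///
/// A minimal usage sketch (the file path and tensor name are illustrative, and
/// error handling is elided):
///
/// ```ignore
/// let data = std::fs::read("model.gguf")?;
/// let model = GgufParser::parse(&data)?;
/// let arch = model.metadata.get("general.architecture");
/// let embeddings = GgufParser::load_tensor(&data, &model, "token_embd.weight")?;
/// ```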
pub struct GgufParser;

impl GgufParser {
    pub fn parse(data: &[u8]) -> Result<GgufModel, GgufError> {
        let mut cursor = Cursor::new(data);

        let header = Self::parse_header_from_cursor(&mut cursor)?;

        let metadata = Self::parse_metadata(&mut cursor, header.metadata_kv_count)?;

        let tensors = Self::parse_tensor_info(&mut cursor, header.tensor_count)?;

        // Tensor data starts at the next aligned offset after the tensor infos.
        // The alignment is taken from `general.alignment` when present (default 32).
        let current_pos = cursor.position();
        let alignment = metadata
            .get("general.alignment")
            .and_then(|v| v.as_u32())
            .unwrap_or(32) as u64;
        let tensor_data_offset = ((current_pos + alignment - 1) / alignment) * alignment;

        Ok(GgufModel {
            header,
            metadata,
            tensors,
            tensor_data_offset,
        })
    }

    pub fn parse_header(data: &[u8]) -> Result<GgufHeader, GgufError> {
        let mut cursor = Cursor::new(data);
        Self::parse_header_from_cursor(&mut cursor)
    }

    fn parse_header_from_cursor(cursor: &mut Cursor<&[u8]>) -> Result<GgufHeader, GgufError> {
        let magic = cursor.read_u32::<LittleEndian>()?;
        if magic != GGUF_MAGIC {
            return Err(GgufError::InvalidMagic(magic));
        }

        let version = cursor.read_u32::<LittleEndian>()?;
        if version != GGUF_VERSION {
            return Err(GgufError::UnsupportedVersion(version));
        }

        let tensor_count = cursor.read_u64::<LittleEndian>()?;
        let metadata_kv_count = cursor.read_u64::<LittleEndian>()?;

        Ok(GgufHeader {
            magic,
            version,
            tensor_count,
            metadata_kv_count,
        })
    }

    fn parse_metadata(
        cursor: &mut Cursor<&[u8]>,
        count: u64,
    ) -> Result<HashMap<String, GgufValue>, GgufError> {
        let mut metadata = HashMap::new();

        for _ in 0..count {
            let key = Self::read_string(cursor)?;
            let value = Self::read_value(cursor)?;
            metadata.insert(key, value);
        }

        Ok(metadata)
    }

    fn parse_tensor_info(
        cursor: &mut Cursor<&[u8]>,
        count: u64,
    ) -> Result<HashMap<String, GgufTensorInfo>, GgufError> {
        let mut tensors = HashMap::new();

        for _ in 0..count {
            let name = Self::read_string(cursor)?;

            let n_dims = cursor.read_u32::<LittleEndian>()? as usize;

            let mut dimensions = Vec::with_capacity(n_dims);
            for _ in 0..n_dims {
                dimensions.push(cursor.read_u64::<LittleEndian>()?);
            }

            let tensor_type_raw = cursor.read_u32::<LittleEndian>()?;
            let tensor_type = GgufTensorType::from_u32(tensor_type_raw)?;

            // Offset is relative to the start of the tensor data section.
            let offset_in_section = cursor.read_u64::<LittleEndian>()?;

            let info = GgufTensorInfo {
                name: name.clone(),
                dimensions,
                tensor_type,
                offset: offset_in_section,
            };

            tensors.insert(name, info);
        }

        Ok(tensors)
    }

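    // GGUF strings are a u64 byte length followed by that many UTF-8 bytes.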
    fn read_string(cursor: &mut Cursor<&[u8]>) -> Result<String, GgufError> {
        let len = cursor.read_u64::<LittleEndian>()? as usize;
        let mut bytes = vec![0u8; len];
        cursor.read_exact(&mut bytes)?;
        Ok(String::from_utf8(bytes)?)
    }

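    // Metadata values are a u32 type id followed by the payload; arrays store an
    // element type id, a u64 length, and then the elements back to back.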
    fn read_value(cursor: &mut Cursor<&[u8]>) -> Result<GgufValue, GgufError> {
        let value_type = cursor.read_u32::<LittleEndian>()?;
        Self::read_value_of_type(cursor, value_type)
    }

    fn read_value_of_type(
        cursor: &mut Cursor<&[u8]>,
        value_type: u32,
    ) -> Result<GgufValue, GgufError> {
        match value_type {
            0 => Ok(GgufValue::Uint8(cursor.read_u8()?)),
            1 => Ok(GgufValue::Int8(cursor.read_i8()?)),
            2 => Ok(GgufValue::Uint16(cursor.read_u16::<LittleEndian>()?)),
            3 => Ok(GgufValue::Int16(cursor.read_i16::<LittleEndian>()?)),
            4 => Ok(GgufValue::Uint32(cursor.read_u32::<LittleEndian>()?)),
            5 => Ok(GgufValue::Int32(cursor.read_i32::<LittleEndian>()?)),
            6 => Ok(GgufValue::Float32(cursor.read_f32::<LittleEndian>()?)),
            7 => Ok(GgufValue::Bool(cursor.read_u8()? != 0)),
            8 => Ok(GgufValue::String(Self::read_string(cursor)?)),
            9 => {
                let array_type = cursor.read_u32::<LittleEndian>()?;
                let array_len = cursor.read_u64::<LittleEndian>()? as usize;
                let mut array = Vec::with_capacity(array_len);

                for _ in 0..array_len {
                    array.push(Self::read_value_of_type(cursor, array_type)?);
                }
                Ok(GgufValue::Array(array))
            }
            10 => Ok(GgufValue::Uint64(cursor.read_u64::<LittleEndian>()?)),
            11 => Ok(GgufValue::Int64(cursor.read_i64::<LittleEndian>()?)),
            12 => Ok(GgufValue::Float64(cursor.read_f64::<LittleEndian>()?)),
            _ => Err(GgufError::InvalidValueType(value_type)),
        }
    }

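    /// Looks up a tensor by name and returns it dequantized to f32. `data` must be
    /// the same buffer that [`GgufParser::parse`] was called on, since tensor offsets
    /// are relative to its tensor data section.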
    pub fn load_tensor(
        data: &[u8],
        model: &GgufModel,
        tensor_name: &str,
    ) -> Result<Tensor, GgufError> {
        let info = model
            .tensors
            .get(tensor_name)
            .ok_or_else(|| GgufError::TensorNotFound(tensor_name.to_string()))?;

        let offset = (model.tensor_data_offset + info.offset) as usize;

        let n_elements = info.dimensions.iter().product::<u64>() as usize;

        let tensor_data = &data[offset..];
        let dequantized = Self::dequantize(tensor_data, info.tensor_type, n_elements)?;

        Ok(Tensor::new(
            dequantized,
            info.dimensions.clone(),
            tensor_name.to_string(),
        ))
    }

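    /// Expands `n_elements` values from a raw tensor byte slice into f32 according
    /// to the tensor's quantization type.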
    pub fn dequantize(
        data: &[u8],
        tensor_type: GgufTensorType,
        n_elements: usize,
    ) -> Result<Vec<f32>, GgufError> {
        match tensor_type {
            GgufTensorType::F32 => dequantize_f32(data, n_elements),
            GgufTensorType::F16 => dequantize_f16(data, n_elements),
            GgufTensorType::Q4_0 => Ok(dequantize_q4_0(data, n_elements)),
            GgufTensorType::Q4_1 => Ok(dequantize_q4_1(data, n_elements)),
            GgufTensorType::Q5_0 => Ok(dequantize_q5_0(data, n_elements)),
            GgufTensorType::Q5_1 => Ok(dequantize_q5_1(data, n_elements)),
            GgufTensorType::Q8_0 => Ok(dequantize_q8_0(data, n_elements)),
            GgufTensorType::Q8_1 => Ok(dequantize_q8_1(data, n_elements)),
            GgufTensorType::Q2_K => Ok(dequantize_q2_k(data, n_elements)),
            GgufTensorType::Q3_K => Ok(dequantize_q3_k(data, n_elements)),
            GgufTensorType::Q4_K => Ok(dequantize_q4_k(data, n_elements)),
            GgufTensorType::Q5_K => Ok(dequantize_q5_k(data, n_elements)),
            GgufTensorType::Q6_K => Ok(dequantize_q6_k(data, n_elements)),
        }
    }
}

fn dequantize_f32(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut result = Vec::with_capacity(n_elements);

    for _ in 0..n_elements {
        result.push(cursor.read_f32::<LittleEndian>()?);
    }

    Ok(result)
}

fn dequantize_f16(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut result = Vec::with_capacity(n_elements);

    for _ in 0..n_elements {
        let f16_bits = cursor.read_u16::<LittleEndian>()?;
        let f16_val = half::f16::from_bits(f16_bits);
        result.push(f16_val.to_f32());
    }

    Ok(result)
}

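// Q4_0 block (18 bytes): a little-endian f16 scale followed by 16 bytes of packed
// 4-bit quants; each quant is offset by -8 before scaling.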
fn dequantize_q4_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 18;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            // ggml ordering: elements 0..16 come from the low nibbles of the 16 data
            // bytes, elements 16..32 from the high nibbles of the same bytes.
            let byte_idx = block_offset + 2 + (i % 16);
            let nibble = if i < 16 {
                (data[byte_idx] & 0x0F) as i8
            } else {
                ((data[byte_idx] >> 4) & 0x0F) as i8
            };

            let value = (nibble - 8) as f32 * scale;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

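// Q4_1 block (20 bytes): f16 scale, f16 minimum, then 16 bytes of packed 4-bit
// quants decoded as `q * scale + min`.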
fn dequantize_q4_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 20;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        let min_bits = u16::from_le_bytes([data[block_offset + 2], data[block_offset + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            // ggml ordering: low nibbles give elements 0..16, high nibbles elements 16..32.
            let byte_idx = block_offset + 4 + (i % 16);
            let nibble = if i < 16 {
                data[byte_idx] & 0x0F
            } else {
                (data[byte_idx] >> 4) & 0x0F
            };

            let value = nibble as f32 * scale + min;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

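// Q5_0 block (22 bytes): f16 scale, 4 bytes holding one high bit per element, then
// 16 bytes of low nibbles; each 5-bit quant is offset by -16 before scaling.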
fn dequantize_q5_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 22;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        // One bit per element: bit i of `high_bits` is the fifth (high) bit of element i.
        let high_bits = u32::from_le_bytes([
            data[block_offset + 2],
            data[block_offset + 3],
            data[block_offset + 4],
            data[block_offset + 5],
        ]);

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            // ggml ordering: low nibbles give elements 0..16, high nibbles elements 16..32.
            let byte_idx = block_offset + 6 + (i % 16);
            let low_nibble = if i < 16 {
                data[byte_idx] & 0x0F
            } else {
                (data[byte_idx] >> 4) & 0x0F
            };

            let high_bit = ((high_bits >> i) & 1) as u8;
            let quant = (high_bit << 4) | low_nibble;

            let value = (quant as i8 - 16) as f32 * scale;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

fn dequantize_q5_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Placeholder: Q5_1 blocks are 24 bytes and carry a per-block minimum, so the
    // 22-byte Q5_0 path does not decode them correctly.
    dequantize_q5_0(data, n_elements)
}

fn dequantize_q8_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 34;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            let quant = data[block_offset + 2 + i] as i8;
            let value = quant as f32 * scale;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

fn dequantize_q8_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Placeholder: Q8_1 blocks are 36 bytes, so the 34-byte Q8_0 stride drifts out of
    // alignment after the first block.
    dequantize_q8_0(data, n_elements)
}

// The K-quant formats use 256-element super-blocks with multiple scales per block;
// they are not implemented here. These placeholders reuse the simple 32-element
// decoders and do not produce correct values for K-quantized tensors.
fn dequantize_q2_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q3_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q4_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q5_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}

fn dequantize_q6_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gguf_magic() {
        assert_eq!(GGUF_MAGIC, 0x46554747);
    }

    #[test]
    fn test_tensor_type_block_sizes() {
        assert_eq!(GgufTensorType::Q4_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q8_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q4_K.block_size(), 256);
    }

    #[test]
    fn test_dequantize_q4_0() {
        // One Q4_0 block: scale = f16 1.0 (bytes 0x00, 0x3C), first quant nibble = 1.
        let mut data = vec![0u8; 18];
        data[0] = 0x00;
        data[1] = 0x3C;
        data[2] = 0x01;

        let result = dequantize_q4_0(&data, 32);
        assert_eq!(result.len(), 32);
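        // With scale 1.0, qs[0]'s low nibble decodes to (1 - 8) * 1.0 and the
        // remaining zeroed nibbles decode to -8.0.
        assert_eq!(result[0], -7.0);
        assert_eq!(result[1], -8.0);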
    }
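
    #[test]
    fn test_parse_header() {
        // Round-trip sketch: a minimal header with zero tensors and zero metadata
        // entries, built from the parser's own constants.
        let mut data = Vec::new();
        data.extend_from_slice(&GGUF_MAGIC.to_le_bytes());
        data.extend_from_slice(&GGUF_VERSION.to_le_bytes());
        data.extend_from_slice(&0u64.to_le_bytes());
        data.extend_from_slice(&0u64.to_le_bytes());

        let header = match GgufParser::parse_header(&data) {
            Ok(header) => header,
            Err(_) => panic!("a minimal well-formed header should parse"),
        };
        assert_eq!(header.magic, GGUF_MAGIC);
        assert_eq!(header.tensor_count, 0);
        assert_eq!(header.metadata_kv_count, 0);
    }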
}