1use crate::bitnet::dequantize_bitnet_t158;
32use crate::error::{Result, RuvLLMError};
33
/// GGUF tensor data types. The `u32` discriminants match the type ids stored
/// in GGUF tensor headers, so `TryFrom<u32>` round-trips with `as u32`.
///
/// NOTE(review): upstream ggml assigns 29 = IQ1_M and 30 = BF16; here 29 is
/// BF16 and 30 is a BitNet extension — confirm this divergence is intentional
/// for files produced by this project's tooling.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum GgufQuantType {
    // Unquantized scalar floats.
    F32 = 0,
    F16 = 1,
    // Legacy 32-element block quantizations.
    Q4_0 = 2,
    Q4_1 = 3,
    Q4_2 = 4,
    Q4_3 = 5,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    // K-quants: 256-element super-blocks with per-sub-block scales.
    Q2_K = 10,
    Q3_K = 11,
    Q4_K = 12,
    Q5_K = 13,
    Q6_K = 14,
    Q8_K = 15,
    // i-quants: 256-element super-blocks (IQ4_NL uses 32; see block_size()).
    IQ2_XXS = 16,
    IQ2_XS = 17,
    IQ3_XXS = 18,
    IQ1_S = 19,
    IQ4_NL = 20,
    IQ3_S = 21,
    IQ2_S = 22,
    IQ4_XS = 23,
    // Plain integer types.
    I8 = 24,
    I16 = 25,
    I32 = 26,
    I64 = 27,
    // Wide float and brain-float.
    F64 = 28,
    Bf16 = 29,
    /// BitNet b1.58 ternary quantization (project-specific extension).
    BitnetT158 = 30,
}
107
108impl TryFrom<u32> for GgufQuantType {
109 type Error = RuvLLMError;
110
111 fn try_from(value: u32) -> Result<Self> {
112 match value {
113 0 => Ok(Self::F32),
114 1 => Ok(Self::F16),
115 2 => Ok(Self::Q4_0),
116 3 => Ok(Self::Q4_1),
117 4 => Ok(Self::Q4_2),
118 5 => Ok(Self::Q4_3),
119 6 => Ok(Self::Q5_0),
120 7 => Ok(Self::Q5_1),
121 8 => Ok(Self::Q8_0),
122 9 => Ok(Self::Q8_1),
123 10 => Ok(Self::Q2_K),
124 11 => Ok(Self::Q3_K),
125 12 => Ok(Self::Q4_K),
126 13 => Ok(Self::Q5_K),
127 14 => Ok(Self::Q6_K),
128 15 => Ok(Self::Q8_K),
129 16 => Ok(Self::IQ2_XXS),
130 17 => Ok(Self::IQ2_XS),
131 18 => Ok(Self::IQ3_XXS),
132 19 => Ok(Self::IQ1_S),
133 20 => Ok(Self::IQ4_NL),
134 21 => Ok(Self::IQ3_S),
135 22 => Ok(Self::IQ2_S),
136 23 => Ok(Self::IQ4_XS),
137 24 => Ok(Self::I8),
138 25 => Ok(Self::I16),
139 26 => Ok(Self::I32),
140 27 => Ok(Self::I64),
141 28 => Ok(Self::F64),
142 29 => Ok(Self::Bf16),
143 30 => Ok(Self::BitnetT158),
144 _ => Err(RuvLLMError::Model(format!(
145 "Unknown GGUF quantization type: {}",
146 value
147 ))),
148 }
149 }
150}
151
152impl GgufQuantType {
153 pub fn block_size(&self) -> usize {
158 match self {
159 Self::F32 | Self::F16 | Self::Bf16 | Self::F64 => 1,
160 Self::I8 | Self::I16 | Self::I32 | Self::I64 => 1,
161 Self::Q4_0 | Self::Q4_1 | Self::Q4_2 | Self::Q4_3 => 32,
162 Self::Q5_0 | Self::Q5_1 => 32,
163 Self::Q8_0 | Self::Q8_1 => 32,
164 Self::Q2_K | Self::Q3_K | Self::Q4_K | Self::Q5_K | Self::Q6_K | Self::Q8_K => 256,
165 Self::IQ2_XXS | Self::IQ2_XS | Self::IQ2_S => 256,
166 Self::IQ3_XXS | Self::IQ3_S => 256,
167 Self::IQ1_S => 256,
168 Self::IQ4_NL => 32,
169 Self::IQ4_XS => 256,
170 Self::BitnetT158 => 256,
171 }
172 }
173
174 pub fn type_size(&self) -> usize {
178 match self {
179 Self::F32 => 4,
180 Self::F16 => 2,
181 Self::Bf16 => 2,
182 Self::F64 => 8,
183 Self::I8 => 1,
184 Self::I16 => 2,
185 Self::I32 => 4,
186 Self::I64 => 8,
187 Self::Q4_0 => 18,
189 Self::Q4_1 => 20,
191 Self::Q4_2 => 18, Self::Q4_3 => 20, Self::Q5_0 => 22,
195 Self::Q5_1 => 24,
197 Self::Q8_0 => 34,
199 Self::Q8_1 => 36,
201 Self::Q2_K => 84,
203 Self::Q3_K => 110,
205 Self::Q4_K => 144,
207 Self::Q5_K => 176,
209 Self::Q6_K => 210,
211 Self::Q8_K => 292,
213 Self::IQ2_XXS => 66,
215 Self::IQ2_XS => 74,
216 Self::IQ2_S => 82,
217 Self::IQ3_XXS => 98,
218 Self::IQ3_S => 110,
219 Self::IQ1_S => 50,
220 Self::IQ4_NL => 18,
221 Self::IQ4_XS => 136,
222 Self::BitnetT158 => 66,
224 }
225 }
226
227 pub fn tensor_size(&self, num_elements: usize) -> usize {
229 let block_size = self.block_size();
230 let type_size = self.type_size();
231 let num_blocks = (num_elements + block_size - 1) / block_size;
232 num_blocks * type_size
233 }
234
235 pub fn is_quantized(&self) -> bool {
237 !matches!(
238 self,
239 Self::F32
240 | Self::F16
241 | Self::Bf16
242 | Self::F64
243 | Self::I8
244 | Self::I16
245 | Self::I32
246 | Self::I64
247 )
248 }
249
250 pub fn bits_per_weight(&self) -> f32 {
252 let type_size = self.type_size() as f32;
253 let block_size = self.block_size() as f32;
254 (type_size * 8.0) / block_size
255 }
256
257 pub fn name(&self) -> &'static str {
259 match self {
260 Self::F32 => "F32",
261 Self::F16 => "F16",
262 Self::Bf16 => "BF16",
263 Self::F64 => "F64",
264 Self::I8 => "I8",
265 Self::I16 => "I16",
266 Self::I32 => "I32",
267 Self::I64 => "I64",
268 Self::Q4_0 => "Q4_0",
269 Self::Q4_1 => "Q4_1",
270 Self::Q4_2 => "Q4_2",
271 Self::Q4_3 => "Q4_3",
272 Self::Q5_0 => "Q5_0",
273 Self::Q5_1 => "Q5_1",
274 Self::Q8_0 => "Q8_0",
275 Self::Q8_1 => "Q8_1",
276 Self::Q2_K => "Q2_K",
277 Self::Q3_K => "Q3_K",
278 Self::Q4_K => "Q4_K",
279 Self::Q5_K => "Q5_K",
280 Self::Q6_K => "Q6_K",
281 Self::Q8_K => "Q8_K",
282 Self::IQ2_XXS => "IQ2_XXS",
283 Self::IQ2_XS => "IQ2_XS",
284 Self::IQ2_S => "IQ2_S",
285 Self::IQ3_XXS => "IQ3_XXS",
286 Self::IQ3_S => "IQ3_S",
287 Self::IQ1_S => "IQ1_S",
288 Self::IQ4_NL => "IQ4_NL",
289 Self::IQ4_XS => "IQ4_XS",
290 Self::BitnetT158 => "BITNET_T158",
291 }
292 }
293}
294
/// A tensor whose bytes are still in their on-disk GGUF quantized form.
#[derive(Debug, Clone)]
pub struct QuantizedTensor {
    /// Raw block data exactly as read from the GGUF file.
    pub data: Vec<u8>,
    /// Quantization format of `data`.
    pub dtype: GgufQuantType,
    /// Logical tensor shape.
    pub shape: Vec<usize>,
    /// Total logical element count (expected to equal the product of
    /// `shape`; not validated here).
    pub num_elements: usize,
}
314
315impl QuantizedTensor {
316 pub fn dequantize(&self) -> Result<Vec<f32>> {
318 dequantize_tensor(&self.data, self.dtype, self.num_elements)
319 }
320
321 pub fn block_count(&self) -> usize {
323 let block_size = self.dtype.block_size();
324 (self.num_elements + block_size - 1) / block_size
325 }
326}
327
328pub fn dequantize_tensor(
344 data: &[u8],
345 dtype: GgufQuantType,
346 num_elements: usize,
347) -> Result<Vec<f32>> {
348 let mut output = vec![0.0f32; num_elements];
349
350 match dtype {
351 GgufQuantType::F32 => dequantize_f32(data, &mut output),
352 GgufQuantType::F16 => dequantize_f16(data, &mut output),
353 GgufQuantType::Bf16 => dequantize_bf16(data, &mut output),
354 GgufQuantType::Q4_0 => dequantize_q4_0(data, &mut output),
355 GgufQuantType::Q4_1 => dequantize_q4_1(data, &mut output),
356 GgufQuantType::Q5_0 => dequantize_q5_0(data, &mut output),
357 GgufQuantType::Q5_1 => dequantize_q5_1(data, &mut output),
358 GgufQuantType::Q8_0 => dequantize_q8_0(data, &mut output),
359 GgufQuantType::Q8_1 => dequantize_q8_1(data, &mut output),
360 GgufQuantType::Q2_K => dequantize_q2_k(data, &mut output),
361 GgufQuantType::Q3_K => dequantize_q3_k(data, &mut output),
362 GgufQuantType::Q4_K => dequantize_q4_k(data, &mut output),
363 GgufQuantType::Q5_K => dequantize_q5_k(data, &mut output),
364 GgufQuantType::Q6_K => dequantize_q6_k(data, &mut output),
365 GgufQuantType::IQ4_NL => dequantize_iq4_nl(data, &mut output),
366 GgufQuantType::BitnetT158 => dequantize_bitnet_t158_wrapper(data, &mut output),
367 GgufQuantType::IQ1_S => {
368 return Err(RuvLLMError::Model(
369 "IQ1_S dequantization requires codebook lookup tables (not yet implemented). \
370 For BitNet ternary quantization, use BITNET_T158 type instead."
371 .to_string(),
372 ));
373 }
374 _ => {
375 return Err(RuvLLMError::Model(format!(
376 "Dequantization not implemented for {:?}",
377 dtype
378 )));
379 }
380 }
381
382 Ok(output)
383}
384
385pub fn dequantize_block(data: &[u8], dtype: GgufQuantType, output: &mut [f32]) {
393 match dtype {
394 GgufQuantType::Q4_0 => dequantize_q4_0_block(data, output),
395 GgufQuantType::Q4_1 => dequantize_q4_1_block(data, output),
396 GgufQuantType::Q8_0 => dequantize_q8_0_block(data, output),
397 GgufQuantType::Q4_K => dequantize_q4_k_block(data, output),
398 GgufQuantType::BitnetT158 => dequantize_bitnet_t158_block_wrapper(data, output),
399 _ => {
400 output.fill(0.0);
402 }
403 }
404}
405
/// Dequantizes one BITNET_T158 block (64 packed ternary bytes + an f16 scale
/// = 66 bytes) into up to 256 `f32` outputs.
fn dequantize_bitnet_t158_block_wrapper(data: &[u8], output: &mut [f32]) {
    // Truncated input: zero-fill deterministically instead of panicking on a
    // short slice.
    if data.len() < BITNET_T158_TYPE_SIZE {
        output.fill(0.0);
        return;
    }

    // First 64 bytes hold the packed ternary weights.
    let packed = &data[..64];

    // Bytes 64..66 carry the per-block scale as a little-endian f16.
    let scale = f16_to_f32(u16::from_le_bytes([data[64], data[65]]));

    // Never request more elements than the caller's buffer can hold.
    let min_output_len = output.len().min(BITNET_T158_BLOCK_SIZE);
    // NOTE(review): assumes `dequantize_bitnet_t158` returns at most
    // `min_output_len` values — confirm against its definition in
    // `crate::bitnet`; otherwise the copy below would panic on a length
    // mismatch.
    let dequantized = dequantize_bitnet_t158(packed, &[scale], min_output_len);

    output[..dequantized.len()].copy_from_slice(&dequantized);
}
430
/// Decodes little-endian 4-byte groups in `data` as `f32` values, stopping
/// when either the input bytes or the output slots run out. Remaining output
/// elements are left untouched.
fn dequantize_f32(data: &[u8], output: &mut [f32]) {
    for (dst, quad) in output.iter_mut().zip(data.chunks_exact(4)) {
        *dst = f32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]);
    }
}
443
444fn dequantize_f16(data: &[u8], output: &mut [f32]) {
445 for (i, chunk) in data.chunks_exact(2).enumerate() {
446 if i >= output.len() {
447 break;
448 }
449 let bits = u16::from_le_bytes([chunk[0], chunk[1]]);
450 output[i] = f16_to_f32(bits);
451 }
452}
453
/// Decodes little-endian bfloat16 values into `f32`. A bf16 is the top 16
/// bits of an f32, so widening is a single shift.
fn dequantize_bf16(data: &[u8], output: &mut [f32]) {
    for (dst, pair) in output.iter_mut().zip(data.chunks_exact(2)) {
        let hi = u16::from_le_bytes([pair[0], pair[1]]) as u32;
        *dst = f32::from_bits(hi << 16);
    }
}
464
465const Q4_0_BLOCK_SIZE: usize = 32;
471const Q4_0_TYPE_SIZE: usize = 18; fn dequantize_q4_0(data: &[u8], output: &mut [f32]) {
474 let num_blocks = output.len() / Q4_0_BLOCK_SIZE;
475
476 for block_idx in 0..num_blocks {
477 let block_start = block_idx * Q4_0_TYPE_SIZE;
478 let out_start = block_idx * Q4_0_BLOCK_SIZE;
479
480 if block_start + Q4_0_TYPE_SIZE > data.len() {
481 break;
482 }
483
484 let block = &data[block_start..block_start + Q4_0_TYPE_SIZE];
485 let out = &mut output[out_start..out_start + Q4_0_BLOCK_SIZE];
486
487 dequantize_q4_0_block(block, out);
488 }
489}
490
491fn dequantize_q4_0_block(block: &[u8], output: &mut [f32]) {
492 let scale = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
494
495 for i in 0..16 {
497 let byte = block[2 + i];
498 let q0 = (byte & 0x0F) as i8 - 8; let q1 = ((byte >> 4) & 0x0F) as i8 - 8;
500
501 output[i * 2] = (q0 as f32) * scale;
502 output[i * 2 + 1] = (q1 as f32) * scale;
503 }
504}
505
506const Q4_1_BLOCK_SIZE: usize = 32;
511const Q4_1_TYPE_SIZE: usize = 20; fn dequantize_q4_1(data: &[u8], output: &mut [f32]) {
514 let num_blocks = output.len() / Q4_1_BLOCK_SIZE;
515
516 for block_idx in 0..num_blocks {
517 let block_start = block_idx * Q4_1_TYPE_SIZE;
518 let out_start = block_idx * Q4_1_BLOCK_SIZE;
519
520 if block_start + Q4_1_TYPE_SIZE > data.len() {
521 break;
522 }
523
524 let block = &data[block_start..block_start + Q4_1_TYPE_SIZE];
525 let out = &mut output[out_start..out_start + Q4_1_BLOCK_SIZE];
526
527 dequantize_q4_1_block(block, out);
528 }
529}
530
531fn dequantize_q4_1_block(block: &[u8], output: &mut [f32]) {
532 let scale = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
533 let min = f16_to_f32(u16::from_le_bytes([block[2], block[3]]));
534
535 for i in 0..16 {
536 let byte = block[4 + i];
537 let q0 = (byte & 0x0F) as f32;
538 let q1 = ((byte >> 4) & 0x0F) as f32;
539
540 output[i * 2] = q0 * scale + min;
541 output[i * 2 + 1] = q1 * scale + min;
542 }
543}
544
/// Elements per Q5_0 block.
const Q5_0_BLOCK_SIZE: usize = 32;
/// Bytes per Q5_0 block: f16 scale (2) + high-bit mask (4) + 16 nibble bytes.
const Q5_0_TYPE_SIZE: usize = 22;

/// Dequantizes Q5_0: 5-bit signed values in -16..=15, times an f16 scale.
///
/// NOTE(review): the high bit of element pair `i` is taken from mask bits
/// `2*i` / `2*i + 1` (interleaved with the nibble pairs). llama.cpp's Q5_0
/// keeps bits 0..15 for low nibbles and 16..31 for high nibbles — confirm
/// this interleaved convention matches whatever quantizer wrote the data.
fn dequantize_q5_0(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q5_0_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q5_0_TYPE_SIZE;
        let out_start = block_idx * Q5_0_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q5_0_TYPE_SIZE > data.len() {
            break;
        }

        let scale = f16_to_f32(u16::from_le_bytes([
            data[block_start],
            data[block_start + 1],
        ]));

        // 32 high bits, one per element, packed little-endian in 4 bytes.
        let qh = u32::from_le_bytes([
            data[block_start + 2],
            data[block_start + 3],
            data[block_start + 4],
            data[block_start + 5],
        ]);

        for i in 0..16 {
            let byte = data[block_start + 6 + i];
            let h0 = ((qh >> (i * 2)) & 1) as i8;
            let h1 = ((qh >> (i * 2 + 1)) & 1) as i8;

            // Combine nibble + high bit into 5 bits, recenter to signed.
            let q0 = ((byte & 0x0F) as i8 | (h0 << 4)) - 16;
            let q1 = (((byte >> 4) & 0x0F) as i8 | (h1 << 4)) - 16;

            output[out_start + i * 2] = (q0 as f32) * scale;
            output[out_start + i * 2 + 1] = (q1 as f32) * scale;
        }
    }
}
590
/// Elements per Q5_1 block.
const Q5_1_BLOCK_SIZE: usize = 32;
/// Bytes per Q5_1 block: f16 scale + f16 min + 4-byte high-bit mask +
/// 16 nibble bytes.
const Q5_1_TYPE_SIZE: usize = 24;

/// Dequantizes Q5_1: unsigned 5-bit values mapped through an affine
/// (scale, min) pair.
///
/// NOTE(review): like `dequantize_q5_0`, the high-bit mask is consumed in
/// interleaved pair order (bits `2*i`, `2*i + 1`) rather than llama.cpp's
/// low-16/high-16 split — confirm against the producing quantizer.
fn dequantize_q5_1(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q5_1_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q5_1_TYPE_SIZE;
        let out_start = block_idx * Q5_1_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q5_1_TYPE_SIZE > data.len() {
            break;
        }

        let scale = f16_to_f32(u16::from_le_bytes([
            data[block_start],
            data[block_start + 1],
        ]));
        let min = f16_to_f32(u16::from_le_bytes([
            data[block_start + 2],
            data[block_start + 3],
        ]));

        // 32 high bits, one per element, packed little-endian in 4 bytes.
        let qh = u32::from_le_bytes([
            data[block_start + 4],
            data[block_start + 5],
            data[block_start + 6],
            data[block_start + 7],
        ]);

        for i in 0..16 {
            let byte = data[block_start + 8 + i];
            let h0 = ((qh >> (i * 2)) & 1) as u8;
            let h1 = ((qh >> (i * 2 + 1)) & 1) as u8;

            // 5-bit unsigned values 0..=31; affine map applied below.
            let q0 = ((byte & 0x0F) | (h0 << 4)) as f32;
            let q1 = (((byte >> 4) & 0x0F) | (h1 << 4)) as f32;

            output[out_start + i * 2] = q0 * scale + min;
            output[out_start + i * 2 + 1] = q1 * scale + min;
        }
    }
}
638
639const Q8_0_BLOCK_SIZE: usize = 32;
644const Q8_0_TYPE_SIZE: usize = 34; fn dequantize_q8_0(data: &[u8], output: &mut [f32]) {
647 let num_blocks = output.len() / Q8_0_BLOCK_SIZE;
648
649 for block_idx in 0..num_blocks {
650 let block_start = block_idx * Q8_0_TYPE_SIZE;
651 let out_start = block_idx * Q8_0_BLOCK_SIZE;
652
653 if block_start + Q8_0_TYPE_SIZE > data.len() {
654 break;
655 }
656
657 let block = &data[block_start..block_start + Q8_0_TYPE_SIZE];
658 let out = &mut output[out_start..out_start + Q8_0_BLOCK_SIZE];
659
660 dequantize_q8_0_block(block, out);
661 }
662}
663
664fn dequantize_q8_0_block(block: &[u8], output: &mut [f32]) {
665 let scale = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
666
667 for i in 0..32 {
668 let q = block[2 + i] as i8;
669 output[i] = (q as f32) * scale;
670 }
671}
672
673const Q8_1_BLOCK_SIZE: usize = 32;
678const Q8_1_TYPE_SIZE: usize = 36; fn dequantize_q8_1(data: &[u8], output: &mut [f32]) {
681 let num_blocks = output.len() / Q8_1_BLOCK_SIZE;
682
683 for block_idx in 0..num_blocks {
684 let block_start = block_idx * Q8_1_TYPE_SIZE;
685 let out_start = block_idx * Q8_1_BLOCK_SIZE;
686
687 if block_start + Q8_1_TYPE_SIZE > data.len() {
688 break;
689 }
690
691 let scale = f16_to_f32(u16::from_le_bytes([
692 data[block_start],
693 data[block_start + 1],
694 ]));
695 let offset = f16_to_f32(u16::from_le_bytes([
696 data[block_start + 2],
697 data[block_start + 3],
698 ]));
699
700 for i in 0..32 {
701 let q = data[block_start + 4 + i] as i8;
702 output[out_start + i] = (q as f32) * scale + offset;
703 }
704 }
705}
706
/// Elements per Q2_K super-block.
const Q2_K_BLOCK_SIZE: usize = 256;
/// Bytes per Q2_K super-block.
const Q2_K_TYPE_SIZE: usize = 84;

/// Dequantizes Q2_K: 16 sub-blocks of 16 elements, each with a 4-bit
/// sub-block scale, values packed 2 bits each.
///
/// NOTE(review): simplified relative to llama.cpp's q2_K — the same nibble
/// drives both scale and min (upstream stores separate scale/min nibbles),
/// `d`/`dmin` are read at bytes 16..20 instead of trailing the payload, and
/// bytes 8..16 are never read. Confirm this layout against the quantizer
/// that produced the data.
fn dequantize_q2_k(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q2_K_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q2_K_TYPE_SIZE;
        let out_start = block_idx * Q2_K_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q2_K_TYPE_SIZE > data.len() {
            break;
        }

        let block = &data[block_start..];

        // Super-block scale and minimum, little-endian f16.
        let d = f16_to_f32(u16::from_le_bytes([block[16], block[17]]));
        let dmin = f16_to_f32(u16::from_le_bytes([block[18], block[19]]));

        for j in 0..16 {
            // 4-bit sub-block scale, two per byte in bytes 0..8.
            let sc = (block[j / 2] >> ((j % 2) * 4)) & 0x0F;
            let scale = d * (sc as f32);
            let min = dmin * (sc as f32);

            for k in 0..16 {
                let idx = j * 16 + k;
                // 2-bit values, four per byte, starting at byte 20.
                let byte_idx = 20 + idx / 4;
                let bit_idx = (idx % 4) * 2;
                let q = (block[byte_idx] >> bit_idx) & 0x03;
                output[out_start + idx] = (q as f32) * scale - min;
            }
        }
    }
}
752
/// Elements per Q3_K super-block.
const Q3_K_BLOCK_SIZE: usize = 256;
/// Bytes per Q3_K super-block.
const Q3_K_TYPE_SIZE: usize = 110;

/// Dequantizes Q3_K (simplified): a flat little-endian stream of 3-bit
/// values in the first 96 payload bytes, recentered by -4 and scaled by the
/// single f16 `d` at bytes 104..106.
///
/// NOTE(review): 3-bit fields that straddle a byte boundary lose their high
/// bits here (no carry from the next byte), and real q3_K carries
/// per-sub-block scales plus a separate high-bit plane that this decoder
/// ignores — confirm the intended format against the matching quantizer.
fn dequantize_q3_k(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q3_K_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q3_K_TYPE_SIZE;
        let out_start = block_idx * Q3_K_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q3_K_TYPE_SIZE > data.len() {
            break;
        }

        let block = &data[block_start..];
        let d = f16_to_f32(u16::from_le_bytes([block[104], block[105]]));

        for i in 0..256 {
            // Bit position of the i-th 3-bit field.
            let byte_idx = i * 3 / 8;
            let bit_offset = (i * 3) % 8;

            if byte_idx < 96 {
                let q = ((block[byte_idx] >> bit_offset) & 0x07) as i8 - 4;
                output[out_start + i] = (q as f32) * d;
            }
        }
    }
}
788
789const Q4_K_BLOCK_SIZE: usize = 256;
794const Q4_K_TYPE_SIZE: usize = 144; fn dequantize_q4_k(data: &[u8], output: &mut [f32]) {
797 let num_blocks = output.len() / Q4_K_BLOCK_SIZE;
798
799 for block_idx in 0..num_blocks {
800 let block_start = block_idx * Q4_K_TYPE_SIZE;
801 let out_start = block_idx * Q4_K_BLOCK_SIZE;
802
803 if block_start + Q4_K_TYPE_SIZE > data.len() {
804 break;
805 }
806
807 let block = &data[block_start..block_start + Q4_K_TYPE_SIZE];
808 let out = &mut output[out_start..out_start + Q4_K_BLOCK_SIZE];
809
810 dequantize_q4_k_block(block, out);
811 }
812}
813
/// Expands one Q4_K super-block: 8 sub-blocks of 32 elements.
///
/// Layout assumed here: f16 `d` (bytes 0..2), f16 `dmin` (2..4), 12 bytes of
/// packed 6-bit sub-block scales (4..16), then 128 bytes of nibbles
/// (16..144).
///
/// NOTE(review): the bit-sequential 6-bit scale unpacking and the `+ dmin`
/// term (rather than subtracting a per-sub-block minimum) are simplified
/// relative to llama.cpp's q4_K — confirm they match the quantizer that
/// produced this data.
fn dequantize_q4_k_block(block: &[u8], output: &mut [f32]) {
    let d = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
    let dmin = f16_to_f32(u16::from_le_bytes([block[2], block[3]]));

    for sb in 0..8 {
        // Each 6-bit scale starts at bit sb*6 of the 12-byte scale area.
        let scale_idx = sb * 6 / 8;
        let scale_shift = (sb * 6) % 8;

        let mut sc = (block[4 + scale_idx] >> scale_shift) & 0x3F;
        // Field straddles a byte boundary: pull the remaining bits from the
        // next byte.
        if scale_shift > 2 && scale_idx + 1 < 12 {
            sc |= (block[4 + scale_idx + 1] << (8 - scale_shift)) & 0x3F;
        }

        let scale = d * (sc as f32);

        // 16 nibble bytes per sub-block (32 elements).
        let qs_start = 16 + sb * 16;
        for i in 0..16 {
            let byte = block[qs_start + i];
            let q0 = (byte & 0x0F) as f32;
            let q1 = ((byte >> 4) & 0x0F) as f32;

            output[sb * 32 + i * 2] = q0 * scale + dmin;
            output[sb * 32 + i * 2 + 1] = q1 * scale + dmin;
        }
    }
}
844
/// Elements per Q5_K super-block.
const Q5_K_BLOCK_SIZE: usize = 256;
/// Bytes per Q5_K super-block.
const Q5_K_TYPE_SIZE: usize = 176;

/// Dequantizes Q5_K (simplified): a flat bit stream of 5-bit values starting
/// at byte 16, each mapped to `q * d + dmin`.
///
/// NOTE(review): real q5_K stores per-sub-block 6-bit scales/mins plus
/// separate nibble and high-bit planes; this decoder uses only the global
/// `d`/`dmin` and a sequential bit stream — confirm the layout against the
/// matching quantizer.
fn dequantize_q5_k(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q5_K_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q5_K_TYPE_SIZE;
        let out_start = block_idx * Q5_K_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q5_K_TYPE_SIZE > data.len() {
            break;
        }

        let block = &data[block_start..];
        let d = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
        let dmin = f16_to_f32(u16::from_le_bytes([block[2], block[3]]));

        for i in 0..256 {
            // Bit position of the i-th 5-bit field within the payload.
            let byte_idx = 16 + (i * 5) / 8;
            let bit_offset = (i * 5) % 8;

            if byte_idx < Q5_K_TYPE_SIZE {
                let mut q = (block[byte_idx] >> bit_offset) & 0x1F;
                // Field straddles a byte boundary: pull the remaining bits
                // from the next byte.
                if bit_offset > 3 && byte_idx + 1 < Q5_K_TYPE_SIZE {
                    q |= (block[byte_idx + 1] << (8 - bit_offset)) & 0x1F;
                }
                output[out_start + i] = (q as f32) * d + dmin;
            }
        }
    }
}
882
/// Elements per Q6_K super-block.
const Q6_K_BLOCK_SIZE: usize = 256;
/// Bytes per Q6_K super-block: 128 low-nibble bytes + 64 high-bit bytes +
/// scale bytes + f16 `d` at 208..210.
const Q6_K_TYPE_SIZE: usize = 210;

/// Dequantizes Q6_K: each 6-bit value is a low nibble plus two high bits,
/// recentered by -32 and scaled by `d` times a per-16-element scale.
///
/// NOTE(review): llama.cpp's q6_K stores 16 *signed 8-bit* scales at bytes
/// 192..208; this decoder reads them as packed unsigned nibbles from bytes
/// 192..200 — confirm against the matching quantizer.
fn dequantize_q6_k(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / Q6_K_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * Q6_K_TYPE_SIZE;
        let out_start = block_idx * Q6_K_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + Q6_K_TYPE_SIZE > data.len() {
            break;
        }

        let block = &data[block_start..];
        // Super-block scale: little-endian f16 in the final two bytes.
        let d = f16_to_f32(u16::from_le_bytes([block[208], block[209]]));

        for i in 0..256 {
            // Low 4 bits: two elements per byte in bytes 0..128.
            let ql_idx = i / 2;
            let is_high = i % 2 == 1;

            if ql_idx < 128 {
                let ql = if is_high {
                    (block[ql_idx] >> 4) & 0x0F
                } else {
                    block[ql_idx] & 0x0F
                };

                // High 2 bits: four elements per byte in bytes 128..192.
                let qh_idx = 128 + i / 4;
                let qh_shift = (i % 4) * 2;
                let qh = if qh_idx < 192 {
                    (block[qh_idx] >> qh_shift) & 0x03
                } else {
                    0
                };

                // Assemble the 6-bit value (0..=63) and recenter to -32..=31.
                let q = ((qh << 4) | ql) as i8 - 32;
                // 4-bit scale per 16 elements, two scales per byte.
                let scale_idx = i / 16;
                let sc = if scale_idx < 16 {
                    (block[192 + scale_idx / 2] >> ((scale_idx % 2) * 4)) & 0x0F
                } else {
                    1
                };

                output[out_start + i] = (q as f32) * d * (sc as f32);
            }
        }
    }
}
940
/// Elements per IQ4_NL block.
const IQ4_NL_BLOCK_SIZE: usize = 32;
/// Bytes per IQ4_NL block: f16 scale + 16 bytes of 4-bit grid indices.
const IQ4_NL_TYPE_SIZE: usize = 18;

/// Non-linear dequantization grid indexed by the 4-bit code.
///
/// NOTE(review): llama.cpp's IQ4_NL uses the integer table
/// [-127, -104, …, 113] scaled by `d`; this float grid is a different
/// mapping — confirm it matches the quantizer that produced the data.
const IQ4_NL_LUT: [f32; 16] = [
    -1.0, -0.75, -0.5, -0.375, -0.25, -0.125, 0.0, 0.125, 0.25, 0.375, 0.5, 0.75, 1.0, 1.5, 2.0,
    3.0,
];

/// Dequantizes IQ4_NL: each nibble selects a grid value, multiplied by the
/// block's f16 scale.
fn dequantize_iq4_nl(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / IQ4_NL_BLOCK_SIZE;

    for block_idx in 0..num_blocks {
        let block_start = block_idx * IQ4_NL_TYPE_SIZE;
        let out_start = block_idx * IQ4_NL_BLOCK_SIZE;

        // Truncated input: stop rather than read past the end.
        if block_start + IQ4_NL_TYPE_SIZE > data.len() {
            break;
        }

        let scale = f16_to_f32(u16::from_le_bytes([
            data[block_start],
            data[block_start + 1],
        ]));

        for i in 0..16 {
            let byte = data[block_start + 2 + i];
            // Low nibble first, then high nibble; both index the LUT.
            let q0 = (byte & 0x0F) as usize;
            let q1 = ((byte >> 4) & 0x0F) as usize;

            output[out_start + i * 2] = IQ4_NL_LUT[q0] * scale;
            output[out_start + i * 2 + 1] = IQ4_NL_LUT[q1] * scale;
        }
    }
}
980
/// Elements per BITNET_T158 block.
const BITNET_T158_BLOCK_SIZE: usize = 256;
/// Bytes per block: 64 packed ternary bytes + one little-endian f16 scale.
const BITNET_T158_TYPE_SIZE: usize = 66;

/// Dequantizes BitNet ternary (b1.58) data by splitting each 66-byte block
/// into its packed payload and scale, then delegating the whole tensor to
/// `crate::bitnet::dequantize_bitnet_t158` in a single call.
fn dequantize_bitnet_t158_wrapper(data: &[u8], output: &mut [f32]) {
    let num_blocks = output.len() / BITNET_T158_BLOCK_SIZE;

    // Concatenated packed payloads plus one scale per block.
    let mut scales = Vec::with_capacity(num_blocks);
    let mut packed_data = Vec::with_capacity(num_blocks * 64);

    for block_idx in 0..num_blocks {
        let block_start = block_idx * BITNET_T158_TYPE_SIZE;

        // Truncated input: decode only the complete blocks.
        if block_start + BITNET_T158_TYPE_SIZE > data.len() {
            break;
        }

        // First 64 bytes of the block hold the packed ternary weights.
        packed_data.extend_from_slice(&data[block_start..block_start + 64]);

        // Trailing two bytes hold the per-block scale as little-endian f16.
        let scale_f16 = f16_to_f32(u16::from_le_bytes([
            data[block_start + 64],
            data[block_start + 65],
        ]));
        scales.push(scale_f16);
    }

    // NOTE(review): `output.len()` is requested even when the loop above
    // broke early on truncated data — confirm `dequantize_bitnet_t158`
    // clamps its result to the packed data it was given, otherwise the copy
    // below could panic or read synthesized values.
    let dequantized = dequantize_bitnet_t158(&packed_data, &scales, output.len());

    output[..dequantized.len()].copy_from_slice(&dequantized);
}
1027
/// Converts an IEEE-754 binary16 value (given as raw bits) to `f32`.
///
/// Handles every value class: signed zero, subnormals, normals, infinities,
/// and NaN (the NaN payload is preserved in the widened mantissa).
#[inline(always)]
fn f16_to_f32(bits: u16) -> f32 {
    let sign = ((bits & 0x8000) as u32) << 16;
    let exp = ((bits >> 10) & 0x1F) as u32;
    let frac = (bits & 0x03FF) as u32;

    if exp == 0 {
        if frac == 0 {
            // +/-0.0: only the sign bit survives.
            return f32::from_bits(sign);
        }
        // Subnormal half: value = frac * 2^-24. Normalize so it can be
        // encoded as an f32 normal: after `shift` left-shifts the leading 1
        // reaches bit 10, giving value = (1 + f/1024) * 2^(-14 - shift),
        // i.e. biased f32 exponent 127 - 14 - shift = 113 - shift.
        //
        // Bug fix: the shift counter previously started at 1, which dropped
        // the exponent by one and halved every subnormal (0x0001 decoded as
        // 2^-25 instead of 2^-24).
        let mut shift = 0u32;
        let mut f = frac;
        while (f & 0x0400) == 0 {
            f <<= 1;
            shift += 1;
        }
        f &= 0x03FF; // drop the now-implicit leading 1
        return f32::from_bits(sign | ((113 - shift) << 23) | (f << 13));
    }

    if exp == 31 {
        // Infinity (frac == 0) or NaN (frac != 0): max f32 exponent, widened
        // payload.
        return f32::from_bits(sign | 0x7F80_0000 | (frac << 13));
    }

    // Normal number: rebias the exponent from 15 (f16) to 127 (f32).
    f32::from_bits(sign | ((exp + 127 - 15) << 23) | (frac << 13))
}
1061
#[cfg(test)]
mod tests {
    use super::*;

    /// Block/type sizes for representative scalar, legacy, and K-quant
    /// formats.
    #[test]
    fn test_quant_type_sizes() {
        assert_eq!(GgufQuantType::F32.block_size(), 1);
        assert_eq!(GgufQuantType::F32.type_size(), 4);

        assert_eq!(GgufQuantType::Q4_0.block_size(), 32);
        assert_eq!(GgufQuantType::Q4_0.type_size(), 18);

        assert_eq!(GgufQuantType::Q4_K.block_size(), 256);
        assert_eq!(GgufQuantType::Q4_K.type_size(), 144);
    }

    /// bits_per_weight = 8 * type_size / block_size, headers included.
    #[test]
    fn test_quant_type_bits() {
        // F32: 4 bytes per element = 32 bits.
        assert!((GgufQuantType::F32.bits_per_weight() - 32.0).abs() < 0.1);

        // Q4_0: 18 bytes / 32 elements = 4.5 bits per weight.
        assert!((GgufQuantType::Q4_0.bits_per_weight() - 4.5).abs() < 0.1);

        // Q8_0: 34 bytes / 32 elements = 8.5 bits per weight.
        assert!((GgufQuantType::Q8_0.bits_per_weight() - 8.5).abs() < 0.1);
    }

    /// Exact conversions for zero and +/-1.0; 0x3800 encodes 0.5 in binary16.
    #[test]
    fn test_f16_conversion() {
        assert_eq!(f16_to_f32(0x0000), 0.0);
        assert_eq!(f16_to_f32(0x3C00), 1.0);
        assert_eq!(f16_to_f32(0xBC00), -1.0);

        let half = f16_to_f32(0x3800);
        assert!((half - 0.5).abs() < 0.001);
    }

    /// With unit scale and nibbles of 8 (the zero point), every Q4_0 output
    /// should be zero.
    #[test]
    fn test_q4_0_dequantize() {
        let mut block = vec![0u8; 18];
        // Scale = 1.0 (f16 0x3C00, little-endian).
        block[0] = 0x00;
        block[1] = 0x3C;
        for i in 0..16 {
            // 0x88: both nibbles are 8, which maps to 0 after the -8 bias.
            block[2 + i] = 0x88;
        }

        let mut output = vec![0.0f32; 32];
        dequantize_q4_0_block(&block, &mut output);

        for val in &output {
            assert!(val.abs() < 0.001);
        }
    }

    /// With unit scale, Q8_0 outputs equal the raw signed byte values.
    #[test]
    fn test_q8_0_dequantize() {
        let mut block = vec![0u8; 34];
        // Scale = 1.0 (f16 0x3C00, little-endian).
        block[0] = 0x00;
        block[1] = 0x3C;
        for i in 0..32 {
            block[2 + i] = (i + 1) as u8;
        }

        let mut output = vec![0.0f32; 32];
        dequantize_q8_0_block(&block, &mut output);

        for i in 0..32 {
            assert!((output[i] - (i + 1) as f32).abs() < 0.001);
        }
    }

    /// Round-trip of valid ids; ids beyond the known range are rejected.
    #[test]
    fn test_quant_type_try_from() {
        assert_eq!(GgufQuantType::try_from(0).unwrap(), GgufQuantType::F32);
        assert_eq!(GgufQuantType::try_from(12).unwrap(), GgufQuantType::Q4_K);
        assert!(GgufQuantType::try_from(100).is_err());
    }

    /// 256 elements exactly fill one Q4_K super-block (144 bytes).
    #[test]
    fn test_quantized_tensor() {
        let tensor = QuantizedTensor {
            data: vec![0u8; 144],
            dtype: GgufQuantType::Q4_K,
            shape: vec![256],
            num_elements: 256,
        };

        assert_eq!(tensor.block_count(), 1);
        assert!(tensor.dtype.is_quantized());
    }
}