// entrenar/quant/gguf_quant/q8_0.rs
use super::GGUF_BLOCK_SIZE;
use serde::{Deserialize, Serialize};
5
6#[derive(Clone, Debug, Serialize, Deserialize)]
11pub struct Q8_0 {
12 pub scales: Vec<f32>,
14 pub data: Vec<i8>,
16 pub len: usize,
18}
19
20impl Q8_0 {
21 pub fn quantize(values: &[f32]) -> Self {
23 let len = values.len();
24 let num_blocks = len.div_ceil(GGUF_BLOCK_SIZE);
25
26 let mut scales = Vec::with_capacity(num_blocks);
27 let mut data = Vec::with_capacity(len);
28
29 for block_idx in 0..num_blocks {
30 let start = block_idx * GGUF_BLOCK_SIZE;
31 let end = (start + GGUF_BLOCK_SIZE).min(len);
32 let block = &values[start..end];
33
34 let max_abs = block
36 .iter()
37 .map(|v| v.abs())
38 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
39 .unwrap_or(0.0);
40
41 let scale = if max_abs < 1e-10 { 1e-10 } else { max_abs / 127.0 };
42 scales.push(scale);
43
44 for &val in block {
46 let q = (val / scale).round().clamp(-128.0, 127.0) as i8;
47 data.push(q);
48 }
49
50 let padding = GGUF_BLOCK_SIZE - block.len();
52 data.extend(std::iter::repeat_n(0i8, padding));
53 }
54
55 data.truncate(len);
57
58 Self { scales, data, len }
59 }
60
61 pub fn dequantize(&self) -> Vec<f32> {
63 let mut result = Vec::with_capacity(self.len);
64
65 for (i, &q) in self.data.iter().enumerate() {
66 let block_idx = i / GGUF_BLOCK_SIZE;
67 let scale = self.scales[block_idx];
68 result.push(f32::from(q) * scale);
69 }
70
71 result
72 }
73
74 pub fn memory_bytes(&self) -> usize {
76 self.scales.len() * 4 + self.data.len()
77 }
78
79 pub fn gguf_bytes(&self) -> usize {
81 self.scales.len() * 2 + self.data.len()
82 }
83
84 pub fn compression_ratio(&self) -> f32 {
86 let original = self.len * 4;
87 original as f32 / self.gguf_bytes() as f32
88 }
89
90 pub fn num_blocks(&self) -> usize {
92 self.scales.len()
93 }
94}