// cbtop/quantize/format.rs
//! Quantization format definitions and properties.

// Allow non-camel-case for GGML standard quantization type names
#![allow(non_camel_case_types)]

use std::fmt;
/// Supported quantization formats for ComputeBricks.
///
/// Based on GGML/llama.cpp quantization types.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QuantFormat {
    /// Full precision (baseline)
    F32,
    /// Half precision
    F16,
    /// Brain float 16
    BF16,

    /// 4-bit quantization, no scales per block (32 values/block)
    Q4_0,
    /// 4-bit quantization with K-quants (256 values/super-block, 6-bit scales)
    Q4_K,
    /// 5-bit quantization with K-quants
    Q5_K,
    /// 6-bit quantization with K-quants
    Q6_K,
    /// 8-bit quantization, simple (32 values/block)
    Q8_0,

    /// GPTQ format (ExLlama compatible)
    Gptq { bits: u8, group_size: u16 },
    /// AWQ format (activation-aware)
    Awq { bits: u8 },
}

impl QuantFormat {
    /// Effective bits per weight (including scales and metadata).
    ///
    /// For the GGML block formats this is exactly
    /// `bytes_per_block() * 8 / block_size()`. The GPTQ/AWQ figures are
    /// estimates: `bits` plus ~0.5 bit of scale/zero-point overhead.
    pub fn bits_per_weight(&self) -> f64 {
        match self {
            QuantFormat::F32 => 32.0,
            QuantFormat::F16 => 16.0,
            QuantFormat::BF16 => 16.0,
            QuantFormat::Q4_0 => 4.5,    // 18 bytes / 32 weights
            QuantFormat::Q4_K => 4.5,    // 144 bytes / 256 weights
            QuantFormat::Q5_K => 5.5,    // 176 bytes / 256 weights
            // 210 bytes / 256 weights. Was 6.5, which disagreed with
            // bytes_per_block()/block_size(); 6.5625 is the exact value.
            QuantFormat::Q6_K => 6.5625,
            QuantFormat::Q8_0 => 8.5,    // 34 bytes / 32 weights
            QuantFormat::Gptq { bits, .. } => *bits as f64 + 0.5,
            QuantFormat::Awq { bits } => *bits as f64 + 0.5,
        }
    }

    /// Memory ratio compared to F16 (lower is better).
    pub fn memory_ratio(&self) -> f64 {
        self.bits_per_weight() / 16.0
    }

    /// Expected perplexity delta compared to F16 (lower is better).
    ///
    /// Based on llama.cpp benchmarks. These are rough, model-independent
    /// estimates intended for ranking formats, not exact predictions.
    pub fn expected_ppl_delta(&self) -> f64 {
        match self {
            QuantFormat::F32 => 0.0,
            QuantFormat::F16 => 0.0,
            QuantFormat::BF16 => 0.01,
            QuantFormat::Q4_0 => 0.5,
            QuantFormat::Q4_K => 0.3,
            QuantFormat::Q5_K => 0.1,
            QuantFormat::Q6_K => 0.05,
            QuantFormat::Q8_0 => 0.01,
            QuantFormat::Gptq { bits, .. } => match bits {
                4 => 0.4,
                8 => 0.02,
                _ => 0.5, // conservative default for unusual bit widths
            },
            QuantFormat::Awq { bits } => match bits {
                4 => 0.2, // AWQ tends to have better quality
                _ => 0.3,
            },
        }
    }

    /// Block size (number of weights per quantization block).
    ///
    /// Unquantized formats are treated as blocks of a single weight.
    pub fn block_size(&self) -> usize {
        match self {
            QuantFormat::F32 | QuantFormat::F16 | QuantFormat::BF16 => 1,
            QuantFormat::Q4_0 | QuantFormat::Q8_0 => 32,
            QuantFormat::Q4_K | QuantFormat::Q5_K | QuantFormat::Q6_K => 256, // Super-block
            QuantFormat::Gptq { group_size, .. } => *group_size as usize,
            QuantFormat::Awq { .. } => 128,
        }
    }

    /// Bytes per block (data plus scale/metadata for that block).
    pub fn bytes_per_block(&self) -> usize {
        match self {
            QuantFormat::F32 => 4,
            QuantFormat::F16 | QuantFormat::BF16 => 2,
            QuantFormat::Q4_0 => 18,  // 2 (scale) + 16 (32 x 4-bit)
            QuantFormat::Q4_K => 144, // 2+2+12+128 (super-block)
            QuantFormat::Q5_K => 176, // 2+2+12+128+32
            QuantFormat::Q6_K => 210, // 128+64+16+2
            QuantFormat::Q8_0 => 34,  // 2 (scale) + 32 (8-bit values)
            QuantFormat::Gptq { bits, group_size } => {
                // Packed weight data, rounded up to whole bytes.
                let data_bytes = (*group_size as usize * *bits as usize).div_ceil(8);
                data_bytes + 4 // + scale/zero
            }
            QuantFormat::Awq { bits } => {
                // AWQ uses a fixed group size of 128 (see block_size()).
                let data_bytes = (128 * *bits as usize).div_ceil(8);
                data_bytes + 4
            }
        }
    }
}

117impl fmt::Display for QuantFormat {
118    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
119        match self {
120            QuantFormat::F32 => write!(f, "F32"),
121            QuantFormat::F16 => write!(f, "F16"),
122            QuantFormat::BF16 => write!(f, "BF16"),
123            QuantFormat::Q4_0 => write!(f, "Q4_0"),
124            QuantFormat::Q4_K => write!(f, "Q4_K"),
125            QuantFormat::Q5_K => write!(f, "Q5_K"),
126            QuantFormat::Q6_K => write!(f, "Q6_K"),
127            QuantFormat::Q8_0 => write!(f, "Q8_0"),
128            QuantFormat::Gptq { bits, group_size } => {
129                write!(f, "GPTQ-{}bit-g{}", bits, group_size)
130            }
131            QuantFormat::Awq { bits } => write!(f, "AWQ-{}bit", bits),
132        }
133    }
134}
135
/// Dequantization strategy.
///
/// Controls when and how quantized weights are dequantized.
/// The default is [`DequantStrategy::Fused`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum DequantStrategy {
    /// Fused: dequantize during matmul (best for GPU, saves memory bandwidth)
    #[default]
    Fused,
    /// Prefetch: dequantize ahead of compute (good for pipelining)
    Prefetch { lookahead_blocks: usize },
    /// On-demand: dequantize per block (lowest memory footprint)
    OnDemand,
}