Skip to main content

cbtop/quantize/
mod.rs

1//! QuantizedBrick Implementation (PMAT-013)
2//!
3//! Implements quantized weight support for ComputeBricks per cbtop spec S17.
4//!
5//! # Supported Formats
6//!
7//! | Format | Bits/Weight | Memory | Perplexity Delta |
8//! |--------|------------|--------|------------------|
9//! | Q4_0   | 4.0        | 25%    | ~0.5%            |
10//! | Q4_K   | 4.5        | 28%    | ~0.3%            |
11//! | Q5_K   | 5.5        | 34%    | ~0.1%            |
12//! | Q8_0   | 8.0        | 50%    | ~0.01%           |
13//!
14//! # Citations
15//!
16//! - [Dettmers et al. 2022] "LLM.int8(): 8-bit Matrix Multiplication" NeurIPS
17//! - [Frantar et al. 2023] "GPTQ: Accurate Post-Training Quantization" ICLR
18//! - [Lin et al. 2023] "AWQ: Activation-aware Weight Quantization" MLSys
19
20// Allow non-camel-case for GGML standard quantization type names
21#![allow(non_camel_case_types)]
22
23mod format;
24mod gguf;
25mod weights;
26
27pub use format::{DequantStrategy, QuantFormat};
28pub use gguf::{GgufError, GgufHeader, GgufLoader, GgufResult, GgufTensorInfo, GgufValue};
29pub use weights::{LayerQuantStats, QuantStats, QuantizedWeights};
30
31use std::fmt;
32
33/// QuantizedBrick wraps compute operations with quantized weights.
34///
35/// Per cbtop spec S17.2.
36#[derive(Debug, Clone)]
37pub struct QuantizedBrick {
38    /// Brick name
39    pub name: String,
40    /// Quantized weights for this brick
41    pub weights: Option<QuantizedWeights>,
42    /// Dequantization strategy
43    pub dequant_strategy: DequantStrategy,
44    /// Performance budget (tokens per second)
45    pub budget_tok_per_sec: Option<u64>,
46}
47
48impl QuantizedBrick {
49    /// Create a new quantized brick.
50    pub fn new(name: &str) -> Self {
51        Self {
52            name: name.to_string(),
53            weights: None,
54            dequant_strategy: DequantStrategy::default(),
55            budget_tok_per_sec: None,
56        }
57    }
58
59    /// Set quantized weights.
60    pub fn with_weights(mut self, weights: QuantizedWeights) -> Self {
61        self.weights = Some(weights);
62        self
63    }
64
65    /// Set dequantization strategy.
66    pub fn with_dequant_strategy(mut self, strategy: DequantStrategy) -> Self {
67        self.dequant_strategy = strategy;
68        self
69    }
70
71    /// Set performance budget.
72    pub fn with_budget(mut self, tok_per_sec: u64) -> Self {
73        self.budget_tok_per_sec = Some(tok_per_sec);
74        self
75    }
76
77    /// Get memory footprint (bytes).
78    pub fn memory_bytes(&self) -> usize {
79        self.weights.as_ref().map_or(0, |w| w.memory_bytes())
80    }
81
82    /// Get effective bits per weight.
83    pub fn bits_per_weight(&self) -> f64 {
84        self.weights
85            .as_ref()
86            .map_or(0.0, |w| w.actual_bits_per_weight())
87    }
88
89    /// Get quantization format.
90    pub fn format(&self) -> Option<QuantFormat> {
91        self.weights.as_ref().map(|w| w.format)
92    }
93
94    /// Check if weights are loaded.
95    pub fn has_weights(&self) -> bool {
96        self.weights.is_some()
97    }
98}
99
100impl fmt::Display for QuantizedBrick {
101    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
102        write!(f, "QuantizedBrick[{}]", self.name)?;
103        if let Some(weights) = &self.weights {
104            write!(
105                f,
106                " format={} weights={} memory={:.2}MB",
107                weights.format,
108                weights.num_weights(),
109                weights.memory_bytes() as f64 / 1_000_000.0
110            )?;
111        }
112        Ok(())
113    }
114}
115
116/// GGML tensor type to QuantFormat mapping.
117pub fn ggml_type_to_format(ggml_type: u32) -> Option<QuantFormat> {
118    match ggml_type {
119        0 => Some(QuantFormat::F32),
120        1 => Some(QuantFormat::F16),
121        2 => Some(QuantFormat::Q4_0),
122        3 => Some(QuantFormat::Q4_K), // Q4_1 in GGML, map to Q4_K
123        8 => Some(QuantFormat::Q8_0),
124        12 => Some(QuantFormat::Q4_K),
125        13 => Some(QuantFormat::Q5_K),
126        14 => Some(QuantFormat::Q6_K),
127        _ => None,
128    }
129}
130
131#[cfg(test)]
132mod tests;