Skip to main content

ferrum_quantization/
config.rs

//! Quantization configuration parsed from model metadata.
//!
//! Populated by `WeightLoader` implementations from sources like
//! `quantize_config.json` (GPTQ/AWQ) or a GGUF header.
5
6use serde::{Deserialize, Serialize};
7
8/// The quantization scheme in use, if any.
9#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
10#[serde(rename_all = "lowercase")]
11pub enum QuantMethod {
12    /// No quantization — dense fp32/fp16/bf16 weights.
13    None,
14    /// GPTQ: int4/int8 group-wise with scales + zeros, asymmetric.
15    Gptq,
16    /// AWQ: int4 group-wise, similar to GPTQ but different packing.
17    Awq,
18    /// GGUF: k-quants and legacy quants embedded in a single-file format.
19    Gguf,
20}
21
22impl Default for QuantMethod {
23    fn default() -> Self {
24        Self::None
25    }
26}
27
28/// Combined quantization config.
29#[derive(Clone, Debug, Serialize, Deserialize, Default)]
30pub struct QuantConfig {
31    pub method: QuantMethod,
32    /// Bit-width (typically 4 or 8 for GPTQ/AWQ).
33    #[serde(default)]
34    pub bits: u32,
35    /// Group size for group-wise scales (typically 128).
36    #[serde(default)]
37    pub group_size: usize,
38    /// Whether to use descending activation order (GPTQ only).
39    #[serde(default)]
40    pub desc_act: bool,
41    /// Whether scales use symmetric quantization.
42    #[serde(default)]
43    pub sym: bool,
44}