ferrum_quantization/config.rs
1//! Quantization configuration parsed from model metadata.
2//!
3//! Populated by `WeightLoader` implementations from sources like
4//! `quantize_config.json` (GPTQ/AWQ) or a GGUF header.
5
6use serde::{Deserialize, Serialize};
7
8/// The quantization scheme in use, if any.
9#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
10#[serde(rename_all = "lowercase")]
11#[derive(Default)]
12pub enum QuantMethod {
13 /// No quantization — dense fp32/fp16/bf16 weights.
14 #[default]
15 None,
16 /// GPTQ: int4/int8 group-wise with scales + zeros, asymmetric.
17 Gptq,
18 /// AWQ: int4 group-wise, similar to GPTQ but different packing.
19 Awq,
20 /// GGUF: k-quants and legacy quants embedded in a single-file format.
21 Gguf,
22}
23
24/// Combined quantization config.
25#[derive(Clone, Debug, Serialize, Deserialize, Default)]
26pub struct QuantConfig {
27 // AutoGPTQ writes `quant_method`; some older / hand-written packs use
28 // `method`. Accept both. Default to None if absent (which would have
29 // been an error before — but in practice every real quant pack has
30 // one of the two).
31 #[serde(rename = "quant_method", alias = "method", default)]
32 pub method: QuantMethod,
33 /// Bit-width (typically 4 or 8 for GPTQ/AWQ).
34 #[serde(default)]
35 pub bits: u32,
36 /// Group size for group-wise scales (typically 128).
37 #[serde(default)]
38 pub group_size: usize,
39 /// Whether to use descending activation order (GPTQ only).
40 #[serde(default)]
41 pub desc_act: bool,
42 /// Whether scales use symmetric quantization.
43 #[serde(default)]
44 pub sym: bool,
45}