1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
//! Quantization types and enums
//!
//! This module contains the core types used throughout the quantization system,
//! including quantization methods, parameters, and data types.
/// Supported methods of quantization
//
// `Hash` is derived (in addition to the existing traits) so the method can be
// used directly as a `HashMap`/`HashSet` key, e.g. for per-method statistics.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QuantizationMethod {
    /// Uniform quantization maps the input range to uniform discrete levels
    /// with equal spacing between consecutive levels
    Uniform,
    /// Symmetric quantization is centered around zero and has equal positive and
    /// negative ranges, making it suitable for weight matrices
    Symmetric,
    /// Affine quantization uses the formula q = scale * (x - zero_point)
    /// allowing better representation of asymmetric distributions
    Affine,
    /// Power-of-two quantization uses powers of 2 for the scale factor,
    /// enabling efficient implementation with bitshifts
    PowerOfTwo,
    /// Int4 quantization uses 4-bit signed integers, packing two values into each byte
    /// for memory efficiency. This is useful for model compression in ML applications.
    Int4,
    /// UInt4 quantization uses 4-bit unsigned integers, packing two values into each byte.
    /// This provides a positive-only range with maximum memory efficiency.
    UInt4,
    /// Float16 quantization uses IEEE 754 16-bit half-precision floating point format.
    /// It provides a good balance between precision and memory efficiency for ML models.
    Float16,
    /// BFloat16 quantization uses the "brain floating point" 16-bit format,
    /// which has the same exponent size as f32 but fewer mantissa bits.
    /// This is especially well-suited for deep learning applications.
    BFloat16,
    /// Per-channel symmetric quantization applies different symmetric quantization
    /// parameters to each channel (column), improving accuracy for matrices with
    /// varying distributions across channels.
    PerChannelSymmetric,
    /// Per-channel affine quantization applies different affine quantization
    /// parameters to each channel (column), allowing for better representation of
    /// asymmetric distributions that vary by channel.
    PerChannelAffine,
}
/// Parameters for the quantization process
//
// `PartialEq` is derived (all fields are comparable) so parameter sets can be
// compared directly in tests and callers. `Eq` is intentionally omitted:
// the `f32` fields (`scale`, `min_val`, `max_val`, `channel_scales`) only
// implement `PartialEq` because of NaN semantics.
#[derive(Debug, Clone, PartialEq)]
pub struct QuantizationParams {
    /// The number of bits used for quantization
    pub bits: u8,
    /// The scale factor used to convert between quantized and float values
    /// For per-channel quantization, this is the default scale for debugging
    pub scale: f32,
    /// The zero point used for asymmetric quantization (for affine quantization)
    /// For per-channel quantization, this is the default zero point for debugging
    pub zero_point: i32,
    /// The minimum value of the original data
    /// For per-channel quantization, this is across all channels
    pub min_val: f32,
    /// The maximum value of the original data
    /// For per-channel quantization, this is across all channels
    pub max_val: f32,
    /// The quantization method used
    pub method: QuantizationMethod,
    /// The data type used for storage
    pub data_type: QuantizedDataType,
    /// Per-channel scale factors (only used for per-channel quantization)
    pub channel_scales: Option<Vec<f32>>,
    /// Per-channel zero points (only used for per-channel affine quantization)
    pub channel_zero_points: Option<Vec<i32>>,
}
/// The storage type used for quantized data
//
// `Copy` and `Hash` are derived for consistency with `QuantizationMethod`:
// this is a fieldless tag enum, so copying is trivial and forcing moves or
// `.clone()` calls on it is pure noise; `Hash` allows use as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QuantizedDataType {
    /// 8-bit signed integers
    Int8,
    /// 4-bit signed integers (packed into i8 array)
    Int4,
    /// 4-bit unsigned integers (packed into i8 array)
    UInt4,
    /// 16-bit IEEE 754 half-precision floating point (f16)
    Float16,
    /// 16-bit Brain floating point (bf16)
    BFloat16,
}