scirs2_linalg/quantization/
types.rs

//! Quantization types and enums
//!
//! This module contains the core types used throughout the quantization system,
//! including quantization methods, parameters, and data types.

/// Supported methods of quantization
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationMethod {
    /// Uniform quantization maps the input range to evenly spaced
    /// discrete levels
    Uniform,

    /// Symmetric quantization is centered around zero and has equal positive and
    /// negative ranges, making it suitable for weight matrices
    Symmetric,

    /// Affine quantization maps values via q = round(x / scale) + zero_point,
    /// dequantized as x ≈ scale * (q - zero_point), allowing better
    /// representation of asymmetric distributions (see the round-trip
    /// sketch after this enum)
    Affine,

    /// Power-of-two quantization uses powers of 2 for the scale factor,
    /// enabling efficient implementation with bitshifts
    PowerOfTwo,

    /// Int4 quantization uses 4-bit signed integers, packing two values into each byte
    /// for memory efficiency. This is useful for model compression in ML applications.
    Int4,

    /// UInt4 quantization uses 4-bit unsigned integers, packing two values into each byte.
    /// This provides a non-negative range at the same 4-bit storage cost.
    UInt4,

    /// Float16 quantization uses IEEE 754 16-bit half-precision floating point format.
    /// It provides a good balance between precision and memory efficiency for ML models.
    Float16,

    /// BFloat16 quantization uses the "brain floating point" 16-bit format,
    /// which has the same exponent size as f32 but fewer mantissa bits.
    /// This is especially well-suited for deep learning applications.
    BFloat16,

    /// Per-channel symmetric quantization applies different symmetric quantization
    /// parameters to each channel (column), improving accuracy for matrices with
    /// varying distributions across channels.
    PerChannelSymmetric,

    /// Per-channel affine quantization applies different affine quantization
    /// parameters to each channel (column), allowing for better representation of
    /// asymmetric distributions that vary by channel.
    PerChannelAffine,
}
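
// Illustrative sketch, not part of this module's API: assuming the
// conventional affine mapping (quantize q = round(x / scale) + zero_point,
// dequantize x ≈ scale * (q - zero_point)), this test shows the round trip
// that `QuantizationMethod::Affine` describes for an 8-bit range.
#[cfg(test)]
mod affine_sketch {
    #[test]
    fn affine_round_trip() {
        let (scale, zero_point) = (0.1_f32, -3_i32);
        let x = 1.7_f32;
        // Quantize: map onto the integer grid, then clamp to the i8 range.
        let q = ((x / scale).round() as i32 + zero_point)
            .clamp(i8::MIN as i32, i8::MAX as i32) as i8;
        // Dequantize: invert the mapping.
        let x_hat = scale * (q as i32 - zero_point) as f32;
        // For in-range inputs the round-trip error is at most half a step.
        assert!((x - x_hat).abs() <= scale / 2.0);
    }
}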

/// Parameters for the quantization process
#[derive(Debug, Clone)]
pub struct QuantizationParams {
    /// The number of bits used for quantization
    pub bits: u8,

    /// The scale factor used to convert between quantized and float values.
    /// For per-channel quantization this is only a default value kept for
    /// debugging; the per-channel scales live in `channel_scales`
    pub scale: f32,

    /// The zero point used for asymmetric (affine) quantization.
    /// For per-channel quantization this is only a default value kept for
    /// debugging; the per-channel zero points live in `channel_zero_points`
    pub zero_point: i32,

    /// The minimum value of the original data
    /// (for per-channel quantization, the minimum across all channels)
    pub min_val: f32,

    /// The maximum value of the original data
    /// (for per-channel quantization, the maximum across all channels)
    pub max_val: f32,

    /// The quantization method used
    pub method: QuantizationMethod,

    /// The data type used for storage
    pub data_type: QuantizedDataType,

    /// Per-channel scale factors (only used for per-channel quantization)
    pub channel_scales: Option<Vec<f32>>,

    /// Per-channel zero points (only used for per-channel affine quantization)
    pub channel_zero_points: Option<Vec<i32>>,
}
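
// Illustrative sketch, not a documented recipe from this crate: filling in
// `QuantizationParams` for plain per-tensor 8-bit symmetric quantization,
// where the scale is conventionally max(|min_val|, |max_val|) / 127 and the
// zero point is fixed at zero.
#[cfg(test)]
mod params_sketch {
    use super::*;

    #[test]
    fn symmetric_i8_params() {
        let (min_val, max_val) = (-0.8_f32, 1.2_f32);
        let scale = min_val.abs().max(max_val.abs()) / 127.0;
        let params = QuantizationParams {
            bits: 8,
            scale,
            zero_point: 0, // symmetric quantization centers on zero
            min_val,
            max_val,
            method: QuantizationMethod::Symmetric,
            data_type: QuantizedDataType::Int8,
            channel_scales: None,      // per-tensor: no per-channel values
            channel_zero_points: None,
        };
        assert_eq!(params.method, QuantizationMethod::Symmetric);
        assert!(params.scale > 0.0);
    }
}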

/// The storage type used for quantized data
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizedDataType {
    /// 8-bit signed integers
    Int8,
    /// 4-bit signed integers (packed into i8 array)
    Int4,
    /// 4-bit unsigned integers (packed into i8 array)
    UInt4,
    /// 16-bit IEEE 754 half-precision floating point (f16)
    Float16,
    /// 16-bit Brain floating point (bf16)
    BFloat16,
}
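
// Illustrative sketch; the actual packing layout used by this crate is an
// assumption here. The `Int4`/`UInt4` comments above describe two 4-bit
// values per byte; one common layout stores the first value in the low
// nibble and the second in the high nibble.
#[cfg(test)]
mod packing_sketch {
    #[test]
    fn uint4_pack_unpack() {
        let (a, b) = (3_u8, 12_u8); // each value must fit in 4 bits (0..=15)
        // Pack: low nibble holds `a`, high nibble holds `b`.
        let byte = (a & 0x0F) | ((b & 0x0F) << 4);
        // Unpack by masking and shifting the nibbles back out.
        assert_eq!(byte & 0x0F, a);
        assert_eq!((byte >> 4) & 0x0F, b);
    }
}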