scirs2_linalg/quantization/types.rs
//! Quantization types and enums
//!
//! This module contains the core types used throughout the quantization system,
//! including quantization methods, parameters, and data types.

/// Supported methods of quantization
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationMethod {
    /// Uniform quantization maps the input range onto equally spaced
    /// discrete levels
    Uniform,

    /// Symmetric quantization is centered around zero with equal positive and
    /// negative ranges, making it suitable for weight matrices
    Symmetric,

    /// Affine quantization quantizes as q = round(x / scale) + zero_point and
    /// dequantizes as x ≈ scale * (q - zero_point), allowing better
    /// representation of asymmetric distributions (a small sketch follows
    /// this enum)
    Affine,

    /// Power-of-two quantization uses powers of 2 for the scale factor,
    /// enabling efficient implementation with bitshifts
    PowerOfTwo,

    /// Int4 quantization uses 4-bit signed integers, packing two values into each byte
    /// for memory efficiency. This is useful for model compression in ML applications.
    Int4,

    /// UInt4 quantization uses 4-bit unsigned integers, packing two values into each byte.
    /// This provides a positive-only range with maximum memory efficiency.
    UInt4,

    /// Float16 quantization uses the IEEE 754 16-bit half-precision floating point format.
    /// It provides a good balance between precision and memory efficiency for ML models.
    Float16,

    /// BFloat16 quantization uses the "brain floating point" 16-bit format,
    /// which has the same exponent size as f32 but fewer mantissa bits.
    /// This is especially well-suited for deep learning applications
    /// (see the sketch after this enum).
    BFloat16,

    /// Per-channel symmetric quantization applies different symmetric quantization
    /// parameters to each channel (column), improving accuracy for matrices with
    /// varying distributions across channels.
    PerChannelSymmetric,

    /// Per-channel affine quantization applies different affine quantization
    /// parameters to each channel (column), allowing for better representation of
    /// asymmetric distributions that vary by channel.
    PerChannelAffine,
}
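
// A minimal illustrative sketch (not part of this module's API) of the affine
// formula documented on `QuantizationMethod::Affine`, assuming a signed 8-bit
// target range; the crate's real quantization routines live elsewhere.
#[allow(dead_code)]
fn affine_roundtrip_sketch(x: f32, scale: f32, zero_point: i32) -> (i8, f32) {
    // Quantize: q = round(x / scale) + zero_point, clamped to the i8 range
    let q = ((x / scale).round() as i32 + zero_point).clamp(-128, 127) as i8;
    // Dequantize: x' ≈ scale * (q - zero_point)
    let x_approx = scale * (q as i32 - zero_point) as f32;
    (q, x_approx)
}

// Likewise for `BFloat16`: bf16 keeps f32's sign bit and 8 exponent bits and
// truncates the mantissa to 7 bits, so (ignoring rounding and NaN handling)
// its bit pattern is just the top 16 bits of the f32 representation.
#[allow(dead_code)]
fn f32_to_bf16_bits_sketch(x: f32) -> u16 {
    // Drop the low 16 mantissa bits of the f32 bit pattern
    (x.to_bits() >> 16) as u16
}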

/// Parameters for the quantization process
#[derive(Debug, Clone)]
pub struct QuantizationParams {
    /// The number of bits used for quantization
    pub bits: u8,

    /// The scale factor used to convert between quantized and float values.
    /// For per-channel quantization this is a single default scale kept for
    /// debugging; the per-channel scales live in `channel_scales`
    pub scale: f32,

    /// The zero point used for asymmetric (affine) quantization.
    /// For per-channel quantization this is a single default zero point kept
    /// for debugging; the per-channel values live in `channel_zero_points`
    pub zero_point: i32,

    /// The minimum value of the original data.
    /// For per-channel quantization, this is the minimum across all channels
    pub min_val: f32,

    /// The maximum value of the original data.
    /// For per-channel quantization, this is the maximum across all channels
    pub max_val: f32,

    /// The quantization method used
    pub method: QuantizationMethod,

    /// The data type used for storage
    pub data_type: QuantizedDataType,

    /// Per-channel scale factors (only used for per-channel quantization)
    pub channel_scales: Option<Vec<f32>>,

    /// Per-channel zero points (only used for per-channel affine quantization)
    pub channel_zero_points: Option<Vec<i32>>,
}
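
// An illustrative construction of `QuantizationParams` for plain symmetric
// 8-bit quantization of data spanning roughly [-4.0, 4.0]. The scale follows
// the common symmetric convention scale = max(|min|, |max|) / 127; the values
// here are example numbers, not defaults used by this crate.
#[allow(dead_code)]
fn symmetric_i8_params_sketch() -> QuantizationParams {
    QuantizationParams {
        bits: 8,
        scale: 4.0 / 127.0,
        zero_point: 0, // symmetric quantization centers on zero
        min_val: -4.0,
        max_val: 4.0,
        method: QuantizationMethod::Symmetric,
        data_type: QuantizedDataType::Int8,
        channel_scales: None, // per-tensor parameters, so no per-channel data
        channel_zero_points: None,
    }
}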
87
88/// The storage type used for quantized data
89#[derive(Debug, Clone, PartialEq, Eq)]
90pub enum QuantizedDataType {
91 /// 8-bit signed integers
92 Int8,
93 /// 4-bit signed integers (packed into i8 array)
94 Int4,
95 /// 4-bit unsigned integers (packed into i8 array)
96 UInt4,
97 /// 16-bit IEEE 754 half-precision floating point (f16)
98 Float16,
99 /// 16-bit Brain floating point (bf16)
100 BFloat16,
101}
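
// An illustrative sketch of the nibble packing the `Int4`/`UInt4` variants
// describe: two 4-bit values stored in one byte, low nibble first. The nibble
// order is an assumption for illustration, not a guarantee of this crate's
// actual storage layout.
#[allow(dead_code)]
fn pack_unpack_u4_sketch(lo: u8, hi: u8) -> (u8, u8) {
    debug_assert!(lo < 16 && hi < 16, "UInt4 values must fit in 4 bits");
    // Pack: `lo` in bits 0..4, `hi` in bits 4..8
    let packed = (lo & 0x0F) | (hi << 4);
    // Unpack by masking and shifting
    (packed & 0x0F, packed >> 4)
}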