lnmp_quant/scheme.rs

use serde::{Deserialize, Serialize};

/// Quantization scheme for embedding vectors
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[repr(u8)]
pub enum QuantScheme {
    /// 8-bit signed integer quantization
    /// - Range: -128 to 127
    /// - Size reduction: 4x (F32 → Int8)
    /// - Accuracy: Very high
    QInt8 = 0x01,

    /// 4-bit packed quantization (future)
    /// - Range: -8 to 7
    /// - Size reduction: 8x (F32 → 4-bit)
    /// - Accuracy: High
    #[allow(dead_code)]
    QInt4 = 0x02,

    /// 1-bit binary quantization (future)
    /// - Range: -1 or 1 (sign-based)
    /// - Size reduction: 32x (F32 → 1-bit)
    /// - Accuracy: Moderate
    #[allow(dead_code)]
    Binary = 0x03,

    /// FP16 passthrough (future)
    /// - Range: Half-precision float
    /// - Size reduction: 2x (F32 → F16)
    /// - Accuracy: Very high
    #[allow(dead_code)]
    FP16Passthrough = 0x04,
}

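// Illustrative sketch, not part of the original module: the explicit discriminants
// above suggest the scheme is carried as a single tag byte, so a fallible conversion
// back from `u8` is shown here as an assumption. The crate's real decoding path
// may differ.
impl TryFrom<u8> for QuantScheme {
    type Error = u8;

    /// Maps a raw tag byte back to a `QuantScheme`, returning the
    /// unrecognized byte as the error.
    fn try_from(tag: u8) -> Result<Self, Self::Error> {
        match tag {
            0x01 => Ok(QuantScheme::QInt8),
            0x02 => Ok(QuantScheme::QInt4),
            0x03 => Ok(QuantScheme::Binary),
            0x04 => Ok(QuantScheme::FP16Passthrough),
            other => Err(other),
        }
    }
}
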
impl QuantScheme {
    /// Returns the storage granularity in bytes for a single value.
    /// The packed schemes (QInt4, Binary) still report 1 byte because that is
    /// the smallest addressable unit; their packed density is reflected in
    /// `compression_ratio`.
    pub fn bytes_per_value(self) -> usize {
        match self {
            QuantScheme::QInt8 => 1,
            QuantScheme::QInt4 => 1,  // packed, 2 values per byte
            QuantScheme::Binary => 1, // packed, 8 values per byte
            QuantScheme::FP16Passthrough => 2,
        }
    }

    /// Returns the compression ratio compared to F32 (32 bits per value),
    /// accounting for packing in the sub-byte schemes.
    pub fn compression_ratio(self) -> f32 {
        match self {
            QuantScheme::QInt8 => 4.0,           // 32 / 8 bits
            QuantScheme::QInt4 => 8.0,           // 32 / 4 bits
            QuantScheme::Binary => 32.0,         // 32 / 1 bit
            QuantScheme::FP16Passthrough => 2.0, // 32 / 16 bits
        }
    }
}

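// Illustrative sketch, not part of the original module: a helper that accounts for
// packing when sizing a quantized buffer. The name `quantized_len` and its placement
// here are assumptions for illustration only.
impl QuantScheme {
    /// Number of bytes needed to store `dim` quantized values under this scheme,
    /// rounding up for the packed sub-byte schemes.
    pub fn quantized_len(self, dim: usize) -> usize {
        match self {
            QuantScheme::QInt8 => dim,
            QuantScheme::QInt4 => (dim + 1) / 2,  // 2 values per byte
            QuantScheme::Binary => (dim + 7) / 8, // 8 values per byte
            QuantScheme::FP16Passthrough => dim * 2,
        }
    }
}
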
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bytes_per_value() {
        assert_eq!(QuantScheme::QInt8.bytes_per_value(), 1);
        assert_eq!(QuantScheme::QInt4.bytes_per_value(), 1);
        assert_eq!(QuantScheme::Binary.bytes_per_value(), 1);
        assert_eq!(QuantScheme::FP16Passthrough.bytes_per_value(), 2);
    }

    #[test]
    fn test_compression_ratio() {
        assert_eq!(QuantScheme::QInt8.compression_ratio(), 4.0);
        assert_eq!(QuantScheme::FP16Passthrough.compression_ratio(), 2.0);
    }
}
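
// Illustrative tests for the sketches above (assumptions, not part of the original
// test suite); they exercise the hypothetical `TryFrom<u8>` and `quantized_len` helpers.
#[cfg(test)]
mod sketch_tests {
    use super::*;

    #[test]
    fn tag_byte_round_trips() {
        assert_eq!(QuantScheme::try_from(0x01), Ok(QuantScheme::QInt8));
        assert_eq!(QuantScheme::try_from(0xFF), Err(0xFF));
    }

    #[test]
    fn packed_lengths_round_up() {
        assert_eq!(QuantScheme::QInt4.quantized_len(385), 193);
        assert_eq!(QuantScheme::Binary.quantized_len(384), 48);
        assert_eq!(QuantScheme::FP16Passthrough.quantized_len(384), 768);
    }
}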