Skip to main content

semantic_memory/
quantize.rs

//! Scalar quantization (SQ8) for f32 → i8 vector compression.
//!
//! Per-vector affine quantization: each vector gets its own `scale` and `zero_point`,
//! computed from its min/max values. Storing one byte per dimension instead of four gives
//! a roughly 4x memory reduction (plus 5 bytes of per-vector metadata when packed), with
//! <0.5% cosine similarity error on normalized embedding vectors.
//!
//! This module is independent of the HNSW backend and can also be used with brute-force search.
8
9use crate::error::MemoryError;
10
/// An int8-quantized vector bundled with the affine parameters needed to decode it.
#[derive(Debug, Clone)]
pub struct QuantizedVector {
    /// One signed 8-bit code per dimension.
    pub data: Vec<i8>,
    /// Step size between adjacent codes; decode a component with
    /// `original[i] ≈ (data[i] - zero_point) * scale`.
    pub scale: f32,
    /// The code that decodes to `0.0`. Stored as an `i8`, so it lies in [-128, 127].
    pub zero_point: i8,
}
21
/// Converts f32 vectors of one fixed dimensionality into int8 codes, calibrating the
/// scale and zero point separately for every vector.
#[derive(Debug, Clone)]
pub struct Quantizer {
    /// Number of components every input vector is required to have.
    dimensions: usize,
}
27
28impl Quantizer {
29    /// Create a new quantizer for vectors of the given dimensionality.
30    pub fn new(dimensions: usize) -> Self {
31        Self { dimensions }
32    }
33
34    /// The configured dimensionality.
35    pub fn dimensions(&self) -> usize {
36        self.dimensions
37    }
38
39    /// Quantize a single f32 vector to int8 with per-vector affine calibration.
40    ///
41    /// Affine quantization maps to the full i8 range [-128, 127] (256 discrete levels).
42    /// Each vector gets its own scale/zero_point derived from its min/max values.
43    pub fn quantize(&self, vector: &[f32]) -> Result<QuantizedVector, MemoryError> {
44        if vector.len() != self.dimensions {
45            return Err(MemoryError::QuantizationError(format!(
46                "expected {} dimensions, got {}",
47                self.dimensions,
48                vector.len()
49            )));
50        }
51
52        let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
53        let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);
54
55        // Handle edge case: constant vector (all dimensions same value)
56        if (max - min).abs() < f32::EPSILON {
57            return Ok(QuantizedVector {
58                data: vec![0i8; self.dimensions],
59                scale: 1.0,
60                zero_point: 0,
61            });
62        }
63
64        let scale = (max - min) / 255.0;
65        if !scale.is_finite() || scale <= 0.0 {
66            return Err(MemoryError::QuantizationError(
67                "computed non-finite quantization scale".into(),
68            ));
69        }
70        let zero_point_f = -128.0 - (min / scale);
71        let zero_point = zero_point_f.round().clamp(-128.0, 127.0) as i8;
72
73        let data: Vec<i8> = vector
74            .iter()
75            .map(|&v| {
76                let q = (v / scale + zero_point as f32).round();
77                q.clamp(-128.0, 127.0) as i8
78            })
79            .collect();
80
81        Ok(QuantizedVector {
82            data,
83            scale,
84            zero_point,
85        })
86    }
87
88    /// Dequantize back to f32 (approximate reconstruction).
89    pub fn dequantize(&self, qv: &QuantizedVector) -> Vec<f32> {
90        qv.data
91            .iter()
92            .map(|&q| (q as f32 - qv.zero_point as f32) * qv.scale)
93            .collect()
94    }
95}
96
97/// Pack a QuantizedVector into bytes for SQLite storage.
98///
99/// Format: `[scale: f32 LE][zero_point: i8][data: i8 × dims]`
100/// Total bytes: `4 + 1 + dims`
101pub fn pack_quantized(qv: &QuantizedVector) -> Vec<u8> {
102    let mut buf = Vec::with_capacity(5 + qv.data.len());
103    buf.extend_from_slice(&qv.scale.to_le_bytes());
104    buf.push(qv.zero_point.to_ne_bytes()[0]);
105    buf.extend(qv.data.iter().map(|value| value.to_ne_bytes()[0]));
106    buf
107}
108
109/// Unpack bytes from SQLite into a QuantizedVector.
110pub fn unpack_quantized(bytes: &[u8], dimensions: usize) -> Result<QuantizedVector, MemoryError> {
111    let expected_len = 5 + dimensions;
112    if bytes.len() != expected_len {
113        return Err(MemoryError::QuantizationError(format!(
114            "expected {} bytes for {} dimensions, got {}",
115            expected_len,
116            dimensions,
117            bytes.len()
118        )));
119    }
120    let scale_bytes: [u8; 4] = bytes[0..4]
121        .try_into()
122        .map_err(|e| MemoryError::QuantizationError(format!("invalid scale bytes: {e}")))?;
123    let scale = f32::from_le_bytes(scale_bytes);
124    let zero_point = i8::from_ne_bytes([bytes[4]]);
125    let data: Vec<i8> = bytes[5..].iter().map(|&b| i8::from_ne_bytes([b])).collect();
126    Ok(QuantizedVector {
127        data,
128        scale,
129        zero_point,
130    })
131}