Skip to main content

semantic_memory/
quantize.rs

1//! Scalar quantization (SQ8) for f32 → i8 vector compression.
2//!
//! Per-vector affine quantization onto the symmetric int8 range [-127, 127]: each vector
//! gets its own scale and zero_point, computed from its min/max values. This gives ~4x
//! memory reduction with <0.5% cosine similarity error on normalized embedding vectors.
6//!
7//! This module is independent of the HNSW backend and can be used with brute-force too.
8
9use crate::error::MemoryError;
10
/// An int8-compressed vector together with the parameters needed to reconstruct it.
///
/// Reconstruction is affine: `original[i] ≈ (data[i] - zero_point) * scale`.
#[derive(Debug, Clone, PartialEq)]
pub struct QuantizedVector {
    /// Quantized int8 codes, one per dimension.
    pub data: Vec<i8>,
    /// Scale factor: `original[i] ≈ (data[i] - zero_point) * scale`
    pub scale: f32,
    /// The int8 code that reconstructs to exactly `0.0`; kept within [-127, 127].
    pub zero_point: i8,
}
21
/// Quantizer that converts f32 vectors to int8 with per-vector calibration.
///
/// The only state is the expected vector length, so two quantizers compare equal
/// exactly when they were configured with the same dimensionality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Quantizer {
    /// Number of elements every input vector must have.
    dimensions: usize,
}
27
28impl Quantizer {
29    /// Create a new quantizer for vectors of the given dimensionality.
30    pub fn new(dimensions: usize) -> Self {
31        Self { dimensions }
32    }
33
34    /// The configured dimensionality.
35    pub fn dimensions(&self) -> usize {
36        self.dimensions
37    }
38
39    /// Quantize a single f32 vector to int8 with per-vector symmetric calibration.
40    ///
41    /// Symmetric quantization maps to [-127, 127] (254 discrete levels).
42    /// Each vector gets its own scale/zero_point derived from its min/max values.
43    pub fn quantize(&self, vector: &[f32]) -> Result<QuantizedVector, MemoryError> {
44        if vector.len() != self.dimensions {
45            return Err(MemoryError::QuantizationError(format!(
46                "expected {} dimensions, got {}",
47                self.dimensions,
48                vector.len()
49            )));
50        }
51
52        let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
53        let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);
54
55        // Handle edge case: constant vector (all dimensions same value)
56        if (max - min).abs() < f32::EPSILON {
57            return Ok(QuantizedVector {
58                data: vec![0i8; self.dimensions],
59                scale: 1.0,
60                zero_point: 0,
61            });
62        }
63
64        // Symmetric quantization: 254 steps over [-127, 127]
65        let scale = (max - min) / 254.0;
66        let zero_point_f = -127.0 - (min / scale);
67        let zero_point = zero_point_f.round().clamp(-127.0, 127.0) as i8;
68
69        let data: Vec<i8> = vector
70            .iter()
71            .map(|&v| {
72                let q = (v / scale + zero_point as f32).round();
73                q.clamp(-127.0, 127.0) as i8
74            })
75            .collect();
76
77        Ok(QuantizedVector {
78            data,
79            scale,
80            zero_point,
81        })
82    }
83
84    /// Dequantize back to f32 (approximate reconstruction).
85    pub fn dequantize(&self, qv: &QuantizedVector) -> Vec<f32> {
86        qv.data
87            .iter()
88            .map(|&q| (q as f32 - qv.zero_point as f32) * qv.scale)
89            .collect()
90    }
91}
92
93/// Pack a QuantizedVector into bytes for SQLite storage.
94///
95/// Format: `[scale: f32 LE][zero_point: i8][data: i8 × dims]`
96/// Total bytes: `4 + 1 + dims`
97pub fn pack_quantized(qv: &QuantizedVector) -> Vec<u8> {
98    let mut buf = Vec::with_capacity(5 + qv.data.len());
99    buf.extend_from_slice(&qv.scale.to_le_bytes());
100    buf.push(qv.zero_point as u8);
101    // Cast i8 slice to u8 slice for storage
102    let data_bytes: &[u8] = bytemuck::cast_slice(&qv.data);
103    buf.extend_from_slice(data_bytes);
104    buf
105}
106
107/// Unpack bytes from SQLite into a QuantizedVector.
108pub fn unpack_quantized(bytes: &[u8], dimensions: usize) -> Result<QuantizedVector, MemoryError> {
109    let expected_len = 5 + dimensions;
110    if bytes.len() != expected_len {
111        return Err(MemoryError::QuantizationError(format!(
112            "expected {} bytes for {} dimensions, got {}",
113            expected_len, dimensions, bytes.len()
114        )));
115    }
116    let scale = f32::from_le_bytes(bytes[0..4].try_into().unwrap());
117    let zero_point = bytes[4] as i8;
118    let data: Vec<i8> = bytes[5..].iter().map(|&b| b as i8).collect();
119    Ok(QuantizedVector {
120        data,
121        scale,
122        zero_point,
123    })
124}