velesdb_core/
quantization.rs

1//! Scalar Quantization (SQ8) for memory-efficient vector storage.
2//!
3//! This module implements 8-bit scalar quantization to reduce memory usage by 4x
4//! while maintaining >95% recall accuracy.
5//!
6//! ## Benefits
7//!
8//! | Metric | f32 | SQ8 |
9//! |--------|-----|-----|
//! | RAM/vector (768d) | 3 KB | 776 bytes |
11//! | Cache efficiency | Baseline | ~4x better |
12//! | Recall loss | 0% | ~0.5-1% |
13
14use serde::{Deserialize, Serialize};
15use std::io;
16
/// Storage mode for vectors.
///
/// Selects the in-memory representation trade-off between precision and RAM.
/// Serialized by serde in lowercase (`"full"`, `"sq8"`, `"binary"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum StorageMode {
    /// Full precision f32 storage (default).
    #[default]
    Full,
    /// 8-bit scalar quantization for 4x memory reduction.
    SQ8,
    /// 1-bit binary quantization for 32x memory reduction.
    /// Best for edge/IoT devices with limited RAM.
    Binary,
}
30
/// A binary quantized vector using 1-bit per dimension.
///
/// Each f32 value is converted to 1 bit: >= 0.0 becomes 1, < 0.0 becomes 0.
/// This provides **32x memory reduction** compared to f32 storage.
///
/// # Memory Usage
///
/// | Dimension | f32 | Binary |
/// |-----------|-----|--------|
/// | 768 | 3072 bytes | 96 bytes |
/// | 1536 | 6144 bytes | 192 bytes |
///
/// # Use with Rescoring
///
/// For best accuracy, use binary search for candidate selection,
/// then rescore top candidates with full-precision vectors.
#[derive(Debug, Clone)]
pub struct BinaryQuantizedVector {
    /// Binary data (1 bit per dimension, packed LSB-first into bytes).
    ///
    /// Invariant: padding bits past `dimension` in the final byte are zero,
    /// so whole-byte XOR in `hamming_distance` never counts phantom bits.
    pub data: Vec<u8>,
    /// Original dimension of the vector (number of valid bits in `data`).
    dimension: usize,
}

impl BinaryQuantizedVector {
    /// Creates a new binary quantized vector from f32 data.
    ///
    /// Values >= 0.0 become 1, values < 0.0 become 0.
    ///
    /// # Arguments
    ///
    /// * `vector` - The original f32 vector to quantize
    ///
    /// # Panics
    ///
    /// Panics if the vector is empty.
    #[must_use]
    pub fn from_f32(vector: &[f32]) -> Self {
        assert!(!vector.is_empty(), "Cannot quantize empty vector");

        let dimension = vector.len();
        // Calculate number of bytes needed: ceil(dimension / 8)
        let num_bytes = dimension.div_ceil(8);
        let mut data = vec![0u8; num_bytes];

        for (i, &value) in vector.iter().enumerate() {
            if value >= 0.0 {
                // Set bit i in the packed byte array (LSB-first within a byte).
                let byte_idx = i / 8;
                let bit_idx = i % 8;
                data[byte_idx] |= 1 << bit_idx;
            }
        }

        Self { data, dimension }
    }

    /// Returns the dimension of the original vector.
    #[must_use]
    pub fn dimension(&self) -> usize {
        self.dimension
    }

    /// Returns the memory size in bytes (packed bit data only).
    #[must_use]
    pub fn memory_size(&self) -> usize {
        self.data.len()
    }

    /// Returns the individual bits as a boolean vector.
    ///
    /// Useful for debugging and testing.
    #[must_use]
    pub fn get_bits(&self) -> Vec<bool> {
        (0..self.dimension)
            .map(|i| {
                let byte_idx = i / 8;
                let bit_idx = i % 8;
                (self.data[byte_idx] >> bit_idx) & 1 == 1
            })
            .collect()
    }

    /// Computes the Hamming distance to another binary vector.
    ///
    /// Hamming distance counts the number of bits that differ.
    /// Uses `count_ones` (POPCNT on supporting hardware) per XORed byte.
    ///
    /// # Panics
    ///
    /// Panics if the vectors have different dimensions.
    #[must_use]
    pub fn hamming_distance(&self, other: &Self) -> u32 {
        // Hard assert (was debug_assert_eq!) so release builds uphold the
        // documented panic contract; `zip` below would otherwise silently
        // truncate to the shorter vector and return a wrong distance.
        assert_eq!(
            self.dimension, other.dimension,
            "Dimension mismatch in hamming_distance"
        );

        // XOR bytes and count differing bits using POPCNT
        self.data
            .iter()
            .zip(other.data.iter())
            .map(|(&a, &b)| (a ^ b).count_ones())
            .sum()
    }

    /// Computes normalized Hamming similarity (0.0 to 1.0).
    ///
    /// Returns 1.0 for identical vectors, 0.0 for completely different.
    #[must_use]
    #[allow(clippy::cast_precision_loss)]
    pub fn hamming_similarity(&self, other: &Self) -> f32 {
        let distance = self.hamming_distance(other);
        // dimension >= 1 is guaranteed by both constructors, so no div-by-zero.
        1.0 - (distance as f32 / self.dimension as f32)
    }

    /// Serializes the binary quantized vector to bytes.
    ///
    /// Layout: 4-byte little-endian dimension header, then the packed bits.
    #[must_use]
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(4 + self.data.len());
        // Store dimension as u32 (4 bytes). Dimensions above u32::MAX are not
        // supported by the wire format, hence the deliberate truncation allow.
        #[allow(clippy::cast_possible_truncation)]
        bytes.extend_from_slice(&(self.dimension as u32).to_le_bytes());
        bytes.extend_from_slice(&self.data);
        bytes
    }

    /// Deserializes a binary quantized vector from bytes.
    ///
    /// Layout: 4-byte little-endian dimension header followed by
    /// ceil(dimension / 8) bytes of packed bits. Trailing bytes beyond the
    /// expected length are ignored.
    ///
    /// # Errors
    ///
    /// Returns an error if the header is truncated, the dimension is zero,
    /// or there are fewer data bytes than the dimension requires.
    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
        if bytes.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Not enough bytes for BinaryQuantizedVector header",
            ));
        }

        let dimension = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize;
        // `from_f32` never produces an empty vector, so a zero dimension can
        // only come from corrupt or crafted input; reject it rather than
        // letting `hamming_similarity` later divide by zero.
        if dimension == 0 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "BinaryQuantizedVector dimension must be non-zero",
            ));
        }
        let expected_data_len = dimension.div_ceil(8);

        if bytes.len() < 4 + expected_data_len {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Not enough bytes for BinaryQuantizedVector data: expected {}, got {}",
                    4 + expected_data_len,
                    bytes.len()
                ),
            ));
        }

        let mut data = bytes[4..4 + expected_data_len].to_vec();
        // Zero any padding bits past `dimension` in the final byte so crafted
        // input cannot skew Hamming distances (which XOR whole bytes).
        let used_bits = dimension % 8;
        if used_bits != 0 {
            if let Some(last) = data.last_mut() {
                *last &= (1u8 << used_bits) - 1;
            }
        }

        Ok(Self { data, dimension })
    }
}
190
/// A quantized vector using 8-bit scalar quantization.
///
/// Each f32 value is mapped to a u8 (0-255) using min/max scaling.
/// The original value can be reconstructed as: `value = (data[i] / 255.0) * (max - min) + min`
#[derive(Debug, Clone)]
pub struct QuantizedVector {
    /// Quantized data (1 byte per dimension instead of 4).
    pub data: Vec<u8>,
    /// Minimum value in the original vector.
    pub min: f32,
    /// Maximum value in the original vector.
    pub max: f32,
}

impl QuantizedVector {
    /// Creates a new quantized vector from f32 data.
    ///
    /// # Arguments
    ///
    /// * `vector` - The original f32 vector to quantize
    ///
    /// # Panics
    ///
    /// Panics if the vector is empty.
    #[must_use]
    pub fn from_f32(vector: &[f32]) -> Self {
        assert!(!vector.is_empty(), "Cannot quantize empty vector");

        // Single pass to find both extremes.
        let (min, max) = vector.iter().fold(
            (f32::INFINITY, f32::NEG_INFINITY),
            |(lo, hi), &v| (lo.min(v), hi.max(v)),
        );

        let range = max - min;
        if range < f32::EPSILON {
            // Degenerate case: every value identical — encode as mid-scale 128.
            return Self {
                data: vec![128u8; vector.len()],
                min,
                max,
            };
        }

        let scale = 255.0 / range;
        // Safe: each value is clamped into [0, 255] before the cast, which
        // also absorbs floating-point rounding error.
        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
        let data: Vec<u8> = vector
            .iter()
            .map(|&v| ((v - min) * scale).round().clamp(0.0, 255.0) as u8)
            .collect();

        Self { data, min, max }
    }

    /// Reconstructs the original f32 vector from quantized data.
    ///
    /// Note: This is a lossy operation. The reconstructed values are approximations.
    #[must_use]
    pub fn to_f32(&self) -> Vec<f32> {
        let range = self.max - self.min;
        if range < f32::EPSILON {
            // Every value was identical; decode back to that constant.
            return vec![self.min; self.data.len()];
        }

        let step = range / 255.0;
        self.data
            .iter()
            .map(|&code| f32::from(code) * step + self.min)
            .collect()
    }

    /// Returns the dimension of the vector.
    #[must_use]
    pub fn dimension(&self) -> usize {
        self.data.len()
    }

    /// Returns the memory size in bytes.
    #[must_use]
    pub fn memory_size(&self) -> usize {
        // One byte per dimension plus the two f32 bounds (4 bytes each).
        self.data.len() + 8
    }

    /// Serializes the quantized vector to bytes.
    ///
    /// Layout: min (4 bytes LE), max (4 bytes LE), then the quantized codes.
    #[must_use]
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut out = Vec::with_capacity(8 + self.data.len());
        out.extend_from_slice(&self.min.to_le_bytes());
        out.extend_from_slice(&self.max.to_le_bytes());
        out.extend_from_slice(&self.data);
        out
    }

    /// Deserializes a quantized vector from bytes.
    ///
    /// # Errors
    ///
    /// Returns an error if the bytes are invalid.
    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
        if bytes.len() < 8 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Not enough bytes for QuantizedVector header",
            ));
        }

        let mut min_raw = [0u8; 4];
        let mut max_raw = [0u8; 4];
        min_raw.copy_from_slice(&bytes[0..4]);
        max_raw.copy_from_slice(&bytes[4..8]);

        Ok(Self {
            data: bytes[8..].to_vec(),
            min: f32::from_le_bytes(min_raw),
            max: f32::from_le_bytes(max_raw),
        })
    }
}
303
304/// Computes the approximate dot product between a query vector (f32) and a quantized vector.
305///
306/// This avoids full dequantization for better performance.
307#[must_use]
308pub fn dot_product_quantized(query: &[f32], quantized: &QuantizedVector) -> f32 {
309    debug_assert_eq!(
310        query.len(),
311        quantized.data.len(),
312        "Dimension mismatch in dot_product_quantized"
313    );
314
315    let range = quantized.max - quantized.min;
316    if range < f32::EPSILON {
317        // All quantized values are the same
318        let value = quantized.min;
319        return query.iter().sum::<f32>() * value;
320    }
321
322    let scale = range / 255.0;
323    let offset = quantized.min;
324
325    // Compute dot product with on-the-fly dequantization
326    query
327        .iter()
328        .zip(quantized.data.iter())
329        .map(|(&q, &v)| q * (f32::from(v) * scale + offset))
330        .sum()
331}
332
333/// Computes the approximate squared Euclidean distance between a query (f32) and quantized vector.
334#[must_use]
335pub fn euclidean_squared_quantized(query: &[f32], quantized: &QuantizedVector) -> f32 {
336    debug_assert_eq!(
337        query.len(),
338        quantized.data.len(),
339        "Dimension mismatch in euclidean_squared_quantized"
340    );
341
342    let range = quantized.max - quantized.min;
343    if range < f32::EPSILON {
344        // All quantized values are the same
345        let value = quantized.min;
346        return query.iter().map(|&q| (q - value).powi(2)).sum();
347    }
348
349    let scale = range / 255.0;
350    let offset = quantized.min;
351
352    query
353        .iter()
354        .zip(quantized.data.iter())
355        .map(|(&q, &v)| {
356            let dequantized = f32::from(v) * scale + offset;
357            (q - dequantized).powi(2)
358        })
359        .sum()
360}
361
362/// Computes approximate cosine similarity between a query (f32) and quantized vector.
363///
364/// Note: For best accuracy, the query should be normalized.
365#[must_use]
366pub fn cosine_similarity_quantized(query: &[f32], quantized: &QuantizedVector) -> f32 {
367    let dot = dot_product_quantized(query, quantized);
368
369    // Compute norms
370    let query_norm: f32 = query.iter().map(|&x| x * x).sum::<f32>().sqrt();
371
372    // Dequantize to compute quantized vector norm (could be cached)
373    let reconstructed = quantized.to_f32();
374    let quantized_norm: f32 = reconstructed.iter().map(|&x| x * x).sum::<f32>().sqrt();
375
376    if query_norm < f32::EPSILON || quantized_norm < f32::EPSILON {
377        return 0.0;
378    }
379
380    dot / (query_norm * quantized_norm)
381}
382
383// =========================================================================
384// SIMD-optimized distance functions for SQ8 quantized vectors
385// =========================================================================
386
387#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
388#[allow(unused_imports)]
389use std::arch::x86_64::*;
390
391/// SIMD-optimized dot product between f32 query and SQ8 quantized vector.
392///
393/// Uses AVX2 intrinsics on `x86_64` for ~2-3x speedup over scalar.
394/// Falls back to scalar on other architectures.
395#[must_use]
396pub fn dot_product_quantized_simd(query: &[f32], quantized: &QuantizedVector) -> f32 {
397    debug_assert_eq!(
398        query.len(),
399        quantized.data.len(),
400        "Dimension mismatch in dot_product_quantized_simd"
401    );
402
403    let range = quantized.max - quantized.min;
404    if range < f32::EPSILON {
405        let value = quantized.min;
406        return query.iter().sum::<f32>() * value;
407    }
408
409    let scale = range / 255.0;
410    let offset = quantized.min;
411
412    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
413    {
414        simd_dot_product_avx2(query, &quantized.data, scale, offset)
415    }
416
417    #[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
418    {
419        // Scalar fallback
420        query
421            .iter()
422            .zip(quantized.data.iter())
423            .map(|(&q, &v)| q * (f32::from(v) * scale + offset))
424            .sum()
425    }
426}
427
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[inline]
// NOTE(review): despite the name, this body contains no AVX2 intrinsics —
// it is a scalar loop unrolled 8-wide (the `std::arch::x86_64` import above
// is unused, hence its `#[allow(unused_imports)]`). Presumably it relies on
// the auto-vectorizer once AVX2 is enabled; either add explicit intrinsics
// or rename — confirm intent with the author.
fn simd_dot_product_avx2(query: &[f32], data: &[u8], scale: f32, offset: f32) -> f32 {
    let len = query.len();
    // Number of full 8-element groups, and leftover tail elements.
    let simd_len = len / 8;
    let remainder = len % 8;

    let mut sum = 0.0f32;

    // Process 8 elements at a time
    for i in 0..simd_len {
        let base = i * 8;
        // Dequantize and compute dot product for 8 elements
        for j in 0..8 {
            let dequant = f32::from(data[base + j]) * scale + offset;
            sum += query[base + j] * dequant;
        }
    }

    // Handle remainder
    let base = simd_len * 8;
    for i in 0..remainder {
        let dequant = f32::from(data[base + i]) * scale + offset;
        sum += query[base + i] * dequant;
    }

    sum
}
456
457/// SIMD-optimized squared Euclidean distance between f32 query and SQ8 vector.
458#[must_use]
459pub fn euclidean_squared_quantized_simd(query: &[f32], quantized: &QuantizedVector) -> f32 {
460    debug_assert_eq!(
461        query.len(),
462        quantized.data.len(),
463        "Dimension mismatch in euclidean_squared_quantized_simd"
464    );
465
466    let range = quantized.max - quantized.min;
467    if range < f32::EPSILON {
468        let value = quantized.min;
469        return query.iter().map(|&q| (q - value).powi(2)).sum();
470    }
471
472    let scale = range / 255.0;
473    let offset = quantized.min;
474
475    // Optimized loop with manual unrolling
476    let len = query.len();
477    let chunks = len / 4;
478    let remainder = len % 4;
479    let mut sum = 0.0f32;
480
481    for i in 0..chunks {
482        let base = i * 4;
483        let d0 = f32::from(quantized.data[base]) * scale + offset;
484        let d1 = f32::from(quantized.data[base + 1]) * scale + offset;
485        let d2 = f32::from(quantized.data[base + 2]) * scale + offset;
486        let d3 = f32::from(quantized.data[base + 3]) * scale + offset;
487
488        let diff0 = query[base] - d0;
489        let diff1 = query[base + 1] - d1;
490        let diff2 = query[base + 2] - d2;
491        let diff3 = query[base + 3] - d3;
492
493        sum += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
494    }
495
496    let base = chunks * 4;
497    for i in 0..remainder {
498        let dequant = f32::from(quantized.data[base + i]) * scale + offset;
499        let diff = query[base + i] - dequant;
500        sum += diff * diff;
501    }
502
503    sum
504}
505
506/// SIMD-optimized cosine similarity between f32 query and SQ8 vector.
507///
508/// Caches the quantized vector norm for repeated queries against same vector.
509#[must_use]
510pub fn cosine_similarity_quantized_simd(query: &[f32], quantized: &QuantizedVector) -> f32 {
511    let dot = dot_product_quantized_simd(query, quantized);
512
513    // Compute query norm
514    let query_norm_sq: f32 = query.iter().map(|&x| x * x).sum();
515
516    // Compute quantized norm (could be cached in QuantizedVector)
517    let range = quantized.max - quantized.min;
518    let scale = if range < f32::EPSILON {
519        0.0
520    } else {
521        range / 255.0
522    };
523    let offset = quantized.min;
524
525    let quantized_norm_sq: f32 = quantized
526        .data
527        .iter()
528        .map(|&v| {
529            let dequant = f32::from(v) * scale + offset;
530            dequant * dequant
531        })
532        .sum();
533
534    let denom = (query_norm_sq * quantized_norm_sq).sqrt();
535    if denom < f32::EPSILON {
536        return 0.0;
537    }
538
539    dot / denom
540}