Skip to main content

sochdb_vector/
outlier_encoding.rs

1// Copyright 2025 SochDB Authors
2//
3// Licensed under the Apache License, Version 2.0
4
5//! Optimized Outlier Encoding
6//!
7//! This module provides efficient outlier representation that avoids
8//! `.contains()` calls in hot loops by using:
9//!
10//! - **Bitvector**: O(1) membership for dense outliers (k/D > threshold)
11//! - **Sorted list + binary search**: O(log k) for sparse outliers
12//!
13//! # Problem
14//!
15//! Original approach:
16//! ```text
17//! for dim in 0..D {
18//!     if outliers.contains(dim) {  // O(k) linear scan!
19//!         use_outlier_value(dim);
20//!     } else {
21//!         use_quantized_value(dim);
22//!     }
23//! }
24//! ```
25//!
26//! This is poison in tight loops: O(D × k) per vector decode.
27//!
28//! # Solution
29//!
30//! Hybrid representation with automatic crossover:
31//!
32//! ```text
33//! if k/D > BITVEC_THRESHOLD {
34//!     // Dense: use bitvector, O(1) per check
35//!     bitvec.test(dim)
36//! } else {
37//!     // Sparse: sorted list + binary search, O(log k) per check
38//!     sorted_dims.binary_search(&dim).is_ok()
39//! }
40//! ```
41//!
42//! # Crossover Analysis
43//!
44//! For dimension D and k outliers:
45//! - Bitvector: D/8 bytes storage, O(1) access
46//! - Sorted list: k × 2 bytes storage (u16), O(log k) access
47//!
48//! Crossover when: k × 2 ≈ D/8, i.e., k ≈ D/16
49//! For D=768, crossover at k ≈ 48 outliers.
50
51/// Threshold for switching from sorted list to bitvector.
52/// When k/D > this ratio, use bitvector.
53pub const BITVEC_THRESHOLD: f32 = 0.0625; // 1/16 = 6.25%
54
55/// Maximum dimension supported (u16 indices).
56pub const MAX_DIMENSION: usize = 65536;
57
58/// Optimized outlier set with hybrid representation.
59#[derive(Debug, Clone)]
60pub enum OutlierSet {
61    /// Empty set (no outliers).
62    Empty,
63    /// Sparse representation: sorted list of dimension indices.
64    Sparse(SparseOutliers),
65    /// Dense representation: bitvector membership.
66    Dense(DenseOutliers),
67}
68
69impl OutlierSet {
70    /// Create empty outlier set.
71    pub fn empty() -> Self {
72        OutlierSet::Empty
73    }
74
75    /// Create from dimension indices, automatically choosing representation.
76    pub fn from_dims(dims: &[u16], dimension: usize) -> Self {
77        if dims.is_empty() {
78            return OutlierSet::Empty;
79        }
80
81        let density = dims.len() as f32 / dimension as f32;
82
83        if density > BITVEC_THRESHOLD {
84            OutlierSet::Dense(DenseOutliers::from_dims(dims, dimension))
85        } else {
86            OutlierSet::Sparse(SparseOutliers::from_dims(dims))
87        }
88    }
89
90    /// Create from iterator of dimension indices.
91    pub fn from_iter<I: IntoIterator<Item = u16>>(iter: I, dimension: usize) -> Self {
92        let dims: Vec<u16> = iter.into_iter().collect();
93        Self::from_dims(&dims, dimension)
94    }
95
96    /// Check if dimension is an outlier. O(1) for dense, O(log k) for sparse.
97    #[inline]
98    pub fn contains(&self, dim: u16) -> bool {
99        match self {
100            OutlierSet::Empty => false,
101            OutlierSet::Sparse(s) => s.contains(dim),
102            OutlierSet::Dense(d) => d.contains(dim),
103        }
104    }
105
106    /// Number of outliers.
107    pub fn len(&self) -> usize {
108        match self {
109            OutlierSet::Empty => 0,
110            OutlierSet::Sparse(s) => s.len(),
111            OutlierSet::Dense(d) => d.len(),
112        }
113    }
114
115    /// Check if empty.
116    pub fn is_empty(&self) -> bool {
117        matches!(self, OutlierSet::Empty)
118    }
119
120    /// Memory usage in bytes.
121    pub fn memory_bytes(&self) -> usize {
122        match self {
123            OutlierSet::Empty => 0,
124            OutlierSet::Sparse(s) => s.memory_bytes(),
125            OutlierSet::Dense(d) => d.memory_bytes(),
126        }
127    }
128
129    /// Iterate over outlier dimensions.
130    pub fn iter(&self) -> OutlierIterator<'_> {
131        match self {
132            OutlierSet::Empty => OutlierIterator::Empty,
133            OutlierSet::Sparse(s) => OutlierIterator::Sparse(s.dims.iter()),
134            OutlierSet::Dense(d) => OutlierIterator::Dense(DenseIterator::new(d)),
135        }
136    }
137
138    /// Get density ratio.
139    pub fn density(&self, dimension: usize) -> f32 {
140        self.len() as f32 / dimension as f32
141    }
142
143    /// Check representation type.
144    pub fn is_dense(&self) -> bool {
145        matches!(self, OutlierSet::Dense(_))
146    }
147}
148
149/// Sparse outlier representation using sorted list.
150#[derive(Debug, Clone)]
151pub struct SparseOutliers {
152    /// Sorted dimension indices.
153    dims: Vec<u16>,
154}
155
156impl SparseOutliers {
157    /// Create from dimension indices (will sort).
158    pub fn from_dims(dims: &[u16]) -> Self {
159        let mut sorted = dims.to_vec();
160        sorted.sort_unstable();
161        sorted.dedup();
162        Self { dims: sorted }
163    }
164
165    /// Check membership using binary search. O(log k).
166    #[inline]
167    pub fn contains(&self, dim: u16) -> bool {
168        self.dims.binary_search(&dim).is_ok()
169    }
170
171    /// Get position of dimension in sorted list (for value lookup).
172    #[inline]
173    pub fn position(&self, dim: u16) -> Option<usize> {
174        self.dims.binary_search(&dim).ok()
175    }
176
177    /// Number of outliers.
178    pub fn len(&self) -> usize {
179        self.dims.len()
180    }
181
182    /// Check if empty.
183    pub fn is_empty(&self) -> bool {
184        self.dims.is_empty()
185    }
186
187    /// Memory usage in bytes.
188    pub fn memory_bytes(&self) -> usize {
189        self.dims.len() * std::mem::size_of::<u16>()
190    }
191
192    /// Get raw dimensions slice.
193    pub fn dims(&self) -> &[u16] {
194        &self.dims
195    }
196}
197
198/// Dense outlier representation using bitvector.
199#[derive(Debug, Clone)]
200pub struct DenseOutliers {
201    /// Bitvector (1 bit per dimension).
202    bits: Vec<u64>,
203    /// Number of set bits (outliers).
204    count: usize,
205}
206
207impl DenseOutliers {
208    /// Create from dimension indices.
209    pub fn from_dims(dims: &[u16], dimension: usize) -> Self {
210        let num_words = (dimension + 63) / 64;
211        let mut bits = vec![0u64; num_words];
212
213        for &dim in dims {
214            let word_idx = dim as usize / 64;
215            let bit_idx = dim as usize % 64;
216            if word_idx < bits.len() {
217                bits[word_idx] |= 1u64 << bit_idx;
218            }
219        }
220
221        // Count unique set bits
222        let count = bits.iter().map(|w| w.count_ones() as usize).sum();
223
224        Self { bits, count }
225    }
226
227    /// Check membership. O(1).
228    #[inline]
229    pub fn contains(&self, dim: u16) -> bool {
230        let word_idx = dim as usize / 64;
231        let bit_idx = dim as usize % 64;
232
233        if word_idx >= self.bits.len() {
234            return false;
235        }
236
237        (self.bits[word_idx] >> bit_idx) & 1 == 1
238    }
239
240    /// Get the rank (count of set bits before this position).
241    /// Useful for value lookup in compressed storage.
242    #[inline]
243    pub fn rank(&self, dim: u16) -> usize {
244        let word_idx = dim as usize / 64;
245        let bit_idx = dim as usize % 64;
246
247        let mut count = 0usize;
248
249        // Count all bits in previous words
250        for i in 0..word_idx.min(self.bits.len()) {
251            count += self.bits[i].count_ones() as usize;
252        }
253
254        // Count bits before position in current word
255        if word_idx < self.bits.len() {
256            let mask = (1u64 << bit_idx) - 1;
257            count += (self.bits[word_idx] & mask).count_ones() as usize;
258        }
259
260        count
261    }
262
263    /// Number of outliers.
264    pub fn len(&self) -> usize {
265        self.count
266    }
267
268    /// Check if empty.
269    pub fn is_empty(&self) -> bool {
270        self.count == 0
271    }
272
273    /// Memory usage in bytes.
274    pub fn memory_bytes(&self) -> usize {
275        self.bits.len() * std::mem::size_of::<u64>()
276    }
277
278    /// Get dimension capacity.
279    pub fn capacity(&self) -> usize {
280        self.bits.len() * 64
281    }
282}
283
284/// Iterator over outlier dimensions.
285pub enum OutlierIterator<'a> {
286    Empty,
287    Sparse(std::slice::Iter<'a, u16>),
288    Dense(DenseIterator<'a>),
289}
290
291impl<'a> Iterator for OutlierIterator<'a> {
292    type Item = u16;
293
294    fn next(&mut self) -> Option<Self::Item> {
295        match self {
296            OutlierIterator::Empty => None,
297            OutlierIterator::Sparse(iter) => iter.next().copied(),
298            OutlierIterator::Dense(iter) => iter.next(),
299        }
300    }
301}
302
303/// Iterator over dense bitvector.
304pub struct DenseIterator<'a> {
305    bits: &'a [u64],
306    word_idx: usize,
307    #[allow(dead_code)]
308    bit_idx: u32,
309    remaining: u64,
310}
311
312impl<'a> DenseIterator<'a> {
313    fn new(dense: &'a DenseOutliers) -> Self {
314        let remaining = dense.bits.first().copied().unwrap_or(0);
315        Self {
316            bits: &dense.bits,
317            word_idx: 0,
318            bit_idx: 0,
319            remaining,
320        }
321    }
322}
323
324impl Iterator for DenseIterator<'_> {
325    type Item = u16;
326
327    fn next(&mut self) -> Option<Self::Item> {
328        loop {
329            if self.remaining != 0 {
330                let tz = self.remaining.trailing_zeros();
331                self.remaining &= self.remaining - 1; // Clear lowest set bit
332                return Some((self.word_idx * 64 + tz as usize) as u16);
333            }
334
335            self.word_idx += 1;
336            if self.word_idx >= self.bits.len() {
337                return None;
338            }
339            self.remaining = self.bits[self.word_idx];
340        }
341    }
342}
343
344/// Outlier entry with dimension and value.
345#[derive(Debug, Clone, Copy)]
346pub struct OutlierValue {
347    /// Dimension index.
348    pub dim: u16,
349    /// Original f32 value.
350    pub value: f32,
351}
352
353/// Complete outlier storage for a vector.
354#[derive(Debug, Clone)]
355pub struct OutlierStorage {
356    /// Outlier set for membership testing.
357    pub set: OutlierSet,
358    /// Outlier values (same order as set iteration).
359    pub values: Vec<f32>,
360}
361
362impl OutlierStorage {
363    /// Create empty storage.
364    pub fn empty() -> Self {
365        Self {
366            set: OutlierSet::Empty,
367            values: Vec::new(),
368        }
369    }
370
371    /// Create from entries.
372    pub fn from_entries(entries: &[OutlierValue], dimension: usize) -> Self {
373        if entries.is_empty() {
374            return Self::empty();
375        }
376
377        // Sort by dimension
378        let mut sorted: Vec<_> = entries.to_vec();
379        sorted.sort_by_key(|e| e.dim);
380
381        let dims: Vec<u16> = sorted.iter().map(|e| e.dim).collect();
382        let values: Vec<f32> = sorted.iter().map(|e| e.value).collect();
383
384        Self {
385            set: OutlierSet::from_dims(&dims, dimension),
386            values,
387        }
388    }
389
390    /// Get outlier value for dimension, if it exists.
391    #[inline]
392    pub fn get(&self, dim: u16) -> Option<f32> {
393        match &self.set {
394            OutlierSet::Empty => None,
395            OutlierSet::Sparse(s) => s.position(dim).map(|pos| self.values[pos]),
396            OutlierSet::Dense(d) => {
397                if d.contains(dim) {
398                    Some(self.values[d.rank(dim)])
399                } else {
400                    None
401                }
402            }
403        }
404    }
405
406    /// Check if dimension is an outlier.
407    #[inline]
408    pub fn contains(&self, dim: u16) -> bool {
409        self.set.contains(dim)
410    }
411
412    /// Number of outliers.
413    pub fn len(&self) -> usize {
414        self.values.len()
415    }
416
417    /// Check if empty.
418    pub fn is_empty(&self) -> bool {
419        self.values.is_empty()
420    }
421
422    /// Memory usage in bytes.
423    pub fn memory_bytes(&self) -> usize {
424        self.set.memory_bytes() + self.values.len() * std::mem::size_of::<f32>()
425    }
426
427    /// Iterate over (dimension, value) pairs.
428    pub fn iter(&self) -> impl Iterator<Item = (u16, f32)> + '_ {
429        self.set.iter().zip(self.values.iter().copied())
430    }
431}
432
433/// Batch outlier lookup for SIMD-friendly decode.
434pub struct BatchOutlierLookup<'a> {
435    storage: &'a OutlierStorage,
436}
437
438impl<'a> BatchOutlierLookup<'a> {
439    /// Create a batch lookup helper.
440    pub fn new(storage: &'a OutlierStorage) -> Self {
441        Self { storage }
442    }
443
444    /// Lookup multiple dimensions at once, returning mask and values.
445    /// For dimensions that are not outliers, value is 0.0.
446    pub fn lookup_batch(&self, dims: &[u16]) -> (Vec<bool>, Vec<f32>) {
447        let mut is_outlier = Vec::with_capacity(dims.len());
448        let mut values = Vec::with_capacity(dims.len());
449
450        for &dim in dims {
451            if let Some(v) = self.storage.get(dim) {
452                is_outlier.push(true);
453                values.push(v);
454            } else {
455                is_outlier.push(false);
456                values.push(0.0);
457            }
458        }
459
460        (is_outlier, values)
461    }
462
463    /// Lookup 4 dimensions at once (SIMD-friendly).
464    #[inline]
465    pub fn lookup_4(&self, dims: [u16; 4]) -> ([bool; 4], [f32; 4]) {
466        let mut is_outlier = [false; 4];
467        let mut values = [0.0f32; 4];
468
469        for i in 0..4 {
470            if let Some(v) = self.storage.get(dims[i]) {
471                is_outlier[i] = true;
472                values[i] = v;
473            }
474        }
475
476        (is_outlier, values)
477    }
478}
479
480#[cfg(test)]
481mod tests {
482    use super::*;
483
484    #[test]
485    fn test_empty_outlier_set() {
486        let set = OutlierSet::empty();
487
488        assert!(set.is_empty());
489        assert_eq!(set.len(), 0);
490        assert!(!set.contains(0));
491        assert!(!set.contains(100));
492    }
493
494    #[test]
495    fn test_sparse_outliers() {
496        let dims = vec![5, 10, 100, 200];
497        let set = OutlierSet::from_dims(&dims, 768);
498
499        assert!(!set.is_dense());
500        assert_eq!(set.len(), 4);
501
502        assert!(set.contains(5));
503        assert!(set.contains(10));
504        assert!(set.contains(100));
505        assert!(set.contains(200));
506
507        assert!(!set.contains(0));
508        assert!(!set.contains(6));
509        assert!(!set.contains(99));
510    }
511
512    #[test]
513    fn test_dense_outliers() {
514        // Create enough outliers to trigger dense representation
515        let dims: Vec<u16> = (0..100).collect(); // 100 outliers for D=768 → ~13%
516        let set = OutlierSet::from_dims(&dims, 768);
517
518        assert!(set.is_dense());
519        assert_eq!(set.len(), 100);
520
521        for d in 0..100 {
522            assert!(set.contains(d));
523        }
524        assert!(!set.contains(100));
525        assert!(!set.contains(500));
526    }
527
528    #[test]
529    fn test_sparse_binary_search() {
530        let sparse = SparseOutliers::from_dims(&[10, 20, 30, 40, 50]);
531
532        assert_eq!(sparse.position(10), Some(0));
533        assert_eq!(sparse.position(30), Some(2));
534        assert_eq!(sparse.position(50), Some(4));
535        assert_eq!(sparse.position(15), None);
536    }
537
538    #[test]
539    fn test_dense_rank() {
540        let dense = DenseOutliers::from_dims(&[0, 1, 2, 10, 20], 768);
541
542        assert_eq!(dense.rank(0), 0); // No bits before position 0
543        assert_eq!(dense.rank(1), 1); // One bit (position 0)
544        assert_eq!(dense.rank(2), 2); // Two bits (0, 1)
545        assert_eq!(dense.rank(10), 3); // Three bits (0, 1, 2)
546        assert_eq!(dense.rank(20), 4); // Four bits (0, 1, 2, 10)
547    }
548
549    #[test]
550    fn test_outlier_storage() {
551        let entries = vec![
552            OutlierValue {
553                dim: 10,
554                value: 1.5,
555            },
556            OutlierValue {
557                dim: 20,
558                value: -2.0,
559            },
560            OutlierValue { dim: 5, value: 0.5 },
561        ];
562
563        let storage = OutlierStorage::from_entries(&entries, 768);
564
565        assert_eq!(storage.len(), 3);
566        assert!(storage.contains(5));
567        assert!(storage.contains(10));
568        assert!(storage.contains(20));
569        assert!(!storage.contains(15));
570
571        assert_eq!(storage.get(5), Some(0.5));
572        assert_eq!(storage.get(10), Some(1.5));
573        assert_eq!(storage.get(20), Some(-2.0));
574        assert_eq!(storage.get(15), None);
575    }
576
577    #[test]
578    fn test_batch_lookup() {
579        let entries = vec![
580            OutlierValue { dim: 0, value: 1.0 },
581            OutlierValue { dim: 2, value: 2.0 },
582        ];
583        let storage = OutlierStorage::from_entries(&entries, 768);
584        let lookup = BatchOutlierLookup::new(&storage);
585
586        let (is_outlier, values) = lookup.lookup_4([0, 1, 2, 3]);
587
588        assert_eq!(is_outlier, [true, false, true, false]);
589        assert_eq!(values[0], 1.0);
590        assert_eq!(values[1], 0.0);
591        assert_eq!(values[2], 2.0);
592        assert_eq!(values[3], 0.0);
593    }
594
595    #[test]
596    fn test_outlier_iteration() {
597        let dims = vec![5, 10, 100];
598        let set = OutlierSet::from_dims(&dims, 768);
599
600        let collected: Vec<u16> = set.iter().collect();
601        assert_eq!(collected, vec![5, 10, 100]);
602    }
603
604    #[test]
605    fn test_dense_iteration() {
606        let dims: Vec<u16> = (0..100).collect();
607        let set = OutlierSet::from_dims(&dims, 768);
608
609        let collected: Vec<u16> = set.iter().collect();
610        assert_eq!(collected.len(), 100);
611        assert_eq!(collected[0], 0);
612        assert_eq!(collected[99], 99);
613    }
614
615    #[test]
616    fn test_crossover_threshold() {
617        // Just below threshold: should be sparse
618        let sparse_dims: Vec<u16> = (0..40).collect(); // ~5% for D=768
619        let sparse_set = OutlierSet::from_dims(&sparse_dims, 768);
620        assert!(!sparse_set.is_dense());
621
622        // Above threshold: should be dense
623        let dense_dims: Vec<u16> = (0..60).collect(); // ~8% for D=768
624        let dense_set = OutlierSet::from_dims(&dense_dims, 768);
625        assert!(dense_set.is_dense());
626    }
627
628    #[test]
629    fn test_memory_efficiency() {
630        let dimension = 768;
631
632        // Sparse: 10 outliers
633        let sparse_dims: Vec<u16> = (0..10).collect();
634        let sparse_set = OutlierSet::from_dims(&sparse_dims, dimension);
635        assert!(sparse_set.memory_bytes() < 100); // ~20 bytes
636
637        // Dense: 100 outliers
638        let dense_dims: Vec<u16> = (0..100).collect();
639        let dense_set = OutlierSet::from_dims(&dense_dims, dimension);
640        // Bitvector: 768/64 = 12 words × 8 bytes = 96 bytes
641        assert!(dense_set.memory_bytes() <= 100);
642    }
643
644    #[test]
645    fn test_unsorted_input() {
646        let dims = vec![100, 5, 50, 10, 200];
647        let set = OutlierSet::from_dims(&dims, 768);
648
649        // Should work regardless of input order
650        for &d in &dims {
651            assert!(set.contains(d));
652        }
653    }
654
655    #[test]
656    fn test_duplicate_dims() {
657        let dims = vec![5, 5, 10, 10, 10];
658        let set = OutlierSet::from_dims(&dims, 768);
659
660        // Should deduplicate
661        assert_eq!(set.len(), 2);
662        assert!(set.contains(5));
663        assert!(set.contains(10));
664    }
665}