Skip to main content

grafeo_core/storage/
bitpack.rs

1//! Bit-packing for small integers.
2//!
3//! If your largest value is 15, why use 64 bits per number? Bit-packing uses
4//! only the bits you need - 4 bits for values up to 15, giving you 16x compression.
5//!
6//! This works especially well after delta encoding sorted data, where the deltas
7//! are often tiny even when the original values are huge.
8//!
9//! # Example
10//!
11//! ```ignore
12//! // Values [5, 2, 3, 5, 5, 8, 2] - max is 8, needs 4 bits
13//! // Without packing: 7 * 64 = 448 bits
14//! // With packing:    7 * 4  = 28 bits (16x smaller!)
15//!
16//! let values = vec![5u64, 2, 3, 5, 5, 8, 2];
17//! let packed = BitPackedInts::pack(&values);
18//! let unpacked = packed.unpack();
19//! assert_eq!(values, unpacked);
20//! ```
21
22use std::io;
23
24/// Stores integers using only as many bits as the largest value needs.
25///
26/// Pass your values to [`pack()`](Self::pack) and we'll figure out the optimal
27/// bit width automatically. Random access via [`get()`](Self::get) is O(1).
28#[derive(Debug, Clone)]
29pub struct BitPackedInts {
30    /// Packed data.
31    data: Vec<u64>,
32    /// Number of bits per value.
33    bits_per_value: u8,
34    /// Number of values.
35    count: usize,
36}
37
38impl BitPackedInts {
39    /// Packs a slice of u64 values using the minimum bits needed.
40    #[must_use]
41    pub fn pack(values: &[u64]) -> Self {
42        if values.is_empty() {
43            return Self {
44                data: Vec::new(),
45                bits_per_value: 0,
46                count: 0,
47            };
48        }
49
50        let max_value = values.iter().copied().max().unwrap_or(0);
51        let bits = Self::bits_needed(max_value);
52        Self::pack_with_bits(values, bits)
53    }
54
55    /// Packs values using a specified bit width.
56    ///
57    /// # Panics
58    ///
59    /// Panics if any value doesn't fit in the specified bit width.
60    #[must_use]
61    pub fn pack_with_bits(values: &[u64], bits_per_value: u8) -> Self {
62        if values.is_empty() {
63            return Self {
64                data: Vec::new(),
65                bits_per_value,
66                count: 0,
67            };
68        }
69
70        if bits_per_value == 0 {
71            // All values must be 0
72            debug_assert!(values.iter().all(|&v| v == 0));
73            return Self {
74                data: Vec::new(),
75                bits_per_value: 0,
76                count: values.len(),
77            };
78        }
79
80        let bits = bits_per_value as usize;
81        let values_per_word = 64 / bits;
82        let num_words = (values.len() + values_per_word - 1) / values_per_word;
83
84        let mut data = vec![0u64; num_words];
85        let mask = if bits >= 64 {
86            u64::MAX
87        } else {
88            (1u64 << bits) - 1
89        };
90
91        for (i, &value) in values.iter().enumerate() {
92            debug_assert!(
93                value <= mask,
94                "Value {} doesn't fit in {} bits",
95                value,
96                bits_per_value
97            );
98
99            let word_idx = i / values_per_word;
100            let bit_offset = (i % values_per_word) * bits;
101            data[word_idx] |= (value & mask) << bit_offset;
102        }
103
104        Self {
105            data,
106            bits_per_value,
107            count: values.len(),
108        }
109    }
110
111    /// Unpacks all values back to u64.
112    #[must_use]
113    pub fn unpack(&self) -> Vec<u64> {
114        if self.count == 0 {
115            return Vec::new();
116        }
117
118        if self.bits_per_value == 0 {
119            return vec![0u64; self.count];
120        }
121
122        let bits = self.bits_per_value as usize;
123        let values_per_word = 64 / bits;
124        let mask = if bits >= 64 {
125            u64::MAX
126        } else {
127            (1u64 << bits) - 1
128        };
129
130        let mut result = Vec::with_capacity(self.count);
131
132        for i in 0..self.count {
133            let word_idx = i / values_per_word;
134            let bit_offset = (i % values_per_word) * bits;
135            let value = (self.data[word_idx] >> bit_offset) & mask;
136            result.push(value);
137        }
138
139        result
140    }
141
142    /// Gets a single value at the given index.
143    #[must_use]
144    pub fn get(&self, index: usize) -> Option<u64> {
145        if index >= self.count {
146            return None;
147        }
148
149        if self.bits_per_value == 0 {
150            return Some(0);
151        }
152
153        let bits = self.bits_per_value as usize;
154        let values_per_word = 64 / bits;
155        let word_idx = index / values_per_word;
156        let bit_offset = (index % values_per_word) * bits;
157        let mask = if bits >= 64 {
158            u64::MAX
159        } else {
160            (1u64 << bits) - 1
161        };
162
163        Some((self.data[word_idx] >> bit_offset) & mask)
164    }
165
166    /// Returns the number of values.
167    #[must_use]
168    pub fn len(&self) -> usize {
169        self.count
170    }
171
172    /// Returns whether the encoding is empty.
173    #[must_use]
174    pub fn is_empty(&self) -> bool {
175        self.count == 0
176    }
177
178    /// Returns the number of bits per value.
179    #[must_use]
180    pub fn bits_per_value(&self) -> u8 {
181        self.bits_per_value
182    }
183
184    /// Returns the raw packed data.
185    #[must_use]
186    pub fn data(&self) -> &[u64] {
187        &self.data
188    }
189
190    /// Returns the compression ratio compared to storing full u64s.
191    #[must_use]
192    pub fn compression_ratio(&self) -> f64 {
193        if self.count == 0 {
194            return 1.0;
195        }
196
197        let original_size = self.count * 8; // 8 bytes per u64
198        let packed_size = self.data.len() * 8;
199
200        if packed_size == 0 {
201            return f64::INFINITY; // All zeros, perfect compression
202        }
203
204        original_size as f64 / packed_size as f64
205    }
206
207    /// Returns the number of bits needed to represent a value.
208    #[must_use]
209    pub fn bits_needed(value: u64) -> u8 {
210        if value == 0 {
211            1 // Need at least 1 bit to represent 0
212        } else {
213            64 - value.leading_zeros() as u8
214        }
215    }
216
217    /// Serializes to bytes.
218    pub fn to_bytes(&self) -> Vec<u8> {
219        let mut buf = Vec::with_capacity(1 + 4 + self.data.len() * 8);
220        buf.push(self.bits_per_value);
221        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
222        for &word in &self.data {
223            buf.extend_from_slice(&word.to_le_bytes());
224        }
225        buf
226    }
227
228    /// Deserializes from bytes.
229    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
230        if bytes.len() < 5 {
231            return Err(io::Error::new(
232                io::ErrorKind::InvalidData,
233                "BitPackedInts too short",
234            ));
235        }
236
237        let bits_per_value = bytes[0];
238        let count = u32::from_le_bytes(bytes[1..5].try_into().unwrap()) as usize;
239
240        let num_words = if bits_per_value == 0 || count == 0 {
241            0
242        } else {
243            let values_per_word = 64 / bits_per_value as usize;
244            (count + values_per_word - 1) / values_per_word
245        };
246
247        if bytes.len() < 5 + num_words * 8 {
248            return Err(io::Error::new(
249                io::ErrorKind::InvalidData,
250                "BitPackedInts truncated",
251            ));
252        }
253
254        let mut data = Vec::with_capacity(num_words);
255        for i in 0..num_words {
256            let offset = 5 + i * 8;
257            let word = u64::from_le_bytes(bytes[offset..offset + 8].try_into().unwrap());
258            data.push(word);
259        }
260
261        Ok(Self {
262            data,
263            bits_per_value,
264            count,
265        })
266    }
267}
268
269/// The best compression for sorted integers - delta encoding plus bit-packing.
270///
271/// Stores the first value, then packs the differences between consecutive values.
272/// For sequential IDs like [1000, 1001, 1002, ...], deltas are all 1, needing just
273/// 1 bit each - that's up to 64x compression!
274#[derive(Debug, Clone)]
275pub struct DeltaBitPacked {
276    /// Base value (first value in sequence).
277    base: u64,
278    /// Bit-packed deltas.
279    deltas: BitPackedInts,
280}
281
282impl DeltaBitPacked {
283    /// Encodes sorted values using delta + bit-packing.
284    #[must_use]
285    pub fn encode(values: &[u64]) -> Self {
286        if values.is_empty() {
287            return Self {
288                base: 0,
289                deltas: BitPackedInts::pack(&[]),
290            };
291        }
292
293        let base = values[0];
294        let delta_values: Vec<u64> = values
295            .windows(2)
296            .map(|w| w[1].saturating_sub(w[0]))
297            .collect();
298
299        let deltas = BitPackedInts::pack(&delta_values);
300
301        Self { base, deltas }
302    }
303
304    /// Decodes back to the original values.
305    #[must_use]
306    pub fn decode(&self) -> Vec<u64> {
307        if self.deltas.is_empty() && self.base == 0 {
308            return Vec::new();
309        }
310
311        let delta_values = self.deltas.unpack();
312        let mut result = Vec::with_capacity(delta_values.len() + 1);
313        let mut current = self.base;
314        result.push(current);
315
316        for delta in delta_values {
317            current = current.wrapping_add(delta);
318            result.push(current);
319        }
320
321        result
322    }
323
324    /// Returns the number of values.
325    #[must_use]
326    pub fn len(&self) -> usize {
327        if self.deltas.is_empty() && self.base == 0 {
328            0
329        } else {
330            self.deltas.len() + 1
331        }
332    }
333
334    /// Returns whether the encoding is empty.
335    #[must_use]
336    pub fn is_empty(&self) -> bool {
337        self.deltas.is_empty() && self.base == 0
338    }
339
340    /// Returns the base value.
341    #[must_use]
342    pub fn base(&self) -> u64 {
343        self.base
344    }
345
346    /// Returns the bits used per delta.
347    #[must_use]
348    pub fn bits_per_delta(&self) -> u8 {
349        self.deltas.bits_per_value()
350    }
351
352    /// Returns the compression ratio.
353    #[must_use]
354    pub fn compression_ratio(&self) -> f64 {
355        let count = self.len();
356        if count == 0 {
357            return 1.0;
358        }
359
360        let original_size = count * 8;
361        let packed_size = 8 + self.deltas.data().len() * 8; // base + packed deltas
362
363        original_size as f64 / packed_size as f64
364    }
365
366    /// Serializes to bytes.
367    pub fn to_bytes(&self) -> Vec<u8> {
368        let delta_bytes = self.deltas.to_bytes();
369        let mut buf = Vec::with_capacity(8 + delta_bytes.len());
370        buf.extend_from_slice(&self.base.to_le_bytes());
371        buf.extend_from_slice(&delta_bytes);
372        buf
373    }
374
375    /// Deserializes from bytes.
376    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
377        if bytes.len() < 8 {
378            return Err(io::Error::new(
379                io::ErrorKind::InvalidData,
380                "DeltaBitPacked too short",
381            ));
382        }
383
384        let base = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
385        let deltas = BitPackedInts::from_bytes(&bytes[8..])?;
386
387        Ok(Self { base, deltas })
388    }
389}
390
391#[cfg(test)]
392mod tests {
393    use super::*;
394
395    #[test]
396    fn test_bitpack_basic() {
397        let values = vec![5u64, 2, 3, 5, 5, 8, 2];
398        let packed = BitPackedInts::pack(&values);
399        let unpacked = packed.unpack();
400        assert_eq!(values, unpacked);
401    }
402
403    #[test]
404    fn test_bitpack_empty() {
405        let values: Vec<u64> = vec![];
406        let packed = BitPackedInts::pack(&values);
407        assert!(packed.is_empty());
408        assert_eq!(packed.unpack(), values);
409    }
410
411    #[test]
412    fn test_bitpack_single() {
413        let values = vec![42u64];
414        let packed = BitPackedInts::pack(&values);
415        assert_eq!(packed.len(), 1);
416        assert_eq!(packed.unpack(), values);
417    }
418
419    #[test]
420    fn test_bitpack_all_zeros() {
421        let values = vec![0u64; 100];
422        let packed = BitPackedInts::pack(&values);
423        assert_eq!(packed.bits_per_value(), 1);
424        assert_eq!(packed.unpack(), values);
425    }
426
427    #[test]
428    fn test_bitpack_powers_of_two() {
429        for bits in 1..=64u8 {
430            let max_val = if bits == 64 {
431                u64::MAX
432            } else {
433                (1u64 << bits) - 1
434            };
435            let values = vec![0, max_val / 2, max_val];
436            let packed = BitPackedInts::pack(&values);
437            assert_eq!(packed.bits_per_value(), bits);
438            assert_eq!(packed.unpack(), values);
439        }
440    }
441
442    #[test]
443    fn test_bitpack_get() {
444        let values = vec![1u64, 2, 3, 4, 5];
445        let packed = BitPackedInts::pack(&values);
446
447        for (i, &expected) in values.iter().enumerate() {
448            assert_eq!(packed.get(i), Some(expected));
449        }
450        assert_eq!(packed.get(100), None);
451    }
452
453    #[test]
454    fn test_bitpack_compression() {
455        // 100 values all <= 15 (4 bits each)
456        let values: Vec<u64> = (0..100).map(|i| i % 16).collect();
457        let packed = BitPackedInts::pack(&values);
458        assert_eq!(packed.bits_per_value(), 4);
459        // 100 * 64 bits -> 100 * 4 bits = 16x compression
460        let ratio = packed.compression_ratio();
461        assert!(ratio > 10.0, "Expected ratio > 10, got {}", ratio);
462    }
463
464    #[test]
465    fn test_bitpack_serialization() {
466        let values = vec![1u64, 3, 7, 15, 31];
467        let packed = BitPackedInts::pack(&values);
468        let bytes = packed.to_bytes();
469        let restored = BitPackedInts::from_bytes(&bytes).unwrap();
470        assert_eq!(packed.unpack(), restored.unpack());
471    }
472
473    #[test]
474    fn test_delta_bitpacked_basic() {
475        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
476        let encoded = DeltaBitPacked::encode(&values);
477        let decoded = encoded.decode();
478        assert_eq!(values, decoded);
479    }
480
481    #[test]
482    fn test_delta_bitpacked_sequential() {
483        // Sequential values: deltas are all 1, needs only 1 bit each
484        let values: Vec<u64> = (1000..1100).collect();
485        let encoded = DeltaBitPacked::encode(&values);
486        assert_eq!(encoded.bits_per_delta(), 1);
487        assert_eq!(encoded.decode(), values);
488
489        // Great compression: 100 * 64 bits -> 8 (base) + ~100 bits
490        let ratio = encoded.compression_ratio();
491        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
492    }
493
494    #[test]
495    fn test_delta_bitpacked_empty() {
496        let values: Vec<u64> = vec![];
497        let encoded = DeltaBitPacked::encode(&values);
498        assert!(encoded.is_empty());
499        assert_eq!(encoded.decode(), values);
500    }
501
502    #[test]
503    fn test_delta_bitpacked_single() {
504        let values = vec![42u64];
505        let encoded = DeltaBitPacked::encode(&values);
506        assert_eq!(encoded.len(), 1);
507        assert_eq!(encoded.decode(), values);
508    }
509
510    #[test]
511    fn test_delta_bitpacked_serialization() {
512        let values = vec![100u64, 105, 107, 110, 115];
513        let encoded = DeltaBitPacked::encode(&values);
514        let bytes = encoded.to_bytes();
515        let restored = DeltaBitPacked::from_bytes(&bytes).unwrap();
516        assert_eq!(encoded.decode(), restored.decode());
517    }
518
519    #[test]
520    fn test_bits_needed() {
521        assert_eq!(BitPackedInts::bits_needed(0), 1);
522        assert_eq!(BitPackedInts::bits_needed(1), 1);
523        assert_eq!(BitPackedInts::bits_needed(2), 2);
524        assert_eq!(BitPackedInts::bits_needed(3), 2);
525        assert_eq!(BitPackedInts::bits_needed(4), 3);
526        assert_eq!(BitPackedInts::bits_needed(7), 3);
527        assert_eq!(BitPackedInts::bits_needed(8), 4);
528        assert_eq!(BitPackedInts::bits_needed(255), 8);
529        assert_eq!(BitPackedInts::bits_needed(256), 9);
530        assert_eq!(BitPackedInts::bits_needed(u64::MAX), 64);
531    }
532}