Skip to main content

grafeo_core/codec/
bitpack.rs

1//! Bit-packing for small integers.
2//!
3//! If your largest value is 15, why use 64 bits per number? Bit-packing uses
4//! only the bits you need - 4 bits for values up to 15, giving you 16x compression.
5//!
6//! This works especially well after delta encoding sorted data, where the deltas
7//! are often tiny even when the original values are huge.
8//!
9//! # Example
10//!
11//! ```no_run
12//! # use grafeo_core::codec::bitpack::BitPackedInts;
13//! // Values [5, 2, 3, 5, 5, 8, 2] - max is 8, needs 4 bits
14//! // Without packing: 7 * 64 = 448 bits
15//! // With packing:    7 * 4  = 28 bits (16x smaller!)
16//!
17//! let values = vec![5u64, 2, 3, 5, 5, 8, 2];
18//! let packed = BitPackedInts::pack(&values);
19//! let unpacked = packed.unpack();
20//! assert_eq!(values, unpacked);
21//! ```
22
23use std::io;
24
25/// Stores integers using only as many bits as the largest value needs.
26///
27/// Pass your values to [`pack()`](Self::pack) and we'll figure out the optimal
28/// bit width automatically. Random access via [`get()`](Self::get) is O(1).
29#[derive(Debug, Clone)]
30pub struct BitPackedInts {
31    /// Packed data.
32    data: Vec<u64>,
33    /// Number of bits per value.
34    bits_per_value: u8,
35    /// Number of values.
36    count: usize,
37}
38
39impl BitPackedInts {
40    /// Packs a slice of u64 values using the minimum bits needed.
41    #[must_use]
42    pub fn pack(values: &[u64]) -> Self {
43        if values.is_empty() {
44            return Self {
45                data: Vec::new(),
46                bits_per_value: 0,
47                count: 0,
48            };
49        }
50
51        let max_value = values.iter().copied().max().unwrap_or(0);
52        let bits = Self::bits_needed(max_value);
53        Self::pack_with_bits(values, bits)
54    }
55
56    /// Packs values using a specified bit width.
57    ///
58    /// # Panics
59    ///
60    /// Panics if any value doesn't fit in the specified bit width.
61    #[must_use]
62    pub fn pack_with_bits(values: &[u64], bits_per_value: u8) -> Self {
63        if values.is_empty() {
64            return Self {
65                data: Vec::new(),
66                bits_per_value,
67                count: 0,
68            };
69        }
70
71        if bits_per_value == 0 {
72            // All values must be 0
73            debug_assert!(values.iter().all(|&v| v == 0));
74            return Self {
75                data: Vec::new(),
76                bits_per_value: 0,
77                count: values.len(),
78            };
79        }
80
81        let bits = bits_per_value as usize;
82        let values_per_word = 64 / bits;
83        let num_words = (values.len() + values_per_word - 1) / values_per_word;
84
85        let mut data = vec![0u64; num_words];
86        let mask = if bits >= 64 {
87            u64::MAX
88        } else {
89            (1u64 << bits) - 1
90        };
91
92        for (i, &value) in values.iter().enumerate() {
93            debug_assert!(
94                value <= mask,
95                "Value {} doesn't fit in {} bits",
96                value,
97                bits_per_value
98            );
99
100            let word_idx = i / values_per_word;
101            let bit_offset = (i % values_per_word) * bits;
102            data[word_idx] |= (value & mask) << bit_offset;
103        }
104
105        Self {
106            data,
107            bits_per_value,
108            count: values.len(),
109        }
110    }
111
112    /// Unpacks all values back to u64.
113    #[must_use]
114    pub fn unpack(&self) -> Vec<u64> {
115        if self.count == 0 {
116            return Vec::new();
117        }
118
119        if self.bits_per_value == 0 {
120            return vec![0u64; self.count];
121        }
122
123        let bits = self.bits_per_value as usize;
124        let values_per_word = 64 / bits;
125        let mask = if bits >= 64 {
126            u64::MAX
127        } else {
128            (1u64 << bits) - 1
129        };
130
131        let mut result = Vec::with_capacity(self.count);
132
133        for i in 0..self.count {
134            let word_idx = i / values_per_word;
135            let bit_offset = (i % values_per_word) * bits;
136            let value = (self.data[word_idx] >> bit_offset) & mask;
137            result.push(value);
138        }
139
140        result
141    }
142
143    /// Gets a single value at the given index.
144    #[must_use]
145    pub fn get(&self, index: usize) -> Option<u64> {
146        if index >= self.count {
147            return None;
148        }
149
150        if self.bits_per_value == 0 {
151            return Some(0);
152        }
153
154        let bits = self.bits_per_value as usize;
155        let values_per_word = 64 / bits;
156        let word_idx = index / values_per_word;
157        let bit_offset = (index % values_per_word) * bits;
158        let mask = if bits >= 64 {
159            u64::MAX
160        } else {
161            (1u64 << bits) - 1
162        };
163
164        Some((self.data[word_idx] >> bit_offset) & mask)
165    }
166
167    /// Returns the number of values.
168    #[must_use]
169    pub fn len(&self) -> usize {
170        self.count
171    }
172
173    /// Returns whether the encoding is empty.
174    #[must_use]
175    pub fn is_empty(&self) -> bool {
176        self.count == 0
177    }
178
179    /// Returns the number of bits per value.
180    #[must_use]
181    pub fn bits_per_value(&self) -> u8 {
182        self.bits_per_value
183    }
184
185    /// Returns the raw packed data.
186    #[must_use]
187    pub fn data(&self) -> &[u64] {
188        &self.data
189    }
190
191    /// Returns the compression ratio compared to storing full u64s.
192    #[must_use]
193    pub fn compression_ratio(&self) -> f64 {
194        if self.count == 0 {
195            return 1.0;
196        }
197
198        let original_size = self.count * 8; // 8 bytes per u64
199        let packed_size = self.data.len() * 8;
200
201        if packed_size == 0 {
202            return f64::INFINITY; // All zeros, perfect compression
203        }
204
205        original_size as f64 / packed_size as f64
206    }
207
208    /// Returns the number of bits needed to represent a value.
209    #[must_use]
210    pub fn bits_needed(value: u64) -> u8 {
211        if value == 0 {
212            1 // Need at least 1 bit to represent 0
213        } else {
214            64 - value.leading_zeros() as u8
215        }
216    }
217
218    /// Serializes to bytes.
219    pub fn to_bytes(&self) -> Vec<u8> {
220        let mut buf = Vec::with_capacity(1 + 4 + self.data.len() * 8);
221        buf.push(self.bits_per_value);
222        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
223        for &word in &self.data {
224            buf.extend_from_slice(&word.to_le_bytes());
225        }
226        buf
227    }
228
229    /// Deserializes from bytes.
230    ///
231    /// # Errors
232    ///
233    /// Returns `Err` if the byte slice is too short or contains invalid data.
234    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
235        if bytes.len() < 5 {
236            return Err(io::Error::new(
237                io::ErrorKind::InvalidData,
238                "BitPackedInts too short",
239            ));
240        }
241
242        let bits_per_value = bytes[0];
243        if bits_per_value > 64 {
244            return Err(io::Error::new(
245                io::ErrorKind::InvalidData,
246                format!("BitPackedInts bits_per_value {bits_per_value} exceeds 64"),
247            ));
248        }
249        let count = u32::from_le_bytes(
250            bytes[1..5]
251                .try_into()
252                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
253        ) as usize;
254
255        let num_words = if bits_per_value == 0 || count == 0 {
256            0
257        } else {
258            let values_per_word = 64 / bits_per_value as usize;
259            (count + values_per_word - 1) / values_per_word
260        };
261
262        if bytes.len() < 5 + num_words * 8 {
263            return Err(io::Error::new(
264                io::ErrorKind::InvalidData,
265                "BitPackedInts truncated",
266            ));
267        }
268
269        let mut data = Vec::with_capacity(num_words);
270        for i in 0..num_words {
271            let offset = 5 + i * 8;
272            let word = u64::from_le_bytes(
273                bytes[offset..offset + 8]
274                    .try_into()
275                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
276            );
277            data.push(word);
278        }
279
280        Ok(Self {
281            data,
282            bits_per_value,
283            count,
284        })
285    }
286}
287
288/// The best compression for sorted integers - delta encoding plus bit-packing.
289///
290/// Stores the first value, then packs the differences between consecutive values.
291/// For sequential IDs like [1000, 1001, 1002, ...], deltas are all 1, needing just
292/// 1 bit each - that's up to 64x compression!
293#[derive(Debug, Clone)]
294pub struct DeltaBitPacked {
295    /// Base value (first value in sequence).
296    base: u64,
297    /// Bit-packed deltas.
298    deltas: BitPackedInts,
299}
300
301impl DeltaBitPacked {
302    /// Encodes sorted values using delta + bit-packing.
303    #[must_use]
304    pub fn encode(values: &[u64]) -> Self {
305        if values.is_empty() {
306            return Self {
307                base: 0,
308                deltas: BitPackedInts::pack(&[]),
309            };
310        }
311
312        let base = values[0];
313        let delta_values: Vec<u64> = values
314            .windows(2)
315            .map(|w| w[1].saturating_sub(w[0]))
316            .collect();
317
318        let deltas = BitPackedInts::pack(&delta_values);
319
320        Self { base, deltas }
321    }
322
323    /// Decodes back to the original values.
324    #[must_use]
325    pub fn decode(&self) -> Vec<u64> {
326        if self.deltas.is_empty() && self.base == 0 {
327            return Vec::new();
328        }
329
330        let delta_values = self.deltas.unpack();
331        let mut result = Vec::with_capacity(delta_values.len() + 1);
332        let mut current = self.base;
333        result.push(current);
334
335        for delta in delta_values {
336            current = current.wrapping_add(delta);
337            result.push(current);
338        }
339
340        result
341    }
342
343    /// Returns the number of values.
344    #[must_use]
345    pub fn len(&self) -> usize {
346        if self.deltas.is_empty() && self.base == 0 {
347            0
348        } else {
349            self.deltas.len() + 1
350        }
351    }
352
353    /// Returns whether the encoding is empty.
354    #[must_use]
355    pub fn is_empty(&self) -> bool {
356        self.deltas.is_empty() && self.base == 0
357    }
358
359    /// Returns the base value.
360    #[must_use]
361    pub fn base(&self) -> u64 {
362        self.base
363    }
364
365    /// Returns the bits used per delta.
366    #[must_use]
367    pub fn bits_per_delta(&self) -> u8 {
368        self.deltas.bits_per_value()
369    }
370
371    /// Returns the compression ratio.
372    #[must_use]
373    pub fn compression_ratio(&self) -> f64 {
374        let count = self.len();
375        if count == 0 {
376            return 1.0;
377        }
378
379        let original_size = count * 8;
380        let packed_size = 8 + self.deltas.data().len() * 8; // base + packed deltas
381
382        original_size as f64 / packed_size as f64
383    }
384
385    /// Serializes to bytes.
386    pub fn to_bytes(&self) -> Vec<u8> {
387        let delta_bytes = self.deltas.to_bytes();
388        let mut buf = Vec::with_capacity(8 + delta_bytes.len());
389        buf.extend_from_slice(&self.base.to_le_bytes());
390        buf.extend_from_slice(&delta_bytes);
391        buf
392    }
393
394    /// Deserializes from bytes.
395    ///
396    /// # Errors
397    ///
398    /// Returns `Err` if the byte slice is too short or contains invalid data.
399    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
400        if bytes.len() < 8 {
401            return Err(io::Error::new(
402                io::ErrorKind::InvalidData,
403                "DeltaBitPacked too short",
404            ));
405        }
406
407        let base = u64::from_le_bytes(
408            bytes[0..8]
409                .try_into()
410                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
411        );
412        let deltas = BitPackedInts::from_bytes(&bytes[8..])?;
413
414        Ok(Self { base, deltas })
415    }
416}
417
418#[cfg(test)]
419mod tests {
420    use super::*;
421
422    #[test]
423    fn test_bitpack_basic() {
424        let values = vec![5u64, 2, 3, 5, 5, 8, 2];
425        let packed = BitPackedInts::pack(&values);
426        let unpacked = packed.unpack();
427        assert_eq!(values, unpacked);
428    }
429
430    #[test]
431    fn test_bitpack_empty() {
432        let values: Vec<u64> = vec![];
433        let packed = BitPackedInts::pack(&values);
434        assert!(packed.is_empty());
435        assert_eq!(packed.unpack(), values);
436    }
437
438    #[test]
439    fn test_bitpack_single() {
440        let values = vec![42u64];
441        let packed = BitPackedInts::pack(&values);
442        assert_eq!(packed.len(), 1);
443        assert_eq!(packed.unpack(), values);
444    }
445
446    #[test]
447    fn test_bitpack_all_zeros() {
448        let values = vec![0u64; 100];
449        let packed = BitPackedInts::pack(&values);
450        assert_eq!(packed.bits_per_value(), 1);
451        assert_eq!(packed.unpack(), values);
452    }
453
454    #[test]
455    fn test_bitpack_powers_of_two() {
456        for bits in 1..=64u8 {
457            let max_val = if bits == 64 {
458                u64::MAX
459            } else {
460                (1u64 << bits) - 1
461            };
462            let values = vec![0, max_val / 2, max_val];
463            let packed = BitPackedInts::pack(&values);
464            assert_eq!(packed.bits_per_value(), bits);
465            assert_eq!(packed.unpack(), values);
466        }
467    }
468
469    #[test]
470    fn test_bitpack_get() {
471        let values = vec![1u64, 2, 3, 4, 5];
472        let packed = BitPackedInts::pack(&values);
473
474        for (i, &expected) in values.iter().enumerate() {
475            assert_eq!(packed.get(i), Some(expected));
476        }
477        assert_eq!(packed.get(100), None);
478    }
479
480    #[test]
481    fn test_bitpack_compression() {
482        // 100 values all <= 15 (4 bits each)
483        let values: Vec<u64> = (0..100).map(|i| i % 16).collect();
484        let packed = BitPackedInts::pack(&values);
485        assert_eq!(packed.bits_per_value(), 4);
486        // 100 * 64 bits -> 100 * 4 bits = 16x compression
487        let ratio = packed.compression_ratio();
488        assert!(ratio > 10.0, "Expected ratio > 10, got {}", ratio);
489    }
490
491    #[test]
492    fn test_bitpack_serialization() {
493        let values = vec![1u64, 3, 7, 15, 31];
494        let packed = BitPackedInts::pack(&values);
495        let bytes = packed.to_bytes();
496        let restored = BitPackedInts::from_bytes(&bytes).unwrap();
497        assert_eq!(packed.unpack(), restored.unpack());
498    }
499
500    #[test]
501    fn test_delta_bitpacked_basic() {
502        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
503        let encoded = DeltaBitPacked::encode(&values);
504        let decoded = encoded.decode();
505        assert_eq!(values, decoded);
506    }
507
508    #[test]
509    fn test_delta_bitpacked_sequential() {
510        // Sequential values: deltas are all 1, needs only 1 bit each
511        let values: Vec<u64> = (1000..1100).collect();
512        let encoded = DeltaBitPacked::encode(&values);
513        assert_eq!(encoded.bits_per_delta(), 1);
514        assert_eq!(encoded.decode(), values);
515
516        // Great compression: 100 * 64 bits -> 8 (base) + ~100 bits
517        let ratio = encoded.compression_ratio();
518        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
519    }
520
521    #[test]
522    fn test_delta_bitpacked_empty() {
523        let values: Vec<u64> = vec![];
524        let encoded = DeltaBitPacked::encode(&values);
525        assert!(encoded.is_empty());
526        assert_eq!(encoded.decode(), values);
527    }
528
529    #[test]
530    fn test_delta_bitpacked_single() {
531        let values = vec![42u64];
532        let encoded = DeltaBitPacked::encode(&values);
533        assert_eq!(encoded.len(), 1);
534        assert_eq!(encoded.decode(), values);
535    }
536
537    #[test]
538    fn test_delta_bitpacked_serialization() {
539        let values = vec![100u64, 105, 107, 110, 115];
540        let encoded = DeltaBitPacked::encode(&values);
541        let bytes = encoded.to_bytes();
542        let restored = DeltaBitPacked::from_bytes(&bytes).unwrap();
543        assert_eq!(encoded.decode(), restored.decode());
544    }
545
546    #[test]
547    fn test_bits_needed() {
548        assert_eq!(BitPackedInts::bits_needed(0), 1);
549        assert_eq!(BitPackedInts::bits_needed(1), 1);
550        assert_eq!(BitPackedInts::bits_needed(2), 2);
551        assert_eq!(BitPackedInts::bits_needed(3), 2);
552        assert_eq!(BitPackedInts::bits_needed(4), 3);
553        assert_eq!(BitPackedInts::bits_needed(7), 3);
554        assert_eq!(BitPackedInts::bits_needed(8), 4);
555        assert_eq!(BitPackedInts::bits_needed(255), 8);
556        assert_eq!(BitPackedInts::bits_needed(256), 9);
557        assert_eq!(BitPackedInts::bits_needed(u64::MAX), 64);
558    }
559}