Skip to main content

grafeo_core/codec/
bitpack.rs

1//! Bit-packing for small integers.
2//!
3//! If your largest value is 15, why use 64 bits per number? Bit-packing uses
4//! only the bits you need - 4 bits for values up to 15, giving you 16x compression.
5//!
6//! This works especially well after delta encoding sorted data, where the deltas
7//! are often tiny even when the original values are huge.
8//!
9//! # Example
10//!
11//! ```no_run
12//! # use grafeo_core::codec::bitpack::BitPackedInts;
13//! // Values [5, 2, 3, 5, 5, 8, 2] - max is 8, needs 4 bits
14//! // Without packing: 7 * 64 = 448 bits
15//! // With packing:    7 * 4  = 28 bits (16x smaller!)
16//!
17//! let values = vec![5u64, 2, 3, 5, 5, 8, 2];
18//! let packed = BitPackedInts::pack(&values);
19//! let unpacked = packed.unpack();
20//! assert_eq!(values, unpacked);
21//! ```
22
23use std::io;
24
25/// Stores integers using only as many bits as the largest value needs.
26///
27/// Pass your values to [`pack()`](Self::pack) and we'll figure out the optimal
28/// bit width automatically. Random access via [`get()`](Self::get) is O(1).
29#[derive(Debug, Clone)]
30pub struct BitPackedInts {
31    /// Packed data.
32    data: Vec<u64>,
33    /// Number of bits per value.
34    bits_per_value: u8,
35    /// Number of values.
36    count: usize,
37}
38
39impl BitPackedInts {
40    /// Reconstructs from pre-packed raw parts.
41    ///
42    /// Used by section deserialization. The caller is responsible for ensuring
43    /// the data is consistent (correct word count for the given bits and count).
44    #[must_use]
45    pub fn from_raw_parts(data: Vec<u64>, bits_per_value: u8, count: usize) -> Self {
46        Self {
47            data,
48            bits_per_value,
49            count,
50        }
51    }
52
53    /// Packs a slice of u64 values using the minimum bits needed.
54    #[must_use]
55    pub fn pack(values: &[u64]) -> Self {
56        if values.is_empty() {
57            return Self {
58                data: Vec::new(),
59                bits_per_value: 0,
60                count: 0,
61            };
62        }
63
64        let max_value = values.iter().copied().max().unwrap_or(0);
65        let bits = Self::bits_needed(max_value);
66        Self::pack_with_bits(values, bits)
67    }
68
69    /// Packs values using a specified bit width.
70    ///
71    /// # Panics
72    ///
73    /// Panics if any value doesn't fit in the specified bit width.
74    #[must_use]
75    pub fn pack_with_bits(values: &[u64], bits_per_value: u8) -> Self {
76        if values.is_empty() {
77            return Self {
78                data: Vec::new(),
79                bits_per_value,
80                count: 0,
81            };
82        }
83
84        if bits_per_value == 0 {
85            // All values must be 0
86            debug_assert!(values.iter().all(|&v| v == 0));
87            return Self {
88                data: Vec::new(),
89                bits_per_value: 0,
90                count: values.len(),
91            };
92        }
93
94        let bits = bits_per_value as usize;
95        let values_per_word = 64 / bits;
96        let num_words = (values.len() + values_per_word - 1) / values_per_word;
97
98        let mut data = vec![0u64; num_words];
99        let mask = if bits >= 64 {
100            u64::MAX
101        } else {
102            (1u64 << bits) - 1
103        };
104
105        for (i, &value) in values.iter().enumerate() {
106            debug_assert!(
107                value <= mask,
108                "Value {} doesn't fit in {} bits",
109                value,
110                bits_per_value
111            );
112
113            let word_idx = i / values_per_word;
114            let bit_offset = (i % values_per_word) * bits;
115            data[word_idx] |= (value & mask) << bit_offset;
116        }
117
118        Self {
119            data,
120            bits_per_value,
121            count: values.len(),
122        }
123    }
124
125    /// Unpacks all values back to u64.
126    #[must_use]
127    pub fn unpack(&self) -> Vec<u64> {
128        if self.count == 0 {
129            return Vec::new();
130        }
131
132        if self.bits_per_value == 0 {
133            return vec![0u64; self.count];
134        }
135
136        let bits = self.bits_per_value as usize;
137        let values_per_word = 64 / bits;
138        let mask = if bits >= 64 {
139            u64::MAX
140        } else {
141            (1u64 << bits) - 1
142        };
143
144        let mut result = Vec::with_capacity(self.count);
145
146        for i in 0..self.count {
147            let word_idx = i / values_per_word;
148            let bit_offset = (i % values_per_word) * bits;
149            let value = (self.data[word_idx] >> bit_offset) & mask;
150            result.push(value);
151        }
152
153        result
154    }
155
156    /// Gets a single value at the given index.
157    #[must_use]
158    pub fn get(&self, index: usize) -> Option<u64> {
159        if index >= self.count {
160            return None;
161        }
162
163        if self.bits_per_value == 0 {
164            return Some(0);
165        }
166
167        let bits = self.bits_per_value as usize;
168        let values_per_word = 64 / bits;
169        let word_idx = index / values_per_word;
170        let bit_offset = (index % values_per_word) * bits;
171        let mask = if bits >= 64 {
172            u64::MAX
173        } else {
174            (1u64 << bits) - 1
175        };
176
177        Some((self.data[word_idx] >> bit_offset) & mask)
178    }
179
180    /// Returns the number of values.
181    #[must_use]
182    pub fn len(&self) -> usize {
183        self.count
184    }
185
186    /// Returns whether the encoding is empty.
187    #[must_use]
188    pub fn is_empty(&self) -> bool {
189        self.count == 0
190    }
191
192    /// Returns the number of bits per value.
193    #[must_use]
194    pub fn bits_per_value(&self) -> u8 {
195        self.bits_per_value
196    }
197
198    /// Returns the raw packed data.
199    #[must_use]
200    pub fn data(&self) -> &[u64] {
201        &self.data
202    }
203
204    /// Returns the compression ratio compared to storing full u64s.
205    #[must_use]
206    pub fn compression_ratio(&self) -> f64 {
207        if self.count == 0 {
208            return 1.0;
209        }
210
211        let original_size = self.count * 8; // 8 bytes per u64
212        let packed_size = self.data.len() * 8;
213
214        if packed_size == 0 {
215            return f64::INFINITY; // All zeros, perfect compression
216        }
217
218        original_size as f64 / packed_size as f64
219    }
220
221    /// Returns the number of bits needed to represent a value.
222    ///
223    /// The result is always in `1..=64`.
224    ///
225    /// # Panics
226    ///
227    /// Cannot panic: the result of `64 - leading_zeros()` is always in
228    /// `1..=64`, which fits `u8`.
229    #[must_use]
230    pub fn bits_needed(value: u64) -> u8 {
231        if value == 0 {
232            1 // Need at least 1 bit to represent 0
233        } else {
234            // leading_zeros() returns u32 in [0, 63] for non-zero u64;
235            // (64 - n) is in [1, 64], which always fits u8.
236            u8::try_from(64u32 - value.leading_zeros()).expect("bits_needed result is in 1..=64")
237        }
238    }
239
240    /// Serializes to bytes.
241    ///
242    /// # Errors
243    ///
244    /// Returns `Err` if the value count exceeds `u32::MAX`.
245    pub fn to_bytes(&self) -> io::Result<Vec<u8>> {
246        let count_u32 = u32::try_from(self.count).map_err(|_| {
247            io::Error::new(
248                io::ErrorKind::InvalidInput,
249                format!(
250                    "BitPackedInts count {} exceeds u32::MAX, cannot serialize",
251                    self.count
252                ),
253            )
254        })?;
255        let mut buf = Vec::with_capacity(1 + 4 + self.data.len() * 8);
256        buf.push(self.bits_per_value);
257        buf.extend_from_slice(&count_u32.to_le_bytes());
258        for &word in &self.data {
259            buf.extend_from_slice(&word.to_le_bytes());
260        }
261        Ok(buf)
262    }
263
264    /// Deserializes from bytes.
265    ///
266    /// # Errors
267    ///
268    /// Returns `Err` if the byte slice is too short or contains invalid data.
269    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
270        if bytes.len() < 5 {
271            return Err(io::Error::new(
272                io::ErrorKind::InvalidData,
273                "BitPackedInts too short",
274            ));
275        }
276
277        let bits_per_value = bytes[0];
278        if bits_per_value > 64 {
279            return Err(io::Error::new(
280                io::ErrorKind::InvalidData,
281                format!("BitPackedInts bits_per_value {bits_per_value} exceeds 64"),
282            ));
283        }
284        let count = u32::from_le_bytes(
285            bytes[1..5]
286                .try_into()
287                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
288        ) as usize;
289
290        let num_words = if bits_per_value == 0 || count == 0 {
291            0
292        } else {
293            let values_per_word = 64 / bits_per_value as usize;
294            (count + values_per_word - 1) / values_per_word
295        };
296
297        if bytes.len() < 5 + num_words * 8 {
298            return Err(io::Error::new(
299                io::ErrorKind::InvalidData,
300                "BitPackedInts truncated",
301            ));
302        }
303
304        let mut data = Vec::with_capacity(num_words);
305        for i in 0..num_words {
306            let offset = 5 + i * 8;
307            let word = u64::from_le_bytes(
308                bytes[offset..offset + 8]
309                    .try_into()
310                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
311            );
312            data.push(word);
313        }
314
315        Ok(Self {
316            data,
317            bits_per_value,
318            count,
319        })
320    }
321}
322
323/// The best compression for sorted integers - delta encoding plus bit-packing.
324///
325/// Stores the first value, then packs the differences between consecutive values.
326/// For sequential IDs like [1000, 1001, 1002, ...], deltas are all 1, needing just
327/// 1 bit each - that's up to 64x compression!
328#[derive(Debug, Clone)]
329pub struct DeltaBitPacked {
330    /// Base value (first value in sequence).
331    base: u64,
332    /// Bit-packed deltas.
333    deltas: BitPackedInts,
334}
335
336impl DeltaBitPacked {
337    /// Encodes sorted values using delta + bit-packing.
338    #[must_use]
339    pub fn encode(values: &[u64]) -> Self {
340        if values.is_empty() {
341            return Self {
342                base: 0,
343                deltas: BitPackedInts::pack(&[]),
344            };
345        }
346
347        let base = values[0];
348        let delta_values: Vec<u64> = values
349            .windows(2)
350            .map(|w| w[1].saturating_sub(w[0]))
351            .collect();
352
353        let deltas = BitPackedInts::pack(&delta_values);
354
355        Self { base, deltas }
356    }
357
358    /// Decodes back to the original values.
359    #[must_use]
360    pub fn decode(&self) -> Vec<u64> {
361        if self.deltas.is_empty() && self.base == 0 {
362            return Vec::new();
363        }
364
365        let delta_values = self.deltas.unpack();
366        let mut result = Vec::with_capacity(delta_values.len() + 1);
367        let mut current = self.base;
368        result.push(current);
369
370        for delta in delta_values {
371            current = current.wrapping_add(delta);
372            result.push(current);
373        }
374
375        result
376    }
377
378    /// Returns the number of values.
379    #[must_use]
380    pub fn len(&self) -> usize {
381        if self.deltas.is_empty() && self.base == 0 {
382            0
383        } else {
384            self.deltas.len() + 1
385        }
386    }
387
388    /// Returns whether the encoding is empty.
389    #[must_use]
390    pub fn is_empty(&self) -> bool {
391        self.deltas.is_empty() && self.base == 0
392    }
393
394    /// Returns the base value.
395    #[must_use]
396    pub fn base(&self) -> u64 {
397        self.base
398    }
399
400    /// Returns the bits used per delta.
401    #[must_use]
402    pub fn bits_per_delta(&self) -> u8 {
403        self.deltas.bits_per_value()
404    }
405
406    /// Returns the compression ratio.
407    #[must_use]
408    pub fn compression_ratio(&self) -> f64 {
409        let count = self.len();
410        if count == 0 {
411            return 1.0;
412        }
413
414        let original_size = count * 8;
415        let packed_size = 8 + self.deltas.data().len() * 8; // base + packed deltas
416
417        original_size as f64 / packed_size as f64
418    }
419
420    /// Serializes to bytes.
421    ///
422    /// # Errors
423    ///
424    /// Returns `Err` if the delta count exceeds `u32::MAX`.
425    pub fn to_bytes(&self) -> io::Result<Vec<u8>> {
426        let delta_bytes = self.deltas.to_bytes()?;
427        let mut buf = Vec::with_capacity(8 + delta_bytes.len());
428        buf.extend_from_slice(&self.base.to_le_bytes());
429        buf.extend_from_slice(&delta_bytes);
430        Ok(buf)
431    }
432
433    /// Deserializes from bytes.
434    ///
435    /// # Errors
436    ///
437    /// Returns `Err` if the byte slice is too short or contains invalid data.
438    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
439        if bytes.len() < 8 {
440            return Err(io::Error::new(
441                io::ErrorKind::InvalidData,
442                "DeltaBitPacked too short",
443            ));
444        }
445
446        let base = u64::from_le_bytes(
447            bytes[0..8]
448                .try_into()
449                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
450        );
451        let deltas = BitPackedInts::from_bytes(&bytes[8..])?;
452
453        Ok(Self { base, deltas })
454    }
455}
456
457#[cfg(test)]
458mod tests {
459    use super::*;
460
461    #[test]
462    fn test_bitpack_basic() {
463        let values = vec![5u64, 2, 3, 5, 5, 8, 2];
464        let packed = BitPackedInts::pack(&values);
465        let unpacked = packed.unpack();
466        assert_eq!(values, unpacked);
467    }
468
469    #[test]
470    fn test_bitpack_empty() {
471        let values: Vec<u64> = vec![];
472        let packed = BitPackedInts::pack(&values);
473        assert!(packed.is_empty());
474        assert_eq!(packed.unpack(), values);
475    }
476
477    #[test]
478    fn test_bitpack_single() {
479        let values = vec![42u64];
480        let packed = BitPackedInts::pack(&values);
481        assert_eq!(packed.len(), 1);
482        assert_eq!(packed.unpack(), values);
483    }
484
485    #[test]
486    fn test_bitpack_all_zeros() {
487        let values = vec![0u64; 100];
488        let packed = BitPackedInts::pack(&values);
489        assert_eq!(packed.bits_per_value(), 1);
490        assert_eq!(packed.unpack(), values);
491    }
492
493    #[test]
494    fn test_bitpack_powers_of_two() {
495        for bits in 1..=64u8 {
496            let max_val = if bits == 64 {
497                u64::MAX
498            } else {
499                (1u64 << bits) - 1
500            };
501            let values = vec![0, max_val / 2, max_val];
502            let packed = BitPackedInts::pack(&values);
503            assert_eq!(packed.bits_per_value(), bits);
504            assert_eq!(packed.unpack(), values);
505        }
506    }
507
508    #[test]
509    fn test_bitpack_get() {
510        let values = vec![1u64, 2, 3, 4, 5];
511        let packed = BitPackedInts::pack(&values);
512
513        for (i, &expected) in values.iter().enumerate() {
514            assert_eq!(packed.get(i), Some(expected));
515        }
516        assert_eq!(packed.get(100), None);
517    }
518
519    #[test]
520    fn test_bitpack_compression() {
521        // 100 values all <= 15 (4 bits each)
522        let values: Vec<u64> = (0..100).map(|i| i % 16).collect();
523        let packed = BitPackedInts::pack(&values);
524        assert_eq!(packed.bits_per_value(), 4);
525        // 100 * 64 bits -> 100 * 4 bits = 16x compression
526        let ratio = packed.compression_ratio();
527        assert!(ratio > 10.0, "Expected ratio > 10, got {}", ratio);
528    }
529
530    #[test]
531    fn test_bitpack_serialization() {
532        let values = vec![1u64, 3, 7, 15, 31];
533        let packed = BitPackedInts::pack(&values);
534        let bytes = packed.to_bytes().unwrap();
535        let restored = BitPackedInts::from_bytes(&bytes).unwrap();
536        assert_eq!(packed.unpack(), restored.unpack());
537    }
538
539    #[test]
540    fn test_delta_bitpacked_basic() {
541        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
542        let encoded = DeltaBitPacked::encode(&values);
543        let decoded = encoded.decode();
544        assert_eq!(values, decoded);
545    }
546
547    #[test]
548    fn test_delta_bitpacked_sequential() {
549        // Sequential values: deltas are all 1, needs only 1 bit each
550        let values: Vec<u64> = (1000..1100).collect();
551        let encoded = DeltaBitPacked::encode(&values);
552        assert_eq!(encoded.bits_per_delta(), 1);
553        assert_eq!(encoded.decode(), values);
554
555        // Great compression: 100 * 64 bits -> 8 (base) + ~100 bits
556        let ratio = encoded.compression_ratio();
557        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
558    }
559
560    #[test]
561    fn test_delta_bitpacked_empty() {
562        let values: Vec<u64> = vec![];
563        let encoded = DeltaBitPacked::encode(&values);
564        assert!(encoded.is_empty());
565        assert_eq!(encoded.decode(), values);
566    }
567
568    #[test]
569    fn test_delta_bitpacked_single() {
570        let values = vec![42u64];
571        let encoded = DeltaBitPacked::encode(&values);
572        assert_eq!(encoded.len(), 1);
573        assert_eq!(encoded.decode(), values);
574    }
575
576    #[test]
577    fn test_delta_bitpacked_serialization() {
578        let values = vec![100u64, 105, 107, 110, 115];
579        let encoded = DeltaBitPacked::encode(&values);
580        let bytes = encoded.to_bytes().unwrap();
581        let restored = DeltaBitPacked::from_bytes(&bytes).unwrap();
582        assert_eq!(encoded.decode(), restored.decode());
583    }
584
585    #[test]
586    fn test_bits_needed() {
587        assert_eq!(BitPackedInts::bits_needed(0), 1);
588        assert_eq!(BitPackedInts::bits_needed(1), 1);
589        assert_eq!(BitPackedInts::bits_needed(2), 2);
590        assert_eq!(BitPackedInts::bits_needed(3), 2);
591        assert_eq!(BitPackedInts::bits_needed(4), 3);
592        assert_eq!(BitPackedInts::bits_needed(7), 3);
593        assert_eq!(BitPackedInts::bits_needed(8), 4);
594        assert_eq!(BitPackedInts::bits_needed(255), 8);
595        assert_eq!(BitPackedInts::bits_needed(256), 9);
596        assert_eq!(BitPackedInts::bits_needed(u64::MAX), 64);
597    }
598}