Skip to main content

grafeo_core/storage/
bitpack.rs

1//! Bit-packing for small integers.
2//!
3//! Bit-packing stores integers using only the minimum number of bits required
4//! to represent the largest value. This is especially effective when combined
5//! with delta encoding, where deltas are typically small.
6//!
7//! # Example
8//!
9//! ```ignore
10//! // Values: [5, 2, 3, 5, 5, 8, 2] - max is 8, needs 4 bits
11//! // Normal: 7 * 64 = 448 bits
12//! // Packed: 7 * 4 = 28 bits (16x compression)
13//!
14//! let values = vec![5u64, 2, 3, 5, 5, 8, 2];
15//! let packed = BitPackedInts::pack(&values);
16//! let unpacked = packed.unpack();
17//! assert_eq!(values, unpacked);
18//! ```
19
20use std::io;
21
22/// Bit-packed integer array.
23///
24/// Stores integers using a fixed number of bits per value, determined by the
25/// maximum value in the array.
26#[derive(Debug, Clone)]
27pub struct BitPackedInts {
28    /// Packed data.
29    data: Vec<u64>,
30    /// Number of bits per value.
31    bits_per_value: u8,
32    /// Number of values.
33    count: usize,
34}
35
36impl BitPackedInts {
37    /// Packs a slice of u64 values using the minimum bits needed.
38    #[must_use]
39    pub fn pack(values: &[u64]) -> Self {
40        if values.is_empty() {
41            return Self {
42                data: Vec::new(),
43                bits_per_value: 0,
44                count: 0,
45            };
46        }
47
48        let max_value = values.iter().copied().max().unwrap_or(0);
49        let bits = Self::bits_needed(max_value);
50        Self::pack_with_bits(values, bits)
51    }
52
53    /// Packs values using a specified bit width.
54    ///
55    /// # Panics
56    ///
57    /// Panics if any value doesn't fit in the specified bit width.
58    #[must_use]
59    pub fn pack_with_bits(values: &[u64], bits_per_value: u8) -> Self {
60        if values.is_empty() {
61            return Self {
62                data: Vec::new(),
63                bits_per_value,
64                count: 0,
65            };
66        }
67
68        if bits_per_value == 0 {
69            // All values must be 0
70            debug_assert!(values.iter().all(|&v| v == 0));
71            return Self {
72                data: Vec::new(),
73                bits_per_value: 0,
74                count: values.len(),
75            };
76        }
77
78        let bits = bits_per_value as usize;
79        let values_per_word = 64 / bits;
80        let num_words = (values.len() + values_per_word - 1) / values_per_word;
81
82        let mut data = vec![0u64; num_words];
83        let mask = if bits >= 64 {
84            u64::MAX
85        } else {
86            (1u64 << bits) - 1
87        };
88
89        for (i, &value) in values.iter().enumerate() {
90            debug_assert!(
91                value <= mask,
92                "Value {} doesn't fit in {} bits",
93                value,
94                bits_per_value
95            );
96
97            let word_idx = i / values_per_word;
98            let bit_offset = (i % values_per_word) * bits;
99            data[word_idx] |= (value & mask) << bit_offset;
100        }
101
102        Self {
103            data,
104            bits_per_value,
105            count: values.len(),
106        }
107    }
108
109    /// Unpacks all values back to u64.
110    #[must_use]
111    pub fn unpack(&self) -> Vec<u64> {
112        if self.count == 0 {
113            return Vec::new();
114        }
115
116        if self.bits_per_value == 0 {
117            return vec![0u64; self.count];
118        }
119
120        let bits = self.bits_per_value as usize;
121        let values_per_word = 64 / bits;
122        let mask = if bits >= 64 {
123            u64::MAX
124        } else {
125            (1u64 << bits) - 1
126        };
127
128        let mut result = Vec::with_capacity(self.count);
129
130        for i in 0..self.count {
131            let word_idx = i / values_per_word;
132            let bit_offset = (i % values_per_word) * bits;
133            let value = (self.data[word_idx] >> bit_offset) & mask;
134            result.push(value);
135        }
136
137        result
138    }
139
140    /// Gets a single value at the given index.
141    #[must_use]
142    pub fn get(&self, index: usize) -> Option<u64> {
143        if index >= self.count {
144            return None;
145        }
146
147        if self.bits_per_value == 0 {
148            return Some(0);
149        }
150
151        let bits = self.bits_per_value as usize;
152        let values_per_word = 64 / bits;
153        let word_idx = index / values_per_word;
154        let bit_offset = (index % values_per_word) * bits;
155        let mask = if bits >= 64 {
156            u64::MAX
157        } else {
158            (1u64 << bits) - 1
159        };
160
161        Some((self.data[word_idx] >> bit_offset) & mask)
162    }
163
164    /// Returns the number of values.
165    #[must_use]
166    pub fn len(&self) -> usize {
167        self.count
168    }
169
170    /// Returns whether the encoding is empty.
171    #[must_use]
172    pub fn is_empty(&self) -> bool {
173        self.count == 0
174    }
175
176    /// Returns the number of bits per value.
177    #[must_use]
178    pub fn bits_per_value(&self) -> u8 {
179        self.bits_per_value
180    }
181
182    /// Returns the raw packed data.
183    #[must_use]
184    pub fn data(&self) -> &[u64] {
185        &self.data
186    }
187
188    /// Returns the compression ratio compared to storing full u64s.
189    #[must_use]
190    pub fn compression_ratio(&self) -> f64 {
191        if self.count == 0 {
192            return 1.0;
193        }
194
195        let original_size = self.count * 8; // 8 bytes per u64
196        let packed_size = self.data.len() * 8;
197
198        if packed_size == 0 {
199            return f64::INFINITY; // All zeros, perfect compression
200        }
201
202        original_size as f64 / packed_size as f64
203    }
204
205    /// Returns the number of bits needed to represent a value.
206    #[must_use]
207    pub fn bits_needed(value: u64) -> u8 {
208        if value == 0 {
209            1 // Need at least 1 bit to represent 0
210        } else {
211            64 - value.leading_zeros() as u8
212        }
213    }
214
215    /// Serializes to bytes.
216    pub fn to_bytes(&self) -> Vec<u8> {
217        let mut buf = Vec::with_capacity(1 + 4 + self.data.len() * 8);
218        buf.push(self.bits_per_value);
219        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
220        for &word in &self.data {
221            buf.extend_from_slice(&word.to_le_bytes());
222        }
223        buf
224    }
225
226    /// Deserializes from bytes.
227    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
228        if bytes.len() < 5 {
229            return Err(io::Error::new(
230                io::ErrorKind::InvalidData,
231                "BitPackedInts too short",
232            ));
233        }
234
235        let bits_per_value = bytes[0];
236        let count = u32::from_le_bytes(bytes[1..5].try_into().unwrap()) as usize;
237
238        let num_words = if bits_per_value == 0 || count == 0 {
239            0
240        } else {
241            let values_per_word = 64 / bits_per_value as usize;
242            (count + values_per_word - 1) / values_per_word
243        };
244
245        if bytes.len() < 5 + num_words * 8 {
246            return Err(io::Error::new(
247                io::ErrorKind::InvalidData,
248                "BitPackedInts truncated",
249            ));
250        }
251
252        let mut data = Vec::with_capacity(num_words);
253        for i in 0..num_words {
254            let offset = 5 + i * 8;
255            let word = u64::from_le_bytes(bytes[offset..offset + 8].try_into().unwrap());
256            data.push(word);
257        }
258
259        Ok(Self {
260            data,
261            bits_per_value,
262            count,
263        })
264    }
265}
266
267/// Delta + bit-packed encoding for sorted integers.
268///
269/// Combines delta encoding with bit-packing for optimal compression of
270/// sorted integer sequences.
271#[derive(Debug, Clone)]
272pub struct DeltaBitPacked {
273    /// Base value (first value in sequence).
274    base: u64,
275    /// Bit-packed deltas.
276    deltas: BitPackedInts,
277}
278
279impl DeltaBitPacked {
280    /// Encodes sorted values using delta + bit-packing.
281    #[must_use]
282    pub fn encode(values: &[u64]) -> Self {
283        if values.is_empty() {
284            return Self {
285                base: 0,
286                deltas: BitPackedInts::pack(&[]),
287            };
288        }
289
290        let base = values[0];
291        let delta_values: Vec<u64> = values
292            .windows(2)
293            .map(|w| w[1].saturating_sub(w[0]))
294            .collect();
295
296        let deltas = BitPackedInts::pack(&delta_values);
297
298        Self { base, deltas }
299    }
300
301    /// Decodes back to the original values.
302    #[must_use]
303    pub fn decode(&self) -> Vec<u64> {
304        if self.deltas.is_empty() && self.base == 0 {
305            return Vec::new();
306        }
307
308        let delta_values = self.deltas.unpack();
309        let mut result = Vec::with_capacity(delta_values.len() + 1);
310        let mut current = self.base;
311        result.push(current);
312
313        for delta in delta_values {
314            current = current.wrapping_add(delta);
315            result.push(current);
316        }
317
318        result
319    }
320
321    /// Returns the number of values.
322    #[must_use]
323    pub fn len(&self) -> usize {
324        if self.deltas.is_empty() && self.base == 0 {
325            0
326        } else {
327            self.deltas.len() + 1
328        }
329    }
330
331    /// Returns whether the encoding is empty.
332    #[must_use]
333    pub fn is_empty(&self) -> bool {
334        self.deltas.is_empty() && self.base == 0
335    }
336
337    /// Returns the base value.
338    #[must_use]
339    pub fn base(&self) -> u64 {
340        self.base
341    }
342
343    /// Returns the bits used per delta.
344    #[must_use]
345    pub fn bits_per_delta(&self) -> u8 {
346        self.deltas.bits_per_value()
347    }
348
349    /// Returns the compression ratio.
350    #[must_use]
351    pub fn compression_ratio(&self) -> f64 {
352        let count = self.len();
353        if count == 0 {
354            return 1.0;
355        }
356
357        let original_size = count * 8;
358        let packed_size = 8 + self.deltas.data().len() * 8; // base + packed deltas
359
360        original_size as f64 / packed_size as f64
361    }
362
363    /// Serializes to bytes.
364    pub fn to_bytes(&self) -> Vec<u8> {
365        let delta_bytes = self.deltas.to_bytes();
366        let mut buf = Vec::with_capacity(8 + delta_bytes.len());
367        buf.extend_from_slice(&self.base.to_le_bytes());
368        buf.extend_from_slice(&delta_bytes);
369        buf
370    }
371
372    /// Deserializes from bytes.
373    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
374        if bytes.len() < 8 {
375            return Err(io::Error::new(
376                io::ErrorKind::InvalidData,
377                "DeltaBitPacked too short",
378            ));
379        }
380
381        let base = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
382        let deltas = BitPackedInts::from_bytes(&bytes[8..])?;
383
384        Ok(Self { base, deltas })
385    }
386}
387
388#[cfg(test)]
389mod tests {
390    use super::*;
391
392    #[test]
393    fn test_bitpack_basic() {
394        let values = vec![5u64, 2, 3, 5, 5, 8, 2];
395        let packed = BitPackedInts::pack(&values);
396        let unpacked = packed.unpack();
397        assert_eq!(values, unpacked);
398    }
399
400    #[test]
401    fn test_bitpack_empty() {
402        let values: Vec<u64> = vec![];
403        let packed = BitPackedInts::pack(&values);
404        assert!(packed.is_empty());
405        assert_eq!(packed.unpack(), values);
406    }
407
408    #[test]
409    fn test_bitpack_single() {
410        let values = vec![42u64];
411        let packed = BitPackedInts::pack(&values);
412        assert_eq!(packed.len(), 1);
413        assert_eq!(packed.unpack(), values);
414    }
415
416    #[test]
417    fn test_bitpack_all_zeros() {
418        let values = vec![0u64; 100];
419        let packed = BitPackedInts::pack(&values);
420        assert_eq!(packed.bits_per_value(), 1);
421        assert_eq!(packed.unpack(), values);
422    }
423
424    #[test]
425    fn test_bitpack_powers_of_two() {
426        for bits in 1..=64u8 {
427            let max_val = if bits == 64 {
428                u64::MAX
429            } else {
430                (1u64 << bits) - 1
431            };
432            let values = vec![0, max_val / 2, max_val];
433            let packed = BitPackedInts::pack(&values);
434            assert_eq!(packed.bits_per_value(), bits);
435            assert_eq!(packed.unpack(), values);
436        }
437    }
438
439    #[test]
440    fn test_bitpack_get() {
441        let values = vec![1u64, 2, 3, 4, 5];
442        let packed = BitPackedInts::pack(&values);
443
444        for (i, &expected) in values.iter().enumerate() {
445            assert_eq!(packed.get(i), Some(expected));
446        }
447        assert_eq!(packed.get(100), None);
448    }
449
450    #[test]
451    fn test_bitpack_compression() {
452        // 100 values all <= 15 (4 bits each)
453        let values: Vec<u64> = (0..100).map(|i| i % 16).collect();
454        let packed = BitPackedInts::pack(&values);
455        assert_eq!(packed.bits_per_value(), 4);
456        // 100 * 64 bits -> 100 * 4 bits = 16x compression
457        let ratio = packed.compression_ratio();
458        assert!(ratio > 10.0, "Expected ratio > 10, got {}", ratio);
459    }
460
461    #[test]
462    fn test_bitpack_serialization() {
463        let values = vec![1u64, 3, 7, 15, 31];
464        let packed = BitPackedInts::pack(&values);
465        let bytes = packed.to_bytes();
466        let restored = BitPackedInts::from_bytes(&bytes).unwrap();
467        assert_eq!(packed.unpack(), restored.unpack());
468    }
469
470    #[test]
471    fn test_delta_bitpacked_basic() {
472        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
473        let encoded = DeltaBitPacked::encode(&values);
474        let decoded = encoded.decode();
475        assert_eq!(values, decoded);
476    }
477
478    #[test]
479    fn test_delta_bitpacked_sequential() {
480        // Sequential values: deltas are all 1, needs only 1 bit each
481        let values: Vec<u64> = (1000..1100).collect();
482        let encoded = DeltaBitPacked::encode(&values);
483        assert_eq!(encoded.bits_per_delta(), 1);
484        assert_eq!(encoded.decode(), values);
485
486        // Great compression: 100 * 64 bits -> 8 (base) + ~100 bits
487        let ratio = encoded.compression_ratio();
488        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
489    }
490
491    #[test]
492    fn test_delta_bitpacked_empty() {
493        let values: Vec<u64> = vec![];
494        let encoded = DeltaBitPacked::encode(&values);
495        assert!(encoded.is_empty());
496        assert_eq!(encoded.decode(), values);
497    }
498
499    #[test]
500    fn test_delta_bitpacked_single() {
501        let values = vec![42u64];
502        let encoded = DeltaBitPacked::encode(&values);
503        assert_eq!(encoded.len(), 1);
504        assert_eq!(encoded.decode(), values);
505    }
506
507    #[test]
508    fn test_delta_bitpacked_serialization() {
509        let values = vec![100u64, 105, 107, 110, 115];
510        let encoded = DeltaBitPacked::encode(&values);
511        let bytes = encoded.to_bytes();
512        let restored = DeltaBitPacked::from_bytes(&bytes).unwrap();
513        assert_eq!(encoded.decode(), restored.decode());
514    }
515
516    #[test]
517    fn test_bits_needed() {
518        assert_eq!(BitPackedInts::bits_needed(0), 1);
519        assert_eq!(BitPackedInts::bits_needed(1), 1);
520        assert_eq!(BitPackedInts::bits_needed(2), 2);
521        assert_eq!(BitPackedInts::bits_needed(3), 2);
522        assert_eq!(BitPackedInts::bits_needed(4), 3);
523        assert_eq!(BitPackedInts::bits_needed(7), 3);
524        assert_eq!(BitPackedInts::bits_needed(8), 4);
525        assert_eq!(BitPackedInts::bits_needed(255), 8);
526        assert_eq!(BitPackedInts::bits_needed(256), 9);
527        assert_eq!(BitPackedInts::bits_needed(u64::MAX), 64);
528    }
529}