// bloom2/bitmap/compressed_bitmap.rs

use crate::Bitmap;

use super::{bitmask_for_key, index_for_key, vec::VecBitmap};

/// A sparse, 2-level bitmap with a low memory footprint, optimised for reads.
///
/// A `CompressedBitmap` splits the bitmap up into blocks of `usize` bits, and
/// uses a second bitmap to mark populated blocks, lazily allocating them as
/// required. This strategy represents a sparsely populated bitmap such as:
///
/// ```text
///    ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
///    │ 0 │ 0 │ 0 │ 0 │ 1 │ 0 │ 0 │ 1 │ 0 │ 0 │ 0 │ 0 │
///    └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
/// ```
///
/// As two bitmaps, here initialising only a single block of `usize` bits in
/// the second bitmap:
///
/// ```text
///                  ┌───┬───┬───┬───┐
///       Block map: │ 0 │ 1 │ 0 │ 0 │
///                  └───┴─┬─┴───┴───┘
///                        └──────┐
///     ┌ ─ ┬ ─ ┬ ─ ┬ ─ ┐ ┌───┬───▼───┬───┐ ┌ ─ ┬ ─ ┬ ─ ┬ ─ ┐
///       0   0   0   0   │ 1 │ 0 │ 0 │ 1 │   0   0   0   0
///     └ ─ ┴ ─ ┴ ─ ┴ ─ ┘ └───┴───┴───┴───┘ └ ─ ┴ ─ ┴ ─ ┴ ─ ┘
/// ```
///
/// The amortised `O(1)` insert operation takes ~4ns, while reading a value
/// takes a constant time ~1ns on a Core i7 @ 2.60GHz.
///
/// In practice inserting large numbers of values into a [`CompressedBitmap`]
/// can be slow - for higher write performance, use a [`VecBitmap`] and later
/// convert to a [`CompressedBitmap`] when possible.
///
/// ## Features
///
/// If the `serde` feature is enabled, a `CompressedBitmap` supports
/// (de)serialisation with [serde].
///
/// [serde]: https://github.com/serde-rs/serde
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CompressedBitmap {
    /// One bit per logical block: a 1 bit at position N indicates block N has
    /// been allocated in `bitmap`.
    ///
    /// LSB is 0.
    block_map: Vec<usize>,
    /// The lazily-allocated, non-empty blocks, stored contiguously in logical
    /// block order (only blocks marked in `block_map` are present).
    bitmap: Vec<usize>,

    // Upper key bound used by debug assertions only; not present in release
    // builds.
    #[cfg(debug_assertions)]
    max_key: usize,
}

impl CompressedBitmap {
55    /// Construct a `CompressedBitmap` for space to hold up to `max_key` number
56    /// of bits.
57    pub fn new(max_key: usize) -> Self {
58        // Calculate how many instances of usize (blocks) are needed to hold
59        // max_key number of bits.
60        let blocks = index_for_key(max_key);
61
62        // Figure out how many usize elements are needed to represent blocks
63        // number of bitmaps.
64        let num_blocks = match blocks % (u64::BITS as usize) {
65            0 => index_for_key(blocks),
66            _ => index_for_key(blocks) + 1, // +1 to cover the remainder
67        };
68
69        // Allocate a block map.
70        //
71        // The block map contains bitmaps with 1 bits indicating the bitmap for
72        // that key has been allocated.
73        let block_map = vec![0; num_blocks];
74
75        CompressedBitmap {
76            bitmap: Vec::new(),
77            block_map,
78
79            #[cfg(debug_assertions)]
80            max_key,
81        }
82    }
83
84    pub fn size(&self) -> usize {
85        (self.block_map.capacity() * std::mem::size_of::<usize>())
86            + (self.bitmap.capacity() * std::mem::size_of::<usize>())
87            + std::mem::size_of_val(self)
88    }
89
90    /// Reduces the allocated memory usage of the bitmap to the minimum required
91    /// for the current bitmap contents.
92    ///
93    /// This is useful to minimise the memory footprint of a populated,
94    /// read-only CompressedBitmap.
95    ///
96    /// See [`Vec::shrink_to_fit`](std::vec::Vec::shrink_to_fit).
97    pub fn shrink_to_fit(&mut self) {
98        self.bitmap.shrink_to_fit();
99        self.block_map.shrink_to_fit();
100        // TODO: remove 0 blocks
101    }
102
103    /// Resets the state of the bitmap.
104    ///
105    /// An efficient way to remove all elements in the bitmap to allow it to be
106    /// reused. Does not shrink the allocated backing memory, instead retaining
107    /// the capacity to avoid reallocations.
108    pub fn clear(&mut self) {
109        for block in self.block_map.iter_mut() {
110            *block = 0;
111        }
112        self.bitmap.truncate(0);
113    }
114
    /// Inserts `key` into the bitmap, setting its bit to `value`.
    ///
    /// Setting `value = false` clears the bit; clearing a bit in a block that
    /// was never allocated is a no-op.
    ///
    /// # Panics
    ///
    /// This method MAY panic if `key` is more than the `max_key` value provided
    /// when initialising the bitmap.
    ///
    /// If `debug_assertions` are enabled (such as in debug builds) inserting
    /// `key > max` will always panic. In release builds, this may not panic for
    /// values of `key` that are only slightly larger than `max_key` for
    /// performance reasons.
    pub fn set(&mut self, key: usize, value: bool) {
        #[cfg(debug_assertions)]
        debug_assert!(key <= self.max_key, "key {} > {} max", key, self.max_key);

        // First figure out which logical block (usize) would contain the bit
        // for key, if the bitmap were fully populated:
        //
        //     Bitmap:                      │
        //                          ┌ ─ ─ ─ ▼ ─ ─ ─ ─ ┐
        //       ┌───┬───┬───┬───┐  ┌───┬───┬───┬───┐  ┌───┬───┬───┬───┐
        //       │ 0 │ 0 │ 0 │ 0 │  │ 0 │ 0 │ 0 │ 0 │  │ 0 │ 0 │ 0 │ 0 │
        //       └───┴───┴───┴───┘  └───┴───┴───┴───┘  └───┴───┴───┴───┘
        //            Block 0            Block 1            Block 2
        //
        let block_index = index_for_key(key);

        // Because the blocks are initialised lazily to provide the sparse
        // bitmap behaviour, there may be no block yet allocated for this key.
        // The block_map data structure is itself a bitmap, with a 1 bit
        // indicating the corresponding block has been allocated.
        //
        // Check which usize in the block_map contains the bit representing
        // the block:
        //
        //            Block Map:
        //
        //                      ┌───┬───┬───┬───┐
        //                   0: │ 0 │ 1 │ 1 │ 0 │
        //                      └───┴───┴───┴───┘
        //
        //                      ┌───┬───┬───┬───┐
        //                   1: │ 1 │ 0 │ 1 │ 0 │
        //                      └─▲─┴───┴───┴───┘
        //     block_index ━━━━━━━┛
        //                      ┌───┬───┬───┬───┐
        //                   2: │ 0 │ 0 │ 1 │ 1 │
        //                      └───┴───┴───┴───┘
        //
        let block_map_index = index_for_key(block_index);
        let block_map_bitmask = bitmask_for_key(block_index);

        // The block has been allocated if the block usize contains a 1 bit.
        //
        // Because blocks are lazily initialised, block n may not be at
        // bitmap[n] if prior blocks have not been initialised. To calculate
        // the physical offset of block n, count the number of 1's in the
        // block_map before bit n. This operation is very fast on modern
        // hardware thanks to the POPCNT instruction.
        //
        //            Block Map:
        //
        //                          ┌───┬───┐
        //                        0 │ 1 │ 1 │ 0
        //                          └─△─┴─△─┘
        //                            └───┼────────── popcount()
        //                      ┏━━━┓   ┌─▽─┐
        //                      ┃ 1 ┃ 0 │ 1 │ 0
        //                      ┗━▲━┛   └───┘
        //     block_index ━━━━━━━┛
        //
        //
        // In the above example, the popcount() is 3, and the block is the
        // 3+1=4th block in bitmap. However as the arrays are zero-indexed,
        // the +1 is omitted to adjust from the position 4, to index 3.

        // Count the ones in the full block_map words before block_map_index.
        //
        // This could chain() the final masked count_ones() call below using
        // once_with, and while more readable, it is unfortunately measurably
        // slower in practice.
        let offset: usize = (0..block_map_index)
            .map(|i| self.block_map[i].count_ones() as usize)
            .sum();

        // Mask out the higher bits in the block map word to count only the
        // populated blocks before block_index.
        let mask = block_map_bitmask - 1;
        let offset = offset + (self.block_map[block_map_index] & mask).count_ones() as usize;

        // Offset now contains the index in bitmap at which block_index can
        // be found (or should be placed, if not yet allocated).
        //
        // Read the usize at block_map_index, and check the bit for
        // block_index to see if the block exists.
        if self.block_map[block_map_index] & block_map_bitmask == 0 {
            // The block is unallocated, so every bit in it is implicitly 0 -
            // if the value to be set is false, there's nothing to do.
            if !value {
                return;
            }

            // The block does not exist; insert a new block containing only
            // the bit for key at physical index offset.
            if offset >= self.bitmap.len() {
                self.bitmap.push(bitmask_for_key(key));
            } else {
                // If offset is < bitmap.len() this will require moving all
                // the elements at offset+1 one slot to the right to make
                // room for the new element.
                //
                // For bitmaps with large numbers of elements to the right
                // of offset, this can become expensive.
                self.bitmap.insert(offset, bitmask_for_key(key));
            }
            // Mark the block as allocated in the block map.
            self.block_map[block_map_index] |= block_map_bitmask;
            return;
        }

        // Otherwise the block map indicates the block is already allocated:
        // set or clear the bit for key in place.
        if value {
            self.bitmap[offset] |= bitmask_for_key(key);
        } else {
            self.bitmap[offset] &= !bitmask_for_key(key);
        }
    }

255    /// Returns the value at `key`.
256    ///
257    /// If a value for `key` was not previously set, `false` is returned.
258    ///
259    /// # Panics
260    ///
261    /// This method MAY panic if `key` is more than the `max_key` value provided
262    /// when initialising the bitmap.
263    pub fn get(&self, key: usize) -> bool {
264        let block_index = index_for_key(key);
265        let block_map_index = index_for_key(block_index);
266        let block_map_bitmask = bitmask_for_key(block_index);
267
268        if self.block_map[block_map_index] & block_map_bitmask == 0 {
269            return false;
270        }
271
272        let offset: usize = (0..block_map_index)
273            .map(|i| self.block_map[i].count_ones() as usize)
274            .sum();
275
276        let mask = block_map_bitmask - 1;
277        let offset: usize = offset + (self.block_map[block_map_index] & mask).count_ones() as usize;
278
279        self.bitmap[offset] & bitmask_for_key(key) != 0
280    }
281
282    /// Perform a bitwise OR against `self` and `other`, returning the
283    /// resulting merged [`CompressedBitmap`].
284    ///
285    /// # Panics
286    ///
287    /// This method panics if `other` was not configured with the same
288    /// `max_key`.
289    pub fn or(&self, other: &Self) -> Self {
290        #[cfg(debug_assertions)]
291        debug_assert_eq!(self.max_key, other.max_key);
292
293        // Invariant: the block maps are of equal length, meaning the zipped
294        // iters yield both sides to completion.
295        assert_eq!(self.block_map.len(), other.block_map.len());
296
297        let left = BlockMapIter::new(self);
298        let right = BlockMapIter::new(other);
299
300        // Construct the physical set of compressed bitmap blocks.
301        //
302        // By iterating over the non-empty logical blocks and OR-ing them
303        // together (or picking one if only one is non-empty) the merged output
304        // of both compressed bitmaps is computed (itself compressed).
305        let bitmap = left
306            .zip(right)
307            .filter_map(|(l, r)| {
308                Some(match (l, r) {
309                    (None, None) => return None,
310                    (None, Some(r)) => other.bitmap[r],
311                    (Some(l), None) => self.bitmap[l],
312                    (Some(l), Some(r)) => self.bitmap[l] | other.bitmap[r],
313                })
314            })
315            .collect::<Vec<_>>();
316
317        // Then merge the two bitmap blocks, the OR of which is guaranteed to
318        // contain exactly N set bits for the N blocks in "physical".
319        let block_map = self
320            .block_map
321            .iter()
322            .zip(&other.block_map)
323            .map(|(l, r)| l | r)
324            .collect::<Vec<_>>();
325
326        // Invariant: The number of set bits in the block map must match the
327        // number of blocks in the bitmap.
328        debug_assert_eq!(
329            block_map.iter().map(|v| v.count_ones()).sum::<u32>() as usize,
330            bitmap.len()
331        );
332
333        Self {
334            block_map,
335            bitmap,
336
337            #[cfg(debug_assertions)]
338            max_key: self.max_key,
339        }
340    }
}

/// Yields the 0-indexed physical indexes into the sparse bitmap for non-empty
/// blocks.
///
/// If for the Nth call to `next()` the Nth sparse bitmap block is elided,
/// [`None`] is returned. If the Nth bitmap block is non-empty, the physical
/// index into the compressed vec is yielded.
#[derive(Debug)]
struct BlockMapIter<'a> {
    /// The bitmap whose block map is being walked.
    bitmap: &'a CompressedBitmap,

    /// The index into bitmap.block_map to be processed next (0 -> N).
    block_idx: usize,
    /// The bit in the block to be evaluated next (LSB -> MSB).
    block_bit: u8,
    /// The physical index to be yielded next.
    physical_idx: usize,
}

361impl<'a> BlockMapIter<'a> {
362    /// Construct a new [`BlockMapIter`] that yields indexes into the physical
363    /// bitmap blocks in `bitmap`.
364    fn new(bitmap: &'a CompressedBitmap) -> Self {
365        Self {
366            bitmap,
367            block_idx: 0,
368            block_bit: 0,
369            physical_idx: 0,
370        }
371    }
372}
373
374impl Iterator for BlockMapIter<'_> {
375    type Item = Option<usize>;
376
377    fn next(&mut self) -> Option<Self::Item> {
378        let block = self.bitmap.block_map.get(self.block_idx)?;
379
380        let v = if (block & (1 << self.block_bit)) > 0 {
381            // This logical block is non-empty.
382
383            // Read the physical index for the nth logical block.
384            let idx = self.physical_idx;
385
386            // Increment for the next physical block.
387            self.physical_idx += 1;
388
389            Some(idx)
390        } else {
391            // This logical block is empty.
392            None
393        };
394
395        // Advance the bit within the block to evaluate next.
396        self.block_bit += 1;
397
398        // Advance the block index (and wrap the bit index) if the last
399        // inspected bit was the last bit in the block.
400        if self.block_bit == usize::BITS as u8 {
401            self.block_bit = 0;
402            self.block_idx += 1;
403        }
404
405        Some(v)
406    }
407}
408
409impl Bitmap for CompressedBitmap {
410    fn get(&self, key: usize) -> bool {
411        self.get(key)
412    }
413
414    fn set(&mut self, key: usize, value: bool) {
415        self.set(key, value)
416    }
417
418    fn byte_size(&self) -> usize {
419        self.size()
420    }
421
422    fn or(&self, other: &Self) -> Self {
423        self.or(other)
424    }
425
426    fn new_with_capacity(max_key: usize) -> Self {
427        Self::new(max_key)
428    }
429}
430
431impl From<VecBitmap> for CompressedBitmap {
432    fn from(bitmap: VecBitmap) -> Self {
433        let (bitmap, max_key) = bitmap.into_parts();
434
435        // Calculate how many instances of usize (blocks) are needed to hold
436        // max_key number of bits.
437        let num_blocks = index_for_key(max_key);
438
439        // Figure out how many usize elements are needed to represent blocks
440        // number of bitmaps.
441        let num_blocks = match num_blocks % (u64::BITS as usize) {
442            0 => index_for_key(num_blocks),
443            _ => index_for_key(num_blocks) + 1, // +1 to cover the remainder
444        };
445
446        // Then shrink the bitmap into a 2-level compressed bitmap, dropping runs of
447        // 0 bits in the raw bitmap.
448        let mut block_map = vec![0; num_blocks];
449        let mut compressed = Vec::default();
450        for (idx, block) in bitmap.into_iter().enumerate() {
451            // If this block contains no set bits, it is elided from the compressed
452            // representation.
453            if block == 0 {
454                continue;
455            }
456
457            // This block contains data.
458            //
459            // Add the block to the compressed representation and mark it in the
460            // block map.
461            compressed.push(block);
462            block_map[index_for_key(idx)] |= bitmask_for_key(idx);
463        }
464
465        CompressedBitmap {
466            block_map,
467            bitmap: compressed,
468
469            #[cfg(debug_assertions)]
470            max_key,
471        }
472    }
473}
474
// TODO(dom:test): proptest conversion

#[cfg(test)]
mod tests {
    use proptest::prelude::*;
    use quickcheck_macros::quickcheck;

    use super::*;

    /// Asserts `$bitmap` contains exactly the elements listed after the `;`
    /// (and no others) within the key range `0..$max`.
    macro_rules! contains_only_truthy {
        ($bitmap:ident, $max:expr; $(
            $element:expr
        ),*) => {
            let want = vec![$($element,)*];
            for i in 0..$max {
                assert!($bitmap.get(i) == want.contains(&i), "unexpected value {}", i);
            }
        };
    }

    #[test]
    fn test_set_contains() {
        let mut bitmap = CompressedBitmap::new(100);
        bitmap.set(100, true);
        bitmap.set(0, true);
        bitmap.set(42, true);

        contains_only_truthy!(bitmap, 100; 100, 0, 42);

        assert!(bitmap.get(100));
        assert!(bitmap.get(0));
        assert!(bitmap.get(42));
    }

    #[test]
    fn test_clear() {
        let mut bitmap = CompressedBitmap::new(100);
        bitmap.set(100, true);
        bitmap.set(0, true);
        bitmap.set(42, true);

        contains_only_truthy!(bitmap, 100; 100, 0, 42);

        // Clearing must remove every previously-set key.
        bitmap.clear();
        contains_only_truthy!(bitmap, 100;);
    }

    #[test]
    fn test_set_true_false() {
        let mut bitmap = CompressedBitmap::new(100);

        // A set bit reads back as true, and can be unset again.
        bitmap.set(42, true);
        assert!(bitmap.get(42));
        bitmap.set(42, false);
        assert!(!bitmap.get(42));
    }

    #[test]
    fn test_block_map_iter() {
        let mut bitmap = CompressedBitmap::new(i16::MAX as _);
        bitmap.set(1, true); // Block 0
        bitmap.set(usize::BITS as usize * 4, true); // Block 4
        bitmap.set(usize::BITS as usize * 64, true); // Block 64
        bitmap.set(usize::BITS as usize * 65, true); // Block 65
        bitmap.set(usize::BITS as usize * 128, true); // Block 128

        let mut iter = BlockMapIter::new(&bitmap).enumerate();

        // Blocks 0 and 4 are populated (physical indexes 0 and 1); logical
        // blocks 1-3 are all zero and elided.
        assert_eq!(iter.next().unwrap(), (0, Some(0)));
        assert_eq!(iter.next().unwrap(), (1, None));
        assert_eq!(iter.next().unwrap(), (2, None));
        assert_eq!(iter.next().unwrap(), (3, None));
        assert_eq!(iter.next().unwrap(), (4, Some(1)));

        // Filter out all the None entries, preserving the enumerated idx, so
        // the iterator yields (logical block, physical block) pairs.
        let mut iter = iter.filter_map(|(idx, block)| block.map(|v| (idx, v)));

        // The remaining non-empty blocks and their physical indexes.
        assert_eq!(iter.next().unwrap(), (64, 2));
        assert_eq!(iter.next().unwrap(), (65, 3));
        assert_eq!(iter.next().unwrap(), (128, 4));

        // And the iterator must then terminate.
        assert!(iter.next().is_none());
    }

    #[quickcheck]
    #[should_panic]
    fn test_panic_exceeds_max(max: u16) {
        let max = max as usize;
        let mut b = CompressedBitmap::new(max);
        b.set(max + 1, true);
    }

    #[quickcheck]
    fn test_set_contains_prop(mut vals: Vec<u16>) {
        vals.truncate(10);

        let mut b = CompressedBitmap::new(u16::MAX.into());
        for v in &vals {
            b.set(*v as usize, true);
        }

        // Every key in the u16 space must read back as set iff it was
        // inserted above.
        for i in 0..u16::MAX {
            assert!(
                b.get(i as usize) == vals.contains(&i),
                "unexpected value {}",
                i
            );
        }
    }

    #[quickcheck]
    fn test_or(mut a: Vec<u16>, mut b: Vec<u16>) {
        a.truncate(10);
        let mut bitmap_a = CompressedBitmap::new(u16::MAX.into());
        for v in &a {
            bitmap_a.set(*v as usize, true);
        }

        b.truncate(10);
        let mut bitmap_b = CompressedBitmap::new(u16::MAX.into());
        for v in &b {
            bitmap_b.set(*v as usize, true);
        }

        let merged = bitmap_a.or(&bitmap_b);

        // The merged bitmap must contain exactly the union of both inputs.
        for i in 0..u16::MAX {
            let want_hit = a.contains(&i) || b.contains(&i);
            assert!(
                merged.get(i as usize) == want_hit,
                "unexpected value {} want={:?}",
                i,
                want_hit
            );
        }
    }

    #[cfg(feature = "serde")]
    #[test]
    fn serde() {
        let mut b = CompressedBitmap::new(100);
        b.set(1, true);
        b.set(2, false);
        b.set(3, true);
        contains_only_truthy!(b, 100; 1, 3);

        // A serialise / deserialise round-trip must preserve the contents.
        let encoded = serde_json::to_string(&b).unwrap();
        let decoded: CompressedBitmap = serde_json::from_str(&encoded).unwrap();
        contains_only_truthy!(decoded, 100; 1, 3);
    }

    const MAX_KEY: usize = 1028;

    proptest! {
        // Converting a VecBitmap into a CompressedBitmap must preserve every
        // key in the test range.
        #[test]
        fn prop_compress(
            values in prop::collection::hash_set(0..MAX_KEY, 0..20),
        ) {
            let mut b = VecBitmap::new_with_capacity(MAX_KEY);

            for v in &values {
                b.set(*v, true);
            }

            // Compress
            let b = CompressedBitmap::from(b);

            // Ensure all values are equal in the test range.
            for i in 0..MAX_KEY {
                assert_eq!(b.get(i), values.contains(&i));
            }
        }
    }
}