// bloom2/bitmap/compressed_bitmap.rs
1use crate::Bitmap;
2
3use super::{bitmask_for_key, index_for_key, vec::VecBitmap};
4
/// A sparse, 2-level bitmap with a low memory footprint, optimised for reads.
///
/// A `CompressedBitmap` splits the bitmap up into blocks of `usize` bits, and
/// uses a second bitmap to mark populated blocks, lazily allocating them as
/// required. This strategy represents a sparsely populated bitmap such as:
///
/// ```text
///  ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
///  │ 0 │ 0 │ 0 │ 0 │ 1 │ 0 │ 0 │ 1 │ 0 │ 0 │ 0 │ 0 │
///  └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
/// ```
///
/// As two bitmaps, here initialising only a single block of `usize` bits in
/// the second bitmap:
///
/// ```text
///                 ┌───┬───┬───┬───┐
///     Block map:  │ 0 │ 1 │ 0 │ 0 │
///                 └───┴─┬─┴───┴───┘
///                       └──────┐
///  ┌ ─ ┬ ─ ┬ ─ ┬ ─ ┐  ┌───┬───▼───┬───┐  ┌ ─ ┬ ─ ┬ ─ ┬ ─ ┐
///    0   0   0   0    │ 1 │ 0 │ 0 │ 1 │    0   0   0   0
///  └ ─ ┴ ─ ┴ ─ ┴ ─ ┘  └───┴───┴───┴───┘  └ ─ ┴ ─ ┴ ─ ┴ ─ ┘
/// ```
///
/// This amortised `O(1)` insert operation takes ~4ns, while reading a value
/// takes a constant time ~1ns on a Core i7 @ 2.60GHz.
///
/// In practice inserting large numbers of values into a [`CompressedBitmap`]
/// can be slow - for higher write performance, use a [`VecBitmap`] and later
/// convert to a [`CompressedBitmap`] when possible.
///
/// ## Features
///
/// If the `serde` feature is enabled, a `CompressedBitmap` supports
/// (de)serialisation with [serde].
///
/// [serde]: https://github.com/serde-rs/serde
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CompressedBitmap {
    /// A bitmap with one presence bit per logical block of `bitmap`: a 1 bit
    /// means the corresponding block has been allocated. LSB is block 0.
    block_map: Vec<usize>,
    /// Only the allocated blocks, stored contiguously in ascending logical
    /// block order (elided blocks are implicitly all-zero).
    bitmap: Vec<usize>,

    /// The maximum key this bitmap was sized for; retained in debug builds
    /// only, to assert callers stay within bounds.
    #[cfg(debug_assertions)]
    max_key: usize,
}
53
54impl CompressedBitmap {
55 /// Construct a `CompressedBitmap` for space to hold up to `max_key` number
56 /// of bits.
57 pub fn new(max_key: usize) -> Self {
58 // Calculate how many instances of usize (blocks) are needed to hold
59 // max_key number of bits.
60 let blocks = index_for_key(max_key);
61
62 // Figure out how many usize elements are needed to represent blocks
63 // number of bitmaps.
64 let num_blocks = match blocks % (u64::BITS as usize) {
65 0 => index_for_key(blocks),
66 _ => index_for_key(blocks) + 1, // +1 to cover the remainder
67 };
68
69 // Allocate a block map.
70 //
71 // The block map contains bitmaps with 1 bits indicating the bitmap for
72 // that key has been allocated.
73 let block_map = vec![0; num_blocks];
74
75 CompressedBitmap {
76 bitmap: Vec::new(),
77 block_map,
78
79 #[cfg(debug_assertions)]
80 max_key,
81 }
82 }
83
84 pub fn size(&self) -> usize {
85 (self.block_map.capacity() * std::mem::size_of::<usize>())
86 + (self.bitmap.capacity() * std::mem::size_of::<usize>())
87 + std::mem::size_of_val(self)
88 }
89
90 /// Reduces the allocated memory usage of the bitmap to the minimum required
91 /// for the current bitmap contents.
92 ///
93 /// This is useful to minimise the memory footprint of a populated,
94 /// read-only CompressedBitmap.
95 ///
96 /// See [`Vec::shrink_to_fit`](std::vec::Vec::shrink_to_fit).
97 pub fn shrink_to_fit(&mut self) {
98 self.bitmap.shrink_to_fit();
99 self.block_map.shrink_to_fit();
100 // TODO: remove 0 blocks
101 }
102
103 /// Resets the state of the bitmap.
104 ///
105 /// An efficient way to remove all elements in the bitmap to allow it to be
106 /// reused. Does not shrink the allocated backing memory, instead retaining
107 /// the capacity to avoid reallocations.
108 pub fn clear(&mut self) {
109 for block in self.block_map.iter_mut() {
110 *block = 0;
111 }
112 self.bitmap.truncate(0);
113 }
114
    /// Inserts `key` into the bitmap.
    ///
    /// Passing `value == false` clears the bit for `key` instead.
    ///
    /// # Panics
    ///
    /// This method MAY panic if `key` is more than the `max_key` value provided
    /// when initialising the bitmap.
    ///
    /// If `debug_assertions` are enabled (such as in debug builds) inserting
    /// `key > max` will always panic. In release builds, this may not panic for
    /// values of `key` that are only slightly larger than `max_key` for
    /// performance reasons.
    pub fn set(&mut self, key: usize, value: bool) {
        #[cfg(debug_assertions)]
        debug_assert!(key <= self.max_key, "key {} > {} max", key, self.max_key);

        // Identify the logical block (usize-sized chunk of the uncompressed
        // bitmap) that holds the bit for `key`:
        //
        //                    Bitmap:  │
        //                             ▼
        //     ┌───┬───┬───┬───┐ ┌───┬───┬───┬───┐ ┌───┬───┬───┬───┐
        //     │ 0 │ 0 │ 0 │ 0 │ │ 0 │ 0 │ 0 │ 0 │ │ 0 │ 0 │ 0 │ 0 │
        //     └───┴───┴───┴───┘ └───┴───┴───┴───┘ └───┴───┴───┴───┘
        //          Block 0           Block 1           Block 2
        //
        let block_index = index_for_key(key);

        // Because the blocks are initialised lazily to provide the sparse
        // bitmap behaviour, there may be no block yet allocated for this key.
        // The block_map data structure is itself a bitmap, with a 1 bit
        // indicating the block has been allocated.
        //
        // Locate the usize in the block_map that contains the presence bit
        // for block_index, and the mask selecting that bit within it:
        //
        //   Block Map:
        //
        //        ┌───┬───┬───┬───┐
        //     0: │ 0 │ 1 │ 1 │ 0 │
        //        └───┴───┴───┴───┘
        //
        //        ┌───┬───┬───┬───┐
        //     1: │ 1 │ 0 │ 1 │ 0 │
        //        └─▲─┴───┴───┴───┘
        //   block_index ━━━━━━━┛
        //        ┌───┬───┬───┬───┐
        //     2: │ 0 │ 0 │ 1 │ 1 │
        //        └───┴───┴───┴───┘
        //
        let block_map_index = index_for_key(block_index);
        let block_map_bitmask = bitmask_for_key(block_index);

        // Because blocks are stored contiguously with empty blocks elided,
        // block n is generally NOT at bitmap[n]. Its physical position is the
        // number of 1 bits in the block_map before bit n - an operation that
        // is very fast on modern hardware thanks to the POPCNT instruction.
        //
        //   Block Map:
        //
        //                      ┌───┬───┐
        //                    0 │ 1 │ 1 │ 0
        //                      └─△─┴─△─┘
        //                        └───┼────────── popcount()
        //              ┏━━━┓       ┌─▽─┐
        //              ┃ 1 ┃   0   │ 1 │   0
        //              ┗━▲━┛       └───┘
        //   block_index ━┛
        //
        // In the above example the popcount() is 3, making the block the
        // 3+1=4th block in bitmap; as the arrays are zero-indexed the +1 is
        // omitted, adjusting from position 4 to index 3.

        // Count the ones in the full block_map words before the word holding
        // our presence bit.
        //
        // This could chain() the final masked count_ones() call below using
        // once_with, and while more readable, it is unfortunately measurably
        // slower in practice.
        let offset: usize = (0..block_map_index)
            .map(|i| self.block_map[i].count_ones() as usize)
            .sum();

        // Mask out the presence bit and everything above it so only populated
        // blocks strictly before block_index are counted.
        let mask = block_map_bitmask - 1;
        let offset = offset + (self.block_map[block_map_index] & mask).count_ones() as usize;

        // Offset now contains the index in bitmap at which the block for
        // block_index lives - or would live, if it has not been allocated.
        //
        // Read the presence bit to find out which.
        if self.block_map[block_map_index] & block_map_bitmask == 0 {
            // An unallocated block already encodes "all zeroes", so clearing
            // a bit requires no work.
            if !value {
                return;
            }

            // The block does not exist: materialise it containing only the
            // bit for `key`, at its sorted position `offset`.
            if offset >= self.bitmap.len() {
                self.bitmap.push(bitmask_for_key(key));
            } else {
                // If offset is < bitmap.len() this will require moving all
                // the elements at offset+1 one slot to the right to make
                // room for the new element.
                //
                // For bitmaps with large numbers of elements to the right
                // of offset, this can become expensive.
                self.bitmap.insert(offset, bitmask_for_key(key));
            }
            // Record the newly-allocated block in the block map.
            self.block_map[block_map_index] |= block_map_bitmask;
            return;
        }

        // Otherwise the block map indicates the block is already allocated:
        // set or clear the bit in place.
        //
        // NOTE(review): clearing the last set bit leaves an all-zero block
        // allocated (its presence bit stays set) - harmless for correctness,
        // but the slot is never reclaimed.
        if value {
            self.bitmap[offset] |= bitmask_for_key(key);
        } else {
            self.bitmap[offset] &= !bitmask_for_key(key);
        }
    }
254
255 /// Returns the value at `key`.
256 ///
257 /// If a value for `key` was not previously set, `false` is returned.
258 ///
259 /// # Panics
260 ///
261 /// This method MAY panic if `key` is more than the `max_key` value provided
262 /// when initialising the bitmap.
263 pub fn get(&self, key: usize) -> bool {
264 let block_index = index_for_key(key);
265 let block_map_index = index_for_key(block_index);
266 let block_map_bitmask = bitmask_for_key(block_index);
267
268 if self.block_map[block_map_index] & block_map_bitmask == 0 {
269 return false;
270 }
271
272 let offset: usize = (0..block_map_index)
273 .map(|i| self.block_map[i].count_ones() as usize)
274 .sum();
275
276 let mask = block_map_bitmask - 1;
277 let offset: usize = offset + (self.block_map[block_map_index] & mask).count_ones() as usize;
278
279 self.bitmap[offset] & bitmask_for_key(key) != 0
280 }
281
282 /// Perform a bitwise OR against `self` and `other`, returning the
283 /// resulting merged [`CompressedBitmap`].
284 ///
285 /// # Panics
286 ///
287 /// This method panics if `other` was not configured with the same
288 /// `max_key`.
289 pub fn or(&self, other: &Self) -> Self {
290 #[cfg(debug_assertions)]
291 debug_assert_eq!(self.max_key, other.max_key);
292
293 // Invariant: the block maps are of equal length, meaning the zipped
294 // iters yield both sides to completion.
295 assert_eq!(self.block_map.len(), other.block_map.len());
296
297 let left = BlockMapIter::new(self);
298 let right = BlockMapIter::new(other);
299
300 // Construct the physical set of compressed bitmap blocks.
301 //
302 // By iterating over the non-empty logical blocks and OR-ing them
303 // together (or picking one if only one is non-empty) the merged output
304 // of both compressed bitmaps is computed (itself compressed).
305 let bitmap = left
306 .zip(right)
307 .filter_map(|(l, r)| {
308 Some(match (l, r) {
309 (None, None) => return None,
310 (None, Some(r)) => other.bitmap[r],
311 (Some(l), None) => self.bitmap[l],
312 (Some(l), Some(r)) => self.bitmap[l] | other.bitmap[r],
313 })
314 })
315 .collect::<Vec<_>>();
316
317 // Then merge the two bitmap blocks, the OR of which is guaranteed to
318 // contain exactly N set bits for the N blocks in "physical".
319 let block_map = self
320 .block_map
321 .iter()
322 .zip(&other.block_map)
323 .map(|(l, r)| l | r)
324 .collect::<Vec<_>>();
325
326 // Invariant: The number of set bits in the block map must match the
327 // number of blocks in the bitmap.
328 debug_assert_eq!(
329 block_map.iter().map(|v| v.count_ones()).sum::<u32>() as usize,
330 bitmap.len()
331 );
332
333 Self {
334 block_map,
335 bitmap,
336
337 #[cfg(debug_assertions)]
338 max_key: self.max_key,
339 }
340 }
341}
342
/// Yields the 0-indexed physical indexes into the sparse bitmap for non-empty
/// blocks.
///
/// If for the Nth call to `next()` the Nth sparse bitmap block is elided,
/// [`None`] is returned. If the Nth bitmap block is non-empty, the physical
/// index into the compressed vec is yielded.
///
/// The iterator visits every bit of `bitmap.block_map`, so it yields exactly
/// `block_map.len() * usize::BITS` items before terminating.
#[derive(Debug)]
struct BlockMapIter<'a> {
    /// The bitmap whose block map is being walked.
    bitmap: &'a CompressedBitmap,

    /// The index into bitmap.block_map to be processed next (0 -> N).
    block_idx: usize,
    /// The bit in the block to be evaluated next (LSB -> MSB).
    block_bit: u8,
    /// The physical index to be yielded next.
    physical_idx: usize,
}
360
361impl<'a> BlockMapIter<'a> {
362 /// Construct a new [`BlockMapIter`] that yields indexes into the physical
363 /// bitmap blocks in `bitmap`.
364 fn new(bitmap: &'a CompressedBitmap) -> Self {
365 Self {
366 bitmap,
367 block_idx: 0,
368 block_bit: 0,
369 physical_idx: 0,
370 }
371 }
372}
373
374impl Iterator for BlockMapIter<'_> {
375 type Item = Option<usize>;
376
377 fn next(&mut self) -> Option<Self::Item> {
378 let block = self.bitmap.block_map.get(self.block_idx)?;
379
380 let v = if (block & (1 << self.block_bit)) > 0 {
381 // This logical block is non-empty.
382
383 // Read the physical index for the nth logical block.
384 let idx = self.physical_idx;
385
386 // Increment for the next physical block.
387 self.physical_idx += 1;
388
389 Some(idx)
390 } else {
391 // This logical block is empty.
392 None
393 };
394
395 // Advance the bit within the block to evaluate next.
396 self.block_bit += 1;
397
398 // Advance the block index (and wrap the bit index) if the last
399 // inspected bit was the last bit in the block.
400 if self.block_bit == usize::BITS as u8 {
401 self.block_bit = 0;
402 self.block_idx += 1;
403 }
404
405 Some(v)
406 }
407}
408
/// Implements the [`Bitmap`] abstraction by delegating directly to the
/// inherent methods on [`CompressedBitmap`].
impl Bitmap for CompressedBitmap {
    fn get(&self, key: usize) -> bool {
        self.get(key)
    }

    fn set(&mut self, key: usize, value: bool) {
        self.set(key, value)
    }

    fn byte_size(&self) -> usize {
        self.size()
    }

    fn or(&self, other: &Self) -> Self {
        self.or(other)
    }

    fn new_with_capacity(max_key: usize) -> Self {
        Self::new(max_key)
    }
}
430
431impl From<VecBitmap> for CompressedBitmap {
432 fn from(bitmap: VecBitmap) -> Self {
433 let (bitmap, max_key) = bitmap.into_parts();
434
435 // Calculate how many instances of usize (blocks) are needed to hold
436 // max_key number of bits.
437 let num_blocks = index_for_key(max_key);
438
439 // Figure out how many usize elements are needed to represent blocks
440 // number of bitmaps.
441 let num_blocks = match num_blocks % (u64::BITS as usize) {
442 0 => index_for_key(num_blocks),
443 _ => index_for_key(num_blocks) + 1, // +1 to cover the remainder
444 };
445
446 // Then shrink the bitmap into a 2-level compressed bitmap, dropping runs of
447 // 0 bits in the raw bitmap.
448 let mut block_map = vec![0; num_blocks];
449 let mut compressed = Vec::default();
450 for (idx, block) in bitmap.into_iter().enumerate() {
451 // If this block contains no set bits, it is elided from the compressed
452 // representation.
453 if block == 0 {
454 continue;
455 }
456
457 // This block contains data.
458 //
459 // Add the block to the compressed representation and mark it in the
460 // block map.
461 compressed.push(block);
462 block_map[index_for_key(idx)] |= bitmask_for_key(idx);
463 }
464
465 CompressedBitmap {
466 block_map,
467 bitmap: compressed,
468
469 #[cfg(debug_assertions)]
470 max_key,
471 }
472 }
473}
474
475// TODO(dom:test): proptest conversion
476
#[cfg(test)]
mod tests {
    use proptest::prelude::*;
    use quickcheck_macros::quickcheck;

    use super::*;

    // Asserts $bitmap contains exactly the listed elements within the key
    // range 0..$max: every listed key must read as true, all others false.
    macro_rules! contains_only_truthy {
        ($bitmap:ident, $max:expr; $(
            $element:expr
        ),*) => {
            let truthy = vec![$($element,)*];
            for i in 0..$max {
                assert!($bitmap.get(i) == truthy.contains(&i), "unexpected value {}", i);
            }
        };
    }

    // Set keys (including max_key itself, which is inclusive) must read back
    // as true, and no other key may read as set.
    #[test]
    fn test_set_contains() {
        let mut b = CompressedBitmap::new(100);
        b.set(100, true);
        b.set(0, true);
        b.set(42, true);

        contains_only_truthy!(b, 100; 100, 0, 42);

        assert!(b.get(100));
        assert!(b.get(0));
        assert!(b.get(42));
    }

    // clear() must remove every previously-set key.
    #[test]
    fn test_clear() {
        let mut b = CompressedBitmap::new(100);
        b.set(100, true);
        b.set(0, true);
        b.set(42, true);

        contains_only_truthy!(b, 100; 100, 0, 42);
        b.clear();
        contains_only_truthy!(b, 100;);
    }

    // A key can be set and subsequently cleared in place.
    #[test]
    fn test_set_true_false() {
        let mut b = CompressedBitmap::new(100);
        b.set(42, true);
        assert!(b.get(42));
        b.set(42, false);
        assert!(!b.get(42));
    }

    // BlockMapIter yields one item per logical block: the physical index for
    // populated blocks, None for elided ones.
    #[test]
    fn test_block_map_iter() {
        let mut bitmap = CompressedBitmap::new(i16::MAX as _);
        bitmap.set(1, true); // Block 0
        bitmap.set(usize::BITS as usize * 4, true); // Block 4
        bitmap.set(usize::BITS as usize * 64, true); // Block 64
        bitmap.set(usize::BITS as usize * 65, true); // Block 65
        bitmap.set(usize::BITS as usize * 128, true); // Block 128

        let mut iter = BlockMapIter::new(&bitmap).enumerate();

        assert_eq!(iter.next().unwrap(), (0, Some(0))); // The 0th block is non-empty and at physical index 0.
        assert_eq!(iter.next().unwrap(), (1, None)); // The 1st block is all zero and elided.
        assert_eq!(iter.next().unwrap(), (2, None)); // The 2nd block is all zero and elided.
        assert_eq!(iter.next().unwrap(), (3, None)); // The 3rd block is all zero and elided.
        assert_eq!(iter.next().unwrap(), (4, Some(1))); // The 4th block is non-empty and at physical index 1.

        // Filter out all the None entries, preserving the enumerated idx.
        //
        // This causes the iterator to yield (logical block, physical block).
        let mut iter = iter.filter_map(|(idx, block)| block.map(|v| (idx, v)));

        // Then the next non-empty blocks and their physical indexes:
        assert_eq!(iter.next().unwrap(), (64, 2)); // The 64th block is non-empty and at physical index 2.
        assert_eq!(iter.next().unwrap(), (65, 3)); // The 65th block is non-empty and at physical index 3.

        // Finally the last bit!
        assert_eq!(iter.next().unwrap(), (128, 4)); // The 128th block is non-empty and at physical index 4.

        // And the iterator should terminate.
        assert!(iter.next().is_none());
    }

    // Setting a key beyond max_key must panic in debug builds (enforced by
    // the debug_assert in set()).
    #[quickcheck]
    #[should_panic]
    fn test_panic_exceeds_max(max: u16) {
        let max = max as usize;
        let mut b = CompressedBitmap::new(max);
        b.set(max + 1, true);
    }

    // Property test: an arbitrary (small) set of keys round-trips exactly.
    #[quickcheck]
    fn test_set_contains_prop(mut vals: Vec<u16>) {
        vals.truncate(10);
        let mut b = CompressedBitmap::new(u16::MAX.into());
        for v in &vals {
            b.set(*v as usize, true);
        }

        for i in 0..u16::MAX {
            assert!(
                b.get(i as usize) == vals.contains(&i),
                "unexpected value {}",
                i
            );
        }
    }

    // Property test: or() yields exactly the union of both input key sets.
    #[quickcheck]
    fn test_or(mut a: Vec<u16>, mut b: Vec<u16>) {
        a.truncate(10);
        let mut bitmap_a = CompressedBitmap::new(u16::MAX.into());
        for v in &a {
            bitmap_a.set(*v as usize, true);
        }

        b.truncate(10);
        let mut bitmap_b = CompressedBitmap::new(u16::MAX.into());
        for v in &b {
            bitmap_b.set(*v as usize, true);
        }

        let merged = bitmap_a.or(&bitmap_b);

        for i in 0..u16::MAX {
            let want_hit = a.contains(&i) || b.contains(&i);
            assert!(
                merged.get(i as usize) == want_hit,
                "unexpected value {} want={:?}",
                i,
                want_hit
            );
        }
    }

    // A serde round-trip must preserve the exact set of keys.
    #[cfg(feature = "serde")]
    #[test]
    fn serde() {
        let mut b = CompressedBitmap::new(100);
        b.set(1, true);
        b.set(2, false);
        b.set(3, true);
        contains_only_truthy!(b, 100; 1, 3);

        let encoded = serde_json::to_string(&b).unwrap();
        let decoded: CompressedBitmap = serde_json::from_str(&encoded).unwrap();
        contains_only_truthy!(decoded, 100; 1, 3);
    }

    const MAX_KEY: usize = 1028;

    proptest! {
        // Converting a populated VecBitmap to a CompressedBitmap must
        // preserve every key in the test range.
        #[test]
        fn prop_compress(
            values in prop::collection::hash_set(0..MAX_KEY, 0..20),
        ) {
            let mut b = VecBitmap::new_with_capacity(MAX_KEY);

            for v in &values {
                b.set(*v, true);
            }

            // Compress
            let b = CompressedBitmap::from(b);

            // Ensure all values are equal in the test range.
            for i in 0..MAX_KEY {
                assert_eq!(b.get(i), values.contains(&i));
            }
        }
    }
}