tantivy_bitpacker/
blocked_bitpacker.rs

use super::bitpacker::BitPacker;
use super::compute_num_bits;
use crate::{minmax, BitUnpacker};

const BLOCK_SIZE: usize = 128;

/// `BlockedBitpacker` compresses data in blocks of
/// 128 elements, while keeping an index over the blocks.
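///
/// # Example
///
/// A minimal usage sketch (assuming the crate re-exports `BlockedBitpacker`
/// at its root):
///
/// ```
/// use tantivy_bitpacker::BlockedBitpacker;
///
/// let mut bitpacker = BlockedBitpacker::new();
/// bitpacker.add(10);
/// bitpacker.add(15);
/// assert_eq!(bitpacker.get(0), 10);
/// assert_eq!(bitpacker.get(1), 15);
/// assert_eq!(bitpacker.iter().collect::<Vec<u64>>(), vec![10, 15]);
/// ```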
#[derive(Debug, Clone)]
pub struct BlockedBitpacker {
    // bitpacked blocks
    compressed_blocks: Vec<u8>,
    // uncompressed data, collected until BLOCK_SIZE
    buffer: Vec<u64>,
    // per-block metadata: offset, bit width and the block's minimum value
    offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
}

impl Default for BlockedBitpacker {
    fn default() -> Self {
        BlockedBitpacker::new()
    }
}

/// `BlockedBitpackerEntryMetaData` encodes the
/// offset and bit_width into a u64 bit field.
///
/// This saves some space, since 7 bytes are more
/// than enough for the offset, and it also keeps
/// accesses fast because the struct stays aligned.
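///
/// Layout sketch (derived from `new`, `offset` and `num_bits` below):
/// the lower 56 bits hold the byte offset into the compressed data,
/// the upper 8 bits hold the bit width used for the block.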
#[derive(Debug, Clone, Default)]
struct BlockedBitpackerEntryMetaData {
    encoded: u64,
    base_value: u64,
}

impl BlockedBitpackerEntryMetaData {
    fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
        let encoded = offset | (num_bits as u64) << (64 - 8);
        Self {
            encoded,
            base_value,
        }
    }
    fn offset(&self) -> u64 {
        (self.encoded << 8) >> 8
    }
    fn num_bits(&self) -> u8 {
        (self.encoded >> 56) as u8
    }
    fn base_value(&self) -> u64 {
        self.base_value
    }
}

#[test]
fn metadata_test() {
    let meta = BlockedBitpackerEntryMetaData::new(50000, 6, 40000);
    assert_eq!(meta.offset(), 50000);
    assert_eq!(meta.num_bits(), 6);
}

fn mem_usage<T>(items: &Vec<T>) -> usize {
    items.capacity() * std::mem::size_of::<T>()
}

impl BlockedBitpacker {
    pub fn new() -> Self {
        Self {
            // 8 bytes of padding expected by the bitpacker; `flush` removes and restores it
            compressed_blocks: vec![0; 8],
            buffer: vec![],
            offset_and_bits: vec![],
        }
    }

    /// The memory used (including child allocations).
    pub fn mem_usage(&self) -> usize {
        std::mem::size_of::<BlockedBitpacker>()
            + self.compressed_blocks.capacity()
            + mem_usage(&self.offset_and_bits)
            + mem_usage(&self.buffer)
    }

    #[inline]
    pub fn add(&mut self, val: u64) {
        self.buffer.push(val);
        if self.buffer.len() == BLOCK_SIZE {
            self.flush();
        }
    }

    pub fn flush(&mut self) {
        if let Some((min_value, max_value)) = minmax(self.buffer.iter()) {
            let mut bit_packer = BitPacker::new();
            // values are delta-encoded against the block's minimum, so the
            // bit width is determined by the spread within the block
            let num_bits_block = compute_num_bits(*max_value - min_value);
            // todo performance: the padding handling could be done better, e.g. use a slice and
            // return num_bytes written from the bitpacker
            self.compressed_blocks
                .resize(self.compressed_blocks.len() - 8, 0); // remove padding for the bitpacker
            let offset = self.compressed_blocks.len() as u64;
            // todo performance: for some bit_widths we
            // could encode multiple vals into the
            // mini_buffer before checking whether to flush
            // (to be done in BitPacker)
            for val in self.buffer.iter() {
                bit_packer
                    .write(
                        *val - min_value,
                        num_bits_block,
                        &mut self.compressed_blocks,
                    )
                    .expect("cannot write bitpacking to output"); // writing to an in-memory Vec can't fail
            }
            bit_packer.flush(&mut self.compressed_blocks).unwrap();
            self.offset_and_bits
                .push(BlockedBitpackerEntryMetaData::new(
                    offset,
                    num_bits_block,
                    *min_value,
                ));

            self.buffer.clear();
            self.compressed_blocks
                .resize(self.compressed_blocks.len() + 8, 0); // restore padding for the bitpacker
        }
    }

    #[inline]
    pub fn get(&self, idx: usize) -> u64 {
        let metadata_pos = idx / BLOCK_SIZE;
        let pos_in_block = idx % BLOCK_SIZE;
        if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
            // the value lives in an already flushed, bitpacked block
            let unpacked = BitUnpacker::new(metadata.num_bits()).get(
                pos_in_block as u32,
                &self.compressed_blocks[metadata.offset() as usize..],
            );
            unpacked + metadata.base_value()
        } else {
            // the value has not been flushed yet and is still in the buffer
            self.buffer[pos_in_block]
        }
    }

    pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
        // todo performance: we could decompress a whole block and cache it instead
        let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
        (0..bitpacked_elems)
            .map(move |idx| self.get(idx))
            .chain(self.buffer.iter().cloned())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn blocked_bitpacker_empty() {
        let blocked_bitpacker = BlockedBitpacker::new();
        assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![]);
    }

    #[test]
    fn blocked_bitpacker_one() {
        let mut blocked_bitpacker = BlockedBitpacker::new();
        blocked_bitpacker.add(50000);
        assert_eq!(blocked_bitpacker.get(0), 50000);
        assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![50000]);
    }

    #[test]
    fn blocked_bitpacker_test() {
        let mut blocked_bitpacker = BlockedBitpacker::new();
        for val in 0..21500 {
            blocked_bitpacker.add(val);
        }
        for val in 0..21500 {
            assert_eq!(blocked_bitpacker.get(val as usize), val);
        }
        assert_eq!(blocked_bitpacker.iter().count(), 21500);
        assert_eq!(blocked_bitpacker.iter().last().unwrap(), 21499);
    }
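
    // Illustrative test: values should be readable both from a flushed,
    // bitpacked block and from the still-uncompressed tail buffer, using only
    // the APIs defined above (`new`, `add`, `get`, `iter`).
    #[test]
    fn blocked_bitpacker_block_boundary() {
        let mut blocked_bitpacker = BlockedBitpacker::new();
        // BLOCK_SIZE is 128: the first 128 values are flushed into a
        // compressed block, the 129th stays in the in-memory buffer.
        for val in 0..129u64 {
            blocked_bitpacker.add(val);
        }
        assert_eq!(blocked_bitpacker.get(127), 127); // read from the compressed block
        assert_eq!(blocked_bitpacker.get(128), 128); // read from the buffer
        assert_eq!(blocked_bitpacker.iter().count(), 129);
    }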
}