Skip to main content

summavy_bitpacker/
blocked_bitpacker.rs

1use super::bitpacker::BitPacker;
2use super::compute_num_bits;
3use crate::{minmax, BitUnpacker};
4
5const BLOCK_SIZE: usize = 128;
6
7/// `BlockedBitpacker` compresses data in blocks of
8/// 128 elements, while keeping an index on it
9#[derive(Debug, Clone)]
10pub struct BlockedBitpacker {
11    // bitpacked blocks
12    compressed_blocks: Vec<u8>,
13    // uncompressed data, collected until BLOCK_SIZE
14    buffer: Vec<u64>,
15    offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
16}
17impl Default for BlockedBitpacker {
18    fn default() -> Self {
19        BlockedBitpacker::new()
20    }
21}
22
23/// `BlockedBitpackerEntryMetaData` encodes the
24/// offset and bit_width into a u64 bit field
25///
26/// This saves some space, since 7byte is more
27/// than enough and also keeps the access fast
28/// because of alignment
29#[derive(Debug, Clone, Default)]
30struct BlockedBitpackerEntryMetaData {
31    encoded: u64,
32    base_value: u64,
33}
34
35impl BlockedBitpackerEntryMetaData {
36    fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
37        let encoded = offset | (num_bits as u64) << (64 - 8);
38        Self {
39            encoded,
40            base_value,
41        }
42    }
43    fn offset(&self) -> u64 {
44        (self.encoded << 8) >> 8
45    }
46    fn num_bits(&self) -> u8 {
47        (self.encoded >> 56) as u8
48    }
49    fn base_value(&self) -> u64 {
50        self.base_value
51    }
52}
53
54#[test]
55fn metadata_test() {
56    let meta = BlockedBitpackerEntryMetaData::new(50000, 6, 40000);
57    assert_eq!(meta.offset(), 50000);
58    assert_eq!(meta.num_bits(), 6);
59}
60
61fn mem_usage<T>(items: &Vec<T>) -> usize {
62    items.capacity() * std::mem::size_of::<T>()
63}
64
65impl BlockedBitpacker {
66    pub fn new() -> Self {
67        let mut compressed_blocks = vec![];
68        compressed_blocks.resize(8, 0);
69        Self {
70            compressed_blocks,
71            buffer: vec![],
72            offset_and_bits: vec![],
73        }
74    }
75
76    /// The memory used (inclusive childs)
77    pub fn mem_usage(&self) -> usize {
78        std::mem::size_of::<BlockedBitpacker>()
79            + self.compressed_blocks.capacity()
80            + mem_usage(&self.offset_and_bits)
81            + mem_usage(&self.buffer)
82    }
83
84    #[inline]
85    pub fn add(&mut self, val: u64) {
86        self.buffer.push(val);
87        if self.buffer.len() == BLOCK_SIZE {
88            self.flush();
89        }
90    }
91
92    pub fn flush(&mut self) {
93        if let Some((min_value, max_value)) = minmax(self.buffer.iter()) {
94            let mut bit_packer = BitPacker::new();
95            let num_bits_block = compute_num_bits(*max_value - min_value);
96            // todo performance: the padding handling could be done better, e.g. use a slice and
97            // return num_bytes written from bitpacker
98            self.compressed_blocks
99                .resize(self.compressed_blocks.len() - 8, 0); // remove padding for bitpacker
100            let offset = self.compressed_blocks.len() as u64;
101            // todo performance: for some bit_width we
102            // can encode multiple vals into the
103            // mini_buffer before checking to flush
104            // (to be done in BitPacker)
105            for val in self.buffer.iter() {
106                bit_packer
107                    .write(
108                        *val - min_value,
109                        num_bits_block,
110                        &mut self.compressed_blocks,
111                    )
112                    .expect("cannot write bitpacking to output"); // write to in memory can't fail
113            }
114            bit_packer.flush(&mut self.compressed_blocks).unwrap();
115            self.offset_and_bits
116                .push(BlockedBitpackerEntryMetaData::new(
117                    offset,
118                    num_bits_block,
119                    *min_value,
120                ));
121
122            self.buffer.clear();
123            self.compressed_blocks
124                .resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
125        }
126    }
127    #[inline]
128    pub fn get(&self, idx: usize) -> u64 {
129        let metadata_pos = idx / BLOCK_SIZE;
130        let pos_in_block = idx % BLOCK_SIZE;
131        if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
132            let unpacked = BitUnpacker::new(metadata.num_bits()).get(
133                pos_in_block as u32,
134                &self.compressed_blocks[metadata.offset() as usize..],
135            );
136            unpacked + metadata.base_value()
137        } else {
138            self.buffer[pos_in_block]
139        }
140    }
141
142    pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
143        // todo performance: we could decompress a whole block and cache it instead
144        let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
145        let iter = (0..bitpacked_elems)
146            .map(move |idx| self.get(idx))
147            .chain(self.buffer.iter().cloned());
148        iter
149    }
150}
151
152#[cfg(test)]
153mod tests {
154    use super::*;
155    #[test]
156    fn blocked_bitpacker_empty() {
157        let blocked_bitpacker = BlockedBitpacker::new();
158        assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![]);
159    }
160    #[test]
161    fn blocked_bitpacker_one() {
162        let mut blocked_bitpacker = BlockedBitpacker::new();
163        blocked_bitpacker.add(50000);
164        assert_eq!(blocked_bitpacker.get(0), 50000);
165        assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![50000]);
166    }
167    #[test]
168    fn blocked_bitpacker_test() {
169        let mut blocked_bitpacker = BlockedBitpacker::new();
170        for val in 0..21500 {
171            blocked_bitpacker.add(val);
172        }
173        for val in 0..21500 {
174            assert_eq!(blocked_bitpacker.get(val as usize), val);
175        }
176        assert_eq!(blocked_bitpacker.iter().count(), 21500);
177        assert_eq!(blocked_bitpacker.iter().last().unwrap(), 21499);
178    }
179}