Skip to main content

hdt/containers/
bitmap.rs

1//! Bitmap with rank and select support read from an HDT file.
2use crate::containers::vbyte::{encode_vbyte, read_vbyte};
3use bytesize::ByteSize;
4use mem_dbg::{MemSize, SizeFlags};
5use qwt::{AccessBin, BitVector, BitVectorMut, RankBin, SelectBin, bitvector::rs_narrow::RSNarrow};
6#[cfg(feature = "cache")]
7use serde::{Deserialize, Serialize};
8use std::fmt;
9use std::io::BufRead;
10use std::mem::size_of;
11
12/// Compact bitmap representation with rank and select support.
13#[derive(Clone)]
14#[cfg_attr(feature = "cache", derive(Serialize, Deserialize))]
15pub struct Bitmap {
16    /// should be private but is needed by containers/bitmap.rs, use methods provided by Bitmap
17    pub dict: RSNarrow,
18}
19
20pub type Result<T> = core::result::Result<T, Error>;
21
22/// The error type for the bitmap read function.
23#[derive(thiserror::Error, Debug)]
24pub enum Error {
25    #[error("IO error")]
26    Io(#[from] std::io::Error),
27    #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")]
28    InvalidCrc8Checksum(u8, u8),
29    #[error("Invalid CRC32C checksum {0}, expected {1}")]
30    InvalidCrc32Checksum(u32, u32),
31    #[error("Failed to turn raw bytes into u64")]
32    TryFromSliceError(#[from] std::array::TryFromSliceError),
33    #[error("Read unsupported bitmap type {0} != 1")]
34    UnsupportedBitmapType(u8),
35}
36
37impl fmt::Debug for Bitmap {
38    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
39        write!(f, "{}, {} bits", ByteSize(self.size_in_bytes() as u64), self.len())
40    }
41}
42
43impl From<BitVector> for Bitmap {
44    fn from(bv: BitVector) -> Self {
45        Bitmap { dict: bv.into() }
46    }
47}
48
49impl From<BitVectorMut> for Bitmap {
50    fn from(bv: BitVectorMut) -> Self {
51        Bitmap { dict: <BitVectorMut as Into<BitVector>>::into(bv).into() }
52    }
53}
54
55impl Bitmap {
56    /// Construct a bitmap from an existing bitmap in form of a vector, which doesn't have rank and select support. Number of bits multiple of 64.
57    pub fn new(data: Vec<u64>) -> Self {
58        let mut v = BitVectorMut::new();
59        for d in data {
60            v.append_bits(d, 64);
61        }
62        let dict: BitVector = v.into();
63        Bitmap { dict: dict.into() }
64    }
65
66    /// Size in bytes on the heap.
67    pub fn size_in_bytes(&self) -> usize {
68        self.dict.mem_size(SizeFlags::default())
69    }
70
71    /// Number of bits in the bitmap, multiple of 64
72    pub fn len(&self) -> usize {
73        self.dict.n_zeros() + self.dict.n_ones() // RSNarrow.len() is not public
74        // self.dict.bv_len() // only on RSWide
75    }
76
77    /// Number of bits set
78    pub fn num_ones(&self) -> usize {
79        self.dict.n_ones()
80    }
81
82    /// Returns the position of the k-1-th one bit or None if there aren't that many.
83    pub fn select1(&self, k: usize) -> Option<usize> {
84        self.dict.select1(k)
85    }
86
87    /// Returns the number of one bits from the 0-th bit to the k-1-th bit. Panics if self.len() < pos.
88    pub fn rank(&self, k: usize) -> usize {
89        self.dict.rank1(k).unwrap_or_else(|| panic!("Out of bounds position: {} >= {}", k, self.len()))
90    }
91
92    /// Whether the node given position is the last child of its parent.
93    pub fn at_last_sibling(&self, word_index: usize) -> bool {
94        self.dict.get(word_index).expect("word index out of bounds")
95    }
96
97    /// Read bitmap from a suitable point within HDT file data and verify checksums.
98    pub fn read<R: BufRead>(reader: &mut R) -> Result<Self> {
99        use Error::*;
100        let mut history: Vec<u8> = Vec::with_capacity(5);
101
102        // read the type
103        let mut bitmap_type = [0u8];
104        reader.read_exact(&mut bitmap_type)?;
105        history.extend_from_slice(&bitmap_type);
106        if bitmap_type[0] != 1 {
107            return Err(UnsupportedBitmapType(bitmap_type[0]));
108        }
109
110        // read the number of bits
111        let (num_bits, bytes_read) = read_vbyte(reader)?;
112        history.extend_from_slice(&bytes_read);
113
114        // read section CRC8
115        let mut crc_code = [0_u8];
116        reader.read_exact(&mut crc_code)?;
117        let crc_code = crc_code[0];
118
119        // validate section CRC8
120        let crc8 = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
121        let mut digest = crc8.digest();
122        digest.update(&history);
123        let crc_calculated = digest.finalize();
124        if crc_calculated != crc_code {
125            return Err(InvalidCrc8Checksum(crc_calculated, crc_code));
126        }
127
128        // read all but the last word, last word is byte aligned
129        let full_byte_amount = ((num_bits - 1) >> 6) * 8;
130        let mut full_words = vec![0_u8; full_byte_amount];
131        // div_ceil is unstable
132        let mut data: Vec<u64> = Vec::with_capacity(full_byte_amount / 8 + usize::from(full_byte_amount % 8 != 0));
133        reader.read_exact(&mut full_words)?;
134
135        for word in full_words.chunks_exact(size_of::<u64>()) {
136            data.push(u64::from_le_bytes(<[u8; 8]>::try_from(word)?));
137        }
138
139        // initiate computation of CRC32
140        let crc32 = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
141        let mut digest = crc32.digest();
142        digest.update(&full_words);
143
144        let mut bits_read = 0;
145        let mut last_value: u64 = 0;
146        let last_word_bits = if num_bits == 0 { 0 } else { ((num_bits - 1) % 64) + 1 };
147
148        while bits_read < last_word_bits {
149            let mut buffer = [0u8];
150            reader.read_exact(&mut buffer)?;
151            digest.update(&buffer);
152            last_value |= (buffer[0] as u64) << bits_read;
153            bits_read += 8;
154        }
155        data.push(last_value);
156
157        // read entry body CRC32
158        let mut crc_code = [0_u8; 4];
159        reader.read_exact(&mut crc_code)?;
160        let crc_code = u32::from_le_bytes(crc_code);
161
162        // validate entry body CRC32
163        // not worth it to spawn an extra thread as our bitmaps are comparatively small
164        let crc_calculated = digest.finalize();
165        if crc_calculated != crc_code {
166            return Err(InvalidCrc32Checksum(crc_calculated, crc_code));
167        }
168        Ok(Self::new(data))
169    }
170
171    pub fn write(&self, w: &mut impl std::io::Write) -> Result<()> {
172        let crc = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
173        let mut hasher = crc.digest();
174        // type
175        let bitmap_type: [u8; 1] = [1];
176        w.write_all(&bitmap_type)?;
177        hasher.update(&bitmap_type);
178        // number of bits
179        let t = encode_vbyte(self.len());
180        w.write_all(&t)?;
181        hasher.update(&t);
182        // crc8 checksum
183        let checksum = hasher.finalize();
184        w.write_all(&checksum.to_le_bytes())?;
185
186        // write data
187        let crc32 = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
188        let mut hasher = crc32.digest();
189
190        let words: &[u64] = self.dict.bit_vector().words();
191        let num_bytes = self.dict.bit_vector().len().div_ceil(8); // HDT spec expects no superflous bytes to be written
192        let bytes = unsafe {
193            std::slice::from_raw_parts(words.as_ptr().cast::<u8>(), num_bytes) // assume little endian
194        };
195
196        w.write_all(bytes)?;
197        hasher.update(bytes);
198        let crc_code = hasher.finalize();
199        let crc_code = crc_code.to_le_bytes();
200        w.write_all(&crc_code)?;
201        Ok(())
202    }
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208    use crate::tests::init;
209    use pretty_assertions::assert_eq;
210
211    #[test]
212    fn bugtest() {
213        let mut v = BitVectorMut::new();
214        v.push(true);
215        v.push(false);
216        let bv: BitVector = v.into();
217        let rs: RSNarrow = bv.into();
218        rs.n_zeros();
219    }
220
221    #[test]
222    fn write() -> color_eyre::Result<()> {
223        init();
224        let bits: Vec<u64> = vec![0b10111];
225        let bitmap = Bitmap::new(bits);
226        assert_eq!(bitmap.len(), 64);
227        // position of k-1th 1 bit
228        // read bits from right to left, i.e. last one is pos 0
229        assert_eq!(bitmap.select1(0).unwrap(), 0);
230        assert_eq!(bitmap.select1(1).unwrap(), 1);
231        assert_eq!(bitmap.select1(2).unwrap(), 2);
232        assert_eq!(bitmap.select1(3).unwrap(), 4);
233        assert_eq!(bitmap.select1(4), None);
234        // number of one bits from the 0-th bit to the k-1-th bit
235        assert_eq!(bitmap.rank(1), 1);
236        assert_eq!(bitmap.rank(5), 4);
237        let mut buf = Vec::<u8>::new();
238        bitmap.write(&mut buf)?;
239        let bitmap2 = Bitmap::read(&mut std::io::Cursor::new(buf))?;
240        //assert_eq!(bitmap.dict.bit_vector().words(), bitmap2.dict.bit_vector().words());
241        assert_eq!(bitmap.dict, bitmap2.dict);
242        Ok(())
243    }
244}