gix_bitmap/
ewah.rs

1///
2pub mod decode {
3    /// The error returned by [`decode()`][super::decode()].
4    #[derive(Debug, thiserror::Error)]
5    #[allow(missing_docs)]
6    pub enum Error {
7        #[error("{}", message)]
8        Corrupt { message: &'static str },
9    }
10}
11
12/// Decode `data` as EWAH bitmap.
13pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> {
14    use self::decode::Error;
15    use crate::decode;
16
17    let (num_bits, data) = decode::u32(data).ok_or(Error::Corrupt {
18        message: "eof reading amount of bits",
19    })?;
20    let (len, data) = decode::u32(data).ok_or(Error::Corrupt {
21        message: "eof reading chunk length",
22    })?;
23    let len = len as usize;
24
25    // NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop.
26    //       Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe
27    //       one day somebody will find out that it's worth it to use unsafe here.
28    let (mut bits, data) = data
29        .split_at_checked(len * std::mem::size_of::<u64>())
30        .ok_or(Error::Corrupt {
31            message: "eof while reading bit data",
32        })?;
33    let mut buf = std::vec::Vec::<u64>::with_capacity(len);
34    for _ in 0..len {
35        let (bit_num, rest) = bits.split_at(std::mem::size_of::<u64>());
36        bits = rest;
37        buf.push(u64::from_be_bytes(bit_num.try_into().unwrap()));
38    }
39
40    let (rlw, data) = decode::u32(data).ok_or(Error::Corrupt {
41        message: "eof while reading run length width",
42    })?;
43
44    Ok((
45        Vec {
46            num_bits,
47            bits: buf,
48            rlw: rlw.into(),
49        },
50        data,
51    ))
52}
53
54mod access {
55    use super::Vec;
56
57    impl Vec {
58        /// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array.
59        /// If `f` returns `None` the iteration will be stopped and `None` is returned.
60        ///
61        /// The index is sequential like in any other vector.
62        pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> {
63            let mut index = 0usize;
64            let mut iter = self.bits.iter();
65            while let Some(word) = iter.next() {
66                if rlw_runbit_is_set(word) {
67                    let len = rlw_running_len_bits(word);
68                    for _ in 0..len {
69                        f(index)?;
70                        index += 1;
71                    }
72                } else {
73                    index += usize::try_from(rlw_running_len_bits(word)).ok()?;
74                }
75
76                for _ in 0..rlw_literal_words(word) {
77                    let word = iter
78                        .next()
79                        .expect("BUG: ran out of words while going through uncompressed portion");
80                    for bit_index in 0..64 {
81                        if word & (1 << bit_index) != 0 {
82                            f(index)?;
83                        }
84                        index += 1;
85                    }
86                }
87            }
88            Some(())
89        }
90
91        /// The amount of bits we are currently holding.
92        pub fn num_bits(&self) -> usize {
93            self.num_bits.try_into().expect("we are not on 16 bit systems")
94        }
95    }
96
97    #[inline]
98    fn rlw_running_len_bits(w: &u64) -> u64 {
99        rlw_running_len(w) * 64
100    }
101
102    #[inline]
103    fn rlw_running_len(w: &u64) -> u64 {
104        (w >> 1) & RLW_LARGEST_RUNNING_COUNT
105    }
106
107    #[inline]
108    fn rlw_literal_words(w: &u64) -> u64 {
109        w >> (1 + RLW_RUNNING_BITS)
110    }
111
112    #[inline]
113    fn rlw_runbit_is_set(w: &u64) -> bool {
114        w & 1 == 1
115    }
116
117    const RLW_RUNNING_BITS: u64 = 4 * 8;
118    const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1;
119}
120
121/// A growable collection of u64 that are seen as stream of individual bits.
122#[allow(dead_code)]
123#[derive(Clone)]
124pub struct Vec {
125    num_bits: u32,
126    bits: std::vec::Vec<u64>,
127    /// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially.
128    rlw: u64,
129}