Skip to main content

gix_bitmap/
ewah.rs

1///
2pub mod decode {
3    /// The error returned by [`decode()`](super::decode()).
4    pub type Error = gix_error::Exn<gix_error::ValidationError>;
5}
6
7/// Decode `data` as EWAH bitmap.
8pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> {
9    use crate::decode;
10    use gix_error::{message, OptionExt};
11
12    let (num_bits, data) = decode::u32(data).ok_or_raise(|| message("eof reading amount of bits").into())?;
13    let (len, data) = decode::u32(data).ok_or_raise(|| message("eof reading chunk length").into())?;
14    let len = len as usize;
15
16    // NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop.
17    //       Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe
18    //       one day somebody will find out that it's worth it to use unsafe here.
19    let (mut bits, data) = data
20        .split_at_checked(len * std::mem::size_of::<u64>())
21        .ok_or_raise(|| message("eof while reading bit data").into())?;
22    let mut buf = std::vec::Vec::<u64>::with_capacity(len);
23    for _ in 0..len {
24        let (bit_num, rest) = bits.split_at(std::mem::size_of::<u64>());
25        bits = rest;
26        buf.push(u64::from_be_bytes(bit_num.try_into().unwrap()));
27    }
28
29    let (rlw, data) = decode::u32(data).ok_or_raise(|| message("eof while reading run length width").into())?;
30
31    Ok((
32        Vec {
33            num_bits,
34            bits: buf,
35            rlw: rlw.into(),
36        },
37        data,
38    ))
39}
40
41mod access {
42    use super::Vec;
43
44    impl Vec {
45        /// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array.
46        /// If `f` returns `None` the iteration will be stopped and `None` is returned.
47        ///
48        /// The index is sequential like in any other vector.
49        pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> {
50            let mut index = 0usize;
51            let mut iter = self.bits.iter();
52            while let Some(word) = iter.next() {
53                if rlw_runbit_is_set(word) {
54                    let len = rlw_running_len_bits(word);
55                    for _ in 0..len {
56                        f(index)?;
57                        index += 1;
58                    }
59                } else {
60                    index += usize::try_from(rlw_running_len_bits(word)).ok()?;
61                }
62
63                for _ in 0..rlw_literal_words(word) {
64                    let word = iter
65                        .next()
66                        .expect("BUG: ran out of words while going through uncompressed portion");
67                    for bit_index in 0..64 {
68                        if word & (1 << bit_index) != 0 {
69                            f(index)?;
70                        }
71                        index += 1;
72                    }
73                }
74            }
75            Some(())
76        }
77
78        /// The amount of bits we are currently holding.
79        pub fn num_bits(&self) -> usize {
80            self.num_bits.try_into().expect("we are not on 16 bit systems")
81        }
82    }
83
84    #[inline]
85    fn rlw_running_len_bits(w: &u64) -> u64 {
86        rlw_running_len(w) * 64
87    }
88
89    #[inline]
90    fn rlw_running_len(w: &u64) -> u64 {
91        (w >> 1) & RLW_LARGEST_RUNNING_COUNT
92    }
93
94    #[inline]
95    fn rlw_literal_words(w: &u64) -> u64 {
96        w >> (1 + RLW_RUNNING_BITS)
97    }
98
99    #[inline]
100    fn rlw_runbit_is_set(w: &u64) -> bool {
101        w & 1 == 1
102    }
103
104    const RLW_RUNNING_BITS: u64 = 4 * 8;
105    const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1;
106}
107
108/// A growable collection of u64 that are seen as stream of individual bits.
109#[allow(dead_code)]
110#[derive(Clone)]
111pub struct Vec {
112    num_bits: u32,
113    bits: std::vec::Vec<u64>,
114    /// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially.
115    rlw: u64,
116}