Skip to main content

gix_bitmap/
ewah.rs

1///
2pub mod decode {
3    /// The error returned by [`decode()`](super::decode()).
4    pub type Error = gix_error::Exn<gix_error::ValidationError>;
5}
6
7/// Decode `data` as EWAH bitmap.
8pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> {
9    use crate::decode;
10    use gix_error::{message, OptionExt};
11
12    let (num_bits, data) = decode::u32(data).ok_or_raise(|| message("eof reading amount of bits").into())?;
13    let (len, data) = decode::u32(data).ok_or_raise(|| message("eof reading chunk length").into())?;
14    let len = len as usize;
15
16    // NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop.
17    //       Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe
18    //       one day somebody will find out that it's worth it to use unsafe here.
19    let (mut bits, data) = data
20        .split_at_checked(len * std::mem::size_of::<u64>())
21        .ok_or_raise(|| message("eof while reading bit data").into())?;
22    let mut buf = std::vec::Vec::<u64>::with_capacity(len);
23    for _ in 0..len {
24        let (bit_num, rest) = bits.split_at(std::mem::size_of::<u64>());
25        bits = rest;
26        buf.push(u64::from_be_bytes(bit_num.try_into().unwrap()));
27    }
28
29    let (rlw, data) = decode::u32(data).ok_or_raise(|| message("eof while reading run length width").into())?;
30
31    Ok((
32        Vec {
33            num_bits,
34            bits: buf,
35            rlw: rlw.into(),
36        },
37        data,
38    ))
39}
40
41mod access {
42    use super::Vec;
43
44    impl Vec {
45        /// Create a bitmap from a sequence of bit values.
46        ///
47        /// The resulting bitmap uses a literal-only EWAH representation.
48        ///
49        /// Returns `None` if `bits.len()` exceeds `u32::MAX`.
50        pub fn from_bits(bits: &[bool]) -> Option<Self> {
51            let literal_words: std::vec::Vec<u64> = bits
52                .chunks(64)
53                .map(|chunk| {
54                    chunk.iter().enumerate().fold(
55                        0u64,
56                        |word, (idx, bit)| {
57                            if *bit {
58                                word | (1u64 << idx)
59                            } else {
60                                word
61                            }
62                        },
63                    )
64                })
65                .collect();
66            let num_bits = bits.len().try_into().ok()?;
67
68            Some(Vec {
69                num_bits,
70                bits: std::iter::once((literal_words.len() as u64) << (1 + RLW_RUNNING_BITS))
71                    .chain(literal_words)
72                    .collect(),
73                rlw: 0,
74            })
75        }
76
77        /// Write the bitmap as EWAH bytes to `out`.
78        ///
79        /// These bytes can be parsed again with [`decode()`](super::decode()).
80        pub fn write_to(&self, out: &mut impl std::io::Write) -> std::io::Result<()> {
81            let len: u32 = self.bits.len().try_into().map_err(|_| {
82                std::io::Error::new(std::io::ErrorKind::InvalidInput, "bit word count exceeds u32::MAX")
83            })?;
84            let rlw: u32 = self.rlw.try_into().map_err(|_| {
85                std::io::Error::new(
86                    std::io::ErrorKind::InvalidInput,
87                    "run length word offset exceeds u32::MAX",
88                )
89            })?;
90
91            out.write_all(&self.num_bits.to_be_bytes())?;
92            out.write_all(&len.to_be_bytes())?;
93            for word in &self.bits {
94                out.write_all(&word.to_be_bytes())?;
95            }
96            out.write_all(&rlw.to_be_bytes())
97        }
98
99        /// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array.
100        /// If `f` returns `None` the iteration will be stopped and `None` is returned.
101        ///
102        /// The index is sequential like in any other vector.
103        pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> {
104            let num_bits = self.num_bits();
105            let mut index = 0usize;
106            let mut iter = self.bits.iter();
107            while let Some(word) = iter.next() {
108                if rlw_runbit_is_set(word) {
109                    let len = usize::try_from(rlw_running_len_bits(word)).ok()?;
110                    let end = index.checked_add(len)?;
111                    if end > num_bits {
112                        return None;
113                    }
114                    for _ in 0..len {
115                        f(index)?;
116                        index += 1;
117                    }
118                } else {
119                    let len = usize::try_from(rlw_running_len_bits(word)).ok()?;
120                    let end = index.checked_add(len)?;
121                    if end > num_bits {
122                        return None;
123                    }
124                    index = end;
125                }
126
127                for _ in 0..rlw_literal_words(word) {
128                    let word = iter.next()?;
129                    let remaining = num_bits.checked_sub(index)?;
130                    if remaining == 0 {
131                        return None;
132                    }
133                    let bits_in_word = remaining.min(64);
134                    if bits_in_word < 64 && (word >> bits_in_word) != 0 {
135                        return None;
136                    }
137                    for bit_index in 0..bits_in_word {
138                        if word & (1 << bit_index) != 0 {
139                            f(index)?;
140                        }
141                        index += 1;
142                    }
143                }
144            }
145            Some(())
146        }
147
148        /// The amount of bits we are currently holding.
149        pub fn num_bits(&self) -> usize {
150            self.num_bits.try_into().expect("we are not on 16 bit systems")
151        }
152    }
153
154    #[inline]
155    fn rlw_running_len_bits(w: &u64) -> u64 {
156        rlw_running_len(w) * 64
157    }
158
159    #[inline]
160    fn rlw_running_len(w: &u64) -> u64 {
161        (w >> 1) & RLW_LARGEST_RUNNING_COUNT
162    }
163
164    #[inline]
165    fn rlw_literal_words(w: &u64) -> u64 {
166        w >> (1 + RLW_RUNNING_BITS)
167    }
168
169    #[inline]
170    fn rlw_runbit_is_set(w: &u64) -> bool {
171        w & 1 == 1
172    }
173
174    const RLW_RUNNING_BITS: u64 = 4 * 8;
175    const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1;
176}
177
178/// A growable collection of u64 that are seen as stream of individual bits.
179#[allow(dead_code)]
180#[derive(Clone)]
181pub struct Vec {
182    num_bits: u32,
183    bits: std::vec::Vec<u64>,
184    /// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially.
185    rlw: u64,
186}