Skip to main content

gix_bitmap/
ewah.rs

1///
2pub mod decode {
3    /// The error returned by [`decode()`](super::decode()).
4    pub type Error = gix_error::Exn<gix_error::ValidationError>;
5}
6
7/// Decode `data` as EWAH bitmap.
8pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> {
9    use crate::decode;
10    use gix_error::{OptionExt, message};
11
12    let (num_bits, data) = decode::u32(data).ok_or_raise(|| message("eof reading amount of bits").into())?;
13    let (len, data) = decode::u32(data).ok_or_raise(|| message("eof reading chunk length").into())?;
14    let len = len as usize;
15
16    // NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop.
17    //       Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe
18    //       one day somebody will find out that it's worth it to use unsafe here.
19    let (mut bits, data) = data
20        .split_at_checked(len * std::mem::size_of::<u64>())
21        .ok_or_raise(|| message("eof while reading bit data").into())?;
22    let mut buf = std::vec::Vec::<u64>::with_capacity(len);
23    for _ in 0..len {
24        let (bit_num, rest) = bits.split_at(std::mem::size_of::<u64>());
25        bits = rest;
26        buf.push(u64::from_be_bytes(bit_num.try_into().unwrap()));
27    }
28
29    let (rlw, data) = decode::u32(data).ok_or_raise(|| message("eof while reading run length width").into())?;
30
31    Ok((
32        Vec {
33            num_bits,
34            bits: buf,
35            rlw: rlw.into(),
36        },
37        data,
38    ))
39}
40
41mod access {
42    use super::Vec;
43
44    impl Vec {
45        /// Create a bitmap from a sequence of bit values.
46        ///
47        /// The resulting bitmap uses a literal-only EWAH representation.
48        ///
49        /// Returns `None` if `bits.len()` exceeds `u32::MAX`.
50        pub fn from_bits(bits: &[bool]) -> Option<Self> {
51            let literal_words: std::vec::Vec<u64> = bits
52                .chunks(64)
53                .map(|chunk| {
54                    chunk.iter().enumerate().fold(
55                        0u64,
56                        |word, (idx, bit)| {
57                            if *bit { word | (1u64 << idx) } else { word }
58                        },
59                    )
60                })
61                .collect();
62            let num_bits = bits.len().try_into().ok()?;
63
64            Some(Vec {
65                num_bits,
66                bits: std::iter::once((literal_words.len() as u64) << (1 + RLW_RUNNING_BITS))
67                    .chain(literal_words)
68                    .collect(),
69                rlw: 0,
70            })
71        }
72
73        /// Write the bitmap as EWAH bytes to `out`.
74        ///
75        /// These bytes can be parsed again with [`decode()`](super::decode()).
76        pub fn write_to(&self, out: &mut impl std::io::Write) -> std::io::Result<()> {
77            let len: u32 = self.bits.len().try_into().map_err(|_| {
78                std::io::Error::new(std::io::ErrorKind::InvalidInput, "bit word count exceeds u32::MAX")
79            })?;
80            let rlw: u32 = self.rlw.try_into().map_err(|_| {
81                std::io::Error::new(
82                    std::io::ErrorKind::InvalidInput,
83                    "run length word offset exceeds u32::MAX",
84                )
85            })?;
86
87            out.write_all(&self.num_bits.to_be_bytes())?;
88            out.write_all(&len.to_be_bytes())?;
89            for word in &self.bits {
90                out.write_all(&word.to_be_bytes())?;
91            }
92            out.write_all(&rlw.to_be_bytes())
93        }
94
95        /// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array.
96        /// If `f` returns `None` the iteration will be stopped and `None` is returned.
97        ///
98        /// The index is sequential like in any other vector.
99        pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> {
100            let num_bits = self.num_bits();
101            let mut index = 0usize;
102            let mut iter = self.bits.iter();
103            while let Some(word) = iter.next() {
104                if rlw_runbit_is_set(word) {
105                    let len = usize::try_from(rlw_running_len_bits(word)).ok()?;
106                    let end = index.checked_add(len)?;
107                    if end > num_bits {
108                        return None;
109                    }
110                    for _ in 0..len {
111                        f(index)?;
112                        index += 1;
113                    }
114                } else {
115                    let len = usize::try_from(rlw_running_len_bits(word)).ok()?;
116                    let end = index.checked_add(len)?;
117                    if end > num_bits {
118                        return None;
119                    }
120                    index = end;
121                }
122
123                for _ in 0..rlw_literal_words(word) {
124                    let word = iter.next()?;
125                    let remaining = num_bits.checked_sub(index)?;
126                    if remaining == 0 {
127                        return None;
128                    }
129                    let bits_in_word = remaining.min(64);
130                    if bits_in_word < 64 && (word >> bits_in_word) != 0 {
131                        return None;
132                    }
133                    for bit_index in 0..bits_in_word {
134                        if word & (1 << bit_index) != 0 {
135                            f(index)?;
136                        }
137                        index += 1;
138                    }
139                }
140            }
141            Some(())
142        }
143
144        /// The amount of bits we are currently holding.
145        pub fn num_bits(&self) -> usize {
146            self.num_bits.try_into().expect("we are not on 16 bit systems")
147        }
148    }
149
150    #[inline]
151    fn rlw_running_len_bits(w: &u64) -> u64 {
152        rlw_running_len(w) * 64
153    }
154
155    #[inline]
156    fn rlw_running_len(w: &u64) -> u64 {
157        (w >> 1) & RLW_LARGEST_RUNNING_COUNT
158    }
159
160    #[inline]
161    fn rlw_literal_words(w: &u64) -> u64 {
162        w >> (1 + RLW_RUNNING_BITS)
163    }
164
165    #[inline]
166    fn rlw_runbit_is_set(w: &u64) -> bool {
167        w & 1 == 1
168    }
169
170    const RLW_RUNNING_BITS: u64 = 4 * 8;
171    const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1;
172}
173
174/// A growable collection of u64 that are seen as stream of individual bits.
175#[allow(dead_code)]
176#[derive(Clone)]
177pub struct Vec {
178    num_bits: u32,
179    bits: std::vec::Vec<u64>,
180    /// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially.
181    rlw: u64,
182}