roaring/bitmap/
serialization.rs

1use crate::bitmap::container::{Container, ARRAY_LIMIT};
2use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
3use crate::RoaringBitmap;
4use bytemuck::cast_slice_mut;
5use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
6use core::convert::Infallible;
7use core::ops::RangeInclusive;
8use std::error::Error;
9use std::io;
10
11pub(crate) const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
12pub(crate) const SERIAL_COOKIE: u16 = 12347;
13pub(crate) const NO_OFFSET_THRESHOLD: usize = 4;
14
15// Sizes of header structures
16pub(crate) const DESCRIPTION_BYTES: usize = 4;
17pub(crate) const OFFSET_BYTES: usize = 4;
18
19impl RoaringBitmap {
20    /// Return the size in bytes of the serialized output.
21    /// This is compatible with the official C/C++, Java and Go implementations.
22    ///
23    /// # Examples
24    ///
25    /// ```rust
26    /// use roaring::RoaringBitmap;
27    ///
28    /// let rb1: RoaringBitmap = (1..4).collect();
29    /// let mut bytes = Vec::with_capacity(rb1.serialized_size());
30    /// rb1.serialize_into(&mut bytes).unwrap();
31    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
32    ///
33    /// assert_eq!(rb1, rb2);
34    /// ```
35    pub fn serialized_size(&self) -> usize {
36        let container_sizes: usize = self
37            .containers
38            .iter()
39            .map(|container| match container.store {
40                Store::Array(ref values) => 8 + values.len() as usize * 2,
41                Store::Bitmap(..) => 8 + 8 * 1024,
42            })
43            .sum();
44
45        // header + container sizes
46        8 + container_sizes
47    }
48
49    /// Serialize this bitmap into [the standard Roaring on-disk format][format].
50    /// This is compatible with the official C/C++, Java and Go implementations.
51    ///
52    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
53    ///
54    /// # Examples
55    ///
56    /// ```rust
57    /// use roaring::RoaringBitmap;
58    ///
59    /// let rb1: RoaringBitmap = (1..4).collect();
60    /// let mut bytes = vec![];
61    /// rb1.serialize_into(&mut bytes).unwrap();
62    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
63    ///
64    /// assert_eq!(rb1, rb2);
65    /// ```
66    pub fn serialize_into<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
67        writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
68        writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
69
70        for container in &self.containers {
71            writer.write_u16::<LittleEndian>(container.key)?;
72            writer.write_u16::<LittleEndian>((container.len() - 1) as u16)?;
73        }
74
75        let mut offset = 8 + 8 * self.containers.len() as u32;
76        for container in &self.containers {
77            writer.write_u32::<LittleEndian>(offset)?;
78            match container.store {
79                Store::Array(ref values) => {
80                    offset += values.len() as u32 * 2;
81                }
82                Store::Bitmap(..) => {
83                    offset += 8 * 1024;
84                }
85            }
86        }
87
88        for container in &self.containers {
89            match container.store {
90                Store::Array(ref values) => {
91                    for &value in values.iter() {
92                        writer.write_u16::<LittleEndian>(value)?;
93                    }
94                }
95                Store::Bitmap(ref bits) => {
96                    for &value in bits.as_array() {
97                        writer.write_u64::<LittleEndian>(value)?;
98                    }
99                }
100            }
101        }
102
103        Ok(())
104    }
105
106    /// Deserialize a bitmap into memory from [the standard Roaring on-disk
107    /// format][format]. This is compatible with the official C/C++, Java and
108    /// Go implementations. This method checks that all of the internal values
109    /// are valid. If deserializing from a trusted source consider
110    /// [RoaringBitmap::deserialize_unchecked_from]
111    ///
112    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
113    ///
114    /// # Examples
115    ///
116    /// ```rust
117    /// use roaring::RoaringBitmap;
118    ///
119    /// let rb1: RoaringBitmap = (1..4).collect();
120    /// let mut bytes = vec![];
121    /// rb1.serialize_into(&mut bytes).unwrap();
122    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
123    ///
124    /// assert_eq!(rb1, rb2);
125    /// ```
126    pub fn deserialize_from<R: io::Read>(reader: R) -> io::Result<RoaringBitmap> {
127        RoaringBitmap::deserialize_from_impl(reader, ArrayStore::try_from, BitmapStore::try_from)
128    }
129
130    /// Deserialize a bitmap into memory from [the standard Roaring on-disk
131    /// format][format]. This is compatible with the official C/C++, Java and
132    /// Go implementations. This method is memory safe but will not check if
133    /// the data is a valid bitmap.
134    ///
135    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
136    ///
137    /// # Examples
138    ///
139    /// ```rust
140    /// use roaring::RoaringBitmap;
141    ///
142    /// let rb1: RoaringBitmap = (1..4).collect();
143    /// let mut bytes = vec![];
144    /// rb1.serialize_into(&mut bytes).unwrap();
145    /// let rb2 = RoaringBitmap::deserialize_unchecked_from(&bytes[..]).unwrap();
146    ///
147    /// assert_eq!(rb1, rb2);
148    /// ```
149    pub fn deserialize_unchecked_from<R: io::Read>(reader: R) -> io::Result<RoaringBitmap> {
150        RoaringBitmap::deserialize_from_impl::<R, _, Infallible, _, Infallible>(
151            reader,
152            |values| Ok(ArrayStore::from_vec_unchecked(values)),
153            |len, values| Ok(BitmapStore::from_unchecked(len, values)),
154        )
155    }
156
157    fn deserialize_from_impl<R, A, AErr, B, BErr>(
158        mut reader: R,
159        a: A,
160        b: B,
161    ) -> io::Result<RoaringBitmap>
162    where
163        R: io::Read,
164        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
165        AErr: Error + Send + Sync + 'static,
166        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
167        BErr: Error + Send + Sync + 'static,
168    {
169        // First read the cookie to determine which version of the format we are reading
170        let (size, has_offsets, has_run_containers) = {
171            let cookie = reader.read_u32::<LittleEndian>()?;
172            if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
173                (reader.read_u32::<LittleEndian>()? as usize, true, false)
174            } else if (cookie as u16) == SERIAL_COOKIE {
175                let size = ((cookie >> 16) + 1) as usize;
176                (size, size >= NO_OFFSET_THRESHOLD, true)
177            } else {
178                return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
179            }
180        };
181
182        // Read the run container bitmap if necessary
183        let run_container_bitmap = if has_run_containers {
184            let mut bitmap = vec![0u8; (size + 7) / 8];
185            reader.read_exact(&mut bitmap)?;
186            Some(bitmap)
187        } else {
188            None
189        };
190
191        if size > u16::MAX as usize + 1 {
192            return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
193        }
194
195        // Read the container descriptions
196        let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
197        reader.read_exact(&mut description_bytes)?;
198        let mut description_bytes = &description_bytes[..];
199
200        if has_offsets {
201            let mut offsets = vec![0u8; size * OFFSET_BYTES];
202            reader.read_exact(&mut offsets)?;
203            drop(offsets); // Not useful when deserializing into memory
204        }
205
206        let mut containers = Vec::with_capacity(size);
207
208        // Read each container
209        for i in 0..size {
210            let key = description_bytes.read_u16::<LittleEndian>()?;
211            let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
212
213            // If the run container bitmap is present, check if this container is a run container
214            let is_run_container =
215                run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0);
216
217            let store = if is_run_container {
218                let runs = reader.read_u16::<LittleEndian>()?;
219                let mut intervals = vec![[0, 0]; runs as usize];
220                reader.read_exact(cast_slice_mut(&mut intervals))?;
221                intervals.iter_mut().for_each(|[s, len]| {
222                    *s = u16::from_le(*s);
223                    *len = u16::from_le(*len);
224                });
225
226                let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
227                let mut store = Store::with_capacity(cardinality);
228                intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
229                    let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
230                    store.insert_range(RangeInclusive::new(s, end));
231                    Ok(())
232                })?;
233                store
234            } else if cardinality <= ARRAY_LIMIT {
235                let mut values = vec![0; cardinality as usize];
236                reader.read_exact(cast_slice_mut(&mut values))?;
237                values.iter_mut().for_each(|n| *n = u16::from_le(*n));
238                let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
239                Store::Array(array)
240            } else {
241                let mut values = Box::new([0; BITMAP_LENGTH]);
242                reader.read_exact(cast_slice_mut(&mut values[..]))?;
243                values.iter_mut().for_each(|n| *n = u64::from_le(*n));
244                let bitmap = b(cardinality, values)
245                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
246                Store::Bitmap(bitmap)
247            };
248
249            containers.push(Container { key, store });
250        }
251
252        Ok(RoaringBitmap { containers })
253    }
254}
255
256#[cfg(test)]
257mod test {
258    use crate::{bitmap::store::BITMAP_LENGTH, RoaringBitmap};
259    use proptest::prelude::*;
260
261    proptest! {
262        #[test]
263        fn test_serialization(
264            bitmap in RoaringBitmap::arbitrary(),
265        ) {
266            let mut buffer = Vec::new();
267            bitmap.serialize_into(&mut buffer).unwrap();
268            prop_assert_eq!(bitmap, RoaringBitmap::deserialize_from(buffer.as_slice()).unwrap());
269        }
270    }
271
272    #[test]
273    fn test_from_lsb0_bytes() {
274        const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
275        const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;
276        let mut bytes = vec![0xff; CONTAINER_OFFSET_IN_BYTES as usize];
277        bytes.extend([0x00; CONTAINER_OFFSET_IN_BYTES as usize]);
278        bytes.extend([0b00000001, 0b00000010, 0b00000011, 0b00000100]);
279
280        let offset = 32;
281        let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);
282        for i in 0..offset {
283            assert!(!rb.contains(i), "{i} should not be in the bitmap");
284        }
285        for i in offset..offset + CONTAINER_OFFSET {
286            assert!(rb.contains(i), "{i} should be in the bitmap");
287        }
288        for i in offset + CONTAINER_OFFSET..offset + CONTAINER_OFFSET * 2 {
289            assert!(!rb.contains(i), "{i} should not be in the bitmap");
290        }
291        for bit in [0, 9, 16, 17, 26] {
292            let i = bit + offset + CONTAINER_OFFSET * 2;
293            assert!(rb.contains(i), "{i} should be in the bitmap");
294        }
295
296        assert_eq!(rb.len(), CONTAINER_OFFSET as u64 + 5);
297
298        // Ensure the empty container is not created
299        let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize];
300        bytes.extend([0xff]);
301        let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes);
302        assert_eq!(rb.min(), Some(CONTAINER_OFFSET));
303
304        let rb = RoaringBitmap::from_lsb0_bytes(8, &bytes);
305        assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8));
306
307        // Ensure we can set the last byte in an array container
308        let bytes = [0x80];
309        let rb = RoaringBitmap::from_lsb0_bytes(0xFFFFFFF8, &bytes);
310        assert_eq!(rb.len(), 1);
311        assert!(rb.contains(u32::MAX));
312
313        // Ensure we can set the last byte in a bitmap container
314        let bytes = vec![0xFF; 0x1_0000 / 8];
315        let rb = RoaringBitmap::from_lsb0_bytes(0xFFFF0000, &bytes);
316        assert_eq!(rb.len(), 0x1_0000);
317        assert!(rb.contains(u32::MAX));
318    }
319
320    #[test]
321    fn test_from_lsb0_bytes_not_multiple_of_8() {
322        const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
323        const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;
324
325        let mut bytes = vec![0b0101_1001];
326        bytes.extend([0x00; CONTAINER_OFFSET_IN_BYTES as usize]);
327        bytes.extend([0b00000001, 0b00000010, 0b00000011, 0b00000100]);
328
329        let mut indices = vec![0, 3, 4, 6];
330        indices.extend([0, 9, 16, 17, 26].map(|i| 8 + CONTAINER_OFFSET + i));
331
332        for offset in 0..8 {
333            let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);
334            for i in indices.iter().map(|&i| i + offset) {
335                assert!(rb.contains(i), "{i} should be in the bitmap");
336            }
337        }
338    }
339
340    #[test]
341    #[should_panic(expected = "<= 2^32")]
342    fn test_from_lsb0_bytes_overflow() {
343        let bytes = [0x01, 0x01];
344        RoaringBitmap::from_lsb0_bytes(u32::MAX - 7, &bytes);
345    }
346
347    #[test]
348    fn test_deserialize_overflow_s_plus_len() {
349        let data = vec![59, 48, 0, 0, 255, 130, 254, 59, 48, 2, 0, 41, 255, 255, 166, 197, 4, 0, 2];
350        let res = RoaringBitmap::deserialize_from(data.as_slice());
351        assert!(res.is_err());
352    }
353}