// splinter_rs/splinter_ref.rs

use std::{fmt::Debug, ops::Deref};

use bytes::Bytes;
use culprit::Culprit;
use zerocopy::FromBytes;

use crate::{
    Splinter,
    codec::{
        DecodeErr, Encodable,
        encoder::Encoder,
        footer::{Footer, SPLINTER_V2_MAGIC},
        partition_ref::PartitionRef,
    },
    level::High,
    traits::PartitionRead,
};

/// A zero-copy reference to serialized splinter data.
///
/// `SplinterRef` allows efficient querying of compressed bitmap data without
/// deserializing the underlying structure. It wraps any type that can be
/// dereferenced to `[u8]` and provides all the same read operations as
/// [`Splinter`], but with minimal memory overhead and no allocation during
/// queries.
///
/// This is the preferred type for read-only operations on serialized splinter
/// data, especially when the data comes from files, the network, or other
/// external sources.
///
/// # Type Parameter
///
/// - `B`: Any type that implements `Deref<Target = [u8]>`, such as `&[u8]`,
///   `Vec<u8>`, `Bytes`, `Arc<[u8]>`, etc.
///
/// # Examples
///
/// Creating from serialized bytes:
///
/// ```
/// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, PartitionRead, Encodable};
///
/// // Create and populate a splinter
/// let mut splinter = Splinter::EMPTY;
/// splinter.insert(100);
/// splinter.insert(200);
///
/// // Serialize it to bytes
/// let bytes = splinter.encode_to_bytes();
///
/// // Create a zero-copy reference
/// let splinter_ref = SplinterRef::from_bytes(bytes).unwrap();
/// assert_eq!(splinter_ref.cardinality(), 2);
/// assert!(splinter_ref.contains(100));
/// ```
///
/// Working with different buffer types:
///
/// ```
/// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, PartitionRead, Encodable};
/// use std::sync::Arc;
///
/// let mut splinter = Splinter::EMPTY;
/// splinter.insert(42);
///
/// let bytes = splinter.encode_to_bytes();
/// let shared_bytes: Arc<[u8]> = bytes.to_vec().into();
///
/// let splinter_ref = SplinterRef::from_bytes(shared_bytes).unwrap();
/// assert!(splinter_ref.contains(42));
/// ```
#[derive(Clone)]
pub struct SplinterRef<B> {
    pub(crate) data: B,
}

impl<B: Deref<Target = [u8]>> Debug for SplinterRef<B> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_tuple("SplinterRef")
            .field(&self.load_unchecked())
            .finish()
    }
}

impl<B> SplinterRef<B> {
    /// Returns a reference to the underlying data buffer.
    ///
    /// This provides access to the raw bytes that store the serialized splinter
    /// data.
    ///
    /// # Examples
    ///
    /// ```
    /// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, Encodable};
    ///
    /// let mut splinter = Splinter::EMPTY;
    /// splinter.insert(42);
    /// let bytes = splinter.encode_to_bytes();
    /// let splinter_ref = SplinterRef::from_bytes(bytes).unwrap();
    ///
    /// let inner_bytes = splinter_ref.inner();
    /// assert!(!inner_bytes.is_empty());
    /// ```
    #[inline]
    pub fn inner(&self) -> &B {
        &self.data
    }

    /// Consumes the `SplinterRef` and returns the underlying data buffer.
    ///
    /// This is useful when you need to take ownership of the underlying data
    /// after you're done querying the splinter.
    ///
    /// # Examples
    ///
    /// ```
    /// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, Encodable};
    ///
    /// let mut splinter = Splinter::EMPTY;
    /// splinter.insert(42);
    /// let bytes = splinter.encode_to_bytes();
    /// let splinter_ref = SplinterRef::from_bytes(bytes.clone()).unwrap();
    ///
    /// let recovered_bytes = splinter_ref.into_inner();
    /// assert_eq!(recovered_bytes, bytes);
    /// ```
    #[inline]
    pub fn into_inner(self) -> B {
        self.data
    }
}

impl SplinterRef<Bytes> {
    /// Returns a clone of the underlying bytes.
    ///
    /// This is efficient for `Bytes` since it uses reference counting
    /// internally and doesn't actually copy the data.
    ///
    /// # Examples
    ///
    /// ```
    /// use splinter_rs::{Splinter, PartitionWrite};
    ///
    /// let mut splinter = Splinter::EMPTY;
    /// splinter.insert(42);
    /// let splinter_ref = splinter.encode_to_splinter_ref();
    ///
    /// let bytes_copy = splinter_ref.encode_to_bytes();
    /// assert!(!bytes_copy.is_empty());
    /// ```
    #[inline]
    pub fn encode_to_bytes(&self) -> Bytes {
        self.data.clone()
    }
}

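// Encoding a `SplinterRef` is a byte-for-byte copy of the already-serialized
// buffer (footer included), so `encoded_size` is simply the buffer length and
// `encode` performs no re-encoding work.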
impl<B: Deref<Target = [u8]>> Encodable for SplinterRef<B> {
    #[inline]
    fn encoded_size(&self) -> usize {
        self.data.len()
    }

    #[inline]
    fn encode<T: bytes::BufMut>(&self, encoder: &mut Encoder<T>) {
        encoder.write_splinter(&self.data);
    }
}

impl<B: Deref<Target = [u8]>> SplinterRef<B> {
    /// Converts this reference back to an owned [`Splinter`].
    ///
    /// This method deserializes the underlying data and creates a new owned
    /// `Splinter` that supports mutation. This involves iterating through all
    /// values and rebuilding the data structure, so it has a cost proportional
    /// to the number of elements.
    ///
    /// # Examples
    ///
    /// ```
    /// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, PartitionRead};
    ///
    /// let mut original = Splinter::EMPTY;
    /// original.insert(100);
    /// original.insert(200);
    ///
    /// let splinter_ref = original.encode_to_splinter_ref();
    /// let decoded = splinter_ref.decode_to_splinter();
    ///
    /// assert_eq!(decoded.cardinality(), 2);
    /// assert!(decoded.contains(100));
    /// assert!(decoded.contains(200));
    /// ```
    pub fn decode_to_splinter(&self) -> Splinter {
        Splinter::new((&self.load_unchecked()).into())
    }

    /// Creates a `SplinterRef` from raw bytes, validating the format.
    ///
    /// This method parses and validates the serialized splinter format, checking:
    /// - Sufficient data length
    /// - Valid magic bytes
    /// - Correct checksum
    ///
    /// IMPORTANT: This method *does not* recursively verify the entire
    /// splinter, opting instead to rely on the checksum to detect any
    /// corruption. Do not use `Splinter` with untrusted data, as it is
    /// trivial to construct a splinter that will cause your program to panic
    /// at runtime.
    ///
    /// Returns an error if the data is corrupted or in an invalid format.
    ///
    /// # Errors
    ///
    /// - [`DecodeErr::Length`]: Not enough bytes in the buffer
    /// - [`DecodeErr::Magic`]: Invalid magic bytes
    /// - [`DecodeErr::Checksum`]: Data corruption detected
    /// - [`DecodeErr::Validity`]: Invalid internal structure
    /// - [`DecodeErr::SplinterV1`]: Data is in the incompatible v1 format
    ///
    /// # Examples
    ///
    /// ```
    /// use splinter_rs::{Splinter, SplinterRef, PartitionWrite, PartitionRead, Encodable};
    ///
    /// let mut splinter = Splinter::EMPTY;
    /// splinter.insert(42);
    /// let bytes = splinter.encode_to_bytes();
    ///
    /// let splinter_ref = SplinterRef::from_bytes(bytes).unwrap();
    /// assert!(splinter_ref.contains(42));
    /// ```
    ///
    /// Error handling:
    ///
    /// ```
    /// use splinter_rs::{SplinterRef, codec::DecodeErr};
    ///
    /// let invalid_bytes = vec![0u8; 5]; // Too short
    /// let result = SplinterRef::from_bytes(invalid_bytes);
    /// assert!(matches!(result.unwrap_err().ctx(), DecodeErr::Length));
    /// ```
    pub fn from_bytes(data: B) -> culprit::Result<Self, DecodeErr> {
        // The v1 format begins with this magic; v2 stores its magic at the
        // end of the buffer, inside the footer.
        const SPLINTER_V1_MAGIC: [u8; 4] = [0xDA, 0xAE, 0x12, 0xDF];
        // Reject v1 data up front, unless the buffer also ends with the v2
        // magic (a v2 splinter whose payload happens to start with the v1
        // magic bytes).
        if data.len() >= 4
            && data.starts_with(&SPLINTER_V1_MAGIC)
            && !data.ends_with(&SPLINTER_V2_MAGIC)
        {
            return Err(Culprit::new(DecodeErr::SplinterV1));
        }

        if data.len() < Footer::SIZE {
            return Err(Culprit::new(DecodeErr::Length));
        }
        // Split off the footer, validate it against the partition data
        // (magic + checksum), then check that the top-level partition parses.
        let (partitions, footer) = data.split_at(data.len() - Footer::SIZE);
        Footer::ref_from_bytes(footer)?.validate(partitions)?;
        PartitionRef::<High>::from_suffix(partitions)?;
        Ok(Self { data })
    }

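    /// Parses the top-level [`PartitionRef`] directly from the buffer,
    /// skipping validation; the `unwrap` is justified because
    /// [`Self::from_bytes`] already verified that the partition parses.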
    pub(crate) fn load_unchecked(&self) -> PartitionRef<'_, High> {
        let without_footer = &self.data[..(self.data.len() - Footer::SIZE)];
        PartitionRef::from_suffix(without_footer).unwrap()
    }
}

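// Each read operation re-derives a zero-copy `PartitionRef` view over the
// buffer via `load_unchecked` and delegates to it, so queries never allocate.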
impl<B: Deref<Target = [u8]>> PartitionRead<High> for SplinterRef<B> {
    fn cardinality(&self) -> usize {
        self.load_unchecked().cardinality()
    }

    fn is_empty(&self) -> bool {
        self.load_unchecked().is_empty()
    }

    fn contains(&self, value: u32) -> bool {
        self.load_unchecked().contains(value)
    }

    fn rank(&self, value: u32) -> usize {
        self.load_unchecked().rank(value)
    }

    fn select(&self, idx: usize) -> Option<u32> {
        self.load_unchecked().select(idx)
    }

    fn last(&self) -> Option<u32> {
        self.load_unchecked().last()
    }

    fn iter(&self) -> impl Iterator<Item = u32> {
        self.load_unchecked().into_iter()
    }
}

#[cfg(test)]
mod test {
    use proptest::{collection::vec, prop_assume, proptest};

    use crate::{
        Optimizable, PartitionRead, Splinter,
        testutil::{SetGen, mksplinter},
    };

    #[test]
    fn test_empty() {
        let splinter = mksplinter(&[]).encode_to_splinter_ref();

        assert_eq!(splinter.decode_to_splinter(), Splinter::EMPTY);
        assert!(!splinter.contains(0));
        assert_eq!(splinter.cardinality(), 0);
        assert_eq!(splinter.last(), None);
    }

    /// This is a regression test for a bug in the `SplinterRef` encoding. The
    /// bug was that we used little-endian encoded values to store unaligned
    /// values, and little-endian bytes do not sort in the numeric order we
    /// expect.
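    ///
    /// For example, `256u32.to_le_bytes()` is `[0, 1, 0, 0]` while
    /// `1u32.to_le_bytes()` is `[1, 0, 0, 0]`, so a lexicographic byte
    /// comparison orders 256 before 1.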
    #[test]
    fn test_contains_bug() {
        let mut set_gen = SetGen::new(0xDEAD_BEEF);
        let set = set_gen.random(1024);
        let lookup = set[set.len() / 3];
        let splinter = mksplinter(&set).encode_to_splinter_ref();
        assert!(splinter.contains(lookup))
    }

    proptest! {
        #[test]
        fn test_splinter_ref_proptest(set in vec(0u32..16384, 0..1024)) {
            let splinter = mksplinter(&set).encode_to_splinter_ref();
            if set.is_empty() {
                assert!(!splinter.contains(123))
            } else {
                let lookup = set[set.len() / 3];
                assert!(splinter.contains(lookup))
            }
        }

        #[test]
        fn test_splinter_opt_ref_proptest(set in vec(0u32..16384, 0..1024)) {
            let mut splinter = mksplinter(&set);
            splinter.optimize();
            let splinter = splinter.encode_to_splinter_ref();
            if set.is_empty() {
                assert!(!splinter.contains(123))
            } else {
                let lookup = set[set.len() / 3];
                assert!(splinter.contains(lookup))
            }
        }

        #[test]
        fn test_splinter_ref_eq_proptest(set in vec(0u32..16384, 0..1024)) {
            let ref1 = mksplinter(&set).encode_to_splinter_ref();
            let ref2 = mksplinter(&set).encode_to_splinter_ref();
            assert_eq!(ref1, ref2)
        }

        #[test]
        fn test_splinter_opt_ref_eq_proptest(set in vec(0u32..16384, 0..1024)) {
            let mut ref1 = mksplinter(&set);
            ref1.optimize();
            let ref1 = ref1.encode_to_splinter_ref();
            let ref2 = mksplinter(&set).encode_to_splinter_ref();
            assert_eq!(ref1, ref2)
        }

        #[test]
        fn test_splinter_ref_ne_proptest(
            set1 in vec(0u32..16384, 0..1024),
            set2 in vec(0u32..16384, 0..1024),
        ) {
            prop_assume!(set1 != set2);

            let ref1 = mksplinter(&set1).encode_to_splinter_ref();
            let ref2 = mksplinter(&set2).encode_to_splinter_ref();
            assert_ne!(ref1, ref2)
        }

        #[test]
        fn test_splinter_opt_ref_ne_proptest(
            set1 in vec(0u32..16384, 0..1024),
            set2 in vec(0u32..16384, 0..1024),
        ) {
            prop_assume!(set1 != set2);

            let mut ref1 = mksplinter(&set1);
            ref1.optimize();
            let ref1 = ref1.encode_to_splinter_ref();
            let ref2 = mksplinter(&set2).encode_to_splinter_ref();
            assert_ne!(ref1, ref2)
        }
    }

    #[test]
    fn test_ref_wat() {
        #[rustfmt::skip]
        let set = [ 6400, 11776, 768, 15872, 6912, 0, 11008, 769, 770, 11009, 4608, 771, 0, 768, 6401, 0, 8192, 8192, 4609, 772, 4610, 0, 0, 0, 0, 0, 768, 773, 774, 14336, 0, 0, 0, 15872, 11010, 775, 0, 768, 11777, 776, 0, 0, 0, 6400, 14337, 8193, 0, 0, 0, 0, 0, 0, 0, ];
        let mut ref1 = mksplinter(&set);
        ref1.optimize();
        let ref1 = ref1.encode_to_splinter_ref();
        let ref2 = mksplinter(&set).encode_to_splinter_ref();
        assert_eq!(ref1, ref2)
    }
}