bin_pool/
lib.rs

1//! `bin-pool` is a small crate for interning binary slices.
2//! A type called [`BinPool`] is provided which represents a collection of binary slices;
3//! however, the content is highly optimized for space by allowing slices to overlap in memory.
4//! The data is backed by a smaller subset of "large" slices that have been added previously.
5//! 
6//! Note that interning is not a cheap operation, and in the worst case every backing slice must be examined when adding a new slice.
7//! Interning is only recommended when memory is very limited, or for storage optimization in areas where time is not critical (e.g., compiler output).
8//! 
9//! # Example
10//! ```
11//! # use bin_pool::*;
12//! let mut b = BinPool::new();
13//! 
14//! b.add(b"hello world".as_slice());  // add first slice - backed by itself
15//! b.add(b"hello".as_slice());        // add another slice - backed by first
16//! b.add(b"world".as_slice());        // add another slice - backed by first
17//! assert_eq!(b.bytes(), 21);         // 21 bytes stored
18//! assert_eq!(b.backing_bytes(), 11); // but only 11 bytes needed to represent them
19//! 
20//! b.add(b"hello world!".as_slice()); // add another slice - becomes the backer for others
21//! assert_eq!(b.bytes(), 33);         // now 33 bytes stored
22//! assert_eq!(b.backing_bytes(), 12); // but only 12 bytes to represent them
23//! ```
24//! 
25//! # `no_std`
26//! 
27//! `bin-pool` supports building in `no_std` environments by disabling default features.
28//! Note that the `alloc` crate is required.
29//! 
30//! ```toml
31//! [dependencies]
32//! bin-pool = { version = "...", default-features = false }
33//! ```
34
35#![no_std]
36#![forbid(unsafe_code)]
37
38#[cfg_attr(test, macro_use)]
39extern crate alloc;
40
41use alloc::borrow::Cow;
42use alloc::vec::Vec;
43
44use core::iter::FusedIterator;
45use core::slice;
46
47fn find_subregion(sup: &[u8], sub: &[u8]) -> Option<usize> {
48    // [T]::windows panics if size is empty, but we know all slices start with empty slice
49    if sub.is_empty() { return Some(0) }
50
51    for (i, w) in sup.windows(sub.len()).enumerate() {
52        if w == sub { return Some(i) }
53    }
54    None
55}
56#[test]
57fn test_find_subregion() {
58    assert_eq!(find_subregion(&[1, 2, 3], &[1]), Some(0));
59    assert_eq!(find_subregion(&[1, 2, 3], &[2]), Some(1));
60    assert_eq!(find_subregion(&[1, 2, 3], &[3]), Some(2));
61    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[2, 3]), Some(1));
62    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[2, 3, 3]), Some(1));
63    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[2, 3, 4]), Some(4));
64    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[2, 3, 4, 5]), None);
65    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[1, 2, 3, 3, 2, 3, 4]), Some(0));
66    assert_eq!(find_subregion(&[1, 2, 3, 3, 2, 3, 4], &[1, 2, 3, 3, 2, 3, 4, 1]), None);
67}
68
69/// Information about the location of a [`BinPool`] slice in the backing data.
70#[derive(Clone, Copy, Debug)]
71pub struct SliceInfo {
72    /// Index of the backing slice.
73    pub src: usize,
74    /// Starting position in the backing slice.
75    pub start: usize,
76    /// Length of the slice.
77    pub length: usize,
78}
79
80/// An append-only pool of auto-overlapping binary slices.
81/// 
82/// In use, you add binary slices one at a time and it will attempt to create maximum overlapping.
83/// Currently, it only guarantees that supersets and subsets will overlap.
84/// It's theoretically possible to do cross-slice overlapping, but this would be complex and even more expensive.
85/// 
86/// Slices added to the pool are guaranteed to remain in insertion order.
87/// Moreover, the collection is append-only, so once a slice is added it will remain at the same index for the lifetime of the [`BinPool`].
88#[derive(Default, Clone, Debug)]
89pub struct BinPool {
90    data: Vec<Vec<u8>>,     // the backing data
91    slices: Vec<SliceInfo>, // effectively slices into top
92}
93impl BinPool {
94    /// Constructs an empty pool.
95    pub fn new() -> Self {
96        Default::default()
97    }
98
99    fn add_internal(&mut self, value: Cow<[u8]>) -> usize {
100        // if an equivalent slice already exists, just refer to that
101        for (i, other) in self.iter().enumerate() {
102            if other == &*value { return i; }
103        }
104
105        let ret = self.slices.len(); // eventual return value
106
107        // look for any data entry that value is a subregion of - if we find one we can use that as data source
108        for (i, top) in self.data.iter().enumerate() {
109            if let Some(start) = find_subregion(top, &value) {
110                self.slices.push(SliceInfo { src: i, start, length: value.len() });
111                return ret;
112            }
113        }
114
115        // if that didn't work, look for any data entry that is a subregion of value (i.e. containment the other way)
116        for i in 0..self.data.len() {
117            // if we found one, we can replace it with value
118            if let Some(start) = find_subregion(&value, &self.data[i]) {
119                // replace it with value and update the starting position of any slices that referenced it
120                self.data[i] = value.into_owned();
121                for slice in self.slices.iter_mut() {
122                    if slice.src == i { slice.start += start; }
123                }
124
125                // now we need to look through the data entries again and see if any of them are contained in value (the new, larger data entry)
126                // we stopped on first that was a subset of value, so no need to tests 0..=i
127                let mut j = i + 1;
128                while j < self.data.len() {
129                    // if data entry j is contained in value (entry i), we can remove j
130                    if let Some(start) = find_subregion(&self.data[i], &self.data[j]) {
131                        // get rid of j, redundant with i - use swap remove for efficiency
132                        self.data.swap_remove(j);
133
134                        // update all the slices to reflect the change
135                        for slice in self.slices.iter_mut() {
136                            // if it referenced the deleted entry (j), repoint it to value (i) and apply the start offset
137                            if slice.src == j {
138                                slice.src = i;
139                                slice.start += start;
140                            }
141                            // if it referenced the moved entry (the one we used for swap remove), repoint it to j
142                            else if slice.src == self.data.len() {
143                                slice.src = j;
144                            }
145                        }
146                    }
147                    else { j += 1; } // only increment j if we didn't remove j
148                }
149
150                // and finally, add the slice info
151                self.slices.push(SliceInfo { src: i, start: 0, length: self.data[i].len() });
152                return ret;
153            }
154        }
155
156        // if that also didn't work then we just have to add value as a new data entry
157        let length = value.len();
158        self.data.push(value.into_owned());
159        self.slices.push(SliceInfo { src: self.data.len() - 1, start: 0, length });
160        ret
161    }
162
163    /// Adds the specified slice to the pool.
164    /// If an equivalent slice already exists, does nothing and returns the index of the pre-existing slice;
165    /// otherwise, adds `value` as a new slice and returns its (new) slice index.
166    /// 
167    /// If you are working with strings, you may find [`str::as_bytes`] and [`String::into_bytes`] useful.
168    pub fn add<'a, T>(&mut self, value: T) -> usize where T: Into<Cow<'a, [u8]>> {
169        self.add_internal(value.into())
170    }
171
172    /// Removes all content from the pool.
173    /// This is the only non-append-only operation, and is just meant to support resource reuse.
174    pub fn clear(&mut self) {
175        self.data.clear();
176        self.slices.clear();
177    }
178    /// Gets the number of (distinct) slices contained in the pool.
179    pub fn len(&self) -> usize {
180        self.slices.len()
181    }
182    /// Checks if the pool is empty.
183    pub fn is_empty(&self) -> bool {
184        self.slices.is_empty()
185    }
186
187    /// Iterates over all (distinct) slices contained in the pool in insertion order.
188    pub fn iter(&self) -> Iter {
189        Iter { data: &self.data, iter: self.slices.iter() }
190    }
191    /// Gets the slice at the specified index, or [`None`] if `index` is not a valid slice index returned by [`BinPool::add`].
192    pub fn get(&self, index: usize) -> Option<&[u8]> {
193        self.slices.get(index).map(|s| &self.data[s.src][s.start..s.start+s.length])
194    }
195
196    /// Gets the total number of bytes from (distinct) slices that were added to the pool.
197    /// Note that the space needed to represent these slices may be significantly smaller (see [`BinPool::backing_bytes`]).
198    pub fn bytes(&self) -> usize {
199        self.slices.iter().fold(0, |v, s| v + s.length)
200    }
201    /// Gets the total number of bytes backing the stored slices.
202    pub fn backing_bytes(&self) -> usize {
203        self.data.iter().fold(0, |v, s| v + s.len())
204    }
205
206    /// Gets a reference to the backing data.
207    pub fn as_backing(&self) -> (&Vec<Vec<u8>>, &Vec<SliceInfo>) {
208        (&self.data, &self.slices)
209    }
210    /// Gets the backing data.
211    pub fn into_backing(self) -> (Vec<Vec<u8>>, Vec<SliceInfo>) {
212        (self.data, self.slices)
213    }
214}
215
216/// Iterates over the (distinct) slices of a [`BinPool`] in insertion order.
217pub struct Iter<'a> {
218    data: &'a [Vec<u8>],
219    iter: slice::Iter<'a, SliceInfo>,
220}
221impl<'a> Iterator for Iter<'a> {
222    type Item = &'a [u8];
223    fn next(&mut self) -> Option<Self::Item> {
224        self.iter.next().map(|s| &self.data[s.src][s.start..s.start+s.length])
225    }
226}
227impl<'a> FusedIterator for Iter<'a> {}
228
229#[test]
230fn test_binary_pool() {
231    let mut s = BinPool::new();
232    let checked_add = |s: &mut BinPool, v: Vec<u8>| {
233        let res = s.add(&v);
234        assert_eq!(s.get(res).unwrap(), &v);
235        res
236    };
237
238    assert_eq!(s.len(), 0);
239    assert!(s.is_empty());
240    assert_eq!(s.iter().count(), 0);
241    assert_eq!(s.bytes(), 0);
242    assert_eq!(s.backing_bytes(), 0);
243
244    assert_eq!(checked_add(&mut s, vec![1, 2, 3]), 0);
245    assert_eq!(s.len(), 1);
246    assert!(!s.is_empty());
247    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref()]);
248    assert_eq!(s.data, &[[1, 2, 3].as_ref()]);
249    assert_eq!(s.bytes(), 3);
250    assert_eq!(s.backing_bytes(), 3);
251
252    assert_eq!(checked_add(&mut s, vec![2, 3]), 1);
253    assert_eq!(s.len(), 2);
254    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref()]);
255    assert_eq!(s.data, &[[1, 2, 3].as_ref()]);
256    assert_eq!(s.bytes(), 5);
257    assert_eq!(s.backing_bytes(), 3);
258
259    assert_eq!(checked_add(&mut s, vec![2, 3]), 1);
260    assert_eq!(s.len(), 2);
261    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref()]);
262    assert_eq!(s.data, &[[1, 2, 3].as_ref()]);
263    assert_eq!(s.bytes(), 5);
264    assert_eq!(s.backing_bytes(), 3);
265
266    assert_eq!(checked_add(&mut s, vec![1, 2, 3]), 0);
267    assert_eq!(s.len(), 2);
268    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref()]);
269    assert_eq!(s.data, &[[1, 2, 3].as_ref()]);
270    assert_eq!(s.bytes(), 5);
271    assert_eq!(s.backing_bytes(), 3);
272
273    assert_eq!(checked_add(&mut s, vec![2, 3, 4, 5]), 2);
274    assert_eq!(s.len(), 3);
275    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref()]);
276    assert_eq!(s.data, &[[1, 2, 3].as_ref(), [2, 3, 4, 5].as_ref()]);
277    assert_eq!(s.bytes(), 9);
278    assert_eq!(s.backing_bytes(), 7);
279
280    assert_eq!(checked_add(&mut s, vec![2, 3, 4, 5]), 2);
281    assert_eq!(s.len(), 3);
282    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref()]);
283    assert_eq!(s.data, &[[1, 2, 3].as_ref(), [2, 3, 4, 5].as_ref()]);
284    assert_eq!(s.bytes(), 9);
285    assert_eq!(s.backing_bytes(), 7);
286
287    assert_eq!(checked_add(&mut s, vec![1, 2, 3, 4]), 3);
288    assert_eq!(s.len(), 4);
289    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref(), [1, 2, 3, 4].as_ref()]);
290    assert_eq!(s.data, &[[1, 2, 3, 4].as_ref(), [2, 3, 4, 5].as_ref()]);
291    assert_eq!(s.bytes(), 13);
292    assert_eq!(s.backing_bytes(), 8);
293
294    {
295        let mut s = s.clone();
296        assert_eq!(checked_add(&mut s, vec![255, 69, 1, 2, 3, 4, 5, 0, 0, 10, 20]), 4);
297        assert_eq!(s.len(), 5);
298        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref(), [1, 2, 3, 4].as_ref(),
299            [255, 69, 1, 2, 3, 4, 5, 0, 0, 10, 20].as_ref()]);
300        assert_eq!(s.data, &[[255, 69, 1, 2, 3, 4, 5, 0, 0, 10, 20].as_ref()]);
301        assert_eq!(s.bytes(), 24);
302        assert_eq!(s.backing_bytes(), 11);
303    }
304    {
305        let mut s = s.clone();
306        assert_eq!(checked_add(&mut s, vec![2, 3, 4, 5, 0, 0, 10, 20]), 4);
307        assert_eq!(s.len(), 5);
308        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref(), [1, 2, 3, 4].as_ref(),
309            [2, 3, 4, 5, 0, 0, 10, 20].as_ref()]);
310            assert_eq!(s.data, &[[1, 2, 3, 4].as_ref(), [2, 3, 4, 5, 0, 0, 10, 20].as_ref()]);
311            assert_eq!(s.bytes(), 21);
312            assert_eq!(s.backing_bytes(), 12);
313    }
314    {
315        let mut s = s.clone();
316        assert_eq!(checked_add(&mut s, vec![255, 69, 1, 2, 3, 4]), 4);
317        assert_eq!(s.len(), 5);
318        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[1, 2, 3].as_ref(), [2, 3].as_ref(), [2, 3, 4, 5].as_ref(), [1, 2, 3, 4].as_ref(),
319            [255, 69, 1, 2, 3, 4].as_ref()]);
320        assert_eq!(s.data, &[[255, 69, 1, 2, 3, 4].as_ref(), [2, 3, 4, 5].as_ref()]);
321        assert_eq!(s.bytes(), 19);
322        assert_eq!(s.backing_bytes(), 10);
323    }
324
325    s.clear();
326    assert_eq!(s.len(), 0);
327    assert!(s.is_empty());
328    assert_eq!(s.iter().count(), 0);
329    assert_eq!(s.bytes(), 0);
330    assert_eq!(s.backing_bytes(), 0);
331
332    assert_eq!(checked_add(&mut s, vec![6, 6, 6]), 0);
333    assert_eq!(s.len(), 1);
334    assert!(!s.is_empty());
335    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[6, 6, 6].as_ref()]);
336    assert_eq!(s.data, &[[6, 6, 6].as_ref()]);
337    assert_eq!(s.bytes(), 3);
338    assert_eq!(s.backing_bytes(), 3);
339
340    assert_eq!(checked_add(&mut s, vec![2, 3, 4]), 1);
341    assert_eq!(s.len(), 2);
342    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[6, 6, 6].as_ref(), [2, 3, 4].as_ref()]);
343    assert_eq!(s.data, &[[6, 6, 6].as_ref(), [2, 3, 4].as_ref()]);
344    assert_eq!(s.bytes(), 6);
345    assert_eq!(s.backing_bytes(), 6);
346
347    assert_eq!(checked_add(&mut s, vec![2, 3]), 2);
348    assert_eq!(s.len(), 3);
349    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[6, 6, 6].as_ref(), [2, 3, 4].as_ref(), [2, 3].as_ref()]);
350    assert_eq!(s.data, &[[6, 6, 6].as_ref(), [2, 3, 4].as_ref()]);
351    assert_eq!(s.bytes(), 8);
352    assert_eq!(s.backing_bytes(), 6);
353
354    assert_eq!(checked_add(&mut s, vec![1, 2, 3, 6, 6, 6]), 3);
355    assert_eq!(s.len(), 4);
356    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[6, 6, 6].as_ref(), [2, 3, 4].as_ref(), [2, 3].as_ref(), [1, 2, 3, 6, 6, 6].as_ref()]);
357    assert_eq!(s.data, &[[1, 2, 3, 6, 6, 6].as_ref(), [2, 3, 4].as_ref()]);
358    assert_eq!(s.bytes(), 14);
359    assert_eq!(s.backing_bytes(), 9);
360
361    assert_eq!(checked_add(&mut s, vec![2, 3]), 2);
362    assert_eq!(s.len(), 4);
363    assert_eq!(s.iter().collect::<Vec<_>>(), vec![[6, 6, 6].as_ref(), [2, 3, 4].as_ref(), [2, 3].as_ref(), [1, 2, 3, 6, 6, 6].as_ref()]);
364    assert_eq!(s.data, &[[1, 2, 3, 6, 6, 6].as_ref(), [2, 3, 4].as_ref()]);
365    assert_eq!(s.bytes(), 14);
366    assert_eq!(s.backing_bytes(), 9);
367
368    {
369        let mut s = BinPool::new();
370        assert_eq!(checked_add(&mut s, vec![0]), 0);
371        assert_eq!(checked_add(&mut s, vec![1]), 1);
372        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[0].as_ref(), [1].as_ref()]);
373        assert_eq!(s.data, vec![[0].as_ref(), [1].as_ref()]);
374        assert_eq!(checked_add(&mut s, vec![0, 1]), 2);
375        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[0].as_ref(), [1].as_ref(), [0, 1].as_ref()]);
376        assert_eq!(s.data, vec![[0, 1].as_ref()]);
377    }
378    {
379        let mut s = BinPool::new();
380        assert_eq!(checked_add(&mut s, vec![0]), 0);
381        assert_eq!(checked_add(&mut s, vec![1]), 1);
382        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[0].as_ref(), [1].as_ref()]);
383        assert_eq!(s.data, vec![[0].as_ref(), [1].as_ref()]);
384        assert_eq!(checked_add(&mut s, vec![1, 0]), 2);
385        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[0].as_ref(), [1].as_ref(), [1, 0].as_ref()]);
386        assert_eq!(s.data, vec![[1, 0].as_ref()]);
387    }
388    {
389        let mut s = BinPool::new();
390        assert_eq!(checked_add(&mut s, vec![]), 0);
391        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[].as_ref()]);
392        assert_eq!(s.data, vec![[].as_ref()]);
393        assert_eq!(checked_add(&mut s, vec![5]), 1);
394        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[].as_ref(), [5].as_ref()]);
395        assert_eq!(s.data, vec![[5].as_ref()]);
396    }
397    {
398        let mut s = BinPool::new();
399        assert_eq!(checked_add(&mut s, vec![7]), 0);
400        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[7].as_ref()]);
401        assert_eq!(s.data, vec![[7].as_ref()]);
402        assert_eq!(checked_add(&mut s, vec![]), 1);
403        assert_eq!(s.iter().collect::<Vec<_>>(), vec![[7].as_ref(), [].as_ref()]);
404        assert_eq!(s.data, vec![[7].as_ref()]);
405    }
406}