musli_zerocopy/trie/
factory.rs

1use core::cmp::Ordering;
2use core::marker::PhantomData;
3use core::mem::replace;
4
5#[cfg(feature = "alloc")]
6use alloc::vec::Vec;
7
8use crate::endian::Native;
9use crate::pointer::{Coerce, Pointee};
10use crate::slice::{BinarySearch, Slice};
11use crate::{Buf, ByteOrder, Error, OwnedBuf, Ref, Size, ZeroCopy};
12
13use super::{DefaultFlavor, Flavor, LinksRef, NodeRef, TrieRef, prefix};
14
15/// Store the given collection in a trie.
16///
17/// The trie is stored as a graph, where each level contains a sorted collection
18/// of strings. Each level is traversed using a binary search. Since levels are
19/// expected to be relatively small, this produces a decent time to complexity
20/// tradeoff.
21///
22/// Note that construction of the trie is the most performant if the input keys
23/// are already sorted. Otherwise trie construction might require many
24/// re-allocations.
25///
26/// # Examples
27///
28/// ```
29/// use musli_zerocopy::{trie, OwnedBuf};
30///
31/// let mut buf = OwnedBuf::new();
32///
33/// let values = [
34///     (buf.store_unsized("work"), 1),
35///     (buf.store_unsized("worker"), 2),
36///     (buf.store_unsized("workers"), 3),
37///     (buf.store_unsized("working"), 4),
38///     (buf.store_unsized("working"), 5),
39///     (buf.store_unsized("working man"), 6),
40/// ];
41///
42/// let trie = trie::store(&mut buf, values)?;
43///
44/// assert_eq!(trie.get(&buf, "aard")?, None);
45/// assert_eq!(trie.get(&buf, "worker")?, Some(&[2][..]));
46/// assert_eq!(trie.get(&buf, "working")?, Some(&[4, 5][..]));
47/// # Ok::<_, musli_zerocopy::Error>(())
48/// ```
49#[cfg(feature = "alloc")]
50pub fn store<S, E, O, I, T>(
51    buf: &mut OwnedBuf<E, O>,
52    it: I,
53) -> Result<TrieRef<T, DefaultFlavor<E, O>>, Error>
54where
55    I: IntoIterator<Item = (Ref<S, E, O>, T)>,
56    T: ZeroCopy,
57    S: ?Sized + Pointee + Coerce<[u8]>,
58    E: ByteOrder,
59    O: Size,
60{
61    // First step is to construct the trie in-memory.
62    let mut trie = Builder::with_flavor();
63
64    for (string, value) in it {
65        trie.insert(buf, string, value)?;
66    }
67
68    trie.build(buf)
69}
70
71/// An in-memory trie structure as it's being constructed.
72///
73/// This can be used over [`store()`] to provide more control.
74#[cfg(feature = "alloc")]
75pub struct Builder<T, F = DefaultFlavor>
76where
77    F: Flavor,
78{
79    links: Links<T>,
80    _marker: PhantomData<F>,
81}
82
83#[cfg(feature = "alloc")]
84impl<T> Builder<T> {
85    /// Construct a new empty trie builder with the default [`DefaultFlavor`].
86    #[inline]
87    pub const fn new() -> Self {
88        Self::with_flavor()
89    }
90}
91
92#[cfg(feature = "alloc")]
93impl<T> Default for Builder<T> {
94    #[inline]
95    fn default() -> Self {
96        Self::new()
97    }
98}
99
100#[cfg(feature = "alloc")]
101impl<T, F> Builder<T, F>
102where
103    F: Flavor,
104{
105    /// Construct a new empty trie builder with a custom [`Flavor`].
106    pub const fn with_flavor() -> Self {
107        Self {
108            links: Links::empty(),
109            _marker: PhantomData,
110        }
111    }
112
113    /// Insert a value into the trie.
114    ///
115    /// # Examples
116    ///
117    /// ```
118    /// use musli_zerocopy::{trie, OwnedBuf};
119    ///
120    /// let mut buf = OwnedBuf::new();
121    ///
122    /// let mut trie = trie::Builder::new();
123    ///
124    /// let key = buf.store_unsized("working");
125    /// trie.insert(&buf, key, 4)?;
126    /// let key = buf.store_unsized("working man");
127    /// trie.insert(&buf, key, 6)?;
128    /// let key = buf.store_unsized("work");
129    /// trie.insert(&buf, key, 1)?;
130    /// let key = buf.store_unsized("worker");
131    /// trie.insert(&buf, key, 2)?;
132    /// let key = buf.store_unsized("workers");
133    /// trie.insert(&buf, key, 3)?;
134    /// let key = buf.store_unsized("working");
135    /// trie.insert(&buf, key, 5)?;
136    ///
137    /// let trie = trie.build(&mut buf)?;
138    ///
139    /// assert_eq!(trie.get(&buf, "aard")?, None);
140    /// assert_eq!(trie.get(&buf, "worker")?, Some(&[2][..]));
141    /// assert_eq!(trie.get(&buf, "working")?, Some(&[4, 5][..]));
142    /// # Ok::<_, musli_zerocopy::Error>(())
143    /// ```
144    pub fn insert<S, E, O>(
145        &mut self,
146        buf: &Buf,
147        string: Ref<S, E, O>,
148        value: T,
149    ) -> Result<(), Error>
150    where
151        S: ?Sized + Pointee + Coerce<[u8]>,
152        E: ByteOrder,
153        O: Size,
154    {
155        let mut string = string.coerce::<[u8]>();
156        let mut current = buf.load(string)?;
157        let mut this = &mut self.links;
158
159        loop {
160            let search = try_binary_search_by(&this.children, |c| {
161                Ok::<_, Error>(buf.load(c.string)?.cmp(current))
162            })?;
163
164            match search {
165                BinarySearch::Found(n) => {
166                    this.children[n].links.values.push(value);
167                    return Ok(());
168                }
169                BinarySearch::Missing(0) => {
170                    this.children.insert(
171                        0,
172                        Node {
173                            string: Ref::try_with_metadata(string.offset(), string.len())?,
174                            links: Links::new(value),
175                        },
176                    );
177                    return Ok(());
178                }
179                BinarySearch::Missing(n) => {
180                    let pre = n - 1;
181
182                    // Find common prefix and split nodes if necessary.
183                    let prefix = prefix(buf.load(this.children[pre].string)?, current);
184
185                    // No common prefix in prior node, so a new node is needed.
186                    if prefix == 0 {
187                        this.children.insert(
188                            n,
189                            Node {
190                                string: Ref::try_with_metadata(string.offset(), string.len())?,
191                                links: Links::new(value),
192                            },
193                        );
194                        return Ok(());
195                    }
196
197                    let child = &mut this.children[pre];
198
199                    // This happens if `current` contains a shorter subset match
200                    // than the string represented by `child`, like `work` and
201                    // the child string is `working` (common prefix is `work`).
202                    //
203                    // In that scenario, the child node has to be split up, so we transpose it from:
204                    //
205                    // ```
206                    // "working" => { values = [1, 2, 3] }
207                    // =>
208                    // "work" => { "ing" => { values = [1, 2, 3] } }
209                    // ```
210                    if prefix != child.string.len() {
211                        let (prefix, suffix) = child.string.split_at(prefix);
212                        let new_node = Node::new(prefix);
213                        let mut replaced = replace(child, new_node);
214                        replaced.string = suffix;
215                        child.links.children.push(replaced);
216                    }
217
218                    current = &current[prefix..];
219                    string = string.split_at(prefix).1;
220                    this = &mut child.links;
221                }
222            }
223        }
224    }
225
226    /// Construct a [`TrieRef`] out of the current [`Builder`].
227    ///
228    /// # Errors
229    ///
230    /// Trie construction will error in case an interior node overflows its
231    /// representation as per its [`Flavor`] defined by `F`.
232    ///
233    /// # Examples
234    ///
235    /// ```
236    /// use musli_zerocopy::{trie, OwnedBuf};
237    ///
238    /// let mut buf = OwnedBuf::new();
239    ///
240    /// let mut trie = trie::Builder::new();
241    ///
242    /// let key = buf.store_unsized("work");
243    /// trie.insert(&buf, key, 1)?;
244    /// let key = buf.store_unsized("working");
245    /// trie.insert(&buf, key, 4)?;
246    ///
247    /// let trie = trie.build(&mut buf)?;
248    ///
249    /// assert_eq!(trie.get(&buf, "aard")?, None);
250    /// assert_eq!(trie.get(&buf, "work")?, Some(&[1][..]));
251    /// assert_eq!(trie.get(&buf, "working")?, Some(&[4][..]));
252    /// # Ok::<_, musli_zerocopy::Error>(())
253    /// ```
254    pub fn build<E, O>(self, buf: &mut OwnedBuf<E, O>) -> Result<TrieRef<T, F>, Error>
255    where
256        T: ZeroCopy,
257        E: ByteOrder,
258        O: Size,
259    {
260        Ok(TrieRef {
261            links: self.links.into_ref(buf)?,
262        })
263    }
264}
265
266#[cfg(feature = "alloc")]
267struct Links<T> {
268    values: Vec<T>,
269    children: Vec<Node<T>>,
270}
271
272#[cfg(feature = "alloc")]
273impl<T> Links<T> {
274    const fn empty() -> Self {
275        Self {
276            values: Vec::new(),
277            children: Vec::new(),
278        }
279    }
280
281    fn new(value: T) -> Self {
282        Self {
283            values: alloc::vec![value],
284            children: Vec::new(),
285        }
286    }
287
288    fn into_ref<E, O, F>(self, buf: &mut OwnedBuf<E, O>) -> Result<LinksRef<T, F>, Error>
289    where
290        T: ZeroCopy,
291        E: ByteOrder,
292        O: Size,
293        F: Flavor,
294    {
295        let values = F::Values::try_from_ref(buf.store_slice(&self.values))?;
296
297        let mut children = Vec::with_capacity(self.children.len());
298
299        for node in self.children {
300            children.push(node.into_ref(buf)?);
301        }
302
303        let children = F::Children::try_from_ref(buf.store_slice(&children))?;
304        Ok(LinksRef { values, children })
305    }
306}
307
308#[cfg(feature = "alloc")]
309struct Node<T> {
310    string: Ref<[u8], Native, usize>,
311    links: Links<T>,
312}
313
314#[cfg(feature = "alloc")]
315impl<T> Node<T> {
316    const fn new(string: Ref<[u8], Native, usize>) -> Self {
317        Self {
318            string,
319            links: Links::empty(),
320        }
321    }
322
323    fn into_ref<E, O, F>(self, buf: &mut OwnedBuf<E, O>) -> Result<NodeRef<T, F>, Error>
324    where
325        T: ZeroCopy,
326        E: ByteOrder,
327        O: Size,
328        F: Flavor,
329    {
330        Ok(NodeRef {
331            string: F::String::try_from_ref(self.string)?,
332            links: self.links.into_ref(buf)?,
333        })
334    }
335}
336
337/// Helper function to perform a binary search over a loaded slice.
338fn try_binary_search_by<T, F, E>(slice: &[T], mut f: F) -> Result<BinarySearch, E>
339where
340    F: FnMut(&T) -> Result<Ordering, E>,
341{
342    // INVARIANTS:
343    // - 0 <= left <= left + size = right <= slice.len()
344    // - f returns Less for everything in slice[..left]
345    // - f returns Greater for everything in slice[right..]
346    let mut size = slice.len();
347    let mut left = 0;
348    let mut right = size;
349
350    while left < right {
351        let mid = left + size / 2;
352
353        // SAFETY: The while condition means `size` is strictly positive, so
354        // `size/2 < size`. Thus `left + size/2 < left + size`, which coupled
355        // with the `left + size <= slice.len()` invariant means we have `left +
356        // size/2 < slice.len()`, and this is in-bounds.
357        let value = unsafe { slice.get_unchecked(mid) };
358        let cmp = f(value)?;
359
360        // The reason why we use if/else control flow rather than match
361        // is because match reorders comparison operations, which is perf sensitive.
362        // This is x86 asm for u8: https://rust.godbolt.org/z/8Y8Pra.
363        if cmp == Ordering::Less {
364            left = mid + 1;
365        } else if cmp == Ordering::Greater {
366            right = mid;
367        } else {
368            return Ok(BinarySearch::Found(mid));
369        }
370
371        size = right - left;
372    }
373
374    Ok(BinarySearch::Missing(left))
375}