musli_zerocopy/trie/factory.rs
1use core::cmp::Ordering;
2use core::marker::PhantomData;
3use core::mem::replace;
4
5#[cfg(feature = "alloc")]
6use alloc::vec::Vec;
7
8use crate::endian::Native;
9use crate::pointer::{Coerce, Pointee};
10use crate::slice::{BinarySearch, Slice};
11use crate::{Buf, ByteOrder, Error, OwnedBuf, Ref, Size, ZeroCopy};
12
13use super::{DefaultFlavor, Flavor, LinksRef, NodeRef, TrieRef, prefix};
14
15/// Store the given collection in a trie.
16///
17/// The trie is stored as a graph, where each level contains a sorted collection
18/// of strings. Each level is traversed using a binary search. Since levels are
19/// expected to be relatively small, this produces a decent time to complexity
20/// tradeoff.
21///
22/// Note that construction of the trie is the most performant if the input keys
23/// are already sorted. Otherwise trie construction might require many
24/// re-allocations.
25///
26/// # Examples
27///
28/// ```
29/// use musli_zerocopy::{trie, OwnedBuf};
30///
31/// let mut buf = OwnedBuf::new();
32///
33/// let values = [
34/// (buf.store_unsized("work"), 1),
35/// (buf.store_unsized("worker"), 2),
36/// (buf.store_unsized("workers"), 3),
37/// (buf.store_unsized("working"), 4),
38/// (buf.store_unsized("working"), 5),
39/// (buf.store_unsized("working man"), 6),
40/// ];
41///
42/// let trie = trie::store(&mut buf, values)?;
43///
44/// assert_eq!(trie.get(&buf, "aard")?, None);
45/// assert_eq!(trie.get(&buf, "worker")?, Some(&[2][..]));
46/// assert_eq!(trie.get(&buf, "working")?, Some(&[4, 5][..]));
47/// # Ok::<_, musli_zerocopy::Error>(())
48/// ```
49#[cfg(feature = "alloc")]
50pub fn store<S, E, O, I, T>(
51 buf: &mut OwnedBuf<E, O>,
52 it: I,
53) -> Result<TrieRef<T, DefaultFlavor<E, O>>, Error>
54where
55 I: IntoIterator<Item = (Ref<S, E, O>, T)>,
56 T: ZeroCopy,
57 S: ?Sized + Pointee + Coerce<[u8]>,
58 E: ByteOrder,
59 O: Size,
60{
61 // First step is to construct the trie in-memory.
62 let mut trie = Builder::with_flavor();
63
64 for (string, value) in it {
65 trie.insert(buf, string, value)?;
66 }
67
68 trie.build(buf)
69}
70
71/// An in-memory trie structure as it's being constructed.
72///
73/// This can be used over [`store()`] to provide more control.
74#[cfg(feature = "alloc")]
75pub struct Builder<T, F = DefaultFlavor>
76where
77 F: Flavor,
78{
79 links: Links<T>,
80 _marker: PhantomData<F>,
81}
82
83#[cfg(feature = "alloc")]
84impl<T> Builder<T> {
85 /// Construct a new empty trie builder with the default [`DefaultFlavor`].
86 #[inline]
87 pub const fn new() -> Self {
88 Self::with_flavor()
89 }
90}
91
92#[cfg(feature = "alloc")]
93impl<T> Default for Builder<T> {
94 #[inline]
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100#[cfg(feature = "alloc")]
101impl<T, F> Builder<T, F>
102where
103 F: Flavor,
104{
105 /// Construct a new empty trie builder with a custom [`Flavor`].
106 pub const fn with_flavor() -> Self {
107 Self {
108 links: Links::empty(),
109 _marker: PhantomData,
110 }
111 }
112
113 /// Insert a value into the trie.
114 ///
115 /// # Examples
116 ///
117 /// ```
118 /// use musli_zerocopy::{trie, OwnedBuf};
119 ///
120 /// let mut buf = OwnedBuf::new();
121 ///
122 /// let mut trie = trie::Builder::new();
123 ///
124 /// let key = buf.store_unsized("working");
125 /// trie.insert(&buf, key, 4)?;
126 /// let key = buf.store_unsized("working man");
127 /// trie.insert(&buf, key, 6)?;
128 /// let key = buf.store_unsized("work");
129 /// trie.insert(&buf, key, 1)?;
130 /// let key = buf.store_unsized("worker");
131 /// trie.insert(&buf, key, 2)?;
132 /// let key = buf.store_unsized("workers");
133 /// trie.insert(&buf, key, 3)?;
134 /// let key = buf.store_unsized("working");
135 /// trie.insert(&buf, key, 5)?;
136 ///
137 /// let trie = trie.build(&mut buf)?;
138 ///
139 /// assert_eq!(trie.get(&buf, "aard")?, None);
140 /// assert_eq!(trie.get(&buf, "worker")?, Some(&[2][..]));
141 /// assert_eq!(trie.get(&buf, "working")?, Some(&[4, 5][..]));
142 /// # Ok::<_, musli_zerocopy::Error>(())
143 /// ```
144 pub fn insert<S, E, O>(
145 &mut self,
146 buf: &Buf,
147 string: Ref<S, E, O>,
148 value: T,
149 ) -> Result<(), Error>
150 where
151 S: ?Sized + Pointee + Coerce<[u8]>,
152 E: ByteOrder,
153 O: Size,
154 {
155 let mut string = string.coerce::<[u8]>();
156 let mut current = buf.load(string)?;
157 let mut this = &mut self.links;
158
159 loop {
160 let search = try_binary_search_by(&this.children, |c| {
161 Ok::<_, Error>(buf.load(c.string)?.cmp(current))
162 })?;
163
164 match search {
165 BinarySearch::Found(n) => {
166 this.children[n].links.values.push(value);
167 return Ok(());
168 }
169 BinarySearch::Missing(0) => {
170 this.children.insert(
171 0,
172 Node {
173 string: Ref::try_with_metadata(string.offset(), string.len())?,
174 links: Links::new(value),
175 },
176 );
177 return Ok(());
178 }
179 BinarySearch::Missing(n) => {
180 let pre = n - 1;
181
182 // Find common prefix and split nodes if necessary.
183 let prefix = prefix(buf.load(this.children[pre].string)?, current);
184
185 // No common prefix in prior node, so a new node is needed.
186 if prefix == 0 {
187 this.children.insert(
188 n,
189 Node {
190 string: Ref::try_with_metadata(string.offset(), string.len())?,
191 links: Links::new(value),
192 },
193 );
194 return Ok(());
195 }
196
197 let child = &mut this.children[pre];
198
199 // This happens if `current` contains a shorter subset match
200 // than the string represented by `child`, like `work` and
201 // the child string is `working` (common prefix is `work`).
202 //
203 // In that scenario, the child node has to be split up, so we transpose it from:
204 //
205 // ```
206 // "working" => { values = [1, 2, 3] }
207 // =>
208 // "work" => { "ing" => { values = [1, 2, 3] } }
209 // ```
210 if prefix != child.string.len() {
211 let (prefix, suffix) = child.string.split_at(prefix);
212 let new_node = Node::new(prefix);
213 let mut replaced = replace(child, new_node);
214 replaced.string = suffix;
215 child.links.children.push(replaced);
216 }
217
218 current = ¤t[prefix..];
219 string = string.split_at(prefix).1;
220 this = &mut child.links;
221 }
222 }
223 }
224 }
225
226 /// Construct a [`TrieRef`] out of the current [`Builder`].
227 ///
228 /// # Errors
229 ///
230 /// Trie construction will error in case an interior node overflows its
231 /// representation as per its [`Flavor`] defined by `F`.
232 ///
233 /// # Examples
234 ///
235 /// ```
236 /// use musli_zerocopy::{trie, OwnedBuf};
237 ///
238 /// let mut buf = OwnedBuf::new();
239 ///
240 /// let mut trie = trie::Builder::new();
241 ///
242 /// let key = buf.store_unsized("work");
243 /// trie.insert(&buf, key, 1)?;
244 /// let key = buf.store_unsized("working");
245 /// trie.insert(&buf, key, 4)?;
246 ///
247 /// let trie = trie.build(&mut buf)?;
248 ///
249 /// assert_eq!(trie.get(&buf, "aard")?, None);
250 /// assert_eq!(trie.get(&buf, "work")?, Some(&[1][..]));
251 /// assert_eq!(trie.get(&buf, "working")?, Some(&[4][..]));
252 /// # Ok::<_, musli_zerocopy::Error>(())
253 /// ```
254 pub fn build<E, O>(self, buf: &mut OwnedBuf<E, O>) -> Result<TrieRef<T, F>, Error>
255 where
256 T: ZeroCopy,
257 E: ByteOrder,
258 O: Size,
259 {
260 Ok(TrieRef {
261 links: self.links.into_ref(buf)?,
262 })
263 }
264}
265
266#[cfg(feature = "alloc")]
267struct Links<T> {
268 values: Vec<T>,
269 children: Vec<Node<T>>,
270}
271
272#[cfg(feature = "alloc")]
273impl<T> Links<T> {
274 const fn empty() -> Self {
275 Self {
276 values: Vec::new(),
277 children: Vec::new(),
278 }
279 }
280
281 fn new(value: T) -> Self {
282 Self {
283 values: alloc::vec![value],
284 children: Vec::new(),
285 }
286 }
287
288 fn into_ref<E, O, F>(self, buf: &mut OwnedBuf<E, O>) -> Result<LinksRef<T, F>, Error>
289 where
290 T: ZeroCopy,
291 E: ByteOrder,
292 O: Size,
293 F: Flavor,
294 {
295 let values = F::Values::try_from_ref(buf.store_slice(&self.values))?;
296
297 let mut children = Vec::with_capacity(self.children.len());
298
299 for node in self.children {
300 children.push(node.into_ref(buf)?);
301 }
302
303 let children = F::Children::try_from_ref(buf.store_slice(&children))?;
304 Ok(LinksRef { values, children })
305 }
306}
307
308#[cfg(feature = "alloc")]
309struct Node<T> {
310 string: Ref<[u8], Native, usize>,
311 links: Links<T>,
312}
313
314#[cfg(feature = "alloc")]
315impl<T> Node<T> {
316 const fn new(string: Ref<[u8], Native, usize>) -> Self {
317 Self {
318 string,
319 links: Links::empty(),
320 }
321 }
322
323 fn into_ref<E, O, F>(self, buf: &mut OwnedBuf<E, O>) -> Result<NodeRef<T, F>, Error>
324 where
325 T: ZeroCopy,
326 E: ByteOrder,
327 O: Size,
328 F: Flavor,
329 {
330 Ok(NodeRef {
331 string: F::String::try_from_ref(self.string)?,
332 links: self.links.into_ref(buf)?,
333 })
334 }
335}
336
337/// Helper function to perform a binary search over a loaded slice.
338fn try_binary_search_by<T, F, E>(slice: &[T], mut f: F) -> Result<BinarySearch, E>
339where
340 F: FnMut(&T) -> Result<Ordering, E>,
341{
342 // INVARIANTS:
343 // - 0 <= left <= left + size = right <= slice.len()
344 // - f returns Less for everything in slice[..left]
345 // - f returns Greater for everything in slice[right..]
346 let mut size = slice.len();
347 let mut left = 0;
348 let mut right = size;
349
350 while left < right {
351 let mid = left + size / 2;
352
353 // SAFETY: The while condition means `size` is strictly positive, so
354 // `size/2 < size`. Thus `left + size/2 < left + size`, which coupled
355 // with the `left + size <= slice.len()` invariant means we have `left +
356 // size/2 < slice.len()`, and this is in-bounds.
357 let value = unsafe { slice.get_unchecked(mid) };
358 let cmp = f(value)?;
359
360 // The reason why we use if/else control flow rather than match
361 // is because match reorders comparison operations, which is perf sensitive.
362 // This is x86 asm for u8: https://rust.godbolt.org/z/8Y8Pra.
363 if cmp == Ordering::Less {
364 left = mid + 1;
365 } else if cmp == Ordering::Greater {
366 right = mid;
367 } else {
368 return Ok(BinarySearch::Found(mid));
369 }
370
371 size = right - left;
372 }
373
374 Ok(BinarySearch::Missing(left))
375}