string_interner/backend/bucket/
mod.rs

1#![cfg(feature = "backends")]
2
3mod fixed_str;
4mod interned_str;
5
6use self::{fixed_str::FixedString, interned_str::InternedStr};
7use super::Backend;
8use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol};
9use alloc::{string::String, vec::Vec};
10use core::{iter::Enumerate, marker::PhantomData, slice};
11
12/// An interner backend that reduces memory allocations by using string buckets.
13///
14/// # Note
15///
16/// Implementation inspired by matklad's blog post that can be found here:
17/// <https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html>
18///
19/// # Usage Hint
20///
21/// Use when deallocations or copy overhead is costly or when
22/// interning of static strings is especially common.
23///
24/// # Usage
25///
26/// - **Fill:** Efficiency of filling an empty string interner.
27/// - **Resolve:** Efficiency of interned string look-up given a symbol.
28/// - **Allocations:** The number of allocations performed by the backend.
29/// - **Footprint:** The total heap memory consumed by the backend.
30/// - **Contiguous:** True if the returned symbols have contiguous values.
31/// - **Iteration:** Efficiency of iterating over the interned strings.
32///
33/// Rating varies between **bad**, **ok**, **good** and **best**.
34///
35/// | Scenario    |  Rating  |
36/// |:------------|:--------:|
37/// | Fill        | **good** |
38/// | Resolve     | **best**   |
39/// | Allocations | **good** |
40/// | Footprint   | **ok**   |
41/// | Supports `get_or_intern_static` | **yes** |
42/// | `Send` + `Sync` | **yes** |
43/// | Contiguous  | **yes**  |
44/// | Iteration   | **best** |
45#[derive(Debug)]
46pub struct BucketBackend<S = DefaultSymbol> {
47    spans: Vec<InternedStr>,
48    head: FixedString,
49    full: Vec<String>,
50    marker: PhantomData<fn() -> S>,
51}
52
53/// # Safety
54///
55/// The bucket backend requires a manual [`Send`] impl because it is self
56/// referential. When cloning a bucket backend a deep clone is performed and
57/// all references to itself are updated for the clone.
58unsafe impl<S> Send for BucketBackend<S> where S: Symbol {}
59
60/// # Safety
61///
62/// The bucket backend requires a manual [`Send`] impl because it is self
63/// referential. Those references won't escape its own scope and also
64/// the bucket backend has no interior mutability.
65unsafe impl<S> Sync for BucketBackend<S> where S: Symbol {}
66
67impl<S> Default for BucketBackend<S> {
68    #[cfg_attr(feature = "inline-more", inline)]
69    fn default() -> Self {
70        Self {
71            spans: Vec::new(),
72            head: FixedString::default(),
73            full: Vec::new(),
74            marker: Default::default(),
75        }
76    }
77}
78
79impl<S> Backend for BucketBackend<S>
80where
81    S: Symbol,
82{
83    type Symbol = S;
84    type Iter<'a>
85        = Iter<'a, S>
86    where
87        Self: 'a;
88
89    #[cfg_attr(feature = "inline-more", inline)]
90    fn with_capacity(cap: usize) -> Self {
91        Self {
92            spans: Vec::with_capacity(cap),
93            head: FixedString::with_capacity(cap),
94            full: Vec::new(),
95            marker: Default::default(),
96        }
97    }
98
99    #[inline]
100    fn intern(&mut self, string: &str) -> Self::Symbol {
101        // SAFETY: This is safe because we never hand out the returned
102        //         interned string instance to the outside and only operate
103        //         on it within this backend.
104        let interned = unsafe { self.alloc(string) };
105        self.push_span(interned)
106    }
107
108    #[cfg_attr(feature = "inline-more", inline)]
109    fn intern_static(&mut self, string: &'static str) -> Self::Symbol {
110        let interned = InternedStr::new(string);
111        self.push_span(interned)
112    }
113
114    fn shrink_to_fit(&mut self) {
115        self.spans.shrink_to_fit();
116        // Commenting out the below line fixes: https://github.com/Robbepop/string-interner/issues/46
117        // self.head.shrink_to_fit();
118        self.full.shrink_to_fit();
119    }
120
121    #[inline]
122    fn resolve(&self, symbol: Self::Symbol) -> Option<&str> {
123        self.spans.get(symbol.to_usize()).map(InternedStr::as_str)
124    }
125
126    #[inline]
127    unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str {
128        // SAFETY: The function is marked unsafe so that the caller guarantees
129        //         that required invariants are checked.
130        unsafe { self.spans.get_unchecked(symbol.to_usize()).as_str() }
131    }
132
133    #[inline]
134    fn iter(&self) -> Self::Iter<'_> {
135        Iter::new(self)
136    }
137}
138
139impl<S> BucketBackend<S>
140where
141    S: Symbol,
142{
143    /// Returns the next available symbol.
144    fn next_symbol(&self) -> S {
145        expect_valid_symbol(self.spans.len())
146    }
147
148    /// Pushes the given interned string into the spans and returns its symbol.
149    fn push_span(&mut self, interned: InternedStr) -> S {
150        let symbol = self.next_symbol();
151        self.spans.push(interned);
152        symbol
153    }
154
155    /// Interns a new string into the backend and returns a reference to it.
156    unsafe fn alloc(&mut self, string: &str) -> InternedStr {
157        let cap = self.head.capacity();
158        if cap < self.head.len() + string.len() {
159            let new_cap = (usize::max(cap, string.len()) + 1).next_power_of_two();
160            let new_head = FixedString::with_capacity(new_cap);
161            let old_head = core::mem::replace(&mut self.head, new_head);
162            self.full.push(old_head.finish());
163        }
164        self.head
165            .push_str(string)
166            .expect("encountered invalid head capacity (2)")
167    }
168}
169
170impl<S> Clone for BucketBackend<S> {
171    fn clone(&self) -> Self {
172        // For performance reasons we copy all cloned strings into a single cloned
173        // head string leaving the cloned `full` empty.
174        let new_head_cap =
175            self.head.capacity() + self.full.iter().fold(0, |lhs, rhs| lhs + rhs.len());
176        let mut head = FixedString::with_capacity(new_head_cap);
177        let mut spans = Vec::with_capacity(self.spans.len());
178        for span in &self.spans {
179            let string = span.as_str();
180            let interned = head
181                .push_str(string)
182                .expect("encountered invalid head capacity");
183            spans.push(interned);
184        }
185        Self {
186            spans,
187            head,
188            full: Vec::new(),
189            marker: Default::default(),
190        }
191    }
192}
193
194impl<S> Eq for BucketBackend<S> where S: Symbol {}
195
196impl<S> PartialEq for BucketBackend<S>
197where
198    S: Symbol,
199{
200    #[cfg_attr(feature = "inline-more", inline)]
201    fn eq(&self, other: &Self) -> bool {
202        self.spans == other.spans
203    }
204}
205
206impl<'a, S> IntoIterator for &'a BucketBackend<S>
207where
208    S: Symbol,
209{
210    type Item = (S, &'a str);
211    type IntoIter = Iter<'a, S>;
212
213    #[cfg_attr(feature = "inline-more", inline)]
214    fn into_iter(self) -> Self::IntoIter {
215        self.iter()
216    }
217}
218
219pub struct Iter<'a, S> {
220    iter: Enumerate<slice::Iter<'a, InternedStr>>,
221    symbol_marker: PhantomData<fn() -> S>,
222}
223
224impl<'a, S> Iter<'a, S> {
225    #[cfg_attr(feature = "inline-more", inline)]
226    pub fn new(backend: &'a BucketBackend<S>) -> Self {
227        Self {
228            iter: backend.spans.iter().enumerate(),
229            symbol_marker: Default::default(),
230        }
231    }
232}
233
234impl<'a, S> Iterator for Iter<'a, S>
235where
236    S: Symbol,
237{
238    type Item = (S, &'a str);
239
240    #[inline]
241    fn size_hint(&self) -> (usize, Option<usize>) {
242        self.iter.size_hint()
243    }
244
245    #[inline]
246    fn next(&mut self) -> Option<Self::Item> {
247        self.iter
248            .next()
249            .map(|(id, interned)| (expect_valid_symbol(id), interned.as_str()))
250    }
251}