string_interner/
interner.rs

1use crate::{backend::Backend, Symbol};
2use core::{
3    fmt,
4    fmt::{Debug, Formatter},
5    hash::{BuildHasher, Hash, Hasher},
6    iter::FromIterator,
7};
8use hashbrown::{DefaultHashBuilder, HashMap};
9
10/// Creates the `u64` hash value for the given value using the given hash builder.
11fn make_hash<T>(builder: &impl BuildHasher, value: &T) -> u64
12where
13    T: ?Sized + Hash,
14{
15    let state = &mut builder.build_hasher();
16    value.hash(state);
17    state.finish()
18}
19
20/// Data structure to intern and resolve strings.
21///
22/// Caches strings efficiently, with minimal memory footprint and associates them with unique symbols.
23/// These symbols allow constant time comparisons and look-ups to the underlying interned strings.
24///
25/// The following API covers the main functionality:
26///
27/// - [`StringInterner::get_or_intern`]: To intern a new string.
28///     - This maps from `string` type to `symbol` type.
29/// - [`StringInterner::resolve`]: To resolve your already interned strings.
30///     - This maps from `symbol` type to `string` type.
31pub struct StringInterner<B, H = DefaultHashBuilder>
32where
33    B: Backend,
34{
35    dedup: HashMap<<B as Backend>::Symbol, (), ()>,
36    hasher: H,
37    backend: B,
38}
39
40impl<B, H> Debug for StringInterner<B, H>
41where
42    B: Backend + Debug,
43    <B as Backend>::Symbol: Symbol + Debug,
44    H: BuildHasher,
45{
46    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
47        f.debug_struct("StringInterner")
48            .field("dedup", &self.dedup)
49            .field("backend", &self.backend)
50            .finish()
51    }
52}
53
54#[cfg(feature = "backends")]
55impl Default for StringInterner<crate::DefaultBackend> {
56    #[cfg_attr(feature = "inline-more", inline)]
57    fn default() -> Self {
58        StringInterner::new()
59    }
60}
61
62impl<B, H> Clone for StringInterner<B, H>
63where
64    B: Backend + Clone,
65    <B as Backend>::Symbol: Symbol,
66    H: BuildHasher + Clone,
67{
68    fn clone(&self) -> Self {
69        Self {
70            dedup: self.dedup.clone(),
71            hasher: self.hasher.clone(),
72            backend: self.backend.clone(),
73        }
74    }
75}
76
77impl<B, H> PartialEq for StringInterner<B, H>
78where
79    B: Backend + PartialEq,
80    <B as Backend>::Symbol: Symbol,
81    H: BuildHasher,
82{
83    fn eq(&self, rhs: &Self) -> bool {
84        self.len() == rhs.len() && self.backend == rhs.backend
85    }
86}
87
88impl<B, H> Eq for StringInterner<B, H>
89where
90    B: Backend + Eq,
91    <B as Backend>::Symbol: Symbol,
92    H: BuildHasher,
93{
94}
95
96impl<B, H> StringInterner<B, H>
97where
98    B: Backend,
99    <B as Backend>::Symbol: Symbol,
100    H: BuildHasher + Default,
101{
102    /// Creates a new empty `StringInterner`.
103    #[cfg_attr(feature = "inline-more", inline)]
104    pub fn new() -> Self {
105        Self {
106            dedup: HashMap::default(),
107            hasher: Default::default(),
108            backend: B::default(),
109        }
110    }
111
112    /// Creates a new `StringInterner` with the given initial capacity.
113    #[cfg_attr(feature = "inline-more", inline)]
114    pub fn with_capacity(cap: usize) -> Self {
115        Self {
116            dedup: HashMap::with_capacity_and_hasher(cap, ()),
117            hasher: Default::default(),
118            backend: B::with_capacity(cap),
119        }
120    }
121}
122
123impl<B, H> StringInterner<B, H>
124where
125    B: Backend,
126    <B as Backend>::Symbol: Symbol,
127    H: BuildHasher,
128{
129    /// Creates a new empty `StringInterner` with the given hasher.
130    #[cfg_attr(feature = "inline-more", inline)]
131    pub fn with_hasher(hash_builder: H) -> Self {
132        StringInterner {
133            dedup: HashMap::default(),
134            hasher: hash_builder,
135            backend: B::default(),
136        }
137    }
138
139    /// Creates a new empty `StringInterner` with the given initial capacity and the given hasher.
140    #[cfg_attr(feature = "inline-more", inline)]
141    pub fn with_capacity_and_hasher(cap: usize, hash_builder: H) -> Self {
142        StringInterner {
143            dedup: HashMap::with_capacity_and_hasher(cap, ()),
144            hasher: hash_builder,
145            backend: B::with_capacity(cap),
146        }
147    }
148
149    /// Returns the number of strings interned by the interner.
150    #[cfg_attr(feature = "inline-more", inline)]
151    pub fn len(&self) -> usize {
152        self.dedup.len()
153    }
154
155    /// Returns `true` if the string interner has no interned strings.
156    #[cfg_attr(feature = "inline-more", inline)]
157    pub fn is_empty(&self) -> bool {
158        self.len() == 0
159    }
160
161    /// Returns the symbol for the given string if any.
162    ///
163    /// Can be used to query if a string has already been interned without interning.
164    #[inline]
165    pub fn get<T>(&self, string: T) -> Option<<B as Backend>::Symbol>
166    where
167        T: AsRef<str>,
168    {
169        let string = string.as_ref();
170        let Self {
171            dedup,
172            hasher,
173            backend,
174        } = self;
175        let hash = make_hash(hasher, string);
176        dedup
177            .raw_entry()
178            .from_hash(hash, |symbol| {
179                // SAFETY: This is safe because we only operate on symbols that
180                //         we receive from our backend making them valid.
181                string == unsafe { backend.resolve_unchecked(*symbol) }
182            })
183            .map(|(&symbol, &())| symbol)
184    }
185
186    /// Interns the given string.
187    ///
188    /// This is used as backend by [`get_or_intern`][1] and [`get_or_intern_static`][2].
189    ///
190    /// [1]: [`StringInterner::get_or_intern`]
191    /// [2]: [`StringInterner::get_or_intern_static`]
192    #[cfg_attr(feature = "inline-more", inline)]
193    fn get_or_intern_using<T>(
194        &mut self,
195        string: T,
196        intern_fn: fn(&mut B, T) -> <B as Backend>::Symbol,
197    ) -> <B as Backend>::Symbol
198    where
199        T: Copy + Hash + AsRef<str> + for<'a> PartialEq<&'a str>,
200    {
201        let Self {
202            dedup,
203            hasher,
204            backend,
205        } = self;
206        let hash = make_hash(hasher, string.as_ref());
207        let entry = dedup.raw_entry_mut().from_hash(hash, |symbol| {
208            // SAFETY: This is safe because we only operate on symbols that
209            //         we receive from our backend making them valid.
210            string == unsafe { backend.resolve_unchecked(*symbol) }
211        });
212        use hashbrown::hash_map::RawEntryMut;
213        let (&mut symbol, &mut ()) = match entry {
214            RawEntryMut::Occupied(occupied) => occupied.into_key_value(),
215            RawEntryMut::Vacant(vacant) => {
216                let symbol = intern_fn(backend, string);
217                vacant.insert_with_hasher(hash, symbol, (), |symbol| {
218                    // SAFETY: This is safe because we only operate on symbols that
219                    //         we receive from our backend making them valid.
220                    let string = unsafe { backend.resolve_unchecked(*symbol) };
221                    make_hash(hasher, string)
222                })
223            }
224        };
225        symbol
226    }
227
228    /// Interns the given string.
229    ///
230    /// Returns a symbol for resolution into the original string.
231    ///
232    /// # Panics
233    ///
234    /// If the interner already interns the maximum number of strings possible
235    /// by the chosen symbol type.
236    #[inline]
237    pub fn get_or_intern<T>(&mut self, string: T) -> <B as Backend>::Symbol
238    where
239        T: AsRef<str>,
240    {
241        self.get_or_intern_using(string.as_ref(), B::intern)
242    }
243
244    /// Interns the given `'static` string.
245    ///
246    /// Returns a symbol for resolution into the original string.
247    ///
248    /// # Note
249    ///
250    /// This is more efficient than [`StringInterner::get_or_intern`] since it might
251    /// avoid some memory allocations if the backends supports this.
252    ///
253    /// # Panics
254    ///
255    /// If the interner already interns the maximum number of strings possible
256    /// by the chosen symbol type.
257    #[inline]
258    pub fn get_or_intern_static(&mut self, string: &'static str) -> <B as Backend>::Symbol {
259        self.get_or_intern_using(string, B::intern_static)
260    }
261
262    /// Shrink backend capacity to fit the interned strings exactly.
263    pub fn shrink_to_fit(&mut self) {
264        self.backend.shrink_to_fit()
265    }
266
267    /// Returns the string for the given `symbol`` if any.
268    #[inline]
269    pub fn resolve(&self, symbol: <B as Backend>::Symbol) -> Option<&str> {
270        self.backend.resolve(symbol)
271    }
272
273    /// Returns the string for the given `symbol` without performing any checks.
274    ///
275    /// # Safety
276    ///
277    /// It is the caller's responsibility to provide this method with `symbol`s
278    /// that are valid for the [`StringInterner`].
279    #[inline]
280    pub unsafe fn resolve_unchecked(&self, symbol: <B as Backend>::Symbol) -> &str {
281        unsafe { self.backend.resolve_unchecked(symbol) }
282    }
283
284    /// Returns an iterator that yields all interned strings and their symbols.
285    #[inline]
286    pub fn iter(&self) -> <B as Backend>::Iter<'_> {
287        self.backend.iter()
288    }
289}
290
291impl<B, H, T> FromIterator<T> for StringInterner<B, H>
292where
293    B: Backend,
294    <B as Backend>::Symbol: Symbol,
295    H: BuildHasher + Default,
296    T: AsRef<str>,
297{
298    fn from_iter<I>(iter: I) -> Self
299    where
300        I: IntoIterator<Item = T>,
301    {
302        let iter = iter.into_iter();
303        let (capacity, _) = iter.size_hint();
304        let mut interner = Self::with_capacity(capacity);
305        interner.extend(iter);
306        interner
307    }
308}
309
310impl<B, H, T> Extend<T> for StringInterner<B, H>
311where
312    B: Backend,
313    <B as Backend>::Symbol: Symbol,
314    H: BuildHasher,
315    T: AsRef<str>,
316{
317    fn extend<I>(&mut self, iter: I)
318    where
319        I: IntoIterator<Item = T>,
320    {
321        for s in iter {
322            self.get_or_intern(s.as_ref());
323        }
324    }
325}
326
327impl<'a, B, H> IntoIterator for &'a StringInterner<B, H>
328where
329    B: Backend,
330    H: BuildHasher,
331{
332    type Item = (<B as Backend>::Symbol, &'a str);
333    type IntoIter = <B as Backend>::Iter<'a>;
334
335    #[cfg_attr(feature = "inline-more", inline)]
336    fn into_iter(self) -> Self::IntoIter {
337        self.backend.iter()
338    }
339}