string_hash_interner/
interner.rs

1use crate::{
2    backend::{Iter, IterWithHashes, StringBackend},
3    intern::Intern,
4    DefaultSymbol, Symbol,
5};
6use core::{
7    fmt,
8    fmt::{Debug, Formatter},
9    hash::{BuildHasher, Hasher},
10    iter::FromIterator,
11};
12use hashbrown::{DefaultHashBuilder, HashMap};
13
14/// Creates the `u64` hash value for the given value using the given hash builder.
15fn make_hash<I: Intern + ?Sized>(builder: &impl BuildHasher, value: &I) -> u64 {
16    let state = &mut builder.build_hasher();
17    value.hash(state);
18    state.finish()
19}
20
21/// Data structure to intern and resolve strings.
22///
23/// Caches strings efficiently, with minimal memory footprint and associates them with unique symbols.
24/// These symbols allow constant time comparisons and look-ups to the underlying interned strings.
25///
26/// The following API covers the main functionality:
27///
28/// - [`Interner::intern`]: To intern a new string.
29///     - This maps from `string` type to `symbol` type.
30/// - [`Interner::resolve`]: To resolve your already interned strings.
31///     - This maps from `symbol` type to `string` type.
32pub struct Interner<I: Intern + ?Sized, S: Symbol = DefaultSymbol, H = DefaultHashBuilder> {
33    dedup: HashMap<S, (), ()>,
34    hasher: H,
35    backend: StringBackend<I, S>,
36}
37
38impl<I: Intern + ?Sized, S: Symbol, H> Debug for Interner<I, S, H>
39where
40    S: Debug,
41    H: BuildHasher,
42{
43    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
44        f.debug_struct("StringInterner")
45            .field("dedup", &self.dedup)
46            .field("backend", &self.backend)
47            .finish()
48    }
49}
50
51impl<I: Intern + ?Sized, S: Symbol, H: BuildHasher + Default> Default for Interner<I, S, H> {
52    #[cfg_attr(feature = "inline-more", inline)]
53    fn default() -> Self {
54        Interner::new()
55    }
56}
57
58impl<I: Intern + ?Sized, S: Symbol, H: Clone> Clone for Interner<I, S, H> {
59    fn clone(&self) -> Self {
60        Self {
61            dedup: self.dedup.clone(),
62            hasher: self.hasher.clone(),
63            backend: self.backend.clone(),
64        }
65    }
66}
67
68impl<I: Intern + ?Sized, S: Symbol, H: BuildHasher + Default> Interner<I, S, H> {
69    /// Creates a new empty [`Interner`].
70    #[cfg_attr(feature = "inline-more", inline)]
71    pub fn new() -> Self {
72        Self {
73            dedup: HashMap::default(),
74            hasher: Default::default(),
75            backend: StringBackend::default(),
76        }
77    }
78
79    /// Creates a new `StringInterner` with the given initial capacity.
80    #[cfg_attr(feature = "inline-more", inline)]
81    pub fn with_capacity(cap: usize) -> Self {
82        Self {
83            dedup: HashMap::with_capacity_and_hasher(cap, ()),
84            hasher: Default::default(),
85            backend: StringBackend::with_capacity(cap),
86        }
87    }
88}
89
90impl<I: Intern + ?Sized, S: Symbol, H: BuildHasher> Interner<I, S, H> {
91    /// Creates a new empty `StringInterner` with the given hasher.
92    #[cfg_attr(feature = "inline-more", inline)]
93    pub fn with_hasher(hash_builder: H) -> Self {
94        Interner {
95            dedup: HashMap::default(),
96            hasher: hash_builder,
97            backend: StringBackend::default(),
98        }
99    }
100
101    /// Creates a new empty `StringInterner` with the given initial capacity and the given hasher.
102    #[cfg_attr(feature = "inline-more", inline)]
103    pub fn with_capacity_and_hasher(cap: usize, hash_builder: H) -> Self {
104        Interner {
105            dedup: HashMap::with_capacity_and_hasher(cap, ()),
106            hasher: hash_builder,
107            backend: StringBackend::with_capacity(cap),
108        }
109    }
110
111    /// Returns the number of strings interned by the interner.
112    #[cfg_attr(feature = "inline-more", inline)]
113    pub fn len(&self) -> usize {
114        self.dedup.len()
115    }
116
117    /// Returns `true` if the string interner has no interned strings.
118    #[cfg_attr(feature = "inline-more", inline)]
119    pub fn is_empty(&self) -> bool {
120        self.len() == 0
121    }
122
123    /// Returns the symbol for the given string if any.
124    ///
125    /// Can be used to query if a string has already been interned without interning.
126    #[inline]
127    pub fn get<T>(&self, string: T) -> Option<S>
128    where
129        T: AsRef<I>,
130    {
131        let string = string.as_ref();
132
133        let hash = make_hash(&self.hasher, string);
134        self.dedup
135            .raw_entry()
136            .from_hash(hash, |symbol| {
137                // SAFETY: This is safe because we only operate on symbols that
138                //         we receive from our backend making them valid.
139                string == unsafe { self.backend.resolve_unchecked(*symbol) }
140            })
141            .map(|(&symbol, &())| symbol)
142    }
143
144    /// Interns the given string.
145    ///
146    /// Returns a symbol for resolution into the original string, and its hash.
147    ///
148    /// # Panics
149    ///
150    /// If the interner already interns the maximum number of strings possible
151    /// by the chosen symbol type.
152    #[inline]
153    pub fn intern_and_hash<T: AsRef<I>>(&mut self, string: T) -> (S, u64) {
154        let string = string.as_ref();
155
156        let hash = make_hash(&self.hasher, string);
157        let entry = self.dedup.raw_entry_mut().from_hash(hash, |symbol| {
158            // SAFETY: This is safe because we only operate on symbols that
159            //         we receive from our backend making them valid.
160            string == unsafe { self.backend.resolve_unchecked(*symbol) }
161        });
162        use hashbrown::hash_map::RawEntryMut;
163        let (&mut symbol, &mut ()) = match entry {
164            RawEntryMut::Occupied(occupied) => occupied.into_key_value(),
165            RawEntryMut::Vacant(vacant) => {
166                let symbol = self.backend.intern(string, hash);
167                vacant.insert_with_hasher(hash, symbol, (), |symbol| {
168                    // SAFETY: This is safe because we only operate on symbols that
169                    //         we receive from our backend making them valid.
170                    unsafe { self.backend.get_hash_unchecked(*symbol) }
171                })
172            }
173        };
174        (symbol, hash)
175    }
176
177    /// Interns the given string.
178    ///
179    /// Returns a symbol for resolution into the original string.
180    ///
181    /// # Panics
182    ///
183    /// If the interner already interns the maximum number of strings possible
184    /// by the chosen symbol type.
185    #[inline]
186    pub fn intern<T: AsRef<I>>(&mut self, string: T) -> S {
187        self.intern_and_hash(string).0
188    }
189
190    /// Shrink backend capacity to fit the interned strings exactly.
191    pub fn shrink_to_fit(&mut self) {
192        self.backend.shrink_to_fit()
193    }
194
195    /// Returns the string for the given `symbol`` if any.
196    #[inline]
197    pub fn resolve(&self, symbol: S) -> Option<&I> {
198        self.backend.resolve(symbol)
199    }
200
201    /// Returns cached hash of the string for the given `symbol`.
202    pub fn get_hash(&self, symbol: S) -> Option<u64> {
203        self.backend.get_hash(symbol)
204    }
205
206    /// Returns the string for the given `symbol` without performing any checks.
207    ///
208    /// # Safety
209    ///
210    /// It is the caller's responsibility to provide this method with `symbol`s
211    /// that are valid for the [`Interner`].
212    #[inline]
213    pub unsafe fn resolve_unchecked(&self, symbol: S) -> &I {
214        unsafe { self.backend.resolve_unchecked(symbol) }
215    }
216
217    /// Returns cached hash of the string for the given `symbol` without performing any checks.
218    ///
219    /// # Safety
220    ///
221    /// It is the caller's responsibility to provide this method with `symbol`s
222    /// that are valid for the [`Interner`].
223    pub unsafe fn get_hash_unchecked(&self, symbol: S) -> u64 {
224        // SAFETY: The function is marked unsafe so that the caller guarantees
225        //         that required invariants are checked.
226        unsafe { self.backend.get_hash_unchecked(symbol) }
227    }
228
229    /// Returns an iterator that yields all interned strings, their symbols, and hashes.
230    #[inline]
231    pub fn iter_with_hashes(&self) -> IterWithHashes<'_, I, S> {
232        self.backend.iter_with_hashes()
233    }
234
235    /// Returns an iterator that yields all interned strings and their symbols.
236    #[inline]
237    pub fn iter(&self) -> Iter<'_, I, S> {
238        self.backend.iter()
239    }
240}
241
242impl<I: Intern + ?Sized, S: Symbol, H: BuildHasher + Default, T: AsRef<I>> FromIterator<T>
243    for Interner<I, S, H>
244{
245    fn from_iter<It>(iter: It) -> Self
246    where
247        It: IntoIterator<Item = T>,
248    {
249        let iter = iter.into_iter();
250        let (capacity, _) = iter.size_hint();
251        let mut interner = Self::with_capacity(capacity);
252        interner.extend(iter);
253        interner
254    }
255}
256
257impl<I: Intern + ?Sized, S: Symbol, H: BuildHasher + Default, T: AsRef<I>> Extend<T>
258    for Interner<I, S, H>
259{
260    fn extend<It>(&mut self, iter: It)
261    where
262        It: IntoIterator<Item = T>,
263    {
264        for s in iter {
265            self.intern_and_hash(s);
266        }
267    }
268}
269
270impl<'a, I: Intern + ?Sized, S: Symbol, H> IntoIterator for &'a Interner<I, S, H> {
271    type Item = (S, &'a I);
272    type IntoIter = Iter<'a, I, S>;
273
274    #[cfg_attr(feature = "inline-more", inline)]
275    fn into_iter(self) -> Self::IntoIter {
276        self.backend.iter()
277    }
278}