arc_string_interner/
lib.rs

1#![cfg_attr(all(feature = "bench", test), feature(test))]
2#![doc(html_root_url = "https://docs.rs/crate/arc-string-interner/0.1.0")]
3#![deny(missing_docs)]
4
5//! Caches strings efficiently, with minimal memory footprint and associates them with unique symbols.
6//! These symbols allow constant time comparisons and look-ups to the underlying interned strings.
7//!
8//! ### Example: Interning & Symbols
9//!
10//! ```
11//! use arc_string_interner::StringInterner;
12//!
13//! let mut interner = StringInterner::default();
14//! let sym0 = interner.get_or_intern("Elephant");
15//! let sym1 = interner.get_or_intern("Tiger");
16//! let sym2 = interner.get_or_intern("Horse");
17//! let sym3 = interner.get_or_intern("Tiger");
18//! assert_ne!(sym0, sym1);
19//! assert_ne!(sym0, sym2);
20//! assert_ne!(sym1, sym2);
21//! assert_eq!(sym1, sym3); // same!
22//! ```
23//!
24//! ### Example: Creation by `FromIterator`
25//!
26//! ```
27//! # use arc_string_interner::DefaultStringInterner;
28//! let interner = vec!["Elephant", "Tiger", "Horse", "Tiger"]
29//! 	.into_iter()
30//! 	.collect::<DefaultStringInterner>();
31//! ```
32//!
33//! ### Example: Look-up
34//!
35//! ```
36//! # use arc_string_interner::StringInterner;
37//! let mut interner = StringInterner::default();
38//! let sym = interner.get_or_intern("Banana");
39//! assert_eq!(interner.resolve(sym).map(ToString::to_string), Some("Banana".to_owned()));
40//! ```
41//!
42//! ### Example: Iteration
43//!
44//! ```
45//! # use arc_string_interner::DefaultStringInterner;
46//! let interner = vec!["Earth", "Water", "Fire", "Air"]
47//! 	.into_iter()
48//! 	.collect::<DefaultStringInterner>();
49//! for (sym, str) in interner {
50//! 	// iteration code here!
51//! }
52//! ```
53
54#[cfg(all(feature = "bench", test))]
55extern crate test;
56
57#[cfg(test)]
58mod tests;
59
60#[cfg(all(feature = "bench", test))]
61mod benches;
62
63#[cfg(feature = "serde_support")]
64mod serde_impl;
65
66use std::iter::FromIterator;
67use std::{
68    collections::{hash_map::RandomState, HashMap},
69    hash::{BuildHasher, Hash, Hasher},
70    iter, marker,
71    num::NonZeroU32,
72    slice,
73    sync::Arc,
74    u32, vec,
75};
76
77/// Types implementing this trait are able to act as symbols for string interners.
78///
79/// Symbols are returned by `StringInterner::get_or_intern` and allow look-ups of the
80/// original string contents with `StringInterner::resolve`.
81///
82/// # Note
83///
84/// Optimal symbols allow for efficient comparisons and have a small memory footprint.
85pub trait Symbol: Copy + Ord + Eq {
86    /// Creates a symbol from a `usize`.
87    ///
88    /// # Note
89    ///
90    /// Implementations panic if the operation cannot succeed.
91    fn from_usize(val: usize) -> Self;
92
93    /// Returns the `usize` representation of `self`.
94    fn to_usize(self) -> usize;
95}
96
97/// Symbol type used by the `DefaultStringInterner`.
98///
99/// # Note
100///
101/// This special symbol type has a memory footprint of 32 bits
102/// and allows for certain space optimizations such as using it within an option: `Option<Sym>`
103#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
104pub struct Sym(NonZeroU32);
105
106impl Symbol for Sym {
107    /// Creates a `Sym` from the given `usize`.
108    ///
109    /// # Panics
110    ///
111    /// If the given `usize` is greater than `u32::MAX - 1`.
112    fn from_usize(val: usize) -> Self {
113        assert!(
114            val < u32::MAX as usize,
115            "Symbol value {} is too large and not supported by `string_interner::Sym` type",
116            val
117        );
118        Sym(NonZeroU32::new((val + 1) as u32).unwrap_or_else(|| {
119            unreachable!("Should never fail because `val + 1` is nonzero and `<= u32::MAX`")
120        }))
121    }
122
123    fn to_usize(self) -> usize {
124        (self.0.get() as usize) - 1
125    }
126}
127
128impl Symbol for usize {
129    fn from_usize(val: usize) -> Self {
130        val
131    }
132
133    fn to_usize(self) -> usize {
134        self
135    }
136}
137
138/// Internal reference to `str` used only within the `StringInterner` itself
139/// to encapsulate the unsafe behaviour of interior references.
140#[derive(Debug, Copy, Clone, Eq)]
141struct InternalStrRef(*const str);
142
143impl InternalStrRef {
144    /// Creates an InternalStrRef from a str.
145    ///
146    /// This just wraps the str internally.
147    fn from_str(val: &str) -> Self {
148        InternalStrRef(val as *const str)
149    }
150
151    /// Reinterprets this InternalStrRef as a str.
152    ///
153    /// This is "safe" as long as this InternalStrRef only
154    /// refers to strs that outlive this instance or
155    /// the instance that owns this InternalStrRef.
156    /// This should hold true for `StringInterner`.
157    ///
158    /// Does not allocate memory!
159    fn as_str(&self) -> &str {
160        unsafe { &*self.0 }
161    }
162}
163
164impl<T> From<T> for InternalStrRef
165where
166    T: AsRef<str>,
167{
168    fn from(val: T) -> Self {
169        InternalStrRef::from_str(val.as_ref())
170    }
171}
172
173impl Hash for InternalStrRef {
174    fn hash<H: Hasher>(&self, state: &mut H) {
175        self.as_str().hash(state)
176    }
177}
178
179impl PartialEq for InternalStrRef {
180    fn eq(&self, other: &InternalStrRef) -> bool {
181        self.as_str() == other.as_str()
182    }
183}
184
185/// `StringInterner` that uses `Sym` as its underlying symbol type.
186pub type DefaultStringInterner = StringInterner<Sym>;
187
188/// Caches strings efficiently, with minimal memory footprint and associates them with unique symbols.
189/// These symbols allow constant time comparisons and look-ups to the underlying interned strings.
190#[derive(Debug, Eq)]
191pub struct StringInterner<S, H = RandomState>
192where
193    S: Symbol,
194    H: BuildHasher,
195{
196    map: HashMap<InternalStrRef, S, H>,
197    values: Vec<Arc<str>>,
198}
199
200impl<S, H> PartialEq for StringInterner<S, H>
201where
202    S: Symbol,
203    H: BuildHasher,
204{
205    fn eq(&self, rhs: &Self) -> bool {
206        self.len() == rhs.len() && self.values == rhs.values
207    }
208}
209
210impl Default for StringInterner<Sym, RandomState> {
211    #[inline]
212    fn default() -> Self {
213        StringInterner::new()
214    }
215}
216
217// Should be manually cloned.
218// See <https://github.com/Robbepop/string-interner/issues/9>.
219impl<S, H> Clone for StringInterner<S, H>
220where
221    S: Symbol,
222    H: Clone + BuildHasher,
223{
224    fn clone(&self) -> Self {
225        let values = self.values.clone();
226        let mut map = HashMap::with_capacity_and_hasher(values.len(), self.map.hasher().clone());
227        // Recreate `InternalStrRef` from the newly cloned `Box<str>`s.
228        // Use `extend()` to avoid `H: Default` trait bound required by `FromIterator for HashMap`.
229        map.extend(
230            values
231                .iter()
232                .enumerate()
233                .map(|(i, s)| (InternalStrRef::from_str(s), S::from_usize(i))),
234        );
235        Self { values, map }
236    }
237}
238
239// About `Send` and `Sync` impls for `StringInterner`
240// --------------------------------------------------
241//
242// tl;dr: Automation of Send+Sync impl was prevented by `InternalStrRef`
243// being an unsafe abstraction and thus prevented Send+Sync default derivation.
244//
245// These implementations are safe due to the following reasons:
246//  - `InternalStrRef` cannot be used outside `StringInterner`.
247//  - Strings stored in `StringInterner` are not mutable.
248//  - Iterator invalidation while growing the underlying `Vec<Box<str>>` is prevented by
249//    using an additional indirection to store strings.
250unsafe impl<S, H> Send for StringInterner<S, H>
251where
252    S: Symbol + Send,
253    H: BuildHasher,
254{
255}
256unsafe impl<S, H> Sync for StringInterner<S, H>
257where
258    S: Symbol + Sync,
259    H: BuildHasher,
260{
261}
262
263impl<S> StringInterner<S>
264where
265    S: Symbol,
266{
267    /// Creates a new empty `StringInterner`.
268    #[inline]
269    pub fn new() -> StringInterner<S, RandomState> {
270        StringInterner {
271            map: HashMap::new(),
272            values: Vec::new(),
273        }
274    }
275
276    /// Creates a new `StringInterner` with the given initial capacity.
277    #[inline]
278    pub fn with_capacity(cap: usize) -> Self {
279        StringInterner {
280            map: HashMap::with_capacity(cap),
281            values: Vec::with_capacity(cap),
282        }
283    }
284
285    /// Returns the number of elements the `StringInterner` can hold without reallocating.
286    #[inline]
287    pub fn capacity(&self) -> usize {
288        std::cmp::min(self.map.capacity(), self.values.capacity())
289    }
290
291    /// Reserves capacity for at least `additional` more elements to be interned into `self`.
292    ///
293    /// The collection may reserve more space to avoid frequent allocations.
294    /// After calling `reserve`, capacity will be greater than or equal to `self.len() + additional`.
295    /// Does nothing if capacity is already sufficient.
296    #[inline]
297    pub fn reserve(&mut self, additional: usize) {
298        self.map.reserve(additional);
299        self.values.reserve(additional);
300    }
301}
302
303impl<S, H> StringInterner<S, H>
304where
305    S: Symbol,
306    H: BuildHasher,
307{
308    /// Creates a new empty `StringInterner` with the given hasher.
309    #[inline]
310    pub fn with_hasher(hash_builder: H) -> StringInterner<S, H> {
311        StringInterner {
312            map: HashMap::with_hasher(hash_builder),
313            values: Vec::new(),
314        }
315    }
316
317    /// Creates a new empty `StringInterner` with the given initial capacity and the given hasher.
318    #[inline]
319    pub fn with_capacity_and_hasher(cap: usize, hash_builder: H) -> StringInterner<S, H> {
320        StringInterner {
321            map: HashMap::with_hasher(hash_builder),
322            values: Vec::with_capacity(cap),
323        }
324    }
325
326    /// Interns the given value.
327    ///
328    /// Returns a symbol to access it within this interner.
329    ///
330    /// This either copies the contents of the string (e.g. for str)
331    /// or moves them into this interner (e.g. for String).
332    #[inline]
333    pub fn get_or_intern<T>(&mut self, val: T) -> S
334    where
335        T: Into<String> + AsRef<str>,
336    {
337        match self.map.get(&val.as_ref().into()) {
338            Some(&sym) => sym,
339            None => self.intern(val),
340        }
341    }
342
343    /// Interns the given value and ignores collissions.
344    ///
345    /// Returns a symbol to access it within this interner.
346    fn intern<T>(&mut self, new_val: T) -> S
347    where
348        T: Into<String> + AsRef<str>,
349    {
350        let new_id: S = self.make_symbol();
351        let new_boxed_val: Arc<str> = new_val.into().into();
352        let new_ref: InternalStrRef = new_boxed_val.as_ref().into();
353        self.values.push(new_boxed_val);
354        self.map.insert(new_ref, new_id);
355        new_id
356    }
357
358    /// Creates a new symbol for the current state of the interner.
359    fn make_symbol(&self) -> S {
360        S::from_usize(self.len())
361    }
362
363    /// Returns the string slice associated with the given symbol if available,
364    /// otherwise returns `None`.
365    #[inline]
366    pub fn resolve(&self, symbol: S) -> Option<&Arc<str>> {
367        self.values.get(symbol.to_usize())
368    }
369
370    /// Returns the string associated with the given symbol.
371    ///
372    /// # Note
373    ///
374    /// This does not check whether the given symbol has an associated string
375    /// for the given string interner instance.
376    ///
377    /// # Safety
378    ///
379    /// This will result in undefined behaviour if the given symbol
380    /// had no associated string for this interner instance.
381    #[inline]
382    pub unsafe fn resolve_unchecked(&self, symbol: S) -> &Arc<str> {
383        self.values.get_unchecked(symbol.to_usize())
384    }
385
386    /// Returns the symbol associated with the given string for this interner
387    /// if existent, otherwise returns `None`.
388    #[inline]
389    pub fn get<T>(&self, val: T) -> Option<S>
390    where
391        T: AsRef<str>,
392    {
393        self.map.get(&val.as_ref().into()).cloned()
394    }
395
396    /// Returns the number of uniquely interned strings within this interner.
397    #[inline]
398    pub fn len(&self) -> usize {
399        self.values.len()
400    }
401
402    /// Returns true if the string interner holds no elements.
403    #[inline]
404    pub fn is_empty(&self) -> bool {
405        self.len() == 0
406    }
407
408    /// Returns an iterator over the interned strings.
409    #[inline]
410    pub fn iter(&self) -> Iter<S> {
411        Iter::new(self)
412    }
413
414    /// Returns an iterator over all intern indices and their associated strings.
415    #[inline]
416    pub fn iter_values(&self) -> Values<S> {
417        Values::new(self)
418    }
419
420    /// Shrinks the capacity of the interner as much as possible.
421    pub fn shrink_to_fit(&mut self) {
422        self.map.shrink_to_fit();
423        self.values.shrink_to_fit();
424    }
425}
426
427impl<T, S> FromIterator<T> for StringInterner<S>
428where
429    S: Symbol,
430    T: Into<String> + AsRef<str>,
431{
432    fn from_iter<I>(iter: I) -> Self
433    where
434        I: IntoIterator<Item = T>,
435    {
436        let iter = iter.into_iter();
437        let mut interner = StringInterner::with_capacity(iter.size_hint().0);
438        interner.extend(iter);
439        interner
440    }
441}
442
443impl<T, S> std::iter::Extend<T> for StringInterner<S>
444where
445    S: Symbol,
446    T: Into<String> + AsRef<str>,
447{
448    fn extend<I>(&mut self, iter: I)
449    where
450        I: IntoIterator<Item = T>,
451    {
452        for s in iter {
453            self.get_or_intern(s);
454        }
455    }
456}
457
458/// Iterator over the pairs of associated symbols and interned strings for a `StringInterner`.
459pub struct Iter<'a, S> {
460    iter: iter::Enumerate<slice::Iter<'a, Arc<str>>>,
461    mark: marker::PhantomData<S>,
462}
463
464impl<'a, S> Iter<'a, S>
465where
466    S: Symbol + 'a,
467{
468    /// Creates a new iterator for the given StringIterator over pairs of
469    /// symbols and their associated interned string.
470    #[inline]
471    fn new<H>(interner: &'a StringInterner<S, H>) -> Self
472    where
473        H: BuildHasher,
474    {
475        Iter {
476            iter: interner.values.iter().enumerate(),
477            mark: marker::PhantomData,
478        }
479    }
480}
481
482impl<'a, S> Iterator for Iter<'a, S>
483where
484    S: Symbol + 'a,
485{
486    type Item = (S, &'a Arc<str>);
487
488    #[inline]
489    fn next(&mut self) -> Option<Self::Item> {
490        self.iter
491            .next()
492            .map(|(num, boxed_str)| (S::from_usize(num), boxed_str))
493    }
494
495    #[inline]
496    fn size_hint(&self) -> (usize, Option<usize>) {
497        self.iter.size_hint()
498    }
499}
500
501/// Iterator over the interned strings of a `StringInterner`.
502pub struct Values<'a, S>
503where
504    S: Symbol + 'a,
505{
506    iter: slice::Iter<'a, Arc<str>>,
507    mark: marker::PhantomData<S>,
508}
509
510impl<'a, S> Values<'a, S>
511where
512    S: Symbol + 'a,
513{
514    /// Creates a new iterator for the given StringIterator over its interned strings.
515    #[inline]
516    fn new<H>(interner: &'a StringInterner<S, H>) -> Self
517    where
518        H: BuildHasher,
519    {
520        Values {
521            iter: interner.values.iter(),
522            mark: marker::PhantomData,
523        }
524    }
525}
526
527impl<'a, S> Iterator for Values<'a, S>
528where
529    S: Symbol + 'a,
530{
531    type Item = &'a Arc<str>;
532
533    #[inline]
534    fn next(&mut self) -> Option<Self::Item> {
535        self.iter.next()
536    }
537
538    #[inline]
539    fn size_hint(&self) -> (usize, Option<usize>) {
540        self.iter.size_hint()
541    }
542}
543
544impl<S, H> iter::IntoIterator for StringInterner<S, H>
545where
546    S: Symbol,
547    H: BuildHasher,
548{
549    type Item = (S, Arc<str>);
550    type IntoIter = IntoIter<S>;
551
552    fn into_iter(self) -> Self::IntoIter {
553        IntoIter {
554            iter: self.values.into_iter().enumerate(),
555            mark: marker::PhantomData,
556        }
557    }
558}
559
560/// Iterator over the pairs of associated symbol and strings.
561///
562/// Consumes the `StringInterner` upon usage.
563pub struct IntoIter<S>
564where
565    S: Symbol,
566{
567    iter: iter::Enumerate<vec::IntoIter<Arc<str>>>,
568    mark: marker::PhantomData<S>,
569}
570
571impl<S> Iterator for IntoIter<S>
572where
573    S: Symbol,
574{
575    type Item = (S, Arc<str>);
576
577    fn next(&mut self) -> Option<Self::Item> {
578        self.iter
579            .next()
580            .map(|(num, boxed_str)| (S::from_usize(num), boxed_str))
581    }
582
583    #[inline]
584    fn size_hint(&self) -> (usize, Option<usize>) {
585        self.iter.size_hint()
586    }
587}