tiny_interner/
lib.rs

1#![doc(html_root_url = "https://docs.rs/crate/tiny-interner/0.1.0")]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![deny(missing_docs)]
4#![warn(unsafe_op_in_unsafe_fn, clippy::redundant_closure_for_method_calls)]
5
6//! ~300 lines of Rust code that implement string internering for
7//! your programming language compiler.
8//!
9//! The crate caches strings and associates them with unique symbols.
10//! These allows constant time comparisons and look-ups to underlying interned strings.
11//!
12//! ### Examples:
13//!
14//! Internings:
15//! ```
16//! use tiny_interner::Interner;
17//!
18//! let mut interner = Interner::default();
19//! let symbol0 = interner.get_or_intern("A");
20//! let symbol1 = interner.get_or_intern("B");
21//! let symbol2 = interner.get_or_intern("C");
22//! let symbol3 = interner.get_or_intern("A");
23//!
24//! assert_ne!(symbol0, symbol1);
25//! assert_ne!(symbol0, symbol2);
26//! assert_ne!(symbol1, symbol2);
27//! assert_eq!(symbol0, symbol3);
28//! ```
29//!
30//! Resolving symbols:
31//! ```
32//! use tiny_interner::Interner;
33//!
34//! let mut interner = Interner::default();
35//! let symbol0 = interner.get_or_intern("A");
36//! let symbol1 = interner.get_or_intern("B");
37//!
38//! assert_eq!(interner.resolve(0), Some("A"));
39//! assert_eq!(interner.resolve(1), Some("B"));
40//! assert_eq!(interner.resolve(2), None);
41//! ```
42
43use core::{
44    hash::{BuildHasher, Hash, Hasher},
45    marker::PhantomData,
46    str::from_utf8_unchecked,
47};
48
49extern crate alloc;
50
51use self::alloc::{string::String, vec::Vec};
52use hashbrown::{
53    hash_map::{DefaultHashBuilder, RawEntryMut},
54    HashMap,
55};
56
57/// Represents unique symbol corresponding to some interned string.
58pub type Symbol = usize;
59
60/// Data structure that allows to resolve/intern strings.
61///
62/// See:
63///  - [`Interner::get_or_intern`] to intern a new string.
64///  - [`Interner::resolve`] to resolve already interned strings.
65#[derive(Debug)]
66pub struct Interner<H = DefaultHashBuilder>
67where
68    H: BuildHasher,
69{
70    dedup: HashMap<usize, (), ()>,
71    hasher: H,
72    backend: Backend,
73}
74
75/// Data structures that organizes interned strings.
76#[derive(Debug)]
77pub struct Backend {
78    ends: Vec<usize>,
79    buffer: String,
80    marker: PhantomData<fn() -> usize>,
81}
82
83impl Default for Interner {
84    #[inline]
85    fn default() -> Self {
86        Self::new()
87    }
88}
89
90impl Default for Backend {
91    #[inline]
92    fn default() -> Self {
93        Self {
94            ends: Vec::default(),
95            buffer: String::default(),
96            marker: Default::default(),
97        }
98    }
99}
100
101fn hash_value<T>(hasher: &impl BuildHasher, value: &T) -> u64
102where
103    T: ?Sized + Hash,
104{
105    let state = &mut hasher.build_hasher();
106    value.hash(state);
107    state.finish()
108}
109
110impl Backend {
111    #[inline]
112    fn with_capacity(capacity: usize) -> Self {
113        Self {
114            ends: Vec::with_capacity(capacity),
115            buffer: String::default(),
116            marker: Default::default(),
117        }
118    }
119
120    /// Interns the given string and returns corresponding symbol.
121    #[inline]
122    fn intern(&mut self, string: &str) -> usize {
123        self.push(string)
124    }
125
126    /// Interns the given static string and returns corresponding symbol.
127    #[inline]
128    fn intern_static(&mut self, string: &'static str) -> usize {
129        self.intern(string)
130    }
131
132    /// Resolves the given symbol to its original string.
133    #[inline]
134    fn resolve(&self, symbol: usize) -> Option<&str> {
135        self.span_of(symbol).map(|span| self.str_at(span))
136    }
137
138    /// Resolves the given symbol to its original string, but without additional checks.
139    #[inline]
140    unsafe fn unchecked_resolve(&self, symbol: usize) -> &str {
141        unsafe { self.str_at(self.unchecked_span_of(symbol)) }
142    }
143
144    /// Shrink capacity to fit interned symbols exactly.
145    fn shrink_to_fit(&mut self) {
146        self.ends.shrink_to_fit();
147        self.buffer.shrink_to_fit();
148    }
149
150    fn next_symbol(&self) -> usize {
151        self.ends.len()
152    }
153
154    /// Returns the span for the given symbol if any.
155    fn span_of(&self, symbol: usize) -> Option<Span> {
156        self.ends.get(symbol).copied().map(|end| Span {
157            start: self.ends.get(symbol.wrapping_sub(1)).copied().unwrap_or(0),
158            end,
159        })
160    }
161
162    /// Returns the span for the given symbol if any, but without additional checks.
163    unsafe fn unchecked_span_of(&self, symbol: usize) -> Span {
164        let end = unsafe { *self.ends.get_unchecked(symbol) };
165        let start = self.ends.get(symbol.wrapping_sub(1)).copied().unwrap_or(0);
166
167        Span { start, end }
168    }
169
170    fn str_at(&self, span: Span) -> &str {
171        unsafe {
172            from_utf8_unchecked(&self.buffer.as_bytes()[(span.start as usize)..(span.end as usize)])
173        }
174    }
175
176    /// Pushes the string into the buffer and returns corresponding symbol.
177    fn push(&mut self, string: &str) -> usize {
178        self.buffer.push_str(string);
179
180        let end = self.buffer.as_bytes().len();
181        let symbol = self.next_symbol();
182
183        self.ends.push(end);
184
185        symbol
186    }
187}
188
189impl<H> Interner<H>
190where
191    H: BuildHasher + Default,
192{
193    /// Creates a new empty `Interner`.
194    #[inline]
195    pub fn new() -> Self {
196        Self {
197            dedup: HashMap::default(),
198            hasher: Default::default(),
199            backend: Backend::default(),
200        }
201    }
202
203    /// Creates a new empty `Interner` with the given hasher.
204    #[inline]
205    pub fn with_hasher(hasher: H) -> Self {
206        Self {
207            dedup: HashMap::default(),
208            hasher,
209            backend: Backend::default(),
210        }
211    }
212
213    /// Creates a new empty `Interner` with the given capacity.
214    #[inline]
215    pub fn with_capacity(capacity: usize) -> Self {
216        Self {
217            dedup: HashMap::with_capacity_and_hasher(capacity, ()),
218            hasher: Default::default(),
219            backend: Backend::with_capacity(capacity),
220        }
221    }
222
223    /// Creates a new empty `Interner` with the given capacity and hasher.
224    #[inline]
225    pub fn with_capacity_and_hasher(capacity: usize, hasher: H) -> Self {
226        Self {
227            dedup: HashMap::with_capacity_and_hasher(capacity, ()),
228            hasher,
229            backend: Backend::with_capacity(capacity),
230        }
231    }
232
233    /// Returns the number of symbols/strings interned by the interner.
234    #[inline]
235    pub fn len(&self) -> usize {
236        self.dedup.len()
237    }
238
239    /// Returns `true` if the string interner has no interned strings/amount of symbols is `0`.
240    #[inline]
241    pub fn is_empty(&self) -> bool {
242        self.len() == 0
243    }
244
245    /// Returns the symbol for the given string if any.
246    #[inline]
247    pub fn get<T>(&self, string: T) -> Option<usize>
248    where
249        T: AsRef<str>,
250    {
251        let string_ref = string.as_ref();
252        let hasher = &self.hasher;
253        let hash = hash_value(hasher, string_ref);
254
255        self.dedup
256            .raw_entry()
257            .from_hash(hash, |symbol| {
258                string_ref == unsafe { self.backend.unchecked_resolve(*symbol) }
259            })
260            .map(|(&symbol, ())| symbol)
261    }
262
263    /// Interns the given string and returns a corresponding symbol.
264    #[inline]
265    fn get_or_intern_using<T>(
266        &mut self,
267        string: T,
268        intern_fn: fn(&mut Backend, T) -> usize,
269    ) -> usize
270    where
271        T: AsRef<str> + Copy + Hash + for<'a> PartialEq<&'a str>,
272    {
273        let string_ref = string.as_ref();
274
275        let hasher = &self.hasher;
276        let hash = hash_value(hasher, string_ref);
277
278        let entry = self.dedup.raw_entry_mut().from_hash(hash, |symbol| {
279            string_ref == unsafe { self.backend.unchecked_resolve(*symbol) }
280        });
281
282        let (&mut symbol, &mut ()) = match entry {
283            RawEntryMut::Vacant(vacant) => {
284                let symbol = intern_fn(&mut self.backend, string);
285                vacant.insert_with_hasher(hash, symbol, (), |symbol| {
286                    hash_value(hasher, unsafe { self.backend.unchecked_resolve(*symbol) })
287                })
288            }
289            RawEntryMut::Occupied(occupied) => occupied.into_key_value(),
290        };
291
292        symbol
293    }
294
295    /// Interns the given string and returns a corresponding symbol.
296    #[inline]
297    pub fn get_or_intern<T>(&mut self, string: T) -> usize
298    where
299        T: AsRef<str>,
300    {
301        self.get_or_intern_using(string.as_ref(), Backend::intern)
302    }
303
304    /// Interns the given `'static` string and returns a corresponding symbol.
305    pub fn get_or_intern_static(&mut self, string: &'static str) -> usize {
306        self.get_or_intern_using(string.as_ref(), Backend::intern_static)
307    }
308
309    /// Shrink backend capacity to fit the interned strings exactly.
310    pub fn shrink_to_fit(&mut self) {
311        self.backend.shrink_to_fit();
312    }
313
314    /// Returns the string for the given symbol if any.
315    #[inline]
316    pub fn resolve(&self, symbol: usize) -> Option<&str> {
317        self.backend.resolve(symbol)
318    }
319}
320
321/// Represents a location of an interned string inside the [`Backend::buffer`] buffer.
322#[derive(Debug, Copy, Clone, PartialEq, Eq)]
323pub struct Span {
324    start: usize,
325    end: usize,
326}