Skip to main content

arena_lib/
intern.rs

1//! String interner that hands out compact [`Symbol`] handles.
2//!
3//! [`Interner`] deduplicates owned [`String`] storage:
4//! every call to [`Interner::intern`] returns the same [`Symbol`] for the
5//! same input. Equality and hashing operate on the four-byte symbol rather
6//! than the underlying bytes, which is the win when the same identifier
7//! appears thousands of times across a workload.
8//!
9//! # Cost summary
10//!
11//! - `intern`: expected O(1) on first sight; expected O(1) on repeated sight.
12//! - `resolve`: O(1).
13//! - `lookup` / `contains`: expected O(1).
14//! - `len` / `is_empty`: O(1).
15//!
16//! The implementation uses [`hashbrown::HashMap`] for the de-duplication
17//! index, keeping `intern` / `lookup` at expected O(1) while still
18//! compiling under `no_std`.
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use hashbrown::HashMap;
24
25use crate::error::{Error, Result};
26
27/// Compact handle returned by [`Interner::intern`].
28///
29/// A `Symbol` is a 4-byte `Copy` value. **Within a single interner**, two
30/// symbols compare equal if and only if the strings they refer to were
31/// interned from byte-identical inputs.
32///
33/// Symbols are scoped to the [`Interner`] that issued them: id values
34/// collide across separate interners (both start at 0, etc.). Passing a
35/// foreign symbol to [`Interner::resolve`] is undefined at the API
36/// contract level — the call may return `None`, or it may return an
37/// unrelated string. Treat `Symbol` values as opaque handles tied to a
38/// single interner.
39#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
40pub struct Symbol(u32);
41
42impl Symbol {
43    /// Returns the underlying numeric identifier.
44    ///
45    /// Useful for diagnostics or for serializing alongside an interner
46    /// snapshot. Do not synthesize `Symbol` values; obtain them from
47    /// [`Interner::intern`].
48    #[inline]
49    pub const fn id(self) -> u32 {
50        self.0
51    }
52}
53
54/// String interner. See the [module-level docs](self).
55///
56/// # Examples
57///
58/// ```
59/// use arena_lib::Interner;
60///
61/// let mut interner = Interner::new();
62/// let a = interner.intern("user:42");
63/// let b = interner.intern("user:42");
64/// let c = interner.intern("user:7");
65///
66/// assert_eq!(a, b);
67/// assert_ne!(a, c);
68/// assert_eq!(interner.resolve(a), Some("user:42"));
69/// assert_eq!(interner.len(), 2);
70/// ```
71pub struct Interner {
72    storage: Vec<String>,
73    lookup: HashMap<String, u32>,
74}
75
76impl Interner {
77    /// Creates an empty interner that performs no allocation up front.
78    #[inline]
79    #[must_use]
80    pub fn new() -> Self {
81        Self {
82            storage: Vec::new(),
83            lookup: HashMap::new(),
84        }
85    }
86
87    /// Creates an empty interner with storage pre-reserved for `capacity`
88    /// distinct strings.
89    #[inline]
90    #[must_use]
91    pub fn with_capacity(capacity: usize) -> Self {
92        Self {
93            storage: Vec::with_capacity(capacity),
94            lookup: HashMap::with_capacity(capacity),
95        }
96    }
97
98    /// Number of distinct strings currently interned.
99    #[inline]
100    #[must_use]
101    pub fn len(&self) -> usize {
102        self.storage.len()
103    }
104
105    /// Returns `true` if the interner holds no strings.
106    #[inline]
107    #[must_use]
108    pub fn is_empty(&self) -> bool {
109        self.storage.is_empty()
110    }
111
112    /// Interns `s` and returns its [`Symbol`].
113    ///
114    /// Idempotent: repeated calls with the same input return the same
115    /// symbol. Panics if the symbol counter would overflow `u32::MAX`
116    /// — use [`Interner::try_intern`] for the explicit fallible variant.
117    pub fn intern(&mut self, s: &str) -> Symbol {
118        match self.try_intern(s) {
119            Ok(sym) => sym,
120            Err(_) => panic!("interner symbol counter overflow (u32::MAX symbols)"),
121        }
122    }
123
124    /// Interns `s`, returning a [`Symbol`] on success or
125    /// [`Error::CounterOverflow`] if the interner cannot represent more
126    /// distinct strings.
127    pub fn try_intern(&mut self, s: &str) -> Result<Symbol> {
128        if let Some(&id) = self.lookup.get(s) {
129            return Ok(Symbol(id));
130        }
131        let id_usize = self.storage.len();
132        if id_usize > u32::MAX as usize {
133            return Err(Error::CounterOverflow);
134        }
135        let id = id_usize as u32;
136        let owned = String::from(s);
137        self.storage.push(owned.clone());
138        let _ = self.lookup.insert(owned, id);
139        Ok(Symbol(id))
140    }
141
142    /// Returns the original string for `symbol`, or `None` if the
143    /// symbol's id is out of range for this interner.
144    ///
145    /// Passing a symbol issued by a *different* interner is undefined
146    /// behaviour at the API contract level: the call may return `None`
147    /// (if the foreign id is past this interner's length) or it may
148    /// return some unrelated string (if the foreign id collides with an
149    /// in-range local id). Treat [`Symbol`] values as opaque handles
150    /// scoped to the [`Interner`] that issued them.
151    ///
152    /// # Examples
153    ///
154    /// ```
155    /// use arena_lib::Interner;
156    ///
157    /// let mut interner = Interner::new();
158    /// let sym = interner.intern("payload");
159    /// assert_eq!(interner.resolve(sym), Some("payload"));
160    /// ```
161    #[inline]
162    #[must_use]
163    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
164        self.storage.get(symbol.0 as usize).map(String::as_str)
165    }
166
167    /// Returns `true` if `s` has already been interned.
168    ///
169    /// Equivalent to [`Interner::lookup`] returning `Some`.
170    #[inline]
171    #[must_use]
172    pub fn contains(&self, s: &str) -> bool {
173        self.lookup.contains_key(s)
174    }
175
176    /// Returns the symbol previously assigned to `s`, without inserting
177    /// a new entry if the string is unknown.
178    ///
179    /// Use this when you want to *probe* the interner without growing
180    /// it — for example, to enforce that only pre-registered identifiers
181    /// are accepted.
182    ///
183    /// # Examples
184    ///
185    /// ```
186    /// use arena_lib::Interner;
187    ///
188    /// let mut interner = Interner::new();
189    /// let known = interner.intern("known");
190    ///
191    /// assert_eq!(interner.lookup("known"), Some(known));
192    /// assert_eq!(interner.lookup("unknown"), None);
193    /// assert_eq!(interner.len(), 1, "lookup must not insert");
194    /// ```
195    #[inline]
196    #[must_use]
197    pub fn lookup(&self, s: &str) -> Option<Symbol> {
198        self.lookup.get(s).copied().map(Symbol)
199    }
200
201    /// Iterator over `(Symbol, &str)` pairs for every interned string,
202    /// in insertion order.
203    pub fn iter(&self) -> Iter<'_> {
204        Iter {
205            inner: self.storage.iter().enumerate(),
206        }
207    }
208}
209
210impl Default for Interner {
211    #[inline]
212    fn default() -> Self {
213        Self::new()
214    }
215}
216
217impl core::fmt::Debug for Interner {
218    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
219        f.debug_struct("Interner")
220            .field("len", &self.storage.len())
221            .finish()
222    }
223}
224
225/// Iterator over `(Symbol, &str)` pairs returned by [`Interner::iter`].
226pub struct Iter<'a> {
227    inner: core::iter::Enumerate<core::slice::Iter<'a, String>>,
228}
229
230impl<'a> Iterator for Iter<'a> {
231    type Item = (Symbol, &'a str);
232
233    fn next(&mut self) -> Option<Self::Item> {
234        let (i, s) = self.inner.next()?;
235        Some((Symbol(i as u32), s.as_str()))
236    }
237}
238
239#[cfg(test)]
240mod tests {
241    use super::*;
242
243    #[test]
244    fn same_string_returns_same_symbol() {
245        let mut i = Interner::new();
246        let a = i.intern("hello");
247        let b = i.intern("hello");
248        assert_eq!(a, b);
249        assert_eq!(i.len(), 1);
250    }
251
252    #[test]
253    fn distinct_strings_return_distinct_symbols() {
254        let mut i = Interner::new();
255        let a = i.intern("alpha");
256        let b = i.intern("bravo");
257        assert_ne!(a, b);
258    }
259
260    #[test]
261    fn resolve_round_trips() {
262        let mut i = Interner::new();
263        let s = i.intern("round-trip");
264        assert_eq!(i.resolve(s), Some("round-trip"));
265    }
266
267    #[test]
268    fn lookup_does_not_insert() {
269        let mut i = Interner::new();
270        let _ = i.intern("first");
271        assert!(i.lookup("second").is_none());
272        assert_eq!(i.len(), 1);
273    }
274
275    #[test]
276    fn contains_reflects_state() {
277        let mut i = Interner::new();
278        assert!(!i.contains("x"));
279        let _ = i.intern("x");
280        assert!(i.contains("x"));
281    }
282
283    #[test]
284    fn iter_yields_insertion_order() {
285        let mut i = Interner::new();
286        let _ = i.intern("a");
287        let _ = i.intern("b");
288        let _ = i.intern("c");
289        let collected: Vec<&str> = i.iter().map(|(_, s)| s).collect();
290        assert_eq!(collected, vec!["a", "b", "c"]);
291    }
292}