Skip to main content

arena_lib/
intern.rs

1//! String interner that hands out compact [`Symbol`] handles.
2//!
3//! [`Interner`] deduplicates owned [`String`] storage:
4//! every call to [`Interner::intern`] returns the same [`Symbol`] for the
5//! same input. Equality and hashing operate on the four-byte symbol rather
6//! than the underlying bytes, which is the win when the same identifier
7//! appears thousands of times across a workload.
8//!
9//! # Cost summary
10//!
11//! - `intern`: expected O(1) on first sight; expected O(1) on repeated sight.
12//! - `resolve`: O(1).
13//! - `lookup` / `contains`: expected O(1).
14//! - `len` / `is_empty`: O(1).
15//!
16//! The implementation uses [`hashbrown::HashMap`] for the de-duplication
17//! index, keeping `intern` / `lookup` at expected O(1) while still
18//! compiling under `no_std`.
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use hashbrown::HashMap;
24
25use crate::error::{Error, Result};
26
27/// Compact handle returned by [`Interner::intern`].
28///
29/// A `Symbol` is a 4-byte `Copy` value. Two symbols compare equal if and
30/// only if the strings they refer to were interned by the **same**
31/// [`Interner`] from byte-identical inputs. Symbols are not transferable
32/// across interners.
33#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
34pub struct Symbol(u32);
35
36impl Symbol {
37    /// Returns the underlying numeric identifier.
38    ///
39    /// Useful for diagnostics or for serializing alongside an interner
40    /// snapshot. Do not synthesize `Symbol` values; obtain them from
41    /// [`Interner::intern`].
42    #[inline]
43    pub const fn id(self) -> u32 {
44        self.0
45    }
46}
47
48/// String interner. See the [module-level docs](self).
49///
50/// # Examples
51///
52/// ```
53/// use arena_lib::Interner;
54///
55/// let mut interner = Interner::new();
56/// let a = interner.intern("user:42");
57/// let b = interner.intern("user:42");
58/// let c = interner.intern("user:7");
59///
60/// assert_eq!(a, b);
61/// assert_ne!(a, c);
62/// assert_eq!(interner.resolve(a), Some("user:42"));
63/// assert_eq!(interner.len(), 2);
64/// ```
65pub struct Interner {
66    storage: Vec<String>,
67    lookup: HashMap<String, u32>,
68}
69
70impl Interner {
71    /// Creates an empty interner that performs no allocation up front.
72    #[inline]
73    #[must_use]
74    pub fn new() -> Self {
75        Self {
76            storage: Vec::new(),
77            lookup: HashMap::new(),
78        }
79    }
80
81    /// Creates an empty interner with storage pre-reserved for `capacity`
82    /// distinct strings.
83    #[inline]
84    #[must_use]
85    pub fn with_capacity(capacity: usize) -> Self {
86        Self {
87            storage: Vec::with_capacity(capacity),
88            lookup: HashMap::with_capacity(capacity),
89        }
90    }
91
92    /// Number of distinct strings currently interned.
93    #[inline]
94    #[must_use]
95    pub fn len(&self) -> usize {
96        self.storage.len()
97    }
98
99    /// Returns `true` if the interner holds no strings.
100    #[inline]
101    #[must_use]
102    pub fn is_empty(&self) -> bool {
103        self.storage.is_empty()
104    }
105
106    /// Interns `s` and returns its [`Symbol`].
107    ///
108    /// Idempotent: repeated calls with the same input return the same
109    /// symbol. Panics if the symbol counter would overflow `u32::MAX`
110    /// — use [`Interner::try_intern`] for the explicit fallible variant.
111    pub fn intern(&mut self, s: &str) -> Symbol {
112        match self.try_intern(s) {
113            Ok(sym) => sym,
114            Err(_) => panic!("interner symbol counter overflow (u32::MAX symbols)"),
115        }
116    }
117
118    /// Interns `s`, returning a [`Symbol`] on success or
119    /// [`Error::CounterOverflow`] if the interner cannot represent more
120    /// distinct strings.
121    pub fn try_intern(&mut self, s: &str) -> Result<Symbol> {
122        if let Some(&id) = self.lookup.get(s) {
123            return Ok(Symbol(id));
124        }
125        let id_usize = self.storage.len();
126        if id_usize > u32::MAX as usize {
127            return Err(Error::CounterOverflow);
128        }
129        let id = id_usize as u32;
130        let owned = String::from(s);
131        self.storage.push(owned.clone());
132        let _ = self.lookup.insert(owned, id);
133        Ok(Symbol(id))
134    }
135
136    /// Returns the original string for `symbol`, or `None` if the symbol
137    /// did not come from this interner.
138    #[inline]
139    #[must_use]
140    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
141        self.storage.get(symbol.0 as usize).map(String::as_str)
142    }
143
144    /// Returns `true` if `s` has already been interned.
145    ///
146    /// Equivalent to [`Interner::lookup`] returning `Some`.
147    #[inline]
148    #[must_use]
149    pub fn contains(&self, s: &str) -> bool {
150        self.lookup.contains_key(s)
151    }
152
153    /// Returns the symbol previously assigned to `s`, without inserting
154    /// a new entry if the string is unknown.
155    #[inline]
156    #[must_use]
157    pub fn lookup(&self, s: &str) -> Option<Symbol> {
158        self.lookup.get(s).copied().map(Symbol)
159    }
160
161    /// Iterator over `(Symbol, &str)` pairs for every interned string,
162    /// in insertion order.
163    pub fn iter(&self) -> Iter<'_> {
164        Iter {
165            inner: self.storage.iter().enumerate(),
166        }
167    }
168}
169
170impl Default for Interner {
171    #[inline]
172    fn default() -> Self {
173        Self::new()
174    }
175}
176
177impl core::fmt::Debug for Interner {
178    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
179        f.debug_struct("Interner")
180            .field("len", &self.storage.len())
181            .finish()
182    }
183}
184
185/// Iterator over `(Symbol, &str)` pairs returned by [`Interner::iter`].
186pub struct Iter<'a> {
187    inner: core::iter::Enumerate<core::slice::Iter<'a, String>>,
188}
189
190impl<'a> Iterator for Iter<'a> {
191    type Item = (Symbol, &'a str);
192
193    fn next(&mut self) -> Option<Self::Item> {
194        let (i, s) = self.inner.next()?;
195        Some((Symbol(i as u32), s.as_str()))
196    }
197}
198
199#[cfg(test)]
200mod tests {
201    use super::*;
202
203    #[test]
204    fn same_string_returns_same_symbol() {
205        let mut i = Interner::new();
206        let a = i.intern("hello");
207        let b = i.intern("hello");
208        assert_eq!(a, b);
209        assert_eq!(i.len(), 1);
210    }
211
212    #[test]
213    fn distinct_strings_return_distinct_symbols() {
214        let mut i = Interner::new();
215        let a = i.intern("alpha");
216        let b = i.intern("bravo");
217        assert_ne!(a, b);
218    }
219
220    #[test]
221    fn resolve_round_trips() {
222        let mut i = Interner::new();
223        let s = i.intern("round-trip");
224        assert_eq!(i.resolve(s), Some("round-trip"));
225    }
226
227    #[test]
228    fn lookup_does_not_insert() {
229        let mut i = Interner::new();
230        let _ = i.intern("first");
231        assert!(i.lookup("second").is_none());
232        assert_eq!(i.len(), 1);
233    }
234
235    #[test]
236    fn contains_reflects_state() {
237        let mut i = Interner::new();
238        assert!(!i.contains("x"));
239        let _ = i.intern("x");
240        assert!(i.contains("x"));
241    }
242
243    #[test]
244    fn iter_yields_insertion_order() {
245        let mut i = Interner::new();
246        let _ = i.intern("a");
247        let _ = i.intern("b");
248        let _ = i.intern("c");
249        let collected: Vec<&str> = i.iter().map(|(_, s)| s).collect();
250        assert_eq!(collected, vec!["a", "b", "c"]);
251    }
252}