arena_lib/intern.rs
1//! String interner that hands out compact [`Symbol`] handles.
2//!
3//! [`Interner`] deduplicates owned [`String`] storage:
4//! every call to [`Interner::intern`] returns the same [`Symbol`] for the
5//! same input. Equality and hashing operate on the four-byte symbol rather
6//! than the underlying bytes, which is the win when the same identifier
7//! appears thousands of times across a workload.
8//!
9//! # Cost summary
10//!
11//! - `intern`: expected O(1) on first sight; expected O(1) on repeated sight.
12//! - `resolve`: O(1).
13//! - `lookup` / `contains`: expected O(1).
14//! - `len` / `is_empty`: O(1).
15//!
16//! The implementation uses [`hashbrown::HashMap`] for the de-duplication
17//! index, keeping `intern` / `lookup` at expected O(1) while still
18//! compiling under `no_std`.
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use hashbrown::HashMap;
24
25use crate::error::{Error, Result};
26
27/// Compact handle returned by [`Interner::intern`].
28///
29/// A `Symbol` is a 4-byte `Copy` value. **Within a single interner**, two
30/// symbols compare equal if and only if the strings they refer to were
31/// interned from byte-identical inputs.
32///
33/// Symbols are scoped to the [`Interner`] that issued them: id values
34/// collide across separate interners (both start at 0, etc.). Passing a
35/// foreign symbol to [`Interner::resolve`] is undefined at the API
36/// contract level — the call may return `None`, or it may return an
37/// unrelated string. Treat `Symbol` values as opaque handles tied to a
38/// single interner.
39#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
40pub struct Symbol(u32);
41
42impl Symbol {
43 /// Returns the underlying numeric identifier.
44 ///
45 /// Useful for diagnostics or for serializing alongside an interner
46 /// snapshot. Do not synthesize `Symbol` values; obtain them from
47 /// [`Interner::intern`].
48 #[inline]
49 pub const fn id(self) -> u32 {
50 self.0
51 }
52}
53
54/// String interner. See the [module-level docs](self).
55///
56/// # Examples
57///
58/// ```
59/// use arena_lib::Interner;
60///
61/// let mut interner = Interner::new();
62/// let a = interner.intern("user:42");
63/// let b = interner.intern("user:42");
64/// let c = interner.intern("user:7");
65///
66/// assert_eq!(a, b);
67/// assert_ne!(a, c);
68/// assert_eq!(interner.resolve(a), Some("user:42"));
69/// assert_eq!(interner.len(), 2);
70/// ```
71pub struct Interner {
72 storage: Vec<String>,
73 lookup: HashMap<String, u32>,
74}
75
76impl Interner {
77 /// Creates an empty interner that performs no allocation up front.
78 #[inline]
79 #[must_use]
80 pub fn new() -> Self {
81 Self {
82 storage: Vec::new(),
83 lookup: HashMap::new(),
84 }
85 }
86
87 /// Creates an empty interner with storage pre-reserved for `capacity`
88 /// distinct strings.
89 #[inline]
90 #[must_use]
91 pub fn with_capacity(capacity: usize) -> Self {
92 Self {
93 storage: Vec::with_capacity(capacity),
94 lookup: HashMap::with_capacity(capacity),
95 }
96 }
97
98 /// Number of distinct strings currently interned.
99 #[inline]
100 #[must_use]
101 pub fn len(&self) -> usize {
102 self.storage.len()
103 }
104
105 /// Returns `true` if the interner holds no strings.
106 #[inline]
107 #[must_use]
108 pub fn is_empty(&self) -> bool {
109 self.storage.is_empty()
110 }
111
112 /// Interns `s` and returns its [`Symbol`].
113 ///
114 /// Idempotent: repeated calls with the same input return the same
115 /// symbol. Panics if the symbol counter would overflow `u32::MAX`
116 /// — use [`Interner::try_intern`] for the explicit fallible variant.
117 pub fn intern(&mut self, s: &str) -> Symbol {
118 match self.try_intern(s) {
119 Ok(sym) => sym,
120 Err(_) => panic!("interner symbol counter overflow (u32::MAX symbols)"),
121 }
122 }
123
124 /// Interns `s`, returning a [`Symbol`] on success or
125 /// [`Error::CounterOverflow`] if the interner cannot represent more
126 /// distinct strings.
127 pub fn try_intern(&mut self, s: &str) -> Result<Symbol> {
128 if let Some(&id) = self.lookup.get(s) {
129 return Ok(Symbol(id));
130 }
131 let id_usize = self.storage.len();
132 if id_usize > u32::MAX as usize {
133 return Err(Error::CounterOverflow);
134 }
135 let id = id_usize as u32;
136 let owned = String::from(s);
137 self.storage.push(owned.clone());
138 let _ = self.lookup.insert(owned, id);
139 Ok(Symbol(id))
140 }
141
142 /// Returns the original string for `symbol`, or `None` if the
143 /// symbol's id is out of range for this interner.
144 ///
145 /// Passing a symbol issued by a *different* interner is undefined
146 /// behaviour at the API contract level: the call may return `None`
147 /// (if the foreign id is past this interner's length) or it may
148 /// return some unrelated string (if the foreign id collides with an
149 /// in-range local id). Treat [`Symbol`] values as opaque handles
150 /// scoped to the [`Interner`] that issued them.
151 ///
152 /// # Examples
153 ///
154 /// ```
155 /// use arena_lib::Interner;
156 ///
157 /// let mut interner = Interner::new();
158 /// let sym = interner.intern("payload");
159 /// assert_eq!(interner.resolve(sym), Some("payload"));
160 /// ```
161 #[inline]
162 #[must_use]
163 pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
164 self.storage.get(symbol.0 as usize).map(String::as_str)
165 }
166
167 /// Returns `true` if `s` has already been interned.
168 ///
169 /// Equivalent to [`Interner::lookup`] returning `Some`.
170 #[inline]
171 #[must_use]
172 pub fn contains(&self, s: &str) -> bool {
173 self.lookup.contains_key(s)
174 }
175
176 /// Returns the symbol previously assigned to `s`, without inserting
177 /// a new entry if the string is unknown.
178 ///
179 /// Use this when you want to *probe* the interner without growing
180 /// it — for example, to enforce that only pre-registered identifiers
181 /// are accepted.
182 ///
183 /// # Examples
184 ///
185 /// ```
186 /// use arena_lib::Interner;
187 ///
188 /// let mut interner = Interner::new();
189 /// let known = interner.intern("known");
190 ///
191 /// assert_eq!(interner.lookup("known"), Some(known));
192 /// assert_eq!(interner.lookup("unknown"), None);
193 /// assert_eq!(interner.len(), 1, "lookup must not insert");
194 /// ```
195 #[inline]
196 #[must_use]
197 pub fn lookup(&self, s: &str) -> Option<Symbol> {
198 self.lookup.get(s).copied().map(Symbol)
199 }
200
201 /// Iterator over `(Symbol, &str)` pairs for every interned string,
202 /// in insertion order.
203 pub fn iter(&self) -> Iter<'_> {
204 Iter {
205 inner: self.storage.iter().enumerate(),
206 }
207 }
208}
209
210impl Default for Interner {
211 #[inline]
212 fn default() -> Self {
213 Self::new()
214 }
215}
216
217impl core::fmt::Debug for Interner {
218 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
219 f.debug_struct("Interner")
220 .field("len", &self.storage.len())
221 .finish()
222 }
223}
224
225/// Iterator over `(Symbol, &str)` pairs returned by [`Interner::iter`].
226pub struct Iter<'a> {
227 inner: core::iter::Enumerate<core::slice::Iter<'a, String>>,
228}
229
230impl<'a> Iterator for Iter<'a> {
231 type Item = (Symbol, &'a str);
232
233 fn next(&mut self) -> Option<Self::Item> {
234 let (i, s) = self.inner.next()?;
235 Some((Symbol(i as u32), s.as_str()))
236 }
237}
238
239#[cfg(test)]
240mod tests {
241 use super::*;
242
243 #[test]
244 fn same_string_returns_same_symbol() {
245 let mut i = Interner::new();
246 let a = i.intern("hello");
247 let b = i.intern("hello");
248 assert_eq!(a, b);
249 assert_eq!(i.len(), 1);
250 }
251
252 #[test]
253 fn distinct_strings_return_distinct_symbols() {
254 let mut i = Interner::new();
255 let a = i.intern("alpha");
256 let b = i.intern("bravo");
257 assert_ne!(a, b);
258 }
259
260 #[test]
261 fn resolve_round_trips() {
262 let mut i = Interner::new();
263 let s = i.intern("round-trip");
264 assert_eq!(i.resolve(s), Some("round-trip"));
265 }
266
267 #[test]
268 fn lookup_does_not_insert() {
269 let mut i = Interner::new();
270 let _ = i.intern("first");
271 assert!(i.lookup("second").is_none());
272 assert_eq!(i.len(), 1);
273 }
274
275 #[test]
276 fn contains_reflects_state() {
277 let mut i = Interner::new();
278 assert!(!i.contains("x"));
279 let _ = i.intern("x");
280 assert!(i.contains("x"));
281 }
282
283 #[test]
284 fn iter_yields_insertion_order() {
285 let mut i = Interner::new();
286 let _ = i.intern("a");
287 let _ = i.intern("b");
288 let _ = i.intern("c");
289 let collected: Vec<&str> = i.iter().map(|(_, s)| s).collect();
290 assert_eq!(collected, vec!["a", "b", "c"]);
291 }
292}