Skip to main content

arena_lib/
intern.rs

1//! String interner that hands out compact [`Symbol`] handles.
2//!
3//! [`Interner`] deduplicates owned [`String`] storage:
4//! every call to [`Interner::intern`] returns the same [`Symbol`] for the
5//! same input. Equality and hashing operate on the four-byte symbol rather
6//! than the underlying bytes, which is the win when the same identifier
7//! appears thousands of times across a workload.
8//!
9//! # Cost summary
10//!
11//! - `intern`: O(log n) on a first sight; O(log n) on a repeated sight.
12//! - `resolve`: O(1).
13//! - `len` / `is_empty`: O(1).
14//!
15//! The implementation uses an ordered map for the de-duplication index so
16//! that the crate stays `no_std`-compatible without pulling in `hashbrown`.
17//! Hash-backed lookup is planned for the 0.5 milestone.
18
19use alloc::collections::BTreeMap;
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use crate::error::{Error, Result};
24
25/// Compact handle returned by [`Interner::intern`].
26///
27/// A `Symbol` is a 4-byte `Copy` value. Two symbols compare equal if and
28/// only if the strings they refer to were interned by the **same**
29/// [`Interner`] from byte-identical inputs. Symbols are not transferable
30/// across interners.
31#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
32pub struct Symbol(u32);
33
34impl Symbol {
35    /// Returns the underlying numeric identifier.
36    ///
37    /// Useful for diagnostics or for serializing alongside an interner
38    /// snapshot. Do not synthesize `Symbol` values; obtain them from
39    /// [`Interner::intern`].
40    #[inline]
41    pub const fn id(self) -> u32 {
42        self.0
43    }
44}
45
46/// String interner. See the [module-level docs](self).
47///
48/// # Examples
49///
50/// ```
51/// use arena_lib::Interner;
52///
53/// let mut interner = Interner::new();
54/// let a = interner.intern("user:42");
55/// let b = interner.intern("user:42");
56/// let c = interner.intern("user:7");
57///
58/// assert_eq!(a, b);
59/// assert_ne!(a, c);
60/// assert_eq!(interner.resolve(a), Some("user:42"));
61/// assert_eq!(interner.len(), 2);
62/// ```
63pub struct Interner {
64    storage: Vec<String>,
65    lookup: BTreeMap<String, u32>,
66}
67
68impl Interner {
69    /// Creates an empty interner that performs no allocation up front.
70    #[inline]
71    #[must_use]
72    pub const fn new() -> Self {
73        Self {
74            storage: Vec::new(),
75            lookup: BTreeMap::new(),
76        }
77    }
78
79    /// Creates an empty interner with storage pre-reserved for `capacity`
80    /// distinct strings.
81    #[inline]
82    #[must_use]
83    pub fn with_capacity(capacity: usize) -> Self {
84        Self {
85            storage: Vec::with_capacity(capacity),
86            lookup: BTreeMap::new(),
87        }
88    }
89
90    /// Number of distinct strings currently interned.
91    #[inline]
92    #[must_use]
93    pub fn len(&self) -> usize {
94        self.storage.len()
95    }
96
97    /// Returns `true` if the interner holds no strings.
98    #[inline]
99    #[must_use]
100    pub fn is_empty(&self) -> bool {
101        self.storage.is_empty()
102    }
103
104    /// Interns `s` and returns its [`Symbol`].
105    ///
106    /// Idempotent: repeated calls with the same input return the same
107    /// symbol. Panics if the symbol counter would overflow `u32::MAX`
108    /// — use [`Interner::try_intern`] for the explicit fallible variant.
109    pub fn intern(&mut self, s: &str) -> Symbol {
110        match self.try_intern(s) {
111            Ok(sym) => sym,
112            Err(_) => panic!("interner symbol counter overflow (u32::MAX symbols)"),
113        }
114    }
115
116    /// Interns `s`, returning a [`Symbol`] on success or
117    /// [`Error::CounterOverflow`] if the interner cannot represent more
118    /// distinct strings.
119    pub fn try_intern(&mut self, s: &str) -> Result<Symbol> {
120        if let Some(&id) = self.lookup.get(s) {
121            return Ok(Symbol(id));
122        }
123        let id_usize = self.storage.len();
124        if id_usize > u32::MAX as usize {
125            return Err(Error::CounterOverflow);
126        }
127        let id = id_usize as u32;
128        let owned = String::from(s);
129        self.storage.push(owned.clone());
130        let _ = self.lookup.insert(owned, id);
131        Ok(Symbol(id))
132    }
133
134    /// Returns the original string for `symbol`, or `None` if the symbol
135    /// did not come from this interner.
136    #[inline]
137    #[must_use]
138    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
139        self.storage.get(symbol.0 as usize).map(String::as_str)
140    }
141
142    /// Returns `true` if `s` has already been interned.
143    ///
144    /// Equivalent to [`Interner::lookup`] returning `Some`.
145    #[inline]
146    #[must_use]
147    pub fn contains(&self, s: &str) -> bool {
148        self.lookup.contains_key(s)
149    }
150
151    /// Returns the symbol previously assigned to `s`, without inserting
152    /// a new entry if the string is unknown.
153    #[inline]
154    #[must_use]
155    pub fn lookup(&self, s: &str) -> Option<Symbol> {
156        self.lookup.get(s).copied().map(Symbol)
157    }
158
159    /// Iterator over `(Symbol, &str)` pairs for every interned string,
160    /// in insertion order.
161    pub fn iter(&self) -> Iter<'_> {
162        Iter {
163            inner: self.storage.iter().enumerate(),
164        }
165    }
166}
167
168impl Default for Interner {
169    #[inline]
170    fn default() -> Self {
171        Self::new()
172    }
173}
174
175impl core::fmt::Debug for Interner {
176    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
177        f.debug_struct("Interner")
178            .field("len", &self.storage.len())
179            .finish()
180    }
181}
182
183/// Iterator over `(Symbol, &str)` pairs returned by [`Interner::iter`].
184pub struct Iter<'a> {
185    inner: core::iter::Enumerate<core::slice::Iter<'a, String>>,
186}
187
188impl<'a> Iterator for Iter<'a> {
189    type Item = (Symbol, &'a str);
190
191    fn next(&mut self) -> Option<Self::Item> {
192        let (i, s) = self.inner.next()?;
193        Some((Symbol(i as u32), s.as_str()))
194    }
195}
196
197#[cfg(test)]
198mod tests {
199    use super::*;
200
201    #[test]
202    fn same_string_returns_same_symbol() {
203        let mut i = Interner::new();
204        let a = i.intern("hello");
205        let b = i.intern("hello");
206        assert_eq!(a, b);
207        assert_eq!(i.len(), 1);
208    }
209
210    #[test]
211    fn distinct_strings_return_distinct_symbols() {
212        let mut i = Interner::new();
213        let a = i.intern("alpha");
214        let b = i.intern("bravo");
215        assert_ne!(a, b);
216    }
217
218    #[test]
219    fn resolve_round_trips() {
220        let mut i = Interner::new();
221        let s = i.intern("round-trip");
222        assert_eq!(i.resolve(s), Some("round-trip"));
223    }
224
225    #[test]
226    fn lookup_does_not_insert() {
227        let mut i = Interner::new();
228        let _ = i.intern("first");
229        assert!(i.lookup("second").is_none());
230        assert_eq!(i.len(), 1);
231    }
232
233    #[test]
234    fn contains_reflects_state() {
235        let mut i = Interner::new();
236        assert!(!i.contains("x"));
237        let _ = i.intern("x");
238        assert!(i.contains("x"));
239    }
240
241    #[test]
242    fn iter_yields_insertion_order() {
243        let mut i = Interner::new();
244        let _ = i.intern("a");
245        let _ = i.intern("b");
246        let _ = i.intern("c");
247        let collected: Vec<&str> = i.iter().map(|(_, s)| s).collect();
248        assert_eq!(collected, vec!["a", "b", "c"]);
249    }
250}