Skip to main content

tsz_common/
interner.rs

1//! String Interner for identifier deduplication.
2//!
3//! PERFORMANCE OPTIMIZATION: Intern strings into a global pool and pass around
4//! u32 indices (Atoms). This eliminates duplicate string allocations for common
5//! identifiers like "id", "value", "length", etc.
6//!
7//! Comparisons become integer comparisons (`atom_a` == `atom_b`) instead of string
8//! comparisons, which is significantly faster.
9
10use rustc_hash::{FxHashMap, FxHasher};
11use serde::Serialize;
12use std::hash::{Hash, Hasher};
13use std::sync::{Arc, RwLock};
14
15/// An interned string identifier.
16///
17/// Atoms are cheap to copy (just a u32) and can be compared with == in O(1).
18/// To get the actual string, use `Interner::resolve(atom)`.
19#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Default, PartialOrd, Ord)]
20pub struct Atom(pub u32);
21
22impl Atom {
23    /// A sentinel value representing no atom / empty string.
24    pub const NONE: Self = Self(0);
25
26    /// Returns `Atom::NONE` - used for serde default.
27    #[must_use]
28    #[inline]
29    pub const fn none() -> Self {
30        Self::NONE
31    }
32
33    /// Check if this is the empty/none atom.
34    #[must_use]
35    #[inline]
36    pub const fn is_none(self) -> bool {
37        self.0 == 0
38    }
39
40    /// Get the raw index value.
41    #[must_use]
42    #[inline]
43    pub const fn index(self) -> u32 {
44        self.0
45    }
46}
47
48const SHARD_BITS: u32 = 6;
49const SHARD_COUNT: usize = 64;
50const SHARD_MASK: u32 = 63;
51const SHARD_MASK_U64: u64 = 63;
52const COMMON_STRINGS: &[&str] = &[
53    // Keywords
54    "break",
55    "case",
56    "catch",
57    "class",
58    "const",
59    "continue",
60    "debugger",
61    "default",
62    "delete",
63    "do",
64    "else",
65    "enum",
66    "export",
67    "extends",
68    "false",
69    "finally",
70    "for",
71    "function",
72    "if",
73    "import",
74    "in",
75    "instanceof",
76    "new",
77    "null",
78    "return",
79    "super",
80    "switch",
81    "this",
82    "throw",
83    "true",
84    "try",
85    "typeof",
86    "undefined",
87    "var",
88    "void",
89    "while",
90    "with",
91    "as",
92    "implements",
93    "interface",
94    "let",
95    "package",
96    "private",
97    "protected",
98    "public",
99    "static",
100    "yield",
101    "any",
102    "boolean",
103    "number",
104    "string",
105    "symbol",
106    "type",
107    "from",
108    "of",
109    "async",
110    "await",
111    // Common identifiers
112    "id",
113    "name",
114    "value",
115    "length",
116    "key",
117    "index",
118    "item",
119    "data",
120    "error",
121    "result",
122    "response",
123    "request",
124    "options",
125    "config",
126    "props",
127    "state",
128    "children",
129    "onClick",
130    "onChange",
131    "onSubmit",
132    "constructor",
133    "prototype",
134    "toString",
135    "valueOf",
136    "hasOwnProperty",
137    "Array",
138    "Object",
139    "String",
140    "Number",
141    "Boolean",
142    "Function",
143    "Promise",
144    "Map",
145    "Set",
146    "Date",
147    "RegExp",
148    "Error",
149    "Symbol",
150    "console",
151    "log",
152    "warn",
153    "error",
154    "info",
155    "debug",
156    "document",
157    "window",
158    "global",
159    "process",
160    "module",
161    "exports",
162    "require",
163    "define",
164    "__dirname",
165    "__filename",
166];
167
168/// String interner that deduplicates strings and returns Atom handles.
169///
170/// # Example
171/// ```
172/// use tsz_common::interner::Interner;
173/// let mut interner = Interner::new();
174/// let a1 = interner.intern("hello");
175/// let a2 = interner.intern("hello");
176/// assert_eq!(a1, a2); // Same atom for same string
177/// assert_eq!(interner.resolve(a1), "hello");
178/// ```
179#[derive(Default, Clone, Debug)]
180pub struct Interner {
181    /// Map from string to atom index
182    map: FxHashMap<Arc<str>, Atom>,
183    /// Vector of all interned strings (index 0 is empty string)
184    strings: Vec<Arc<str>>,
185}
186
187impl Interner {
188    /// Create a new interner with the empty string pre-interned at index 0.
189    #[must_use]
190    pub fn new() -> Self {
191        let mut interner = Self {
192            map: FxHashMap::default(),
193            strings: Vec::with_capacity(1024), // Pre-allocate for common case
194        };
195        // Index 0 is reserved for empty/none
196        let empty: Arc<str> = Arc::from("");
197        interner.strings.push(Arc::clone(&empty));
198        interner.map.insert(empty, Atom::NONE);
199        interner
200    }
201
202    /// Intern a string, returning its Atom handle.
203    /// If the string was already interned, returns the existing Atom.
204    #[must_use]
205    #[inline]
206    pub fn intern(&mut self, s: &str) -> Atom {
207        if let Some(&atom) = self.map.get(s) {
208            return atom;
209        }
210        let atom = Atom(u32::try_from(self.strings.len()).unwrap_or(Atom::NONE.0));
211        let owned: Arc<str> = Arc::from(s);
212        self.strings.push(Arc::clone(&owned));
213        self.map.insert(owned, atom);
214        atom
215    }
216
217    /// Intern an owned String, avoiding allocation if possible.
218    #[must_use]
219    #[inline]
220    pub fn intern_owned(&mut self, s: String) -> Atom {
221        if let Some(&atom) = self.map.get(s.as_str()) {
222            return atom;
223        }
224        let atom = Atom(u32::try_from(self.strings.len()).unwrap_or(Atom::NONE.0));
225        let owned: Arc<str> = Arc::from(s.into_boxed_str());
226        self.strings.push(Arc::clone(&owned));
227        self.map.insert(owned, atom);
228        atom
229    }
230
231    /// Resolve an Atom back to its string value.
232    /// Returns empty string if atom is out of bounds (safety for error recovery).
233    #[must_use]
234    #[inline]
235    pub fn resolve(&self, atom: Atom) -> &str {
236        self.strings.get(atom.0 as usize).map_or("", AsRef::as_ref)
237    }
238
239    /// Try to resolve an Atom, returning None if invalid.
240    #[must_use]
241    #[inline]
242    pub fn try_resolve(&self, atom: Atom) -> Option<&str> {
243        self.strings.get(atom.0 as usize).map(AsRef::as_ref)
244    }
245
246    /// Get the number of interned strings.
247    #[must_use]
248    #[inline]
249    pub const fn len(&self) -> usize {
250        self.strings.len()
251    }
252
253    /// Check if the interner is empty (only has the empty string).
254    #[must_use]
255    #[inline]
256    pub const fn is_empty(&self) -> bool {
257        self.strings.len() <= 1
258    }
259
260    /// Pre-intern common TypeScript keywords and identifiers.
261    /// Call this after creating the interner for better cache locality.
262    pub fn intern_common(&mut self) {
263        for s in COMMON_STRINGS {
264            let _ = self.intern(s);
265        }
266    }
267}
268
269#[derive(Default)]
270struct ShardState {
271    map: FxHashMap<Arc<str>, Atom>,
272    strings: Vec<Arc<str>>,
273}
274
275struct InternerShard {
276    state: RwLock<ShardState>,
277}
278
279impl InternerShard {
280    #[must_use]
281    fn new() -> Self {
282        Self {
283            state: RwLock::new(ShardState::default()),
284        }
285    }
286}
287
288/// Sharded string interner for concurrent use.
289///
290/// Uses fixed buckets to reduce lock contention while keeping Atom lookups O(1).
291pub struct ShardedInterner {
292    shards: [InternerShard; SHARD_COUNT],
293}
294
295impl ShardedInterner {
296    /// Create a new sharded interner with the empty string pre-interned at index 0.
297    #[must_use]
298    pub fn new() -> Self {
299        let shards = std::array::from_fn(|_| InternerShard::new());
300
301        // Initialize empty string in shard 0 with safe lock handling
302        if let Ok(mut state) = shards[0].state.write() {
303            let empty: Arc<str> = Arc::from("");
304            state.strings.push(Arc::clone(&empty));
305            state.map.insert(empty, Atom::NONE);
306        }
307        // Note: If lock is poisoned during initialization, we continue anyway
308        // The empty string initialization is an optimization, not critical for correctness
309
310        Self { shards }
311    }
312
313    /// Intern a string, returning its Atom handle.
314    /// If the string was already interned, returns the existing Atom.
315    #[must_use]
316    #[inline]
317    pub fn intern(&self, s: &str) -> Atom {
318        if s.is_empty() {
319            return Atom::NONE;
320        }
321
322        let shard_idx = Self::shard_for(s);
323        let shard = &self.shards[shard_idx];
324        let Ok(mut state) = shard.state.write() else {
325            // If lock is poisoned, return a fallback atom
326            // This maintains availability even if internal state is corrupted
327            return Atom::NONE;
328        };
329
330        if let Some(&atom) = state.map.get(s) {
331            return atom;
332        }
333
334        let Ok(local_index) = u32::try_from(state.strings.len()) else {
335            return Atom::NONE;
336        };
337        if local_index > (u32::MAX >> SHARD_BITS) {
338            // Return empty atom on overflow instead of panicking
339            return Atom::NONE;
340        }
341
342        let shard_idx_u32 = u32::try_from(shard_idx).unwrap_or(Atom::NONE.0);
343        let atom = Self::make_atom(local_index, shard_idx_u32);
344        let owned: Arc<str> = Arc::from(s);
345        state.strings.push(Arc::clone(&owned));
346        state.map.insert(owned, atom);
347        atom
348    }
349
350    /// Intern an owned String, avoiding allocation if possible.
351    #[must_use]
352    #[inline]
353    pub fn intern_owned(&self, s: String) -> Atom {
354        if s.is_empty() {
355            return Atom::NONE;
356        }
357
358        let shard_idx = Self::shard_for(&s);
359        let shard = &self.shards[shard_idx];
360        let Ok(mut state) = shard.state.write() else {
361            // If lock is poisoned, return a fallback atom
362            return Atom::NONE;
363        };
364
365        if let Some(&atom) = state.map.get(s.as_str()) {
366            return atom;
367        }
368
369        let Ok(local_index) = u32::try_from(state.strings.len()) else {
370            return Atom::NONE;
371        };
372        if local_index > (u32::MAX >> SHARD_BITS) {
373            // Return empty atom on overflow instead of panicking
374            return Atom::NONE;
375        }
376
377        let shard_idx_u32 = u32::try_from(shard_idx).unwrap_or(Atom::NONE.0);
378        let atom = Self::make_atom(local_index, shard_idx_u32);
379        let owned: Arc<str> = Arc::from(s);
380        state.strings.push(Arc::clone(&owned));
381        state.map.insert(owned, atom);
382        atom
383    }
384
385    /// Resolve an Atom back to its string value.
386    /// Returns empty string if atom is out of bounds (safety for error recovery).
387    #[must_use]
388    #[inline]
389    pub fn resolve(&self, atom: Atom) -> Arc<str> {
390        self.try_resolve(atom).unwrap_or_else(|| Arc::from(""))
391    }
392
393    /// Try to resolve an Atom, returning None if invalid.
394    #[must_use]
395    #[inline]
396    pub fn try_resolve(&self, atom: Atom) -> Option<Arc<str>> {
397        let (shard_idx, local_index) = Self::split_atom(atom);
398        let shard = self.shards.get(shard_idx)?;
399        let state = shard.state.read().ok()?; // Return None if lock is poisoned
400        state.strings.get(local_index).cloned()
401    }
402
403    /// Get the number of interned strings.
404    #[must_use]
405    #[inline]
406    pub fn len(&self) -> usize {
407        self.shards
408            .iter()
409            .map(|shard| {
410                // Handle lock poisoning gracefully by returning 0 for failed shards
411                shard
412                    .state
413                    .read()
414                    .map(|state| state.strings.len())
415                    .unwrap_or(0)
416            })
417            .sum()
418    }
419
420    /// Check if the interner is empty (only has the empty string).
421    #[must_use]
422    #[inline]
423    pub fn is_empty(&self) -> bool {
424        self.len() <= 1
425    }
426
427    /// Pre-intern common TypeScript keywords and identifiers.
428    /// Call this after creating the interner for better cache locality.
429    pub fn intern_common(&self) {
430        for s in COMMON_STRINGS {
431            let _ = self.intern(s);
432        }
433    }
434
435    #[inline]
436    fn shard_for(s: &str) -> usize {
437        let mut hasher = FxHasher::default();
438        s.hash(&mut hasher);
439        usize::try_from(hasher.finish() & SHARD_MASK_U64).unwrap_or(0)
440    }
441
442    #[inline]
443    const fn make_atom(local_index: u32, shard_idx: u32) -> Atom {
444        Atom((local_index << SHARD_BITS) | (shard_idx & SHARD_MASK))
445    }
446
447    #[inline]
448    fn split_atom(atom: Atom) -> (usize, usize) {
449        if atom == Atom::NONE {
450            return (0, 0);
451        }
452
453        let raw = atom.0;
454        let shard_idx = usize::try_from(raw & SHARD_MASK).unwrap_or(0);
455        let local_index = usize::try_from(raw >> SHARD_BITS).unwrap_or(0);
456        (shard_idx, local_index)
457    }
458}
459
460impl Default for ShardedInterner {
461    fn default() -> Self {
462        Self::new()
463    }
464}