Skip to main content

sui_intern/
lib.rs

1//! String interning for attribute names and identifiers.
2//!
3//! Converts heap-allocated `String` comparisons into cheap `u32`
4//! comparisons. Every unique string is stored exactly once; lookups
5//! and comparisons use the [`Symbol`] handle (a `u32` index).
6//!
7//! # Performance Impact
8//!
9//! Attrset key operations (`GetAttr`, `HasAttr`, `MakeAttrs`, `UpdateAttrs`)
10//! go from O(n) string comparison to O(1) integer comparison. This is
11//! the single highest-ROI optimization for Nix evaluation because
12//! nixpkgs is dominated by attrset operations.
13//!
14//! # Storage
15//!
16//! Strings are stored as `Rc<str>`. The forward map's key and the reverse
17//! `strings` vector share the same allocation — no double-allocation on
18//! intern, and `resolve_rc` returns a cheap `Rc::clone` instead of a
19//! full `String::from` copy. Hashing uses `FxHashMap` (rustc-hash) —
20//! ~2x faster than SipHash for the short strings typical of Nix keys
21//! and identifiers.
22//!
23//! # Thread-Local Helpers
24//!
25//! For convenience in single-threaded evaluation, this crate provides
26//! [`intern()`] and [`resolve()`] free functions that operate on a
27//! thread-local [`Interner`] instance. [`prewarm()`] pre-interns a
28//! curated set of hot nixpkgs symbols so common names get low indices
29//! and the hashmap's initial resize cost is paid upfront.
30
31use std::cell::RefCell;
32use std::fmt;
33use std::rc::Rc;
34
35use rustc_hash::FxHashMap;
36
37/// An interned string handle — a cheap, copyable, comparable token.
38///
39/// Two `Symbol`s are equal if and only if they refer to the same
40/// interned string. Comparison is a single `u32 ==` operation.
41#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
42pub struct Symbol(u32);
43
44impl Symbol {
45    /// Return the raw index for serialization / debugging.
46    #[must_use]
47    pub fn index(self) -> u32 {
48        self.0
49    }
50}
51
52impl fmt::Debug for Symbol {
53    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54        write!(f, "Symbol({})", self.0)
55    }
56}
57
58impl fmt::Display for Symbol {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        write!(f, "#{}", self.0)
61    }
62}
63
64/// A string interner that maps strings to [`Symbol`] handles.
65///
66/// Thread-local, single-owner. For a Nix evaluator this is sufficient
67/// because evaluation is single-threaded.
68#[derive(Clone, Default)]
69pub struct Interner {
70    /// Forward map: string content -> symbol. The `Rc<str>` key is
71    /// shared with the reverse `strings` vector, so interning allocates
72    /// the UTF-8 bytes exactly once.
73    map: FxHashMap<Rc<str>, Symbol>,
74    /// Reverse map: symbol index -> string content.
75    strings: Vec<Rc<str>>,
76}
77
78impl Interner {
79    /// Create a new, empty interner.
80    #[must_use]
81    pub fn new() -> Self {
82        Self {
83            map: FxHashMap::default(),
84            strings: Vec::new(),
85        }
86    }
87
88    /// Create an interner with pre-allocated capacity. Use when you
89    /// know the approximate identifier count upfront — avoids hashmap
90    /// resizes during the hot path.
91    #[must_use]
92    pub fn with_capacity(cap: usize) -> Self {
93        Self {
94            map: FxHashMap::with_capacity_and_hasher(cap, rustc_hash::FxBuildHasher::default()),
95            strings: Vec::with_capacity(cap),
96        }
97    }
98
99    /// Intern a string, returning its symbol. If the string was already
100    /// interned, returns the existing symbol (O(1) amortized).
101    pub fn intern(&mut self, s: &str) -> Symbol {
102        if let Some(&sym) = self.map.get(s) {
103            return sym;
104        }
105        let rc: Rc<str> = Rc::from(s);
106        let sym = Symbol(u32::try_from(self.strings.len()).expect("interner overflow"));
107        self.strings.push(Rc::clone(&rc));
108        self.map.insert(rc, sym);
109        sym
110    }
111
112    /// Resolve a symbol back to its string content.
113    ///
114    /// # Panics
115    ///
116    /// Panics if the symbol was not produced by this interner.
117    #[must_use]
118    pub fn resolve(&self, sym: Symbol) -> &str {
119        &self.strings[sym.0 as usize]
120    }
121
122    /// Resolve a symbol to its shared `Rc<str>` handle. Cheap to clone
123    /// and pass around; prefer this over `resolve(...).to_string()` in
124    /// hot paths.
125    ///
126    /// # Panics
127    ///
128    /// Panics if the symbol was not produced by this interner.
129    #[must_use]
130    pub fn resolve_rc(&self, sym: Symbol) -> Rc<str> {
131        Rc::clone(&self.strings[sym.0 as usize])
132    }
133
134    /// Try to resolve a symbol, returning `None` if invalid.
135    #[must_use]
136    pub fn try_resolve(&self, sym: Symbol) -> Option<&str> {
137        self.strings.get(sym.0 as usize).map(AsRef::as_ref)
138    }
139
140    /// Look up a symbol for a string without interning it.
141    #[must_use]
142    pub fn lookup(&self, s: &str) -> Option<Symbol> {
143        self.map.get(s).copied()
144    }
145
146    /// Return the number of interned strings.
147    #[must_use]
148    pub fn len(&self) -> usize {
149        self.strings.len()
150    }
151
152    /// Whether the interner is empty.
153    #[must_use]
154    pub fn is_empty(&self) -> bool {
155        self.strings.is_empty()
156    }
157}
158
159impl fmt::Debug for Interner {
160    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
161        write!(f, "Interner({} strings)", self.strings.len())
162    }
163}
164
165// -- Thread-local interner helpers --
166
167thread_local! {
168    static GLOBAL_INTERNER: RefCell<Interner> = RefCell::new(Interner::with_capacity(512));
169}
170
171/// Intern a string using the thread-local interner.
172pub fn intern(s: &str) -> Symbol {
173    GLOBAL_INTERNER.with(|i| i.borrow_mut().intern(s))
174}
175
176/// Resolve a symbol using the thread-local interner.
177///
178/// Allocates a fresh `String`. Prefer [`resolve_rc`] or [`with_resolved`]
179/// in hot paths — `Rc::clone` is ~20x cheaper than `String::from` for
180/// typical identifier lengths.
181#[must_use]
182pub fn resolve(sym: Symbol) -> String {
183    GLOBAL_INTERNER.with(|i| i.borrow().resolve(sym).to_string())
184}
185
186/// Resolve a symbol to a shared `Rc<str>` handle. Zero-copy.
187#[must_use]
188pub fn resolve_rc(sym: Symbol) -> Rc<str> {
189    GLOBAL_INTERNER.with(|i| i.borrow().resolve_rc(sym))
190}
191
192/// Borrow the resolved string inside a closure without allocating.
193/// The thread-local interner stays locked for the duration of `f`.
194pub fn with_resolved<F, R>(sym: Symbol, f: F) -> R
195where
196    F: FnOnce(&str) -> R,
197{
198    GLOBAL_INTERNER.with(|i| f(i.borrow().resolve(sym)))
199}
200
201/// Look up a symbol for a string in the thread-local interner without
202/// interning it.
203#[must_use]
204pub fn lookup(s: &str) -> Option<Symbol> {
205    GLOBAL_INTERNER.with(|i| i.borrow().lookup(s))
206}
207
208/// Intern the hot nixpkgs/flake/stdenv symbol set so they get low
209/// `Symbol` indices and the thread-local hashmap pays its first few
210/// resizes upfront instead of on the eval hot path.
211///
212/// Call this once per thread before the first eval. Idempotent —
213/// re-running is cheap because every symbol is a hashmap hit after
214/// the first pass.
215///
216/// The curated list covers the identifiers that dominate nixpkgs
217/// attribute access: derivation fields (`name`, `src`, `buildInputs`
218/// …), module-system glue (`config`, `options`, `mkOption`, `mkIf`
219/// …), flake schema (`inputs`, `outputs`, `description` …), meta
220/// (`meta`, `platforms`, `license` …), and the empty string used by
221/// string context machinery.
222pub fn prewarm() {
223    GLOBAL_INTERNER.with(|i| {
224        let mut guard = i.borrow_mut();
225        for s in HOT_SYMBOLS {
226            guard.intern(s);
227        }
228    });
229}
230
231/// Hot symbols pre-interned by [`prewarm`]. Order is load-bearing —
232/// the first entry gets Symbol(0), the second Symbol(1), etc.  Leave
233/// the empty string first so `Symbol(0)` means "empty" by convention.
234const HOT_SYMBOLS: &[&str] = &[
235    // Sentinels
236    "",
237    // Derivation fields
238    "name",
239    "pname",
240    "version",
241    "src",
242    "system",
243    "builder",
244    "args",
245    "outputs",
246    "out",
247    "dev",
248    "bin",
249    "man",
250    "doc",
251    "outputHash",
252    "outputHashAlgo",
253    "outputHashMode",
254    "passAsFile",
255    "preferLocalBuild",
256    "allowSubstitutes",
257    // Build dependencies
258    "buildInputs",
259    "nativeBuildInputs",
260    "propagatedBuildInputs",
261    "propagatedNativeBuildInputs",
262    "checkInputs",
263    "nativeCheckInputs",
264    "buildPhase",
265    "installPhase",
266    "configurePhase",
267    "patchPhase",
268    "unpackPhase",
269    // Module system
270    "config",
271    "options",
272    "imports",
273    "_module",
274    "mkOption",
275    "mkDefault",
276    "mkForce",
277    "mkIf",
278    "mkMerge",
279    "mkOverride",
280    "type",
281    "default",
282    "description",
283    "example",
284    "visible",
285    "internal",
286    "readOnly",
287    // Flake schema
288    "inputs",
289    "url",
290    "flake",
291    "follows",
292    "packages",
293    "devShells",
294    "apps",
295    "overlays",
296    "nixosModules",
297    "nixosConfigurations",
298    "darwinModules",
299    "darwinConfigurations",
300    "homeModules",
301    "homeConfigurations",
302    "templates",
303    "checks",
304    "formatter",
305    "legacyPackages",
306    // nixpkgs conventions
307    "nixpkgs",
308    "pkgs",
309    "stdenv",
310    "lib",
311    "hostPlatform",
312    "buildPlatform",
313    "targetPlatform",
314    "isDarwin",
315    "isLinux",
316    "x86_64-linux",
317    "aarch64-linux",
318    "x86_64-darwin",
319    "aarch64-darwin",
320    // Meta
321    "meta",
322    "platforms",
323    "homepage",
324    "license",
325    "maintainers",
326    "mainProgram",
327    "available",
328    "broken",
329    "insecure",
330    "unsupported",
331    // String context + laziness helpers
332    "recurseIntoAttrs",
333    "__functor",
334    "__toString",
335    "__ignoreNulls",
336    "outPath",
337    "drvPath",
338    "attrs",
339    "outputName",
340    // Common value identifiers
341    "true",
342    "false",
343    "null",
344];
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349
350    #[test]
351    fn intern_returns_same_symbol() {
352        let mut interner = Interner::new();
353        let s1 = interner.intern("hello");
354        let s2 = interner.intern("hello");
355        assert_eq!(s1, s2);
356    }
357
358    #[test]
359    fn different_strings_different_symbols() {
360        let mut interner = Interner::new();
361        let s1 = interner.intern("hello");
362        let s2 = interner.intern("world");
363        assert_ne!(s1, s2);
364    }
365
366    #[test]
367    fn resolve_roundtrip() {
368        let mut interner = Interner::new();
369        let sym = interner.intern("foo");
370        assert_eq!(interner.resolve(sym), "foo");
371    }
372
373    #[test]
374    fn resolve_rc_shares_allocation() {
375        let mut interner = Interner::new();
376        let sym = interner.intern("shared");
377        let a = interner.resolve_rc(sym);
378        let b = interner.resolve_rc(sym);
379        assert_eq!(&*a, "shared");
380        assert_eq!(&*b, "shared");
381        // Two handles point at the same allocation.
382        assert!(Rc::ptr_eq(&a, &b));
383    }
384
385    #[test]
386    fn lookup_existing() {
387        let mut interner = Interner::new();
388        let sym = interner.intern("bar");
389        assert_eq!(interner.lookup("bar"), Some(sym));
390    }
391
392    #[test]
393    fn lookup_missing() {
394        let interner = Interner::new();
395        assert_eq!(interner.lookup("missing"), None);
396    }
397
398    #[test]
399    fn len_and_empty() {
400        let mut interner = Interner::new();
401        assert!(interner.is_empty());
402        assert_eq!(interner.len(), 0);
403        interner.intern("a");
404        interner.intern("b");
405        interner.intern("a"); // duplicate
406        assert_eq!(interner.len(), 2);
407        assert!(!interner.is_empty());
408    }
409
410    #[test]
411    fn symbol_ordering() {
412        let mut interner = Interner::new();
413        let s1 = interner.intern("alpha");
414        let s2 = interner.intern("beta");
415        // Symbols are ordered by insertion order, not alphabetically.
416        assert!(s1 < s2);
417    }
418
419    #[test]
420    fn try_resolve_valid() {
421        let mut interner = Interner::new();
422        let sym = interner.intern("test");
423        assert_eq!(interner.try_resolve(sym), Some("test"));
424    }
425
426    #[test]
427    fn try_resolve_invalid() {
428        let interner = Interner::new();
429        assert_eq!(interner.try_resolve(Symbol(999)), None);
430    }
431
432    #[test]
433    fn clone_interner() {
434        let mut interner = Interner::new();
435        let s1 = interner.intern("hello");
436        let cloned = interner.clone();
437        assert_eq!(cloned.resolve(s1), "hello");
438        assert_eq!(cloned.len(), 1);
439    }
440
441    #[test]
442    fn with_capacity_preallocates() {
443        let interner = Interner::with_capacity(256);
444        assert!(interner.is_empty());
445        // capacity is not exposed on FxHashMap publicly but len should be 0
446        assert_eq!(interner.len(), 0);
447    }
448
449    // Thread-local helpers run inside `std::thread::spawn` so they
450    // can't collide with each other or with tests in sibling crates
451    // that already touched the global interner.
452    #[test]
453    fn thread_local_intern_resolve() {
454        std::thread::spawn(|| {
455            let sym = intern("thread_local_test");
456            let resolved = resolve(sym);
457            assert_eq!(resolved, "thread_local_test");
458        })
459        .join()
460        .unwrap();
461    }
462
463    #[test]
464    fn thread_local_intern_dedup() {
465        std::thread::spawn(|| {
466            let s1 = intern("dedup");
467            let s2 = intern("dedup");
468            assert_eq!(s1, s2);
469        })
470        .join()
471        .unwrap();
472    }
473
474    #[test]
475    fn thread_local_resolve_rc_zero_copy() {
476        std::thread::spawn(|| {
477            let sym = intern("tl_rc");
478            let a = resolve_rc(sym);
479            let b = resolve_rc(sym);
480            assert!(Rc::ptr_eq(&a, &b));
481        })
482        .join()
483        .unwrap();
484    }
485
486    #[test]
487    fn thread_local_with_resolved_no_alloc() {
488        std::thread::spawn(|| {
489            let sym = intern("borrowed");
490            let len = with_resolved(sym, str::len);
491            assert_eq!(len, "borrowed".len());
492        })
493        .join()
494        .unwrap();
495    }
496
497    #[test]
498    fn thread_local_lookup() {
499        std::thread::spawn(|| {
500            assert_eq!(lookup("never_interned_here"), None);
501            let sym = intern("findme");
502            assert_eq!(lookup("findme"), Some(sym));
503        })
504        .join()
505        .unwrap();
506    }
507
508    #[test]
509    fn prewarm_populates_hot_set() {
510        std::thread::spawn(|| {
511            // Fresh thread — interner starts empty, then prewarm fills it.
512            prewarm();
513            for &s in HOT_SYMBOLS {
514                assert!(
515                    lookup(s).is_some(),
516                    "prewarm should have interned {s:?}"
517                );
518            }
519            // Idempotent — second call shouldn't grow the interner.
520            let before = HOT_SYMBOLS.len();
521            prewarm();
522            // The thread-local state isn't directly inspectable here;
523            // instead, check that every hot symbol still resolves to
524            // the same index it got the first time.
525            let sym_first = lookup("name").expect("name interned by prewarm");
526            assert!(sym_first.index() < u32::try_from(before).unwrap());
527        })
528        .join()
529        .unwrap();
530    }
531
532    #[test]
533    fn hot_symbols_unique() {
534        // Sanity: no duplicates in the curated list, otherwise the
535        // "this symbol has the N-th index" reasoning breaks.
536        let mut seen = std::collections::HashSet::new();
537        for &s in HOT_SYMBOLS {
538            assert!(seen.insert(s), "HOT_SYMBOLS contains duplicate {s:?}");
539        }
540    }
541}