Skip to main content

nickel_lang_parser/
identifier.rs

1//! Define the type of an identifier.
2use serde::{Deserialize, Serialize};
3use std::{
4    borrow::Borrow,
5    fmt::{self, Debug},
6    hash::Hash,
7    sync::{
8        LazyLock,
9        atomic::{AtomicUsize, Ordering},
10    },
11};
12
13use crate::{metrics::increment, position::TermPos};
14
15static INTERNER: LazyLock<interner::Interner> = LazyLock::new(interner::Interner::new);
16static COUNTER: AtomicUsize = AtomicUsize::new(0);
17
18/// An interned identifier.
19//
20// Implementation-wise, this is just a wrapper around interner::Symbol that uses a hard-coded,
21// static `Interner`.
22#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(into = "&'static str", from = "String")]
24pub struct Ident(interner::Symbol);
25
26impl Ident {
27    pub fn new(s: impl AsRef<str>) -> Self {
28        Self(INTERNER.get_or_intern(s.as_ref()))
29    }
30
31    /// Return the string representation of this identifier.
32    pub fn label(&self) -> &'static str {
33        INTERNER.lookup(self.0)
34    }
35
36    pub fn into_label(self) -> String {
37        self.label().to_owned()
38    }
39
40    /// Look up a generated identifier by name, panicking if it doesn't exist.
41    ///
42    /// This is extremely slow because it scans over all symbols. It's only used
43    /// for tests that look at pretty-printed output.
44    ///
45    /// Public only because we use it in tests outside of `nickel_lang_parser`.
46    #[doc(hidden)]
47    pub fn find_generated(s: &str) -> Self {
48        Self(INTERNER.find_generated(s))
49    }
50
51    /// Create a new fresh identifier. This identifier is unique and is
52    /// guaranteed not to collide with any identifier defined before.
53    ///
54    /// Generated identifiers start with a special prefix that isn't valid
55    /// for Nickel identifiers. This doesn't actually guarantee that the
56    /// [label][Self::label] of a fresh identifier will always be different
57    /// from the label of a normal identifier created by [`Ident::new`]: there
58    /// are ways to introduce normal identifiers that aren't parsed as Nickel
59    /// identifiers (for example, `std.record.insert "%1" 42 {}` causes an
60    /// identifier to be created with the name "%1").
61    ///
62    /// The consequence of all this is that different `Ident`s can have the
63    /// same label. `Ident::new("%1")` and `Ident::fresh()` might both generate
64    /// idents with label "%1", but they will be different idents.
65    pub fn fresh() -> Self {
66        increment!("Ident::fresh");
67        Self(INTERNER.intern_generated(format!(
68            "{}{}",
69            GEN_PREFIX,
70            COUNTER.fetch_add(1, Ordering::Relaxed)
71        )))
72    }
73
74    /// Attaches a position to this identifier, making it a `LocIdent`.
75    pub fn spanned(self, pos: TermPos) -> LocIdent {
76        LocIdent { ident: self, pos }
77    }
78
79    /// Checks whether this identifier was generated with [`Ident::fresh`].
80    pub fn is_generated(&self) -> bool {
81        INTERNER.is_generated(self.0)
82    }
83}
84
85impl fmt::Display for Ident {
86    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
87        write!(f, "{}", self.label())
88    }
89}
90
91impl fmt::Debug for Ident {
92    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93        if self.is_generated() {
94            write!(f, "`{}` (generated)", self.label())
95        } else {
96            write!(f, "`{}`", self.label())
97        }
98    }
99}
100
101impl From<Ident> for LocIdent {
102    fn from(ident: Ident) -> Self {
103        ident.spanned(TermPos::None)
104    }
105}
106
107impl From<&LocIdent> for Ident {
108    fn from(ident: &LocIdent) -> Self {
109        ident.ident()
110    }
111}
112
113impl PartialOrd for Ident {
114    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
115        Some(self.cmp(other))
116    }
117}
118
119impl Ord for Ident {
120    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
121        self.label().cmp(other.label())
122    }
123}
124
125impl<'a> From<&'a str> for Ident {
126    fn from(s: &'a str) -> Self {
127        Ident::new(s)
128    }
129}
130
131impl From<String> for Ident {
132    fn from(s: String) -> Self {
133        Ident::new(s)
134    }
135}
136
137impl From<Ident> for &'static str {
138    fn from(id: Ident) -> &'static str {
139        id.label()
140    }
141}
142
143/// An identifier with a location.
144///
145/// The location is ignored for equality comparison and hashing; it's mainly
146/// intended for error messages.
147#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
148#[serde(into = "String", from = "String")]
149pub struct LocIdent {
150    ident: Ident,
151    pub pos: TermPos,
152}
153
154impl LocIdent {
155    pub fn new_with_pos(label: impl AsRef<str>, pos: TermPos) -> Self {
156        Self {
157            ident: Ident::new(label),
158            pos,
159        }
160    }
161
162    pub fn new(label: impl AsRef<str>) -> Self {
163        Self::new_with_pos(label, TermPos::None)
164    }
165
166    /// Create an identifier with the same label as this one, but a specified position.
167    pub fn with_pos(self, pos: TermPos) -> LocIdent {
168        LocIdent { pos, ..self }
169    }
170
171    /// Create a fresh identifier with no position. See [Ident::fresh].
172    pub fn fresh() -> Self {
173        Ident::fresh().into()
174    }
175
176    /// Return the identifier without its position.
177    pub fn ident(&self) -> Ident {
178        self.ident
179    }
180
181    /// Return the string representation of this identifier.
182    pub fn label(&self) -> &'static str {
183        self.ident.label()
184    }
185
186    pub fn into_label(self) -> String {
187        self.label().to_owned()
188    }
189
190    /// Checks whether this identifier was generated by [`Ident::fresh`].
191    ///
192    /// Note that this is not optimized for speed: it involves taking a lock
193    /// and looking up a table.
194    pub fn is_generated(&self) -> bool {
195        self.ident.is_generated()
196    }
197}
198
199/// Special character used for generating fresh identifiers. It must be syntactically impossible to
200/// use to write in a standard Nickel program, to avoid name clashes.
201pub const GEN_PREFIX: char = '%';
202
203impl PartialOrd for LocIdent {
204    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
205        Some(self.cmp(other))
206    }
207}
208
209impl Ord for LocIdent {
210    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
211        self.label().cmp(other.label())
212    }
213}
214
215impl PartialEq for LocIdent {
216    fn eq(&self, other: &Self) -> bool {
217        self.ident == other.ident
218    }
219}
220
221impl Eq for LocIdent {}
222
223impl Hash for LocIdent {
224    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
225        self.ident.hash(state)
226    }
227}
228
229impl Borrow<Ident> for LocIdent {
230    fn borrow(&self) -> &Ident {
231        &self.ident
232    }
233}
234
235impl fmt::Display for LocIdent {
236    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
237        write!(f, "{}", self.label())
238    }
239}
240
241/// Wrapper around [Ident] with a fast ordering function that only compares the underlying symbols.
242/// Useful when a bunch of idents need to be sorted for algorithmic reasons, but one doesn't need
243/// the actual natural order on strings nor care about the specific order.
244#[derive(Clone, Debug, Eq, PartialEq)]
245pub struct FastOrdIdent(pub Ident);
246
247impl PartialOrd for FastOrdIdent {
248    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
249        Some(self.cmp(other))
250    }
251}
252
253impl Ord for FastOrdIdent {
254    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
255        self.0.0.cmp(&other.0.0)
256    }
257}
258
259impl<'a> From<&'a str> for LocIdent {
260    fn from(s: &'a str) -> Self {
261        LocIdent::new(s)
262    }
263}
264
265impl From<String> for LocIdent {
266    fn from(s: String) -> Self {
267        LocIdent::new(s)
268    }
269}
270
271impl From<LocIdent> for &'static str {
272    fn from(id: LocIdent) -> &'static str {
273        id.label()
274    }
275}
276
277// TODO: among all the `From` impls here, this is the only one that allocates.
278// Allocations aren't forbidden in `From` (e.g. `String: From<&str>`), but it
279// would still be nice to get rid of this implicit allocation. It's mainly used
280// in `Term` right now.
281impl From<LocIdent> for String {
282    fn from(id: LocIdent) -> String {
283        id.label().to_owned()
284    }
285}
286
287impl AsRef<str> for LocIdent {
288    fn as_ref(&self) -> &str {
289        self.label()
290    }
291}
292
293mod interner {
294    use std::collections::HashMap;
295    use std::sync::{Mutex, RwLock};
296
297    use typed_arena::Arena;
298
299    use super::Bitmap;
300
301    /// A symbol is a correspondence between an [Ident](super::Ident) and its string representation
302    /// stored in the [Interner].
303    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
304    pub struct Symbol(u32);
305
306    /// The interner, which serves a double purpose: it pre-allocates space
307    /// so that [Ident](super::Ident) labels are created faster
308    /// and it makes it so that labels are stored only once, saving space.
309    pub(crate) struct Interner(RwLock<InnerInterner>);
310
311    impl Interner {
312        /// Creates an empty [Interner].
313        pub(crate) fn new() -> Self {
314            Self(RwLock::new(InnerInterner::empty()))
315        }
316
317        /// Stores a string inside the [Interner] if it does not exists, and returns the
318        /// corresponding [Symbol].
319        pub(crate) fn get_or_intern(&self, string: impl AsRef<str>) -> Symbol {
320            self.0.write().unwrap().get_or_intern(string)
321        }
322
323        /// Stores the name of a generated string in the [Interner].
324        ///
325        /// This always generates a new symbol and does not check for duplicate strings.
326        pub(crate) fn intern_generated(&self, string: impl AsRef<str>) -> Symbol {
327            self.0.write().unwrap().intern(string, true)
328        }
329
330        /// Looks up the stored string corresponding to the [Symbol].
331        ///
332        /// This operation cannot fail since the only way to have a [Symbol] is to have
333        /// [interned](Interner::intern) the corresponding string first.
334        pub(crate) fn lookup(&self, sym: Symbol) -> &str {
335            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
336            // This is okay because InnerInterner::lookup guarantees stable references, and we
337            // never replace our InnerInterner.
338            unsafe { std::mem::transmute::<&'_ str, &'_ str>(self.0.read().unwrap().lookup(sym)) }
339        }
340
341        /// Look up a generated identifier by name, panicking if it doesn't exist.
342        ///
343        /// This is extremely slow because it scans over all symbols. It's only used
344        /// for tests that look at pretty-printed output.
345        pub(crate) fn find_generated(&self, s: &str) -> Symbol {
346            let inner = self.0.read().unwrap();
347            inner.with(|inner| {
348                let idx = (0..inner.vec.len())
349                    .find(|&idx| inner.generated[idx] && inner.vec[idx] == s)
350                    .unwrap();
351                Symbol(idx as u32)
352            })
353        }
354
355        pub(crate) fn is_generated(&self, sym: Symbol) -> bool {
356            self.0.read().unwrap().is_generated(sym)
357        }
358    }
359
360    /// The main part of the Interner.
361    #[ouroboros::self_referencing]
362    struct InnerInterner {
363        /// Preallocates space where strings are stored.
364        arena: Mutex<Arena<u8>>,
365
366        /// Prevents the arena from creating different [Symbols](Symbol) for the same string.
367        #[borrows(arena)]
368        #[covariant]
369        map: HashMap<&'this str, Symbol>,
370
371        /// Allows retrieving a string from a [Symbol].
372        #[borrows(arena)]
373        #[covariant]
374        vec: Vec<&'this str>,
375
376        /// Allows checking whether an identifier was generated.
377        generated: Bitmap,
378    }
379
380    impl InnerInterner {
381        /// Creates an empty [InnerInterner].
382        fn empty() -> Self {
383            Self::new(
384                Mutex::new(Arena::new()),
385                |_arena| HashMap::new(),
386                |_arena| Vec::new(),
387                Bitmap::default(),
388            )
389        }
390
391        /// Stores a string inside the [InnerInterner] if it does not exists, and returns the
392        /// corresponding [Symbol].
393        fn get_or_intern(&mut self, string: impl AsRef<str>) -> Symbol {
394            if let Some(sym) = self.borrow_map().get(string.as_ref()) {
395                return *sym;
396            }
397            self.intern(string, false)
398        }
399
400        /// Interns a string without checking for deduplication.
401        fn intern(&mut self, string: impl AsRef<str>, generated: bool) -> Symbol {
402            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
403            // This is okay because references to data in the arena are valid until the arena
404            // is destroyed.
405            let in_string = unsafe {
406                std::mem::transmute::<&'_ str, &'_ str>(
407                    self.borrow_arena()
408                        .lock()
409                        .unwrap()
410                        .alloc_str(string.as_ref()),
411                )
412            };
413            let sym = Symbol(self.borrow_vec().len() as u32);
414            self.with_vec_mut(|v| v.push(in_string));
415            self.with_generated_mut(|g| g.push(generated));
416            // We only insert non-generated ids into the map, because we don't
417            // want deduplication to ever hit a generated id: if someone does
418            // `Ident::new("%0") they should get a non-generated id.
419            if !generated {
420                self.with_map_mut(|m| m.insert(in_string, sym));
421            }
422            sym
423        }
424
425        /// Looks up for the stored string corresponding to the [Symbol].
426        ///
427        /// This operation cannot fail since the only way to have a [Symbol]
428        /// is to have [interned](InnerInterner::intern) the corresponding string first.
429        ///
430        /// References returned by this method are valid until this `InnerInterner` is
431        /// destroyed: they won't be invalidated by, for example, [`Self::intern`].
432        fn lookup(&self, sym: Symbol) -> &str {
433            self.borrow_vec()[sym.0 as usize]
434        }
435
436        fn is_generated(&self, sym: Symbol) -> bool {
437            self.borrow_generated()[sym.0 as usize]
438        }
439    }
440
441    #[cfg(test)]
442    mod tests {
443        use super::*;
444
445        #[test]
446        fn test_intern_then_lookup() {
447            let interner = Interner::new();
448            let test_string = "test_string";
449            let sym = interner.get_or_intern(test_string);
450            assert_eq!(interner.lookup(sym), test_string);
451        }
452
453        #[test]
454        fn test_intern_twice_has_same_symbol() {
455            let interner = Interner::new();
456            let test_string = "test_string";
457            let sym1 = interner.get_or_intern(test_string);
458            let sym2 = interner.get_or_intern(test_string);
459            assert_eq!(sym1, sym2);
460        }
461
462        #[test]
463        fn test_intern_two_different_has_different_symbols() {
464            let interner = Interner::new();
465            let sym1 = interner.get_or_intern("a");
466            let sym2 = interner.get_or_intern("b");
467            assert_ne!(sym1, sym2);
468        }
469
470        #[test]
471        fn test_large_number_of_interns() {
472            let interner = Interner::new();
473            for i in 0..10000 {
474                let i = i.to_string();
475                let sym = interner.get_or_intern(&i);
476                assert_eq!(i, interner.lookup(sym));
477            }
478            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
479            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
480            // doing the same a second time should not add anything to the interner
481            for i in 0..10000 {
482                let i = i.to_string();
483                let sym = interner.get_or_intern(&i);
484                assert_eq!(i, interner.lookup(sym));
485            }
486            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
487            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
488        }
489    }
490}
491
492/// A basic bitmap that's more space-efficient than a `Vec<bool>`, but much simpler
493/// than (and not size-bounded like) the `bitmaps` crate.
494#[derive(Default)]
495struct Bitmap {
496    /// Our first element is at `self.data[0] & 0b1`, our next element is
497    /// at `self.data[0] & 0b10`, and so on.
498    data: Vec<u64>,
499    /// The size of this bitmap in bits. This will always be between
500    /// `self.data.len() * 8 - 7` and `self.data.len() * 8` inclusive.
501    len: usize,
502}
503
504impl Bitmap {
505    /// Breaks down a location into an index (in `Bitmap::data`) and a bitmask.
506    fn location(idx: usize) -> (usize, u64) {
507        let shift = (idx % 64) as u32;
508        (idx / 64, 1 << shift)
509    }
510
511    fn push(&mut self, val: bool) {
512        let (idx, mask) = Bitmap::location(self.len);
513        if idx >= self.data.len() {
514            debug_assert_eq!(idx, self.data.len());
515            self.data.push(0);
516        }
517        if val {
518            self.data[idx] |= mask;
519        }
520        self.len += 1;
521    }
522}
523
524impl std::ops::Index<usize> for Bitmap {
525    type Output = bool;
526
527    fn index(&self, idx: usize) -> &bool {
528        let (idx, mask) = Bitmap::location(idx);
529        if (self.data[idx] & mask) == 0 {
530            &false
531        } else {
532            &true
533        }
534    }
535}
536
537#[cfg(test)]
538mod tests {
539    use super::Bitmap;
540
541    #[test]
542    fn bitmap_basics() {
543        let mut b = Bitmap::default();
544        b.push(true);
545        b.push(false);
546        assert!(b[0]);
547        assert!(!b[1]);
548
549        for _ in 0..64 {
550            b.push(true);
551        }
552        b.push(false);
553
554        assert!(b[63]);
555        assert!(b[64]);
556        assert!(b[65]);
557        assert!(!b[66]);
558    }
559}