nickel_lang_parser/
identifier.rs

1//! Define the type of an identifier.
2use serde::{Deserialize, Serialize};
3use std::{
4    borrow::Borrow,
5    fmt::{self, Debug},
6    hash::Hash,
7    sync::LazyLock,
8};
9
10use crate::{metrics::increment, position::TermPos};
11
12simple_counter::generate_counter!(GeneratedCounter, usize);
13static INTERNER: LazyLock<interner::Interner> = LazyLock::new(interner::Interner::new);
14
15/// An interned identifier.
16//
17// Implementation-wise, this is just a wrapper around interner::Symbol that uses a hard-coded,
18// static `Interner`.
19#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
20#[serde(into = "&'static str", from = "String")]
21pub struct Ident(interner::Symbol);
22
23impl Ident {
24    pub fn new(s: impl AsRef<str>) -> Self {
25        Self(INTERNER.intern(s.as_ref()))
26    }
27
28    /// Return the string representation of this identifier.
29    pub fn label(&self) -> &'static str {
30        INTERNER.lookup(self.0)
31    }
32
33    pub fn into_label(self) -> String {
34        self.label().to_owned()
35    }
36
37    /// Create a new fresh identifier. This identifier is unique and is guaranteed not to collide
38    /// with any identifier defined before. Generated identifiers start with a special prefix that
39    /// can't be used by normal, user-defined identifiers.
40    ///
41    /// FIXME: the comment above isn't true: the identifier isn't guaranteed to
42    /// be unique because there are ways to introduce identifiers that aren't
43    /// parsed as Nickel identifiers. For example, `std.record.insert "%1" 42 {}` causes
44    /// an identifier to be created with the name "%1".
45    pub fn fresh() -> Self {
46        increment!("Ident::fresh");
47        Self::new(format!("{}{}", GEN_PREFIX, GeneratedCounter::next()))
48    }
49
50    /// Attaches a position to this identifier, making it a `LocIdent`.
51    pub fn spanned(self, pos: TermPos) -> LocIdent {
52        LocIdent {
53            ident: self,
54            pos,
55            generated: self.label().starts_with(GEN_PREFIX),
56        }
57    }
58}
59
60impl fmt::Display for Ident {
61    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
62        write!(f, "{}", self.label())
63    }
64}
65
66impl fmt::Debug for Ident {
67    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68        write!(f, "`{}`", self.label())
69    }
70}
71
72impl From<Ident> for LocIdent {
73    fn from(ident: Ident) -> Self {
74        ident.spanned(TermPos::None)
75    }
76}
77
78impl From<&LocIdent> for Ident {
79    fn from(ident: &LocIdent) -> Self {
80        ident.ident()
81    }
82}
83
84impl PartialOrd for Ident {
85    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
86        Some(self.cmp(other))
87    }
88}
89
90impl Ord for Ident {
91    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
92        self.label().cmp(other.label())
93    }
94}
95
96impl<'a> From<&'a str> for Ident {
97    fn from(s: &'a str) -> Self {
98        Ident::new(s)
99    }
100}
101
102impl From<String> for Ident {
103    fn from(s: String) -> Self {
104        Ident::new(s)
105    }
106}
107
108impl From<Ident> for &'static str {
109    fn from(id: Ident) -> &'static str {
110        id.label()
111    }
112}
113
114/// An identifier with a location.
115///
116/// The location is ignored for equality comparison and hashing; it's mainly
117/// intended for error messages.
118#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
119#[serde(into = "String", from = "String")]
120pub struct LocIdent {
121    ident: Ident,
122    pub pos: TermPos,
123    generated: bool,
124}
125
126impl LocIdent {
127    pub fn new_with_pos(label: impl AsRef<str>, pos: TermPos) -> Self {
128        let generated = label.as_ref().starts_with(GEN_PREFIX);
129        Self {
130            ident: Ident::new(label),
131            pos,
132            generated,
133        }
134    }
135
136    pub fn new(label: impl AsRef<str>) -> Self {
137        Self::new_with_pos(label, TermPos::None)
138    }
139
140    /// Create an identifier with the same label as this one, but a specified position.
141    pub fn with_pos(self, pos: TermPos) -> LocIdent {
142        LocIdent { pos, ..self }
143    }
144
145    /// Create a fresh identifier with no position. See [Ident::fresh].
146    pub fn fresh() -> Self {
147        Ident::fresh().into()
148    }
149
150    /// Return the identifier without its position.
151    pub fn ident(&self) -> Ident {
152        self.ident
153    }
154
155    /// Return the string representation of this identifier.
156    pub fn label(&self) -> &'static str {
157        self.ident.label()
158    }
159
160    pub fn into_label(self) -> String {
161        self.label().to_owned()
162    }
163}
164
165/// Special character used for generating fresh identifiers. It must be syntactically impossible to
166/// use to write in a standard Nickel program, to avoid name clashes.
167pub const GEN_PREFIX: char = '%';
168
169impl PartialOrd for LocIdent {
170    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
171        Some(self.cmp(other))
172    }
173}
174
175impl Ord for LocIdent {
176    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
177        self.label().cmp(other.label())
178    }
179}
180
181impl PartialEq for LocIdent {
182    fn eq(&self, other: &Self) -> bool {
183        self.ident == other.ident
184    }
185}
186
187impl Eq for LocIdent {}
188
189impl Hash for LocIdent {
190    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
191        self.ident.hash(state)
192    }
193}
194
195impl Borrow<Ident> for LocIdent {
196    fn borrow(&self) -> &Ident {
197        &self.ident
198    }
199}
200
201impl fmt::Display for LocIdent {
202    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
203        write!(f, "{}", self.label())
204    }
205}
206
207/// Wrapper around [Ident] with a fast ordering function that only compares the underlying symbols.
208/// Useful when a bunch of idents need to be sorted for algorithmic reasons, but one doesn't need
209/// the actual natural order on strings nor care about the specific order.
210#[derive(Clone, Debug, Eq, PartialEq)]
211pub struct FastOrdIdent(pub Ident);
212
213impl PartialOrd for FastOrdIdent {
214    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
215        Some(self.cmp(other))
216    }
217}
218
219impl Ord for FastOrdIdent {
220    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
221        self.0.0.cmp(&other.0.0)
222    }
223}
224
225impl<'a> From<&'a str> for LocIdent {
226    fn from(s: &'a str) -> Self {
227        LocIdent::new(s)
228    }
229}
230
231impl From<String> for LocIdent {
232    fn from(s: String) -> Self {
233        LocIdent::new(s)
234    }
235}
236
237impl From<LocIdent> for &'static str {
238    fn from(id: LocIdent) -> &'static str {
239        id.label()
240    }
241}
242
243// TODO: among all the `From` impls here, this is the only one that allocates.
244// Allocations aren't forbidden in `From` (e.g. `String: From<&str>`), but it
245// would still be nice to get rid of this implicit allocation. It's mainly used
246// in `Term` right now.
247impl From<LocIdent> for String {
248    fn from(id: LocIdent) -> String {
249        id.label().to_owned()
250    }
251}
252
253impl LocIdent {
254    pub fn is_generated(&self) -> bool {
255        self.generated
256    }
257}
258
259impl AsRef<str> for LocIdent {
260    fn as_ref(&self) -> &str {
261        self.label()
262    }
263}
264
265mod interner {
266    use std::collections::HashMap;
267    use std::sync::{Mutex, RwLock};
268
269    use typed_arena::Arena;
270
271    /// A symbol is a correspondence between an [Ident](super::Ident) and its string representation
272    /// stored in the [Interner].
273    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
274    pub struct Symbol(u32);
275
276    /// The interner, which serves a double purpose: it pre-allocates space
277    /// so that [Ident](super::Ident) labels are created faster
278    /// and it makes it so that labels are stored only once, saving space.
279    pub(crate) struct Interner(RwLock<InnerInterner>);
280
281    impl Interner {
282        /// Creates an empty [Interner].
283        pub(crate) fn new() -> Self {
284            Self(RwLock::new(InnerInterner::empty()))
285        }
286
287        /// Stores a string inside the [Interner] if it does not exists, and returns the
288        /// corresponding [Symbol].
289        pub(crate) fn intern(&self, string: impl AsRef<str>) -> Symbol {
290            self.0.write().unwrap().intern(string)
291        }
292
293        /// Looks up the stored string corresponding to the [Symbol].
294        ///
295        /// This operation cannot fail since the only way to have a [Symbol] is to have
296        /// [interned](Interner::intern) the corresponding string first.
297        pub(crate) fn lookup(&self, sym: Symbol) -> &str {
298            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
299            // This is okay because InnerInterner::lookup guarantees stable references, and we
300            // never replace our InnerInterner.
301            unsafe { std::mem::transmute::<&'_ str, &'_ str>(self.0.read().unwrap().lookup(sym)) }
302        }
303    }
304
305    /// The main part of the Interner.
306    #[ouroboros::self_referencing]
307    struct InnerInterner {
308        /// Preallocates space where strings are stored.
309        arena: Mutex<Arena<u8>>,
310
311        /// Prevents the arena from creating different [Symbols](Symbol) for the same string.
312        #[borrows(arena)]
313        #[covariant]
314        map: HashMap<&'this str, Symbol>,
315
316        /// Allows retrieving a string from a [Symbol].
317        #[borrows(arena)]
318        #[covariant]
319        vec: Vec<&'this str>,
320    }
321
322    impl InnerInterner {
323        /// Creates an empty [InnerInterner].
324        fn empty() -> Self {
325            Self::new(
326                Mutex::new(Arena::new()),
327                |_arena| HashMap::new(),
328                |_arena| Vec::new(),
329            )
330        }
331
332        /// Stores a string inside the [InnerInterner] if it does not exists, and returns the
333        /// corresponding [Symbol].
334        fn intern(&mut self, string: impl AsRef<str>) -> Symbol {
335            if let Some(sym) = self.borrow_map().get(string.as_ref()) {
336                return *sym;
337            }
338            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
339            // This is okay because references to data in the arena are valid until the arena
340            // is destroyed.
341            let in_string = unsafe {
342                std::mem::transmute::<&'_ str, &'_ str>(
343                    self.borrow_arena()
344                        .lock()
345                        .unwrap()
346                        .alloc_str(string.as_ref()),
347                )
348            };
349            let sym = Symbol(self.borrow_vec().len() as u32);
350            self.with_vec_mut(|v| v.push(in_string));
351            self.with_map_mut(|m| m.insert(in_string, sym));
352            sym
353        }
354        /// Looks up for the stored string corresponding to the [Symbol].
355        ///
356        /// This operation cannot fail since the only way to have a [Symbol]
357        /// is to have [interned](InnerInterner::intern) the corresponding string first.
358        ///
359        /// References returned by this method are valid until this `InnerInterner` is
360        /// destroyed: they won't be invalidated by, for example, [`Self::intern`].
361        fn lookup(&self, sym: Symbol) -> &str {
362            self.borrow_vec()[sym.0 as usize]
363        }
364    }
365
366    #[cfg(test)]
367    mod tests {
368        use super::*;
369
370        #[test]
371        fn test_intern_then_lookup() {
372            let interner = Interner::new();
373            let test_string = "test_string";
374            let sym = interner.intern(test_string);
375            assert_eq!(interner.lookup(sym), test_string);
376        }
377
378        #[test]
379        fn test_intern_twice_has_same_symbol() {
380            let interner = Interner::new();
381            let test_string = "test_string";
382            let sym1 = interner.intern(test_string);
383            let sym2 = interner.intern(test_string);
384            assert_eq!(sym1, sym2);
385        }
386
387        #[test]
388        fn test_intern_two_different_has_different_symbols() {
389            let interner = Interner::new();
390            let sym1 = interner.intern("a");
391            let sym2 = interner.intern("b");
392            assert_ne!(sym1, sym2);
393        }
394
395        #[test]
396        fn test_large_number_of_interns() {
397            let interner = Interner::new();
398            for i in 0..10000 {
399                let i = i.to_string();
400                let sym = interner.intern(&i);
401                assert_eq!(i, interner.lookup(sym));
402            }
403            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
404            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
405            // doing the same a second time should not add anything to the interner
406            for i in 0..10000 {
407                let i = i.to_string();
408                let sym = interner.intern(&i);
409                assert_eq!(i, interner.lookup(sym));
410            }
411            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
412            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
413        }
414    }
415}