nickel_lang_core/
identifier.rs

1//! Define the type of an identifier.
2use once_cell::sync::Lazy;
3use serde::{Deserialize, Serialize};
4use std::{
5    borrow::Borrow,
6    fmt::{self, Debug},
7    hash::Hash,
8};
9
10use crate::{metrics::increment, position::TermPos, term::string::NickelString};
11
12simple_counter::generate_counter!(GeneratedCounter, usize);
13static INTERNER: Lazy<interner::Interner> = Lazy::new(interner::Interner::new);
14
15/// An interned identifier.
16//
17// Implementation-wise, this is just a wrapper around interner::Symbol that uses a hard-coded,
18// static `Interner`.
19#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
20#[serde(into = "&'static str", from = "String")]
21pub struct Ident(interner::Symbol);
22
23impl Ident {
24    pub fn new(s: impl AsRef<str>) -> Self {
25        Self(INTERNER.intern(s.as_ref()))
26    }
27
28    /// Return the string representation of this identifier.
29    pub fn label(&self) -> &'static str {
30        INTERNER.lookup(self.0)
31    }
32
33    pub fn into_label(self) -> String {
34        self.label().to_owned()
35    }
36
37    /// Create a new fresh identifier. This identifier is unique and is guaranteed not to collide
38    /// with any identifier defined before. Generated identifiers start with a special prefix that
39    /// can't be used by normal, user-defined identifiers.
40    ///
41    /// FIXME: the comment above isn't true: the identifier isn't guaranteed to
42    /// be unique because there are ways to introduce identifiers that aren't
43    /// parsed as Nickel identifiers. For example, `std.record.insert "%1" 42 {}` causes
44    /// an identifier to be created with the name "%1".
45    pub fn fresh() -> Self {
46        increment!("Ident::fresh");
47        Self::new(format!("{}{}", GEN_PREFIX, GeneratedCounter::next()))
48    }
49
50    /// Attaches a position to this identifier, making it a `LocIdent`.
51    pub fn spanned(self, pos: TermPos) -> LocIdent {
52        LocIdent {
53            ident: self,
54            pos,
55            generated: self.label().starts_with(GEN_PREFIX),
56        }
57    }
58}
59
60impl fmt::Display for Ident {
61    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
62        write!(f, "{}", self.label())
63    }
64}
65
66impl fmt::Debug for Ident {
67    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68        write!(f, "`{}`", self.label())
69    }
70}
71
72impl From<Ident> for LocIdent {
73    fn from(ident: Ident) -> Self {
74        ident.spanned(TermPos::None)
75    }
76}
77
78impl From<&LocIdent> for Ident {
79    fn from(ident: &LocIdent) -> Self {
80        ident.ident()
81    }
82}
83
84impl PartialOrd for Ident {
85    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
86        Some(self.cmp(other))
87    }
88}
89
90impl Ord for Ident {
91    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
92        self.label().cmp(other.label())
93    }
94}
95
96impl From<Ident> for NickelString {
97    fn from(sym: Ident) -> Self {
98        sym.to_string().into()
99    }
100}
101
102impl<'a> From<&'a str> for Ident {
103    fn from(s: &'a str) -> Self {
104        Ident::new(s)
105    }
106}
107
108impl From<String> for Ident {
109    fn from(s: String) -> Self {
110        Ident::new(s)
111    }
112}
113
114impl From<Ident> for &'static str {
115    fn from(id: Ident) -> &'static str {
116        id.label()
117    }
118}
119
120/// An identifier with a location.
121///
122/// The location is ignored for equality comparison and hashing; it's mainly
123/// intended for error messages.
124#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
125#[serde(into = "String", from = "String")]
126pub struct LocIdent {
127    ident: Ident,
128    pub pos: TermPos,
129    generated: bool,
130}
131
132impl LocIdent {
133    pub fn new_with_pos(label: impl AsRef<str>, pos: TermPos) -> Self {
134        let generated = label.as_ref().starts_with(GEN_PREFIX);
135        Self {
136            ident: Ident::new(label),
137            pos,
138            generated,
139        }
140    }
141
142    pub fn new(label: impl AsRef<str>) -> Self {
143        Self::new_with_pos(label, TermPos::None)
144    }
145
146    /// Create an identifier with the same label as this one, but a specified position.
147    pub fn with_pos(self, pos: TermPos) -> LocIdent {
148        LocIdent { pos, ..self }
149    }
150
151    /// Create a fresh identifier with no position. See [Ident::fresh].
152    pub fn fresh() -> Self {
153        Ident::fresh().into()
154    }
155
156    /// Return the identifier without its position.
157    pub fn ident(&self) -> Ident {
158        self.ident
159    }
160
161    /// Return the string representation of this identifier.
162    pub fn label(&self) -> &'static str {
163        self.ident.label()
164    }
165
166    pub fn into_label(self) -> String {
167        self.label().to_owned()
168    }
169}
170
171/// Special character used for generating fresh identifiers. It must be syntactically impossible to
172/// use to write in a standard Nickel program, to avoid name clashes.
173pub const GEN_PREFIX: char = '%';
174
175impl PartialOrd for LocIdent {
176    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
177        Some(self.cmp(other))
178    }
179}
180
181impl Ord for LocIdent {
182    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
183        self.label().cmp(other.label())
184    }
185}
186
187impl PartialEq for LocIdent {
188    fn eq(&self, other: &Self) -> bool {
189        self.ident == other.ident
190    }
191}
192
193impl Eq for LocIdent {}
194
195impl Hash for LocIdent {
196    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
197        self.ident.hash(state)
198    }
199}
200
201impl Borrow<Ident> for LocIdent {
202    fn borrow(&self) -> &Ident {
203        &self.ident
204    }
205}
206
207impl fmt::Display for LocIdent {
208    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
209        write!(f, "{}", self.label())
210    }
211}
212
213/// Wrapper around [Ident] with a fast ordering function that only compares the underlying symbols.
214/// Useful when a bunch of idents need to be sorted for algorithmic reasons, but one doesn't need
215/// the actual natural order on strings nor care about the specific order.
216#[derive(Clone, Debug, Eq, PartialEq)]
217pub struct FastOrdIdent(pub Ident);
218
219impl PartialOrd for FastOrdIdent {
220    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
221        Some(self.cmp(other))
222    }
223}
224
225impl Ord for FastOrdIdent {
226    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
227        self.0 .0.cmp(&other.0 .0)
228    }
229}
230
231impl<'a> From<&'a str> for LocIdent {
232    fn from(s: &'a str) -> Self {
233        LocIdent::new(s)
234    }
235}
236
237impl From<String> for LocIdent {
238    fn from(s: String) -> Self {
239        LocIdent::new(s)
240    }
241}
242
243impl From<LocIdent> for &'static str {
244    fn from(id: LocIdent) -> &'static str {
245        id.label()
246    }
247}
248
249// TODO: among all the `From` impls here, this is the only one that allocates.
250// Allocations aren't forbidden in `From` (e.g. `String: From<&str>`), but it
251// would still be nice to get rid of this implicit allocation. It's mainly used
252// in `Term` right now.
253impl From<LocIdent> for String {
254    fn from(id: LocIdent) -> String {
255        id.label().to_owned()
256    }
257}
258
259impl LocIdent {
260    pub fn is_generated(&self) -> bool {
261        self.generated
262    }
263}
264
265impl AsRef<str> for LocIdent {
266    fn as_ref(&self) -> &str {
267        self.label()
268    }
269}
270
271mod interner {
272    use std::collections::HashMap;
273    use std::sync::{Mutex, RwLock};
274
275    use typed_arena::Arena;
276
277    /// A symbol is a correspondence between an [Ident](super::Ident) and its string representation
278    /// stored in the [Interner].
279    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
280    pub struct Symbol(u32);
281
282    /// The interner, which serves a double purpose: it pre-allocates space
283    /// so that [Ident](super::Ident) labels are created faster
284    /// and it makes it so that labels are stored only once, saving space.
285    pub(crate) struct Interner(RwLock<InnerInterner>);
286
287    impl Interner {
288        /// Creates an empty [Interner].
289        pub(crate) fn new() -> Self {
290            Self(RwLock::new(InnerInterner::empty()))
291        }
292
293        /// Stores a string inside the [Interner] if it does not exists, and returns the
294        /// corresponding [Symbol].
295        pub(crate) fn intern(&self, string: impl AsRef<str>) -> Symbol {
296            self.0.write().unwrap().intern(string)
297        }
298
299        /// Looks up the stored string corresponding to the [Symbol].
300        ///
301        /// This operation cannot fail since the only way to have a [Symbol] is to have
302        /// [interned](Interner::intern) the corresponding string first.
303        pub(crate) fn lookup(&self, sym: Symbol) -> &str {
304            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
305            // This is okay because InnerInterner::lookup guarantees stable references, and we
306            // never replace our InnerInterner.
307            unsafe { std::mem::transmute::<&'_ str, &'_ str>(self.0.read().unwrap().lookup(sym)) }
308        }
309    }
310
311    /// The main part of the Interner.
312    #[ouroboros::self_referencing]
313    struct InnerInterner {
314        /// Preallocates space where strings are stored.
315        arena: Mutex<Arena<u8>>,
316
317        /// Prevents the arena from creating different [Symbols](Symbol) for the same string.
318        #[borrows(arena)]
319        #[covariant]
320        map: HashMap<&'this str, Symbol>,
321
322        /// Allows retrieving a string from a [Symbol].
323        #[borrows(arena)]
324        #[covariant]
325        vec: Vec<&'this str>,
326    }
327
328    impl InnerInterner {
329        /// Creates an empty [InnerInterner].
330        fn empty() -> Self {
331            Self::new(
332                Mutex::new(Arena::new()),
333                |_arena| HashMap::new(),
334                |_arena| Vec::new(),
335            )
336        }
337
338        /// Stores a string inside the [InnerInterner] if it does not exists, and returns the
339        /// corresponding [Symbol].
340        fn intern(&mut self, string: impl AsRef<str>) -> Symbol {
341            if let Some(sym) = self.borrow_map().get(string.as_ref()) {
342                return *sym;
343            }
344            // SAFETY: Here we are transmuting the reference lifetime: &'lock str -> &'slf str.
345            // This is okay because references to data in the arena are valid until the arena
346            // is destroyed.
347            let in_string = unsafe {
348                std::mem::transmute::<&'_ str, &'_ str>(
349                    self.borrow_arena()
350                        .lock()
351                        .unwrap()
352                        .alloc_str(string.as_ref()),
353                )
354            };
355            let sym = Symbol(self.borrow_vec().len() as u32);
356            self.with_vec_mut(|v| v.push(in_string));
357            self.with_map_mut(|m| m.insert(in_string, sym));
358            sym
359        }
360        /// Looks up for the stored string corresponding to the [Symbol].
361        ///
362        /// This operation cannot fail since the only way to have a [Symbol]
363        /// is to have [interned](InnerInterner::intern) the corresponding string first.
364        ///
365        /// References returned by this method are valid until this `InnerInterner` is
366        /// destroyed: they won't be invalidated by, for example, [`Self::intern`].
367        fn lookup(&self, sym: Symbol) -> &str {
368            self.borrow_vec()[sym.0 as usize]
369        }
370    }
371
372    #[cfg(test)]
373    mod tests {
374        use super::*;
375
376        #[test]
377        fn test_intern_then_lookup() {
378            let interner = Interner::new();
379            let test_string = "test_string";
380            let sym = interner.intern(test_string);
381            assert_eq!(interner.lookup(sym), test_string);
382        }
383
384        #[test]
385        fn test_intern_twice_has_same_symbol() {
386            let interner = Interner::new();
387            let test_string = "test_string";
388            let sym1 = interner.intern(test_string);
389            let sym2 = interner.intern(test_string);
390            assert_eq!(sym1, sym2);
391        }
392
393        #[test]
394        fn test_intern_two_different_has_different_symbols() {
395            let interner = Interner::new();
396            let sym1 = interner.intern("a");
397            let sym2 = interner.intern("b");
398            assert_ne!(sym1, sym2);
399        }
400
401        #[test]
402        fn test_large_number_of_interns() {
403            let interner = Interner::new();
404            for i in 0..10000 {
405                let i = i.to_string();
406                let sym = interner.intern(&i);
407                assert_eq!(i, interner.lookup(sym));
408            }
409            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
410            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
411            // doing the same a second time should not add anything to the interner
412            for i in 0..10000 {
413                let i = i.to_string();
414                let sym = interner.intern(&i);
415                assert_eq!(i, interner.lookup(sym));
416            }
417            assert_eq!(10000, interner.0.read().unwrap().borrow_map().len());
418            assert_eq!(10000, interner.0.read().unwrap().borrow_vec().len());
419        }
420    }
421}