Skip to main content

obeli_sk_boa_interner/
lib.rs

1//! Boa's **`boa_interner`** is a string interner for compiler performance.
2//!
3//! # Crate Overview
4//!
5//! The idea behind using a string interner is that in most of the code, strings such as
6//! identifiers and literals are often repeated. This causes extra burden when comparing them and
7//! storing them. A string interner stores a unique `usize` symbol for each string, making sure
8//! that there are no duplicates. This makes it much easier to compare, since it's just comparing
9//! to `usize`, and also it's easier to store, since instead of a heap-allocated string, you only
10//! need to store a `usize`. This reduces memory consumption and improves performance in the
11//! compiler.
12#![doc = include_str!("../ABOUT.md")]
13#![doc(
14    html_logo_url = "https://raw.githubusercontent.com/boa-dev/boa/main/assets/logo_black.svg",
15    html_favicon_url = "https://raw.githubusercontent.com/boa-dev/boa/main/assets/logo_black.svg"
16)]
17#![cfg_attr(not(test), forbid(clippy::unwrap_used))]
18#![allow(
19    clippy::redundant_pub_crate,
20    // TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
21    clippy::trait_duplication_in_bounds,
22    // Field names intentionally mirror the encoding type they store.
23    clippy::struct_field_names
24)]
25#![cfg_attr(not(feature = "arbitrary"), no_std)]
26
27extern crate alloc;
28
29mod fixed_string;
30mod interned_str;
31mod raw;
32mod sym;
33
34#[cfg(test)]
35mod tests;
36
37use alloc::{borrow::Cow, format, string::String, vec::Vec};
38use raw::RawInterner;
39
40pub use sym::*;
41
42/// An enumeration of all slice types [`Interner`] can internally store.
43///
44/// This struct allows us to intern either `UTF-8` or `UTF-16` str references, which are the two
45/// encodings [`Interner`] can store.
46#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
47pub enum JStrRef<'a> {
48    /// A `UTF-8` string reference.
49    Utf8(&'a str),
50
51    /// A `UTF-16` string reference.
52    Utf16(&'a [u16]),
53}
54
55impl<'a> From<&'a str> for JStrRef<'a> {
56    fn from(s: &'a str) -> Self {
57        JStrRef::Utf8(s)
58    }
59}
60
61impl<'a> From<&'a [u16]> for JStrRef<'a> {
62    fn from(s: &'a [u16]) -> Self {
63        JStrRef::Utf16(s)
64    }
65}
66
67impl<'a, const N: usize> From<&'a [u16; N]> for JStrRef<'a> {
68    fn from(s: &'a [u16; N]) -> Self {
69        JStrRef::Utf16(s)
70    }
71}
72
73/// A double reference to an interned string inside [`Interner`].
74///
75/// [`JSInternedStrRef::utf8`] returns an [`Option`], since not every `UTF-16` string is fully
76/// representable as a `UTF-8` string (because of unpaired surrogates). However, every `UTF-8`
77/// string is representable as a `UTF-16` string, so `JSInternedStrRef::utf8` returns a
78/// [<code>&\[u16\]</code>][core::slice].
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
80pub struct JSInternedStrRef<'a, 'b> {
81    utf8: Option<&'a str>,
82    utf16: &'b [u16],
83}
84
85impl<'a, 'b> JSInternedStrRef<'a, 'b> {
86    /// Returns the inner reference to the interned string in `UTF-8` encoding.
87    /// if the string is not representable in `UTF-8`, returns [`None`]
88    ///
89    /// # Examples
90    ///
91    /// ```
92    /// use boa_interner::Interner;
93    ///
94    /// let mut interner = Interner::new();
95    /// let sym = interner.get_or_intern("hello");
96    /// let interned = interner.resolve_expect(sym);
97    /// assert_eq!(interned.utf8(), Some("hello"));
98    /// ```
99    #[inline]
100    #[must_use]
101    pub const fn utf8(&self) -> Option<&'a str> {
102        self.utf8
103    }
104
105    /// Returns the inner reference to the interned string in `UTF-16` encoding.
106    ///
107    /// # Examples
108    ///
109    /// ```
110    /// use boa_interner::Interner;
111    ///
112    /// let mut interner = Interner::new();
113    /// let sym = interner.get_or_intern("hello");
114    /// let interned = interner.resolve_expect(sym);
115    /// let utf16: Vec<u16> = "hello".encode_utf16().collect();
116    /// assert_eq!(interned.utf16(), utf16.as_slice());
117    /// ```
118    #[inline]
119    #[must_use]
120    pub const fn utf16(&self) -> &'b [u16] {
121        self.utf16
122    }
123
124    /// Joins the result of both possible strings into a common type.
125    ///
126    /// If `self` is representable by a `UTF-8` string and the `prioritize_utf8` argument is set,
127    /// it will prioritize calling `f`, and will only call `g` if `self` is only representable by a
128    /// `UTF-16` string. Otherwise, it will directly call `g`.
129    ///
130    /// # Examples
131    ///
132    /// ```
133    /// use boa_interner::Interner;
134    ///
135    /// let mut interner = Interner::new();
136    /// let sym = interner.get_or_intern("hello");
137    /// let interned = interner.resolve_expect(sym);
138    /// let result = interned.join(
139    ///     |utf8| utf8.to_uppercase(),
140    ///     |utf16| String::from_utf16_lossy(utf16).to_uppercase(),
141    ///     true,
142    /// );
143    /// assert_eq!(result, "HELLO");
144    /// ```
145    pub fn join<F, G, T>(self, f: F, g: G, prioritize_utf8: bool) -> T
146    where
147        F: FnOnce(&'a str) -> T,
148        G: FnOnce(&'b [u16]) -> T,
149    {
150        if prioritize_utf8 && let Some(str) = self.utf8 {
151            return f(str);
152        }
153        g(self.utf16)
154    }
155
156    /// Same as [`join`][`JSInternedStrRef::join`], but where you can pass an additional context.
157    ///
158    /// Useful when you have a `&mut Context` context that cannot be borrowed by both closures at
159    /// the same time.
160    ///
161    /// # Examples
162    ///
163    /// ```
164    /// use boa_interner::Interner;
165    ///
166    /// let mut interner = Interner::new();
167    /// let sym = interner.get_or_intern("hello");
168    /// let interned = interner.resolve_expect(sym);
169    /// let mut output = String::new();
170    /// interned.join_with_context(
171    ///     |utf8, buf: &mut String| buf.push_str(&utf8.to_uppercase()),
172    ///     |utf16, buf: &mut String| buf.push_str(&String::from_utf16_lossy(utf16).to_uppercase()),
173    ///     &mut output,
174    ///     true,
175    /// );
176    /// assert_eq!(output, "HELLO");
177    /// ```
178    pub fn join_with_context<C, F, G, T>(self, f: F, g: G, ctx: C, prioritize_utf8: bool) -> T
179    where
180        F: FnOnce(&'a str, C) -> T,
181        G: FnOnce(&'b [u16], C) -> T,
182    {
183        if prioritize_utf8 && let Some(str) = self.utf8 {
184            return f(str, ctx);
185        }
186        g(self.utf16, ctx)
187    }
188
189    /// Converts both string types into a common type `C`.
190    ///
191    /// If `self` is representable by a `UTF-8` string and the `prioritize_utf8` argument is set, it
192    /// will prioritize converting its `UTF-8` representation first, and will only convert its
193    /// `UTF-16` representation if it is only representable by a `UTF-16` string. Otherwise, it will
194    /// directly convert its `UTF-16` representation.
195    ///
196    /// # Examples
197    ///
198    /// ```
199    /// use boa_interner::Interner;
200    ///
201    /// enum JsString<'a> {
202    ///     Utf8(&'a str),
203    ///     Utf16(&'a [u16]),
204    /// }
205    ///
206    /// impl<'a> From<&'a str> for JsString<'a> {
207    ///     fn from(s: &'a str) -> Self {
208    ///         JsString::Utf8(s)
209    ///     }
210    /// }
211    ///
212    /// impl<'a> From<&'a [u16]> for JsString<'a> {
213    ///     fn from(s: &'a [u16]) -> Self {
214    ///         JsString::Utf16(s)
215    ///     }
216    /// }
217    ///
218    /// let mut interner = Interner::new();
219    /// let sym = interner.get_or_intern("hello");
220    /// let interned = interner.resolve_expect(sym);
221    /// let result: JsString<'_> = interned.into_common(true);
222    /// assert!(matches!(result, JsString::Utf8("hello")));
223    /// ```
224    pub fn into_common<C>(self, prioritize_utf8: bool) -> C
225    where
226        C: From<&'a str> + From<&'b [u16]>,
227    {
228        self.join(Into::into, Into::into, prioritize_utf8)
229    }
230}
231
232impl core::fmt::Display for JSInternedStrRef<'_, '_> {
233    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
234        self.join_with_context(
235            core::fmt::Display::fmt,
236            |js, f| {
237                char::decode_utf16(js.iter().copied())
238                    .map(|r| match r {
239                        Ok(c) => String::from(c),
240                        Err(e) => format!("\\u{:04X}", e.unpaired_surrogate()),
241                    })
242                    .collect::<String>()
243                    .fmt(f)
244            },
245            f,
246            true,
247        )
248    }
249}
250
251/// The string interner for Boa.
252#[derive(Debug, Default)]
253pub struct Interner {
254    utf8_interner: RawInterner<u8>,
255    utf16_interner: RawInterner<u16>,
256    /// Latin1-encodability cache for dynamically-interned strings (all code units ≤ 0xFF).
257    latin1_flags: Vec<bool>,
258}
259
260impl Interner {
261    /// Creates a new [`Interner`].
262    ///
263    /// # Examples
264    ///
265    /// ```
266    /// use boa_interner::Interner;
267    ///
268    /// let mut interner = Interner::new();
269    /// let sym = interner.get_or_intern("hello");
270    /// assert!(interner.resolve(sym).is_some());
271    /// ```
272    #[inline]
273    #[must_use]
274    pub fn new() -> Self {
275        Self::default()
276    }
277
278    /// Creates a new [`Interner`] with the specified capacity.
279    ///
280    /// # Examples
281    ///
282    /// ```
283    /// use boa_interner::Interner;
284    ///
285    /// let mut interner = Interner::with_capacity(10);
286    /// let sym = interner.get_or_intern("hello");
287    /// assert!(interner.resolve(sym).is_some());
288    /// ```
289    #[inline]
290    #[must_use]
291    pub fn with_capacity(capacity: usize) -> Self {
292        Self {
293            utf8_interner: RawInterner::with_capacity(capacity),
294            utf16_interner: RawInterner::with_capacity(capacity),
295            latin1_flags: Vec::with_capacity(capacity),
296        }
297    }
298
299    /// Returns the number of strings interned by the interner.
300    ///
301    /// # Examples
302    ///
303    /// ```
304    /// use boa_interner::Interner;
305    ///
306    /// let mut interner = Interner::new();
307    /// let initial_len = interner.len();
308    /// interner.get_or_intern("hello");
309    /// assert_eq!(interner.len(), initial_len + 1);
310    /// ```
311    #[inline]
312    #[must_use]
313    pub fn len(&self) -> usize {
314        // `utf16_interner.len()` == `utf8_interner.len()`,
315        // so we can use any of them.
316        COMMON_STRINGS_UTF8.len() + self.utf16_interner.len()
317    }
318
319    /// Returns `true` if the [`Interner`] contains no interned strings.
320    ///
321    /// # Examples
322    ///
323    /// ```
324    /// use boa_interner::Interner;
325    ///
326    /// let interner = Interner::new();
327    /// assert!(!interner.is_empty());
328    /// ```
329    #[inline]
330    #[must_use]
331    pub fn is_empty(&self) -> bool {
332        COMMON_STRINGS_UTF8.is_empty() && self.utf16_interner.is_empty()
333    }
334
335    /// Returns the symbol for the given string if any.
336    ///
337    /// Can be used to query if a string has already been interned without interning.
338    ///
339    /// # Examples
340    ///
341    /// ```
342    /// use boa_interner::Interner;
343    ///
344    /// let mut interner = Interner::new();
345    /// assert!(interner.get("hello").is_none());
346    /// interner.get_or_intern("hello");
347    /// assert!(interner.get("hello").is_some());
348    /// ```
349    pub fn get<'a, T>(&self, string: T) -> Option<Sym>
350    where
351        T: Into<JStrRef<'a>>,
352    {
353        let string = string.into();
354        Self::get_common(string).or_else(|| {
355            let index = match string {
356                JStrRef::Utf8(s) => self.utf8_interner.get(s.as_bytes()),
357                JStrRef::Utf16(s) => self.utf16_interner.get(s),
358            };
359            // SAFETY:
360            // `get_or_intern/get_or_intern_static` already have checks to avoid returning indices
361            // that could cause overflows, meaning the indices returned by
362            // `idx + 1 + COMMON_STRINGS_UTF8.len()` cannot cause overflows.
363            unsafe { index.map(|i| Sym::new_unchecked(i + 1 + COMMON_STRINGS_UTF8.len())) }
364        })
365    }
366
367    /// Interns the given string.
368    ///
369    /// Returns a symbol for resolution into the original string.
370    ///
371    /// # Panics
372    ///
373    /// If the interner already interns the maximum number of strings possible by the chosen symbol type.
374    ///
375    /// # Examples
376    ///
377    /// ```
378    /// use boa_interner::Interner;
379    ///
380    /// let mut interner = Interner::new();
381    /// let sym1 = interner.get_or_intern("hello");
382    /// let sym2 = interner.get_or_intern("hello");
383    /// assert_eq!(sym1, sym2);
384    /// let sym3 = interner.get_or_intern("world");
385    /// assert_ne!(sym1, sym3);
386    /// ```
387    pub fn get_or_intern<'a, T>(&mut self, string: T) -> Sym
388    where
389        T: Into<JStrRef<'a>>,
390    {
391        let string = string.into();
392        self.get(string).unwrap_or_else(|| {
393            let (utf8, utf16) = match string {
394                JStrRef::Utf8(s) => (
395                    Some(Cow::Borrowed(s)),
396                    Cow::Owned(s.encode_utf16().collect()),
397                ),
398                JStrRef::Utf16(s) => (String::from_utf16(s).ok().map(Cow::Owned), Cow::Borrowed(s)),
399            };
400
401            // We need a way to check for the strings that can be interned by `utf16_interner` but
402            // not by `utf8_interner` (since there are some UTF-16 strings with surrogates that are
403            // not representable in UTF-8), so we use the sentinel value `""` as a marker indicating
404            // that the `Sym` corresponding to that string is only available in `utf16_interner`.
405            //
406            // We don't need to worry about matches with `""` inside `get`, because
407            // `COMMON_STRINGS_UTF8` filters all the empty strings before interning.
408            let index = if let Some(utf8) = utf8 {
409                self.utf8_interner.intern(utf8.as_bytes())
410            } else {
411                self.utf8_interner.intern_static(b"")
412            };
413
414            let utf16_index = self.utf16_interner.intern(&utf16);
415
416            assert_eq!(index, utf16_index);
417
418            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
419
420            index
421                .checked_add(1 + COMMON_STRINGS_UTF8.len())
422                .and_then(Sym::new)
423                .expect("Cannot intern new string: integer overflow")
424        })
425    }
426
427    /// Interns the given `'static` string.
428    ///
429    /// Returns a symbol for resolution into the original string.
430    ///
431    /// # Note
432    ///
433    /// This is more efficient than [`Interner::get_or_intern`], since it avoids allocating space
434    /// for one `string` inside the [`Interner`], with the disadvantage that you need to provide
435    /// both the `UTF-8` and the `UTF-16` representation of the string.
436    ///
437    /// # Panics
438    ///
439    /// If the interner already interns the maximum number of strings possible by the chosen symbol type.
440    ///
441    /// # Examples
442    ///
443    /// ```
444    /// use boa_interner::Interner;
445    ///
446    /// static HELLO_UTF16: &[u16] = &[0x68, 0x65, 0x6C, 0x6C, 0x6F];
447    ///
448    /// let mut interner = Interner::new();
449    /// let sym1 = interner.get_or_intern_static("hello", HELLO_UTF16);
450    /// let sym2 = interner.get_or_intern("hello");
451    /// assert_eq!(sym1, sym2);
452    /// ```
453    pub fn get_or_intern_static(&mut self, utf8: &'static str, utf16: &'static [u16]) -> Sym {
454        // Uses the utf8 because it's quicker to check inside `COMMON_STRINGS_UTF8`
455        // (which is a perfect hash set) than to check inside `COMMON_STRINGS_UTF16`
456        // (which is a lazy static hash set).
457        self.get(utf8).unwrap_or_else(|| {
458            let index = self.utf8_interner.intern(utf8.as_bytes());
459            let utf16_index = self.utf16_interner.intern(utf16);
460
461            debug_assert_eq!(index, utf16_index);
462
463            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
464
465            index
466                .checked_add(1 + COMMON_STRINGS_UTF8.len())
467                .and_then(Sym::new)
468                .expect("Cannot intern new string: integer overflow")
469        })
470    }
471
472    /// Returns the string for the given symbol if any.
473    ///
474    /// # Panics
475    ///
476    /// Panics if the size of both statics is not equal or the interners do
477    /// not have the same size
478    ///
479    /// # Examples
480    ///
481    /// ```
482    /// use boa_interner::Interner;
483    ///
484    /// let mut interner = Interner::new();
485    /// let sym = interner.get_or_intern("hello");
486    /// let resolved = interner.resolve(sym);
487    /// assert!(resolved.is_some());
488    /// assert_eq!(resolved.unwrap().utf8(), Some("hello"));
489    /// ```
490    #[must_use]
491    pub fn resolve(&self, symbol: Sym) -> Option<JSInternedStrRef<'_, '_>> {
492        let index = symbol.get() - 1;
493
494        if let Some(utf8) = COMMON_STRINGS_UTF8.index(index).copied() {
495            let utf16 = COMMON_STRINGS_UTF16
496                .get_index(index)
497                .copied()
498                .expect("The sizes of both statics must be equal");
499            return Some(JSInternedStrRef {
500                utf8: Some(utf8),
501                utf16,
502            });
503        }
504
505        let index = index - COMMON_STRINGS_UTF8.len();
506
507        if let Some(utf16) = self.utf16_interner.index(index) {
508            let index = index - (self.utf16_interner.len() - self.utf8_interner.len());
509            // SAFETY:
510            // We only manipulate valid UTF-8 `str`s and convert them to `[u8]` for convenience,
511            // so converting back to a `str` is safe.
512            let utf8 = unsafe {
513                core::str::from_utf8_unchecked(
514                    self.utf8_interner
515                        .index(index)
516                        .expect("both interners must have the same size"),
517                )
518            };
519            return Some(JSInternedStrRef {
520                utf8: if utf8.is_empty() { None } else { Some(utf8) },
521                utf16,
522            });
523        }
524
525        None
526    }
527
528    /// Returns the string for the given symbol.
529    ///
530    /// # Panics
531    ///
532    /// If the interner cannot resolve the given symbol.
533    ///
534    /// # Examples
535    ///
536    /// ```
537    /// use boa_interner::Interner;
538    ///
539    /// let mut interner = Interner::new();
540    /// let sym = interner.get_or_intern("hello");
541    /// let resolved = interner.resolve_expect(sym);
542    /// assert_eq!(resolved.utf8(), Some("hello"));
543    /// ```
544    #[inline]
545    #[must_use]
546    pub fn resolve_expect(&self, symbol: Sym) -> JSInternedStrRef<'_, '_> {
547        self.resolve(symbol).expect("string disappeared")
548    }
549
550    /// Returns `true` if the string identified by `symbol` can be encoded as Latin1
551    /// (i.e. all code units are in the range `0x00..=0xFF`).
552    ///
553    /// This information is computed **once** when the string is first interned, so callers pay no
554    /// O(n) scanning cost beyond the initial intern call.
555    ///
556    /// # Examples
557    ///
558    /// ```
559    /// use boa_interner::Interner;
560    ///
561    /// let mut interner = Interner::new();
562    /// let ascii = interner.get_or_intern("hello");
563    /// assert!(interner.is_latin1(ascii));
564    ///
565    /// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
566    /// let sym = interner.get_or_intern(non_latin1.as_slice());
567    /// assert!(!interner.is_latin1(sym));
568    /// ```
569    #[inline]
570    #[must_use]
571    pub fn is_latin1(&self, symbol: Sym) -> bool {
572        let index = symbol.get() - 1;
573        if index < COMMON_STRINGS_UTF8.len() {
574            return true;
575        }
576        let dynamic_index = index - COMMON_STRINGS_UTF8.len();
577        self.latin1_flags
578            .get(dynamic_index)
579            .copied()
580            .unwrap_or(false)
581    }
582
583    fn get_common(string: JStrRef<'_>) -> Option<Sym> {
584        match string {
585            JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {
586                // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`.
587                // In this case, we don't need to worry about overflows because we have a static
588                // assertion in place checking that `COMMON_STRINGS.len() < usize::MAX`.
589                unsafe { Sym::new_unchecked(idx + 1) }
590            }),
591            JStrRef::Utf16(s) => COMMON_STRINGS_UTF16.get_index_of(&s).map(|idx| {
592                // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`.
593                // In this case, we don't need to worry about overflows because we have a static
594                // assertion in place checking that `COMMON_STRINGS.len() < usize::MAX`.
595                unsafe { Sym::new_unchecked(idx + 1) }
596            }),
597        }
598    }
599}
600
601/// Implements the display formatting with indentation.
602pub trait ToIndentedString {
603    /// Converts the element to a string using an interner, with the given indentation.
604    fn to_indented_string(&self, interner: &Interner, indentation: usize) -> String;
605}
606
607/// Converts a given element to a string using an interner.
608pub trait ToInternedString {
609    /// Converts a given element to a string using an interner.
610    fn to_interned_string(&self, interner: &Interner) -> String;
611}
612
613impl<T> ToInternedString for T
614where
615    T: ToIndentedString,
616{
617    fn to_interned_string(&self, interner: &Interner) -> String {
618        self.to_indented_string(interner, 0)
619    }
620}