obeli_sk_boa_interner/lib.rs
1//! Boa's **`boa_interner`** is a string interner for compiler performance.
2//!
3//! # Crate Overview
4//!
5//! The idea behind using a string interner is that in most of the code, strings such as
6//! identifiers and literals are often repeated. This causes extra burden when comparing them and
7//! storing them. A string interner stores a unique `usize` symbol for each string, making sure
8//! that there are no duplicates. This makes it much easier to compare, since it's just comparing
9//! to `usize`, and also it's easier to store, since instead of a heap-allocated string, you only
10//! need to store a `usize`. This reduces memory consumption and improves performance in the
11//! compiler.
12#![doc = include_str!("../ABOUT.md")]
13#![doc(
14 html_logo_url = "https://raw.githubusercontent.com/boa-dev/boa/main/assets/logo_black.svg",
15 html_favicon_url = "https://raw.githubusercontent.com/boa-dev/boa/main/assets/logo_black.svg"
16)]
17#![cfg_attr(not(test), forbid(clippy::unwrap_used))]
18#![allow(
19 clippy::redundant_pub_crate,
20 // TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
21 clippy::trait_duplication_in_bounds,
22 // Field names intentionally mirror the encoding type they store.
23 clippy::struct_field_names
24)]
25#![cfg_attr(not(feature = "arbitrary"), no_std)]
26
27extern crate alloc;
28
29mod fixed_string;
30mod interned_str;
31mod raw;
32mod sym;
33
34#[cfg(test)]
35mod tests;
36
37use alloc::{borrow::Cow, format, string::String, vec::Vec};
38use raw::RawInterner;
39
40pub use sym::*;
41
42/// An enumeration of all slice types [`Interner`] can internally store.
43///
44/// This struct allows us to intern either `UTF-8` or `UTF-16` str references, which are the two
45/// encodings [`Interner`] can store.
46#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
47pub enum JStrRef<'a> {
48 /// A `UTF-8` string reference.
49 Utf8(&'a str),
50
51 /// A `UTF-16` string reference.
52 Utf16(&'a [u16]),
53}
54
55impl<'a> From<&'a str> for JStrRef<'a> {
56 fn from(s: &'a str) -> Self {
57 JStrRef::Utf8(s)
58 }
59}
60
61impl<'a> From<&'a [u16]> for JStrRef<'a> {
62 fn from(s: &'a [u16]) -> Self {
63 JStrRef::Utf16(s)
64 }
65}
66
67impl<'a, const N: usize> From<&'a [u16; N]> for JStrRef<'a> {
68 fn from(s: &'a [u16; N]) -> Self {
69 JStrRef::Utf16(s)
70 }
71}
72
73/// A double reference to an interned string inside [`Interner`].
74///
75/// [`JSInternedStrRef::utf8`] returns an [`Option`], since not every `UTF-16` string is fully
76/// representable as a `UTF-8` string (because of unpaired surrogates). However, every `UTF-8`
77/// string is representable as a `UTF-16` string, so `JSInternedStrRef::utf8` returns a
78/// [<code>&\[u16\]</code>][core::slice].
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
80pub struct JSInternedStrRef<'a, 'b> {
81 utf8: Option<&'a str>,
82 utf16: &'b [u16],
83}
84
85impl<'a, 'b> JSInternedStrRef<'a, 'b> {
86 /// Returns the inner reference to the interned string in `UTF-8` encoding.
87 /// if the string is not representable in `UTF-8`, returns [`None`]
88 ///
89 /// # Examples
90 ///
91 /// ```
92 /// use boa_interner::Interner;
93 ///
94 /// let mut interner = Interner::new();
95 /// let sym = interner.get_or_intern("hello");
96 /// let interned = interner.resolve_expect(sym);
97 /// assert_eq!(interned.utf8(), Some("hello"));
98 /// ```
99 #[inline]
100 #[must_use]
101 pub const fn utf8(&self) -> Option<&'a str> {
102 self.utf8
103 }
104
105 /// Returns the inner reference to the interned string in `UTF-16` encoding.
106 ///
107 /// # Examples
108 ///
109 /// ```
110 /// use boa_interner::Interner;
111 ///
112 /// let mut interner = Interner::new();
113 /// let sym = interner.get_or_intern("hello");
114 /// let interned = interner.resolve_expect(sym);
115 /// let utf16: Vec<u16> = "hello".encode_utf16().collect();
116 /// assert_eq!(interned.utf16(), utf16.as_slice());
117 /// ```
118 #[inline]
119 #[must_use]
120 pub const fn utf16(&self) -> &'b [u16] {
121 self.utf16
122 }
123
124 /// Joins the result of both possible strings into a common type.
125 ///
126 /// If `self` is representable by a `UTF-8` string and the `prioritize_utf8` argument is set,
127 /// it will prioritize calling `f`, and will only call `g` if `self` is only representable by a
128 /// `UTF-16` string. Otherwise, it will directly call `g`.
129 ///
130 /// # Examples
131 ///
132 /// ```
133 /// use boa_interner::Interner;
134 ///
135 /// let mut interner = Interner::new();
136 /// let sym = interner.get_or_intern("hello");
137 /// let interned = interner.resolve_expect(sym);
138 /// let result = interned.join(
139 /// |utf8| utf8.to_uppercase(),
140 /// |utf16| String::from_utf16_lossy(utf16).to_uppercase(),
141 /// true,
142 /// );
143 /// assert_eq!(result, "HELLO");
144 /// ```
145 pub fn join<F, G, T>(self, f: F, g: G, prioritize_utf8: bool) -> T
146 where
147 F: FnOnce(&'a str) -> T,
148 G: FnOnce(&'b [u16]) -> T,
149 {
150 if prioritize_utf8 && let Some(str) = self.utf8 {
151 return f(str);
152 }
153 g(self.utf16)
154 }
155
156 /// Same as [`join`][`JSInternedStrRef::join`], but where you can pass an additional context.
157 ///
158 /// Useful when you have a `&mut Context` context that cannot be borrowed by both closures at
159 /// the same time.
160 ///
161 /// # Examples
162 ///
163 /// ```
164 /// use boa_interner::Interner;
165 ///
166 /// let mut interner = Interner::new();
167 /// let sym = interner.get_or_intern("hello");
168 /// let interned = interner.resolve_expect(sym);
169 /// let mut output = String::new();
170 /// interned.join_with_context(
171 /// |utf8, buf: &mut String| buf.push_str(&utf8.to_uppercase()),
172 /// |utf16, buf: &mut String| buf.push_str(&String::from_utf16_lossy(utf16).to_uppercase()),
173 /// &mut output,
174 /// true,
175 /// );
176 /// assert_eq!(output, "HELLO");
177 /// ```
178 pub fn join_with_context<C, F, G, T>(self, f: F, g: G, ctx: C, prioritize_utf8: bool) -> T
179 where
180 F: FnOnce(&'a str, C) -> T,
181 G: FnOnce(&'b [u16], C) -> T,
182 {
183 if prioritize_utf8 && let Some(str) = self.utf8 {
184 return f(str, ctx);
185 }
186 g(self.utf16, ctx)
187 }
188
189 /// Converts both string types into a common type `C`.
190 ///
191 /// If `self` is representable by a `UTF-8` string and the `prioritize_utf8` argument is set, it
192 /// will prioritize converting its `UTF-8` representation first, and will only convert its
193 /// `UTF-16` representation if it is only representable by a `UTF-16` string. Otherwise, it will
194 /// directly convert its `UTF-16` representation.
195 ///
196 /// # Examples
197 ///
198 /// ```
199 /// use boa_interner::Interner;
200 ///
201 /// enum JsString<'a> {
202 /// Utf8(&'a str),
203 /// Utf16(&'a [u16]),
204 /// }
205 ///
206 /// impl<'a> From<&'a str> for JsString<'a> {
207 /// fn from(s: &'a str) -> Self {
208 /// JsString::Utf8(s)
209 /// }
210 /// }
211 ///
212 /// impl<'a> From<&'a [u16]> for JsString<'a> {
213 /// fn from(s: &'a [u16]) -> Self {
214 /// JsString::Utf16(s)
215 /// }
216 /// }
217 ///
218 /// let mut interner = Interner::new();
219 /// let sym = interner.get_or_intern("hello");
220 /// let interned = interner.resolve_expect(sym);
221 /// let result: JsString<'_> = interned.into_common(true);
222 /// assert!(matches!(result, JsString::Utf8("hello")));
223 /// ```
224 pub fn into_common<C>(self, prioritize_utf8: bool) -> C
225 where
226 C: From<&'a str> + From<&'b [u16]>,
227 {
228 self.join(Into::into, Into::into, prioritize_utf8)
229 }
230}
231
232impl core::fmt::Display for JSInternedStrRef<'_, '_> {
233 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
234 self.join_with_context(
235 core::fmt::Display::fmt,
236 |js, f| {
237 char::decode_utf16(js.iter().copied())
238 .map(|r| match r {
239 Ok(c) => String::from(c),
240 Err(e) => format!("\\u{:04X}", e.unpaired_surrogate()),
241 })
242 .collect::<String>()
243 .fmt(f)
244 },
245 f,
246 true,
247 )
248 }
249}
250
251/// The string interner for Boa.
252#[derive(Debug, Default)]
253pub struct Interner {
254 utf8_interner: RawInterner<u8>,
255 utf16_interner: RawInterner<u16>,
256 /// Latin1-encodability cache for dynamically-interned strings (all code units ≤ 0xFF).
257 latin1_flags: Vec<bool>,
258}
259
260impl Interner {
261 /// Creates a new [`Interner`].
262 ///
263 /// # Examples
264 ///
265 /// ```
266 /// use boa_interner::Interner;
267 ///
268 /// let mut interner = Interner::new();
269 /// let sym = interner.get_or_intern("hello");
270 /// assert!(interner.resolve(sym).is_some());
271 /// ```
272 #[inline]
273 #[must_use]
274 pub fn new() -> Self {
275 Self::default()
276 }
277
278 /// Creates a new [`Interner`] with the specified capacity.
279 ///
280 /// # Examples
281 ///
282 /// ```
283 /// use boa_interner::Interner;
284 ///
285 /// let mut interner = Interner::with_capacity(10);
286 /// let sym = interner.get_or_intern("hello");
287 /// assert!(interner.resolve(sym).is_some());
288 /// ```
289 #[inline]
290 #[must_use]
291 pub fn with_capacity(capacity: usize) -> Self {
292 Self {
293 utf8_interner: RawInterner::with_capacity(capacity),
294 utf16_interner: RawInterner::with_capacity(capacity),
295 latin1_flags: Vec::with_capacity(capacity),
296 }
297 }
298
299 /// Returns the number of strings interned by the interner.
300 ///
301 /// # Examples
302 ///
303 /// ```
304 /// use boa_interner::Interner;
305 ///
306 /// let mut interner = Interner::new();
307 /// let initial_len = interner.len();
308 /// interner.get_or_intern("hello");
309 /// assert_eq!(interner.len(), initial_len + 1);
310 /// ```
311 #[inline]
312 #[must_use]
313 pub fn len(&self) -> usize {
314 // `utf16_interner.len()` == `utf8_interner.len()`,
315 // so we can use any of them.
316 COMMON_STRINGS_UTF8.len() + self.utf16_interner.len()
317 }
318
319 /// Returns `true` if the [`Interner`] contains no interned strings.
320 ///
321 /// # Examples
322 ///
323 /// ```
324 /// use boa_interner::Interner;
325 ///
326 /// let interner = Interner::new();
327 /// assert!(!interner.is_empty());
328 /// ```
329 #[inline]
330 #[must_use]
331 pub fn is_empty(&self) -> bool {
332 COMMON_STRINGS_UTF8.is_empty() && self.utf16_interner.is_empty()
333 }
334
335 /// Returns the symbol for the given string if any.
336 ///
337 /// Can be used to query if a string has already been interned without interning.
338 ///
339 /// # Examples
340 ///
341 /// ```
342 /// use boa_interner::Interner;
343 ///
344 /// let mut interner = Interner::new();
345 /// assert!(interner.get("hello").is_none());
346 /// interner.get_or_intern("hello");
347 /// assert!(interner.get("hello").is_some());
348 /// ```
349 pub fn get<'a, T>(&self, string: T) -> Option<Sym>
350 where
351 T: Into<JStrRef<'a>>,
352 {
353 let string = string.into();
354 Self::get_common(string).or_else(|| {
355 let index = match string {
356 JStrRef::Utf8(s) => self.utf8_interner.get(s.as_bytes()),
357 JStrRef::Utf16(s) => self.utf16_interner.get(s),
358 };
359 // SAFETY:
360 // `get_or_intern/get_or_intern_static` already have checks to avoid returning indices
361 // that could cause overflows, meaning the indices returned by
362 // `idx + 1 + COMMON_STRINGS_UTF8.len()` cannot cause overflows.
363 unsafe { index.map(|i| Sym::new_unchecked(i + 1 + COMMON_STRINGS_UTF8.len())) }
364 })
365 }
366
367 /// Interns the given string.
368 ///
369 /// Returns a symbol for resolution into the original string.
370 ///
371 /// # Panics
372 ///
373 /// If the interner already interns the maximum number of strings possible by the chosen symbol type.
374 ///
375 /// # Examples
376 ///
377 /// ```
378 /// use boa_interner::Interner;
379 ///
380 /// let mut interner = Interner::new();
381 /// let sym1 = interner.get_or_intern("hello");
382 /// let sym2 = interner.get_or_intern("hello");
383 /// assert_eq!(sym1, sym2);
384 /// let sym3 = interner.get_or_intern("world");
385 /// assert_ne!(sym1, sym3);
386 /// ```
387 pub fn get_or_intern<'a, T>(&mut self, string: T) -> Sym
388 where
389 T: Into<JStrRef<'a>>,
390 {
391 let string = string.into();
392 self.get(string).unwrap_or_else(|| {
393 let (utf8, utf16) = match string {
394 JStrRef::Utf8(s) => (
395 Some(Cow::Borrowed(s)),
396 Cow::Owned(s.encode_utf16().collect()),
397 ),
398 JStrRef::Utf16(s) => (String::from_utf16(s).ok().map(Cow::Owned), Cow::Borrowed(s)),
399 };
400
401 // We need a way to check for the strings that can be interned by `utf16_interner` but
402 // not by `utf8_interner` (since there are some UTF-16 strings with surrogates that are
403 // not representable in UTF-8), so we use the sentinel value `""` as a marker indicating
404 // that the `Sym` corresponding to that string is only available in `utf16_interner`.
405 //
406 // We don't need to worry about matches with `""` inside `get`, because
407 // `COMMON_STRINGS_UTF8` filters all the empty strings before interning.
408 let index = if let Some(utf8) = utf8 {
409 self.utf8_interner.intern(utf8.as_bytes())
410 } else {
411 self.utf8_interner.intern_static(b"")
412 };
413
414 let utf16_index = self.utf16_interner.intern(&utf16);
415
416 assert_eq!(index, utf16_index);
417
418 self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
419
420 index
421 .checked_add(1 + COMMON_STRINGS_UTF8.len())
422 .and_then(Sym::new)
423 .expect("Cannot intern new string: integer overflow")
424 })
425 }
426
427 /// Interns the given `'static` string.
428 ///
429 /// Returns a symbol for resolution into the original string.
430 ///
431 /// # Note
432 ///
433 /// This is more efficient than [`Interner::get_or_intern`], since it avoids allocating space
434 /// for one `string` inside the [`Interner`], with the disadvantage that you need to provide
435 /// both the `UTF-8` and the `UTF-16` representation of the string.
436 ///
437 /// # Panics
438 ///
439 /// If the interner already interns the maximum number of strings possible by the chosen symbol type.
440 ///
441 /// # Examples
442 ///
443 /// ```
444 /// use boa_interner::Interner;
445 ///
446 /// static HELLO_UTF16: &[u16] = &[0x68, 0x65, 0x6C, 0x6C, 0x6F];
447 ///
448 /// let mut interner = Interner::new();
449 /// let sym1 = interner.get_or_intern_static("hello", HELLO_UTF16);
450 /// let sym2 = interner.get_or_intern("hello");
451 /// assert_eq!(sym1, sym2);
452 /// ```
453 pub fn get_or_intern_static(&mut self, utf8: &'static str, utf16: &'static [u16]) -> Sym {
454 // Uses the utf8 because it's quicker to check inside `COMMON_STRINGS_UTF8`
455 // (which is a perfect hash set) than to check inside `COMMON_STRINGS_UTF16`
456 // (which is a lazy static hash set).
457 self.get(utf8).unwrap_or_else(|| {
458 let index = self.utf8_interner.intern(utf8.as_bytes());
459 let utf16_index = self.utf16_interner.intern(utf16);
460
461 debug_assert_eq!(index, utf16_index);
462
463 self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
464
465 index
466 .checked_add(1 + COMMON_STRINGS_UTF8.len())
467 .and_then(Sym::new)
468 .expect("Cannot intern new string: integer overflow")
469 })
470 }
471
472 /// Returns the string for the given symbol if any.
473 ///
474 /// # Panics
475 ///
476 /// Panics if the size of both statics is not equal or the interners do
477 /// not have the same size
478 ///
479 /// # Examples
480 ///
481 /// ```
482 /// use boa_interner::Interner;
483 ///
484 /// let mut interner = Interner::new();
485 /// let sym = interner.get_or_intern("hello");
486 /// let resolved = interner.resolve(sym);
487 /// assert!(resolved.is_some());
488 /// assert_eq!(resolved.unwrap().utf8(), Some("hello"));
489 /// ```
490 #[must_use]
491 pub fn resolve(&self, symbol: Sym) -> Option<JSInternedStrRef<'_, '_>> {
492 let index = symbol.get() - 1;
493
494 if let Some(utf8) = COMMON_STRINGS_UTF8.index(index).copied() {
495 let utf16 = COMMON_STRINGS_UTF16
496 .get_index(index)
497 .copied()
498 .expect("The sizes of both statics must be equal");
499 return Some(JSInternedStrRef {
500 utf8: Some(utf8),
501 utf16,
502 });
503 }
504
505 let index = index - COMMON_STRINGS_UTF8.len();
506
507 if let Some(utf16) = self.utf16_interner.index(index) {
508 let index = index - (self.utf16_interner.len() - self.utf8_interner.len());
509 // SAFETY:
510 // We only manipulate valid UTF-8 `str`s and convert them to `[u8]` for convenience,
511 // so converting back to a `str` is safe.
512 let utf8 = unsafe {
513 core::str::from_utf8_unchecked(
514 self.utf8_interner
515 .index(index)
516 .expect("both interners must have the same size"),
517 )
518 };
519 return Some(JSInternedStrRef {
520 utf8: if utf8.is_empty() { None } else { Some(utf8) },
521 utf16,
522 });
523 }
524
525 None
526 }
527
528 /// Returns the string for the given symbol.
529 ///
530 /// # Panics
531 ///
532 /// If the interner cannot resolve the given symbol.
533 ///
534 /// # Examples
535 ///
536 /// ```
537 /// use boa_interner::Interner;
538 ///
539 /// let mut interner = Interner::new();
540 /// let sym = interner.get_or_intern("hello");
541 /// let resolved = interner.resolve_expect(sym);
542 /// assert_eq!(resolved.utf8(), Some("hello"));
543 /// ```
544 #[inline]
545 #[must_use]
546 pub fn resolve_expect(&self, symbol: Sym) -> JSInternedStrRef<'_, '_> {
547 self.resolve(symbol).expect("string disappeared")
548 }
549
550 /// Returns `true` if the string identified by `symbol` can be encoded as Latin1
551 /// (i.e. all code units are in the range `0x00..=0xFF`).
552 ///
553 /// This information is computed **once** when the string is first interned, so callers pay no
554 /// O(n) scanning cost beyond the initial intern call.
555 ///
556 /// # Examples
557 ///
558 /// ```
559 /// use boa_interner::Interner;
560 ///
561 /// let mut interner = Interner::new();
562 /// let ascii = interner.get_or_intern("hello");
563 /// assert!(interner.is_latin1(ascii));
564 ///
565 /// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
566 /// let sym = interner.get_or_intern(non_latin1.as_slice());
567 /// assert!(!interner.is_latin1(sym));
568 /// ```
569 #[inline]
570 #[must_use]
571 pub fn is_latin1(&self, symbol: Sym) -> bool {
572 let index = symbol.get() - 1;
573 if index < COMMON_STRINGS_UTF8.len() {
574 return true;
575 }
576 let dynamic_index = index - COMMON_STRINGS_UTF8.len();
577 self.latin1_flags
578 .get(dynamic_index)
579 .copied()
580 .unwrap_or(false)
581 }
582
583 fn get_common(string: JStrRef<'_>) -> Option<Sym> {
584 match string {
585 JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {
586 // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`.
587 // In this case, we don't need to worry about overflows because we have a static
588 // assertion in place checking that `COMMON_STRINGS.len() < usize::MAX`.
589 unsafe { Sym::new_unchecked(idx + 1) }
590 }),
591 JStrRef::Utf16(s) => COMMON_STRINGS_UTF16.get_index_of(&s).map(|idx| {
592 // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`.
593 // In this case, we don't need to worry about overflows because we have a static
594 // assertion in place checking that `COMMON_STRINGS.len() < usize::MAX`.
595 unsafe { Sym::new_unchecked(idx + 1) }
596 }),
597 }
598 }
599}
600
601/// Implements the display formatting with indentation.
602pub trait ToIndentedString {
603 /// Converts the element to a string using an interner, with the given indentation.
604 fn to_indented_string(&self, interner: &Interner, indentation: usize) -> String;
605}
606
607/// Converts a given element to a string using an interner.
608pub trait ToInternedString {
609 /// Converts a given element to a string using an interner.
610 fn to_interned_string(&self, interner: &Interner) -> String;
611}
612
613impl<T> ToInternedString for T
614where
615 T: ToIndentedString,
616{
617 fn to_interned_string(&self, interner: &Interner) -> String {
618 self.to_indented_string(interner, 0)
619 }
620}