typst_utils/
pico.rs

1use std::borrow::Borrow;
2use std::cmp::Ordering;
3use std::fmt::{self, Debug, Display, Formatter};
4use std::hash::{Hash, Hasher};
5use std::num::NonZeroU64;
6use std::ops::Deref;
7use std::sync::{LazyLock, RwLock};
8
9use rustc_hash::FxHashMap;
10
11/// Marks a number as a bitcode encoded `PicoStr``.
12const MARKER: u64 = 1 << 63;
13
14/// The global runtime string interner.
15static INTERNER: LazyLock<RwLock<Interner>> = LazyLock::new(|| {
16    RwLock::new(Interner { seen: FxHashMap::default(), strings: Vec::new() })
17});
18
19/// A string interner.
20struct Interner {
21    seen: FxHashMap<&'static str, PicoStr>,
22    strings: Vec<&'static str>,
23}
24
25/// An interned string representation that is cheap to copy and hash, but more
26/// expensive to access.
27///
28/// This type takes up 8 bytes and is copyable and null-optimized (i.e.
29/// `Option<PicoStr>` also takes 8 bytes).
30///
31/// Supports compile-time string interning via [`PicoStr::constant`] in two
32/// flavors:
33/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4',
34///   and '-' are stored inline in the number
35/// - Other strings _can_ be compile-time interned the same way, but must first
36///   be added to the list in `exceptions::LIST`.
37///
38/// No such restrictions apply at runtime (via [`PicoStr::intern`]).
39#[derive(Copy, Clone, Eq, PartialEq, Hash)]
40pub struct PicoStr(NonZeroU64);
41
42impl PicoStr {
43    /// Intern a string at runtime.
44    pub fn intern(string: &str) -> PicoStr {
45        // Try to use bitcode or exception representations.
46        if let Ok(value) = PicoStr::try_constant(string) {
47            return value;
48        }
49
50        // Try to find an existing entry that we can reuse.
51        //
52        // We could check with just a read lock, but if the string is not yet
53        // present, we would then need to recheck after acquiring a write lock,
54        // which is probably not worth it.
55        let mut interner = INTERNER.write().unwrap();
56        if let Some(&id) = interner.seen.get(string) {
57            return id;
58        }
59
60        // Create a new entry forever by leaking the string. PicoStr is only
61        // used for strings that aren't created en masse, so it is okay.
62        let num = exceptions::LIST.len() + interner.strings.len() + 1;
63        let id = Self(NonZeroU64::new(num as u64).unwrap());
64        let string = Box::leak(string.to_string().into_boxed_str());
65        interner.seen.insert(string, id);
66        interner.strings.push(string);
67        id
68    }
69
70    /// Try to create a `PicoStr`, but don't intern it if it does not exist yet.
71    ///
72    /// This is useful to try to compare against one or multiple `PicoStr`
73    /// without interning needlessly.
74    ///
75    /// Will always return `Some(_)` if the string can be represented inline.
76    pub fn get(string: &str) -> Option<PicoStr> {
77        // Try to use bitcode or exception representations.
78        if let Ok(value) = PicoStr::try_constant(string) {
79            return Some(value);
80        }
81
82        // Try to find an existing entry that we can reuse.
83        let interner = INTERNER.read().unwrap();
84        interner.seen.get(string).copied()
85    }
86
87    /// Creates a compile-time constant `PicoStr`.
88    ///
89    /// Should only be used in const contexts because it can panic.
90    #[track_caller]
91    pub const fn constant(string: &'static str) -> PicoStr {
92        match PicoStr::try_constant(string) {
93            Ok(value) => value,
94            Err(err) => failed_to_compile_time_intern(err, string),
95        }
96    }
97
98    /// Try to intern a string statically at compile-time.
99    pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> {
100        // Try to encode with bitcode.
101        let value = match bitcode::encode(string) {
102            // Store representation marker in high bit. Bitcode doesn't use
103            // 4 high bits.
104            Ok(v) => v | MARKER,
105
106            // If that fails, try to use the exception list.
107            Err(e) => {
108                if let Some(i) = exceptions::get(string) {
109                    // Offset by one to make it non-zero.
110                    i as u64 + 1
111                } else {
112                    return Err(e);
113                }
114            }
115        };
116
117        Ok(Self(NonZeroU64::new(value).unwrap()))
118    }
119
120    /// Resolve to a decoded string.
121    pub fn resolve(self) -> ResolvedPicoStr {
122        // If high bit is set, this is a bitcode-encoded string.
123        let value = self.0.get();
124        if value & MARKER != 0 {
125            return bitcode::decode(value & !MARKER);
126        }
127
128        let index = (value - 1) as usize;
129        let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) {
130            INTERNER.read().unwrap().strings[runtime]
131        } else {
132            exceptions::LIST[index]
133        };
134
135        ResolvedPicoStr(Repr::Static(string))
136    }
137}
138
139impl Debug for PicoStr {
140    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
141        Debug::fmt(self.resolve().as_str(), f)
142    }
143}
144
145/// A 5-bit encoding for strings with length up two 12 that are restricted to a
146/// specific charset.
147mod bitcode {
148    use super::{Repr, ResolvedPicoStr};
149
150    /// Maps from encodings to their bytes.
151    const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234";
152
153    /// Maps from bytes to their encodings.
154    const ENCODE: &[u8; 256] = &{
155        let mut map = [0; 256];
156        let mut i = 0;
157        while i < DECODE.len() {
158            map[DECODE[i] as usize] = i as u8;
159            i += 1;
160        }
161        map
162    };
163
164    /// Try to encode a string as a 64-bit integer.
165    pub const fn encode(string: &str) -> Result<u64, EncodingError> {
166        let bytes = string.as_bytes();
167
168        if bytes.len() > 12 {
169            return Err(EncodingError::TooLong);
170        }
171
172        let mut num: u64 = 0;
173        let mut i = bytes.len();
174        while i > 0 {
175            i -= 1;
176            let b = bytes[i];
177            let v = ENCODE[b as usize];
178            if v == 0 {
179                return Err(EncodingError::BadChar);
180            }
181            num <<= 5;
182            num |= v as u64;
183        }
184
185        Ok(num)
186    }
187
188    /// Decode the string for a 64-bit integer.
189    pub const fn decode(mut value: u64) -> ResolvedPicoStr {
190        let mut buf = [0; 12];
191        let mut len = 0;
192
193        while value != 0 {
194            let v = value & 0b11111;
195            buf[len as usize] = DECODE[v as usize];
196            len += 1;
197            value >>= 5;
198        }
199
200        ResolvedPicoStr(Repr::Inline(buf, len))
201    }
202
203    /// A failure during compile-time interning.
204    pub enum EncodingError {
205        TooLong,
206        BadChar,
207    }
208
209    impl EncodingError {
210        pub const fn message(&self) -> &'static str {
211            match self {
212                Self::TooLong => "the maximum auto-internible string length is 12",
213                Self::BadChar => {
214                    "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'"
215                }
216            }
217        }
218    }
219}
220
221/// Compile-time interned strings that cannot be encoded with `bitcode`.
222mod exceptions {
223    use std::cmp::Ordering;
224
225    /// A global list of non-bitcode-encodable compile-time internible strings.
226    ///
227    /// Must be sorted.
228    pub const LIST: &[&str] = &[
229        "accept-charset",
230        "allowfullscreen",
231        "aria-activedescendant",
232        "aria-autocomplete",
233        "aria-colcount",
234        "aria-colindex",
235        "aria-controls",
236        "aria-describedby",
237        "aria-disabled",
238        "aria-dropeffect",
239        "aria-errormessage",
240        "aria-expanded",
241        "aria-haspopup",
242        "aria-keyshortcuts",
243        "aria-labelledby",
244        "aria-multiline",
245        "aria-multiselectable",
246        "aria-orientation",
247        "aria-placeholder",
248        "aria-posinset",
249        "aria-readonly",
250        "aria-relevant",
251        "aria-required",
252        "aria-roledescription",
253        "aria-rowcount",
254        "aria-rowindex",
255        "aria-selected",
256        "aria-valuemax",
257        "aria-valuemin",
258        "aria-valuenow",
259        "aria-valuetext",
260        "autocapitalize",
261        "cjk-latin-spacing",
262        "contenteditable",
263        "discretionary-ligatures",
264        "fetchpriority",
265        "formnovalidate",
266        "h5",
267        "h6",
268        "historical-ligatures",
269        "number-clearance",
270        "number-margin",
271        "numbering-scope",
272        "onbeforeprint",
273        "onbeforeunload",
274        "onlanguagechange",
275        "onmessageerror",
276        "onrejectionhandled",
277        "onunhandledrejection",
278        "page-numbering",
279        "par-line-marker",
280        "popovertarget",
281        "popovertargetaction",
282        "referrerpolicy",
283        "shadowrootclonable",
284        "shadowrootcustomelementregistry",
285        "shadowrootdelegatesfocus",
286        "shadowrootmode",
287        "shadowrootserializable",
288        "transparentize",
289        "writingsuggestions",
290    ];
291
292    /// Try to find the index of an exception if it exists.
293    pub const fn get(string: &str) -> Option<usize> {
294        let mut lo = 0;
295        let mut hi = LIST.len();
296        while lo < hi {
297            let mid = (lo + hi) / 2;
298            match strcmp(string, LIST[mid]) {
299                Ordering::Less => hi = mid,
300                Ordering::Greater => lo = mid + 1,
301                Ordering::Equal => return Some(mid),
302            }
303        }
304        None
305    }
306
307    /// Compare two strings.
308    const fn strcmp(a: &str, b: &str) -> Ordering {
309        let a = a.as_bytes();
310        let b = b.as_bytes();
311        let l = min(a.len(), b.len());
312
313        let mut i = 0;
314        while i < l {
315            if a[i] == b[i] {
316                i += 1;
317            } else if a[i] < b[i] {
318                return Ordering::Less;
319            } else {
320                return Ordering::Greater;
321            }
322        }
323
324        if i < b.len() {
325            Ordering::Less
326        } else if i < a.len() {
327            Ordering::Greater
328        } else {
329            Ordering::Equal
330        }
331    }
332
333    /// Determine the minimum of two integers.
334    const fn min(a: usize, b: usize) -> usize {
335        if a < b { a } else { b }
336    }
337}
338
339/// This is returned by [`PicoStr::resolve`].
340///
341/// Dereferences to a `str`.
342pub struct ResolvedPicoStr(Repr);
343
344/// Representation of a resolved string.
345enum Repr {
346    Inline([u8; 12], u8),
347    Static(&'static str),
348}
349
350impl ResolvedPicoStr {
351    /// Retrieve the underlying string.
352    pub fn as_str(&self) -> &str {
353        match &self.0 {
354            Repr::Inline(buf, len) => unsafe {
355                std::str::from_utf8_unchecked(&buf[..*len as usize])
356            },
357            Repr::Static(s) => s,
358        }
359    }
360}
361
362impl Debug for ResolvedPicoStr {
363    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
364        Debug::fmt(self.as_str(), f)
365    }
366}
367
368impl Display for ResolvedPicoStr {
369    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
370        Display::fmt(self.as_str(), f)
371    }
372}
373
374impl Deref for ResolvedPicoStr {
375    type Target = str;
376
377    fn deref(&self) -> &Self::Target {
378        self.as_str()
379    }
380}
381
382impl AsRef<str> for ResolvedPicoStr {
383    fn as_ref(&self) -> &str {
384        self.as_str()
385    }
386}
387
388impl Borrow<str> for ResolvedPicoStr {
389    fn borrow(&self) -> &str {
390        self.as_str()
391    }
392}
393
394impl Eq for ResolvedPicoStr {}
395
396impl PartialEq for ResolvedPicoStr {
397    fn eq(&self, other: &Self) -> bool {
398        self.as_str().eq(other.as_str())
399    }
400}
401
402impl Ord for ResolvedPicoStr {
403    fn cmp(&self, other: &Self) -> Ordering {
404        self.as_str().cmp(other.as_str())
405    }
406}
407
408impl PartialOrd for ResolvedPicoStr {
409    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
410        Some(self.cmp(other))
411    }
412}
413
414impl Hash for ResolvedPicoStr {
415    fn hash<H: Hasher>(&self, state: &mut H) {
416        self.as_str().hash(state);
417    }
418}
419
420/// The error when a string could not be interned at compile time. Because the
421/// normal formatting machinery is not available at compile time, just producing
422/// the message is a bit involved ...
423#[track_caller]
424const fn failed_to_compile_time_intern(
425    error: bitcode::EncodingError,
426    string: &'static str,
427) -> ! {
428    const CAPACITY: usize = 512;
429    const fn push((buf, i): &mut ([u8; CAPACITY], usize), s: &str) {
430        let mut k = 0;
431        while k < s.len() && *i < buf.len() {
432            buf[*i] = s.as_bytes()[k];
433            k += 1;
434            *i += 1;
435        }
436    }
437
438    let mut dest = ([0; CAPACITY], 0);
439    push(&mut dest, "failed to compile-time intern string \"");
440    push(&mut dest, string);
441    push(&mut dest, "\". ");
442    push(&mut dest, error.message());
443    push(&mut dest, ". you can add an exception to ");
444    push(&mut dest, file!());
445    push(&mut dest, " to intern longer strings.");
446
447    let (slice, _) = dest.0.split_at(dest.1);
448    let Ok(message) = std::str::from_utf8(slice) else { panic!() };
449
450    panic!("{}", message);
451}
452
453#[cfg(test)]
454mod tests {
455    use super::*;
456
457    #[track_caller]
458    fn roundtrip(s: &str) {
459        assert_eq!(PicoStr::intern(s).resolve().as_str(), s);
460    }
461
462    #[test]
463    fn test_pico_str() {
464        // Test comparing compile-time and runtime-interned bitcode string.
465        const H1: PicoStr = PicoStr::constant("h1");
466        assert_eq!(H1, PicoStr::intern("h1"));
467        assert_eq!(H1.resolve().as_str(), "h1");
468
469        // Test comparing compile-time and runtime-interned exception.
470        const DISC: PicoStr = PicoStr::constant("discretionary-ligatures");
471        assert_eq!(DISC, PicoStr::intern("discretionary-ligatures"));
472        assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures");
473
474        // Test just roundtripping some strings.
475        roundtrip("");
476        roundtrip("hi");
477        roundtrip("∆@<hi-10_");
478        roundtrip("you");
479        roundtrip("discretionary-ligatures");
480    }
481
482    /// Ensures that none of the exceptions is bitcode-encodable.
483    #[test]
484    fn test_exceptions_not_bitcode_encodable() {
485        for s in exceptions::LIST {
486            assert!(
487                bitcode::encode(s).is_err(),
488                "{s:?} can be encoded with bitcode and should not be an exception"
489            );
490        }
491    }
492
493    /// Ensures that the exceptions are sorted.
494    #[test]
495    fn test_exceptions_sorted() {
496        for group in exceptions::LIST.windows(2) {
497            assert!(group[0] < group[1], "{group:?} are out of order");
498        }
499    }
500
501    /// Ensures that all exceptions can be found.
502    #[test]
503    fn test_exception_find() {
504        for (i, s) in exceptions::LIST.iter().enumerate() {
505            assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}");
506        }
507        assert_eq!(exceptions::get("a"), None);
508        assert_eq!(exceptions::get("another-"), None);
509        assert_eq!(exceptions::get("z"), None);
510    }
511}