Skip to main content

typst_utils/
pico.rs

1use std::borrow::Borrow;
2use std::cmp::Ordering;
3use std::fmt::{self, Debug, Display, Formatter};
4use std::hash::{Hash, Hasher};
5use std::num::NonZeroU64;
6use std::ops::Deref;
7use std::sync::{LazyLock, RwLock};
8
9use rustc_hash::FxHashMap;
10
11/// Marks a number as a bitcode encoded `PicoStr``.
12const MARKER: u64 = 1 << 63;
13
14/// The global runtime string interner.
15static INTERNER: LazyLock<RwLock<Interner>> = LazyLock::new(|| {
16    RwLock::new(Interner { seen: FxHashMap::default(), strings: Vec::new() })
17});
18
19/// A string interner.
20struct Interner {
21    seen: FxHashMap<&'static str, PicoStr>,
22    strings: Vec<&'static str>,
23}
24
25/// An interned string representation that is cheap to copy and hash, but more
26/// expensive to access.
27///
28/// This type takes up 8 bytes and is copyable and null-optimized (i.e.
29/// `Option<PicoStr>` also takes 8 bytes).
30///
31/// Supports compile-time string interning via [`PicoStr::constant`] in two
32/// flavors:
33/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4',
34///   and '-' are stored inline in the number
35/// - Other strings _can_ be compile-time interned the same way, but must first
36///   be added to the list in `exceptions::LIST`.
37///
38/// No such restrictions apply at runtime (via [`PicoStr::intern`]).
39#[derive(Copy, Clone, Eq, PartialEq, Hash)]
40pub struct PicoStr(NonZeroU64);
41
42impl PicoStr {
43    /// Intern a string at runtime.
44    pub fn intern(string: &str) -> PicoStr {
45        // Try to use bitcode or exception representations.
46        if let Ok(value) = PicoStr::try_constant(string) {
47            return value;
48        }
49
50        // Try to find an existing entry that we can reuse.
51        //
52        // We could check with just a read lock, but if the string is not yet
53        // present, we would then need to recheck after acquiring a write lock,
54        // which is probably not worth it.
55        let mut interner = INTERNER.write().unwrap();
56        if let Some(&id) = interner.seen.get(string) {
57            return id;
58        }
59
60        // Create a new entry forever by leaking the string. PicoStr is only
61        // used for strings that aren't created en masse, so it is okay.
62        let num = exceptions::LIST.len() + interner.strings.len() + 1;
63        let id = Self(NonZeroU64::new(num as u64).unwrap());
64        let string = Box::leak(string.to_string().into_boxed_str());
65        interner.seen.insert(string, id);
66        interner.strings.push(string);
67        id
68    }
69
70    /// Try to create a `PicoStr`, but don't intern it if it does not exist yet.
71    ///
72    /// This is useful to try to compare against one or multiple `PicoStr`
73    /// without interning needlessly.
74    ///
75    /// Will always return `Some(_)` if the string can be represented inline.
76    pub fn get(string: &str) -> Option<PicoStr> {
77        // Try to use bitcode or exception representations.
78        if let Ok(value) = PicoStr::try_constant(string) {
79            return Some(value);
80        }
81
82        // Try to find an existing entry that we can reuse.
83        let interner = INTERNER.read().unwrap();
84        interner.seen.get(string).copied()
85    }
86
87    /// Creates a compile-time constant `PicoStr`.
88    ///
89    /// Should only be used in const contexts because it can panic.
90    #[track_caller]
91    pub const fn constant(string: &'static str) -> PicoStr {
92        match PicoStr::try_constant(string) {
93            Ok(value) => value,
94            Err(err) => failed_to_compile_time_intern(err, string),
95        }
96    }
97
98    /// Try to intern a string statically at compile-time.
99    pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> {
100        // Try to encode with bitcode.
101        let value = match bitcode::encode(string) {
102            // Store representation marker in high bit. Bitcode doesn't use
103            // 4 high bits.
104            Ok(v) => v | MARKER,
105
106            // If that fails, try to use the exception list.
107            Err(e) => {
108                if let Some(i) = exceptions::get(string) {
109                    // Offset by one to make it non-zero.
110                    i as u64 + 1
111                } else {
112                    return Err(e);
113                }
114            }
115        };
116
117        Ok(Self(NonZeroU64::new(value).unwrap()))
118    }
119
120    /// Resolve to a decoded string.
121    pub fn resolve(self) -> ResolvedPicoStr {
122        // If high bit is set, this is a bitcode-encoded string.
123        let value = self.0.get();
124        if value & MARKER != 0 {
125            return bitcode::decode(value & !MARKER);
126        }
127
128        let index = (value - 1) as usize;
129        let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) {
130            INTERNER.read().unwrap().strings[runtime]
131        } else {
132            exceptions::LIST[index]
133        };
134
135        ResolvedPicoStr(ResolvedPicoStrInner::Static(string))
136    }
137}
138
139impl Debug for PicoStr {
140    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
141        Debug::fmt(self.resolve().as_str(), f)
142    }
143}
144
145/// A 5-bit encoding for strings with length up two 12 that are restricted to a
146/// specific charset.
147mod bitcode {
148    use super::{ResolvedPicoStr, ResolvedPicoStrInner};
149
150    /// Maps from encodings to their bytes.
151    const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234";
152
153    /// Maps from bytes to their encodings.
154    const ENCODE: &[u8; 256] = &{
155        let mut map = [0; 256];
156        let mut i = 0;
157        while i < DECODE.len() {
158            map[DECODE[i] as usize] = i as u8;
159            i += 1;
160        }
161        map
162    };
163
164    /// Try to encode a string as a 64-bit integer.
165    pub const fn encode(string: &str) -> Result<u64, EncodingError> {
166        let bytes = string.as_bytes();
167
168        if bytes.len() > 12 {
169            return Err(EncodingError::TooLong);
170        }
171
172        let mut num: u64 = 0;
173        let mut i = bytes.len();
174        while i > 0 {
175            i -= 1;
176            let b = bytes[i];
177            let v = ENCODE[b as usize];
178            if v == 0 {
179                return Err(EncodingError::BadChar);
180            }
181            num <<= 5;
182            num |= v as u64;
183        }
184
185        Ok(num)
186    }
187
188    /// Decode the string for a 64-bit integer.
189    pub const fn decode(mut value: u64) -> ResolvedPicoStr {
190        let mut buf = [0; 12];
191        let mut len = 0;
192
193        while value != 0 {
194            let v = value & 0b11111;
195            buf[len as usize] = DECODE[v as usize];
196            len += 1;
197            value >>= 5;
198        }
199
200        ResolvedPicoStr(ResolvedPicoStrInner::Inline(buf, len))
201    }
202
203    /// A failure during compile-time interning.
204    pub enum EncodingError {
205        TooLong,
206        BadChar,
207    }
208
209    impl EncodingError {
210        pub const fn message(&self) -> &'static str {
211            match self {
212                Self::TooLong => "the maximum auto-internible string length is 12",
213                Self::BadChar => {
214                    "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'"
215                }
216            }
217        }
218    }
219}
220
221/// Compile-time interned strings that cannot be encoded with `bitcode`.
222mod exceptions {
223    use std::cmp::Ordering;
224
225    /// A global list of non-bitcode-encodable compile-time internible strings.
226    ///
227    /// Must be sorted.
228    pub const LIST: &[&str] = &[
229        "accept-charset",
230        "allowfullscreen",
231        "annotation-xml",
232        "aria-activedescendant",
233        "aria-autocomplete",
234        "aria-colcount",
235        "aria-colindex",
236        "aria-controls",
237        "aria-describedby",
238        "aria-disabled",
239        "aria-dropeffect",
240        "aria-errormessage",
241        "aria-expanded",
242        "aria-haspopup",
243        "aria-keyshortcuts",
244        "aria-labelledby",
245        "aria-multiline",
246        "aria-multiselectable",
247        "aria-orientation",
248        "aria-placeholder",
249        "aria-posinset",
250        "aria-readonly",
251        "aria-relevant",
252        "aria-required",
253        "aria-roledescription",
254        "aria-rowcount",
255        "aria-rowindex",
256        "aria-selected",
257        "aria-valuemax",
258        "aria-valuemin",
259        "aria-valuenow",
260        "aria-valuetext",
261        "autocapitalize",
262        "cjk-latin-spacing",
263        "contenteditable",
264        "discretionary-ligatures",
265        "fetchpriority",
266        "formnovalidate",
267        "h5",
268        "h6",
269        "historical-ligatures",
270        "linethickness",
271        "mmultiscripts",
272        "movablelimits",
273        "number-clearance",
274        "number-margin",
275        "numbering-scope",
276        "onbeforeprint",
277        "onbeforeunload",
278        "onlanguagechange",
279        "onmessageerror",
280        "onrejectionhandled",
281        "onunhandledrejection",
282        "page-numbering",
283        "par-line-marker",
284        "popovertarget",
285        "popovertargetaction",
286        "referrerpolicy",
287        "shadowrootclonable",
288        "shadowrootcustomelementregistry",
289        "shadowrootdelegatesfocus",
290        "shadowrootmode",
291        "shadowrootserializable",
292        "transparentize",
293        "writingsuggestions",
294    ];
295
296    /// Try to find the index of an exception if it exists.
297    pub const fn get(string: &str) -> Option<usize> {
298        let mut lo = 0;
299        let mut hi = LIST.len();
300        while lo < hi {
301            let mid = (lo + hi) / 2;
302            match strcmp(string, LIST[mid]) {
303                Ordering::Less => hi = mid,
304                Ordering::Greater => lo = mid + 1,
305                Ordering::Equal => return Some(mid),
306            }
307        }
308        None
309    }
310
311    /// Compare two strings.
312    const fn strcmp(a: &str, b: &str) -> Ordering {
313        let a = a.as_bytes();
314        let b = b.as_bytes();
315        let l = min(a.len(), b.len());
316
317        let mut i = 0;
318        while i < l {
319            if a[i] == b[i] {
320                i += 1;
321            } else if a[i] < b[i] {
322                return Ordering::Less;
323            } else {
324                return Ordering::Greater;
325            }
326        }
327
328        if i < b.len() {
329            Ordering::Less
330        } else if i < a.len() {
331            Ordering::Greater
332        } else {
333            Ordering::Equal
334        }
335    }
336
337    /// Determine the minimum of two integers.
338    const fn min(a: usize, b: usize) -> usize {
339        if a < b { a } else { b }
340    }
341}
342
343/// This is returned by [`PicoStr::resolve`].
344///
345/// Dereferences to a `str`.
346#[derive(Copy, Clone)]
347pub struct ResolvedPicoStr(ResolvedPicoStrInner);
348
349/// The internal representation of a [`ResolvedPicoStr`].
350#[derive(Copy, Clone)]
351enum ResolvedPicoStrInner {
352    Inline([u8; 12], u8),
353    Static(&'static str),
354}
355
356impl ResolvedPicoStr {
357    /// Retrieve the underlying string.
358    pub fn as_str(&self) -> &str {
359        match &self.0 {
360            ResolvedPicoStrInner::Inline(buf, len) => unsafe {
361                std::str::from_utf8_unchecked(&buf[..*len as usize])
362            },
363            ResolvedPicoStrInner::Static(s) => s,
364        }
365    }
366}
367
368impl Debug for ResolvedPicoStr {
369    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
370        Debug::fmt(self.as_str(), f)
371    }
372}
373
374impl Display for ResolvedPicoStr {
375    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
376        Display::fmt(self.as_str(), f)
377    }
378}
379
380impl Deref for ResolvedPicoStr {
381    type Target = str;
382
383    fn deref(&self) -> &Self::Target {
384        self.as_str()
385    }
386}
387
388impl AsRef<str> for ResolvedPicoStr {
389    fn as_ref(&self) -> &str {
390        self.as_str()
391    }
392}
393
394impl Borrow<str> for ResolvedPicoStr {
395    fn borrow(&self) -> &str {
396        self.as_str()
397    }
398}
399
400impl Eq for ResolvedPicoStr {}
401
402impl PartialEq for ResolvedPicoStr {
403    fn eq(&self, other: &Self) -> bool {
404        self.as_str().eq(other.as_str())
405    }
406}
407
408impl Ord for ResolvedPicoStr {
409    fn cmp(&self, other: &Self) -> Ordering {
410        self.as_str().cmp(other.as_str())
411    }
412}
413
414impl PartialOrd for ResolvedPicoStr {
415    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
416        Some(self.cmp(other))
417    }
418}
419
420impl Hash for ResolvedPicoStr {
421    fn hash<H: Hasher>(&self, state: &mut H) {
422        self.as_str().hash(state);
423    }
424}
425
426/// The error when a string could not be interned at compile time. Because the
427/// normal formatting machinery is not available at compile time, just producing
428/// the message is a bit involved ...
429#[track_caller]
430const fn failed_to_compile_time_intern(
431    error: bitcode::EncodingError,
432    string: &'static str,
433) -> ! {
434    const CAPACITY: usize = 512;
435    const fn push((buf, i): &mut ([u8; CAPACITY], usize), s: &str) {
436        let mut k = 0;
437        while k < s.len() && *i < buf.len() {
438            buf[*i] = s.as_bytes()[k];
439            k += 1;
440            *i += 1;
441        }
442    }
443
444    let mut dest = ([0; CAPACITY], 0);
445    push(&mut dest, "failed to compile-time intern string \"");
446    push(&mut dest, string);
447    push(&mut dest, "\". ");
448    push(&mut dest, error.message());
449    push(&mut dest, ". you can add an exception to ");
450    push(&mut dest, file!());
451    push(&mut dest, " to intern longer strings.");
452
453    let (slice, _) = dest.0.split_at(dest.1);
454    let Ok(message) = std::str::from_utf8(slice) else { panic!() };
455
456    panic!("{}", message);
457}
458
459#[cfg(test)]
460mod tests {
461    use super::*;
462
463    #[track_caller]
464    fn roundtrip(s: &str) {
465        assert_eq!(PicoStr::intern(s).resolve().as_str(), s);
466    }
467
468    #[test]
469    fn test_pico_str() {
470        // Test comparing compile-time and runtime-interned bitcode string.
471        const H1: PicoStr = PicoStr::constant("h1");
472        assert_eq!(H1, PicoStr::intern("h1"));
473        assert_eq!(H1.resolve().as_str(), "h1");
474
475        // Test comparing compile-time and runtime-interned exception.
476        const DISC: PicoStr = PicoStr::constant("discretionary-ligatures");
477        assert_eq!(DISC, PicoStr::intern("discretionary-ligatures"));
478        assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures");
479
480        // Test just roundtripping some strings.
481        roundtrip("");
482        roundtrip("hi");
483        roundtrip("∆@<hi-10_");
484        roundtrip("you");
485        roundtrip("discretionary-ligatures");
486    }
487
488    /// Ensures that none of the exceptions is bitcode-encodable.
489    #[test]
490    fn test_exceptions_not_bitcode_encodable() {
491        for s in exceptions::LIST {
492            assert!(
493                bitcode::encode(s).is_err(),
494                "{s:?} can be encoded with bitcode and should not be an exception"
495            );
496        }
497    }
498
499    /// Ensures that the exceptions are sorted.
500    #[test]
501    fn test_exceptions_sorted() {
502        for group in exceptions::LIST.windows(2) {
503            assert!(group[0] < group[1], "{group:?} are out of order");
504        }
505    }
506
507    /// Ensures that all exceptions can be found.
508    #[test]
509    fn test_exception_find() {
510        for (i, s) in exceptions::LIST.iter().enumerate() {
511            assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}");
512        }
513        assert_eq!(exceptions::get("a"), None);
514        assert_eq!(exceptions::get("another-"), None);
515        assert_eq!(exceptions::get("z"), None);
516    }
517}