typst_utils/
pico.rs

1use std::borrow::Borrow;
2use std::cmp::Ordering;
3use std::collections::HashMap;
4use std::fmt::{self, Debug, Display, Formatter};
5use std::hash::{Hash, Hasher};
6use std::num::NonZeroU64;
7use std::ops::Deref;
8use std::sync::{LazyLock, RwLock};
9
10/// Marks a number as a bitcode encoded `PicoStr``.
11const MARKER: u64 = 1 << 63;
12
13/// The global runtime string interner.
14static INTERNER: LazyLock<RwLock<Interner>> =
15    LazyLock::new(|| RwLock::new(Interner { seen: HashMap::new(), strings: Vec::new() }));
16
17/// A string interner.
18struct Interner {
19    seen: HashMap<&'static str, PicoStr>,
20    strings: Vec<&'static str>,
21}
22
23/// An interned string representation that is cheap to copy and hash, but more
24/// expensive to access.
25///
26/// This type takes up 8 bytes and is copyable and null-optimized (i.e.
27/// `Option<PicoStr>` also takes 8 bytes).
28///
29/// Supports compile-time string interning via [`PicoStr::constant`] in two
30/// flavors:
31/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4',
32///   and '-' are stored inline in the number
33/// - Other strings _can_ be compile-time interned the same way, but must first
34///   be added to the list in `exceptions::LIST`.
35///
36/// No such restrictions apply at runtime (via [`PicoStr::intern`]).
37#[derive(Copy, Clone, Eq, PartialEq, Hash)]
38pub struct PicoStr(NonZeroU64);
39
40impl PicoStr {
41    /// Intern a string at runtime.
42    pub fn intern(string: &str) -> PicoStr {
43        // Try to use bitcode or exception representations.
44        if let Ok(value) = PicoStr::try_constant(string) {
45            return value;
46        }
47
48        // Try to find an existing entry that we can reuse.
49        //
50        // We could check with just a read lock, but if the string is not yet
51        // present, we would then need to recheck after acquiring a write lock,
52        // which is probably not worth it.
53        let mut interner = INTERNER.write().unwrap();
54        if let Some(&id) = interner.seen.get(string) {
55            return id;
56        }
57
58        // Create a new entry forever by leaking the string. PicoStr is only
59        // used for strings that aren't created en masse, so it is okay.
60        let num = exceptions::LIST.len() + interner.strings.len() + 1;
61        let id = Self(NonZeroU64::new(num as u64).unwrap());
62        let string = Box::leak(string.to_string().into_boxed_str());
63        interner.seen.insert(string, id);
64        interner.strings.push(string);
65        id
66    }
67
68    /// Creates a compile-time constant `PicoStr`.
69    ///
70    /// Should only be used in const contexts because it can panic.
71    #[track_caller]
72    pub const fn constant(string: &'static str) -> PicoStr {
73        match PicoStr::try_constant(string) {
74            Ok(value) => value,
75            Err(err) => panic!("{}", err.message()),
76        }
77    }
78
79    /// Try to intern a string statically at compile-time.
80    pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> {
81        // Try to encode with bitcode.
82        let value = match bitcode::encode(string) {
83            // Store representation marker in high bit. Bitcode doesn't use
84            // 4 high bits.
85            Ok(v) => v | MARKER,
86
87            // If that fails, try to use the exception list.
88            Err(e) => {
89                if let Some(i) = exceptions::get(string) {
90                    // Offset by one to make it non-zero.
91                    i as u64 + 1
92                } else {
93                    return Err(e);
94                }
95            }
96        };
97
98        match NonZeroU64::new(value) {
99            Some(value) => Ok(Self(value)),
100            None => unreachable!(),
101        }
102    }
103
104    /// Resolve to a decoded string.
105    pub fn resolve(self) -> ResolvedPicoStr {
106        // If high bit is set, this is a bitcode-encoded string.
107        let value = self.0.get();
108        if value & MARKER != 0 {
109            return bitcode::decode(value & !MARKER);
110        }
111
112        let index = (value - 1) as usize;
113        let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) {
114            INTERNER.read().unwrap().strings[runtime]
115        } else {
116            exceptions::LIST[index]
117        };
118
119        ResolvedPicoStr(Repr::Static(string))
120    }
121}
122
123impl Debug for PicoStr {
124    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
125        Debug::fmt(self.resolve().as_str(), f)
126    }
127}
128
129/// A 5-bit encoding for strings with length up two 12 that are restricted to a
130/// specific charset.
131mod bitcode {
132    use super::{Repr, ResolvedPicoStr};
133
134    /// Maps from encodings to their bytes.
135    const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234";
136
137    /// Maps from bytes to their encodings.
138    const ENCODE: &[u8; 256] = &{
139        let mut map = [0; 256];
140        let mut i = 0;
141        while i < DECODE.len() {
142            map[DECODE[i] as usize] = i as u8;
143            i += 1;
144        }
145        map
146    };
147
148    /// Try to encode a string as a 64-bit integer.
149    pub const fn encode(string: &str) -> Result<u64, EncodingError> {
150        let bytes = string.as_bytes();
151
152        if bytes.len() > 12 {
153            return Err(EncodingError::TooLong);
154        }
155
156        let mut num: u64 = 0;
157        let mut i = bytes.len();
158        while i > 0 {
159            i -= 1;
160            let b = bytes[i];
161            let v = ENCODE[b as usize];
162            if v == 0 {
163                return Err(EncodingError::BadChar);
164            }
165            num <<= 5;
166            num |= v as u64;
167        }
168
169        Ok(num)
170    }
171
172    /// Decode the string for a 64-bit integer.
173    pub const fn decode(mut value: u64) -> ResolvedPicoStr {
174        let mut buf = [0; 12];
175        let mut len = 0;
176
177        while value != 0 {
178            let v = value & 0b11111;
179            buf[len as usize] = DECODE[v as usize];
180            len += 1;
181            value >>= 5;
182        }
183
184        ResolvedPicoStr(Repr::Inline(buf, len))
185    }
186
187    /// A failure during compile-time interning.
188    pub enum EncodingError {
189        TooLong,
190        BadChar,
191    }
192
193    impl EncodingError {
194        pub const fn message(&self) -> &'static str {
195            match self {
196                Self::TooLong => {
197                    "the maximum auto-internible string length is 12. \
198                     you can add an exception to typst-utils/src/pico.rs \
199                     to intern longer strings."
200                }
201                Self::BadChar => {
202                    "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'. \
203                     you can add an exception to typst-utils/src/pico.rs \
204                     to intern other strings."
205                }
206            }
207        }
208    }
209}
210
211/// Compile-time interned strings that cannot be encoded with `bitcode`.
212mod exceptions {
213    use std::cmp::Ordering;
214
215    /// A global list of non-bitcode-encodable compile-time internible strings.
216    pub const LIST: &[&str] = &[
217        "cjk-latin-spacing",
218        "discretionary-ligatures",
219        "h5",
220        "h6",
221        "historical-ligatures",
222        "number-clearance",
223        "number-margin",
224        "numbering-scope",
225        "page-numbering",
226        "par-line-marker",
227        "transparentize",
228    ];
229
230    /// Try to find the index of an exception if it exists.
231    pub const fn get(string: &str) -> Option<usize> {
232        let mut lo = 0;
233        let mut hi = LIST.len();
234        while lo < hi {
235            let mid = (lo + hi) / 2;
236            match strcmp(string, LIST[mid]) {
237                Ordering::Less => hi = mid,
238                Ordering::Greater => lo = mid + 1,
239                Ordering::Equal => return Some(mid),
240            }
241        }
242        None
243    }
244
245    /// Compare two strings.
246    const fn strcmp(a: &str, b: &str) -> Ordering {
247        let a = a.as_bytes();
248        let b = b.as_bytes();
249        let l = min(a.len(), b.len());
250
251        let mut i = 0;
252        while i < l {
253            if a[i] == b[i] {
254                i += 1;
255            } else if a[i] < b[i] {
256                return Ordering::Less;
257            } else {
258                return Ordering::Greater;
259            }
260        }
261
262        if i < b.len() {
263            Ordering::Less
264        } else if i < a.len() {
265            Ordering::Greater
266        } else {
267            Ordering::Equal
268        }
269    }
270
271    /// Determine the minimum of two integers.
272    const fn min(a: usize, b: usize) -> usize {
273        if a < b {
274            a
275        } else {
276            b
277        }
278    }
279}
280
281/// This is returned by [`PicoStr::resolve`].
282///
283/// Dereferences to a `str`.
284pub struct ResolvedPicoStr(Repr);
285
286/// Representation of a resolved string.
287enum Repr {
288    Inline([u8; 12], u8),
289    Static(&'static str),
290}
291
292impl ResolvedPicoStr {
293    /// Retrieve the underlying string.
294    pub fn as_str(&self) -> &str {
295        match &self.0 {
296            Repr::Inline(buf, len) => unsafe {
297                std::str::from_utf8_unchecked(&buf[..*len as usize])
298            },
299            Repr::Static(s) => s,
300        }
301    }
302}
303
304impl Debug for ResolvedPicoStr {
305    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
306        Debug::fmt(self.as_str(), f)
307    }
308}
309
310impl Display for ResolvedPicoStr {
311    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
312        Display::fmt(self.as_str(), f)
313    }
314}
315
316impl Deref for ResolvedPicoStr {
317    type Target = str;
318
319    fn deref(&self) -> &Self::Target {
320        self.as_str()
321    }
322}
323
324impl AsRef<str> for ResolvedPicoStr {
325    fn as_ref(&self) -> &str {
326        self.as_str()
327    }
328}
329
330impl Borrow<str> for ResolvedPicoStr {
331    fn borrow(&self) -> &str {
332        self.as_str()
333    }
334}
335
336impl Eq for ResolvedPicoStr {}
337
338impl PartialEq for ResolvedPicoStr {
339    fn eq(&self, other: &Self) -> bool {
340        self.as_str().eq(other.as_str())
341    }
342}
343
344impl Ord for ResolvedPicoStr {
345    fn cmp(&self, other: &Self) -> Ordering {
346        self.as_str().cmp(other.as_str())
347    }
348}
349
350impl PartialOrd for ResolvedPicoStr {
351    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
352        Some(self.cmp(other))
353    }
354}
355
356impl Hash for ResolvedPicoStr {
357    fn hash<H: Hasher>(&self, state: &mut H) {
358        self.as_str().hash(state);
359    }
360}
361
362#[cfg(test)]
363mod tests {
364    use super::*;
365
366    #[track_caller]
367    fn roundtrip(s: &str) {
368        assert_eq!(PicoStr::intern(s).resolve().as_str(), s);
369    }
370
371    #[test]
372    fn test_pico_str() {
373        // Test comparing compile-time and runtime-interned bitcode string.
374        const H1: PicoStr = PicoStr::constant("h1");
375        assert_eq!(H1, PicoStr::intern("h1"));
376        assert_eq!(H1.resolve().as_str(), "h1");
377
378        // Test comparing compile-time and runtime-interned exception.
379        const DISC: PicoStr = PicoStr::constant("discretionary-ligatures");
380        assert_eq!(DISC, PicoStr::intern("discretionary-ligatures"));
381        assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures");
382
383        // Test just roundtripping some strings.
384        roundtrip("");
385        roundtrip("hi");
386        roundtrip("∆@<hi-10_");
387        roundtrip("you");
388        roundtrip("discretionary-ligatures");
389    }
390
391    /// Ensures that none of the exceptions is bitcode-encodable.
392    #[test]
393    fn test_exceptions_not_bitcode_encodable() {
394        for s in exceptions::LIST {
395            assert!(
396                bitcode::encode(s).is_err(),
397                "{s:?} can be encoded with bitcode and should not be an exception"
398            );
399        }
400    }
401
402    /// Ensures that the exceptions are sorted.
403    #[test]
404    fn test_exceptions_sorted() {
405        for group in exceptions::LIST.windows(2) {
406            assert!(group[0] < group[1], "{group:?} are out of order");
407        }
408    }
409
410    /// Ensures that all exceptions can be found.
411    #[test]
412    fn test_exception_find() {
413        for (i, s) in exceptions::LIST.iter().enumerate() {
414            assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}");
415        }
416        assert_eq!(exceptions::get("a"), None);
417        assert_eq!(exceptions::get("another-"), None);
418        assert_eq!(exceptions::get("z"), None);
419    }
420}