typst_library/text/font/
book.rs

1use std::cmp::Reverse;
2use std::collections::BTreeMap;
3use std::fmt::{self, Debug, Formatter};
4
5use serde::{Deserialize, Serialize};
6use ttf_parser::{PlatformId, Tag, name_id};
7use unicode_segmentation::UnicodeSegmentation;
8
9use super::exceptions::find_exception;
10use crate::text::{
11    Font, FontStretch, FontStyle, FontVariant, FontWeight, is_default_ignorable,
12};
13
/// Metadata about a collection of fonts.
///
/// Each font is identified by its index, which is its position in the order
/// in which the fonts were pushed into the book.
#[derive(Debug, Default, Clone, Hash)]
pub struct FontBook {
    /// Maps from lowercased family names to the indices (into `infos`) of the
    /// fonts that belong to that family.
    families: BTreeMap<String, Vec<usize>>,
    /// Metadata about each font in the collection, addressed by font index.
    infos: Vec<FontInfo>,
}
22
23impl FontBook {
24    /// Create a new, empty font book.
25    pub fn new() -> Self {
26        Self { families: BTreeMap::new(), infos: vec![] }
27    }
28
29    /// Create a font book from a collection of font infos.
30    pub fn from_infos(infos: impl IntoIterator<Item = FontInfo>) -> Self {
31        let mut book = Self::new();
32        for info in infos {
33            book.push(info);
34        }
35        book
36    }
37
38    /// Create a font book for a collection of fonts.
39    pub fn from_fonts<'a>(fonts: impl IntoIterator<Item = &'a Font>) -> Self {
40        Self::from_infos(fonts.into_iter().map(|font| font.info().clone()))
41    }
42
43    /// Insert metadata into the font book.
44    pub fn push(&mut self, info: FontInfo) {
45        let index = self.infos.len();
46        let family = info.family.to_lowercase();
47        self.families.entry(family).or_default().push(index);
48        self.infos.push(info);
49    }
50
51    /// Get the font info for the given index.
52    pub fn info(&self, index: usize) -> Option<&FontInfo> {
53        self.infos.get(index)
54    }
55
56    /// Returns true if the book contains a font family with the given name.
57    pub fn contains_family(&self, family: &str) -> bool {
58        self.families.contains_key(family)
59    }
60
61    /// An ordered iterator over all font families this book knows and details
62    /// about the fonts that are part of them.
63    pub fn families(
64        &self,
65    ) -> impl Iterator<Item = (&str, impl Iterator<Item = &FontInfo>)> + '_ {
66        // Since the keys are lowercased, we instead use the family field of the
67        // first face's info.
68        self.families.values().map(|ids| {
69            let family = self.infos[ids[0]].family.as_str();
70            let infos = ids.iter().map(|&id| &self.infos[id]);
71            (family, infos)
72        })
73    }
74
75    /// Try to find a font from the given `family` that matches the given
76    /// `variant` as closely as possible.
77    ///
78    /// The `family` should be all lowercase.
79    pub fn select(&self, family: &str, variant: FontVariant) -> Option<usize> {
80        let ids = self.families.get(family)?;
81        self.find_best_variant(None, variant, ids.iter().copied())
82    }
83
84    /// Iterate over all variants of a family.
85    pub fn select_family(&self, family: &str) -> impl Iterator<Item = usize> + '_ {
86        self.families
87            .get(family)
88            .map(|vec| vec.as_slice())
89            .unwrap_or_default()
90            .iter()
91            .copied()
92    }
93
94    /// Try to find and load a fallback font that
95    /// - is as close as possible to the font `like` (if any)
96    /// - is as close as possible to the given `variant`
97    /// - is suitable for shaping the given `text`
98    pub fn select_fallback(
99        &self,
100        like: Option<&FontInfo>,
101        variant: FontVariant,
102        text: &str,
103    ) -> Option<usize> {
104        // Find the fonts that contain the text's first non-space and
105        // non-ignorable char ...
106        let c = text
107            .chars()
108            .find(|&c| !c.is_whitespace() && !is_default_ignorable(c))?;
109
110        let ids = self
111            .infos
112            .iter()
113            .enumerate()
114            .filter(|(_, info)| info.coverage.contains(c as u32))
115            .map(|(index, _)| index);
116
117        // ... and find the best variant among them.
118        self.find_best_variant(like, variant, ids)
119    }
120
121    /// Find the font in the passed iterator that
122    /// - is closest to the font `like` (if any)
123    /// - is closest to the given `variant`
124    ///
125    /// To do that we compute a key for all variants and select the one with the
126    /// minimal key. This key prioritizes:
127    /// - If `like` is some other font:
128    ///   - Are both fonts (not) monospaced?
129    ///   - Do both fonts (not) have serifs?
130    ///   - How many words do the families share in their prefix? E.g. "Noto
131    ///     Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex
132    ///     Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic"
133    ///     if `like` is "Noto Sans". In case there are two equally good
134    ///     matches, we prefer the shorter one because it is less special (e.g.
135    ///     if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto
136    ///     Sans CJK HK".)
137    /// - The style (normal / italic / oblique). If we want italic or oblique
138    ///   but it doesn't exist, the other one of the two is still better than
139    ///   normal.
140    /// - The absolute distance to the target stretch.
141    /// - The absolute distance to the target weight.
142    fn find_best_variant(
143        &self,
144        like: Option<&FontInfo>,
145        variant: FontVariant,
146        ids: impl IntoIterator<Item = usize>,
147    ) -> Option<usize> {
148        let mut best = None;
149        let mut best_key = None;
150
151        for id in ids {
152            let current = &self.infos[id];
153            let key = (
154                like.map(|like| {
155                    (
156                        current.flags.contains(FontFlags::MONOSPACE)
157                            != like.flags.contains(FontFlags::MONOSPACE),
158                        current.flags.contains(FontFlags::SERIF)
159                            != like.flags.contains(FontFlags::SERIF),
160                        Reverse(shared_prefix_words(&current.family, &like.family)),
161                        current.family.len(),
162                    )
163                }),
164                current.variant.style.distance(variant.style),
165                current.variant.stretch.distance(variant.stretch),
166                current.variant.weight.distance(variant.weight),
167            );
168
169            if best_key.is_none_or(|b| key < b) {
170                best = Some(id);
171                best_key = Some(key);
172            }
173        }
174
175        best
176    }
177}
178
/// Properties of a single font.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct FontInfo {
    /// The typographic font family this font is part of.
    pub family: String,
    /// Properties that distinguish this font from other fonts in the same
    /// family (style, weight and stretch).
    pub variant: FontVariant,
    /// Boolean characteristics of the font, see [`FontFlags`].
    pub flags: FontFlags,
    /// The set of unicode codepoints the font covers.
    pub coverage: Coverage,
}
192
bitflags::bitflags! {
    /// Bitflags describing characteristics of a font.
    ///
    /// Serialized transparently as the underlying `u32`.
    #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
    #[derive(Serialize, Deserialize)]
    #[serde(transparent)]
    pub struct FontFlags: u32 {
        /// All glyphs have the same width.
        const MONOSPACE = 1 << 0;
        /// Glyphs have short strokes at their stems.
        const SERIF = 1 << 1;
        /// Font face has a MATH table.
        const MATH = 1 << 2;
        /// Font face has an fvar table, i.e. it is a variable font.
        const VARIABLE = 1 << 3;
    }
}
209
210impl FontInfo {
211    /// Compute metadata for font at the `index` of the given data.
212    pub fn new(data: &[u8], index: u32) -> Option<Self> {
213        let ttf = ttf_parser::Face::parse(data, index).ok()?;
214        Self::from_ttf(&ttf)
215    }
216
217    /// Compute metadata for all fonts in the given data.
218    pub fn iter(data: &[u8]) -> impl Iterator<Item = FontInfo> + '_ {
219        let count = ttf_parser::fonts_in_collection(data).unwrap_or(1);
220        (0..count).filter_map(move |index| Self::new(data, index))
221    }
222
223    /// Compute metadata for a single ttf-parser face.
224    pub(super) fn from_ttf(ttf: &ttf_parser::Face) -> Option<Self> {
225        let ps_name = find_name(ttf, name_id::POST_SCRIPT_NAME);
226        let exception = ps_name.as_deref().and_then(find_exception);
227        // We cannot use Name ID 16 "Typographic Family", because for some
228        // fonts it groups together more than just Style / Weight / Stretch
229        // variants (e.g. Display variants of Noto fonts) and then some
230        // variants become inaccessible from Typst. And even though the
231        // fsSelection bit WWS should help us decide whether that is the
232        // case, it's wrong for some fonts (e.g. for certain variants of "Noto
233        // Sans Display").
234        //
235        // So, instead we use Name ID 1 "Family" and trim many common
236        // suffixes for which know that they just describe styling (e.g.
237        // "ExtraBold").
238        let family =
239            exception.and_then(|c| c.family.map(str::to_string)).or_else(|| {
240                let family = find_name(ttf, name_id::FAMILY)?;
241                Some(typographic_family(&family).to_string())
242            })?;
243
244        let variant = {
245            let style = exception.and_then(|c| c.style).unwrap_or_else(|| {
246                let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default();
247                full.make_ascii_lowercase();
248
249                // Some fonts miss the relevant bits for italic or oblique, so
250                // we also try to infer that from the full name.
251                let italic = ttf.is_italic() || full.contains("italic");
252                let oblique = ttf.is_oblique()
253                    || full.contains("oblique")
254                    || full.contains("slanted");
255
256                match (italic, oblique) {
257                    (false, false) => FontStyle::Normal,
258                    (true, _) => FontStyle::Italic,
259                    (_, true) => FontStyle::Oblique,
260                }
261            });
262
263            let weight = exception.and_then(|c| c.weight).unwrap_or_else(|| {
264                let number = ttf.weight().to_number();
265                FontWeight::from_number(number)
266            });
267
268            let stretch = exception
269                .and_then(|c| c.stretch)
270                .unwrap_or_else(|| FontStretch::from_number(ttf.width().to_number()));
271
272            FontVariant { style, weight, stretch }
273        };
274
275        // Determine the unicode coverage.
276        let mut codepoints = vec![];
277        for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
278            if subtable.is_unicode() {
279                subtable.codepoints(|c| codepoints.push(c));
280            }
281        }
282
283        let mut flags = FontFlags::empty();
284        flags.set(FontFlags::MONOSPACE, ttf.is_monospaced());
285        flags.set(FontFlags::MATH, ttf.tables().math.is_some());
286        flags.set(FontFlags::VARIABLE, ttf.is_variable());
287
288        // Determine whether this is a serif or sans-serif font.
289        if let Some(panose) = ttf
290            .raw_face()
291            .table(Tag::from_bytes(b"OS/2"))
292            .and_then(|os2| os2.get(32..45))
293            && matches!(panose, [2, 2..=10, ..])
294        {
295            flags.insert(FontFlags::SERIF);
296        }
297
298        Some(FontInfo {
299            family,
300            variant,
301            flags,
302            coverage: Coverage::from_vec(codepoints),
303        })
304    }
305
306    /// Whether this is the macOS LastResort font. It can yield tofus with
307    /// glyph ID != 0.
308    pub fn is_last_resort(&self) -> bool {
309        self.family == "LastResort"
310    }
311}
312
313/// Try to find and decode the name with the given id.
314pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option<String> {
315    ttf.names().into_iter().find_map(|entry| {
316        if entry.name_id == name_id {
317            if let Some(string) = entry.to_string() {
318                return Some(string);
319            }
320
321            if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 {
322                return Some(decode_mac_roman(entry.name));
323            }
324        }
325
326        None
327    })
328}
329
/// Decode Mac Roman encoded bytes into a string.
///
/// Bytes below 128 are ASCII and map to themselves. Bytes from 128 upwards
/// are translated through a lookup table, so every input byte yields exactly
/// one character and decoding cannot fail.
fn decode_mac_roman(coded: &[u8]) -> String {
    // The characters for the bytes 0x80 through 0xFF, in order. The entries
    // for 0xDE and 0xDF are the single-codepoint ligatures U+FB01 and U+FB02;
    // they are written as escapes so that they cannot be mistaken for (or
    // normalized into) the two-letter sequences "fi" and "fl", which are not
    // valid `char` literals.
    #[rustfmt::skip]
    const TABLE: [char; 128] = [
        'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è',
        'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü',
        '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø',
        '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø',
        '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ',
        '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', '\u{fb01}', '\u{fb02}',
        '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô',
        '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ',
    ];

    /// Map a single Mac Roman byte to its character.
    fn char_from_mac_roman(code: u8) -> char {
        match code.checked_sub(128) {
            // Upper half: the table is indexed relative to 0x80.
            Some(index) => TABLE[usize::from(index)],
            // Lower half: plain ASCII.
            None => char::from(code),
        }
    }

    coded.iter().copied().map(char_from_mac_roman).collect()
}
350
/// Trim style naming from a family name and fix bad names.
///
/// Surrounding whitespace and leading dots are removed first. Then trailing
/// style words (optionally preceded by a modifier such as "semi" or "extra")
/// are stripped for as long as that makes the name shorter. Matching is ASCII
/// case-insensitive; the returned slice keeps the original casing.
fn typographic_family(family: &str) -> &str {
    // Separators between names, modifiers and styles.
    const SEPARATORS: [char; 3] = [' ', '-', '_'];

    // Modifiers that can appear in combination with suffixes.
    const MODIFIERS: &[&str] =
        &["extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra"];

    // Style suffixes.
    #[rustfmt::skip]
    const SUFFIXES: &[&str] = &[
        "normal", "italic", "oblique", "slanted",
        "thin", "th", "hairline", "light", "lt", "regular", "medium", "med",
        "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy",
        "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp"
    ];

    // Trim spacing and weird leading dots in Apple fonts.
    let family = family.trim().trim_start_matches('.');

    // Work on an ASCII-lowercased copy so that the suffixes match
    // case-insensitively. ASCII lowercasing keeps all byte offsets intact, so
    // lengths computed on the copy are valid for `family` as well.
    let lower = family.to_ascii_lowercase();

    // The part of the lowercased name that is kept so far.
    let mut kept = lower.as_str();

    loop {
        // Strip as many directly adjacent style suffixes as possible.
        let mut stem = kept;
        while let Some(shorter) = SUFFIXES.iter().find_map(|suffix| stem.strip_suffix(suffix)) {
            stem = shorter;
        }

        // All suffixes are non-empty, so equal lengths mean nothing matched.
        if stem.len() == kept.len() {
            break;
        }

        // The suffixes are only dropped if a separator precedes them.
        let mut next = kept;
        if let Some(before_separator) = stem.strip_suffix(SEPARATORS) {
            next = before_separator;
            stem = before_separator;
        }

        // Also allow an extra modifier, but apply it only if it is separated
        // from the text before it (to prevent false positives).
        let before_modifier = MODIFIERS
            .iter()
            .find_map(|modifier| stem.strip_suffix(modifier))
            .and_then(|rest| rest.strip_suffix(SEPARATORS));
        if let Some(rest) = before_modifier {
            next = rest;
        }

        // Stop once a round no longer shortens the name.
        if next.len() == kept.len() {
            break;
        }
        kept = next;
    }

    // Apply the trimming to the name in its original casing.
    &family[..kept.len()]
}
413
414/// How many words the two strings share in their prefix.
415fn shared_prefix_words(left: &str, right: &str) -> usize {
416    left.unicode_words()
417        .zip(right.unicode_words())
418        .take_while(|(l, r)| l == r)
419        .count()
420}
421
/// A compactly encoded set of codepoints.
///
/// The set is represented by alternating specifications of how many codepoints
/// are not in the set and how many are in the set. The first number always
/// describes codepoints that are not in the set, so it is zero if codepoint 0
/// is covered.
///
/// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are:
/// - 2 codepoints not inside (0, 1)
/// - 3 codepoints inside (2, 3, 4)
/// - 4 codepoints not inside (5, 6, 7, 8)
/// - 3 codepoints inside (9, 10, 11)
/// - 3 codepoints not inside (12, 13, 14)
/// - 1 codepoint inside (15)
/// - 2 codepoints not inside (16, 17)
/// - 2 codepoints inside (18, 19)
///
/// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`.
///
/// Serialized transparently as the list of run lengths.
#[derive(Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Coverage(Vec<u32>);
441
442impl Coverage {
443    /// Encode a vector of codepoints.
444    pub fn from_vec(mut codepoints: Vec<u32>) -> Self {
445        codepoints.sort();
446        codepoints.dedup();
447
448        let mut runs = Vec::new();
449        let mut next = 0;
450
451        for c in codepoints {
452            if let Some(run) = runs.last_mut().filter(|_| c == next) {
453                *run += 1;
454            } else {
455                runs.push(c - next);
456                runs.push(1);
457            }
458
459            next = c + 1;
460        }
461
462        Self(runs)
463    }
464
465    /// Whether the codepoint is covered.
466    pub fn contains(&self, c: u32) -> bool {
467        let mut inside = false;
468        let mut cursor = 0;
469
470        for &run in &self.0 {
471            if (cursor..cursor + run).contains(&c) {
472                return inside;
473            }
474            cursor += run;
475            inside = !inside;
476        }
477
478        false
479    }
480
481    /// Iterate over all covered codepoints.
482    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
483        let mut inside = false;
484        let mut cursor = 0;
485        self.0.iter().flat_map(move |run| {
486            let range = if inside { cursor..cursor + run } else { 0..0 };
487            inside = !inside;
488            cursor += run;
489            range
490        })
491    }
492}
493
494impl Debug for Coverage {
495    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
496        f.pad("Coverage(..)")
497    }
498}
499
#[cfg(test)]
mod tests {
    use super::*;

    /// Style suffixes and modifiers are trimmed from family names, while
    /// names without them are left alone.
    #[test]
    fn test_trim_styles() {
        // Pairs of (raw family name, expected typographic family).
        const CASES: &[(&str, &str)] = &[
            ("Atma Light", "Atma"),
            ("eras bold", "eras"),
            ("footlight mt light", "footlight mt"),
            ("times new roman", "times new roman"),
            ("noto sans mono cond sembd", "noto sans mono"),
            ("noto serif SEMCOND sembd", "noto serif"),
            ("crimson text", "crimson text"),
            ("footlight light", "footlight"),
            ("Noto Sans", "Noto Sans"),
            ("Noto Sans Light", "Noto Sans"),
            ("Noto Sans Semicondensed Heavy", "Noto Sans"),
            ("Familx", "Familx"),
            ("Font Ultra", "Font Ultra"),
            ("Font Ultra Bold", "Font"),
        ];

        for &(raw, expected) in CASES {
            assert_eq!(typographic_family(raw), expected);
        }
    }

    /// Coverage encodes to the expected runs and answers membership queries
    /// consistently with the original set.
    #[test]
    fn test_coverage() {
        #[track_caller]
        fn check(codepoints: &[u32], expected_runs: &[u32]) {
            let coverage = Coverage::from_vec(codepoints.to_vec());
            assert_eq!(coverage.0, expected_runs);

            // Also probe a few codepoints beyond the largest covered one.
            let largest = codepoints.iter().copied().max().unwrap_or_default();
            for c in 0..largest + 5 {
                assert_eq!(codepoints.contains(&c), coverage.contains(c));
            }
        }

        check(&[], &[]);
        check(&[0], &[0, 1]);
        check(&[1], &[1, 1]);
        check(&[0, 1], &[0, 2]);
        check(&[0, 1, 3], &[0, 2, 1, 1]);

        // {2, 3, 4, 9, 10, 11, 15, 18, 19}, unsorted and with a duplicate.
        check(
            &[18, 19, 2, 4, 9, 11, 15, 3, 3, 10],
            &[2, 3, 4, 3, 3, 1, 2, 2],
        );
    }

    /// Iterating a coverage yields the encoded codepoints in order.
    #[test]
    fn test_coverage_iter() {
        let expected = vec![2, 3, 7, 8, 9, 14, 15, 19, 21];
        let decoded: Vec<u32> = Coverage::from_vec(expected.clone()).iter().collect();
        assert_eq!(decoded, expected);
    }
}