typst_library/text/font/
book.rs

1use std::cmp::Reverse;
2use std::collections::BTreeMap;
3use std::fmt::{self, Debug, Formatter};
4
5use serde::{Deserialize, Serialize};
6use ttf_parser::{PlatformId, Tag, name_id};
7use unicode_segmentation::UnicodeSegmentation;
8
9use super::exceptions::find_exception;
10use crate::text::{
11    Font, FontStretch, FontStyle, FontVariant, FontWeight, is_default_ignorable,
12};
13
/// Metadata about a collection of fonts.
///
/// The book stores one [`FontInfo`] per font and additionally groups the
/// fonts by family so that all variants of a family can be found by name.
/// Fonts are identified by their index, i.e. the order in which they were
/// pushed.
#[derive(Debug, Default, Clone, Hash)]
pub struct FontBook {
    /// Maps from lowercased family names to font indices.
    ///
    /// Each index points into `infos`. Because a `BTreeMap` is used, the
    /// families are kept ordered by their lowercased name.
    families: BTreeMap<String, Vec<usize>>,
    /// Metadata about each font in the collection.
    infos: Vec<FontInfo>,
}
22
23impl FontBook {
24    /// Create a new, empty font book.
25    pub fn new() -> Self {
26        Self { families: BTreeMap::new(), infos: vec![] }
27    }
28
29    /// Create a font book from a collection of font infos.
30    pub fn from_infos(infos: impl IntoIterator<Item = FontInfo>) -> Self {
31        let mut book = Self::new();
32        for info in infos {
33            book.push(info);
34        }
35        book
36    }
37
38    /// Create a font book for a collection of fonts.
39    pub fn from_fonts<'a>(fonts: impl IntoIterator<Item = &'a Font>) -> Self {
40        Self::from_infos(fonts.into_iter().map(|font| font.info().clone()))
41    }
42
43    /// Insert metadata into the font book.
44    pub fn push(&mut self, info: FontInfo) {
45        let index = self.infos.len();
46        let family = info.family.to_lowercase();
47        self.families.entry(family).or_default().push(index);
48        self.infos.push(info);
49    }
50
51    /// Get the font info for the given index.
52    pub fn info(&self, index: usize) -> Option<&FontInfo> {
53        self.infos.get(index)
54    }
55
56    /// Returns true if the book contains a font family with the given name.
57    pub fn contains_family(&self, family: &str) -> bool {
58        self.families.contains_key(family)
59    }
60
61    /// An ordered iterator over all font families this book knows and details
62    /// about the fonts that are part of them.
63    pub fn families(
64        &self,
65    ) -> impl Iterator<Item = (&str, impl Iterator<Item = &FontInfo>)> + '_ {
66        // Since the keys are lowercased, we instead use the family field of the
67        // first face's info.
68        self.families.values().map(|ids| {
69            let family = self.infos[ids[0]].family.as_str();
70            let infos = ids.iter().map(|&id| &self.infos[id]);
71            (family, infos)
72        })
73    }
74
75    /// Try to find a font from the given `family` that matches the given
76    /// `variant` as closely as possible.
77    ///
78    /// The `family` should be all lowercase.
79    pub fn select(&self, family: &str, variant: FontVariant) -> Option<usize> {
80        let ids = self.families.get(family)?;
81        self.find_best_variant(None, variant, ids.iter().copied())
82    }
83
84    /// Iterate over all variants of a family.
85    pub fn select_family(&self, family: &str) -> impl Iterator<Item = usize> + '_ {
86        self.families
87            .get(family)
88            .map(|vec| vec.as_slice())
89            .unwrap_or_default()
90            .iter()
91            .copied()
92    }
93
94    /// Try to find and load a fallback font that
95    /// - is as close as possible to the font `like` (if any)
96    /// - is as close as possible to the given `variant`
97    /// - is suitable for shaping the given `text`
98    pub fn select_fallback(
99        &self,
100        like: Option<&FontInfo>,
101        variant: FontVariant,
102        text: &str,
103    ) -> Option<usize> {
104        // Find the fonts that contain the text's first non-space and
105        // non-ignorable char ...
106        let c = text
107            .chars()
108            .find(|&c| !c.is_whitespace() && !is_default_ignorable(c))?;
109
110        let ids = self
111            .infos
112            .iter()
113            .enumerate()
114            .filter(|(_, info)| info.coverage.contains(c as u32))
115            .map(|(index, _)| index);
116
117        // ... and find the best variant among them.
118        self.find_best_variant(like, variant, ids)
119    }
120
121    /// Find the font in the passed iterator that
122    /// - is closest to the font `like` (if any)
123    /// - is closest to the given `variant`
124    ///
125    /// To do that we compute a key for all variants and select the one with the
126    /// minimal key. This key prioritizes:
127    /// - If `like` is some other font:
128    ///   - Are both fonts (not) monospaced?
129    ///   - Do both fonts (not) have serifs?
130    ///   - How many words do the families share in their prefix? E.g. "Noto
131    ///     Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex
132    ///     Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic"
133    ///     if `like` is "Noto Sans". In case there are two equally good
134    ///     matches, we prefer the shorter one because it is less special (e.g.
135    ///     if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto
136    ///     Sans CJK HK".)
137    /// - The style (normal / italic / oblique). If we want italic or oblique
138    ///   but it doesn't exist, the other one of the two is still better than
139    ///   normal.
140    /// - The absolute distance to the target stretch.
141    /// - The absolute distance to the target weight.
142    fn find_best_variant(
143        &self,
144        like: Option<&FontInfo>,
145        variant: FontVariant,
146        ids: impl IntoIterator<Item = usize>,
147    ) -> Option<usize> {
148        let mut best = None;
149        let mut best_key = None;
150
151        for id in ids {
152            let current = &self.infos[id];
153            let key = (
154                like.map(|like| {
155                    (
156                        current.flags.contains(FontFlags::MONOSPACE)
157                            != like.flags.contains(FontFlags::MONOSPACE),
158                        current.flags.contains(FontFlags::SERIF)
159                            != like.flags.contains(FontFlags::SERIF),
160                        Reverse(shared_prefix_words(&current.family, &like.family)),
161                        current.family.len(),
162                    )
163                }),
164                current.variant.style.distance(variant.style),
165                current.variant.stretch.distance(variant.stretch),
166                current.variant.weight.distance(variant.weight),
167            );
168
169            if best_key.is_none_or(|b| key < b) {
170                best = Some(id);
171                best_key = Some(key);
172            }
173        }
174
175        best
176    }
177}
178
/// Properties of a single font.
///
/// This is the metadata a [`FontBook`] keeps per font. It can be computed
/// from raw font data with [`FontInfo::new`] or [`FontInfo::iter`].
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct FontInfo {
    /// The typographic font family this font is part of.
    pub family: String,
    /// Properties that distinguish this font from other fonts in the same
    /// family.
    pub variant: FontVariant,
    /// Properties of the font.
    pub flags: FontFlags,
    /// The unicode coverage of the font.
    pub coverage: Coverage,
}
192
bitflags::bitflags! {
    /// Bitflags describing characteristics of a font.
    ///
    /// Each flag occupies one bit of the underlying `u32`.
    #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
    #[derive(Serialize, Deserialize)]
    #[serde(transparent)]
    pub struct FontFlags: u32 {
        /// All glyphs have the same width.
        const MONOSPACE = 1 << 0;
        /// Glyphs have short strokes at their stems.
        const SERIF = 1 << 1;
        /// Font face has a MATH table.
        const MATH = 1 << 2;
        /// Font face has an fvar table.
        const VARIABLE = 1 << 3;
    }
}
209
210impl FontInfo {
211    /// Compute metadata for font at the `index` of the given data.
212    pub fn new(data: &[u8], index: u32) -> Option<Self> {
213        let ttf = ttf_parser::Face::parse(data, index).ok()?;
214        Self::from_ttf(&ttf)
215    }
216
217    /// Compute metadata for all fonts in the given data.
218    pub fn iter(data: &[u8]) -> impl Iterator<Item = FontInfo> + '_ {
219        let count = ttf_parser::fonts_in_collection(data).unwrap_or(1);
220        (0..count).filter_map(move |index| Self::new(data, index))
221    }
222
223    /// Compute metadata for a single ttf-parser face.
224    pub(super) fn from_ttf(ttf: &ttf_parser::Face) -> Option<Self> {
225        let ps_name = find_name(ttf, name_id::POST_SCRIPT_NAME);
226        let exception = ps_name.as_deref().and_then(find_exception);
227        // We cannot use Name ID 16 "Typographic Family", because for some
228        // fonts it groups together more than just Style / Weight / Stretch
229        // variants (e.g. Display variants of Noto fonts) and then some
230        // variants become inaccessible from Typst. And even though the
231        // fsSelection bit WWS should help us decide whether that is the
232        // case, it's wrong for some fonts (e.g. for certain variants of "Noto
233        // Sans Display").
234        //
235        // So, instead we use Name ID 1 "Family" and trim many common
236        // suffixes for which know that they just describe styling (e.g.
237        // "ExtraBold").
238        let family =
239            exception.and_then(|c| c.family.map(str::to_string)).or_else(|| {
240                let family = find_name(ttf, name_id::FAMILY)?;
241                Some(typographic_family(&family).to_string())
242            })?;
243
244        let variant = {
245            let style = exception.and_then(|c| c.style).unwrap_or_else(|| {
246                let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default();
247                full.make_ascii_lowercase();
248
249                // Some fonts miss the relevant bits for italic or oblique, so
250                // we also try to infer that from the full name.
251                //
252                // We do not use `ttf.is_italic()` because that also checks the
253                // italic angle which leads to false positives for some oblique
254                // fonts.
255                //
256                // See <https://github.com/typst/typst/issues/7479>.
257                let italic =
258                    ttf.style() == ttf_parser::Style::Italic || full.contains("italic");
259                let oblique = ttf.is_oblique()
260                    || full.contains("oblique")
261                    || full.contains("slanted");
262
263                match (italic, oblique) {
264                    (false, false) => FontStyle::Normal,
265                    (true, _) => FontStyle::Italic,
266                    (_, true) => FontStyle::Oblique,
267                }
268            });
269
270            let weight = exception.and_then(|c| c.weight).unwrap_or_else(|| {
271                let number = ttf.weight().to_number();
272                FontWeight::from_number(number)
273            });
274
275            let stretch = exception
276                .and_then(|c| c.stretch)
277                .unwrap_or_else(|| FontStretch::from_number(ttf.width().to_number()));
278
279            FontVariant { style, weight, stretch }
280        };
281
282        // Determine the unicode coverage.
283        let mut codepoints = vec![];
284        for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
285            if subtable.is_unicode() {
286                subtable.codepoints(|c| codepoints.push(c));
287            }
288        }
289
290        let mut flags = FontFlags::empty();
291        flags.set(FontFlags::MONOSPACE, ttf.is_monospaced());
292        flags.set(FontFlags::MATH, ttf.tables().math.is_some());
293        flags.set(FontFlags::VARIABLE, ttf.is_variable());
294
295        // Determine whether this is a serif or sans-serif font.
296        if let Some(panose) = ttf
297            .raw_face()
298            .table(Tag::from_bytes(b"OS/2"))
299            .and_then(|os2| os2.get(32..45))
300            && matches!(panose, [2, 2..=10, ..])
301        {
302            flags.insert(FontFlags::SERIF);
303        }
304
305        Some(FontInfo {
306            family,
307            variant,
308            flags,
309            coverage: Coverage::from_vec(codepoints),
310        })
311    }
312
313    /// Whether this is the macOS LastResort font. It can yield tofus with
314    /// glyph ID != 0.
315    pub fn is_last_resort(&self) -> bool {
316        self.family == "LastResort"
317    }
318}
319
320/// Try to find and decode the name with the given id.
321pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option<String> {
322    ttf.names().into_iter().find_map(|entry| {
323        if entry.name_id == name_id {
324            if let Some(string) = entry.to_string() {
325                return Some(string);
326            }
327
328            if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 {
329                return Some(decode_mac_roman(entry.name));
330            }
331        }
332
333        None
334    })
335}
336
/// Decode mac roman encoded bytes into a string.
///
/// Bytes below 128 are plain ASCII and map to themselves. Bytes from 128
/// upwards are looked up in the Mac OS Roman table below. Every byte maps to
/// exactly one `char`, so decoding cannot fail.
fn decode_mac_roman(coded: &[u8]) -> String {
    // The upper half (0x80..=0xFF) of Mac OS Roman, indexed by `byte - 128`.
    //
    // Characters that are invisible or prone to being mangled by text
    // tooling are written as escapes: the no-break space (0xCA), the "fi" and
    // "fl" ligatures U+FB01 / U+FB02 (0xDE / 0xDF) and the private-use Apple
    // logo (0xF0). In particular, the ligatures must each stay a single
    // `char`; spelling them as two ASCII letters is not a valid char literal.
    #[rustfmt::skip]
    const TABLE: [char; 128] = [
        'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è',
        'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü',
        '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø',
        '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø',
        '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ',
        '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', '\u{fb01}', '\u{fb02}',
        '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô',
        '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ',
    ];

    /// Map a single Mac Roman byte to its unicode character.
    fn char_from_mac_roman(code: u8) -> char {
        if code.is_ascii() {
            char::from(code)
        } else {
            // `code` is in 128..=255 here, so the index is in 0..=127.
            TABLE[usize::from(code - 128)]
        }
    }

    coded.iter().copied().map(char_from_mac_roman).collect()
}
357
/// Trim style naming from a family name and fix bad names.
///
/// Surrounding whitespace and leading dots are removed first. Then, trailing
/// style words (e.g. "Bold", "Cond") are cut off repeatedly, but only if a
/// separator sets them apart from the rest of the name. A style word may be
/// preceded by a modifier (e.g. "Semi", "Extra"), which is cut off as well if
/// it is itself separated from the text before it.
///
/// Matching is ASCII-case-insensitive; the returned slice keeps the original
/// casing.
fn typographic_family(family: &str) -> &str {
    // Separators between names, modifiers and styles.
    const SEPARATORS: [char; 3] = [' ', '-', '_'];

    // Modifiers that can appear in combination with suffixes.
    const MODIFIERS: &[&str] =
        &["extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra"];

    // Style suffixes.
    #[rustfmt::skip]
    const SUFFIXES: &[&str] = &[
        "normal", "italic", "oblique", "slanted",
        "thin", "th", "hairline", "light", "lt", "regular", "medium", "med",
        "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy",
        "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp"
    ];

    // Trim spacing and weird leading dots in Apple fonts.
    let family = family.trim().trim_start_matches('.');

    // Match on an ASCII-lowercased copy. ASCII lowercasing keeps all byte
    // offsets intact, so a prefix length found in the copy is also valid for
    // the original.
    let lower = family.to_ascii_lowercase();

    // The prefix of `lower` that survives the trimming so far.
    let mut kept = lower.as_str();

    loop {
        // Peel off as many directly adjacent style suffixes as possible.
        let mut rest = kept;
        while let Some(shorter) =
            SUFFIXES.iter().find_map(|suffix| rest.strip_suffix(suffix))
        {
            rest = shorter;
        }

        // No style suffix at the end: We are done.
        if rest.len() == kept.len() {
            break;
        }

        let before = kept.len();

        // The suffixes only count if a separator precedes them.
        if let Some(head) = rest.strip_suffix(SEPARATORS) {
            kept = head;
            rest = head;
        }

        // Also allow an extra modifier, but apply it only if it is separated
        // from the text before it (to prevent false positives).
        let without_modifier = MODIFIERS
            .iter()
            .find_map(|modifier| rest.strip_suffix(modifier))
            .and_then(|head| head.strip_suffix(SEPARATORS));
        if let Some(head) = without_modifier {
            kept = head;
        }

        // Nothing was actually trimmed in this round, so another round
        // wouldn't change anything either.
        if kept.len() == before {
            break;
        }
    }

    // Apply style suffix trimming to the original (cased) name.
    &family[..kept.len()]
}
420
421/// How many words the two strings share in their prefix.
422fn shared_prefix_words(left: &str, right: &str) -> usize {
423    left.unicode_words()
424        .zip(right.unicode_words())
425        .take_while(|(l, r)| l == r)
426        .count()
427}
428
/// A compactly encoded set of codepoints.
///
/// The set is represented by alternating specifications of how many codepoints
/// are not in the set and how many are in the set. The encoding always starts
/// with a count of codepoints that are *not* in the set (which is zero if
/// codepoint 0 is covered).
///
/// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are:
/// - 2 codepoints not inside (0, 1)
/// - 3 codepoints inside (2, 3, 4)
/// - 4 codepoints not inside (5, 6, 7, 8)
/// - 3 codepoints inside (9, 10, 11)
/// - 3 codepoints not inside (12, 13, 14)
/// - 1 codepoint inside (15)
/// - 2 codepoints not inside (16, 17)
/// - 2 codepoints inside (18, 19)
///
/// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`.
#[derive(Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Coverage(Vec<u32>);
448
449impl Coverage {
450    /// Encode a vector of codepoints.
451    pub fn from_vec(mut codepoints: Vec<u32>) -> Self {
452        codepoints.sort();
453        codepoints.dedup();
454
455        let mut runs = Vec::new();
456        let mut next = 0;
457
458        for c in codepoints {
459            if let Some(run) = runs.last_mut().filter(|_| c == next) {
460                *run += 1;
461            } else {
462                runs.push(c - next);
463                runs.push(1);
464            }
465
466            next = c + 1;
467        }
468
469        Self(runs)
470    }
471
472    /// Whether the codepoint is covered.
473    pub fn contains(&self, c: u32) -> bool {
474        let mut inside = false;
475        let mut cursor = 0;
476
477        for &run in &self.0 {
478            if (cursor..cursor + run).contains(&c) {
479                return inside;
480            }
481            cursor += run;
482            inside = !inside;
483        }
484
485        false
486    }
487
488    /// Iterate over all covered codepoints.
489    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
490        let mut inside = false;
491        let mut cursor = 0;
492        self.0.iter().flat_map(move |run| {
493            let range = if inside { cursor..cursor + run } else { 0..0 };
494            inside = !inside;
495            cursor += run;
496            range
497        })
498    }
499}
500
impl Debug for Coverage {
    /// Formats the coverage as the fixed placeholder `Coverage(..)` instead
    /// of printing the encoded runs.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // `pad` (rather than `write_str`) applies the formatter's
        // width/fill/alignment flags to the placeholder.
        f.pad("Coverage(..)")
    }
}
506
#[cfg(test)]
mod tests {
    use super::*;

    /// Style suffixes and modifiers are trimmed from family names, but only
    /// if they are separated from the actual name.
    #[test]
    fn test_trim_styles() {
        // Pairs of (family name as found in the font, expected family).
        let cases = [
            ("Atma Light", "Atma"),
            ("eras bold", "eras"),
            ("footlight mt light", "footlight mt"),
            ("times new roman", "times new roman"),
            ("noto sans mono cond sembd", "noto sans mono"),
            ("noto serif SEMCOND sembd", "noto serif"),
            ("crimson text", "crimson text"),
            ("footlight light", "footlight"),
            ("Noto Sans", "Noto Sans"),
            ("Noto Sans Light", "Noto Sans"),
            ("Noto Sans Semicondensed Heavy", "Noto Sans"),
            ("Familx", "Familx"),
            ("Font Ultra", "Font Ultra"),
            ("Font Ultra Bold", "Font"),
        ];

        for (input, expected) in cases {
            assert_eq!(typographic_family(input), expected, "input: {input:?}");
        }
    }

    /// Codepoint sets are encoded into the expected runs and membership
    /// queries agree with the original set.
    #[test]
    fn test_coverage() {
        #[track_caller]
        fn check(codepoints: &[u32], expected_runs: &[u32]) {
            let coverage = Coverage::from_vec(codepoints.to_vec());
            assert_eq!(coverage.0, expected_runs);

            // Also probe a few codepoints beyond the largest covered one.
            let upper = codepoints.iter().copied().max().unwrap_or_default() + 5;
            for c in 0..upper {
                assert_eq!(codepoints.contains(&c), coverage.contains(c));
            }
        }

        check(&[], &[]);
        check(&[0], &[0, 1]);
        check(&[1], &[1, 1]);
        check(&[0, 1], &[0, 2]);
        check(&[0, 1, 3], &[0, 2, 1, 1]);

        // Unsorted and with a duplicate; as a set this is
        // {2, 3, 4, 9, 10, 11, 15, 18, 19}.
        check(&[18, 19, 2, 4, 9, 11, 15, 3, 3, 10], &[2, 3, 4, 3, 3, 1, 2, 2]);
    }

    /// Iterating a coverage yields the covered codepoints in order.
    #[test]
    fn test_coverage_iter() {
        let codepoints = [2, 3, 7, 8, 9, 14, 15, 19, 21];
        let coverage = Coverage::from_vec(codepoints.to_vec());
        let covered: Vec<u32> = coverage.iter().collect();
        assert_eq!(covered, codepoints);
    }
}