typst_library/text/font/
book.rs

1use std::cmp::Reverse;
2use std::collections::BTreeMap;
3use std::fmt::{self, Debug, Formatter};
4
5use serde::{Deserialize, Serialize};
6use ttf_parser::{name_id, PlatformId, Tag};
7use unicode_segmentation::UnicodeSegmentation;
8
9use super::exceptions::find_exception;
10use crate::text::{Font, FontStretch, FontStyle, FontVariant, FontWeight};
11
12/// Metadata about a collection of fonts.
13#[derive(Debug, Default, Clone, Hash)]
14pub struct FontBook {
15    /// Maps from lowercased family names to font indices.
16    families: BTreeMap<String, Vec<usize>>,
17    /// Metadata about each font in the collection.
18    infos: Vec<FontInfo>,
19}
20
21impl FontBook {
22    /// Create a new, empty font book.
23    pub fn new() -> Self {
24        Self { families: BTreeMap::new(), infos: vec![] }
25    }
26
27    /// Create a font book from a collection of font infos.
28    pub fn from_infos(infos: impl IntoIterator<Item = FontInfo>) -> Self {
29        let mut book = Self::new();
30        for info in infos {
31            book.push(info);
32        }
33        book
34    }
35
36    /// Create a font book for a collection of fonts.
37    pub fn from_fonts<'a>(fonts: impl IntoIterator<Item = &'a Font>) -> Self {
38        Self::from_infos(fonts.into_iter().map(|font| font.info().clone()))
39    }
40
41    /// Insert metadata into the font book.
42    pub fn push(&mut self, info: FontInfo) {
43        let index = self.infos.len();
44        let family = info.family.to_lowercase();
45        self.families.entry(family).or_default().push(index);
46        self.infos.push(info);
47    }
48
49    /// Get the font info for the given index.
50    pub fn info(&self, index: usize) -> Option<&FontInfo> {
51        self.infos.get(index)
52    }
53
54    /// Returns true if the book contains a font family with the given name.
55    pub fn contains_family(&self, family: &str) -> bool {
56        self.families.contains_key(family)
57    }
58
59    /// An ordered iterator over all font families this book knows and details
60    /// about the fonts that are part of them.
61    pub fn families(
62        &self,
63    ) -> impl Iterator<Item = (&str, impl Iterator<Item = &FontInfo>)> + '_ {
64        // Since the keys are lowercased, we instead use the family field of the
65        // first face's info.
66        self.families.values().map(|ids| {
67            let family = self.infos[ids[0]].family.as_str();
68            let infos = ids.iter().map(|&id| &self.infos[id]);
69            (family, infos)
70        })
71    }
72
73    /// Try to find a font from the given `family` that matches the given
74    /// `variant` as closely as possible.
75    ///
76    /// The `family` should be all lowercase.
77    pub fn select(&self, family: &str, variant: FontVariant) -> Option<usize> {
78        let ids = self.families.get(family)?;
79        self.find_best_variant(None, variant, ids.iter().copied())
80    }
81
82    /// Iterate over all variants of a family.
83    pub fn select_family(&self, family: &str) -> impl Iterator<Item = usize> + '_ {
84        self.families
85            .get(family)
86            .map(|vec| vec.as_slice())
87            .unwrap_or_default()
88            .iter()
89            .copied()
90    }
91
92    /// Try to find and load a fallback font that
93    /// - is as close as possible to the font `like` (if any)
94    /// - is as close as possible to the given `variant`
95    /// - is suitable for shaping the given `text`
96    pub fn select_fallback(
97        &self,
98        like: Option<&FontInfo>,
99        variant: FontVariant,
100        text: &str,
101    ) -> Option<usize> {
102        // Find the fonts that contain the text's first non-space char ...
103        let c = text.chars().find(|c| !c.is_whitespace())?;
104        let ids = self
105            .infos
106            .iter()
107            .enumerate()
108            .filter(|(_, info)| info.coverage.contains(c as u32))
109            .map(|(index, _)| index);
110
111        // ... and find the best variant among them.
112        self.find_best_variant(like, variant, ids)
113    }
114
115    /// Find the font in the passed iterator that
116    /// - is closest to the font `like` (if any)
117    /// - is closest to the given `variant`
118    ///
119    /// To do that we compute a key for all variants and select the one with the
120    /// minimal key. This key prioritizes:
121    /// - If `like` is some other font:
122    ///   - Are both fonts (not) monospaced?
123    ///   - Do both fonts (not) have serifs?
124    ///   - How many words do the families share in their prefix? E.g. "Noto
125    ///     Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex
126    ///     Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic"
127    ///     if `like` is "Noto Sans". In case there are two equally good
128    ///     matches, we prefer the shorter one because it is less special (e.g.
129    ///     if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto
130    ///     Sans CJK HK".)
131    /// - The style (normal / italic / oblique). If we want italic or oblique
132    ///   but it doesn't exist, the other one of the two is still better than
133    ///   normal.
134    /// - The absolute distance to the target stretch.
135    /// - The absolute distance to the target weight.
136    fn find_best_variant(
137        &self,
138        like: Option<&FontInfo>,
139        variant: FontVariant,
140        ids: impl IntoIterator<Item = usize>,
141    ) -> Option<usize> {
142        let mut best = None;
143        let mut best_key = None;
144
145        for id in ids {
146            let current = &self.infos[id];
147            let key = (
148                like.map(|like| {
149                    (
150                        current.flags.contains(FontFlags::MONOSPACE)
151                            != like.flags.contains(FontFlags::MONOSPACE),
152                        current.flags.contains(FontFlags::SERIF)
153                            != like.flags.contains(FontFlags::SERIF),
154                        Reverse(shared_prefix_words(&current.family, &like.family)),
155                        current.family.len(),
156                    )
157                }),
158                current.variant.style.distance(variant.style),
159                current.variant.stretch.distance(variant.stretch),
160                current.variant.weight.distance(variant.weight),
161            );
162
163            if best_key.map_or(true, |b| key < b) {
164                best = Some(id);
165                best_key = Some(key);
166            }
167        }
168
169        best
170    }
171}
172
173/// Properties of a single font.
174#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
175pub struct FontInfo {
176    /// The typographic font family this font is part of.
177    pub family: String,
178    /// Properties that distinguish this font from other fonts in the same
179    /// family.
180    pub variant: FontVariant,
181    /// Properties of the font.
182    pub flags: FontFlags,
183    /// The unicode coverage of the font.
184    pub coverage: Coverage,
185}
186
187bitflags::bitflags! {
188    /// Bitflags describing characteristics of a font.
189    #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
190    #[derive(Serialize, Deserialize)]
191    #[serde(transparent)]
192    pub struct FontFlags: u32 {
193        /// All glyphs have the same width.
194        const MONOSPACE = 1 << 0;
195        /// Glyphs have short strokes at their stems.
196        const SERIF = 1 << 1;
197    }
198}
199
200impl FontInfo {
201    /// Compute metadata for font at the `index` of the given data.
202    pub fn new(data: &[u8], index: u32) -> Option<Self> {
203        let ttf = ttf_parser::Face::parse(data, index).ok()?;
204        Self::from_ttf(&ttf)
205    }
206
207    /// Compute metadata for all fonts in the given data.
208    pub fn iter(data: &[u8]) -> impl Iterator<Item = FontInfo> + '_ {
209        let count = ttf_parser::fonts_in_collection(data).unwrap_or(1);
210        (0..count).filter_map(move |index| Self::new(data, index))
211    }
212
213    /// Compute metadata for a single ttf-parser face.
214    pub(super) fn from_ttf(ttf: &ttf_parser::Face) -> Option<Self> {
215        let ps_name = find_name(ttf, name_id::POST_SCRIPT_NAME);
216        let exception = ps_name.as_deref().and_then(find_exception);
217        // We cannot use Name ID 16 "Typographic Family", because for some
218        // fonts it groups together more than just Style / Weight / Stretch
219        // variants (e.g. Display variants of Noto fonts) and then some
220        // variants become inaccessible from Typst. And even though the
221        // fsSelection bit WWS should help us decide whether that is the
222        // case, it's wrong for some fonts (e.g. for certain variants of "Noto
223        // Sans Display").
224        //
225        // So, instead we use Name ID 1 "Family" and trim many common
226        // suffixes for which know that they just describe styling (e.g.
227        // "ExtraBold").
228        let family =
229            exception.and_then(|c| c.family.map(str::to_string)).or_else(|| {
230                let family = find_name(ttf, name_id::FAMILY)?;
231                Some(typographic_family(&family).to_string())
232            })?;
233
234        let variant = {
235            let style = exception.and_then(|c| c.style).unwrap_or_else(|| {
236                let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default();
237                full.make_ascii_lowercase();
238
239                // Some fonts miss the relevant bits for italic or oblique, so
240                // we also try to infer that from the full name.
241                let italic = ttf.is_italic() || full.contains("italic");
242                let oblique = ttf.is_oblique()
243                    || full.contains("oblique")
244                    || full.contains("slanted");
245
246                match (italic, oblique) {
247                    (false, false) => FontStyle::Normal,
248                    (true, _) => FontStyle::Italic,
249                    (_, true) => FontStyle::Oblique,
250                }
251            });
252
253            let weight = exception.and_then(|c| c.weight).unwrap_or_else(|| {
254                let number = ttf.weight().to_number();
255                FontWeight::from_number(number)
256            });
257
258            let stretch = exception
259                .and_then(|c| c.stretch)
260                .unwrap_or_else(|| FontStretch::from_number(ttf.width().to_number()));
261
262            FontVariant { style, weight, stretch }
263        };
264
265        // Determine the unicode coverage.
266        let mut codepoints = vec![];
267        for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
268            if subtable.is_unicode() {
269                subtable.codepoints(|c| codepoints.push(c));
270            }
271        }
272
273        let mut flags = FontFlags::empty();
274        flags.set(FontFlags::MONOSPACE, ttf.is_monospaced());
275
276        // Determine whether this is a serif or sans-serif font.
277        if let Some(panose) = ttf
278            .raw_face()
279            .table(Tag::from_bytes(b"OS/2"))
280            .and_then(|os2| os2.get(32..45))
281        {
282            if matches!(panose, [2, 2..=10, ..]) {
283                flags.insert(FontFlags::SERIF);
284            }
285        }
286
287        Some(FontInfo {
288            family,
289            variant,
290            flags,
291            coverage: Coverage::from_vec(codepoints),
292        })
293    }
294
295    /// Whether this is the macOS LastResort font. It can yield tofus with
296    /// glyph ID != 0.
297    pub fn is_last_resort(&self) -> bool {
298        self.family == "LastResort"
299    }
300}
301
302/// Try to find and decode the name with the given id.
303pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option<String> {
304    ttf.names().into_iter().find_map(|entry| {
305        if entry.name_id == name_id {
306            if let Some(string) = entry.to_string() {
307                return Some(string);
308            }
309
310            if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 {
311                return Some(decode_mac_roman(entry.name));
312            }
313        }
314
315        None
316    })
317}
318
/// Decode Mac Roman encoded bytes into a string.
///
/// Bytes below 128 are ASCII and map to themselves; each byte from 128 to 255
/// is looked up in a table of its unicode equivalent. Every byte decodes to
/// exactly one `char`, so the result has `coded.len()` chars.
fn decode_mac_roman(coded: &[u8]) -> String {
    // The unicode equivalents of the Mac Roman bytes 0x80..=0xFF, sixteen per
    // row. The two ligatures (bytes 0xDE and 0xDF) are spelled as escapes so
    // that they cannot be mistaken for (or mangled into) the two-letter
    // sequences "fi" and "fl", which are not valid `char` literals.
    #[rustfmt::skip]
    const TABLE: [char; 128] = [
        'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è',
        'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü',
        '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø',
        '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø',
        '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ',
        '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', '\u{fb01}', '\u{fb02}',
        '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô',
        '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ',
    ];

    /// Map a single Mac Roman byte to its unicode char.
    fn char_from_mac_roman(code: u8) -> char {
        match code {
            // The lower half coincides with ASCII.
            0..=127 => char::from(code),
            // The upper half is table-driven; `code - 128` is in `0..128`.
            _ => TABLE[usize::from(code - 128)],
        }
    }

    coded.iter().copied().map(char_from_mac_roman).collect()
}
343
/// Reduce a family name to its typographic family by removing trailing style
/// descriptions (e.g. "Noto Sans Light" becomes "Noto Sans").
///
/// Surrounding whitespace and leading dots are removed as well. Style words
/// are matched ASCII-case-insensitively; the returned slice keeps the casing
/// of the input.
fn typographic_family(mut family: &str) -> &str {
    // Characters that may separate names, modifiers and styles.
    const SEPARATORS: [char; 3] = [' ', '-', '_'];

    // Words that can precede a style suffix to modify it.
    const MODIFIERS: &[&str] =
        &["extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra"];

    // Words that describe a style.
    #[rustfmt::skip]
    const SUFFIXES: &[&str] = &[
        "normal", "italic", "oblique", "slanted",
        "thin", "th", "hairline", "light", "lt", "regular", "medium", "med",
        "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy",
        "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp"
    ];

    // Remove spacing and the weird leading dots of Apple fonts.
    family = family.trim().trim_start_matches('.');

    // Matching happens on an ASCII-lowercased copy. ASCII lowercasing keeps
    // all byte offsets intact, so lengths of the copy are valid for `family`.
    let lower = family.to_ascii_lowercase();
    let mut kept = lower.as_str();

    // Each round removes one group of style words. Stop as soon as a round
    // does not shorten the name anymore.
    loop {
        let before = kept.len();

        // Peel off as many directly adjacent style suffixes as possible.
        let mut rest = kept;
        while let Some(shorter) =
            SUFFIXES.iter().find_map(|suffix| rest.strip_suffix(suffix))
        {
            rest = shorter;
        }

        // Without any style suffix there is nothing to trim.
        if rest.len() == before {
            break;
        }

        // The suffixes only count if a separator precedes them.
        if let Some(head) = rest.strip_suffix(SEPARATORS) {
            kept = head;
            rest = head;
        }

        // A modifier in front is removed as well, but only if it is itself
        // separated from the text before it (to prevent false positives).
        let without_modifier = MODIFIERS
            .iter()
            .find_map(|modifier| rest.strip_suffix(modifier))
            .and_then(|head| head.strip_suffix(SEPARATORS));
        if let Some(head) = without_modifier {
            kept = head;
        }

        if kept.len() == before {
            break;
        }
    }

    // Cut the original (not lowercased) name down to what was kept.
    &family[..kept.len()]
}
406
407/// How many words the two strings share in their prefix.
408fn shared_prefix_words(left: &str, right: &str) -> usize {
409    left.unicode_words()
410        .zip(right.unicode_words())
411        .take_while(|(l, r)| l == r)
412        .count()
413}
414
415/// A compactly encoded set of codepoints.
416///
417/// The set is represented by alternating specifications of how many codepoints
418/// are not in the set and how many are in the set.
419///
420/// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are:
421/// - 2 codepoints not inside (0, 1)
422/// - 3 codepoints inside (2, 3, 4)
423/// - 4 codepoints not inside (5, 6, 7, 8)
424/// - 3 codepoints inside (9, 10, 11)
425/// - 3 codepoints not inside (12, 13, 14)
426/// - 1 codepoint inside (15)
427/// - 2 codepoints not inside (16, 17)
428/// - 2 codepoints inside (18, 19)
429///
430/// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`.
431#[derive(Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
432#[serde(transparent)]
433pub struct Coverage(Vec<u32>);
434
435impl Coverage {
436    /// Encode a vector of codepoints.
437    pub fn from_vec(mut codepoints: Vec<u32>) -> Self {
438        codepoints.sort();
439        codepoints.dedup();
440
441        let mut runs = Vec::new();
442        let mut next = 0;
443
444        for c in codepoints {
445            if let Some(run) = runs.last_mut().filter(|_| c == next) {
446                *run += 1;
447            } else {
448                runs.push(c - next);
449                runs.push(1);
450            }
451
452            next = c + 1;
453        }
454
455        Self(runs)
456    }
457
458    /// Whether the codepoint is covered.
459    pub fn contains(&self, c: u32) -> bool {
460        let mut inside = false;
461        let mut cursor = 0;
462
463        for &run in &self.0 {
464            if (cursor..cursor + run).contains(&c) {
465                return inside;
466            }
467            cursor += run;
468            inside = !inside;
469        }
470
471        false
472    }
473
474    /// Iterate over all covered codepoints.
475    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
476        let mut inside = false;
477        let mut cursor = 0;
478        self.0.iter().flat_map(move |run| {
479            let range = if inside { cursor..cursor + run } else { 0..0 };
480            inside = !inside;
481            cursor += run;
482            range
483        })
484    }
485}
486
487impl Debug for Coverage {
488    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
489        f.pad("Coverage(..)")
490    }
491}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// Style words are trimmed from family names, other words are kept.
    #[test]
    fn test_trim_styles() {
        // Pairs of (input family name, expected typographic family).
        let cases = [
            ("Atma Light", "Atma"),
            ("eras bold", "eras"),
            ("footlight mt light", "footlight mt"),
            ("times new roman", "times new roman"),
            ("noto sans mono cond sembd", "noto sans mono"),
            ("noto serif SEMCOND sembd", "noto serif"),
            ("crimson text", "crimson text"),
            ("footlight light", "footlight"),
            ("Noto Sans", "Noto Sans"),
            ("Noto Sans Light", "Noto Sans"),
            ("Noto Sans Semicondensed Heavy", "Noto Sans"),
            ("Familx", "Familx"),
            ("Font Ultra", "Font Ultra"),
            ("Font Ultra Bold", "Font"),
        ];

        for &(input, expected) in &cases {
            assert_eq!(typographic_family(input), expected);
        }
    }

    /// Sets are encoded into the expected runs and membership queries agree
    /// with the original set.
    #[test]
    fn test_coverage() {
        #[track_caller]
        fn check(codepoints: &[u32], expected_runs: &[u32]) {
            let coverage = Coverage::from_vec(codepoints.to_vec());
            assert_eq!(coverage.0, expected_runs);

            // Also probe a few codepoints beyond the largest member.
            let limit = codepoints.iter().copied().max().unwrap_or_default() + 5;
            for c in 0..limit {
                assert_eq!(codepoints.contains(&c), coverage.contains(c));
            }
        }

        check(&[], &[]);
        check(&[0], &[0, 1]);
        check(&[1], &[1, 1]);
        check(&[0, 1], &[0, 2]);
        check(&[0, 1, 3], &[0, 2, 1, 1]);
        // Unsorted and with a duplicate; the set is
        // {2, 3, 4, 9, 10, 11, 15, 18, 19}.
        check(
            &[18, 19, 2, 4, 9, 11, 15, 3, 3, 10],
            &[2, 3, 4, 3, 3, 1, 2, 2],
        );
    }

    /// Iterating a coverage yields exactly the encoded codepoints in order.
    #[test]
    fn test_coverage_iter() {
        let expected = vec![2, 3, 7, 8, 9, 14, 15, 19, 21];
        let actual: Vec<u32> = Coverage::from_vec(expected.clone()).iter().collect();
        assert_eq!(actual, expected);
    }
}