Skip to main content

pdf_interpret/font/
mod.rs

1//! Interacting with the different kinds of PDF fonts.
2
3use crate::cache::Cache;
4use crate::context::Context;
5use crate::device::Device;
6use crate::font::cid::Type0Font;
7use crate::font::generated::{
8    glyph_names, mac_expert, mac_os_roman, mac_roman, standard, win_ansi,
9};
10use crate::font::true_type::TrueTypeFont;
11use crate::font::type1::Type1Font;
12use crate::font::type3::Type3;
13use crate::interpret::state::State;
14use crate::util::decode_or_warn;
15use crate::{CMapResolverFn, CacheKey, FontResolverFn, InterpreterSettings, Paint, WarningSinkFn};
16use bitflags::bitflags;
17use kurbo::{Affine, BezPath, Vec2};
18use log::warn;
19use outline::OutlineFont;
20use pdf_syntax::object::Name;
21use pdf_syntax::object::dict::keys::SUBTYPE;
22use pdf_syntax::object::dict::keys::*;
23use pdf_syntax::object::{Dict, Stream};
24use pdf_syntax::page::Resources;
25use pdf_syntax::xref::XRef;
26use skrifa::GlyphId;
27use std::fmt::Debug;
28use std::ops::Deref;
29use std::rc::Rc;
30use std::sync::Arc;
31
32mod blob;
33mod cid;
34mod generated;
35mod glyph_simulator;
36pub(crate) mod outline;
37mod standard_font;
38mod true_type;
39mod type1;
40pub(crate) mod type3;
41
42pub(crate) const UNITS_PER_EM: f32 = 1000.0;
43
44pub(crate) fn stretch_glyph(path: BezPath, expected_width: f32, actual_width: f32) -> BezPath {
45    if actual_width != 0.0 && actual_width != expected_width {
46        let stretch_factor = expected_width / actual_width;
47        Affine::scale_non_uniform(stretch_factor as f64, 1.0) * path
48    } else {
49        path
50    }
51}
52
/// A shared, thread-safe container for the raw bytes of a font.
///
/// Any value that can be viewed as a byte slice (`AsRef<[u8]>`) and is
/// `Send + Sync` can be wrapped in the `Arc`.
pub type FontData = Arc<dyn AsRef<[u8]> + Send + Sync>;
55
/// Remove the subset tag from a PostScript font name.
///
/// Subset fonts in PDF files are named like `ABCDEF+TimesNewRoman`: a
/// six-character tag, a `+`, then the real name. For such input the part
/// after the first `+` is returned; any other input is returned unchanged.
///
/// Only the *length* of the text in front of the first `+` is checked (six
/// bytes); the characters of the tag themselves are not validated.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    /// Byte length of a subset tag.
    const TAG_LEN: usize = 6;

    // The index of the first `+` equals the byte length of what precedes it.
    if name.find('+') == Some(TAG_LEN) {
        // `+` is a single byte, so `TAG_LEN + 1` is a valid char boundary.
        &name[TAG_LEN + 1..]
    } else {
        name
    }
}
67
68use crate::util::hash128;
69pub use outline::OutlineFontData;
70use pdf_font::cmap::{BfString, CMap, CMapName, CharacterCollection};
71pub use standard_font::StandardFont;
72
73/// A glyph that can be drawn.
74pub enum Glyph<'a> {
75    /// A glyph defined by an outline.
76    Outline(OutlineGlyph),
77    /// A type3 glyph, defined by PDF drawing instructions.
78    Type3(Box<Type3Glyph<'a>>),
79}
80
81impl Glyph<'_> {
82    /// Returns the Unicode code point for this glyph, if available.
83    ///
84    /// This method attempts to determine the Unicode character that this glyph
85    /// represents. The exact fallback chain depends on the font type:
86    ///
87    /// **For Outline Fonts (Type1, TrueType, CFF):**
88    /// 1. `ToUnicode` cmap
89    /// 2. Glyph name → Unicode (via Adobe Glyph List)
90    /// 3. Unicode naming conventions (e.g., "uni0041", "u0041")
91    ///
92    /// **For CID Fonts (Type0):**
93    /// 1. `ToUnicode` cmap
94    ///
95    ///
96    /// **For Type3 Fonts:**
97    /// 1. `ToUnicode` cmap
98    ///
99    /// Returns `None` if the Unicode value could not be determined.
100    ///
101    /// Please note that this method is still somewhat experimental and might
102    /// not work reliably in all cases.
103    pub fn as_unicode(&self) -> Option<BfString> {
104        match self {
105            Glyph::Outline(g) => g.as_unicode(),
106            Glyph::Type3(g) => g.as_unicode(),
107        }
108    }
109}
110
111/// An identifier that uniquely identifies a glyph, for caching purposes.
112#[derive(Clone, Debug)]
113pub struct GlyphIdentifier {
114    id: GlyphId,
115    font: OutlineFont,
116}
117
118impl CacheKey for GlyphIdentifier {
119    fn cache_key(&self) -> u128 {
120        hash128(&(self.id, self.font.cache_key()))
121    }
122}
123
124/// A glyph defined by an outline.
125#[derive(Clone, Debug)]
126pub struct OutlineGlyph {
127    pub(crate) id: GlyphId,
128    pub(crate) font: OutlineFont,
129    pub(crate) char_code: u32,
130}
131
132impl OutlineGlyph {
133    /// Return the outline of the glyph, assuming an upem value of 1000.
134    pub fn outline(&self) -> BezPath {
135        self.font.outline_glyph(self.id, self.char_code)
136    }
137
138    /// Return the identifier of the glyph. You can use this to calculate the cache key
139    /// for the glyph.
140    ///
141    /// Note that the `glyph_transform` attribute is not considered in the cache key of
142    /// the identifier, only the glyph ID and the font.
143    pub fn identifier(&self) -> GlyphIdentifier {
144        GlyphIdentifier {
145            id: self.id,
146            font: self.font.clone(),
147        }
148    }
149
150    /// Returns the Unicode code point for this glyph, if available.
151    ///
152    /// See [`Glyph::as_unicode`] for details on the fallback chain used.
153    pub fn as_unicode(&self) -> Option<BfString> {
154        self.font.char_code_to_unicode(self.char_code)
155    }
156
157    /// Get raw font bytes and metadata for downstream use.
158    ///
159    /// Returns `None` for Type1 fonts.
160    pub fn font_data(&self) -> Option<OutlineFontData> {
161        self.font.font_data()
162    }
163
164    /// Get the glyph ID within the font.
165    pub fn glyph_id(&self) -> GlyphId {
166        self.id
167    }
168
169    /// Get the advance width for this glyph.
170    ///
171    /// The advance width is how far to move horizontally after drawing
172    /// this glyph before drawing the next one.
173    pub fn advance_width(&self) -> Option<f32> {
174        self.font.glyph_advance_width(self.char_code)
175    }
176
177    /// Get the cache key for this glyph's font.
178    ///
179    /// This identifies the font uniquely, even when `font_data()` returns `None`
180    /// (e.g., for Type1 fonts). Useful for grouping glyphs by font.
181    pub fn font_cache_key(&self) -> u128 {
182        self.font.cache_key()
183    }
184}
185
186/// A type3 glyph.
187#[derive(Clone)]
188pub struct Type3Glyph<'a> {
189    pub(crate) font: Rc<Type3<'a>>,
190    pub(crate) glyph_id: GlyphId,
191    pub(crate) state: State<'a>,
192    pub(crate) parent_resources: Resources<'a>,
193    pub(crate) cache: Cache,
194    pub(crate) xref: &'a XRef,
195    pub(crate) settings: InterpreterSettings,
196    pub(crate) char_code: u32,
197}
198
199/// A glyph defined by PDF drawing instructions.
200impl<'a> Type3Glyph<'a> {
201    /// Draw the type3 glyph to the given device.
202    pub fn interpret(
203        &self,
204        device: &mut impl Device<'a>,
205        transform: Affine,
206        glyph_transform: Affine,
207        paint: &Paint<'a>,
208    ) {
209        self.font
210            .render_glyph(self, transform, glyph_transform, paint, device);
211    }
212
213    /// Returns the Unicode code point for this glyph, if available.
214    ///
215    /// Note: Type3 fonts can only provide Unicode via `ToUnicode` cmap.
216    pub fn as_unicode(&self) -> Option<BfString> {
217        self.font.char_code_to_unicode(self.char_code)
218    }
219}
220
221impl CacheKey for Type3Glyph<'_> {
222    fn cache_key(&self) -> u128 {
223        hash128(&(self.font.cache_key(), self.glyph_id))
224    }
225}
226
227#[derive(Clone, Debug)]
228pub(crate) struct Font<'a>(u128, FontType<'a>);
229
230impl<'a> Font<'a> {
231    pub(crate) fn new(
232        dict: &Dict<'a>,
233        font_resolver: &FontResolverFn,
234        cmap_resolver: &CMapResolverFn,
235        warning_sink: &WarningSinkFn,
236    ) -> Option<Self> {
237        let f_type = match dict.get::<Name>(SUBTYPE)?.deref() {
238            TYPE1 | MM_TYPE1 => FontType::Type1(Rc::new(Type1Font::new(
239                dict,
240                font_resolver,
241                cmap_resolver,
242                warning_sink,
243            )?)),
244            // PDFBOX-5463: PDF viewers seem to accept OpenType as well.
245            TRUE_TYPE | OPEN_TYPE => FontType::TrueType(Rc::new(TrueTypeFont::new(
246                dict,
247                font_resolver,
248                cmap_resolver,
249                warning_sink,
250            )?)),
251            TYPE0 => FontType::Type0(Rc::new(Type0Font::new(
252                dict,
253                font_resolver,
254                cmap_resolver,
255                warning_sink,
256            )?)),
257            TYPE3 => FontType::Type3(Rc::new(Type3::new(dict, cmap_resolver, warning_sink)?)),
258            f => {
259                warn!(
260                    "unimplemented font type {:?}",
261                    std::str::from_utf8(f).unwrap_or("unknown type")
262                );
263
264                return None;
265            }
266        };
267
268        let cache_key = dict.cache_key();
269
270        Some(Self(cache_key, f_type))
271    }
272
273    pub(crate) fn new_standard(
274        standard_font: StandardFont,
275        font_resolver: &FontResolverFn,
276    ) -> Option<Self> {
277        let font = Type1Font::new_standard(standard_font, font_resolver)?;
278
279        Some(Self(0, FontType::Type1(Rc::new(font))))
280    }
281
282    pub(crate) fn map_code(&self, code: u32) -> GlyphId {
283        match &self.1 {
284            FontType::Type1(f) => {
285                debug_assert!(code <= u8::MAX as u32);
286
287                f.map_code(code as u8)
288            }
289            FontType::TrueType(t) => {
290                debug_assert!(code <= u8::MAX as u32);
291
292                t.map_code(code as u8)
293            }
294            FontType::Type0(t) => t.map_code(code),
295            FontType::Type3(t) => {
296                debug_assert!(code <= u8::MAX as u32);
297
298                t.map_code(code as u8)
299            }
300        }
301    }
302
303    pub(crate) fn get_glyph(
304        &self,
305        glyph: GlyphId,
306        char_code: u32,
307        ctx: &mut Context<'a>,
308        resources: &Resources<'a>,
309        origin_displacement: Vec2,
310    ) -> (Glyph<'a>, Affine) {
311        let glyph_transform = ctx.get().text_state.full_transform()
312            * Affine::scale(1.0 / UNITS_PER_EM as f64)
313            * Affine::translate(origin_displacement);
314
315        let glyph = match &self.1 {
316            FontType::Type1(t) => {
317                let font = OutlineFont::Type1(t.clone());
318                Glyph::Outline(OutlineGlyph {
319                    id: glyph,
320                    font,
321                    char_code,
322                })
323            }
324            FontType::TrueType(t) => {
325                let font = OutlineFont::TrueType(t.clone());
326                Glyph::Outline(OutlineGlyph {
327                    id: glyph,
328                    font,
329                    char_code,
330                })
331            }
332            FontType::Type0(t) => {
333                let font = OutlineFont::Type0(t.clone());
334                Glyph::Outline(OutlineGlyph {
335                    id: glyph,
336                    font,
337                    char_code,
338                })
339            }
340            FontType::Type3(t) => {
341                let shape_glyph = Type3Glyph {
342                    font: t.clone(),
343                    glyph_id: glyph,
344                    state: ctx.get().clone(),
345                    parent_resources: resources.clone(),
346                    cache: ctx.object_cache.clone(),
347                    xref: ctx.xref,
348                    settings: ctx.settings.clone(),
349                    char_code,
350                };
351
352                Glyph::Type3(Box::new(shape_glyph))
353            }
354        };
355
356        (glyph, glyph_transform)
357    }
358
359    pub(crate) fn code_advance(&self, code: u32) -> Vec2 {
360        match &self.1 {
361            FontType::Type1(t) => {
362                debug_assert!(code <= u8::MAX as u32);
363
364                Vec2::new(t.glyph_width(code as u8).unwrap_or(0.0) as f64, 0.0)
365            }
366            FontType::TrueType(t) => {
367                debug_assert!(code <= u8::MAX as u32);
368
369                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
370            }
371            FontType::Type0(t) => t.code_advance(code),
372            FontType::Type3(t) => {
373                debug_assert!(code <= u8::MAX as u32);
374
375                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
376            }
377        }
378    }
379
380    pub(crate) fn origin_displacement(&self, code: u32) -> Vec2 {
381        match &self.1 {
382            FontType::Type1(_) => Vec2::default(),
383            FontType::TrueType(_) => Vec2::default(),
384            FontType::Type0(t) => t.origin_displacement(code),
385            FontType::Type3(_) => Vec2::default(),
386        }
387    }
388
389    pub(crate) fn read_code(&self, bytes: &[u8], offset: usize) -> (u32, usize) {
390        match &self.1 {
391            FontType::Type1(_) => (bytes[offset] as u32, 1),
392            FontType::TrueType(_) => (bytes[offset] as u32, 1),
393            FontType::Type0(t) => t.read_code(bytes, offset),
394            FontType::Type3(_) => (bytes[offset] as u32, 1),
395        }
396    }
397
398    pub(crate) fn is_horizontal(&self) -> bool {
399        match &self.1 {
400            FontType::Type1(_) => true,
401            FontType::TrueType(_) => true,
402            FontType::Type0(t) => t.is_horizontal(),
403            FontType::Type3(_) => true,
404        }
405    }
406}
407
408impl CacheKey for Font<'_> {
409    fn cache_key(&self) -> u128 {
410        self.0
411    }
412}
413
414#[derive(Clone, Debug)]
415enum FontType<'a> {
416    Type1(Rc<Type1Font>),
417    TrueType(Rc<TrueTypeFont>),
418    Type0(Rc<Type0Font>),
419    Type3(Rc<Type3<'a>>),
420}
421
422#[derive(Debug)]
423enum Encoding {
424    Standard,
425    MacRoman,
426    WinAnsi,
427    MacExpert,
428    BuiltIn,
429}
430
431impl Encoding {
432    fn map_code(&self, code: u8) -> Option<&'static str> {
433        if code == 0 {
434            return Some(".notdef");
435        }
436        match self {
437            Self::Standard => standard::get(code),
438            Self::MacRoman => mac_roman::get(code).or_else(|| mac_os_roman::get(code)),
439            Self::WinAnsi => win_ansi::get(code),
440            Self::MacExpert => mac_expert::get(code),
441            Self::BuiltIn => None,
442        }
443    }
444}
445
/// The stretch (width class) of a font.
#[derive(Debug, Copy, Clone)]
pub enum FontStretch {
    /// Normal.
    Normal,
    /// Ultra condensed.
    UltraCondensed,
    /// Extra condensed.
    ExtraCondensed,
    /// Condensed.
    Condensed,
    /// Semi condensed.
    SemiCondensed,
    /// Semi expanded.
    SemiExpanded,
    /// Expanded.
    Expanded,
    /// Extra expanded.
    ExtraExpanded,
    /// Ultra expanded.
    UltraExpanded,
}

impl FontStretch {
    /// Parse a stretch name such as `"SemiCondensed"`.
    ///
    /// Matching is exact and case-sensitive. Anything that is not one of the
    /// eight non-normal names, including `"Normal"` itself, yields
    /// [`FontStretch::Normal`].
    fn from_string(s: &str) -> Self {
        /// Every name that maps to something other than `Normal`.
        const NAMED: [(&str, FontStretch); 8] = [
            ("UltraCondensed", FontStretch::UltraCondensed),
            ("ExtraCondensed", FontStretch::ExtraCondensed),
            ("Condensed", FontStretch::Condensed),
            ("SemiCondensed", FontStretch::SemiCondensed),
            ("SemiExpanded", FontStretch::SemiExpanded),
            ("Expanded", FontStretch::Expanded),
            ("ExtraExpanded", FontStretch::ExtraExpanded),
            ("UltraExpanded", FontStretch::UltraExpanded),
        ];

        NAMED
            .iter()
            .find(|(label, _)| *label == s)
            .map_or(Self::Normal, |&(_, stretch)| stretch)
    }
}
484
485bitflags! {
486    /// Bitflags describing various characteristics of fonts.
487    #[derive(Debug)]
488    pub(crate) struct FontFlags: u32 {
489        const FIXED_PITCH = 1 << 0;
490        const SERIF = 1 << 1;
491        const SYMBOLIC = 1 << 2;
492        const SCRIPT = 1 << 3;
493        const NON_SYMBOLIC = 1 << 5;
494        const ITALIC = 1 << 6;
495        const ALL_CAP = 1 << 16;
496        const SMALL_CAP = 1 << 17;
497        const FORCE_BOLD = 1 << 18;
498    }
499}
500
501/// A query for a font.
502pub enum FontQuery {
503    /// A query for one of the 14 PDF standard fonts.
504    Standard(StandardFont),
505    /// A query for a font that is not embedded in the PDF file.
506    ///
507    /// Note that this type of query is currently not supported,
508    /// but will be implemented in the future.
509    Fallback(FallbackFontQuery),
510}
511
512/// A query for a font with specific properties.
513#[derive(Debug, Clone)]
514pub struct FallbackFontQuery {
515    /// The postscript name of the font.
516    pub post_script_name: Option<String>,
517    /// The name of the font.
518    pub font_name: Option<String>,
519    /// The family of the font.
520    pub font_family: Option<String>,
521    /// The stretch of the font.
522    pub font_stretch: FontStretch,
523    /// The weight of the font.
524    pub font_weight: u32,
525    /// Whether the font is monospaced.
526    pub is_fixed_pitch: bool,
527    /// Whether the font is serif.
528    pub is_serif: bool,
529    /// Whether the font is italic.
530    pub is_italic: bool,
531    /// Whether the font is bold.
532    pub is_bold: bool,
533    /// Whether the font is small cap.
534    pub is_small_cap: bool,
535    /// The character collection (registry/ordering) if this is a CID font.
536    pub character_collection: Option<CharacterCollection>,
537}
538
539impl FallbackFontQuery {
540    pub(crate) fn new(dict: &Dict<'_>) -> Self {
541        let post_script_name = dict
542            .get::<Name>(BASE_FONT)
543            .map(|n| strip_subset_prefix(n.as_str()).to_string());
544
545        let mut data = Self {
546            post_script_name,
547            ..Default::default()
548        };
549
550        if let Some(descriptor) = dict.get::<Dict<'_>>(FONT_DESC) {
551            data.font_name = dict
552                .get::<Name>(FONT_NAME)
553                .map(|n| strip_subset_prefix(n.as_str()).to_string());
554            data.font_family = descriptor
555                .get::<Name>(FONT_FAMILY)
556                .map(|n| n.as_str().to_string());
557            data.font_stretch = descriptor
558                .get::<Name>(FONT_STRETCH)
559                .map(|n| FontStretch::from_string(n.as_str()))
560                .unwrap_or(FontStretch::Normal);
561            data.font_weight = descriptor.get::<u32>(FONT_WEIGHT).unwrap_or(400);
562
563            if let Some(flags) = descriptor
564                .get::<u32>(FLAGS)
565                .map(FontFlags::from_bits_truncate)
566            {
567                data.is_fixed_pitch = flags.contains(FontFlags::FIXED_PITCH);
568                data.is_serif = flags.contains(FontFlags::SERIF);
569                data.is_italic = flags.contains(FontFlags::ITALIC);
570                data.is_small_cap = flags.contains(FontFlags::SMALL_CAP);
571            }
572        }
573
574        data.is_bold |= data.font_weight >= 700;
575
576        if let Some(name) = &data.post_script_name {
577            let lower = name.to_ascii_lowercase();
578            data.is_italic |=
579                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
580            data.is_bold |= lower.contains("bold")
581                || lower.contains("demi")
582                || lower.contains("semibold")
583                || lower.contains("heavy")
584                || lower.contains("black");
585        }
586
587        data
588    }
589
590    /// Do a best-effort fallback to the 14 standard fonts based on the query.
591    pub fn pick_standard_font(&self) -> StandardFont {
592        if self.is_fixed_pitch {
593            match (self.is_bold, self.is_italic) {
594                (true, true) => StandardFont::CourierBoldOblique,
595                (true, false) => StandardFont::CourierBold,
596                (false, true) => StandardFont::CourierOblique,
597                (false, false) => StandardFont::Courier,
598            }
599        } else if !self.is_serif {
600            match (self.is_bold, self.is_italic) {
601                (true, true) => StandardFont::HelveticaBoldOblique,
602                (true, false) => StandardFont::HelveticaBold,
603                (false, true) => StandardFont::HelveticaOblique,
604                (false, false) => StandardFont::Helvetica,
605            }
606        } else {
607            match (self.is_bold, self.is_italic) {
608                (true, true) => StandardFont::TimesBoldItalic,
609                (true, false) => StandardFont::TimesBold,
610                (false, true) => StandardFont::TimesItalic,
611                (false, false) => StandardFont::TimesRoman,
612            }
613        }
614    }
615}
616
617impl Default for FallbackFontQuery {
618    fn default() -> Self {
619        Self {
620            post_script_name: None,
621            font_name: None,
622            font_family: None,
623            font_stretch: FontStretch::Normal,
624            font_weight: 400,
625            is_fixed_pitch: false,
626            is_serif: false,
627            is_italic: false,
628            is_bold: false,
629            is_small_cap: false,
630            character_collection: None,
631        }
632    }
633}
634
635/// Convert a glyph name to a Unicode character, if possible.
636/// Implements the Adobe Glyph List Specification with extended fallbacks
637/// for non-standard names found in real-world PDFs.
638/// <https://github.com/adobe-type-tools/agl-specification>
639pub(crate) fn glyph_name_to_unicode(name: &str) -> Option<char> {
640    // 1. Direct AGL lookup
641    if let Some(unicode_str) = glyph_names::get(name) {
642        return unicode_str.chars().next();
643    }
644
645    // 2. uni/u prefix convention
646    if let Some(c) = unicode_from_name(name) {
647        return Some(c);
648    }
649
650    // 3. AGL §2.4: strip variant suffix after period (e.g., "A.swash" → "A",
651    //    "comma.alt" → "comma"). Only the base name before the first period
652    //    maps to Unicode.
653    if let Some(dot_pos) = name.find('.') {
654        let base = &name[..dot_pos];
655        if !base.is_empty() {
656            if let Some(c) = glyph_names::get(base).and_then(|s| s.chars().next()) {
657                return Some(c);
658            }
659            if let Some(c) = unicode_from_name(base) {
660                return Some(c);
661            }
662        }
663    }
664
665    // 4. Handle "aXX" decimal glyph names (e.g., "a65" → 'A') used by some
666    //    TeX/LaTeX generated PDFs and custom encoding vectors.
667    if name.starts_with('a')
668        && name.len() >= 2
669        && let Ok(code) = name[1..].parse::<u32>()
670        && let Some(c) = char::from_u32(code)
671        && (!c.is_control() || c == ' ')
672    {
673        return Some(c);
674    }
675
676    warn!("failed to map glyph name {} to unicode", name);
677    None
678}
679
680/// AGL §2.2: resolve a glyph name to one or more Unicode code points. Handles
681/// everything `glyph_name_to_unicode` does, plus underscore-joined ligature
682/// names (e.g. `f_i` → "fi", `f_f_i` → "ffi") which glyph-list-based fonts
683/// use for Latin ligatures. Returns the full string, so ligatures surface as
684/// their component code points rather than being dropped when the composite
685/// name isn't in the AGL.
686///
687/// Strips the variant suffix (`.ss01`, `.swash`, …) from the base name before
688/// splitting, matching AGL §2.4. Bails out if any component fails to resolve
689/// so we never produce a partial string (which would be worse than None).
690pub(crate) fn glyph_name_to_string(name: &str) -> Option<String> {
691    // Try the single-code-point fast path first — it also handles `.suffix`
692    // stripping, `uniXXXX`, and the `aNN` decimal convention.
693    if let Some(c) = glyph_name_to_unicode(name) {
694        return Some(c.to_string());
695    }
696
697    let base = name.split_once('.').map(|(b, _)| b).unwrap_or(name);
698    if !base.contains('_') {
699        return None;
700    }
701
702    let mut out = String::new();
703    for part in base.split('_') {
704        if part.is_empty() {
705            return None;
706        }
707        let c = glyph_name_to_unicode(part)?;
708        out.push(c);
709    }
710    (!out.is_empty()).then_some(out)
711}
712
/// Decode a `uniXXXX` / `uXXXX` style glyph name into a character.
///
/// The name must consist of the prefix `uni` or `u` followed by at least four
/// ASCII hexadecimal digits and nothing else. The digits are read as a single
/// code point; `None` is returned when they do not form a valid Unicode
/// scalar value (for example a surrogate or a value above U+10FFFF).
///
/// Requiring plain hex digits rejects signed input such as `u+0041`, which
/// `u32::from_str_radix` would otherwise accept. Requiring at least four
/// digits keeps short names that merely look like hex (e.g. `ua`) from being
/// turned into control characters.
pub(crate) fn unicode_from_name(name: &str) -> Option<char> {
    /// Fewest hex digits a `uni`/`u` glyph name may carry.
    const MIN_HEX_DIGITS: usize = 4;

    // `uni` has to be tried first: it also starts with `u`.
    let digits = name
        .strip_prefix("uni")
        .or_else(|| name.strip_prefix('u'))?;

    if digits.len() < MIN_HEX_DIGITS || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    u32::from_str_radix(digits, 16)
        .ok()
        .and_then(char::from_u32)
}
724
725pub(crate) fn read_to_unicode(
726    dict: &Dict<'_>,
727    cmap_resolver: &CMapResolverFn,
728    warning_sink: &WarningSinkFn,
729) -> Option<CMap> {
730    dict.get::<Stream<'_>>(TO_UNICODE)
731        .and_then(|s| decode_or_warn(&s, warning_sink))
732        // See PDFJS-11915, where `Identity-H` is used for `ToUnicode`. I don't
733        // believe it's valid, but at least mupdf seems to be able to deal with it.
734        .or_else(|| {
735            dict.get::<Name>(TO_UNICODE)
736                .and_then(|name| (cmap_resolver)(CMapName::from_bytes(name.as_ref())))
737                .map(|d| d.to_vec())
738        })
739        .and_then(|data| {
740            let cmap_resolver = cmap_resolver.clone();
741            CMap::parse(&data, move |name| (cmap_resolver)(name))
742        })
743}
744
745/// Build a 256-entry Unicode map from the font's `/Encoding` entry.
746/// Used as a fallback when no `/ToUnicode` CMap is present.
747pub(crate) fn synthesize_unicode_map_from_encoding(dict: &Dict<'_>) -> Option<[Option<char>; 256]> {
748    let (base_encoding, differences) = true_type::read_encoding(dict);
749    // Only synthesize if there is something to map (encoding name or differences).
750    // For BuiltIn with no differences we cannot reliably guess the mapping.
751    if matches!(base_encoding, Encoding::BuiltIn) && differences.is_empty() {
752        return None;
753    }
754    let mut table: [Option<char>; 256] = [None; 256];
755    for code in 0u8..=255 {
756        let glyph_name = differences
757            .get(&code)
758            .map(String::as_str)
759            .or_else(|| base_encoding.map_code(code));
760        if let Some(name) = glyph_name {
761            table[code as usize] = glyph_name_to_unicode(name);
762        }
763    }
764    Some(table)
765}
766
/// Replace a glyph-name alias by its "normalized" name.
///
/// Some fonts only contain a glyph under the normalized name. The aliases
/// come from the Adobe Glyph List (AGL) and are safe synonyms: they are
/// consulted only when the literal lookup already failed, so adding new
/// entries broadens recovery without overriding correct mappings.
///
/// `nbspace` and `nonbreakingspace` become `space`; `sfthyphen` and
/// `softhyphen` become `hyphen`. Every other name is returned unchanged.
pub(crate) fn normalized_glyph_name(name: &str) -> &str {
    match name {
        "nbspace" | "nonbreakingspace" => "space",
        "sfthyphen" | "softhyphen" => "hyphen",
        other => other,
    }
}
782
#[cfg(test)]
mod normalized_glyph_name_tests {
    use super::normalized_glyph_name;

    /// Asserts that every name in `inputs` normalizes to `expected`.
    fn assert_all_normalize_to(inputs: &[&str], expected: &str) {
        for input in inputs {
            assert_eq!(normalized_glyph_name(input), expected);
        }
    }

    #[test]
    fn maps_space_aliases() {
        assert_all_normalize_to(&["nbspace", "nonbreakingspace"], "space");
    }

    #[test]
    fn maps_hyphen_aliases() {
        assert_all_normalize_to(&["sfthyphen", "softhyphen"], "hyphen");
    }

    #[test]
    fn preserves_unrelated_names() {
        // Names that are not aliases must come back exactly as given.
        for name in ["A", ".notdef", "hyphen", "space"] {
            assert_eq!(normalized_glyph_name(name), name);
        }
    }
}
807
#[cfg(test)]
mod glyph_name_to_unicode_tests {
    use super::{glyph_name_to_string, glyph_name_to_unicode};

    /// Checks `glyph_name_to_unicode` against `(name, expected)` pairs.
    fn expect_chars(cases: &[(&str, Option<char>)]) {
        for &(name, expected) in cases {
            assert_eq!(glyph_name_to_unicode(name), expected);
        }
    }

    /// Checks that `glyph_name_to_string` yields `expected` for each name.
    fn expect_strings(cases: &[(&str, &str)]) {
        for &(name, expected) in cases {
            assert_eq!(glyph_name_to_string(name), Some(expected.to_string()));
        }
    }

    #[test]
    fn standard_agl_name() {
        expect_chars(&[
            ("A", Some('A')),
            ("space", Some(' ')),
            ("hyphen", Some('-')),
        ]);
    }

    #[test]
    fn ligature_name_underscore_joined() {
        // AGL maps `fi` directly (U+FB01); the underscore form is what
        // CFF fonts without a ToUnicode map tend to emit for the same
        // ligature. We want the decomposed string so readers see "fi".
        expect_strings(&[("f_i", "fi"), ("f_f_i", "ffi"), ("A_B_C", "ABC")]);
    }

    #[test]
    fn ligature_name_with_suffix() {
        // Variant suffix must be stripped before splitting on `_`.
        expect_strings(&[("f_i.alt", "fi")]);
    }

    #[test]
    fn ligature_name_falls_back_to_single_char_path() {
        // Direct AGL hit should still win and yield the precomposed char.
        expect_strings(&[("fi", "\u{FB01}"), ("A", "A")]);
    }

    #[test]
    fn ligature_name_rejects_unresolvable_component() {
        // If any segment is unknown we return None rather than a partial
        // string — a partial would be more confusing than a drop.
        for name in ["A_totallyUnknownGlyph", "_", "A__B"] {
            assert!(glyph_name_to_string(name).is_none());
        }
    }

    #[test]
    fn uni_prefix() {
        expect_chars(&[("uni0041", Some('A')), ("uni00E9", Some('é'))]);
    }

    #[test]
    fn u_prefix() {
        expect_chars(&[("u0041", Some('A')), ("u2022", Some('•'))]);
    }

    #[test]
    fn variant_suffix_stripped() {
        expect_chars(&[
            ("A.swash", Some('A')),
            ("comma.alt", Some(',')),
            ("space.narrow", Some(' ')),
        ]);
    }

    #[test]
    fn variant_suffix_with_uni_prefix() {
        expect_chars(&[("uni0041.ss01", Some('A'))]);
    }

    #[test]
    fn a_decimal_glyph_name() {
        expect_chars(&[
            ("a65", Some('A')),
            ("a32", Some(' ')),
            ("a97", Some('a')),
        ]);
    }

    #[test]
    fn a_decimal_rejects_control_chars() {
        expect_chars(&[("a0", None), ("a7", None)]);
    }

    #[test]
    fn unknown_name_returns_none() {
        expect_chars(&[("xyzzynonexistent", None)]);
    }
}
893
#[cfg(test)]
mod fallback_font_query_tests {
    use super::*;

    /// Build a query from a PostScript name, raw descriptor flags and a
    /// weight.
    ///
    /// This helper re-implements the flag, weight and name-keyword rules
    /// itself; it does not call `FallbackFontQuery::new`.
    fn query_with(name: &str, flags: u32, weight: u32) -> FallbackFontQuery {
        let font_flags = FontFlags::from_bits_truncate(flags);
        let lower = name.to_ascii_lowercase();
        let name_mentions = |keywords: &[&str]| keywords.iter().any(|kw| lower.contains(*kw));

        let italic_by_name = name_mentions(&["italic", "oblique", "slant"]);
        let bold_by_name = name_mentions(&["bold", "demi", "semibold", "heavy", "black"]);

        FallbackFontQuery {
            post_script_name: Some(name.to_string()),
            font_weight: weight,
            is_fixed_pitch: font_flags.contains(FontFlags::FIXED_PITCH),
            is_serif: font_flags.contains(FontFlags::SERIF),
            is_italic: font_flags.contains(FontFlags::ITALIC) || italic_by_name,
            is_bold: weight >= 700 || bold_by_name,
            is_small_cap: font_flags.contains(FontFlags::SMALL_CAP),
            ..Default::default()
        }
    }

    #[test]
    fn fixed_pitch_flag_selects_courier() {
        let query = query_with("LetterGothic", FontFlags::FIXED_PITCH.bits(), 400);
        assert!(matches!(query.pick_standard_font(), StandardFont::Courier));
    }

    #[test]
    fn demi_in_name_selects_bold() {
        let query = query_with("FranklinGothic-Demi", FontFlags::SERIF.bits(), 400);
        assert!(query.is_bold);
        assert!(matches!(
            query.pick_standard_font(),
            StandardFont::TimesBold
        ));
    }

    #[test]
    fn oblique_detected_as_italic() {
        let query = query_with("HelveticaNeue-LightOblique", 0, 400);
        assert!(query.is_italic);
        assert!(matches!(
            query.pick_standard_font(),
            StandardFont::HelveticaOblique
        ));
    }

    #[test]
    fn font_weight_700_detected_as_bold() {
        let query = query_with("CustomFont", 0, 700);
        assert!(query.is_bold);
        assert!(matches!(
            query.pick_standard_font(),
            StandardFont::HelveticaBold
        ));
    }

    #[test]
    fn semibold_detected_as_bold() {
        let query = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
        assert!(query.is_bold);
        assert!(matches!(
            query.pick_standard_font(),
            StandardFont::TimesBold
        ));
    }
}
961        let q = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
962        assert!(q.is_bold);
963        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
964    }
965}