Skip to main content

pdf_interpret/font/
mod.rs

1//! Interacting with the different kinds of PDF fonts.
2
3use crate::cache::Cache;
4use crate::context::Context;
5use crate::device::Device;
6use crate::font::cid::Type0Font;
7use crate::font::generated::{
8    glyph_names, mac_expert, mac_os_roman, mac_roman, standard, win_ansi,
9};
10use crate::font::true_type::TrueTypeFont;
11use crate::font::type1::Type1Font;
12use crate::font::type3::Type3;
13use crate::interpret::state::State;
14use crate::{CMapResolverFn, CacheKey, FontResolverFn, InterpreterSettings, Paint};
15use bitflags::bitflags;
16use kurbo::{Affine, BezPath, Vec2};
17use log::warn;
18use outline::OutlineFont;
19use pdf_syntax::object::Name;
20use pdf_syntax::object::dict::keys::SUBTYPE;
21use pdf_syntax::object::dict::keys::*;
22use pdf_syntax::object::{Dict, Stream};
23use pdf_syntax::page::Resources;
24use pdf_syntax::xref::XRef;
25use skrifa::GlyphId;
26use std::fmt::Debug;
27use std::ops::Deref;
28use std::rc::Rc;
29use std::sync::Arc;
30
31mod blob;
32mod cid;
33mod generated;
34mod glyph_simulator;
35pub(crate) mod outline;
36mod standard_font;
37mod true_type;
38mod type1;
39pub(crate) mod type3;
40
/// Units-per-em value that glyph coordinates are expressed in within this
/// module; `Font::get_glyph` scales by its reciprocal when building the
/// glyph transform.
pub(crate) const UNITS_PER_EM: f32 = 1000.0;
42
43pub(crate) fn stretch_glyph(path: BezPath, expected_width: f32, actual_width: f32) -> BezPath {
44    if actual_width != 0.0 && actual_width != expected_width {
45        let stretch_factor = expected_width / actual_width;
46        Affine::scale_non_uniform(stretch_factor as f64, 1.0) * path
47    } else {
48        path
49    }
50}
51
/// A shared, thread-safe container for raw font bytes.
pub type FontData = Arc<dyn AsRef<[u8]> + Send + Sync>;
54
/// Remove the subset tag from a PostScript font name.
///
/// Subset fonts embedded in PDFs are named like `ABCDEF+TimesNewRoman`: a
/// six-byte tag, a `+`, then the real name. When the first `+` sits right
/// after six bytes, everything following it is returned; in every other case
/// the input is returned unchanged. The tag's contents are not inspected.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    match name.find('+') {
        // `+` is a single byte, so the real name starts at offset 7.
        Some(6) => &name[7..],
        _ => name,
    }
}
66
67use crate::util::hash128;
68pub use outline::OutlineFontData;
69use pdf_font::cmap::{BfString, CMap, CMapName, CharacterCollection};
70pub use standard_font::StandardFont;
71
/// A glyph that can be drawn.
///
/// `Font::get_glyph` produces the `Outline` variant for Type1, TrueType and
/// Type0 fonts and the `Type3` variant for Type3 fonts.
pub enum Glyph<'a> {
    /// A glyph defined by an outline.
    Outline(OutlineGlyph),
    /// A type3 glyph, defined by PDF drawing instructions.
    ///
    /// Boxed, so the enum stores only a pointer for this variant.
    Type3(Box<Type3Glyph<'a>>),
}
79
impl Glyph<'_> {
    /// Returns the Unicode code point for this glyph, if available.
    ///
    /// This method attempts to determine the Unicode character that this glyph
    /// represents. The call is forwarded to the wrapped glyph; the exact
    /// fallback chain depends on the font type:
    ///
    /// **For Outline Fonts (Type1, TrueType, CFF):**
    /// 1. `ToUnicode` cmap
    /// 2. Glyph name → Unicode (via Adobe Glyph List)
    /// 3. Unicode naming conventions (e.g., "uni0041", "u0041")
    ///
    /// **For CID Fonts (Type0):**
    /// 1. `ToUnicode` cmap
    ///
    /// **For Type3 Fonts:**
    /// 1. `ToUnicode` cmap
    ///
    /// Returns `None` if the Unicode value could not be determined.
    ///
    /// Please note that this method is still somewhat experimental and might
    /// not work reliably in all cases.
    pub fn as_unicode(&self) -> Option<BfString> {
        match self {
            Glyph::Outline(g) => g.as_unicode(),
            Glyph::Type3(g) => g.as_unicode(),
        }
    }
}
109
/// An identifier that uniquely identifies a glyph, for caching purposes.
///
/// It is made of the glyph ID and the font the glyph belongs to; obtain one
/// through [`OutlineGlyph::identifier`].
#[derive(Clone, Debug)]
pub struct GlyphIdentifier {
    // Glyph ID within `font`.
    id: GlyphId,
    // The outline font the glyph comes from.
    font: OutlineFont,
}
116
impl CacheKey for GlyphIdentifier {
    /// Hashes the glyph ID together with the font's own cache key.
    fn cache_key(&self) -> u128 {
        hash128(&(self.id, self.font.cache_key()))
    }
}
122
/// A glyph defined by an outline.
#[derive(Clone, Debug)]
pub struct OutlineGlyph {
    // Glyph ID within `font`.
    pub(crate) id: GlyphId,
    // The font that provides the outline.
    pub(crate) font: OutlineFont,
    // The character code this glyph was requested for; used for the outline,
    // Unicode and advance-width lookups below.
    pub(crate) char_code: u32,
}
130
impl OutlineGlyph {
    /// Return the outline of the glyph, assuming an upem value of 1000.
    pub fn outline(&self) -> BezPath {
        self.font.outline_glyph(self.id, self.char_code)
    }

    /// Return the identifier of the glyph. You can use this to calculate the cache key
    /// for the glyph.
    ///
    /// Note that only the glyph ID and the font are part of the identifier.
    /// Neither the character code nor any transform applied to the glyph
    /// when it is drawn is considered in its cache key.
    pub fn identifier(&self) -> GlyphIdentifier {
        GlyphIdentifier {
            id: self.id,
            font: self.font.clone(),
        }
    }

    /// Returns the Unicode code point for this glyph, if available.
    ///
    /// The lookup is keyed by the glyph's character code.
    /// See [`Glyph::as_unicode`] for details on the fallback chain used.
    pub fn as_unicode(&self) -> Option<BfString> {
        self.font.char_code_to_unicode(self.char_code)
    }

    /// Get raw font bytes and metadata for downstream use.
    ///
    /// Returns `None` for Type1 fonts.
    pub fn font_data(&self) -> Option<OutlineFontData> {
        self.font.font_data()
    }

    /// Get the glyph ID within the font.
    pub fn glyph_id(&self) -> GlyphId {
        self.id
    }

    /// Get the advance width for this glyph.
    ///
    /// The advance width is how far to move horizontally after drawing
    /// this glyph before drawing the next one. It is looked up by the
    /// glyph's character code, not by its glyph ID.
    pub fn advance_width(&self) -> Option<f32> {
        self.font.glyph_advance_width(self.char_code)
    }

    /// Get the cache key for this glyph's font.
    ///
    /// This identifies the font uniquely, even when `font_data()` returns `None`
    /// (e.g., for Type1 fonts). Useful for grouping glyphs by font.
    pub fn font_cache_key(&self) -> u128 {
        self.font.cache_key()
    }
}
184
/// A type3 glyph.
///
/// Besides the font and glyph ID it carries the interpreter context captured
/// by `Font::get_glyph` when the glyph was created.
#[derive(Clone)]
pub struct Type3Glyph<'a> {
    // The Type3 font that renders this glyph.
    pub(crate) font: Rc<Type3<'a>>,
    // Glyph ID within `font`.
    pub(crate) glyph_id: GlyphId,
    // Clone of the interpreter state at creation time.
    pub(crate) state: State<'a>,
    // Clone of the resources passed to `Font::get_glyph`.
    pub(crate) parent_resources: Resources<'a>,
    // Clone of the context's object cache.
    pub(crate) cache: Cache,
    // Cross-reference table of the context.
    pub(crate) xref: &'a XRef,
    // Clone of the interpreter settings of the context.
    pub(crate) settings: InterpreterSettings,
    // The character code this glyph was requested for.
    pub(crate) char_code: u32,
}
197
/// A glyph defined by PDF drawing instructions.
impl<'a> Type3Glyph<'a> {
    /// Draw the type3 glyph to the given device.
    ///
    /// The call is forwarded to the owning Type3 font's `render_glyph`,
    /// together with this glyph, both transforms and the paint.
    // NOTE(review): the exact roles of `transform` versus `glyph_transform`
    // are defined by `Type3::render_glyph`, which is not visible here.
    pub fn interpret(
        &self,
        device: &mut impl Device<'a>,
        transform: Affine,
        glyph_transform: Affine,
        paint: &Paint<'a>,
    ) {
        self.font
            .render_glyph(self, transform, glyph_transform, paint, device);
    }

    /// Returns the Unicode code point for this glyph, if available.
    ///
    /// Note: Type3 fonts can only provide Unicode via `ToUnicode` cmap.
    pub fn as_unicode(&self) -> Option<BfString> {
        self.font.char_code_to_unicode(self.char_code)
    }
}
219
impl CacheKey for Type3Glyph<'_> {
    /// Hashes the font's cache key together with the glyph ID. The captured
    /// interpreter state, resources and settings are not part of the key.
    fn cache_key(&self) -> u128 {
        hash128(&(self.font.cache_key(), self.glyph_id))
    }
}
225
/// A PDF font of any supported kind.
///
/// Field `0` is the cache key returned by the `CacheKey` impl; field `1` is
/// the concrete font implementation.
#[derive(Clone, Debug)]
pub(crate) struct Font<'a>(u128, FontType<'a>);
228
229impl<'a> Font<'a> {
230    pub(crate) fn new(
231        dict: &Dict<'a>,
232        font_resolver: &FontResolverFn,
233        cmap_resolver: &CMapResolverFn,
234    ) -> Option<Self> {
235        let f_type = match dict.get::<Name>(SUBTYPE)?.deref() {
236            TYPE1 | MM_TYPE1 => {
237                FontType::Type1(Rc::new(Type1Font::new(dict, font_resolver, cmap_resolver)?))
238            }
239            // PDFBOX-5463: PDF viewers seem to accept OpenType as well.
240            TRUE_TYPE | OPEN_TYPE => FontType::TrueType(Rc::new(TrueTypeFont::new(
241                dict,
242                font_resolver,
243                cmap_resolver,
244            )?)),
245            TYPE0 => FontType::Type0(Rc::new(Type0Font::new(dict, font_resolver, cmap_resolver)?)),
246            TYPE3 => FontType::Type3(Rc::new(Type3::new(dict, cmap_resolver)?)),
247            f => {
248                warn!(
249                    "unimplemented font type {:?}",
250                    std::str::from_utf8(f).unwrap_or("unknown type")
251                );
252
253                return None;
254            }
255        };
256
257        let cache_key = dict.cache_key();
258
259        Some(Self(cache_key, f_type))
260    }
261
262    pub(crate) fn new_standard(
263        standard_font: StandardFont,
264        font_resolver: &FontResolverFn,
265    ) -> Option<Self> {
266        let font = Type1Font::new_standard(standard_font, font_resolver)?;
267
268        Some(Self(0, FontType::Type1(Rc::new(font))))
269    }
270
271    pub(crate) fn map_code(&self, code: u32) -> GlyphId {
272        match &self.1 {
273            FontType::Type1(f) => {
274                debug_assert!(code <= u8::MAX as u32);
275
276                f.map_code(code as u8)
277            }
278            FontType::TrueType(t) => {
279                debug_assert!(code <= u8::MAX as u32);
280
281                t.map_code(code as u8)
282            }
283            FontType::Type0(t) => t.map_code(code),
284            FontType::Type3(t) => {
285                debug_assert!(code <= u8::MAX as u32);
286
287                t.map_code(code as u8)
288            }
289        }
290    }
291
292    pub(crate) fn get_glyph(
293        &self,
294        glyph: GlyphId,
295        char_code: u32,
296        ctx: &mut Context<'a>,
297        resources: &Resources<'a>,
298        origin_displacement: Vec2,
299    ) -> (Glyph<'a>, Affine) {
300        let glyph_transform = ctx.get().text_state.full_transform()
301            * Affine::scale(1.0 / UNITS_PER_EM as f64)
302            * Affine::translate(origin_displacement);
303
304        let glyph = match &self.1 {
305            FontType::Type1(t) => {
306                let font = OutlineFont::Type1(t.clone());
307                Glyph::Outline(OutlineGlyph {
308                    id: glyph,
309                    font,
310                    char_code,
311                })
312            }
313            FontType::TrueType(t) => {
314                let font = OutlineFont::TrueType(t.clone());
315                Glyph::Outline(OutlineGlyph {
316                    id: glyph,
317                    font,
318                    char_code,
319                })
320            }
321            FontType::Type0(t) => {
322                let font = OutlineFont::Type0(t.clone());
323                Glyph::Outline(OutlineGlyph {
324                    id: glyph,
325                    font,
326                    char_code,
327                })
328            }
329            FontType::Type3(t) => {
330                let shape_glyph = Type3Glyph {
331                    font: t.clone(),
332                    glyph_id: glyph,
333                    state: ctx.get().clone(),
334                    parent_resources: resources.clone(),
335                    cache: ctx.object_cache.clone(),
336                    xref: ctx.xref,
337                    settings: ctx.settings.clone(),
338                    char_code,
339                };
340
341                Glyph::Type3(Box::new(shape_glyph))
342            }
343        };
344
345        (glyph, glyph_transform)
346    }
347
348    pub(crate) fn code_advance(&self, code: u32) -> Vec2 {
349        match &self.1 {
350            FontType::Type1(t) => {
351                debug_assert!(code <= u8::MAX as u32);
352
353                Vec2::new(t.glyph_width(code as u8).unwrap_or(0.0) as f64, 0.0)
354            }
355            FontType::TrueType(t) => {
356                debug_assert!(code <= u8::MAX as u32);
357
358                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
359            }
360            FontType::Type0(t) => t.code_advance(code),
361            FontType::Type3(t) => {
362                debug_assert!(code <= u8::MAX as u32);
363
364                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
365            }
366        }
367    }
368
369    pub(crate) fn origin_displacement(&self, code: u32) -> Vec2 {
370        match &self.1 {
371            FontType::Type1(_) => Vec2::default(),
372            FontType::TrueType(_) => Vec2::default(),
373            FontType::Type0(t) => t.origin_displacement(code),
374            FontType::Type3(_) => Vec2::default(),
375        }
376    }
377
378    pub(crate) fn read_code(&self, bytes: &[u8], offset: usize) -> (u32, usize) {
379        match &self.1 {
380            FontType::Type1(_) => (bytes[offset] as u32, 1),
381            FontType::TrueType(_) => (bytes[offset] as u32, 1),
382            FontType::Type0(t) => t.read_code(bytes, offset),
383            FontType::Type3(_) => (bytes[offset] as u32, 1),
384        }
385    }
386
387    pub(crate) fn is_horizontal(&self) -> bool {
388        match &self.1 {
389            FontType::Type1(_) => true,
390            FontType::TrueType(_) => true,
391            FontType::Type0(t) => t.is_horizontal(),
392            FontType::Type3(_) => true,
393        }
394    }
395}
396
impl CacheKey for Font<'_> {
    /// Returns the key stored at construction time: the font dictionary's
    /// cache key for `Font::new`, or `0` for `Font::new_standard`.
    fn cache_key(&self) -> u128 {
        self.0
    }
}
402
/// The concrete implementation behind a [`Font`].
///
/// Every variant holds its font behind an `Rc`, so cloning a `FontType` only
/// bumps a reference count.
#[derive(Clone, Debug)]
enum FontType<'a> {
    // Built for the `TYPE1` and `MM_TYPE1` subtypes, and for standard fonts.
    Type1(Rc<Type1Font>),
    // Built for the `TRUE_TYPE` and `OPEN_TYPE` subtypes.
    TrueType(Rc<TrueTypeFont>),
    // Built for the `TYPE0` subtype.
    Type0(Rc<Type0Font>),
    // Built for the `TYPE3` subtype.
    Type3(Rc<Type3<'a>>),
}
410
/// A base encoding that maps single-byte character codes to glyph names.
#[derive(Debug)]
enum Encoding {
    // Looked up in the generated `standard` table.
    Standard,
    // Looked up in `mac_roman`, then `mac_os_roman` as a fallback.
    MacRoman,
    // Looked up in the generated `win_ansi` table.
    WinAnsi,
    // Looked up in the generated `mac_expert` table.
    MacExpert,
    // No table here: `map_code` yields `None` for every non-zero code.
    BuiltIn,
}
419
420impl Encoding {
421    fn map_code(&self, code: u8) -> Option<&'static str> {
422        if code == 0 {
423            return Some(".notdef");
424        }
425        match self {
426            Self::Standard => standard::get(code),
427            Self::MacRoman => mac_roman::get(code).or_else(|| mac_os_roman::get(code)),
428            Self::WinAnsi => win_ansi::get(code),
429            Self::MacExpert => mac_expert::get(code),
430            Self::BuiltIn => None,
431        }
432    }
433}
434
/// The font stretch.
#[derive(Debug, Copy, Clone)]
pub enum FontStretch {
    /// Normal.
    Normal,
    /// Ultra condensed.
    UltraCondensed,
    /// Extra condensed.
    ExtraCondensed,
    /// Condensed.
    Condensed,
    /// Semi condensed.
    SemiCondensed,
    /// Semi expanded.
    SemiExpanded,
    /// Expanded.
    Expanded,
    /// Extra expanded.
    ExtraExpanded,
    /// Ultra expanded.
    UltraExpanded,
}

impl FontStretch {
    /// Parse a stretch name such as `"SemiCondensed"`.
    ///
    /// Matching is exact and case-sensitive. Anything that is not one of the
    /// eight non-normal names — including `"Normal"` itself and the empty
    /// string — yields [`FontStretch::Normal`].
    fn from_string(s: &str) -> Self {
        const NAMED: [(&str, FontStretch); 8] = [
            ("UltraCondensed", FontStretch::UltraCondensed),
            ("ExtraCondensed", FontStretch::ExtraCondensed),
            ("Condensed", FontStretch::Condensed),
            ("SemiCondensed", FontStretch::SemiCondensed),
            ("SemiExpanded", FontStretch::SemiExpanded),
            ("Expanded", FontStretch::Expanded),
            ("ExtraExpanded", FontStretch::ExtraExpanded),
            ("UltraExpanded", FontStretch::UltraExpanded),
        ];

        NAMED
            .iter()
            .find(|(label, _)| *label == s)
            .map_or(Self::Normal, |&(_, stretch)| stretch)
    }
}
473
bitflags! {
    /// Bitflags describing various characteristics of fonts.
    ///
    /// Decoded from the `FLAGS` entry of a font descriptor (see
    /// `FallbackFontQuery::new`). Bits without a constant here are dropped
    /// when decoding with `from_bits_truncate`.
    #[derive(Debug)]
    pub(crate) struct FontFlags: u32 {
        const FIXED_PITCH = 1 << 0;
        const SERIF = 1 << 1;
        const SYMBOLIC = 1 << 2;
        const SCRIPT = 1 << 3;
        // Bit 4 is intentionally not defined.
        const NON_SYMBOLIC = 1 << 5;
        const ITALIC = 1 << 6;
        // Bits 7 through 15 are intentionally not defined.
        const ALL_CAP = 1 << 16;
        const SMALL_CAP = 1 << 17;
        const FORCE_BOLD = 1 << 18;
    }
}
489
/// A query for a font.
///
/// `Debug` is not derived for this enum, although `FallbackFontQuery` does
/// derive it.
pub enum FontQuery {
    /// A query for one of the 14 PDF standard fonts.
    Standard(StandardFont),
    /// A query for a font that is not embedded in the PDF file.
    ///
    /// Note that this type of query is currently not supported,
    /// but will be implemented in the future.
    Fallback(FallbackFontQuery),
}
500
/// A query for a font with specific properties.
///
/// The `Default` value describes a regular-weight (400), normal-stretch font
/// with no names and every style flag cleared.
#[derive(Debug, Clone)]
pub struct FallbackFontQuery {
    /// The postscript name of the font, with any subset prefix removed.
    pub post_script_name: Option<String>,
    /// The name of the font, with any subset prefix removed.
    pub font_name: Option<String>,
    /// The family of the font.
    pub font_family: Option<String>,
    /// The stretch of the font.
    pub font_stretch: FontStretch,
    /// The weight of the font; 400 when the font does not specify one.
    pub font_weight: u32,
    /// Whether the font is monospaced.
    pub is_fixed_pitch: bool,
    /// Whether the font is serif.
    pub is_serif: bool,
    /// Whether the font is italic.
    pub is_italic: bool,
    /// Whether the font is bold.
    pub is_bold: bool,
    /// Whether the font is small cap.
    pub is_small_cap: bool,
    /// The character collection (registry/ordering) if this is a CID font.
    pub character_collection: Option<CharacterCollection>,
}
527
528impl FallbackFontQuery {
529    pub(crate) fn new(dict: &Dict<'_>) -> Self {
530        let post_script_name = dict
531            .get::<Name>(BASE_FONT)
532            .map(|n| strip_subset_prefix(n.as_str()).to_string());
533
534        let mut data = Self {
535            post_script_name,
536            ..Default::default()
537        };
538
539        if let Some(descriptor) = dict.get::<Dict<'_>>(FONT_DESC) {
540            data.font_name = dict
541                .get::<Name>(FONT_NAME)
542                .map(|n| strip_subset_prefix(n.as_str()).to_string());
543            data.font_family = descriptor
544                .get::<Name>(FONT_FAMILY)
545                .map(|n| n.as_str().to_string());
546            data.font_stretch = descriptor
547                .get::<Name>(FONT_STRETCH)
548                .map(|n| FontStretch::from_string(n.as_str()))
549                .unwrap_or(FontStretch::Normal);
550            data.font_weight = descriptor.get::<u32>(FONT_WEIGHT).unwrap_or(400);
551
552            if let Some(flags) = descriptor
553                .get::<u32>(FLAGS)
554                .map(FontFlags::from_bits_truncate)
555            {
556                data.is_fixed_pitch = flags.contains(FontFlags::FIXED_PITCH);
557                data.is_serif = flags.contains(FontFlags::SERIF);
558                data.is_italic = flags.contains(FontFlags::ITALIC);
559                data.is_small_cap = flags.contains(FontFlags::SMALL_CAP);
560            }
561        }
562
563        data.is_bold |= data.font_weight >= 700;
564
565        if let Some(name) = &data.post_script_name {
566            let lower = name.to_ascii_lowercase();
567            data.is_italic |=
568                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
569            data.is_bold |= lower.contains("bold")
570                || lower.contains("demi")
571                || lower.contains("semibold")
572                || lower.contains("heavy")
573                || lower.contains("black");
574        }
575
576        data
577    }
578
579    /// Do a best-effort fallback to the 14 standard fonts based on the query.
580    pub fn pick_standard_font(&self) -> StandardFont {
581        if self.is_fixed_pitch {
582            match (self.is_bold, self.is_italic) {
583                (true, true) => StandardFont::CourierBoldOblique,
584                (true, false) => StandardFont::CourierBold,
585                (false, true) => StandardFont::CourierOblique,
586                (false, false) => StandardFont::Courier,
587            }
588        } else if !self.is_serif {
589            match (self.is_bold, self.is_italic) {
590                (true, true) => StandardFont::HelveticaBoldOblique,
591                (true, false) => StandardFont::HelveticaBold,
592                (false, true) => StandardFont::HelveticaOblique,
593                (false, false) => StandardFont::Helvetica,
594            }
595        } else {
596            match (self.is_bold, self.is_italic) {
597                (true, true) => StandardFont::TimesBoldItalic,
598                (true, false) => StandardFont::TimesBold,
599                (false, true) => StandardFont::TimesItalic,
600                (false, false) => StandardFont::TimesRoman,
601            }
602        }
603    }
604}
605
impl Default for FallbackFontQuery {
    /// A query with no names, normal stretch, weight 400 and every style
    /// flag cleared. Implemented by hand because the weight default is 400
    /// rather than zero.
    fn default() -> Self {
        Self {
            post_script_name: None,
            font_name: None,
            font_family: None,
            font_stretch: FontStretch::Normal,
            font_weight: 400,
            is_fixed_pitch: false,
            is_serif: false,
            is_italic: false,
            is_bold: false,
            is_small_cap: false,
            character_collection: None,
        }
    }
}
623
624/// Convert a glyph name to a Unicode character, if possible.
625/// Implements the Adobe Glyph List Specification with extended fallbacks
626/// for non-standard names found in real-world PDFs.
627/// <https://github.com/adobe-type-tools/agl-specification>
628pub(crate) fn glyph_name_to_unicode(name: &str) -> Option<char> {
629    // 1. Direct AGL lookup
630    if let Some(unicode_str) = glyph_names::get(name) {
631        return unicode_str.chars().next();
632    }
633
634    // 2. uni/u prefix convention
635    if let Some(c) = unicode_from_name(name) {
636        return Some(c);
637    }
638
639    // 3. AGL §2.4: strip variant suffix after period (e.g., "A.swash" → "A",
640    //    "comma.alt" → "comma"). Only the base name before the first period
641    //    maps to Unicode.
642    if let Some(dot_pos) = name.find('.') {
643        let base = &name[..dot_pos];
644        if !base.is_empty() {
645            if let Some(c) = glyph_names::get(base).and_then(|s| s.chars().next()) {
646                return Some(c);
647            }
648            if let Some(c) = unicode_from_name(base) {
649                return Some(c);
650            }
651        }
652    }
653
654    // 4. Handle "aXX" decimal glyph names (e.g., "a65" → 'A') used by some
655    //    TeX/LaTeX generated PDFs and custom encoding vectors.
656    if name.starts_with('a') && name.len() >= 2 {
657        if let Ok(code) = name[1..].parse::<u32>() {
658            if let Some(c) = char::from_u32(code) {
659                if !c.is_control() || c == ' ' {
660                    return Some(c);
661                }
662            }
663        }
664    }
665
666    warn!("failed to map glyph name {} to unicode", name);
667    None
668}
669
670/// AGL §2.2: resolve a glyph name to one or more Unicode code points. Handles
671/// everything `glyph_name_to_unicode` does, plus underscore-joined ligature
672/// names (e.g. `f_i` → "fi", `f_f_i` → "ffi") which glyph-list-based fonts
673/// use for Latin ligatures. Returns the full string, so ligatures surface as
674/// their component code points rather than being dropped when the composite
675/// name isn't in the AGL.
676///
677/// Strips the variant suffix (`.ss01`, `.swash`, …) from the base name before
678/// splitting, matching AGL §2.4. Bails out if any component fails to resolve
679/// so we never produce a partial string (which would be worse than None).
680pub(crate) fn glyph_name_to_string(name: &str) -> Option<String> {
681    // Try the single-code-point fast path first — it also handles `.suffix`
682    // stripping, `uniXXXX`, and the `aNN` decimal convention.
683    if let Some(c) = glyph_name_to_unicode(name) {
684        return Some(c.to_string());
685    }
686
687    let base = name.split_once('.').map(|(b, _)| b).unwrap_or(name);
688    if !base.contains('_') {
689        return None;
690    }
691
692    let mut out = String::new();
693    for part in base.split('_') {
694        if part.is_empty() {
695            return None;
696        }
697        let c = glyph_name_to_unicode(part)?;
698        out.push(c);
699    }
700    (!out.is_empty()).then_some(out)
701}
702
/// Interpret a glyph name of the form `uni<hex>` or `u<hex>` as the Unicode
/// character with that hexadecimal code point.
///
/// The `uni` prefix takes precedence over the bare `u` prefix. Returns `None`
/// when the name has neither prefix, when the remainder is not a hexadecimal
/// number, or when the number is not a valid Unicode scalar value.
pub(crate) fn unicode_from_name(name: &str) -> Option<char> {
    let hex = match name.strip_prefix("uni") {
        Some(rest) => rest,
        None => name.strip_prefix('u')?,
    };

    let code_point = u32::from_str_radix(hex, 16).ok()?;
    char::from_u32(code_point)
}
714
715pub(crate) fn read_to_unicode(dict: &Dict<'_>, cmap_resolver: &CMapResolverFn) -> Option<CMap> {
716    dict.get::<Stream<'_>>(TO_UNICODE)
717        .and_then(|s| s.decoded().ok())
718        // See PDFJS-11915, where `Identity-H` is used for `ToUnicode`. I don't
719        // believe it's valid, but at least mupdf seems to be able to deal with it.
720        .or_else(|| {
721            dict.get::<Name>(TO_UNICODE)
722                .and_then(|name| (cmap_resolver)(CMapName::from_bytes(name.as_ref())))
723                .map(|d| d.to_vec())
724        })
725        .and_then(|data| {
726            let cmap_resolver = cmap_resolver.clone();
727            CMap::parse(&data, move |name| (cmap_resolver)(name))
728        })
729}
730
731/// Build a 256-entry Unicode map from the font's `/Encoding` entry.
732/// Used as a fallback when no `/ToUnicode` CMap is present.
733pub(crate) fn synthesize_unicode_map_from_encoding(dict: &Dict<'_>) -> Option<[Option<char>; 256]> {
734    let (base_encoding, differences) = true_type::read_encoding(dict);
735    // Only synthesize if there is something to map (encoding name or differences).
736    // For BuiltIn with no differences we cannot reliably guess the mapping.
737    if matches!(base_encoding, Encoding::BuiltIn) && differences.is_empty() {
738        return None;
739    }
740    let mut table: [Option<char>; 256] = [None; 256];
741    for code in 0u8..=255 {
742        let glyph_name = differences
743            .get(&code)
744            .map(String::as_str)
745            .or_else(|| base_encoding.map_code(code));
746        if let Some(name) = glyph_name {
747            table[code as usize] = glyph_name_to_unicode(name);
748        }
749    }
750    Some(table)
751}
752
// Some fonts only contain a glyph under the "normalized" spelling of a name.
// The aliases below are Adobe Glyph List (AGL) synonyms and are meant to be
// consulted only after the literal lookup has failed, so extending the list
// widens recovery without overriding a mapping that already works.
//
// Non-breaking space names map to "space", soft hyphen names map to "hyphen",
// and every other name is returned unchanged.
pub(crate) fn normalized_glyph_name(name: &str) -> &str {
    match name {
        "nbspace" | "nonbreakingspace" => "space",
        "sfthyphen" | "softhyphen" => "hyphen",
        other => other,
    }
}
768
// Unit tests for `normalized_glyph_name`: each alias collapses to its
// canonical name, and names that are not aliases pass through untouched.
#[cfg(test)]
mod normalized_glyph_name_tests {
    use super::normalized_glyph_name;

    // Both non-breaking-space spellings map to "space".
    #[test]
    fn maps_space_aliases() {
        assert_eq!(normalized_glyph_name("nbspace"), "space");
        assert_eq!(normalized_glyph_name("nonbreakingspace"), "space");
    }

    // Both soft-hyphen spellings map to "hyphen".
    #[test]
    fn maps_hyphen_aliases() {
        assert_eq!(normalized_glyph_name("sfthyphen"), "hyphen");
        assert_eq!(normalized_glyph_name("softhyphen"), "hyphen");
    }

    // Canonical names and unrelated names are returned as-is.
    #[test]
    fn preserves_unrelated_names() {
        assert_eq!(normalized_glyph_name("A"), "A");
        assert_eq!(normalized_glyph_name(".notdef"), ".notdef");
        assert_eq!(normalized_glyph_name("hyphen"), "hyphen");
        assert_eq!(normalized_glyph_name("space"), "space");
    }
}
793
// Unit tests for `glyph_name_to_unicode` (single character results) and
// `glyph_name_to_string` (ligature names resolved to multiple characters).
#[cfg(test)]
mod glyph_name_to_unicode_tests {
    use super::{glyph_name_to_string, glyph_name_to_unicode};

    // Names found directly in the glyph list.
    #[test]
    fn standard_agl_name() {
        assert_eq!(glyph_name_to_unicode("A"), Some('A'));
        assert_eq!(glyph_name_to_unicode("space"), Some(' '));
        assert_eq!(glyph_name_to_unicode("hyphen"), Some('-'));
    }

    #[test]
    fn ligature_name_underscore_joined() {
        // AGL maps `fi` directly (U+FB01); the underscore form is what
        // CFF fonts without a ToUnicode map tend to emit for the same
        // ligature. We want the decomposed string so readers see "fi".
        assert_eq!(glyph_name_to_string("f_i"), Some("fi".to_string()));
        assert_eq!(glyph_name_to_string("f_f_i"), Some("ffi".to_string()));
        assert_eq!(glyph_name_to_string("A_B_C"), Some("ABC".to_string()));
    }

    #[test]
    fn ligature_name_with_suffix() {
        // Variant suffix must be stripped before splitting on `_`.
        assert_eq!(glyph_name_to_string("f_i.alt"), Some("fi".to_string()));
    }

    #[test]
    fn ligature_name_falls_back_to_single_char_path() {
        // Direct AGL hit should still win and yield the precomposed char.
        assert_eq!(glyph_name_to_string("fi"), Some("\u{FB01}".to_string()));
        assert_eq!(glyph_name_to_string("A"), Some("A".to_string()));
    }

    #[test]
    fn ligature_name_rejects_unresolvable_component() {
        // If any segment is unknown we return None rather than a partial
        // string — a partial would be more confusing than a drop.
        assert!(glyph_name_to_string("A_totallyUnknownGlyph").is_none());
        assert!(glyph_name_to_string("_").is_none());
        assert!(glyph_name_to_string("A__B").is_none());
    }

    // `uni` followed by a hexadecimal code point.
    #[test]
    fn uni_prefix() {
        assert_eq!(glyph_name_to_unicode("uni0041"), Some('A'));
        assert_eq!(glyph_name_to_unicode("uni00E9"), Some('é'));
    }

    // Bare `u` followed by a hexadecimal code point.
    #[test]
    fn u_prefix() {
        assert_eq!(glyph_name_to_unicode("u0041"), Some('A'));
        assert_eq!(glyph_name_to_unicode("u2022"), Some('•'));
    }

    // Everything after the first period is ignored for the lookup.
    #[test]
    fn variant_suffix_stripped() {
        assert_eq!(glyph_name_to_unicode("A.swash"), Some('A'));
        assert_eq!(glyph_name_to_unicode("comma.alt"), Some(','));
        assert_eq!(glyph_name_to_unicode("space.narrow"), Some(' '));
    }

    // Suffix stripping also applies before the `uni` convention is tried.
    #[test]
    fn variant_suffix_with_uni_prefix() {
        assert_eq!(glyph_name_to_unicode("uni0041.ss01"), Some('A'));
    }

    // `a` followed by a decimal code point.
    #[test]
    fn a_decimal_glyph_name() {
        assert_eq!(glyph_name_to_unicode("a65"), Some('A'));
        assert_eq!(glyph_name_to_unicode("a32"), Some(' '));
        assert_eq!(glyph_name_to_unicode("a97"), Some('a'));
    }

    // Decimal names that decode to control characters are not mapped.
    #[test]
    fn a_decimal_rejects_control_chars() {
        assert_eq!(glyph_name_to_unicode("a0"), None);
        assert_eq!(glyph_name_to_unicode("a7"), None);
    }

    #[test]
    fn unknown_name_returns_none() {
        assert_eq!(glyph_name_to_unicode("xyzzynonexistent"), None);
    }
}
879
// Unit tests for the standard-font selection of `FallbackFontQuery`.
#[cfg(test)]
mod fallback_font_query_tests {
    use super::*;

    // Builds a query from a PostScript name, raw descriptor flags and a
    // weight.
    //
    // NOTE(review): this helper repeats the flag, weight and name heuristics
    // of `FallbackFontQuery::new` instead of calling it, so the tests below
    // exercise `pick_standard_font` and this copy of the heuristics — they
    // will not notice if `new` itself drifts. Keep the two in sync.
    fn query_with(name: &str, flags: u32, weight: u32) -> FallbackFontQuery {
        let mut q = FallbackFontQuery {
            post_script_name: Some(name.to_string()),
            font_weight: weight,
            ..Default::default()
        };

        let font_flags = FontFlags::from_bits_truncate(flags);
        q.is_fixed_pitch = font_flags.contains(FontFlags::FIXED_PITCH);
        q.is_serif = font_flags.contains(FontFlags::SERIF);
        q.is_italic = font_flags.contains(FontFlags::ITALIC);
        q.is_small_cap = font_flags.contains(FontFlags::SMALL_CAP);

        q.is_bold |= q.font_weight >= 700;

        if let Some(name) = &q.post_script_name {
            let lower = name.to_ascii_lowercase();
            q.is_italic |=
                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
            q.is_bold |= lower.contains("bold")
                || lower.contains("demi")
                || lower.contains("semibold")
                || lower.contains("heavy")
                || lower.contains("black");
        }
        q
    }

    // The fixed-pitch flag alone is enough to pick the Courier family.
    #[test]
    fn fixed_pitch_flag_selects_courier() {
        let q = query_with("LetterGothic", FontFlags::FIXED_PITCH.bits(), 400);
        assert!(matches!(q.pick_standard_font(), StandardFont::Courier));
    }

    // "Demi" in the name counts as bold even at weight 400.
    #[test]
    fn demi_in_name_selects_bold() {
        let q = query_with("FranklinGothic-Demi", FontFlags::SERIF.bits(), 400);
        assert!(q.is_bold);
        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
    }

    // "Oblique" in the name counts as italic without the italic flag.
    #[test]
    fn oblique_detected_as_italic() {
        let q = query_with("HelveticaNeue-LightOblique", 0, 400);
        assert!(q.is_italic);
        assert!(matches!(
            q.pick_standard_font(),
            StandardFont::HelveticaOblique
        ));
    }

    // A weight of exactly 700 is already bold.
    #[test]
    fn font_weight_700_detected_as_bold() {
        let q = query_with("CustomFont", 0, 700);
        assert!(q.is_bold);
        assert!(matches!(
            q.pick_standard_font(),
            StandardFont::HelveticaBold
        ));
    }

    // "Semibold" in the name counts as bold.
    #[test]
    fn semibold_detected_as_bold() {
        let q = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
        assert!(q.is_bold);
        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
    }
}
951}