Skip to main content

pdf_interpret/font/
mod.rs

1//! Interacting with the different kinds of PDF fonts.
2
3use crate::cache::Cache;
4use crate::context::Context;
5use crate::device::Device;
6use crate::font::cid::Type0Font;
7use crate::font::generated::{
8    glyph_names, mac_expert, mac_os_roman, mac_roman, standard, win_ansi,
9};
10use crate::font::true_type::TrueTypeFont;
11use crate::font::type1::Type1Font;
12use crate::font::type3::Type3;
13use crate::interpret::state::State;
14use crate::util::decode_or_warn;
15use crate::{CMapResolverFn, CacheKey, FontResolverFn, InterpreterSettings, Paint, WarningSinkFn};
16use bitflags::bitflags;
17use kurbo::{Affine, BezPath, Vec2};
18use log::warn;
19use outline::OutlineFont;
20use pdf_syntax::object::Name;
21use pdf_syntax::object::dict::keys::SUBTYPE;
22use pdf_syntax::object::dict::keys::*;
23use pdf_syntax::object::{Dict, Stream};
24use pdf_syntax::page::Resources;
25use pdf_syntax::xref::XRef;
26use skrifa::GlyphId;
27use std::fmt::Debug;
28use std::ops::Deref;
29use std::rc::Rc;
30use std::sync::Arc;
31
32mod blob;
33mod cid;
34mod generated;
35mod glyph_simulator;
36pub(crate) mod outline;
37mod standard_font;
38mod true_type;
39mod type1;
40pub(crate) mod type3;
41
42pub(crate) const UNITS_PER_EM: f32 = 1000.0;
43
44pub(crate) fn stretch_glyph(path: BezPath, expected_width: f32, actual_width: f32) -> BezPath {
45    if actual_width != 0.0 && actual_width != expected_width {
46        let stretch_factor = expected_width / actual_width;
47        Affine::scale_non_uniform(stretch_factor as f64, 1.0) * path
48    } else {
49        path
50    }
51}
52
/// A container for the bytes of a font file.
54pub type FontData = Arc<dyn AsRef<[u8]> + Send + Sync>;
55
/// Remove the subset tag from a PostScript font name.
///
/// Subset fonts in PDF files are named like `ABCDEF+TimesNewRoman`. When the
/// text before the first `+` is exactly six bytes long, everything after that
/// `+` is returned (`TimesNewRoman`); in every other case the name is
/// returned unchanged.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    if let Some((tag, base_name)) = name.split_once('+') {
        if tag.len() == 6 {
            return base_name;
        }
    }

    name
}
67
68use crate::util::hash128;
69pub use outline::OutlineFontData;
70use pdf_font::cmap::{BfString, CMap, CMapName, CharacterCollection};
71pub use standard_font::StandardFont;
72
73/// A glyph that can be drawn.
74pub enum Glyph<'a> {
75    /// A glyph defined by an outline.
76    Outline(OutlineGlyph),
77    /// A type3 glyph, defined by PDF drawing instructions.
78    Type3(Box<Type3Glyph<'a>>),
79}
80
81impl Glyph<'_> {
82    /// Returns the Unicode code point for this glyph, if available.
83    ///
84    /// This method attempts to determine the Unicode character that this glyph
85    /// represents. The exact fallback chain depends on the font type:
86    ///
87    /// **For Outline Fonts (Type1, TrueType, CFF):**
88    /// 1. `ToUnicode` cmap
89    /// 2. Glyph name → Unicode (via Adobe Glyph List)
90    /// 3. Unicode naming conventions (e.g., "uni0041", "u0041")
91    ///
92    /// **For CID Fonts (Type0):**
93    /// 1. `ToUnicode` cmap
94    ///
95    ///
96    /// **For Type3 Fonts:**
97    /// 1. `ToUnicode` cmap
98    ///
99    /// Returns `None` if the Unicode value could not be determined.
100    ///
101    /// Please note that this method is still somewhat experimental and might
102    /// not work reliably in all cases.
103    pub fn as_unicode(&self) -> Option<BfString> {
104        match self {
105            Glyph::Outline(g) => g.as_unicode(),
106            Glyph::Type3(g) => g.as_unicode(),
107        }
108    }
109}
110
111/// An identifier that uniquely identifies a glyph, for caching purposes.
112#[derive(Clone, Debug)]
113pub struct GlyphIdentifier {
114    id: GlyphId,
115    font: OutlineFont,
116}
117
118impl CacheKey for GlyphIdentifier {
119    fn cache_key(&self) -> u128 {
120        hash128(&(self.id, self.font.cache_key()))
121    }
122}
123
124/// A glyph defined by an outline.
125#[derive(Clone, Debug)]
126pub struct OutlineGlyph {
127    pub(crate) id: GlyphId,
128    pub(crate) font: OutlineFont,
129    pub(crate) char_code: u32,
130}
131
132impl OutlineGlyph {
133    /// Return the outline of the glyph, assuming an upem value of 1000.
134    pub fn outline(&self) -> BezPath {
135        self.font.outline_glyph(self.id, self.char_code)
136    }
137
138    /// Return the identifier of the glyph. You can use this to calculate the cache key
139    /// for the glyph.
140    ///
141    /// Note that the `glyph_transform` attribute is not considered in the cache key of
142    /// the identifier, only the glyph ID and the font.
143    pub fn identifier(&self) -> GlyphIdentifier {
144        GlyphIdentifier {
145            id: self.id,
146            font: self.font.clone(),
147        }
148    }
149
150    /// Returns the Unicode code point for this glyph, if available.
151    ///
152    /// See [`Glyph::as_unicode`] for details on the fallback chain used.
153    pub fn as_unicode(&self) -> Option<BfString> {
154        self.font.char_code_to_unicode(self.char_code)
155    }
156
157    /// Get raw font bytes and metadata for downstream use.
158    ///
159    /// Returns `None` for Type1 fonts.
160    pub fn font_data(&self) -> Option<OutlineFontData> {
161        self.font.font_data()
162    }
163
164    /// PostScript name of the underlying font, if determinable.
165    ///
166    /// Works even when [`font_data`](Self::font_data) returns
167    /// `None` — e.g. non-embedded standard-14 Type1 fonts still
168    /// expose their canonical PostScript name through this
169    /// accessor. Returns `None` when no name source is available.
170    ///
171    /// Surfaced to support the WASM `getTextPositions()` `fontName`
172    /// field for the editor contract.
173    pub fn postscript_name(&self) -> Option<String> {
174        self.font.postscript_name()
175    }
176
177    /// Get the glyph ID within the font.
178    pub fn glyph_id(&self) -> GlyphId {
179        self.id
180    }
181
182    /// Get the advance width for this glyph.
183    ///
184    /// The advance width is how far to move horizontally after drawing
185    /// this glyph before drawing the next one.
186    pub fn advance_width(&self) -> Option<f32> {
187        self.font.glyph_advance_width(self.char_code)
188    }
189
190    /// Get the cache key for this glyph's font.
191    ///
192    /// This identifies the font uniquely, even when `font_data()` returns `None`
193    /// (e.g., for Type1 fonts). Useful for grouping glyphs by font.
194    pub fn font_cache_key(&self) -> u128 {
195        self.font.cache_key()
196    }
197}
198
199/// A type3 glyph.
200#[derive(Clone)]
201pub struct Type3Glyph<'a> {
202    pub(crate) font: Rc<Type3<'a>>,
203    pub(crate) glyph_id: GlyphId,
204    pub(crate) state: State<'a>,
205    pub(crate) parent_resources: Resources<'a>,
206    pub(crate) cache: Cache,
207    pub(crate) xref: &'a XRef,
208    pub(crate) settings: InterpreterSettings,
209    pub(crate) char_code: u32,
210}
211
212/// A glyph defined by PDF drawing instructions.
213impl<'a> Type3Glyph<'a> {
214    /// Draw the type3 glyph to the given device.
215    pub fn interpret(
216        &self,
217        device: &mut impl Device<'a>,
218        transform: Affine,
219        glyph_transform: Affine,
220        paint: &Paint<'a>,
221    ) {
222        self.font
223            .render_glyph(self, transform, glyph_transform, paint, device);
224    }
225
226    /// Returns the Unicode code point for this glyph, if available.
227    ///
228    /// Note: Type3 fonts can only provide Unicode via `ToUnicode` cmap.
229    pub fn as_unicode(&self) -> Option<BfString> {
230        self.font.char_code_to_unicode(self.char_code)
231    }
232}
233
234impl CacheKey for Type3Glyph<'_> {
235    fn cache_key(&self) -> u128 {
236        hash128(&(self.font.cache_key(), self.glyph_id))
237    }
238}
239
240#[derive(Clone, Debug)]
241pub(crate) struct Font<'a>(u128, FontType<'a>);
242
243impl<'a> Font<'a> {
244    pub(crate) fn new(
245        dict: &Dict<'a>,
246        font_resolver: &FontResolverFn,
247        cmap_resolver: &CMapResolverFn,
248        warning_sink: &WarningSinkFn,
249    ) -> Option<Self> {
250        let f_type = match dict.get::<Name>(SUBTYPE)?.deref() {
251            TYPE1 | MM_TYPE1 => FontType::Type1(Rc::new(Type1Font::new(
252                dict,
253                font_resolver,
254                cmap_resolver,
255                warning_sink,
256            )?)),
257            // PDFBOX-5463: PDF viewers seem to accept OpenType as well.
258            TRUE_TYPE | OPEN_TYPE => FontType::TrueType(Rc::new(TrueTypeFont::new(
259                dict,
260                font_resolver,
261                cmap_resolver,
262                warning_sink,
263            )?)),
264            TYPE0 => FontType::Type0(Rc::new(Type0Font::new(
265                dict,
266                font_resolver,
267                cmap_resolver,
268                warning_sink,
269            )?)),
270            TYPE3 => FontType::Type3(Rc::new(Type3::new(dict, cmap_resolver, warning_sink)?)),
271            f => {
272                warn!(
273                    "unimplemented font type {:?}",
274                    std::str::from_utf8(f).unwrap_or("unknown type")
275                );
276
277                return None;
278            }
279        };
280
281        let cache_key = dict.cache_key();
282
283        Some(Self(cache_key, f_type))
284    }
285
286    pub(crate) fn new_standard(
287        standard_font: StandardFont,
288        font_resolver: &FontResolverFn,
289    ) -> Option<Self> {
290        let font = Type1Font::new_standard(standard_font, font_resolver)?;
291
292        Some(Self(0, FontType::Type1(Rc::new(font))))
293    }
294
295    pub(crate) fn map_code(&self, code: u32) -> GlyphId {
296        match &self.1 {
297            FontType::Type1(f) => {
298                debug_assert!(code <= u8::MAX as u32);
299
300                f.map_code(code as u8)
301            }
302            FontType::TrueType(t) => {
303                debug_assert!(code <= u8::MAX as u32);
304
305                t.map_code(code as u8)
306            }
307            FontType::Type0(t) => t.map_code(code),
308            FontType::Type3(t) => {
309                debug_assert!(code <= u8::MAX as u32);
310
311                t.map_code(code as u8)
312            }
313        }
314    }
315
316    pub(crate) fn get_glyph(
317        &self,
318        glyph: GlyphId,
319        char_code: u32,
320        ctx: &mut Context<'a>,
321        resources: &Resources<'a>,
322        origin_displacement: Vec2,
323    ) -> (Glyph<'a>, Affine) {
324        let glyph_transform = ctx.get().text_state.full_transform()
325            * Affine::scale(1.0 / UNITS_PER_EM as f64)
326            * Affine::translate(origin_displacement);
327
328        let glyph = match &self.1 {
329            FontType::Type1(t) => {
330                let font = OutlineFont::Type1(t.clone());
331                Glyph::Outline(OutlineGlyph {
332                    id: glyph,
333                    font,
334                    char_code,
335                })
336            }
337            FontType::TrueType(t) => {
338                let font = OutlineFont::TrueType(t.clone());
339                Glyph::Outline(OutlineGlyph {
340                    id: glyph,
341                    font,
342                    char_code,
343                })
344            }
345            FontType::Type0(t) => {
346                let font = OutlineFont::Type0(t.clone());
347                Glyph::Outline(OutlineGlyph {
348                    id: glyph,
349                    font,
350                    char_code,
351                })
352            }
353            FontType::Type3(t) => {
354                let shape_glyph = Type3Glyph {
355                    font: t.clone(),
356                    glyph_id: glyph,
357                    state: ctx.get().clone(),
358                    parent_resources: resources.clone(),
359                    cache: ctx.object_cache.clone(),
360                    xref: ctx.xref,
361                    settings: ctx.settings.clone(),
362                    char_code,
363                };
364
365                Glyph::Type3(Box::new(shape_glyph))
366            }
367        };
368
369        (glyph, glyph_transform)
370    }
371
372    pub(crate) fn code_advance(&self, code: u32) -> Vec2 {
373        match &self.1 {
374            FontType::Type1(t) => {
375                debug_assert!(code <= u8::MAX as u32);
376
377                Vec2::new(t.glyph_width(code as u8).unwrap_or(0.0) as f64, 0.0)
378            }
379            FontType::TrueType(t) => {
380                debug_assert!(code <= u8::MAX as u32);
381
382                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
383            }
384            FontType::Type0(t) => t.code_advance(code),
385            FontType::Type3(t) => {
386                debug_assert!(code <= u8::MAX as u32);
387
388                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
389            }
390        }
391    }
392
393    pub(crate) fn origin_displacement(&self, code: u32) -> Vec2 {
394        match &self.1 {
395            FontType::Type1(_) => Vec2::default(),
396            FontType::TrueType(_) => Vec2::default(),
397            FontType::Type0(t) => t.origin_displacement(code),
398            FontType::Type3(_) => Vec2::default(),
399        }
400    }
401
402    pub(crate) fn read_code(&self, bytes: &[u8], offset: usize) -> (u32, usize) {
403        match &self.1 {
404            FontType::Type1(_) => (bytes[offset] as u32, 1),
405            FontType::TrueType(_) => (bytes[offset] as u32, 1),
406            FontType::Type0(t) => t.read_code(bytes, offset),
407            FontType::Type3(_) => (bytes[offset] as u32, 1),
408        }
409    }
410
411    pub(crate) fn is_horizontal(&self) -> bool {
412        match &self.1 {
413            FontType::Type1(_) => true,
414            FontType::TrueType(_) => true,
415            FontType::Type0(t) => t.is_horizontal(),
416            FontType::Type3(_) => true,
417        }
418    }
419}
420
421impl CacheKey for Font<'_> {
422    fn cache_key(&self) -> u128 {
423        self.0
424    }
425}
426
/// The concrete implementation behind a [`Font`].
///
/// Every variant holds its font behind an `Rc`, so cloning a `FontType`
/// only bumps a reference count.
#[derive(Clone, Debug)]
enum FontType<'a> {
    /// A Type1 font (also used for the `MM_TYPE1` subtype and for standard fonts).
    Type1(Rc<Type1Font>),
    /// A TrueType font (also used for the `OPEN_TYPE` subtype).
    TrueType(Rc<TrueTypeFont>),
    /// A Type0 (CID) font.
    Type0(Rc<Type0Font>),
    /// A Type3 font, whose glyphs are PDF drawing instructions.
    Type3(Rc<Type3<'a>>),
}
434
435#[derive(Debug)]
436enum Encoding {
437    Standard,
438    MacRoman,
439    WinAnsi,
440    MacExpert,
441    BuiltIn,
442}
443
444impl Encoding {
445    fn map_code(&self, code: u8) -> Option<&'static str> {
446        if code == 0 {
447            return Some(".notdef");
448        }
449        match self {
450            Self::Standard => standard::get(code),
451            Self::MacRoman => mac_roman::get(code).or_else(|| mac_os_roman::get(code)),
452            Self::WinAnsi => win_ansi::get(code),
453            Self::MacExpert => mac_expert::get(code),
454            Self::BuiltIn => None,
455        }
456    }
457}
458
/// The font stretch.
#[derive(Debug, Copy, Clone)]
pub enum FontStretch {
    /// Normal.
    Normal,
    /// Ultra condensed.
    UltraCondensed,
    /// Extra condensed.
    ExtraCondensed,
    /// Condensed.
    Condensed,
    /// Semi condensed.
    SemiCondensed,
    /// Semi expanded.
    SemiExpanded,
    /// Expanded.
    Expanded,
    /// Extra expanded.
    ExtraExpanded,
    /// Ultra expanded.
    UltraExpanded,
}

impl FontStretch {
    /// Parse a stretch keyword.
    ///
    /// The comparison is exact (case-sensitive); any string that is not one
    /// of the eight non-normal keywords yields [`FontStretch::Normal`].
    fn from_string(s: &str) -> Self {
        const NAMED: [(&str, FontStretch); 8] = [
            ("UltraCondensed", FontStretch::UltraCondensed),
            ("ExtraCondensed", FontStretch::ExtraCondensed),
            ("Condensed", FontStretch::Condensed),
            ("SemiCondensed", FontStretch::SemiCondensed),
            ("SemiExpanded", FontStretch::SemiExpanded),
            ("Expanded", FontStretch::Expanded),
            ("ExtraExpanded", FontStretch::ExtraExpanded),
            ("UltraExpanded", FontStretch::UltraExpanded),
        ];

        NAMED
            .iter()
            .find(|(keyword, _)| *keyword == s)
            .map_or(Self::Normal, |&(_, stretch)| stretch)
    }
}
497
bitflags! {
    /// Bitflags describing various characteristics of fonts.
    ///
    /// The value is read from the `FLAGS` entry of a font descriptor with
    /// `from_bits_truncate`, so bits that are not listed here are dropped.
    /// Note that the bit values are not contiguous: `1 << 4` and
    /// `1 << 7 ..= 1 << 15` are not assigned.
    #[derive(Debug)]
    pub(crate) struct FontFlags: u32 {
        // Consulted by `FallbackFontQuery::new`.
        const FIXED_PITCH = 1 << 0;
        // Consulted by `FallbackFontQuery::new`.
        const SERIF = 1 << 1;
        const SYMBOLIC = 1 << 2;
        const SCRIPT = 1 << 3;
        const NON_SYMBOLIC = 1 << 5;
        // Consulted by `FallbackFontQuery::new`.
        const ITALIC = 1 << 6;
        const ALL_CAP = 1 << 16;
        // Consulted by `FallbackFontQuery::new`.
        const SMALL_CAP = 1 << 17;
        const FORCE_BOLD = 1 << 18;
    }
}
513
/// A query for a font.
///
/// A query either names one of the standard fonts directly or describes the
/// wanted font through the properties collected in a [`FallbackFontQuery`].
pub enum FontQuery {
    /// A query for one of the 14 PDF standard fonts.
    Standard(StandardFont),
    /// A query for a font that is not embedded in the PDF file.
    ///
    /// Note that this type of query is currently not supported,
    /// but will be implemented in the future.
    Fallback(FallbackFontQuery),
}
524
525/// A query for a font with specific properties.
526#[derive(Debug, Clone)]
527pub struct FallbackFontQuery {
528    /// The postscript name of the font.
529    pub post_script_name: Option<String>,
530    /// The name of the font.
531    pub font_name: Option<String>,
532    /// The family of the font.
533    pub font_family: Option<String>,
534    /// The stretch of the font.
535    pub font_stretch: FontStretch,
536    /// The weight of the font.
537    pub font_weight: u32,
538    /// Whether the font is monospaced.
539    pub is_fixed_pitch: bool,
540    /// Whether the font is serif.
541    pub is_serif: bool,
542    /// Whether the font is italic.
543    pub is_italic: bool,
544    /// Whether the font is bold.
545    pub is_bold: bool,
546    /// Whether the font is small cap.
547    pub is_small_cap: bool,
548    /// The character collection (registry/ordering) if this is a CID font.
549    pub character_collection: Option<CharacterCollection>,
550}
551
552impl FallbackFontQuery {
553    pub(crate) fn new(dict: &Dict<'_>) -> Self {
554        let post_script_name = dict
555            .get::<Name>(BASE_FONT)
556            .map(|n| strip_subset_prefix(n.as_str()).to_string());
557
558        let mut data = Self {
559            post_script_name,
560            ..Default::default()
561        };
562
563        if let Some(descriptor) = dict.get::<Dict<'_>>(FONT_DESC) {
564            data.font_name = dict
565                .get::<Name>(FONT_NAME)
566                .map(|n| strip_subset_prefix(n.as_str()).to_string());
567            data.font_family = descriptor
568                .get::<Name>(FONT_FAMILY)
569                .map(|n| n.as_str().to_string());
570            data.font_stretch = descriptor
571                .get::<Name>(FONT_STRETCH)
572                .map(|n| FontStretch::from_string(n.as_str()))
573                .unwrap_or(FontStretch::Normal);
574            data.font_weight = descriptor.get::<u32>(FONT_WEIGHT).unwrap_or(400);
575
576            if let Some(flags) = descriptor
577                .get::<u32>(FLAGS)
578                .map(FontFlags::from_bits_truncate)
579            {
580                data.is_fixed_pitch = flags.contains(FontFlags::FIXED_PITCH);
581                data.is_serif = flags.contains(FontFlags::SERIF);
582                data.is_italic = flags.contains(FontFlags::ITALIC);
583                data.is_small_cap = flags.contains(FontFlags::SMALL_CAP);
584            }
585        }
586
587        data.is_bold |= data.font_weight >= 700;
588
589        if let Some(name) = &data.post_script_name {
590            let lower = name.to_ascii_lowercase();
591            data.is_italic |=
592                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
593            data.is_bold |= lower.contains("bold")
594                || lower.contains("demi")
595                || lower.contains("semibold")
596                || lower.contains("heavy")
597                || lower.contains("black");
598        }
599
600        data
601    }
602
603    /// Do a best-effort fallback to the 14 standard fonts based on the query.
604    pub fn pick_standard_font(&self) -> StandardFont {
605        if self.is_fixed_pitch {
606            match (self.is_bold, self.is_italic) {
607                (true, true) => StandardFont::CourierBoldOblique,
608                (true, false) => StandardFont::CourierBold,
609                (false, true) => StandardFont::CourierOblique,
610                (false, false) => StandardFont::Courier,
611            }
612        } else if !self.is_serif {
613            match (self.is_bold, self.is_italic) {
614                (true, true) => StandardFont::HelveticaBoldOblique,
615                (true, false) => StandardFont::HelveticaBold,
616                (false, true) => StandardFont::HelveticaOblique,
617                (false, false) => StandardFont::Helvetica,
618            }
619        } else {
620            match (self.is_bold, self.is_italic) {
621                (true, true) => StandardFont::TimesBoldItalic,
622                (true, false) => StandardFont::TimesBold,
623                (false, true) => StandardFont::TimesItalic,
624                (false, false) => StandardFont::TimesRoman,
625            }
626        }
627    }
628}
629
630impl Default for FallbackFontQuery {
631    fn default() -> Self {
632        Self {
633            post_script_name: None,
634            font_name: None,
635            font_family: None,
636            font_stretch: FontStretch::Normal,
637            font_weight: 400,
638            is_fixed_pitch: false,
639            is_serif: false,
640            is_italic: false,
641            is_bold: false,
642            is_small_cap: false,
643            character_collection: None,
644        }
645    }
646}
647
648/// Convert a glyph name to a Unicode character, if possible.
649/// Implements the Adobe Glyph List Specification with extended fallbacks
650/// for non-standard names found in real-world PDFs.
651/// <https://github.com/adobe-type-tools/agl-specification>
652pub(crate) fn glyph_name_to_unicode(name: &str) -> Option<char> {
653    // 1. Direct AGL lookup
654    if let Some(unicode_str) = glyph_names::get(name) {
655        return unicode_str.chars().next();
656    }
657
658    // 2. uni/u prefix convention
659    if let Some(c) = unicode_from_name(name) {
660        return Some(c);
661    }
662
663    // 3. AGL §2.4: strip variant suffix after period (e.g., "A.swash" → "A",
664    //    "comma.alt" → "comma"). Only the base name before the first period
665    //    maps to Unicode.
666    if let Some(dot_pos) = name.find('.') {
667        let base = &name[..dot_pos];
668        if !base.is_empty() {
669            if let Some(c) = glyph_names::get(base).and_then(|s| s.chars().next()) {
670                return Some(c);
671            }
672            if let Some(c) = unicode_from_name(base) {
673                return Some(c);
674            }
675        }
676    }
677
678    // 4. Handle "aXX" decimal glyph names (e.g., "a65" → 'A') used by some
679    //    TeX/LaTeX generated PDFs and custom encoding vectors.
680    if name.starts_with('a')
681        && name.len() >= 2
682        && let Ok(code) = name[1..].parse::<u32>()
683        && let Some(c) = char::from_u32(code)
684        && (!c.is_control() || c == ' ')
685    {
686        return Some(c);
687    }
688
689    warn!("failed to map glyph name {} to unicode", name);
690    None
691}
692
693/// AGL §2.2: resolve a glyph name to one or more Unicode code points. Handles
694/// everything `glyph_name_to_unicode` does, plus underscore-joined ligature
695/// names (e.g. `f_i` → "fi", `f_f_i` → "ffi") which glyph-list-based fonts
696/// use for Latin ligatures. Returns the full string, so ligatures surface as
697/// their component code points rather than being dropped when the composite
698/// name isn't in the AGL.
699///
700/// Strips the variant suffix (`.ss01`, `.swash`, …) from the base name before
701/// splitting, matching AGL §2.4. Bails out if any component fails to resolve
702/// so we never produce a partial string (which would be worse than None).
703pub(crate) fn glyph_name_to_string(name: &str) -> Option<String> {
704    // Try the single-code-point fast path first — it also handles `.suffix`
705    // stripping, `uniXXXX`, and the `aNN` decimal convention.
706    if let Some(c) = glyph_name_to_unicode(name) {
707        return Some(c.to_string());
708    }
709
710    let base = name.split_once('.').map(|(b, _)| b).unwrap_or(name);
711    if !base.contains('_') {
712        return None;
713    }
714
715    let mut out = String::new();
716    for part in base.split('_') {
717        if part.is_empty() {
718            return None;
719        }
720        let c = glyph_name_to_unicode(part)?;
721        out.push(c);
722    }
723    (!out.is_empty()).then_some(out)
724}
725
/// Decode a glyph name that follows the `uniXXXX` / `uXXXX` convention.
///
/// The text after the prefix must consist of hexadecimal digits only and is
/// read as one code point. A name starting with `uni` is only ever decoded
/// with that prefix; the shorter `u` prefix is not retried for it.
///
/// Returns `None` when the name has neither prefix, when nothing or anything
/// other than hex digits follows the prefix, or when the value is not a valid
/// Unicode scalar value (too large, or a surrogate).
pub(crate) fn unicode_from_name(name: &str) -> Option<char> {
    let digits = name
        .strip_prefix("uni")
        .or_else(|| name.strip_prefix('u'))?;

    // `from_str_radix` on its own would also accept a leading `+` sign, which
    // is not part of the naming convention, so the digits are checked first.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    u32::from_str_radix(digits, 16)
        .ok()
        .and_then(char::from_u32)
}
737
738pub(crate) fn read_to_unicode(
739    dict: &Dict<'_>,
740    cmap_resolver: &CMapResolverFn,
741    warning_sink: &WarningSinkFn,
742) -> Option<CMap> {
743    dict.get::<Stream<'_>>(TO_UNICODE)
744        .and_then(|s| decode_or_warn(&s, warning_sink))
745        // See PDFJS-11915, where `Identity-H` is used for `ToUnicode`. I don't
746        // believe it's valid, but at least mupdf seems to be able to deal with it.
747        .or_else(|| {
748            dict.get::<Name>(TO_UNICODE)
749                .and_then(|name| (cmap_resolver)(CMapName::from_bytes(name.as_ref())))
750                .map(|d| d.to_vec())
751        })
752        .and_then(|data| {
753            let cmap_resolver = cmap_resolver.clone();
754            CMap::parse(&data, move |name| (cmap_resolver)(name))
755        })
756}
757
758/// Build a 256-entry Unicode map from the font's `/Encoding` entry.
759/// Used as a fallback when no `/ToUnicode` CMap is present.
760pub(crate) fn synthesize_unicode_map_from_encoding(dict: &Dict<'_>) -> Option<[Option<char>; 256]> {
761    let (base_encoding, differences) = true_type::read_encoding(dict);
762    // Only synthesize if there is something to map (encoding name or differences).
763    // For BuiltIn with no differences we cannot reliably guess the mapping.
764    if matches!(base_encoding, Encoding::BuiltIn) && differences.is_empty() {
765        return None;
766    }
767    let mut table: [Option<char>; 256] = [None; 256];
768    for code in 0u8..=255 {
769        let glyph_name = differences
770            .get(&code)
771            .map(String::as_str)
772            .or_else(|| base_encoding.map_code(code));
773        if let Some(name) = glyph_name {
774            table[code as usize] = glyph_name_to_unicode(name);
775        }
776    }
777    Some(table)
778}
779
// Some fonts only carry a glyph under the "normalized" name, so a handful of
// aliases are folded onto it: the non-breaking space names become "space" and
// the soft hyphen names become "hyphen". Every other name is returned as is.
// These aliases come from the Adobe Glyph List (AGL) and are safe synonyms:
// they are consulted only when the literal lookup already failed, so adding
// new entries broadens recovery without overriding correct mappings.
pub(crate) fn normalized_glyph_name(name: &str) -> &str {
    match name {
        "nbspace" | "nonbreakingspace" => "space",
        "sfthyphen" | "softhyphen" => "hyphen",
        unchanged => unchanged,
    }
}
795
#[cfg(test)]
mod normalized_glyph_name_tests {
    use super::normalized_glyph_name;

    /// Assert that every listed name normalizes to `expected`.
    fn assert_all_map_to(names: &[&str], expected: &str) {
        for name in names {
            assert_eq!(normalized_glyph_name(name), expected);
        }
    }

    #[test]
    fn maps_space_aliases() {
        assert_all_map_to(&["nbspace", "nonbreakingspace"], "space");
    }

    #[test]
    fn maps_hyphen_aliases() {
        assert_all_map_to(&["sfthyphen", "softhyphen"], "hyphen");
    }

    #[test]
    fn preserves_unrelated_names() {
        // Names that are not aliases must come back unchanged.
        for name in ["A", ".notdef", "hyphen", "space"] {
            assert_eq!(normalized_glyph_name(name), name);
        }
    }
}
820
#[cfg(test)]
mod glyph_name_to_unicode_tests {
    use super::{glyph_name_to_string, glyph_name_to_unicode};

    /// Check `glyph_name_to_unicode` against a table of (name, expected) pairs.
    fn check_chars(cases: &[(&str, Option<char>)]) {
        for &(name, expected) in cases {
            assert_eq!(glyph_name_to_unicode(name), expected);
        }
    }

    /// Check `glyph_name_to_string` against a table of (name, expected) pairs.
    fn check_strings(cases: &[(&str, &str)]) {
        for &(name, expected) in cases {
            assert_eq!(glyph_name_to_string(name), Some(expected.to_string()));
        }
    }

    #[test]
    fn standard_agl_name() {
        check_chars(&[
            ("A", Some('A')),
            ("space", Some(' ')),
            ("hyphen", Some('-')),
        ]);
    }

    #[test]
    fn ligature_name_underscore_joined() {
        // AGL maps `fi` directly (U+FB01); the underscore form is what
        // CFF fonts without a ToUnicode map tend to emit for the same
        // ligature. We want the decomposed string so readers see "fi".
        check_strings(&[("f_i", "fi"), ("f_f_i", "ffi"), ("A_B_C", "ABC")]);
    }

    #[test]
    fn ligature_name_with_suffix() {
        // Variant suffix must be stripped before splitting on `_`.
        check_strings(&[("f_i.alt", "fi")]);
    }

    #[test]
    fn ligature_name_falls_back_to_single_char_path() {
        // Direct AGL hit should still win and yield the precomposed char.
        check_strings(&[("fi", "\u{FB01}"), ("A", "A")]);
    }

    #[test]
    fn ligature_name_rejects_unresolvable_component() {
        // If any segment is unknown we return None rather than a partial
        // string — a partial would be more confusing than a drop.
        for name in ["A_totallyUnknownGlyph", "_", "A__B"] {
            assert!(glyph_name_to_string(name).is_none());
        }
    }

    #[test]
    fn uni_prefix() {
        check_chars(&[("uni0041", Some('A')), ("uni00E9", Some('é'))]);
    }

    #[test]
    fn u_prefix() {
        check_chars(&[("u0041", Some('A')), ("u2022", Some('•'))]);
    }

    #[test]
    fn variant_suffix_stripped() {
        check_chars(&[
            ("A.swash", Some('A')),
            ("comma.alt", Some(',')),
            ("space.narrow", Some(' ')),
        ]);
    }

    #[test]
    fn variant_suffix_with_uni_prefix() {
        check_chars(&[("uni0041.ss01", Some('A'))]);
    }

    #[test]
    fn a_decimal_glyph_name() {
        check_chars(&[
            ("a65", Some('A')),
            ("a32", Some(' ')),
            ("a97", Some('a')),
        ]);
    }

    #[test]
    fn a_decimal_rejects_control_chars() {
        check_chars(&[("a0", None), ("a7", None)]);
    }

    #[test]
    fn unknown_name_returns_none() {
        check_chars(&[("xyzzynonexistent", None)]);
    }
}
906
#[cfg(test)]
mod fallback_font_query_tests {
    use super::*;

    /// Build a query the way `FallbackFontQuery::new` derives one from a
    /// PostScript name, descriptor flags and a font weight: flag bits set the
    /// style fields, a weight of 700 or more means bold, and keywords in the
    /// lower-cased name can additionally switch on italic and bold.
    fn query_with(name: &str, flags: u32, weight: u32) -> FallbackFontQuery {
        let font_flags = FontFlags::from_bits_truncate(flags);
        let lower = name.to_ascii_lowercase();
        let name_mentions = |keywords: &[&str]| keywords.iter().any(|kw| lower.contains(*kw));

        let italic_by_name = name_mentions(&["italic", "oblique", "slant"]);
        let bold_by_name = name_mentions(&["bold", "demi", "semibold", "heavy", "black"]);

        FallbackFontQuery {
            post_script_name: Some(name.to_string()),
            font_weight: weight,
            is_fixed_pitch: font_flags.contains(FontFlags::FIXED_PITCH),
            is_serif: font_flags.contains(FontFlags::SERIF),
            is_italic: font_flags.contains(FontFlags::ITALIC) || italic_by_name,
            is_bold: weight >= 700 || bold_by_name,
            is_small_cap: font_flags.contains(FontFlags::SMALL_CAP),
            ..Default::default()
        }
    }

    #[test]
    fn fixed_pitch_flag_selects_courier() {
        let query = query_with("LetterGothic", FontFlags::FIXED_PITCH.bits(), 400);
        let picked = query.pick_standard_font();
        assert!(matches!(picked, StandardFont::Courier));
    }

    #[test]
    fn demi_in_name_selects_bold() {
        let query = query_with("FranklinGothic-Demi", FontFlags::SERIF.bits(), 400);
        assert!(query.is_bold);
        let picked = query.pick_standard_font();
        assert!(matches!(picked, StandardFont::TimesBold));
    }

    #[test]
    fn oblique_detected_as_italic() {
        let query = query_with("HelveticaNeue-LightOblique", 0, 400);
        assert!(query.is_italic);
        let picked = query.pick_standard_font();
        assert!(matches!(picked, StandardFont::HelveticaOblique));
    }

    #[test]
    fn font_weight_700_detected_as_bold() {
        let query = query_with("CustomFont", 0, 700);
        assert!(query.is_bold);
        let picked = query.pick_standard_font();
        assert!(matches!(picked, StandardFont::HelveticaBold));
    }

    #[test]
    fn semibold_detected_as_bold() {
        let query = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
        assert!(query.is_bold);
        let picked = query.pick_standard_font();
        assert!(matches!(picked, StandardFont::TimesBold));
    }
}