Skip to main content

pdf_interpret/font/
mod.rs

1//! Interacting with the different kinds of PDF fonts.
2
3use crate::cache::Cache;
4use crate::context::Context;
5use crate::device::Device;
6use crate::font::cid::Type0Font;
7use crate::font::generated::{
8    glyph_names, mac_expert, mac_os_roman, mac_roman, standard, win_ansi,
9};
10use crate::font::true_type::TrueTypeFont;
11use crate::font::type1::Type1Font;
12use crate::font::type3::Type3;
13use crate::interpret::state::State;
14use crate::util::decode_or_warn;
15use crate::{CMapResolverFn, CacheKey, FontResolverFn, InterpreterSettings, Paint, WarningSinkFn};
16use bitflags::bitflags;
17use kurbo::{Affine, BezPath, Vec2};
18use log::warn;
19use outline::OutlineFont;
20use pdf_syntax::object::Name;
21use pdf_syntax::object::dict::keys::SUBTYPE;
22use pdf_syntax::object::dict::keys::*;
23use pdf_syntax::object::{Dict, Stream};
24use pdf_syntax::page::Resources;
25use pdf_syntax::xref::XRef;
26use skrifa::GlyphId;
27use std::fmt::Debug;
28use std::ops::Deref;
29use std::rc::Rc;
30use std::sync::Arc;
31
32mod blob;
33mod cid;
34mod generated;
35mod glyph_simulator;
36pub(crate) mod outline;
37mod standard_font;
38mod true_type;
39mod type1;
40pub(crate) mod type3;
41
/// Units per em used by this module: glyph outlines are documented as
/// "assuming an upem value of 1000", and `Font::get_glyph` scales glyph space
/// by the reciprocal of this value.
pub(crate) const UNITS_PER_EM: f32 = 1000.0;
43
44pub(crate) fn stretch_glyph(path: BezPath, expected_width: f32, actual_width: f32) -> BezPath {
45    if actual_width != 0.0 && actual_width != expected_width {
46        let stretch_factor = expected_width / actual_width;
47        Affine::scale_non_uniform(stretch_factor as f64, 1.0) * path
48    } else {
49        path
50    }
51}
52
/// A shared, thread-safe container for raw bytes (anything that is
/// `AsRef<[u8]> + Send + Sync`, behind an `Arc`).
// NOTE(review): this doc previously read "the bytes of a PDF file"; judging by
// the alias name it presumably holds font file data instead — confirm.
pub type FontData = Arc<dyn AsRef<[u8]> + Send + Sync>;
55
/// Strip the 6-character subset prefix from a PostScript font name.
///
/// PDF subset fonts use names like "ABCDEF+TimesNewRoman", where the tag in
/// front of the `+` consists of exactly six uppercase ASCII letters. This
/// function returns `TimesNewRoman` from such a name, or the original name if
/// no valid prefix is found.
///
/// Only the text before the *first* `+` is considered. Names that merely
/// happen to have six arbitrary characters before a `+` (for example
/// `Myriad+Pro`) are not subset names and are returned unchanged.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        // Checking the bytes is sufficient: every accepted byte is ASCII, so a
        // byte length of 6 is also a character count of 6.
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => rest,
        _ => name,
    }
}
67
68use crate::util::hash128;
69pub use outline::OutlineFontData;
70use pdf_font::cmap::{BfString, CMap, CMapName, CharacterCollection};
71pub use standard_font::StandardFont;
72
/// A glyph that can be drawn.
pub enum Glyph<'a> {
    /// A glyph defined by an outline.
    Outline(OutlineGlyph),
    /// A type3 glyph, defined by PDF drawing instructions.
    // Boxed, so this variant holds a pointer rather than the whole
    // `Type3Glyph` inline.
    Type3(Box<Type3Glyph<'a>>),
}
80
81impl Glyph<'_> {
82    /// Returns the Unicode code point for this glyph, if available.
83    ///
84    /// This method attempts to determine the Unicode character that this glyph
85    /// represents. The exact fallback chain depends on the font type:
86    ///
87    /// **For Outline Fonts (Type1, TrueType, CFF):**
88    /// 1. `ToUnicode` cmap
89    /// 2. Glyph name → Unicode (via Adobe Glyph List)
90    /// 3. Unicode naming conventions (e.g., "uni0041", "u0041")
91    ///
92    /// **For CID Fonts (Type0):**
93    /// 1. `ToUnicode` cmap
94    ///
95    ///
96    /// **For Type3 Fonts:**
97    /// 1. `ToUnicode` cmap
98    ///
99    /// Returns `None` if the Unicode value could not be determined.
100    ///
101    /// Please note that this method is still somewhat experimental and might
102    /// not work reliably in all cases.
103    pub fn as_unicode(&self) -> Option<BfString> {
104        match self {
105            Glyph::Outline(g) => g.as_unicode(),
106            Glyph::Type3(g) => g.as_unicode(),
107        }
108    }
109}
110
/// An identifier that uniquely identifies a glyph, for caching purposes.
///
/// It consists of the glyph ID and the outline font the glyph belongs to.
#[derive(Clone, Debug)]
pub struct GlyphIdentifier {
    // The glyph ID within `font`.
    id: GlyphId,
    // The font the glyph belongs to.
    font: OutlineFont,
}
117
118impl CacheKey for GlyphIdentifier {
119    fn cache_key(&self) -> u128 {
120        hash128(&(self.id, self.font.cache_key()))
121    }
122}
123
/// A glyph defined by an outline.
#[derive(Clone, Debug)]
pub struct OutlineGlyph {
    /// The glyph ID within `font`.
    pub(crate) id: GlyphId,
    /// The outline font this glyph belongs to.
    pub(crate) font: OutlineFont,
    /// The character code this glyph was created for; used for the outline,
    /// advance-width and Unicode lookups.
    pub(crate) char_code: u32,
}
131
impl OutlineGlyph {
    /// Return the outline of the glyph, assuming an upem value of 1000.
    ///
    /// The font is queried with both the glyph ID and the character code.
    pub fn outline(&self) -> BezPath {
        self.font.outline_glyph(self.id, self.char_code)
    }

    /// Return the identifier of the glyph. You can use this to calculate the cache key
    /// for the glyph.
    ///
    /// Only the glyph ID and (a clone of) the font are stored in the identifier;
    /// the character code and any transform applied when drawing the glyph are
    /// not part of it.
    pub fn identifier(&self) -> GlyphIdentifier {
        GlyphIdentifier {
            id: self.id,
            font: self.font.clone(),
        }
    }

    /// Returns the Unicode code point for this glyph, if available.
    ///
    /// The lookup is keyed by the character code, not by the glyph ID.
    ///
    /// See [`Glyph::as_unicode`] for details on the fallback chain used.
    pub fn as_unicode(&self) -> Option<BfString> {
        self.font.char_code_to_unicode(self.char_code)
    }

    /// Get raw font bytes and metadata for downstream use.
    ///
    /// Returns `None` for Type1 fonts.
    pub fn font_data(&self) -> Option<OutlineFontData> {
        self.font.font_data()
    }

    /// PostScript name of the underlying font, if determinable.
    ///
    /// Works even when [`font_data`](Self::font_data) returns
    /// `None` — e.g. non-embedded standard-14 Type1 fonts still
    /// expose their canonical PostScript name through this
    /// accessor. Returns `None` when no name source is available.
    ///
    /// Surfaced to support the WASM `getTextPositions()` `fontName`
    /// field for the editor contract.
    pub fn postscript_name(&self) -> Option<String> {
        self.font.postscript_name()
    }

    /// Get the glyph ID within the font.
    pub fn glyph_id(&self) -> GlyphId {
        self.id
    }

    /// Get the advance width for this glyph.
    ///
    /// The advance width is how far to move horizontally after drawing
    /// this glyph before drawing the next one. The font is queried by
    /// character code; `None` is passed through when the font reports no width.
    pub fn advance_width(&self) -> Option<f32> {
        self.font.glyph_advance_width(self.char_code)
    }

    /// Vertical font metrics in /1000 em.
    ///
    /// Returns `(ascent, descent, cap_height, x_height)` from the font's
    /// embedded OS/2 tables (via skrifa) when available, or from the
    /// Standard-14 AFM fallback table for non-embedded standard fonts.
    /// Returns `None` only when no metrics source is reachable (e.g.
    /// CFF fonts without a standard-font fallback, Type3).
    pub fn font_metrics(&self) -> Option<(f64, f64, Option<f64>, Option<f64>)> {
        self.font.font_metrics()
    }

    /// Get the cache key for this glyph's font.
    ///
    /// This identifies the font uniquely, even when `font_data()` returns `None`
    /// (e.g., for Type1 fonts). Useful for grouping glyphs by font.
    pub fn font_cache_key(&self) -> u128 {
        self.font.cache_key()
    }
}
209
/// A type3 glyph.
///
/// Besides the font and glyph ID it carries the pieces of interpreter context
/// that `Font::get_glyph` captured when the glyph was created.
#[derive(Clone)]
pub struct Type3Glyph<'a> {
    /// The type3 font this glyph belongs to.
    pub(crate) font: Rc<Type3<'a>>,
    /// The glyph ID within the font.
    pub(crate) glyph_id: GlyphId,
    /// A clone of the interpreter state taken when the glyph was created.
    pub(crate) state: State<'a>,
    /// A clone of the resources that were passed in when the glyph was created.
    pub(crate) parent_resources: Resources<'a>,
    /// A clone of the context's object cache.
    pub(crate) cache: Cache,
    /// The cross-reference table taken from the context.
    pub(crate) xref: &'a XRef,
    /// A clone of the interpreter settings.
    pub(crate) settings: InterpreterSettings,
    /// The character code this glyph was created for.
    pub(crate) char_code: u32,
}
222
223/// A glyph defined by PDF drawing instructions.
224impl<'a> Type3Glyph<'a> {
225    /// Draw the type3 glyph to the given device.
226    pub fn interpret(
227        &self,
228        device: &mut impl Device<'a>,
229        transform: Affine,
230        glyph_transform: Affine,
231        paint: &Paint<'a>,
232    ) {
233        self.font
234            .render_glyph(self, transform, glyph_transform, paint, device);
235    }
236
237    /// Returns the Unicode code point for this glyph, if available.
238    ///
239    /// Note: Type3 fonts can only provide Unicode via `ToUnicode` cmap.
240    pub fn as_unicode(&self) -> Option<BfString> {
241        self.font.char_code_to_unicode(self.char_code)
242    }
243}
244
245impl CacheKey for Type3Glyph<'_> {
246    fn cache_key(&self) -> u128 {
247        hash128(&(self.font.cache_key(), self.glyph_id))
248    }
249}
250
/// A PDF font: a cache key (field `0`) paired with the concrete font
/// implementation (field `1`).
#[derive(Clone, Debug)]
pub(crate) struct Font<'a>(u128, FontType<'a>);
253
254impl<'a> Font<'a> {
255    pub(crate) fn new(
256        dict: &Dict<'a>,
257        font_resolver: &FontResolverFn,
258        cmap_resolver: &CMapResolverFn,
259        warning_sink: &WarningSinkFn,
260    ) -> Option<Self> {
261        let f_type = match dict.get::<Name>(SUBTYPE)?.deref() {
262            TYPE1 | MM_TYPE1 => FontType::Type1(Rc::new(Type1Font::new(
263                dict,
264                font_resolver,
265                cmap_resolver,
266                warning_sink,
267            )?)),
268            // PDFBOX-5463: PDF viewers seem to accept OpenType as well.
269            TRUE_TYPE | OPEN_TYPE => FontType::TrueType(Rc::new(TrueTypeFont::new(
270                dict,
271                font_resolver,
272                cmap_resolver,
273                warning_sink,
274            )?)),
275            TYPE0 => FontType::Type0(Rc::new(Type0Font::new(
276                dict,
277                font_resolver,
278                cmap_resolver,
279                warning_sink,
280            )?)),
281            TYPE3 => FontType::Type3(Rc::new(Type3::new(dict, cmap_resolver, warning_sink)?)),
282            f => {
283                warn!(
284                    "unimplemented font type {:?}",
285                    std::str::from_utf8(f).unwrap_or("unknown type")
286                );
287
288                return None;
289            }
290        };
291
292        let cache_key = dict.cache_key();
293
294        Some(Self(cache_key, f_type))
295    }
296
297    pub(crate) fn new_standard(
298        standard_font: StandardFont,
299        font_resolver: &FontResolverFn,
300    ) -> Option<Self> {
301        let font = Type1Font::new_standard(standard_font, font_resolver)?;
302
303        Some(Self(0, FontType::Type1(Rc::new(font))))
304    }
305
306    pub(crate) fn map_code(&self, code: u32) -> GlyphId {
307        match &self.1 {
308            FontType::Type1(f) => {
309                debug_assert!(code <= u8::MAX as u32);
310
311                f.map_code(code as u8)
312            }
313            FontType::TrueType(t) => {
314                debug_assert!(code <= u8::MAX as u32);
315
316                t.map_code(code as u8)
317            }
318            FontType::Type0(t) => t.map_code(code),
319            FontType::Type3(t) => {
320                debug_assert!(code <= u8::MAX as u32);
321
322                t.map_code(code as u8)
323            }
324        }
325    }
326
327    pub(crate) fn get_glyph(
328        &self,
329        glyph: GlyphId,
330        char_code: u32,
331        ctx: &mut Context<'a>,
332        resources: &Resources<'a>,
333        origin_displacement: Vec2,
334    ) -> (Glyph<'a>, Affine) {
335        let glyph_transform = ctx.get().text_state.full_transform()
336            * Affine::scale(1.0 / UNITS_PER_EM as f64)
337            * Affine::translate(origin_displacement);
338
339        let glyph = match &self.1 {
340            FontType::Type1(t) => {
341                let font = OutlineFont::Type1(t.clone());
342                Glyph::Outline(OutlineGlyph {
343                    id: glyph,
344                    font,
345                    char_code,
346                })
347            }
348            FontType::TrueType(t) => {
349                let font = OutlineFont::TrueType(t.clone());
350                Glyph::Outline(OutlineGlyph {
351                    id: glyph,
352                    font,
353                    char_code,
354                })
355            }
356            FontType::Type0(t) => {
357                let font = OutlineFont::Type0(t.clone());
358                Glyph::Outline(OutlineGlyph {
359                    id: glyph,
360                    font,
361                    char_code,
362                })
363            }
364            FontType::Type3(t) => {
365                let shape_glyph = Type3Glyph {
366                    font: t.clone(),
367                    glyph_id: glyph,
368                    state: ctx.get().clone(),
369                    parent_resources: resources.clone(),
370                    cache: ctx.object_cache.clone(),
371                    xref: ctx.xref,
372                    settings: ctx.settings.clone(),
373                    char_code,
374                };
375
376                Glyph::Type3(Box::new(shape_glyph))
377            }
378        };
379
380        (glyph, glyph_transform)
381    }
382
383    pub(crate) fn code_advance(&self, code: u32) -> Vec2 {
384        match &self.1 {
385            FontType::Type1(t) => {
386                debug_assert!(code <= u8::MAX as u32);
387
388                Vec2::new(t.glyph_width(code as u8).unwrap_or(0.0) as f64, 0.0)
389            }
390            FontType::TrueType(t) => {
391                debug_assert!(code <= u8::MAX as u32);
392
393                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
394            }
395            FontType::Type0(t) => t.code_advance(code),
396            FontType::Type3(t) => {
397                debug_assert!(code <= u8::MAX as u32);
398
399                Vec2::new(t.glyph_width(code as u8) as f64, 0.0)
400            }
401        }
402    }
403
404    pub(crate) fn origin_displacement(&self, code: u32) -> Vec2 {
405        match &self.1 {
406            FontType::Type1(_) => Vec2::default(),
407            FontType::TrueType(_) => Vec2::default(),
408            FontType::Type0(t) => t.origin_displacement(code),
409            FontType::Type3(_) => Vec2::default(),
410        }
411    }
412
413    pub(crate) fn read_code(&self, bytes: &[u8], offset: usize) -> (u32, usize) {
414        match &self.1 {
415            FontType::Type1(_) => (bytes[offset] as u32, 1),
416            FontType::TrueType(_) => (bytes[offset] as u32, 1),
417            FontType::Type0(t) => t.read_code(bytes, offset),
418            FontType::Type3(_) => (bytes[offset] as u32, 1),
419        }
420    }
421
422    pub(crate) fn is_horizontal(&self) -> bool {
423        match &self.1 {
424            FontType::Type1(_) => true,
425            FontType::TrueType(_) => true,
426            FontType::Type0(t) => t.is_horizontal(),
427            FontType::Type3(_) => true,
428        }
429    }
430}
431
432impl CacheKey for Font<'_> {
433    fn cache_key(&self) -> u128 {
434        self.0
435    }
436}
437
/// The concrete implementation behind a `Font`, one variant per supported
/// font kind. Each implementation is reference-counted, so cloning is cheap.
#[derive(Clone, Debug)]
enum FontType<'a> {
    // Built for the `TYPE1` and `MM_TYPE1` subtypes, and for standard fonts.
    Type1(Rc<Type1Font>),
    // Built for the `TRUE_TYPE` and `OPEN_TYPE` subtypes.
    TrueType(Rc<TrueTypeFont>),
    // Built for the `TYPE0` subtype.
    Type0(Rc<Type0Font>),
    // Built for the `TYPE3` subtype.
    Type3(Rc<Type3<'a>>),
}
445
/// A base encoding that maps single-byte character codes to glyph names.
#[derive(Debug)]
enum Encoding {
    // Looked up in the generated `standard` table.
    Standard,
    // Looked up in the generated `mac_roman` table, then `mac_os_roman`.
    MacRoman,
    // Looked up in the generated `win_ansi` table.
    WinAnsi,
    // Looked up in the generated `mac_expert` table.
    MacExpert,
    // No table is available: only code 0 maps to a name.
    BuiltIn,
}
454
455impl Encoding {
456    fn map_code(&self, code: u8) -> Option<&'static str> {
457        if code == 0 {
458            return Some(".notdef");
459        }
460        match self {
461            Self::Standard => standard::get(code),
462            Self::MacRoman => mac_roman::get(code).or_else(|| mac_os_roman::get(code)),
463            Self::WinAnsi => win_ansi::get(code),
464            Self::MacExpert => mac_expert::get(code),
465            Self::BuiltIn => None,
466        }
467    }
468}
469
/// The font stretch.
#[derive(Debug, Copy, Clone)]
pub enum FontStretch {
    /// Normal.
    Normal,
    /// Ultra condensed.
    UltraCondensed,
    /// Extra condensed.
    ExtraCondensed,
    /// Condensed.
    Condensed,
    /// Semi condensed.
    SemiCondensed,
    /// Semi expanded.
    SemiExpanded,
    /// Expanded.
    Expanded,
    /// Extra expanded.
    ExtraExpanded,
    /// Ultra expanded.
    UltraExpanded,
}

impl FontStretch {
    /// Parse a stretch value from its name.
    ///
    /// The comparison is exact (case-sensitive). Every string that is not one
    /// of the eight non-normal names, including `"Normal"` itself and the
    /// empty string, yields [`FontStretch::Normal`].
    fn from_string(s: &str) -> Self {
        const NAMED: [(&str, FontStretch); 8] = [
            ("UltraCondensed", FontStretch::UltraCondensed),
            ("ExtraCondensed", FontStretch::ExtraCondensed),
            ("Condensed", FontStretch::Condensed),
            ("SemiCondensed", FontStretch::SemiCondensed),
            ("SemiExpanded", FontStretch::SemiExpanded),
            ("Expanded", FontStretch::Expanded),
            ("ExtraExpanded", FontStretch::ExtraExpanded),
            ("UltraExpanded", FontStretch::UltraExpanded),
        ];

        NAMED
            .iter()
            .find(|(name, _)| *name == s)
            .map_or(Self::Normal, |&(_, stretch)| stretch)
    }
}
508
bitflags! {
    /// Bitflags describing various characteristics of fonts.
    // These are decoded from the `FLAGS` entry of a font descriptor (see
    // `FallbackFontQuery::new`). Bit positions below are zero-based, and the
    // positions that are skipped (4, 7..=15) have no named flag here.
    #[derive(Debug)]
    pub(crate) struct FontFlags: u32 {
        const FIXED_PITCH = 1 << 0;
        const SERIF = 1 << 1;
        const SYMBOLIC = 1 << 2;
        const SCRIPT = 1 << 3;
        const NON_SYMBOLIC = 1 << 5;
        const ITALIC = 1 << 6;
        const ALL_CAP = 1 << 16;
        const SMALL_CAP = 1 << 17;
        const FORCE_BOLD = 1 << 18;
    }
}
524
/// A query for a font.
pub enum FontQuery {
    /// A query for one of the 14 PDF standard fonts.
    Standard(StandardFont),
    /// A query for a font that is not embedded in the PDF file.
    ///
    /// The payload describes the properties of the wanted font; see
    /// [`FallbackFontQuery`].
    ///
    /// Note that this type of query is currently not supported,
    /// but will be implemented in the future.
    Fallback(FallbackFontQuery),
}
535
/// A query for a font with specific properties.
///
/// The [`Default`] value has no names, normal stretch, a weight of 400 and all
/// boolean properties set to `false`.
#[derive(Debug, Clone)]
pub struct FallbackFontQuery {
    /// The postscript name of the font.
    pub post_script_name: Option<String>,
    /// The name of the font.
    pub font_name: Option<String>,
    /// The family of the font.
    pub font_family: Option<String>,
    /// The stretch of the font.
    pub font_stretch: FontStretch,
    /// The weight of the font.
    pub font_weight: u32,
    /// Whether the font is monospaced.
    pub is_fixed_pitch: bool,
    /// Whether the font is serif.
    pub is_serif: bool,
    /// Whether the font is italic.
    pub is_italic: bool,
    /// Whether the font is bold.
    pub is_bold: bool,
    /// Whether the font is small cap.
    pub is_small_cap: bool,
    /// The character collection (registry/ordering) if this is a CID font.
    pub character_collection: Option<CharacterCollection>,
}
562
563impl FallbackFontQuery {
564    pub(crate) fn new(dict: &Dict<'_>) -> Self {
565        let post_script_name = dict
566            .get::<Name>(BASE_FONT)
567            .map(|n| strip_subset_prefix(n.as_str()).to_string());
568
569        let mut data = Self {
570            post_script_name,
571            ..Default::default()
572        };
573
574        if let Some(descriptor) = dict.get::<Dict<'_>>(FONT_DESC) {
575            data.font_name = dict
576                .get::<Name>(FONT_NAME)
577                .map(|n| strip_subset_prefix(n.as_str()).to_string());
578            data.font_family = descriptor
579                .get::<Name>(FONT_FAMILY)
580                .map(|n| n.as_str().to_string());
581            data.font_stretch = descriptor
582                .get::<Name>(FONT_STRETCH)
583                .map(|n| FontStretch::from_string(n.as_str()))
584                .unwrap_or(FontStretch::Normal);
585            data.font_weight = descriptor.get::<u32>(FONT_WEIGHT).unwrap_or(400);
586
587            if let Some(flags) = descriptor
588                .get::<u32>(FLAGS)
589                .map(FontFlags::from_bits_truncate)
590            {
591                data.is_fixed_pitch = flags.contains(FontFlags::FIXED_PITCH);
592                data.is_serif = flags.contains(FontFlags::SERIF);
593                data.is_italic = flags.contains(FontFlags::ITALIC);
594                data.is_small_cap = flags.contains(FontFlags::SMALL_CAP);
595            }
596        }
597
598        data.is_bold |= data.font_weight >= 700;
599
600        if let Some(name) = &data.post_script_name {
601            let lower = name.to_ascii_lowercase();
602            data.is_italic |=
603                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
604            data.is_bold |= lower.contains("bold")
605                || lower.contains("demi")
606                || lower.contains("semibold")
607                || lower.contains("heavy")
608                || lower.contains("black");
609        }
610
611        data
612    }
613
614    /// Do a best-effort fallback to the 14 standard fonts based on the query.
615    pub fn pick_standard_font(&self) -> StandardFont {
616        if self.is_fixed_pitch {
617            match (self.is_bold, self.is_italic) {
618                (true, true) => StandardFont::CourierBoldOblique,
619                (true, false) => StandardFont::CourierBold,
620                (false, true) => StandardFont::CourierOblique,
621                (false, false) => StandardFont::Courier,
622            }
623        } else if !self.is_serif {
624            match (self.is_bold, self.is_italic) {
625                (true, true) => StandardFont::HelveticaBoldOblique,
626                (true, false) => StandardFont::HelveticaBold,
627                (false, true) => StandardFont::HelveticaOblique,
628                (false, false) => StandardFont::Helvetica,
629            }
630        } else {
631            match (self.is_bold, self.is_italic) {
632                (true, true) => StandardFont::TimesBoldItalic,
633                (true, false) => StandardFont::TimesBold,
634                (false, true) => StandardFont::TimesItalic,
635                (false, false) => StandardFont::TimesRoman,
636            }
637        }
638    }
639}
640
impl Default for FallbackFontQuery {
    /// A query with no names, normal stretch, regular weight (400), no
    /// character collection and every boolean property switched off.
    fn default() -> Self {
        Self {
            post_script_name: None,
            font_name: None,
            font_family: None,
            font_stretch: FontStretch::Normal,
            font_weight: 400,
            is_fixed_pitch: false,
            is_serif: false,
            is_italic: false,
            is_bold: false,
            is_small_cap: false,
            character_collection: None,
        }
    }
}
658
659/// Convert a glyph name to a Unicode character, if possible.
660/// Implements the Adobe Glyph List Specification with extended fallbacks
661/// for non-standard names found in real-world PDFs.
662/// <https://github.com/adobe-type-tools/agl-specification>
663pub(crate) fn glyph_name_to_unicode(name: &str) -> Option<char> {
664    // 1. Direct AGL lookup
665    if let Some(unicode_str) = glyph_names::get(name) {
666        return unicode_str.chars().next();
667    }
668
669    // 2. uni/u prefix convention
670    if let Some(c) = unicode_from_name(name) {
671        return Some(c);
672    }
673
674    // 3. AGL §2.4: strip variant suffix after period (e.g., "A.swash" → "A",
675    //    "comma.alt" → "comma"). Only the base name before the first period
676    //    maps to Unicode.
677    if let Some(dot_pos) = name.find('.') {
678        let base = &name[..dot_pos];
679        if !base.is_empty() {
680            if let Some(c) = glyph_names::get(base).and_then(|s| s.chars().next()) {
681                return Some(c);
682            }
683            if let Some(c) = unicode_from_name(base) {
684                return Some(c);
685            }
686        }
687    }
688
689    // 4. Handle "aXX" decimal glyph names (e.g., "a65" → 'A') used by some
690    //    TeX/LaTeX generated PDFs and custom encoding vectors.
691    if name.starts_with('a')
692        && name.len() >= 2
693        && let Ok(code) = name[1..].parse::<u32>()
694        && let Some(c) = char::from_u32(code)
695        && (!c.is_control() || c == ' ')
696    {
697        return Some(c);
698    }
699
700    warn!("failed to map glyph name {} to unicode", name);
701    None
702}
703
704/// AGL §2.2: resolve a glyph name to one or more Unicode code points. Handles
705/// everything `glyph_name_to_unicode` does, plus underscore-joined ligature
706/// names (e.g. `f_i` → "fi", `f_f_i` → "ffi") which glyph-list-based fonts
707/// use for Latin ligatures. Returns the full string, so ligatures surface as
708/// their component code points rather than being dropped when the composite
709/// name isn't in the AGL.
710///
711/// Strips the variant suffix (`.ss01`, `.swash`, …) from the base name before
712/// splitting, matching AGL §2.4. Bails out if any component fails to resolve
713/// so we never produce a partial string (which would be worse than None).
714pub(crate) fn glyph_name_to_string(name: &str) -> Option<String> {
715    // Try the single-code-point fast path first — it also handles `.suffix`
716    // stripping, `uniXXXX`, and the `aNN` decimal convention.
717    if let Some(c) = glyph_name_to_unicode(name) {
718        return Some(c.to_string());
719    }
720
721    let base = name.split_once('.').map(|(b, _)| b).unwrap_or(name);
722    if !base.contains('_') {
723        return None;
724    }
725
726    let mut out = String::new();
727    for part in base.split('_') {
728        if part.is_empty() {
729            return None;
730        }
731        let c = glyph_name_to_unicode(part)?;
732        out.push(c);
733    }
734    (!out.is_empty()).then_some(out)
735}
736
/// Decode a glyph name that follows the `uniXXXX` / `uXXXX` convention.
///
/// The name must consist of the prefix `uni` or `u` followed by four to six
/// hexadecimal digits (upper or lower case) that form a valid Unicode scalar
/// value. Anything else yields `None`, in particular:
///
/// * names with fewer than four digits, so short ordinary names such as `ua`
///   are not mistaken for (control) characters,
/// * names containing a sign, which a plain integer parse would accept,
/// * surrogate code points and values above U+10FFFF.
///
/// A name starting with `uni` is only ever interpreted using that prefix.
pub(crate) fn unicode_from_name(name: &str) -> Option<char> {
    let digits = name
        .strip_prefix("uni")
        .or_else(|| name.strip_prefix('u'))?;

    // `from_str_radix` alone is too permissive: it accepts a leading `+` and
    // any number of digits. All accepted bytes are ASCII, so the byte length
    // is the digit count.
    let well_formed =
        (4..=6).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_hexdigit());
    if !well_formed {
        return None;
    }

    u32::from_str_radix(digits, 16)
        .ok()
        .and_then(char::from_u32)
}
748
749pub(crate) fn read_to_unicode(
750    dict: &Dict<'_>,
751    cmap_resolver: &CMapResolverFn,
752    warning_sink: &WarningSinkFn,
753) -> Option<CMap> {
754    dict.get::<Stream<'_>>(TO_UNICODE)
755        .and_then(|s| decode_or_warn(&s, warning_sink))
756        // See PDFJS-11915, where `Identity-H` is used for `ToUnicode`. I don't
757        // believe it's valid, but at least mupdf seems to be able to deal with it.
758        .or_else(|| {
759            dict.get::<Name>(TO_UNICODE)
760                .and_then(|name| (cmap_resolver)(CMapName::from_bytes(name.as_ref())))
761                .map(|d| d.to_vec())
762        })
763        .and_then(|data| {
764            let cmap_resolver = cmap_resolver.clone();
765            CMap::parse(&data, move |name| (cmap_resolver)(name))
766        })
767}
768
769/// Build a 256-entry Unicode map from the font's `/Encoding` entry.
770/// Used as a fallback when no `/ToUnicode` CMap is present.
771pub(crate) fn synthesize_unicode_map_from_encoding(dict: &Dict<'_>) -> Option<[Option<char>; 256]> {
772    let (base_encoding, differences) = true_type::read_encoding(dict);
773    // Only synthesize if there is something to map (encoding name or differences).
774    // For BuiltIn with no differences we cannot reliably guess the mapping.
775    if matches!(base_encoding, Encoding::BuiltIn) && differences.is_empty() {
776        return None;
777    }
778    let mut table: [Option<char>; 256] = [None; 256];
779    for code in 0u8..=255 {
780        let glyph_name = differences
781            .get(&code)
782            .map(String::as_str)
783            .or_else(|| base_encoding.map_code(code));
784        if let Some(name) = glyph_name {
785            table[code as usize] = glyph_name_to_unicode(name);
786        }
787    }
788    Some(table)
789}
790
// Some fonts only contain a glyph under the "normalized" name, so a handful of
// aliases are folded onto it here: the non-breaking space names become
// "space" and the soft hyphen names become "hyphen". Every other name is
// returned as it is. The aliases come from the Adobe Glyph List (AGL) and are
// safe synonyms: they are consulted only when the literal lookup already
// failed, so adding new entries broadens recovery without overriding correct
// mappings.
pub(crate) fn normalized_glyph_name(name: &str) -> &str {
    match name {
        "nbspace" | "nonbreakingspace" => "space",
        "sfthyphen" | "softhyphen" => "hyphen",
        other => other,
    }
}
806
// Unit tests for `normalized_glyph_name`.
#[cfg(test)]
mod normalized_glyph_name_tests {
    use super::normalized_glyph_name;

    // Both non-breaking space aliases fold onto "space".
    #[test]
    fn maps_space_aliases() {
        assert_eq!(normalized_glyph_name("nbspace"), "space");
        assert_eq!(normalized_glyph_name("nonbreakingspace"), "space");
    }

    // Both soft hyphen aliases fold onto "hyphen".
    #[test]
    fn maps_hyphen_aliases() {
        assert_eq!(normalized_glyph_name("sfthyphen"), "hyphen");
        assert_eq!(normalized_glyph_name("softhyphen"), "hyphen");
    }

    // Names that are not aliases, including the normalized names themselves,
    // pass through unchanged.
    #[test]
    fn preserves_unrelated_names() {
        assert_eq!(normalized_glyph_name("A"), "A");
        assert_eq!(normalized_glyph_name(".notdef"), ".notdef");
        assert_eq!(normalized_glyph_name("hyphen"), "hyphen");
        assert_eq!(normalized_glyph_name("space"), "space");
    }
}
831
// Unit tests for `glyph_name_to_unicode` and `glyph_name_to_string`.
#[cfg(test)]
mod glyph_name_to_unicode_tests {
    use super::{glyph_name_to_string, glyph_name_to_unicode};

    // Plain names that are found directly in the glyph list.
    #[test]
    fn standard_agl_name() {
        assert_eq!(glyph_name_to_unicode("A"), Some('A'));
        assert_eq!(glyph_name_to_unicode("space"), Some(' '));
        assert_eq!(glyph_name_to_unicode("hyphen"), Some('-'));
    }

    #[test]
    fn ligature_name_underscore_joined() {
        // AGL maps `fi` directly (U+FB01); the underscore form is what
        // CFF fonts without a ToUnicode map tend to emit for the same
        // ligature. We want the decomposed string so readers see "fi".
        assert_eq!(glyph_name_to_string("f_i"), Some("fi".to_string()));
        assert_eq!(glyph_name_to_string("f_f_i"), Some("ffi".to_string()));
        assert_eq!(glyph_name_to_string("A_B_C"), Some("ABC".to_string()));
    }

    #[test]
    fn ligature_name_with_suffix() {
        // Variant suffix must be stripped before splitting on `_`.
        assert_eq!(glyph_name_to_string("f_i.alt"), Some("fi".to_string()));
    }

    #[test]
    fn ligature_name_falls_back_to_single_char_path() {
        // Direct AGL hit should still win and yield the precomposed char.
        assert_eq!(glyph_name_to_string("fi"), Some("\u{FB01}".to_string()));
        assert_eq!(glyph_name_to_string("A"), Some("A".to_string()));
    }

    #[test]
    fn ligature_name_rejects_unresolvable_component() {
        // If any segment is unknown we return None rather than a partial
        // string — a partial would be more confusing than a drop.
        assert!(glyph_name_to_string("A_totallyUnknownGlyph").is_none());
        // Empty components (lone or doubled underscores) are rejected as well.
        assert!(glyph_name_to_string("_").is_none());
        assert!(glyph_name_to_string("A__B").is_none());
    }

    // `uni` followed by hexadecimal digits.
    #[test]
    fn uni_prefix() {
        assert_eq!(glyph_name_to_unicode("uni0041"), Some('A'));
        assert_eq!(glyph_name_to_unicode("uni00E9"), Some('é'));
    }

    // `u` followed by hexadecimal digits.
    #[test]
    fn u_prefix() {
        assert_eq!(glyph_name_to_unicode("u0041"), Some('A'));
        assert_eq!(glyph_name_to_unicode("u2022"), Some('•'));
    }

    // Everything after the first period is ignored when the full name is unknown.
    #[test]
    fn variant_suffix_stripped() {
        assert_eq!(glyph_name_to_unicode("A.swash"), Some('A'));
        assert_eq!(glyph_name_to_unicode("comma.alt"), Some(','));
        assert_eq!(glyph_name_to_unicode("space.narrow"), Some(' '));
    }

    // Suffix stripping also applies before the `uni` convention is tried.
    #[test]
    fn variant_suffix_with_uni_prefix() {
        assert_eq!(glyph_name_to_unicode("uni0041.ss01"), Some('A'));
    }

    // `a` followed by a decimal code point.
    #[test]
    fn a_decimal_glyph_name() {
        assert_eq!(glyph_name_to_unicode("a65"), Some('A'));
        assert_eq!(glyph_name_to_unicode("a32"), Some(' '));
        assert_eq!(glyph_name_to_unicode("a97"), Some('a'));
    }

    // Decimal names that would decode to control characters are not accepted.
    #[test]
    fn a_decimal_rejects_control_chars() {
        assert_eq!(glyph_name_to_unicode("a0"), None);
        assert_eq!(glyph_name_to_unicode("a7"), None);
    }

    #[test]
    fn unknown_name_returns_none() {
        assert_eq!(glyph_name_to_unicode("xyzzynonexistent"), None);
    }
}
917
// Unit tests for the standard-font fallback selection of `FallbackFontQuery`.
#[cfg(test)]
mod fallback_font_query_tests {
    use super::*;

    // Build a query without going through a PDF dictionary. This repeats the
    // flag, weight and name heuristics of `FallbackFontQuery::new` by hand, so
    // the two need to be kept in sync.
    fn query_with(name: &str, flags: u32, weight: u32) -> FallbackFontQuery {
        let mut q = FallbackFontQuery {
            post_script_name: Some(name.to_string()),
            font_weight: weight,
            ..Default::default()
        };

        let font_flags = FontFlags::from_bits_truncate(flags);
        q.is_fixed_pitch = font_flags.contains(FontFlags::FIXED_PITCH);
        q.is_serif = font_flags.contains(FontFlags::SERIF);
        q.is_italic = font_flags.contains(FontFlags::ITALIC);
        q.is_small_cap = font_flags.contains(FontFlags::SMALL_CAP);

        q.is_bold |= q.font_weight >= 700;

        if let Some(name) = &q.post_script_name {
            let lower = name.to_ascii_lowercase();
            q.is_italic |=
                lower.contains("italic") || lower.contains("oblique") || lower.contains("slant");
            q.is_bold |= lower.contains("bold")
                || lower.contains("demi")
                || lower.contains("semibold")
                || lower.contains("heavy")
                || lower.contains("black");
        }
        q
    }

    // The fixed-pitch flag picks the Courier family.
    #[test]
    fn fixed_pitch_flag_selects_courier() {
        let q = query_with("LetterGothic", FontFlags::FIXED_PITCH.bits(), 400);
        assert!(matches!(q.pick_standard_font(), StandardFont::Courier));
    }

    // "demi" in the name counts as bold; the serif flag picks Times.
    #[test]
    fn demi_in_name_selects_bold() {
        let q = query_with("FranklinGothic-Demi", FontFlags::SERIF.bits(), 400);
        assert!(q.is_bold);
        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
    }

    // "oblique" in the name counts as italic; no flags means Helvetica.
    #[test]
    fn oblique_detected_as_italic() {
        let q = query_with("HelveticaNeue-LightOblique", 0, 400);
        assert!(q.is_italic);
        assert!(matches!(
            q.pick_standard_font(),
            StandardFont::HelveticaOblique
        ));
    }

    // A weight of exactly 700 is already bold.
    #[test]
    fn font_weight_700_detected_as_bold() {
        let q = query_with("CustomFont", 0, 700);
        assert!(q.is_bold);
        assert!(matches!(
            q.pick_standard_font(),
            StandardFont::HelveticaBold
        ));
    }

    // The name match is case-insensitive ("Semibold").
    #[test]
    fn semibold_detected_as_bold() {
        let q = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
        assert!(q.is_bold);
        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
    }
}
984    fn semibold_detected_as_bold() {
985        let q = query_with("AGaramond-Semibold", FontFlags::SERIF.bits(), 400);
986        assert!(q.is_bold);
987        assert!(matches!(q.pick_standard_font(), StandardFont::TimesBold));
988    }
989}