pdf-interpret 0.5.0

A crate for interpreting PDF files.
Documentation
use crate::font::UNITS_PER_EM;
use crate::font::outline::OutlinePath;
use kurbo::{Affine, BezPath};
use pdf_font::{Matrix, cff, type1};
use skrifa::instance::{LocationRef, Size};
use skrifa::metrics::GlyphMetrics;
use skrifa::outline::{DrawSettings, Engine, HintingInstance, HintingOptions, Target};
use skrifa::raw::TableProvider;
use skrifa::{FontRef, GlyphId, MetadataProvider, OutlineGlyphCollection};
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::{Arc, OnceLock};
use yoke::{Yoke, Yokeable};

/// Shared, thread-safe handle to anything that can be viewed as the raw bytes of a font.
type FontData = Arc<dyn AsRef<[u8]> + Send + Sync>;
/// An [`OTFYoke`] stored together with the [`FontData`] it borrows from.
type OpenTypeFontYoke = Yoke<OTFYoke<'static>, FontData>;
/// A [`CFFYoke`] stored together with the [`FontData`] it borrows from.
type CffFontYoke = Yoke<CFFYoke<'static>, FontData>;

/// A font blob for type 1 fonts.
///
/// Wraps the parsed [`type1::Table`] in an `Arc`, so clones share the same table.
#[derive(Clone)]
pub(crate) struct Type1FontBlob(Arc<type1::Table>);

impl Debug for Type1FontBlob {
    /// Writes a fixed placeholder; the contents of the table are not printed.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("Type1 Font {{ .. }}"))
    }
}

impl Type1FontBlob {
    /// Parses `data` as a Type 1 font.
    ///
    /// Returns `None` when the bytes cannot be parsed.
    pub(crate) fn new(data: FontData) -> Option<Self> {
        type1::Table::parse(data.as_ref().as_ref()).map(|table| Self(Arc::new(table)))
    }

    /// Returns the parsed Type 1 table.
    pub(crate) fn table(&self) -> &type1::Table {
        &self.0
    }

    /// Builds the outline of the glyph called `name`.
    ///
    /// The outline is transformed by the font matrix and then scaled by
    /// `UNITS_PER_EM`. The outcome of the outlining call is not inspected:
    /// whatever was written into the path is transformed and returned.
    pub(crate) fn outline_glyph(&self, name: &str) -> BezPath {
        let table = self.table();
        let mut outline = OutlinePath::new();

        // Best effort: the result is intentionally discarded.
        table.outline(name, &mut outline).unwrap_or_default();

        let transform = Affine::scale(UNITS_PER_EM as f64) * convert_matrix(table.matrix());

        transform * outline.take()
    }
}

/// A font blob for CFF-based fonts.
///
/// Wraps a yoke of the parsed CFF table and the font data it borrows from.
/// The yoke lives in an `Arc`, so clones share the same parsed table.
#[derive(Clone)]
pub(crate) struct CffFontBlob(Arc<CffFontYoke>);

impl Debug for CffFontBlob {
    /// Writes a fixed placeholder; the contents of the table are not printed.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Previously printed "Type1 Font", which made CFF blobs
        // indistinguishable from Type 1 blobs in debug output.
        write!(f, "CFF Font {{ .. }}")
    }
}

impl CffFontBlob {
    /// Parses `data` as a CFF font.
    ///
    /// Returns `None` when the bytes cannot be parsed.
    pub(crate) fn new(data: FontData) -> Option<Self> {
        // Validate up front so that parsing inside the yoke closure cannot fail.
        let _ = cff::Table::parse(data.as_ref().as_ref())?;

        let yoke = Yoke::<CFFYoke<'static>, FontData>::attach_to_cart(data.clone(), |data| {
            let table = cff::Table::parse(data.as_ref())
                .expect("CFF data was validated before yoking");
            CFFYoke { table }
        });

        Some(Self(Arc::new(yoke)))
    }

    /// Returns a new handle to the font data backing this blob.
    pub(crate) fn font_data(&self) -> FontData {
        self.0.backing_cart().clone()
    }

    /// Returns the parsed CFF table.
    pub(crate) fn table(&self) -> &cff::Table<'_> {
        &self.0.as_ref().get().table
    }

    /// Builds the outline of `glyph`.
    ///
    /// The outline is transformed by the glyph's matrix and then scaled by
    /// `UNITS_PER_EM`. An empty path is returned when the glyph id does not fit
    /// into 16 bits or when outlining fails.
    pub(crate) fn outline_glyph(&self, glyph: GlyphId) -> BezPath {
        // A plain `as u16` cast would wrap ids above `u16::MAX` around and
        // silently draw an unrelated glyph.
        let Ok(index) = u16::try_from(glyph.to_u32()) else {
            return BezPath::new();
        };
        let glyph_id = pdf_font::GlyphId(index);

        let mut path = OutlinePath::new();

        if self.table().outline(glyph_id, &mut path).is_err() {
            return BezPath::new();
        }

        let matrix = self.table().glyph_matrix(glyph_id);

        Affine::scale(UNITS_PER_EM as f64) * convert_matrix(matrix) * path.take()
    }
}

/// A font blob for OpenType fonts.
///
/// Both the parsed font and the lazily built reverse cmap live behind `Arc`s,
/// so clones share them.
#[derive(Clone)]
pub(crate) struct OpenTypeFontBlob {
    /// Parsed views into the font, yoked to the bytes they borrow from.
    font: Arc<OpenTypeFontYoke>,
    /// Face index the blob was opened with; needed to re-open the same face
    /// when the reverse cmap is built.
    index: u32,
    /// Glyph id to Unicode map, built on first use by `glyph_id_to_unicode`.
    cmap_inverse: Arc<OnceLock<HashMap<u32, char>>>,
}

impl Debug for OpenTypeFontBlob {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "OpenType Font {{ .. }}")
    }
}

impl OpenTypeFontBlob {
    /// Opens face `index` of the OpenType font stored in `data`.
    ///
    /// Returns `None` when the face cannot be parsed or when its `post` table
    /// reports a version other than 1.0, 2.0, 2.5 or 3.0.
    pub(crate) fn new(data: FontData, index: u32) -> Option<Self> {
        // Check first whether the font is valid so we can unwrap in the closure.
        let f = FontRef::from_index(data.as_ref().as_ref(), index).ok()?;
        // Reject fonts with invalid post table version, fixes pdf.js issue 9462. Not sure if there
        // is a better fix, for some reason skrifa accepts the font which is completely invalid.
        let invalid = f.post().is_ok_and(|p| {
            !matches!(
                p.version().to_major_minor(),
                (1, 0) | (2, 0) | (2, 5) | (3, 0)
            )
        });

        if invalid {
            return None;
        }

        let font_ref_yoke =
            Yoke::<OTFYoke<'static>, FontData>::attach_to_cart(data.clone(), |data| {
                // Cannot fail: the same bytes and index were validated above.
                let font_ref = FontRef::from_index(data.as_ref(), index)
                    .expect("font was validated before yoking");

                // Build the collection once and reuse it for the hinting
                // decision, the hinting instance and the stored field.
                let outline_glyphs = font_ref.outline_glyphs();

                let hinting_instance = if outline_glyphs.require_interpreter() {
                    HintingInstance::new(
                        &outline_glyphs,
                        Size::new(UNITS_PER_EM),
                        LocationRef::default(),
                        HintingOptions {
                            engine: Engine::Interpreter,
                            target: Target::Mono,
                        },
                    )
                    .ok()
                } else {
                    None
                };

                let glyph_metrics =
                    font_ref.glyph_metrics(Size::new(UNITS_PER_EM), LocationRef::default());

                OTFYoke {
                    font_ref,
                    outline_glyphs,
                    hinting_instance,
                    glyph_metrics,
                }
            });

        Some(Self {
            font: Arc::new(font_ref_yoke),
            index,
            cmap_inverse: Arc::new(OnceLock::new()),
        })
    }

    /// Returns a new handle to the font data backing this blob.
    pub(crate) fn font_data(&self) -> FontData {
        self.font.backing_cart().clone()
    }

    /// Returns the parsed font.
    pub(crate) fn font_ref(&self) -> &FontRef<'_> {
        &self.font.as_ref().get().font_ref
    }

    /// Returns the glyph metrics, computed at size `UNITS_PER_EM` with the
    /// default location.
    pub(crate) fn glyph_metrics(&self) -> &GlyphMetrics<'_> {
        &self.font.as_ref().get().glyph_metrics
    }

    /// Returns the font's outline glyph collection.
    fn outline_glyphs(&self) -> &OutlineGlyphCollection<'_> {
        &self.font.as_ref().get().outline_glyphs
    }

    /// Resolve a glyph id back to Unicode via the embedded font's cmap table.
    ///
    /// The reverse map is built once, on first use, from every Unicode cmap
    /// subtable of the face this blob was opened with. Subtables are merged in
    /// ascending priority, so a mapping from a higher-priority subtable replaces
    /// one from a lower-priority subtable for the same glyph. Glyph 0 is never
    /// mapped.
    ///
    /// Returns `None` when no Unicode subtable maps a code point to `glyph`.
    pub(crate) fn glyph_id_to_unicode(&self, glyph: GlyphId) -> Option<char> {
        let inverse = self.cmap_inverse.get_or_init(|| {
            let data = self.font_data();
            // Use the same face index as `new`; always reading face 0 would
            // yield the wrong cmap for other faces of a font collection.
            build_truetype_cmap_inverse(data.as_ref().as_ref(), self.index)
        });

        inverse.get(&glyph.to_u32()).copied()
    }

    /// Builds the outline of `glyph`.
    ///
    /// Returns an empty path when the font has no outline for `glyph`. Errors
    /// reported while drawing are ignored; whatever was drawn is returned.
    pub(crate) fn outline_glyph(&self, glyph: GlyphId) -> BezPath {
        let mut path = OutlinePath::new();

        let draw_settings = if let Some(instance) = self.font.get().hinting_instance.as_ref() {
            // Note: We always hint at the font size `UNITS_PER_EM`, which obviously isn't very useful. We don't do this
            // for better text quality (right now), but instead because there are some PDFs with obscure fonts that
            // actually render wrongly if hinting is disabled!
            DrawSettings::hinted(instance, false)
        } else {
            DrawSettings::unhinted(Size::new(UNITS_PER_EM), LocationRef::default())
        };

        let Some(outline) = self.outline_glyphs().get(glyph) else {
            return BezPath::new();
        };

        let _ = outline.draw(draw_settings, &mut path);
        path.take()
    }

    /// Returns the glyph count from the `maxp` table, or 0 if that table
    /// cannot be read.
    pub(crate) fn num_glyphs(&self) -> u16 {
        self.font_ref().maxp().map_or(0, |m| m.num_glyphs())
    }
}

/// Builds a glyph id to Unicode map from the cmap of face `index` in `font_data`.
///
/// Returns an empty map when the face cannot be parsed or has no cmap table.
/// When several code points map to the same glyph, the mapping inserted last
/// wins.
fn build_truetype_cmap_inverse(font_data: &[u8], index: u32) -> HashMap<u32, char> {
    let Ok(face) = ttf_parser::Face::parse(font_data, index) else {
        return HashMap::new();
    };
    let Some(cmap_table) = face.tables().cmap else {
        return HashMap::new();
    };

    // Rank Unicode subtables: prefer full-range (format 12 / UCS-4) over BMP
    // (format 4) so fonts with supplementary-plane glyphs are fully covered.
    // We merge ALL unicode subtables so that BMP mappings from (3,1) are not
    // lost when a sparse (3,10) format-12 table is also present.
    fn rank(subtable: &ttf_parser::cmap::Subtable<'_>) -> u32 {
        match (subtable.platform_id, subtable.encoding_id) {
            (ttf_parser::PlatformId::Windows, 10) => 100,
            (ttf_parser::PlatformId::Unicode, 6) => 95,
            (ttf_parser::PlatformId::Unicode, 4) => 90,
            (ttf_parser::PlatformId::Windows, 1) => 80,
            (ttf_parser::PlatformId::Unicode, 3) => 70,
            (ttf_parser::PlatformId::Unicode, _) => 60,
            _ => 0,
        }
    }

    // Collect and sort ascending by rank so higher-priority subtables are
    // processed last and their mappings overwrite lower-priority ones.
    // The sort must stay stable so equally ranked subtables keep file order.
    let mut ranked: Vec<_> = cmap_table
        .subtables
        .into_iter()
        .filter(|s| s.is_unicode())
        .collect();
    ranked.sort_by_key(|subtable| rank(subtable));

    let mut inverse = HashMap::new();

    for subtable in &ranked {
        subtable.codepoints(|cp| {
            // Skip values that are not valid `char`s, code points without a
            // glyph, and glyph 0.
            if let Some(ch) = char::from_u32(cp)
                && let Some(glyph_id) = subtable.glyph_index(cp)
                && glyph_id.0 != 0
            {
                inverse.insert(u32::from(glyph_id.0), ch);
            }
        });
    }

    inverse
}

/// Converts a `pdf_font` matrix into a `kurbo` affine transform.
///
/// The coefficients are passed to `Affine::new` in the order
/// `[sx, ky, kx, sy, tx, ty]`, each cast to `f64`.
fn convert_matrix(matrix: Matrix) -> Affine {
    let (a, b) = (matrix.sx as f64, matrix.ky as f64);
    let (c, d) = (matrix.kx as f64, matrix.sy as f64);
    let (e, f) = (matrix.tx as f64, matrix.ty as f64);

    Affine::new([a, b, c, d, e, f])
}

/// Borrowed views into an OpenType font, meant to be stored in a `Yoke` next to
/// the font data they borrow from.
#[derive(Yokeable, Clone)]
struct OTFYoke<'a> {
    /// The parsed font.
    font_ref: FontRef<'a>,
    /// Glyph metrics of the font.
    glyph_metrics: GlyphMetrics<'a>,
    /// Hinting instance used when drawing outlines; `None` means outlines are
    /// drawn unhinted.
    hinting_instance: Option<HintingInstance>,
    /// The font's outline glyph collection.
    outline_glyphs: OutlineGlyphCollection<'a>,
}

/// Borrowed view of a parsed CFF table, meant to be stored in a `Yoke` next to
/// the font data it borrows from.
#[derive(Yokeable, Clone)]
struct CFFYoke<'a> {
    /// The parsed CFF table.
    table: cff::Table<'a>,
}