spectre_parse 1.0.0

Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
Documentation
//! Font encoding lookup (PDF spec §9.6.6).
//!
//! Covers the four predefined encodings (`StandardEncoding`,
//! `MacRomanEncoding`, `MacExpertEncoding`, `WinAnsiEncoding`),
//! `PDFDocEncoding`, and `Identity-H`/`Identity-V` paired with a
//! `/ToUnicode` CMap. Unknown encodings fall back to Latin-1.
//! Glyph-name → Unicode tables for the differences-encoding case
//! (§9.6.6.4) are not implemented.

use crate::cmap::ToUnicodeCMap;
use crate::error::Result;
use crate::object::{Dictionary, Object};
use crate::Document;
use std::collections::BTreeMap;

/// How the bytes of a shown string are turned into Unicode text for one
/// font. Built by `resolve_font_encoding` and consumed through
/// [`Encoding::bytes_to_string`].
#[derive(Debug, Clone)]
pub enum Encoding {
    /// 1-byte → Unicode codepoint table; index `b` produces the scalar
    /// or `0` for "unmapped". Boxed so the 1 KiB table lives on the heap
    /// instead of inflating every `Encoding` value.
    OneByte(Box<[u32; 256]>),
    /// Pass-through as Latin-1: each input byte becomes the `char` with
    /// the same numeric value.
    Identity,
    /// 1- or 2-byte source codes mapped through a `/ToUnicode` CMap.
    /// Decoding is delegated entirely to the CMap.
    Unicode(ToUnicodeCMap),
}

impl Encoding {
    /// Decode a content-stream byte string into a `String`.
    ///
    /// * `OneByte` — every byte is looked up in the table. A slot that
    ///   holds `0` (unmapped) or a value that is not a valid Unicode
    ///   scalar falls back to the raw byte read as Latin-1, so exactly
    ///   one `char` is produced per input byte.
    /// * `Identity` — every byte is read as Latin-1.
    /// * `Unicode` — delegated to the CMap's `decode`.
    ///
    /// # Errors
    ///
    /// The `OneByte` and `Identity` arms always return `Ok`; the
    /// `Unicode` arm passes through whatever `ToUnicodeCMap::decode`
    /// returns.
    pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
        match self {
            Self::OneByte(map) => Ok(bytes
                .iter()
                .map(|&b| match map[usize::from(b)] {
                    // Unmapped slot — preserve the raw byte so
                    // downstream char-count parity still tracks.
                    0 => char::from(b),
                    // A surrogate or out-of-range entry must not drop
                    // the byte either; treat it like an unmapped slot.
                    cp => char::from_u32(cp).unwrap_or(char::from(b)),
                })
                .collect()),
            Self::Identity => Ok(bytes.iter().copied().map(char::from).collect()),
            Self::Unicode(cmap) => cmap.decode(bytes),
        }
    }
}

/// Build the per-page lookup from font resource name to decoder.
///
/// Each entry of `fonts` is resolved exactly once through
/// `resolve_font_encoding`, so a content-stream walk can fetch the
/// decoder by resource name without touching the font dictionary again.
/// The returned map has one entry per entry in `fonts`.
pub fn resolve_page_encodings(
    doc: &Document,
    fonts: &BTreeMap<Vec<u8>, Dictionary>,
) -> BTreeMap<Vec<u8>, Encoding> {
    fonts
        .iter()
        .map(|(resource_name, dict)| (resource_name.clone(), resolve_font_encoding(doc, dict)))
        .collect()
}

/// Pick the decoder for a single font dictionary.
///
/// Lookup order:
/// 1. `/ToUnicode`, when `try_to_unicode` can turn it into a CMap.
/// 2. `/Encoding` given directly as a name.
/// 3. `/Encoding` given as a reference to a dictionary that carries a
///    `/BaseEncoding` name.
/// 4. `/Encoding` given as an inline dictionary that carries a
///    `/BaseEncoding` name.
/// 5. WinAnsi as the final default.
///
/// `/Differences` arrays are not consulted.
fn resolve_font_encoding(doc: &Document, font: &Dictionary) -> Encoding {
    // A usable /ToUnicode CMap wins over everything else.
    let cmap = font
        .get_optional(b"ToUnicode")
        .and_then(|obj| try_to_unicode(doc, obj));
    if let Some(cmap) = cmap {
        return Encoding::Unicode(cmap);
    }

    if let Some(enc_obj) = font.get_optional(b"Encoding") {
        // Predefined encoding referenced directly by name.
        if let Ok(name) = enc_obj.as_name() {
            return predefined_by_name(name);
        }

        // Encoding dictionary stored as an indirect object.
        let indirect = enc_obj
            .as_reference()
            .ok()
            .and_then(|id| doc.get_dictionary(id).ok());
        if let Some(dict) = indirect {
            if let Some(Ok(base)) = dict.get_optional(b"BaseEncoding").map(|o| o.as_name()) {
                return predefined_by_name(base);
            }
        }

        // Encoding dictionary written inline in the font dictionary.
        if let Ok(dict) = enc_obj.as_dict() {
            if let Some(Ok(base)) = dict.get_optional(b"BaseEncoding").map(|o| o.as_name()) {
                return predefined_by_name(base);
            }
        }
    }

    // Spec §9.6.6.4 names StandardEncoding as the default for the base 14
    // fonts, but real-world PDFs that omit /Encoding are overwhelmingly
    // WinAnsi — a better default for char-count parity.
    Encoding::OneByte(Box::new(win_ansi_encoding()))
}

fn try_to_unicode(doc: &Document, to_uni: &Object) -> Option<ToUnicodeCMap> {
    let stream_obj = match to_uni {
        Object::Reference(id) => doc.get_object(*id).ok()?,
        Object::Stream(_) => to_uni.clone(),
        _ => return None,
    };
    let stream = stream_obj.as_stream().ok()?;
    Some(crate::cmap::ToUnicodeCMap::parse(&stream.content).unwrap_or_default())
}

fn predefined_by_name(name: &[u8]) -> Encoding {
    match name {
        b"StandardEncoding" => Encoding::OneByte(Box::new(standard_encoding())),
        b"WinAnsiEncoding" => Encoding::OneByte(Box::new(win_ansi_encoding())),
        b"MacRomanEncoding" => Encoding::OneByte(Box::new(mac_roman_encoding())),
        b"MacExpertEncoding" => Encoding::OneByte(Box::new(mac_expert_encoding())),
        b"PDFDocEncoding" => Encoding::OneByte(Box::new(pdf_doc_encoding())),
        b"Identity-H" | b"Identity-V" => Encoding::Identity,
        _ => Encoding::Identity,
    }
}

// ── Predefined encoding tables (PDF spec Appendix D) ──────────────────────
//
// Each table is a 256-entry array of Unicode scalar values. Entry
// `[b]` gives the Unicode codepoint for byte `b`; `0` means "unmapped"
// and the caller falls back to the byte itself.

/// Build the `StandardEncoding` table (spec Annex D, Table D.2).
///
/// Starts from the ASCII identity table and overrides every slot where
/// StandardEncoding differs from it. That includes two slots inside the
/// ASCII range: `0x27` is `quoteright` and `0x60` is `quoteleft` (the
/// straight `quotesingle` and `grave` live at `0xA9` and `0xC1`).
/// Slots that are not listed keep whatever `base_ascii_table` put there.
fn standard_encoding() -> [u32; 256] {
    // (character code, Unicode scalar) for every slot that differs from
    // the ASCII identity base.
    const OVERRIDES: &[(u8, u32)] = &[
        (0x27, 0x2019), // quoteright
        (0x60, 0x2018), // quoteleft
        (0xA1, 0x00A1), // exclamdown
        (0xA2, 0x00A2), // cent
        (0xA3, 0x00A3), // sterling
        (0xA4, 0x2044), // fraction
        (0xA5, 0x00A5), // yen
        (0xA6, 0x0192), // florin
        (0xA7, 0x00A7), // section
        (0xA8, 0x00A4), // currency
        (0xA9, 0x0027), // quotesingle
        (0xAA, 0x201C), // quotedblleft
        (0xAB, 0x00AB), // guillemotleft
        (0xAC, 0x2039), // guilsinglleft
        (0xAD, 0x203A), // guilsinglright
        (0xAE, 0xFB01), // fi
        (0xAF, 0xFB02), // fl
        (0xB1, 0x2013), // endash
        (0xB2, 0x2020), // dagger
        (0xB3, 0x2021), // daggerdbl
        (0xB4, 0x00B7), // periodcentered
        (0xB6, 0x00B6), // paragraph
        (0xB7, 0x2022), // bullet
        (0xB8, 0x201A), // quotesinglbase
        (0xB9, 0x201E), // quotedblbase
        (0xBA, 0x201D), // quotedblright
        (0xBB, 0x00BB), // guillemotright
        (0xBC, 0x2026), // ellipsis
        (0xBD, 0x2030), // perthousand
        (0xBF, 0x00BF), // questiondown
        (0xC1, 0x0060), // grave
        (0xC2, 0x00B4), // acute
        (0xC3, 0x02C6), // circumflex
        (0xC4, 0x02DC), // tilde
        (0xC5, 0x00AF), // macron
        (0xC6, 0x02D8), // breve
        (0xC7, 0x02D9), // dotaccent
        (0xC8, 0x00A8), // dieresis
        (0xCA, 0x02DA), // ring
        (0xCB, 0x00B8), // cedilla
        (0xCD, 0x02DD), // hungarumlaut
        (0xCE, 0x02DB), // ogonek
        (0xCF, 0x02C7), // caron
        (0xD0, 0x2014), // emdash
        (0xE1, 0x00C6), // AE
        (0xE3, 0x00AA), // ordfeminine
        (0xE8, 0x0141), // Lslash
        (0xE9, 0x00D8), // Oslash
        (0xEA, 0x0152), // OE
        (0xEB, 0x00BA), // ordmasculine
        (0xF1, 0x00E6), // ae
        (0xF5, 0x0131), // dotlessi
        (0xF8, 0x0142), // lslash
        (0xF9, 0x00F8), // oslash
        (0xFA, 0x0153), // oe
        (0xFB, 0x00DF), // germandbls
    ];

    let mut t = base_ascii_table();
    for &(code, scalar) in OVERRIDES {
        t[usize::from(code)] = scalar;
    }
    t
}

/// Build the `WinAnsiEncoding` table, i.e. Windows code page 1252.
///
/// `0x00..=0x7F` come from the ASCII identity base, `0x80..=0x9F` carry
/// the CP1252-specific punctuation and letters (five of those slots are
/// unassigned and stay `0`), and `0xA0..=0xFF` map to the scalar with
/// the same value.
fn win_ansi_encoding() -> [u32; 256] {
    // CP1252 assignments for 0x80..=0x9F; `0` marks an unassigned code.
    const C1_BLOCK: [u32; 32] = [
        0x20AC, 0, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0, 0x017D, 0, // 0x88
        0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0, 0x017E, 0x0178, // 0x98
    ];

    let mut table = base_ascii_table();
    table[0x80..0xA0].copy_from_slice(&C1_BLOCK);
    // From 0xA0 upward CP1252 coincides with Latin-1.
    for (slot, code) in table.iter_mut().zip(0u32..).skip(0xA0) {
        *slot = code;
    }
    table
}

/// Build the `MacRomanEncoding` table (spec Annex D, Table D.2).
///
/// `0x00..=0x7F` come from the ASCII identity base. `0x80..=0xFF` follow
/// the MacRoman layout, which shares almost no positions with Latin-1
/// (for example `0x8E` is "é" and `0xA5` is the bullet). Codes that the
/// PDF table leaves undefined are stored as `0`, so the decoder falls
/// back to the raw byte for them.
fn mac_roman_encoding() -> [u32; 256] {
    // MacRomanEncoding assignments for 0x80..=0xFF, eight codes per row.
    // `0` marks a code that Table D.2 does not define for MacRoman.
    const UPPER: [u32; 128] = [
        0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, // 0x80
        0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x88
        0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, // 0x90
        0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0x98
        0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, // 0xA0
        0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0, 0x00C6, 0x00D8, // 0xA8
        0, 0x00B1, 0, 0, 0x00A5, 0x00B5, 0, 0, // 0xB0
        0, 0, 0, 0x00AA, 0x00BA, 0, 0x00E6, 0x00F8, // 0xB8
        0x00BF, 0x00A1, 0x00AC, 0, 0x0192, 0, 0, 0x00AB, // 0xC0
        0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xC8
        0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0, // 0xD0
        0x00FF, 0x0178, 0x2044, 0x00A4, 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xD8
        0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, // 0xE0
        0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xE8
        0, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, // 0xF0
        0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, // 0xF8
    ];

    let mut t = base_ascii_table();
    t[0x80..].copy_from_slice(&UPPER);
    t
}

/// Build the stand-in table used for `MacExpertEncoding`.
///
/// The real expert glyph set is not modelled: every byte maps to the
/// scalar with the same numeric value.
fn mac_expert_encoding() -> [u32; 256] {
    // MacExpert is rare, so the upper half is a plain pass-through on
    // top of the ASCII identity base.
    let mut table = base_ascii_table();
    for (slot, code) in table.iter_mut().zip(0u32..).skip(0x80) {
        *slot = code;
    }
    table
}

/// Build the `PDFDocEncoding` table (spec Annex D, Table D.3).
///
/// PDFDocEncoding agrees with Latin-1 on `0x20..=0x7E` and
/// `0xA1..=0xFF`, but it is *not* WinAnsi in between: `0x18..=0x1F`
/// hold spacing accents and `0x80..=0xA0` hold their own punctuation
/// and letter assignments (for example `0x80` is the bullet and `0xA0`
/// is the Euro sign). Codes the table leaves undefined (`0x7F`, `0x9F`,
/// `0xAD`) are stored as `0`, so the decoder falls back to the raw
/// byte. Control codes below `0x18` are kept as identity.
fn pdf_doc_encoding() -> [u32; 256] {
    // 0x18..=0x1F: breve, caron, circumflex, dotaccent, hungarumlaut,
    // ogonek, ring, tilde.
    const ACCENTS: [u32; 8] = [
        0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC,
    ];
    // 0x80..=0xA0, eight codes per row; `0` marks the undefined 0x9F.
    const HIGH: [u32; 33] = [
        0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 0x80
        0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, // 0x88
        0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, // 0x90
        0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0, // 0x98
        0x20AC, // 0xA0 Euro
    ];

    // Start from a full Latin-1 identity, then patch the ranges where
    // PDFDocEncoding departs from it.
    let mut t = [0u32; 256];
    for (slot, code) in t.iter_mut().zip(0u32..) {
        *slot = code;
    }
    t[0x18..0x20].copy_from_slice(&ACCENTS);
    t[0x80..=0xA0].copy_from_slice(&HIGH);
    t[0x7F] = 0;
    t[0xAD] = 0;
    t
}

/// Starting point shared by the per-encoding builders.
///
/// Slots `0x00..=0x7F` hold their own index (ASCII pass-through); slots
/// `0x80..=0xFF` are `0`, ready for each encoding to override.
fn base_ascii_table() -> [u32; 256] {
    let mut table = [0u32; 256];
    for (slot, code) in table.iter_mut().zip(0u32..0x80) {
        *slot = code;
    }
    table
}