spectre_parse 1.0.0

Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
Documentation
//! ToUnicode CMap parser (PDF spec §9.10 + Adobe Technical Note 5411).
//!
//! A CMap stream is a PostScript program; we only need `bfchar` and
//! `bfrange` sections plus the `codespacerange` 1-byte/2-byte signal.
//! The parser scans for section markers and ignores everything else,
//! tolerating the extra whitespace and trailing garbage that real-world
//! CMaps tend to carry.

use crate::error::Result;
use indexmap::IndexMap;

/// Lookup table built from a ToUnicode CMap stream.
#[derive(Debug, Clone, Default)]
pub struct ToUnicodeCMap {
    /// Source code (1- or 2-byte, packed big-endian into a `u32`) → Unicode text.
    pub(crate) mappings: IndexMap<u32, String>,
    /// Whether source codes are read two bytes at a time (typical for CID fonts).
    pub(crate) is_two_byte: bool,
}

impl ToUnicodeCMap {
    /// Builds the mapping table from the raw bytes of a CMap stream.
    ///
    /// Every `bfchar` section is processed first, then every `bfrange`
    /// section, so a range entry overrides a char entry for the same code.
    /// The table is flagged two-byte when any source code exceeds `0xFF` or
    /// the first `codespacerange` section contains a bound of two or more
    /// bytes.
    ///
    /// This function currently always returns `Ok`.
    pub fn parse(data: &[u8]) -> Result<Self> {
        let mut cmap = Self::default();
        let mut highest_code = 0u32;

        // Pass 1: individual code → string entries.
        let mut scan_from = 0;
        loop {
            let Some((lo, hi)) = find_section(data, b"beginbfchar", b"endbfchar", scan_from)
            else {
                break;
            };
            parse_bfchar(&data[lo..hi], &mut cmap.mappings, &mut highest_code);
            scan_from = hi;
        }

        // Pass 2: contiguous code ranges.
        scan_from = 0;
        loop {
            let Some((lo, hi)) = find_section(data, b"beginbfrange", b"endbfrange", scan_from)
            else {
                break;
            };
            parse_bfrange(&data[lo..hi], &mut cmap.mappings, &mut highest_code);
            scan_from = hi;
        }

        // Source-code width: only consult the codespace range when the
        // codes seen so far all fit in one byte.
        let wide_codespace = || {
            match find_section(data, b"begincodespacerange", b"endcodespacerange", 0) {
                Some((lo, hi)) => codespace_is_two_byte(&data[lo..hi]),
                None => false,
            }
        };
        cmap.is_two_byte = highest_code > 0xFF || wide_codespace();

        Ok(cmap)
    }

    /// Decode `bytes` through the mapping table.
    ///
    /// In two-byte mode the input is consumed as big-endian pairs and a
    /// trailing odd byte is ignored; otherwise each byte is one code. Codes
    /// with no mapping contribute nothing to the output.
    ///
    /// This function currently always returns `Ok`.
    pub fn decode(&self, bytes: &[u8]) -> Result<String> {
        let mut text = String::with_capacity(bytes.len());
        if self.is_two_byte {
            for pair in bytes.chunks_exact(2) {
                let code = u32::from(u16::from_be_bytes([pair[0], pair[1]]));
                if let Some(mapped) = self.mappings.get(&code) {
                    text.push_str(mapped);
                }
            }
        } else {
            let hits = bytes
                .iter()
                .filter_map(|&byte| self.mappings.get(&u32::from(byte)));
            for mapped in hits {
                text.push_str(mapped);
            }
        }
        Ok(text)
    }
}

/// Locates the first `start_marker … end_marker` section at or after `from`.
///
/// Returns `(start, end)`, where `start` is the index just past
/// `start_marker` and `end` is the index at which `end_marker` begins, so
/// `data[start..end]` is the section body. Returns `None` when either marker
/// is missing; a `from` beyond the end of `data` simply finds nothing.
fn find_section(
    data: &[u8],
    start_marker: &[u8],
    end_marker: &[u8],
    from: usize,
) -> Option<(usize, usize)> {
    // Offset of the first occurrence of `needle` within `haystack`.
    let locate = |haystack: &[u8], needle: &[u8]| {
        haystack
            .windows(needle.len())
            .position(|window| window == needle)
    };

    let search_from = from.min(data.len());
    let marker_at = locate(&data[search_from..], start_marker)?;
    let body_start = from + marker_at + start_marker.len();
    let body_len = locate(&data[body_start..], end_marker)?;
    Some((body_start, body_start + body_len))
}

/// Parses the body of one `bfchar` section into `out`.
///
/// The body is read as consecutive `<src> <dst>` hex-string pairs; parsing
/// stops at the first incomplete pair. `max_code` is raised to the largest
/// source code seen. A later entry for the same code replaces the earlier one.
fn parse_bfchar(body: &[u8], out: &mut IndexMap<u32, String>, max_code: &mut u32) {
    let mut tokens = HexIter::new(body);
    loop {
        let (Some(src), Some(dst)) = (tokens.next_hex_string(), tokens.next_hex_string())
        else {
            break;
        };
        let code = src.to_code();
        *max_code = (*max_code).max(code);
        out.insert(code, dst.to_unicode_string());
    }
}

/// Parses the body of one `bfrange` section into `out`.
///
/// Each entry is `<first> <last>` followed by either a single `<dst>` hex
/// string or a `[<dst0> <dst1> …]` array:
///
/// * With a single destination, source code `first + i` maps to the
///   destination text with `i` added to its final code point. If the
///   incremented value is not a valid `char`, that final character is
///   omitted.
/// * With an array, source code `first + i` maps to the `i`-th array
///   element; surplus elements beyond `last` are ignored.
///
/// `max_code` is raised to the largest `first`/`last` seen, including for
/// entries that are subsequently rejected. Parsing stops at the first
/// incomplete entry.
fn parse_bfrange(body: &[u8], out: &mut IndexMap<u32, String>, max_code: &mut u32) {
    let mut iter = HexIter::new(body);
    while let Some(first) = iter.next_hex_string() {
        let Some(last) = iter.next_hex_string() else { break };
        let mut pos = iter.pos();
        let Some(third) = iter.next_third(body, &mut pos) else { break };
        let start = first.to_code();
        let end = last.to_code();
        *max_code = (*max_code).max(start).max(end);

        // Defensive: a malformed CMap with end < start would underflow
        // u32 in `end - start`, and an enormous range would spin for a
        // long time. Skip such entries.
        if end < start || end - start > 65_535 {
            continue;
        }

        match third {
            Third::Hex(h) => {
                let base_codes = h.to_unicode_codepoints();
                // Only the final code point varies across the range; the
                // leading ones are rendered once and reused for every code.
                let (prefix, last_cp) = match base_codes.split_last() {
                    Some((&last_cp, head)) => {
                        let prefix: String =
                            head.iter().filter_map(|&cp| char::from_u32(cp)).collect();
                        (prefix, Some(last_cp))
                    }
                    None => (String::new(), None),
                };
                for i in 0..=(end - start) {
                    let mut buf = prefix.clone();
                    // `cp` is at most 0x10FFFF and `i` at most 65_535, so
                    // the addition cannot overflow.
                    if let Some(c) = last_cp.and_then(|cp| char::from_u32(cp + i)) {
                        buf.push(c);
                    }
                    // `start + i <= end`, so this cannot overflow even when
                    // the range ends at `u32::MAX` (a running `code += 1`
                    // counter would).
                    out.insert(start + i, buf);
                }
            }
            Third::Array(items) => {
                // Pair each element with its source code. The inclusive
                // range stops cleanly at `end` instead of wrapping past
                // `u32::MAX` when the array has surplus elements.
                for (code, s) in (start..=end).zip(items) {
                    out.insert(code, s);
                }
            }
        }
    }
}

/// Third operand of a `bfrange` entry, as returned by `HexIter::next_third`.
enum Third {
    /// A single `<hex>` string, kept raw so the caller can derive one
    /// destination per source code in the range.
    Hex(HexString),
    /// A `[ … ]` array whose `<hex>` elements have each already been
    /// converted to a Unicode string.
    Array(Vec<String>),
}

/// Decoded bytes of one `<hex>` token.
#[derive(Debug, Clone)]
struct HexString(Vec<u8>);

impl HexString {
    /// Packs the bytes big-endian into a `u32` source code.
    ///
    /// With more than four bytes, only the last four survive; the earlier
    /// ones are shifted out.
    fn to_code(&self) -> u32 {
        self.0
            .iter()
            .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte))
    }

    /// Interprets the bytes as UTF-16BE text, silently dropping any code
    /// point that is not a valid `char` (e.g. an unpaired surrogate).
    fn to_unicode_string(&self) -> String {
        self.to_unicode_codepoints()
            .into_iter()
            .filter_map(char::from_u32)
            .collect()
    }

    /// Interprets the bytes as UTF-16BE and returns the code points.
    ///
    /// Byte pairs form 16-bit code units (a trailing odd byte is ignored).
    /// A high surrogate followed by a low surrogate is combined into one
    /// supplementary code point; an unpaired surrogate is passed through
    /// as its raw unit value.
    fn to_unicode_codepoints(&self) -> Vec<u32> {
        let units = self
            .0
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
        char::decode_utf16(units)
            .map(|decoded| match decoded {
                Ok(ch) => u32::from(ch),
                Err(lone) => u32::from(lone.unpaired_surrogate()),
            })
            .collect()
    }
}

/// Cursor over the `<hex>` strings inside a CMap section; whitespace and
/// any other bytes between tokens are skipped.
struct HexIter<'a> {
    buf: &'a [u8],
    pos: usize,
}

impl<'a> HexIter<'a> {
    /// Starts a cursor at the beginning of `buf`.
    fn new(buf: &'a [u8]) -> Self {
        Self { buf, pos: 0 }
    }

    /// Current byte offset into the buffer.
    fn pos(&self) -> usize {
        self.pos
    }

    /// Advances to the next `<…>` token and returns its decoded bytes.
    ///
    /// Everything before the opening `<` is skipped. A token missing its
    /// closing `>` extends to the end of the buffer. Returns `None`, with
    /// the cursor left at the end, when no `<` remains.
    fn next_hex_string(&mut self) -> Option<HexString> {
        let Some(skip) = self.buf[self.pos..].iter().position(|&b| b == b'<') else {
            self.pos = self.buf.len();
            return None;
        };
        let digits_start = self.pos + skip + 1;
        let digits_end = self.buf[digits_start..]
            .iter()
            .position(|&b| b == b'>')
            .map_or(self.buf.len(), |off| digits_start + off);
        // Step past the closing '>' when there is one.
        self.pos = (digits_end + 1).min(self.buf.len());
        Some(HexString(decode_hex(&self.buf[digits_start..digits_end])))
    }

    /// Reads the third operand of a `bfrange` entry: either one `<hex>`
    /// string or a `[ … ]` array of them.
    ///
    /// Leading ASCII whitespace is skipped; any other leading byte, or the
    /// end of the buffer, yields `None`. Inside an array, bytes that are
    /// not part of a `<hex>` token are ignored, and an array missing its
    /// closing `]` runs to the end of the buffer.
    ///
    /// `body` and `_shared_pos` are accepted but not used.
    fn next_third(&mut self, body: &[u8], _shared_pos: &mut usize) -> Option<Third> {
        let _ = body;

        let blanks = self.buf[self.pos..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
        self.pos += blanks;

        // Peek the next byte to decide between a hex string and an array.
        let lead = *self.buf.get(self.pos)?;
        match lead {
            b'<' => self.next_hex_string().map(Third::Hex),
            b'[' => {
                self.pos += 1;
                let mut entries: Vec<String> = Vec::new();
                while let Some(&byte) = self.buf.get(self.pos) {
                    match byte {
                        b']' => {
                            self.pos += 1;
                            break;
                        }
                        b'<' => {
                            if let Some(hex) = self.next_hex_string() {
                                entries.push(hex.to_unicode_string());
                            }
                        }
                        _ => self.pos += 1,
                    }
                }
                Some(Third::Array(entries))
            }
            _ => None,
        }
    }
}

/// Decodes the digits found between `<` and `>` of a hex string into bytes.
///
/// ASCII whitespace between digits is ignored and both upper- and
/// lower-case digits are accepted. Decoding stops at the first byte that is
/// neither whitespace nor a hex digit. If an odd number of digits was read,
/// the missing final digit is taken to be `0` (so `901FA` decodes to
/// `90 1F A0`), as PDF and PostScript specify for hex strings.
fn decode_hex(input: &[u8]) -> Vec<u8> {
    let mut out: Vec<u8> = Vec::with_capacity(input.len() / 2 + 1);
    // High nybble waiting for its low partner, if any.
    let mut pending: Option<u8> = None;
    for &b in input {
        if b.is_ascii_whitespace() {
            continue;
        }
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        match pending.take() {
            Some(high) => out.push((high << 4) | digit),
            None => pending = Some(digit),
        }
    }
    // Odd digit count: pad the last byte with a zero low nybble.
    if let Some(high) = pending {
        out.push(high << 4);
    }
    out
}

/// Reports whether any `<hex>` bound in a `codespacerange` body is two or
/// more bytes long.
fn codespace_is_two_byte(body: &[u8]) -> bool {
    let mut bounds = HexIter::new(body);
    loop {
        match bounds.next_hex_string() {
            Some(bound) if bound.0.len() >= 2 => return true,
            Some(_) => {}
            None => return false,
        }
    }
}