spectre_parse 1.0.0

Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
Documentation
//! Content-stream tokenization + the `extract_text` walk.
//!
//! Text extraction handles `BT` / `ET` / `Tf` / `Tj` / `TJ` / `'` /
//! `"` / `T*`. The `'` / `"` / `T*` operators are the ones upstream
//! lopdf silently drops; we cover them.

use crate::encoding::Encoding;
use crate::error::Result;
use crate::object::{Object, StringFormat};
use crate::parser::{is_token_end, is_ws, read_object, Cursor};
use std::collections::BTreeMap;

/// A single content-stream instruction: an operator token together
/// with the operand objects attached to it.
#[derive(Debug, Clone)]
pub struct Operation {
    /// Operator token as text, e.g. `Tj` or `BT`.
    pub operator: String,
    /// Operand objects belonging to this operator.
    pub operands: Vec<Object>,
}

impl Operation {
    /// Build an operation from anything convertible into the operator
    /// name plus an already-collected operand list.
    pub fn new(operator: impl Into<String>, operands: Vec<Object>) -> Self {
        let operator: String = operator.into();
        Operation { operator, operands }
    }
}

/// A tokenized content stream: the list of operations in the order
/// they were read.
#[derive(Debug, Clone, Default)]
pub struct Content {
    /// Operations in stream order.
    pub operations: Vec<Operation>,
}

impl Content {
    /// Tokenize a decoded content stream into operations.
    pub fn decode(data: &[u8]) -> Result<Self> {
        let mut c = Cursor::at(data, 0);
        let mut ops: Vec<Operation> = Vec::with_capacity(256);
        let mut buf: Vec<Object> = Vec::with_capacity(8);
        loop {
            c.skip_ws_and_comments();
            if c.at_eof_local() {
                break;
            }
            let before = c.pos;
            if can_start_operand(c.rest()) {
                match read_object(&mut c) {
                    Ok(obj) => {
                        buf.push(obj);
                    }
                    Err(_) => {
                        // Object parse failed — fall back to operator.
                        let op = read_operator(&mut c);
                        if !op.is_empty() {
                            ops.push(Operation {
                                operator: op,
                                operands: std::mem::take(&mut buf),
                            });
                        }
                    }
                }
            } else if c.peek() == Some(b'B') && c.rest().starts_with(b"BI") {
                // Inline image block `BI ... ID ... EI` — payload between
                // ID/EI isn't valid content-stream syntax, skip past it.
                skip_inline_image(&mut c);
            } else {
                let op = read_operator(&mut c);
                if !op.is_empty() {
                    ops.push(Operation {
                        operator: op,
                        operands: std::mem::take(&mut buf),
                    });
                }
            }
 // Forward progress invariant: if neither branch consumed a
 // byte, the loop would spin forever on whatever delimiter
            // tripped read_object. Force one byte of progress.
            if c.pos == before {
                c.advance(1);
            }
        }
        Ok(Self { operations: ops })
    }
}

/// Tokenize a content stream and return the text it shows. This is
/// the spectre v0.4 `extract_text` path, mechanically translated to
/// our parser's types.
///
/// `data` is tokenized with [`Content::decode`]; if that fails the
/// result is an empty string. `encodings` is looked up with the name
/// operand passed to `Tf`.
///
/// Operator handling:
/// - `Tf`: the first operand names the font; the active encoding
///   becomes whatever `encodings` holds for that name (none if the
///   name is absent).
/// - `Tj`: append the string operands.
/// - `TJ`: append the string items of the array operand; numeric
///   kerning offsets are dropped — they only affect glyph spacing.
/// - `'`: start a new line, then append the string operands (PDF spec
///   §9.4.3; upstream lopdf 0.39.0 silently drops this op — our
///   `lopdf-fork` patches it, and we replicate the fix here).
/// - `"`: like `'`, but the string is the operand at index 2 (the
///   first two are word/character spacing).
/// - `T*` and `ET`: start a new line, so consecutive lines and text
///   blocks stay visually separated.
///
/// "Start a new line" appends `\n` only when the output does not
/// already end with one, so line breaks never double up.
pub fn extract_text_from_stream(
    data: &[u8],
    encodings: &BTreeMap<Vec<u8>, Encoding>,
) -> String {
    /// Terminate the current line unless the text already ends on one.
    fn break_line(text: &mut String) {
        if !text.ends_with('\n') {
            text.push('\n');
        }
    }

    let Ok(content) = Content::decode(data) else {
        return String::new();
    };

    let mut out = String::with_capacity(data.len() / 4);
    let mut active: Option<&Encoding> = None;

    for instruction in content.operations.iter() {
        let args = instruction.operands.as_slice();
        match instruction.operator.as_str() {
            "Tf" => {
                // A missing or non-name operand leaves the previous
                // encoding in place.
                if let Some(Ok(font)) = args.first().map(|obj| obj.as_name()) {
                    active = encodings.get(font);
                }
            }
            "Tj" => collect_show(&mut out, active, args),
            "TJ" => {
                if let Some(Object::Array(items)) = args.first() {
                    // `collect_show` ignores anything that is not a
                    // string, which is what discards the kerning numbers.
                    for item in items {
                        collect_show(&mut out, active, std::slice::from_ref(item));
                    }
                }
            }
            "'" => {
                break_line(&mut out);
                collect_show(&mut out, active, args);
            }
            "\"" => {
                break_line(&mut out);
                if let Some(shown) = args.get(2) {
                    collect_show(&mut out, active, std::slice::from_ref(shown));
                }
            }
            "T*" | "ET" => break_line(&mut out),
            _ => {}
        }
    }
    out
}

/// Append the decoded text of every string operand to `out`, in
/// order. Operands that are not strings are ignored.
fn collect_show(out: &mut String, encoding: Option<&Encoding>, operands: &[Object]) {
    let pieces = operands.iter().filter_map(|operand| match operand {
        Object::String(raw, format) => Some(decode_one(encoding, raw, format)),
        _ => None,
    });
    out.extend(pieces);
}

/// Decode the bytes of one shown string.
///
/// The active encoding is tried first. When there is none, or it
/// rejects the bytes, each byte is mapped to the Unicode code point of
/// the same value. The string format is currently unused.
fn decode_one(encoding: Option<&Encoding>, bytes: &[u8], _fmt: &StringFormat) -> String {
    encoding
        .and_then(|enc| enc.bytes_to_string(bytes).ok())
        // Last-resort fallback: Latin-1 round-trip — preserves ASCII
        // verbatim, which is what most content streams contain anyway.
        .unwrap_or_else(|| bytes.iter().copied().map(char::from).collect())
}

/// Decide from the first upcoming byte whether the next token could be
/// an operand (object) rather than an operator name.
///
/// Operand starters are `/`, `(`, `<`, `[`, a sign, a decimal point or
/// an ASCII digit. Empty input is never an operand.
fn can_start_operand(rest: &[u8]) -> bool {
    match rest.first() {
        Some(b'/' | b'(' | b'<' | b'[') => true,
        Some(b'+' | b'-' | b'.') => true,
        Some(lead) => lead.is_ascii_digit(),
        None => false,
    }
}

/// Consume one operator token and return it as text.
///
/// The token runs from the current position up to, but not including,
/// the first byte for which `is_token_end` or `is_ws` holds, or until
/// `peek` yields nothing. The result is empty when the cursor already
/// sits on such a byte. Bytes that are not valid UTF-8 are replaced
/// lossily.
fn read_operator(c: &mut Cursor<'_>) -> String {
    let begin = c.pos;
    loop {
        match c.peek() {
            Some(byte) if !is_token_end(byte) && !is_ws(byte) => {
                c.bump();
            }
            _ => break,
        }
    }
    let token = &c.buf[begin..c.pos];
    String::from_utf8_lossy(token).into_owned()
}

/// Walk past an inline image block (`BI ... ID <data> EI`). Inline
/// images embed raw filter-encoded bytes that don't tokenize as
/// content operators; skipping the block keeps the rest of the stream
/// parseable.
///
/// The cursor must be positioned on the `BI` keyword. On return it
/// sits just past the terminating `EI`. If the block is unterminated
/// (no `ID`, or no `EI` after it) the cursor is moved to the end of
/// the input, so no stray byte of image data is left behind to be
/// tokenized as a bogus operator or operand.
fn skip_inline_image(c: &mut Cursor<'_>) {
    // Skip the BI.
    c.advance(2);

    // Scan forward to the first `ID`, which starts the image data.
    let mut found_id = false;
    while c.pos + 2 <= c.buf.len() {
        if c.rest().starts_with(b"ID") {
            c.advance(2);
            // Spec wants exactly one whitespace before the data; skip it.
            if matches!(c.peek(), Some(b' ' | b'\n' | b'\r')) {
                c.bump();
            }
            found_id = true;
            break;
        }
        c.bump();
    }

    if found_id {
        // Walk to `EI` — followed by whitespace (or end of input) per spec.
        while c.pos + 2 <= c.buf.len() {
            let terminated = c.rest().starts_with(b"EI")
                && c.buf.get(c.pos + 2).map_or(true, |&b| is_ws(b));
            if terminated {
                c.advance(2);
                return;
            }
            c.bump();
        }
    }

    // Unterminated block: both scans above stop one byte short of the
    // end, so consume the remainder explicitly.
    c.pos = c.buf.len();
}

// `Cursor::at_eof` is public-but-unused in lib.rs; reproduce a tiny
// helper here so we don't have to widen Cursor's surface.
/// Module-private extension that gives `Cursor` an end-of-input test.
trait CursorExt {
    /// True once the read position has reached or passed the end of
    /// the underlying buffer.
    fn at_eof_local(&self) -> bool;
}

impl CursorExt for Cursor<'_> {
    fn at_eof_local(&self) -> bool {
        self.buf.len() <= self.pos
    }
}