spectre_parse 1.0.0

Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
Documentation
//! Detailed trace of a single PDF's open path.
use spectre_parse::Document;

/// Entry point: reads the PDF named by the first command-line argument and
/// reports, on stderr, the outcome of each stage of the open path.
///
/// Panics with a usage message when no path is given, and with `read` when the
/// file cannot be read. Failures of the individual stages are printed, not
/// propagated, so every stage is always attempted.
fn main() {
    let pdf_path = std::env::args().nth(1).expect("usage: trace_one <pdf>");
    let data = std::fs::read(&pdf_path).expect("read");
    eprintln!("file: {} bytes", data.len());

    // Each stage is rendered to a single report line first, then printed, so
    // the three stages read uniformly.

    // Stage 1: header marker and version token.
    let header_report = match spectre_parse_internals::header(&data) {
        Ok((version, offset)) => format!("header: v{version} at byte {offset}"),
        Err(err) => format!("header ERR: {err}"),
    };
    eprintln!("{header_report}");

    // Stage 2: byte offset recorded after the trailing `startxref` keyword.
    let startxref_report = match spectre_parse_internals::startxref(&data) {
        Ok(offset) => format!("startxref: {offset}"),
        Err(err) => format!("startxref ERR: {err}"),
    };
    eprintln!("{startxref_report}");

    // Stage 3: the library's own open path, for comparison with the above.
    let open_report = match Document::open(&data) {
        Ok(doc) => format!(
            "Document::open: OK  version={} xref_size={}",
            doc.version,
            doc.xref_size()
        ),
        Err(err) => format!("Document::open ERR: {err:?}"),
    };
    eprintln!("{open_report}");
}

/// Stand-alone re-implementations of the first two steps of the open path
/// (header lookup and `startxref` lookup), so each step can be traced on its
/// own. Failures are reported through `spectre_parse::Error::Io`.
mod spectre_parse_internals {
    use spectre_parse::Error;

    /// Returns `true` for the six PDF white-space bytes: NUL, HT, LF, FF, CR
    /// and SP.
    fn is_pdf_whitespace(byte: u8) -> bool {
        matches!(byte, b'\0' | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
    }

    /// Finds the first `%PDF-` marker in `buf`.
    ///
    /// Returns the version token that follows the marker (everything up to the
    /// next PDF white-space byte, or the end of the buffer) together with the
    /// byte offset of the marker itself.
    ///
    /// # Errors
    ///
    /// * `Error::Io("no header")` when the marker is absent.
    /// * `Error::Io("header utf8")` when the version token is not valid UTF-8.
    pub fn header(buf: &[u8]) -> Result<(String, usize), Error> {
        // re-impl via public API only; just locate the header marker.
        const NEEDLE: &[u8] = b"%PDF-";
        let start = buf
            .windows(NEEDLE.len())
            .position(|w| w == NEEDLE)
            .ok_or_else(|| Error::Io("no header".into()))?;
        let after = &buf[start + NEEDLE.len()..];
        let end = after
            .iter()
            .position(|&b| is_pdf_whitespace(b))
            .unwrap_or(after.len());
        let version = std::str::from_utf8(&after[..end])
            .map_err(|_| Error::Io("header utf8".into()))?;
        Ok((version.to_string(), start))
    }

    /// Reads the integer that follows the last `startxref` keyword found in the
    /// final 4096 bytes of `buf`.
    ///
    /// Only the white-space-delimited token directly after the keyword is
    /// decoded, so bytes further on (for example binary junk after `%%EOF`)
    /// cannot make the lookup fail.
    ///
    /// # Errors
    ///
    /// * `Error::Io("no startxref")` when the keyword is not in the tail window.
    /// * `Error::Io("no startxref digits")` when nothing but white-space (or
    ///   nothing at all) follows the keyword.
    /// * `Error::Io("startxref utf8")` when the token is not valid UTF-8.
    /// * `Error::Io("startxref not int")` when the token does not parse as
    ///   `usize`.
    pub fn startxref(buf: &[u8]) -> Result<usize, Error> {
        const NEEDLE: &[u8] = b"startxref";
        // Only the end of the file is scanned for the keyword.
        const TAIL_WINDOW: usize = 4096;

        let from = buf.len().saturating_sub(TAIL_WINDOW);
        let tail = &buf[from..];
        // Search from the back: the last occurrence is the one that counts.
        let last = tail
            .windows(NEEDLE.len())
            .rposition(|w| w == NEEDLE)
            .ok_or_else(|| Error::Io("no startxref".into()))?;
        let after = &tail[last + NEEDLE.len()..];

        // Skip the white-space separating the keyword from the offset.
        let skip = after
            .iter()
            .position(|&b| !is_pdf_whitespace(b))
            .unwrap_or(after.len());
        let rest = &after[skip..];
        // The token runs up to the next white-space byte or the end of input.
        let len = rest
            .iter()
            .position(|&b| is_pdf_whitespace(b))
            .unwrap_or(rest.len());
        let token = &rest[..len];
        if token.is_empty() {
            return Err(Error::Io("no startxref digits".into()));
        }

        std::str::from_utf8(token)
            .map_err(|_| Error::Io("startxref utf8".into()))?
            .parse::<usize>()
            .map_err(|_| Error::Io("startxref not int".into()))
    }
}