spectre_parse 1.0.0

//! Hand-rolled byte parser for the PDF productions spectre needs.
//!
//! Each function takes a `Cursor<'a>` (slice + position) and advances
//! it, leaving the caller in charge of error recovery. Parser
//! combinators are avoided here: at this granularity their generic
//! trait dispatch shows up in the hot path.
//!
//! # Productions
//!
//! - `%PDF-x.y` header
//! - `startxref` locator
//! - classical `xref` table and trailer
//! - xref streams (PDF 1.5+)
//! - generic object body: bool / number / name / string / array /
//!   dictionary / null / reference / indirect-object envelope

use crate::error::{Error, ParseError, Result, XrefError};
use crate::object::{Dictionary, Object, ObjectId, Stream, StringFormat};
use crate::xref::{Xref, XrefEntry};

/// A byte cursor: a borrowed input slice plus the current read position.
///
/// All parser helpers in this module consume input by mutating `pos`.
/// [`Cursor::at`] does not validate its argument, so every accessor here
/// treats a `pos` at or beyond `buf.len()` as end-of-input.
pub(crate) struct Cursor<'a> {
    /// The whole input buffer.
    pub(crate) buf: &'a [u8],
    /// Index into `buf` of the next unread byte.
    pub(crate) pos: usize,
}

impl<'a> Cursor<'a> {
    /// Creates a cursor positioned at the first byte of `buf`.
    pub fn new(buf: &'a [u8]) -> Self {
        Self { buf, pos: 0 }
    }

    /// Creates a cursor positioned at byte `pos` of `buf`.
    ///
    /// `pos` is not checked against `buf.len()`.
    pub fn at(buf: &'a [u8], pos: usize) -> Self {
        Self { buf, pos }
    }

    /// Returns the unread remainder of the buffer (empty at or past EOF).
    #[inline]
    pub fn rest(&self) -> &'a [u8] {
        &self.buf[self.pos.min(self.buf.len())..]
    }

    /// Returns `true` when no unread bytes remain.
    #[inline]
    pub fn at_eof(&self) -> bool {
        self.pos >= self.buf.len()
    }

    /// Returns the byte at the cursor without consuming it.
    #[inline]
    pub fn peek(&self) -> Option<u8> {
        self.buf.get(self.pos).copied()
    }

    /// Advance one byte; returns the byte that was at the cursor, or
    /// `None` (without moving) at EOF.
    #[inline]
    pub fn bump(&mut self) -> Option<u8> {
        let b = self.peek()?;
        self.pos += 1;
        Some(b)
    }

    /// Advance `n` bytes (clamped at EOF).
    ///
    /// The addition saturates: `pos` may have been seeded with an
    /// arbitrary value through [`Cursor::at`], so `pos + n` must not be
    /// allowed to overflow before the clamp is applied.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        self.pos = self.pos.saturating_add(n).min(self.buf.len());
    }

    /// Returns `true` when the unread input begins with `prefix`.
    pub fn starts_with(&self, prefix: &[u8]) -> bool {
        self.rest().starts_with(prefix)
    }

    /// Eat `prefix` if present; return whether we did.
    pub fn eat(&mut self, prefix: &[u8]) -> bool {
        let matched = self.starts_with(prefix);
        if matched {
            self.advance(prefix.len());
        }
        matched
    }

    /// Skip PDF whitespace (spec §7.2.3) and `% ... <eol>` comments.
    ///
    /// A comment runs up to and including the first CR or LF; the LF of a
    /// CRLF pair is then skipped as ordinary whitespace on the next pass.
    pub fn skip_ws_and_comments(&mut self) {
        loop {
            match self.peek() {
                Some(b) if is_ws(b) => self.advance(1),
                Some(b'%') => {
                    while let Some(b) = self.bump() {
                        if b == b'\n' || b == b'\r' {
                            break;
                        }
                    }
                }
                _ => break,
            }
        }
    }
}

/// Returns `true` for the six PDF whitespace bytes (spec §7.2.3):
/// NUL, HT, LF, FF, CR and SP.
#[inline]
pub(crate) fn is_ws(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20];
    WHITESPACE.contains(&b)
}

/// Returns `true` for the ten PDF delimiter bytes (spec §7.2.3):
/// `(` `)` `<` `>` `[` `]` `{` `}` `/` `%`.
#[inline]
pub(crate) fn is_delim(b: u8) -> bool {
    b"()<>[]{}/%".contains(&b)
}

/// Returns `true` when `b` terminates a regular token, i.e. it is either
/// PDF whitespace or a PDF delimiter.
#[inline]
pub(crate) fn is_token_end(b: u8) -> bool {
    if is_ws(b) {
        return true;
    }
    is_delim(b)
}

// ── Header ─────────────────────────────────────────────────────────────────

/// Locate `%PDF-x.y` near the start of the buffer.
///
/// Returns `(version, marker_offset)`: the version text that follows
/// `%PDF-` (everything up to the first whitespace byte, 1 to 8 bytes) and
/// the byte offset at which the `%PDF-` marker itself starts.
///
/// PDF spec allows up to 1 KB of prefix junk; we use a 4 KB look-window
/// for the worst real-world prefixes.
///
/// # Errors
///
/// `ParseError::InvalidFileHeader` when the marker is not inside the
/// window, or the version token is empty, longer than 8 bytes, or not
/// valid UTF-8.
pub fn parse_header(buf: &[u8]) -> Result<(String, usize)> {
    const WINDOW: usize = 4096;
    const MARKER: &[u8] = b"%PDF-";
    const MAX_VERSION_LEN: usize = 8;

    let window = &buf[..buf.len().min(WINDOW)];
    let start = window
        .windows(MARKER.len())
        .position(|w| w == MARKER)
        .ok_or(ParseError::InvalidFileHeader)?;
    let after = &buf[start + MARKER.len()..];

    // Look at no more than MAX_VERSION_LEN + 1 bytes: that is enough to
    // tell "too long" apart from "acceptable" without walking the rest of
    // the file when no whitespace follows the marker.
    let end = after
        .iter()
        .take(MAX_VERSION_LEN + 1)
        .take_while(|&&b| !is_ws(b))
        .count();
    if end == 0 || end > MAX_VERSION_LEN {
        return Err(ParseError::InvalidFileHeader.into());
    }

    let version = std::str::from_utf8(&after[..end])
        .map_err(|_| ParseError::InvalidFileHeader)?
        .to_owned();
    Ok((version, start))
}

// ── startxref ─────────────────────────────────────────────────────────────

/// Find the byte offset declared by the `startxref` directive near
/// end-of-file. Spec puts this in the last 1 KB; we search the last
/// 4 KB to tolerate signing tools that append blobs after %%EOF.
pub fn parse_startxref(buf: &[u8]) -> Result<usize> {
    const WINDOW: usize = 4096;
    let from = buf.len().saturating_sub(WINDOW);
    let needle = b"startxref";
    let tail = &buf[from..];
    let last = tail
        .windows(needle.len())
        .enumerate()
        .filter(|(_, w)| *w == needle)
        .map(|(i, _)| i)
        .last()
        .ok_or(ParseError::MissingStartXref)?;
    let mut c = Cursor::at(buf, from + last + needle.len());
    c.skip_ws_and_comments();
    let n = read_integer(&mut c)?;
    if n < 0 || n as usize > buf.len() {
        return Err(XrefError::Start.into());
    }
    Ok(n as usize)
}

// ── Numbers ───────────────────────────────────────────────────────────────

/// Read an unsigned-or-signed integer literal at the cursor.
///
/// Accepts an optional leading `+` or `-` followed by one or more ASCII
/// digits and leaves the cursor just past the last digit. The signed
/// token is parsed as a whole (as [`read_number`] does), so the full
/// `i64` range including `i64::MIN` is representable.
///
/// # Errors
///
/// `ParseError::Unexpected` carrying the offset where the token started
/// when no digit is present or the value does not fit an `i64`. The
/// cursor is not rewound on failure.
pub fn read_integer(c: &mut Cursor<'_>) -> Result<i64> {
    let start = c.pos;
    if matches!(c.peek(), Some(b'+') | Some(b'-')) {
        c.bump();
    }
    let digits_start = c.pos;
    while matches!(c.peek(), Some(b) if b.is_ascii_digit()) {
        c.bump();
    }
    if c.pos == digits_start {
        return Err(ParseError::Unexpected {
            offset: start,
            expected: "integer",
        }
        .into());
    }
    // The token is a sign plus ASCII digits, so the UTF-8 check is only a
    // formality; the error label is kept for parity with earlier versions.
    let token = std::str::from_utf8(&c.buf[start..c.pos]).map_err(|_| ParseError::Unexpected {
        offset: start,
        expected: "integer-utf8",
    })?;
    let value: i64 = token.parse().map_err(|_| ParseError::Unexpected {
        offset: start,
        expected: "integer-parse",
    })?;
    Ok(value)
}

/// Read a numeric literal and classify it.
///
/// Grammar accepted: an optional `+`/`-`, then any mix of ASCII digits
/// with at most one `.`; at least one digit is required. The result is
/// `Object::Real` when a `.` was seen and `Object::Integer` otherwise.
/// Scanning stops at the first byte that does not fit, which is left
/// unconsumed.
///
/// # Errors
///
/// `ParseError::Unexpected` (offset = where the token started) when no
/// digit is present or the text cannot be parsed as `f32` / `i64`.
pub fn read_number(c: &mut Cursor<'_>) -> Result<Object> {
    let origin = c.pos;
    let fail = |expected: &'static str| ParseError::Unexpected {
        offset: origin,
        expected,
    };

    if let Some(b'+' | b'-') = c.peek() {
        c.bump();
    }

    let (mut has_digit, mut has_dot) = (false, false);
    loop {
        match c.peek() {
            Some(b'0'..=b'9') => has_digit = true,
            Some(b'.') if !has_dot => has_dot = true,
            _ => break,
        }
        c.bump();
    }
    if !has_digit {
        return Err(fail("number").into());
    }

    // The token spans sign (if any) through the last digit/dot.
    let text = std::str::from_utf8(&c.buf[origin..c.pos]).map_err(|_| fail("number-utf8"))?;
    if has_dot {
        text.parse::<f32>()
            .map(Object::Real)
            .map_err(|_| fail("real-parse").into())
    } else {
        text.parse::<i64>()
            .map(Object::Integer)
            .map_err(|_| fail("int-parse").into())
    }
}

// ── Names ────────────────────────────────────────────────────────────────

/// `/Name` — read a name token and return its decoded bytes (without the
/// leading `/`).
///
/// Bytes are taken up to, but not including, the next whitespace or
/// delimiter. A `#` must be followed by two hex digits, which are decoded
/// into a single byte.
///
/// # Errors
///
/// `ParseError::Unexpected` when the token does not begin with `/`, or a
/// `#` escape is truncated by end-of-input or uses a non-hex digit.
pub fn read_name(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    let lead = c.bump();
    if lead != Some(b'/') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "name '/'",
        }
        .into());
    }

    let mut name = Vec::with_capacity(16);
    loop {
        let byte = match c.peek() {
            Some(b) if !is_token_end(b) => b,
            _ => break,
        };
        c.bump();
        if byte != b'#' {
            name.push(byte);
            continue;
        }

        // Both escape bytes are consumed before either is validated, so a
        // truncated escape is reported ahead of an invalid digit.
        let Some(hi) = c.bump() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-hi",
            }
            .into());
        };
        let Some(lo) = c.bump() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-lo",
            }
            .into());
        };
        let Some(hi_nibble) = hex_value(hi) else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-hi-value",
            }
            .into());
        };
        let Some(lo_nibble) = hex_value(lo) else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-lo-value",
            }
            .into());
        };
        name.push((hi_nibble << 4) | lo_nibble);
    }
    Ok(name)
}

/// Maps an ASCII hex digit (`0-9`, `a-f`, `A-F`) to its value 0..=15;
/// any other byte yields `None`.
#[inline]
fn hex_value(b: u8) -> Option<u8> {
    // `to_digit(16)` only recognises the ASCII hex digits, and its result
    // is below 16, so the narrowing cast cannot lose information.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}

// ── Strings ───────────────────────────────────────────────────────────────

/// `(literal)` — read a parenthesized string body and return its decoded
/// bytes (outer parentheses excluded).
///
/// Handles balanced nested parens (kept in the output), the
/// `\n`/`\r`/`\t`/`\b`/`\f`/`\\`/`\(`/`\)` escapes, `\<eol>` line
/// continuations, and `\ddd` octal escapes (1 to 3 digits, each `0-7`,
/// high-order overflow discarded) per spec §7.3.4.2. A backslash before
/// any other byte — including `8` and `9`, which are not octal digits —
/// is dropped and the byte is kept literally. Unescaped bytes, including
/// raw end-of-line bytes, are copied through unchanged.
///
/// # Errors
///
/// `ParseError::Unexpected` when the token does not start with `(`, when
/// input ends right after a backslash, or when the closing `)` is missing.
pub fn read_literal_string(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    if c.bump() != Some(b'(') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "literal-string '('",
        }
        .into());
    }
    let mut out = Vec::with_capacity(32);
    // Nesting depth of unescaped parentheses; starts at 1 for the opener,
    // and the function returns as soon as it drops back to 0.
    let mut depth = 1usize;
    while let Some(b) = c.bump() {
        match b {
            b'(' => {
                depth += 1;
                out.push(b);
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Ok(out);
                }
                out.push(b);
            }
            b'\\' => {
                let next = c.bump().ok_or(ParseError::Unexpected {
                    offset: c.pos,
                    expected: "string-escape",
                })?;
                match next {
                    b'n' => out.push(b'\n'),
                    b'r' => out.push(b'\r'),
                    b't' => out.push(b'\t'),
                    b'b' => out.push(8),
                    b'f' => out.push(12),
                    b'(' | b')' | b'\\' => out.push(next),
                    // `\<eol>` is a line continuation.
                    b'\n' => {}
                    b'\r' => {
                        if c.peek() == Some(b'\n') {
                            c.bump();
                        }
                    }
                    b'0'..=b'7' => {
                        // \ddd octal — up to 3 digits including this one.
                        let mut v = u16::from(next - b'0');
                        for _ in 0..2 {
                            match c.peek() {
                                Some(d @ b'0'..=b'7') => {
                                    c.bump();
                                    v = v * 8 + u16::from(d - b'0');
                                }
                                _ => break,
                            }
                        }
                        // Values above 0o377 keep only their low byte.
                        out.push((v & 0xff) as u8);
                    }
                    other => out.push(other), // unknown escape → literal char
                }
            }
            _ => out.push(b),
        }
    }
    Err(ParseError::Unexpected {
        offset: c.pos,
        expected: "literal-string-close",
    }
    .into())
}

/// `<hex>` — read a hex-encoded string and return the decoded bytes.
///
/// Whitespace between digits is ignored. When the digit count is odd, the
/// final digit is treated as the high nibble of a byte whose low nibble
/// is zero, per spec.
///
/// # Errors
///
/// `ParseError::Unexpected` when the token does not start with `<`, when
/// a non-hex, non-whitespace byte appears before `>`, or when input ends
/// before the closing `>`.
pub fn read_hex_string(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    let opener = c.bump();
    if opener != Some(b'<') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "hex-string '<'",
        }
        .into());
    }

    let mut bytes = Vec::with_capacity(32);
    // High nibble waiting for its partner, if any.
    let mut pending: Option<u8> = None;
    loop {
        let Some(ch) = c.bump() else {
            break;
        };
        match ch {
            b'>' => {
                if let Some(hi) = pending {
                    bytes.push(hi << 4);
                }
                return Ok(bytes);
            }
            blank if is_ws(blank) => {}
            digit => {
                let Some(nibble) = hex_value(digit) else {
                    return Err(ParseError::Unexpected {
                        offset: c.pos,
                        expected: "hex-string-digit",
                    }
                    .into());
                };
                match pending.take() {
                    Some(hi) => bytes.push((hi << 4) | nibble),
                    None => pending = Some(nibble),
                }
            }
        }
    }

    Err(ParseError::Unexpected {
        offset: c.pos,
        expected: "hex-string-close",
    }
    .into())
}

// ── Arrays / dictionaries / objects ───────────────────────────────────────

/// Read one PDF object at the cursor (any variant).
///
/// Leading whitespace and comments are skipped, then the first byte picks
/// the production: `/` name, `(` literal string, `[` array, `<<`
/// dictionary or stream, `<` hex string, `t`/`f`/`n` for the keywords
/// `true`/`false`/`null`, and a digit, sign or `.` for a number or
/// reference.
///
/// # Errors
///
/// `ParseError::Unexpected` (offset = first byte of the object) when the
/// input is exhausted, the lead byte starts no known production, or a
/// keyword is misspelled; otherwise whatever the selected sub-parser
/// reports.
pub fn read_object(c: &mut Cursor<'_>) -> Result<Object> {
    c.skip_ws_and_comments();
    let here = c.pos;
    let Some(lead) = c.peek() else {
        return Err(ParseError::Unexpected {
            offset: here,
            expected: "object (eof)",
        }
        .into());
    };

    match lead {
        b'/' => read_name(c).map(Object::Name),
        b'(' => read_literal_string(c).map(|bytes| Object::String(bytes, StringFormat::Literal)),
        b'[' => read_array(c),
        // `<<` opens a dictionary; a single `<` opens a hex string.
        b'<' if c.buf.get(c.pos + 1) == Some(&b'<') => read_dictionary_or_stream(c),
        b'<' => read_hex_string(c).map(|bytes| Object::String(bytes, StringFormat::Hexadecimal)),
        b't' | b'f' | b'n' => {
            let (word, expected, value): (&[u8], &'static str, Object) = match lead {
                b't' => (&b"true"[..], "true", Object::Boolean(true)),
                b'f' => (&b"false"[..], "false", Object::Boolean(false)),
                _ => (&b"null"[..], "null", Object::Null),
            };
            if c.eat(word) {
                Ok(value)
            } else {
                Err(ParseError::Unexpected {
                    offset: here,
                    expected,
                }
                .into())
            }
        }
        b'0'..=b'9' | b'+' | b'-' | b'.' => try_read_reference_or_number(c),
        _ => Err(ParseError::Unexpected {
            offset: here,
            expected: "object-start",
        }
        .into()),
    }
}

/// A leading numeric token is either a plain number, the first half of
/// a reference (`12 0 R`), or the first half of an indirect-object
/// header (`12 0 obj`). Disambiguate by looking 2 tokens ahead.
fn try_read_reference_or_number(c: &mut Cursor<'_>) -> Result<Object> {
    let save = c.pos;
    let first = read_number(c)?;
    let after_first = c.pos;
    c.skip_ws_and_comments();
    let after_first_ws = c.pos;
    if matches!(c.peek(), Some(b) if b.is_ascii_digit() || b == b'+' || b == b'-') {
        let second = read_number(&mut Cursor::at(c.buf, c.pos));
        if let Ok(Object::Integer(gen)) = second {
            let mut probe = Cursor::at(c.buf, c.pos);
            let _ = read_number(&mut probe);
            let after_second = probe.pos;
            probe.skip_ws_and_comments();
            if probe.starts_with(b"R")
                && probe
                    .buf
                    .get(probe.pos + 1)
                    .map(|&b| is_token_end(b))
                    .unwrap_or(true)
            {
                let num = match first {
                    Object::Integer(n) if n >= 0 => n as u32,
                    _ => {
                        c.pos = after_first;
                        return Ok(first);
                    }
                };
                let gen = if (0..=u16::MAX as i64).contains(&gen) {
                    gen as u16
                } else {
                    c.pos = after_first;
                    return Ok(first);
                };
                c.pos = after_second;
                c.skip_ws_and_comments();
                c.bump(); // 'R'
                return Ok(Object::Reference((num, gen)));
            }
        }
    }
    c.pos = after_first_ws.min(after_first);
    c.pos = after_first;
    let _ = save;
    Ok(first)
}

/// `[ ... ]` — read an array and return it as `Object::Array`.
///
/// Elements are parsed with [`read_object`] (which may call back into
/// this function for nested arrays) until the closing `]` is consumed.
///
/// # Errors
///
/// `ParseError::Unexpected` when the token does not start with `[` or the
/// input ends before `]`; otherwise the first element error encountered.
pub fn read_array(c: &mut Cursor<'_>) -> Result<Object> {
    let opener = c.bump();
    if opener != Some(b'[') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "array '['",
        }
        .into());
    }

    let mut elements = Vec::new();
    loop {
        c.skip_ws_and_comments();
        let Some(next) = c.peek() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "array ']' (eof)",
            }
            .into());
        };
        if next == b']' {
            c.bump();
            break;
        }
        elements.push(read_object(c)?);
    }
    Ok(Object::Array(elements))
}

/// `<<...>>` — read a dictionary and, when the `stream` keyword follows,
/// promote it to a [`Stream`].
///
/// For a stream, the cursor is left on the first content byte: after the
/// keyword, an optional CR and then an optional LF are consumed. The
/// content itself is not read here: `content` is returned empty and
/// `start_position` records where it begins.
///
/// Whitespace and comments after the dictionary are consumed in both
/// cases.
pub fn read_dictionary_or_stream(c: &mut Cursor<'_>) -> Result<Object> {
    let dict = read_dictionary(c)?;
    c.skip_ws_and_comments();
    if !c.eat(b"stream") {
        return Ok(Object::Dictionary(dict));
    }

    // Spec requires exactly one EOL after `stream` — `\n` or `\r\n`; a
    // lone `\r` is tolerated as well. "Optional CR, then optional LF"
    // covers all three.
    let _ = c.eat(b"\r");
    let _ = c.eat(b"\n");

    // Content body isn't sliced here; materialize_stream_body
    // computes /Length and copies bytes on demand.
    Ok(Object::Stream(Stream {
        dict,
        content: Vec::new(),
        start_position: Some(c.pos),
    }))
}

/// `<< /key value /key value ... >>` — read a dictionary body alone.
///
/// Each entry is a name key followed by an arbitrary object; entries are
/// stored with `Dictionary::set` in the order they appear.
///
/// # Errors
///
/// `ParseError::Unexpected` when the token does not start with `<<`;
/// otherwise the first key or value error (which is also how a missing
/// `>>` surfaces).
pub fn read_dictionary(c: &mut Cursor<'_>) -> Result<Dictionary> {
    let opened = c.eat(b"<<");
    if !opened {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "dictionary '<<'",
        }
        .into());
    }

    let mut entries = Dictionary::new();
    loop {
        c.skip_ws_and_comments();
        if c.eat(b">>") {
            break;
        }
        let key = read_name(c)?;
        c.skip_ws_and_comments();
        entries.set(key, read_object(c)?);
    }
    Ok(entries)
}

// ── Indirect-object envelope: `N G obj ... endobj` ────────────────────────

/// Parse the body of an indirect object positioned at byte `offset`.
/// Returns the inner object (the bytes between `obj` and `endobj`).
pub fn read_indirect_object_at(buf: &[u8], offset: usize) -> Result<(ObjectId, Object)> {
    let mut c = Cursor::at(buf, offset);
    c.skip_ws_and_comments();
    let num = read_integer(&mut c)?;
    if num < 0 {
        return Err(ParseError::Unexpected {
            offset,
            expected: "non-negative object number",
        }
        .into());
    }
    c.skip_ws_and_comments();
    let gen = read_integer(&mut c)?;
    if !(0..=u16::MAX as i64).contains(&gen) {
        return Err(ParseError::Unexpected {
            offset,
            expected: "u16 generation",
        }
        .into());
    }
    c.skip_ws_and_comments();
    if !c.eat(b"obj") {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "'obj' keyword",
        }
        .into());
    }
    c.skip_ws_and_comments();
    let body = read_object(&mut c)?;
    Ok(((num as u32, gen as u16), body))
}

// ── xref table ────────────────────────────────────────────────────────────

/// Parse the cross-reference data at `offset` and return the xref table
/// together with the trailer dictionary.
///
/// Two layouts are understood: when the data starts with the `xref`
/// keyword it is read as a classical table followed by
/// `trailer << ... >>` (PDF 1.4 and older); anything else is handed to
/// the xref-stream parser (PDF 1.5+).
///
/// # Errors
///
/// `XrefError::Start` when `offset` is beyond the buffer,
/// `ParseError::Unexpected` when the `trailer` keyword is missing after a
/// classical table, otherwise whatever the selected parser reports.
pub fn parse_xref_and_trailer(buf: &[u8], offset: usize) -> Result<(Xref, Dictionary)> {
    if offset > buf.len() {
        return Err(XrefError::Start.into());
    }

    let mut c = Cursor::at(buf, offset);
    c.skip_ws_and_comments();
    if !c.eat(b"xref") {
        // The stream parser re-reads from the original offset.
        return parse_xref_stream(buf, offset);
    }

    let table = parse_classical_xref(&mut c)?;
    c.skip_ws_and_comments();
    if !c.starts_with(b"trailer") {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "'trailer' keyword",
        }
        .into());
    }
    c.advance(b"trailer".len());
    c.skip_ws_and_comments();
    read_dictionary(&mut c).map(|trailer| (table, trailer))
}

/// Parse an xref-stream indirect object at `offset` per PDF spec §7.5.8.
/// The stream's dictionary doubles as the trailer.
///
/// Steps: read the indirect object, require `/Type /XRef`, validate `/W`,
/// `/Size` and the optional `/Index`, slice `/Length` bytes out of `buf`,
/// run the `/Filter` chain, then decode fixed-width rows into entries.
///
/// Tolerances (all deliberate):
/// - decoding stops quietly when the data runs out before the declared
///   row count is reached;
/// - rows with an unknown type field are skipped;
/// - rows whose offset / container / index do not fit the entry's 32-bit
///   fields are skipped rather than wrapped into a different value;
/// - an oversized generation is clamped to `u16::MAX`.
///
/// `/Length` must be a direct integer; an indirect reference surfaces as
/// the error returned by `as_i64`.
///
/// # Errors
///
/// `Error::Type` when the object is not a stream, `ParseError` for a
/// missing `/Type /XRef` or out-of-range `/W`, `/Size`, `/Index` or
/// `/Length`, plus anything reported by the filter chain.
fn parse_xref_stream(buf: &[u8], offset: usize) -> Result<(Xref, Dictionary)> {
    let (_id, body) = read_indirect_object_at(buf, offset)?;
    let stream = match body {
        Object::Stream(s) => s,
        other => {
            return Err(Error::Type {
                expected: "Stream (xref-stream)",
                found: other.type_name(),
            })
        }
    };
    let dict = stream.dict.clone();
    if !dict.has_type(b"XRef") {
        return Err(ParseError::Unexpected {
            offset,
            expected: "/Type /XRef",
        }
        .into());
    }

    // Required `/W [w1 w2 w3]` field widths. Each width is an i64 in
    // the PDF; accepted values are 0..=8. A hostile encoder can store
    // a negative or i64::MAX here, so reject anything outside that
    // range rather than truncating into a usize.
    let w_arr = dict.get(b"W")?.as_array()?;
    if w_arr.len() < 3 {
        return Err(ParseError::Other("xref-stream W has <3 fields".into()).into());
    }
    let w1 = read_xref_width(w_arr[0].as_i64()?)?;
    let w2 = read_xref_width(w_arr[1].as_i64()?)?;
    let w3 = read_xref_width(w_arr[2].as_i64()?)?;
    let row_len = w1
        .checked_add(w2)
        .and_then(|s| s.checked_add(w3))
        .ok_or_else(|| ParseError::Other("xref-stream W row overflow".into()))?;
    if row_len == 0 {
        return Err(ParseError::Other("xref-stream W sum is zero".into()).into());
    }

    let size = read_xref_count(dict.get(b"Size")?.as_i64()?)?;

    // Optional `/Index [first1 count1 ...]`; defaults to `[0 Size]`.
    // A trailing unpaired element is ignored.
    let mut index_pairs: Vec<(u32, u32)> = Vec::new();
    if let Some(idx_obj) = dict.get_optional(b"Index") {
        let arr = idx_obj.as_array()?;
        let mut i = 0;
        while i + 1 < arr.len() {
            let first = read_xref_count(arr[i].as_i64()?)?;
            let count = read_xref_count(arr[i + 1].as_i64()?)?;
            index_pairs.push((first, count));
            i += 2;
        }
    } else {
        index_pairs.push((0, size));
    }

    // Compute the encoded byte range from /Length without trusting
    // its sign or magnitude — a wrap on start+length here would let
    // the slice below bypass the EOF guard.
    let length_obj = dict.get(b"Length")?;
    let length = usize::try_from(length_obj.as_i64()?)
        .map_err(|_| ParseError::Other("xref-stream /Length out of range".into()))?;
    let start = stream
        .start_position
        .ok_or_else(|| ParseError::Other("xref-stream missing start_position".into()))?;
    let end = start
        .checked_add(length)
        .ok_or_else(|| ParseError::Other("xref-stream start+length overflow".into()))?;
    if end > buf.len() {
        return Err(ParseError::Other("xref-stream extends past EOF".into()).into());
    }
    let raw = &buf[start..end];

    // Filter chain — typically just FlateDecode for xref streams.
    // Non-name entries inside a /Filter array are ignored.
    let filters: Vec<String> = match dict.get_optional(b"Filter") {
        Some(Object::Name(n)) => vec![String::from_utf8_lossy(n).into_owned()],
        Some(Object::Array(arr)) => arr
            .iter()
            .filter_map(|o| match o {
                Object::Name(n) => Some(String::from_utf8_lossy(n).into_owned()),
                _ => None,
            })
            .collect(),
        _ => Vec::new(),
    };
    let decoded = if filters.is_empty() {
        raw.to_vec()
    } else {
        let parms = dict.get_optional(b"DecodeParms");
        crate::filter::apply_chain(raw, &filters, parms)?
    };

    let mut xref = Xref::new();
    let mut cursor = 0usize;
    'sections: for (first, count) in index_pairs {
        for i in 0..count {
            // Once the data is exhausted no later section can produce a
            // row either, so stop decoding altogether.
            let row_end = match cursor.checked_add(row_len) {
                Some(e) if e <= decoded.len() => e,
                _ => break 'sections,
            };
            let row = &decoded[cursor..row_end];
            cursor = row_end;

            // A zero-width type field defaults to type 1 (in-use).
            let kind = if w1 == 0 { 1 } else { read_be_uint(&row[..w1]) };
            let field2 = read_be_uint(&row[w1..w1 + w2]);
            let field3 = read_be_uint(&row[w1 + w2..]);
            let entry = match kind {
                0 => XrefEntry::Free,
                1 => {
                    // Fields can be up to 8 bytes wide; an offset that does
                    // not fit the 32-bit entry must not be wrapped into some
                    // other, valid-looking offset.
                    let Ok(byte_offset) = u32::try_from(field2) else {
                        continue;
                    };
                    XrefEntry::Normal {
                        offset: byte_offset,
                        generation: field3.min(u64::from(u16::MAX)) as u16,
                    }
                }
                2 => match (u32::try_from(field2), u32::try_from(field3)) {
                    (Ok(container), Ok(index)) => XrefEntry::Compressed { container, index },
                    _ => continue,
                },
                _ => continue,
            };
            let Some(obj_num) = first.checked_add(i) else {
                break;
            };
            xref.insert(obj_num, entry);
        }
    }
    xref.size = size;
    Ok((xref, dict))
}

/// Interprets `bytes` as a big-endian unsigned integer.
///
/// An empty slice yields 0. With more than eight bytes the most
/// significant ones are shifted out and only the last eight contribute.
#[inline]
fn read_be_uint(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}

/// Validates one `/W` entry (a field width in bytes) and converts it to
/// `usize`.
///
/// Accepted range is 0..=8: eight bytes is the most `read_be_uint` can
/// accumulate into its `u64` result.
///
/// # Errors
///
/// `ParseError::Other` for any value outside that range.
fn read_xref_width(v: i64) -> Result<usize> {
    match v {
        0..=8 => Ok(v as usize),
        _ => Err(ParseError::Other("xref-stream W width out of range".into()).into()),
    }
}

/// Converts an object number or entry count read from the PDF into `u32`,
/// the width used for object numbers in this crate's data model.
///
/// # Errors
///
/// `ParseError::Other` when `v` is negative or exceeds `u32::MAX`.
fn read_xref_count(v: i64) -> Result<u32> {
    match u32::try_from(v) {
        Ok(count) => Ok(count),
        Err(_) => Err(ParseError::Other("xref-stream count out of range".into()).into()),
    }
}

// Kept as a fallback path; the active predictor pass lives in
// `crate::filter::apply_png_predictor`.
#[allow(dead_code)]
fn apply_xref_predictor(
    data: &[u8],
    dict: &Dictionary,
    row_len: usize,
) -> Result<Vec<u8>> {
    let params = match dict.get_optional(b"DecodeParms") {
        Some(Object::Dictionary(d)) => Some(d.clone()),
        _ => None,
    };
    let predictor = params
        .as_ref()
        .and_then(|d| d.get_optional(b"Predictor").and_then(|o| o.as_i64().ok()))
        .unwrap_or(1);
    let columns = params
        .as_ref()
        .and_then(|d| d.get_optional(b"Columns").and_then(|o| o.as_i64().ok()))
        .map(|n| n as usize)
        .unwrap_or(row_len);
    if predictor == 1 {
        return Ok(data.to_vec());
    }
    if predictor < 10 {
        // TIFF predictor 2 is rare for xref streams; treat as identity.
        return Ok(data.to_vec());
    }
    // PNG predictors 10..15: row = [filter_type, ...columns bytes].
    let stride = columns + 1;
    let mut out: Vec<u8> = Vec::with_capacity(data.len());
    let mut prev_row: Vec<u8> = vec![0; columns];
    let mut chunks = data.chunks(stride);
    while let Some(row) = chunks.next() {
        if row.len() < stride {
            break;
        }
        let filter = row[0];
        let data_row = &row[1..];
        let mut decoded_row: Vec<u8> = Vec::with_capacity(columns);
        match filter {
            0 => decoded_row.extend_from_slice(data_row), // None
            1 => {
                // Sub
                for (i, &b) in data_row.iter().enumerate() {
                    let left = if i == 0 { 0 } else { decoded_row[i - 1] };
                    decoded_row.push(b.wrapping_add(left));
                }
            }
            2 => {
                // Up
                for (i, &b) in data_row.iter().enumerate() {
                    decoded_row.push(b.wrapping_add(prev_row[i]));
                }
            }
            3 => {
                // Average
                for (i, &b) in data_row.iter().enumerate() {
                    let left = if i == 0 { 0u16 } else { decoded_row[i - 1] as u16 };
                    let up = prev_row[i] as u16;
                    let avg = ((left + up) / 2) as u8;
                    decoded_row.push(b.wrapping_add(avg));
                }
            }
            4 => {
                // Paeth
                for (i, &b) in data_row.iter().enumerate() {
                    let left = if i == 0 { 0i16 } else { decoded_row[i - 1] as i16 };
                    let up = prev_row[i] as i16;
                    let up_left = if i == 0 { 0i16 } else { prev_row[i - 1] as i16 };
                    let p = left + up - up_left;
                    let pa = (p - left).abs();
                    let pb = (p - up).abs();
                    let pc = (p - up_left).abs();
                    let predictor = if pa <= pb && pa <= pc {
                        left
                    } else if pb <= pc {
                        up
                    } else {
                        up_left
                    };
                    decoded_row.push(b.wrapping_add(predictor as u8));
                }
            }
            _ => decoded_row.extend_from_slice(data_row),
        }
        out.extend_from_slice(&decoded_row);
        prev_row = decoded_row;
    }
    let _ = row_len;
    Ok(out)
}

fn parse_classical_xref(c: &mut Cursor<'_>) -> Result<Xref> {
    // Per spec §7.5.4: `xref\n` then subsections of `first count\n`
    // followed by `count` 20-byte rows:
    //   `nnnnnnnnnn ggggg n \n`  (in-use)
    //   `nnnnnnnnnn ggggg f \n`  (free)
    let mut xref = Xref::new();
    loop {
        c.skip_ws_and_comments();
        if c.starts_with(b"trailer") {
            break;
        }
        let first = read_integer(c)?;
        c.skip_ws_and_comments();
        let count = read_integer(c)?;
        if first < 0 || count < 0 {
            return Err(XrefError::Start.into());
        }
        // Skip subsection-header EOL (LF or CRLF).
        if matches!(c.peek(), Some(b'\r') | Some(b'\n')) {
            if c.peek() == Some(b'\r') {
                c.bump();
            }
            if c.peek() == Some(b'\n') {
                c.bump();
            }
        }
        for i in 0..count {
            // 20-byte fixed-width entry. We tolerate single-LF EOL too.
            if c.pos + 18 > c.buf.len() {
                return Err(XrefError::Start.into());
            }
            let line = &c.buf[c.pos..c.pos + 18];
            let off_str = std::str::from_utf8(&line[..10]).map_err(|_| XrefError::Start)?;
            let gen_str = std::str::from_utf8(&line[11..16]).map_err(|_| XrefError::Start)?;
            let kind = line[17];
            let offset: u64 = off_str
                .trim()
                .parse()
                .map_err(|_| XrefError::Start)?;
            // Spec says u16, but real-world PDFs emit larger values
            // (e.g. 65536 for free-list heads when the writer doesn't
            // follow §7.5.4). Clamp; these entries are rarely
            // referenced with the mismatched generation anyway.
            let generation: u16 = gen_str
                .trim()
                .parse::<u32>()
                .map(|n| n.min(u16::MAX as u32) as u16)
                .unwrap_or(0);
            let entry = match kind {
                b'n' => Some(XrefEntry::Normal {
                    offset: offset as u32,
                    generation,
                }),
                b'f' => Some(XrefEntry::Free),
                // Unknown row type — skip. Some PDFs emit corrupt
                // rows mid-table; readers tolerate them.
                _ => None,
            };
            if let Some(e) = entry {
                let object_number = (first as u32).saturating_add(i as u32);
                xref.insert(object_number, e);
            }
            // Consume 18-byte content + the writer's EOL flavor. Spec
            // says SP+LF / CR+LF; real-world PDFs ship LF-only /
            // SP+LF / CR+LF / SP+CR+LF. Walk whitespace until the
            // next row's first content byte or the terminator.
            c.advance(18);
            let mut saw_eol = false;
            for _ in 0..4 {
                match c.peek() {
                    Some(b' ') | Some(b'\t') => {
                        c.bump();
                    }
                    Some(b'\r') => {
                        c.bump();
                        if c.peek() == Some(b'\n') {
                            c.bump();
                        }
                        saw_eol = true;
                        break;
                    }
                    Some(b'\n') => {
                        c.bump();
                        saw_eol = true;
                        break;
                    }
                    _ => break,
                }
            }
            // Writers that omit the EOL entirely are rare-but-seen;
            // the 18-byte minimum lets the next iteration succeed as
            // long as the next row immediately follows.
            let _ = saw_eol;
        }
        let after_last = (first as u32).saturating_add(count as u32);
        if after_last > xref.size {
            xref.size = after_last;
        }
    }
    Ok(xref)
}