spectre_parse 1.0.0

Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
Documentation
//! PDF object model per spec §7.3: Boolean, Integer, Real, String,
//! Name, Array, Dictionary, Stream, Null, and the indirect reference.

use crate::error::{Error, Result};
use indexmap::IndexMap;

/// Identifier of an indirect object: `(object_number, generation)`.
///
/// Most PDFs only use generation 0; the spec permits incremental
/// updates to bump it. This tuple is the payload of
/// [`Object::Reference`].
pub type ObjectId = (u32, u16);

/// Syntax a PDF string was expressed in: literal form (`(text)`) vs
/// hex (`<48656C6C6F>`).
///
/// The enum is a fieldless tag, so it is `Copy` (no `.clone()` needed
/// when reading it out of a borrowed [`Object::String`]) and `Hash`
/// (usable as a map/set key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum StringFormat {
    /// Parenthesised literal string, e.g. `(Hello)`.
    Literal,
    /// Angle-bracketed hexadecimal string, e.g. `<48656C6C6F>`.
    Hexadecimal,
}

/// A single PDF value: one variant per object kind in spec §7.3, plus
/// [`Object::Reference`] for indirect references.
///
/// Dictionaries use [`IndexMap`] to preserve insertion order — several
/// PDF generators rely on first-key-wins for `/Type` lookups.
///
/// `PartialEq` is implemented by hand in this file rather than derived:
/// reals are compared by bit pattern and streams by dictionary only.
#[derive(Debug, Clone)]
pub enum Object {
    /// Boolean value.
    Boolean(bool),
    /// Integer value, stored as `i64`.
    Integer(i64),
    /// `f32` matches PDF rasterizer layout precision; f64 buys nothing.
    Real(f32),
    /// Name object, stored as raw bytes.
    Name(Vec<u8>),
    /// String object: raw bytes tagged with their [`StringFormat`].
    String(Vec<u8>, StringFormat),
    /// Array of objects, in order.
    Array(Vec<Object>),
    /// Dictionary object (see [`Dictionary`]).
    Dictionary(Dictionary),
    /// Stream object: a dictionary plus content bytes (see [`Stream`]).
    Stream(Stream),
    /// The null object.
    Null,
    /// Indirect reference resolved via [`crate::Document::get_object`].
    Reference(ObjectId),
}

impl PartialEq for Object {
    /// Variant-wise equality; objects of different variants are never
    /// equal.
    ///
    /// Two deliberate deviations from a derived implementation:
    /// * `Real` values are compared by bit pattern (`f32::to_bits`), so
    ///   a NaN equals a NaN with the same bits while `0.0` and `-0.0`
    ///   differ.
    /// * `Stream` values are compared by dictionary alone — stream
    ///   content is large and equality is rarely useful at this layer.
    fn eq(&self, other: &Self) -> bool {
        // Dispatch on `self` and probe `other` per arm; the outer match
        // is exhaustive, so a new variant must be handled explicitly.
        match self {
            Self::Null => matches!(other, Self::Null),
            Self::Boolean(lhs) => matches!(other, Self::Boolean(rhs) if lhs == rhs),
            Self::Integer(lhs) => matches!(other, Self::Integer(rhs) if lhs == rhs),
            Self::Real(lhs) => {
                matches!(other, Self::Real(rhs) if lhs.to_bits() == rhs.to_bits())
            }
            Self::Name(lhs) => matches!(other, Self::Name(rhs) if lhs == rhs),
            Self::String(lhs, lhs_format) => matches!(
                other,
                Self::String(rhs, rhs_format) if lhs == rhs && lhs_format == rhs_format
            ),
            Self::Array(lhs) => matches!(other, Self::Array(rhs) if lhs == rhs),
            Self::Dictionary(lhs) => matches!(other, Self::Dictionary(rhs) if lhs == rhs),
            Self::Stream(lhs) => matches!(other, Self::Stream(rhs) if lhs.dict == rhs.dict),
            Self::Reference(lhs) => matches!(other, Self::Reference(rhs) if lhs == rhs),
        }
    }
}

impl Object {
    /// Variant name as a `&'static str` (for error messages).
    pub fn type_name(&self) -> &'static str {
        match self {
            Self::Boolean(_) => "Boolean",
            Self::Integer(_) => "Integer",
            Self::Real(_) => "Real",
            Self::Name(_) => "Name",
            Self::String(_, _) => "String",
            Self::Array(_) => "Array",
            Self::Dictionary(_) => "Dictionary",
            Self::Stream(_) => "Stream",
            Self::Null => "Null",
            Self::Reference(_) => "Reference",
        }
    }

    /// Return `Ok(&[u8])` for `Object::Name`, error otherwise.
    pub fn as_name(&self) -> Result<&[u8]> {
        match self {
            Self::Name(n) => Ok(n),
            other => Err(Error::Type {
                expected: "Name",
                found: other.type_name(),
            }),
        }
    }

    /// Return `Ok(&[u8])` for `Object::String`, error otherwise.
    pub fn as_str(&self) -> Result<&[u8]> {
        match self {
            Self::String(b, _) => Ok(b),
            other => Err(Error::Type {
                expected: "String",
                found: other.type_name(),
            }),
        }
    }

    /// `Object::Integer` only; use [`Object::as_float`] for either.
    pub fn as_i64(&self) -> Result<i64> {
        match self {
            Self::Integer(n) => Ok(*n),
            other => Err(Error::Type {
                expected: "Integer",
                found: other.type_name(),
            }),
        }
    }

    /// `Object::Real` only; use [`Object::as_float`] for either.
    pub fn as_f32(&self) -> Result<f32> {
        match self {
            Self::Real(n) => Ok(*n),
            other => Err(Error::Type {
                expected: "Real",
                found: other.type_name(),
            }),
        }
    }

    /// Coerce Integer or Real to `f32`. PDF layout math accepts either.
    pub fn as_float(&self) -> Result<f32> {
        match self {
            Self::Integer(n) => Ok(*n as f32),
            Self::Real(n) => Ok(*n),
            other => Err(Error::Type {
                expected: "Numeric",
                found: other.type_name(),
            }),
        }
    }

    /// Return the array body for `Object::Array`.
    pub fn as_array(&self) -> Result<&Vec<Object>> {
        match self {
            Self::Array(a) => Ok(a),
            other => Err(Error::Type {
                expected: "Array",
                found: other.type_name(),
            }),
        }
    }

    /// Return the dictionary body for `Object::Dictionary`.
    pub fn as_dict(&self) -> Result<&Dictionary> {
        match self {
            Self::Dictionary(d) => Ok(d),
            Self::Stream(s) => Ok(&s.dict),
            other => Err(Error::Type {
                expected: "Dictionary",
                found: other.type_name(),
            }),
        }
    }

    /// Return the stream body for `Object::Stream`.
    pub fn as_stream(&self) -> Result<&Stream> {
        match self {
            Self::Stream(s) => Ok(s),
            other => Err(Error::Type {
                expected: "Stream",
                found: other.type_name(),
            }),
        }
    }

    /// Return the (num, gen) tuple for `Object::Reference`.
    pub fn as_reference(&self) -> Result<ObjectId> {
        match self {
            Self::Reference(id) => Ok(*id),
            other => Err(Error::Type {
                expected: "Reference",
                found: other.type_name(),
            }),
        }
    }
}

/// PDF dictionary `<<...>>`. Insertion-ordered (PDF spec doesn't
/// require it but several generators rely on it for `/Type` to come
/// first); look-up is by byte-slice key.
///
/// NOTE(review): the derived `PartialEq` defers to `IndexMap`'s
/// equality, which (per the indexmap docs) ignores entry order —
/// confirm that is the intended semantics for an ordered dictionary.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Dictionary {
    /// Backing map from raw key bytes to value. Private to this module;
    /// external code goes through the `Dictionary` methods.
    entries: IndexMap<Vec<u8>, Object>,
}

impl Dictionary {
    /// Empty dictionary.
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert `key -> value`. Replaces any existing entry.
    pub fn set<K, V>(&mut self, key: K, value: V)
    where
        K: Into<Vec<u8>>,
        V: Into<Object>,
    {
        self.entries.insert(key.into(), value.into());
    }

    /// Fetch `key`, returning `Error::DictKey` when absent.
    pub fn get(&self, key: &[u8]) -> Result<&Object> {
        self.entries
            .get(key)
            .ok_or_else(|| Error::DictKey(String::from_utf8_lossy(key).into_owned()))
    }

    /// Probe `key` without producing an error — for optional fields.
    pub fn get_optional(&self, key: &[u8]) -> Option<&Object> {
        self.entries.get(key)
    }

    /// Number of entries.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Whether the dictionary is empty.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Iterate `(key, value)` pairs in insertion order.
    pub fn iter(&self) -> indexmap::map::Iter<'_, Vec<u8>, Object> {
        self.entries.iter()
    }

 /// Mutable iteration of values for in-place rewriting (used by the
    /// decryption pass to overwrite encrypted string bytes).
    pub fn iter_mut(&mut self) -> indexmap::map::IterMut<'_, Vec<u8>, Object> {
        self.entries.iter_mut()
    }

    /// Iterate `key` byte-slices in insertion order.
    pub fn keys(&self) -> indexmap::map::Keys<'_, Vec<u8>, Object> {
        self.entries.keys()
    }

    /// Whether the dictionary has `/Type = name`.
    pub fn has_type(&self, name: &[u8]) -> bool {
        matches!(
            self.entries.get(b"Type".as_slice()),
            Some(Object::Name(n)) if n == name
        )
    }
}

/// PDF stream: a dictionary plus a (potentially compressed) content body.
/// Content stays in the source buffer until first decode; the
/// `content` field caches the decoded bytes after one
/// [`crate::Document`] call.
///
/// NOTE(review): the paragraph above says `content` caches *decoded*
/// bytes, while the field doc below says it holds the *raw* bytes —
/// confirm which is current.
///
/// `Stream` does not implement `PartialEq`; `Object`'s equality
/// compares streams by `dict` only.
#[derive(Debug, Clone)]
pub struct Stream {
    /// Dictionary describing the stream (length, filter chain, …).
    pub dict: Dictionary,
    /// Raw bytes between `stream` and `endstream`. Decode on demand.
    pub content: Vec<u8>,
    /// Offset of the first content byte inside the source buffer, when
    /// the stream came from a parsed PDF (vs being built in-memory).
    /// [`Stream::new`] leaves this as `None`.
    pub start_position: Option<usize>,
}

impl Stream {
    /// Build a stream from its dictionary + raw content bytes.
    pub fn new(dict: Dictionary, content: Vec<u8>) -> Self {
        Self {
            dict,
            content,
            start_position: None,
        }
    }
}

impl From<bool> for Object {
    fn from(v: bool) -> Self {
        Self::Boolean(v)
    }
}
impl From<i64> for Object {
    fn from(v: i64) -> Self {
        Self::Integer(v)
    }
}
impl From<i32> for Object {
    fn from(v: i32) -> Self {
        Self::Integer(v as i64)
    }
}
impl From<f32> for Object {
    fn from(v: f32) -> Self {
        Self::Real(v)
    }
}
impl From<Vec<u8>> for Object {
    fn from(v: Vec<u8>) -> Self {
        Self::String(v, StringFormat::Literal)
    }
}
impl From<&str> for Object {
    fn from(v: &str) -> Self {
        Self::Name(v.as_bytes().to_vec())
    }
}
impl From<Vec<Object>> for Object {
    fn from(v: Vec<Object>) -> Self {
        Self::Array(v)
    }
}
impl From<Dictionary> for Object {
    fn from(v: Dictionary) -> Self {
        Self::Dictionary(v)
    }
}
impl From<Stream> for Object {
    fn from(v: Stream) -> Self {
        Self::Stream(v)
    }
}