zpdf_parser/
lib.rs

1mod ccitt;
2mod crypt;
3pub mod filters;
4mod header;
5mod jbig2;
6mod lexer;
7mod object_parser;
8mod recovery;
9mod xref;
10
11pub use header::PdfHeader;
12pub use lexer::Lexer;
13pub use object_parser::ObjectParser;
14pub use xref::{XrefEntry, XrefTable};
15
16use std::cell::{OnceCell, RefCell};
17use std::collections::HashMap;
18use std::sync::Arc;
19use zpdf_core::{ObjectId, ParseLimits, PdfDict, PdfName, PdfObject, PdfStream, Result};
20
/// A single `/Type /ObjStm` after decoding: the bytes produced by the filter
/// pipeline together with the parsed offset table. Cached behind an `Arc`, so
/// a cache hit costs a refcount bump rather than a copy of the decoded buffer.
struct DecodedObjStm {
    /// Stream bytes after the filter pipeline has run.
    data: Arc<[u8]>,
    /// Value of `/First`: the byte offset in `data` at which object bodies start.
    first: usize,
    /// One `(obj_num, offset)` pair per contained object, kept in stream order
    /// so the vector index equals `index_in_stream`. Each offset is added to
    /// `first` to locate the member's body inside `data`.
    entries: Vec<(u32, usize)>,
}
32
33pub struct PdfFile {
34    data: Arc<[u8]>,
35    pub header: PdfHeader,
36    pub xref: XrefTable,
37    pub trailer: zpdf_core::PdfDict,
38    limits: ParseLimits,
39    /// Standard-security-handler decryptor, built once at open time from the
40    /// trailer `/Encrypt` dict. `None` for unencrypted (or unsupported-handler)
41    /// documents, in which case `resolve`/object-stream decoding are unchanged.
42    decryptor: Option<crypt::Decryptor>,
43    /// Cache of resolved top-level indirect objects, keyed by ObjectId.
44    /// `RefCell` suffices: `PdfFile` is never shared across threads in this
45    /// workspace (swap to `Mutex` if that ever changes).
46    object_cache: RefCell<HashMap<ObjectId, PdfObject>>,
47    /// Cache of decoded object streams, keyed by the ObjStm object number.
48    /// Avoids re-decoding the whole stream for every compressed object it holds.
49    objstm_cache: RefCell<HashMap<u32, Arc<DecodedObjStm>>>,
50    /// Lazily-built repair table: populated at most once by a full-file object
51    /// scan, the first time an xref offset turns out to hold the wrong object
52    /// (or no parseable object at all). The inner `None` means the scan itself
53    /// failed and is not retried. Open-time recovery is independent of this.
54    repair_table: OnceCell<Option<XrefTable>>,
55}
56
57impl PdfFile {
58    pub fn parse(data: impl Into<Arc<[u8]>>) -> Result<Self> {
59        Self::parse_with_limits(data, ParseLimits::default())
60    }
61
62    pub fn parse_with_limits(data: impl Into<Arc<[u8]>>, limits: ParseLimits) -> Result<Self> {
63        Self::parse_with_password_and_limits(data, b"", limits)
64    }
65
66    /// Open with a user/owner password (for documents the empty password cannot
67    /// decrypt). Returns [`zpdf_core::Error::WrongPassword`] if it authenticates
68    /// as neither.
69    pub fn parse_with_password(data: impl Into<Arc<[u8]>>, password: &[u8]) -> Result<Self> {
70        Self::parse_with_password_and_limits(data, password, ParseLimits::default())
71    }
72
73    pub fn parse_with_password_and_limits(
74        data: impl Into<Arc<[u8]>>,
75        password: &[u8],
76        limits: ParseLimits,
77    ) -> Result<Self> {
78        let data: Arc<[u8]> = data.into();
79        // A missing `%PDF` marker is not fatal on its own: a sliced/headerless
80        // fragment that begins directly with `N G obj` can still be opened by the
81        // object-scan recovery below. Defer the NotAPdf verdict until recovery
82        // has also come up empty.
83        let header_res = header::parse_header(&data);
84
85        // Try the normal xref pipeline first. Fall back to tail-scan recovery if
86        // it fails structurally OR yields a trailer whose /Root doesn't resolve.
87        let normal = xref::parse_xref_and_trailer(&data, &limits);
88        let (xref, trailer) = match normal {
89            Ok((xref, trailer)) if root_resolves(&data, &xref, &trailer, &limits) => {
90                (xref, trailer)
91            }
92            other => {
93                match &other {
94                    Err(e) => {
95                        tracing::warn!("xref parse failed ({e}); attempting tail-scan recovery")
96                    }
97                    Ok(_) => {
98                        tracing::warn!("xref /Root did not resolve; attempting tail-scan recovery")
99                    }
100                }
101                match recovery::scan_all_objects(&data, &limits) {
102                    Ok(recovered) => recovered,
103                    // Recovery failed: fall back to the normal parse if it at
104                    // least produced a table, else surface the most useful error.
105                    // For a file that never carried a `%PDF` marker, NotAPdf is
106                    // more accurate than the recovery layer's InvalidXref.
107                    Err(rec_err) => match other {
108                        Ok(parsed) => parsed,
109                        Err(_) if header_res.is_err() => return Err(zpdf_core::Error::NotAPdf),
110                        Err(_) => return Err(rec_err),
111                    },
112                }
113            }
114        };
115        // Past this point the document is structurally usable; if the version
116        // header was absent entirely, assume a modern default (matching
117        // header::parse_header's malformed-version fallback) rather than failing.
118        let header = header_res.unwrap_or(PdfHeader { major: 1, minor: 7 });
119
120        let mut file = Self {
121            data,
122            header,
123            xref,
124            trailer,
125            limits,
126            decryptor: None,
127            object_cache: RefCell::new(HashMap::new()),
128            objstm_cache: RefCell::new(HashMap::new()),
129            repair_table: OnceCell::new(),
130        };
131        // Build the decryptor *after* construction so it can use `resolve` to
132        // fetch the (never-encrypted) /Encrypt dict; `decryptor` is still `None`
133        // at this point, so that resolve does not try to decrypt it.
134        file.decryptor = file.build_decryptor(password)?;
135        Ok(file)
136    }
137
138    /// True when the trailer carries an `/Encrypt` dictionary. Note this does not
139    /// imply decryption succeeded — open the document to find out.
140    pub fn is_encrypted(&self) -> bool {
141        self.trailer.get("Encrypt").is_some()
142    }
143
144    /// Construct the Standard-security-handler decryptor from the trailer
145    /// `/Encrypt` dictionary, the first element of `/ID`, and the password.
146    /// `Ok(None)` for unencrypted documents or unsupported/degraded handlers;
147    /// `Err(WrongPassword)` when a non-empty password fails to authenticate.
148    fn build_decryptor(&self, password: &[u8]) -> Result<Option<crypt::Decryptor>> {
149        // /Encrypt is normally an indirect reference, but a direct dict is
150        // legal too (a direct dict has no object id to exempt from decryption).
151        // The /Encrypt dict is itself never encrypted; resolve it directly.
152        let Some(enc) = self.trailer.get("Encrypt") else {
153            return Ok(None);
154        };
155        let (enc_obj, encrypt_ref) = match enc {
156            PdfObject::Ref(r) => match self.resolve(*r) {
157                Ok(o) => (o, Some(*r)),
158                Err(_) => return Ok(None),
159            },
160            direct => (direct.clone(), None),
161        };
162        let Ok(enc_dict) = enc_obj.as_dict() else {
163            return Ok(None);
164        };
165        let id_first = self.first_id_bytes();
166        match crypt::Decryptor::from_encrypt_dict(enc_dict, &id_first, encrypt_ref, password) {
167            crypt::BuildResult::Decryptor(d) => Ok(Some(d)),
168            crypt::BuildResult::Degrade => Ok(None),
169            crypt::BuildResult::WrongPassword => Err(zpdf_core::Error::WrongPassword),
170        }
171    }
172
173    /// Raw bytes of the first element of the trailer `/ID` array (used in the
174    /// encryption key derivation). `/ID` is normally a direct array but may be an
175    /// indirect reference; resolve it (safe — `decryptor` is still `None` here,
176    /// and `/ID` is never encrypted). Empty if absent or malformed.
177    fn first_id_bytes(&self) -> Vec<u8> {
178        let arr = match self.trailer.get("ID") {
179            Some(PdfObject::Array(a)) => Some(std::borrow::Cow::Borrowed(a.as_slice())),
180            Some(PdfObject::Ref(r)) => self.resolve(*r).ok().and_then(|o| {
181                o.as_array()
182                    .ok()
183                    .map(|a| std::borrow::Cow::Owned(a.to_vec()))
184            }),
185            _ => None,
186        };
187        match arr.as_deref().and_then(|a| a.first()) {
188            Some(PdfObject::String(s)) => s.0.clone(),
189            _ => Vec::new(),
190        }
191    }
192
193    pub fn resolve(&self, id: zpdf_core::ObjectId) -> Result<PdfObject> {
194        self.resolve_depth(id, 0)
195    }
196
197    fn resolve_depth(&self, id: ObjectId, depth: u32) -> Result<PdfObject> {
198        /// Maximum length of a ref-to-ref chain (`1 0 obj 2 0 R endobj` ...)
199        /// followed before the reference is treated as null. Guards against
200        /// reference cycles (`A -> B -> A`) without a per-call visited set.
201        const MAX_REF_CHAIN: u32 = 32;
202        if depth > MAX_REF_CHAIN {
203            tracing::warn!(
204                "indirect reference chain longer than {MAX_REF_CHAIN} at {id}; treating as null"
205            );
206            return Ok(PdfObject::Null);
207        }
208
209        // Fast path: already resolved. The borrow ends with this block.
210        if let Some(obj) = self.object_cache.borrow().get(&id) {
211            return Ok(obj.clone());
212        }
213
214        // ISO 32000-1, 7.3.10: a reference to an object that is missing from
215        // the xref, or marked free, is a reference to the null object — not an
216        // error. BUT a damaged xref frequently just omits (or wrongly frees)
217        // objects that physically exist in the file, which would silently empty
218        // the page tree. So before treating a missing/free entry as null, give
219        // the lazy repair table (one memoized full-file scan) a chance to locate
220        // the real object. The Null is cached either way so the warning fires
221        // once per object and a genuinely-dangling ref stays cheap.
222        let obj = match self.xref.get(id) {
223            Some(XrefEntry::InUse { offset, .. }) => self.parse_at_offset_checked(*offset, id)?,
224            Some(XrefEntry::Compressed {
225                stream_obj,
226                index_in_stream,
227            }) => self.extract_from_object_stream(*stream_obj, *index_in_stream)?,
228            Some(XrefEntry::Free { .. }) => match self.repaired_object(id) {
229                Some(obj) => obj,
230                None => {
231                    tracing::warn!("reference to free object {id}; treating as null");
232                    PdfObject::Null
233                }
234            },
235            None => match self.repaired_object(id) {
236                Some(obj) => obj,
237                None => {
238                    tracing::warn!("reference to missing object {id}; treating as null");
239                    PdfObject::Null
240                }
241            },
242        };
243
244        // A top-level object body may itself be an indirect reference; follow
245        // the chain (depth-limited) so callers always get a direct value.
246        let obj = match obj {
247            PdfObject::Ref(next) => self.resolve_depth(next, depth + 1)?,
248            other => other,
249        };
250
251        self.object_cache.borrow_mut().insert(id, obj.clone());
252        Ok(obj)
253    }
254
255    /// Parse the indirect object at `offset`, validating that the header's
256    /// `(num, gen)` matches the id the xref claimed lives there. On mismatch or
257    /// parse failure, consult the lazily-built repair table (full-file object
258    /// scan, run at most once) before giving up.
259    fn parse_at_offset_checked(&self, offset: u64, id: ObjectId) -> Result<PdfObject> {
260        let parser = ObjectParser::new(&self.data, &self.limits);
261        match parser.parse_indirect_with_id(offset as usize) {
262            Ok((pid, mut obj)) if pid == id => {
263                // Top-level objects parsed straight from the file are encrypted;
264                // RC4-decrypt their strings and stream bytes in place (the
265                // decryptor skips the /Encrypt object itself). Objects pulled
266                // from an ObjStm take the Compressed arm and are already
267                // plaintext (the container was decrypted in get_or_decode_objstm).
268                if let Some(dec) = &self.decryptor {
269                    dec.decrypt_object(&mut obj, id);
270                }
271                Ok(obj)
272            }
273            Ok((pid, _)) => {
274                tracing::warn!("xref offset {offset} for {id} holds object {pid}; trying repair");
275                self.repaired_object(id).ok_or_else(|| {
276                    zpdf_core::Error::InvalidObject(
277                        offset,
278                        format!("xref entry for {id} points at object {pid}"),
279                    )
280                })
281            }
282            Err(e) => {
283                tracing::warn!("failed to parse {id} at xref offset {offset} ({e}); trying repair");
284                match self.repaired_object(id) {
285                    Some(obj) => Ok(obj),
286                    None => Err(e),
287                }
288            }
289        }
290    }
291
292    /// Look up `id` in the repair table, building the table on first use by
293    /// running tail-scan recovery over the whole file (memoized; the scan runs
294    /// at most once per `PdfFile`). Returns `None` if the scan failed, the id
295    /// is not in it, or the repaired entry does not actually hold `id`.
296    fn repaired_object(&self, id: ObjectId) -> Option<PdfObject> {
297        let table = self
298            .repair_table
299            .get_or_init(
300                || match recovery::scan_all_objects(&self.data, &self.limits) {
301                    Ok((table, _trailer)) => Some(table),
302                    Err(e) => {
303                        tracing::warn!("repair object scan failed: {e}");
304                        None
305                    }
306                },
307            )
308            .as_ref()?;
309        match table.get(id)? {
310            XrefEntry::InUse { offset, .. } => {
311                let parser = ObjectParser::new(&self.data, &self.limits);
312                let (pid, mut obj) = parser.parse_indirect_with_id(*offset as usize).ok()?;
313                if pid != id {
314                    return None;
315                }
316                if let Some(dec) = &self.decryptor {
317                    dec.decrypt_object(&mut obj, id);
318                }
319                Some(obj)
320            }
321            XrefEntry::Compressed {
322                stream_obj,
323                index_in_stream,
324            } => self
325                .extract_from_object_stream(*stream_obj, *index_in_stream)
326                .ok(),
327            XrefEntry::Free { .. } => None,
328        }
329    }
330
331    /// Resolve a stream object and decode its data through the filter pipeline.
332    /// `/Filter` and `/DecodeParms` may be indirect references (or arrays
333    /// containing them); resolve those before handing the dict to the filter
334    /// layer, which has no access to the file.
335    pub fn resolve_stream_data(&self, id: zpdf_core::ObjectId) -> Result<Vec<u8>> {
336        self.resolve_stream_data_inner(id, true)
337    }
338
339    fn resolve_stream_data_inner(
340        &self,
341        id: zpdf_core::ObjectId,
342        inline_globals: bool,
343    ) -> Result<Vec<u8>> {
344        let obj = self.resolve(id)?;
345        let stream = obj.as_stream()?;
346        match self.dict_with_resolved_filters(&stream.dict, inline_globals) {
347            Some(resolved) => filters::decode_stream(&stream.data, &resolved),
348            None => filters::decode_stream(&stream.data, &stream.dict),
349        }
350    }
351
352    /// If `/Filter`, `/DecodeParms`, or `/DP` is an indirect reference (or an
353    /// array containing one), return a clone of `dict` with those values
354    /// resolved one level. `None` when nothing needs resolving (common case —
355    /// avoids cloning the dict). When `inline_globals` is set, a DecodeParms
356    /// `/JBIG2Globals` stream reference is also inlined (see
357    /// [`Self::inline_jbig2_globals`]).
358    fn dict_with_resolved_filters(&self, dict: &PdfDict, inline_globals: bool) -> Option<PdfDict> {
359        const KEYS: [&str; 3] = ["Filter", "DecodeParms", "DP"];
360        // A DecodeParms dict containing a /JBIG2Globals reference needs the
361        // globals stream inlined even though the dict itself is direct.
362        let dict_needs_globals = |obj: &PdfObject| {
363            inline_globals
364                && matches!(obj, PdfObject::Dict(d)
365                    if matches!(d.get("JBIG2Globals"), Some(PdfObject::Ref(_))))
366        };
367        let needs_resolve = |obj: &PdfObject| match obj {
368            PdfObject::Ref(_) => true,
369            PdfObject::Array(a) => a
370                .iter()
371                .any(|e| matches!(e, PdfObject::Ref(_)) || dict_needs_globals(e)),
372            other => dict_needs_globals(other),
373        };
374        if !KEYS.iter().any(|k| dict.get(k).is_some_and(needs_resolve)) {
375            return None;
376        }
377
378        let resolve_shallow = |obj: &PdfObject| match obj {
379            PdfObject::Ref(r) => self.resolve(*r).unwrap_or(PdfObject::Null),
380            other => other.clone(),
381        };
382        let inline = |obj: PdfObject| {
383            if inline_globals {
384                self.inline_jbig2_globals(obj)
385            } else {
386                obj
387            }
388        };
389        let mut out = dict.clone();
390        for key in KEYS {
391            let Some(value) = dict.get(key) else { continue };
392            let resolved = match resolve_shallow(value) {
393                // Also resolve refs *inside* a (possibly itself indirect) array.
394                PdfObject::Array(a) => {
395                    PdfObject::Array(a.iter().map(resolve_shallow).map(inline).collect())
396                }
397                other => inline(other),
398            };
399            out.insert(PdfName::new(key), resolved);
400        }
401        Some(out)
402    }
403
404    /// If `obj` is a DecodeParms dict whose `/JBIG2Globals` is an indirect
405    /// stream reference, replace the reference with an inline string holding
406    /// the globals stream's *decoded* bytes — the filter layer has no file
407    /// access to chase references itself. The globals stream is decoded
408    /// without globals inlining of its own, so a crafted reference cycle
409    /// cannot recurse. Anything else passes through unchanged.
410    fn inline_jbig2_globals(&self, obj: PdfObject) -> PdfObject {
411        let PdfObject::Dict(mut d) = obj else {
412            return obj;
413        };
414        if let Some(PdfObject::Ref(r)) = d.get("JBIG2Globals") {
415            let r = *r;
416            let value = match self.resolve_stream_data_inner(r, false) {
417                Ok(bytes) => PdfObject::String(zpdf_core::PdfString(bytes)),
418                Err(e) => {
419                    tracing::warn!("failed to decode /JBIG2Globals stream {r}: {e}");
420                    PdfObject::Null
421                }
422            };
423            d.insert(PdfName::new("JBIG2Globals"), value);
424        }
425        PdfObject::Dict(d)
426    }
427
428    /// Extract an object from a compressed object stream (/Type /ObjStm).
429    fn extract_from_object_stream(
430        &self,
431        stream_obj_num: u32,
432        index_in_stream: u32,
433    ) -> Result<PdfObject> {
434        let objstm = self.get_or_decode_objstm(stream_obj_num)?;
435
436        let idx = index_in_stream as usize;
437        if idx >= objstm.entries.len() {
438            return Err(zpdf_core::Error::InvalidObject(
439                0,
440                format!(
441                    "object stream index {idx} out of range (n={})",
442                    objstm.entries.len()
443                ),
444            ));
445        }
446
447        let (_, obj_offset) = objstm.entries[idx];
448        let oob = || {
449            zpdf_core::Error::InvalidObject(0, "object stream member offset out of range".into())
450        };
451        let data_start = objstm.first.checked_add(obj_offset).ok_or_else(oob)?;
452        let data_end = if idx + 1 < objstm.entries.len() {
453            objstm
454                .first
455                .checked_add(objstm.entries[idx + 1].1)
456                .ok_or_else(oob)?
457        } else {
458            objstm.data.len()
459        };
460
461        // Member offsets are attacker-controlled and need not be monotonic, so
462        // guard against start > end and out-of-bounds before slicing (would
463        // otherwise panic).
464        let data_end = data_end.min(objstm.data.len());
465        if data_start > data_end {
466            return Err(zpdf_core::Error::InvalidObject(
467                0,
468                "object stream member offsets out of order".into(),
469            ));
470        }
471
472        let obj_data = &objstm.data[data_start..data_end];
473        let mut lexer = Lexer::new(obj_data, 0, &self.limits);
474        lexer.next_token()
475    }
476
477    /// Get a decoded object stream from cache, decoding+parsing it once on miss.
478    /// Resolves the ObjStm container directly from the xref (it cannot itself
479    /// live in another ObjStm) WITHOUT going through `self.resolve`, so it never
480    /// re-enters the `object_cache` borrow.
481    fn get_or_decode_objstm(&self, stream_obj_num: u32) -> Result<Arc<DecodedObjStm>> {
482        if let Some(hit) = self.objstm_cache.borrow().get(&stream_obj_num) {
483            return Ok(Arc::clone(hit));
484        }
485
486        let stream_id = zpdf_core::ObjectId(stream_obj_num, 0);
487        let stream_entry = self
488            .xref
489            .get(stream_id)
490            .ok_or(zpdf_core::Error::ObjectNotFound(stream_id))?;
491        let stream_obj = match stream_entry {
492            XrefEntry::InUse { offset, .. } => {
493                let parser = ObjectParser::new(&self.data, &self.limits);
494                parser.parse_indirect_at(*offset as usize)?
495            }
496            _ => return Err(zpdf_core::Error::ObjectNotFound(stream_id)),
497        };
498
499        let stream: &PdfStream = stream_obj.as_stream()?;
500        // Reject negative /N and /First (attacker-controlled): a negative i64 cast
501        // straight to usize becomes a near-usize::MAX value that overflows the
502        // offset arithmetic later.
503        let neg =
504            |what: &str| zpdf_core::Error::InvalidObject(0, format!("ObjStm {what} is negative"));
505        let n = usize::try_from(stream.dict.get_i64("N")?).map_err(|_| neg("/N"))?;
506        let first = usize::try_from(stream.dict.get_i64("First")?).map_err(|_| neg("/First"))?;
507
508        // An encrypted document encrypts the ObjStm *container* once (keyed by
509        // the container's own object id); its member objects are not separately
510        // encrypted. Decrypt the raw bytes before running the filter pipeline.
511        let raw: std::borrow::Cow<[u8]> = match &self.decryptor {
512            Some(dec) => std::borrow::Cow::Owned(
513                dec.decrypt_stream_bytes(zpdf_core::ObjectId(stream_obj_num, 0), &stream.data),
514            ),
515            None => std::borrow::Cow::Borrowed(&stream.data),
516        };
517        let decoded = filters::decode_stream(&raw, &stream.dict)?;
518
519        // Parse the header: N pairs of (obj_num, offset_within_data). Capacity is
520        // bounded by the header length to avoid a huge allocation on a bogus /N.
521        let header = &decoded[..first.min(decoded.len())];
522        let mut header_lexer = Lexer::new(header, 0, &self.limits);
523        let mut entries = Vec::with_capacity(n.min(header.len()));
524        for _ in 0..n {
525            let obj_num_tok = header_lexer.next_token()?;
526            let offset_tok = header_lexer.next_token()?;
527            let obj_num = obj_num_tok.as_i64()? as u32;
528            let offset = usize::try_from(offset_tok.as_i64()?).map_err(|_| neg("member offset"))?;
529            entries.push((obj_num, offset));
530        }
531
532        let decoded_arc = Arc::new(DecodedObjStm {
533            data: Arc::<[u8]>::from(decoded),
534            first,
535            entries,
536        });
537        self.objstm_cache
538            .borrow_mut()
539            .insert(stream_obj_num, Arc::clone(&decoded_arc));
540        Ok(decoded_arc)
541    }
542
543    pub fn data(&self) -> &[u8] {
544        &self.data
545    }
546
547    /// Force-build (once) and return the full-file repair-scan table, or `None`
548    /// if the scan found nothing. Shares the `OnceCell` the lazy per-object
549    /// repair uses, so the scan runs at most once per `PdfFile`.
550    pub fn force_repair_scan(&self) -> Option<&XrefTable> {
551        self.repair_table
552            .get_or_init(
553                || match recovery::scan_all_objects(&self.data, &self.limits) {
554                    Ok((table, _trailer)) => Some(table),
555                    Err(e) => {
556                        tracing::warn!("repair object scan failed: {e}");
557                        None
558                    }
559                },
560            )
561            .as_ref()
562    }
563
564    /// Every object id known to this file: the live xref unioned with the
565    /// repair-scan table (built on demand). Deduped and sorted by `(num, gen)`.
566    pub fn all_object_ids(&self) -> Vec<ObjectId> {
567        let mut ids: Vec<ObjectId> = self.xref.object_ids().collect();
568        if let Some(table) = self.force_repair_scan() {
569            ids.extend(table.object_ids());
570        }
571        ids.sort_by_key(|id| (id.0, id.1));
572        ids.dedup();
573        ids
574    }
575
576    /// All objects whose dict `/Type` equals `ty`, in `(num, gen)` order.
577    /// Resolves through [`Self::resolve`] (so /ObjStm members are decoded and,
578    /// for encrypted files, decrypted) and falls back to the repair table for
579    /// ids the live xref lacks. Bounded by `limits.max_objects`. The document
580    /// layer uses this to rebuild a page list when the /Pages tree is
581    /// unreachable.
582    pub fn find_objects_by_type(&self, ty: &str) -> Vec<ObjectId> {
583        let mut out = Vec::new();
584        for id in self.all_object_ids() {
585            if out.len() as u32 >= self.limits.max_objects {
586                break;
587            }
588            let obj = match self.resolve(id) {
589                Ok(PdfObject::Null) | Err(_) => self.repaired_object(id),
590                Ok(o) => Some(o),
591            };
592            let is_match = obj
593                .as_ref()
594                .and_then(|o| o.as_dict().ok())
595                .map(|d| d.get_name("Type").map(|t| t == ty).unwrap_or(false))
596                .unwrap_or(false);
597            if is_match {
598                out.push(id);
599            }
600        }
601        out
602    }
603}
604
605/// Best-effort check that the trailer's /Root points at a usable Catalog. Runs
606/// once at open time (before `PdfFile` exists), so it is a free function that
607/// parses the Root directly rather than going through `PdfFile::resolve`.
608///
609/// Lenient by design: a Root that is present but compressed/free is trusted
610/// (the normal pipeline handles it); only a direct InUse Root is strictly
611/// checked for `/Type /Catalog`. A missing Root triggers recovery.
612fn root_resolves(
613    data: &[u8],
614    xref: &XrefTable,
615    trailer: &zpdf_core::PdfDict,
616    limits: &ParseLimits,
617) -> bool {
618    let Ok(root_ref) = trailer.get_ref("Root") else {
619        return false;
620    };
621    match xref.get(root_ref) {
622        Some(XrefEntry::InUse { offset, .. }) => {
623            let parser = ObjectParser::new(data, limits);
624            matches!(
625                parser
626                    .parse_indirect_at(*offset as usize)
627                    .ok()
628                    .and_then(|o| o
629                        .as_dict()
630                        .ok()
631                        .map(|d| d.get_name("Type").unwrap_or("").to_string())),
632                Some(t) if t == "Catalog"
633            )
634        }
635        Some(_) => true, // compressed/free-but-present: trust the normal pipeline
636        None => false,
637    }
638}
639
640#[cfg(test)]
641mod tests {
642    use super::*;
643
644    /// Validates the object-stream header parse + body-slicing arithmetic that
645    /// `get_or_decode_objstm`/`extract_from_object_stream` rely on, without
646    /// needing a full xref-stream fixture.
647    #[test]
648    fn objstm_header_and_slicing_math() {
649        let limits = ParseLimits::default();
650        let o10 = b"<< /Type /Catalog /Pages 2 0 R >>";
651        let o11 = b"42";
652        let header = format!("10 0 11 {} ", o10.len() + 1);
653        let first = header.len();
654        let mut decoded = header.into_bytes();
655        decoded.extend_from_slice(o10);
656        decoded.push(b' ');
657        decoded.extend_from_slice(o11);
658
659        // Mirror the header parse.
660        let mut hx = Lexer::new(&decoded[..first], 0, &limits);
661        let mut entries = Vec::new();
662        for _ in 0..2 {
663            let num = hx.next_token().unwrap().as_i64().unwrap() as u32;
664            let off = hx.next_token().unwrap().as_i64().unwrap() as usize;
665            entries.push((num, off));
666        }
667        assert_eq!(entries, vec![(10, 0), (11, o10.len() + 1)]);
668
669        // Slice + lex object index 0 (obj 10).
670        let (start0, end0) = (first + entries[0].1, first + entries[1].1);
671        let obj = Lexer::new(&decoded[start0..end0], 0, &limits)
672            .next_token()
673            .unwrap();
674        assert!(obj.as_dict().is_ok(), "obj 10 should lex as a dict");
675
676        // Slice + lex object index 1 (obj 11) — runs to end of decoded.
677        let start1 = first + entries[1].1;
678        let n = Lexer::new(&decoded[start1..], 0, &limits)
679            .next_token()
680            .unwrap();
681        assert_eq!(n.as_i64().unwrap(), 42);
682    }
683
684    /// Assemble a minimal PDF: the given `(num, body)` objects at gen 0, a
685    /// traditional xref covering each (one single-entry subsection apiece),
686    /// and a trailer pointing /Root at `root`.
687    fn build_pdf(objects: &[(u32, &str)], root: u32) -> Vec<u8> {
688        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
689        let mut offsets = Vec::new();
690        for (num, body) in objects {
691            offsets.push((*num, d.len()));
692            d.extend_from_slice(format!("{num} 0 obj\n{body}\nendobj\n").as_bytes());
693        }
694        let xref_off = d.len();
695        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
696        for (num, off) in &offsets {
697            d.extend_from_slice(format!("{num} 1\n{off:010} 00000 n \n").as_bytes());
698        }
699        let size = objects.iter().map(|(n, _)| n + 1).max().unwrap_or(1);
700        d.extend_from_slice(
701            format!("trailer\n<< /Size {size} /Root {root} 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
702                .as_bytes(),
703        );
704        d
705    }
706
707    #[test]
708    fn dangling_ref_resolves_to_null() {
709        // Object 9 is referenced but absent from the xref entirely: per
710        // ISO 32000 7.3.10 it resolves to null, not an error.
711        let pdf = build_pdf(&[(1, "<< /Type /Catalog /Pages 9 0 R >>")], 1);
712        let file = PdfFile::parse(pdf).unwrap();
713        assert_eq!(file.resolve(ObjectId(9, 0)).unwrap(), PdfObject::Null);
714        // Second resolve hits the cache (warn fires once).
715        assert_eq!(file.resolve(ObjectId(9, 0)).unwrap(), PdfObject::Null);
716    }
717
718    #[test]
719    fn free_entry_resolves_to_null() {
720        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
721        let off1 = d.len();
722        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
723        let xref_off = d.len();
724        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n1 1\n");
725        d.extend_from_slice(format!("{off1:010} 00000 n \n").as_bytes());
726        d.extend_from_slice(b"2 1\n0000000000 00000 f \n");
727        d.extend_from_slice(
728            format!("trailer\n<< /Size 3 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
729                .as_bytes(),
730        );
731
732        let file = PdfFile::parse(d).unwrap();
733        assert!(matches!(
734            file.xref.get(ObjectId(2, 0)),
735            Some(XrefEntry::Free { .. })
736        ));
737        assert_eq!(file.resolve(ObjectId(2, 0)).unwrap(), PdfObject::Null);
738    }
739
740    #[test]
741    fn header_mismatch_triggers_lazy_repair() {
742        // The xref entry for object 3 points at object 2's offset; the real
743        // object 3 lives elsewhere. resolve(3) must repair via the lazy scan.
744        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
745        let off1 = d.len();
746        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
747        let off2 = d.len();
748        d.extend_from_slice(b"2 0 obj\n<< /Marker /Wrong >>\nendobj\n");
749        // Real object 3 — its offset is deliberately NOT in the xref.
750        d.extend_from_slice(b"3 0 obj\n<< /Marker /Real >>\nendobj\n");
751        let xref_off = d.len();
752        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
753        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
754        d.extend_from_slice(format!("2 1\n{off2:010} 00000 n \n").as_bytes());
755        d.extend_from_slice(format!("3 1\n{off2:010} 00000 n \n").as_bytes()); // wrong!
756        d.extend_from_slice(
757            format!("trailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
758                .as_bytes(),
759        );
760
761        let file = PdfFile::parse(d).unwrap();
762        let obj = file.resolve(ObjectId(3, 0)).unwrap();
763        assert_eq!(obj.as_dict().unwrap().get_name("Marker").unwrap(), "Real");
764        // Object 2 still resolves normally (its entry was correct).
765        let obj2 = file.resolve(ObjectId(2, 0)).unwrap();
766        assert_eq!(obj2.as_dict().unwrap().get_name("Marker").unwrap(), "Wrong");
767    }
768
769    #[test]
770    fn ref_to_ref_chain_resolves() {
771        let pdf = build_pdf(
772            &[
773                (1, "<< /Type /Catalog /Pages 2 0 R >>"),
774                (4, "5 0 R"),
775                (5, "42"),
776            ],
777            1,
778        );
779        let file = PdfFile::parse(pdf).unwrap();
780        assert_eq!(
781            file.resolve(ObjectId(4, 0)).unwrap(),
782            PdfObject::Integer(42)
783        );
784    }
785
786    #[test]
787    fn ref_cycle_resolves_to_null() {
788        // 4 -> 5 -> 4: the chain guard must terminate (no hang/stack overflow)
789        // and degrade the value to null.
790        let pdf = build_pdf(
791            &[
792                (1, "<< /Type /Catalog /Pages 2 0 R >>"),
793                (4, "5 0 R"),
794                (5, "4 0 R"),
795            ],
796            1,
797        );
798        let file = PdfFile::parse(pdf).unwrap();
799        assert_eq!(file.resolve(ObjectId(4, 0)).unwrap(), PdfObject::Null);
800    }
801
802    #[test]
803    fn indirect_filter_is_resolved() {
804        use flate2::write::ZlibEncoder;
805        use flate2::Compression;
806        use std::io::Write;
807
808        let payload = b"indirect filter payload";
809        let mut enc = ZlibEncoder::new(Vec::new(), Compression::default());
810        enc.write_all(payload).unwrap();
811        let compressed = enc.finish().unwrap();
812
813        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
814        let off1 = d.len();
815        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
816        let off3 = d.len();
817        d.extend_from_slice(
818            format!(
819                "3 0 obj\n<< /Length {} /Filter 4 0 R >>\nstream\n",
820                compressed.len()
821            )
822            .as_bytes(),
823        );
824        d.extend_from_slice(&compressed);
825        d.extend_from_slice(b"\nendstream\nendobj\n");
826        let off4 = d.len();
827        d.extend_from_slice(b"4 0 obj\n/FlateDecode\nendobj\n");
828        let xref_off = d.len();
829        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
830        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
831        d.extend_from_slice(format!("3 1\n{off3:010} 00000 n \n").as_bytes());
832        d.extend_from_slice(format!("4 1\n{off4:010} 00000 n \n").as_bytes());
833        d.extend_from_slice(
834            format!("trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
835                .as_bytes(),
836        );
837
838        let file = PdfFile::parse(d).unwrap();
839        let data = file.resolve_stream_data(ObjectId(3, 0)).unwrap();
840        assert_eq!(data, payload);
841    }
842
843    /// An image stream with /Filter /JBIG2Decode whose /DecodeParms holds an
844    /// indirect /JBIG2Globals stream: the globals reference must be resolved,
845    /// decoded (here through its own FlateDecode), and inlined before the
846    /// filter layer runs. The globals carry the page-info segment; the image
847    /// stream carries an MMR generic region (two "WWWBBWWW" rows).
848    #[test]
849    fn jbig2_globals_stream_is_resolved_and_decoded() {
850        use flate2::write::ZlibEncoder;
851        use flate2::Compression;
852        use std::io::Write;
853
854        // Globals: segment 0, type 48 (page information), page 1, 8x2 page.
855        let globals: Vec<u8> = [
856            &[0, 0, 0, 0, 0x30, 0x00, 0x01, 0, 0, 0, 19][..], // header, length 19
857            &[0, 0, 0, 8, 0, 0, 0, 2][..],                    // width 8, height 2
858            &[0; 8][..],                                      // x/y resolution
859            &[0x00, 0, 0][..],                                // flags, striping
860        ]
861        .concat();
862        let mut gz = ZlibEncoder::new(Vec::new(), Compression::default());
863        gz.write_all(&globals).unwrap();
864        let globals_z = gz.finish().unwrap();
865
866        // Image stream: segment 1, type 38 (immediate generic region), MMR
867        // payload 0x31 0xF8 = T.6-coded WWWBBWWW twice.
868        let image: Vec<u8> = [
869            &[0, 0, 0, 1, 0x26, 0x00, 0x01, 0, 0, 0, 20][..], // header, length 20
870            &[0, 0, 0, 8, 0, 0, 0, 2][..],                    // region 8x2 …
871            &[0, 0, 0, 0, 0, 0, 0, 0, 0x00][..],              // … at (0,0), OR
872            &[0x01, 0x31, 0xF8][..],                          // MMR flag + data
873        ]
874        .concat();
875
876        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
877        let off1 = d.len();
878        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
879        let off3 = d.len();
880        d.extend_from_slice(
881            format!(
882                "3 0 obj\n<< /Length {} /Filter /JBIG2Decode \
883                 /DecodeParms << /JBIG2Globals 4 0 R >> >>\nstream\n",
884                image.len()
885            )
886            .as_bytes(),
887        );
888        d.extend_from_slice(&image);
889        d.extend_from_slice(b"\nendstream\nendobj\n");
890        let off4 = d.len();
891        d.extend_from_slice(
892            format!(
893                "4 0 obj\n<< /Length {} /Filter /FlateDecode >>\nstream\n",
894                globals_z.len()
895            )
896            .as_bytes(),
897        );
898        d.extend_from_slice(&globals_z);
899        d.extend_from_slice(b"\nendstream\nendobj\n");
900        let xref_off = d.len();
901        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
902        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
903        d.extend_from_slice(format!("3 1\n{off3:010} 00000 n \n").as_bytes());
904        d.extend_from_slice(format!("4 1\n{off4:010} 00000 n \n").as_bytes());
905        d.extend_from_slice(
906            format!("trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
907                .as_bytes(),
908        );
909
910        let file = PdfFile::parse(d).unwrap();
911        let data = file.resolve_stream_data(ObjectId(3, 0)).unwrap();
912        // WWWBBWWW in PDF 1-bpc polarity (black = 0): 1110 0111, both rows.
913        assert_eq!(data, vec![0xE7, 0xE7]);
914    }
915}
zpdf_parser/lib.rs

zpdf_parser/
lib.rs