zpdf_parser/
lib.rs

1mod ccitt;
2mod crypt;
3pub mod filters;
4mod header;
5mod jbig2;
6mod lexer;
7mod object_parser;
8mod recovery;
9mod xref;
10
11pub use header::PdfHeader;
12pub use lexer::Lexer;
13pub use object_parser::ObjectParser;
14pub use xref::{XrefEntry, XrefTable};
15
16use std::cell::{OnceCell, RefCell};
17use std::collections::HashMap;
18use std::sync::Arc;
19use zpdf_core::{ObjectId, ParseLimits, PdfDict, PdfName, PdfObject, PdfStream, Result};
20
/// One fully-decoded /Type /ObjStm: decoded bytes + parsed offset table, shared
/// via Arc so a cache hit is a refcount bump, not a copy of the decoded buffer.
///
/// Built by `PdfFile::get_or_decode_objstm` and read by
/// `PdfFile::extract_from_object_stream`.
struct DecodedObjStm {
    /// Decoded stream bytes (after the filter pipeline).
    data: Arc<[u8]>,
    /// `/First`: byte offset within `data` where object bodies begin.
    first: usize,
    /// Parsed header: (obj_num, offset) per contained object, in stream order
    /// (index == `index_in_stream`). Each offset is relative to `first`, not to
    /// the start of `data`: the extractor adds `first` before slicing.
    entries: Vec<(u32, usize)>,
}
32
/// A parsed PDF document: the raw bytes plus the header, cross-reference table
/// and trailer needed to resolve indirect objects on demand.
pub struct PdfFile {
    /// Raw bytes of the whole file; the object parsers created by this type
    /// read from it, and `data()` exposes it to callers.
    data: Arc<[u8]>,
    /// Result of `header::parse_header` over `data`, computed at open time.
    pub header: PdfHeader,
    /// Cross-reference table: either the normal xref pipeline's result or the
    /// table produced by the open-time recovery scan.
    pub xref: XrefTable,
    /// Trailer dictionary obtained together with `xref`.
    pub trailer: zpdf_core::PdfDict,
    /// Parse limits handed to every lexer / object parser this type creates.
    limits: ParseLimits,
    /// Standard-security-handler decryptor, built once at open time from the
    /// trailer `/Encrypt` dict. `None` for unencrypted (or unsupported-handler)
    /// documents, in which case `resolve`/object-stream decoding are unchanged.
    decryptor: Option<crypt::Decryptor>,
    /// Cache of resolved top-level indirect objects, keyed by ObjectId.
    /// `RefCell` suffices: `PdfFile` is never shared across threads in this
    /// workspace (swap to `Mutex` if that ever changes).
    object_cache: RefCell<HashMap<ObjectId, PdfObject>>,
    /// Cache of decoded object streams, keyed by the ObjStm object number.
    /// Avoids re-decoding the whole stream for every compressed object it holds.
    objstm_cache: RefCell<HashMap<u32, Arc<DecodedObjStm>>>,
    /// Lazily-built repair table: populated at most once by a full-file object
    /// scan, the first time an xref offset turns out to hold the wrong object
    /// (or no parseable object at all). The inner `None` means the scan itself
    /// failed and is not retried. Open-time recovery is independent of this.
    repair_table: OnceCell<Option<XrefTable>>,
}
56
57impl PdfFile {
58    pub fn parse(data: impl Into<Arc<[u8]>>) -> Result<Self> {
59        Self::parse_with_limits(data, ParseLimits::default())
60    }
61
62    pub fn parse_with_limits(data: impl Into<Arc<[u8]>>, limits: ParseLimits) -> Result<Self> {
63        let data: Arc<[u8]> = data.into();
64        let header = header::parse_header(&data)?;
65
66        // Try the normal xref pipeline first. Fall back to tail-scan recovery if
67        // it fails structurally OR yields a trailer whose /Root doesn't resolve.
68        let normal = xref::parse_xref_and_trailer(&data, &limits);
69        let (xref, trailer) = match normal {
70            Ok((xref, trailer)) if root_resolves(&data, &xref, &trailer, &limits) => {
71                (xref, trailer)
72            }
73            other => {
74                match &other {
75                    Err(e) => {
76                        tracing::warn!("xref parse failed ({e}); attempting tail-scan recovery")
77                    }
78                    Ok(_) => {
79                        tracing::warn!("xref /Root did not resolve; attempting tail-scan recovery")
80                    }
81                }
82                match recovery::scan_all_objects(&data, &limits) {
83                    Ok(recovered) => recovered,
84                    // Recovery failed: fall back to the normal parse if it at
85                    // least produced a table, else surface the recovery error.
86                    Err(rec_err) => match other {
87                        Ok(parsed) => parsed,
88                        Err(_) => return Err(rec_err),
89                    },
90                }
91            }
92        };
93
94        let mut file = Self {
95            data,
96            header,
97            xref,
98            trailer,
99            limits,
100            decryptor: None,
101            object_cache: RefCell::new(HashMap::new()),
102            objstm_cache: RefCell::new(HashMap::new()),
103            repair_table: OnceCell::new(),
104        };
105        // Build the decryptor *after* construction so it can use `resolve` to
106        // fetch the (never-encrypted) /Encrypt dict; `decryptor` is still `None`
107        // at this point, so that resolve does not try to decrypt it.
108        file.decryptor = file.build_decryptor();
109        Ok(file)
110    }
111
112    /// Construct the Standard-security-handler decryptor from the trailer
113    /// `/Encrypt` dictionary and the first element of `/ID`. Returns `None` for
114    /// unencrypted documents or unsupported handlers (AES, non-Standard).
115    fn build_decryptor(&self) -> Option<crypt::Decryptor> {
116        // /Encrypt is normally an indirect reference, but a direct dict is
117        // legal too (a direct dict has no object id to exempt from decryption).
118        // The /Encrypt dict is itself never encrypted; resolve it directly.
119        let (enc_obj, encrypt_ref) = match self.trailer.get("Encrypt")? {
120            PdfObject::Ref(r) => (self.resolve(*r).ok()?, Some(*r)),
121            direct => (direct.clone(), None),
122        };
123        let enc_dict = enc_obj.as_dict().ok()?;
124        let id_first = self.first_id_bytes();
125        crypt::Decryptor::from_encrypt_dict(enc_dict, &id_first, encrypt_ref)
126    }
127
128    /// Raw bytes of the first element of the trailer `/ID` array (used in the
129    /// encryption key derivation). `/ID` is normally a direct array but may be an
130    /// indirect reference; resolve it (safe — `decryptor` is still `None` here,
131    /// and `/ID` is never encrypted). Empty if absent or malformed.
132    fn first_id_bytes(&self) -> Vec<u8> {
133        let arr = match self.trailer.get("ID") {
134            Some(PdfObject::Array(a)) => Some(std::borrow::Cow::Borrowed(a.as_slice())),
135            Some(PdfObject::Ref(r)) => self.resolve(*r).ok().and_then(|o| {
136                o.as_array()
137                    .ok()
138                    .map(|a| std::borrow::Cow::Owned(a.to_vec()))
139            }),
140            _ => None,
141        };
142        match arr.as_deref().and_then(|a| a.first()) {
143            Some(PdfObject::String(s)) => s.0.clone(),
144            _ => Vec::new(),
145        }
146    }
147
148    pub fn resolve(&self, id: zpdf_core::ObjectId) -> Result<PdfObject> {
149        self.resolve_depth(id, 0)
150    }
151
152    fn resolve_depth(&self, id: ObjectId, depth: u32) -> Result<PdfObject> {
153        /// Maximum length of a ref-to-ref chain (`1 0 obj 2 0 R endobj` ...)
154        /// followed before the reference is treated as null. Guards against
155        /// reference cycles (`A -> B -> A`) without a per-call visited set.
156        const MAX_REF_CHAIN: u32 = 32;
157        if depth > MAX_REF_CHAIN {
158            tracing::warn!(
159                "indirect reference chain longer than {MAX_REF_CHAIN} at {id}; treating as null"
160            );
161            return Ok(PdfObject::Null);
162        }
163
164        // Fast path: already resolved. The borrow ends with this block.
165        if let Some(obj) = self.object_cache.borrow().get(&id) {
166            return Ok(obj.clone());
167        }
168
169        // ISO 32000-1, 7.3.10: a reference to an object that is missing from
170        // the xref, or marked free, is a reference to the null object — not an
171        // error. Cache the Null so the warning fires once per object.
172        let Some(entry) = self.xref.get(id) else {
173            tracing::warn!("reference to missing object {id}; treating as null");
174            self.object_cache.borrow_mut().insert(id, PdfObject::Null);
175            return Ok(PdfObject::Null);
176        };
177        let obj = match entry {
178            XrefEntry::InUse { offset, .. } => self.parse_at_offset_checked(*offset, id)?,
179            XrefEntry::Compressed {
180                stream_obj,
181                index_in_stream,
182            } => self.extract_from_object_stream(*stream_obj, *index_in_stream)?,
183            XrefEntry::Free { .. } => {
184                tracing::warn!("reference to free object {id}; treating as null");
185                PdfObject::Null
186            }
187        };
188
189        // A top-level object body may itself be an indirect reference; follow
190        // the chain (depth-limited) so callers always get a direct value.
191        let obj = match obj {
192            PdfObject::Ref(next) => self.resolve_depth(next, depth + 1)?,
193            other => other,
194        };
195
196        self.object_cache.borrow_mut().insert(id, obj.clone());
197        Ok(obj)
198    }
199
200    /// Parse the indirect object at `offset`, validating that the header's
201    /// `(num, gen)` matches the id the xref claimed lives there. On mismatch or
202    /// parse failure, consult the lazily-built repair table (full-file object
203    /// scan, run at most once) before giving up.
204    fn parse_at_offset_checked(&self, offset: u64, id: ObjectId) -> Result<PdfObject> {
205        let parser = ObjectParser::new(&self.data, &self.limits);
206        match parser.parse_indirect_with_id(offset as usize) {
207            Ok((pid, mut obj)) if pid == id => {
208                // Top-level objects parsed straight from the file are encrypted;
209                // RC4-decrypt their strings and stream bytes in place (the
210                // decryptor skips the /Encrypt object itself). Objects pulled
211                // from an ObjStm take the Compressed arm and are already
212                // plaintext (the container was decrypted in get_or_decode_objstm).
213                if let Some(dec) = &self.decryptor {
214                    dec.decrypt_object(&mut obj, id);
215                }
216                Ok(obj)
217            }
218            Ok((pid, _)) => {
219                tracing::warn!("xref offset {offset} for {id} holds object {pid}; trying repair");
220                self.repaired_object(id).ok_or_else(|| {
221                    zpdf_core::Error::InvalidObject(
222                        offset,
223                        format!("xref entry for {id} points at object {pid}"),
224                    )
225                })
226            }
227            Err(e) => {
228                tracing::warn!("failed to parse {id} at xref offset {offset} ({e}); trying repair");
229                match self.repaired_object(id) {
230                    Some(obj) => Ok(obj),
231                    None => Err(e),
232                }
233            }
234        }
235    }
236
237    /// Look up `id` in the repair table, building the table on first use by
238    /// running tail-scan recovery over the whole file (memoized; the scan runs
239    /// at most once per `PdfFile`). Returns `None` if the scan failed, the id
240    /// is not in it, or the repaired entry does not actually hold `id`.
241    fn repaired_object(&self, id: ObjectId) -> Option<PdfObject> {
242        let table = self
243            .repair_table
244            .get_or_init(
245                || match recovery::scan_all_objects(&self.data, &self.limits) {
246                    Ok((table, _trailer)) => Some(table),
247                    Err(e) => {
248                        tracing::warn!("repair object scan failed: {e}");
249                        None
250                    }
251                },
252            )
253            .as_ref()?;
254        match table.get(id)? {
255            XrefEntry::InUse { offset, .. } => {
256                let parser = ObjectParser::new(&self.data, &self.limits);
257                let (pid, mut obj) = parser.parse_indirect_with_id(*offset as usize).ok()?;
258                if pid != id {
259                    return None;
260                }
261                if let Some(dec) = &self.decryptor {
262                    dec.decrypt_object(&mut obj, id);
263                }
264                Some(obj)
265            }
266            XrefEntry::Compressed {
267                stream_obj,
268                index_in_stream,
269            } => self
270                .extract_from_object_stream(*stream_obj, *index_in_stream)
271                .ok(),
272            XrefEntry::Free { .. } => None,
273        }
274    }
275
276    /// Resolve a stream object and decode its data through the filter pipeline.
277    /// `/Filter` and `/DecodeParms` may be indirect references (or arrays
278    /// containing them); resolve those before handing the dict to the filter
279    /// layer, which has no access to the file.
280    pub fn resolve_stream_data(&self, id: zpdf_core::ObjectId) -> Result<Vec<u8>> {
281        self.resolve_stream_data_inner(id, true)
282    }
283
284    fn resolve_stream_data_inner(
285        &self,
286        id: zpdf_core::ObjectId,
287        inline_globals: bool,
288    ) -> Result<Vec<u8>> {
289        let obj = self.resolve(id)?;
290        let stream = obj.as_stream()?;
291        match self.dict_with_resolved_filters(&stream.dict, inline_globals) {
292            Some(resolved) => filters::decode_stream(&stream.data, &resolved),
293            None => filters::decode_stream(&stream.data, &stream.dict),
294        }
295    }
296
297    /// If `/Filter`, `/DecodeParms`, or `/DP` is an indirect reference (or an
298    /// array containing one), return a clone of `dict` with those values
299    /// resolved one level. `None` when nothing needs resolving (common case —
300    /// avoids cloning the dict). When `inline_globals` is set, a DecodeParms
301    /// `/JBIG2Globals` stream reference is also inlined (see
302    /// [`Self::inline_jbig2_globals`]).
303    fn dict_with_resolved_filters(&self, dict: &PdfDict, inline_globals: bool) -> Option<PdfDict> {
304        const KEYS: [&str; 3] = ["Filter", "DecodeParms", "DP"];
305        // A DecodeParms dict containing a /JBIG2Globals reference needs the
306        // globals stream inlined even though the dict itself is direct.
307        let dict_needs_globals = |obj: &PdfObject| {
308            inline_globals
309                && matches!(obj, PdfObject::Dict(d)
310                    if matches!(d.get("JBIG2Globals"), Some(PdfObject::Ref(_))))
311        };
312        let needs_resolve = |obj: &PdfObject| match obj {
313            PdfObject::Ref(_) => true,
314            PdfObject::Array(a) => a
315                .iter()
316                .any(|e| matches!(e, PdfObject::Ref(_)) || dict_needs_globals(e)),
317            other => dict_needs_globals(other),
318        };
319        if !KEYS.iter().any(|k| dict.get(k).is_some_and(needs_resolve)) {
320            return None;
321        }
322
323        let resolve_shallow = |obj: &PdfObject| match obj {
324            PdfObject::Ref(r) => self.resolve(*r).unwrap_or(PdfObject::Null),
325            other => other.clone(),
326        };
327        let inline = |obj: PdfObject| {
328            if inline_globals {
329                self.inline_jbig2_globals(obj)
330            } else {
331                obj
332            }
333        };
334        let mut out = dict.clone();
335        for key in KEYS {
336            let Some(value) = dict.get(key) else { continue };
337            let resolved = match resolve_shallow(value) {
338                // Also resolve refs *inside* a (possibly itself indirect) array.
339                PdfObject::Array(a) => {
340                    PdfObject::Array(a.iter().map(resolve_shallow).map(inline).collect())
341                }
342                other => inline(other),
343            };
344            out.insert(PdfName::new(key), resolved);
345        }
346        Some(out)
347    }
348
349    /// If `obj` is a DecodeParms dict whose `/JBIG2Globals` is an indirect
350    /// stream reference, replace the reference with an inline string holding
351    /// the globals stream's *decoded* bytes — the filter layer has no file
352    /// access to chase references itself. The globals stream is decoded
353    /// without globals inlining of its own, so a crafted reference cycle
354    /// cannot recurse. Anything else passes through unchanged.
355    fn inline_jbig2_globals(&self, obj: PdfObject) -> PdfObject {
356        let PdfObject::Dict(mut d) = obj else {
357            return obj;
358        };
359        if let Some(PdfObject::Ref(r)) = d.get("JBIG2Globals") {
360            let r = *r;
361            let value = match self.resolve_stream_data_inner(r, false) {
362                Ok(bytes) => PdfObject::String(zpdf_core::PdfString(bytes)),
363                Err(e) => {
364                    tracing::warn!("failed to decode /JBIG2Globals stream {r}: {e}");
365                    PdfObject::Null
366                }
367            };
368            d.insert(PdfName::new("JBIG2Globals"), value);
369        }
370        PdfObject::Dict(d)
371    }
372
373    /// Extract an object from a compressed object stream (/Type /ObjStm).
374    fn extract_from_object_stream(
375        &self,
376        stream_obj_num: u32,
377        index_in_stream: u32,
378    ) -> Result<PdfObject> {
379        let objstm = self.get_or_decode_objstm(stream_obj_num)?;
380
381        let idx = index_in_stream as usize;
382        if idx >= objstm.entries.len() {
383            return Err(zpdf_core::Error::InvalidObject(
384                0,
385                format!(
386                    "object stream index {idx} out of range (n={})",
387                    objstm.entries.len()
388                ),
389            ));
390        }
391
392        let (_, obj_offset) = objstm.entries[idx];
393        let oob = || {
394            zpdf_core::Error::InvalidObject(0, "object stream member offset out of range".into())
395        };
396        let data_start = objstm.first.checked_add(obj_offset).ok_or_else(oob)?;
397        let data_end = if idx + 1 < objstm.entries.len() {
398            objstm
399                .first
400                .checked_add(objstm.entries[idx + 1].1)
401                .ok_or_else(oob)?
402        } else {
403            objstm.data.len()
404        };
405
406        // Member offsets are attacker-controlled and need not be monotonic, so
407        // guard against start > end and out-of-bounds before slicing (would
408        // otherwise panic).
409        let data_end = data_end.min(objstm.data.len());
410        if data_start > data_end {
411            return Err(zpdf_core::Error::InvalidObject(
412                0,
413                "object stream member offsets out of order".into(),
414            ));
415        }
416
417        let obj_data = &objstm.data[data_start..data_end];
418        let mut lexer = Lexer::new(obj_data, 0, &self.limits);
419        lexer.next_token()
420    }
421
422    /// Get a decoded object stream from cache, decoding+parsing it once on miss.
423    /// Resolves the ObjStm container directly from the xref (it cannot itself
424    /// live in another ObjStm) WITHOUT going through `self.resolve`, so it never
425    /// re-enters the `object_cache` borrow.
426    fn get_or_decode_objstm(&self, stream_obj_num: u32) -> Result<Arc<DecodedObjStm>> {
427        if let Some(hit) = self.objstm_cache.borrow().get(&stream_obj_num) {
428            return Ok(Arc::clone(hit));
429        }
430
431        let stream_id = zpdf_core::ObjectId(stream_obj_num, 0);
432        let stream_entry = self
433            .xref
434            .get(stream_id)
435            .ok_or(zpdf_core::Error::ObjectNotFound(stream_id))?;
436        let stream_obj = match stream_entry {
437            XrefEntry::InUse { offset, .. } => {
438                let parser = ObjectParser::new(&self.data, &self.limits);
439                parser.parse_indirect_at(*offset as usize)?
440            }
441            _ => return Err(zpdf_core::Error::ObjectNotFound(stream_id)),
442        };
443
444        let stream: &PdfStream = stream_obj.as_stream()?;
445        // Reject negative /N and /First (attacker-controlled): a negative i64 cast
446        // straight to usize becomes a near-usize::MAX value that overflows the
447        // offset arithmetic later.
448        let neg =
449            |what: &str| zpdf_core::Error::InvalidObject(0, format!("ObjStm {what} is negative"));
450        let n = usize::try_from(stream.dict.get_i64("N")?).map_err(|_| neg("/N"))?;
451        let first = usize::try_from(stream.dict.get_i64("First")?).map_err(|_| neg("/First"))?;
452
453        // An encrypted document encrypts the ObjStm *container* once (keyed by
454        // the container's own object id); its member objects are not separately
455        // encrypted. Decrypt the raw bytes before running the filter pipeline.
456        let raw: std::borrow::Cow<[u8]> = match &self.decryptor {
457            Some(dec) => std::borrow::Cow::Owned(
458                dec.decrypt_stream_bytes(zpdf_core::ObjectId(stream_obj_num, 0), &stream.data),
459            ),
460            None => std::borrow::Cow::Borrowed(&stream.data),
461        };
462        let decoded = filters::decode_stream(&raw, &stream.dict)?;
463
464        // Parse the header: N pairs of (obj_num, offset_within_data). Capacity is
465        // bounded by the header length to avoid a huge allocation on a bogus /N.
466        let header = &decoded[..first.min(decoded.len())];
467        let mut header_lexer = Lexer::new(header, 0, &self.limits);
468        let mut entries = Vec::with_capacity(n.min(header.len()));
469        for _ in 0..n {
470            let obj_num_tok = header_lexer.next_token()?;
471            let offset_tok = header_lexer.next_token()?;
472            let obj_num = obj_num_tok.as_i64()? as u32;
473            let offset = usize::try_from(offset_tok.as_i64()?).map_err(|_| neg("member offset"))?;
474            entries.push((obj_num, offset));
475        }
476
477        let decoded_arc = Arc::new(DecodedObjStm {
478            data: Arc::<[u8]>::from(decoded),
479            first,
480            entries,
481        });
482        self.objstm_cache
483            .borrow_mut()
484            .insert(stream_obj_num, Arc::clone(&decoded_arc));
485        Ok(decoded_arc)
486    }
487
488    pub fn data(&self) -> &[u8] {
489        &self.data
490    }
491}
492
493/// Best-effort check that the trailer's /Root points at a usable Catalog. Runs
494/// once at open time (before `PdfFile` exists), so it is a free function that
495/// parses the Root directly rather than going through `PdfFile::resolve`.
496///
497/// Lenient by design: a Root that is present but compressed/free is trusted
498/// (the normal pipeline handles it); only a direct InUse Root is strictly
499/// checked for `/Type /Catalog`. A missing Root triggers recovery.
500fn root_resolves(
501    data: &[u8],
502    xref: &XrefTable,
503    trailer: &zpdf_core::PdfDict,
504    limits: &ParseLimits,
505) -> bool {
506    let Ok(root_ref) = trailer.get_ref("Root") else {
507        return false;
508    };
509    match xref.get(root_ref) {
510        Some(XrefEntry::InUse { offset, .. }) => {
511            let parser = ObjectParser::new(data, limits);
512            matches!(
513                parser
514                    .parse_indirect_at(*offset as usize)
515                    .ok()
516                    .and_then(|o| o
517                        .as_dict()
518                        .ok()
519                        .map(|d| d.get_name("Type").unwrap_or("").to_string())),
520                Some(t) if t == "Catalog"
521            )
522        }
523        Some(_) => true, // compressed/free-but-present: trust the normal pipeline
524        None => false,
525    }
526}
527
528#[cfg(test)]
529mod tests {
530    use super::*;
531
    /// Validates the object-stream header parse + body-slicing arithmetic that
    /// `get_or_decode_objstm`/`extract_from_object_stream` rely on, without
    /// needing a full xref-stream fixture.
    ///
    /// The fixture is a hand-built "decoded" ObjStm payload holding two
    /// members: object 10 (a dict) and object 11 (the integer 42).
    #[test]
    fn objstm_header_and_slicing_math() {
        let limits = ParseLimits::default();
        let o10 = b"<< /Type /Catalog /Pages 2 0 R >>";
        let o11 = b"42";
        // Header pairs: "10 0" (obj 10 at offset 0) and "11 <len+1>" (obj 11
        // right after obj 10 plus the single separating space pushed below).
        let header = format!("10 0 11 {} ", o10.len() + 1);
        // `first` plays the role of /First: where the member bodies begin.
        let first = header.len();
        let mut decoded = header.into_bytes();
        decoded.extend_from_slice(o10);
        decoded.push(b' ');
        decoded.extend_from_slice(o11);

        // Mirror the header parse.
        let mut hx = Lexer::new(&decoded[..first], 0, &limits);
        let mut entries = Vec::new();
        for _ in 0..2 {
            let num = hx.next_token().unwrap().as_i64().unwrap() as u32;
            let off = hx.next_token().unwrap().as_i64().unwrap() as usize;
            entries.push((num, off));
        }
        assert_eq!(entries, vec![(10, 0), (11, o10.len() + 1)]);

        // Slice + lex object index 0 (obj 10).
        let (start0, end0) = (first + entries[0].1, first + entries[1].1);
        let obj = Lexer::new(&decoded[start0..end0], 0, &limits)
            .next_token()
            .unwrap();
        assert!(obj.as_dict().is_ok(), "obj 10 should lex as a dict");

        // Slice + lex object index 1 (obj 11) — runs to end of decoded.
        let start1 = first + entries[1].1;
        let n = Lexer::new(&decoded[start1..], 0, &limits)
            .next_token()
            .unwrap();
        assert_eq!(n.as_i64().unwrap(), 42);
    }
571
    /// Assemble a minimal PDF: the given `(num, body)` objects at gen 0, a
    /// traditional xref covering each (one single-entry subsection apiece),
    /// and a trailer pointing /Root at `root`.
    ///
    /// `/Size` is the highest object number plus one (1 when `objects` is
    /// empty). Objects are written in the order given.
    fn build_pdf(objects: &[(u32, &str)], root: u32) -> Vec<u8> {
        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
        // Record where each object starts so the xref can point at it.
        let mut offsets = Vec::new();
        for (num, body) in objects {
            offsets.push((*num, d.len()));
            d.extend_from_slice(format!("{num} 0 obj\n{body}\nendobj\n").as_bytes());
        }
        let xref_off = d.len();
        // Subsection for object 0: the mandatory free-list head.
        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
        for (num, off) in &offsets {
            // One "<num> 1" subsection per object; offsets are 10-digit padded.
            d.extend_from_slice(format!("{num} 1\n{off:010} 00000 n \n").as_bytes());
        }
        let size = objects.iter().map(|(n, _)| n + 1).max().unwrap_or(1);
        d.extend_from_slice(
            format!("trailer\n<< /Size {size} /Root {root} 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
                .as_bytes(),
        );
        d
    }
594
595    #[test]
596    fn dangling_ref_resolves_to_null() {
597        // Object 9 is referenced but absent from the xref entirely: per
598        // ISO 32000 7.3.10 it resolves to null, not an error.
599        let pdf = build_pdf(&[(1, "<< /Type /Catalog /Pages 9 0 R >>")], 1);
600        let file = PdfFile::parse(pdf).unwrap();
601        assert_eq!(file.resolve(ObjectId(9, 0)).unwrap(), PdfObject::Null);
602        // Second resolve hits the cache (warn fires once).
603        assert_eq!(file.resolve(ObjectId(9, 0)).unwrap(), PdfObject::Null);
604    }
605
    /// An object whose xref entry is explicitly marked free (`f`) resolves to
    /// null instead of producing an error.
    #[test]
    fn free_entry_resolves_to_null() {
        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
        let off1 = d.len();
        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let xref_off = d.len();
        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n1 1\n");
        d.extend_from_slice(format!("{off1:010} 00000 n \n").as_bytes());
        // Object 2 is listed, but as a free entry.
        d.extend_from_slice(b"2 1\n0000000000 00000 f \n");
        d.extend_from_slice(
            format!("trailer\n<< /Size 3 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
                .as_bytes(),
        );

        let file = PdfFile::parse(d).unwrap();
        // Sanity check: the entry really was parsed as Free, so the resolve
        // below exercises the free-entry arm and not the missing-entry one.
        assert!(matches!(
            file.xref.get(ObjectId(2, 0)),
            Some(XrefEntry::Free { .. })
        ));
        assert_eq!(file.resolve(ObjectId(2, 0)).unwrap(), PdfObject::Null);
    }
627
    /// When the xref offset for an object lands on a different object's
    /// header, `resolve` must fall back to the lazily-built repair table and
    /// still return the right object; correct entries are unaffected.
    #[test]
    fn header_mismatch_triggers_lazy_repair() {
        // The xref entry for object 3 points at object 2's offset; the real
        // object 3 lives elsewhere. resolve(3) must repair via the lazy scan.
        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
        let off1 = d.len();
        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let off2 = d.len();
        d.extend_from_slice(b"2 0 obj\n<< /Marker /Wrong >>\nendobj\n");
        // Real object 3 — its offset is deliberately NOT in the xref.
        d.extend_from_slice(b"3 0 obj\n<< /Marker /Real >>\nendobj\n");
        let xref_off = d.len();
        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
        d.extend_from_slice(format!("2 1\n{off2:010} 00000 n \n").as_bytes());
        d.extend_from_slice(format!("3 1\n{off2:010} 00000 n \n").as_bytes()); // wrong!
        d.extend_from_slice(
            format!("trailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
                .as_bytes(),
        );

        // The Root (object 1) is fine, so the file opens through the normal
        // xref pipeline and the bad entry is only discovered on resolve.
        let file = PdfFile::parse(d).unwrap();
        let obj = file.resolve(ObjectId(3, 0)).unwrap();
        assert_eq!(obj.as_dict().unwrap().get_name("Marker").unwrap(), "Real");
        // Object 2 still resolves normally (its entry was correct).
        let obj2 = file.resolve(ObjectId(2, 0)).unwrap();
        assert_eq!(obj2.as_dict().unwrap().get_name("Marker").unwrap(), "Wrong");
    }
656
657    #[test]
658    fn ref_to_ref_chain_resolves() {
659        let pdf = build_pdf(
660            &[
661                (1, "<< /Type /Catalog /Pages 2 0 R >>"),
662                (4, "5 0 R"),
663                (5, "42"),
664            ],
665            1,
666        );
667        let file = PdfFile::parse(pdf).unwrap();
668        assert_eq!(
669            file.resolve(ObjectId(4, 0)).unwrap(),
670            PdfObject::Integer(42)
671        );
672    }
673
674    #[test]
675    fn ref_cycle_resolves_to_null() {
676        // 4 -> 5 -> 4: the chain guard must terminate (no hang/stack overflow)
677        // and degrade the value to null.
678        let pdf = build_pdf(
679            &[
680                (1, "<< /Type /Catalog /Pages 2 0 R >>"),
681                (4, "5 0 R"),
682                (5, "4 0 R"),
683            ],
684            1,
685        );
686        let file = PdfFile::parse(pdf).unwrap();
687        assert_eq!(file.resolve(ObjectId(4, 0)).unwrap(), PdfObject::Null);
688    }
689
    /// A stream whose `/Filter` is an indirect reference (object 4 holds the
    /// name `/FlateDecode`) must still be decoded by `resolve_stream_data`.
    #[test]
    fn indirect_filter_is_resolved() {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Zlib-compress the payload so FlateDecode has real work to do.
        let payload = b"indirect filter payload";
        let mut enc = ZlibEncoder::new(Vec::new(), Compression::default());
        enc.write_all(payload).unwrap();
        let compressed = enc.finish().unwrap();

        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
        let off1 = d.len();
        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let off3 = d.len();
        // Object 3: the stream, with /Filter given as the reference "4 0 R".
        d.extend_from_slice(
            format!(
                "3 0 obj\n<< /Length {} /Filter 4 0 R >>\nstream\n",
                compressed.len()
            )
            .as_bytes(),
        );
        d.extend_from_slice(&compressed);
        d.extend_from_slice(b"\nendstream\nendobj\n");
        let off4 = d.len();
        // Object 4: the filter name the reference above points at.
        d.extend_from_slice(b"4 0 obj\n/FlateDecode\nendobj\n");
        let xref_off = d.len();
        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
        d.extend_from_slice(format!("3 1\n{off3:010} 00000 n \n").as_bytes());
        d.extend_from_slice(format!("4 1\n{off4:010} 00000 n \n").as_bytes());
        d.extend_from_slice(
            format!("trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
                .as_bytes(),
        );

        let file = PdfFile::parse(d).unwrap();
        let data = file.resolve_stream_data(ObjectId(3, 0)).unwrap();
        assert_eq!(data, payload);
    }
730
731    /// An image stream with /Filter /JBIG2Decode whose /DecodeParms holds an
732    /// indirect /JBIG2Globals stream: the globals reference must be resolved,
733    /// decoded (here through its own FlateDecode), and inlined before the
734    /// filter layer runs. The globals carry the page-info segment; the image
735    /// stream carries an MMR generic region (two "WWWBBWWW" rows).
736    #[test]
737    fn jbig2_globals_stream_is_resolved_and_decoded() {
738        use flate2::write::ZlibEncoder;
739        use flate2::Compression;
740        use std::io::Write;
741
742        // Globals: segment 0, type 48 (page information), page 1, 8x2 page.
743        let globals: Vec<u8> = [
744            &[0, 0, 0, 0, 0x30, 0x00, 0x01, 0, 0, 0, 19][..], // header, length 19
745            &[0, 0, 0, 8, 0, 0, 0, 2][..],                    // width 8, height 2
746            &[0; 8][..],                                      // x/y resolution
747            &[0x00, 0, 0][..],                                // flags, striping
748        ]
749        .concat();
750        let mut gz = ZlibEncoder::new(Vec::new(), Compression::default());
751        gz.write_all(&globals).unwrap();
752        let globals_z = gz.finish().unwrap();
753
754        // Image stream: segment 1, type 38 (immediate generic region), MMR
755        // payload 0x31 0xF8 = T.6-coded WWWBBWWW twice.
756        let image: Vec<u8> = [
757            &[0, 0, 0, 1, 0x26, 0x00, 0x01, 0, 0, 0, 20][..], // header, length 20
758            &[0, 0, 0, 8, 0, 0, 0, 2][..],                    // region 8x2 …
759            &[0, 0, 0, 0, 0, 0, 0, 0, 0x00][..],              // … at (0,0), OR
760            &[0x01, 0x31, 0xF8][..],                          // MMR flag + data
761        ]
762        .concat();
763
764        let mut d = Vec::from(&b"%PDF-1.4\n"[..]);
765        let off1 = d.len();
766        d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
767        let off3 = d.len();
768        d.extend_from_slice(
769            format!(
770                "3 0 obj\n<< /Length {} /Filter /JBIG2Decode \
771                 /DecodeParms << /JBIG2Globals 4 0 R >> >>\nstream\n",
772                image.len()
773            )
774            .as_bytes(),
775        );
776        d.extend_from_slice(&image);
777        d.extend_from_slice(b"\nendstream\nendobj\n");
778        let off4 = d.len();
779        d.extend_from_slice(
780            format!(
781                "4 0 obj\n<< /Length {} /Filter /FlateDecode >>\nstream\n",
782                globals_z.len()
783            )
784            .as_bytes(),
785        );
786        d.extend_from_slice(&globals_z);
787        d.extend_from_slice(b"\nendstream\nendobj\n");
788        let xref_off = d.len();
789        d.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \n");
790        d.extend_from_slice(format!("1 1\n{off1:010} 00000 n \n").as_bytes());
791        d.extend_from_slice(format!("3 1\n{off3:010} 00000 n \n").as_bytes());
792        d.extend_from_slice(format!("4 1\n{off4:010} 00000 n \n").as_bytes());
793        d.extend_from_slice(
794            format!("trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
795                .as_bytes(),
796        );
797
798        let file = PdfFile::parse(d).unwrap();
799        let data = file.resolve_stream_data(ObjectId(3, 0)).unwrap();
800        // WWWBBWWW in PDF 1-bpc polarity (black = 0): 1110 0111, both rows.
801        assert_eq!(data, vec![0xE7, 0xE7]);
802    }
803}
zpdf_parser/lib.rs

zpdf_parser/
lib.rs