Skip to main content

zpdf_parser/
object_parser.rs

1use std::sync::Arc;
2
3use zpdf_core::{Error, ObjectId, ParseLimits, PdfDict, PdfObject, PdfStream, Result};
4
5use crate::lexer::Lexer;
6
7pub struct ObjectParser<'a> {
8    data: &'a [u8],
9    limits: &'a ParseLimits,
10}
11
12impl<'a> ObjectParser<'a> {
13    pub fn new(data: &'a [u8], limits: &'a ParseLimits) -> Self {
14        Self { data, limits }
15    }
16
17    /// Parse an indirect object at the given byte offset.
18    /// Expected format: `<num> <gen> obj <value> endobj`
19    pub fn parse_indirect_at(&self, offset: usize) -> Result<PdfObject> {
20        self.parse_indirect_with_id(offset).map(|(_, obj)| obj)
21    }
22
23    /// Like [`parse_indirect_at`](Self::parse_indirect_at), but also returns
24    /// the `(num, gen)` actually present in the object header. Callers that
25    /// arrived here via an xref entry can compare it against the id they asked
26    /// for and trigger repair on a mismatch (stale/corrupt offsets are common
27    /// in damaged files).
28    pub fn parse_indirect_with_id(&self, offset: usize) -> Result<(ObjectId, PdfObject)> {
29        let mut lex = Lexer::new(self.data, offset, self.limits);
30
31        let num_tok = lex.next_token()?;
32        let gen_tok = lex.next_token()?;
33        let id = match (&num_tok, &gen_tok) {
34            (PdfObject::Integer(n), PdfObject::Integer(g)) => {
35                match (u32::try_from(*n), u16::try_from(*g)) {
36                    (Ok(n), Ok(g)) => ObjectId(n, g),
37                    _ => {
38                        return Err(Error::InvalidObject(
39                            offset as u64,
40                            format!("object header out of range: {n} {g} obj"),
41                        ))
42                    }
43                }
44            }
45            _ => {
46                return Err(Error::InvalidObject(
47                    offset as u64,
48                    "object header is not '<int> <int> obj'".into(),
49                ))
50            }
51        };
52
53        lex.skip_whitespace_and_comments();
54        self.expect_keyword(&mut lex, b"obj")?;
55
56        let obj = lex.next_token()?;
57        // A top-level body may itself be an indirect reference (`N G R`), which
58        // the plain tokenizer reads as a bare integer; promote it so resolve()
59        // can follow ref-to-ref chains.
60        let obj = lex.maybe_resolve_ref(obj)?;
61
62        // Check if this is a stream object
63        lex.skip_whitespace_and_comments();
64        if let PdfObject::Dict(dict) = &obj {
65            if self.starts_with_at(lex.pos(), b"stream") {
66                let stream = self.read_stream(dict.clone(), lex.pos())?;
67                return Ok((id, PdfObject::Stream(stream)));
68            }
69        }
70
71        Ok((id, obj))
72    }
73
74    fn expect_keyword(&self, lex: &mut Lexer, keyword: &[u8]) -> Result<()> {
75        let pos = lex.pos();
76        if self.data[pos..].starts_with(keyword) {
77            lex.set_pos(pos + keyword.len());
78            Ok(())
79        } else {
80            Err(Error::InvalidObject(
81                pos as u64,
82                format!(
83                    "expected '{}', got '{}'",
84                    String::from_utf8_lossy(keyword),
85                    String::from_utf8_lossy(
86                        &self.data[pos..self.data.len().min(pos + keyword.len())]
87                    )
88                ),
89            ))
90        }
91    }
92
93    fn starts_with_at(&self, pos: usize, prefix: &[u8]) -> bool {
94        self.data.get(pos..).is_some_and(|s| s.starts_with(prefix))
95    }
96
97    fn read_stream(&self, dict: PdfDict, keyword_pos: usize) -> Result<PdfStream> {
98        let mut pos = keyword_pos + b"stream".len();
99
100        // Skip stream keyword EOL: \r\n or \n (a lone \r is tolerated too).
101        if self.data.get(pos) == Some(&b'\r') {
102            pos += 1;
103        }
104        if self.data.get(pos) == Some(&b'\n') {
105            pos += 1;
106        }
107
108        // Determine the stream's byte length. Trust a direct, non-negative
109        // /Length ONLY if `endstream` actually follows it; otherwise (missing,
110        // indirect `N G R`, negative, or simply wrong) fall back to scanning for
111        // the `endstream` keyword. The low-level parser cannot resolve an
112        // indirect /Length, so without this fallback such streams (very common,
113        // e.g. Acrobat output) would decode to empty/garbage data.
114        let declared = match dict.get("Length") {
115            Some(PdfObject::Integer(n)) if *n >= 0 => Some(*n as usize),
116            _ => None,
117        };
118
119        let end = match declared {
120            Some(len)
121                if pos
122                    .checked_add(len)
123                    .is_some_and(|e| self.endstream_follows(e)) =>
124            {
125                pos + len
126            }
127            _ => self.scan_for_endstream(pos)?,
128        };
129
130        let length = (end - pos) as u64;
131        if length > self.limits.max_stream_bytes {
132            return Err(Error::StreamSizeLimit(self.limits.max_stream_bytes));
133        }
134
135        let stream_data = self.data[pos..end].to_vec();
136        Ok(PdfStream {
137            dict,
138            data: Arc::from(stream_data),
139        })
140    }
141
142    /// True if (after optional whitespace) the bytes at `at` begin the
143    /// `endstream` keyword. Used to validate a declared /Length before trusting it.
144    fn endstream_follows(&self, at: usize) -> bool {
145        let mut p = at;
146        while let Some(&b) = self.data.get(p) {
147            if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') {
148                p += 1;
149            } else {
150                break;
151            }
152        }
153        self.data
154            .get(p..)
155            .is_some_and(|s| s.starts_with(b"endstream"))
156    }
157
158    /// Find the stream's data end by scanning for the `endstream` keyword,
159    /// stripping the single EOL that precedes it (per spec, not part of the
160    /// data). The search is bounded by `max_stream_bytes` so a stream missing
161    /// its `endstream` cannot force an unbounded scan.
162    fn scan_for_endstream(&self, pos: usize) -> Result<usize> {
163        let cap = (self.limits.max_stream_bytes as usize).saturating_add(b"endstream".len() + 2);
164        let search_end = pos.saturating_add(cap).min(self.data.len());
165        let hay = self
166            .data
167            .get(pos..search_end)
168            .ok_or(Error::UnexpectedEof(pos as u64))?;
169        let rel = hay
170            .windows(b"endstream".len())
171            .position(|w| w == b"endstream")
172            .ok_or_else(|| {
173                Error::InvalidObject(pos as u64, "stream: no endstream within size limit".into())
174            })?;
175        let mut end = pos + rel;
176        // Strip the EOL immediately before `endstream` (CRLF, LF, or lone CR).
177        if end > pos && self.data[end - 1] == b'\n' {
178            end -= 1;
179            if end > pos && self.data[end - 1] == b'\r' {
180                end -= 1;
181            }
182        } else if end > pos && self.data[end - 1] == b'\r' {
183            end -= 1;
184        }
185        Ok(end)
186    }
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// Assemble the bytes of indirect object `5 0` holding a stream: the
    /// dictionary text `dict`, the `stream` keyword, `body`, the separator
    /// `eol`, then `endstream` and `endobj`.
    fn stream_object(dict: &str, body: &[u8], eol: &[u8]) -> Vec<u8> {
        let mut bytes = format!("5 0 obj\n{dict}\nstream\n").into_bytes();
        bytes.extend_from_slice(body);
        bytes.extend_from_slice(eol);
        bytes.extend_from_slice(b"endstream\nendobj\n");
        bytes
    }

    /// Parse the stream object at offset 0 of `data` with default limits and
    /// return a copy of its data bytes; panics if the object is not a stream.
    fn stream_data(data: &[u8]) -> Vec<u8> {
        let limits = ParseLimits::default();
        match ObjectParser::new(data, &limits).parse_indirect_at(0).unwrap() {
            PdfObject::Stream(stream) => stream.data.to_vec(),
            other => panic!("expected Stream, got {other:?}"),
        }
    }

    #[test]
    fn parse_simple_indirect() {
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            &limits,
        );
        match parser.parse_indirect_at(0).unwrap() {
            PdfObject::Dict(d) => assert_eq!(d.get_name("Type").unwrap(), "Catalog"),
            other => panic!("expected Dict, got {other:?}"),
        }
    }

    #[test]
    fn parse_stream_object() {
        let content = b"BT /F1 12 Tf (Hello) Tj ET";
        let dict = format!("<< /Length {} >>", content.len());
        let data = stream_object(&dict, content, b"\n");

        let limits = ParseLimits::default();
        let parsed = ObjectParser::new(&data, &limits)
            .parse_indirect_at(0)
            .unwrap();
        match parsed {
            PdfObject::Stream(s) => {
                assert_eq!(s.data.as_ref(), content);
                assert_eq!(s.dict.get_i64("Length").unwrap(), content.len() as i64);
            }
            other => panic!("expected Stream, got {other:?}"),
        }
    }

    #[test]
    fn reject_oversized_stream_length() {
        let body = b"0123456789ABCDEFGHIJ"; // 20 bytes > 16
        let data = stream_object(&format!("<< /Length {} >>", body.len()), body, b"\n");
        let limits = ParseLimits {
            max_stream_bytes: 16,
            ..Default::default()
        };
        let err = ObjectParser::new(&data, &limits)
            .parse_indirect_at(0)
            .unwrap_err();
        assert!(matches!(err, Error::StreamSizeLimit(16)), "got {err:?}");
    }

    #[test]
    fn indirect_length_recovers_via_endstream_scan() {
        // The low-level parser cannot resolve the indirect `/Length 99 0 R`,
        // so the extent has to come from scanning for `endstream`.
        let data = stream_object("<< /Length 99 0 R >>", b"Hello, world!", b"\n");
        assert_eq!(stream_data(&data), b"Hello, world!");
    }

    #[test]
    fn missing_length_recovers_via_endstream_scan() {
        let data = stream_object("<< /Type /Whatever >>", b"payload bytes", b"\n");
        assert_eq!(stream_data(&data), b"payload bytes");
    }

    #[test]
    fn wrong_length_recovers_via_endstream_scan() {
        // /Length says 3 but the body is 5 bytes long; `endstream` is not
        // found at +3, so the scan supplies the real extent.
        let data = stream_object("<< /Length 3 >>", b"Hello", b"\n");
        assert_eq!(stream_data(&data), b"Hello");
    }

    #[test]
    fn negative_length_recovers_via_endstream_scan() {
        let data = stream_object("<< /Length -1 >>", b"abc", b"\n");
        assert_eq!(stream_data(&data), b"abc");
    }

    #[test]
    fn correct_length_trusted_even_if_data_contains_endstream_bytes() {
        // Binary data may contain the bytes "endstream"; a correct /Length
        // must win so the data is not cut short at that inner occurrence.
        let body: &[u8] = b"AAendstreamBB"; // 13 bytes, literal "endstream" inside
        let data = stream_object(&format!("<< /Length {} >>", body.len()), body, b"\n");
        assert_eq!(stream_data(&data), body);
    }

    #[test]
    fn crlf_before_endstream_is_stripped_on_scan() {
        // With no /Length the scan path runs; the CRLF in front of
        // `endstream` must not end up in the data.
        let data = stream_object("<< >>", b"data", b"\r\n");
        assert_eq!(stream_data(&data), b"data");
    }

    #[test]
    fn parse_indirect_with_id_returns_header_id() {
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(b"7 2 obj\n<< /Type /Catalog >>\nendobj\n", &limits);
        let (id, obj) = parser.parse_indirect_with_id(0).unwrap();
        assert_eq!(id, ObjectId(7, 2));
        assert!(obj.as_dict().is_ok());
    }

    #[test]
    fn parse_indirect_with_id_rejects_non_integer_header() {
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(b"/Name 0 obj\n42\nendobj\n", &limits);
        assert!(parser.parse_indirect_with_id(0).is_err());
    }

    #[test]
    fn top_level_ref_body_parses_as_ref() {
        // The body of `4 0 obj 5 0 R endobj` is itself an indirect reference.
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(b"4 0 obj\n5 0 R\nendobj\n", &limits);
        assert_eq!(
            parser.parse_indirect_at(0).unwrap(),
            PdfObject::Ref(ObjectId(5, 0))
        );
    }

    #[test]
    fn deeply_nested_value_in_indirect_object_errors() {
        // The recursion guard must also fire on the parse_indirect_at path.
        let depth = 20usize;
        let inner = format!("{}1{}", "[".repeat(depth), "]".repeat(depth));
        let data = format!("1 0 obj\n{inner}\nendobj\n").into_bytes();
        let limits = ParseLimits {
            max_object_depth: 4,
            ..Default::default()
        };
        let err = ObjectParser::new(&data, &limits)
            .parse_indirect_at(0)
            .unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(4)), "got {err:?}");
    }
}