Skip to main content

pdf_objects/
parser.rs

1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::document::build_document;
4use crate::error::{PdfError, PdfResult};
5use crate::types::{
6    ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
7};
8
9pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
10    let version = parse_header(bytes)?;
11    let startxref = find_startxref(bytes)?;
12    let (xref, trailer) = parse_xref_table(bytes, startxref)?;
13
14    let mut objects = BTreeMap::new();
15    let mut max_object_number = 0;
16    for (object_ref, entry) in xref {
17        if !entry.in_use {
18            continue;
19        }
20        if object_ref.object_number == 0 {
21            continue;
22        }
23        let object = parse_indirect_object(bytes, entry.offset)?;
24        max_object_number = max_object_number.max(object_ref.object_number);
25        objects.insert(object_ref, object);
26    }
27    let file = PdfFile {
28        version,
29        objects,
30        trailer,
31        max_object_number,
32    };
33    build_document(file)
34}
35
36fn parse_header(bytes: &[u8]) -> PdfResult<String> {
37    if !bytes.starts_with(b"%PDF-") {
38        return Err(PdfError::Parse("missing PDF header".to_string()));
39    }
40    let line_end = bytes
41        .iter()
42        .position(|byte| *byte == b'\n' || *byte == b'\r')
43        .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
44    Ok(String::from_utf8_lossy(&bytes[5..line_end])
45        .trim()
46        .to_string())
47}
48
49fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
50    let marker = b"startxref";
51    let position = bytes
52        .windows(marker.len())
53        .rposition(|window| window == marker)
54        .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
55    let mut parser = Cursor::new(bytes, position + marker.len());
56    parser.skip_ws_and_comments();
57    parser.parse_usize()
58}
59
60fn parse_xref_table(
61    bytes: &[u8],
62    start_offset: usize,
63) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
64    let mut merged_entries = BTreeMap::new();
65    let mut newest_trailer = None;
66    let mut visited = BTreeSet::new();
67    let mut offset = start_offset;
68
69    loop {
70        if !visited.insert(offset) {
71            return Err(PdfError::Parse("circular Prev chain".to_string()));
72        }
73        let (section_entries, trailer) = parse_xref_section(bytes, offset)?;
74
75        if trailer.contains_key("XRefStm") {
76            return Err(PdfError::Unsupported(
77                "xref streams are not supported".to_string(),
78            ));
79        }
80
81        // Newest-first: only insert entries not already present
82        for (object_ref, entry) in section_entries {
83            merged_entries.entry(object_ref).or_insert(entry);
84        }
85
86        if newest_trailer.is_none() {
87            newest_trailer = Some(trailer.clone());
88        }
89
90        match trailer.get("Prev").and_then(PdfValue::as_integer) {
91            Some(prev_offset) => offset = prev_offset as usize,
92            None => break,
93        }
94    }
95
96    Ok((merged_entries, newest_trailer.unwrap()))
97}
98
99fn parse_xref_section(
100    bytes: &[u8],
101    offset: usize,
102) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
103    let mut cursor = Cursor::new(bytes, offset);
104    cursor.expect_keyword("xref")?;
105    let mut entries = BTreeMap::new();
106    loop {
107        cursor.skip_ws_and_comments();
108        if cursor.peek_keyword("trailer") {
109            break;
110        }
111        let start = cursor.parse_u32()?;
112        cursor.skip_ws_and_comments();
113        let count = cursor.parse_u32()?;
114        cursor.skip_line_breaks();
115        for index in 0..count {
116            let line = cursor.read_line()?;
117            if line.len() < 17 {
118                return Err(PdfError::Parse("invalid xref entry".to_string()));
119            }
120            let parts = String::from_utf8_lossy(line).trim().to_string();
121            let mut fields = parts.split_whitespace();
122            let entry_offset = fields
123                .next()
124                .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
125                .parse::<usize>()
126                .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
127            let generation = fields
128                .next()
129                .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
130                .parse::<u16>()
131                .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
132            let flag = fields
133                .next()
134                .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
135            let object_number = start
136                .checked_add(index)
137                .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
138            entries.insert(
139                ObjectRef::new(object_number, generation),
140                XrefEntry {
141                    offset: entry_offset,
142                    generation,
143                    in_use: flag == "n",
144                },
145            );
146        }
147    }
148    cursor.expect_keyword("trailer")?;
149    let trailer = match cursor.parse_value()? {
150        PdfValue::Dictionary(dictionary) => dictionary,
151        _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
152    };
153    Ok((entries, trailer))
154}
155
156fn parse_indirect_object(bytes: &[u8], offset: usize) -> PdfResult<PdfObject> {
157    let mut cursor = Cursor::new(bytes, offset);
158    let _object_number = cursor.parse_u32()?;
159    cursor.skip_ws_and_comments();
160    let _generation = cursor.parse_u16()?;
161    cursor.skip_ws_and_comments();
162    cursor.expect_keyword("obj")?;
163    cursor.skip_ws_and_comments();
164
165    let value = cursor.parse_value()?;
166    cursor.skip_ws_and_comments();
167    if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
168        let dict = match value {
169            PdfValue::Dictionary(dict) => dict,
170            _ => unreachable!(),
171        };
172        cursor.expect_keyword("stream")?;
173        cursor.consume_stream_line_break();
174        let stream_start = cursor.position;
175        // Prefer the Length entry from the stream dictionary to determine the
176        // data boundary.  This prevents binary stream data that happens to
177        // contain the literal bytes "endstream" from being truncated.
178        // Fall back to scanning for `endstream` when Length is absent,
179        // an indirect reference (can't resolve yet), or past EOF.
180        let length_hint = dict
181            .get("Length")
182            .and_then(PdfValue::as_integer)
183            .filter(|&len| len >= 0)
184            .map(|len| len as usize);
185        let (data, endstream_pos) = match length_hint {
186            Some(len) if stream_start + len <= bytes.len() => {
187                // Verify the endstream keyword follows at the expected offset.
188                // Tolerate trailing EOL between data and keyword per PDF spec.
189                let mut check = stream_start + len;
190                while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
191                    check += 1;
192                }
193                if bytes.get(check..check + 9) == Some(b"endstream") {
194                    (bytes[stream_start..stream_start + len].to_vec(), check)
195                } else {
196                    // Length is wrong; fall back to scanning
197                    let pos = find_keyword(bytes, stream_start, b"endstream")
198                        .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
199                    (bytes[stream_start..pos].to_vec(), pos)
200                }
201            }
202            _ => {
203                let pos = find_keyword(bytes, stream_start, b"endstream")
204                    .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
205                (bytes[stream_start..pos].to_vec(), pos)
206            }
207        };
208        cursor.position = endstream_pos;
209        cursor.expect_keyword("endstream")?;
210        cursor.skip_ws_and_comments();
211        cursor.expect_keyword("endobj")?;
212        Ok(PdfObject::Stream(PdfStream { dict, data }))
213    } else {
214        cursor.expect_keyword("endobj")?;
215        Ok(PdfObject::Value(value))
216    }
217}
218
/// Returns the index of the first occurrence of `keyword` in `bytes` at or
/// after `start`, as an absolute index into `bytes`.
///
/// Returns `None` when there is no occurrence or when `start` lies beyond
/// the end of `bytes`. An empty `keyword` matches immediately at `start`
/// (provided `start` is within bounds), mirroring the usual convention for
/// empty search patterns.
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    // `get` instead of indexing: an out-of-range start is "not found", not a panic.
    let haystack = bytes.get(start..)?;
    if keyword.is_empty() {
        // `windows(0)` would panic, so the degenerate case is handled here.
        return Some(start);
    }
    haystack
        .windows(keyword.len())
        .position(|window| window == keyword)
        .map(|relative| start + relative)
}
225
/// A read position over the raw bytes of a PDF file.
///
/// All tokenizing and value parsing in this file goes through a `Cursor`;
/// the free-standing parsing functions create one at the offset they want to
/// start reading from.
struct Cursor<'a> {
    /// The complete input buffer being parsed.
    bytes: &'a [u8],
    /// Index into `bytes` of the next byte to read. It is set directly by
    /// callers and is not guaranteed to be within the bounds of `bytes`.
    position: usize,
}
230
231impl<'a> Cursor<'a> {
232    fn new(bytes: &'a [u8], position: usize) -> Self {
233        Self { bytes, position }
234    }
235
236    fn eof(&self) -> bool {
237        self.position >= self.bytes.len()
238    }
239
240    fn current(&self) -> Option<u8> {
241        self.bytes.get(self.position).copied()
242    }
243
244    fn skip_ws_and_comments(&mut self) {
245        while let Some(byte) = self.current() {
246            match byte {
247                b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
248                b'%' => {
249                    while let Some(next) = self.current() {
250                        self.position += 1;
251                        if next == b'\n' || next == b'\r' {
252                            break;
253                        }
254                    }
255                }
256                _ => break,
257            }
258        }
259    }
260
261    fn skip_line_breaks(&mut self) {
262        while matches!(self.current(), Some(b'\n' | b'\r')) {
263            self.position += 1;
264        }
265    }
266
267    fn read_line(&mut self) -> PdfResult<&'a [u8]> {
268        if self.eof() {
269            return Err(PdfError::Parse("unexpected end of file".to_string()));
270        }
271        let start = self.position;
272        while let Some(byte) = self.current() {
273            if byte == b'\n' || byte == b'\r' {
274                let end = self.position;
275                self.skip_line_breaks();
276                return Ok(&self.bytes[start..end]);
277            }
278            self.position += 1;
279        }
280        Ok(&self.bytes[start..self.position])
281    }
282
283    fn peek_keyword(&self, keyword: &str) -> bool {
284        self.bytes
285            .get(self.position..self.position + keyword.len())
286            .map(|slice| slice == keyword.as_bytes())
287            .unwrap_or(false)
288    }
289
290    fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
291        self.skip_ws_and_comments();
292        if self.peek_keyword(keyword) {
293            self.position += keyword.len();
294            Ok(())
295        } else {
296            Err(PdfError::Parse(format!("expected keyword {keyword}")))
297        }
298    }
299
300    fn consume_stream_line_break(&mut self) {
301        if self.current() == Some(b'\r') {
302            self.position += 1;
303        }
304        if self.current() == Some(b'\n') {
305            self.position += 1;
306        }
307    }
308
309    fn parse_u32(&mut self) -> PdfResult<u32> {
310        let token = self.parse_token()?;
311        token
312            .parse::<u32>()
313            .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
314    }
315
316    fn parse_u16(&mut self) -> PdfResult<u16> {
317        let token = self.parse_token()?;
318        token
319            .parse::<u16>()
320            .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
321    }
322
323    fn parse_usize(&mut self) -> PdfResult<usize> {
324        let token = self.parse_token()?;
325        token
326            .parse::<usize>()
327            .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
328    }
329
330    fn parse_token(&mut self) -> PdfResult<String> {
331        self.skip_ws_and_comments();
332        let start = self.position;
333        while let Some(byte) = self.current() {
334            if is_delimiter(byte) || is_whitespace(byte) {
335                break;
336            }
337            self.position += 1;
338        }
339        if self.position == start {
340            return Err(PdfError::Parse("expected token".to_string()));
341        }
342        Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
343    }
344
345    fn parse_value(&mut self) -> PdfResult<PdfValue> {
346        self.skip_ws_and_comments();
347        match self.current() {
348            Some(b'/') => self.parse_name(),
349            Some(b'(') => self.parse_literal_string(),
350            Some(b'[') => self.parse_array(),
351            Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
352                self.parse_dictionary()
353            }
354            Some(b'<') => self.parse_hex_string(),
355            Some(b't') if self.peek_keyword("true") => {
356                self.position += 4;
357                Ok(PdfValue::Bool(true))
358            }
359            Some(b'f') if self.peek_keyword("false") => {
360                self.position += 5;
361                Ok(PdfValue::Bool(false))
362            }
363            Some(b'n') if self.peek_keyword("null") => {
364                self.position += 4;
365                Ok(PdfValue::Null)
366            }
367            Some(_) => self.parse_number_or_reference(),
368            None => Err(PdfError::Parse("unexpected end of file".to_string())),
369        }
370    }
371
372    fn parse_name(&mut self) -> PdfResult<PdfValue> {
373        self.position += 1;
374        let mut raw = Vec::new();
375        while let Some(byte) = self.current() {
376            if is_delimiter(byte) || is_whitespace(byte) {
377                break;
378            }
379            if byte == b'#' {
380                let high =
381                    self.bytes.get(self.position + 1).copied().ok_or_else(|| {
382                        PdfError::Parse("truncated #XX escape in name".to_string())
383                    })?;
384                let low =
385                    self.bytes.get(self.position + 2).copied().ok_or_else(|| {
386                        PdfError::Parse("truncated #XX escape in name".to_string())
387                    })?;
388                let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
389                    .map_err(|_| {
390                    PdfError::Parse("invalid #XX hex escape in name".to_string())
391                })?;
392                raw.push(decoded);
393                self.position += 3;
394            } else {
395                raw.push(byte);
396                self.position += 1;
397            }
398        }
399        Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
400    }
401
402    fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
403        self.position += 1;
404        let mut output = Vec::new();
405        let mut depth = 1usize;
406        while let Some(byte) = self.current() {
407            self.position += 1;
408            match byte {
409                b'\\' => {
410                    let escaped = self
411                        .current()
412                        .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
413                    self.position += 1;
414                    match escaped {
415                        b'n' => output.push(b'\n'),
416                        b'r' => output.push(b'\r'),
417                        b't' => output.push(b'\t'),
418                        b'b' => output.push(0x08),
419                        b'f' => output.push(0x0C),
420                        b'(' | b')' | b'\\' => output.push(escaped),
421                        b'\n' => {}
422                        b'\r' => {
423                            if self.current() == Some(b'\n') {
424                                self.position += 1;
425                            }
426                        }
427                        b'0'..=b'7' => {
428                            let mut octal = vec![escaped];
429                            for _ in 0..2 {
430                                match self.current() {
431                                    Some(next @ b'0'..=b'7') => {
432                                        octal.push(next);
433                                        self.position += 1;
434                                    }
435                                    _ => break,
436                                }
437                            }
438                            // PDF spec: octal value is taken modulo 256
439                            let value =
440                                u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
441                                    .unwrap_or(0);
442                            output.push((value % 256) as u8);
443                        }
444                        other => output.push(other),
445                    }
446                }
447                b'(' => {
448                    depth += 1;
449                    output.push(byte);
450                }
451                b')' => {
452                    depth -= 1;
453                    if depth == 0 {
454                        return Ok(PdfValue::String(PdfString(output)));
455                    }
456                    output.push(byte);
457                }
458                _ => output.push(byte),
459            }
460        }
461        Err(PdfError::Parse("unterminated literal string".to_string()))
462    }
463
464    fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
465        self.position += 1;
466        let start = self.position;
467        while self.current() != Some(b'>') {
468            if self.eof() {
469                return Err(PdfError::Parse("unterminated hex string".to_string()));
470            }
471            self.position += 1;
472        }
473        let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
474            .chars()
475            .filter(|character| !character.is_whitespace())
476            .collect::<String>();
477        self.position += 1;
478        let mut chars = raw.chars().collect::<Vec<_>>();
479        if chars.len() % 2 != 0 {
480            chars.push('0');
481        }
482        let mut bytes = Vec::with_capacity(chars.len() / 2);
483        for pair in chars.chunks(2) {
484            let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
485                .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
486            bytes.push(value);
487        }
488        Ok(PdfValue::String(PdfString(bytes)))
489    }
490
491    fn parse_array(&mut self) -> PdfResult<PdfValue> {
492        self.position += 1;
493        let mut values = Vec::new();
494        loop {
495            self.skip_ws_and_comments();
496            match self.current() {
497                Some(b']') => {
498                    self.position += 1;
499                    break;
500                }
501                Some(_) => values.push(self.parse_value()?),
502                None => return Err(PdfError::Parse("unterminated array".to_string())),
503            }
504        }
505        Ok(PdfValue::Array(values))
506    }
507
508    fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
509        self.position += 2;
510        let mut dictionary = PdfDictionary::new();
511        loop {
512            self.skip_ws_and_comments();
513            if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
514                self.position += 2;
515                break;
516            }
517            let key = match self.parse_name()? {
518                PdfValue::Name(name) => name,
519                _ => unreachable!(),
520            };
521            let value = self.parse_value()?;
522            dictionary.insert(key, value);
523        }
524        Ok(PdfValue::Dictionary(dictionary))
525    }
526
527    fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
528        let first_token = self.parse_token()?;
529        if first_token.contains('.') || first_token.contains(['e', 'E']) {
530            return first_token
531                .parse::<f64>()
532                .map(PdfValue::Number)
533                .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
534        }
535
536        let checkpoint = self.position;
537        self.skip_ws_and_comments();
538        if let Ok(second_token) = self.parse_token() {
539            self.skip_ws_and_comments();
540            if self.current() == Some(b'R')
541                && second_token
542                    .chars()
543                    .all(|character| character.is_ascii_digit())
544            {
545                self.position += 1;
546                return Ok(PdfValue::Reference(ObjectRef::new(
547                    first_token
548                        .parse::<u32>()
549                        .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
550                    second_token
551                        .parse::<u16>()
552                        .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
553                )));
554            }
555        }
556        self.position = checkpoint;
557        first_token
558            .parse::<i64>()
559            .map(PdfValue::Integer)
560            .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
561            .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
562    }
563}
564
/// Returns `true` for the six bytes this parser treats as whitespace:
/// NUL, horizontal tab, line feed, form feed, carriage return and space.
fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0C, b'\r', b' '];
    WHITESPACE.contains(&byte)
}
568
/// Returns `true` for the bytes that end a token without being whitespace:
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
fn is_delimiter(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&byte)
}
575
#[cfg(test)]
mod tests {
    use super::parse_pdf;
    use crate::error::PdfError;
    use crate::types::PdfObject;

    /// The plain single-page fixture parses and reports one page.
    #[test]
    fn parses_simple_pdf_fixture() {
        let fixture = include_bytes!("../../../tests/fixtures/simple-text.pdf");
        let parsed = parse_pdf(fixture).expect("fixture should parse");
        assert_eq!(parsed.pages.len(), 1);
    }

    /// With an incremental update appended, the newest revision of the page
    /// content stream must be the one that ends up in the parsed document.
    #[test]
    fn parses_incremental_update_fixture() {
        let fixture = include_bytes!("../../../tests/fixtures/incremental-update.pdf");
        let parsed = parse_pdf(fixture).expect("incremental fixture should parse");
        assert_eq!(parsed.pages.len(), 1);

        let content_refs = &parsed.pages[0].content_refs;
        assert!(!content_refs.is_empty());

        // The first content stream of the page is expected to hold the
        // updated text rather than the text from the original revision.
        let text = match parsed.file.objects.get(&content_refs[0]).unwrap() {
            PdfObject::Stream(stream) => String::from_utf8_lossy(&stream.data),
            _ => panic!("expected stream object for page content"),
        };
        assert!(
            text.contains("Updated Secret"),
            "content stream should contain updated text"
        );
        assert!(
            !text.contains("Original Secret"),
            "content stream should not contain original text"
        );
    }

    /// A trailer whose `Prev` points at its own xref section must be
    /// rejected instead of being followed forever.
    #[test]
    fn rejects_circular_prev_chain() {
        let mut pdf: Vec<u8> = b"%PDF-1.4\n".to_vec();

        // Object 1 (catalog) and object 2 (empty page tree).
        let catalog_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let pages_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

        // Cross-reference table covering objects 0..=2.
        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 3\n");
        pdf.extend_from_slice(b"0000000000 65535 f \n");
        for object_offset in &[catalog_offset, pages_offset] {
            pdf.extend_from_slice(format!("{:010} 00000 n \n", object_offset).as_bytes());
        }

        // The trailer's Prev refers back to this very section.
        pdf.extend_from_slice(b"trailer\n");
        pdf.extend_from_slice(
            format!("<< /Size 3 /Root 1 0 R /Prev {} >>\n", xref_offset).as_bytes(),
        );
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

        match parse_pdf(&pdf) {
            Err(PdfError::Parse(message)) => assert!(
                message.contains("circular Prev chain"),
                "expected circular chain error, got: {message}"
            ),
            other => panic!("expected Parse error, got: {other:?}"),
        }
    }
}
651}