libpdf/
parse.rs

1use crate::{PdfObject, PdfObjectIdentifier, PdfStreamObject, PdfString, PdfUntypedDictionary};
2use std::collections::HashMap;
3
/// Cursor into the byte buffer read by a `PdfParser`.
///
/// Holds the index of the byte the parser will inspect next. The parser methods take the
/// position as a separate `&mut` argument and move it forward as bytes are consumed.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct PdfReaderPosition {
    /// Zero-based index into the parser's buffer.
    index: usize,
}

impl PdfReaderPosition {
    /// Creates a position that points at the first byte (index 0) of a buffer.
    ///
    /// Equivalent to `PdfReaderPosition::default()`, but usable in `const` contexts.
    #[must_use]
    pub const fn new() -> Self {
        Self { index: 0 }
    }
}
14
/// Low-level tokenizer and object parser over a borrowed PDF byte buffer.
///
/// The parser itself is stateless: the read position is kept in a separate
/// `PdfReaderPosition` that is passed to every parsing method, and all returned slices
/// borrow from `buffer`.
#[derive(Debug, Clone)]
pub struct PdfParser<'a> {
    /// The complete input being parsed.
    buffer: &'a [u8],
}
18
19impl<'a> PdfParser<'a> {
20    #[must_use]
21    pub const fn new(buffer: &'a [u8]) -> PdfParser<'a> {
22        PdfParser { buffer }
23    }
24
25    /// Advance the position to after the next newline.
26    pub fn read_line(&self, position: &mut PdfReaderPosition) -> &[u8] {
27        let start_position = position.index;
28
29        let last_byte = loop {
30            let current_byte = self.current_value(position);
31            if current_byte == b'\n' || current_byte == b'\r' {
32                break current_byte;
33            } else if !self.advance_position(position) {
34                break b'0';
35            };
36            // Continue.
37        };
38
39        self.advance_position(position);
40
41        let slice = &self.buffer[start_position..position.index];
42
43        if last_byte == b'\r' {
44            self.advance_position_if_next(b'\n', position)
45        }
46
47        slice
48    }
49
50    fn advance_position(&self, position: &mut PdfReaderPosition) -> bool {
51        if position.index < self.buffer.len() - 1 {
52            position.index += 1;
53            true
54        } else {
55            false
56        }
57    }
58
59    fn advance_position_if_next(&self, next: u8, position: &mut PdfReaderPosition) {
60        if position.index < self.buffer.len() - 1 && self.current_value(position) == next {
61            position.index += 1;
62        }
63    }
64
65    pub fn next_object(&self, position: &mut PdfReaderPosition) -> Option<PdfObject<'a>> {
66        let next_word = self.next_word(position)?;
67        if next_word == b"true" {
68            return Some(PdfObject::Boolean(true));
69        } else if next_word == b"false" {
70            return Some(PdfObject::Boolean(false));
71        } else if next_word == b"null" {
72            return Some(PdfObject::Null);
73        } else if next_word
74            .iter()
75            .enumerate()
76            .all(|(idx, b)| b.is_ascii_digit() || (idx == 0 && matches!(b, b'+' | b'-')))
77        {
78            // TODO: use from_utf8 here?
79            // TODO: Error handling (too big integer to fit in i32)
80            let first_number = std::str::from_utf8(next_word).ok()?.parse::<i32>().ok()?;
81
82            // This might be a object reference: "<obj> <gen> R".
83            let index_after_first = position.index;
84            if let Some(second_number) = self.parse_next::<u16>(position) {
85                let third_word = self.next_word(position);
86                if third_word == Some(b"R") {
87                    // TODO: Check if numbers fits in u16
88                    return Some(PdfObject::Reference(PdfObjectIdentifier::new(
89                        first_number as u16,
90                        second_number,
91                    )));
92                }
93                // We might be reading an integer array.
94                position.index = index_after_first;
95            } else {
96                position.index = index_after_first;
97            }
98
99            return Some(PdfObject::Integer(first_number));
100        } else if next_word == b"/" {
101            let current_byte = self.current_value(position);
102            let name_word = if Self::is_whitespace(current_byte) || Self::is_delimiter(current_byte)
103            {
104                return Some(PdfObject::Name(
105                    &self.buffer[position.index..position.index],
106                ));
107            } else {
108                self.next_word(position)?
109            };
110            return if Self::is_delimiter(name_word[0]) {
111                // TODO: Return error?
112                None
113            } else {
114                Some(PdfObject::Name(name_word))
115            };
116        } else if next_word == b"[" {
117            let mut array = Vec::new();
118            // FIXME: Validate array ends with ']'
119            while let Some(object) = self.next_object(position) {
120                array.push(object);
121            }
122            return Some(PdfObject::Array(array));
123        } else if next_word == b"<<" {
124            let mut map = HashMap::new();
125            // FIXME: Do not consume other type of object if it's out of place (a "non-name key")
126            // FIXME: Validate dict ends with '>>'
127            while let Some(PdfObject::Name(key)) = self.next_object(position) {
128                let value = self.next_object(position)?;
129                map.insert(key, value);
130            }
131
132            let index_after_dict = position.index;
133            let next_word = self.next_word(position);
134            if next_word == Some(b"stream") {
135                if self.current_value(position) == b'\r' && self.buffer[position.index + 1] == b'\n'
136                {
137                    position.index += 2;
138                } else if self.current_value(position) == b'\n' {
139                    position.index += 1;
140                } else {
141                    unimplemented!("'stream' not followed by \r\n or \n");
142                }
143
144                if let Some(PdfObject::Integer(length)) = map.get(&b"Length"[..]) {
145                    // Section 7.3.8: "The keyword stream that follows the stream dictionary shall
146                    // be followed by an end-of-line marker consisting of either a CARRIAGE RETURN ´
147                    // and a LINE FEED or just a LINE FEED, and not by a CARRIAGE RETURN alone. The
148                    // sequence of bytes that make up a stream lie between the end-of-line marker
149                    // following the stream keyword and the endstream keyword; the stream dictionary
150                    // specifies the exact number of bytes. There should be an end-of-line marker
151                    // after the data and before endstream; this marker shall not be included in the
152                    // stream length. There shall not be any extra bytes, other than white space,
153                    // between endstream and endobj."
154                    // TODO: Verify stream length positive
155                    let stream_length = *length as usize;
156                    let bytes = &self.buffer[position.index..(position.index + stream_length)];
157                    position.index += stream_length;
158                    // TODO: Verify out of bounds
159                    let stream_object = PdfStreamObject {
160                        dictionary: PdfUntypedDictionary::new(map),
161                        bytes,
162                    };
163                    let next_word = self.next_word(position)?;
164                    if next_word == b"endstream" {
165                        return Some(PdfObject::Stream(stream_object));
166                    }
167                    unimplemented!("Stream not ended with 'endstream'");
168                } else {
169                    unimplemented!("stream without /Length in dictionary");
170                }
171            } else {
172                position.index = index_after_dict;
173            }
174
175            return Some(PdfObject::Dictionary(PdfUntypedDictionary::new(map)));
176        } else if next_word == b"(" {
177            // Start of string.
178            let start_position = position.index;
179            let mut last_was_slash = false;
180            while (last_was_slash, self.current_value(position)) != (false, b')') {
181                last_was_slash = self.current_value(position) == b'\\';
182                if !self.advance_position(position) {
183                    return None;
184                }
185            }
186            let slice = &self.buffer[start_position..position.index];
187            if !self.advance_position(position) {
188                return None;
189            }
190            return Some(PdfObject::String(PdfString::new_literal(slice)));
191        } else if next_word == b"<" {
192            // Start of hexadecimal string.
193            // TODO: Decode (on demand?)
194            let start_position = position.index;
195            while self.current_value(position) != b'>' {
196                if !self.advance_position(position) {
197                    return None;
198                }
199            }
200            let slice = &self.buffer[start_position..position.index];
201            if !self.advance_position(position) {
202                return None;
203            }
204            return Some(PdfObject::String(PdfString::new_hexadecimal(slice)));
205        }
206
207        if let Ok(float_value) = std::str::from_utf8(next_word).ok()?.parse::<f32>() {
208            return Some(PdfObject::Real(float_value));
209        }
210
211        None
212    }
213
214    pub fn next_indirect_object(
215        &self,
216        position: &mut PdfReaderPosition,
217    ) -> Option<(PdfObjectIdentifier, PdfObject<'a>)> {
218        let object_identifier = self.parse_next(position)?;
219        let generation_number = self.parse_next(position)?;
220
221        if self.next_word(position) != Some(b"obj") {
222            return None;
223        }
224        let object = self.next_object(position)?;
225        let w = self.next_word(position);
226        if w != Some(b"endobj") {
227            return None;
228        }
229
230        let object_identifier = PdfObjectIdentifier::new(object_identifier, generation_number);
231        Some((object_identifier, object))
232    }
233
234    fn parse_next<T: core::str::FromStr>(&self, position: &mut PdfReaderPosition) -> Option<T> {
235        let next_word = self.next_word(position)?;
236        // TODO: Do not panic below. Avoid from_utf8?
237        let next_str = std::str::from_utf8(next_word).ok()?;
238        next_str.parse::<T>().ok()
239    }
240
/// Returns `true` if `byte` is a white-space character of the PDF file format:
/// NUL, HORIZONTAL TAB, LINE FEED, FORM FEED, CARRIAGE RETURN or SPACE.
///
/// See PDF 32000-1:2008, 7.2.2 Character Set.
pub const fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ')
}
247
/// Returns `true` if `byte` is one of the PDF delimiter characters
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
///
/// From 7.2.2 Character Set: the delimiter characters "delimit syntactic entities such as
/// arrays, names, and comments. Any of these characters terminates the entity preceding it
/// and is not included in the entity."
const fn is_delimiter(byte: u8) -> bool {
    matches!(
        byte,
        b'%' | b'/' | b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}'
    )
}
265
266    const fn at_eof(&self, position: &PdfReaderPosition) -> bool {
267        position.index >= self.buffer.len()
268    }
269
270    const fn current_value(&self, position: &PdfReaderPosition) -> u8 {
271        self.buffer[position.index]
272    }
273
274    /// Next word or None if no word follows (end of file or delimiter).
275    pub fn next_word(&self, position: &mut PdfReaderPosition) -> Option<&'a [u8]> {
276        loop {
277            let current_char = self.current_value(position);
278            if Self::is_whitespace(current_char) {
279                if !self.advance_position(position) {
280                    return None;
281                }
282            } else if current_char == b'%' {
283                self.read_line(position);
284                if self.at_eof(position) {
285                    return None;
286                }
287            } else {
288                break;
289            }
290        }
291
292        let start_index = position.index;
293        let start_char = self.current_value(position);
294        let start_char_is_delimiter = Self::is_delimiter(start_char);
295        if start_char_is_delimiter {
296            if !self.advance_position(position) {
297                // TODO: Return EOF error
298                return None;
299            }
300            let next_char = self.current_value(position);
301            if (start_char == b'<' && next_char == b'<')
302                || (start_char == b'>' && next_char == b'>')
303            {
304                // The double delimiters << and >> are special - see section 7.2.3 of the PDF 1.7
305                // specification.
306                if !self.advance_position(position) {
307                    // TODO: Return EOF error
308                    return None;
309                }
310            }
311            return Some(&self.buffer[start_index..position.index]);
312        }
313
314        loop {
315            let current_char = self.current_value(position);
316            if Self::is_whitespace(current_char) || Self::is_delimiter(current_char) {
317                break;
318            } else if !self.advance_position(position) {
319                return None;
320            }
321        }
322
323        Some(&self.buffer[start_index..position.index])
324    }
325}
326
327#[cfg(test)]
328mod tests {
329    use crate::parse::{PdfParser, PdfReaderPosition};
330    use crate::{
331        PdfDocument, PdfDocumentData, PdfFormField, PdfObject, PdfObjectIdentifier, PdfString,
332        PdfVersion,
333    };
334
/// Input that does not start with the `%PDF-` header is rejected with a descriptive error.
#[test]
fn parse_junk() {
    let error = PdfDocumentData::parse(b"hello world").err();
    assert_eq!(
        error.unwrap(),
        String::from("File not starting with '%PDF-'")
    );
}
344
/// Parses the minimal example PDF (with LF and with CRLF line endings) and checks every
/// object and the trailer.
#[test]
fn parse_minimal() {
    /// Object `1 0`: the document catalog.
    fn assert_minimal_first_object(pdf: &PdfDocumentData) {
        let dict = match pdf.objects.get(&PdfObjectIdentifier::new(1, 0)) {
            Some(PdfObject::Dictionary(dict)) => dict,
            _ => panic!("Object 1 0 was not parsed into a dictionary"),
        };
        assert_eq!(dict.map.len(), 2);
        assert_eq!(
            dict.map.get(&b"Type"[..]),
            Some(&PdfObject::Name(b"Catalog"))
        );
        assert_eq!(
            dict.map.get(&b"Pages"[..]),
            Some(&PdfObject::Reference(PdfObjectIdentifier::new(2, 0)))
        );
    }

    /// Object `2 0`: the page tree root.
    fn assert_minimal_second_object(pdf: &PdfDocumentData) {
        let dict = match pdf.objects.get(&PdfObjectIdentifier::new(2, 0)) {
            Some(PdfObject::Dictionary(dict)) => dict,
            _ => panic!("Object 2 0 was not parsed into a dictionary"),
        };
        assert_eq!(dict.map.len(), 4);
        assert_eq!(dict.map.get(&b"Type"[..]), Some(&PdfObject::Name(b"Pages")));

        let expected_kids =
            PdfObject::Array(vec![PdfObject::Reference(PdfObjectIdentifier::new(3, 0))]);
        assert_eq!(dict.map.get(&b"Kids"[..]), Some(&expected_kids));

        assert_eq!(dict.map.get(&b"Count"[..]), Some(&PdfObject::Integer(1)));

        let expected_media_box = PdfObject::Array(vec![
            PdfObject::Integer(0),
            PdfObject::Integer(0),
            PdfObject::Integer(300),
            PdfObject::Integer(144),
        ]);
        assert_eq!(dict.map.get(&b"MediaBox"[..]), Some(&expected_media_box));
    }

    /// Object `3 0`: the single page, including its nested resource dictionaries.
    fn assert_minimal_third_object(pdf: &PdfDocumentData) {
        let dict = match pdf.objects.get(&PdfObjectIdentifier::new(3, 0)) {
            Some(PdfObject::Dictionary(dict)) => dict,
            _ => panic!("Object 3 0 was not parsed into a dictionary"),
        };
        assert_eq!(dict.map.len(), 4);
        assert_eq!(dict.map.get(&b"Type"[..]), Some(&PdfObject::Name(b"Page")));
        assert_eq!(
            dict.map.get(&b"Parent"[..]),
            Some(&PdfObject::Reference(PdfObjectIdentifier::new(2, 0)))
        );
        assert_eq!(
            dict.map.get(&b"Contents"[..]),
            Some(&PdfObject::Reference(PdfObjectIdentifier::new(4, 0)))
        );

        let resources_dict = match dict.map.get(&b"Resources"[..]) {
            Some(PdfObject::Dictionary(resources_dict)) => resources_dict,
            _ => panic!("Failed to parse /Resources dictionary"),
        };
        assert_eq!(resources_dict.map.len(), 1);

        let font_dict = match resources_dict.map.get(&b"Font"[..]) {
            Some(PdfObject::Dictionary(font_dict)) => font_dict,
            _ => panic!("Failed to parse /Resources->/Font dictionary"),
        };
        assert_eq!(font_dict.map.len(), 1);

        let f1_dict = match font_dict.map.get(&b"F1"[..]) {
            Some(PdfObject::Dictionary(f1_dict)) => f1_dict,
            _ => panic!("Failed to parse /Resources->/Font->/F1 dictionary"),
        };
        assert_eq!(f1_dict.map.len(), 3);
        assert_eq!(
            f1_dict.map.get(&b"Type"[..]),
            Some(&PdfObject::Name(b"Font"))
        );
        assert_eq!(
            f1_dict.map.get(&b"Subtype"[..]),
            Some(&PdfObject::Name(b"Type1"))
        );
        assert_eq!(
            f1_dict.map.get(&b"BaseFont"[..]),
            Some(&PdfObject::Name(b"Times-Roman"))
        );
    }

    /// Object `4 0`: the content stream; its bytes depend on the file's line endings.
    fn assert_minimal_fourth_object(pdf: &PdfDocumentData, unix_line_endings: bool) {
        let stream = match pdf.objects.get(&PdfObjectIdentifier::new(4, 0)) {
            Some(PdfObject::Stream(stream)) => stream,
            _ => panic!("Object 4 0 was not parsed into a stream"),
        };
        assert_eq!(stream.dictionary.map.len(), 1);

        let (expected_length, expected_bytes) = if unix_line_endings {
            (
                55,
                &b"  BT\n    /F1 18 Tf\n    0 0 Td\n    (Hello World) Tj\n  ET"[..],
            )
        } else {
            (
                59,
                &b"  BT\r\n    /F1 18 Tf\r\n    0 0 Td\r\n    (Hello World) Tj\r\n  ET"[..],
            )
        };
        assert_eq!(stream.bytes.len(), expected_length);
        assert_eq!(stream.bytes, expected_bytes);
    }

    /// Checks version, object count, all four objects and the trailer of one file.
    fn assert_minimal_file(pdf_bytes: &[u8], unix_line_endings: bool) {
        // Minimal PDF file explained at https://brendanzagaeski.appspot.com/0004.html
        assert!(pdf_bytes.starts_with(b"%PDF-1.1"));

        let pdf = PdfDocumentData::parse(pdf_bytes).unwrap();
        assert_eq!(pdf.version, PdfVersion::Version11);
        assert_eq!(pdf.objects.len(), 4);

        assert_minimal_first_object(&pdf);
        assert_minimal_second_object(&pdf);
        assert_minimal_third_object(&pdf);
        assert_minimal_fourth_object(&pdf, unix_line_endings);

        let trailer = &pdf.trailer.map;
        assert_eq!(
            trailer.get(&b"Root"[..]),
            Some(&PdfObject::Reference(PdfObjectIdentifier::new(1, 0)))
        );
        assert_eq!(trailer.get(&b"Size"[..]), Some(&PdfObject::Integer(5)));
    }

    assert_minimal_file(include_bytes!("tests/assets/minimal.pdf"), true);
    assert_minimal_file(include_bytes!("tests/assets/minimal_crlf_l.pdf"), false);
}
499
/// Parses a signed example document and checks that its single form field is a signature
/// field that carries a signature.
#[test]
fn parse_signed_signicat() {
    let pdf_bytes = include_bytes!("tests/assets/signed-by-signicat-example.pdf");

    let document_data = PdfDocumentData::parse(pdf_bytes).unwrap();
    assert_eq!(document_data.version, PdfVersion::Version17);
    assert_eq!(document_data.objects.len(), 110);

    let document = PdfDocument::parse(pdf_bytes).unwrap();
    let fields = document.catalog.interactive_form.unwrap().fields;
    assert_eq!(fields.len(), 1);

    match &fields[0] {
        PdfFormField::Signature(signature_field) if signature_field.signature.is_some() => {}
        _ => panic!("Failed parsing signature"),
    }
}
517
/// Spot-checks the white-space classification for a few white-space and regular bytes.
#[test]
fn test_whitespace() {
    for byte in [b' ', b'\n', b'\r'] {
        assert!(PdfParser::is_whitespace(byte));
    }
    for byte in [b'a', b'.', b'!'] {
        assert!(!PdfParser::is_whitespace(byte));
    }
}
527
/// `read_line` must stop right after an LF, CR or CRLF end-of-line marker.
#[test]
fn test_read_line() {
    /// Reads one line from the start of `bytes` and checks where the position ends up.
    fn assert_line_end(bytes: &[u8], expected_index: usize) {
        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
        assert_eq!(position.index, 0);
        parser.read_line(&mut position);
        assert_eq!(position.index, expected_index);
        assert_eq!(parser.current_value(&position), b'4');
    }

    assert_line_end(b"123\n456", 4);
    assert_line_end(b"123\r456", 4);
    assert_line_end(b"123\r\n456", 5);
}
549
/// Tokenizing: white space and comments are skipped, delimiters end a word, and a word
/// that touches the end of the input is not returned.
#[test]
fn test_next_word() {
    /// Calls `next_word` once per entry of `expected` and compares the results in order.
    fn assert_words(bytes: &[u8], expected: &[Option<&[u8]>]) {
        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
        for expected_word in expected {
            assert_eq!(parser.next_word(&mut position), *expected_word);
        }
    }

    let obj = Some(&b"obj"[..]);
    let endobj = Some(&b"endobj"[..]);
    let dict_start = Some(&b"<<"[..]);

    assert_words(b"obj ", &[obj]);
    assert_words(b"   obj<", &[obj]);
    assert_words(b"   obj  ", &[obj]);
    assert_words(b"   obj<<", &[obj, None]);
    assert_words(b"   obj    endobj ", &[obj, endobj, None]);
    assert_words(b"  % a comment\n   obj    endobj ", &[obj, endobj, None]);
    assert_words(b" obj<< ", &[obj, dict_start, None]);
    assert_words(b" obj \n % a comment\n<< ", &[obj, dict_start, None]);
}
593
/// Parsing of booleans, reals, integers, names and consecutive dictionaries.
#[test]
fn test_next_object() {
    /// Parses the first object of `bytes` and compares it with `expected`.
    fn assert_first_object<'a>(bytes: &'a [u8], expected: PdfObject<'a>) {
        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
        assert_eq!(parser.next_object(&mut position), Some(expected));
    }

    assert_first_object(b"true ", PdfObject::Boolean(true));
    assert_first_object(b"false ", PdfObject::Boolean(false));
    assert_first_object(b"6.14 ", PdfObject::Real(6.14));

    // Two objects in a row.
    let (parser, mut position) = (PdfParser::new(b"false false "), PdfReaderPosition::new());
    for _ in 0..2 {
        assert_eq!(
            parser.next_object(&mut position),
            Some(PdfObject::Boolean(false))
        );
    }

    assert_first_object(b" 0 ", PdfObject::Integer(0));
    assert_first_object(b" -1 ", PdfObject::Integer(-1));
    assert_first_object(b" +1 ", PdfObject::Integer(1));
    assert_first_object(b" 2147483647 ", PdfObject::Integer(2_147_483_647));
    assert_first_object(b" -2147483648 ", PdfObject::Integer(-2_147_483_648));
    assert_first_object(b"/Type ", PdfObject::Name(b"Type"));

    // Dictionaries, a comment and an integer following each other.
    let bytes = b" << /Type /Catalog /Value 1 >> << /Name /Value >> % A comment\n999 << /OtherName /OtherValue >> ";
    let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());

    let first = match parser.next_object(&mut position) {
        Some(PdfObject::Dictionary(dictionary)) => dictionary,
        _ => panic!("Failed to parse dict"),
    };
    assert_eq!(first.map.len(), 2);
    assert_eq!(
        first.map.get(&b"Type"[..]),
        Some(&PdfObject::Name(b"Catalog"))
    );
    assert_eq!(first.map.get(&b"Value"[..]), Some(&PdfObject::Integer(1)));

    let second = match parser.next_object(&mut position) {
        Some(PdfObject::Dictionary(dictionary)) => dictionary,
        _ => panic!("Failed to parse dict"),
    };
    assert_eq!(second.map.len(), 1);
    assert_eq!(
        second.map.get(&b"Name"[..]),
        Some(&PdfObject::Name(b"Value"))
    );

    assert_eq!(
        parser.next_object(&mut position),
        Some(PdfObject::Integer(999))
    );

    let third = match parser.next_object(&mut position) {
        Some(PdfObject::Dictionary(dictionary)) => dictionary,
        _ => panic!("Failed to parse dict"),
    };
    assert_eq!(third.map.len(), 1);
    assert_eq!(
        third.map.get(&b"OtherName"[..]),
        Some(&PdfObject::Name(b"OtherValue"))
    );
}
700
/// Dictionaries written without white space between tokens, and a dictionary holding a
/// hexadecimal string.
#[test]
fn test_parse_dictionaries() {
    let bytes = b"<</FT/Sig/T(Signature1)/V 1 0 R/F 132/Type/Annot/Subtype/Widget/Rect[0 0 0 0]/AP<</N 2 0 R>>/P 4 0 R/DR<<>>>> ";
    let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
    let dictionary = match parser.next_object(&mut position) {
        Some(PdfObject::Dictionary(dictionary)) => dictionary,
        _ => panic!("Failed to parse dict"),
    };
    let entries = &dictionary.map;
    assert_eq!(entries.len(), 10);
    assert_eq!(entries.get(&b"FT"[..]), Some(&PdfObject::Name(b"Sig")));
    assert_eq!(
        entries.get(&b"T"[..]),
        Some(&PdfObject::String(PdfString::new_literal(b"Signature1")))
    );
    assert_eq!(
        entries.get(&b"V"[..]),
        Some(&PdfObject::Reference(PdfObjectIdentifier::new(1, 0)))
    );
    assert_eq!(entries.get(&b"F"[..]), Some(&PdfObject::Integer(132)));
    assert_eq!(entries.get(&b"Type"[..]), Some(&PdfObject::Name(b"Annot")));

    let bytes = b"<</Contents <ff>/Reference 1>> ";
    let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
    let dictionary = match parser.next_object(&mut position) {
        Some(PdfObject::Dictionary(dictionary)) => dictionary,
        _ => panic!("Failed to parse dict"),
    };
    let entries = &dictionary.map;
    assert_eq!(entries.len(), 2);
    assert_eq!(
        entries.get(&b"Contents"[..]),
        // TODO:
        Some(&PdfObject::String(PdfString::new_hexadecimal(b"ff")))
    );
    assert_eq!(
        entries.get(&b"Reference"[..]),
        Some(&PdfObject::Integer(1))
    );
}
748
749    #[test]
750    fn test_parse_dictionary_mapping_to_empty_name() {
751        for bytes in [
752            &b"<< /App << /Name / >> >> "[..],
753            &b"<</App<</Name/>>>> "[..],
754        ] {
755            let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
756            if let Some(PdfObject::Dictionary(dictionary)) = parser.next_object(&mut position) {
757                assert_eq!(dictionary.map.len(), 1);
758                if let Some(PdfObject::Dictionary(dictionary)) = dictionary.map.get(&b"App"[..]) {
759                    assert_eq!(dictionary.map.len(), 1);
760                    assert_eq!(
761                        dictionary.map.get(&b"Name"[..]),
762                        Some(&PdfObject::Name(b""))
763                    );
764                    return;
765                }
766            }
767            panic!("Failed to parse dict");
768        }
769    }
770
771    /// Test that "A dictionary entry whose value is null (see 7.3.9, 'Null object') shall be
772    /// treated the same as if the entry does not exist".
773    ///
774    /// From 7.3.7 Dictionary objects:
775    /// <https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf#page=26>
776    ///
777    /// TODO: Ensure this behaviour also when value is indirect reference to non-existing object.
778    /// TODO: Perhaps by having this logic on a dictionary lookup method?
779    #[test]
780    #[ignore]
781    fn test_parse_dictionary_with_null_value() {
782        let bytes = b"<< /FirstKey 1 /SecondKey null /ThirdKey 3 >> ";
783        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
784        if let Some(PdfObject::Dictionary(dictionary)) = parser.next_object(&mut position) {
785            assert_eq!(dictionary.map.len(), 2);
786            assert_eq!(
787                dictionary.map.get(&b"FirstKey"[..]),
788                Some(&PdfObject::Integer(1))
789            );
790            assert_eq!(
791                dictionary.map.get(&b"SecondKey"[..]),
792                Some(&PdfObject::Integer(3))
793            );
794            return;
795        }
796        panic!("Failed to parse dict");
797    }
798
799    #[test]
800    fn test_parse_null() {
801        let bytes = b"null ";
802        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
803        assert_eq!(parser.next_object(&mut position), Some(PdfObject::Null));
804    }
805
806    #[test]
807    fn test_parse_names() {
808        fn assert_parsing(from: &[u8], expected_name: &[u8]) {
809            let (parser, mut position) = (PdfParser::new(from), PdfReaderPosition::new());
810            assert_eq!(
811                parser.next_object(&mut position),
812                Some(PdfObject::Name(expected_name))
813            );
814        }
815
816        assert_parsing(b"/Name1 ", b"Name1");
817        assert_parsing(b"/ASomewhatLongerName ", b"ASomewhatLongerName");
818        assert_parsing(
819            b"/A;Name_With-Various***Characters? ",
820            b"A;Name_With-Various***Characters?",
821        );
822        assert_parsing(b"/1.2 ", b"1.2");
823        assert_parsing(b"/$$ ", b"$$");
824        assert_parsing(b"/@pattern ", b"@pattern");
825        assert_parsing(b"/.notdef ", b".notdef");
826        // TODO: assert_parsing(b"/Lime#20Green ", b"Lime Green");
827        // TODO: assert_parsing(b"/paired#28#29parentheses ", b"paired( )parentheses");
828        // TODO: assert_parsing(b"/paired#28#29parentheses ", b"paired( )parentheses");
829        // TODO: assert_parsing(b"/The_Key_of_F#23_Minor ", b"The_Key_of_F#_Minor");
830        // TODO: assert_parsing(b"/A#42 ", b"AB");
831        // TODO: assert_parsing(b"/ ", b"");
832    }
833
834    #[test]
835    fn test_parse_strings() {
836        let bytes = b"() ";
837        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
838        assert_eq!(
839            parser.next_object(&mut position),
840            Some(PdfObject::String(PdfString::new_literal(b"")))
841        );
842
843        let bytes = b"(hello, world) (second string)(third) ";
844        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
845        assert_eq!(
846            parser.next_object(&mut position),
847            Some(PdfObject::String(PdfString::new_literal(b"hello, world")))
848        );
849        assert_eq!(
850            parser.next_object(&mut position),
851            Some(PdfObject::String(PdfString::new_literal(b"second string")))
852        );
853        assert_eq!(
854            parser.next_object(&mut position),
855            Some(PdfObject::String(PdfString::new_literal(b"third")))
856        );
857
858        let bytes = b"(hello \\(world\\) bye) ";
859        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
860        assert_eq!(
861            parser.next_object(&mut position),
862            // TODO: Unescape backslash (on demand?)
863            Some(PdfObject::String(PdfString::new_literal(
864                b"hello \\(world\\) bye"
865            )))
866        );
867    }
868
869    #[test]
870    fn test_parse_arrays() {
871        let bytes = b"[549 6.14 false (Ralph) /SomeName ]";
872        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
873        if let Some(PdfObject::Array(array)) = parser.next_object(&mut position) {
874            assert_eq!(array.len(), 5);
875            assert_eq!(array[0], PdfObject::Integer(549));
876            assert_eq!(array[1], PdfObject::Real(6.14));
877            assert_eq!(array[2], PdfObject::Boolean(false));
878            assert_eq!(
879                array[3],
880                PdfObject::String(PdfString::new_literal(b"Ralph"))
881            );
882            assert_eq!(array[4], PdfObject::Name(b"SomeName"));
883        } else {
884            panic!("Failed to parse array");
885        }
886
887        let bytes = b"[] ";
888        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
889        if let Some(PdfObject::Array(array)) = parser.next_object(&mut position) {
890            assert_eq!(array.len(), 0);
891        } else {
892            panic!("Failed to parse empty array");
893        }
894    }
895
896    #[test]
897    fn test_parse_streams() {
898        let bytes = b"4 1 obj
899  << /Length 55 >>
900stream
901  BT
902    /F1 18 Tf
903    0 0 Td
904    (Hello World) Tj
905  ET
906endstream
907endobj ";
908        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
909        let (parsed_id, object) = parser.next_indirect_object(&mut position).unwrap();
910        assert_eq!(parsed_id, PdfObjectIdentifier::new(4, 1));
911        if let PdfObject::Stream(stream) = object {
912            assert_eq!(stream.dictionary.map.len(), 1);
913            assert_eq!(stream.bytes.len(), 55);
914            assert_eq!(
915                stream.bytes,
916                b"  BT
917    /F1 18 Tf
918    0 0 Td
919    (Hello World) Tj
920  ET"
921            );
922        } else {
923            panic!("Failed parsing stream");
924        }
925    }
926
927    #[test]
928    fn test_parse_next() {
929        let bytes = b"32 ";
930        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
931        assert_eq!(parser.parse_next(&mut position), Some(32));
932
933        let bytes = b" 32 ";
934        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
935        assert_eq!(parser.parse_next(&mut position), Some(32));
936
937        let bytes = b" 32 % a comment\n   33 34 ";
938        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
939        assert_eq!(parser.parse_next(&mut position), Some(32));
940        assert_eq!(parser.parse_next(&mut position), Some(33));
941        assert_eq!(parser.parse_next(&mut position), Some(34));
942    }
943
944    #[test]
945    fn test_next_indirect_object() {
946        let bytes = b"8 0 obj\n 77\nendobj ";
947        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
948        let id = PdfObjectIdentifier::new(8, 0);
949        let object = PdfObject::Integer(77);
950        assert_eq!(
951            parser.next_indirect_object(&mut position),
952            Some((id, object))
953        );
954
955        let bytes = b"7 2 obj 62 endobj ";
956        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
957        let parsed = parser.next_indirect_object(&mut position);
958        let id = PdfObjectIdentifier::new(7, 2);
959        let object = PdfObject::Integer(62);
960        assert_eq!(parsed, Some((id, object)));
961
962        let bytes = b"7 0 obj 62 endobj\n\n8 0 obj 63 endobj ";
963        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
964        let id = PdfObjectIdentifier::new(7, 0);
965        let object = PdfObject::Integer(62);
966        assert_eq!(
967            parser.next_indirect_object(&mut position),
968            Some((id, object))
969        );
970        let id = PdfObjectIdentifier::new(8, 0);
971        let object = PdfObject::Integer(63);
972        assert_eq!(
973            parser.next_indirect_object(&mut position),
974            Some((id, object))
975        );
976
977        let bytes = b"3 0 obj
978  <<  /Type /Page
979      /Parent 2 0 R
980      /Resources
981       << /Font
982           << /F1
983               << /Type /Font
984                  /Subtype /Type1
985                  /BaseFont /Times-Roman
986               >>
987           >>
988       >>
989      /Contents 4 1 R
990  >>
991endobj\n\n4 1 obj\n765\nendobj ";
992        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
993        let expected_id = PdfObjectIdentifier::new(3, 0);
994        let (parsed_id, object) = parser.next_indirect_object(&mut position).unwrap();
995        assert_eq!(parsed_id, expected_id);
996        if let PdfObject::Dictionary(dict) = object {
997            assert_eq!(dict.map.len(), 4);
998            assert_eq!(dict.map.get(&b"Type"[..]), Some(&PdfObject::Name(b"Page")));
999            assert_eq!(
1000                dict.map.get(&b"Parent"[..]),
1001                Some(&PdfObject::Reference(PdfObjectIdentifier::new(2, 0)))
1002            );
1003            assert_eq!(
1004                dict.map.get(&b"Contents"[..]),
1005                Some(&PdfObject::Reference(PdfObjectIdentifier::new(4, 1)))
1006            );
1007            if let Some(PdfObject::Dictionary(resources_dict)) = dict.map.get(&b"Resources"[..]) {
1008                assert_eq!(resources_dict.map.len(), 1);
1009                if let Some(PdfObject::Dictionary(font_dict)) = resources_dict.map.get(&b"Font"[..])
1010                {
1011                    assert_eq!(font_dict.map.len(), 1);
1012                    if let Some(PdfObject::Dictionary(f1_dict)) = font_dict.map.get(&b"F1"[..]) {
1013                        assert_eq!(f1_dict.map.len(), 3);
1014                        assert_eq!(
1015                            f1_dict.map.get(&b"Type"[..]),
1016                            Some(&PdfObject::Name(b"Font"))
1017                        );
1018                        assert_eq!(
1019                            f1_dict.map.get(&b"Subtype"[..]),
1020                            Some(&PdfObject::Name(b"Type1"))
1021                        );
1022                        assert_eq!(
1023                            f1_dict.map.get(&b"BaseFont"[..]),
1024                            Some(&PdfObject::Name(b"Times-Roman"))
1025                        );
1026                    } else {
1027                        panic!("Failed to parse /Resources->/Font->/F1 dictionary");
1028                    }
1029                } else {
1030                    panic!("Failed to parse /Resources->/Font dictionary");
1031                }
1032            } else {
1033                panic!("Failed to parse /Resources dictionary");
1034            }
1035        } else {
1036            panic!("Failed parsing dictionary");
1037        }
1038        let expected_id = PdfObjectIdentifier::new(4, 1);
1039        let (parsed_id, object) = parser.next_indirect_object(&mut position).unwrap();
1040        assert_eq!(parsed_id, expected_id);
1041        assert_eq!(object, PdfObject::Integer(765));
1042    }
1043
1044    #[test]
1045    fn test_parsing_indirect_object() {
1046        let bytes = b"2 0 obj
1047  << /Type /Pages
1048     /Kids [3 1 R]
1049     /Count 1
1050     /MediaBox [1 2 300 144]
1051  >>
1052endobj ";
1053        let (parser, mut position) = (PdfParser::new(bytes), PdfReaderPosition::new());
1054        let (parsed_id, object) = parser.next_indirect_object(&mut position).unwrap();
1055        assert_eq!(parsed_id, PdfObjectIdentifier::new(2, 0));
1056        if let PdfObject::Dictionary(dict) = object {
1057            assert_eq!(dict.map.len(), 4);
1058            assert_eq!(dict.map.get(&b"Type"[..]), Some(&PdfObject::Name(b"Pages")));
1059            assert_eq!(
1060                dict.map.get(&b"Kids"[..]),
1061                Some(&PdfObject::Array(vec![PdfObject::Reference(
1062                    PdfObjectIdentifier::new(3, 1)
1063                )]))
1064            );
1065            assert_eq!(dict.map.get(&b"Count"[..]), Some(&PdfObject::Integer(1)));
1066            assert_eq!(
1067                dict.map.get(&b"MediaBox"[..]),
1068                Some(&PdfObject::Array(vec![
1069                    PdfObject::Integer(1),
1070                    PdfObject::Integer(2),
1071                    PdfObject::Integer(300),
1072                    PdfObject::Integer(144)
1073                ]))
1074            );
1075        } else {
1076            panic!("Failed to parse dict");
1077        }
1078    }
1079}