// zpdf_parser/lexer.rs

1use zpdf_core::{Error, ObjectId, ParseLimits, PdfName, PdfObject, PdfString, Result};
2
/// Byte-level tokenizer that turns a PDF byte buffer into `PdfObject` values.
pub struct Lexer<'a> {
    /// Input buffer being tokenized.
    data: &'a [u8],
    /// Byte offset into `data` of the next unread byte.
    pos: usize,
    /// Limits consulted for container nesting depth and string length.
    limits: &'a ParseLimits,
    /// Number of arrays/dictionaries currently being parsed (nesting depth).
    depth: u32,
}
9
10impl<'a> Lexer<'a> {
11    pub fn new(data: &'a [u8], pos: usize, limits: &'a ParseLimits) -> Self {
12        Self {
13            data,
14            pos,
15            limits,
16            depth: 0,
17        }
18    }
19
20    /// Increment container-nesting depth, erroring if it exceeds the limit.
21    /// Call once on entry to `read_array`/`read_dict`.
22    fn enter_container(&mut self) -> Result<()> {
23        self.depth += 1;
24        if self.depth > self.limits.max_object_depth {
25            return Err(Error::RecursionLimit(self.limits.max_object_depth));
26        }
27        Ok(())
28    }
29
30    fn leave_container(&mut self) {
31        self.depth = self.depth.saturating_sub(1);
32    }
33
34    pub fn pos(&self) -> usize {
35        self.pos
36    }
37
38    pub fn set_pos(&mut self, pos: usize) {
39        self.pos = pos;
40    }
41
42    pub fn is_eof(&self) -> bool {
43        self.pos >= self.data.len()
44    }
45
46    fn peek(&self) -> Option<u8> {
47        self.data.get(self.pos).copied()
48    }
49
50    fn advance(&mut self) -> Option<u8> {
51        let b = self.data.get(self.pos).copied()?;
52        self.pos += 1;
53        Some(b)
54    }
55
56    pub fn skip_whitespace_and_comments(&mut self) {
57        loop {
58            match self.peek() {
59                Some(b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') => {
60                    self.pos += 1;
61                }
62                Some(b'%') => {
63                    self.pos += 1;
64                    while let Some(b) = self.peek() {
65                        self.pos += 1;
66                        if b == b'\r' || b == b'\n' {
67                            break;
68                        }
69                    }
70                }
71                _ => break,
72            }
73        }
74    }
75
76    pub fn next_token(&mut self) -> Result<PdfObject> {
77        self.skip_whitespace_and_comments();
78
79        if self.is_eof() {
80            return Err(Error::UnexpectedEof(self.pos as u64));
81        }
82
83        match self.peek().unwrap() {
84            b'/' => self.read_name(),
85            b'(' => self.read_literal_string(),
86            b'<' => {
87                if self.data.get(self.pos + 1) == Some(&b'<') {
88                    self.read_dict()
89                } else {
90                    self.read_hex_string()
91                }
92            }
93            b'[' => self.read_array(),
94            b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
95            b't' | b'f' => self.read_bool_or_keyword(),
96            b'n' => self.read_null_or_keyword(),
97            _ => Err(Error::InvalidObject(
98                self.pos as u64,
99                format!("unexpected byte: 0x{:02x}", self.peek().unwrap()),
100            )),
101        }
102    }
103
104    fn read_name(&mut self) -> Result<PdfObject> {
105        self.advance(); // skip '/'
106        let start = self.pos;
107        while let Some(b) = self.peek() {
108            if is_delimiter(b) || is_whitespace(b) {
109                break;
110            }
111            self.pos += 1;
112        }
113        let raw = &self.data[start..self.pos];
114        let name = decode_name(raw);
115        Ok(PdfObject::Name(PdfName::new(name)))
116    }
117
118    fn read_literal_string(&mut self) -> Result<PdfObject> {
119        self.advance(); // skip '('
120        let mut buf = Vec::new();
121        let mut depth = 1u32;
122        let max = self.limits.max_string_length as usize;
123
124        while let Some(b) = self.advance() {
125            match b {
126                b'(' => {
127                    depth += 1;
128                    buf.push(b'(');
129                }
130                b')' => {
131                    depth -= 1;
132                    if depth == 0 {
133                        break;
134                    }
135                    buf.push(b')');
136                }
137                b'\\' => {
138                    if let Some(esc) = self.advance() {
139                        match esc {
140                            b'n' => buf.push(b'\n'),
141                            b'r' => buf.push(b'\r'),
142                            b't' => buf.push(b'\t'),
143                            b'b' => buf.push(0x08),
144                            b'f' => buf.push(0x0c),
145                            b'(' => buf.push(b'('),
146                            b')' => buf.push(b')'),
147                            b'\\' => buf.push(b'\\'),
148                            b'0'..=b'7' => {
149                                let mut octal = (esc - b'0') as u16;
150                                for _ in 0..2 {
151                                    match self.peek() {
152                                        Some(c @ b'0'..=b'7') => {
153                                            octal = octal * 8 + (c - b'0') as u16;
154                                            self.pos += 1;
155                                        }
156                                        _ => break,
157                                    }
158                                }
159                                buf.push(octal as u8);
160                            }
161                            b'\r' => {
162                                if self.peek() == Some(b'\n') {
163                                    self.pos += 1;
164                                }
165                            }
166                            b'\n' => {}
167                            _ => buf.push(esc),
168                        }
169                    }
170                }
171                _ => buf.push(b),
172            }
173            // Each iteration pushes at most one byte, so a single post-match
174            // check is sufficient to bound total string growth.
175            if buf.len() > max {
176                return Err(Error::StringLengthLimit(self.limits.max_string_length));
177            }
178        }
179
180        Ok(PdfObject::String(PdfString::new(buf)))
181    }
182
183    fn read_hex_string(&mut self) -> Result<PdfObject> {
184        self.advance(); // skip '<'
185        let mut buf = Vec::new();
186        let mut high: Option<u8> = None;
187        let max = self.limits.max_string_length as usize;
188
189        loop {
190            match self.advance() {
191                Some(b'>') => break,
192                Some(b) if is_whitespace(b) => continue,
193                Some(b) => {
194                    let nibble = hex_digit(b).ok_or_else(|| {
195                        Error::InvalidObject(self.pos as u64 - 1, "invalid hex digit".into())
196                    })?;
197                    match high {
198                        None => high = Some(nibble),
199                        Some(h) => {
200                            buf.push((h << 4) | nibble);
201                            high = None;
202                            if buf.len() > max {
203                                return Err(Error::StringLengthLimit(
204                                    self.limits.max_string_length,
205                                ));
206                            }
207                        }
208                    }
209                }
210                None => return Err(Error::UnexpectedEof(self.pos as u64)),
211            }
212        }
213
214        if let Some(h) = high {
215            buf.push(h << 4);
216        }
217
218        Ok(PdfObject::String(PdfString::new(buf)))
219    }
220
221    fn read_number(&mut self) -> Result<PdfObject> {
222        let start = self.pos;
223        let mut has_dot = false;
224
225        if matches!(self.peek(), Some(b'+' | b'-')) {
226            self.pos += 1;
227        }
228
229        while let Some(b) = self.peek() {
230            match b {
231                b'0'..=b'9' => self.pos += 1,
232                b'.' if !has_dot => {
233                    has_dot = true;
234                    self.pos += 1;
235                }
236                _ => break,
237            }
238        }
239
240        let s = std::str::from_utf8(&self.data[start..self.pos])
241            .map_err(|_| Error::InvalidObject(start as u64, "invalid number".into()))?;
242
243        if has_dot {
244            let n: f64 = s
245                .parse()
246                .map_err(|_| Error::InvalidObject(start as u64, format!("bad real: {s}")))?;
247            Ok(PdfObject::Real(n))
248        } else {
249            let n: i64 = s
250                .parse()
251                .map_err(|_| Error::InvalidObject(start as u64, format!("bad integer: {s}")))?;
252            Ok(PdfObject::Integer(n))
253        }
254    }
255
256    fn read_bool_or_keyword(&mut self) -> Result<PdfObject> {
257        let start = self.pos;
258        while let Some(b) = self.peek() {
259            if is_delimiter(b) || is_whitespace(b) {
260                break;
261            }
262            self.pos += 1;
263        }
264        let word = &self.data[start..self.pos];
265        match word {
266            b"true" => Ok(PdfObject::Bool(true)),
267            b"false" => Ok(PdfObject::Bool(false)),
268            _ => Err(Error::InvalidObject(
269                start as u64,
270                format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
271            )),
272        }
273    }
274
275    fn read_null_or_keyword(&mut self) -> Result<PdfObject> {
276        let start = self.pos;
277        while let Some(b) = self.peek() {
278            if is_delimiter(b) || is_whitespace(b) {
279                break;
280            }
281            self.pos += 1;
282        }
283        let word = &self.data[start..self.pos];
284        match word {
285            b"null" => Ok(PdfObject::Null),
286            _ => Err(Error::InvalidObject(
287                start as u64,
288                format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
289            )),
290        }
291    }
292
293    fn read_array(&mut self) -> Result<PdfObject> {
294        self.enter_container()?;
295        self.advance(); // skip '['
296        let mut items = Vec::new();
297        loop {
298            self.skip_whitespace_and_comments();
299            if self.peek() == Some(b']') {
300                self.pos += 1;
301                break;
302            }
303            if self.is_eof() {
304                return Err(Error::UnexpectedEof(self.pos as u64));
305            }
306            let obj = self.next_token()?;
307            items.push(self.maybe_resolve_ref(obj)?);
308        }
309        self.leave_container();
310        Ok(PdfObject::Array(items))
311    }
312
313    fn read_dict(&mut self) -> Result<PdfObject> {
314        self.enter_container()?;
315        self.pos += 2; // skip '<<'
316        let mut dict = zpdf_core::PdfDict::new();
317        // Bound on malformed tokens skipped before we give up on this dict, so a
318        // pathological body can't make us churn. Well-formed dicts never trip it.
319        let mut bad = 0u32;
320        const MAX_BAD_TOKENS: u32 = 64;
321        loop {
322            self.skip_whitespace_and_comments();
323            if self.data.get(self.pos..self.pos + 2) == Some(b">>") {
324                self.pos += 2;
325                break;
326            }
327            if self.is_eof() {
328                // Tolerate a dict whose closing `>>` was truncated or overwritten
329                // (e.g. by the next `N 0 obj` header): return what parsed so far
330                // rather than failing the whole — often critical-path — object.
331                break;
332            }
333            // Read the key leniently: damaged files routinely corrupt one
334            // key/value while the rest of the dict is intact. A non-Name key or
335            // an untokenizable byte is skipped, not fatal — but a resource-limit
336            // error (depth/recursion) must still propagate so the guards hold.
337            let key = match self.next_token() {
338                Ok(PdfObject::Name(n)) => n,
339                Err(e @ Error::RecursionLimit(_)) => return Err(e),
340                Ok(_non_name) => {
341                    // next_token already advanced past the stray token.
342                    bad += 1;
343                    if bad > MAX_BAD_TOKENS {
344                        break;
345                    }
346                    continue;
347                }
348                Err(_) => {
349                    bad += 1;
350                    if bad > MAX_BAD_TOKENS {
351                        break;
352                    }
353                    self.pos += 1; // guarantee forward progress past the bad byte
354                    continue;
355                }
356            };
357            // A missing or garbled value ends the dict (best effort) instead of
358            // aborting the object; a recursion-limit error still propagates.
359            let value = match self.next_token() {
360                Ok(v) => v,
361                Err(e @ Error::RecursionLimit(_)) => return Err(e),
362                Err(_) => break,
363            };
364            let value = match self.maybe_resolve_ref(value) {
365                Ok(v) => v,
366                Err(e @ Error::RecursionLimit(_)) => return Err(e),
367                Err(_) => break,
368            };
369            dict.insert(key, value);
370        }
371        self.leave_container();
372        Ok(PdfObject::Dict(dict))
373    }
374
375    pub(crate) fn maybe_resolve_ref(&mut self, obj: PdfObject) -> Result<PdfObject> {
376        if let PdfObject::Integer(num) = obj {
377            let saved = self.pos;
378            self.skip_whitespace_and_comments();
379            if let Ok(PdfObject::Integer(gen)) = self.read_number_if_available() {
380                self.skip_whitespace_and_comments();
381                if self.peek() == Some(b'R') {
382                    self.pos += 1;
383                    return Ok(PdfObject::Ref(ObjectId(num as u32, gen as u16)));
384                }
385            }
386            self.pos = saved;
387            Ok(PdfObject::Integer(num))
388        } else {
389            Ok(obj)
390        }
391    }
392
393    fn read_number_if_available(&mut self) -> Result<PdfObject> {
394        if matches!(self.peek(), Some(b'0'..=b'9' | b'+' | b'-' | b'.')) {
395            self.read_number()
396        } else {
397            Err(Error::InvalidObject(self.pos as u64, "not a number".into()))
398        }
399    }
400}
401
/// Reports whether `b` is one of the six bytes this lexer treats as
/// whitespace: NUL, tab, line feed, form feed, carriage return, or space.
fn is_whitespace(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&b)
}
405
/// Reports whether `b` is one of the ten delimiter bytes that end a name,
/// number, or keyword: `( ) < > [ ] { } / %`.
fn is_delimiter(b: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&b)
}
412
/// Returns the value (0–15) of the ASCII hexadecimal digit `b`, accepting
/// both upper and lower case, or `None` if `b` is not a hex digit.
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` recognises exactly `0-9`, `a-f` and `A-F`; the result is
    // always below 16, so narrowing to `u8` cannot lose information.
    char::from(b).to_digit(16).map(|value| value as u8)
}
421
422fn decode_name(raw: &[u8]) -> String {
423    let mut result = Vec::with_capacity(raw.len());
424    let mut i = 0;
425    while i < raw.len() {
426        if raw[i] == b'#' && i + 2 < raw.len() {
427            if let (Some(h), Some(l)) = (hex_digit(raw[i + 1]), hex_digit(raw[i + 2])) {
428                result.push((h << 4) | l);
429                i += 3;
430                continue;
431            }
432        }
433        result.push(raw[i]);
434        i += 1;
435    }
436    String::from_utf8_lossy(&result).into_owned()
437}
438
#[cfg(test)]
mod tests {
    use super::*;

    /// Default limits, used by every test that does not tighten a bound.
    fn lim() -> ParseLimits {
        ParseLimits::default()
    }

    /// Lexes the first token of `input` under `limits`.
    fn lex_first(input: &[u8], limits: &ParseLimits) -> Result<PdfObject> {
        Lexer::new(input, 0, limits).next_token()
    }

    /// Lexes the first token of `input` under default limits, panicking on error.
    fn lex_ok(input: &[u8]) -> PdfObject {
        lex_first(input, &lim()).unwrap()
    }

    /// Builds the string object the lexer is expected to produce for `bytes`.
    fn string_obj(bytes: &[u8]) -> PdfObject {
        PdfObject::String(PdfString::new(bytes.to_vec()))
    }

    #[test]
    fn lex_name() {
        assert_eq!(lex_ok(b"/Type"), PdfObject::Name(PdfName::new("Type")));
    }

    #[test]
    fn lex_name_with_hex_escape() {
        assert_eq!(lex_ok(b"/A#20B"), PdfObject::Name(PdfName::new("A B")));
    }

    #[test]
    fn lex_integer() {
        assert_eq!(lex_ok(b"42 "), PdfObject::Integer(42));
    }

    #[test]
    fn lex_negative_real() {
        match lex_ok(b"-3.5 ") {
            PdfObject::Real(n) => assert!((n - (-3.5)).abs() < 1e-10),
            other => panic!("expected Real, got {other:?}"),
        }
    }

    #[test]
    fn lex_literal_string() {
        assert_eq!(lex_ok(b"(hello world)"), string_obj(b"hello world"));
    }

    #[test]
    fn lex_literal_string_nested_parens() {
        assert_eq!(lex_ok(b"(a (b) c)"), string_obj(b"a (b) c"));
    }

    #[test]
    fn lex_hex_string() {
        assert_eq!(lex_ok(b"<48656C6C6F>"), string_obj(b"Hello"));
    }

    #[test]
    fn lex_array() {
        let expected: Vec<PdfObject> = (1..=3).map(PdfObject::Integer).collect();
        assert_eq!(lex_ok(b"[1 2 3]"), PdfObject::Array(expected));
    }

    #[test]
    fn lex_dict() {
        match lex_ok(b"<< /Type /Page /Count 5 >>") {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name("Type").unwrap(), "Page");
                assert_eq!(d.get_i64("Count").unwrap(), 5);
            }
            other => panic!("expected Dict, got {other:?}"),
        }
    }

    #[test]
    fn lex_bool_and_null() {
        assert_eq!(lex_ok(b"true"), PdfObject::Bool(true));
        assert_eq!(lex_ok(b"false"), PdfObject::Bool(false));
        assert_eq!(lex_ok(b"null"), PdfObject::Null);
    }

    #[test]
    fn lex_indirect_ref_in_array() {
        assert_eq!(
            lex_ok(b"[12 0 R]"),
            PdfObject::Array(vec![PdfObject::Ref(ObjectId(12, 0))])
        );
    }

    #[test]
    fn skip_comments() {
        assert_eq!(lex_ok(b"% comment\n42 "), PdfObject::Integer(42));
    }

    #[test]
    fn reject_deeply_nested_array() {
        let mut limits = lim();
        limits.max_object_depth = 10;
        // 50 opening brackets followed by 50 closing ones.
        let nesting = 50usize;
        let data = [vec![b'['; nesting], vec![b']'; nesting]].concat();
        let err = lex_first(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(10)), "got {err:?}");
    }

    #[test]
    fn reject_deeply_nested_dict() {
        let mut limits = lim();
        limits.max_object_depth = 5;
        // 20 dictionaries, each holding the next one under the key `/a`.
        let nesting = 20usize;
        let data = format!("{}1{}", "<< /a ".repeat(nesting), " >>".repeat(nesting)).into_bytes();
        let err = lex_first(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(5)), "got {err:?}");
    }

    #[test]
    fn nested_within_limit_ok() {
        // Default depth limit is 100; this input nests five arrays.
        assert!(lex_first(b"[[[[[1]]]]]", &lim()).is_ok());
    }

    #[test]
    fn reject_oversized_literal_string() {
        let mut limits = lim();
        limits.max_string_length = 8;
        let data = format!("({})", "a".repeat(100)).into_bytes();
        let err = lex_first(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(8)), "got {err:?}");
    }

    #[test]
    fn reject_oversized_hex_string() {
        let mut limits = lim();
        limits.max_string_length = 4;
        // 20 hex digits decode to 10 bytes, which is more than the limit of 4.
        let data = format!("<{}>", "4".repeat(20)).into_bytes();
        let err = lex_first(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(4)), "got {err:?}");
    }

    #[test]
    fn small_string_within_limit_ok() {
        // Default string limit is 65536 bytes.
        assert_eq!(lex_ok(b"(hello)"), string_obj(b"hello"));
    }
}