// zpdf_parser/lexer.rs

use std::convert::TryFrom;

use zpdf_core::{Error, ObjectId, ParseLimits, PdfName, PdfObject, PdfString, Result};
2
3pub struct Lexer<'a> {
4    data: &'a [u8],
5    pos: usize,
6    limits: &'a ParseLimits,
7    depth: u32,
8}
9
10impl<'a> Lexer<'a> {
11    pub fn new(data: &'a [u8], pos: usize, limits: &'a ParseLimits) -> Self {
12        Self {
13            data,
14            pos,
15            limits,
16            depth: 0,
17        }
18    }
19
20    /// Increment container-nesting depth, erroring if it exceeds the limit.
21    /// Call once on entry to `read_array`/`read_dict`.
22    fn enter_container(&mut self) -> Result<()> {
23        self.depth += 1;
24        if self.depth > self.limits.max_object_depth {
25            return Err(Error::RecursionLimit(self.limits.max_object_depth));
26        }
27        Ok(())
28    }
29
30    fn leave_container(&mut self) {
31        self.depth = self.depth.saturating_sub(1);
32    }
33
34    pub fn pos(&self) -> usize {
35        self.pos
36    }
37
38    pub fn set_pos(&mut self, pos: usize) {
39        self.pos = pos;
40    }
41
42    pub fn is_eof(&self) -> bool {
43        self.pos >= self.data.len()
44    }
45
46    fn peek(&self) -> Option<u8> {
47        self.data.get(self.pos).copied()
48    }
49
50    fn advance(&mut self) -> Option<u8> {
51        let b = self.data.get(self.pos).copied()?;
52        self.pos += 1;
53        Some(b)
54    }
55
56    pub fn skip_whitespace_and_comments(&mut self) {
57        loop {
58            match self.peek() {
59                Some(b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') => {
60                    self.pos += 1;
61                }
62                Some(b'%') => {
63                    self.pos += 1;
64                    while let Some(b) = self.peek() {
65                        self.pos += 1;
66                        if b == b'\r' || b == b'\n' {
67                            break;
68                        }
69                    }
70                }
71                _ => break,
72            }
73        }
74    }
75
76    pub fn next_token(&mut self) -> Result<PdfObject> {
77        self.skip_whitespace_and_comments();
78
79        if self.is_eof() {
80            return Err(Error::UnexpectedEof(self.pos as u64));
81        }
82
83        match self.peek().unwrap() {
84            b'/' => self.read_name(),
85            b'(' => self.read_literal_string(),
86            b'<' => {
87                if self.data.get(self.pos + 1) == Some(&b'<') {
88                    self.read_dict()
89                } else {
90                    self.read_hex_string()
91                }
92            }
93            b'[' => self.read_array(),
94            b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
95            b't' | b'f' => self.read_bool_or_keyword(),
96            b'n' => self.read_null_or_keyword(),
97            _ => Err(Error::InvalidObject(
98                self.pos as u64,
99                format!("unexpected byte: 0x{:02x}", self.peek().unwrap()),
100            )),
101        }
102    }
103
104    fn read_name(&mut self) -> Result<PdfObject> {
105        self.advance(); // skip '/'
106        let start = self.pos;
107        while let Some(b) = self.peek() {
108            if is_delimiter(b) || is_whitespace(b) {
109                break;
110            }
111            self.pos += 1;
112        }
113        let raw = &self.data[start..self.pos];
114        let name = decode_name(raw);
115        Ok(PdfObject::Name(PdfName::new(name)))
116    }
117
118    fn read_literal_string(&mut self) -> Result<PdfObject> {
119        self.advance(); // skip '('
120        let mut buf = Vec::new();
121        let mut depth = 1u32;
122        let max = self.limits.max_string_length as usize;
123
124        while let Some(b) = self.advance() {
125            match b {
126                b'(' => {
127                    depth += 1;
128                    buf.push(b'(');
129                }
130                b')' => {
131                    depth -= 1;
132                    if depth == 0 {
133                        break;
134                    }
135                    buf.push(b')');
136                }
137                b'\\' => {
138                    if let Some(esc) = self.advance() {
139                        match esc {
140                            b'n' => buf.push(b'\n'),
141                            b'r' => buf.push(b'\r'),
142                            b't' => buf.push(b'\t'),
143                            b'b' => buf.push(0x08),
144                            b'f' => buf.push(0x0c),
145                            b'(' => buf.push(b'('),
146                            b')' => buf.push(b')'),
147                            b'\\' => buf.push(b'\\'),
148                            b'0'..=b'7' => {
149                                let mut octal = (esc - b'0') as u16;
150                                for _ in 0..2 {
151                                    match self.peek() {
152                                        Some(c @ b'0'..=b'7') => {
153                                            octal = octal * 8 + (c - b'0') as u16;
154                                            self.pos += 1;
155                                        }
156                                        _ => break,
157                                    }
158                                }
159                                buf.push(octal as u8);
160                            }
161                            b'\r' => {
162                                if self.peek() == Some(b'\n') {
163                                    self.pos += 1;
164                                }
165                            }
166                            b'\n' => {}
167                            _ => buf.push(esc),
168                        }
169                    }
170                }
171                _ => buf.push(b),
172            }
173            // Each iteration pushes at most one byte, so a single post-match
174            // check is sufficient to bound total string growth.
175            if buf.len() > max {
176                return Err(Error::StringLengthLimit(self.limits.max_string_length));
177            }
178        }
179
180        Ok(PdfObject::String(PdfString::new(buf)))
181    }
182
183    fn read_hex_string(&mut self) -> Result<PdfObject> {
184        self.advance(); // skip '<'
185        let mut buf = Vec::new();
186        let mut high: Option<u8> = None;
187        let max = self.limits.max_string_length as usize;
188
189        loop {
190            match self.advance() {
191                Some(b'>') => break,
192                Some(b) if is_whitespace(b) => continue,
193                Some(b) => {
194                    let nibble = hex_digit(b).ok_or_else(|| {
195                        Error::InvalidObject(self.pos as u64 - 1, "invalid hex digit".into())
196                    })?;
197                    match high {
198                        None => high = Some(nibble),
199                        Some(h) => {
200                            buf.push((h << 4) | nibble);
201                            high = None;
202                            if buf.len() > max {
203                                return Err(Error::StringLengthLimit(
204                                    self.limits.max_string_length,
205                                ));
206                            }
207                        }
208                    }
209                }
210                None => return Err(Error::UnexpectedEof(self.pos as u64)),
211            }
212        }
213
214        if let Some(h) = high {
215            buf.push(h << 4);
216        }
217
218        Ok(PdfObject::String(PdfString::new(buf)))
219    }
220
221    fn read_number(&mut self) -> Result<PdfObject> {
222        let start = self.pos;
223        let mut has_dot = false;
224
225        if matches!(self.peek(), Some(b'+' | b'-')) {
226            self.pos += 1;
227        }
228
229        while let Some(b) = self.peek() {
230            match b {
231                b'0'..=b'9' => self.pos += 1,
232                b'.' if !has_dot => {
233                    has_dot = true;
234                    self.pos += 1;
235                }
236                _ => break,
237            }
238        }
239
240        let s = std::str::from_utf8(&self.data[start..self.pos])
241            .map_err(|_| Error::InvalidObject(start as u64, "invalid number".into()))?;
242
243        if has_dot {
244            let n: f64 = s
245                .parse()
246                .map_err(|_| Error::InvalidObject(start as u64, format!("bad real: {s}")))?;
247            Ok(PdfObject::Real(n))
248        } else {
249            let n: i64 = s
250                .parse()
251                .map_err(|_| Error::InvalidObject(start as u64, format!("bad integer: {s}")))?;
252            Ok(PdfObject::Integer(n))
253        }
254    }
255
256    fn read_bool_or_keyword(&mut self) -> Result<PdfObject> {
257        let start = self.pos;
258        while let Some(b) = self.peek() {
259            if is_delimiter(b) || is_whitespace(b) {
260                break;
261            }
262            self.pos += 1;
263        }
264        let word = &self.data[start..self.pos];
265        match word {
266            b"true" => Ok(PdfObject::Bool(true)),
267            b"false" => Ok(PdfObject::Bool(false)),
268            _ => Err(Error::InvalidObject(
269                start as u64,
270                format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
271            )),
272        }
273    }
274
275    fn read_null_or_keyword(&mut self) -> Result<PdfObject> {
276        let start = self.pos;
277        while let Some(b) = self.peek() {
278            if is_delimiter(b) || is_whitespace(b) {
279                break;
280            }
281            self.pos += 1;
282        }
283        let word = &self.data[start..self.pos];
284        match word {
285            b"null" => Ok(PdfObject::Null),
286            _ => Err(Error::InvalidObject(
287                start as u64,
288                format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
289            )),
290        }
291    }
292
293    fn read_array(&mut self) -> Result<PdfObject> {
294        self.enter_container()?;
295        self.advance(); // skip '['
296        let mut items = Vec::new();
297        loop {
298            self.skip_whitespace_and_comments();
299            if self.peek() == Some(b']') {
300                self.pos += 1;
301                break;
302            }
303            if self.is_eof() {
304                return Err(Error::UnexpectedEof(self.pos as u64));
305            }
306            let obj = self.next_token()?;
307            items.push(self.maybe_resolve_ref(obj)?);
308        }
309        self.leave_container();
310        Ok(PdfObject::Array(items))
311    }
312
313    fn read_dict(&mut self) -> Result<PdfObject> {
314        self.enter_container()?;
315        self.pos += 2; // skip '<<'
316        let mut dict = zpdf_core::PdfDict::new();
317        loop {
318            self.skip_whitespace_and_comments();
319            if self.data.get(self.pos..self.pos + 2) == Some(b">>") {
320                self.pos += 2;
321                break;
322            }
323            if self.is_eof() {
324                return Err(Error::UnexpectedEof(self.pos as u64));
325            }
326            let key = match self.next_token()? {
327                PdfObject::Name(n) => n,
328                other => {
329                    return Err(Error::InvalidObject(
330                        self.pos as u64,
331                        format!("dict key must be Name, got {}", other.type_name()),
332                    ));
333                }
334            };
335            let value = self.next_token()?;
336            let value = self.maybe_resolve_ref(value)?;
337            dict.insert(key, value);
338        }
339        self.leave_container();
340        Ok(PdfObject::Dict(dict))
341    }
342
343    pub(crate) fn maybe_resolve_ref(&mut self, obj: PdfObject) -> Result<PdfObject> {
344        if let PdfObject::Integer(num) = obj {
345            let saved = self.pos;
346            self.skip_whitespace_and_comments();
347            if let Ok(PdfObject::Integer(gen)) = self.read_number_if_available() {
348                self.skip_whitespace_and_comments();
349                if self.peek() == Some(b'R') {
350                    self.pos += 1;
351                    return Ok(PdfObject::Ref(ObjectId(num as u32, gen as u16)));
352                }
353            }
354            self.pos = saved;
355            Ok(PdfObject::Integer(num))
356        } else {
357            Ok(obj)
358        }
359    }
360
361    fn read_number_if_available(&mut self) -> Result<PdfObject> {
362        if matches!(self.peek(), Some(b'0'..=b'9' | b'+' | b'-' | b'.')) {
363            self.read_number()
364        } else {
365            Err(Error::InvalidObject(self.pos as u64, "not a number".into()))
366        }
367    }
368}
369
/// Returns `true` for the six bytes the lexer treats as whitespace:
/// NUL, tab, line feed, form feed, carriage return and space.
fn is_whitespace(b: u8) -> bool {
    matches!(b, b'\x00' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' ')
}
373
/// Returns `true` for bytes that end a name, number or keyword:
/// `( ) < > [ ] { } / %`.
fn is_delimiter(b: u8) -> bool {
    matches!(
        b,
        b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
    )
}
380
/// Returns the value (0–15) of an ASCII hexadecimal digit in either case,
/// or `None` when `b` is not a hex digit.
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` only accepts ASCII 0-9, a-f and A-F, and its result is
    // at most 15, so narrowing to `u8` cannot lose information.
    char::from(b).to_digit(16).map(|value| value as u8)
}
389
390fn decode_name(raw: &[u8]) -> String {
391    let mut result = Vec::with_capacity(raw.len());
392    let mut i = 0;
393    while i < raw.len() {
394        if raw[i] == b'#' && i + 2 < raw.len() {
395            if let (Some(h), Some(l)) = (hex_digit(raw[i + 1]), hex_digit(raw[i + 2])) {
396                result.push((h << 4) | l);
397                i += 3;
398                continue;
399            }
400        }
401        result.push(raw[i]);
402        i += 1;
403    }
404    String::from_utf8_lossy(&result).into_owned()
405}
406
#[cfg(test)]
mod tests {
    use super::*;

    /// Default limits used by most tests.
    fn lim() -> ParseLimits {
        ParseLimits::default()
    }

    /// Lexes the first object of `data` under `limits`.
    fn first_token(data: &[u8], limits: &ParseLimits) -> Result<PdfObject> {
        Lexer::new(data, 0, limits).next_token()
    }

    /// Lexes the first object of `data` under the default limits, panicking
    /// on failure.
    fn lex_ok(data: &[u8]) -> PdfObject {
        first_token(data, &lim()).unwrap()
    }

    #[test]
    fn lex_name() {
        assert_eq!(lex_ok(b"/Type"), PdfObject::Name(PdfName::new("Type")));
    }

    #[test]
    fn lex_name_with_hex_escape() {
        assert_eq!(lex_ok(b"/A#20B"), PdfObject::Name(PdfName::new("A B")));
    }

    #[test]
    fn lex_integer() {
        assert_eq!(lex_ok(b"42 "), PdfObject::Integer(42));
    }

    #[test]
    fn lex_negative_real() {
        match lex_ok(b"-3.5 ") {
            PdfObject::Real(n) => assert!((n - (-3.5)).abs() < 1e-10),
            other => panic!("expected Real, got {other:?}"),
        }
    }

    #[test]
    fn lex_literal_string() {
        assert_eq!(
            lex_ok(b"(hello world)"),
            PdfObject::String(PdfString::new(b"hello world".to_vec()))
        );
    }

    #[test]
    fn lex_literal_string_nested_parens() {
        assert_eq!(
            lex_ok(b"(a (b) c)"),
            PdfObject::String(PdfString::new(b"a (b) c".to_vec()))
        );
    }

    #[test]
    fn lex_hex_string() {
        assert_eq!(
            lex_ok(b"<48656C6C6F>"),
            PdfObject::String(PdfString::new(b"Hello".to_vec()))
        );
    }

    #[test]
    fn lex_array() {
        assert_eq!(
            lex_ok(b"[1 2 3]"),
            PdfObject::Array(vec![
                PdfObject::Integer(1),
                PdfObject::Integer(2),
                PdfObject::Integer(3),
            ])
        );
    }

    #[test]
    fn lex_dict() {
        match lex_ok(b"<< /Type /Page /Count 5 >>") {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name("Type").unwrap(), "Page");
                assert_eq!(d.get_i64("Count").unwrap(), 5);
            }
            other => panic!("expected Dict, got {other:?}"),
        }
    }

    #[test]
    fn lex_bool_and_null() {
        assert_eq!(lex_ok(b"true"), PdfObject::Bool(true));
        assert_eq!(lex_ok(b"false"), PdfObject::Bool(false));
        assert_eq!(lex_ok(b"null"), PdfObject::Null);
    }

    #[test]
    fn lex_indirect_ref_in_array() {
        assert_eq!(
            lex_ok(b"[12 0 R]"),
            PdfObject::Array(vec![PdfObject::Ref(ObjectId(12, 0))])
        );
    }

    #[test]
    fn skip_comments() {
        assert_eq!(lex_ok(b"% comment\n42 "), PdfObject::Integer(42));
    }

    #[test]
    fn reject_deeply_nested_array() {
        let mut l = lim();
        l.max_object_depth = 10;
        let depth = 50usize;
        // 50 opening brackets followed by 50 closing brackets.
        let mut data = vec![b'['; depth];
        data.extend(std::iter::repeat_n(b']', depth));
        let err = first_token(&data, &l).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(10)), "got {err:?}");
    }

    #[test]
    fn reject_deeply_nested_dict() {
        let mut l = lim();
        l.max_object_depth = 5;
        let n = 20usize;
        // 20 dictionaries nested through the `/a` key around the integer 1.
        let data = format!("{}1{}", "<< /a ".repeat(n), " >>".repeat(n)).into_bytes();
        let err = first_token(&data, &l).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(5)), "got {err:?}");
    }

    #[test]
    fn nested_within_limit_ok() {
        // Five levels of nesting under the default limits.
        assert!(first_token(b"[[[[[1]]]]]", &lim()).is_ok());
    }

    #[test]
    fn reject_oversized_literal_string() {
        let mut l = lim();
        l.max_string_length = 8;
        let mut data = vec![b'('];
        data.extend(std::iter::repeat_n(b'a', 100));
        data.push(b')');
        let err = first_token(&data, &l).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(8)), "got {err:?}");
    }

    #[test]
    fn reject_oversized_hex_string() {
        let mut l = lim();
        l.max_string_length = 4;
        // 20 hex digits => 10 raw bytes > 4
        let mut data = vec![b'<'];
        data.extend(std::iter::repeat_n(b'4', 20));
        data.push(b'>');
        let err = first_token(&data, &l).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(4)), "got {err:?}");
    }

    #[test]
    fn small_string_within_limit_ok() {
        assert_eq!(
            lex_ok(b"(hello)"),
            PdfObject::String(PdfString::new(b"hello".to_vec()))
        );
    }
}