1use zpdf_core::{Error, ObjectId, ParseLimits, PdfName, PdfObject, PdfString, Result};
2
3pub struct Lexer<'a> {
4 data: &'a [u8],
5 pos: usize,
6 limits: &'a ParseLimits,
7 depth: u32,
8}
9
10impl<'a> Lexer<'a> {
11 pub fn new(data: &'a [u8], pos: usize, limits: &'a ParseLimits) -> Self {
12 Self {
13 data,
14 pos,
15 limits,
16 depth: 0,
17 }
18 }
19
20 fn enter_container(&mut self) -> Result<()> {
23 self.depth += 1;
24 if self.depth > self.limits.max_object_depth {
25 return Err(Error::RecursionLimit(self.limits.max_object_depth));
26 }
27 Ok(())
28 }
29
30 fn leave_container(&mut self) {
31 self.depth = self.depth.saturating_sub(1);
32 }
33
34 pub fn pos(&self) -> usize {
35 self.pos
36 }
37
38 pub fn set_pos(&mut self, pos: usize) {
39 self.pos = pos;
40 }
41
42 pub fn is_eof(&self) -> bool {
43 self.pos >= self.data.len()
44 }
45
46 fn peek(&self) -> Option<u8> {
47 self.data.get(self.pos).copied()
48 }
49
50 fn advance(&mut self) -> Option<u8> {
51 let b = self.data.get(self.pos).copied()?;
52 self.pos += 1;
53 Some(b)
54 }
55
56 pub fn skip_whitespace_and_comments(&mut self) {
57 loop {
58 match self.peek() {
59 Some(b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') => {
60 self.pos += 1;
61 }
62 Some(b'%') => {
63 self.pos += 1;
64 while let Some(b) = self.peek() {
65 self.pos += 1;
66 if b == b'\r' || b == b'\n' {
67 break;
68 }
69 }
70 }
71 _ => break,
72 }
73 }
74 }
75
76 pub fn next_token(&mut self) -> Result<PdfObject> {
77 self.skip_whitespace_and_comments();
78
79 if self.is_eof() {
80 return Err(Error::UnexpectedEof(self.pos as u64));
81 }
82
83 match self.peek().unwrap() {
84 b'/' => self.read_name(),
85 b'(' => self.read_literal_string(),
86 b'<' => {
87 if self.data.get(self.pos + 1) == Some(&b'<') {
88 self.read_dict()
89 } else {
90 self.read_hex_string()
91 }
92 }
93 b'[' => self.read_array(),
94 b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
95 b't' | b'f' => self.read_bool_or_keyword(),
96 b'n' => self.read_null_or_keyword(),
97 _ => Err(Error::InvalidObject(
98 self.pos as u64,
99 format!("unexpected byte: 0x{:02x}", self.peek().unwrap()),
100 )),
101 }
102 }
103
104 fn read_name(&mut self) -> Result<PdfObject> {
105 self.advance(); let start = self.pos;
107 while let Some(b) = self.peek() {
108 if is_delimiter(b) || is_whitespace(b) {
109 break;
110 }
111 self.pos += 1;
112 }
113 let raw = &self.data[start..self.pos];
114 let name = decode_name(raw);
115 Ok(PdfObject::Name(PdfName::new(name)))
116 }
117
118 fn read_literal_string(&mut self) -> Result<PdfObject> {
119 self.advance(); let mut buf = Vec::new();
121 let mut depth = 1u32;
122 let max = self.limits.max_string_length as usize;
123
124 while let Some(b) = self.advance() {
125 match b {
126 b'(' => {
127 depth += 1;
128 buf.push(b'(');
129 }
130 b')' => {
131 depth -= 1;
132 if depth == 0 {
133 break;
134 }
135 buf.push(b')');
136 }
137 b'\\' => {
138 if let Some(esc) = self.advance() {
139 match esc {
140 b'n' => buf.push(b'\n'),
141 b'r' => buf.push(b'\r'),
142 b't' => buf.push(b'\t'),
143 b'b' => buf.push(0x08),
144 b'f' => buf.push(0x0c),
145 b'(' => buf.push(b'('),
146 b')' => buf.push(b')'),
147 b'\\' => buf.push(b'\\'),
148 b'0'..=b'7' => {
149 let mut octal = (esc - b'0') as u16;
150 for _ in 0..2 {
151 match self.peek() {
152 Some(c @ b'0'..=b'7') => {
153 octal = octal * 8 + (c - b'0') as u16;
154 self.pos += 1;
155 }
156 _ => break,
157 }
158 }
159 buf.push(octal as u8);
160 }
161 b'\r' => {
162 if self.peek() == Some(b'\n') {
163 self.pos += 1;
164 }
165 }
166 b'\n' => {}
167 _ => buf.push(esc),
168 }
169 }
170 }
171 _ => buf.push(b),
172 }
173 if buf.len() > max {
176 return Err(Error::StringLengthLimit(self.limits.max_string_length));
177 }
178 }
179
180 Ok(PdfObject::String(PdfString::new(buf)))
181 }
182
183 fn read_hex_string(&mut self) -> Result<PdfObject> {
184 self.advance(); let mut buf = Vec::new();
186 let mut high: Option<u8> = None;
187 let max = self.limits.max_string_length as usize;
188
189 loop {
190 match self.advance() {
191 Some(b'>') => break,
192 Some(b) if is_whitespace(b) => continue,
193 Some(b) => {
194 let nibble = hex_digit(b).ok_or_else(|| {
195 Error::InvalidObject(self.pos as u64 - 1, "invalid hex digit".into())
196 })?;
197 match high {
198 None => high = Some(nibble),
199 Some(h) => {
200 buf.push((h << 4) | nibble);
201 high = None;
202 if buf.len() > max {
203 return Err(Error::StringLengthLimit(
204 self.limits.max_string_length,
205 ));
206 }
207 }
208 }
209 }
210 None => return Err(Error::UnexpectedEof(self.pos as u64)),
211 }
212 }
213
214 if let Some(h) = high {
215 buf.push(h << 4);
216 }
217
218 Ok(PdfObject::String(PdfString::new(buf)))
219 }
220
221 fn read_number(&mut self) -> Result<PdfObject> {
222 let start = self.pos;
223 let mut has_dot = false;
224
225 if matches!(self.peek(), Some(b'+' | b'-')) {
226 self.pos += 1;
227 }
228
229 while let Some(b) = self.peek() {
230 match b {
231 b'0'..=b'9' => self.pos += 1,
232 b'.' if !has_dot => {
233 has_dot = true;
234 self.pos += 1;
235 }
236 _ => break,
237 }
238 }
239
240 let s = std::str::from_utf8(&self.data[start..self.pos])
241 .map_err(|_| Error::InvalidObject(start as u64, "invalid number".into()))?;
242
243 if has_dot {
244 let n: f64 = s
245 .parse()
246 .map_err(|_| Error::InvalidObject(start as u64, format!("bad real: {s}")))?;
247 Ok(PdfObject::Real(n))
248 } else {
249 let n: i64 = s
250 .parse()
251 .map_err(|_| Error::InvalidObject(start as u64, format!("bad integer: {s}")))?;
252 Ok(PdfObject::Integer(n))
253 }
254 }
255
256 fn read_bool_or_keyword(&mut self) -> Result<PdfObject> {
257 let start = self.pos;
258 while let Some(b) = self.peek() {
259 if is_delimiter(b) || is_whitespace(b) {
260 break;
261 }
262 self.pos += 1;
263 }
264 let word = &self.data[start..self.pos];
265 match word {
266 b"true" => Ok(PdfObject::Bool(true)),
267 b"false" => Ok(PdfObject::Bool(false)),
268 _ => Err(Error::InvalidObject(
269 start as u64,
270 format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
271 )),
272 }
273 }
274
275 fn read_null_or_keyword(&mut self) -> Result<PdfObject> {
276 let start = self.pos;
277 while let Some(b) = self.peek() {
278 if is_delimiter(b) || is_whitespace(b) {
279 break;
280 }
281 self.pos += 1;
282 }
283 let word = &self.data[start..self.pos];
284 match word {
285 b"null" => Ok(PdfObject::Null),
286 _ => Err(Error::InvalidObject(
287 start as u64,
288 format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
289 )),
290 }
291 }
292
293 fn read_array(&mut self) -> Result<PdfObject> {
294 self.enter_container()?;
295 self.advance(); let mut items = Vec::new();
297 loop {
298 self.skip_whitespace_and_comments();
299 if self.peek() == Some(b']') {
300 self.pos += 1;
301 break;
302 }
303 if self.is_eof() {
304 return Err(Error::UnexpectedEof(self.pos as u64));
305 }
306 let obj = self.next_token()?;
307 items.push(self.maybe_resolve_ref(obj)?);
308 }
309 self.leave_container();
310 Ok(PdfObject::Array(items))
311 }
312
313 fn read_dict(&mut self) -> Result<PdfObject> {
314 self.enter_container()?;
315 self.pos += 2; let mut dict = zpdf_core::PdfDict::new();
317 let mut bad = 0u32;
320 const MAX_BAD_TOKENS: u32 = 64;
321 loop {
322 self.skip_whitespace_and_comments();
323 if self.data.get(self.pos..self.pos + 2) == Some(b">>") {
324 self.pos += 2;
325 break;
326 }
327 if self.is_eof() {
328 break;
332 }
333 let key = match self.next_token() {
338 Ok(PdfObject::Name(n)) => n,
339 Err(e @ Error::RecursionLimit(_)) => return Err(e),
340 Ok(_non_name) => {
341 bad += 1;
343 if bad > MAX_BAD_TOKENS {
344 break;
345 }
346 continue;
347 }
348 Err(_) => {
349 bad += 1;
350 if bad > MAX_BAD_TOKENS {
351 break;
352 }
353 self.pos += 1; continue;
355 }
356 };
357 let value = match self.next_token() {
360 Ok(v) => v,
361 Err(e @ Error::RecursionLimit(_)) => return Err(e),
362 Err(_) => break,
363 };
364 let value = match self.maybe_resolve_ref(value) {
365 Ok(v) => v,
366 Err(e @ Error::RecursionLimit(_)) => return Err(e),
367 Err(_) => break,
368 };
369 dict.insert(key, value);
370 }
371 self.leave_container();
372 Ok(PdfObject::Dict(dict))
373 }
374
375 pub(crate) fn maybe_resolve_ref(&mut self, obj: PdfObject) -> Result<PdfObject> {
376 if let PdfObject::Integer(num) = obj {
377 let saved = self.pos;
378 self.skip_whitespace_and_comments();
379 if let Ok(PdfObject::Integer(gen)) = self.read_number_if_available() {
380 self.skip_whitespace_and_comments();
381 if self.peek() == Some(b'R') {
382 self.pos += 1;
383 return Ok(PdfObject::Ref(ObjectId(num as u32, gen as u16)));
384 }
385 }
386 self.pos = saved;
387 Ok(PdfObject::Integer(num))
388 } else {
389 Ok(obj)
390 }
391 }
392
393 fn read_number_if_available(&mut self) -> Result<PdfObject> {
394 if matches!(self.peek(), Some(b'0'..=b'9' | b'+' | b'-' | b'.')) {
395 self.read_number()
396 } else {
397 Err(Error::InvalidObject(self.pos as u64, "not a number".into()))
398 }
399 }
400}
401
/// Returns `true` for the bytes this lexer treats as whitespace:
/// space, tab, carriage return, line feed, NUL and form feed.
fn is_whitespace(b: u8) -> bool {
    [b' ', b'\t', b'\r', b'\n', 0x00, 0x0c].contains(&b)
}
405
/// Returns `true` for the bytes that terminate a name or keyword:
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
fn is_delimiter(b: u8) -> bool {
    b"()<>[]{}/%".contains(&b)
}
412
/// Returns the value (0–15) of an ASCII hexadecimal digit, accepting both
/// upper- and lower-case letters, or `None` for any other byte.
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so narrowing to `u8` is lossless.
    char::from(b).to_digit(16).map(|value| value as u8)
}
421
422fn decode_name(raw: &[u8]) -> String {
423 let mut result = Vec::with_capacity(raw.len());
424 let mut i = 0;
425 while i < raw.len() {
426 if raw[i] == b'#' && i + 2 < raw.len() {
427 if let (Some(h), Some(l)) = (hex_digit(raw[i + 1]), hex_digit(raw[i + 2])) {
428 result.push((h << 4) | l);
429 i += 3;
430 continue;
431 }
432 }
433 result.push(raw[i]);
434 i += 1;
435 }
436 String::from_utf8_lossy(&result).into_owned()
437}
438
#[cfg(test)]
mod tests {
    use super::*;

    /// Default limits shared by the tests that do not tweak them.
    fn lim() -> ParseLimits {
        ParseLimits::default()
    }

    /// Lexes the first token of `input`, starting at offset 0.
    fn first_token(input: &[u8], limits: &ParseLimits) -> Result<PdfObject> {
        Lexer::new(input, 0, limits).next_token()
    }

    #[test]
    fn lex_name() {
        let token = first_token(b"/Type", &lim()).unwrap();
        assert_eq!(token, PdfObject::Name(PdfName::new("Type")));
    }

    #[test]
    fn lex_name_with_hex_escape() {
        let token = first_token(b"/A#20B", &lim()).unwrap();
        assert_eq!(token, PdfObject::Name(PdfName::new("A B")));
    }

    #[test]
    fn lex_integer() {
        let token = first_token(b"42 ", &lim()).unwrap();
        assert_eq!(token, PdfObject::Integer(42));
    }

    #[test]
    fn lex_negative_real() {
        let token = first_token(b"-3.5 ", &lim()).unwrap();
        match token {
            PdfObject::Real(n) => assert!((n - (-3.5)).abs() < 1e-10),
            other => panic!("expected Real, got {other:?}"),
        }
    }

    #[test]
    fn lex_literal_string() {
        let token = first_token(b"(hello world)", &lim()).unwrap();
        let expected = PdfObject::String(PdfString::new(b"hello world".to_vec()));
        assert_eq!(token, expected);
    }

    #[test]
    fn lex_literal_string_nested_parens() {
        let token = first_token(b"(a (b) c)", &lim()).unwrap();
        let expected = PdfObject::String(PdfString::new(b"a (b) c".to_vec()));
        assert_eq!(token, expected);
    }

    #[test]
    fn lex_hex_string() {
        let token = first_token(b"<48656C6C6F>", &lim()).unwrap();
        let expected = PdfObject::String(PdfString::new(b"Hello".to_vec()));
        assert_eq!(token, expected);
    }

    #[test]
    fn lex_array() {
        let token = first_token(b"[1 2 3]", &lim()).unwrap();
        let expected = PdfObject::Array(vec![
            PdfObject::Integer(1),
            PdfObject::Integer(2),
            PdfObject::Integer(3),
        ]);
        assert_eq!(token, expected);
    }

    #[test]
    fn lex_dict() {
        let token = first_token(b"<< /Type /Page /Count 5 >>", &lim()).unwrap();
        match token {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name("Type").unwrap(), "Page");
                assert_eq!(d.get_i64("Count").unwrap(), 5);
            }
            other => panic!("expected Dict, got {other:?}"),
        }
    }

    #[test]
    fn lex_bool_and_null() {
        let limits = lim();
        assert_eq!(first_token(b"true", &limits).unwrap(), PdfObject::Bool(true));
        assert_eq!(first_token(b"false", &limits).unwrap(), PdfObject::Bool(false));
        assert_eq!(first_token(b"null", &limits).unwrap(), PdfObject::Null);
    }

    #[test]
    fn lex_indirect_ref_in_array() {
        let token = first_token(b"[12 0 R]", &lim()).unwrap();
        let expected = PdfObject::Array(vec![PdfObject::Ref(ObjectId(12, 0))]);
        assert_eq!(token, expected);
    }

    #[test]
    fn skip_comments() {
        let token = first_token(b"% comment\n42 ", &lim()).unwrap();
        assert_eq!(token, PdfObject::Integer(42));
    }

    #[test]
    fn reject_deeply_nested_array() {
        let mut limits = lim();
        limits.max_object_depth = 10;

        // 50 opening brackets followed by 50 closing brackets.
        let depth = 50usize;
        let mut data = vec![b'['; depth];
        data.resize(depth * 2, b']');

        let err = first_token(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(10)), "got {err:?}");
    }

    #[test]
    fn reject_deeply_nested_dict() {
        let mut limits = lim();
        limits.max_object_depth = 5;

        // 20 nested dictionaries, the innermost holding `/a 1`.
        let n = 20usize;
        let source = format!("{}1{}", "<< /a ".repeat(n), " >>".repeat(n));

        let err = first_token(source.as_bytes(), &limits).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(5)), "got {err:?}");
    }

    #[test]
    fn nested_within_limit_ok() {
        assert!(first_token(b"[[[[[1]]]]]", &lim()).is_ok());
    }

    #[test]
    fn reject_oversized_literal_string() {
        let mut limits = lim();
        limits.max_string_length = 8;

        // "(" + 100 * "a" + ")"
        let mut data = Vec::with_capacity(102);
        data.push(b'(');
        data.resize(101, b'a');
        data.push(b')');

        let err = first_token(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(8)), "got {err:?}");
    }

    #[test]
    fn reject_oversized_hex_string() {
        let mut limits = lim();
        limits.max_string_length = 4;

        // "<" + 20 * "4" + ">" decodes to 10 bytes, above the limit of 4.
        let mut data = Vec::with_capacity(22);
        data.push(b'<');
        data.resize(21, b'4');
        data.push(b'>');

        let err = first_token(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::StringLengthLimit(4)), "got {err:?}");
    }

    #[test]
    fn small_string_within_limit_ok() {
        let token = first_token(b"(hello)", &lim()).unwrap();
        assert_eq!(token, PdfObject::String(PdfString::new(b"hello".to_vec())));
    }
}