1use zpdf_core::{Error, ObjectId, ParseLimits, PdfName, PdfObject, PdfString, Result};
2
3pub struct Lexer<'a> {
4 data: &'a [u8],
5 pos: usize,
6 limits: &'a ParseLimits,
7 depth: u32,
8}
9
10impl<'a> Lexer<'a> {
11 pub fn new(data: &'a [u8], pos: usize, limits: &'a ParseLimits) -> Self {
12 Self {
13 data,
14 pos,
15 limits,
16 depth: 0,
17 }
18 }
19
20 fn enter_container(&mut self) -> Result<()> {
23 self.depth += 1;
24 if self.depth > self.limits.max_object_depth {
25 return Err(Error::RecursionLimit(self.limits.max_object_depth));
26 }
27 Ok(())
28 }
29
30 fn leave_container(&mut self) {
31 self.depth = self.depth.saturating_sub(1);
32 }
33
34 pub fn pos(&self) -> usize {
35 self.pos
36 }
37
38 pub fn set_pos(&mut self, pos: usize) {
39 self.pos = pos;
40 }
41
42 pub fn is_eof(&self) -> bool {
43 self.pos >= self.data.len()
44 }
45
46 fn peek(&self) -> Option<u8> {
47 self.data.get(self.pos).copied()
48 }
49
50 fn advance(&mut self) -> Option<u8> {
51 let b = self.data.get(self.pos).copied()?;
52 self.pos += 1;
53 Some(b)
54 }
55
56 pub fn skip_whitespace_and_comments(&mut self) {
57 loop {
58 match self.peek() {
59 Some(b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') => {
60 self.pos += 1;
61 }
62 Some(b'%') => {
63 self.pos += 1;
64 while let Some(b) = self.peek() {
65 self.pos += 1;
66 if b == b'\r' || b == b'\n' {
67 break;
68 }
69 }
70 }
71 _ => break,
72 }
73 }
74 }
75
76 pub fn next_token(&mut self) -> Result<PdfObject> {
77 self.skip_whitespace_and_comments();
78
79 if self.is_eof() {
80 return Err(Error::UnexpectedEof(self.pos as u64));
81 }
82
83 match self.peek().unwrap() {
84 b'/' => self.read_name(),
85 b'(' => self.read_literal_string(),
86 b'<' => {
87 if self.data.get(self.pos + 1) == Some(&b'<') {
88 self.read_dict()
89 } else {
90 self.read_hex_string()
91 }
92 }
93 b'[' => self.read_array(),
94 b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
95 b't' | b'f' => self.read_bool_or_keyword(),
96 b'n' => self.read_null_or_keyword(),
97 _ => Err(Error::InvalidObject(
98 self.pos as u64,
99 format!("unexpected byte: 0x{:02x}", self.peek().unwrap()),
100 )),
101 }
102 }
103
104 fn read_name(&mut self) -> Result<PdfObject> {
105 self.advance(); let start = self.pos;
107 while let Some(b) = self.peek() {
108 if is_delimiter(b) || is_whitespace(b) {
109 break;
110 }
111 self.pos += 1;
112 }
113 let raw = &self.data[start..self.pos];
114 let name = decode_name(raw);
115 Ok(PdfObject::Name(PdfName::new(name)))
116 }
117
118 fn read_literal_string(&mut self) -> Result<PdfObject> {
119 self.advance(); let mut buf = Vec::new();
121 let mut depth = 1u32;
122 let max = self.limits.max_string_length as usize;
123
124 while let Some(b) = self.advance() {
125 match b {
126 b'(' => {
127 depth += 1;
128 buf.push(b'(');
129 }
130 b')' => {
131 depth -= 1;
132 if depth == 0 {
133 break;
134 }
135 buf.push(b')');
136 }
137 b'\\' => {
138 if let Some(esc) = self.advance() {
139 match esc {
140 b'n' => buf.push(b'\n'),
141 b'r' => buf.push(b'\r'),
142 b't' => buf.push(b'\t'),
143 b'b' => buf.push(0x08),
144 b'f' => buf.push(0x0c),
145 b'(' => buf.push(b'('),
146 b')' => buf.push(b')'),
147 b'\\' => buf.push(b'\\'),
148 b'0'..=b'7' => {
149 let mut octal = (esc - b'0') as u16;
150 for _ in 0..2 {
151 match self.peek() {
152 Some(c @ b'0'..=b'7') => {
153 octal = octal * 8 + (c - b'0') as u16;
154 self.pos += 1;
155 }
156 _ => break,
157 }
158 }
159 buf.push(octal as u8);
160 }
161 b'\r' => {
162 if self.peek() == Some(b'\n') {
163 self.pos += 1;
164 }
165 }
166 b'\n' => {}
167 _ => buf.push(esc),
168 }
169 }
170 }
171 _ => buf.push(b),
172 }
173 if buf.len() > max {
176 return Err(Error::StringLengthLimit(self.limits.max_string_length));
177 }
178 }
179
180 Ok(PdfObject::String(PdfString::new(buf)))
181 }
182
183 fn read_hex_string(&mut self) -> Result<PdfObject> {
184 self.advance(); let mut buf = Vec::new();
186 let mut high: Option<u8> = None;
187 let max = self.limits.max_string_length as usize;
188
189 loop {
190 match self.advance() {
191 Some(b'>') => break,
192 Some(b) if is_whitespace(b) => continue,
193 Some(b) => {
194 let nibble = hex_digit(b).ok_or_else(|| {
195 Error::InvalidObject(self.pos as u64 - 1, "invalid hex digit".into())
196 })?;
197 match high {
198 None => high = Some(nibble),
199 Some(h) => {
200 buf.push((h << 4) | nibble);
201 high = None;
202 if buf.len() > max {
203 return Err(Error::StringLengthLimit(
204 self.limits.max_string_length,
205 ));
206 }
207 }
208 }
209 }
210 None => return Err(Error::UnexpectedEof(self.pos as u64)),
211 }
212 }
213
214 if let Some(h) = high {
215 buf.push(h << 4);
216 }
217
218 Ok(PdfObject::String(PdfString::new(buf)))
219 }
220
221 fn read_number(&mut self) -> Result<PdfObject> {
222 let start = self.pos;
223 let mut has_dot = false;
224
225 if matches!(self.peek(), Some(b'+' | b'-')) {
226 self.pos += 1;
227 }
228
229 while let Some(b) = self.peek() {
230 match b {
231 b'0'..=b'9' => self.pos += 1,
232 b'.' if !has_dot => {
233 has_dot = true;
234 self.pos += 1;
235 }
236 _ => break,
237 }
238 }
239
240 let s = std::str::from_utf8(&self.data[start..self.pos])
241 .map_err(|_| Error::InvalidObject(start as u64, "invalid number".into()))?;
242
243 if has_dot {
244 let n: f64 = s
245 .parse()
246 .map_err(|_| Error::InvalidObject(start as u64, format!("bad real: {s}")))?;
247 Ok(PdfObject::Real(n))
248 } else {
249 let n: i64 = s
250 .parse()
251 .map_err(|_| Error::InvalidObject(start as u64, format!("bad integer: {s}")))?;
252 Ok(PdfObject::Integer(n))
253 }
254 }
255
256 fn read_bool_or_keyword(&mut self) -> Result<PdfObject> {
257 let start = self.pos;
258 while let Some(b) = self.peek() {
259 if is_delimiter(b) || is_whitespace(b) {
260 break;
261 }
262 self.pos += 1;
263 }
264 let word = &self.data[start..self.pos];
265 match word {
266 b"true" => Ok(PdfObject::Bool(true)),
267 b"false" => Ok(PdfObject::Bool(false)),
268 _ => Err(Error::InvalidObject(
269 start as u64,
270 format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
271 )),
272 }
273 }
274
275 fn read_null_or_keyword(&mut self) -> Result<PdfObject> {
276 let start = self.pos;
277 while let Some(b) = self.peek() {
278 if is_delimiter(b) || is_whitespace(b) {
279 break;
280 }
281 self.pos += 1;
282 }
283 let word = &self.data[start..self.pos];
284 match word {
285 b"null" => Ok(PdfObject::Null),
286 _ => Err(Error::InvalidObject(
287 start as u64,
288 format!("unexpected keyword: {}", String::from_utf8_lossy(word)),
289 )),
290 }
291 }
292
293 fn read_array(&mut self) -> Result<PdfObject> {
294 self.enter_container()?;
295 self.advance(); let mut items = Vec::new();
297 loop {
298 self.skip_whitespace_and_comments();
299 if self.peek() == Some(b']') {
300 self.pos += 1;
301 break;
302 }
303 if self.is_eof() {
304 return Err(Error::UnexpectedEof(self.pos as u64));
305 }
306 let obj = self.next_token()?;
307 items.push(self.maybe_resolve_ref(obj)?);
308 }
309 self.leave_container();
310 Ok(PdfObject::Array(items))
311 }
312
313 fn read_dict(&mut self) -> Result<PdfObject> {
314 self.enter_container()?;
315 self.pos += 2; let mut dict = zpdf_core::PdfDict::new();
317 loop {
318 self.skip_whitespace_and_comments();
319 if self.data.get(self.pos..self.pos + 2) == Some(b">>") {
320 self.pos += 2;
321 break;
322 }
323 if self.is_eof() {
324 return Err(Error::UnexpectedEof(self.pos as u64));
325 }
326 let key = match self.next_token()? {
327 PdfObject::Name(n) => n,
328 other => {
329 return Err(Error::InvalidObject(
330 self.pos as u64,
331 format!("dict key must be Name, got {}", other.type_name()),
332 ));
333 }
334 };
335 let value = self.next_token()?;
336 let value = self.maybe_resolve_ref(value)?;
337 dict.insert(key, value);
338 }
339 self.leave_container();
340 Ok(PdfObject::Dict(dict))
341 }
342
343 pub(crate) fn maybe_resolve_ref(&mut self, obj: PdfObject) -> Result<PdfObject> {
344 if let PdfObject::Integer(num) = obj {
345 let saved = self.pos;
346 self.skip_whitespace_and_comments();
347 if let Ok(PdfObject::Integer(gen)) = self.read_number_if_available() {
348 self.skip_whitespace_and_comments();
349 if self.peek() == Some(b'R') {
350 self.pos += 1;
351 return Ok(PdfObject::Ref(ObjectId(num as u32, gen as u16)));
352 }
353 }
354 self.pos = saved;
355 Ok(PdfObject::Integer(num))
356 } else {
357 Ok(obj)
358 }
359 }
360
361 fn read_number_if_available(&mut self) -> Result<PdfObject> {
362 if matches!(self.peek(), Some(b'0'..=b'9' | b'+' | b'-' | b'.')) {
363 self.read_number()
364 } else {
365 Err(Error::InvalidObject(self.pos as u64, "not a number".into()))
366 }
367 }
368}
369
/// Returns `true` for the bytes this lexer treats as whitespace:
/// space, tab, CR, LF, NUL and form feed.
fn is_whitespace(b: u8) -> bool {
    const WHITESPACE: &[u8] = b" \t\r\n\x00\x0c";
    WHITESPACE.contains(&b)
}
373
/// Returns `true` for the bytes this lexer treats as delimiters:
/// `( ) < > [ ] { } / %`.
fn is_delimiter(b: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&b)
}
380
/// Converts an ASCII hex digit (`0-9`, `a-f`, `A-F`) to its value 0..=15.
///
/// Returns `None` for any other byte.
fn hex_digit(b: u8) -> Option<u8> {
    // A `u8` maps to U+0000..=U+00FF, and `to_digit(16)` accepts only the
    // ASCII hex digits, so non-ASCII bytes yield `None`.
    let value = char::from(b).to_digit(16)?;
    Some(value as u8)
}
389
390fn decode_name(raw: &[u8]) -> String {
391 let mut result = Vec::with_capacity(raw.len());
392 let mut i = 0;
393 while i < raw.len() {
394 if raw[i] == b'#' && i + 2 < raw.len() {
395 if let (Some(h), Some(l)) = (hex_digit(raw[i + 1]), hex_digit(raw[i + 2])) {
396 result.push((h << 4) | l);
397 i += 3;
398 continue;
399 }
400 }
401 result.push(raw[i]);
402 i += 1;
403 }
404 String::from_utf8_lossy(&result).into_owned()
405}
406
407#[cfg(test)]
408mod tests {
409 use super::*;
410
411 fn lim() -> ParseLimits {
412 ParseLimits::default()
413 }
414
415 #[test]
416 fn lex_name() {
417 let l = lim();
418 let mut lex = Lexer::new(b"/Type", 0, &l);
419 let obj = lex.next_token().unwrap();
420 assert_eq!(obj, PdfObject::Name(PdfName::new("Type")));
421 }
422
423 #[test]
424 fn lex_name_with_hex_escape() {
425 let l = lim();
426 let mut lex = Lexer::new(b"/A#20B", 0, &l);
427 let obj = lex.next_token().unwrap();
428 assert_eq!(obj, PdfObject::Name(PdfName::new("A B")));
429 }
430
431 #[test]
432 fn lex_integer() {
433 let l = lim();
434 let mut lex = Lexer::new(b"42 ", 0, &l);
435 assert_eq!(lex.next_token().unwrap(), PdfObject::Integer(42));
436 }
437
438 #[test]
439 fn lex_negative_real() {
440 let l = lim();
441 let mut lex = Lexer::new(b"-3.5 ", 0, &l);
442 match lex.next_token().unwrap() {
443 PdfObject::Real(n) => assert!((n - (-3.5)).abs() < 1e-10),
444 other => panic!("expected Real, got {other:?}"),
445 }
446 }
447
448 #[test]
449 fn lex_literal_string() {
450 let l = lim();
451 let mut lex = Lexer::new(b"(hello world)", 0, &l);
452 let obj = lex.next_token().unwrap();
453 assert_eq!(
454 obj,
455 PdfObject::String(PdfString::new(b"hello world".to_vec()))
456 );
457 }
458
459 #[test]
460 fn lex_literal_string_nested_parens() {
461 let l = lim();
462 let mut lex = Lexer::new(b"(a (b) c)", 0, &l);
463 let obj = lex.next_token().unwrap();
464 assert_eq!(obj, PdfObject::String(PdfString::new(b"a (b) c".to_vec())));
465 }
466
467 #[test]
468 fn lex_hex_string() {
469 let l = lim();
470 let mut lex = Lexer::new(b"<48656C6C6F>", 0, &l);
471 let obj = lex.next_token().unwrap();
472 assert_eq!(obj, PdfObject::String(PdfString::new(b"Hello".to_vec())));
473 }
474
475 #[test]
476 fn lex_array() {
477 let l = lim();
478 let mut lex = Lexer::new(b"[1 2 3]", 0, &l);
479 let obj = lex.next_token().unwrap();
480 assert_eq!(
481 obj,
482 PdfObject::Array(vec![
483 PdfObject::Integer(1),
484 PdfObject::Integer(2),
485 PdfObject::Integer(3),
486 ])
487 );
488 }
489
490 #[test]
491 fn lex_dict() {
492 let l = lim();
493 let mut lex = Lexer::new(b"<< /Type /Page /Count 5 >>", 0, &l);
494 let obj = lex.next_token().unwrap();
495 match obj {
496 PdfObject::Dict(d) => {
497 assert_eq!(d.get_name("Type").unwrap(), "Page");
498 assert_eq!(d.get_i64("Count").unwrap(), 5);
499 }
500 other => panic!("expected Dict, got {other:?}"),
501 }
502 }
503
504 #[test]
505 fn lex_bool_and_null() {
506 let l = lim();
507 let mut lex = Lexer::new(b"true", 0, &l);
508 assert_eq!(lex.next_token().unwrap(), PdfObject::Bool(true));
509
510 let mut lex = Lexer::new(b"false", 0, &l);
511 assert_eq!(lex.next_token().unwrap(), PdfObject::Bool(false));
512
513 let mut lex = Lexer::new(b"null", 0, &l);
514 assert_eq!(lex.next_token().unwrap(), PdfObject::Null);
515 }
516
517 #[test]
518 fn lex_indirect_ref_in_array() {
519 let l = lim();
520 let mut lex = Lexer::new(b"[12 0 R]", 0, &l);
521 let obj = lex.next_token().unwrap();
522 assert_eq!(obj, PdfObject::Array(vec![PdfObject::Ref(ObjectId(12, 0))]));
523 }
524
525 #[test]
526 fn skip_comments() {
527 let l = lim();
528 let mut lex = Lexer::new(b"% comment\n42 ", 0, &l);
529 assert_eq!(lex.next_token().unwrap(), PdfObject::Integer(42));
530 }
531
532 #[test]
533 fn reject_deeply_nested_array() {
534 let mut l = lim();
535 l.max_object_depth = 10;
536 let depth = 50usize;
537 let mut data = vec![b'['; depth];
538 data.extend(std::iter::repeat_n(b']', depth));
539 let mut lex = Lexer::new(&data, 0, &l);
540 let err = lex.next_token().unwrap_err();
541 assert!(matches!(err, Error::RecursionLimit(10)), "got {err:?}");
542 }
543
544 #[test]
545 fn reject_deeply_nested_dict() {
546 let mut l = lim();
547 l.max_object_depth = 5;
548 let n = 20usize;
549 let mut s = String::new();
550 for _ in 0..n {
551 s.push_str("<< /a ");
552 }
553 s.push('1');
554 for _ in 0..n {
555 s.push_str(" >>");
556 }
557 let data = s.into_bytes();
558 let mut lex = Lexer::new(&data, 0, &l);
559 let err = lex.next_token().unwrap_err();
560 assert!(matches!(err, Error::RecursionLimit(5)), "got {err:?}");
561 }
562
563 #[test]
564 fn nested_within_limit_ok() {
565 let l = lim(); let data = b"[[[[[1]]]]]"; let mut lex = Lexer::new(data, 0, &l);
568 assert!(lex.next_token().is_ok());
569 }
570
571 #[test]
572 fn reject_oversized_literal_string() {
573 let mut l = lim();
574 l.max_string_length = 8;
575 let mut data = vec![b'('];
576 data.extend(std::iter::repeat_n(b'a', 100));
577 data.push(b')');
578 let mut lex = Lexer::new(&data, 0, &l);
579 let err = lex.next_token().unwrap_err();
580 assert!(matches!(err, Error::StringLengthLimit(8)), "got {err:?}");
581 }
582
583 #[test]
584 fn reject_oversized_hex_string() {
585 let mut l = lim();
586 l.max_string_length = 4;
587 let mut data = vec![b'<'];
589 data.extend(std::iter::repeat_n(b'4', 20));
590 data.push(b'>');
591 let mut lex = Lexer::new(&data, 0, &l);
592 let err = lex.next_token().unwrap_err();
593 assert!(matches!(err, Error::StringLengthLimit(4)), "got {err:?}");
594 }
595
596 #[test]
597 fn small_string_within_limit_ok() {
598 let l = lim(); let mut lex = Lexer::new(b"(hello)", 0, &l);
600 assert_eq!(
601 lex.next_token().unwrap(),
602 PdfObject::String(PdfString::new(b"hello".to_vec()))
603 );
604 }
605}