1use rpdfium_core::error::{ParseError, PdfError};
12use rpdfium_core::{Name, PdfString};
13
14use crate::object::ObjectId;
15
/// A lexical token produced by [`Tokenizer::next_token`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// An integer numeric object, e.g. `42` or `-7`.
    Integer(i64),
    /// A real (floating-point) numeric object, e.g. `3.14` or `.5`.
    Real(f64),
    /// A string object, from either `(...)` literal or `<...>` hex syntax.
    String(PdfString),
    /// A name object, e.g. `/Type` (with `#xx` hex escapes decoded).
    Name(Name),
    /// The keywords `true` / `false`.
    Boolean(bool),
    /// The keyword `null`.
    Null,
    /// `[` — start of an array.
    ArrayStart,
    /// `]` — end of an array.
    ArrayEnd,
    /// `<<` — start of a dictionary.
    DictStart,
    /// `>>` — end of a dictionary.
    DictEnd,
    /// An indirect reference, e.g. `5 0 R`.
    Ref(ObjectId),
    /// Any other bareword, e.g. `obj`, `stream`, `endobj`.
    Keyword(Vec<u8>),
    /// A `%` comment. NOTE(review): `Tokenizer::next_token` skips comments
    /// and never emits this variant; it may exist for other producers.
    Comment(Vec<u8>),
}
46
/// A cursor-based lexer over a raw PDF byte stream.
pub struct Tokenizer<'a> {
    /// The complete byte slice being tokenized.
    source: &'a [u8],
    /// Current byte offset into `source`.
    pos: usize,
}
52
53impl<'a> Tokenizer<'a> {
54 pub fn new(source: &'a [u8]) -> Self {
56 Self { source, pos: 0 }
57 }
58
59 pub fn new_at(source: &'a [u8], pos: usize) -> Self {
61 Self { source, pos }
62 }
63
64 pub fn position(&self) -> usize {
66 self.pos
67 }
68
69 pub fn set_position(&mut self, pos: usize) {
71 self.pos = pos;
72 }
73
74 pub fn source(&self) -> &'a [u8] {
76 self.source
77 }
78
79 pub fn skip_whitespace_and_comments(&mut self) {
81 loop {
82 while self.pos < self.source.len() && is_whitespace(self.source[self.pos]) {
84 self.pos += 1;
85 }
86 if self.pos < self.source.len() && self.source[self.pos] == b'%' {
88 while self.pos < self.source.len()
89 && self.source[self.pos] != b'\r'
90 && self.source[self.pos] != b'\n'
91 {
92 self.pos += 1;
93 }
94 } else {
95 break;
96 }
97 }
98 }
99
100 pub fn next_token(&mut self) -> Option<Result<Token, PdfError>> {
102 self.skip_whitespace_and_comments();
103
104 if self.pos >= self.source.len() {
105 return None;
106 }
107
108 let offset = self.pos as u64;
109 let b = self.source[self.pos];
110
111 let result = match b {
112 b'[' => {
113 self.pos += 1;
114 Ok(Token::ArrayStart)
115 }
116 b']' => {
117 self.pos += 1;
118 Ok(Token::ArrayEnd)
119 }
120 b'<' => {
121 if self.pos + 1 < self.source.len() && self.source[self.pos + 1] == b'<' {
122 self.pos += 2;
123 Ok(Token::DictStart)
124 } else {
125 self.read_hex_string(offset)
126 }
127 }
128 b'>' => {
129 if self.pos + 1 < self.source.len() && self.source[self.pos + 1] == b'>' {
130 self.pos += 2;
131 Ok(Token::DictEnd)
132 } else {
133 Err(PdfError::Parse(ParseError::UnexpectedToken {
134 offset,
135 expected: ">>".into(),
136 found: ">".into(),
137 }))
138 }
139 }
140 b'(' => self.read_literal_string(offset),
141 b'/' => self.read_name(offset),
142 b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(offset),
143 _ if b.is_ascii_alphabetic() => self.read_keyword_or_bool(offset),
144 _ => {
145 self.pos += 1;
146 Err(PdfError::Parse(ParseError::UnexpectedToken {
147 offset,
148 expected: "token".into(),
149 found: format!("byte 0x{:02x}", b),
150 }))
151 }
152 };
153
154 Some(result)
155 }
156
157 pub fn peek_token(&mut self) -> Option<Result<Token, PdfError>> {
161 let saved = self.position();
162 let token = self.next_token();
163 self.set_position(saved);
164 token
165 }
166
167 fn read_literal_string(&mut self, offset: u64) -> Result<Token, PdfError> {
169 debug_assert_eq!(self.source[self.pos], b'(');
170 self.pos += 1; let mut bytes = Vec::new();
173 let mut depth: u32 = 1;
174
175 while self.pos < self.source.len() {
176 let b = self.source[self.pos];
177 self.pos += 1;
178
179 match b {
180 b'(' => {
181 depth += 1;
182 bytes.push(b'(');
183 }
184 b')' => {
185 depth -= 1;
186 if depth == 0 {
187 return Ok(Token::String(PdfString::from_bytes(bytes)));
188 }
189 bytes.push(b')');
190 }
191 b'\\' => {
192 if self.pos >= self.source.len() {
193 return Err(PdfError::Parse(ParseError::InvalidString {
194 offset,
195 detail: "unexpected end of input in escape sequence".into(),
196 }));
197 }
198 let escaped = self.source[self.pos];
199 self.pos += 1;
200 match escaped {
201 b'n' => bytes.push(b'\n'),
202 b'r' => bytes.push(b'\r'),
203 b't' => bytes.push(b'\t'),
204 b'b' => bytes.push(0x08),
205 b'f' => bytes.push(0x0C),
206 b'(' => bytes.push(b'('),
207 b')' => bytes.push(b')'),
208 b'\\' => bytes.push(b'\\'),
209 b'\r' => {
210 if self.pos < self.source.len() && self.source[self.pos] == b'\n' {
212 self.pos += 1;
213 }
214 }
215 b'\n' => {
216 }
218 b'0'..=b'7' => {
219 let mut val: u8 = escaped - b'0';
221 for _ in 0..2 {
222 if self.pos < self.source.len() {
223 let next = self.source[self.pos];
224 if (b'0'..=b'7').contains(&next) {
225 val = val * 8 + (next - b'0');
226 self.pos += 1;
227 } else {
228 break;
229 }
230 }
231 }
232 bytes.push(val);
233 }
234 _ => {
235 bytes.push(escaped);
238 }
239 }
240 }
241 _ => bytes.push(b),
242 }
243 }
244
245 Err(PdfError::Parse(ParseError::InvalidString {
246 offset,
247 detail: "unterminated literal string".into(),
248 }))
249 }
250
251 fn read_hex_string(&mut self, offset: u64) -> Result<Token, PdfError> {
253 debug_assert_eq!(self.source[self.pos], b'<');
254 self.pos += 1; let mut hex_chars = Vec::new();
257
258 while self.pos < self.source.len() {
259 let b = self.source[self.pos];
260 self.pos += 1;
261
262 match b {
263 b'>' => {
264 if hex_chars.len() % 2 != 0 {
266 hex_chars.push(0);
267 }
268 let bytes: Vec<u8> = hex_chars
269 .chunks(2)
270 .map(|pair| pair[0] * 16 + pair[1])
271 .collect();
272 return Ok(Token::String(PdfString::from_bytes(bytes)));
273 }
274 _ if is_whitespace(b) => continue,
275 _ => {
276 let nibble =
277 hex_digit(b).ok_or(PdfError::Parse(ParseError::InvalidString {
278 offset,
279 detail: format!("invalid hex digit 0x{:02x}", b),
280 }))?;
281 hex_chars.push(nibble);
282 }
283 }
284 }
285
286 Err(PdfError::Parse(ParseError::InvalidString {
287 offset,
288 detail: "unterminated hex string".into(),
289 }))
290 }
291
292 fn read_name(&mut self, _offset: u64) -> Result<Token, PdfError> {
294 debug_assert_eq!(self.source[self.pos], b'/');
295 self.pos += 1; let mut bytes = Vec::new();
298
299 while self.pos < self.source.len() {
300 let b = self.source[self.pos];
301
302 if is_whitespace(b) || is_delimiter(b) {
303 break;
304 }
305
306 self.pos += 1;
307
308 if b == b'#' {
309 if self.pos + 1 < self.source.len() {
311 let hi = hex_digit(self.source[self.pos]);
312 let lo = hex_digit(self.source[self.pos + 1]);
313 if let (Some(h), Some(l)) = (hi, lo) {
314 bytes.push(h * 16 + l);
315 self.pos += 2;
316 continue;
317 }
318 }
319 bytes.push(b'#');
321 } else {
322 bytes.push(b);
323 }
324 }
325
326 Ok(Token::Name(Name::from_bytes(bytes)))
327 }
328
329 fn read_number(&mut self, offset: u64) -> Result<Token, PdfError> {
331 let start = self.pos;
332 let mut has_dot = false;
333
334 if self.pos < self.source.len()
336 && (self.source[self.pos] == b'+' || self.source[self.pos] == b'-')
337 {
338 self.pos += 1;
339 }
340
341 if self.pos < self.source.len() && self.source[self.pos] == b'.' {
343 has_dot = true;
344 self.pos += 1;
345 }
346
347 let digit_start = self.pos;
348
349 while self.pos < self.source.len() {
350 let b = self.source[self.pos];
351 if b == b'.' && !has_dot {
352 has_dot = true;
353 self.pos += 1;
354 } else if b.is_ascii_digit() {
355 self.pos += 1;
356 } else {
357 break;
358 }
359 }
360
361 if self.pos == digit_start && !has_dot {
363 self.pos = start;
366 return self.read_keyword_or_bool(offset);
367 }
368
369 let num_str = std::str::from_utf8(&self.source[start..self.pos])
370 .map_err(|_| PdfError::Parse(ParseError::InvalidNumber { offset }))?;
371
372 if has_dot {
373 let val: f64 = num_str
374 .parse()
375 .map_err(|_| PdfError::Parse(ParseError::InvalidNumber { offset }))?;
376 Ok(Token::Real(val))
377 } else {
378 match num_str.parse::<i64>() {
380 Ok(val) => {
381 let saved_pos = self.pos;
383 if let Some(obj_ref) = self.try_read_reference(val, offset) {
384 return Ok(Token::Ref(obj_ref));
385 }
386 self.pos = saved_pos;
387 Ok(Token::Integer(val))
388 }
389 Err(_) => {
390 let val: f64 = num_str
391 .parse()
392 .map_err(|_| PdfError::Parse(ParseError::InvalidNumber { offset }))?;
393 Ok(Token::Real(val))
394 }
395 }
396 }
397 }
398
399 fn try_read_reference(&mut self, number: i64, _offset: u64) -> Option<ObjectId> {
401 if number < 0 {
402 return None;
403 }
404
405 let saved = self.pos;
406
407 self.skip_whitespace_and_comments();
408
409 let gen_start = self.pos;
411 while self.pos < self.source.len()
412 && self.source[self.pos] >= b'0'
413 && self.source[self.pos] <= b'9'
414 {
415 self.pos += 1;
416 }
417
418 if self.pos == gen_start {
419 self.pos = saved;
420 return None;
421 }
422
423 let gen_str = std::str::from_utf8(&self.source[gen_start..self.pos]).ok()?;
424 let generation: u16 = gen_str.parse().ok()?;
425
426 self.skip_whitespace_and_comments();
427
428 if self.pos < self.source.len() && self.source[self.pos] == b'R' {
430 let after_r = self.pos + 1;
431 if after_r >= self.source.len()
432 || is_whitespace(self.source[after_r])
433 || is_delimiter(self.source[after_r])
434 {
435 self.pos = after_r;
436 return Some(ObjectId::new(number as u32, generation));
437 }
438 }
439
440 self.pos = saved;
441 None
442 }
443
444 fn read_keyword_or_bool(&mut self, _offset: u64) -> Result<Token, PdfError> {
446 let start = self.pos;
447
448 while self.pos < self.source.len() {
449 let b = self.source[self.pos];
450 if is_whitespace(b) || is_delimiter(b) {
451 break;
452 }
453 self.pos += 1;
454 }
455
456 let word = &self.source[start..self.pos];
457
458 match word {
459 b"true" => Ok(Token::Boolean(true)),
460 b"false" => Ok(Token::Boolean(false)),
461 b"null" => Ok(Token::Null),
462 _ => Ok(Token::Keyword(word.to_vec())),
463 }
464 }
465}
466
/// True for the six PDF whitespace bytes: NUL, TAB, LF, FF, CR and space.
#[inline]
pub fn is_whitespace(b: u8) -> bool {
    b == b'\0' || b == b'\t' || b == b'\n' || b == 0x0C || b == b'\r' || b == b' '
}
472
/// True for the ten PDF delimiter bytes: `( ) < > [ ] { } / %`.
#[inline]
pub fn is_delimiter(b: u8) -> bool {
    b"()<>[]{}/%".contains(&b)
}
481
/// Decodes one hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value 0..=15,
/// or `None` for any other byte.
#[inline]
fn hex_digit(b: u8) -> Option<u8> {
    // `b as char` maps bytes >= 0x80 to Latin-1 code points, none of which
    // are hex digits, so this is equivalent to the explicit range match.
    (b as char).to_digit(16).map(|v| v as u8)
}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `source` to completion, panicking on any lexical error.
    fn tokenize_all(source: &[u8]) -> Vec<Token> {
        let mut tok = Tokenizer::new(source);
        let mut tokens = Vec::new();
        while let Some(result) = tok.next_token() {
            tokens.push(result.unwrap());
        }
        tokens
    }

    #[test]
    fn test_integer_token() {
        let tokens = tokenize_all(b"42");
        assert_eq!(tokens, vec![Token::Integer(42)]);
    }

    #[test]
    fn test_negative_integer() {
        let tokens = tokenize_all(b"-7");
        assert_eq!(tokens, vec![Token::Integer(-7)]);
    }

    #[test]
    fn test_positive_integer() {
        let tokens = tokenize_all(b"+3");
        assert_eq!(tokens, vec![Token::Integer(3)]);
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_real_number() {
        let tokens = tokenize_all(b"3.14");
        assert_eq!(tokens, vec![Token::Real(3.14)]);
    }

    #[test]
    fn test_real_leading_dot() {
        let tokens = tokenize_all(b".5");
        assert_eq!(tokens, vec![Token::Real(0.5)]);
    }

    #[test]
    fn test_real_negative() {
        let tokens = tokenize_all(b"-1.5");
        assert_eq!(tokens, vec![Token::Real(-1.5)]);
    }

    #[test]
    fn test_boolean_true() {
        let tokens = tokenize_all(b"true");
        assert_eq!(tokens, vec![Token::Boolean(true)]);
    }

    #[test]
    fn test_boolean_false() {
        let tokens = tokenize_all(b"false");
        assert_eq!(tokens, vec![Token::Boolean(false)]);
    }

    #[test]
    fn test_null_token() {
        let tokens = tokenize_all(b"null");
        assert_eq!(tokens, vec![Token::Null]);
    }

    #[test]
    fn test_name_simple() {
        let tokens = tokenize_all(b"/Type");
        assert_eq!(
            tokens,
            vec![Token::Name(Name::from_bytes(b"Type".to_vec()))]
        );
    }

    #[test]
    fn test_name_with_hex_escape() {
        // #20 decodes to a space inside the name.
        let tokens = tokenize_all(b"/Name#20With#20Spaces");
        assert_eq!(
            tokens,
            vec![Token::Name(Name::from_bytes(b"Name With Spaces".to_vec()))]
        );
    }

    #[test]
    fn test_name_empty() {
        // A bare '/' is a valid, empty name.
        let tokens = tokenize_all(b"/ ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(Vec::new()))]);
    }

    #[test]
    fn test_literal_string_simple() {
        let tokens = tokenize_all(b"(Hello World)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(
                b"Hello World".to_vec()
            ))]
        );
    }

    #[test]
    fn test_literal_string_nested_parens() {
        // Balanced inner parentheses are part of the string content.
        let tokens = tokenize_all(b"(A (B) C)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"A (B) C".to_vec()))]
        );
    }

    #[test]
    fn test_literal_string_escape_sequences() {
        let tokens = tokenize_all(b"(\\n\\r\\t\\\\)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"\n\r\t\\".to_vec()))]
        );
    }

    #[test]
    fn test_literal_string_octal_escape() {
        // Octal \101 == 0x41 == 'A'.
        let tokens = tokenize_all(b"(\\101)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"A".to_vec()))]
        );
    }

    #[test]
    fn test_hex_string() {
        let tokens = tokenize_all(b"<48656C6C6F>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"Hello".to_vec()))]
        );
    }

    #[test]
    fn test_hex_string_with_whitespace() {
        // Whitespace between hex digits is ignored.
        let tokens = tokenize_all(b"<48 65 6C 6C 6F>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"Hello".to_vec()))]
        );
    }

    #[test]
    fn test_hex_string_odd_digits() {
        // An odd digit count is padded with a trailing zero nibble.
        let tokens = tokenize_all(b"<ABC>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(vec![0xAB, 0xC0]))]
        );
    }

    #[test]
    fn test_array_delimiters() {
        let tokens = tokenize_all(b"[1 2]");
        assert_eq!(
            tokens,
            vec![
                Token::ArrayStart,
                Token::Integer(1),
                Token::Integer(2),
                Token::ArrayEnd,
            ]
        );
    }

    #[test]
    fn test_dict_delimiters() {
        let tokens = tokenize_all(b"<< /Type /Catalog >>");
        assert_eq!(
            tokens,
            vec![
                Token::DictStart,
                Token::Name(Name::from_bytes(b"Type".to_vec())),
                Token::Name(Name::from_bytes(b"Catalog".to_vec())),
                Token::DictEnd,
            ]
        );
    }

    #[test]
    fn test_indirect_reference() {
        let tokens = tokenize_all(b"5 0 R");
        assert_eq!(tokens, vec![Token::Ref(ObjectId::new(5, 0))]);
    }

    #[test]
    fn test_indirect_reference_in_array() {
        let tokens = tokenize_all(b"[5 0 R]");
        assert_eq!(
            tokens,
            vec![
                Token::ArrayStart,
                Token::Ref(ObjectId::new(5, 0)),
                Token::ArrayEnd,
            ]
        );
    }

    #[test]
    fn test_keyword_obj() {
        let tokens = tokenize_all(b"obj");
        assert_eq!(tokens, vec![Token::Keyword(b"obj".to_vec())]);
    }

    #[test]
    fn test_keyword_stream() {
        let tokens = tokenize_all(b"stream");
        assert_eq!(tokens, vec![Token::Keyword(b"stream".to_vec())]);
    }

    #[test]
    fn test_skip_comments() {
        let tokens = tokenize_all(b"% this is a comment\n42");
        assert_eq!(tokens, vec![Token::Integer(42)]);
    }

    #[test]
    fn test_mixed_tokens() {
        let source = b"1 0 obj\n<< /Type /Page /MediaBox [0 0 612 792] >>\nendobj";
        let tokens = tokenize_all(source);
        assert_eq!(tokens[0], Token::Integer(1));
        assert_eq!(tokens[1], Token::Integer(0));
        assert_eq!(tokens[2], Token::Keyword(b"obj".to_vec()));
        assert_eq!(tokens[3], Token::DictStart);
    }

    #[test]
    fn test_position_tracking() {
        let mut tok = Tokenizer::new(b"  42");
        assert_eq!(tok.position(), 0);
        let _ = tok.next_token();
        // Cursor ends just past "42" (2 spaces + 2 digits).
        assert_eq!(tok.position(), 4);
    }

    #[test]
    fn test_empty_input() {
        let tokens = tokenize_all(b"");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let tokens = tokenize_all(b" \n\r\t ");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_literal_string_line_continuation() {
        // "\<LF>" joins two lines without emitting any byte.
        let tokens = tokenize_all(b"(line1\\\nline2)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"line1line2".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_empty() {
        let tokens = tokenize_all(b"<>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(Vec::new()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_stops_at_closing_angle() {
        let tokens = tokenize_all(b"<1A2b>abcd");
        assert_eq!(
            tokens,
            vec![
                Token::String(PdfString::from_bytes(vec![0x1A, 0x2B])),
                Token::Keyword(b"abcd".to_vec()),
            ]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_uneven_single_digit() {
        // Single digit "A" pads to 0xA0.
        let tokens = tokenize_all(b"<A>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(vec![0xA0]))]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_unterminated_is_error() {
        let mut tok = Tokenizer::new(b"<1A2b");
        let result = tok.next_token();
        assert!(result.is_some());
        assert!(result.unwrap().is_err());
    }

    #[test]
    fn test_syntax_parser_read_hex_string_invalid_hex_digit() {
        let mut tok = Tokenizer::new(b"<zz>");
        let result = tok.next_token();
        assert!(result.is_some());
        assert!(result.unwrap().is_err());
    }

    #[test]
    fn test_syntax_parser_read_hex_string_just_closing_angle() {
        let tokens = tokenize_all(b"<>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(Vec::new()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_whitespace_varieties() {
        // Space, tab, LF and CR are all ignored between digits.
        let tokens = tokenize_all(b"<4 8\t6\n5\r6C>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"Hel".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_hex_string_lowercase() {
        let tokens = tokenize_all(b"<48656c6c6f>");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"Hello".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_get_invalid_reference() {
        // 4294967295 == u32::MAX still fits an object number; the input
        // must tokenize without error either way.
        let tokens = tokenize_all(b"4294967295 0 R");
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_simple_parser_get_word_whitespace_separated() {
        let tokens = tokenize_all(b"  foo  bar  ");
        assert_eq!(
            tokens,
            vec![
                Token::Keyword(b"foo".to_vec()),
                Token::Keyword(b"bar".to_vec()),
            ]
        );
    }

    #[test]
    fn test_simple_parser_get_word_around_delimiters() {
        let tokens = tokenize_all(b"/Name[42]");
        assert_eq!(
            tokens,
            vec![
                Token::Name(Name::from_bytes(b"Name".to_vec())),
                Token::ArrayStart,
                Token::Integer(42),
                Token::ArrayEnd,
            ]
        );
    }

    #[test]
    fn test_simple_parser_get_word_comment_then_token() {
        let tokens = tokenize_all(b"% comment line\r\n/Next");
        assert_eq!(
            tokens,
            vec![Token::Name(Name::from_bytes(b"Next".to_vec()))]
        );
    }

    #[test]
    fn test_simple_parser_get_word_multiple_comments() {
        let tokens = tokenize_all(b"% first\n% second\n42");
        assert_eq!(tokens, vec![Token::Integer(42)]);
    }

    #[test]
    fn test_syntax_parser_read_string_deeply_nested_parens() {
        let tokens = tokenize_all(b"(a(b(c)d)e)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"a(b(c)d)e".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_string_escape_backspace_formfeed() {
        let tokens = tokenize_all(b"(\\b\\f)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(vec![0x08, 0x0C]))]
        );
    }

    #[test]
    fn test_syntax_parser_read_string_escaped_parens() {
        let tokens = tokenize_all(b"(\\(\\))");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"()".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_string_unterminated() {
        let mut tok = Tokenizer::new(b"(no closing paren");
        let result = tok.next_token();
        assert!(result.is_some());
        assert!(result.unwrap().is_err());
    }

    #[test]
    fn test_syntax_parser_read_string_octal_range() {
        // Lowest and highest single-byte octal escapes.
        let tokens = tokenize_all(b"(\\000\\377)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(vec![0x00, 0xFF]))]
        );
    }

    #[test]
    fn test_syntax_parser_read_string_cr_lf_continuation() {
        // "\<CR><LF>" is a single line continuation, not two.
        let tokens = tokenize_all(b"(line1\\\r\nline2)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"line1line2".to_vec()))]
        );
    }

    #[test]
    fn test_syntax_parser_read_string_unknown_escape() {
        // Unknown escapes drop the backslash and keep the byte.
        let tokens = tokenize_all(b"(\\z)");
        assert_eq!(
            tokens,
            vec![Token::String(PdfString::from_bytes(b"z".to_vec()))]
        );
    }

    #[test]
    fn test_name_all_hex_encoded() {
        // #54#79#70#65 decodes to "Type".
        let tokens = tokenize_all(b"/#54#79#70#65");
        assert_eq!(
            tokens,
            vec![Token::Name(Name::from_bytes(b"Type".to_vec()))]
        );
    }

    #[test]
    fn test_name_at_eof() {
        let tokens = tokenize_all(b"/EOF");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"EOF".to_vec()))]);
    }

    #[test]
    fn test_large_integer_becomes_real() {
        // Exceeds i64::MAX, so it degrades to a Real token.
        let tokens = tokenize_all(b"99999999999999999999");
        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::Real(_) => {}
            other => panic!("expected Real, got {:?}", other),
        }
    }

    #[test]
    fn test_zero_integer() {
        let tokens = tokenize_all(b"0");
        assert_eq!(tokens, vec![Token::Integer(0)]);
    }

    #[test]
    fn test_negative_zero() {
        let tokens = tokenize_all(b"-0");
        assert_eq!(tokens, vec![Token::Integer(0)]);
    }

    #[test]
    fn test_position_tracks_through_tokens() {
        let mut tok = Tokenizer::new(b"true false null");
        assert_eq!(tok.position(), 0);
        let _ = tok.next_token();
        assert_eq!(tok.position(), 4);
        let _ = tok.next_token();
        assert_eq!(tok.position(), 10);
        let _ = tok.next_token();
        assert_eq!(tok.position(), 15);
        assert!(tok.next_token().is_none());
    }

    #[test]
    fn test_set_position_reparse() {
        let source = b"42 true";
        let mut tok = Tokenizer::new(source);
        let _ = tok.next_token();
        // Rewind to offset 3 and re-lex "true".
        tok.set_position(3);
        let t = tok.next_token().unwrap().unwrap();
        assert_eq!(t, Token::Boolean(true));
    }

    #[test]
    fn test_lone_greater_than_is_error() {
        let mut tok = Tokenizer::new(b">");
        let result = tok.next_token();
        assert!(result.is_some());
        assert!(result.unwrap().is_err());
    }

    #[test]
    fn test_nul_byte_is_whitespace() {
        let tokens = tokenize_all(&[0x00, b'4', b'2']);
        assert_eq!(tokens, vec![Token::Integer(42)]);
    }

    #[test]
    fn test_peek_token_does_not_advance() {
        let source = b"42 true";
        let mut tok = Tokenizer::new(source);
        let first_peek = tok.peek_token();
        let second_peek = tok.peek_token();
        let consumed = tok.next_token();
        assert!(matches!(first_peek, Some(Ok(Token::Integer(42)))));
        assert!(matches!(second_peek, Some(Ok(Token::Integer(42)))));
        assert!(matches!(consumed, Some(Ok(Token::Integer(42)))));
        assert!(matches!(tok.next_token(), Some(Ok(Token::Boolean(true)))));
    }

    #[test]
    fn test_simple_parser_bug358381390() {
        // Regression: an empty hex string "<>" inside a bfchar block must
        // tokenize as an empty string, not derail the stream.
        let input = b"1 beginbfchar\n<01> <>\nendbfchar\n1 beginbfchar";
        let mut tok = Tokenizer::new(input);

        assert!(matches!(tok.next_token(), Some(Ok(Token::Integer(1)))));
        assert!(matches!(tok.next_token(), Some(Ok(Token::Keyword(ref k))) if k == b"beginbfchar"));
        assert!(matches!(tok.next_token(), Some(Ok(Token::String(_)))));
        assert!(
            matches!(tok.next_token(), Some(Ok(Token::String(ref s))) if s.as_bytes().is_empty())
        );
        assert!(matches!(tok.next_token(), Some(Ok(Token::Keyword(ref k))) if k == b"endbfchar"));
        assert!(matches!(tok.next_token(), Some(Ok(Token::Integer(1)))));
        assert!(matches!(tok.next_token(), Some(Ok(Token::Keyword(ref k))) if k == b"beginbfchar"));
        assert!(tok.next_token().is_none());
    }

    #[test]
    fn test_syntax_parser_peek_next_word() {
        let input = b"  WORD  ";
        let mut tok = Tokenizer::new(input);
        let peeked = tok.peek_token();
        assert!(matches!(peeked, Some(Ok(Token::Keyword(ref k))) if k == b"WORD"));
        let consumed = tok.next_token();
        assert!(matches!(consumed, Some(Ok(Token::Keyword(ref k))) if k == b"WORD"));
        assert!(tok.next_token().is_none());
    }

    #[test]
    fn test_parser_utility_name_decode_empty() {
        let tokens = tokenize_all(b"/");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(Vec::new()))]);
    }

    #[test]
    fn test_parser_utility_name_decode_simple() {
        let tokens = tokenize_all(b"/A ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"A".to_vec()))]);
    }

    #[test]
    fn test_parser_utility_name_decode_bare_hash() {
        // '#' with no hex digits stays literal.
        let tokens = tokenize_all(b"/# ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"#".to_vec()))]);
    }

    #[test]
    fn test_parser_utility_name_decode_incomplete_hex() {
        // '#' with only one hex digit stays literal.
        let tokens = tokenize_all(b"/#4 ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"#4".to_vec()))]);
    }

    #[test]
    fn test_parser_utility_name_decode_hex_41() {
        let tokens = tokenize_all(b"/#41 ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"A".to_vec()))]);
    }

    #[test]
    fn test_parser_utility_name_decode_hex_411() {
        // Only the first two digits after '#' form the escape.
        let tokens = tokenize_all(b"/#411 ");
        assert_eq!(tokens, vec![Token::Name(Name::from_bytes(b"A1".to_vec()))]);
    }
}