1use super::{ParseError, ParseResult};
6use std::io::Read;
7
/// Lexical tokens produced by [`Lexer`] while scanning a PDF byte stream.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `true` or `false`.
    Boolean(bool),

    /// Integer number, e.g. `123` or `-456`.
    Integer(i64),

    /// Real (floating-point) number, e.g. `3.14`.
    Real(f64),

    /// Literal `(...)` or hex `<...>` string. Raw bytes — not necessarily UTF-8.
    String(Vec<u8>),

    /// Name object, e.g. `/Type` (leading slash stripped, `#xx` escapes decoded).
    Name(String),

    /// `[`
    ArrayStart,

    /// `]`
    ArrayEnd,

    /// `<<`
    DictStart,

    /// `>>`
    DictEnd,

    /// `stream` keyword.
    Stream,

    /// `endstream` keyword.
    EndStream,

    /// `obj` keyword.
    Obj,

    /// `endobj` keyword.
    EndObj,

    /// `startxref` keyword.
    StartXRef,

    /// Indirect reference `obj gen R`.
    /// NOTE(review): never constructed by this lexer — a lone `R` is emitted
    /// as `Name("R")`, so references are presumably assembled by the parser;
    /// confirm against callers.
    Reference(u32, u16),

    /// `null` keyword.
    Null,

    /// `%...` comment text (without the leading `%`).
    Comment(String),

    /// End of input.
    Eof,
}
65
/// Streaming tokenizer over any [`Read`] source with one byte of lookahead.
pub struct Lexer<R: Read> {
    /// Buffered reader over the underlying byte source.
    reader: std::io::BufReader<R>,
    /// Scratch buffer; currently unused (kept for future use).
    #[allow(dead_code)]
    buffer: Vec<u8>,
    /// Number of bytes consumed so far (byte offset of the next unread byte).
    position: usize,
    /// One-byte lookahead: `Some` when a byte has been peeked but not consumed.
    peek_buffer: Option<u8>,
    /// Pushed-back tokens, served in LIFO order before reading new input.
    token_buffer: Vec<Token>,
}
75
76impl<R: Read> Lexer<R> {
77 pub fn new(reader: R) -> Self {
79 Self {
80 reader: std::io::BufReader::new(reader),
81 buffer: Vec::with_capacity(1024),
82 position: 0,
83 peek_buffer: None,
84 token_buffer: Vec::new(),
85 }
86 }
87
    /// Returns the next token, serving pushed-back tokens (LIFO) first.
    ///
    /// Skips leading whitespace and returns `Token::Eof` at end of input.
    /// Returns `ParseError::SyntaxError` on an unexpected character.
    pub fn next_token(&mut self) -> ParseResult<Token> {
        // Pushed-back tokens take priority over the input stream.
        if let Some(token) = self.token_buffer.pop() {
            return Ok(token);
        }

        self.skip_whitespace()?;

        // One byte of lookahead decides which sub-lexer to dispatch to.
        let ch = match self.peek_char()? {
            Some(ch) => ch,
            None => return Ok(Token::Eof),
        };

        match ch {
            b'%' => self.read_comment(),
            b'/' => self.read_name(),
            b'(' => self.read_literal_string(),
            // '<<' (dict start) or '<...>' (hex string).
            b'<' => self.read_angle_bracket(),
            b'>' => {
                // Only '>>' (dict end) is valid; a lone '>' is a syntax error.
                self.consume_char()?;
                if self.peek_char()? == Some(b'>') {
                    self.consume_char()?;
                    Ok(Token::DictEnd)
                } else {
                    Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Expected '>' after '>'".to_string(),
                    })
                }
            }
            b'[' => {
                self.consume_char()?;
                Ok(Token::ArrayStart)
            }
            b']' => {
                self.consume_char()?;
                Ok(Token::ArrayEnd)
            }
            // NOTE: these specific-letter arms must stay ahead of the generic
            // alphabetic arm below, or 'true'/'false'/'null' would be lexed
            // as plain keywords.
            b't' | b'f' => self.read_boolean(),
            b'n' => self.read_null(),
            b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
            b'R' => {
                // A lone 'R' (indirect-reference operator) is emitted as a
                // Name token; assembling Token::Reference is left to callers.
                self.consume_char()?;
                Ok(Token::Name("R".to_string()))
            }
            _ if ch.is_ascii_alphabetic() => self.read_keyword(),
            _ => Err(ParseError::SyntaxError {
                position: self.position,
                message: format!("Unexpected character: {}", ch as char),
            }),
        }
    }
142
143 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
145 if let Some(ch) = self.peek_buffer {
146 return Ok(Some(ch));
147 }
148
149 let mut buf = [0u8; 1];
150 match self.reader.read_exact(&mut buf) {
151 Ok(_) => {
152 self.peek_buffer = Some(buf[0]);
153 Ok(Some(buf[0]))
154 }
155 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
156 Err(e) => Err(e.into()),
157 }
158 }
159
160 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
162 let ch = self.peek_char()?;
163 if ch.is_some() {
164 self.peek_buffer = None;
165 self.position += 1;
166 }
167 Ok(ch)
168 }
169
170 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
172 let mut count = 0;
173 while let Some(ch) = self.peek_char()? {
174 if ch.is_ascii_whitespace() {
175 self.consume_char()?;
176 count += 1;
177 } else {
178 break;
179 }
180 }
181 Ok(count)
182 }
183
184 fn read_comment(&mut self) -> ParseResult<Token> {
186 self.consume_char()?; let mut comment = String::new();
188
189 while let Some(ch) = self.peek_char()? {
190 if ch == b'\n' || ch == b'\r' {
191 break;
192 }
193 self.consume_char()?;
194 comment.push(ch as char);
195 }
196
197 Ok(Token::Comment(comment))
198 }
199
200 fn read_name(&mut self) -> ParseResult<Token> {
202 self.consume_char()?; let mut name = String::new();
204
205 while let Some(ch) = self.peek_char()? {
206 if ch.is_ascii_whitespace()
207 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
208 {
209 break;
210 }
211 self.consume_char()?;
212
213 if ch == b'#' {
215 let hex1 = self
216 .consume_char()?
217 .ok_or_else(|| ParseError::SyntaxError {
218 position: self.position,
219 message: "Incomplete hex code in name".to_string(),
220 })?;
221 let hex2 = self
222 .consume_char()?
223 .ok_or_else(|| ParseError::SyntaxError {
224 position: self.position,
225 message: "Incomplete hex code in name".to_string(),
226 })?;
227
228 let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
229 .map_err(|_| ParseError::SyntaxError {
230 position: self.position,
231 message: "Invalid hex code in name".to_string(),
232 })?;
233
234 name.push(value as char);
235 } else {
236 name.push(ch as char);
237 }
238 }
239
240 Ok(Token::Name(name))
241 }
242
243 fn read_literal_string(&mut self) -> ParseResult<Token> {
245 self.consume_char()?; let mut string = Vec::new();
247 let mut paren_depth = 1;
248 let mut escape = false;
249
250 while paren_depth > 0 {
251 let ch = self
252 .consume_char()?
253 .ok_or_else(|| ParseError::SyntaxError {
254 position: self.position,
255 message: "Unterminated string".to_string(),
256 })?;
257
258 if escape {
259 let escaped = match ch {
260 b'n' => b'\n',
261 b'r' => b'\r',
262 b't' => b'\t',
263 b'b' => b'\x08',
264 b'f' => b'\x0C',
265 b'(' => b'(',
266 b')' => b')',
267 b'\\' => b'\\',
268 b'0'..=b'7' => {
269 let mut value = ch - b'0';
271 for _ in 0..2 {
272 if let Some(next) = self.peek_char()? {
273 if matches!(next, b'0'..=b'7') {
274 self.consume_char()?;
275 value = value * 8 + (next - b'0');
276 } else {
277 break;
278 }
279 }
280 }
281 value
282 }
283 _ => ch, };
285 string.push(escaped);
286 escape = false;
287 } else {
288 match ch {
289 b'\\' => escape = true,
290 b'(' => {
291 string.push(ch);
292 paren_depth += 1;
293 }
294 b')' => {
295 paren_depth -= 1;
296 if paren_depth > 0 {
297 string.push(ch);
298 }
299 }
300 _ => string.push(ch),
301 }
302 }
303 }
304
305 Ok(Token::String(string))
306 }
307
308 fn read_angle_bracket(&mut self) -> ParseResult<Token> {
310 self.consume_char()?; if self.peek_char()? == Some(b'<') {
313 self.consume_char()?;
314 Ok(Token::DictStart)
315 } else {
316 let mut hex_chars = String::new();
318 let mut found_end = false;
319
320 while let Some(ch) = self.peek_char()? {
321 if ch == b'>' {
322 self.consume_char()?;
323 found_end = true;
324 break;
325 }
326 self.consume_char()?;
327 if ch.is_ascii_hexdigit() {
328 hex_chars.push(ch as char);
329 } else if !ch.is_ascii_whitespace() {
330 return Err(ParseError::SyntaxError {
331 position: self.position,
332 message: "Invalid character in hex string".to_string(),
333 });
334 }
335 }
336
337 if !found_end {
338 return Err(ParseError::SyntaxError {
339 position: self.position,
340 message: "Unterminated hex string".to_string(),
341 });
342 }
343
344 if hex_chars.len() % 2 != 0 {
346 hex_chars.push('0');
347 }
348
349 let mut bytes = Vec::new();
351 for chunk in hex_chars.as_bytes().chunks(2) {
352 let hex_str = std::str::from_utf8(chunk).unwrap();
353 let byte =
354 u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
355 position: self.position,
356 message: "Invalid hex string".to_string(),
357 })?;
358 bytes.push(byte);
359 }
360
361 Ok(Token::String(bytes))
362 }
363 }
364
365 fn read_boolean(&mut self) -> ParseResult<Token> {
367 let word = self.read_word()?;
368 match word.as_str() {
369 "true" => Ok(Token::Boolean(true)),
370 "false" => Ok(Token::Boolean(false)),
371 _ => {
372 self.process_keyword(word)
374 }
375 }
376 }
377
378 fn read_null(&mut self) -> ParseResult<Token> {
380 let word = self.read_word()?;
381 if word == "null" {
382 Ok(Token::Null)
383 } else {
384 self.process_keyword(word)
386 }
387 }
388
    /// Reads an integer or real number.
    ///
    /// Accepts an optional leading sign, one decimal point, and an optional
    /// exponent (`e`/`E` with optional sign). A number containing a decimal
    /// point or an exponent is returned as `Token::Real`, otherwise as
    /// `Token::Integer`.
    ///
    /// NOTE(review): exponent notation is not part of the PDF number grammar
    /// (PDF 32000-1 §7.3.3) — presumably accepted here for leniency; confirm.
    fn read_number(&mut self) -> ParseResult<Token> {
        let mut number_str = String::new();
        // Tracks "this is a real": set by a decimal point or an exponent.
        let mut has_dot = false;

        // Optional sign; it must be followed by a digit or a dot.
        if let Some(ch) = self.peek_char()? {
            if ch == b'+' || ch == b'-' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(next) = self.peek_char()? {
                    if !next.is_ascii_digit() && next != b'.' {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Expected digit after sign".to_string(),
                        });
                    }
                }
            }
        }

        // Digits with at most one decimal point; a second dot ends the token.
        while let Some(ch) = self.peek_char()? {
            match ch {
                b'0'..=b'9' => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                }
                b'.' if !has_dot => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                    has_dot = true;
                }
                _ => break,
            }
        }

        // Optional exponent; its presence forces the real-number path.
        if let Some(ch) = self.peek_char()? {
            if ch == b'e' || ch == b'E' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(sign_ch) = self.peek_char()? {
                    if sign_ch == b'+' || sign_ch == b'-' {
                        self.consume_char()?;
                        number_str.push(sign_ch as char);
                    }
                }

                while let Some(digit_ch) = self.peek_char()? {
                    if digit_ch.is_ascii_digit() {
                        self.consume_char()?;
                        number_str.push(digit_ch as char);
                    } else {
                        break;
                    }
                }

                has_dot = true;
            }
        }

        // Parse failures (e.g. a bare "." or "1e") become syntax errors.
        if has_dot {
            let value = number_str
                .parse::<f64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid real number: '{number_str}'"),
                })?;
            Ok(Token::Real(value))
        } else {
            let value = number_str
                .parse::<i64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid integer: '{number_str}'"),
                })?;
            Ok(Token::Integer(value))
        }
    }
479
480 fn read_keyword(&mut self) -> ParseResult<Token> {
482 let word = self.read_word()?;
483 self.process_keyword(word)
484 }
485
486 fn process_keyword(&self, word: String) -> ParseResult<Token> {
488 match word.as_str() {
489 "stream" => Ok(Token::Stream),
490 "endstream" => Ok(Token::EndStream),
491 "obj" => Ok(Token::Obj),
492 "endobj" => Ok(Token::EndObj),
493 "startxref" => Ok(Token::StartXRef),
494 _ => Err(ParseError::SyntaxError {
495 position: self.position,
496 message: format!("Unknown keyword: {word}"),
497 }),
498 }
499 }
500
501 fn read_word(&mut self) -> ParseResult<String> {
503 let mut word = String::new();
504
505 while let Some(ch) = self.peek_char()? {
506 if ch.is_ascii_whitespace()
507 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
508 {
509 break;
510 }
511 self.consume_char()?;
512 word.push(ch as char);
513 }
514
515 Ok(word)
516 }
517
518 #[allow(dead_code)]
520 fn read_digits(&mut self) -> ParseResult<String> {
521 let mut digits = String::new();
522
523 while let Some(ch) = self.peek_char()? {
524 if ch.is_ascii_digit() {
525 self.consume_char()?;
526 digits.push(ch as char);
527 } else {
528 break;
529 }
530 }
531
532 Ok(digits)
533 }
534
535 pub fn read_newline(&mut self) -> ParseResult<()> {
537 match self.peek_char()? {
538 Some(b'\r') => {
539 self.consume_char()?;
540 if self.peek_char()? == Some(b'\n') {
542 self.consume_char()?;
543 }
544 Ok(())
545 }
546 Some(b'\n') => {
547 self.consume_char()?;
548 Ok(())
549 }
550 _ => Err(ParseError::SyntaxError {
551 position: self.position,
552 message: "Expected newline".to_string(),
553 }),
554 }
555 }
556
557 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
559 let mut bytes = vec![0u8; n];
560 self.reader.read_exact(&mut bytes)?;
561 self.position += n;
562 Ok(bytes)
563 }
564
565 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
567 let mut result = Vec::new();
568 let mut match_pos = 0;
569
570 while let Some(ch) = self.consume_char()? {
571 result.push(ch);
572
573 if ch == sequence[match_pos] {
574 match_pos += 1;
575 if match_pos == sequence.len() {
576 result.truncate(result.len() - sequence.len());
578 break;
579 }
580 } else if ch == sequence[0] {
581 match_pos = 1;
582 } else {
583 match_pos = 0;
584 }
585 }
586
587 if match_pos < sequence.len() {
588 return Err(ParseError::SyntaxError {
589 position: self.position,
590 message: format!("Sequence {sequence:?} not found"),
591 });
592 }
593
594 Ok(result)
595 }
596
    /// Byte offset of the next unconsumed byte in the input.
    pub fn position(&self) -> usize {
        self.position
    }
601
    /// Pushes a token back so the next `next_token` call returns it.
    /// Multiple pushed tokens are served in LIFO order.
    pub fn push_token(&mut self, token: Token) {
        self.token_buffer.push(token);
    }
606}
607
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_lexer_basic_tokens() {
        let input = b"123 -456 3.14 true false null /Name";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
        assert_eq!(lexer.next_token().unwrap(), Token::Null);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
    }

    #[test]
    fn test_lexer_negative_numbers() {
        let input = b"-123 -45.67";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
    }

    #[test]
    fn test_lexer_strings() {
        let input = b"(Hello World) <48656C6C6F>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello World".to_vec())
        );
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello".to_vec())
        );
    }

    #[test]
    fn test_lexer_dictionaries() {
        let input = b"<< /Type /Page >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
    }

    #[test]
    fn test_lexer_arrays() {
        let input = b"[1 2 3]";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
        assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
    }

    #[test]
    fn test_lexer_references() {
        // 'R' is lexed as Name("R"); references are assembled by the parser.
        let input = b"1 0 R 25 1 R";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }
    }

    #[test]
    fn test_lexer_comments() {
        let input = b"%PDF-1.7\n123";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Comment("PDF-1.7".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
    }

    mod comprehensive_tests {
        use super::*;
        use std::io::Cursor;

        #[test]
        fn test_token_debug_trait() {
            let token = Token::Integer(42);
            let debug_str = format!("{:?}", token);
            assert!(debug_str.contains("Integer"));
            assert!(debug_str.contains("42"));
        }

        #[test]
        fn test_token_clone() {
            let token = Token::String(b"hello".to_vec());
            let cloned = token.clone();
            assert_eq!(token, cloned);
        }

        #[test]
        fn test_token_equality() {
            assert_eq!(Token::Integer(42), Token::Integer(42));
            assert_ne!(Token::Integer(42), Token::Integer(43));
            assert_eq!(Token::Boolean(true), Token::Boolean(true));
            assert_ne!(Token::Boolean(true), Token::Boolean(false));
            assert_eq!(Token::Null, Token::Null);
            assert_ne!(Token::Null, Token::Integer(0));
        }

        #[test]
        fn test_lexer_empty_input() {
            let input = b"";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        #[test]
        fn test_lexer_whitespace_only() {
            let input = b" \t\n\r ";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        #[test]
        fn test_lexer_integer_edge_cases() {
            let input = b"0 +123 -0 9876543210";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(9876543210));
        }

        #[test]
        fn test_lexer_real_edge_cases() {
            let input = b"0.0 +3.14 -2.71828 .5 5. 123.456789";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-2.71828));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(5.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(123.456789));
        }

        #[test]
        fn test_lexer_scientific_notation() {
            let input = b"1.23e10 -4.56E-5 1e0 2E+3";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(1.23e10));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-4.56e-5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(1e0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(2e3));
        }

        #[test]
        fn test_lexer_string_literal_escapes() {
            let input = b"(Hello\\nWorld) (Tab\\tChar) (Quote\\\"Mark) (Backslash\\\\)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello\nWorld".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Tab\tChar".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Quote\"Mark".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Backslash\\".to_vec())
            );
        }

        #[test]
        fn test_lexer_string_literal_nested_parens() {
            let input = b"(Nested (parentheses) work)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Nested (parentheses) work".to_vec())
            );
        }

        #[test]
        fn test_lexer_string_literal_empty() {
            let input = b"()";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        #[test]
        fn test_lexer_hexadecimal_strings() {
            let input = b"<48656C6C6F> <20576F726C64> <>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"Hello".to_vec()));
            assert_eq!(lexer.next_token().unwrap(), Token::String(b" World".to_vec()));
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        #[test]
        fn test_lexer_hexadecimal_strings_odd_length() {
            // An odd digit count is padded with a trailing zero nibble.
            let input = b"<48656C6C6F2> <1> <ABC>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"Hello ".to_vec()));
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"\x10".to_vec()));
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"\xAB\xC0".to_vec()));
        }

        #[test]
        fn test_lexer_hexadecimal_strings_whitespace() {
            let input = b"<48 65 6C 6C 6F>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"Hello".to_vec()));
        }

        #[test]
        fn test_lexer_names() {
            let input = b"/Type /Page /Root /Kids /Count /MediaBox";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Root".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Kids".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Count".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("MediaBox".to_string()));
        }

        #[test]
        fn test_lexer_names_with_special_chars() {
            let input = b"/Name#20with#20spaces /Name#2Fwith#2Fslashes";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name with spaces".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name/with/slashes".to_string())
            );
        }

        #[test]
        fn test_lexer_names_edge_cases() {
            let input = b"/ /A /123 /true /false /null";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("A".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("123".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("true".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("false".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("null".to_string()));
        }

        #[test]
        fn test_lexer_nested_dictionaries() {
            let input = b"<< /Type /Page /Resources << /Font << /F1 123 0 R >> >> >>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Resources".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Font".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("F1".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("R".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
        }

        #[test]
        fn test_lexer_nested_arrays() {
            let input = b"[[1 2] [3 4] [5 [6 7]]]";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(4));
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(6));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(7));
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
        }

        #[test]
        fn test_lexer_mixed_content() {
            let input = b"<< /Type /Page /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 >> >> >> >>";
            let mut lexer = Lexer::new(Cursor::new(input));

            let mut tokens = Vec::new();
            loop {
                match lexer.next_token().unwrap() {
                    Token::Eof => break,
                    token => tokens.push(token),
                }
            }
            assert!(tokens.len() > 10);
        }

        #[test]
        fn test_lexer_keywords() {
            let input = b"obj endobj stream endstream startxref";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Obj);
            assert_eq!(lexer.next_token().unwrap(), Token::EndObj);
            assert_eq!(lexer.next_token().unwrap(), Token::Stream);
            assert_eq!(lexer.next_token().unwrap(), Token::EndStream);
            assert_eq!(lexer.next_token().unwrap(), Token::StartXRef);
        }

        #[test]
        fn test_lexer_multiple_comments() {
            let input = b"%First comment\n%Second comment\n123";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Comment("First comment".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Comment("Second comment".to_string())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        }

        #[test]
        fn test_lexer_comment_without_newline() {
            let input = b"%Comment at end";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Comment("Comment at end".to_string())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        #[test]
        fn test_lexer_special_characters_in_streams() {
            let input = b"<< /Length 5 >> stream\nHello endstream";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Length".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::Stream);
        }

        #[test]
        fn test_lexer_push_token() {
            let input = b"123 456";
            let mut lexer = Lexer::new(Cursor::new(input));

            let token1 = lexer.next_token().unwrap();
            assert_eq!(token1, Token::Integer(123));

            let token2 = lexer.next_token().unwrap();
            assert_eq!(token2, Token::Integer(456));

            lexer.push_token(token2.clone());

            let token3 = lexer.next_token().unwrap();
            assert_eq!(token3, token2);

            let token4 = lexer.next_token().unwrap();
            assert_eq!(token4, Token::Eof);
        }

        #[test]
        fn test_lexer_push_multiple_tokens() {
            let input = b"123";
            let mut lexer = Lexer::new(Cursor::new(input));

            let original_token = lexer.next_token().unwrap();
            assert_eq!(original_token, Token::Integer(123));

            lexer.push_token(Token::Boolean(true));
            lexer.push_token(Token::Boolean(false));
            lexer.push_token(Token::Null);

            // Pushed-back tokens come back in LIFO order.
            assert_eq!(lexer.next_token().unwrap(), Token::Null);
            assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
            assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        #[test]
        fn test_lexer_read_newline() {
            let input = b"123\n456\r\n789";
            let mut lexer = Lexer::new(Cursor::new(input));

            let digits1 = lexer.read_digits().unwrap();
            assert_eq!(digits1, "123");
            assert!(lexer.read_newline().is_ok());

            let digits2 = lexer.read_digits().unwrap();
            assert_eq!(digits2, "456");
            assert!(lexer.read_newline().is_ok());

            let digits3 = lexer.read_digits().unwrap();
            assert_eq!(digits3, "789");
        }

        #[test]
        fn test_lexer_read_bytes() {
            let input = b"Hello World";
            let mut lexer = Lexer::new(Cursor::new(input));

            let bytes = lexer.read_bytes(5).unwrap();
            assert_eq!(bytes, b"Hello");

            let bytes = lexer.read_bytes(6).unwrap();
            assert_eq!(bytes, b" World");
        }

        #[test]
        fn test_lexer_read_until_sequence() {
            let input = b"Hello endstream World";
            let mut lexer = Lexer::new(Cursor::new(input));

            let result = lexer.read_until_sequence(b"endstream").unwrap();
            assert_eq!(result, b"Hello ");

            // Remaining input is " World": no leading digits.
            let rest = lexer.read_digits().unwrap();
            assert_eq!(rest, "");
        }

        #[test]
        fn test_lexer_read_until_sequence_not_found() {
            let input = b"Hello World";
            let mut lexer = Lexer::new(Cursor::new(input));

            let result = lexer.read_until_sequence(b"notfound");
            assert!(result.is_err());
        }

        #[test]
        fn test_lexer_position_tracking() {
            let input = b"123 456";
            let mut lexer = Lexer::new(Cursor::new(input));

            let initial_pos = lexer.position();
            assert_eq!(initial_pos, 0);

            lexer.next_token().unwrap();
            let pos_after_first = lexer.position();
            assert!(pos_after_first > initial_pos);

            lexer.next_token().unwrap();
            let pos_after_second = lexer.position();
            assert!(pos_after_second > pos_after_first);
        }

        #[test]
        fn test_lexer_large_numbers() {
            let input = b"2147483647 -2147483648 9223372036854775807 -9223372036854775808";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Integer(2147483647));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(-2147483648));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(9223372036854775807));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(-9223372036854775808));
        }

        #[test]
        fn test_lexer_very_long_string() {
            let long_str = "A".repeat(1000);
            let input = format!("({})", long_str);
            let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));

            if let Token::String(s) = lexer.next_token().unwrap() {
                assert_eq!(s.len(), 1000);
                assert_eq!(s, long_str.as_bytes());
            } else {
                panic!("Expected string token");
            }
        }

        #[test]
        fn test_lexer_very_long_name() {
            let long_name = "A".repeat(500);
            let input = format!("/{}", long_name);
            let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));

            if let Token::Name(name) = lexer.next_token().unwrap() {
                assert_eq!(name.len(), 500);
                assert_eq!(name, long_name);
            } else {
                panic!("Expected name token");
            }
        }

        #[test]
        fn test_lexer_error_handling_invalid_hex() {
            // 'G' is not a hex digit, so the hex string must be rejected.
            // (The previous assertion `is_ok() || is_err()` was a tautology
            // and tested nothing.)
            let input = b"<48656C6C6FG>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert!(lexer.next_token().is_err());
        }

        #[test]
        fn test_lexer_all_token_types() {
            let input = b"true false null 123 -456 3.14 (string) <48656C6C6F> /Name [ ] << >> obj endobj stream endstream startxref % comment\n";
            let mut lexer = Lexer::new(Cursor::new(input));

            let mut token_types = Vec::new();
            loop {
                match lexer.next_token().unwrap() {
                    Token::Eof => break,
                    token => token_types.push(std::mem::discriminant(&token)),
                }
            }

            assert!(token_types.len() > 10);
        }

        #[test]
        fn test_lexer_performance() {
            let input = "123 456 789 ".repeat(1000);
            let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));

            let start_time = std::time::Instant::now();
            let mut count = 0;
            loop {
                match lexer.next_token().unwrap() {
                    Token::Eof => break,
                    _ => count += 1,
                }
            }
            let elapsed = start_time.elapsed();

            assert_eq!(count, 3000);
            // Generous bound: 3000 trivial tokens should be near-instant.
            assert!(elapsed.as_millis() < 1000);
        }
    }
}